path: root/arch/x86/kernel
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- arch/x86/kernel/Makefile | 23
-rw-r--r-- arch/x86/kernel/acpi/boot.c | 72
-rw-r--r-- arch/x86/kernel/acpi/sleep.c | 25
-rw-r--r-- arch/x86/kernel/alternative.c | 46
-rw-r--r-- arch/x86/kernel/amd_iommu.c | 606
-rw-r--r-- arch/x86/kernel/amd_iommu_init.c | 579
-rw-r--r-- arch/x86/kernel/aperture_64.c | 7
-rw-r--r-- arch/x86/kernel/apic.c (renamed from arch/x86/kernel/apic_32.c) | 1197
-rw-r--r-- arch/x86/kernel/apic_64.c | 1393
-rw-r--r-- arch/x86/kernel/apm_32.c | 5
-rw-r--r-- arch/x86/kernel/asm-offsets_64.c | 13
-rw-r--r-- arch/x86/kernel/bios_uv.c | 141
-rw-r--r-- arch/x86/kernel/cpu/.gitignore | 1
-rw-r--r-- arch/x86/kernel/cpu/Makefile | 34
-rw-r--r-- arch/x86/kernel/cpu/addon_cpuid_features.c | 105
-rw-r--r-- arch/x86/kernel/cpu/amd.c | 549
-rw-r--r-- arch/x86/kernel/cpu/amd_64.c | 222
-rw-r--r-- arch/x86/kernel/cpu/bugs.c | 29
-rw-r--r-- arch/x86/kernel/cpu/centaur.c | 15
-rw-r--r-- arch/x86/kernel/cpu/centaur_64.c | 6
-rw-r--r-- arch/x86/kernel/cpu/cmpxchg.c | 72
-rw-r--r-- arch/x86/kernel/cpu/common.c | 1009
-rw-r--r-- arch/x86/kernel/cpu/common_64.c | 681
-rw-r--r-- arch/x86/kernel/cpu/cpu.h | 19
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/Kconfig | 4
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 22
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/elanfreq.c | 44
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/longhaul.c | 4
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 8
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 41
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 4
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/powernow-k7.h | 1
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 55
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 151
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 6
-rw-r--r-- arch/x86/kernel/cpu/cyrix.c | 73
-rw-r--r-- arch/x86/kernel/cpu/feature_names.c | 83
-rw-r--r-- arch/x86/kernel/cpu/intel.c | 357
-rw-r--r-- arch/x86/kernel/cpu/intel_64.c | 95
-rw-r--r-- arch/x86/kernel/cpu/intel_cacheinfo.c | 178
-rw-r--r-- arch/x86/kernel/cpu/mcheck/k7.c | 4
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce_32.c | 2
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce_64.c | 27
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 22
-rw-r--r-- arch/x86/kernel/cpu/mcheck/non-fatal.c | 2
-rw-r--r-- arch/x86/kernel/cpu/mcheck/p4.c | 4
-rw-r--r-- arch/x86/kernel/cpu/mcheck/therm_throt.c | 1
-rw-r--r-- arch/x86/kernel/cpu/mkcapflags.pl | 32
-rw-r--r-- arch/x86/kernel/cpu/mtrr/generic.c | 15
-rw-r--r-- arch/x86/kernel/cpu/mtrr/if.c | 4
-rw-r--r-- arch/x86/kernel/cpu/mtrr/main.c | 281
-rw-r--r-- arch/x86/kernel/cpu/perfctr-watchdog.c | 109
-rw-r--r-- arch/x86/kernel/cpu/powerflags.c | 20
-rw-r--r-- arch/x86/kernel/cpu/proc.c | 6
-rw-r--r-- arch/x86/kernel/cpu/transmeta.c | 32
-rw-r--r-- arch/x86/kernel/cpu/umc.c | 3
-rw-r--r-- arch/x86/kernel/cpuid.c | 18
-rw-r--r-- arch/x86/kernel/crash_dump_32.c | 3
-rw-r--r-- arch/x86/kernel/crash_dump_64.c | 14
-rw-r--r-- arch/x86/kernel/doublefault_32.c | 2
-rw-r--r-- arch/x86/kernel/ds.c | 954
-rw-r--r-- arch/x86/kernel/dumpstack_32.c | 449
-rw-r--r-- arch/x86/kernel/dumpstack_64.c | 575
-rw-r--r-- arch/x86/kernel/e820.c | 65
-rw-r--r-- arch/x86/kernel/early-quirks.c | 120
-rw-r--r-- arch/x86/kernel/early_printk.c | 748
-rw-r--r-- arch/x86/kernel/efi.c | 10
-rw-r--r-- arch/x86/kernel/efi_32.c | 4
-rw-r--r-- arch/x86/kernel/entry_32.S | 115
-rw-r--r-- arch/x86/kernel/entry_64.S | 220
-rw-r--r-- arch/x86/kernel/es7000_32.c | 363
-rw-r--r-- arch/x86/kernel/ftrace.c | 124
-rw-r--r-- arch/x86/kernel/genapic_64.c | 87
-rw-r--r-- arch/x86/kernel/genapic_flat_64.c | 66
-rw-r--r-- arch/x86/kernel/genx2apic_cluster.c | 159
-rw-r--r-- arch/x86/kernel/genx2apic_phys.c | 154
-rw-r--r-- arch/x86/kernel/genx2apic_uv_x.c | 157
-rw-r--r-- arch/x86/kernel/head.c | 1
-rw-r--r-- arch/x86/kernel/head64.c | 17
-rw-r--r-- arch/x86/kernel/head_32.S | 42
-rw-r--r-- arch/x86/kernel/head_64.S | 5
-rw-r--r-- arch/x86/kernel/hpet.c | 510
-rw-r--r-- arch/x86/kernel/i387.c | 168
-rw-r--r-- arch/x86/kernel/i8259.c | 24
-rw-r--r-- arch/x86/kernel/io_apic.c (renamed from arch/x86/kernel/io_apic_64.c) | 2147
-rw-r--r-- arch/x86/kernel/io_apic_32.c | 2900
-rw-r--r-- arch/x86/kernel/io_delay.c | 11
-rw-r--r-- arch/x86/kernel/ioport.c | 1
-rw-r--r-- arch/x86/kernel/ipi.c | 9
-rw-r--r-- arch/x86/kernel/irq.c | 189
-rw-r--r-- arch/x86/kernel/irq_32.c | 201
-rw-r--r-- arch/x86/kernel/irq_64.c | 169
-rw-r--r-- arch/x86/kernel/irqinit_32.c | 84
-rw-r--r-- arch/x86/kernel/irqinit_64.c | 74
-rw-r--r-- arch/x86/kernel/k8.c | 5
-rw-r--r-- arch/x86/kernel/kdebugfs.c | 9
-rw-r--r-- arch/x86/kernel/kgdb.c | 50
-rw-r--r-- arch/x86/kernel/kprobes.c | 7
-rw-r--r-- arch/x86/kernel/kvm.c | 2
-rw-r--r-- arch/x86/kernel/kvmclock.c | 32
-rw-r--r-- arch/x86/kernel/ldt.c | 16
-rw-r--r-- arch/x86/kernel/machine_kexec_32.c | 57
-rw-r--r-- arch/x86/kernel/machine_kexec_64.c | 2
-rw-r--r-- arch/x86/kernel/mfgpt_32.c | 52
-rw-r--r-- arch/x86/kernel/microcode.c | 851
-rw-r--r-- arch/x86/kernel/microcode_amd.c | 435
-rw-r--r-- arch/x86/kernel/microcode_core.c | 508
-rw-r--r-- arch/x86/kernel/microcode_intel.c | 480
-rw-r--r-- arch/x86/kernel/mmconf-fam10h_64.c | 2
-rw-r--r-- arch/x86/kernel/module_64.c | 11
-rw-r--r-- arch/x86/kernel/mpparse.c | 225
-rw-r--r-- arch/x86/kernel/msr.c | 42
-rw-r--r-- arch/x86/kernel/nmi.c | 50
-rw-r--r-- arch/x86/kernel/numaq_32.c | 204
-rw-r--r-- arch/x86/kernel/olpc.c | 6
-rw-r--r-- arch/x86/kernel/paravirt-spinlocks.c | 37
-rw-r--r-- arch/x86/kernel/paravirt.c | 15
-rw-r--r-- arch/x86/kernel/paravirt_patch_32.c | 2
-rw-r--r-- arch/x86/kernel/pci-calgary_64.c | 210
-rw-r--r-- arch/x86/kernel/pci-dma.c | 347
-rw-r--r-- arch/x86/kernel/pci-gart_64.c | 181
-rw-r--r-- arch/x86/kernel/pci-nommu.c | 20
-rw-r--r-- arch/x86/kernel/pci-swiotlb_64.c | 4
-rw-r--r-- arch/x86/kernel/pcspeaker.c | 13
-rw-r--r-- arch/x86/kernel/process.c | 24
-rw-r--r-- arch/x86/kernel/process_32.c | 108
-rw-r--r-- arch/x86/kernel/process_64.c | 261
-rw-r--r-- arch/x86/kernel/ptrace.c | 653
-rw-r--r-- arch/x86/kernel/pvclock.c | 12
-rw-r--r-- arch/x86/kernel/quirks.c | 44
-rw-r--r-- arch/x86/kernel/reboot.c | 17
-rw-r--r-- arch/x86/kernel/relocate_kernel_32.S | 178
-rw-r--r-- arch/x86/kernel/rtc.c | 42
-rw-r--r-- arch/x86/kernel/setup.c | 305
-rw-r--r-- arch/x86/kernel/setup_percpu.c | 53
-rw-r--r-- arch/x86/kernel/sigframe.h | 19
-rw-r--r-- arch/x86/kernel/signal_32.c | 280
-rw-r--r-- arch/x86/kernel/signal_64.c | 313
-rw-r--r-- arch/x86/kernel/smp.c | 6
-rw-r--r-- arch/x86/kernel/smpboot.c | 382
-rw-r--r-- arch/x86/kernel/smpcommon.c | 17
-rw-r--r-- arch/x86/kernel/smpcommon_32.c | 1
-rw-r--r-- arch/x86/kernel/step.c | 35
-rw-r--r-- arch/x86/kernel/summit_32.c | 2
-rw-r--r-- arch/x86/kernel/sys_i386_32.c | 2
-rw-r--r-- arch/x86/kernel/sys_x86_64.c | 44
-rw-r--r-- arch/x86/kernel/syscall_64.c | 4
-rw-r--r-- arch/x86/kernel/syscall_table_32.S | 6
-rw-r--r-- arch/x86/kernel/time_32.c | 9
-rw-r--r-- arch/x86/kernel/time_64.c | 23
-rw-r--r-- arch/x86/kernel/tlb_32.c | 8
-rw-r--r-- arch/x86/kernel/tlb_uv.c | 5
-rw-r--r-- arch/x86/kernel/tls.c | 1
-rw-r--r-- arch/x86/kernel/traps.c (renamed from arch/x86/kernel/traps_32.c) | 902
-rw-r--r-- arch/x86/kernel/traps_64.c | 1217
-rw-r--r-- arch/x86/kernel/tsc.c | 424
-rw-r--r-- arch/x86/kernel/tsc_sync.c | 6
-rw-r--r-- arch/x86/kernel/uv_irq.c | 79
-rw-r--r-- arch/x86/kernel/uv_sysfs.c | 72
-rw-r--r-- arch/x86/kernel/visws_quirks.c | 96
-rw-r--r-- arch/x86/kernel/vm86_32.c | 1
-rw-r--r-- arch/x86/kernel/vmi_32.c | 20
-rw-r--r-- arch/x86/kernel/vmiclock_32.c | 3
-rw-r--r-- arch/x86/kernel/vmlinux_32.lds.S | 17
-rw-r--r-- arch/x86/kernel/vmlinux_64.lds.S | 9
-rw-r--r-- arch/x86/kernel/vsmp_64.c | 2
-rw-r--r-- arch/x86/kernel/xsave.c | 345
167 files changed, 16356 insertions(+), 13064 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index da140611bb57..d7e5a58ee22f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -7,9 +7,10 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
 ifdef CONFIG_FTRACE
-# Do not profile debug utilities
+# Do not profile debug and lowlevel utilities
 CFLAGS_REMOVE_tsc.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
+CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
 endif
 
 #
@@ -22,7 +23,7 @@ CFLAGS_hpet.o := $(nostackp)
 CFLAGS_tsc.o := $(nostackp)
 
 obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
-obj-y += traps_$(BITS).o irq_$(BITS).o
+obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y += time_$(BITS).o ioport.o ldt.o
 obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
 obj-$(CONFIG_X86_VISWS) += visws_quirks.o
@@ -37,7 +38,7 @@ obj-y += tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
 obj-y += process.o
-obj-y += i387.o
+obj-y += i387.o xsave.o
 obj-y += ptrace.o
 obj-y += ds.o
 obj-$(CONFIG_X86_32) += tls.o
@@ -50,7 +51,6 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
 obj-$(CONFIG_MCA) += mca_32.o
 obj-$(CONFIG_X86_MSR) += msr.o
 obj-$(CONFIG_X86_CPUID) += cpuid.o
-obj-$(CONFIG_MICROCODE) += microcode.o
 obj-$(CONFIG_PCI) += early-quirks.o
 apm-y := apm_32.o
 obj-$(CONFIG_APM) += apm.o
@@ -60,14 +60,15 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o
 obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
 obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
 obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o
-obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
+obj-$(CONFIG_X86_IO_APIC) += io_apic.o
 obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
 obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
 obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
+obj-$(CONFIG_X86_ES7000) += es7000_32.o
 obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
 obj-y += vsmp_64.o
 obj-$(CONFIG_KPROBES) += kprobes.o
@@ -88,7 +89,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
 obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
 obj-$(CONFIG_KVM_GUEST) += kvm.o
 obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
-obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
@@ -98,10 +99,18 @@ scx200-y += scx200_32.o
 
 obj-$(CONFIG_OLPC) += olpc.o
 
+microcode-y := microcode_core.o
+microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
+microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
         obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
+        obj-y += bios_uv.o uv_irq.o uv_sysfs.o
+        obj-y += genx2apic_cluster.o
+        obj-y += genx2apic_phys.o
         obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
         obj-$(CONFIG_AUDIT) += audit_64.o
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index f489d7a9be92..8c1f76abae9e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -58,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled);
 #ifdef CONFIG_X86_64
 
 #include <asm/proto.h>
-#include <asm/genapic.h>
 
 #else /* X86 */
 
@@ -154,10 +153,21 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
 }
 
 #ifdef CONFIG_PCI_MMCONFIG
+
+static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
+
 /* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
 struct acpi_mcfg_allocation *pci_mmcfg_config;
 int pci_mmcfg_config_num;
 
+static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
+{
+	if (!strcmp(mcfg->header.oem_id, "SGI"))
+		acpi_mcfg_64bit_base_addr = TRUE;
+
+	return 0;
+}
+
 int __init acpi_parse_mcfg(struct acpi_table_header *header)
 {
 	struct acpi_table_mcfg *mcfg;
@@ -190,8 +200,12 @@ int __init acpi_parse_mcfg(struct acpi_table_header *header)
 	}
 
 	memcpy(pci_mmcfg_config, &mcfg[1], config_size);
+
+	acpi_mcfg_oem_check(mcfg);
+
 	for (i = 0; i < pci_mmcfg_config_num; ++i) {
-		if (pci_mmcfg_config[i].address > 0xFFFFFFFF) {
+		if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
+		    !acpi_mcfg_64bit_base_addr) {
 			printk(KERN_ERR PREFIX
 			       "MMCONFIG not in low 4GB of memory\n");
 			kfree(pci_mmcfg_config);
@@ -239,10 +253,8 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
 		return;
 	}
 
-#ifdef CONFIG_X86_32
 	if (boot_cpu_physical_apicid != -1U)
 		ver = apic_version[boot_cpu_physical_apicid];
-#endif
 
 	generic_processor_info(id, ver);
 }
@@ -761,11 +773,9 @@ static void __init acpi_register_lapic_address(unsigned long address)
 
 	set_fixmap_nocache(FIX_APIC_BASE, address);
 	if (boot_cpu_physical_apicid == -1U) {
-		boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
-#ifdef CONFIG_X86_32
+		boot_cpu_physical_apicid = read_apic_id();
 		apic_version[boot_cpu_physical_apicid] =
			 GET_APIC_VERSION(apic_read(APIC_LVR));
-#endif
 	}
 }
 
@@ -1021,7 +1031,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 	mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
 #endif
 	set_bit(MP_ISA_BUS, mp_bus_not_pci);
-	Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+	pr_debug("Bus #%d is ISA\n", MP_ISA_BUS);
 
 #ifdef CONFIG_X86_ES7000
 	/*
@@ -1127,8 +1137,8 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
 		return gsi;
 	}
 	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
-		Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+		pr_debug("Pin %d-%d already programmed\n",
			mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
 #ifdef CONFIG_X86_32
 		return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
 #else
@@ -1247,7 +1257,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 
 	count =
	    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
-				  NR_IRQ_VECTORS);
+				  nr_irqs);
 	if (count < 0) {
 		printk(KERN_ERR PREFIX
		       "Error parsing interrupt source overrides entry\n");
@@ -1267,7 +1277,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 
 	count =
	    acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
-				  NR_IRQ_VECTORS);
+				  nr_irqs);
 	if (count < 0) {
 		printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
 		/* TBD: Cleanup to allow fallback to MPS */
@@ -1337,7 +1347,9 @@ static void __init acpi_process_madt(void)
 			acpi_ioapic = 1;
 
 			smp_found_config = 1;
+#ifdef CONFIG_X86_32
 			setup_apic_routing();
+#endif
 		}
 	}
 	if (error == -EINVAL) {
@@ -1407,8 +1419,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
  */
 static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
 {
-	pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident);
-	acpi_skip_timer_override = 1;
+	/*
+	 * The ati_ixp4x0_rev() early PCI quirk should have set
+	 * the acpi_skip_timer_override flag already:
+	 */
+	if (!acpi_skip_timer_override) {
+		WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
+		pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
+			d->ident);
+		acpi_skip_timer_override = 1;
+	}
 	return 0;
 }
 
@@ -1579,6 +1599,11 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
		     DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
		     },
	 },
+	{}
+};
+
+/* second table for DMI checks that should run after early-quirks */
+static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
	/*
	 * HP laptops which use a DSDT reporting as HP/SB400/10000,
	 * which includes some code which overrides all temperature
@@ -1591,6 +1616,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
	 */
	{
	 .callback = dmi_ignore_irq0_timer_override,
+	 .ident = "HP nx6115 laptop",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"),
+		     },
+	 },
+	{
+	 .callback = dmi_ignore_irq0_timer_override,
	 .ident = "HP NX6125 laptop",
	 .matches = {
		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
@@ -1605,6 +1638,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
		     },
	 },
+	{
+	 .callback = dmi_ignore_irq0_timer_override,
+	 .ident = "HP 6715b laptop",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
+		     },
+	 },
	{}
 };
 
@@ -1691,6 +1732,9 @@ int __init early_acpi_boot_init(void)
 
 int __init acpi_boot_init(void)
 {
+	/* those are executed after early-quirks are executed */
+	dmi_check_system(acpi_dmi_table_late);
+
	/*
	 * If acpi_disabled, bail out
	 * One exception: acpi=ht continues far enough to enumerate LAPICs
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 868de3d5c39d..806b4e9051b4 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -9,6 +9,8 @@
 #include <linux/bootmem.h>
 #include <linux/dmi.h>
 #include <linux/cpumask.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
 
 #include "realmode/wakeup.h"
 #include "sleep.h"
@@ -19,19 +21,10 @@ unsigned long acpi_realmode_flags;
 /* address in low memory of the wakeup routine. */
 static unsigned long acpi_realmode;
 
-#ifdef CONFIG_64BIT
-static char temp_stack[10240];
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
+static char temp_stack[4096];
 #endif
 
-/* XXX: this macro should move to asm-x86/segment.h and be shared with the
-   boot code... */
-#define GDT_ENTRY(flags, base, limit)		\
-	(((u64)(base & 0xff000000) << 32) |	\
-	 ((u64)flags << 40) |			\
-	 ((u64)(limit & 0x00ff0000) << 32) |	\
-	 ((u64)(base & 0x00ffffff) << 16) |	\
-	 ((u64)(limit & 0x0000ffff)))
-
 /**
  * acpi_save_state_mem - save kernel state
  *
@@ -94,7 +87,7 @@ int acpi_save_state_mem(void)
 #endif /* !CONFIG_64BIT */
 
	header->pmode_cr0 = read_cr0();
-	header->pmode_cr4 = read_cr4();
+	header->pmode_cr4 = read_cr4_safe();
	header->realmode_flags = acpi_realmode_flags;
	header->real_magic = 0x12345678;
 
@@ -105,7 +98,9 @@ int acpi_save_state_mem(void)
 #else /* CONFIG_64BIT */
	header->trampoline_segment = setup_trampoline() >> 4;
 #ifdef CONFIG_SMP
-	stack_start.sp = temp_stack + 4096;
+	stack_start.sp = temp_stack + sizeof(temp_stack);
+	early_gdt_descr.address =
+			(unsigned long)get_cpu_gdt_table(smp_processor_id());
 #endif
	initial_code = (unsigned long)wakeup_long64;
	saved_magic = 0x123456789abcdef0;
@@ -158,6 +153,10 @@ static int __init acpi_sleep_setup(char *str)
			acpi_realmode_flags |= 2;
		if (strncmp(str, "s3_beep", 7) == 0)
			acpi_realmode_flags |= 4;
+#ifdef CONFIG_HIBERNATION
+		if (strncmp(str, "s4_nohwsig", 10) == 0)
+			acpi_no_s4_hw_signature();
+#endif
		if (strncmp(str, "old_ordering", 12) == 0)
			acpi_old_suspend_ordering();
		str = strchr(str, ',');
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2763cb37b553..a84ac7b570e6 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -145,35 +145,25 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
 extern char __vsyscall_0;
 const unsigned char *const *find_nop_table(void)
 {
-	return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-	       boot_cpu_data.x86 < 6 ? k8_nops : p6_nops;
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+	    boot_cpu_has(X86_FEATURE_NOPL))
+		return p6_nops;
+	else
+		return k8_nops;
 }
 
 #else /* CONFIG_X86_64 */
 
-static const struct nop {
-	int cpuid;
-	const unsigned char *const *noptable;
-} noptypes[] = {
-	{ X86_FEATURE_K8, k8_nops },
-	{ X86_FEATURE_K7, k7_nops },
-	{ X86_FEATURE_P4, p6_nops },
-	{ X86_FEATURE_P3, p6_nops },
-	{ -1, NULL }
-};
-
 const unsigned char *const *find_nop_table(void)
 {
-	const unsigned char *const *noptable = intel_nops;
-	int i;
-
-	for (i = 0; noptypes[i].cpuid >= 0; i++) {
-		if (boot_cpu_has(noptypes[i].cpuid)) {
-			noptable = noptypes[i].noptable;
-			break;
-		}
-	}
-	return noptable;
+	if (boot_cpu_has(X86_FEATURE_K8))
+		return k8_nops;
+	else if (boot_cpu_has(X86_FEATURE_K7))
+		return k7_nops;
+	else if (boot_cpu_has(X86_FEATURE_NOPL))
+		return p6_nops;
+	else
+		return intel_nops;
 }
 
 #endif /* CONFIG_X86_64 */
@@ -241,25 +231,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
			continue;
		if (*ptr > text_end)
			continue;
-		text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */
+		/* turn DS segment override prefix into lock prefix */
+		text_poke(*ptr, ((unsigned char []){0xf0}), 1);
	};
 }
 
 static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 {
	u8 **ptr;
-	char insn[1];
 
	if (noreplace_smp)
		return;
 
-	add_nops(insn, 1);
	for (ptr = start; ptr < end; ptr++) {
		if (*ptr < text)
			continue;
		if (*ptr > text_end)
			continue;
-		text_poke(*ptr, insn, 1);
+		/* turn lock prefix into DS segment override prefix */
+		text_poke(*ptr, ((unsigned char []){0x3E}), 1);
	};
 }
 
@@ -454,7 +444,7 @@ void __init alternative_instructions(void)
					    _text, _etext);
 
		/* Only switch to UP mode if we don't immediately boot others */
-		if (num_possible_cpus() == 1 || setup_max_cpus <= 1)
+		if (num_present_cpus() == 1 || setup_max_cpus <= 1)
			alternatives_smp_switch(0);
	}
 #endif
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f2766d84c7a0..a8fd9ebdc8e2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -23,36 +23,149 @@
23#include <linux/scatterlist.h> 23#include <linux/scatterlist.h>
24#include <linux/iommu-helper.h> 24#include <linux/iommu-helper.h>
25#include <asm/proto.h> 25#include <asm/proto.h>
26#include <asm/gart.h> 26#include <asm/iommu.h>
27#include <asm/amd_iommu_types.h> 27#include <asm/amd_iommu_types.h>
28#include <asm/amd_iommu.h> 28#include <asm/amd_iommu.h>
29 29
30#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) 30#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
31 31
32#define to_pages(addr, size) \ 32#define EXIT_LOOP_COUNT 10000000
33 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
34 33
35static DEFINE_RWLOCK(amd_iommu_devtable_lock); 34static DEFINE_RWLOCK(amd_iommu_devtable_lock);
36 35
37struct command { 36/* A list of preallocated protection domains */
37static LIST_HEAD(iommu_pd_list);
38static DEFINE_SPINLOCK(iommu_pd_list_lock);
39
40/*
41 * general struct to manage commands send to an IOMMU
42 */
43struct iommu_cmd {
38 u32 data[4]; 44 u32 data[4];
39}; 45};
40 46
41static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 47static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
42 struct unity_map_entry *e); 48 struct unity_map_entry *e);
43 49
50/* returns !0 if the IOMMU is caching non-present entries in its TLB */
44static int iommu_has_npcache(struct amd_iommu *iommu) 51static int iommu_has_npcache(struct amd_iommu *iommu)
45{ 52{
46 return iommu->cap & IOMMU_CAP_NPCACHE; 53 return iommu->cap & IOMMU_CAP_NPCACHE;
47} 54}
48 55
49static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) 56/****************************************************************************
57 *
58 * Interrupt handling functions
59 *
60 ****************************************************************************/
61
62static void iommu_print_event(void *__evt)
63{
64 u32 *event = __evt;
65 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
66 int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
67 int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
68 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
69 u64 address = (u64)(((u64)event[3]) << 32) | event[2];
70
71 printk(KERN_ERR "AMD IOMMU: Event logged [");
72
73 switch (type) {
74 case EVENT_TYPE_ILL_DEV:
75 printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
76 "address=0x%016llx flags=0x%04x]\n",
77 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
78 address, flags);
79 break;
80 case EVENT_TYPE_IO_FAULT:
81 printk("IO_PAGE_FAULT device=%02x:%02x.%x "
82 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
83 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
84 domid, address, flags);
85 break;
86 case EVENT_TYPE_DEV_TAB_ERR:
87 printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
88 "address=0x%016llx flags=0x%04x]\n",
89 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
90 address, flags);
91 break;
92 case EVENT_TYPE_PAGE_TAB_ERR:
93 printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
94 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
95 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
96 domid, address, flags);
97 break;
98 case EVENT_TYPE_ILL_CMD:
99 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
100 break;
101 case EVENT_TYPE_CMD_HARD_ERR:
102 printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
103 "flags=0x%04x]\n", address, flags);
104 break;
105 case EVENT_TYPE_IOTLB_INV_TO:
106 printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
107 "address=0x%016llx]\n",
108 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
109 address);
110 break;
111 case EVENT_TYPE_INV_DEV_REQ:
112 printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
113 "address=0x%016llx flags=0x%04x]\n",
114 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
115 address, flags);
116 break;
117 default:
118 printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
119 }
120}
121
122static void iommu_poll_events(struct amd_iommu *iommu)
123{
124 u32 head, tail;
125 unsigned long flags;
126
127 spin_lock_irqsave(&iommu->lock, flags);
128
129 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
130 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
131
132 while (head != tail) {
133 iommu_print_event(iommu->evt_buf + head);
134 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
135 }
136
137 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
138
139 spin_unlock_irqrestore(&iommu->lock, flags);
140}
141
142irqreturn_t amd_iommu_int_handler(int irq, void *data)
143{
144 struct amd_iommu *iommu;
145
146 list_for_each_entry(iommu, &amd_iommu_list, list)
147 iommu_poll_events(iommu);
148
149 return IRQ_HANDLED;
150}
151
152/****************************************************************************
153 *
154 * IOMMU command queuing functions
155 *
156 ****************************************************************************/
157
158/*
159 * Writes the command to the IOMMUs command buffer and informs the
160 * hardware about the new command. Must be called with iommu->lock held.
161 */
162static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
50{ 163{
51 u32 tail, head; 164 u32 tail, head;
52 u8 *target; 165 u8 *target;
53 166
54 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 167 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
55 target = (iommu->cmd_buf + tail); 168 target = iommu->cmd_buf + tail;
56 memcpy_toio(target, cmd, sizeof(*cmd)); 169 memcpy_toio(target, cmd, sizeof(*cmd));
57 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; 170 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
58 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 171 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
@@ -63,7 +176,11 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
63 return 0; 176 return 0;
64} 177}
65 178
66static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) 179/*
180 * General queuing function for commands. Takes iommu->lock and calls
181 * __iommu_queue_command().
182 */
183static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
67{ 184{
68 unsigned long flags; 185 unsigned long flags;
69 int ret; 186 int ret;
@@ -75,35 +192,59 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
75 return ret; 192 return ret;
76} 193}
77 194
195/*
196 * This function is called whenever we need to ensure that the IOMMU has
197 * completed execution of all commands we sent. It sends a
198 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
199 * us about that by writing a value to a physical address we pass with
200 * the command.
201 */
78static int iommu_completion_wait(struct amd_iommu *iommu) 202static int iommu_completion_wait(struct amd_iommu *iommu)
79{ 203{
80 int ret; 204 int ret = 0, ready = 0;
81 struct command cmd; 205 unsigned status = 0;
82 volatile u64 ready = 0; 206 struct iommu_cmd cmd;
83 unsigned long ready_phys = virt_to_phys(&ready); 207 unsigned long flags, i = 0;
84 208
85 memset(&cmd, 0, sizeof(cmd)); 209 memset(&cmd, 0, sizeof(cmd));
86 cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; 210 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
87 cmd.data[1] = HIGH_U32(ready_phys);
88 cmd.data[2] = 1; /* value written to 'ready' */
89 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); 211 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
90 212
91 iommu->need_sync = 0; 213 iommu->need_sync = 0;
92 214
93 ret = iommu_queue_command(iommu, &cmd); 215 spin_lock_irqsave(&iommu->lock, flags);
216
217 ret = __iommu_queue_command(iommu, &cmd);
94 218
95 if (ret) 219 if (ret)
96 return ret; 220 goto out;
221
222 while (!ready && (i < EXIT_LOOP_COUNT)) {
223 ++i;
224 /* wait for the bit to become one */
225 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
226 ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
227 }
228
229 /* set bit back to zero */
230 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
231 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
97 232
98 while (!ready) 233 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
99 cpu_relax(); 234 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
235out:
236 spin_unlock_irqrestore(&iommu->lock, flags);
100 237
101 return 0; 238 return 0;
102} 239}
103 240
241/*
242 * Command send function for invalidating a device table entry
243 */
104static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 244static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
105{ 245{
106 struct command cmd; 246 struct iommu_cmd cmd;
247 int ret;
107 248
108 BUG_ON(iommu == NULL); 249 BUG_ON(iommu == NULL);
109 250
@@ -111,37 +252,50 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
111 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); 252 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
112 cmd.data[0] = devid; 253 cmd.data[0] = devid;
113 254
255 ret = iommu_queue_command(iommu, &cmd);
256
114 iommu->need_sync = 1; 257 iommu->need_sync = 1;
115 258
116 return iommu_queue_command(iommu, &cmd); 259 return ret;
117} 260}
118 261
262/*
263 * Generic command send function for invalidaing TLB entries
264 */
119static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, 265static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
120 u64 address, u16 domid, int pde, int s) 266 u64 address, u16 domid, int pde, int s)
121{ 267{
122 struct command cmd; 268 struct iommu_cmd cmd;
269 int ret;
123 270
124 memset(&cmd, 0, sizeof(cmd)); 271 memset(&cmd, 0, sizeof(cmd));
125 address &= PAGE_MASK; 272 address &= PAGE_MASK;
126 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); 273 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
127 cmd.data[1] |= domid; 274 cmd.data[1] |= domid;
128 cmd.data[2] = LOW_U32(address); 275 cmd.data[2] = lower_32_bits(address);
129 cmd.data[3] = HIGH_U32(address); 276 cmd.data[3] = upper_32_bits(address);
130 if (s) 277 if (s) /* size bit - we flush more than one 4kb page */
131 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; 278 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
132 if (pde) 279 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
133 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; 280 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
134 281
282 ret = iommu_queue_command(iommu, &cmd);
283
135 iommu->need_sync = 1; 284 iommu->need_sync = 1;
136 285
137 return iommu_queue_command(iommu, &cmd); 286 return ret;
138} 287}
139 288
289/*
290 * TLB invalidation function which is called from the mapping functions.
291 * It invalidates a single PTE if the range to flush is within a single
292 * page. Otherwise it flushes the whole TLB of the IOMMU.
293 */
140static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, 294static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
141 u64 address, size_t size) 295 u64 address, size_t size)
142{ 296{
143 int s = 0; 297 int s = 0;
144 unsigned pages = to_pages(address, size); 298 unsigned pages = iommu_num_pages(address, size, PAGE_SIZE);
145 299
146 address &= PAGE_MASK; 300 address &= PAGE_MASK;
147 301
@@ -159,6 +313,28 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
159 return 0; 313 return 0;
160} 314}
161 315
316/* Flush the whole IO/TLB for a given protection domain */
317static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
318{
319 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
320
321 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
322}
323
324/****************************************************************************
325 *
326 * The functions below are used the create the page table mappings for
327 * unity mapped regions.
328 *
329 ****************************************************************************/
330
331/*
332 * Generic mapping functions. It maps a physical address into a DMA
333 * address space. It allocates the page table pages if necessary.
334 * In the future it can be extended to a generic mapping function
335 * supporting all features of AMD IOMMU page tables like level skipping
336 * and full 64 bit address spaces.
337 */
162static int iommu_map(struct protection_domain *dom, 338static int iommu_map(struct protection_domain *dom,
163 unsigned long bus_addr, 339 unsigned long bus_addr,
164 unsigned long phys_addr, 340 unsigned long phys_addr,
@@ -209,6 +385,10 @@ static int iommu_map(struct protection_domain *dom,
209 return 0; 385 return 0;
210} 386}
211 387
388/*
389 * This function checks if a specific unity mapping entry is needed for
390 * this specific IOMMU.
391 */
212static int iommu_for_unity_map(struct amd_iommu *iommu, 392static int iommu_for_unity_map(struct amd_iommu *iommu,
213 struct unity_map_entry *entry) 393 struct unity_map_entry *entry)
214{ 394{
@@ -223,6 +403,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
223 return 0; 403 return 0;
224} 404}
225 405
406/*
407 * Init the unity mappings for a specific IOMMU in the system
408 *
409 * Basically iterates over all unity mapping entries and applies them to
410 * the default domain DMA of that IOMMU if necessary.
411 */
226static int iommu_init_unity_mappings(struct amd_iommu *iommu) 412static int iommu_init_unity_mappings(struct amd_iommu *iommu)
227{ 413{
228 struct unity_map_entry *entry; 414 struct unity_map_entry *entry;
@@ -239,6 +425,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu)
239 return 0; 425 return 0;
240} 426}
241 427
428/*
429 * This function actually applies the mapping to the page table of the
430 * dma_ops domain.
431 */
242static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 432static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
243 struct unity_map_entry *e) 433 struct unity_map_entry *e)
244{ 434{
@@ -261,6 +451,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
261 return 0; 451 return 0;
262} 452}
263 453
454/*
455 * Inits the unity mappings required for a specific device
456 */
264static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, 457static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
265 u16 devid) 458 u16 devid)
266{ 459{
@@ -278,33 +471,48 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
278 return 0; 471 return 0;
279} 472}
280 473
281static unsigned long dma_mask_to_pages(unsigned long mask) 474/****************************************************************************
282{ 475 *
283 return (mask >> PAGE_SHIFT) + 476 * The next functions belong to the address allocator for the dma_ops
284 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); 477 * interface functions. They work like the allocators in the other IOMMU
285} 478 * drivers. Its basically a bitmap which marks the allocated pages in
479 * the aperture. Maybe it could be enhanced in the future to a more
480 * efficient allocator.
481 *
482 ****************************************************************************/
286 483
484/*
485 * The address allocator core function.
486 *
487 * called with domain->lock held
488 */
287static unsigned long dma_ops_alloc_addresses(struct device *dev, 489static unsigned long dma_ops_alloc_addresses(struct device *dev,
288 struct dma_ops_domain *dom, 490 struct dma_ops_domain *dom,
289 unsigned int pages) 491 unsigned int pages,
492 unsigned long align_mask,
493 u64 dma_mask)
290{ 494{
291 unsigned long limit = dma_mask_to_pages(*dev->dma_mask); 495 unsigned long limit;
292 unsigned long address; 496 unsigned long address;
293 unsigned long size = dom->aperture_size >> PAGE_SHIFT;
294 unsigned long boundary_size; 497 unsigned long boundary_size;
295 498
296 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 499 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
297 PAGE_SIZE) >> PAGE_SHIFT; 500 PAGE_SIZE) >> PAGE_SHIFT;
298 limit = limit < size ? limit : size; 501 limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
502 dma_mask >> PAGE_SHIFT);
299 503
300 if (dom->next_bit >= limit) 504 if (dom->next_bit >= limit) {
301 dom->next_bit = 0; 505 dom->next_bit = 0;
506 dom->need_flush = true;
507 }
302 508
303 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, 509 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
304 0 , boundary_size, 0); 510 0 , boundary_size, align_mask);
305 if (address == -1) 511 if (address == -1) {
306 address = iommu_area_alloc(dom->bitmap, limit, 0, pages, 512 address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
307 0, boundary_size, 0); 513 0, boundary_size, align_mask);
514 dom->need_flush = true;
515 }
308 516
309 if (likely(address != -1)) { 517 if (likely(address != -1)) {
310 dom->next_bit = address + pages; 518 dom->next_bit = address + pages;
@@ -317,6 +525,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
317 return address; 525 return address;
318} 526}
319 527
528/*
529 * The address free function.
530 *
531 * called with domain->lock held
532 */
320static void dma_ops_free_addresses(struct dma_ops_domain *dom, 533static void dma_ops_free_addresses(struct dma_ops_domain *dom,
321 unsigned long address, 534 unsigned long address,
322 unsigned int pages) 535 unsigned int pages)
@@ -325,6 +538,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
325 iommu_area_free(dom->bitmap, address, pages); 538 iommu_area_free(dom->bitmap, address, pages);
326} 539}
327 540
541/****************************************************************************
542 *
543 * The next functions belong to the domain allocation. A domain is
544 * allocated for every IOMMU as the default domain. If device isolation
545 * is enabled, every device get its own domain. The most important thing
546 * about domains is the page table mapping the DMA address space they
547 * contain.
548 *
549 ****************************************************************************/
550
328static u16 domain_id_alloc(void) 551static u16 domain_id_alloc(void)
329{ 552{
330 unsigned long flags; 553 unsigned long flags;
@@ -342,6 +565,10 @@ static u16 domain_id_alloc(void)
342 return id; 565 return id;
343} 566}
344 567
568/*
569 * Used to reserve address ranges in the aperture (e.g. for exclusion
570 * ranges.
571 */
345static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, 572static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
346 unsigned long start_page, 573 unsigned long start_page,
347 unsigned int pages) 574 unsigned int pages)
@@ -351,7 +578,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
351 if (start_page + pages > last_page) 578 if (start_page + pages > last_page)
352 pages = last_page - start_page; 579 pages = last_page - start_page;
353 580
354 set_bit_string(dom->bitmap, start_page, pages); 581 iommu_area_reserve(dom->bitmap, start_page, pages);
355} 582}
356 583
357static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) 584static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
@@ -382,6 +609,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
382 free_page((unsigned long)p1); 609 free_page((unsigned long)p1);
383} 610}
384 611
612/*
613 * Free a domain, only used if something went wrong in the
614 * allocation path and we need to free an already allocated page table
615 */
385static void dma_ops_domain_free(struct dma_ops_domain *dom) 616static void dma_ops_domain_free(struct dma_ops_domain *dom)
386{ 617{
387 if (!dom) 618 if (!dom)
@@ -396,6 +627,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
396 kfree(dom); 627 kfree(dom);
397} 628}
398 629
630/*
631 * Allocates a new protection domain usable for the dma_ops functions.
632 * It also intializes the page table and the address allocator data
633 * structures required for the dma_ops interface
634 */
399static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, 635static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
400 unsigned order) 636 unsigned order)
401{ 637{
@@ -436,14 +672,24 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
436 dma_dom->bitmap[0] = 1; 672 dma_dom->bitmap[0] = 1;
437 dma_dom->next_bit = 0; 673 dma_dom->next_bit = 0;
438 674
675 dma_dom->need_flush = false;
676 dma_dom->target_dev = 0xffff;
677
678 /* Intialize the exclusion range if necessary */
439 if (iommu->exclusion_start && 679 if (iommu->exclusion_start &&
440 iommu->exclusion_start < dma_dom->aperture_size) { 680 iommu->exclusion_start < dma_dom->aperture_size) {
441 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 681 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
442 int pages = to_pages(iommu->exclusion_start, 682 int pages = iommu_num_pages(iommu->exclusion_start,
443 iommu->exclusion_length); 683 iommu->exclusion_length,
684 PAGE_SIZE);
444 dma_ops_reserve_addresses(dma_dom, startpage, pages); 685 dma_ops_reserve_addresses(dma_dom, startpage, pages);
445 } 686 }
446 687
688 /*
689 * At the last step, build the page tables so we don't need to
690 * allocate page table pages in the dma_ops mapping/unmapping
691 * path.
692 */
447 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); 693 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
448 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), 694 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
449 GFP_KERNEL); 695 GFP_KERNEL);
@@ -472,6 +718,10 @@ free_dma_dom:
472 return NULL; 718 return NULL;
473} 719}
474 720
721/*
722 * Find out the protection domain structure for a given PCI device. This
723 * will give us the pointer to the page table root for example.
724 */
475static struct protection_domain *domain_for_device(u16 devid) 725static struct protection_domain *domain_for_device(u16 devid)
476{ 726{
477 struct protection_domain *dom; 727 struct protection_domain *dom;
@@ -484,6 +734,10 @@ static struct protection_domain *domain_for_device(u16 devid)
484 return dom; 734 return dom;
485} 735}
486 736
737/*
738 * If a device is not yet associated with a domain, this function does
739 * assigns it visible for the hardware
740 */
487static void set_device_domain(struct amd_iommu *iommu, 741static void set_device_domain(struct amd_iommu *iommu,
488 struct protection_domain *domain, 742 struct protection_domain *domain,
489 u16 devid) 743 u16 devid)
@@ -492,12 +746,13 @@ static void set_device_domain(struct amd_iommu *iommu,
492 746
493 u64 pte_root = virt_to_phys(domain->pt_root); 747 u64 pte_root = virt_to_phys(domain->pt_root);
494 748
495 pte_root |= (domain->mode & 0x07) << 9; 749 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
496 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2; 750 << DEV_ENTRY_MODE_SHIFT;
751 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
497 752
498 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 753 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
499 amd_iommu_dev_table[devid].data[0] = pte_root; 754 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
500 amd_iommu_dev_table[devid].data[1] = pte_root >> 32; 755 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
501 amd_iommu_dev_table[devid].data[2] = domain->id; 756 amd_iommu_dev_table[devid].data[2] = domain->id;
502 757
503 amd_iommu_pd_table[devid] = domain; 758 amd_iommu_pd_table[devid] = domain;
@@ -508,6 +763,58 @@ static void set_device_domain(struct amd_iommu *iommu,
508 iommu->need_sync = 1; 763 iommu->need_sync = 1;
509} 764}
510 765
766/*****************************************************************************
767 *
768 * The next functions belong to the dma_ops mapping/unmapping code.
769 *
770 *****************************************************************************/
771
772/*
773 * This function checks if the driver got a valid device from the caller to
774 * avoid dereferencing invalid pointers.
775 */
776static bool check_device(struct device *dev)
777{
778 if (!dev || !dev->dma_mask)
779 return false;
780
781 return true;
782}
783
784/*
785 * In this function the list of preallocated protection domains is traversed to
786 * find the domain for a specific device
787 */
788static struct dma_ops_domain *find_protection_domain(u16 devid)
789{
790 struct dma_ops_domain *entry, *ret = NULL;
791 unsigned long flags;
792
793 if (list_empty(&iommu_pd_list))
794 return NULL;
795
796 spin_lock_irqsave(&iommu_pd_list_lock, flags);
797
798 list_for_each_entry(entry, &iommu_pd_list, list) {
799 if (entry->target_dev == devid) {
800 ret = entry;
801 list_del(&ret->list);
802 break;
803 }
804 }
805
806 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
807
808 return ret;
809}
810
811/*
812 * In the dma_ops path we only have the struct device. This function
813 * finds the corresponding IOMMU, the protection domain and the
814 * requestor id for a given device.
815 * If the device is not yet associated with a domain this is also done
816 * in this function.
817 */
511static int get_device_resources(struct device *dev, 818static int get_device_resources(struct device *dev,
512 struct amd_iommu **iommu, 819 struct amd_iommu **iommu,
513 struct protection_domain **domain, 820 struct protection_domain **domain,
@@ -517,26 +824,30 @@ static int get_device_resources(struct device *dev,
517 struct pci_dev *pcidev; 824 struct pci_dev *pcidev;
518 u16 _bdf; 825 u16 _bdf;
519 826
520 BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); 827 *iommu = NULL;
828 *domain = NULL;
829 *bdf = 0xffff;
830
831 if (dev->bus != &pci_bus_type)
832 return 0;
521 833
522 pcidev = to_pci_dev(dev); 834 pcidev = to_pci_dev(dev);
523 _bdf = (pcidev->bus->number << 8) | pcidev->devfn; 835 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
524 836
525 if (_bdf >= amd_iommu_last_bdf) { 837 /* device not translated by any IOMMU in the system? */
526 *iommu = NULL; 838 if (_bdf > amd_iommu_last_bdf)
527 *domain = NULL;
528 *bdf = 0xffff;
529 return 0; 839 return 0;
530 }
531 840
532 *bdf = amd_iommu_alias_table[_bdf]; 841 *bdf = amd_iommu_alias_table[_bdf];
533 842
534 *iommu = amd_iommu_rlookup_table[*bdf]; 843 *iommu = amd_iommu_rlookup_table[*bdf];
535 if (*iommu == NULL) 844 if (*iommu == NULL)
536 return 0; 845 return 0;
537 dma_dom = (*iommu)->default_dom;
538 *domain = domain_for_device(*bdf); 846 *domain = domain_for_device(*bdf);
539 if (*domain == NULL) { 847 if (*domain == NULL) {
848 dma_dom = find_protection_domain(*bdf);
849 if (!dma_dom)
850 dma_dom = (*iommu)->default_dom;
540 *domain = &dma_dom->domain; 851 *domain = &dma_dom->domain;
541 set_device_domain(*iommu, *domain, *bdf); 852 set_device_domain(*iommu, *domain, *bdf);
542 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " 853 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
@@ -547,6 +858,10 @@ static int get_device_resources(struct device *dev,
547 return 1; 858 return 1;
548} 859}
549 860
861/*
862 * This is the generic map function. It maps one 4kb page at paddr to
863 * the given address in the DMA address space for the domain.
864 */
550static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, 865static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
551 struct dma_ops_domain *dom, 866 struct dma_ops_domain *dom,
552 unsigned long address, 867 unsigned long address,
@@ -578,6 +893,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
578 return (dma_addr_t)address; 893 return (dma_addr_t)address;
579} 894}
580 895
896/*
897 * The generic unmapping function for on page in the DMA address space.
898 */
581static void dma_ops_domain_unmap(struct amd_iommu *iommu, 899static void dma_ops_domain_unmap(struct amd_iommu *iommu,
582 struct dma_ops_domain *dom, 900 struct dma_ops_domain *dom,
583 unsigned long address) 901 unsigned long address)
@@ -597,22 +915,35 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
597 *pte = 0ULL; 915 *pte = 0ULL;
598} 916}
599 917
918/*
919 * This function contains common code for mapping of a physically
920 * contiguous memory region into DMA address space. It is uses by all
921 * mapping functions provided by this IOMMU driver.
922 * Must be called with the domain lock held.
923 */
600static dma_addr_t __map_single(struct device *dev, 924static dma_addr_t __map_single(struct device *dev,
601 struct amd_iommu *iommu, 925 struct amd_iommu *iommu,
602 struct dma_ops_domain *dma_dom, 926 struct dma_ops_domain *dma_dom,
603 phys_addr_t paddr, 927 phys_addr_t paddr,
604 size_t size, 928 size_t size,
605 int dir) 929 int dir,
930 bool align,
931 u64 dma_mask)
606{ 932{
607 dma_addr_t offset = paddr & ~PAGE_MASK; 933 dma_addr_t offset = paddr & ~PAGE_MASK;
608 dma_addr_t address, start; 934 dma_addr_t address, start;
609 unsigned int pages; 935 unsigned int pages;
936 unsigned long align_mask = 0;
610 int i; 937 int i;
611 938
612 pages = to_pages(paddr, size); 939 pages = iommu_num_pages(paddr, size, PAGE_SIZE);
613 paddr &= PAGE_MASK; 940 paddr &= PAGE_MASK;
614 941
615 address = dma_ops_alloc_addresses(dev, dma_dom, pages); 942 if (align)
943 align_mask = (1UL << get_order(size)) - 1;
944
945 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
946 dma_mask);
616 if (unlikely(address == bad_dma_address)) 947 if (unlikely(address == bad_dma_address))
617 goto out; 948 goto out;
618 949
@@ -624,10 +955,20 @@ static dma_addr_t __map_single(struct device *dev,
624 } 955 }
625 address += offset; 956 address += offset;
626 957
958 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
959 iommu_flush_tlb(iommu, dma_dom->domain.id);
960 dma_dom->need_flush = false;
961 } else if (unlikely(iommu_has_npcache(iommu)))
962 iommu_flush_pages(iommu, dma_dom->domain.id, address, size);
963
627out: 964out:
628 return address; 965 return address;
629} 966}
630 967
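Note that __map_single() is called with align=true only for coherent allocations, where the DMA address must be naturally aligned to the allocation size. A small sketch of what the align_mask computed above means, assuming the usual 4 KiB page size:

	#include <stdio.h>

	#define PAGE_SHIFT 12
	#define PAGE_SIZE  (1UL << PAGE_SHIFT)

	int main(void)
	{
		unsigned long size = 16384;                        /* a 16 KiB coherent buffer */
		unsigned long pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
		unsigned long order = 0, align_mask;

		while ((1UL << order) < pages)                     /* order of the allocation */
			order++;
		align_mask = (1UL << order) - 1;                   /* same value as (1UL << get_order(size)) - 1 */

		/* the allocator must return a page number with all align_mask bits clear */
		printf("pages=%lu order=%lu align_mask=0x%lx\n", pages, order, align_mask);
		return 0;
	}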
968/*
969 * Does the reverse of the __map_single function. Must be called with
970 * the domain lock held too
971 */
631static void __unmap_single(struct amd_iommu *iommu, 972static void __unmap_single(struct amd_iommu *iommu,
632 struct dma_ops_domain *dma_dom, 973 struct dma_ops_domain *dma_dom,
633 dma_addr_t dma_addr, 974 dma_addr_t dma_addr,
@@ -640,7 +981,7 @@ static void __unmap_single(struct amd_iommu *iommu,
640 if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) 981 if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
641 return; 982 return;
642 983
643 pages = to_pages(dma_addr, size); 984 pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
644 dma_addr &= PAGE_MASK; 985 dma_addr &= PAGE_MASK;
645 start = dma_addr; 986 start = dma_addr;
646 987
@@ -650,8 +991,14 @@ static void __unmap_single(struct amd_iommu *iommu,
650 } 991 }
651 992
652 dma_ops_free_addresses(dma_dom, dma_addr, pages); 993 dma_ops_free_addresses(dma_dom, dma_addr, pages);
994
995 if (amd_iommu_unmap_flush)
996 iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
653} 997}
654 998
999/*
1000 * The exported map_single function for dma_ops.
1001 */
655static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, 1002static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
656 size_t size, int dir) 1003 size_t size, int dir)
657{ 1004{
@@ -660,21 +1007,26 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
660 struct protection_domain *domain; 1007 struct protection_domain *domain;
661 u16 devid; 1008 u16 devid;
662 dma_addr_t addr; 1009 dma_addr_t addr;
1010 u64 dma_mask;
1011
1012 if (!check_device(dev))
1013 return bad_dma_address;
1014
1015 dma_mask = *dev->dma_mask;
663 1016
664 get_device_resources(dev, &iommu, &domain, &devid); 1017 get_device_resources(dev, &iommu, &domain, &devid);
665 1018
666 if (iommu == NULL || domain == NULL) 1019 if (iommu == NULL || domain == NULL)
1020 /* device not handled by any AMD IOMMU */
667 return (dma_addr_t)paddr; 1021 return (dma_addr_t)paddr;
668 1022
669 spin_lock_irqsave(&domain->lock, flags); 1023 spin_lock_irqsave(&domain->lock, flags);
670 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir); 1024 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
1025 dma_mask);
671 if (addr == bad_dma_address) 1026 if (addr == bad_dma_address)
672 goto out; 1027 goto out;
673 1028
674 if (iommu_has_npcache(iommu)) 1029 if (unlikely(iommu->need_sync))
675 iommu_flush_pages(iommu, domain->id, addr, size);
676
677 if (iommu->need_sync)
678 iommu_completion_wait(iommu); 1030 iommu_completion_wait(iommu);
679 1031
680out: 1032out:
@@ -683,6 +1035,9 @@ out:
683 return addr; 1035 return addr;
684} 1036}
685 1037
1038/*
1039 * The exported unmap_single function for dma_ops.
1040 */
686static void unmap_single(struct device *dev, dma_addr_t dma_addr, 1041static void unmap_single(struct device *dev, dma_addr_t dma_addr,
687 size_t size, int dir) 1042 size_t size, int dir)
688{ 1043{
@@ -691,21 +1046,25 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
691 struct protection_domain *domain; 1046 struct protection_domain *domain;
692 u16 devid; 1047 u16 devid;
693 1048
694 if (!get_device_resources(dev, &iommu, &domain, &devid)) 1049 if (!check_device(dev) ||
1050 !get_device_resources(dev, &iommu, &domain, &devid))
1051 /* device not handled by any AMD IOMMU */
695 return; 1052 return;
696 1053
697 spin_lock_irqsave(&domain->lock, flags); 1054 spin_lock_irqsave(&domain->lock, flags);
698 1055
699 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1056 __unmap_single(iommu, domain->priv, dma_addr, size, dir);
700 1057
701 iommu_flush_pages(iommu, domain->id, dma_addr, size); 1058 if (unlikely(iommu->need_sync))
702
703 if (iommu->need_sync)
704 iommu_completion_wait(iommu); 1059 iommu_completion_wait(iommu);
705 1060
706 spin_unlock_irqrestore(&domain->lock, flags); 1061 spin_unlock_irqrestore(&domain->lock, flags);
707} 1062}
708 1063
1064/*
 1065 * This is a special map_sg function which is used when we have to map a
 1066 * device which is not handled by any AMD IOMMU in the system.
1067 */
709static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, 1068static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
710 int nelems, int dir) 1069 int nelems, int dir)
711{ 1070{
@@ -720,6 +1079,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
720 return nelems; 1079 return nelems;
721} 1080}
722 1081
1082/*
1083 * The exported map_sg function for dma_ops (handles scatter-gather
1084 * lists).
1085 */
723static int map_sg(struct device *dev, struct scatterlist *sglist, 1086static int map_sg(struct device *dev, struct scatterlist *sglist,
724 int nelems, int dir) 1087 int nelems, int dir)
725{ 1088{
@@ -731,6 +1094,12 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
731 struct scatterlist *s; 1094 struct scatterlist *s;
732 phys_addr_t paddr; 1095 phys_addr_t paddr;
733 int mapped_elems = 0; 1096 int mapped_elems = 0;
1097 u64 dma_mask;
1098
1099 if (!check_device(dev))
1100 return 0;
1101
1102 dma_mask = *dev->dma_mask;
734 1103
735 get_device_resources(dev, &iommu, &domain, &devid); 1104 get_device_resources(dev, &iommu, &domain, &devid);
736 1105
@@ -743,19 +1112,17 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
743 paddr = sg_phys(s); 1112 paddr = sg_phys(s);
744 1113
745 s->dma_address = __map_single(dev, iommu, domain->priv, 1114 s->dma_address = __map_single(dev, iommu, domain->priv,
746 paddr, s->length, dir); 1115 paddr, s->length, dir, false,
1116 dma_mask);
747 1117
748 if (s->dma_address) { 1118 if (s->dma_address) {
749 s->dma_length = s->length; 1119 s->dma_length = s->length;
750 mapped_elems++; 1120 mapped_elems++;
751 } else 1121 } else
752 goto unmap; 1122 goto unmap;
753 if (iommu_has_npcache(iommu))
754 iommu_flush_pages(iommu, domain->id, s->dma_address,
755 s->dma_length);
756 } 1123 }
757 1124
758 if (iommu->need_sync) 1125 if (unlikely(iommu->need_sync))
759 iommu_completion_wait(iommu); 1126 iommu_completion_wait(iommu);
760 1127
761out: 1128out:
@@ -775,6 +1142,10 @@ unmap:
775 goto out; 1142 goto out;
776} 1143}
777 1144
1145/*
 1146 * The exported unmap_sg function for dma_ops (handles scatter-gather
1147 * lists).
1148 */
778static void unmap_sg(struct device *dev, struct scatterlist *sglist, 1149static void unmap_sg(struct device *dev, struct scatterlist *sglist,
779 int nelems, int dir) 1150 int nelems, int dir)
780{ 1151{
@@ -785,7 +1156,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
785 u16 devid; 1156 u16 devid;
786 int i; 1157 int i;
787 1158
788 if (!get_device_resources(dev, &iommu, &domain, &devid)) 1159 if (!check_device(dev) ||
1160 !get_device_resources(dev, &iommu, &domain, &devid))
789 return; 1161 return;
790 1162
791 spin_lock_irqsave(&domain->lock, flags); 1163 spin_lock_irqsave(&domain->lock, flags);
@@ -793,17 +1165,18 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
793 for_each_sg(sglist, s, nelems, i) { 1165 for_each_sg(sglist, s, nelems, i) {
794 __unmap_single(iommu, domain->priv, s->dma_address, 1166 __unmap_single(iommu, domain->priv, s->dma_address,
795 s->dma_length, dir); 1167 s->dma_length, dir);
796 iommu_flush_pages(iommu, domain->id, s->dma_address,
797 s->dma_length);
798 s->dma_address = s->dma_length = 0; 1168 s->dma_address = s->dma_length = 0;
799 } 1169 }
800 1170
801 if (iommu->need_sync) 1171 if (unlikely(iommu->need_sync))
802 iommu_completion_wait(iommu); 1172 iommu_completion_wait(iommu);
803 1173
804 spin_unlock_irqrestore(&domain->lock, flags); 1174 spin_unlock_irqrestore(&domain->lock, flags);
805} 1175}
806 1176
1177/*
1178 * The exported alloc_coherent function for dma_ops.
1179 */
807static void *alloc_coherent(struct device *dev, size_t size, 1180static void *alloc_coherent(struct device *dev, size_t size,
808 dma_addr_t *dma_addr, gfp_t flag) 1181 dma_addr_t *dma_addr, gfp_t flag)
809{ 1182{
@@ -813,25 +1186,33 @@ static void *alloc_coherent(struct device *dev, size_t size,
813 struct protection_domain *domain; 1186 struct protection_domain *domain;
814 u16 devid; 1187 u16 devid;
815 phys_addr_t paddr; 1188 phys_addr_t paddr;
1189 u64 dma_mask = dev->coherent_dma_mask;
1190
1191 if (!check_device(dev))
1192 return NULL;
1193
1194 if (!get_device_resources(dev, &iommu, &domain, &devid))
1195 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
816 1196
1197 flag |= __GFP_ZERO;
817 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 1198 virt_addr = (void *)__get_free_pages(flag, get_order(size));
818 if (!virt_addr) 1199 if (!virt_addr)
819 return 0; 1200 return 0;
820 1201
821 memset(virt_addr, 0, size);
822 paddr = virt_to_phys(virt_addr); 1202 paddr = virt_to_phys(virt_addr);
823 1203
824 get_device_resources(dev, &iommu, &domain, &devid);
825
826 if (!iommu || !domain) { 1204 if (!iommu || !domain) {
827 *dma_addr = (dma_addr_t)paddr; 1205 *dma_addr = (dma_addr_t)paddr;
828 return virt_addr; 1206 return virt_addr;
829 } 1207 }
830 1208
1209 if (!dma_mask)
1210 dma_mask = *dev->dma_mask;
1211
831 spin_lock_irqsave(&domain->lock, flags); 1212 spin_lock_irqsave(&domain->lock, flags);
832 1213
833 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 1214 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
834 size, DMA_BIDIRECTIONAL); 1215 size, DMA_BIDIRECTIONAL, true, dma_mask);
835 1216
836 if (*dma_addr == bad_dma_address) { 1217 if (*dma_addr == bad_dma_address) {
837 free_pages((unsigned long)virt_addr, get_order(size)); 1218 free_pages((unsigned long)virt_addr, get_order(size));
@@ -839,10 +1220,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
839 goto out; 1220 goto out;
840 } 1221 }
841 1222
842 if (iommu_has_npcache(iommu)) 1223 if (unlikely(iommu->need_sync))
843 iommu_flush_pages(iommu, domain->id, *dma_addr, size);
844
845 if (iommu->need_sync)
846 iommu_completion_wait(iommu); 1224 iommu_completion_wait(iommu);
847 1225
848out: 1226out:
@@ -851,6 +1229,9 @@ out:
851 return virt_addr; 1229 return virt_addr;
852} 1230}
853 1231
1232/*
1233 * The exported free_coherent function for dma_ops.
1234 */
854static void free_coherent(struct device *dev, size_t size, 1235static void free_coherent(struct device *dev, size_t size,
855 void *virt_addr, dma_addr_t dma_addr) 1236 void *virt_addr, dma_addr_t dma_addr)
856{ 1237{
@@ -859,6 +1240,9 @@ static void free_coherent(struct device *dev, size_t size,
859 struct protection_domain *domain; 1240 struct protection_domain *domain;
860 u16 devid; 1241 u16 devid;
861 1242
1243 if (!check_device(dev))
1244 return;
1245
862 get_device_resources(dev, &iommu, &domain, &devid); 1246 get_device_resources(dev, &iommu, &domain, &devid);
863 1247
864 if (!iommu || !domain) 1248 if (!iommu || !domain)
@@ -867,9 +1251,8 @@ static void free_coherent(struct device *dev, size_t size,
867 spin_lock_irqsave(&domain->lock, flags); 1251 spin_lock_irqsave(&domain->lock, flags);
868 1252
869 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 1253 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
870 iommu_flush_pages(iommu, domain->id, dma_addr, size);
871 1254
872 if (iommu->need_sync) 1255 if (unlikely(iommu->need_sync))
873 iommu_completion_wait(iommu); 1256 iommu_completion_wait(iommu);
874 1257
875 spin_unlock_irqrestore(&domain->lock, flags); 1258 spin_unlock_irqrestore(&domain->lock, flags);
@@ -879,6 +1262,32 @@ free_mem:
879} 1262}
880 1263
881/* 1264/*
1265 * This function is called by the DMA layer to find out if we can handle a
1266 * particular device. It is part of the dma_ops.
1267 */
1268static int amd_iommu_dma_supported(struct device *dev, u64 mask)
1269{
1270 u16 bdf;
1271 struct pci_dev *pcidev;
1272
1273 /* No device or no PCI device */
1274 if (!dev || dev->bus != &pci_bus_type)
1275 return 0;
1276
1277 pcidev = to_pci_dev(dev);
1278
1279 bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
1280
1281 /* Out of our scope? */
1282 if (bdf > amd_iommu_last_bdf)
1283 return 0;
1284
1285 return 1;
1286}
1287
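The dma_supported check above reduces to composing the PCI bus and devfn into a 16-bit requestor id and comparing it against the largest id seen during ACPI parsing. A hypothetical standalone sketch of that arithmetic (calc_devid() is reproduced here as the usual (bus << 8) | devfn composition):

	#include <stdio.h>

	/* reproduced here: the (bus << 8) | devfn composition used by calc_devid() */
	static unsigned short calc_devid(unsigned int bus, unsigned int devfn)
	{
		return (bus << 8) | devfn;
	}

	int main(void)
	{
		unsigned short last_bdf = 0x3fff;                    /* assumed largest id from ACPI */
		unsigned short bdf = calc_devid(0x05, (3 << 3) | 0); /* PCI device 05:03.0 */

		printf("bdf=0x%04x in scope=%d\n", bdf, bdf <= last_bdf);
		return 0;
	}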
1288/*
1289 * The function for pre-allocating protection domains.
1290 *
 882 * If the driver core informed the DMA layer when a driver grabs a device 1291 * If the driver core informed the DMA layer when a driver grabs a device
 883 * we wouldn't need to preallocate the protection domains anymore. 1292 * we wouldn't need to preallocate the protection domains anymore.
884 * For now we have to. 1293 * For now we have to.
@@ -893,7 +1302,7 @@ void prealloc_protection_domains(void)
893 1302
894 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1303 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
895 devid = (dev->bus->number << 8) | dev->devfn; 1304 devid = (dev->bus->number << 8) | dev->devfn;
896 if (devid >= amd_iommu_last_bdf) 1305 if (devid > amd_iommu_last_bdf)
897 continue; 1306 continue;
898 devid = amd_iommu_alias_table[devid]; 1307 devid = amd_iommu_alias_table[devid];
899 if (domain_for_device(devid)) 1308 if (domain_for_device(devid))
@@ -905,10 +1314,9 @@ void prealloc_protection_domains(void)
905 if (!dma_dom) 1314 if (!dma_dom)
906 continue; 1315 continue;
907 init_unity_mappings_for_device(dma_dom, devid); 1316 init_unity_mappings_for_device(dma_dom, devid);
908 set_device_domain(iommu, &dma_dom->domain, devid); 1317 dma_dom->target_dev = devid;
909 printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ", 1318
910 dma_dom->domain.id); 1319 list_add_tail(&dma_dom->list, &iommu_pd_list);
911 print_devid(devid, 1);
912 } 1320 }
913} 1321}
914 1322
@@ -919,14 +1327,23 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
919 .unmap_single = unmap_single, 1327 .unmap_single = unmap_single,
920 .map_sg = map_sg, 1328 .map_sg = map_sg,
921 .unmap_sg = unmap_sg, 1329 .unmap_sg = unmap_sg,
1330 .dma_supported = amd_iommu_dma_supported,
922}; 1331};
923 1332
1333/*
 1334 * The function which glues the AMD IOMMU driver into dma_ops.
1335 */
924int __init amd_iommu_init_dma_ops(void) 1336int __init amd_iommu_init_dma_ops(void)
925{ 1337{
926 struct amd_iommu *iommu; 1338 struct amd_iommu *iommu;
927 int order = amd_iommu_aperture_order; 1339 int order = amd_iommu_aperture_order;
928 int ret; 1340 int ret;
929 1341
1342 /*
1343 * first allocate a default protection domain for every IOMMU we
1344 * found in the system. Devices not assigned to any other
1345 * protection domain will be assigned to the default one.
1346 */
930 list_for_each_entry(iommu, &amd_iommu_list, list) { 1347 list_for_each_entry(iommu, &amd_iommu_list, list) {
931 iommu->default_dom = dma_ops_domain_alloc(iommu, order); 1348 iommu->default_dom = dma_ops_domain_alloc(iommu, order);
932 if (iommu->default_dom == NULL) 1349 if (iommu->default_dom == NULL)
@@ -936,6 +1353,10 @@ int __init amd_iommu_init_dma_ops(void)
936 goto free_domains; 1353 goto free_domains;
937 } 1354 }
938 1355
1356 /*
1357 * If device isolation is enabled, pre-allocate the protection
1358 * domains for each device.
1359 */
939 if (amd_iommu_isolate) 1360 if (amd_iommu_isolate)
940 prealloc_protection_domains(); 1361 prealloc_protection_domains();
941 1362
@@ -947,6 +1368,7 @@ int __init amd_iommu_init_dma_ops(void)
947 gart_iommu_aperture = 0; 1368 gart_iommu_aperture = 0;
948#endif 1369#endif
949 1370
 1371 /* Finally make the driver visible to the device drivers */
950 dma_ops = &amd_iommu_dma_ops; 1372 dma_ops = &amd_iommu_dma_ops;
951 1373
952 return 0; 1374 return 0;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 2a13e430437d..0cdcda35a05f 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -22,23 +22,17 @@
22#include <linux/gfp.h> 22#include <linux/gfp.h>
23#include <linux/list.h> 23#include <linux/list.h>
24#include <linux/sysdev.h> 24#include <linux/sysdev.h>
25#include <linux/interrupt.h>
26#include <linux/msi.h>
25#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
26#include <asm/amd_iommu_types.h> 28#include <asm/amd_iommu_types.h>
27#include <asm/amd_iommu.h> 29#include <asm/amd_iommu.h>
28#include <asm/gart.h> 30#include <asm/iommu.h>
29 31
30/* 32/*
31 * definitions for the ACPI scanning code 33 * definitions for the ACPI scanning code
32 */ 34 */
33#define UPDATE_LAST_BDF(x) do {\
34 if ((x) > amd_iommu_last_bdf) \
35 amd_iommu_last_bdf = (x); \
36 } while (0);
37
38#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
39#define PCI_BUS(x) (((x) >> 8) & 0xff)
40#define IVRS_HEADER_LENGTH 48 35#define IVRS_HEADER_LENGTH 48
41#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x))))
42 36
43#define ACPI_IVHD_TYPE 0x10 37#define ACPI_IVHD_TYPE 0x10
44#define ACPI_IVMD_TYPE_ALL 0x20 38#define ACPI_IVMD_TYPE_ALL 0x20
@@ -71,6 +65,17 @@
71#define ACPI_DEVFLAG_LINT1 0x80 65#define ACPI_DEVFLAG_LINT1 0x80
72#define ACPI_DEVFLAG_ATSDIS 0x10000000 66#define ACPI_DEVFLAG_ATSDIS 0x10000000
73 67
68/*
69 * ACPI table definitions
70 *
71 * These data structures are laid over the table to parse the important values
72 * out of it.
73 */
74
75/*
76 * structure describing one IOMMU in the ACPI table. Typically followed by one
77 * or more ivhd_entrys.
78 */
74struct ivhd_header { 79struct ivhd_header {
75 u8 type; 80 u8 type;
76 u8 flags; 81 u8 flags;
@@ -83,6 +88,10 @@ struct ivhd_header {
83 u32 reserved; 88 u32 reserved;
84} __attribute__((packed)); 89} __attribute__((packed));
85 90
91/*
92 * A device entry describing which devices a specific IOMMU translates and
93 * which requestor ids they use.
94 */
86struct ivhd_entry { 95struct ivhd_entry {
87 u8 type; 96 u8 type;
88 u16 devid; 97 u16 devid;
@@ -90,6 +99,10 @@ struct ivhd_entry {
90 u32 ext; 99 u32 ext;
91} __attribute__((packed)); 100} __attribute__((packed));
92 101
102/*
103 * An AMD IOMMU memory definition structure. It defines things like exclusion
104 * ranges for devices and regions that should be unity mapped.
105 */
93struct ivmd_header { 106struct ivmd_header {
94 u8 type; 107 u8 type;
95 u8 flags; 108 u8 flags;
@@ -103,22 +116,81 @@ struct ivmd_header {
103 116
104static int __initdata amd_iommu_detected; 117static int __initdata amd_iommu_detected;
105 118
106u16 amd_iommu_last_bdf; 119u16 amd_iommu_last_bdf; /* largest PCI device id we have
107struct list_head amd_iommu_unity_map; 120 to handle */
108unsigned amd_iommu_aperture_order = 26; 121LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
109int amd_iommu_isolate; 122 we find in ACPI */
123unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
124int amd_iommu_isolate; /* if 1, device isolation is enabled */
125bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
126
127LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
128 system */
110 129
111struct list_head amd_iommu_list; 130/*
131 * Pointer to the device table which is shared by all AMD IOMMUs
132 * it is indexed by the PCI device id or the HT unit id and contains
133 * information about the domain the device belongs to as well as the
134 * page table root pointer.
135 */
112struct dev_table_entry *amd_iommu_dev_table; 136struct dev_table_entry *amd_iommu_dev_table;
137
138/*
139 * The alias table is a driver specific data structure which contains the
140 * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
141 * More than one device can share the same requestor id.
142 */
113u16 *amd_iommu_alias_table; 143u16 *amd_iommu_alias_table;
144
145/*
146 * The rlookup table is used to find the IOMMU which is responsible
147 * for a specific device. It is also indexed by the PCI device id.
148 */
114struct amd_iommu **amd_iommu_rlookup_table; 149struct amd_iommu **amd_iommu_rlookup_table;
150
151/*
152 * The pd table (protection domain table) is used to find the protection domain
153 * data structure a device belongs to. Indexed with the PCI device id too.
154 */
115struct protection_domain **amd_iommu_pd_table; 155struct protection_domain **amd_iommu_pd_table;
156
157/*
 158 * The AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap
159 * to know which ones are already in use.
160 */
116unsigned long *amd_iommu_pd_alloc_bitmap; 161unsigned long *amd_iommu_pd_alloc_bitmap;
117 162
118static u32 dev_table_size; 163static u32 dev_table_size; /* size of the device table */
119static u32 alias_table_size; 164static u32 alias_table_size; /* size of the alias table */
120static u32 rlookup_table_size; 165static u32 rlookup_table_size; /* size of the rlookup table */
166
167static inline void update_last_devid(u16 devid)
168{
169 if (devid > amd_iommu_last_bdf)
170 amd_iommu_last_bdf = devid;
171}
172
173static inline unsigned long tbl_size(int entry_size)
174{
175 unsigned shift = PAGE_SHIFT +
176 get_order(amd_iommu_last_bdf * entry_size);
177
178 return 1UL << shift;
179}
121 180
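tbl_size() sizes each shared table as a power-of-two number of pages large enough for amd_iommu_last_bdf entries. A worked sketch of the calculation for the device table (the 32-byte entry size is an assumption standing in for DEV_TABLE_ENTRY_SIZE; get_order() is a simplified reimplementation of the kernel helper):

	#include <stdio.h>

	#define PAGE_SHIFT 12

	/* simplified stand-in for the kernel's get_order() */
	static int get_order(unsigned long size)
	{
		int order = 0;

		size = (size - 1) >> PAGE_SHIFT;
		while (size) {
			order++;
			size >>= 1;
		}
		return order;
	}

	int main(void)
	{
		unsigned long last_bdf = 0xffff;   /* worst case: the full 16-bit id space      */
		unsigned long entry_size = 32;     /* assumed stand-in for DEV_TABLE_ENTRY_SIZE */
		unsigned long shift = PAGE_SHIFT + get_order(last_bdf * entry_size);

		printf("device table size: %lu KiB\n", (1UL << shift) >> 10);
		return 0;
	}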
181/****************************************************************************
182 *
183 * AMD IOMMU MMIO register space handling functions
184 *
185 * These functions are used to program the IOMMU device registers in
186 * MMIO space required for that driver.
187 *
188 ****************************************************************************/
189
190/*
 191 * This function sets the exclusion range in the IOMMU. DMA accesses to the
 192 * exclusion range are passed through untranslated.
193 */
122static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) 194static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
123{ 195{
124 u64 start = iommu->exclusion_start & PAGE_MASK; 196 u64 start = iommu->exclusion_start & PAGE_MASK;
@@ -137,9 +209,10 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
137 &entry, sizeof(entry)); 209 &entry, sizeof(entry));
138} 210}
139 211
212/* Programs the physical address of the device table into the IOMMU hardware */
140static void __init iommu_set_device_table(struct amd_iommu *iommu) 213static void __init iommu_set_device_table(struct amd_iommu *iommu)
141{ 214{
142 u32 entry; 215 u64 entry;
143 216
144 BUG_ON(iommu->mmio_base == NULL); 217 BUG_ON(iommu->mmio_base == NULL);
145 218
@@ -149,6 +222,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
149 &entry, sizeof(entry)); 222 &entry, sizeof(entry));
150} 223}
151 224
225/* Generic functions to enable/disable certain features of the IOMMU. */
152static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) 226static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
153{ 227{
154 u32 ctrl; 228 u32 ctrl;
@@ -162,20 +236,35 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
162{ 236{
163 u32 ctrl; 237 u32 ctrl;
164 238
165 ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); 239 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
166 ctrl &= ~(1 << bit); 240 ctrl &= ~(1 << bit);
167 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 241 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
168} 242}
169 243
244/* Function to enable the hardware */
170void __init iommu_enable(struct amd_iommu *iommu) 245void __init iommu_enable(struct amd_iommu *iommu)
171{ 246{
172 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); 247 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
173 print_devid(iommu->devid, 0); 248 "at %02x:%02x.%x cap 0x%hx\n",
174 printk(" cap 0x%hx\n", iommu->cap_ptr); 249 iommu->dev->bus->number,
250 PCI_SLOT(iommu->dev->devfn),
251 PCI_FUNC(iommu->dev->devfn),
252 iommu->cap_ptr);
175 253
176 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 254 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
177} 255}
178 256
257/* Function to enable IOMMU event logging and event interrupts */
258void __init iommu_enable_event_logging(struct amd_iommu *iommu)
259{
260 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
261 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
262}
263
264/*
265 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
266 * the system has one.
267 */
179static u8 * __init iommu_map_mmio_space(u64 address) 268static u8 * __init iommu_map_mmio_space(u64 address)
180{ 269{
181 u8 *ret; 270 u8 *ret;
@@ -199,16 +288,41 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
199 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); 288 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
200} 289}
201 290
291/****************************************************************************
292 *
293 * The functions below belong to the first pass of AMD IOMMU ACPI table
294 * parsing. In this pass we try to find out the highest device id this
 295 * code has to handle. Based on this information the size of the shared data
296 * structures is determined later.
297 *
298 ****************************************************************************/
299
300/*
301 * This function calculates the length of a given IVHD entry
302 */
303static inline int ivhd_entry_length(u8 *ivhd)
304{
305 return 0x04 << (*ivhd >> 6);
306}
307
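The top two bits of an IVHD entry's type byte encode its length, so 0x04 << (type >> 6) yields 4, 8, 16 or 32 bytes. A quick standalone check of that decoding (the type values below are arbitrary; only their top two bits matter here):

	#include <stdio.h>

	static int ivhd_entry_length(unsigned char type)
	{
		return 0x04 << (type >> 6);     /* top two bits select 4, 8, 16 or 32 bytes */
	}

	int main(void)
	{
		unsigned char types[] = { 0x02, 0x42, 0x82, 0xc2 };  /* arbitrary example type bytes */
		int i;

		for (i = 0; i < 4; ++i)
			printf("type 0x%02x -> %d byte entry\n", types[i], ivhd_entry_length(types[i]));
		return 0;
	}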
308/*
309 * This function reads the last device id the IOMMU has to handle from the PCI
310 * capability header for this IOMMU
311 */
202static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) 312static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
203{ 313{
204 u32 cap; 314 u32 cap;
205 315
206 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); 316 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
207 UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); 317 update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
208 318
209 return 0; 319 return 0;
210} 320}
211 321
322/*
323 * After reading the highest device id from the IOMMU PCI capability header
 324 * this function checks whether a higher device id is defined in the ACPI table
325 */
212static int __init find_last_devid_from_ivhd(struct ivhd_header *h) 326static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
213{ 327{
214 u8 *p = (void *)h, *end = (void *)h; 328 u8 *p = (void *)h, *end = (void *)h;
@@ -229,12 +343,13 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
229 case IVHD_DEV_RANGE_END: 343 case IVHD_DEV_RANGE_END:
230 case IVHD_DEV_ALIAS: 344 case IVHD_DEV_ALIAS:
231 case IVHD_DEV_EXT_SELECT: 345 case IVHD_DEV_EXT_SELECT:
232 UPDATE_LAST_BDF(dev->devid); 346 /* all the above subfield types refer to device ids */
347 update_last_devid(dev->devid);
233 break; 348 break;
234 default: 349 default:
235 break; 350 break;
236 } 351 }
237 p += 0x04 << (*p >> 6); 352 p += ivhd_entry_length(p);
238 } 353 }
239 354
240 WARN_ON(p != end); 355 WARN_ON(p != end);
@@ -242,6 +357,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
242 return 0; 357 return 0;
243} 358}
244 359
360/*
361 * Iterate over all IVHD entries in the ACPI table and find the highest device
362 * id which we need to handle. This is the first of three functions which parse
363 * the ACPI table. So we check the checksum here.
364 */
245static int __init find_last_devid_acpi(struct acpi_table_header *table) 365static int __init find_last_devid_acpi(struct acpi_table_header *table)
246{ 366{
247 int i; 367 int i;
@@ -277,19 +397,31 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
277 return 0; 397 return 0;
278} 398}
279 399
400/****************************************************************************
401 *
 402 * The following functions belong to the code path which parses the ACPI table
403 * the second time. In this ACPI parsing iteration we allocate IOMMU specific
404 * data structures, initialize the device/alias/rlookup table and also
405 * basically initialize the hardware.
406 *
407 ****************************************************************************/
408
409/*
410 * Allocates the command buffer. This buffer is per AMD IOMMU. We can
411 * write commands to that buffer later and the IOMMU will execute them
412 * asynchronously
413 */
280static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) 414static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
281{ 415{
282 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL, 416 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
283 get_order(CMD_BUFFER_SIZE)); 417 get_order(CMD_BUFFER_SIZE));
284 u64 entry = 0; 418 u64 entry;
285 419
286 if (cmd_buf == NULL) 420 if (cmd_buf == NULL)
287 return NULL; 421 return NULL;
288 422
289 iommu->cmd_buf_size = CMD_BUFFER_SIZE; 423 iommu->cmd_buf_size = CMD_BUFFER_SIZE;
290 424
291 memset(cmd_buf, 0, CMD_BUFFER_SIZE);
292
293 entry = (u64)virt_to_phys(cmd_buf); 425 entry = (u64)virt_to_phys(cmd_buf);
294 entry |= MMIO_CMD_SIZE_512; 426 entry |= MMIO_CMD_SIZE_512;
295 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 427 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
@@ -302,11 +434,35 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
302 434
303static void __init free_command_buffer(struct amd_iommu *iommu) 435static void __init free_command_buffer(struct amd_iommu *iommu)
304{ 436{
305 if (iommu->cmd_buf) 437 free_pages((unsigned long)iommu->cmd_buf,
306 free_pages((unsigned long)iommu->cmd_buf, 438 get_order(iommu->cmd_buf_size));
307 get_order(CMD_BUFFER_SIZE)); 439}
440
441/* allocates the memory where the IOMMU will log its events to */
442static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
443{
444 u64 entry;
445 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
446 get_order(EVT_BUFFER_SIZE));
447
448 if (iommu->evt_buf == NULL)
449 return NULL;
450
451 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
452 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
453 &entry, sizeof(entry));
454
455 iommu->evt_buf_size = EVT_BUFFER_SIZE;
456
457 return iommu->evt_buf;
458}
459
460static void __init free_event_buffer(struct amd_iommu *iommu)
461{
462 free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
308} 463}
309 464
465/* sets a specific bit in the device table entry. */
310static void set_dev_entry_bit(u16 devid, u8 bit) 466static void set_dev_entry_bit(u16 devid, u8 bit)
311{ 467{
312 int i = (bit >> 5) & 0x07; 468 int i = (bit >> 5) & 0x07;
@@ -315,7 +471,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
315 amd_iommu_dev_table[devid].data[i] |= (1 << _bit); 471 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
316} 472}
317 473
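set_dev_entry_bit() treats the 256-bit device table entry as eight 32-bit words: bits 5-7 of the bit number select the word and the low five bits select the position within it. A small sketch of that split (the bit value is arbitrary; the real flag constants live in the driver headers):

	#include <stdio.h>

	int main(void)
	{
		unsigned int data[8] = { 0 };    /* one 256-bit device table entry as 8 x 32-bit words */
		unsigned int bit = 97;           /* an arbitrary flag bit number for the example */
		int word = (bit >> 5) & 0x07;    /* which 32-bit word holds the flag */
		int pos  = bit & 0x1f;           /* position of the flag inside that word */

		data[word] |= 1U << pos;
		printf("bit %u -> data[%d], bit %d (word now 0x%08x)\n", bit, word, pos, data[word]);
		return 0;
	}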
318static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) 474/* Writes the specific IOMMU for a device into the rlookup table */
475static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
476{
477 amd_iommu_rlookup_table[devid] = iommu;
478}
479
480/*
481 * This function takes the device specific flags read from the ACPI
482 * table and sets up the device table entry with that information
483 */
484static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
485 u16 devid, u32 flags, u32 ext_flags)
319{ 486{
320 if (flags & ACPI_DEVFLAG_INITPASS) 487 if (flags & ACPI_DEVFLAG_INITPASS)
321 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); 488 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
@@ -331,13 +498,14 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
331 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); 498 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
332 if (flags & ACPI_DEVFLAG_LINT1) 499 if (flags & ACPI_DEVFLAG_LINT1)
333 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); 500 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
334}
335 501
336static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) 502 set_iommu_for_device(iommu, devid);
337{
338 amd_iommu_rlookup_table[devid] = iommu;
339} 503}
340 504
505/*
 506 * Reads the device exclusion range from ACPI and initializes the IOMMU
 507 * with it.
508 */
341static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) 509static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
342{ 510{
343 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; 511 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
@@ -346,27 +514,45 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
346 return; 514 return;
347 515
348 if (iommu) { 516 if (iommu) {
517 /*
 518 * We can only configure exclusion ranges per IOMMU, not
 519 * per device. But we can enable the exclusion range per
 520 * device. This is done here.
521 */
349 set_dev_entry_bit(m->devid, DEV_ENTRY_EX); 522 set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
350 iommu->exclusion_start = m->range_start; 523 iommu->exclusion_start = m->range_start;
351 iommu->exclusion_length = m->range_length; 524 iommu->exclusion_length = m->range_length;
352 } 525 }
353} 526}
354 527
528/*
529 * This function reads some important data from the IOMMU PCI space and
530 * initializes the driver data structure with it. It reads the hardware
531 * capabilities and the first/last device entries
532 */
355static void __init init_iommu_from_pci(struct amd_iommu *iommu) 533static void __init init_iommu_from_pci(struct amd_iommu *iommu)
356{ 534{
357 int bus = PCI_BUS(iommu->devid);
358 int dev = PCI_SLOT(iommu->devid);
359 int fn = PCI_FUNC(iommu->devid);
360 int cap_ptr = iommu->cap_ptr; 535 int cap_ptr = iommu->cap_ptr;
361 u32 range; 536 u32 range, misc;
362 537
363 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); 538 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
364 539 &iommu->cap);
365 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); 540 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
366 iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range)); 541 &range);
367 iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)); 542 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
543 &misc);
544
545 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
546 MMIO_GET_FD(range));
547 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
548 MMIO_GET_LD(range));
549 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
368} 550}
369 551
552/*
553 * Takes a pointer to an AMD IOMMU entry in the ACPI table and
554 * initializes the hardware and our data structures with it.
555 */
370static void __init init_iommu_from_acpi(struct amd_iommu *iommu, 556static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
371 struct ivhd_header *h) 557 struct ivhd_header *h)
372{ 558{
@@ -374,7 +560,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
374 u8 *end = p, flags = 0; 560 u8 *end = p, flags = 0;
375 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; 561 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
376 u32 ext_flags = 0; 562 u32 ext_flags = 0;
377 bool alias = 0; 563 bool alias = false;
378 struct ivhd_entry *e; 564 struct ivhd_entry *e;
379 565
380 /* 566 /*
@@ -414,22 +600,23 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
414 case IVHD_DEV_ALL: 600 case IVHD_DEV_ALL:
415 for (dev_i = iommu->first_device; 601 for (dev_i = iommu->first_device;
416 dev_i <= iommu->last_device; ++dev_i) 602 dev_i <= iommu->last_device; ++dev_i)
417 set_dev_entry_from_acpi(dev_i, e->flags, 0); 603 set_dev_entry_from_acpi(iommu, dev_i,
604 e->flags, 0);
418 break; 605 break;
419 case IVHD_DEV_SELECT: 606 case IVHD_DEV_SELECT:
420 devid = e->devid; 607 devid = e->devid;
421 set_dev_entry_from_acpi(devid, e->flags, 0); 608 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
422 break; 609 break;
423 case IVHD_DEV_SELECT_RANGE_START: 610 case IVHD_DEV_SELECT_RANGE_START:
424 devid_start = e->devid; 611 devid_start = e->devid;
425 flags = e->flags; 612 flags = e->flags;
426 ext_flags = 0; 613 ext_flags = 0;
427 alias = 0; 614 alias = false;
428 break; 615 break;
429 case IVHD_DEV_ALIAS: 616 case IVHD_DEV_ALIAS:
430 devid = e->devid; 617 devid = e->devid;
431 devid_to = e->ext >> 8; 618 devid_to = e->ext >> 8;
432 set_dev_entry_from_acpi(devid, e->flags, 0); 619 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
433 amd_iommu_alias_table[devid] = devid_to; 620 amd_iommu_alias_table[devid] = devid_to;
434 break; 621 break;
435 case IVHD_DEV_ALIAS_RANGE: 622 case IVHD_DEV_ALIAS_RANGE:
@@ -437,24 +624,25 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
437 flags = e->flags; 624 flags = e->flags;
438 devid_to = e->ext >> 8; 625 devid_to = e->ext >> 8;
439 ext_flags = 0; 626 ext_flags = 0;
440 alias = 1; 627 alias = true;
441 break; 628 break;
442 case IVHD_DEV_EXT_SELECT: 629 case IVHD_DEV_EXT_SELECT:
443 devid = e->devid; 630 devid = e->devid;
444 set_dev_entry_from_acpi(devid, e->flags, e->ext); 631 set_dev_entry_from_acpi(iommu, devid, e->flags,
632 e->ext);
445 break; 633 break;
446 case IVHD_DEV_EXT_SELECT_RANGE: 634 case IVHD_DEV_EXT_SELECT_RANGE:
447 devid_start = e->devid; 635 devid_start = e->devid;
448 flags = e->flags; 636 flags = e->flags;
449 ext_flags = e->ext; 637 ext_flags = e->ext;
450 alias = 0; 638 alias = false;
451 break; 639 break;
452 case IVHD_DEV_RANGE_END: 640 case IVHD_DEV_RANGE_END:
453 devid = e->devid; 641 devid = e->devid;
454 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 642 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
455 if (alias) 643 if (alias)
456 amd_iommu_alias_table[dev_i] = devid_to; 644 amd_iommu_alias_table[dev_i] = devid_to;
457 set_dev_entry_from_acpi( 645 set_dev_entry_from_acpi(iommu,
458 amd_iommu_alias_table[dev_i], 646 amd_iommu_alias_table[dev_i],
459 flags, ext_flags); 647 flags, ext_flags);
460 } 648 }
@@ -463,10 +651,11 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
463 break; 651 break;
464 } 652 }
465 653
466 p += 0x04 << (e->type >> 6); 654 p += ivhd_entry_length(p);
467 } 655 }
468} 656}
469 657
658/* Initializes the device->iommu mapping for the driver */
470static int __init init_iommu_devices(struct amd_iommu *iommu) 659static int __init init_iommu_devices(struct amd_iommu *iommu)
471{ 660{
472 u16 i; 661 u16 i;
@@ -480,6 +669,7 @@ static int __init init_iommu_devices(struct amd_iommu *iommu)
480static void __init free_iommu_one(struct amd_iommu *iommu) 669static void __init free_iommu_one(struct amd_iommu *iommu)
481{ 670{
482 free_command_buffer(iommu); 671 free_command_buffer(iommu);
672 free_event_buffer(iommu);
483 iommu_unmap_mmio_space(iommu); 673 iommu_unmap_mmio_space(iommu);
484} 674}
485 675
@@ -494,6 +684,11 @@ static void __init free_iommu_all(void)
494 } 684 }
495} 685}
496 686
687/*
 688 * This function glues the initialization functions for one IOMMU
 689 * together and also allocates the command buffer and programs the
690 * hardware. It does NOT enable the IOMMU. This is done afterwards.
691 */
497static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) 692static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
498{ 693{
499 spin_lock_init(&iommu->lock); 694 spin_lock_init(&iommu->lock);
@@ -502,8 +697,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
502 /* 697 /*
503 * Copy data from ACPI table entry to the iommu struct 698 * Copy data from ACPI table entry to the iommu struct
504 */ 699 */
505 iommu->devid = h->devid; 700 iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
701 if (!iommu->dev)
702 return 1;
703
506 iommu->cap_ptr = h->cap_ptr; 704 iommu->cap_ptr = h->cap_ptr;
705 iommu->pci_seg = h->pci_seg;
507 iommu->mmio_phys = h->mmio_phys; 706 iommu->mmio_phys = h->mmio_phys;
508 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); 707 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
509 if (!iommu->mmio_base) 708 if (!iommu->mmio_base)
@@ -514,13 +713,23 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
514 if (!iommu->cmd_buf) 713 if (!iommu->cmd_buf)
515 return -ENOMEM; 714 return -ENOMEM;
516 715
716 iommu->evt_buf = alloc_event_buffer(iommu);
717 if (!iommu->evt_buf)
718 return -ENOMEM;
719
720 iommu->int_enabled = false;
721
517 init_iommu_from_pci(iommu); 722 init_iommu_from_pci(iommu);
518 init_iommu_from_acpi(iommu, h); 723 init_iommu_from_acpi(iommu, h);
519 init_iommu_devices(iommu); 724 init_iommu_devices(iommu);
520 725
521 return 0; 726 return pci_enable_device(iommu->dev);
522} 727}
523 728
729/*
730 * Iterates over all IOMMU entries in the ACPI table, allocates the
731 * IOMMU structure and initializes it with init_iommu_one()
732 */
524static int __init init_iommu_all(struct acpi_table_header *table) 733static int __init init_iommu_all(struct acpi_table_header *table)
525{ 734{
526 u8 *p = (u8 *)table, *end = (u8 *)table; 735 u8 *p = (u8 *)table, *end = (u8 *)table;
@@ -528,8 +737,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
528 struct amd_iommu *iommu; 737 struct amd_iommu *iommu;
529 int ret; 738 int ret;
530 739
531 INIT_LIST_HEAD(&amd_iommu_list);
532
533 end += table->length; 740 end += table->length;
534 p += IVRS_HEADER_LENGTH; 741 p += IVRS_HEADER_LENGTH;
535 742
@@ -555,6 +762,103 @@ static int __init init_iommu_all(struct acpi_table_header *table)
555 return 0; 762 return 0;
556} 763}
557 764
765/****************************************************************************
766 *
767 * The following functions initialize the MSI interrupts for all IOMMUs
 768 * in the system. It's a bit challenging because there could be multiple
769 * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
770 * pci_dev.
771 *
772 ****************************************************************************/
773
774static int __init iommu_setup_msix(struct amd_iommu *iommu)
775{
776 struct amd_iommu *curr;
777 struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
778 int nvec = 0, i;
779
780 list_for_each_entry(curr, &amd_iommu_list, list) {
781 if (curr->dev == iommu->dev) {
782 entries[nvec].entry = curr->evt_msi_num;
783 entries[nvec].vector = 0;
784 curr->int_enabled = true;
785 nvec++;
786 }
787 }
788
789 if (pci_enable_msix(iommu->dev, entries, nvec)) {
790 pci_disable_msix(iommu->dev);
791 return 1;
792 }
793
794 for (i = 0; i < nvec; ++i) {
795 int r = request_irq(entries->vector, amd_iommu_int_handler,
796 IRQF_SAMPLE_RANDOM,
797 "AMD IOMMU",
798 NULL);
799 if (r)
800 goto out_free;
801 }
802
803 return 0;
804
805out_free:
806 for (i -= 1; i >= 0; --i)
807 free_irq(entries->vector, NULL);
808
809 pci_disable_msix(iommu->dev);
810
811 return 1;
812}
813
814static int __init iommu_setup_msi(struct amd_iommu *iommu)
815{
816 int r;
817 struct amd_iommu *curr;
818
819 list_for_each_entry(curr, &amd_iommu_list, list) {
820 if (curr->dev == iommu->dev)
821 curr->int_enabled = true;
822 }
823
824
825 if (pci_enable_msi(iommu->dev))
826 return 1;
827
828 r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
829 IRQF_SAMPLE_RANDOM,
830 "AMD IOMMU",
831 NULL);
832
833 if (r) {
834 pci_disable_msi(iommu->dev);
835 return 1;
836 }
837
838 return 0;
839}
840
841static int __init iommu_init_msi(struct amd_iommu *iommu)
842{
843 if (iommu->int_enabled)
844 return 0;
845
846 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
847 return iommu_setup_msix(iommu);
848 else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
849 return iommu_setup_msi(iommu);
850
851 return 1;
852}
853
854/****************************************************************************
855 *
856 * The next functions belong to the third pass of parsing the ACPI
857 * table. In this last pass the memory mapping requirements are
 858 * gathered (like exclusion and unity mapping ranges).
859 *
860 ****************************************************************************/
861
558static void __init free_unity_maps(void) 862static void __init free_unity_maps(void)
559{ 863{
560 struct unity_map_entry *entry, *next; 864 struct unity_map_entry *entry, *next;
@@ -565,6 +869,7 @@ static void __init free_unity_maps(void)
565 } 869 }
566} 870}
567 871
872/* called when we find an exclusion range definition in ACPI */
568static int __init init_exclusion_range(struct ivmd_header *m) 873static int __init init_exclusion_range(struct ivmd_header *m)
569{ 874{
570 int i; 875 int i;
@@ -574,7 +879,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
574 set_device_exclusion_range(m->devid, m); 879 set_device_exclusion_range(m->devid, m);
575 break; 880 break;
576 case ACPI_IVMD_TYPE_ALL: 881 case ACPI_IVMD_TYPE_ALL:
577 for (i = 0; i < amd_iommu_last_bdf; ++i) 882 for (i = 0; i <= amd_iommu_last_bdf; ++i)
578 set_device_exclusion_range(i, m); 883 set_device_exclusion_range(i, m);
579 break; 884 break;
580 case ACPI_IVMD_TYPE_RANGE: 885 case ACPI_IVMD_TYPE_RANGE:
@@ -588,6 +893,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
588 return 0; 893 return 0;
589} 894}
590 895
896/* called for unity map ACPI definition */
591static int __init init_unity_map_range(struct ivmd_header *m) 897static int __init init_unity_map_range(struct ivmd_header *m)
592{ 898{
593 struct unity_map_entry *e = 0; 899 struct unity_map_entry *e = 0;
@@ -619,13 +925,12 @@ static int __init init_unity_map_range(struct ivmd_header *m)
619 return 0; 925 return 0;
620} 926}
621 927
928/* iterates over all memory definitions we find in the ACPI table */
622static int __init init_memory_definitions(struct acpi_table_header *table) 929static int __init init_memory_definitions(struct acpi_table_header *table)
623{ 930{
624 u8 *p = (u8 *)table, *end = (u8 *)table; 931 u8 *p = (u8 *)table, *end = (u8 *)table;
625 struct ivmd_header *m; 932 struct ivmd_header *m;
626 933
627 INIT_LIST_HEAD(&amd_iommu_unity_map);
628
629 end += table->length; 934 end += table->length;
630 p += IVRS_HEADER_LENGTH; 935 p += IVRS_HEADER_LENGTH;
631 936
@@ -642,12 +947,32 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
642 return 0; 947 return 0;
643} 948}
644 949
950/*
 951 * Init the device table to disallow DMA access for devices and to
 952 * suppress all page faults.
953 */
954static void init_device_table(void)
955{
956 u16 devid;
957
958 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
959 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
960 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
961 }
962}
963
964/*
965 * This function finally enables all IOMMUs found in the system after
966 * they have been initialized
967 */
645static void __init enable_iommus(void) 968static void __init enable_iommus(void)
646{ 969{
647 struct amd_iommu *iommu; 970 struct amd_iommu *iommu;
648 971
649 list_for_each_entry(iommu, &amd_iommu_list, list) { 972 list_for_each_entry(iommu, &amd_iommu_list, list) {
650 iommu_set_exclusion_range(iommu); 973 iommu_set_exclusion_range(iommu);
974 iommu_init_msi(iommu);
975 iommu_enable_event_logging(iommu);
651 iommu_enable(iommu); 976 iommu_enable(iommu);
652 } 977 }
653} 978}
@@ -678,6 +1003,34 @@ static struct sys_device device_amd_iommu = {
678 .cls = &amd_iommu_sysdev_class, 1003 .cls = &amd_iommu_sysdev_class,
679}; 1004};
680 1005
1006/*
1007 * This is the core init function for AMD IOMMU hardware in the system.
1008 * This function is called from the generic x86 DMA layer initialization
1009 * code.
1010 *
1011 * This function basically parses the ACPI table for AMD IOMMU (IVRS)
1012 * three times:
1013 *
1014 * 1 pass) Find the highest PCI device id the driver has to handle.
 1015 * Based on this information the size of the data structures that
 1016 * need to be allocated is determined.
1017 *
1018 * 2 pass) Initialize the data structures just allocated with the
1019 * information in the ACPI table about available AMD IOMMUs
1020 * in the system. It also maps the PCI devices in the
1021 * system to specific IOMMUs
1022 *
1023 * 3 pass) After the basic data structures are allocated and
1024 * initialized we update them with information about memory
1025 * remapping requirements parsed out of the ACPI table in
1026 * this last pass.
1027 *
1028 * After that the hardware is initialized and ready to go. In the last
1029 * step we do some Linux specific things like registering the driver in
1030 * the dma_ops interface and initializing the suspend/resume support
1031 * functions. Finally it prints some information about AMD IOMMUs and
1032 * the driver state and enables the hardware.
1033 */
681int __init amd_iommu_init(void) 1034int __init amd_iommu_init(void)
682{ 1035{
683 int i, ret = 0; 1036 int i, ret = 0;
@@ -699,14 +1052,14 @@ int __init amd_iommu_init(void)
699 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) 1052 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
700 return -ENODEV; 1053 return -ENODEV;
701 1054
702 dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE); 1055 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
703 alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE); 1056 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
704 rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE); 1057 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
705 1058
706 ret = -ENOMEM; 1059 ret = -ENOMEM;
707 1060
708 /* Device table - directly used by all IOMMUs */ 1061 /* Device table - directly used by all IOMMUs */
709 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL, 1062 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
710 get_order(dev_table_size)); 1063 get_order(dev_table_size));
711 if (amd_iommu_dev_table == NULL) 1064 if (amd_iommu_dev_table == NULL)
712 goto out; 1065 goto out;
@@ -730,27 +1083,26 @@ int __init amd_iommu_init(void)
730 * Protection Domain table - maps devices to protection domains 1083 * Protection Domain table - maps devices to protection domains
731 * This table has the same size as the rlookup_table 1084 * This table has the same size as the rlookup_table
732 */ 1085 */
733 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL, 1086 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
734 get_order(rlookup_table_size)); 1087 get_order(rlookup_table_size));
735 if (amd_iommu_pd_table == NULL) 1088 if (amd_iommu_pd_table == NULL)
736 goto free; 1089 goto free;
737 1090
738 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL, 1091 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
1092 GFP_KERNEL | __GFP_ZERO,
739 get_order(MAX_DOMAIN_ID/8)); 1093 get_order(MAX_DOMAIN_ID/8));
740 if (amd_iommu_pd_alloc_bitmap == NULL) 1094 if (amd_iommu_pd_alloc_bitmap == NULL)
741 goto free; 1095 goto free;
742 1096
1097 /* init the device table */
1098 init_device_table();
1099
743 /* 1100 /*
 744 * memory is allocated now; initialize the device table with all zeroes 1101 * let all alias entries point to themselves
745 * and let all alias entries point to itself
746 */ 1102 */
747 memset(amd_iommu_dev_table, 0, dev_table_size); 1103 for (i = 0; i <= amd_iommu_last_bdf; ++i)
748 for (i = 0; i < amd_iommu_last_bdf; ++i)
749 amd_iommu_alias_table[i] = i; 1104 amd_iommu_alias_table[i] = i;
750 1105
751 memset(amd_iommu_pd_table, 0, rlookup_table_size);
752 memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
753
754 /* 1106 /*
 755 * never allocate domain 0 because it's used as the non-allocated and 1107 * never allocate domain 0 because it's used as the non-allocated and
756 * error value placeholder 1108 * error value placeholder
@@ -768,15 +1120,15 @@ int __init amd_iommu_init(void)
768 if (acpi_table_parse("IVRS", init_memory_definitions) != 0) 1120 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
769 goto free; 1121 goto free;
770 1122
771 ret = amd_iommu_init_dma_ops(); 1123 ret = sysdev_class_register(&amd_iommu_sysdev_class);
772 if (ret) 1124 if (ret)
773 goto free; 1125 goto free;
774 1126
775 ret = sysdev_class_register(&amd_iommu_sysdev_class); 1127 ret = sysdev_register(&device_amd_iommu);
776 if (ret) 1128 if (ret)
777 goto free; 1129 goto free;
778 1130
779 ret = sysdev_register(&device_amd_iommu); 1131 ret = amd_iommu_init_dma_ops();
780 if (ret) 1132 if (ret)
781 goto free; 1133 goto free;
782 1134
@@ -791,28 +1143,29 @@ int __init amd_iommu_init(void)
791 else 1143 else
792 printk("disabled\n"); 1144 printk("disabled\n");
793 1145
1146 if (amd_iommu_unmap_flush)
1147 printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
1148 else
1149 printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
1150
794out: 1151out:
795 return ret; 1152 return ret;
796 1153
797free: 1154free:
798 if (amd_iommu_pd_alloc_bitmap) 1155 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
799 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); 1156 get_order(MAX_DOMAIN_ID/8));
800 1157
801 if (amd_iommu_pd_table) 1158 free_pages((unsigned long)amd_iommu_pd_table,
802 free_pages((unsigned long)amd_iommu_pd_table, 1159 get_order(rlookup_table_size));
803 get_order(rlookup_table_size));
804 1160
805 if (amd_iommu_rlookup_table) 1161 free_pages((unsigned long)amd_iommu_rlookup_table,
806 free_pages((unsigned long)amd_iommu_rlookup_table, 1162 get_order(rlookup_table_size));
807 get_order(rlookup_table_size));
808 1163
809 if (amd_iommu_alias_table) 1164 free_pages((unsigned long)amd_iommu_alias_table,
810 free_pages((unsigned long)amd_iommu_alias_table, 1165 get_order(alias_table_size));
811 get_order(alias_table_size));
812 1166
813 if (amd_iommu_dev_table) 1167 free_pages((unsigned long)amd_iommu_dev_table,
814 free_pages((unsigned long)amd_iommu_dev_table, 1168 get_order(dev_table_size));
815 get_order(dev_table_size));
816 1169
817 free_iommu_all(); 1170 free_iommu_all();
818 1171
@@ -821,6 +1174,13 @@ free:
821 goto out; 1174 goto out;
822} 1175}
823 1176
1177/****************************************************************************
1178 *
1179 * Early detect code. This code runs at IOMMU detection time in the DMA
 1180 * layer. It just checks whether an IVRS ACPI table is present to detect
 1181 * AMD IOMMUs.
1182 *
1183 ****************************************************************************/
824static int __init early_amd_iommu_detect(struct acpi_table_header *table) 1184static int __init early_amd_iommu_detect(struct acpi_table_header *table)
825{ 1185{
826 return 0; 1186 return 0;
@@ -828,7 +1188,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
828 1188
829void __init amd_iommu_detect(void) 1189void __init amd_iommu_detect(void)
830{ 1190{
831 if (swiotlb || no_iommu || iommu_detected) 1191 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
832 return; 1192 return;
833 1193
834 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1194 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
@@ -841,11 +1201,20 @@ void __init amd_iommu_detect(void)
841 } 1201 }
842} 1202}
843 1203
1204/****************************************************************************
1205 *
1206 * Parsing functions for the AMD IOMMU specific kernel command line
1207 * options.
1208 *
1209 ****************************************************************************/
1210
844static int __init parse_amd_iommu_options(char *str) 1211static int __init parse_amd_iommu_options(char *str)
845{ 1212{
846 for (; *str; ++str) { 1213 for (; *str; ++str) {
847 if (strcmp(str, "isolate") == 0) 1214 if (strncmp(str, "isolate", 7) == 0)
848 amd_iommu_isolate = 1; 1215 amd_iommu_isolate = 1;
1216 if (strncmp(str, "fullflush", 11) == 0)
1217 amd_iommu_unmap_flush = true;
849 } 1218 }
850 1219
851 return 1; 1220 return 1;
@@ -853,20 +1222,10 @@ static int __init parse_amd_iommu_options(char *str)
853 1222
854static int __init parse_amd_iommu_size_options(char *str) 1223static int __init parse_amd_iommu_size_options(char *str)
855{ 1224{
856 for (; *str; ++str) { 1225 unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
857 if (strcmp(str, "32M") == 0) 1226
858 amd_iommu_aperture_order = 25; 1227 if ((order > 24) && (order < 31))
859 if (strcmp(str, "64M") == 0) 1228 amd_iommu_aperture_order = order;
860 amd_iommu_aperture_order = 26;
861 if (strcmp(str, "128M") == 0)
862 amd_iommu_aperture_order = 27;
863 if (strcmp(str, "256M") == 0)
864 amd_iommu_aperture_order = 28;
865 if (strcmp(str, "512M") == 0)
866 amd_iommu_aperture_order = 29;
867 if (strcmp(str, "1G") == 0)
868 amd_iommu_aperture_order = 30;
869 }
870 1229
871 return 1; 1230 return 1;
872} 1231}
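With the rewrite, the aperture size option accepts any human-readable size: memparse() turns the string into bytes and get_order() converts that into a page order, so the accepted window 24 < order < 31 corresponds to 32 MB through 1 GB. A stand-alone model of the arithmetic, assuming 4 KB pages; the helpers below are local re-implementations for illustration, not the kernel ones:

    #include <stdio.h>

    #define PAGE_SHIFT 12

    /* Smallest 'order' such that (1 << order) pages cover 'size' bytes. */
    static int get_order(unsigned long size)
    {
    	int order = 0;

    	size = (size - 1) >> PAGE_SHIFT;
    	while (size) {
    		order++;
    		size >>= 1;
    	}
    	return order;
    }

    static unsigned long parse_size(const char *s)	/* crude memparse() stand-in */
    {
    	unsigned long val = 0;

    	for (; *s >= '0' && *s <= '9'; s++)
    		val = val * 10 + (unsigned long)(*s - '0');
    	if (*s == 'K') val <<= 10;
    	if (*s == 'M') val <<= 20;
    	if (*s == 'G') val <<= 30;
    	return val;
    }

    int main(void)
    {
    	const char *examples[] = { "32M", "64M", "512M", "1G" };

    	for (int i = 0; i < 4; i++) {
    		int order = PAGE_SHIFT + get_order(parse_size(examples[i]));
    		printf("%-4s -> order %d (%saccepted)\n", examples[i], order,
    		       (order > 24 && order < 31) ? "" : "not ");
    	}
    	return 0;
    }

Running this prints order 25 for 32M up to order 30 for 1G, matching the range the old hard-coded table allowed.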
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9f907806c1a5..9a32b37ee2ee 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -21,6 +21,7 @@
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <asm/e820.h> 22#include <asm/e820.h>
23#include <asm/io.h> 23#include <asm/io.h>
24#include <asm/iommu.h>
24#include <asm/gart.h> 25#include <asm/gart.h>
25#include <asm/pci-direct.h> 26#include <asm/pci-direct.h>
26#include <asm/dma.h> 27#include <asm/dma.h>
@@ -454,11 +455,11 @@ out:
454 force_iommu || 455 force_iommu ||
455 valid_agp || 456 valid_agp ||
456 fallback_aper_force) { 457 fallback_aper_force) {
457 printk(KERN_ERR 458 printk(KERN_INFO
458 "Your BIOS doesn't leave a aperture memory hole\n"); 459 "Your BIOS doesn't leave a aperture memory hole\n");
459 printk(KERN_ERR 460 printk(KERN_INFO
460 "Please enable the IOMMU option in the BIOS setup\n"); 461 "Please enable the IOMMU option in the BIOS setup\n");
461 printk(KERN_ERR 462 printk(KERN_INFO
462 "This costs you %d MB of RAM\n", 463 "This costs you %d MB of RAM\n",
463 32 << fallback_aper_order); 464 32 << fallback_aper_order);
464 465
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic.c
index a437d027f20b..04a7f960bbc0 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic.c
@@ -23,11 +23,13 @@
23#include <linux/mc146818rtc.h> 23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h> 24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h> 25#include <linux/sysdev.h>
26#include <linux/ioport.h>
26#include <linux/cpu.h> 27#include <linux/cpu.h>
27#include <linux/clockchips.h> 28#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h> 29#include <linux/acpi_pmtmr.h>
29#include <linux/module.h> 30#include <linux/module.h>
30#include <linux/dmi.h> 31#include <linux/dmi.h>
32#include <linux/dmar.h>
31 33
32#include <asm/atomic.h> 34#include <asm/atomic.h>
33#include <asm/smp.h> 35#include <asm/smp.h>
@@ -36,8 +38,14 @@
36#include <asm/desc.h> 38#include <asm/desc.h>
37#include <asm/arch_hooks.h> 39#include <asm/arch_hooks.h>
38#include <asm/hpet.h> 40#include <asm/hpet.h>
41#include <asm/pgalloc.h>
39#include <asm/i8253.h> 42#include <asm/i8253.h>
40#include <asm/nmi.h> 43#include <asm/nmi.h>
44#include <asm/idle.h>
45#include <asm/proto.h>
46#include <asm/timex.h>
47#include <asm/apic.h>
48#include <asm/i8259.h>
41 49
42#include <mach_apic.h> 50#include <mach_apic.h>
43#include <mach_apicdef.h> 51#include <mach_apicdef.h>
@@ -50,20 +58,60 @@
50# error SPURIOUS_APIC_VECTOR definition error 58# error SPURIOUS_APIC_VECTOR definition error
51#endif 59#endif
52 60
53unsigned long mp_lapic_addr; 61#ifdef CONFIG_X86_32
54
55/* 62/*
56 * Knob to control our willingness to enable the local APIC. 63 * Knob to control our willingness to enable the local APIC.
57 * 64 *
58 * +1=force-enable 65 * +1=force-enable
59 */ 66 */
60static int force_enable_local_apic; 67static int force_enable_local_apic;
61int disable_apic; 68/*
69 * APIC command line parameters
70 */
71static int __init parse_lapic(char *arg)
72{
73 force_enable_local_apic = 1;
74 return 0;
75}
76early_param("lapic", parse_lapic);
77/* Local APIC was disabled by the BIOS and enabled by the kernel */
78static int enabled_via_apicbase;
79
80#endif
62 81
63/* Local APIC timer verification ok */ 82#ifdef CONFIG_X86_64
64static int local_apic_timer_verify_ok; 83static int apic_calibrate_pmtmr __initdata;
84static __init int setup_apicpmtimer(char *s)
85{
86 apic_calibrate_pmtmr = 1;
87 notsc_setup(NULL);
88 return 0;
89}
90__setup("apicpmtimer", setup_apicpmtimer);
91#endif
92
93#ifdef CONFIG_X86_64
94#define HAVE_X2APIC
95#endif
96
97#ifdef HAVE_X2APIC
98int x2apic;
99/* x2apic enabled before OS handover */
100int x2apic_preenabled;
101int disable_x2apic;
102static __init int setup_nox2apic(char *str)
103{
104 disable_x2apic = 1;
105 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
106 return 0;
107}
108early_param("nox2apic", setup_nox2apic);
109#endif
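The lapic, apicpmtimer and nox2apic handlers above all follow the same shape: a tiny parser registered for one keyword that flips a flag and returns. A user-space sketch of that keyword-to-handler dispatch, using a static table instead of the kernel's early_param()/__setup() machinery (names and flags here are illustrative only):

    #include <stdio.h>
    #include <string.h>

    static int force_enable_local_apic;
    static int disable_x2apic;

    static int parse_lapic(const char *arg)    { (void)arg; force_enable_local_apic = 1; return 0; }
    static int parse_nox2apic(const char *arg) { (void)arg; disable_x2apic = 1;          return 0; }

    static const struct {
    	const char *name;
    	int (*handler)(const char *arg);
    } early_params[] = {
    	{ "lapic",    parse_lapic },
    	{ "nox2apic", parse_nox2apic },
    };

    static void parse_cmdline_token(const char *tok)
    {
    	for (size_t i = 0; i < sizeof(early_params) / sizeof(early_params[0]); i++)
    		if (strcmp(tok, early_params[i].name) == 0)
    			early_params[i].handler(NULL);
    }

    int main(void)
    {
    	parse_cmdline_token("nox2apic");
    	printf("force lapic=%d, nox2apic=%d\n",
    	       force_enable_local_apic, disable_x2apic);
    	return 0;
    }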
110
111unsigned long mp_lapic_addr;
112int disable_apic;
65/* Disable local APIC timer from the kernel commandline or via dmi quirk */ 113/* Disable local APIC timer from the kernel commandline or via dmi quirk */
66static int local_apic_timer_disabled; 114static int disable_apic_timer __cpuinitdata;
67/* Local APIC timer works in C2 */ 115/* Local APIC timer works in C2 */
68int local_apic_timer_c2_ok; 116int local_apic_timer_c2_ok;
69EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 117EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -75,7 +123,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
75/* 123/*
76 * Debug level, exported for io_apic.c 124 * Debug level, exported for io_apic.c
77 */ 125 */
78int apic_verbosity; 126unsigned int apic_verbosity;
79 127
80int pic_mode; 128int pic_mode;
81 129
@@ -112,9 +160,6 @@ static struct clock_event_device lapic_clockevent = {
112}; 160};
113static DEFINE_PER_CPU(struct clock_event_device, lapic_events); 161static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
114 162
115/* Local APIC was disabled by the BIOS and enabled by the kernel */
116static int enabled_via_apicbase;
117
118static unsigned long apic_phys; 163static unsigned long apic_phys;
119 164
120/* 165/*
@@ -130,7 +175,11 @@ static inline int lapic_get_version(void)
130 */ 175 */
131static inline int lapic_is_integrated(void) 176static inline int lapic_is_integrated(void)
132{ 177{
178#ifdef CONFIG_X86_64
179 return 1;
180#else
133 return APIC_INTEGRATED(lapic_get_version()); 181 return APIC_INTEGRATED(lapic_get_version());
182#endif
134} 183}
135 184
136/* 185/*
@@ -145,13 +194,18 @@ static int modern_apic(void)
145 return lapic_get_version() >= 0x14; 194 return lapic_get_version() >= 0x14;
146} 195}
147 196
148void apic_wait_icr_idle(void) 197/*
198 * Paravirt kernels might also be using the ops below. So we still
199 * use generic apic_read()/apic_write(), which might be pointing to different
200 * ops in PARAVIRT case.
201 */
202void xapic_wait_icr_idle(void)
149{ 203{
150 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 204 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
151 cpu_relax(); 205 cpu_relax();
152} 206}
153 207
154u32 safe_apic_wait_icr_idle(void) 208u32 safe_xapic_wait_icr_idle(void)
155{ 209{
156 u32 send_status; 210 u32 send_status;
157 int timeout; 211 int timeout;
@@ -167,19 +221,88 @@ u32 safe_apic_wait_icr_idle(void)
167 return send_status; 221 return send_status;
168} 222}
169 223
224void xapic_icr_write(u32 low, u32 id)
225{
226 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
227 apic_write(APIC_ICR, low);
228}
229
230u64 xapic_icr_read(void)
231{
232 u32 icr1, icr2;
233
234 icr2 = apic_read(APIC_ICR2);
235 icr1 = apic_read(APIC_ICR);
236
237 return icr1 | ((u64)icr2 << 32);
238}
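In xAPIC mode the interrupt command register is two 32-bit MMIO words (ICR and ICR2), so a read stitches them into one 64-bit value and a write splits the destination into the high half, exactly as xapic_icr_read()/xapic_icr_write() do above. A quick stand-alone check of that packing, with made-up register contents:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t pack_icr(uint32_t low, uint32_t icr2_high)
    {
    	return (uint64_t)icr2_high << 32 | low;
    }

    int main(void)
    {
    	uint32_t low  = 0x000c4500;	/* example command bits, illustrative only */
    	uint32_t icr2 = 0x03000000;	/* destination field lives in the high word */
    	uint64_t icr  = pack_icr(low, icr2);

    	printf("icr = %#" PRIx64 " (low=%#x, high=%#x)\n",
    	       icr, (uint32_t)icr, (uint32_t)(icr >> 32));
    	return 0;
    }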
239
240static struct apic_ops xapic_ops = {
241 .read = native_apic_mem_read,
242 .write = native_apic_mem_write,
243 .icr_read = xapic_icr_read,
244 .icr_write = xapic_icr_write,
245 .wait_icr_idle = xapic_wait_icr_idle,
246 .safe_wait_icr_idle = safe_xapic_wait_icr_idle,
247};
248
249struct apic_ops __read_mostly *apic_ops = &xapic_ops;
250EXPORT_SYMBOL_GPL(apic_ops);
251
252#ifdef HAVE_X2APIC
253static void x2apic_wait_icr_idle(void)
254{
255 /* no need to wait for icr idle in x2apic */
256 return;
257}
258
259static u32 safe_x2apic_wait_icr_idle(void)
260{
261 /* no need to wait for icr idle in x2apic */
262 return 0;
263}
264
265void x2apic_icr_write(u32 low, u32 id)
266{
267 wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
268}
269
270u64 x2apic_icr_read(void)
271{
272 unsigned long val;
273
274 rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
275 return val;
276}
277
278static struct apic_ops x2apic_ops = {
279 .read = native_apic_msr_read,
280 .write = native_apic_msr_write,
281 .icr_read = x2apic_icr_read,
282 .icr_write = x2apic_icr_write,
283 .wait_icr_idle = x2apic_wait_icr_idle,
284 .safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
285};
286#endif
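apic_ops is a plain function-pointer table, so once the x2APIC code decides the mode is usable, the whole kernel switches register-access style by repointing a single pointer: MMIO stores and a separate ICR2 write in xAPIC mode versus one 64-bit MSR store in x2APIC mode. Below is a condensed user-space model of that indirection; the print backends are dummies, and the MSR arithmetic assumes the conventional values APIC_BASE_MSR = 0x800 and APIC_ICR = 0x300, which are not spelled out in the hunk above:

    #include <stdint.h>
    #include <stdio.h>

    struct apic_ops_like {
    	void (*icr_write)(uint32_t low, uint32_t id);
    };

    static void xapic_style_write(uint32_t low, uint32_t id)
    {
    	/* destination goes into bits 24-31 of ICR2, then the low word is stored */
    	printf("xapic:  write ICR2=%#x then ICR=%#x (two MMIO stores)\n", id << 24, low);
    }

    static void x2apic_style_write(uint32_t low, uint32_t id)
    {
    	uint32_t msr = 0x800 + (0x300 >> 4);	/* assumed APIC_BASE_MSR + (APIC_ICR >> 4) = 0x830 */
    	printf("x2apic: wrmsr(%#x, %#llx) (one 64-bit MSR store)\n",
    	       msr, (unsigned long long)id << 32 | low);
    }

    static struct apic_ops_like xapic  = { xapic_style_write  };
    static struct apic_ops_like x2apic = { x2apic_style_write };
    static struct apic_ops_like *ops = &xapic;	/* default, like apic_ops = &xapic_ops */

    int main(void)
    {
    	ops->icr_write(0x4500, 2);
    	ops = &x2apic;			/* roughly what switching to x2apic_ops does */
    	ops->icr_write(0x4500, 2);
    	return 0;
    }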
287
170/** 288/**
171 * enable_NMI_through_LVT0 - enable NMI through local vector table 0 289 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
172 */ 290 */
173void __cpuinit enable_NMI_through_LVT0(void) 291void __cpuinit enable_NMI_through_LVT0(void)
174{ 292{
175 unsigned int v = APIC_DM_NMI; 293 unsigned int v;
294
295 /* unmask and set to NMI */
296 v = APIC_DM_NMI;
176 297
177 /* Level triggered for 82489DX */ 298 /* Level triggered for 82489DX (32bit mode) */
178 if (!lapic_is_integrated()) 299 if (!lapic_is_integrated())
179 v |= APIC_LVT_LEVEL_TRIGGER; 300 v |= APIC_LVT_LEVEL_TRIGGER;
180 apic_write_around(APIC_LVT0, v); 301
302 apic_write(APIC_LVT0, v);
181} 303}
182 304
305#ifdef CONFIG_X86_32
183/** 306/**
184 * get_physical_broadcast - Get number of physical broadcast IDs 307 * get_physical_broadcast - Get number of physical broadcast IDs
185 */ 308 */
@@ -187,15 +310,20 @@ int get_physical_broadcast(void)
187{ 310{
188 return modern_apic() ? 0xff : 0xf; 311 return modern_apic() ? 0xff : 0xf;
189} 312}
313#endif
190 314
191/** 315/**
192 * lapic_get_maxlvt - get the maximum number of local vector table entries 316 * lapic_get_maxlvt - get the maximum number of local vector table entries
193 */ 317 */
194int lapic_get_maxlvt(void) 318int lapic_get_maxlvt(void)
195{ 319{
196 unsigned int v = apic_read(APIC_LVR); 320 unsigned int v;
197 321
198 /* 82489DXs do not report # of LVT entries. */ 322 v = apic_read(APIC_LVR);
323 /*
324 * - we always have APIC integrated on 64bit mode
325 * - 82489DXs do not report # of LVT entries
326 */
199 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; 327 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
200} 328}
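lapic_get_maxlvt() relies on the layout of the local APIC version register: the low byte holds the version (a non-zero high nibble means an integrated APIC rather than an external 82489DX) and bits 16-23 hold the number of LVT entries minus one. A short decode of a representative value; the macros mirror the conventional apicdef.h definitions and the sample register value is made up:

    #include <stdio.h>

    #define GET_APIC_VERSION(x)	((x) & 0xFF)
    #define GET_APIC_MAXLVT(x)	(((x) >> 16) & 0xFF)
    #define APIC_INTEGRATED(ver)	((ver) & 0xF0)

    int main(void)
    {
    	unsigned int lvr = 0x00050014;	/* sample only: version 0x14, 6 LVT entries */
    	unsigned int ver = GET_APIC_VERSION(lvr);

    	printf("version 0x%02x, integrated=%d, maxlvt=%u\n",
    	       ver, !!APIC_INTEGRATED(ver), GET_APIC_MAXLVT(lvr));
    	return 0;
    }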
201 329
@@ -203,7 +331,7 @@ int lapic_get_maxlvt(void)
203 * Local APIC timer 331 * Local APIC timer
204 */ 332 */
205 333
206/* Clock divisor is set to 16 */ 334/* Clock divisor */
207#define APIC_DIVISOR 16 335#define APIC_DIVISOR 16
208 336
209/* 337/*
@@ -229,27 +357,61 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
229 if (!irqen) 357 if (!irqen)
230 lvtt_value |= APIC_LVT_MASKED; 358 lvtt_value |= APIC_LVT_MASKED;
231 359
232 apic_write_around(APIC_LVTT, lvtt_value); 360 apic_write(APIC_LVTT, lvtt_value);
233 361
234 /* 362 /*
235 * Divide PICLK by 16 363 * Divide PICLK by 16
236 */ 364 */
237 tmp_value = apic_read(APIC_TDCR); 365 tmp_value = apic_read(APIC_TDCR);
238 apic_write_around(APIC_TDCR, (tmp_value 366 apic_write(APIC_TDCR,
239 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 367 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
240 | APIC_TDR_DIV_16); 368 APIC_TDR_DIV_16);
241 369
242 if (!oneshot) 370 if (!oneshot)
243 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); 371 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
244} 372}
245 373
246/* 374/*
375 * Setup extended LVT, AMD specific (K8, family 10h)
376 *
377 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
378 * MCE interrupts are supported. Thus MCE offset must be set to 0.
379 *
380 * If mask=1, the LVT entry does not generate interrupts while mask=0
381 * enables the vector. See also the BKDGs.
382 */
383
384#define APIC_EILVT_LVTOFF_MCE 0
385#define APIC_EILVT_LVTOFF_IBS 1
386
387static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
388{
389 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
390 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
391
392 apic_write(reg, v);
393}
394
395u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
396{
397 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
398 return APIC_EILVT_LVTOFF_MCE;
399}
400
401u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
402{
403 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
404 return APIC_EILVT_LVTOFF_IBS;
405}
406EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
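Each extended LVT entry sits 16 bytes after the previous one starting at APIC_EILVT0 (APIC500, per the comment above), and its value packs mask, message type and vector into one word. A stand-alone illustration of the offset and value arithmetic; the vector 0xf8 is just an example, not a kernel assignment:

    #include <stdio.h>

    #define APIC_EILVT0		0x500	/* extended LVT base, as in the comment above */
    #define APIC_EILVT_LVTOFF_IBS	1

    int main(void)
    {
    	unsigned int lvt_off = APIC_EILVT_LVTOFF_IBS;
    	unsigned int vector = 0xf8, msg_type = 0, mask = 0;

    	unsigned long reg = (lvt_off << 4) + APIC_EILVT0;	/* 0x510 for IBS */
    	unsigned int v = (mask << 16) | (msg_type << 8) | vector;

    	printf("IBS EILVT register offset %#lx, value %#x\n", reg, v);
    	return 0;
    }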
407
408/*
247 * Program the next event, relative to now 409 * Program the next event, relative to now
248 */ 410 */
249static int lapic_next_event(unsigned long delta, 411static int lapic_next_event(unsigned long delta,
250 struct clock_event_device *evt) 412 struct clock_event_device *evt)
251{ 413{
252 apic_write_around(APIC_TMICT, delta); 414 apic_write(APIC_TMICT, delta);
253 return 0; 415 return 0;
254} 416}
255 417
@@ -262,8 +424,8 @@ static void lapic_timer_setup(enum clock_event_mode mode,
262 unsigned long flags; 424 unsigned long flags;
263 unsigned int v; 425 unsigned int v;
264 426
265 /* Lapic used for broadcast ? */ 427 /* Lapic used as dummy for broadcast ? */
266 if (!local_apic_timer_verify_ok) 428 if (evt->features & CLOCK_EVT_FEAT_DUMMY)
267 return; 429 return;
268 430
269 local_irq_save(flags); 431 local_irq_save(flags);
@@ -278,7 +440,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
278 case CLOCK_EVT_MODE_SHUTDOWN: 440 case CLOCK_EVT_MODE_SHUTDOWN:
279 v = apic_read(APIC_LVTT); 441 v = apic_read(APIC_LVTT);
280 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 442 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
281 apic_write_around(APIC_LVTT, v); 443 apic_write(APIC_LVTT, v);
282 break; 444 break;
283 case CLOCK_EVT_MODE_RESUME: 445 case CLOCK_EVT_MODE_RESUME:
284 /* Nothing to do here */ 446 /* Nothing to do here */
@@ -302,7 +464,7 @@ static void lapic_timer_broadcast(cpumask_t mask)
302 * Setup the local APIC timer for this CPU. Copy the initialized values 464 * Setup the local APIC timer for this CPU. Copy the initialized values
303 * of the boot CPU and register the clock event in the framework. 465 * of the boot CPU and register the clock event in the framework.
304 */ 466 */
305static void __devinit setup_APIC_timer(void) 467static void __cpuinit setup_APIC_timer(void)
306{ 468{
307 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 469 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
308 470
@@ -372,39 +534,53 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
372 } 534 }
373} 535}
374 536
375/* 537static int __init calibrate_by_pmtimer(long deltapm, long *delta)
376 * Setup the boot APIC 538{
377 * 539 const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
378 * Calibrate and verify the result. 540 const long pm_thresh = pm_100ms / 100;
379 */ 541 unsigned long mult;
380void __init setup_boot_APIC_clock(void) 542 u64 res;
543
544#ifndef CONFIG_X86_PM_TIMER
545 return -1;
546#endif
547
548 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
549
550 /* Check, if the PM timer is available */
551 if (!deltapm)
552 return -1;
553
554 mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
555
556 if (deltapm > (pm_100ms - pm_thresh) &&
557 deltapm < (pm_100ms + pm_thresh)) {
558 apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
559 } else {
560 res = (((u64)deltapm) * mult) >> 22;
561 do_div(res, 1000000);
562 printk(KERN_WARNING "APIC calibration not consistent "
563 "with PM Timer: %ldms instead of 100ms\n",
564 (long)res);
565 /* Correct the lapic counter value */
566 res = (((u64)(*delta)) * pm_100ms);
567 do_div(res, deltapm);
568 printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
569 "%lu (%ld)\n", (unsigned long)res, *delta);
570 *delta = (long)res;
571 }
572
573 return 0;
574}
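The helper compares the PM-timer delta against the expected tick count for 100 ms (PMTMR_TICKS_PER_SEC / 10, the ACPI PM timer running at 3.579545 MHz) and, if the deviation exceeds 1%, rescales the lapic count so it corresponds to a true 100 ms window: delta = delta * pm_100ms / deltapm. A numeric sanity check of that correction, using made-up calibration readings:

    #include <stdio.h>

    #define PMTMR_TICKS_PER_SEC	3579545UL

    int main(void)
    {
    	const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;	/* expected ticks in 100 ms */
    	const long pm_thresh = pm_100ms / 100;		/* 1% tolerance window */

    	long delta   = 12500000;	/* lapic ticks observed (example value) */
    	long deltapm = 325413;		/* PM ticks observed: ~9% short (example value) */

    	if (deltapm <= pm_100ms - pm_thresh || deltapm >= pm_100ms + pm_thresh) {
    		long corrected = (long)((long long)delta * pm_100ms / deltapm);
    		printf("lapic delta rescaled from %ld to %ld\n", delta, corrected);
    	} else {
    		printf("PM timer agrees, keeping delta %ld\n", delta);
    	}
    	return 0;
    }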
575
576static int __init calibrate_APIC_clock(void)
381{ 577{
382 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 578 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
383 const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
384 const long pm_thresh = pm_100ms/100;
385 void (*real_handler)(struct clock_event_device *dev); 579 void (*real_handler)(struct clock_event_device *dev);
386 unsigned long deltaj; 580 unsigned long deltaj;
387 long delta, deltapm; 581 long delta;
388 int pm_referenced = 0; 582 int pm_referenced = 0;
389 583
390 /*
391 * The local apic timer can be disabled via the kernel
392 * commandline or from the CPU detection code. Register the lapic
393 * timer as a dummy clock event source on SMP systems, so the
394 * broadcast mechanism is used. On UP systems simply ignore it.
395 */
396 if (local_apic_timer_disabled) {
397 /* No broadcast on UP ! */
398 if (num_possible_cpus() > 1) {
399 lapic_clockevent.mult = 1;
400 setup_APIC_timer();
401 }
402 return;
403 }
404
405 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
406 "calibrating APIC timer ...\n");
407
408 local_irq_disable(); 584 local_irq_disable();
409 585
410 /* Replace the global interrupt handler */ 586 /* Replace the global interrupt handler */
@@ -412,10 +588,10 @@ void __init setup_boot_APIC_clock(void)
412 global_clock_event->event_handler = lapic_cal_handler; 588 global_clock_event->event_handler = lapic_cal_handler;
413 589
414 /* 590 /*
415 * Setup the APIC counter to 1e9. There is no way the lapic 591 * Setup the APIC counter to maximum. There is no way the lapic
416 * can underflow in the 100ms detection time frame 592 * can underflow in the 100ms detection time frame
417 */ 593 */
418 __setup_APIC_LVTT(1000000000, 0, 0); 594 __setup_APIC_LVTT(0xffffffff, 0, 0);
419 595
420 /* Let the interrupts run */ 596 /* Let the interrupts run */
421 local_irq_enable(); 597 local_irq_enable();
@@ -432,34 +608,9 @@ void __init setup_boot_APIC_clock(void)
432 delta = lapic_cal_t1 - lapic_cal_t2; 608 delta = lapic_cal_t1 - lapic_cal_t2;
433 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); 609 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
434 610
435 /* Check, if the PM timer is available */ 611 /* we trust the PM based calibration if possible */
436 deltapm = lapic_cal_pm2 - lapic_cal_pm1; 612 pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
437 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); 613 &delta);
438
439 if (deltapm) {
440 unsigned long mult;
441 u64 res;
442
443 mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
444
445 if (deltapm > (pm_100ms - pm_thresh) &&
446 deltapm < (pm_100ms + pm_thresh)) {
447 apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
448 } else {
449 res = (((u64) deltapm) * mult) >> 22;
450 do_div(res, 1000000);
451 printk(KERN_WARNING "APIC calibration not consistent "
452 "with PM Timer: %ldms instead of 100ms\n",
453 (long)res);
454 /* Correct the lapic counter value */
455 res = (((u64) delta) * pm_100ms);
456 do_div(res, deltapm);
457 printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
458 "%lu (%ld)\n", (unsigned long) res, delta);
459 delta = (long) res;
460 }
461 pm_referenced = 1;
462 }
463 614
464 /* Calculate the scaled math multiplication factor */ 615 /* Calculate the scaled math multiplication factor */
465 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 616 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
@@ -489,8 +640,6 @@ void __init setup_boot_APIC_clock(void)
489 calibration_result / (1000000 / HZ), 640 calibration_result / (1000000 / HZ),
490 calibration_result % (1000000 / HZ)); 641 calibration_result % (1000000 / HZ));
491 642
492 local_apic_timer_verify_ok = 1;
493
494 /* 643 /*
495 * Do a sanity check on the APIC calibration result 644 * Do a sanity check on the APIC calibration result
496 */ 645 */
@@ -498,13 +647,15 @@ void __init setup_boot_APIC_clock(void)
498 local_irq_enable(); 647 local_irq_enable();
499 printk(KERN_WARNING 648 printk(KERN_WARNING
500 "APIC frequency too slow, disabling apic timer\n"); 649 "APIC frequency too slow, disabling apic timer\n");
501 /* No broadcast on UP ! */ 650 return -1;
502 if (num_possible_cpus() > 1)
503 setup_APIC_timer();
504 return;
505 } 651 }
506 652
507 /* We trust the pm timer based calibration */ 653 levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
654
655 /*
656 * PM timer calibration failed or not turned on
657 * so lets try APIC timer based calibration
658 */
508 if (!pm_referenced) { 659 if (!pm_referenced) {
509 apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); 660 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
510 661
@@ -536,34 +687,68 @@ void __init setup_boot_APIC_clock(void)
536 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) 687 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
537 apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); 688 apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
538 else 689 else
539 local_apic_timer_verify_ok = 0; 690 levt->features |= CLOCK_EVT_FEAT_DUMMY;
540 } else 691 } else
541 local_irq_enable(); 692 local_irq_enable();
542 693
543 if (!local_apic_timer_verify_ok) { 694 if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
544 printk(KERN_WARNING 695 printk(KERN_WARNING
545 "APIC timer disabled due to verification failure.\n"); 696 "APIC timer disabled due to verification failure.\n");
697 return -1;
698 }
699
700 return 0;
701}
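Once a trustworthy count of APIC ticks per calibration window exists, the clockevent multiplier is plain fixed-point math: mult = (ticks << shift) / window_ns, so programming an event of N nanoseconds later costs (N * mult) >> shift timer ticks. A compact worked example with an assumed shift of 32 and illustrative calibration numbers (neither value is taken from the hunk above):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	uint64_t window_ns = 100 * 1000 * 1000ULL;	/* assumed 100 ms calibration window */
    	uint64_t ticks = 12500000;			/* assumed APIC ticks seen in that window */
    	unsigned int shift = 32;

    	uint32_t mult = (uint32_t)((ticks << shift) / window_ns);	/* div_sc()-style */

    	/* Program a 2 ms event: nanoseconds -> timer ticks. */
    	uint64_t delta_ns = 2 * 1000 * 1000ULL;
    	uint64_t program = (delta_ns * mult) >> shift;

    	printf("mult=%u, 2ms -> %" PRIu64 " ticks (expect 250000 here)\n", mult, program);
    	return 0;
    }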
702
703/*
704 * Setup the boot APIC
705 *
706 * Calibrate and verify the result.
707 */
708void __init setup_boot_APIC_clock(void)
709{
710 /*
711 * The local apic timer can be disabled via the kernel
712 * commandline or from the CPU detection code. Register the lapic
713 * timer as a dummy clock event source on SMP systems, so the
714 * broadcast mechanism is used. On UP systems simply ignore it.
715 */
716 if (disable_apic_timer) {
717 printk(KERN_INFO "Disabling APIC timer\n");
546 /* No broadcast on UP ! */ 718 /* No broadcast on UP ! */
547 if (num_possible_cpus() == 1) 719 if (num_possible_cpus() > 1) {
548 return; 720 lapic_clockevent.mult = 1;
549 } else { 721 setup_APIC_timer();
550 /* 722 }
551 * If nmi_watchdog is set to IO_APIC, we need the 723 return;
552 * PIT/HPET going. Otherwise register lapic as a dummy 724 }
553 * device. 725
554 */ 726 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
555 if (nmi_watchdog != NMI_IO_APIC) 727 "calibrating APIC timer ...\n");
556 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; 728
557 else 729 if (calibrate_APIC_clock()) {
558 printk(KERN_WARNING "APIC timer registered as dummy," 730 /* No broadcast on UP ! */
559 " due to nmi_watchdog=%d!\n", nmi_watchdog); 731 if (num_possible_cpus() > 1)
732 setup_APIC_timer();
733 return;
560 } 734 }
561 735
736 /*
737 * If nmi_watchdog is set to IO_APIC, we need the
738 * PIT/HPET going. Otherwise register lapic as a dummy
739 * device.
740 */
741 if (nmi_watchdog != NMI_IO_APIC)
742 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
743 else
744 printk(KERN_WARNING "APIC timer registered as dummy,"
745 " due to nmi_watchdog=%d!\n", nmi_watchdog);
746
562 /* Setup the lapic or request the broadcast */ 747 /* Setup the lapic or request the broadcast */
563 setup_APIC_timer(); 748 setup_APIC_timer();
564} 749}
565 750
566void __devinit setup_secondary_APIC_clock(void) 751void __cpuinit setup_secondary_APIC_clock(void)
567{ 752{
568 setup_APIC_timer(); 753 setup_APIC_timer();
569} 754}
@@ -598,7 +783,11 @@ static void local_apic_timer_interrupt(void)
598 /* 783 /*
599 * the NMI deadlock-detector uses this. 784 * the NMI deadlock-detector uses this.
600 */ 785 */
786#ifdef CONFIG_X86_64
787 add_pda(apic_timer_irqs, 1);
788#else
601 per_cpu(irq_stat, cpu).apic_timer_irqs++; 789 per_cpu(irq_stat, cpu).apic_timer_irqs++;
790#endif
602 791
603 evt->event_handler(evt); 792 evt->event_handler(evt);
604} 793}
@@ -625,6 +814,9 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
625 * Besides, if we don't, timer interrupts ignore the global 814 * Besides, if we don't, timer interrupts ignore the global
626 * interrupt lock, which is the WrongThing (tm) to do. 815 * interrupt lock, which is the WrongThing (tm) to do.
627 */ 816 */
817#ifdef CONFIG_X86_64
818 exit_idle();
819#endif
628 irq_enter(); 820 irq_enter();
629 local_apic_timer_interrupt(); 821 local_apic_timer_interrupt();
630 irq_exit(); 822 irq_exit();
@@ -638,35 +830,6 @@ int setup_profiling_timer(unsigned int multiplier)
638} 830}
639 831
640/* 832/*
641 * Setup extended LVT, AMD specific (K8, family 10h)
642 *
643 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
644 * MCE interrupts are supported. Thus MCE offset must be set to 0.
645 */
646
647#define APIC_EILVT_LVTOFF_MCE 0
648#define APIC_EILVT_LVTOFF_IBS 1
649
650static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
651{
652 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
653 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
654 apic_write(reg, v);
655}
656
657u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
658{
659 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
660 return APIC_EILVT_LVTOFF_MCE;
661}
662
663u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
664{
665 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
666 return APIC_EILVT_LVTOFF_IBS;
667}
668
669/*
670 * Local APIC start and shutdown 833 * Local APIC start and shutdown
671 */ 834 */
672 835
@@ -693,45 +856,41 @@ void clear_local_APIC(void)
693 */ 856 */
694 if (maxlvt >= 3) { 857 if (maxlvt >= 3) {
695 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ 858 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
696 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); 859 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
697 } 860 }
698 /* 861 /*
699 * Careful: we have to set masks only first to deassert 862 * Careful: we have to set masks only first to deassert
700 * any level-triggered sources. 863 * any level-triggered sources.
701 */ 864 */
702 v = apic_read(APIC_LVTT); 865 v = apic_read(APIC_LVTT);
703 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); 866 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
704 v = apic_read(APIC_LVT0); 867 v = apic_read(APIC_LVT0);
705 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); 868 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
706 v = apic_read(APIC_LVT1); 869 v = apic_read(APIC_LVT1);
707 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); 870 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
708 if (maxlvt >= 4) { 871 if (maxlvt >= 4) {
709 v = apic_read(APIC_LVTPC); 872 v = apic_read(APIC_LVTPC);
710 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); 873 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
711 } 874 }
712 875
713 /* lets not touch this if we didn't frob it */ 876 /* lets not touch this if we didn't frob it */
714#ifdef CONFIG_X86_MCE_P4THERMAL 877#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
715 if (maxlvt >= 5) { 878 if (maxlvt >= 5) {
716 v = apic_read(APIC_LVTTHMR); 879 v = apic_read(APIC_LVTTHMR);
717 apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); 880 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
718 } 881 }
719#endif 882#endif
720 /* 883 /*
721 * Clean APIC state for other OSs: 884 * Clean APIC state for other OSs:
722 */ 885 */
723 apic_write_around(APIC_LVTT, APIC_LVT_MASKED); 886 apic_write(APIC_LVTT, APIC_LVT_MASKED);
724 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 887 apic_write(APIC_LVT0, APIC_LVT_MASKED);
725 apic_write_around(APIC_LVT1, APIC_LVT_MASKED); 888 apic_write(APIC_LVT1, APIC_LVT_MASKED);
726 if (maxlvt >= 3) 889 if (maxlvt >= 3)
727 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); 890 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
728 if (maxlvt >= 4) 891 if (maxlvt >= 4)
729 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); 892 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
730 893
731#ifdef CONFIG_X86_MCE_P4THERMAL
732 if (maxlvt >= 5)
733 apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
734#endif
735 /* Integrated APIC (!82489DX) ? */ 894 /* Integrated APIC (!82489DX) ? */
736 if (lapic_is_integrated()) { 895 if (lapic_is_integrated()) {
737 if (maxlvt > 3) 896 if (maxlvt > 3)
@@ -746,7 +905,7 @@ void clear_local_APIC(void)
746 */ 905 */
747void disable_local_APIC(void) 906void disable_local_APIC(void)
748{ 907{
749 unsigned long value; 908 unsigned int value;
750 909
751 clear_local_APIC(); 910 clear_local_APIC();
752 911
@@ -756,8 +915,9 @@ void disable_local_APIC(void)
756 */ 915 */
757 value = apic_read(APIC_SPIV); 916 value = apic_read(APIC_SPIV);
758 value &= ~APIC_SPIV_APIC_ENABLED; 917 value &= ~APIC_SPIV_APIC_ENABLED;
759 apic_write_around(APIC_SPIV, value); 918 apic_write(APIC_SPIV, value);
760 919
920#ifdef CONFIG_X86_32
761 /* 921 /*
762 * When LAPIC was disabled by the BIOS and enabled by the kernel, 922 * When LAPIC was disabled by the BIOS and enabled by the kernel,
763 * restore the disabled state. 923 * restore the disabled state.
@@ -769,6 +929,7 @@ void disable_local_APIC(void)
769 l &= ~MSR_IA32_APICBASE_ENABLE; 929 l &= ~MSR_IA32_APICBASE_ENABLE;
770 wrmsr(MSR_IA32_APICBASE, l, h); 930 wrmsr(MSR_IA32_APICBASE, l, h);
771 } 931 }
932#endif
772} 933}
773 934
774/* 935/*
@@ -785,11 +946,15 @@ void lapic_shutdown(void)
785 return; 946 return;
786 947
787 local_irq_save(flags); 948 local_irq_save(flags);
788 clear_local_APIC();
789 949
790 if (enabled_via_apicbase) 950#ifdef CONFIG_X86_32
951 if (!enabled_via_apicbase)
952 clear_local_APIC();
953 else
954#endif
791 disable_local_APIC(); 955 disable_local_APIC();
792 956
957
793 local_irq_restore(flags); 958 local_irq_restore(flags);
794} 959}
795 960
@@ -834,6 +999,12 @@ int __init verify_local_APIC(void)
834 */ 999 */
835 reg0 = apic_read(APIC_ID); 1000 reg0 = apic_read(APIC_ID);
836 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); 1001 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
1002 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
1003 reg1 = apic_read(APIC_ID);
1004 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
1005 apic_write(APIC_ID, reg0);
1006 if (reg1 != (reg0 ^ APIC_ID_MASK))
1007 return 0;
837 1008
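The added lines give verify_local_APIC() a write/read-back probe: flip every bit of the ID field with XOR, read the register again, and require the flipped pattern to come back before trusting the APIC ID register. The same toggle test in miniature against a fake register; the mask and register value here are placeholders, not the kernel's APIC_ID_MASK:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define FAKE_ID_MASK	(0xFFu << 24)	/* placeholder for APIC_ID_MASK */

    static uint32_t fake_reg;		/* stands in for the memory-mapped APIC_ID */

    static uint32_t reg_read(void)        { return fake_reg; }
    static void     reg_write(uint32_t v) { fake_reg = v; }

    /* Returns true if the register really latches writes to the ID field. */
    static bool id_register_works(void)
    {
    	uint32_t reg0 = reg_read();
    	bool ok;

    	reg_write(reg0 ^ FAKE_ID_MASK);
    	ok = (reg_read() == (reg0 ^ FAKE_ID_MASK));
    	reg_write(reg0);		/* restore the original value */
    	return ok;
    }

    int main(void)
    {
    	fake_reg = 0x03000000;
    	printf("ID register read/write works: %d\n", id_register_works());
    	return 0;
    }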
838 /* 1009 /*
839 * The next two are just to see if we have sane values. 1010 * The next two are just to see if we have sane values.
@@ -859,14 +1030,15 @@ void __init sync_Arb_IDs(void)
859 */ 1030 */
860 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 1031 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
861 return; 1032 return;
1033
862 /* 1034 /*
863 * Wait for idle. 1035 * Wait for idle.
864 */ 1036 */
865 apic_wait_icr_idle(); 1037 apic_wait_icr_idle();
866 1038
867 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 1039 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
868 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG 1040 apic_write(APIC_ICR, APIC_DEST_ALLINC |
869 | APIC_DM_INIT); 1041 APIC_INT_LEVELTRIG | APIC_DM_INIT);
870} 1042}
871 1043
872/* 1044/*
@@ -874,7 +1046,7 @@ void __init sync_Arb_IDs(void)
874 */ 1046 */
875void __init init_bsp_APIC(void) 1047void __init init_bsp_APIC(void)
876{ 1048{
877 unsigned long value; 1049 unsigned int value;
878 1050
879 /* 1051 /*
880 * Don't do the setup now if we have a SMP BIOS as the 1052 * Don't do the setup now if we have a SMP BIOS as the
@@ -895,60 +1067,66 @@ void __init init_bsp_APIC(void)
895 value &= ~APIC_VECTOR_MASK; 1067 value &= ~APIC_VECTOR_MASK;
896 value |= APIC_SPIV_APIC_ENABLED; 1068 value |= APIC_SPIV_APIC_ENABLED;
897 1069
1070#ifdef CONFIG_X86_32
898 /* This bit is reserved on P4/Xeon and should be cleared */ 1071 /* This bit is reserved on P4/Xeon and should be cleared */
899 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && 1072 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
900 (boot_cpu_data.x86 == 15)) 1073 (boot_cpu_data.x86 == 15))
901 value &= ~APIC_SPIV_FOCUS_DISABLED; 1074 value &= ~APIC_SPIV_FOCUS_DISABLED;
902 else 1075 else
1076#endif
903 value |= APIC_SPIV_FOCUS_DISABLED; 1077 value |= APIC_SPIV_FOCUS_DISABLED;
904 value |= SPURIOUS_APIC_VECTOR; 1078 value |= SPURIOUS_APIC_VECTOR;
905 apic_write_around(APIC_SPIV, value); 1079 apic_write(APIC_SPIV, value);
906 1080
907 /* 1081 /*
908 * Set up the virtual wire mode. 1082 * Set up the virtual wire mode.
909 */ 1083 */
910 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 1084 apic_write(APIC_LVT0, APIC_DM_EXTINT);
911 value = APIC_DM_NMI; 1085 value = APIC_DM_NMI;
912 if (!lapic_is_integrated()) /* 82489DX */ 1086 if (!lapic_is_integrated()) /* 82489DX */
913 value |= APIC_LVT_LEVEL_TRIGGER; 1087 value |= APIC_LVT_LEVEL_TRIGGER;
914 apic_write_around(APIC_LVT1, value); 1088 apic_write(APIC_LVT1, value);
915} 1089}
916 1090
917static void __cpuinit lapic_setup_esr(void) 1091static void __cpuinit lapic_setup_esr(void)
918{ 1092{
919 unsigned long oldvalue, value, maxlvt; 1093 unsigned int oldvalue, value, maxlvt;
920 if (lapic_is_integrated() && !esr_disable) {
921 /* !82489DX */
922 maxlvt = lapic_get_maxlvt();
923 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
924 apic_write(APIC_ESR, 0);
925 oldvalue = apic_read(APIC_ESR);
926 1094
927 /* enables sending errors */ 1095 if (!lapic_is_integrated()) {
928 value = ERROR_APIC_VECTOR; 1096 printk(KERN_INFO "No ESR for 82489DX.\n");
929 apic_write_around(APIC_LVTERR, value); 1097 return;
1098 }
1099
1100 if (esr_disable) {
930 /* 1101 /*
931 * spec says clear errors after enabling vector. 1102 * Something untraceable is creating bad interrupts on
1103 * secondary quads ... for the moment, just leave the
1104 * ESR disabled - we can't do anything useful with the
1105 * errors anyway - mbligh
932 */ 1106 */
933 if (maxlvt > 3) 1107 printk(KERN_INFO "Leaving ESR disabled.\n");
934 apic_write(APIC_ESR, 0); 1108 return;
935 value = apic_read(APIC_ESR);
936 if (value != oldvalue)
937 apic_printk(APIC_VERBOSE, "ESR value before enabling "
938 "vector: 0x%08lx after: 0x%08lx\n",
939 oldvalue, value);
940 } else {
941 if (esr_disable)
942 /*
943 * Something untraceable is creating bad interrupts on
944 * secondary quads ... for the moment, just leave the
945 * ESR disabled - we can't do anything useful with the
946 * errors anyway - mbligh
947 */
948 printk(KERN_INFO "Leaving ESR disabled.\n");
949 else
950 printk(KERN_INFO "No ESR for 82489DX.\n");
951 } 1109 }
1110
1111 maxlvt = lapic_get_maxlvt();
1112 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1113 apic_write(APIC_ESR, 0);
1114 oldvalue = apic_read(APIC_ESR);
1115
1116 /* enables sending errors */
1117 value = ERROR_APIC_VECTOR;
1118 apic_write(APIC_LVTERR, value);
1119
1120 /*
1121 * spec says clear errors after enabling vector.
1122 */
1123 if (maxlvt > 3)
1124 apic_write(APIC_ESR, 0);
1125 value = apic_read(APIC_ESR);
1126 if (value != oldvalue)
1127 apic_printk(APIC_VERBOSE, "ESR value before enabling "
1128 "vector: 0x%08x after: 0x%08x\n",
1129 oldvalue, value);
952} 1130}
953 1131
954 1132
@@ -957,24 +1135,27 @@ static void __cpuinit lapic_setup_esr(void)
957 */ 1135 */
958void __cpuinit setup_local_APIC(void) 1136void __cpuinit setup_local_APIC(void)
959{ 1137{
960 unsigned long value, integrated; 1138 unsigned int value;
961 int i, j; 1139 int i, j;
962 1140
1141#ifdef CONFIG_X86_32
963 /* Pound the ESR really hard over the head with a big hammer - mbligh */ 1142 /* Pound the ESR really hard over the head with a big hammer - mbligh */
964 if (esr_disable) { 1143 if (lapic_is_integrated() && esr_disable) {
965 apic_write(APIC_ESR, 0); 1144 apic_write(APIC_ESR, 0);
966 apic_write(APIC_ESR, 0); 1145 apic_write(APIC_ESR, 0);
967 apic_write(APIC_ESR, 0); 1146 apic_write(APIC_ESR, 0);
968 apic_write(APIC_ESR, 0); 1147 apic_write(APIC_ESR, 0);
969 } 1148 }
1149#endif
970 1150
971 integrated = lapic_is_integrated(); 1151 preempt_disable();
972 1152
973 /* 1153 /*
974 * Double-check whether this APIC is really registered. 1154 * Double-check whether this APIC is really registered.
1155 * This is meaningless in clustered apic mode, so we skip it.
975 */ 1156 */
976 if (!apic_id_registered()) 1157 if (!apic_id_registered())
977 WARN_ON_ONCE(1); 1158 BUG();
978 1159
979 /* 1160 /*
980 * Intel recommends to set DFR, LDR and TPR before enabling 1161 * Intel recommends to set DFR, LDR and TPR before enabling
@@ -989,7 +1170,7 @@ void __cpuinit setup_local_APIC(void)
989 */ 1170 */
990 value = apic_read(APIC_TASKPRI); 1171 value = apic_read(APIC_TASKPRI);
991 value &= ~APIC_TPRI_MASK; 1172 value &= ~APIC_TPRI_MASK;
992 apic_write_around(APIC_TASKPRI, value); 1173 apic_write(APIC_TASKPRI, value);
993 1174
994 /* 1175 /*
995 * After a crash, we no longer service the interrupts and a pending 1176 * After a crash, we no longer service the interrupts and a pending
@@ -1020,6 +1201,7 @@ void __cpuinit setup_local_APIC(void)
1020 */ 1201 */
1021 value |= APIC_SPIV_APIC_ENABLED; 1202 value |= APIC_SPIV_APIC_ENABLED;
1022 1203
1204#ifdef CONFIG_X86_32
1023 /* 1205 /*
1024 * Some unknown Intel IO/APIC (or APIC) errata is biting us with 1206 * Some unknown Intel IO/APIC (or APIC) errata is biting us with
1025 * certain networking cards. If high frequency interrupts are 1207 * certain networking cards. If high frequency interrupts are
@@ -1040,14 +1222,19 @@ void __cpuinit setup_local_APIC(void)
1040 * See also the comment in end_level_ioapic_irq(). --macro 1222 * See also the comment in end_level_ioapic_irq(). --macro
1041 */ 1223 */
1042 1224
1043 /* Enable focus processor (bit==0) */ 1225 /*
1226 * - enable focus processor (bit==0)
1227 * - 64bit mode always use processor focus
1228 * so no need to set it
1229 */
1044 value &= ~APIC_SPIV_FOCUS_DISABLED; 1230 value &= ~APIC_SPIV_FOCUS_DISABLED;
1231#endif
1045 1232
1046 /* 1233 /*
1047 * Set spurious IRQ vector 1234 * Set spurious IRQ vector
1048 */ 1235 */
1049 value |= SPURIOUS_APIC_VECTOR; 1236 value |= SPURIOUS_APIC_VECTOR;
1050 apic_write_around(APIC_SPIV, value); 1237 apic_write(APIC_SPIV, value);
1051 1238
1052 /* 1239 /*
1053 * Set up LVT0, LVT1: 1240 * Set up LVT0, LVT1:
@@ -1069,7 +1256,7 @@ void __cpuinit setup_local_APIC(void)
1069 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", 1256 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
1070 smp_processor_id()); 1257 smp_processor_id());
1071 } 1258 }
1072 apic_write_around(APIC_LVT0, value); 1259 apic_write(APIC_LVT0, value);
1073 1260
1074 /* 1261 /*
1075 * only the BP should see the LINT1 NMI signal, obviously. 1262 * only the BP should see the LINT1 NMI signal, obviously.
@@ -1078,25 +1265,178 @@ void __cpuinit setup_local_APIC(void)
1078 value = APIC_DM_NMI; 1265 value = APIC_DM_NMI;
1079 else 1266 else
1080 value = APIC_DM_NMI | APIC_LVT_MASKED; 1267 value = APIC_DM_NMI | APIC_LVT_MASKED;
1081 if (!integrated) /* 82489DX */ 1268 if (!lapic_is_integrated()) /* 82489DX */
1082 value |= APIC_LVT_LEVEL_TRIGGER; 1269 value |= APIC_LVT_LEVEL_TRIGGER;
1083 apic_write_around(APIC_LVT1, value); 1270 apic_write(APIC_LVT1, value);
1271
1272 preempt_enable();
1084} 1273}
1085 1274
1086void __cpuinit end_local_APIC_setup(void) 1275void __cpuinit end_local_APIC_setup(void)
1087{ 1276{
1088 unsigned long value;
1089
1090 lapic_setup_esr(); 1277 lapic_setup_esr();
1091 /* Disable the local apic timer */ 1278
1092 value = apic_read(APIC_LVTT); 1279#ifdef CONFIG_X86_32
1093 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 1280 {
1094 apic_write_around(APIC_LVTT, value); 1281 unsigned int value;
1282 /* Disable the local apic timer */
1283 value = apic_read(APIC_LVTT);
1284 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1285 apic_write(APIC_LVTT, value);
1286 }
1287#endif
1095 1288
1096 setup_apic_nmi_watchdog(NULL); 1289 setup_apic_nmi_watchdog(NULL);
1097 apic_pm_activate(); 1290 apic_pm_activate();
1098} 1291}
1099 1292
1293#ifdef HAVE_X2APIC
1294void check_x2apic(void)
1295{
1296 int msr, msr2;
1297
1298 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1299
1300 if (msr & X2APIC_ENABLE) {
1301 printk("x2apic enabled by BIOS, switching to x2apic ops\n");
1302 x2apic_preenabled = x2apic = 1;
1303 apic_ops = &x2apic_ops;
1304 }
1305}
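check_x2apic() only needs one architectural MSR, IA32_APIC_BASE: per its architectural layout, bit 11 is the global APIC enable, bit 10 (EXTD) is the x2APIC enable tested above, bit 8 flags the BSP, and the upper bits carry the APIC base address. A tiny decoder for a sample MSR value (the sample contents are made up):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define APIC_BASE_GLOBAL_ENABLE	(1ULL << 11)
    #define APIC_BASE_X2APIC_ENABLE	(1ULL << 10)	/* the X2APIC_ENABLE bit */
    #define APIC_BASE_BSP		(1ULL << 8)

    int main(void)
    {
    	uint64_t msr = 0xfee00d00ULL;	/* sample IA32_APIC_BASE contents, illustrative */

    	printf("BSP=%d, x2apic=%d, enabled=%d, base=%#" PRIx64 "\n",
    	       !!(msr & APIC_BASE_BSP),
    	       !!(msr & APIC_BASE_X2APIC_ENABLE),
    	       !!(msr & APIC_BASE_GLOBAL_ENABLE),
    	       msr & ~0xfffULL);
    	return 0;
    }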
1306
1307void enable_x2apic(void)
1308{
1309 int msr, msr2;
1310
1311 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1312 if (!(msr & X2APIC_ENABLE)) {
1313 printk("Enabling x2apic\n");
1314 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1315 }
1316}
1317
1318void enable_IR_x2apic(void)
1319{
1320#ifdef CONFIG_INTR_REMAP
1321 int ret;
1322 unsigned long flags;
1323
1324 if (!cpu_has_x2apic)
1325 return;
1326
1327 if (!x2apic_preenabled && disable_x2apic) {
1328 printk(KERN_INFO
1329 "Skipped enabling x2apic and Interrupt-remapping "
1330 "because of nox2apic\n");
1331 return;
1332 }
1333
1334 if (x2apic_preenabled && disable_x2apic)
1335 panic("Bios already enabled x2apic, can't enforce nox2apic");
1336
1337 if (!x2apic_preenabled && skip_ioapic_setup) {
1338 printk(KERN_INFO
1339 "Skipped enabling x2apic and Interrupt-remapping "
1340 "because of skipping io-apic setup\n");
1341 return;
1342 }
1343
1344 ret = dmar_table_init();
1345 if (ret) {
1346 printk(KERN_INFO
1347 "dmar_table_init() failed with %d:\n", ret);
1348
1349 if (x2apic_preenabled)
1350 panic("x2apic enabled by bios. But IR enabling failed");
1351 else
1352 printk(KERN_INFO
1353 "Not enabling x2apic,Intr-remapping\n");
1354 return;
1355 }
1356
1357 local_irq_save(flags);
1358 mask_8259A();
1359
1360 ret = save_mask_IO_APIC_setup();
1361 if (ret) {
1362 printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret);
1363 goto end;
1364 }
1365
1366 ret = enable_intr_remapping(1);
1367
1368 if (ret && x2apic_preenabled) {
1369 local_irq_restore(flags);
1370 panic("x2apic enabled by bios. But IR enabling failed");
1371 }
1372
1373 if (ret)
1374 goto end_restore;
1375
1376 if (!x2apic) {
1377 x2apic = 1;
1378 apic_ops = &x2apic_ops;
1379 enable_x2apic();
1380 }
1381
1382end_restore:
1383 if (ret)
1384 /*
1385 * IR enabling failed
1386 */
1387 restore_IO_APIC_setup();
1388 else
1389 reinit_intr_remapped_IO_APIC(x2apic_preenabled);
1390
1391end:
1392 unmask_8259A();
1393 local_irq_restore(flags);
1394
1395 if (!ret) {
1396 if (!x2apic_preenabled)
1397 printk(KERN_INFO
1398 "Enabled x2apic and interrupt-remapping\n");
1399 else
1400 printk(KERN_INFO
1401 "Enabled Interrupt-remapping\n");
1402 } else
1403 printk(KERN_ERR
1404 "Failed to enable Interrupt-remapping and x2apic\n");
1405#else
1406 if (!cpu_has_x2apic)
1407 return;
1408
1409 if (x2apic_preenabled)
1410 panic("x2apic enabled prior to OS handover,"
1411 " enable CONFIG_INTR_REMAP");
1412
1413 printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
1414 " and x2apic\n");
1415#endif
1416
1417 return;
1418}
1419#endif /* HAVE_X2APIC */
1420
1421#ifdef CONFIG_X86_64
1422/*
1423 * Detect and enable local APICs on non-SMP boards.
1424 * Original code written by Keir Fraser.
1425 * On AMD64 we trust the BIOS - if it says no APIC it is likely
1426 * not correctly set up (usually the APIC timer won't work etc.)
1427 */
1428static int __init detect_init_APIC(void)
1429{
1430 if (!cpu_has_apic) {
1431 printk(KERN_INFO "No local APIC present\n");
1432 return -1;
1433 }
1434
1435 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1436 boot_cpu_physical_apicid = 0;
1437 return 0;
1438}
1439#else
1100/* 1440/*
1101 * Detect and initialize APIC 1441 * Detect and initialize APIC
1102 */ 1442 */
@@ -1175,12 +1515,46 @@ no_apic:
1175 printk(KERN_INFO "No local APIC present or hardware disabled\n"); 1515 printk(KERN_INFO "No local APIC present or hardware disabled\n");
1176 return -1; 1516 return -1;
1177} 1517}
1518#endif
1519
1520#ifdef CONFIG_X86_64
1521void __init early_init_lapic_mapping(void)
1522{
1523 unsigned long phys_addr;
1524
1525 /*
1526 * If no local APIC can be found then go out
1527 * : it means there is no mpatable and MADT
1528 */
1529 if (!smp_found_config)
1530 return;
1531
1532 phys_addr = mp_lapic_addr;
1533
1534 set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
1535 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
1536 APIC_BASE, phys_addr);
1537
1538 /*
1539 * Fetch the APIC ID of the BSP in case we have a
1540 * default configuration (or the MP table is broken).
1541 */
1542 boot_cpu_physical_apicid = read_apic_id();
1543}
1544#endif
1178 1545
1179/** 1546/**
1180 * init_apic_mappings - initialize APIC mappings 1547 * init_apic_mappings - initialize APIC mappings
1181 */ 1548 */
1182void __init init_apic_mappings(void) 1549void __init init_apic_mappings(void)
1183{ 1550{
1551#ifdef HAVE_X2APIC
1552 if (x2apic) {
1553 boot_cpu_physical_apicid = read_apic_id();
1554 return;
1555 }
1556#endif
1557
1184 /* 1558 /*
1185 * If no local APIC can be found then set up a fake all 1559 * If no local APIC can be found then set up a fake all
1186 * zeroes page to simulate the local APIC and another 1560 * zeroes page to simulate the local APIC and another
@@ -1193,30 +1567,36 @@ void __init init_apic_mappings(void)
1193 apic_phys = mp_lapic_addr; 1567 apic_phys = mp_lapic_addr;
1194 1568
1195 set_fixmap_nocache(FIX_APIC_BASE, apic_phys); 1569 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
1196 printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE, 1570 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
1197 apic_phys); 1571 APIC_BASE, apic_phys);
1198 1572
1199 /* 1573 /*
1200 * Fetch the APIC ID of the BSP in case we have a 1574 * Fetch the APIC ID of the BSP in case we have a
1201 * default configuration (or the MP table is broken). 1575 * default configuration (or the MP table is broken).
1202 */ 1576 */
1203 if (boot_cpu_physical_apicid == -1U) 1577 if (boot_cpu_physical_apicid == -1U)
1204 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1578 boot_cpu_physical_apicid = read_apic_id();
1205
1206} 1579}
1207 1580
1208/* 1581/*
1209 * This initializes the IO-APIC and APIC hardware if this is 1582 * This initializes the IO-APIC and APIC hardware if this is
1210 * a UP kernel. 1583 * a UP kernel.
1211 */ 1584 */
1212
1213int apic_version[MAX_APICS]; 1585int apic_version[MAX_APICS];
1214 1586
1215int __init APIC_init_uniprocessor(void) 1587int __init APIC_init_uniprocessor(void)
1216{ 1588{
1217 if (disable_apic) 1589#ifdef CONFIG_X86_64
1218 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1590 if (disable_apic) {
1219 1591 printk(KERN_INFO "Apic disabled\n");
1592 return -1;
1593 }
1594 if (!cpu_has_apic) {
1595 disable_apic = 1;
1596 printk(KERN_INFO "Apic disabled by BIOS\n");
1597 return -1;
1598 }
1599#else
1220 if (!smp_found_config && !cpu_has_apic) 1600 if (!smp_found_config && !cpu_has_apic)
1221 return -1; 1601 return -1;
1222 1602
@@ -1225,39 +1605,68 @@ int __init APIC_init_uniprocessor(void)
1225 */ 1605 */
1226 if (!cpu_has_apic && 1606 if (!cpu_has_apic &&
1227 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { 1607 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1228 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 1608 printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n",
1229 boot_cpu_physical_apicid); 1609 boot_cpu_physical_apicid);
1230 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1610 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1231 return -1; 1611 return -1;
1232 } 1612 }
1613#endif
1233 1614
1234 verify_local_APIC(); 1615#ifdef HAVE_X2APIC
1616 enable_IR_x2apic();
1617#endif
1618#ifdef CONFIG_X86_64
1619 setup_apic_routing();
1620#endif
1235 1621
1622 verify_local_APIC();
1236 connect_bsp_APIC(); 1623 connect_bsp_APIC();
1237 1624
1625#ifdef CONFIG_X86_64
1626 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
1627#else
1238 /* 1628 /*
1239 * Hack: In case of kdump, after a crash, kernel might be booting 1629 * Hack: In case of kdump, after a crash, kernel might be booting
1240 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid 1630 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
1241 * might be zero if read from MP tables. Get it from LAPIC. 1631 * might be zero if read from MP tables. Get it from LAPIC.
1242 */ 1632 */
1243#ifdef CONFIG_CRASH_DUMP 1633# ifdef CONFIG_CRASH_DUMP
1244 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1634 boot_cpu_physical_apicid = read_apic_id();
1635# endif
1245#endif 1636#endif
1246 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 1637 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1247
1248 setup_local_APIC(); 1638 setup_local_APIC();
1249 1639
1640#ifdef CONFIG_X86_64
1641 /*
1642 * Now enable IO-APICs, actually call clear_IO_APIC
1643 * We need clear_IO_APIC before enabling vector on BP
1644 */
1645 if (!skip_ioapic_setup && nr_ioapics)
1646 enable_IO_APIC();
1647#endif
1648
1250#ifdef CONFIG_X86_IO_APIC 1649#ifdef CONFIG_X86_IO_APIC
1251 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) 1650 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
1252#endif 1651#endif
1253 localise_nmi_watchdog(); 1652 localise_nmi_watchdog();
1254 end_local_APIC_setup(); 1653 end_local_APIC_setup();
1654
1255#ifdef CONFIG_X86_IO_APIC 1655#ifdef CONFIG_X86_IO_APIC
1256 if (smp_found_config) 1656 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1257 if (!skip_ioapic_setup && nr_ioapics) 1657 setup_IO_APIC();
1258 setup_IO_APIC(); 1658# ifdef CONFIG_X86_64
1659 else
1660 nr_ioapics = 0;
1661# endif
1259#endif 1662#endif
1663
1664#ifdef CONFIG_X86_64
1665 setup_boot_APIC_clock();
1666 check_nmi_watchdog();
1667#else
1260 setup_boot_clock(); 1668 setup_boot_clock();
1669#endif
1261 1670
1262 return 0; 1671 return 0;
1263} 1672}
@@ -1271,8 +1680,11 @@ int __init APIC_init_uniprocessor(void)
1271 */ 1680 */
1272void smp_spurious_interrupt(struct pt_regs *regs) 1681void smp_spurious_interrupt(struct pt_regs *regs)
1273{ 1682{
1274 unsigned long v; 1683 u32 v;
1275 1684
1685#ifdef CONFIG_X86_64
1686 exit_idle();
1687#endif
1276 irq_enter(); 1688 irq_enter();
1277 /* 1689 /*
1278 * Check if this really is a spurious interrupt and ACK it 1690 * Check if this really is a spurious interrupt and ACK it
@@ -1283,10 +1695,14 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1283 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) 1695 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1284 ack_APIC_irq(); 1696 ack_APIC_irq();
1285 1697
1698#ifdef CONFIG_X86_64
1699 add_pda(irq_spurious_count, 1);
1700#else
1286 /* see sw-dev-man vol 3, chapter 7.4.13.5 */ 1701 /* see sw-dev-man vol 3, chapter 7.4.13.5 */
1287 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " 1702 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
1288 "should never happen.\n", smp_processor_id()); 1703 "should never happen.\n", smp_processor_id());
1289 __get_cpu_var(irq_stat).irq_spurious_count++; 1704 __get_cpu_var(irq_stat).irq_spurious_count++;
1705#endif
1290 irq_exit(); 1706 irq_exit();
1291} 1707}
1292 1708
@@ -1295,8 +1711,11 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1295 */ 1711 */
1296void smp_error_interrupt(struct pt_regs *regs) 1712void smp_error_interrupt(struct pt_regs *regs)
1297{ 1713{
1298 unsigned long v, v1; 1714 u32 v, v1;
1299 1715
1716#ifdef CONFIG_X86_64
1717 exit_idle();
1718#endif
1300 irq_enter(); 1719 irq_enter();
1301 /* First tickle the hardware, only then report what went on. -- REW */ 1720 /* First tickle the hardware, only then report what went on. -- REW */
1302 v = apic_read(APIC_ESR); 1721 v = apic_read(APIC_ESR);
@@ -1315,64 +1734,17 @@ void smp_error_interrupt(struct pt_regs *regs)
1315 6: Received illegal vector 1734 6: Received illegal vector
1316 7: Illegal register address 1735 7: Illegal register address
1317 */ 1736 */
1318 printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", 1737 printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
1319 smp_processor_id(), v , v1); 1738 smp_processor_id(), v , v1);
1320 irq_exit(); 1739 irq_exit();
1321} 1740}
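smp_error_interrupt() writes the ESR before reading it, so the hardware refreshes the readable error bits ("first tickle the hardware"), then reports both words; each set bit corresponds to one of the error classes listed in the comment, e.g. bit 6 for a received illegal vector and bit 7 for an illegal register address. A small decoder over a sample ESR value, naming only the two bits quoted above:

    #include <stdio.h>

    int main(void)
    {
    	unsigned int esr = 0xc0;	/* sample value: bits 6 and 7 set */

    	for (int bit = 0; bit < 8; bit++) {
    		if (!(esr & (1u << bit)))
    			continue;
    		if (bit == 6)
    			printf("bit 6: received illegal vector\n");
    		else if (bit == 7)
    			printf("bit 7: illegal register address\n");
    		else
    			printf("bit %d: set (see the list in the comment)\n", bit);
    	}
    	return 0;
    }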
1322 1741
1323#ifdef CONFIG_SMP
1324void __init smp_intr_init(void)
1325{
1326 /*
1327 * IRQ0 must be given a fixed assignment and initialized,
1328 * because it's used before the IO-APIC is set up.
1329 */
1330 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
1331
1332 /*
1333 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
1334 * IPI, driven by wakeup.
1335 */
1336 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
1337
1338 /* IPI for invalidation */
1339 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
1340
1341 /* IPI for generic function call */
1342 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
1343
1344 /* IPI for single call function */
1345 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
1346 call_function_single_interrupt);
1347}
1348#endif
1349
1350/*
1351 * Initialize APIC interrupts
1352 */
1353void __init apic_intr_init(void)
1354{
1355#ifdef CONFIG_SMP
1356 smp_intr_init();
1357#endif
1358 /* self generated IPI for local APIC timer */
1359 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
1360
1361 /* IPI vectors for APIC spurious and error interrupts */
1362 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
1363 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
1364
1365 /* thermal monitor LVT interrupt */
1366#ifdef CONFIG_X86_MCE_P4THERMAL
1367 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
1368#endif
1369}
1370
1371/** 1742/**
1372 * connect_bsp_APIC - attach the APIC to the interrupt system 1743 * connect_bsp_APIC - attach the APIC to the interrupt system
1373 */ 1744 */
1374void __init connect_bsp_APIC(void) 1745void __init connect_bsp_APIC(void)
1375{ 1746{
1747#ifdef CONFIG_X86_32
1376 if (pic_mode) { 1748 if (pic_mode) {
1377 /* 1749 /*
1378 * Do not trust the local APIC being empty at bootup. 1750 * Do not trust the local APIC being empty at bootup.
@@ -1387,6 +1759,7 @@ void __init connect_bsp_APIC(void)
1387 outb(0x70, 0x22); 1759 outb(0x70, 0x22);
1388 outb(0x01, 0x23); 1760 outb(0x01, 0x23);
1389 } 1761 }
1762#endif
1390 enable_apic_mode(); 1763 enable_apic_mode();
1391} 1764}
1392 1765
@@ -1399,6 +1772,9 @@ void __init connect_bsp_APIC(void)
1399 */ 1772 */
1400void disconnect_bsp_APIC(int virt_wire_setup) 1773void disconnect_bsp_APIC(int virt_wire_setup)
1401{ 1774{
1775 unsigned int value;
1776
1777#ifdef CONFIG_X86_32
1402 if (pic_mode) { 1778 if (pic_mode) {
1403 /* 1779 /*
1404 * Put the board back into PIC mode (has an effect only on 1780 * Put the board back into PIC mode (has an effect only on
@@ -1410,56 +1786,53 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1410 "entering PIC mode.\n"); 1786 "entering PIC mode.\n");
1411 outb(0x70, 0x22); 1787 outb(0x70, 0x22);
1412 outb(0x00, 0x23); 1788 outb(0x00, 0x23);
1413 } else { 1789 return;
1414 /* Go back to Virtual Wire compatibility mode */ 1790 }
1415 unsigned long value; 1791#endif
1416 1792
1417 /* For the spurious interrupt use vector F, and enable it */ 1793 /* Go back to Virtual Wire compatibility mode */
1418 value = apic_read(APIC_SPIV); 1794
1419 value &= ~APIC_VECTOR_MASK; 1795 /* For the spurious interrupt use vector F, and enable it */
1420 value |= APIC_SPIV_APIC_ENABLED; 1796 value = apic_read(APIC_SPIV);
1421 value |= 0xf; 1797 value &= ~APIC_VECTOR_MASK;
1422 apic_write_around(APIC_SPIV, value); 1798 value |= APIC_SPIV_APIC_ENABLED;
1423 1799 value |= 0xf;
1424 if (!virt_wire_setup) { 1800 apic_write(APIC_SPIV, value);
1425 /*
1426 * For LVT0 make it edge triggered, active high,
1427 * external and enabled
1428 */
1429 value = apic_read(APIC_LVT0);
1430 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1431 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1432 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1433 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1434 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1435 apic_write_around(APIC_LVT0, value);
1436 } else {
1437 /* Disable LVT0 */
1438 apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
1439 }
1440 1801
1802 if (!virt_wire_setup) {
1441 /* 1803 /*
1442 * For LVT1 make it edge triggered, active high, nmi and 1804 * For LVT0 make it edge triggered, active high,
1443 * enabled 1805 * external and enabled
1444 */ 1806 */
1445 value = apic_read(APIC_LVT1); 1807 value = apic_read(APIC_LVT0);
1446 value &= ~( 1808 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1447 APIC_MODE_MASK | APIC_SEND_PENDING |
1448 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1809 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1449 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1810 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1450 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1811 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1451 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 1812 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1452 apic_write_around(APIC_LVT1, value); 1813 apic_write(APIC_LVT0, value);
1814 } else {
1815 /* Disable LVT0 */
1816 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1453 } 1817 }
1454}
1455 1818
1456unsigned int __cpuinitdata maxcpus = NR_CPUS; 1819 /*
1820 * For LVT1 make it edge triggered, active high,
1821 * nmi and enabled
1822 */
1823 value = apic_read(APIC_LVT1);
1824 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1825 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1826 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1827 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1828 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1829 apic_write(APIC_LVT1, value);
1830}
1457 1831
1458void __cpuinit generic_processor_info(int apicid, int version) 1832void __cpuinit generic_processor_info(int apicid, int version)
1459{ 1833{
1460 int cpu; 1834 int cpu;
1461 cpumask_t tmp_map; 1835 cpumask_t tmp_map;
1462 physid_mask_t phys_cpu;
1463 1836
1464 /* 1837 /*
1465 * Validate version 1838 * Validate version
@@ -1472,36 +1845,29 @@ void __cpuinit generic_processor_info(int apicid, int version)
1472 } 1845 }
1473 apic_version[apicid] = version; 1846 apic_version[apicid] = version;
1474 1847
1475 phys_cpu = apicid_to_cpu_present(apicid);
1476 physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
1477
1478 if (num_processors >= NR_CPUS) { 1848 if (num_processors >= NR_CPUS) {
1479 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1849 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1480 " Processor ignored.\n", NR_CPUS); 1850 " Processor ignored.\n", NR_CPUS);
1481 return; 1851 return;
1482 } 1852 }
1483 1853
1484 if (num_processors >= maxcpus) {
1485 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
1486 " Processor ignored.\n", maxcpus);
1487 return;
1488 }
1489
1490 num_processors++; 1854 num_processors++;
1491 cpus_complement(tmp_map, cpu_present_map); 1855 cpus_complement(tmp_map, cpu_present_map);
1492 cpu = first_cpu(tmp_map); 1856 cpu = first_cpu(tmp_map);
1493 1857
1494 if (apicid == boot_cpu_physical_apicid) 1858 physid_set(apicid, phys_cpu_present_map);
1859 if (apicid == boot_cpu_physical_apicid) {
1495 /* 1860 /*
1496 * x86_bios_cpu_apicid is required to have processors listed 1861 * x86_bios_cpu_apicid is required to have processors listed
1497 * in same order as logical cpu numbers. Hence the first 1862 * in same order as logical cpu numbers. Hence the first
1498 * entry is BSP, and so on. 1863 * entry is BSP, and so on.
1499 */ 1864 */
1500 cpu = 0; 1865 cpu = 0;
1501 1866 }
1502 if (apicid > max_physical_apicid) 1867 if (apicid > max_physical_apicid)
1503 max_physical_apicid = apicid; 1868 max_physical_apicid = apicid;
1504 1869
1870#ifdef CONFIG_X86_32
1505 /* 1871 /*
1506 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y 1872 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1507 * but we need to work other dependencies like SMP_SUSPEND etc 1873 * but we need to work other dependencies like SMP_SUSPEND etc
@@ -1521,7 +1887,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
1521 def_to_bigsmp = 1; 1887 def_to_bigsmp = 1;
1522 } 1888 }
1523 } 1889 }
1524#ifdef CONFIG_SMP 1890#endif
1891
1892#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1525 /* are we being called early in kernel startup? */ 1893 /* are we being called early in kernel startup? */
1526 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1894 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1527 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 1895 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1534,16 +1902,29 @@ void __cpuinit generic_processor_info(int apicid, int version)
1534 per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1902 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1535 } 1903 }
1536#endif 1904#endif
1905
1537 cpu_set(cpu, cpu_possible_map); 1906 cpu_set(cpu, cpu_possible_map);
1538 cpu_set(cpu, cpu_present_map); 1907 cpu_set(cpu, cpu_present_map);
1539} 1908}
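
generic_processor_info() above picks the lowest logical CPU number not yet present by complementing cpu_present_map and taking its first set bit, with the BSP pinned to slot 0. A rough user-space analogue of that slot selection, assuming a plain 64-bit mask in place of cpumask_t:

#include <stdint.h>

/* Pick the lowest logical cpu slot that is not yet present. */
static int first_free_cpu(uint64_t present_map, int apicid, int boot_apicid)
{
	uint64_t free = ~present_map;		/* cpus_complement()   */
	int cpu;

	if (!free)
		return -1;			/* all 64 slots in use */
	cpu = __builtin_ctzll(free);		/* first_cpu(tmp_map)  */

	if (apicid == boot_apicid)
		cpu = 0;			/* BSP always gets 0   */
	return cpu;
}
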
1540 1909
1910#ifdef CONFIG_X86_64
1911int hard_smp_processor_id(void)
1912{
1913 return read_apic_id();
1914}
1915#endif
1916
1541/* 1917/*
1542 * Power management 1918 * Power management
1543 */ 1919 */
1544#ifdef CONFIG_PM 1920#ifdef CONFIG_PM
1545 1921
1546static struct { 1922static struct {
1923 /*
1924 * 'active' is true if the local APIC was enabled by us and
1925 * not the BIOS; this signifies that we are also responsible
1926 * for disabling it before entering apm/acpi suspend
1927 */
1547 int active; 1928 int active;
1548 /* r/w apic fields */ 1929 /* r/w apic fields */
1549 unsigned int apic_id; 1930 unsigned int apic_id;
@@ -1584,7 +1965,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1584 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 1965 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1585 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 1966 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1586 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 1967 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1587#ifdef CONFIG_X86_MCE_P4THERMAL 1968#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1588 if (maxlvt >= 5) 1969 if (maxlvt >= 5)
1589 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 1970 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1590#endif 1971#endif
@@ -1608,16 +1989,23 @@ static int lapic_resume(struct sys_device *dev)
1608 1989
1609 local_irq_save(flags); 1990 local_irq_save(flags);
1610 1991
1611 /* 1992#ifdef HAVE_X2APIC
1612 * Make sure the APICBASE points to the right address 1993 if (x2apic)
1613 * 1994 enable_x2apic();
1614 * FIXME! This will be wrong if we ever support suspend on 1995 else
1615 * SMP! We'll need to do this as part of the CPU restore! 1996#endif
1616 */ 1997 {
1617 rdmsr(MSR_IA32_APICBASE, l, h); 1998 /*
1618 l &= ~MSR_IA32_APICBASE_BASE; 1999 * Make sure the APICBASE points to the right address
1619 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; 2000 *
1620 wrmsr(MSR_IA32_APICBASE, l, h); 2001 * FIXME! This will be wrong if we ever support suspend on
2002 * SMP! We'll need to do this as part of the CPU restore!
2003 */
2004 rdmsr(MSR_IA32_APICBASE, l, h);
2005 l &= ~MSR_IA32_APICBASE_BASE;
2006 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
2007 wrmsr(MSR_IA32_APICBASE, l, h);
2008 }
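
The fallback branch above re-enables the xAPIC on resume by rewriting IA32_APICBASE: drop the old base field, then OR in the enable bit and the local APIC's physical address. A sketch of just that bit arithmetic on a 64-bit MSR image; the field constants follow the commonly documented layout and should be treated as assumptions here.

#include <stdint.h>

#define APICBASE_ENABLE	(1ULL << 11)	/* xAPIC global enable (assumed) */
#define APICBASE_BASE	0xfffff000ULL	/* base address field  (assumed) */

static uint64_t apicbase_reenable(uint64_t msr, uint64_t lapic_phys)
{
	msr &= ~APICBASE_BASE;			/* clear the old base  */
	msr |= APICBASE_ENABLE | (lapic_phys & APICBASE_BASE);
	return msr;				/* value to write back */
}
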
1621 2009
1622 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 2010 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1623 apic_write(APIC_ID, apic_pm_state.apic_id); 2011 apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -1627,7 +2015,7 @@ static int lapic_resume(struct sys_device *dev)
1627 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 2015 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1628 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 2016 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1629 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 2017 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1630#ifdef CONFIG_X86_MCE_P4THERMAL 2018#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1631 if (maxlvt >= 5) 2019 if (maxlvt >= 5)
1632 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 2020 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1633#endif 2021#endif
@@ -1641,7 +2029,9 @@ static int lapic_resume(struct sys_device *dev)
1641 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); 2029 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1642 apic_write(APIC_ESR, 0); 2030 apic_write(APIC_ESR, 0);
1643 apic_read(APIC_ESR); 2031 apic_read(APIC_ESR);
2032
1644 local_irq_restore(flags); 2033 local_irq_restore(flags);
2034
1645 return 0; 2035 return 0;
1646} 2036}
1647 2037
@@ -1661,7 +2051,7 @@ static struct sys_device device_lapic = {
1661 .cls = &lapic_sysclass, 2051 .cls = &lapic_sysclass,
1662}; 2052};
1663 2053
1664static void __devinit apic_pm_activate(void) 2054static void __cpuinit apic_pm_activate(void)
1665{ 2055{
1666 apic_pm_state.active = 1; 2056 apic_pm_state.active = 1;
1667} 2057}
@@ -1687,30 +2077,101 @@ static void apic_pm_activate(void) { }
1687 2077
1688#endif /* CONFIG_PM */ 2078#endif /* CONFIG_PM */
1689 2079
2080#ifdef CONFIG_X86_64
1690/* 2081/*
1691 * APIC command line parameters 2082 * apic_is_clustered_box() -- Check if we can expect good TSC
2083 *
2084 * Thus far, the major user of this is IBM's Summit2 series:
2085 *
2086 * Clustered boxes may have unsynced TSC problems if they are
2087 * multi-chassis. Use available data to take a good guess.
2088 * If in doubt, go HPET.
1692 */ 2089 */
1693static int __init parse_lapic(char *arg) 2090__cpuinit int apic_is_clustered_box(void)
1694{ 2091{
1695 force_enable_local_apic = 1; 2092 int i, clusters, zeros;
1696 return 0; 2093 unsigned id;
2094 u16 *bios_cpu_apicid;
2095 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
2096
2097 /*
2098 * There is no such multi-chassis box with AMD CPUs yet.
2099 * Some AMD boxes with quad-core CPUs and 8 sockets have APIC IDs
2100 * of [4, 0x23] or [8, 0x27] and could be mistaken for a vsmp box,
2101 * so that case still needs checking...
2102 */
2103 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
2104 return 0;
2105
2106 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
2107 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
2108
2109 for (i = 0; i < NR_CPUS; i++) {
2110 /* are we being called early in kernel startup? */
2111 if (bios_cpu_apicid) {
2112 id = bios_cpu_apicid[i];
2113 }
2114 else if (i < nr_cpu_ids) {
2115 if (cpu_present(i))
2116 id = per_cpu(x86_bios_cpu_apicid, i);
2117 else
2118 continue;
2119 }
2120 else
2121 break;
2122
2123 if (id != BAD_APICID)
2124 __set_bit(APIC_CLUSTERID(id), clustermap);
2125 }
2126
2127 /* Problem: Partially populated chassis may not have CPUs in some of
2128 * the APIC clusters they have been allocated. Only present CPUs have
2129 * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
2130 * Since clusters are allocated sequentially, count zeros only if
2131 * they are bounded by ones.
2132 */
2133 clusters = 0;
2134 zeros = 0;
2135 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
2136 if (test_bit(i, clustermap)) {
2137 clusters += 1 + zeros;
2138 zeros = 0;
2139 } else
2140 ++zeros;
2141 }
2142
2143 /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
2144 * not guaranteed to be synced between boards
2145 */
2146 if (is_vsmp_box() && clusters > 1)
2147 return 1;
2148
2149 /*
2150 * If clusters > 2, then should be multi-chassis.
2151 * May have to revisit this when multi-core + hyperthreaded CPUs come
2152 * out, but AFAIK this will work even for them.
2153 */
2154 return (clusters > 2);
1697} 2155}
1698early_param("lapic", parse_lapic); 2156#endif
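
The counting loop above charges interior gaps -- runs of empty clusters bounded by populated ones -- to the cluster total, because only present CPUs leave bits in the map. A self-contained version of the same rule over a single 64-bit map; the 64-cluster width is an assumption made for the sketch.

#include <stdint.h>

/* Count clusters, treating gaps bounded by populated clusters as populated. */
static int count_clusters(uint64_t clustermap)
{
	int i, clusters = 0, zeros = 0;

	for (i = 0; i < 64; i++) {
		if (clustermap & (1ULL << i)) {
			clusters += 1 + zeros;	/* interior gap counts  */
			zeros = 0;
		} else {
			zeros++;		/* maybe a trailing gap */
		}
	}
	return clusters;			/* trailing gap ignored */
}
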
1699 2157
1700static int __init parse_nolapic(char *arg) 2158/*
2159 * APIC command line parameters
2160 */
2161static int __init setup_disableapic(char *arg)
1701{ 2162{
1702 disable_apic = 1; 2163 disable_apic = 1;
1703 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 2164 setup_clear_cpu_cap(X86_FEATURE_APIC);
1704 return 0; 2165 return 0;
1705} 2166}
1706early_param("nolapic", parse_nolapic); 2167early_param("disableapic", setup_disableapic);
1707 2168
1708static int __init parse_disable_lapic_timer(char *arg) 2169/* same as disableapic, for compatibility */
2170static int __init setup_nolapic(char *arg)
1709{ 2171{
1710 local_apic_timer_disabled = 1; 2172 return setup_disableapic(arg);
1711 return 0;
1712} 2173}
1713early_param("nolapic_timer", parse_disable_lapic_timer); 2174early_param("nolapic", setup_nolapic);
1714 2175
1715static int __init parse_lapic_timer_c2_ok(char *arg) 2176static int __init parse_lapic_timer_c2_ok(char *arg)
1716{ 2177{
@@ -1719,15 +2180,43 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1719} 2180}
1720early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 2181early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1721 2182
1722static int __init apic_set_verbosity(char *str) 2183static int __init parse_disable_apic_timer(char *arg)
1723{ 2184{
1724 if (strcmp("debug", str) == 0) 2185 disable_apic_timer = 1;
2186 return 0;
2187}
2188early_param("noapictimer", parse_disable_apic_timer);
2189
2190static int __init parse_nolapic_timer(char *arg)
2191{
2192 disable_apic_timer = 1;
2193 return 0;
2194}
2195early_param("nolapic_timer", parse_nolapic_timer);
2196
2197static int __init apic_set_verbosity(char *arg)
2198{
2199 if (!arg) {
2200#ifdef CONFIG_X86_64
2201 skip_ioapic_setup = 0;
2202 return 0;
2203#endif
2204 return -EINVAL;
2205 }
2206
2207 if (strcmp("debug", arg) == 0)
1725 apic_verbosity = APIC_DEBUG; 2208 apic_verbosity = APIC_DEBUG;
1726 else if (strcmp("verbose", str) == 0) 2209 else if (strcmp("verbose", arg) == 0)
1727 apic_verbosity = APIC_VERBOSE; 2210 apic_verbosity = APIC_VERBOSE;
1728 return 1; 2211 else {
2212 printk(KERN_WARNING "APIC Verbosity level %s not recognised,"
2213 " use apic=verbose or apic=debug\n", arg);
2214 return -EINVAL;
2215 }
2216
2217 return 0;
1729} 2218}
1730__setup("apic=", apic_set_verbosity); 2219early_param("apic", apic_set_verbosity);
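
With the move from __setup() to early_param() above, the handler receives only the text after "apic=" (or NULL when the option is given bare) and signals errors with -EINVAL rather than the __setup() return convention. A minimal sketch of the same string dispatch, detached from the kernel's parameter machinery:

#include <string.h>

enum apic_verbosity_level { APIC_QUIET_LVL, APIC_VERBOSE_LVL, APIC_DEBUG_LVL };

/* Map an "apic=" argument to a verbosity level; -1 means unrecognised. */
static int parse_apic_arg(const char *arg)
{
	if (!arg)
		return APIC_QUIET_LVL;	/* bare "apic": keep defaults */
	if (!strcmp(arg, "debug"))
		return APIC_DEBUG_LVL;
	if (!strcmp(arg, "verbose"))
		return APIC_VERBOSE_LVL;
	return -1;			/* caller prints a warning    */
}
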
1731 2220
1732static int __init lapic_insert_resource(void) 2221static int __init lapic_insert_resource(void)
1733{ 2222{
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
deleted file mode 100644
index 1e3d32e27c14..000000000000
--- a/arch/x86/kernel/apic_64.c
+++ /dev/null
@@ -1,1393 +0,0 @@
1/*
2 * Local APIC handling, local APIC timers
3 *
4 * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes
7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
8 * thanks to Eric Gilmore
9 * and Rolf G. Tews
10 * for testing these extensively.
11 * Maciej W. Rozycki : Various updates and fixes.
12 * Mikael Pettersson : Power Management for UP-APIC.
13 * Pavel Machek and
14 * Mikael Pettersson : PM converted to driver model.
15 */
16
17#include <linux/init.h>
18
19#include <linux/mm.h>
20#include <linux/delay.h>
21#include <linux/bootmem.h>
22#include <linux/interrupt.h>
23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h>
26#include <linux/ioport.h>
27#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h>
29#include <linux/module.h>
30
31#include <asm/atomic.h>
32#include <asm/smp.h>
33#include <asm/mtrr.h>
34#include <asm/mpspec.h>
35#include <asm/hpet.h>
36#include <asm/pgalloc.h>
37#include <asm/nmi.h>
38#include <asm/idle.h>
39#include <asm/proto.h>
40#include <asm/timex.h>
41#include <asm/apic.h>
42
43#include <mach_ipi.h>
44#include <mach_apic.h>
45
46static int disable_apic_timer __cpuinitdata;
47static int apic_calibrate_pmtmr __initdata;
48int disable_apic;
49
50/* Local APIC timer works in C2 */
51int local_apic_timer_c2_ok;
52EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
53
54/*
55 * Debug level, exported for io_apic.c
56 */
57int apic_verbosity;
58
59/* Have we found an MP table */
60int smp_found_config;
61
62static struct resource lapic_resource = {
63 .name = "Local APIC",
64 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
65};
66
67static unsigned int calibration_result;
68
69static int lapic_next_event(unsigned long delta,
70 struct clock_event_device *evt);
71static void lapic_timer_setup(enum clock_event_mode mode,
72 struct clock_event_device *evt);
73static void lapic_timer_broadcast(cpumask_t mask);
74static void apic_pm_activate(void);
75
76static struct clock_event_device lapic_clockevent = {
77 .name = "lapic",
78 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
79 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
80 .shift = 32,
81 .set_mode = lapic_timer_setup,
82 .set_next_event = lapic_next_event,
83 .broadcast = lapic_timer_broadcast,
84 .rating = 100,
85 .irq = -1,
86};
87static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
88
89static unsigned long apic_phys;
90
91unsigned long mp_lapic_addr;
92
93unsigned int __cpuinitdata maxcpus = NR_CPUS;
94/*
95 * Get the LAPIC version
96 */
97static inline int lapic_get_version(void)
98{
99 return GET_APIC_VERSION(apic_read(APIC_LVR));
100}
101
102/*
103 * Check if the APIC is integrated or a separate chip
104 */
105static inline int lapic_is_integrated(void)
106{
107 return 1;
108}
109
110/*
111 * Check whether this is a modern or a first-generation APIC
112 */
113static int modern_apic(void)
114{
115 /* AMD systems use old APIC versions, so check the CPU */
116 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
117 boot_cpu_data.x86 >= 0xf)
118 return 1;
119 return lapic_get_version() >= 0x14;
120}
121
122void apic_wait_icr_idle(void)
123{
124 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
125 cpu_relax();
126}
127
128u32 safe_apic_wait_icr_idle(void)
129{
130 u32 send_status;
131 int timeout;
132
133 timeout = 0;
134 do {
135 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
136 if (!send_status)
137 break;
138 udelay(100);
139 } while (timeout++ < 1000);
140
141 return send_status;
142}
143
144/**
145 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
146 */
147void __cpuinit enable_NMI_through_LVT0(void)
148{
149 unsigned int v;
150
151 /* unmask and set to NMI */
152 v = APIC_DM_NMI;
153 apic_write(APIC_LVT0, v);
154}
155
156/**
157 * lapic_get_maxlvt - get the maximum number of local vector table entries
158 */
159int lapic_get_maxlvt(void)
160{
161 unsigned int v, maxlvt;
162
163 v = apic_read(APIC_LVR);
164 maxlvt = GET_APIC_MAXLVT(v);
165 return maxlvt;
166}
167
168/*
169 * This function sets up the local APIC timer, with a timeout of
170 * 'clocks' APIC bus clock. During calibration we actually call
171 * this function twice on the boot CPU, once with a bogus timeout
172 * value, second time for real. The other (noncalibrating) CPUs
173 * call this function only once, with the real, calibrated value.
174 *
175 * We do reads before writes even if unnecessary, to get around the
176 * P5 APIC double write bug.
177 */
178
179static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
180{
181 unsigned int lvtt_value, tmp_value;
182
183 lvtt_value = LOCAL_TIMER_VECTOR;
184 if (!oneshot)
185 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
186 if (!irqen)
187 lvtt_value |= APIC_LVT_MASKED;
188
189 apic_write(APIC_LVTT, lvtt_value);
190
191 /*
192 * Divide PICLK by 16
193 */
194 tmp_value = apic_read(APIC_TDCR);
195 apic_write(APIC_TDCR, (tmp_value
196 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
197 | APIC_TDR_DIV_16);
198
199 if (!oneshot)
200 apic_write(APIC_TMICT, clocks);
201}
202
203/*
204 * Setup extended LVT, AMD specific (K8, family 10h)
205 *
206 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
207 * MCE interrupts are supported. Thus MCE offset must be set to 0.
208 */
209
210#define APIC_EILVT_LVTOFF_MCE 0
211#define APIC_EILVT_LVTOFF_IBS 1
212
213static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
214{
215 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
216 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
217
218 apic_write(reg, v);
219}
220
221u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
222{
223 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
224 return APIC_EILVT_LVTOFF_MCE;
225}
226
227u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
228{
229 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
230 return APIC_EILVT_LVTOFF_IBS;
231}
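
The extended-LVT helpers above place the EILVT registers 0x10 bytes apart and pack mask, message type and vector into adjacent byte fields. A small sketch of that address and value packing; the 0x500 base offset is an assumption used only to keep the example self-contained.

#include <stdint.h>

#define EILVT0_OFFSET	0x500U		/* assumed offset of APIC_EILVT0 */

static uint32_t eilvt_reg(uint8_t lvt_off)
{
	return EILVT0_OFFSET + ((uint32_t)lvt_off << 4);  /* 0x10 apart */
}

static uint32_t eilvt_value(uint8_t vector, uint8_t msg_type, uint8_t mask)
{
	return ((uint32_t)mask << 16) | ((uint32_t)msg_type << 8) | vector;
}
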
232
233/*
234 * Program the next event, relative to now
235 */
236static int lapic_next_event(unsigned long delta,
237 struct clock_event_device *evt)
238{
239 apic_write(APIC_TMICT, delta);
240 return 0;
241}
242
243/*
244 * Setup the lapic timer in periodic or oneshot mode
245 */
246static void lapic_timer_setup(enum clock_event_mode mode,
247 struct clock_event_device *evt)
248{
249 unsigned long flags;
250 unsigned int v;
251
252 /* Lapic used as dummy for broadcast ? */
253 if (evt->features & CLOCK_EVT_FEAT_DUMMY)
254 return;
255
256 local_irq_save(flags);
257
258 switch (mode) {
259 case CLOCK_EVT_MODE_PERIODIC:
260 case CLOCK_EVT_MODE_ONESHOT:
261 __setup_APIC_LVTT(calibration_result,
262 mode != CLOCK_EVT_MODE_PERIODIC, 1);
263 break;
264 case CLOCK_EVT_MODE_UNUSED:
265 case CLOCK_EVT_MODE_SHUTDOWN:
266 v = apic_read(APIC_LVTT);
267 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
268 apic_write(APIC_LVTT, v);
269 break;
270 case CLOCK_EVT_MODE_RESUME:
271 /* Nothing to do here */
272 break;
273 }
274
275 local_irq_restore(flags);
276}
277
278/*
279 * Local APIC timer broadcast function
280 */
281static void lapic_timer_broadcast(cpumask_t mask)
282{
283#ifdef CONFIG_SMP
284 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
285#endif
286}
287
288/*
289 * Setup the local APIC timer for this CPU. Copy the initialized values
290 * of the boot CPU and register the clock event in the framework.
291 */
292static void setup_APIC_timer(void)
293{
294 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
295
296 memcpy(levt, &lapic_clockevent, sizeof(*levt));
297 levt->cpumask = cpumask_of_cpu(smp_processor_id());
298
299 clockevents_register_device(levt);
300}
301
302/*
303 * In this function we calibrate APIC bus clocks to the external
304 * timer. Unfortunately we cannot use jiffies and the timer irq
305 * to calibrate, since some later bootup code depends on getting
306 * the first irq? Ugh.
307 *
308 * We want to do the calibration only once since we
309 * want to have local timer irqs synchronous. CPUs connected
310 * by the same APIC bus have the very same bus frequency.
311 * And we want to have irqs off anyway, no accidental
312 * APIC irq that way.
313 */
314
315#define TICK_COUNT 100000000
316
317static void __init calibrate_APIC_clock(void)
318{
319 unsigned apic, apic_start;
320 unsigned long tsc, tsc_start;
321 int result;
322
323 local_irq_disable();
324
325 /*
326 * Put whatever arbitrary (but long enough) timeout
327 * value into the APIC clock, we just want to get the
328 * counter running for calibration.
329 *
330 * No interrupt enable !
331 */
332 __setup_APIC_LVTT(250000000, 0, 0);
333
334 apic_start = apic_read(APIC_TMCCT);
335#ifdef CONFIG_X86_PM_TIMER
336 if (apic_calibrate_pmtmr && pmtmr_ioport) {
337 pmtimer_wait(5000); /* 5ms wait */
338 apic = apic_read(APIC_TMCCT);
339 result = (apic_start - apic) * 1000L / 5;
340 } else
341#endif
342 {
343 rdtscll(tsc_start);
344
345 do {
346 apic = apic_read(APIC_TMCCT);
347 rdtscll(tsc);
348 } while ((tsc - tsc_start) < TICK_COUNT &&
349 (apic_start - apic) < TICK_COUNT);
350
351 result = (apic_start - apic) * 1000L * tsc_khz /
352 (tsc - tsc_start);
353 }
354
355 local_irq_enable();
356
357 printk(KERN_DEBUG "APIC timer calibration result %d\n", result);
358
359 printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
360 result / 1000 / 1000, result / 1000 % 1000);
361
362 /* Calculate the scaled math multiplication factor */
363 lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC,
364 lapic_clockevent.shift);
365 lapic_clockevent.max_delta_ns =
366 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
367 lapic_clockevent.min_delta_ns =
368 clockevent_delta2ns(0xF, &lapic_clockevent);
369
370 calibration_result = result / HZ;
371}
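
The TSC branch of the calibration above reduces to the APIC ticks consumed over the interval, scaled by the TSC frequency measured over the same interval. A worked sketch of just that arithmetic, with made-up numbers in the comment:

/*
 * e.g. 62500 APIC ticks over 100000000 TSC cycles at tsc_khz = 2000000
 * gives 62500 * 1000 * 2000000 / 100000000 = 1250000 Hz, i.e. 1.25 MHz.
 */
static unsigned long long apic_timer_hz(unsigned long long apic_ticks,
					unsigned long long tsc_cycles,
					unsigned long long tsc_khz)
{
	return apic_ticks * 1000ULL * tsc_khz / tsc_cycles;
}
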
372
373/*
374 * Setup the boot APIC
375 *
376 * Calibrate and verify the result.
377 */
378void __init setup_boot_APIC_clock(void)
379{
380 /*
381 * The local apic timer can be disabled via the kernel commandline.
382 * Register the lapic timer as a dummy clock event source on SMP
383 * systems, so the broadcast mechanism is used. On UP systems simply
384 * ignore it.
385 */
386 if (disable_apic_timer) {
387 printk(KERN_INFO "Disabling APIC timer\n");
388 /* No broadcast on UP ! */
389 if (num_possible_cpus() > 1) {
390 lapic_clockevent.mult = 1;
391 setup_APIC_timer();
392 }
393 return;
394 }
395
396 printk(KERN_INFO "Using local APIC timer interrupts.\n");
397 calibrate_APIC_clock();
398
399 /*
400 * Do a sanity check on the APIC calibration result
401 */
402 if (calibration_result < (1000000 / HZ)) {
403 printk(KERN_WARNING
404 "APIC frequency too slow, disabling apic timer\n");
405 /* No broadcast on UP ! */
406 if (num_possible_cpus() > 1)
407 setup_APIC_timer();
408 return;
409 }
410
411 /*
412 * If nmi_watchdog is set to IO_APIC, we need the
413 * PIT/HPET going. Otherwise register lapic as a dummy
414 * device.
415 */
416 if (nmi_watchdog != NMI_IO_APIC)
417 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
418 else
419 printk(KERN_WARNING "APIC timer registered as dummy,"
420 " due to nmi_watchdog=%d!\n", nmi_watchdog);
421
422 setup_APIC_timer();
423}
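
The sanity check above rejects a calibration_result below 1000000/HZ, i.e. an APIC timer slower than 1 MHz; with HZ at 250 that means fewer than 4000 ticks per jiffy. A one-line helper stating the same bound, shown only to make the arithmetic explicit:

/* calibration_result is APIC timer ticks per jiffy. */
static int apic_timer_too_slow(unsigned int ticks_per_jiffy, unsigned int hz)
{
	return ticks_per_jiffy < (1000000u / hz);	/* below 1 MHz */
}
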
424
425void __cpuinit setup_secondary_APIC_clock(void)
426{
427 setup_APIC_timer();
428}
429
430/*
431 * The guts of the apic timer interrupt
432 */
433static void local_apic_timer_interrupt(void)
434{
435 int cpu = smp_processor_id();
436 struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
437
438 /*
439 * Normally we should not be here till LAPIC has been initialized but
440 * in some cases, like kdump, it's possible that a pending LAPIC
441 * timer interrupt from the previous kernel's context is delivered in
442 * the new kernel the moment interrupts are enabled.
443 *
444 * Interrupts are enabled early and the LAPIC is set up much later, hence
445 * it's possible that when we get here evt->event_handler is NULL.
446 * Check for event_handler being NULL and discard the interrupt as
447 * spurious.
448 */
449 if (!evt->event_handler) {
450 printk(KERN_WARNING
451 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
452 /* Switch it off */
453 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
454 return;
455 }
456
457 /*
458 * the NMI deadlock-detector uses this.
459 */
460 add_pda(apic_timer_irqs, 1);
461
462 evt->event_handler(evt);
463}
464
465/*
466 * Local APIC timer interrupt. This is the most natural way for doing
467 * local interrupts, but local timer interrupts can be emulated by
468 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
469 *
470 * [ if a single-CPU system runs an SMP kernel then we call the local
471 * interrupt as well. Thus we cannot inline the local irq ... ]
472 */
473void smp_apic_timer_interrupt(struct pt_regs *regs)
474{
475 struct pt_regs *old_regs = set_irq_regs(regs);
476
477 /*
478 * NOTE! We'd better ACK the irq immediately,
479 * because timer handling can be slow.
480 */
481 ack_APIC_irq();
482 /*
483 * update_process_times() expects us to have done irq_enter().
484 * Besides, if we don't, timer interrupts ignore the global
485 * interrupt lock, which is the WrongThing (tm) to do.
486 */
487 exit_idle();
488 irq_enter();
489 local_apic_timer_interrupt();
490 irq_exit();
491 set_irq_regs(old_regs);
492}
493
494int setup_profiling_timer(unsigned int multiplier)
495{
496 return -EINVAL;
497}
498
499
500/*
501 * Local APIC start and shutdown
502 */
503
504/**
505 * clear_local_APIC - shutdown the local APIC
506 *
507 * This is called when a CPU is disabled and before rebooting, so the state of
508 * the local APIC has no dangling leftovers. Also used to clean out any BIOS
509 * leftovers during boot.
510 */
511void clear_local_APIC(void)
512{
513 int maxlvt;
514 u32 v;
515
516 /* APIC hasn't been mapped yet */
517 if (!apic_phys)
518 return;
519
520 maxlvt = lapic_get_maxlvt();
521 /*
522 * Masking an LVT entry can trigger a local APIC error
523 * if the vector is zero. Mask LVTERR first to prevent this.
524 */
525 if (maxlvt >= 3) {
526 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
527 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
528 }
529 /*
530 * Careful: we have to set masks only first to deassert
531 * any level-triggered sources.
532 */
533 v = apic_read(APIC_LVTT);
534 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
535 v = apic_read(APIC_LVT0);
536 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
537 v = apic_read(APIC_LVT1);
538 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
539 if (maxlvt >= 4) {
540 v = apic_read(APIC_LVTPC);
541 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
542 }
543
544 /*
545 * Clean APIC state for other OSs:
546 */
547 apic_write(APIC_LVTT, APIC_LVT_MASKED);
548 apic_write(APIC_LVT0, APIC_LVT_MASKED);
549 apic_write(APIC_LVT1, APIC_LVT_MASKED);
550 if (maxlvt >= 3)
551 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
552 if (maxlvt >= 4)
553 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
554 apic_write(APIC_ESR, 0);
555 apic_read(APIC_ESR);
556}
557
558/**
559 * disable_local_APIC - clear and disable the local APIC
560 */
561void disable_local_APIC(void)
562{
563 unsigned int value;
564
565 clear_local_APIC();
566
567 /*
568 * Disable APIC (implies clearing of registers
569 * for 82489DX!).
570 */
571 value = apic_read(APIC_SPIV);
572 value &= ~APIC_SPIV_APIC_ENABLED;
573 apic_write(APIC_SPIV, value);
574}
575
576void lapic_shutdown(void)
577{
578 unsigned long flags;
579
580 if (!cpu_has_apic)
581 return;
582
583 local_irq_save(flags);
584
585 disable_local_APIC();
586
587 local_irq_restore(flags);
588}
589
590/*
591 * This is to verify that we're looking at a real local APIC.
592 * Check these against your board if the CPUs aren't getting
593 * started for no apparent reason.
594 */
595int __init verify_local_APIC(void)
596{
597 unsigned int reg0, reg1;
598
599 /*
600 * The version register is read-only in a real APIC.
601 */
602 reg0 = apic_read(APIC_LVR);
603 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
604 apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
605 reg1 = apic_read(APIC_LVR);
606 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
607
608 /*
609 * The two version reads above should print the same
610 * numbers. If the second one is different, then we
611 * poke at a non-APIC.
612 */
613 if (reg1 != reg0)
614 return 0;
615
616 /*
617 * Check if the version looks reasonable.
618 */
619 reg1 = GET_APIC_VERSION(reg0);
620 if (reg1 == 0x00 || reg1 == 0xff)
621 return 0;
622 reg1 = lapic_get_maxlvt();
623 if (reg1 < 0x02 || reg1 == 0xff)
624 return 0;
625
626 /*
627 * The ID register is read/write in a real APIC.
628 */
629 reg0 = read_apic_id();
630 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
631 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
632 reg1 = read_apic_id();
633 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
634 apic_write(APIC_ID, reg0);
635 if (reg1 != (reg0 ^ APIC_ID_MASK))
636 return 0;
637
638 /*
639 * The next two are just to see if we have sane values.
640 * They're only really relevant if we're in Virtual Wire
641 * compatibility mode, but most boxes aren't anymore.
642 */
643 reg0 = apic_read(APIC_LVT0);
644 apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
645 reg1 = apic_read(APIC_LVT1);
646 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
647
648 return 1;
649}
650
651/**
652 * sync_Arb_IDs - synchronize APIC bus arbitration IDs
653 */
654void __init sync_Arb_IDs(void)
655{
656 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
657 if (modern_apic())
658 return;
659
660 /*
661 * Wait for idle.
662 */
663 apic_wait_icr_idle();
664
665 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
666 apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
667 | APIC_DM_INIT);
668}
669
670/*
671 * An initial setup of the virtual wire mode.
672 */
673void __init init_bsp_APIC(void)
674{
675 unsigned int value;
676
677 /*
678 * Don't do the setup now if we have a SMP BIOS as the
679 * through-I/O-APIC virtual wire mode might be active.
680 */
681 if (smp_found_config || !cpu_has_apic)
682 return;
683
684 value = apic_read(APIC_LVR);
685
686 /*
687 * Do not trust the local APIC being empty at bootup.
688 */
689 clear_local_APIC();
690
691 /*
692 * Enable APIC.
693 */
694 value = apic_read(APIC_SPIV);
695 value &= ~APIC_VECTOR_MASK;
696 value |= APIC_SPIV_APIC_ENABLED;
697 value |= APIC_SPIV_FOCUS_DISABLED;
698 value |= SPURIOUS_APIC_VECTOR;
699 apic_write(APIC_SPIV, value);
700
701 /*
702 * Set up the virtual wire mode.
703 */
704 apic_write(APIC_LVT0, APIC_DM_EXTINT);
705 value = APIC_DM_NMI;
706 apic_write(APIC_LVT1, value);
707}
708
709/**
710 * setup_local_APIC - setup the local APIC
711 */
712void __cpuinit setup_local_APIC(void)
713{
714 unsigned int value;
715 int i, j;
716
717 preempt_disable();
718 value = apic_read(APIC_LVR);
719
720 BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
721
722 /*
723 * Double-check whether this APIC is really registered.
724 * This is meaningless in clustered apic mode, so we skip it.
725 */
726 if (!apic_id_registered())
727 BUG();
728
729 /*
730 * Intel recommends to set DFR, LDR and TPR before enabling
731 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
732 * document number 292116). So here it goes...
733 */
734 init_apic_ldr();
735
736 /*
737 * Set Task Priority to 'accept all'. We never change this
738 * later on.
739 */
740 value = apic_read(APIC_TASKPRI);
741 value &= ~APIC_TPRI_MASK;
742 apic_write(APIC_TASKPRI, value);
743
744 /*
745 * After a crash, we no longer service the interrupts and a pending
746 * interrupt from previous kernel might still have ISR bit set.
747 *
748 * Most probably the CPU has serviced that pending interrupt by now, but
749 * it might not have done the ack_APIC_irq() because it thought the
750 * interrupt came from i8259 as ExtInt. The LAPIC did not get an EOI so it
751 * does not clear the ISR bit and the cpu thinks it has already serviced
752 * the interrupt. Hence a vector might get locked. It was noticed
753 * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
754 */
755 for (i = APIC_ISR_NR - 1; i >= 0; i--) {
756 value = apic_read(APIC_ISR + i*0x10);
757 for (j = 31; j >= 0; j--) {
758 if (value & (1<<j))
759 ack_APIC_irq();
760 }
761 }
762
763 /*
764 * Now that we are all set up, enable the APIC
765 */
766 value = apic_read(APIC_SPIV);
767 value &= ~APIC_VECTOR_MASK;
768 /*
769 * Enable APIC
770 */
771 value |= APIC_SPIV_APIC_ENABLED;
772
773 /* We always use processor focus */
774
775 /*
776 * Set spurious IRQ vector
777 */
778 value |= SPURIOUS_APIC_VECTOR;
779 apic_write(APIC_SPIV, value);
780
781 /*
782 * Set up LVT0, LVT1:
783 *
784 * set up through-local-APIC on the BP's LINT0. This is not
785 * strictly necessary in pure symmetric-IO mode, but sometimes
786 * we delegate interrupts to the 8259A.
787 */
788 /*
789 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
790 */
791 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
792 if (!smp_processor_id() && !value) {
793 value = APIC_DM_EXTINT;
794 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
795 smp_processor_id());
796 } else {
797 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
798 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
799 smp_processor_id());
800 }
801 apic_write(APIC_LVT0, value);
802
803 /*
804 * only the BP should see the LINT1 NMI signal, obviously.
805 */
806 if (!smp_processor_id())
807 value = APIC_DM_NMI;
808 else
809 value = APIC_DM_NMI | APIC_LVT_MASKED;
810 apic_write(APIC_LVT1, value);
811 preempt_enable();
812}
813
814static void __cpuinit lapic_setup_esr(void)
815{
816 unsigned maxlvt = lapic_get_maxlvt();
817
818 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
819 /*
820 * spec says clear errors after enabling vector.
821 */
822 if (maxlvt > 3)
823 apic_write(APIC_ESR, 0);
824}
825
826void __cpuinit end_local_APIC_setup(void)
827{
828 lapic_setup_esr();
829 setup_apic_nmi_watchdog(NULL);
830 apic_pm_activate();
831}
832
833/*
834 * Detect and enable local APICs on non-SMP boards.
835 * Original code written by Keir Fraser.
836 * On AMD64 we trust the BIOS - if it says no APIC it is likely
837 * not correctly set up (usually the APIC timer won't work etc.)
838 */
839static int __init detect_init_APIC(void)
840{
841 if (!cpu_has_apic) {
842 printk(KERN_INFO "No local APIC present\n");
843 return -1;
844 }
845
846 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
847 boot_cpu_physical_apicid = 0;
848 return 0;
849}
850
851void __init early_init_lapic_mapping(void)
852{
853 unsigned long phys_addr;
854
855 /*
856 * If no local APIC can be found then bail out:
857 * it means there is neither an MP table nor an MADT.
858 */
859 if (!smp_found_config)
860 return;
861
862 phys_addr = mp_lapic_addr;
863
864 set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
865 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
866 APIC_BASE, phys_addr);
867
868 /*
869 * Fetch the APIC ID of the BSP in case we have a
870 * default configuration (or the MP table is broken).
871 */
872 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
873}
874
875/**
876 * init_apic_mappings - initialize APIC mappings
877 */
878void __init init_apic_mappings(void)
879{
880 /*
881 * If no local APIC can be found then set up a fake all
882 * zeroes page to simulate the local APIC and another
883 * one for the IO-APIC.
884 */
885 if (!smp_found_config && detect_init_APIC()) {
886 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
887 apic_phys = __pa(apic_phys);
888 } else
889 apic_phys = mp_lapic_addr;
890
891 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
892 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
893 APIC_BASE, apic_phys);
894
895 /*
896 * Fetch the APIC ID of the BSP in case we have a
897 * default configuration (or the MP table is broken).
898 */
899 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
900}
901
902/*
903 * This initializes the IO-APIC and APIC hardware if this is
904 * a UP kernel.
905 */
906int __init APIC_init_uniprocessor(void)
907{
908 if (disable_apic) {
909 printk(KERN_INFO "Apic disabled\n");
910 return -1;
911 }
912 if (!cpu_has_apic) {
913 disable_apic = 1;
914 printk(KERN_INFO "Apic disabled by BIOS\n");
915 return -1;
916 }
917
918 verify_local_APIC();
919
920 connect_bsp_APIC();
921
922 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
923 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
924
925 setup_local_APIC();
926
927 /*
928 * Now enable IO-APICs, actually call clear_IO_APIC
929 * We need clear_IO_APIC before enabling vector on BP
930 */
931 if (!skip_ioapic_setup && nr_ioapics)
932 enable_IO_APIC();
933
934 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
935 localise_nmi_watchdog();
936 end_local_APIC_setup();
937
938 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
939 setup_IO_APIC();
940 else
941 nr_ioapics = 0;
942 setup_boot_APIC_clock();
943 check_nmi_watchdog();
944 return 0;
945}
946
947/*
948 * Local APIC interrupts
949 */
950
951/*
952 * This interrupt should _never_ happen with our APIC/SMP architecture
953 */
954asmlinkage void smp_spurious_interrupt(void)
955{
956 unsigned int v;
957 exit_idle();
958 irq_enter();
959 /*
960 * Check if this really is a spurious interrupt and ACK it
961 * if it is a vectored one. Just in case...
962 * Spurious interrupts should not be ACKed.
963 */
964 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
965 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
966 ack_APIC_irq();
967
968 add_pda(irq_spurious_count, 1);
969 irq_exit();
970}
971
972/*
973 * This interrupt should never happen with our APIC/SMP architecture
974 */
975asmlinkage void smp_error_interrupt(void)
976{
977 unsigned int v, v1;
978
979 exit_idle();
980 irq_enter();
981 /* First tickle the hardware, only then report what went on. -- REW */
982 v = apic_read(APIC_ESR);
983 apic_write(APIC_ESR, 0);
984 v1 = apic_read(APIC_ESR);
985 ack_APIC_irq();
986 atomic_inc(&irq_err_count);
987
988 /* Here is what the APIC error bits mean:
989 0: Send CS error
990 1: Receive CS error
991 2: Send accept error
992 3: Receive accept error
993 4: Reserved
994 5: Send illegal vector
995 6: Received illegal vector
996 7: Illegal register address
997 */
998 printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
999 smp_processor_id(), v , v1);
1000 irq_exit();
1001}
1002
1003/**
1004 * connect_bsp_APIC - attach the APIC to the interrupt system
1005 */
1006void __init connect_bsp_APIC(void)
1007{
1008 enable_apic_mode();
1009}
1010
1011void disconnect_bsp_APIC(int virt_wire_setup)
1012{
1013 /* Go back to Virtual Wire compatibility mode */
1014 unsigned long value;
1015
1016 /* For the spurious interrupt use vector F, and enable it */
1017 value = apic_read(APIC_SPIV);
1018 value &= ~APIC_VECTOR_MASK;
1019 value |= APIC_SPIV_APIC_ENABLED;
1020 value |= 0xf;
1021 apic_write(APIC_SPIV, value);
1022
1023 if (!virt_wire_setup) {
1024 /*
1025 * For LVT0 make it edge triggered, active high,
1026 * external and enabled
1027 */
1028 value = apic_read(APIC_LVT0);
1029 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1030 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1031 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1032 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1033 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1034 apic_write(APIC_LVT0, value);
1035 } else {
1036 /* Disable LVT0 */
1037 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1038 }
1039
1040 /* For LVT1 make it edge triggered, active high, nmi and enabled */
1041 value = apic_read(APIC_LVT1);
1042 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1043 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1044 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1045 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1046 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1047 apic_write(APIC_LVT1, value);
1048}
1049
1050void __cpuinit generic_processor_info(int apicid, int version)
1051{
1052 int cpu;
1053 cpumask_t tmp_map;
1054
1055 if (num_processors >= NR_CPUS) {
1056 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1057 " Processor ignored.\n", NR_CPUS);
1058 return;
1059 }
1060
1061 if (num_processors >= maxcpus) {
1062 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
1063 " Processor ignored.\n", maxcpus);
1064 return;
1065 }
1066
1067 num_processors++;
1068 cpus_complement(tmp_map, cpu_present_map);
1069 cpu = first_cpu(tmp_map);
1070
1071 physid_set(apicid, phys_cpu_present_map);
1072 if (apicid == boot_cpu_physical_apicid) {
1073 /*
1074 * x86_bios_cpu_apicid is required to have processors listed
1075 * in same order as logical cpu numbers. Hence the first
1076 * entry is BSP, and so on.
1077 */
1078 cpu = 0;
1079 }
1080 if (apicid > max_physical_apicid)
1081 max_physical_apicid = apicid;
1082
1083 /* are we being called early in kernel startup? */
1084 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1085 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
1086 u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1087
1088 cpu_to_apicid[cpu] = apicid;
1089 bios_cpu_apicid[cpu] = apicid;
1090 } else {
1091 per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1092 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1093 }
1094
1095 cpu_set(cpu, cpu_possible_map);
1096 cpu_set(cpu, cpu_present_map);
1097}
1098
1099/*
1100 * Power management
1101 */
1102#ifdef CONFIG_PM
1103
1104static struct {
1105 /* 'active' is true if the local APIC was enabled by us and
1106 not the BIOS; this signifies that we are also responsible
1107 for disabling it before entering apm/acpi suspend */
1108 int active;
1109 /* r/w apic fields */
1110 unsigned int apic_id;
1111 unsigned int apic_taskpri;
1112 unsigned int apic_ldr;
1113 unsigned int apic_dfr;
1114 unsigned int apic_spiv;
1115 unsigned int apic_lvtt;
1116 unsigned int apic_lvtpc;
1117 unsigned int apic_lvt0;
1118 unsigned int apic_lvt1;
1119 unsigned int apic_lvterr;
1120 unsigned int apic_tmict;
1121 unsigned int apic_tdcr;
1122 unsigned int apic_thmr;
1123} apic_pm_state;
1124
1125static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1126{
1127 unsigned long flags;
1128 int maxlvt;
1129
1130 if (!apic_pm_state.active)
1131 return 0;
1132
1133 maxlvt = lapic_get_maxlvt();
1134
1135 apic_pm_state.apic_id = read_apic_id();
1136 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
1137 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
1138 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
1139 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
1140 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
1141 if (maxlvt >= 4)
1142 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
1143 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
1144 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
1145 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1146 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1147 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1148#ifdef CONFIG_X86_MCE_INTEL
1149 if (maxlvt >= 5)
1150 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1151#endif
1152 local_irq_save(flags);
1153 disable_local_APIC();
1154 local_irq_restore(flags);
1155 return 0;
1156}
1157
1158static int lapic_resume(struct sys_device *dev)
1159{
1160 unsigned int l, h;
1161 unsigned long flags;
1162 int maxlvt;
1163
1164 if (!apic_pm_state.active)
1165 return 0;
1166
1167 maxlvt = lapic_get_maxlvt();
1168
1169 local_irq_save(flags);
1170 rdmsr(MSR_IA32_APICBASE, l, h);
1171 l &= ~MSR_IA32_APICBASE_BASE;
1172 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1173 wrmsr(MSR_IA32_APICBASE, l, h);
1174 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1175 apic_write(APIC_ID, apic_pm_state.apic_id);
1176 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
1177 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
1178 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
1179 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1180 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1181 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1182#ifdef CONFIG_X86_MCE_INTEL
1183 if (maxlvt >= 5)
1184 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1185#endif
1186 if (maxlvt >= 4)
1187 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
1188 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
1189 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
1190 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
1191 apic_write(APIC_ESR, 0);
1192 apic_read(APIC_ESR);
1193 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1194 apic_write(APIC_ESR, 0);
1195 apic_read(APIC_ESR);
1196 local_irq_restore(flags);
1197 return 0;
1198}
1199
1200static struct sysdev_class lapic_sysclass = {
1201 .name = "lapic",
1202 .resume = lapic_resume,
1203 .suspend = lapic_suspend,
1204};
1205
1206static struct sys_device device_lapic = {
1207 .id = 0,
1208 .cls = &lapic_sysclass,
1209};
1210
1211static void __cpuinit apic_pm_activate(void)
1212{
1213 apic_pm_state.active = 1;
1214}
1215
1216static int __init init_lapic_sysfs(void)
1217{
1218 int error;
1219
1220 if (!cpu_has_apic)
1221 return 0;
1222 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
1223
1224 error = sysdev_class_register(&lapic_sysclass);
1225 if (!error)
1226 error = sysdev_register(&device_lapic);
1227 return error;
1228}
1229device_initcall(init_lapic_sysfs);
1230
1231#else /* CONFIG_PM */
1232
1233static void apic_pm_activate(void) { }
1234
1235#endif /* CONFIG_PM */
1236
1237/*
1238 * apic_is_clustered_box() -- Check if we can expect good TSC
1239 *
1240 * Thus far, the major user of this is IBM's Summit2 series:
1241 *
1242 * Clustered boxes may have unsynced TSC problems if they are
1243 * multi-chassis. Use available data to take a good guess.
1244 * If in doubt, go HPET.
1245 */
1246__cpuinit int apic_is_clustered_box(void)
1247{
1248 int i, clusters, zeros;
1249 unsigned id;
1250 u16 *bios_cpu_apicid;
1251 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
1252
1253 /*
1254 * There is no such multi-chassis box with AMD CPUs yet.
1255 * Some AMD boxes with quad-core CPUs and 8 sockets have APIC IDs
1256 * of [4, 0x23] or [8, 0x27] and could be mistaken for a vsmp box,
1257 * so that case still needs checking...
1258 */
1259 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
1260 return 0;
1261
1262 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1263 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
1264
1265 for (i = 0; i < NR_CPUS; i++) {
1266 /* are we being called early in kernel startup? */
1267 if (bios_cpu_apicid) {
1268 id = bios_cpu_apicid[i];
1269 }
1270 else if (i < nr_cpu_ids) {
1271 if (cpu_present(i))
1272 id = per_cpu(x86_bios_cpu_apicid, i);
1273 else
1274 continue;
1275 }
1276 else
1277 break;
1278
1279 if (id != BAD_APICID)
1280 __set_bit(APIC_CLUSTERID(id), clustermap);
1281 }
1282
1283 /* Problem: Partially populated chassis may not have CPUs in some of
1284 * the APIC clusters they have been allocated. Only present CPUs have
1285 * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
1286 * Since clusters are allocated sequentially, count zeros only if
1287 * they are bounded by ones.
1288 */
1289 clusters = 0;
1290 zeros = 0;
1291 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
1292 if (test_bit(i, clustermap)) {
1293 clusters += 1 + zeros;
1294 zeros = 0;
1295 } else
1296 ++zeros;
1297 }
1298
1299 /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
1300 * not guaranteed to be synced between boards
1301 */
1302 if (is_vsmp_box() && clusters > 1)
1303 return 1;
1304
1305 /*
1306 * If clusters > 2, then should be multi-chassis.
1307 * May have to revisit this when multi-core + hyperthreaded CPUs come
1308 * out, but AFAIK this will work even for them.
1309 */
1310 return (clusters > 2);
1311}
1312
1313/*
1314 * APIC command line parameters
1315 */
1316static int __init apic_set_verbosity(char *str)
1317{
1318 if (str == NULL) {
1319 skip_ioapic_setup = 0;
1320 ioapic_force = 1;
1321 return 0;
1322 }
1323 if (strcmp("debug", str) == 0)
1324 apic_verbosity = APIC_DEBUG;
1325 else if (strcmp("verbose", str) == 0)
1326 apic_verbosity = APIC_VERBOSE;
1327 else {
1328 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1329 " use apic=verbose or apic=debug\n", str);
1330 return -EINVAL;
1331 }
1332
1333 return 0;
1334}
1335early_param("apic", apic_set_verbosity);
1336
1337static __init int setup_disableapic(char *str)
1338{
1339 disable_apic = 1;
1340 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1341 return 0;
1342}
1343early_param("disableapic", setup_disableapic);
1344
1345/* same as disableapic, for compatibility */
1346static __init int setup_nolapic(char *str)
1347{
1348 return setup_disableapic(str);
1349}
1350early_param("nolapic", setup_nolapic);
1351
1352static int __init parse_lapic_timer_c2_ok(char *arg)
1353{
1354 local_apic_timer_c2_ok = 1;
1355 return 0;
1356}
1357early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1358
1359static __init int setup_noapictimer(char *str)
1360{
1361 if (str[0] != ' ' && str[0] != 0)
1362 return 0;
1363 disable_apic_timer = 1;
1364 return 1;
1365}
1366__setup("noapictimer", setup_noapictimer);
1367
1368static __init int setup_apicpmtimer(char *s)
1369{
1370 apic_calibrate_pmtmr = 1;
1371 notsc_setup(NULL);
1372 return 0;
1373}
1374__setup("apicpmtimer", setup_apicpmtimer);
1375
1376static int __init lapic_insert_resource(void)
1377{
1378 if (!apic_phys)
1379 return -1;
1380
1381 /* Put local APIC into the resource map. */
1382 lapic_resource.start = apic_phys;
1383 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
1384 insert_resource(&iomem_resource, &lapic_resource);
1385
1386 return 0;
1387}
1388
1389/*
1390 * need call insert after e820_reserve_resources()
1391 * that is using request_resource
1392 */
1393late_initcall(lapic_insert_resource);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index bf9b441331e9..5145a6e72bbb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -219,7 +219,6 @@
219#include <linux/time.h> 219#include <linux/time.h>
220#include <linux/sched.h> 220#include <linux/sched.h>
221#include <linux/pm.h> 221#include <linux/pm.h>
222#include <linux/pm_legacy.h>
223#include <linux/capability.h> 222#include <linux/capability.h>
224#include <linux/device.h> 223#include <linux/device.h>
225#include <linux/kernel.h> 224#include <linux/kernel.h>
@@ -229,12 +228,12 @@
229#include <linux/suspend.h> 228#include <linux/suspend.h>
230#include <linux/kthread.h> 229#include <linux/kthread.h>
231#include <linux/jiffies.h> 230#include <linux/jiffies.h>
232#include <linux/smp_lock.h>
233 231
234#include <asm/system.h> 232#include <asm/system.h>
235#include <asm/uaccess.h> 233#include <asm/uaccess.h>
236#include <asm/desc.h> 234#include <asm/desc.h>
237#include <asm/i8253.h> 235#include <asm/i8253.h>
236#include <asm/olpc.h>
238#include <asm/paravirt.h> 237#include <asm/paravirt.h>
239#include <asm/reboot.h> 238#include <asm/reboot.h>
240 239
@@ -2218,7 +2217,7 @@ static int __init apm_init(void)
2218 2217
2219 dmi_check_system(apm_dmi_table); 2218 dmi_check_system(apm_dmi_table);
2220 2219
2221 if (apm_info.bios.version == 0 || paravirt_enabled()) { 2220 if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) {
2222 printk(KERN_INFO "apm: BIOS not found.\n"); 2221 printk(KERN_INFO "apm: BIOS not found.\n");
2223 return -ENODEV; 2222 return -ENODEV;
2224 } 2223 }
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bacf5deeec2d..7fcf63d22f8b 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -18,9 +18,11 @@
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/bootparam.h> 19#include <asm/bootparam.h>
20 20
21#include <xen/interface/xen.h>
22
21#define __NO_STUBS 1 23#define __NO_STUBS 1
22#undef __SYSCALL 24#undef __SYSCALL
23#undef _ASM_X86_64_UNISTD_H_ 25#undef _ASM_X86_UNISTD_64_H
24#define __SYSCALL(nr, sym) [nr] = 1, 26#define __SYSCALL(nr, sym) [nr] = 1,
25static char syscalls[] = { 27static char syscalls[] = {
26#include <asm/unistd.h> 28#include <asm/unistd.h>
@@ -131,5 +133,14 @@ int main(void)
131 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 133 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
132 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 134 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
133 OFFSET(BP_version, boot_params, hdr.version); 135 OFFSET(BP_version, boot_params, hdr.version);
136
137 BLANK();
138 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
139#ifdef CONFIG_XEN
140 BLANK();
141 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
142 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
143#undef ENTRY
144#endif
134 return 0; 145 return 0;
135} 146}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
new file mode 100644
index 000000000000..f0dfe6f17e7e
--- /dev/null
+++ b/arch/x86/kernel/bios_uv.c
@@ -0,0 +1,141 @@
1/*
2 * BIOS run time interface routines.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson
20 */
21
22#include <linux/efi.h>
23#include <asm/efi.h>
24#include <linux/io.h>
25#include <asm/uv/bios.h>
26#include <asm/uv/uv_hub.h>
27
28struct uv_systab uv_systab;
29
30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
31{
32 struct uv_systab *tab = &uv_systab;
33
34 if (!tab->function)
35 /*
36 * BIOS does not support UV systab
37 */
38 return BIOS_STATUS_UNIMPLEMENTED;
39
40 return efi_call6((void *)__va(tab->function),
41 (u64)which, a1, a2, a3, a4, a5);
42}
43
44s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
45 u64 a4, u64 a5)
46{
47 unsigned long bios_flags;
48 s64 ret;
49
50 local_irq_save(bios_flags);
51 ret = uv_bios_call(which, a1, a2, a3, a4, a5);
52 local_irq_restore(bios_flags);
53
54 return ret;
55}
56
57s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
58 u64 a4, u64 a5)
59{
60 s64 ret;
61
62 preempt_disable();
63 ret = uv_bios_call(which, a1, a2, a3, a4, a5);
64 preempt_enable();
65
66 return ret;
67}
68
69
70long sn_partition_id;
71EXPORT_SYMBOL_GPL(sn_partition_id);
72long uv_coherency_id;
73EXPORT_SYMBOL_GPL(uv_coherency_id);
74long uv_region_size;
75EXPORT_SYMBOL_GPL(uv_region_size);
76int uv_type;
77
78
79s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
80 long *region)
81{
82 s64 ret;
83 u64 v0, v1;
84 union partition_info_u part;
85
86 ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc,
87 (u64)(&v0), (u64)(&v1), 0, 0);
88 if (ret != BIOS_STATUS_SUCCESS)
89 return ret;
90
91 part.val = v0;
92 if (uvtype)
93 *uvtype = part.hub_version;
94 if (partid)
95 *partid = part.partition_id;
96 if (coher)
97 *coher = part.coherence_id;
98 if (region)
99 *region = part.region_size;
100 return ret;
101}
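
uv_bios_get_sn_info() above unpacks a single partition_info_u word into whichever of the four out-parameters the caller supplied, so unwanted fields can simply be passed as NULL. A hypothetical caller; the fc argument value 0 is a placeholder, not a documented feature code.

/* Hypothetical caller: fetch only the hub version and partition id. */
static void report_uv_partition(void)
{
	int hub_version;
	long partition_id;

	if (uv_bios_get_sn_info(0, &hub_version, &partition_id,
				NULL, NULL) != BIOS_STATUS_SUCCESS)
		return;

	printk(KERN_INFO "UV hub v%d, partition %ld\n",
	       hub_version, partition_id);
}
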
102
103
104s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
105{
106 return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type,
107 (u64)ticks_per_second, 0, 0, 0);
108}
109EXPORT_SYMBOL_GPL(uv_bios_freq_base);
110
111
112#ifdef CONFIG_EFI
113void uv_bios_init(void)
114{
115 struct uv_systab *tab;
116
117 if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) ||
118 (efi.uv_systab == (unsigned long)NULL)) {
119 printk(KERN_CRIT "No EFI UV System Table.\n");
120 uv_systab.function = (unsigned long)NULL;
121 return;
122 }
123
124 tab = (struct uv_systab *)ioremap(efi.uv_systab,
125 sizeof(struct uv_systab));
126 if (strncmp(tab->signature, "UVST", 4) != 0)
127 printk(KERN_ERR "bad signature in UV system table!");
128
129 /*
130 * Copy table to permanent spot for later use.
131 */
132 memcpy(&uv_systab, tab, sizeof(struct uv_systab));
133 iounmap(tab);
134
135 printk(KERN_INFO "EFI UV System Table Revision %d\n", tab->revision);
136}
137#else /* !CONFIG_EFI */
138
139void uv_bios_init(void) { }
140#endif
141
diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore
new file mode 100644
index 000000000000..667df55a4399
--- /dev/null
+++ b/arch/x86/kernel/cpu/.gitignore
@@ -0,0 +1 @@
capflags.c
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index ee76eaad3001..82ec6075c057 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -3,22 +3,30 @@
3# 3#
4 4
5obj-y := intel_cacheinfo.o addon_cpuid_features.o 5obj-y := intel_cacheinfo.o addon_cpuid_features.o
6obj-y += proc.o feature_names.o 6obj-y += proc.o capflags.o powerflags.o common.o
7 7
8obj-$(CONFIG_X86_32) += common.o bugs.o 8obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
9obj-$(CONFIG_X86_64) += common_64.o bugs_64.o 9obj-$(CONFIG_X86_64) += bugs_64.o
10obj-$(CONFIG_X86_32) += amd.o 10
11obj-$(CONFIG_X86_64) += amd_64.o 11obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
12obj-$(CONFIG_X86_32) += cyrix.o 12obj-$(CONFIG_CPU_SUP_AMD) += amd.o
13obj-$(CONFIG_X86_32) += centaur.o 13obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
14obj-$(CONFIG_X86_64) += centaur_64.o 14obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o
15obj-$(CONFIG_X86_32) += transmeta.o 15obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
16obj-$(CONFIG_X86_32) += intel.o 16obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
17obj-$(CONFIG_X86_64) += intel_64.o 17obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
18obj-$(CONFIG_X86_32) += umc.o
19 18
20obj-$(CONFIG_X86_MCE) += mcheck/ 19obj-$(CONFIG_X86_MCE) += mcheck/
21obj-$(CONFIG_MTRR) += mtrr/ 20obj-$(CONFIG_MTRR) += mtrr/
22obj-$(CONFIG_CPU_FREQ) += cpufreq/ 21obj-$(CONFIG_CPU_FREQ) += cpufreq/
23 22
24obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 23obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
24
25quiet_cmd_mkcapflags = MKCAP $@
26 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
27
28cpufeature = $(src)/../../include/asm/cpufeature.h
29
30targets += capflags.c
31$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE
32 $(call if_changed,mkcapflags)
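
The mkcapflags.pl rule above turns the X86_FEATURE_* definitions in cpufeature.h into a generated capflags.c at build time. An illustrative sketch of the kind of table it emits (the real file is machine-generated and its exact contents may differ):

#include <asm/cpufeature.h>

const char * const x86_cap_flags[NCAPINTS*32] = {
	[X86_FEATURE_FPU]  = "fpu",
	[X86_FEATURE_PSE]  = "pse",
	[X86_FEATURE_TSC]  = "tsc",
	/* ... one string per flag that /proc/cpuinfo should print ... */
};
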
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 84a8220a6072..0d9c993aa93e 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -7,6 +7,8 @@
7#include <asm/pat.h> 7#include <asm/pat.h>
8#include <asm/processor.h> 8#include <asm/processor.h>
9 9
10#include <mach_apic.h>
11
10struct cpuid_bit { 12struct cpuid_bit {
11 u16 feature; 13 u16 feature;
12 u8 reg; 14 u8 reg;
@@ -48,6 +50,92 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
48 } 50 }
49} 51}
50 52
53/* leaf 0xb SMT level */
54#define SMT_LEVEL 0
55
56/* leaf 0xb sub-leaf types */
57#define INVALID_TYPE 0
58#define SMT_TYPE 1
59#define CORE_TYPE 2
60
61#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
62#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
63#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
64
65/*
66 * Check for extended topology enumeration cpuid leaf 0xb and if it
67 * exists, use it for populating initial_apicid and cpu topology
68 * detection.
69 */
70void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
71{
72#ifdef CONFIG_SMP
73 unsigned int eax, ebx, ecx, edx, sub_index;
74 unsigned int ht_mask_width, core_plus_mask_width;
75 unsigned int core_select_mask, core_level_siblings;
76
77 if (c->cpuid_level < 0xb)
78 return;
79
80 cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
81
82 /*
83 * check if the cpuid leaf 0xb is actually implemented.
84 */
85 if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
86 return;
87
88 set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
89
90 /*
91 * initial apic id, which also represents 32-bit extended x2apic id.
92 */
93 c->initial_apicid = edx;
94
95 /*
96 * Populate HT related information from sub-leaf level 0.
97 */
98 core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
99 core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
100
101 sub_index = 1;
102 do {
103 cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
104
105 /*
106 * Check for the Core type in the implemented sub leaves.
107 */
108 if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
109 core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
110 core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
111 break;
112 }
113
114 sub_index++;
115 } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
116
117 core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
118
119#ifdef CONFIG_X86_32
120 c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
121 & core_select_mask;
122 c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
123#else
124 c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
125 c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
126#endif
127 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
128
129
130 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
131 c->phys_proc_id);
132 if (c->x86_max_cores > 1)
133 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
134 c->cpu_core_id);
135 return;
136#endif
137}
138
51#ifdef CONFIG_X86_PAT 139#ifdef CONFIG_X86_PAT
52void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) 140void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
53{ 141{
@@ -56,9 +144,22 @@ void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
56 144
57 switch (c->x86_vendor) { 145 switch (c->x86_vendor) {
58 case X86_VENDOR_INTEL: 146 case X86_VENDOR_INTEL:
59 if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) 147 /*
148 * There is a known erratum on Pentium III and Core Solo
149 * and Core Duo CPUs.
150 * " Page with PAT set to WC while associated MTRR is UC
151 * may consolidate to UC "
152 * Because of this erratum, it is better to stick with
153 * setting WC in MTRR rather than using PAT on these CPUs.
154 *
155 * Enable PAT WC only on P4, Core 2 or later CPUs.
156 */
157 if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15))
60 return; 158 return;
61 break; 159
160 pat_disable("PAT WC disabled due to known CPU erratum.");
161 return;
162
62 case X86_VENDOR_AMD: 163 case X86_VENDOR_AMD:
63 case X86_VENDOR_CENTAUR: 164 case X86_VENDOR_CENTAUR:
64 case X86_VENDOR_TRANSMETA: 165 case X86_VENDOR_TRANSMETA:
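
A worked example of the leaf-0xb mask arithmetic in detect_extended_topology() above, with made-up values: 2 SMT siblings per core (ht_mask_width = 1) and a 5-bit core-plus-SMT field (core_plus_mask_width = 5). The stand-alone program mirrors the kernel expressions; x2APIC ID 0x2d decomposes into thread 1, core 6, package 1.

#include <stdio.h>

int main(void)
{
	unsigned int ht_mask_width = 1;		/* log2(threads per core) */
	unsigned int core_plus_mask_width = 5;	/* bits for core + thread */
	unsigned int apicid = 0x2d;		/* example x2APIC ID */

	/* same value as (~(-1 << core_plus_mask_width)) >> ht_mask_width */
	unsigned int core_select_mask =
		(~(~0u << core_plus_mask_width)) >> ht_mask_width;	/* 0xf */

	unsigned int smt_id  = apicid & ((1 << ht_mask_width) - 1);		/* 1 */
	unsigned int core_id = (apicid >> ht_mask_width) & core_select_mask;	/* 6 */
	unsigned int pkg_id  = apicid >> core_plus_mask_width;			/* 1 */

	printf("thread %u core %u package %u\n", smt_id, core_id, pkg_id);
	return 0;
}
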
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 81a07ca65d44..8f1e31db2ad5 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,13 +1,22 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/bitops.h> 2#include <linux/bitops.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4
4#include <asm/io.h> 5#include <asm/io.h>
5#include <asm/processor.h> 6#include <asm/processor.h>
6#include <asm/apic.h> 7#include <asm/apic.h>
7 8
9#ifdef CONFIG_X86_64
10# include <asm/numa_64.h>
11# include <asm/mmconfig.h>
12# include <asm/cacheflush.h>
13#endif
14
8#include <mach_apic.h> 15#include <mach_apic.h>
16
9#include "cpu.h" 17#include "cpu.h"
10 18
19#ifdef CONFIG_X86_32
11/* 20/*
12 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause 21 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
13 * misexecution of code under Linux. Owners of such processors should 22 * misexecution of code under Linux. Owners of such processors should
@@ -24,23 +33,273 @@
24extern void vide(void); 33extern void vide(void);
25__asm__(".align 4\nvide: ret"); 34__asm__(".align 4\nvide: ret");
26 35
27int force_mwait __cpuinitdata; 36static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
28
29static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
30{ 37{
31 if (cpuid_eax(0x80000000) >= 0x80000007) { 38/*
32 c->x86_power = cpuid_edx(0x80000007); 39 * General Systems BIOSen alias the cpu frequency registers
33 if (c->x86_power & (1<<8)) 40 * of the Elan at 0x000df000. Unfortuantly, one of the Linux
34 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 41 * drivers subsequently pokes it, and changes the CPU speed.
42 * Workaround : Remove the unneeded alias.
43 */
44#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
45#define CBAR_ENB (0x80000000)
46#define CBAR_KEY (0X000000CB)
47 if (c->x86_model == 9 || c->x86_model == 10) {
48 if (inl (CBAR) & CBAR_ENB)
49 outl (0 | CBAR_KEY, CBAR);
35 } 50 }
36} 51}
37 52
38static void __cpuinit init_amd(struct cpuinfo_x86 *c) 53
54static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
39{ 55{
40 u32 l, h; 56 u32 l, h;
41 int mbytes = num_physpages >> (20-PAGE_SHIFT); 57 int mbytes = num_physpages >> (20-PAGE_SHIFT);
42 int r;
43 58
59 if (c->x86_model < 6) {
60 /* Based on AMD doc 20734R - June 2000 */
61 if (c->x86_model == 0) {
62 clear_cpu_cap(c, X86_FEATURE_APIC);
63 set_cpu_cap(c, X86_FEATURE_PGE);
64 }
65 return;
66 }
67
68 if (c->x86_model == 6 && c->x86_mask == 1) {
69 const int K6_BUG_LOOP = 1000000;
70 int n;
71 void (*f_vide)(void);
72 unsigned long d, d2;
73
74 printk(KERN_INFO "AMD K6 stepping B detected - ");
75
76 /*
77 * It looks like AMD fixed the 2.6.2 bug and improved indirect
78 * calls at the same time.
79 */
80
81 n = K6_BUG_LOOP;
82 f_vide = vide;
83 rdtscl(d);
84 while (n--)
85 f_vide();
86 rdtscl(d2);
87 d = d2-d;
88
89 if (d > 20*K6_BUG_LOOP)
90 printk("system stability may be impaired when more than 32 MB are used.\n");
91 else
92 printk("probably OK (after B9730xxxx).\n");
93 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
94 }
95
96 /* K6 with old style WHCR */
97 if (c->x86_model < 8 ||
98 (c->x86_model == 8 && c->x86_mask < 8)) {
99 /* We can only write allocate on the low 508Mb */
100 if (mbytes > 508)
101 mbytes = 508;
102
103 rdmsr(MSR_K6_WHCR, l, h);
104 if ((l&0x0000FFFF) == 0) {
105 unsigned long flags;
106 l = (1<<0)|((mbytes/4)<<1);
107 local_irq_save(flags);
108 wbinvd();
109 wrmsr(MSR_K6_WHCR, l, h);
110 local_irq_restore(flags);
111 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
112 mbytes);
113 }
114 return;
115 }
116
117 if ((c->x86_model == 8 && c->x86_mask > 7) ||
118 c->x86_model == 9 || c->x86_model == 13) {
119 /* The more serious chips .. */
120
121 if (mbytes > 4092)
122 mbytes = 4092;
123
124 rdmsr(MSR_K6_WHCR, l, h);
125 if ((l&0xFFFF0000) == 0) {
126 unsigned long flags;
127 l = ((mbytes>>2)<<22)|(1<<16);
128 local_irq_save(flags);
129 wbinvd();
130 wrmsr(MSR_K6_WHCR, l, h);
131 local_irq_restore(flags);
132 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
133 mbytes);
134 }
135
136 return;
137 }
138
139 if (c->x86_model == 10) {
140 /* AMD Geode LX is model 10 */
141 /* placeholder for any needed mods */
142 return;
143 }
144}
145
146static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
147{
148 u32 l, h;
149
150 /*
151 * Bit 15 of Athlon specific MSR 15, needs to be 0
152 * to enable SSE on Palomino/Morgan/Barton CPU's.
153 * If the BIOS didn't enable it already, enable it here.
154 */
155 if (c->x86_model >= 6 && c->x86_model <= 10) {
156 if (!cpu_has(c, X86_FEATURE_XMM)) {
157 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
158 rdmsr(MSR_K7_HWCR, l, h);
159 l &= ~0x00008000;
160 wrmsr(MSR_K7_HWCR, l, h);
161 set_cpu_cap(c, X86_FEATURE_XMM);
162 }
163 }
164
165 /*
166 * It's been determined by AMD that Athlons since model 8 stepping 1
167 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
168 * As per AMD technical note 27212 0.2
169 */
170 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
171 rdmsr(MSR_K7_CLK_CTL, l, h);
172 if ((l & 0xfff00000) != 0x20000000) {
173 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
174 ((l & 0x000fffff)|0x20000000));
175 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
176 }
177 }
178
179 set_cpu_cap(c, X86_FEATURE_K7);
180}
181#endif
182
183#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
184static int __cpuinit nearby_node(int apicid)
185{
186 int i, node;
187
188 for (i = apicid - 1; i >= 0; i--) {
189 node = apicid_to_node[i];
190 if (node != NUMA_NO_NODE && node_online(node))
191 return node;
192 }
193 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
194 node = apicid_to_node[i];
195 if (node != NUMA_NO_NODE && node_online(node))
196 return node;
197 }
198 return first_node(node_online_map); /* Shouldn't happen */
199}
200#endif
201
202/*
203 * On an AMD dual core setup the lower bits of the APIC id distinguish the cores.
204 * Assumes number of cores is a power of two.
205 */
206static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
207{
208#ifdef CONFIG_X86_HT
209 unsigned bits;
210
211 bits = c->x86_coreid_bits;
212
213 /* Low order bits define the core id (index of core in socket) */
214 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
215 /* Convert the initial APIC ID into the socket ID */
216 c->phys_proc_id = c->initial_apicid >> bits;
217#endif
218}
219
220static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
221{
222#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
223 int cpu = smp_processor_id();
224 int node;
225 unsigned apicid = hard_smp_processor_id();
226
227 node = c->phys_proc_id;
228 if (apicid_to_node[apicid] != NUMA_NO_NODE)
229 node = apicid_to_node[apicid];
230 if (!node_online(node)) {
231 /* Two possibilities here:
232 - The CPU is missing memory and no node was created.
233 In that case try picking one from a nearby CPU
234 - The APIC IDs differ from the HyperTransport node IDs
235 which the K8 northbridge parsing fills in.
236 Assume they are all increased by a constant offset,
237 but in the same order as the HT nodeids.
238 If that doesn't result in a usable node fall back to the
239 path for the previous case. */
240
241 int ht_nodeid = c->initial_apicid;
242
243 if (ht_nodeid >= 0 &&
244 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
245 node = apicid_to_node[ht_nodeid];
246 /* Pick a nearby node */
247 if (!node_online(node))
248 node = nearby_node(apicid);
249 }
250 numa_set_node(cpu, node);
251
252 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
253#endif
254}
255
256static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
257{
258#ifdef CONFIG_X86_HT
259 unsigned bits, ecx;
260
261 /* Multi core CPU? */
262 if (c->extended_cpuid_level < 0x80000008)
263 return;
264
265 ecx = cpuid_ecx(0x80000008);
266
267 c->x86_max_cores = (ecx & 0xff) + 1;
268
269 /* CPU telling us the core id bits shift? */
270 bits = (ecx >> 12) & 0xF;
271
272 /* Otherwise recompute */
273 if (bits == 0) {
274 while ((1 << bits) < c->x86_max_cores)
275 bits++;
276 }
277
278 c->x86_coreid_bits = bits;
279#endif
280}
281
282static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
283{
284 early_init_amd_mc(c);
285
286 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
287 if (c->x86_power & (1<<8))
288 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
289
290#ifdef CONFIG_X86_64
291 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
292#else
293 /* Set MTRR capability flag if appropriate */
294 if (c->x86 == 5)
295 if (c->x86_model == 13 || c->x86_model == 9 ||
296 (c->x86_model == 8 && c->x86_mask >= 8))
297 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
298#endif
299}
300
301static void __cpuinit init_amd(struct cpuinfo_x86 *c)
302{
44#ifdef CONFIG_SMP 303#ifdef CONFIG_SMP
45 unsigned long long value; 304 unsigned long long value;
46 305
@@ -51,7 +310,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
51 * Errata 63 for SH-B3 steppings 310 * Errata 63 for SH-B3 steppings
52 * Errata 122 for all steppings (F+ have it disabled by default) 311 * Errata 122 for all steppings (F+ have it disabled by default)
53 */ 312 */
54 if (c->x86 == 15) { 313 if (c->x86 == 0xf) {
55 rdmsrl(MSR_K7_HWCR, value); 314 rdmsrl(MSR_K7_HWCR, value);
56 value |= 1 << 6; 315 value |= 1 << 6;
57 wrmsrl(MSR_K7_HWCR, value); 316 wrmsrl(MSR_K7_HWCR, value);
@@ -61,213 +320,119 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
61 early_init_amd(c); 320 early_init_amd(c);
62 321
63 /* 322 /*
64 * FIXME: We should handle the K5 here. Set up the write
65 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
66 * no bus pipeline)
67 */
68
69 /*
70 * Bit 31 in normal CPUID used for nonstandard 3DNow ID; 323 * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
71 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway 324 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
72 */ 325 */
73 clear_cpu_cap(c, 0*32+31); 326 clear_cpu_cap(c, 0*32+31);
74 327
75 r = get_model_name(c); 328#ifdef CONFIG_X86_64
76 329 /* On C+ stepping K8 rep microcode works well for copy/memset */
77 switch (c->x86) { 330 if (c->x86 == 0xf) {
78 case 4: 331 u32 level;
79 /*
80 * General Systems BIOSen alias the cpu frequency registers
81 * of the Elan at 0x000df000. Unfortuantly, one of the Linux
82 * drivers subsequently pokes it, and changes the CPU speed.
83 * Workaround : Remove the unneeded alias.
84 */
85#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
86#define CBAR_ENB (0x80000000)
87#define CBAR_KEY (0X000000CB)
88 if (c->x86_model == 9 || c->x86_model == 10) {
89 if (inl (CBAR) & CBAR_ENB)
90 outl (0 | CBAR_KEY, CBAR);
91 }
92 break;
93 case 5:
94 if (c->x86_model < 6) {
95 /* Based on AMD doc 20734R - June 2000 */
96 if (c->x86_model == 0) {
97 clear_cpu_cap(c, X86_FEATURE_APIC);
98 set_cpu_cap(c, X86_FEATURE_PGE);
99 }
100 break;
101 }
102
103 if (c->x86_model == 6 && c->x86_mask == 1) {
104 const int K6_BUG_LOOP = 1000000;
105 int n;
106 void (*f_vide)(void);
107 unsigned long d, d2;
108
109 printk(KERN_INFO "AMD K6 stepping B detected - ");
110
111 /*
112 * It looks like AMD fixed the 2.6.2 bug and improved indirect
113 * calls at the same time.
114 */
115
116 n = K6_BUG_LOOP;
117 f_vide = vide;
118 rdtscl(d);
119 while (n--)
120 f_vide();
121 rdtscl(d2);
122 d = d2-d;
123
124 if (d > 20*K6_BUG_LOOP)
125 printk("system stability may be impaired when more than 32 MB are used.\n");
126 else
127 printk("probably OK (after B9730xxxx).\n");
128 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
129 }
130
131 /* K6 with old style WHCR */
132 if (c->x86_model < 8 ||
133 (c->x86_model == 8 && c->x86_mask < 8)) {
134 /* We can only write allocate on the low 508Mb */
135 if (mbytes > 508)
136 mbytes = 508;
137
138 rdmsr(MSR_K6_WHCR, l, h);
139 if ((l&0x0000FFFF) == 0) {
140 unsigned long flags;
141 l = (1<<0)|((mbytes/4)<<1);
142 local_irq_save(flags);
143 wbinvd();
144 wrmsr(MSR_K6_WHCR, l, h);
145 local_irq_restore(flags);
146 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
147 mbytes);
148 }
149 break;
150 }
151
152 if ((c->x86_model == 8 && c->x86_mask > 7) ||
153 c->x86_model == 9 || c->x86_model == 13) {
154 /* The more serious chips .. */
155
156 if (mbytes > 4092)
157 mbytes = 4092;
158
159 rdmsr(MSR_K6_WHCR, l, h);
160 if ((l&0xFFFF0000) == 0) {
161 unsigned long flags;
162 l = ((mbytes>>2)<<22)|(1<<16);
163 local_irq_save(flags);
164 wbinvd();
165 wrmsr(MSR_K6_WHCR, l, h);
166 local_irq_restore(flags);
167 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
168 mbytes);
169 }
170
171 /* Set MTRR capability flag if appropriate */
172 if (c->x86_model == 13 || c->x86_model == 9 ||
173 (c->x86_model == 8 && c->x86_mask >= 8))
174 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
175 break;
176 }
177
178 if (c->x86_model == 10) {
179 /* AMD Geode LX is model 10 */
180 /* placeholder for any needed mods */
181 break;
182 }
183 break;
184 case 6: /* An Athlon/Duron */
185 332
186 /* 333 level = cpuid_eax(1);
187 * Bit 15 of Athlon specific MSR 15, needs to be 0 334 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
188 * to enable SSE on Palomino/Morgan/Barton CPU's. 335 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
189 * If the BIOS didn't enable it already, enable it here.
190 */
191 if (c->x86_model >= 6 && c->x86_model <= 10) {
192 if (!cpu_has(c, X86_FEATURE_XMM)) {
193 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
194 rdmsr(MSR_K7_HWCR, l, h);
195 l &= ~0x00008000;
196 wrmsr(MSR_K7_HWCR, l, h);
197 set_cpu_cap(c, X86_FEATURE_XMM);
198 }
199 }
200
201 /*
202 * It's been determined by AMD that Athlons since model 8 stepping 1
203 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
204 * As per AMD technical note 27212 0.2
205 */
206 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
207 rdmsr(MSR_K7_CLK_CTL, l, h);
208 if ((l & 0xfff00000) != 0x20000000) {
209 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
210 ((l & 0x000fffff)|0x20000000));
211 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
212 }
213 }
214 break;
215 } 336 }
337 if (c->x86 == 0x10 || c->x86 == 0x11)
338 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
339#else
340
341 /*
342 * FIXME: We should handle the K5 here. Set up the write
343 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
344 * no bus pipeline)
345 */
216 346
217 switch (c->x86) { 347 switch (c->x86) {
218 case 15: 348 case 4:
219 /* Use K8 tuning for Fam10h and Fam11h */ 349 init_amd_k5(c);
220 case 0x10:
221 case 0x11:
222 set_cpu_cap(c, X86_FEATURE_K8);
223 break; 350 break;
224 case 6: 351 case 5:
225 set_cpu_cap(c, X86_FEATURE_K7); 352 init_amd_k6(c);
353 break;
354 case 6: /* An Athlon/Duron */
355 init_amd_k7(c);
226 break; 356 break;
227 } 357 }
358
359 /* K6s report MCEs but don't actually have all the MSRs */
360 if (c->x86 < 6)
361 clear_cpu_cap(c, X86_FEATURE_MCE);
362#endif
363
364 /* Enable workaround for FXSAVE leak */
228 if (c->x86 >= 6) 365 if (c->x86 >= 6)
229 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); 366 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
230 367
231 display_cacheinfo(c); 368 if (!c->x86_model_id[0]) {
232 369 switch (c->x86) {
233 if (cpuid_eax(0x80000000) >= 0x80000008) 370 case 0xf:
234 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; 371 /* Should distinguish Models here, but this is only
372 a fallback anyways. */
373 strcpy(c->x86_model_id, "Hammer");
374 break;
375 }
376 }
235 377
236#ifdef CONFIG_X86_HT 378 display_cacheinfo(c);
237 /*
238 * On a AMD multi core setup the lower bits of the APIC id
239 * distinguish the cores.
240 */
241 if (c->x86_max_cores > 1) {
242 int cpu = smp_processor_id();
243 unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
244 379
245 if (bits == 0) { 380 /* Multi core CPU? */
246 while ((1 << bits) < c->x86_max_cores) 381 if (c->extended_cpuid_level >= 0x80000008) {
247 bits++; 382 amd_detect_cmp(c);
248 } 383 srat_detect_node(c);
249 c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
250 c->phys_proc_id >>= bits;
251 printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
252 cpu, c->x86_max_cores, c->cpu_core_id);
253 } 384 }
385
386#ifdef CONFIG_X86_32
387 detect_ht(c);
254#endif 388#endif
255 389
256 if (cpuid_eax(0x80000000) >= 0x80000006) { 390 if (c->extended_cpuid_level >= 0x80000006) {
257 if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) 391 if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
258 num_cache_leaves = 4; 392 num_cache_leaves = 4;
259 else 393 else
260 num_cache_leaves = 3; 394 num_cache_leaves = 3;
261 } 395 }
262 396
263 /* K6s reports MCEs but don't actually have all the MSRs */ 397 if (c->x86 >= 0xf && c->x86 <= 0x11)
264 if (c->x86 < 6) 398 set_cpu_cap(c, X86_FEATURE_K8);
265 clear_cpu_cap(c, X86_FEATURE_MCE);
266 399
267 if (cpu_has_xmm2) 400 if (cpu_has_xmm2) {
401 /* MFENCE stops RDTSC speculation */
268 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); 402 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
403 }
404
405#ifdef CONFIG_X86_64
406 if (c->x86 == 0x10) {
407 /* do this for boot cpu */
408 if (c == &boot_cpu_data)
409 check_enable_amd_mmconf_dmi();
410
411 fam10h_check_enable_mmcfg();
412 }
413
414 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
415 unsigned long long tseg;
416
417 /*
418 * Split up direct mapping around the TSEG SMM area.
419 * Don't do it for gbpages because there seems very little
420 * benefit in doing so.
421 */
422 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
423 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
424 if ((tseg>>PMD_SHIFT) <
425 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
426 ((tseg>>PMD_SHIFT) <
427 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
428 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
429 set_memory_4k((unsigned long)__va(tseg), 1);
430 }
431 }
432#endif
269} 433}
270 434
435#ifdef CONFIG_X86_32
271static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) 436static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
272{ 437{
273 /* AMD errata T13 (order #21922) */ 438 /* AMD errata T13 (order #21922) */
@@ -280,10 +445,12 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
280 } 445 }
281 return size; 446 return size;
282} 447}
448#endif
283 449
284static struct cpu_dev amd_cpu_dev __cpuinitdata = { 450static struct cpu_dev amd_cpu_dev __cpuinitdata = {
285 .c_vendor = "AMD", 451 .c_vendor = "AMD",
286 .c_ident = { "AuthenticAMD" }, 452 .c_ident = { "AuthenticAMD" },
453#ifdef CONFIG_X86_32
287 .c_models = { 454 .c_models = {
288 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = 455 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
289 { 456 {
@@ -296,9 +463,11 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = {
296 } 463 }
297 }, 464 },
298 }, 465 },
466 .c_size_cache = amd_size_cache,
467#endif
299 .c_early_init = early_init_amd, 468 .c_early_init = early_init_amd,
300 .c_init = init_amd, 469 .c_init = init_amd,
301 .c_size_cache = amd_size_cache, 470 .c_x86_vendor = X86_VENDOR_AMD,
302}; 471};
303 472
304cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); 473cpu_dev_register(amd_cpu_dev);
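
A small illustration of the amd_detect_cmp() split above, using made-up numbers: with x86_coreid_bits = 2 (a four-core package, from CPUID 0x80000008 ECX[15:12]) an initial APIC ID of 0x6 maps to core 2 in socket 1.

#include <stdio.h>

int main(void)
{
	unsigned int coreid_bits = 2;		/* from CPUID 0x80000008 ECX[15:12] */
	unsigned int initial_apicid = 0x6;	/* example value */

	unsigned int cpu_core_id  = initial_apicid & ((1 << coreid_bits) - 1);	/* 2 */
	unsigned int phys_proc_id = initial_apicid >> coreid_bits;		/* 1 */

	printf("core %u in socket %u\n", cpu_core_id, phys_proc_id);
	return 0;
}
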
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
deleted file mode 100644
index 7c36fb8a28d4..000000000000
--- a/arch/x86/kernel/cpu/amd_64.c
+++ /dev/null
@@ -1,222 +0,0 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3
4#include <asm/numa_64.h>
5#include <asm/mmconfig.h>
6#include <asm/cacheflush.h>
7
8#include <mach_apic.h>
9
10#include "cpu.h"
11
12int force_mwait __cpuinitdata;
13
14#ifdef CONFIG_NUMA
15static int __cpuinit nearby_node(int apicid)
16{
17 int i, node;
18
19 for (i = apicid - 1; i >= 0; i--) {
20 node = apicid_to_node[i];
21 if (node != NUMA_NO_NODE && node_online(node))
22 return node;
23 }
24 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
25 node = apicid_to_node[i];
26 if (node != NUMA_NO_NODE && node_online(node))
27 return node;
28 }
29 return first_node(node_online_map); /* Shouldn't happen */
30}
31#endif
32
33/*
34 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
35 * Assumes number of cores is a power of two.
36 */
37static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
38{
39#ifdef CONFIG_SMP
40 unsigned bits;
41#ifdef CONFIG_NUMA
42 int cpu = smp_processor_id();
43 int node = 0;
44 unsigned apicid = hard_smp_processor_id();
45#endif
46 bits = c->x86_coreid_bits;
47
48 /* Low order bits define the core id (index of core in socket) */
49 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
50 /* Convert the initial APIC ID into the socket ID */
51 c->phys_proc_id = c->initial_apicid >> bits;
52
53#ifdef CONFIG_NUMA
54 node = c->phys_proc_id;
55 if (apicid_to_node[apicid] != NUMA_NO_NODE)
56 node = apicid_to_node[apicid];
57 if (!node_online(node)) {
58 /* Two possibilities here:
59 - The CPU is missing memory and no node was created.
60 In that case try picking one from a nearby CPU
61 - The APIC IDs differ from the HyperTransport node IDs
62 which the K8 northbridge parsing fills in.
63 Assume they are all increased by a constant offset,
64 but in the same order as the HT nodeids.
65 If that doesn't result in a usable node fall back to the
66 path for the previous case. */
67
68 int ht_nodeid = c->initial_apicid;
69
70 if (ht_nodeid >= 0 &&
71 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
72 node = apicid_to_node[ht_nodeid];
73 /* Pick a nearby node */
74 if (!node_online(node))
75 node = nearby_node(apicid);
76 }
77 numa_set_node(cpu, node);
78
79 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
80#endif
81#endif
82}
83
84static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
85{
86#ifdef CONFIG_SMP
87 unsigned bits, ecx;
88
89 /* Multi core CPU? */
90 if (c->extended_cpuid_level < 0x80000008)
91 return;
92
93 ecx = cpuid_ecx(0x80000008);
94
95 c->x86_max_cores = (ecx & 0xff) + 1;
96
97 /* CPU telling us the core id bits shift? */
98 bits = (ecx >> 12) & 0xF;
99
100 /* Otherwise recompute */
101 if (bits == 0) {
102 while ((1 << bits) < c->x86_max_cores)
103 bits++;
104 }
105
106 c->x86_coreid_bits = bits;
107
108#endif
109}
110
111static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
112{
113 early_init_amd_mc(c);
114
115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
116 if (c->x86_power & (1<<8))
117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
118}
119
120static void __cpuinit init_amd(struct cpuinfo_x86 *c)
121{
122 unsigned level;
123
124#ifdef CONFIG_SMP
125 unsigned long value;
126
127 /*
128 * Disable TLB flush filter by setting HWCR.FFDIS on K8
129 * bit 6 of msr C001_0015
130 *
131 * Errata 63 for SH-B3 steppings
132 * Errata 122 for all steppings (F+ have it disabled by default)
133 */
134 if (c->x86 == 0xf) {
135 rdmsrl(MSR_K8_HWCR, value);
136 value |= 1 << 6;
137 wrmsrl(MSR_K8_HWCR, value);
138 }
139#endif
140
141 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
142 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
143 clear_cpu_cap(c, 0*32+31);
144
145 /* On C+ stepping K8 rep microcode works well for copy/memset */
146 if (c->x86 == 0xf) {
147 level = cpuid_eax(1);
148 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
149 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
150 }
151 if (c->x86 == 0x10 || c->x86 == 0x11)
152 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
153
154 /* Enable workaround for FXSAVE leak */
155 if (c->x86 >= 6)
156 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
157
158 level = get_model_name(c);
159 if (!level) {
160 switch (c->x86) {
161 case 0xf:
162 /* Should distinguish Models here, but this is only
163 a fallback anyways. */
164 strcpy(c->x86_model_id, "Hammer");
165 break;
166 }
167 }
168 display_cacheinfo(c);
169
170 /* Multi core CPU? */
171 if (c->extended_cpuid_level >= 0x80000008)
172 amd_detect_cmp(c);
173
174 if (c->extended_cpuid_level >= 0x80000006 &&
175 (cpuid_edx(0x80000006) & 0xf000))
176 num_cache_leaves = 4;
177 else
178 num_cache_leaves = 3;
179
180 if (c->x86 >= 0xf && c->x86 <= 0x11)
181 set_cpu_cap(c, X86_FEATURE_K8);
182
183 /* MFENCE stops RDTSC speculation */
184 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
185
186 if (c->x86 == 0x10) {
187 /* do this for boot cpu */
188 if (c == &boot_cpu_data)
189 check_enable_amd_mmconf_dmi();
190
191 fam10h_check_enable_mmcfg();
192 }
193
194 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
195 unsigned long long tseg;
196
197 /*
198 * Split up direct mapping around the TSEG SMM area.
199 * Don't do it for gbpages because there seems very little
200 * benefit in doing so.
201 */
202 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
203 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
204 if ((tseg>>PMD_SHIFT) <
205 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
206 ((tseg>>PMD_SHIFT) <
207 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
208 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
209 set_memory_4k((unsigned long)__va(tseg), 1);
210 }
211 }
212}
213
214static struct cpu_dev amd_cpu_dev __cpuinitdata = {
215 .c_vendor = "AMD",
216 .c_ident = { "AuthenticAMD" },
217 .c_early_init = early_init_amd,
218 .c_init = init_amd,
219};
220
221cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
222
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 1b1c56bb338f..c8e315f1aa83 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -50,6 +50,8 @@ static double __initdata y = 3145727.0;
50 */ 50 */
51static void __init check_fpu(void) 51static void __init check_fpu(void)
52{ 52{
53 s32 fdiv_bug;
54
53 if (!boot_cpu_data.hard_math) { 55 if (!boot_cpu_data.hard_math) {
54#ifndef CONFIG_MATH_EMULATION 56#ifndef CONFIG_MATH_EMULATION
55 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); 57 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
@@ -74,8 +76,10 @@ static void __init check_fpu(void)
74 "fistpl %0\n\t" 76 "fistpl %0\n\t"
75 "fwait\n\t" 77 "fwait\n\t"
76 "fninit" 78 "fninit"
77 : "=m" (*&boot_cpu_data.fdiv_bug) 79 : "=m" (*&fdiv_bug)
78 : "m" (*&x), "m" (*&y)); 80 : "m" (*&x), "m" (*&y));
81
82 boot_cpu_data.fdiv_bug = fdiv_bug;
79 if (boot_cpu_data.fdiv_bug) 83 if (boot_cpu_data.fdiv_bug)
80 printk("Hmm, FPU with FDIV bug.\n"); 84 printk("Hmm, FPU with FDIV bug.\n");
81} 85}
@@ -131,13 +135,7 @@ static void __init check_popad(void)
131 * (for due to lack of "invlpg" and working WP on a i386) 135 * (for due to lack of "invlpg" and working WP on a i386)
132 * - In order to run on anything without a TSC, we need to be 136 * - In order to run on anything without a TSC, we need to be
133 * compiled for a i486. 137 * compiled for a i486.
134 * - In order to support the local APIC on a buggy Pentium machine, 138 */
135 * we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
136 * which happens implicitly if compiled for a Pentium or lower
137 * (unless an advanced selection of CPU features is used) as an
138 * otherwise config implies a properly working local APIC without
139 * the need to do extra reads from the APIC.
140*/
141 139
142static void __init check_config(void) 140static void __init check_config(void)
143{ 141{
@@ -151,21 +149,6 @@ static void __init check_config(void)
151 if (boot_cpu_data.x86 == 3) 149 if (boot_cpu_data.x86 == 3)
152 panic("Kernel requires i486+ for 'invlpg' and other features"); 150 panic("Kernel requires i486+ for 'invlpg' and other features");
153#endif 151#endif
154
155/*
156 * If we were told we had a good local APIC, check for buggy Pentia,
157 * i.e. all B steppings and the C2 stepping of P54C when using their
158 * integrated APIC (see 11AP erratum in "Pentium Processor
159 * Specification Update").
160 */
161#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
162 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
163 && cpu_has_apic
164 && boot_cpu_data.x86 == 5
165 && boot_cpu_data.x86_model == 2
166 && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
167 panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
168#endif
169} 152}
170 153
171 154
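
The asm block kept above is the classic Pentium FDIV test. A plain C sketch of the same arithmetic, assuming the companion constant x defined just above this hunk is the usual 4195835.0 (the kernel keeps the asm form to control exactly which FPU instructions run):

#include <stdio.h>

int main(void)
{
	volatile double x = 4195835.0, y = 3145727.0;
	double residue = x - (x / y) * y;	/* 0.0 on a correct FDIV unit */

	if (residue != 0.0)
		printf("Hmm, FPU with FDIV bug (residue %f)\n", residue);
	else
		printf("FDIV looks OK\n");
	return 0;
}
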
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e0f45edd6a55..89bfdd9cacc6 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -289,7 +289,6 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
289 if (c->x86_model >= 6 && c->x86_model < 9) 289 if (c->x86_model >= 6 && c->x86_model < 9)
290 set_cpu_cap(c, X86_FEATURE_3DNOW); 290 set_cpu_cap(c, X86_FEATURE_3DNOW);
291 291
292 get_model_name(c);
293 display_cacheinfo(c); 292 display_cacheinfo(c);
294} 293}
295 294
@@ -314,6 +313,16 @@ enum {
314 EAMD3D = 1<<20, 313 EAMD3D = 1<<20,
315}; 314};
316 315
316static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
317{
318 switch (c->x86) {
319 case 5:
320 /* Emulate MTRRs using Centaur's MCR. */
321 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
322 break;
323 }
324}
325
317static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 326static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
318{ 327{
319 328
@@ -462,8 +471,10 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
462static struct cpu_dev centaur_cpu_dev __cpuinitdata = { 471static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
463 .c_vendor = "Centaur", 472 .c_vendor = "Centaur",
464 .c_ident = { "CentaurHauls" }, 473 .c_ident = { "CentaurHauls" },
474 .c_early_init = early_init_centaur,
465 .c_init = init_centaur, 475 .c_init = init_centaur,
466 .c_size_cache = centaur_size_cache, 476 .c_size_cache = centaur_size_cache,
477 .c_x86_vendor = X86_VENDOR_CENTAUR,
467}; 478};
468 479
469cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); 480cpu_dev_register(centaur_cpu_dev);
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index 1d181c40e2e1..a1625f5a1e78 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -16,9 +16,10 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
16 16
17static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 17static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
18{ 18{
19 early_init_centaur(c);
20
19 if (c->x86 == 0x6 && c->x86_model >= 0xf) { 21 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
20 c->x86_cache_alignment = c->x86_clflush_size * 2; 22 c->x86_cache_alignment = c->x86_clflush_size * 2;
21 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
22 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 23 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
23 } 24 }
24 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 25 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -29,7 +30,8 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
29 .c_ident = { "CentaurHauls" }, 30 .c_ident = { "CentaurHauls" },
30 .c_early_init = early_init_centaur, 31 .c_early_init = early_init_centaur,
31 .c_init = init_centaur, 32 .c_init = init_centaur,
33 .c_x86_vendor = X86_VENDOR_CENTAUR,
32}; 34};
33 35
34cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); 36cpu_dev_register(centaur_cpu_dev);
35 37
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
new file mode 100644
index 000000000000..2056ccf572cc
--- /dev/null
+++ b/arch/x86/kernel/cpu/cmpxchg.c
@@ -0,0 +1,72 @@
1/*
2 * cmpxchg*() fallbacks for CPU not supporting these instructions
3 */
4
5#include <linux/kernel.h>
6#include <linux/smp.h>
7#include <linux/module.h>
8
9#ifndef CONFIG_X86_CMPXCHG
10unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
11{
12 u8 prev;
13 unsigned long flags;
14
15 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
16 local_irq_save(flags);
17 prev = *(u8 *)ptr;
18 if (prev == old)
19 *(u8 *)ptr = new;
20 local_irq_restore(flags);
21 return prev;
22}
23EXPORT_SYMBOL(cmpxchg_386_u8);
24
25unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
26{
27 u16 prev;
28 unsigned long flags;
29
30 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
31 local_irq_save(flags);
32 prev = *(u16 *)ptr;
33 if (prev == old)
34 *(u16 *)ptr = new;
35 local_irq_restore(flags);
36 return prev;
37}
38EXPORT_SYMBOL(cmpxchg_386_u16);
39
40unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
41{
42 u32 prev;
43 unsigned long flags;
44
45 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
46 local_irq_save(flags);
47 prev = *(u32 *)ptr;
48 if (prev == old)
49 *(u32 *)ptr = new;
50 local_irq_restore(flags);
51 return prev;
52}
53EXPORT_SYMBOL(cmpxchg_386_u32);
54#endif
55
56#ifndef CONFIG_X86_CMPXCHG64
57unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
58{
59 u64 prev;
60 unsigned long flags;
61
62 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
63 local_irq_save(flags);
64 prev = *(u64 *)ptr;
65 if (prev == old)
66 *(u64 *)ptr = new;
67 local_irq_restore(flags);
68 return prev;
69}
70EXPORT_SYMBOL(cmpxchg_486_u64);
71#endif
72
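
A sketch of how the fallback above is typically used: the standard compare-and-swap retry loop. Real code calls cmpxchg() and lets the headers pick either the instruction or this helper; as the comments note, the fallback itself is only suitable for UP.

static void example_atomic_add(volatile u32 *counter, u32 delta)
{
	u32 old;
	unsigned long seen;

	do {
		old = *counter;
		seen = cmpxchg_386_u32(counter, old, old + delta);
	} while (seen != old);	/* someone else changed it first, retry */
}
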
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 80ab20d4fa39..25581dcb280e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,27 +1,62 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
2#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
3#include <linux/delay.h> 10#include <linux/delay.h>
4#include <linux/smp.h> 11#include <linux/smp.h>
5#include <linux/module.h>
6#include <linux/percpu.h> 12#include <linux/percpu.h>
7#include <linux/bootmem.h>
8#include <asm/processor.h>
9#include <asm/i387.h> 13#include <asm/i387.h>
10#include <asm/msr.h> 14#include <asm/msr.h>
11#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/linkage.h>
12#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
13#include <asm/mtrr.h> 18#include <asm/mtrr.h>
14#include <asm/mce.h> 19#include <asm/mce.h>
15#include <asm/pat.h> 20#include <asm/pat.h>
21#include <asm/asm.h>
22#include <asm/numa.h>
16#ifdef CONFIG_X86_LOCAL_APIC 23#ifdef CONFIG_X86_LOCAL_APIC
17#include <asm/mpspec.h> 24#include <asm/mpspec.h>
18#include <asm/apic.h> 25#include <asm/apic.h>
19#include <mach_apic.h> 26#include <mach_apic.h>
27#include <asm/genapic.h>
20#endif 28#endif
21 29
30#include <asm/pda.h>
31#include <asm/pgtable.h>
32#include <asm/processor.h>
33#include <asm/desc.h>
34#include <asm/atomic.h>
35#include <asm/proto.h>
36#include <asm/sections.h>
37#include <asm/setup.h>
38
22#include "cpu.h" 39#include "cpu.h"
23 40
41static struct cpu_dev *this_cpu __cpuinitdata;
42
43#ifdef CONFIG_X86_64
44/* We need valid kernel segments for data and code in long mode too
45 * IRET will check the segment types kkeil 2000/10/28
46 * Also sysret mandates a special GDT layout
47 */
48/* The TLS descriptors are currently at a different place compared to i386.
49 Hopefully nobody expects them at a fixed place (Wine?) */
24DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { 50DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
51 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
52 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
53 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
54 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
55 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
56 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
57} };
58#else
59DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
25 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 60 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
26 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 61 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
27 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, 62 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
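
For reference, a decode of one entry from the table above (illustrative, not part of the patch): GDT_ENTRY_KERNEL_CS = { 0x0000ffff, 0x00cf9a00 } is the flat ring-0 code segment, i.e. base 0, 4 GB limit, 32-bit, 4 KB granularity.

#include <stdio.h>

int main(void)
{
	unsigned int lo = 0x0000ffff, hi = 0x00cf9a00;

	unsigned int base   = (lo >> 16) | ((hi & 0xff) << 16) | (hi & 0xff000000);
	unsigned int limit  = (lo & 0xffff) | (hi & 0x000f0000);	/* 0xfffff */
	unsigned int access = (hi >> 8) & 0xff;	/* 0x9a: present, DPL 0, code, readable */
	unsigned int flags  = (hi >> 20) & 0xf;	/* 0xc: 4 KB granularity, 32-bit */

	printf("base %#x limit %#x access %#x flags %#x\n",
	       base, limit, access, flags);
	return 0;
}
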
@@ -55,17 +90,157 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
55 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 90 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
56 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, 91 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
57} }; 92} };
93#endif
58EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 94EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
59 95
60__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 96#ifdef CONFIG_X86_32
61
62static int cachesize_override __cpuinitdata = -1; 97static int cachesize_override __cpuinitdata = -1;
63static int disable_x86_serial_nr __cpuinitdata = 1; 98static int disable_x86_serial_nr __cpuinitdata = 1;
64 99
65struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; 100static int __init cachesize_setup(char *str)
101{
102 get_option(&str, &cachesize_override);
103 return 1;
104}
105__setup("cachesize=", cachesize_setup);
106
107static int __init x86_fxsr_setup(char *s)
108{
109 setup_clear_cpu_cap(X86_FEATURE_FXSR);
110 setup_clear_cpu_cap(X86_FEATURE_XMM);
111 return 1;
112}
113__setup("nofxsr", x86_fxsr_setup);
114
115static int __init x86_sep_setup(char *s)
116{
117 setup_clear_cpu_cap(X86_FEATURE_SEP);
118 return 1;
119}
120__setup("nosep", x86_sep_setup);
121
122/* Standard macro to see if a specific flag is changeable */
123static inline int flag_is_changeable_p(u32 flag)
124{
125 u32 f1, f2;
126
127 /*
128 * Cyrix and IDT cpus allow disabling of CPUID
129 * so the code below may return different results
130 * when it is executed before and after enabling
131 * the CPUID. Add "volatile" to not allow gcc to
132 * optimize the subsequent calls to this function.
133 */
134 asm volatile ("pushfl\n\t"
135 "pushfl\n\t"
136 "popl %0\n\t"
137 "movl %0,%1\n\t"
138 "xorl %2,%0\n\t"
139 "pushl %0\n\t"
140 "popfl\n\t"
141 "pushfl\n\t"
142 "popl %0\n\t"
143 "popfl\n\t"
144 : "=&r" (f1), "=&r" (f2)
145 : "ir" (flag));
146
147 return ((f1^f2) & flag) != 0;
148}
149
150/* Probe for the CPUID instruction */
151static int __cpuinit have_cpuid_p(void)
152{
153 return flag_is_changeable_p(X86_EFLAGS_ID);
154}
155
156static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
157{
158 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
159 /* Disable processor serial number */
160 unsigned long lo, hi;
161 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
162 lo |= 0x200000;
163 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
164 printk(KERN_NOTICE "CPU serial number disabled.\n");
165 clear_cpu_cap(c, X86_FEATURE_PN);
166
167 /* Disabling the serial number may affect the cpuid level */
168 c->cpuid_level = cpuid_eax(0);
169 }
170}
171
172static int __init x86_serial_nr_setup(char *s)
173{
174 disable_x86_serial_nr = 0;
175 return 1;
176}
177__setup("serialnumber", x86_serial_nr_setup);
178#else
179static inline int flag_is_changeable_p(u32 flag)
180{
181 return 1;
182}
183/* Probe for the CPUID instruction */
184static inline int have_cpuid_p(void)
185{
186 return 1;
187}
188static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
189{
190}
191#endif
192
193/*
194 * Naming convention should be: <Name> [(<Codename>)]
195 * This table is only used if init_<vendor>() below doesn't set it;
196 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
197 *
198 */
199
200/* Look up CPU names by table lookup. */
201static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
202{
203 struct cpu_model_info *info;
204
205 if (c->x86_model >= 16)
206 return NULL; /* Range check */
207
208 if (!this_cpu)
209 return NULL;
210
211 info = this_cpu->c_models;
212
213 while (info && info->family) {
214 if (info->family == c->x86)
215 return info->model_names[c->x86_model];
216 info++;
217 }
218 return NULL; /* Not found */
219}
220
221__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
222
223/* Current gdt points %fs at the "master" per-cpu area: after this,
224 * it's on the real one. */
225void switch_to_new_gdt(void)
226{
227 struct desc_ptr gdt_descr;
228
229 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
230 gdt_descr.size = GDT_SIZE - 1;
231 load_gdt(&gdt_descr);
232#ifdef CONFIG_X86_32
233 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
234#endif
235}
236
237static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
66 238
67static void __cpuinit default_init(struct cpuinfo_x86 *c) 239static void __cpuinit default_init(struct cpuinfo_x86 *c)
68{ 240{
241#ifdef CONFIG_X86_64
242 display_cacheinfo(c);
243#else
69 /* Not much we can do here... */ 244 /* Not much we can do here... */
70 /* Check if at least it has cpuid */ 245 /* Check if at least it has cpuid */
71 if (c->cpuid_level == -1) { 246 if (c->cpuid_level == -1) {
@@ -75,28 +250,22 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
75 else if (c->x86 == 3) 250 else if (c->x86 == 3)
76 strcpy(c->x86_model_id, "386"); 251 strcpy(c->x86_model_id, "386");
77 } 252 }
253#endif
78} 254}
79 255
80static struct cpu_dev __cpuinitdata default_cpu = { 256static struct cpu_dev __cpuinitdata default_cpu = {
81 .c_init = default_init, 257 .c_init = default_init,
82 .c_vendor = "Unknown", 258 .c_vendor = "Unknown",
259 .c_x86_vendor = X86_VENDOR_UNKNOWN,
83}; 260};
84static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
85
86static int __init cachesize_setup(char *str)
87{
88 get_option(&str, &cachesize_override);
89 return 1;
90}
91__setup("cachesize=", cachesize_setup);
92 261
93int __cpuinit get_model_name(struct cpuinfo_x86 *c) 262static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
94{ 263{
95 unsigned int *v; 264 unsigned int *v;
96 char *p, *q; 265 char *p, *q;
97 266
98 if (cpuid_eax(0x80000000) < 0x80000004) 267 if (c->extended_cpuid_level < 0x80000004)
99 return 0; 268 return;
100 269
101 v = (unsigned int *) c->x86_model_id; 270 v = (unsigned int *) c->x86_model_id;
102 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); 271 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
@@ -115,30 +284,34 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c)
115 while (q <= &c->x86_model_id[48]) 284 while (q <= &c->x86_model_id[48])
116 *q++ = '\0'; /* Zero-pad the rest */ 285 *q++ = '\0'; /* Zero-pad the rest */
117 } 286 }
118
119 return 1;
120} 287}
121 288
122
123void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 289void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
124{ 290{
125 unsigned int n, dummy, ecx, edx, l2size; 291 unsigned int n, dummy, ebx, ecx, edx, l2size;
126 292
127 n = cpuid_eax(0x80000000); 293 n = c->extended_cpuid_level;
128 294
129 if (n >= 0x80000005) { 295 if (n >= 0x80000005) {
130 cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); 296 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
131 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", 297 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
132 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); 298 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
133 c->x86_cache_size = (ecx>>24)+(edx>>24); 299 c->x86_cache_size = (ecx>>24) + (edx>>24);
300#ifdef CONFIG_X86_64
301 /* On K8 L1 TLB is inclusive, so don't count it */
302 c->x86_tlbsize = 0;
303#endif
134 } 304 }
135 305
136 if (n < 0x80000006) /* Some chips just have a large L1. */ 306
137 return; 307 return;
138 308
139 ecx = cpuid_ecx(0x80000006); 309 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
140 l2size = ecx >> 16; 310 l2size = ecx >> 16;
141 311
312#ifdef CONFIG_X86_64
313 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
314#else
142 /* do processor-specific cache resizing */ 315 /* do processor-specific cache resizing */
143 if (this_cpu->c_size_cache) 316 if (this_cpu->c_size_cache)
144 l2size = this_cpu->c_size_cache(c, l2size); 317 l2size = this_cpu->c_size_cache(c, l2size);
@@ -149,116 +322,106 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
149 322
150 if (l2size == 0) 323 if (l2size == 0)
151 return; /* Again, no L2 cache is possible */ 324 return; /* Again, no L2 cache is possible */
325#endif
152 326
153 c->x86_cache_size = l2size; 327 c->x86_cache_size = l2size;
154 328
155 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", 329 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
156 l2size, ecx & 0xFF); 330 l2size, ecx & 0xFF);
157} 331}
158 332
159/* 333void __cpuinit detect_ht(struct cpuinfo_x86 *c)
160 * Naming convention should be: <Name> [(<Codename>)]
161 * This table only is used unless init_<vendor>() below doesn't set it;
162 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
163 *
164 */
165
166/* Look up CPU names by table lookup. */
167static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
168{ 334{
169 struct cpu_model_info *info; 335#ifdef CONFIG_X86_HT
336 u32 eax, ebx, ecx, edx;
337 int index_msb, core_bits;
170 338
171 if (c->x86_model >= 16) 339 if (!cpu_has(c, X86_FEATURE_HT))
172 return NULL; /* Range check */ 340 return;
173 341
174 if (!this_cpu) 342 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
175 return NULL; 343 goto out;
176 344
177 info = this_cpu->c_models; 345 if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
346 return;
178 347
179 while (info && info->family) { 348 cpuid(1, &eax, &ebx, &ecx, &edx);
180 if (info->family == c->x86) 349
181 return info->model_names[c->x86_model]; 350 smp_num_siblings = (ebx & 0xff0000) >> 16;
182 info++; 351
352 if (smp_num_siblings == 1) {
353 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
354 } else if (smp_num_siblings > 1) {
355
356 if (smp_num_siblings > NR_CPUS) {
357 printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
358 smp_num_siblings);
359 smp_num_siblings = 1;
360 return;
361 }
362
363 index_msb = get_count_order(smp_num_siblings);
364#ifdef CONFIG_X86_64
365 c->phys_proc_id = phys_pkg_id(index_msb);
366#else
367 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
368#endif
369
370 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
371
372 index_msb = get_count_order(smp_num_siblings);
373
374 core_bits = get_count_order(c->x86_max_cores);
375
376#ifdef CONFIG_X86_64
377 c->cpu_core_id = phys_pkg_id(index_msb) &
378 ((1 << core_bits) - 1);
379#else
380 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
381 ((1 << core_bits) - 1);
382#endif
183 } 383 }
184 return NULL; /* Not found */
185}
186 384
385out:
386 if ((c->x86_max_cores * smp_num_siblings) > 1) {
387 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
388 c->phys_proc_id);
389 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
390 c->cpu_core_id);
391 }
392#endif
393}
187 394
188static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) 395static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
189{ 396{
190 char *v = c->x86_vendor_id; 397 char *v = c->x86_vendor_id;
191 int i; 398 int i;
192 static int printed; 399 static int printed;
193 400
194 for (i = 0; i < X86_VENDOR_NUM; i++) { 401 for (i = 0; i < X86_VENDOR_NUM; i++) {
195 if (cpu_devs[i]) { 402 if (!cpu_devs[i])
196 if (!strcmp(v, cpu_devs[i]->c_ident[0]) || 403 break;
197 (cpu_devs[i]->c_ident[1] && 404
198 !strcmp(v, cpu_devs[i]->c_ident[1]))) { 405 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
199 c->x86_vendor = i; 406 (cpu_devs[i]->c_ident[1] &&
200 if (!early) 407 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
201 this_cpu = cpu_devs[i]; 408 this_cpu = cpu_devs[i];
202 return; 409 c->x86_vendor = this_cpu->c_x86_vendor;
203 } 410 return;
204 } 411 }
205 } 412 }
413
206 if (!printed) { 414 if (!printed) {
207 printed++; 415 printed++;
208 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); 416 printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
209 printk(KERN_ERR "CPU: Your system may be unstable.\n"); 417 printk(KERN_ERR "CPU: Your system may be unstable.\n");
210 } 418 }
419
211 c->x86_vendor = X86_VENDOR_UNKNOWN; 420 c->x86_vendor = X86_VENDOR_UNKNOWN;
212 this_cpu = &default_cpu; 421 this_cpu = &default_cpu;
213} 422}
214 423
215 424void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
216static int __init x86_fxsr_setup(char *s)
217{
218 setup_clear_cpu_cap(X86_FEATURE_FXSR);
219 setup_clear_cpu_cap(X86_FEATURE_XMM);
220 return 1;
221}
222__setup("nofxsr", x86_fxsr_setup);
223
224
225static int __init x86_sep_setup(char *s)
226{
227 setup_clear_cpu_cap(X86_FEATURE_SEP);
228 return 1;
229}
230__setup("nosep", x86_sep_setup);
231
232
233/* Standard macro to see if a specific flag is changeable */
234static inline int flag_is_changeable_p(u32 flag)
235{
236 u32 f1, f2;
237
238 asm("pushfl\n\t"
239 "pushfl\n\t"
240 "popl %0\n\t"
241 "movl %0,%1\n\t"
242 "xorl %2,%0\n\t"
243 "pushl %0\n\t"
244 "popfl\n\t"
245 "pushfl\n\t"
246 "popl %0\n\t"
247 "popfl\n\t"
248 : "=&r" (f1), "=&r" (f2)
249 : "ir" (flag));
250
251 return ((f1^f2) & flag) != 0;
252}
253
254
255/* Probe for the CPUID instruction */
256static int __cpuinit have_cpuid_p(void)
257{
258 return flag_is_changeable_p(X86_EFLAGS_ID);
259}
260
261void __init cpu_detect(struct cpuinfo_x86 *c)
262{ 425{
263 /* Get vendor name */ 426 /* Get vendor name */
264 cpuid(0x00000000, (unsigned int *)&c->cpuid_level, 427 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
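
A worked example of the detect_ht() arithmetic above, with made-up values: a package with 2 cores and 2 HT siblings per core reports 4 siblings in CPUID.1 EBX[23:16], and APIC ID 0x5 then resolves to package 1, core 0, thread 1. count_order() below is a stand-in for the kernel's get_count_order() (ceil(log2)), and the plain apicid shift stands in for phys_pkg_id().

#include <stdio.h>

static unsigned int count_order(unsigned int n)	/* stand-in for get_count_order() */
{
	unsigned int order = 0;

	while ((1u << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned int siblings = 4, max_cores = 2, apicid = 0x5;

	unsigned int index_msb = count_order(siblings);			/* 2 */
	unsigned int phys_proc_id = apicid >> index_msb;		/* 1 */

	unsigned int per_core = siblings / max_cores;			/* 2 */
	unsigned int core_shift = count_order(per_core);		/* 1 */
	unsigned int core_bits = count_order(max_cores);		/* 1 */
	unsigned int cpu_core_id =
		(apicid >> core_shift) & ((1 << core_bits) - 1);	/* 0 */

	printf("package %u core %u thread %u\n",
	       phys_proc_id, cpu_core_id, apicid & (per_core - 1));
	return 0;
}
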
@@ -267,50 +430,87 @@ void __init cpu_detect(struct cpuinfo_x86 *c)
267 (unsigned int *)&c->x86_vendor_id[4]); 430 (unsigned int *)&c->x86_vendor_id[4]);
268 431
269 c->x86 = 4; 432 c->x86 = 4;
433 /* Intel-defined flags: level 0x00000001 */
270 if (c->cpuid_level >= 0x00000001) { 434 if (c->cpuid_level >= 0x00000001) {
271 u32 junk, tfms, cap0, misc; 435 u32 junk, tfms, cap0, misc;
272 cpuid(0x00000001, &tfms, &misc, &junk, &cap0); 436 cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
273 c->x86 = (tfms >> 8) & 15; 437 c->x86 = (tfms >> 8) & 0xf;
274 c->x86_model = (tfms >> 4) & 15; 438 c->x86_model = (tfms >> 4) & 0xf;
439 c->x86_mask = tfms & 0xf;
275 if (c->x86 == 0xf) 440 if (c->x86 == 0xf)
276 c->x86 += (tfms >> 20) & 0xff; 441 c->x86 += (tfms >> 20) & 0xff;
277 if (c->x86 >= 0x6) 442 if (c->x86 >= 0x6)
278 c->x86_model += ((tfms >> 16) & 0xF) << 4; 443 c->x86_model += ((tfms >> 16) & 0xf) << 4;
279 c->x86_mask = tfms & 15;
280 if (cap0 & (1<<19)) { 444 if (cap0 & (1<<19)) {
281 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
282 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; 445 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
446 c->x86_cache_alignment = c->x86_clflush_size;
283 } 447 }
284 } 448 }
285} 449}
286static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) 450
451static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
287{ 452{
288 u32 tfms, xlvl; 453 u32 tfms, xlvl;
289 unsigned int ebx; 454 u32 ebx;
290 455
291 memset(&c->x86_capability, 0, sizeof c->x86_capability); 456 /* Intel-defined flags: level 0x00000001 */
292 if (have_cpuid_p()) { 457 if (c->cpuid_level >= 0x00000001) {
293 /* Intel-defined flags: level 0x00000001 */ 458 u32 capability, excap;
294 if (c->cpuid_level >= 0x00000001) { 459 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
295 u32 capability, excap; 460 c->x86_capability[0] = capability;
296 cpuid(0x00000001, &tfms, &ebx, &excap, &capability); 461 c->x86_capability[4] = excap;
297 c->x86_capability[0] = capability; 462 }
298 c->x86_capability[4] = excap;
299 }
300 463
301 /* AMD-defined flags: level 0x80000001 */ 464 /* AMD-defined flags: level 0x80000001 */
302 xlvl = cpuid_eax(0x80000000); 465 xlvl = cpuid_eax(0x80000000);
303 if ((xlvl & 0xffff0000) == 0x80000000) { 466 c->extended_cpuid_level = xlvl;
304 if (xlvl >= 0x80000001) { 467 if ((xlvl & 0xffff0000) == 0x80000000) {
305 c->x86_capability[1] = cpuid_edx(0x80000001); 468 if (xlvl >= 0x80000001) {
306 c->x86_capability[6] = cpuid_ecx(0x80000001); 469 c->x86_capability[1] = cpuid_edx(0x80000001);
307 } 470 c->x86_capability[6] = cpuid_ecx(0x80000001);
308 } 471 }
472 }
473
474#ifdef CONFIG_X86_64
475 if (c->extended_cpuid_level >= 0x80000008) {
476 u32 eax = cpuid_eax(0x80000008);
309 477
478 c->x86_virt_bits = (eax >> 8) & 0xff;
479 c->x86_phys_bits = eax & 0xff;
310 } 480 }
481#endif
482
483 if (c->extended_cpuid_level >= 0x80000007)
484 c->x86_power = cpuid_edx(0x80000007);
311 485
312} 486}
313 487
488static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
489{
490#ifdef CONFIG_X86_32
491 int i;
492
493 /*
494 * First of all, decide if this is a 486 or higher
495 * It's a 486 if we can modify the AC flag
496 */
497 if (flag_is_changeable_p(X86_EFLAGS_AC))
498 c->x86 = 4;
499 else
500 c->x86 = 3;
501
502 for (i = 0; i < X86_VENDOR_NUM; i++)
503 if (cpu_devs[i] && cpu_devs[i]->c_identify) {
504 c->x86_vendor_id[0] = 0;
505 cpu_devs[i]->c_identify(c);
506 if (c->x86_vendor_id[0]) {
507 get_cpu_vendor(c);
508 break;
509 }
510 }
511#endif
512}
513
314/* 514/*
315 * Do minimum CPU detection early. 515 * Do minimum CPU detection early.
316 * Fields really needed: vendor, cpuid_level, family, model, mask, 516 * Fields really needed: vendor, cpuid_level, family, model, mask,
@@ -320,109 +520,113 @@ static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
320 * WARNING: this function is only called on the BP. Don't add code here 520 * WARNING: this function is only called on the BP. Don't add code here
321 * that is supposed to run on all CPUs. 521 * that is supposed to run on all CPUs.
322 */ 522 */
323static void __init early_cpu_detect(void) 523static void __init early_identify_cpu(struct cpuinfo_x86 *c)
324{ 524{
325 struct cpuinfo_x86 *c = &boot_cpu_data; 525#ifdef CONFIG_X86_64
326 526 c->x86_clflush_size = 64;
327 c->x86_cache_alignment = 32; 527#else
328 c->x86_clflush_size = 32; 528 c->x86_clflush_size = 32;
529#endif
530 c->x86_cache_alignment = c->x86_clflush_size;
531
532 memset(&c->x86_capability, 0, sizeof c->x86_capability);
533 c->extended_cpuid_level = 0;
534
535 if (!have_cpuid_p())
536 identify_cpu_without_cpuid(c);
329 537
538 /* cyrix could have cpuid enabled via c_identify()*/
330 if (!have_cpuid_p()) 539 if (!have_cpuid_p())
331 return; 540 return;
332 541
333 cpu_detect(c); 542 cpu_detect(c);
334 543
335 get_cpu_vendor(c, 1); 544 get_cpu_vendor(c);
545
546 get_cpu_cap(c);
336 547
337 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 548 if (this_cpu->c_early_init)
338 cpu_devs[c->x86_vendor]->c_early_init) 549 this_cpu->c_early_init(c);
339 cpu_devs[c->x86_vendor]->c_early_init(c);
340 550
341 early_get_cap(c); 551 validate_pat_support(c);
342} 552}
343 553
344static void __cpuinit generic_identify(struct cpuinfo_x86 *c) 554void __init early_cpu_init(void)
345{ 555{
346 u32 tfms, xlvl; 556 struct cpu_dev **cdev;
347 unsigned int ebx; 557 int count = 0;
348 558
349 if (have_cpuid_p()) { 559 printk("KERNEL supported cpus:\n");
350 /* Get vendor name */ 560 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
351 cpuid(0x00000000, (unsigned int *)&c->cpuid_level, 561 struct cpu_dev *cpudev = *cdev;
352 (unsigned int *)&c->x86_vendor_id[0], 562 unsigned int j;
353 (unsigned int *)&c->x86_vendor_id[8], 563
354 (unsigned int *)&c->x86_vendor_id[4]); 564 if (count >= X86_VENDOR_NUM)
355 565 break;
356 get_cpu_vendor(c, 0); 566 cpu_devs[count] = cpudev;
357 /* Initialize the standard set of capabilities */ 567 count++;
358 /* Note that the vendor-specific code below might override */ 568
359 /* Intel-defined flags: level 0x00000001 */ 569 for (j = 0; j < 2; j++) {
360 if (c->cpuid_level >= 0x00000001) { 570 if (!cpudev->c_ident[j])
361 u32 capability, excap; 571 continue;
362 cpuid(0x00000001, &tfms, &ebx, &excap, &capability); 572 printk(" %s %s\n", cpudev->c_vendor,
363 c->x86_capability[0] = capability; 573 cpudev->c_ident[j]);
364 c->x86_capability[4] = excap;
365 c->x86 = (tfms >> 8) & 15;
366 c->x86_model = (tfms >> 4) & 15;
367 if (c->x86 == 0xf)
368 c->x86 += (tfms >> 20) & 0xff;
369 if (c->x86 >= 0x6)
370 c->x86_model += ((tfms >> 16) & 0xF) << 4;
371 c->x86_mask = tfms & 15;
372 c->initial_apicid = (ebx >> 24) & 0xFF;
373#ifdef CONFIG_X86_HT
374 c->apicid = phys_pkg_id(c->initial_apicid, 0);
375 c->phys_proc_id = c->initial_apicid;
376#else
377 c->apicid = c->initial_apicid;
378#endif
379 if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
380 c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
381 } else {
382 /* Have CPUID level 0 only - unheard of */
383 c->x86 = 4;
384 }
385
386 /* AMD-defined flags: level 0x80000001 */
387 xlvl = cpuid_eax(0x80000000);
388 if ((xlvl & 0xffff0000) == 0x80000000) {
389 if (xlvl >= 0x80000001) {
390 c->x86_capability[1] = cpuid_edx(0x80000001);
391 c->x86_capability[6] = cpuid_ecx(0x80000001);
392 }
393 if (xlvl >= 0x80000004)
394 get_model_name(c); /* Default name */
395 } 574 }
396
397 init_scattered_cpuid_features(c);
398 } 575 }
399 576
577 early_identify_cpu(&boot_cpu_data);
400} 578}
401 579
402static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) 580/*
581 * The NOPL instruction is supposed to exist on all CPUs with
582 * family >= 6; unfortunately, that's not true in practice because
583 * of early VIA chips and (more importantly) broken virtualizers that
584 * are not easy to detect. In the latter case it doesn't even *fail*
585 * reliably, so probing for it doesn't even work. Disable it completely
586 * unless we can find a reliable way to detect all the broken cases.
587 */
588static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
403{ 589{
404 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { 590 clear_cpu_cap(c, X86_FEATURE_NOPL);
405 /* Disable processor serial number */
406 unsigned long lo, hi;
407 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
408 lo |= 0x200000;
409 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
410 printk(KERN_NOTICE "CPU serial number disabled.\n");
411 clear_cpu_cap(c, X86_FEATURE_PN);
412
413 /* Disabling the serial number may affect the cpuid level */
414 c->cpuid_level = cpuid_eax(0);
415 }
416} 591}
417 592
418static int __init x86_serial_nr_setup(char *s) 593static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
419{ 594{
420 disable_x86_serial_nr = 0; 595 c->extended_cpuid_level = 0;
421 return 1; 596
422} 597 if (!have_cpuid_p())
423__setup("serialnumber", x86_serial_nr_setup); 598 identify_cpu_without_cpuid(c);
599
600 /* cyrix could have cpuid enabled via c_identify()*/
601 if (!have_cpuid_p())
602 return;
603
604 cpu_detect(c);
605
606 get_cpu_vendor(c);
424 607
608 get_cpu_cap(c);
425 609
610 if (c->cpuid_level >= 0x00000001) {
611 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
612#ifdef CONFIG_X86_32
613# ifdef CONFIG_X86_HT
614 c->apicid = phys_pkg_id(c->initial_apicid, 0);
615# else
616 c->apicid = c->initial_apicid;
617# endif
618#endif
619
620#ifdef CONFIG_X86_HT
621 c->phys_proc_id = c->initial_apicid;
622#endif
623 }
624
625 get_model_name(c); /* Default name */
626
627 init_scattered_cpuid_features(c);
628 detect_nopl(c);
629}
426 630
427/* 631/*
428 * This does the hard work of actually picking apart the CPU stuff... 632 * This does the hard work of actually picking apart the CPU stuff...
@@ -434,30 +638,29 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
434 c->loops_per_jiffy = loops_per_jiffy; 638 c->loops_per_jiffy = loops_per_jiffy;
435 c->x86_cache_size = -1; 639 c->x86_cache_size = -1;
436 c->x86_vendor = X86_VENDOR_UNKNOWN; 640 c->x86_vendor = X86_VENDOR_UNKNOWN;
437 c->cpuid_level = -1; /* CPUID not detected */
438 c->x86_model = c->x86_mask = 0; /* So far unknown... */ 641 c->x86_model = c->x86_mask = 0; /* So far unknown... */
439 c->x86_vendor_id[0] = '\0'; /* Unset */ 642 c->x86_vendor_id[0] = '\0'; /* Unset */
440 c->x86_model_id[0] = '\0'; /* Unset */ 643 c->x86_model_id[0] = '\0'; /* Unset */
441 c->x86_max_cores = 1; 644 c->x86_max_cores = 1;
645 c->x86_coreid_bits = 0;
646#ifdef CONFIG_X86_64
647 c->x86_clflush_size = 64;
648#else
649 c->cpuid_level = -1; /* CPUID not detected */
442 c->x86_clflush_size = 32; 650 c->x86_clflush_size = 32;
651#endif
652 c->x86_cache_alignment = c->x86_clflush_size;
443 memset(&c->x86_capability, 0, sizeof c->x86_capability); 653 memset(&c->x86_capability, 0, sizeof c->x86_capability);
444 654
445 if (!have_cpuid_p()) {
446 /*
447 * First of all, decide if this is a 486 or higher
448 * It's a 486 if we can modify the AC flag
449 */
450 if (flag_is_changeable_p(X86_EFLAGS_AC))
451 c->x86 = 4;
452 else
453 c->x86 = 3;
454 }
455
456 generic_identify(c); 655 generic_identify(c);
457 656
458 if (this_cpu->c_identify) 657 if (this_cpu->c_identify)
459 this_cpu->c_identify(c); 658 this_cpu->c_identify(c);
460 659
660#ifdef CONFIG_X86_64
661 c->apicid = phys_pkg_id(0);
662#endif
663
461 /* 664 /*
462 * Vendor-specific initialization. In this section we 665 * Vendor-specific initialization. In this section we
463 * canonicalize the feature flags, meaning if there are 666 * canonicalize the feature flags, meaning if there are
@@ -491,6 +694,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
491 c->x86, c->x86_model); 694 c->x86, c->x86_model);
492 } 695 }
493 696
697#ifdef CONFIG_X86_64
698 detect_ht(c);
699#endif
700
494 /* 701 /*
495 * On SMP, boot_cpu_data holds the common feature set between 702 * On SMP, boot_cpu_data holds the common feature set between
496 * all CPUs; so make sure that we indicate which features are 703 * all CPUs; so make sure that we indicate which features are
@@ -499,7 +706,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
499 */ 706 */
500 if (c != &boot_cpu_data) { 707 if (c != &boot_cpu_data) {
501 /* AND the already accumulated flags with these */ 708 /* AND the already accumulated flags with these */
502 for (i = 0 ; i < NCAPINTS ; i++) 709 for (i = 0; i < NCAPINTS; i++)
503 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 710 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
504 } 711 }
505 712
@@ -507,72 +714,91 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
507 for (i = 0; i < NCAPINTS; i++) 714 for (i = 0; i < NCAPINTS; i++)
508 c->x86_capability[i] &= ~cleared_cpu_caps[i]; 715 c->x86_capability[i] &= ~cleared_cpu_caps[i];
509 716
717#ifdef CONFIG_X86_MCE
510 /* Init Machine Check Exception if available. */ 718 /* Init Machine Check Exception if available. */
511 mcheck_init(c); 719 mcheck_init(c);
720#endif
512 721
513 select_idle_routine(c); 722 select_idle_routine(c);
723
724#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
725 numa_add_cpu(smp_processor_id());
726#endif
514} 727}
515 728
729#ifdef CONFIG_X86_64
730static void vgetcpu_set_mode(void)
731{
732 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
733 vgetcpu_mode = VGETCPU_RDTSCP;
734 else
735 vgetcpu_mode = VGETCPU_LSL;
736}
737#endif
738
516void __init identify_boot_cpu(void) 739void __init identify_boot_cpu(void)
517{ 740{
518 identify_cpu(&boot_cpu_data); 741 identify_cpu(&boot_cpu_data);
742#ifdef CONFIG_X86_32
519 sysenter_setup(); 743 sysenter_setup();
520 enable_sep_cpu(); 744 enable_sep_cpu();
745#else
746 vgetcpu_set_mode();
747#endif
521} 748}
522 749
523void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 750void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
524{ 751{
525 BUG_ON(c == &boot_cpu_data); 752 BUG_ON(c == &boot_cpu_data);
526 identify_cpu(c); 753 identify_cpu(c);
754#ifdef CONFIG_X86_32
527 enable_sep_cpu(); 755 enable_sep_cpu();
756#endif
528 mtrr_ap_init(); 757 mtrr_ap_init();
529} 758}
530 759
531#ifdef CONFIG_X86_HT 760struct msr_range {
532void __cpuinit detect_ht(struct cpuinfo_x86 *c) 761 unsigned min;
533{ 762 unsigned max;
534 u32 eax, ebx, ecx, edx; 763};
535 int index_msb, core_bits;
536
537 cpuid(1, &eax, &ebx, &ecx, &edx);
538
539 if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
540 return;
541
542 smp_num_siblings = (ebx & 0xff0000) >> 16;
543 764
544 if (smp_num_siblings == 1) { 765static struct msr_range msr_range_array[] __cpuinitdata = {
545 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 766 { 0x00000000, 0x00000418},
546 } else if (smp_num_siblings > 1) { 767 { 0xc0000000, 0xc000040b},
768 { 0xc0010000, 0xc0010142},
769 { 0xc0011000, 0xc001103b},
770};
547 771
548 if (smp_num_siblings > NR_CPUS) { 772static void __cpuinit print_cpu_msr(void)
549 printk(KERN_WARNING "CPU: Unsupported number of the " 773{
550 "siblings %d", smp_num_siblings); 774 unsigned index;
551 smp_num_siblings = 1; 775 u64 val;
552 return; 776 int i;
777 unsigned index_min, index_max;
778
779 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
780 index_min = msr_range_array[i].min;
781 index_max = msr_range_array[i].max;
782 for (index = index_min; index < index_max; index++) {
783 if (rdmsrl_amd_safe(index, &val))
784 continue;
785 printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
553 } 786 }
787 }
788}
554 789
555 index_msb = get_count_order(smp_num_siblings); 790static int show_msr __cpuinitdata;
556 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); 791static __init int setup_show_msr(char *arg)
557 792{
558 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 793 int num;
559 c->phys_proc_id);
560
561 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
562
563 index_msb = get_count_order(smp_num_siblings) ;
564
565 core_bits = get_count_order(c->x86_max_cores);
566 794
567 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & 795 get_option(&arg, &num);
568 ((1 << core_bits) - 1);
569 796
570 if (c->x86_max_cores > 1) 797 if (num > 0)
571 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 798 show_msr = num;
572 c->cpu_core_id); 799 return 1;
573 }
574} 800}
575#endif 801__setup("show_msr=", setup_show_msr);
576 802
577static __init int setup_noclflush(char *arg) 803static __init int setup_noclflush(char *arg)
578{ 804{
@@ -590,18 +816,26 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
590 else if (c->cpuid_level >= 0) 816 else if (c->cpuid_level >= 0)
591 vendor = c->x86_vendor_id; 817 vendor = c->x86_vendor_id;
592 818
593 if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) 819 if (vendor && !strstr(c->x86_model_id, vendor))
594 printk("%s ", vendor); 820 printk(KERN_CONT "%s ", vendor);
595 821
596 if (!c->x86_model_id[0]) 822 if (c->x86_model_id[0])
597 printk("%d86", c->x86); 823 printk(KERN_CONT "%s", c->x86_model_id);
598 else 824 else
599 printk("%s", c->x86_model_id); 825 printk(KERN_CONT "%d86", c->x86);
600 826
601 if (c->x86_mask || c->cpuid_level >= 0) 827 if (c->x86_mask || c->cpuid_level >= 0)
602 printk(" stepping %02x\n", c->x86_mask); 828 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
603 else 829 else
604 printk("\n"); 830 printk(KERN_CONT "\n");
831
832#ifdef CONFIG_SMP
833 if (c->cpu_index < show_msr)
834 print_cpu_msr();
835#else
836 if (show_msr)
837 print_cpu_msr();
838#endif
605} 839}
606 840
607static __init int setup_disablecpuid(char *arg) 841static __init int setup_disablecpuid(char *arg)
@@ -617,19 +851,89 @@ __setup("clearcpuid=", setup_disablecpuid);
617 851
618cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 852cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
619 853
620void __init early_cpu_init(void) 854#ifdef CONFIG_X86_64
855struct x8664_pda **_cpu_pda __read_mostly;
856EXPORT_SYMBOL(_cpu_pda);
857
858struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
859
860char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
861
862void __cpuinit pda_init(int cpu)
621{ 863{
622 struct cpu_vendor_dev *cvdev; 864 struct x8664_pda *pda = cpu_pda(cpu);
865
866	/* Set up data that may be needed in __get_free_pages early */
867 loadsegment(fs, 0);
868 loadsegment(gs, 0);
869 /* Memory clobbers used to order PDA accessed */
870 mb();
871 wrmsrl(MSR_GS_BASE, pda);
872 mb();
873
874 pda->cpunumber = cpu;
875 pda->irqcount = -1;
876 pda->kernelstack = (unsigned long)stack_thread_info() -
877 PDA_STACKOFFSET + THREAD_SIZE;
878 pda->active_mm = &init_mm;
879 pda->mmu_state = 0;
880
881 if (cpu == 0) {
882 /* others are initialized in smpboot.c */
883 pda->pcurrent = &init_task;
884 pda->irqstackptr = boot_cpu_stack;
885 pda->irqstackptr += IRQSTACKSIZE - 64;
886 } else {
887 if (!pda->irqstackptr) {
888 pda->irqstackptr = (char *)
889 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
890 if (!pda->irqstackptr)
891 panic("cannot allocate irqstack for cpu %d",
892 cpu);
893 pda->irqstackptr += IRQSTACKSIZE - 64;
894 }
895
896 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
897 pda->nodenumber = cpu_to_node(cpu);
898 }
899}
900
901char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
902 DEBUG_STKSZ] __page_aligned_bss;
623 903
624 for (cvdev = __x86cpuvendor_start ; 904extern asmlinkage void ignore_sysret(void);
625 cvdev < __x86cpuvendor_end ;
626 cvdev++)
627 cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
628 905
629 early_cpu_detect(); 906/* May not be marked __init: used by software suspend */
630 validate_pat_support(&boot_cpu_data); 907void syscall_init(void)
908{
909 /*
910 * LSTAR and STAR live in a bit strange symbiosis.
911 * They both write to the same internal register. STAR allows to
912 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
913 */
914 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
915 wrmsrl(MSR_LSTAR, system_call);
916 wrmsrl(MSR_CSTAR, ignore_sysret);
917
918#ifdef CONFIG_IA32_EMULATION
919 syscall32_cpu_init();
920#endif
921
922 /* Flags to clear on syscall */
923 wrmsrl(MSR_SYSCALL_MASK,
924 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
631} 925}
632 926
927unsigned long kernel_eflags;
928
929/*
930 * Copies of the original ist values from the tss are only accessed during
931 * debugging, no special alignment required.
932 */
933DEFINE_PER_CPU(struct orig_ist, orig_ist);
934
935#else
936
633/* Make sure %fs is initialized properly in idle threads */ 937/* Make sure %fs is initialized properly in idle threads */
634struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) 938struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
635{ 939{
@@ -637,25 +941,136 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
637 regs->fs = __KERNEL_PERCPU; 941 regs->fs = __KERNEL_PERCPU;
638 return regs; 942 return regs;
639} 943}
640 944#endif
641/* Current gdt points %fs at the "master" per-cpu area: after this,
642 * it's on the real one. */
643void switch_to_new_gdt(void)
644{
645 struct desc_ptr gdt_descr;
646
647 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
648 gdt_descr.size = GDT_SIZE - 1;
649 load_gdt(&gdt_descr);
650 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
651}
652 945
653/* 946/*
654 * cpu_init() initializes state that is per-CPU. Some data is already 947 * cpu_init() initializes state that is per-CPU. Some data is already
655 * initialized (naturally) in the bootstrap process, such as the GDT 948 * initialized (naturally) in the bootstrap process, such as the GDT
656 * and IDT. We reload them nevertheless, this function acts as a 949 * and IDT. We reload them nevertheless, this function acts as a
657 * 'CPU state barrier', nothing should get across. 950 * 'CPU state barrier', nothing should get across.
951 * A lot of state is already set up in PDA init for 64 bit
658 */ 952 */
953#ifdef CONFIG_X86_64
954void __cpuinit cpu_init(void)
955{
956 int cpu = stack_smp_processor_id();
957 struct tss_struct *t = &per_cpu(init_tss, cpu);
958 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
959 unsigned long v;
960 char *estacks = NULL;
961 struct task_struct *me;
962 int i;
963
964 /* CPU 0 is initialised in head64.c */
965 if (cpu != 0)
966 pda_init(cpu);
967 else
968 estacks = boot_exception_stacks;
969
970 me = current;
971
972 if (cpu_test_and_set(cpu, cpu_initialized))
973 panic("CPU#%d already initialized!\n", cpu);
974
975 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
976
977 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
978
979 /*
980 * Initialize the per-CPU GDT with the boot GDT,
981 * and set up the GDT descriptor:
982 */
983
984 switch_to_new_gdt();
985 load_idt((const struct desc_ptr *)&idt_descr);
986
987 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
988 syscall_init();
989
990 wrmsrl(MSR_FS_BASE, 0);
991 wrmsrl(MSR_KERNEL_GS_BASE, 0);
992 barrier();
993
994 check_efer();
995 if (cpu != 0 && x2apic)
996 enable_x2apic();
997
998 /*
999 * set up and load the per-CPU TSS
1000 */
1001 if (!orig_ist->ist[0]) {
1002 static const unsigned int order[N_EXCEPTION_STACKS] = {
1003 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
1004 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
1005 };
1006 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1007 if (cpu) {
1008 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1009 if (!estacks)
1010 panic("Cannot allocate exception "
1011 "stack %ld %d\n", v, cpu);
1012 }
1013 estacks += PAGE_SIZE << order[v];
1014 orig_ist->ist[v] = t->x86_tss.ist[v] =
1015 (unsigned long)estacks;
1016 }
1017 }
1018
1019 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1020 /*
1021 * <= is required because the CPU will access up to
1022 * 8 bits beyond the end of the IO permission bitmap.
1023 */
1024 for (i = 0; i <= IO_BITMAP_LONGS; i++)
1025 t->io_bitmap[i] = ~0UL;
1026
1027 atomic_inc(&init_mm.mm_count);
1028 me->active_mm = &init_mm;
1029 if (me->mm)
1030 BUG();
1031 enter_lazy_tlb(&init_mm, me);
1032
1033 load_sp0(t, &current->thread);
1034 set_tss_desc(cpu, t);
1035 load_TR_desc();
1036 load_LDT(&init_mm.context);
1037
1038#ifdef CONFIG_KGDB
1039 /*
1040 * If the kgdb is connected no debug regs should be altered. This
1041 * is only applicable when KGDB and a KGDB I/O module are built
1042 * into the kernel and you are using early debugging with
1043 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1044 */
1045 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1046 arch_kgdb_ops.correct_hw_break();
1047 else {
1048#endif
1049 /*
1050 * Clear all 6 debug registers:
1051 */
1052
1053 set_debugreg(0UL, 0);
1054 set_debugreg(0UL, 1);
1055 set_debugreg(0UL, 2);
1056 set_debugreg(0UL, 3);
1057 set_debugreg(0UL, 6);
1058 set_debugreg(0UL, 7);
1059#ifdef CONFIG_KGDB
1060 /* If the kgdb is connected no debug regs should be altered. */
1061 }
1062#endif
1063
1064 fpu_init();
1065
1066 raw_local_save_flags(kernel_eflags);
1067
1068 if (is_uv_system())
1069 uv_cpu_init();
1070}
1071
1072#else
1073
659void __cpuinit cpu_init(void) 1074void __cpuinit cpu_init(void)
660{ 1075{
661 int cpu = smp_processor_id(); 1076 int cpu = smp_processor_id();
@@ -709,19 +1124,21 @@ void __cpuinit cpu_init(void)
709 /* 1124 /*
710 * Force FPU initialization: 1125 * Force FPU initialization:
711 */ 1126 */
712 current_thread_info()->status = 0; 1127 if (cpu_has_xsave)
1128 current_thread_info()->status = TS_XSAVE;
1129 else
1130 current_thread_info()->status = 0;
713 clear_used_math(); 1131 clear_used_math();
714 mxcsr_feature_mask_init(); 1132 mxcsr_feature_mask_init();
715}
716 1133
717#ifdef CONFIG_HOTPLUG_CPU 1134 /*
718void __cpuinit cpu_uninit(void) 1135 * Boot processor to setup the FP and extended state context info.
719{ 1136 */
720 int cpu = raw_smp_processor_id(); 1137 if (!smp_processor_id())
721 cpu_clear(cpu, cpu_initialized); 1138 init_thread_xstate();
722 1139
723 /* lazy TLB state */ 1140 xsave_init();
724 per_cpu(cpu_tlbstate, cpu).state = 0;
725 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
726} 1141}
1142
1143
727#endif 1144#endif
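
A note on the hyper-threading topology math in this file: detect_ht() (the 32-bit version removed above and the 64-bit version inherited from the common_64.c deletion below) derives the physical package and core IDs by shifting fields out of the initial APIC ID, with get_count_order() supplying the field widths. The user-space sketch below illustrates that arithmetic only; get_count_order() is re-implemented locally, the 2-core/2-thread topology is invented, and phys_pkg_id() is assumed to reduce to a plain right shift of the APIC ID, as in flat APIC mode.

#include <stdio.h>

/* ceil(log2(count)), mirroring the kernel's get_count_order() */
static int get_count_order(unsigned int count)
{
	int order = 0;

	while ((1u << order) < count)
		order++;
	return order;
}

int main(void)
{
	unsigned int apicid    = 5;	/* example initial APIC ID */
	unsigned int siblings  = 4;	/* logical CPUs per package, CPUID.1:EBX[23:16] */
	unsigned int max_cores = 2;	/* cores per package */

	int index_msb  = get_count_order(siblings);		/* 2 */
	int pkg_id     = apicid >> index_msb;			/* 1 */

	int core_shift = get_count_order(siblings / max_cores);	/* 1 */
	int core_bits  = get_count_order(max_cores);		/* 1 */
	int core_id    = (apicid >> core_shift) & ((1 << core_bits) - 1);	/* 0 */

	printf("APIC ID %u -> package %d, core %d\n", apicid, pkg_id, core_id);
	return 0;
}

With the example APIC ID 5 (binary 101), the low bit is the thread, the next bit the core, and the remainder the package, which is exactly the decomposition the printed IDs reflect.
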
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
deleted file mode 100644
index 7b8cc72feb40..000000000000
--- a/arch/x86/kernel/cpu/common_64.c
+++ /dev/null
@@ -1,681 +0,0 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
10#include <linux/string.h>
11#include <linux/delay.h>
12#include <linux/smp.h>
13#include <linux/module.h>
14#include <linux/percpu.h>
15#include <asm/processor.h>
16#include <asm/i387.h>
17#include <asm/msr.h>
18#include <asm/io.h>
19#include <asm/mmu_context.h>
20#include <asm/mtrr.h>
21#include <asm/mce.h>
22#include <asm/pat.h>
23#include <asm/numa.h>
24#ifdef CONFIG_X86_LOCAL_APIC
25#include <asm/mpspec.h>
26#include <asm/apic.h>
27#include <mach_apic.h>
28#endif
29#include <asm/pda.h>
30#include <asm/pgtable.h>
31#include <asm/processor.h>
32#include <asm/desc.h>
33#include <asm/atomic.h>
34#include <asm/proto.h>
35#include <asm/sections.h>
36#include <asm/setup.h>
37#include <asm/genapic.h>
38
39#include "cpu.h"
40
41/* We need valid kernel segments for data and code in long mode too
42 * IRET will check the segment types kkeil 2000/10/28
43 * Also sysret mandates a special GDT layout
44 */
45/* The TLS descriptors are currently at a different place compared to i386.
46 Hopefully nobody expects them at a fixed place (Wine?) */
47DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
48 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
49 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
50 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
51 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
52 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
53 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
54} };
55EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
56
57__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
58
59/* Current gdt points %fs at the "master" per-cpu area: after this,
60 * it's on the real one. */
61void switch_to_new_gdt(void)
62{
63 struct desc_ptr gdt_descr;
64
65 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
66 gdt_descr.size = GDT_SIZE - 1;
67 load_gdt(&gdt_descr);
68}
69
70struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
71
72static void __cpuinit default_init(struct cpuinfo_x86 *c)
73{
74 display_cacheinfo(c);
75}
76
77static struct cpu_dev __cpuinitdata default_cpu = {
78 .c_init = default_init,
79 .c_vendor = "Unknown",
80};
81static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
82
83int __cpuinit get_model_name(struct cpuinfo_x86 *c)
84{
85 unsigned int *v;
86
87 if (c->extended_cpuid_level < 0x80000004)
88 return 0;
89
90 v = (unsigned int *) c->x86_model_id;
91 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
92 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
93 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
94 c->x86_model_id[48] = 0;
95 return 1;
96}
97
98
99void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
100{
101 unsigned int n, dummy, ebx, ecx, edx;
102
103 n = c->extended_cpuid_level;
104
105 if (n >= 0x80000005) {
106 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
107 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
108 "D cache %dK (%d bytes/line)\n",
109 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
110 c->x86_cache_size = (ecx>>24) + (edx>>24);
111 /* On K8 L1 TLB is inclusive, so don't count it */
112 c->x86_tlbsize = 0;
113 }
114
115 if (n >= 0x80000006) {
116 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
117 ecx = cpuid_ecx(0x80000006);
118 c->x86_cache_size = ecx >> 16;
119 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
120
121 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
122 c->x86_cache_size, ecx & 0xFF);
123 }
124}
125
126void __cpuinit detect_ht(struct cpuinfo_x86 *c)
127{
128#ifdef CONFIG_SMP
129 u32 eax, ebx, ecx, edx;
130 int index_msb, core_bits;
131
132 cpuid(1, &eax, &ebx, &ecx, &edx);
133
134
135 if (!cpu_has(c, X86_FEATURE_HT))
136 return;
137 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
138 goto out;
139
140 smp_num_siblings = (ebx & 0xff0000) >> 16;
141
142 if (smp_num_siblings == 1) {
143 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
144 } else if (smp_num_siblings > 1) {
145
146 if (smp_num_siblings > NR_CPUS) {
147 printk(KERN_WARNING "CPU: Unsupported number of "
148 "siblings %d", smp_num_siblings);
149 smp_num_siblings = 1;
150 return;
151 }
152
153 index_msb = get_count_order(smp_num_siblings);
154 c->phys_proc_id = phys_pkg_id(index_msb);
155
156 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
157
158 index_msb = get_count_order(smp_num_siblings);
159
160 core_bits = get_count_order(c->x86_max_cores);
161
162 c->cpu_core_id = phys_pkg_id(index_msb) &
163 ((1 << core_bits) - 1);
164 }
165out:
166 if ((c->x86_max_cores * smp_num_siblings) > 1) {
167 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
168 c->phys_proc_id);
169 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
170 c->cpu_core_id);
171 }
172
173#endif
174}
175
176static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
177{
178 char *v = c->x86_vendor_id;
179 int i;
180 static int printed;
181
182 for (i = 0; i < X86_VENDOR_NUM; i++) {
183 if (cpu_devs[i]) {
184 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
185 (cpu_devs[i]->c_ident[1] &&
186 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
187 c->x86_vendor = i;
188 this_cpu = cpu_devs[i];
189 return;
190 }
191 }
192 }
193 if (!printed) {
194 printed++;
195 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
196 printk(KERN_ERR "CPU: Your system may be unstable.\n");
197 }
198 c->x86_vendor = X86_VENDOR_UNKNOWN;
199}
200
201static void __init early_cpu_support_print(void)
202{
203 int i,j;
204 struct cpu_dev *cpu_devx;
205
206 printk("KERNEL supported cpus:\n");
207 for (i = 0; i < X86_VENDOR_NUM; i++) {
208 cpu_devx = cpu_devs[i];
209 if (!cpu_devx)
210 continue;
211 for (j = 0; j < 2; j++) {
212 if (!cpu_devx->c_ident[j])
213 continue;
214 printk(" %s %s\n", cpu_devx->c_vendor,
215 cpu_devx->c_ident[j]);
216 }
217 }
218}
219
220static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
221
222void __init early_cpu_init(void)
223{
224 struct cpu_vendor_dev *cvdev;
225
226 for (cvdev = __x86cpuvendor_start ;
227 cvdev < __x86cpuvendor_end ;
228 cvdev++)
229 cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
230 early_cpu_support_print();
231 early_identify_cpu(&boot_cpu_data);
232}
233
234/* Do some early cpuid on the boot CPU to get some parameter that are
235 needed before check_bugs. Everything advanced is in identify_cpu
236 below. */
237static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
238{
239 u32 tfms, xlvl;
240
241 c->loops_per_jiffy = loops_per_jiffy;
242 c->x86_cache_size = -1;
243 c->x86_vendor = X86_VENDOR_UNKNOWN;
244 c->x86_model = c->x86_mask = 0; /* So far unknown... */
245 c->x86_vendor_id[0] = '\0'; /* Unset */
246 c->x86_model_id[0] = '\0'; /* Unset */
247 c->x86_clflush_size = 64;
248 c->x86_cache_alignment = c->x86_clflush_size;
249 c->x86_max_cores = 1;
250 c->x86_coreid_bits = 0;
251 c->extended_cpuid_level = 0;
252 memset(&c->x86_capability, 0, sizeof c->x86_capability);
253
254 /* Get vendor name */
255 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
256 (unsigned int *)&c->x86_vendor_id[0],
257 (unsigned int *)&c->x86_vendor_id[8],
258 (unsigned int *)&c->x86_vendor_id[4]);
259
260 get_cpu_vendor(c);
261
262 /* Initialize the standard set of capabilities */
263 /* Note that the vendor-specific code below might override */
264
265 /* Intel-defined flags: level 0x00000001 */
266 if (c->cpuid_level >= 0x00000001) {
267 __u32 misc;
268 cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
269 &c->x86_capability[0]);
270 c->x86 = (tfms >> 8) & 0xf;
271 c->x86_model = (tfms >> 4) & 0xf;
272 c->x86_mask = tfms & 0xf;
273 if (c->x86 == 0xf)
274 c->x86 += (tfms >> 20) & 0xff;
275 if (c->x86 >= 0x6)
276 c->x86_model += ((tfms >> 16) & 0xF) << 4;
277 if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
278 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
279 } else {
280 /* Have CPUID level 0 only - unheard of */
281 c->x86 = 4;
282 }
283
284 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
285#ifdef CONFIG_SMP
286 c->phys_proc_id = c->initial_apicid;
287#endif
288 /* AMD-defined flags: level 0x80000001 */
289 xlvl = cpuid_eax(0x80000000);
290 c->extended_cpuid_level = xlvl;
291 if ((xlvl & 0xffff0000) == 0x80000000) {
292 if (xlvl >= 0x80000001) {
293 c->x86_capability[1] = cpuid_edx(0x80000001);
294 c->x86_capability[6] = cpuid_ecx(0x80000001);
295 }
296 if (xlvl >= 0x80000004)
297 get_model_name(c); /* Default name */
298 }
299
300 /* Transmeta-defined flags: level 0x80860001 */
301 xlvl = cpuid_eax(0x80860000);
302 if ((xlvl & 0xffff0000) == 0x80860000) {
303 /* Don't set x86_cpuid_level here for now to not confuse. */
304 if (xlvl >= 0x80860001)
305 c->x86_capability[2] = cpuid_edx(0x80860001);
306 }
307
308 c->extended_cpuid_level = cpuid_eax(0x80000000);
309 if (c->extended_cpuid_level >= 0x80000007)
310 c->x86_power = cpuid_edx(0x80000007);
311
312 if (c->extended_cpuid_level >= 0x80000008) {
313 u32 eax = cpuid_eax(0x80000008);
314
315 c->x86_virt_bits = (eax >> 8) & 0xff;
316 c->x86_phys_bits = eax & 0xff;
317 }
318
319 /* Assume all 64-bit CPUs support 32-bit syscall */
320 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
321
322 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
323 cpu_devs[c->x86_vendor]->c_early_init)
324 cpu_devs[c->x86_vendor]->c_early_init(c);
325
326 validate_pat_support(c);
327
328 /* early_param could clear that, but recall get it set again */
329 if (disable_apic)
330 clear_cpu_cap(c, X86_FEATURE_APIC);
331}
332
333/*
334 * This does the hard work of actually picking apart the CPU stuff...
335 */
336static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
337{
338 int i;
339
340 early_identify_cpu(c);
341
342 init_scattered_cpuid_features(c);
343
344 c->apicid = phys_pkg_id(0);
345
346 /*
347 * Vendor-specific initialization. In this section we
348 * canonicalize the feature flags, meaning if there are
349 * features a certain CPU supports which CPUID doesn't
350 * tell us, CPUID claiming incorrect flags, or other bugs,
351 * we handle them here.
352 *
353 * At the end of this section, c->x86_capability better
354 * indicate the features this CPU genuinely supports!
355 */
356 if (this_cpu->c_init)
357 this_cpu->c_init(c);
358
359 detect_ht(c);
360
361 /*
362 * On SMP, boot_cpu_data holds the common feature set between
363 * all CPUs; so make sure that we indicate which features are
364 * common between the CPUs. The first time this routine gets
365 * executed, c == &boot_cpu_data.
366 */
367 if (c != &boot_cpu_data) {
368 /* AND the already accumulated flags with these */
369 for (i = 0; i < NCAPINTS; i++)
370 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
371 }
372
373	/* Clear all flags overridden by options */
374 for (i = 0; i < NCAPINTS; i++)
375 c->x86_capability[i] &= ~cleared_cpu_caps[i];
376
377#ifdef CONFIG_X86_MCE
378 mcheck_init(c);
379#endif
380 select_idle_routine(c);
381
382#ifdef CONFIG_NUMA
383 numa_add_cpu(smp_processor_id());
384#endif
385
386}
387
388void __cpuinit identify_boot_cpu(void)
389{
390 identify_cpu(&boot_cpu_data);
391}
392
393void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
394{
395 BUG_ON(c == &boot_cpu_data);
396 identify_cpu(c);
397 mtrr_ap_init();
398}
399
400static __init int setup_noclflush(char *arg)
401{
402 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
403 return 1;
404}
405__setup("noclflush", setup_noclflush);
406
407void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
408{
409 if (c->x86_model_id[0])
410 printk(KERN_CONT "%s", c->x86_model_id);
411
412 if (c->x86_mask || c->cpuid_level >= 0)
413 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
414 else
415 printk(KERN_CONT "\n");
416}
417
418static __init int setup_disablecpuid(char *arg)
419{
420 int bit;
421 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
422 setup_clear_cpu_cap(bit);
423 else
424 return 0;
425 return 1;
426}
427__setup("clearcpuid=", setup_disablecpuid);
428
429cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
430
431struct x8664_pda **_cpu_pda __read_mostly;
432EXPORT_SYMBOL(_cpu_pda);
433
434struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
435
436char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
437
438unsigned long __supported_pte_mask __read_mostly = ~0UL;
439EXPORT_SYMBOL_GPL(__supported_pte_mask);
440
441static int do_not_nx __cpuinitdata;
442
443/* noexec=on|off
444Control non executable mappings for 64bit processes.
445
446on Enable(default)
447off Disable
448*/
449static int __init nonx_setup(char *str)
450{
451 if (!str)
452 return -EINVAL;
453 if (!strncmp(str, "on", 2)) {
454 __supported_pte_mask |= _PAGE_NX;
455 do_not_nx = 0;
456 } else if (!strncmp(str, "off", 3)) {
457 do_not_nx = 1;
458 __supported_pte_mask &= ~_PAGE_NX;
459 }
460 return 0;
461}
462early_param("noexec", nonx_setup);
463
464int force_personality32;
465
466/* noexec32=on|off
467Control non executable heap for 32bit processes.
468To control the stack too use noexec=off
469
470on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
471off PROT_READ implies PROT_EXEC
472*/
473static int __init nonx32_setup(char *str)
474{
475 if (!strcmp(str, "on"))
476 force_personality32 &= ~READ_IMPLIES_EXEC;
477 else if (!strcmp(str, "off"))
478 force_personality32 |= READ_IMPLIES_EXEC;
479 return 1;
480}
481__setup("noexec32=", nonx32_setup);
482
483void pda_init(int cpu)
484{
485 struct x8664_pda *pda = cpu_pda(cpu);
486
487	/* Set up data that may be needed in __get_free_pages early */
488 loadsegment(fs, 0);
489 loadsegment(gs, 0);
490 /* Memory clobbers used to order PDA accessed */
491 mb();
492 wrmsrl(MSR_GS_BASE, pda);
493 mb();
494
495 pda->cpunumber = cpu;
496 pda->irqcount = -1;
497 pda->kernelstack = (unsigned long)stack_thread_info() -
498 PDA_STACKOFFSET + THREAD_SIZE;
499 pda->active_mm = &init_mm;
500 pda->mmu_state = 0;
501
502 if (cpu == 0) {
503 /* others are initialized in smpboot.c */
504 pda->pcurrent = &init_task;
505 pda->irqstackptr = boot_cpu_stack;
506 } else {
507 pda->irqstackptr = (char *)
508 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
509 if (!pda->irqstackptr)
510 panic("cannot allocate irqstack for cpu %d", cpu);
511
512 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
513 pda->nodenumber = cpu_to_node(cpu);
514 }
515
516 pda->irqstackptr += IRQSTACKSIZE-64;
517}
518
519char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
520 DEBUG_STKSZ]
521__attribute__((section(".bss.page_aligned")));
522
523extern asmlinkage void ignore_sysret(void);
524
525/* May not be marked __init: used by software suspend */
526void syscall_init(void)
527{
528 /*
529 * LSTAR and STAR live in a bit strange symbiosis.
530 * They both write to the same internal register. STAR allows to
531 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
532 */
533 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
534 wrmsrl(MSR_LSTAR, system_call);
535 wrmsrl(MSR_CSTAR, ignore_sysret);
536
537#ifdef CONFIG_IA32_EMULATION
538 syscall32_cpu_init();
539#endif
540
541 /* Flags to clear on syscall */
542 wrmsrl(MSR_SYSCALL_MASK,
543 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
544}
545
546void __cpuinit check_efer(void)
547{
548 unsigned long efer;
549
550 rdmsrl(MSR_EFER, efer);
551 if (!(efer & EFER_NX) || do_not_nx)
552 __supported_pte_mask &= ~_PAGE_NX;
553}
554
555unsigned long kernel_eflags;
556
557/*
558 * Copies of the original ist values from the tss are only accessed during
559 * debugging, no special alignment required.
560 */
561DEFINE_PER_CPU(struct orig_ist, orig_ist);
562
563/*
564 * cpu_init() initializes state that is per-CPU. Some data is already
565 * initialized (naturally) in the bootstrap process, such as the GDT
566 * and IDT. We reload them nevertheless, this function acts as a
567 * 'CPU state barrier', nothing should get across.
568 * A lot of state is already set up in PDA init.
569 */
570void __cpuinit cpu_init(void)
571{
572 int cpu = stack_smp_processor_id();
573 struct tss_struct *t = &per_cpu(init_tss, cpu);
574 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
575 unsigned long v;
576 char *estacks = NULL;
577 struct task_struct *me;
578 int i;
579
580 /* CPU 0 is initialised in head64.c */
581 if (cpu != 0)
582 pda_init(cpu);
583 else
584 estacks = boot_exception_stacks;
585
586 me = current;
587
588 if (cpu_test_and_set(cpu, cpu_initialized))
589 panic("CPU#%d already initialized!\n", cpu);
590
591 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
592
593 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
594
595 /*
596 * Initialize the per-CPU GDT with the boot GDT,
597 * and set up the GDT descriptor:
598 */
599
600 switch_to_new_gdt();
601 load_idt((const struct desc_ptr *)&idt_descr);
602
603 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
604 syscall_init();
605
606 wrmsrl(MSR_FS_BASE, 0);
607 wrmsrl(MSR_KERNEL_GS_BASE, 0);
608 barrier();
609
610 check_efer();
611
612 /*
613 * set up and load the per-CPU TSS
614 */
615 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
616 static const unsigned int order[N_EXCEPTION_STACKS] = {
617 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
618 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
619 };
620 if (cpu) {
621 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
622 if (!estacks)
623 panic("Cannot allocate exception stack %ld %d\n",
624 v, cpu);
625 }
626 estacks += PAGE_SIZE << order[v];
627 orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
628 }
629
630 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
631 /*
632 * <= is required because the CPU will access up to
633 * 8 bits beyond the end of the IO permission bitmap.
634 */
635 for (i = 0; i <= IO_BITMAP_LONGS; i++)
636 t->io_bitmap[i] = ~0UL;
637
638 atomic_inc(&init_mm.mm_count);
639 me->active_mm = &init_mm;
640 if (me->mm)
641 BUG();
642 enter_lazy_tlb(&init_mm, me);
643
644 load_sp0(t, &current->thread);
645 set_tss_desc(cpu, t);
646 load_TR_desc();
647 load_LDT(&init_mm.context);
648
649#ifdef CONFIG_KGDB
650 /*
651 * If the kgdb is connected no debug regs should be altered. This
652 * is only applicable when KGDB and a KGDB I/O module are built
653 * into the kernel and you are using early debugging with
654 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
655 */
656 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
657 arch_kgdb_ops.correct_hw_break();
658 else {
659#endif
660 /*
661 * Clear all 6 debug registers:
662 */
663
664 set_debugreg(0UL, 0);
665 set_debugreg(0UL, 1);
666 set_debugreg(0UL, 2);
667 set_debugreg(0UL, 3);
668 set_debugreg(0UL, 6);
669 set_debugreg(0UL, 7);
670#ifdef CONFIG_KGDB
671 /* If the kgdb is connected no debug regs should be altered. */
672 }
673#endif
674
675 fpu_init();
676
677 raw_local_save_flags(kernel_eflags);
678
679 if (is_uv_system())
680 uv_cpu_init();
681}
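
The "LSTAR and STAR live in a bit strange symbiosis" comment in syscall_init() (kept verbatim in the new common.c and in the file deleted above) deserves a gloss: the single value written to MSR_STAR packs two segment-selector bases, one consumed by SYSCALL and one by SYSRET. The stand-alone sketch below only unpacks that value; the numeric selectors are assumptions inferred from the GDT layout shown in this diff (__KERNEL_CS at GDT entry 2, __USER32_CS at entry 4 with RPL 3), not values quoted from the kernel headers.

#include <stdio.h>
#include <stdint.h>

#define __KERNEL_CS	0x10u	/* assumed: GDT entry 2 */
#define __USER32_CS	0x23u	/* assumed: GDT entry 4, RPL 3 */

int main(void)
{
	uint64_t star = ((uint64_t)__USER32_CS << 48) | ((uint64_t)__KERNEL_CS << 32);

	/* SYSCALL loads CS from STAR[47:32] and SS from that value + 8 */
	unsigned int syscall_cs  = (unsigned int)((star >> 32) & 0xffff);
	/* 64-bit SYSRET loads CS from STAR[63:48] + 16 and SS from STAR[63:48] + 8 */
	unsigned int sysret_base = (unsigned int)((star >> 48) & 0xffff);

	printf("STAR = %#018llx\n", (unsigned long long)star);
	printf("SYSCALL:  CS=%#x SS=%#x\n", syscall_cs, syscall_cs + 8);
	printf("SYSRET64: CS=%#x SS=%#x\n", sysret_base + 16, sysret_base + 8);
	return 0;
}

Under those assumed selectors the output lands on the kernel code/data pair for SYSCALL and the 64-bit user code/data pair for SYSRET, which is why the kernel only ever has to program this MSR once per CPU.
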
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 4d894e8565fe..de4094a39210 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -21,23 +21,16 @@ struct cpu_dev {
21 void (*c_init)(struct cpuinfo_x86 * c); 21 void (*c_init)(struct cpuinfo_x86 * c);
22 void (*c_identify)(struct cpuinfo_x86 * c); 22 void (*c_identify)(struct cpuinfo_x86 * c);
23 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); 23 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
24 int c_x86_vendor;
24}; 25};
25 26
26extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; 27#define cpu_dev_register(cpu_devX) \
28 static struct cpu_dev *__cpu_dev_##cpu_devX __used \
29 __attribute__((__section__(".x86_cpu_dev.init"))) = \
30 &cpu_devX;
27 31
28struct cpu_vendor_dev { 32extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
29 int vendor;
30 struct cpu_dev *cpu_dev;
31};
32
33#define cpu_vendor_dev_register(cpu_vendor_id, cpu_dev) \
34 static struct cpu_vendor_dev __cpu_vendor_dev_##cpu_vendor_id __used \
35 __attribute__((__section__(".x86cpuvendor.init"))) = \
36 { cpu_vendor_id, cpu_dev }
37
38extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[];
39 33
40extern int get_model_name(struct cpuinfo_x86 *c);
41extern void display_cacheinfo(struct cpuinfo_x86 *c); 34extern void display_cacheinfo(struct cpuinfo_x86 *c);
42 35
43#endif 36#endif
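
The cpu.h change above replaces the per-vendor cpu_vendor_dev_register() table with a generic linker-section scheme: each vendor file drops a struct cpu_dev pointer into .x86_cpu_dev.init and early_cpu_init() walks the __x86_cpu_dev_start[]..__x86_cpu_dev_end[] range. The user-space program below is a hedged demonstration of that pattern only; the demo_dev names are invented, and it relies on GNU ld's automatic __start_/__stop_ symbols for sections whose names are valid C identifiers, the analogue of the kernel's explicit start/end markers.

#include <stdio.h>

struct demo_dev {
	const char *vendor;
	const char *ident;
};

/* Analogue of cpu_dev_register(): place a pointer in a dedicated section */
#define demo_dev_register(dev) \
	static struct demo_dev *__demo_##dev \
	__attribute__((used, section("demo_devs"))) = &(dev)

static struct demo_dev intel_demo = { .vendor = "Intel", .ident = "GenuineIntel" };
static struct demo_dev amd_demo   = { .vendor = "AMD",   .ident = "AuthenticAMD" };

demo_dev_register(intel_demo);
demo_dev_register(amd_demo);

/* GNU ld emits these for orphan sections with C-identifier names */
extern struct demo_dev *__start_demo_devs[], *__stop_demo_devs[];

int main(void)
{
	struct demo_dev **p;

	printf("KERNEL supported cpus:\n");	/* mirrors early_cpu_init()'s banner */
	for (p = __start_demo_devs; p < __stop_demo_devs; p++)
		printf("  %s %s\n", (*p)->vendor, (*p)->ident);
	return 0;
}

The design benefit visible in the diff is that the boot code no longer needs to know the vendor enumeration order at compile time: whatever objects are linked in simply appear in the section, and early_cpu_init() copies them into cpu_devs[] up to X86_VENDOR_NUM.
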
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index cb7a5715596d..efae3b22a0ff 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -235,9 +235,9 @@ config X86_LONGHAUL
235 If in doubt, say N. 235 If in doubt, say N.
236 236
237config X86_E_POWERSAVER 237config X86_E_POWERSAVER
238 tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)" 238 tristate "VIA C7 Enhanced PowerSaver"
239 select CPU_FREQ_TABLE 239 select CPU_FREQ_TABLE
240 depends on X86_32 && EXPERIMENTAL 240 depends on X86_32
241 help 241 help
242 This adds the CPUFreq driver for VIA C7 processors. 242 This adds the CPUFreq driver for VIA C7 processors.
243 243
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index b0c8208df9fa..8e48c5d4467d 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -202,7 +202,7 @@ static void drv_write(struct drv_cmd *cmd)
202 cpumask_t saved_mask = current->cpus_allowed; 202 cpumask_t saved_mask = current->cpus_allowed;
203 unsigned int i; 203 unsigned int i;
204 204
205 for_each_cpu_mask(i, cmd->mask) { 205 for_each_cpu_mask_nr(i, cmd->mask) {
206 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); 206 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i));
207 do_drv_write(cmd); 207 do_drv_write(cmd);
208 } 208 }
@@ -256,7 +256,8 @@ static u32 get_cur_val(const cpumask_t *mask)
256 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and 256 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
257 * no meaning should be associated with absolute values of these MSRs. 257 * no meaning should be associated with absolute values of these MSRs.
258 */ 258 */
259static unsigned int get_measured_perf(unsigned int cpu) 259static unsigned int get_measured_perf(struct cpufreq_policy *policy,
260 unsigned int cpu)
260{ 261{
261 union { 262 union {
262 struct { 263 struct {
@@ -326,7 +327,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
326 327
327#endif 328#endif
328 329
329 retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; 330 retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100;
330 331
331 put_cpu(); 332 put_cpu();
332 set_cpus_allowed_ptr(current, &saved_mask); 333 set_cpus_allowed_ptr(current, &saved_mask);
@@ -451,7 +452,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
451 452
452 freqs.old = perf->states[perf->state].core_frequency * 1000; 453 freqs.old = perf->states[perf->state].core_frequency * 1000;
453 freqs.new = data->freq_table[next_state].frequency; 454 freqs.new = data->freq_table[next_state].frequency;
454 for_each_cpu_mask(i, cmd.mask) { 455 for_each_cpu_mask_nr(i, cmd.mask) {
455 freqs.cpu = i; 456 freqs.cpu = i;
456 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 457 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
457 } 458 }
@@ -466,7 +467,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
466 } 467 }
467 } 468 }
468 469
469 for_each_cpu_mask(i, cmd.mask) { 470 for_each_cpu_mask_nr(i, cmd.mask) {
470 freqs.cpu = i; 471 freqs.cpu = i;
471 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 472 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
472 } 473 }
@@ -779,13 +780,20 @@ static int __init acpi_cpufreq_init(void)
779{ 780{
780 int ret; 781 int ret;
781 782
783 if (acpi_disabled)
784 return 0;
785
782 dprintk("acpi_cpufreq_init\n"); 786 dprintk("acpi_cpufreq_init\n");
783 787
784 ret = acpi_cpufreq_early_init(); 788 ret = acpi_cpufreq_early_init();
785 if (ret) 789 if (ret)
786 return ret; 790 return ret;
787 791
788 return cpufreq_register_driver(&acpi_cpufreq_driver); 792 ret = cpufreq_register_driver(&acpi_cpufreq_driver);
793 if (ret)
794 free_percpu(acpi_perf_data);
795
796 return ret;
789} 797}
790 798
791static void __exit acpi_cpufreq_exit(void) 799static void __exit acpi_cpufreq_exit(void)
@@ -795,8 +803,6 @@ static void __exit acpi_cpufreq_exit(void)
795 cpufreq_unregister_driver(&acpi_cpufreq_driver); 803 cpufreq_unregister_driver(&acpi_cpufreq_driver);
796 804
797 free_percpu(acpi_perf_data); 805 free_percpu(acpi_perf_data);
798
799 return;
800} 806}
801 807
802module_param(acpi_pstate_strict, uint, 0644); 808module_param(acpi_pstate_strict, uint, 0644);
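
The acpi-cpufreq hunk above routes get_measured_perf() through the policy's CPU; the quantity it ultimately reports is the advertised maximum frequency scaled by the APERF/MPERF delta ratio described in the comment. The sketch below is a simplified model of that calculation only: the helper name and sample numbers are invented, and the real driver shifts the 64-bit deltas down before dividing to stay safe on 32-bit kernels.

#include <stdio.h>
#include <stdint.h>

/* Simplified model of the perf_percent computation in get_measured_perf() */
static unsigned int measured_khz(uint64_t aperf_delta, uint64_t mperf_delta,
				 unsigned int max_freq_khz)
{
	unsigned int perf_percent;

	if (!mperf_delta)
		return max_freq_khz;	/* nothing measured yet */

	perf_percent = (unsigned int)(aperf_delta * 100 / mperf_delta);
	return max_freq_khz * perf_percent / 100;
}

int main(void)
{
	/* CPU ran at roughly half its maximum P-state over the interval */
	printf("%u kHz\n", measured_khz(1000000, 2000000, 2400000));
	return 0;
}
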
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index 94619c22f563..fe613c93b366 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -25,8 +25,8 @@
25#include <linux/cpufreq.h> 25#include <linux/cpufreq.h>
26 26
27#include <asm/msr.h> 27#include <asm/msr.h>
28#include <asm/timex.h> 28#include <linux/timex.h>
29#include <asm/io.h> 29#include <linux/io.h>
30 30
31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ 31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ 32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
@@ -44,7 +44,7 @@ struct s_elan_multiplier {
44 * It is important that the frequencies 44 * It is important that the frequencies
45 * are listed in ascending order here! 45 * are listed in ascending order here!
46 */ 46 */
47struct s_elan_multiplier elan_multiplier[] = { 47static struct s_elan_multiplier elan_multiplier[] = {
48 {1000, 0x02, 0x18}, 48 {1000, 0x02, 0x18},
49 {2000, 0x02, 0x10}, 49 {2000, 0x02, 0x10},
50 {4000, 0x02, 0x08}, 50 {4000, 0x02, 0x08},
@@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
82 u8 clockspeed_reg; /* Clock Speed Register */ 82 u8 clockspeed_reg; /* Clock Speed Register */
83 83
84 local_irq_disable(); 84 local_irq_disable();
85 outb_p(0x80,REG_CSCIR); 85 outb_p(0x80, REG_CSCIR);
86 clockspeed_reg = inb_p(REG_CSCDR); 86 clockspeed_reg = inb_p(REG_CSCDR);
87 local_irq_enable(); 87 local_irq_enable();
88 88
@@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
98 } 98 }
99 99
100 /* 33 MHz is not 32 MHz... */ 100 /* 33 MHz is not 32 MHz... */
101 if ((clockspeed_reg & 0xE0)==0xA0) 101 if ((clockspeed_reg & 0xE0) == 0xA0)
102 return 33000; 102 return 33000;
103 103
104 return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000); 104 return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
105} 105}
106 106
107 107
@@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
117 * There is no return value. 117 * There is no return value.
118 */ 118 */
119 119
120static void elanfreq_set_cpu_state (unsigned int state) 120static void elanfreq_set_cpu_state(unsigned int state)
121{ 121{
122 struct cpufreq_freqs freqs; 122 struct cpufreq_freqs freqs;
123 123
@@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state (unsigned int state)
144 */ 144 */
145 145
146 local_irq_disable(); 146 local_irq_disable();
147 outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */ 147 outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
148 outb_p(0x00,REG_CSCDR); 148 outb_p(0x00, REG_CSCDR);
149 local_irq_enable(); /* wait till internal pipelines and */ 149 local_irq_enable(); /* wait till internal pipelines and */
150 udelay(1000); /* buffers have cleaned up */ 150 udelay(1000); /* buffers have cleaned up */
151 151
152 local_irq_disable(); 152 local_irq_disable();
153 153
154 /* now, set the CPU clock speed register (0x80) */ 154 /* now, set the CPU clock speed register (0x80) */
155 outb_p(0x80,REG_CSCIR); 155 outb_p(0x80, REG_CSCIR);
156 outb_p(elan_multiplier[state].val80h,REG_CSCDR); 156 outb_p(elan_multiplier[state].val80h, REG_CSCDR);
157 157
158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ 158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
159 outb_p(0x40,REG_CSCIR); 159 outb_p(0x40, REG_CSCIR);
160 outb_p(elan_multiplier[state].val40h,REG_CSCDR); 160 outb_p(elan_multiplier[state].val40h, REG_CSCDR);
161 udelay(10000); 161 udelay(10000);
162 local_irq_enable(); 162 local_irq_enable();
163 163
@@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state (unsigned int state)
173 * for the hardware supported by the driver. 173 * for the hardware supported by the driver.
174 */ 174 */
175 175
176static int elanfreq_verify (struct cpufreq_policy *policy) 176static int elanfreq_verify(struct cpufreq_policy *policy)
177{ 177{
178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); 178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
179} 179}
180 180
181static int elanfreq_target (struct cpufreq_policy *policy, 181static int elanfreq_target(struct cpufreq_policy *policy,
182 unsigned int target_freq, 182 unsigned int target_freq,
183 unsigned int relation) 183 unsigned int relation)
184{ 184{
@@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
205 205
206 /* capability check */ 206 /* capability check */
207 if ((c->x86_vendor != X86_VENDOR_AMD) || 207 if ((c->x86_vendor != X86_VENDOR_AMD) ||
208 (c->x86 != 4) || (c->x86_model!=10)) 208 (c->x86 != 4) || (c->x86_model != 10))
209 return -ENODEV; 209 return -ENODEV;
210 210
211 /* max freq */ 211 /* max freq */
@@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
213 max_freq = elanfreq_get_cpu_frequency(0); 213 max_freq = elanfreq_get_cpu_frequency(0);
214 214
215 /* table init */ 215 /* table init */
216 for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { 216 for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
217 if (elanfreq_table[i].frequency > max_freq) 217 if (elanfreq_table[i].frequency > max_freq)
218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; 218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
219 } 219 }
@@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
224 224
225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); 225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
226 if (result) 226 if (result)
227 return (result); 227 return result;
228 228
229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); 229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
230 return 0; 230 return 0;
@@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup);
260#endif 260#endif
261 261
262 262
263static struct freq_attr* elanfreq_attr[] = { 263static struct freq_attr *elanfreq_attr[] = {
264 &cpufreq_freq_attr_scaling_available_freqs, 264 &cpufreq_freq_attr_scaling_available_freqs,
265 NULL, 265 NULL,
266}; 266};
@@ -284,9 +284,9 @@ static int __init elanfreq_init(void)
284 284
285 /* Test if we have the right hardware */ 285 /* Test if we have the right hardware */
286 if ((c->x86_vendor != X86_VENDOR_AMD) || 286 if ((c->x86_vendor != X86_VENDOR_AMD) ||
287 (c->x86 != 4) || (c->x86_model!=10)) { 287 (c->x86 != 4) || (c->x86_model != 10)) {
288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); 288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
289 return -ENODEV; 289 return -ENODEV;
290 } 290 }
291 return cpufreq_register_driver(&elanfreq_driver); 291 return cpufreq_register_driver(&elanfreq_driver);
292} 292}
@@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void)
298} 298}
299 299
300 300
301module_param (max_freq, int, 0444); 301module_param(max_freq, int, 0444);
302 302
303MODULE_LICENSE("GPL"); 303MODULE_LICENSE("GPL");
304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>"); 304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>");
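
For the elanfreq cleanup above, the interesting line is the return in elanfreq_get_cpu_frequency(): bits 7:5 of the chip-setup register select a power-of-two multiple of 1000 kHz, with the 0xA0 pattern special-cased to 33 MHz because "33 MHz is not 32 MHz". The worked example below reproduces only that decode; it does no port I/O and skips the earlier sanity checks performed by the real function.

#include <stdio.h>

/* Decode of the CSC clock-speed register, as in elanfreq_get_cpu_frequency() */
static unsigned int decode_khz(unsigned char clockspeed_reg)
{
	if ((clockspeed_reg & 0xE0) == 0xA0)	/* 33 MHz is not 32 MHz... */
		return 33000;
	return (1 << ((clockspeed_reg & 0xE0) >> 5)) * 1000;
}

int main(void)
{
	unsigned char regs[] = { 0x00, 0x40, 0x80, 0xA0 };
	unsigned int i;

	for (i = 0; i < sizeof(regs); i++)
		printf("reg=0x%02X -> %u kHz\n", regs[i], decode_khz(regs[i]));
	return 0;
}

The four sample register values map to 1000, 4000, 16000 and 33000 kHz, matching the ascending elanfreq_table entries shown earlier in the file.
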
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 06fcce516d51..b0461856acfb 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * (C) 2001-2004 Dave Jones. <davej@codemonkey.org.uk> 2 * (C) 2001-2004 Dave Jones. <davej@redhat.com>
3 * (C) 2002 Padraig Brady. <padraig@antefacto.com> 3 * (C) 2002 Padraig Brady. <padraig@antefacto.com>
4 * 4 *
5 * Licensed under the terms of the GNU GPL License version 2. 5 * Licensed under the terms of the GNU GPL License version 2.
@@ -1019,7 +1019,7 @@ MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
1019module_param(revid_errata, int, 0644); 1019module_param(revid_errata, int, 0644);
1020MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); 1020MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
1021 1021
1022MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); 1022MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
1023MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); 1023MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
1024MODULE_LICENSE ("GPL"); 1024MODULE_LICENSE ("GPL");
1025 1025
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 199e4e05e5dc..b8e05ee4f736 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
122 return 0; 122 return 0;
123 123
124 /* notifiers */ 124 /* notifiers */
125 for_each_cpu_mask(i, policy->cpus) { 125 for_each_cpu_mask_nr(i, policy->cpus) {
126 freqs.cpu = i; 126 freqs.cpu = i;
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128 } 128 }
@@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software 130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software
131 * Developer's Manual, Volume 3 131 * Developer's Manual, Volume 3
132 */ 132 */
133 for_each_cpu_mask(i, policy->cpus) 133 for_each_cpu_mask_nr(i, policy->cpus)
134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); 134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
135 135
136 /* notifiers */ 136 /* notifiers */
137 for_each_cpu_mask(i, policy->cpus) { 137 for_each_cpu_mask_nr(i, policy->cpus) {
138 freqs.cpu = i; 138 freqs.cpu = i;
139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
140 } 140 }
@@ -171,7 +171,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
171 } 171 }
172 172
173 if (c->x86 != 0xF) { 173 if (c->x86 != 0xF) {
174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n"); 174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n");
175 return 0; 175 return 0;
176 } 176 }
177 177
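The p4-clockmod hunks swap the iterator but keep the usual three-pass transition sequence: announce the change to every CPU in the policy's mask, program each CPU, then confirm. A compilable sketch of that sequence over a plain bitmask, with made-up notifier and duty-cycle stubs in place of cpufreq_notify_transition()/cpufreq_p4_setdc():

#include <stdio.h>

#define NR_CPUS    8
#define PRECHANGE  0
#define POSTCHANGE 1

/* Hypothetical stand-ins for the cpufreq notifier and the per-CPU setter. */
static void notify_transition(unsigned int cpu, unsigned int old_khz,
			      unsigned int new_khz, int phase)
{
	printf("cpu%u: %s %u -> %u kHz\n", cpu,
	       phase == PRECHANGE ? "pre " : "post", old_khz, new_khz);
}

static void program_duty_cycle(unsigned int cpu, unsigned int state)
{
	printf("cpu%u: set clock modulation state %u\n", cpu, state);
}

int main(void)
{
	unsigned long policy_cpus = 0x0f;	/* CPUs 0-3 share this policy */
	unsigned int old_khz = 2800000, new_khz = 1400000, state = 4;
	unsigned int i;

	for (i = 0; i < NR_CPUS; i++)		/* notifiers, PRECHANGE */
		if (policy_cpus & (1UL << i))
			notify_transition(i, old_khz, new_khz, PRECHANGE);

	for (i = 0; i < NR_CPUS; i++)		/* run on each logical CPU */
		if (policy_cpus & (1UL << i))
			program_duty_cycle(i, state);

	for (i = 0; i < NR_CPUS; i++)		/* notifiers, POSTCHANGE */
		if (policy_cpus & (1UL << i))
			notify_transition(i, old_khz, new_khz, POSTCHANGE);

	return 0;
}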
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index eb9b62b0830c..c1ac5790c63e 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -15,12 +15,11 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#include <asm/msr.h> 17#include <asm/msr.h>
18#include <asm/timex.h> 18#include <linux/timex.h>
19#include <asm/io.h> 19#include <linux/io.h>
20 20
21 21#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
22#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long 22 as it is unused */
23 as it is unused */
24 23
25static unsigned int busfreq; /* FSB, in 10 kHz */ 24static unsigned int busfreq; /* FSB, in 10 kHz */
26static unsigned int max_multiplier; 25static unsigned int max_multiplier;
@@ -53,7 +52,7 @@ static int powernow_k6_get_cpu_multiplier(void)
53 52
54 msrval = POWERNOW_IOPORT + 0x1; 53 msrval = POWERNOW_IOPORT + 0x1;
55 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 54 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
56 invalue=inl(POWERNOW_IOPORT + 0x8); 55 invalue = inl(POWERNOW_IOPORT + 0x8);
57 msrval = POWERNOW_IOPORT + 0x0; 56 msrval = POWERNOW_IOPORT + 0x0;
58 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ 57 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
59 58
@@ -67,9 +66,9 @@ static int powernow_k6_get_cpu_multiplier(void)
67 * 66 *
68 * Tries to change the PowerNow! multiplier 67 * Tries to change the PowerNow! multiplier
69 */ 68 */
70static void powernow_k6_set_state (unsigned int best_i) 69static void powernow_k6_set_state(unsigned int best_i)
71{ 70{
72 unsigned long outvalue=0, invalue=0; 71 unsigned long outvalue = 0, invalue = 0;
73 unsigned long msrval; 72 unsigned long msrval;
74 struct cpufreq_freqs freqs; 73 struct cpufreq_freqs freqs;
75 74
@@ -90,10 +89,10 @@ static void powernow_k6_set_state (unsigned int best_i)
90 89
91 msrval = POWERNOW_IOPORT + 0x1; 90 msrval = POWERNOW_IOPORT + 0x1;
92 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 91 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
93 invalue=inl(POWERNOW_IOPORT + 0x8); 92 invalue = inl(POWERNOW_IOPORT + 0x8);
94 invalue = invalue & 0xf; 93 invalue = invalue & 0xf;
95 outvalue = outvalue | invalue; 94 outvalue = outvalue | invalue;
96 outl(outvalue ,(POWERNOW_IOPORT + 0x8)); 95 outl(outvalue , (POWERNOW_IOPORT + 0x8));
97 msrval = POWERNOW_IOPORT + 0x0; 96 msrval = POWERNOW_IOPORT + 0x0;
98 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ 97 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
99 98
@@ -124,7 +123,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy)
124 * 123 *
125 * sets a new CPUFreq policy 124 * sets a new CPUFreq policy
126 */ 125 */
127static int powernow_k6_target (struct cpufreq_policy *policy, 126static int powernow_k6_target(struct cpufreq_policy *policy,
128 unsigned int target_freq, 127 unsigned int target_freq,
129 unsigned int relation) 128 unsigned int relation)
130{ 129{
@@ -152,7 +151,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
152 busfreq = cpu_khz / max_multiplier; 151 busfreq = cpu_khz / max_multiplier;
153 152
154 /* table init */ 153 /* table init */
155 for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { 154 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
156 if (clock_ratio[i].index > max_multiplier) 155 if (clock_ratio[i].index > max_multiplier)
157 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; 156 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
158 else 157 else
@@ -165,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
165 164
166 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); 165 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
167 if (result) 166 if (result)
168 return (result); 167 return result;
169 168
170 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); 169 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
171 170
@@ -176,8 +175,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
176static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) 175static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
177{ 176{
178 unsigned int i; 177 unsigned int i;
179 for (i=0; i<8; i++) { 178 for (i = 0; i < 8; i++) {
180 if (i==max_multiplier) 179 if (i == max_multiplier)
181 powernow_k6_set_state(i); 180 powernow_k6_set_state(i);
182 } 181 }
183 cpufreq_frequency_table_put_attr(policy->cpu); 182 cpufreq_frequency_table_put_attr(policy->cpu);
@@ -189,7 +188,7 @@ static unsigned int powernow_k6_get(unsigned int cpu)
189 return busfreq * powernow_k6_get_cpu_multiplier(); 188 return busfreq * powernow_k6_get_cpu_multiplier();
190} 189}
191 190
192static struct freq_attr* powernow_k6_attr[] = { 191static struct freq_attr *powernow_k6_attr[] = {
193 &cpufreq_freq_attr_scaling_available_freqs, 192 &cpufreq_freq_attr_scaling_available_freqs,
194 NULL, 193 NULL,
195}; 194};
@@ -227,7 +226,7 @@ static int __init powernow_k6_init(void)
227 } 226 }
228 227
229 if (cpufreq_register_driver(&powernow_k6_driver)) { 228 if (cpufreq_register_driver(&powernow_k6_driver)) {
230 release_region (POWERNOW_IOPORT, 16); 229 release_region(POWERNOW_IOPORT, 16);
231 return -EINVAL; 230 return -EINVAL;
232 } 231 }
233 232
@@ -243,13 +242,13 @@ static int __init powernow_k6_init(void)
243static void __exit powernow_k6_exit(void) 242static void __exit powernow_k6_exit(void)
244{ 243{
245 cpufreq_unregister_driver(&powernow_k6_driver); 244 cpufreq_unregister_driver(&powernow_k6_driver);
246 release_region (POWERNOW_IOPORT, 16); 245 release_region(POWERNOW_IOPORT, 16);
247} 246}
248 247
249 248
250MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); 249MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
251MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); 250MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
252MODULE_LICENSE ("GPL"); 251MODULE_LICENSE("GPL");
253 252
254module_init(powernow_k6_init); 253module_init(powernow_k6_init);
255module_exit(powernow_k6_exit); 254module_exit(powernow_k6_exit);
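powernow-k6 keeps the usual pairing of resource claim and registration: the I/O region is released again if cpufreq_register_driver() fails, and once more in module exit. A short sketch of that error-unwind shape, with hypothetical claim/register/release helpers in place of request_region()/release_region():

#include <stdio.h>

/* Hypothetical helpers; each returns 0 on success, nonzero on failure. */
static int claim_ioports(unsigned int base, unsigned int len)
{
	printf("claimed 0x%x-0x%x\n", base, base + len - 1);
	return 0;
}

static int register_driver(void)
{
	return -1;	/* pretend registration fails */
}

static void release_ioports(unsigned int base, unsigned int len)
{
	printf("released 0x%x-0x%x\n", base, base + len - 1);
}

static int driver_init(void)
{
	const unsigned int base = 0xfff0, len = 16;

	if (claim_ioports(base, len))
		return -1;

	if (register_driver()) {
		/* Undo the claim on failure, as powernow_k6_init() does. */
		release_ioports(base, len);
		return -1;
	}
	return 0;
}

int main(void)
{
	return driver_init() ? 1 : 0;
}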
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 0a61159d7b71..7c7d56b43136 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * AMD K7 Powernow driver. 2 * AMD K7 Powernow driver.
3 * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs. 3 * (C) 2003 Dave Jones on behalf of SuSE Labs.
4 * (C) 2003-2004 Dave Jones <davej@redhat.com> 4 * (C) 2003-2004 Dave Jones <davej@redhat.com>
5 * 5 *
6 * Licensed under the terms of the GNU GPL License version 2. 6 * Licensed under the terms of the GNU GPL License version 2.
@@ -692,7 +692,7 @@ static void __exit powernow_exit (void)
692module_param(acpi_force, int, 0444); 692module_param(acpi_force, int, 0444);
693MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); 693MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
694 694
695MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); 695MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
696MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors."); 696MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors.");
697MODULE_LICENSE ("GPL"); 697MODULE_LICENSE ("GPL");
698 698
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
index f8a63b3664e3..35fb4eaf6e1c 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
@@ -1,5 +1,4 @@
1/* 1/*
2 * $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $
3 * (C) 2003 Dave Jones. 2 * (C) 2003 Dave Jones.
4 * 3 *
5 * Licensed under the terms of the GNU GPL License version 2. 4 * Licensed under the terms of the GNU GPL License version 2.
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 206791eb46e3..d3dcd58b87cd 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -7,7 +7,7 @@
7 * Support : mark.langsdorf@amd.com 7 * Support : mark.langsdorf@amd.com
8 * 8 *
9 * Based on the powernow-k7.c module written by Dave Jones. 9 * Based on the powernow-k7.c module written by Dave Jones.
10 * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs 10 * (C) 2003 Dave Jones on behalf of SuSE Labs
11 * (C) 2004 Dominik Brodowski <linux@brodo.de> 11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
12 * (C) 2004 Pavel Machek <pavel@suse.cz> 12 * (C) 2004 Pavel Machek <pavel@suse.cz>
13 * Licensed under the terms of the GNU GPL License version 2. 13 * Licensed under the terms of the GNU GPL License version 2.
@@ -45,7 +45,6 @@
45#endif 45#endif
46 46
47#define PFX "powernow-k8: " 47#define PFX "powernow-k8: "
48#define BFX PFX "BIOS error: "
49#define VERSION "version 2.20.00" 48#define VERSION "version 2.20.00"
50#include "powernow-k8.h" 49#include "powernow-k8.h"
51 50
@@ -66,7 +65,6 @@ static u32 find_freq_from_fid(u32 fid)
66 return 800 + (fid * 100); 65 return 800 + (fid * 100);
67} 66}
68 67
69
70/* Return a frequency in KHz, given an input fid */ 68/* Return a frequency in KHz, given an input fid */
71static u32 find_khz_freq_from_fid(u32 fid) 69static u32 find_khz_freq_from_fid(u32 fid)
72{ 70{
@@ -78,7 +76,6 @@ static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 p
78 return data[pstate].frequency; 76 return data[pstate].frequency;
79} 77}
80 78
81
82/* Return the vco fid for an input fid 79/* Return the vco fid for an input fid
83 * 80 *
84 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids 81 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids
@@ -166,7 +163,6 @@ static void fidvid_msr_init(void)
166 wrmsr(MSR_FIDVID_CTL, lo, hi); 163 wrmsr(MSR_FIDVID_CTL, lo, hi);
167} 164}
168 165
169
170/* write the new fid value along with the other control fields to the msr */ 166/* write the new fid value along with the other control fields to the msr */
171static int write_new_fid(struct powernow_k8_data *data, u32 fid) 167static int write_new_fid(struct powernow_k8_data *data, u32 fid)
172{ 168{
@@ -539,35 +535,40 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8
539 535
540 for (j = 0; j < data->numps; j++) { 536 for (j = 0; j < data->numps; j++) {
541 if (pst[j].vid > LEAST_VID) { 537 if (pst[j].vid > LEAST_VID) {
542 printk(KERN_ERR PFX "vid %d invalid : 0x%x\n", j, pst[j].vid); 538 printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
539 j, pst[j].vid);
543 return -EINVAL; 540 return -EINVAL;
544 } 541 }
545 if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */ 542 if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */
546 printk(KERN_ERR BFX "0 vid exceeded with pstate %d\n", j); 543 printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
544 " %d\n", j);
547 return -ENODEV; 545 return -ENODEV;
548 } 546 }
549 if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */ 547 if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */
550 printk(KERN_ERR BFX "maxvid exceeded with pstate %d\n", j); 548 printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
549 " %d\n", j);
551 return -ENODEV; 550 return -ENODEV;
552 } 551 }
553 if (pst[j].fid > MAX_FID) { 552 if (pst[j].fid > MAX_FID) {
554 printk(KERN_ERR BFX "maxfid exceeded with pstate %d\n", j); 553 printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
554 " %d\n", j);
555 return -ENODEV; 555 return -ENODEV;
556 } 556 }
557 if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) { 557 if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
558 /* Only first fid is allowed to be in "low" range */ 558 /* Only first fid is allowed to be in "low" range */
559 printk(KERN_ERR BFX "two low fids - %d : 0x%x\n", j, pst[j].fid); 559 printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
560 "0x%x\n", j, pst[j].fid);
560 return -EINVAL; 561 return -EINVAL;
561 } 562 }
562 if (pst[j].fid < lastfid) 563 if (pst[j].fid < lastfid)
563 lastfid = pst[j].fid; 564 lastfid = pst[j].fid;
564 } 565 }
565 if (lastfid & 1) { 566 if (lastfid & 1) {
566 printk(KERN_ERR BFX "lastfid invalid\n"); 567 printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
567 return -EINVAL; 568 return -EINVAL;
568 } 569 }
569 if (lastfid > LO_FID_TABLE_TOP) 570 if (lastfid > LO_FID_TABLE_TOP)
570 printk(KERN_INFO BFX "first fid not from lo freq table\n"); 571 printk(KERN_INFO FW_BUG PFX "first fid not from lo freq table\n");
571 572
572 return 0; 573 return 0;
573} 574}
@@ -675,13 +676,13 @@ static int find_psb_table(struct powernow_k8_data *data)
675 676
676 dprintk("table vers: 0x%x\n", psb->tableversion); 677 dprintk("table vers: 0x%x\n", psb->tableversion);
677 if (psb->tableversion != PSB_VERSION_1_4) { 678 if (psb->tableversion != PSB_VERSION_1_4) {
678 printk(KERN_ERR BFX "PSB table is not v1.4\n"); 679 printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
679 return -ENODEV; 680 return -ENODEV;
680 } 681 }
681 682
682 dprintk("flags: 0x%x\n", psb->flags1); 683 dprintk("flags: 0x%x\n", psb->flags1);
683 if (psb->flags1) { 684 if (psb->flags1) {
684 printk(KERN_ERR BFX "unknown flags\n"); 685 printk(KERN_ERR FW_BUG PFX "unknown flags\n");
685 return -ENODEV; 686 return -ENODEV;
686 } 687 }
687 688
@@ -708,7 +709,7 @@ static int find_psb_table(struct powernow_k8_data *data)
708 } 709 }
709 } 710 }
710 if (cpst != 1) { 711 if (cpst != 1) {
711 printk(KERN_ERR BFX "numpst must be 1\n"); 712 printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
712 return -ENODEV; 713 return -ENODEV;
713 } 714 }
714 715
@@ -966,7 +967,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
966 freqs.old = find_khz_freq_from_fid(data->currfid); 967 freqs.old = find_khz_freq_from_fid(data->currfid);
967 freqs.new = find_khz_freq_from_fid(fid); 968 freqs.new = find_khz_freq_from_fid(fid);
968 969
969 for_each_cpu_mask(i, *(data->available_cores)) { 970 for_each_cpu_mask_nr(i, *(data->available_cores)) {
970 freqs.cpu = i; 971 freqs.cpu = i;
971 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 972 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
972 } 973 }
@@ -974,7 +975,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
974 res = transition_fid_vid(data, fid, vid); 975 res = transition_fid_vid(data, fid, vid);
975 freqs.new = find_khz_freq_from_fid(data->currfid); 976 freqs.new = find_khz_freq_from_fid(data->currfid);
976 977
977 for_each_cpu_mask(i, *(data->available_cores)) { 978 for_each_cpu_mask_nr(i, *(data->available_cores)) {
978 freqs.cpu = i; 979 freqs.cpu = i;
979 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 980 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
980 } 981 }
@@ -997,7 +998,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
997 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); 998 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate);
998 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 999 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
999 1000
1000 for_each_cpu_mask(i, *(data->available_cores)) { 1001 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1001 freqs.cpu = i; 1002 freqs.cpu = i;
1002 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 1003 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1003 } 1004 }
@@ -1005,7 +1006,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1005 res = transition_pstate(data, pstate); 1006 res = transition_pstate(data, pstate);
1006 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1007 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1007 1008
1008 for_each_cpu_mask(i, *(data->available_cores)) { 1009 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1009 freqs.cpu = i; 1010 freqs.cpu = i;
1010 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 1011 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1011 } 1012 }
@@ -1133,17 +1134,19 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1133 "ACPI Processor module before starting this " 1134 "ACPI Processor module before starting this "
1134 "driver.\n"); 1135 "driver.\n");
1135#else 1136#else
1136 printk(KERN_ERR PFX "Your BIOS does not provide ACPI " 1137 printk(KERN_ERR FW_BUG PFX "Your BIOS does not provide"
1137 "_PSS objects in a way that Linux understands. " 1138 " ACPI _PSS objects in a way that Linux "
1138 "Please report this to the Linux ACPI maintainers" 1139 "understands. Please report this to the Linux "
1139 " and complain to your BIOS vendor.\n"); 1140 "ACPI maintainers and complain to your BIOS "
1141 "vendor.\n");
1140#endif 1142#endif
1141 kfree(data); 1143 kfree(data);
1142 return -ENODEV; 1144 return -ENODEV;
1143 } 1145 }
1144 if (pol->cpu != 0) { 1146 if (pol->cpu != 0) {
1145 printk(KERN_ERR PFX "No ACPI _PSS objects for CPU other than " 1147 printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
1146 "CPU0. Complain to your BIOS vendor.\n"); 1148 "CPU other than CPU0. Complain to your BIOS "
1149 "vendor.\n");
1147 kfree(data); 1150 kfree(data);
1148 return -ENODEV; 1151 return -ENODEV;
1149 } 1152 }
@@ -1196,7 +1199,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1196 1199
1197 /* min/max the cpu is capable of */ 1200 /* min/max the cpu is capable of */
1198 if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { 1201 if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
1199 printk(KERN_ERR PFX "invalid powernow_table\n"); 1202 printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
1200 powernow_k8_cpu_exit_acpi(data); 1203 powernow_k8_cpu_exit_acpi(data);
1201 kfree(data->powernow_table); 1204 kfree(data->powernow_table);
1202 kfree(data); 1205 kfree(data);
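The powernow-k8 hunks replace the driver's private BFX prefix with the generic FW_BUG tag on messages that point at BIOS-provided PST data. A minimal sketch of that style of table validation, with a hypothetical pstate entry layout and illustrative limits; the real limits and fields live in powernow-k8.h:

#include <stdio.h>

#define FW_BUG "[Firmware Bug]: "	/* same idea as the kernel's FW_BUG tag */
#define PFX    "powernow-sketch: "

/* Hypothetical, simplified pstate entry and limits for illustration only. */
struct pst_entry {
	unsigned int fid;	/* frequency id */
	unsigned int vid;	/* voltage id */
};

#define LEAST_VID 0x3e
#define MAX_FID   0x2a

static int check_pst(const struct pst_entry *pst, unsigned int n)
{
	unsigned int j;

	for (j = 0; j < n; j++) {
		if (pst[j].vid > LEAST_VID) {
			printf(FW_BUG PFX "vid %u invalid: 0x%x\n",
			       j, pst[j].vid);
			return -1;
		}
		if (pst[j].fid > MAX_FID) {
			printf(FW_BUG PFX "maxfid exceeded with pstate %u\n", j);
			return -1;
		}
	}
	return 0;
}

int main(void)
{
	struct pst_entry pst[] = { { 0x10, 0x12 }, { 0x2c, 0x08 } };

	return check_pst(pst, 2) ? 1 : 0;
}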
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 908dd347c67e..3b5f06423e77 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -26,9 +26,10 @@
26#include <asm/cpufeature.h> 26#include <asm/cpufeature.h>
27 27
28#define PFX "speedstep-centrino: " 28#define PFX "speedstep-centrino: "
29#define MAINTAINER "cpufreq@lists.linux.org.uk" 29#define MAINTAINER "cpufreq@vger.kernel.org"
30 30
31#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) 31#define dprintk(msg...) \
32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
32 33
33#define INTEL_MSR_RANGE (0xffff) 34#define INTEL_MSR_RANGE (0xffff)
34 35
@@ -66,11 +67,12 @@ struct cpu_model
66 67
67 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ 68 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
68}; 69};
69static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x); 70static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
71 const struct cpu_id *x);
70 72
71/* Operating points for current CPU */ 73/* Operating points for current CPU */
72static struct cpu_model *centrino_model[NR_CPUS]; 74static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
73static const struct cpu_id *centrino_cpu[NR_CPUS]; 75static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
74 76
75static struct cpufreq_driver centrino_driver; 77static struct cpufreq_driver centrino_driver;
76 78
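The hunk above converts the NR_CPUS-sized pointer arrays into per-CPU variables. A compilable userspace sketch of the access pattern, with a flat array and wrapper macros standing in for DEFINE_PER_CPU()/per_cpu(); the real per-CPU variables live in each CPU's per-cpu data area rather than in one array:

#include <stdio.h>

#define NR_CPUS 4

/* Userspace stand-ins for DEFINE_PER_CPU()/per_cpu(). */
#define DEFINE_PER_CPU_SKETCH(type, name) static type name[NR_CPUS]
#define per_cpu_sketch(name, cpu) (name[(cpu)])

struct cpu_model {
	const char *model_name;
	unsigned int max_freq;	/* kHz */
};

DEFINE_PER_CPU_SKETCH(struct cpu_model *, centrino_model);

int main(void)
{
	static struct cpu_model banias = { "Banias sketch", 1600000 };
	unsigned int cpu = 2;

	per_cpu_sketch(centrino_model, cpu) = &banias;

	if (per_cpu_sketch(centrino_model, cpu))
		printf("cpu%u: %s, max %u kHz\n", cpu,
		       per_cpu_sketch(centrino_model, cpu)->model_name,
		       per_cpu_sketch(centrino_model, cpu)->max_freq);
	return 0;
}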
@@ -255,7 +257,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
255 return -ENOENT; 257 return -ENOENT;
256 } 258 }
257 259
258 centrino_model[policy->cpu] = model; 260 per_cpu(centrino_model, policy->cpu) = model;
259 261
260 dprintk("found \"%s\": max frequency: %dkHz\n", 262 dprintk("found \"%s\": max frequency: %dkHz\n",
261 model->model_name, model->max_freq); 263 model->model_name, model->max_freq);
@@ -264,10 +266,14 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
264} 266}
265 267
266#else 268#else
267static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; } 269static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
270{
271 return -ENODEV;
272}
268#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ 273#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
269 274
270static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x) 275static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
276 const struct cpu_id *x)
271{ 277{
272 if ((c->x86 == x->x86) && 278 if ((c->x86 == x->x86) &&
273 (c->x86_model == x->x86_model) && 279 (c->x86_model == x->x86_model) &&
@@ -286,23 +292,28 @@ static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
286 * for centrino, as some DSDTs are buggy. 292 * for centrino, as some DSDTs are buggy.
287 * Ideally, this can be done using the acpi_data structure. 293 * Ideally, this can be done using the acpi_data structure.
288 */ 294 */
289 if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) || 295 if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
290 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) || 296 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
291 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) { 297 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
292 msr = (msr >> 8) & 0xff; 298 msr = (msr >> 8) & 0xff;
293 return msr * 100000; 299 return msr * 100000;
294 } 300 }
295 301
296 if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points)) 302 if ((!per_cpu(centrino_model, cpu)) ||
303 (!per_cpu(centrino_model, cpu)->op_points))
297 return 0; 304 return 0;
298 305
299 msr &= 0xffff; 306 msr &= 0xffff;
300 for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) { 307 for (i = 0;
301 if (msr == centrino_model[cpu]->op_points[i].index) 308 per_cpu(centrino_model, cpu)->op_points[i].frequency
302 return centrino_model[cpu]->op_points[i].frequency; 309 != CPUFREQ_TABLE_END;
310 i++) {
311 if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
312 return per_cpu(centrino_model, cpu)->
313 op_points[i].frequency;
303 } 314 }
304 if (failsafe) 315 if (failsafe)
305 return centrino_model[cpu]->op_points[i-1].frequency; 316 return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
306 else 317 else
307 return 0; 318 return 0;
308} 319}
@@ -347,7 +358,8 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
347 int i; 358 int i;
348 359
349 /* Only Intel makes Enhanced Speedstep-capable CPUs */ 360 /* Only Intel makes Enhanced Speedstep-capable CPUs */
350 if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) 361 if (cpu->x86_vendor != X86_VENDOR_INTEL ||
362 !cpu_has(cpu, X86_FEATURE_EST))
351 return -ENODEV; 363 return -ENODEV;
352 364
353 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) 365 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
@@ -361,9 +373,9 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
361 break; 373 break;
362 374
363 if (i != N_IDS) 375 if (i != N_IDS)
364 centrino_cpu[policy->cpu] = &cpu_ids[i]; 376 per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
365 377
366 if (!centrino_cpu[policy->cpu]) { 378 if (!per_cpu(centrino_cpu, policy->cpu)) {
367 dprintk("found unsupported CPU with " 379 dprintk("found unsupported CPU with "
368 "Enhanced SpeedStep: send /proc/cpuinfo to " 380 "Enhanced SpeedStep: send /proc/cpuinfo to "
369 MAINTAINER "\n"); 381 MAINTAINER "\n");
@@ -386,23 +398,26 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
386 /* check to see if it stuck */ 398 /* check to see if it stuck */
387 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 399 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
388 if (!(l & (1<<16))) { 400 if (!(l & (1<<16))) {
389 printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); 401 printk(KERN_INFO PFX
402 "couldn't enable Enhanced SpeedStep\n");
390 return -ENODEV; 403 return -ENODEV;
391 } 404 }
392 } 405 }
393 406
394 freq = get_cur_freq(policy->cpu); 407 freq = get_cur_freq(policy->cpu);
395 408 policy->cpuinfo.transition_latency = 10000;
396 policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */ 409 /* 10uS transition latency */
397 policy->cur = freq; 410 policy->cur = freq;
398 411
399 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); 412 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
400 413
401 ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points); 414 ret = cpufreq_frequency_table_cpuinfo(policy,
415 per_cpu(centrino_model, policy->cpu)->op_points);
402 if (ret) 416 if (ret)
403 return (ret); 417 return (ret);
404 418
405 cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu); 419 cpufreq_frequency_table_get_attr(
420 per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
406 421
407 return 0; 422 return 0;
408} 423}
@@ -411,12 +426,12 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
411{ 426{
412 unsigned int cpu = policy->cpu; 427 unsigned int cpu = policy->cpu;
413 428
414 if (!centrino_model[cpu]) 429 if (!per_cpu(centrino_model, cpu))
415 return -ENODEV; 430 return -ENODEV;
416 431
417 cpufreq_frequency_table_put_attr(cpu); 432 cpufreq_frequency_table_put_attr(cpu);
418 433
419 centrino_model[cpu] = NULL; 434 per_cpu(centrino_model, cpu) = NULL;
420 435
421 return 0; 436 return 0;
422} 437}
@@ -430,17 +445,26 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
430 */ 445 */
431static int centrino_verify (struct cpufreq_policy *policy) 446static int centrino_verify (struct cpufreq_policy *policy)
432{ 447{
433 return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points); 448 return cpufreq_frequency_table_verify(policy,
449 per_cpu(centrino_model, policy->cpu)->op_points);
434} 450}
435 451
436/** 452/**
437 * centrino_setpolicy - set a new CPUFreq policy 453 * centrino_setpolicy - set a new CPUFreq policy
438 * @policy: new policy 454 * @policy: new policy
439 * @target_freq: the target frequency 455 * @target_freq: the target frequency
440 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) 456 * @relation: how that frequency relates to achieved frequency
457 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
441 * 458 *
442 * Sets a new CPUFreq policy. 459 * Sets a new CPUFreq policy.
443 */ 460 */
461struct allmasks {
462 cpumask_t online_policy_cpus;
463 cpumask_t saved_mask;
464 cpumask_t set_mask;
465 cpumask_t covered_cpus;
466};
467
444static int centrino_target (struct cpufreq_policy *policy, 468static int centrino_target (struct cpufreq_policy *policy,
445 unsigned int target_freq, 469 unsigned int target_freq,
446 unsigned int relation) 470 unsigned int relation)
@@ -448,48 +472,55 @@ static int centrino_target (struct cpufreq_policy *policy,
448 unsigned int newstate = 0; 472 unsigned int newstate = 0;
449 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; 473 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
450 struct cpufreq_freqs freqs; 474 struct cpufreq_freqs freqs;
451 cpumask_t online_policy_cpus;
452 cpumask_t saved_mask;
453 cpumask_t set_mask;
454 cpumask_t covered_cpus;
455 int retval = 0; 475 int retval = 0;
456 unsigned int j, k, first_cpu, tmp; 476 unsigned int j, k, first_cpu, tmp;
457 477 CPUMASK_ALLOC(allmasks);
458 if (unlikely(centrino_model[cpu] == NULL)) 478 CPUMASK_PTR(online_policy_cpus, allmasks);
459 return -ENODEV; 479 CPUMASK_PTR(saved_mask, allmasks);
480 CPUMASK_PTR(set_mask, allmasks);
481 CPUMASK_PTR(covered_cpus, allmasks);
482
483 if (unlikely(allmasks == NULL))
484 return -ENOMEM;
485
486 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
487 retval = -ENODEV;
488 goto out;
489 }
460 490
461 if (unlikely(cpufreq_frequency_table_target(policy, 491 if (unlikely(cpufreq_frequency_table_target(policy,
462 centrino_model[cpu]->op_points, 492 per_cpu(centrino_model, cpu)->op_points,
463 target_freq, 493 target_freq,
464 relation, 494 relation,
465 &newstate))) { 495 &newstate))) {
466 return -EINVAL; 496 retval = -EINVAL;
497 goto out;
467 } 498 }
468 499
469#ifdef CONFIG_HOTPLUG_CPU 500#ifdef CONFIG_HOTPLUG_CPU
470 /* cpufreq holds the hotplug lock, so we are safe from here on */ 501 /* cpufreq holds the hotplug lock, so we are safe from here on */
471 cpus_and(online_policy_cpus, cpu_online_map, policy->cpus); 502 cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus);
472#else 503#else
473 online_policy_cpus = policy->cpus; 504 *online_policy_cpus = policy->cpus;
474#endif 505#endif
475 506
476 saved_mask = current->cpus_allowed; 507 *saved_mask = current->cpus_allowed;
477 first_cpu = 1; 508 first_cpu = 1;
478 cpus_clear(covered_cpus); 509 cpus_clear(*covered_cpus);
479 for_each_cpu_mask(j, online_policy_cpus) { 510 for_each_cpu_mask_nr(j, *online_policy_cpus) {
480 /* 511 /*
481 * Support for SMP systems. 512 * Support for SMP systems.
482 * Make sure we are running on CPU that wants to change freq 513 * Make sure we are running on CPU that wants to change freq
483 */ 514 */
484 cpus_clear(set_mask); 515 cpus_clear(*set_mask);
485 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) 516 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
486 cpus_or(set_mask, set_mask, online_policy_cpus); 517 cpus_or(*set_mask, *set_mask, *online_policy_cpus);
487 else 518 else
488 cpu_set(j, set_mask); 519 cpu_set(j, *set_mask);
489 520
490 set_cpus_allowed_ptr(current, &set_mask); 521 set_cpus_allowed_ptr(current, set_mask);
491 preempt_disable(); 522 preempt_disable();
492 if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { 523 if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) {
493 dprintk("couldn't limit to CPUs in this domain\n"); 524 dprintk("couldn't limit to CPUs in this domain\n");
494 retval = -EAGAIN; 525 retval = -EAGAIN;
495 if (first_cpu) { 526 if (first_cpu) {
@@ -500,7 +531,7 @@ static int centrino_target (struct cpufreq_policy *policy,
500 break; 531 break;
501 } 532 }
502 533
503 msr = centrino_model[cpu]->op_points[newstate].index; 534 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
504 535
505 if (first_cpu) { 536 if (first_cpu) {
506 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); 537 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h);
@@ -517,7 +548,7 @@ static int centrino_target (struct cpufreq_policy *policy,
517 dprintk("target=%dkHz old=%d new=%d msr=%04x\n", 548 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
518 target_freq, freqs.old, freqs.new, msr); 549 target_freq, freqs.old, freqs.new, msr);
519 550
520 for_each_cpu_mask(k, online_policy_cpus) { 551 for_each_cpu_mask_nr(k, *online_policy_cpus) {
521 freqs.cpu = k; 552 freqs.cpu = k;
522 cpufreq_notify_transition(&freqs, 553 cpufreq_notify_transition(&freqs,
523 CPUFREQ_PRECHANGE); 554 CPUFREQ_PRECHANGE);
@@ -536,11 +567,11 @@ static int centrino_target (struct cpufreq_policy *policy,
536 break; 567 break;
537 } 568 }
538 569
539 cpu_set(j, covered_cpus); 570 cpu_set(j, *covered_cpus);
540 preempt_enable(); 571 preempt_enable();
541 } 572 }
542 573
543 for_each_cpu_mask(k, online_policy_cpus) { 574 for_each_cpu_mask_nr(k, *online_policy_cpus) {
544 freqs.cpu = k; 575 freqs.cpu = k;
545 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 576 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
546 } 577 }
@@ -553,30 +584,32 @@ static int centrino_target (struct cpufreq_policy *policy,
553 * Best effort undo.. 584 * Best effort undo..
554 */ 585 */
555 586
556 if (!cpus_empty(covered_cpus)) { 587 if (!cpus_empty(*covered_cpus))
557 for_each_cpu_mask(j, covered_cpus) { 588 for_each_cpu_mask_nr(j, *covered_cpus) {
558 set_cpus_allowed_ptr(current, 589 set_cpus_allowed_ptr(current,
559 &cpumask_of_cpu(j)); 590 &cpumask_of_cpu(j));
560 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); 591 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
561 } 592 }
562 }
563 593
564 tmp = freqs.new; 594 tmp = freqs.new;
565 freqs.new = freqs.old; 595 freqs.new = freqs.old;
566 freqs.old = tmp; 596 freqs.old = tmp;
567 for_each_cpu_mask(j, online_policy_cpus) { 597 for_each_cpu_mask_nr(j, *online_policy_cpus) {
568 freqs.cpu = j; 598 freqs.cpu = j;
569 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 599 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
570 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 600 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
571 } 601 }
572 } 602 }
573 set_cpus_allowed_ptr(current, &saved_mask); 603 set_cpus_allowed_ptr(current, saved_mask);
574 return 0; 604 retval = 0;
605 goto out;
575 606
576migrate_end: 607migrate_end:
577 preempt_enable(); 608 preempt_enable();
578 set_cpus_allowed_ptr(current, &saved_mask); 609 set_cpus_allowed_ptr(current, saved_mask);
579 return 0; 610out:
611 CPUMASK_FREE(allmasks);
612 return retval;
580} 613}
581 614
582static struct freq_attr* centrino_attr[] = { 615static struct freq_attr* centrino_attr[] = {
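The centrino_target() rework above moves four on-stack cpumask_t variables into one heap allocation (CPUMASK_ALLOC/CPUMASK_PTR) so the stack footprint no longer scales with NR_CPUS, and routes every exit through a single out: label that frees it. A compilable sketch of the same shape, using a single-word bitmap as a stand-in for cpumask_t:

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for cpumask_t; one word is enough for this sketch. */
typedef unsigned long mask_t;

struct allmasks {
	mask_t online_policy_cpus;
	mask_t saved_mask;
	mask_t set_mask;
	mask_t covered_cpus;
};

static int change_frequency(mask_t policy_cpus, mask_t online)
{
	struct allmasks *allmasks = calloc(1, sizeof(*allmasks));
	mask_t *online_policy_cpus, *covered_cpus;
	int retval;

	if (!allmasks)
		return -1;	/* -ENOMEM in the driver */

	/* Pointers into the single allocation, like CPUMASK_PTR(). */
	online_policy_cpus = &allmasks->online_policy_cpus;
	covered_cpus = &allmasks->covered_cpus;

	*online_policy_cpus = policy_cpus & online;
	if (!*online_policy_cpus) {
		retval = -1;
		goto out;
	}

	/* Per-CPU transition work would go here, recording each CPU it
	 * touched in *covered_cpus for the error-unwind path. */
	*covered_cpus = *online_policy_cpus;
	printf("covered mask: 0x%lx\n", *covered_cpus);
	retval = 0;

out:	/* single exit: the allocation is always freed, as in the patch */
	free(allmasks);
	return retval;
}

int main(void)
{
	return change_frequency(0x0f, 0x0b) ? 1 : 0;
}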
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 1b50244b1fdf..04d0376b64b0 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -279,7 +279,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
279 279
280 cpus_allowed = current->cpus_allowed; 280 cpus_allowed = current->cpus_allowed;
281 281
282 for_each_cpu_mask(i, policy->cpus) { 282 for_each_cpu_mask_nr(i, policy->cpus) {
283 freqs.cpu = i; 283 freqs.cpu = i;
284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
285 } 285 }
@@ -292,7 +292,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
292 /* allow to be run on all CPUs */ 292 /* allow to be run on all CPUs */
293 set_cpus_allowed_ptr(current, &cpus_allowed); 293 set_cpus_allowed_ptr(current, &cpus_allowed);
294 294
295 for_each_cpu_mask(i, policy->cpus) { 295 for_each_cpu_mask_nr(i, policy->cpus) {
296 freqs.cpu = i; 296 freqs.cpu = i;
297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
298 } 298 }
@@ -431,7 +431,7 @@ static void __exit speedstep_exit(void)
431} 431}
432 432
433 433
434MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); 434MODULE_AUTHOR ("Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
435MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges."); 435MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges.");
436MODULE_LICENSE ("GPL"); 436MODULE_LICENSE ("GPL");
437 437
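speedstep-ich keeps the older pattern of saving the caller's CPU affinity, migrating onto the CPUs that own the SpeedStep registers, and restoring the mask afterwards. A Linux userspace sketch of that save/pin/restore sequence using sched_getaffinity()/sched_setaffinity(); error handling is trimmed to the minimum:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t saved, target;

	/* Save current affinity, as speedstep_target() saves cpus_allowed. */
	if (sched_getaffinity(0, sizeof(saved), &saved))
		return 1;

	/* Pin to CPU 0 while the "hardware access" runs. */
	CPU_ZERO(&target);
	CPU_SET(0, &target);
	if (sched_setaffinity(0, sizeof(target), &target))
		return 1;

	printf("running pinned to cpu0; would poke the chipset here\n");

	/* Restore the caller's mask, mirroring set_cpus_allowed_ptr(). */
	if (sched_setaffinity(0, sizeof(saved), &saved))
		return 1;

	return 0;
}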
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 3fd7a67bb06a..ffd0f5ed071a 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -15,13 +15,11 @@
15/* 15/*
16 * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU 16 * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU
17 */ 17 */
18static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) 18static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
19{ 19{
20 unsigned char ccr2, ccr3; 20 unsigned char ccr2, ccr3;
21 unsigned long flags;
22 21
23 /* we test for DEVID by checking whether CCR3 is writable */ 22 /* we test for DEVID by checking whether CCR3 is writable */
24 local_irq_save(flags);
25 ccr3 = getCx86(CX86_CCR3); 23 ccr3 = getCx86(CX86_CCR3);
26 setCx86(CX86_CCR3, ccr3 ^ 0x80); 24 setCx86(CX86_CCR3, ccr3 ^ 0x80);
27 getCx86(0xc0); /* dummy to change bus */ 25 getCx86(0xc0); /* dummy to change bus */
@@ -44,9 +42,16 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
44 *dir0 = getCx86(CX86_DIR0); 42 *dir0 = getCx86(CX86_DIR0);
45 *dir1 = getCx86(CX86_DIR1); 43 *dir1 = getCx86(CX86_DIR1);
46 } 44 }
47 local_irq_restore(flags);
48} 45}
49 46
47static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
48{
49 unsigned long flags;
50
51 local_irq_save(flags);
52 __do_cyrix_devid(dir0, dir1);
53 local_irq_restore(flags);
54}
50/* 55/*
51 * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in 56 * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in
52 * order to identify the Cyrix CPU model after we're out of setup.c 57 * order to identify the Cyrix CPU model after we're out of setup.c
@@ -116,7 +121,7 @@ static void __cpuinit set_cx86_reorder(void)
116 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 121 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
117 122
118 /* Load/Store Serialize to mem access disable (=reorder it) */ 123 /* Load/Store Serialize to mem access disable (=reorder it) */
119 setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); 124 setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
120 /* set load/store serialize from 1GB to 4GB */ 125 /* set load/store serialize from 1GB to 4GB */
121 ccr3 |= 0xe0; 126 ccr3 |= 0xe0;
122 setCx86(CX86_CCR3, ccr3); 127 setCx86(CX86_CCR3, ccr3);
@@ -127,28 +132,11 @@ static void __cpuinit set_cx86_memwb(void)
127 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); 132 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
128 133
129 /* CCR2 bit 2: unlock NW bit */ 134 /* CCR2 bit 2: unlock NW bit */
130 setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); 135 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
131 /* set 'Not Write-through' */ 136 /* set 'Not Write-through' */
132 write_cr0(read_cr0() | X86_CR0_NW); 137 write_cr0(read_cr0() | X86_CR0_NW);
133 /* CCR2 bit 2: lock NW bit and set WT1 */ 138 /* CCR2 bit 2: lock NW bit and set WT1 */
134 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); 139 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
135}
136
137static void __cpuinit set_cx86_inc(void)
138{
139 unsigned char ccr3;
140
141 printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n");
142
143 ccr3 = getCx86(CX86_CCR3);
144 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
145 /* PCR1 -- Performance Control */
146 /* Incrementor on, whatever that is */
147 setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02);
148 /* PCR0 -- Performance Control */
149 /* Incrementor Margin 10 */
150 setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04);
151 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
152} 140}
153 141
154/* 142/*
@@ -162,23 +150,40 @@ static void __cpuinit geode_configure(void)
162 local_irq_save(flags); 150 local_irq_save(flags);
163 151
164 /* Suspend on halt power saving and enable #SUSP pin */ 152 /* Suspend on halt power saving and enable #SUSP pin */
165 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); 153 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
166 154
167 ccr3 = getCx86(CX86_CCR3); 155 ccr3 = getCx86(CX86_CCR3);
168 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 156 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
169 157
170 158
171 /* FPU fast, DTE cache, Mem bypass */ 159 /* FPU fast, DTE cache, Mem bypass */
172 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); 160 setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
173 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 161 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
174 162
175 set_cx86_memwb(); 163 set_cx86_memwb();
176 set_cx86_reorder(); 164 set_cx86_reorder();
177 set_cx86_inc();
178 165
179 local_irq_restore(flags); 166 local_irq_restore(flags);
180} 167}
181 168
169static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c)
170{
171 unsigned char dir0, dir0_msn, dir1 = 0;
172
173 __do_cyrix_devid(&dir0, &dir1);
174 dir0_msn = dir0 >> 4; /* identifies CPU "family" */
175
176 switch (dir0_msn) {
177 case 3: /* 6x86/6x86L */
178 /* Emulate MTRRs using Cyrix's ARRs. */
179 set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
180 break;
181 case 5: /* 6x86MX/M II */
182 /* Emulate MTRRs using Cyrix's ARRs. */
183 set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
184 break;
185 }
186}
182 187
183static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) 188static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
184{ 189{
@@ -286,7 +291,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
286 /* GXm supports extended cpuid levels 'ala' AMD */ 291 /* GXm supports extended cpuid levels 'ala' AMD */
287 if (c->cpuid_level == 2) { 292 if (c->cpuid_level == 2) {
288 /* Enable cxMMX extensions (GX1 Datasheet 54) */ 293 /* Enable cxMMX extensions (GX1 Datasheet 54) */
289 setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); 294 setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
290 295
291 /* 296 /*
292 * GXm : 0x30 ... 0x5f GXm datasheet 51 297 * GXm : 0x30 ... 0x5f GXm datasheet 51
@@ -296,7 +301,6 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
296 */ 301 */
297 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) 302 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
298 geode_configure(); 303 geode_configure();
299 get_model_name(c); /* get CPU marketing name */
300 return; 304 return;
301 } else { /* MediaGX */ 305 } else { /* MediaGX */
302 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; 306 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
@@ -309,7 +313,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
309 if (dir1 > 7) { 313 if (dir1 > 7) {
310 dir0_msn++; /* M II */ 314 dir0_msn++; /* M II */
311 /* Enable MMX extensions (App note 108) */ 315 /* Enable MMX extensions (App note 108) */
312 setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); 316 setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
313 } else { 317 } else {
314 c->coma_bug = 1; /* 6x86MX, it has the bug. */ 318 c->coma_bug = 1; /* 6x86MX, it has the bug. */
315 } 319 }
@@ -424,7 +428,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
424 local_irq_save(flags); 428 local_irq_save(flags);
425 ccr3 = getCx86(CX86_CCR3); 429 ccr3 = getCx86(CX86_CCR3);
426 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 430 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
427 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */ 431 setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */
428 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 432 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
429 local_irq_restore(flags); 433 local_irq_restore(flags);
430 } 434 }
@@ -434,16 +438,19 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
434static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { 438static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
435 .c_vendor = "Cyrix", 439 .c_vendor = "Cyrix",
436 .c_ident = { "CyrixInstead" }, 440 .c_ident = { "CyrixInstead" },
441 .c_early_init = early_init_cyrix,
437 .c_init = init_cyrix, 442 .c_init = init_cyrix,
438 .c_identify = cyrix_identify, 443 .c_identify = cyrix_identify,
444 .c_x86_vendor = X86_VENDOR_CYRIX,
439}; 445};
440 446
441cpu_vendor_dev_register(X86_VENDOR_CYRIX, &cyrix_cpu_dev); 447cpu_dev_register(cyrix_cpu_dev);
442 448
443static struct cpu_dev nsc_cpu_dev __cpuinitdata = { 449static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
444 .c_vendor = "NSC", 450 .c_vendor = "NSC",
445 .c_ident = { "Geode by NSC" }, 451 .c_ident = { "Geode by NSC" },
446 .c_init = init_nsc, 452 .c_init = init_nsc,
453 .c_x86_vendor = X86_VENDOR_NSC,
447}; 454};
448 455
449cpu_vendor_dev_register(X86_VENDOR_NSC, &nsc_cpu_dev); 456cpu_dev_register(nsc_cpu_dev);
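The cyrix.c change splits do_cyrix_devid() into a lock-free inner helper (__do_cyrix_devid) plus a wrapper that disables interrupts around it, so the new early-init path can reuse the probe from a context that already runs with interrupts off. A userspace sketch of that refactor shape, with a mutex standing in for local_irq_save()/local_irq_restore() and a dummy probe body:

#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t hw_lock = PTHREAD_MUTEX_INITIALIZER;

/* Inner helper: touches the "hardware" and assumes the caller already
 * holds whatever exclusion is needed (interrupts off in the kernel case). */
static void __do_devid(unsigned char *dir0, unsigned char *dir1)
{
	*dir0 = 0x31;	/* pretend DIR0 readout */
	*dir1 = 0x07;	/* pretend DIR1 readout */
}

/* Wrapper: takes the exclusion itself, for callers on ordinary paths. */
static void do_devid(unsigned char *dir0, unsigned char *dir1)
{
	pthread_mutex_lock(&hw_lock);
	__do_devid(dir0, dir1);
	pthread_mutex_unlock(&hw_lock);
}

int main(void)
{
	unsigned char dir0, dir1;

	do_devid(&dir0, &dir1);
	printf("DIR0=0x%02x DIR1=0x%02x\n", dir0, dir1);
	return 0;
}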
diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c
deleted file mode 100644
index e43ad4ad4cba..000000000000
--- a/arch/x86/kernel/cpu/feature_names.c
+++ /dev/null
@@ -1,83 +0,0 @@
1/*
2 * Strings for the various x86 capability flags.
3 *
4 * This file must not contain any executable code.
5 */
6
7#include <asm/cpufeature.h>
8
9/*
10 * These flag bits must match the definitions in <asm/cpufeature.h>.
11 * NULL means this bit is undefined or reserved; either way it doesn't
12 * have meaning as far as Linux is concerned. Note that it's important
13 * to realize there is a difference between this table and CPUID -- if
14 * applications want to get the raw CPUID data, they should access
15 * /dev/cpu/<cpu_nr>/cpuid instead.
16 */
17const char * const x86_cap_flags[NCAPINTS*32] = {
18 /* Intel-defined */
19 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
20 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
21 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
22 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
23
24 /* AMD-defined */
25 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
26 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
27 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
28 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
29 "3dnowext", "3dnow",
30
31 /* Transmeta-defined */
32 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
33 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
34 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36
37 /* Other (Linux-defined) */
38 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
39 NULL, NULL, NULL, NULL,
40 "constant_tsc", "up", NULL, "arch_perfmon",
41 "pebs", "bts", NULL, NULL,
42 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
43 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
44
45 /* Intel-defined (#2) */
46 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
47 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
48 NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
49 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
50
51 /* VIA/Cyrix/Centaur-defined */
52 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
53 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
54 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
55 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
56
57 /* AMD-defined (#2) */
58 "lahf_lm", "cmp_legacy", "svm", "extapic",
59 "cr8_legacy", "abm", "sse4a", "misalignsse",
60 "3dnowprefetch", "osvw", "ibs", "sse5",
61 "skinit", "wdt", NULL, NULL,
62 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
63 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
64
65 /* Auxiliary (Linux-defined) */
66 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
67 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
68 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
69 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
70};
71
72const char *const x86_power_flags[32] = {
73 "ts", /* temperature sensor */
74 "fid", /* frequency id control */
75 "vid", /* voltage id control */
76 "ttp", /* thermal trip */
77 "tm",
78 "stc",
79 "100mhzsteps",
80 "hwpstate",
81 "", /* tsc invariant mapped to constant_tsc */
82 /* nothing */
83};
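The deleted feature_names.c carried the bit-number-to-name tables that /proc/cpuinfo printing walks; the series replaces it with a generated table. A small sketch of how such a table is consumed: iterate the bits of one capability word and print the names of the set, non-NULL entries. The names below are copied from the first row of the deleted table; the example CPUID value is made up:

#include <stdio.h>

/* Excerpt-style table: bit number -> flag name, NULL = reserved.
 * The real x86_cap_flags[] covers NCAPINTS*32 entries. */
static const char * const cap_flags[32] = {
	"fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
	"cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
	"pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
	"fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
};

int main(void)
{
	unsigned int word = 0x0383fbff;	/* illustrative CPUID EDX value */
	unsigned int bit;

	/* Print every set bit that has a name, /proc/cpuinfo style. */
	for (bit = 0; bit < 32; bit++)
		if ((word & (1u << bit)) && cap_flags[bit])
			printf("%s ", cap_flags[bit]);
	printf("\n");
	return 0;
}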
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 70609efdf1da..cce0b6118d55 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -15,6 +15,11 @@
15#include <asm/ds.h> 15#include <asm/ds.h>
16#include <asm/bugs.h> 16#include <asm/bugs.h>
17 17
18#ifdef CONFIG_X86_64
19#include <asm/topology.h>
20#include <asm/numa_64.h>
21#endif
22
18#include "cpu.h" 23#include "cpu.h"
19 24
20#ifdef CONFIG_X86_LOCAL_APIC 25#ifdef CONFIG_X86_LOCAL_APIC
@@ -23,23 +28,22 @@
23#include <mach_apic.h> 28#include <mach_apic.h>
24#endif 29#endif
25 30
26#ifdef CONFIG_X86_INTEL_USERCOPY
27/*
28 * Alignment at which movsl is preferred for bulk memory copies.
29 */
30struct movsl_mask movsl_mask __read_mostly;
31#endif
32
33static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 31static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
34{ 32{
35 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
36 if (c->x86 == 15 && c->x86_cache_alignment == 64)
37 c->x86_cache_alignment = 128;
38 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 33 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
39 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 34 (c->x86 == 0x6 && c->x86_model >= 0x0e))
40 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 35 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
36
37#ifdef CONFIG_X86_64
38 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
39#else
40 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
41 if (c->x86 == 15 && c->x86_cache_alignment == 64)
42 c->x86_cache_alignment = 128;
43#endif
41} 44}
42 45
46#ifdef CONFIG_X86_32
43/* 47/*
44 * Early probe support logic for ppro memory erratum #50 48 * Early probe support logic for ppro memory erratum #50
45 * 49 *
@@ -59,15 +63,54 @@ int __cpuinit ppro_with_ram_bug(void)
59 return 0; 63 return 0;
60} 64}
61 65
66#ifdef CONFIG_X86_F00F_BUG
67static void __cpuinit trap_init_f00f_bug(void)
68{
69 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
62 70
63/* 71 /*
64 * P4 Xeon errata 037 workaround. 72 * Update the IDT descriptor and reload the IDT so that
65 * Hardware prefetcher may cause stale data to be loaded into the cache. 73 * it uses the read-only mapped virtual address.
66 */ 74 */
67static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) 75 idt_descr.address = fix_to_virt(FIX_F00F_IDT);
76 load_idt(&idt_descr);
77}
78#endif
79
80static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
68{ 81{
69 unsigned long lo, hi; 82 unsigned long lo, hi;
70 83
84#ifdef CONFIG_X86_F00F_BUG
85 /*
86 * All current models of Pentium and Pentium with MMX technology CPUs
87 * have the F0 0F bug, which lets nonprivileged users lock up the system.
88 * Note that the workaround only should be initialized once...
89 */
90 c->f00f_bug = 0;
91 if (!paravirt_enabled() && c->x86 == 5) {
92 static int f00f_workaround_enabled;
93
94 c->f00f_bug = 1;
95 if (!f00f_workaround_enabled) {
96 trap_init_f00f_bug();
97 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
98 f00f_workaround_enabled = 1;
99 }
100 }
101#endif
102
103 /*
104 * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
105 * model 3 mask 3
106 */
107 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
108 clear_cpu_cap(c, X86_FEATURE_SEP);
109
110 /*
111 * P4 Xeon errata 037 workaround.
112 * Hardware prefetcher may cause stale data to be loaded into the cache.
113 */
71 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { 114 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
72 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); 115 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
73 if ((lo & (1<<9)) == 0) { 116 if ((lo & (1<<9)) == 0) {
@@ -77,13 +120,68 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
77 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); 120 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
78 } 121 }
79 } 122 }
123
124 /*
125 * See if we have a good local APIC by checking for buggy Pentia,
126 * i.e. all B steppings and the C2 stepping of P54C when using their
127 * integrated APIC (see 11AP erratum in "Pentium Processor
128 * Specification Update").
129 */
130 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
131 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
132 set_cpu_cap(c, X86_FEATURE_11AP);
133
134
135#ifdef CONFIG_X86_INTEL_USERCOPY
136 /*
137 * Set up the preferred alignment for movsl bulk memory moves
138 */
139 switch (c->x86) {
140 case 4: /* 486: untested */
141 break;
142 case 5: /* Old Pentia: untested */
143 break;
144 case 6: /* PII/PIII only like movsl with 8-byte alignment */
145 movsl_mask.mask = 7;
146 break;
147 case 15: /* P4 is OK down to 8-byte alignment */
148 movsl_mask.mask = 7;
149 break;
150 }
151#endif
152
153#ifdef CONFIG_X86_NUMAQ
154 numaq_tsc_disable();
155#endif
80} 156}
157#else
158static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
159{
160}
161#endif
81 162
163static void __cpuinit srat_detect_node(void)
164{
165#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
166 unsigned node;
167 int cpu = smp_processor_id();
168 int apicid = hard_smp_processor_id();
169
170 /* Don't do the funky fallback heuristics the AMD version employs
171 for now. */
172 node = apicid_to_node[apicid];
173 if (node == NUMA_NO_NODE || !node_online(node))
174 node = first_node(node_online_map);
175 numa_set_node(cpu, node);
176
177 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
178#endif
179}
82 180
83/* 181/*
84 * find out the number of processor cores on the die 182 * find out the number of processor cores on the die
85 */ 183 */
86static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) 184static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
87{ 185{
88 unsigned int eax, ebx, ecx, edx; 186 unsigned int eax, ebx, ecx, edx;
89 187
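The workaround hunks above compare CPUs by packing family, model and stepping into one integer, as in the SEP erratum check (< 0x633) and the 11AP test. A tiny sketch of that packing and comparison, with made-up CPU values:

#include <stdio.h>

/* Pack family/model/stepping the way the SEP check does:
 * (family << 8) | (model << 4) | stepping. */
static unsigned int pack_fms(unsigned int family, unsigned int model,
			     unsigned int stepping)
{
	return (family << 8) | (model << 4) | stepping;
}

int main(void)
{
	/* Hypothetical CPU: family 6, model 3, stepping 2. */
	unsigned int cpu = pack_fms(6, 3, 2);

	/* Anything below family 6, model 3, stepping 3 lacks working SEP. */
	if (cpu < 0x633)
		printf("0x%03x: clearing SEP, erratum applies\n", cpu);
	else
		printf("0x%03x: SEP usable\n", cpu);
	return 0;
}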
@@ -98,45 +196,51 @@ static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c)
98 return 1; 196 return 1;
99} 197}
100 198
101#ifdef CONFIG_X86_F00F_BUG 199static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
102static void __cpuinit trap_init_f00f_bug(void)
103{ 200{
104 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); 201 /* Intel VMX MSR indicated features */
105 202#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
106 /* 203#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
107 * Update the IDT descriptor and reload the IDT so that 204#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
108 * it uses the read-only mapped virtual address. 205#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
109 */ 206#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
110 idt_descr.address = fix_to_virt(FIX_F00F_IDT); 207#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
111 load_idt(&idt_descr); 208
209 u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
210
211 clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
212 clear_cpu_cap(c, X86_FEATURE_VNMI);
213 clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
214 clear_cpu_cap(c, X86_FEATURE_EPT);
215 clear_cpu_cap(c, X86_FEATURE_VPID);
216
217 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
218 msr_ctl = vmx_msr_high | vmx_msr_low;
219 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
220 set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
221 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
222 set_cpu_cap(c, X86_FEATURE_VNMI);
223 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
224 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
225 vmx_msr_low, vmx_msr_high);
226 msr_ctl2 = vmx_msr_high | vmx_msr_low;
227 if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
228 (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
229 set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
230 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
231 set_cpu_cap(c, X86_FEATURE_EPT);
232 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
233 set_cpu_cap(c, X86_FEATURE_VPID);
234 }
112} 235}
113#endif
114 236
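detect_vmx_virtcap() above relies on the layout of the VMX capability MSRs: the high dword reports which execution controls may be set to 1, while the low dword reports controls that are forced to 1, so OR-ing the halves (msr_ctl = vmx_msr_high | vmx_msr_low) leaves a bit set exactly when that control is supported. A small stand-alone sketch of the same test for one control; the MSR contents here are made up, and the mask is the X86_VMX_FEATURE_PROC_CTLS_VNMI value from the patch:

    #include <stdio.h>
    #include <stdint.h>

    /* Bit 22 of the primary processor-based controls
     * (X86_VMX_FEATURE_PROC_CTLS_VNMI in the patch). */
    #define PROC_CTLS_VNMI 0x00400000u

    int main(void)
    {
        uint32_t lo  = 0x0401e172u;  /* hypothetical: controls forced to 1 */
        uint32_t hi  = 0xfff9fffeu;  /* hypothetical: controls allowed to be 1 */
        uint32_t ctl = hi | lo;      /* same OR as detect_vmx_virtcap() */

        printf("virtual NMIs %ssupported\n",
               (ctl & PROC_CTLS_VNMI) ? "" : "not ");
        return 0;
    }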
115static void __cpuinit init_intel(struct cpuinfo_x86 *c) 237static void __cpuinit init_intel(struct cpuinfo_x86 *c)
116{ 238{
117 unsigned int l2 = 0; 239 unsigned int l2 = 0;
118 char *p = NULL;
119 240
120 early_init_intel(c); 241 early_init_intel(c);
121 242
122#ifdef CONFIG_X86_F00F_BUG 243 intel_workarounds(c);
123 /*
124 * All current models of Pentium and Pentium with MMX technology CPUs
125 * have the F0 0F bug, which lets nonprivileged users lock up the system.
126 * Note that the workaround only should be initialized once...
127 */
128 c->f00f_bug = 0;
129 if (!paravirt_enabled() && c->x86 == 5) {
130 static int f00f_workaround_enabled;
131
132 c->f00f_bug = 1;
133 if (!f00f_workaround_enabled) {
134 trap_init_f00f_bug();
135 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
136 f00f_workaround_enabled = 1;
137 }
138 }
139#endif
140 244
141 l2 = init_intel_cacheinfo(c); 245 l2 = init_intel_cacheinfo(c);
142 if (c->cpuid_level > 9) { 246 if (c->cpuid_level > 9) {
@@ -146,16 +250,32 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
146 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 250 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
147 } 251 }
148 252
149 /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ 253 if (cpu_has_xmm2)
150 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) 254 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
151 clear_cpu_cap(c, X86_FEATURE_SEP); 255 if (cpu_has_ds) {
256 unsigned int l1;
257 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
258 if (!(l1 & (1<<11)))
259 set_cpu_cap(c, X86_FEATURE_BTS);
260 if (!(l1 & (1<<12)))
261 set_cpu_cap(c, X86_FEATURE_PEBS);
262 ds_init_intel(c);
263 }
152 264
265#ifdef CONFIG_X86_64
266 if (c->x86 == 15)
267 c->x86_cache_alignment = c->x86_clflush_size * 2;
268 if (c->x86 == 6)
269 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
270#else
153 /* 271 /*
154 * Names for the Pentium II/Celeron processors 272 * Names for the Pentium II/Celeron processors
155 * detectable only by also checking the cache size. 273 * detectable only by also checking the cache size.
156 * Dixon is NOT a Celeron. 274 * Dixon is NOT a Celeron.
157 */ 275 */
158 if (c->x86 == 6) { 276 if (c->x86 == 6) {
277 char *p = NULL;
278
159 switch (c->x86_model) { 279 switch (c->x86_model) {
160 case 5: 280 case 5:
161 if (c->x86_mask == 0) { 281 if (c->x86_mask == 0) {
@@ -178,60 +298,41 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
178 p = "Celeron (Coppermine)"; 298 p = "Celeron (Coppermine)";
179 break; 299 break;
180 } 300 }
181 }
182 301
183 if (p) 302 if (p)
184 strcpy(c->x86_model_id, p); 303 strcpy(c->x86_model_id, p);
185
186 c->x86_max_cores = num_cpu_cores(c);
187
188 detect_ht(c);
189
190 /* Work around errata */
191 Intel_errata_workarounds(c);
192
193#ifdef CONFIG_X86_INTEL_USERCOPY
194 /*
195 * Set up the preferred alignment for movsl bulk memory moves
196 */
197 switch (c->x86) {
198 case 4: /* 486: untested */
199 break;
200 case 5: /* Old Pentia: untested */
201 break;
202 case 6: /* PII/PIII only like movsl with 8-byte alignment */
203 movsl_mask.mask = 7;
204 break;
205 case 15: /* P4 is OK down to 8-byte alignment */
206 movsl_mask.mask = 7;
207 break;
208 } 304 }
209#endif
210 305
211 if (cpu_has_xmm2) 306 if (c->x86 == 15)
212 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
213 if (c->x86 == 15) {
214 set_cpu_cap(c, X86_FEATURE_P4); 307 set_cpu_cap(c, X86_FEATURE_P4);
215 }
216 if (c->x86 == 6) 308 if (c->x86 == 6)
217 set_cpu_cap(c, X86_FEATURE_P3); 309 set_cpu_cap(c, X86_FEATURE_P3);
218 if (cpu_has_ds) {
219 unsigned int l1;
220 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
221 if (!(l1 & (1<<11)))
222 set_cpu_cap(c, X86_FEATURE_BTS);
223 if (!(l1 & (1<<12)))
224 set_cpu_cap(c, X86_FEATURE_PEBS);
225 }
226 310
227 if (cpu_has_bts) 311 if (cpu_has_bts)
228 ds_init_intel(c); 312 ptrace_bts_init_intel(c);
229 313
230#ifdef CONFIG_X86_NUMAQ
231 numaq_tsc_disable();
232#endif 314#endif
315
316 detect_extended_topology(c);
317 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
318 /*
319 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
320 * detection.
321 */
322 c->x86_max_cores = intel_num_cpu_cores(c);
323#ifdef CONFIG_X86_32
324 detect_ht(c);
325#endif
326 }
327
328 /* Work around errata */
329 srat_detect_node();
330
331 if (cpu_has(c, X86_FEATURE_VMX))
332 detect_vmx_virtcap(c);
233} 333}
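The cpu_has_ds branch in init_intel() reads MSR_IA32_MISC_ENABLE and tests bits 11 and 12, which are "BTS unavailable" and "PEBS unavailable" flags; the negated checks therefore set X86_FEATURE_BTS/PEBS only when the hardware does not declare the facility missing. A tiny stand-alone illustration of that inverted logic with a made-up MSR value:

    #include <stdio.h>
    #include <stdint.h>

    #define MISC_ENABLE_BTS_UNAVAIL  (1u << 11)
    #define MISC_ENABLE_PEBS_UNAVAIL (1u << 12)

    int main(void)
    {
        uint32_t l1 = 0x00000801u;   /* hypothetical MISC_ENABLE low dword */

        /* Same inverted test as init_intel(): a clear bit means present. */
        printf("BTS:  %s\n", !(l1 & MISC_ENABLE_BTS_UNAVAIL)  ? "yes" : "no");
        printf("PEBS: %s\n", !(l1 & MISC_ENABLE_PEBS_UNAVAIL) ? "yes" : "no");
        return 0;
    }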
234 334
335#ifdef CONFIG_X86_32
235static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) 336static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
236{ 337{
237 /* 338 /*
@@ -244,10 +345,12 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
244 size = 256; 345 size = 256;
245 return size; 346 return size;
246} 347}
348#endif
247 349
248static struct cpu_dev intel_cpu_dev __cpuinitdata = { 350static struct cpu_dev intel_cpu_dev __cpuinitdata = {
249 .c_vendor = "Intel", 351 .c_vendor = "Intel",
250 .c_ident = { "GenuineIntel" }, 352 .c_ident = { "GenuineIntel" },
353#ifdef CONFIG_X86_32
251 .c_models = { 354 .c_models = {
252 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = 355 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
253 { 356 {
@@ -297,76 +400,12 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = {
297 } 400 }
298 }, 401 },
299 }, 402 },
403 .c_size_cache = intel_size_cache,
404#endif
300 .c_early_init = early_init_intel, 405 .c_early_init = early_init_intel,
301 .c_init = init_intel, 406 .c_init = init_intel,
302 .c_size_cache = intel_size_cache, 407 .c_x86_vendor = X86_VENDOR_INTEL,
303}; 408};
304 409
305cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); 410cpu_dev_register(intel_cpu_dev);
306
307#ifndef CONFIG_X86_CMPXCHG
308unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
309{
310 u8 prev;
311 unsigned long flags;
312
313 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
314 local_irq_save(flags);
315 prev = *(u8 *)ptr;
316 if (prev == old)
317 *(u8 *)ptr = new;
318 local_irq_restore(flags);
319 return prev;
320}
321EXPORT_SYMBOL(cmpxchg_386_u8);
322
323unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
324{
325 u16 prev;
326 unsigned long flags;
327
328 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
329 local_irq_save(flags);
330 prev = *(u16 *)ptr;
331 if (prev == old)
332 *(u16 *)ptr = new;
333 local_irq_restore(flags);
334 return prev;
335}
336EXPORT_SYMBOL(cmpxchg_386_u16);
337
338unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
339{
340 u32 prev;
341 unsigned long flags;
342
343 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
344 local_irq_save(flags);
345 prev = *(u32 *)ptr;
346 if (prev == old)
347 *(u32 *)ptr = new;
348 local_irq_restore(flags);
349 return prev;
350}
351EXPORT_SYMBOL(cmpxchg_386_u32);
352#endif
353
354#ifndef CONFIG_X86_CMPXCHG64
355unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
356{
357 u64 prev;
358 unsigned long flags;
359
360 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
361 local_irq_save(flags);
362 prev = *(u64 *)ptr;
363 if (prev == old)
364 *(u64 *)ptr = new;
365 local_irq_restore(flags);
366 return prev;
367}
368EXPORT_SYMBOL(cmpxchg_486_u64);
369#endif
370
371/* arch_initcall(intel_cpu_init); */
372 411
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
deleted file mode 100644
index 1019c58d39f0..000000000000
--- a/arch/x86/kernel/cpu/intel_64.c
+++ /dev/null
@@ -1,95 +0,0 @@
1#include <linux/init.h>
2#include <linux/smp.h>
3#include <asm/processor.h>
4#include <asm/ptrace.h>
5#include <asm/topology.h>
6#include <asm/numa_64.h>
7
8#include "cpu.h"
9
10static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
11{
12 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
13 (c->x86 == 0x6 && c->x86_model >= 0x0e))
14 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15
16 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
17}
18
19/*
20 * find out the number of processor cores on the die
21 */
22static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
23{
24 unsigned int eax, t;
25
26 if (c->cpuid_level < 4)
27 return 1;
28
29 cpuid_count(4, 0, &eax, &t, &t, &t);
30
31 if (eax & 0x1f)
32 return ((eax >> 26) + 1);
33 else
34 return 1;
35}
36
37static void __cpuinit srat_detect_node(void)
38{
39#ifdef CONFIG_NUMA
40 unsigned node;
41 int cpu = smp_processor_id();
42 int apicid = hard_smp_processor_id();
43
44 /* Don't do the funky fallback heuristics the AMD version employs
45 for now. */
46 node = apicid_to_node[apicid];
47 if (node == NUMA_NO_NODE || !node_online(node))
48 node = first_node(node_online_map);
49 numa_set_node(cpu, node);
50
51 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
52#endif
53}
54
55static void __cpuinit init_intel(struct cpuinfo_x86 *c)
56{
57 init_intel_cacheinfo(c);
58 if (c->cpuid_level > 9) {
59 unsigned eax = cpuid_eax(10);
60 /* Check for version and the number of counters */
61 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
62 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
63 }
64
65 if (cpu_has_ds) {
66 unsigned int l1, l2;
67 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
68 if (!(l1 & (1<<11)))
69 set_cpu_cap(c, X86_FEATURE_BTS);
70 if (!(l1 & (1<<12)))
71 set_cpu_cap(c, X86_FEATURE_PEBS);
72 }
73
74
75 if (cpu_has_bts)
76 ds_init_intel(c);
77
78 if (c->x86 == 15)
79 c->x86_cache_alignment = c->x86_clflush_size * 2;
80 if (c->x86 == 6)
81 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
82 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
83 c->x86_max_cores = intel_num_cpu_cores(c);
84
85 srat_detect_node();
86}
87
88static struct cpu_dev intel_cpu_dev __cpuinitdata = {
89 .c_vendor = "Intel",
90 .c_ident = { "GenuineIntel" },
91 .c_early_init = early_init_intel,
92 .c_init = init_intel,
93};
94cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
95
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 2c8afafa18e8..3f46afbb1cf1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * Routines to identify caches on Intel CPU. 2 * Routines to identify caches on Intel CPU.
3 * 3 *
4 * Changes: 4 * Changes:
5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4) 5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. 6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. 7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
8 */ 8 */
@@ -13,6 +13,7 @@
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/pci.h>
16 17
17#include <asm/processor.h> 18#include <asm/processor.h>
18#include <asm/smp.h> 19#include <asm/smp.h>
@@ -130,9 +131,18 @@ struct _cpuid4_info {
130 union _cpuid4_leaf_ebx ebx; 131 union _cpuid4_leaf_ebx ebx;
131 union _cpuid4_leaf_ecx ecx; 132 union _cpuid4_leaf_ecx ecx;
132 unsigned long size; 133 unsigned long size;
134 unsigned long can_disable;
133 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ 135 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */
134}; 136};
135 137
138#ifdef CONFIG_PCI
139static struct pci_device_id k8_nb_id[] = {
140 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
141 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
142 {}
143};
144#endif
145
136unsigned short num_cache_leaves; 146unsigned short num_cache_leaves;
137 147
138/* AMD doesn't have CPUID4. Emulate it here to report the same 148/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -182,9 +192,10 @@ static unsigned short assocs[] __cpuinitdata = {
182static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; 192static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 };
183static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; 193static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 };
184 194
185static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, 195static void __cpuinit
186 union _cpuid4_leaf_ebx *ebx, 196amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
187 union _cpuid4_leaf_ecx *ecx) 197 union _cpuid4_leaf_ebx *ebx,
198 union _cpuid4_leaf_ecx *ecx)
188{ 199{
189 unsigned dummy; 200 unsigned dummy;
190 unsigned line_size, lines_per_tag, assoc, size_in_kb; 201 unsigned line_size, lines_per_tag, assoc, size_in_kb;
@@ -251,27 +262,40 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
251 (ebx->split.ways_of_associativity + 1) - 1; 262 (ebx->split.ways_of_associativity + 1) - 1;
252} 263}
253 264
254static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) 265static void __cpuinit
266amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
267{
268 if (index < 3)
269 return;
270 this_leaf->can_disable = 1;
271}
272
273static int
274__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
255{ 275{
256 union _cpuid4_leaf_eax eax; 276 union _cpuid4_leaf_eax eax;
257 union _cpuid4_leaf_ebx ebx; 277 union _cpuid4_leaf_ebx ebx;
258 union _cpuid4_leaf_ecx ecx; 278 union _cpuid4_leaf_ecx ecx;
259 unsigned edx; 279 unsigned edx;
260 280
261 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 281 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
262 amd_cpuid4(index, &eax, &ebx, &ecx); 282 amd_cpuid4(index, &eax, &ebx, &ecx);
263 else 283 if (boot_cpu_data.x86 >= 0x10)
264 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 284 amd_check_l3_disable(index, this_leaf);
285 } else {
286 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
287 }
288
265 if (eax.split.type == CACHE_TYPE_NULL) 289 if (eax.split.type == CACHE_TYPE_NULL)
266 return -EIO; /* better error ? */ 290 return -EIO; /* better error ? */
267 291
268 this_leaf->eax = eax; 292 this_leaf->eax = eax;
269 this_leaf->ebx = ebx; 293 this_leaf->ebx = ebx;
270 this_leaf->ecx = ecx; 294 this_leaf->ecx = ecx;
271 this_leaf->size = (ecx.split.number_of_sets + 1) * 295 this_leaf->size = (ecx.split.number_of_sets + 1) *
272 (ebx.split.coherency_line_size + 1) * 296 (ebx.split.coherency_line_size + 1) *
273 (ebx.split.physical_line_partition + 1) * 297 (ebx.split.physical_line_partition + 1) *
274 (ebx.split.ways_of_associativity + 1); 298 (ebx.split.ways_of_associativity + 1);
275 return 0; 299 return 0;
276} 300}
277 301
@@ -453,7 +477,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
453 477
454/* pointer to _cpuid4_info array (for each cache leaf) */ 478/* pointer to _cpuid4_info array (for each cache leaf) */
455static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); 479static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
456#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) 480#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y]))
457 481
458#ifdef CONFIG_SMP 482#ifdef CONFIG_SMP
459static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) 483static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
@@ -489,8 +513,8 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
489 int sibling; 513 int sibling;
490 514
491 this_leaf = CPUID4_INFO_IDX(cpu, index); 515 this_leaf = CPUID4_INFO_IDX(cpu, index);
492 for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) { 516 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
493 sibling_leaf = CPUID4_INFO_IDX(sibling, index); 517 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
494 cpu_clear(cpu, sibling_leaf->shared_cpu_map); 518 cpu_clear(cpu, sibling_leaf->shared_cpu_map);
495 } 519 }
496} 520}
@@ -572,7 +596,7 @@ struct _index_kobject {
572 596
573/* pointer to array of kobjects for cpuX/cache/indexY */ 597/* pointer to array of kobjects for cpuX/cache/indexY */
574static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); 598static DEFINE_PER_CPU(struct _index_kobject *, index_kobject);
575#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) 599#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y]))
576 600
577#define show_one_plus(file_name, object, val) \ 601#define show_one_plus(file_name, object, val) \
578static ssize_t show_##file_name \ 602static ssize_t show_##file_name \
@@ -637,6 +661,99 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
637 } 661 }
638} 662}
639 663
664#define to_object(k) container_of(k, struct _index_kobject, kobj)
665#define to_attr(a) container_of(a, struct _cache_attr, attr)
666
667#ifdef CONFIG_PCI
668static struct pci_dev *get_k8_northbridge(int node)
669{
670 struct pci_dev *dev = NULL;
671 int i;
672
673 for (i = 0; i <= node; i++) {
674 do {
675 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
676 if (!dev)
677 break;
678 } while (!pci_match_id(&k8_nb_id[0], dev));
679 if (!dev)
680 break;
681 }
682 return dev;
683}
684#else
685static struct pci_dev *get_k8_northbridge(int node)
686{
687 return NULL;
688}
689#endif
690
691static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
692{
693 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
694 struct pci_dev *dev = NULL;
695 ssize_t ret = 0;
696 int i;
697
698 if (!this_leaf->can_disable)
699 return sprintf(buf, "Feature not enabled\n");
700
701 dev = get_k8_northbridge(node);
702 if (!dev) {
703 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
704 return -EINVAL;
705 }
706
707 for (i = 0; i < 2; i++) {
708 unsigned int reg;
709
710 pci_read_config_dword(dev, 0x1BC + i * 4, &reg);
711
712 ret += sprintf(buf, "%sEntry: %d\n", buf, i);
713 ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n",
714 buf,
715 reg & 0x80000000 ? "Disabled" : "Allowed",
716 reg & 0x40000000 ? "Disabled" : "Allowed");
717 ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
718 buf, (reg & 0x30000) >> 16, reg & 0xfff);
719 }
720 return ret;
721}
722
723static ssize_t
724store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
725 size_t count)
726{
727 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
728 struct pci_dev *dev = NULL;
729 unsigned int ret, index, val;
730
731 if (!this_leaf->can_disable)
732 return 0;
733
734 if (strlen(buf) > 15)
735 return -EINVAL;
736
737 ret = sscanf(buf, "%x %x", &index, &val);
738 if (ret != 2)
739 return -EINVAL;
740 if (index > 1)
741 return -EINVAL;
742
743 val |= 0xc0000000;
744 dev = get_k8_northbridge(node);
745 if (!dev) {
746 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
747 return -EINVAL;
748 }
749
750 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
751 wbinvd();
752 pci_write_config_dword(dev, 0x1BC + index * 4, val);
753
754 return 1;
755}
756
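The new cache_disable attribute above exposes the two K8 L3 index-disable registers (northbridge config-space offsets 0x1BC and 0x1C0): reading dumps both entries, and writing expects "<index> <value>" in hex, with store_cache_disable() OR-ing in 0xc0000000 before programming the register. A hedged user-space sketch of driving it; the sysfs path is assumed from the cpuX/cache/indexY layout noted later in this file, and index3 is assumed to be the L3 leaf:

    #include <stdio.h>

    int main(void)
    {
        /* Assumed path: per-CPU cache leaf kobjects under
         * /sys/devices/system/cpu/cpuN/cache/indexM/. */
        const char *attr =
            "/sys/devices/system/cpu/cpu0/cache/index3/cache_disable";
        FILE *f = fopen(attr, "w");

        if (!f) {
            perror("cache_disable");
            return 1;
        }
        /* "<index> <value>": entry 0, hypothetical selector value. */
        fprintf(f, "0 0x1f\n");
        fclose(f);
        return 0;
    }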
640struct _cache_attr { 757struct _cache_attr {
641 struct attribute attr; 758 struct attribute attr;
642 ssize_t (*show)(struct _cpuid4_info *, char *); 759 ssize_t (*show)(struct _cpuid4_info *, char *);
@@ -657,6 +774,8 @@ define_one_ro(size);
657define_one_ro(shared_cpu_map); 774define_one_ro(shared_cpu_map);
658define_one_ro(shared_cpu_list); 775define_one_ro(shared_cpu_list);
659 776
777static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable);
778
660static struct attribute * default_attrs[] = { 779static struct attribute * default_attrs[] = {
661 &type.attr, 780 &type.attr,
662 &level.attr, 781 &level.attr,
@@ -667,12 +786,10 @@ static struct attribute * default_attrs[] = {
667 &size.attr, 786 &size.attr,
668 &shared_cpu_map.attr, 787 &shared_cpu_map.attr,
669 &shared_cpu_list.attr, 788 &shared_cpu_list.attr,
789 &cache_disable.attr,
670 NULL 790 NULL
671}; 791};
672 792
673#define to_object(k) container_of(k, struct _index_kobject, kobj)
674#define to_attr(a) container_of(a, struct _cache_attr, attr)
675
676static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) 793static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
677{ 794{
678 struct _cache_attr *fattr = to_attr(attr); 795 struct _cache_attr *fattr = to_attr(attr);
@@ -682,14 +799,22 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
682 ret = fattr->show ? 799 ret = fattr->show ?
683 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 800 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
684 buf) : 801 buf) :
685 0; 802 0;
686 return ret; 803 return ret;
687} 804}
688 805
689static ssize_t store(struct kobject * kobj, struct attribute * attr, 806static ssize_t store(struct kobject * kobj, struct attribute * attr,
690 const char * buf, size_t count) 807 const char * buf, size_t count)
691{ 808{
692 return 0; 809 struct _cache_attr *fattr = to_attr(attr);
810 struct _index_kobject *this_leaf = to_object(kobj);
811 ssize_t ret;
812
813 ret = fattr->store ?
814 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
815 buf, count) :
816 0;
817 return ret;
693} 818}
694 819
695static struct sysfs_ops sysfs_ops = { 820static struct sysfs_ops sysfs_ops = {
@@ -780,15 +905,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
780 } 905 }
781 kobject_put(per_cpu(cache_kobject, cpu)); 906 kobject_put(per_cpu(cache_kobject, cpu));
782 cpuid4_cache_sysfs_exit(cpu); 907 cpuid4_cache_sysfs_exit(cpu);
783 break; 908 return retval;
784 } 909 }
785 kobject_uevent(&(this_object->kobj), KOBJ_ADD); 910 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
786 } 911 }
787 if (!retval) 912 cpu_set(cpu, cache_dev_map);
788 cpu_set(cpu, cache_dev_map);
789 913
790 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 914 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
791 return retval; 915 return 0;
792} 916}
793 917
794static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) 918static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index f390c9f66351..dd3af6e7b39a 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Athlon/Hammer specific Machine Check Exception Reporting 2 * Athlon specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Dave Jones <davej@codemonkey.org.uk> 3 * (C) Copyright 2002 Dave Jones <davej@redhat.com>
4 */ 4 */
5 5
6#include <linux/init.h> 6#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index 774d87cfd8cd..0ebf3fc6a610 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * mce.c - x86 Machine Check Exception Reporting 2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk> 3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@redhat.com>
4 */ 4 */
5 5
6#include <linux/init.h> 6#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index c4a7ec31394c..4b031a4ac856 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -580,7 +580,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
580 char __user *buf = ubuf; 580 char __user *buf = ubuf;
581 int i, err; 581 int i, err;
582 582
583 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); 583 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
584 if (!cpu_tsc) 584 if (!cpu_tsc)
585 return -ENOMEM; 585 return -ENOMEM;
586 586
@@ -759,13 +759,18 @@ static struct sysdev_class mce_sysclass = {
759}; 759};
760 760
761DEFINE_PER_CPU(struct sys_device, device_mce); 761DEFINE_PER_CPU(struct sys_device, device_mce);
762void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
762 763
763/* Why are there no generic functions for this? */ 764/* Why are there no generic functions for this? */
764#define ACCESSOR(name, var, start) \ 765#define ACCESSOR(name, var, start) \
765 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ 766 static ssize_t show_ ## name(struct sys_device *s, \
767 struct sysdev_attribute *attr, \
768 char *buf) { \
766 return sprintf(buf, "%lx\n", (unsigned long)var); \ 769 return sprintf(buf, "%lx\n", (unsigned long)var); \
767 } \ 770 } \
768 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ 771 static ssize_t set_ ## name(struct sys_device *s, \
772 struct sysdev_attribute *attr, \
773 const char *buf, size_t siz) { \
769 char *end; \ 774 char *end; \
770 unsigned long new = simple_strtoul(buf, &end, 0); \ 775 unsigned long new = simple_strtoul(buf, &end, 0); \
771 if (end == buf) return -EINVAL; \ 776 if (end == buf) return -EINVAL; \
@@ -786,14 +791,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart())
786ACCESSOR(bank4ctl,bank[4],mce_restart()) 791ACCESSOR(bank4ctl,bank[4],mce_restart())
787ACCESSOR(bank5ctl,bank[5],mce_restart()) 792ACCESSOR(bank5ctl,bank[5],mce_restart())
788 793
789static ssize_t show_trigger(struct sys_device *s, char *buf) 794static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
795 char *buf)
790{ 796{
791 strcpy(buf, trigger); 797 strcpy(buf, trigger);
792 strcat(buf, "\n"); 798 strcat(buf, "\n");
793 return strlen(trigger) + 1; 799 return strlen(trigger) + 1;
794} 800}
795 801
796static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) 802static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
803 const char *buf,size_t siz)
797{ 804{
798 char *p; 805 char *p;
799 int len; 806 int len;
@@ -806,12 +813,12 @@ static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
806} 813}
807 814
808static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 815static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
809ACCESSOR(tolerant,tolerant,) 816static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
810ACCESSOR(check_interval,check_interval,mce_restart()) 817ACCESSOR(check_interval,check_interval,mce_restart())
811static struct sysdev_attribute *mce_attributes[] = { 818static struct sysdev_attribute *mce_attributes[] = {
812 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, 819 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
813 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, 820 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
814 &attr_tolerant, &attr_check_interval, &attr_trigger, 821 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
815 NULL 822 NULL
816}; 823};
817 824
@@ -853,7 +860,7 @@ error:
853 return err; 860 return err;
854} 861}
855 862
856static void mce_remove_device(unsigned int cpu) 863static __cpuinit void mce_remove_device(unsigned int cpu)
857{ 864{
858 int i; 865 int i;
859 866
@@ -877,9 +884,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
877 case CPU_ONLINE: 884 case CPU_ONLINE:
878 case CPU_ONLINE_FROZEN: 885 case CPU_ONLINE_FROZEN:
879 mce_create_device(cpu); 886 mce_create_device(cpu);
887 if (threshold_cpu_callback)
888 threshold_cpu_callback(action, cpu);
880 break; 889 break;
881 case CPU_DEAD: 890 case CPU_DEAD:
882 case CPU_DEAD_FROZEN: 891 case CPU_DEAD_FROZEN:
892 if (threshold_cpu_callback)
893 threshold_cpu_callback(action, cpu);
883 mce_remove_device(cpu); 894 mce_remove_device(cpu);
884 break; 895 break;
885 } 896 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 7c9a813e1193..5eb390a4b2e9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -527,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
527 if (err) 527 if (err)
528 goto out_free; 528 goto out_free;
529 529
530 for_each_cpu_mask(i, b->cpus) { 530 for_each_cpu_mask_nr(i, b->cpus) {
531 if (i == cpu) 531 if (i == cpu)
532 continue; 532 continue;
533 533
@@ -617,7 +617,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
617#endif 617#endif
618 618
619 /* remove all sibling symlinks before unregistering */ 619 /* remove all sibling symlinks before unregistering */
620 for_each_cpu_mask(i, b->cpus) { 620 for_each_cpu_mask_nr(i, b->cpus) {
621 if (i == cpu) 621 if (i == cpu)
622 continue; 622 continue;
623 623
@@ -628,6 +628,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
628 deallocate_threshold_block(cpu, bank); 628 deallocate_threshold_block(cpu, bank);
629 629
630free_out: 630free_out:
631 kobject_del(b->kobj);
631 kobject_put(b->kobj); 632 kobject_put(b->kobj);
632 kfree(b); 633 kfree(b);
633 per_cpu(threshold_banks, cpu)[bank] = NULL; 634 per_cpu(threshold_banks, cpu)[bank] = NULL;
@@ -645,14 +646,11 @@ static void threshold_remove_device(unsigned int cpu)
645} 646}
646 647
647/* get notified when a cpu comes on/off */ 648/* get notified when a cpu comes on/off */
648static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, 649static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action,
649 unsigned long action, void *hcpu) 650 unsigned int cpu)
650{ 651{
651 /* cpu was unsigned int to begin with */
652 unsigned int cpu = (unsigned long)hcpu;
653
654 if (cpu >= NR_CPUS) 652 if (cpu >= NR_CPUS)
655 goto out; 653 return;
656 654
657 switch (action) { 655 switch (action) {
658 case CPU_ONLINE: 656 case CPU_ONLINE:
@@ -666,14 +664,8 @@ static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
666 default: 664 default:
667 break; 665 break;
668 } 666 }
669 out:
670 return NOTIFY_OK;
671} 667}
672 668
673static struct notifier_block threshold_cpu_notifier __cpuinitdata = {
674 .notifier_call = threshold_cpu_callback,
675};
676
677static __init int threshold_init_device(void) 669static __init int threshold_init_device(void)
678{ 670{
679 unsigned lcpu = 0; 671 unsigned lcpu = 0;
@@ -684,7 +676,7 @@ static __init int threshold_init_device(void)
684 if (err) 676 if (err)
685 return err; 677 return err;
686 } 678 }
687 register_hotcpu_notifier(&threshold_cpu_notifier); 679 threshold_cpu_callback = amd_64_threshold_cpu_callback;
688 return 0; 680 return 0;
689} 681}
690 682
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index cc1fccdd31e0..a74af128efc9 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Non Fatal Machine Check Exception Reporting 2 * Non Fatal Machine Check Exception Reporting
3 * 3 *
4 * (C) Copyright 2002 Dave Jones. <davej@codemonkey.org.uk> 4 * (C) Copyright 2002 Dave Jones. <davej@redhat.com>
5 * 5 *
6 * This file contains routines to check for non-fatal MCEs every 15s 6 * This file contains routines to check for non-fatal MCEs every 15s
7 * 7 *
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index eef001ad3bde..9b60fce09f75 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
102 /* The temperature transition interrupt handler setup */ 102 /* The temperature transition interrupt handler setup */
103 h = THERMAL_APIC_VECTOR; /* our delivery vector */ 103 h = THERMAL_APIC_VECTOR; /* our delivery vector */
104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ 104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
105 apic_write_around(APIC_LVTTHMR, h); 105 apic_write(APIC_LVTTHMR, h);
106 106
107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); 108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
@@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); 114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
115 115
116 l = apic_read(APIC_LVTTHMR); 116 l = apic_read(APIC_LVTTHMR);
117 apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); 118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
119 119
120 /* enable thermal throttle processing */ 120 /* enable thermal throttle processing */
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 1f4cc48c14c6..d5ae2243f0b9 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0);
35 35
36#define define_therm_throt_sysdev_show_func(name) \ 36#define define_therm_throt_sysdev_show_func(name) \
37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
38 struct sysdev_attribute *attr, \
38 char *buf) \ 39 char *buf) \
39{ \ 40{ \
40 unsigned int cpu = dev->id; \ 41 unsigned int cpu = dev->id; \
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
new file mode 100644
index 000000000000..dfea390e1608
--- /dev/null
+++ b/arch/x86/kernel/cpu/mkcapflags.pl
@@ -0,0 +1,32 @@
1#!/usr/bin/perl
2#
3# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
4#
5
6($in, $out) = @ARGV;
7
8open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n";
9open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
10
11print OUT "#include <asm/cpufeature.h>\n\n";
12print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
13
14while (defined($line = <IN>)) {
15 if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
16 $macro = $1;
17 $feature = $2;
18 $tail = $3;
19 if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
20 $feature = $1;
21 }
22
23 if ($feature ne '') {
24 printf OUT "\t%-32s = \"%s\",\n",
25 "[$macro]", "\L$feature";
26 }
27 }
28}
29print OUT "};\n";
30
31close(IN);
32close(OUT);
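The script above emits a plain C array keyed by the X86_FEATURE_* macros it finds in cpufeature.h, preferring a quoted name from a trailing /* "name" */ comment and lower-casing whatever it picks. A hypothetical excerpt of the generated file (these particular entries are illustrative, derived from the printf format in the script):

    #include <asm/cpufeature.h>

    const char * const x86_cap_flags[NCAPINTS*32] = {
        [X86_FEATURE_FPU]              = "fpu",
        [X86_FEATURE_PSE]              = "pse",
        [X86_FEATURE_CONSTANT_TSC]     = "constant_tsc",
        /* ...one entry per #define the regex matched... */
    };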
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 509bd3d9eacd..4e8d77f01eeb 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -379,6 +379,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
379 unsigned long *size, mtrr_type *type) 379 unsigned long *size, mtrr_type *type)
380{ 380{
381 unsigned int mask_lo, mask_hi, base_lo, base_hi; 381 unsigned int mask_lo, mask_hi, base_lo, base_hi;
382 unsigned int tmp, hi;
382 383
383 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 384 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
384 if ((mask_lo & 0x800) == 0) { 385 if ((mask_lo & 0x800) == 0) {
@@ -392,8 +393,18 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
392 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); 393 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
393 394
394 /* Work out the shifted address mask. */ 395 /* Work out the shifted address mask. */
395 mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) 396 tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
396 | mask_lo >> PAGE_SHIFT; 397 mask_lo = size_or_mask | tmp;
398 /* Expand tmp with high bits to all 1s*/
399 hi = fls(tmp);
400 if (hi > 0) {
401 tmp |= ~((1<<(hi - 1)) - 1);
402
403 if (tmp != mask_lo) {
404 WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
405 mask_lo = tmp;
406 }
407 }
397 408
398 /* This works correctly if size is a power of two, i.e. a 409 /* This works correctly if size is a power of two, i.e. a
399 contiguous range. */ 410 contiguous range. */
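The fixup above builds the page-frame mask from mask_hi:mask_lo, finds its most significant set bit with fls(), and forces every bit above it to 1, so a BIOS that programmed too few high mask bits still yields a contiguous mask. A stand-alone sketch of the arithmetic with a made-up truncated mask; the local fls32() mirrors the kernel fls() semantics (1-based bit position, 0 for zero input):

    #include <stdio.h>

    static int fls32(unsigned int x)
    {
        int r = 0;

        while (x) {
            r++;
            x >>= 1;
        }
        return r;
    }

    int main(void)
    {
        unsigned int tmp = 0x00ff0000u;   /* hypothetical truncated mask */
        int hi = fls32(tmp);

        if (hi > 0)
            tmp |= ~((1u << (hi - 1)) - 1);   /* fill bits above the MSB */

        printf("fixed-up mask: %#x\n", tmp);  /* -> 0xffff0000 */
        return 0;
    }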
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 84c480bb3715..4c4214690dd1 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
405 } 405 }
406 /* RED-PEN: base can be > 32bit */ 406 /* RED-PEN: base can be > 32bit */
407 len += seq_printf(seq, 407 len += seq_printf(seq,
408 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", 408 "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
409 i, base, base >> (20 - PAGE_SHIFT), size, factor, 409 i, base, base >> (20 - PAGE_SHIFT), size, factor,
410 mtrr_attrib_to_str(type), mtrr_usage_table[i]); 410 mtrr_usage_table[i], mtrr_attrib_to_str(type));
411 } 411 }
412 } 412 }
413 return 0; 413 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 6f23969c8faf..c78c04821ea1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -729,7 +729,7 @@ struct var_mtrr_range_state {
729 mtrr_type type; 729 mtrr_type type;
730}; 730};
731 731
732struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; 732static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
733static int __initdata debug_print; 733static int __initdata debug_print;
734 734
735static int __init 735static int __init
@@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
759 /* take out UC ranges */ 759 /* take out UC ranges */
760 for (i = 0; i < num_var_ranges; i++) { 760 for (i = 0; i < num_var_ranges; i++) {
761 type = range_state[i].type; 761 type = range_state[i].type;
762 if (type != MTRR_TYPE_UNCACHABLE) 762 if (type != MTRR_TYPE_UNCACHABLE &&
763 type != MTRR_TYPE_WRPROT)
763 continue; 764 continue;
764 size = range_state[i].size_pfn; 765 size = range_state[i].size_pfn;
765 if (!size) 766 if (!size)
@@ -834,7 +835,14 @@ static int __init enable_mtrr_cleanup_setup(char *str)
834 enable_mtrr_cleanup = 1; 835 enable_mtrr_cleanup = 1;
835 return 0; 836 return 0;
836} 837}
837early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); 838early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
839
840static int __init mtrr_cleanup_debug_setup(char *str)
841{
842 debug_print = 1;
843 return 0;
844}
845early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
838 846
839struct var_mtrr_state { 847struct var_mtrr_state {
840 unsigned long range_startk; 848 unsigned long range_startk;
@@ -898,6 +906,27 @@ set_var_mtrr_all(unsigned int address_bits)
898 } 906 }
899} 907}
900 908
909static unsigned long to_size_factor(unsigned long sizek, char *factorp)
910{
911 char factor;
912 unsigned long base = sizek;
913
914 if (base & ((1<<10) - 1)) {
915 /* not MB alignment */
916 factor = 'K';
917 } else if (base & ((1<<20) - 1)){
918 factor = 'M';
919 base >>= 10;
920 } else {
921 factor = 'G';
922 base >>= 20;
923 }
924
925 *factorp = factor;
926
927 return base;
928}
929
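to_size_factor() above simply picks the largest unit (K, M, G) that divides the KB count evenly, which is what the new MTRR debug messages print. A quick stand-alone copy of the same logic with a few arbitrary example sizes:

    #include <stdio.h>

    /* Same logic as the patch's to_size_factor(): KB in, scaled value out,
     * unit letter returned via *factorp. */
    static unsigned long to_size_factor(unsigned long sizek, char *factorp)
    {
        unsigned long base = sizek;

        if (base & ((1 << 10) - 1)) {
            *factorp = 'K';          /* not MB aligned */
        } else if (base & ((1 << 20) - 1)) {
            *factorp = 'M';
            base >>= 10;
        } else {
            *factorp = 'G';
            base >>= 20;
        }
        return base;
    }

    int main(void)
    {
        unsigned long sizes[] = { 64, 2048, 1048576 };   /* KB */
        char f;
        int i;

        for (i = 0; i < 3; i++)
            printf("%lu KB -> %lu%cB\n", sizes[i],
                   to_size_factor(sizes[i], &f), f);
        /* prints: 64 KB -> 64KB, 2048 KB -> 2MB, 1048576 KB -> 1GB */
        return 0;
    }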
901static unsigned int __init 930static unsigned int __init
902range_to_mtrr(unsigned int reg, unsigned long range_startk, 931range_to_mtrr(unsigned int reg, unsigned long range_startk,
903 unsigned long range_sizek, unsigned char type) 932 unsigned long range_sizek, unsigned char type)
@@ -919,13 +948,21 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
919 align = max_align; 948 align = max_align;
920 949
921 sizek = 1 << align; 950 sizek = 1 << align;
922 if (debug_print) 951 if (debug_print) {
952 char start_factor = 'K', size_factor = 'K';
953 unsigned long start_base, size_base;
954
955 start_base = to_size_factor(range_startk, &start_factor),
956 size_base = to_size_factor(sizek, &size_factor),
957
923 printk(KERN_DEBUG "Setting variable MTRR %d, " 958 printk(KERN_DEBUG "Setting variable MTRR %d, "
924 "base: %ldMB, range: %ldMB, type %s\n", 959 "base: %ld%cB, range: %ld%cB, type %s\n",
925 reg, range_startk >> 10, sizek >> 10, 960 reg, start_base, start_factor,
961 size_base, size_factor,
926 (type == MTRR_TYPE_UNCACHABLE)?"UC": 962 (type == MTRR_TYPE_UNCACHABLE)?"UC":
927 ((type == MTRR_TYPE_WRBACK)?"WB":"Other") 963 ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
928 ); 964 );
965 }
929 save_var_mtrr(reg++, range_startk, sizek, type); 966 save_var_mtrr(reg++, range_startk, sizek, type);
930 range_startk += sizek; 967 range_startk += sizek;
931 range_sizek -= sizek; 968 range_sizek -= sizek;
@@ -970,6 +1007,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
970 /* try to append some small hole */ 1007 /* try to append some small hole */
971 range0_basek = state->range_startk; 1008 range0_basek = state->range_startk;
972 range0_sizek = ALIGN(state->range_sizek, chunk_sizek); 1009 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
1010
1011 /* no increase */
973 if (range0_sizek == state->range_sizek) { 1012 if (range0_sizek == state->range_sizek) {
974 if (debug_print) 1013 if (debug_print)
975 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", 1014 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
@@ -980,13 +1019,40 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
980 return 0; 1019 return 0;
981 } 1020 }
982 1021
983 range0_sizek -= chunk_sizek; 1022 /* only cut back, when it is not the last */
984 if (range0_sizek && sizek) { 1023 if (sizek) {
985 while (range0_basek + range0_sizek > (basek + sizek)) { 1024 while (range0_basek + range0_sizek > (basek + sizek)) {
986 range0_sizek -= chunk_sizek; 1025 if (range0_sizek >= chunk_sizek)
987 if (!range0_sizek) 1026 range0_sizek -= chunk_sizek;
988 break; 1027 else
989 } 1028 range0_sizek = 0;
1029
1030 if (!range0_sizek)
1031 break;
1032 }
1033 }
1034
1035second_try:
1036 range_basek = range0_basek + range0_sizek;
1037
1038 /* one hole in the middle */
1039 if (range_basek > basek && range_basek <= (basek + sizek))
1040 second_sizek = range_basek - basek;
1041
1042 if (range0_sizek > state->range_sizek) {
1043
1044 /* one hole in middle or at end */
1045 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
1046
1047 /* hole size should be less than half of range0 size */
1048 if (hole_sizek >= (range0_sizek >> 1) &&
1049 range0_sizek >= chunk_sizek) {
1050 range0_sizek -= chunk_sizek;
1051 second_sizek = 0;
1052 hole_sizek = 0;
1053
1054 goto second_try;
1055 }
990 } 1056 }
991 1057
992 if (range0_sizek) { 1058 if (range0_sizek) {
@@ -996,50 +1062,28 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
996 (range0_basek + range0_sizek)<<10); 1062 (range0_basek + range0_sizek)<<10);
997 state->reg = range_to_mtrr(state->reg, range0_basek, 1063 state->reg = range_to_mtrr(state->reg, range0_basek,
998 range0_sizek, MTRR_TYPE_WRBACK); 1064 range0_sizek, MTRR_TYPE_WRBACK);
999
1000 }
1001
1002 range_basek = range0_basek + range0_sizek;
1003 range_sizek = chunk_sizek;
1004
1005 if (range_basek + range_sizek > basek &&
1006 range_basek + range_sizek <= (basek + sizek)) {
1007 /* one hole */
1008 second_basek = basek;
1009 second_sizek = range_basek + range_sizek - basek;
1010 } 1065 }
1011 1066
1012 /* if last piece, only could one hole near end */ 1067 if (range0_sizek < state->range_sizek) {
1013 if ((second_basek || !basek) && 1068 /* need to handle left over */
1014 range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
1015 (chunk_sizek >> 1)) {
1016 /*
1017 * one hole in middle (second_sizek is 0) or at end
1018 * (second_sizek is 0 )
1019 */
1020 hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
1021 - second_sizek;
1022 hole_basek = range_basek + range_sizek - hole_sizek
1023 - second_sizek;
1024 } else {
1025 /* fallback for big hole, or several holes */
1026 range_sizek = state->range_sizek - range0_sizek; 1069 range_sizek = state->range_sizek - range0_sizek;
1027 second_basek = 0; 1070
1028 second_sizek = 0; 1071 if (debug_print)
1072 printk(KERN_DEBUG "range: %016lx - %016lx\n",
1073 range_basek<<10,
1074 (range_basek + range_sizek)<<10);
1075 state->reg = range_to_mtrr(state->reg, range_basek,
1076 range_sizek, MTRR_TYPE_WRBACK);
1029 } 1077 }
1030 1078
1031 if (debug_print)
1032 printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
1033 (range_basek + range_sizek)<<10);
1034 state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
1035 MTRR_TYPE_WRBACK);
1036 if (hole_sizek) { 1079 if (hole_sizek) {
1080 hole_basek = range_basek - hole_sizek - second_sizek;
1037 if (debug_print) 1081 if (debug_print)
1038 printk(KERN_DEBUG "hole: %016lx - %016lx\n", 1082 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
1039 hole_basek<<10, (hole_basek + hole_sizek)<<10); 1083 hole_basek<<10,
1040 state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, 1084 (hole_basek + hole_sizek)<<10);
1041 MTRR_TYPE_UNCACHABLE); 1085 state->reg = range_to_mtrr(state->reg, hole_basek,
1042 1086 hole_sizek, MTRR_TYPE_UNCACHABLE);
1043 } 1087 }
1044 1088
1045 return second_sizek; 1089 return second_sizek;
@@ -1154,11 +1198,11 @@ struct mtrr_cleanup_result {
1154}; 1198};
1155 1199
1156/* 1200/*
1157 * gran_size: 1M, 2M, ..., 2G 1201 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
1158 * chunk size: gran_size, ..., 4G 1202 * chunk size: gran_size, ..., 2G
1159 * so we need (2+13)*6 1203 * so we need (1+16)*8
1160 */ 1204 */
1161#define NUM_RESULT 90 1205#define NUM_RESULT 136
1162#define PSHIFT (PAGE_SHIFT - 10) 1206#define PSHIFT (PAGE_SHIFT - 10)
1163 1207
1164static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; 1208static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
@@ -1168,13 +1212,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1168static int __init mtrr_cleanup(unsigned address_bits) 1212static int __init mtrr_cleanup(unsigned address_bits)
1169{ 1213{
1170 unsigned long extra_remove_base, extra_remove_size; 1214 unsigned long extra_remove_base, extra_remove_size;
1171 unsigned long i, base, size, def, dummy; 1215 unsigned long base, size, def, dummy;
1172 mtrr_type type; 1216 mtrr_type type;
1173 int nr_range, nr_range_new; 1217 int nr_range, nr_range_new;
1174 u64 chunk_size, gran_size; 1218 u64 chunk_size, gran_size;
1175 unsigned long range_sums, range_sums_new; 1219 unsigned long range_sums, range_sums_new;
1176 int index_good; 1220 int index_good;
1177 int num_reg_good; 1221 int num_reg_good;
1222 int i;
1178 1223
1179 /* extra one for all 0 */ 1224 /* extra one for all 0 */
1180 int num[MTRR_NUM_TYPES + 1]; 1225 int num[MTRR_NUM_TYPES + 1];
@@ -1204,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
1204 continue; 1249 continue;
1205 if (!size) 1250 if (!size)
1206 type = MTRR_NUM_TYPES; 1251 type = MTRR_NUM_TYPES;
1252 if (type == MTRR_TYPE_WRPROT)
1253 type = MTRR_TYPE_UNCACHABLE;
1207 num[type]++; 1254 num[type]++;
1208 } 1255 }
1209 1256
@@ -1216,23 +1263,57 @@ static int __init mtrr_cleanup(unsigned address_bits)
1216 num_var_ranges - num[MTRR_NUM_TYPES]) 1263 num_var_ranges - num[MTRR_NUM_TYPES])
1217 return 0; 1264 return 0;
1218 1265
1266 /* print original var MTRRs at first, for debugging: */
1267 printk(KERN_DEBUG "original variable MTRRs\n");
1268 for (i = 0; i < num_var_ranges; i++) {
1269 char start_factor = 'K', size_factor = 'K';
1270 unsigned long start_base, size_base;
1271
1272 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1273 if (!size_base)
1274 continue;
1275
1276 size_base = to_size_factor(size_base, &size_factor),
1277 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1278 start_base = to_size_factor(start_base, &start_factor),
1279 type = range_state[i].type;
1280
1281 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1282 i, start_base, start_factor,
1283 size_base, size_factor,
1284 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1285 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1286 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1287 );
1288 }
1289
1219 memset(range, 0, sizeof(range)); 1290 memset(range, 0, sizeof(range));
1220 extra_remove_size = 0; 1291 extra_remove_size = 0;
1221 if (mtrr_tom2) { 1292 extra_remove_base = 1 << (32 - PAGE_SHIFT);
1222 extra_remove_base = 1 << (32 - PAGE_SHIFT); 1293 if (mtrr_tom2)
1223 extra_remove_size = 1294 extra_remove_size =
1224 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; 1295 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
1225 }
1226 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, 1296 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
1227 extra_remove_size); 1297 extra_remove_size);
1298 /*
 1299 * [0, 1M) should always be covered by var mtrr with WB
 1300 * and fixed mtrrs should take effect before var mtrr for it
1301 */
1302 nr_range = add_range_with_merge(range, nr_range, 0,
1303 (1ULL<<(20 - PAGE_SHIFT)) - 1);
1304 /* sort the ranges */
1305 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
1306
1228 range_sums = sum_ranges(range, nr_range); 1307 range_sums = sum_ranges(range, nr_range);
1229 printk(KERN_INFO "total RAM coverred: %ldM\n", 1308 printk(KERN_INFO "total RAM coverred: %ldM\n",
1230 range_sums >> (20 - PAGE_SHIFT)); 1309 range_sums >> (20 - PAGE_SHIFT));
1231 1310
1232 if (mtrr_chunk_size && mtrr_gran_size) { 1311 if (mtrr_chunk_size && mtrr_gran_size) {
1233 int num_reg; 1312 int num_reg;
1313 char gran_factor, chunk_factor, lose_factor;
1314 unsigned long gran_base, chunk_base, lose_base;
1234 1315
1235 debug_print = 1; 1316 debug_print++;
1236 /* convert ranges to var ranges state */ 1317 /* convert ranges to var ranges state */
1237 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, 1318 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
1238 mtrr_gran_size); 1319 mtrr_gran_size);
@@ -1256,34 +1337,48 @@ static int __init mtrr_cleanup(unsigned address_bits)
1256 result[i].lose_cover_sizek = 1337 result[i].lose_cover_sizek =
1257 (range_sums - range_sums_new) << PSHIFT; 1338 (range_sums - range_sums_new) << PSHIFT;
1258 1339
1259 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", 1340 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1260 result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10, 1341 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1261 result[i].chunk_sizek >> 10); 1342 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1262 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", 1343 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1344 result[i].bad?"*BAD*":" ",
1345 gran_base, gran_factor, chunk_base, chunk_factor);
1346 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1263 result[i].num_reg, result[i].bad?"-":"", 1347 result[i].num_reg, result[i].bad?"-":"",
1264 result[i].lose_cover_sizek >> 10); 1348 lose_base, lose_factor);
1265 if (!result[i].bad) { 1349 if (!result[i].bad) {
1266 set_var_mtrr_all(address_bits); 1350 set_var_mtrr_all(address_bits);
1267 return 1; 1351 return 1;
1268 } 1352 }
1269 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " 1353 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1270 "will find optimal one\n"); 1354 "will find optimal one\n");
1271 debug_print = 0; 1355 debug_print--;
1272 memset(result, 0, sizeof(result[0])); 1356 memset(result, 0, sizeof(result[0]));
1273 } 1357 }
1274 1358
1275 i = 0; 1359 i = 0;
1276 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); 1360 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1277 memset(result, 0, sizeof(result)); 1361 memset(result, 0, sizeof(result));
1278 for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { 1362 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1279 for (chunk_size = gran_size; chunk_size < (1ULL<<33); 1363 char gran_factor;
1364 unsigned long gran_base;
1365
1366 if (debug_print)
1367 gran_base = to_size_factor(gran_size >> 10, &gran_factor);
1368
1369 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1280 chunk_size <<= 1) { 1370 chunk_size <<= 1) {
1281 int num_reg; 1371 int num_reg;
1282 1372
1283 if (debug_print) 1373 if (debug_print) {
1284 printk(KERN_INFO 1374 char chunk_factor;
1285 "\ngran_size: %lldM chunk_size_size: %lldM\n", 1375 unsigned long chunk_base;
1286 gran_size >> 20, chunk_size >> 20); 1376
1377 chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
1378 printk(KERN_INFO "\n");
1379 printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
1380 gran_base, gran_factor, chunk_base, chunk_factor);
1381 }
1287 if (i >= NUM_RESULT) 1382 if (i >= NUM_RESULT)
1288 continue; 1383 continue;
1289 1384
@@ -1326,12 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits)
1326 1421
1327 /* print out all */ 1422 /* print out all */
1328 for (i = 0; i < NUM_RESULT; i++) { 1423 for (i = 0; i < NUM_RESULT; i++) {
1329 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", 1424 char gran_factor, chunk_factor, lose_factor;
1330 result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10, 1425 unsigned long gran_base, chunk_base, lose_base;
1331 result[i].chunk_sizek >> 10); 1426
1332 printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n", 1427 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1333 result[i].num_reg, result[i].bad?"-":"", 1428 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1334 result[i].lose_cover_sizek >> 10); 1429 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1430 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1431 result[i].bad?"*BAD*":" ",
1432 gran_base, gran_factor, chunk_base, chunk_factor);
1433 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1434 result[i].num_reg, result[i].bad?"-":"",
1435 lose_base, lose_factor);
1335 } 1436 }
1336 1437
1337 /* try to find the optimal index */ 1438 /* try to find the optimal index */
@@ -1339,10 +1440,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
1339 nr_mtrr_spare_reg = num_var_ranges - 1; 1440 nr_mtrr_spare_reg = num_var_ranges - 1;
1340 num_reg_good = -1; 1441 num_reg_good = -1;
1341 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { 1442 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1342 if (!min_loss_pfn[i]) { 1443 if (!min_loss_pfn[i])
1343 num_reg_good = i; 1444 num_reg_good = i;
1344 break;
1345 }
1346 } 1445 }
1347 1446
1348 index_good = -1; 1447 index_good = -1;
@@ -1358,21 +1457,26 @@ static int __init mtrr_cleanup(unsigned address_bits)
1358 } 1457 }
1359 1458
1360 if (index_good != -1) { 1459 if (index_good != -1) {
1460 char gran_factor, chunk_factor, lose_factor;
1461 unsigned long gran_base, chunk_base, lose_base;
1462
1361 printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); 1463 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1362 i = index_good; 1464 i = index_good;
1363 printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", 1465 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1364 result[i].gran_sizek >> 10, 1466 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1365 result[i].chunk_sizek >> 10); 1467 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1366 printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n", 1468 printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
1367 result[i].num_reg, 1469 gran_base, gran_factor, chunk_base, chunk_factor);
1368 result[i].lose_cover_sizek >> 10); 1470 printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
1471 result[i].num_reg, lose_base, lose_factor);
1369 /* convert ranges to var ranges state */ 1472 /* convert ranges to var ranges state */
1370 chunk_size = result[i].chunk_sizek; 1473 chunk_size = result[i].chunk_sizek;
1371 chunk_size <<= 10; 1474 chunk_size <<= 10;
1372 gran_size = result[i].gran_sizek; 1475 gran_size = result[i].gran_sizek;
1373 gran_size <<= 10; 1476 gran_size <<= 10;
1374 debug_print = 1; 1477 debug_print++;
1375 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); 1478 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1479 debug_print--;
1376 set_var_mtrr_all(address_bits); 1480 set_var_mtrr_all(address_bits);
1377 return 1; 1481 return 1;
1378 } 1482 }
@@ -1496,11 +1600,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1496 1600
1497 /* kvm/qemu doesn't have mtrr set right, don't trim them all */ 1601 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
1498 if (!highest_pfn) { 1602 if (!highest_pfn) {
1499 if (!kvm_para_available()) { 1603 WARN(!kvm_para_available(), KERN_WARNING
1500 printk(KERN_WARNING
1501 "WARNING: strange, CPU MTRRs all blank?\n"); 1604 "WARNING: strange, CPU MTRRs all blank?\n");
1502 WARN_ON(1);
1503 }
1504 return 0; 1605 return 0;
1505 } 1606 }
1506 1607
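Editor's note: the mtrr_cleanup() printouts above stop hard-coding megabytes (">> 10") and instead print a base value plus a unit factor returned by to_size_factor(), which is presumably added in an earlier hunk of this patch (not shown here). As a rough standalone illustration only — not the kernel's helper — the shape of such a conversion is:

	#include <stdio.h>

	/* Illustration only: convert a size in KiB into a value plus a
	 * unit suffix ('K', 'M' or 'G'), mimicking the gran/chunk/lose
	 * printouts in the hunk above. */
	static unsigned long to_size_factor(unsigned long sizek, char *factor)
	{
		unsigned long base = sizek;

		if (base & ((1 << 10) - 1)) {
			*factor = 'K';		/* not a whole number of MiB */
		} else if (base & ((1 << 20) - 1)) {
			*factor = 'M';		/* whole MiB, not whole GiB */
			base >>= 10;
		} else {
			*factor = 'G';
			base >>= 20;
		}
		return base;
	}

	int main(void)
	{
		unsigned long sizes[] = { 512, 64UL << 10, 2UL << 20 };

		for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
			char f;
			unsigned long b = to_size_factor(sizes[i], &f);
			printf("gran_size: %lu%c\n", b, f);
		}
		return 0;
	}

With this, 64 MiB of granularity prints as "64M" rather than a bare kilobyte shift, which is what the reworked printk format strings ("%ld%c") expect.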
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 6d4bdc02388a..9abd48b22674 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -17,6 +17,8 @@
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kprobes.h>
21
20#include <asm/apic.h> 22#include <asm/apic.h>
21#include <asm/intel_arch_perfmon.h> 23#include <asm/intel_arch_perfmon.h>
22 24
@@ -250,7 +252,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr,
250 252
251 do_div(count, nmi_hz); 253 do_div(count, nmi_hz);
252 if(descr) 254 if(descr)
253 Dprintk("setting %s to -0x%08Lx\n", descr, count); 255 pr_debug("setting %s to -0x%08Lx\n", descr, count);
254 wrmsrl(perfctr_msr, 0 - count); 256 wrmsrl(perfctr_msr, 0 - count);
255} 257}
256 258
@@ -261,7 +263,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr,
261 263
262 do_div(count, nmi_hz); 264 do_div(count, nmi_hz);
263 if(descr) 265 if(descr)
264 Dprintk("setting %s to -0x%08Lx\n", descr, count); 266 pr_debug("setting %s to -0x%08Lx\n", descr, count);
265 wrmsr(perfctr_msr, (u32)(-count), 0); 267 wrmsr(perfctr_msr, (u32)(-count), 0);
266} 268}
267 269
@@ -295,13 +297,19 @@ static int setup_k7_watchdog(unsigned nmi_hz)
295 /* setup the timer */ 297 /* setup the timer */
296 wrmsr(evntsel_msr, evntsel, 0); 298 wrmsr(evntsel_msr, evntsel, 0);
297 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); 299 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
298 apic_write(APIC_LVTPC, APIC_DM_NMI);
299 evntsel |= K7_EVNTSEL_ENABLE;
300 wrmsr(evntsel_msr, evntsel, 0);
301 300
301 /* initialize the wd struct before enabling */
302 wd->perfctr_msr = perfctr_msr; 302 wd->perfctr_msr = perfctr_msr;
303 wd->evntsel_msr = evntsel_msr; 303 wd->evntsel_msr = evntsel_msr;
304 wd->cccr_msr = 0; /* unused */ 304 wd->cccr_msr = 0; /* unused */
305
306 /* ok, everything is initialized, announce that we're set */
307 cpu_nmi_set_wd_enabled();
308
309 apic_write(APIC_LVTPC, APIC_DM_NMI);
310 evntsel |= K7_EVNTSEL_ENABLE;
311 wrmsr(evntsel_msr, evntsel, 0);
312
305 return 1; 313 return 1;
306} 314}
307 315
@@ -330,7 +338,8 @@ static void single_msr_unreserve(void)
330 release_perfctr_nmi(wd_ops->perfctr); 338 release_perfctr_nmi(wd_ops->perfctr);
331} 339}
332 340
333static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) 341static void __kprobes
342single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
334{ 343{
335 /* start the cycle over again */ 344 /* start the cycle over again */
336 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); 345 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
@@ -379,17 +388,23 @@ static int setup_p6_watchdog(unsigned nmi_hz)
379 wrmsr(evntsel_msr, evntsel, 0); 388 wrmsr(evntsel_msr, evntsel, 0);
380 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 389 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
381 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); 390 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
382 apic_write(APIC_LVTPC, APIC_DM_NMI);
383 evntsel |= P6_EVNTSEL0_ENABLE;
384 wrmsr(evntsel_msr, evntsel, 0);
385 391
392 /* initialize the wd struct before enabling */
386 wd->perfctr_msr = perfctr_msr; 393 wd->perfctr_msr = perfctr_msr;
387 wd->evntsel_msr = evntsel_msr; 394 wd->evntsel_msr = evntsel_msr;
388 wd->cccr_msr = 0; /* unused */ 395 wd->cccr_msr = 0; /* unused */
396
397 /* ok, everything is initialized, announce that we're set */
398 cpu_nmi_set_wd_enabled();
399
400 apic_write(APIC_LVTPC, APIC_DM_NMI);
401 evntsel |= P6_EVNTSEL0_ENABLE;
402 wrmsr(evntsel_msr, evntsel, 0);
403
389 return 1; 404 return 1;
390} 405}
391 406
392static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) 407static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
393{ 408{
394 /* 409 /*
395 * P6 based Pentium M need to re-unmask 410 * P6 based Pentium M need to re-unmask
@@ -432,6 +447,27 @@ static const struct wd_ops p6_wd_ops = {
432#define P4_CCCR_ENABLE (1 << 12) 447#define P4_CCCR_ENABLE (1 << 12)
433#define P4_CCCR_OVF (1 << 31) 448#define P4_CCCR_OVF (1 << 31)
434 449
450#define P4_CONTROLS 18
451static unsigned int p4_controls[18] = {
452 MSR_P4_BPU_CCCR0,
453 MSR_P4_BPU_CCCR1,
454 MSR_P4_BPU_CCCR2,
455 MSR_P4_BPU_CCCR3,
456 MSR_P4_MS_CCCR0,
457 MSR_P4_MS_CCCR1,
458 MSR_P4_MS_CCCR2,
459 MSR_P4_MS_CCCR3,
460 MSR_P4_FLAME_CCCR0,
461 MSR_P4_FLAME_CCCR1,
462 MSR_P4_FLAME_CCCR2,
463 MSR_P4_FLAME_CCCR3,
464 MSR_P4_IQ_CCCR0,
465 MSR_P4_IQ_CCCR1,
466 MSR_P4_IQ_CCCR2,
467 MSR_P4_IQ_CCCR3,
468 MSR_P4_IQ_CCCR4,
469 MSR_P4_IQ_CCCR5,
470};
435/* 471/*
436 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter 472 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
437 * CRU_ESCR0 (with any non-null event selector) through a complemented 473 * CRU_ESCR0 (with any non-null event selector) through a complemented
@@ -473,12 +509,38 @@ static int setup_p4_watchdog(unsigned nmi_hz)
473 evntsel_msr = MSR_P4_CRU_ESCR0; 509 evntsel_msr = MSR_P4_CRU_ESCR0;
474 cccr_msr = MSR_P4_IQ_CCCR0; 510 cccr_msr = MSR_P4_IQ_CCCR0;
475 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); 511 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
512
513 /*
514 * If we're on the kdump kernel or other situation, we may
515 * still have other performance counter registers set to
516 * interrupt and they'll keep interrupting forever because
517 * of the P4_CCCR_OVF quirk. So we need to ACK all the
518 * pending interrupts and disable all the registers here,
519 * before reenabling the NMI delivery. Refer to p4_rearm()
520 * about the P4_CCCR_OVF quirk.
521 */
522 if (reset_devices) {
523 unsigned int low, high;
524 int i;
525
526 for (i = 0; i < P4_CONTROLS; i++) {
527 rdmsr(p4_controls[i], low, high);
528 low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
529 wrmsr(p4_controls[i], low, high);
530 }
531 }
476 } else { 532 } else {
477 /* logical cpu 1 */ 533 /* logical cpu 1 */
478 perfctr_msr = MSR_P4_IQ_PERFCTR1; 534 perfctr_msr = MSR_P4_IQ_PERFCTR1;
479 evntsel_msr = MSR_P4_CRU_ESCR0; 535 evntsel_msr = MSR_P4_CRU_ESCR0;
480 cccr_msr = MSR_P4_IQ_CCCR1; 536 cccr_msr = MSR_P4_IQ_CCCR1;
481 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); 537
538 /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
539 if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
540 cccr_val = P4_CCCR_OVF_PMI0;
541 else
542 cccr_val = P4_CCCR_OVF_PMI1;
543 cccr_val |= P4_CCCR_ESCR_SELECT(4);
482 } 544 }
483 545
484 evntsel = P4_ESCR_EVENT_SELECT(0x3F) 546 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
@@ -493,12 +555,17 @@ static int setup_p4_watchdog(unsigned nmi_hz)
493 wrmsr(evntsel_msr, evntsel, 0); 555 wrmsr(evntsel_msr, evntsel, 0);
494 wrmsr(cccr_msr, cccr_val, 0); 556 wrmsr(cccr_msr, cccr_val, 0);
495 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); 557 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
496 apic_write(APIC_LVTPC, APIC_DM_NMI); 558
497 cccr_val |= P4_CCCR_ENABLE;
498 wrmsr(cccr_msr, cccr_val, 0);
499 wd->perfctr_msr = perfctr_msr; 559 wd->perfctr_msr = perfctr_msr;
500 wd->evntsel_msr = evntsel_msr; 560 wd->evntsel_msr = evntsel_msr;
501 wd->cccr_msr = cccr_msr; 561 wd->cccr_msr = cccr_msr;
562
563 /* ok, everything is initialized, announce that we're set */
564 cpu_nmi_set_wd_enabled();
565
566 apic_write(APIC_LVTPC, APIC_DM_NMI);
567 cccr_val |= P4_CCCR_ENABLE;
568 wrmsr(cccr_msr, cccr_val, 0);
502 return 1; 569 return 1;
503} 570}
504 571
@@ -541,7 +608,7 @@ static void p4_unreserve(void)
541 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); 608 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
542} 609}
543 610
544static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) 611static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
545{ 612{
546 unsigned dummy; 613 unsigned dummy;
547 /* 614 /*
@@ -614,13 +681,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
614 wrmsr(evntsel_msr, evntsel, 0); 681 wrmsr(evntsel_msr, evntsel, 0);
615 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 682 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
616 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); 683 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
617 apic_write(APIC_LVTPC, APIC_DM_NMI);
618 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
619 wrmsr(evntsel_msr, evntsel, 0);
620 684
621 wd->perfctr_msr = perfctr_msr; 685 wd->perfctr_msr = perfctr_msr;
622 wd->evntsel_msr = evntsel_msr; 686 wd->evntsel_msr = evntsel_msr;
623 wd->cccr_msr = 0; /* unused */ 687 wd->cccr_msr = 0; /* unused */
688
689 /* ok, everything is initialized, announce that we're set */
690 cpu_nmi_set_wd_enabled();
691
692 apic_write(APIC_LVTPC, APIC_DM_NMI);
693 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
694 wrmsr(evntsel_msr, evntsel, 0);
624 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); 695 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
625 return 1; 696 return 1;
626} 697}
@@ -716,7 +787,7 @@ unsigned lapic_adjust_nmi_hz(unsigned hz)
716 return hz; 787 return hz;
717} 788}
718 789
719int lapic_wd_event(unsigned nmi_hz) 790int __kprobes lapic_wd_event(unsigned nmi_hz)
720{ 791{
721 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 792 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
722 u64 ctr; 793 u64 ctr;
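Editor's note: the write_watchdog_counter() lines touched above preload each performance counter with the negative of (CPU cycles per second / nmi_hz), so the counter overflows and raises the watchdog NMI roughly nmi_hz times per second; the count is scaled from cpu_khz earlier in that function. A standalone sketch of just that arithmetic (the 2.4 GHz figure is made up; the MSR write itself is kernel-only):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t cpu_khz = 2400000;		/* assume a 2.4 GHz CPU */
		unsigned int nmi_hz = 1;		/* one watchdog tick per second */
		uint64_t count = cpu_khz * 1000;	/* cycles per second */

		count /= nmi_hz;			/* cycles between overflows */

		/* the kernel then does: wrmsrl(perfctr_msr, 0 - count); */
		printf("setting perfctr to -0x%08llx\n", (unsigned long long)count);
		return 0;
	}

This matches the "-0x%08Lx" format in the pr_debug() calls above; the reordering in the setup_*_watchdog() hunks only changes when that preloaded counter is armed (after the wd struct is published), not how it is computed.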
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
new file mode 100644
index 000000000000..5abbea297e0c
--- /dev/null
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -0,0 +1,20 @@
1/*
2 * Strings for the various x86 power flags
3 *
4 * This file must not contain any executable code.
5 */
6
7#include <asm/cpufeature.h>
8
9const char *const x86_power_flags[32] = {
10 "ts", /* temperature sensor */
11 "fid", /* frequency id control */
12 "vid", /* voltage id control */
13 "ttp", /* thermal trip */
14 "tm",
15 "stc",
16 "100mhzsteps",
17 "hwpstate",
18 "", /* tsc invariant mapped to constant_tsc */
19 /* nothing */
20};
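Editor's note: the new powerflags.c is a pure data table mapping bit positions of the CPU's power-management feature word to short names (the kernel typically fills that word from an extended CPUID leaf elsewhere). A hypothetical consumer, of the kind /proc/cpuinfo's "power management" line represents, would just walk the set bits; print_power_flags() and the sample value below are illustrations, not kernel code:

	#include <stdio.h>

	/* Same table as the new powerflags.c above (trailing slots NULL). */
	static const char *const x86_power_flags[32] = {
		"ts", "fid", "vid", "ttp", "tm", "stc", "100mhzsteps", "hwpstate",
		"",	/* tsc invariant mapped to constant_tsc */
	};

	/* Hypothetical consumer: print the names of all set bits. */
	static void print_power_flags(unsigned int x86_power)
	{
		for (int i = 0; i < 32; i++) {
			if (!(x86_power & (1u << i)))
				continue;
			if (x86_power_flags[i] && x86_power_flags[i][0])
				printf(" %s", x86_power_flags[i]);
			else
				printf(" [%d]", i);
		}
		printf("\n");
	}

	int main(void)
	{
		print_power_flags(0x7f);	/* sample value for illustration */
		return 0;
	}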
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 0d0d9057e7c0..01b1244ef1c0 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -160,14 +160,16 @@ static void *c_start(struct seq_file *m, loff_t *pos)
160{ 160{
161 if (*pos == 0) /* just in case, cpu 0 is not the first */ 161 if (*pos == 0) /* just in case, cpu 0 is not the first */
162 *pos = first_cpu(cpu_online_map); 162 *pos = first_cpu(cpu_online_map);
163 if ((*pos) < NR_CPUS && cpu_online(*pos)) 163 else
164 *pos = next_cpu_nr(*pos - 1, cpu_online_map);
165 if ((*pos) < nr_cpu_ids)
164 return &cpu_data(*pos); 166 return &cpu_data(*pos);
165 return NULL; 167 return NULL;
166} 168}
167 169
168static void *c_next(struct seq_file *m, void *v, loff_t *pos) 170static void *c_next(struct seq_file *m, void *v, loff_t *pos)
169{ 171{
170 *pos = next_cpu(*pos, cpu_online_map); 172 (*pos)++;
171 return c_start(m, pos); 173 return c_start(m, pos);
172} 174}
173 175
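Editor's note: the c_start()/c_next() rework above makes the /proc/cpuinfo iterator robust for sparse CPU maps: c_next() simply advances the position, and c_start() skips to the next online CPU, stopping at nr_cpu_ids instead of NR_CPUS. A rough userspace model of that iteration, where a bitmask stands in for cpu_online_map and next_online() for next_cpu_nr() (names and mask value are my own, for illustration):

	#include <stdio.h>

	#define NR_CPU_IDS 8

	static unsigned long online_mask = 0xb5;	/* CPUs 0, 2, 4, 5, 7 "online" */

	static long next_online(long from)
	{
		for (long cpu = from; cpu < NR_CPU_IDS; cpu++)
			if (online_mask & (1UL << cpu))
				return cpu;
		return NR_CPU_IDS;			/* past the end, like nr_cpu_ids */
	}

	static long c_start(long *pos)
	{
		if (*pos == 0)		/* just in case, cpu 0 is not the first */
			*pos = next_online(0);
		else			/* models next_cpu_nr(*pos - 1, cpu_online_map) */
			*pos = next_online(*pos);

		return (*pos < NR_CPU_IDS) ? *pos : -1;
	}

	static long c_next(long *pos)
	{
		(*pos)++;
		return c_start(pos);
	}

	int main(void)
	{
		long pos = 0;

		for (long cpu = c_start(&pos); cpu >= 0; cpu = c_next(&pos))
			printf("processor : %ld\n", cpu);
		return 0;
	}

The loop visits exactly the "online" CPUs (0, 2, 4, 5, 7) and terminates cleanly once the position passes the last valid id.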
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index b911a2c61b8f..52b3fefbd5af 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -5,6 +5,18 @@
5#include <asm/msr.h> 5#include <asm/msr.h>
6#include "cpu.h" 6#include "cpu.h"
7 7
8static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
9{
10 u32 xlvl;
11
12 /* Transmeta-defined flags: level 0x80860001 */
13 xlvl = cpuid_eax(0x80860000);
14 if ((xlvl & 0xffff0000) == 0x80860000) {
15 if (xlvl >= 0x80860001)
16 c->x86_capability[2] = cpuid_edx(0x80860001);
17 }
18}
19
8static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) 20static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
9{ 21{
10 unsigned int cap_mask, uk, max, dummy; 22 unsigned int cap_mask, uk, max, dummy;
@@ -12,7 +24,8 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
12 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; 24 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
13 char cpu_info[65]; 25 char cpu_info[65];
14 26
15 get_model_name(c); /* Same as AMD/Cyrix */ 27 early_init_transmeta(c);
28
16 display_cacheinfo(c); 29 display_cacheinfo(c);
17 30
18 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
@@ -85,23 +98,12 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
85#endif 98#endif
86} 99}
87 100
88static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c)
89{
90 u32 xlvl;
91
92 /* Transmeta-defined flags: level 0x80860001 */
93 xlvl = cpuid_eax(0x80860000);
94 if ((xlvl & 0xffff0000) == 0x80860000) {
95 if (xlvl >= 0x80860001)
96 c->x86_capability[2] = cpuid_edx(0x80860001);
97 }
98}
99
100static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { 101static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
101 .c_vendor = "Transmeta", 102 .c_vendor = "Transmeta",
102 .c_ident = { "GenuineTMx86", "TransmetaCPU" }, 103 .c_ident = { "GenuineTMx86", "TransmetaCPU" },
104 .c_early_init = early_init_transmeta,
103 .c_init = init_transmeta, 105 .c_init = init_transmeta,
104 .c_identify = transmeta_identify, 106 .c_x86_vendor = X86_VENDOR_TRANSMETA,
105}; 107};
106 108
107cpu_vendor_dev_register(X86_VENDOR_TRANSMETA, &transmeta_cpu_dev); 109cpu_dev_register(transmeta_cpu_dev);
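Editor's note: the detection moved into early_init_transmeta() above is a plain CPUID probe: leaf 0x80860000 returns the highest Transmeta-defined leaf (its 0x8086xxxx signature doubles as the vendor check), and leaf 0x80860001 EDX carries the Transmeta capability bits that land in x86_capability[2]. A userspace sketch of the same probe using GCC/Clang's <cpuid.h> (it simply reports "not a Transmeta CPU" on other vendors):

	#include <stdio.h>
	#include <cpuid.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;
		unsigned int xlvl;

		__cpuid(0x80860000, eax, ebx, ecx, edx);
		xlvl = eax;

		if ((xlvl & 0xffff0000) != 0x80860000) {
			printf("not a Transmeta CPU\n");
			return 0;
		}

		if (xlvl >= 0x80860001) {
			__cpuid(0x80860001, eax, ebx, ecx, edx);
			printf("Transmeta capability word: 0x%08x\n", edx);
		}
		return 0;
	}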
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index b1fc90989d75..e777f79e0960 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -19,7 +19,8 @@ static struct cpu_dev umc_cpu_dev __cpuinitdata = {
19 } 19 }
20 }, 20 },
21 }, 21 },
22 .c_x86_vendor = X86_VENDOR_UMC,
22}; 23};
23 24
24cpu_vendor_dev_register(X86_VENDOR_UMC, &umc_cpu_dev); 25cpu_dev_register(umc_cpu_dev);
25 26
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2de5fa2bbf77..72cefd1e649b 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -36,7 +36,6 @@
36#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
37#include <linux/major.h> 37#include <linux/major.h>
38#include <linux/fs.h> 38#include <linux/fs.h>
39#include <linux/smp_lock.h>
40#include <linux/device.h> 39#include <linux/device.h>
41#include <linux/cpu.h> 40#include <linux/cpu.h>
42#include <linux/notifier.h> 41#include <linux/notifier.h>
@@ -89,6 +88,8 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
89 struct cpuid_regs cmd; 88 struct cpuid_regs cmd;
90 int cpu = iminor(file->f_path.dentry->d_inode); 89 int cpu = iminor(file->f_path.dentry->d_inode);
91 u64 pos = *ppos; 90 u64 pos = *ppos;
91 ssize_t bytes = 0;
92 int err = 0;
92 93
93 if (count % 16) 94 if (count % 16)
94 return -EINVAL; /* Invalid chunk size */ 95 return -EINVAL; /* Invalid chunk size */
@@ -96,14 +97,19 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
96 for (; count; count -= 16) { 97 for (; count; count -= 16) {
97 cmd.eax = pos; 98 cmd.eax = pos;
98 cmd.ecx = pos >> 32; 99 cmd.ecx = pos >> 32;
99 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); 100 err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1);
100 if (copy_to_user(tmp, &cmd, 16)) 101 if (err)
101 return -EFAULT; 102 break;
103 if (copy_to_user(tmp, &cmd, 16)) {
104 err = -EFAULT;
105 break;
106 }
102 tmp += 16; 107 tmp += 16;
108 bytes += 16;
103 *ppos = ++pos; 109 *ppos = ++pos;
104 } 110 }
105 111
106 return tmp - buf; 112 return bytes ? bytes : err;
107} 113}
108 114
109static int cpuid_open(struct inode *inode, struct file *file) 115static int cpuid_open(struct inode *inode, struct file *file)
@@ -141,7 +147,7 @@ static __cpuinit int cpuid_device_create(int cpu)
141{ 147{
142 struct device *dev; 148 struct device *dev;
143 149
144 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), 150 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
145 "cpu%d", cpu); 151 "cpu%d", cpu);
146 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 152 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
147} 153}
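Editor's note: the cpuid_read() rework above adopts the usual partial-read convention: accumulate a byte count per completed 16-byte record and, when something goes wrong, return the count if anything was transferred, otherwise the error. A standalone sketch of that pattern; fetch_record() and read_records() are stand-ins I made up for the per-CPU CPUID call and the file read handler:

	#include <stdio.h>
	#include <string.h>
	#include <errno.h>

	/* Pretend source: four 16-byte records, then failure. */
	static int fetch_record(unsigned long pos, char rec[16])
	{
		if (pos >= 4)
			return -EIO;
		memset(rec, (int)('A' + pos), 16);
		return 0;
	}

	static long read_records(char *buf, size_t count, unsigned long *ppos)
	{
		long bytes = 0;
		int err = 0;

		if (count % 16)
			return -EINVAL;		/* invalid chunk size */

		for (; count; count -= 16) {
			char rec[16];

			err = fetch_record(*ppos, rec);
			if (err)
				break;
			memcpy(buf + bytes, rec, 16);
			bytes += 16;
			(*ppos)++;
		}

		return bytes ? bytes : err;	/* same convention as cpuid_read() */
	}

	int main(void)
	{
		char buf[128];
		unsigned long pos = 0;

		printf("first read:  %ld\n", read_records(buf, sizeof(buf), &pos));
		printf("second read: %ld\n", read_records(buf, sizeof(buf), &pos));
		return 0;
	}

The first call returns 64 (four records copied before the failure); the second returns the error, because nothing was copied.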
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 72d0c56c1b48..f7cdb3b457aa 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -13,6 +13,9 @@
13 13
14static void *kdump_buf_page; 14static void *kdump_buf_page;
15 15
16/* Stores the physical address of elf header of crash image. */
17unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
18
16/** 19/**
17 * copy_oldmem_page - copy one page from "oldmem" 20 * copy_oldmem_page - copy one page from "oldmem"
18 * @pfn: page frame number to be copied 21 * @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 15e6c6bc4a46..045b36cada65 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -7,9 +7,11 @@
7 7
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/crash_dump.h> 9#include <linux/crash_dump.h>
10#include <linux/uaccess.h>
11#include <linux/io.h>
10 12
11#include <asm/uaccess.h> 13/* Stores the physical address of elf header of crash image. */
12#include <asm/io.h> 14unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
13 15
14/** 16/**
15 * copy_oldmem_page - copy one page from "oldmem" 17 * copy_oldmem_page - copy one page from "oldmem"
@@ -25,7 +27,7 @@
25 * in the current kernel. We stitch up a pte, similar to kmap_atomic. 27 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
26 */ 28 */
27ssize_t copy_oldmem_page(unsigned long pfn, char *buf, 29ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
28 size_t csize, unsigned long offset, int userbuf) 30 size_t csize, unsigned long offset, int userbuf)
29{ 31{
30 void *vaddr; 32 void *vaddr;
31 33
@@ -33,14 +35,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
33 return 0; 35 return 0;
34 36
35 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); 37 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
38 if (!vaddr)
39 return -ENOMEM;
36 40
37 if (userbuf) { 41 if (userbuf) {
38 if (copy_to_user(buf, (vaddr + offset), csize)) { 42 if (copy_to_user(buf, vaddr + offset, csize)) {
39 iounmap(vaddr); 43 iounmap(vaddr);
40 return -EFAULT; 44 return -EFAULT;
41 } 45 }
42 } else 46 } else
43 memcpy(buf, (vaddr + offset), csize); 47 memcpy(buf, vaddr + offset, csize);
44 48
45 iounmap(vaddr); 49 iounmap(vaddr);
46 return csize; 50 return csize;
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index a47798b59f07..b4f14c6c09d9 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = {
66 .ds = __USER_DS, 66 .ds = __USER_DS,
67 .fs = __KERNEL_PERCPU, 67 .fs = __KERNEL_PERCPU,
68 68
69 .__cr3 = __pa(swapper_pg_dir) 69 .__cr3 = __pa_nodebug(swapper_pg_dir),
70 } 70 }
71}; 71};
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 11c11b8ec48d..2b69994fd3a8 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -2,26 +2,49 @@
2 * Debug Store support 2 * Debug Store support
3 * 3 *
4 * This provides a low-level interface to the hardware's Debug Store 4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for last branch recording (LBR) and 5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * Different architectures use a different DS layout/pointer size. 8 * It manages:
9 * The below functions therefore work on a void*. 9 * - per-thread and per-cpu allocation of BTS and PEBS
10 * - buffer memory allocation (optional)
11 * - buffer overflow handling
12 * - buffer access
10 * 13 *
14 * It assumes:
15 * - get_task_struct on all parameter tasks
16 * - current is allowed to trace parameter tasks
11 * 17 *
12 * Since there is no user for PEBS, yet, only LBR (or branch
13 * trace store, BTS) is supported.
14 * 18 *
15 * 19 * Copyright (C) 2007-2008 Intel Corporation.
16 * Copyright (C) 2007 Intel Corporation. 20 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
17 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
18 */ 21 */
19 22
23
24#ifdef CONFIG_X86_DS
25
20#include <asm/ds.h> 26#include <asm/ds.h>
21 27
22#include <linux/errno.h> 28#include <linux/errno.h>
23#include <linux/string.h> 29#include <linux/string.h>
24#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/sched.h>
32#include <linux/mm.h>
33
34
35/*
36 * The configuration for a particular DS hardware implementation.
37 */
38struct ds_configuration {
39 /* the size of the DS structure in bytes */
40 unsigned char sizeof_ds;
41 /* the size of one pointer-typed field in the DS structure in bytes;
42 this covers the first 8 fields related to buffer management. */
43 unsigned char sizeof_field;
44 /* the size of a BTS/PEBS record in bytes */
45 unsigned char sizeof_rec[2];
46};
47static struct ds_configuration ds_cfg;
25 48
26 49
27/* 50/*
@@ -44,378 +67,747 @@
44 * (interrupt occurs when write pointer passes interrupt pointer) 67 * (interrupt occurs when write pointer passes interrupt pointer)
45 * - value to which counter is reset following counter overflow 68 * - value to which counter is reset following counter overflow
46 * 69 *
47 * On later architectures, the last branch recording hardware uses 70 * Later architectures use 64bit pointers throughout, whereas earlier
48 * 64bit pointers even in 32bit mode. 71 * architectures use 32bit pointers in 32bit mode.
49 *
50 *
51 * Branch Trace Store (BTS) records store information about control
52 * flow changes. They at least provide the following information:
53 * - source linear address
54 * - destination linear address
55 * 72 *
56 * Netburst supported a predicated bit that had been dropped in later
57 * architectures. We do not suppor it.
58 * 73 *
74 * We compute the base address for the first 8 fields based on:
75 * - the field size stored in the DS configuration
76 * - the relative field position
77 * - an offset giving the start of the respective region
59 * 78 *
60 * In order to abstract from the actual DS and BTS layout, we describe 79 * This offset is further used to index various arrays holding
61 * the access to the relevant fields. 80 * information for BTS and PEBS at the respective index.
62 * Thanks to Andi Kleen for proposing this design.
63 * 81 *
64 * The implementation, however, is not as general as it might seem. In 82 * On later 32bit processors, we only access the lower 32bit of the
65 * order to stay somewhat simple and efficient, we assume an 83 * 64bit pointer fields. The upper halves will be zeroed out.
66 * underlying unsigned type (mostly a pointer type) and we expect the
67 * field to be at least as big as that type.
68 */ 84 */
69 85
70/* 86enum ds_field {
71 * A special from_ip address to indicate that the BTS record is an 87 ds_buffer_base = 0,
72 * info record that needs to be interpreted or skipped. 88 ds_index,
73 */ 89 ds_absolute_maximum,
74#define BTS_ESCAPE_ADDRESS (-1) 90 ds_interrupt_threshold,
91};
75 92
76/* 93enum ds_qualifier {
77 * A field access descriptor 94 ds_bts = 0,
78 */ 95 ds_pebs
79struct access_desc {
80 unsigned char offset;
81 unsigned char size;
82}; 96};
83 97
98static inline unsigned long ds_get(const unsigned char *base,
99 enum ds_qualifier qual, enum ds_field field)
100{
101 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
102 return *(unsigned long *)base;
103}
104
105static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
106 enum ds_field field, unsigned long value)
107{
108 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
109 (*(unsigned long *)base) = value;
110}
111
112
84/* 113/*
85 * The configuration for a particular DS/BTS hardware implementation. 114 * Locking is done only for allocating BTS or PEBS resources and for
115 * guarding context and buffer memory allocation.
116 *
117 * Most functions require the current task to own the ds context part
118 * they are going to access. All the locking is done when validating
119 * access to the context.
86 */ 120 */
87struct ds_configuration { 121static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
88 /* the DS configuration */
89 unsigned char sizeof_ds;
90 struct access_desc bts_buffer_base;
91 struct access_desc bts_index;
92 struct access_desc bts_absolute_maximum;
93 struct access_desc bts_interrupt_threshold;
94 /* the BTS configuration */
95 unsigned char sizeof_bts;
96 struct access_desc from_ip;
97 struct access_desc to_ip;
98 /* BTS variants used to store additional information like
99 timestamps */
100 struct access_desc info_type;
101 struct access_desc info_data;
102 unsigned long debugctl_mask;
103};
104 122
105/* 123/*
106 * The global configuration used by the below accessor functions 124 * Validate that the current task is allowed to access the BTS/PEBS
125 * buffer of the parameter task.
126 *
127 * Returns 0, if access is granted; -Eerrno, otherwise.
107 */ 128 */
108static struct ds_configuration ds_cfg; 129static inline int ds_validate_access(struct ds_context *context,
130 enum ds_qualifier qual)
131{
132 if (!context)
133 return -EPERM;
134
135 if (context->owner[qual] == current)
136 return 0;
137
138 return -EPERM;
139}
140
109 141
110/* 142/*
111 * Accessor functions for some DS and BTS fields using the above 143 * We either support (system-wide) per-cpu or per-thread allocation.
112 * global ptrace_bts_cfg. 144 * We distinguish the two based on the task_struct pointer, where a
145 * NULL pointer indicates per-cpu allocation for the current cpu.
146 *
147 * Allocations are use-counted. As soon as resources are allocated,
148 * further allocations must be of the same type (per-cpu or
149 * per-thread). We model this by counting allocations (i.e. the number
150 * of tracers of a certain type) for one type negatively:
151 * =0 no tracers
152 * >0 number of per-thread tracers
153 * <0 number of per-cpu tracers
154 *
155 * The below functions to get and put tracers and to check the
156 * allocation type require the ds_lock to be held by the caller.
157 *
158 * Tracers essentially gives the number of ds contexts for a certain
159 * type of allocation.
113 */ 160 */
114static inline unsigned long get_bts_buffer_base(char *base) 161static long tracers;
162
163static inline void get_tracer(struct task_struct *task)
115{ 164{
116 return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); 165 tracers += (task ? 1 : -1);
117} 166}
118static inline void set_bts_buffer_base(char *base, unsigned long value) 167
168static inline void put_tracer(struct task_struct *task)
119{ 169{
120 (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; 170 tracers -= (task ? 1 : -1);
121} 171}
122static inline unsigned long get_bts_index(char *base) 172
173static inline int check_tracer(struct task_struct *task)
123{ 174{
124 return *(unsigned long *)(base + ds_cfg.bts_index.offset); 175 return (task ? (tracers >= 0) : (tracers <= 0));
125} 176}
126static inline void set_bts_index(char *base, unsigned long value) 177
178
179/*
180 * The DS context is either attached to a thread or to a cpu:
181 * - in the former case, the thread_struct contains a pointer to the
182 * attached context.
183 * - in the latter case, we use a static array of per-cpu context
184 * pointers.
185 *
186 * Contexts are use-counted. They are allocated on first access and
187 * deallocated when the last user puts the context.
188 *
189 * We distinguish between an allocating and a non-allocating get of a
190 * context:
191 * - the allocating get is used for requesting BTS/PEBS resources. It
192 * requires the caller to hold the global ds_lock.
193 * - the non-allocating get is used for all other cases. A
194 * non-existing context indicates an error. It acquires and releases
195 * the ds_lock itself for obtaining the context.
196 *
197 * A context and its DS configuration are allocated and deallocated
198 * together. A context always has a DS configuration of the
199 * appropriate size.
200 */
201static DEFINE_PER_CPU(struct ds_context *, system_context);
202
203#define this_system_context per_cpu(system_context, smp_processor_id())
204
205/*
206 * Returns the pointer to the parameter task's context or to the
207 * system-wide context, if task is NULL.
208 *
209 * Increases the use count of the returned context, if not NULL.
210 */
211static inline struct ds_context *ds_get_context(struct task_struct *task)
127{ 212{
128 (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; 213 struct ds_context *context;
214
215 spin_lock(&ds_lock);
216
217 context = (task ? task->thread.ds_ctx : this_system_context);
218 if (context)
219 context->count++;
220
221 spin_unlock(&ds_lock);
222
223 return context;
129} 224}
130static inline unsigned long get_bts_absolute_maximum(char *base) 225
226/*
 227 * Same as ds_get_context, but allocates the context and its DS
 228 * structure, if necessary; returns NULL, if out of memory.
229 *
230 * pre: requires ds_lock to be held
231 */
232static inline struct ds_context *ds_alloc_context(struct task_struct *task)
131{ 233{
132 return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); 234 struct ds_context **p_context =
235 (task ? &task->thread.ds_ctx : &this_system_context);
236 struct ds_context *context = *p_context;
237
238 if (!context) {
239 context = kzalloc(sizeof(*context), GFP_KERNEL);
240
241 if (!context)
242 return NULL;
243
244 context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
245 if (!context->ds) {
246 kfree(context);
247 return NULL;
248 }
249
250 *p_context = context;
251
252 context->this = p_context;
253 context->task = task;
254
255 if (task)
256 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
257
258 if (!task || (task == current))
259 wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
260
261 get_tracer(task);
262 }
263
264 context->count++;
265
266 return context;
133} 267}
134static inline void set_bts_absolute_maximum(char *base, unsigned long value) 268
269/*
270 * Decreases the use count of the parameter context, if not NULL.
271 * Deallocates the context, if the use count reaches zero.
272 */
273static inline void ds_put_context(struct ds_context *context)
135{ 274{
136 (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; 275 if (!context)
276 return;
277
278 spin_lock(&ds_lock);
279
280 if (--context->count)
281 goto out;
282
283 *(context->this) = NULL;
284
285 if (context->task)
286 clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
287
288 if (!context->task || (context->task == current))
289 wrmsrl(MSR_IA32_DS_AREA, 0);
290
291 put_tracer(context->task);
292
293 /* free any leftover buffers from tracers that did not
294 * deallocate them properly. */
295 kfree(context->buffer[ds_bts]);
296 kfree(context->buffer[ds_pebs]);
297 kfree(context->ds);
298 kfree(context);
299 out:
300 spin_unlock(&ds_lock);
137} 301}
138static inline unsigned long get_bts_interrupt_threshold(char *base) 302
303
304/*
305 * Handle a buffer overflow
306 *
307 * task: the task whose buffers are overflowing;
308 * NULL for a buffer overflow on the current cpu
309 * context: the ds context
310 * qual: the buffer type
311 */
312static void ds_overflow(struct task_struct *task, struct ds_context *context,
313 enum ds_qualifier qual)
139{ 314{
140 return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); 315 if (!context)
316 return;
317
318 if (context->callback[qual])
319 (*context->callback[qual])(task);
320
321 /* todo: do some more overflow handling */
141} 322}
142static inline void set_bts_interrupt_threshold(char *base, unsigned long value) 323
324
325/*
326 * Allocate a non-pageable buffer of the parameter size.
327 * Checks the memory and the locked memory rlimit.
328 *
329 * Returns the buffer, if successful;
330 * NULL, if out of memory or rlimit exceeded.
331 *
332 * size: the requested buffer size in bytes
333 * pages (out): if not NULL, contains the number of pages reserved
334 */
335static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
143{ 336{
144 (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; 337 unsigned long rlim, vm, pgsz;
338 void *buffer;
339
340 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
341
342 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
343 vm = current->mm->total_vm + pgsz;
344 if (rlim < vm)
345 return NULL;
346
347 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
348 vm = current->mm->locked_vm + pgsz;
349 if (rlim < vm)
350 return NULL;
351
352 buffer = kzalloc(size, GFP_KERNEL);
353 if (!buffer)
354 return NULL;
355
356 current->mm->total_vm += pgsz;
357 current->mm->locked_vm += pgsz;
358
359 if (pages)
360 *pages = pgsz;
361
362 return buffer;
145} 363}
146static inline unsigned long get_from_ip(char *base) 364
365static int ds_request(struct task_struct *task, void *base, size_t size,
366 ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
147{ 367{
148 return *(unsigned long *)(base + ds_cfg.from_ip.offset); 368 struct ds_context *context;
369 unsigned long buffer, adj;
370 const unsigned long alignment = (1 << 3);
371 int error = 0;
372
373 if (!ds_cfg.sizeof_ds)
374 return -EOPNOTSUPP;
375
376 /* we require some space to do alignment adjustments below */
377 if (size < (alignment + ds_cfg.sizeof_rec[qual]))
378 return -EINVAL;
379
380 /* buffer overflow notification is not yet implemented */
381 if (ovfl)
382 return -EOPNOTSUPP;
383
384
385 spin_lock(&ds_lock);
386
387 if (!check_tracer(task))
388 return -EPERM;
389
390 error = -ENOMEM;
391 context = ds_alloc_context(task);
392 if (!context)
393 goto out_unlock;
394
395 error = -EALREADY;
396 if (context->owner[qual] == current)
397 goto out_unlock;
398 error = -EPERM;
399 if (context->owner[qual] != NULL)
400 goto out_unlock;
401 context->owner[qual] = current;
402
403 spin_unlock(&ds_lock);
404
405
406 error = -ENOMEM;
407 if (!base) {
408 base = ds_allocate_buffer(size, &context->pages[qual]);
409 if (!base)
410 goto out_release;
411
412 context->buffer[qual] = base;
413 }
414 error = 0;
415
416 context->callback[qual] = ovfl;
417
418 /* adjust the buffer address and size to meet alignment
419 * constraints:
420 * - buffer is double-word aligned
421 * - size is multiple of record size
422 *
423 * We checked the size at the very beginning; we have enough
424 * space to do the adjustment.
425 */
426 buffer = (unsigned long)base;
427
428 adj = ALIGN(buffer, alignment) - buffer;
429 buffer += adj;
430 size -= adj;
431
432 size /= ds_cfg.sizeof_rec[qual];
433 size *= ds_cfg.sizeof_rec[qual];
434
435 ds_set(context->ds, qual, ds_buffer_base, buffer);
436 ds_set(context->ds, qual, ds_index, buffer);
437 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
438
439 if (ovfl) {
440 /* todo: select a suitable interrupt threshold */
441 } else
442 ds_set(context->ds, qual,
443 ds_interrupt_threshold, buffer + size + 1);
444
445 /* we keep the context until ds_release */
446 return error;
447
448 out_release:
449 context->owner[qual] = NULL;
450 ds_put_context(context);
451 return error;
452
453 out_unlock:
454 spin_unlock(&ds_lock);
455 ds_put_context(context);
456 return error;
149} 457}
150static inline void set_from_ip(char *base, unsigned long value) 458
459int ds_request_bts(struct task_struct *task, void *base, size_t size,
460 ds_ovfl_callback_t ovfl)
151{ 461{
152 (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; 462 return ds_request(task, base, size, ovfl, ds_bts);
153} 463}
154static inline unsigned long get_to_ip(char *base) 464
465int ds_request_pebs(struct task_struct *task, void *base, size_t size,
466 ds_ovfl_callback_t ovfl)
155{ 467{
156 return *(unsigned long *)(base + ds_cfg.to_ip.offset); 468 return ds_request(task, base, size, ovfl, ds_pebs);
157} 469}
158static inline void set_to_ip(char *base, unsigned long value) 470
471static int ds_release(struct task_struct *task, enum ds_qualifier qual)
159{ 472{
160 (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; 473 struct ds_context *context;
474 int error;
475
476 context = ds_get_context(task);
477 error = ds_validate_access(context, qual);
478 if (error < 0)
479 goto out;
480
481 kfree(context->buffer[qual]);
482 context->buffer[qual] = NULL;
483
484 current->mm->total_vm -= context->pages[qual];
485 current->mm->locked_vm -= context->pages[qual];
486 context->pages[qual] = 0;
487 context->owner[qual] = NULL;
488
489 /*
490 * we put the context twice:
491 * once for the ds_get_context
492 * once for the corresponding ds_request
493 */
494 ds_put_context(context);
495 out:
496 ds_put_context(context);
497 return error;
161} 498}
162static inline unsigned char get_info_type(char *base) 499
500int ds_release_bts(struct task_struct *task)
163{ 501{
164 return *(unsigned char *)(base + ds_cfg.info_type.offset); 502 return ds_release(task, ds_bts);
165} 503}
166static inline void set_info_type(char *base, unsigned char value) 504
505int ds_release_pebs(struct task_struct *task)
167{ 506{
168 (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; 507 return ds_release(task, ds_pebs);
169} 508}
170static inline unsigned long get_info_data(char *base) 509
510static int ds_get_index(struct task_struct *task, size_t *pos,
511 enum ds_qualifier qual)
171{ 512{
172 return *(unsigned long *)(base + ds_cfg.info_data.offset); 513 struct ds_context *context;
514 unsigned long base, index;
515 int error;
516
517 context = ds_get_context(task);
518 error = ds_validate_access(context, qual);
519 if (error < 0)
520 goto out;
521
522 base = ds_get(context->ds, qual, ds_buffer_base);
523 index = ds_get(context->ds, qual, ds_index);
524
525 error = ((index - base) / ds_cfg.sizeof_rec[qual]);
526 if (pos)
527 *pos = error;
528 out:
529 ds_put_context(context);
530 return error;
173} 531}
174static inline void set_info_data(char *base, unsigned long value) 532
533int ds_get_bts_index(struct task_struct *task, size_t *pos)
175{ 534{
176 (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; 535 return ds_get_index(task, pos, ds_bts);
177} 536}
178 537
538int ds_get_pebs_index(struct task_struct *task, size_t *pos)
539{
540 return ds_get_index(task, pos, ds_pebs);
541}
179 542
180int ds_allocate(void **dsp, size_t bts_size_in_bytes) 543static int ds_get_end(struct task_struct *task, size_t *pos,
544 enum ds_qualifier qual)
181{ 545{
182 size_t bts_size_in_records; 546 struct ds_context *context;
183 unsigned long bts; 547 unsigned long base, end;
184 void *ds; 548 int error;
549
550 context = ds_get_context(task);
551 error = ds_validate_access(context, qual);
552 if (error < 0)
553 goto out;
554
555 base = ds_get(context->ds, qual, ds_buffer_base);
556 end = ds_get(context->ds, qual, ds_absolute_maximum);
557
558 error = ((end - base) / ds_cfg.sizeof_rec[qual]);
559 if (pos)
560 *pos = error;
561 out:
562 ds_put_context(context);
563 return error;
564}
185 565
186 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 566int ds_get_bts_end(struct task_struct *task, size_t *pos)
187 return -EOPNOTSUPP; 567{
568 return ds_get_end(task, pos, ds_bts);
569}
188 570
189 if (bts_size_in_bytes < 0) 571int ds_get_pebs_end(struct task_struct *task, size_t *pos)
190 return -EINVAL; 572{
573 return ds_get_end(task, pos, ds_pebs);
574}
191 575
192 bts_size_in_records = 576static int ds_access(struct task_struct *task, size_t index,
193 bts_size_in_bytes / ds_cfg.sizeof_bts; 577 const void **record, enum ds_qualifier qual)
194 bts_size_in_bytes = 578{
195 bts_size_in_records * ds_cfg.sizeof_bts; 579 struct ds_context *context;
580 unsigned long base, idx;
581 int error;
196 582
197 if (bts_size_in_bytes <= 0) 583 if (!record)
198 return -EINVAL; 584 return -EINVAL;
199 585
200 bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); 586 context = ds_get_context(task);
201 587 error = ds_validate_access(context, qual);
202 if (!bts) 588 if (error < 0)
203 return -ENOMEM; 589 goto out;
204 590
205 ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); 591 base = ds_get(context->ds, qual, ds_buffer_base);
592 idx = base + (index * ds_cfg.sizeof_rec[qual]);
206 593
207 if (!ds) { 594 error = -EINVAL;
208 kfree((void *)bts); 595 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
209 return -ENOMEM; 596 goto out;
210 }
211
212 set_bts_buffer_base(ds, bts);
213 set_bts_index(ds, bts);
214 set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
215 set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
216 597
217 *dsp = ds; 598 *record = (const void *)idx;
218 return 0; 599 error = ds_cfg.sizeof_rec[qual];
600 out:
601 ds_put_context(context);
602 return error;
219} 603}
220 604
221int ds_free(void **dsp) 605int ds_access_bts(struct task_struct *task, size_t index, const void **record)
222{ 606{
223 if (*dsp) { 607 return ds_access(task, index, record, ds_bts);
224 kfree((void *)get_bts_buffer_base(*dsp));
225 kfree(*dsp);
226 *dsp = NULL;
227 }
228 return 0;
229} 608}
230 609
231int ds_get_bts_size(void *ds) 610int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
232{ 611{
233 int size_in_bytes; 612 return ds_access(task, index, record, ds_pebs);
234
235 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
236 return -EOPNOTSUPP;
237
238 if (!ds)
239 return 0;
240
241 size_in_bytes =
242 get_bts_absolute_maximum(ds) -
243 get_bts_buffer_base(ds);
244 return size_in_bytes;
245} 613}
246 614
247int ds_get_bts_end(void *ds) 615static int ds_write(struct task_struct *task, const void *record, size_t size,
616 enum ds_qualifier qual, int force)
248{ 617{
249 int size_in_bytes = ds_get_bts_size(ds); 618 struct ds_context *context;
250 619 int error;
251 if (size_in_bytes <= 0)
252 return size_in_bytes;
253 620
254 return size_in_bytes / ds_cfg.sizeof_bts; 621 if (!record)
255} 622 return -EINVAL;
256 623
257int ds_get_bts_index(void *ds) 624 error = -EPERM;
258{ 625 context = ds_get_context(task);
259 int index_offset_in_bytes; 626 if (!context)
627 goto out;
260 628
261 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 629 if (!force) {
262 return -EOPNOTSUPP; 630 error = ds_validate_access(context, qual);
631 if (error < 0)
632 goto out;
633 }
263 634
264 index_offset_in_bytes = 635 error = 0;
265 get_bts_index(ds) - 636 while (size) {
266 get_bts_buffer_base(ds); 637 unsigned long base, index, end, write_end, int_th;
638 unsigned long write_size, adj_write_size;
639
640 /*
641 * write as much as possible without producing an
642 * overflow interrupt.
643 *
644 * interrupt_threshold must either be
645 * - bigger than absolute_maximum or
646 * - point to a record between buffer_base and absolute_maximum
647 *
648 * index points to a valid record.
649 */
650 base = ds_get(context->ds, qual, ds_buffer_base);
651 index = ds_get(context->ds, qual, ds_index);
652 end = ds_get(context->ds, qual, ds_absolute_maximum);
653 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
654
655 write_end = min(end, int_th);
656
657 /* if we are already beyond the interrupt threshold,
658 * we fill the entire buffer */
659 if (write_end <= index)
660 write_end = end;
661
662 if (write_end <= index)
663 goto out;
664
665 write_size = min((unsigned long) size, write_end - index);
666 memcpy((void *)index, record, write_size);
667
668 record = (const char *)record + write_size;
669 size -= write_size;
670 error += write_size;
671
672 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
673 adj_write_size *= ds_cfg.sizeof_rec[qual];
674
675 /* zero out trailing bytes */
676 memset((char *)index + write_size, 0,
677 adj_write_size - write_size);
678 index += adj_write_size;
679
680 if (index >= end)
681 index = base;
682 ds_set(context->ds, qual, ds_index, index);
683
684 if (index >= int_th)
685 ds_overflow(task, context, qual);
686 }
267 687
268 return index_offset_in_bytes / ds_cfg.sizeof_bts; 688 out:
689 ds_put_context(context);
690 return error;
269} 691}
270 692
271int ds_set_overflow(void *ds, int method) 693int ds_write_bts(struct task_struct *task, const void *record, size_t size)
272{ 694{
273 switch (method) { 695 return ds_write(task, record, size, ds_bts, /* force = */ 0);
274 case DS_O_SIGNAL:
275 return -EOPNOTSUPP;
276 case DS_O_WRAP:
277 return 0;
278 default:
279 return -EINVAL;
280 }
281} 696}
282 697
283int ds_get_overflow(void *ds) 698int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
284{ 699{
285 return DS_O_WRAP; 700 return ds_write(task, record, size, ds_pebs, /* force = */ 0);
286} 701}
287 702
288int ds_clear(void *ds) 703int ds_unchecked_write_bts(struct task_struct *task,
704 const void *record, size_t size)
289{ 705{
290 int bts_size = ds_get_bts_size(ds); 706 return ds_write(task, record, size, ds_bts, /* force = */ 1);
291 unsigned long bts_base;
292
293 if (bts_size <= 0)
294 return bts_size;
295
296 bts_base = get_bts_buffer_base(ds);
297 memset((void *)bts_base, 0, bts_size);
298
299 set_bts_index(ds, bts_base);
300 return 0;
301} 707}
302 708
303int ds_read_bts(void *ds, int index, struct bts_struct *out) 709int ds_unchecked_write_pebs(struct task_struct *task,
710 const void *record, size_t size)
304{ 711{
305 void *bts; 712 return ds_write(task, record, size, ds_pebs, /* force = */ 1);
713}
306 714
307 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 715static int ds_reset_or_clear(struct task_struct *task,
308 return -EOPNOTSUPP; 716 enum ds_qualifier qual, int clear)
717{
718 struct ds_context *context;
719 unsigned long base, end;
720 int error;
309 721
310 if (index < 0) 722 context = ds_get_context(task);
311 return -EINVAL; 723 error = ds_validate_access(context, qual);
724 if (error < 0)
725 goto out;
312 726
313 if (index >= ds_get_bts_size(ds)) 727 base = ds_get(context->ds, qual, ds_buffer_base);
314 return -EINVAL; 728 end = ds_get(context->ds, qual, ds_absolute_maximum);
315 729
316 bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts)); 730 if (clear)
731 memset((void *)base, 0, end - base);
317 732
318 memset(out, 0, sizeof(*out)); 733 ds_set(context->ds, qual, ds_index, base);
319 if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
320 out->qualifier = get_info_type(bts);
321 out->variant.jiffies = get_info_data(bts);
322 } else {
323 out->qualifier = BTS_BRANCH;
324 out->variant.lbr.from_ip = get_from_ip(bts);
325 out->variant.lbr.to_ip = get_to_ip(bts);
326 }
327 734
328 return sizeof(*out);; 735 error = 0;
736 out:
737 ds_put_context(context);
738 return error;
329} 739}
330 740
331int ds_write_bts(void *ds, const struct bts_struct *in) 741int ds_reset_bts(struct task_struct *task)
332{ 742{
333 unsigned long bts; 743 return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
334 744}
335 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
336 return -EOPNOTSUPP;
337
338 if (ds_get_bts_size(ds) <= 0)
339 return -ENXIO;
340 745
341 bts = get_bts_index(ds); 746int ds_reset_pebs(struct task_struct *task)
747{
748 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
749}
342 750
343 memset((void *)bts, 0, ds_cfg.sizeof_bts); 751int ds_clear_bts(struct task_struct *task)
344 switch (in->qualifier) { 752{
345 case BTS_INVALID: 753 return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
346 break; 754}
347 755
348 case BTS_BRANCH: 756int ds_clear_pebs(struct task_struct *task)
349 set_from_ip((void *)bts, in->variant.lbr.from_ip); 757{
350 set_to_ip((void *)bts, in->variant.lbr.to_ip); 758 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
351 break; 759}
352 760
353 case BTS_TASK_ARRIVES: 761int ds_get_pebs_reset(struct task_struct *task, u64 *value)
354 case BTS_TASK_DEPARTS: 762{
355 set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); 763 struct ds_context *context;
356 set_info_type((void *)bts, in->qualifier); 764 int error;
357 set_info_data((void *)bts, in->variant.jiffies);
358 break;
359 765
360 default: 766 if (!value)
361 return -EINVAL; 767 return -EINVAL;
362 }
363 768
364 bts = bts + ds_cfg.sizeof_bts; 769 context = ds_get_context(task);
365 if (bts >= get_bts_absolute_maximum(ds)) 770 error = ds_validate_access(context, ds_pebs);
366 bts = get_bts_buffer_base(ds); 771 if (error < 0)
367 set_bts_index(ds, bts); 772 goto out;
368 773
369 return ds_cfg.sizeof_bts; 774 *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
775
776 error = 0;
777 out:
778 ds_put_context(context);
779 return error;
370} 780}
371 781
372unsigned long ds_debugctl_mask(void) 782int ds_set_pebs_reset(struct task_struct *task, u64 value)
373{ 783{
374 return ds_cfg.debugctl_mask; 784 struct ds_context *context;
375} 785 int error;
376 786
377#ifdef __i386__ 787 context = ds_get_context(task);
378static const struct ds_configuration ds_cfg_netburst = { 788 error = ds_validate_access(context, ds_pebs);
379 .sizeof_ds = 9 * 4, 789 if (error < 0)
380 .bts_buffer_base = { 0, 4 }, 790 goto out;
381 .bts_index = { 4, 4 },
382 .bts_absolute_maximum = { 8, 4 },
383 .bts_interrupt_threshold = { 12, 4 },
384 .sizeof_bts = 3 * 4,
385 .from_ip = { 0, 4 },
386 .to_ip = { 4, 4 },
387 .info_type = { 4, 1 },
388 .info_data = { 8, 4 },
389 .debugctl_mask = (1<<2)|(1<<3)
390};
391 791
392static const struct ds_configuration ds_cfg_pentium_m = { 792 *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
393 .sizeof_ds = 9 * 4, 793
394 .bts_buffer_base = { 0, 4 }, 794 error = 0;
395 .bts_index = { 4, 4 }, 795 out:
396 .bts_absolute_maximum = { 8, 4 }, 796 ds_put_context(context);
397 .bts_interrupt_threshold = { 12, 4 }, 797 return error;
398 .sizeof_bts = 3 * 4, 798}
399 .from_ip = { 0, 4 }, 799
400 .to_ip = { 4, 4 }, 800static const struct ds_configuration ds_cfg_var = {
401 .info_type = { 4, 1 }, 801 .sizeof_ds = sizeof(long) * 12,
402 .info_data = { 8, 4 }, 802 .sizeof_field = sizeof(long),
403 .debugctl_mask = (1<<6)|(1<<7) 803 .sizeof_rec[ds_bts] = sizeof(long) * 3,
804 .sizeof_rec[ds_pebs] = sizeof(long) * 10
404}; 805};
405#endif /* _i386_ */ 806static const struct ds_configuration ds_cfg_64 = {
406 807 .sizeof_ds = 8 * 12,
407static const struct ds_configuration ds_cfg_core2 = { 808 .sizeof_field = 8,
408 .sizeof_ds = 9 * 8, 809 .sizeof_rec[ds_bts] = 8 * 3,
409 .bts_buffer_base = { 0, 8 }, 810 .sizeof_rec[ds_pebs] = 8 * 10
410 .bts_index = { 8, 8 },
411 .bts_absolute_maximum = { 16, 8 },
412 .bts_interrupt_threshold = { 24, 8 },
413 .sizeof_bts = 3 * 8,
414 .from_ip = { 0, 8 },
415 .to_ip = { 8, 8 },
416 .info_type = { 8, 1 },
417 .info_data = { 16, 8 },
418 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
419}; 811};
420 812
421static inline void 813static inline void
@@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
429 switch (c->x86) { 821 switch (c->x86) {
430 case 0x6: 822 case 0x6:
431 switch (c->x86_model) { 823 switch (c->x86_model) {
432#ifdef __i386__
433 case 0xD: 824 case 0xD:
434 case 0xE: /* Pentium M */ 825 case 0xE: /* Pentium M */
435 ds_configure(&ds_cfg_pentium_m); 826 ds_configure(&ds_cfg_var);
436 break; 827 break;
437#endif /* _i386_ */
438 case 0xF: /* Core2 */ 828 case 0xF: /* Core2 */
439 ds_configure(&ds_cfg_core2); 829 case 0x1C: /* Atom */
830 ds_configure(&ds_cfg_64);
440 break; 831 break;
441 default: 832 default:
442 /* sorry, don't know about them */ 833 /* sorry, don't know about them */
@@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
445 break; 836 break;
446 case 0xF: 837 case 0xF:
447 switch (c->x86_model) { 838 switch (c->x86_model) {
448#ifdef __i386__
449 case 0x0: 839 case 0x0:
450 case 0x1: 840 case 0x1:
451 case 0x2: /* Netburst */ 841 case 0x2: /* Netburst */
452 ds_configure(&ds_cfg_netburst); 842 ds_configure(&ds_cfg_var);
453 break; 843 break;
454#endif /* _i386_ */
455 default: 844 default:
456 /* sorry, don't know about them */ 845 /* sorry, don't know about them */
457 break; 846 break;
@@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
462 break; 851 break;
463 } 852 }
464} 853}
854
855void ds_free(struct ds_context *context)
856{
857 /* This is called when the task owning the parameter context
858 * is dying. There should not be any user of that context left
 859 * to disturb us anymore. */
860 unsigned long leftovers = context->count;
861 while (leftovers--)
862 ds_put_context(context);
863}
864#endif /* CONFIG_X86_DS */
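Editor's note: much of the new ds.c revolves around ds_get()/ds_set(): the DS save area is treated as an opaque byte array, and each of the four buffer-management fields is located from the configured field size, the field index, and a four-field offset per qualifier (BTS fields first, PEBS fields second). A standalone model of that addressing and of the buffer setup done at the end of ds_request(); the buffer address, sizes, and the one-record advance are sample values only:

	#include <stdio.h>
	#include <string.h>

	enum ds_field { ds_buffer_base, ds_index, ds_absolute_maximum,
			ds_interrupt_threshold };
	enum ds_qualifier { ds_bts, ds_pebs };

	static unsigned char sizeof_field = sizeof(unsigned long);

	static unsigned long ds_get(const unsigned char *base,
				    enum ds_qualifier qual, enum ds_field field)
	{
		base += sizeof_field * (field + (4 * qual));
		return *(const unsigned long *)base;
	}

	static void ds_set(unsigned char *base, enum ds_qualifier qual,
			   enum ds_field field, unsigned long value)
	{
		base += sizeof_field * (field + (4 * qual));
		*(unsigned long *)base = value;
	}

	int main(void)
	{
		unsigned char ds[12 * sizeof(unsigned long)];	/* matches sizeof_ds */
		unsigned long buffer = 0x100000, size = 4096;

		memset(ds, 0, sizeof(ds));

		/* same field setup as the end of ds_request() above */
		ds_set(ds, ds_bts, ds_buffer_base, buffer);
		ds_set(ds, ds_bts, ds_index, buffer);
		ds_set(ds, ds_bts, ds_absolute_maximum, buffer + size);
		ds_set(ds, ds_bts, ds_interrupt_threshold, buffer + size + 1);

		/* mimic one BTS record being written, as ds_write() does;
		 * record size taken from ds_cfg_var above (3 longs) */
		ds_set(ds, ds_bts, ds_index,
		       ds_get(ds, ds_bts, ds_index) + 3 * sizeof(long));

		printf("bts records so far: %lu\n",
		       (ds_get(ds, ds_bts, ds_index) -
			ds_get(ds, ds_bts, ds_buffer_base)) / (3 * sizeof(long)));
		return 0;
	}

Placing the interrupt threshold one byte past the end, as ds_request() does when no overflow callback is given, guarantees the threshold is never reached, so the buffer simply wraps.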
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
new file mode 100644
index 000000000000..b3614752197b
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -0,0 +1,449 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16#include <linux/sysfs.h>
17
18#include <asm/stacktrace.h>
19
20#define STACKSLOTS_PER_LINE 8
21#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33
34static inline int valid_stack_ptr(struct thread_info *tinfo,
35 void *p, unsigned int size, void *end)
36{
37 void *t = tinfo;
38 if (end) {
39 if (p < end && p >= (end-THREAD_SIZE))
40 return 1;
41 else
42 return 0;
43 }
44 return p > t && p < t + THREAD_SIZE - size;
45}
46
47/* The form of the top of the frame on the stack */
48struct stack_frame {
49 struct stack_frame *next_frame;
50 unsigned long return_address;
51};
52
53static inline unsigned long
54print_context_stack(struct thread_info *tinfo,
55 unsigned long *stack, unsigned long bp,
56 const struct stacktrace_ops *ops, void *data,
57 unsigned long *end)
58{
59 struct stack_frame *frame = (struct stack_frame *)bp;
60
61 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
62 unsigned long addr;
63
64 addr = *stack;
65 if (__kernel_text_address(addr)) {
66 if ((unsigned long) stack == bp + sizeof(long)) {
67 ops->address(data, addr, 1);
68 frame = frame->next_frame;
69 bp = (unsigned long) frame;
70 } else {
71 ops->address(data, addr, bp == 0);
72 }
73 }
74 stack++;
75 }
76 return bp;
77}
78
79void dump_trace(struct task_struct *task, struct pt_regs *regs,
80 unsigned long *stack, unsigned long bp,
81 const struct stacktrace_ops *ops, void *data)
82{
83 if (!task)
84 task = current;
85
86 if (!stack) {
87 unsigned long dummy;
88 stack = &dummy;
89 if (task && task != current)
90 stack = (unsigned long *)task->thread.sp;
91 }
92
93#ifdef CONFIG_FRAME_POINTER
94 if (!bp) {
95 if (task == current) {
96 /* Grab bp right from our regs */
97 get_bp(bp);
98 } else {
99 /* bp is the last reg pushed by switch_to */
100 bp = *(unsigned long *) task->thread.sp;
101 }
102 }
103#endif
104
105 for (;;) {
106 struct thread_info *context;
107
108 context = (struct thread_info *)
109 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
110 bp = print_context_stack(context, stack, bp, ops, data, NULL);
111
112 stack = (unsigned long *)context->previous_esp;
113 if (!stack)
114 break;
115 if (ops->stack(data, "IRQ") < 0)
116 break;
117 touch_nmi_watchdog();
118 }
119}
120EXPORT_SYMBOL(dump_trace);
121
122static void
123print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
124{
125 printk(data);
126 print_symbol(msg, symbol);
127 printk("\n");
128}
129
130static void print_trace_warning(void *data, char *msg)
131{
132 printk("%s%s\n", (char *)data, msg);
133}
134
135static int print_trace_stack(void *data, char *name)
136{
137 printk("%s <%s> ", (char *)data, name);
138 return 0;
139}
140
141/*
 142 * Print one address/symbol entry per line.
143 */
144static void print_trace_address(void *data, unsigned long addr, int reliable)
145{
146 touch_nmi_watchdog();
147 printk(data);
148 printk_address(addr, reliable);
149}
150
151static const struct stacktrace_ops print_trace_ops = {
152 .warning = print_trace_warning,
153 .warning_symbol = print_trace_warning_symbol,
154 .stack = print_trace_stack,
155 .address = print_trace_address,
156};
157
158static void
159show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
160 unsigned long *stack, unsigned long bp, char *log_lvl)
161{
162 printk("%sCall Trace:\n", log_lvl);
163 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
164}
165
166void show_trace(struct task_struct *task, struct pt_regs *regs,
167 unsigned long *stack, unsigned long bp)
168{
169 show_trace_log_lvl(task, regs, stack, bp, "");
170}
171
172static void
173show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
174 unsigned long *sp, unsigned long bp, char *log_lvl)
175{
176 unsigned long *stack;
177 int i;
178
179 if (sp == NULL) {
180 if (task)
181 sp = (unsigned long *)task->thread.sp;
182 else
183 sp = (unsigned long *)&sp;
184 }
185
186 stack = sp;
187 for (i = 0; i < kstack_depth_to_print; i++) {
188 if (kstack_end(stack))
189 break;
190 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
191 printk("\n%s", log_lvl);
192 printk(" %08lx", *stack++);
193 touch_nmi_watchdog();
194 }
195 printk("\n");
196 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
197}
198
199void show_stack(struct task_struct *task, unsigned long *sp)
200{
201 show_stack_log_lvl(task, NULL, sp, 0, "");
202}
203
204/*
205 * The architecture-independent dump_stack generator
206 */
207void dump_stack(void)
208{
209 unsigned long bp = 0;
210 unsigned long stack;
211
212#ifdef CONFIG_FRAME_POINTER
213 if (!bp)
214 get_bp(bp);
215#endif
216
217 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
218 current->pid, current->comm, print_tainted(),
219 init_utsname()->release,
220 (int)strcspn(init_utsname()->version, " "),
221 init_utsname()->version);
222 show_trace(NULL, NULL, &stack, bp);
223}
224
225EXPORT_SYMBOL(dump_stack);
226
227void show_registers(struct pt_regs *regs)
228{
229 int i;
230
231 print_modules();
232 __show_regs(regs, 0);
233
234 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
235 TASK_COMM_LEN, current->comm, task_pid_nr(current),
236 current_thread_info(), current, task_thread_info(current));
237 /*
238 * When in-kernel, we also print out the stack and code at the
239 * time of the fault.
240 */
241 if (!user_mode_vm(regs)) {
242 unsigned int code_prologue = code_bytes * 43 / 64;
243 unsigned int code_len = code_bytes;
244 unsigned char c;
245 u8 *ip;
246
247 printk(KERN_EMERG "Stack:\n");
248 show_stack_log_lvl(NULL, regs, &regs->sp,
249 0, KERN_EMERG);
250
251 printk(KERN_EMERG "Code: ");
252
253 ip = (u8 *)regs->ip - code_prologue;
254 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
255 /* try starting at IP */
256 ip = (u8 *)regs->ip;
257 code_len = code_len - code_prologue + 1;
258 }
259 for (i = 0; i < code_len; i++, ip++) {
260 if (ip < (u8 *)PAGE_OFFSET ||
261 probe_kernel_address(ip, c)) {
262 printk(" Bad EIP value.");
263 break;
264 }
265 if (ip == (u8 *)regs->ip)
266 printk("<%02x> ", c);
267 else
268 printk("%02x ", c);
269 }
270 }
271 printk("\n");
272}
273
274int is_valid_bugaddr(unsigned long ip)
275{
276 unsigned short ud2;
277
278 if (ip < PAGE_OFFSET)
279 return 0;
280 if (probe_kernel_address((unsigned short *)ip, ud2))
281 return 0;
282
283 return ud2 == 0x0b0f;
284}
285
286static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
287static int die_owner = -1;
288static unsigned int die_nest_count;
289
290unsigned __kprobes long oops_begin(void)
291{
292 unsigned long flags;
293
294 oops_enter();
295
296 if (die_owner != raw_smp_processor_id()) {
297 console_verbose();
298 raw_local_irq_save(flags);
299 __raw_spin_lock(&die_lock);
300 die_owner = smp_processor_id();
301 die_nest_count = 0;
302 bust_spinlocks(1);
303 } else {
304 raw_local_irq_save(flags);
305 }
306 die_nest_count++;
307 return flags;
308}
309
310void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
311{
312 bust_spinlocks(0);
313 die_owner = -1;
314 add_taint(TAINT_DIE);
315 __raw_spin_unlock(&die_lock);
316 raw_local_irq_restore(flags);
317
318 if (!regs)
319 return;
320
321 if (kexec_should_crash(current))
322 crash_kexec(regs);
323 if (in_interrupt())
324 panic("Fatal exception in interrupt");
325 if (panic_on_oops)
326 panic("Fatal exception");
327 oops_exit();
328 do_exit(signr);
329}
330
331int __kprobes __die(const char *str, struct pt_regs *regs, long err)
332{
333 unsigned short ss;
334 unsigned long sp;
335
336 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
337#ifdef CONFIG_PREEMPT
338 printk("PREEMPT ");
339#endif
340#ifdef CONFIG_SMP
341 printk("SMP ");
342#endif
343#ifdef CONFIG_DEBUG_PAGEALLOC
344 printk("DEBUG_PAGEALLOC");
345#endif
346 printk("\n");
347 sysfs_printk_last_file();
348 if (notify_die(DIE_OOPS, str, regs, err,
349 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
350 return 1;
351
352 show_registers(regs);
353 /* Executive summary in case the oops scrolled away */
354 sp = (unsigned long) (&regs->sp);
355 savesegment(ss, ss);
356 if (user_mode(regs)) {
357 sp = regs->sp;
358 ss = regs->ss & 0xffff;
359 }
360 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
361 print_symbol("%s", regs->ip);
362 printk(" SS:ESP %04x:%08lx\n", ss, sp);
363 return 0;
364}
365
366/*
367 * This path is taken when something in the kernel has done something bad
368 * and is about to be terminated:
369 */
370void die(const char *str, struct pt_regs *regs, long err)
371{
372 unsigned long flags = oops_begin();
373
374 if (die_nest_count < 3) {
375 report_bug(regs->ip, regs);
376
377 if (__die(str, regs, err))
378 regs = NULL;
379 } else {
380 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
381 }
382
383 oops_end(flags, regs, SIGSEGV);
384}
385
386static DEFINE_SPINLOCK(nmi_print_lock);
387
388void notrace __kprobes
389die_nmi(char *str, struct pt_regs *regs, int do_panic)
390{
391 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
392 return;
393
394 spin_lock(&nmi_print_lock);
395 /*
396 * We are in trouble anyway, let's at least try
397 * to get a message out:
398 */
399 bust_spinlocks(1);
400 printk(KERN_EMERG "%s", str);
401 printk(" on CPU%d, ip %08lx, registers:\n",
402 smp_processor_id(), regs->ip);
403 show_registers(regs);
404 if (do_panic)
405 panic("Non maskable interrupt");
406 console_silent();
407 spin_unlock(&nmi_print_lock);
408
409 /*
410 * If we are in the kernel we are probably nested up pretty badly
411 * and might as well get out now while we still can:
412 */
413 if (!user_mode_vm(regs)) {
414 current->thread.trap_no = 2;
415 crash_kexec(regs);
416 }
417
418 bust_spinlocks(0);
419 do_exit(SIGSEGV);
420}
421
422static int __init oops_setup(char *s)
423{
424 if (!s)
425 return -EINVAL;
426 if (!strcmp(s, "panic"))
427 panic_on_oops = 1;
428 return 0;
429}
430early_param("oops", oops_setup);
431
432static int __init kstack_setup(char *s)
433{
434 if (!s)
435 return -EINVAL;
436 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
437 return 0;
438}
439early_param("kstack", kstack_setup);
440
441static int __init code_bytes_setup(char *s)
442{
443 code_bytes = simple_strtoul(s, NULL, 0);
444 if (code_bytes > 8192)
445 code_bytes = 8192;
446
447 return 1;
448}
449__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
new file mode 100644
index 000000000000..96a5db7da8a7
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -0,0 +1,575 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16#include <linux/sysfs.h>
17
18#include <asm/stacktrace.h>
19
20#define STACKSLOTS_PER_LINE 4
21#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33
34static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
35 unsigned *usedp, char **idp)
36{
37 static char ids[][8] = {
38 [DEBUG_STACK - 1] = "#DB",
39 [NMI_STACK - 1] = "NMI",
40 [DOUBLEFAULT_STACK - 1] = "#DF",
41 [STACKFAULT_STACK - 1] = "#SS",
42 [MCE_STACK - 1] = "#MC",
43#if DEBUG_STKSZ > EXCEPTION_STKSZ
44 [N_EXCEPTION_STACKS ...
45 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
46#endif
47 };
48 unsigned k;
49
50 /*
51 * Iterate over all exception stacks, and figure out whether
52 * 'stack' is in one of them:
53 */
54 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
55 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
56 /*
57 * Is 'stack' above this exception frame's end?
58 * If yes then skip to the next frame.
59 */
60 if (stack >= end)
61 continue;
62 /*
63 * Is 'stack' above this exception frame's start address?
64 * If yes then we found the right frame.
65 */
66 if (stack >= end - EXCEPTION_STKSZ) {
67 /*
68 * Make sure we only iterate through an exception
69 * stack once. If it comes up for the second time
70 * then there's something wrong going on - just
71 * break out and return NULL:
72 */
73 if (*usedp & (1U << k))
74 break;
75 *usedp |= 1U << k;
76 *idp = ids[k];
77 return (unsigned long *)end;
78 }
79 /*
80 * If this is a debug stack, and if it has a larger size than
81 * the usual exception stacks, then 'stack' might still
82 * be within the lower portion of the debug stack:
83 */
84#if DEBUG_STKSZ > EXCEPTION_STKSZ
85 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
86 unsigned j = N_EXCEPTION_STACKS - 1;
87
88 /*
89 * Black magic. A large debug stack is composed of
90 * multiple exception stack entries, which we
91 * iterate through now. Don't look:
92 */
93 do {
94 ++j;
95 end -= EXCEPTION_STKSZ;
96 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
97 } while (stack < end - EXCEPTION_STKSZ);
98 if (*usedp & (1U << j))
99 break;
100 *usedp |= 1U << j;
101 *idp = ids[j];
102 return (unsigned long *)end;
103 }
104#endif
105 }
106 return NULL;
107}
108
109/*
110 * x86-64 can have up to three kernel stacks:
111 * process stack
112 * interrupt stack
113 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
114 */
115
116static inline int valid_stack_ptr(struct thread_info *tinfo,
117 void *p, unsigned int size, void *end)
118{
119 void *t = tinfo;
120 if (end) {
121 if (p < end && p >= (end-THREAD_SIZE))
122 return 1;
123 else
124 return 0;
125 }
126 return p > t && p < t + THREAD_SIZE - size;
127}
128
129/* The form of the top of the frame on the stack */
130struct stack_frame {
131 struct stack_frame *next_frame;
132 unsigned long return_address;
133};
134
135static inline unsigned long
136print_context_stack(struct thread_info *tinfo,
137 unsigned long *stack, unsigned long bp,
138 const struct stacktrace_ops *ops, void *data,
139 unsigned long *end)
140{
141 struct stack_frame *frame = (struct stack_frame *)bp;
142
143 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
144 unsigned long addr;
145
146 addr = *stack;
147 if (__kernel_text_address(addr)) {
148 if ((unsigned long) stack == bp + sizeof(long)) {
149 ops->address(data, addr, 1);
150 frame = frame->next_frame;
151 bp = (unsigned long) frame;
152 } else {
153 ops->address(data, addr, bp == 0);
154 }
155 }
156 stack++;
157 }
158 return bp;
159}
160
161void dump_trace(struct task_struct *task, struct pt_regs *regs,
162 unsigned long *stack, unsigned long bp,
163 const struct stacktrace_ops *ops, void *data)
164{
165 const unsigned cpu = get_cpu();
166 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
167 unsigned used = 0;
168 struct thread_info *tinfo;
169
170 if (!task)
171 task = current;
172
173 if (!stack) {
174 unsigned long dummy;
175 stack = &dummy;
176 if (task && task != current)
177 stack = (unsigned long *)task->thread.sp;
178 }
179
180#ifdef CONFIG_FRAME_POINTER
181 if (!bp) {
182 if (task == current) {
183 /* Grab bp right from our regs */
184 get_bp(bp);
185 } else {
186 /* bp is the last reg pushed by switch_to */
187 bp = *(unsigned long *) task->thread.sp;
188 }
189 }
190#endif
191
192 /*
193 * Print function call entries in all stacks, starting at the
194 * current stack address. If the stacks consist of nested
195 * exceptions, we follow each one through to the process stack.
196 */
197 tinfo = task_thread_info(task);
198 for (;;) {
199 char *id;
200 unsigned long *estack_end;
201 estack_end = in_exception_stack(cpu, (unsigned long)stack,
202 &used, &id);
203
204 if (estack_end) {
205 if (ops->stack(data, id) < 0)
206 break;
207
208 bp = print_context_stack(tinfo, stack, bp, ops,
209 data, estack_end);
210 ops->stack(data, "<EOE>");
211 /*
212 * We link to the next stack via the
213 * second-to-last pointer (index -2 to end) in the
214 * exception stack:
215 */
216 stack = (unsigned long *) estack_end[-2];
217 continue;
218 }
219 if (irqstack_end) {
220 unsigned long *irqstack;
221 irqstack = irqstack_end -
222 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
223
224 if (stack >= irqstack && stack < irqstack_end) {
225 if (ops->stack(data, "IRQ") < 0)
226 break;
227 bp = print_context_stack(tinfo, stack, bp,
228 ops, data, irqstack_end);
229 /*
230 * We link to the next stack (which would normally
231 * be the process stack) via the last
232 * pointer (index -1 to end) in the IRQ stack:
233 */
234 stack = (unsigned long *) (irqstack_end[-1]);
235 irqstack_end = NULL;
236 ops->stack(data, "EOI");
237 continue;
238 }
239 }
240 break;
241 }
242
243 /*
244 * This handles the process stack:
245 */
246 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
247 put_cpu();
248}
249EXPORT_SYMBOL(dump_trace);
250
251static void
252print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
253{
254 printk(data);
255 print_symbol(msg, symbol);
256 printk("\n");
257}
258
259static void print_trace_warning(void *data, char *msg)
260{
261 printk("%s%s\n", (char *)data, msg);
262}
263
264static int print_trace_stack(void *data, char *name)
265{
266 printk("%s <%s> ", (char *)data, name);
267 return 0;
268}
269
270/*
271 * Print one address/symbol entry per line.
272 */
273static void print_trace_address(void *data, unsigned long addr, int reliable)
274{
275 touch_nmi_watchdog();
276 printk(data);
277 printk_address(addr, reliable);
278}
279
280static const struct stacktrace_ops print_trace_ops = {
281 .warning = print_trace_warning,
282 .warning_symbol = print_trace_warning_symbol,
283 .stack = print_trace_stack,
284 .address = print_trace_address,
285};
286
287static void
288show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
289 unsigned long *stack, unsigned long bp, char *log_lvl)
290{
291 printk("%sCall Trace:\n", log_lvl);
292 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
293}
294
295void show_trace(struct task_struct *task, struct pt_regs *regs,
296 unsigned long *stack, unsigned long bp)
297{
298 show_trace_log_lvl(task, regs, stack, bp, "");
299}
300
301static void
302show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
303 unsigned long *sp, unsigned long bp, char *log_lvl)
304{
305 unsigned long *stack;
306 int i;
307 const int cpu = smp_processor_id();
308 unsigned long *irqstack_end =
309 (unsigned long *) (cpu_pda(cpu)->irqstackptr);
310 unsigned long *irqstack =
311 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
312
313 /*
314 * debugging aid: "show_stack(NULL, NULL);" prints the
315 * back trace for this cpu.
316 */
317
318 if (sp == NULL) {
319 if (task)
320 sp = (unsigned long *)task->thread.sp;
321 else
322 sp = (unsigned long *)&sp;
323 }
324
325 stack = sp;
326 for (i = 0; i < kstack_depth_to_print; i++) {
327 if (stack >= irqstack && stack <= irqstack_end) {
328 if (stack == irqstack_end) {
329 stack = (unsigned long *) (irqstack_end[-1]);
330 printk(" <EOI> ");
331 }
332 } else {
333 if (((long) stack & (THREAD_SIZE-1)) == 0)
334 break;
335 }
336 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
337 printk("\n%s", log_lvl);
338 printk(" %016lx", *stack++);
339 touch_nmi_watchdog();
340 }
341 printk("\n");
342 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
343}
344
345void show_stack(struct task_struct *task, unsigned long *sp)
346{
347 show_stack_log_lvl(task, NULL, sp, 0, "");
348}
349
350/*
351 * The architecture-independent dump_stack generator
352 */
353void dump_stack(void)
354{
355 unsigned long bp = 0;
356 unsigned long stack;
357
358#ifdef CONFIG_FRAME_POINTER
359 if (!bp)
360 get_bp(bp);
361#endif
362
363 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
364 current->pid, current->comm, print_tainted(),
365 init_utsname()->release,
366 (int)strcspn(init_utsname()->version, " "),
367 init_utsname()->version);
368 show_trace(NULL, NULL, &stack, bp);
369}
370EXPORT_SYMBOL(dump_stack);
371
372void show_registers(struct pt_regs *regs)
373{
374 int i;
375 unsigned long sp;
376 const int cpu = smp_processor_id();
377 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
378
379 sp = regs->sp;
380 printk("CPU %d ", cpu);
381 __show_regs(regs, 1);
382 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
383 cur->comm, cur->pid, task_thread_info(cur), cur);
384
385 /*
386 * When in-kernel, we also print out the stack and code at the
387 * time of the fault.
388 */
389 if (!user_mode(regs)) {
390 unsigned int code_prologue = code_bytes * 43 / 64;
391 unsigned int code_len = code_bytes;
392 unsigned char c;
393 u8 *ip;
394
395 printk(KERN_EMERG "Stack:\n");
396 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
397 regs->bp, KERN_EMERG);
398
399 printk(KERN_EMERG "Code: ");
400
401 ip = (u8 *)regs->ip - code_prologue;
402 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
403 /* try starting at IP */
404 ip = (u8 *)regs->ip;
405 code_len = code_len - code_prologue + 1;
406 }
407 for (i = 0; i < code_len; i++, ip++) {
408 if (ip < (u8 *)PAGE_OFFSET ||
409 probe_kernel_address(ip, c)) {
410 printk(" Bad RIP value.");
411 break;
412 }
413 if (ip == (u8 *)regs->ip)
414 printk("<%02x> ", c);
415 else
416 printk("%02x ", c);
417 }
418 }
419 printk("\n");
420}
421
422int is_valid_bugaddr(unsigned long ip)
423{
424 unsigned short ud2;
425
426 if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
427 return 0;
428
429 return ud2 == 0x0b0f;
430}
431
432static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
433static int die_owner = -1;
434static unsigned int die_nest_count;
435
436unsigned __kprobes long oops_begin(void)
437{
438 int cpu;
439 unsigned long flags;
440
441 oops_enter();
442
443 /* racy, but better than risking deadlock. */
444 raw_local_irq_save(flags);
445 cpu = smp_processor_id();
446 if (!__raw_spin_trylock(&die_lock)) {
447 if (cpu == die_owner)
448 /* nested oops. should stop eventually */;
449 else
450 __raw_spin_lock(&die_lock);
451 }
452 die_nest_count++;
453 die_owner = cpu;
454 console_verbose();
455 bust_spinlocks(1);
456 return flags;
457}
458
459void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
460{
461 die_owner = -1;
462 bust_spinlocks(0);
463 die_nest_count--;
464 if (!die_nest_count)
465 /* Nest count reaches zero, release the lock. */
466 __raw_spin_unlock(&die_lock);
467 raw_local_irq_restore(flags);
468 if (!regs) {
469 oops_exit();
470 return;
471 }
472 if (in_interrupt())
473 panic("Fatal exception in interrupt");
474 if (panic_on_oops)
475 panic("Fatal exception");
476 oops_exit();
477 do_exit(signr);
478}
479
480int __kprobes __die(const char *str, struct pt_regs *regs, long err)
481{
482 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
483#ifdef CONFIG_PREEMPT
484 printk("PREEMPT ");
485#endif
486#ifdef CONFIG_SMP
487 printk("SMP ");
488#endif
489#ifdef CONFIG_DEBUG_PAGEALLOC
490 printk("DEBUG_PAGEALLOC");
491#endif
492 printk("\n");
493 sysfs_printk_last_file();
494 if (notify_die(DIE_OOPS, str, regs, err,
495 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
496 return 1;
497
498 show_registers(regs);
499 add_taint(TAINT_DIE);
500 /* Executive summary in case the oops scrolled away */
501 printk(KERN_ALERT "RIP ");
502 printk_address(regs->ip, 1);
503 printk(" RSP <%016lx>\n", regs->sp);
504 if (kexec_should_crash(current))
505 crash_kexec(regs);
506 return 0;
507}
508
509void die(const char *str, struct pt_regs *regs, long err)
510{
511 unsigned long flags = oops_begin();
512
513 if (!user_mode(regs))
514 report_bug(regs->ip, regs);
515
516 if (__die(str, regs, err))
517 regs = NULL;
518 oops_end(flags, regs, SIGSEGV);
519}
520
521notrace __kprobes void
522die_nmi(char *str, struct pt_regs *regs, int do_panic)
523{
524 unsigned long flags;
525
526 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
527 return;
528
529 flags = oops_begin();
530 /*
531 * We are in trouble anyway, let's at least try
532 * to get a message out.
533 */
534 printk(KERN_EMERG "%s", str);
535 printk(" on CPU%d, ip %08lx, registers:\n",
536 smp_processor_id(), regs->ip);
537 show_registers(regs);
538 if (kexec_should_crash(current))
539 crash_kexec(regs);
540 if (do_panic || panic_on_oops)
541 panic("Non maskable interrupt");
542 oops_end(flags, NULL, SIGBUS);
543 nmi_exit();
544 local_irq_enable();
545 do_exit(SIGBUS);
546}
547
548static int __init oops_setup(char *s)
549{
550 if (!s)
551 return -EINVAL;
552 if (!strcmp(s, "panic"))
553 panic_on_oops = 1;
554 return 0;
555}
556early_param("oops", oops_setup);
557
558static int __init kstack_setup(char *s)
559{
560 if (!s)
561 return -EINVAL;
562 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
563 return 0;
564}
565early_param("kstack", kstack_setup);
566
567static int __init code_bytes_setup(char *s)
568{
569 code_bytes = simple_strtoul(s, NULL, 0);
570 if (code_bytes > 8192)
571 code_bytes = 8192;
572
573 return 1;
574}
575__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 28c29180b380..ce97bf3bed12 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -148,6 +148,9 @@ void __init e820_print_map(char *who)
148 case E820_NVS: 148 case E820_NVS:
149 printk(KERN_CONT "(ACPI NVS)\n"); 149 printk(KERN_CONT "(ACPI NVS)\n");
150 break; 150 break;
151 case E820_UNUSABLE:
152 printk("(unusable)\n");
153 break;
151 default: 154 default:
152 printk(KERN_CONT "type %u\n", e820.map[i].type); 155 printk(KERN_CONT "type %u\n", e820.map[i].type);
153 break; 156 break;
@@ -877,7 +880,8 @@ void __init early_res_to_bootmem(u64 start, u64 end)
877 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) 880 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
878 count++; 881 count++;
879 882
880 printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count); 883 printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
884 count, start, end);
881 for (i = 0; i < count; i++) { 885 for (i = 0; i < count; i++) {
882 struct early_res *r = &early_res[i]; 886 struct early_res *r = &early_res[i];
883 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, 887 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
@@ -1202,7 +1206,7 @@ static int __init parse_memmap_opt(char *p)
1202 if (!p) 1206 if (!p)
1203 return -EINVAL; 1207 return -EINVAL;
1204 1208
1205 if (!strcmp(p, "exactmap")) { 1209 if (!strncmp(p, "exactmap", 8)) {
1206#ifdef CONFIG_CRASH_DUMP 1210#ifdef CONFIG_CRASH_DUMP
1207 /* 1211 /*
1208 * If we are doing a crash dump, we still need to know 1212 * If we are doing a crash dump, we still need to know
@@ -1259,6 +1263,7 @@ static inline const char *e820_type_to_string(int e820_type)
1259 case E820_RAM: return "System RAM"; 1263 case E820_RAM: return "System RAM";
1260 case E820_ACPI: return "ACPI Tables"; 1264 case E820_ACPI: return "ACPI Tables";
1261 case E820_NVS: return "ACPI Non-volatile Storage"; 1265 case E820_NVS: return "ACPI Non-volatile Storage";
1266 case E820_UNUSABLE: return "Unusable memory";
1262 default: return "reserved"; 1267 default: return "reserved";
1263 } 1268 }
1264} 1269}
@@ -1266,6 +1271,7 @@ static inline const char *e820_type_to_string(int e820_type)
1266/* 1271/*
1267 * Mark e820 reserved areas as busy for the resource manager. 1272 * Mark e820 reserved areas as busy for the resource manager.
1268 */ 1273 */
1274static struct resource __initdata *e820_res;
1269void __init e820_reserve_resources(void) 1275void __init e820_reserve_resources(void)
1270{ 1276{
1271 int i; 1277 int i;
@@ -1273,20 +1279,26 @@ void __init e820_reserve_resources(void)
1273 u64 end; 1279 u64 end;
1274 1280
1275 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); 1281 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
1282 e820_res = res;
1276 for (i = 0; i < e820.nr_map; i++) { 1283 for (i = 0; i < e820.nr_map; i++) {
1277 end = e820.map[i].addr + e820.map[i].size - 1; 1284 end = e820.map[i].addr + e820.map[i].size - 1;
1278#ifndef CONFIG_RESOURCES_64BIT 1285 if (end != (resource_size_t)end) {
1279 if (end > 0x100000000ULL) {
1280 res++; 1286 res++;
1281 continue; 1287 continue;
1282 } 1288 }
1283#endif
1284 res->name = e820_type_to_string(e820.map[i].type); 1289 res->name = e820_type_to_string(e820.map[i].type);
1285 res->start = e820.map[i].addr; 1290 res->start = e820.map[i].addr;
1286 res->end = end; 1291 res->end = end;
1287 1292
1288 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 1293 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1289 insert_resource(&iomem_resource, res); 1294
1295 /*
1296 * don't register the region that could be conflicted with
1297 * pci device BAR resource and insert them later in
1298 * pcibios_resource_survey()
1299 */
1300 if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20))
1301 insert_resource(&iomem_resource, res);
1290 res++; 1302 res++;
1291 } 1303 }
1292 1304
@@ -1298,10 +1310,18 @@ void __init e820_reserve_resources(void)
1298 } 1310 }
1299} 1311}
1300 1312
1301/* 1313void __init e820_reserve_resources_late(void)
1302 * Non-standard memory setup can be specified via this quirk: 1314{
1303 */ 1315 int i;
1304char * (*arch_memory_setup_quirk)(void); 1316 struct resource *res;
1317
1318 res = e820_res;
1319 for (i = 0; i < e820.nr_map; i++) {
1320 if (!res->parent && res->end)
1321 reserve_region_with_split(&iomem_resource, res->start, res->end, res->name);
1322 res++;
1323 }
1324}
1305 1325
1306char *__init default_machine_specific_memory_setup(void) 1326char *__init default_machine_specific_memory_setup(void)
1307{ 1327{
@@ -1343,8 +1363,8 @@ char *__init default_machine_specific_memory_setup(void)
1343 1363
1344char *__init __attribute__((weak)) machine_specific_memory_setup(void) 1364char *__init __attribute__((weak)) machine_specific_memory_setup(void)
1345{ 1365{
1346 if (arch_memory_setup_quirk) { 1366 if (x86_quirks->arch_memory_setup) {
1347 char *who = arch_memory_setup_quirk(); 1367 char *who = x86_quirks->arch_memory_setup();
1348 1368
1349 if (who) 1369 if (who)
1350 return who; 1370 return who;
@@ -1367,24 +1387,3 @@ void __init setup_memory_map(void)
1367 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 1387 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1368 e820_print_map(who); 1388 e820_print_map(who);
1369} 1389}
1370
1371#ifdef CONFIG_X86_64
1372int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
1373{
1374 int i;
1375
1376 if (slot < 0 || slot >= e820.nr_map)
1377 return -1;
1378 for (i = slot; i < e820.nr_map; i++) {
1379 if (e820.map[i].type != E820_RAM)
1380 continue;
1381 break;
1382 }
1383 if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
1384 return -1;
1385 *addr = e820.map[i].addr;
1386 *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
1387 max_pfn << PAGE_SHIFT) - *addr;
1388 return i + 1;
1389}
1390#endif
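A note on the resource hunk above: the CONFIG_RESOURCES_64BIT #ifdef can go away because casting an above-4GiB end address to a 32-bit resource_size_t truncates it, so the new "end != (resource_size_t)end" test rejects exactly the regions the old "end > 0x100000000ULL" check used to skip. Spelled out as a helper (the name is hypothetical, for illustration only):

/* true iff 'end' survives a round trip through resource_size_t */
static inline int e820_end_fits_resource(u64 end)
{
	return end == (resource_size_t)end;
}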
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a0e11c0cc872..3ce029ffaa55 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -16,10 +16,7 @@
16#include <asm/dma.h> 16#include <asm/dma.h>
17#include <asm/io_apic.h> 17#include <asm/io_apic.h>
18#include <asm/apic.h> 18#include <asm/apic.h>
19 19#include <asm/iommu.h>
20#ifdef CONFIG_GART_IOMMU
21#include <asm/gart.h>
22#endif
23 20
24static void __init fix_hypertransport_config(int num, int slot, int func) 21static void __init fix_hypertransport_config(int num, int slot, int func)
25{ 22{
@@ -98,6 +95,113 @@ static void __init nvidia_bugs(int num, int slot, int func)
98 95
99} 96}
100 97
98#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
99static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
100{
101 u32 d;
102 u8 b;
103
104 b = read_pci_config_byte(num, slot, func, 0xac);
105 b &= ~(1<<5);
106 write_pci_config_byte(num, slot, func, 0xac, b);
107
108 d = read_pci_config(num, slot, func, 0x70);
109 d |= 1<<8;
110 write_pci_config(num, slot, func, 0x70, d);
111
112 d = read_pci_config(num, slot, func, 0x8);
113 d &= 0xff;
114 return d;
115}
116
117static void __init ati_bugs(int num, int slot, int func)
118{
119 u32 d;
120 u8 b;
121
122 if (acpi_use_timer_override)
123 return;
124
125 d = ati_ixp4x0_rev(num, slot, func);
126 if (d < 0x82)
127 acpi_skip_timer_override = 1;
128 else {
129 /* check for IRQ0 interrupt swap */
130 outb(0x72, 0xcd6); b = inb(0xcd7);
131 if (!(b & 0x2))
132 acpi_skip_timer_override = 1;
133 }
134
135 if (acpi_skip_timer_override) {
136 printk(KERN_INFO "SB4X0 revision 0x%x\n", d);
137 printk(KERN_INFO "Ignoring ACPI timer override.\n");
138 printk(KERN_INFO "If you got timer trouble "
139 "try acpi_use_timer_override\n");
140 }
141}
142
143static u32 __init ati_sbx00_rev(int num, int slot, int func)
144{
145 u32 old, d;
146
147 d = read_pci_config(num, slot, func, 0x70);
148 old = d;
149 d &= ~(1<<8);
150 write_pci_config(num, slot, func, 0x70, d);
151 d = read_pci_config(num, slot, func, 0x8);
152 d &= 0xff;
153 write_pci_config(num, slot, func, 0x70, old);
154
155 return d;
156}
157
158static void __init ati_bugs_contd(int num, int slot, int func)
159{
160 u32 d, rev;
161
162 if (acpi_use_timer_override)
163 return;
164
165 rev = ati_sbx00_rev(num, slot, func);
166 if (rev > 0x13)
167 return;
168
169 /* check for IRQ0 interrupt swap */
170 d = read_pci_config(num, slot, func, 0x64);
171 if (!(d & (1<<14)))
172 acpi_skip_timer_override = 1;
173
174 if (acpi_skip_timer_override) {
175 printk(KERN_INFO "SB600 revision 0x%x\n", rev);
176 printk(KERN_INFO "Ignoring ACPI timer override.\n");
177 printk(KERN_INFO "If you got timer trouble "
178 "try acpi_use_timer_override\n");
179 }
180}
181#else
182static void __init ati_bugs(int num, int slot, int func)
183{
184}
185
186static void __init ati_bugs_contd(int num, int slot, int func)
187{
188}
189#endif
190
191#ifdef CONFIG_DMAR
192static void __init intel_g33_dmar(int num, int slot, int func)
193{
194 struct acpi_table_header *dmar_tbl;
195 acpi_status status;
196
197 status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
198 if (ACPI_SUCCESS(status)) {
199 printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
200 dmar_disabled = 1;
201 }
202}
203#endif
204
101#define QFLAG_APPLY_ONCE 0x1 205#define QFLAG_APPLY_ONCE 0x1
102#define QFLAG_APPLIED 0x2 206#define QFLAG_APPLIED 0x2
103#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 207#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -117,6 +221,14 @@ static struct chipset early_qrk[] __initdata = {
117 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, 221 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
118 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, 222 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
119 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, 223 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
224 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
225 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
226 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
227 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
228#ifdef CONFIG_DMAR
229 { PCI_VENDOR_ID_INTEL, 0x29c0,
230 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
231#endif
120 {} 232 {}
121}; 233};
122 234
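For orientation, every entry in the early_qrk[] table above uses the same six-field layout: vendor, device, PCI class, class mask, flags, handler. A made-up example (the device ID and handler are purely illustrative, not part of this patch):

static void __init example_bugs(int num, int slot, int func)
{
	/* early quirks poke config space via the early PCI helpers */
	u32 d = read_pci_config(num, slot, func, 0x70);

	/* ... inspect or rewrite bits of 'd' here ... */
	write_pci_config(num, slot, func, 0x70, d);
}

/* corresponding (hypothetical) table entry:
 * { PCI_VENDOR_ID_INTEL, 0x1234,
 *   PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, QFLAG_APPLY_ONCE, example_bugs },
 */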
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index ff9e7350da54..34ad997d3834 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -3,11 +3,19 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/screen_info.h> 5#include <linux/screen_info.h>
6#include <linux/usb/ch9.h>
7#include <linux/pci_regs.h>
8#include <linux/pci_ids.h>
9#include <linux/errno.h>
6#include <asm/io.h> 10#include <asm/io.h>
7#include <asm/processor.h> 11#include <asm/processor.h>
8#include <asm/fcntl.h> 12#include <asm/fcntl.h>
9#include <asm/setup.h> 13#include <asm/setup.h>
10#include <xen/hvc-console.h> 14#include <xen/hvc-console.h>
15#include <asm/pci-direct.h>
16#include <asm/pgtable.h>
17#include <asm/fixmap.h>
18#include <linux/usb/ehci_def.h>
11 19
12/* Simple VGA output */ 20/* Simple VGA output */
13#define VGABASE (__ISA_IO_base + 0xb8000) 21#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -78,6 +86,7 @@ static int early_serial_base = 0x3f8; /* ttyS0 */
78static int early_serial_putc(unsigned char ch) 86static int early_serial_putc(unsigned char ch)
79{ 87{
80 unsigned timeout = 0xffff; 88 unsigned timeout = 0xffff;
89
81 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) 90 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
82 cpu_relax(); 91 cpu_relax();
83 outb(ch, early_serial_base + TXR); 92 outb(ch, early_serial_base + TXR);
@@ -111,7 +120,7 @@ static __init void early_serial_init(char *s)
111 if (!strncmp(s, "0x", 2)) { 120 if (!strncmp(s, "0x", 2)) {
112 early_serial_base = simple_strtoul(s, &e, 16); 121 early_serial_base = simple_strtoul(s, &e, 16);
113 } else { 122 } else {
114 static int bases[] = { 0x3f8, 0x2f8 }; 123 static const int __initconst bases[] = { 0x3f8, 0x2f8 };
115 124
116 if (!strncmp(s, "ttyS", 4)) 125 if (!strncmp(s, "ttyS", 4))
117 s += 4; 126 s += 4;
@@ -151,6 +160,721 @@ static struct console early_serial_console = {
151 .index = -1, 160 .index = -1,
152}; 161};
153 162
163#ifdef CONFIG_EARLY_PRINTK_DBGP
164
165static struct ehci_caps __iomem *ehci_caps;
166static struct ehci_regs __iomem *ehci_regs;
167static struct ehci_dbg_port __iomem *ehci_debug;
168static unsigned int dbgp_endpoint_out;
169
170struct ehci_dev {
171 u32 bus;
172 u32 slot;
173 u32 func;
174};
175
176static struct ehci_dev ehci_dev;
177
178#define USB_DEBUG_DEVNUM 127
179
180#define DBGP_DATA_TOGGLE 0x8800
181
182static inline u32 dbgp_pid_update(u32 x, u32 tok)
183{
184 return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
185}
186
187static inline u32 dbgp_len_update(u32 x, u32 len)
188{
189 return (x & ~0x0f) | (len & 0x0f);
190}
191
192/*
193 * USB Packet IDs (PIDs)
194 */
195
196/* token */
197#define USB_PID_OUT 0xe1
198#define USB_PID_IN 0x69
199#define USB_PID_SOF 0xa5
200#define USB_PID_SETUP 0x2d
201/* handshake */
202#define USB_PID_ACK 0xd2
203#define USB_PID_NAK 0x5a
204#define USB_PID_STALL 0x1e
205#define USB_PID_NYET 0x96
206/* data */
207#define USB_PID_DATA0 0xc3
208#define USB_PID_DATA1 0x4b
209#define USB_PID_DATA2 0x87
210#define USB_PID_MDATA 0x0f
211/* Special */
212#define USB_PID_PREAMBLE 0x3c
213#define USB_PID_ERR 0x3c
214#define USB_PID_SPLIT 0x78
215#define USB_PID_PING 0xb4
216#define USB_PID_UNDEF_0 0xf0
217
218#define USB_PID_DATA_TOGGLE 0x88
219#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
220
221#define PCI_CAP_ID_EHCI_DEBUG 0xa
222
223#define HUB_ROOT_RESET_TIME 50 /* times are in msec */
224#define HUB_SHORT_RESET_TIME 10
225#define HUB_LONG_RESET_TIME 200
226#define HUB_RESET_TIMEOUT 500
227
228#define DBGP_MAX_PACKET 8
229
230static int dbgp_wait_until_complete(void)
231{
232 u32 ctrl;
233 int loop = 0x100000;
234
235 do {
236 ctrl = readl(&ehci_debug->control);
237 /* Stop when the transaction is finished */
238 if (ctrl & DBGP_DONE)
239 break;
240 } while (--loop > 0);
241
242 if (!loop)
243 return -1;
244
245 /*
246 * Now that we have observed the completed transaction,
247 * clear the done bit.
248 */
249 writel(ctrl | DBGP_DONE, &ehci_debug->control);
250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
251}
252
253static void dbgp_mdelay(int ms)
254{
255 int i;
256
257 while (ms--) {
258 for (i = 0; i < 1000; i++)
259 outb(0x1, 0x80);
260 }
261}
262
263static void dbgp_breath(void)
264{
265 /* Sleep to give the debug port a chance to breathe */
266}
267
268static int dbgp_wait_until_done(unsigned ctrl)
269{
270 u32 pids, lpid;
271 int ret;
272 int loop = 3;
273
274retry:
275 writel(ctrl | DBGP_GO, &ehci_debug->control);
276 ret = dbgp_wait_until_complete();
277 pids = readl(&ehci_debug->pids);
278 lpid = DBGP_PID_GET(pids);
279
280 if (ret < 0)
281 return ret;
282
283 /*
284 * If the port is getting full or it has dropped data
285 * start pacing ourselves, not necessary but it's friendly.
286 */
287 if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
288 dbgp_breath();
289
290 /* If I get a NACK reissue the transmission */
291 if (lpid == USB_PID_NAK) {
292 if (--loop > 0)
293 goto retry;
294 }
295
296 return ret;
297}
298
299static void dbgp_set_data(const void *buf, int size)
300{
301 const unsigned char *bytes = buf;
302 u32 lo, hi;
303 int i;
304
305 lo = hi = 0;
306 for (i = 0; i < 4 && i < size; i++)
307 lo |= bytes[i] << (8*i);
308 for (; i < 8 && i < size; i++)
309 hi |= bytes[i] << (8*(i - 4));
310 writel(lo, &ehci_debug->data03);
311 writel(hi, &ehci_debug->data47);
312}
313
314static void dbgp_get_data(void *buf, int size)
315{
316 unsigned char *bytes = buf;
317 u32 lo, hi;
318 int i;
319
320 lo = readl(&ehci_debug->data03);
321 hi = readl(&ehci_debug->data47);
322 for (i = 0; i < 4 && i < size; i++)
323 bytes[i] = (lo >> (8*i)) & 0xff;
324 for (; i < 8 && i < size; i++)
325 bytes[i] = (hi >> (8*(i - 4))) & 0xff;
326}
327
328static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
329 const char *bytes, int size)
330{
331 u32 pids, addr, ctrl;
332 int ret;
333
334 if (size > DBGP_MAX_PACKET)
335 return -1;
336
337 addr = DBGP_EPADDR(devnum, endpoint);
338
339 pids = readl(&ehci_debug->pids);
340 pids = dbgp_pid_update(pids, USB_PID_OUT);
341
342 ctrl = readl(&ehci_debug->control);
343 ctrl = dbgp_len_update(ctrl, size);
344 ctrl |= DBGP_OUT;
345 ctrl |= DBGP_GO;
346
347 dbgp_set_data(bytes, size);
348 writel(addr, &ehci_debug->address);
349 writel(pids, &ehci_debug->pids);
350
351 ret = dbgp_wait_until_done(ctrl);
352 if (ret < 0)
353 return ret;
354
355 return ret;
356}
357
358static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
359 int size)
360{
361 u32 pids, addr, ctrl;
362 int ret;
363
364 if (size > DBGP_MAX_PACKET)
365 return -1;
366
367 addr = DBGP_EPADDR(devnum, endpoint);
368
369 pids = readl(&ehci_debug->pids);
370 pids = dbgp_pid_update(pids, USB_PID_IN);
371
372 ctrl = readl(&ehci_debug->control);
373 ctrl = dbgp_len_update(ctrl, size);
374 ctrl &= ~DBGP_OUT;
375 ctrl |= DBGP_GO;
376
377 writel(addr, &ehci_debug->address);
378 writel(pids, &ehci_debug->pids);
379 ret = dbgp_wait_until_done(ctrl);
380 if (ret < 0)
381 return ret;
382
383 if (size > ret)
384 size = ret;
385 dbgp_get_data(data, size);
386 return ret;
387}
388
389static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
390 int value, int index, void *data, int size)
391{
392 u32 pids, addr, ctrl;
393 struct usb_ctrlrequest req;
394 int read;
395 int ret;
396
397 read = (requesttype & USB_DIR_IN) != 0;
398 if (size > (read ? DBGP_MAX_PACKET:0))
399 return -1;
400
401 /* Compute the control message */
402 req.bRequestType = requesttype;
403 req.bRequest = request;
404 req.wValue = cpu_to_le16(value);
405 req.wIndex = cpu_to_le16(index);
406 req.wLength = cpu_to_le16(size);
407
408 pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
409 addr = DBGP_EPADDR(devnum, 0);
410
411 ctrl = readl(&ehci_debug->control);
412 ctrl = dbgp_len_update(ctrl, sizeof(req));
413 ctrl |= DBGP_OUT;
414 ctrl |= DBGP_GO;
415
416 /* Send the setup message */
417 dbgp_set_data(&req, sizeof(req));
418 writel(addr, &ehci_debug->address);
419 writel(pids, &ehci_debug->pids);
420 ret = dbgp_wait_until_done(ctrl);
421 if (ret < 0)
422 return ret;
423
424 /* Read the result */
425 return dbgp_bulk_read(devnum, 0, data, size);
426}
427
428
429/* Find a PCI capability */
430static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
431{
432 u8 pos;
433 int bytes;
434
435 if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
436 PCI_STATUS_CAP_LIST))
437 return 0;
438
439 pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
440 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
441 u8 id;
442
443 pos &= ~3;
444 id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
445 if (id == 0xff)
446 break;
447 if (id == cap)
448 return pos;
449
450 pos = read_pci_config_byte(num, slot, func,
451 pos+PCI_CAP_LIST_NEXT);
452 }
453 return 0;
454}
455
456static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
457{
458 u32 class;
459
460 class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
461 if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
462 return 0;
463
464 return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
465}
466
467static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
468{
469 u32 bus, slot, func;
470
471 for (bus = 0; bus < 256; bus++) {
472 for (slot = 0; slot < 32; slot++) {
473 for (func = 0; func < 8; func++) {
474 unsigned cap;
475
476 cap = __find_dbgp(bus, slot, func);
477
478 if (!cap)
479 continue;
480 if (ehci_num-- != 0)
481 continue;
482 *rbus = bus;
483 *rslot = slot;
484 *rfunc = func;
485 return cap;
486 }
487 }
488 }
489 return 0;
490}
491
492static int ehci_reset_port(int port)
493{
494 u32 portsc;
495 u32 delay_time, delay;
496 int loop;
497
498 /* Reset the usb debug port */
499 portsc = readl(&ehci_regs->port_status[port - 1]);
500 portsc &= ~PORT_PE;
501 portsc |= PORT_RESET;
502 writel(portsc, &ehci_regs->port_status[port - 1]);
503
504 delay = HUB_ROOT_RESET_TIME;
505 for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
506 delay_time += delay) {
507 dbgp_mdelay(delay);
508
509 portsc = readl(&ehci_regs->port_status[port - 1]);
510 if (portsc & PORT_RESET) {
511 /* force reset to complete */
512 loop = 2;
513 writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
514 &ehci_regs->port_status[port - 1]);
515 do {
516 portsc = readl(&ehci_regs->port_status[port-1]);
517 } while ((portsc & PORT_RESET) && (--loop > 0));
518 }
519
520 /* Device went away? */
521 if (!(portsc & PORT_CONNECT))
522 return -ENOTCONN;
523
524 /* bomb out completely if something weird happened */
525 if ((portsc & PORT_CSC))
526 return -EINVAL;
527
528 /* If we've finished resetting, then break out of the loop */
529 if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
530 return 0;
531 }
532 return -EBUSY;
533}
534
535static int ehci_wait_for_port(int port)
536{
537 u32 status;
538 int ret, reps;
539
540 for (reps = 0; reps < 3; reps++) {
541 dbgp_mdelay(100);
542 status = readl(&ehci_regs->status);
543 if (status & STS_PCD) {
544 ret = ehci_reset_port(port);
545 if (ret == 0)
546 return 0;
547 }
548 }
549 return -ENOTCONN;
550}
551
552#ifdef DBGP_DEBUG
553# define dbgp_printk early_printk
554#else
555static inline void dbgp_printk(const char *fmt, ...) { }
556#endif
557
558typedef void (*set_debug_port_t)(int port);
559
560static void default_set_debug_port(int port)
561{
562}
563
564static set_debug_port_t set_debug_port = default_set_debug_port;
565
566static void nvidia_set_debug_port(int port)
567{
568 u32 dword;
569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
570 0x74);
571 dword &= ~(0x0f<<12);
572 dword |= ((port & 0x0f)<<12);
573 write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
574 dword);
575 dbgp_printk("set debug port to %d\n", port);
576}
577
578static void __init detect_set_debug_port(void)
579{
580 u32 vendorid;
581
582 vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
583 0x00);
584
585 if ((vendorid & 0xffff) == 0x10de) {
586 dbgp_printk("using nvidia set_debug_port\n");
587 set_debug_port = nvidia_set_debug_port;
588 }
589}
590
591static int __init ehci_setup(void)
592{
593 struct usb_debug_descriptor dbgp_desc;
594 u32 cmd, ctrl, status, portsc, hcs_params;
595 u32 debug_port, new_debug_port = 0, n_ports;
596 u32 devnum;
597 int ret, i;
598 int loop;
599 int port_map_tried;
600 int playtimes = 3;
601
602try_next_time:
603 port_map_tried = 0;
604
605try_next_port:
606
607 hcs_params = readl(&ehci_caps->hcs_params);
608 debug_port = HCS_DEBUG_PORT(hcs_params);
609 n_ports = HCS_N_PORTS(hcs_params);
610
611 dbgp_printk("debug_port: %d\n", debug_port);
612 dbgp_printk("n_ports: %d\n", n_ports);
613
614 for (i = 1; i <= n_ports; i++) {
615 portsc = readl(&ehci_regs->port_status[i-1]);
616 dbgp_printk("portstatus%d: %08x\n", i, portsc);
617 }
618
619 if (port_map_tried && (new_debug_port != debug_port)) {
620 if (--playtimes) {
621 set_debug_port(new_debug_port);
622 goto try_next_time;
623 }
624 return -1;
625 }
626
627 loop = 10;
628 /* Reset the EHCI controller */
629 cmd = readl(&ehci_regs->command);
630 cmd |= CMD_RESET;
631 writel(cmd, &ehci_regs->command);
632 do {
633 cmd = readl(&ehci_regs->command);
634 } while ((cmd & CMD_RESET) && (--loop > 0));
635
636 if (!loop) {
637 dbgp_printk("can not reset ehci\n");
638 return -1;
639 }
640 dbgp_printk("ehci reset done\n");
641
642 /* Claim ownership, but do not enable yet */
643 ctrl = readl(&ehci_debug->control);
644 ctrl |= DBGP_OWNER;
645 ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
646 writel(ctrl, &ehci_debug->control);
647
648 /* Start the ehci running */
649 cmd = readl(&ehci_regs->command);
650 cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
651 cmd |= CMD_RUN;
652 writel(cmd, &ehci_regs->command);
653
654 /* Ensure everything is routed to the EHCI */
655 writel(FLAG_CF, &ehci_regs->configured_flag);
656
657 /* Wait until the controller is no longer halted */
658 loop = 10;
659 do {
660 status = readl(&ehci_regs->status);
661 } while ((status & STS_HALT) && (--loop > 0));
662
663 if (!loop) {
664 dbgp_printk("ehci can be started\n");
665 return -1;
666 }
667 dbgp_printk("ehci started\n");
668
669 /* Wait for a device to show up in the debug port */
670 ret = ehci_wait_for_port(debug_port);
671 if (ret < 0) {
672 dbgp_printk("No device found in debug port\n");
673 goto next_debug_port;
674 }
675 dbgp_printk("ehci wait for port done\n");
676
677 /* Enable the debug port */
678 ctrl = readl(&ehci_debug->control);
679 ctrl |= DBGP_CLAIM;
680 writel(ctrl, &ehci_debug->control);
681 ctrl = readl(&ehci_debug->control);
682 if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
683 dbgp_printk("No device in debug port\n");
684 writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
685 goto err;
686 }
687 dbgp_printk("debug ported enabled\n");
688
689 /* Completely transfer the debug device to the debug controller */
690 portsc = readl(&ehci_regs->port_status[debug_port - 1]);
691 portsc &= ~PORT_PE;
692 writel(portsc, &ehci_regs->port_status[debug_port - 1]);
693
694 dbgp_mdelay(100);
695
696 /* Find the debug device and make it device number 127 */
697 for (devnum = 0; devnum <= 127; devnum++) {
698 ret = dbgp_control_msg(devnum,
699 USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
700 USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
701 &dbgp_desc, sizeof(dbgp_desc));
702 if (ret > 0)
703 break;
704 }
705 if (devnum > 127) {
706 dbgp_printk("Could not find attached debug device\n");
707 goto err;
708 }
709 if (ret < 0) {
710 dbgp_printk("Attached device is not a debug device\n");
711 goto err;
712 }
713 dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
714
715 /* Move the device to 127 if it isn't already there */
716 if (devnum != USB_DEBUG_DEVNUM) {
717 ret = dbgp_control_msg(devnum,
718 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
719 USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
720 if (ret < 0) {
721 dbgp_printk("Could not move attached device to %d\n",
722 USB_DEBUG_DEVNUM);
723 goto err;
724 }
725 devnum = USB_DEBUG_DEVNUM;
726 dbgp_printk("debug device renamed to 127\n");
727 }
728
729 /* Enable the debug interface */
730 ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
731 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
732 USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
733 if (ret < 0) {
734 dbgp_printk(" Could not enable the debug device\n");
735 goto err;
736 }
737 dbgp_printk("debug interface enabled\n");
738
739 /* Perform a small write to get the even/odd data state in sync
740 */
741 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
742 if (ret < 0) {
743 dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
744 goto err;
745 }
746 dbgp_printk("small write doned\n");
747
748 return 0;
749err:
750 /* Things didn't work so remove my claim */
751 ctrl = readl(&ehci_debug->control);
752 ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
753 writel(ctrl, &ehci_debug->control);
754 return -1;
755
756next_debug_port:
757 port_map_tried |= (1<<(debug_port - 1));
758 new_debug_port = ((debug_port-1+1)%n_ports) + 1;
759 if (port_map_tried != ((1<<n_ports) - 1)) {
760 set_debug_port(new_debug_port);
761 goto try_next_port;
762 }
763 if (--playtimes) {
764 set_debug_port(new_debug_port);
765 goto try_next_time;
766 }
767
768 return -1;
769}
770
771static int __init early_dbgp_init(char *s)
772{
773 u32 debug_port, bar, offset;
774 u32 bus, slot, func, cap;
775 void __iomem *ehci_bar;
776 u32 dbgp_num;
777 u32 bar_val;
778 char *e;
779 int ret;
780 u8 byte;
781
782 if (!early_pci_allowed())
783 return -1;
784
785 dbgp_num = 0;
786 if (*s)
787 dbgp_num = simple_strtoul(s, &e, 10);
788 dbgp_printk("dbgp_num: %d\n", dbgp_num);
789
790 cap = find_dbgp(dbgp_num, &bus, &slot, &func);
791 if (!cap)
792 return -1;
793
794 dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
795 func);
796
797 debug_port = read_pci_config(bus, slot, func, cap);
798 bar = (debug_port >> 29) & 0x7;
799 bar = (bar * 4) + 0xc;
800 offset = (debug_port >> 16) & 0xfff;
801 dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
802 if (bar != PCI_BASE_ADDRESS_0) {
803 dbgp_printk("only debug ports on bar 1 handled.\n");
804
805 return -1;
806 }
807
808 bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
809 dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
810 if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
811 dbgp_printk("only simple 32bit mmio bars supported\n");
812
813 return -1;
814 }
815
816 /* double check if the mem space is enabled */
817 byte = read_pci_config_byte(bus, slot, func, 0x04);
818 if (!(byte & 0x2)) {
819 byte |= 0x02;
820 write_pci_config_byte(bus, slot, func, 0x04, byte);
821 dbgp_printk("mmio for ehci enabled\n");
822 }
823
824 /*
825 * FIXME I don't have the bar size so just guess PAGE_SIZE is more
826 * than enough. 1K is the biggest I have seen.
827 */
828 set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
829 ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
830 ehci_bar += bar_val & ~PAGE_MASK;
831 dbgp_printk("ehci_bar: %p\n", ehci_bar);
832
833 ehci_caps = ehci_bar;
834 ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
835 ehci_debug = ehci_bar + offset;
836 ehci_dev.bus = bus;
837 ehci_dev.slot = slot;
838 ehci_dev.func = func;
839
840 detect_set_debug_port();
841
842 ret = ehci_setup();
843 if (ret < 0) {
844 dbgp_printk("ehci_setup failed\n");
845 ehci_debug = NULL;
846
847 return -1;
848 }
849
850 return 0;
851}
852
853static void early_dbgp_write(struct console *con, const char *str, u32 n)
854{
855 int chunk, ret;
856
857 if (!ehci_debug)
858 return;
859 while (n > 0) {
860 chunk = n;
861 if (chunk > DBGP_MAX_PACKET)
862 chunk = DBGP_MAX_PACKET;
863 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
864 dbgp_endpoint_out, str, chunk);
865 str += chunk;
866 n -= chunk;
867 }
868}
869
870static struct console early_dbgp_console = {
871 .name = "earlydbg",
872 .write = early_dbgp_write,
873 .flags = CON_PRINTBUFFER,
874 .index = -1,
875};
876#endif
877
154/* Console interface to a host file on AMD's SimNow! */ 878/* Console interface to a host file on AMD's SimNow! */
155 879
156static int simnow_fd; 880static int simnow_fd;
@@ -165,6 +889,7 @@ enum {
165static noinline long simnow(long cmd, long a, long b, long c) 889static noinline long simnow(long cmd, long a, long b, long c)
166{ 890{
167 long ret; 891 long ret;
892
168 asm volatile("cpuid" : 893 asm volatile("cpuid" :
169 "=a" (ret) : 894 "=a" (ret) :
170 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); 895 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
@@ -174,6 +899,7 @@ static noinline long simnow(long cmd, long a, long b, long c)
174static void __init simnow_init(char *str) 899static void __init simnow_init(char *str)
175{ 900{
176 char *fn = "klog"; 901 char *fn = "klog";
902
177 if (*str == '=') 903 if (*str == '=')
178 fn = ++str; 904 fn = ++str;
179 /* error ignored */ 905 /* error ignored */
@@ -194,7 +920,7 @@ static struct console simnow_console = {
194 920
195/* Direct interface for emergencies */ 921/* Direct interface for emergencies */
196static struct console *early_console = &early_vga_console; 922static struct console *early_console = &early_vga_console;
197static int early_console_initialized; 923static int __initdata early_console_initialized;
198 924
199asmlinkage void early_printk(const char *fmt, ...) 925asmlinkage void early_printk(const char *fmt, ...)
200{ 926{
@@ -208,10 +934,11 @@ asmlinkage void early_printk(const char *fmt, ...)
208 va_end(ap); 934 va_end(ap);
209} 935}
210 936
211static int __initdata keep_early;
212 937
213static int __init setup_early_printk(char *buf) 938static int __init setup_early_printk(char *buf)
214{ 939{
940 int keep_early;
941
215 if (!buf) 942 if (!buf)
216 return 0; 943 return 0;
217 944
@@ -219,8 +946,7 @@ static int __init setup_early_printk(char *buf)
219 return 0; 946 return 0;
220 early_console_initialized = 1; 947 early_console_initialized = 1;
221 948
222 if (strstr(buf, "keep")) 949 keep_early = (strstr(buf, "keep") != NULL);
223 keep_early = 1;
224 950
225 if (!strncmp(buf, "serial", 6)) { 951 if (!strncmp(buf, "serial", 6)) {
226 early_serial_init(buf + 6); 952 early_serial_init(buf + 6);
@@ -238,6 +964,17 @@ static int __init setup_early_printk(char *buf)
238 simnow_init(buf + 6); 964 simnow_init(buf + 6);
239 early_console = &simnow_console; 965 early_console = &simnow_console;
240 keep_early = 1; 966 keep_early = 1;
967#ifdef CONFIG_EARLY_PRINTK_DBGP
968 } else if (!strncmp(buf, "dbgp", 4)) {
969 if (early_dbgp_init(buf+4) < 0)
970 return 0;
971 early_console = &early_dbgp_console;
972 /*
973 * usb subsys will reset ehci controller, so don't keep
974 * that early console
975 */
976 keep_early = 0;
977#endif
241#ifdef CONFIG_HVC_XEN 978#ifdef CONFIG_HVC_XEN
242 } else if (!strncmp(buf, "xen", 3)) { 979 } else if (!strncmp(buf, "xen", 3)) {
243 early_console = &xenboot_console; 980 early_console = &xenboot_console;
@@ -251,4 +988,5 @@ static int __init setup_early_printk(char *buf)
251 register_console(early_console); 988 register_console(early_console);
252 return 0; 989 return 0;
253} 990}
991
254early_param("earlyprintk", setup_early_printk); 992early_param("earlyprintk", setup_early_printk);
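As a usage sketch (not part of this patch), the boot-parameter forms that setup_early_printk() above accepts look like:

	earlyprintk=vga
	earlyprintk=serial,ttyS0,115200,keep
	earlyprintk=dbgp
	earlyprintk=xen

The "keep" token keeps the early console registered after the real console comes up; as the new dbgp branch notes, it is forced off there because the USB subsystem will reset the EHCI controller later in boot.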
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 06cc8d4254b1..1119d247fe11 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -367,6 +367,10 @@ void __init efi_init(void)
367 efi.smbios = config_tables[i].table; 367 efi.smbios = config_tables[i].table;
368 printk(" SMBIOS=0x%lx ", config_tables[i].table); 368 printk(" SMBIOS=0x%lx ", config_tables[i].table);
369 } else if (!efi_guidcmp(config_tables[i].guid, 369 } else if (!efi_guidcmp(config_tables[i].guid,
370 UV_SYSTEM_TABLE_GUID)) {
371 efi.uv_systab = config_tables[i].table;
372 printk(" UVsystab=0x%lx ", config_tables[i].table);
373 } else if (!efi_guidcmp(config_tables[i].guid,
370 HCDP_TABLE_GUID)) { 374 HCDP_TABLE_GUID)) {
371 efi.hcdp = config_tables[i].table; 375 efi.hcdp = config_tables[i].table;
372 printk(" HCDP=0x%lx ", config_tables[i].table); 376 printk(" HCDP=0x%lx ", config_tables[i].table);
@@ -414,9 +418,11 @@ void __init efi_init(void)
414 if (memmap.map == NULL) 418 if (memmap.map == NULL)
415 printk(KERN_ERR "Could not map the EFI memory map!\n"); 419 printk(KERN_ERR "Could not map the EFI memory map!\n");
416 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); 420 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
421
417 if (memmap.desc_size != sizeof(efi_memory_desc_t)) 422 if (memmap.desc_size != sizeof(efi_memory_desc_t))
418 printk(KERN_WARNING "Kernel-defined memdesc" 423 printk(KERN_WARNING
419 "doesn't match the one from EFI!\n"); 424 "Kernel-defined memdesc doesn't match the one from EFI!\n");
425
420 if (add_efi_memmap) 426 if (add_efi_memmap)
421 do_add_efi_memmap(); 427 do_add_efi_memmap();
422 428
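
The efi_init() hunk adds one more branch to the loop that walks the EFI configuration tables and compares each entry's GUID against known tables (SMBIOS, the new UV system table, HCDP). A self-contained sketch of that scan pattern, with the GUID type and table layout reduced to the minimum (the names below are illustrative, not the kernel's):

#include <string.h>

struct guid {
        unsigned char b[16];
};

struct config_table {
        struct guid guid;
        unsigned long table;    /* address of the vendor-specific table */
};

/* Return the address of the table whose GUID matches 'want', or 0 if the
 * firmware did not publish it. */
static unsigned long find_config_table(const struct config_table *tbl,
                                       int count, const struct guid *want)
{
        int i;

        for (i = 0; i < count; i++)
                if (!memcmp(&tbl[i].guid, want, sizeof(*want)))
                        return tbl[i].table;
        return 0;
}
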
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
index 4b63c8e1f13b..5cab48ee61a4 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/kernel/efi_32.c
@@ -53,7 +53,7 @@ void efi_call_phys_prelog(void)
53 * directory. If I have PAE, I just need to duplicate one entry in 53 * directory. If I have PAE, I just need to duplicate one entry in
54 * page directory. 54 * page directory.
55 */ 55 */
56 cr4 = read_cr4(); 56 cr4 = read_cr4_safe();
57 57
58 if (cr4 & X86_CR4_PAE) { 58 if (cr4 & X86_CR4_PAE) {
59 efi_bak_pg_dir_pointer[0].pgd = 59 efi_bak_pg_dir_pointer[0].pgd =
@@ -91,7 +91,7 @@ void efi_call_phys_epilog(void)
91 gdt_descr.size = GDT_SIZE - 1; 91 gdt_descr.size = GDT_SIZE - 1;
92 load_gdt(&gdt_descr); 92 load_gdt(&gdt_descr);
93 93
94 cr4 = read_cr4(); 94 cr4 = read_cr4_safe();
95 95
96 if (cr4 & X86_CR4_PAE) { 96 if (cr4 & X86_CR4_PAE) {
97 swapper_pg_dir[pgd_index(0)].pgd = 97 swapper_pg_dir[pgd_index(0)].pgd =
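
Both hunks in efi_32.c only change how CR4 is read (read_cr4_safe() instead of read_cr4()); the PAE test on the returned value is unchanged. For reference, a sketch of that test (X86_CR4_PAE is bit 5 of CR4):

#define X86_CR4_PAE (1UL << 5)

/* Decide whether the prelog/epilog code needs the PAE variant of the
 * page-directory save/restore. */
static int cr4_indicates_pae(unsigned long cr4)
{
        return (cr4 & X86_CR4_PAE) != 0;
}
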
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 6bc07f0f1202..dd65143941a8 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -54,6 +54,16 @@
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/irq_vectors.h> 55#include <asm/irq_vectors.h>
56 56
57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
58#include <linux/elf-em.h>
59#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
60#define __AUDIT_ARCH_LE 0x40000000
61
62#ifndef CONFIG_AUDITSYSCALL
63#define sysenter_audit syscall_trace_entry
64#define sysexit_audit syscall_exit_work
65#endif
66
57/* 67/*
58 * We use macros for low-level operations which need to be overridden 68 * We use macros for low-level operations which need to be overridden
59 * for paravirtualization. The following will never clobber any registers: 69 * for paravirtualization. The following will never clobber any registers:
@@ -332,8 +342,9 @@ sysenter_past_esp:
332 GET_THREAD_INFO(%ebp) 342 GET_THREAD_INFO(%ebp)
333 343
334 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 344 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
335 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 345 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
336 jnz syscall_trace_entry 346 jnz sysenter_audit
347sysenter_do_call:
337 cmpl $(nr_syscalls), %eax 348 cmpl $(nr_syscalls), %eax
338 jae syscall_badsys 349 jae syscall_badsys
339 call *sys_call_table(,%eax,4) 350 call *sys_call_table(,%eax,4)
@@ -343,7 +354,8 @@ sysenter_past_esp:
343 TRACE_IRQS_OFF 354 TRACE_IRQS_OFF
344 movl TI_flags(%ebp), %ecx 355 movl TI_flags(%ebp), %ecx
345 testw $_TIF_ALLWORK_MASK, %cx 356 testw $_TIF_ALLWORK_MASK, %cx
346 jne syscall_exit_work 357 jne sysexit_audit
358sysenter_exit:
347/* if something modifies registers it must also disable sysexit */ 359/* if something modifies registers it must also disable sysexit */
348 movl PT_EIP(%esp), %edx 360 movl PT_EIP(%esp), %edx
349 movl PT_OLDESP(%esp), %ecx 361 movl PT_OLDESP(%esp), %ecx
@@ -351,6 +363,45 @@ sysenter_past_esp:
351 TRACE_IRQS_ON 363 TRACE_IRQS_ON
3521: mov PT_FS(%esp), %fs 3641: mov PT_FS(%esp), %fs
353 ENABLE_INTERRUPTS_SYSEXIT 365 ENABLE_INTERRUPTS_SYSEXIT
366
367#ifdef CONFIG_AUDITSYSCALL
368sysenter_audit:
369 testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
370 jnz syscall_trace_entry
371 addl $4,%esp
372 CFI_ADJUST_CFA_OFFSET -4
373 /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
374 /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
375 /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
376 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
377 movl %eax,%edx /* 2nd arg: syscall number */
378 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
379 call audit_syscall_entry
380 pushl %ebx
381 CFI_ADJUST_CFA_OFFSET 4
382 movl PT_EAX(%esp),%eax /* reload syscall number */
383 jmp sysenter_do_call
384
385sysexit_audit:
386 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
387 jne syscall_exit_work
388 TRACE_IRQS_ON
389 ENABLE_INTERRUPTS(CLBR_ANY)
390 movl %eax,%edx /* second arg, syscall return value */
391 cmpl $0,%eax /* is it < 0? */
392 setl %al /* 1 if so, 0 if not */
393 movzbl %al,%eax /* zero-extend that */
394 inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
395 call audit_syscall_exit
396 DISABLE_INTERRUPTS(CLBR_ANY)
397 TRACE_IRQS_OFF
398 movl TI_flags(%ebp), %ecx
399 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
400 jne syscall_exit_work
401 movl PT_EAX(%esp),%eax /* reload syscall return value */
402 jmp sysenter_exit
403#endif
404
354 CFI_ENDPROC 405 CFI_ENDPROC
355.pushsection .fixup,"ax" 406.pushsection .fixup,"ax"
3562: movl $0,PT_FS(%esp) 4072: movl $0,PT_FS(%esp)
@@ -370,7 +421,7 @@ ENTRY(system_call)
370 GET_THREAD_INFO(%ebp) 421 GET_THREAD_INFO(%ebp)
371 # system call tracing in operation / emulation 422 # system call tracing in operation / emulation
372 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 423 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
373 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 424 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
374 jnz syscall_trace_entry 425 jnz syscall_trace_entry
375 cmpl $(nr_syscalls), %eax 426 cmpl $(nr_syscalls), %eax
376 jae syscall_badsys 427 jae syscall_badsys
@@ -383,10 +434,6 @@ syscall_exit:
383 # setting need_resched or sigpending 434 # setting need_resched or sigpending
384 # between sampling and the iret 435 # between sampling and the iret
385 TRACE_IRQS_OFF 436 TRACE_IRQS_OFF
386 testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
387 jz no_singlestep
388 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
389no_singlestep:
390 movl TI_flags(%ebp), %ecx 437 movl TI_flags(%ebp), %ecx
391 testw $_TIF_ALLWORK_MASK, %cx # current->work 438 testw $_TIF_ALLWORK_MASK, %cx # current->work
392 jne syscall_exit_work 439 jne syscall_exit_work
@@ -514,12 +561,8 @@ END(work_pending)
514syscall_trace_entry: 561syscall_trace_entry:
515 movl $-ENOSYS,PT_EAX(%esp) 562 movl $-ENOSYS,PT_EAX(%esp)
516 movl %esp, %eax 563 movl %esp, %eax
517 xorl %edx,%edx 564 call syscall_trace_enter
518 call do_syscall_trace 565 /* What it returned is what we'll actually use. */
519 cmpl $0, %eax
520 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
521 # so must skip actual syscall
522 movl PT_ORIG_EAX(%esp), %eax
523 cmpl $(nr_syscalls), %eax 566 cmpl $(nr_syscalls), %eax
524 jnae syscall_call 567 jnae syscall_call
525 jmp syscall_exit 568 jmp syscall_exit
@@ -528,14 +571,13 @@ END(syscall_trace_entry)
528 # perform syscall exit tracing 571 # perform syscall exit tracing
529 ALIGN 572 ALIGN
530syscall_exit_work: 573syscall_exit_work:
531 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl 574 testb $_TIF_WORK_SYSCALL_EXIT, %cl
532 jz work_pending 575 jz work_pending
533 TRACE_IRQS_ON 576 TRACE_IRQS_ON
534 ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call 577 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
535 # schedule() instead 578 # schedule() instead
536 movl %esp, %eax 579 movl %esp, %eax
537 movl $1, %edx 580 call syscall_trace_leave
538 call do_syscall_trace
539 jmp resume_userspace 581 jmp resume_userspace
540END(syscall_exit_work) 582END(syscall_exit_work)
541 CFI_ENDPROC 583 CFI_ENDPROC
@@ -587,7 +629,7 @@ ENTRY(interrupt)
587ENTRY(irq_entries_start) 629ENTRY(irq_entries_start)
588 RING0_INT_FRAME 630 RING0_INT_FRAME
589vector=0 631vector=0
590.rept NR_IRQS 632.rept NR_VECTORS
591 ALIGN 633 ALIGN
592 .if vector 634 .if vector
593 CFI_ADJUST_CFA_OFFSET -4 635 CFI_ADJUST_CFA_OFFSET -4
@@ -688,6 +730,7 @@ error_code:
688 movl $(__USER_DS), %ecx 730 movl $(__USER_DS), %ecx
689 movl %ecx, %ds 731 movl %ecx, %ds
690 movl %ecx, %es 732 movl %ecx, %es
733 TRACE_IRQS_OFF
691 movl %esp,%eax # pt_regs pointer 734 movl %esp,%eax # pt_regs pointer
692 call *%edi 735 call *%edi
693 jmp ret_from_exception 736 jmp ret_from_exception
@@ -718,20 +761,9 @@ ENTRY(device_not_available)
718 RING0_INT_FRAME 761 RING0_INT_FRAME
719 pushl $-1 # mark this as an int 762 pushl $-1 # mark this as an int
720 CFI_ADJUST_CFA_OFFSET 4 763 CFI_ADJUST_CFA_OFFSET 4
721 SAVE_ALL 764 pushl $do_device_not_available
722 GET_CR0_INTO_EAX
723 testl $0x4, %eax # EM (math emulation bit)
724 jne device_not_available_emulate
725 preempt_stop(CLBR_ANY)
726 call math_state_restore
727 jmp ret_from_exception
728device_not_available_emulate:
729 pushl $0 # temporary storage for ORIG_EIP
730 CFI_ADJUST_CFA_OFFSET 4 765 CFI_ADJUST_CFA_OFFSET 4
731 call math_emulate 766 jmp error_code
732 addl $4, %esp
733 CFI_ADJUST_CFA_OFFSET -4
734 jmp ret_from_exception
735 CFI_ENDPROC 767 CFI_ENDPROC
736END(device_not_available) 768END(device_not_available)
737 769
@@ -772,6 +804,7 @@ debug_stack_correct:
772 pushl $-1 # mark this as an int 804 pushl $-1 # mark this as an int
773 CFI_ADJUST_CFA_OFFSET 4 805 CFI_ADJUST_CFA_OFFSET 4
774 SAVE_ALL 806 SAVE_ALL
807 TRACE_IRQS_OFF
775 xorl %edx,%edx # error code 0 808 xorl %edx,%edx # error code 0
776 movl %esp,%eax # pt_regs pointer 809 movl %esp,%eax # pt_regs pointer
777 call do_debug 810 call do_debug
@@ -816,6 +849,7 @@ nmi_stack_correct:
816 pushl %eax 849 pushl %eax
817 CFI_ADJUST_CFA_OFFSET 4 850 CFI_ADJUST_CFA_OFFSET 4
818 SAVE_ALL 851 SAVE_ALL
852 TRACE_IRQS_OFF
819 xorl %edx,%edx # zero error code 853 xorl %edx,%edx # zero error code
820 movl %esp,%eax # pt_regs pointer 854 movl %esp,%eax # pt_regs pointer
821 call do_nmi 855 call do_nmi
@@ -856,6 +890,7 @@ nmi_espfix_stack:
856 pushl %eax 890 pushl %eax
857 CFI_ADJUST_CFA_OFFSET 4 891 CFI_ADJUST_CFA_OFFSET 4
858 SAVE_ALL 892 SAVE_ALL
893 TRACE_IRQS_OFF
859 FIXUP_ESPFIX_STACK # %eax == %esp 894 FIXUP_ESPFIX_STACK # %eax == %esp
860 xorl %edx,%edx # zero error code 895 xorl %edx,%edx # zero error code
861 call do_nmi 896 call do_nmi
@@ -886,6 +921,7 @@ KPROBE_ENTRY(int3)
886 pushl $-1 # mark this as an int 921 pushl $-1 # mark this as an int
887 CFI_ADJUST_CFA_OFFSET 4 922 CFI_ADJUST_CFA_OFFSET 4
888 SAVE_ALL 923 SAVE_ALL
924 TRACE_IRQS_OFF
889 xorl %edx,%edx # zero error code 925 xorl %edx,%edx # zero error code
890 movl %esp,%eax # pt_regs pointer 926 movl %esp,%eax # pt_regs pointer
891 call do_int3 927 call do_int3
@@ -1024,6 +1060,7 @@ ENDPROC(kernel_thread_helper)
1024ENTRY(xen_sysenter_target) 1060ENTRY(xen_sysenter_target)
1025 RING0_INT_FRAME 1061 RING0_INT_FRAME
1026 addl $5*4, %esp /* remove xen-provided frame */ 1062 addl $5*4, %esp /* remove xen-provided frame */
1063 CFI_ADJUST_CFA_OFFSET -5*4
1027 jmp sysenter_past_esp 1064 jmp sysenter_past_esp
1028 CFI_ENDPROC 1065 CFI_ENDPROC
1029 1066
@@ -1116,20 +1153,6 @@ ENDPROC(xen_failsafe_callback)
1116#ifdef CONFIG_DYNAMIC_FTRACE 1153#ifdef CONFIG_DYNAMIC_FTRACE
1117 1154
1118ENTRY(mcount) 1155ENTRY(mcount)
1119 pushl %eax
1120 pushl %ecx
1121 pushl %edx
1122 movl 0xc(%esp), %eax
1123 subl $MCOUNT_INSN_SIZE, %eax
1124
1125.globl mcount_call
1126mcount_call:
1127 call ftrace_stub
1128
1129 popl %edx
1130 popl %ecx
1131 popl %eax
1132
1133 ret 1156 ret
1134END(mcount) 1157END(mcount)
1135 1158
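
The sysexit_audit block computes audit_syscall_exit()'s first argument from the syscall return value with the cmpl/setl/movzbl/inc sequence. In C terms that reduces to the following (a sketch; the AUDITSC_* values match the comment in the hunk):

#define AUDITSC_SUCCESS 1
#define AUDITSC_FAILURE 2

/* Mirror of the setl/movzbl/inc sequence: a negative return value is
 * reported as a failure, everything else as a success. */
static int audit_result(long retval)
{
        return (retval < 0) ? AUDITSC_FAILURE : AUDITSC_SUCCESS;
}
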
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ae63e584c340..09e7145484c5 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -53,37 +53,17 @@
53#include <asm/paravirt.h> 53#include <asm/paravirt.h>
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55 55
56/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
57#include <linux/elf-em.h>
58#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
59#define __AUDIT_ARCH_64BIT 0x80000000
60#define __AUDIT_ARCH_LE 0x40000000
61
56 .code64 62 .code64
57 63
58#ifdef CONFIG_FTRACE 64#ifdef CONFIG_FTRACE
59#ifdef CONFIG_DYNAMIC_FTRACE 65#ifdef CONFIG_DYNAMIC_FTRACE
60ENTRY(mcount) 66ENTRY(mcount)
61
62 subq $0x38, %rsp
63 movq %rax, (%rsp)
64 movq %rcx, 8(%rsp)
65 movq %rdx, 16(%rsp)
66 movq %rsi, 24(%rsp)
67 movq %rdi, 32(%rsp)
68 movq %r8, 40(%rsp)
69 movq %r9, 48(%rsp)
70
71 movq 0x38(%rsp), %rdi
72 subq $MCOUNT_INSN_SIZE, %rdi
73
74.globl mcount_call
75mcount_call:
76 call ftrace_stub
77
78 movq 48(%rsp), %r9
79 movq 40(%rsp), %r8
80 movq 32(%rsp), %rdi
81 movq 24(%rsp), %rsi
82 movq 16(%rsp), %rdx
83 movq 8(%rsp), %rcx
84 movq (%rsp), %rax
85 addq $0x38, %rsp
86
87 retq 67 retq
88END(mcount) 68END(mcount)
89 69
@@ -269,9 +249,9 @@ ENTRY(native_usergs_sysret64)
269ENTRY(ret_from_fork) 249ENTRY(ret_from_fork)
270 CFI_DEFAULT_STACK 250 CFI_DEFAULT_STACK
271 push kernel_eflags(%rip) 251 push kernel_eflags(%rip)
272 CFI_ADJUST_CFA_OFFSET 4 252 CFI_ADJUST_CFA_OFFSET 8
273 popf # reset kernel eflags 253 popf # reset kernel eflags
274 CFI_ADJUST_CFA_OFFSET -4 254 CFI_ADJUST_CFA_OFFSET -8
275 call schedule_tail 255 call schedule_tail
276 GET_THREAD_INFO(%rcx) 256 GET_THREAD_INFO(%rcx)
277 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) 257 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
@@ -349,9 +329,9 @@ ENTRY(system_call_after_swapgs)
349 movq %rcx,RIP-ARGOFFSET(%rsp) 329 movq %rcx,RIP-ARGOFFSET(%rsp)
350 CFI_REL_OFFSET rip,RIP-ARGOFFSET 330 CFI_REL_OFFSET rip,RIP-ARGOFFSET
351 GET_THREAD_INFO(%rcx) 331 GET_THREAD_INFO(%rcx)
352 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 332 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
353 TI_flags(%rcx)
354 jnz tracesys 333 jnz tracesys
334system_call_fastpath:
355 cmpq $__NR_syscall_max,%rax 335 cmpq $__NR_syscall_max,%rax
356 ja badsys 336 ja badsys
357 movq %r10,%rcx 337 movq %r10,%rcx
@@ -403,16 +383,16 @@ sysret_careful:
403sysret_signal: 383sysret_signal:
404 TRACE_IRQS_ON 384 TRACE_IRQS_ON
405 ENABLE_INTERRUPTS(CLBR_NONE) 385 ENABLE_INTERRUPTS(CLBR_NONE)
406 testl $_TIF_DO_NOTIFY_MASK,%edx 386#ifdef CONFIG_AUDITSYSCALL
407 jz 1f 387 bt $TIF_SYSCALL_AUDIT,%edx
408 388 jc sysret_audit
409 /* Really a signal */ 389#endif
410 /* edx: work flags (arg3) */ 390 /* edx: work flags (arg3) */
411 leaq do_notify_resume(%rip),%rax 391 leaq do_notify_resume(%rip),%rax
412 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 392 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
413 xorl %esi,%esi # oldset -> arg2 393 xorl %esi,%esi # oldset -> arg2
414 call ptregscall_common 394 call ptregscall_common
4151: movl $_TIF_WORK_MASK,%edi 395 movl $_TIF_WORK_MASK,%edi
416 /* Use IRET because user could have changed frame. This 396 /* Use IRET because user could have changed frame. This
417 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ 397 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
418 DISABLE_INTERRUPTS(CLBR_NONE) 398 DISABLE_INTERRUPTS(CLBR_NONE)
@@ -423,14 +403,56 @@ badsys:
423 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 403 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
424 jmp ret_from_sys_call 404 jmp ret_from_sys_call
425 405
406#ifdef CONFIG_AUDITSYSCALL
407 /*
408 * Fast path for syscall audit without full syscall trace.
409 * We just call audit_syscall_entry() directly, and then
410 * jump back to the normal fast path.
411 */
412auditsys:
413 movq %r10,%r9 /* 6th arg: 4th syscall arg */
414 movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
415 movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
416 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
417 movq %rax,%rsi /* 2nd arg: syscall number */
418 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
419 call audit_syscall_entry
420 LOAD_ARGS 0 /* reload call-clobbered registers */
421 jmp system_call_fastpath
422
423 /*
424 * Return fast path for syscall audit. Call audit_syscall_exit()
425 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
426 * masked off.
427 */
428sysret_audit:
429 movq %rax,%rsi /* second arg, syscall return value */
430 cmpq $0,%rax /* is it < 0? */
431 setl %al /* 1 if so, 0 if not */
432 movzbl %al,%edi /* zero-extend that into %edi */
433 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
434 call audit_syscall_exit
435 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
436 jmp sysret_check
437#endif /* CONFIG_AUDITSYSCALL */
438
426 /* Do syscall tracing */ 439 /* Do syscall tracing */
427tracesys: 440tracesys:
441#ifdef CONFIG_AUDITSYSCALL
442 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
443 jz auditsys
444#endif
428 SAVE_REST 445 SAVE_REST
429 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 446 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
430 FIXUP_TOP_OF_STACK %rdi 447 FIXUP_TOP_OF_STACK %rdi
431 movq %rsp,%rdi 448 movq %rsp,%rdi
432 call syscall_trace_enter 449 call syscall_trace_enter
433 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ 450 /*
451 * Reload arg registers from stack in case ptrace changed them.
452 * We don't reload %rax because syscall_trace_enter() returned
453 * the value it wants us to use in the table lookup.
454 */
455 LOAD_ARGS ARGOFFSET, 1
434 RESTORE_REST 456 RESTORE_REST
435 cmpq $__NR_syscall_max,%rax 457 cmpq $__NR_syscall_max,%rax
436 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ 458 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
@@ -444,6 +466,7 @@ tracesys:
444 * Has correct top of stack, but partial stack frame. 466 * Has correct top of stack, but partial stack frame.
445 */ 467 */
446 .globl int_ret_from_sys_call 468 .globl int_ret_from_sys_call
469 .globl int_with_check
447int_ret_from_sys_call: 470int_ret_from_sys_call:
448 DISABLE_INTERRUPTS(CLBR_NONE) 471 DISABLE_INTERRUPTS(CLBR_NONE)
449 TRACE_IRQS_OFF 472 TRACE_IRQS_OFF
@@ -483,7 +506,7 @@ int_very_careful:
483 ENABLE_INTERRUPTS(CLBR_NONE) 506 ENABLE_INTERRUPTS(CLBR_NONE)
484 SAVE_REST 507 SAVE_REST
485 /* Check for syscall exit trace */ 508 /* Check for syscall exit trace */
486 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx 509 testl $_TIF_WORK_SYSCALL_EXIT,%edx
487 jz int_signal 510 jz int_signal
488 pushq %rdi 511 pushq %rdi
489 CFI_ADJUST_CFA_OFFSET 8 512 CFI_ADJUST_CFA_OFFSET 8
@@ -491,7 +514,7 @@ int_very_careful:
491 call syscall_trace_leave 514 call syscall_trace_leave
492 popq %rdi 515 popq %rdi
493 CFI_ADJUST_CFA_OFFSET -8 516 CFI_ADJUST_CFA_OFFSET -8
494 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi 517 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
495 jmp int_restore_rest 518 jmp int_restore_rest
496 519
497int_signal: 520int_signal:
@@ -618,6 +641,13 @@ END(stub_rt_sigreturn)
618 SAVE_ARGS 641 SAVE_ARGS
619 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler 642 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
620 pushq %rbp 643 pushq %rbp
644 /*
645 * Save rbp twice: One is for marking the stack frame, as usual, and the
646 * other, to fill pt_regs properly. This is because bx comes right
647 * before the last saved register in that structure, and not bp. If the
648 * base pointer were in the place bx is today, this would not be needed.
649 */
650 movq %rbp, -8(%rsp)
621 CFI_ADJUST_CFA_OFFSET 8 651 CFI_ADJUST_CFA_OFFSET 8
622 CFI_REL_OFFSET rbp, 0 652 CFI_REL_OFFSET rbp, 0
623 movq %rsp,%rbp 653 movq %rsp,%rbp
@@ -883,6 +913,9 @@ END(spurious_interrupt)
883 .if \ist 913 .if \ist
884 movq %gs:pda_data_offset, %rbp 914 movq %gs:pda_data_offset, %rbp
885 .endif 915 .endif
916 .if \irqtrace
917 TRACE_IRQS_OFF
918 .endif
886 movq %rsp,%rdi 919 movq %rsp,%rdi
887 movq ORIG_RAX(%rsp),%rsi 920 movq ORIG_RAX(%rsp),%rsi
888 movq $-1,ORIG_RAX(%rsp) 921 movq $-1,ORIG_RAX(%rsp)
@@ -1009,7 +1042,8 @@ KPROBE_ENTRY(error_entry)
1009 je error_kernelspace 1042 je error_kernelspace
1010error_swapgs: 1043error_swapgs:
1011 SWAPGS 1044 SWAPGS
1012error_sti: 1045error_sti:
1046 TRACE_IRQS_OFF
1013 movq %rdi,RDI(%rsp) 1047 movq %rdi,RDI(%rsp)
1014 CFI_REL_OFFSET rdi,RDI 1048 CFI_REL_OFFSET rdi,RDI
1015 movq %rsp,%rdi 1049 movq %rsp,%rdi
@@ -1183,12 +1217,13 @@ ENTRY(simd_coprocessor_error)
1183END(simd_coprocessor_error) 1217END(simd_coprocessor_error)
1184 1218
1185ENTRY(device_not_available) 1219ENTRY(device_not_available)
1186 zeroentry math_state_restore 1220 zeroentry do_device_not_available
1187END(device_not_available) 1221END(device_not_available)
1188 1222
1189 /* runs on exception stack */ 1223 /* runs on exception stack */
1190KPROBE_ENTRY(debug) 1224KPROBE_ENTRY(debug)
1191 INTR_FRAME 1225 INTR_FRAME
1226 PARAVIRT_ADJUST_EXCEPTION_FRAME
1192 pushq $0 1227 pushq $0
1193 CFI_ADJUST_CFA_OFFSET 8 1228 CFI_ADJUST_CFA_OFFSET 8
1194 paranoidentry do_debug, DEBUG_STACK 1229 paranoidentry do_debug, DEBUG_STACK
@@ -1198,6 +1233,7 @@ KPROBE_END(debug)
1198 /* runs on exception stack */ 1233 /* runs on exception stack */
1199KPROBE_ENTRY(nmi) 1234KPROBE_ENTRY(nmi)
1200 INTR_FRAME 1235 INTR_FRAME
1236 PARAVIRT_ADJUST_EXCEPTION_FRAME
1201 pushq $-1 1237 pushq $-1
1202 CFI_ADJUST_CFA_OFFSET 8 1238 CFI_ADJUST_CFA_OFFSET 8
1203 paranoidentry do_nmi, 0, 0 1239 paranoidentry do_nmi, 0, 0
@@ -1211,6 +1247,7 @@ KPROBE_END(nmi)
1211 1247
1212KPROBE_ENTRY(int3) 1248KPROBE_ENTRY(int3)
1213 INTR_FRAME 1249 INTR_FRAME
1250 PARAVIRT_ADJUST_EXCEPTION_FRAME
1214 pushq $0 1251 pushq $0
1215 CFI_ADJUST_CFA_OFFSET 8 1252 CFI_ADJUST_CFA_OFFSET 8
1216 paranoidentry do_int3, DEBUG_STACK 1253 paranoidentry do_int3, DEBUG_STACK
@@ -1237,6 +1274,7 @@ END(coprocessor_segment_overrun)
1237 /* runs on exception stack */ 1274 /* runs on exception stack */
1238ENTRY(double_fault) 1275ENTRY(double_fault)
1239 XCPT_FRAME 1276 XCPT_FRAME
1277 PARAVIRT_ADJUST_EXCEPTION_FRAME
1240 paranoidentry do_double_fault 1278 paranoidentry do_double_fault
1241 jmp paranoid_exit1 1279 jmp paranoid_exit1
1242 CFI_ENDPROC 1280 CFI_ENDPROC
@@ -1253,6 +1291,7 @@ END(segment_not_present)
1253 /* runs on exception stack */ 1291 /* runs on exception stack */
1254ENTRY(stack_segment) 1292ENTRY(stack_segment)
1255 XCPT_FRAME 1293 XCPT_FRAME
1294 PARAVIRT_ADJUST_EXCEPTION_FRAME
1256 paranoidentry do_stack_segment 1295 paranoidentry do_stack_segment
1257 jmp paranoid_exit1 1296 jmp paranoid_exit1
1258 CFI_ENDPROC 1297 CFI_ENDPROC
@@ -1278,6 +1317,7 @@ END(spurious_interrupt_bug)
1278 /* runs on exception stack */ 1317 /* runs on exception stack */
1279ENTRY(machine_check) 1318ENTRY(machine_check)
1280 INTR_FRAME 1319 INTR_FRAME
1320 PARAVIRT_ADJUST_EXCEPTION_FRAME
1281 pushq $0 1321 pushq $0
1282 CFI_ADJUST_CFA_OFFSET 8 1322 CFI_ADJUST_CFA_OFFSET 8
1283 paranoidentry do_machine_check 1323 paranoidentry do_machine_check
@@ -1312,3 +1352,103 @@ KPROBE_ENTRY(ignore_sysret)
1312 sysret 1352 sysret
1313 CFI_ENDPROC 1353 CFI_ENDPROC
1314ENDPROC(ignore_sysret) 1354ENDPROC(ignore_sysret)
1355
1356#ifdef CONFIG_XEN
1357ENTRY(xen_hypervisor_callback)
1358 zeroentry xen_do_hypervisor_callback
1359END(xen_hypervisor_callback)
1360
1361/*
1362# A note on the "critical region" in our callback handler.
1363# We want to avoid stacking callback handlers due to events occurring
1364# during handling of the last event. To do this, we keep events disabled
1365# until we've done all processing. HOWEVER, we must enable events before
1366# popping the stack frame (can't be done atomically) and so it would still
1367# be possible to get enough handler activations to overflow the stack.
1368# Although unlikely, bugs of that kind are hard to track down, so we'd
1369# like to avoid the possibility.
1370# So, on entry to the handler we detect whether we interrupted an
1371# existing activation in its critical region -- if so, we pop the current
1372# activation and restart the handler using the previous one.
1373*/
1374ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct pt_regs *)
1375 CFI_STARTPROC
1376/* Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will
1377 see the correct pointer to the pt_regs */
1378 movq %rdi, %rsp # we don't return, adjust the stack frame
1379 CFI_ENDPROC
1380 CFI_DEFAULT_STACK
138111: incl %gs:pda_irqcount
1382 movq %rsp,%rbp
1383 CFI_DEF_CFA_REGISTER rbp
1384 cmovzq %gs:pda_irqstackptr,%rsp
1385 pushq %rbp # backlink for old unwinder
1386 call xen_evtchn_do_upcall
1387 popq %rsp
1388 CFI_DEF_CFA_REGISTER rsp
1389 decl %gs:pda_irqcount
1390 jmp error_exit
1391 CFI_ENDPROC
1392END(do_hypervisor_callback)
1393
1394/*
1395# Hypervisor uses this for application faults while it executes.
1396# We get here for two reasons:
1397# 1. Fault while reloading DS, ES, FS or GS
1398# 2. Fault while executing IRET
1399# Category 1 we do not need to fix up as Xen has already reloaded all segment
1400# registers that could be reloaded and zeroed the others.
1401# Category 2 we fix up by killing the current process. We cannot use the
1402# normal Linux return path in this case because if we use the IRET hypercall
1403# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1404# We distinguish between categories by comparing each saved segment register
1405# with its current contents: any discrepancy means we are in category 1.
1406*/
1407ENTRY(xen_failsafe_callback)
1408 framesz = (RIP-0x30) /* workaround buggy gas */
1409 _frame framesz
1410 CFI_REL_OFFSET rcx, 0
1411 CFI_REL_OFFSET r11, 8
1412 movw %ds,%cx
1413 cmpw %cx,0x10(%rsp)
1414 CFI_REMEMBER_STATE
1415 jne 1f
1416 movw %es,%cx
1417 cmpw %cx,0x18(%rsp)
1418 jne 1f
1419 movw %fs,%cx
1420 cmpw %cx,0x20(%rsp)
1421 jne 1f
1422 movw %gs,%cx
1423 cmpw %cx,0x28(%rsp)
1424 jne 1f
1425 /* All segments match their saved values => Category 2 (Bad IRET). */
1426 movq (%rsp),%rcx
1427 CFI_RESTORE rcx
1428 movq 8(%rsp),%r11
1429 CFI_RESTORE r11
1430 addq $0x30,%rsp
1431 CFI_ADJUST_CFA_OFFSET -0x30
1432 pushq $0
1433 CFI_ADJUST_CFA_OFFSET 8
1434 pushq %r11
1435 CFI_ADJUST_CFA_OFFSET 8
1436 pushq %rcx
1437 CFI_ADJUST_CFA_OFFSET 8
1438 jmp general_protection
1439 CFI_RESTORE_STATE
14401: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1441 movq (%rsp),%rcx
1442 CFI_RESTORE rcx
1443 movq 8(%rsp),%r11
1444 CFI_RESTORE r11
1445 addq $0x30,%rsp
1446 CFI_ADJUST_CFA_OFFSET -0x30
1447 pushq $0
1448 CFI_ADJUST_CFA_OFFSET 8
1449 SAVE_ALL
1450 jmp error_exit
1451 CFI_ENDPROC
1452END(xen_failsafe_callback)
1453
1454#endif /* CONFIG_XEN */
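
The xen_failsafe_callback comment distinguishes two fault categories by comparing the segment selectors Xen saved on the frame with the live ones. Pulled out of the assembly into a small C model (the frame layout here is simplified for illustration and is not the real hypervisor frame):

#include <stdbool.h>

/* Saved vs. live ds/es/fs/gs selectors. */
struct seg_state {
        unsigned short ds, es, fs, gs;
};

/* All selectors match  -> category 2: the IRET itself faulted (bad IRET),
 *                         so the task is killed via general_protection.
 * Any selector differs -> category 1: a segment reload faulted and Xen has
 *                         already reloaded/zeroed it; simply retry. */
static bool fault_was_bad_iret(const struct seg_state *saved,
                               const struct seg_state *live)
{
        return saved->ds == live->ds && saved->es == live->es &&
               saved->fs == live->fs && saved->gs == live->gs;
}
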
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
new file mode 100644
index 000000000000..f454c78fcef6
--- /dev/null
+++ b/arch/x86/kernel/es7000_32.c
@@ -0,0 +1,363 @@
1/*
2 * Written by: Garry Forsgren, Unisys Corporation
3 * Natalie Protasevich, Unisys Corporation
4 * This file contains the code to configure and interface
5 * with Unisys ES7000 series hardware system manager.
6 *
7 * Copyright (c) 2003 Unisys Corporation. All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of version 2 of the GNU General Public License as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it would be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write the Free Software Foundation, Inc., 59
19 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
20 *
21 * Contact information: Unisys Corporation, Township Line & Union Meeting
22 * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
23 *
24 * http://www.unisys.com
25 */
26
27#include <linux/module.h>
28#include <linux/types.h>
29#include <linux/kernel.h>
30#include <linux/smp.h>
31#include <linux/string.h>
32#include <linux/spinlock.h>
33#include <linux/errno.h>
34#include <linux/notifier.h>
35#include <linux/reboot.h>
36#include <linux/init.h>
37#include <linux/acpi.h>
38#include <asm/io.h>
39#include <asm/nmi.h>
40#include <asm/smp.h>
41#include <asm/apicdef.h>
42#include <mach_mpparse.h>
43
44/*
45 * ES7000 chipsets
46 */
47
48#define NON_UNISYS 0
49#define ES7000_CLASSIC 1
50#define ES7000_ZORRO 2
51
52
53#define MIP_REG 1
54#define MIP_PSAI_REG 4
55
56#define MIP_BUSY 1
57#define MIP_SPIN 0xf0000
58#define MIP_VALID 0x0100000000000000ULL
59#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
60
61#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
62
63struct mip_reg_info {
64 unsigned long long mip_info;
65 unsigned long long delivery_info;
66 unsigned long long host_reg;
67 unsigned long long mip_reg;
68};
69
70struct part_info {
71 unsigned char type;
72 unsigned char length;
73 unsigned char part_id;
74 unsigned char apic_mode;
75 unsigned long snum;
76 char ptype[16];
77 char sname[64];
78 char pname[64];
79};
80
81struct psai {
82 unsigned long long entry_type;
83 unsigned long long addr;
84 unsigned long long bep_addr;
85};
86
87struct es7000_mem_info {
88 unsigned char type;
89 unsigned char length;
90 unsigned char resv[6];
91 unsigned long long start;
92 unsigned long long size;
93};
94
95struct es7000_oem_table {
96 unsigned long long hdr;
97 struct mip_reg_info mip;
98 struct part_info pif;
99 struct es7000_mem_info shm;
100 struct psai psai;
101};
102
103#ifdef CONFIG_ACPI
104
105struct oem_table {
106 struct acpi_table_header Header;
107 u32 OEMTableAddr;
108 u32 OEMTableSize;
109};
110
111extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
112extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
113#endif
114
115struct mip_reg {
116 unsigned long long off_0;
117 unsigned long long off_8;
118 unsigned long long off_10;
119 unsigned long long off_18;
120 unsigned long long off_20;
121 unsigned long long off_28;
122 unsigned long long off_30;
123 unsigned long long off_38;
124};
125
126#define MIP_SW_APIC 0x1020b
127#define MIP_FUNC(VALUE) (VALUE & 0xff)
128
129/*
130 * ES7000 Globals
131 */
132
133static volatile unsigned long *psai = NULL;
134static struct mip_reg *mip_reg;
135static struct mip_reg *host_reg;
136static int mip_port;
137static unsigned long mip_addr, host_addr;
138
139int es7000_plat;
140
141/*
142 * GSI override for ES7000 platforms.
143 */
144
145static unsigned int base;
146
147static int
148es7000_rename_gsi(int ioapic, int gsi)
149{
150 if (es7000_plat == ES7000_ZORRO)
151 return gsi;
152
153 if (!base) {
154 int i;
155 for (i = 0; i < nr_ioapics; i++)
156 base += nr_ioapic_registers[i];
157 }
158
159 if (!ioapic && (gsi < 16))
160 gsi += base;
161 return gsi;
162}
163
164void __init
165setup_unisys(void)
166{
167 /*
168 * Determine the generation of the ES7000 currently running.
169 *
170 * es7000_plat = 1 if the machine is a 5xx ES7000 box
171 * es7000_plat = 2 if the machine is an x86_64 ES7000 box
172 *
173 */
174 if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
175 es7000_plat = ES7000_ZORRO;
176 else
177 es7000_plat = ES7000_CLASSIC;
178 ioapic_renumber_irq = es7000_rename_gsi;
179}
180
181/*
182 * Parse the OEM Table
183 */
184
185int __init
186parse_unisys_oem (char *oemptr)
187{
188 int i;
189 int success = 0;
190 unsigned char type, size;
191 unsigned long val;
192 char *tp = NULL;
193 struct psai *psaip = NULL;
194 struct mip_reg_info *mi;
195 struct mip_reg *host, *mip;
196
197 tp = oemptr;
198
199 tp += 8;
200
201 for (i=0; i <= 6; i++) {
202 type = *tp++;
203 size = *tp++;
204 tp -= 2;
205 switch (type) {
206 case MIP_REG:
207 mi = (struct mip_reg_info *)tp;
208 val = MIP_RD_LO(mi->host_reg);
209 host_addr = val;
210 host = (struct mip_reg *)val;
211 host_reg = __va(host);
212 val = MIP_RD_LO(mi->mip_reg);
213 mip_port = MIP_PORT(mi->mip_info);
214 mip_addr = val;
215 mip = (struct mip_reg *)val;
216 mip_reg = __va(mip);
217 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
218 (unsigned long)host_reg);
219 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
220 (unsigned long)mip_reg);
221 success++;
222 break;
223 case MIP_PSAI_REG:
224 psaip = (struct psai *)tp;
225 if (tp != NULL) {
226 if (psaip->addr)
227 psai = __va(psaip->addr);
228 else
229 psai = NULL;
230 success++;
231 }
232 break;
233 default:
234 break;
235 }
236 tp += size;
237 }
238
239 if (success < 2) {
240 es7000_plat = NON_UNISYS;
241 } else
242 setup_unisys();
243 return es7000_plat;
244}
245
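
parse_unisys_oem() above walks the OEM table as a sequence of (type, length) records, dispatching on the type byte and advancing by the stored size. A stripped-down sketch of that record walk (struct and function names here are illustrative only; as in the hunk, the stored length is taken to cover the header bytes as well):

struct record_hdr {
        unsigned char type;
        unsigned char length;
};

/* Walk 'nrecords' type/length-prefixed records and count the ones whose
 * type byte matches, advancing by each record's declared size. */
static int count_known_records(const unsigned char *p, int nrecords,
                               unsigned char wanted_type)
{
        int i, found = 0;

        for (i = 0; i < nrecords; i++) {
                const struct record_hdr *hdr = (const struct record_hdr *)p;

                if (hdr->type == wanted_type)
                        found++;
                p += hdr->length;       /* length includes the header */
        }
        return found;
}
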
246#ifdef CONFIG_ACPI
247static unsigned long oem_addrX;
248static unsigned long oem_size;
249int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
250{
251 struct acpi_table_header *header = NULL;
252 int i = 0;
253 acpi_size tbl_size;
254
255 while (ACPI_SUCCESS(acpi_get_table_with_size("OEM1", i++, &header, &tbl_size))) {
256 if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
257 struct oem_table *t = (struct oem_table *)header;
258
259 oem_addrX = t->OEMTableAddr;
260 oem_size = t->OEMTableSize;
261 early_acpi_os_unmap_memory(header, tbl_size);
262
263 *oem_addr = (unsigned long)__acpi_map_table(oem_addrX,
264 oem_size);
265 return 0;
266 }
267 early_acpi_os_unmap_memory(header, tbl_size);
268 }
269 return -1;
270}
271
272void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
273{
274 if (!oem_addr)
275 return;
276
277 __acpi_unmap_table((char *)oem_addr, oem_size);
278}
279#endif
280
281static void
282es7000_spin(int n)
283{
284 int i = 0;
285
286 while (i++ < n)
287 rep_nop();
288}
289
290static int __init
291es7000_mip_write(struct mip_reg *mip_reg)
292{
293 int status = 0;
294 int spin;
295
296 spin = MIP_SPIN;
297 while (((unsigned long long)host_reg->off_38 &
298 (unsigned long long)MIP_VALID) != 0) {
299 if (--spin <= 0) {
300 printk("es7000_mip_write: Timeout waiting for Host Valid Flag");
301 return -1;
302 }
303 es7000_spin(MIP_SPIN);
304 }
305
306 memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
307 outb(1, mip_port);
308
309 spin = MIP_SPIN;
310
311 while (((unsigned long long)mip_reg->off_38 &
312 (unsigned long long)MIP_VALID) == 0) {
313 if (--spin <= 0) {
314 printk("es7000_mip_write: Timeout waiting for MIP Valid Flag");
315 return -1;
316 }
317 es7000_spin(MIP_SPIN);
318 }
319
320 status = ((unsigned long long)mip_reg->off_0 &
321 (unsigned long long)0xffff0000000000ULL) >> 48;
322 mip_reg->off_38 = ((unsigned long long)mip_reg->off_38 &
323 (unsigned long long)~MIP_VALID);
324 return status;
325}
326
327int
328es7000_start_cpu(int cpu, unsigned long eip)
329{
330 unsigned long vect = 0, psaival = 0;
331
332 if (psai == NULL)
333 return -1;
334
335 vect = ((unsigned long)__pa(eip)/0x1000) << 16;
336 psaival = (0x1000000 | vect | cpu);
337
338 while (*psai & 0x1000000)
339 ;
340
341 *psai = psaival;
342
343 return 0;
344
345}
346
347void __init
348es7000_sw_apic(void)
349{
350 if (es7000_plat) {
351 int mip_status;
352 struct mip_reg es7000_mip_reg;
353
354 printk("ES7000: Enabling APIC mode.\n");
355 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
356 es7000_mip_reg.off_0 = MIP_SW_APIC;
357 es7000_mip_reg.off_38 = (MIP_VALID);
358 while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
359 printk("es7000_sw_apic: command failed, status = %x\n",
360 mip_status);
361 return;
362 }
363}
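
es7000_mip_write() is a small mailbox protocol: wait for the host slot's valid flag to clear, copy the request in, ring the doorbell port, then poll for the response's valid flag, all with a bounded spin. The helper below models that bounded wait (a sketch; the register type and flag value are placeholders, and the es7000_spin() backoff is omitted):

/* Spin until (*reg & flag) reaches the wanted state, giving up after
 * 'spins' iterations.  Returns 0 on success, -1 on timeout. */
static int mip_poll_flag(volatile unsigned long long *reg,
                         unsigned long long flag, int wanted_set, int spins)
{
        while (((*reg & flag) != 0) != (wanted_set != 0)) {
                if (--spins <= 0)
                        return -1;
        }
        return 0;
}

The write path then amounts to: mip_poll_flag(&host_reg->off_38, MIP_VALID, 0, MIP_SPIN), memcpy() the request into the host slot, outb(1, mip_port), then mip_poll_flag(&mip_reg->off_38, MIP_VALID, 1, MIP_SPIN) before extracting the status.
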
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ab115cd15fdf..d073d981a730 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -11,17 +11,18 @@
11 11
12#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/hardirq.h> 13#include <linux/hardirq.h>
14#include <linux/uaccess.h>
14#include <linux/ftrace.h> 15#include <linux/ftrace.h>
15#include <linux/percpu.h> 16#include <linux/percpu.h>
16#include <linux/init.h> 17#include <linux/init.h>
17#include <linux/list.h> 18#include <linux/list.h>
18 19
19#include <asm/alternative.h>
20#include <asm/ftrace.h> 20#include <asm/ftrace.h>
21#include <asm/nops.h>
21 22
22 23
23/* Long is fine, even if it is only 4 bytes ;-) */ 24/* Long is fine, even if it is only 4 bytes ;-) */
24static long *ftrace_nop; 25static unsigned long *ftrace_nop;
25 26
26union ftrace_code_union { 27union ftrace_code_union {
27 char code[MCOUNT_INSN_SIZE]; 28 char code[MCOUNT_INSN_SIZE];
@@ -60,11 +61,7 @@ notrace int
60ftrace_modify_code(unsigned long ip, unsigned char *old_code, 61ftrace_modify_code(unsigned long ip, unsigned char *old_code,
61 unsigned char *new_code) 62 unsigned char *new_code)
62{ 63{
63 unsigned replaced; 64 unsigned char replaced[MCOUNT_INSN_SIZE];
64 unsigned old = *(unsigned *)old_code; /* 4 bytes */
65 unsigned new = *(unsigned *)new_code; /* 4 bytes */
66 unsigned char newch = new_code[4];
67 int faulted = 0;
68 65
69 /* 66 /*
70 * Note: Due to modules and __init, code can 67 * Note: Due to modules and __init, code can
@@ -72,29 +69,20 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
72 * as well as code changing. 69 * as well as code changing.
73 * 70 *
74 * No real locking needed, this code is run through 71 * No real locking needed, this code is run through
75 * kstop_machine. 72 * kstop_machine, or before SMP starts.
76 */ 73 */
77 asm volatile ( 74 if (__copy_from_user_inatomic(replaced, (char __user *)ip, MCOUNT_INSN_SIZE))
78 "1: lock\n" 75 return 1;
79 " cmpxchg %3, (%2)\n" 76
80 " jnz 2f\n" 77 if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
81 " movb %b4, 4(%2)\n" 78 return 2;
82 "2:\n"
83 ".section .fixup, \"ax\"\n"
84 "3: movl $1, %0\n"
85 " jmp 2b\n"
86 ".previous\n"
87 _ASM_EXTABLE(1b, 3b)
88 : "=r"(faulted), "=a"(replaced)
89 : "r"(ip), "r"(new), "c"(newch),
90 "0"(faulted), "a"(old)
91 : "memory");
92 sync_core();
93 79
94 if (replaced != old && replaced != new) 80 WARN_ON_ONCE(__copy_to_user_inatomic((char __user *)ip, new_code,
95 faulted = 2; 81 MCOUNT_INSN_SIZE));
96 82
97 return faulted; 83 sync_core();
84
85 return 0;
98} 86}
99 87
100notrace int ftrace_update_ftrace_func(ftrace_func_t func) 88notrace int ftrace_update_ftrace_func(ftrace_func_t func)
@@ -112,30 +100,76 @@ notrace int ftrace_update_ftrace_func(ftrace_func_t func)
112 100
113notrace int ftrace_mcount_set(unsigned long *data) 101notrace int ftrace_mcount_set(unsigned long *data)
114{ 102{
115 unsigned long ip = (long)(&mcount_call); 103 /* mcount is initialized as a nop */
116 unsigned long *addr = data; 104 *data = 0;
117 unsigned char old[MCOUNT_INSN_SIZE], *new;
118
119 /*
120 * Replace the mcount stub with a pointer to the
121 * ip recorder function.
122 */
123 memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
124 new = ftrace_call_replace(ip, *addr);
125 *addr = ftrace_modify_code(ip, old, new);
126
127 return 0; 105 return 0;
128} 106}
129 107
130int __init ftrace_dyn_arch_init(void *data) 108int __init ftrace_dyn_arch_init(void *data)
131{ 109{
132 const unsigned char *const *noptable = find_nop_table(); 110 extern const unsigned char ftrace_test_p6nop[];
133 111 extern const unsigned char ftrace_test_nop5[];
134 /* This is running in kstop_machine */ 112 extern const unsigned char ftrace_test_jmp[];
135 113 int faulted = 0;
136 ftrace_mcount_set(data);
137 114
138 ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE]; 115 /*
116 * There is no good nop for all x86 archs.
117 * We will default to using the P6_NOP5, but first we
118 * will test to make sure that the nop will actually
119 * work on this CPU. If it faults, we will then
 120 * go to a less efficient 5 byte nop. If that fails
 121 * we then just use a jmp as our nop. This isn't the most
 122 * efficient nop, but we cannot use a multi-part nop
123 * since we would then risk being preempted in the middle
124 * of that nop, and if we enabled tracing then, it might
125 * cause a system crash.
126 *
127 * TODO: check the cpuid to determine the best nop.
128 */
129 asm volatile (
130 "jmp ftrace_test_jmp\n"
131 /* This code needs to stay around */
132 ".section .text, \"ax\"\n"
133 "ftrace_test_jmp:"
134 "jmp ftrace_test_p6nop\n"
135 "nop\n"
136 "nop\n"
137 "nop\n" /* 2 byte jmp + 3 bytes */
138 "ftrace_test_p6nop:"
139 P6_NOP5
140 "jmp 1f\n"
141 "ftrace_test_nop5:"
142 ".byte 0x66,0x66,0x66,0x66,0x90\n"
143 "jmp 1f\n"
144 ".previous\n"
145 "1:"
146 ".section .fixup, \"ax\"\n"
147 "2: movl $1, %0\n"
148 " jmp ftrace_test_nop5\n"
149 "3: movl $2, %0\n"
150 " jmp 1b\n"
151 ".previous\n"
152 _ASM_EXTABLE(ftrace_test_p6nop, 2b)
153 _ASM_EXTABLE(ftrace_test_nop5, 3b)
154 : "=r"(faulted) : "0" (faulted));
155
156 switch (faulted) {
157 case 0:
158 pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n");
159 ftrace_nop = (unsigned long *)ftrace_test_p6nop;
160 break;
161 case 1:
162 pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n");
163 ftrace_nop = (unsigned long *)ftrace_test_nop5;
164 break;
165 case 2:
166 pr_info("ftrace: converting mcount calls to jmp . + 5\n");
167 ftrace_nop = (unsigned long *)ftrace_test_jmp;
168 break;
169 }
170
 171 /* The return code is returned via data */
172 *(unsigned long *)data = 0;
139 173
140 return 0; 174 return 0;
141} 175}
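
ftrace_modify_code() now reads the five bytes at the call site, verifies they still match the expected old instruction, and only then writes the replacement, instead of the old cmpxchg-based patching. A user-space model of that compare-then-patch flow (the kernel version uses __copy_{from,to}_user_inatomic() for fault handling and sync_core() afterwards, both omitted here):

#include <string.h>

#define MCOUNT_INSN_SIZE 5      /* size of the call/nop being patched */

static int modify_code(unsigned char *ip, const unsigned char *old_code,
                       const unsigned char *new_code)
{
        unsigned char replaced[MCOUNT_INSN_SIZE];

        memcpy(replaced, ip, MCOUNT_INSN_SIZE);         /* read what is there now */
        if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE))
                return 2;                               /* site changed under us */
        memcpy(ip, new_code, MCOUNT_INSN_SIZE);         /* install the new bytes */
        return 0;
}
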
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 1fa8be5bd217..6c9bfc9e1e95 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -16,86 +16,63 @@
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h> 18#include <linux/hardirq.h>
19#include <linux/dmar.h>
19 20
20#include <asm/smp.h> 21#include <asm/smp.h>
21#include <asm/ipi.h> 22#include <asm/ipi.h>
22#include <asm/genapic.h> 23#include <asm/genapic.h>
23 24
24#ifdef CONFIG_ACPI 25extern struct genapic apic_flat;
25#include <acpi/acpi_bus.h> 26extern struct genapic apic_physflat;
 26#endif 27extern struct genapic apic_x2apic_uv_x;
27 28extern struct genapic apic_x2apic_phys;
28DEFINE_PER_CPU(int, x2apic_extra_bits); 29extern struct genapic apic_x2apic_cluster;
29 30
30struct genapic __read_mostly *genapic = &apic_flat; 31struct genapic __read_mostly *genapic = &apic_flat;
31 32
32static enum uv_system_type uv_system_type; 33static struct genapic *apic_probe[] __initdata = {
34 &apic_x2apic_uv_x,
35 &apic_x2apic_phys,
36 &apic_x2apic_cluster,
37 &apic_physflat,
38 NULL,
39};
33 40
34/* 41/*
35 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. 42 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
36 */ 43 */
37void __init setup_apic_routing(void) 44void __init setup_apic_routing(void)
38{ 45{
39 if (uv_system_type == UV_NON_UNIQUE_APIC) 46 if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) {
40 genapic = &apic_x2apic_uv_x; 47 if (!intr_remapping_enabled)
41 else 48 genapic = &apic_flat;
42#ifdef CONFIG_ACPI 49 }
43 /*
44 * Quirk: some x86_64 machines can only use physical APIC mode
45 * regardless of how many processors are present (x86_64 ES7000
46 * is an example).
47 */
48 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
49 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
50 genapic = &apic_physflat;
51 else
52#endif
53
54 if (max_physical_apicid < 8)
55 genapic = &apic_flat;
56 else
57 genapic = &apic_physflat;
58 50
59 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); 51 if (genapic == &apic_flat) {
52 if (max_physical_apicid >= 8)
53 genapic = &apic_physflat;
54 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
55 }
60} 56}
61 57
62/* Same for both flat and physical. */ 58/* Same for both flat and physical. */
63 59
64void send_IPI_self(int vector) 60void apic_send_IPI_self(int vector)
65{ 61{
66 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); 62 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
67} 63}
68 64
69int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) 65int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
70{ 66{
71 if (!strcmp(oem_id, "SGI")) { 67 int i;
72 if (!strcmp(oem_table_id, "UVL")) 68
73 uv_system_type = UV_LEGACY_APIC; 69 for (i = 0; apic_probe[i]; ++i) {
74 else if (!strcmp(oem_table_id, "UVX")) 70 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
75 uv_system_type = UV_X2APIC; 71 genapic = apic_probe[i];
76 else if (!strcmp(oem_table_id, "UVH")) 72 printk(KERN_INFO "Setting APIC routing to %s.\n",
77 uv_system_type = UV_NON_UNIQUE_APIC; 73 genapic->name);
74 return 1;
75 }
78 } 76 }
79 return 0; 77 return 0;
80} 78}
81
82unsigned int read_apic_id(void)
83{
84 unsigned int id;
85
86 WARN_ON(preemptible() && num_online_cpus() > 1);
87 id = apic_read(APIC_ID);
88 if (uv_system_type >= UV_X2APIC)
89 id |= __get_cpu_var(x2apic_extra_bits);
90 return id;
91}
92
93enum uv_system_type get_uv_system_type(void)
94{
95 return uv_system_type;
96}
97
98int is_uv_system(void)
99{
100 return uv_system_type != UV_NONE;
101}
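
genapic_64.c replaces the open-coded UV/ACPI checks with a NULL-terminated probe table: each driver supplies an acpi_madt_oem_check() hook and the first one that claims the MADT wins. The pattern, reduced to plain C (the struct is trimmed to what the loop needs and is not the real struct genapic):

#include <stddef.h>

struct apic_driver {
        const char *name;
        int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
};

/* Return the first driver in the NULL-terminated table that accepts the
 * OEM ids, or NULL so the caller keeps its default. */
static struct apic_driver *probe_apic(struct apic_driver **table,
                                      char *oem_id, char *oem_table_id)
{
        int i;

        for (i = 0; table[i]; i++)
                if (table[i]->acpi_madt_oem_check(oem_id, oem_table_id))
                        return table[i];
        return NULL;
}
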
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 1a9c68845ee8..c0262791bda4 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -15,9 +15,20 @@
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h>
18#include <asm/smp.h> 19#include <asm/smp.h>
19#include <asm/ipi.h> 20#include <asm/ipi.h>
20#include <asm/genapic.h> 21#include <asm/genapic.h>
22#include <mach_apicdef.h>
23
24#ifdef CONFIG_ACPI
25#include <acpi/acpi_bus.h>
26#endif
27
28static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
29{
30 return 1;
31}
21 32
22static cpumask_t flat_target_cpus(void) 33static cpumask_t flat_target_cpus(void)
23{ 34{
@@ -95,9 +106,33 @@ static void flat_send_IPI_all(int vector)
95 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); 106 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
96} 107}
97 108
109static unsigned int get_apic_id(unsigned long x)
110{
111 unsigned int id;
112
113 id = (((x)>>24) & 0xFFu);
114 return id;
115}
116
117static unsigned long set_apic_id(unsigned int id)
118{
119 unsigned long x;
120
121 x = ((id & 0xFFu)<<24);
122 return x;
123}
124
125static unsigned int read_xapic_id(void)
126{
127 unsigned int id;
128
129 id = get_apic_id(apic_read(APIC_ID));
130 return id;
131}
132
98static int flat_apic_id_registered(void) 133static int flat_apic_id_registered(void)
99{ 134{
100 return physid_isset(GET_APIC_ID(read_apic_id()), phys_cpu_present_map); 135 return physid_isset(read_xapic_id(), phys_cpu_present_map);
101} 136}
102 137
103static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) 138static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
@@ -112,6 +147,7 @@ static unsigned int phys_pkg_id(int index_msb)
112 147
113struct genapic apic_flat = { 148struct genapic apic_flat = {
114 .name = "flat", 149 .name = "flat",
150 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
115 .int_delivery_mode = dest_LowestPrio, 151 .int_delivery_mode = dest_LowestPrio,
116 .int_dest_mode = (APIC_DEST_LOGICAL != 0), 152 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
117 .target_cpus = flat_target_cpus, 153 .target_cpus = flat_target_cpus,
@@ -121,8 +157,12 @@ struct genapic apic_flat = {
121 .send_IPI_all = flat_send_IPI_all, 157 .send_IPI_all = flat_send_IPI_all,
122 .send_IPI_allbutself = flat_send_IPI_allbutself, 158 .send_IPI_allbutself = flat_send_IPI_allbutself,
123 .send_IPI_mask = flat_send_IPI_mask, 159 .send_IPI_mask = flat_send_IPI_mask,
160 .send_IPI_self = apic_send_IPI_self,
124 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, 161 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
125 .phys_pkg_id = phys_pkg_id, 162 .phys_pkg_id = phys_pkg_id,
163 .get_apic_id = get_apic_id,
164 .set_apic_id = set_apic_id,
165 .apic_id_mask = (0xFFu<<24),
126}; 166};
127 167
128/* 168/*
@@ -130,6 +170,23 @@ struct genapic apic_flat = {
130 * We cannot use logical delivery in this case because the mask 170 * We cannot use logical delivery in this case because the mask
131 * overflows, so use physical mode. 171 * overflows, so use physical mode.
132 */ 172 */
173static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
174{
175#ifdef CONFIG_ACPI
176 /*
177 * Quirk: some x86_64 machines can only use physical APIC mode
178 * regardless of how many processors are present (x86_64 ES7000
179 * is an example).
180 */
181 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
182 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
183 printk(KERN_DEBUG "system APIC only can use physical flat");
184 return 1;
185 }
186#endif
187
188 return 0;
189}
133 190
134static cpumask_t physflat_target_cpus(void) 191static cpumask_t physflat_target_cpus(void)
135{ 192{
@@ -168,7 +225,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
168 * May as well be the first. 225 * May as well be the first.
169 */ 226 */
170 cpu = first_cpu(cpumask); 227 cpu = first_cpu(cpumask);
171 if ((unsigned)cpu < NR_CPUS) 228 if ((unsigned)cpu < nr_cpu_ids)
172 return per_cpu(x86_cpu_to_apicid, cpu); 229 return per_cpu(x86_cpu_to_apicid, cpu);
173 else 230 else
174 return BAD_APICID; 231 return BAD_APICID;
@@ -176,6 +233,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
176 233
177struct genapic apic_physflat = { 234struct genapic apic_physflat = {
178 .name = "physical flat", 235 .name = "physical flat",
236 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
179 .int_delivery_mode = dest_Fixed, 237 .int_delivery_mode = dest_Fixed,
180 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 238 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
181 .target_cpus = physflat_target_cpus, 239 .target_cpus = physflat_target_cpus,
@@ -185,6 +243,10 @@ struct genapic apic_physflat = {
185 .send_IPI_all = physflat_send_IPI_all, 243 .send_IPI_all = physflat_send_IPI_all,
186 .send_IPI_allbutself = physflat_send_IPI_allbutself, 244 .send_IPI_allbutself = physflat_send_IPI_allbutself,
187 .send_IPI_mask = physflat_send_IPI_mask, 245 .send_IPI_mask = physflat_send_IPI_mask,
246 .send_IPI_self = apic_send_IPI_self,
188 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, 247 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
189 .phys_pkg_id = phys_pkg_id, 248 .phys_pkg_id = phys_pkg_id,
249 .get_apic_id = get_apic_id,
250 .set_apic_id = set_apic_id,
251 .apic_id_mask = (0xFFu<<24),
190}; 252};
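
The new get_apic_id()/set_apic_id()/apic_id_mask trio in apic_flat and apic_physflat encodes the fact that an xAPIC ID is the 8-bit field in bits 31:24 of the APIC_ID register (hence the 0xFFu<<24 mask). The two conversions, written standalone:

/* xAPIC: the ID occupies bits 31:24 of the APIC_ID register. */
static unsigned int xapic_id_from_reg(unsigned long reg)
{
        return (reg >> 24) & 0xFFu;
}

static unsigned long xapic_reg_from_id(unsigned int id)
{
        return (unsigned long)(id & 0xFFu) << 24;
}

By contrast, the x2apic drivers added below use the full 32-bit value (apic_id_mask of 0xFFFFFFFFu), so their get_apic_id()/set_apic_id() are plain assignments.
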
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
new file mode 100644
index 000000000000..f6a2c8eb48a6
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -0,0 +1,159 @@
1#include <linux/threads.h>
2#include <linux/cpumask.h>
3#include <linux/string.h>
4#include <linux/kernel.h>
5#include <linux/ctype.h>
6#include <linux/init.h>
7#include <linux/dmar.h>
8
9#include <asm/smp.h>
10#include <asm/ipi.h>
11#include <asm/genapic.h>
12
13DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{
17 if (cpu_has_x2apic)
18 return 1;
19
20 return 0;
21}
22
23/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
24
25static cpumask_t x2apic_target_cpus(void)
26{
27 return cpumask_of_cpu(0);
28}
29
30/*
31 * for now each logical cpu is in its own vector allocation domain.
32 */
33static cpumask_t x2apic_vector_allocation_domain(int cpu)
34{
35 cpumask_t domain = CPU_MASK_NONE;
36 cpu_set(cpu, domain);
37 return domain;
38}
39
40static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
41 unsigned int dest)
42{
43 unsigned long cfg;
44
45 cfg = __prepare_ICR(0, vector, dest);
46
47 /*
48 * send the IPI.
49 */
50 x2apic_icr_write(cfg, apicid);
51}
52
53/*
 54 * for now, we send the IPIs one by one in the cpumask.
 55 * TBD: Based on the cpu mask, we can send the IPIs to the cluster group
 56 * at once. We have 16 CPUs in a cluster. This will minimize IPI register
57 * writes.
58 */
59static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
60{
61 unsigned long flags;
62 unsigned long query_cpu;
63
64 local_irq_save(flags);
65 for_each_cpu_mask(query_cpu, mask) {
66 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu),
67 vector, APIC_DEST_LOGICAL);
68 }
69 local_irq_restore(flags);
70}
71
72static void x2apic_send_IPI_allbutself(int vector)
73{
74 cpumask_t mask = cpu_online_map;
75
76 cpu_clear(smp_processor_id(), mask);
77
78 if (!cpus_empty(mask))
79 x2apic_send_IPI_mask(mask, vector);
80}
81
82static void x2apic_send_IPI_all(int vector)
83{
84 x2apic_send_IPI_mask(cpu_online_map, vector);
85}
86
87static int x2apic_apic_id_registered(void)
88{
89 return 1;
90}
91
92static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
93{
94 int cpu;
95
96 /*
97 * We're using fixed IRQ delivery, can only return one phys APIC ID.
98 * May as well be the first.
99 */
100 cpu = first_cpu(cpumask);
101 if ((unsigned)cpu < NR_CPUS)
102 return per_cpu(x86_cpu_to_logical_apicid, cpu);
103 else
104 return BAD_APICID;
105}
106
107static unsigned int get_apic_id(unsigned long x)
108{
109 unsigned int id;
110
111 id = x;
112 return id;
113}
114
115static unsigned long set_apic_id(unsigned int id)
116{
117 unsigned long x;
118
119 x = id;
120 return x;
121}
122
123static unsigned int phys_pkg_id(int index_msb)
124{
125 return current_cpu_data.initial_apicid >> index_msb;
126}
127
128static void x2apic_send_IPI_self(int vector)
129{
130 apic_write(APIC_SELF_IPI, vector);
131}
132
133static void init_x2apic_ldr(void)
134{
135 int cpu = smp_processor_id();
136
137 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
138 return;
139}
140
141struct genapic apic_x2apic_cluster = {
142 .name = "cluster x2apic",
143 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
144 .int_delivery_mode = dest_LowestPrio,
145 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
146 .target_cpus = x2apic_target_cpus,
147 .vector_allocation_domain = x2apic_vector_allocation_domain,
148 .apic_id_registered = x2apic_apic_id_registered,
149 .init_apic_ldr = init_x2apic_ldr,
150 .send_IPI_all = x2apic_send_IPI_all,
151 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
152 .send_IPI_mask = x2apic_send_IPI_mask,
153 .send_IPI_self = x2apic_send_IPI_self,
154 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
155 .phys_pkg_id = phys_pkg_id,
156 .get_apic_id = get_apic_id,
157 .set_apic_id = set_apic_id,
158 .apic_id_mask = (0xFFFFFFFFu),
159};
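
x2apic_send_IPI_allbutself() builds its target set by copying the online map and clearing the sender, then hands the mask to the per-CPU send loop. With a plain bitmask standing in for cpumask_t (a sketch only; the real code uses the cpumask API and one ICR write per target):

/* Online CPUs minus the sender; a zero result means nothing to send. */
static unsigned long ipi_allbutself_mask(unsigned long online, int self_cpu)
{
        return online & ~(1UL << self_cpu);
}

static void send_ipi_mask(unsigned long mask, int vector,
                          void (*send_one)(int cpu, int vector))
{
        int cpu;

        for (cpu = 0; cpu < (int)(8 * sizeof(mask)); cpu++)
                if (mask & (1UL << cpu))
                        send_one(cpu, vector);  /* one delivery per target CPU */
}
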
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
new file mode 100644
index 000000000000..d042211768b7
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -0,0 +1,154 @@
1#include <linux/threads.h>
2#include <linux/cpumask.h>
3#include <linux/string.h>
4#include <linux/kernel.h>
5#include <linux/ctype.h>
6#include <linux/init.h>
7#include <linux/dmar.h>
8
9#include <asm/smp.h>
10#include <asm/ipi.h>
11#include <asm/genapic.h>
12
13static int x2apic_phys;
14
15static int set_x2apic_phys_mode(char *arg)
16{
17 x2apic_phys = 1;
18 return 0;
19}
20early_param("x2apic_phys", set_x2apic_phys_mode);
21
22static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
23{
24 if (cpu_has_x2apic && x2apic_phys)
25 return 1;
26
27 return 0;
28}
29
30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
31
32static cpumask_t x2apic_target_cpus(void)
33{
34 return cpumask_of_cpu(0);
35}
36
37static cpumask_t x2apic_vector_allocation_domain(int cpu)
38{
39 cpumask_t domain = CPU_MASK_NONE;
40 cpu_set(cpu, domain);
41 return domain;
42}
43
44static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
45 unsigned int dest)
46{
47 unsigned long cfg;
48
49 cfg = __prepare_ICR(0, vector, dest);
50
51 /*
52 * send the IPI.
53 */
54 x2apic_icr_write(cfg, apicid);
55}
56
57static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
58{
59 unsigned long flags;
60 unsigned long query_cpu;
61
62 local_irq_save(flags);
63 for_each_cpu_mask(query_cpu, mask) {
64 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
65 vector, APIC_DEST_PHYSICAL);
66 }
67 local_irq_restore(flags);
68}
69
70static void x2apic_send_IPI_allbutself(int vector)
71{
72 cpumask_t mask = cpu_online_map;
73
74 cpu_clear(smp_processor_id(), mask);
75
76 if (!cpus_empty(mask))
77 x2apic_send_IPI_mask(mask, vector);
78}
79
80static void x2apic_send_IPI_all(int vector)
81{
82 x2apic_send_IPI_mask(cpu_online_map, vector);
83}
84
85static int x2apic_apic_id_registered(void)
86{
87 return 1;
88}
89
90static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
91{
92 int cpu;
93
94 /*
95 * We're using fixed IRQ delivery, can only return one phys APIC ID.
96 * May as well be the first.
97 */
98 cpu = first_cpu(cpumask);
99 if ((unsigned)cpu < NR_CPUS)
100 return per_cpu(x86_cpu_to_apicid, cpu);
101 else
102 return BAD_APICID;
103}
104
105static unsigned int get_apic_id(unsigned long x)
106{
107 unsigned int id;
108
109 id = x;
110 return id;
111}
112
113static unsigned long set_apic_id(unsigned int id)
114{
115 unsigned long x;
116
117 x = id;
118 return x;
119}
120
121static unsigned int phys_pkg_id(int index_msb)
122{
123 return current_cpu_data.initial_apicid >> index_msb;
124}
125
126void x2apic_send_IPI_self(int vector)
127{
128 apic_write(APIC_SELF_IPI, vector);
129}
130
131void init_x2apic_ldr(void)
132{
133 return;
134}
135
136struct genapic apic_x2apic_phys = {
137 .name = "physical x2apic",
138 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
139 .int_delivery_mode = dest_Fixed,
140 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
141 .target_cpus = x2apic_target_cpus,
142 .vector_allocation_domain = x2apic_vector_allocation_domain,
143 .apic_id_registered = x2apic_apic_id_registered,
144 .init_apic_ldr = init_x2apic_ldr,
145 .send_IPI_all = x2apic_send_IPI_all,
146 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
147 .send_IPI_mask = x2apic_send_IPI_mask,
148 .send_IPI_self = x2apic_send_IPI_self,
149 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
150 .phys_pkg_id = phys_pkg_id,
151 .get_apic_id = get_apic_id,
152 .set_apic_id = set_apic_id,
153 .apic_id_mask = (0xFFFFFFFFu),
154};
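
The physical-mode driver sends each IPI with one x2apic_icr_write(cfg, apicid) per target CPU. Assuming the layout from the x2APIC specification, that amounts to a single 64-bit write to the ICR MSR (0x830) with the destination APIC ID in the upper half; a hedged userspace sketch of just the value packing (the wrmsr itself is not shown):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define X2APIC_MSR_ICR 0x830   /* x2APIC ICR is a single 64-bit MSR (per spec) */

/* Pack the value such a wrmsr would take: the low 32 bits carry the
 * vector/delivery-mode word built by __prepare_ICR(), the high 32 bits
 * carry the destination x2APIC ID. */
static uint64_t x2apic_icr_value(uint32_t cfg, uint32_t apicid)
{
    return ((uint64_t)apicid << 32) | cfg;
}

int main(void)
{
    /* e.g. fixed delivery, physical destination, vector 0xfd to APIC ID 5 */
    uint64_t icr = x2apic_icr_value(0x000000fd, 5);

    assert((icr >> 32) == 5);
    assert((uint32_t)icr == 0xfd);
    printf("ICR MSR 0x%x <- 0x%016llx\n", X2APIC_MSR_ICR,
           (unsigned long long)icr);
    return 0;
}
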
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 711f11c30b06..680a06557c5e 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -12,18 +12,49 @@
12#include <linux/threads.h> 12#include <linux/threads.h>
13#include <linux/cpumask.h> 13#include <linux/cpumask.h>
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h> 15#include <linux/ctype.h>
17#include <linux/init.h> 16#include <linux/init.h>
18#include <linux/sched.h> 17#include <linux/sched.h>
19#include <linux/bootmem.h> 18#include <linux/bootmem.h>
20#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/hardirq.h>
21#include <asm/smp.h> 21#include <asm/smp.h>
22#include <asm/ipi.h> 22#include <asm/ipi.h>
23#include <asm/genapic.h> 23#include <asm/genapic.h>
24#include <asm/pgtable.h> 24#include <asm/pgtable.h>
25#include <asm/uv/uv_mmrs.h> 25#include <asm/uv/uv_mmrs.h>
26#include <asm/uv/uv_hub.h> 26#include <asm/uv/uv_hub.h>
27#include <asm/uv/bios.h>
28
29DEFINE_PER_CPU(int, x2apic_extra_bits);
30
31static enum uv_system_type uv_system_type;
32
33static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
34{
35 if (!strcmp(oem_id, "SGI")) {
36 if (!strcmp(oem_table_id, "UVL"))
37 uv_system_type = UV_LEGACY_APIC;
38 else if (!strcmp(oem_table_id, "UVX"))
39 uv_system_type = UV_X2APIC;
40 else if (!strcmp(oem_table_id, "UVH")) {
41 uv_system_type = UV_NON_UNIQUE_APIC;
42 return 1;
43 }
44 }
45 return 0;
46}
47
48enum uv_system_type get_uv_system_type(void)
49{
50 return uv_system_type;
51}
52
53int is_uv_system(void)
54{
55 return uv_system_type != UV_NONE;
56}
57EXPORT_SYMBOL_GPL(is_uv_system);
27 58
28DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 59DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
29EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); 60EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
@@ -40,6 +71,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
40short uv_possible_blades; 71short uv_possible_blades;
41EXPORT_SYMBOL_GPL(uv_possible_blades); 72EXPORT_SYMBOL_GPL(uv_possible_blades);
42 73
74unsigned long sn_rtc_cycles_per_second;
75EXPORT_SYMBOL(sn_rtc_cycles_per_second);
76
43/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 77/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
44 78
45static cpumask_t uv_target_cpus(void) 79static cpumask_t uv_target_cpus(void)
@@ -80,7 +114,7 @@ static void uv_send_IPI_one(int cpu, int vector)
80 unsigned long val, apicid, lapicid; 114 unsigned long val, apicid, lapicid;
81 int pnode; 115 int pnode;
82 116
83 apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ 117 apicid = per_cpu(x86_cpu_to_apicid, cpu);
84 lapicid = apicid & 0x3f; /* ZZZ macro needed */ 118 lapicid = apicid & 0x3f; /* ZZZ macro needed */
85 pnode = uv_apicid_to_pnode(apicid); 119 pnode = uv_apicid_to_pnode(apicid);
86 val = 120 val =
@@ -94,7 +128,7 @@ static void uv_send_IPI_mask(cpumask_t mask, int vector)
94{ 128{
95 unsigned int cpu; 129 unsigned int cpu;
96 130
97 for (cpu = 0; cpu < NR_CPUS; ++cpu) 131 for_each_possible_cpu(cpu)
98 if (cpu_isset(cpu, mask)) 132 if (cpu_isset(cpu, mask))
99 uv_send_IPI_one(cpu, vector); 133 uv_send_IPI_one(cpu, vector);
100} 134}
@@ -119,6 +153,10 @@ static int uv_apic_id_registered(void)
119 return 1; 153 return 1;
120} 154}
121 155
156static void uv_init_apic_ldr(void)
157{
158}
159
122static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) 160static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
123{ 161{
124 int cpu; 162 int cpu;
@@ -128,37 +166,65 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
128 * May as well be the first. 166 * May as well be the first.
129 */ 167 */
130 cpu = first_cpu(cpumask); 168 cpu = first_cpu(cpumask);
131 if ((unsigned)cpu < NR_CPUS) 169 if ((unsigned)cpu < nr_cpu_ids)
132 return per_cpu(x86_cpu_to_apicid, cpu); 170 return per_cpu(x86_cpu_to_apicid, cpu);
133 else 171 else
134 return BAD_APICID; 172 return BAD_APICID;
135} 173}
136 174
175static unsigned int get_apic_id(unsigned long x)
176{
177 unsigned int id;
178
179 WARN_ON(preemptible() && num_online_cpus() > 1);
180 id = x | __get_cpu_var(x2apic_extra_bits);
181
182 return id;
183}
184
185static unsigned long set_apic_id(unsigned int id)
186{
187 unsigned long x;
188
189 /* maskout x2apic_extra_bits ? */
190 x = id;
191 return x;
192}
193
194static unsigned int uv_read_apic_id(void)
195{
196
197 return get_apic_id(apic_read(APIC_ID));
198}
199
137static unsigned int phys_pkg_id(int index_msb) 200static unsigned int phys_pkg_id(int index_msb)
138{ 201{
139 return GET_APIC_ID(read_apic_id()) >> index_msb; 202 return uv_read_apic_id() >> index_msb;
140} 203}
141 204
142#ifdef ZZZ /* Needs x2apic patch */
143static void uv_send_IPI_self(int vector) 205static void uv_send_IPI_self(int vector)
144{ 206{
145 apic_write(APIC_SELF_IPI, vector); 207 apic_write(APIC_SELF_IPI, vector);
146} 208}
147#endif
148 209
149struct genapic apic_x2apic_uv_x = { 210struct genapic apic_x2apic_uv_x = {
150 .name = "UV large system", 211 .name = "UV large system",
212 .acpi_madt_oem_check = uv_acpi_madt_oem_check,
151 .int_delivery_mode = dest_Fixed, 213 .int_delivery_mode = dest_Fixed,
152 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 214 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
153 .target_cpus = uv_target_cpus, 215 .target_cpus = uv_target_cpus,
154 .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */ 216 .vector_allocation_domain = uv_vector_allocation_domain,
155 .apic_id_registered = uv_apic_id_registered, 217 .apic_id_registered = uv_apic_id_registered,
218 .init_apic_ldr = uv_init_apic_ldr,
156 .send_IPI_all = uv_send_IPI_all, 219 .send_IPI_all = uv_send_IPI_all,
157 .send_IPI_allbutself = uv_send_IPI_allbutself, 220 .send_IPI_allbutself = uv_send_IPI_allbutself,
158 .send_IPI_mask = uv_send_IPI_mask, 221 .send_IPI_mask = uv_send_IPI_mask,
159 /* ZZZ.send_IPI_self = uv_send_IPI_self, */ 222 .send_IPI_self = uv_send_IPI_self,
160 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, 223 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
161 .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ 224 .phys_pkg_id = phys_pkg_id,
225 .get_apic_id = get_apic_id,
226 .set_apic_id = set_apic_id,
227 .apic_id_mask = (0xFFFFFFFFu),
162}; 228};
163 229
164static __cpuinit void set_x2apic_extra_bits(int pnode) 230static __cpuinit void set_x2apic_extra_bits(int pnode)
@@ -218,12 +284,13 @@ static __init void map_low_mmrs(void)
218 284
219enum map_type {map_wb, map_uc}; 285enum map_type {map_wb, map_uc};
220 286
221static void map_high(char *id, unsigned long base, int shift, enum map_type map_type) 287static __init void map_high(char *id, unsigned long base, int shift,
288 int max_pnode, enum map_type map_type)
222{ 289{
223 unsigned long bytes, paddr; 290 unsigned long bytes, paddr;
224 291
225 paddr = base << shift; 292 paddr = base << shift;
226 bytes = (1UL << shift); 293 bytes = (1UL << shift) * (max_pnode + 1);
227 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, 294 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
228 paddr + bytes); 295 paddr + bytes);
229 if (map_type == map_uc) 296 if (map_type == map_uc)
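
The map_high() change above scales the mapping by the number of pnodes: each pnode owns a (1UL << shift) window, and the overlay must now cover pnodes 0..max_pnode. A tiny worked example with hypothetical numbers:

#include <assert.h>
#include <stdio.h>

/* map_high() sizing after this change: one (1UL << shift) window per pnode,
 * covering pnodes 0..max_pnode inclusive. Values below are hypothetical. */
int main(void)
{
    int shift = 28;                 /* 256 MB per-pnode window (made up) */
    int max_pnode = 3;              /* four pnodes: 0, 1, 2, 3 */
    unsigned long bytes = (1UL << shift) * (max_pnode + 1);

    assert(bytes == 4UL << 28);     /* 1 GB total instead of a single window */
    printf("mapping %lu MB\n", bytes >> 20);
    return 0;
}
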
@@ -239,7 +306,7 @@ static __init void map_gru_high(int max_pnode)
239 306
240 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); 307 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
241 if (gru.s.enable) 308 if (gru.s.enable)
242 map_high("GRU", gru.s.base, shift, map_wb); 309 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
243} 310}
244 311
245static __init void map_config_high(int max_pnode) 312static __init void map_config_high(int max_pnode)
@@ -249,7 +316,7 @@ static __init void map_config_high(int max_pnode)
249 316
250 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); 317 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
251 if (cfg.s.enable) 318 if (cfg.s.enable)
252 map_high("CONFIG", cfg.s.base, shift, map_uc); 319 map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc);
253} 320}
254 321
255static __init void map_mmr_high(int max_pnode) 322static __init void map_mmr_high(int max_pnode)
@@ -259,7 +326,7 @@ static __init void map_mmr_high(int max_pnode)
259 326
260 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); 327 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
261 if (mmr.s.enable) 328 if (mmr.s.enable)
262 map_high("MMR", mmr.s.base, shift, map_uc); 329 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
263} 330}
264 331
265static __init void map_mmioh_high(int max_pnode) 332static __init void map_mmioh_high(int max_pnode)
@@ -269,10 +336,44 @@ static __init void map_mmioh_high(int max_pnode)
269 336
270 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 337 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
271 if (mmioh.s.enable) 338 if (mmioh.s.enable)
272 map_high("MMIOH", mmioh.s.base, shift, map_uc); 339 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
340}
341
342static __init void uv_rtc_init(void)
343{
344 long status;
345 u64 ticks_per_sec;
346
347 status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
348 &ticks_per_sec);
349 if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) {
350 printk(KERN_WARNING
351 "unable to determine platform RTC clock frequency, "
352 "guessing.\n");
353 /* The BIOS reports a wrong clock frequency, so guess. */
354 sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
355 } else
356 sn_rtc_cycles_per_second = ticks_per_sec;
273} 357}
274 358
275static __init void uv_system_init(void) 359/*
360 * Called on each cpu to initialize the per_cpu UV data area.
361 * ZZZ hotplug not supported yet
362 */
363void __cpuinit uv_cpu_init(void)
364{
365 /* CPU 0 initialization will be done via uv_system_init. */
366 if (!uv_blade_info)
367 return;
368
369 uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
370
371 if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
372 set_x2apic_extra_bits(uv_hub_info->pnode);
373}
374
375
376void __init uv_system_init(void)
276{ 377{
277 union uvh_si_addr_map_config_u m_n_config; 378 union uvh_si_addr_map_config_u m_n_config;
278 union uvh_node_id_u node_id; 379 union uvh_node_id_u node_id;
@@ -326,6 +427,11 @@ static __init void uv_system_init(void)
326 gnode_upper = (((unsigned long)node_id.s.node_id) & 427 gnode_upper = (((unsigned long)node_id.s.node_id) &
327 ~((1 << n_val) - 1)) << m_val; 428 ~((1 << n_val) - 1)) << m_val;
328 429
430 uv_bios_init();
431 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
432 &uv_coherency_id, &uv_region_size);
433 uv_rtc_init();
434
329 for_each_present_cpu(cpu) { 435 for_each_present_cpu(cpu) {
330 nid = cpu_to_node(cpu); 436 nid = cpu_to_node(cpu);
331 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); 437 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
@@ -345,7 +451,7 @@ static __init void uv_system_init(void)
345 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 451 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
346 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 452 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
347 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 453 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
348 uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ 454 uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id;
349 uv_node_to_blade[nid] = blade; 455 uv_node_to_blade[nid] = blade;
350 uv_cpu_to_blade[cpu] = blade; 456 uv_cpu_to_blade[cpu] = blade;
351 max_pnode = max(pnode, max_pnode); 457 max_pnode = max(pnode, max_pnode);
@@ -360,19 +466,6 @@ static __init void uv_system_init(void)
360 map_mmr_high(max_pnode); 466 map_mmr_high(max_pnode);
361 map_config_high(max_pnode); 467 map_config_high(max_pnode);
362 map_mmioh_high(max_pnode); 468 map_mmioh_high(max_pnode);
363}
364 469
365/* 470 uv_cpu_init();
366 * Called on each cpu to initialize the per_cpu UV data area.
367 * ZZZ hotplug not supported yet
368 */
369void __cpuinit uv_cpu_init(void)
370{
371 if (!uv_node_to_blade)
372 uv_system_init();
373
374 uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
375
376 if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
377 set_x2apic_extra_bits(uv_hub_info->pnode);
378} 471}
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 3e66bd364a9d..1dcb0f13897e 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -35,6 +35,7 @@ void __init reserve_ebda_region(void)
35 35
36 /* start of EBDA area */ 36 /* start of EBDA area */
37 ebda_addr = get_bios_ebda(); 37 ebda_addr = get_bios_ebda();
38 printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
38 39
39 /* Fixup: bios puts an EBDA in the top 64K segment */ 40 /* Fixup: bios puts an EBDA in the top 64K segment */
40 /* of conventional memory, but does not adjust lowmem. */ 41 /* of conventional memory, but does not adjust lowmem. */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c97819829146..d16084f90649 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; 39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
40#endif 40#endif
41 41
42void __init x86_64_init_pda(void)
43{
44 _cpu_pda = __cpu_pda;
45 cpu_pda(0) = &_boot_cpu_pda;
46 pda_init(0);
47}
48
42static void __init zap_identity_mappings(void) 49static void __init zap_identity_mappings(void)
43{ 50{
44 pgd_t *pgd = pgd_offset_k(0UL); 51 pgd_t *pgd = pgd_offset_k(0UL);
@@ -81,6 +88,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
81 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); 88 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
82 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == 89 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
83 (__START_KERNEL & PGDIR_MASK))); 90 (__START_KERNEL & PGDIR_MASK)));
91 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
84 92
85 /* clear bss before set_intr_gate with early_idt_handler */ 93 /* clear bss before set_intr_gate with early_idt_handler */
86 clear_bss(); 94 clear_bss();
@@ -100,13 +108,10 @@ void __init x86_64_start_kernel(char * real_mode_data)
100 } 108 }
101 load_idt((const struct desc_ptr *)&idt_descr); 109 load_idt((const struct desc_ptr *)&idt_descr);
102 110
103 early_printk("Kernel alive\n"); 111 if (console_loglevel == 10)
104 112 early_printk("Kernel alive\n");
105 _cpu_pda = __cpu_pda;
106 cpu_pda(0) = &_boot_cpu_pda;
107 pda_init(0);
108 113
109 early_printk("Kernel really alive\n"); 114 x86_64_init_pda();
110 115
111 x86_64_start_reservations(real_mode_data); 116 x86_64_start_reservations(real_mode_data);
112} 117}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f67e93441caf..e835b4eea70b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4
172 * 172 *
173 * Note that the stack is not yet set up! 173 * Note that the stack is not yet set up!
174 */ 174 */
175#define PTE_ATTR 0x007 /* PRESENT+RW+USER */
176#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
177#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */
178
179default_entry: 175default_entry:
180#ifdef CONFIG_X86_PAE 176#ifdef CONFIG_X86_PAE
181 177
@@ -196,9 +192,9 @@ default_entry:
196 movl $pa(pg0), %edi 192 movl $pa(pg0), %edi
197 movl %edi, pa(init_pg_tables_start) 193 movl %edi, pa(init_pg_tables_start)
198 movl $pa(swapper_pg_pmd), %edx 194 movl $pa(swapper_pg_pmd), %edx
199 movl $PTE_ATTR, %eax 195 movl $PTE_IDENT_ATTR, %eax
20010: 19610:
201 leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ 197 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
202 movl %ecx,(%edx) /* Store PMD entry */ 198 movl %ecx,(%edx) /* Store PMD entry */
203 /* Upper half already zero */ 199 /* Upper half already zero */
204 addl $8,%edx 200 addl $8,%edx
@@ -215,7 +211,7 @@ default_entry:
215 * End condition: we must map up to and including INIT_MAP_BEYOND_END 211 * End condition: we must map up to and including INIT_MAP_BEYOND_END
216 * bytes beyond the end of our own page tables. 212 * bytes beyond the end of our own page tables.
217 */ 213 */
218 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 214 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
219 cmpl %ebp,%eax 215 cmpl %ebp,%eax
220 jb 10b 216 jb 10b
2211: 2171:
@@ -224,7 +220,7 @@ default_entry:
224 movl %eax, pa(max_pfn_mapped) 220 movl %eax, pa(max_pfn_mapped)
225 221
226 /* Do early initialization of the fixmap area */ 222 /* Do early initialization of the fixmap area */
227 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 223 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
228 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) 224 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
229#else /* Not PAE */ 225#else /* Not PAE */
230 226
@@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
233 movl $pa(pg0), %edi 229 movl $pa(pg0), %edi
234 movl %edi, pa(init_pg_tables_start) 230 movl %edi, pa(init_pg_tables_start)
235 movl $pa(swapper_pg_dir), %edx 231 movl $pa(swapper_pg_dir), %edx
236 movl $PTE_ATTR, %eax 232 movl $PTE_IDENT_ATTR, %eax
23710: 23310:
238 leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ 234 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
239 movl %ecx,(%edx) /* Store identity PDE entry */ 235 movl %ecx,(%edx) /* Store identity PDE entry */
240 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ 236 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
241 addl $4,%edx 237 addl $4,%edx
@@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
249 * bytes beyond the end of our own page tables; the +0x007 is 245 * bytes beyond the end of our own page tables; the +0x007 is
250 * the attribute bits 246 * the attribute bits
251 */ 247 */
252 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 248 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
253 cmpl %ebp,%eax 249 cmpl %ebp,%eax
254 jb 10b 250 jb 10b
255 movl %edi,pa(init_pg_tables_end) 251 movl %edi,pa(init_pg_tables_end)
@@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
257 movl %eax, pa(max_pfn_mapped) 253 movl %eax, pa(max_pfn_mapped)
258 254
259 /* Do early initialization of the fixmap area */ 255 /* Do early initialization of the fixmap area */
260 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 256 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
261 movl %eax,pa(swapper_pg_dir+0xffc) 257 movl %eax,pa(swapper_pg_dir+0xffc)
262#endif 258#endif
263 jmp 3f 259 jmp 3f
@@ -456,9 +452,6 @@ is386: movl $2,%ecx # set MP
4561: 4521:
457#endif /* CONFIG_SMP */ 453#endif /* CONFIG_SMP */
458 jmp *(initial_code) 454 jmp *(initial_code)
459.align 4
460ENTRY(initial_code)
461 .long i386_start_kernel
462 455
463/* 456/*
464 * We depend on ET to be correct. This checks for 287/387. 457 * We depend on ET to be correct. This checks for 287/387.
@@ -601,6 +594,11 @@ ignore_int:
601#endif 594#endif
602 iret 595 iret
603 596
597.section .cpuinit.data,"wa"
598.align 4
599ENTRY(initial_code)
600 .long i386_start_kernel
601
604.section .text 602.section .text
605/* 603/*
606 * Real beginning of normal "text" segment 604 * Real beginning of normal "text" segment
@@ -632,19 +630,19 @@ ENTRY(empty_zero_page)
632 /* Page-aligned for the benefit of paravirt? */ 630 /* Page-aligned for the benefit of paravirt? */
633 .align PAGE_SIZE_asm 631 .align PAGE_SIZE_asm
634ENTRY(swapper_pg_dir) 632ENTRY(swapper_pg_dir)
635 .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ 633 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
636# if KPMDS == 3 634# if KPMDS == 3
637 .long pa(swapper_pg_pmd+PGD_ATTR),0 635 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
638 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 636 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
639 .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 637 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
640# elif KPMDS == 2 638# elif KPMDS == 2
641 .long 0,0 639 .long 0,0
642 .long pa(swapper_pg_pmd+PGD_ATTR),0 640 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
643 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 641 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
644# elif KPMDS == 1 642# elif KPMDS == 1
645 .long 0,0 643 .long 0,0
646 .long 0,0 644 .long 0,0
647 .long pa(swapper_pg_pmd+PGD_ATTR),0 645 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
648# else 646# else
649# error "Kernel PMDs should be 1, 2 or 3" 647# error "Kernel PMDs should be 1, 2 or 3"
650# endif 648# endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b07ac7b217cb..26cfdc1d7c7f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -110,7 +110,7 @@ startup_64:
110 movq %rdi, %rax 110 movq %rdi, %rax
111 shrq $PMD_SHIFT, %rax 111 shrq $PMD_SHIFT, %rax
112 andq $(PTRS_PER_PMD - 1), %rax 112 andq $(PTRS_PER_PMD - 1), %rax
113 leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx 113 leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
114 leaq level2_spare_pgt(%rip), %rbx 114 leaq level2_spare_pgt(%rip), %rbx
115 movq %rdx, 0(%rbx, %rax, 8) 115 movq %rdx, 0(%rbx, %rax, 8)
116ident_complete: 116ident_complete:
@@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt)
374 /* Since I easily can, map the first 1G. 374 /* Since I easily can, map the first 1G.
375 * Don't set NX because code runs from these pages. 375 * Don't set NX because code runs from these pages.
376 */ 376 */
377 PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) 377 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
378 378
379NEXT_PAGE(level2_kernel_pgt) 379NEXT_PAGE(level2_kernel_pgt)
380 /* 380 /*
@@ -407,6 +407,7 @@ ENTRY(phys_base)
407 /* This must match the first entry in level2_kernel_pgt */ 407 /* This must match the first entry in level2_kernel_pgt */
408 .quad 0x0000000000000000 408 .quad 0x0000000000000000
409 409
410#include "../../x86/xen/xen-head.S"
410 411
411 .section .bss, "aw", @nobits 412 .section .bss, "aw", @nobits
412 .align L1_CACHE_BYTES 413 .align L1_CACHE_BYTES
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 0ea6a19bfdfe..77017e834cf7 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1,29 +1,49 @@
1#include <linux/clocksource.h> 1#include <linux/clocksource.h>
2#include <linux/clockchips.h> 2#include <linux/clockchips.h>
3#include <linux/interrupt.h>
4#include <linux/sysdev.h>
3#include <linux/delay.h> 5#include <linux/delay.h>
4#include <linux/errno.h> 6#include <linux/errno.h>
5#include <linux/hpet.h> 7#include <linux/hpet.h>
6#include <linux/init.h> 8#include <linux/init.h>
7#include <linux/sysdev.h> 9#include <linux/cpu.h>
8#include <linux/pm.h> 10#include <linux/pm.h>
11#include <linux/io.h>
9 12
10#include <asm/fixmap.h> 13#include <asm/fixmap.h>
11#include <asm/hpet.h>
12#include <asm/i8253.h> 14#include <asm/i8253.h>
13#include <asm/io.h> 15#include <asm/hpet.h>
14 16
15#define HPET_MASK CLOCKSOURCE_MASK(32) 17#define HPET_MASK CLOCKSOURCE_MASK(32)
16#define HPET_SHIFT 22 18#define HPET_SHIFT 22
17 19
18/* FSEC = 10^-15 20/* FSEC = 10^-15
19 NSEC = 10^-9 */ 21 NSEC = 10^-9 */
20#define FSEC_PER_NSEC 1000000L 22#define FSEC_PER_NSEC 1000000L
23
24#define HPET_DEV_USED_BIT 2
25#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT)
26#define HPET_DEV_VALID 0x8
27#define HPET_DEV_FSB_CAP 0x1000
28#define HPET_DEV_PERI_CAP 0x2000
29
30#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
21 31
22/* 32/*
23 * HPET address is set in acpi/boot.c, when an ACPI entry exists 33 * HPET address is set in acpi/boot.c, when an ACPI entry exists
24 */ 34 */
25unsigned long hpet_address; 35unsigned long hpet_address;
26static void __iomem *hpet_virt_address; 36unsigned long hpet_num_timers;
37static void __iomem *hpet_virt_address;
38
39struct hpet_dev {
40 struct clock_event_device evt;
41 unsigned int num;
42 int cpu;
43 unsigned int irq;
44 unsigned int flags;
45 char name[10];
46};
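
EVT_TO_HPET_DEV() above is the usual container_of() idiom: the clock_event_device handed to the clockevents callbacks is embedded in a struct hpet_dev, so the driver can recover its per-timer data without any lookup table. A self-contained illustration (the struct fields are trimmed stand-ins, not the real kernel definitions):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Userspace rendition of the kernel's container_of(): recover the address
 * of the enclosing structure from a pointer to one of its members. */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct clock_event_device { int irq; };     /* stand-in for the real struct */

struct hpet_dev {
    struct clock_event_device evt;          /* embedded, as in the patch */
    unsigned int num;
};

#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)

int main(void)
{
    struct hpet_dev dev = { .evt = { .irq = 24 }, .num = 2 };
    struct clock_event_device *evt = &dev.evt;  /* what a callback receives */

    assert(EVT_TO_HPET_DEV(evt) == &dev);
    printf("timer %u, irq %d\n", EVT_TO_HPET_DEV(evt)->num, evt->irq);
    return 0;
}
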
27 47
28unsigned long hpet_readl(unsigned long a) 48unsigned long hpet_readl(unsigned long a)
29{ 49{
@@ -59,7 +79,7 @@ static inline void hpet_clear_mapping(void)
59static int boot_hpet_disable; 79static int boot_hpet_disable;
60int hpet_force_user; 80int hpet_force_user;
61 81
62static int __init hpet_setup(char* str) 82static int __init hpet_setup(char *str)
63{ 83{
64 if (str) { 84 if (str) {
65 if (!strncmp("disable", str, 7)) 85 if (!strncmp("disable", str, 7))
@@ -80,7 +100,7 @@ __setup("nohpet", disable_hpet);
80 100
81static inline int is_hpet_capable(void) 101static inline int is_hpet_capable(void)
82{ 102{
83 return (!boot_hpet_disable && hpet_address); 103 return !boot_hpet_disable && hpet_address;
84} 104}
85 105
86/* 106/*
@@ -102,6 +122,9 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled);
102 * timer 0 and timer 1 in case of RTC emulation. 122 * timer 0 and timer 1 in case of RTC emulation.
103 */ 123 */
104#ifdef CONFIG_HPET 124#ifdef CONFIG_HPET
125
126static void hpet_reserve_msi_timers(struct hpet_data *hd);
127
105static void hpet_reserve_platform_timers(unsigned long id) 128static void hpet_reserve_platform_timers(unsigned long id)
106{ 129{
107 struct hpet __iomem *hpet = hpet_virt_address; 130 struct hpet __iomem *hpet = hpet_virt_address;
@@ -111,25 +134,31 @@ static void hpet_reserve_platform_timers(unsigned long id)
111 134
112 nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; 135 nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
113 136
114 memset(&hd, 0, sizeof (hd)); 137 memset(&hd, 0, sizeof(hd));
115 hd.hd_phys_address = hpet_address; 138 hd.hd_phys_address = hpet_address;
116 hd.hd_address = hpet; 139 hd.hd_address = hpet;
117 hd.hd_nirqs = nrtimers; 140 hd.hd_nirqs = nrtimers;
118 hd.hd_flags = HPET_DATA_PLATFORM;
119 hpet_reserve_timer(&hd, 0); 141 hpet_reserve_timer(&hd, 0);
120 142
121#ifdef CONFIG_HPET_EMULATE_RTC 143#ifdef CONFIG_HPET_EMULATE_RTC
122 hpet_reserve_timer(&hd, 1); 144 hpet_reserve_timer(&hd, 1);
123#endif 145#endif
124 146
147 /*
148 * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254
149 * is wrong for i8259!) not the output IRQ. Many BIOS writers
150 * don't bother configuring *any* comparator interrupts.
151 */
125 hd.hd_irq[0] = HPET_LEGACY_8254; 152 hd.hd_irq[0] = HPET_LEGACY_8254;
126 hd.hd_irq[1] = HPET_LEGACY_RTC; 153 hd.hd_irq[1] = HPET_LEGACY_RTC;
127 154
128 for (i = 2; i < nrtimers; timer++, i++) { 155 for (i = 2; i < nrtimers; timer++, i++) {
129 hd.hd_irq[i] = (readl(&timer->hpet_config) & Tn_INT_ROUTE_CNF_MASK) >> 156 hd.hd_irq[i] = (readl(&timer->hpet_config) &
130 Tn_INT_ROUTE_CNF_SHIFT; 157 Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
131 } 158 }
132 159
160 hpet_reserve_msi_timers(&hd);
161
133 hpet_alloc(&hd); 162 hpet_alloc(&hd);
134 163
135} 164}
@@ -210,8 +239,8 @@ static void hpet_legacy_clockevent_register(void)
210 /* Calculate the min / max delta */ 239 /* Calculate the min / max delta */
211 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, 240 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
212 &hpet_clockevent); 241 &hpet_clockevent);
213 hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, 242 /* 5 usec minimum reprogramming delta. */
214 &hpet_clockevent); 243 hpet_clockevent.min_delta_ns = 5000;
215 244
216 /* 245 /*
217 * Start hpet with the boot cpu mask and make it 246 * Start hpet with the boot cpu mask and make it
@@ -223,63 +252,421 @@ static void hpet_legacy_clockevent_register(void)
223 printk(KERN_DEBUG "hpet clockevent registered\n"); 252 printk(KERN_DEBUG "hpet clockevent registered\n");
224} 253}
225 254
226static void hpet_legacy_set_mode(enum clock_event_mode mode, 255static int hpet_setup_msi_irq(unsigned int irq);
227 struct clock_event_device *evt) 256
257static void hpet_set_mode(enum clock_event_mode mode,
258 struct clock_event_device *evt, int timer)
228{ 259{
229 unsigned long cfg, cmp, now; 260 unsigned long cfg, cmp, now;
230 uint64_t delta; 261 uint64_t delta;
231 262
232 switch(mode) { 263 switch (mode) {
233 case CLOCK_EVT_MODE_PERIODIC: 264 case CLOCK_EVT_MODE_PERIODIC:
234 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult; 265 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
235 delta >>= hpet_clockevent.shift; 266 delta >>= evt->shift;
236 now = hpet_readl(HPET_COUNTER); 267 now = hpet_readl(HPET_COUNTER);
237 cmp = now + (unsigned long) delta; 268 cmp = now + (unsigned long) delta;
238 cfg = hpet_readl(HPET_T0_CFG); 269 cfg = hpet_readl(HPET_Tn_CFG(timer));
239 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | 270 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
240 HPET_TN_SETVAL | HPET_TN_32BIT; 271 HPET_TN_SETVAL | HPET_TN_32BIT;
241 hpet_writel(cfg, HPET_T0_CFG); 272 hpet_writel(cfg, HPET_Tn_CFG(timer));
242 /* 273 /*
243 * The first write after writing TN_SETVAL to the 274 * The first write after writing TN_SETVAL to the
244 * config register sets the counter value, the second 275 * config register sets the counter value, the second
245 * write sets the period. 276 * write sets the period.
246 */ 277 */
247 hpet_writel(cmp, HPET_T0_CMP); 278 hpet_writel(cmp, HPET_Tn_CMP(timer));
248 udelay(1); 279 udelay(1);
249 hpet_writel((unsigned long) delta, HPET_T0_CMP); 280 hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
250 break; 281 break;
251 282
252 case CLOCK_EVT_MODE_ONESHOT: 283 case CLOCK_EVT_MODE_ONESHOT:
253 cfg = hpet_readl(HPET_T0_CFG); 284 cfg = hpet_readl(HPET_Tn_CFG(timer));
254 cfg &= ~HPET_TN_PERIODIC; 285 cfg &= ~HPET_TN_PERIODIC;
255 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; 286 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
256 hpet_writel(cfg, HPET_T0_CFG); 287 hpet_writel(cfg, HPET_Tn_CFG(timer));
257 break; 288 break;
258 289
259 case CLOCK_EVT_MODE_UNUSED: 290 case CLOCK_EVT_MODE_UNUSED:
260 case CLOCK_EVT_MODE_SHUTDOWN: 291 case CLOCK_EVT_MODE_SHUTDOWN:
261 cfg = hpet_readl(HPET_T0_CFG); 292 cfg = hpet_readl(HPET_Tn_CFG(timer));
262 cfg &= ~HPET_TN_ENABLE; 293 cfg &= ~HPET_TN_ENABLE;
263 hpet_writel(cfg, HPET_T0_CFG); 294 hpet_writel(cfg, HPET_Tn_CFG(timer));
264 break; 295 break;
265 296
266 case CLOCK_EVT_MODE_RESUME: 297 case CLOCK_EVT_MODE_RESUME:
267 hpet_enable_legacy_int(); 298 if (timer == 0) {
299 hpet_enable_legacy_int();
300 } else {
301 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
302 hpet_setup_msi_irq(hdev->irq);
303 disable_irq(hdev->irq);
304 irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu));
305 enable_irq(hdev->irq);
306 }
268 break; 307 break;
269 } 308 }
270} 309}
271 310
272static int hpet_legacy_next_event(unsigned long delta, 311static int hpet_next_event(unsigned long delta,
273 struct clock_event_device *evt) 312 struct clock_event_device *evt, int timer)
274{ 313{
275 unsigned long cnt; 314 u32 cnt;
276 315
277 cnt = hpet_readl(HPET_COUNTER); 316 cnt = hpet_readl(HPET_COUNTER);
278 cnt += delta; 317 cnt += (u32) delta;
279 hpet_writel(cnt, HPET_T0_CMP); 318 hpet_writel(cnt, HPET_Tn_CMP(timer));
319
320 /*
321 * We need to read back the CMP register to make sure that
322 * what we wrote hit the chip before we compare it to the
323 * counter.
324 */
325 WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt);
326
327 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
328}
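
The readback and the final comparison in hpet_next_event() both deal with a free-running 32-bit counter: by subtracting in u32 and interpreting the result as s32, "counter already passed the comparator" is detected correctly even across wraparound. A small standalone check of that arithmetic:

#include <assert.h>
#include <stdint.h>

/* The -ETIME test above: has the free-running 32-bit counter already passed
 * the programmed comparator? Subtracting in u32 and reading the result as
 * s32 keeps the answer right across the 32-bit wrap. */
static int counter_passed(uint32_t counter, uint32_t cmp)
{
    return (int32_t)(counter - cmp) >= 0;
}

int main(void)
{
    /* comparator still ahead of the counter: not passed */
    assert(!counter_passed(0x1000, 0x1100));
    /* counter already beyond the comparator: passed, report -ETIME */
    assert(counter_passed(0x1100, 0x1000));
    /* comparator programmed just past the wrap still compares correctly */
    assert(!counter_passed(0xfffffff0u, 0x00000010u));
    return 0;
}
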
329
330static void hpet_legacy_set_mode(enum clock_event_mode mode,
331 struct clock_event_device *evt)
332{
333 hpet_set_mode(mode, evt, 0);
334}
335
336static int hpet_legacy_next_event(unsigned long delta,
337 struct clock_event_device *evt)
338{
339 return hpet_next_event(delta, evt, 0);
340}
341
342/*
343 * HPET MSI Support
344 */
345#ifdef CONFIG_PCI_MSI
346
347static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
348static struct hpet_dev *hpet_devs;
349
350void hpet_msi_unmask(unsigned int irq)
351{
352 struct hpet_dev *hdev = get_irq_data(irq);
353 unsigned long cfg;
354
355 /* unmask it */
356 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
357 cfg |= HPET_TN_FSB;
358 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
359}
360
361void hpet_msi_mask(unsigned int irq)
362{
363 unsigned long cfg;
364 struct hpet_dev *hdev = get_irq_data(irq);
365
366 /* mask it */
367 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
368 cfg &= ~HPET_TN_FSB;
369 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
370}
371
372void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
373{
374 struct hpet_dev *hdev = get_irq_data(irq);
375
376 hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
377 hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
378}
379
380void hpet_msi_read(unsigned int irq, struct msi_msg *msg)
381{
382 struct hpet_dev *hdev = get_irq_data(irq);
383
384 msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
385 msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
386 msg->address_hi = 0;
387}
388
389static void hpet_msi_set_mode(enum clock_event_mode mode,
390 struct clock_event_device *evt)
391{
392 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
393 hpet_set_mode(mode, evt, hdev->num);
394}
395
396static int hpet_msi_next_event(unsigned long delta,
397 struct clock_event_device *evt)
398{
399 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
400 return hpet_next_event(delta, evt, hdev->num);
401}
402
403static int hpet_setup_msi_irq(unsigned int irq)
404{
405 if (arch_setup_hpet_msi(irq)) {
406 destroy_irq(irq);
407 return -EINVAL;
408 }
409 return 0;
410}
411
412static int hpet_assign_irq(struct hpet_dev *dev)
413{
414 unsigned int irq;
415
416 irq = create_irq();
417 if (!irq)
418 return -EINVAL;
419
420 set_irq_data(irq, dev);
421
422 if (hpet_setup_msi_irq(irq))
423 return -EINVAL;
424
425 dev->irq = irq;
426 return 0;
427}
428
429static irqreturn_t hpet_interrupt_handler(int irq, void *data)
430{
431 struct hpet_dev *dev = (struct hpet_dev *)data;
432 struct clock_event_device *hevt = &dev->evt;
433
434 if (!hevt->event_handler) {
435 printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n",
436 dev->num);
437 return IRQ_HANDLED;
438 }
439
440 hevt->event_handler(hevt);
441 return IRQ_HANDLED;
442}
443
444static int hpet_setup_irq(struct hpet_dev *dev)
445{
446
447 if (request_irq(dev->irq, hpet_interrupt_handler,
448 IRQF_SHARED|IRQF_NOBALANCING, dev->name, dev))
449 return -1;
450
451 disable_irq(dev->irq);
452 irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu));
453 enable_irq(dev->irq);
454
455 printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
456 dev->name, dev->irq);
457
458 return 0;
459}
460
461/* This should be called on the specified @cpu */
462static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
463{
464 struct clock_event_device *evt = &hdev->evt;
465 uint64_t hpet_freq;
466
467 WARN_ON(cpu != smp_processor_id());
468 if (!(hdev->flags & HPET_DEV_VALID))
469 return;
280 470
281 return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0; 471 if (hpet_setup_msi_irq(hdev->irq))
472 return;
473
474 hdev->cpu = cpu;
475 per_cpu(cpu_hpet_dev, cpu) = hdev;
476 evt->name = hdev->name;
477 hpet_setup_irq(hdev);
478 evt->irq = hdev->irq;
479
480 evt->rating = 110;
481 evt->features = CLOCK_EVT_FEAT_ONESHOT;
482 if (hdev->flags & HPET_DEV_PERI_CAP)
483 evt->features |= CLOCK_EVT_FEAT_PERIODIC;
484
485 evt->set_mode = hpet_msi_set_mode;
486 evt->set_next_event = hpet_msi_next_event;
487 evt->shift = 32;
488
489 /*
490 * The period is a femtosecond value. We need to calculate the
491 * scaled math multiplication factor for nanosecond to hpet tick
492 * conversion.
493 */
494 hpet_freq = 1000000000000000ULL;
495 do_div(hpet_freq, hpet_period);
496 evt->mult = div_sc((unsigned long) hpet_freq,
497 NSEC_PER_SEC, evt->shift);
498 /* Calculate the max delta */
499 evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
500 /* 5 usec minimum reprogramming delta. */
501 evt->min_delta_ns = 5000;
502
503 evt->cpumask = cpumask_of_cpu(hdev->cpu);
504 clockevents_register_device(evt);
505}
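
The mult/shift setup above is the standard clockevents scaled-math conversion: with the HPET period given in femtoseconds, mult works out to roughly (freq << shift) / NSEC_PER_SEC, and a nanosecond delta becomes ticks = (ns * mult) >> shift. A worked example with a hypothetical ~14.318 MHz HPET (the period value is an assumption, and div_sc() is re-implemented in plain C):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
    /* Hypothetical HPET ticking at ~14.318 MHz; HPET_PERIOD is in fs. */
    uint64_t hpet_period = 69841279;        /* femtoseconds per tick */
    uint64_t hpet_freq = 1000000000000000ULL / hpet_period;
    unsigned int shift = 32;

    /* For these arguments, div_sc() boils down to (freq << shift) / nsec. */
    uint64_t mult = (hpet_freq << shift) / NSEC_PER_SEC;

    /* Converting a 1 ms delta to HPET ticks: (ns * mult) >> shift. */
    uint64_t ticks = (1000000ULL * mult) >> shift;

    assert(ticks > 14000 && ticks < 14700);  /* ~14318 ticks expected */
    printf("freq=%llu Hz, mult=%llu, 1ms=%llu ticks\n",
           (unsigned long long)hpet_freq, (unsigned long long)mult,
           (unsigned long long)ticks);
    return 0;
}
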
506
507#ifdef CONFIG_HPET
508/* Reserve at least one timer for userspace (/dev/hpet) */
509#define RESERVE_TIMERS 1
510#else
511#define RESERVE_TIMERS 0
512#endif
513
514static void hpet_msi_capability_lookup(unsigned int start_timer)
515{
516 unsigned int id;
517 unsigned int num_timers;
518 unsigned int num_timers_used = 0;
519 int i;
520
521 id = hpet_readl(HPET_ID);
522
523 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
524 num_timers++; /* Value read out starts from 0 */
525
526 hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
527 if (!hpet_devs)
528 return;
529
530 hpet_num_timers = num_timers;
531
532 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
533 struct hpet_dev *hdev = &hpet_devs[num_timers_used];
534 unsigned long cfg = hpet_readl(HPET_Tn_CFG(i));
535
536 /* Only consider HPET timers with MSI support */
537 if (!(cfg & HPET_TN_FSB_CAP))
538 continue;
539
540 hdev->flags = 0;
541 if (cfg & HPET_TN_PERIODIC_CAP)
542 hdev->flags |= HPET_DEV_PERI_CAP;
543 hdev->num = i;
544
545 sprintf(hdev->name, "hpet%d", i);
546 if (hpet_assign_irq(hdev))
547 continue;
548
549 hdev->flags |= HPET_DEV_FSB_CAP;
550 hdev->flags |= HPET_DEV_VALID;
551 num_timers_used++;
552 if (num_timers_used == num_possible_cpus())
553 break;
554 }
555
556 printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n",
557 num_timers, num_timers_used);
558}
559
560#ifdef CONFIG_HPET
561static void hpet_reserve_msi_timers(struct hpet_data *hd)
562{
563 int i;
564
565 if (!hpet_devs)
566 return;
567
568 for (i = 0; i < hpet_num_timers; i++) {
569 struct hpet_dev *hdev = &hpet_devs[i];
570
571 if (!(hdev->flags & HPET_DEV_VALID))
572 continue;
573
574 hd->hd_irq[hdev->num] = hdev->irq;
575 hpet_reserve_timer(hd, hdev->num);
576 }
577}
578#endif
579
580static struct hpet_dev *hpet_get_unused_timer(void)
581{
582 int i;
583
584 if (!hpet_devs)
585 return NULL;
586
587 for (i = 0; i < hpet_num_timers; i++) {
588 struct hpet_dev *hdev = &hpet_devs[i];
589
590 if (!(hdev->flags & HPET_DEV_VALID))
591 continue;
592 if (test_and_set_bit(HPET_DEV_USED_BIT,
593 (unsigned long *)&hdev->flags))
594 continue;
595 return hdev;
596 }
597 return NULL;
598}
599
600struct hpet_work_struct {
601 struct delayed_work work;
602 struct completion complete;
603};
604
605static void hpet_work(struct work_struct *w)
606{
607 struct hpet_dev *hdev;
608 int cpu = smp_processor_id();
609 struct hpet_work_struct *hpet_work;
610
611 hpet_work = container_of(w, struct hpet_work_struct, work.work);
612
613 hdev = hpet_get_unused_timer();
614 if (hdev)
615 init_one_hpet_msi_clockevent(hdev, cpu);
616
617 complete(&hpet_work->complete);
618}
619
620static int hpet_cpuhp_notify(struct notifier_block *n,
621 unsigned long action, void *hcpu)
622{
623 unsigned long cpu = (unsigned long)hcpu;
624 struct hpet_work_struct work;
625 struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu);
626
627 switch (action & 0xf) {
628 case CPU_ONLINE:
629 INIT_DELAYED_WORK(&work.work, hpet_work);
630 init_completion(&work.complete);
631 /* FIXME: add schedule_work_on() */
632 schedule_delayed_work_on(cpu, &work.work, 0);
633 wait_for_completion(&work.complete);
634 break;
635 case CPU_DEAD:
636 if (hdev) {
637 free_irq(hdev->irq, hdev);
638 hdev->flags &= ~HPET_DEV_USED;
639 per_cpu(cpu_hpet_dev, cpu) = NULL;
640 }
641 break;
642 }
643 return NOTIFY_OK;
644}
645#else
646
647static int hpet_setup_msi_irq(unsigned int irq)
648{
649 return 0;
650}
651static void hpet_msi_capability_lookup(unsigned int start_timer)
652{
653 return;
654}
655
656#ifdef CONFIG_HPET
657static void hpet_reserve_msi_timers(struct hpet_data *hd)
658{
659 return;
282} 660}
661#endif
662
663static int hpet_cpuhp_notify(struct notifier_block *n,
664 unsigned long action, void *hcpu)
665{
666 return NOTIFY_OK;
667}
668
669#endif
283 670
284/* 671/*
285 * Clock source related code 672 * Clock source related code
@@ -359,6 +746,7 @@ static int hpet_clocksource_register(void)
359int __init hpet_enable(void) 746int __init hpet_enable(void)
360{ 747{
361 unsigned long id; 748 unsigned long id;
749 int i;
362 750
363 if (!is_hpet_capable()) 751 if (!is_hpet_capable())
364 return 0; 752 return 0;
@@ -369,6 +757,29 @@ int __init hpet_enable(void)
369 * Read the period and check for a sane value: 757 * Read the period and check for a sane value:
370 */ 758 */
371 hpet_period = hpet_readl(HPET_PERIOD); 759 hpet_period = hpet_readl(HPET_PERIOD);
760
761 /*
762 * AMD SB700 based systems with spread spectrum enabled use a
763 * SMM based HPET emulation to provide proper frequency
764 * setting. The SMM code is initialized with the first HPET
765 * register access and takes some time to complete. During
766 * this time the config register reads 0xffffffff. We poll the
767 * config register for up to 1000 iterations, waiting for it to
768 * read a non-0xffffffff value, to make sure that HPET is up and
769 * running before we go further. A counting loop is safe, as the
770 * HPET access takes thousands of CPU cycles. On non-SB700-based
771 * machines this check is done only once and has no side
772 * effects.
773 */
774 for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) {
775 if (i == 1000) {
776 printk(KERN_WARNING
777 "HPET config register value = 0xFFFFFFFF. "
778 "Disabling HPET\n");
779 goto out_nohpet;
780 }
781 }
782
372 if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) 783 if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
373 goto out_nohpet; 784 goto out_nohpet;
374 785
@@ -392,8 +803,10 @@ int __init hpet_enable(void)
392 803
393 if (id & HPET_ID_LEGSUP) { 804 if (id & HPET_ID_LEGSUP) {
394 hpet_legacy_clockevent_register(); 805 hpet_legacy_clockevent_register();
806 hpet_msi_capability_lookup(2);
395 return 1; 807 return 1;
396 } 808 }
809 hpet_msi_capability_lookup(0);
397 return 0; 810 return 0;
398 811
399out_nohpet: 812out_nohpet:
@@ -410,6 +823,8 @@ out_nohpet:
410 */ 823 */
411static __init int hpet_late_init(void) 824static __init int hpet_late_init(void)
412{ 825{
826 int cpu;
827
413 if (boot_hpet_disable) 828 if (boot_hpet_disable)
414 return -ENODEV; 829 return -ENODEV;
415 830
@@ -425,6 +840,13 @@ static __init int hpet_late_init(void)
425 840
426 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 841 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
427 842
843 for_each_online_cpu(cpu) {
844 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
845 }
846
847 /* This notifier should be called after workqueue is ready */
848 hotcpu_notifier(hpet_cpuhp_notify, -20);
849
428 return 0; 850 return 0;
429} 851}
430fs_initcall(hpet_late_init); 852fs_initcall(hpet_late_init);
@@ -468,7 +890,7 @@ void hpet_disable(void)
468#define RTC_NUM_INTS 1 890#define RTC_NUM_INTS 1
469 891
470static unsigned long hpet_rtc_flags; 892static unsigned long hpet_rtc_flags;
471static unsigned long hpet_prev_update_sec; 893static int hpet_prev_update_sec;
472static struct rtc_time hpet_alarm_time; 894static struct rtc_time hpet_alarm_time;
473static unsigned long hpet_pie_count; 895static unsigned long hpet_pie_count;
474static unsigned long hpet_t1_cmp; 896static unsigned long hpet_t1_cmp;
@@ -575,6 +997,9 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask)
575 997
576 hpet_rtc_flags |= bit_mask; 998 hpet_rtc_flags |= bit_mask;
577 999
1000 if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE))
1001 hpet_prev_update_sec = -1;
1002
578 if (!oldbits) 1003 if (!oldbits)
579 hpet_rtc_timer_init(); 1004 hpet_rtc_timer_init();
580 1005
@@ -652,7 +1077,7 @@ static void hpet_rtc_timer_reinit(void)
652 if (hpet_rtc_flags & RTC_PIE) 1077 if (hpet_rtc_flags & RTC_PIE)
653 hpet_pie_count += lost_ints; 1078 hpet_pie_count += lost_ints;
654 if (printk_ratelimit()) 1079 if (printk_ratelimit())
655 printk(KERN_WARNING "rtc: lost %d interrupts\n", 1080 printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n",
656 lost_ints); 1081 lost_ints);
657 } 1082 }
658} 1083}
@@ -670,7 +1095,8 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
670 1095
671 if (hpet_rtc_flags & RTC_UIE && 1096 if (hpet_rtc_flags & RTC_UIE &&
672 curr_time.tm_sec != hpet_prev_update_sec) { 1097 curr_time.tm_sec != hpet_prev_update_sec) {
673 rtc_int_flag = RTC_UF; 1098 if (hpet_prev_update_sec >= 0)
1099 rtc_int_flag = RTC_UF;
674 hpet_prev_update_sec = curr_time.tm_sec; 1100 hpet_prev_update_sec = curr_time.tm_sec;
675 } 1101 }
676 1102
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index eb9ddd8efb82..1f20608d4ca8 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -21,9 +21,12 @@
21# include <asm/sigcontext32.h> 21# include <asm/sigcontext32.h>
22# include <asm/user32.h> 22# include <asm/user32.h>
23#else 23#else
24# define save_i387_ia32 save_i387 24# define save_i387_xstate_ia32 save_i387_xstate
25# define restore_i387_ia32 restore_i387 25# define restore_i387_xstate_ia32 restore_i387_xstate
26# define _fpstate_ia32 _fpstate 26# define _fpstate_ia32 _fpstate
27# define _xstate_ia32 _xstate
28# define sig_xstate_ia32_size sig_xstate_size
29# define fx_sw_reserved_ia32 fx_sw_reserved
27# define user_i387_ia32_struct user_i387_struct 30# define user_i387_ia32_struct user_i387_struct
28# define user32_fxsr_struct user_fxsr_struct 31# define user32_fxsr_struct user_fxsr_struct
29#endif 32#endif
@@ -36,6 +39,7 @@
36 39
37static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; 40static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
38unsigned int xstate_size; 41unsigned int xstate_size;
42unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
39static struct i387_fxsave_struct fx_scratch __cpuinitdata; 43static struct i387_fxsave_struct fx_scratch __cpuinitdata;
40 44
41void __cpuinit mxcsr_feature_mask_init(void) 45void __cpuinit mxcsr_feature_mask_init(void)
@@ -61,6 +65,11 @@ void __init init_thread_xstate(void)
61 return; 65 return;
62 } 66 }
63 67
68 if (cpu_has_xsave) {
69 xsave_cntxt_init();
70 return;
71 }
72
64 if (cpu_has_fxsr) 73 if (cpu_has_fxsr)
65 xstate_size = sizeof(struct i387_fxsave_struct); 74 xstate_size = sizeof(struct i387_fxsave_struct);
66#ifdef CONFIG_X86_32 75#ifdef CONFIG_X86_32
@@ -83,9 +92,19 @@ void __cpuinit fpu_init(void)
83 92
84 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ 93 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
85 94
95 /*
96 * Boot processor to setup the FP and extended state context info.
97 */
98 if (!smp_processor_id())
99 init_thread_xstate();
100 xsave_init();
101
86 mxcsr_feature_mask_init(); 102 mxcsr_feature_mask_init();
87 /* clean state in init */ 103 /* clean state in init */
88 current_thread_info()->status = 0; 104 if (cpu_has_xsave)
105 current_thread_info()->status = TS_XSAVE;
106 else
107 current_thread_info()->status = 0;
89 clear_used_math(); 108 clear_used_math();
90} 109}
91#endif /* CONFIG_X86_64 */ 110#endif /* CONFIG_X86_64 */
@@ -195,6 +214,13 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
195 */ 214 */
196 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 215 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
197 216
217 /*
218 * update the header bits in the xsave header, indicating the
219 * presence of FP and SSE state.
220 */
221 if (cpu_has_xsave)
222 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
223
198 return ret; 224 return ret;
199} 225}
200 226
@@ -395,6 +421,12 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
395 if (!ret) 421 if (!ret)
396 convert_to_fxsr(target, &env); 422 convert_to_fxsr(target, &env);
397 423
424 /*
425 * update the header bit in the xsave header, indicating the
426 * presence of FP.
427 */
428 if (cpu_has_xsave)
429 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
398 return ret; 430 return ret;
399} 431}
400 432
@@ -407,7 +439,6 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
407 struct task_struct *tsk = current; 439 struct task_struct *tsk = current;
408 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; 440 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
409 441
410 unlazy_fpu(tsk);
411 fp->status = fp->swd; 442 fp->status = fp->swd;
412 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) 443 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
413 return -1; 444 return -1;
@@ -421,8 +452,6 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
421 struct user_i387_ia32_struct env; 452 struct user_i387_ia32_struct env;
422 int err = 0; 453 int err = 0;
423 454
424 unlazy_fpu(tsk);
425
426 convert_from_fxsr(&env, tsk); 455 convert_from_fxsr(&env, tsk);
427 if (__copy_to_user(buf, &env, sizeof(env))) 456 if (__copy_to_user(buf, &env, sizeof(env)))
428 return -1; 457 return -1;
@@ -432,16 +461,54 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
432 if (err) 461 if (err)
433 return -1; 462 return -1;
434 463
435 if (__copy_to_user(&buf->_fxsr_env[0], fx, 464 if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size))
436 sizeof(struct i387_fxsave_struct))) 465 return -1;
466 return 1;
467}
468
469static int save_i387_xsave(void __user *buf)
470{
471 struct task_struct *tsk = current;
472 struct _fpstate_ia32 __user *fx = buf;
473 int err = 0;
474
475 /*
476 * For legacy compatibility, we always set the FP/SSE bits in the
477 * bit vector while saving the state to the user context.
478 * This lets us capture any changes made (during sigreturn) to the
479 * FP/SSE bits by legacy applications which don't touch xstate_bv
480 * in the xsave header.
481 *
482 * xsave-aware applications can change the xstate_bv in the xsave
483 * header as well as change any contents in the memory layout.
484 * xrestore as part of sigreturn will capture all the changes.
485 */
486 tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
487
488 if (save_i387_fxsave(fx) < 0)
489 return -1;
490
491 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32,
492 sizeof(struct _fpx_sw_bytes));
493 err |= __put_user(FP_XSTATE_MAGIC2,
494 (__u32 __user *) (buf + sig_xstate_ia32_size
495 - FP_XSTATE_MAGIC2_SIZE));
496 if (err)
437 return -1; 497 return -1;
498
438 return 1; 499 return 1;
439} 500}
440 501
441int save_i387_ia32(struct _fpstate_ia32 __user *buf) 502int save_i387_xstate_ia32(void __user *buf)
442{ 503{
504 struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
505 struct task_struct *tsk = current;
506
443 if (!used_math()) 507 if (!used_math())
444 return 0; 508 return 0;
509
510 if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size))
511 return -EACCES;
445 /* 512 /*
446 * This will cause a "finit" to be triggered by the next 513 * This will cause a "finit" to be triggered by the next
447 * attempted FPU operation by the 'current' process. 514 * attempted FPU operation by the 'current' process.
@@ -451,13 +518,17 @@ int save_i387_ia32(struct _fpstate_ia32 __user *buf)
451 if (!HAVE_HWFP) { 518 if (!HAVE_HWFP) {
452 return fpregs_soft_get(current, NULL, 519 return fpregs_soft_get(current, NULL,
453 0, sizeof(struct user_i387_ia32_struct), 520 0, sizeof(struct user_i387_ia32_struct),
454 NULL, buf) ? -1 : 1; 521 NULL, fp) ? -1 : 1;
455 } 522 }
456 523
524 unlazy_fpu(tsk);
525
526 if (cpu_has_xsave)
527 return save_i387_xsave(fp);
457 if (cpu_has_fxsr) 528 if (cpu_has_fxsr)
458 return save_i387_fxsave(buf); 529 return save_i387_fxsave(fp);
459 else 530 else
460 return save_i387_fsave(buf); 531 return save_i387_fsave(fp);
461} 532}
462 533
463static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) 534static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
@@ -468,14 +539,15 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
468 sizeof(struct i387_fsave_struct)); 539 sizeof(struct i387_fsave_struct));
469} 540}
470 541
471static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) 542static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
543 unsigned int size)
472{ 544{
473 struct task_struct *tsk = current; 545 struct task_struct *tsk = current;
474 struct user_i387_ia32_struct env; 546 struct user_i387_ia32_struct env;
475 int err; 547 int err;
476 548
477 err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], 549 err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
478 sizeof(struct i387_fxsave_struct)); 550 size);
479 /* mxcsr reserved bits must be masked to zero for security reasons */ 551 /* mxcsr reserved bits must be masked to zero for security reasons */
480 tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 552 tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
481 if (err || __copy_from_user(&env, buf, sizeof(env))) 553 if (err || __copy_from_user(&env, buf, sizeof(env)))
@@ -485,14 +557,69 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
485 return 0; 557 return 0;
486} 558}
487 559
488int restore_i387_ia32(struct _fpstate_ia32 __user *buf) 560static int restore_i387_xsave(void __user *buf)
561{
562 struct _fpx_sw_bytes fx_sw_user;
563 struct _fpstate_ia32 __user *fx_user =
564 ((struct _fpstate_ia32 __user *) buf);
565 struct i387_fxsave_struct __user *fx =
566 (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
567 struct xsave_hdr_struct *xsave_hdr =
568 &current->thread.xstate->xsave.xsave_hdr;
569 u64 mask;
570 int err;
571
572 if (check_for_xstate(fx, buf, &fx_sw_user))
573 goto fx_only;
574
575 mask = fx_sw_user.xstate_bv;
576
577 err = restore_i387_fxsave(buf, fx_sw_user.xstate_size);
578
579 xsave_hdr->xstate_bv &= pcntxt_mask;
580 /*
581 * These bits must be zero.
582 */
583 xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
584
585 /*
586 * Init the state that is not present in the memory layout
587 * and enabled by the OS.
588 */
589 mask = ~(pcntxt_mask & ~mask);
590 xsave_hdr->xstate_bv &= mask;
591
592 return err;
593fx_only:
594 /*
595 * Couldn't find the extended state information in the memory
596 * layout. Restore the FP/SSE and init the other extended state
597 * enabled by the OS.
598 */
599 xsave_hdr->xstate_bv = XSTATE_FPSSE;
600 return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct));
601}
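
The bit juggling in restore_i387_xsave() first clamps xstate_bv to what the OS enables and then clears every enabled state the user buffer did not supply, so that state is re-initialized rather than kept stale. A standalone walk-through of the same mask arithmetic (XSTATE_YMM is a made-up stand-in for a later extended state; only XSTATE_FP and XSTATE_SSE match the real defines):

#include <assert.h>
#include <stdint.h>

#define XSTATE_FP   0x1
#define XSTATE_SSE  0x2
#define XSTATE_YMM  0x4    /* stand-in for a later extended state */

int main(void)
{
    uint64_t pcntxt_mask = XSTATE_FP | XSTATE_SSE | XSTATE_YMM; /* OS-enabled */
    uint64_t user_bv = XSTATE_FP | XSTATE_SSE;  /* present in the signal frame */
    uint64_t xstate_bv = pcntxt_mask;           /* task state before sigreturn */

    /* Same dance as above: first clamp to what the OS enables ... */
    xstate_bv &= pcntxt_mask;
    /* ... then clear every OS-enabled bit the user buffer did not carry,
     * so that state (YMM here) gets re-initialized instead of kept stale. */
    xstate_bv &= ~(pcntxt_mask & ~user_bv);

    assert(xstate_bv == (XSTATE_FP | XSTATE_SSE));
    return 0;
}
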
602
603int restore_i387_xstate_ia32(void __user *buf)
489{ 604{
490 int err; 605 int err;
491 struct task_struct *tsk = current; 606 struct task_struct *tsk = current;
607 struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
492 608
493 if (HAVE_HWFP) 609 if (HAVE_HWFP)
494 clear_fpu(tsk); 610 clear_fpu(tsk);
495 611
612 if (!buf) {
613 if (used_math()) {
614 clear_fpu(tsk);
615 clear_used_math();
616 }
617
618 return 0;
619 } else
620 if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size))
621 return -EACCES;
622
496 if (!used_math()) { 623 if (!used_math()) {
497 err = init_fpu(tsk); 624 err = init_fpu(tsk);
498 if (err) 625 if (err)
@@ -500,14 +627,17 @@ int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
500 } 627 }
501 628
502 if (HAVE_HWFP) { 629 if (HAVE_HWFP) {
503 if (cpu_has_fxsr) 630 if (cpu_has_xsave)
504 err = restore_i387_fxsave(buf); 631 err = restore_i387_xsave(buf);
632 else if (cpu_has_fxsr)
633 err = restore_i387_fxsave(fp, sizeof(struct
634 i387_fxsave_struct));
505 else 635 else
506 err = restore_i387_fsave(buf); 636 err = restore_i387_fsave(fp);
507 } else { 637 } else {
508 err = fpregs_soft_set(current, NULL, 638 err = fpregs_soft_set(current, NULL,
509 0, sizeof(struct user_i387_ia32_struct), 639 0, sizeof(struct user_i387_ia32_struct),
510 NULL, buf) != 0; 640 NULL, fp) != 0;
511 } 641 }
512 set_used_math(); 642 set_used_math();
513 643
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index dc92b49d9204..4b8a53d841f7 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -282,6 +282,30 @@ static int __init i8259A_init_sysfs(void)
282 282
283device_initcall(i8259A_init_sysfs); 283device_initcall(i8259A_init_sysfs);
284 284
285void mask_8259A(void)
286{
287 unsigned long flags;
288
289 spin_lock_irqsave(&i8259A_lock, flags);
290
291 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
292 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
293
294 spin_unlock_irqrestore(&i8259A_lock, flags);
295}
296
297void unmask_8259A(void)
298{
299 unsigned long flags;
300
301 spin_lock_irqsave(&i8259A_lock, flags);
302
303 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
304 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
305
306 spin_unlock_irqrestore(&i8259A_lock, flags);
307}
308
285void init_8259A(int auto_eoi) 309void init_8259A(int auto_eoi)
286{ 310{
287 unsigned long flags; 311 unsigned long flags;
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic.c
index 6510cde36b35..b764d7429c61 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic.c
@@ -27,16 +27,21 @@
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/pci.h> 28#include <linux/pci.h>
29#include <linux/mc146818rtc.h> 29#include <linux/mc146818rtc.h>
30#include <linux/compiler.h>
30#include <linux/acpi.h> 31#include <linux/acpi.h>
32#include <linux/module.h>
31#include <linux/sysdev.h> 33#include <linux/sysdev.h>
32#include <linux/msi.h> 34#include <linux/msi.h>
33#include <linux/htirq.h> 35#include <linux/htirq.h>
34#include <linux/dmar.h> 36#include <linux/freezer.h>
35#include <linux/jiffies.h> 37#include <linux/kthread.h>
38#include <linux/jiffies.h> /* time_after() */
36#ifdef CONFIG_ACPI 39#ifdef CONFIG_ACPI
37#include <acpi/acpi_bus.h> 40#include <acpi/acpi_bus.h>
38#endif 41#endif
39#include <linux/bootmem.h> 42#include <linux/bootmem.h>
43#include <linux/dmar.h>
44#include <linux/hpet.h>
40 45
41#include <asm/idle.h> 46#include <asm/idle.h>
42#include <asm/io.h> 47#include <asm/io.h>
@@ -45,62 +50,31 @@
45#include <asm/proto.h> 50#include <asm/proto.h>
46#include <asm/acpi.h> 51#include <asm/acpi.h>
47#include <asm/dma.h> 52#include <asm/dma.h>
53#include <asm/timer.h>
54#include <asm/i8259.h>
48#include <asm/nmi.h> 55#include <asm/nmi.h>
49#include <asm/msidef.h> 56#include <asm/msidef.h>
50#include <asm/hypertransport.h> 57#include <asm/hypertransport.h>
58#include <asm/setup.h>
59#include <asm/irq_remapping.h>
60#include <asm/hpet.h>
61#include <asm/uv/uv_hub.h>
62#include <asm/uv/uv_irq.h>
51 63
52#include <mach_ipi.h> 64#include <mach_ipi.h>
53#include <mach_apic.h> 65#include <mach_apic.h>
66#include <mach_apicdef.h>
54 67
55struct irq_cfg { 68#define __apicdebuginit(type) static type __init
56 cpumask_t domain;
57 cpumask_t old_domain;
58 unsigned move_cleanup_count;
59 u8 vector;
60 u8 move_in_progress : 1;
61};
62
63/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
64static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
65 [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
66 [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
67 [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
68 [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
69 [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
70 [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
71 [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
72 [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
73 [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
74 [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
75 [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
76 [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
77 [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
78 [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
79 [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
80 [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
81};
82
83static int assign_irq_vector(int irq, cpumask_t mask);
84 69
85int first_system_vector = 0xfe; 70/*
86 71 * Is the SiS APIC rmw bug present ?
87char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; 72 * -1 = don't know, 0 = no, 1 = yes
88 73 */
89#define __apicdebuginit __init 74int sis_apic_bug = -1;
90
91int sis_apic_bug; /* not actually supported, dummy for compile */
92
93static int no_timer_check;
94
95static int disable_timer_pin_1 __initdata;
96
97int timer_through_8259 __initdata;
98
99/* Where if anywhere is the i8259 connect in external int mode */
100static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
101 75
102static DEFINE_SPINLOCK(ioapic_lock); 76static DEFINE_SPINLOCK(ioapic_lock);
103DEFINE_SPINLOCK(vector_lock); 77static DEFINE_SPINLOCK(vector_lock);
104 78
105/* 79/*
106 * # of IRQ routing registers 80 * # of IRQ routing registers
@@ -117,11 +91,69 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
117/* # of MP IRQ source entries */ 91/* # of MP IRQ source entries */
118int mp_irq_entries; 92int mp_irq_entries;
119 93
94#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
95int mp_bus_id_to_type[MAX_MP_BUSSES];
96#endif
97
120DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); 98DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
121 99
100int skip_ioapic_setup;
101
102static int __init parse_noapic(char *str)
103{
104 /* disable IO-APIC */
105 disable_ioapic_setup();
106 return 0;
107}
108early_param("noapic", parse_noapic);
109
110struct irq_pin_list;
111struct irq_cfg {
112 unsigned int irq;
113 struct irq_pin_list *irq_2_pin;
114 cpumask_t domain;
115 cpumask_t old_domain;
116 unsigned move_cleanup_count;
117 u8 vector;
118 u8 move_in_progress : 1;
119};
120
121/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
122static struct irq_cfg irq_cfgx[NR_IRQS] = {
123 [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
124 [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
125 [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
126 [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
127 [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
128 [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
129 [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
130 [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
131 [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
132 [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
133 [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
134 [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
135 [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
136 [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
137 [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
138 [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
139};
140
141#define for_each_irq_cfg(irq, cfg) \
142 for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
143
144static struct irq_cfg *irq_cfg(unsigned int irq)
145{
146 return irq < nr_irqs ? irq_cfgx + irq : NULL;
147}
148
149static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
150{
151 return irq_cfg(irq);
152}
153
122/* 154/*
123 * Rough estimation of how many shared IRQs there are, can 155 * Rough estimation of how many shared IRQs there are, can be changed
124 * be changed anytime. 156 * anytime.
125 */ 157 */
126#define MAX_PLUS_SHARED_IRQS NR_IRQS 158#define MAX_PLUS_SHARED_IRQS NR_IRQS
127#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) 159#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
@@ -133,9 +165,36 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
133 * between pins and IRQs. 165 * between pins and IRQs.
134 */ 166 */
135 167
136static struct irq_pin_list { 168struct irq_pin_list {
137 short apic, pin, next; 169 int apic, pin;
138} irq_2_pin[PIN_MAP_SIZE]; 170 struct irq_pin_list *next;
171};
172
173static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
174static struct irq_pin_list *irq_2_pin_ptr;
175
176static void __init irq_2_pin_init(void)
177{
178 struct irq_pin_list *pin = irq_2_pin_head;
179 int i;
180
181 for (i = 1; i < PIN_MAP_SIZE; i++)
182 pin[i-1].next = &pin[i];
183
184 irq_2_pin_ptr = &pin[0];
185}
186
187static struct irq_pin_list *get_one_free_irq_2_pin(void)
188{
189 struct irq_pin_list *pin = irq_2_pin_ptr;
190
191 if (!pin)
192 panic("can not get more irq_2_pin\n");
193
194 irq_2_pin_ptr = pin->next;
195 pin->next = NULL;
196 return pin;
197}
139 198
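The two helpers above replace the old fixed irq_2_pin[] array with a tiny boot-time free list: the static pool is chained together once and get_one_free_irq_2_pin() simply pops the head. Below is a stand-alone sketch of the same pattern (an assumed example, not kernel code; the names and pool size are made up, and it returns NULL where the kernel panics).

#include <stdio.h>

#define POOL_SIZE 16

struct node {
	int apic, pin;
	struct node *next;
};

static struct node pool[POOL_SIZE];
static struct node *free_head;

static void pool_init(void)
{
	int i;

	/* chain the static array into a singly linked free list */
	for (i = 1; i < POOL_SIZE; i++)
		pool[i - 1].next = &pool[i];
	free_head = &pool[0];
}

static struct node *get_one(void)
{
	struct node *n = free_head;

	if (!n)
		return NULL;	/* the kernel panics here instead */
	free_head = n->next;
	n->next = NULL;
	return n;
}

int main(void)
{
	pool_init();
	printf("first node at %p\n", (void *)get_one());
	return 0;
}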
140struct io_apic { 199struct io_apic {
141 unsigned int index; 200 unsigned int index;
@@ -166,10 +225,15 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
166/* 225/*
167 * Re-write a value: to be used for read-modify-write 226 * Re-write a value: to be used for read-modify-write
168 * cycles where the read already set up the index register. 227 * cycles where the read already set up the index register.
228 *
229 * Older SiS APIC requires we rewrite the index register
169 */ 230 */
170static inline void io_apic_modify(unsigned int apic, unsigned int value) 231static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
171{ 232{
172 struct io_apic __iomem *io_apic = io_apic_base(apic); 233 struct io_apic __iomem *io_apic = io_apic_base(apic);
234
235 if (sis_apic_bug)
236 writel(reg, &io_apic->index);
173 writel(value, &io_apic->data); 237 writel(value, &io_apic->data);
174} 238}
175 239
@@ -177,16 +241,17 @@ static bool io_apic_level_ack_pending(unsigned int irq)
177{ 241{
178 struct irq_pin_list *entry; 242 struct irq_pin_list *entry;
179 unsigned long flags; 243 unsigned long flags;
244 struct irq_cfg *cfg = irq_cfg(irq);
180 245
181 spin_lock_irqsave(&ioapic_lock, flags); 246 spin_lock_irqsave(&ioapic_lock, flags);
182 entry = irq_2_pin + irq; 247 entry = cfg->irq_2_pin;
183 for (;;) { 248 for (;;) {
184 unsigned int reg; 249 unsigned int reg;
185 int pin; 250 int pin;
186 251
187 pin = entry->pin; 252 if (!entry)
188 if (pin == -1)
189 break; 253 break;
254 pin = entry->pin;
190 reg = io_apic_read(entry->apic, 0x10 + pin*2); 255 reg = io_apic_read(entry->apic, 0x10 + pin*2);
191 /* Is the remote IRR bit set? */ 256 /* Is the remote IRR bit set? */
192 if (reg & IO_APIC_REDIR_REMOTE_IRR) { 257 if (reg & IO_APIC_REDIR_REMOTE_IRR) {
@@ -195,45 +260,13 @@ static bool io_apic_level_ack_pending(unsigned int irq)
195 } 260 }
196 if (!entry->next) 261 if (!entry->next)
197 break; 262 break;
198 entry = irq_2_pin + entry->next; 263 entry = entry->next;
199 } 264 }
200 spin_unlock_irqrestore(&ioapic_lock, flags); 265 spin_unlock_irqrestore(&ioapic_lock, flags);
201 266
202 return false; 267 return false;
203} 268}
204 269
205/*
206 * Synchronize the IO-APIC and the CPU by doing
207 * a dummy read from the IO-APIC
208 */
209static inline void io_apic_sync(unsigned int apic)
210{
211 struct io_apic __iomem *io_apic = io_apic_base(apic);
212 readl(&io_apic->data);
213}
214
215#define __DO_ACTION(R, ACTION, FINAL) \
216 \
217{ \
218 int pin; \
219 struct irq_pin_list *entry = irq_2_pin + irq; \
220 \
221 BUG_ON(irq >= NR_IRQS); \
222 for (;;) { \
223 unsigned int reg; \
224 pin = entry->pin; \
225 if (pin == -1) \
226 break; \
227 reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
228 reg ACTION; \
229 io_apic_modify(entry->apic, reg); \
230 FINAL; \
231 if (!entry->next) \
232 break; \
233 entry = irq_2_pin + entry->next; \
234 } \
235}
236
237union entry_union { 270union entry_union {
238 struct { u32 w1, w2; }; 271 struct { u32 w1, w2; };
239 struct IO_APIC_route_entry entry; 272 struct IO_APIC_route_entry entry;
@@ -293,54 +326,71 @@ static void ioapic_mask_entry(int apic, int pin)
293static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) 326static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
294{ 327{
295 int apic, pin; 328 int apic, pin;
296 struct irq_pin_list *entry = irq_2_pin + irq; 329 struct irq_cfg *cfg;
330 struct irq_pin_list *entry;
297 331
298 BUG_ON(irq >= NR_IRQS); 332 cfg = irq_cfg(irq);
333 entry = cfg->irq_2_pin;
299 for (;;) { 334 for (;;) {
300 unsigned int reg; 335 unsigned int reg;
336
337 if (!entry)
338 break;
339
301 apic = entry->apic; 340 apic = entry->apic;
302 pin = entry->pin; 341 pin = entry->pin;
303 if (pin == -1) 342#ifdef CONFIG_INTR_REMAP
304 break; 343 /*
344 * With interrupt-remapping, destination information comes
345 * from interrupt-remapping table entry.
346 */
347 if (!irq_remapped(irq))
348 io_apic_write(apic, 0x11 + pin*2, dest);
349#else
305 io_apic_write(apic, 0x11 + pin*2, dest); 350 io_apic_write(apic, 0x11 + pin*2, dest);
351#endif
306 reg = io_apic_read(apic, 0x10 + pin*2); 352 reg = io_apic_read(apic, 0x10 + pin*2);
307 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 353 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
308 reg |= vector; 354 reg |= vector;
309 io_apic_modify(apic, reg); 355 io_apic_modify(apic, 0x10 + pin*2, reg);
310 if (!entry->next) 356 if (!entry->next)
311 break; 357 break;
312 entry = irq_2_pin + entry->next; 358 entry = entry->next;
313 } 359 }
314} 360}
315 361
362static int assign_irq_vector(int irq, cpumask_t mask);
363
316static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) 364static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
317{ 365{
318 struct irq_cfg *cfg = irq_cfg + irq; 366 struct irq_cfg *cfg;
319 unsigned long flags; 367 unsigned long flags;
320 unsigned int dest; 368 unsigned int dest;
321 cpumask_t tmp; 369 cpumask_t tmp;
370 struct irq_desc *desc;
322 371
323 cpus_and(tmp, mask, cpu_online_map); 372 cpus_and(tmp, mask, cpu_online_map);
324 if (cpus_empty(tmp)) 373 if (cpus_empty(tmp))
325 return; 374 return;
326 375
376 cfg = irq_cfg(irq);
327 if (assign_irq_vector(irq, mask)) 377 if (assign_irq_vector(irq, mask))
328 return; 378 return;
329 379
330 cpus_and(tmp, cfg->domain, mask); 380 cpus_and(tmp, cfg->domain, mask);
331 dest = cpu_mask_to_apicid(tmp); 381 dest = cpu_mask_to_apicid(tmp);
332
333 /* 382 /*
334 * Only the high 8 bits are valid. 383 * Only the high 8 bits are valid.
335 */ 384 */
336 dest = SET_APIC_LOGICAL_ID(dest); 385 dest = SET_APIC_LOGICAL_ID(dest);
337 386
387 desc = irq_to_desc(irq);
338 spin_lock_irqsave(&ioapic_lock, flags); 388 spin_lock_irqsave(&ioapic_lock, flags);
339 __target_IO_APIC_irq(irq, dest, cfg->vector); 389 __target_IO_APIC_irq(irq, dest, cfg->vector);
340 irq_desc[irq].affinity = mask; 390 desc->affinity = mask;
341 spin_unlock_irqrestore(&ioapic_lock, flags); 391 spin_unlock_irqrestore(&ioapic_lock, flags);
342} 392}
343#endif 393#endif /* CONFIG_SMP */
344 394
345/* 395/*
346 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are 396 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
@@ -349,19 +399,30 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
349 */ 399 */
350static void add_pin_to_irq(unsigned int irq, int apic, int pin) 400static void add_pin_to_irq(unsigned int irq, int apic, int pin)
351{ 401{
352 static int first_free_entry = NR_IRQS; 402 struct irq_cfg *cfg;
353 struct irq_pin_list *entry = irq_2_pin + irq; 403 struct irq_pin_list *entry;
404
405 /* first reference to this irq's irq_cfg, so allocate it */
406 cfg = irq_cfg_alloc(irq);
407 entry = cfg->irq_2_pin;
408 if (!entry) {
409 entry = get_one_free_irq_2_pin();
410 cfg->irq_2_pin = entry;
411 entry->apic = apic;
412 entry->pin = pin;
413 return;
414 }
354 415
355 BUG_ON(irq >= NR_IRQS); 416 while (entry->next) {
356 while (entry->next) 417 /* not again, please */
357 entry = irq_2_pin + entry->next; 418 if (entry->apic == apic && entry->pin == pin)
419 return;
358 420
359 if (entry->pin != -1) { 421 entry = entry->next;
360 entry->next = first_free_entry;
361 entry = irq_2_pin + entry->next;
362 if (++first_free_entry >= PIN_MAP_SIZE)
363 panic("io_apic.c: ran out of irq_2_pin entries!");
364 } 422 }
423
424 entry->next = get_one_free_irq_2_pin();
425 entry = entry->next;
365 entry->apic = apic; 426 entry->apic = apic;
366 entry->pin = pin; 427 entry->pin = pin;
367} 428}
@@ -373,30 +434,86 @@ static void __init replace_pin_at_irq(unsigned int irq,
373 int oldapic, int oldpin, 434 int oldapic, int oldpin,
374 int newapic, int newpin) 435 int newapic, int newpin)
375{ 436{
376 struct irq_pin_list *entry = irq_2_pin + irq; 437 struct irq_cfg *cfg = irq_cfg(irq);
438 struct irq_pin_list *entry = cfg->irq_2_pin;
439 int replaced = 0;
377 440
378 while (1) { 441 while (entry) {
379 if (entry->apic == oldapic && entry->pin == oldpin) { 442 if (entry->apic == oldapic && entry->pin == oldpin) {
380 entry->apic = newapic; 443 entry->apic = newapic;
381 entry->pin = newpin; 444 entry->pin = newpin;
382 } 445 replaced = 1;
383 if (!entry->next) 446 /* every one is different, right? */
384 break; 447 break;
385 entry = irq_2_pin + entry->next; 448 }
449 entry = entry->next;
386 } 450 }
451
452 /* why? call replace before add? */
453 if (!replaced)
454 add_pin_to_irq(irq, newapic, newpin);
387} 455}
388 456
457static inline void io_apic_modify_irq(unsigned int irq,
458 int mask_and, int mask_or,
459 void (*final)(struct irq_pin_list *entry))
460{
461 int pin;
462 struct irq_cfg *cfg;
463 struct irq_pin_list *entry;
389 464
390#define DO_ACTION(name,R,ACTION, FINAL) \ 465 cfg = irq_cfg(irq);
391 \ 466 for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
392 static void name##_IO_APIC_irq (unsigned int irq) \ 467 unsigned int reg;
393 __DO_ACTION(R, ACTION, FINAL) 468 pin = entry->pin;
469 reg = io_apic_read(entry->apic, 0x10 + pin * 2);
470 reg &= mask_and;
471 reg |= mask_or;
472 io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
473 if (final)
474 final(entry);
475 }
476}
394 477
395/* mask = 1 */ 478static void __unmask_IO_APIC_irq(unsigned int irq)
396DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) 479{
480 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
481}
397 482
398/* mask = 0 */ 483#ifdef CONFIG_X86_64
399DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) 484void io_apic_sync(struct irq_pin_list *entry)
485{
486 /*
487 * Synchronize the IO-APIC and the CPU by doing
488 * a dummy read from the IO-APIC
489 */
490 struct io_apic __iomem *io_apic;
491 io_apic = io_apic_base(entry->apic);
492 readl(&io_apic->data);
493}
494
495static void __mask_IO_APIC_irq(unsigned int irq)
496{
497 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
498}
499#else /* CONFIG_X86_32 */
500static void __mask_IO_APIC_irq(unsigned int irq)
501{
502 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
503}
504
505static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
506{
507 io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
508 IO_APIC_REDIR_MASKED, NULL);
509}
510
511static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
512{
513 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
514 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
515}
516#endif /* CONFIG_X86_32 */
400 517
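io_apic_modify_irq() above folds every mask/unmask variant into one read-modify-write helper parameterised by an AND mask, an OR mask and an optional final hook. The stand-alone sketch below shows the same idiom without the hook and without any hardware access (an assumed example; the bit values follow the usual redirection-entry layout but are hard-coded here).

#include <stdio.h>

#define REDIR_MASKED		(1u << 16)
#define REDIR_LEVEL_TRIGGER	(1u << 15)

static unsigned int rte;	/* stands in for one redirection entry */

static void modify_rte(unsigned int mask_and, unsigned int mask_or)
{
	rte = (rte & mask_and) | mask_or;
}

int main(void)
{
	rte = REDIR_LEVEL_TRIGGER;

	/* "mask and make edge", as the 32-bit helper does */
	modify_rte(~REDIR_LEVEL_TRIGGER, REDIR_MASKED);
	printf("%#x\n", rte);			/* -> 0x10000 */

	/* "unmask and make level" */
	modify_rte(~REDIR_MASKED, REDIR_LEVEL_TRIGGER);
	printf("%#x\n", rte);			/* -> 0x8000 */
	return 0;
}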
401static void mask_IO_APIC_irq (unsigned int irq) 518static void mask_IO_APIC_irq (unsigned int irq)
402{ 519{
@@ -439,24 +556,145 @@ static void clear_IO_APIC (void)
439 clear_IO_APIC_pin(apic, pin); 556 clear_IO_APIC_pin(apic, pin);
440} 557}
441 558
442int skip_ioapic_setup; 559#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32)
443int ioapic_force; 560void send_IPI_self(int vector)
561{
562 unsigned int cfg;
444 563
445static int __init parse_noapic(char *str) 564 /*
565 * Wait for idle.
566 */
567 apic_wait_icr_idle();
568 cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
569 /*
570 * Send the IPI. The write to APIC_ICR fires this off.
571 */
572 apic_write(APIC_ICR, cfg);
573}
574#endif /* !CONFIG_SMP && CONFIG_X86_32*/
575
576#ifdef CONFIG_X86_32
577/*
578 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
579 * specific CPU-side IRQs.
580 */
581
582#define MAX_PIRQS 8
583static int pirq_entries [MAX_PIRQS];
584static int pirqs_enabled;
585
586static int __init ioapic_pirq_setup(char *str)
446{ 587{
447 disable_ioapic_setup(); 588 int i, max;
589 int ints[MAX_PIRQS+1];
590
591 get_options(str, ARRAY_SIZE(ints), ints);
592
593 for (i = 0; i < MAX_PIRQS; i++)
594 pirq_entries[i] = -1;
595
596 pirqs_enabled = 1;
597 apic_printk(APIC_VERBOSE, KERN_INFO
598 "PIRQ redirection, working around broken MP-BIOS.\n");
599 max = MAX_PIRQS;
600 if (ints[0] < MAX_PIRQS)
601 max = ints[0];
602
603 for (i = 0; i < max; i++) {
604 apic_printk(APIC_VERBOSE, KERN_DEBUG
605 "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
606 /*
607 * PIRQs are mapped upside down, usually.
608 */
609 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
610 }
611 return 1;
612}
613
614__setup("pirq=", ioapic_pirq_setup);
615#endif /* CONFIG_X86_32 */
616
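As a usage illustration for the pirq= handling above (an assumed example, not taken from the patch or from any documentation quoted here): booting a 32-bit box with pirq=15,11,10 fills pirq_entries[] back to front, because the PIRQ lines are mapped "upside down". The sketch reproduces the resulting table with the same arithmetic that ioapic_pirq_setup() and pin_2_irq() use.

#include <stdio.h>

#define MAX_PIRQS 8

int main(void)
{
	int pirq_entries[MAX_PIRQS];
	int ints[] = { 3, 15, 11, 10 };	/* ints[0] = count, as get_options() fills it */
	int i;

	for (i = 0; i < MAX_PIRQS; i++)
		pirq_entries[i] = -1;	/* -1 means "leave this pin alone" */

	for (i = 0; i < ints[0]; i++)
		pirq_entries[MAX_PIRQS - i - 1] = ints[i + 1];

	for (i = 0; i < MAX_PIRQS; i++)
		printf("pirq_entries[%d] (IO-APIC pin %d) -> IRQ %d\n",
		       i, 16 + i, pirq_entries[i]);
	return 0;
}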
617#ifdef CONFIG_INTR_REMAP
618/* I/O APIC RTE contents at the OS boot up */
619static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
620
621/*
622 * Saves and masks all the unmasked IO-APIC RTE's
623 */
624int save_mask_IO_APIC_setup(void)
625{
626 union IO_APIC_reg_01 reg_01;
627 unsigned long flags;
628 int apic, pin;
629
630 /*
631 * The number of IO-APIC IRQ registers (== #pins):
632 */
633 for (apic = 0; apic < nr_ioapics; apic++) {
634 spin_lock_irqsave(&ioapic_lock, flags);
635 reg_01.raw = io_apic_read(apic, 1);
636 spin_unlock_irqrestore(&ioapic_lock, flags);
637 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
638 }
639
640 for (apic = 0; apic < nr_ioapics; apic++) {
641 early_ioapic_entries[apic] =
642 kzalloc(sizeof(struct IO_APIC_route_entry) *
643 nr_ioapic_registers[apic], GFP_KERNEL);
644 if (!early_ioapic_entries[apic])
645 goto nomem;
646 }
647
648 for (apic = 0; apic < nr_ioapics; apic++)
649 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
650 struct IO_APIC_route_entry entry;
651
652 entry = early_ioapic_entries[apic][pin] =
653 ioapic_read_entry(apic, pin);
654 if (!entry.mask) {
655 entry.mask = 1;
656 ioapic_write_entry(apic, pin, entry);
657 }
658 }
659
448 return 0; 660 return 0;
661
662nomem:
663 while (apic >= 0)
664 kfree(early_ioapic_entries[apic--]);
665 memset(early_ioapic_entries, 0,
666 sizeof(early_ioapic_entries));
667
668 return -ENOMEM;
449} 669}
450early_param("noapic", parse_noapic);
451 670
452/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ 671void restore_IO_APIC_setup(void)
453static int __init disable_timer_pin_setup(char *arg)
454{ 672{
455 disable_timer_pin_1 = 1; 673 int apic, pin;
456 return 1; 674
675 for (apic = 0; apic < nr_ioapics; apic++) {
676 if (!early_ioapic_entries[apic])
677 break;
678 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
679 ioapic_write_entry(apic, pin,
680 early_ioapic_entries[apic][pin]);
681 kfree(early_ioapic_entries[apic]);
682 early_ioapic_entries[apic] = NULL;
683 }
457} 684}
458__setup("disable_timer_pin_1", disable_timer_pin_setup);
459 685
686void reinit_intr_remapped_IO_APIC(int intr_remapping)
687{
688 /*
689 * for now plain restore of previous settings.
690 * TBD: In the case of OS enabling interrupt-remapping,
691 * IO-APIC RTE's need to be setup to point to interrupt-remapping
692 * table entries. for now, do a plain restore, and wait for
693 * the setup_IO_APIC_irqs() to do proper initialization.
694 */
695 restore_IO_APIC_setup();
696}
697#endif
460 698
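save_mask_IO_APIC_setup() above uses the common allocate-everything-or-unwind shape: allocate one buffer per IO-APIC, and on the first failure walk back down freeing whatever was already obtained. A stand-alone sketch of the pattern follows (assumed example; N, the element type and calloc() stand in for the per-IO-APIC kzalloc() calls).

#include <stdlib.h>
#include <string.h>

#define N 4

static int *slots[N];

static int alloc_all(size_t per_slot)
{
	int i;

	for (i = 0; i < N; i++) {
		slots[i] = calloc(per_slot, sizeof(int));
		if (!slots[i])
			goto nomem;
	}
	return 0;

nomem:
	while (i >= 0)
		free(slots[i--]);	/* free(NULL) is harmless, like kfree() */
	memset(slots, 0, sizeof(slots));
	return -1;
}

int main(void)
{
	return alloc_all(16) ? 1 : 0;
}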
461/* 699/*
462 * Find the IRQ entry number of a certain pin. 700 * Find the IRQ entry number of a certain pin.
@@ -560,22 +798,54 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
560 best_guess = irq; 798 best_guess = irq;
561 } 799 }
562 } 800 }
563 BUG_ON(best_guess >= NR_IRQS);
564 return best_guess; 801 return best_guess;
565} 802}
566 803
804EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
805
806#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
807/*
808 * EISA Edge/Level control register, ELCR
809 */
810static int EISA_ELCR(unsigned int irq)
811{
812 if (irq < 16) {
813 unsigned int port = 0x4d0 + (irq >> 3);
814 return (inb(port) >> (irq & 7)) & 1;
815 }
816 apic_printk(APIC_VERBOSE, KERN_INFO
817 "Broken MPtable reports ISA irq %d\n", irq);
818 return 0;
819}
820
821#endif
822
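For the ELCR lookup above: a given ISA/EISA IRQ's trigger bit lives in one of two ports, 0x4d0 covering IRQs 0-7 and 0x4d1 covering IRQs 8-15, one bit per IRQ. The table below is produced with the same shift arithmetic as EISA_ELCR() (a stand-alone sketch, assumed example; it only prints the mapping and performs no port I/O).

#include <stdio.h>

int main(void)
{
	unsigned int irq;

	for (irq = 0; irq < 16; irq++)
		printf("IRQ %2u -> ELCR port 0x%x, bit %u\n",
		       irq, 0x4d0 + (irq >> 3), irq & 7);
	return 0;
}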
567/* ISA interrupts are always polarity zero edge triggered, 823/* ISA interrupts are always polarity zero edge triggered,
568 * when listed as conforming in the MP table. */ 824 * when listed as conforming in the MP table. */
569 825
570#define default_ISA_trigger(idx) (0) 826#define default_ISA_trigger(idx) (0)
571#define default_ISA_polarity(idx) (0) 827#define default_ISA_polarity(idx) (0)
572 828
829/* EISA interrupts are always polarity zero and can be edge or level
830 * trigger depending on the ELCR value. If an interrupt is listed as
831 * EISA conforming in the MP table, that means its trigger type must
832 * be read in from the ELCR */
833
834#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
835#define default_EISA_polarity(idx) default_ISA_polarity(idx)
836
573/* PCI interrupts are always polarity one level triggered, 837/* PCI interrupts are always polarity one level triggered,
574 * when listed as conforming in the MP table. */ 838 * when listed as conforming in the MP table. */
575 839
576#define default_PCI_trigger(idx) (1) 840#define default_PCI_trigger(idx) (1)
577#define default_PCI_polarity(idx) (1) 841#define default_PCI_polarity(idx) (1)
578 842
843/* MCA interrupts are always polarity zero level triggered,
844 * when listed as conforming in the MP table. */
845
846#define default_MCA_trigger(idx) (1)
847#define default_MCA_polarity(idx) default_ISA_polarity(idx)
848
579static int MPBIOS_polarity(int idx) 849static int MPBIOS_polarity(int idx)
580{ 850{
581 int bus = mp_irqs[idx].mp_srcbus; 851 int bus = mp_irqs[idx].mp_srcbus;
@@ -633,6 +903,36 @@ static int MPBIOS_trigger(int idx)
633 trigger = default_ISA_trigger(idx); 903 trigger = default_ISA_trigger(idx);
634 else 904 else
635 trigger = default_PCI_trigger(idx); 905 trigger = default_PCI_trigger(idx);
906#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
907 switch (mp_bus_id_to_type[bus]) {
908 case MP_BUS_ISA: /* ISA pin */
909 {
910 /* set before the switch */
911 break;
912 }
913 case MP_BUS_EISA: /* EISA pin */
914 {
915 trigger = default_EISA_trigger(idx);
916 break;
917 }
918 case MP_BUS_PCI: /* PCI pin */
919 {
920 /* set before the switch */
921 break;
922 }
923 case MP_BUS_MCA: /* MCA pin */
924 {
925 trigger = default_MCA_trigger(idx);
926 break;
927 }
928 default:
929 {
930 printk(KERN_WARNING "broken BIOS!!\n");
931 trigger = 1;
932 break;
933 }
934 }
935#endif
636 break; 936 break;
637 case 1: /* edge */ 937 case 1: /* edge */
638 { 938 {
@@ -670,6 +970,7 @@ static inline int irq_trigger(int idx)
670 return MPBIOS_trigger(idx); 970 return MPBIOS_trigger(idx);
671} 971}
672 972
973int (*ioapic_renumber_irq)(int ioapic, int irq);
673static int pin_2_irq(int idx, int apic, int pin) 974static int pin_2_irq(int idx, int apic, int pin)
674{ 975{
675 int irq, i; 976 int irq, i;
@@ -691,11 +992,48 @@ static int pin_2_irq(int idx, int apic, int pin)
691 while (i < apic) 992 while (i < apic)
692 irq += nr_ioapic_registers[i++]; 993 irq += nr_ioapic_registers[i++];
693 irq += pin; 994 irq += pin;
995 /*
996 * For MPS mode, so far only needed by ES7000 platform
997 */
998 if (ioapic_renumber_irq)
999 irq = ioapic_renumber_irq(apic, irq);
694 } 1000 }
695 BUG_ON(irq >= NR_IRQS); 1001
1002#ifdef CONFIG_X86_32
1003 /*
1004 * PCI IRQ command line redirection. Yes, limits are hardcoded.
1005 */
1006 if ((pin >= 16) && (pin <= 23)) {
1007 if (pirq_entries[pin-16] != -1) {
1008 if (!pirq_entries[pin-16]) {
1009 apic_printk(APIC_VERBOSE, KERN_DEBUG
1010 "disabling PIRQ%d\n", pin-16);
1011 } else {
1012 irq = pirq_entries[pin-16];
1013 apic_printk(APIC_VERBOSE, KERN_DEBUG
1014 "using PIRQ%d -> IRQ %d\n",
1015 pin-16, irq);
1016 }
1017 }
1018 }
1019#endif
1020
696 return irq; 1021 return irq;
697} 1022}
698 1023
1024void lock_vector_lock(void)
1025{
1026 /* Used to ensure the online set of cpus does not change
1027 * during assign_irq_vector.
1028 */
1029 spin_lock(&vector_lock);
1030}
1031
1032void unlock_vector_lock(void)
1033{
1034 spin_unlock(&vector_lock);
1035}
1036
699static int __assign_irq_vector(int irq, cpumask_t mask) 1037static int __assign_irq_vector(int irq, cpumask_t mask)
700{ 1038{
701 /* 1039 /*
@@ -714,8 +1052,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
714 int cpu; 1052 int cpu;
715 struct irq_cfg *cfg; 1053 struct irq_cfg *cfg;
716 1054
717 BUG_ON((unsigned)irq >= NR_IRQS); 1055 cfg = irq_cfg(irq);
718 cfg = &irq_cfg[irq];
719 1056
720 /* Only try and allocate irqs on cpus that are present */ 1057 /* Only try and allocate irqs on cpus that are present */
721 cpus_and(mask, mask, cpu_online_map); 1058 cpus_and(mask, mask, cpu_online_map);
@@ -731,7 +1068,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
731 return 0; 1068 return 0;
732 } 1069 }
733 1070
734 for_each_cpu_mask(cpu, mask) { 1071 for_each_cpu_mask_nr(cpu, mask) {
735 cpumask_t domain, new_mask; 1072 cpumask_t domain, new_mask;
736 int new_cpu; 1073 int new_cpu;
737 int vector, offset; 1074 int vector, offset;
@@ -750,9 +1087,14 @@ next:
750 } 1087 }
751 if (unlikely(current_vector == vector)) 1088 if (unlikely(current_vector == vector))
752 continue; 1089 continue;
1090#ifdef CONFIG_X86_64
753 if (vector == IA32_SYSCALL_VECTOR) 1091 if (vector == IA32_SYSCALL_VECTOR)
754 goto next; 1092 goto next;
755 for_each_cpu_mask(new_cpu, new_mask) 1093#else
1094 if (vector == SYSCALL_VECTOR)
1095 goto next;
1096#endif
1097 for_each_cpu_mask_nr(new_cpu, new_mask)
756 if (per_cpu(vector_irq, new_cpu)[vector] != -1) 1098 if (per_cpu(vector_irq, new_cpu)[vector] != -1)
757 goto next; 1099 goto next;
758 /* Found one! */ 1100 /* Found one! */
@@ -762,7 +1104,7 @@ next:
762 cfg->move_in_progress = 1; 1104 cfg->move_in_progress = 1;
763 cfg->old_domain = cfg->domain; 1105 cfg->old_domain = cfg->domain;
764 } 1106 }
765 for_each_cpu_mask(new_cpu, new_mask) 1107 for_each_cpu_mask_nr(new_cpu, new_mask)
766 per_cpu(vector_irq, new_cpu)[vector] = irq; 1108 per_cpu(vector_irq, new_cpu)[vector] = irq;
767 cfg->vector = vector; 1109 cfg->vector = vector;
768 cfg->domain = domain; 1110 cfg->domain = domain;
@@ -788,30 +1130,30 @@ static void __clear_irq_vector(int irq)
788 cpumask_t mask; 1130 cpumask_t mask;
789 int cpu, vector; 1131 int cpu, vector;
790 1132
791 BUG_ON((unsigned)irq >= NR_IRQS); 1133 cfg = irq_cfg(irq);
792 cfg = &irq_cfg[irq];
793 BUG_ON(!cfg->vector); 1134 BUG_ON(!cfg->vector);
794 1135
795 vector = cfg->vector; 1136 vector = cfg->vector;
796 cpus_and(mask, cfg->domain, cpu_online_map); 1137 cpus_and(mask, cfg->domain, cpu_online_map);
797 for_each_cpu_mask(cpu, mask) 1138 for_each_cpu_mask_nr(cpu, mask)
798 per_cpu(vector_irq, cpu)[vector] = -1; 1139 per_cpu(vector_irq, cpu)[vector] = -1;
799 1140
800 cfg->vector = 0; 1141 cfg->vector = 0;
801 cpus_clear(cfg->domain); 1142 cpus_clear(cfg->domain);
802} 1143}
803 1144
804static void __setup_vector_irq(int cpu) 1145void __setup_vector_irq(int cpu)
805{ 1146{
806 /* Initialize vector_irq on a new cpu */ 1147 /* Initialize vector_irq on a new cpu */
807 /* This function must be called with vector_lock held */ 1148 /* This function must be called with vector_lock held */
808 int irq, vector; 1149 int irq, vector;
1150 struct irq_cfg *cfg;
809 1151
810 /* Mark the inuse vectors */ 1152 /* Mark the inuse vectors */
811 for (irq = 0; irq < NR_IRQS; ++irq) { 1153 for_each_irq_cfg(irq, cfg) {
812 if (!cpu_isset(cpu, irq_cfg[irq].domain)) 1154 if (!cpu_isset(cpu, cfg->domain))
813 continue; 1155 continue;
814 vector = irq_cfg[irq].vector; 1156 vector = cfg->vector;
815 per_cpu(vector_irq, cpu)[vector] = irq; 1157 per_cpu(vector_irq, cpu)[vector] = irq;
816 } 1158 }
817 /* Mark the free vectors */ 1159 /* Mark the free vectors */
@@ -819,44 +1161,154 @@ static void __setup_vector_irq(int cpu)
819 irq = per_cpu(vector_irq, cpu)[vector]; 1161 irq = per_cpu(vector_irq, cpu)[vector];
820 if (irq < 0) 1162 if (irq < 0)
821 continue; 1163 continue;
822 if (!cpu_isset(cpu, irq_cfg[irq].domain)) 1164
1165 cfg = irq_cfg(irq);
1166 if (!cpu_isset(cpu, cfg->domain))
823 per_cpu(vector_irq, cpu)[vector] = -1; 1167 per_cpu(vector_irq, cpu)[vector] = -1;
824 } 1168 }
825} 1169}
826 1170
827void setup_vector_irq(int cpu) 1171static struct irq_chip ioapic_chip;
828{ 1172#ifdef CONFIG_INTR_REMAP
829 spin_lock(&vector_lock); 1173static struct irq_chip ir_ioapic_chip;
830 __setup_vector_irq(smp_processor_id()); 1174#endif
831 spin_unlock(&vector_lock);
832}
833 1175
1176#define IOAPIC_AUTO -1
1177#define IOAPIC_EDGE 0
1178#define IOAPIC_LEVEL 1
834 1179
835static struct irq_chip ioapic_chip; 1180#ifdef CONFIG_X86_32
1181static inline int IO_APIC_irq_trigger(int irq)
1182{
1183 int apic, idx, pin;
1184
1185 for (apic = 0; apic < nr_ioapics; apic++) {
1186 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1187 idx = find_irq_entry(apic, pin, mp_INT);
1188 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
1189 return irq_trigger(idx);
1190 }
1191 }
1192 /*
1193 * nonexistent IRQs are edge default
1194 */
1195 return 0;
1196}
1197#else
1198static inline int IO_APIC_irq_trigger(int irq)
1199{
1200 return 1;
1201}
1202#endif
836 1203
837static void ioapic_register_intr(int irq, unsigned long trigger) 1204static void ioapic_register_intr(int irq, unsigned long trigger)
838{ 1205{
839 if (trigger) { 1206 struct irq_desc *desc;
840 irq_desc[irq].status |= IRQ_LEVEL; 1207
1208 desc = irq_to_desc(irq);
1209
1210 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1211 trigger == IOAPIC_LEVEL)
1212 desc->status |= IRQ_LEVEL;
1213 else
1214 desc->status &= ~IRQ_LEVEL;
1215
1216#ifdef CONFIG_INTR_REMAP
1217 if (irq_remapped(irq)) {
1218 desc->status |= IRQ_MOVE_PCNTXT;
1219 if (trigger)
1220 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
1221 handle_fasteoi_irq,
1222 "fasteoi");
1223 else
1224 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
1225 handle_edge_irq, "edge");
1226 return;
1227 }
1228#endif
1229 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1230 trigger == IOAPIC_LEVEL)
841 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1231 set_irq_chip_and_handler_name(irq, &ioapic_chip,
842 handle_fasteoi_irq, "fasteoi"); 1232 handle_fasteoi_irq,
843 } else { 1233 "fasteoi");
844 irq_desc[irq].status &= ~IRQ_LEVEL; 1234 else
845 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1235 set_irq_chip_and_handler_name(irq, &ioapic_chip,
846 handle_edge_irq, "edge"); 1236 handle_edge_irq, "edge");
1237}
1238
1239static int setup_ioapic_entry(int apic, int irq,
1240 struct IO_APIC_route_entry *entry,
1241 unsigned int destination, int trigger,
1242 int polarity, int vector)
1243{
1244 /*
1245 * add it to the IO-APIC irq-routing table:
1246 */
1247 memset(entry, 0, sizeof(*entry));
1248
1249#ifdef CONFIG_INTR_REMAP
1250 if (intr_remapping_enabled) {
1251 struct intel_iommu *iommu = map_ioapic_to_ir(apic);
1252 struct irte irte;
1253 struct IR_IO_APIC_route_entry *ir_entry =
1254 (struct IR_IO_APIC_route_entry *) entry;
1255 int index;
1256
1257 if (!iommu)
1258 panic("No mapping iommu for ioapic %d\n", apic);
1259
1260 index = alloc_irte(iommu, irq, 1);
1261 if (index < 0)
1262 panic("Failed to allocate IRTE for ioapic %d\n", apic);
1263
1264 memset(&irte, 0, sizeof(irte));
1265
1266 irte.present = 1;
1267 irte.dst_mode = INT_DEST_MODE;
1268 irte.trigger_mode = trigger;
1269 irte.dlvry_mode = INT_DELIVERY_MODE;
1270 irte.vector = vector;
1271 irte.dest_id = IRTE_DEST(destination);
1272
1273 modify_irte(irq, &irte);
1274
1275 ir_entry->index2 = (index >> 15) & 0x1;
1276 ir_entry->zero = 0;
1277 ir_entry->format = 1;
1278 ir_entry->index = (index & 0x7fff);
1279 } else
1280#endif
1281 {
1282 entry->delivery_mode = INT_DELIVERY_MODE;
1283 entry->dest_mode = INT_DEST_MODE;
1284 entry->dest = destination;
847 } 1285 }
1286
1287 entry->mask = 0; /* enable IRQ */
1288 entry->trigger = trigger;
1289 entry->polarity = polarity;
1290 entry->vector = vector;
1291
1292 /* Mask level triggered irqs.
1293 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
1294 */
1295 if (trigger)
1296 entry->mask = 1;
1297 return 0;
848} 1298}
849 1299
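setup_ioapic_entry() above fills the redirection entry through a bitfield struct; the stand-alone sketch below packs the same fields by hand so the layout is visible. The field offsets follow the conventional IO-APIC redirection-entry layout and are an assumption here, not quoted from the patch; the read-only bits (delivery status, remote IRR) are left clear.

#include <stdint.h>
#include <stdio.h>

static uint64_t pack_rte(uint8_t vector, unsigned delivery_mode,
			 unsigned dest_mode, unsigned polarity,
			 unsigned trigger, unsigned mask, uint8_t dest)
{
	uint64_t e = 0;

	e |= vector;					/* bits 0-7   */
	e |= (uint64_t)(delivery_mode & 7) << 8;	/* bits 8-10  */
	e |= (uint64_t)(dest_mode & 1) << 11;		/* bit 11     */
	e |= (uint64_t)(polarity & 1) << 13;		/* bit 13     */
	e |= (uint64_t)(trigger & 1) << 15;		/* bit 15     */
	e |= (uint64_t)(mask & 1) << 16;		/* bit 16     */
	e |= (uint64_t)dest << 56;			/* bits 56-63 */
	return e;
}

int main(void)
{
	/* level triggered, masked until the handler is set up */
	printf("%#llx\n", (unsigned long long)
	       pack_rte(0x31, 0 /* fixed */, 1 /* logical */, 0, 1, 1, 0x0f));
	return 0;
}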
850static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, 1300static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
851 int trigger, int polarity) 1301 int trigger, int polarity)
852{ 1302{
853 struct irq_cfg *cfg = irq_cfg + irq; 1303 struct irq_cfg *cfg;
854 struct IO_APIC_route_entry entry; 1304 struct IO_APIC_route_entry entry;
855 cpumask_t mask; 1305 cpumask_t mask;
856 1306
857 if (!IO_APIC_IRQ(irq)) 1307 if (!IO_APIC_IRQ(irq))
858 return; 1308 return;
859 1309
1310 cfg = irq_cfg(irq);
1311
860 mask = TARGET_CPUS; 1312 mask = TARGET_CPUS;
861 if (assign_irq_vector(irq, mask)) 1313 if (assign_irq_vector(irq, mask))
862 return; 1314 return;
@@ -869,24 +1321,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
869 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, 1321 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
870 irq, trigger, polarity); 1322 irq, trigger, polarity);
871 1323
872 /*
873 * add it to the IO-APIC irq-routing table:
874 */
875 memset(&entry,0,sizeof(entry));
876 1324
877 entry.delivery_mode = INT_DELIVERY_MODE; 1325 if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
878 entry.dest_mode = INT_DEST_MODE; 1326 cpu_mask_to_apicid(mask), trigger, polarity,
879 entry.dest = cpu_mask_to_apicid(mask); 1327 cfg->vector)) {
880 entry.mask = 0; /* enable IRQ */ 1328 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
881 entry.trigger = trigger; 1329 mp_ioapics[apic].mp_apicid, pin);
882 entry.polarity = polarity; 1330 __clear_irq_vector(irq);
883 entry.vector = cfg->vector; 1331 return;
884 1332 }
885 /* Mask level triggered irqs.
886 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
887 */
888 if (trigger)
889 entry.mask = 1;
890 1333
891 ioapic_register_intr(irq, trigger); 1334 ioapic_register_intr(irq, trigger);
892 if (irq < 16) 1335 if (irq < 16)
@@ -897,37 +1340,49 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
897 1340
898static void __init setup_IO_APIC_irqs(void) 1341static void __init setup_IO_APIC_irqs(void)
899{ 1342{
900 int apic, pin, idx, irq, first_notcon = 1; 1343 int apic, pin, idx, irq;
1344 int notcon = 0;
901 1345
902 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1346 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
903 1347
904 for (apic = 0; apic < nr_ioapics; apic++) { 1348 for (apic = 0; apic < nr_ioapics; apic++) {
905 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1349 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
906 1350
907 idx = find_irq_entry(apic,pin,mp_INT); 1351 idx = find_irq_entry(apic, pin, mp_INT);
908 if (idx == -1) { 1352 if (idx == -1) {
909 if (first_notcon) { 1353 if (!notcon) {
910 apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); 1354 notcon = 1;
911 first_notcon = 0; 1355 apic_printk(APIC_VERBOSE,
912 } else 1356 KERN_DEBUG " %d-%d",
913 apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); 1357 mp_ioapics[apic].mp_apicid,
914 continue; 1358 pin);
915 } 1359 } else
916 if (!first_notcon) { 1360 apic_printk(APIC_VERBOSE, " %d-%d",
917 apic_printk(APIC_VERBOSE, " not connected.\n"); 1361 mp_ioapics[apic].mp_apicid,
918 first_notcon = 1; 1362 pin);
919 } 1363 continue;
1364 }
1365 if (notcon) {
1366 apic_printk(APIC_VERBOSE,
1367 " (apicid-pin) not connected\n");
1368 notcon = 0;
1369 }
920 1370
921 irq = pin_2_irq(idx, apic, pin); 1371 irq = pin_2_irq(idx, apic, pin);
922 add_pin_to_irq(irq, apic, pin); 1372#ifdef CONFIG_X86_32
1373 if (multi_timer_check(apic, irq))
1374 continue;
1375#endif
1376 add_pin_to_irq(irq, apic, pin);
923 1377
924 setup_IO_APIC_irq(apic, pin, irq, 1378 setup_IO_APIC_irq(apic, pin, irq,
925 irq_trigger(idx), irq_polarity(idx)); 1379 irq_trigger(idx), irq_polarity(idx));
926 } 1380 }
927 } 1381 }
928 1382
929 if (!first_notcon) 1383 if (notcon)
930 apic_printk(APIC_VERBOSE, " not connected.\n"); 1384 apic_printk(APIC_VERBOSE,
1385 " (apicid-pin) not connected\n");
931} 1386}
932 1387
933/* 1388/*
@@ -938,6 +1393,11 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
938{ 1393{
939 struct IO_APIC_route_entry entry; 1394 struct IO_APIC_route_entry entry;
940 1395
1396#ifdef CONFIG_INTR_REMAP
1397 if (intr_remapping_enabled)
1398 return;
1399#endif
1400
941 memset(&entry, 0, sizeof(entry)); 1401 memset(&entry, 0, sizeof(entry));
942 1402
943 /* 1403 /*
@@ -964,13 +1424,17 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
964 ioapic_write_entry(apic, pin, entry); 1424 ioapic_write_entry(apic, pin, entry);
965} 1425}
966 1426
967void __apicdebuginit print_IO_APIC(void) 1427
1428__apicdebuginit(void) print_IO_APIC(void)
968{ 1429{
969 int apic, i; 1430 int apic, i;
970 union IO_APIC_reg_00 reg_00; 1431 union IO_APIC_reg_00 reg_00;
971 union IO_APIC_reg_01 reg_01; 1432 union IO_APIC_reg_01 reg_01;
972 union IO_APIC_reg_02 reg_02; 1433 union IO_APIC_reg_02 reg_02;
1434 union IO_APIC_reg_03 reg_03;
973 unsigned long flags; 1435 unsigned long flags;
1436 struct irq_cfg *cfg;
1437 unsigned int irq;
974 1438
975 if (apic_verbosity == APIC_QUIET) 1439 if (apic_verbosity == APIC_QUIET)
976 return; 1440 return;
@@ -993,12 +1457,16 @@ void __apicdebuginit print_IO_APIC(void)
993 reg_01.raw = io_apic_read(apic, 1); 1457 reg_01.raw = io_apic_read(apic, 1);
994 if (reg_01.bits.version >= 0x10) 1458 if (reg_01.bits.version >= 0x10)
995 reg_02.raw = io_apic_read(apic, 2); 1459 reg_02.raw = io_apic_read(apic, 2);
1460 if (reg_01.bits.version >= 0x20)
1461 reg_03.raw = io_apic_read(apic, 3);
996 spin_unlock_irqrestore(&ioapic_lock, flags); 1462 spin_unlock_irqrestore(&ioapic_lock, flags);
997 1463
998 printk("\n"); 1464 printk("\n");
999 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); 1465 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
1000 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1466 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1001 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1467 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1468 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1469 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1002 1470
1003 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01); 1471 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
1004 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); 1472 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
@@ -1006,11 +1474,27 @@ void __apicdebuginit print_IO_APIC(void)
1006 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); 1474 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1007 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); 1475 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1008 1476
1009 if (reg_01.bits.version >= 0x10) { 1477 /*
1478 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
1479 * but the value of reg_02 is read as the previous read register
1480 * value, so ignore it if reg_02 == reg_01.
1481 */
1482 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1010 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); 1483 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1011 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); 1484 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1012 } 1485 }
1013 1486
1487 /*
1488 * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
1489 * or reg_03, but the value of reg_0[23] is read as the previous read
1490 * register value, so ignore it if reg_03 == reg_0[12].
1491 */
1492 if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
1493 reg_03.raw != reg_01.raw) {
1494 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1495 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1496 }
1497
1014 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1498 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1015 1499
1016 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" 1500 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
@@ -1039,16 +1523,16 @@ void __apicdebuginit print_IO_APIC(void)
1039 } 1523 }
1040 } 1524 }
1041 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1525 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1042 for (i = 0; i < NR_IRQS; i++) { 1526 for_each_irq_cfg(irq, cfg) {
1043 struct irq_pin_list *entry = irq_2_pin + i; 1527 struct irq_pin_list *entry = cfg->irq_2_pin;
1044 if (entry->pin < 0) 1528 if (!entry)
1045 continue; 1529 continue;
1046 printk(KERN_DEBUG "IRQ%d ", i); 1530 printk(KERN_DEBUG "IRQ%d ", irq);
1047 for (;;) { 1531 for (;;) {
1048 printk("-> %d:%d", entry->apic, entry->pin); 1532 printk("-> %d:%d", entry->apic, entry->pin);
1049 if (!entry->next) 1533 if (!entry->next)
1050 break; 1534 break;
1051 entry = irq_2_pin + entry->next; 1535 entry = entry->next;
1052 } 1536 }
1053 printk("\n"); 1537 printk("\n");
1054 } 1538 }
@@ -1058,9 +1542,7 @@ void __apicdebuginit print_IO_APIC(void)
1058 return; 1542 return;
1059} 1543}
1060 1544
1061#if 0 1545__apicdebuginit(void) print_APIC_bitfield(int base)
1062
1063static __apicdebuginit void print_APIC_bitfield (int base)
1064{ 1546{
1065 unsigned int v; 1547 unsigned int v;
1066 int i, j; 1548 int i, j;
@@ -1081,9 +1563,10 @@ static __apicdebuginit void print_APIC_bitfield (int base)
1081 } 1563 }
1082} 1564}
1083 1565
1084void __apicdebuginit print_local_APIC(void * dummy) 1566__apicdebuginit(void) print_local_APIC(void *dummy)
1085{ 1567{
1086 unsigned int v, ver, maxlvt; 1568 unsigned int v, ver, maxlvt;
1569 u64 icr;
1087 1570
1088 if (apic_verbosity == APIC_QUIET) 1571 if (apic_verbosity == APIC_QUIET)
1089 return; 1572 return;
@@ -1091,7 +1574,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
1091 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1574 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1092 smp_processor_id(), hard_smp_processor_id()); 1575 smp_processor_id(), hard_smp_processor_id());
1093 v = apic_read(APIC_ID); 1576 v = apic_read(APIC_ID);
1094 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id())); 1577 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
1095 v = apic_read(APIC_LVR); 1578 v = apic_read(APIC_LVR);
1096 printk(KERN_INFO "... APIC VERSION: %08x\n", v); 1579 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1097 ver = GET_APIC_VERSION(v); 1580 ver = GET_APIC_VERSION(v);
@@ -1100,20 +1583,31 @@ void __apicdebuginit print_local_APIC(void * dummy)
1100 v = apic_read(APIC_TASKPRI); 1583 v = apic_read(APIC_TASKPRI);
1101 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); 1584 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
1102 1585
1103 v = apic_read(APIC_ARBPRI); 1586 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1104 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, 1587 if (!APIC_XAPIC(ver)) {
1105 v & APIC_ARBPRI_MASK); 1588 v = apic_read(APIC_ARBPRI);
1106 v = apic_read(APIC_PROCPRI); 1589 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
1107 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); 1590 v & APIC_ARBPRI_MASK);
1591 }
1592 v = apic_read(APIC_PROCPRI);
1593 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
1594 }
1595
1596 /*
1597 * Remote read supported only in the 82489DX and local APIC for
1598 * Pentium processors.
1599 */
1600 if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
1601 v = apic_read(APIC_RRR);
1602 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1603 }
1108 1604
1109 v = apic_read(APIC_EOI);
1110 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1111 v = apic_read(APIC_RRR);
1112 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1113 v = apic_read(APIC_LDR); 1605 v = apic_read(APIC_LDR);
1114 printk(KERN_DEBUG "... APIC LDR: %08x\n", v); 1606 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1115 v = apic_read(APIC_DFR); 1607 if (!x2apic_enabled()) {
1116 printk(KERN_DEBUG "... APIC DFR: %08x\n", v); 1608 v = apic_read(APIC_DFR);
1609 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1610 }
1117 v = apic_read(APIC_SPIV); 1611 v = apic_read(APIC_SPIV);
1118 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); 1612 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1119 1613
@@ -1124,13 +1618,17 @@ void __apicdebuginit print_local_APIC(void * dummy)
1124 printk(KERN_DEBUG "... APIC IRR field:\n"); 1618 printk(KERN_DEBUG "... APIC IRR field:\n");
1125 print_APIC_bitfield(APIC_IRR); 1619 print_APIC_bitfield(APIC_IRR);
1126 1620
1127 v = apic_read(APIC_ESR); 1621 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1128 printk(KERN_DEBUG "... APIC ESR: %08x\n", v); 1622 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1623 apic_write(APIC_ESR, 0);
1129 1624
1130 v = apic_read(APIC_ICR); 1625 v = apic_read(APIC_ESR);
1131 printk(KERN_DEBUG "... APIC ICR: %08x\n", v); 1626 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1132 v = apic_read(APIC_ICR2); 1627 }
1133 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); 1628
1629 icr = apic_icr_read();
1630 printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
1631 printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
1134 1632
1135 v = apic_read(APIC_LVTT); 1633 v = apic_read(APIC_LVTT);
1136 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); 1634 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
@@ -1158,12 +1656,17 @@ void __apicdebuginit print_local_APIC(void * dummy)
1158 printk("\n"); 1656 printk("\n");
1159} 1657}
1160 1658
1161void print_all_local_APICs (void) 1659__apicdebuginit(void) print_all_local_APICs(void)
1162{ 1660{
1163 on_each_cpu(print_local_APIC, NULL, 1); 1661 int cpu;
1662
1663 preempt_disable();
1664 for_each_online_cpu(cpu)
1665 smp_call_function_single(cpu, print_local_APIC, NULL, 1);
1666 preempt_enable();
1164} 1667}
1165 1668
1166void __apicdebuginit print_PIC(void) 1669__apicdebuginit(void) print_PIC(void)
1167{ 1670{
1168 unsigned int v; 1671 unsigned int v;
1169 unsigned long flags; 1672 unsigned long flags;
@@ -1195,19 +1698,34 @@ void __apicdebuginit print_PIC(void)
1195 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1698 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1196} 1699}
1197 1700
1198#endif /* 0 */ 1701__apicdebuginit(int) print_all_ICs(void)
1702{
1703 print_PIC();
1704 print_all_local_APICs();
1705 print_IO_APIC();
1706
1707 return 0;
1708}
1709
1710fs_initcall(print_all_ICs);
1711
1712
1713/* Where if anywhere is the i8259 connect in external int mode */
1714static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
1199 1715
1200void __init enable_IO_APIC(void) 1716void __init enable_IO_APIC(void)
1201{ 1717{
1202 union IO_APIC_reg_01 reg_01; 1718 union IO_APIC_reg_01 reg_01;
1203 int i8259_apic, i8259_pin; 1719 int i8259_apic, i8259_pin;
1204 int i, apic; 1720 int apic;
1205 unsigned long flags; 1721 unsigned long flags;
1206 1722
1207 for (i = 0; i < PIN_MAP_SIZE; i++) { 1723#ifdef CONFIG_X86_32
1208 irq_2_pin[i].pin = -1; 1724 int i;
1209 irq_2_pin[i].next = 0; 1725 if (!pirqs_enabled)
1210 } 1726 for (i = 0; i < MAX_PIRQS; i++)
1727 pirq_entries[i] = -1;
1728#endif
1211 1729
1212 /* 1730 /*
1213 * The number of IO-APIC IRQ registers (== #pins): 1731 * The number of IO-APIC IRQ registers (== #pins):
@@ -1237,6 +1755,10 @@ void __init enable_IO_APIC(void)
1237 } 1755 }
1238 found_i8259: 1756 found_i8259:
1239 /* Look to see what if the MP table has reported the ExtINT */ 1757 /* Look to see what if the MP table has reported the ExtINT */
1758 /* If we could not find the appropriate pin by looking at the ioapic
1759 * the i8259 probably is not connected to the ioapic, but give the
1760 * mptable a chance anyway.
1761 */
1240 i8259_pin = find_isa_irq_pin(0, mp_ExtINT); 1762 i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
1241 i8259_apic = find_isa_irq_apic(0, mp_ExtINT); 1763 i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
1242 /* Trust the MP table if nothing is setup in the hardware */ 1764 /* Trust the MP table if nothing is setup in the hardware */
@@ -1285,7 +1807,7 @@ void disable_IO_APIC(void)
1285 entry.dest_mode = 0; /* Physical */ 1807 entry.dest_mode = 0; /* Physical */
1286 entry.delivery_mode = dest_ExtINT; /* ExtInt */ 1808 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1287 entry.vector = 0; 1809 entry.vector = 0;
1288 entry.dest = GET_APIC_ID(read_apic_id()); 1810 entry.dest = read_apic_id();
1289 1811
1290 /* 1812 /*
1291 * Add it to the IO-APIC irq-routing table: 1813 * Add it to the IO-APIC irq-routing table:
@@ -1296,6 +1818,133 @@ void disable_IO_APIC(void)
1296 disconnect_bsp_APIC(ioapic_i8259.pin != -1); 1818 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1297} 1819}
1298 1820
1821#ifdef CONFIG_X86_32
1822/*
1823 * function to set the IO-APIC physical IDs based on the
1824 * values stored in the MPC table.
1825 *
1826 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1827 */
1828
1829static void __init setup_ioapic_ids_from_mpc(void)
1830{
1831 union IO_APIC_reg_00 reg_00;
1832 physid_mask_t phys_id_present_map;
1833 int apic;
1834 int i;
1835 unsigned char old_id;
1836 unsigned long flags;
1837
1838 if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
1839 return;
1840
1841 /*
1842 * Don't check I/O APIC IDs for xAPIC systems. They have
1843 * no meaning without the serial APIC bus.
1844 */
1845 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1846 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
1847 return;
1848 /*
1849 * This is broken; anything with a real cpu count has to
1850 * circumvent this idiocy regardless.
1851 */
1852 phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
1853
1854 /*
1855 * Set the IOAPIC ID to the value stored in the MPC table.
1856 */
1857 for (apic = 0; apic < nr_ioapics; apic++) {
1858
1859 /* Read the register 0 value */
1860 spin_lock_irqsave(&ioapic_lock, flags);
1861 reg_00.raw = io_apic_read(apic, 0);
1862 spin_unlock_irqrestore(&ioapic_lock, flags);
1863
1864 old_id = mp_ioapics[apic].mp_apicid;
1865
1866 if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
1867 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1868 apic, mp_ioapics[apic].mp_apicid);
1869 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1870 reg_00.bits.ID);
1871 mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
1872 }
1873
1874 /*
1875 * Sanity check, is the ID really free? Every APIC in a
1876 * system must have a unique ID or we get lots of nice
1877 * 'stuck on smp_invalidate_needed IPI wait' messages.
1878 */
1879 if (check_apicid_used(phys_id_present_map,
1880 mp_ioapics[apic].mp_apicid)) {
1881 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1882 apic, mp_ioapics[apic].mp_apicid);
1883 for (i = 0; i < get_physical_broadcast(); i++)
1884 if (!physid_isset(i, phys_id_present_map))
1885 break;
1886 if (i >= get_physical_broadcast())
1887 panic("Max APIC ID exceeded!\n");
1888 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1889 i);
1890 physid_set(i, phys_id_present_map);
1891 mp_ioapics[apic].mp_apicid = i;
1892 } else {
1893 physid_mask_t tmp;
1894 tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
1895 apic_printk(APIC_VERBOSE, "Setting %d in the "
1896 "phys_id_present_map\n",
1897 mp_ioapics[apic].mp_apicid);
1898 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1899 }
1900
1901
1902 /*
1903 * We need to adjust the IRQ routing table
1904 * if the ID changed.
1905 */
1906 if (old_id != mp_ioapics[apic].mp_apicid)
1907 for (i = 0; i < mp_irq_entries; i++)
1908 if (mp_irqs[i].mp_dstapic == old_id)
1909 mp_irqs[i].mp_dstapic
1910 = mp_ioapics[apic].mp_apicid;
1911
1912 /*
1913 * Read the right value from the MPC table and
1914 * write it into the ID register.
1915 */
1916 apic_printk(APIC_VERBOSE, KERN_INFO
1917 "...changing IO-APIC physical APIC ID to %d ...",
1918 mp_ioapics[apic].mp_apicid);
1919
1920 reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
1921 spin_lock_irqsave(&ioapic_lock, flags);
1922 io_apic_write(apic, 0, reg_00.raw);
1923 spin_unlock_irqrestore(&ioapic_lock, flags);
1924
1925 /*
1926 * Sanity check
1927 */
1928 spin_lock_irqsave(&ioapic_lock, flags);
1929 reg_00.raw = io_apic_read(apic, 0);
1930 spin_unlock_irqrestore(&ioapic_lock, flags);
1931 if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
1932 printk("could not set ID!\n");
1933 else
1934 apic_printk(APIC_VERBOSE, " ok.\n");
1935 }
1936}
1937#endif
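As an illustration of the duplicate-ID fixup above, here is a minimal userspace sketch (not kernel code; the bitmap, the 15-ID limit and the helper names are stand-ins for physid_mask_t, get_physical_broadcast() and the in-kernel loop):

#include <stdio.h>
#include <stdint.h>

#define PHYS_BROADCAST 15                       /* stand-in for get_physical_broadcast() */

static int find_free_apic_id(uint32_t present_map)
{
        int i;

        for (i = 0; i < PHYS_BROADCAST; i++)
                if (!(present_map & (1u << i)))
                        return i;               /* first unused physical ID */
        return -1;                              /* kernel would panic("Max APIC ID exceeded!") */
}

int main(void)
{
        uint32_t present = 0x0f;                /* IDs 0-3 already claimed by CPUs */

        printf("reassigning duplicate IO-APIC ID to %d\n",
               find_free_apic_id(present));
        return 0;
}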
1938
1939int no_timer_check __initdata;
1940
1941static int __init notimercheck(char *s)
1942{
1943 no_timer_check = 1;
1944 return 1;
1945}
1946__setup("no_timer_check", notimercheck);
1947
1299/* 1948/*
1300 * There is a nasty bug in some older SMP boards, their mptable lies 1949 * There is a nasty bug in some older SMP boards, their mptable lies
1301 * about the timer IRQ. We do the following to work around the situation: 1950 * about the timer IRQ. We do the following to work around the situation:
@@ -1309,6 +1958,9 @@ static int __init timer_irq_works(void)
1309 unsigned long t1 = jiffies; 1958 unsigned long t1 = jiffies;
1310 unsigned long flags; 1959 unsigned long flags;
1311 1960
1961 if (no_timer_check)
1962 return 1;
1963
1312 local_save_flags(flags); 1964 local_save_flags(flags);
1313 local_irq_enable(); 1965 local_irq_enable();
1314 /* Let ten ticks pass... */ 1966 /* Let ten ticks pass... */
@@ -1369,19 +2021,27 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
1369 return was_pending; 2021 return was_pending;
1370} 2022}
1371 2023
2024#ifdef CONFIG_X86_64
1372static int ioapic_retrigger_irq(unsigned int irq) 2025static int ioapic_retrigger_irq(unsigned int irq)
1373{ 2026{
1374 struct irq_cfg *cfg = &irq_cfg[irq]; 2027
1375 cpumask_t mask; 2028 struct irq_cfg *cfg = irq_cfg(irq);
1376 unsigned long flags; 2029 unsigned long flags;
1377 2030
1378 spin_lock_irqsave(&vector_lock, flags); 2031 spin_lock_irqsave(&vector_lock, flags);
1379 mask = cpumask_of_cpu(first_cpu(cfg->domain)); 2032 send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
1380 send_IPI_mask(mask, cfg->vector);
1381 spin_unlock_irqrestore(&vector_lock, flags); 2033 spin_unlock_irqrestore(&vector_lock, flags);
1382 2034
1383 return 1; 2035 return 1;
1384} 2036}
2037#else
2038static int ioapic_retrigger_irq(unsigned int irq)
2039{
2040 send_IPI_self(irq_cfg(irq)->vector);
2041
2042 return 1;
2043}
2044#endif
1385 2045
1386/* 2046/*
1387 * Level and edge triggered IO-APIC interrupts need different handling, 2047 * Level and edge triggered IO-APIC interrupts need different handling,
@@ -1393,11 +2053,159 @@ static int ioapic_retrigger_irq(unsigned int irq)
1393 */ 2053 */
1394 2054
1395#ifdef CONFIG_SMP 2055#ifdef CONFIG_SMP
2056
2057#ifdef CONFIG_INTR_REMAP
2058static void ir_irq_migration(struct work_struct *work);
2059
2060static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
2061
2062/*
2063 * Migrate the IO-APIC irq in the presence of intr-remapping.
2064 *
2065 * For edge triggered, irq migration is a simple atomic update (of vector
2066 * and cpu destination) of the IRTE and a flush of the hardware cache.
2067 *
2068 * For level triggered, we need to modify the io-apic RTE as well with the updated
2069 * vector information, along with modifying the IRTE with vector and destination.
2070 * So irq migration for level triggered is a little bit more complex compared to
2071 * edge triggered migration. But the good news is, we use the same algorithm
2072 * for level triggered migration as we have today, the only difference being
2073 * that we now initiate the irq migration from process context instead of the
2074 * interrupt context.
2075 *
2076 * In future, when we do a directed EOI (combined with cpu EOI broadcast
2077 * suppression) to the IO-APIC, level triggered irq migration will also be
2078 * as simple as edge triggered migration and we can do the irq migration
2079 * with a simple atomic update to IO-APIC RTE.
2080 */
2081static void migrate_ioapic_irq(int irq, cpumask_t mask)
2082{
2083 struct irq_cfg *cfg;
2084 struct irq_desc *desc;
2085 cpumask_t tmp, cleanup_mask;
2086 struct irte irte;
2087 int modify_ioapic_rte;
2088 unsigned int dest;
2089 unsigned long flags;
2090
2091 cpus_and(tmp, mask, cpu_online_map);
2092 if (cpus_empty(tmp))
2093 return;
2094
2095 if (get_irte(irq, &irte))
2096 return;
2097
2098 if (assign_irq_vector(irq, mask))
2099 return;
2100
2101 cfg = irq_cfg(irq);
2102 cpus_and(tmp, cfg->domain, mask);
2103 dest = cpu_mask_to_apicid(tmp);
2104
2105 desc = irq_to_desc(irq);
2106 modify_ioapic_rte = desc->status & IRQ_LEVEL;
2107 if (modify_ioapic_rte) {
2108 spin_lock_irqsave(&ioapic_lock, flags);
2109 __target_IO_APIC_irq(irq, dest, cfg->vector);
2110 spin_unlock_irqrestore(&ioapic_lock, flags);
2111 }
2112
2113 irte.vector = cfg->vector;
2114 irte.dest_id = IRTE_DEST(dest);
2115
2116 /*
2117 * Modify the IRTE and flush the interrupt entry cache.
2118 */
2119 modify_irte(irq, &irte);
2120
2121 if (cfg->move_in_progress) {
2122 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
2123 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
2124 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2125 cfg->move_in_progress = 0;
2126 }
2127
2128 desc->affinity = mask;
2129}
2130
2131static int migrate_irq_remapped_level(int irq)
2132{
2133 int ret = -1;
2134 struct irq_desc *desc = irq_to_desc(irq);
2135
2136 mask_IO_APIC_irq(irq);
2137
2138 if (io_apic_level_ack_pending(irq)) {
2139 /*
2140 * Interrupt in progress. Migrating irq now will change the
2141 * vector information in the IO-APIC RTE and that will confuse
2142 * the EOI broadcast performed by the cpu.
2143 * So, delay the irq migration to the next instance.
2144 */
2145 schedule_delayed_work(&ir_migration_work, 1);
2146 goto unmask;
2147 }
2148
2149 /* everything is clear, we have the right of way */
2150 migrate_ioapic_irq(irq, desc->pending_mask);
2151
2152 ret = 0;
2153 desc->status &= ~IRQ_MOVE_PENDING;
2154 cpus_clear(desc->pending_mask);
2155
2156unmask:
2157 unmask_IO_APIC_irq(irq);
2158 return ret;
2159}
2160
2161static void ir_irq_migration(struct work_struct *work)
2162{
2163 unsigned int irq;
2164 struct irq_desc *desc;
2165
2166 for_each_irq_desc(irq, desc) {
2167 if (desc->status & IRQ_MOVE_PENDING) {
2168 unsigned long flags;
2169
2170 spin_lock_irqsave(&desc->lock, flags);
2171 if (!desc->chip->set_affinity ||
2172 !(desc->status & IRQ_MOVE_PENDING)) {
2173 desc->status &= ~IRQ_MOVE_PENDING;
2174 spin_unlock_irqrestore(&desc->lock, flags);
2175 continue;
2176 }
2177
2178 desc->chip->set_affinity(irq, desc->pending_mask);
2179 spin_unlock_irqrestore(&desc->lock, flags);
2180 }
2181 }
2182}
2183
2184/*
2185 * Migrates the IRQ destination in the process context.
2186 */
2187static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
2188{
2189 struct irq_desc *desc = irq_to_desc(irq);
2190
2191 if (desc->status & IRQ_LEVEL) {
2192 desc->status |= IRQ_MOVE_PENDING;
2193 desc->pending_mask = mask;
2194 migrate_irq_remapped_level(irq);
2195 return;
2196 }
2197
2198 migrate_ioapic_irq(irq, mask);
2199}
2200#endif
2201
1396asmlinkage void smp_irq_move_cleanup_interrupt(void) 2202asmlinkage void smp_irq_move_cleanup_interrupt(void)
1397{ 2203{
1398 unsigned vector, me; 2204 unsigned vector, me;
1399 ack_APIC_irq(); 2205 ack_APIC_irq();
2206#ifdef CONFIG_X86_64
1400 exit_idle(); 2207 exit_idle();
2208#endif
1401 irq_enter(); 2209 irq_enter();
1402 2210
1403 me = smp_processor_id(); 2211 me = smp_processor_id();
@@ -1406,11 +2214,12 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
1406 struct irq_desc *desc; 2214 struct irq_desc *desc;
1407 struct irq_cfg *cfg; 2215 struct irq_cfg *cfg;
1408 irq = __get_cpu_var(vector_irq)[vector]; 2216 irq = __get_cpu_var(vector_irq)[vector];
1409 if (irq >= NR_IRQS) 2217
2218 desc = irq_to_desc(irq);
2219 if (!desc)
1410 continue; 2220 continue;
1411 2221
1412 desc = irq_desc + irq; 2222 cfg = irq_cfg(irq);
1413 cfg = irq_cfg + irq;
1414 spin_lock(&desc->lock); 2223 spin_lock(&desc->lock);
1415 if (!cfg->move_cleanup_count) 2224 if (!cfg->move_cleanup_count)
1416 goto unlock; 2225 goto unlock;
@@ -1429,7 +2238,7 @@ unlock:
1429 2238
1430static void irq_complete_move(unsigned int irq) 2239static void irq_complete_move(unsigned int irq)
1431{ 2240{
1432 struct irq_cfg *cfg = irq_cfg + irq; 2241 struct irq_cfg *cfg = irq_cfg(irq);
1433 unsigned vector, me; 2242 unsigned vector, me;
1434 2243
1435 if (likely(!cfg->move_in_progress)) 2244 if (likely(!cfg->move_in_progress))
@@ -1449,6 +2258,17 @@ static void irq_complete_move(unsigned int irq)
1449#else 2258#else
1450static inline void irq_complete_move(unsigned int irq) {} 2259static inline void irq_complete_move(unsigned int irq) {}
1451#endif 2260#endif
2261#ifdef CONFIG_INTR_REMAP
2262static void ack_x2apic_level(unsigned int irq)
2263{
2264 ack_x2APIC_irq();
2265}
2266
2267static void ack_x2apic_edge(unsigned int irq)
2268{
2269 ack_x2APIC_irq();
2270}
2271#endif
1452 2272
1453static void ack_apic_edge(unsigned int irq) 2273static void ack_apic_edge(unsigned int irq)
1454{ 2274{
@@ -1457,19 +2277,50 @@ static void ack_apic_edge(unsigned int irq)
1457 ack_APIC_irq(); 2277 ack_APIC_irq();
1458} 2278}
1459 2279
2280atomic_t irq_mis_count;
2281
1460static void ack_apic_level(unsigned int irq) 2282static void ack_apic_level(unsigned int irq)
1461{ 2283{
2284#ifdef CONFIG_X86_32
2285 unsigned long v;
2286 int i;
2287#endif
1462 int do_unmask_irq = 0; 2288 int do_unmask_irq = 0;
1463 2289
1464 irq_complete_move(irq); 2290 irq_complete_move(irq);
1465#ifdef CONFIG_GENERIC_PENDING_IRQ 2291#ifdef CONFIG_GENERIC_PENDING_IRQ
1466 /* If we are moving the irq we need to mask it */ 2292 /* If we are moving the irq we need to mask it */
1467 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { 2293 if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
1468 do_unmask_irq = 1; 2294 do_unmask_irq = 1;
1469 mask_IO_APIC_irq(irq); 2295 mask_IO_APIC_irq(irq);
1470 } 2296 }
1471#endif 2297#endif
1472 2298
2299#ifdef CONFIG_X86_32
2300 /*
2301 * It appears there is an erratum which affects at least version 0x11
2302 * of I/O APIC (that's the 82093AA and cores integrated into various
2303 * chipsets). Under certain conditions a level-triggered interrupt is
2304 * erroneously delivered as an edge-triggered one but the respective IRR
2305 * bit gets set nevertheless. As a result the I/O unit expects an EOI
2306 * message but it will never arrive and further interrupts are blocked
2307 * from the source. The exact reason is so far unknown, but the
2308 * phenomenon was observed when two consecutive interrupt requests
2309 * from a given source get delivered to the same CPU and the source is
2310 * temporarily disabled in between.
2311 *
2312 * A workaround is to simulate an EOI message manually. We achieve it
2313 * by setting the trigger mode to edge and then to level when the edge
2314 * trigger mode gets detected in the TMR of a local APIC for a
2315 * level-triggered interrupt. We mask the source for the time of the
2316 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2317 * The idea is from Manfred Spraul. --macro
2318 */
2319 i = irq_cfg(irq)->vector;
2320
2321 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2322#endif
2323
1473 /* 2324 /*
1474 * We must acknowledge the irq before we move it or the acknowledge will 2325 * We must acknowledge the irq before we move it or the acknowledge will
1475 * not propagate properly. 2326 * not propagate properly.
@@ -1508,24 +2359,51 @@ static void ack_apic_level(unsigned int irq)
1508 move_masked_irq(irq); 2359 move_masked_irq(irq);
1509 unmask_IO_APIC_irq(irq); 2360 unmask_IO_APIC_irq(irq);
1510 } 2361 }
2362
2363#ifdef CONFIG_X86_32
2364 if (!(v & (1 << (i & 0x1f)))) {
2365 atomic_inc(&irq_mis_count);
2366 spin_lock(&ioapic_lock);
2367 __mask_and_edge_IO_APIC_irq(irq);
2368 __unmask_and_level_IO_APIC_irq(irq);
2369 spin_unlock(&ioapic_lock);
2370 }
2371#endif
1511} 2372}
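The erratum workaround above hinges on locating the vector's bit in the local APIC TMR. A small sketch of that arithmetic (plain userspace C, illustrative values only): the TMR is an array of 32-bit registers spaced 0x10 apart, one per 32 vectors, so (vector & ~0x1f) >> 1 equals (vector / 32) * 0x10.

#include <stdio.h>

int main(void)
{
        unsigned int vector  = 0x31;                    /* example vector */
        unsigned int reg_off = (vector & ~0x1f) >> 1;   /* byte offset from APIC_TMR */
        unsigned int bit     = vector & 0x1f;           /* bit within that register */

        printf("vector 0x%02x -> TMR register +0x%02x, bit %u\n",
               vector, reg_off, bit);
        return 0;
}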
1512 2373
1513static struct irq_chip ioapic_chip __read_mostly = { 2374static struct irq_chip ioapic_chip __read_mostly = {
1514 .name = "IO-APIC", 2375 .name = "IO-APIC",
1515 .startup = startup_ioapic_irq, 2376 .startup = startup_ioapic_irq,
1516 .mask = mask_IO_APIC_irq, 2377 .mask = mask_IO_APIC_irq,
1517 .unmask = unmask_IO_APIC_irq, 2378 .unmask = unmask_IO_APIC_irq,
1518 .ack = ack_apic_edge, 2379 .ack = ack_apic_edge,
1519 .eoi = ack_apic_level, 2380 .eoi = ack_apic_level,
1520#ifdef CONFIG_SMP 2381#ifdef CONFIG_SMP
1521 .set_affinity = set_ioapic_affinity_irq, 2382 .set_affinity = set_ioapic_affinity_irq,
1522#endif 2383#endif
1523 .retrigger = ioapic_retrigger_irq, 2384 .retrigger = ioapic_retrigger_irq,
1524}; 2385};
1525 2386
2387#ifdef CONFIG_INTR_REMAP
2388static struct irq_chip ir_ioapic_chip __read_mostly = {
2389 .name = "IR-IO-APIC",
2390 .startup = startup_ioapic_irq,
2391 .mask = mask_IO_APIC_irq,
2392 .unmask = unmask_IO_APIC_irq,
2393 .ack = ack_x2apic_edge,
2394 .eoi = ack_x2apic_level,
2395#ifdef CONFIG_SMP
2396 .set_affinity = set_ir_ioapic_affinity_irq,
2397#endif
2398 .retrigger = ioapic_retrigger_irq,
2399};
2400#endif
2401
1526static inline void init_IO_APIC_traps(void) 2402static inline void init_IO_APIC_traps(void)
1527{ 2403{
1528 int irq; 2404 int irq;
2405 struct irq_desc *desc;
2406 struct irq_cfg *cfg;
1529 2407
1530 /* 2408 /*
1531 * NOTE! The local APIC isn't very good at handling 2409 * NOTE! The local APIC isn't very good at handling
@@ -1538,8 +2416,8 @@ static inline void init_IO_APIC_traps(void)
1538 * Also, we've got to be careful not to trash gate 2416 * Also, we've got to be careful not to trash gate
1539 * 0x80, because int 0x80 is hm, kind of importantish. ;) 2417 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1540 */ 2418 */
1541 for (irq = 0; irq < NR_IRQS ; irq++) { 2419 for_each_irq_cfg(irq, cfg) {
1542 if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) { 2420 if (IO_APIC_IRQ(irq) && !cfg->vector) {
1543 /* 2421 /*
1544 * Hmm.. We don't have an entry for this, 2422 * Hmm.. We don't have an entry for this,
1545 * so default to an old-fashioned 8259 2423 * so default to an old-fashioned 8259
@@ -1547,27 +2425,33 @@ static inline void init_IO_APIC_traps(void)
1547 */ 2425 */
1548 if (irq < 16) 2426 if (irq < 16)
1549 make_8259A_irq(irq); 2427 make_8259A_irq(irq);
1550 else 2428 else {
2429 desc = irq_to_desc(irq);
1551 /* Strange. Oh, well.. */ 2430 /* Strange. Oh, well.. */
1552 irq_desc[irq].chip = &no_irq_chip; 2431 desc->chip = &no_irq_chip;
2432 }
1553 } 2433 }
1554 } 2434 }
1555} 2435}
1556 2436
1557static void unmask_lapic_irq(unsigned int irq) 2437/*
2438 * The local APIC irq-chip implementation:
2439 */
2440
2441static void mask_lapic_irq(unsigned int irq)
1558{ 2442{
1559 unsigned long v; 2443 unsigned long v;
1560 2444
1561 v = apic_read(APIC_LVT0); 2445 v = apic_read(APIC_LVT0);
1562 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); 2446 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
1563} 2447}
1564 2448
1565static void mask_lapic_irq(unsigned int irq) 2449static void unmask_lapic_irq(unsigned int irq)
1566{ 2450{
1567 unsigned long v; 2451 unsigned long v;
1568 2452
1569 v = apic_read(APIC_LVT0); 2453 v = apic_read(APIC_LVT0);
1570 apic_write(APIC_LVT0, v | APIC_LVT_MASKED); 2454 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
1571} 2455}
1572 2456
1573static void ack_lapic_irq (unsigned int irq) 2457static void ack_lapic_irq (unsigned int irq)
@@ -1584,7 +2468,10 @@ static struct irq_chip lapic_chip __read_mostly = {
1584 2468
1585static void lapic_register_intr(int irq) 2469static void lapic_register_intr(int irq)
1586{ 2470{
1587 irq_desc[irq].status &= ~IRQ_LEVEL; 2471 struct irq_desc *desc;
2472
2473 desc = irq_to_desc(irq);
2474 desc->status &= ~IRQ_LEVEL;
1588 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, 2475 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
1589 "edge"); 2476 "edge");
1590} 2477}
@@ -1592,19 +2479,19 @@ static void lapic_register_intr(int irq)
1592static void __init setup_nmi(void) 2479static void __init setup_nmi(void)
1593{ 2480{
1594 /* 2481 /*
1595 * Dirty trick to enable the NMI watchdog ... 2482 * Dirty trick to enable the NMI watchdog ...
1596 * We put the 8259A master into AEOI mode and 2483 * We put the 8259A master into AEOI mode and
1597 * unmask on all local APICs LVT0 as NMI. 2484 * unmask on all local APICs LVT0 as NMI.
1598 * 2485 *
1599 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') 2486 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
1600 * is from Maciej W. Rozycki - so we do not have to EOI from 2487 * is from Maciej W. Rozycki - so we do not have to EOI from
1601 * the NMI handler or the timer interrupt. 2488 * the NMI handler or the timer interrupt.
1602 */ 2489 */
1603 printk(KERN_INFO "activating NMI Watchdog ..."); 2490 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
1604 2491
1605 enable_NMI_through_LVT0(); 2492 enable_NMI_through_LVT0();
1606 2493
1607 printk(" done.\n"); 2494 apic_printk(APIC_VERBOSE, " done.\n");
1608} 2495}
1609 2496
1610/* 2497/*
@@ -1621,12 +2508,17 @@ static inline void __init unlock_ExtINT_logic(void)
1621 unsigned char save_control, save_freq_select; 2508 unsigned char save_control, save_freq_select;
1622 2509
1623 pin = find_isa_irq_pin(8, mp_INT); 2510 pin = find_isa_irq_pin(8, mp_INT);
2511 if (pin == -1) {
2512 WARN_ON_ONCE(1);
2513 return;
2514 }
1624 apic = find_isa_irq_apic(8, mp_INT); 2515 apic = find_isa_irq_apic(8, mp_INT);
1625 if (pin == -1) 2516 if (apic == -1) {
2517 WARN_ON_ONCE(1);
1626 return; 2518 return;
2519 }
1627 2520
1628 entry0 = ioapic_read_entry(apic, pin); 2521 entry0 = ioapic_read_entry(apic, pin);
1629
1630 clear_IO_APIC_pin(apic, pin); 2522 clear_IO_APIC_pin(apic, pin);
1631 2523
1632 memset(&entry1, 0, sizeof(entry1)); 2524 memset(&entry1, 0, sizeof(entry1));
@@ -1661,23 +2553,38 @@ static inline void __init unlock_ExtINT_logic(void)
1661 ioapic_write_entry(apic, pin, entry0); 2553 ioapic_write_entry(apic, pin, entry0);
1662} 2554}
1663 2555
2556static int disable_timer_pin_1 __initdata;
2557/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
2558static int __init disable_timer_pin_setup(char *arg)
2559{
2560 disable_timer_pin_1 = 1;
2561 return 0;
2562}
2563early_param("disable_timer_pin_1", disable_timer_pin_setup);
2564
2565int timer_through_8259 __initdata;
2566
1664/* 2567/*
1665 * This code may look a bit paranoid, but it's supposed to cooperate with 2568 * This code may look a bit paranoid, but it's supposed to cooperate with
1666 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ 2569 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
1667 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast 2570 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
1668 * fanatically on his truly buggy board. 2571 * fanatically on his truly buggy board.
1669 * 2572 *
1670 * FIXME: really need to revamp this for modern platforms only. 2573 * FIXME: really need to revamp this for all platforms.
1671 */ 2574 */
1672static inline void __init check_timer(void) 2575static inline void __init check_timer(void)
1673{ 2576{
1674 struct irq_cfg *cfg = irq_cfg + 0; 2577 struct irq_cfg *cfg = irq_cfg(0);
1675 int apic1, pin1, apic2, pin2; 2578 int apic1, pin1, apic2, pin2;
1676 unsigned long flags; 2579 unsigned long flags;
2580 unsigned int ver;
1677 int no_pin1 = 0; 2581 int no_pin1 = 0;
1678 2582
1679 local_irq_save(flags); 2583 local_irq_save(flags);
1680 2584
2585 ver = apic_read(APIC_LVR);
2586 ver = GET_APIC_VERSION(ver);
2587
1681 /* 2588 /*
1682 * get/set the timer IRQ vector: 2589 * get/set the timer IRQ vector:
1683 */ 2590 */
@@ -1686,18 +2593,27 @@ static inline void __init check_timer(void)
1686 2593
1687 /* 2594 /*
1688 * As IRQ0 is to be enabled in the 8259A, the virtual 2595 * As IRQ0 is to be enabled in the 8259A, the virtual
1689 * wire has to be disabled in the local APIC. 2596 * wire has to be disabled in the local APIC. Also
2597 * timer interrupts need to be acknowledged manually in
2598 * the 8259A for the i82489DX when using the NMI
2599 * watchdog as that APIC treats NMIs as level-triggered.
2600 * The AEOI mode will finish them in the 8259A
2601 * automatically.
1690 */ 2602 */
1691 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2603 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1692 init_8259A(1); 2604 init_8259A(1);
2605#ifdef CONFIG_X86_32
2606 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2607#endif
1693 2608
1694 pin1 = find_isa_irq_pin(0, mp_INT); 2609 pin1 = find_isa_irq_pin(0, mp_INT);
1695 apic1 = find_isa_irq_apic(0, mp_INT); 2610 apic1 = find_isa_irq_apic(0, mp_INT);
1696 pin2 = ioapic_i8259.pin; 2611 pin2 = ioapic_i8259.pin;
1697 apic2 = ioapic_i8259.apic; 2612 apic2 = ioapic_i8259.apic;
1698 2613
1699 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", 2614 apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
1700 cfg->vector, apic1, pin1, apic2, pin2); 2615 "apic1=%d pin1=%d apic2=%d pin2=%d\n",
2616 cfg->vector, apic1, pin1, apic2, pin2);
1701 2617
1702 /* 2618 /*
1703 * Some BIOS writers are clueless and report the ExtINTA 2619 * Some BIOS writers are clueless and report the ExtINTA
@@ -1707,6 +2623,10 @@ static inline void __init check_timer(void)
1707 * 8259A. 2623 * 8259A.
1708 */ 2624 */
1709 if (pin1 == -1) { 2625 if (pin1 == -1) {
2626#ifdef CONFIG_INTR_REMAP
2627 if (intr_remapping_enabled)
2628 panic("BIOS bug: timer not connected to IO-APIC");
2629#endif
1710 pin1 = pin2; 2630 pin1 = pin2;
1711 apic1 = apic2; 2631 apic1 = apic2;
1712 no_pin1 = 1; 2632 no_pin1 = 1;
@@ -1724,7 +2644,7 @@ static inline void __init check_timer(void)
1724 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2644 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
1725 } 2645 }
1726 unmask_IO_APIC_irq(0); 2646 unmask_IO_APIC_irq(0);
1727 if (!no_timer_check && timer_irq_works()) { 2647 if (timer_irq_works()) {
1728 if (nmi_watchdog == NMI_IO_APIC) { 2648 if (nmi_watchdog == NMI_IO_APIC) {
1729 setup_nmi(); 2649 setup_nmi();
1730 enable_8259A_irq(0); 2650 enable_8259A_irq(0);
@@ -1733,16 +2653,19 @@ static inline void __init check_timer(void)
1733 clear_IO_APIC_pin(0, pin1); 2653 clear_IO_APIC_pin(0, pin1);
1734 goto out; 2654 goto out;
1735 } 2655 }
2656#ifdef CONFIG_INTR_REMAP
2657 if (intr_remapping_enabled)
2658 panic("timer doesn't work through Interrupt-remapped IO-APIC");
2659#endif
1736 clear_IO_APIC_pin(apic1, pin1); 2660 clear_IO_APIC_pin(apic1, pin1);
1737 if (!no_pin1) 2661 if (!no_pin1)
1738 apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: " 2662 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
1739 "8254 timer not connected to IO-APIC\n"); 2663 "8254 timer not connected to IO-APIC\n");
1740 2664
1741 apic_printk(APIC_VERBOSE,KERN_INFO 2665 apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
1742 "...trying to set up timer (IRQ0) " 2666 "(IRQ0) through the 8259A ...\n");
1743 "through the 8259A ... "); 2667 apic_printk(APIC_QUIET, KERN_INFO
1744 apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", 2668 "..... (found apic %d pin %d) ...\n", apic2, pin2);
1745 apic2, pin2);
1746 /* 2669 /*
1747 * legacy devices should be connected to IO APIC #0 2670 * legacy devices should be connected to IO APIC #0
1748 */ 2671 */
@@ -1751,7 +2674,7 @@ static inline void __init check_timer(void)
1751 unmask_IO_APIC_irq(0); 2674 unmask_IO_APIC_irq(0);
1752 enable_8259A_irq(0); 2675 enable_8259A_irq(0);
1753 if (timer_irq_works()) { 2676 if (timer_irq_works()) {
1754 apic_printk(APIC_VERBOSE," works.\n"); 2677 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
1755 timer_through_8259 = 1; 2678 timer_through_8259 = 1;
1756 if (nmi_watchdog == NMI_IO_APIC) { 2679 if (nmi_watchdog == NMI_IO_APIC) {
1757 disable_8259A_irq(0); 2680 disable_8259A_irq(0);
@@ -1765,29 +2688,35 @@ static inline void __init check_timer(void)
1765 */ 2688 */
1766 disable_8259A_irq(0); 2689 disable_8259A_irq(0);
1767 clear_IO_APIC_pin(apic2, pin2); 2690 clear_IO_APIC_pin(apic2, pin2);
1768 apic_printk(APIC_VERBOSE," failed.\n"); 2691 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
1769 } 2692 }
1770 2693
1771 if (nmi_watchdog == NMI_IO_APIC) { 2694 if (nmi_watchdog == NMI_IO_APIC) {
1772 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); 2695 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
2696 "through the IO-APIC - disabling NMI Watchdog!\n");
1773 nmi_watchdog = NMI_NONE; 2697 nmi_watchdog = NMI_NONE;
1774 } 2698 }
2699#ifdef CONFIG_X86_32
2700 timer_ack = 0;
2701#endif
1775 2702
1776 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 2703 apic_printk(APIC_QUIET, KERN_INFO
2704 "...trying to set up timer as Virtual Wire IRQ...\n");
1777 2705
1778 lapic_register_intr(0); 2706 lapic_register_intr(0);
1779 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 2707 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
1780 enable_8259A_irq(0); 2708 enable_8259A_irq(0);
1781 2709
1782 if (timer_irq_works()) { 2710 if (timer_irq_works()) {
1783 apic_printk(APIC_VERBOSE," works.\n"); 2711 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
1784 goto out; 2712 goto out;
1785 } 2713 }
1786 disable_8259A_irq(0); 2714 disable_8259A_irq(0);
1787 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 2715 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
1788 apic_printk(APIC_VERBOSE," failed.\n"); 2716 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
1789 2717
1790 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); 2718 apic_printk(APIC_QUIET, KERN_INFO
2719 "...trying to set up timer as ExtINT IRQ...\n");
1791 2720
1792 init_8259A(0); 2721 init_8259A(0);
1793 make_8259A_irq(0); 2722 make_8259A_irq(0);
@@ -1796,22 +2725,16 @@ static inline void __init check_timer(void)
1796 unlock_ExtINT_logic(); 2725 unlock_ExtINT_logic();
1797 2726
1798 if (timer_irq_works()) { 2727 if (timer_irq_works()) {
1799 apic_printk(APIC_VERBOSE," works.\n"); 2728 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
1800 goto out; 2729 goto out;
1801 } 2730 }
1802 apic_printk(APIC_VERBOSE," failed :(.\n"); 2731 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
1803 panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); 2732 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2733 "report. Then try booting with the 'noapic' option.\n");
1804out: 2734out:
1805 local_irq_restore(flags); 2735 local_irq_restore(flags);
1806} 2736}
1807 2737
1808static int __init notimercheck(char *s)
1809{
1810 no_timer_check = 1;
1811 return 1;
1812}
1813__setup("no_timer_check", notimercheck);
1814
1815/* 2738/*
1816 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available 2739 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
1817 * to devices. However there may be an I/O APIC pin available for 2740 * to devices. However there may be an I/O APIC pin available for
@@ -1829,27 +2752,49 @@ __setup("no_timer_check", notimercheck);
1829 * the I/O APIC in all cases now. No actual device should request 2752 * the I/O APIC in all cases now. No actual device should request
1830 * it anyway. --macro 2753 * it anyway. --macro
1831 */ 2754 */
1832#define PIC_IRQS (1<<2) 2755#define PIC_IRQS (1 << PIC_CASCADE_IR)
1833 2756
1834void __init setup_IO_APIC(void) 2757void __init setup_IO_APIC(void)
1835{ 2758{
1836 2759
2760#ifdef CONFIG_X86_32
2761 enable_IO_APIC();
2762#else
1837 /* 2763 /*
1838 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 2764 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
1839 */ 2765 */
2766#endif
1840 2767
1841 io_apic_irqs = ~PIC_IRQS; 2768 io_apic_irqs = ~PIC_IRQS;
1842 2769
1843 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 2770 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
1844 2771 /*
2772 * Set up IO-APIC IRQ routing.
2773 */
2774#ifdef CONFIG_X86_32
2775 if (!acpi_ioapic)
2776 setup_ioapic_ids_from_mpc();
2777#endif
1845 sync_Arb_IDs(); 2778 sync_Arb_IDs();
1846 setup_IO_APIC_irqs(); 2779 setup_IO_APIC_irqs();
1847 init_IO_APIC_traps(); 2780 init_IO_APIC_traps();
1848 check_timer(); 2781 check_timer();
1849 if (!acpi_ioapic)
1850 print_IO_APIC();
1851} 2782}
1852 2783
2784/*
2785 * Called after all the initialization is done. If we didn't find any
2786 * APIC bugs then we can allow the modify fast path
2787 */
2788
2789static int __init io_apic_bug_finalize(void)
2790{
2791 if (sis_apic_bug == -1)
2792 sis_apic_bug = 0;
2793 return 0;
2794}
2795
2796late_initcall(io_apic_bug_finalize);
2797
1853struct sysfs_ioapic_data { 2798struct sysfs_ioapic_data {
1854 struct sys_device dev; 2799 struct sys_device dev;
1855 struct IO_APIC_route_entry entry[0]; 2800 struct IO_APIC_route_entry entry[0];
@@ -1937,38 +2882,60 @@ device_initcall(ioapic_init_sysfs);
1937/* 2882/*
1938 * Dynamic irq allocate and deallocation 2883 * Dynamic irq allocate and deallocation
1939 */ 2884 */
1940int create_irq(void) 2885unsigned int create_irq_nr(unsigned int irq_want)
1941{ 2886{
1942 /* Allocate an unused irq */ 2887 /* Allocate an unused irq */
1943 int irq; 2888 unsigned int irq;
1944 int new; 2889 unsigned int new;
1945 unsigned long flags; 2890 unsigned long flags;
2891 struct irq_cfg *cfg_new;
2892
2893 irq_want = nr_irqs - 1;
1946 2894
1947 irq = -ENOSPC; 2895 irq = 0;
1948 spin_lock_irqsave(&vector_lock, flags); 2896 spin_lock_irqsave(&vector_lock, flags);
1949 for (new = (NR_IRQS - 1); new >= 0; new--) { 2897 for (new = irq_want; new > 0; new--) {
1950 if (platform_legacy_irq(new)) 2898 if (platform_legacy_irq(new))
1951 continue; 2899 continue;
1952 if (irq_cfg[new].vector != 0) 2900 cfg_new = irq_cfg(new);
2901 if (cfg_new && cfg_new->vector != 0)
1953 continue; 2902 continue;
2903 /* check if need to create one */
2904 if (!cfg_new)
2905 cfg_new = irq_cfg_alloc(new);
1954 if (__assign_irq_vector(new, TARGET_CPUS) == 0) 2906 if (__assign_irq_vector(new, TARGET_CPUS) == 0)
1955 irq = new; 2907 irq = new;
1956 break; 2908 break;
1957 } 2909 }
1958 spin_unlock_irqrestore(&vector_lock, flags); 2910 spin_unlock_irqrestore(&vector_lock, flags);
1959 2911
1960 if (irq >= 0) { 2912 if (irq > 0) {
1961 dynamic_irq_init(irq); 2913 dynamic_irq_init(irq);
1962 } 2914 }
1963 return irq; 2915 return irq;
1964} 2916}
1965 2917
2918int create_irq(void)
2919{
2920 int irq;
2921
2922 irq = create_irq_nr(nr_irqs - 1);
2923
2924 if (irq == 0)
2925 irq = -1;
2926
2927 return irq;
2928}
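To make the allocation strategy of create_irq_nr() easier to follow, here is a minimal userspace model of its downward scan (the array, limits and helper names are stand-ins, not kernel APIs):

#include <stdio.h>

#define NR_SLOTS   32
#define NR_LEGACY  16

static int vector_of[NR_SLOTS];                 /* 0 == free, non-zero == vector assigned */

static unsigned int alloc_irq_downwards(unsigned int want)
{
        unsigned int new;

        for (new = want; new > 0; new--) {
                if (new < NR_LEGACY)            /* stand-in for platform_legacy_irq() */
                        continue;
                if (vector_of[new])             /* cfg already has a vector */
                        continue;
                vector_of[new] = 1;             /* stand-in for __assign_irq_vector() */
                return new;
        }
        return 0;                               /* 0 means the allocation failed */
}

int main(void)
{
        vector_of[NR_SLOTS - 1] = 1;            /* pretend the topmost slot is busy */
        printf("allocated irq %u\n", alloc_irq_downwards(NR_SLOTS - 1));
        return 0;
}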
2929
1966void destroy_irq(unsigned int irq) 2930void destroy_irq(unsigned int irq)
1967{ 2931{
1968 unsigned long flags; 2932 unsigned long flags;
1969 2933
1970 dynamic_irq_cleanup(irq); 2934 dynamic_irq_cleanup(irq);
1971 2935
2936#ifdef CONFIG_INTR_REMAP
2937 free_irte(irq);
2938#endif
1972 spin_lock_irqsave(&vector_lock, flags); 2939 spin_lock_irqsave(&vector_lock, flags);
1973 __clear_irq_vector(irq); 2940 __clear_irq_vector(irq);
1974 spin_unlock_irqrestore(&vector_lock, flags); 2941 spin_unlock_irqrestore(&vector_lock, flags);
@@ -1980,18 +2947,50 @@ void destroy_irq(unsigned int irq)
1980#ifdef CONFIG_PCI_MSI 2947#ifdef CONFIG_PCI_MSI
1981static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) 2948static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
1982{ 2949{
1983 struct irq_cfg *cfg = irq_cfg + irq; 2950 struct irq_cfg *cfg;
1984 int err; 2951 int err;
1985 unsigned dest; 2952 unsigned dest;
1986 cpumask_t tmp; 2953 cpumask_t tmp;
1987 2954
1988 tmp = TARGET_CPUS; 2955 tmp = TARGET_CPUS;
1989 err = assign_irq_vector(irq, tmp); 2956 err = assign_irq_vector(irq, tmp);
1990 if (!err) { 2957 if (err)
1991 cpus_and(tmp, cfg->domain, tmp); 2958 return err;
1992 dest = cpu_mask_to_apicid(tmp); 2959
2960 cfg = irq_cfg(irq);
2961 cpus_and(tmp, cfg->domain, tmp);
2962 dest = cpu_mask_to_apicid(tmp);
2963
2964#ifdef CONFIG_INTR_REMAP
2965 if (irq_remapped(irq)) {
2966 struct irte irte;
2967 int ir_index;
2968 u16 sub_handle;
2969
2970 ir_index = map_irq_to_irte_handle(irq, &sub_handle);
2971 BUG_ON(ir_index == -1);
2972
2973 memset (&irte, 0, sizeof(irte));
2974
2975 irte.present = 1;
2976 irte.dst_mode = INT_DEST_MODE;
2977 irte.trigger_mode = 0; /* edge */
2978 irte.dlvry_mode = INT_DELIVERY_MODE;
2979 irte.vector = cfg->vector;
2980 irte.dest_id = IRTE_DEST(dest);
2981
2982 modify_irte(irq, &irte);
1993 2983
1994 msg->address_hi = MSI_ADDR_BASE_HI; 2984 msg->address_hi = MSI_ADDR_BASE_HI;
2985 msg->data = sub_handle;
2986 msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
2987 MSI_ADDR_IR_SHV |
2988 MSI_ADDR_IR_INDEX1(ir_index) |
2989 MSI_ADDR_IR_INDEX2(ir_index);
2990 } else
2991#endif
2992 {
2993 msg->address_hi = MSI_ADDR_BASE_HI;
1995 msg->address_lo = 2994 msg->address_lo =
1996 MSI_ADDR_BASE_LO | 2995 MSI_ADDR_BASE_LO |
1997 ((INT_DEST_MODE == 0) ? 2996 ((INT_DEST_MODE == 0) ?
@@ -2016,10 +3015,11 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
2016#ifdef CONFIG_SMP 3015#ifdef CONFIG_SMP
2017static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) 3016static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2018{ 3017{
2019 struct irq_cfg *cfg = irq_cfg + irq; 3018 struct irq_cfg *cfg;
2020 struct msi_msg msg; 3019 struct msi_msg msg;
2021 unsigned int dest; 3020 unsigned int dest;
2022 cpumask_t tmp; 3021 cpumask_t tmp;
3022 struct irq_desc *desc;
2023 3023
2024 cpus_and(tmp, mask, cpu_online_map); 3024 cpus_and(tmp, mask, cpu_online_map);
2025 if (cpus_empty(tmp)) 3025 if (cpus_empty(tmp))
@@ -2028,6 +3028,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2028 if (assign_irq_vector(irq, mask)) 3028 if (assign_irq_vector(irq, mask))
2029 return; 3029 return;
2030 3030
3031 cfg = irq_cfg(irq);
2031 cpus_and(tmp, cfg->domain, mask); 3032 cpus_and(tmp, cfg->domain, mask);
2032 dest = cpu_mask_to_apicid(tmp); 3033 dest = cpu_mask_to_apicid(tmp);
2033 3034
@@ -2039,8 +3040,61 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2039 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3040 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2040 3041
2041 write_msi_msg(irq, &msg); 3042 write_msi_msg(irq, &msg);
2042 irq_desc[irq].affinity = mask; 3043 desc = irq_to_desc(irq);
3044 desc->affinity = mask;
2043} 3045}
3046
3047#ifdef CONFIG_INTR_REMAP
3048/*
3049 * Migrate the MSI irq to another cpumask. This migration is
3050 * done in the process context using interrupt-remapping hardware.
3051 */
3052static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
3053{
3054 struct irq_cfg *cfg;
3055 unsigned int dest;
3056 cpumask_t tmp, cleanup_mask;
3057 struct irte irte;
3058 struct irq_desc *desc;
3059
3060 cpus_and(tmp, mask, cpu_online_map);
3061 if (cpus_empty(tmp))
3062 return;
3063
3064 if (get_irte(irq, &irte))
3065 return;
3066
3067 if (assign_irq_vector(irq, mask))
3068 return;
3069
3070 cfg = irq_cfg(irq);
3071 cpus_and(tmp, cfg->domain, mask);
3072 dest = cpu_mask_to_apicid(tmp);
3073
3074 irte.vector = cfg->vector;
3075 irte.dest_id = IRTE_DEST(dest);
3076
3077 /*
3078 * atomically update the IRTE with the new destination and vector.
3079 */
3080 modify_irte(irq, &irte);
3081
3082 /*
3083 * After this point, all the interrupts will start arriving
3084 * at the new destination. So, time to cleanup the previous
3085 * vector allocation.
3086 */
3087 if (cfg->move_in_progress) {
3088 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
3089 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
3090 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
3091 cfg->move_in_progress = 0;
3092 }
3093
3094 desc = irq_to_desc(irq);
3095 desc->affinity = mask;
3096}
3097#endif
2044#endif /* CONFIG_SMP */ 3098#endif /* CONFIG_SMP */
2045 3099
2046/* 3100/*
@@ -2058,26 +3112,179 @@ static struct irq_chip msi_chip = {
2058 .retrigger = ioapic_retrigger_irq, 3112 .retrigger = ioapic_retrigger_irq,
2059}; 3113};
2060 3114
2061int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) 3115#ifdef CONFIG_INTR_REMAP
3116static struct irq_chip msi_ir_chip = {
3117 .name = "IR-PCI-MSI",
3118 .unmask = unmask_msi_irq,
3119 .mask = mask_msi_irq,
3120 .ack = ack_x2apic_edge,
3121#ifdef CONFIG_SMP
3122 .set_affinity = ir_set_msi_irq_affinity,
3123#endif
3124 .retrigger = ioapic_retrigger_irq,
3125};
3126
3127/*
3128 * Map the PCI dev to the corresponding remapping hardware unit
3129 * and allocate 'nvec' consecutive interrupt-remapping table entries
3130 * in it.
3131 */
3132static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
2062{ 3133{
3134 struct intel_iommu *iommu;
3135 int index;
3136
3137 iommu = map_dev_to_ir(dev);
3138 if (!iommu) {
3139 printk(KERN_ERR
3140 "Unable to map PCI %s to iommu\n", pci_name(dev));
3141 return -ENOENT;
3142 }
3143
3144 index = alloc_irte(iommu, irq, nvec);
3145 if (index < 0) {
3146 printk(KERN_ERR
3147 "Unable to allocate %d IRTE for PCI %s\n", nvec,
3148 pci_name(dev));
3149 return -ENOSPC;
3150 }
3151 return index;
3152}
3153#endif
3154
3155static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
3156{
3157 int ret;
2063 struct msi_msg msg; 3158 struct msi_msg msg;
2064 int irq, ret;
2065 irq = create_irq();
2066 if (irq < 0)
2067 return irq;
2068 3159
2069 ret = msi_compose_msg(dev, irq, &msg); 3160 ret = msi_compose_msg(dev, irq, &msg);
3161 if (ret < 0)
3162 return ret;
3163
3164 set_irq_msi(irq, desc);
3165 write_msi_msg(irq, &msg);
3166
3167#ifdef CONFIG_INTR_REMAP
3168 if (irq_remapped(irq)) {
3169 struct irq_desc *desc = irq_to_desc(irq);
3170 /*
3171 * irq migration in process context
3172 */
3173 desc->status |= IRQ_MOVE_PCNTXT;
3174 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
3175 } else
3176#endif
3177 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
3178
3179 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
3180
3181 return 0;
3182}
3183
3184static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
3185{
3186 unsigned int irq;
3187
3188 irq = dev->bus->number;
3189 irq <<= 8;
3190 irq |= dev->devfn;
3191 irq <<= 12;
3192
3193 return irq;
3194}
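The per-device irq value built above packs bus and devfn into the upper bits, leaving the low 12 bits free for that device's MSI entries. A standalone illustration of the encoding (example bus/devfn values only):

#include <stdio.h>

static unsigned int build_irq_hint(unsigned int bus, unsigned int devfn)
{
        unsigned int irq = bus;

        irq <<= 8;              /* make room for the 8-bit devfn */
        irq |= devfn;
        irq <<= 12;             /* leave space below for per-device entries */
        return irq;
}

int main(void)
{
        /* bus 0x02, device 3, function 0  ->  devfn = (3 << 3) | 0 = 0x18 */
        printf("irq hint = 0x%x\n", build_irq_hint(0x02, 0x18));    /* 0x218000 */
        return 0;
}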
3195
3196int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
3197{
3198 unsigned int irq;
3199 int ret;
3200 unsigned int irq_want;
3201
3202 irq_want = build_irq_for_pci_dev(dev) + 0x100;
3203
3204 irq = create_irq_nr(irq_want);
3205 if (irq == 0)
3206 return -1;
3207
3208#ifdef CONFIG_INTR_REMAP
3209 if (!intr_remapping_enabled)
3210 goto no_ir;
3211
3212 ret = msi_alloc_irte(dev, irq, 1);
3213 if (ret < 0)
3214 goto error;
3215no_ir:
3216#endif
3217 ret = setup_msi_irq(dev, desc, irq);
2070 if (ret < 0) { 3218 if (ret < 0) {
2071 destroy_irq(irq); 3219 destroy_irq(irq);
2072 return ret; 3220 return ret;
2073 } 3221 }
3222 return 0;
2074 3223
2075 set_irq_msi(irq, desc); 3224#ifdef CONFIG_INTR_REMAP
2076 write_msi_msg(irq, &msg); 3225error:
3226 destroy_irq(irq);
3227 return ret;
3228#endif
3229}
2077 3230
2078 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); 3231int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3232{
3233 unsigned int irq;
3234 int ret, sub_handle;
3235 struct msi_desc *desc;
3236 unsigned int irq_want;
2079 3237
3238#ifdef CONFIG_INTR_REMAP
3239 struct intel_iommu *iommu = 0;
3240 int index = 0;
3241#endif
3242
3243 irq_want = build_irq_for_pci_dev(dev) + 0x100;
3244 sub_handle = 0;
3245 list_for_each_entry(desc, &dev->msi_list, list) {
3246 irq = create_irq_nr(irq_want--);
3247 if (irq == 0)
3248 return -1;
3249#ifdef CONFIG_INTR_REMAP
3250 if (!intr_remapping_enabled)
3251 goto no_ir;
3252
3253 if (!sub_handle) {
3254 /*
3255 * allocate the consecutive block of IRTE's
3256 * for 'nvec'
3257 */
3258 index = msi_alloc_irte(dev, irq, nvec);
3259 if (index < 0) {
3260 ret = index;
3261 goto error;
3262 }
3263 } else {
3264 iommu = map_dev_to_ir(dev);
3265 if (!iommu) {
3266 ret = -ENOENT;
3267 goto error;
3268 }
3269 /*
3270 * set up the mapping between the irq and the IRTE
3271 * base index, with the sub_handle pointing to the
3272 * appropriate interrupt remap table entry.
3273 */
3274 set_irte_irq(irq, iommu, index, sub_handle);
3275 }
3276no_ir:
3277#endif
3278 ret = setup_msi_irq(dev, desc, irq);
3279 if (ret < 0)
3280 goto error;
3281 sub_handle++;
3282 }
2080 return 0; 3283 return 0;
3284
3285error:
3286 destroy_irq(irq);
3287 return ret;
2081} 3288}
2082 3289
2083void arch_teardown_msi_irq(unsigned int irq) 3290void arch_teardown_msi_irq(unsigned int irq)
@@ -2089,10 +3296,11 @@ void arch_teardown_msi_irq(unsigned int irq)
2089#ifdef CONFIG_SMP 3296#ifdef CONFIG_SMP
2090static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) 3297static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
2091{ 3298{
2092 struct irq_cfg *cfg = irq_cfg + irq; 3299 struct irq_cfg *cfg;
2093 struct msi_msg msg; 3300 struct msi_msg msg;
2094 unsigned int dest; 3301 unsigned int dest;
2095 cpumask_t tmp; 3302 cpumask_t tmp;
3303 struct irq_desc *desc;
2096 3304
2097 cpus_and(tmp, mask, cpu_online_map); 3305 cpus_and(tmp, mask, cpu_online_map);
2098 if (cpus_empty(tmp)) 3306 if (cpus_empty(tmp))
@@ -2101,6 +3309,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
2101 if (assign_irq_vector(irq, mask)) 3309 if (assign_irq_vector(irq, mask))
2102 return; 3310 return;
2103 3311
3312 cfg = irq_cfg(irq);
2104 cpus_and(tmp, cfg->domain, mask); 3313 cpus_and(tmp, cfg->domain, mask);
2105 dest = cpu_mask_to_apicid(tmp); 3314 dest = cpu_mask_to_apicid(tmp);
2106 3315
@@ -2112,7 +3321,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
2112 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3321 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2113 3322
2114 dmar_msi_write(irq, &msg); 3323 dmar_msi_write(irq, &msg);
2115 irq_desc[irq].affinity = mask; 3324 desc = irq_to_desc(irq);
3325 desc->affinity = mask;
2116} 3326}
2117#endif /* CONFIG_SMP */ 3327#endif /* CONFIG_SMP */
2118 3328
@@ -2142,6 +3352,69 @@ int arch_setup_dmar_msi(unsigned int irq)
2142} 3352}
2143#endif 3353#endif
2144 3354
3355#ifdef CONFIG_HPET_TIMER
3356
3357#ifdef CONFIG_SMP
3358static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
3359{
3360 struct irq_cfg *cfg;
3361 struct irq_desc *desc;
3362 struct msi_msg msg;
3363 unsigned int dest;
3364 cpumask_t tmp;
3365
3366 cpus_and(tmp, mask, cpu_online_map);
3367 if (cpus_empty(tmp))
3368 return;
3369
3370 if (assign_irq_vector(irq, mask))
3371 return;
3372
3373 cfg = irq_cfg(irq);
3374 cpus_and(tmp, cfg->domain, mask);
3375 dest = cpu_mask_to_apicid(tmp);
3376
3377 hpet_msi_read(irq, &msg);
3378
3379 msg.data &= ~MSI_DATA_VECTOR_MASK;
3380 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3381 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3382 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3383
3384 hpet_msi_write(irq, &msg);
3385 desc = irq_to_desc(irq);
3386 desc->affinity = mask;
3387}
3388#endif /* CONFIG_SMP */
3389
3390struct irq_chip hpet_msi_type = {
3391 .name = "HPET_MSI",
3392 .unmask = hpet_msi_unmask,
3393 .mask = hpet_msi_mask,
3394 .ack = ack_apic_edge,
3395#ifdef CONFIG_SMP
3396 .set_affinity = hpet_msi_set_affinity,
3397#endif
3398 .retrigger = ioapic_retrigger_irq,
3399};
3400
3401int arch_setup_hpet_msi(unsigned int irq)
3402{
3403 int ret;
3404 struct msi_msg msg;
3405
3406 ret = msi_compose_msg(NULL, irq, &msg);
3407 if (ret < 0)
3408 return ret;
3409
3410 hpet_msi_write(irq, &msg);
3411 set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
3412 "edge");
3413
3414 return 0;
3415}
3416#endif
3417
2145#endif /* CONFIG_PCI_MSI */ 3418#endif /* CONFIG_PCI_MSI */
2146/* 3419/*
2147 * Hypertransport interrupt support 3420 * Hypertransport interrupt support
@@ -2166,9 +3439,10 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
2166 3439
2167static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) 3440static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2168{ 3441{
2169 struct irq_cfg *cfg = irq_cfg + irq; 3442 struct irq_cfg *cfg;
2170 unsigned int dest; 3443 unsigned int dest;
2171 cpumask_t tmp; 3444 cpumask_t tmp;
3445 struct irq_desc *desc;
2172 3446
2173 cpus_and(tmp, mask, cpu_online_map); 3447 cpus_and(tmp, mask, cpu_online_map);
2174 if (cpus_empty(tmp)) 3448 if (cpus_empty(tmp))
@@ -2177,11 +3451,13 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2177 if (assign_irq_vector(irq, mask)) 3451 if (assign_irq_vector(irq, mask))
2178 return; 3452 return;
2179 3453
3454 cfg = irq_cfg(irq);
2180 cpus_and(tmp, cfg->domain, mask); 3455 cpus_and(tmp, cfg->domain, mask);
2181 dest = cpu_mask_to_apicid(tmp); 3456 dest = cpu_mask_to_apicid(tmp);
2182 3457
2183 target_ht_irq(irq, dest, cfg->vector); 3458 target_ht_irq(irq, dest, cfg->vector);
2184 irq_desc[irq].affinity = mask; 3459 desc = irq_to_desc(irq);
3460 desc->affinity = mask;
2185} 3461}
2186#endif 3462#endif
2187 3463
@@ -2198,7 +3474,7 @@ static struct irq_chip ht_irq_chip = {
2198 3474
2199int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) 3475int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2200{ 3476{
2201 struct irq_cfg *cfg = irq_cfg + irq; 3477 struct irq_cfg *cfg;
2202 int err; 3478 int err;
2203 cpumask_t tmp; 3479 cpumask_t tmp;
2204 3480
@@ -2208,6 +3484,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2208 struct ht_irq_msg msg; 3484 struct ht_irq_msg msg;
2209 unsigned dest; 3485 unsigned dest;
2210 3486
3487 cfg = irq_cfg(irq);
2211 cpus_and(tmp, cfg->domain, tmp); 3488 cpus_and(tmp, cfg->domain, tmp);
2212 dest = cpu_mask_to_apicid(tmp); 3489 dest = cpu_mask_to_apicid(tmp);
2213 3490
@@ -2230,20 +3507,196 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2230 3507
2231 set_irq_chip_and_handler_name(irq, &ht_irq_chip, 3508 set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2232 handle_edge_irq, "edge"); 3509 handle_edge_irq, "edge");
3510
3511 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
2233 } 3512 }
2234 return err; 3513 return err;
2235} 3514}
2236#endif /* CONFIG_HT_IRQ */ 3515#endif /* CONFIG_HT_IRQ */
2237 3516
3517#ifdef CONFIG_X86_64
3518/*
3519 * Re-target the irq to the specified CPU and enable the specified MMR located
3520 * on the specified blade to allow the sending of MSIs to the specified CPU.
3521 */
3522int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3523 unsigned long mmr_offset)
3524{
3525 const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
3526 struct irq_cfg *cfg;
3527 int mmr_pnode;
3528 unsigned long mmr_value;
3529 struct uv_IO_APIC_route_entry *entry;
3530 unsigned long flags;
3531 int err;
3532
3533 err = assign_irq_vector(irq, *eligible_cpu);
3534 if (err != 0)
3535 return err;
3536
3537 spin_lock_irqsave(&vector_lock, flags);
3538 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
3539 irq_name);
3540 spin_unlock_irqrestore(&vector_lock, flags);
3541
3542 cfg = irq_cfg(irq);
3543
3544 mmr_value = 0;
3545 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3546 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3547
3548 entry->vector = cfg->vector;
3549 entry->delivery_mode = INT_DELIVERY_MODE;
3550 entry->dest_mode = INT_DEST_MODE;
3551 entry->polarity = 0;
3552 entry->trigger = 0;
3553 entry->mask = 0;
3554 entry->dest = cpu_mask_to_apicid(*eligible_cpu);
3555
3556 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3557 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3558
3559 return irq;
3560}
3561
3562/*
3563 * Disable the specified MMR located on the specified blade so that MSIs are
3564 * longer allowed to be sent.
3565 */
3566void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
3567{
3568 unsigned long mmr_value;
3569 struct uv_IO_APIC_route_entry *entry;
3570 int mmr_pnode;
3571
3572 mmr_value = 0;
3573 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3574 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3575
3576 entry->mask = 1;
3577
3578 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3579 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3580}
3581#endif /* CONFIG_X86_64 */
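arch_enable_uv_irq() composes the MMR value by overlaying a bitfield struct on an unsigned long and sanity-checking the size. The sketch below shows only that overlay pattern; the field layout is hypothetical (the real layout is struct uv_IO_APIC_route_entry) and a 64-bit build is assumed:

#include <stdio.h>
#include <assert.h>

struct fake_route_entry {                       /* hypothetical layout, 64 bits total */
        unsigned long vector        :  8;
        unsigned long delivery_mode :  3;
        unsigned long dest_mode     :  1;
        unsigned long polarity      :  1;
        unsigned long trigger       :  1;
        unsigned long mask          :  1;
        unsigned long dest          : 32;
        unsigned long               : 17;       /* pad to 64 bits */
};

int main(void)
{
        unsigned long mmr_value = 0;
        struct fake_route_entry *entry = (struct fake_route_entry *)&mmr_value;

        /* same invariant the kernel enforces with BUG_ON() */
        assert(sizeof(struct fake_route_entry) == sizeof(unsigned long));

        entry->vector = 0x99;
        entry->mask   = 0;
        entry->dest   = 0x12;

        printf("mmr value = 0x%016lx\n", mmr_value);
        return 0;
}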
3582
3583int __init io_apic_get_redir_entries (int ioapic)
3584{
3585 union IO_APIC_reg_01 reg_01;
3586 unsigned long flags;
3587
3588 spin_lock_irqsave(&ioapic_lock, flags);
3589 reg_01.raw = io_apic_read(ioapic, 1);
3590 spin_unlock_irqrestore(&ioapic_lock, flags);
3591
3592 return reg_01.bits.entries;
3593}
3594
3595int __init probe_nr_irqs(void)
3596{
3597 int idx;
3598 int nr = 0;
3599#ifndef CONFIG_XEN
3600 int nr_min = 32;
3601#else
3602 int nr_min = NR_IRQS;
3603#endif
3604
3605 for (idx = 0; idx < nr_ioapics; idx++)
3606 nr += io_apic_get_redir_entries(idx) + 1;
3607
3608 /* double it for hotplug and msi and nmi */
3609 nr <<= 1;
3610
3611 /* something wrong ? */
3612 if (nr < nr_min)
3613 nr = nr_min;
3614
3615 return nr;
3616}
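The sizing done by probe_nr_irqs() is easiest to see with concrete numbers; the sketch below (userspace, illustrative values) mirrors the arithmetic for two IO-APICs that each report 23 as their highest redirection-entry index:

#include <stdio.h>

int main(void)
{
        int redir_idx[] = { 23, 23 };   /* highest redirection entry index per IO-APIC */
        int nr_min = 32;
        int nr = 0;
        unsigned int i;

        for (i = 0; i < sizeof(redir_idx) / sizeof(redir_idx[0]); i++)
                nr += redir_idx[i] + 1;         /* 24 entries each -> 48 */

        nr <<= 1;                               /* headroom for hotplug, MSI, NMI -> 96 */
        if (nr < nr_min)
                nr = nr_min;

        printf("probe_nr_irqs would return %d\n", nr);
        return 0;
}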
3617
2238/* -------------------------------------------------------------------------- 3618/* --------------------------------------------------------------------------
2239 ACPI-based IOAPIC Configuration 3619 ACPI-based IOAPIC Configuration
2240 -------------------------------------------------------------------------- */ 3620 -------------------------------------------------------------------------- */
2241 3621
2242#ifdef CONFIG_ACPI 3622#ifdef CONFIG_ACPI
2243 3623
2244#define IO_APIC_MAX_ID 0xFE 3624#ifdef CONFIG_X86_32
3625int __init io_apic_get_unique_id(int ioapic, int apic_id)
3626{
3627 union IO_APIC_reg_00 reg_00;
3628 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
3629 physid_mask_t tmp;
3630 unsigned long flags;
3631 int i = 0;
2245 3632
2246int __init io_apic_get_redir_entries (int ioapic) 3633 /*
3634 * The P4 platform supports up to 256 APIC IDs on two separate APIC
3635 * buses (one for LAPICs, one for IOAPICs), where predecessors only
3636 * supports up to 16 on one shared APIC bus.
3637 *
3638 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
3639 * advantage of new APIC bus architecture.
3640 */
3641
3642 if (physids_empty(apic_id_map))
3643 apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
3644
3645 spin_lock_irqsave(&ioapic_lock, flags);
3646 reg_00.raw = io_apic_read(ioapic, 0);
3647 spin_unlock_irqrestore(&ioapic_lock, flags);
3648
3649 if (apic_id >= get_physical_broadcast()) {
3650 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
3651 "%d\n", ioapic, apic_id, reg_00.bits.ID);
3652 apic_id = reg_00.bits.ID;
3653 }
3654
3655 /*
3656 * Every APIC in a system must have a unique ID or we get lots of nice
3657 * 'stuck on smp_invalidate_needed IPI wait' messages.
3658 */
3659 if (check_apicid_used(apic_id_map, apic_id)) {
3660
3661 for (i = 0; i < get_physical_broadcast(); i++) {
3662 if (!check_apicid_used(apic_id_map, i))
3663 break;
3664 }
3665
3666 if (i == get_physical_broadcast())
3667 panic("Max apic_id exceeded!\n");
3668
3669 printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
3670 "trying %d\n", ioapic, apic_id, i);
3671
3672 apic_id = i;
3673 }
3674
3675 tmp = apicid_to_cpu_present(apic_id);
3676 physids_or(apic_id_map, apic_id_map, tmp);
3677
3678 if (reg_00.bits.ID != apic_id) {
3679 reg_00.bits.ID = apic_id;
3680
3681 spin_lock_irqsave(&ioapic_lock, flags);
3682 io_apic_write(ioapic, 0, reg_00.raw);
3683 reg_00.raw = io_apic_read(ioapic, 0);
3684 spin_unlock_irqrestore(&ioapic_lock, flags);
3685
3686 /* Sanity check */
3687 if (reg_00.bits.ID != apic_id) {
3688 printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
3689 return -1;
3690 }
3691 }
3692
3693 apic_printk(APIC_VERBOSE, KERN_INFO
3694 "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
3695
3696 return apic_id;
3697}
3698
3699int __init io_apic_get_version(int ioapic)
2247{ 3700{
2248 union IO_APIC_reg_01 reg_01; 3701 union IO_APIC_reg_01 reg_01;
2249 unsigned long flags; 3702 unsigned long flags;
@@ -2252,9 +3705,9 @@ int __init io_apic_get_redir_entries (int ioapic)
2252 reg_01.raw = io_apic_read(ioapic, 1); 3705 reg_01.raw = io_apic_read(ioapic, 1);
2253 spin_unlock_irqrestore(&ioapic_lock, flags); 3706 spin_unlock_irqrestore(&ioapic_lock, flags);
2254 3707
2255 return reg_01.bits.entries; 3708 return reg_01.bits.version;
2256} 3709}
2257 3710#endif
2258 3711
2259int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) 3712int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
2260{ 3713{
@@ -2306,6 +3759,7 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
2306void __init setup_ioapic_dest(void) 3759void __init setup_ioapic_dest(void)
2307{ 3760{
2308 int pin, ioapic, irq, irq_entry; 3761 int pin, ioapic, irq, irq_entry;
3762 struct irq_cfg *cfg;
2309 3763
2310 if (skip_ioapic_setup == 1) 3764 if (skip_ioapic_setup == 1)
2311 return; 3765 return;
@@ -2321,10 +3775,15 @@ void __init setup_ioapic_dest(void)
2321 * when you have too many devices, because at that time only boot 3775 * when you have too many devices, because at that time only boot
2322 * cpu is online. 3776 * cpu is online.
2323 */ 3777 */
2324 if (!irq_cfg[irq].vector) 3778 cfg = irq_cfg(irq);
3779 if (!cfg->vector)
2325 setup_IO_APIC_irq(ioapic, pin, irq, 3780 setup_IO_APIC_irq(ioapic, pin, irq,
2326 irq_trigger(irq_entry), 3781 irq_trigger(irq_entry),
2327 irq_polarity(irq_entry)); 3782 irq_polarity(irq_entry));
3783#ifdef CONFIG_INTR_REMAP
3784 else if (intr_remapping_enabled)
3785 set_ir_ioapic_affinity_irq(irq, TARGET_CPUS);
3786#endif
2328 else 3787 else
2329 set_ioapic_affinity_irq(irq, TARGET_CPUS); 3788 set_ioapic_affinity_irq(irq, TARGET_CPUS);
2330 } 3789 }
@@ -2375,18 +3834,33 @@ void __init ioapic_init_mappings(void)
2375 struct resource *ioapic_res; 3834 struct resource *ioapic_res;
2376 int i; 3835 int i;
2377 3836
3837 irq_2_pin_init();
2378 ioapic_res = ioapic_setup_resources(); 3838 ioapic_res = ioapic_setup_resources();
2379 for (i = 0; i < nr_ioapics; i++) { 3839 for (i = 0; i < nr_ioapics; i++) {
2380 if (smp_found_config) { 3840 if (smp_found_config) {
2381 ioapic_phys = mp_ioapics[i].mp_apicaddr; 3841 ioapic_phys = mp_ioapics[i].mp_apicaddr;
3842#ifdef CONFIG_X86_32
3843 if (!ioapic_phys) {
3844 printk(KERN_ERR
3845 "WARNING: bogus zero IO-APIC "
3846 "address found in MPTABLE, "
3847 "disabling IO/APIC support!\n");
3848 smp_found_config = 0;
3849 skip_ioapic_setup = 1;
3850 goto fake_ioapic_page;
3851 }
3852#endif
2382 } else { 3853 } else {
3854#ifdef CONFIG_X86_32
3855fake_ioapic_page:
3856#endif
2383 ioapic_phys = (unsigned long) 3857 ioapic_phys = (unsigned long)
2384 alloc_bootmem_pages(PAGE_SIZE); 3858 alloc_bootmem_pages(PAGE_SIZE);
2385 ioapic_phys = __pa(ioapic_phys); 3859 ioapic_phys = __pa(ioapic_phys);
2386 } 3860 }
2387 set_fixmap_nocache(idx, ioapic_phys); 3861 set_fixmap_nocache(idx, ioapic_phys);
2388 apic_printk(APIC_VERBOSE, 3862 apic_printk(APIC_VERBOSE,
2389 "mapped IOAPIC to %016lx (%016lx)\n", 3863 "mapped IOAPIC to %08lx (%08lx)\n",
2390 __fix_to_virt(idx), ioapic_phys); 3864 __fix_to_virt(idx), ioapic_phys);
2391 idx++; 3865 idx++;
2392 3866
@@ -2420,4 +3894,3 @@ static int __init ioapic_insert_resources(void)
2420/* Insert the IO APIC resources after PCI initialization has occurred to handle 3894/* Insert the IO APIC resources after PCI initialization has occurred to handle
2421 * IO APICS that are mapped in on a BAR in PCI space. */ 3895 * IO APICS that are mapped in on a BAR in PCI space. */
2422late_initcall(ioapic_insert_resources); 3896late_initcall(ioapic_insert_resources);
2423
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
deleted file mode 100644
index 558abf4c796a..000000000000
--- a/arch/x86/kernel/io_apic_32.c
+++ /dev/null
@@ -1,2900 +0,0 @@
1/*
2 * Intel IO-APIC support for multi-Pentium hosts.
3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
5 *
6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently!
8 *
9 * (c) 1999, Multiple IO-APIC support, developed by
10 * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
11 * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
12 * further tested and cleaned up by Zach Brown <zab@redhat.com>
13 * and Ingo Molnar <mingo@redhat.com>
14 *
15 * Fixes
16 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
17 * thanks to Eric Gilmore
18 * and Rolf G. Tews
19 * for testing these extensively
20 * Paul Diefenbaugh : Added full ACPI support
21 */
22
23#include <linux/mm.h>
24#include <linux/interrupt.h>
25#include <linux/init.h>
26#include <linux/delay.h>
27#include <linux/sched.h>
28#include <linux/bootmem.h>
29#include <linux/mc146818rtc.h>
30#include <linux/compiler.h>
31#include <linux/acpi.h>
32#include <linux/module.h>
33#include <linux/sysdev.h>
34#include <linux/pci.h>
35#include <linux/msi.h>
36#include <linux/htirq.h>
37#include <linux/freezer.h>
38#include <linux/kthread.h>
39#include <linux/jiffies.h> /* time_after() */
40
41#include <asm/io.h>
42#include <asm/smp.h>
43#include <asm/desc.h>
44#include <asm/timer.h>
45#include <asm/i8259.h>
46#include <asm/nmi.h>
47#include <asm/msidef.h>
48#include <asm/hypertransport.h>
49
50#include <mach_apic.h>
51#include <mach_apicdef.h>
52
53int (*ioapic_renumber_irq)(int ioapic, int irq);
54atomic_t irq_mis_count;
55
56/* Where, if anywhere, the i8259 is connected in external int mode */
57static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
58
59static DEFINE_SPINLOCK(ioapic_lock);
60static DEFINE_SPINLOCK(vector_lock);
61
62int timer_through_8259 __initdata;
63
64/*
65 * Is the SiS APIC rmw bug present ?
66 * -1 = don't know, 0 = no, 1 = yes
67 */
68int sis_apic_bug = -1;
69
70/*
71 * # of IRQ routing registers
72 */
73int nr_ioapic_registers[MAX_IO_APICS];
74
75/* I/O APIC entries */
76struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
77int nr_ioapics;
78
79/* MP IRQ source entries */
80struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
81
82/* # of MP IRQ source entries */
83int mp_irq_entries;
84
85#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
86int mp_bus_id_to_type[MAX_MP_BUSSES];
87#endif
88
89DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
90
91static int disable_timer_pin_1 __initdata;
92
93/*
94 * Rough estimation of how many shared IRQs there are, can
95 * be changed anytime.
96 */
97#define MAX_PLUS_SHARED_IRQS NR_IRQS
98#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
99
100/*
101 * This is performance-critical, we want to do it O(1)
102 *
103 * the indexing order of this array favors 1:1 mappings
104 * between pins and IRQs.
105 */
106
107static struct irq_pin_list {
108 int apic, pin, next;
109} irq_2_pin[PIN_MAP_SIZE];
110
111struct io_apic {
112 unsigned int index;
113 unsigned int unused[3];
114 unsigned int data;
115};
116
117static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
118{
119 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
120 + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
121}
122
123static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
124{
125 struct io_apic __iomem *io_apic = io_apic_base(apic);
126 writel(reg, &io_apic->index);
127 return readl(&io_apic->data);
128}
129
130static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
131{
132 struct io_apic __iomem *io_apic = io_apic_base(apic);
133 writel(reg, &io_apic->index);
134 writel(value, &io_apic->data);
135}
136
137/*
138 * Re-write a value: to be used for read-modify-write
139 * cycles where the read already set up the index register.
140 *
141 * Older SiS APIC requires we rewrite the index register
142 */
143static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
144{
145 volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
146 if (sis_apic_bug)
147 writel(reg, &io_apic->index);
148 writel(value, &io_apic->data);
149}
150
151union entry_union {
152 struct { u32 w1, w2; };
153 struct IO_APIC_route_entry entry;
154};
155
156static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
157{
158 union entry_union eu;
159 unsigned long flags;
160 spin_lock_irqsave(&ioapic_lock, flags);
161 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
162 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
163 spin_unlock_irqrestore(&ioapic_lock, flags);
164 return eu.entry;
165}
166
167/*
168 * When we write a new IO APIC routing entry, we need to write the high
169 * word first! If the mask bit in the low word is clear, we will enable
170 * the interrupt, and we need to make sure the entry is fully populated
171 * before that happens.
172 */
173static void
174__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
175{
176 union entry_union eu;
177 eu.entry = e;
178 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
179 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
180}
181
182static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
183{
184 unsigned long flags;
185 spin_lock_irqsave(&ioapic_lock, flags);
186 __ioapic_write_entry(apic, pin, e);
187 spin_unlock_irqrestore(&ioapic_lock, flags);
188}
189
190/*
191 * When we mask an IO APIC routing entry, we need to write the low
192 * word first, in order to set the mask bit before we change the
193 * high bits!
194 */
195static void ioapic_mask_entry(int apic, int pin)
196{
197 unsigned long flags;
198 union entry_union eu = { .entry.mask = 1 };
199
200 spin_lock_irqsave(&ioapic_lock, flags);
201 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
202 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
203 spin_unlock_irqrestore(&ioapic_lock, flags);
204}
205
206/*
207 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
208 * shared ISA-space IRQs, so we have to support them. We are super
209 * fast in the common case, and fast for shared ISA-space IRQs.
210 */
211static void add_pin_to_irq(unsigned int irq, int apic, int pin)
212{
213 static int first_free_entry = NR_IRQS;
214 struct irq_pin_list *entry = irq_2_pin + irq;
215
216 while (entry->next)
217 entry = irq_2_pin + entry->next;
218
219 if (entry->pin != -1) {
220 entry->next = first_free_entry;
221 entry = irq_2_pin + entry->next;
222 if (++first_free_entry >= PIN_MAP_SIZE)
223 panic("io_apic.c: whoops");
224 }
225 entry->apic = apic;
226 entry->pin = pin;
227}
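
The irq_2_pin[] array above doubles as a chained list: slot irq holds the first (apic, pin) pair for that IRQ, and when an IRQ is wired to more than one pin the extra pairs go into the overflow slots past NR_IRQS, linked through the next index. A minimal user-space sketch of the same idea, with made-up sizes and names rather than the kernel's types:

#include <stdio.h>

#define FAKE_NR_IRQS	16			/* toy sizes, not the kernel's */
#define FAKE_MAP_SIZE	(FAKE_NR_IRQS * 2)

struct pin_entry { int apic, pin, next; };
static struct pin_entry map[FAKE_MAP_SIZE];
static int first_free = FAKE_NR_IRQS;		/* overflow slots start here */

static void add_pin(int irq, int apic, int pin)
{
	struct pin_entry *e = &map[irq];

	while (e->next)				/* walk to the end of the chain */
		e = &map[e->next];
	if (e->pin != -1) {			/* slot taken: chain a fresh one */
		e->next = first_free++;
		e = &map[e->next];
	}
	e->apic = apic;
	e->pin = pin;
}

int main(void)
{
	int i;

	for (i = 0; i < FAKE_MAP_SIZE; i++)
		map[i].pin = -1;
	add_pin(9, 0, 11);			/* IRQ 9 on IO-APIC 0, pin 11 ... */
	add_pin(9, 0, 20);			/* ... shared with pin 20 */
	for (i = 9; ; i = map[i].next) {
		printf("IRQ 9 -> apic %d pin %d\n", map[i].apic, map[i].pin);
		if (!map[i].next)
			break;
	}
	return 0;
}
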
228
229/*
230 * Reroute an IRQ to a different pin.
231 */
232static void __init replace_pin_at_irq(unsigned int irq,
233 int oldapic, int oldpin,
234 int newapic, int newpin)
235{
236 struct irq_pin_list *entry = irq_2_pin + irq;
237
238 while (1) {
239 if (entry->apic == oldapic && entry->pin == oldpin) {
240 entry->apic = newapic;
241 entry->pin = newpin;
242 }
243 if (!entry->next)
244 break;
245 entry = irq_2_pin + entry->next;
246 }
247}
248
249static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
250{
251 struct irq_pin_list *entry = irq_2_pin + irq;
252 unsigned int pin, reg;
253
254 for (;;) {
255 pin = entry->pin;
256 if (pin == -1)
257 break;
258 reg = io_apic_read(entry->apic, 0x10 + pin*2);
259 reg &= ~disable;
260 reg |= enable;
261 io_apic_modify(entry->apic, 0x10 + pin*2, reg);
262 if (!entry->next)
263 break;
264 entry = irq_2_pin + entry->next;
265 }
266}
267
268/* mask = 1 */
269static void __mask_IO_APIC_irq(unsigned int irq)
270{
271 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
272}
273
274/* mask = 0 */
275static void __unmask_IO_APIC_irq(unsigned int irq)
276{
277 __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
278}
279
280/* mask = 1, trigger = 0 */
281static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
282{
283 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
284 IO_APIC_REDIR_LEVEL_TRIGGER);
285}
286
287/* mask = 0, trigger = 1 */
288static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
289{
290 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
291 IO_APIC_REDIR_MASKED);
292}
293
294static void mask_IO_APIC_irq(unsigned int irq)
295{
296 unsigned long flags;
297
298 spin_lock_irqsave(&ioapic_lock, flags);
299 __mask_IO_APIC_irq(irq);
300 spin_unlock_irqrestore(&ioapic_lock, flags);
301}
302
303static void unmask_IO_APIC_irq(unsigned int irq)
304{
305 unsigned long flags;
306
307 spin_lock_irqsave(&ioapic_lock, flags);
308 __unmask_IO_APIC_irq(irq);
309 spin_unlock_irqrestore(&ioapic_lock, flags);
310}
311
312static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
313{
314 struct IO_APIC_route_entry entry;
315
316 /* Check delivery_mode to be sure we're not clearing an SMI pin */
317 entry = ioapic_read_entry(apic, pin);
318 if (entry.delivery_mode == dest_SMI)
319 return;
320
321 /*
322 * Disable it in the IO-APIC irq-routing table:
323 */
324 ioapic_mask_entry(apic, pin);
325}
326
327static void clear_IO_APIC(void)
328{
329 int apic, pin;
330
331 for (apic = 0; apic < nr_ioapics; apic++)
332 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
333 clear_IO_APIC_pin(apic, pin);
334}
335
336#ifdef CONFIG_SMP
337static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
338{
339 unsigned long flags;
340 int pin;
341 struct irq_pin_list *entry = irq_2_pin + irq;
342 unsigned int apicid_value;
343 cpumask_t tmp;
344
345 cpus_and(tmp, cpumask, cpu_online_map);
346 if (cpus_empty(tmp))
347 tmp = TARGET_CPUS;
348
349 cpus_and(cpumask, tmp, CPU_MASK_ALL);
350
351 apicid_value = cpu_mask_to_apicid(cpumask);
352 /* Prepare to do the io_apic_write */
353 apicid_value = apicid_value << 24;
354 spin_lock_irqsave(&ioapic_lock, flags);
355 for (;;) {
356 pin = entry->pin;
357 if (pin == -1)
358 break;
359 io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
360 if (!entry->next)
361 break;
362 entry = irq_2_pin + entry->next;
363 }
364 irq_desc[irq].affinity = cpumask;
365 spin_unlock_irqrestore(&ioapic_lock, flags);
366}
367
368#if defined(CONFIG_IRQBALANCE)
369# include <asm/processor.h> /* kernel_thread() */
370# include <linux/kernel_stat.h> /* kstat */
371# include <linux/slab.h> /* kmalloc() */
372# include <linux/timer.h>
373
374#define IRQBALANCE_CHECK_ARCH -999
375#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
376#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
377#define BALANCED_IRQ_MORE_DELTA (HZ/10)
378#define BALANCED_IRQ_LESS_DELTA (HZ)
379
380static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
381static int physical_balance __read_mostly;
382static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
383
384static struct irq_cpu_info {
385 unsigned long *last_irq;
386 unsigned long *irq_delta;
387 unsigned long irq;
388} irq_cpu_data[NR_CPUS];
389
390#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
391#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
392#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
393
394#define IDLE_ENOUGH(cpu,now) \
395 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
396
397#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
398
399#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i)))
400
401static cpumask_t balance_irq_affinity[NR_IRQS] = {
402 [0 ... NR_IRQS-1] = CPU_MASK_ALL
403};
404
405void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
406{
407 balance_irq_affinity[irq] = mask;
408}
409
410static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
411 unsigned long now, int direction)
412{
413 int search_idle = 1;
414 int cpu = curr_cpu;
415
416 goto inside;
417
418 do {
419 if (unlikely(cpu == curr_cpu))
420 search_idle = 0;
421inside:
422 if (direction == 1) {
423 cpu++;
424 if (cpu >= NR_CPUS)
425 cpu = 0;
426 } else {
427 cpu--;
428 if (cpu == -1)
429 cpu = NR_CPUS-1;
430 }
431 } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
432 (search_idle && !IDLE_ENOUGH(cpu, now)));
433
434 return cpu;
435}
436
437static inline void balance_irq(int cpu, int irq)
438{
439 unsigned long now = jiffies;
440 cpumask_t allowed_mask;
441 unsigned int new_cpu;
442
443 if (irqbalance_disabled)
444 return;
445
446 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
447 new_cpu = move(cpu, allowed_mask, now, 1);
448 if (cpu != new_cpu)
449 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
450}
451
452static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
453{
454 int i, j;
455
456 for_each_online_cpu(i) {
457 for (j = 0; j < NR_IRQS; j++) {
458 if (!irq_desc[j].action)
459 continue;
460 /* Is it a significant load ? */
461 if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
462 useful_load_threshold)
463 continue;
464 balance_irq(i, j);
465 }
466 }
467 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
468 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
469 return;
470}
471
472static void do_irq_balance(void)
473{
474 int i, j;
475 unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
476 unsigned long move_this_load = 0;
477 int max_loaded = 0, min_loaded = 0;
478 int load;
479 unsigned long useful_load_threshold = balanced_irq_interval + 10;
480 int selected_irq;
481 int tmp_loaded, first_attempt = 1;
482 unsigned long tmp_cpu_irq;
483 unsigned long imbalance = 0;
484 cpumask_t allowed_mask, target_cpu_mask, tmp;
485
486 for_each_possible_cpu(i) {
487 int package_index;
488 CPU_IRQ(i) = 0;
489 if (!cpu_online(i))
490 continue;
491 package_index = CPU_TO_PACKAGEINDEX(i);
492 for (j = 0; j < NR_IRQS; j++) {
493 unsigned long value_now, delta;
494 /* Is this an active IRQ or balancing disabled ? */
495 if (!irq_desc[j].action || irq_balancing_disabled(j))
496 continue;
497 if (package_index == i)
498 IRQ_DELTA(package_index, j) = 0;
499 /* Determine the total count per processor per IRQ */
500 value_now = (unsigned long) kstat_cpu(i).irqs[j];
501
502 /* Determine the activity per processor per IRQ */
503 delta = value_now - LAST_CPU_IRQ(i, j);
504
505 /* Update last_cpu_irq[][] for the next time */
506 LAST_CPU_IRQ(i, j) = value_now;
507
508 /* Ignore IRQs whose rate is less than the clock */
509 if (delta < useful_load_threshold)
510 continue;
511 /* update the load for the processor or package total */
512 IRQ_DELTA(package_index, j) += delta;
513
514 /* Keep track of the higher numbered sibling as well */
515 if (i != package_index)
516 CPU_IRQ(i) += delta;
517 /*
518 * We have sibling A and sibling B in the package
519 *
520 * cpu_irq[A] = load for cpu A + load for cpu B
521 * cpu_irq[B] = load for cpu B
522 */
523 CPU_IRQ(package_index) += delta;
524 }
525 }
526 /* Find the least loaded processor package */
527 for_each_online_cpu(i) {
528 if (i != CPU_TO_PACKAGEINDEX(i))
529 continue;
530 if (min_cpu_irq > CPU_IRQ(i)) {
531 min_cpu_irq = CPU_IRQ(i);
532 min_loaded = i;
533 }
534 }
535 max_cpu_irq = ULONG_MAX;
536
537tryanothercpu:
538 /*
539 * Look for heaviest loaded processor.
540 * We may come back to get the next heaviest loaded processor.
541 * Skip processors with trivial loads.
542 */
543 tmp_cpu_irq = 0;
544 tmp_loaded = -1;
545 for_each_online_cpu(i) {
546 if (i != CPU_TO_PACKAGEINDEX(i))
547 continue;
548 if (max_cpu_irq <= CPU_IRQ(i))
549 continue;
550 if (tmp_cpu_irq < CPU_IRQ(i)) {
551 tmp_cpu_irq = CPU_IRQ(i);
552 tmp_loaded = i;
553 }
554 }
555
556 if (tmp_loaded == -1) {
557 /*
558 * In the case of a small number of heavy interrupt sources,
559 * some of the cpus get loaded too much. We use Ingo's original
560 * approach to rotate them around.
561 */
562 if (!first_attempt && imbalance >= useful_load_threshold) {
563 rotate_irqs_among_cpus(useful_load_threshold);
564 return;
565 }
566 goto not_worth_the_effort;
567 }
568
569 first_attempt = 0; /* heaviest search */
570 max_cpu_irq = tmp_cpu_irq; /* load */
571 max_loaded = tmp_loaded; /* processor */
572 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
573
574 /*
575 * if the imbalance is less than approx 10% of the max load,
576 * we are into diminishing returns - quit
577 */
578 if (imbalance < (max_cpu_irq >> 3))
579 goto not_worth_the_effort;
580
581tryanotherirq:
582 /* if we select an IRQ to move that can't go where we want, then
583 * see if there is another one to try.
584 */
585 move_this_load = 0;
586 selected_irq = -1;
587 for (j = 0; j < NR_IRQS; j++) {
588 /* Is this an active IRQ? */
589 if (!irq_desc[j].action)
590 continue;
591 if (imbalance <= IRQ_DELTA(max_loaded, j))
592 continue;
593 /* Try to find the IRQ that is closest to the imbalance
594 * without going over.
595 */
596 if (move_this_load < IRQ_DELTA(max_loaded, j)) {
597 move_this_load = IRQ_DELTA(max_loaded, j);
598 selected_irq = j;
599 }
600 }
601 if (selected_irq == -1)
602 goto tryanothercpu;
603
604 imbalance = move_this_load;
605
606 /* For the physical_balance case, we accumulated both load
607 * values in one of the siblings' cpu_irq[],
608 * to use the same code for physical and logical processors
609 * as much as possible.
610 *
611 * NOTE: the cpu_irq[] array holds the sum of the load for
612 * sibling A and sibling B in the slot for the lowest numbered
613 * sibling (A), _AND_ the load for sibling B in the slot for
614 * the higher numbered sibling.
615 *
616 * We seek the least loaded sibling by making the comparison
617 * (A+B)/2 vs B
618 */
619 load = CPU_IRQ(min_loaded) >> 1;
620 for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) {
621 if (load > CPU_IRQ(j)) {
622 /* This won't change cpu_sibling_map[min_loaded] */
623 load = CPU_IRQ(j);
624 min_loaded = j;
625 }
626 }
627
628 cpus_and(allowed_mask,
629 cpu_online_map,
630 balance_irq_affinity[selected_irq]);
631 target_cpu_mask = cpumask_of_cpu(min_loaded);
632 cpus_and(tmp, target_cpu_mask, allowed_mask);
633
634 if (!cpus_empty(tmp)) {
635 /* mark for change destination */
636 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
637
638 /* Since we made a change, come back sooner to
639 * check for more variation.
640 */
641 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
642 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
643 return;
644 }
645 goto tryanotherirq;
646
647not_worth_the_effort:
648 /*
649 * if we did not find an IRQ to move, then adjust the time interval
650 * upward
651 */
652 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
653 balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
654 return;
655}
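
To make the sibling bookkeeping above concrete: if hyper-threaded siblings A and B handled 30 and 10 interrupts over the interval, the balancer records cpu_irq[A] = 40 (A plus B) and cpu_irq[B] = 10, so comparing cpu_irq[A]/2 = 20 against cpu_irq[B] = 10 picks B as the less loaded sibling. A stand-alone check of that arithmetic, with toy numbers rather than kernel data:

#include <stdio.h>

int main(void)
{
	/* interrupts handled by the two siblings of one package (toy values) */
	unsigned long irqs_a = 30, irqs_b = 10;

	/* the balancer's bookkeeping: slot A holds A+B, slot B holds only B */
	unsigned long cpu_irq_a = irqs_a + irqs_b;	/* 40 */
	unsigned long cpu_irq_b = irqs_b;		/* 10 */

	/* "(A+B)/2 vs B": the smaller side is the less loaded sibling */
	printf("less loaded sibling: %s\n",
	       (cpu_irq_a / 2 > cpu_irq_b) ? "B" : "A");
	return 0;
}
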
656
657static int balanced_irq(void *unused)
658{
659 int i;
660 unsigned long prev_balance_time = jiffies;
661 long time_remaining = balanced_irq_interval;
662
663 /* push everything to CPU 0 to give us a starting point. */
664 for (i = 0 ; i < NR_IRQS ; i++) {
665 irq_desc[i].pending_mask = cpumask_of_cpu(0);
666 set_pending_irq(i, cpumask_of_cpu(0));
667 }
668
669 set_freezable();
670 for ( ; ; ) {
671 time_remaining = schedule_timeout_interruptible(time_remaining);
672 try_to_freeze();
673 if (time_after(jiffies,
674 prev_balance_time+balanced_irq_interval)) {
675 preempt_disable();
676 do_irq_balance();
677 prev_balance_time = jiffies;
678 time_remaining = balanced_irq_interval;
679 preempt_enable();
680 }
681 }
682 return 0;
683}
684
685static int __init balanced_irq_init(void)
686{
687 int i;
688 struct cpuinfo_x86 *c;
689 cpumask_t tmp;
690
691 cpus_shift_right(tmp, cpu_online_map, 2);
692 c = &boot_cpu_data;
693 /* When not overwritten by the command line ask subarchitecture. */
694 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
695 irqbalance_disabled = NO_BALANCE_IRQ;
696 if (irqbalance_disabled)
697 return 0;
698
699 /* disable irqbalance completely if there is only one processor online */
700 if (num_online_cpus() < 2) {
701 irqbalance_disabled = 1;
702 return 0;
703 }
704 /*
705 * Enable physical balance only if more than 1 physical processor
706 * is present
707 */
708 if (smp_num_siblings > 1 && !cpus_empty(tmp))
709 physical_balance = 1;
710
711 for_each_online_cpu(i) {
712 irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
713 irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
714 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
715 printk(KERN_ERR "balanced_irq_init: out of memory");
716 goto failed;
717 }
718 }
719
720 printk(KERN_INFO "Starting balanced_irq\n");
721 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
722 return 0;
723 printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
724failed:
725 for_each_possible_cpu(i) {
726 kfree(irq_cpu_data[i].irq_delta);
727 irq_cpu_data[i].irq_delta = NULL;
728 kfree(irq_cpu_data[i].last_irq);
729 irq_cpu_data[i].last_irq = NULL;
730 }
731 return 0;
732}
733
734int __devinit irqbalance_disable(char *str)
735{
736 irqbalance_disabled = 1;
737 return 1;
738}
739
740__setup("noirqbalance", irqbalance_disable);
741
742late_initcall(balanced_irq_init);
743#endif /* CONFIG_IRQBALANCE */
744#endif /* CONFIG_SMP */
745
746#ifndef CONFIG_SMP
747void send_IPI_self(int vector)
748{
749 unsigned int cfg;
750
751 /*
752 * Wait for idle.
753 */
754 apic_wait_icr_idle();
755 cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
756 /*
757 * Send the IPI. The write to APIC_ICR fires this off.
758 */
759 apic_write_around(APIC_ICR, cfg);
760}
761#endif /* !CONFIG_SMP */
762
763
764/*
765 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
766 * specific CPU-side IRQs.
767 */
768
769#define MAX_PIRQS 8
770static int pirq_entries [MAX_PIRQS];
771static int pirqs_enabled;
772int skip_ioapic_setup;
773
774static int __init ioapic_pirq_setup(char *str)
775{
776 int i, max;
777 int ints[MAX_PIRQS+1];
778
779 get_options(str, ARRAY_SIZE(ints), ints);
780
781 for (i = 0; i < MAX_PIRQS; i++)
782 pirq_entries[i] = -1;
783
784 pirqs_enabled = 1;
785 apic_printk(APIC_VERBOSE, KERN_INFO
786 "PIRQ redirection, working around broken MP-BIOS.\n");
787 max = MAX_PIRQS;
788 if (ints[0] < MAX_PIRQS)
789 max = ints[0];
790
791 for (i = 0; i < max; i++) {
792 apic_printk(APIC_VERBOSE, KERN_DEBUG
793 "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
794 /*
795 * PIRQs are mapped upside down, usually.
796 */
797 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
798 }
799 return 1;
800}
801
802__setup("pirq=", ioapic_pirq_setup);
803
804/*
805 * Find the IRQ entry number of a certain pin.
806 */
807static int find_irq_entry(int apic, int pin, int type)
808{
809 int i;
810
811 for (i = 0; i < mp_irq_entries; i++)
812 if (mp_irqs[i].mp_irqtype == type &&
813 (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
814 mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
815 mp_irqs[i].mp_dstirq == pin)
816 return i;
817
818 return -1;
819}
820
821/*
822 * Find the pin to which IRQ[irq] (ISA) is connected
823 */
824static int __init find_isa_irq_pin(int irq, int type)
825{
826 int i;
827
828 for (i = 0; i < mp_irq_entries; i++) {
829 int lbus = mp_irqs[i].mp_srcbus;
830
831 if (test_bit(lbus, mp_bus_not_pci) &&
832 (mp_irqs[i].mp_irqtype == type) &&
833 (mp_irqs[i].mp_srcbusirq == irq))
834
835 return mp_irqs[i].mp_dstirq;
836 }
837 return -1;
838}
839
840static int __init find_isa_irq_apic(int irq, int type)
841{
842 int i;
843
844 for (i = 0; i < mp_irq_entries; i++) {
845 int lbus = mp_irqs[i].mp_srcbus;
846
847 if (test_bit(lbus, mp_bus_not_pci) &&
848 (mp_irqs[i].mp_irqtype == type) &&
849 (mp_irqs[i].mp_srcbusirq == irq))
850 break;
851 }
852 if (i < mp_irq_entries) {
853 int apic;
854 for (apic = 0; apic < nr_ioapics; apic++) {
855 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
856 return apic;
857 }
858 }
859
860 return -1;
861}
862
863/*
864 * Find a specific PCI IRQ entry.
865 * Not an __init, possibly needed by modules
866 */
867static int pin_2_irq(int idx, int apic, int pin);
868
869int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
870{
871 int apic, i, best_guess = -1;
872
873 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
874 "slot:%d, pin:%d.\n", bus, slot, pin);
875 if (test_bit(bus, mp_bus_not_pci)) {
876 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
877 return -1;
878 }
879 for (i = 0; i < mp_irq_entries; i++) {
880 int lbus = mp_irqs[i].mp_srcbus;
881
882 for (apic = 0; apic < nr_ioapics; apic++)
883 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
884 mp_irqs[i].mp_dstapic == MP_APIC_ALL)
885 break;
886
887 if (!test_bit(lbus, mp_bus_not_pci) &&
888 !mp_irqs[i].mp_irqtype &&
889 (bus == lbus) &&
890 (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
891 int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
892
893 if (!(apic || IO_APIC_IRQ(irq)))
894 continue;
895
896 if (pin == (mp_irqs[i].mp_srcbusirq & 3))
897 return irq;
898 /*
899 * Use the first all-but-pin matching entry as a
900 * best-guess fuzzy result for broken mptables.
901 */
902 if (best_guess < 0)
903 best_guess = irq;
904 }
905 }
906 return best_guess;
907}
908EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
909
910/*
911 * This function is currently only a helper for the i386 smp boot process, where
912 * we need to reprogram the ioredtbls to cater for the cpus which have come online,
913 * so the mask in all cases should simply be TARGET_CPUS
914 */
915#ifdef CONFIG_SMP
916void __init setup_ioapic_dest(void)
917{
918 int pin, ioapic, irq, irq_entry;
919
920 if (skip_ioapic_setup == 1)
921 return;
922
923 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
924 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
925 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
926 if (irq_entry == -1)
927 continue;
928 irq = pin_2_irq(irq_entry, ioapic, pin);
929 set_ioapic_affinity_irq(irq, TARGET_CPUS);
930 }
931
932 }
933}
934#endif
935
936#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
937/*
938 * EISA Edge/Level control register, ELCR
939 */
940static int EISA_ELCR(unsigned int irq)
941{
942 if (irq < 16) {
943 unsigned int port = 0x4d0 + (irq >> 3);
944 return (inb(port) >> (irq & 7)) & 1;
945 }
946 apic_printk(APIC_VERBOSE, KERN_INFO
947 "Broken MPtable reports ISA irq %d\n", irq);
948 return 0;
949}
950#endif
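
The ELCR is just two I/O ports, 0x4d0 for IRQs 0-7 and 0x4d1 for IRQs 8-15, with one bit per IRQ (1 = level, 0 = edge). With root privileges the same bit can be peeked from user space through /dev/port, where a read at offset N behaves like inb(N); a rough sketch (the IRQ number is only an example):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	unsigned int irq = 11;				/* example IRQ to inspect */
	unsigned int port = 0x4d0 + (irq >> 3);		/* 0x4d0 or 0x4d1 */
	unsigned char byte;
	int fd = open("/dev/port", O_RDONLY);		/* needs root */

	if (fd < 0 || pread(fd, &byte, 1, port) != 1) {
		perror("reading ELCR via /dev/port");
		return 1;
	}
	printf("IRQ %u is %s triggered\n", irq,
	       (byte >> (irq & 7)) & 1 ? "level" : "edge");
	close(fd);
	return 0;
}
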
951
952/* ISA interrupts are always polarity zero edge triggered,
953 * when listed as conforming in the MP table. */
954
955#define default_ISA_trigger(idx) (0)
956#define default_ISA_polarity(idx) (0)
957
958/* EISA interrupts are always polarity zero and can be edge or level
959 * trigger depending on the ELCR value. If an interrupt is listed as
960 * EISA conforming in the MP table, that means its trigger type must
961 * be read in from the ELCR */
962
963#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
964#define default_EISA_polarity(idx) default_ISA_polarity(idx)
965
966/* PCI interrupts are always polarity one level triggered,
967 * when listed as conforming in the MP table. */
968
969#define default_PCI_trigger(idx) (1)
970#define default_PCI_polarity(idx) (1)
971
972/* MCA interrupts are always polarity zero level triggered,
973 * when listed as conforming in the MP table. */
974
975#define default_MCA_trigger(idx) (1)
976#define default_MCA_polarity(idx) default_ISA_polarity(idx)
977
978static int MPBIOS_polarity(int idx)
979{
980 int bus = mp_irqs[idx].mp_srcbus;
981 int polarity;
982
983 /*
984 * Determine IRQ line polarity (high active or low active):
985 */
986 switch (mp_irqs[idx].mp_irqflag & 3) {
987 case 0: /* conforms, ie. bus-type dependent polarity */
988 {
989 polarity = test_bit(bus, mp_bus_not_pci)?
990 default_ISA_polarity(idx):
991 default_PCI_polarity(idx);
992 break;
993 }
994 case 1: /* high active */
995 {
996 polarity = 0;
997 break;
998 }
999 case 2: /* reserved */
1000 {
1001 printk(KERN_WARNING "broken BIOS!!\n");
1002 polarity = 1;
1003 break;
1004 }
1005 case 3: /* low active */
1006 {
1007 polarity = 1;
1008 break;
1009 }
1010 default: /* invalid */
1011 {
1012 printk(KERN_WARNING "broken BIOS!!\n");
1013 polarity = 1;
1014 break;
1015 }
1016 }
1017 return polarity;
1018}
1019
1020static int MPBIOS_trigger(int idx)
1021{
1022 int bus = mp_irqs[idx].mp_srcbus;
1023 int trigger;
1024
1025 /*
1026 * Determine IRQ trigger mode (edge or level sensitive):
1027 */
1028 switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
1029 case 0: /* conforms, ie. bus-type dependent */
1030 {
1031 trigger = test_bit(bus, mp_bus_not_pci)?
1032 default_ISA_trigger(idx):
1033 default_PCI_trigger(idx);
1034#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
1035 switch (mp_bus_id_to_type[bus]) {
1036 case MP_BUS_ISA: /* ISA pin */
1037 {
1038 /* set before the switch */
1039 break;
1040 }
1041 case MP_BUS_EISA: /* EISA pin */
1042 {
1043 trigger = default_EISA_trigger(idx);
1044 break;
1045 }
1046 case MP_BUS_PCI: /* PCI pin */
1047 {
1048 /* set before the switch */
1049 break;
1050 }
1051 case MP_BUS_MCA: /* MCA pin */
1052 {
1053 trigger = default_MCA_trigger(idx);
1054 break;
1055 }
1056 default:
1057 {
1058 printk(KERN_WARNING "broken BIOS!!\n");
1059 trigger = 1;
1060 break;
1061 }
1062 }
1063#endif
1064 break;
1065 }
1066 case 1: /* edge */
1067 {
1068 trigger = 0;
1069 break;
1070 }
1071 case 2: /* reserved */
1072 {
1073 printk(KERN_WARNING "broken BIOS!!\n");
1074 trigger = 1;
1075 break;
1076 }
1077 case 3: /* level */
1078 {
1079 trigger = 1;
1080 break;
1081 }
1082 default: /* invalid */
1083 {
1084 printk(KERN_WARNING "broken BIOS!!\n");
1085 trigger = 0;
1086 break;
1087 }
1088 }
1089 return trigger;
1090}
1091
1092static inline int irq_polarity(int idx)
1093{
1094 return MPBIOS_polarity(idx);
1095}
1096
1097static inline int irq_trigger(int idx)
1098{
1099 return MPBIOS_trigger(idx);
1100}
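
MPBIOS_polarity() and MPBIOS_trigger() above only decode two 2-bit fields of mp_irqflag: bits 0-1 carry the polarity and bits 2-3 the trigger mode, with 0 meaning "conforms to the bus". A flag of 0x0f, for instance, decodes to active low and level triggered. A minimal decoder with the same bit layout, purely illustrative:

#include <stdio.h>

static const char *pol_name[4]  = { "conforms", "active high", "reserved", "active low" };
static const char *trig_name[4] = { "conforms", "edge", "reserved", "level" };

int main(void)
{
	unsigned int irqflag = 0x0f;	/* example value from an MP table entry */

	printf("polarity: %s, trigger: %s\n",
	       pol_name[irqflag & 3],
	       trig_name[(irqflag >> 2) & 3]);
	return 0;
}
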
1101
1102static int pin_2_irq(int idx, int apic, int pin)
1103{
1104 int irq, i;
1105 int bus = mp_irqs[idx].mp_srcbus;
1106
1107 /*
1108 * Debugging check, we are in big trouble if this message pops up!
1109 */
1110 if (mp_irqs[idx].mp_dstirq != pin)
1111 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1112
1113 if (test_bit(bus, mp_bus_not_pci))
1114 irq = mp_irqs[idx].mp_srcbusirq;
1115 else {
1116 /*
1117 * PCI IRQs are mapped in order
1118 */
1119 i = irq = 0;
1120 while (i < apic)
1121 irq += nr_ioapic_registers[i++];
1122 irq += pin;
1123
1124 /*
1125 * For MPS mode, so far only needed by ES7000 platform
1126 */
1127 if (ioapic_renumber_irq)
1128 irq = ioapic_renumber_irq(apic, irq);
1129 }
1130
1131 /*
1132 * PCI IRQ command line redirection. Yes, limits are hardcoded.
1133 */
1134 if ((pin >= 16) && (pin <= 23)) {
1135 if (pirq_entries[pin-16] != -1) {
1136 if (!pirq_entries[pin-16]) {
1137 apic_printk(APIC_VERBOSE, KERN_DEBUG
1138 "disabling PIRQ%d\n", pin-16);
1139 } else {
1140 irq = pirq_entries[pin-16];
1141 apic_printk(APIC_VERBOSE, KERN_DEBUG
1142 "using PIRQ%d -> IRQ %d\n",
1143 pin-16, irq);
1144 }
1145 }
1146 }
1147 return irq;
1148}
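
For the PCI branch above, the IRQ number is simply the pin index offset by the pin counts of all lower-numbered IO-APICs. With, say, two 24-pin IO-APICs, pin 3 of IO-APIC 1 becomes IRQ 24 + 3 = 27. The arithmetic in isolation, using example register counts and ignoring the pirq= and ES7000 renumbering cases:

#include <stdio.h>

int main(void)
{
	int nr_registers[2] = { 24, 24 };	/* pins per IO-APIC, example values */
	int apic = 1, pin = 3;			/* the pin we want an IRQ for */
	int irq = 0, i;

	for (i = 0; i < apic; i++)		/* skip over lower-numbered IO-APICs */
		irq += nr_registers[i];
	irq += pin;

	printf("IO-APIC %d pin %d -> IRQ %d\n", apic, pin, irq);	/* 27 */
	return 0;
}
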
1149
1150static inline int IO_APIC_irq_trigger(int irq)
1151{
1152 int apic, idx, pin;
1153
1154 for (apic = 0; apic < nr_ioapics; apic++) {
1155 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1156 idx = find_irq_entry(apic, pin, mp_INT);
1157 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
1158 return irq_trigger(idx);
1159 }
1160 }
1161 /*
1162 * nonexistent IRQs are edge default
1163 */
1164 return 0;
1165}
1166
1167/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
1168static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
1169
1170static int __assign_irq_vector(int irq)
1171{
1172 static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
1173 int vector, offset;
1174
1175 BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
1176
1177 if (irq_vector[irq] > 0)
1178 return irq_vector[irq];
1179
1180 vector = current_vector;
1181 offset = current_offset;
1182next:
1183 vector += 8;
1184 if (vector >= first_system_vector) {
1185 offset = (offset + 1) % 8;
1186 vector = FIRST_DEVICE_VECTOR + offset;
1187 }
1188 if (vector == current_vector)
1189 return -ENOSPC;
1190 if (test_and_set_bit(vector, used_vectors))
1191 goto next;
1192
1193 current_vector = vector;
1194 current_offset = offset;
1195 irq_vector[irq] = vector;
1196
1197 return vector;
1198}
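
The allocator above hands out vectors 8 apart and only falls back to a new offset once it reaches first_system_vector, so device vectors are spread across priority classes instead of piling up in one (the class is the vector shifted right by 4, as the comment in init_IO_APIC_traps below notes). Assuming, purely for illustration, that device vectors start at 0x31 and system vectors at 0xef, the first allocations come out 0x39, 0x41, 0x49, ... A small simulation of that walk:

#include <stdio.h>

#define FIRST_DEVICE_VECTOR	0x31	/* illustrative values, not taken from */
#define FIRST_SYSTEM_VECTOR	0xef	/* any particular kernel configuration */

int main(void)
{
	int vector = FIRST_DEVICE_VECTOR, offset = 0, n;

	for (n = 0; n < 6; n++) {
		vector += 8;				/* same stride as __assign_irq_vector() */
		if (vector >= FIRST_SYSTEM_VECTOR) {	/* wrap to the next offset */
			offset = (offset + 1) % 8;
			vector = FIRST_DEVICE_VECTOR + offset;
		}
		printf("allocation %d -> vector 0x%02x (priority class %d)\n",
		       n, vector, vector >> 4);
	}
	return 0;
}
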
1199
1200static int assign_irq_vector(int irq)
1201{
1202 unsigned long flags;
1203 int vector;
1204
1205 spin_lock_irqsave(&vector_lock, flags);
1206 vector = __assign_irq_vector(irq);
1207 spin_unlock_irqrestore(&vector_lock, flags);
1208
1209 return vector;
1210}
1211
1212void setup_vector_irq(int cpu)
1213{
1214}
1215
1216static struct irq_chip ioapic_chip;
1217
1218#define IOAPIC_AUTO -1
1219#define IOAPIC_EDGE 0
1220#define IOAPIC_LEVEL 1
1221
1222static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
1223{
1224 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1225 trigger == IOAPIC_LEVEL) {
1226 irq_desc[irq].status |= IRQ_LEVEL;
1227 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1228 handle_fasteoi_irq, "fasteoi");
1229 } else {
1230 irq_desc[irq].status &= ~IRQ_LEVEL;
1231 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1232 handle_edge_irq, "edge");
1233 }
1234 set_intr_gate(vector, interrupt[irq]);
1235}
1236
1237static void __init setup_IO_APIC_irqs(void)
1238{
1239 struct IO_APIC_route_entry entry;
1240 int apic, pin, idx, irq, first_notcon = 1, vector;
1241
1242 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1243
1244 for (apic = 0; apic < nr_ioapics; apic++) {
1245 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1246
1247 /*
1248 * add it to the IO-APIC irq-routing table:
1249 */
1250 memset(&entry, 0, sizeof(entry));
1251
1252 entry.delivery_mode = INT_DELIVERY_MODE;
1253 entry.dest_mode = INT_DEST_MODE;
1254 entry.mask = 0; /* enable IRQ */
1255 entry.dest.logical.logical_dest =
1256 cpu_mask_to_apicid(TARGET_CPUS);
1257
1258 idx = find_irq_entry(apic, pin, mp_INT);
1259 if (idx == -1) {
1260 if (first_notcon) {
1261 apic_printk(APIC_VERBOSE, KERN_DEBUG
1262 " IO-APIC (apicid-pin) %d-%d",
1263 mp_ioapics[apic].mp_apicid,
1264 pin);
1265 first_notcon = 0;
1266 } else
1267 apic_printk(APIC_VERBOSE, ", %d-%d",
1268 mp_ioapics[apic].mp_apicid, pin);
1269 continue;
1270 }
1271
1272 if (!first_notcon) {
1273 apic_printk(APIC_VERBOSE, " not connected.\n");
1274 first_notcon = 1;
1275 }
1276
1277 entry.trigger = irq_trigger(idx);
1278 entry.polarity = irq_polarity(idx);
1279
1280 if (irq_trigger(idx)) {
1281 entry.trigger = 1;
1282 entry.mask = 1;
1283 }
1284
1285 irq = pin_2_irq(idx, apic, pin);
1286 /*
1287 * skip adding the timer int on secondary nodes, which causes
1288 * a small but painful rift in the time-space continuum
1289 */
1290 if (multi_timer_check(apic, irq))
1291 continue;
1292 else
1293 add_pin_to_irq(irq, apic, pin);
1294
1295 if (!apic && !IO_APIC_IRQ(irq))
1296 continue;
1297
1298 if (IO_APIC_IRQ(irq)) {
1299 vector = assign_irq_vector(irq);
1300 entry.vector = vector;
1301 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
1302
1303 if (!apic && (irq < 16))
1304 disable_8259A_irq(irq);
1305 }
1306 ioapic_write_entry(apic, pin, entry);
1307 }
1308 }
1309
1310 if (!first_notcon)
1311 apic_printk(APIC_VERBOSE, " not connected.\n");
1312}
1313
1314/*
1315 * Set up the timer pin, possibly with the 8259A-master behind.
1316 */
1317static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1318 int vector)
1319{
1320 struct IO_APIC_route_entry entry;
1321
1322 memset(&entry, 0, sizeof(entry));
1323
1324 /*
1325 * We use logical delivery to get the timer IRQ
1326 * to the first CPU.
1327 */
1328 entry.dest_mode = INT_DEST_MODE;
1329 entry.mask = 1; /* mask IRQ now */
1330 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
1331 entry.delivery_mode = INT_DELIVERY_MODE;
1332 entry.polarity = 0;
1333 entry.trigger = 0;
1334 entry.vector = vector;
1335
1336 /*
1337 * The timer IRQ doesn't have to know that behind the
1338 * scene we may have a 8259A-master in AEOI mode ...
1339 */
1340 ioapic_register_intr(0, vector, IOAPIC_EDGE);
1341
1342 /*
1343 * Add it to the IO-APIC irq-routing table:
1344 */
1345 ioapic_write_entry(apic, pin, entry);
1346}
1347
1348void __init print_IO_APIC(void)
1349{
1350 int apic, i;
1351 union IO_APIC_reg_00 reg_00;
1352 union IO_APIC_reg_01 reg_01;
1353 union IO_APIC_reg_02 reg_02;
1354 union IO_APIC_reg_03 reg_03;
1355 unsigned long flags;
1356
1357 if (apic_verbosity == APIC_QUIET)
1358 return;
1359
1360 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1361 for (i = 0; i < nr_ioapics; i++)
1362 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1363 mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
1364
1365 /*
1366 * We are a bit conservative about what we expect. We have to
1367 * know about every hardware change ASAP.
1368 */
1369 printk(KERN_INFO "testing the IO APIC.......................\n");
1370
1371 for (apic = 0; apic < nr_ioapics; apic++) {
1372
1373 spin_lock_irqsave(&ioapic_lock, flags);
1374 reg_00.raw = io_apic_read(apic, 0);
1375 reg_01.raw = io_apic_read(apic, 1);
1376 if (reg_01.bits.version >= 0x10)
1377 reg_02.raw = io_apic_read(apic, 2);
1378 if (reg_01.bits.version >= 0x20)
1379 reg_03.raw = io_apic_read(apic, 3);
1380 spin_unlock_irqrestore(&ioapic_lock, flags);
1381
1382 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
1383 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1384 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1385 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1386 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1387
1388 printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
1389 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1390
1391 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1392 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1393
1394 /*
1395 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
1396 * but the value of reg_02 is read as the previous read register
1397 * value, so ignore it if reg_02 == reg_01.
1398 */
1399 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1400 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1401 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1402 }
1403
1404 /*
1405 * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
1406 * or reg_03, but the value of reg_0[23] is read as the previous read
1407 * register value, so ignore it if reg_03 == reg_0[12].
1408 */
1409 if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
1410 reg_03.raw != reg_01.raw) {
1411 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1412 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1413 }
1414
1415 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1416
1417 printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
1418 " Stat Dest Deli Vect: \n");
1419
1420 for (i = 0; i <= reg_01.bits.entries; i++) {
1421 struct IO_APIC_route_entry entry;
1422
1423 entry = ioapic_read_entry(apic, i);
1424
1425 printk(KERN_DEBUG " %02x %03X %02X ",
1426 i,
1427 entry.dest.logical.logical_dest,
1428 entry.dest.physical.physical_dest
1429 );
1430
1431 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
1432 entry.mask,
1433 entry.trigger,
1434 entry.irr,
1435 entry.polarity,
1436 entry.delivery_status,
1437 entry.dest_mode,
1438 entry.delivery_mode,
1439 entry.vector
1440 );
1441 }
1442 }
1443 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1444 for (i = 0; i < NR_IRQS; i++) {
1445 struct irq_pin_list *entry = irq_2_pin + i;
1446 if (entry->pin < 0)
1447 continue;
1448 printk(KERN_DEBUG "IRQ%d ", i);
1449 for (;;) {
1450 printk("-> %d:%d", entry->apic, entry->pin);
1451 if (!entry->next)
1452 break;
1453 entry = irq_2_pin + entry->next;
1454 }
1455 printk("\n");
1456 }
1457
1458 printk(KERN_INFO ".................................... done.\n");
1459
1460 return;
1461}
1462
1463#if 0
1464
1465static void print_APIC_bitfield(int base)
1466{
1467 unsigned int v;
1468 int i, j;
1469
1470 if (apic_verbosity == APIC_QUIET)
1471 return;
1472
1473 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
1474 for (i = 0; i < 8; i++) {
1475 v = apic_read(base + i*0x10);
1476 for (j = 0; j < 32; j++) {
1477 if (v & (1<<j))
1478 printk("1");
1479 else
1480 printk("0");
1481 }
1482 printk("\n");
1483 }
1484}
1485
1486void /*__init*/ print_local_APIC(void *dummy)
1487{
1488 unsigned int v, ver, maxlvt;
1489
1490 if (apic_verbosity == APIC_QUIET)
1491 return;
1492
1493 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1494 smp_processor_id(), hard_smp_processor_id());
1495 v = apic_read(APIC_ID);
1496 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
1497 GET_APIC_ID(read_apic_id()));
1498 v = apic_read(APIC_LVR);
1499 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1500 ver = GET_APIC_VERSION(v);
1501 maxlvt = lapic_get_maxlvt();
1502
1503 v = apic_read(APIC_TASKPRI);
1504 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
1505
1506 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1507 v = apic_read(APIC_ARBPRI);
1508 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
1509 v & APIC_ARBPRI_MASK);
1510 v = apic_read(APIC_PROCPRI);
1511 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
1512 }
1513
1514 v = apic_read(APIC_EOI);
1515 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1516 v = apic_read(APIC_RRR);
1517 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1518 v = apic_read(APIC_LDR);
1519 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1520 v = apic_read(APIC_DFR);
1521 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1522 v = apic_read(APIC_SPIV);
1523 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1524
1525 printk(KERN_DEBUG "... APIC ISR field:\n");
1526 print_APIC_bitfield(APIC_ISR);
1527 printk(KERN_DEBUG "... APIC TMR field:\n");
1528 print_APIC_bitfield(APIC_TMR);
1529 printk(KERN_DEBUG "... APIC IRR field:\n");
1530 print_APIC_bitfield(APIC_IRR);
1531
1532 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1533 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1534 apic_write(APIC_ESR, 0);
1535 v = apic_read(APIC_ESR);
1536 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1537 }
1538
1539 v = apic_read(APIC_ICR);
1540 printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
1541 v = apic_read(APIC_ICR2);
1542 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1543
1544 v = apic_read(APIC_LVTT);
1545 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
1546
1547 if (maxlvt > 3) { /* PC is LVT#4. */
1548 v = apic_read(APIC_LVTPC);
1549 printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
1550 }
1551 v = apic_read(APIC_LVT0);
1552 printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
1553 v = apic_read(APIC_LVT1);
1554 printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
1555
1556 if (maxlvt > 2) { /* ERR is LVT#3. */
1557 v = apic_read(APIC_LVTERR);
1558 printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
1559 }
1560
1561 v = apic_read(APIC_TMICT);
1562 printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
1563 v = apic_read(APIC_TMCCT);
1564 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1565 v = apic_read(APIC_TDCR);
1566 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1567 printk("\n");
1568}
1569
1570void print_all_local_APICs(void)
1571{
1572 on_each_cpu(print_local_APIC, NULL, 1);
1573}
1574
1575void /*__init*/ print_PIC(void)
1576{
1577 unsigned int v;
1578 unsigned long flags;
1579
1580 if (apic_verbosity == APIC_QUIET)
1581 return;
1582
1583 printk(KERN_DEBUG "\nprinting PIC contents\n");
1584
1585 spin_lock_irqsave(&i8259A_lock, flags);
1586
1587 v = inb(0xa1) << 8 | inb(0x21);
1588 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
1589
1590 v = inb(0xa0) << 8 | inb(0x20);
1591 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
1592
1593 outb(0x0b, 0xa0);
1594 outb(0x0b, 0x20);
1595 v = inb(0xa0) << 8 | inb(0x20);
1596 outb(0x0a, 0xa0);
1597 outb(0x0a, 0x20);
1598
1599 spin_unlock_irqrestore(&i8259A_lock, flags);
1600
1601 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1602
1603 v = inb(0x4d1) << 8 | inb(0x4d0);
1604 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1605}
1606
1607#endif /* 0 */
1608
1609static void __init enable_IO_APIC(void)
1610{
1611 union IO_APIC_reg_01 reg_01;
1612 int i8259_apic, i8259_pin;
1613 int i, apic;
1614 unsigned long flags;
1615
1616 for (i = 0; i < PIN_MAP_SIZE; i++) {
1617 irq_2_pin[i].pin = -1;
1618 irq_2_pin[i].next = 0;
1619 }
1620 if (!pirqs_enabled)
1621 for (i = 0; i < MAX_PIRQS; i++)
1622 pirq_entries[i] = -1;
1623
1624 /*
1625 * The number of IO-APIC IRQ registers (== #pins):
1626 */
1627 for (apic = 0; apic < nr_ioapics; apic++) {
1628 spin_lock_irqsave(&ioapic_lock, flags);
1629 reg_01.raw = io_apic_read(apic, 1);
1630 spin_unlock_irqrestore(&ioapic_lock, flags);
1631 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1632 }
1633 for (apic = 0; apic < nr_ioapics; apic++) {
1634 int pin;
1635 /* See if any of the pins is in ExtINT mode */
1636 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1637 struct IO_APIC_route_entry entry;
1638 entry = ioapic_read_entry(apic, pin);
1639
1640
1641 /* If the interrupt line is enabled and in ExtInt mode,
1642 * we have found the pin where the i8259 is connected.
1643 */
1644 if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
1645 ioapic_i8259.apic = apic;
1646 ioapic_i8259.pin = pin;
1647 goto found_i8259;
1648 }
1649 }
1650 }
1651 found_i8259:
1652 /* Look to see if the MP table has reported the ExtINT */
1653 /* If we could not find the appropriate pin by looking at the ioapic,
1654 * the i8259 is probably not connected to the ioapic, but give the
1655 * mptable a chance anyway.
1656 */
1657 i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
1658 i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
1659 /* Trust the MP table if nothing is set up in the hardware */
1660 if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
1661 printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
1662 ioapic_i8259.pin = i8259_pin;
1663 ioapic_i8259.apic = i8259_apic;
1664 }
1665 /* Complain if the MP table and the hardware disagree */
1666 if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
1667 (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
1668 {
1669 printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
1670 }
1671
1672 /*
1673 * Do not trust the IO-APIC being empty at bootup
1674 */
1675 clear_IO_APIC();
1676}
1677
1678/*
1679 * Not an __init, needed by the reboot code
1680 */
1681void disable_IO_APIC(void)
1682{
1683 /*
1684 * Clear the IO-APIC before rebooting:
1685 */
1686 clear_IO_APIC();
1687
1688 /*
1689 * If the i8259 is routed through an IOAPIC,
1690 * put that IOAPIC in virtual wire mode
1691 * so legacy interrupts can be delivered.
1692 */
1693 if (ioapic_i8259.pin != -1) {
1694 struct IO_APIC_route_entry entry;
1695
1696 memset(&entry, 0, sizeof(entry));
1697 entry.mask = 0; /* Enabled */
1698 entry.trigger = 0; /* Edge */
1699 entry.irr = 0;
1700 entry.polarity = 0; /* High */
1701 entry.delivery_status = 0;
1702 entry.dest_mode = 0; /* Physical */
1703 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1704 entry.vector = 0;
1705 entry.dest.physical.physical_dest =
1706 GET_APIC_ID(read_apic_id());
1707
1708 /*
1709 * Add it to the IO-APIC irq-routing table:
1710 */
1711 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
1712 }
1713 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1714}
1715
1716/*
1717 * function to set the IO-APIC physical IDs based on the
1718 * values stored in the MPC table.
1719 *
1720 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1721 */
1722
1723static void __init setup_ioapic_ids_from_mpc(void)
1724{
1725 union IO_APIC_reg_00 reg_00;
1726 physid_mask_t phys_id_present_map;
1727 int apic;
1728 int i;
1729 unsigned char old_id;
1730 unsigned long flags;
1731
1732#ifdef CONFIG_X86_NUMAQ
1733 if (found_numaq)
1734 return;
1735#endif
1736
1737 /*
1738 * Don't check I/O APIC IDs for xAPIC systems. They have
1739 * no meaning without the serial APIC bus.
1740 */
1741 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1742 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
1743 return;
1744 /*
1745 * This is broken; anything with a real cpu count has to
1746 * circumvent this idiocy regardless.
1747 */
1748 phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
1749
1750 /*
1751 * Set the IOAPIC ID to the value stored in the MPC table.
1752 */
1753 for (apic = 0; apic < nr_ioapics; apic++) {
1754
1755 /* Read the register 0 value */
1756 spin_lock_irqsave(&ioapic_lock, flags);
1757 reg_00.raw = io_apic_read(apic, 0);
1758 spin_unlock_irqrestore(&ioapic_lock, flags);
1759
1760 old_id = mp_ioapics[apic].mp_apicid;
1761
1762 if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
1763 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1764 apic, mp_ioapics[apic].mp_apicid);
1765 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1766 reg_00.bits.ID);
1767 mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
1768 }
1769
1770 /*
1771 * Sanity check, is the ID really free? Every APIC in a
1772 * system must have a unique ID or we get lots of nice
1773 * 'stuck on smp_invalidate_needed IPI wait' messages.
1774 */
1775 if (check_apicid_used(phys_id_present_map,
1776 mp_ioapics[apic].mp_apicid)) {
1777 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1778 apic, mp_ioapics[apic].mp_apicid);
1779 for (i = 0; i < get_physical_broadcast(); i++)
1780 if (!physid_isset(i, phys_id_present_map))
1781 break;
1782 if (i >= get_physical_broadcast())
1783 panic("Max APIC ID exceeded!\n");
1784 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1785 i);
1786 physid_set(i, phys_id_present_map);
1787 mp_ioapics[apic].mp_apicid = i;
1788 } else {
1789 physid_mask_t tmp;
1790 tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
1791 apic_printk(APIC_VERBOSE, "Setting %d in the "
1792 "phys_id_present_map\n",
1793 mp_ioapics[apic].mp_apicid);
1794 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1795 }
1796
1797
1798 /*
1799 * We need to adjust the IRQ routing table
1800 * if the ID changed.
1801 */
1802 if (old_id != mp_ioapics[apic].mp_apicid)
1803 for (i = 0; i < mp_irq_entries; i++)
1804 if (mp_irqs[i].mp_dstapic == old_id)
1805 mp_irqs[i].mp_dstapic
1806 = mp_ioapics[apic].mp_apicid;
1807
1808 /*
1809 * Read the right value from the MPC table and
1810 * write it into the ID register.
1811 */
1812 apic_printk(APIC_VERBOSE, KERN_INFO
1813 "...changing IO-APIC physical APIC ID to %d ...",
1814 mp_ioapics[apic].mp_apicid);
1815
1816 reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
1817 spin_lock_irqsave(&ioapic_lock, flags);
1818 io_apic_write(apic, 0, reg_00.raw);
1819 spin_unlock_irqrestore(&ioapic_lock, flags);
1820
1821 /*
1822 * Sanity check
1823 */
1824 spin_lock_irqsave(&ioapic_lock, flags);
1825 reg_00.raw = io_apic_read(apic, 0);
1826 spin_unlock_irqrestore(&ioapic_lock, flags);
1827 if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
1828 printk("could not set ID!\n");
1829 else
1830 apic_printk(APIC_VERBOSE, " ok.\n");
1831 }
1832}
1833
1834int no_timer_check __initdata;
1835
1836static int __init notimercheck(char *s)
1837{
1838 no_timer_check = 1;
1839 return 1;
1840}
1841__setup("no_timer_check", notimercheck);
1842
1843/*
1844 * There is a nasty bug in some older SMP boards: their mptable lies
1845 * about the timer IRQ. We do the following to work around the situation:
1846 *
1847 * - timer IRQ defaults to IO-APIC IRQ
1848 * - if this function detects that timer IRQs are defunct, then we fall
1849 * back to ISA timer IRQs
1850 */
1851static int __init timer_irq_works(void)
1852{
1853 unsigned long t1 = jiffies;
1854 unsigned long flags;
1855
1856 if (no_timer_check)
1857 return 1;
1858
1859 local_save_flags(flags);
1860 local_irq_enable();
1861 /* Let ten ticks pass... */
1862 mdelay((10 * 1000) / HZ);
1863 local_irq_restore(flags);
1864
1865 /*
1866 * Expect a few ticks at least, to be sure some possible
1867 * glue logic does not lock up after the first one or two
1868 * ticks in a non-ExtINT mode. Also the local APIC
1869 * might have cached one ExtINT interrupt. Finally, at
1870 * least one tick may be lost due to delays.
1871 */
1872 if (time_after(jiffies, t1 + 4))
1873 return 1;
1874
1875 return 0;
1876}
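
The delay above is expressed in ticks: mdelay((10 * 1000) / HZ) busy-waits roughly ten timer periods whatever HZ is (10 ms at HZ=1000, 100 ms at HZ=100), and the time_after() check then requires that more than four ticks actually arrived. Just the arithmetic, for a few common HZ values:

#include <stdio.h>

int main(void)
{
	int hz_values[] = { 100, 250, 1000 };
	int i;

	for (i = 0; i < 3; i++) {
		/* ten tick periods expressed in milliseconds, as mdelay() expects */
		int delay_ms = (10 * 1000) / hz_values[i];

		printf("HZ=%4d: wait %3d ms, then expect jiffies > t1 + 4\n",
		       hz_values[i], delay_ms);
	}
	return 0;
}
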
1877
1878/*
1879 * In the SMP+IOAPIC case it might happen that there are an unspecified
1880 * number of pending IRQ events unhandled. These cases are very rare,
1881 * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
1882 * better to do it this way as thus we do not have to be aware of
1883 * 'pending' interrupts in the IRQ path, except at this point.
1884 */
1885/*
1886 * Edge triggered needs to resend any interrupt
1887 * that was delayed but this is now handled in the device
1888 * independent code.
1889 */
1890
1891/*
1892 * Startup quirk:
1893 *
1894 * Starting up an edge-triggered IO-APIC interrupt is
1895 * nasty - we need to make sure that we get the edge.
1896 * If it is already asserted for some reason, we need to
1897 * return 1 to indicate that it was pending.
1898 *
1899 * This is not complete - we should be able to fake
1900 * an edge even if it isn't on the 8259A...
1901 *
1902 * (We do this for level-triggered IRQs too - it cannot hurt.)
1903 */
1904static unsigned int startup_ioapic_irq(unsigned int irq)
1905{
1906 int was_pending = 0;
1907 unsigned long flags;
1908
1909 spin_lock_irqsave(&ioapic_lock, flags);
1910 if (irq < 16) {
1911 disable_8259A_irq(irq);
1912 if (i8259A_irq_pending(irq))
1913 was_pending = 1;
1914 }
1915 __unmask_IO_APIC_irq(irq);
1916 spin_unlock_irqrestore(&ioapic_lock, flags);
1917
1918 return was_pending;
1919}
1920
1921static void ack_ioapic_irq(unsigned int irq)
1922{
1923 move_native_irq(irq);
1924 ack_APIC_irq();
1925}
1926
1927static void ack_ioapic_quirk_irq(unsigned int irq)
1928{
1929 unsigned long v;
1930 int i;
1931
1932 move_native_irq(irq);
1933/*
1934 * It appears there is an erratum which affects at least version 0x11
1935 * of I/O APIC (that's the 82093AA and cores integrated into various
1936 * chipsets). Under certain conditions a level-triggered interrupt is
1937 * erroneously delivered as an edge-triggered one, but the respective IRR
1938 * bit gets set nevertheless. As a result the I/O unit expects an EOI
1939 * message but it will never arrive and further interrupts are blocked
1940 * from the source. The exact reason is so far unknown, but the
1941 * phenomenon was observed when two consecutive interrupt requests
1942 * from a given source get delivered to the same CPU and the source is
1943 * temporarily disabled in between.
1944 *
1945 * A workaround is to simulate an EOI message manually. We achieve it
1946 * by setting the trigger mode to edge and then to level when the edge
1947 * trigger mode gets detected in the TMR of a local APIC for a
1948 * level-triggered interrupt. We mask the source for the time of the
1949 * operation to prevent an edge-triggered interrupt escaping meanwhile.
1950 * The idea is from Manfred Spraul. --macro
1951 */
1952 i = irq_vector[irq];
1953
1954 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
1955
1956 ack_APIC_irq();
1957
1958 if (!(v & (1 << (i & 0x1f)))) {
1959 atomic_inc(&irq_mis_count);
1960 spin_lock(&ioapic_lock);
1961 __mask_and_edge_IO_APIC_irq(irq);
1962 __unmask_and_level_IO_APIC_irq(irq);
1963 spin_unlock(&ioapic_lock);
1964 }
1965}
1966
1967static int ioapic_retrigger_irq(unsigned int irq)
1968{
1969 send_IPI_self(irq_vector[irq]);
1970
1971 return 1;
1972}
1973
1974static struct irq_chip ioapic_chip __read_mostly = {
1975 .name = "IO-APIC",
1976 .startup = startup_ioapic_irq,
1977 .mask = mask_IO_APIC_irq,
1978 .unmask = unmask_IO_APIC_irq,
1979 .ack = ack_ioapic_irq,
1980 .eoi = ack_ioapic_quirk_irq,
1981#ifdef CONFIG_SMP
1982 .set_affinity = set_ioapic_affinity_irq,
1983#endif
1984 .retrigger = ioapic_retrigger_irq,
1985};
1986
1987
1988static inline void init_IO_APIC_traps(void)
1989{
1990 int irq;
1991
1992 /*
1993 * NOTE! The local APIC isn't very good at handling
1994 * multiple interrupts at the same interrupt level.
1995 * As the interrupt level is determined by taking the
1996 * vector number and shifting that right by 4, we
1997 * want to spread these out a bit so that they don't
1998 * all fall in the same interrupt level.
1999 *
2000 * Also, we've got to be careful not to trash gate
2001 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2002 */
2003 for (irq = 0; irq < NR_IRQS ; irq++) {
2004 if (IO_APIC_IRQ(irq) && !irq_vector[irq]) {
2005 /*
2006 * Hmm.. We don't have an entry for this,
2007 * so default to an old-fashioned 8259
2008 * interrupt if we can..
2009 */
2010 if (irq < 16)
2011 make_8259A_irq(irq);
2012 else
2013 /* Strange. Oh, well.. */
2014 irq_desc[irq].chip = &no_irq_chip;
2015 }
2016 }
2017}
2018
2019/*
2020 * The local APIC irq-chip implementation:
2021 */
2022
2023static void ack_lapic_irq(unsigned int irq)
2024{
2025 ack_APIC_irq();
2026}
2027
2028static void mask_lapic_irq(unsigned int irq)
2029{
2030 unsigned long v;
2031
2032 v = apic_read(APIC_LVT0);
2033 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
2034}
2035
2036static void unmask_lapic_irq(unsigned int irq)
2037{
2038 unsigned long v;
2039
2040 v = apic_read(APIC_LVT0);
2041 apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
2042}
2043
2044static struct irq_chip lapic_chip __read_mostly = {
2045 .name = "local-APIC",
2046 .mask = mask_lapic_irq,
2047 .unmask = unmask_lapic_irq,
2048 .ack = ack_lapic_irq,
2049};
2050
2051static void lapic_register_intr(int irq, int vector)
2052{
2053 irq_desc[irq].status &= ~IRQ_LEVEL;
2054 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2055 "edge");
2056 set_intr_gate(vector, interrupt[irq]);
2057}
2058
2059static void __init setup_nmi(void)
2060{
2061 /*
2062 * Dirty trick to enable the NMI watchdog ...
2063 * We put the 8259A master into AEOI mode and
2064 * unmask LVT0 on all local APICs as NMI.
2065 *
2066 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
2067 * is from Maciej W. Rozycki - so we do not have to EOI from
2068 * the NMI handler or the timer interrupt.
2069 */
2070 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2071
2072 enable_NMI_through_LVT0();
2073
2074 apic_printk(APIC_VERBOSE, " done.\n");
2075}
2076
2077/*
2078 * This looks a bit hackish, but it's about the only way of sending
2079 * a few INTA cycles to 8259As and any associated glue logic. ICR does
2080 * not support the ExtINT mode, unfortunately. We need to send these
2081 * cycles as some i82489DX-based boards have glue logic that keeps the
2082 * 8259A interrupt line asserted until INTA. --macro
2083 */
2084static inline void __init unlock_ExtINT_logic(void)
2085{
2086 int apic, pin, i;
2087 struct IO_APIC_route_entry entry0, entry1;
2088 unsigned char save_control, save_freq_select;
2089
2090 pin = find_isa_irq_pin(8, mp_INT);
2091 if (pin == -1) {
2092 WARN_ON_ONCE(1);
2093 return;
2094 }
2095 apic = find_isa_irq_apic(8, mp_INT);
2096 if (apic == -1) {
2097 WARN_ON_ONCE(1);
2098 return;
2099 }
2100
2101 entry0 = ioapic_read_entry(apic, pin);
2102 clear_IO_APIC_pin(apic, pin);
2103
2104 memset(&entry1, 0, sizeof(entry1));
2105
2106 entry1.dest_mode = 0; /* physical delivery */
2107 entry1.mask = 0; /* unmask IRQ now */
2108 entry1.dest.physical.physical_dest = hard_smp_processor_id();
2109 entry1.delivery_mode = dest_ExtINT;
2110 entry1.polarity = entry0.polarity;
2111 entry1.trigger = 0;
2112 entry1.vector = 0;
2113
2114 ioapic_write_entry(apic, pin, entry1);
2115
2116 save_control = CMOS_READ(RTC_CONTROL);
2117 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
2118 CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
2119 RTC_FREQ_SELECT);
2120 CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
2121
2122 i = 100;
2123 while (i-- > 0) {
2124 mdelay(10);
2125 if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
2126 i -= 10;
2127 }
2128
2129 CMOS_WRITE(save_control, RTC_CONTROL);
2130 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
2131 clear_IO_APIC_pin(apic, pin);
2132
2133 ioapic_write_entry(apic, pin, entry0);
2134}
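/*
 * Worked example for the RTC programming above, assuming the standard
 * MC146818 behaviour: a rate-select value of 0x6 yields a periodic
 * interrupt frequency of 32768 >> (0x6 - 1) = 1024 Hz, so the polling
 * loop (at most ~1 s of 10 ms delays) sees plenty of RTC_PF assertions,
 * each one pushing an INTA cycle through the 8259A while IRQ8 is
 * temporarily routed as ExtINT.
 */
#if 0
	unsigned int rate = 0x6;
	unsigned int periodic_hz = 32768 >> (rate - 1);	/* 1024 Hz */
#endif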
2135
2136/*
2137 * This code may look a bit paranoid, but it's supposed to cooperate with
2138 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
2139 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
2140 * fanatically on his truly buggy board.
2141 */
2142static inline void __init check_timer(void)
2143{
2144 int apic1, pin1, apic2, pin2;
2145 int no_pin1 = 0;
2146 int vector;
2147 unsigned int ver;
2148 unsigned long flags;
2149
2150 local_irq_save(flags);
2151
2152 ver = apic_read(APIC_LVR);
2153 ver = GET_APIC_VERSION(ver);
2154
2155 /*
2156 * get/set the timer IRQ vector:
2157 */
2158 disable_8259A_irq(0);
2159 vector = assign_irq_vector(0);
2160 set_intr_gate(vector, interrupt[0]);
2161
2162 /*
2163 * As IRQ0 is to be enabled in the 8259A, the virtual
2164 * wire has to be disabled in the local APIC. Also
2165 * timer interrupts need to be acknowledged manually in
2166 * the 8259A for the i82489DX when using the NMI
2167 * watchdog, as that APIC treats NMIs as level-triggered.
2168 * The AEOI mode will finish them in the 8259A
2169 * automatically.
2170 */
2171 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2172 init_8259A(1);
2173 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2174
2175 pin1 = find_isa_irq_pin(0, mp_INT);
2176 apic1 = find_isa_irq_apic(0, mp_INT);
2177 pin2 = ioapic_i8259.pin;
2178 apic2 = ioapic_i8259.apic;
2179
2180 printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
2181 vector, apic1, pin1, apic2, pin2);
2182
2183 /*
2184 * Some BIOS writers are clueless and report the ExtINTA
2185 * I/O APIC input from the cascaded 8259A as the timer
2186 * interrupt input. So just in case, if only one pin
2187 * was found above, try it both directly and through the
2188 * 8259A.
2189 */
2190 if (pin1 == -1) {
2191 pin1 = pin2;
2192 apic1 = apic2;
2193 no_pin1 = 1;
2194 } else if (pin2 == -1) {
2195 pin2 = pin1;
2196 apic2 = apic1;
2197 }
2198
2199 if (pin1 != -1) {
2200 /*
2201 * Ok, does IRQ0 through the IOAPIC work?
2202 */
2203 if (no_pin1) {
2204 add_pin_to_irq(0, apic1, pin1);
2205 setup_timer_IRQ0_pin(apic1, pin1, vector);
2206 }
2207 unmask_IO_APIC_irq(0);
2208 if (timer_irq_works()) {
2209 if (nmi_watchdog == NMI_IO_APIC) {
2210 setup_nmi();
2211 enable_8259A_irq(0);
2212 }
2213 if (disable_timer_pin_1 > 0)
2214 clear_IO_APIC_pin(0, pin1);
2215 goto out;
2216 }
2217 clear_IO_APIC_pin(apic1, pin1);
2218 if (!no_pin1)
2219 printk(KERN_ERR "..MP-BIOS bug: "
2220 "8254 timer not connected to IO-APIC\n");
2221
2222 printk(KERN_INFO "...trying to set up timer (IRQ0) "
2223 "through the 8259A ... ");
2224 printk("\n..... (found pin %d) ...", pin2);
2225 /*
2226 * legacy devices should be connected to IO APIC #0
2227 */
2228 replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
2229 setup_timer_IRQ0_pin(apic2, pin2, vector);
2230 unmask_IO_APIC_irq(0);
2231 enable_8259A_irq(0);
2232 if (timer_irq_works()) {
2233 printk("works.\n");
2234 timer_through_8259 = 1;
2235 if (nmi_watchdog == NMI_IO_APIC) {
2236 disable_8259A_irq(0);
2237 setup_nmi();
2238 enable_8259A_irq(0);
2239 }
2240 goto out;
2241 }
2242 /*
2243 * Cleanup, just in case ...
2244 */
2245 disable_8259A_irq(0);
2246 clear_IO_APIC_pin(apic2, pin2);
2247 printk(" failed.\n");
2248 }
2249
2250 if (nmi_watchdog == NMI_IO_APIC) {
2251 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
2252 nmi_watchdog = NMI_NONE;
2253 }
2254 timer_ack = 0;
2255
2256 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
2257
2258 lapic_register_intr(0, vector);
2259 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2260 enable_8259A_irq(0);
2261
2262 if (timer_irq_works()) {
2263 printk(" works.\n");
2264 goto out;
2265 }
2266 disable_8259A_irq(0);
2267 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
2268 printk(" failed.\n");
2269
2270 printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
2271
2272 init_8259A(0);
2273 make_8259A_irq(0);
2274 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
2275
2276 unlock_ExtINT_logic();
2277
2278 if (timer_irq_works()) {
2279 printk(" works.\n");
2280 goto out;
2281 }
2282 printk(" failed :(.\n");
2283 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2284 "report. Then try booting with the 'noapic' option");
2285out:
2286 local_irq_restore(flags);
2287}
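/*
 * timer_irq_works() is not shown in this hunk; a minimal sketch of the
 * kind of probe it performs (assumed, simplified) is:
 */
#if 0
static int __init timer_probe_sketch(void)
{
	unsigned long t1 = jiffies;

	local_irq_enable();
	mdelay((10 * 1000) / HZ);	/* let roughly ten ticks elapse */
	local_irq_disable();

	/* if jiffies moved on, timer interrupts are being delivered */
	return time_after(jiffies, t1 + 4);
}
#endif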
2288
2289/*
2290 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
2291 * to devices. However there may be an I/O APIC pin available for
2292 * this interrupt regardless. The pin may be left unconnected, but
2293 * typically it will be reused as an ExtINT cascade interrupt for
2294 * the master 8259A. In the MPS case such a pin will normally be
2295 * reported as an ExtINT interrupt in the MP table. With ACPI
2296 * there is no provision for ExtINT interrupts, and in the absence
2297 * of an override it would be treated as an ordinary ISA I/O APIC
2298 * interrupt, that is edge-triggered and unmasked by default. We
2299 * used to do this, but it caused problems on some systems because
2300 * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
2301 * the same ExtINT cascade interrupt to drive the local APIC of the
2302 * bootstrap processor. Therefore we refrain from routing IRQ2 to
2303 * the I/O APIC in all cases now. No actual device should request
2304 * it anyway. --macro
2305 */
2306#define PIC_IRQS (1 << PIC_CASCADE_IR)
2307
2308void __init setup_IO_APIC(void)
2309{
2310 int i;
2311
2312 /* Reserve all the system vectors. */
2313 for (i = first_system_vector; i < NR_VECTORS; i++)
2314 set_bit(i, used_vectors);
2315
2316 enable_IO_APIC();
2317
2318 io_apic_irqs = ~PIC_IRQS;
2319
2320 printk("ENABLING IO-APIC IRQs\n");
2321
2322 /*
2323 * Set up IO-APIC IRQ routing.
2324 */
2325 if (!acpi_ioapic)
2326 setup_ioapic_ids_from_mpc();
2327 sync_Arb_IDs();
2328 setup_IO_APIC_irqs();
2329 init_IO_APIC_traps();
2330 check_timer();
2331 if (!acpi_ioapic)
2332 print_IO_APIC();
2333}
2334
2335/*
2336 * Called after all the initialization is done. If we didn't find any
2337 * APIC bugs then we can allow the modify fast path.
2338 */
2339
2340static int __init io_apic_bug_finalize(void)
2341{
2342 if (sis_apic_bug == -1)
2343 sis_apic_bug = 0;
2344 return 0;
2345}
2346
2347late_initcall(io_apic_bug_finalize);
2348
2349struct sysfs_ioapic_data {
2350 struct sys_device dev;
2351 struct IO_APIC_route_entry entry[0];
2352};
2353static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
2354
2355static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
2356{
2357 struct IO_APIC_route_entry *entry;
2358 struct sysfs_ioapic_data *data;
2359 int i;
2360
2361 data = container_of(dev, struct sysfs_ioapic_data, dev);
2362 entry = data->entry;
2363 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
2364 entry[i] = ioapic_read_entry(dev->id, i);
2365
2366 return 0;
2367}
2368
2369static int ioapic_resume(struct sys_device *dev)
2370{
2371 struct IO_APIC_route_entry *entry;
2372 struct sysfs_ioapic_data *data;
2373 unsigned long flags;
2374 union IO_APIC_reg_00 reg_00;
2375 int i;
2376
2377 data = container_of(dev, struct sysfs_ioapic_data, dev);
2378 entry = data->entry;
2379
2380 spin_lock_irqsave(&ioapic_lock, flags);
2381 reg_00.raw = io_apic_read(dev->id, 0);
2382 if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
2383 reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
2384 io_apic_write(dev->id, 0, reg_00.raw);
2385 }
2386 spin_unlock_irqrestore(&ioapic_lock, flags);
2387 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
2388 ioapic_write_entry(dev->id, i, entry[i]);
2389
2390 return 0;
2391}
2392
2393static struct sysdev_class ioapic_sysdev_class = {
2394 .name = "ioapic",
2395 .suspend = ioapic_suspend,
2396 .resume = ioapic_resume,
2397};
2398
2399static int __init ioapic_init_sysfs(void)
2400{
2401 struct sys_device *dev;
2402 int i, size, error = 0;
2403
2404 error = sysdev_class_register(&ioapic_sysdev_class);
2405 if (error)
2406 return error;
2407
2408 for (i = 0; i < nr_ioapics; i++) {
2409 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
2410 * sizeof(struct IO_APIC_route_entry);
2411 mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
2412 if (!mp_ioapic_data[i]) {
2413 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2414 continue;
2415 }
2416 dev = &mp_ioapic_data[i]->dev;
2417 dev->id = i;
2418 dev->cls = &ioapic_sysdev_class;
2419 error = sysdev_register(dev);
2420 if (error) {
2421 kfree(mp_ioapic_data[i]);
2422 mp_ioapic_data[i] = NULL;
2423 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2424 continue;
2425 }
2426 }
2427
2428 return 0;
2429}
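/*
 * Sizing example for the kzalloc() above (illustrative; a 24-entry
 * IO-APIC is the common case): the trailing entry[0] flexible array
 * needs 24 * sizeof(struct IO_APIC_route_entry) = 24 * 8 bytes on top
 * of sizeof(struct sys_device).
 */
#if 0
	size = sizeof(struct sys_device)
		+ 24 * sizeof(struct IO_APIC_route_entry);
#endif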
2430
2431device_initcall(ioapic_init_sysfs);
2432
2433/*
2434 * Dynamic IRQ allocation and deallocation
2435 */
2436int create_irq(void)
2437{
2438 /* Allocate an unused irq */
2439 int irq, new, vector = 0;
2440 unsigned long flags;
2441
2442 irq = -ENOSPC;
2443 spin_lock_irqsave(&vector_lock, flags);
2444 for (new = (NR_IRQS - 1); new >= 0; new--) {
2445 if (platform_legacy_irq(new))
2446 continue;
2447 if (irq_vector[new] != 0)
2448 continue;
2449 vector = __assign_irq_vector(new);
2450 if (likely(vector > 0))
2451 irq = new;
2452 break;
2453 }
2454 spin_unlock_irqrestore(&vector_lock, flags);
2455
2456 if (irq >= 0) {
2457 set_intr_gate(vector, interrupt[irq]);
2458 dynamic_irq_init(irq);
2459 }
2460 return irq;
2461}
2462
2463void destroy_irq(unsigned int irq)
2464{
2465 unsigned long flags;
2466
2467 dynamic_irq_cleanup(irq);
2468
2469 spin_lock_irqsave(&vector_lock, flags);
2470 clear_bit(irq_vector[irq], used_vectors);
2471 irq_vector[irq] = 0;
2472 spin_unlock_irqrestore(&vector_lock, flags);
2473}
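/*
 * Usage sketch for the create_irq()/destroy_irq() pair (hypothetical
 * caller, simplified error handling; my_handler and my_dev are
 * placeholders):
 */
#if 0
	int irq = create_irq();

	if (irq < 0)
		return irq;		/* -ENOSPC: no free irq/vector */

	if (request_irq(irq, my_handler, 0, "my-device", my_dev)) {
		destroy_irq(irq);
		return -EBUSY;
	}
#endif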
2474
2475/*
2476 * MSI message composition
2477 */
2478#ifdef CONFIG_PCI_MSI
2479static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
2480{
2481 int vector;
2482 unsigned dest;
2483
2484 vector = assign_irq_vector(irq);
2485 if (vector >= 0) {
2486 dest = cpu_mask_to_apicid(TARGET_CPUS);
2487
2488 msg->address_hi = MSI_ADDR_BASE_HI;
2489 msg->address_lo =
2490 MSI_ADDR_BASE_LO |
2491 ((INT_DEST_MODE == 0) ?
2492 MSI_ADDR_DEST_MODE_PHYSICAL:
2493 MSI_ADDR_DEST_MODE_LOGICAL) |
2494 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2495 MSI_ADDR_REDIRECTION_CPU:
2496 MSI_ADDR_REDIRECTION_LOWPRI) |
2497 MSI_ADDR_DEST_ID(dest);
2498
2499 msg->data =
2500 MSI_DATA_TRIGGER_EDGE |
2501 MSI_DATA_LEVEL_ASSERT |
2502 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2503 MSI_DATA_DELIVERY_FIXED:
2504 MSI_DATA_DELIVERY_LOWPRI) |
2505 MSI_DATA_VECTOR(vector);
2506 }
2507 return vector;
2508}
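/*
 * Worked example for the message composed above, assuming the standard
 * x86 MSI address/data layout: with physical destination mode, fixed
 * delivery, destination APIC ID 1 and vector 0x31 this yields
 * address_hi = 0, address_lo = 0xfee00000 | (1 << 12) = 0xfee01000,
 * and data carrying the vector (0x31) in bits 0-7 with the fixed
 * delivery, edge-trigger and level-assert flags in the higher bits.
 */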
2509
2510#ifdef CONFIG_SMP
2511static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2512{
2513 struct msi_msg msg;
2514 unsigned int dest;
2515 cpumask_t tmp;
2516 int vector;
2517
2518 cpus_and(tmp, mask, cpu_online_map);
2519 if (cpus_empty(tmp))
2520 tmp = TARGET_CPUS;
2521
2522 vector = assign_irq_vector(irq);
2523 if (vector < 0)
2524 return;
2525
2526 dest = cpu_mask_to_apicid(mask);
2527
2528 read_msi_msg(irq, &msg);
2529
2530 msg.data &= ~MSI_DATA_VECTOR_MASK;
2531 msg.data |= MSI_DATA_VECTOR(vector);
2532 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
2533 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2534
2535 write_msi_msg(irq, &msg);
2536 irq_desc[irq].affinity = mask;
2537}
2538#endif /* CONFIG_SMP */
2539
2540/*
2541 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
2542 * which implement the MSI or MSI-X Capability Structure.
2543 */
2544static struct irq_chip msi_chip = {
2545 .name = "PCI-MSI",
2546 .unmask = unmask_msi_irq,
2547 .mask = mask_msi_irq,
2548 .ack = ack_ioapic_irq,
2549#ifdef CONFIG_SMP
2550 .set_affinity = set_msi_irq_affinity,
2551#endif
2552 .retrigger = ioapic_retrigger_irq,
2553};
2554
2555int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
2556{
2557 struct msi_msg msg;
2558 int irq, ret;
2559 irq = create_irq();
2560 if (irq < 0)
2561 return irq;
2562
2563 ret = msi_compose_msg(dev, irq, &msg);
2564 if (ret < 0) {
2565 destroy_irq(irq);
2566 return ret;
2567 }
2568
2569 set_irq_msi(irq, desc);
2570 write_msi_msg(irq, &msg);
2571
2572 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
2573 "edge");
2574
2575 return 0;
2576}
2577
2578void arch_teardown_msi_irq(unsigned int irq)
2579{
2580 destroy_irq(irq);
2581}
2582
2583#endif /* CONFIG_PCI_MSI */
2584
2585/*
2586 * Hypertransport interrupt support
2587 */
2588#ifdef CONFIG_HT_IRQ
2589
2590#ifdef CONFIG_SMP
2591
2592static void target_ht_irq(unsigned int irq, unsigned int dest)
2593{
2594 struct ht_irq_msg msg;
2595 fetch_ht_irq_msg(irq, &msg);
2596
2597 msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
2598 msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
2599
2600 msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
2601 msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
2602
2603 write_ht_irq_msg(irq, &msg);
2604}
2605
2606static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2607{
2608 unsigned int dest;
2609 cpumask_t tmp;
2610
2611 cpus_and(tmp, mask, cpu_online_map);
2612 if (cpus_empty(tmp))
2613 tmp = TARGET_CPUS;
2614
2615 cpus_and(mask, tmp, CPU_MASK_ALL);
2616
2617 dest = cpu_mask_to_apicid(mask);
2618
2619 target_ht_irq(irq, dest);
2620 irq_desc[irq].affinity = mask;
2621}
2622#endif
2623
2624static struct irq_chip ht_irq_chip = {
2625 .name = "PCI-HT",
2626 .mask = mask_ht_irq,
2627 .unmask = unmask_ht_irq,
2628 .ack = ack_ioapic_irq,
2629#ifdef CONFIG_SMP
2630 .set_affinity = set_ht_irq_affinity,
2631#endif
2632 .retrigger = ioapic_retrigger_irq,
2633};
2634
2635int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2636{
2637 int vector;
2638
2639 vector = assign_irq_vector(irq);
2640 if (vector >= 0) {
2641 struct ht_irq_msg msg;
2642 unsigned dest;
2643 cpumask_t tmp;
2644
2645 cpus_clear(tmp);
2646 cpu_set(vector >> 8, tmp);
2647 dest = cpu_mask_to_apicid(tmp);
2648
2649 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
2650
2651 msg.address_lo =
2652 HT_IRQ_LOW_BASE |
2653 HT_IRQ_LOW_DEST_ID(dest) |
2654 HT_IRQ_LOW_VECTOR(vector) |
2655 ((INT_DEST_MODE == 0) ?
2656 HT_IRQ_LOW_DM_PHYSICAL :
2657 HT_IRQ_LOW_DM_LOGICAL) |
2658 HT_IRQ_LOW_RQEOI_EDGE |
2659 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2660 HT_IRQ_LOW_MT_FIXED :
2661 HT_IRQ_LOW_MT_ARBITRATED) |
2662 HT_IRQ_LOW_IRQ_MASKED;
2663
2664 write_ht_irq_msg(irq, &msg);
2665
2666 set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2667 handle_edge_irq, "edge");
2668 }
2669 return vector;
2670}
2671#endif /* CONFIG_HT_IRQ */
2672
2673/* --------------------------------------------------------------------------
2674 ACPI-based IOAPIC Configuration
2675 -------------------------------------------------------------------------- */
2676
2677#ifdef CONFIG_ACPI
2678
2679int __init io_apic_get_unique_id(int ioapic, int apic_id)
2680{
2681 union IO_APIC_reg_00 reg_00;
2682 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
2683 physid_mask_t tmp;
2684 unsigned long flags;
2685 int i = 0;
2686
2687 /*
2688 * The P4 platform supports up to 256 APIC IDs on two separate APIC
2689 * buses (one for LAPICs, one for IOAPICs), whereas its predecessors
2690 * only support up to 16 on one shared APIC bus.
2691 *
2692 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
2693 * advantage of new APIC bus architecture.
2694 */
2695
2696 if (physids_empty(apic_id_map))
2697 apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
2698
2699 spin_lock_irqsave(&ioapic_lock, flags);
2700 reg_00.raw = io_apic_read(ioapic, 0);
2701 spin_unlock_irqrestore(&ioapic_lock, flags);
2702
2703 if (apic_id >= get_physical_broadcast()) {
2704 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
2705 "%d\n", ioapic, apic_id, reg_00.bits.ID);
2706 apic_id = reg_00.bits.ID;
2707 }
2708
2709 /*
2710 * Every APIC in a system must have a unique ID or we get lots of nice
2711 * 'stuck on smp_invalidate_needed IPI wait' messages.
2712 */
2713 if (check_apicid_used(apic_id_map, apic_id)) {
2714
2715 for (i = 0; i < get_physical_broadcast(); i++) {
2716 if (!check_apicid_used(apic_id_map, i))
2717 break;
2718 }
2719
2720 if (i == get_physical_broadcast())
2721 panic("Max apic_id exceeded!\n");
2722
2723 printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
2724 "trying %d\n", ioapic, apic_id, i);
2725
2726 apic_id = i;
2727 }
2728
2729 tmp = apicid_to_cpu_present(apic_id);
2730 physids_or(apic_id_map, apic_id_map, tmp);
2731
2732 if (reg_00.bits.ID != apic_id) {
2733 reg_00.bits.ID = apic_id;
2734
2735 spin_lock_irqsave(&ioapic_lock, flags);
2736 io_apic_write(ioapic, 0, reg_00.raw);
2737 reg_00.raw = io_apic_read(ioapic, 0);
2738 spin_unlock_irqrestore(&ioapic_lock, flags);
2739
2740 /* Sanity check */
2741 if (reg_00.bits.ID != apic_id) {
2742 printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
2743 return -1;
2744 }
2745 }
2746
2747 apic_printk(APIC_VERBOSE, KERN_INFO
2748 "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
2749
2750 return apic_id;
2751}
2752
2753
2754int __init io_apic_get_version(int ioapic)
2755{
2756 union IO_APIC_reg_01 reg_01;
2757 unsigned long flags;
2758
2759 spin_lock_irqsave(&ioapic_lock, flags);
2760 reg_01.raw = io_apic_read(ioapic, 1);
2761 spin_unlock_irqrestore(&ioapic_lock, flags);
2762
2763 return reg_01.bits.version;
2764}
2765
2766
2767int __init io_apic_get_redir_entries(int ioapic)
2768{
2769 union IO_APIC_reg_01 reg_01;
2770 unsigned long flags;
2771
2772 spin_lock_irqsave(&ioapic_lock, flags);
2773 reg_01.raw = io_apic_read(ioapic, 1);
2774 spin_unlock_irqrestore(&ioapic_lock, flags);
2775
2776 return reg_01.bits.entries;
2777}
2778
2779
2780int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
2781{
2782 struct IO_APIC_route_entry entry;
2783
2784 if (!IO_APIC_IRQ(irq)) {
2785 printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n",
2786 ioapic, irq);
2787 return -EINVAL;
2788 }
2789
2790 /*
2791 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
2792 * Note that we mask (disable) IRQs now -- these get enabled when the
2793 * corresponding device driver registers for this IRQ.
2794 */
2795
2796 memset(&entry, 0, sizeof(entry));
2797
2798 entry.delivery_mode = INT_DELIVERY_MODE;
2799 entry.dest_mode = INT_DEST_MODE;
2800 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
2801 entry.trigger = edge_level;
2802 entry.polarity = active_high_low;
2803 entry.mask = 1;
2804
2805 /*
2806 * IRQs < 16 are already in the irq_2_pin[] map
2807 */
2808 if (irq >= 16)
2809 add_pin_to_irq(irq, ioapic, pin);
2810
2811 entry.vector = assign_irq_vector(irq);
2812
2813 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
2814 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
2815 mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
2816 edge_level, active_high_low);
2817
2818 ioapic_register_intr(irq, entry.vector, edge_level);
2819
2820 if (!ioapic && (irq < 16))
2821 disable_8259A_irq(irq);
2822
2823 ioapic_write_entry(ioapic, pin, entry);
2824
2825 return 0;
2826}
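/*
 * Usage sketch (how the ACPI GSI code is expected to call this; the
 * literal values are illustrative):
 */
#if 0
	/* level-triggered, active-low PCI interrupt on this ioapic/pin */
	io_apic_set_pci_routing(ioapic, pin, gsi, 1, 1);
#endif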
2827
2828int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
2829{
2830 int i;
2831
2832 if (skip_ioapic_setup)
2833 return -1;
2834
2835 for (i = 0; i < mp_irq_entries; i++)
2836 if (mp_irqs[i].mp_irqtype == mp_INT &&
2837 mp_irqs[i].mp_srcbusirq == bus_irq)
2838 break;
2839 if (i >= mp_irq_entries)
2840 return -1;
2841
2842 *trigger = irq_trigger(i);
2843 *polarity = irq_polarity(i);
2844 return 0;
2845}
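/*
 * Usage sketch (hypothetical caller): look up how ISA IRQ 8 (the RTC)
 * is wired according to the MP/ACPI interrupt source entries.
 */
#if 0
	int trigger, polarity;

	if (acpi_get_override_irq(8, &trigger, &polarity) == 0) {
		/* an entry exists; honour the reported trigger/polarity */
	}
#endif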
2846
2847#endif /* CONFIG_ACPI */
2848
2849static int __init parse_disable_timer_pin_1(char *arg)
2850{
2851 disable_timer_pin_1 = 1;
2852 return 0;
2853}
2854early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
2855
2856static int __init parse_enable_timer_pin_1(char *arg)
2857{
2858 disable_timer_pin_1 = -1;
2859 return 0;
2860}
2861early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
2862
2863static int __init parse_noapic(char *arg)
2864{
2865 /* disable IO-APIC */
2866 disable_ioapic_setup();
2867 return 0;
2868}
2869early_param("noapic", parse_noapic);
2870
2871void __init ioapic_init_mappings(void)
2872{
2873 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
2874 int i;
2875
2876 for (i = 0; i < nr_ioapics; i++) {
2877 if (smp_found_config) {
2878 ioapic_phys = mp_ioapics[i].mp_apicaddr;
2879 if (!ioapic_phys) {
2880 printk(KERN_ERR
2881 "WARNING: bogus zero IO-APIC "
2882 "address found in MPTABLE, "
2883 "disabling IO/APIC support!\n");
2884 smp_found_config = 0;
2885 skip_ioapic_setup = 1;
2886 goto fake_ioapic_page;
2887 }
2888 } else {
2889fake_ioapic_page:
2890 ioapic_phys = (unsigned long)
2891 alloc_bootmem_pages(PAGE_SIZE);
2892 ioapic_phys = __pa(ioapic_phys);
2893 }
2894 set_fixmap_nocache(idx, ioapic_phys);
2895 printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
2896 __fix_to_virt(idx), ioapic_phys);
2897 idx++;
2898 }
2899}
2900
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 5921e5f0a640..720d2607aacb 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -92,6 +92,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
92 DMI_MATCH(DMI_BOARD_NAME, "30BF") 92 DMI_MATCH(DMI_BOARD_NAME, "30BF")
93 } 93 }
94 }, 94 },
95 {
96 .callback = dmi_io_delay_0xed_port,
97 .ident = "Presario F700",
98 .matches = {
99 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
100 DMI_MATCH(DMI_BOARD_NAME, "30D3")
101 }
102 },
95 { } 103 { }
96}; 104};
97 105
@@ -103,6 +111,9 @@ void __init io_delay_init(void)
103 111
104static int __init io_delay_param(char *s) 112static int __init io_delay_param(char *s)
105{ 113{
114 if (!s)
115 return -EINVAL;
116
106 if (!strcmp(s, "0x80")) 117 if (!strcmp(s, "0x80"))
107 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; 118 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
108 else if (!strcmp(s, "0xed")) 119 else if (!strcmp(s, "0xed"))
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 50e5e4a31c85..191914302744 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,6 +14,7 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/thread_info.h> 15#include <linux/thread_info.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <asm/syscalls.h>
17 18
18/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ 19/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
19static void set_bitmap(unsigned long *bitmap, unsigned int base, 20static void set_bitmap(unsigned long *bitmap, unsigned int base,
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index 9d98cda39ad9..f1c688e46f35 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -20,6 +20,8 @@
20 20
21#ifdef CONFIG_X86_32 21#ifdef CONFIG_X86_32
22#include <mach_apic.h> 22#include <mach_apic.h>
23#include <mach_ipi.h>
24
23/* 25/*
24 * the following functions deal with sending IPIs between CPUs. 26 * the following functions deal with sending IPIs between CPUs.
25 * 27 *
@@ -70,7 +72,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
70 /* 72 /*
71 * Send the IPI. The write to APIC_ICR fires this off. 73 * Send the IPI. The write to APIC_ICR fires this off.
72 */ 74 */
73 apic_write_around(APIC_ICR, cfg); 75 apic_write(APIC_ICR, cfg);
74} 76}
75 77
76void send_IPI_self(int vector) 78void send_IPI_self(int vector)
@@ -98,7 +100,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
98 * prepare target chip field 100 * prepare target chip field
99 */ 101 */
100 cfg = __prepare_ICR2(mask); 102 cfg = __prepare_ICR2(mask);
101 apic_write_around(APIC_ICR2, cfg); 103 apic_write(APIC_ICR2, cfg);
102 104
103 /* 105 /*
104 * program the ICR 106 * program the ICR
@@ -108,7 +110,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
108 /* 110 /*
109 * Send the IPI. The write to APIC_ICR fires this off. 111 * Send the IPI. The write to APIC_ICR fires this off.
110 */ 112 */
111 apic_write_around(APIC_ICR, cfg); 113 apic_write(APIC_ICR, cfg);
112} 114}
113 115
114/* 116/*
@@ -147,7 +149,6 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
147} 149}
148 150
149/* must come after the send_IPI functions above for inlining */ 151/* must come after the send_IPI functions above for inlining */
150#include <mach_ipi.h>
151static int convert_apicid_to_cpu(int apic_id) 152static int convert_apicid_to_cpu(int apic_id)
152{ 153{
153 int i; 154 int i;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
new file mode 100644
index 000000000000..d1d4dc52f649
--- /dev/null
+++ b/arch/x86/kernel/irq.c
@@ -0,0 +1,189 @@
1/*
2 * Common interrupt code for 32 and 64 bit
3 */
4#include <linux/cpu.h>
5#include <linux/interrupt.h>
6#include <linux/kernel_stat.h>
7#include <linux/seq_file.h>
8
9#include <asm/apic.h>
10#include <asm/io_apic.h>
11#include <asm/smp.h>
12
13atomic_t irq_err_count;
14
15/*
16 * 'what should we do if we get a hw irq event on an illegal vector'.
17 * each architecture has to answer this themselves.
18 */
19void ack_bad_irq(unsigned int irq)
20{
21 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
22
23#ifdef CONFIG_X86_LOCAL_APIC
24 /*
25 * Currently unexpected vectors happen only on SMP and APIC.
26 * We _must_ ack these because every local APIC has only N
27 * irq slots per priority level, and a 'hanging, unacked' IRQ
28 * holds up an irq slot - in excessive cases (when multiple
29 * unexpected vectors occur) that might lock up the APIC
30 * completely.
31 * But only ack when the APIC is enabled -AK
32 */
33 if (cpu_has_apic)
34 ack_APIC_irq();
35#endif
36}
37
38#ifdef CONFIG_X86_32
39# define irq_stats(x) (&per_cpu(irq_stat, x))
40#else
41# define irq_stats(x) cpu_pda(x)
42#endif
43/*
44 * /proc/interrupts printing:
45 */
46static int show_other_interrupts(struct seq_file *p)
47{
48 int j;
49
50 seq_printf(p, "NMI: ");
51 for_each_online_cpu(j)
52 seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
53 seq_printf(p, " Non-maskable interrupts\n");
54#ifdef CONFIG_X86_LOCAL_APIC
55 seq_printf(p, "LOC: ");
56 for_each_online_cpu(j)
57 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
58 seq_printf(p, " Local timer interrupts\n");
59#endif
60#ifdef CONFIG_SMP
61 seq_printf(p, "RES: ");
62 for_each_online_cpu(j)
63 seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
64 seq_printf(p, " Rescheduling interrupts\n");
65 seq_printf(p, "CAL: ");
66 for_each_online_cpu(j)
67 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
68 seq_printf(p, " Function call interrupts\n");
69 seq_printf(p, "TLB: ");
70 for_each_online_cpu(j)
71 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
72 seq_printf(p, " TLB shootdowns\n");
73#endif
74#ifdef CONFIG_X86_MCE
75 seq_printf(p, "TRM: ");
76 for_each_online_cpu(j)
77 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
78 seq_printf(p, " Thermal event interrupts\n");
79# ifdef CONFIG_X86_64
80 seq_printf(p, "THR: ");
81 for_each_online_cpu(j)
82 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
83 seq_printf(p, " Threshold APIC interrupts\n");
84# endif
85#endif
86#ifdef CONFIG_X86_LOCAL_APIC
87 seq_printf(p, "SPU: ");
88 for_each_online_cpu(j)
89 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
90 seq_printf(p, " Spurious interrupts\n");
91#endif
92 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
93#if defined(CONFIG_X86_IO_APIC)
94 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
95#endif
96 return 0;
97}
98
99int show_interrupts(struct seq_file *p, void *v)
100{
101 unsigned long flags, any_count = 0;
102 int i = *(loff_t *) v, j;
103 struct irqaction *action;
104 struct irq_desc *desc;
105
106 if (i > nr_irqs)
107 return 0;
108
109 if (i == nr_irqs)
110 return show_other_interrupts(p);
111
112 /* print header */
113 if (i == 0) {
114 seq_printf(p, " ");
115 for_each_online_cpu(j)
116 seq_printf(p, "CPU%-8d", j);
117 seq_putc(p, '\n');
118 }
119
120 desc = irq_to_desc(i);
121 spin_lock_irqsave(&desc->lock, flags);
122#ifndef CONFIG_SMP
123 any_count = kstat_irqs(i);
124#else
125 for_each_online_cpu(j)
126 any_count |= kstat_irqs_cpu(i, j);
127#endif
128 action = desc->action;
129 if (!action && !any_count)
130 goto out;
131
132 seq_printf(p, "%3d: ", i);
133#ifndef CONFIG_SMP
134 seq_printf(p, "%10u ", kstat_irqs(i));
135#else
136 for_each_online_cpu(j)
137 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
138#endif
139 seq_printf(p, " %8s", desc->chip->name);
140 seq_printf(p, "-%-8s", desc->name);
141
142 if (action) {
143 seq_printf(p, " %s", action->name);
144 while ((action = action->next) != NULL)
145 seq_printf(p, ", %s", action->name);
146 }
147
148 seq_putc(p, '\n');
149out:
150 spin_unlock_irqrestore(&desc->lock, flags);
151 return 0;
152}
153
154/*
155 * /proc/stat helpers
156 */
157u64 arch_irq_stat_cpu(unsigned int cpu)
158{
159 u64 sum = irq_stats(cpu)->__nmi_count;
160
161#ifdef CONFIG_X86_LOCAL_APIC
162 sum += irq_stats(cpu)->apic_timer_irqs;
163#endif
164#ifdef CONFIG_SMP
165 sum += irq_stats(cpu)->irq_resched_count;
166 sum += irq_stats(cpu)->irq_call_count;
167 sum += irq_stats(cpu)->irq_tlb_count;
168#endif
169#ifdef CONFIG_X86_MCE
170 sum += irq_stats(cpu)->irq_thermal_count;
171# ifdef CONFIG_X86_64
172 sum += irq_stats(cpu)->irq_threshold_count;
173#endif
174#endif
175#ifdef CONFIG_X86_LOCAL_APIC
176 sum += irq_stats(cpu)->irq_spurious_count;
177#endif
178 return sum;
179}
180
181u64 arch_irq_stat(void)
182{
183 u64 sum = atomic_read(&irq_err_count);
184
185#ifdef CONFIG_X86_IO_APIC
186 sum += atomic_read(&irq_mis_count);
187#endif
188 return sum;
189}
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 47a6f6f12478..a51382672de0 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -25,29 +25,6 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
25DEFINE_PER_CPU(struct pt_regs *, irq_regs); 25DEFINE_PER_CPU(struct pt_regs *, irq_regs);
26EXPORT_PER_CPU_SYMBOL(irq_regs); 26EXPORT_PER_CPU_SYMBOL(irq_regs);
27 27
28/*
29 * 'what should we do if we get a hw irq event on an illegal vector'.
30 * each architecture has to answer this themselves.
31 */
32void ack_bad_irq(unsigned int irq)
33{
34 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
35
36#ifdef CONFIG_X86_LOCAL_APIC
37 /*
38 * Currently unexpected vectors happen only on SMP and APIC.
39 * We _must_ ack these because every local APIC has only N
40 * irq slots per priority level, and a 'hanging, unacked' IRQ
41 * holds up an irq slot - in excessive cases (when multiple
42 * unexpected vectors occur) that might lock up the APIC
43 * completely.
44 * But only ack when the APIC is enabled -AK
45 */
46 if (cpu_has_apic)
47 ack_APIC_irq();
48#endif
49}
50
51#ifdef CONFIG_DEBUG_STACKOVERFLOW 28#ifdef CONFIG_DEBUG_STACKOVERFLOW
52/* Debugging check for stack overflow: is there less than 1KB free? */ 29/* Debugging check for stack overflow: is there less than 1KB free? */
53static int check_stack_overflow(void) 30static int check_stack_overflow(void)
@@ -83,11 +60,8 @@ union irq_ctx {
83static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 60static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
84static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 61static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
85 62
86static char softirq_stack[NR_CPUS * THREAD_SIZE] 63static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
87 __attribute__((__section__(".bss.page_aligned"))); 64static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
88
89static char hardirq_stack[NR_CPUS * THREAD_SIZE]
90 __attribute__((__section__(".bss.page_aligned")));
91 65
92static void call_on_stack(void *func, void *stack) 66static void call_on_stack(void *func, void *stack)
93{ 67{
@@ -226,20 +200,25 @@ unsigned int do_IRQ(struct pt_regs *regs)
226{ 200{
227 struct pt_regs *old_regs; 201 struct pt_regs *old_regs;
228 /* high bit used in ret_from_ code */ 202 /* high bit used in ret_from_ code */
229 int overflow, irq = ~regs->orig_ax; 203 int overflow;
230 struct irq_desc *desc = irq_desc + irq; 204 unsigned vector = ~regs->orig_ax;
205 struct irq_desc *desc;
206 unsigned irq;
231 207
232 if (unlikely((unsigned)irq >= NR_IRQS)) {
233 printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
234 __func__, irq);
235 BUG();
236 }
237 208
238 old_regs = set_irq_regs(regs); 209 old_regs = set_irq_regs(regs);
239 irq_enter(); 210 irq_enter();
211 irq = __get_cpu_var(vector_irq)[vector];
240 212
241 overflow = check_stack_overflow(); 213 overflow = check_stack_overflow();
242 214
215 desc = irq_to_desc(irq);
216 if (unlikely(!desc)) {
217 printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n",
218 __func__, irq, vector, smp_processor_id());
219 BUG();
220 }
221
243 if (!execute_on_irq_stack(overflow, desc, irq)) { 222 if (!execute_on_irq_stack(overflow, desc, irq)) {
244 if (unlikely(overflow)) 223 if (unlikely(overflow))
245 print_stack_overflow(); 224 print_stack_overflow();
@@ -251,146 +230,6 @@ unsigned int do_IRQ(struct pt_regs *regs)
251 return 1; 230 return 1;
252} 231}
253 232
254/*
255 * Interrupt statistics:
256 */
257
258atomic_t irq_err_count;
259
260/*
261 * /proc/interrupts printing:
262 */
263
264int show_interrupts(struct seq_file *p, void *v)
265{
266 int i = *(loff_t *) v, j;
267 struct irqaction * action;
268 unsigned long flags;
269
270 if (i == 0) {
271 seq_printf(p, " ");
272 for_each_online_cpu(j)
273 seq_printf(p, "CPU%-8d",j);
274 seq_putc(p, '\n');
275 }
276
277 if (i < NR_IRQS) {
278 unsigned any_count = 0;
279
280 spin_lock_irqsave(&irq_desc[i].lock, flags);
281#ifndef CONFIG_SMP
282 any_count = kstat_irqs(i);
283#else
284 for_each_online_cpu(j)
285 any_count |= kstat_cpu(j).irqs[i];
286#endif
287 action = irq_desc[i].action;
288 if (!action && !any_count)
289 goto skip;
290 seq_printf(p, "%3d: ",i);
291#ifndef CONFIG_SMP
292 seq_printf(p, "%10u ", kstat_irqs(i));
293#else
294 for_each_online_cpu(j)
295 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
296#endif
297 seq_printf(p, " %8s", irq_desc[i].chip->name);
298 seq_printf(p, "-%-8s", irq_desc[i].name);
299
300 if (action) {
301 seq_printf(p, " %s", action->name);
302 while ((action = action->next) != NULL)
303 seq_printf(p, ", %s", action->name);
304 }
305
306 seq_putc(p, '\n');
307skip:
308 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
309 } else if (i == NR_IRQS) {
310 seq_printf(p, "NMI: ");
311 for_each_online_cpu(j)
312 seq_printf(p, "%10u ", nmi_count(j));
313 seq_printf(p, " Non-maskable interrupts\n");
314#ifdef CONFIG_X86_LOCAL_APIC
315 seq_printf(p, "LOC: ");
316 for_each_online_cpu(j)
317 seq_printf(p, "%10u ",
318 per_cpu(irq_stat,j).apic_timer_irqs);
319 seq_printf(p, " Local timer interrupts\n");
320#endif
321#ifdef CONFIG_SMP
322 seq_printf(p, "RES: ");
323 for_each_online_cpu(j)
324 seq_printf(p, "%10u ",
325 per_cpu(irq_stat,j).irq_resched_count);
326 seq_printf(p, " Rescheduling interrupts\n");
327 seq_printf(p, "CAL: ");
328 for_each_online_cpu(j)
329 seq_printf(p, "%10u ",
330 per_cpu(irq_stat,j).irq_call_count);
331 seq_printf(p, " function call interrupts\n");
332 seq_printf(p, "TLB: ");
333 for_each_online_cpu(j)
334 seq_printf(p, "%10u ",
335 per_cpu(irq_stat,j).irq_tlb_count);
336 seq_printf(p, " TLB shootdowns\n");
337#endif
338#ifdef CONFIG_X86_MCE
339 seq_printf(p, "TRM: ");
340 for_each_online_cpu(j)
341 seq_printf(p, "%10u ",
342 per_cpu(irq_stat,j).irq_thermal_count);
343 seq_printf(p, " Thermal event interrupts\n");
344#endif
345#ifdef CONFIG_X86_LOCAL_APIC
346 seq_printf(p, "SPU: ");
347 for_each_online_cpu(j)
348 seq_printf(p, "%10u ",
349 per_cpu(irq_stat,j).irq_spurious_count);
350 seq_printf(p, " Spurious interrupts\n");
351#endif
352 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
353#if defined(CONFIG_X86_IO_APIC)
354 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
355#endif
356 }
357 return 0;
358}
359
360/*
361 * /proc/stat helpers
362 */
363u64 arch_irq_stat_cpu(unsigned int cpu)
364{
365 u64 sum = nmi_count(cpu);
366
367#ifdef CONFIG_X86_LOCAL_APIC
368 sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
369#endif
370#ifdef CONFIG_SMP
371 sum += per_cpu(irq_stat, cpu).irq_resched_count;
372 sum += per_cpu(irq_stat, cpu).irq_call_count;
373 sum += per_cpu(irq_stat, cpu).irq_tlb_count;
374#endif
375#ifdef CONFIG_X86_MCE
376 sum += per_cpu(irq_stat, cpu).irq_thermal_count;
377#endif
378#ifdef CONFIG_X86_LOCAL_APIC
379 sum += per_cpu(irq_stat, cpu).irq_spurious_count;
380#endif
381 return sum;
382}
383
384u64 arch_irq_stat(void)
385{
386 u64 sum = atomic_read(&irq_err_count);
387
388#ifdef CONFIG_X86_IO_APIC
389 sum += atomic_read(&irq_mis_count);
390#endif
391 return sum;
392}
393
394#ifdef CONFIG_HOTPLUG_CPU 233#ifdef CONFIG_HOTPLUG_CPU
395#include <mach_apic.h> 234#include <mach_apic.h>
396 235
@@ -398,20 +237,22 @@ void fixup_irqs(cpumask_t map)
398{ 237{
399 unsigned int irq; 238 unsigned int irq;
400 static int warned; 239 static int warned;
240 struct irq_desc *desc;
401 241
402 for (irq = 0; irq < NR_IRQS; irq++) { 242 for_each_irq_desc(irq, desc) {
403 cpumask_t mask; 243 cpumask_t mask;
244
404 if (irq == 2) 245 if (irq == 2)
405 continue; 246 continue;
406 247
407 cpus_and(mask, irq_desc[irq].affinity, map); 248 cpus_and(mask, desc->affinity, map);
408 if (any_online_cpu(mask) == NR_CPUS) { 249 if (any_online_cpu(mask) == NR_CPUS) {
409 printk("Breaking affinity for irq %i\n", irq); 250 printk("Breaking affinity for irq %i\n", irq);
410 mask = map; 251 mask = map;
411 } 252 }
412 if (irq_desc[irq].chip->set_affinity) 253 if (desc->chip->set_affinity)
413 irq_desc[irq].chip->set_affinity(irq, mask); 254 desc->chip->set_affinity(irq, mask);
414 else if (irq_desc[irq].action && !(warned++)) 255 else if (desc->action && !(warned++))
415 printk("Cannot set affinity for irq %i\n", irq); 256 printk("Cannot set affinity for irq %i\n", irq);
416 } 257 }
417 258
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1f78b238d8d2..60eb84eb77a0 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,28 +18,6 @@
18#include <asm/idle.h> 18#include <asm/idle.h>
19#include <asm/smp.h> 19#include <asm/smp.h>
20 20
21atomic_t irq_err_count;
22
23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'.
25 * each architecture has to answer this themselves.
26 */
27void ack_bad_irq(unsigned int irq)
28{
29 printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq);
30 /*
31 * Currently unexpected vectors happen only on SMP and APIC.
32 * We _must_ ack these because every local APIC has only N
33 * irq slots per priority level, and a 'hanging, unacked' IRQ
34 * holds up an irq slot - in excessive cases (when multiple
35 * unexpected vectors occur) that might lock up the APIC
36 * completely.
37 * But don't ack when the APIC is disabled. -AK
38 */
39 if (!disable_apic)
40 ack_APIC_irq();
41}
42
43#ifdef CONFIG_DEBUG_STACKOVERFLOW 21#ifdef CONFIG_DEBUG_STACKOVERFLOW
44/* 22/*
45 * Probabilistic stack overflow check: 23 * Probabilistic stack overflow check:
@@ -65,122 +43,6 @@ static inline void stack_overflow_check(struct pt_regs *regs)
65#endif 43#endif
66 44
67/* 45/*
68 * Generic, controller-independent functions:
69 */
70
71int show_interrupts(struct seq_file *p, void *v)
72{
73 int i = *(loff_t *) v, j;
74 struct irqaction * action;
75 unsigned long flags;
76
77 if (i == 0) {
78 seq_printf(p, " ");
79 for_each_online_cpu(j)
80 seq_printf(p, "CPU%-8d",j);
81 seq_putc(p, '\n');
82 }
83
84 if (i < NR_IRQS) {
85 unsigned any_count = 0;
86
87 spin_lock_irqsave(&irq_desc[i].lock, flags);
88#ifndef CONFIG_SMP
89 any_count = kstat_irqs(i);
90#else
91 for_each_online_cpu(j)
92 any_count |= kstat_cpu(j).irqs[i];
93#endif
94 action = irq_desc[i].action;
95 if (!action && !any_count)
96 goto skip;
97 seq_printf(p, "%3d: ",i);
98#ifndef CONFIG_SMP
99 seq_printf(p, "%10u ", kstat_irqs(i));
100#else
101 for_each_online_cpu(j)
102 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
103#endif
104 seq_printf(p, " %8s", irq_desc[i].chip->name);
105 seq_printf(p, "-%-8s", irq_desc[i].name);
106
107 if (action) {
108 seq_printf(p, " %s", action->name);
109 while ((action = action->next) != NULL)
110 seq_printf(p, ", %s", action->name);
111 }
112 seq_putc(p, '\n');
113skip:
114 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
115 } else if (i == NR_IRQS) {
116 seq_printf(p, "NMI: ");
117 for_each_online_cpu(j)
118 seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
119 seq_printf(p, " Non-maskable interrupts\n");
120 seq_printf(p, "LOC: ");
121 for_each_online_cpu(j)
122 seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
123 seq_printf(p, " Local timer interrupts\n");
124#ifdef CONFIG_SMP
125 seq_printf(p, "RES: ");
126 for_each_online_cpu(j)
127 seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count);
128 seq_printf(p, " Rescheduling interrupts\n");
129 seq_printf(p, "CAL: ");
130 for_each_online_cpu(j)
131 seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
132 seq_printf(p, " function call interrupts\n");
133 seq_printf(p, "TLB: ");
134 for_each_online_cpu(j)
135 seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
136 seq_printf(p, " TLB shootdowns\n");
137#endif
138#ifdef CONFIG_X86_MCE
139 seq_printf(p, "TRM: ");
140 for_each_online_cpu(j)
141 seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
142 seq_printf(p, " Thermal event interrupts\n");
143 seq_printf(p, "THR: ");
144 for_each_online_cpu(j)
145 seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
146 seq_printf(p, " Threshold APIC interrupts\n");
147#endif
148 seq_printf(p, "SPU: ");
149 for_each_online_cpu(j)
150 seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
151 seq_printf(p, " Spurious interrupts\n");
152 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
153 }
154 return 0;
155}
156
157/*
158 * /proc/stat helpers
159 */
160u64 arch_irq_stat_cpu(unsigned int cpu)
161{
162 u64 sum = cpu_pda(cpu)->__nmi_count;
163
164 sum += cpu_pda(cpu)->apic_timer_irqs;
165#ifdef CONFIG_SMP
166 sum += cpu_pda(cpu)->irq_resched_count;
167 sum += cpu_pda(cpu)->irq_call_count;
168 sum += cpu_pda(cpu)->irq_tlb_count;
169#endif
170#ifdef CONFIG_X86_MCE
171 sum += cpu_pda(cpu)->irq_thermal_count;
172 sum += cpu_pda(cpu)->irq_threshold_count;
173#endif
174 sum += cpu_pda(cpu)->irq_spurious_count;
175 return sum;
176}
177
178u64 arch_irq_stat(void)
179{
180 return atomic_read(&irq_err_count);
181}
182
183/*
184 * do_IRQ handles all normal device IRQ's (the special 46 * do_IRQ handles all normal device IRQ's (the special
185 * SMP cross-CPU interrupts have their own specific 47 * SMP cross-CPU interrupts have their own specific
186 * handlers). 48 * handlers).
@@ -188,6 +50,7 @@ u64 arch_irq_stat(void)
188asmlinkage unsigned int do_IRQ(struct pt_regs *regs) 50asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
189{ 51{
190 struct pt_regs *old_regs = set_irq_regs(regs); 52 struct pt_regs *old_regs = set_irq_regs(regs);
53 struct irq_desc *desc;
191 54
192 /* high bit used in ret_from_ code */ 55 /* high bit used in ret_from_ code */
193 unsigned vector = ~regs->orig_ax; 56 unsigned vector = ~regs->orig_ax;
@@ -201,8 +64,9 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
201 stack_overflow_check(regs); 64 stack_overflow_check(regs);
202#endif 65#endif
203 66
204 if (likely(irq < NR_IRQS)) 67 desc = irq_to_desc(irq);
205 generic_handle_irq(irq); 68 if (likely(desc))
69 generic_handle_irq_desc(irq, desc);
206 else { 70 else {
207 if (!disable_apic) 71 if (!disable_apic)
208 ack_APIC_irq(); 72 ack_APIC_irq();
@@ -223,8 +87,9 @@ void fixup_irqs(cpumask_t map)
223{ 87{
224 unsigned int irq; 88 unsigned int irq;
225 static int warned; 89 static int warned;
90 struct irq_desc *desc;
226 91
227 for (irq = 0; irq < NR_IRQS; irq++) { 92 for_each_irq_desc(irq, desc) {
228 cpumask_t mask; 93 cpumask_t mask;
229 int break_affinity = 0; 94 int break_affinity = 0;
230 int set_affinity = 1; 95 int set_affinity = 1;
@@ -233,32 +98,32 @@ void fixup_irqs(cpumask_t map)
233 continue; 98 continue;
234 99
235 /* interrupt's are disabled at this point */ 100 /* interrupt's are disabled at this point */
236 spin_lock(&irq_desc[irq].lock); 101 spin_lock(&desc->lock);
237 102
238 if (!irq_has_action(irq) || 103 if (!irq_has_action(irq) ||
239 cpus_equal(irq_desc[irq].affinity, map)) { 104 cpus_equal(desc->affinity, map)) {
240 spin_unlock(&irq_desc[irq].lock); 105 spin_unlock(&desc->lock);
241 continue; 106 continue;
242 } 107 }
243 108
244 cpus_and(mask, irq_desc[irq].affinity, map); 109 cpus_and(mask, desc->affinity, map);
245 if (cpus_empty(mask)) { 110 if (cpus_empty(mask)) {
246 break_affinity = 1; 111 break_affinity = 1;
247 mask = map; 112 mask = map;
248 } 113 }
249 114
250 if (irq_desc[irq].chip->mask) 115 if (desc->chip->mask)
251 irq_desc[irq].chip->mask(irq); 116 desc->chip->mask(irq);
252 117
253 if (irq_desc[irq].chip->set_affinity) 118 if (desc->chip->set_affinity)
254 irq_desc[irq].chip->set_affinity(irq, mask); 119 desc->chip->set_affinity(irq, mask);
255 else if (!(warned++)) 120 else if (!(warned++))
256 set_affinity = 0; 121 set_affinity = 0;
257 122
258 if (irq_desc[irq].chip->unmask) 123 if (desc->chip->unmask)
259 irq_desc[irq].chip->unmask(irq); 124 desc->chip->unmask(irq);
260 125
261 spin_unlock(&irq_desc[irq].lock); 126 spin_unlock(&desc->lock);
262 127
263 if (break_affinity && set_affinity) 128 if (break_affinity && set_affinity)
264 printk("Broke affinity for irq %i\n", irq); 129 printk("Broke affinity for irq %i\n", irq);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index d66914287ee1..845aa9803e80 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -69,11 +69,48 @@ void __init init_ISA_irqs (void)
69 * 16 old-style INTA-cycle interrupts: 69 * 16 old-style INTA-cycle interrupts:
70 */ 70 */
71 for (i = 0; i < 16; i++) { 71 for (i = 0; i < 16; i++) {
72 /* first time call this irq_desc */
73 struct irq_desc *desc = irq_to_desc(i);
74
75 desc->status = IRQ_DISABLED;
76 desc->action = NULL;
77 desc->depth = 1;
78
72 set_irq_chip_and_handler_name(i, &i8259A_chip, 79 set_irq_chip_and_handler_name(i, &i8259A_chip,
73 handle_level_irq, "XT"); 80 handle_level_irq, "XT");
74 } 81 }
75} 82}
76 83
84/*
85 * IRQ2 is cascade interrupt to second interrupt controller
86 */
87static struct irqaction irq2 = {
88 .handler = no_action,
89 .mask = CPU_MASK_NONE,
90 .name = "cascade",
91};
92
93DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
94 [0 ... IRQ0_VECTOR - 1] = -1,
95 [IRQ0_VECTOR] = 0,
96 [IRQ1_VECTOR] = 1,
97 [IRQ2_VECTOR] = 2,
98 [IRQ3_VECTOR] = 3,
99 [IRQ4_VECTOR] = 4,
100 [IRQ5_VECTOR] = 5,
101 [IRQ6_VECTOR] = 6,
102 [IRQ7_VECTOR] = 7,
103 [IRQ8_VECTOR] = 8,
104 [IRQ9_VECTOR] = 9,
105 [IRQ10_VECTOR] = 10,
106 [IRQ11_VECTOR] = 11,
107 [IRQ12_VECTOR] = 12,
108 [IRQ13_VECTOR] = 13,
109 [IRQ14_VECTOR] = 14,
110 [IRQ15_VECTOR] = 15,
111 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
112};
113
77/* Overridden in paravirt.c */ 114/* Overridden in paravirt.c */
78void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 115void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
79 116
@@ -89,15 +126,50 @@ void __init native_init_IRQ(void)
89 * us. (some of these will be overridden and become 126 * us. (some of these will be overridden and become
90 * 'special' SMP interrupts) 127 * 'special' SMP interrupts)
91 */ 128 */
92 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { 129 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
93 int vector = FIRST_EXTERNAL_VECTOR + i;
94 if (i >= NR_IRQS)
95 break;
96 /* SYSCALL_VECTOR was reserved in trap_init. */ 130 /* SYSCALL_VECTOR was reserved in trap_init. */
97 if (!test_bit(vector, used_vectors)) 131 if (i != SYSCALL_VECTOR)
98 set_intr_gate(vector, interrupt[i]); 132 set_intr_gate(i, interrupt[i]);
99 } 133 }
100 134
135
136#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
137 /*
138 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
139 * IPI, driven by wakeup.
140 */
141 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
142
143 /* IPI for invalidation */
144 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
145
146 /* IPI for generic function call */
147 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
148
149 /* IPI for single call function */
150 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
151
152 /* Low priority IPI to cleanup after moving an irq */
153 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
154#endif
155
156#ifdef CONFIG_X86_LOCAL_APIC
157 /* self generated IPI for local APIC timer */
158 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
159
160 /* IPI vectors for APIC spurious and error interrupts */
161 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
162 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
163#endif
164
165#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
166 /* thermal monitor LVT interrupt */
167 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
168#endif
169
170 if (!acpi_ioapic)
171 setup_irq(2, &irq2);
172
101 /* setup after call gates are initialised (usually add in 173 /* setup after call gates are initialised (usually add in
102 * the architecture specific gates) 174 * the architecture specific gates)
103 */ 175 */
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 0373e88de95a..ff0235391285 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -43,10 +43,11 @@
43 43
44#define BUILD_IRQ(nr) \ 44#define BUILD_IRQ(nr) \
45 asmlinkage void IRQ_NAME(nr); \ 45 asmlinkage void IRQ_NAME(nr); \
46 asm("\n.p2align\n" \ 46 asm("\n.text\n.p2align\n" \
47 "IRQ" #nr "_interrupt:\n\t" \ 47 "IRQ" #nr "_interrupt:\n\t" \
48 "push $~(" #nr ") ; " \ 48 "push $~(" #nr ") ; " \
49 "jmp common_interrupt"); 49 "jmp common_interrupt\n" \
50 ".previous");
50 51
51#define BI(x,y) \ 52#define BI(x,y) \
52 BUILD_IRQ(x##y) 53 BUILD_IRQ(x##y)
@@ -134,51 +135,33 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
134 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 135 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
135}; 136};
136 137
137static void __init init_ISA_irqs (void) 138void __init init_ISA_irqs(void)
138{ 139{
139 int i; 140 int i;
140 141
141 init_bsp_APIC(); 142 init_bsp_APIC();
142 init_8259A(0); 143 init_8259A(0);
143 144
144 for (i = 0; i < NR_IRQS; i++) { 145 for (i = 0; i < 16; i++) {
145 irq_desc[i].status = IRQ_DISABLED; 146 /* first time call this irq_desc */
146 irq_desc[i].action = NULL; 147 struct irq_desc *desc = irq_to_desc(i);
147 irq_desc[i].depth = 1;
148 148
149 if (i < 16) { 149 desc->status = IRQ_DISABLED;
150 /* 150 desc->action = NULL;
151 * 16 old-style INTA-cycle interrupts: 151 desc->depth = 1;
152 */ 152
153 set_irq_chip_and_handler_name(i, &i8259A_chip, 153 /*
154 * 16 old-style INTA-cycle interrupts:
155 */
156 set_irq_chip_and_handler_name(i, &i8259A_chip,
154 handle_level_irq, "XT"); 157 handle_level_irq, "XT");
155 } else {
156 /*
157 * 'high' PCI IRQs filled in on demand
158 */
159 irq_desc[i].chip = &no_irq_chip;
160 }
161 } 158 }
162} 159}
163 160
164void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 161void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
165 162
166void __init native_init_IRQ(void) 163static void __init smp_intr_init(void)
167{ 164{
168 int i;
169
170 init_ISA_irqs();
171 /*
172 * Cover the whole vector space, no vector can escape
173 * us. (some of these will be overridden and become
174 * 'special' SMP interrupts)
175 */
176 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
177 int vector = FIRST_EXTERNAL_VECTOR + i;
178 if (vector != IA32_SYSCALL_VECTOR)
179 set_intr_gate(vector, interrupt[i]);
180 }
181
182#ifdef CONFIG_SMP 165#ifdef CONFIG_SMP
183 /* 166 /*
184 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 167 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
@@ -206,6 +189,12 @@ void __init native_init_IRQ(void)
206 /* Low priority IPI to cleanup after moving an irq */ 189 /* Low priority IPI to cleanup after moving an irq */
207 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 190 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
208#endif 191#endif
192}
193
194static void __init apic_intr_init(void)
195{
196 smp_intr_init();
197
209 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 198 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
210 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 199 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
211 200
@@ -215,6 +204,25 @@ void __init native_init_IRQ(void)
215 /* IPI vectors for APIC spurious and error interrupts */ 204 /* IPI vectors for APIC spurious and error interrupts */
216 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 205 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
217 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 206 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
207}
208
209void __init native_init_IRQ(void)
210{
211 int i;
212
213 init_ISA_irqs();
214 /*
215 * Cover the whole vector space, no vector can escape
216 * us. (some of these will be overridden and become
217 * 'special' SMP interrupts)
218 */
219 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
220 int vector = FIRST_EXTERNAL_VECTOR + i;
221 if (vector != IA32_SYSCALL_VECTOR)
222 set_intr_gate(vector, interrupt[i]);
223 }
224
225 apic_intr_init();
218 226
219 if (!acpi_ioapic) 227 if (!acpi_ioapic)
220 setup_irq(2, &irq2); 228 setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index 7377ccb21335..304d8bad6559 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -16,8 +16,9 @@ EXPORT_SYMBOL(num_k8_northbridges);
16static u32 *flush_words; 16static u32 *flush_words;
17 17
18struct pci_device_id k8_nb_ids[] = { 18struct pci_device_id k8_nb_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, 19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, 20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
21 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
21 {} 22 {}
22}; 23};
23EXPORT_SYMBOL(k8_nb_ids); 24EXPORT_SYMBOL(k8_nb_ids);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index c03205991718..ff7d3b0124f1 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -12,9 +12,13 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/io.h> 13#include <linux/io.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/module.h>
15 16
16#include <asm/setup.h> 17#include <asm/setup.h>
17 18
19struct dentry *arch_debugfs_dir;
20EXPORT_SYMBOL(arch_debugfs_dir);
21
18#ifdef CONFIG_DEBUG_BOOT_PARAMS 22#ifdef CONFIG_DEBUG_BOOT_PARAMS
19struct setup_data_node { 23struct setup_data_node {
20 u64 paddr; 24 u64 paddr;
@@ -135,6 +139,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
135 if (PageHighMem(pg)) { 139 if (PageHighMem(pg)) {
136 data = ioremap_cache(pa_data, sizeof(*data)); 140 data = ioremap_cache(pa_data, sizeof(*data));
137 if (!data) { 141 if (!data) {
142 kfree(node);
138 error = -ENXIO; 143 error = -ENXIO;
139 goto err_dir; 144 goto err_dir;
140 } 145 }
@@ -209,6 +214,10 @@ static int __init arch_kdebugfs_init(void)
209{ 214{
210 int error = 0; 215 int error = 0;
211 216
217 arch_debugfs_dir = debugfs_create_dir("x86", NULL);
218 if (!arch_debugfs_dir)
219 return -ENOMEM;
220
212#ifdef CONFIG_DEBUG_BOOT_PARAMS 221#ifdef CONFIG_DEBUG_BOOT_PARAMS
213 error = boot_params_kdebugfs_init(); 222 error = boot_params_kdebugfs_init();
214#endif 223#endif
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f47f0eb886b8..10435a120d22 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -69,6 +69,9 @@ static int gdb_x86vector = -1;
69 */ 69 */
70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) 70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
71{ 71{
72#ifndef CONFIG_X86_32
73 u32 *gdb_regs32 = (u32 *)gdb_regs;
74#endif
72 gdb_regs[GDB_AX] = regs->ax; 75 gdb_regs[GDB_AX] = regs->ax;
73 gdb_regs[GDB_BX] = regs->bx; 76 gdb_regs[GDB_BX] = regs->bx;
74 gdb_regs[GDB_CX] = regs->cx; 77 gdb_regs[GDB_CX] = regs->cx;
@@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
76 gdb_regs[GDB_SI] = regs->si; 79 gdb_regs[GDB_SI] = regs->si;
77 gdb_regs[GDB_DI] = regs->di; 80 gdb_regs[GDB_DI] = regs->di;
78 gdb_regs[GDB_BP] = regs->bp; 81 gdb_regs[GDB_BP] = regs->bp;
79 gdb_regs[GDB_PS] = regs->flags;
80 gdb_regs[GDB_PC] = regs->ip; 82 gdb_regs[GDB_PC] = regs->ip;
81#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
84 gdb_regs[GDB_PS] = regs->flags;
82 gdb_regs[GDB_DS] = regs->ds; 85 gdb_regs[GDB_DS] = regs->ds;
83 gdb_regs[GDB_ES] = regs->es; 86 gdb_regs[GDB_ES] = regs->es;
84 gdb_regs[GDB_CS] = regs->cs; 87 gdb_regs[GDB_CS] = regs->cs;
@@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
94 gdb_regs[GDB_R13] = regs->r13; 97 gdb_regs[GDB_R13] = regs->r13;
95 gdb_regs[GDB_R14] = regs->r14; 98 gdb_regs[GDB_R14] = regs->r14;
96 gdb_regs[GDB_R15] = regs->r15; 99 gdb_regs[GDB_R15] = regs->r15;
100 gdb_regs32[GDB_PS] = regs->flags;
101 gdb_regs32[GDB_CS] = regs->cs;
102 gdb_regs32[GDB_SS] = regs->ss;
97#endif 103#endif
98 gdb_regs[GDB_SP] = regs->sp; 104 gdb_regs[GDB_SP] = regs->sp;
99} 105}
@@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
112 */ 118 */
113void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) 119void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
114{ 120{
121#ifndef CONFIG_X86_32
122 u32 *gdb_regs32 = (u32 *)gdb_regs;
123#endif
115 gdb_regs[GDB_AX] = 0; 124 gdb_regs[GDB_AX] = 0;
116 gdb_regs[GDB_BX] = 0; 125 gdb_regs[GDB_BX] = 0;
117 gdb_regs[GDB_CX] = 0; 126 gdb_regs[GDB_CX] = 0;
@@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
129 gdb_regs[GDB_FS] = 0xFFFF; 138 gdb_regs[GDB_FS] = 0xFFFF;
130 gdb_regs[GDB_GS] = 0xFFFF; 139 gdb_regs[GDB_GS] = 0xFFFF;
131#else 140#else
132 gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); 141 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
133 gdb_regs[GDB_PC] = 0; 142 gdb_regs32[GDB_CS] = __KERNEL_CS;
143 gdb_regs32[GDB_SS] = __KERNEL_DS;
144 gdb_regs[GDB_PC] = p->thread.ip;
134 gdb_regs[GDB_R8] = 0; 145 gdb_regs[GDB_R8] = 0;
135 gdb_regs[GDB_R9] = 0; 146 gdb_regs[GDB_R9] = 0;
136 gdb_regs[GDB_R10] = 0; 147 gdb_regs[GDB_R10] = 0;
@@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
153 */ 164 */
154void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) 165void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
155{ 166{
167#ifndef CONFIG_X86_32
168 u32 *gdb_regs32 = (u32 *)gdb_regs;
169#endif
156 regs->ax = gdb_regs[GDB_AX]; 170 regs->ax = gdb_regs[GDB_AX];
157 regs->bx = gdb_regs[GDB_BX]; 171 regs->bx = gdb_regs[GDB_BX];
158 regs->cx = gdb_regs[GDB_CX]; 172 regs->cx = gdb_regs[GDB_CX];
@@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
160 regs->si = gdb_regs[GDB_SI]; 174 regs->si = gdb_regs[GDB_SI];
161 regs->di = gdb_regs[GDB_DI]; 175 regs->di = gdb_regs[GDB_DI];
162 regs->bp = gdb_regs[GDB_BP]; 176 regs->bp = gdb_regs[GDB_BP];
163 regs->flags = gdb_regs[GDB_PS];
164 regs->ip = gdb_regs[GDB_PC]; 177 regs->ip = gdb_regs[GDB_PC];
165#ifdef CONFIG_X86_32 178#ifdef CONFIG_X86_32
179 regs->flags = gdb_regs[GDB_PS];
166 regs->ds = gdb_regs[GDB_DS]; 180 regs->ds = gdb_regs[GDB_DS];
167 regs->es = gdb_regs[GDB_ES]; 181 regs->es = gdb_regs[GDB_ES];
168 regs->cs = gdb_regs[GDB_CS]; 182 regs->cs = gdb_regs[GDB_CS];
@@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
175 regs->r13 = gdb_regs[GDB_R13]; 189 regs->r13 = gdb_regs[GDB_R13];
176 regs->r14 = gdb_regs[GDB_R14]; 190 regs->r14 = gdb_regs[GDB_R14];
177 regs->r15 = gdb_regs[GDB_R15]; 191 regs->r15 = gdb_regs[GDB_R15];
192 regs->flags = gdb_regs32[GDB_PS];
193 regs->cs = gdb_regs32[GDB_CS];
194 regs->ss = gdb_regs32[GDB_SS];
178#endif 195#endif
179} 196}
180 197
@@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
378 if (remcomInBuffer[0] == 's') { 395 if (remcomInBuffer[0] == 's') {
379 linux_regs->flags |= X86_EFLAGS_TF; 396 linux_regs->flags |= X86_EFLAGS_TF;
380 kgdb_single_step = 1; 397 kgdb_single_step = 1;
381 if (kgdb_contthread) { 398 atomic_set(&kgdb_cpu_doing_single_step,
382 atomic_set(&kgdb_cpu_doing_single_step, 399 raw_smp_processor_id());
383 raw_smp_processor_id());
384 }
385 } 400 }
386 401
387 get_debugreg(dr6, 6); 402 get_debugreg(dr6, 6);
@@ -440,12 +455,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
440 return NOTIFY_DONE; 455 return NOTIFY_DONE;
441 456
442 case DIE_NMI_IPI: 457 case DIE_NMI_IPI:
443 if (atomic_read(&kgdb_active) != -1) { 458 /* Just ignore, we will handle the roundup on DIE_NMI. */
444 /* KGDB CPU roundup */
445 kgdb_nmicallback(raw_smp_processor_id(), regs);
446 was_in_debug_nmi[raw_smp_processor_id()] = 1;
447 touch_nmi_watchdog();
448 }
449 return NOTIFY_DONE; 459 return NOTIFY_DONE;
450 460
451 case DIE_NMIUNKNOWN: 461 case DIE_NMIUNKNOWN:
@@ -466,9 +476,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
466 476
467 case DIE_DEBUG: 477 case DIE_DEBUG:
468 if (atomic_read(&kgdb_cpu_doing_single_step) == 478 if (atomic_read(&kgdb_cpu_doing_single_step) ==
469 raw_smp_processor_id() && 479 raw_smp_processor_id()) {
470 user_mode(regs)) 480 if (user_mode(regs))
471 return single_step_cont(regs, args); 481 return single_step_cont(regs, args);
482 break;
483 } else if (test_thread_flag(TIF_SINGLESTEP))
484 /* This means a user thread is single stepping
485 * a system call which should be ignored
486 */
487 return NOTIFY_DONE;
472 /* fall through */ 488 /* fall through */
473 default: 489 default:
474 if (user_mode(regs)) 490 if (user_mode(regs))
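
On x86-64 the GDB remote register layout keeps EFLAGS, CS and SS as 32-bit slots, which is why the kgdb patch overlays a u32 view (gdb_regs32) on the unsigned long gdb_regs[] array and writes those three fields through it. A standalone demonstration of the aliasing trick; the index is illustrative, not GDB's real numbering, and it assumes little-endian x86 (compile with -fno-strict-aliasing, as the kernel does, to keep the cast well-defined):

#include <stdio.h>
#include <stdint.h>

#define NREGS 8
#define PS32  12	/* illustrative 32-bit slot index */

int main(void)
{
	uint64_t regs[NREGS] = { 0 };
	uint32_t *regs32 = (uint32_t *)regs;	/* same storage, 32-bit granularity */

	regs32[PS32] = 0x00000246;		/* e.g. an EFLAGS value */

	/* On little-endian x86, slot 12 of the 32-bit view is the low half
	 * of slot 6 of the 64-bit view. */
	printf("regs[6] = %#llx\n", (unsigned long long)regs[6]);
	return 0;
}
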
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b8c6743a13da..6c27679ec6aa 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -431,7 +431,6 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
431 regs->ip = (unsigned long)p->ainsn.insn; 431 regs->ip = (unsigned long)p->ainsn.insn;
432} 432}
433 433
434/* Called with kretprobe_lock held */
435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 434void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
436 struct pt_regs *regs) 435 struct pt_regs *regs)
437{ 436{
@@ -682,8 +681,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
682 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; 681 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
683 682
684 INIT_HLIST_HEAD(&empty_rp); 683 INIT_HLIST_HEAD(&empty_rp);
685 spin_lock_irqsave(&kretprobe_lock, flags); 684 kretprobe_hash_lock(current, &head, &flags);
686 head = kretprobe_inst_table_head(current);
687 /* fixup registers */ 685 /* fixup registers */
688#ifdef CONFIG_X86_64 686#ifdef CONFIG_X86_64
689 regs->cs = __KERNEL_CS; 687 regs->cs = __KERNEL_CS;
@@ -732,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
732 730
733 kretprobe_assert(ri, orig_ret_address, trampoline_address); 731 kretprobe_assert(ri, orig_ret_address, trampoline_address);
734 732
735 spin_unlock_irqrestore(&kretprobe_lock, flags); 733 kretprobe_hash_unlock(current, &flags);
736 734
737 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 735 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
738 hlist_del(&ri->hlist); 736 hlist_del(&ri->hlist);
@@ -860,7 +858,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs)
860 858
861 resume_execution(cur, regs, kcb); 859 resume_execution(cur, regs, kcb);
862 regs->flags |= kcb->kprobe_saved_flags; 860 regs->flags |= kcb->kprobe_saved_flags;
863 trace_hardirqs_fixup_flags(regs->flags);
864 861
865 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { 862 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
866 kcb->kprobe_status = KPROBE_HIT_SSDONE; 863 kcb->kprobe_status = KPROBE_HIT_SSDONE;
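
The kprobes hunk drops the single global kretprobe_lock in favour of kretprobe_hash_lock(), which derives a per-bucket lock and hlist head from the task pointer. A rough userspace sketch of the idea only; the bucket count and hash below are invented, and the kernel's real helpers live in kernel/kprobes.c:

#include <stdio.h>
#include <stdint.h>
#include <pthread.h>

#define NBUCKETS 64	/* illustrative; a power of two keeps the mask cheap */

static pthread_mutex_t bucket_lock[NBUCKETS];

/* Derive a bucket from the task pointer; shifting off alignment bits is
 * only a stand-in for the kernel's real hash. */
static unsigned int task_bucket(const void *task)
{
	return ((uintptr_t)task >> 4) & (NBUCKETS - 1);
}

int main(void)
{
	int i, dummy_task;

	for (i = 0; i < NBUCKETS; i++)
		pthread_mutex_init(&bucket_lock[i], NULL);

	/* Tasks hashing to different buckets no longer contend on one lock. */
	pthread_mutex_lock(&bucket_lock[task_bucket(&dummy_task)]);
	printf("task %p uses bucket %u\n",
	       (void *)&dummy_task, task_bucket(&dummy_task));
	pthread_mutex_unlock(&bucket_lock[task_bucket(&dummy_task)]);
	return 0;
}
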
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8b7a3cf37d2b..478bca986eca 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -178,7 +178,7 @@ static void kvm_flush_tlb(void)
178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb); 178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
179} 179}
180 180
181static void kvm_release_pt(u32 pfn) 181static void kvm_release_pt(unsigned long pfn)
182{ 182{
183 struct kvm_mmu_op_release_pt rpt = { 183 struct kvm_mmu_op_release_pt rpt = {
184 .header.op = KVM_MMU_OP_RELEASE_PT, 184 .header.op = KVM_MMU_OP_RELEASE_PT,
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 87edf1ceb1df..774ac4991568 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void)
78 return ret; 78 return ret;
79} 79}
80 80
81/*
82 * If we don't do that, there is the possibility that the guest
83 * will calibrate under heavy load - thus, getting a lower lpj -
84 * and execute the delays themselves without load. This is wrong,
85 * because no delay loop can finish beforehand.
 86 * Any heuristic is subject to fail, because ultimately, a large
 87 * pool of guests can be running and trouble each other. So we preset
88 * lpj here
89 */
90static unsigned long kvm_get_tsc_khz(void)
91{
92 return preset_lpj;
93}
94
95static void kvm_get_preset_lpj(void)
96{
97 struct pvclock_vcpu_time_info *src;
98 unsigned long khz;
99 u64 lpj;
100
101 src = &per_cpu(hv_clock, 0);
102 khz = pvclock_tsc_khz(src);
103
104 lpj = ((u64)khz * 1000);
105 do_div(lpj, HZ);
106 preset_lpj = lpj;
107}
108
81static struct clocksource kvm_clock = { 109static struct clocksource kvm_clock = {
82 .name = "kvm-clock", 110 .name = "kvm-clock",
83 .read = kvm_clock_read, 111 .read = kvm_clock_read,
@@ -113,7 +141,7 @@ static void kvm_setup_secondary_clock(void)
113#endif 141#endif
114 142
115#ifdef CONFIG_SMP 143#ifdef CONFIG_SMP
116void __init kvm_smp_prepare_boot_cpu(void) 144static void __init kvm_smp_prepare_boot_cpu(void)
117{ 145{
118 WARN_ON(kvm_register_clock("primary cpu clock")); 146 WARN_ON(kvm_register_clock("primary cpu clock"));
119 native_smp_prepare_boot_cpu(); 147 native_smp_prepare_boot_cpu();
@@ -153,6 +181,7 @@ void __init kvmclock_init(void)
153 pv_time_ops.get_wallclock = kvm_get_wallclock; 181 pv_time_ops.get_wallclock = kvm_get_wallclock;
154 pv_time_ops.set_wallclock = kvm_set_wallclock; 182 pv_time_ops.set_wallclock = kvm_set_wallclock;
155 pv_time_ops.sched_clock = kvm_clock_read; 183 pv_time_ops.sched_clock = kvm_clock_read;
184 pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
156#ifdef CONFIG_X86_LOCAL_APIC 185#ifdef CONFIG_X86_LOCAL_APIC
157 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; 186 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
158#endif 187#endif
@@ -163,6 +192,7 @@ void __init kvmclock_init(void)
163#ifdef CONFIG_KEXEC 192#ifdef CONFIG_KEXEC
164 machine_ops.crash_shutdown = kvm_crash_shutdown; 193 machine_ops.crash_shutdown = kvm_crash_shutdown;
165#endif 194#endif
195 kvm_get_preset_lpj();
166 clocksource_register(&kvm_clock); 196 clocksource_register(&kvm_clock);
167 } 197 }
168} 198}
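
kvm_get_preset_lpj() above turns the host-reported TSC frequency straight into loops-per-jiffy: lpj = khz * 1000 / HZ. A quick worked example; the 2,400,000 kHz figure and HZ=1000 are sample numbers, not anything the patch specifies:

#include <stdio.h>
#include <stdint.h>

#define HZ 1000U	/* sample config value */

int main(void)
{
	unsigned long khz = 2400000UL;		/* e.g. a 2.4 GHz TSC */
	uint64_t lpj = (uint64_t)khz * 1000;	/* cycles per second */

	lpj /= HZ;				/* do_div(lpj, HZ) in the kernel */
	printf("preset_lpj = %llu\n", (unsigned long long)lpj);
	/* 2,400,000 kHz * 1000 / 1000 = 2,400,000 loops per jiffy */
	return 0;
}

Presetting lpj this way sidesteps guest-side calibration entirely, which is the point of the comment above it.
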
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a8449571858a..eee32b43fee3 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -18,6 +18,7 @@
18#include <asm/ldt.h> 18#include <asm/ldt.h>
19#include <asm/desc.h> 19#include <asm/desc.h>
20#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
21#include <asm/syscalls.h>
21 22
22#ifdef CONFIG_SMP 23#ifdef CONFIG_SMP
23static void flush_ldt(void *current_mm) 24static void flush_ldt(void *current_mm)
@@ -51,6 +52,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
51 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, 52 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
52 (mincount - oldsize) * LDT_ENTRY_SIZE); 53 (mincount - oldsize) * LDT_ENTRY_SIZE);
53 54
55 paravirt_alloc_ldt(newldt, mincount);
56
54#ifdef CONFIG_X86_64 57#ifdef CONFIG_X86_64
55 /* CHECKME: Do we really need this ? */ 58 /* CHECKME: Do we really need this ? */
56 wmb(); 59 wmb();
@@ -62,12 +65,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
62 65
63 if (reload) { 66 if (reload) {
64#ifdef CONFIG_SMP 67#ifdef CONFIG_SMP
65 cpumask_t mask;
66
67 preempt_disable(); 68 preempt_disable();
68 load_LDT(pc); 69 load_LDT(pc);
69 mask = cpumask_of_cpu(smp_processor_id()); 70 if (!cpus_equal(current->mm->cpu_vm_mask,
70 if (!cpus_equal(current->mm->cpu_vm_mask, mask)) 71 cpumask_of_cpu(smp_processor_id())))
71 smp_call_function(flush_ldt, current->mm, 1); 72 smp_call_function(flush_ldt, current->mm, 1);
72 preempt_enable(); 73 preempt_enable();
73#else 74#else
@@ -75,6 +76,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
75#endif 76#endif
76 } 77 }
77 if (oldsize) { 78 if (oldsize) {
79 paravirt_free_ldt(oldldt, oldsize);
78 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) 80 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
79 vfree(oldldt); 81 vfree(oldldt);
80 else 82 else
@@ -86,10 +88,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
86static inline int copy_ldt(mm_context_t *new, mm_context_t *old) 88static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
87{ 89{
88 int err = alloc_ldt(new, old->size, 0); 90 int err = alloc_ldt(new, old->size, 0);
91 int i;
89 92
90 if (err < 0) 93 if (err < 0)
91 return err; 94 return err;
92 memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); 95
96 for(i = 0; i < old->size; i++)
97 write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
93 return 0; 98 return 0;
94} 99}
95 100
@@ -126,6 +131,7 @@ void destroy_context(struct mm_struct *mm)
126 if (mm == current->active_mm) 131 if (mm == current->active_mm)
127 clear_LDT(); 132 clear_LDT();
128#endif 133#endif
134 paravirt_free_ldt(mm->context.ldt, mm->context.size);
129 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) 135 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
130 vfree(mm->context.ldt); 136 vfree(mm->context.ldt);
131 else 137 else
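
copy_ldt() now writes each descriptor through write_ldt_entry() instead of one bulk memcpy, so a paravirt backend sees every entry individually. A standalone sketch of the entry-by-entry pattern; the hook below is only a stand-in for the paravirt write, and the table size is made up:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define LDT_ENTRY_SIZE 8	/* one x86 segment descriptor */
#define NENTRIES       4	/* illustrative table size */

/* Stand-in for write_ldt_entry(): in the kernel this may trap to the
 * hypervisor, which is exactly why a bulk memcpy() is no longer enough. */
static void write_entry(uint8_t *table, int idx, const uint8_t *desc)
{
	memcpy(table + idx * LDT_ENTRY_SIZE, desc, LDT_ENTRY_SIZE);
	printf("entry %d written\n", idx);
}

int main(void)
{
	uint8_t old_ldt[NENTRIES * LDT_ENTRY_SIZE] = { 1, 2, 3 };
	uint8_t new_ldt[NENTRIES * LDT_ENTRY_SIZE];
	int i;

	for (i = 0; i < NENTRIES; i++)	/* mirrors the loop in copy_ldt() */
		write_entry(new_ldt, i, old_ldt + i * LDT_ENTRY_SIZE);
	return 0;
}
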
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8864230d55af..0732adba05ca 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -12,6 +12,7 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15#include <linux/suspend.h>
15 16
16#include <asm/pgtable.h> 17#include <asm/pgtable.h>
17#include <asm/pgalloc.h> 18#include <asm/pgalloc.h>
@@ -22,6 +23,7 @@
22#include <asm/cpufeature.h> 23#include <asm/cpufeature.h>
23#include <asm/desc.h> 24#include <asm/desc.h>
24#include <asm/system.h> 25#include <asm/system.h>
26#include <asm/cacheflush.h>
25 27
26#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) 28#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
27static u32 kexec_pgd[1024] PAGE_ALIGNED; 29static u32 kexec_pgd[1024] PAGE_ALIGNED;
@@ -77,7 +79,7 @@ static void load_segments(void)
77/* 79/*
78 * An architecture hook called to validate the 80 * An architecture hook called to validate the
79 * proposed image and prepare the control pages 81 * proposed image and prepare the control pages
80 * as needed. The pages for KEXEC_CONTROL_CODE_SIZE 82 * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE
81 * have been allocated, but the segments have yet 83 * have been allocated, but the segments have yet
82 * to be copied into the kernel. 84 * to be copied into the kernel.
83 * 85 *
@@ -85,10 +87,12 @@ static void load_segments(void)
85 * reboot code buffer to allow us to avoid allocations 87 * reboot code buffer to allow us to avoid allocations
86 * later. 88 * later.
87 * 89 *
88 * Currently nothing. 90 * Make control page executable.
89 */ 91 */
90int machine_kexec_prepare(struct kimage *image) 92int machine_kexec_prepare(struct kimage *image)
91{ 93{
94 if (nx_enabled)
95 set_pages_x(image->control_code_page, 1);
92 return 0; 96 return 0;
93} 97}
94 98
@@ -98,27 +102,54 @@ int machine_kexec_prepare(struct kimage *image)
98 */ 102 */
99void machine_kexec_cleanup(struct kimage *image) 103void machine_kexec_cleanup(struct kimage *image)
100{ 104{
105 if (nx_enabled)
106 set_pages_nx(image->control_code_page, 1);
101} 107}
102 108
103/* 109/*
104 * Do not allocate memory (or fail in any way) in machine_kexec(). 110 * Do not allocate memory (or fail in any way) in machine_kexec().
105 * We are past the point of no return, committed to rebooting now. 111 * We are past the point of no return, committed to rebooting now.
106 */ 112 */
107NORET_TYPE void machine_kexec(struct kimage *image) 113void machine_kexec(struct kimage *image)
108{ 114{
109 unsigned long page_list[PAGES_NR]; 115 unsigned long page_list[PAGES_NR];
110 void *control_page; 116 void *control_page;
117 int save_ftrace_enabled;
118 asmlinkage unsigned long
119 (*relocate_kernel_ptr)(unsigned long indirection_page,
120 unsigned long control_page,
121 unsigned long start_address,
122 unsigned int has_pae,
123 unsigned int preserve_context);
124
125#ifdef CONFIG_KEXEC_JUMP
126 if (kexec_image->preserve_context)
127 save_processor_state();
128#endif
111 129
112 tracer_disable(); 130 save_ftrace_enabled = __ftrace_enabled_save();
113 131
114 /* Interrupts aren't acceptable while we reboot */ 132 /* Interrupts aren't acceptable while we reboot */
115 local_irq_disable(); 133 local_irq_disable();
116 134
135 if (image->preserve_context) {
136#ifdef CONFIG_X86_IO_APIC
137 /* We need to put APICs in legacy mode so that we can
 138 * get timer interrupts in the second kernel. kexec/kdump
 139 * paths already have calls to disable_IO_APIC() in
 140 * one form or another. The kexec jump path also needs
 141 * one.
142 */
143 disable_IO_APIC();
144#endif
145 }
146
117 control_page = page_address(image->control_code_page); 147 control_page = page_address(image->control_code_page);
118 memcpy(control_page, relocate_kernel, PAGE_SIZE); 148 memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
119 149
150 relocate_kernel_ptr = control_page;
120 page_list[PA_CONTROL_PAGE] = __pa(control_page); 151 page_list[PA_CONTROL_PAGE] = __pa(control_page);
121 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; 152 page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
122 page_list[PA_PGD] = __pa(kexec_pgd); 153 page_list[PA_PGD] = __pa(kexec_pgd);
123 page_list[VA_PGD] = (unsigned long)kexec_pgd; 154 page_list[VA_PGD] = (unsigned long)kexec_pgd;
124#ifdef CONFIG_X86_PAE 155#ifdef CONFIG_X86_PAE
@@ -131,6 +162,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
131 page_list[VA_PTE_0] = (unsigned long)kexec_pte0; 162 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
132 page_list[PA_PTE_1] = __pa(kexec_pte1); 163 page_list[PA_PTE_1] = __pa(kexec_pte1);
133 page_list[VA_PTE_1] = (unsigned long)kexec_pte1; 164 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
165 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT);
134 166
135 /* The segment registers are funny things, they have both a 167 /* The segment registers are funny things, they have both a
136 * visible and an invisible part. Whenever the visible part is 168 * visible and an invisible part. Whenever the visible part is
@@ -149,8 +181,17 @@ NORET_TYPE void machine_kexec(struct kimage *image)
149 set_idt(phys_to_virt(0),0); 181 set_idt(phys_to_virt(0),0);
150 182
151 /* now call it */ 183 /* now call it */
152 relocate_kernel((unsigned long)image->head, (unsigned long)page_list, 184 image->start = relocate_kernel_ptr((unsigned long)image->head,
153 image->start, cpu_has_pae); 185 (unsigned long)page_list,
186 image->start, cpu_has_pae,
187 image->preserve_context);
188
189#ifdef CONFIG_KEXEC_JUMP
190 if (kexec_image->preserve_context)
191 restore_processor_state();
192#endif
193
194 __ftrace_enabled_restore(save_ftrace_enabled);
154} 195}
155 196
156void arch_crash_save_vmcoreinfo(void) 197void arch_crash_save_vmcoreinfo(void)
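
With kexec jump, relocate_kernel is copied into the control page and invoked through a pointer, and it can now return: its return value becomes the address to resume from, so image->start is reassigned. A minimal sketch of that return-through-a-pointer flow; the stub below merely stands in for the copied relocator (the real one is hand-written assembly, and no machine code is actually copied here):

#include <stdio.h>

static unsigned long fake_relocate_kernel(unsigned long indirection_page,
					  unsigned long control_page,
					  unsigned long start_address,
					  unsigned int has_pae,
					  unsigned int preserve_context)
{
	if (!preserve_context)
		return start_address;	/* classic kexec: in reality never returns */
	return start_address + 0x100;	/* pretend "resume here next time" */
}

int main(void)
{
	unsigned long (*relocate_ptr)(unsigned long, unsigned long,
				      unsigned long, unsigned int,
				      unsigned int) = fake_relocate_kernel;
	unsigned long start = 0x100000;

	/* mirrors: image->start = relocate_kernel_ptr(...); */
	start = relocate_ptr(0, 0, start, 1, 1);
	printf("next start address: %#lx\n", start);
	return 0;
}

The save_processor_state()/restore_processor_state() bracket around this call is what lets a preserve_context jump come back to a running kernel.
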
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9dd9262693a3..c43caa3a91f3 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image)
181 * Do not allocate memory (or fail in any way) in machine_kexec(). 181 * Do not allocate memory (or fail in any way) in machine_kexec().
182 * We are past the point of no return, committed to rebooting now. 182 * We are past the point of no return, committed to rebooting now.
183 */ 183 */
184NORET_TYPE void machine_kexec(struct kimage *image) 184void machine_kexec(struct kimage *image)
185{ 185{
186 unsigned long page_list[PAGES_NR]; 186 unsigned long page_list[PAGES_NR];
187 void *control_page; 187 void *control_page;
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 07c0f828f488..3b599518c322 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -33,6 +33,8 @@
33#include <linux/module.h> 33#include <linux/module.h>
34#include <asm/geode.h> 34#include <asm/geode.h>
35 35
36#define MFGPT_DEFAULT_IRQ 7
37
36static struct mfgpt_timer_t { 38static struct mfgpt_timer_t {
37 unsigned int avail:1; 39 unsigned int avail:1;
38} mfgpt_timers[MFGPT_MAX_TIMERS]; 40} mfgpt_timers[MFGPT_MAX_TIMERS];
@@ -157,29 +159,48 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
157} 159}
158EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event); 160EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
159 161
160int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) 162int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
161{ 163{
162 u32 val, dummy; 164 u32 zsel, lpc, dummy;
163 int offset; 165 int shift;
164 166
165 if (timer < 0 || timer >= MFGPT_MAX_TIMERS) 167 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
166 return -EIO; 168 return -EIO;
167 169
168 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable)) 170 /*
171 * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
172 * is using the same CMP of the timer's Siamese twin, the IRQ is set to
 173 * 2, and we must neither use nor change it.
174 * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the
 175 * IRQ of the 1st. This can only happen when forcing an IRQ; calling this
176 * with *irq==0 is safe. Currently there _are_ no 2 drivers.
177 */
178 rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
179 shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
180 if (((zsel >> shift) & 0xF) == 2)
169 return -EIO; 181 return -EIO;
170 182
171 rdmsr(MSR_PIC_ZSEL_LOW, val, dummy); 183 /* Choose IRQ: if none supplied, keep IRQ already set or use default */
184 if (!*irq)
185 *irq = (zsel >> shift) & 0xF;
186 if (!*irq)
187 *irq = MFGPT_DEFAULT_IRQ;
172 188
173 offset = (timer % 4) * 4; 189 /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
174 190 if (*irq < 1 || *irq == 2 || *irq > 15)
175 val &= ~((0xF << offset) | (0xF << (offset + 16))); 191 return -EIO;
192 rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
193 if (lpc & (1 << *irq))
194 return -EIO;
176 195
196 /* All chosen and checked - go for it */
197 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
198 return -EIO;
177 if (enable) { 199 if (enable) {
178 val |= (irq & 0x0F) << (offset); 200 zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
179 val |= (irq & 0x0F) << (offset + 16); 201 wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
180 } 202 }
181 203
182 wrmsr(MSR_PIC_ZSEL_LOW, val, dummy);
183 return 0; 204 return 0;
184} 205}
185 206
@@ -242,7 +263,7 @@ EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
242static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN; 263static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
243static u16 mfgpt_event_clock; 264static u16 mfgpt_event_clock;
244 265
245static int irq = 7; 266static int irq;
246static int __init mfgpt_setup(char *str) 267static int __init mfgpt_setup(char *str)
247{ 268{
248 get_option(&str, &irq); 269 get_option(&str, &irq);
@@ -346,7 +367,7 @@ int __init mfgpt_timer_setup(void)
346 mfgpt_event_clock = timer; 367 mfgpt_event_clock = timer;
347 368
348 /* Set up the IRQ on the MFGPT side */ 369 /* Set up the IRQ on the MFGPT side */
349 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, irq)) { 370 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
350 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq); 371 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq);
351 return -EIO; 372 return -EIO;
352 } 373 }
@@ -374,13 +395,14 @@ int __init mfgpt_timer_setup(void)
374 &mfgpt_clockevent); 395 &mfgpt_clockevent);
375 396
376 printk(KERN_INFO 397 printk(KERN_INFO
377 "mfgpt-timer: registering the MFGPT timer as a clock event.\n"); 398 "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n",
399 timer, irq);
378 clockevents_register_device(&mfgpt_clockevent); 400 clockevents_register_device(&mfgpt_clockevent);
379 401
380 return 0; 402 return 0;
381 403
382err: 404err:
383 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, irq); 405 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
384 printk(KERN_ERR 406 printk(KERN_ERR
385 "mfgpt-timer: Unable to set up the MFGPT clock source\n"); 407 "mfgpt-timer: Unable to set up the MFGPT clock source\n");
386 return -EIO; 408 return -EIO;
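
The new IRQ-routing code packs one 4-bit field per timer/comparator pair into MSR_PIC_ZSEL_LOW; shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4 selects that nibble, and the value 2 means VSA already owns it. A standalone check of the nibble arithmetic, with the MSR mocked by a plain variable and the CMP constants given illustrative values:

#include <stdio.h>
#include <stdint.h>

#define MFGPT_CMP1 0
#define MFGPT_CMP2 1	/* only "is it CMP1?" matters for the shift */

int main(void)
{
	uint32_t zsel = 0;		/* stands in for MSR_PIC_ZSEL_LOW */
	int timer = 1, cmp = MFGPT_CMP2, irq = 7;
	int shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;

	if (((zsel >> shift) & 0xF) == 2) {
		puts("nibble owned by VSA, leave it alone");
		return 1;
	}
	zsel = (zsel & ~(0xFu << shift)) | ((uint32_t)irq << shift);
	printf("timer %d/CMP%d -> shift %d, zsel = %#010x\n",
	       timer, cmp + 1, shift, zsel);
	return 0;
}

For timer 1, CMP2 this gives shift 20, so IRQ 7 lands in bits 23:20, matching the read-modify-write the driver now performs.
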
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
deleted file mode 100644
index 56b933119a04..000000000000
--- a/arch/x86/kernel/microcode.c
+++ /dev/null
@@ -1,851 +0,0 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows to upgrade microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
 11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to speculative
59 * nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73
74//#define DEBUG /* pr_debug */
75#include <linux/capability.h>
76#include <linux/kernel.h>
77#include <linux/init.h>
78#include <linux/sched.h>
79#include <linux/smp_lock.h>
80#include <linux/cpumask.h>
81#include <linux/module.h>
82#include <linux/slab.h>
83#include <linux/vmalloc.h>
84#include <linux/miscdevice.h>
85#include <linux/spinlock.h>
86#include <linux/mm.h>
87#include <linux/fs.h>
88#include <linux/mutex.h>
89#include <linux/cpu.h>
90#include <linux/firmware.h>
91#include <linux/platform_device.h>
92
93#include <asm/msr.h>
94#include <asm/uaccess.h>
95#include <asm/processor.h>
96
97MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
98MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
99MODULE_LICENSE("GPL");
100
101#define MICROCODE_VERSION "1.14a"
102
103#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
104#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
105#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
106#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
107#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
108#define DWSIZE (sizeof (u32))
109#define get_totalsize(mc) \
110 (((microcode_t *)mc)->hdr.totalsize ? \
111 ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
112#define get_datasize(mc) \
113 (((microcode_t *)mc)->hdr.datasize ? \
114 ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
115
116#define sigmatch(s1, s2, p1, p2) \
117 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
118
119#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
120
121/* serialize access to the physical write to MSR 0x79 */
122static DEFINE_SPINLOCK(microcode_update_lock);
123
124/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
125static DEFINE_MUTEX(microcode_mutex);
126
127static struct ucode_cpu_info {
128 int valid;
129 unsigned int sig;
130 unsigned int pf;
131 unsigned int rev;
132 microcode_t *mc;
133} ucode_cpu_info[NR_CPUS];
134
135static void collect_cpu_info(int cpu_num)
136{
137 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
138 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
139 unsigned int val[2];
140
141 /* We should bind the task to the CPU */
142 BUG_ON(raw_smp_processor_id() != cpu_num);
143 uci->pf = uci->rev = 0;
144 uci->mc = NULL;
145 uci->valid = 1;
146
147 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
148 cpu_has(c, X86_FEATURE_IA64)) {
149 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
150 "processor\n", cpu_num);
151 uci->valid = 0;
152 return;
153 }
154
155 uci->sig = cpuid_eax(0x00000001);
156
157 if ((c->x86_model >= 5) || (c->x86 > 6)) {
158 /* get processor flags from MSR 0x17 */
159 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
160 uci->pf = 1 << ((val[1] >> 18) & 7);
161 }
162
163 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
164 /* see notes above for revision 1.07. Apparent chip bug */
165 sync_core();
166 /* get the current revision from MSR 0x8B */
167 rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
168 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
169 uci->sig, uci->pf, uci->rev);
170}
171
172static inline int microcode_update_match(int cpu_num,
173 microcode_header_t *mc_header, int sig, int pf)
174{
175 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
176
177 if (!sigmatch(sig, uci->sig, pf, uci->pf)
178 || mc_header->rev <= uci->rev)
179 return 0;
180 return 1;
181}
182
183static int microcode_sanity_check(void *mc)
184{
185 microcode_header_t *mc_header = mc;
186 struct extended_sigtable *ext_header = NULL;
187 struct extended_signature *ext_sig;
188 unsigned long total_size, data_size, ext_table_size;
189 int sum, orig_sum, ext_sigcount = 0, i;
190
191 total_size = get_totalsize(mc_header);
192 data_size = get_datasize(mc_header);
193 if (data_size + MC_HEADER_SIZE > total_size) {
194 printk(KERN_ERR "microcode: error! "
195 "Bad data size in microcode data file\n");
196 return -EINVAL;
197 }
198
199 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
200 printk(KERN_ERR "microcode: error! "
201 "Unknown microcode update format\n");
202 return -EINVAL;
203 }
204 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
205 if (ext_table_size) {
206 if ((ext_table_size < EXT_HEADER_SIZE)
207 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
208 printk(KERN_ERR "microcode: error! "
209 "Small exttable size in microcode data file\n");
210 return -EINVAL;
211 }
212 ext_header = mc + MC_HEADER_SIZE + data_size;
213 if (ext_table_size != exttable_size(ext_header)) {
214 printk(KERN_ERR "microcode: error! "
215 "Bad exttable size in microcode data file\n");
216 return -EFAULT;
217 }
218 ext_sigcount = ext_header->count;
219 }
220
221 /* check extended table checksum */
222 if (ext_table_size) {
223 int ext_table_sum = 0;
224 int *ext_tablep = (int *)ext_header;
225
226 i = ext_table_size / DWSIZE;
227 while (i--)
228 ext_table_sum += ext_tablep[i];
229 if (ext_table_sum) {
230 printk(KERN_WARNING "microcode: aborting, "
231 "bad extended signature table checksum\n");
232 return -EINVAL;
233 }
234 }
235
236 /* calculate the checksum */
237 orig_sum = 0;
238 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
239 while (i--)
240 orig_sum += ((int *)mc)[i];
241 if (orig_sum) {
242 printk(KERN_ERR "microcode: aborting, bad checksum\n");
243 return -EINVAL;
244 }
245 if (!ext_table_size)
246 return 0;
247 /* check extended signature checksum */
248 for (i = 0; i < ext_sigcount; i++) {
249 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
250 EXT_SIGNATURE_SIZE * i;
251 sum = orig_sum
252 - (mc_header->sig + mc_header->pf + mc_header->cksum)
253 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
254 if (sum) {
255 printk(KERN_ERR "microcode: aborting, bad checksum\n");
256 return -EINVAL;
257 }
258 }
259 return 0;
260}
261
262/*
263 * return 0 - no update found
264 * return 1 - found update
265 * return < 0 - error
266 */
267static int get_maching_microcode(void *mc, int cpu)
268{
269 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
270 microcode_header_t *mc_header = mc;
271 struct extended_sigtable *ext_header;
272 unsigned long total_size = get_totalsize(mc_header);
273 int ext_sigcount, i;
274 struct extended_signature *ext_sig;
275 void *new_mc;
276
277 if (microcode_update_match(cpu, mc_header,
278 mc_header->sig, mc_header->pf))
279 goto find;
280
281 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
282 return 0;
283
284 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
285 ext_sigcount = ext_header->count;
286 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
287 for (i = 0; i < ext_sigcount; i++) {
288 if (microcode_update_match(cpu, mc_header,
289 ext_sig->sig, ext_sig->pf))
290 goto find;
291 ext_sig++;
292 }
293 return 0;
294find:
295 pr_debug("microcode: CPU%d found a matching microcode update with"
296 " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
297 new_mc = vmalloc(total_size);
298 if (!new_mc) {
299 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
300 return -ENOMEM;
301 }
302
303 /* free previous update file */
304 vfree(uci->mc);
305
306 memcpy(new_mc, mc, total_size);
307 uci->mc = new_mc;
308 return 1;
309}
310
311static void apply_microcode(int cpu)
312{
313 unsigned long flags;
314 unsigned int val[2];
315 int cpu_num = raw_smp_processor_id();
316 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
317
318 /* We should bind the task to the CPU */
319 BUG_ON(cpu_num != cpu);
320
321 if (uci->mc == NULL)
322 return;
323
324 /* serialize access to the physical write to MSR 0x79 */
325 spin_lock_irqsave(&microcode_update_lock, flags);
326
327 /* write microcode via MSR 0x79 */
328 wrmsr(MSR_IA32_UCODE_WRITE,
329 (unsigned long) uci->mc->bits,
330 (unsigned long) uci->mc->bits >> 16 >> 16);
331 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
332
333 /* see notes above for revision 1.07. Apparent chip bug */
334 sync_core();
335
336 /* get the current revision from MSR 0x8B */
337 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
338
339 spin_unlock_irqrestore(&microcode_update_lock, flags);
340 if (val[1] != uci->mc->hdr.rev) {
341 printk(KERN_ERR "microcode: CPU%d update from revision "
342 "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
343 return;
344 }
345 printk(KERN_INFO "microcode: CPU%d updated from revision "
346 "0x%x to 0x%x, date = %08x \n",
347 cpu_num, uci->rev, val[1], uci->mc->hdr.date);
348 uci->rev = val[1];
349}
350
351#ifdef CONFIG_MICROCODE_OLD_INTERFACE
352static void __user *user_buffer; /* user area microcode data buffer */
353static unsigned int user_buffer_size; /* it's size */
354
355static long get_next_ucode(void **mc, long offset)
356{
357 microcode_header_t mc_header;
358 unsigned long total_size;
359
360 /* No more data */
361 if (offset >= user_buffer_size)
362 return 0;
363 if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
364 printk(KERN_ERR "microcode: error! Can not read user data\n");
365 return -EFAULT;
366 }
367 total_size = get_totalsize(&mc_header);
368 if (offset + total_size > user_buffer_size) {
369 printk(KERN_ERR "microcode: error! Bad total size in microcode "
370 "data file\n");
371 return -EINVAL;
372 }
373 *mc = vmalloc(total_size);
374 if (!*mc)
375 return -ENOMEM;
376 if (copy_from_user(*mc, user_buffer + offset, total_size)) {
377 printk(KERN_ERR "microcode: error! Can not read user data\n");
378 vfree(*mc);
379 return -EFAULT;
380 }
381 return offset + total_size;
382}
383
384static int do_microcode_update (void)
385{
386 long cursor = 0;
387 int error = 0;
388 void *new_mc = NULL;
389 int cpu;
390 cpumask_t old;
391
392 old = current->cpus_allowed;
393
394 while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
395 error = microcode_sanity_check(new_mc);
396 if (error)
397 goto out;
398 /*
399 * It's possible the data file has multiple matching ucode,
400 * lets keep searching till the latest version
401 */
402 for_each_online_cpu(cpu) {
403 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
404
405 if (!uci->valid)
406 continue;
407 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
408 error = get_maching_microcode(new_mc, cpu);
409 if (error < 0)
410 goto out;
411 if (error == 1)
412 apply_microcode(cpu);
413 }
414 vfree(new_mc);
415 }
416out:
417 if (cursor > 0)
418 vfree(new_mc);
419 if (cursor < 0)
420 error = cursor;
421 set_cpus_allowed_ptr(current, &old);
422 return error;
423}
424
425static int microcode_open (struct inode *unused1, struct file *unused2)
426{
427 cycle_kernel_lock();
428 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
429}
430
431static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
432{
433 ssize_t ret;
434
435 if ((len >> PAGE_SHIFT) > num_physpages) {
436 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
437 return -EINVAL;
438 }
439
440 get_online_cpus();
441 mutex_lock(&microcode_mutex);
442
443 user_buffer = (void __user *) buf;
444 user_buffer_size = (int) len;
445
446 ret = do_microcode_update();
447 if (!ret)
448 ret = (ssize_t)len;
449
450 mutex_unlock(&microcode_mutex);
451 put_online_cpus();
452
453 return ret;
454}
455
456static const struct file_operations microcode_fops = {
457 .owner = THIS_MODULE,
458 .write = microcode_write,
459 .open = microcode_open,
460};
461
462static struct miscdevice microcode_dev = {
463 .minor = MICROCODE_MINOR,
464 .name = "microcode",
465 .fops = &microcode_fops,
466};
467
468static int __init microcode_dev_init (void)
469{
470 int error;
471
472 error = misc_register(&microcode_dev);
473 if (error) {
474 printk(KERN_ERR
475 "microcode: can't misc_register on minor=%d\n",
476 MICROCODE_MINOR);
477 return error;
478 }
479
480 return 0;
481}
482
483static void microcode_dev_exit (void)
484{
485 misc_deregister(&microcode_dev);
486}
487
488MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
489#else
490#define microcode_dev_init() 0
491#define microcode_dev_exit() do { } while(0)
492#endif
493
494static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
495 unsigned long size, long offset)
496{
497 microcode_header_t *mc_header;
498 unsigned long total_size;
499
500 /* No more data */
501 if (offset >= size)
502 return 0;
503 mc_header = (microcode_header_t *)(buf + offset);
504 total_size = get_totalsize(mc_header);
505
506 if (offset + total_size > size) {
507 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
508 return -EINVAL;
509 }
510
511 *mc = vmalloc(total_size);
512 if (!*mc) {
513 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
514 return -ENOMEM;
515 }
516 memcpy(*mc, buf + offset, total_size);
517 return offset + total_size;
518}
519
520/* fake device for request_firmware */
521static struct platform_device *microcode_pdev;
522
523static int cpu_request_microcode(int cpu)
524{
525 char name[30];
526 struct cpuinfo_x86 *c = &cpu_data(cpu);
527 const struct firmware *firmware;
528 const u8 *buf;
529 unsigned long size;
530 long offset = 0;
531 int error;
532 void *mc;
533
534 /* We should bind the task to the CPU */
535 BUG_ON(cpu != raw_smp_processor_id());
536 sprintf(name,"intel-ucode/%02x-%02x-%02x",
537 c->x86, c->x86_model, c->x86_mask);
538 error = request_firmware(&firmware, name, &microcode_pdev->dev);
539 if (error) {
540 pr_debug("microcode: data file %s load failed\n", name);
541 return error;
542 }
543 buf = firmware->data;
544 size = firmware->size;
545 while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
546 > 0) {
547 error = microcode_sanity_check(mc);
548 if (error)
549 break;
550 error = get_maching_microcode(mc, cpu);
551 if (error < 0)
552 break;
553 /*
554 * It's possible the data file has multiple matching ucode,
555 * lets keep searching till the latest version
556 */
557 if (error == 1) {
558 apply_microcode(cpu);
559 error = 0;
560 }
561 vfree(mc);
562 }
563 if (offset > 0)
564 vfree(mc);
565 if (offset < 0)
566 error = offset;
567 release_firmware(firmware);
568
569 return error;
570}
571
572static int apply_microcode_check_cpu(int cpu)
573{
574 struct cpuinfo_x86 *c = &cpu_data(cpu);
575 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
576 cpumask_t old;
577 unsigned int val[2];
578 int err = 0;
579
580 /* Check if the microcode is available */
581 if (!uci->mc)
582 return 0;
583
584 old = current->cpus_allowed;
585 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
586
587 /* Check if the microcode we have in memory matches the CPU */
588 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
589 cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001))
590 err = -EINVAL;
591
592 if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) {
593 /* get processor flags from MSR 0x17 */
594 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
595 if (uci->pf != (1 << ((val[1] >> 18) & 7)))
596 err = -EINVAL;
597 }
598
599 if (!err) {
600 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
601 /* see notes above for revision 1.07. Apparent chip bug */
602 sync_core();
603 /* get the current revision from MSR 0x8B */
604 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
605 if (uci->rev != val[1])
606 err = -EINVAL;
607 }
608
609 if (!err)
610 apply_microcode(cpu);
611 else
612 printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
613 " sig=0x%x, pf=0x%x, rev=0x%x\n",
614 cpu, uci->sig, uci->pf, uci->rev);
615
616 set_cpus_allowed_ptr(current, &old);
617 return err;
618}
619
620static void microcode_init_cpu(int cpu, int resume)
621{
622 cpumask_t old;
623 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
624
625 old = current->cpus_allowed;
626
627 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
628 mutex_lock(&microcode_mutex);
629 collect_cpu_info(cpu);
630 if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
631 cpu_request_microcode(cpu);
632 mutex_unlock(&microcode_mutex);
633 set_cpus_allowed_ptr(current, &old);
634}
635
636static void microcode_fini_cpu(int cpu)
637{
638 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
639
640 mutex_lock(&microcode_mutex);
641 uci->valid = 0;
642 vfree(uci->mc);
643 uci->mc = NULL;
644 mutex_unlock(&microcode_mutex);
645}
646
647static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
648{
649 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
650 char *end;
651 unsigned long val = simple_strtoul(buf, &end, 0);
652 int err = 0;
653 int cpu = dev->id;
654
655 if (end == buf)
656 return -EINVAL;
657 if (val == 1) {
658 cpumask_t old;
659
660 old = current->cpus_allowed;
661
662 get_online_cpus();
663 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
664
665 mutex_lock(&microcode_mutex);
666 if (uci->valid)
667 err = cpu_request_microcode(cpu);
668 mutex_unlock(&microcode_mutex);
669 put_online_cpus();
670 set_cpus_allowed_ptr(current, &old);
671 }
672 if (err)
673 return err;
674 return sz;
675}
676
677static ssize_t version_show(struct sys_device *dev, char *buf)
678{
679 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
680
681 return sprintf(buf, "0x%x\n", uci->rev);
682}
683
684static ssize_t pf_show(struct sys_device *dev, char *buf)
685{
686 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
687
688 return sprintf(buf, "0x%x\n", uci->pf);
689}
690
691static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
692static SYSDEV_ATTR(version, 0400, version_show, NULL);
693static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
694
695static struct attribute *mc_default_attrs[] = {
696 &attr_reload.attr,
697 &attr_version.attr,
698 &attr_processor_flags.attr,
699 NULL
700};
701
702static struct attribute_group mc_attr_group = {
703 .attrs = mc_default_attrs,
704 .name = "microcode",
705};
706
707static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
708{
709 int err, cpu = sys_dev->id;
710 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
711
712 if (!cpu_online(cpu))
713 return 0;
714
715 pr_debug("microcode: CPU%d added\n", cpu);
716 memset(uci, 0, sizeof(*uci));
717
718 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
719 if (err)
720 return err;
721
722 microcode_init_cpu(cpu, resume);
723
724 return 0;
725}
726
727static int mc_sysdev_add(struct sys_device *sys_dev)
728{
729 return __mc_sysdev_add(sys_dev, 0);
730}
731
732static int mc_sysdev_remove(struct sys_device *sys_dev)
733{
734 int cpu = sys_dev->id;
735
736 if (!cpu_online(cpu))
737 return 0;
738
739 pr_debug("microcode: CPU%d removed\n", cpu);
740 microcode_fini_cpu(cpu);
741 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
742 return 0;
743}
744
745static int mc_sysdev_resume(struct sys_device *dev)
746{
747 int cpu = dev->id;
748
749 if (!cpu_online(cpu))
750 return 0;
751 pr_debug("microcode: CPU%d resumed\n", cpu);
752 /* only CPU 0 will apply ucode here */
753 apply_microcode(0);
754 return 0;
755}
756
757static struct sysdev_driver mc_sysdev_driver = {
758 .add = mc_sysdev_add,
759 .remove = mc_sysdev_remove,
760 .resume = mc_sysdev_resume,
761};
762
763static __cpuinit int
764mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
765{
766 unsigned int cpu = (unsigned long)hcpu;
767 struct sys_device *sys_dev;
768
769 sys_dev = get_cpu_sysdev(cpu);
770 switch (action) {
771 case CPU_UP_CANCELED_FROZEN:
772 /* The CPU refused to come up during a system resume */
773 microcode_fini_cpu(cpu);
774 break;
775 case CPU_ONLINE:
776 case CPU_DOWN_FAILED:
777 mc_sysdev_add(sys_dev);
778 break;
779 case CPU_ONLINE_FROZEN:
780 /* System-wide resume is in progress, try to apply microcode */
781 if (apply_microcode_check_cpu(cpu)) {
782 /* The application of microcode failed */
783 microcode_fini_cpu(cpu);
784 __mc_sysdev_add(sys_dev, 1);
785 break;
786 }
787 case CPU_DOWN_FAILED_FROZEN:
788 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
789 printk(KERN_ERR "microcode: Failed to create the sysfs "
790 "group for CPU%d\n", cpu);
791 break;
792 case CPU_DOWN_PREPARE:
793 mc_sysdev_remove(sys_dev);
794 break;
795 case CPU_DOWN_PREPARE_FROZEN:
796 /* Suspend is in progress, only remove the interface */
797 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
798 break;
799 }
800 return NOTIFY_OK;
801}
802
803static struct notifier_block __refdata mc_cpu_notifier = {
804 .notifier_call = mc_cpu_callback,
805};
806
807static int __init microcode_init (void)
808{
809 int error;
810
811 printk(KERN_INFO
812 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
813
814 error = microcode_dev_init();
815 if (error)
816 return error;
817 microcode_pdev = platform_device_register_simple("microcode", -1,
818 NULL, 0);
819 if (IS_ERR(microcode_pdev)) {
820 microcode_dev_exit();
821 return PTR_ERR(microcode_pdev);
822 }
823
824 get_online_cpus();
825 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
826 put_online_cpus();
827 if (error) {
828 microcode_dev_exit();
829 platform_device_unregister(microcode_pdev);
830 return error;
831 }
832
833 register_hotcpu_notifier(&mc_cpu_notifier);
834 return 0;
835}
836
837static void __exit microcode_exit (void)
838{
839 microcode_dev_exit();
840
841 unregister_hotcpu_notifier(&mc_cpu_notifier);
842
843 get_online_cpus();
844 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
845 put_online_cpus();
846
847 platform_device_unregister(microcode_pdev);
848}
849
850module_init(microcode_init)
851module_exit(microcode_exit)
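
Before it is removed here in favour of the vendor-specific drivers (the AMD one follows), the Intel driver validated an update by summing the entire header-plus-data area as 32-bit words and requiring the total to be zero. A standalone sketch of that check; the buffer below is fabricated test data, not a real microcode image:

#include <stdio.h>
#include <stdint.h>

/* Sum 'len' bytes (a multiple of 4) as 32-bit words; a valid Intel
 * microcode image is built so this total wraps to exactly zero. */
static int checksum_ok(const void *mc, unsigned long len)
{
	const uint32_t *p = mc;
	uint32_t sum = 0;
	unsigned long i;

	for (i = 0; i < len / sizeof(uint32_t); i++)
		sum += p[i];
	return sum == 0;
}

int main(void)
{
	/* Fabricated 16-byte "image": the last word makes the sum wrap to 0. */
	uint32_t fake[4] = { 0x11111111, 0x22222222, 0x33333333, 0 };

	fake[3] = (uint32_t)-(fake[0] + fake[1] + fake[2]);
	printf("checksum %s\n", checksum_ok(fake, sizeof(fake)) ? "ok" : "bad");
	return 0;
}

The extended-signature path reuses the same sum, swapping one signature triple for another before re-testing for zero.
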
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
new file mode 100644
index 000000000000..7a1f8eeac2c7
--- /dev/null
+++ b/arch/x86/kernel/microcode_amd.c
@@ -0,0 +1,435 @@
1/*
2 * AMD CPU Microcode Update Driver for Linux
3 * Copyright (C) 2008 Advanced Micro Devices Inc.
4 *
5 * Author: Peter Oruba <peter.oruba@amd.com>
6 *
7 * Based on work by:
8 * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
9 *
 10 * This driver allows upgrading the microcode on AMD
11 * family 0x10 and 0x11 processors.
12 *
 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details.
15*/
16
17#include <linux/capability.h>
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/sched.h>
21#include <linux/cpumask.h>
22#include <linux/module.h>
23#include <linux/slab.h>
24#include <linux/vmalloc.h>
25#include <linux/miscdevice.h>
26#include <linux/spinlock.h>
27#include <linux/mm.h>
28#include <linux/fs.h>
29#include <linux/mutex.h>
30#include <linux/cpu.h>
31#include <linux/firmware.h>
32#include <linux/platform_device.h>
33#include <linux/pci.h>
34#include <linux/pci_ids.h>
35
36#include <asm/msr.h>
37#include <asm/uaccess.h>
38#include <asm/processor.h>
39#include <asm/microcode.h>
40
41MODULE_DESCRIPTION("AMD Microcode Update Driver");
42MODULE_AUTHOR("Peter Oruba <peter.oruba@amd.com>");
43MODULE_LICENSE("GPL v2");
44
45#define UCODE_MAGIC 0x00414d44
46#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
47#define UCODE_UCODE_TYPE 0x00000001
48
49struct equiv_cpu_entry {
50 unsigned int installed_cpu;
51 unsigned int fixed_errata_mask;
52 unsigned int fixed_errata_compare;
53 unsigned int equiv_cpu;
54};
55
56struct microcode_header_amd {
57 unsigned int data_code;
58 unsigned int patch_id;
59 unsigned char mc_patch_data_id[2];
60 unsigned char mc_patch_data_len;
61 unsigned char init_flag;
62 unsigned int mc_patch_data_checksum;
63 unsigned int nb_dev_id;
64 unsigned int sb_dev_id;
65 unsigned char processor_rev_id[2];
66 unsigned char nb_rev_id;
67 unsigned char sb_rev_id;
68 unsigned char bios_api_rev;
69 unsigned char reserved1[3];
70 unsigned int match_reg[8];
71};
72
73struct microcode_amd {
74 struct microcode_header_amd hdr;
75 unsigned int mpb[0];
76};
77
78#define UCODE_MAX_SIZE (2048)
79#define DEFAULT_UCODE_DATASIZE (896)
80#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd))
81#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
82#define DWSIZE (sizeof(u32))
83/* For now we support a fixed ucode total size only */
84#define get_totalsize(mc) \
85 ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
86 + MC_HEADER_SIZE)
87
88/* serialize access to the physical write */
89static DEFINE_SPINLOCK(microcode_update_lock);
90
91static struct equiv_cpu_entry *equiv_cpu_table;
92
93static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
94{
95 struct cpuinfo_x86 *c = &cpu_data(cpu);
96
97 memset(csig, 0, sizeof(*csig));
98
99 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
100 printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n",
101 cpu);
102 return -1;
103 }
104
105 asm volatile("movl %1, %%ecx; rdmsr"
106 : "=a" (csig->rev)
107 : "i" (0x0000008B) : "ecx");
108
109 printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n",
110 csig->rev);
111
112 return 0;
113}
114
115static int get_matching_microcode(int cpu, void *mc, int rev)
116{
117 struct microcode_header_amd *mc_header = mc;
118 struct pci_dev *nb_pci_dev, *sb_pci_dev;
119 unsigned int current_cpu_id;
120 unsigned int equiv_cpu_id = 0x00;
121 unsigned int i = 0;
122
123 BUG_ON(equiv_cpu_table == NULL);
124 current_cpu_id = cpuid_eax(0x00000001);
125
126 while (equiv_cpu_table[i].installed_cpu != 0) {
127 if (current_cpu_id == equiv_cpu_table[i].installed_cpu) {
128 equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
129 break;
130 }
131 i++;
132 }
133
134 if (!equiv_cpu_id) {
135 printk(KERN_ERR "microcode: CPU%d cpu_id "
136 "not found in equivalent cpu table \n", cpu);
137 return 0;
138 }
139
140 if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) {
141 printk(KERN_ERR
142 "microcode: CPU%d patch does not match "
143 "(patch is %x, cpu extended is %x) \n",
144 cpu, mc_header->processor_rev_id[0],
145 (equiv_cpu_id & 0xff));
146 return 0;
147 }
148
149 if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) {
150 printk(KERN_ERR "microcode: CPU%d patch does not match "
151 "(patch is %x, cpu base id is %x) \n",
152 cpu, mc_header->processor_rev_id[1],
153 ((equiv_cpu_id >> 16) & 0xff));
154
155 return 0;
156 }
157
158 /* ucode may be northbridge specific */
159 if (mc_header->nb_dev_id) {
160 nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
161 (mc_header->nb_dev_id & 0xff),
162 NULL);
163 if ((!nb_pci_dev) ||
164 (mc_header->nb_rev_id != nb_pci_dev->revision)) {
165 printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu);
166 pci_dev_put(nb_pci_dev);
167 return 0;
168 }
169 pci_dev_put(nb_pci_dev);
170 }
171
172 /* ucode may be southbridge specific */
173 if (mc_header->sb_dev_id) {
174 sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
175 (mc_header->sb_dev_id & 0xff),
176 NULL);
177 if ((!sb_pci_dev) ||
178 (mc_header->sb_rev_id != sb_pci_dev->revision)) {
179 printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu);
180 pci_dev_put(sb_pci_dev);
181 return 0;
182 }
183 pci_dev_put(sb_pci_dev);
184 }
185
186 if (mc_header->patch_id <= rev)
187 return 0;
188
189 return 1;
190}
191
192static void apply_microcode_amd(int cpu)
193{
194 unsigned long flags;
195 unsigned int eax, edx;
196 unsigned int rev;
197 int cpu_num = raw_smp_processor_id();
198 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
199 struct microcode_amd *mc_amd = uci->mc;
200 unsigned long addr;
201
202 /* We should bind the task to the CPU */
203 BUG_ON(cpu_num != cpu);
204
205 if (mc_amd == NULL)
206 return;
207
208 spin_lock_irqsave(&microcode_update_lock, flags);
209
210 addr = (unsigned long)&mc_amd->hdr.data_code;
211 edx = (unsigned int)(((unsigned long)upper_32_bits(addr)));
212 eax = (unsigned int)(((unsigned long)lower_32_bits(addr)));
213
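	/*
	 * MSR 0xc0010020 is AMD's patch-loader MSR: writing the address of
	 * the patch image (split across EDX:EAX) asks the CPU to load it.
	 * Interrupts are off and microcode_update_lock is held, so only one
	 * CPU performs the write at a time.
	 */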
214 asm volatile("movl %0, %%ecx; wrmsr" :
215 : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx");
216
217 /* get patch id after patching */
218 asm volatile("movl %1, %%ecx; rdmsr"
219 : "=a" (rev)
220 : "i" (0x0000008B) : "ecx");
221
222 spin_unlock_irqrestore(&microcode_update_lock, flags);
223
224 /* check current patch id and patch's id for match */
225 if (rev != mc_amd->hdr.patch_id) {
226 printk(KERN_ERR "microcode: CPU%d update from revision "
227 "0x%x to 0x%x failed\n", cpu_num,
228 mc_amd->hdr.patch_id, rev);
229 return;
230 }
231
232 printk(KERN_INFO "microcode: CPU%d updated from revision "
233 "0x%x to 0x%x \n",
234 cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
235
236 uci->cpu_sig.rev = rev;
237}
238
239static void * get_next_ucode(u8 *buf, unsigned int size,
240 int (*get_ucode_data)(void *, const void *, size_t),
241 unsigned int *mc_size)
242{
243 unsigned int total_size;
244#define UCODE_CONTAINER_SECTION_HDR 8
245 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
246 void *mc;
247
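	/*
	 * Each ucode section is preceded by an 8-byte header: a 32-bit type
	 * followed by a 32-bit size; only the low 16 bits of the size are
	 * consumed below.
	 */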
248 if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR))
249 return NULL;
250
251 if (section_hdr[0] != UCODE_UCODE_TYPE) {
252 printk(KERN_ERR "microcode: error! "
253 "Wrong microcode payload type field\n");
254 return NULL;
255 }
256
257 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
258
259 printk(KERN_INFO "microcode: size %u, total_size %u\n",
260 size, total_size);
261
262 if (total_size > size || total_size > UCODE_MAX_SIZE) {
263 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
264 return NULL;
265 }
266
267 mc = vmalloc(UCODE_MAX_SIZE);
268 if (mc) {
269 memset(mc, 0, UCODE_MAX_SIZE);
270 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) {
271 vfree(mc);
272 mc = NULL;
273 } else
274 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
275 }
276#undef UCODE_CONTAINER_SECTION_HDR
277 return mc;
278}
279
280
281static int install_equiv_cpu_table(u8 *buf,
282 int (*get_ucode_data)(void *, const void *, size_t))
283{
284#define UCODE_CONTAINER_HEADER_SIZE 12
285 u8 container_hdr[UCODE_CONTAINER_HEADER_SIZE];
286 unsigned int *buf_pos = (unsigned int *)container_hdr;
287 unsigned long size;
288
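	/*
	 * The container starts with a 12-byte header of three 32-bit words:
	 * buf_pos[0] is presumably the UCODE_MAGIC word (not verified here),
	 * buf_pos[1] the section type and buf_pos[2] the equivalence-table
	 * size.
	 */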
289 if (get_ucode_data(container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE))
290 return 0;
291
292 size = buf_pos[2];
293
294 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
295 printk(KERN_ERR "microcode: error! "
296 "Wrong microcode equivalnet cpu table\n");
297 return 0;
298 }
299
300 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
301 if (!equiv_cpu_table) {
302 printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n");
303 return 0;
304 }
305
306 buf += UCODE_CONTAINER_HEADER_SIZE;
307 if (get_ucode_data(equiv_cpu_table, buf, size)) {
308 vfree(equiv_cpu_table);
309 return 0;
310 }
311
312 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
313#undef UCODE_CONTAINER_HEADER_SIZE
314}
315
316static void free_equiv_cpu_table(void)
317{
318 if (equiv_cpu_table) {
319 vfree(equiv_cpu_table);
320 equiv_cpu_table = NULL;
321 }
322}
323
324static int generic_load_microcode(int cpu, void *data, size_t size,
325 int (*get_ucode_data)(void *, const void *, size_t))
326{
327 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
328 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
329 int new_rev = uci->cpu_sig.rev;
330 unsigned int leftover;
331 unsigned long offset;
332
333 offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data);
334 if (!offset) {
335 printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
336 return -EINVAL;
337 }
338
339 ucode_ptr += offset;
340 leftover = size - offset;
341
342 while (leftover) {
343 unsigned int uninitialized_var(mc_size);
344 struct microcode_header_amd *mc_header;
345
346 mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size);
347 if (!mc)
348 break;
349
350 mc_header = (struct microcode_header_amd *)mc;
351 if (get_matching_microcode(cpu, mc, new_rev)) {
352 if (new_mc)
353 vfree(new_mc);
354 new_rev = mc_header->patch_id;
355 new_mc = mc;
356 } else
357 vfree(mc);
358
359 ucode_ptr += mc_size;
360 leftover -= mc_size;
361 }
362
363 if (new_mc) {
364 if (!leftover) {
365 if (uci->mc)
366 vfree(uci->mc);
367 uci->mc = new_mc;
368 pr_debug("microcode: CPU%d found a matching microcode update with"
369 " version 0x%x (current=0x%x)\n",
370 cpu, new_rev, uci->cpu_sig.rev);
371 } else
372 vfree(new_mc);
373 }
374
375 free_equiv_cpu_table();
376
377 return (int)leftover;
378}
379
380static int get_ucode_fw(void *to, const void *from, size_t n)
381{
382 memcpy(to, from, n);
383 return 0;
384}
385
386static int request_microcode_fw(int cpu, struct device *device)
387{
388 const char *fw_name = "amd-ucode/microcode_amd.bin";
389 const struct firmware *firmware;
390 int ret;
391
392 /* We should bind the task to the CPU */
393 BUG_ON(cpu != raw_smp_processor_id());
394
395 ret = request_firmware(&firmware, fw_name, device);
396 if (ret) {
397 printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name);
398 return ret;
399 }
400
401 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
402 &get_ucode_fw);
403
404 release_firmware(firmware);
405
406 return ret;
407}
408
409static int request_microcode_user(int cpu, const void __user *buf, size_t size)
410{
411 printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode"
412 "is not supported\n");
413 return -1;
414}
415
416static void microcode_fini_cpu_amd(int cpu)
417{
418 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
419
420 vfree(uci->mc);
421 uci->mc = NULL;
422}
423
424static struct microcode_ops microcode_amd_ops = {
425 .request_microcode_user = request_microcode_user,
426 .request_microcode_fw = request_microcode_fw,
427 .collect_cpu_info = collect_cpu_info_amd,
428 .apply_microcode = apply_microcode_amd,
429 .microcode_fini_cpu = microcode_fini_cpu_amd,
430};
431
432struct microcode_ops * __init init_amd_microcode(void)
433{
434 return &microcode_amd_ops;
435}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
new file mode 100644
index 000000000000..936d8d55f230
--- /dev/null
+++ b/arch/x86/kernel/microcode_core.c
@@ -0,0 +1,508 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows updating microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to
59 * speculative nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73#include <linux/capability.h>
74#include <linux/kernel.h>
75#include <linux/init.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h>
95
96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
98MODULE_LICENSE("GPL");
99
100#define MICROCODE_VERSION "2.00"
101
102struct microcode_ops *microcode_ops;
103
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
105static DEFINE_MUTEX(microcode_mutex);
106
107struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
108EXPORT_SYMBOL_GPL(ucode_cpu_info);
109
110#ifdef CONFIG_MICROCODE_OLD_INTERFACE
111static int do_microcode_update(const void __user *buf, size_t size)
112{
113 cpumask_t old;
114 int error = 0;
115 int cpu;
116
117 old = current->cpus_allowed;
118
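	/*
	 * Feed the user-supplied image to every online CPU that has valid
	 * CPU info, temporarily pinning this task to each CPU so the
	 * vendor-specific apply path runs on the CPU being updated.
	 */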
119 for_each_online_cpu(cpu) {
120 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
121
122 if (!uci->valid)
123 continue;
124
125 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
126 error = microcode_ops->request_microcode_user(cpu, buf, size);
127 if (error < 0)
128 goto out;
129 if (!error)
130 microcode_ops->apply_microcode(cpu);
131 }
132out:
133 set_cpus_allowed_ptr(current, &old);
134 return error;
135}
136
137static int microcode_open(struct inode *unused1, struct file *unused2)
138{
139 cycle_kernel_lock();
140 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
141}
142
143static ssize_t microcode_write(struct file *file, const char __user *buf,
144 size_t len, loff_t *ppos)
145{
146 ssize_t ret;
147
148 if ((len >> PAGE_SHIFT) > num_physpages) {
149 printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
150 num_physpages);
151 return -EINVAL;
152 }
153
154 get_online_cpus();
155 mutex_lock(&microcode_mutex);
156
157 ret = do_microcode_update(buf, len);
158 if (!ret)
159 ret = (ssize_t)len;
160
161 mutex_unlock(&microcode_mutex);
162 put_online_cpus();
163
164 return ret;
165}
166
167static const struct file_operations microcode_fops = {
168 .owner = THIS_MODULE,
169 .write = microcode_write,
170 .open = microcode_open,
171};
172
173static struct miscdevice microcode_dev = {
174 .minor = MICROCODE_MINOR,
175 .name = "microcode",
176 .fops = &microcode_fops,
177};
178
179static int __init microcode_dev_init(void)
180{
181 int error;
182
183 error = misc_register(&microcode_dev);
184 if (error) {
185 printk(KERN_ERR
186 "microcode: can't misc_register on minor=%d\n",
187 MICROCODE_MINOR);
188 return error;
189 }
190
191 return 0;
192}
193
194static void microcode_dev_exit(void)
195{
196 misc_deregister(&microcode_dev);
197}
198
199MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
200#else
201#define microcode_dev_init() 0
202#define microcode_dev_exit() do { } while (0)
203#endif
204
205/* fake device for request_firmware */
206struct platform_device *microcode_pdev;
207
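/*
 * sysfs "reload" attribute: writing 1 to
 * /sys/devices/system/cpu/cpuN/microcode/reload re-requests the firmware
 * image for that CPU and re-applies it while the writer is pinned there.
 */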
208static ssize_t reload_store(struct sys_device *dev,
209 struct sysdev_attribute *attr,
210 const char *buf, size_t sz)
211{
212 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
213 char *end;
214 unsigned long val = simple_strtoul(buf, &end, 0);
215 int err = 0;
216 int cpu = dev->id;
217
218 if (end == buf)
219 return -EINVAL;
220 if (val == 1) {
221 cpumask_t old = current->cpus_allowed;
222
223 get_online_cpus();
224 if (cpu_online(cpu)) {
225 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
226 mutex_lock(&microcode_mutex);
227 if (uci->valid) {
228 err = microcode_ops->request_microcode_fw(cpu,
229 &microcode_pdev->dev);
230 if (!err)
231 microcode_ops->apply_microcode(cpu);
232 }
233 mutex_unlock(&microcode_mutex);
234 set_cpus_allowed_ptr(current, &old);
235 }
236 put_online_cpus();
237 }
238 if (err)
239 return err;
240 return sz;
241}
242
243static ssize_t version_show(struct sys_device *dev,
244 struct sysdev_attribute *attr, char *buf)
245{
246 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
247
248 return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
249}
250
251static ssize_t pf_show(struct sys_device *dev,
252 struct sysdev_attribute *attr, char *buf)
253{
254 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
255
256 return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
257}
258
259static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
260static SYSDEV_ATTR(version, 0400, version_show, NULL);
261static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
262
263static struct attribute *mc_default_attrs[] = {
264 &attr_reload.attr,
265 &attr_version.attr,
266 &attr_processor_flags.attr,
267 NULL
268};
269
270static struct attribute_group mc_attr_group = {
271 .attrs = mc_default_attrs,
272 .name = "microcode",
273};
274
275static void microcode_fini_cpu(int cpu)
276{
277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
278
279 mutex_lock(&microcode_mutex);
280 microcode_ops->microcode_fini_cpu(cpu);
281 uci->valid = 0;
282 mutex_unlock(&microcode_mutex);
283}
284
285static void collect_cpu_info(int cpu)
286{
287 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
288
289 memset(uci, 0, sizeof(*uci));
290 if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig))
291 uci->valid = 1;
292}
293
294static int microcode_resume_cpu(int cpu)
295{
296 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
297 struct cpu_signature nsig;
298
299 pr_debug("microcode: CPU%d resumed\n", cpu);
300
301 if (!uci->mc)
302 return 1;
303
304 /*
305 * Let's verify that the 'cached' ucode does belong
306 * to this cpu (a bit of paranoia):
307 */
308 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
309 microcode_fini_cpu(cpu);
310 return -1;
311 }
312
313 if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) {
314 microcode_fini_cpu(cpu);
315 /* Should we look for a new ucode here? */
316 return 1;
317 }
318
319 return 0;
320}
321
322void microcode_update_cpu(int cpu)
323{
324 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
325 int err = 0;
326
327 /*
328 * Check if the system resume is in progress (uci->valid is set),
329 * otherwise just request a firmware:
330 */
331 if (uci->valid) {
332 err = microcode_resume_cpu(cpu);
333 } else {
334 collect_cpu_info(cpu);
335 if (uci->valid && system_state == SYSTEM_RUNNING)
336 err = microcode_ops->request_microcode_fw(cpu,
337 &microcode_pdev->dev);
338 }
339 if (!err)
340 microcode_ops->apply_microcode(cpu);
341}
342
343static void microcode_init_cpu(int cpu)
344{
345 cpumask_t old = current->cpus_allowed;
346
347 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
348 /* We should bind the task to the CPU */
349 BUG_ON(raw_smp_processor_id() != cpu);
350
351 mutex_lock(&microcode_mutex);
352 microcode_update_cpu(cpu);
353 mutex_unlock(&microcode_mutex);
354
355 set_cpus_allowed_ptr(current, &old);
356}
357
358static int mc_sysdev_add(struct sys_device *sys_dev)
359{
360 int err, cpu = sys_dev->id;
361 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
362
363 if (!cpu_online(cpu))
364 return 0;
365
366 pr_debug("microcode: CPU%d added\n", cpu);
367 memset(uci, 0, sizeof(*uci));
368
369 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
370 if (err)
371 return err;
372
373 microcode_init_cpu(cpu);
374 return 0;
375}
376
377static int mc_sysdev_remove(struct sys_device *sys_dev)
378{
379 int cpu = sys_dev->id;
380
381 if (!cpu_online(cpu))
382 return 0;
383
384 pr_debug("microcode: CPU%d removed\n", cpu);
385 microcode_fini_cpu(cpu);
386 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
387 return 0;
388}
389
390static int mc_sysdev_resume(struct sys_device *dev)
391{
392 int cpu = dev->id;
393
394 if (!cpu_online(cpu))
395 return 0;
396
397 /* only CPU 0 will apply ucode here */
398 microcode_update_cpu(0);
399 return 0;
400}
401
402static struct sysdev_driver mc_sysdev_driver = {
403 .add = mc_sysdev_add,
404 .remove = mc_sysdev_remove,
405 .resume = mc_sysdev_resume,
406};
407
408static __cpuinit int
409mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
410{
411 unsigned int cpu = (unsigned long)hcpu;
412 struct sys_device *sys_dev;
413
414 sys_dev = get_cpu_sysdev(cpu);
415 switch (action) {
416 case CPU_ONLINE:
417 case CPU_ONLINE_FROZEN:
418 microcode_init_cpu(cpu);
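		/* fall through: (re)create the sysfs group for the CPU in both cases */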
419 case CPU_DOWN_FAILED:
420 case CPU_DOWN_FAILED_FROZEN:
421 pr_debug("microcode: CPU%d added\n", cpu);
422 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
423 printk(KERN_ERR "microcode: Failed to create the sysfs "
424 "group for CPU%d\n", cpu);
425 break;
426 case CPU_DOWN_PREPARE:
427 case CPU_DOWN_PREPARE_FROZEN:
428 /* Suspend is in progress, only remove the interface */
429 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
430 pr_debug("microcode: CPU%d removed\n", cpu);
431 break;
432 case CPU_DEAD:
433 case CPU_UP_CANCELED_FROZEN:
434 /* The CPU refused to come up during a system resume */
435 microcode_fini_cpu(cpu);
436 break;
437 }
438 return NOTIFY_OK;
439}
440
441static struct notifier_block __refdata mc_cpu_notifier = {
442 .notifier_call = mc_cpu_callback,
443};
444
445static int __init microcode_init(void)
446{
447 struct cpuinfo_x86 *c = &cpu_data(0);
448 int error;
449
450 if (c->x86_vendor == X86_VENDOR_INTEL)
451 microcode_ops = init_intel_microcode();
452 else if (c->x86_vendor == X86_VENDOR_AMD)
453 microcode_ops = init_amd_microcode();
454
455 if (!microcode_ops) {
456 printk(KERN_ERR "microcode: no support for this CPU vendor\n");
457 return -ENODEV;
458 }
459
460 error = microcode_dev_init();
461 if (error)
462 return error;
463 microcode_pdev = platform_device_register_simple("microcode", -1,
464 NULL, 0);
465 if (IS_ERR(microcode_pdev)) {
466 microcode_dev_exit();
467 return PTR_ERR(microcode_pdev);
468 }
469
470 get_online_cpus();
471 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
472 put_online_cpus();
473 if (error) {
474 microcode_dev_exit();
475 platform_device_unregister(microcode_pdev);
476 return error;
477 }
478
479 register_hotcpu_notifier(&mc_cpu_notifier);
480
481 printk(KERN_INFO
482 "Microcode Update Driver: v" MICROCODE_VERSION
483 " <tigran@aivazian.fsnet.co.uk>"
484 " <peter.oruba@amd.com>\n");
485
486 return 0;
487}
488
489static void __exit microcode_exit(void)
490{
491 microcode_dev_exit();
492
493 unregister_hotcpu_notifier(&mc_cpu_notifier);
494
495 get_online_cpus();
496 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
497 put_online_cpus();
498
499 platform_device_unregister(microcode_pdev);
500
501 microcode_ops = NULL;
502
503 printk(KERN_INFO
504 "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
505}
506
507module_init(microcode_init);
508module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
new file mode 100644
index 000000000000..622dc4a21784
--- /dev/null
+++ b/arch/x86/kernel/microcode_intel.c
@@ -0,0 +1,480 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows updating microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to
59 * speculative nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73#include <linux/capability.h>
74#include <linux/kernel.h>
75#include <linux/init.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h>
95
96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
98MODULE_LICENSE("GPL");
99
100struct microcode_header_intel {
101 unsigned int hdrver;
102 unsigned int rev;
103 unsigned int date;
104 unsigned int sig;
105 unsigned int cksum;
106 unsigned int ldrver;
107 unsigned int pf;
108 unsigned int datasize;
109 unsigned int totalsize;
110 unsigned int reserved[3];
111};
112
113struct microcode_intel {
114 struct microcode_header_intel hdr;
115 unsigned int bits[0];
116};
117
118/* microcode format is extended from prescott processors */
119struct extended_signature {
120 unsigned int sig;
121 unsigned int pf;
122 unsigned int cksum;
123};
124
125struct extended_sigtable {
126 unsigned int count;
127 unsigned int cksum;
128 unsigned int reserved[3];
129 struct extended_signature sigs[0];
130};
131
132#define DEFAULT_UCODE_DATASIZE (2000)
133#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
134#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
135#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
136#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
137#define DWSIZE (sizeof(u32))
138#define get_totalsize(mc) \
139 (((struct microcode_intel *)mc)->hdr.totalsize ? \
140 ((struct microcode_intel *)mc)->hdr.totalsize : \
141 DEFAULT_UCODE_TOTALSIZE)
142
143#define get_datasize(mc) \
144 (((struct microcode_intel *)mc)->hdr.datasize ? \
145 ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
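/*
 * A zero totalsize/datasize in the header denotes the original fixed-size
 * update format: 2000 data bytes plus the 48-byte header, which is what
 * the defaults above encode.
 */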
146
147#define sigmatch(s1, s2, p1, p2) \
148 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
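/*
 * sigmatch(): signatures must be equal and the platform-flag masks must
 * intersect; older CPUs that report no platform flags (pf == 0 on both
 * sides) also match.
 */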
149
150#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
151
152/* serialize access to the physical write to MSR 0x79 */
153static DEFINE_SPINLOCK(microcode_update_lock);
154
155static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
156{
157 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
158 unsigned int val[2];
159
160 memset(csig, 0, sizeof(*csig));
161
162 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
163 cpu_has(c, X86_FEATURE_IA64)) {
164 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
165 "processor\n", cpu_num);
166 return -1;
167 }
168
169 csig->sig = cpuid_eax(0x00000001);
170
171 if ((c->x86_model >= 5) || (c->x86 > 6)) {
172 /* get processor flags from MSR 0x17 */
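		/*
		 * The platform ID lives in bits 52:50 of MSR_IA32_PLATFORM_ID;
		 * pf becomes the corresponding one-hot mask.
		 */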
173 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
174 csig->pf = 1 << ((val[1] >> 18) & 7);
175 }
176
177 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
178 /* see notes above for revision 1.07. Apparent chip bug */
179 sync_core();
180 /* get the current revision from MSR 0x8B */
181 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
182 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
183 csig->sig, csig->pf, csig->rev);
184
185 return 0;
186}
187
188static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
189{
190 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
191}
192
193static inline int
194update_match_revision(struct microcode_header_intel *mc_header, int rev)
195{
196 return (mc_header->rev <= rev) ? 0 : 1;
197}
198
199static int microcode_sanity_check(void *mc)
200{
201 struct microcode_header_intel *mc_header = mc;
202 struct extended_sigtable *ext_header = NULL;
203 struct extended_signature *ext_sig;
204 unsigned long total_size, data_size, ext_table_size;
205 int sum, orig_sum, ext_sigcount = 0, i;
206
207 total_size = get_totalsize(mc_header);
208 data_size = get_datasize(mc_header);
209 if (data_size + MC_HEADER_SIZE > total_size) {
210 printk(KERN_ERR "microcode: error! "
211 "Bad data size in microcode data file\n");
212 return -EINVAL;
213 }
214
215 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
216 printk(KERN_ERR "microcode: error! "
217 "Unknown microcode update format\n");
218 return -EINVAL;
219 }
220 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
221 if (ext_table_size) {
222 if ((ext_table_size < EXT_HEADER_SIZE)
223 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
224 printk(KERN_ERR "microcode: error! "
225 "Small exttable size in microcode data file\n");
226 return -EINVAL;
227 }
228 ext_header = mc + MC_HEADER_SIZE + data_size;
229 if (ext_table_size != exttable_size(ext_header)) {
230 printk(KERN_ERR "microcode: error! "
231 "Bad exttable size in microcode data file\n");
232 return -EFAULT;
233 }
234 ext_sigcount = ext_header->count;
235 }
236
237 /* check extended table checksum */
238 if (ext_table_size) {
239 int ext_table_sum = 0;
240 int *ext_tablep = (int *)ext_header;
241
242 i = ext_table_size / DWSIZE;
243 while (i--)
244 ext_table_sum += ext_tablep[i];
245 if (ext_table_sum) {
246 printk(KERN_WARNING "microcode: aborting, "
247 "bad extended signature table checksum\n");
248 return -EINVAL;
249 }
250 }
251
252 /* calculate the checksum */
253 orig_sum = 0;
254 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
255 while (i--)
256 orig_sum += ((int *)mc)[i];
257 if (orig_sum) {
258 printk(KERN_ERR "microcode: aborting, bad checksum\n");
259 return -EINVAL;
260 }
261 if (!ext_table_size)
262 return 0;
263 /* check extended signature checksum */
264 for (i = 0; i < ext_sigcount; i++) {
265 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
266 EXT_SIGNATURE_SIZE * i;
267 sum = orig_sum
268 - (mc_header->sig + mc_header->pf + mc_header->cksum)
269 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
270 if (sum) {
271 printk(KERN_ERR "microcode: aborting, bad checksum\n");
272 return -EINVAL;
273 }
274 }
275 return 0;
276}
277
278/*
279 * return 0 - no update found
280 * return 1 - found update
281 */
282static int
283get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
284{
285 struct microcode_header_intel *mc_header = mc;
286 struct extended_sigtable *ext_header;
287 unsigned long total_size = get_totalsize(mc_header);
288 int ext_sigcount, i;
289 struct extended_signature *ext_sig;
290
291 if (!update_match_revision(mc_header, rev))
292 return 0;
293
294 if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf))
295 return 1;
296
297 /* Look for ext. headers: */
298 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
299 return 0;
300
301 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
302 ext_sigcount = ext_header->count;
303 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
304
305 for (i = 0; i < ext_sigcount; i++) {
306 if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf))
307 return 1;
308 ext_sig++;
309 }
310 return 0;
311}
312
313static void apply_microcode(int cpu)
314{
315 unsigned long flags;
316 unsigned int val[2];
317 int cpu_num = raw_smp_processor_id();
318 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
319 struct microcode_intel *mc_intel = uci->mc;
320
321 /* We should bind the task to the CPU */
322 BUG_ON(cpu_num != cpu);
323
324 if (mc_intel == NULL)
325 return;
326
327 /* serialize access to the physical write to MSR 0x79 */
328 spin_lock_irqsave(&microcode_update_lock, flags);
329
330 /* write microcode via MSR 0x79 */
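	/*
	 * MSR_IA32_UCODE_WRITE takes the 64-bit linear address of the update
	 * data in EDX:EAX; the double 16-bit shift avoids an undefined 32-bit
	 * shift when this builds on 32-bit kernels.
	 */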
331 wrmsr(MSR_IA32_UCODE_WRITE,
332 (unsigned long) mc_intel->bits,
333 (unsigned long) mc_intel->bits >> 16 >> 16);
334 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
335
336 /* see notes above for revision 1.07. Apparent chip bug */
337 sync_core();
338
339 /* get the current revision from MSR 0x8B */
340 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
341
342 spin_unlock_irqrestore(&microcode_update_lock, flags);
343 if (val[1] != mc_intel->hdr.rev) {
344 printk(KERN_ERR "microcode: CPU%d update from revision "
345 "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]);
346 return;
347 }
348 printk(KERN_INFO "microcode: CPU%d updated from revision "
349 "0x%x to 0x%x, date = %04x-%02x-%02x \n",
350 cpu_num, uci->cpu_sig.rev, val[1],
351 mc_intel->hdr.date & 0xffff,
352 mc_intel->hdr.date >> 24,
353 (mc_intel->hdr.date >> 16) & 0xff);
354 uci->cpu_sig.rev = val[1];
355}
356
357static int generic_load_microcode(int cpu, void *data, size_t size,
358 int (*get_ucode_data)(void *, const void *, size_t))
359{
360 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
361 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
362 int new_rev = uci->cpu_sig.rev;
363 unsigned int leftover = size;
364
365 while (leftover) {
366 struct microcode_header_intel mc_header;
367 unsigned int mc_size;
368
369 if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
370 break;
371
372 mc_size = get_totalsize(&mc_header);
373 if (!mc_size || mc_size > leftover) {
374 printk(KERN_ERR "microcode: error!"
375 "Bad data in microcode data file\n");
376 break;
377 }
378
379 mc = vmalloc(mc_size);
380 if (!mc)
381 break;
382
383 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
384 microcode_sanity_check(mc) < 0) {
385 vfree(mc);
386 break;
387 }
388
389 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
390 if (new_mc)
391 vfree(new_mc);
392 new_rev = mc_header.rev;
393 new_mc = mc;
394 } else
395 vfree(mc);
396
397 ucode_ptr += mc_size;
398 leftover -= mc_size;
399 }
400
401 if (new_mc) {
402 if (!leftover) {
403 if (uci->mc)
404 vfree(uci->mc);
405 uci->mc = (struct microcode_intel *)new_mc;
406 pr_debug("microcode: CPU%d found a matching microcode update with"
407 " version 0x%x (current=0x%x)\n",
408 cpu, new_rev, uci->cpu_sig.rev);
409 } else
410 vfree(new_mc);
411 }
412
413 return (int)leftover;
414}
415
416static int get_ucode_fw(void *to, const void *from, size_t n)
417{
418 memcpy(to, from, n);
419 return 0;
420}
421
422static int request_microcode_fw(int cpu, struct device *device)
423{
424 char name[30];
425 struct cpuinfo_x86 *c = &cpu_data(cpu);
426 const struct firmware *firmware;
427 int ret;
428
429 /* We should bind the task to the CPU */
430 BUG_ON(cpu != raw_smp_processor_id());
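	/*
	 * Firmware blobs are looked up by family-model-stepping, e.g. a name
	 * of the form intel-ucode/06-17-06 (the numbers are only illustrative).
	 */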
431 sprintf(name, "intel-ucode/%02x-%02x-%02x",
432 c->x86, c->x86_model, c->x86_mask);
433 ret = request_firmware(&firmware, name, device);
434 if (ret) {
435 pr_debug("microcode: data file %s load failed\n", name);
436 return ret;
437 }
438
439 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
440 &get_ucode_fw);
441
442 release_firmware(firmware);
443
444 return ret;
445}
446
447static int get_ucode_user(void *to, const void *from, size_t n)
448{
449 return copy_from_user(to, from, n);
450}
451
452static int request_microcode_user(int cpu, const void __user *buf, size_t size)
453{
454 /* We should bind the task to the CPU */
455 BUG_ON(cpu != raw_smp_processor_id());
456
457 return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user);
458}
459
460static void microcode_fini_cpu(int cpu)
461{
462 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
463
464 vfree(uci->mc);
465 uci->mc = NULL;
466}
467
468struct microcode_ops microcode_intel_ops = {
469 .request_microcode_user = request_microcode_user,
470 .request_microcode_fw = request_microcode_fw,
471 .collect_cpu_info = collect_cpu_info,
472 .apply_microcode = apply_microcode,
473 .microcode_fini_cpu = microcode_fini_cpu,
474};
475
476struct microcode_ops * __init init_intel_microcode(void)
477{
478 return &microcode_intel_ops;
479}
480
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index fdfdc550b366..efc2f361fe85 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -238,7 +238,7 @@ static struct dmi_system_id __devinitdata mmconf_dmi_table[] = {
238 {} 238 {}
239}; 239};
240 240
241void __init check_enable_amd_mmconf_dmi(void) 241void __cpuinit check_enable_amd_mmconf_dmi(void)
242{ 242{
243 dmi_check_system(mmconf_dmi_table); 243 dmi_check_system(mmconf_dmi_table);
244} 244}
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index a888e67f5874..6ba87830d4b1 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -22,6 +22,7 @@
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/mm.h>
25#include <linux/slab.h> 26#include <linux/slab.h>
26#include <linux/bug.h> 27#include <linux/bug.h>
27 28
@@ -150,7 +151,8 @@ int module_finalize(const Elf_Ehdr *hdr,
150 const Elf_Shdr *sechdrs, 151 const Elf_Shdr *sechdrs,
151 struct module *me) 152 struct module *me)
152{ 153{
153 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; 154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
155 *para = NULL;
154 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 156 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
155 157
156 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 158 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -160,6 +162,8 @@ int module_finalize(const Elf_Ehdr *hdr,
160 alt = s; 162 alt = s;
161 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 163 if (!strcmp(".smp_locks", secstrings + s->sh_name))
162 locks= s; 164 locks= s;
165 if (!strcmp(".parainstructions", secstrings + s->sh_name))
166 para = s;
163 } 167 }
164 168
165 if (alt) { 169 if (alt) {
@@ -175,6 +179,11 @@ int module_finalize(const Elf_Ehdr *hdr,
175 tseg, tseg + text->sh_size); 179 tseg, tseg + text->sh_size);
176 } 180 }
177 181
182 if (para) {
183 void *pseg = (void *)para->sh_addr;
184 apply_paravirt(pseg, pseg + para->sh_size);
185 }
186
178 return module_bug_finalize(hdr, sechdrs, me); 187 return module_bug_finalize(hdr, sechdrs, me);
179} 188}
180 189
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 3b25e49380c6..f98f4e1dba09 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,6 +27,7 @@
27#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
28#include <asm/e820.h> 28#include <asm/e820.h>
29#include <asm/trampoline.h> 29#include <asm/trampoline.h>
30#include <asm/setup.h>
30 31
31#include <mach_apic.h> 32#include <mach_apic.h>
32#ifdef CONFIG_X86_32 33#ifdef CONFIG_X86_32
@@ -48,77 +49,7 @@ static int __init mpf_checksum(unsigned char *mp, int len)
48 return sum & 0xFF; 49 return sum & 0xFF;
49} 50}
50 51
51#ifdef CONFIG_X86_NUMAQ 52static void __init MP_processor_info(struct mpc_config_processor *m)
52int found_numaq;
53/*
54 * Have to match translation table entries to main table entries by counter
55 * hence the mpc_record variable .... can't see a less disgusting way of
56 * doing this ....
57 */
58struct mpc_config_translation {
59 unsigned char mpc_type;
60 unsigned char trans_len;
61 unsigned char trans_type;
62 unsigned char trans_quad;
63 unsigned char trans_global;
64 unsigned char trans_local;
65 unsigned short trans_reserved;
66};
67
68
69static int mpc_record;
70static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
71 __cpuinitdata;
72
73static inline int generate_logical_apicid(int quad, int phys_apicid)
74{
75 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
76}
77
78
79static inline int mpc_apic_id(struct mpc_config_processor *m,
80 struct mpc_config_translation *translation_record)
81{
82 int quad = translation_record->trans_quad;
83 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
84
85 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
86 m->mpc_apicid,
87 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
88 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
89 m->mpc_apicver, quad, logical_apicid);
90 return logical_apicid;
91}
92
93int mp_bus_id_to_node[MAX_MP_BUSSES];
94
95int mp_bus_id_to_local[MAX_MP_BUSSES];
96
97static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
98 struct mpc_config_translation *translation)
99{
100 int quad = translation->trans_quad;
101 int local = translation->trans_local;
102
103 mp_bus_id_to_node[m->mpc_busid] = quad;
104 mp_bus_id_to_local[m->mpc_busid] = local;
105 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
106 m->mpc_busid, name, quad);
107}
108
109int quad_local_to_mp_bus_id [NR_CPUS/4][4];
110static void mpc_oem_pci_bus(struct mpc_config_bus *m,
111 struct mpc_config_translation *translation)
112{
113 int quad = translation->trans_quad;
114 int local = translation->trans_local;
115
116 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
117}
118
119#endif
120
121static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
122{ 53{
123 int apicid; 54 int apicid;
124 char *bootup_cpu = ""; 55 char *bootup_cpu = "";
@@ -127,14 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
127 disabled_cpus++; 58 disabled_cpus++;
128 return; 59 return;
129 } 60 }
130#ifdef CONFIG_X86_NUMAQ 61
131 if (found_numaq) 62 if (x86_quirks->mpc_apic_id)
132 apicid = mpc_apic_id(m, translation_table[mpc_record]); 63 apicid = x86_quirks->mpc_apic_id(m);
133 else 64 else
134 apicid = m->mpc_apicid; 65 apicid = m->mpc_apicid;
135#else 66
136 apicid = m->mpc_apicid;
137#endif
138 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 67 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
139 bootup_cpu = " (Bootup-CPU)"; 68 bootup_cpu = " (Bootup-CPU)";
140 boot_cpu_physical_apicid = m->mpc_apicid; 69 boot_cpu_physical_apicid = m->mpc_apicid;
@@ -151,12 +80,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
151 memcpy(str, m->mpc_bustype, 6); 80 memcpy(str, m->mpc_bustype, 6);
152 str[6] = 0; 81 str[6] = 0;
153 82
154#ifdef CONFIG_X86_NUMAQ 83 if (x86_quirks->mpc_oem_bus_info)
155 if (found_numaq) 84 x86_quirks->mpc_oem_bus_info(m, str);
156 mpc_oem_bus_info(m, str, translation_table[mpc_record]); 85 else
157#else 86 apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
158 printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
159#endif
160 87
161#if MAX_MP_BUSSES < 256 88#if MAX_MP_BUSSES < 256
162 if (m->mpc_busid >= MAX_MP_BUSSES) { 89 if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -173,10 +100,9 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
173 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; 100 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
174#endif 101#endif
175 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
176#ifdef CONFIG_X86_NUMAQ 103 if (x86_quirks->mpc_oem_pci_bus)
177 if (found_numaq) 104 x86_quirks->mpc_oem_pci_bus(m);
178 mpc_oem_pci_bus(m, translation_table[mpc_record]); 105
179#endif
180 clear_bit(m->mpc_busid, mp_bus_not_pci); 106 clear_bit(m->mpc_busid, mp_bus_not_pci);
181#if defined(CONFIG_EISA) || defined (CONFIG_MCA) 107#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
182 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; 108 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
@@ -228,7 +154,7 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
228 154
229static void print_MP_intsrc_info(struct mpc_config_intsrc *m) 155static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
230{ 156{
231 printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," 157 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
232 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 158 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
233 m->mpc_irqtype, m->mpc_irqflag & 3, 159 m->mpc_irqtype, m->mpc_irqflag & 3,
234 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, 160 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
@@ -237,7 +163,7 @@ static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
237 163
238static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) 164static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
239{ 165{
240 printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," 166 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
241 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 167 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
242 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, 168 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
243 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, 169 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
@@ -309,90 +235,13 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
309 235
310static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) 236static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
311{ 237{
312 printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x," 238 apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
313 " IRQ %02x, APIC ID %x, APIC LINT %02x\n", 239 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
314 m->mpc_irqtype, m->mpc_irqflag & 3, 240 m->mpc_irqtype, m->mpc_irqflag & 3,
315 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, 241 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
316 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); 242 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
317} 243}
318 244
319#ifdef CONFIG_X86_NUMAQ
320static void __init MP_translation_info(struct mpc_config_translation *m)
321{
322 printk(KERN_INFO
323 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
324 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
325 m->trans_local);
326
327 if (mpc_record >= MAX_MPC_ENTRY)
328 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
329 else
330 translation_table[mpc_record] = m; /* stash this for later */
331 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
332 node_set_online(m->trans_quad);
333}
334
335/*
336 * Read/parse the MPC oem tables
337 */
338
339static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
340 unsigned short oemsize)
341{
342 int count = sizeof(*oemtable); /* the header size */
343 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
344
345 mpc_record = 0;
346 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
347 oemtable);
348 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
349 printk(KERN_WARNING
350 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
351 oemtable->oem_signature[0], oemtable->oem_signature[1],
352 oemtable->oem_signature[2], oemtable->oem_signature[3]);
353 return;
354 }
355 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
356 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
357 return;
358 }
359 while (count < oemtable->oem_length) {
360 switch (*oemptr) {
361 case MP_TRANSLATION:
362 {
363 struct mpc_config_translation *m =
364 (struct mpc_config_translation *)oemptr;
365 MP_translation_info(m);
366 oemptr += sizeof(*m);
367 count += sizeof(*m);
368 ++mpc_record;
369 break;
370 }
371 default:
372 {
373 printk(KERN_WARNING
374 "Unrecognised OEM table entry type! - %d\n",
375 (int)*oemptr);
376 return;
377 }
378 }
379 }
380}
381
382void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
383 char *productid)
384{
385 if (strncmp(oem, "IBM NUMA", 8))
386 printk("Warning! Not a NUMA-Q system!\n");
387 else
388 found_numaq = 1;
389
390 if (mpc->mpc_oemptr)
391 smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
392 mpc->mpc_oemsize);
393}
394#endif /* CONFIG_X86_NUMAQ */
395
396/* 245/*
397 * Read/parse the MPC 246 * Read/parse the MPC
398 */ 247 */
@@ -457,7 +306,6 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
457 } else 306 } else
458 mps_oem_check(mpc, oem, str); 307 mps_oem_check(mpc, oem, str);
459#endif 308#endif
460
461 /* save the local APIC address, it might be non-default */ 309 /* save the local APIC address, it might be non-default */
462 if (!acpi_lapic) 310 if (!acpi_lapic)
463 mp_lapic_addr = mpc->mpc_lapic; 311 mp_lapic_addr = mpc->mpc_lapic;
@@ -465,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
465 if (early) 313 if (early)
466 return 1; 314 return 1;
467 315
316 if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
317 struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
318 x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
319 }
320
468 /* 321 /*
469 * Now process the configuration blocks. 322 * Now process the configuration blocks.
470 */ 323 */
471#ifdef CONFIG_X86_NUMAQ 324 if (x86_quirks->mpc_record)
472 mpc_record = 0; 325 *x86_quirks->mpc_record = 0;
473#endif 326
474 while (count < mpc->mpc_length) { 327 while (count < mpc->mpc_length) {
475 switch (*mpt) { 328 switch (*mpt) {
476 case MP_PROCESSOR: 329 case MP_PROCESSOR:
@@ -536,16 +389,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
536 count = mpc->mpc_length; 389 count = mpc->mpc_length;
537 break; 390 break;
538 } 391 }
539#ifdef CONFIG_X86_NUMAQ 392 if (x86_quirks->mpc_record)
540 ++mpc_record; 393 (*x86_quirks->mpc_record)++;
541#endif
542 } 394 }
543 395
544#ifdef CONFIG_X86_GENERICARCH 396#ifdef CONFIG_X86_GENERICARCH
545 generic_bigsmp_probe(); 397 generic_bigsmp_probe();
546#endif 398#endif
547 399
400#ifdef CONFIG_X86_32
548 setup_apic_routing(); 401 setup_apic_routing();
402#endif
549 if (!num_processors) 403 if (!num_processors)
550 printk(KERN_ERR "MPTABLE: no processors registered!\n"); 404 printk(KERN_ERR "MPTABLE: no processors registered!\n");
551 return num_processors; 405 return num_processors;
@@ -632,7 +486,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
632} 486}
633 487
634 488
635static void construct_ioapic_table(int mpc_default_type) 489static void __init construct_ioapic_table(int mpc_default_type)
636{ 490{
637 struct mpc_config_ioapic ioapic; 491 struct mpc_config_ioapic ioapic;
638 struct mpc_config_bus bus; 492 struct mpc_config_bus bus;
@@ -677,7 +531,7 @@ static void construct_ioapic_table(int mpc_default_type)
677 construct_default_ioirq_mptable(mpc_default_type); 531 construct_default_ioirq_mptable(mpc_default_type);
678} 532}
679#else 533#else
680static inline void construct_ioapic_table(int mpc_default_type) { } 534static inline void __init construct_ioapic_table(int mpc_default_type) { }
681#endif 535#endif
682 536
683static inline void __init construct_default_ISA_mptable(int mpc_default_type) 537static inline void __init construct_default_ISA_mptable(int mpc_default_type)
@@ -726,20 +580,14 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
726static struct intel_mp_floating *mpf_found; 580static struct intel_mp_floating *mpf_found;
727 581
728/* 582/*
729 * Machine specific quirk for finding the SMP config before other setup
730 * activities destroy the table:
731 */
732int (*mach_get_smp_config_quirk)(unsigned int early);
733
734/*
735 * Scan the memory blocks for an SMP configuration block. 583 * Scan the memory blocks for an SMP configuration block.
736 */ 584 */
737static void __init __get_smp_config(unsigned int early) 585static void __init __get_smp_config(unsigned int early)
738{ 586{
739 struct intel_mp_floating *mpf = mpf_found; 587 struct intel_mp_floating *mpf = mpf_found;
740 588
741 if (mach_get_smp_config_quirk) { 589 if (x86_quirks->mach_get_smp_config) {
742 if (mach_get_smp_config_quirk(early)) 590 if (x86_quirks->mach_get_smp_config(early))
743 return; 591 return;
744 } 592 }
745 if (acpi_lapic && early) 593 if (acpi_lapic && early)
@@ -849,7 +697,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
849 unsigned int *bp = phys_to_virt(base); 697 unsigned int *bp = phys_to_virt(base);
850 struct intel_mp_floating *mpf; 698 struct intel_mp_floating *mpf;
851 699
852 printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length); 700 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
701 bp, length);
853 BUILD_BUG_ON(sizeof(*mpf) != 16); 702 BUILD_BUG_ON(sizeof(*mpf) != 16);
854 703
855 while (length > 0) { 704 while (length > 0) {
@@ -899,14 +748,12 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
899 return 0; 748 return 0;
900} 749}
901 750
902int (*mach_find_smp_config_quirk)(unsigned int reserve);
903
904static void __init __find_smp_config(unsigned int reserve) 751static void __init __find_smp_config(unsigned int reserve)
905{ 752{
906 unsigned int address; 753 unsigned int address;
907 754
908 if (mach_find_smp_config_quirk) { 755 if (x86_quirks->mach_find_smp_config) {
909 if (mach_find_smp_config_quirk(reserve)) 756 if (x86_quirks->mach_find_smp_config(reserve))
910 return; 757 return;
911 } 758 }
912 /* 759 /*
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index a153b3905f60..82a7c7ed6d45 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -72,21 +72,28 @@ static ssize_t msr_read(struct file *file, char __user *buf,
72 u32 data[2]; 72 u32 data[2];
73 u32 reg = *ppos; 73 u32 reg = *ppos;
74 int cpu = iminor(file->f_path.dentry->d_inode); 74 int cpu = iminor(file->f_path.dentry->d_inode);
75 int err; 75 int err = 0;
76 ssize_t bytes = 0;
76 77
77 if (count % 8) 78 if (count % 8)
78 return -EINVAL; /* Invalid chunk size */ 79 return -EINVAL; /* Invalid chunk size */
79 80
80 for (; count; count -= 8) { 81 for (; count; count -= 8) {
81 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); 82 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
82 if (err) 83 if (err) {
83 return -EIO; 84 if (err == -EFAULT) /* Fix idiotic error code */
84 if (copy_to_user(tmp, &data, 8)) 85 err = -EIO;
85 return -EFAULT; 86 break;
87 }
88 if (copy_to_user(tmp, &data, 8)) {
89 err = -EFAULT;
90 break;
91 }
86 tmp += 2; 92 tmp += 2;
93 bytes += 8;
87 } 94 }
88 95
89 return ((char __user *)tmp) - buf; 96 return bytes ? bytes : err;
90} 97}
91 98
92static ssize_t msr_write(struct file *file, const char __user *buf, 99static ssize_t msr_write(struct file *file, const char __user *buf,
@@ -96,21 +103,28 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
96 u32 data[2]; 103 u32 data[2];
97 u32 reg = *ppos; 104 u32 reg = *ppos;
98 int cpu = iminor(file->f_path.dentry->d_inode); 105 int cpu = iminor(file->f_path.dentry->d_inode);
99 int err; 106 int err = 0;
107 ssize_t bytes = 0;
100 108
101 if (count % 8) 109 if (count % 8)
102 return -EINVAL; /* Invalid chunk size */ 110 return -EINVAL; /* Invalid chunk size */
103 111
104 for (; count; count -= 8) { 112 for (; count; count -= 8) {
105 if (copy_from_user(&data, tmp, 8)) 113 if (copy_from_user(&data, tmp, 8)) {
106 return -EFAULT; 114 err = -EFAULT;
115 break;
116 }
107 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); 117 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
108 if (err) 118 if (err) {
109 return -EIO; 119 if (err == -EFAULT) /* Fix idiotic error code */
120 err = -EIO;
121 break;
122 }
110 tmp += 2; 123 tmp += 2;
124 bytes += 8;
111 } 125 }
112 126
113 return ((char __user *)tmp) - buf; 127 return bytes ? bytes : err;
114} 128}
115 129
116static int msr_open(struct inode *inode, struct file *file) 130static int msr_open(struct inode *inode, struct file *file)
@@ -131,7 +145,7 @@ static int msr_open(struct inode *inode, struct file *file)
131 ret = -EIO; /* MSR not supported */ 145 ret = -EIO; /* MSR not supported */
132out: 146out:
133 unlock_kernel(); 147 unlock_kernel();
134 return 0; 148 return ret;
135} 149}
136 150
137/* 151/*
@@ -149,7 +163,7 @@ static int __cpuinit msr_device_create(int cpu)
149{ 163{
150 struct device *dev; 164 struct device *dev;
151 165
152 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), 166 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL,
153 "msr%d", cpu); 167 "msr%d", cpu);
154 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 168 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
155} 169}
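The msr.c hunks switch /dev/cpu/N/msr reads and writes to the usual partial-transfer convention: completed 8-byte chunks are reported through the byte count, and an error code is returned only when nothing was transferred (with -EFAULT from the MSR access itself remapped to -EIO, and msr_open() finally returning its error instead of 0). A small user-space illustration of driving this interface; the device path and the MSR number (0x10, the TSC) are only an example.

/* Reads one MSR from CPU 0 via the msr character device. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        uint64_t val;
        ssize_t n;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* The file offset selects the MSR index; I/O must be a multiple of 8 bytes. */
        n = pread(fd, &val, sizeof(val), 0x10);
        if (n == (ssize_t)sizeof(val))
                printf("MSR 0x10 = %#llx\n", (unsigned long long)val);
        else
                perror("pread");        /* a failed RDMSR now shows up as EIO */
        close(fd);
        return 0;
}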
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ec024b3baad0..2c97f07f1c2c 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -114,6 +114,23 @@ static __init void nmi_cpu_busy(void *data)
114} 114}
115#endif 115#endif
116 116
117static void report_broken_nmi(int cpu, int *prev_nmi_count)
118{
119 printk(KERN_CONT "\n");
120
121 printk(KERN_WARNING
122 "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
123 cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
124
125 printk(KERN_WARNING
126 "Please report this to bugzilla.kernel.org,\n");
127 printk(KERN_WARNING
128 "and attach the output of the 'dmesg' command.\n");
129
130 per_cpu(wd_enabled, cpu) = 0;
131 atomic_dec(&nmi_active);
132}
133
117int __init check_nmi_watchdog(void) 134int __init check_nmi_watchdog(void)
118{ 135{
119 unsigned int *prev_nmi_count; 136 unsigned int *prev_nmi_count;
@@ -141,15 +158,8 @@ int __init check_nmi_watchdog(void)
141 for_each_online_cpu(cpu) { 158 for_each_online_cpu(cpu) {
142 if (!per_cpu(wd_enabled, cpu)) 159 if (!per_cpu(wd_enabled, cpu))
143 continue; 160 continue;
144 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { 161 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
145 printk(KERN_WARNING "WARNING: CPU#%d: NMI " 162 report_broken_nmi(cpu, prev_nmi_count);
146 "appears to be stuck (%d->%d)!\n",
147 cpu,
148 prev_nmi_count[cpu],
149 get_nmi_count(cpu));
150 per_cpu(wd_enabled, cpu) = 0;
151 atomic_dec(&nmi_active);
152 }
153 } 163 }
154 endflag = 1; 164 endflag = 1;
155 if (!atomic_read(&nmi_active)) { 165 if (!atomic_read(&nmi_active)) {
@@ -263,7 +273,7 @@ late_initcall(init_lapic_nmi_sysfs);
263 273
264static void __acpi_nmi_enable(void *__unused) 274static void __acpi_nmi_enable(void *__unused)
265{ 275{
266 apic_write_around(APIC_LVT0, APIC_DM_NMI); 276 apic_write(APIC_LVT0, APIC_DM_NMI);
267} 277}
268 278
269/* 279/*
@@ -277,7 +287,7 @@ void acpi_nmi_enable(void)
277 287
278static void __acpi_nmi_disable(void *__unused) 288static void __acpi_nmi_disable(void *__unused)
279{ 289{
280 apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); 290 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
281} 291}
282 292
283/* 293/*
@@ -289,6 +299,15 @@ void acpi_nmi_disable(void)
289 on_each_cpu(__acpi_nmi_disable, NULL, 1); 299 on_each_cpu(__acpi_nmi_disable, NULL, 1);
290} 300}
291 301
302/*
 303 * This function is called as soon as the LAPIC NMI watchdog driver has everything
 304 * in place and is ready to check whether the NMIs belong to the NMI watchdog
305 */
306void cpu_nmi_set_wd_enabled(void)
307{
308 __get_cpu_var(wd_enabled) = 1;
309}
310
292void setup_apic_nmi_watchdog(void *unused) 311void setup_apic_nmi_watchdog(void *unused)
293{ 312{
294 if (__get_cpu_var(wd_enabled)) 313 if (__get_cpu_var(wd_enabled))
@@ -301,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused)
301 320
302 switch (nmi_watchdog) { 321 switch (nmi_watchdog) {
303 case NMI_LOCAL_APIC: 322 case NMI_LOCAL_APIC:
304 /* enable it before to avoid race with handler */
305 __get_cpu_var(wd_enabled) = 1;
306 if (lapic_watchdog_init(nmi_hz) < 0) { 323 if (lapic_watchdog_init(nmi_hz) < 0) {
307 __get_cpu_var(wd_enabled) = 0; 324 __get_cpu_var(wd_enabled) = 0;
308 return; 325 return;
@@ -448,6 +465,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
448 465
449#ifdef CONFIG_SYSCTL 466#ifdef CONFIG_SYSCTL
450 467
468static int __init setup_unknown_nmi_panic(char *str)
469{
470 unknown_nmi_panic = 1;
471 return 1;
472}
473__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
474
451static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) 475static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
452{ 476{
453 unsigned char reason = get_nmi_reason(); 477 unsigned char reason = get_nmi_reason();
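The nmi.c changes stop setting the per-CPU wd_enabled flag before lapic_watchdog_init() has run; the watchdog driver is now expected to call the new cpu_nmi_set_wd_enabled() itself once its counter is programmed, so the NMI handler never sees the flag while the hardware is only half-configured. A compilable sketch of that ordering; every identifier below is an illustrative stub, not the real perfctr-watchdog code.

#include <stdio.h>

static int  example_reserve_perfctr(void)            { return 0; }
static void example_program_perfctr(unsigned int hz) { (void)hz; }
static void example_unmask_lvtpc(void)               { }
static void cpu_nmi_set_wd_enabled_sketch(void)      { printf("wd_enabled = 1\n"); }

static int example_lapic_watchdog_init(unsigned int nmi_hz)
{
        if (example_reserve_perfctr())
                return -1;

        example_program_perfctr(nmi_hz);

        /*
         * Only once the counter is programmed is it safe for the NMI
         * handler to treat incoming NMIs as watchdog ticks, so the
         * per-CPU enable flag is set last -- the role of the new
         * cpu_nmi_set_wd_enabled() hook.
         */
        cpu_nmi_set_wd_enabled_sketch();

        example_unmask_lvtpc();
        return 0;
}

int main(void)
{
        return example_lapic_watchdog_init(1000);
}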
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index a23e8233b9ac..4caff39078e0 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -33,6 +33,7 @@
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/mpspec.h> 34#include <asm/mpspec.h>
35#include <asm/e820.h> 35#include <asm/e820.h>
36#include <asm/setup.h>
36 37
37#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) 38#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
38 39
@@ -71,6 +72,195 @@ static void __init smp_dump_qct(void)
71 } 72 }
72} 73}
73 74
75
76void __cpuinit numaq_tsc_disable(void)
77{
78 if (!found_numaq)
79 return;
80
81 if (num_online_nodes() > 1) {
82 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
83 setup_clear_cpu_cap(X86_FEATURE_TSC);
84 }
85}
86
87static int __init numaq_pre_time_init(void)
88{
89 numaq_tsc_disable();
90 return 0;
91}
92
93int found_numaq;
94/*
95 * Have to match translation table entries to main table entries by counter
96 * hence the mpc_record variable .... can't see a less disgusting way of
97 * doing this ....
98 */
99struct mpc_config_translation {
100 unsigned char mpc_type;
101 unsigned char trans_len;
102 unsigned char trans_type;
103 unsigned char trans_quad;
104 unsigned char trans_global;
105 unsigned char trans_local;
106 unsigned short trans_reserved;
107};
108
109/* x86_quirks member */
110static int mpc_record;
111static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
112 __cpuinitdata;
113
114static inline int generate_logical_apicid(int quad, int phys_apicid)
115{
116 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
117}
118
119/* x86_quirks member */
120static int mpc_apic_id(struct mpc_config_processor *m)
121{
122 int quad = translation_table[mpc_record]->trans_quad;
123 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
124
125 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
126 m->mpc_apicid,
127 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
128 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
129 m->mpc_apicver, quad, logical_apicid);
130 return logical_apicid;
131}
132
133int mp_bus_id_to_node[MAX_MP_BUSSES];
134
135int mp_bus_id_to_local[MAX_MP_BUSSES];
136
137/* x86_quirks member */
138static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name)
139{
140 int quad = translation_table[mpc_record]->trans_quad;
141 int local = translation_table[mpc_record]->trans_local;
142
143 mp_bus_id_to_node[m->mpc_busid] = quad;
144 mp_bus_id_to_local[m->mpc_busid] = local;
145 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
146 m->mpc_busid, name, quad);
147}
148
149int quad_local_to_mp_bus_id [NR_CPUS/4][4];
150
151/* x86_quirks member */
152static void mpc_oem_pci_bus(struct mpc_config_bus *m)
153{
154 int quad = translation_table[mpc_record]->trans_quad;
155 int local = translation_table[mpc_record]->trans_local;
156
157 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
158}
159
160static void __init MP_translation_info(struct mpc_config_translation *m)
161{
162 printk(KERN_INFO
163 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
164 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
165 m->trans_local);
166
167 if (mpc_record >= MAX_MPC_ENTRY)
168 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
169 else
170 translation_table[mpc_record] = m; /* stash this for later */
171 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
172 node_set_online(m->trans_quad);
173}
174
175static int __init mpf_checksum(unsigned char *mp, int len)
176{
177 int sum = 0;
178
179 while (len--)
180 sum += *mp++;
181
182 return sum & 0xFF;
183}
184
185/*
186 * Read/parse the MPC oem tables
187 */
188
189static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
190 unsigned short oemsize)
191{
192 int count = sizeof(*oemtable); /* the header size */
193 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
194
195 mpc_record = 0;
196 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
197 oemtable);
198 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
199 printk(KERN_WARNING
200 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
201 oemtable->oem_signature[0], oemtable->oem_signature[1],
202 oemtable->oem_signature[2], oemtable->oem_signature[3]);
203 return;
204 }
205 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
206 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
207 return;
208 }
209 while (count < oemtable->oem_length) {
210 switch (*oemptr) {
211 case MP_TRANSLATION:
212 {
213 struct mpc_config_translation *m =
214 (struct mpc_config_translation *)oemptr;
215 MP_translation_info(m);
216 oemptr += sizeof(*m);
217 count += sizeof(*m);
218 ++mpc_record;
219 break;
220 }
221 default:
222 {
223 printk(KERN_WARNING
224 "Unrecognised OEM table entry type! - %d\n",
225 (int)*oemptr);
226 return;
227 }
228 }
229 }
230}
231
232static int __init numaq_setup_ioapic_ids(void)
233{
 234	/* return nonzero so the generic I/O APIC ID setup can be skipped */
235 return 1;
236}
237
238static struct x86_quirks numaq_x86_quirks __initdata = {
239 .arch_pre_time_init = numaq_pre_time_init,
240 .arch_time_init = NULL,
241 .arch_pre_intr_init = NULL,
242 .arch_memory_setup = NULL,
243 .arch_intr_init = NULL,
244 .arch_trap_init = NULL,
245 .mach_get_smp_config = NULL,
246 .mach_find_smp_config = NULL,
247 .mpc_record = &mpc_record,
248 .mpc_apic_id = mpc_apic_id,
249 .mpc_oem_bus_info = mpc_oem_bus_info,
250 .mpc_oem_pci_bus = mpc_oem_pci_bus,
251 .smp_read_mpc_oem = smp_read_mpc_oem,
252 .setup_ioapic_ids = numaq_setup_ioapic_ids,
253};
254
255void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
256 char *productid)
257{
258 if (strncmp(oem, "IBM NUMA", 8))
259 printk("Warning! Not a NUMA-Q system!\n");
260 else
261 found_numaq = 1;
262}
263
74static __init void early_check_numaq(void) 264static __init void early_check_numaq(void)
75{ 265{
76 /* 266 /*
@@ -82,6 +272,9 @@ static __init void early_check_numaq(void)
82 */ 272 */
83 if (smp_found_config) 273 if (smp_found_config)
84 early_get_smp_config(); 274 early_get_smp_config();
275
276 if (found_numaq)
277 x86_quirks = &numaq_x86_quirks;
85} 278}
86 279
87int __init get_memcfg_numaq(void) 280int __init get_memcfg_numaq(void)
@@ -92,14 +285,3 @@ int __init get_memcfg_numaq(void)
92 smp_dump_qct(); 285 smp_dump_qct();
93 return 1; 286 return 1;
94} 287}
95
96void __init numaq_tsc_disable(void)
97{
98 if (!found_numaq)
99 return;
100
101 if (num_online_nodes() > 1) {
102 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
103 setup_clear_cpu_cap(X86_FEATURE_TSC);
104 }
105}
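Among the NUMAQ helpers consolidated here, generate_logical_apicid() packs the quad number into the high nibble of the logical APIC ID and shifts the physical APIC ID into the low bits, with physical ID 0 remapped to 1. A tiny worked example of that packing:

#include <stdio.h>

static int generate_logical_apicid(int quad, int phys_apicid)
{
        return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
}

int main(void)
{
        /* quad 0, CPU 0 -> 0x01;  quad 2, CPU 3 -> 0x26;  quad 3, CPU 0 -> 0x31 */
        printf("%#x %#x %#x\n",
               generate_logical_apicid(0, 0),
               generate_logical_apicid(2, 3),
               generate_logical_apicid(3, 0));
        return 0;
}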
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 3e6672274807..7a13fac63a1f 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd);
190static void __init platform_detect(void) 190static void __init platform_detect(void)
191{ 191{
192 size_t propsize; 192 size_t propsize;
193 u32 rev; 193 __be32 rev;
194 194
195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, 195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
196 &propsize) || propsize != 4) { 196 &propsize) || propsize != 4) {
197 printk(KERN_ERR "ofw: getprop call failed!\n"); 197 printk(KERN_ERR "ofw: getprop call failed!\n");
198 rev = 0; 198 rev = cpu_to_be32(0);
199 } 199 }
200 olpc_platform_info.boardrev = be32_to_cpu(rev); 200 olpc_platform_info.boardrev = be32_to_cpu(rev);
201} 201}
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
203static void __init platform_detect(void) 203static void __init platform_detect(void)
204{ 204{
205 /* stopgap until OFW support is added to the kernel */ 205 /* stopgap until OFW support is added to the kernel */
206 olpc_platform_info.boardrev = be32_to_cpu(0xc2); 206 olpc_platform_info.boardrev = 0xc2;
207} 207}
208#endif 208#endif
209 209
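The olpc.c change makes the endianness of the firmware value explicit: board-revision-int arrives big-endian, so it is stored in a __be32 and converted exactly once with be32_to_cpu(), while the hard-coded stopgap value is no longer run through a conversion at all. A user-space analogue of the same discipline, using glibc's be32toh() in place of the kernel helper:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        unsigned char wire[4] = { 0x00, 0x00, 0x00, 0xc2 };  /* big-endian 0xc2 */
        uint32_t rev_be;

        memcpy(&rev_be, wire, sizeof(rev_be));  /* value exactly as read from firmware */
        printf("boardrev = %#x\n", be32toh(rev_be));  /* converted exactly once */
        return 0;
}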
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
new file mode 100644
index 000000000000..0e9f1982b1dd
--- /dev/null
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -0,0 +1,37 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/spinlock.h>
6#include <linux/module.h>
7
8#include <asm/paravirt.h>
9
10static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
11{
12 __raw_spin_lock(lock);
13}
14
15struct pv_lock_ops pv_lock_ops = {
16#ifdef CONFIG_SMP
17 .spin_is_locked = __ticket_spin_is_locked,
18 .spin_is_contended = __ticket_spin_is_contended,
19
20 .spin_lock = __ticket_spin_lock,
21 .spin_lock_flags = default_spin_lock_flags,
22 .spin_trylock = __ticket_spin_trylock,
23 .spin_unlock = __ticket_spin_unlock,
24#endif
25};
26EXPORT_SYMBOL(pv_lock_ops);
27
28void __init paravirt_use_bytelocks(void)
29{
30#ifdef CONFIG_SMP
31 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
32 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
33 pv_lock_ops.spin_lock = __byte_spin_lock;
34 pv_lock_ops.spin_trylock = __byte_spin_trylock;
35 pv_lock_ops.spin_unlock = __byte_spin_unlock;
36#endif
37}
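pv_lock_ops starts out pointing at the native ticket-lock implementations, and paravirt_use_bytelocks() lets a guest swap in the byte-lock variants in one place before any locks are taken. Below is a compressed model of that ops-table switch; the two-field struct and all *_sketch names are inventions for illustration, not the kernel's types.

#include <stdio.h>

struct lock_ops_sketch {
        void (*lock)(int *l);
        void (*unlock)(int *l);
};

static void ticket_lock(int *l)   { printf("ticket lock\n");   *l = 1; }
static void ticket_unlock(int *l) { printf("ticket unlock\n"); *l = 0; }
static void byte_lock(int *l)     { printf("byte lock\n");     *l = 1; }
static void byte_unlock(int *l)   { printf("byte unlock\n");   *l = 0; }

static struct lock_ops_sketch pv_lock_ops_sketch = {
        .lock   = ticket_lock,          /* native defaults */
        .unlock = ticket_unlock,
};

static void use_bytelocks_sketch(void)
{
        pv_lock_ops_sketch.lock   = byte_lock;
        pv_lock_ops_sketch.unlock = byte_unlock;
}

int main(void)
{
        int l = 0;

        pv_lock_ops_sketch.lock(&l);    /* ticket path */
        pv_lock_ops_sketch.unlock(&l);
        use_bytelocks_sketch();         /* e.g. called by a paravirt guest's setup */
        pv_lock_ops_sketch.lock(&l);    /* byte path from here on */
        pv_lock_ops_sketch.unlock(&l);
        return 0;
}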
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e0f571d58c19..e4c8fb608873 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -29,6 +29,7 @@
29#include <asm/desc.h> 29#include <asm/desc.h>
30#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/arch_hooks.h> 31#include <asm/arch_hooks.h>
32#include <asm/pgtable.h>
32#include <asm/time.h> 33#include <asm/time.h>
33#include <asm/pgalloc.h> 34#include <asm/pgalloc.h>
34#include <asm/irq.h> 35#include <asm/irq.h>
@@ -123,6 +124,7 @@ static void *get_call_destination(u8 type)
123 .pv_irq_ops = pv_irq_ops, 124 .pv_irq_ops = pv_irq_ops,
124 .pv_apic_ops = pv_apic_ops, 125 .pv_apic_ops = pv_apic_ops,
125 .pv_mmu_ops = pv_mmu_ops, 126 .pv_mmu_ops = pv_mmu_ops,
127 .pv_lock_ops = pv_lock_ops,
126 }; 128 };
127 return *((void **)&tmpl + type); 129 return *((void **)&tmpl + type);
128} 130}
@@ -317,6 +319,7 @@ struct pv_cpu_ops pv_cpu_ops = {
317#endif 319#endif
318 .wbinvd = native_wbinvd, 320 .wbinvd = native_wbinvd,
319 .read_msr = native_read_msr_safe, 321 .read_msr = native_read_msr_safe,
322 .read_msr_amd = native_read_msr_amd_safe,
320 .write_msr = native_write_msr_safe, 323 .write_msr = native_write_msr_safe,
321 .read_tsc = native_read_tsc, 324 .read_tsc = native_read_tsc,
322 .read_pmc = native_read_pmc, 325 .read_pmc = native_read_pmc,
@@ -335,6 +338,10 @@ struct pv_cpu_ops pv_cpu_ops = {
335 .write_ldt_entry = native_write_ldt_entry, 338 .write_ldt_entry = native_write_ldt_entry,
336 .write_gdt_entry = native_write_gdt_entry, 339 .write_gdt_entry = native_write_gdt_entry,
337 .write_idt_entry = native_write_idt_entry, 340 .write_idt_entry = native_write_idt_entry,
341
342 .alloc_ldt = paravirt_nop,
343 .free_ldt = paravirt_nop,
344
338 .load_sp0 = native_load_sp0, 345 .load_sp0 = native_load_sp0,
339 346
340#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 347#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
@@ -360,9 +367,6 @@ struct pv_cpu_ops pv_cpu_ops = {
360 367
361struct pv_apic_ops pv_apic_ops = { 368struct pv_apic_ops pv_apic_ops = {
362#ifdef CONFIG_X86_LOCAL_APIC 369#ifdef CONFIG_X86_LOCAL_APIC
363 .apic_write = native_apic_write,
364 .apic_write_atomic = native_apic_write_atomic,
365 .apic_read = native_apic_read,
366 .setup_boot_clock = setup_boot_APIC_clock, 370 .setup_boot_clock = setup_boot_APIC_clock,
367 .setup_secondary_clock = setup_secondary_APIC_clock, 371 .setup_secondary_clock = setup_secondary_APIC_clock,
368 .startup_ipi_hook = paravirt_nop, 372 .startup_ipi_hook = paravirt_nop,
@@ -373,6 +377,9 @@ struct pv_mmu_ops pv_mmu_ops = {
373#ifndef CONFIG_X86_64 377#ifndef CONFIG_X86_64
374 .pagetable_setup_start = native_pagetable_setup_start, 378 .pagetable_setup_start = native_pagetable_setup_start,
375 .pagetable_setup_done = native_pagetable_setup_done, 379 .pagetable_setup_done = native_pagetable_setup_done,
380#else
381 .pagetable_setup_start = paravirt_nop,
382 .pagetable_setup_done = paravirt_nop,
376#endif 383#endif
377 384
378 .read_cr2 = native_read_cr2, 385 .read_cr2 = native_read_cr2,
@@ -428,7 +435,7 @@ struct pv_mmu_ops pv_mmu_ops = {
428#endif /* PAGETABLE_LEVELS >= 3 */ 435#endif /* PAGETABLE_LEVELS >= 3 */
429 436
430 .pte_val = native_pte_val, 437 .pte_val = native_pte_val,
431 .pte_flags = native_pte_val, 438 .pte_flags = native_pte_flags,
432 .pgd_val = native_pgd_val, 439 .pgd_val = native_pgd_val,
433 440
434 .make_pte = native_make_pte, 441 .make_pte = native_make_pte,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 58262218781b..9fe644f4861d 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
23 start = start_##ops##_##x; \ 23 start = start_##ops##_##x; \
24 end = end_##ops##_##x; \ 24 end = end_##ops##_##x; \
25 goto patch_site 25 goto patch_site
26 switch(type) { 26 switch (type) {
27 PATCH_SITE(pv_irq_ops, irq_disable); 27 PATCH_SITE(pv_irq_ops, irq_disable);
28 PATCH_SITE(pv_irq_ops, irq_enable); 28 PATCH_SITE(pv_irq_ops, irq_enable);
29 PATCH_SITE(pv_irq_ops, restore_fl); 29 PATCH_SITE(pv_irq_ops, restore_fl);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 6959b5c45df4..e1e731d78f38 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -29,6 +29,7 @@
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/spinlock.h> 30#include <linux/spinlock.h>
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/crash_dump.h>
32#include <linux/dma-mapping.h> 33#include <linux/dma-mapping.h>
33#include <linux/bitops.h> 34#include <linux/bitops.h>
34#include <linux/pci_ids.h> 35#include <linux/pci_ids.h>
@@ -36,7 +37,8 @@
36#include <linux/delay.h> 37#include <linux/delay.h>
37#include <linux/scatterlist.h> 38#include <linux/scatterlist.h>
38#include <linux/iommu-helper.h> 39#include <linux/iommu-helper.h>
39#include <asm/gart.h> 40
41#include <asm/iommu.h>
40#include <asm/calgary.h> 42#include <asm/calgary.h>
41#include <asm/tce.h> 43#include <asm/tce.h>
42#include <asm/pci-direct.h> 44#include <asm/pci-direct.h>
@@ -167,6 +169,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl);
167static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); 169static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
168static void calioc2_tce_cache_blast(struct iommu_table *tbl); 170static void calioc2_tce_cache_blast(struct iommu_table *tbl);
169static void calioc2_dump_error_regs(struct iommu_table *tbl); 171static void calioc2_dump_error_regs(struct iommu_table *tbl);
172static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
173static void get_tce_space_from_tar(void);
170 174
171static struct cal_chipset_ops calgary_chip_ops = { 175static struct cal_chipset_ops calgary_chip_ops = {
172 .handle_quirks = calgary_handle_quirks, 176 .handle_quirks = calgary_handle_quirks,
@@ -213,16 +217,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap,
213 217
214#endif /* CONFIG_IOMMU_DEBUG */ 218#endif /* CONFIG_IOMMU_DEBUG */
215 219
216static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
217{
218 unsigned int npages;
219
220 npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK);
221 npages >>= PAGE_SHIFT;
222
223 return npages;
224}
225
226static inline int translation_enabled(struct iommu_table *tbl) 220static inline int translation_enabled(struct iommu_table *tbl)
227{ 221{
228 /* only PHBs with translation enabled have an IOMMU table */ 222 /* only PHBs with translation enabled have an IOMMU table */
@@ -257,7 +251,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
257 badbit, tbl, start_addr, npages); 251 badbit, tbl, start_addr, npages);
258 } 252 }
259 253
260 set_bit_string(tbl->it_map, index, npages); 254 iommu_area_reserve(tbl->it_map, index, npages);
261 255
262 spin_unlock_irqrestore(&tbl->it_lock, flags); 256 spin_unlock_irqrestore(&tbl->it_lock, flags);
263} 257}
@@ -339,9 +333,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
339 /* were we called with bad_dma_address? */ 333 /* were we called with bad_dma_address? */
340 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); 334 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
341 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { 335 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
342 printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " 336 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
343 "address 0x%Lx\n", dma_addr); 337 "address 0x%Lx\n", dma_addr);
344 WARN_ON(1);
345 return; 338 return;
346 } 339 }
347 340
@@ -405,27 +398,11 @@ static void calgary_unmap_sg(struct device *dev,
405 if (dmalen == 0) 398 if (dmalen == 0)
406 break; 399 break;
407 400
408 npages = num_dma_pages(dma, dmalen); 401 npages = iommu_num_pages(dma, dmalen, PAGE_SIZE);
409 iommu_free(tbl, dma, npages); 402 iommu_free(tbl, dma, npages);
410 } 403 }
411} 404}
412 405
413static int calgary_nontranslate_map_sg(struct device* dev,
414 struct scatterlist *sg, int nelems, int direction)
415{
416 struct scatterlist *s;
417 int i;
418
419 for_each_sg(sg, s, nelems, i) {
420 struct page *p = sg_page(s);
421
422 BUG_ON(!p);
423 s->dma_address = virt_to_bus(sg_virt(s));
424 s->dma_length = s->length;
425 }
426 return nelems;
427}
428
429static int calgary_map_sg(struct device *dev, struct scatterlist *sg, 406static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
430 int nelems, int direction) 407 int nelems, int direction)
431{ 408{
@@ -436,14 +413,11 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
436 unsigned long entry; 413 unsigned long entry;
437 int i; 414 int i;
438 415
439 if (!translation_enabled(tbl))
440 return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
441
442 for_each_sg(sg, s, nelems, i) { 416 for_each_sg(sg, s, nelems, i) {
443 BUG_ON(!sg_page(s)); 417 BUG_ON(!sg_page(s));
444 418
445 vaddr = (unsigned long) sg_virt(s); 419 vaddr = (unsigned long) sg_virt(s);
446 npages = num_dma_pages(vaddr, s->length); 420 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
447 421
448 entry = iommu_range_alloc(dev, tbl, npages); 422 entry = iommu_range_alloc(dev, tbl, npages);
449 if (entry == bad_dma_address) { 423 if (entry == bad_dma_address) {
@@ -474,21 +448,15 @@ error:
474static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, 448static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
475 size_t size, int direction) 449 size_t size, int direction)
476{ 450{
477 dma_addr_t dma_handle = bad_dma_address;
478 void *vaddr = phys_to_virt(paddr); 451 void *vaddr = phys_to_virt(paddr);
479 unsigned long uaddr; 452 unsigned long uaddr;
480 unsigned int npages; 453 unsigned int npages;
481 struct iommu_table *tbl = find_iommu_table(dev); 454 struct iommu_table *tbl = find_iommu_table(dev);
482 455
483 uaddr = (unsigned long)vaddr; 456 uaddr = (unsigned long)vaddr;
484 npages = num_dma_pages(uaddr, size); 457 npages = iommu_num_pages(uaddr, size, PAGE_SIZE);
485 458
486 if (translation_enabled(tbl)) 459 return iommu_alloc(dev, tbl, vaddr, npages, direction);
487 dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
488 else
489 dma_handle = virt_to_bus(vaddr);
490
491 return dma_handle;
492} 460}
493 461
494static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, 462static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
@@ -497,10 +465,7 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
497 struct iommu_table *tbl = find_iommu_table(dev); 465 struct iommu_table *tbl = find_iommu_table(dev);
498 unsigned int npages; 466 unsigned int npages;
499 467
500 if (!translation_enabled(tbl)) 468 npages = iommu_num_pages(dma_handle, size, PAGE_SIZE);
501 return;
502
503 npages = num_dma_pages(dma_handle, size);
504 iommu_free(tbl, dma_handle, npages); 469 iommu_free(tbl, dma_handle, npages);
505} 470}
506 471
@@ -516,24 +481,20 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
516 npages = size >> PAGE_SHIFT; 481 npages = size >> PAGE_SHIFT;
517 order = get_order(size); 482 order = get_order(size);
518 483
484 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
485
519 /* alloc enough pages (and possibly more) */ 486 /* alloc enough pages (and possibly more) */
520 ret = (void *)__get_free_pages(flag, order); 487 ret = (void *)__get_free_pages(flag, order);
521 if (!ret) 488 if (!ret)
522 goto error; 489 goto error;
523 memset(ret, 0, size); 490 memset(ret, 0, size);
524 491
525 if (translation_enabled(tbl)) { 492 /* set up tces to cover the allocated range */
526 /* set up tces to cover the allocated range */ 493 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
527 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 494 if (mapping == bad_dma_address)
528 if (mapping == bad_dma_address) 495 goto free;
529 goto free; 496 *dma_handle = mapping;
530
531 *dma_handle = mapping;
532 } else /* non translated slot */
533 *dma_handle = virt_to_bus(ret);
534
535 return ret; 497 return ret;
536
537free: 498free:
538 free_pages((unsigned long)ret, get_order(size)); 499 free_pages((unsigned long)ret, get_order(size));
539 ret = NULL; 500 ret = NULL;
@@ -541,8 +502,22 @@ error:
541 return ret; 502 return ret;
542} 503}
543 504
544static const struct dma_mapping_ops calgary_dma_ops = { 505static void calgary_free_coherent(struct device *dev, size_t size,
506 void *vaddr, dma_addr_t dma_handle)
507{
508 unsigned int npages;
509 struct iommu_table *tbl = find_iommu_table(dev);
510
511 size = PAGE_ALIGN(size);
512 npages = size >> PAGE_SHIFT;
513
514 iommu_free(tbl, dma_handle, npages);
515 free_pages((unsigned long)vaddr, get_order(size));
516}
517
518static struct dma_mapping_ops calgary_dma_ops = {
545 .alloc_coherent = calgary_alloc_coherent, 519 .alloc_coherent = calgary_alloc_coherent,
520 .free_coherent = calgary_free_coherent,
546 .map_single = calgary_map_single, 521 .map_single = calgary_map_single,
547 .unmap_single = calgary_unmap_single, 522 .unmap_single = calgary_unmap_single,
548 .map_sg = calgary_map_sg, 523 .map_sg = calgary_map_sg,
@@ -830,7 +805,11 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
830 805
831 tbl = pci_iommu(dev->bus); 806 tbl = pci_iommu(dev->bus);
832 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; 807 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
833 tce_free(tbl, 0, tbl->it_size); 808
809 if (is_kdump_kernel())
810 calgary_init_bitmap_from_tce_table(tbl);
811 else
812 tce_free(tbl, 0, tbl->it_size);
834 813
835 if (is_calgary(dev->device)) 814 if (is_calgary(dev->device))
836 tbl->chip_ops = &calgary_chip_ops; 815 tbl->chip_ops = &calgary_chip_ops;
@@ -1209,6 +1188,10 @@ static int __init calgary_init(void)
1209 if (ret) 1188 if (ret)
1210 return ret; 1189 return ret;
1211 1190
1191 /* Purely for kdump kernel case */
1192 if (is_kdump_kernel())
1193 get_tce_space_from_tar();
1194
1212 do { 1195 do {
1213 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); 1196 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
1214 if (!dev) 1197 if (!dev)
@@ -1230,6 +1213,16 @@ static int __init calgary_init(void)
1230 goto error; 1213 goto error;
1231 } while (1); 1214 } while (1);
1232 1215
1216 dev = NULL;
1217 for_each_pci_dev(dev) {
1218 struct iommu_table *tbl;
1219
1220 tbl = find_iommu_table(&dev->dev);
1221
1222 if (translation_enabled(tbl))
1223 dev->dev.archdata.dma_ops = &calgary_dma_ops;
1224 }
1225
1233 return ret; 1226 return ret;
1234 1227
1235error: 1228error:
@@ -1251,6 +1244,7 @@ error:
1251 calgary_disable_translation(dev); 1244 calgary_disable_translation(dev);
1252 calgary_free_bus(dev); 1245 calgary_free_bus(dev);
1253 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ 1246 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
1247 dev->dev.archdata.dma_ops = NULL;
1254 } while (1); 1248 } while (1);
1255 1249
1256 return ret; 1250 return ret;
@@ -1280,13 +1274,15 @@ static inline int __init determine_tce_table_size(u64 ram)
1280static int __init build_detail_arrays(void) 1274static int __init build_detail_arrays(void)
1281{ 1275{
1282 unsigned long ptr; 1276 unsigned long ptr;
1283 int i, scal_detail_size, rio_detail_size; 1277 unsigned numnodes, i;
1278 int scal_detail_size, rio_detail_size;
1284 1279
1285 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){ 1280 numnodes = rio_table_hdr->num_scal_dev;
1281 if (numnodes > MAX_NUMNODES){
1286 printk(KERN_WARNING 1282 printk(KERN_WARNING
1287 "Calgary: MAX_NUMNODES too low! Defined as %d, " 1283 "Calgary: MAX_NUMNODES too low! Defined as %d, "
1288 "but system has %d nodes.\n", 1284 "but system has %d nodes.\n",
1289 MAX_NUMNODES, rio_table_hdr->num_scal_dev); 1285 MAX_NUMNODES, numnodes);
1290 return -ENODEV; 1286 return -ENODEV;
1291 } 1287 }
1292 1288
@@ -1307,8 +1303,7 @@ static int __init build_detail_arrays(void)
1307 } 1303 }
1308 1304
1309 ptr = ((unsigned long)rio_table_hdr) + 3; 1305 ptr = ((unsigned long)rio_table_hdr) + 3;
1310 for (i = 0; i < rio_table_hdr->num_scal_dev; 1306 for (i = 0; i < numnodes; i++, ptr += scal_detail_size)
1311 i++, ptr += scal_detail_size)
1312 scal_devs[i] = (struct scal_detail *)ptr; 1307 scal_devs[i] = (struct scal_detail *)ptr;
1313 1308
1314 for (i = 0; i < rio_table_hdr->num_rio_dev; 1309 for (i = 0; i < rio_table_hdr->num_rio_dev;
@@ -1339,6 +1334,61 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1339 return (val != 0xffffffff); 1334 return (val != 0xffffffff);
1340} 1335}
1341 1336
1337/*
1338 * calgary_init_bitmap_from_tce_table():
 1339 * Function for kdump case. In the second (kdump) kernel, initialize
 1340 * the bitmap based on the TCE table entries obtained from the first kernel
1341 */
1342static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
1343{
1344 u64 *tp;
1345 unsigned int index;
1346 tp = ((u64 *)tbl->it_base);
1347 for (index = 0 ; index < tbl->it_size; index++) {
1348 if (*tp != 0x0)
1349 set_bit(index, tbl->it_map);
1350 tp++;
1351 }
1352}
1353
1354/*
1355 * get_tce_space_from_tar():
 1356 * Function for kdump case. Get the TCE tables from the first kernel
 1357 * by reading the contents of the base address register of the Calgary IOMMU
1358 */
1359static void __init get_tce_space_from_tar(void)
1360{
1361 int bus;
1362 void __iomem *target;
1363 unsigned long tce_space;
1364
1365 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1366 struct calgary_bus_info *info = &bus_info[bus];
1367 unsigned short pci_device;
1368 u32 val;
1369
1370 val = read_pci_config(bus, 0, 0, 0);
1371 pci_device = (val & 0xFFFF0000) >> 16;
1372
1373 if (!is_cal_pci_dev(pci_device))
1374 continue;
1375 if (info->translation_disabled)
1376 continue;
1377
1378 if (calgary_bus_has_devices(bus, pci_device) ||
1379 translate_empty_slots) {
1380 target = calgary_reg(bus_info[bus].bbar,
1381 tar_offset(bus));
1382 tce_space = be64_to_cpu(readq(target));
1383 tce_space = tce_space & TAR_SW_BITS;
1384
1385 tce_space = tce_space & (~specified_table_size);
1386 info->tce_space = (u64 *)__va(tce_space);
1387 }
1388 }
1389 return;
1390}
1391
1342void __init detect_calgary(void) 1392void __init detect_calgary(void)
1343{ 1393{
1344 int bus; 1394 int bus;
@@ -1394,7 +1444,8 @@ void __init detect_calgary(void)
1394 return; 1444 return;
1395 } 1445 }
1396 1446
1397 specified_table_size = determine_tce_table_size(max_pfn * PAGE_SIZE); 1447 specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
1448 saved_max_pfn : max_pfn) * PAGE_SIZE);
1398 1449
1399 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { 1450 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1400 struct calgary_bus_info *info = &bus_info[bus]; 1451 struct calgary_bus_info *info = &bus_info[bus];
@@ -1412,10 +1463,16 @@ void __init detect_calgary(void)
1412 1463
1413 if (calgary_bus_has_devices(bus, pci_device) || 1464 if (calgary_bus_has_devices(bus, pci_device) ||
1414 translate_empty_slots) { 1465 translate_empty_slots) {
1415 tbl = alloc_tce_table(); 1466 /*
 1416 if (!tbl) 1467 * If this is the kdump kernel, find and use the TCE tables
 1417 goto cleanup; 1468 * from the first kernel; otherwise allocate TCE tables here
1418 info->tce_space = tbl; 1469 */
1470 if (!is_kdump_kernel()) {
1471 tbl = alloc_tce_table();
1472 if (!tbl)
1473 goto cleanup;
1474 info->tce_space = tbl;
1475 }
1419 calgary_found = 1; 1476 calgary_found = 1;
1420 } 1477 }
1421 } 1478 }
@@ -1430,6 +1487,10 @@ void __init detect_calgary(void)
1430 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " 1487 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
1431 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, 1488 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
1432 debugging ? "enabled" : "disabled"); 1489 debugging ? "enabled" : "disabled");
1490
1491 /* swiotlb for devices that aren't behind the Calgary. */
1492 if (max_pfn > MAX_DMA32_PFN)
1493 swiotlb = 1;
1433 } 1494 }
1434 return; 1495 return;
1435 1496
@@ -1446,7 +1507,7 @@ int __init calgary_iommu_init(void)
1446{ 1507{
1447 int ret; 1508 int ret;
1448 1509
1449 if (no_iommu || swiotlb) 1510 if (no_iommu || (swiotlb && !calgary_detected))
1450 return -ENODEV; 1511 return -ENODEV;
1451 1512
1452 if (!calgary_detected) 1513 if (!calgary_detected)
@@ -1459,15 +1520,14 @@ int __init calgary_iommu_init(void)
1459 if (ret) { 1520 if (ret) {
1460 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " 1521 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1461 "falling back to no_iommu\n", ret); 1522 "falling back to no_iommu\n", ret);
1462 if (max_pfn > MAX_DMA32_PFN)
1463 printk(KERN_ERR "WARNING more than 4GB of memory, "
1464 "32bit PCI may malfunction.\n");
1465 return ret; 1523 return ret;
1466 } 1524 }
1467 1525
1468 force_iommu = 1; 1526 force_iommu = 1;
1469 bad_dma_address = 0x0; 1527 bad_dma_address = 0x0;
1470 dma_ops = &calgary_dma_ops; 1528 /* dma_ops is set to swiotlb or nommu */
1529 if (!dma_ops)
1530 dma_ops = &nommu_dma_ops;
1471 1531
1472 return 0; 1532 return 0;
1473} 1533}
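For the kdump path added above, calgary_init_bitmap_from_tce_table() walks the TCE table inherited from the crashed kernel and reserves every entry that is still non-zero, so DMA set up by the first kernel is never handed out again. A toy model of that bitmap rebuild (table size and entry values are made up):

#include <stdint.h>
#include <stdio.h>

#define TBL_SIZE 16

int main(void)
{
        uint64_t tce[TBL_SIZE] = { 0 };
        unsigned long bitmap = 0;       /* one bit per table entry */
        unsigned int i;

        tce[3] = 0xdeadb000ULL | 0x3;   /* pretend entries 3 and 7 were live */
        tce[7] = 0xbeef0000ULL | 0x3;

        for (i = 0; i < TBL_SIZE; i++)
                if (tce[i] != 0)
                        bitmap |= 1UL << i;     /* reserve: never allocate it again */

        printf("reserved map = %#lx\n", bitmap);        /* prints 0x88 */
        return 0;
}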
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 8467ec2320f1..1972266e8ba5 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -5,14 +5,11 @@
5 5
6#include <asm/proto.h> 6#include <asm/proto.h>
7#include <asm/dma.h> 7#include <asm/dma.h>
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/calgary.h> 9#include <asm/calgary.h>
10#include <asm/amd_iommu.h> 10#include <asm/amd_iommu.h>
11 11
12int forbid_dac __read_mostly; 12struct dma_mapping_ops *dma_ops;
13EXPORT_SYMBOL(forbid_dac);
14
15const struct dma_mapping_ops *dma_ops;
16EXPORT_SYMBOL(dma_ops); 13EXPORT_SYMBOL(dma_ops);
17 14
18static int iommu_sac_force __read_mostly; 15static int iommu_sac_force __read_mostly;
@@ -42,11 +39,12 @@ EXPORT_SYMBOL(bad_dma_address);
42/* Dummy device used for NULL arguments (normally ISA). Better would 39/* Dummy device used for NULL arguments (normally ISA). Better would
43 be probably a smaller DMA mask, but this is bug-to-bug compatible 40 be probably a smaller DMA mask, but this is bug-to-bug compatible
44 to older i386. */ 41 to older i386. */
45struct device fallback_dev = { 42struct device x86_dma_fallback_dev = {
46 .bus_id = "fallback device", 43 .bus_id = "fallback device",
47 .coherent_dma_mask = DMA_32BIT_MASK, 44 .coherent_dma_mask = DMA_32BIT_MASK,
48 .dma_mask = &fallback_dev.coherent_dma_mask, 45 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
49}; 46};
47EXPORT_SYMBOL(x86_dma_fallback_dev);
50 48
51int dma_set_mask(struct device *dev, u64 mask) 49int dma_set_mask(struct device *dev, u64 mask)
52{ 50{
@@ -83,7 +81,7 @@ void __init dma32_reserve_bootmem(void)
83 * using 512M as goal 81 * using 512M as goal
84 */ 82 */
85 align = 64ULL<<20; 83 align = 64ULL<<20;
86 size = round_up(dma32_bootmem_size, align); 84 size = roundup(dma32_bootmem_size, align);
87 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 85 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
88 512ULL<<20); 86 512ULL<<20);
89 if (dma32_bootmem_ptr) 87 if (dma32_bootmem_ptr)
@@ -114,24 +112,57 @@ void __init pci_iommu_alloc(void)
114 * The order of these functions is important for 112 * The order of these functions is important for
115 * fall-back/fail-over reasons 113 * fall-back/fail-over reasons
116 */ 114 */
117#ifdef CONFIG_GART_IOMMU
118 gart_iommu_hole_init(); 115 gart_iommu_hole_init();
119#endif
120 116
121#ifdef CONFIG_CALGARY_IOMMU
122 detect_calgary(); 117 detect_calgary();
123#endif
124 118
125 detect_intel_iommu(); 119 detect_intel_iommu();
126 120
127 amd_iommu_detect(); 121 amd_iommu_detect();
128 122
129#ifdef CONFIG_SWIOTLB
130 pci_swiotlb_init(); 123 pci_swiotlb_init();
131#endif
132} 124}
125
126unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
127{
128 unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
129
130 return size >> PAGE_SHIFT;
131}
132EXPORT_SYMBOL(iommu_nr_pages);
133#endif 133#endif
134 134
135void *dma_generic_alloc_coherent(struct device *dev, size_t size,
136 dma_addr_t *dma_addr, gfp_t flag)
137{
138 unsigned long dma_mask;
139 struct page *page;
140 dma_addr_t addr;
141
142 dma_mask = dma_alloc_coherent_mask(dev, flag);
143
144 flag |= __GFP_ZERO;
145again:
146 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
147 if (!page)
148 return NULL;
149
150 addr = page_to_phys(page);
151 if (!is_buffer_dma_capable(dma_mask, addr, size)) {
152 __free_pages(page, get_order(size));
153
154 if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) {
155 flag = (flag & ~GFP_DMA32) | GFP_DMA;
156 goto again;
157 }
158
159 return NULL;
160 }
161
162 *dma_addr = addr;
163 return page_address(page);
164}
165
135/* 166/*
136 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter 167 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
137 * documentation. 168 * documentation.
@@ -184,9 +215,7 @@ static __init int iommu_setup(char *p)
184 swiotlb = 1; 215 swiotlb = 1;
185#endif 216#endif
186 217
187#ifdef CONFIG_GART_IOMMU
188 gart_parse_options(p); 218 gart_parse_options(p);
189#endif
190 219
191#ifdef CONFIG_CALGARY_IOMMU 220#ifdef CONFIG_CALGARY_IOMMU
192 if (!strncmp(p, "calgary", 7)) 221 if (!strncmp(p, "calgary", 7))
@@ -201,136 +230,19 @@ static __init int iommu_setup(char *p)
201} 230}
202early_param("iommu", iommu_setup); 231early_param("iommu", iommu_setup);
203 232
204#ifdef CONFIG_X86_32
205int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
206 dma_addr_t device_addr, size_t size, int flags)
207{
208 void __iomem *mem_base = NULL;
209 int pages = size >> PAGE_SHIFT;
210 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
211
212 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
213 goto out;
214 if (!size)
215 goto out;
216 if (dev->dma_mem)
217 goto out;
218
219 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
220
221 mem_base = ioremap(bus_addr, size);
222 if (!mem_base)
223 goto out;
224
225 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
226 if (!dev->dma_mem)
227 goto out;
228 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
229 if (!dev->dma_mem->bitmap)
230 goto free1_out;
231
232 dev->dma_mem->virt_base = mem_base;
233 dev->dma_mem->device_base = device_addr;
234 dev->dma_mem->size = pages;
235 dev->dma_mem->flags = flags;
236
237 if (flags & DMA_MEMORY_MAP)
238 return DMA_MEMORY_MAP;
239
240 return DMA_MEMORY_IO;
241
242 free1_out:
243 kfree(dev->dma_mem);
244 out:
245 if (mem_base)
246 iounmap(mem_base);
247 return 0;
248}
249EXPORT_SYMBOL(dma_declare_coherent_memory);
250
251void dma_release_declared_memory(struct device *dev)
252{
253 struct dma_coherent_mem *mem = dev->dma_mem;
254
255 if (!mem)
256 return;
257 dev->dma_mem = NULL;
258 iounmap(mem->virt_base);
259 kfree(mem->bitmap);
260 kfree(mem);
261}
262EXPORT_SYMBOL(dma_release_declared_memory);
263
264void *dma_mark_declared_memory_occupied(struct device *dev,
265 dma_addr_t device_addr, size_t size)
266{
267 struct dma_coherent_mem *mem = dev->dma_mem;
268 int pos, err;
269 int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
270
271 pages >>= PAGE_SHIFT;
272
273 if (!mem)
274 return ERR_PTR(-EINVAL);
275
276 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
277 err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
278 if (err != 0)
279 return ERR_PTR(err);
280 return mem->virt_base + (pos << PAGE_SHIFT);
281}
282EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
283
284static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
285 dma_addr_t *dma_handle, void **ret)
286{
287 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
288 int order = get_order(size);
289
290 if (mem) {
291 int page = bitmap_find_free_region(mem->bitmap, mem->size,
292 order);
293 if (page >= 0) {
294 *dma_handle = mem->device_base + (page << PAGE_SHIFT);
295 *ret = mem->virt_base + (page << PAGE_SHIFT);
296 memset(*ret, 0, size);
297 }
298 if (mem->flags & DMA_MEMORY_EXCLUSIVE)
299 *ret = NULL;
300 }
301 return (mem != NULL);
302}
303
304static int dma_release_coherent(struct device *dev, int order, void *vaddr)
305{
306 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
307
308 if (mem && vaddr >= mem->virt_base && vaddr <
309 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
310 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
311
312 bitmap_release_region(mem->bitmap, page, order);
313 return 1;
314 }
315 return 0;
316}
317#else
318#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
319#define dma_release_coherent(dev, order, vaddr) (0)
320#endif /* CONFIG_X86_32 */
321
322int dma_supported(struct device *dev, u64 mask) 233int dma_supported(struct device *dev, u64 mask)
323{ 234{
235 struct dma_mapping_ops *ops = get_dma_ops(dev);
236
324#ifdef CONFIG_PCI 237#ifdef CONFIG_PCI
325 if (mask > 0xffffffff && forbid_dac > 0) { 238 if (mask > 0xffffffff && forbid_dac > 0) {
326 printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", 239 dev_info(dev, "PCI: Disallowing DAC for device\n");
327 dev->bus_id);
328 return 0; 240 return 0;
329 } 241 }
330#endif 242#endif
331 243
332 if (dma_ops->dma_supported) 244 if (ops->dma_supported)
333 return dma_ops->dma_supported(dev, mask); 245 return ops->dma_supported(dev, mask);
334 246
335 /* Copied from i386. Doesn't make much sense, because it will 247 /* Copied from i386. Doesn't make much sense, because it will
336 only work for pci_alloc_coherent. 248 only work for pci_alloc_coherent.
@@ -351,8 +263,7 @@ int dma_supported(struct device *dev, u64 mask)
351 type. Normally this doesn't make any difference, but gives 263 type. Normally this doesn't make any difference, but gives
352 more gentle handling of IOMMU overflow. */ 264 more gentle handling of IOMMU overflow. */
353 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { 265 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
354 printk(KERN_INFO "%s: Force SAC with mask %Lx\n", 266 dev_info(dev, "Force SAC with mask %Lx\n", mask);
355 dev->bus_id, mask);
356 return 0; 267 return 0;
357 } 268 }
358 269
@@ -360,157 +271,15 @@ int dma_supported(struct device *dev, u64 mask)
360} 271}
361EXPORT_SYMBOL(dma_supported); 272EXPORT_SYMBOL(dma_supported);
362 273
363/* Allocate DMA memory on node near device */
364static noinline struct page *
365dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
366{
367 int node;
368
369 node = dev_to_node(dev);
370
371 return alloc_pages_node(node, gfp, order);
372}
373
374/*
375 * Allocate memory for a coherent mapping.
376 */
377void *
378dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
379 gfp_t gfp)
380{
381 void *memory = NULL;
382 struct page *page;
383 unsigned long dma_mask = 0;
384 dma_addr_t bus;
385 int noretry = 0;
386
387 /* ignore region specifiers */
388 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
389
390 if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
391 return memory;
392
393 if (!dev) {
394 dev = &fallback_dev;
395 gfp |= GFP_DMA;
396 }
397 dma_mask = dev->coherent_dma_mask;
398 if (dma_mask == 0)
399 dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
400
401 /* Device not DMA able */
402 if (dev->dma_mask == NULL)
403 return NULL;
404
405 /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
406 if (gfp & __GFP_DMA)
407 noretry = 1;
408
409#ifdef CONFIG_X86_64
410 /* Why <=? Even when the mask is smaller than 4GB it is often
411 larger than 16MB and in this case we have a chance of
412 finding fitting memory in the next higher zone first. If
413 not retry with true GFP_DMA. -AK */
414 if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
415 gfp |= GFP_DMA32;
416 if (dma_mask < DMA_32BIT_MASK)
417 noretry = 1;
418 }
419#endif
420
421 again:
422 page = dma_alloc_pages(dev,
423 noretry ? gfp | __GFP_NORETRY : gfp, get_order(size));
424 if (page == NULL)
425 return NULL;
426
427 {
428 int high, mmu;
429 bus = page_to_phys(page);
430 memory = page_address(page);
431 high = (bus + size) >= dma_mask;
432 mmu = high;
433 if (force_iommu && !(gfp & GFP_DMA))
434 mmu = 1;
435 else if (high) {
436 free_pages((unsigned long)memory,
437 get_order(size));
438
439 /* Don't use the 16MB ZONE_DMA unless absolutely
440 needed. It's better to use remapping first. */
441 if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
442 gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
443 goto again;
444 }
445
446 /* Let low level make its own zone decisions */
447 gfp &= ~(GFP_DMA32|GFP_DMA);
448
449 if (dma_ops->alloc_coherent)
450 return dma_ops->alloc_coherent(dev, size,
451 dma_handle, gfp);
452 return NULL;
453 }
454
455 memset(memory, 0, size);
456 if (!mmu) {
457 *dma_handle = bus;
458 return memory;
459 }
460 }
461
462 if (dma_ops->alloc_coherent) {
463 free_pages((unsigned long)memory, get_order(size));
464 gfp &= ~(GFP_DMA|GFP_DMA32);
465 return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
466 }
467
468 if (dma_ops->map_simple) {
469 *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory),
470 size,
471 PCI_DMA_BIDIRECTIONAL);
472 if (*dma_handle != bad_dma_address)
473 return memory;
474 }
475
476 if (panic_on_overflow)
477 panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
478 (unsigned long)size);
479 free_pages((unsigned long)memory, get_order(size));
480 return NULL;
481}
482EXPORT_SYMBOL(dma_alloc_coherent);
483
484/*
485 * Unmap coherent memory.
486 * The caller must ensure that the device has finished accessing the mapping.
487 */
488void dma_free_coherent(struct device *dev, size_t size,
489 void *vaddr, dma_addr_t bus)
490{
491 int order = get_order(size);
492 WARN_ON(irqs_disabled()); /* for portability */
493 if (dma_release_coherent(dev, order, vaddr))
494 return;
495 if (dma_ops->unmap_single)
496 dma_ops->unmap_single(dev, bus, size, 0);
497 free_pages((unsigned long)vaddr, order);
498}
499EXPORT_SYMBOL(dma_free_coherent);
500
501static int __init pci_iommu_init(void) 274static int __init pci_iommu_init(void)
502{ 275{
503#ifdef CONFIG_CALGARY_IOMMU
504 calgary_iommu_init(); 276 calgary_iommu_init();
505#endif
506 277
507 intel_iommu_init(); 278 intel_iommu_init();
508 279
509 amd_iommu_init(); 280 amd_iommu_init();
510 281
511#ifdef CONFIG_GART_IOMMU
512 gart_iommu_init(); 282 gart_iommu_init();
513#endif
514 283
515 no_iommu_init(); 284 no_iommu_init();
516 return 0; 285 return 0;
@@ -522,17 +291,3 @@ void pci_iommu_shutdown(void)
522} 291}
523/* Must execute after PCI subsystem */ 292/* Must execute after PCI subsystem */
524fs_initcall(pci_iommu_init); 293fs_initcall(pci_iommu_init);
525
526#ifdef CONFIG_PCI
527/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
528
529static __devinit void via_no_dac(struct pci_dev *dev)
530{
531 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
532 printk(KERN_INFO "PCI: VIA PCI bridge detected."
533 "Disabling DAC.\n");
534 forbid_dac = 1;
535 }
536}
537DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
538#endif
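dma_generic_alloc_coherent(), added above, allocates pages near the device, checks the physical address against the device's coherent mask, and retries from the 16MB GFP_DMA zone only when the mask is below 32 bits and the first attempt landed too high. The snippet below is a condensed user-space model of just that decision; fake_alloc() and the addresses are invented for the example.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool dma_capable(uint64_t mask, uint64_t addr, uint64_t size)
{
        return addr + size - 1 <= mask;         /* roughly what is_buffer_dma_capable() tests */
}

/* Pretend page allocator: returns a low page only when asked for the DMA zone. */
static uint64_t fake_alloc(bool dma_zone)
{
        return dma_zone ? 0x00400000ULL : 0x123450000ULL;
}

int main(void)
{
        uint64_t mask = 0x00ffffffULL;  /* 24-bit coherent mask, ISA-style device */
        bool dma_zone = false;
        uint64_t addr = fake_alloc(dma_zone);

        if (!dma_capable(mask, addr, 4096) && mask < 0xffffffffULL) {
                /* free the unusable page, then retry from the low DMA zone */
                dma_zone = true;
                addr = fake_alloc(dma_zone);
        }
        printf("coherent buffer at %#llx\n", (unsigned long long)addr);
        return 0;
}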
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index c3fe78406d18..e3f75bbcedea 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -27,11 +27,12 @@
27#include <linux/scatterlist.h> 27#include <linux/scatterlist.h>
28#include <linux/iommu-helper.h> 28#include <linux/iommu-helper.h>
29#include <linux/sysdev.h> 29#include <linux/sysdev.h>
30#include <linux/io.h>
30#include <asm/atomic.h> 31#include <asm/atomic.h>
31#include <asm/io.h>
32#include <asm/mtrr.h> 32#include <asm/mtrr.h>
33#include <asm/pgtable.h> 33#include <asm/pgtable.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/iommu.h>
35#include <asm/gart.h> 36#include <asm/gart.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
37#include <asm/swiotlb.h> 38#include <asm/swiotlb.h>
@@ -66,9 +67,6 @@ static u32 gart_unmapped_entry;
66 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) 67 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
67#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) 68#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
68 69
69#define to_pages(addr, size) \
70 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
71
72#define EMERGENCY_PAGES 32 /* = 128KB */ 70#define EMERGENCY_PAGES 32 /* = 128KB */
73 71
74#ifdef CONFIG_AGP 72#ifdef CONFIG_AGP
@@ -82,9 +80,10 @@ AGPEXTERN int agp_memory_reserved;
82AGPEXTERN __u32 *agp_gatt_table; 80AGPEXTERN __u32 *agp_gatt_table;
83 81
84static unsigned long next_bit; /* protected by iommu_bitmap_lock */ 82static unsigned long next_bit; /* protected by iommu_bitmap_lock */
85static int need_flush; /* global flush state. set for each gart wrap */ 83static bool need_flush; /* global flush state. set for each gart wrap */
86 84
87static unsigned long alloc_iommu(struct device *dev, int size) 85static unsigned long alloc_iommu(struct device *dev, int size,
86 unsigned long align_mask)
88{ 87{
89 unsigned long offset, flags; 88 unsigned long offset, flags;
90 unsigned long boundary_size; 89 unsigned long boundary_size;
@@ -92,26 +91,27 @@ static unsigned long alloc_iommu(struct device *dev, int size)
92 91
93 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 92 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
94 PAGE_SIZE) >> PAGE_SHIFT; 93 PAGE_SIZE) >> PAGE_SHIFT;
95 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 94 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
96 PAGE_SIZE) >> PAGE_SHIFT; 95 PAGE_SIZE) >> PAGE_SHIFT;
97 96
98 spin_lock_irqsave(&iommu_bitmap_lock, flags); 97 spin_lock_irqsave(&iommu_bitmap_lock, flags);
99 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, 98 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
100 size, base_index, boundary_size, 0); 99 size, base_index, boundary_size, align_mask);
101 if (offset == -1) { 100 if (offset == -1) {
102 need_flush = 1; 101 need_flush = true;
103 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, 102 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
104 size, base_index, boundary_size, 0); 103 size, base_index, boundary_size,
104 align_mask);
105 } 105 }
106 if (offset != -1) { 106 if (offset != -1) {
107 next_bit = offset+size; 107 next_bit = offset+size;
108 if (next_bit >= iommu_pages) { 108 if (next_bit >= iommu_pages) {
109 next_bit = 0; 109 next_bit = 0;
110 need_flush = 1; 110 need_flush = true;
111 } 111 }
112 } 112 }
113 if (iommu_fullflush) 113 if (iommu_fullflush)
114 need_flush = 1; 114 need_flush = true;
115 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 115 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
116 116
117 return offset; 117 return offset;
@@ -136,7 +136,7 @@ static void flush_gart(void)
136 spin_lock_irqsave(&iommu_bitmap_lock, flags); 136 spin_lock_irqsave(&iommu_bitmap_lock, flags);
137 if (need_flush) { 137 if (need_flush) {
138 k8_flush_garts(); 138 k8_flush_garts();
139 need_flush = 0; 139 need_flush = false;
140 } 140 }
141 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 141 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
142} 142}
@@ -175,7 +175,8 @@ static void dump_leak(void)
175 iommu_leak_pages); 175 iommu_leak_pages);
176 for (i = 0; i < iommu_leak_pages; i += 2) { 176 for (i = 0; i < iommu_leak_pages; i += 2) {
177 printk(KERN_DEBUG "%lu: ", iommu_pages-i); 177 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
178 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0); 178 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
179 0);
179 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); 180 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
180 } 181 }
181 printk(KERN_DEBUG "\n"); 182 printk(KERN_DEBUG "\n");
@@ -197,9 +198,7 @@ static void iommu_full(struct device *dev, size_t size, int dir)
197 * out. Hopefully no network devices use single mappings that big. 198 * out. Hopefully no network devices use single mappings that big.
198 */ 199 */
199 200
200 printk(KERN_ERR 201 dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size);
201 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
202 size, dev->bus_id);
203 202
204 if (size > PAGE_SIZE*EMERGENCY_PAGES) { 203 if (size > PAGE_SIZE*EMERGENCY_PAGES) {
205 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) 204 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
@@ -216,34 +215,24 @@ static void iommu_full(struct device *dev, size_t size, int dir)
216static inline int 215static inline int
217need_iommu(struct device *dev, unsigned long addr, size_t size) 216need_iommu(struct device *dev, unsigned long addr, size_t size)
218{ 217{
219 u64 mask = *dev->dma_mask; 218 return force_iommu ||
220 int high = addr + size > mask; 219 !is_buffer_dma_capable(*dev->dma_mask, addr, size);
221 int mmu = high;
222
223 if (force_iommu)
224 mmu = 1;
225
226 return mmu;
227} 220}
228 221
229static inline int 222static inline int
230nonforced_iommu(struct device *dev, unsigned long addr, size_t size) 223nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
231{ 224{
232 u64 mask = *dev->dma_mask; 225 return !is_buffer_dma_capable(*dev->dma_mask, addr, size);
233 int high = addr + size > mask;
234 int mmu = high;
235
236 return mmu;
237} 226}
238 227
239/* Map a single continuous physical area into the IOMMU. 228/* Map a single continuous physical area into the IOMMU.
240 * Caller needs to check if the iommu is needed and flush. 229 * Caller needs to check if the iommu is needed and flush.
241 */ 230 */
242static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, 231static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
243 size_t size, int dir) 232 size_t size, int dir, unsigned long align_mask)
244{ 233{
245 unsigned long npages = to_pages(phys_mem, size); 234 unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
246 unsigned long iommu_page = alloc_iommu(dev, npages); 235 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
247 int i; 236 int i;
248 237
249 if (iommu_page == -1) { 238 if (iommu_page == -1) {
@@ -263,16 +252,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
263 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 252 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
264} 253}
265 254
266static dma_addr_t
267gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir)
268{
269 dma_addr_t map = dma_map_area(dev, paddr, size, dir);
270
271 flush_gart();
272
273 return map;
274}
275
276/* Map a single area into the IOMMU */ 255/* Map a single area into the IOMMU */
277static dma_addr_t 256static dma_addr_t
278gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) 257gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
@@ -280,12 +259,13 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
280 unsigned long bus; 259 unsigned long bus;
281 260
282 if (!dev) 261 if (!dev)
283 dev = &fallback_dev; 262 dev = &x86_dma_fallback_dev;
284 263
285 if (!need_iommu(dev, paddr, size)) 264 if (!need_iommu(dev, paddr, size))
286 return paddr; 265 return paddr;
287 266
288 bus = gart_map_simple(dev, paddr, size, dir); 267 bus = dma_map_area(dev, paddr, size, dir, 0);
268 flush_gart();
289 269
290 return bus; 270 return bus;
291} 271}
@@ -305,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
305 return; 285 return;
306 286
307 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; 287 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
308 npages = to_pages(dma_addr, size); 288 npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
309 for (i = 0; i < npages; i++) { 289 for (i = 0; i < npages; i++) {
310 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; 290 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
311 CLEAR_LEAK(iommu_page + i); 291 CLEAR_LEAK(iommu_page + i);
@@ -344,7 +324,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
344 unsigned long addr = sg_phys(s); 324 unsigned long addr = sg_phys(s);
345 325
346 if (nonforced_iommu(dev, addr, s->length)) { 326 if (nonforced_iommu(dev, addr, s->length)) {
347 addr = dma_map_area(dev, addr, s->length, dir); 327 addr = dma_map_area(dev, addr, s->length, dir, 0);
348 if (addr == bad_dma_address) { 328 if (addr == bad_dma_address) {
349 if (i > 0) 329 if (i > 0)
350 gart_unmap_sg(dev, sg, i, dir); 330 gart_unmap_sg(dev, sg, i, dir);
@@ -366,7 +346,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
366 int nelems, struct scatterlist *sout, 346 int nelems, struct scatterlist *sout,
367 unsigned long pages) 347 unsigned long pages)
368{ 348{
369 unsigned long iommu_start = alloc_iommu(dev, pages); 349 unsigned long iommu_start = alloc_iommu(dev, pages, 0);
370 unsigned long iommu_page = iommu_start; 350 unsigned long iommu_page = iommu_start;
371 struct scatterlist *s; 351 struct scatterlist *s;
372 int i; 352 int i;
@@ -388,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
388 } 368 }
389 369
390 addr = phys_addr; 370 addr = phys_addr;
391 pages = to_pages(s->offset, s->length); 371 pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
392 while (pages--) { 372 while (pages--) {
393 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 373 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
394 SET_LEAK(iommu_page); 374 SET_LEAK(iommu_page);
@@ -431,7 +411,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
431 return 0; 411 return 0;
432 412
433 if (!dev) 413 if (!dev)
434 dev = &fallback_dev; 414 dev = &x86_dma_fallback_dev;
435 415
436 out = 0; 416 out = 0;
437 start = 0; 417 start = 0;
@@ -471,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
471 451
472 seg_size += s->length; 452 seg_size += s->length;
473 need = nextneed; 453 need = nextneed;
474 pages += to_pages(s->offset, s->length); 454 pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
475 ps = s; 455 ps = s;
476 } 456 }
477 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) 457 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
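Every to_pages() call converted above becomes iommu_num_pages(), which counts how many IOMMU pages a buffer spans once its offset into the first page is included. A minimal sketch of that calculation, assuming PAGE_SIZE-granular IOMMU pages (the body follows the generic iommu-helper code of this era, not anything shown in this patch):

static unsigned long iommu_num_pages(unsigned long addr, unsigned long len,
				     unsigned long io_page_size)
{
	/* bytes from the start of the first IOMMU page to the end of the buffer */
	unsigned long size = (addr & (io_page_size - 1)) + len;

	/* round up to whole IOMMU pages */
	return DIV_ROUND_UP(size, io_page_size);
}

So a 100-byte buffer that starts 8 bytes before a page boundary still needs two GART entries, which is exactly what the old to_pages() computed with PAGE_SIZE hard-coded.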
@@ -503,6 +483,46 @@ error:
503 return 0; 483 return 0;
504} 484}
505 485
486/* allocate and map a coherent mapping */
487static void *
488gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
489 gfp_t flag)
490{
491 dma_addr_t paddr;
492 unsigned long align_mask;
493 struct page *page;
494
495 if (force_iommu && !(flag & GFP_DMA)) {
496 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
497 page = alloc_pages(flag | __GFP_ZERO, get_order(size));
498 if (!page)
499 return NULL;
500
501 align_mask = (1UL << get_order(size)) - 1;
502 paddr = dma_map_area(dev, page_to_phys(page), size,
503 DMA_BIDIRECTIONAL, align_mask);
504
505 flush_gart();
506 if (paddr != bad_dma_address) {
507 *dma_addr = paddr;
508 return page_address(page);
509 }
510 __free_pages(page, get_order(size));
511 } else
512 return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
513
514 return NULL;
515}
516
517/* free a coherent mapping */
518static void
519gart_free_coherent(struct device *dev, size_t size, void *vaddr,
520 dma_addr_t dma_addr)
521{
522 gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL);
523 free_pages((unsigned long)vaddr, get_order(size));
524}
525
506static int no_agp; 526static int no_agp;
507 527
508static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 528static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
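The new gart_alloc_coherent() above derives align_mask from the allocation order so the coherent buffer comes back naturally aligned: get_order(size) gives the power-of-two page order, and (1UL << order) - 1 masks the low page-index bits that must be clear in the chosen GART slot. A worked example with hypothetical numbers (4 KB pages):

	size_t size = 16 * 1024;			/* 16 KB request            */
	int order = get_order(size);			/* order = 2, i.e. 4 pages  */
	unsigned long align_mask = (1UL << order) - 1;	/* 0x3                      */

	/*
	 * alloc_iommu(dev, npages, align_mask) then only returns page
	 * indices whose low two bits are clear, so the bus address handed
	 * back for the coherent buffer is aligned to the allocation size.
	 */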
@@ -630,7 +650,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
630 struct pci_dev *dev; 650 struct pci_dev *dev;
631 void *gatt; 651 void *gatt;
632 int i, error; 652 int i, error;
633 unsigned long start_pfn, end_pfn;
634 653
635 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 654 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
636 aper_size = aper_base = info->aper_size = 0; 655 aper_size = aper_base = info->aper_size = 0;
@@ -654,13 +673,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
654 info->aper_size = aper_size >> 20; 673 info->aper_size = aper_size >> 20;
655 674
656 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 675 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
657 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 676 gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
677 get_order(gatt_size));
658 if (!gatt) 678 if (!gatt)
659 panic("Cannot allocate GATT table"); 679 panic("Cannot allocate GATT table");
660 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) 680 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
661 panic("Could not set GART PTEs to uncacheable pages"); 681 panic("Could not set GART PTEs to uncacheable pages");
662 682
663 memset(gatt, 0, gatt_size);
664 agp_gatt_table = gatt; 683 agp_gatt_table = gatt;
665 684
666 enable_gart_translations(); 685 enable_gart_translations();
@@ -669,19 +688,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
669 if (!error) 688 if (!error)
670 error = sysdev_register(&device_gart); 689 error = sysdev_register(&device_gart);
671 if (error) 690 if (error)
672 panic("Could not register gart_sysdev -- would corrupt data on next suspend"); 691 panic("Could not register gart_sysdev -- "
692 "would corrupt data on next suspend");
673 693
674 flush_gart(); 694 flush_gart();
675 695
676 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 696 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
677 aper_base, aper_size>>10); 697 aper_base, aper_size>>10);
678 698
679 /* need to map that range */
680 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
681 if (end_pfn > max_low_pfn_mapped) {
682 start_pfn = (aper_base>>PAGE_SHIFT);
683 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
684 }
685 return 0; 699 return 0;
686 700
687 nommu: 701 nommu:
@@ -691,21 +705,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
691 return -1; 705 return -1;
692} 706}
693 707
694extern int agp_amd64_init(void); 708static struct dma_mapping_ops gart_dma_ops = {
695
696static const struct dma_mapping_ops gart_dma_ops = {
697 .mapping_error = NULL,
698 .map_single = gart_map_single, 709 .map_single = gart_map_single,
699 .map_simple = gart_map_simple,
700 .unmap_single = gart_unmap_single, 710 .unmap_single = gart_unmap_single,
701 .sync_single_for_cpu = NULL,
702 .sync_single_for_device = NULL,
703 .sync_single_range_for_cpu = NULL,
704 .sync_single_range_for_device = NULL,
705 .sync_sg_for_cpu = NULL,
706 .sync_sg_for_device = NULL,
707 .map_sg = gart_map_sg, 711 .map_sg = gart_map_sg,
708 .unmap_sg = gart_unmap_sg, 712 .unmap_sg = gart_unmap_sg,
713 .alloc_coherent = gart_alloc_coherent,
714 .free_coherent = gart_free_coherent,
709}; 715};
710 716
711void gart_iommu_shutdown(void) 717void gart_iommu_shutdown(void)
@@ -732,7 +738,8 @@ void __init gart_iommu_init(void)
732{ 738{
733 struct agp_kern_info info; 739 struct agp_kern_info info;
734 unsigned long iommu_start; 740 unsigned long iommu_start;
735 unsigned long aper_size; 741 unsigned long aper_base, aper_size;
742 unsigned long start_pfn, end_pfn;
736 unsigned long scratch; 743 unsigned long scratch;
737 long i; 744 long i;
738 745
@@ -764,30 +771,35 @@ void __init gart_iommu_init(void)
764 (no_agp && init_k8_gatt(&info) < 0)) { 771 (no_agp && init_k8_gatt(&info) < 0)) {
765 if (max_pfn > MAX_DMA32_PFN) { 772 if (max_pfn > MAX_DMA32_PFN) {
766 printk(KERN_WARNING "More than 4GB of memory " 773 printk(KERN_WARNING "More than 4GB of memory "
767 "but GART IOMMU not available.\n" 774 "but GART IOMMU not available.\n");
768 KERN_WARNING "falling back to iommu=soft.\n"); 775 printk(KERN_WARNING "falling back to iommu=soft.\n");
769 } 776 }
770 return; 777 return;
771 } 778 }
772 779
780 /* need to map that range */
781 aper_size = info.aper_size << 20;
782 aper_base = info.aper_base;
783 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
784 if (end_pfn > max_low_pfn_mapped) {
785 start_pfn = (aper_base>>PAGE_SHIFT);
786 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
787 }
788
773 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 789 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
774 aper_size = info.aper_size * 1024 * 1024;
775 iommu_size = check_iommu_size(info.aper_base, aper_size); 790 iommu_size = check_iommu_size(info.aper_base, aper_size);
776 iommu_pages = iommu_size >> PAGE_SHIFT; 791 iommu_pages = iommu_size >> PAGE_SHIFT;
777 792
778 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, 793 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
779 get_order(iommu_pages/8)); 794 get_order(iommu_pages/8));
780 if (!iommu_gart_bitmap) 795 if (!iommu_gart_bitmap)
781 panic("Cannot allocate iommu bitmap\n"); 796 panic("Cannot allocate iommu bitmap\n");
782 memset(iommu_gart_bitmap, 0, iommu_pages/8);
783 797
784#ifdef CONFIG_IOMMU_LEAK 798#ifdef CONFIG_IOMMU_LEAK
785 if (leak_trace) { 799 if (leak_trace) {
786 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 800 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
787 get_order(iommu_pages*sizeof(void *))); 801 get_order(iommu_pages*sizeof(void *)));
788 if (iommu_leak_tab) 802 if (!iommu_leak_tab)
789 memset(iommu_leak_tab, 0, iommu_pages * 8);
790 else
791 printk(KERN_DEBUG 803 printk(KERN_DEBUG
792 "PCI-DMA: Cannot allocate leak trace area\n"); 804 "PCI-DMA: Cannot allocate leak trace area\n");
793 } 805 }
@@ -797,7 +809,7 @@ void __init gart_iommu_init(void)
797 * Out of IOMMU space handling. 809 * Out of IOMMU space handling.
798 * Reserve some invalid pages at the beginning of the GART. 810 * Reserve some invalid pages at the beginning of the GART.
799 */ 811 */
800 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 812 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
801 813
802 agp_memory_reserved = iommu_size; 814 agp_memory_reserved = iommu_size;
803 printk(KERN_INFO 815 printk(KERN_INFO
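iommu_area_reserve() replaces set_bit_string() here but does the same job: it marks the first EMERGENCY_PAGES slots of the allocation bitmap as in use so alloc_iommu() never hands out the reserved pages at the start of the GART. A sketch of such a helper in its generic bitmap-walking form (the name comes from the patch, the body does not):

static void iommu_area_reserve(unsigned long *map, unsigned long start,
			       unsigned int nr)
{
	unsigned long end = start + nr;

	/* mark [start, start + nr) allocated so the allocator skips it */
	while (start < end)
		__set_bit(start++, map);
}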
@@ -855,7 +867,8 @@ void __init gart_parse_options(char *p)
855 if (!strncmp(p, "leak", 4)) { 867 if (!strncmp(p, "leak", 4)) {
856 leak_trace = 1; 868 leak_trace = 1;
857 p += 4; 869 p += 4;
858 if (*p == '=') ++p; 870 if (*p == '=')
871 ++p;
859 if (isdigit(*p) && get_option(&p, &arg)) 872 if (isdigit(*p) && get_option(&p, &arg))
860 iommu_leak_pages = arg; 873 iommu_leak_pages = arg;
861 } 874 }
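gart_dma_ops as trimmed above, and nommu_dma_ops and swiotlb_dma_ops below, are instances of the same dispatch pattern: one struct dma_mapping_ops of function pointers per backend, with hooks a backend does not need simply left NULL instead of being listed explicitly. A rough sketch of how the generic layer calls through such a table (the struct is abbreviated and the dma_ops lookup stands in for however the active table is found; none of this is code from the patch):

struct dma_mapping_ops {
	void *(*alloc_coherent)(struct device *dev, size_t size,
				dma_addr_t *dma_handle, gfp_t gfp);
	dma_addr_t (*map_single)(struct device *dev, phys_addr_t paddr,
				 size_t size, int direction);
	/* ... unmap_single, map_sg, free_coherent, mapping_error, ... */
};

extern struct dma_mapping_ops *dma_ops;	/* points at &gart_dma_ops, &nommu_dma_ops, ... */

static inline dma_addr_t example_map_single(struct device *dev,
					    phys_addr_t paddr, size_t size,
					    int direction)
{
	/*
	 * The wrapper layer checks optional hooks (such as the sync_*
	 * callbacks deleted above) for NULL before calling them; the
	 * mandatory ones are invoked directly.
	 */
	return dma_ops->map_single(dev, paddr, size, direction);
}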
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index aec43d56f49c..c70ab5a5d4c8 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -7,14 +7,14 @@
7#include <linux/dma-mapping.h> 7#include <linux/dma-mapping.h>
8#include <linux/scatterlist.h> 8#include <linux/scatterlist.h>
9 9
10#include <asm/gart.h> 10#include <asm/iommu.h>
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/dma.h> 12#include <asm/dma.h>
13 13
14static int 14static int
15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) 15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
16{ 16{
17 if (hwdev && bus + size > *hwdev->dma_mask) { 17 if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) {
18 if (*hwdev->dma_mask >= DMA_32BIT_MASK) 18 if (*hwdev->dma_mask >= DMA_32BIT_MASK)
19 printk(KERN_ERR 19 printk(KERN_ERR
20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", 20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -72,21 +72,17 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
72 return nents; 72 return nents;
73} 73}
74 74
75/* Make sure we keep the same behaviour */ 75static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
76static int nommu_mapping_error(dma_addr_t dma_addr) 76 dma_addr_t dma_addr)
77{ 77{
78#ifdef CONFIG_X86_32 78 free_pages((unsigned long)vaddr, get_order(size));
79 return 0;
80#else
81 return (dma_addr == bad_dma_address);
82#endif
83} 79}
84 80
85 81struct dma_mapping_ops nommu_dma_ops = {
86const struct dma_mapping_ops nommu_dma_ops = { 82 .alloc_coherent = dma_generic_alloc_coherent,
83 .free_coherent = nommu_free_coherent,
87 .map_single = nommu_map_single, 84 .map_single = nommu_map_single,
88 .map_sg = nommu_map_sg, 85 .map_sg = nommu_map_sg,
89 .mapping_error = nommu_mapping_error,
90 .is_phys = 1, 86 .is_phys = 1,
91}; 87};
92 88
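check_addr() here and need_iommu()/nonforced_iommu() in the GART code now share one addressability test instead of open-coding the comparison. The helper reduces to a single range check against the device's DMA mask; a minimal sketch matching the semantics used in this patch (the exact header definition may differ):

static inline int is_buffer_dma_capable(u64 mask, dma_addr_t addr, size_t size)
{
	/* the whole buffer [addr, addr + size) must sit below the device mask */
	return addr + size <= mask;
}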
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 82299cd1d04d..c4ce0332759e 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -5,7 +5,7 @@
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/dma-mapping.h> 6#include <linux/dma-mapping.h>
7 7
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/swiotlb.h> 9#include <asm/swiotlb.h>
10#include <asm/dma.h> 10#include <asm/dma.h>
11 11
@@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); 18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
19} 19}
20 20
21const struct dma_mapping_ops swiotlb_dma_ops = { 21struct dma_mapping_ops swiotlb_dma_ops = {
22 .mapping_error = swiotlb_dma_mapping_error, 22 .mapping_error = swiotlb_dma_mapping_error,
23 .alloc_coherent = swiotlb_alloc_coherent, 23 .alloc_coherent = swiotlb_alloc_coherent,
24 .free_coherent = swiotlb_free_coherent, 24 .free_coherent = swiotlb_free_coherent,
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
index bc1f2d3ea277..a311ffcaad16 100644
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -1,20 +1,13 @@
1#include <linux/platform_device.h> 1#include <linux/platform_device.h>
2#include <linux/errno.h> 2#include <linux/err.h>
3#include <linux/init.h> 3#include <linux/init.h>
4 4
5static __init int add_pcspkr(void) 5static __init int add_pcspkr(void)
6{ 6{
7 struct platform_device *pd; 7 struct platform_device *pd;
8 int ret;
9 8
10 pd = platform_device_alloc("pcspkr", -1); 9 pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
11 if (!pd)
12 return -ENOMEM;
13 10
14 ret = platform_device_add(pd); 11 return IS_ERR(pd) ? PTR_ERR(pd) : 0;
15 if (ret)
16 platform_device_put(pd);
17
18 return ret;
19} 12}
20device_initcall(add_pcspkr); 13device_initcall(add_pcspkr);
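platform_device_register_simple() folds the alloc/add/put sequence the old add_pcspkr() spelled out into one call that returns either a live device or an ERR_PTR() value, which is why the rewritten function only needs the IS_ERR()/PTR_ERR() conversion. A sketch of the resulting calling convention (the device name is arbitrary, not from this patch):

	struct platform_device *pd;

	pd = platform_device_register_simple("example-dev", -1, NULL, 0);
	if (IS_ERR(pd))
		return PTR_ERR(pd);	/* allocation or registration failed */

	/* on success the device is already registered on the platform bus */
	return 0;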
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4d629c62f4f8..c622772744d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -184,7 +184,8 @@ static void mwait_idle(void)
184static void poll_idle(void) 184static void poll_idle(void)
185{ 185{
186 local_irq_enable(); 186 local_irq_enable();
187 cpu_relax(); 187 while (!need_resched())
188 cpu_relax();
188} 189}
189 190
190/* 191/*
@@ -199,6 +200,7 @@ static void poll_idle(void)
199 * 200 *
200 * idle=mwait overrides this decision and forces the usage of mwait. 201 * idle=mwait overrides this decision and forces the usage of mwait.
201 */ 202 */
203static int __cpuinitdata force_mwait;
202 204
203#define MWAIT_INFO 0x05 205#define MWAIT_INFO 0x05
204#define MWAIT_ECX_EXTENDED_INFO 0x01 206#define MWAIT_ECX_EXTENDED_INFO 0x01
@@ -244,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
244 return 1; 246 return 1;
245} 247}
246 248
249static cpumask_t c1e_mask = CPU_MASK_NONE;
250static int c1e_detected;
251
252void c1e_remove_cpu(int cpu)
253{
254 cpu_clear(cpu, c1e_mask);
255}
256
247/* 257/*
248 * C1E aware idle routine. We check for C1E active in the interrupt 258 * C1E aware idle routine. We check for C1E active in the interrupt
249 * pending message MSR. If we detect C1E, then we handle it the same 259 * pending message MSR. If we detect C1E, then we handle it the same
@@ -251,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
251 */ 261 */
252static void c1e_idle(void) 262static void c1e_idle(void)
253{ 263{
254 static cpumask_t c1e_mask = CPU_MASK_NONE;
255 static int c1e_detected;
256
257 if (need_resched()) 264 if (need_resched())
258 return; 265 return;
259 266
@@ -263,8 +270,10 @@ static void c1e_idle(void)
263 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 270 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
264 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 271 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
265 c1e_detected = 1; 272 c1e_detected = 1;
266 mark_tsc_unstable("TSC halt in C1E"); 273 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
267 printk(KERN_INFO "System has C1E enabled\n"); 274 mark_tsc_unstable("TSC halt in AMD C1E");
275 printk(KERN_INFO "System has AMD C1E enabled\n");
276 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
268 } 277 }
269 } 278 }
270 279
@@ -326,6 +335,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
326 335
327static int __init idle_setup(char *str) 336static int __init idle_setup(char *str)
328{ 337{
338 if (!str)
339 return -EINVAL;
340
329 if (!strcmp(str, "poll")) { 341 if (!strcmp(str, "poll")) {
330 printk("using polling idle threads.\n"); 342 printk("using polling idle threads.\n");
331 pm_idle = poll_idle; 343 pm_idle = poll_idle;
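The poll_idle() fix above matters because the old body ran a single cpu_relax() and returned, bouncing through cpu_idle()'s outer loop on every pass; a polling idle routine is expected to spin locally until the scheduler asks for a reschedule. The patched routine, restated with comments (same logic as the hunk above):

static void poll_idle(void)
{
	local_irq_enable();		/* poll with interrupts enabled */

	/*
	 * Spin until this CPU is marked as needing a reschedule; cpu_relax()
	 * issues PAUSE so the busy-wait stays friendly to the sibling
	 * hyperthread and the memory system.
	 */
	while (!need_resched())
		cpu_relax();
}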
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0c3927accb00..0a1302fe6d45 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
37#include <linux/tick.h> 37#include <linux/tick.h>
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/dmi.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
@@ -55,6 +56,9 @@
55#include <asm/tlbflush.h> 56#include <asm/tlbflush.h>
56#include <asm/cpu.h> 57#include <asm/cpu.h>
57#include <asm/kdebug.h> 58#include <asm/kdebug.h>
59#include <asm/idle.h>
60#include <asm/syscalls.h>
61#include <asm/smp.h>
58 62
59asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 63asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
60 64
@@ -72,47 +76,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
72 return ((unsigned long *)tsk->thread.sp)[3]; 76 return ((unsigned long *)tsk->thread.sp)[3];
73} 77}
74 78
75#ifdef CONFIG_HOTPLUG_CPU 79#ifndef CONFIG_SMP
76#include <asm/nmi.h>
77
78static void cpu_exit_clear(void)
79{
80 int cpu = raw_smp_processor_id();
81
82 idle_task_exit();
83
84 cpu_uninit();
85 irq_ctx_exit(cpu);
86
87 cpu_clear(cpu, cpu_callout_map);
88 cpu_clear(cpu, cpu_callin_map);
89
90 numa_remove_cpu(cpu);
91}
92
93/* We don't actually take CPU down, just spin without interrupts. */
94static inline void play_dead(void)
95{
96 /* This must be done before dead CPU ack */
97 cpu_exit_clear();
98 wbinvd();
99 mb();
100 /* Ack it */
101 __get_cpu_var(cpu_state) = CPU_DEAD;
102
103 /*
104 * With physical CPU hotplug, we should halt the cpu
105 */
106 local_irq_disable();
107 while (1)
108 halt();
109}
110#else
111static inline void play_dead(void) 80static inline void play_dead(void)
112{ 81{
113 BUG(); 82 BUG();
114} 83}
115#endif /* CONFIG_HOTPLUG_CPU */ 84#endif
116 85
117/* 86/*
118 * The idle thread. There's no useful work to be 87 * The idle thread. There's no useful work to be
@@ -128,7 +97,7 @@ void cpu_idle(void)
128 97
129 /* endless idle loop with no priority at all */ 98 /* endless idle loop with no priority at all */
130 while (1) { 99 while (1) {
131 tick_nohz_stop_sched_tick(); 100 tick_nohz_stop_sched_tick(1);
132 while (!need_resched()) { 101 while (!need_resched()) {
133 102
134 check_pgt_cache(); 103 check_pgt_cache();
@@ -154,12 +123,13 @@ void cpu_idle(void)
154 } 123 }
155} 124}
156 125
157void __show_registers(struct pt_regs *regs, int all) 126void __show_regs(struct pt_regs *regs, int all)
158{ 127{
159 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; 128 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
160 unsigned long d0, d1, d2, d3, d6, d7; 129 unsigned long d0, d1, d2, d3, d6, d7;
161 unsigned long sp; 130 unsigned long sp;
162 unsigned short ss, gs; 131 unsigned short ss, gs;
132 const char *board;
163 133
164 if (user_mode_vm(regs)) { 134 if (user_mode_vm(regs)) {
165 sp = regs->sp; 135 sp = regs->sp;
@@ -172,11 +142,15 @@ void __show_registers(struct pt_regs *regs, int all)
172 } 142 }
173 143
174 printk("\n"); 144 printk("\n");
175 printk("Pid: %d, comm: %s %s (%s %.*s)\n", 145
146 board = dmi_get_system_info(DMI_PRODUCT_NAME);
147 if (!board)
148 board = "";
149 printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
176 task_pid_nr(current), current->comm, 150 task_pid_nr(current), current->comm,
177 print_tainted(), init_utsname()->release, 151 print_tainted(), init_utsname()->release,
178 (int)strcspn(init_utsname()->version, " "), 152 (int)strcspn(init_utsname()->version, " "),
179 init_utsname()->version); 153 init_utsname()->version, board);
180 154
181 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", 155 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
182 (u16)regs->cs, regs->ip, regs->flags, 156 (u16)regs->cs, regs->ip, regs->flags,
@@ -215,7 +189,7 @@ void __show_registers(struct pt_regs *regs, int all)
215 189
216void show_regs(struct pt_regs *regs) 190void show_regs(struct pt_regs *regs)
217{ 191{
218 __show_registers(regs, 1); 192 __show_regs(regs, 1);
219 show_trace(NULL, regs, &regs->sp, regs->bp); 193 show_trace(NULL, regs, &regs->sp, regs->bp);
220} 194}
221 195
@@ -276,6 +250,14 @@ void exit_thread(void)
276 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 250 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
277 put_cpu(); 251 put_cpu();
278 } 252 }
253#ifdef CONFIG_X86_DS
254 /* Free any DS contexts that have not been properly released. */
255 if (unlikely(current->thread.ds_ctx)) {
256 /* we clear debugctl to make sure DS is not used. */
257 update_debugctlmsr(0);
258 ds_free(current->thread.ds_ctx);
259 }
260#endif /* CONFIG_X86_DS */
279} 261}
280 262
281void flush_thread(void) 263void flush_thread(void)
@@ -437,6 +419,35 @@ int set_tsc_mode(unsigned int val)
437 return 0; 419 return 0;
438} 420}
439 421
422#ifdef CONFIG_X86_DS
423static int update_debugctl(struct thread_struct *prev,
424 struct thread_struct *next, unsigned long debugctl)
425{
426 unsigned long ds_prev = 0;
427 unsigned long ds_next = 0;
428
429 if (prev->ds_ctx)
430 ds_prev = (unsigned long)prev->ds_ctx->ds;
431 if (next->ds_ctx)
432 ds_next = (unsigned long)next->ds_ctx->ds;
433
434 if (ds_next != ds_prev) {
435 /* we clear debugctl to make sure DS
436 * is not in use when we change it */
437 debugctl = 0;
438 update_debugctlmsr(0);
439 wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
440 }
441 return debugctl;
442}
443#else
444static int update_debugctl(struct thread_struct *prev,
445 struct thread_struct *next, unsigned long debugctl)
446{
447 return debugctl;
448}
449#endif /* CONFIG_X86_DS */
450
440static noinline void 451static noinline void
441__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 452__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
442 struct tss_struct *tss) 453 struct tss_struct *tss)
@@ -447,14 +458,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
447 prev = &prev_p->thread; 458 prev = &prev_p->thread;
448 next = &next_p->thread; 459 next = &next_p->thread;
449 460
450 debugctl = prev->debugctlmsr; 461 debugctl = update_debugctl(prev, next, prev->debugctlmsr);
451 if (next->ds_area_msr != prev->ds_area_msr) {
452 /* we clear debugctl to make sure DS
453 * is not in use when we change it */
454 debugctl = 0;
455 update_debugctlmsr(0);
456 wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
457 }
458 462
459 if (next->debugctlmsr != debugctl) 463 if (next->debugctlmsr != debugctl)
460 update_debugctlmsr(next->debugctlmsr); 464 update_debugctlmsr(next->debugctlmsr);
@@ -478,13 +482,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
478 hard_enable_TSC(); 482 hard_enable_TSC();
479 } 483 }
480 484
481#ifdef X86_BTS 485#ifdef CONFIG_X86_PTRACE_BTS
482 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 486 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
483 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 487 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
484 488
485 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 489 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
486 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 490 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
487#endif 491#endif /* CONFIG_X86_PTRACE_BTS */
488 492
489 493
490 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 494 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
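The update_debugctl() helper introduced above preserves the invariant of the code it replaces: before MSR_IA32_DS_AREA is pointed at the next task's DS buffer, DEBUGCTL is cleared so the CPU cannot write BTS/PEBS records through a half-switched pointer, and the caller re-arms next->debugctlmsr afterwards. A condensed sketch of that ordering (ds_area_of() is a placeholder for the prev/next ds_ctx->ds lookups, not a real function):

	unsigned long debugctl = prev->debugctlmsr;

	if (ds_area_of(next) != ds_area_of(prev)) {
		debugctl = 0;
		update_debugctlmsr(0);		/* stop BTS/PEBS before moving the DS area */
		wrmsr(MSR_IA32_DS_AREA, ds_area_of(next), 0);
	}

	if (next->debugctlmsr != debugctl)
		update_debugctlmsr(next->debugctlmsr);	/* re-enable tracing for the incoming task */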
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a8e53626ac9a..c958120fb1b6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,11 +37,11 @@
37#include <linux/kdebug.h> 37#include <linux/kdebug.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/uaccess.h>
41#include <linux/io.h>
40 42
41#include <asm/uaccess.h>
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
43#include <asm/system.h> 44#include <asm/system.h>
44#include <asm/io.h>
45#include <asm/processor.h> 45#include <asm/processor.h>
46#include <asm/i387.h> 46#include <asm/i387.h>
47#include <asm/mmu_context.h> 47#include <asm/mmu_context.h>
@@ -51,6 +51,7 @@
51#include <asm/proto.h> 51#include <asm/proto.h>
52#include <asm/ia32.h> 52#include <asm/ia32.h>
53#include <asm/idle.h> 53#include <asm/idle.h>
54#include <asm/syscalls.h>
54 55
55asmlinkage extern void ret_from_fork(void); 56asmlinkage extern void ret_from_fork(void);
56 57
@@ -62,6 +63,13 @@ void idle_notifier_register(struct notifier_block *n)
62{ 63{
63 atomic_notifier_chain_register(&idle_notifier, n); 64 atomic_notifier_chain_register(&idle_notifier, n);
64} 65}
66EXPORT_SYMBOL_GPL(idle_notifier_register);
67
68void idle_notifier_unregister(struct notifier_block *n)
69{
70 atomic_notifier_chain_unregister(&idle_notifier, n);
71}
72EXPORT_SYMBOL_GPL(idle_notifier_unregister);
65 73
66void enter_idle(void) 74void enter_idle(void)
67{ 75{
@@ -85,29 +93,12 @@ void exit_idle(void)
85 __exit_idle(); 93 __exit_idle();
86} 94}
87 95
88#ifdef CONFIG_HOTPLUG_CPU 96#ifndef CONFIG_SMP
89DECLARE_PER_CPU(int, cpu_state);
90
91#include <asm/nmi.h>
92/* We halt the CPU with physical CPU hotplug */
93static inline void play_dead(void)
94{
95 idle_task_exit();
96 wbinvd();
97 mb();
98 /* Ack it */
99 __get_cpu_var(cpu_state) = CPU_DEAD;
100
101 local_irq_disable();
102 while (1)
103 halt();
104}
105#else
106static inline void play_dead(void) 97static inline void play_dead(void)
107{ 98{
108 BUG(); 99 BUG();
109} 100}
110#endif /* CONFIG_HOTPLUG_CPU */ 101#endif
111 102
112/* 103/*
113 * The idle thread. There's no useful work to be 104 * The idle thread. There's no useful work to be
@@ -120,7 +111,7 @@ void cpu_idle(void)
120 current_thread_info()->status |= TS_POLLING; 111 current_thread_info()->status |= TS_POLLING;
121 /* endless idle loop with no priority at all */ 112 /* endless idle loop with no priority at all */
122 while (1) { 113 while (1) {
123 tick_nohz_stop_sched_tick(); 114 tick_nohz_stop_sched_tick(1);
124 while (!need_resched()) { 115 while (!need_resched()) {
125 116
126 rmb(); 117 rmb();
@@ -152,7 +143,7 @@ void cpu_idle(void)
152} 143}
153 144
154/* Prints also some state that isn't saved in the pt_regs */ 145/* Prints also some state that isn't saved in the pt_regs */
155void __show_regs(struct pt_regs * regs) 146void __show_regs(struct pt_regs *regs, int all)
156{ 147{
157 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 148 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
158 unsigned long d0, d1, d2, d3, d6, d7; 149 unsigned long d0, d1, d2, d3, d6, d7;
@@ -161,60 +152,65 @@ void __show_regs(struct pt_regs * regs)
161 152
162 printk("\n"); 153 printk("\n");
163 print_modules(); 154 print_modules();
164 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 155 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
165 current->pid, current->comm, print_tainted(), 156 current->pid, current->comm, print_tainted(),
166 init_utsname()->release, 157 init_utsname()->release,
167 (int)strcspn(init_utsname()->version, " "), 158 (int)strcspn(init_utsname()->version, " "),
168 init_utsname()->version); 159 init_utsname()->version);
169 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); 160 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
170 printk_address(regs->ip, 1); 161 printk_address(regs->ip, 1);
171 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, 162 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
172 regs->flags); 163 regs->sp, regs->flags);
173 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", 164 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
174 regs->ax, regs->bx, regs->cx); 165 regs->ax, regs->bx, regs->cx);
175 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", 166 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
176 regs->dx, regs->si, regs->di); 167 regs->dx, regs->si, regs->di);
177 printk("RBP: %016lx R08: %016lx R09: %016lx\n", 168 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
178 regs->bp, regs->r8, regs->r9); 169 regs->bp, regs->r8, regs->r9);
179 printk("R10: %016lx R11: %016lx R12: %016lx\n", 170 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
180 regs->r10, regs->r11, regs->r12); 171 regs->r10, regs->r11, regs->r12);
181 printk("R13: %016lx R14: %016lx R15: %016lx\n", 172 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
182 regs->r13, regs->r14, regs->r15); 173 regs->r13, regs->r14, regs->r15);
183 174
184 asm("movl %%ds,%0" : "=r" (ds)); 175 asm("movl %%ds,%0" : "=r" (ds));
185 asm("movl %%cs,%0" : "=r" (cs)); 176 asm("movl %%cs,%0" : "=r" (cs));
186 asm("movl %%es,%0" : "=r" (es)); 177 asm("movl %%es,%0" : "=r" (es));
187 asm("movl %%fs,%0" : "=r" (fsindex)); 178 asm("movl %%fs,%0" : "=r" (fsindex));
188 asm("movl %%gs,%0" : "=r" (gsindex)); 179 asm("movl %%gs,%0" : "=r" (gsindex));
189 180
190 rdmsrl(MSR_FS_BASE, fs); 181 rdmsrl(MSR_FS_BASE, fs);
191 rdmsrl(MSR_GS_BASE, gs); 182 rdmsrl(MSR_GS_BASE, gs);
192 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 183 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
184
185 if (!all)
186 return;
193 187
194 cr0 = read_cr0(); 188 cr0 = read_cr0();
195 cr2 = read_cr2(); 189 cr2 = read_cr2();
196 cr3 = read_cr3(); 190 cr3 = read_cr3();
197 cr4 = read_cr4(); 191 cr4 = read_cr4();
198 192
199 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 193 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
200 fs,fsindex,gs,gsindex,shadowgs); 194 fs, fsindex, gs, gsindex, shadowgs);
201 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 195 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
202 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); 196 es, cr0);
197 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
198 cr4);
203 199
204 get_debugreg(d0, 0); 200 get_debugreg(d0, 0);
205 get_debugreg(d1, 1); 201 get_debugreg(d1, 1);
206 get_debugreg(d2, 2); 202 get_debugreg(d2, 2);
207 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 203 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
208 get_debugreg(d3, 3); 204 get_debugreg(d3, 3);
209 get_debugreg(d6, 6); 205 get_debugreg(d6, 6);
210 get_debugreg(d7, 7); 206 get_debugreg(d7, 7);
211 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 207 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
212} 208}
213 209
214void show_regs(struct pt_regs *regs) 210void show_regs(struct pt_regs *regs)
215{ 211{
216 printk("CPU %d:", smp_processor_id()); 212 printk(KERN_INFO "CPU %d:", smp_processor_id());
217 __show_regs(regs); 213 __show_regs(regs, 1);
218 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 214 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
219} 215}
220 216
@@ -239,6 +235,14 @@ void exit_thread(void)
239 t->io_bitmap_max = 0; 235 t->io_bitmap_max = 0;
240 put_cpu(); 236 put_cpu();
241 } 237 }
238#ifdef CONFIG_X86_DS
239 /* Free any DS contexts that have not been properly released. */
240 if (unlikely(t->ds_ctx)) {
241 /* we clear debugctl to make sure DS is not used. */
242 update_debugctlmsr(0);
243 ds_free(t->ds_ctx);
244 }
245#endif /* CONFIG_X86_DS */
242} 246}
243 247
244void flush_thread(void) 248void flush_thread(void)
@@ -314,10 +318,10 @@ void prepare_to_copy(struct task_struct *tsk)
314 318
315int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, 319int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
316 unsigned long unused, 320 unsigned long unused,
317 struct task_struct * p, struct pt_regs * regs) 321 struct task_struct *p, struct pt_regs *regs)
318{ 322{
319 int err; 323 int err;
320 struct pt_regs * childregs; 324 struct pt_regs *childregs;
321 struct task_struct *me = current; 325 struct task_struct *me = current;
322 326
323 childregs = ((struct pt_regs *) 327 childregs = ((struct pt_regs *)
@@ -362,10 +366,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
362 if (test_thread_flag(TIF_IA32)) 366 if (test_thread_flag(TIF_IA32))
363 err = do_set_thread_area(p, -1, 367 err = do_set_thread_area(p, -1,
364 (struct user_desc __user *)childregs->si, 0); 368 (struct user_desc __user *)childregs->si, 0);
365 else 369 else
366#endif 370#endif
367 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 371 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
368 if (err) 372 if (err)
369 goto out; 373 goto out;
370 } 374 }
371 err = 0; 375 err = 0;
@@ -472,13 +476,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
472 next = &next_p->thread; 476 next = &next_p->thread;
473 477
474 debugctl = prev->debugctlmsr; 478 debugctl = prev->debugctlmsr;
475 if (next->ds_area_msr != prev->ds_area_msr) { 479
476 /* we clear debugctl to make sure DS 480#ifdef CONFIG_X86_DS
477 * is not in use when we change it */ 481 {
478 debugctl = 0; 482 unsigned long ds_prev = 0, ds_next = 0;
479 update_debugctlmsr(0); 483
480 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); 484 if (prev->ds_ctx)
485 ds_prev = (unsigned long)prev->ds_ctx->ds;
486 if (next->ds_ctx)
487 ds_next = (unsigned long)next->ds_ctx->ds;
488
489 if (ds_next != ds_prev) {
490 /*
491 * We clear debugctl to make sure DS
492 * is not in use when we change it:
493 */
494 debugctl = 0;
495 update_debugctlmsr(0);
496 wrmsrl(MSR_IA32_DS_AREA, ds_next);
497 }
481 } 498 }
499#endif /* CONFIG_X86_DS */
482 500
483 if (next->debugctlmsr != debugctl) 501 if (next->debugctlmsr != debugctl)
484 update_debugctlmsr(next->debugctlmsr); 502 update_debugctlmsr(next->debugctlmsr);
@@ -516,13 +534,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
516 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 534 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
517 } 535 }
518 536
519#ifdef X86_BTS 537#ifdef CONFIG_X86_PTRACE_BTS
520 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 538 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
521 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 539 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
522 540
523 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 541 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
524 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 542 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
525#endif 543#endif /* CONFIG_X86_PTRACE_BTS */
526} 544}
527 545
528/* 546/*
@@ -537,14 +555,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
537struct task_struct * 555struct task_struct *
538__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 556__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
539{ 557{
540 struct thread_struct *prev = &prev_p->thread, 558 struct thread_struct *prev = &prev_p->thread;
541 *next = &next_p->thread; 559 struct thread_struct *next = &next_p->thread;
542 int cpu = smp_processor_id(); 560 int cpu = smp_processor_id();
543 struct tss_struct *tss = &per_cpu(init_tss, cpu); 561 struct tss_struct *tss = &per_cpu(init_tss, cpu);
544 unsigned fsindex, gsindex; 562 unsigned fsindex, gsindex;
545 563
546 /* we're going to use this soon, after a few expensive things */ 564 /* we're going to use this soon, after a few expensive things */
547 if (next_p->fpu_counter>5) 565 if (next_p->fpu_counter > 5)
548 prefetch(next->xstate); 566 prefetch(next->xstate);
549 567
550 /* 568 /*
@@ -552,13 +570,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
552 */ 570 */
553 load_sp0(tss, next); 571 load_sp0(tss, next);
554 572
555 /* 573 /*
556 * Switch DS and ES. 574 * Switch DS and ES.
557 * This won't pick up thread selector changes, but I guess that is ok. 575 * This won't pick up thread selector changes, but I guess that is ok.
558 */ 576 */
559 savesegment(es, prev->es); 577 savesegment(es, prev->es);
560 if (unlikely(next->es | prev->es)) 578 if (unlikely(next->es | prev->es))
561 loadsegment(es, next->es); 579 loadsegment(es, next->es);
562 580
563 savesegment(ds, prev->ds); 581 savesegment(ds, prev->ds);
564 if (unlikely(next->ds | prev->ds)) 582 if (unlikely(next->ds | prev->ds))
@@ -584,50 +602,50 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
584 */ 602 */
585 arch_leave_lazy_cpu_mode(); 603 arch_leave_lazy_cpu_mode();
586 604
587 /* 605 /*
588 * Switch FS and GS. 606 * Switch FS and GS.
607 *
608 * Segment register != 0 always requires a reload. Also
609 * reload when it has changed. When prev process used 64bit
610 * base always reload to avoid an information leak.
589 */ 611 */
590 { 612 if (unlikely(fsindex | next->fsindex | prev->fs)) {
591 /* segment register != 0 always requires a reload. 613 loadsegment(fs, next->fsindex);
592 also reload when it has changed. 614 /*
593 when prev process used 64bit base always reload 615 * Check if the user used a selector != 0; if yes
594 to avoid an information leak. */ 616 * clear 64bit base, since overloaded base is always
595 if (unlikely(fsindex | next->fsindex | prev->fs)) { 617 * mapped to the Null selector
596 loadsegment(fs, next->fsindex); 618 */
597 /* check if the user used a selector != 0 619 if (fsindex)
598 * if yes clear 64bit base, since overloaded base 620 prev->fs = 0;
599 * is always mapped to the Null selector 621 }
600 */ 622 /* when next process has a 64bit base use it */
601 if (fsindex) 623 if (next->fs)
602 prev->fs = 0; 624 wrmsrl(MSR_FS_BASE, next->fs);
603 } 625 prev->fsindex = fsindex;
604 /* when next process has a 64bit base use it */ 626
605 if (next->fs) 627 if (unlikely(gsindex | next->gsindex | prev->gs)) {
606 wrmsrl(MSR_FS_BASE, next->fs); 628 load_gs_index(next->gsindex);
607 prev->fsindex = fsindex; 629 if (gsindex)
608 630 prev->gs = 0;
609 if (unlikely(gsindex | next->gsindex | prev->gs)) {
610 load_gs_index(next->gsindex);
611 if (gsindex)
612 prev->gs = 0;
613 }
614 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
616 prev->gsindex = gsindex;
617 } 631 }
632 if (next->gs)
633 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
634 prev->gsindex = gsindex;
618 635
619 /* Must be after DS reload */ 636 /* Must be after DS reload */
620 unlazy_fpu(prev_p); 637 unlazy_fpu(prev_p);
621 638
622 /* 639 /*
623 * Switch the PDA and FPU contexts. 640 * Switch the PDA and FPU contexts.
624 */ 641 */
625 prev->usersp = read_pda(oldrsp); 642 prev->usersp = read_pda(oldrsp);
626 write_pda(oldrsp, next->usersp); 643 write_pda(oldrsp, next->usersp);
627 write_pda(pcurrent, next_p); 644 write_pda(pcurrent, next_p);
628 645
629 write_pda(kernelstack, 646 write_pda(kernelstack,
630 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); 647 (unsigned long)task_stack_page(next_p) +
648 THREAD_SIZE - PDA_STACKOFFSET);
631#ifdef CONFIG_CC_STACKPROTECTOR 649#ifdef CONFIG_CC_STACKPROTECTOR
632 write_pda(stack_canary, next_p->stack_canary); 650 write_pda(stack_canary, next_p->stack_canary);
633 /* 651 /*
@@ -664,7 +682,7 @@ long sys_execve(char __user *name, char __user * __user *argv,
664 char __user * __user *envp, struct pt_regs *regs) 682 char __user * __user *envp, struct pt_regs *regs)
665{ 683{
666 long error; 684 long error;
667 char * filename; 685 char *filename;
668 686
669 filename = getname(name); 687 filename = getname(name);
670 error = PTR_ERR(filename); 688 error = PTR_ERR(filename);
@@ -722,55 +740,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs)
722unsigned long get_wchan(struct task_struct *p) 740unsigned long get_wchan(struct task_struct *p)
723{ 741{
724 unsigned long stack; 742 unsigned long stack;
725 u64 fp,ip; 743 u64 fp, ip;
726 int count = 0; 744 int count = 0;
727 745
728 if (!p || p == current || p->state==TASK_RUNNING) 746 if (!p || p == current || p->state == TASK_RUNNING)
729 return 0; 747 return 0;
730 stack = (unsigned long)task_stack_page(p); 748 stack = (unsigned long)task_stack_page(p);
731 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) 749 if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
732 return 0; 750 return 0;
733 fp = *(u64 *)(p->thread.sp); 751 fp = *(u64 *)(p->thread.sp);
734 do { 752 do {
735 if (fp < (unsigned long)stack || 753 if (fp < (unsigned long)stack ||
736 fp > (unsigned long)stack+THREAD_SIZE) 754 fp >= (unsigned long)stack+THREAD_SIZE)
737 return 0; 755 return 0;
738 ip = *(u64 *)(fp+8); 756 ip = *(u64 *)(fp+8);
739 if (!in_sched_functions(ip)) 757 if (!in_sched_functions(ip))
740 return ip; 758 return ip;
741 fp = *(u64 *)fp; 759 fp = *(u64 *)fp;
742 } while (count++ < 16); 760 } while (count++ < 16);
743 return 0; 761 return 0;
744} 762}
745 763
746long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) 764long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
747{ 765{
748 int ret = 0; 766 int ret = 0;
749 int doit = task == current; 767 int doit = task == current;
750 int cpu; 768 int cpu;
751 769
752 switch (code) { 770 switch (code) {
753 case ARCH_SET_GS: 771 case ARCH_SET_GS:
754 if (addr >= TASK_SIZE_OF(task)) 772 if (addr >= TASK_SIZE_OF(task))
755 return -EPERM; 773 return -EPERM;
756 cpu = get_cpu(); 774 cpu = get_cpu();
757 /* handle small bases via the GDT because that's faster to 775 /* handle small bases via the GDT because that's faster to
758 switch. */ 776 switch. */
759 if (addr <= 0xffffffff) { 777 if (addr <= 0xffffffff) {
760 set_32bit_tls(task, GS_TLS, addr); 778 set_32bit_tls(task, GS_TLS, addr);
761 if (doit) { 779 if (doit) {
762 load_TLS(&task->thread, cpu); 780 load_TLS(&task->thread, cpu);
763 load_gs_index(GS_TLS_SEL); 781 load_gs_index(GS_TLS_SEL);
764 } 782 }
765 task->thread.gsindex = GS_TLS_SEL; 783 task->thread.gsindex = GS_TLS_SEL;
766 task->thread.gs = 0; 784 task->thread.gs = 0;
767 } else { 785 } else {
768 task->thread.gsindex = 0; 786 task->thread.gsindex = 0;
769 task->thread.gs = addr; 787 task->thread.gs = addr;
770 if (doit) { 788 if (doit) {
771 load_gs_index(0); 789 load_gs_index(0);
772 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 790 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
773 } 791 }
774 } 792 }
775 put_cpu(); 793 put_cpu();
776 break; 794 break;
@@ -824,8 +842,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
824 rdmsrl(MSR_KERNEL_GS_BASE, base); 842 rdmsrl(MSR_KERNEL_GS_BASE, base);
825 else 843 else
826 base = task->thread.gs; 844 base = task->thread.gs;
827 } 845 } else
828 else
829 base = task->thread.gs; 846 base = task->thread.gs;
830 ret = put_user(base, (unsigned long __user *)addr); 847 ret = put_user(base, (unsigned long __user *)addr);
831 break; 848 break;
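The reindented FS/GS block in __switch_to() implements a three-way reload test: the segment is reloaded if the selector currently in the CPU, the incoming task's selector, or the outgoing task's 64-bit base is non-zero, and prev->fs is zeroed when the user had a real selector loaded, since that selector makes the saved MSR base stale. The FS half of the decision, stripped down for reading (same logic as the hunk above; GS is symmetrical via load_gs_index() and MSR_KERNEL_GS_BASE):

	savesegment(fs, fsindex);			/* selector currently loaded in the CPU */

	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * A non-zero user selector means the 64-bit base in
		 * MSR_FS_BASE no longer describes this task; drop it.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);		/* only tasks with a 64-bit base pay the MSR write */
	prev->fsindex = fsindex;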
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 77040b6070e1..0a6d8c12e10d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -14,6 +14,7 @@
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/ptrace.h> 15#include <linux/ptrace.h>
16#include <linux/regset.h> 16#include <linux/regset.h>
17#include <linux/tracehook.h>
17#include <linux/user.h> 18#include <linux/user.h>
18#include <linux/elf.h> 19#include <linux/elf.h>
19#include <linux/security.h> 20#include <linux/security.h>
@@ -39,7 +40,9 @@ enum x86_regset {
39 REGSET_GENERAL, 40 REGSET_GENERAL,
40 REGSET_FP, 41 REGSET_FP,
41 REGSET_XFP, 42 REGSET_XFP,
43 REGSET_IOPERM64 = REGSET_XFP,
42 REGSET_TLS, 44 REGSET_TLS,
45 REGSET_IOPERM32,
43}; 46};
44 47
45/* 48/*
@@ -69,7 +72,7 @@ static inline bool invalid_selector(u16 value)
69 72
70#define FLAG_MASK FLAG_MASK_32 73#define FLAG_MASK FLAG_MASK_32
71 74
72static long *pt_regs_access(struct pt_regs *regs, unsigned long regno) 75static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
73{ 76{
74 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); 77 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
75 regno >>= 2; 78 regno >>= 2;
@@ -554,45 +557,138 @@ static int ptrace_set_debugreg(struct task_struct *child,
554 return 0; 557 return 0;
555} 558}
556 559
557#ifdef X86_BTS 560/*
561 * These access the current or another (stopped) task's io permission
562 * bitmap for debugging or core dump.
563 */
564static int ioperm_active(struct task_struct *target,
565 const struct user_regset *regset)
566{
567 return target->thread.io_bitmap_max / regset->size;
568}
558 569
559static int ptrace_bts_get_size(struct task_struct *child) 570static int ioperm_get(struct task_struct *target,
571 const struct user_regset *regset,
572 unsigned int pos, unsigned int count,
573 void *kbuf, void __user *ubuf)
560{ 574{
561 if (!child->thread.ds_area_msr) 575 if (!target->thread.io_bitmap_ptr)
562 return -ENXIO; 576 return -ENXIO;
563 577
564 return ds_get_bts_index((void *)child->thread.ds_area_msr); 578 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
579 target->thread.io_bitmap_ptr,
580 0, IO_BITMAP_BYTES);
581}
582
583#ifdef CONFIG_X86_PTRACE_BTS
584/*
585 * The configuration for a particular BTS hardware implementation.
586 */
587struct bts_configuration {
588 /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
589 unsigned char sizeof_bts;
590 /* the size of a field in the BTS record in bytes */
591 unsigned char sizeof_field;
592 /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
593 unsigned long debugctl_mask;
594};
595static struct bts_configuration bts_cfg;
596
597#define BTS_MAX_RECORD_SIZE (8 * 3)
598
599
600/*
601 * Branch Trace Store (BTS) uses the following format. Different
602 * architectures vary in the size of those fields.
603 * - source linear address
604 * - destination linear address
605 * - flags
606 *
607 * Later architectures use 64bit pointers throughout, whereas earlier
608 * architectures use 32bit pointers in 32bit mode.
609 *
610 * We compute the base address for the first 8 fields based on:
611 * - the field size stored in the DS configuration
612 * - the relative field position
613 *
614 * In order to store additional information in the BTS buffer, we use
615 * a special source address to indicate that the record requires
616 * special interpretation.
617 *
618 * Netburst indicated via a bit in the flags field whether the branch
619 * was predicted; this is ignored.
620 */
621
622enum bts_field {
623 bts_from = 0,
624 bts_to,
625 bts_flags,
626
627 bts_escape = (unsigned long)-1,
628 bts_qual = bts_to,
629 bts_jiffies = bts_flags
630};
631
632static inline unsigned long bts_get(const char *base, enum bts_field field)
633{
634 base += (bts_cfg.sizeof_field * field);
635 return *(unsigned long *)base;
565} 636}
566 637
567static int ptrace_bts_read_record(struct task_struct *child, 638static inline void bts_set(char *base, enum bts_field field, unsigned long val)
568 long index, 639{
640 base += (bts_cfg.sizeof_field * field);;
641 (*(unsigned long *)base) = val;
642}
643
644/*
645 * Translate a BTS record from the raw format into the bts_struct format
646 *
647 * out (out): bts_struct interpretation
648 * raw: raw BTS record
649 */
650static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
651{
652 memset(out, 0, sizeof(*out));
653 if (bts_get(raw, bts_from) == bts_escape) {
654 out->qualifier = bts_get(raw, bts_qual);
655 out->variant.jiffies = bts_get(raw, bts_jiffies);
656 } else {
657 out->qualifier = BTS_BRANCH;
658 out->variant.lbr.from_ip = bts_get(raw, bts_from);
659 out->variant.lbr.to_ip = bts_get(raw, bts_to);
660 }
661}
662
663static int ptrace_bts_read_record(struct task_struct *child, size_t index,
569 struct bts_struct __user *out) 664 struct bts_struct __user *out)
570{ 665{
571 struct bts_struct ret; 666 struct bts_struct ret;
572 int retval; 667 const void *bts_record;
573 int bts_end; 668 size_t bts_index, bts_end;
574 int bts_index; 669 int error;
575 670
576 if (!child->thread.ds_area_msr) 671 error = ds_get_bts_end(child, &bts_end);
577 return -ENXIO; 672 if (error < 0)
673 return error;
578 674
579 if (index < 0)
580 return -EINVAL;
581
582 bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
583 if (bts_end <= index) 675 if (bts_end <= index)
584 return -EINVAL; 676 return -EINVAL;
585 677
678 error = ds_get_bts_index(child, &bts_index);
679 if (error < 0)
680 return error;
681
586 /* translate the ptrace bts index into the ds bts index */ 682 /* translate the ptrace bts index into the ds bts index */
587 bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); 683 bts_index += bts_end - (index + 1);
588 bts_index -= (index + 1); 684 if (bts_end <= bts_index)
589 if (bts_index < 0) 685 bts_index -= bts_end;
590 bts_index += bts_end; 686
687 error = ds_access_bts(child, bts_index, &bts_record);
688 if (error < 0)
689 return error;
591 690
592 retval = ds_read_bts((void *)child->thread.ds_area_msr, 691 ptrace_bts_translate_record(&ret, bts_record);
593 bts_index, &ret);
594 if (retval < 0)
595 return retval;
596 692
597 if (copy_to_user(out, &ret, sizeof(ret))) 693 if (copy_to_user(out, &ret, sizeof(ret)))
598 return -EFAULT; 694 return -EFAULT;
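ptrace_bts_read_record() exposes records newest-first while the DS area fills its ring oldest-first, so the rewritten code translates the ptrace index by walking backwards from the hardware write index and wrapping at bts_end. A worked example of that translation with hypothetical sizes:

	/*
	 * Say the ring holds bts_end = 8 records and the hardware write
	 * index is bts_index = 3, so slot 2 holds the newest record.
	 * Asking ptrace for index 0 (the most recent record) must land
	 * on slot 2:
	 */
	size_t bts_end = 8, bts_index = 3, index = 0;

	bts_index += bts_end - (index + 1);	/* 3 + 8 - 1 = 10 */
	if (bts_end <= bts_index)
		bts_index -= bts_end;		/* wrap around: 10 - 8 = 2 */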
@@ -600,101 +696,106 @@ static int ptrace_bts_read_record(struct task_struct *child,
600 return sizeof(ret); 696 return sizeof(ret);
601} 697}
602 698
603static int ptrace_bts_clear(struct task_struct *child)
604{
605 if (!child->thread.ds_area_msr)
606 return -ENXIO;
607
608 return ds_clear((void *)child->thread.ds_area_msr);
609}
610
611static int ptrace_bts_drain(struct task_struct *child, 699static int ptrace_bts_drain(struct task_struct *child,
612 long size, 700 long size,
613 struct bts_struct __user *out) 701 struct bts_struct __user *out)
614{ 702{
615 int end, i; 703 struct bts_struct ret;
616 void *ds = (void *)child->thread.ds_area_msr; 704 const unsigned char *raw;
617 705 size_t end, i;
618 if (!ds) 706 int error;
619 return -ENXIO;
620 707
621 end = ds_get_bts_index(ds); 708 error = ds_get_bts_index(child, &end);
622 if (end <= 0) 709 if (error < 0)
623 return end; 710 return error;
624 711
625 if (size < (end * sizeof(struct bts_struct))) 712 if (size < (end * sizeof(struct bts_struct)))
626 return -EIO; 713 return -EIO;
627 714
628 for (i = 0; i < end; i++, out++) { 715 error = ds_access_bts(child, 0, (const void **)&raw);
629 struct bts_struct ret; 716 if (error < 0)
630 int retval; 717 return error;
631 718
632 retval = ds_read_bts(ds, i, &ret); 719 for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
633 if (retval < 0) 720 ptrace_bts_translate_record(&ret, raw);
634 return retval;
635 721
636 if (copy_to_user(out, &ret, sizeof(ret))) 722 if (copy_to_user(out, &ret, sizeof(ret)))
637 return -EFAULT; 723 return -EFAULT;
638 } 724 }
639 725
640 ds_clear(ds); 726 error = ds_clear_bts(child);
727 if (error < 0)
728 return error;
641 729
642 return end; 730 return end;
643} 731}
644 732
733static void ptrace_bts_ovfl(struct task_struct *child)
734{
735 send_sig(child->thread.bts_ovfl_signal, child, 0);
736}
737
645static int ptrace_bts_config(struct task_struct *child, 738static int ptrace_bts_config(struct task_struct *child,
646 long cfg_size, 739 long cfg_size,
647 const struct ptrace_bts_config __user *ucfg) 740 const struct ptrace_bts_config __user *ucfg)
648{ 741{
649 struct ptrace_bts_config cfg; 742 struct ptrace_bts_config cfg;
650 int bts_size, ret = 0; 743 int error = 0;
651 void *ds;
652 744
745 error = -EOPNOTSUPP;
746 if (!bts_cfg.sizeof_bts)
747 goto errout;
748
749 error = -EIO;
653 if (cfg_size < sizeof(cfg)) 750 if (cfg_size < sizeof(cfg))
654 return -EIO; 751 goto errout;
655 752
753 error = -EFAULT;
656 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 754 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
657 return -EFAULT; 755 goto errout;
658 756
659 if ((int)cfg.size < 0) 757 error = -EINVAL;
660 return -EINVAL; 758 if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
759 !(cfg.flags & PTRACE_BTS_O_ALLOC))
760 goto errout;
661 761
662 bts_size = 0; 762 if (cfg.flags & PTRACE_BTS_O_ALLOC) {
663 ds = (void *)child->thread.ds_area_msr; 763 ds_ovfl_callback_t ovfl = NULL;
664 if (ds) { 764 unsigned int sig = 0;
665 bts_size = ds_get_bts_size(ds);
666 if (bts_size < 0)
667 return bts_size;
668 }
669 cfg.size = PAGE_ALIGN(cfg.size);
670 765
671 if (bts_size != cfg.size) { 766 /* we ignore the error in case we were not tracing child */
672 ret = ptrace_bts_realloc(child, cfg.size, 767 (void)ds_release_bts(child);
673 cfg.flags & PTRACE_BTS_O_CUT_SIZE); 768
674 if (ret < 0) 769 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
770 if (!cfg.signal)
771 goto errout;
772
773 sig = cfg.signal;
774 ovfl = ptrace_bts_ovfl;
775 }
776
777 error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
778 if (error < 0)
675 goto errout; 779 goto errout;
676 780
677 ds = (void *)child->thread.ds_area_msr; 781 child->thread.bts_ovfl_signal = sig;
678 } 782 }
679 783
680 if (cfg.flags & PTRACE_BTS_O_SIGNAL) 784 error = -EINVAL;
681 ret = ds_set_overflow(ds, DS_O_SIGNAL); 785 if (!child->thread.ds_ctx && cfg.flags)
682 else
683 ret = ds_set_overflow(ds, DS_O_WRAP);
684 if (ret < 0)
685 goto errout; 786 goto errout;
686 787
687 if (cfg.flags & PTRACE_BTS_O_TRACE) 788 if (cfg.flags & PTRACE_BTS_O_TRACE)
688 child->thread.debugctlmsr |= ds_debugctl_mask(); 789 child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
689 else 790 else
690 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 791 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
691 792
692 if (cfg.flags & PTRACE_BTS_O_SCHED) 793 if (cfg.flags & PTRACE_BTS_O_SCHED)
693 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 794 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
694 else 795 else
695 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 796 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
696 797
697 ret = sizeof(cfg); 798 error = sizeof(cfg);
698 799
699out: 800out:
700 if (child->thread.debugctlmsr) 801 if (child->thread.debugctlmsr)
@@ -702,10 +803,10 @@ out:
702 else 803 else
703 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 804 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
704 805
705 return ret; 806 return error;
706 807
707errout: 808errout:
708 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 809 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
709 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 810 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
710 goto out; 811 goto out;
711} 812}
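For orientation, a hypothetical user-space caller of the PTRACE_BTS_CONFIG request handled above could look like the sketch below. The struct mirrors only the fields the handler reads and writes (size, flags, signal, bts_size); the field types, the header that provides the PTRACE_BTS_* constants, and the chosen buffer size are assumptions, not part of this patch.

        #include <string.h>
        #include <sys/ptrace.h>
        #include <sys/types.h>

        struct bts_config_sketch {
                unsigned int size;      /* in: requested BTS buffer size in bytes */
                unsigned int flags;     /* in: PTRACE_BTS_O_* flags */
                unsigned int signal;    /* in: overflow signal (with O_SIGNAL) */
                unsigned int bts_size;  /* out: size of one translated bts_struct */
        };

        /* Ask the kernel to allocate a BTS buffer for a stopped tracee and start
         * branch tracing; the handler above returns sizeof(cfg) on success. */
        static long bts_enable_sketch(pid_t pid)
        {
                struct bts_config_sketch cfg;

                memset(&cfg, 0, sizeof(cfg));
                cfg.size  = 4 * 4096;   /* example buffer size (bytes) */
                cfg.flags = PTRACE_BTS_O_ALLOC | PTRACE_BTS_O_TRACE;

                return ptrace(PTRACE_BTS_CONFIG, pid, &cfg, (void *)sizeof(cfg));
        }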
@@ -714,29 +815,40 @@ static int ptrace_bts_status(struct task_struct *child,
714 long cfg_size, 815 long cfg_size,
715 struct ptrace_bts_config __user *ucfg) 816 struct ptrace_bts_config __user *ucfg)
716{ 817{
717 void *ds = (void *)child->thread.ds_area_msr;
718 struct ptrace_bts_config cfg; 818 struct ptrace_bts_config cfg;
819 size_t end;
820 const void *base, *max;
821 int error;
719 822
720 if (cfg_size < sizeof(cfg)) 823 if (cfg_size < sizeof(cfg))
721 return -EIO; 824 return -EIO;
722 825
723 memset(&cfg, 0, sizeof(cfg)); 826 error = ds_get_bts_end(child, &end);
827 if (error < 0)
828 return error;
829
830 error = ds_access_bts(child, /* index = */ 0, &base);
831 if (error < 0)
832 return error;
724 833
725 if (ds) { 834 error = ds_access_bts(child, /* index = */ end, &max);
726 cfg.size = ds_get_bts_size(ds); 835 if (error < 0)
836 return error;
727 837
728 if (ds_get_overflow(ds) == DS_O_SIGNAL) 838 memset(&cfg, 0, sizeof(cfg));
729 cfg.flags |= PTRACE_BTS_O_SIGNAL; 839 cfg.size = (max - base);
840 cfg.signal = child->thread.bts_ovfl_signal;
841 cfg.bts_size = sizeof(struct bts_struct);
730 842
731 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && 843 if (cfg.signal)
732 child->thread.debugctlmsr & ds_debugctl_mask()) 844 cfg.flags |= PTRACE_BTS_O_SIGNAL;
733 cfg.flags |= PTRACE_BTS_O_TRACE;
734 845
735 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) 846 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
736 cfg.flags |= PTRACE_BTS_O_SCHED; 847 child->thread.debugctlmsr & bts_cfg.debugctl_mask)
737 } 848 cfg.flags |= PTRACE_BTS_O_TRACE;
738 849
739 cfg.bts_size = sizeof(struct bts_struct); 850 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
851 cfg.flags |= PTRACE_BTS_O_SCHED;
740 852
741 if (copy_to_user(ucfg, &cfg, sizeof(cfg))) 853 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
742 return -EFAULT; 854 return -EFAULT;
@@ -744,89 +856,38 @@ static int ptrace_bts_status(struct task_struct *child,
744 return sizeof(cfg); 856 return sizeof(cfg);
745} 857}
746 858
747
748static int ptrace_bts_write_record(struct task_struct *child, 859static int ptrace_bts_write_record(struct task_struct *child,
749 const struct bts_struct *in) 860 const struct bts_struct *in)
750{ 861{
751 int retval; 862 unsigned char bts_record[BTS_MAX_RECORD_SIZE];
752 863
753 if (!child->thread.ds_area_msr) 864 BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
754 return -ENXIO;
755 865
756 retval = ds_write_bts((void *)child->thread.ds_area_msr, in); 866 memset(bts_record, 0, bts_cfg.sizeof_bts);
757 if (retval) 867 switch (in->qualifier) {
758 return retval; 868 case BTS_INVALID:
869 break;
759 870
760 return sizeof(*in); 871 case BTS_BRANCH:
761} 872 bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
873 bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
874 break;
762 875
763static int ptrace_bts_realloc(struct task_struct *child, 876 case BTS_TASK_ARRIVES:
764 int size, int reduce_size) 877 case BTS_TASK_DEPARTS:
765{ 878 bts_set(bts_record, bts_from, bts_escape);
766 unsigned long rlim, vm; 879 bts_set(bts_record, bts_qual, in->qualifier);
767 int ret, old_size; 880 bts_set(bts_record, bts_jiffies, in->variant.jiffies);
881 break;
768 882
769 if (size < 0) 883 default:
770 return -EINVAL; 884 return -EINVAL;
771
772 old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
773 if (old_size < 0)
774 return old_size;
775
776 ret = ds_free((void **)&child->thread.ds_area_msr);
777 if (ret < 0)
778 goto out;
779
780 size >>= PAGE_SHIFT;
781 old_size >>= PAGE_SHIFT;
782
783 current->mm->total_vm -= old_size;
784 current->mm->locked_vm -= old_size;
785
786 if (size == 0)
787 goto out;
788
789 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
790 vm = current->mm->total_vm + size;
791 if (rlim < vm) {
792 ret = -ENOMEM;
793
794 if (!reduce_size)
795 goto out;
796
797 size = rlim - current->mm->total_vm;
798 if (size <= 0)
799 goto out;
800 } 885 }
801 886
802 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 887 /* The writing task will be the switched-to task on a context
803 vm = current->mm->locked_vm + size; 888 * switch. It needs to write into the switched-from task's BTS
804 if (rlim < vm) { 889 * buffer. */
805 ret = -ENOMEM; 890 return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
806
807 if (!reduce_size)
808 goto out;
809
810 size = rlim - current->mm->locked_vm;
811 if (size <= 0)
812 goto out;
813 }
814
815 ret = ds_allocate((void **)&child->thread.ds_area_msr,
816 size << PAGE_SHIFT);
817 if (ret < 0)
818 goto out;
819
820 current->mm->total_vm += size;
821 current->mm->locked_vm += size;
822
823out:
824 if (child->thread.ds_area_msr)
825 set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
826 else
827 clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
828
829 return ret;
830} 891}
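The raw record assembled above is bts_cfg.sizeof_bts bytes wide and, per the configurations added further down, is divided into equally sized slots of bts_cfg.sizeof_field bytes. Assuming bts_set()/bts_get() in ds.c simply store one value per slot (little-endian, as on x86), they reduce to something like the sketch below; the concrete slot indices are inferred from the three-field record layout and are illustrative, not taken from ds.c.

        #include <stddef.h>
        #include <string.h>

        struct bts_cfg_sketch {
                size_t sizeof_bts;      /* whole record, e.g. 3 * 8 bytes on Core2 */
                size_t sizeof_field;    /* one slot, e.g. 8 bytes on Core2 */
        };

        /* Assumed slot positions: from/to for branch records; the synthesized
         * scheduling events reuse the same three-slot record for qual/jiffies. */
        enum { SLOT_FROM = 0, SLOT_TO = 1, SLOT_QUAL = 1, SLOT_JIFFIES = 2 };

        static void bts_set_sketch(const struct bts_cfg_sketch *cfg,
                                   unsigned char *record, int slot, unsigned long val)
        {
                size_t n = cfg->sizeof_field < sizeof(val) ? cfg->sizeof_field
                                                           : sizeof(val);

                memcpy(record + slot * cfg->sizeof_field, &val, n);
        }

        static unsigned long bts_get_sketch(const struct bts_cfg_sketch *cfg,
                                            const unsigned char *record, int slot)
        {
                unsigned long val = 0;
                size_t n = cfg->sizeof_field < sizeof(val) ? cfg->sizeof_field
                                                           : sizeof(val);

                memcpy(&val, record + slot * cfg->sizeof_field, n);
                return val;
        }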
831 892
832void ptrace_bts_take_timestamp(struct task_struct *tsk, 893void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -839,7 +900,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
839 900
840 ptrace_bts_write_record(tsk, &rec); 901 ptrace_bts_write_record(tsk, &rec);
841} 902}
842#endif /* X86_BTS */ 903
904static const struct bts_configuration bts_cfg_netburst = {
905 .sizeof_bts = sizeof(long) * 3,
906 .sizeof_field = sizeof(long),
907 .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
908};
909
910static const struct bts_configuration bts_cfg_pentium_m = {
911 .sizeof_bts = sizeof(long) * 3,
912 .sizeof_field = sizeof(long),
913 .debugctl_mask = (1<<6)|(1<<7)
914};
915
916static const struct bts_configuration bts_cfg_core2 = {
917 .sizeof_bts = 8 * 3,
918 .sizeof_field = 8,
919 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
920};
921
922static inline void bts_configure(const struct bts_configuration *cfg)
923{
924 bts_cfg = *cfg;
925}
926
927void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
928{
929 switch (c->x86) {
930 case 0x6:
931 switch (c->x86_model) {
932 case 0xD:
933 case 0xE: /* Pentium M */
934 bts_configure(&bts_cfg_pentium_m);
935 break;
936 case 0xF: /* Core2 */
937 case 0x1C: /* Atom */
938 bts_configure(&bts_cfg_core2);
939 break;
940 default:
941 /* sorry, don't know about them */
942 break;
943 }
944 break;
945 case 0xF:
946 switch (c->x86_model) {
947 case 0x0:
948 case 0x1:
949 case 0x2: /* Netburst */
950 bts_configure(&bts_cfg_netburst);
951 break;
952 default:
953 /* sorry, don't know about them */
954 break;
955 }
956 break;
957 default:
958 /* sorry, don't know about them */
959 break;
960 }
961}
962#endif /* CONFIG_X86_PTRACE_BTS */
843 963
844/* 964/*
845 * Called by kernel/ptrace.c when detaching.. 965 * Called by kernel/ptrace.c when detaching..
@@ -852,15 +972,15 @@ void ptrace_disable(struct task_struct *child)
852#ifdef TIF_SYSCALL_EMU 972#ifdef TIF_SYSCALL_EMU
853 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 973 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
854#endif 974#endif
855 if (child->thread.ds_area_msr) { 975#ifdef CONFIG_X86_PTRACE_BTS
856#ifdef X86_BTS 976 (void)ds_release_bts(child);
857 ptrace_bts_realloc(child, 0, 0); 977
858#endif 978 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
859 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 979 if (!child->thread.debugctlmsr)
860 if (!child->thread.debugctlmsr) 980 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
861 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 981
862 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 982 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
863 } 983#endif /* CONFIG_X86_PTRACE_BTS */
864} 984}
865 985
866#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 986#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -980,7 +1100,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
980 /* 1100 /*
981 * These bits need more cooking - not enabled yet: 1101 * These bits need more cooking - not enabled yet:
982 */ 1102 */
983#ifdef X86_BTS 1103#ifdef CONFIG_X86_PTRACE_BTS
984 case PTRACE_BTS_CONFIG: 1104 case PTRACE_BTS_CONFIG:
985 ret = ptrace_bts_config 1105 ret = ptrace_bts_config
986 (child, data, (struct ptrace_bts_config __user *)addr); 1106 (child, data, (struct ptrace_bts_config __user *)addr);
@@ -992,7 +1112,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
992 break; 1112 break;
993 1113
994 case PTRACE_BTS_SIZE: 1114 case PTRACE_BTS_SIZE:
995 ret = ptrace_bts_get_size(child); 1115 ret = ds_get_bts_index(child, /* pos = */ NULL);
996 break; 1116 break;
997 1117
998 case PTRACE_BTS_GET: 1118 case PTRACE_BTS_GET:
@@ -1001,14 +1121,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1001 break; 1121 break;
1002 1122
1003 case PTRACE_BTS_CLEAR: 1123 case PTRACE_BTS_CLEAR:
1004 ret = ptrace_bts_clear(child); 1124 ret = ds_clear_bts(child);
1005 break; 1125 break;
1006 1126
1007 case PTRACE_BTS_DRAIN: 1127 case PTRACE_BTS_DRAIN:
1008 ret = ptrace_bts_drain 1128 ret = ptrace_bts_drain
1009 (child, data, (struct bts_struct __user *) addr); 1129 (child, data, (struct bts_struct __user *) addr);
1010 break; 1130 break;
1011#endif 1131#endif /* CONFIG_X86_PTRACE_BTS */
1012 1132
1013 default: 1133 default:
1014 ret = ptrace_request(child, request, addr, data); 1134 ret = ptrace_request(child, request, addr, data);
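To round out the PTRACE_BTS_* requests dispatched above, a hypothetical user-space drain loop could look like the following sketch. It treats each translated record as an opaque blob of the size reported in cfg.bts_size by PTRACE_BTS_STATUS, and it assumes the data argument of PTRACE_BTS_DRAIN carries the destination buffer size in bytes, in line with how the other requests pass their sizes; neither the helper name nor those conventions come from this patch.

        #include <stdint.h>
        #include <stdlib.h>
        #include <sys/ptrace.h>
        #include <sys/types.h>

        /* Drain all buffered BTS records of a stopped tracee into a freshly
         * allocated buffer.  Returns the record count, 0 if empty, -1 on error. */
        static long bts_drain_sketch(pid_t pid, size_t record_size, void **records)
        {
                long count;
                void *buf;

                /* PTRACE_BTS_SIZE reports how many records are currently buffered */
                count = ptrace(PTRACE_BTS_SIZE, pid, NULL, NULL);
                if (count <= 0)
                        return count;

                buf = calloc(count, record_size);
                if (!buf)
                        return -1;

                /* the handler copies the records out and then clears the buffer */
                count = ptrace(PTRACE_BTS_DRAIN, pid, buf,
                               (void *)(uintptr_t)(count * record_size));
                if (count < 0) {
                        free(buf);
                        return -1;
                }

                *records = buf;
                return count;
        }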
@@ -1290,6 +1410,12 @@ static const struct user_regset x86_64_regsets[] = {
1290 .size = sizeof(long), .align = sizeof(long), 1410 .size = sizeof(long), .align = sizeof(long),
1291 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1411 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1292 }, 1412 },
1413 [REGSET_IOPERM64] = {
1414 .core_note_type = NT_386_IOPERM,
1415 .n = IO_BITMAP_LONGS,
1416 .size = sizeof(long), .align = sizeof(long),
1417 .active = ioperm_active, .get = ioperm_get
1418 },
1293}; 1419};
1294 1420
1295static const struct user_regset_view user_x86_64_view = { 1421static const struct user_regset_view user_x86_64_view = {
@@ -1336,6 +1462,12 @@ static const struct user_regset x86_32_regsets[] = {
1336 .active = regset_tls_active, 1462 .active = regset_tls_active,
1337 .get = regset_tls_get, .set = regset_tls_set 1463 .get = regset_tls_get, .set = regset_tls_set
1338 }, 1464 },
1465 [REGSET_IOPERM32] = {
1466 .core_note_type = NT_386_IOPERM,
1467 .n = IO_BITMAP_BYTES / sizeof(u32),
1468 .size = sizeof(u32), .align = sizeof(u32),
1469 .active = ioperm_active, .get = ioperm_get
1470 },
1339}; 1471};
1340 1472
1341static const struct user_regset_view user_x86_32_view = { 1473static const struct user_regset_view user_x86_32_view = {
@@ -1357,9 +1489,8 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1357#endif 1489#endif
1358} 1490}
1359 1491
1360#ifdef CONFIG_X86_32 1492void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1361 1493 int error_code, int si_code)
1362void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1363{ 1494{
1364 struct siginfo info; 1495 struct siginfo info;
1365 1496
@@ -1368,7 +1499,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1368 1499
1369 memset(&info, 0, sizeof(info)); 1500 memset(&info, 0, sizeof(info));
1370 info.si_signo = SIGTRAP; 1501 info.si_signo = SIGTRAP;
1371 info.si_code = TRAP_BRKPT; 1502 info.si_code = si_code;
1372 1503
1373 /* User-mode ip? */ 1504 /* User-mode ip? */
1374 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; 1505 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
@@ -1377,143 +1508,83 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1377 force_sig_info(SIGTRAP, &info, tsk); 1508 force_sig_info(SIGTRAP, &info, tsk);
1378} 1509}
1379 1510
1380/* notification of system call entry/exit
1381 * - triggered by current->work.syscall_trace
1382 */
1383int do_syscall_trace(struct pt_regs *regs, int entryexit)
1384{
1385 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
1386 /*
1387 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
1388 * interception
1389 */
1390 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
1391 int ret = 0;
1392
1393 /* do the secure computing check first */
1394 if (!entryexit)
1395 secure_computing(regs->orig_ax);
1396
1397 if (unlikely(current->audit_context)) {
1398 if (entryexit)
1399 audit_syscall_exit(AUDITSC_RESULT(regs->ax),
1400 regs->ax);
1401 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
1402 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
1403 * not used, entry.S will call us only on syscall exit, not
1404 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
1405 * calling send_sigtrap() on syscall entry.
1406 *
1407 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
1408 * is_singlestep is false, despite his name, so we will still do
1409 * the correct thing.
1410 */
1411 else if (is_singlestep)
1412 goto out;
1413 }
1414
1415 if (!(current->ptrace & PT_PTRACED))
1416 goto out;
1417
1418 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
1419 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
1420 * here. We have to check this and return */
1421 if (is_sysemu && entryexit)
1422 return 0;
1423
1424 /* Fake a debug trap */
1425 if (is_singlestep)
1426 send_sigtrap(current, regs, 0);
1427
1428 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
1429 goto out;
1430
1431 /* the 0x80 provides a way for the tracing parent to distinguish
1432 between a syscall stop and SIGTRAP delivery */
1433 /* Note that the debugger could change the result of test_thread_flag!*/
1434 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
1435 1511
1436 /* 1512#ifdef CONFIG_X86_32
1437 * this isn't the same as continuing with a signal, but it will do 1513# define IS_IA32 1
1438 * for normal use. strace only continues with a signal if the 1514#elif defined CONFIG_IA32_EMULATION
1439 * stopping signal is not SIGTRAP. -brl 1515# define IS_IA32 test_thread_flag(TIF_IA32)
1440 */ 1516#else
1441 if (current->exit_code) { 1517# define IS_IA32 0
1442 send_sig(current->exit_code, current, 1); 1518#endif
1443 current->exit_code = 0;
1444 }
1445 ret = is_sysemu;
1446out:
1447 if (unlikely(current->audit_context) && !entryexit)
1448 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
1449 regs->bx, regs->cx, regs->dx, regs->si);
1450 if (ret == 0)
1451 return 0;
1452
1453 regs->orig_ax = -1; /* force skip of syscall restarting */
1454 if (unlikely(current->audit_context))
1455 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1456 return 1;
1457}
1458
1459#else /* CONFIG_X86_64 */
1460 1519
1461static void syscall_trace(struct pt_regs *regs) 1520/*
1521 * We must return the syscall number to actually look up in the table.
1522 * This can be -1L to skip running any syscall at all.
1523 */
1524asmregparm long syscall_trace_enter(struct pt_regs *regs)
1462{ 1525{
1526 long ret = 0;
1463 1527
1464#if 0
1465 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
1466 current->comm,
1467 regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
1468 current_thread_info()->flags, current->ptrace);
1469#endif
1470
1471 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
1472 ? 0x80 : 0));
1473 /* 1528 /*
1474 * this isn't the same as continuing with a signal, but it will do 1529 * If we stepped into a sysenter/syscall insn, it trapped in
1475 * for normal use. strace only continues with a signal if the 1530 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
1476 * stopping signal is not SIGTRAP. -brl 1531 * If user-mode had set TF itself, then it's still clear from
1532 * do_debug() and we need to set it again to restore the user
1533 * state. If we entered on the slow path, TF was already set.
1477 */ 1534 */
1478 if (current->exit_code) { 1535 if (test_thread_flag(TIF_SINGLESTEP))
1479 send_sig(current->exit_code, current, 1); 1536 regs->flags |= X86_EFLAGS_TF;
1480 current->exit_code = 0;
1481 }
1482}
1483 1537
1484asmlinkage void syscall_trace_enter(struct pt_regs *regs)
1485{
1486 /* do the secure computing check first */ 1538 /* do the secure computing check first */
1487 secure_computing(regs->orig_ax); 1539 secure_computing(regs->orig_ax);
1488 1540
1489 if (test_thread_flag(TIF_SYSCALL_TRACE) 1541 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1490 && (current->ptrace & PT_PTRACED)) 1542 ret = -1L;
1491 syscall_trace(regs); 1543
1544 if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
1545 tracehook_report_syscall_entry(regs))
1546 ret = -1L;
1492 1547
1493 if (unlikely(current->audit_context)) { 1548 if (unlikely(current->audit_context)) {
1494 if (test_thread_flag(TIF_IA32)) { 1549 if (IS_IA32)
1495 audit_syscall_entry(AUDIT_ARCH_I386, 1550 audit_syscall_entry(AUDIT_ARCH_I386,
1496 regs->orig_ax, 1551 regs->orig_ax,
1497 regs->bx, regs->cx, 1552 regs->bx, regs->cx,
1498 regs->dx, regs->si); 1553 regs->dx, regs->si);
1499 } else { 1554#ifdef CONFIG_X86_64
1555 else
1500 audit_syscall_entry(AUDIT_ARCH_X86_64, 1556 audit_syscall_entry(AUDIT_ARCH_X86_64,
1501 regs->orig_ax, 1557 regs->orig_ax,
1502 regs->di, regs->si, 1558 regs->di, regs->si,
1503 regs->dx, regs->r10); 1559 regs->dx, regs->r10);
1504 } 1560#endif
1505 } 1561 }
1562
1563 return ret ?: regs->orig_ax;
1506} 1564}
1507 1565
1508asmlinkage void syscall_trace_leave(struct pt_regs *regs) 1566asmregparm void syscall_trace_leave(struct pt_regs *regs)
1509{ 1567{
1510 if (unlikely(current->audit_context)) 1568 if (unlikely(current->audit_context))
1511 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1569 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1512 1570
1513 if ((test_thread_flag(TIF_SYSCALL_TRACE) 1571 if (test_thread_flag(TIF_SYSCALL_TRACE))
1514 || test_thread_flag(TIF_SINGLESTEP)) 1572 tracehook_report_syscall_exit(regs, 0);
1515 && (current->ptrace & PT_PTRACED))
1516 syscall_trace(regs);
1517}
1518 1573
1519#endif /* CONFIG_X86_32 */ 1574 /*
1575 * If TIF_SYSCALL_EMU is set, we only get here because of
1576 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
1577 * We already reported this syscall instruction in
1578 * syscall_trace_enter(), so don't do any more now.
1579 */
1580 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1581 return;
1582
1583 /*
1584 * If we are single-stepping, synthesize a trap to follow the
1585 * system call instruction.
1586 */
1587 if (test_thread_flag(TIF_SINGLESTEP) &&
1588 tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
1589 send_sigtrap(current, regs, 0, TRAP_BRKPT);
1590}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 05fbe9a0325a..4f9c55f3a7c0 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
97 return dst->version; 97 return dst->version;
98} 98}
99 99
100unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
101{
102 u64 pv_tsc_khz = 1000000ULL << 32;
103
104 do_div(pv_tsc_khz, src->tsc_to_system_mul);
105 if (src->tsc_shift < 0)
106 pv_tsc_khz <<= -src->tsc_shift;
107 else
108 pv_tsc_khz >>= src->tsc_shift;
109 return pv_tsc_khz;
110}
111
100cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 112cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
101{ 113{
102 struct pvclock_shadow_time shadow; 114 struct pvclock_shadow_time shadow;
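The new pvclock_tsc_khz() inverts the pvclock scaling rule: a TSC delta is converted to nanoseconds as ((delta << tsc_shift) * tsc_to_system_mul) >> 32, with a right shift when tsc_shift is negative, so the TSC frequency in kHz comes out as (10^6 << 32) / tsc_to_system_mul, shifted back by tsc_shift, exactly as the function computes. A portable sketch of the forward scaling, shown only to make that inversion concrete (it assumes a compiler with unsigned __int128; the kernel's own scaling uses inline assembly instead):

        #include <stdint.h>

        /* Forward pvclock scaling: TSC delta -> nanoseconds, using the same
         * (tsc_to_system_mul, tsc_shift) pair that pvclock_tsc_khz() inverts. */
        static uint64_t pvclock_scale_delta_sketch(uint64_t delta,
                                                   uint32_t tsc_to_system_mul,
                                                   int8_t tsc_shift)
        {
                if (tsc_shift < 0)
                        delta >>= -tsc_shift;
                else
                        delta <<= tsc_shift;

                /* keep bits [95:32] of the 64x32-bit product */
                return (uint64_t)(((unsigned __int128)delta * tsc_to_system_mul) >> 32);
        }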
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index d13858818100..67465ed89310 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -35,9 +35,6 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
35 if (!(word & (1 << 13))) { 35 if (!(word & (1 << 13))) {
36 dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " 36 dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
37 "disabling irq balancing and affinity\n"); 37 "disabling irq balancing and affinity\n");
38#ifdef CONFIG_IRQBALANCE
39 irqbalance_disable("");
40#endif
41 noirqdebug_setup(""); 38 noirqdebug_setup("");
42#ifdef CONFIG_PROC_FS 39#ifdef CONFIG_PROC_FS
43 no_irq_affinity = 1; 40 no_irq_affinity = 1;
@@ -354,9 +351,27 @@ static void ati_force_hpet_resume(void)
354 printk(KERN_DEBUG "Force enabled HPET at resume\n"); 351 printk(KERN_DEBUG "Force enabled HPET at resume\n");
355} 352}
356 353
354static u32 ati_ixp4x0_rev(struct pci_dev *dev)
355{
356 u32 d;
357 u8 b;
358
359 pci_read_config_byte(dev, 0xac, &b);
360 b &= ~(1<<5);
361 pci_write_config_byte(dev, 0xac, b);
362 pci_read_config_dword(dev, 0x70, &d);
363 d |= 1<<8;
364 pci_write_config_dword(dev, 0x70, d);
365 pci_read_config_dword(dev, 0x8, &d);
366 d &= 0xff;
367 dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d);
368 return d;
369}
370
357static void ati_force_enable_hpet(struct pci_dev *dev) 371static void ati_force_enable_hpet(struct pci_dev *dev)
358{ 372{
359 u32 uninitialized_var(val); 373 u32 d, val;
374 u8 b;
360 375
361 if (hpet_address || force_hpet_address) 376 if (hpet_address || force_hpet_address)
362 return; 377 return;
@@ -366,14 +381,33 @@ static void ati_force_enable_hpet(struct pci_dev *dev)
366 return; 381 return;
367 } 382 }
368 383
384 d = ati_ixp4x0_rev(dev);
385 if (d < 0x82)
386 return;
387
388 /* base address */
369 pci_write_config_dword(dev, 0x14, 0xfed00000); 389 pci_write_config_dword(dev, 0x14, 0xfed00000);
370 pci_read_config_dword(dev, 0x14, &val); 390 pci_read_config_dword(dev, 0x14, &val);
391
392 /* enable interrupt */
393 outb(0x72, 0xcd6); b = inb(0xcd7);
394 b |= 0x1;
395 outb(0x72, 0xcd6); outb(b, 0xcd7);
396 outb(0x72, 0xcd6); b = inb(0xcd7);
397 if (!(b & 0x1))
398 return;
399 pci_read_config_dword(dev, 0x64, &d);
400 d |= (1<<10);
401 pci_write_config_dword(dev, 0x64, d);
402 pci_read_config_dword(dev, 0x64, &d);
403 if (!(d & (1<<10)))
404 return;
405
371 force_hpet_address = val; 406 force_hpet_address = val;
372 force_hpet_resume_type = ATI_FORCE_HPET_RESUME; 407 force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
373 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", 408 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
374 force_hpet_address); 409 force_hpet_address);
375 cached_dev = dev; 410 cached_dev = dev;
376 return;
377} 411}
378DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, 412DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
379 ati_force_enable_hpet); 413 ati_force_enable_hpet);
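The paired 0xcd6/0xcd7 accesses in ati_force_enable_hpet() follow the usual index/data convention for the south bridge power-management registers: write the register index to the index port, then read or write the value through the data port (the quirk uses index 0x72 when enabling the HPET interrupt path). A minimal kernel-style sketch of that access pattern, assuming the same port pair; it is an illustration, not a helper this patch adds:

        #include <linux/types.h>
        #include <asm/io.h>

        #define SB_PM_INDEX     0xcd6   /* index port, as used by the quirk above */
        #define SB_PM_DATA      0xcd7   /* data port */

        static u8 sb_pm_read_sketch(u8 index)
        {
                outb(index, SB_PM_INDEX);
                return inb(SB_PM_DATA);
        }

        static void sb_pm_write_sketch(u8 index, u8 val)
        {
                outb(index, SB_PM_INDEX);
                outb(val, SB_PM_DATA);
        }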
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f8a62160e151..f4c93f1cfc19 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off);
29 29
30static const struct desc_ptr no_idt = {}; 30static const struct desc_ptr no_idt = {};
31static int reboot_mode; 31static int reboot_mode;
32enum reboot_type reboot_type = BOOT_KBD; 32/*
33 * Keyboard reset and triple fault may result in INIT, not RESET, which
34 * doesn't work when we're in vmx root mode. Try ACPI first.
35 */
36enum reboot_type reboot_type = BOOT_ACPI;
33int reboot_force; 37int reboot_force;
34 38
35#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 39#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
@@ -177,6 +181,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
177 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), 181 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
178 }, 182 },
179 }, 183 },
184 { /* Handle problems with rebooting on Dell T5400's */
185 .callback = set_bios_reboot,
186 .ident = "Dell Precision T5400",
187 .matches = {
188 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
189 DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
190 },
191 },
180 { /* Handle problems with rebooting on HP laptops */ 192 { /* Handle problems with rebooting on HP laptops */
181 .callback = set_bios_reboot, 193 .callback = set_bios_reboot,
182 .ident = "HP Compaq Laptop", 194 .ident = "HP Compaq Laptop",
@@ -403,10 +415,9 @@ void native_machine_shutdown(void)
403{ 415{
404 /* Stop the cpus and apics */ 416 /* Stop the cpus and apics */
405#ifdef CONFIG_SMP 417#ifdef CONFIG_SMP
406 int reboot_cpu_id;
407 418
408 /* The boot cpu is always logical cpu 0 */ 419 /* The boot cpu is always logical cpu 0 */
409 reboot_cpu_id = 0; 420 int reboot_cpu_id = 0;
410 421
411#ifdef CONFIG_X86_32 422#ifdef CONFIG_X86_32
412 /* See if there has been given a command line override */ 423 /* See if there has been given a command line override */
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c30fe25d470d..6f50664b2ba5 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -20,11 +20,45 @@
20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) 20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
21#define PAE_PGD_ATTR (_PAGE_PRESENT) 21#define PAE_PGD_ATTR (_PAGE_PRESENT)
22 22
23/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
24 * ~ control_page + PAGE_SIZE are used as data storage and stack for
25 * jumping back
26 */
27#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
28
29/* Minimal CPU state */
30#define ESP DATA(0x0)
31#define CR0 DATA(0x4)
32#define CR3 DATA(0x8)
33#define CR4 DATA(0xc)
34
35/* other data */
36#define CP_VA_CONTROL_PAGE DATA(0x10)
37#define CP_PA_PGD DATA(0x14)
38#define CP_PA_SWAP_PAGE DATA(0x18)
39#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
40
23 .text 41 .text
24 .align PAGE_SIZE 42 .align PAGE_SIZE
25 .globl relocate_kernel 43 .globl relocate_kernel
26relocate_kernel: 44relocate_kernel:
27 movl 8(%esp), %ebp /* list of pages */ 45 /* Save the CPU context, used for jumping back */
46
47 pushl %ebx
48 pushl %esi
49 pushl %edi
50 pushl %ebp
51 pushf
52
53 movl 20+8(%esp), %ebp /* list of pages */
54 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
55 movl %esp, ESP(%edi)
56 movl %cr0, %eax
57 movl %eax, CR0(%edi)
58 movl %cr3, %eax
59 movl %eax, CR3(%edi)
60 movl %cr4, %eax
61 movl %eax, CR4(%edi)
28 62
29#ifdef CONFIG_X86_PAE 63#ifdef CONFIG_X86_PAE
30 /* map the control page at its virtual address */ 64 /* map the control page at its virtual address */
@@ -138,15 +172,25 @@ relocate_kernel:
138 172
139relocate_new_kernel: 173relocate_new_kernel:
140 /* read the arguments and say goodbye to the stack */ 174 /* read the arguments and say goodbye to the stack */
141 movl 4(%esp), %ebx /* page_list */ 175 movl 20+4(%esp), %ebx /* page_list */
142 movl 8(%esp), %ebp /* list of pages */ 176 movl 20+8(%esp), %ebp /* list of pages */
143 movl 12(%esp), %edx /* start address */ 177 movl 20+12(%esp), %edx /* start address */
144 movl 16(%esp), %ecx /* cpu_has_pae */ 178 movl 20+16(%esp), %ecx /* cpu_has_pae */
179 movl 20+20(%esp), %esi /* preserve_context */
145 180
146 /* zero out flags, and disable interrupts */ 181 /* zero out flags, and disable interrupts */
147 pushl $0 182 pushl $0
148 popfl 183 popfl
149 184
185 /* save some information for jumping back */
186 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
187 movl %edi, CP_VA_CONTROL_PAGE(%edi)
188 movl PTR(PA_PGD)(%ebp), %eax
189 movl %eax, CP_PA_PGD(%edi)
190 movl PTR(PA_SWAP_PAGE)(%ebp), %eax
191 movl %eax, CP_PA_SWAP_PAGE(%edi)
192 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
193
150 /* get physical address of control page now */ 194 /* get physical address of control page now */
151 /* this is impossible after page table switch */ 195 /* this is impossible after page table switch */
152 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi 196 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
@@ -197,8 +241,90 @@ identity_mapped:
197 xorl %eax, %eax 241 xorl %eax, %eax
198 movl %eax, %cr3 242 movl %eax, %cr3
199 243
244 movl CP_PA_SWAP_PAGE(%edi), %eax
245 pushl %eax
246 pushl %ebx
247 call swap_pages
248 addl $8, %esp
249
250 /* To be certain of avoiding problems with self-modifying code
251 * I need to execute a serializing instruction here.
252 * So I flush the TLB, it's handy, and not processor dependent.
253 */
254 xorl %eax, %eax
255 movl %eax, %cr3
256
257 /* set all of the registers to known values */
258 /* leave %esp alone */
259
260 testl %esi, %esi
261 jnz 1f
262 xorl %edi, %edi
263 xorl %eax, %eax
264 xorl %ebx, %ebx
265 xorl %ecx, %ecx
266 xorl %edx, %edx
267 xorl %esi, %esi
268 xorl %ebp, %ebp
269 ret
2701:
271 popl %edx
272 movl CP_PA_SWAP_PAGE(%edi), %esp
273 addl $PAGE_SIZE, %esp
2742:
275 call *%edx
276
277 /* get the re-entry point of the peer system */
278 movl 0(%esp), %ebp
279 call 1f
2801:
281 popl %ebx
282 subl $(1b - relocate_kernel), %ebx
283 movl CP_VA_CONTROL_PAGE(%ebx), %edi
284 lea PAGE_SIZE(%ebx), %esp
285 movl CP_PA_SWAP_PAGE(%ebx), %eax
286 movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
287 pushl %eax
288 pushl %edx
289 call swap_pages
290 addl $8, %esp
291 movl CP_PA_PGD(%ebx), %eax
292 movl %eax, %cr3
293 movl %cr0, %eax
294 orl $(1<<31), %eax
295 movl %eax, %cr0
296 lea PAGE_SIZE(%edi), %esp
297 movl %edi, %eax
298 addl $(virtual_mapped - relocate_kernel), %eax
299 pushl %eax
300 ret
301
302virtual_mapped:
303 movl CR4(%edi), %eax
304 movl %eax, %cr4
305 movl CR3(%edi), %eax
306 movl %eax, %cr3
307 movl CR0(%edi), %eax
308 movl %eax, %cr0
309 movl ESP(%edi), %esp
310 movl %ebp, %eax
311
312 popf
313 popl %ebp
314 popl %edi
315 popl %esi
316 popl %ebx
317 ret
318
200 /* Do the copies */ 319 /* Do the copies */
201 movl %ebx, %ecx 320swap_pages:
321 movl 8(%esp), %edx
322 movl 4(%esp), %ecx
323 pushl %ebp
324 pushl %ebx
325 pushl %edi
326 pushl %esi
327 movl %ecx, %ebx
202 jmp 1f 328 jmp 1f
203 329
2040: /* top, read another word from the indirection page */ 3300: /* top, read another word from the indirection page */
@@ -226,27 +352,31 @@ identity_mapped:
226 movl %ecx, %esi /* For every source page do a copy */ 352 movl %ecx, %esi /* For every source page do a copy */
227 andl $0xfffff000, %esi 353 andl $0xfffff000, %esi
228 354
355 movl %edi, %eax
356 movl %esi, %ebp
357
358 movl %edx, %edi
229 movl $1024, %ecx 359 movl $1024, %ecx
230 rep ; movsl 360 rep ; movsl
231 jmp 0b
232
2333:
234 361
235 /* To be certain of avoiding problems with self-modifying code 362 movl %ebp, %edi
236 * I need to execute a serializing instruction here. 363 movl %eax, %esi
237 * So I flush the TLB, it's handy, and not processor dependent. 364 movl $1024, %ecx
238 */ 365 rep ; movsl
239 xorl %eax, %eax
240 movl %eax, %cr3
241 366
242 /* set all of the registers to known values */ 367 movl %eax, %edi
243 /* leave %esp alone */ 368 movl %edx, %esi
369 movl $1024, %ecx
370 rep ; movsl
244 371
245 xorl %eax, %eax 372 lea PAGE_SIZE(%ebp), %esi
246 xorl %ebx, %ebx 373 jmp 0b
247 xorl %ecx, %ecx 3743:
248 xorl %edx, %edx 375 popl %esi
249 xorl %esi, %esi 376 popl %edi
250 xorl %edi, %edi 377 popl %ebx
251 xorl %ebp, %ebp 378 popl %ebp
252 ret 379 ret
380
381 .globl kexec_control_code_size
382.set kexec_control_code_size, . - relocate_kernel
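The new swap_pages routine is the heart of the preserve_context support: for every page named by the indirection list it exchanges the destination page with the source page through the spare swap page, so the displaced contents can be copied back when control returns to the old kernel. Each pass through the three rep;movsl copies corresponds to the following C sketch (PAGE_SIZE bytes per page; illustrative only, not code from the patch):

        #include <string.h>

        #define KEXEC_PAGE_SIZE 4096    /* PAGE_SIZE on 32-bit x86 */

        /* Exchange the contents of dst and src via the spare swap page, in the
         * same order as the rep;movsl sequence in swap_pages above. */
        static void kexec_swap_one_page_sketch(void *dst, void *src, void *swap)
        {
                memcpy(swap, src, KEXEC_PAGE_SIZE);     /* stash the incoming page */
                memcpy(src, dst, KEXEC_PAGE_SIZE);      /* back up what was at dst */
                memcpy(dst, swap, KEXEC_PAGE_SIZE);     /* install the incoming page */
        }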
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 05191bbc68b8..dd6f2b71561b 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -52,7 +52,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
52 52
53 cmos_minutes = CMOS_READ(RTC_MINUTES); 53 cmos_minutes = CMOS_READ(RTC_MINUTES);
54 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) 54 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
55 BCD_TO_BIN(cmos_minutes); 55 cmos_minutes = bcd2bin(cmos_minutes);
56 56
57 /* 57 /*
58 * since we're only adjusting minutes and seconds, 58 * since we're only adjusting minutes and seconds,
@@ -69,8 +69,8 @@ int mach_set_rtc_mmss(unsigned long nowtime)
69 69
70 if (abs(real_minutes - cmos_minutes) < 30) { 70 if (abs(real_minutes - cmos_minutes) < 30) {
71 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { 71 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
72 BIN_TO_BCD(real_seconds); 72 real_seconds = bin2bcd(real_seconds);
73 BIN_TO_BCD(real_minutes); 73 real_minutes = bin2bcd(real_minutes);
74 } 74 }
75 CMOS_WRITE(real_seconds,RTC_SECONDS); 75 CMOS_WRITE(real_seconds,RTC_SECONDS);
76 CMOS_WRITE(real_minutes,RTC_MINUTES); 76 CMOS_WRITE(real_minutes,RTC_MINUTES);
@@ -124,16 +124,16 @@ unsigned long mach_get_cmos_time(void)
124 WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); 124 WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
125 125
126 if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { 126 if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
127 BCD_TO_BIN(sec); 127 sec = bcd2bin(sec);
128 BCD_TO_BIN(min); 128 min = bcd2bin(min);
129 BCD_TO_BIN(hour); 129 hour = bcd2bin(hour);
130 BCD_TO_BIN(day); 130 day = bcd2bin(day);
131 BCD_TO_BIN(mon); 131 mon = bcd2bin(mon);
132 BCD_TO_BIN(year); 132 year = bcd2bin(year);
133 } 133 }
134 134
135 if (century) { 135 if (century) {
136 BCD_TO_BIN(century); 136 century = bcd2bin(century);
137 year += century * 100; 137 year += century * 100;
138 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); 138 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
139 } else 139 } else
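The BCD_TO_BIN()/BIN_TO_BCD() macros, which modified their argument in place, are replaced here by the value-returning bcd2bin()/bin2bcd() helpers. Packed BCD keeps one decimal digit per nibble, so the conversions reduce to the following sketch (not the kernel's <linux/bcd.h> implementation):

        /* Packed-BCD conversion, two decimal digits per byte. */
        static unsigned int bcd2bin_sketch(unsigned char val)
        {
                return (val & 0x0f) + (val >> 4) * 10;
        }

        static unsigned char bin2bcd_sketch(unsigned int val)
        {
                return (unsigned char)(((val / 10) << 4) | (val % 10));
        }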
@@ -223,11 +223,25 @@ static struct platform_device rtc_device = {
223static __init int add_rtc_cmos(void) 223static __init int add_rtc_cmos(void)
224{ 224{
225#ifdef CONFIG_PNP 225#ifdef CONFIG_PNP
226 if (!pnp_platform_devices) 226 static const char *ids[] __initconst =
227 platform_device_register(&rtc_device); 227 { "PNP0b00", "PNP0b01", "PNP0b02", };
228#else 228 struct pnp_dev *dev;
229 struct pnp_id *id;
230 int i;
231
232 pnp_for_each_dev(dev) {
233 for (id = dev->id; id; id = id->next) {
234 for (i = 0; i < ARRAY_SIZE(ids); i++) {
235 if (compare_pnp_id(id, ids[i]) != 0)
236 return 0;
237 }
238 }
239 }
240#endif
241
229 platform_device_register(&rtc_device); 242 platform_device_register(&rtc_device);
230#endif /* CONFIG_PNP */ 243 dev_info(&rtc_device.dev,
244 "registered platform RTC device (no PNP device found)\n");
231 return 0; 245 return 0;
232} 246}
233device_initcall(add_rtc_cmos); 247device_initcall(add_rtc_cmos);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 531b55b8e81a..0fa6790c1dd3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -57,12 +57,8 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/user.h> 58#include <linux/user.h>
59#include <linux/delay.h> 59#include <linux/delay.h>
60#include <linux/highmem.h>
61 60
62#include <linux/kallsyms.h> 61#include <linux/kallsyms.h>
63#include <linux/edd.h>
64#include <linux/iscsi_ibft.h>
65#include <linux/kexec.h>
66#include <linux/cpufreq.h> 62#include <linux/cpufreq.h>
67#include <linux/dma-mapping.h> 63#include <linux/dma-mapping.h>
68#include <linux/ctype.h> 64#include <linux/ctype.h>
@@ -96,7 +92,7 @@
96#include <asm/smp.h> 92#include <asm/smp.h>
97#include <asm/desc.h> 93#include <asm/desc.h>
98#include <asm/dma.h> 94#include <asm/dma.h>
99#include <asm/gart.h> 95#include <asm/iommu.h>
100#include <asm/mmu_context.h> 96#include <asm/mmu_context.h>
101#include <asm/proto.h> 97#include <asm/proto.h>
102 98
@@ -104,7 +100,6 @@
104#include <asm/paravirt.h> 100#include <asm/paravirt.h>
105 101
106#include <asm/percpu.h> 102#include <asm/percpu.h>
107#include <asm/sections.h>
108#include <asm/topology.h> 103#include <asm/topology.h>
109#include <asm/apicdef.h> 104#include <asm/apicdef.h>
110#ifdef CONFIG_X86_64 105#ifdef CONFIG_X86_64
@@ -228,6 +223,9 @@ unsigned long saved_video_mode;
228#define RAMDISK_LOAD_FLAG 0x4000 223#define RAMDISK_LOAD_FLAG 0x4000
229 224
230static char __initdata command_line[COMMAND_LINE_SIZE]; 225static char __initdata command_line[COMMAND_LINE_SIZE];
226#ifdef CONFIG_CMDLINE_BOOL
227static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
228#endif
231 229
232#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 230#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
233struct edd edd; 231struct edd edd;
@@ -304,7 +302,7 @@ static void __init relocate_initrd(void)
304 if (clen > MAX_MAP_CHUNK-slop) 302 if (clen > MAX_MAP_CHUNK-slop)
305 clen = MAX_MAP_CHUNK-slop; 303 clen = MAX_MAP_CHUNK-slop;
306 mapaddr = ramdisk_image & PAGE_MASK; 304 mapaddr = ramdisk_image & PAGE_MASK;
307 p = early_ioremap(mapaddr, clen+slop); 305 p = early_memremap(mapaddr, clen+slop);
308 memcpy(q, p+slop, clen); 306 memcpy(q, p+slop, clen);
309 early_iounmap(p, clen+slop); 307 early_iounmap(p, clen+slop);
310 q += clen; 308 q += clen;
@@ -381,7 +379,7 @@ static void __init parse_setup_data(void)
381 return; 379 return;
382 pa_data = boot_params.hdr.setup_data; 380 pa_data = boot_params.hdr.setup_data;
383 while (pa_data) { 381 while (pa_data) {
384 data = early_ioremap(pa_data, PAGE_SIZE); 382 data = early_memremap(pa_data, PAGE_SIZE);
385 switch (data->type) { 383 switch (data->type) {
386 case SETUP_E820_EXT: 384 case SETUP_E820_EXT:
387 parse_e820_ext(data, pa_data); 385 parse_e820_ext(data, pa_data);
@@ -404,7 +402,7 @@ static void __init e820_reserve_setup_data(void)
404 return; 402 return;
405 pa_data = boot_params.hdr.setup_data; 403 pa_data = boot_params.hdr.setup_data;
406 while (pa_data) { 404 while (pa_data) {
407 data = early_ioremap(pa_data, sizeof(*data)); 405 data = early_memremap(pa_data, sizeof(*data));
408 e820_update_range(pa_data, sizeof(*data)+data->len, 406 e820_update_range(pa_data, sizeof(*data)+data->len,
409 E820_RAM, E820_RESERVED_KERN); 407 E820_RAM, E820_RESERVED_KERN);
410 found = 1; 408 found = 1;
@@ -430,7 +428,7 @@ static void __init reserve_early_setup_data(void)
430 return; 428 return;
431 pa_data = boot_params.hdr.setup_data; 429 pa_data = boot_params.hdr.setup_data;
432 while (pa_data) { 430 while (pa_data) {
433 data = early_ioremap(pa_data, sizeof(*data)); 431 data = early_memremap(pa_data, sizeof(*data));
434 sprintf(buf, "setup data %x", data->type); 432 sprintf(buf, "setup data %x", data->type);
435 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); 433 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
436 pa_data = data->next; 434 pa_data = data->next;
@@ -450,7 +448,7 @@ static void __init reserve_early_setup_data(void)
450 * @size: Size of the crashkernel memory to reserve. 448 * @size: Size of the crashkernel memory to reserve.
451 * Returns the base address on success, and -1ULL on failure. 449 * Returns the base address on success, and -1ULL on failure.
452 */ 450 */
453unsigned long long find_and_reserve_crashkernel(unsigned long long size) 451unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
454{ 452{
455 const unsigned long long alignment = 16<<20; /* 16M */ 453 const unsigned long long alignment = 16<<20; /* 16M */
456 unsigned long long start = 0LL; 454 unsigned long long start = 0LL;
@@ -563,7 +561,13 @@ static void __init reserve_standard_io_resources(void)
563 561
564} 562}
565 563
566#ifdef CONFIG_PROC_VMCORE 564/*
565 * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
566 * is_kdump_kernel() to determine if we are booting after a panic. Hence
567 * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
568 */
569
570#ifdef CONFIG_CRASH_DUMP
567/* elfcorehdr= specifies the location of elf core header 571/* elfcorehdr= specifies the location of elf core header
568 * stored by the crashed kernel. This option will be passed 572 * stored by the crashed kernel. This option will be passed
569 * by kexec loader to the capture kernel. 573 * by kexec loader to the capture kernel.
@@ -579,6 +583,194 @@ static int __init setup_elfcorehdr(char *arg)
579early_param("elfcorehdr", setup_elfcorehdr); 583early_param("elfcorehdr", setup_elfcorehdr);
580#endif 584#endif
581 585
586static struct x86_quirks default_x86_quirks __initdata;
587
588struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
589
590/*
591 * Some BIOSes seem to corrupt the low 64k of memory during events
592 * like suspend/resume and unplugging an HDMI cable. Reserve all
593 * remaining free memory in that area and fill it with a distinct
594 * pattern.
595 */
596#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
597#define MAX_SCAN_AREAS 8
598
599static int __read_mostly memory_corruption_check = -1;
600
601static unsigned __read_mostly corruption_check_size = 64*1024;
602static unsigned __read_mostly corruption_check_period = 60; /* seconds */
603
604static struct e820entry scan_areas[MAX_SCAN_AREAS];
605static int num_scan_areas;
606
607
608static int set_corruption_check(char *arg)
609{
610 char *end;
611
612 memory_corruption_check = simple_strtol(arg, &end, 10);
613
614 return (*end == 0) ? 0 : -EINVAL;
615}
616early_param("memory_corruption_check", set_corruption_check);
617
618static int set_corruption_check_period(char *arg)
619{
620 char *end;
621
622 corruption_check_period = simple_strtoul(arg, &end, 10);
623
624 return (*end == 0) ? 0 : -EINVAL;
625}
626early_param("memory_corruption_check_period", set_corruption_check_period);
627
628static int set_corruption_check_size(char *arg)
629{
630 char *end;
631 unsigned size;
632
633 size = memparse(arg, &end);
634
635 if (*end == '\0')
636 corruption_check_size = size;
637
638 return (size == corruption_check_size) ? 0 : -EINVAL;
639}
640early_param("memory_corruption_check_size", set_corruption_check_size);
641
642
643static void __init setup_bios_corruption_check(void)
644{
645 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
646
647 if (memory_corruption_check == -1) {
648 memory_corruption_check =
649#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
650 1
651#else
652 0
653#endif
654 ;
655 }
656
657 if (corruption_check_size == 0)
658 memory_corruption_check = 0;
659
660 if (!memory_corruption_check)
661 return;
662
663 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
664
665 while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
666 u64 size;
667 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
668
669 if (addr == 0)
670 break;
671
672 if ((addr + size) > corruption_check_size)
673 size = corruption_check_size - addr;
674
675 if (size == 0)
676 break;
677
678 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
679 scan_areas[num_scan_areas].addr = addr;
680 scan_areas[num_scan_areas].size = size;
681 num_scan_areas++;
682
683 /* Assume we've already mapped this early memory */
684 memset(__va(addr), 0, size);
685
686 addr += size;
687 }
688
689 printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
690 num_scan_areas);
691 update_e820();
692}
693
694static struct timer_list periodic_check_timer;
695
696void check_for_bios_corruption(void)
697{
698 int i;
699 int corruption = 0;
700
701 if (!memory_corruption_check)
702 return;
703
704 for(i = 0; i < num_scan_areas; i++) {
705 unsigned long *addr = __va(scan_areas[i].addr);
706 unsigned long size = scan_areas[i].size;
707
708 for(; size; addr++, size -= sizeof(unsigned long)) {
709 if (!*addr)
710 continue;
711 printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
712 addr, __pa(addr), *addr);
713 corruption = 1;
714 *addr = 0;
715 }
716 }
717
718 WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
719}
720
721static void periodic_check_for_corruption(unsigned long data)
722{
723 check_for_bios_corruption();
724 mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
725}
726
727void start_periodic_check_for_corruption(void)
728{
729 if (!memory_corruption_check || corruption_check_period == 0)
730 return;
731
732 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
733 corruption_check_period);
734
735 init_timer(&periodic_check_timer);
736 periodic_check_timer.function = &periodic_check_for_corruption;
737 periodic_check_for_corruption(0);
738}
739#endif
740
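Since the three knobs above are ordinary early_param() hooks, and the size handler goes through memparse(), which accepts the usual K/M/G suffixes, they can be combined on the kernel command line. A hypothetical example boot line enabling the scanner (with CONFIG_X86_CHECK_BIOS_CORRUPTION built in), covering the first 64 KiB and rescanning every 60 seconds:

        memory_corruption_check=1 memory_corruption_check_size=64K memory_corruption_check_period=60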
741static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
742{
743 printk(KERN_NOTICE
744 "%s detected: BIOS may corrupt low RAM, working it around.\n",
745 d->ident);
746
747 e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
748 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
749
750 return 0;
751}
752
753/* List of systems that have known low memory corruption BIOS problems */
754static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
755#ifdef CONFIG_X86_RESERVE_LOW_64K
756 {
757 .callback = dmi_low_memory_corruption,
758 .ident = "AMI BIOS",
759 .matches = {
760 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
761 },
762 },
763 {
764 .callback = dmi_low_memory_corruption,
765 .ident = "Phoenix BIOS",
766 .matches = {
767 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
768 },
769 },
770#endif
771 {}
772};
773
582/* 774/*
583 * Determine if we were loaded by an EFI loader. If so, then we have also been 775 * Determine if we were loaded by an EFI loader. If so, then we have also been
584 * passed the efi memmap, systab, etc., so we should use these data structures 776 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -598,11 +790,11 @@ void __init setup_arch(char **cmdline_p)
598 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 790 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
599 visws_early_detect(); 791 visws_early_detect();
600 pre_setup_arch_hook(); 792 pre_setup_arch_hook();
601 early_cpu_init();
602#else 793#else
603 printk(KERN_INFO "Command line: %s\n", boot_command_line); 794 printk(KERN_INFO "Command line: %s\n", boot_command_line);
604#endif 795#endif
605 796
797 early_cpu_init();
606 early_ioremap_init(); 798 early_ioremap_init();
607 799
608 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); 800 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
@@ -666,14 +858,36 @@ void __init setup_arch(char **cmdline_p)
666 bss_resource.start = virt_to_phys(&__bss_start); 858 bss_resource.start = virt_to_phys(&__bss_start);
667 bss_resource.end = virt_to_phys(&__bss_stop)-1; 859 bss_resource.end = virt_to_phys(&__bss_stop)-1;
668 860
669#ifdef CONFIG_X86_64 861#ifdef CONFIG_CMDLINE_BOOL
670 early_cpu_init(); 862#ifdef CONFIG_CMDLINE_OVERRIDE
863 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
864#else
865 if (builtin_cmdline[0]) {
866 /* append boot loader cmdline to builtin */
867 strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
868 strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
869 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
870 }
671#endif 871#endif
872#endif
873
672 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 874 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
673 *cmdline_p = command_line; 875 *cmdline_p = command_line;
674 876
675 parse_early_param(); 877 parse_early_param();
676 878
879#ifdef CONFIG_X86_64
880 check_efer();
881#endif
882
883#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
884 /*
885 * Must be before kernel pagetables are setup
886 * or fixmap area is touched.
887 */
888 vmi_init();
889#endif
890
677 /* after early param, so could get panic from serial */ 891 /* after early param, so could get panic from serial */
678 reserve_early_setup_data(); 892 reserve_early_setup_data();
679 893
@@ -681,7 +895,7 @@ void __init setup_arch(char **cmdline_p)
681#ifdef CONFIG_X86_LOCAL_APIC 895#ifdef CONFIG_X86_LOCAL_APIC
682 disable_apic = 1; 896 disable_apic = 1;
683#endif 897#endif
684 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 898 setup_clear_cpu_cap(X86_FEATURE_APIC);
685 } 899 }
686 900
687#ifdef CONFIG_PCI 901#ifdef CONFIG_PCI
@@ -691,6 +905,10 @@ void __init setup_arch(char **cmdline_p)
691 905
692 finish_e820_parsing(); 906 finish_e820_parsing();
693 907
908 dmi_scan_machine();
909
910 dmi_check_system(bad_bios_dmi_table);
911
694#ifdef CONFIG_X86_32 912#ifdef CONFIG_X86_32
695 probe_roms(); 913 probe_roms();
696#endif 914#endif
@@ -734,7 +952,8 @@ void __init setup_arch(char **cmdline_p)
734#else 952#else
735 num_physpages = max_pfn; 953 num_physpages = max_pfn;
736 954
737 check_efer(); 955 if (cpu_has_x2apic)
956 check_x2apic();
738 957
739 /* How many end-of-memory variables you have, grandma! */ 958 /* How many end-of-memory variables you have, grandma! */
740 /* need this before calling reserve_initrd */ 959 /* need this before calling reserve_initrd */
@@ -746,6 +965,10 @@ void __init setup_arch(char **cmdline_p)
746 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 965 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
747#endif 966#endif
748 967
968#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
969 setup_bios_corruption_check();
970#endif
971
749 /* max_pfn_mapped is updated here */ 972 /* max_pfn_mapped is updated here */
750 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); 973 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
751 max_pfn_mapped = max_low_pfn_mapped; 974 max_pfn_mapped = max_low_pfn_mapped;
@@ -774,8 +997,6 @@ void __init setup_arch(char **cmdline_p)
774 vsmp_init(); 997 vsmp_init();
775#endif 998#endif
776 999
777 dmi_scan_machine();
778
779 io_delay_init(); 1000 io_delay_init();
780 1001
781 /* 1002 /*
@@ -783,6 +1004,8 @@ void __init setup_arch(char **cmdline_p)
783 */ 1004 */
784 acpi_boot_table_init(); 1005 acpi_boot_table_init();
785 1006
1007 early_acpi_boot_init();
1008
786#ifdef CONFIG_ACPI_NUMA 1009#ifdef CONFIG_ACPI_NUMA
787 /* 1010 /*
788 * Parse SRAT to discover nodes. 1011 * Parse SRAT to discover nodes.
@@ -792,10 +1015,6 @@ void __init setup_arch(char **cmdline_p)
792 1015
793 initmem_init(0, max_pfn); 1016 initmem_init(0, max_pfn);
794 1017
795#ifdef CONFIG_X86_64
796 dma32_reserve_bootmem();
797#endif
798
799#ifdef CONFIG_ACPI_SLEEP 1018#ifdef CONFIG_ACPI_SLEEP
800 /* 1019 /*
801 * Reserve low memory region for sleep support. 1020 * Reserve low memory region for sleep support.
@@ -810,21 +1029,25 @@ void __init setup_arch(char **cmdline_p)
810#endif 1029#endif
811 reserve_crashkernel(); 1030 reserve_crashkernel();
812 1031
1032#ifdef CONFIG_X86_64
1033 /*
1034 * dma32_reserve_bootmem() allocates bootmem which may conflict
1035 * with the crashkernel command line, so do that after
1036 * reserve_crashkernel()
1037 */
1038 dma32_reserve_bootmem();
1039#endif
1040
813 reserve_ibft_region(); 1041 reserve_ibft_region();
814 1042
815#ifdef CONFIG_KVM_CLOCK 1043#ifdef CONFIG_KVM_CLOCK
816 kvmclock_init(); 1044 kvmclock_init();
817#endif 1045#endif
818 1046
819#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) 1047 paravirt_pagetable_setup_start(swapper_pg_dir);
820 /*
821 * Must be after max_low_pfn is determined, and before kernel
822 * pagetables are setup.
823 */
824 vmi_init();
825#endif
826
827 paging_init(); 1048 paging_init();
1049 paravirt_pagetable_setup_done(swapper_pg_dir);
1050 paravirt_post_allocator_init();
828 1051
829#ifdef CONFIG_X86_64 1052#ifdef CONFIG_X86_64
830 map_vsyscall(); 1053 map_vsyscall();
@@ -850,27 +1073,17 @@ void __init setup_arch(char **cmdline_p)
850#endif 1073#endif
851 1074
852 prefill_possible_map(); 1075 prefill_possible_map();
1076
853#ifdef CONFIG_X86_64 1077#ifdef CONFIG_X86_64
854 init_cpu_to_node(); 1078 init_cpu_to_node();
855#endif 1079#endif
856 1080
857#ifdef CONFIG_X86_NUMAQ
858 /*
859 * need to check online nodes num, call it
860 * here before time_init/tsc_init
861 */
862 numaq_tsc_disable();
863#endif
864
865 init_apic_mappings(); 1081 init_apic_mappings();
866 ioapic_init_mappings(); 1082 ioapic_init_mappings();
867 1083
868#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32) 1084 /* need to wait until the io_apic is mapped */
869 if (def_to_bigsmp) 1085 nr_irqs = probe_nr_irqs();
870 printk(KERN_WARNING "More than 8 CPUs detected and " 1086
871 "CONFIG_X86_PC cannot handle it.\nUse "
872 "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
873#endif
874 kvm_guest_init(); 1087 kvm_guest_init();
875 1088
876 e820_reserve_resources(); 1089 e820_reserve_resources();
@@ -892,3 +1105,5 @@ void __init setup_arch(char **cmdline_p)
892#endif 1105#endif
893#endif 1106#endif
894} 1107}
1108
1109
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index cac68430d31f..ae0c0d3bb770 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -80,24 +80,6 @@ static void __init setup_per_cpu_maps(void)
80#endif 80#endif
81} 81}
82 82
83#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
84cpumask_t *cpumask_of_cpu_map __read_mostly;
85EXPORT_SYMBOL(cpumask_of_cpu_map);
86
87/* requires nr_cpu_ids to be initialized */
88static void __init setup_cpumask_of_cpu(void)
89{
90 int i;
91
92 /* alloc_bootmem zeroes memory */
93 cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
94 for (i = 0; i < nr_cpu_ids; i++)
95 cpu_set(i, cpumask_of_cpu_map[i]);
96}
97#else
98static inline void setup_cpumask_of_cpu(void) { }
99#endif
100
101#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
102/* 84/*
103 * Great future not-so-futuristic plan: make i386 and x86_64 do it 85 * Great future not-so-futuristic plan: make i386 and x86_64 do it
@@ -158,35 +140,47 @@ static void __init setup_cpu_pda_map(void)
158 */ 140 */
159void __init setup_per_cpu_areas(void) 141void __init setup_per_cpu_areas(void)
160{ 142{
161 ssize_t size = PERCPU_ENOUGH_ROOM; 143 ssize_t size, old_size;
162 char *ptr; 144 char *ptr;
163 int cpu; 145 int cpu;
146 unsigned long align = 1;
164 147
165 /* Setup cpu_pda map */ 148 /* Setup cpu_pda map */
166 setup_cpu_pda_map(); 149 setup_cpu_pda_map();
167 150
168 /* Copy section for each CPU (we discard the original) */ 151 /* Copy section for each CPU (we discard the original) */
169 size = PERCPU_ENOUGH_ROOM; 152 old_size = PERCPU_ENOUGH_ROOM;
153 align = max_t(unsigned long, PAGE_SIZE, align);
154 size = roundup(old_size, align);
170 printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", 155 printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
171 size); 156 size);
172 157
173 for_each_possible_cpu(cpu) { 158 for_each_possible_cpu(cpu) {
174#ifndef CONFIG_NEED_MULTIPLE_NODES 159#ifndef CONFIG_NEED_MULTIPLE_NODES
175 ptr = alloc_bootmem_pages(size); 160 ptr = __alloc_bootmem(size, align,
161 __pa(MAX_DMA_ADDRESS));
176#else 162#else
177 int node = early_cpu_to_node(cpu); 163 int node = early_cpu_to_node(cpu);
178 if (!node_online(node) || !NODE_DATA(node)) { 164 if (!node_online(node) || !NODE_DATA(node)) {
179 ptr = alloc_bootmem_pages(size); 165 ptr = __alloc_bootmem(size, align,
166 __pa(MAX_DMA_ADDRESS));
180 printk(KERN_INFO 167 printk(KERN_INFO
181 "cpu %d has no node %d or node-local memory\n", 168 "cpu %d has no node %d or node-local memory\n",
182 cpu, node); 169 cpu, node);
170 if (ptr)
171 printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
172 cpu, __pa(ptr));
173 }
174 else {
175 ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
176 __pa(MAX_DMA_ADDRESS));
177 if (ptr)
178 printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
179 cpu, node, __pa(ptr));
183 } 180 }
184 else
185 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
186#endif 181#endif
187 per_cpu_offset(cpu) = ptr - __per_cpu_start; 182 per_cpu_offset(cpu) = ptr - __per_cpu_start;
188 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 183 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
189
190 } 184 }
191 185
192 printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", 186 printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
@@ -197,9 +191,6 @@ void __init setup_per_cpu_areas(void)
197 191
198 /* Setup node to cpumask map */ 192 /* Setup node to cpumask map */
199 setup_node_to_cpumask_map(); 193 setup_node_to_cpumask_map();
200
201 /* Setup cpumask_of_cpu map */
202 setup_cpumask_of_cpu();
203} 194}
204 195
205#endif 196#endif
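setup_per_cpu_areas() now rounds the per-cpu image up to at least page alignment and places each CPU's copy with __alloc_bootmem()/__alloc_bootmem_node(), recording per_cpu_offset(cpu) = ptr - __per_cpu_start. That offset is all a per-CPU access needs: a variable's per-CPU instance lives at its link-time address plus the owning CPU's offset. A minimal sketch of that arithmetic, purely illustrative (the real accessors are the per_cpu() family of macros):

        #include <stddef.h>

        /* Address of CPU 'cpu's instance of a per-cpu variable, given the
         * offsets recorded by setup_per_cpu_areas() above. */
        static void *per_cpu_addr_sketch(void *link_time_addr,
                                         const unsigned long *per_cpu_offset, int cpu)
        {
                return (char *)link_time_addr + per_cpu_offset[cpu];
        }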
@@ -227,8 +218,8 @@ static void __init setup_node_to_cpumask_map(void)
227 /* allocate the map */ 218 /* allocate the map */
228 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); 219 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
229 220
230 Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", 221 pr_debug("Node to cpumask map at %p for %d nodes\n",
231 map, nr_node_ids); 222 map, nr_node_ids);
232 223
233 /* node_to_cpumask() will now work */ 224 /* node_to_cpumask() will now work */
234 node_to_cpumask_map = map; 225 node_to_cpumask_map = map;
@@ -248,7 +239,7 @@ void __cpuinit numa_set_node(int cpu, int node)
248 per_cpu(x86_cpu_to_node_map, cpu) = node; 239 per_cpu(x86_cpu_to_node_map, cpu) = node;
249 240
250 else 241 else
251 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); 242 pr_debug("Setting node for non-present cpu %d\n", cpu);
252} 243}
253 244
254void __cpuinit numa_clear_node(int cpu) 245void __cpuinit numa_clear_node(int cpu)
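The setup_per_cpu_areas() rework above rounds the static per-cpu image up to at least page alignment and then, for each possible CPU, allocates a copy, preferring node-local memory on NUMA builds and falling back to a plain bootmem allocation otherwise. Below is a rough user-space sketch of that shape; aligned_alloc(), memcpy() and the made-up node_local_possible() helper stand in for the bootmem allocators and the NUMA checks.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS    4
#define PAGE_SIZE  4096UL

/* Round "size" up to a multiple of "align" (align must be a power of two). */
static unsigned long roundup_to(unsigned long size, unsigned long align)
{
	return (size + align - 1) & ~(align - 1);
}

/* Stand-in for the node_online()/NODE_DATA() checks: pretend odd CPUs
 * have no node-local memory. */
static int node_local_possible(int cpu)
{
	return (cpu % 2) == 0;
}

int main(void)
{
	static const char percpu_image[] = "per-cpu template data";
	unsigned long old_size = sizeof(percpu_image);
	unsigned long align = PAGE_SIZE;
	unsigned long size = roundup_to(old_size, align);
	void *area[NR_CPUS];

	printf("PERCPU: allocating %lu bytes (rounded from %lu) per cpu\n",
	       size, old_size);

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		/* Prefer "node local" memory, otherwise note the fallback. */
		if (!node_local_possible(cpu))
			printf("cpu %d has no node-local memory\n", cpu);
		area[cpu] = aligned_alloc(align, size);
		if (!area[cpu])
			return 1;
		/* Each CPU gets its own copy of the original section. */
		memcpy(area[cpu], percpu_image, old_size);
		printf("per cpu data for cpu%d at %p\n", cpu, area[cpu]);
	}

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		free(area[cpu]);
	return 0;
}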
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
index 72bbb519d2dc..cc673aa55ce4 100644
--- a/arch/x86/kernel/sigframe.h
+++ b/arch/x86/kernel/sigframe.h
@@ -3,9 +3,18 @@ struct sigframe {
3 char __user *pretcode; 3 char __user *pretcode;
4 int sig; 4 int sig;
5 struct sigcontext sc; 5 struct sigcontext sc;
6 struct _fpstate fpstate; 6 /*
 7 * fpstate is unused. fpstate is moved/allocated after
 8 * retcode[] below. This move lets the FP state and the
 9 * future state extensions (xsave) stay together.
10 * At the same time, retaining the unused fpstate avoids changing
11 * the offset of extramask[] in the sigframe and thus prevents
12 * any legacy application from accessing/modifying it.
13 */
14 struct _fpstate fpstate_unused;
7 unsigned long extramask[_NSIG_WORDS-1]; 15 unsigned long extramask[_NSIG_WORDS-1];
8 char retcode[8]; 16 char retcode[8];
17 /* fp state follows here */
9}; 18};
10 19
11struct rt_sigframe { 20struct rt_sigframe {
@@ -15,13 +24,19 @@ struct rt_sigframe {
15 void __user *puc; 24 void __user *puc;
16 struct siginfo info; 25 struct siginfo info;
17 struct ucontext uc; 26 struct ucontext uc;
18 struct _fpstate fpstate;
19 char retcode[8]; 27 char retcode[8];
28 /* fp state follows here */
20}; 29};
21#else 30#else
22struct rt_sigframe { 31struct rt_sigframe {
23 char __user *pretcode; 32 char __user *pretcode;
24 struct ucontext uc; 33 struct ucontext uc;
25 struct siginfo info; 34 struct siginfo info;
35 /* fp state follows here */
26}; 36};
37
38int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
39 sigset_t *set, struct pt_regs *regs);
40int ia32_setup_frame(int sig, struct k_sigaction *ka,
41 sigset_t *set, struct pt_regs *regs);
27#endif 42#endif
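The point of keeping the dead fpstate_unused member is that every field up to and including extramask[] stays at its old offset while the live FP/xsave image moves past retcode[]. A small sketch with invented sizes, only to show that the extramask[] offset is the same in both layouts:

#include <stdio.h>
#include <stddef.h>

/*
 * Simplified stand-ins for the real sigcontext/_fpstate types; the sizes
 * are made up, only the relative field order matters here.
 */
struct fake_fpstate    { char bytes[112]; };
struct fake_sigcontext { char bytes[88]; };

/* Old 32-bit layout: FP state embedded before extramask[]. */
struct sigframe_old {
	char *pretcode;
	int sig;
	struct fake_sigcontext sc;
	struct fake_fpstate fpstate;
	unsigned long extramask[1];
	char retcode[8];
};

/* New layout: the embedded slot is kept only to preserve offsets;
 * the live fp/xsave image is appended after retcode[] instead. */
struct sigframe_new {
	char *pretcode;
	int sig;
	struct fake_sigcontext sc;
	struct fake_fpstate fpstate_unused;
	unsigned long extramask[1];
	char retcode[8];
	/* variable-size fp/xstate image follows here */
};

int main(void)
{
	printf("old extramask offset: %zu\n",
	       offsetof(struct sigframe_old, extramask));
	printf("new extramask offset: %zu\n",
	       offsetof(struct sigframe_new, extramask));
	return 0;
}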
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index d92373630963..d6dd057d0f22 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -17,6 +17,7 @@
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/wait.h> 19#include <linux/wait.h>
20#include <linux/tracehook.h>
20#include <linux/elf.h> 21#include <linux/elf.h>
21#include <linux/smp.h> 22#include <linux/smp.h>
22#include <linux/mm.h> 23#include <linux/mm.h>
@@ -26,6 +27,8 @@
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27#include <asm/i387.h> 28#include <asm/i387.h>
28#include <asm/vdso.h> 29#include <asm/vdso.h>
30#include <asm/syscall.h>
31#include <asm/syscalls.h>
29 32
30#include "sigframe.h" 33#include "sigframe.h"
31 34
@@ -110,6 +113,27 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
110 return do_sigaltstack(uss, uoss, regs->sp); 113 return do_sigaltstack(uss, uoss, regs->sp);
111} 114}
112 115
116#define COPY(x) { \
117 err |= __get_user(regs->x, &sc->x); \
118}
119
120#define COPY_SEG(seg) { \
121 unsigned short tmp; \
122 err |= __get_user(tmp, &sc->seg); \
123 regs->seg = tmp; \
124}
125
126#define COPY_SEG_STRICT(seg) { \
127 unsigned short tmp; \
128 err |= __get_user(tmp, &sc->seg); \
129 regs->seg = tmp | 3; \
130}
131
132#define GET_SEG(seg) { \
133 unsigned short tmp; \
134 err |= __get_user(tmp, &sc->seg); \
135 loadsegment(seg, tmp); \
136}
113 137
114/* 138/*
115 * Do a signal return; undo the signal stack. 139 * Do a signal return; undo the signal stack.
@@ -118,28 +142,13 @@ static int
118restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 142restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
119 unsigned long *pax) 143 unsigned long *pax)
120{ 144{
145 void __user *buf;
146 unsigned int tmpflags;
121 unsigned int err = 0; 147 unsigned int err = 0;
122 148
123 /* Always make any pending restarted system calls return -EINTR */ 149 /* Always make any pending restarted system calls return -EINTR */
124 current_thread_info()->restart_block.fn = do_no_restart_syscall; 150 current_thread_info()->restart_block.fn = do_no_restart_syscall;
125 151
126#define COPY(x) err |= __get_user(regs->x, &sc->x)
127
128#define COPY_SEG(seg) \
129 { unsigned short tmp; \
130 err |= __get_user(tmp, &sc->seg); \
131 regs->seg = tmp; }
132
133#define COPY_SEG_STRICT(seg) \
134 { unsigned short tmp; \
135 err |= __get_user(tmp, &sc->seg); \
136 regs->seg = tmp|3; }
137
138#define GET_SEG(seg) \
139 { unsigned short tmp; \
140 err |= __get_user(tmp, &sc->seg); \
141 loadsegment(seg, tmp); }
142
143 GET_SEG(gs); 152 GET_SEG(gs);
144 COPY_SEG(fs); 153 COPY_SEG(fs);
145 COPY_SEG(es); 154 COPY_SEG(es);
@@ -149,38 +158,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
149 COPY_SEG_STRICT(cs); 158 COPY_SEG_STRICT(cs);
150 COPY_SEG_STRICT(ss); 159 COPY_SEG_STRICT(ss);
151 160
152 { 161 err |= __get_user(tmpflags, &sc->flags);
153 unsigned int tmpflags; 162 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
154 163 regs->orig_ax = -1; /* disable syscall checks */
155 err |= __get_user(tmpflags, &sc->flags);
156 regs->flags = (regs->flags & ~FIX_EFLAGS) |
157 (tmpflags & FIX_EFLAGS);
158 regs->orig_ax = -1; /* disable syscall checks */
159 }
160 164
161 { 165 err |= __get_user(buf, &sc->fpstate);
162 struct _fpstate __user *buf; 166 err |= restore_i387_xstate(buf);
163
164 err |= __get_user(buf, &sc->fpstate);
165 if (buf) {
166 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
167 goto badframe;
168 err |= restore_i387(buf);
169 } else {
170 struct task_struct *me = current;
171
172 if (used_math()) {
173 clear_fpu(me);
174 clear_used_math();
175 }
176 }
177 }
178 167
179 err |= __get_user(*pax, &sc->ax); 168 err |= __get_user(*pax, &sc->ax);
180 return err; 169 return err;
181
182badframe:
183 return 1;
184} 170}
185 171
186asmlinkage unsigned long sys_sigreturn(unsigned long __unused) 172asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
@@ -212,7 +198,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
212 198
213badframe: 199badframe:
214 if (show_unhandled_signals && printk_ratelimit()) { 200 if (show_unhandled_signals && printk_ratelimit()) {
215 printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:" 201 printk("%s%s[%d] bad frame in sigreturn frame:"
216 "%p ip:%lx sp:%lx oeax:%lx", 202 "%p ip:%lx sp:%lx oeax:%lx",
217 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, 203 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
218 current->comm, task_pid_nr(current), frame, regs->ip, 204 current->comm, task_pid_nr(current), frame, regs->ip,
@@ -226,9 +212,8 @@ badframe:
226 return 0; 212 return 0;
227} 213}
228 214
229asmlinkage int sys_rt_sigreturn(unsigned long __unused) 215static long do_rt_sigreturn(struct pt_regs *regs)
230{ 216{
231 struct pt_regs *regs = (struct pt_regs *)&__unused;
232 struct rt_sigframe __user *frame; 217 struct rt_sigframe __user *frame;
233 unsigned long ax; 218 unsigned long ax;
234 sigset_t set; 219 sigset_t set;
@@ -254,15 +239,22 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused)
254 return ax; 239 return ax;
255 240
256badframe: 241badframe:
257 force_sig(SIGSEGV, current); 242 signal_fault(regs, frame, "rt_sigreturn");
258 return 0; 243 return 0;
259} 244}
260 245
246asmlinkage int sys_rt_sigreturn(unsigned long __unused)
247{
248 struct pt_regs *regs = (struct pt_regs *)&__unused;
249
250 return do_rt_sigreturn(regs);
251}
252
261/* 253/*
262 * Set up a signal frame. 254 * Set up a signal frame.
263 */ 255 */
264static int 256static int
265setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, 257setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
266 struct pt_regs *regs, unsigned long mask) 258 struct pt_regs *regs, unsigned long mask)
267{ 259{
268 int tmp, err = 0; 260 int tmp, err = 0;
@@ -289,7 +281,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
289 err |= __put_user(regs->sp, &sc->sp_at_signal); 281 err |= __put_user(regs->sp, &sc->sp_at_signal);
290 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); 282 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
291 283
292 tmp = save_i387(fpstate); 284 tmp = save_i387_xstate(fpstate);
293 if (tmp < 0) 285 if (tmp < 0)
294 err = 1; 286 err = 1;
295 else 287 else
@@ -306,7 +298,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
306 * Determine which stack to use.. 298 * Determine which stack to use..
307 */ 299 */
308static inline void __user * 300static inline void __user *
309get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) 301get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
302 void **fpstate)
310{ 303{
311 unsigned long sp; 304 unsigned long sp;
312 305
@@ -332,6 +325,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
332 sp = (unsigned long) ka->sa.sa_restorer; 325 sp = (unsigned long) ka->sa.sa_restorer;
333 } 326 }
334 327
328 if (used_math()) {
329 sp = sp - sig_xstate_size;
330 *fpstate = (struct _fpstate *) sp;
331 }
332
335 sp -= frame_size; 333 sp -= frame_size;
336 /* 334 /*
337 * Align the stack pointer according to the i386 ABI, 335 * Align the stack pointer according to the i386 ABI,
@@ -343,38 +341,29 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
343} 341}
344 342
345static int 343static int
346setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, 344__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
347 struct pt_regs *regs) 345 struct pt_regs *regs)
348{ 346{
349 struct sigframe __user *frame; 347 struct sigframe __user *frame;
350 void __user *restorer; 348 void __user *restorer;
351 int err = 0; 349 int err = 0;
352 int usig; 350 void __user *fpstate = NULL;
353 351
354 frame = get_sigframe(ka, regs, sizeof(*frame)); 352 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
355 353
356 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 354 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
357 goto give_sigsegv; 355 return -EFAULT;
358 356
359 usig = current_thread_info()->exec_domain 357 if (__put_user(sig, &frame->sig))
360 && current_thread_info()->exec_domain->signal_invmap 358 return -EFAULT;
361 && sig < 32
362 ? current_thread_info()->exec_domain->signal_invmap[sig]
363 : sig;
364 359
365 err = __put_user(usig, &frame->sig); 360 if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
366 if (err) 361 return -EFAULT;
367 goto give_sigsegv;
368
369 err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
370 if (err)
371 goto give_sigsegv;
372 362
373 if (_NSIG_WORDS > 1) { 363 if (_NSIG_WORDS > 1) {
374 err = __copy_to_user(&frame->extramask, &set->sig[1], 364 if (__copy_to_user(&frame->extramask, &set->sig[1],
375 sizeof(frame->extramask)); 365 sizeof(frame->extramask)))
376 if (err) 366 return -EFAULT;
377 goto give_sigsegv;
378 } 367 }
379 368
380 if (current->mm->context.vdso) 369 if (current->mm->context.vdso)
@@ -399,7 +388,7 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
399 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); 388 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
400 389
401 if (err) 390 if (err)
402 goto give_sigsegv; 391 return -EFAULT;
403 392
404 /* Set up registers for signal handler */ 393 /* Set up registers for signal handler */
405 regs->sp = (unsigned long)frame; 394 regs->sp = (unsigned long)frame;
@@ -414,50 +403,43 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
414 regs->cs = __USER_CS; 403 regs->cs = __USER_CS;
415 404
416 return 0; 405 return 0;
417
418give_sigsegv:
419 force_sigsegv(sig, current);
420 return -EFAULT;
421} 406}
422 407
423static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 408static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
424 sigset_t *set, struct pt_regs *regs) 409 sigset_t *set, struct pt_regs *regs)
425{ 410{
426 struct rt_sigframe __user *frame; 411 struct rt_sigframe __user *frame;
427 void __user *restorer; 412 void __user *restorer;
428 int err = 0; 413 int err = 0;
429 int usig; 414 void __user *fpstate = NULL;
430 415
431 frame = get_sigframe(ka, regs, sizeof(*frame)); 416 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
432 417
433 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 418 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
434 goto give_sigsegv; 419 return -EFAULT;
435
436 usig = current_thread_info()->exec_domain
437 && current_thread_info()->exec_domain->signal_invmap
438 && sig < 32
439 ? current_thread_info()->exec_domain->signal_invmap[sig]
440 : sig;
441 420
442 err |= __put_user(usig, &frame->sig); 421 err |= __put_user(sig, &frame->sig);
443 err |= __put_user(&frame->info, &frame->pinfo); 422 err |= __put_user(&frame->info, &frame->pinfo);
444 err |= __put_user(&frame->uc, &frame->puc); 423 err |= __put_user(&frame->uc, &frame->puc);
445 err |= copy_siginfo_to_user(&frame->info, info); 424 err |= copy_siginfo_to_user(&frame->info, info);
446 if (err) 425 if (err)
447 goto give_sigsegv; 426 return -EFAULT;
448 427
449 /* Create the ucontext. */ 428 /* Create the ucontext. */
450 err |= __put_user(0, &frame->uc.uc_flags); 429 if (cpu_has_xsave)
430 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
431 else
432 err |= __put_user(0, &frame->uc.uc_flags);
451 err |= __put_user(0, &frame->uc.uc_link); 433 err |= __put_user(0, &frame->uc.uc_link);
452 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 434 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
453 err |= __put_user(sas_ss_flags(regs->sp), 435 err |= __put_user(sas_ss_flags(regs->sp),
454 &frame->uc.uc_stack.ss_flags); 436 &frame->uc.uc_stack.ss_flags);
455 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 437 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
456 err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, 438 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
457 regs, set->sig[0]); 439 regs, set->sig[0]);
458 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 440 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
459 if (err) 441 if (err)
460 goto give_sigsegv; 442 return -EFAULT;
461 443
462 /* Set up to return from userspace. */ 444 /* Set up to return from userspace. */
463 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); 445 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -477,12 +459,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
477 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); 459 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
478 460
479 if (err) 461 if (err)
480 goto give_sigsegv; 462 return -EFAULT;
481 463
482 /* Set up registers for signal handler */ 464 /* Set up registers for signal handler */
483 regs->sp = (unsigned long)frame; 465 regs->sp = (unsigned long)frame;
484 regs->ip = (unsigned long)ka->sa.sa_handler; 466 regs->ip = (unsigned long)ka->sa.sa_handler;
485 regs->ax = (unsigned long)usig; 467 regs->ax = (unsigned long)sig;
486 regs->dx = (unsigned long)&frame->info; 468 regs->dx = (unsigned long)&frame->info;
487 regs->cx = (unsigned long)&frame->uc; 469 regs->cx = (unsigned long)&frame->uc;
488 470
@@ -492,15 +474,48 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
492 regs->cs = __USER_CS; 474 regs->cs = __USER_CS;
493 475
494 return 0; 476 return 0;
495
496give_sigsegv:
497 force_sigsegv(sig, current);
498 return -EFAULT;
499} 477}
500 478
501/* 479/*
502 * OK, we're invoking a handler: 480 * OK, we're invoking a handler:
503 */ 481 */
482static int signr_convert(int sig)
483{
484 struct thread_info *info = current_thread_info();
485
486 if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
487 return info->exec_domain->signal_invmap[sig];
488 return sig;
489}
490
491#define is_ia32 1
492#define ia32_setup_frame __setup_frame
493#define ia32_setup_rt_frame __setup_rt_frame
494
495static int
496setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
497 sigset_t *set, struct pt_regs *regs)
498{
499 int usig = signr_convert(sig);
500 int ret;
501
502 /* Set up the stack frame */
503 if (is_ia32) {
504 if (ka->sa.sa_flags & SA_SIGINFO)
505 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
506 else
507 ret = ia32_setup_frame(usig, ka, set, regs);
508 } else
509 ret = __setup_rt_frame(sig, ka, info, set, regs);
510
511 if (ret) {
512 force_sigsegv(sig, current);
513 return -EFAULT;
514 }
515
516 return ret;
517}
518
504static int 519static int
505handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 520handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
506 sigset_t *oldset, struct pt_regs *regs) 521 sigset_t *oldset, struct pt_regs *regs)
@@ -508,9 +523,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
508 int ret; 523 int ret;
509 524
510 /* Are we from a system call? */ 525 /* Are we from a system call? */
511 if ((long)regs->orig_ax >= 0) { 526 if (syscall_get_nr(current, regs) >= 0) {
512 /* If so, check system call restarting.. */ 527 /* If so, check system call restarting.. */
513 switch (regs->ax) { 528 switch (syscall_get_error(current, regs)) {
514 case -ERESTART_RESTARTBLOCK: 529 case -ERESTART_RESTARTBLOCK:
515 case -ERESTARTNOHAND: 530 case -ERESTARTNOHAND:
516 regs->ax = -EINTR; 531 regs->ax = -EINTR;
@@ -537,15 +552,20 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
537 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 552 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
538 regs->flags &= ~X86_EFLAGS_TF; 553 regs->flags &= ~X86_EFLAGS_TF;
539 554
540 /* Set up the stack frame */ 555 ret = setup_rt_frame(sig, ka, info, oldset, regs);
541 if (ka->sa.sa_flags & SA_SIGINFO)
542 ret = setup_rt_frame(sig, ka, info, oldset, regs);
543 else
544 ret = setup_frame(sig, ka, oldset, regs);
545 556
546 if (ret) 557 if (ret)
547 return ret; 558 return ret;
548 559
560#ifdef CONFIG_X86_64
561 /*
562 * This has nothing to do with segment registers,
563 * despite the name. This magic affects uaccess.h
564 * macros' behavior. Reset it to the normal setting.
565 */
566 set_fs(USER_DS);
567#endif
568
549 /* 569 /*
550 * Clear the direction flag as per the ABI for function entry. 570 * Clear the direction flag as per the ABI for function entry.
551 */ 571 */
@@ -558,8 +578,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
558 * handler too. 578 * handler too.
559 */ 579 */
560 regs->flags &= ~X86_EFLAGS_TF; 580 regs->flags &= ~X86_EFLAGS_TF;
561 if (test_thread_flag(TIF_SINGLESTEP))
562 ptrace_notify(SIGTRAP);
563 581
564 spin_lock_irq(&current->sighand->siglock); 582 spin_lock_irq(&current->sighand->siglock);
565 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); 583 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
@@ -568,9 +586,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
568 recalc_sigpending(); 586 recalc_sigpending();
569 spin_unlock_irq(&current->sighand->siglock); 587 spin_unlock_irq(&current->sighand->siglock);
570 588
589 tracehook_signal_handler(sig, info, ka, regs,
590 test_thread_flag(TIF_SINGLESTEP));
591
571 return 0; 592 return 0;
572} 593}
573 594
595#define NR_restart_syscall __NR_restart_syscall
574/* 596/*
575 * Note that 'init' is a special process: it doesn't get signals it doesn't 597 * Note that 'init' is a special process: it doesn't get signals it doesn't
576 * want to handle. Thus you cannot kill init even with a SIGKILL even by 598 * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -623,9 +645,9 @@ static void do_signal(struct pt_regs *regs)
623 } 645 }
624 646
625 /* Did we come from a system call? */ 647 /* Did we come from a system call? */
626 if ((long)regs->orig_ax >= 0) { 648 if (syscall_get_nr(current, regs) >= 0) {
627 /* Restart the system call - no handlers present */ 649 /* Restart the system call - no handlers present */
628 switch (regs->ax) { 650 switch (syscall_get_error(current, regs)) {
629 case -ERESTARTNOHAND: 651 case -ERESTARTNOHAND:
630 case -ERESTARTSYS: 652 case -ERESTARTSYS:
631 case -ERESTARTNOINTR: 653 case -ERESTARTNOINTR:
@@ -634,7 +656,7 @@ static void do_signal(struct pt_regs *regs)
634 break; 656 break;
635 657
636 case -ERESTART_RESTARTBLOCK: 658 case -ERESTART_RESTARTBLOCK:
637 regs->ax = __NR_restart_syscall; 659 regs->ax = NR_restart_syscall;
638 regs->ip -= 2; 660 regs->ip -= 2;
639 break; 661 break;
640 } 662 }
@@ -657,18 +679,38 @@ static void do_signal(struct pt_regs *regs)
657void 679void
658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 680do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
659{ 681{
660 /* Pending single-step? */ 682#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
661 if (thread_info_flags & _TIF_SINGLESTEP) { 683 /* notify userspace of pending MCEs */
662 regs->flags |= X86_EFLAGS_TF; 684 if (thread_info_flags & _TIF_MCE_NOTIFY)
663 clear_thread_flag(TIF_SINGLESTEP); 685 mce_notify_user();
664 } 686#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
665 687
666 /* deal with pending signal delivery */ 688 /* deal with pending signal delivery */
667 if (thread_info_flags & _TIF_SIGPENDING) 689 if (thread_info_flags & _TIF_SIGPENDING)
668 do_signal(regs); 690 do_signal(regs);
669 691
670 if (thread_info_flags & _TIF_HRTICK_RESCHED) 692 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
671 hrtick_resched(); 693 clear_thread_flag(TIF_NOTIFY_RESUME);
694 tracehook_notify_resume(regs);
695 }
672 696
697#ifdef CONFIG_X86_32
673 clear_thread_flag(TIF_IRET); 698 clear_thread_flag(TIF_IRET);
699#endif /* CONFIG_X86_32 */
700}
701
702void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
703{
704 struct task_struct *me = current;
705
706 if (show_unhandled_signals && printk_ratelimit()) {
707 printk(KERN_INFO
708 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
709 me->comm, me->pid, where, frame,
710 regs->ip, regs->sp, regs->orig_ax);
711 print_vma_addr(" in ", regs->ip);
712 printk(KERN_CONT "\n");
713 }
714
715 force_sig(SIGSEGV, me);
674} 716}
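Because __setup_rt_frame() now tags the ucontext with UC_FP_XSTATE when the CPU supports xsave, a user-space SA_SIGINFO handler can inspect uc_flags for it. A minimal sketch; the fallback definition of UC_FP_XSTATE assumes the kernel's value of 0x1 in case the installed libc headers do not export it.

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <ucontext.h>
#include <unistd.h>

/* Kernel marker meaning "extended (xsave) FP state follows the frame";
 * defined here as an assumption in case the headers lack it. */
#ifndef UC_FP_XSTATE
#define UC_FP_XSTATE 0x1
#endif

#define WRITE_LIT(s) write(STDOUT_FILENO, (s), sizeof(s) - 1)

static void handler(int sig, siginfo_t *info, void *ctx)
{
	ucontext_t *uc = ctx;

	(void)sig;
	(void)info;
	/* uc_flags carries UC_FP_XSTATE when the kernel saved xsave state. */
	if (uc->uc_flags & UC_FP_XSTATE)
		WRITE_LIT("signal frame carries extended (xsave) FP state\n");
	else
		WRITE_LIT("signal frame carries classic FP state only\n");
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;	/* rt frame, ucontext passed to handler */
	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGUSR1, &sa, NULL) < 0) {
		perror("sigaction");
		return 1;
	}

	raise(SIGUSR1);
	return 0;
}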
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index e53b267662e7..a5c9627f4db9 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -15,17 +15,21 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17#include <linux/ptrace.h> 17#include <linux/ptrace.h>
18#include <linux/tracehook.h>
18#include <linux/unistd.h> 19#include <linux/unistd.h>
19#include <linux/stddef.h> 20#include <linux/stddef.h>
20#include <linux/personality.h> 21#include <linux/personality.h>
21#include <linux/compiler.h> 22#include <linux/compiler.h>
23#include <linux/uaccess.h>
24
22#include <asm/processor.h> 25#include <asm/processor.h>
23#include <asm/ucontext.h> 26#include <asm/ucontext.h>
24#include <asm/uaccess.h>
25#include <asm/i387.h> 27#include <asm/i387.h>
26#include <asm/proto.h> 28#include <asm/proto.h>
27#include <asm/ia32_unistd.h> 29#include <asm/ia32_unistd.h>
28#include <asm/mce.h> 30#include <asm/mce.h>
31#include <asm/syscall.h>
32#include <asm/syscalls.h>
29#include "sigframe.h" 33#include "sigframe.h"
30 34
31#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) 35#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
@@ -41,11 +45,6 @@
41# define FIX_EFLAGS __FIX_EFLAGS 45# define FIX_EFLAGS __FIX_EFLAGS
42#endif 46#endif
43 47
44int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
45 sigset_t *set, struct pt_regs * regs);
46int ia32_setup_frame(int sig, struct k_sigaction *ka,
47 sigset_t *set, struct pt_regs * regs);
48
49asmlinkage long 48asmlinkage long
50sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 49sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
51 struct pt_regs *regs) 50 struct pt_regs *regs)
@@ -53,6 +52,15 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
53 return do_sigaltstack(uss, uoss, regs->sp); 52 return do_sigaltstack(uss, uoss, regs->sp);
54} 53}
55 54
55#define COPY(x) { \
56 err |= __get_user(regs->x, &sc->x); \
57}
58
59#define COPY_SEG_STRICT(seg) { \
60 unsigned short tmp; \
61 err |= __get_user(tmp, &sc->seg); \
62 regs->seg = tmp | 3; \
63}
56 64
57/* 65/*
58 * Do a signal return; undo the signal stack. 66 * Do a signal return; undo the signal stack.
@@ -61,13 +69,13 @@ static int
61restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 69restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
62 unsigned long *pax) 70 unsigned long *pax)
63{ 71{
72 void __user *buf;
73 unsigned int tmpflags;
64 unsigned int err = 0; 74 unsigned int err = 0;
65 75
66 /* Always make any pending restarted system calls return -EINTR */ 76 /* Always make any pending restarted system calls return -EINTR */
67 current_thread_info()->restart_block.fn = do_no_restart_syscall; 77 current_thread_info()->restart_block.fn = do_no_restart_syscall;
68 78
69#define COPY(x) err |= __get_user(regs->x, &sc->x)
70
71 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 79 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
72 COPY(dx); COPY(cx); COPY(ip); 80 COPY(dx); COPY(cx); COPY(ip);
73 COPY(r8); 81 COPY(r8);
@@ -82,48 +90,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
82 /* Kernel saves and restores only the CS segment register on signals, 90 /* Kernel saves and restores only the CS segment register on signals,
83 * which is the bare minimum needed to allow mixed 32/64-bit code. 91 * which is the bare minimum needed to allow mixed 32/64-bit code.
84 * App's signal handler can save/restore other segments if needed. */ 92 * App's signal handler can save/restore other segments if needed. */
85 { 93 COPY_SEG_STRICT(cs);
86 unsigned cs;
87 err |= __get_user(cs, &sc->cs);
88 regs->cs = cs | 3; /* Force into user mode */
89 }
90 94
91 { 95 err |= __get_user(tmpflags, &sc->flags);
92 unsigned int tmpflags; 96 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
93 err |= __get_user(tmpflags, &sc->flags); 97 regs->orig_ax = -1; /* disable syscall checks */
94 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
95 regs->orig_ax = -1; /* disable syscall checks */
96 }
97 98
98 { 99 err |= __get_user(buf, &sc->fpstate);
99 struct _fpstate __user * buf; 100 err |= restore_i387_xstate(buf);
100 err |= __get_user(buf, &sc->fpstate);
101
102 if (buf) {
103 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
104 goto badframe;
105 err |= restore_i387(buf);
106 } else {
107 struct task_struct *me = current;
108 if (used_math()) {
109 clear_fpu(me);
110 clear_used_math();
111 }
112 }
113 }
114 101
115 err |= __get_user(*pax, &sc->ax); 102 err |= __get_user(*pax, &sc->ax);
116 return err; 103 return err;
117
118badframe:
119 return 1;
120} 104}
121 105
122asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) 106static long do_rt_sigreturn(struct pt_regs *regs)
123{ 107{
124 struct rt_sigframe __user *frame; 108 struct rt_sigframe __user *frame;
125 sigset_t set;
126 unsigned long ax; 109 unsigned long ax;
110 sigset_t set;
127 111
128 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); 112 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
129 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 113 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
@@ -136,7 +120,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
136 current->blocked = set; 120 current->blocked = set;
137 recalc_sigpending(); 121 recalc_sigpending();
138 spin_unlock_irq(&current->sighand->siglock); 122 spin_unlock_irq(&current->sighand->siglock);
139 123
140 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 124 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
141 goto badframe; 125 goto badframe;
142 126
@@ -146,16 +130,22 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
146 return ax; 130 return ax;
147 131
148badframe: 132badframe:
149 signal_fault(regs,frame,"sigreturn"); 133 signal_fault(regs, frame, "rt_sigreturn");
150 return 0; 134 return 0;
151} 135}
136
137asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
138{
139 return do_rt_sigreturn(regs);
140}
152 141
153/* 142/*
154 * Set up a signal frame. 143 * Set up a signal frame.
155 */ 144 */
156 145
157static inline int 146static inline int
158setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) 147setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
148 unsigned long mask, struct task_struct *me)
159{ 149{
160 int err = 0; 150 int err = 0;
161 151
@@ -207,41 +197,40 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
207 sp = current->sas_ss_sp + current->sas_ss_size; 197 sp = current->sas_ss_sp + current->sas_ss_size;
208 } 198 }
209 199
210 return (void __user *)round_down(sp - size, 16); 200 return (void __user *)round_down(sp - size, 64);
211} 201}
212 202
213static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 203static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
214 sigset_t *set, struct pt_regs * regs) 204 sigset_t *set, struct pt_regs *regs)
215{ 205{
216 struct rt_sigframe __user *frame; 206 struct rt_sigframe __user *frame;
217 struct _fpstate __user *fp = NULL; 207 void __user *fp = NULL;
218 int err = 0; 208 int err = 0;
219 struct task_struct *me = current; 209 struct task_struct *me = current;
220 210
221 if (used_math()) { 211 if (used_math()) {
222 fp = get_stack(ka, regs, sizeof(struct _fpstate)); 212 fp = get_stack(ka, regs, sig_xstate_size);
223 frame = (void __user *)round_down( 213 frame = (void __user *)round_down(
224 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; 214 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
225 215
226 if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) 216 if (save_i387_xstate(fp) < 0)
227 goto give_sigsegv; 217 return -EFAULT;
228
229 if (save_i387(fp) < 0)
230 err |= -1;
231 } else 218 } else
232 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; 219 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
233 220
234 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 221 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
235 goto give_sigsegv; 222 return -EFAULT;
236 223
237 if (ka->sa.sa_flags & SA_SIGINFO) { 224 if (ka->sa.sa_flags & SA_SIGINFO) {
238 err |= copy_siginfo_to_user(&frame->info, info); 225 if (copy_siginfo_to_user(&frame->info, info))
239 if (err) 226 return -EFAULT;
240 goto give_sigsegv;
241 } 227 }
242 228
243 /* Create the ucontext. */ 229 /* Create the ucontext. */
244 err |= __put_user(0, &frame->uc.uc_flags); 230 if (cpu_has_xsave)
231 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
232 else
233 err |= __put_user(0, &frame->uc.uc_flags);
245 err |= __put_user(0, &frame->uc.uc_link); 234 err |= __put_user(0, &frame->uc.uc_link);
246 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 235 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
247 err |= __put_user(sas_ss_flags(regs->sp), 236 err |= __put_user(sas_ss_flags(regs->sp),
@@ -249,9 +238,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
249 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); 238 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
250 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); 239 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
251 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); 240 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
252 if (sizeof(*set) == 16) { 241 if (sizeof(*set) == 16) {
253 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); 242 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
254 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); 243 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
255 } else 244 } else
256 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 245 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
257 246
@@ -262,15 +251,15 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
262 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); 251 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
263 } else { 252 } else {
264 /* could use a vstub here */ 253 /* could use a vstub here */
265 goto give_sigsegv; 254 return -EFAULT;
266 } 255 }
267 256
268 if (err) 257 if (err)
269 goto give_sigsegv; 258 return -EFAULT;
270 259
271 /* Set up registers for signal handler */ 260 /* Set up registers for signal handler */
272 regs->di = sig; 261 regs->di = sig;
273 /* In case the signal handler was declared without prototypes */ 262 /* In case the signal handler was declared without prototypes */
274 regs->ax = 0; 263 regs->ax = 0;
275 264
276 /* This also works for non SA_SIGINFO handlers because they expect the 265 /* This also works for non SA_SIGINFO handlers because they expect the
@@ -286,44 +275,45 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
286 regs->cs = __USER_CS; 275 regs->cs = __USER_CS;
287 276
288 return 0; 277 return 0;
289
290give_sigsegv:
291 force_sigsegv(sig, current);
292 return -EFAULT;
293} 278}
294 279
295/* 280/*
296 * Return -1L or the syscall number that @regs is executing. 281 * OK, we're invoking a handler
297 */ 282 */
298static long current_syscall(struct pt_regs *regs) 283static int signr_convert(int sig)
299{ 284{
300 /* 285 return sig;
301 * We always sign-extend a -1 value being set here,
302 * so this is always either -1L or a syscall number.
303 */
304 return regs->orig_ax;
305} 286}
306 287
307/*
308 * Return a value that is -EFOO if the system call in @regs->orig_ax
309 * returned an error. This only works for @regs from @current.
310 */
311static long current_syscall_ret(struct pt_regs *regs)
312{
313#ifdef CONFIG_IA32_EMULATION 288#ifdef CONFIG_IA32_EMULATION
314 if (test_thread_flag(TIF_IA32)) 289#define is_ia32 test_thread_flag(TIF_IA32)
315 /* 290#else
316 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO 291#define is_ia32 0
317 * and will match correctly in comparisons.
318 */
319 return (int) regs->ax;
320#endif 292#endif
321 return regs->ax;
322}
323 293
324/* 294static int
325 * OK, we're invoking a handler 295setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
326 */ 296 sigset_t *set, struct pt_regs *regs)
297{
298 int usig = signr_convert(sig);
299 int ret;
300
301 /* Set up the stack frame */
302 if (is_ia32) {
303 if (ka->sa.sa_flags & SA_SIGINFO)
304 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
305 else
306 ret = ia32_setup_frame(usig, ka, set, regs);
307 } else
308 ret = __setup_rt_frame(sig, ka, info, set, regs);
309
310 if (ret) {
311 force_sigsegv(sig, current);
312 return -EFAULT;
313 }
314
315 return ret;
316}
327 317
328static int 318static int
329handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 319handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -332,9 +322,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
332 int ret; 322 int ret;
333 323
334 /* Are we from a system call? */ 324 /* Are we from a system call? */
335 if (current_syscall(regs) >= 0) { 325 if (syscall_get_nr(current, regs) >= 0) {
336 /* If so, check system call restarting.. */ 326 /* If so, check system call restarting.. */
337 switch (current_syscall_ret(regs)) { 327 switch (syscall_get_error(current, regs)) {
338 case -ERESTART_RESTARTBLOCK: 328 case -ERESTART_RESTARTBLOCK:
339 case -ERESTARTNOHAND: 329 case -ERESTARTNOHAND:
340 regs->ax = -EINTR; 330 regs->ax = -EINTR;
@@ -361,50 +351,48 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
361 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 351 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
362 regs->flags &= ~X86_EFLAGS_TF; 352 regs->flags &= ~X86_EFLAGS_TF;
363 353
364#ifdef CONFIG_IA32_EMULATION
365 if (test_thread_flag(TIF_IA32)) {
366 if (ka->sa.sa_flags & SA_SIGINFO)
367 ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
368 else
369 ret = ia32_setup_frame(sig, ka, oldset, regs);
370 } else
371#endif
372 ret = setup_rt_frame(sig, ka, info, oldset, regs); 354 ret = setup_rt_frame(sig, ka, info, oldset, regs);
373 355
374 if (ret == 0) { 356 if (ret)
375 /* 357 return ret;
376 * This has nothing to do with segment registers,
377 * despite the name. This magic affects uaccess.h
378 * macros' behavior. Reset it to the normal setting.
379 */
380 set_fs(USER_DS);
381 358
382 /* 359#ifdef CONFIG_X86_64
383 * Clear the direction flag as per the ABI for function entry. 360 /*
384 */ 361 * This has nothing to do with segment registers,
385 regs->flags &= ~X86_EFLAGS_DF; 362 * despite the name. This magic affects uaccess.h
363 * macros' behavior. Reset it to the normal setting.
364 */
365 set_fs(USER_DS);
366#endif
386 367
387 /* 368 /*
388 * Clear TF when entering the signal handler, but 369 * Clear the direction flag as per the ABI for function entry.
389 * notify any tracer that was single-stepping it. 370 */
390 * The tracer may want to single-step inside the 371 regs->flags &= ~X86_EFLAGS_DF;
391 * handler too.
392 */
393 regs->flags &= ~X86_EFLAGS_TF;
394 if (test_thread_flag(TIF_SINGLESTEP))
395 ptrace_notify(SIGTRAP);
396
397 spin_lock_irq(&current->sighand->siglock);
398 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
399 if (!(ka->sa.sa_flags & SA_NODEFER))
400 sigaddset(&current->blocked,sig);
401 recalc_sigpending();
402 spin_unlock_irq(&current->sighand->siglock);
403 }
404 372
405 return ret; 373 /*
374 * Clear TF when entering the signal handler, but
375 * notify any tracer that was single-stepping it.
376 * The tracer may want to single-step inside the
377 * handler too.
378 */
379 regs->flags &= ~X86_EFLAGS_TF;
380
381 spin_lock_irq(&current->sighand->siglock);
382 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
383 if (!(ka->sa.sa_flags & SA_NODEFER))
384 sigaddset(&current->blocked, sig);
385 recalc_sigpending();
386 spin_unlock_irq(&current->sighand->siglock);
387
388 tracehook_signal_handler(sig, info, ka, regs,
389 test_thread_flag(TIF_SINGLESTEP));
390
391 return 0;
406} 392}
407 393
394#define NR_restart_syscall \
395 test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
408/* 396/*
409 * Note that 'init' is a special process: it doesn't get signals it doesn't 397 * Note that 'init' is a special process: it doesn't get signals it doesn't
410 * want to handle. Thus you cannot kill init even with a SIGKILL even by 398 * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -434,7 +422,8 @@ static void do_signal(struct pt_regs *regs)
434 422
435 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 423 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
436 if (signr > 0) { 424 if (signr > 0) {
437 /* Re-enable any watchpoints before delivering the 425 /*
426 * Re-enable any watchpoints before delivering the
438 * signal to user space. The processor register will 427 * signal to user space. The processor register will
439 * have been cleared if the watchpoint triggered 428 * have been cleared if the watchpoint triggered
440 * inside the kernel. 429 * inside the kernel.
@@ -442,7 +431,7 @@ static void do_signal(struct pt_regs *regs)
442 if (current->thread.debugreg7) 431 if (current->thread.debugreg7)
443 set_debugreg(current->thread.debugreg7, 7); 432 set_debugreg(current->thread.debugreg7, 7);
444 433
445 /* Whee! Actually deliver the signal. */ 434 /* Whee! Actually deliver the signal. */
446 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 435 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
447 /* 436 /*
448 * A signal was successfully delivered; the saved 437 * A signal was successfully delivered; the saved
@@ -456,19 +445,18 @@ static void do_signal(struct pt_regs *regs)
456 } 445 }
457 446
458 /* Did we come from a system call? */ 447 /* Did we come from a system call? */
459 if (current_syscall(regs) >= 0) { 448 if (syscall_get_nr(current, regs) >= 0) {
460 /* Restart the system call - no handlers present */ 449 /* Restart the system call - no handlers present */
461 switch (current_syscall_ret(regs)) { 450 switch (syscall_get_error(current, regs)) {
462 case -ERESTARTNOHAND: 451 case -ERESTARTNOHAND:
463 case -ERESTARTSYS: 452 case -ERESTARTSYS:
464 case -ERESTARTNOINTR: 453 case -ERESTARTNOINTR:
465 regs->ax = regs->orig_ax; 454 regs->ax = regs->orig_ax;
466 regs->ip -= 2; 455 regs->ip -= 2;
467 break; 456 break;
457
468 case -ERESTART_RESTARTBLOCK: 458 case -ERESTART_RESTARTBLOCK:
469 regs->ax = test_thread_flag(TIF_IA32) ? 459 regs->ax = NR_restart_syscall;
470 __NR_ia32_restart_syscall :
471 __NR_restart_syscall;
472 regs->ip -= 2; 460 regs->ip -= 2;
473 break; 461 break;
474 } 462 }
@@ -484,38 +472,45 @@ static void do_signal(struct pt_regs *regs)
484 } 472 }
485} 473}
486 474
487void do_notify_resume(struct pt_regs *regs, void *unused, 475/*
488 __u32 thread_info_flags) 476 * notification of userspace execution resumption
477 * - triggered by the TIF_WORK_MASK flags
478 */
479void
480do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
489{ 481{
490 /* Pending single-step? */ 482#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
491 if (thread_info_flags & _TIF_SINGLESTEP) {
492 regs->flags |= X86_EFLAGS_TF;
493 clear_thread_flag(TIF_SINGLESTEP);
494 }
495
496#ifdef CONFIG_X86_MCE
497 /* notify userspace of pending MCEs */ 483 /* notify userspace of pending MCEs */
498 if (thread_info_flags & _TIF_MCE_NOTIFY) 484 if (thread_info_flags & _TIF_MCE_NOTIFY)
499 mce_notify_user(); 485 mce_notify_user();
500#endif /* CONFIG_X86_MCE */ 486#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
501 487
502 /* deal with pending signal delivery */ 488 /* deal with pending signal delivery */
503 if (thread_info_flags & _TIF_SIGPENDING) 489 if (thread_info_flags & _TIF_SIGPENDING)
504 do_signal(regs); 490 do_signal(regs);
505 491
506 if (thread_info_flags & _TIF_HRTICK_RESCHED) 492 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
507 hrtick_resched(); 493 clear_thread_flag(TIF_NOTIFY_RESUME);
494 tracehook_notify_resume(regs);
495 }
496
497#ifdef CONFIG_X86_32
498 clear_thread_flag(TIF_IRET);
499#endif /* CONFIG_X86_32 */
508} 500}
509 501
510void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 502void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
511{ 503{
512 struct task_struct *me = current; 504 struct task_struct *me = current;
505
513 if (show_unhandled_signals && printk_ratelimit()) { 506 if (show_unhandled_signals && printk_ratelimit()) {
514 printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", 507 printk(KERN_INFO
515 me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax); 508 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
509 me->comm, me->pid, where, frame,
510 regs->ip, regs->sp, regs->orig_ax);
516 print_vma_addr(" in ", regs->ip); 511 print_vma_addr(" in ", regs->ip);
517 printk("\n"); 512 printk(KERN_CONT "\n");
518 } 513 }
519 514
520 force_sig(SIGSEGV, me); 515 force_sig(SIGSEGV, me);
521} 516}
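On the 64-bit side, get_stack() now rounds down to 64 bytes so the xsave image is suitably aligned, and __setup_rt_frame() still places the rt_sigframe below it, rounded to 16 bytes minus 8, which gives the handler the usual post-call stack alignment. A standalone arithmetic sketch with made-up sizes:

#include <stdio.h>

#define SIG_XSTATE_SIZE  832UL	/* made-up size of the saved fp/xsave image */
#define RT_SIGFRAME_SIZE 440UL	/* made-up sizeof(struct rt_sigframe) */

static unsigned long round_down_to(unsigned long x, unsigned long align)
{
	return x & ~(align - 1);
}

int main(void)
{
	unsigned long sp = 0x7fffffffe000UL;	/* hypothetical user stack pointer */
	unsigned long fp, frame;

	/* The xsave area must be 64-byte aligned, so it is carved out first. */
	fp = round_down_to(sp - SIG_XSTATE_SIZE, 64);

	/* The rt_sigframe goes below it, 16-byte aligned minus 8, so that
	 * (sp + 8) is 16-byte aligned on handler entry as the ABI expects. */
	frame = round_down_to(fp - RT_SIGFRAME_SIZE, 16) - 8;

	printf("fpstate at 0x%lx (64-byte aligned: %s)\n",
	       fp, (fp % 64 == 0) ? "yes" : "no");
	printf("frame   at 0x%lx ((frame + 8) %% 16 == %lu)\n",
	       frame, (frame + 8) % 16);
	return 0;
}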
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 361b7a4c640c..18f9b19f5f8f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
214struct smp_ops smp_ops = { 214struct smp_ops smp_ops = {
215 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 215 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
216 .smp_prepare_cpus = native_smp_prepare_cpus, 216 .smp_prepare_cpus = native_smp_prepare_cpus,
217 .cpu_up = native_cpu_up,
218 .smp_cpus_done = native_smp_cpus_done, 217 .smp_cpus_done = native_smp_cpus_done,
219 218
220 .smp_send_stop = native_smp_send_stop, 219 .smp_send_stop = native_smp_send_stop,
221 .smp_send_reschedule = native_smp_send_reschedule, 220 .smp_send_reschedule = native_smp_send_reschedule,
222 221
222 .cpu_up = native_cpu_up,
223 .cpu_die = native_cpu_die,
224 .cpu_disable = native_cpu_disable,
225 .play_dead = native_play_dead,
226
223 .send_call_func_ipi = native_send_call_func_ipi, 227 .send_call_func_ipi = native_send_call_func_ipi,
224 .send_call_func_single_ipi = native_send_call_func_single_ipi, 228 .send_call_func_single_ipi = native_send_call_func_single_ipi,
225}; 229};
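The smp_ops change above groups the CPU-hotplug entry points (cpu_up, cpu_disable, cpu_die, play_dead) together in the ops table. As a generic illustration of that function-pointer-table pattern, here is a sketch whose struct and stub names are invented rather than the kernel's:

#include <stdio.h>

/* An ops table in the style of struct smp_ops: hooks that one backend
 * (here "native") fills in with designated initializers. */
struct cpu_hotplug_ops {
	int  (*cpu_up)(unsigned int cpu);
	int  (*cpu_disable)(void);
	void (*cpu_die)(unsigned int cpu);
	void (*play_dead)(void);
};

static int native_cpu_up_stub(unsigned int cpu)
{
	printf("bringing cpu%u up\n", cpu);
	return 0;
}

static int native_cpu_disable_stub(void)
{
	printf("disabling current cpu\n");
	return 0;
}

static void native_cpu_die_stub(unsigned int cpu)
{
	printf("waiting for cpu%u to die\n", cpu);
}

static void native_play_dead_stub(void)
{
	printf("cpu idling in offline state\n");
}

static struct cpu_hotplug_ops hotplug_ops = {
	.cpu_up      = native_cpu_up_stub,
	.cpu_disable = native_cpu_disable_stub,
	.cpu_die     = native_cpu_die_stub,
	.play_dead   = native_play_dead_stub,
};

int main(void)
{
	hotplug_ops.cpu_up(1);
	hotplug_ops.cpu_disable();
	hotplug_ops.cpu_die(1);
	hotplug_ops.play_dead();
	return 0;
}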
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 687376ab07e8..7b1093397319 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -52,6 +52,7 @@
52#include <asm/desc.h> 52#include <asm/desc.h>
53#include <asm/nmi.h> 53#include <asm/nmi.h>
54#include <asm/irq.h> 54#include <asm/irq.h>
55#include <asm/idle.h>
55#include <asm/smp.h> 56#include <asm/smp.h>
56#include <asm/trampoline.h> 57#include <asm/trampoline.h>
57#include <asm/cpu.h> 58#include <asm/cpu.h>
@@ -88,7 +89,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
88#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) 89#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
89#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) 90#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
90#else 91#else
91struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; 92static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
92#define get_idle_for_cpu(x) (idle_thread_array[(x)]) 93#define get_idle_for_cpu(x) (idle_thread_array[(x)])
93#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p)) 94#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
94#endif 95#endif
@@ -123,13 +124,12 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
123 124
124static atomic_t init_deasserted; 125static atomic_t init_deasserted;
125 126
126static int boot_cpu_logical_apicid;
127 127
128/* representing cpus for which sibling maps can be computed */ 128/* representing cpus for which sibling maps can be computed */
129static cpumask_t cpu_sibling_setup_map; 129static cpumask_t cpu_sibling_setup_map;
130 130
131/* Set if we find a B stepping CPU */ 131/* Set if we find a B stepping CPU */
132int __cpuinitdata smp_b_stepping; 132static int __cpuinitdata smp_b_stepping;
133 133
134#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) 134#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
135 135
@@ -165,6 +165,8 @@ static void unmap_cpu_to_node(int cpu)
165#endif 165#endif
166 166
167#ifdef CONFIG_X86_32 167#ifdef CONFIG_X86_32
168static int boot_cpu_logical_apicid;
169
168u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = 170u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
169 { [0 ... NR_CPUS-1] = BAD_APICID }; 171 { [0 ... NR_CPUS-1] = BAD_APICID };
170 172
@@ -210,13 +212,13 @@ static void __cpuinit smp_callin(void)
210 /* 212 /*
211 * (This works even if the APIC is not enabled.) 213 * (This works even if the APIC is not enabled.)
212 */ 214 */
213 phys_id = GET_APIC_ID(read_apic_id()); 215 phys_id = read_apic_id();
214 cpuid = smp_processor_id(); 216 cpuid = smp_processor_id();
215 if (cpu_isset(cpuid, cpu_callin_map)) { 217 if (cpu_isset(cpuid, cpu_callin_map)) {
216 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, 218 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
217 phys_id, cpuid); 219 phys_id, cpuid);
218 } 220 }
219 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); 221 pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
220 222
221 /* 223 /*
222 * STARTUP IPIs are fragile beasts as they might sometimes 224 * STARTUP IPIs are fragile beasts as they might sometimes
@@ -251,12 +253,13 @@ static void __cpuinit smp_callin(void)
251 * boards) 253 * boards)
252 */ 254 */
253 255
254 Dprintk("CALLIN, before setup_local_APIC().\n"); 256 pr_debug("CALLIN, before setup_local_APIC().\n");
255 smp_callin_clear_local_apic(); 257 smp_callin_clear_local_apic();
256 setup_local_APIC(); 258 setup_local_APIC();
257 end_local_APIC_setup(); 259 end_local_APIC_setup();
258 map_cpu_to_logical_apicid(); 260 map_cpu_to_logical_apicid();
259 261
262 notify_cpu_starting(cpuid);
260 /* 263 /*
261 * Get our bogomips. 264 * Get our bogomips.
262 * 265 *
@@ -266,7 +269,7 @@ static void __cpuinit smp_callin(void)
266 local_irq_enable(); 269 local_irq_enable();
267 calibrate_delay(); 270 calibrate_delay();
268 local_irq_disable(); 271 local_irq_disable();
269 Dprintk("Stack at about %p\n", &cpuid); 272 pr_debug("Stack at about %p\n", &cpuid);
270 273
271 /* 274 /*
272 * Save our processor parameters 275 * Save our processor parameters
@@ -279,6 +282,8 @@ static void __cpuinit smp_callin(void)
279 cpu_set(cpuid, cpu_callin_map); 282 cpu_set(cpuid, cpu_callin_map);
280} 283}
281 284
285static int __cpuinitdata unsafe_smp;
286
282/* 287/*
283 * Activate a secondary processor. 288 * Activate a secondary processor.
284 */ 289 */
@@ -326,15 +331,22 @@ static void __cpuinit start_secondary(void *unused)
326 * for which cpus receive the IPI. Holding this 331 * for which cpus receive the IPI. Holding this
327 * lock helps us to not include this cpu in a currently in progress 332 * lock helps us to not include this cpu in a currently in progress
328 * smp_call_function(). 333 * smp_call_function().
334 *
335 * We need to hold vector_lock so there the set of online cpus
336 * does not change while we are assigning vectors to cpus. Holding
337 * this lock ensures we don't half assign or remove an irq from a cpu.
329 */ 338 */
330 ipi_call_lock_irq(); 339 ipi_call_lock();
331#ifdef CONFIG_X86_IO_APIC 340 lock_vector_lock();
332 setup_vector_irq(smp_processor_id()); 341 __setup_vector_irq(smp_processor_id());
333#endif
334 cpu_set(smp_processor_id(), cpu_online_map); 342 cpu_set(smp_processor_id(), cpu_online_map);
335 ipi_call_unlock_irq(); 343 unlock_vector_lock();
344 ipi_call_unlock();
336 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 345 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
337 346
347 /* enable local interrupts */
348 local_irq_enable();
349
338 setup_secondary_clock(); 350 setup_secondary_clock();
339 351
340 wmb(); 352 wmb();
@@ -387,7 +399,7 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
387 goto valid_k7; 399 goto valid_k7;
388 400
389 /* If we get here, not a certified SMP capable AMD system. */ 401 /* If we get here, not a certified SMP capable AMD system. */
390 add_taint(TAINT_UNSAFE_SMP); 402 unsafe_smp = 1;
391 } 403 }
392 404
393valid_k7: 405valid_k7:
@@ -404,12 +416,10 @@ static void __cpuinit smp_checks(void)
404 * Don't taint if we are running SMP kernel on a single non-MP 416 * Don't taint if we are running SMP kernel on a single non-MP
405 * approved Athlon 417 * approved Athlon
406 */ 418 */
407 if (tainted & TAINT_UNSAFE_SMP) { 419 if (unsafe_smp && num_online_cpus() > 1) {
408 if (num_online_cpus()) 420 printk(KERN_INFO "WARNING: This combination of AMD"
409 printk(KERN_INFO "WARNING: This combination of AMD" 421 "processors is not suitable for SMP.\n");
410 "processors is not suitable for SMP.\n"); 422 add_taint(TAINT_UNSAFE_SMP);
411 else
412 tainted &= ~TAINT_UNSAFE_SMP;
413 } 423 }
414} 424}
415 425
@@ -438,7 +448,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
438 cpu_set(cpu, cpu_sibling_setup_map); 448 cpu_set(cpu, cpu_sibling_setup_map);
439 449
440 if (smp_num_siblings > 1) { 450 if (smp_num_siblings > 1) {
441 for_each_cpu_mask(i, cpu_sibling_setup_map) { 451 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
442 if (c->phys_proc_id == cpu_data(i).phys_proc_id && 452 if (c->phys_proc_id == cpu_data(i).phys_proc_id &&
443 c->cpu_core_id == cpu_data(i).cpu_core_id) { 453 c->cpu_core_id == cpu_data(i).cpu_core_id) {
444 cpu_set(i, per_cpu(cpu_sibling_map, cpu)); 454 cpu_set(i, per_cpu(cpu_sibling_map, cpu));
@@ -461,7 +471,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
461 return; 471 return;
462 } 472 }
463 473
464 for_each_cpu_mask(i, cpu_sibling_setup_map) { 474 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
465 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 475 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
466 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 476 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
467 cpu_set(i, c->llc_shared_map); 477 cpu_set(i, c->llc_shared_map);
@@ -513,7 +523,7 @@ static void impress_friends(void)
513 /* 523 /*
514 * Allow the user to impress friends. 524 * Allow the user to impress friends.
515 */ 525 */
516 Dprintk("Before bogomips.\n"); 526 pr_debug("Before bogomips.\n");
517 for_each_possible_cpu(cpu) 527 for_each_possible_cpu(cpu)
518 if (cpu_isset(cpu, cpu_callout_map)) 528 if (cpu_isset(cpu, cpu_callout_map))
519 bogosum += cpu_data(cpu).loops_per_jiffy; 529 bogosum += cpu_data(cpu).loops_per_jiffy;
@@ -523,7 +533,7 @@ static void impress_friends(void)
523 bogosum/(500000/HZ), 533 bogosum/(500000/HZ),
524 (bogosum/(5000/HZ))%100); 534 (bogosum/(5000/HZ))%100);
525 535
526 Dprintk("Before bogocount - setting activated=1.\n"); 536 pr_debug("Before bogocount - setting activated=1.\n");
527} 537}
528 538
529static inline void __inquire_remote_apic(int apicid) 539static inline void __inquire_remote_apic(int apicid)
@@ -533,10 +543,10 @@ static inline void __inquire_remote_apic(int apicid)
533 int timeout; 543 int timeout;
534 u32 status; 544 u32 status;
535 545
536 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); 546 printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid);
537 547
538 for (i = 0; i < ARRAY_SIZE(regs); i++) { 548 for (i = 0; i < ARRAY_SIZE(regs); i++) {
539 printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]); 549 printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]);
540 550
541 /* 551 /*
542 * Wait for idle. 552 * Wait for idle.
@@ -546,8 +556,7 @@ static inline void __inquire_remote_apic(int apicid)
546 printk(KERN_CONT 556 printk(KERN_CONT
547 "a previous APIC delivery may have failed\n"); 557 "a previous APIC delivery may have failed\n");
548 558
549 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 559 apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
550 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
551 560
552 timeout = 0; 561 timeout = 0;
553 do { 562 do {
@@ -579,29 +588,24 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
579 int maxlvt; 588 int maxlvt;
580 589
581 /* Target chip */ 590 /* Target chip */
582 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
583
584 /* Boot on the stack */ 591 /* Boot on the stack */
585 /* Kick the second */ 592 /* Kick the second */
586 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); 593 apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid);
587 594
588 Dprintk("Waiting for send to finish...\n"); 595 pr_debug("Waiting for send to finish...\n");
589 send_status = safe_apic_wait_icr_idle(); 596 send_status = safe_apic_wait_icr_idle();
590 597
591 /* 598 /*
592 * Give the other CPU some time to accept the IPI. 599 * Give the other CPU some time to accept the IPI.
593 */ 600 */
594 udelay(200); 601 udelay(200);
595 /* 602 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
596 * Due to the Pentium erratum 3AP. 603 maxlvt = lapic_get_maxlvt();
597 */ 604 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
598 maxlvt = lapic_get_maxlvt(); 605 apic_write(APIC_ESR, 0);
599 if (maxlvt > 3) { 606 accept_status = (apic_read(APIC_ESR) & 0xEF);
600 apic_read_around(APIC_SPIV);
601 apic_write(APIC_ESR, 0);
602 } 607 }
603 accept_status = (apic_read(APIC_ESR) & 0xEF); 608 pr_debug("NMI sent.\n");
604 Dprintk("NMI sent.\n");
605 609
606 if (send_status) 610 if (send_status)
607 printk(KERN_ERR "APIC never delivered???\n"); 611 printk(KERN_ERR "APIC never delivered???\n");
@@ -625,42 +629,40 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
625 return send_status; 629 return send_status;
626 } 630 }
627 631
632 maxlvt = lapic_get_maxlvt();
633
628 /* 634 /*
629 * Be paranoid about clearing APIC errors. 635 * Be paranoid about clearing APIC errors.
630 */ 636 */
631 if (APIC_INTEGRATED(apic_version[phys_apicid])) { 637 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
632 apic_read_around(APIC_SPIV); 638 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
633 apic_write(APIC_ESR, 0); 639 apic_write(APIC_ESR, 0);
634 apic_read(APIC_ESR); 640 apic_read(APIC_ESR);
635 } 641 }
636 642
637 Dprintk("Asserting INIT.\n"); 643 pr_debug("Asserting INIT.\n");
638 644
639 /* 645 /*
640 * Turn INIT on target chip 646 * Turn INIT on target chip
641 */ 647 */
642 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
643
644 /* 648 /*
645 * Send IPI 649 * Send IPI
646 */ 650 */
647 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT 651 apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
648 | APIC_DM_INIT); 652 phys_apicid);
649 653
650 Dprintk("Waiting for send to finish...\n"); 654 pr_debug("Waiting for send to finish...\n");
651 send_status = safe_apic_wait_icr_idle(); 655 send_status = safe_apic_wait_icr_idle();
652 656
653 mdelay(10); 657 mdelay(10);
654 658
655 Dprintk("Deasserting INIT.\n"); 659 pr_debug("Deasserting INIT.\n");
656 660
657 /* Target chip */ 661 /* Target chip */
658 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
659
660 /* Send IPI */ 662 /* Send IPI */
661 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 663 apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
662 664
663 Dprintk("Waiting for send to finish...\n"); 665 pr_debug("Waiting for send to finish...\n");
664 send_status = safe_apic_wait_icr_idle(); 666 send_status = safe_apic_wait_icr_idle();
665 667
666 mb(); 668 mb();
@@ -687,55 +689,46 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
687 /* 689 /*
688 * Run STARTUP IPI loop. 690 * Run STARTUP IPI loop.
689 */ 691 */
690 Dprintk("#startup loops: %d.\n", num_starts); 692 pr_debug("#startup loops: %d.\n", num_starts);
691
692 maxlvt = lapic_get_maxlvt();
693 693
694 for (j = 1; j <= num_starts; j++) { 694 for (j = 1; j <= num_starts; j++) {
695 Dprintk("Sending STARTUP #%d.\n", j); 695 pr_debug("Sending STARTUP #%d.\n", j);
696 apic_read_around(APIC_SPIV); 696 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
697 apic_write(APIC_ESR, 0); 697 apic_write(APIC_ESR, 0);
698 apic_read(APIC_ESR); 698 apic_read(APIC_ESR);
699 Dprintk("After apic_write.\n"); 699 pr_debug("After apic_write.\n");
700 700
701 /* 701 /*
702 * STARTUP IPI 702 * STARTUP IPI
703 */ 703 */
704 704
705 /* Target chip */ 705 /* Target chip */
706 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
707
708 /* Boot on the stack */ 706 /* Boot on the stack */
709 /* Kick the second */ 707 /* Kick the second */
710 apic_write_around(APIC_ICR, APIC_DM_STARTUP 708 apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
711 | (start_eip >> 12)); 709 phys_apicid);
712 710
713 /* 711 /*
714 * Give the other CPU some time to accept the IPI. 712 * Give the other CPU some time to accept the IPI.
715 */ 713 */
716 udelay(300); 714 udelay(300);
717 715
718 Dprintk("Startup point 1.\n"); 716 pr_debug("Startup point 1.\n");
719 717
720 Dprintk("Waiting for send to finish...\n"); 718 pr_debug("Waiting for send to finish...\n");
721 send_status = safe_apic_wait_icr_idle(); 719 send_status = safe_apic_wait_icr_idle();
722 720
723 /* 721 /*
724 * Give the other CPU some time to accept the IPI. 722 * Give the other CPU some time to accept the IPI.
725 */ 723 */
726 udelay(200); 724 udelay(200);
727 /* 725 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
728 * Due to the Pentium erratum 3AP.
729 */
730 if (maxlvt > 3) {
731 apic_read_around(APIC_SPIV);
732 apic_write(APIC_ESR, 0); 726 apic_write(APIC_ESR, 0);
733 }
734 accept_status = (apic_read(APIC_ESR) & 0xEF); 727 accept_status = (apic_read(APIC_ESR) & 0xEF);
735 if (send_status || accept_status) 728 if (send_status || accept_status)
736 break; 729 break;
737 } 730 }
738 Dprintk("After Startup.\n"); 731 pr_debug("After Startup.\n");
739 732
740 if (send_status) 733 if (send_status)
741 printk(KERN_ERR "APIC never delivered???\n"); 734 printk(KERN_ERR "APIC never delivered???\n");
@@ -763,12 +756,20 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
763} 756}
764 757
765#ifdef CONFIG_X86_64 758#ifdef CONFIG_X86_64
759
760/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
761static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
762{
763 if (!after_bootmem)
764 free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
765}
766
766/* 767/*
767 * Allocate node local memory for the AP pda. 768 * Allocate node local memory for the AP pda.
768 * 769 *
769 * Must be called after the _cpu_pda pointer table is initialized. 770 * Must be called after the _cpu_pda pointer table is initialized.
770 */ 771 */
771static int __cpuinit get_local_pda(int cpu) 772int __cpuinit get_local_pda(int cpu)
772{ 773{
773 struct x8664_pda *oldpda, *newpda; 774 struct x8664_pda *oldpda, *newpda;
774 unsigned long size = sizeof(struct x8664_pda); 775 unsigned long size = sizeof(struct x8664_pda);
@@ -791,8 +792,7 @@ static int __cpuinit get_local_pda(int cpu)
791 792
792 if (oldpda) { 793 if (oldpda) {
793 memcpy(newpda, oldpda, size); 794 memcpy(newpda, oldpda, size);
794 if (!after_bootmem) 795 free_bootmem_pda(oldpda);
795 free_bootmem((unsigned long)oldpda, size);
796 } 796 }
797 797
798 newpda->in_bootmem = 0; 798 newpda->in_bootmem = 0;
@@ -874,7 +874,7 @@ do_rest:
874 start_ip = setup_trampoline(); 874 start_ip = setup_trampoline();
875 875
876 /* So we see what's up */ 876 /* So we see what's up */
877 printk(KERN_INFO "Booting processor %d/%d ip %lx\n", 877 printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n",
878 cpu, apicid, start_ip); 878 cpu, apicid, start_ip);
879 879
880 /* 880 /*
@@ -886,16 +886,18 @@ do_rest:
886 886
887 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 887 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
888 888
889 Dprintk("Setting warm reset code and vector.\n"); 889 pr_debug("Setting warm reset code and vector.\n");
890 890
891 store_NMI_vector(&nmi_high, &nmi_low); 891 store_NMI_vector(&nmi_high, &nmi_low);
892 892
893 smpboot_setup_warm_reset_vector(start_ip); 893 smpboot_setup_warm_reset_vector(start_ip);
894 /* 894 /*
895 * Be paranoid about clearing APIC errors. 895 * Be paranoid about clearing APIC errors.
896 */ 896 */
897 apic_write(APIC_ESR, 0); 897 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
898 apic_read(APIC_ESR); 898 apic_write(APIC_ESR, 0);
899 apic_read(APIC_ESR);
900 }
899 } 901 }
900 902
901 /* 903 /*
@@ -907,9 +909,9 @@ do_rest:
907 /* 909 /*
908 * allow APs to start initializing. 910 * allow APs to start initializing.
909 */ 911 */
910 Dprintk("Before Callout %d.\n", cpu); 912 pr_debug("Before Callout %d.\n", cpu);
911 cpu_set(cpu, cpu_callout_map); 913 cpu_set(cpu, cpu_callout_map);
912 Dprintk("After Callout %d.\n", cpu); 914 pr_debug("After Callout %d.\n", cpu);
913 915
914 /* 916 /*
915 * Wait 5s total for a response 917 * Wait 5s total for a response
@@ -922,10 +924,10 @@ do_rest:
922 924
923 if (cpu_isset(cpu, cpu_callin_map)) { 925 if (cpu_isset(cpu, cpu_callin_map)) {
924 /* number CPUs logically, starting from 1 (BSP is 0) */ 926 /* number CPUs logically, starting from 1 (BSP is 0) */
925 Dprintk("OK.\n"); 927 pr_debug("OK.\n");
926 printk(KERN_INFO "CPU%d: ", cpu); 928 printk(KERN_INFO "CPU%d: ", cpu);
927 print_cpu_info(&cpu_data(cpu)); 929 print_cpu_info(&cpu_data(cpu));
928 Dprintk("CPU has booted.\n"); 930 pr_debug("CPU has booted.\n");
929 } else { 931 } else {
930 boot_error = 1; 932 boot_error = 1;
931 if (*((volatile unsigned char *)trampoline_base) 933 if (*((volatile unsigned char *)trampoline_base)
@@ -970,7 +972,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
970 972
971 WARN_ON(irqs_disabled()); 973 WARN_ON(irqs_disabled());
972 974
973 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); 975 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
974 976
975 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || 977 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
976 !physid_isset(apicid, phys_cpu_present_map)) { 978 !physid_isset(apicid, phys_cpu_present_map)) {
@@ -982,7 +984,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
982 * Already booted CPU? 984 * Already booted CPU?
983 */ 985 */
984 if (cpu_isset(cpu, cpu_callin_map)) { 986 if (cpu_isset(cpu, cpu_callin_map)) {
985 Dprintk("do_boot_cpu %d Already started\n", cpu); 987 pr_debug("do_boot_cpu %d Already started\n", cpu);
986 return -ENOSYS; 988 return -ENOSYS;
987 } 989 }
988 990
@@ -1009,7 +1011,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
1009 err = do_boot_cpu(apicid, cpu); 1011 err = do_boot_cpu(apicid, cpu);
1010#endif 1012#endif
1011 if (err) { 1013 if (err) {
1012 Dprintk("do_boot_cpu failed %d\n", err); 1014 pr_debug("do_boot_cpu failed %d\n", err);
1013 return -EIO; 1015 return -EIO;
1014 } 1016 }
1015 1017
@@ -1055,6 +1057,34 @@ static __init void disable_smp(void)
1055static int __init smp_sanity_check(unsigned max_cpus) 1057static int __init smp_sanity_check(unsigned max_cpus)
1056{ 1058{
1057 preempt_disable(); 1059 preempt_disable();
1060
1061#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
1062 if (def_to_bigsmp && nr_cpu_ids > 8) {
1063 unsigned int cpu;
1064 unsigned nr;
1065
1066 printk(KERN_WARNING
1067 "More than 8 CPUs detected - skipping them.\n"
1068 "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n");
1069
1070 nr = 0;
1071 for_each_present_cpu(cpu) {
1072 if (nr >= 8)
1073 cpu_clear(cpu, cpu_present_map);
1074 nr++;
1075 }
1076
1077 nr = 0;
1078 for_each_possible_cpu(cpu) {
1079 if (nr >= 8)
1080 cpu_clear(cpu, cpu_possible_map);
1081 nr++;
1082 }
1083
1084 nr_cpu_ids = 8;
1085 }
1086#endif
1087
1058 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { 1088 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1059 printk(KERN_WARNING "weird, boot CPU (#%d) not listed" 1089 printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
1060 "by the BIOS.\n", hard_smp_processor_id()); 1090 "by the BIOS.\n", hard_smp_processor_id());
@@ -1147,10 +1177,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1147 * Setup boot CPU information 1177 * Setup boot CPU information
1148 */ 1178 */
1149 smp_store_cpu_info(0); /* Final full version of the data */ 1179 smp_store_cpu_info(0); /* Final full version of the data */
1180#ifdef CONFIG_X86_32
1150 boot_cpu_logical_apicid = logical_smp_processor_id(); 1181 boot_cpu_logical_apicid = logical_smp_processor_id();
1182#endif
1151 current_thread_info()->cpu = 0; /* needed? */ 1183 current_thread_info()->cpu = 0; /* needed? */
1152 set_cpu_sibling_map(0); 1184 set_cpu_sibling_map(0);
1153 1185
1186#ifdef CONFIG_X86_64
1187 enable_IR_x2apic();
1188 setup_apic_routing();
1189#endif
1190
1154 if (smp_sanity_check(max_cpus) < 0) { 1191 if (smp_sanity_check(max_cpus) < 0) {
1155 printk(KERN_INFO "SMP disabled\n"); 1192 printk(KERN_INFO "SMP disabled\n");
1156 disable_smp(); 1193 disable_smp();
@@ -1158,9 +1195,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1158 } 1195 }
1159 1196
1160 preempt_disable(); 1197 preempt_disable();
1161 if (GET_APIC_ID(read_apic_id()) != boot_cpu_physical_apicid) { 1198 if (read_apic_id() != boot_cpu_physical_apicid) {
1162 panic("Boot APIC ID in local APIC unexpected (%d vs %d)", 1199 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
1163 GET_APIC_ID(read_apic_id()), boot_cpu_physical_apicid); 1200 read_apic_id(), boot_cpu_physical_apicid);
1164 /* Or can we switch back to PIC here? */ 1201 /* Or can we switch back to PIC here? */
1165 } 1202 }
1166 preempt_enable(); 1203 preempt_enable();
@@ -1193,6 +1230,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1193 printk(KERN_INFO "CPU%d: ", 0); 1230 printk(KERN_INFO "CPU%d: ", 0);
1194 print_cpu_info(&cpu_data(0)); 1231 print_cpu_info(&cpu_data(0));
1195 setup_boot_clock(); 1232 setup_boot_clock();
1233
1234 if (is_uv_system())
1235 uv_system_init();
1196out: 1236out:
1197 preempt_enable(); 1237 preempt_enable();
1198} 1238}
@@ -1213,7 +1253,7 @@ void __init native_smp_prepare_boot_cpu(void)
1213 1253
1214void __init native_smp_cpus_done(unsigned int max_cpus) 1254void __init native_smp_cpus_done(unsigned int max_cpus)
1215{ 1255{
1216 Dprintk("Boot done.\n"); 1256 pr_debug("Boot done.\n");
1217 1257
1218 impress_friends(); 1258 impress_friends();
1219 smp_checks(); 1259 smp_checks();
@@ -1223,39 +1263,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1223 check_nmi_watchdog(); 1263 check_nmi_watchdog();
1224} 1264}
1225 1265
1226#ifdef CONFIG_HOTPLUG_CPU
1227
1228static void remove_siblinginfo(int cpu)
1229{
1230 int sibling;
1231 struct cpuinfo_x86 *c = &cpu_data(cpu);
1232
1233 for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) {
1234 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1235 /*/
1236 * last thread sibling in this cpu core going down
1237 */
1238 if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
1239 cpu_data(sibling).booted_cores--;
1240 }
1241
1242 for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu))
1243 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1244 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1245 cpus_clear(per_cpu(cpu_core_map, cpu));
1246 c->phys_proc_id = 0;
1247 c->cpu_core_id = 0;
1248 cpu_clear(cpu, cpu_sibling_setup_map);
1249}
1250
1251static int additional_cpus __initdata = -1;
1252
1253static __init int setup_additional_cpus(char *s)
1254{
1255 return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
1256}
1257early_param("additional_cpus", setup_additional_cpus);
1258
1259/* 1266/*
1260 * cpu_possible_map should be static, it cannot change as cpu's 1267 * cpu_possible_map should be static, it cannot change as cpu's
1261 * are onlined, or offlined. The reason is per-cpu data-structures 1268 * are onlined, or offlined. The reason is per-cpu data-structures
@@ -1275,24 +1282,13 @@ early_param("additional_cpus", setup_additional_cpus);
1275 */ 1282 */
1276__init void prefill_possible_map(void) 1283__init void prefill_possible_map(void)
1277{ 1284{
1278 int i; 1285 int i, possible;
1279 int possible;
1280 1286
1281 /* no processor from mptable or madt */ 1287 /* no processor from mptable or madt */
1282 if (!num_processors) 1288 if (!num_processors)
1283 num_processors = 1; 1289 num_processors = 1;
1284 1290
1285#ifdef CONFIG_HOTPLUG_CPU 1291 possible = num_processors + disabled_cpus;
1286 if (additional_cpus == -1) {
1287 if (disabled_cpus > 0)
1288 additional_cpus = disabled_cpus;
1289 else
1290 additional_cpus = 0;
1291 }
1292#else
1293 additional_cpus = 0;
1294#endif
1295 possible = num_processors + additional_cpus;
1296 if (possible > NR_CPUS) 1292 if (possible > NR_CPUS)
1297 possible = NR_CPUS; 1293 possible = NR_CPUS;
1298 1294
@@ -1305,17 +1301,64 @@ __init void prefill_possible_map(void)
1305 nr_cpu_ids = possible; 1301 nr_cpu_ids = possible;
1306} 1302}
1307 1303
1304#ifdef CONFIG_HOTPLUG_CPU
1305
1306static void remove_siblinginfo(int cpu)
1307{
1308 int sibling;
1309 struct cpuinfo_x86 *c = &cpu_data(cpu);
1310
1311 for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
1312 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1313 /*/
1314 * last thread sibling in this cpu core going down
1315 */
1316 if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
1317 cpu_data(sibling).booted_cores--;
1318 }
1319
1320 for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
1321 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1322 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1323 cpus_clear(per_cpu(cpu_core_map, cpu));
1324 c->phys_proc_id = 0;
1325 c->cpu_core_id = 0;
1326 cpu_clear(cpu, cpu_sibling_setup_map);
1327}
1328
1308static void __ref remove_cpu_from_maps(int cpu) 1329static void __ref remove_cpu_from_maps(int cpu)
1309{ 1330{
1310 cpu_clear(cpu, cpu_online_map); 1331 cpu_clear(cpu, cpu_online_map);
1311 cpu_clear(cpu, cpu_callout_map); 1332 cpu_clear(cpu, cpu_callout_map);
1312 cpu_clear(cpu, cpu_callin_map); 1333 cpu_clear(cpu, cpu_callin_map);
1313 /* was set by cpu_init() */ 1334 /* was set by cpu_init() */
1314 clear_bit(cpu, (unsigned long *)&cpu_initialized); 1335 cpu_clear(cpu, cpu_initialized);
1315 numa_remove_cpu(cpu); 1336 numa_remove_cpu(cpu);
1316} 1337}
1317 1338
1318int __cpu_disable(void) 1339void cpu_disable_common(void)
1340{
1341 int cpu = smp_processor_id();
1342 /*
1343 * HACK:
1344 * Allow any queued timer interrupts to get serviced
1345 * This is only a temporary solution until we cleanup
1346 * fixup_irqs as we do for IA64.
1347 */
1348 local_irq_enable();
1349 mdelay(1);
1350
1351 local_irq_disable();
1352 remove_siblinginfo(cpu);
1353
1354 /* It's now safe to remove this processor from the online map */
1355 lock_vector_lock();
1356 remove_cpu_from_maps(cpu);
1357 unlock_vector_lock();
1358 fixup_irqs(cpu_online_map);
1359}
1360
1361int native_cpu_disable(void)
1319{ 1362{
1320 int cpu = smp_processor_id(); 1363 int cpu = smp_processor_id();
1321 1364
@@ -1334,25 +1377,11 @@ int __cpu_disable(void)
1334 stop_apic_nmi_watchdog(NULL); 1377 stop_apic_nmi_watchdog(NULL);
1335 clear_local_APIC(); 1378 clear_local_APIC();
1336 1379
1337 /* 1380 cpu_disable_common();
1338 * HACK:
1339 * Allow any queued timer interrupts to get serviced
1340 * This is only a temporary solution until we cleanup
1341 * fixup_irqs as we do for IA64.
1342 */
1343 local_irq_enable();
1344 mdelay(1);
1345
1346 local_irq_disable();
1347 remove_siblinginfo(cpu);
1348
1349 /* It's now safe to remove this processor from the online map */
1350 remove_cpu_from_maps(cpu);
1351 fixup_irqs(cpu_online_map);
1352 return 0; 1381 return 0;
1353} 1382}
1354 1383
1355void __cpu_die(unsigned int cpu) 1384void native_cpu_die(unsigned int cpu)
1356{ 1385{
1357 /* We don't do anything here: idle task is faking death itself. */ 1386 /* We don't do anything here: idle task is faking death itself. */
1358 unsigned int i; 1387 unsigned int i;
@@ -1369,28 +1398,45 @@ void __cpu_die(unsigned int cpu)
1369 } 1398 }
1370 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1399 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1371} 1400}
1401
1402void play_dead_common(void)
1403{
1404 idle_task_exit();
1405 reset_lazy_tlbstate();
1406 irq_ctx_exit(raw_smp_processor_id());
1407 c1e_remove_cpu(raw_smp_processor_id());
1408
1409 mb();
1410 /* Ack it */
1411 __get_cpu_var(cpu_state) = CPU_DEAD;
1412
1413 /*
1414 * With physical CPU hotplug, we should halt the cpu
1415 */
1416 local_irq_disable();
1417}
1418
1419void native_play_dead(void)
1420{
1421 play_dead_common();
1422 wbinvd_halt();
1423}
1424
1372#else /* ... !CONFIG_HOTPLUG_CPU */ 1425#else /* ... !CONFIG_HOTPLUG_CPU */
1373int __cpu_disable(void) 1426int native_cpu_disable(void)
1374{ 1427{
1375 return -ENOSYS; 1428 return -ENOSYS;
1376} 1429}
1377 1430
1378void __cpu_die(unsigned int cpu) 1431void native_cpu_die(unsigned int cpu)
1379{ 1432{
1380 /* We said "no" in __cpu_disable */ 1433 /* We said "no" in __cpu_disable */
1381 BUG(); 1434 BUG();
1382} 1435}
1383#endif
1384 1436
1385/* 1437void native_play_dead(void)
1386 * If the BIOS enumerates physical processors before logical,
1387 * maxcpus=N at enumeration-time can be used to disable HT.
1388 */
1389static int __init parse_maxcpus(char *arg)
1390{ 1438{
1391 extern unsigned int maxcpus; 1439 BUG();
1392
1393 maxcpus = simple_strtoul(arg, NULL, 0);
1394 return 0;
1395} 1440}
1396early_param("maxcpus", parse_maxcpus); 1441
1442#endif
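
The recurring change in the smpboot.c hunks above folds the old two-step ICR programming (write the destination into APIC_ICR2, then the command word into APIC_ICR) into a single apic_icr_write(low, apicid) call. Below is a minimal toy model of that equivalence, not kernel code: plain variables stand in for the MMIO registers, icr_write_old and the 0x0000C500 command word are illustrative, and it assumes the new helper composes the same two writes for the xAPIC case.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t icr2, icr;                    /* stand-ins for the MMIO registers */

#define SET_APIC_DEST_FIELD(x) ((uint32_t)(x) << 24)   /* xAPIC: dest in ICR2[31:24] */

static void icr_write_old(uint32_t low, uint32_t apicid)
{
	icr2 = SET_APIC_DEST_FIELD(apicid);   /* old step 1: target chip  */
	icr  = low;                           /* old step 2: send command */
}

static void apic_icr_write(uint32_t low, uint32_t apicid)
{
	icr_write_old(low, apicid);           /* assumed: same composition */
}

int main(void)
{
	uint32_t lo = 0x0000C500;  /* illustrative LEVELTRIG | ASSERT | DM_INIT word */

	icr_write_old(lo, 3);
	uint32_t old_icr2 = icr2, old_icr = icr;

	apic_icr_write(lo, 3);
	assert(old_icr2 == icr2 && old_icr == icr);   /* same register state either way */
	printf("ICR2=%#010x ICR=%#010x\n", icr2, icr);
	return 0;
}
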
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 99941b37eca0..397e309839dd 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -8,18 +8,21 @@
8DEFINE_PER_CPU(unsigned long, this_cpu_off); 8DEFINE_PER_CPU(unsigned long, this_cpu_off);
9EXPORT_PER_CPU_SYMBOL(this_cpu_off); 9EXPORT_PER_CPU_SYMBOL(this_cpu_off);
10 10
11/* Initialize the CPU's GDT. This is either the boot CPU doing itself 11/*
12 (still using the master per-cpu area), or a CPU doing it for a 12 * Initialize the CPU's GDT. This is either the boot CPU doing itself
13 secondary which will soon come up. */ 13 * (still using the master per-cpu area), or a CPU doing it for a
14 * secondary which will soon come up.
15 */
14__cpuinit void init_gdt(int cpu) 16__cpuinit void init_gdt(int cpu)
15{ 17{
16 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 18 struct desc_struct gdt;
17 19
18 pack_descriptor(&gdt[GDT_ENTRY_PERCPU], 20 pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
19 __per_cpu_offset[cpu], 0xFFFFF,
20 0x2 | DESCTYPE_S, 0x8); 21 0x2 | DESCTYPE_S, 0x8);
22 gdt.s = 1;
21 23
22 gdt[GDT_ENTRY_PERCPU].s = 1; 24 write_gdt_entry(get_cpu_gdt_table(cpu),
25 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
23 26
24 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; 27 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
25 per_cpu(cpu_number, cpu) = cpu; 28 per_cpu(cpu_number, cpu) = cpu;
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
deleted file mode 100644
index 8b137891791f..000000000000
--- a/arch/x86/kernel/smpcommon_32.c
+++ /dev/null
@@ -1 +0,0 @@
1
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 92c20fee6781..e8b9863ef8c4 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
105static int enable_single_step(struct task_struct *child) 105static int enable_single_step(struct task_struct *child)
106{ 106{
107 struct pt_regs *regs = task_pt_regs(child); 107 struct pt_regs *regs = task_pt_regs(child);
108 unsigned long oflags;
109
110 /*
111 * If we stepped into a sysenter/syscall insn, it trapped in
112 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
113 * If user-mode had set TF itself, then it's still clear from
114 * do_debug() and we need to set it again to restore the user
115 * state so we don't wrongly set TIF_FORCED_TF below.
116 * If enable_single_step() was used last and that is what
117 * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are
118 * already set and our bookkeeping is fine.
119 */
120 if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP)))
121 regs->flags |= X86_EFLAGS_TF;
108 122
109 /* 123 /*
110 * Always set TIF_SINGLESTEP - this guarantees that 124 * Always set TIF_SINGLESTEP - this guarantees that
@@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child)
113 */ 127 */
114 set_tsk_thread_flag(child, TIF_SINGLESTEP); 128 set_tsk_thread_flag(child, TIF_SINGLESTEP);
115 129
116 /* 130 oflags = regs->flags;
117 * If TF was already set, don't do anything else
118 */
119 if (regs->flags & X86_EFLAGS_TF)
120 return 0;
121 131
122 /* Set TF on the kernel stack.. */ 132 /* Set TF on the kernel stack.. */
123 regs->flags |= X86_EFLAGS_TF; 133 regs->flags |= X86_EFLAGS_TF;
@@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child)
126 * ..but if TF is changed by the instruction we will trace, 136 * ..but if TF is changed by the instruction we will trace,
127 * don't mark it as being "us" that set it, so that we 137 * don't mark it as being "us" that set it, so that we
128 * won't clear it by hand later. 138 * won't clear it by hand later.
139 *
140 * Note that if we don't actually execute the popf because
141 * of a signal arriving right now or suchlike, we will lose
142 * track of the fact that it really was "us" that set it.
129 */ 143 */
130 if (is_setting_trap_flag(child, regs)) 144 if (is_setting_trap_flag(child, regs)) {
145 clear_tsk_thread_flag(child, TIF_FORCED_TF);
131 return 0; 146 return 0;
147 }
148
149 /*
150 * If TF was already set, check whether it was us who set it.
151 * If not, we should never attempt a block step.
152 */
153 if (oflags & X86_EFLAGS_TF)
154 return test_tsk_thread_flag(child, TIF_FORCED_TF);
132 155
133 set_tsk_thread_flag(child, TIF_FORCED_TF); 156 set_tsk_thread_flag(child, TIF_FORCED_TF);
134 157
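
The comments added to enable_single_step() above describe who "owns" the trap flag once the debugger starts stepping. The sketch below is a toy model of that bookkeeping only: booleans stand in for X86_EFLAGS_TF in regs->flags and for the TIF_FORCED_TF thread flag, the sysenter/TIF_SINGLESTEP restore case is left out, and the return value mirrors whether block stepping may safely be layered on top.

#include <stdbool.h>
#include <stdio.h>

struct tracee {
	bool tf;		/* trap flag currently set in user flags     */
	bool forced_tf;		/* the debugger (not the task) set TF        */
	bool insn_sets_tf;	/* next insn (e.g. popf) will set TF itself  */
};

/* Returns true when block stepping may safely be armed on top of TF. */
static bool enable_single_step_model(struct tracee *t)
{
	bool had_tf = t->tf;

	t->tf = true;				/* always request a trap        */

	if (t->insn_sets_tf) {			/* the task will own TF         */
		t->forced_tf = false;
		return false;
	}
	if (had_tf)				/* TF was already set: only     */
		return t->forced_tf;		/* safe if we set it last time  */

	t->forced_tf = true;			/* we own TF from here on       */
	return true;
}

int main(void)
{
	struct tracee t = { .tf = false, .forced_tf = false, .insn_sets_tf = false };

	printf("fresh step, block step allowed: %d\n", enable_single_step_model(&t));
	printf("step again, block step allowed: %d\n", enable_single_step_model(&t));
	return 0;
}
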
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index d67ce5f044ba..7b987852e876 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -30,7 +30,7 @@
30#include <linux/init.h> 30#include <linux/init.h>
31#include <asm/io.h> 31#include <asm/io.h>
32#include <asm/bios_ebda.h> 32#include <asm/bios_ebda.h>
33#include <asm/mach-summit/mach_mpparse.h> 33#include <asm/summit/mpparse.h>
34 34
35static struct rio_table_hdr *rio_table_hdr __initdata; 35static struct rio_table_hdr *rio_table_hdr __initdata;
36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; 36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 7066cb855a60..1884a8d12bfa 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -22,6 +22,8 @@
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/unistd.h> 23#include <linux/unistd.h>
24 24
25#include <asm/syscalls.h>
26
25asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, 27asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
26 unsigned long prot, unsigned long flags, 28 unsigned long prot, unsigned long flags,
27 unsigned long fd, unsigned long pgoff) 29 unsigned long fd, unsigned long pgoff)
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 3b360ef33817..6bc211accf08 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -13,15 +13,17 @@
13#include <linux/utsname.h> 13#include <linux/utsname.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/uaccess.h>
16 17
17#include <asm/uaccess.h>
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/syscalls.h>
19 20
20asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, 21asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
21 unsigned long fd, unsigned long off) 22 unsigned long prot, unsigned long flags,
23 unsigned long fd, unsigned long off)
22{ 24{
23 long error; 25 long error;
24 struct file * file; 26 struct file *file;
25 27
26 error = -EINVAL; 28 error = -EINVAL;
27 if (off & ~PAGE_MASK) 29 if (off & ~PAGE_MASK)
@@ -56,9 +58,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
56 unmapped base down for this case. This can give 58 unmapped base down for this case. This can give
57 conflicts with the heap, but we assume that glibc 59 conflicts with the heap, but we assume that glibc
58 malloc knows how to fall back to mmap. Give it 1GB 60 malloc knows how to fall back to mmap. Give it 1GB
59 of playground for now. -AK */ 61 of playground for now. -AK */
60 *begin = 0x40000000; 62 *begin = 0x40000000;
61 *end = 0x80000000; 63 *end = 0x80000000;
62 if (current->flags & PF_RANDOMIZE) { 64 if (current->flags & PF_RANDOMIZE) {
63 new_begin = randomize_range(*begin, *begin + 0x02000000, 0); 65 new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
64 if (new_begin) 66 if (new_begin)
@@ -66,9 +68,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
66 } 68 }
67 } else { 69 } else {
68 *begin = TASK_UNMAPPED_BASE; 70 *begin = TASK_UNMAPPED_BASE;
69 *end = TASK_SIZE; 71 *end = TASK_SIZE;
70 } 72 }
71} 73}
72 74
73unsigned long 75unsigned long
74arch_get_unmapped_area(struct file *filp, unsigned long addr, 76arch_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -78,11 +80,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
78 struct vm_area_struct *vma; 80 struct vm_area_struct *vma;
79 unsigned long start_addr; 81 unsigned long start_addr;
80 unsigned long begin, end; 82 unsigned long begin, end;
81 83
82 if (flags & MAP_FIXED) 84 if (flags & MAP_FIXED)
83 return addr; 85 return addr;
84 86
85 find_start_end(flags, &begin, &end); 87 find_start_end(flags, &begin, &end);
86 88
87 if (len > end) 89 if (len > end)
88 return -ENOMEM; 90 return -ENOMEM;
@@ -96,12 +98,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
96 } 98 }
97 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) 99 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
98 && len <= mm->cached_hole_size) { 100 && len <= mm->cached_hole_size) {
99 mm->cached_hole_size = 0; 101 mm->cached_hole_size = 0;
100 mm->free_area_cache = begin; 102 mm->free_area_cache = begin;
101 } 103 }
102 addr = mm->free_area_cache; 104 addr = mm->free_area_cache;
103 if (addr < begin) 105 if (addr < begin)
104 addr = begin; 106 addr = begin;
105 start_addr = addr; 107 start_addr = addr;
106 108
107full_search: 109full_search:
@@ -127,7 +129,7 @@ full_search:
127 return addr; 129 return addr;
128 } 130 }
129 if (addr + mm->cached_hole_size < vma->vm_start) 131 if (addr + mm->cached_hole_size < vma->vm_start)
130 mm->cached_hole_size = vma->vm_start - addr; 132 mm->cached_hole_size = vma->vm_start - addr;
131 133
132 addr = vma->vm_end; 134 addr = vma->vm_end;
133 } 135 }
@@ -177,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
177 vma = find_vma(mm, addr-len); 179 vma = find_vma(mm, addr-len);
178 if (!vma || addr <= vma->vm_start) 180 if (!vma || addr <= vma->vm_start)
179 /* remember the address as a hint for next time */ 181 /* remember the address as a hint for next time */
180 return (mm->free_area_cache = addr-len); 182 return mm->free_area_cache = addr-len;
181 } 183 }
182 184
183 if (mm->mmap_base < len) 185 if (mm->mmap_base < len)
@@ -194,7 +196,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
194 vma = find_vma(mm, addr); 196 vma = find_vma(mm, addr);
195 if (!vma || addr+len <= vma->vm_start) 197 if (!vma || addr+len <= vma->vm_start)
196 /* remember the address as a hint for next time */ 198 /* remember the address as a hint for next time */
197 return (mm->free_area_cache = addr); 199 return mm->free_area_cache = addr;
198 200
199 /* remember the largest hole we saw so far */ 201 /* remember the largest hole we saw so far */
200 if (addr + mm->cached_hole_size < vma->vm_start) 202 if (addr + mm->cached_hole_size < vma->vm_start)
@@ -224,13 +226,13 @@ bottomup:
224} 226}
225 227
226 228
227asmlinkage long sys_uname(struct new_utsname __user * name) 229asmlinkage long sys_uname(struct new_utsname __user *name)
228{ 230{
229 int err; 231 int err;
230 down_read(&uts_sem); 232 down_read(&uts_sem);
231 err = copy_to_user(name, utsname(), sizeof (*name)); 233 err = copy_to_user(name, utsname(), sizeof(*name));
232 up_read(&uts_sem); 234 up_read(&uts_sem);
233 if (personality(current->personality) == PER_LINUX32) 235 if (personality(current->personality) == PER_LINUX32)
234 err |= copy_to_user(&name->machine, "i686", 5); 236 err |= copy_to_user(&name->machine, "i686", 5);
235 return err ? -EFAULT : 0; 237 return err ? -EFAULT : 0;
236} 238}
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 170d43c17487..de87d6008295 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -8,12 +8,12 @@
8#define __NO_STUBS 8#define __NO_STUBS
9 9
10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; 10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
11#undef _ASM_X86_64_UNISTD_H_ 11#undef _ASM_X86_UNISTD_64_H
12#include <asm/unistd_64.h> 12#include <asm/unistd_64.h>
13 13
14#undef __SYSCALL 14#undef __SYSCALL
15#define __SYSCALL(nr, sym) [nr] = sym, 15#define __SYSCALL(nr, sym) [nr] = sym,
16#undef _ASM_X86_64_UNISTD_H_ 16#undef _ASM_X86_UNISTD_64_H
17 17
18typedef void (*sys_call_ptr_t)(void); 18typedef void (*sys_call_ptr_t)(void);
19 19
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index adff5562f5fd..d44395ff34c3 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -326,3 +326,9 @@ ENTRY(sys_call_table)
326 .long sys_fallocate 326 .long sys_fallocate
327 .long sys_timerfd_settime /* 325 */ 327 .long sys_timerfd_settime /* 325 */
328 .long sys_timerfd_gettime 328 .long sys_timerfd_gettime
329 .long sys_signalfd4
330 .long sys_eventfd2
331 .long sys_epoll_create1
332 .long sys_dup3 /* 330 */
333 .long sys_pipe2
334 .long sys_inotify_init1
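
The six entries appended above occupy i386 syscall numbers 327-332, with the /* 330 */ marker pinning sys_dup3. A hedged userspace sketch that invokes one of them by number; the fallback define is the i386 number implied by the table and only matters on toolchains that do not yet provide it.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_epoll_create1
#define __NR_epoll_create1 329	/* i386 number implied by the table above */
#endif

int main(void)
{
	long fd = syscall(__NR_epoll_create1, 0);	/* epoll_create1(0) */

	if (fd < 0) {
		perror("epoll_create1");
		return 1;
	}
	printf("epoll_create1 -> fd %ld\n", fd);
	return 0;
}
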
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 059ca6ee59b4..77b400f06ea2 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -36,6 +36,7 @@
36#include <asm/arch_hooks.h> 36#include <asm/arch_hooks.h>
37#include <asm/hpet.h> 37#include <asm/hpet.h>
38#include <asm/time.h> 38#include <asm/time.h>
39#include <asm/timer.h>
39 40
40#include "do_timer.h" 41#include "do_timer.h"
41 42
@@ -46,10 +47,9 @@ unsigned long profile_pc(struct pt_regs *regs)
46 unsigned long pc = instruction_pointer(regs); 47 unsigned long pc = instruction_pointer(regs);
47 48
48#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
49 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) && 50 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
50 in_lock_functions(pc)) {
51#ifdef CONFIG_FRAME_POINTER 51#ifdef CONFIG_FRAME_POINTER
52 return *(unsigned long *)(regs->bp + 4); 52 return *(unsigned long *)(regs->bp + sizeof(long));
53#else 53#else
54 unsigned long *sp = (unsigned long *)&regs->sp; 54 unsigned long *sp = (unsigned long *)&regs->sp;
55 55
@@ -94,6 +94,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
94 94
95 do_timer_interrupt_hook(); 95 do_timer_interrupt_hook();
96 96
97#ifdef CONFIG_MCA
97 if (MCA_bus) { 98 if (MCA_bus) {
98 /* The PS/2 uses level-triggered interrupts. You can't 99 /* The PS/2 uses level-triggered interrupts. You can't
99 turn them off, nor would you want to (any attempt to 100 turn them off, nor would you want to (any attempt to
@@ -107,6 +108,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
107 u8 irq_v = inb_p( 0x61 ); /* read the current state */ 108 u8 irq_v = inb_p( 0x61 ); /* read the current state */
108 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ 109 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */
109 } 110 }
111#endif
110 112
111 return IRQ_HANDLED; 113 return IRQ_HANDLED;
112} 114}
@@ -129,6 +131,7 @@ void __init hpet_time_init(void)
129 */ 131 */
130void __init time_init(void) 132void __init time_init(void)
131{ 133{
134 pre_time_init_hook();
132 tsc_init(); 135 tsc_init();
133 late_time_init = choose_time_init(); 136 late_time_init = choose_time_init();
134} 137}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index e3d49c553af2..cb19d650c216 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/time.h> 18#include <linux/time.h>
19#include <linux/mca.h>
19 20
20#include <asm/i8253.h> 21#include <asm/i8253.h>
21#include <asm/hpet.h> 22#include <asm/hpet.h>
@@ -33,23 +34,34 @@ unsigned long profile_pc(struct pt_regs *regs)
33 /* Assume the lock function has either no stack frame or a copy 34 /* Assume the lock function has either no stack frame or a copy
34 of flags from PUSHF 35 of flags from PUSHF
35 Eflags always has bits 22 and up cleared unlike kernel addresses. */ 36 Eflags always has bits 22 and up cleared unlike kernel addresses. */
36 if (!user_mode(regs) && in_lock_functions(pc)) { 37 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else
37 unsigned long *sp = (unsigned long *)regs->sp; 41 unsigned long *sp = (unsigned long *)regs->sp;
38 if (sp[0] >> 22) 42 if (sp[0] >> 22)
39 return sp[0]; 43 return sp[0];
40 if (sp[1] >> 22) 44 if (sp[1] >> 22)
41 return sp[1]; 45 return sp[1];
46#endif
42 } 47 }
43 return pc; 48 return pc;
44} 49}
45EXPORT_SYMBOL(profile_pc); 50EXPORT_SYMBOL(profile_pc);
46 51
47static irqreturn_t timer_event_interrupt(int irq, void *dev_id) 52irqreturn_t timer_interrupt(int irq, void *dev_id)
48{ 53{
49 add_pda(irq0_irqs, 1); 54 add_pda(irq0_irqs, 1);
50 55
51 global_clock_event->event_handler(global_clock_event); 56 global_clock_event->event_handler(global_clock_event);
52 57
58#ifdef CONFIG_MCA
59 if (MCA_bus) {
60 u8 irq_v = inb_p(0x61); /* read the current state */
61 outb_p(irq_v|0x80, 0x61); /* reset the IRQ */
62 }
63#endif
64
53 return IRQ_HANDLED; 65 return IRQ_HANDLED;
54} 66}
55 67
@@ -100,7 +112,7 @@ unsigned long __init calibrate_cpu(void)
100} 112}
101 113
102static struct irqaction irq0 = { 114static struct irqaction irq0 = {
103 .handler = timer_event_interrupt, 115 .handler = timer_interrupt,
104 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, 116 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING,
105 .mask = CPU_MASK_NONE, 117 .mask = CPU_MASK_NONE,
106 .name = "timer" 118 .name = "timer"
@@ -111,16 +123,13 @@ void __init hpet_time_init(void)
111 if (!hpet_enable()) 123 if (!hpet_enable())
112 setup_pit_timer(); 124 setup_pit_timer();
113 125
126 irq0.mask = cpumask_of_cpu(0);
114 setup_irq(0, &irq0); 127 setup_irq(0, &irq0);
115} 128}
116 129
117void __init time_init(void) 130void __init time_init(void)
118{ 131{
119 tsc_init(); 132 tsc_init();
120 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
121 vgetcpu_mode = VGETCPU_RDTSCP;
122 else
123 vgetcpu_mode = VGETCPU_LSL;
124 133
125 late_time_init = choose_time_init(); 134 late_time_init = choose_time_init();
126} 135}
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index fec1ecedc9b7..e00534b33534 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -241,3 +241,11 @@ void flush_tlb_all(void)
241 on_each_cpu(do_flush_tlb_all, NULL, 1); 241 on_each_cpu(do_flush_tlb_all, NULL, 1);
242} 242}
243 243
244void reset_lazy_tlbstate(void)
245{
246 int cpu = raw_smp_processor_id();
247
248 per_cpu(cpu_tlbstate, cpu).state = 0;
249 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
250}
251
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index d0fbb7712ab0..04431f34fd16 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -6,7 +6,7 @@
6 * This code is released under the GNU General Public License version 2 or 6 * This code is released under the GNU General Public License version 2 or
7 * later. 7 * later.
8 */ 8 */
9#include <linux/mc146818rtc.h> 9#include <linux/seq_file.h>
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12 12
@@ -17,6 +17,7 @@
17#include <asm/genapic.h> 17#include <asm/genapic.h>
18#include <asm/idle.h> 18#include <asm/idle.h>
19#include <asm/tsc.h> 19#include <asm/tsc.h>
20#include <asm/irq_vectors.h>
20 21
21#include <mach_apic.h> 22#include <mach_apic.h>
22 23
@@ -783,7 +784,7 @@ static int __init uv_bau_init(void)
783 uv_init_blade(blade, node, cur_cpu); 784 uv_init_blade(blade, node, cur_cpu);
784 cur_cpu += uv_blade_nr_possible_cpus(blade); 785 cur_cpu += uv_blade_nr_possible_cpus(blade);
785 } 786 }
786 set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); 787 alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
787 uv_enable_timeouts(); 788 uv_enable_timeouts();
788 789
789 return 0; 790 return 0;
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index ab6bf375a307..6bb7b8579e70 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -10,6 +10,7 @@
10#include <asm/ldt.h> 10#include <asm/ldt.h>
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/proto.h> 12#include <asm/proto.h>
13#include <asm/syscalls.h>
13 14
14#include "tls.h" 15#include "tls.h"
15 16
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps.c
index 8a768973c4f0..04d242ab0161 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps.c
@@ -7,13 +7,11 @@
7 */ 7 */
8 8
9/* 9/*
10 * 'Traps.c' handles hardware traps and faults after we have saved some 10 * Handle hardware traps and faults.
11 * state in 'asm.s'.
12 */ 11 */
13#include <linux/interrupt.h> 12#include <linux/interrupt.h>
14#include <linux/kallsyms.h> 13#include <linux/kallsyms.h>
15#include <linux/spinlock.h> 14#include <linux/spinlock.h>
16#include <linux/highmem.h>
17#include <linux/kprobes.h> 15#include <linux/kprobes.h>
18#include <linux/uaccess.h> 16#include <linux/uaccess.h>
19#include <linux/utsname.h> 17#include <linux/utsname.h>
@@ -32,6 +30,8 @@
32#include <linux/bug.h> 30#include <linux/bug.h>
33#include <linux/nmi.h> 31#include <linux/nmi.h>
34#include <linux/mm.h> 32#include <linux/mm.h>
33#include <linux/smp.h>
34#include <linux/io.h>
35 35
36#ifdef CONFIG_EISA 36#ifdef CONFIG_EISA
37#include <linux/ioport.h> 37#include <linux/ioport.h>
@@ -46,20 +46,31 @@
46#include <linux/edac.h> 46#include <linux/edac.h>
47#endif 47#endif
48 48
49#include <asm/arch_hooks.h>
50#include <asm/stacktrace.h> 49#include <asm/stacktrace.h>
51#include <asm/processor.h> 50#include <asm/processor.h>
52#include <asm/debugreg.h> 51#include <asm/debugreg.h>
53#include <asm/atomic.h> 52#include <asm/atomic.h>
54#include <asm/system.h> 53#include <asm/system.h>
55#include <asm/unwind.h> 54#include <asm/unwind.h>
55#include <asm/traps.h>
56#include <asm/desc.h> 56#include <asm/desc.h>
57#include <asm/i387.h> 57#include <asm/i387.h>
58
59#include <mach_traps.h>
60
61#ifdef CONFIG_X86_64
62#include <asm/pgalloc.h>
63#include <asm/proto.h>
64#include <asm/pda.h>
65#else
66#include <asm/processor-flags.h>
67#include <asm/arch_hooks.h>
58#include <asm/nmi.h> 68#include <asm/nmi.h>
59#include <asm/smp.h> 69#include <asm/smp.h>
60#include <asm/io.h> 70#include <asm/io.h>
71#include <asm/traps.h>
61 72
62#include "mach_traps.h" 73#include "cpu/mcheck/mce.h"
63 74
64DECLARE_BITMAP(used_vectors, NR_VECTORS); 75DECLARE_BITMAP(used_vectors, NR_VECTORS);
65EXPORT_SYMBOL_GPL(used_vectors); 76EXPORT_SYMBOL_GPL(used_vectors);
@@ -76,431 +87,104 @@ char ignore_fpu_irq;
76 */ 87 */
77gate_desc idt_table[256] 88gate_desc idt_table[256]
78 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; 89 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
79
80asmlinkage void divide_error(void);
81asmlinkage void debug(void);
82asmlinkage void nmi(void);
83asmlinkage void int3(void);
84asmlinkage void overflow(void);
85asmlinkage void bounds(void);
86asmlinkage void invalid_op(void);
87asmlinkage void device_not_available(void);
88asmlinkage void coprocessor_segment_overrun(void);
89asmlinkage void invalid_TSS(void);
90asmlinkage void segment_not_present(void);
91asmlinkage void stack_segment(void);
92asmlinkage void general_protection(void);
93asmlinkage void page_fault(void);
94asmlinkage void coprocessor_error(void);
95asmlinkage void simd_coprocessor_error(void);
96asmlinkage void alignment_check(void);
97asmlinkage void spurious_interrupt_bug(void);
98asmlinkage void machine_check(void);
99
100int panic_on_unrecovered_nmi;
101int kstack_depth_to_print = 24;
102static unsigned int code_bytes = 64;
103static int ignore_nmis;
104static int die_counter;
105
106void printk_address(unsigned long address, int reliable)
107{
108#ifdef CONFIG_KALLSYMS
109 unsigned long offset = 0;
110 unsigned long symsize;
111 const char *symname;
112 char *modname;
113 char *delim = ":";
114 char namebuf[KSYM_NAME_LEN];
115 char reliab[4] = "";
116
117 symname = kallsyms_lookup(address, &symsize, &offset,
118 &modname, namebuf);
119 if (!symname) {
120 printk(" [<%08lx>]\n", address);
121 return;
122 }
123 if (!reliable)
124 strcpy(reliab, "? ");
125
126 if (!modname)
127 modname = delim = "";
128 printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
129 address, reliab, delim, modname, delim, symname, offset, symsize);
130#else
131 printk(" [<%08lx>]\n", address);
132#endif 90#endif
133}
134
135static inline int valid_stack_ptr(struct thread_info *tinfo,
136 void *p, unsigned int size)
137{
138 void *t = tinfo;
139 return p > t && p <= t + THREAD_SIZE - size;
140}
141
142/* The form of the top of the frame on the stack */
143struct stack_frame {
144 struct stack_frame *next_frame;
145 unsigned long return_address;
146};
147 91
148static inline unsigned long 92static int ignore_nmis;
149print_context_stack(struct thread_info *tinfo,
150 unsigned long *stack, unsigned long bp,
151 const struct stacktrace_ops *ops, void *data)
152{
153 struct stack_frame *frame = (struct stack_frame *)bp;
154
155 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
156 unsigned long addr;
157
158 addr = *stack;
159 if (__kernel_text_address(addr)) {
160 if ((unsigned long) stack == bp + 4) {
161 ops->address(data, addr, 1);
162 frame = frame->next_frame;
163 bp = (unsigned long) frame;
164 } else {
165 ops->address(data, addr, bp == 0);
166 }
167 }
168 stack++;
169 }
170 return bp;
171}
172
173void dump_trace(struct task_struct *task, struct pt_regs *regs,
174 unsigned long *stack, unsigned long bp,
175 const struct stacktrace_ops *ops, void *data)
176{
177 if (!task)
178 task = current;
179
180 if (!stack) {
181 unsigned long dummy;
182 stack = &dummy;
183 if (task != current)
184 stack = (unsigned long *)task->thread.sp;
185 }
186
187#ifdef CONFIG_FRAME_POINTER
188 if (!bp) {
189 if (task == current) {
190 /* Grab bp right from our regs */
191 asm("movl %%ebp, %0" : "=r" (bp) :);
192 } else {
193 /* bp is the last reg pushed by switch_to */
194 bp = *(unsigned long *) task->thread.sp;
195 }
196 }
197#endif
198
199 for (;;) {
200 struct thread_info *context;
201
202 context = (struct thread_info *)
203 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
204 bp = print_context_stack(context, stack, bp, ops, data);
205 /*
206 * Should be after the line below, but somewhere
207 * in early boot context comes out corrupted and we
208 * can't reference it:
209 */
210 if (ops->stack(data, "IRQ") < 0)
211 break;
212 stack = (unsigned long *)context->previous_esp;
213 if (!stack)
214 break;
215 touch_nmi_watchdog();
216 }
217}
218EXPORT_SYMBOL(dump_trace);
219
220static void
221print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
222{
223 printk(data);
224 print_symbol(msg, symbol);
225 printk("\n");
226}
227
228static void print_trace_warning(void *data, char *msg)
229{
230 printk("%s%s\n", (char *)data, msg);
231}
232
233static int print_trace_stack(void *data, char *name)
234{
235 return 0;
236}
237
238/*
239 * Print one address/symbol entries per line.
240 */
241static void print_trace_address(void *data, unsigned long addr, int reliable)
242{
243 printk("%s [<%08lx>] ", (char *)data, addr);
244 if (!reliable)
245 printk("? ");
246 print_symbol("%s\n", addr);
247 touch_nmi_watchdog();
248}
249
250static const struct stacktrace_ops print_trace_ops = {
251 .warning = print_trace_warning,
252 .warning_symbol = print_trace_warning_symbol,
253 .stack = print_trace_stack,
254 .address = print_trace_address,
255};
256 93
257static void 94static inline void conditional_sti(struct pt_regs *regs)
258show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
259 unsigned long *stack, unsigned long bp, char *log_lvl)
260{ 95{
261 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 96 if (regs->flags & X86_EFLAGS_IF)
262 printk("%s =======================\n", log_lvl); 97 local_irq_enable();
263} 98}
264 99
265void show_trace(struct task_struct *task, struct pt_regs *regs, 100static inline void preempt_conditional_sti(struct pt_regs *regs)
266 unsigned long *stack, unsigned long bp)
267{ 101{
268 show_trace_log_lvl(task, regs, stack, bp, ""); 102 inc_preempt_count();
103 if (regs->flags & X86_EFLAGS_IF)
104 local_irq_enable();
269} 105}
270 106
271static void 107static inline void preempt_conditional_cli(struct pt_regs *regs)
272show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
273 unsigned long *sp, unsigned long bp, char *log_lvl)
274{ 108{
275 unsigned long *stack; 109 if (regs->flags & X86_EFLAGS_IF)
276 int i; 110 local_irq_disable();
277 111 dec_preempt_count();
278 if (sp == NULL) {
279 if (task)
280 sp = (unsigned long *)task->thread.sp;
281 else
282 sp = (unsigned long *)&sp;
283 }
284
285 stack = sp;
286 for (i = 0; i < kstack_depth_to_print; i++) {
287 if (kstack_end(stack))
288 break;
289 if (i && ((i % 8) == 0))
290 printk("\n%s ", log_lvl);
291 printk("%08lx ", *stack++);
292 }
293 printk("\n%sCall Trace:\n", log_lvl);
294
295 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
296} 112}
297 113
298void show_stack(struct task_struct *task, unsigned long *sp) 114#ifdef CONFIG_X86_32
115static inline void
116die_if_kernel(const char *str, struct pt_regs *regs, long err)
299{ 117{
300 printk(" "); 118 if (!user_mode_vm(regs))
301 show_stack_log_lvl(task, NULL, sp, 0, ""); 119 die(str, regs, err);
302} 120}
303 121
304/* 122/*
305 * The architecture-independent dump_stack generator 123 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
124 * invalid offset set (the LAZY one) and the faulting thread has
125 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS,
126 * we set the offset field correctly and return 1.
306 */ 127 */
307void dump_stack(void) 128static int lazy_iobitmap_copy(void)
308{
309 unsigned long bp = 0;
310 unsigned long stack;
311
312#ifdef CONFIG_FRAME_POINTER
313 if (!bp)
314 asm("movl %%ebp, %0" : "=r" (bp):);
315#endif
316
317 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
318 current->pid, current->comm, print_tainted(),
319 init_utsname()->release,
320 (int)strcspn(init_utsname()->version, " "),
321 init_utsname()->version);
322
323 show_trace(current, NULL, &stack, bp);
324}
325
326EXPORT_SYMBOL(dump_stack);
327
328void show_registers(struct pt_regs *regs)
329{ 129{
330 int i; 130 struct thread_struct *thread;
131 struct tss_struct *tss;
132 int cpu;
331 133
332 print_modules(); 134 cpu = get_cpu();
333 __show_registers(regs, 0); 135 tss = &per_cpu(init_tss, cpu);
136 thread = &current->thread;
334 137
335 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", 138 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
336 TASK_COMM_LEN, current->comm, task_pid_nr(current), 139 thread->io_bitmap_ptr) {
337 current_thread_info(), current, task_thread_info(current)); 140 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
338 /* 141 thread->io_bitmap_max);
339 * When in-kernel, we also print out the stack and code at the 142 /*
340 * time of the fault.. 143 * If the previously set map was extending to higher ports
341 */ 144 * than the current one, pad extra space with 0xff (no access).
342 if (!user_mode_vm(regs)) { 145 */
343 unsigned int code_prologue = code_bytes * 43 / 64; 146 if (thread->io_bitmap_max < tss->io_bitmap_max) {
344 unsigned int code_len = code_bytes; 147 memset((char *) tss->io_bitmap +
345 unsigned char c; 148 thread->io_bitmap_max, 0xff,
346 u8 *ip; 149 tss->io_bitmap_max - thread->io_bitmap_max);
347
348 printk("\n" KERN_EMERG "Stack: ");
349 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
350
351 printk(KERN_EMERG "Code: ");
352
353 ip = (u8 *)regs->ip - code_prologue;
354 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
355 /* try starting at EIP */
356 ip = (u8 *)regs->ip;
357 code_len = code_len - code_prologue + 1;
358 }
359 for (i = 0; i < code_len; i++, ip++) {
360 if (ip < (u8 *)PAGE_OFFSET ||
361 probe_kernel_address(ip, c)) {
362 printk(" Bad EIP value.");
363 break;
364 }
365 if (ip == (u8 *)regs->ip)
366 printk("<%02x> ", c);
367 else
368 printk("%02x ", c);
369 } 150 }
370 } 151 tss->io_bitmap_max = thread->io_bitmap_max;
371 printk("\n"); 152 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
372} 153 tss->io_bitmap_owner = thread;
373 154 put_cpu();
374int is_valid_bugaddr(unsigned long ip)
375{
376 unsigned short ud2;
377
378 if (ip < PAGE_OFFSET)
379 return 0;
380 if (probe_kernel_address((unsigned short *)ip, ud2))
381 return 0;
382
383 return ud2 == 0x0b0f;
384}
385
386int __kprobes __die(const char *str, struct pt_regs *regs, long err)
387{
388 unsigned short ss;
389 unsigned long sp;
390 155
391 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
392#ifdef CONFIG_PREEMPT
393 printk("PREEMPT ");
394#endif
395#ifdef CONFIG_SMP
396 printk("SMP ");
397#endif
398#ifdef CONFIG_DEBUG_PAGEALLOC
399 printk("DEBUG_PAGEALLOC");
400#endif
401 printk("\n");
402 if (notify_die(DIE_OOPS, str, regs, err,
403 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
404 return 1; 156 return 1;
405
406 show_registers(regs);
407 /* Executive summary in case the oops scrolled away */
408 sp = (unsigned long) (&regs->sp);
409 savesegment(ss, ss);
410 if (user_mode(regs)) {
411 sp = regs->sp;
412 ss = regs->ss & 0xffff;
413 } 157 }
414 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); 158 put_cpu();
415 print_symbol("%s", regs->ip);
416 printk(" SS:ESP %04x:%08lx\n", ss, sp);
417 return 0;
418}
419
420/*
421 * This is gone through when something in the kernel has done something bad
422 * and is about to be terminated:
423 */
424void die(const char *str, struct pt_regs *regs, long err)
425{
426 static struct {
427 raw_spinlock_t lock;
428 u32 lock_owner;
429 int lock_owner_depth;
430 } die = {
431 .lock = __RAW_SPIN_LOCK_UNLOCKED,
432 .lock_owner = -1,
433 .lock_owner_depth = 0
434 };
435 unsigned long flags;
436
437 oops_enter();
438
439 if (die.lock_owner != raw_smp_processor_id()) {
440 console_verbose();
441 raw_local_irq_save(flags);
442 __raw_spin_lock(&die.lock);
443 die.lock_owner = smp_processor_id();
444 die.lock_owner_depth = 0;
445 bust_spinlocks(1);
446 } else {
447 raw_local_irq_save(flags);
448 }
449
450 if (++die.lock_owner_depth < 3) {
451 report_bug(regs->ip, regs);
452
453 if (__die(str, regs, err))
454 regs = NULL;
455 } else {
456 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
457 }
458
459 bust_spinlocks(0);
460 die.lock_owner = -1;
461 add_taint(TAINT_DIE);
462 __raw_spin_unlock(&die.lock);
463 raw_local_irq_restore(flags);
464
465 if (!regs)
466 return;
467
468 if (kexec_should_crash(current))
469 crash_kexec(regs);
470
471 if (in_interrupt())
472 panic("Fatal exception in interrupt");
473
474 if (panic_on_oops)
475 panic("Fatal exception");
476
477 oops_exit();
478 do_exit(SIGSEGV);
479}
480 159
481static inline void 160 return 0;
482die_if_kernel(const char *str, struct pt_regs *regs, long err)
483{
484 if (!user_mode_vm(regs))
485 die(str, regs, err);
486} 161}
162#endif
487 163
488static void __kprobes 164static void __kprobes
489do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs, 165do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
490 long error_code, siginfo_t *info) 166 long error_code, siginfo_t *info)
491{ 167{
492 struct task_struct *tsk = current; 168 struct task_struct *tsk = current;
493 169
170#ifdef CONFIG_X86_32
494 if (regs->flags & X86_VM_MASK) { 171 if (regs->flags & X86_VM_MASK) {
495 if (vm86) 172 /*
173 * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
174 * On nmi (interrupt 2), do_trap should not be called.
175 */
176 if (trapnr < 6)
496 goto vm86_trap; 177 goto vm86_trap;
497 goto trap_signal; 178 goto trap_signal;
498 } 179 }
180#endif
499 181
500 if (!user_mode(regs)) 182 if (!user_mode(regs))
501 goto kernel_trap; 183 goto kernel_trap;
502 184
185#ifdef CONFIG_X86_32
503trap_signal: 186trap_signal:
187#endif
504 /* 188 /*
505 * We want error_code and trap_no set for userspace faults and 189 * We want error_code and trap_no set for userspace faults and
506 * kernelspace faults which result in die(), but not 190 * kernelspace faults which result in die(), but not
@@ -513,6 +197,18 @@ trap_signal:
513 tsk->thread.error_code = error_code; 197 tsk->thread.error_code = error_code;
514 tsk->thread.trap_no = trapnr; 198 tsk->thread.trap_no = trapnr;
515 199
200#ifdef CONFIG_X86_64
201 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
202 printk_ratelimit()) {
203 printk(KERN_INFO
204 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
205 tsk->comm, tsk->pid, str,
206 regs->ip, regs->sp, error_code);
207 print_vma_addr(" in ", regs->ip);
208 printk("\n");
209 }
210#endif
211
516 if (info) 212 if (info)
517 force_sig_info(signr, info, tsk); 213 force_sig_info(signr, info, tsk);
518 else 214 else
@@ -527,120 +223,98 @@ kernel_trap:
527 } 223 }
528 return; 224 return;
529 225
226#ifdef CONFIG_X86_32
530vm86_trap: 227vm86_trap:
531 if (handle_vm86_trap((struct kernel_vm86_regs *) regs, 228 if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
532 error_code, trapnr)) 229 error_code, trapnr))
533 goto trap_signal; 230 goto trap_signal;
534 return; 231 return;
232#endif
535} 233}
536 234
537#define DO_ERROR(trapnr, signr, str, name) \ 235#define DO_ERROR(trapnr, signr, str, name) \
538void do_##name(struct pt_regs *regs, long error_code) \ 236dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
539{ \
540 trace_hardirqs_fixup(); \
541 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
542 == NOTIFY_STOP) \
543 return; \
544 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
545}
546
547#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
548void do_##name(struct pt_regs *regs, long error_code) \
549{ \
550 siginfo_t info; \
551 if (irq) \
552 local_irq_enable(); \
553 info.si_signo = signr; \
554 info.si_errno = 0; \
555 info.si_code = sicode; \
556 info.si_addr = (void __user *)siaddr; \
557 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
558 == NOTIFY_STOP) \
559 return; \
560 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
561}
562
563#define DO_VM86_ERROR(trapnr, signr, str, name) \
564void do_##name(struct pt_regs *regs, long error_code) \
565{ \ 237{ \
566 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 238 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
567 == NOTIFY_STOP) \ 239 == NOTIFY_STOP) \
568 return; \ 240 return; \
569 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ 241 conditional_sti(regs); \
242 do_trap(trapnr, signr, str, regs, error_code, NULL); \
570} 243}
571 244
572#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 245#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
573void do_##name(struct pt_regs *regs, long error_code) \ 246dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
574{ \ 247{ \
575 siginfo_t info; \ 248 siginfo_t info; \
576 info.si_signo = signr; \ 249 info.si_signo = signr; \
577 info.si_errno = 0; \ 250 info.si_errno = 0; \
578 info.si_code = sicode; \ 251 info.si_code = sicode; \
579 info.si_addr = (void __user *)siaddr; \ 252 info.si_addr = (void __user *)siaddr; \
580 trace_hardirqs_fixup(); \
581 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 253 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
582 == NOTIFY_STOP) \ 254 == NOTIFY_STOP) \
583 return; \ 255 return; \
584 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ 256 conditional_sti(regs); \
257 do_trap(trapnr, signr, str, regs, error_code, &info); \
585} 258}
586 259
587DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) 260DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
588#ifndef CONFIG_KPROBES 261DO_ERROR(4, SIGSEGV, "overflow", overflow)
589DO_VM86_ERROR(3, SIGTRAP, "int3", int3) 262DO_ERROR(5, SIGSEGV, "bounds", bounds)
590#endif 263DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
591DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
592DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
593DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
594DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 264DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
595DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 265DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
596DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) 266DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
267#ifdef CONFIG_X86_32
597DO_ERROR(12, SIGBUS, "stack segment", stack_segment) 268DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
598DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) 269#endif
599DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) 270DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
271
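For reference, each DO_ERROR()/DO_ERROR_INFO() use above stamps out a full trap handler. A rough expansion of DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS), following the new macro body shown above (dotraplinkage, conditional_sti and do_trap are the helpers defined in this file), would be:

	dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code)
	{
		/* give debuggers/notifiers first crack at the trap */
		if (notify_die(DIE_TRAP, "invalid TSS", regs, error_code, 10, SIGSEGV)
				== NOTIFY_STOP)
			return;
		/* re-enable interrupts only if they were on when the trap hit */
		conditional_sti(regs);
		do_trap(10, SIGSEGV, "invalid TSS", regs, error_code, NULL);
	}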
272#ifdef CONFIG_X86_64
273/* Runs on IST stack */
274dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
275{
276 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
277 12, SIGBUS) == NOTIFY_STOP)
278 return;
279 preempt_conditional_sti(regs);
280 do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
281 preempt_conditional_cli(regs);
282}
600 283
601void __kprobes 284dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
285{
286 static const char str[] = "double fault";
287 struct task_struct *tsk = current;
288
 289 /* Return not checked because a double fault cannot be ignored */
290 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
291
292 tsk->thread.error_code = error_code;
293 tsk->thread.trap_no = 8;
294
295 /* This is always a kernel trap and never fixable (and thus must
296 never return). */
297 for (;;)
298 die(str, regs, error_code);
299}
300#endif
301
302dotraplinkage void __kprobes
602do_general_protection(struct pt_regs *regs, long error_code) 303do_general_protection(struct pt_regs *regs, long error_code)
603{ 304{
604 struct task_struct *tsk; 305 struct task_struct *tsk;
605 struct thread_struct *thread;
606 struct tss_struct *tss;
607 int cpu;
608 306
609 cpu = get_cpu(); 307 conditional_sti(regs);
610 tss = &per_cpu(init_tss, cpu);
611 thread = &current->thread;
612
613 /*
614 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
615 * invalid offset set (the LAZY one) and the faulting thread has
616 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS
 617 * and we set the offset field correctly. Then we let the CPU
618 * restart the faulting instruction.
619 */
620 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
621 thread->io_bitmap_ptr) {
622 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
623 thread->io_bitmap_max);
624 /*
625 * If the previously set map was extending to higher ports
626 * than the current one, pad extra space with 0xff (no access).
627 */
628 if (thread->io_bitmap_max < tss->io_bitmap_max) {
629 memset((char *) tss->io_bitmap +
630 thread->io_bitmap_max, 0xff,
631 tss->io_bitmap_max - thread->io_bitmap_max);
632 }
633 tss->io_bitmap_max = thread->io_bitmap_max;
634 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
635 tss->io_bitmap_owner = thread;
636 put_cpu();
637 308
309#ifdef CONFIG_X86_32
310 if (lazy_iobitmap_copy()) {
311 /* restart the faulting instruction */
638 return; 312 return;
639 } 313 }
640 put_cpu();
641 314
642 if (regs->flags & X86_VM_MASK) 315 if (regs->flags & X86_VM_MASK)
643 goto gp_in_vm86; 316 goto gp_in_vm86;
317#endif
644 318
645 tsk = current; 319 tsk = current;
646 if (!user_mode(regs)) 320 if (!user_mode(regs))
@@ -662,10 +336,12 @@ do_general_protection(struct pt_regs *regs, long error_code)
662 force_sig(SIGSEGV, tsk); 336 force_sig(SIGSEGV, tsk);
663 return; 337 return;
664 338
339#ifdef CONFIG_X86_32
665gp_in_vm86: 340gp_in_vm86:
666 local_irq_enable(); 341 local_irq_enable();
667 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); 342 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
668 return; 343 return;
344#endif
669 345
670gp_in_kernel: 346gp_in_kernel:
671 if (fixup_exception(regs)) 347 if (fixup_exception(regs))
@@ -702,7 +378,8 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
702 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 378 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
703 379
704 /* Clear and disable the memory parity error line. */ 380 /* Clear and disable the memory parity error line. */
705 clear_mem_error(reason); 381 reason = (reason & 0xf) | 4;
382 outb(reason, 0x61);
706} 383}
707 384
708static notrace __kprobes void 385static notrace __kprobes void
@@ -728,7 +405,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
728static notrace __kprobes void 405static notrace __kprobes void
729unknown_nmi_error(unsigned char reason, struct pt_regs *regs) 406unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
730{ 407{
731 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 408 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
409 NOTIFY_STOP)
732 return; 410 return;
733#ifdef CONFIG_MCA 411#ifdef CONFIG_MCA
734 /* 412 /*
@@ -751,41 +429,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
751 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 429 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
752} 430}
753 431
754static DEFINE_SPINLOCK(nmi_print_lock);
755
756void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
757{
758 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
759 return;
760
761 spin_lock(&nmi_print_lock);
762 /*
763 * We are in trouble anyway, lets at least try
764 * to get a message out:
765 */
766 bust_spinlocks(1);
767 printk(KERN_EMERG "%s", str);
768 printk(" on CPU%d, ip %08lx, registers:\n",
769 smp_processor_id(), regs->ip);
770 show_registers(regs);
771 if (do_panic)
772 panic("Non maskable interrupt");
773 console_silent();
774 spin_unlock(&nmi_print_lock);
775 bust_spinlocks(0);
776
777 /*
778 * If we are in kernel we are probably nested up pretty bad
 779 * and might as well get out now while we still can:
780 */
781 if (!user_mode_vm(regs)) {
782 current->thread.trap_no = 2;
783 crash_kexec(regs);
784 }
785
786 do_exit(SIGSEGV);
787}
788
789static notrace __kprobes void default_do_nmi(struct pt_regs *regs) 432static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
790{ 433{
791 unsigned char reason = 0; 434 unsigned char reason = 0;
@@ -824,22 +467,25 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
824 mem_parity_error(reason, regs); 467 mem_parity_error(reason, regs);
825 if (reason & 0x40) 468 if (reason & 0x40)
826 io_check_error(reason, regs); 469 io_check_error(reason, regs);
470#ifdef CONFIG_X86_32
827 /* 471 /*
828 * Reassert NMI in case it became active meanwhile 472 * Reassert NMI in case it became active meanwhile
829 * as it's edge-triggered: 473 * as it's edge-triggered:
830 */ 474 */
831 reassert_nmi(); 475 reassert_nmi();
476#endif
832} 477}
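The reason byte that default_do_nmi() tests (0x80, 0x40) and that mem_parity_error()/io_check_error() write back comes from system control port B at I/O port 0x61. A short sketch of the bit meanings assumed here, to the best of my understanding of the AT-compatible layout (the constant names below are illustrative, not kernel symbols):

	/* NMI status/control port 0x61 -- illustrative bit names */
	#define NMI_SERR_ASSERTED	0x80	/* memory parity / SERR# caused the NMI */
	#define NMI_IOCHK_ASSERTED	0x40	/* I/O channel check caused the NMI */
	#define NMI_IOCHK_DISABLE	0x08	/* write 1: clear and disable IOCHK NMI */
	#define NMI_SERR_DISABLE	0x04	/* write 1: clear and disable parity NMI */

	/* e.g. mem_parity_error() above does, in effect: */
	static void ack_parity_nmi(unsigned char reason)
	{
		outb((reason & 0x0f) | NMI_SERR_DISABLE, 0x61);
	}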
833 478
834notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) 479dotraplinkage notrace __kprobes void
480do_nmi(struct pt_regs *regs, long error_code)
835{ 481{
836 int cpu;
837
838 nmi_enter(); 482 nmi_enter();
839 483
840 cpu = smp_processor_id(); 484#ifdef CONFIG_X86_32
841 485 { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
842 ++nmi_count(cpu); 486#else
487 add_pda(__nmi_count, 1);
488#endif
843 489
844 if (!ignore_nmis) 490 if (!ignore_nmis)
845 default_do_nmi(regs); 491 default_do_nmi(regs);
@@ -859,21 +505,44 @@ void restart_nmi(void)
859 acpi_nmi_enable(); 505 acpi_nmi_enable();
860} 506}
861 507
862#ifdef CONFIG_KPROBES 508/* May run on IST stack. */
863void __kprobes do_int3(struct pt_regs *regs, long error_code) 509dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
864{ 510{
865 trace_hardirqs_fixup(); 511#ifdef CONFIG_KPROBES
866
867 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) 512 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
868 == NOTIFY_STOP) 513 == NOTIFY_STOP)
869 return; 514 return;
870 /* 515#else
871 * This is an interrupt gate, because kprobes wants interrupts 516 if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
872 * disabled. Normal trap handlers don't. 517 == NOTIFY_STOP)
873 */ 518 return;
874 restore_interrupts(regs); 519#endif
875 520
876 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); 521 preempt_conditional_sti(regs);
522 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
523 preempt_conditional_cli(regs);
524}
525
526#ifdef CONFIG_X86_64
527/* Help handler running on IST stack to switch back to user stack
528 for scheduling or signal handling. The actual stack switch is done in
529 entry.S */
530asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
531{
532 struct pt_regs *regs = eregs;
 533 /* Already synced */
534 if (eregs == (struct pt_regs *)eregs->sp)
535 ;
536 /* Exception from user space */
537 else if (user_mode(eregs))
538 regs = task_pt_regs(current);
539 /* Exception from kernel and interrupts are enabled. Move to
540 kernel process stack. */
541 else if (eregs->flags & X86_EFLAGS_IF)
542 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
543 if (eregs != regs)
544 *regs = *eregs;
545 return regs;
877} 546}
878#endif 547#endif
879 548
@@ -898,13 +567,14 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code)
898 * about restoring all the debug state, and ptrace doesn't have to 567 * about restoring all the debug state, and ptrace doesn't have to
899 * find every occurrence of the TF bit that could be saved away even 568 * find every occurrence of the TF bit that could be saved away even
900 * by user code) 569 * by user code)
570 *
571 * May run on IST stack.
901 */ 572 */
902void __kprobes do_debug(struct pt_regs *regs, long error_code) 573dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
903{ 574{
904 struct task_struct *tsk = current; 575 struct task_struct *tsk = current;
905 unsigned int condition; 576 unsigned long condition;
906 577 int si_code;
907 trace_hardirqs_fixup();
908 578
909 get_debugreg(condition, 6); 579 get_debugreg(condition, 6);
910 580
@@ -917,9 +587,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
917 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 587 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
918 SIGTRAP) == NOTIFY_STOP) 588 SIGTRAP) == NOTIFY_STOP)
919 return; 589 return;
590
920 /* It's safe to allow irq's after DR6 has been saved */ 591 /* It's safe to allow irq's after DR6 has been saved */
921 if (regs->flags & X86_EFLAGS_IF) 592 preempt_conditional_sti(regs);
922 local_irq_enable();
923 593
924 /* Mask out spurious debug traps due to lazy DR7 setting */ 594 /* Mask out spurious debug traps due to lazy DR7 setting */
925 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 595 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
@@ -927,8 +597,10 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
927 goto clear_dr7; 597 goto clear_dr7;
928 } 598 }
929 599
600#ifdef CONFIG_X86_32
930 if (regs->flags & X86_VM_MASK) 601 if (regs->flags & X86_VM_MASK)
931 goto debug_vm86; 602 goto debug_vm86;
603#endif
932 604
933 /* Save debug status register where ptrace can see it */ 605 /* Save debug status register where ptrace can see it */
934 tsk->thread.debugreg6 = condition; 606 tsk->thread.debugreg6 = condition;
@@ -938,17 +610,13 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
938 * kernel space (but re-enable TF when returning to user mode). 610 * kernel space (but re-enable TF when returning to user mode).
939 */ 611 */
940 if (condition & DR_STEP) { 612 if (condition & DR_STEP) {
941 /*
942 * We already checked v86 mode above, so we can
943 * check for kernel mode by just checking the CPL
944 * of CS.
945 */
946 if (!user_mode(regs)) 613 if (!user_mode(regs))
947 goto clear_TF_reenable; 614 goto clear_TF_reenable;
948 } 615 }
949 616
617 si_code = get_si_code(condition);
950 /* Ok, finally something we can handle */ 618 /* Ok, finally something we can handle */
951 send_sigtrap(tsk, regs, error_code); 619 send_sigtrap(tsk, regs, error_code, si_code);
952 620
953 /* 621 /*
954 * Disable additional traps. They'll be re-enabled when 622 * Disable additional traps. They'll be re-enabled when
@@ -956,18 +624,37 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
956 */ 624 */
957clear_dr7: 625clear_dr7:
958 set_debugreg(0, 7); 626 set_debugreg(0, 7);
627 preempt_conditional_cli(regs);
959 return; 628 return;
960 629
630#ifdef CONFIG_X86_32
961debug_vm86: 631debug_vm86:
962 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); 632 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
633 preempt_conditional_cli(regs);
963 return; 634 return;
635#endif
964 636
965clear_TF_reenable: 637clear_TF_reenable:
966 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 638 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
967 regs->flags &= ~X86_EFLAGS_TF; 639 regs->flags &= ~X86_EFLAGS_TF;
640 preempt_conditional_cli(regs);
968 return; 641 return;
969} 642}
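do_debug() steers on the DR6 value it reads into 'condition'. For orientation, these are the standard x86 debug-status bits that DR_TRAP0..DR_TRAP3 and DR_STEP from <asm/debugreg.h> correspond to (the names below are illustrative, not the kernel's):

	/* DR6 bits consulted above -- illustrative definitions */
	#define DBG_TRAP0	0x0001		/* hardware breakpoint 0 hit */
	#define DBG_TRAP1	0x0002		/* hardware breakpoint 1 hit */
	#define DBG_TRAP2	0x0004		/* hardware breakpoint 2 hit */
	#define DBG_TRAP3	0x0008		/* hardware breakpoint 3 hit */
	#define DBG_STEP	0x4000		/* single-step (TF) trap */

	static inline int debug_trap_is_single_step(unsigned long dr6)
	{
		return (dr6 & DBG_STEP) != 0;
	}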
970 643
644#ifdef CONFIG_X86_64
645static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
646{
647 if (fixup_exception(regs))
648 return 1;
649
650 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
651 /* Illegal floating point operation in the kernel */
652 current->thread.trap_no = trapnr;
653 die(str, regs, 0);
654 return 0;
655}
656#endif
657
971/* 658/*
972 * Note that we play around with the 'TS' bit in an attempt to get 659 * Note that we play around with the 'TS' bit in an attempt to get
973 * the correct behaviour even in the presence of the asynchronous 660 * the correct behaviour even in the presence of the asynchronous
@@ -1004,7 +691,9 @@ void math_error(void __user *ip)
1004 swd = get_fpu_swd(task); 691 swd = get_fpu_swd(task);
1005 switch (swd & ~cwd & 0x3f) { 692 switch (swd & ~cwd & 0x3f) {
1006 case 0x000: /* No unmasked exception */ 693 case 0x000: /* No unmasked exception */
694#ifdef CONFIG_X86_32
1007 return; 695 return;
696#endif
1008 default: /* Multiple exceptions */ 697 default: /* Multiple exceptions */
1009 break; 698 break;
1010 case 0x001: /* Invalid Op */ 699 case 0x001: /* Invalid Op */
@@ -1032,9 +721,18 @@ void math_error(void __user *ip)
1032 force_sig_info(SIGFPE, &info, task); 721 force_sig_info(SIGFPE, &info, task);
1033} 722}
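The switch above keys on swd & ~cwd & 0x3f: the low six bits of the x87 status word flag raised exceptions, and the same bit positions in the control word mask them (1 = masked), so only unmasked, raised exceptions survive. A worked example of how that maps to a signal code, using the constants already referenced in this file:

	/*
	 * Example: only divide-by-zero is unmasked.
	 *   cwd = 0x037b   (default 0x037f with ZE, bit 2, unmasked)
	 *   swd = 0x0004   (ZE raised)
	 *   swd & ~cwd & 0x3f == 0x004  ->  info.si_code = FPE_FLTDIV
	 */
	static int example_x87_si_code(unsigned short cwd, unsigned short swd)
	{
		switch (swd & ~cwd & 0x3f) {
		case 0x001:			/* Invalid Op */
			return FPE_FLTINV;
		case 0x004:			/* Zero Divide */
			return FPE_FLTDIV;
		default:			/* none, or multiple exceptions */
			return __SI_FAULT;
		}
	}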
1034 723
1035void do_coprocessor_error(struct pt_regs *regs, long error_code) 724dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
1036{ 725{
726 conditional_sti(regs);
727
728#ifdef CONFIG_X86_32
1037 ignore_fpu_irq = 1; 729 ignore_fpu_irq = 1;
730#else
731 if (!user_mode(regs) &&
732 kernel_math_error(regs, "kernel x87 math error", 16))
733 return;
734#endif
735
1038 math_error((void __user *)regs->ip); 736 math_error((void __user *)regs->ip);
1039} 737}
1040 738
@@ -1086,8 +784,12 @@ static void simd_math_error(void __user *ip)
1086 force_sig_info(SIGFPE, &info, task); 784 force_sig_info(SIGFPE, &info, task);
1087} 785}
1088 786
1089void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 787dotraplinkage void
788do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
1090{ 789{
790 conditional_sti(regs);
791
792#ifdef CONFIG_X86_32
1091 if (cpu_has_xmm) { 793 if (cpu_has_xmm) {
1092 /* Handle SIMD FPU exceptions on PIII+ processors. */ 794 /* Handle SIMD FPU exceptions on PIII+ processors. */
1093 ignore_fpu_irq = 1; 795 ignore_fpu_irq = 1;
@@ -1106,16 +808,25 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
1106 current->thread.error_code = error_code; 808 current->thread.error_code = error_code;
1107 die_if_kernel("cache flush denied", regs, error_code); 809 die_if_kernel("cache flush denied", regs, error_code);
1108 force_sig(SIGSEGV, current); 810 force_sig(SIGSEGV, current);
811#else
812 if (!user_mode(regs) &&
813 kernel_math_error(regs, "kernel simd math error", 19))
814 return;
815 simd_math_error((void __user *)regs->ip);
816#endif
1109} 817}
1110 818
1111void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) 819dotraplinkage void
820do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
1112{ 821{
822 conditional_sti(regs);
1113#if 0 823#if 0
1114 /* No need to warn about this any longer. */ 824 /* No need to warn about this any longer. */
1115 printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); 825 printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
1116#endif 826#endif
1117} 827}
1118 828
829#ifdef CONFIG_X86_32
1119unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) 830unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
1120{ 831{
1121 struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); 832 struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
@@ -1134,6 +845,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
1134 845
1135 return new_kesp; 846 return new_kesp;
1136} 847}
848#else
849asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
850{
851}
852
853asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
854{
855}
856#endif
1137 857
1138/* 858/*
1139 * 'math_state_restore()' saves the current math information in the 859 * 'math_state_restore()' saves the current math information in the
@@ -1166,14 +886,24 @@ asmlinkage void math_state_restore(void)
1166 } 886 }
1167 887
1168 clts(); /* Allow maths ops (or we recurse) */ 888 clts(); /* Allow maths ops (or we recurse) */
889#ifdef CONFIG_X86_32
1169 restore_fpu(tsk); 890 restore_fpu(tsk);
891#else
892 /*
893 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
894 */
895 if (unlikely(restore_fpu_checking(tsk))) {
896 stts();
897 force_sig(SIGSEGV, tsk);
898 return;
899 }
900#endif
1170 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 901 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
1171 tsk->fpu_counter++; 902 tsk->fpu_counter++;
1172} 903}
1173EXPORT_SYMBOL_GPL(math_state_restore); 904EXPORT_SYMBOL_GPL(math_state_restore);
1174 905
1175#ifndef CONFIG_MATH_EMULATION 906#ifndef CONFIG_MATH_EMULATION
1176
1177asmlinkage void math_emulate(long arg) 907asmlinkage void math_emulate(long arg)
1178{ 908{
1179 printk(KERN_EMERG 909 printk(KERN_EMERG
@@ -1182,12 +912,46 @@ asmlinkage void math_emulate(long arg)
1182 force_sig(SIGFPE, current); 912 force_sig(SIGFPE, current);
1183 schedule(); 913 schedule();
1184} 914}
1185
1186#endif /* CONFIG_MATH_EMULATION */ 915#endif /* CONFIG_MATH_EMULATION */
1187 916
917dotraplinkage void __kprobes
918do_device_not_available(struct pt_regs *regs, long error)
919{
920#ifdef CONFIG_X86_32
921 if (read_cr0() & X86_CR0_EM) {
922 conditional_sti(regs);
923 math_emulate(0);
924 } else {
925 math_state_restore(); /* interrupts still off */
926 conditional_sti(regs);
927 }
928#else
929 math_state_restore();
930#endif
931}
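do_device_not_available() is the #NM handler, so it only fires because CR0.EM or CR0.TS was set when an FPU instruction was executed. A minimal sketch of the lazy-FPU hand-off it completes, assuming the usual CR0 bit values (X86_CR0_EM = 0x4, X86_CR0_TS = 0x8) and the clts()/stts() helpers already used above:

	/* Lazy FPU switching in outline -- a sketch, not the scheduler's real code */
	static void lazy_fpu_switch_away(void)
	{
		/* switch_to(): save this task's FPU state (fnsave/fxsave) and set
		 * CR0.TS so the next FPU instruction in the incoming task raises #NM */
		stts();
	}

	static void lazy_fpu_trap(void)		/* what #NM leads to, see above */
	{
		clts();				/* allow FPU instructions again */
		/* ... then math_state_restore() reloads this task's saved state
		 * and marks the thread with TS_USEDFPU */
	}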
932
933#ifdef CONFIG_X86_32
934dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
935{
936 siginfo_t info;
937 local_irq_enable();
938
939 info.si_signo = SIGILL;
940 info.si_errno = 0;
941 info.si_code = ILL_BADSTK;
942 info.si_addr = 0;
943 if (notify_die(DIE_TRAP, "iret exception",
944 regs, error_code, 32, SIGILL) == NOTIFY_STOP)
945 return;
946 do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
947}
948#endif
949
1188void __init trap_init(void) 950void __init trap_init(void)
1189{ 951{
952#ifdef CONFIG_X86_32
1190 int i; 953 int i;
954#endif
1191 955
1192#ifdef CONFIG_EISA 956#ifdef CONFIG_EISA
1193 void __iomem *p = early_ioremap(0x0FFFD9, 4); 957 void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@ -1197,29 +961,40 @@ void __init trap_init(void)
1197 early_iounmap(p, 4); 961 early_iounmap(p, 4);
1198#endif 962#endif
1199 963
1200 set_trap_gate(0, &divide_error); 964 set_intr_gate(0, &divide_error);
1201 set_intr_gate(1, &debug); 965 set_intr_gate_ist(1, &debug, DEBUG_STACK);
1202 set_intr_gate(2, &nmi); 966 set_intr_gate_ist(2, &nmi, NMI_STACK);
1203 set_system_intr_gate(3, &int3); /* int3 can be called from all */ 967 /* int3 can be called from all */
1204 set_system_gate(4, &overflow); /* int4 can be called from all */ 968 set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
1205 set_trap_gate(5, &bounds); 969 /* int4 can be called from all */
1206 set_trap_gate(6, &invalid_op); 970 set_system_intr_gate(4, &overflow);
1207 set_trap_gate(7, &device_not_available); 971 set_intr_gate(5, &bounds);
972 set_intr_gate(6, &invalid_op);
973 set_intr_gate(7, &device_not_available);
974#ifdef CONFIG_X86_32
1208 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); 975 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
1209 set_trap_gate(9, &coprocessor_segment_overrun); 976#else
1210 set_trap_gate(10, &invalid_TSS); 977 set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
1211 set_trap_gate(11, &segment_not_present); 978#endif
1212 set_trap_gate(12, &stack_segment); 979 set_intr_gate(9, &coprocessor_segment_overrun);
1213 set_trap_gate(13, &general_protection); 980 set_intr_gate(10, &invalid_TSS);
981 set_intr_gate(11, &segment_not_present);
982 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
983 set_intr_gate(13, &general_protection);
1214 set_intr_gate(14, &page_fault); 984 set_intr_gate(14, &page_fault);
1215 set_trap_gate(15, &spurious_interrupt_bug); 985 set_intr_gate(15, &spurious_interrupt_bug);
1216 set_trap_gate(16, &coprocessor_error); 986 set_intr_gate(16, &coprocessor_error);
1217 set_trap_gate(17, &alignment_check); 987 set_intr_gate(17, &alignment_check);
1218#ifdef CONFIG_X86_MCE 988#ifdef CONFIG_X86_MCE
1219 set_trap_gate(18, &machine_check); 989 set_intr_gate_ist(18, &machine_check, MCE_STACK);
1220#endif 990#endif
1221 set_trap_gate(19, &simd_coprocessor_error); 991 set_intr_gate(19, &simd_coprocessor_error);
1222 992
993#ifdef CONFIG_IA32_EMULATION
994 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
995#endif
996
997#ifdef CONFIG_X86_32
1223 if (cpu_has_fxsr) { 998 if (cpu_has_fxsr) {
1224 printk(KERN_INFO "Enabling fast FPU save and restore... "); 999 printk(KERN_INFO "Enabling fast FPU save and restore... ");
1225 set_in_cr4(X86_CR4_OSFXSR); 1000 set_in_cr4(X86_CR4_OSFXSR);
@@ -1232,37 +1007,20 @@ void __init trap_init(void)
1232 printk("done.\n"); 1007 printk("done.\n");
1233 } 1008 }
1234 1009
1235 set_system_gate(SYSCALL_VECTOR, &system_call); 1010 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
1236 1011
1237 /* Reserve all the builtin and the syscall vector: */ 1012 /* Reserve all the builtin and the syscall vector: */
1238 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) 1013 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
1239 set_bit(i, used_vectors); 1014 set_bit(i, used_vectors);
1240 1015
1241 set_bit(SYSCALL_VECTOR, used_vectors); 1016 set_bit(SYSCALL_VECTOR, used_vectors);
1242 1017#endif
1243 init_thread_xstate();
1244 /* 1018 /*
1245 * Should be a barrier for any external CPU state: 1019 * Should be a barrier for any external CPU state:
1246 */ 1020 */
1247 cpu_init(); 1021 cpu_init();
1248 1022
1023#ifdef CONFIG_X86_32
1249 trap_init_hook(); 1024 trap_init_hook();
1025#endif
1250} 1026}
1251
1252static int __init kstack_setup(char *s)
1253{
1254 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1255
1256 return 1;
1257}
1258__setup("kstack=", kstack_setup);
1259
1260static int __init code_bytes_setup(char *s)
1261{
1262 code_bytes = simple_strtoul(s, NULL, 0);
1263 if (code_bytes > 8192)
1264 code_bytes = 8192;
1265
1266 return 1;
1267}
1268__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
deleted file mode 100644
index 2696a6837782..000000000000
--- a/arch/x86/kernel/traps_64.c
+++ /dev/null
@@ -1,1217 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 *
5 * Pentium III FXSR, SSE support
6 * Gareth Hughes <gareth@valinux.com>, May 2000
7 */
8
9/*
10 * 'Traps.c' handles hardware traps and faults after we have saved some
11 * state in 'entry.S'.
12 */
13#include <linux/moduleparam.h>
14#include <linux/interrupt.h>
15#include <linux/kallsyms.h>
16#include <linux/spinlock.h>
17#include <linux/kprobes.h>
18#include <linux/uaccess.h>
19#include <linux/utsname.h>
20#include <linux/kdebug.h>
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/ptrace.h>
24#include <linux/string.h>
25#include <linux/unwind.h>
26#include <linux/delay.h>
27#include <linux/errno.h>
28#include <linux/kexec.h>
29#include <linux/sched.h>
30#include <linux/timer.h>
31#include <linux/init.h>
32#include <linux/bug.h>
33#include <linux/nmi.h>
34#include <linux/mm.h>
35
36#if defined(CONFIG_EDAC)
37#include <linux/edac.h>
38#endif
39
40#include <asm/stacktrace.h>
41#include <asm/processor.h>
42#include <asm/debugreg.h>
43#include <asm/atomic.h>
44#include <asm/system.h>
45#include <asm/unwind.h>
46#include <asm/desc.h>
47#include <asm/i387.h>
48#include <asm/nmi.h>
49#include <asm/smp.h>
50#include <asm/io.h>
51#include <asm/pgalloc.h>
52#include <asm/proto.h>
53#include <asm/pda.h>
54
55#include <mach_traps.h>
56
57asmlinkage void divide_error(void);
58asmlinkage void debug(void);
59asmlinkage void nmi(void);
60asmlinkage void int3(void);
61asmlinkage void overflow(void);
62asmlinkage void bounds(void);
63asmlinkage void invalid_op(void);
64asmlinkage void device_not_available(void);
65asmlinkage void double_fault(void);
66asmlinkage void coprocessor_segment_overrun(void);
67asmlinkage void invalid_TSS(void);
68asmlinkage void segment_not_present(void);
69asmlinkage void stack_segment(void);
70asmlinkage void general_protection(void);
71asmlinkage void page_fault(void);
72asmlinkage void coprocessor_error(void);
73asmlinkage void simd_coprocessor_error(void);
74asmlinkage void alignment_check(void);
75asmlinkage void spurious_interrupt_bug(void);
76asmlinkage void machine_check(void);
77
78int panic_on_unrecovered_nmi;
79int kstack_depth_to_print = 12;
80static unsigned int code_bytes = 64;
81static int ignore_nmis;
82static int die_counter;
83
84static inline void conditional_sti(struct pt_regs *regs)
85{
86 if (regs->flags & X86_EFLAGS_IF)
87 local_irq_enable();
88}
89
90static inline void preempt_conditional_sti(struct pt_regs *regs)
91{
92 inc_preempt_count();
93 if (regs->flags & X86_EFLAGS_IF)
94 local_irq_enable();
95}
96
97static inline void preempt_conditional_cli(struct pt_regs *regs)
98{
99 if (regs->flags & X86_EFLAGS_IF)
100 local_irq_disable();
101 /* Make sure to not schedule here because we could be running
102 on an exception stack. */
103 dec_preempt_count();
104}
105
106void printk_address(unsigned long address, int reliable)
107{
108 printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
109}
110
111static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
112 unsigned *usedp, char **idp)
113{
114 static char ids[][8] = {
115 [DEBUG_STACK - 1] = "#DB",
116 [NMI_STACK - 1] = "NMI",
117 [DOUBLEFAULT_STACK - 1] = "#DF",
118 [STACKFAULT_STACK - 1] = "#SS",
119 [MCE_STACK - 1] = "#MC",
120#if DEBUG_STKSZ > EXCEPTION_STKSZ
121 [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
122#endif
123 };
124 unsigned k;
125
126 /*
127 * Iterate over all exception stacks, and figure out whether
128 * 'stack' is in one of them:
129 */
130 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
131 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
132 /*
133 * Is 'stack' above this exception frame's end?
134 * If yes then skip to the next frame.
135 */
136 if (stack >= end)
137 continue;
138 /*
139 * Is 'stack' above this exception frame's start address?
140 * If yes then we found the right frame.
141 */
142 if (stack >= end - EXCEPTION_STKSZ) {
143 /*
144 * Make sure we only iterate through an exception
145 * stack once. If it comes up for the second time
146 * then there's something wrong going on - just
147 * break out and return NULL:
148 */
149 if (*usedp & (1U << k))
150 break;
151 *usedp |= 1U << k;
152 *idp = ids[k];
153 return (unsigned long *)end;
154 }
155 /*
156 * If this is a debug stack, and if it has a larger size than
157 * the usual exception stacks, then 'stack' might still
158 * be within the lower portion of the debug stack:
159 */
160#if DEBUG_STKSZ > EXCEPTION_STKSZ
161 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
162 unsigned j = N_EXCEPTION_STACKS - 1;
163
164 /*
165 * Black magic. A large debug stack is composed of
166 * multiple exception stack entries, which we
 167 * iterate through now. Don't look:
168 */
169 do {
170 ++j;
171 end -= EXCEPTION_STKSZ;
172 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
173 } while (stack < end - EXCEPTION_STKSZ);
174 if (*usedp & (1U << j))
175 break;
176 *usedp |= 1U << j;
177 *idp = ids[j];
178 return (unsigned long *)end;
179 }
180#endif
181 }
182 return NULL;
183}
184
185/*
186 * x86-64 can have up to three kernel stacks:
187 * process stack
188 * interrupt stack
189 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
190 */
191
192static inline int valid_stack_ptr(struct thread_info *tinfo,
193 void *p, unsigned int size, void *end)
194{
195 void *t = tinfo;
196 if (end) {
197 if (p < end && p >= (end-THREAD_SIZE))
198 return 1;
199 else
200 return 0;
201 }
202 return p > t && p < t + THREAD_SIZE - size;
203}
204
205/* The form of the top of the frame on the stack */
206struct stack_frame {
207 struct stack_frame *next_frame;
208 unsigned long return_address;
209};
210
211static inline unsigned long
212print_context_stack(struct thread_info *tinfo,
213 unsigned long *stack, unsigned long bp,
214 const struct stacktrace_ops *ops, void *data,
215 unsigned long *end)
216{
217 struct stack_frame *frame = (struct stack_frame *)bp;
218
219 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
220 unsigned long addr;
221
222 addr = *stack;
223 if (__kernel_text_address(addr)) {
224 if ((unsigned long) stack == bp + 8) {
225 ops->address(data, addr, 1);
226 frame = frame->next_frame;
227 bp = (unsigned long) frame;
228 } else {
229 ops->address(data, addr, bp == 0);
230 }
231 }
232 stack++;
233 }
234 return bp;
235}
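print_context_stack() only marks an address as reliable when it sits at bp + 8, i.e. immediately above a saved frame pointer, and then follows frame->next_frame. For orientation, the x86-64 frame layout that check assumes:

	/*
	 * Stack layout assumed by the "stack == bp + 8" test (higher addresses up):
	 *
	 *     [ return address ]   <- bp + 8   (reported with reliable == 1)
	 *     [ saved rbp      ]   <- bp       (frame->next_frame lives here)
	 *     [ locals ...     ]
	 */
	struct frame_layout_example {		/* mirrors struct stack_frame above */
		struct frame_layout_example *next_frame;	/* the saved rbp */
		unsigned long return_address;			/* at bp + 8 */
	};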
236
237void dump_trace(struct task_struct *task, struct pt_regs *regs,
238 unsigned long *stack, unsigned long bp,
239 const struct stacktrace_ops *ops, void *data)
240{
241 const unsigned cpu = get_cpu();
242 unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
243 unsigned used = 0;
244 struct thread_info *tinfo;
245
246 if (!task)
247 task = current;
248
249 if (!stack) {
250 unsigned long dummy;
251 stack = &dummy;
252 if (task && task != current)
253 stack = (unsigned long *)task->thread.sp;
254 }
255
256#ifdef CONFIG_FRAME_POINTER
257 if (!bp) {
258 if (task == current) {
259 /* Grab bp right from our regs */
260 asm("movq %%rbp, %0" : "=r" (bp) :);
261 } else {
262 /* bp is the last reg pushed by switch_to */
263 bp = *(unsigned long *) task->thread.sp;
264 }
265 }
266#endif
267
268 /*
269 * Print function call entries in all stacks, starting at the
270 * current stack address. If the stacks consist of nested
271 * exceptions
272 */
273 tinfo = task_thread_info(task);
274 for (;;) {
275 char *id;
276 unsigned long *estack_end;
277 estack_end = in_exception_stack(cpu, (unsigned long)stack,
278 &used, &id);
279
280 if (estack_end) {
281 if (ops->stack(data, id) < 0)
282 break;
283
284 bp = print_context_stack(tinfo, stack, bp, ops,
285 data, estack_end);
286 ops->stack(data, "<EOE>");
287 /*
288 * We link to the next stack via the
289 * second-to-last pointer (index -2 to end) in the
290 * exception stack:
291 */
292 stack = (unsigned long *) estack_end[-2];
293 continue;
294 }
295 if (irqstack_end) {
296 unsigned long *irqstack;
297 irqstack = irqstack_end -
298 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
299
300 if (stack >= irqstack && stack < irqstack_end) {
301 if (ops->stack(data, "IRQ") < 0)
302 break;
303 bp = print_context_stack(tinfo, stack, bp,
304 ops, data, irqstack_end);
305 /*
306 * We link to the next stack (which would be
307 * the process stack normally) the last
308 * pointer (index -1 to end) in the IRQ stack:
309 */
310 stack = (unsigned long *) (irqstack_end[-1]);
311 irqstack_end = NULL;
312 ops->stack(data, "EOI");
313 continue;
314 }
315 }
316 break;
317 }
318
319 /*
320 * This handles the process stack:
321 */
322 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
323 put_cpu();
324}
325EXPORT_SYMBOL(dump_trace);
326
327static void
328print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
329{
330 print_symbol(msg, symbol);
331 printk("\n");
332}
333
334static void print_trace_warning(void *data, char *msg)
335{
336 printk("%s\n", msg);
337}
338
339static int print_trace_stack(void *data, char *name)
340{
341 printk(" <%s> ", name);
342 return 0;
343}
344
345static void print_trace_address(void *data, unsigned long addr, int reliable)
346{
347 touch_nmi_watchdog();
348 printk_address(addr, reliable);
349}
350
351static const struct stacktrace_ops print_trace_ops = {
352 .warning = print_trace_warning,
353 .warning_symbol = print_trace_warning_symbol,
354 .stack = print_trace_stack,
355 .address = print_trace_address,
356};
357
358void show_trace(struct task_struct *task, struct pt_regs *regs,
359 unsigned long *stack, unsigned long bp)
360{
361 printk("\nCall Trace:\n");
362 dump_trace(task, regs, stack, bp, &print_trace_ops, NULL);
363 printk("\n");
364}
365
366static void
367_show_stack(struct task_struct *task, struct pt_regs *regs,
368 unsigned long *sp, unsigned long bp)
369{
370 unsigned long *stack;
371 int i;
372 const int cpu = smp_processor_id();
373 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
374 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
375
376 // debugging aid: "show_stack(NULL, NULL);" prints the
377 // back trace for this cpu.
378
379 if (sp == NULL) {
380 if (task)
381 sp = (unsigned long *)task->thread.sp;
382 else
383 sp = (unsigned long *)&sp;
384 }
385
386 stack = sp;
387 for (i = 0; i < kstack_depth_to_print; i++) {
388 if (stack >= irqstack && stack <= irqstack_end) {
389 if (stack == irqstack_end) {
390 stack = (unsigned long *) (irqstack_end[-1]);
391 printk(" <EOI> ");
392 }
393 } else {
394 if (((long) stack & (THREAD_SIZE-1)) == 0)
395 break;
396 }
397 if (i && ((i % 4) == 0))
398 printk("\n");
399 printk(" %016lx", *stack++);
400 touch_nmi_watchdog();
401 }
402 show_trace(task, regs, sp, bp);
403}
404
405void show_stack(struct task_struct *task, unsigned long *sp)
406{
407 _show_stack(task, NULL, sp, 0);
408}
409
410/*
411 * The architecture-independent dump_stack generator
412 */
413void dump_stack(void)
414{
415 unsigned long bp = 0;
416 unsigned long stack;
417
418#ifdef CONFIG_FRAME_POINTER
419 if (!bp)
420 asm("movq %%rbp, %0" : "=r" (bp):);
421#endif
422
423 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
424 current->pid, current->comm, print_tainted(),
425 init_utsname()->release,
426 (int)strcspn(init_utsname()->version, " "),
427 init_utsname()->version);
428 show_trace(NULL, NULL, &stack, bp);
429}
430
431EXPORT_SYMBOL(dump_stack);
432
433void show_registers(struct pt_regs *regs)
434{
435 int i;
436 unsigned long sp;
437 const int cpu = smp_processor_id();
438 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
439
440 sp = regs->sp;
441 printk("CPU %d ", cpu);
442 __show_regs(regs);
443 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
444 cur->comm, cur->pid, task_thread_info(cur), cur);
445
446 /*
447 * When in-kernel, we also print out the stack and code at the
448 * time of the fault..
449 */
450 if (!user_mode(regs)) {
451 unsigned int code_prologue = code_bytes * 43 / 64;
452 unsigned int code_len = code_bytes;
453 unsigned char c;
454 u8 *ip;
455
456 printk("Stack: ");
457 _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
458 printk("\n");
459
460 printk(KERN_EMERG "Code: ");
461
462 ip = (u8 *)regs->ip - code_prologue;
463 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
464 /* try starting at RIP */
465 ip = (u8 *)regs->ip;
466 code_len = code_len - code_prologue + 1;
467 }
468 for (i = 0; i < code_len; i++, ip++) {
469 if (ip < (u8 *)PAGE_OFFSET ||
470 probe_kernel_address(ip, c)) {
471 printk(" Bad RIP value.");
472 break;
473 }
474 if (ip == (u8 *)regs->ip)
475 printk("<%02x> ", c);
476 else
477 printk("%02x ", c);
478 }
479 }
480 printk("\n");
481}
482
483int is_valid_bugaddr(unsigned long ip)
484{
485 unsigned short ud2;
486
487 if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
488 return 0;
489
490 return ud2 == 0x0b0f;
491}
492
493static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
494static int die_owner = -1;
495static unsigned int die_nest_count;
496
497unsigned __kprobes long oops_begin(void)
498{
499 int cpu;
500 unsigned long flags;
501
502 oops_enter();
503
504 /* racy, but better than risking deadlock. */
505 raw_local_irq_save(flags);
506 cpu = smp_processor_id();
507 if (!__raw_spin_trylock(&die_lock)) {
508 if (cpu == die_owner)
509 /* nested oops. should stop eventually */;
510 else
511 __raw_spin_lock(&die_lock);
512 }
513 die_nest_count++;
514 die_owner = cpu;
515 console_verbose();
516 bust_spinlocks(1);
517 return flags;
518}
519
520void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
521{
522 die_owner = -1;
523 bust_spinlocks(0);
524 die_nest_count--;
525 if (!die_nest_count)
526 /* Nest count reaches zero, release the lock. */
527 __raw_spin_unlock(&die_lock);
528 raw_local_irq_restore(flags);
529 if (!regs) {
530 oops_exit();
531 return;
532 }
533 if (panic_on_oops)
534 panic("Fatal exception");
535 oops_exit();
536 do_exit(signr);
537}
538
539int __kprobes __die(const char *str, struct pt_regs *regs, long err)
540{
541 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
542#ifdef CONFIG_PREEMPT
543 printk("PREEMPT ");
544#endif
545#ifdef CONFIG_SMP
546 printk("SMP ");
547#endif
548#ifdef CONFIG_DEBUG_PAGEALLOC
549 printk("DEBUG_PAGEALLOC");
550#endif
551 printk("\n");
552 if (notify_die(DIE_OOPS, str, regs, err,
553 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
554 return 1;
555
556 show_registers(regs);
557 add_taint(TAINT_DIE);
558 /* Executive summary in case the oops scrolled away */
559 printk(KERN_ALERT "RIP ");
560 printk_address(regs->ip, 1);
561 printk(" RSP <%016lx>\n", regs->sp);
562 if (kexec_should_crash(current))
563 crash_kexec(regs);
564 return 0;
565}
566
567void die(const char *str, struct pt_regs *regs, long err)
568{
569 unsigned long flags = oops_begin();
570
571 if (!user_mode(regs))
572 report_bug(regs->ip, regs);
573
574 if (__die(str, regs, err))
575 regs = NULL;
576 oops_end(flags, regs, SIGSEGV);
577}
578
579notrace __kprobes void
580die_nmi(char *str, struct pt_regs *regs, int do_panic)
581{
582 unsigned long flags;
583
584 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
585 return;
586
587 flags = oops_begin();
588 /*
589 * We are in trouble anyway, lets at least try
590 * to get a message out.
591 */
592 printk(KERN_EMERG "%s", str);
593 printk(" on CPU%d, ip %08lx, registers:\n",
594 smp_processor_id(), regs->ip);
595 show_registers(regs);
596 if (kexec_should_crash(current))
597 crash_kexec(regs);
598 if (do_panic || panic_on_oops)
599 panic("Non maskable interrupt");
600 oops_end(flags, NULL, SIGBUS);
601 nmi_exit();
602 local_irq_enable();
603 do_exit(SIGBUS);
604}
605
606static void __kprobes
607do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
608 long error_code, siginfo_t *info)
609{
610 struct task_struct *tsk = current;
611
612 if (!user_mode(regs))
613 goto kernel_trap;
614
615 /*
616 * We want error_code and trap_no set for userspace faults and
617 * kernelspace faults which result in die(), but not
618 * kernelspace faults which are fixed up. die() gives the
619 * process no chance to handle the signal and notice the
620 * kernel fault information, so that won't result in polluting
621 * the information about previously queued, but not yet
622 * delivered, faults. See also do_general_protection below.
623 */
624 tsk->thread.error_code = error_code;
625 tsk->thread.trap_no = trapnr;
626
627 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
628 printk_ratelimit()) {
629 printk(KERN_INFO
630 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
631 tsk->comm, tsk->pid, str,
632 regs->ip, regs->sp, error_code);
633 print_vma_addr(" in ", regs->ip);
634 printk("\n");
635 }
636
637 if (info)
638 force_sig_info(signr, info, tsk);
639 else
640 force_sig(signr, tsk);
641 return;
642
643kernel_trap:
644 if (!fixup_exception(regs)) {
645 tsk->thread.error_code = error_code;
646 tsk->thread.trap_no = trapnr;
647 die(str, regs, error_code);
648 }
649 return;
650}
651
652#define DO_ERROR(trapnr, signr, str, name) \
653asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
654{ \
655 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
656 == NOTIFY_STOP) \
657 return; \
658 conditional_sti(regs); \
659 do_trap(trapnr, signr, str, regs, error_code, NULL); \
660}
661
662#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
663asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
664{ \
665 siginfo_t info; \
666 info.si_signo = signr; \
667 info.si_errno = 0; \
668 info.si_code = sicode; \
669 info.si_addr = (void __user *)siaddr; \
670 trace_hardirqs_fixup(); \
671 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
672 == NOTIFY_STOP) \
673 return; \
674 conditional_sti(regs); \
675 do_trap(trapnr, signr, str, regs, error_code, &info); \
676}
677
678DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
679DO_ERROR(4, SIGSEGV, "overflow", overflow)
680DO_ERROR(5, SIGSEGV, "bounds", bounds)
681DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
682DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
683DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
684DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
685DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
686
687/* Runs on IST stack */
688asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
689{
690 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
691 12, SIGBUS) == NOTIFY_STOP)
692 return;
693 preempt_conditional_sti(regs);
694 do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
695 preempt_conditional_cli(regs);
696}
697
698asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
699{
700 static const char str[] = "double fault";
701 struct task_struct *tsk = current;
702
 703 /* Return not checked because a double fault cannot be ignored */
704 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
705
706 tsk->thread.error_code = error_code;
707 tsk->thread.trap_no = 8;
708
709 /* This is always a kernel trap and never fixable (and thus must
710 never return). */
711 for (;;)
712 die(str, regs, error_code);
713}
714
715asmlinkage void __kprobes
716do_general_protection(struct pt_regs *regs, long error_code)
717{
718 struct task_struct *tsk;
719
720 conditional_sti(regs);
721
722 tsk = current;
723 if (!user_mode(regs))
724 goto gp_in_kernel;
725
726 tsk->thread.error_code = error_code;
727 tsk->thread.trap_no = 13;
728
729 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
730 printk_ratelimit()) {
731 printk(KERN_INFO
732 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
733 tsk->comm, tsk->pid,
734 regs->ip, regs->sp, error_code);
735 print_vma_addr(" in ", regs->ip);
736 printk("\n");
737 }
738
739 force_sig(SIGSEGV, tsk);
740 return;
741
742gp_in_kernel:
743 if (fixup_exception(regs))
744 return;
745
746 tsk->thread.error_code = error_code;
747 tsk->thread.trap_no = 13;
748 if (notify_die(DIE_GPF, "general protection fault", regs,
749 error_code, 13, SIGSEGV) == NOTIFY_STOP)
750 return;
751 die("general protection fault", regs, error_code);
752}
753
754static notrace __kprobes void
755mem_parity_error(unsigned char reason, struct pt_regs *regs)
756{
757 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
758 reason);
759 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
760
761#if defined(CONFIG_EDAC)
762 if (edac_handler_set()) {
763 edac_atomic_assert_error();
764 return;
765 }
766#endif
767
768 if (panic_on_unrecovered_nmi)
769 panic("NMI: Not continuing");
770
771 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
772
773 /* Clear and disable the memory parity error line. */
774 reason = (reason & 0xf) | 4;
775 outb(reason, 0x61);
776}
777
778static notrace __kprobes void
779io_check_error(unsigned char reason, struct pt_regs *regs)
780{
781 printk("NMI: IOCK error (debug interrupt?)\n");
782 show_registers(regs);
783
784 /* Re-enable the IOCK line, wait for a few seconds */
785 reason = (reason & 0xf) | 8;
786 outb(reason, 0x61);
787 mdelay(2000);
788 reason &= ~8;
789 outb(reason, 0x61);
790}
791
792static notrace __kprobes void
793unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
794{
795 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
796 return;
797 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
798 reason);
799 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
800
801 if (panic_on_unrecovered_nmi)
802 panic("NMI: Not continuing");
803
804 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
805}
806
807/* Runs on IST stack. This code must keep interrupts off all the time.
808 Nested NMIs are prevented by the CPU. */
809asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
810{
811 unsigned char reason = 0;
812 int cpu;
813
814 cpu = smp_processor_id();
815
816 /* Only the BSP gets external NMIs from the system. */
817 if (!cpu)
818 reason = get_nmi_reason();
819
820 if (!(reason & 0xc0)) {
821 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
822 == NOTIFY_STOP)
823 return;
824 /*
825 * Ok, so this is none of the documented NMI sources,
826 * so it must be the NMI watchdog.
827 */
828 if (nmi_watchdog_tick(regs, reason))
829 return;
830 if (!do_nmi_callback(regs, cpu))
831 unknown_nmi_error(reason, regs);
832
833 return;
834 }
835 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
836 return;
837
838 /* AK: following checks seem to be broken on modern chipsets. FIXME */
839 if (reason & 0x80)
840 mem_parity_error(reason, regs);
841 if (reason & 0x40)
842 io_check_error(reason, regs);
843}
844
845asmlinkage notrace __kprobes void
846do_nmi(struct pt_regs *regs, long error_code)
847{
848 nmi_enter();
849
850 add_pda(__nmi_count, 1);
851
852 if (!ignore_nmis)
853 default_do_nmi(regs);
854
855 nmi_exit();
856}
857
858void stop_nmi(void)
859{
860 acpi_nmi_disable();
861 ignore_nmis++;
862}
863
864void restart_nmi(void)
865{
866 ignore_nmis--;
867 acpi_nmi_enable();
868}
869
870/* runs on IST stack. */
871asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
872{
873 trace_hardirqs_fixup();
874
875 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
876 == NOTIFY_STOP)
877 return;
878
879 preempt_conditional_sti(regs);
880 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
881 preempt_conditional_cli(regs);
882}
883
884/* Help handler running on IST stack to switch back to user stack
885 for scheduling or signal handling. The actual stack switch is done in
886 entry.S */
887asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
888{
889 struct pt_regs *regs = eregs;
 890 /* Already synced */
891 if (eregs == (struct pt_regs *)eregs->sp)
892 ;
893 /* Exception from user space */
894 else if (user_mode(eregs))
895 regs = task_pt_regs(current);
896 /* Exception from kernel and interrupts are enabled. Move to
897 kernel process stack. */
898 else if (eregs->flags & X86_EFLAGS_IF)
899 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
900 if (eregs != regs)
901 *regs = *eregs;
902 return regs;
903}
904
905/* runs on IST stack. */
906asmlinkage void __kprobes do_debug(struct pt_regs * regs,
907 unsigned long error_code)
908{
909 struct task_struct *tsk = current;
910 unsigned long condition;
911 siginfo_t info;
912
913 trace_hardirqs_fixup();
914
915 get_debugreg(condition, 6);
916
917 /*
918 * The processor cleared BTF, so don't mark that we need it set.
919 */
920 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
921 tsk->thread.debugctlmsr = 0;
922
923 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
924 SIGTRAP) == NOTIFY_STOP)
925 return;
926
927 preempt_conditional_sti(regs);
928
929 /* Mask out spurious debug traps due to lazy DR7 setting */
930 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
931 if (!tsk->thread.debugreg7)
932 goto clear_dr7;
933 }
934
935 tsk->thread.debugreg6 = condition;
936
937 /*
938 * Single-stepping through TF: make sure we ignore any events in
939 * kernel space (but re-enable TF when returning to user mode).
940 */
941 if (condition & DR_STEP) {
942 if (!user_mode(regs))
943 goto clear_TF_reenable;
944 }
945
946 /* Ok, finally something we can handle */
947 tsk->thread.trap_no = 1;
948 tsk->thread.error_code = error_code;
949 info.si_signo = SIGTRAP;
950 info.si_errno = 0;
951 info.si_code = TRAP_BRKPT;
952 info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
953 force_sig_info(SIGTRAP, &info, tsk);
954
955clear_dr7:
956 set_debugreg(0, 7);
957 preempt_conditional_cli(regs);
958 return;
959
960clear_TF_reenable:
961 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
962 regs->flags &= ~X86_EFLAGS_TF;
963 preempt_conditional_cli(regs);
964 return;
965}
966
967static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
968{
969 if (fixup_exception(regs))
970 return 1;
971
972 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
973 /* Illegal floating point operation in the kernel */
974 current->thread.trap_no = trapnr;
975 die(str, regs, 0);
976 return 0;
977}
978
979/*
980 * Note that we play around with the 'TS' bit in an attempt to get
981 * the correct behaviour even in the presence of the asynchronous
982 * IRQ13 behaviour
983 */
984asmlinkage void do_coprocessor_error(struct pt_regs *regs)
985{
986 void __user *ip = (void __user *)(regs->ip);
987 struct task_struct *task;
988 siginfo_t info;
989 unsigned short cwd, swd;
990
991 conditional_sti(regs);
992 if (!user_mode(regs) &&
993 kernel_math_error(regs, "kernel x87 math error", 16))
994 return;
995
996 /*
997 * Save the info for the exception handler and clear the error.
998 */
999 task = current;
1000 save_init_fpu(task);
1001 task->thread.trap_no = 16;
1002 task->thread.error_code = 0;
1003 info.si_signo = SIGFPE;
1004 info.si_errno = 0;
1005 info.si_code = __SI_FAULT;
1006 info.si_addr = ip;
1007 /*
1008 * (~cwd & swd) will mask out exceptions that are not set to unmasked
1009 * status. 0x3f is the exception bits in these regs, 0x200 is the
1010 * C1 reg you need in case of a stack fault, 0x040 is the stack
1011 * fault bit. We should only be taking one exception at a time,
1012 * so if this combination doesn't produce any single exception,
1013 * then we have a bad program that isn't synchronizing its FPU usage
1014 * and it will suffer the consequences since we won't be able to
1015 * fully reproduce the context of the exception
1016 */
1017 cwd = get_fpu_cwd(task);
1018 swd = get_fpu_swd(task);
1019 switch (swd & ~cwd & 0x3f) {
1020 case 0x000: /* No unmasked exception */
1021 default: /* Multiple exceptions */
1022 break;
1023 case 0x001: /* Invalid Op */
1024 /*
1025 * swd & 0x240 == 0x040: Stack Underflow
1026 * swd & 0x240 == 0x240: Stack Overflow
1027 * User must clear the SF bit (0x40) if set
1028 */
1029 info.si_code = FPE_FLTINV;
1030 break;
1031 case 0x002: /* Denormalize */
1032 case 0x010: /* Underflow */
1033 info.si_code = FPE_FLTUND;
1034 break;
1035 case 0x004: /* Zero Divide */
1036 info.si_code = FPE_FLTDIV;
1037 break;
1038 case 0x008: /* Overflow */
1039 info.si_code = FPE_FLTOVF;
1040 break;
1041 case 0x020: /* Precision */
1042 info.si_code = FPE_FLTRES;
1043 break;
1044 }
1045 force_sig_info(SIGFPE, &info, task);
1046}
1047
1048asmlinkage void bad_intr(void)
1049{
1050 printk("bad interrupt");
1051}
1052
1053asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1054{
1055 void __user *ip = (void __user *)(regs->ip);
1056 struct task_struct *task;
1057 siginfo_t info;
1058 unsigned short mxcsr;
1059
1060 conditional_sti(regs);
1061 if (!user_mode(regs) &&
1062 kernel_math_error(regs, "kernel simd math error", 19))
1063 return;
1064
1065 /*
1066 * Save the info for the exception handler and clear the error.
1067 */
1068 task = current;
1069 save_init_fpu(task);
1070 task->thread.trap_no = 19;
1071 task->thread.error_code = 0;
1072 info.si_signo = SIGFPE;
1073 info.si_errno = 0;
1074 info.si_code = __SI_FAULT;
1075 info.si_addr = ip;
1076 /*
1077 * The SIMD FPU exceptions are handled a little differently, as there
1078 * is only a single status/control register. Thus, to determine which
1079 * unmasked exception was caught we must mask the exception mask bits
1080 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
1081 */
1082 mxcsr = get_fpu_mxcsr(task);
1083 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
1084 case 0x000:
1085 default:
1086 break;
1087 case 0x001: /* Invalid Op */
1088 info.si_code = FPE_FLTINV;
1089 break;
1090 case 0x002: /* Denormalize */
1091 case 0x010: /* Underflow */
1092 info.si_code = FPE_FLTUND;
1093 break;
1094 case 0x004: /* Zero Divide */
1095 info.si_code = FPE_FLTDIV;
1096 break;
1097 case 0x008: /* Overflow */
1098 info.si_code = FPE_FLTOVF;
1099 break;
1100 case 0x020: /* Precision */
1101 info.si_code = FPE_FLTRES;
1102 break;
1103 }
1104 force_sig_info(SIGFPE, &info, task);
1105}
1106
1107asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
1108{
1109}
1110
1111asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
1112{
1113}
1114
1115asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
1116{
1117}
1118
1119/*
1120 * 'math_state_restore()' saves the current math information in the
1121 * old math state array, and gets the new ones from the current task
1122 *
1123 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
1124 * Don't touch unless you *really* know how it works.
1125 */
1126asmlinkage void math_state_restore(void)
1127{
1128 struct task_struct *me = current;
1129
1130 if (!used_math()) {
1131 local_irq_enable();
1132 /*
1133 * does a slab alloc which can sleep
1134 */
1135 if (init_fpu(me)) {
1136 /*
1137 * ran out of memory!
1138 */
1139 do_group_exit(SIGKILL);
1140 return;
1141 }
1142 local_irq_disable();
1143 }
1144
1145 clts(); /* Allow maths ops (or we recurse) */
1146 restore_fpu_checking(&me->thread.xstate->fxsave);
1147 task_thread_info(me)->status |= TS_USEDFPU;
1148 me->fpu_counter++;
1149}
1150EXPORT_SYMBOL_GPL(math_state_restore);
1151
1152void __init trap_init(void)
1153{
1154 set_intr_gate(0, &divide_error);
1155 set_intr_gate_ist(1, &debug, DEBUG_STACK);
1156 set_intr_gate_ist(2, &nmi, NMI_STACK);
1157 set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */
1158 set_system_gate(4, &overflow); /* int4 can be called from all */
1159 set_intr_gate(5, &bounds);
1160 set_intr_gate(6, &invalid_op);
1161 set_intr_gate(7, &device_not_available);
1162 set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
1163 set_intr_gate(9, &coprocessor_segment_overrun);
1164 set_intr_gate(10, &invalid_TSS);
1165 set_intr_gate(11, &segment_not_present);
1166 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
1167 set_intr_gate(13, &general_protection);
1168 set_intr_gate(14, &page_fault);
1169 set_intr_gate(15, &spurious_interrupt_bug);
1170 set_intr_gate(16, &coprocessor_error);
1171 set_intr_gate(17, &alignment_check);
1172#ifdef CONFIG_X86_MCE
1173 set_intr_gate_ist(18, &machine_check, MCE_STACK);
1174#endif
1175 set_intr_gate(19, &simd_coprocessor_error);
1176
1177#ifdef CONFIG_IA32_EMULATION
1178 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
1179#endif
1180 /*
1181 * initialize the per thread extended state:
1182 */
1183 init_thread_xstate();
1184 /*
1185 * Should be a barrier for any external CPU state:
1186 */
1187 cpu_init();
1188}
1189
1190static int __init oops_setup(char *s)
1191{
1192 if (!s)
1193 return -EINVAL;
1194 if (!strcmp(s, "panic"))
1195 panic_on_oops = 1;
1196 return 0;
1197}
1198early_param("oops", oops_setup);
1199
1200static int __init kstack_setup(char *s)
1201{
1202 if (!s)
1203 return -EINVAL;
1204 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1205 return 0;
1206}
1207early_param("kstack", kstack_setup);
1208
1209static int __init code_bytes_setup(char *s)
1210{
1211 code_bytes = simple_strtoul(s, NULL, 0);
1212 if (code_bytes > 8192)
1213 code_bytes = 8192;
1214
1215 return 1;
1216}
1217__setup("code_bytes=", code_bytes_setup);
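The three handlers above consume early boot parameters. As a usage illustration (the values are chosen arbitrarily), a debugging session might boot with:

	oops=panic kstack=64 code_bytes=128

which makes any oops panic the machine, prints up to 64 words of the raw stack in dumps, and shows 128 bytes of code around the faulting instruction (code_bytes is clamped to 8192).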
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7603c0553909..161bb850fc47 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
104/* 104/*
105 * Read TSC and the reference counters. Take care of SMI disturbance 105 * Read TSC and the reference counters. Take care of SMI disturbance
106 */ 106 */
107static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) 107static u64 tsc_read_refs(u64 *p, int hpet)
108{ 108{
109 u64 t1, t2; 109 u64 t1, t2;
110 int i; 110 int i;
@@ -112,9 +112,9 @@ static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
112 for (i = 0; i < MAX_RETRIES; i++) { 112 for (i = 0; i < MAX_RETRIES; i++) {
113 t1 = get_cycles(); 113 t1 = get_cycles();
114 if (hpet) 114 if (hpet)
115 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; 115 *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
116 else 116 else
117 *pm = acpi_pm_read_early(); 117 *p = acpi_pm_read_early();
118 t2 = get_cycles(); 118 t2 = get_cycles();
119 if ((t2 - t1) < SMI_TRESHOLD) 119 if ((t2 - t1) < SMI_TRESHOLD)
120 return t2; 120 return t2;
@@ -122,80 +122,390 @@ static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
122 return ULLONG_MAX; 122 return ULLONG_MAX;
123} 123}
124 124
125/** 125/*
126 * native_calibrate_tsc - calibrate the tsc on boot 126 * Calculate the TSC frequency from HPET reference
127 */ 127 */
128unsigned long native_calibrate_tsc(void) 128static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
129{ 129{
130 unsigned long flags; 130 u64 tmp;
131 u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2;
132 int hpet = is_hpet_enabled();
133 unsigned int tsc_khz_val = 0;
134 131
135 local_irq_save(flags); 132 if (hpet2 < hpet1)
133 hpet2 += 0x100000000ULL;
134 hpet2 -= hpet1;
135 tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
136 do_div(tmp, 1000000);
137 do_div(deltatsc, tmp);
138
139 return (unsigned long) deltatsc;
140}
141
142/*
143 * Calculate the TSC frequency from PMTimer reference
144 */
145static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
146{
147 u64 tmp;
136 148
137 tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); 149 if (!pm1 && !pm2)
150 return ULONG_MAX;
151
152 if (pm2 < pm1)
153 pm2 += (u64)ACPI_PM_OVRRUN;
154 pm2 -= pm1;
155 tmp = pm2 * 1000000000LL;
156 do_div(tmp, PMTMR_TICKS_PER_SEC);
157 do_div(deltatsc, tmp);
158
159 return (unsigned long) deltatsc;
160}
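Both reference helpers do the same unit bookkeeping: turn the HPET or PM-timer delta into elapsed nanoseconds, then divide the TSC delta, which the caller has pre-multiplied by 1e6, by that, leaving cycles per millisecond, i.e. kHz. A worked sketch with assumed round numbers (a 2 GHz TSC, a 14.318 MHz HPET, a ~50 ms delay; none of these are measured values):

/* Worked example of the arithmetic above; every number is assumed. */
static unsigned long calc_hpet_ref_example(void)
{
	unsigned long long deltatsc = 100000000ULL * 1000000ULL; /* 1e8 cycles, pre-scaled */
	unsigned long long ticks    = 715909;      /* hpet2 - hpet1 over ~50 ms */
	unsigned long long period   = 69841279;    /* HPET_PERIOD, femtoseconds per tick */
	unsigned long long ns       = ticks * period / 1000000;  /* ~50,000,000 ns */

	return (unsigned long)(deltatsc / ns);     /* ~2,000,000 kHz, i.e. 2 GHz */
}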
161
162#define CAL_MS 10
163#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
164#define CAL_PIT_LOOPS 1000
165
166#define CAL2_MS 50
167#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
168#define CAL2_PIT_LOOPS 5000
169
170
171/*
172 * Try to calibrate the TSC against the Programmable
173 * Interrupt Timer and return the frequency of the TSC
174 * in kHz.
175 *
176 * Return ULONG_MAX on failure to calibrate.
177 */
178static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
179{
180 u64 tsc, t1, t2, delta;
181 unsigned long tscmin, tscmax;
182 int pitcnt;
138 183
184 /* Set the Gate high, disable speaker */
139 outb((inb(0x61) & ~0x02) | 0x01, 0x61); 185 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
140 186
187 /*
 188	 * Setup CTC channel 2 for mode 0 (interrupt on terminal
 189	 * count mode), binary count. Set the latch register
 190	 * (LSB then MSB) to begin countdown.
191 */
141 outb(0xb0, 0x43); 192 outb(0xb0, 0x43);
142 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); 193 outb(latch & 0xff, 0x42);
143 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); 194 outb(latch >> 8, 0x42);
144 tr1 = get_cycles(); 195
145 while ((inb(0x61) & 0x20) == 0); 196 tsc = t1 = t2 = get_cycles();
146 tr2 = get_cycles(); 197
198 pitcnt = 0;
199 tscmax = 0;
200 tscmin = ULONG_MAX;
201 while ((inb(0x61) & 0x20) == 0) {
202 t2 = get_cycles();
203 delta = t2 - tsc;
204 tsc = t2;
205 if ((unsigned long) delta < tscmin)
206 tscmin = (unsigned int) delta;
207 if ((unsigned long) delta > tscmax)
208 tscmax = (unsigned int) delta;
209 pitcnt++;
210 }
211
212 /*
213 * Sanity checks:
214 *
215 * If we were not able to read the PIT more than loopmin
216 * times, then we have been hit by a massive SMI
217 *
218 * If the maximum is 10 times larger than the minimum,
219 * then we got hit by an SMI as well.
220 */
221 if (pitcnt < loopmin || tscmax > 10 * tscmin)
222 return ULONG_MAX;
223
224 /* Calculate the PIT value */
225 delta = t2 - t1;
226 do_div(delta, ms);
227 return delta;
228}
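CAL_LATCH and CAL_MS are tied together: the latch is the number of PIT input clocks in CAL_MS milliseconds, so once the channel has counted down to zero exactly that many milliseconds have passed, and dividing the TSC delta by ms gives kHz directly. The arithmetic, assuming the canonical 1,193,182 Hz PIT input clock:

/* Sketch of the latch arithmetic; 1193182 Hz is the usual PC value of
 * CLOCK_TICK_RATE and is assumed here, not taken from a header. */
#define EX_PIT_HZ	1193182
#define EX_CAL_MS	10
#define EX_CAL_LATCH	(EX_PIT_HZ / (1000 / EX_CAL_MS))	/* 1193182 / 100 = 11931 */

/* 11931 ticks / 1193182 Hz ~= 10.0 ms.  If get_cycles() advanced by
 * 20,000,000 over that window, 20,000,000 cycles / 10 ms = 2,000,000 kHz. */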
147 229
148 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); 230/*
231 * This reads the current MSB of the PIT counter, and
232 * checks if we are running on sufficiently fast and
233 * non-virtualized hardware.
234 *
235 * Our expectations are:
236 *
237 * - the PIT is running at roughly 1.19MHz
238 *
239 * - each IO is going to take about 1us on real hardware,
 240	 *    but we allow it to be much faster (by a factor of 10) or
 241	 *    _slightly_ slower (i.e. we allow up to a 2us read+counter
 242	 *    update - anything else implies an unacceptably slow CPU
 243	 *    or PIT for the fast calibration to work).
244 *
245 * - with 256 PIT ticks to read the value, we have 214us to
246 * see the same MSB (and overhead like doing a single TSC
247 * read per MSB value etc).
248 *
249 * - We're doing 2 reads per loop (LSB, MSB), and we expect
250 * them each to take about a microsecond on real hardware.
251 * So we expect a count value of around 100. But we'll be
252 * generous, and accept anything over 50.
253 *
254 * - if the PIT is stuck, and we see *many* more reads, we
255 * return early (and the next caller of pit_expect_msb()
 256	 *    will then consider it a failure when it doesn't see the
257 * next expected value).
258 *
259 * These expectations mean that we know that we have seen the
260 * transition from one expected value to another with a fairly
261 * high accuracy, and we didn't miss any events. We can thus
262 * use the TSC value at the transitions to calculate a pretty
 263	 * good value for the TSC frequency.
264 */
265static inline int pit_expect_msb(unsigned char val)
266{
267 int count = 0;
149 268
269 for (count = 0; count < 50000; count++) {
270 /* Ignore LSB */
271 inb(0x42);
272 if (inb(0x42) != val)
273 break;
274 }
275 return count > 50;
276}
277
278/*
279 * How many MSB values do we want to see? We aim for a
280 * 15ms calibration, which assuming a 2us counter read
281 * error should give us roughly 150 ppm precision for
282 * the calibration.
283 */
284#define QUICK_PIT_MS 15
285#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
286
287static unsigned long quick_pit_calibrate(void)
288{
289 /* Set the Gate high, disable speaker */
290 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
291
292 /*
293 * Counter 2, mode 0 (one-shot), binary count
294 *
295 * NOTE! Mode 2 decrements by two (and then the
296 * output is flipped each time, giving the same
297 * final output frequency as a decrement-by-one),
298 * so mode 0 is much better when looking at the
299 * individual counts.
300 */
301 outb(0xb0, 0x43);
302
303 /* Start at 0xffff */
304 outb(0xff, 0x42);
305 outb(0xff, 0x42);
306
307 if (pit_expect_msb(0xff)) {
308 int i;
309 u64 t1, t2, delta;
310 unsigned char expect = 0xfe;
311
312 t1 = get_cycles();
313 for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
314 if (!pit_expect_msb(expect))
315 goto failed;
316 }
317 t2 = get_cycles();
318
319 /*
320 * Make sure we can rely on the second TSC timestamp:
321 */
322 if (!pit_expect_msb(expect))
323 goto failed;
324
325 /*
326 * Ok, if we get here, then we've seen the
327 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
328 * times, and each MSB had many hits, so we never
329 * had any sudden jumps.
330 *
331 * As a result, we can depend on there not being
332 * any odd delays anywhere, and the TSC reads are
333 * reliable.
334 *
335 * kHz = ticks / time-in-seconds / 1000;
336 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
337 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
338 */
339 delta = (t2 - t1)*PIT_TICK_RATE;
340 do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
341 printk("Fast TSC calibration using PIT\n");
342 return delta;
343 }
344failed:
345 return 0;
346}
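Plugging the constants into the formula in the comment shows the scale of the numbers involved; the example below assumes a 2 GHz TSC and is not a measurement:

/* QUICK_PIT_ITERATIONS = 15 * 1193182 / 1000 / 256 = 69, so the timed
 * window is 69 * 256 = 17664 PIT ticks ~= 14.8 ms.  A 2 GHz TSC advances
 * by roughly 29,610,000 cycles in that window, giving:
 *
 *   delta = 29610000 * 1193182                   ~= 3.53e13
 *   delta / (69 * 256 * 1000) = delta / 17664000 ~= 2,000,000 kHz
 */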
347
348/**
349 * native_calibrate_tsc - calibrate the tsc on boot
350 */
351unsigned long native_calibrate_tsc(void)
352{
353 u64 tsc1, tsc2, delta, ref1, ref2;
354 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
355 unsigned long flags, latch, ms, fast_calibrate;
356 int hpet = is_hpet_enabled(), i, loopmin;
357
358 local_irq_save(flags);
359 fast_calibrate = quick_pit_calibrate();
150 local_irq_restore(flags); 360 local_irq_restore(flags);
361 if (fast_calibrate)
362 return fast_calibrate;
151 363
152 /* 364 /*
153 * Preset the result with the raw and inaccurate PIT 365 * Run 5 calibration loops to get the lowest frequency value
154 * calibration value 366 * (the best estimate). We use two different calibration modes
367 * here:
368 *
369 * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
370 * load a timeout of 50ms. We read the time right after we
371 * started the timer and wait until the PIT count down reaches
372 * zero. In each wait loop iteration we read the TSC and check
373 * the delta to the previous read. We keep track of the min
374 * and max values of that delta. The delta is mostly defined
 375	 * by the IO time of the PIT access, so we can detect when an
 376	 * SMI/SMM disturbance happened between the two reads. If the
377 * maximum time is significantly larger than the minimum time,
378 * then we discard the result and have another try.
379 *
380 * 2) Reference counter. If available we use the HPET or the
381 * PMTIMER as a reference to check the sanity of that value.
382 * We use separate TSC readouts and check inside of the
 383	 * reference read for an SMI/SMM disturbance. We discard
384 * disturbed values here as well. We do that around the PIT
385 * calibration delay loop as we have to wait for a certain
386 * amount of time anyway.
155 */ 387 */
156 delta = (tr2 - tr1); 388
157 do_div(delta, 50); 389 /* Preset PIT loop values */
158 tsc_khz_val = delta; 390 latch = CAL_LATCH;
159 391 ms = CAL_MS;
160 /* hpet or pmtimer available ? */ 392 loopmin = CAL_PIT_LOOPS;
161 if (!hpet && !pm1 && !pm2) { 393
162 printk(KERN_INFO "TSC calibrated against PIT\n"); 394 for (i = 0; i < 3; i++) {
163 goto out; 395 unsigned long tsc_pit_khz;
396
397 /*
398 * Read the start value and the reference count of
399 * hpet/pmtimer when available. Then do the PIT
400 * calibration, which will take at least 50ms, and
401 * read the end value.
402 */
403 local_irq_save(flags);
404 tsc1 = tsc_read_refs(&ref1, hpet);
405 tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
406 tsc2 = tsc_read_refs(&ref2, hpet);
407 local_irq_restore(flags);
408
409 /* Pick the lowest PIT TSC calibration so far */
410 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
411
412 /* hpet or pmtimer available ? */
413 if (!hpet && !ref1 && !ref2)
414 continue;
415
 416		/* Check whether the sampling was disturbed by an SMI */
417 if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
418 continue;
419
420 tsc2 = (tsc2 - tsc1) * 1000000LL;
421 if (hpet)
422 tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
423 else
424 tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
425
426 tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
427
428 /* Check the reference deviation */
429 delta = ((u64) tsc_pit_min) * 100;
430 do_div(delta, tsc_ref_min);
431
432 /*
433 * If both calibration results are inside a 10% window
 434	 * then we can be sure that the calibration
435 * succeeded. We break out of the loop right away. We
436 * use the reference value, as it is more precise.
437 */
438 if (delta >= 90 && delta <= 110) {
439 printk(KERN_INFO
440 "TSC: PIT calibration matches %s. %d loops\n",
441 hpet ? "HPET" : "PMTIMER", i + 1);
442 return tsc_ref_min;
443 }
444
445 /*
446 * Check whether PIT failed more than once. This
447 * happens in virtualized environments. We need to
448 * give the virtual PC a slightly longer timeframe for
449 * the HPET/PMTIMER to make the result precise.
450 */
451 if (i == 1 && tsc_pit_min == ULONG_MAX) {
452 latch = CAL2_LATCH;
453 ms = CAL2_MS;
454 loopmin = CAL2_PIT_LOOPS;
455 }
164 } 456 }
165 457
166 /* Check, whether the sampling was disturbed by an SMI */ 458 /*
167 if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) { 459 * Now check the results.
168 printk(KERN_WARNING "TSC calibration disturbed by SMI, " 460 */
169 "using PIT calibration result\n"); 461 if (tsc_pit_min == ULONG_MAX) {
170 goto out; 462 /* PIT gave no useful value */
463 printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
464
465 /* We don't have an alternative source, disable TSC */
466 if (!hpet && !ref1 && !ref2) {
467 printk("TSC: No reference (HPET/PMTIMER) available\n");
468 return 0;
469 }
470
471 /* The alternative source failed as well, disable TSC */
472 if (tsc_ref_min == ULONG_MAX) {
473 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
474 "failed.\n");
475 return 0;
476 }
477
478 /* Use the alternative source */
479 printk(KERN_INFO "TSC: using %s reference calibration\n",
480 hpet ? "HPET" : "PMTIMER");
481
482 return tsc_ref_min;
171 } 483 }
172 484
173 tsc2 = (tsc2 - tsc1) * 1000000LL; 485 /* We don't have an alternative source, use the PIT calibration value */
174 486 if (!hpet && !ref1 && !ref2) {
175 if (hpet) { 487 printk(KERN_INFO "TSC: Using PIT calibration value\n");
176 printk(KERN_INFO "TSC calibrated against HPET\n"); 488 return tsc_pit_min;
177 if (hpet2 < hpet1)
178 hpet2 += 0x100000000ULL;
179 hpet2 -= hpet1;
180 tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
181 do_div(tsc1, 1000000);
182 } else {
183 printk(KERN_INFO "TSC calibrated against PM_TIMER\n");
184 if (pm2 < pm1)
185 pm2 += (u64)ACPI_PM_OVRRUN;
186 pm2 -= pm1;
187 tsc1 = pm2 * 1000000000LL;
188 do_div(tsc1, PMTMR_TICKS_PER_SEC);
189 } 489 }
190 490
191 do_div(tsc2, tsc1); 491 /* The alternative source failed, use the PIT calibration value */
192 tsc_khz_val = tsc2; 492 if (tsc_ref_min == ULONG_MAX) {
493 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
494 "Using PIT calibration\n");
495 return tsc_pit_min;
496 }
193 497
194out: 498 /*
 195	return tsc_khz_val; 499	 * The calibration values differ too much. When in doubt, we use
500 * the PIT value as we know that there are PMTIMERs around
501 * running at double speed. At least we let the user know:
502 */
503 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
504 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
505 printk(KERN_INFO "TSC: Using PIT calibration value\n");
506 return tsc_pit_min;
196} 507}
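The acceptance test inside the loop reduces to a simple ratio check between the two independent estimates; a condensed sketch with illustrative names:

/* Condensed form of the 10% agreement test used above; inputs in kHz,
 * names invented for the example. */
static int calibrations_agree(unsigned long pit_khz, unsigned long ref_khz)
{
	unsigned long long ratio = (unsigned long long)pit_khz * 100 / ref_khz;

	return ratio >= 90 && ratio <= 110;	/* within +/-10% of the reference */
}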
197 508
198
199#ifdef CONFIG_X86_32 509#ifdef CONFIG_X86_32
200/* Only called from the Powernow K7 cpu freq driver */ 510/* Only called from the Powernow K7 cpu freq driver */
201int recalibrate_cpu_khz(void) 511int recalibrate_cpu_khz(void)
@@ -314,7 +624,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
314 mark_tsc_unstable("cpufreq changes"); 624 mark_tsc_unstable("cpufreq changes");
315 } 625 }
316 626
317 set_cyc2ns_scale(tsc_khz_ref, freq->cpu); 627 set_cyc2ns_scale(tsc_khz, freq->cpu);
318 628
319 return 0; 629 return 0;
320} 630}
@@ -325,6 +635,10 @@ static struct notifier_block time_cpufreq_notifier_block = {
325 635
326static int __init cpufreq_tsc(void) 636static int __init cpufreq_tsc(void)
327{ 637{
638 if (!cpu_has_tsc)
639 return 0;
640 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
641 return 0;
328 cpufreq_register_notifier(&time_cpufreq_notifier_block, 642 cpufreq_register_notifier(&time_cpufreq_notifier_block,
329 CPUFREQ_TRANSITION_NOTIFIER); 643 CPUFREQ_TRANSITION_NOTIFIER);
330 return 0; 644 return 0;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 0577825cf89b..9ffb01c31c40 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -88,11 +88,9 @@ static __cpuinit void check_tsc_warp(void)
88 __raw_spin_unlock(&sync_lock); 88 __raw_spin_unlock(&sync_lock);
89 } 89 }
90 } 90 }
91 if (!(now-start)) { 91 WARN(!(now-start),
92 printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", 92 "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
93 now-start, end-start); 93 now-start, end-start);
94 WARN_ON(1);
95 }
96} 94}
97 95
98/* 96/*
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
new file mode 100644
index 000000000000..aeef529917e4
--- /dev/null
+++ b/arch/x86/kernel/uv_irq.c
@@ -0,0 +1,79 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * SGI UV IRQ functions
7 *
8 * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
9 */
10
11#include <linux/module.h>
12#include <linux/irq.h>
13
14#include <asm/apic.h>
15#include <asm/uv/uv_irq.h>
16
17static void uv_noop(unsigned int irq)
18{
19}
20
21static unsigned int uv_noop_ret(unsigned int irq)
22{
23 return 0;
24}
25
26static void uv_ack_apic(unsigned int irq)
27{
28 ack_APIC_irq();
29}
30
31struct irq_chip uv_irq_chip = {
32 .name = "UV-CORE",
33 .startup = uv_noop_ret,
34 .shutdown = uv_noop,
35 .enable = uv_noop,
36 .disable = uv_noop,
37 .ack = uv_noop,
38 .mask = uv_noop,
39 .unmask = uv_noop,
40 .eoi = uv_ack_apic,
41 .end = uv_noop,
42};
43
44/*
45 * Set up a mapping of an available irq and vector, and enable the specified
46 * MMR that defines the MSI that is to be sent to the specified CPU when an
47 * interrupt is raised.
48 */
49int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
50 unsigned long mmr_offset)
51{
52 int irq;
53 int ret;
54
55 irq = create_irq();
56 if (irq <= 0)
57 return -EBUSY;
58
59 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset);
60 if (ret != irq)
61 destroy_irq(irq);
62
63 return ret;
64}
65EXPORT_SYMBOL_GPL(uv_setup_irq);
66
67/*
68 * Tear down a mapping of an irq and vector, and disable the specified MMR that
69 * defined the MSI that was to be sent to the specified CPU when an interrupt
70 * was raised.
71 *
72 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
73 */
74void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset)
75{
76 arch_disable_uv_irq(mmr_blade, mmr_offset);
77 destroy_irq(irq);
78}
79EXPORT_SYMBOL_GPL(uv_teardown_irq);
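A UV driver would typically pair the two exports, keeping the blade and MMR offset it passed to setup for the later teardown. A hedged sketch; the device name, CPU, blade and offset are all placeholders:

/* Hypothetical caller of uv_setup_irq()/uv_teardown_irq(); the string and
 * the numeric arguments are placeholders, and real code would request a
 * handler for the returned irq before using the device. */
static int example_uv_irq_bind(int cpu, int blade, unsigned long mmr_offset)
{
	int irq = uv_setup_irq("example-uv-dev", cpu, blade, mmr_offset);

	if (irq < 0)
		return irq;		/* -EBUSY or the arch_enable_uv_irq() error */

	/* ... request_irq(irq, ...), do the work ... */

	uv_teardown_irq(irq, blade, mmr_offset);
	return 0;
}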
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
new file mode 100644
index 000000000000..67f9b9dbf800
--- /dev/null
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -0,0 +1,72 @@
1/*
2 * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson
20 */
21
22#include <linux/sysdev.h>
23#include <asm/uv/bios.h>
24
25struct kobject *sgi_uv_kobj;
26
27static ssize_t partition_id_show(struct kobject *kobj,
28 struct kobj_attribute *attr, char *buf)
29{
30 return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id);
31}
32
33static ssize_t coherence_id_show(struct kobject *kobj,
34 struct kobj_attribute *attr, char *buf)
35{
36 return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id());
37}
38
39static struct kobj_attribute partition_id_attr =
40 __ATTR(partition_id, S_IRUGO, partition_id_show, NULL);
41
42static struct kobj_attribute coherence_id_attr =
43 __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL);
44
45
46static int __init sgi_uv_sysfs_init(void)
47{
48 unsigned long ret;
49
50 if (!sgi_uv_kobj)
51 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
52 if (!sgi_uv_kobj) {
53 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n");
54 return -EINVAL;
55 }
56
57 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
58 if (ret) {
59 printk(KERN_WARNING "sysfs_create_file partition_id failed \n");
60 return ret;
61 }
62
63 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
64 if (ret) {
65 printk(KERN_WARNING "sysfs_create_file coherence_id failed \n");
66 return ret;
67 }
68
69 return 0;
70}
71
72device_initcall(sgi_uv_sysfs_init);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index e94bdb6add1d..0c9667f0752a 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -25,45 +25,31 @@
25#include <asm/visws/cobalt.h> 25#include <asm/visws/cobalt.h>
26#include <asm/visws/piix4.h> 26#include <asm/visws/piix4.h>
27#include <asm/arch_hooks.h> 27#include <asm/arch_hooks.h>
28#include <asm/io_apic.h>
28#include <asm/fixmap.h> 29#include <asm/fixmap.h>
29#include <asm/reboot.h> 30#include <asm/reboot.h>
30#include <asm/setup.h> 31#include <asm/setup.h>
31#include <asm/e820.h> 32#include <asm/e820.h>
32#include <asm/smp.h>
33#include <asm/io.h> 33#include <asm/io.h>
34 34
35#include <mach_ipi.h> 35#include <mach_ipi.h>
36 36
37#include "mach_apic.h" 37#include "mach_apic.h"
38 38
39#include <linux/init.h>
40#include <linux/smp.h>
41
42#include <linux/kernel_stat.h> 39#include <linux/kernel_stat.h>
43#include <linux/interrupt.h>
44#include <linux/init.h>
45 40
46#include <asm/io.h>
47#include <asm/apic.h>
48#include <asm/i8259.h> 41#include <asm/i8259.h>
49#include <asm/irq_vectors.h> 42#include <asm/irq_vectors.h>
50#include <asm/visws/cobalt.h>
51#include <asm/visws/lithium.h> 43#include <asm/visws/lithium.h>
52#include <asm/visws/piix4.h>
53 44
54#include <linux/sched.h> 45#include <linux/sched.h>
55#include <linux/kernel.h> 46#include <linux/kernel.h>
56#include <linux/init.h>
57#include <linux/pci.h> 47#include <linux/pci.h>
58#include <linux/pci_ids.h> 48#include <linux/pci_ids.h>
59 49
60extern int no_broadcast; 50extern int no_broadcast;
61 51
62#include <asm/io.h>
63#include <asm/apic.h> 52#include <asm/apic.h>
64#include <asm/arch_hooks.h>
65#include <asm/visws/cobalt.h>
66#include <asm/visws/lithium.h>
67 53
68char visws_board_type = -1; 54char visws_board_type = -1;
69char visws_board_rev = -1; 55char visws_board_rev = -1;
@@ -73,7 +59,7 @@ int is_visws_box(void)
73 return visws_board_type >= 0; 59 return visws_board_type >= 0;
74} 60}
75 61
76static int __init visws_time_init_quirk(void) 62static int __init visws_time_init(void)
77{ 63{
78 printk(KERN_INFO "Starting Cobalt Timer system clock\n"); 64 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
79 65
@@ -93,7 +79,7 @@ static int __init visws_time_init_quirk(void)
93 return 0; 79 return 0;
94} 80}
95 81
96static int __init visws_pre_intr_init_quirk(void) 82static int __init visws_pre_intr_init(void)
97{ 83{
98 init_VISWS_APIC_irqs(); 84 init_VISWS_APIC_irqs();
99 85
@@ -114,7 +100,7 @@ EXPORT_SYMBOL(sgivwfb_mem_size);
114 100
115long long mem_size __initdata = 0; 101long long mem_size __initdata = 0;
116 102
117static char * __init visws_memory_setup_quirk(void) 103static char * __init visws_memory_setup(void)
118{ 104{
119 long long gfx_mem_size = 8 * MB; 105 long long gfx_mem_size = 8 * MB;
120 106
@@ -176,7 +162,7 @@ static void visws_machine_power_off(void)
176 outl(PIIX_SPECIAL_STOP, 0xCFC); 162 outl(PIIX_SPECIAL_STOP, 0xCFC);
177} 163}
178 164
179static int __init visws_get_smp_config_quirk(unsigned int early) 165static int __init visws_get_smp_config(unsigned int early)
180{ 166{
181 /* 167 /*
182 * Prevent MP-table parsing by the generic code: 168 * Prevent MP-table parsing by the generic code:
@@ -184,15 +170,13 @@ static int __init visws_get_smp_config_quirk(unsigned int early)
184 return 1; 170 return 1;
185} 171}
186 172
187extern unsigned int __cpuinitdata maxcpus;
188
189/* 173/*
190 * The Visual Workstation is Intel MP compliant in the hardware 174 * The Visual Workstation is Intel MP compliant in the hardware
191 * sense, but it doesn't have a BIOS(-configuration table). 175 * sense, but it doesn't have a BIOS(-configuration table).
192 * No problem for Linux. 176 * No problem for Linux.
193 */ 177 */
194 178
195static void __init MP_processor_info (struct mpc_config_processor *m) 179static void __init MP_processor_info(struct mpc_config_processor *m)
196{ 180{
197 int ver, logical_apicid; 181 int ver, logical_apicid;
198 physid_mask_t apic_cpus; 182 physid_mask_t apic_cpus;
@@ -232,7 +216,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
232 apic_version[m->mpc_apicid] = ver; 216 apic_version[m->mpc_apicid] = ver;
233} 217}
234 218
235int __init visws_find_smp_config_quirk(unsigned int reserve) 219static int __init visws_find_smp_config(unsigned int reserve)
236{ 220{
237 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); 221 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
238 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 222 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -244,8 +228,8 @@ int __init visws_find_smp_config_quirk(unsigned int reserve)
244 ncpus = CO_CPU_MAX; 228 ncpus = CO_CPU_MAX;
245 } 229 }
246 230
247 if (ncpus > maxcpus) 231 if (ncpus > setup_max_cpus)
248 ncpus = maxcpus; 232 ncpus = setup_max_cpus;
249 233
250#ifdef CONFIG_X86_LOCAL_APIC 234#ifdef CONFIG_X86_LOCAL_APIC
251 smp_found_config = 1; 235 smp_found_config = 1;
@@ -258,7 +242,17 @@ int __init visws_find_smp_config_quirk(unsigned int reserve)
258 return 1; 242 return 1;
259} 243}
260 244
261extern int visws_trap_init_quirk(void); 245static int visws_trap_init(void);
246
247static struct x86_quirks visws_x86_quirks __initdata = {
248 .arch_time_init = visws_time_init,
249 .arch_pre_intr_init = visws_pre_intr_init,
250 .arch_memory_setup = visws_memory_setup,
251 .arch_intr_init = NULL,
252 .arch_trap_init = visws_trap_init,
253 .mach_get_smp_config = visws_get_smp_config,
254 .mach_find_smp_config = visws_find_smp_config,
255};
262 256
263void __init visws_early_detect(void) 257void __init visws_early_detect(void)
264{ 258{
@@ -272,16 +266,10 @@ void __init visws_early_detect(void)
272 266
273 /* 267 /*
274 * Install special quirks for timer, interrupt and memory setup: 268 * Install special quirks for timer, interrupt and memory setup:
275 */
276 arch_time_init_quirk = visws_time_init_quirk;
277 arch_pre_intr_init_quirk = visws_pre_intr_init_quirk;
278 arch_memory_setup_quirk = visws_memory_setup_quirk;
279
280 /*
281 * Fall back to generic behavior for traps: 269 * Fall back to generic behavior for traps:
270 * Override generic MP-table parsing:
282 */ 271 */
283 arch_intr_init_quirk = NULL; 272 x86_quirks = &visws_x86_quirks;
284 arch_trap_init_quirk = visws_trap_init_quirk;
285 273
286 /* 274 /*
287 * Install reboot quirks: 275 * Install reboot quirks:
@@ -294,12 +282,6 @@ void __init visws_early_detect(void)
294 */ 282 */
295 no_broadcast = 0; 283 no_broadcast = 0;
296 284
297 /*
298 * Override generic MP-table parsing:
299 */
300 mach_get_smp_config_quirk = visws_get_smp_config_quirk;
301 mach_find_smp_config_quirk = visws_find_smp_config_quirk;
302
303#ifdef CONFIG_X86_IO_APIC 285#ifdef CONFIG_X86_IO_APIC
304 /* 286 /*
305 * Turn off IO-APIC detection and initialization: 287 * Turn off IO-APIC detection and initialization:
@@ -426,7 +408,7 @@ static __init void cobalt_init(void)
426 co_apic_read(CO_APIC_ID)); 408 co_apic_read(CO_APIC_ID));
427} 409}
428 410
429int __init visws_trap_init_quirk(void) 411static int __init visws_trap_init(void)
430{ 412{
431 lithium_init(); 413 lithium_init();
432 cobalt_init(); 414 cobalt_init();
@@ -502,10 +484,11 @@ static void disable_cobalt_irq(unsigned int irq)
502static unsigned int startup_cobalt_irq(unsigned int irq) 484static unsigned int startup_cobalt_irq(unsigned int irq)
503{ 485{
504 unsigned long flags; 486 unsigned long flags;
487 struct irq_desc *desc = irq_to_desc(irq);
505 488
506 spin_lock_irqsave(&cobalt_lock, flags); 489 spin_lock_irqsave(&cobalt_lock, flags);
507 if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) 490 if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
508 irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); 491 desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
509 enable_cobalt_irq(irq); 492 enable_cobalt_irq(irq);
510 spin_unlock_irqrestore(&cobalt_lock, flags); 493 spin_unlock_irqrestore(&cobalt_lock, flags);
511 return 0; 494 return 0;
@@ -524,9 +507,10 @@ static void ack_cobalt_irq(unsigned int irq)
524static void end_cobalt_irq(unsigned int irq) 507static void end_cobalt_irq(unsigned int irq)
525{ 508{
526 unsigned long flags; 509 unsigned long flags;
510 struct irq_desc *desc = irq_to_desc(irq);
527 511
528 spin_lock_irqsave(&cobalt_lock, flags); 512 spin_lock_irqsave(&cobalt_lock, flags);
529 if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS))) 513 if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
530 enable_cobalt_irq(irq); 514 enable_cobalt_irq(irq);
531 spin_unlock_irqrestore(&cobalt_lock, flags); 515 spin_unlock_irqrestore(&cobalt_lock, flags);
532} 516}
@@ -644,12 +628,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
644 628
645 spin_unlock_irqrestore(&i8259A_lock, flags); 629 spin_unlock_irqrestore(&i8259A_lock, flags);
646 630
647 desc = irq_desc + realirq; 631 desc = irq_to_desc(realirq);
648 632
649 /* 633 /*
650 * handle this 'virtual interrupt' as a Cobalt one now. 634 * handle this 'virtual interrupt' as a Cobalt one now.
651 */ 635 */
652 kstat_cpu(smp_processor_id()).irqs[realirq]++; 636 kstat_incr_irqs_this_cpu(realirq, desc);
653 637
654 if (likely(desc->action != NULL)) 638 if (likely(desc->action != NULL))
655 handle_IRQ_event(realirq, desc->action); 639 handle_IRQ_event(realirq, desc->action);
@@ -680,27 +664,29 @@ void init_VISWS_APIC_irqs(void)
680 int i; 664 int i;
681 665
682 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { 666 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
683 irq_desc[i].status = IRQ_DISABLED; 667 struct irq_desc *desc = irq_to_desc(i);
684 irq_desc[i].action = 0; 668
685 irq_desc[i].depth = 1; 669 desc->status = IRQ_DISABLED;
670 desc->action = 0;
671 desc->depth = 1;
686 672
687 if (i == 0) { 673 if (i == 0) {
688 irq_desc[i].chip = &cobalt_irq_type; 674 desc->chip = &cobalt_irq_type;
689 } 675 }
690 else if (i == CO_IRQ_IDE0) { 676 else if (i == CO_IRQ_IDE0) {
691 irq_desc[i].chip = &cobalt_irq_type; 677 desc->chip = &cobalt_irq_type;
692 } 678 }
693 else if (i == CO_IRQ_IDE1) { 679 else if (i == CO_IRQ_IDE1) {
694 irq_desc[i].chip = &cobalt_irq_type; 680 desc->chip = &cobalt_irq_type;
695 } 681 }
696 else if (i == CO_IRQ_8259) { 682 else if (i == CO_IRQ_8259) {
697 irq_desc[i].chip = &piix4_master_irq_type; 683 desc->chip = &piix4_master_irq_type;
698 } 684 }
699 else if (i < CO_IRQ_APIC0) { 685 else if (i < CO_IRQ_APIC0) {
700 irq_desc[i].chip = &piix4_virtual_irq_type; 686 desc->chip = &piix4_virtual_irq_type;
701 } 687 }
702 else if (IS_CO_APIC(i)) { 688 else if (IS_CO_APIC(i)) {
703 irq_desc[i].chip = &cobalt_irq_type; 689 desc->chip = &cobalt_irq_type;
704 } 690 }
705 } 691 }
706 692
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 38f566fa27d2..4eeb5cf9720d 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -46,6 +46,7 @@
46#include <asm/io.h> 46#include <asm/io.h>
47#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/irq.h> 48#include <asm/irq.h>
49#include <asm/syscalls.h>
49 50
50/* 51/*
51 * Known problems: 52 * Known problems:
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index b15346092b7b..8b6c393ab9fd 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -37,6 +37,7 @@
37#include <asm/timer.h> 37#include <asm/timer.h>
38#include <asm/vmi_time.h> 38#include <asm/vmi_time.h>
39#include <asm/kmap_types.h> 39#include <asm/kmap_types.h>
40#include <asm/setup.h>
40 41
41/* Convenient for calling VMI functions indirectly in the ROM */ 42/* Convenient for calling VMI functions indirectly in the ROM */
42typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); 43typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
@@ -234,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
234 const void *desc) 235 const void *desc)
235{ 236{
236 u32 *ldt_entry = (u32 *)desc; 237 u32 *ldt_entry = (u32 *)desc;
237 vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); 238 vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
238} 239}
239 240
240static void vmi_load_sp0(struct tss_struct *tss, 241static void vmi_load_sp0(struct tss_struct *tss,
@@ -392,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
392} 393}
393#endif 394#endif
394 395
395static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) 396static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
396{ 397{
397 vmi_set_page_type(pfn, VMI_PAGE_L1); 398 vmi_set_page_type(pfn, VMI_PAGE_L1);
398 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
399} 400}
400 401
401static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) 402static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
402{ 403{
403 /* 404 /*
404 * This call comes in very early, before mem_map is setup. 405 * This call comes in very early, before mem_map is setup.
@@ -409,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
409 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); 410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
410} 411}
411 412
412static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) 413static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
413{ 414{
414 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); 415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
415 vmi_check_page_type(clonepfn, VMI_PAGE_L2); 416 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
416 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); 417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
417} 418}
418 419
419static void vmi_release_pte(u32 pfn) 420static void vmi_release_pte(unsigned long pfn)
420{ 421{
421 vmi_ops.release_page(pfn, VMI_PAGE_L1); 422 vmi_ops.release_page(pfn, VMI_PAGE_L1);
422 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
423} 424}
424 425
425static void vmi_release_pmd(u32 pfn) 426static void vmi_release_pmd(unsigned long pfn)
426{ 427{
427 vmi_ops.release_page(pfn, VMI_PAGE_L2); 428 vmi_ops.release_page(pfn, VMI_PAGE_L2);
428 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
@@ -683,7 +684,7 @@ void vmi_bringup(void)
683{ 684{
684 /* We must establish the lowmem mapping for MMU ops to work */ 685 /* We must establish the lowmem mapping for MMU ops to work */
685 if (vmi_ops.set_linear_mapping) 686 if (vmi_ops.set_linear_mapping)
686 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0); 687 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
687} 688}
688 689
689/* 690/*
@@ -904,9 +905,8 @@ static inline int __init activate_vmi(void)
904#endif 905#endif
905 906
906#ifdef CONFIG_X86_LOCAL_APIC 907#ifdef CONFIG_X86_LOCAL_APIC
907 para_fill(pv_apic_ops.apic_read, APICRead); 908 para_fill(apic_ops->read, APICRead);
908 para_fill(pv_apic_ops.apic_write, APICWrite); 909 para_fill(apic_ops->write, APICWrite);
909 para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
910#endif 910#endif
911 911
912 /* 912 /*
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 6953859fe289..254ee07f8635 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -235,11 +235,14 @@ static void __devinit vmi_time_init_clockevent(void)
235 235
236void __init vmi_time_init(void) 236void __init vmi_time_init(void)
237{ 237{
238 unsigned int cpu;
 238	/* Disable PIT: BIOSes start PIT CH0 with 18.2Hz periodic. */ 239	/* Disable PIT: BIOSes start PIT CH0 with 18.2Hz periodic. */
239 outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ 240 outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
240 241
241 vmi_time_init_clockevent(); 242 vmi_time_init_clockevent();
242 setup_irq(0, &vmi_clock_action); 243 setup_irq(0, &vmi_clock_action);
244 for_each_possible_cpu(cpu)
245 per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
243} 246}
244 247
245#ifdef CONFIG_X86_LOCAL_APIC 248#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index cdb2363697d2..a9b8560adbc2 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -140,10 +140,10 @@ SECTIONS
140 *(.con_initcall.init) 140 *(.con_initcall.init)
141 __con_initcall_end = .; 141 __con_initcall_end = .;
142 } 142 }
143 .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { 143 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
144 __x86cpuvendor_start = .; 144 __x86_cpu_dev_start = .;
145 *(.x86cpuvendor.init) 145 *(.x86_cpu_dev.init)
146 __x86cpuvendor_end = .; 146 __x86_cpu_dev_end = .;
147 } 147 }
148 SECURITY_INIT 148 SECURITY_INIT
149 . = ALIGN(4); 149 . = ALIGN(4);
@@ -180,6 +180,7 @@ SECTIONS
180 . = ALIGN(PAGE_SIZE); 180 . = ALIGN(PAGE_SIZE);
181 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { 181 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
182 __per_cpu_start = .; 182 __per_cpu_start = .;
183 *(.data.percpu.page_aligned)
183 *(.data.percpu) 184 *(.data.percpu)
184 *(.data.percpu.shared_aligned) 185 *(.data.percpu.shared_aligned)
185 __per_cpu_end = .; 186 __per_cpu_end = .;
@@ -209,3 +210,11 @@ SECTIONS
209 210
210 DWARF_DEBUG 211 DWARF_DEBUG
211} 212}
213
214#ifdef CONFIG_KEXEC
215/* Link time checks */
216#include <asm/kexec.h>
217
218ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
219 "kexec control code size is too big")
220#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 63e5c1a22e88..46e05447405b 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -168,12 +168,11 @@ SECTIONS
168 *(.con_initcall.init) 168 *(.con_initcall.init)
169 } 169 }
170 __con_initcall_end = .; 170 __con_initcall_end = .;
171 . = ALIGN(16); 171 __x86_cpu_dev_start = .;
172 __x86cpuvendor_start = .; 172 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
173 .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { 173 *(.x86_cpu_dev.init)
174 *(.x86cpuvendor.init)
175 } 174 }
176 __x86cpuvendor_end = .; 175 __x86_cpu_dev_end = .;
177 SECURITY_INIT 176 SECURITY_INIT
178 177
179 . = ALIGN(8); 178 . = ALIGN(8);
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 0c029e8959c7..7766d36983fc 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -61,7 +61,7 @@ static void vsmp_irq_enable(void)
61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); 61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
62} 62}
63 63
64static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, 64static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
65 unsigned long addr, unsigned len) 65 unsigned long addr, unsigned len)
66{ 66{
67 switch (type) { 67 switch (type) {
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
new file mode 100644
index 000000000000..b13acb75e822
--- /dev/null
+++ b/arch/x86/kernel/xsave.c
@@ -0,0 +1,345 @@
1/*
2 * xsave/xrstor support.
3 *
4 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
5 */
6#include <linux/bootmem.h>
7#include <linux/compat.h>
8#include <asm/i387.h>
9#ifdef CONFIG_IA32_EMULATION
10#include <asm/sigcontext32.h>
11#endif
12#include <asm/xcr.h>
13
14/*
15 * Supported feature mask by the CPU and the kernel.
16 */
17u64 pcntxt_mask;
18
19struct _fpx_sw_bytes fx_sw_reserved;
20#ifdef CONFIG_IA32_EMULATION
21struct _fpx_sw_bytes fx_sw_reserved_ia32;
22#endif
23
24/*
25 * Check for the presence of extended state information in the
26 * user fpstate pointer in the sigcontext.
27 */
28int check_for_xstate(struct i387_fxsave_struct __user *buf,
29 void __user *fpstate,
30 struct _fpx_sw_bytes *fx_sw_user)
31{
32 int min_xstate_size = sizeof(struct i387_fxsave_struct) +
33 sizeof(struct xsave_hdr_struct);
34 unsigned int magic2;
35 int err;
36
37 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
38 sizeof(struct _fpx_sw_bytes));
39
40 if (err)
41 return err;
42
43 /*
44 * First Magic check failed.
45 */
46 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
47 return -1;
48
49 /*
50 * Check for error scenarios.
51 */
52 if (fx_sw_user->xstate_size < min_xstate_size ||
53 fx_sw_user->xstate_size > xstate_size ||
54 fx_sw_user->xstate_size > fx_sw_user->extended_size)
55 return -1;
56
57 err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
58 fx_sw_user->extended_size -
59 FP_XSTATE_MAGIC2_SIZE));
60 /*
 61	 * Check for the presence of the second magic word at the end of the memory
 62	 * layout. This detects the case where the user just copied the legacy
 63	 * fpstate layout without copying the extended state information
 64	 * in the memory layout.
65 */
66 if (err || magic2 != FP_XSTATE_MAGIC2)
67 return -1;
68
69 return 0;
70}
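The checks above assume a specific frame layout: the legacy fxsave image whose sw_reserved bytes hold the _fpx_sw_bytes descriptor, the xsave header plus extended state after it, and FP_XSTATE_MAGIC2 as the last word, extended_size bytes from the start. A schematic of that layout as the function expects it (symbolic, not to scale):

/*
 *   fpstate ->  +--------------------------------------------+
 *               | struct i387_fxsave_struct                  |
 *               |   sw_reserved: struct _fpx_sw_bytes        |
 *               |     magic1 = FP_XSTATE_MAGIC1              |
 *               |     extended_size, xstate_bv, xstate_size  |
 *               +--------------------------------------------+
 *               | struct xsave_hdr_struct + extended state   |
 *               +--------------------------------------------+
 *               | FP_XSTATE_MAGIC2   (the last 4 bytes, at   |
 *               |   fpstate + extended_size                  |
 *               |           - FP_XSTATE_MAGIC2_SIZE)         |
 *               +--------------------------------------------+
 */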
71
72#ifdef CONFIG_X86_64
73/*
74 * Signal frame handlers.
75 */
76
77int save_i387_xstate(void __user *buf)
78{
79 struct task_struct *tsk = current;
80 int err = 0;
81
82 if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size))
83 return -EACCES;
84
85 BUG_ON(sig_xstate_size < xstate_size);
86
87 if ((unsigned long)buf % 64)
88 printk("save_i387_xstate: bad fpstate %p\n", buf);
89
90 if (!used_math())
91 return 0;
92 clear_used_math(); /* trigger finit */
93 if (task_thread_info(tsk)->status & TS_USEDFPU) {
94 /*
95 * Start with clearing the user buffer. This will present a
96 * clean context for the bytes not touched by the fxsave/xsave.
97 */
98 err = __clear_user(buf, sig_xstate_size);
99 if (err)
100 return err;
101
102 if (task_thread_info(tsk)->status & TS_XSAVE)
103 err = xsave_user(buf);
104 else
105 err = fxsave_user(buf);
106
107 if (err)
108 return err;
109 task_thread_info(tsk)->status &= ~TS_USEDFPU;
110 stts();
111 } else {
112 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
113 xstate_size))
114 return -1;
115 }
116
117 if (task_thread_info(tsk)->status & TS_XSAVE) {
118 struct _fpstate __user *fx = buf;
119 struct _xstate __user *x = buf;
120 u64 xstate_bv;
121
122 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved,
123 sizeof(struct _fpx_sw_bytes));
124
125 err |= __put_user(FP_XSTATE_MAGIC2,
126 (__u32 __user *) (buf + sig_xstate_size
127 - FP_XSTATE_MAGIC2_SIZE));
128
129 /*
130 * Read the xstate_bv which we copied (directly from the cpu or
131 * from the state in task struct) to the user buffers and
132 * set the FP/SSE bits.
133 */
134 err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv);
135
136 /*
 137		 * For legacy compatibility, we always set the FP/SSE bits in the bit
 138		 * vector while saving the state to the user context. This will
 139		 * enable us to capture any changes (during sigreturn) to
 140		 * the FP/SSE bits by legacy applications which don't touch
 141		 * xstate_bv in the xsave header.
142 *
143 * xsave aware apps can change the xstate_bv in the xsave
144 * header as well as change any contents in the memory layout.
145 * xrestore as part of sigreturn will capture all the changes.
146 */
147 xstate_bv |= XSTATE_FPSSE;
148
149 err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv);
150
151 if (err)
152 return err;
153 }
154
155 return 1;
156}
157
158/*
159 * Restore the extended state if present. Otherwise, restore the FP/SSE
160 * state.
161 */
162int restore_user_xstate(void __user *buf)
163{
164 struct _fpx_sw_bytes fx_sw_user;
165 u64 mask;
166 int err;
167
168 if (((unsigned long)buf % 64) ||
169 check_for_xstate(buf, buf, &fx_sw_user))
170 goto fx_only;
171
172 mask = fx_sw_user.xstate_bv;
173
174 /*
175 * restore the state passed by the user.
176 */
177 err = xrestore_user(buf, mask);
178 if (err)
179 return err;
180
181 /*
182 * init the state skipped by the user.
183 */
184 mask = pcntxt_mask & ~mask;
185
186 xrstor_state(init_xstate_buf, mask);
187
188 return 0;
189
190fx_only:
191 /*
192 * couldn't find the extended state information in the
193 * memory layout. Restore just the FP/SSE and init all
194 * the other extended state.
195 */
196 xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE);
197 return fxrstor_checking((__force struct i387_fxsave_struct *)buf);
198}
199
200/*
201 * This restores directly out of user space. Exceptions are handled.
202 */
203int restore_i387_xstate(void __user *buf)
204{
205 struct task_struct *tsk = current;
206 int err = 0;
207
208 if (!buf) {
209 if (used_math())
210 goto clear;
211 return 0;
212 } else
213 if (!access_ok(VERIFY_READ, buf, sig_xstate_size))
214 return -EACCES;
215
216 if (!used_math()) {
217 err = init_fpu(tsk);
218 if (err)
219 return err;
220 }
221
222 if (!(task_thread_info(current)->status & TS_USEDFPU)) {
223 clts();
224 task_thread_info(current)->status |= TS_USEDFPU;
225 }
226 if (task_thread_info(tsk)->status & TS_XSAVE)
227 err = restore_user_xstate(buf);
228 else
229 err = fxrstor_checking((__force struct i387_fxsave_struct *)
230 buf);
231 if (unlikely(err)) {
232 /*
233 * Encountered an error while doing the restore from the
234 * user buffer, clear the fpu state.
235 */
236clear:
237 clear_fpu(tsk);
238 clear_used_math();
239 }
240 return err;
241}
242#endif
243
244/*
245 * Prepare the SW reserved portion of the fxsave memory layout, indicating
246 * the presence of the extended state information in the memory layout
 247 * pointed to by the fpstate pointer in the sigcontext.
 248 * This will be saved whenever the FP and extended state context is
249 * saved on the user stack during the signal handler delivery to the user.
250 */
251static void prepare_fx_sw_frame(void)
252{
253 int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) +
254 FP_XSTATE_MAGIC2_SIZE;
255
256 sig_xstate_size = sizeof(struct _fpstate) + size_extended;
257
258#ifdef CONFIG_IA32_EMULATION
259 sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended;
260#endif
261
262 memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved));
263
264 fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
265 fx_sw_reserved.extended_size = sig_xstate_size;
266 fx_sw_reserved.xstate_bv = pcntxt_mask;
267 fx_sw_reserved.xstate_size = xstate_size;
268#ifdef CONFIG_IA32_EMULATION
269 memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved,
270 sizeof(struct _fpx_sw_bytes));
271 fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size;
272#endif
273}
274
275/*
276 * Represents init state for the supported extended state.
277 */
278struct xsave_struct *init_xstate_buf;
279
280#ifdef CONFIG_X86_64
281unsigned int sig_xstate_size = sizeof(struct _fpstate);
282#endif
283
284/*
285 * Enable the extended processor state save/restore feature
286 */
287void __cpuinit xsave_init(void)
288{
289 if (!cpu_has_xsave)
290 return;
291
292 set_in_cr4(X86_CR4_OSXSAVE);
293
294 /*
295 * Enable all the features that the HW is capable of
296 * and the Linux kernel is aware of.
297 */
298 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
299}
300
301/*
302 * setup the xstate image representing the init state
303 */
304static void __init setup_xstate_init(void)
305{
306 init_xstate_buf = alloc_bootmem(xstate_size);
307 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
308}
309
310/*
311 * Enable and initialize the xsave feature.
312 */
313void __init xsave_cntxt_init(void)
314{
315 unsigned int eax, ebx, ecx, edx;
316
317 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
318 pcntxt_mask = eax + ((u64)edx << 32);
319
320 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
321 printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n",
322 pcntxt_mask);
323 BUG();
324 }
325
326 /*
327 * for now OS knows only about FP/SSE
328 */
329 pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
330 xsave_init();
331
332 /*
333 * Recompute the context size for enabled features
334 */
335 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
336 xstate_size = ebx;
337
338 prepare_fx_sw_frame();
339
340 setup_xstate_init();
341
342 printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, "
343 "cntxt size 0x%x\n",
344 pcntxt_mask, xstate_size);
345}
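The two cpuid_count(0xd, 0, ...) calls bracket xsave_init() because sub-leaf 0 reports in EBX the save-area size for whatever is currently enabled in XCR0; re-reading it after the xsetbv gives the final xstate_size. A sketch of that leaf's output under the FP/SSE-only mask set here (the sizes are the architecturally typical values, not read from hardware):

/* CPUID leaf 0xd, sub-leaf 0 (illustrative):
 *   EDX:EAX - feature bits supported in XCR0, e.g. 0x3 for FP | SSE
 *   EBX     - save-area size for the features currently enabled in XCR0
 *   ECX     - save-area size if every supported feature were enabled
 *
 * With pcntxt_mask clamped to FP/SSE, the second read above would
 * typically report EBX = 576: 512 bytes of legacy i387_fxsave_struct
 * plus the 64-byte xsave header.
 */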