author    Ingo Molnar <mingo@elte.hu>  2009-04-07 05:15:40 -0400
committer Ingo Molnar <mingo@elte.hu>  2009-04-07 05:15:40 -0400
commit    5e34437840d33554f69380584311743b39e8fbeb (patch)
tree      e081135619ee146af5efb9ee883afca950df5757 /arch/x86/kernel
parent    77d05632baee21b1cef8730d7c06aa69601e4dca (diff)
parent    d508afb437daee7cf07da085b635c44a4ebf9b38 (diff)
Merge branch 'linus' into core/softlockup
Conflicts: kernel/sysctl.c
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 47
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 234
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.S | 4
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 1
-rw-r--r--  arch/x86/kernel/acpi/wakeup_32.S | 2
-rw-r--r--  arch/x86/kernel/acpi/wakeup_64.S | 34
-rw-r--r--  arch/x86/kernel/alternative.c | 42
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 33
-rw-r--r--  arch/x86/kernel/apic/Makefile | 19
-rw-r--r--  arch/x86/kernel/apic/apic.c (renamed from arch/x86/kernel/apic.c) | 394
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c (renamed from arch/x86/kernel/genapic_flat_64.c) | 206
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 267
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 781
-rw-r--r--  arch/x86/kernel/apic/io_apic.c (renamed from arch/x86/kernel/io_apic.c) | 841
-rw-r--r--  arch/x86/kernel/apic/ipi.c | 164
-rw-r--r--  arch/x86/kernel/apic/nmi.c (renamed from arch/x86/kernel/nmi.c) | 23
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 558
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 285
-rw-r--r--  arch/x86/kernel/apic/probe_64.c | 100
-rw-r--r--  arch/x86/kernel/apic/summit_32.c | 576
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 245
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c (renamed from arch/x86/kernel/genx2apic_phys.c) | 156
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c (renamed from arch/x86/kernel/genx2apic_uv_x.c) | 158
-rw-r--r--  arch/x86/kernel/apm_32.c | 269
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 2
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 12
-rw-r--r--  arch/x86/kernel/check.c | 8
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 5
-rw-r--r--  arch/x86/kernel/cpu/addon_cpuid_features.c | 56
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 56
-rw-r--r--  arch/x86/kernel/cpu/centaur.c | 36
-rw-r--r--  arch/x86/kernel/cpu/centaur_64.c | 37
-rw-r--r--  arch/x86/kernel/cpu/common.c | 586
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 25
-rwxr-xr-x  arch/x86/kernel/cpu/cpu_debug.c | 901
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Kconfig | 19
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Makefile | 8
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 54
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c | 54
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/e_powersaver.c | 27
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/elanfreq.c | 6
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 105
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.c | 199
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.h | 12
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longrun.c | 25
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 75
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 44
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 239
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 393
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 5
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/sc520_freq.c | 30
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 6
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 72
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 163
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.h | 18
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 166
-rw-r--r--  arch/x86/kernel/cpu/cyrix.c | 16
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 57
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 77
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile | 1
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_32.c | 14
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_64.c | 547
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 85
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 218
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p4.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/threshold.c | 29
-rw-r--r--  arch/x86/kernel/cpu/mtrr/Makefile | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 1101
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 199
-rw-r--r--  arch/x86/kernel/cpu/mtrr/if.c | 10
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 1069
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.h | 4
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 2
-rw-r--r--  arch/x86/kernel/cpu/proc.c | 26
-rw-r--r--  arch/x86/kernel/cpu/transmeta.c | 2
-rw-r--r--  arch/x86/kernel/cpu/umc.c | 2
-rw-r--r--  arch/x86/kernel/crash.c | 4
-rw-r--r--  arch/x86/kernel/ds.c | 3
-rw-r--r--  arch/x86/kernel/dumpstack.c | 9
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 35
-rw-r--r--  arch/x86/kernel/e820.c | 145
-rw-r--r--  arch/x86/kernel/early_printk.c | 22
-rw-r--r--  arch/x86/kernel/efi.c | 9
-rw-r--r--  arch/x86/kernel/efi_64.c | 22
-rw-r--r--  arch/x86/kernel/efi_stub_32.S | 3
-rw-r--r--  arch/x86/kernel/efi_stub_64.S | 7
-rw-r--r--  arch/x86/kernel/entry_32.S | 473
-rw-r--r--  arch/x86/kernel/entry_64.S | 78
-rw-r--r--  arch/x86/kernel/es7000_32.c | 378
-rw-r--r--  arch/x86/kernel/ftrace.c | 265
-rw-r--r--  arch/x86/kernel/genapic_64.c | 82
-rw-r--r--  arch/x86/kernel/genx2apic_cluster.c | 198
-rw-r--r--  arch/x86/kernel/head32.c | 5
-rw-r--r--  arch/x86/kernel/head64.c | 25
-rw-r--r--  arch/x86/kernel/head_32.S | 120
-rw-r--r--  arch/x86/kernel/head_64.S | 23
-rw-r--r--  arch/x86/kernel/hpet.c | 82
-rw-r--r--  arch/x86/kernel/i387.c | 2
-rw-r--r--  arch/x86/kernel/i8253.c | 68
-rw-r--r--  arch/x86/kernel/i8259.c | 1
-rw-r--r--  arch/x86/kernel/io_delay.c | 27
-rw-r--r--  arch/x86/kernel/ioport.c | 14
-rw-r--r--  arch/x86/kernel/ipi.c | 190
-rw-r--r--  arch/x86/kernel/irq.c | 132
-rw-r--r--  arch/x86/kernel/irq_32.c | 61
-rw-r--r--  arch/x86/kernel/irq_64.c | 43
-rw-r--r--  arch/x86/kernel/irqinit_32.c | 39
-rw-r--r--  arch/x86/kernel/irqinit_64.c | 4
-rw-r--r--  arch/x86/kernel/kdebugfs.c | 82
-rw-r--r--  arch/x86/kernel/kgdb.c | 4
-rw-r--r--  arch/x86/kernel/kprobes.c | 20
-rw-r--r--  arch/x86/kernel/kvm.c | 7
-rw-r--r--  arch/x86/kernel/kvmclock.c | 1
-rw-r--r--  arch/x86/kernel/machine_kexec_32.c | 19
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 179
-rw-r--r--  arch/x86/kernel/mca_32.c | 5
-rw-r--r--  arch/x86/kernel/mfgpt_32.c | 1
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 43
-rw-r--r--  arch/x86/kernel/microcode_core.c | 160
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 91
-rw-r--r--  arch/x86/kernel/mmconf-fam10h_64.c | 2
-rw-r--r--  arch/x86/kernel/module_32.c | 6
-rw-r--r--  arch/x86/kernel/module_64.c | 32
-rw-r--r--  arch/x86/kernel/mpparse.c | 541
-rw-r--r--  arch/x86/kernel/msr.c | 2
-rw-r--r--  arch/x86/kernel/numaq_32.c | 293
-rw-r--r--  arch/x86/kernel/olpc.c | 2
-rw-r--r--  arch/x86/kernel/paravirt-spinlocks.c | 10
-rw-r--r--  arch/x86/kernel/paravirt.c | 83
-rw-r--r--  arch/x86/kernel/paravirt_patch_32.c | 12
-rw-r--r--  arch/x86/kernel/paravirt_patch_64.c | 15
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 38
-rw-r--r--  arch/x86/kernel/pci-dma.c | 17
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 34
-rw-r--r--  arch/x86/kernel/pci-nommu.c | 39
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c (renamed from arch/x86/kernel/pci-swiotlb_64.c) | 19
-rw-r--r--  arch/x86/kernel/probe_roms_32.c | 2
-rw-r--r--  arch/x86/kernel/process.c | 219
-rw-r--r--  arch/x86/kernel/process_32.c | 246
-rw-r--r--  arch/x86/kernel/process_64.c | 232
-rw-r--r--  arch/x86/kernel/ptrace.c | 51
-rw-r--r--  arch/x86/kernel/quirks.c | 6
-rw-r--r--  arch/x86/kernel/reboot.c | 13
-rw-r--r--  arch/x86/kernel/relocate_kernel_32.S | 26
-rw-r--r--  arch/x86/kernel/relocate_kernel_64.S | 312
-rw-r--r--  arch/x86/kernel/rtc.c | 20
-rw-r--r--  arch/x86/kernel/setup.c | 201
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 669
-rw-r--r--  arch/x86/kernel/signal.c | 493
-rw-r--r--  arch/x86/kernel/smp.c | 15
-rw-r--r--  arch/x86/kernel/smpboot.c | 285
-rw-r--r--  arch/x86/kernel/smpcommon.c | 30
-rw-r--r--  arch/x86/kernel/stacktrace.c | 2
-rw-r--r--  arch/x86/kernel/summit_32.c | 188
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 22
-rw-r--r--  arch/x86/kernel/time_32.c | 8
-rw-r--r--  arch/x86/kernel/time_64.c | 4
-rw-r--r--  arch/x86/kernel/tlb_32.c | 256
-rw-r--r--  arch/x86/kernel/tlb_64.c | 284
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 80
-rw-r--r--  arch/x86/kernel/topology.c | 14
-rw-r--r--  arch/x86/kernel/trampoline_32.S | 2
-rw-r--r--  arch/x86/kernel/trampoline_64.S | 23
-rw-r--r--  arch/x86/kernel/traps.c | 74
-rw-r--r--  arch/x86/kernel/tsc.c | 124
-rw-r--r--  arch/x86/kernel/uv_time.c | 393
-rw-r--r--  arch/x86/kernel/visws_quirks.c | 12
-rw-r--r--  arch/x86/kernel/vm86_32.c | 20
-rw-r--r--  arch/x86/kernel/vmi_32.c | 19
-rw-r--r--  arch/x86/kernel/vmiclock_32.c | 10
-rw-r--r--  arch/x86/kernel/vmlinux_32.lds.S | 32
-rw-r--r--  arch/x86/kernel/vmlinux_64.lds.S | 136
-rw-r--r--  arch/x86/kernel/vsmp_64.c | 14
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 2
174 files changed, 12610 insertions, 8603 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d364df03c1d6..145cce75cda7 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,11 +23,12 @@ nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o := $(nostackp)
 CFLAGS_tsc.o := $(nostackp)
+CFLAGS_paravirt.o := $(nostackp)
 
 obj-y := process_$(BITS).o signal.o entry_$(BITS).o
 obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
-obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
+obj-y += setup.o i8259.o irqinit_$(BITS).o
 obj-$(CONFIG_X86_VISWS) += visws_quirks.o
 obj-$(CONFIG_X86_32) += probe_roms_32.o
 obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
@@ -49,31 +50,27 @@ obj-y += step.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += cpu/
 obj-y += acpi/
-obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
+obj-y += reboot.o
 obj-$(CONFIG_MCA) += mca_32.o
 obj-$(CONFIG_X86_MSR) += msr.o
 obj-$(CONFIG_X86_CPUID) += cpuid.o
 obj-$(CONFIG_PCI) += early-quirks.o
 apm-y := apm_32.o
 obj-$(CONFIG_APM) += apm.o
-obj-$(CONFIG_X86_SMP) += smp.o
-obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o tlb_$(BITS).o
-obj-$(CONFIG_X86_32_SMP) += smpcommon.o
-obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
+obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o
+obj-$(CONFIG_SMP) += setup_percpu.o
+obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
 obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
 obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
-obj-$(CONFIG_X86_IO_APIC) += io_apic.o
+obj-y += apic/
 obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
+obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
 obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
-obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
-obj-$(CONFIG_X86_ES7000) += es7000_32.o
-obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
-obj-y += vsmp_64.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_MODULES) += module_$(BITS).o
 obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
@@ -109,21 +106,19 @@ obj-$(CONFIG_MICROCODE) += microcode.o
 
 obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
 
-obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
+obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
 
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
-        obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
-        obj-y += bios_uv.o uv_irq.o uv_sysfs.o
-        obj-y += genx2apic_cluster.o
-        obj-y += genx2apic_phys.o
-        obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
-        obj-$(CONFIG_AUDIT) += audit_64.o
-
-        obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
-        obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
-        obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
-
-        obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
+        obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
+        obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
+        obj-$(CONFIG_AUDIT) += audit_64.o
+
+        obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
+        obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
+        obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
+
+        obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
+        obj-y += vsmp_64.o
 endif
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7678f10c4568..723989d7f802 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -37,15 +37,10 @@
 #include <asm/pgtable.h>
 #include <asm/io_apic.h>
 #include <asm/apic.h>
-#include <asm/genapic.h>
 #include <asm/io.h>
 #include <asm/mpspec.h>
 #include <asm/smp.h>
 
-#ifdef CONFIG_X86_LOCAL_APIC
-# include <mach_apic.h>
-#endif
-
 static int __initdata acpi_force = 0;
 u32 acpi_rsdt_forced;
 #ifdef CONFIG_ACPI
@@ -56,16 +51,7 @@ int acpi_disabled = 1;
 EXPORT_SYMBOL(acpi_disabled);
 
 #ifdef CONFIG_X86_64
-
-#include <asm/proto.h>
-
-#else /* X86 */
-
-#ifdef CONFIG_X86_LOCAL_APIC
-#include <mach_apic.h>
-#include <mach_mpparse.h>
-#endif /* CONFIG_X86_LOCAL_APIC */
-
+# include <asm/proto.h>
 #endif /* X86 */
 
 #define BAD_MADT_ENTRY(entry, end) ( \
@@ -121,35 +107,18 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
  */
 char *__init __acpi_map_table(unsigned long phys, unsigned long size)
 {
-	unsigned long base, offset, mapped_size;
-	int idx;
 
 	if (!phys || !size)
 		return NULL;
 
-	if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
-		return __va(phys);
-
-	offset = phys & (PAGE_SIZE - 1);
-	mapped_size = PAGE_SIZE - offset;
-	clear_fixmap(FIX_ACPI_END);
-	set_fixmap(FIX_ACPI_END, phys);
-	base = fix_to_virt(FIX_ACPI_END);
-
-	/*
-	 * Most cases can be covered by the below.
-	 */
-	idx = FIX_ACPI_END;
-	while (mapped_size < size) {
-		if (--idx < FIX_ACPI_BEGIN)
-			return NULL;	/* cannot handle this */
-		phys += PAGE_SIZE;
-		clear_fixmap(idx);
-		set_fixmap(idx, phys);
-		mapped_size += PAGE_SIZE;
-	}
+	return early_ioremap(phys, size);
+}
+void __init __acpi_unmap_table(char *map, unsigned long size)
+{
+	if (!map || !size)
+		return;
 
-	return ((unsigned char *)base + offset);
+	early_iounmap(map, size);
 }
 
 #ifdef CONFIG_PCI_MMCONFIG
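
The removed __acpi_map_table() hand-managed a window of fixmap slots; the replacement delegates to early_ioremap(), which performs the same offset/page-count arithmetic internally. A minimal user-space sketch of that arithmetic, for illustration only (PAGE_SIZE and pages_needed() here are stand-ins, not kernel API):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    /* How many page-sized slots are needed to cover [phys, phys + size)? */
    static unsigned long pages_needed(unsigned long phys, unsigned long size)
    {
        unsigned long offset = phys & (PAGE_SIZE - 1); /* start within a page */

        return (offset + size + PAGE_SIZE - 1) / PAGE_SIZE;
    }

    int main(void)
    {
        /* A 6000-byte table starting 2500 bytes into a page spans 3 pages. */
        printf("%lu\n", pages_needed(0x12345000UL + 2500, 6000)); /* prints 3 */
        return 0;
    }
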
@@ -239,7 +208,8 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
 			madt->address);
 	}
 
-	acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
+	default_acpi_madt_oem_check(madt->header.oem_id,
+				    madt->header.oem_table_id);
 
 	return 0;
 }
@@ -260,6 +230,35 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
 }
 
 static int __init
+acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
+{
+	struct acpi_madt_local_x2apic *processor = NULL;
+
+	processor = (struct acpi_madt_local_x2apic *)header;
+
+	if (BAD_MADT_ENTRY(processor, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+#ifdef CONFIG_X86_X2APIC
+	/*
+	 * We need to register disabled CPU as well to permit
+	 * counting disabled CPUs. This allows us to size
+	 * cpus_possible_map more accurately, to permit
+	 * to not preallocating memory for all NR_CPUS
+	 * when we use CPU hotplug.
+	 */
+	acpi_register_lapic(processor->local_apic_id,	/* APIC ID */
+			    processor->lapic_flags & ACPI_MADT_ENABLED);
+#else
+	printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+#endif
+
+	return 0;
+}
+
+static int __init
 acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
 {
 	struct acpi_madt_local_apic *processor = NULL;
@@ -319,6 +318,25 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
 }
 
 static int __init
+acpi_parse_x2apic_nmi(struct acpi_subtable_header *header,
+		      const unsigned long end)
+{
+	struct acpi_madt_local_x2apic_nmi *x2apic_nmi = NULL;
+
+	x2apic_nmi = (struct acpi_madt_local_x2apic_nmi *)header;
+
+	if (BAD_MADT_ENTRY(x2apic_nmi, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	if (x2apic_nmi->lint != 1)
+		printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+
+	return 0;
+}
+
+static int __init
 acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end)
 {
 	struct acpi_madt_local_apic_nmi *lapic_nmi = NULL;
@@ -823,6 +841,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
 static int __init acpi_parse_madt_lapic_entries(void)
 {
 	int count;
+	int x2count = 0;
 
 	if (!cpu_has_apic)
 		return -ENODEV;
@@ -846,22 +865,28 @@ static int __init acpi_parse_madt_lapic_entries(void)
 	count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
 				      acpi_parse_sapic, MAX_APICS);
 
-	if (!count)
+	if (!count) {
+		x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
+						acpi_parse_x2apic, MAX_APICS);
 		count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
 					      acpi_parse_lapic, MAX_APICS);
-	if (!count) {
+	}
+	if (!count && !x2count) {
 		printk(KERN_ERR PREFIX "No LAPIC entries present\n");
 		/* TBD: Cleanup to allow fallback to MPS */
 		return -ENODEV;
-	} else if (count < 0) {
+	} else if (count < 0 || x2count < 0) {
 		printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
 		/* TBD: Cleanup to allow fallback to MPS */
 		return count;
 	}
 
+	x2count =
+	    acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
+				  acpi_parse_x2apic_nmi, 0);
 	count =
 	    acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0);
-	if (count < 0) {
+	if (count < 0 || x2count < 0) {
 		printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
 		/* TBD: Cleanup to allow fallback to MPS */
 		return count;
@@ -884,7 +909,7 @@ static struct {
 	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
 } mp_ioapic_routing[MAX_IO_APICS];
 
-static int mp_find_ioapic(int gsi)
+int mp_find_ioapic(int gsi)
 {
 	int i = 0;
 
@@ -899,6 +924,16 @@ static int mp_find_ioapic(int gsi)
 	return -1;
 }
 
+int mp_find_ioapic_pin(int ioapic, int gsi)
+{
+	if (WARN_ON(ioapic == -1))
+		return -1;
+	if (WARN_ON(gsi > mp_ioapic_routing[ioapic].gsi_end))
+		return -1;
+
+	return gsi - mp_ioapic_routing[ioapic].gsi_base;
+}
+
 static u8 __init uniq_ioapic_id(u8 id)
 {
 #ifdef CONFIG_X86_32
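
mp_find_ioapic() and the new mp_find_ioapic_pin() just search a table of GSI ranges and subtract the base. A self-contained sketch of that lookup (the routing table below is hypothetical, chosen only to make the example run):

    #include <stdio.h>

    struct gsi_range { int gsi_base, gsi_end; };

    /* Two made-up IO-APICs covering GSIs 0-23 and 24-47. */
    static struct gsi_range routing[] = { { 0, 23 }, { 24, 47 } };
    #define NR_IOAPICS (int)(sizeof(routing) / sizeof(routing[0]))

    static int find_ioapic(int gsi)
    {
        for (int i = 0; i < NR_IOAPICS; i++)
            if (gsi >= routing[i].gsi_base && gsi <= routing[i].gsi_end)
                return i;
        return -1;
    }

    int main(void)
    {
        int gsi = 30, ioapic = find_ioapic(gsi);

        /* The pin is simply the GSI's offset from that IO-APIC's base. */
        printf("ioapic %d pin %d\n", ioapic, gsi - routing[ioapic].gsi_base);
        return 0;   /* prints: ioapic 1 pin 6 */
    }
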
@@ -912,8 +947,8 @@ static u8 __init uniq_ioapic_id(u8 id)
 	DECLARE_BITMAP(used, 256);
 	bitmap_zero(used, 256);
 	for (i = 0; i < nr_ioapics; i++) {
-		struct mp_config_ioapic *ia = &mp_ioapics[i];
-		__set_bit(ia->mp_apicid, used);
+		struct mpc_ioapic *ia = &mp_ioapics[i];
+		__set_bit(ia->apicid, used);
 	}
 	if (!test_bit(id, used))
 		return id;
@@ -945,29 +980,29 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 
 	idx = nr_ioapics;
 
-	mp_ioapics[idx].mp_type = MP_IOAPIC;
-	mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
-	mp_ioapics[idx].mp_apicaddr = address;
+	mp_ioapics[idx].type = MP_IOAPIC;
+	mp_ioapics[idx].flags = MPC_APIC_USABLE;
+	mp_ioapics[idx].apicaddr = address;
 
 	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-	mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
+	mp_ioapics[idx].apicid = uniq_ioapic_id(id);
 #ifdef CONFIG_X86_32
-	mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
+	mp_ioapics[idx].apicver = io_apic_get_version(idx);
 #else
-	mp_ioapics[idx].mp_apicver = 0;
+	mp_ioapics[idx].apicver = 0;
 #endif
 	/*
 	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
 	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
 	 */
-	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid;
+	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
 	mp_ioapic_routing[idx].gsi_base = gsi_base;
 	mp_ioapic_routing[idx].gsi_end = gsi_base +
 	    io_apic_get_redir_entries(idx);
 
-	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
-	       "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid,
-	       mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr,
+	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+	       "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
+	       mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
 	       mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
 
 	nr_ioapics++;
@@ -996,19 +1031,19 @@ int __init acpi_probe_gsi(void)
 	return max_gsi + 1;
 }
 
-static void assign_to_mp_irq(struct mp_config_intsrc *m,
-		    struct mp_config_intsrc *mp_irq)
+static void assign_to_mp_irq(struct mpc_intsrc *m,
+		    struct mpc_intsrc *mp_irq)
 {
-	memcpy(mp_irq, m, sizeof(struct mp_config_intsrc));
+	memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
 }
 
-static int mp_irq_cmp(struct mp_config_intsrc *mp_irq,
-		struct mp_config_intsrc *m)
+static int mp_irq_cmp(struct mpc_intsrc *mp_irq,
+		struct mpc_intsrc *m)
 {
-	return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc));
+	return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
 }
 
-static void save_mp_irq(struct mp_config_intsrc *m)
+static void save_mp_irq(struct mpc_intsrc *m)
 {
 	int i;
 
@@ -1026,7 +1061,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 {
 	int ioapic;
 	int pin;
-	struct mp_config_intsrc mp_irq;
+	struct mpc_intsrc mp_irq;
 
 	/*
 	 * Convert 'gsi' to 'ioapic.pin'.
@@ -1034,7 +1069,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 	ioapic = mp_find_ioapic(gsi);
 	if (ioapic < 0)
 		return;
-	pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+	pin = mp_find_ioapic_pin(ioapic, gsi);
 
 	/*
 	 * TBD: This check is for faulty timer entries, where the override
@@ -1044,13 +1079,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 	if ((bus_irq == 0) && (trigger == 3))
 		trigger = 1;
 
-	mp_irq.mp_type = MP_INTSRC;
-	mp_irq.mp_irqtype = mp_INT;
-	mp_irq.mp_irqflag = (trigger << 2) | polarity;
-	mp_irq.mp_srcbus = MP_ISA_BUS;
-	mp_irq.mp_srcbusirq = bus_irq;	/* IRQ */
-	mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */
-	mp_irq.mp_dstirq = pin;	/* INTIN# */
+	mp_irq.type = MP_INTSRC;
+	mp_irq.irqtype = mp_INT;
+	mp_irq.irqflag = (trigger << 2) | polarity;
+	mp_irq.srcbus = MP_ISA_BUS;
+	mp_irq.srcbusirq = bus_irq;	/* IRQ */
+	mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */
+	mp_irq.dstirq = pin;	/* INTIN# */
 
 	save_mp_irq(&mp_irq);
 }
@@ -1060,7 +1095,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 	int i;
 	int ioapic;
 	unsigned int dstapic;
-	struct mp_config_intsrc mp_irq;
+	struct mpc_intsrc mp_irq;
 
 #if defined (CONFIG_MCA) || defined (CONFIG_EISA)
 	/*
@@ -1085,7 +1120,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 	ioapic = mp_find_ioapic(0);
 	if (ioapic < 0)
 		return;
-	dstapic = mp_ioapics[ioapic].mp_apicid;
+	dstapic = mp_ioapics[ioapic].apicid;
 
 	/*
 	 * Use the default configuration for the IRQs 0-15. Unless
@@ -1095,16 +1130,14 @@ void __init mp_config_acpi_legacy_irqs(void)
 		int idx;
 
 		for (idx = 0; idx < mp_irq_entries; idx++) {
-			struct mp_config_intsrc *irq = mp_irqs + idx;
+			struct mpc_intsrc *irq = mp_irqs + idx;
 
 			/* Do we already have a mapping for this ISA IRQ? */
-			if (irq->mp_srcbus == MP_ISA_BUS
-			    && irq->mp_srcbusirq == i)
+			if (irq->srcbus == MP_ISA_BUS && irq->srcbusirq == i)
 				break;
 
 			/* Do we already have a mapping for this IOAPIC pin */
-			if (irq->mp_dstapic == dstapic &&
-			    irq->mp_dstirq == i)
+			if (irq->dstapic == dstapic && irq->dstirq == i)
 				break;
 		}
 
@@ -1113,13 +1146,13 @@ void __init mp_config_acpi_legacy_irqs(void)
 			continue;	/* IRQ already used */
 		}
 
-		mp_irq.mp_type = MP_INTSRC;
-		mp_irq.mp_irqflag = 0;	/* Conforming */
-		mp_irq.mp_srcbus = MP_ISA_BUS;
-		mp_irq.mp_dstapic = dstapic;
-		mp_irq.mp_irqtype = mp_INT;
-		mp_irq.mp_srcbusirq = i; /* Identity mapped */
-		mp_irq.mp_dstirq = i;
+		mp_irq.type = MP_INTSRC;
+		mp_irq.irqflag = 0;	/* Conforming */
+		mp_irq.srcbus = MP_ISA_BUS;
+		mp_irq.dstapic = dstapic;
+		mp_irq.irqtype = mp_INT;
+		mp_irq.srcbusirq = i; /* Identity mapped */
+		mp_irq.dstirq = i;
 
 		save_mp_irq(&mp_irq);
 	}
@@ -1156,7 +1189,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
 		return gsi;
 	}
 
-	ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+	ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
 
 #ifdef CONFIG_X86_32
 	if (ioapic_renumber_irq)
@@ -1230,22 +1263,22 @@ int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
 			u32 gsi, int triggering, int polarity)
 {
 #ifdef CONFIG_X86_MPPARSE
-	struct mp_config_intsrc mp_irq;
+	struct mpc_intsrc mp_irq;
 	int ioapic;
 
 	if (!acpi_ioapic)
 		return 0;
 
 	/* print the entry should happen on mptable identically */
-	mp_irq.mp_type = MP_INTSRC;
-	mp_irq.mp_irqtype = mp_INT;
-	mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+	mp_irq.type = MP_INTSRC;
+	mp_irq.irqtype = mp_INT;
+	mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
 				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
-	mp_irq.mp_srcbus = number;
-	mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+	mp_irq.srcbus = number;
+	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
 	ioapic = mp_find_ioapic(gsi);
-	mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id;
-	mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
+	mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
+	mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
 
 	save_mp_irq(&mp_irq);
 #endif
@@ -1372,7 +1405,7 @@ static void __init acpi_process_madt(void)
 	if (!error) {
 		acpi_lapic = 1;
 
-#ifdef CONFIG_X86_GENERICARCH
+#ifdef CONFIG_X86_BIGSMP
 		generic_bigsmp_probe();
 #endif
 		/*
@@ -1384,9 +1417,8 @@ static void __init acpi_process_madt(void)
 			acpi_ioapic = 1;
 
 			smp_found_config = 1;
-#ifdef CONFIG_X86_32
-			setup_apic_routing();
-#endif
+			if (apic->setup_apic_routing)
+				apic->setup_apic_routing();
 		}
 	}
 	if (error == -EINVAL) {
@@ -1493,7 +1525,7 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
 
 /*
  * If your system is blacklisted here, but you find that acpi=force
- * works for you, please contact acpi-devel@sourceforge.net
+ * works for you, please contact linux-acpi@vger.kernel.org
  */
 static struct dmi_system_id __initdata acpi_dmi_table[] = {
 	/*
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 3355973b12ac..580b4e296010 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -3,8 +3,8 @@
  */
 #include <asm/segment.h>
 #include <asm/msr-index.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
 #include <asm/processor-flags.h>
 
 	.code16
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index a60c1f3bcb87..7c243a2c5115 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,6 +101,7 @@ int acpi_save_state_mem(void)
 	stack_start.sp = temp_stack + sizeof(temp_stack);
 	early_gdt_descr.address =
 			(unsigned long)get_cpu_gdt_table(smp_processor_id());
+	initial_gs = per_cpu_offset(smp_processor_id());
 #endif
 	initial_code = (unsigned long)wakeup_long64;
 	saved_magic = 0x123456789abcdef0;
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index a12e6a9fb659..8ded418b0593 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -1,7 +1,7 @@
 	.section .text.page_aligned
 #include <linux/linkage.h>
 #include <asm/segment.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
 
 # Copyright 2003, 2008 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
 
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index bcc293423a70..8ea5164cbd04 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -1,8 +1,8 @@
 .text
 #include <linux/linkage.h>
 #include <asm/segment.h>
-#include <asm/pgtable.h>
-#include <asm/page.h>
+#include <asm/pgtable_types.h>
+#include <asm/page_types.h>
 #include <asm/msr.h>
 #include <asm/asm-offsets.h>
 
@@ -13,7 +13,6 @@
  * Hooray, we are in Long 64-bit mode (but still running in low memory)
  */
 ENTRY(wakeup_long64)
-wakeup_long64:
 	movq	saved_magic, %rax
 	movq	$0x123456789abcdef0, %rdx
 	cmpq	%rdx, %rax
@@ -34,16 +33,12 @@ wakeup_long64:
 
 	movq	saved_rip, %rax
 	jmp	*%rax
+ENDPROC(wakeup_long64)
 
 bogus_64_magic:
 	jmp	bogus_64_magic
 
-	.align 2
-	.p2align 4,,15
-.globl do_suspend_lowlevel
-	.type	do_suspend_lowlevel,@function
-do_suspend_lowlevel:
-.LFB5:
+ENTRY(do_suspend_lowlevel)
 	subq	$8, %rsp
 	xorl	%eax, %eax
 	call	save_processor_state
@@ -67,7 +62,7 @@ do_suspend_lowlevel:
 	pushfq
 	popq	pt_regs_flags(%rax)
 
-	movq	$.L97, saved_rip(%rip)
+	movq	$resume_point, saved_rip(%rip)
 
 	movq	%rsp, saved_rsp
 	movq	%rbp, saved_rbp
@@ -78,14 +73,12 @@ do_suspend_lowlevel:
 	addq	$8, %rsp
 	movl	$3, %edi
 	xorl	%eax, %eax
-	jmp	acpi_enter_sleep_state
-.L97:
-	.p2align 4,,7
-.L99:
-	.align 4
-	movl	$24, %eax
-	movw	%ax, %ds
+	call	acpi_enter_sleep_state
+	/* in case something went wrong, restore the machine status and go on */
+	jmp	resume_point
 
+	.align 4
+resume_point:
 	/* We don't restore %rax, it must be 0 anyway */
 	movq	$saved_context, %rax
 	movq	saved_context_cr4(%rax), %rbx
@@ -117,12 +110,9 @@ do_suspend_lowlevel:
 	xorl	%eax, %eax
 	addq	$8, %rsp
 	jmp	restore_processor_state
-.LFE5:
-.Lfe5:
-	.size	do_suspend_lowlevel, .Lfe5-do_suspend_lowlevel
-
+ENDPROC(do_suspend_lowlevel)
+
 .data
-ALIGN
 ENTRY(saved_rbp)	.quad	0
 ENTRY(saved_rsi)	.quad	0
 ENTRY(saved_rdi)	.quad	0
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a84ac7b570e6..f57658702571 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -5,6 +5,7 @@
 #include <linux/kprobes.h>
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
+#include <linux/memory.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
 #include <asm/pgtable.h>
@@ -12,7 +13,9 @@
 #include <asm/nmi.h>
 #include <asm/vsyscall.h>
 #include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
 #include <asm/io.h>
+#include <asm/fixmap.h>
 
 #define MAX_PATCH_LEN (255-1)
 
@@ -226,6 +229,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 {
 	u8 **ptr;
 
+	mutex_lock(&text_mutex);
 	for (ptr = start; ptr < end; ptr++) {
 		if (*ptr < text)
 			continue;
@@ -234,6 +238,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 		/* turn DS segment override prefix into lock prefix */
 		text_poke(*ptr, ((unsigned char []){0xf0}), 1);
 	};
+	mutex_unlock(&text_mutex);
 }
 
 static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
@@ -243,6 +248,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 	if (noreplace_smp)
 		return;
 
+	mutex_lock(&text_mutex);
 	for (ptr = start; ptr < end; ptr++) {
 		if (*ptr < text)
 			continue;
@@ -251,6 +257,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 		/* turn lock prefix into DS segment override prefix */
 		text_poke(*ptr, ((unsigned char []){0x3E}), 1);
 	};
+	mutex_unlock(&text_mutex);
 }
 
 struct smp_alt_module {
@@ -414,9 +421,17 @@ void __init alternative_instructions(void)
 	   that might execute the to be patched code.
 	   Other CPUs are not running. */
 	stop_nmi();
-#ifdef CONFIG_X86_MCE
-	stop_mce();
-#endif
+
+	/*
+	 * Don't stop machine check exceptions while patching.
+	 * MCEs only happen when something got corrupted and in this
+	 * case we must do something about the corruption.
+	 * Ignoring it is worse than a unlikely patching race.
+	 * Also machine checks tend to be broadcast and if one CPU
+	 * goes into machine check the others follow quickly, so we don't
+	 * expect a machine check to cause undue problems during to code
+	 * patching.
+	 */
 
 	apply_alternatives(__alt_instructions, __alt_instructions_end);
 
@@ -456,9 +471,6 @@ void __init alternative_instructions(void)
 					    (unsigned long)__smp_locks_end);
 
 	restart_nmi();
-#ifdef CONFIG_X86_MCE
-	restart_mce();
-#endif
 }
 
 /**
@@ -495,12 +507,13 @@ void *text_poke_early(void *addr, const void *opcode, size_t len)
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
+ *
+ * Note: Must be called under text_mutex.
 */
 void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
 {
 	unsigned long flags;
 	char *vaddr;
-	int nr_pages = 2;
 	struct page *pages[2];
 	int i;
 
@@ -513,18 +526,21 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
 		pages[1] = virt_to_page(addr + PAGE_SIZE);
 	}
 	BUG_ON(!pages[0]);
-	if (!pages[1])
-		nr_pages = 1;
-	vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
-	BUG_ON(!vaddr);
 	local_irq_save(flags);
+	set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
+	if (pages[1])
+		set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
+	vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
 	memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
-	local_irq_restore(flags);
-	vunmap(vaddr);
+	clear_fixmap(FIX_TEXT_POKE0);
+	if (pages[1])
+		clear_fixmap(FIX_TEXT_POKE1);
+	local_flush_tlb();
 	sync_core();
 	/* Could also do a CLFLUSH here to speed up CPU recovery; but
 	   that causes hangs on some VIA CPUs. */
 	for (i = 0; i < len; i++)
 		BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
+	local_irq_restore(flags);
 	return addr;
 }
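
The reworked text_poke() maps at most two pages through fixed slots (FIX_TEXT_POKE0/1) because a patch can straddle one page boundary. A small user-space sketch of just the crossing test (PAGE_SIZE of 4096 is assumed; crosses_page() is an illustrative helper, not kernel API):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    /* Does [addr, addr + len) straddle a page boundary? */
    static int crosses_page(uintptr_t addr, size_t len)
    {
        return ((addr & ~PAGE_MASK) + len) > PAGE_SIZE;
    }

    int main(void)
    {
        printf("%d\n", crosses_page(0x1000, 16)); /* 0: fits in one page */
        printf("%d\n", crosses_page(0x1ffc, 16)); /* 1: needs two pages  */
        return 0;
    }
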
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 5113c080f0c4..a97db99dad52 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -22,10 +22,9 @@
 #include <linux/bitops.h>
 #include <linux/debugfs.h>
 #include <linux/scatterlist.h>
+#include <linux/dma-mapping.h>
 #include <linux/iommu-helper.h>
-#ifdef CONFIG_IOMMU_API
 #include <linux/iommu.h>
-#endif
 #include <asm/proto.h>
 #include <asm/iommu.h>
 #include <asm/gart.h>
@@ -1297,8 +1296,10 @@ static void __unmap_single(struct amd_iommu *iommu,
 /*
  * The exported map_single function for dma_ops.
  */
-static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
-			     size_t size, int dir)
+static dma_addr_t map_page(struct device *dev, struct page *page,
+			   unsigned long offset, size_t size,
+			   enum dma_data_direction dir,
+			   struct dma_attrs *attrs)
 {
 	unsigned long flags;
 	struct amd_iommu *iommu;
@@ -1306,6 +1307,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
 	u16 devid;
 	dma_addr_t addr;
 	u64 dma_mask;
+	phys_addr_t paddr = page_to_phys(page) + offset;
 
 	INC_STATS_COUNTER(cnt_map_single);
 
@@ -1340,8 +1342,8 @@ out:
 /*
  * The exported unmap_single function for dma_ops.
  */
-static void unmap_single(struct device *dev, dma_addr_t dma_addr,
-			 size_t size, int dir)
+static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
+		       enum dma_data_direction dir, struct dma_attrs *attrs)
 {
 	unsigned long flags;
 	struct amd_iommu *iommu;
@@ -1390,7 +1392,8 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
 * lists).
 */
 static int map_sg(struct device *dev, struct scatterlist *sglist,
-		  int nelems, int dir)
+		  int nelems, enum dma_data_direction dir,
+		  struct dma_attrs *attrs)
 {
 	unsigned long flags;
 	struct amd_iommu *iommu;
@@ -1457,7 +1460,8 @@ unmap:
 * lists).
 */
 static void unmap_sg(struct device *dev, struct scatterlist *sglist,
-		     int nelems, int dir)
+		     int nelems, enum dma_data_direction dir,
+		     struct dma_attrs *attrs)
 {
 	unsigned long flags;
 	struct amd_iommu *iommu;
@@ -1644,11 +1648,11 @@ static void prealloc_protection_domains(void)
 	}
 }
 
-static struct dma_mapping_ops amd_iommu_dma_ops = {
+static struct dma_map_ops amd_iommu_dma_ops = {
 	.alloc_coherent = alloc_coherent,
 	.free_coherent = free_coherent,
-	.map_single = map_single,
-	.unmap_single = unmap_single,
+	.map_page = map_page,
+	.unmap_page = unmap_page,
 	.map_sg = map_sg,
 	.unmap_sg = unmap_sg,
 	.dma_supported = amd_iommu_dma_supported,
@@ -1924,6 +1928,12 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 	return paddr;
 }
 
+static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
+				    unsigned long cap)
+{
+	return 0;
+}
+
 static struct iommu_ops amd_iommu_ops = {
 	.domain_init = amd_iommu_domain_init,
 	.domain_destroy = amd_iommu_domain_destroy,
@@ -1932,5 +1942,6 @@ static struct iommu_ops amd_iommu_ops = {
 	.map = amd_iommu_map_range,
 	.unmap = amd_iommu_unmap_range,
 	.iova_to_phys = amd_iommu_iova_to_phys,
+	.domain_has_cap = amd_iommu_domain_has_cap,
 };
 
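
The dma_map_ops conversion replaces the map_single(dev, paddr, ...) entry point with map_page(dev, page, offset, ...); the physical address the IOMMU sees is unchanged, it is just derived inside the callback. A toy illustration of that equivalence (page_to_phys() and struct page are faked here for a runnable example):

    #include <stdio.h>

    #define PAGE_SHIFT 12

    struct page { unsigned long pfn; }; /* stand-in for the kernel's struct page */

    static unsigned long page_to_phys(struct page *p)
    {
        return p->pfn << PAGE_SHIFT;
    }

    int main(void)
    {
        struct page pg = { .pfn = 0x1234 };
        unsigned long offset = 0x80;

        /* What map_page() computes before doing the old map_single() work: */
        unsigned long paddr = page_to_phys(&pg) + offset;

        printf("paddr = 0x%lx\n", paddr); /* 0x1234080 */
        return 0;
    }
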
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
new file mode 100644
index 000000000000..da7b7b9f8bd8
--- /dev/null
+++ b/arch/x86/kernel/apic/Makefile
@@ -0,0 +1,19 @@
+#
+# Makefile for local APIC drivers and for the IO-APIC code
+#
+
+obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o probe_$(BITS).o ipi.o nmi.o
+obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
+obj-$(CONFIG_SMP)		+= ipi.o
+
+ifeq ($(CONFIG_X86_64),y)
+obj-y				+= apic_flat_64.o
+obj-$(CONFIG_X86_X2APIC)	+= x2apic_cluster.o
+obj-$(CONFIG_X86_X2APIC)	+= x2apic_phys.o
+obj-$(CONFIG_X86_UV)		+= x2apic_uv_x.o
+endif
+
+obj-$(CONFIG_X86_BIGSMP)	+= bigsmp_32.o
+obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o
+obj-$(CONFIG_X86_ES7000)	+= es7000_32.o
+obj-$(CONFIG_X86_SUMMIT)	+= summit_32.o
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic/apic.c
index 115449f869ee..098ec84b8c00 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1,7 +1,7 @@
 /*
  * Local APIC handling, local APIC timers
  *
- * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
  *
  * Fixes
  * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
@@ -14,51 +14,70 @@
  * Mikael Pettersson : PM converted to driver model.
  */
 
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
 #include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/ioport.h>
-#include <linux/cpu.h>
-#include <linux/clockchips.h>
+#include <linux/mc146818rtc.h>
 #include <linux/acpi_pmtmr.h>
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/bootmem.h>
+#include <linux/ftrace.h>
+#include <linux/ioport.h>
 #include <linux/module.h>
-#include <linux/dmi.h>
+#include <linux/sysdev.h>
+#include <linux/delay.h>
+#include <linux/timex.h>
 #include <linux/dmar.h>
-#include <linux/ftrace.h>
-#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/dmi.h>
 #include <linux/nmi.h>
-#include <linux/timex.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
 
+#include <asm/pgalloc.h>
 #include <asm/atomic.h>
-#include <asm/mtrr.h>
 #include <asm/mpspec.h>
-#include <asm/desc.h>
-#include <asm/arch_hooks.h>
-#include <asm/hpet.h>
-#include <asm/pgalloc.h>
 #include <asm/i8253.h>
-#include <asm/idle.h>
+#include <asm/i8259.h>
 #include <asm/proto.h>
 #include <asm/apic.h>
-#include <asm/i8259.h>
+#include <asm/desc.h>
+#include <asm/hpet.h>
+#include <asm/idle.h>
+#include <asm/mtrr.h>
 #include <asm/smp.h>
+#include <asm/mce.h>
+
+unsigned int num_processors;
+
+unsigned disabled_cpus __cpuinitdata;
+
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_physical_apicid = -1U;
+
+/*
+ * The highest APIC ID seen during enumeration.
+ *
+ * This determines the messaging protocol we can use: if all APIC IDs
+ * are in the 0 ... 7 range, then we can use logical addressing which
+ * has some performance advantages (better broadcasting).
+ *
+ * If there's an APIC ID above 8, we use physical addressing.
+ */
+unsigned int max_physical_apicid;
 
-#include <mach_apic.h>
-#include <mach_apicdef.h>
-#include <mach_ipi.h>
+/*
+ * Bitmask of physically existing CPUs:
+ */
+physid_mask_t phys_cpu_present_map;
 
 /*
- * Sanity check
+ * Map cpu index to physical APIC ID
  */
-#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F)
-# error SPURIOUS_APIC_VECTOR definition error
-#endif
+DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
 
 #ifdef CONFIG_X86_32
 /*
@@ -92,11 +111,7 @@ static __init int setup_apicpmtimer(char *s)
 __setup("apicpmtimer", setup_apicpmtimer);
 #endif
 
-#ifdef CONFIG_X86_64
-#define HAVE_X2APIC
-#endif
-
-#ifdef HAVE_X2APIC
+#ifdef CONFIG_X86_X2APIC
 int x2apic;
 /* x2apic enabled before OS handover */
 static int x2apic_preenabled;
@@ -194,18 +209,13 @@ static int modern_apic(void)
 	return lapic_get_version() >= 0x14;
 }
 
-/*
- * Paravirt kernels also might be using these below ops. So we still
- * use generic apic_read()/apic_write(), which might be pointing to different
- * ops in PARAVIRT case.
- */
-void xapic_wait_icr_idle(void)
+void native_apic_wait_icr_idle(void)
 {
 	while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
 		cpu_relax();
 }
 
-u32 safe_xapic_wait_icr_idle(void)
+u32 native_safe_apic_wait_icr_idle(void)
 {
 	u32 send_status;
 	int timeout;
@@ -221,13 +231,13 @@ u32 safe_xapic_wait_icr_idle(void)
 	return send_status;
 }
 
-void xapic_icr_write(u32 low, u32 id)
+void native_apic_icr_write(u32 low, u32 id)
 {
 	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
 	apic_write(APIC_ICR, low);
 }
 
-static u64 xapic_icr_read(void)
+u64 native_apic_icr_read(void)
 {
 	u32 icr1, icr2;
 
@@ -237,54 +247,6 @@ static u64 xapic_icr_read(void)
 	return icr1 | ((u64)icr2 << 32);
 }
 
-static struct apic_ops xapic_ops = {
-	.read = native_apic_mem_read,
-	.write = native_apic_mem_write,
-	.icr_read = xapic_icr_read,
-	.icr_write = xapic_icr_write,
-	.wait_icr_idle = xapic_wait_icr_idle,
-	.safe_wait_icr_idle = safe_xapic_wait_icr_idle,
-};
-
-struct apic_ops __read_mostly *apic_ops = &xapic_ops;
-EXPORT_SYMBOL_GPL(apic_ops);
-
-#ifdef HAVE_X2APIC
-static void x2apic_wait_icr_idle(void)
-{
-	/* no need to wait for icr idle in x2apic */
-	return;
-}
-
-static u32 safe_x2apic_wait_icr_idle(void)
-{
-	/* no need to wait for icr idle in x2apic */
-	return 0;
-}
-
-void x2apic_icr_write(u32 low, u32 id)
-{
-	wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
-}
-
-static u64 x2apic_icr_read(void)
-{
-	unsigned long val;
-
-	rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
-	return val;
-}
-
-static struct apic_ops x2apic_ops = {
-	.read = native_apic_msr_read,
-	.write = native_apic_msr_write,
-	.icr_read = x2apic_icr_read,
-	.icr_write = x2apic_icr_write,
-	.wait_icr_idle = x2apic_wait_icr_idle,
-	.safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
-};
-#endif
-
 /**
  * enable_NMI_through_LVT0 - enable NMI through local vector table 0
  */
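
The removed xapic_ops/x2apic_ops tables had this shape: a struct of function pointers selected once at boot, with native_* implementations behind it (after this series the methods live in the per-driver 'struct apic' instead). A minimal, self-contained sketch of the general pattern only; the names and register value below are invented for illustration:

    #include <stdio.h>

    struct apic_ops_sketch {
        unsigned int (*read)(unsigned int reg);
        void (*wait_icr_idle)(void);
    };

    static unsigned int fake_mem_read(unsigned int reg)
    {
        return reg ^ 0xffu; /* stand-in for an MMIO register read */
    }

    static void fake_wait_icr_idle(void)
    {
        /* the x2APIC variant is empty: there is no ICR-busy flag to poll */
    }

    static struct apic_ops_sketch ops = {
        .read          = fake_mem_read,
        .wait_icr_idle = fake_wait_icr_idle,
    };

    int main(void)
    {
        ops.wait_icr_idle();              /* dispatch via the table */
        printf("0x%x\n", ops.read(0x30)); /* prints 0xcf */
        return 0;
    }
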
@@ -457,7 +419,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
 static void lapic_timer_broadcast(const struct cpumask *mask)
 {
 #ifdef CONFIG_SMP
-	send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
+	apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
 #endif
 }
 
@@ -535,7 +497,8 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
 	}
 }
 
-static int __init calibrate_by_pmtimer(long deltapm, long *delta)
+static int __init
+calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
 {
 	const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
 	const long pm_thresh = pm_100ms / 100;
@@ -546,7 +509,7 @@ static int __init calibrate_by_pmtimer(long deltapm, long *delta)
 	return -1;
 #endif
 
-	apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
+	apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm);
 
 	/* Check, if the PM timer is available */
 	if (!deltapm)
@@ -556,19 +519,30 @@ static int __init calibrate_by_pmtimer(long deltapm, long *delta)
 
 	if (deltapm > (pm_100ms - pm_thresh) &&
 	    deltapm < (pm_100ms + pm_thresh)) {
-		apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
-	} else {
-		res = (((u64)deltapm) * mult) >> 22;
-		do_div(res, 1000000);
-		pr_warning("APIC calibration not consistent "
-			   "with PM Timer: %ldms instead of 100ms\n",
-			   (long)res);
-		/* Correct the lapic counter value */
-		res = (((u64)(*delta)) * pm_100ms);
+		apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n");
+		return 0;
+	}
+
+	res = (((u64)deltapm) * mult) >> 22;
+	do_div(res, 1000000);
+	pr_warning("APIC calibration not consistent "
+		   "with PM-Timer: %ldms instead of 100ms\n",(long)res);
+
+	/* Correct the lapic counter value */
+	res = (((u64)(*delta)) * pm_100ms);
+	do_div(res, deltapm);
+	pr_info("APIC delta adjusted to PM-Timer: "
+		"%lu (%ld)\n", (unsigned long)res, *delta);
+	*delta = (long)res;
+
+	/* Correct the tsc counter value */
+	if (cpu_has_tsc) {
+		res = (((u64)(*deltatsc)) * pm_100ms);
 		do_div(res, deltapm);
-		pr_info("APIC delta adjusted to PM-Timer: "
-			"%lu (%ld)\n", (unsigned long)res, *delta);
-		*delta = (long)res;
+		apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
+			    "PM-Timer: %lu (%ld) \n",
+			    (unsigned long)res, *deltatsc);
+		*deltatsc = (long)res;
 	}
 
 	return 0;
@@ -579,7 +553,7 @@ static int __init calibrate_APIC_clock(void)
579 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 553 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
580 void (*real_handler)(struct clock_event_device *dev); 554 void (*real_handler)(struct clock_event_device *dev);
581 unsigned long deltaj; 555 unsigned long deltaj;
582 long delta; 556 long delta, deltatsc;
583 int pm_referenced = 0; 557 int pm_referenced = 0;
584 558
585 local_irq_disable(); 559 local_irq_disable();
@@ -609,9 +583,11 @@ static int __init calibrate_APIC_clock(void)
609 delta = lapic_cal_t1 - lapic_cal_t2; 583 delta = lapic_cal_t1 - lapic_cal_t2;
610 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); 584 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
611 585
586 deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
587
612 /* we trust the PM based calibration if possible */ 588 /* we trust the PM based calibration if possible */
613 pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, 589 pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
614 &delta); 590 &delta, &deltatsc);
615 591
616 /* Calculate the scaled math multiplication factor */ 592 /* Calculate the scaled math multiplication factor */
617 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 593 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
@@ -629,11 +605,10 @@ static int __init calibrate_APIC_clock(void)
629 calibration_result); 605 calibration_result);
630 606
631 if (cpu_has_tsc) { 607 if (cpu_has_tsc) {
632 delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
633 apic_printk(APIC_VERBOSE, "..... CPU clock speed is " 608 apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
634 "%ld.%04ld MHz.\n", 609 "%ld.%04ld MHz.\n",
635 (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), 610 (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
636 (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); 611 (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
637 } 612 }
638 613
639 apic_printk(APIC_VERBOSE, "..... host bus clock speed is " 614 apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
@@ -834,7 +809,7 @@ void clear_local_APIC(void)
834 u32 v; 809 u32 v;
835 810
836 /* APIC hasn't been mapped yet */ 811 /* APIC hasn't been mapped yet */
837 if (!apic_phys) 812 if (!x2apic && !apic_phys)
838 return; 813 return;
839 814
840 maxlvt = lapic_get_maxlvt(); 815 maxlvt = lapic_get_maxlvt();
@@ -862,12 +837,20 @@ void clear_local_APIC(void)
862 } 837 }
863 838
864 /* lets not touch this if we didn't frob it */ 839 /* lets not touch this if we didn't frob it */
865#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL) 840#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
866 if (maxlvt >= 5) { 841 if (maxlvt >= 5) {
867 v = apic_read(APIC_LVTTHMR); 842 v = apic_read(APIC_LVTTHMR);
868 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 843 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
869 } 844 }
870#endif 845#endif
846#ifdef CONFIG_X86_MCE_INTEL
847 if (maxlvt >= 6) {
848 v = apic_read(APIC_LVTCMCI);
849 if (!(v & APIC_LVT_MASKED))
850 apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
851 }
852#endif
853
871 /* 854 /*
872 * Clean APIC state for other OSs: 855 * Clean APIC state for other OSs:
873 */ 856 */
@@ -991,11 +974,11 @@ int __init verify_local_APIC(void)
991 */ 974 */
992 reg0 = apic_read(APIC_ID); 975 reg0 = apic_read(APIC_ID);
993 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); 976 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
994 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); 977 apic_write(APIC_ID, reg0 ^ apic->apic_id_mask);
995 reg1 = apic_read(APIC_ID); 978 reg1 = apic_read(APIC_ID);
996 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); 979 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
997 apic_write(APIC_ID, reg0); 980 apic_write(APIC_ID, reg0);
998 if (reg1 != (reg0 ^ APIC_ID_MASK)) 981 if (reg1 != (reg0 ^ apic->apic_id_mask))
999 return 0; 982 return 0;
1000 983
1001 /* 984 /*
@@ -1089,7 +1072,7 @@ static void __cpuinit lapic_setup_esr(void)
1089 return; 1072 return;
1090 } 1073 }
1091 1074
1092 if (esr_disable) { 1075 if (apic->disable_esr) {
1093 /* 1076 /*
1094 * Something untraceable is creating bad interrupts on 1077 * Something untraceable is creating bad interrupts on
1095 * secondary quads ... for the moment, just leave the 1078 * secondary quads ... for the moment, just leave the
@@ -1130,9 +1113,14 @@ void __cpuinit setup_local_APIC(void)
1130 unsigned int value; 1113 unsigned int value;
1131 int i, j; 1114 int i, j;
1132 1115
1116 if (disable_apic) {
1117 arch_disable_smp_support();
1118 return;
1119 }
1120
1133#ifdef CONFIG_X86_32 1121#ifdef CONFIG_X86_32
1134 /* Pound the ESR really hard over the head with a big hammer - mbligh */ 1122 /* Pound the ESR really hard over the head with a big hammer - mbligh */
1135 if (lapic_is_integrated() && esr_disable) { 1123 if (lapic_is_integrated() && apic->disable_esr) {
1136 apic_write(APIC_ESR, 0); 1124 apic_write(APIC_ESR, 0);
1137 apic_write(APIC_ESR, 0); 1125 apic_write(APIC_ESR, 0);
1138 apic_write(APIC_ESR, 0); 1126 apic_write(APIC_ESR, 0);
@@ -1146,7 +1134,7 @@ void __cpuinit setup_local_APIC(void)
1146 * Double-check whether this APIC is really registered. 1134 * Double-check whether this APIC is really registered.
1147 * This is meaningless in clustered apic mode, so we skip it. 1135 * This is meaningless in clustered apic mode, so we skip it.
1148 */ 1136 */
1149 if (!apic_id_registered()) 1137 if (!apic->apic_id_registered())
1150 BUG(); 1138 BUG();
1151 1139
1152 /* 1140 /*
@@ -1154,7 +1142,7 @@ void __cpuinit setup_local_APIC(void)
1154 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel 1142 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
1155 * document number 292116). So here it goes... 1143 * document number 292116). So here it goes...
1156 */ 1144 */
1157 init_apic_ldr(); 1145 apic->init_apic_ldr();
1158 1146
1159 /* 1147 /*
1160 * Set Task Priority to 'accept all'. We never change this 1148 * Set Task Priority to 'accept all'. We never change this
@@ -1262,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)
1262 apic_write(APIC_LVT1, value); 1250 apic_write(APIC_LVT1, value);
1263 1251
1264 preempt_enable(); 1252 preempt_enable();
1253
1254#ifdef CONFIG_X86_MCE_INTEL
1255 /* Recheck CMCI information after local APIC is up on CPU #0 */
1256 if (smp_processor_id() == 0)
1257 cmci_recheck();
1258#endif
1265} 1259}
1266 1260
1267void __cpuinit end_local_APIC_setup(void) 1261void __cpuinit end_local_APIC_setup(void)
@@ -1282,17 +1276,12 @@ void __cpuinit end_local_APIC_setup(void)
1282 apic_pm_activate(); 1276 apic_pm_activate();
1283} 1277}
1284 1278
1285#ifdef HAVE_X2APIC 1279#ifdef CONFIG_X86_X2APIC
1286void check_x2apic(void) 1280void check_x2apic(void)
1287{ 1281{
1288 int msr, msr2; 1282 if (x2apic_enabled()) {
1289
1290 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1291
1292 if (msr & X2APIC_ENABLE) {
1293 pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); 1283 pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
1294 x2apic_preenabled = x2apic = 1; 1284 x2apic_preenabled = x2apic = 1;
1295 apic_ops = &x2apic_ops;
1296 } 1285 }
1297} 1286}
1298 1287
@@ -1300,6 +1289,9 @@ void enable_x2apic(void)
1300{ 1289{
1301 int msr, msr2; 1290 int msr, msr2;
1302 1291
1292 if (!x2apic)
1293 return;
1294
1303 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1295 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1304 if (!(msr & X2APIC_ENABLE)) { 1296 if (!(msr & X2APIC_ENABLE)) {
1305 pr_info("Enabling x2apic\n"); 1297 pr_info("Enabling x2apic\n");
@@ -1312,6 +1304,7 @@ void __init enable_IR_x2apic(void)
1312#ifdef CONFIG_INTR_REMAP 1304#ifdef CONFIG_INTR_REMAP
1313 int ret; 1305 int ret;
1314 unsigned long flags; 1306 unsigned long flags;
1307 struct IO_APIC_route_entry **ioapic_entries = NULL;
1315 1308
1316 if (!cpu_has_x2apic) 1309 if (!cpu_has_x2apic)
1317 return; 1310 return;
@@ -1342,16 +1335,23 @@ void __init enable_IR_x2apic(void)
1342 return; 1335 return;
1343 } 1336 }
1344 1337
1345 local_irq_save(flags); 1338 ioapic_entries = alloc_ioapic_entries();
1346 mask_8259A(); 1339 if (!ioapic_entries) {
1340 pr_info("Allocate ioapic_entries failed: %d\n", ret);
1341 goto end;
1342 }
1347 1343
1348 ret = save_mask_IO_APIC_setup(); 1344 ret = save_IO_APIC_setup(ioapic_entries);
1349 if (ret) { 1345 if (ret) {
1350 pr_info("Saving IO-APIC state failed: %d\n", ret); 1346 pr_info("Saving IO-APIC state failed: %d\n", ret);
1351 goto end; 1347 goto end;
1352 } 1348 }
1353 1349
1354 ret = enable_intr_remapping(1); 1350 local_irq_save(flags);
1351 mask_IO_APIC_setup(ioapic_entries);
1352 mask_8259A();
1353
1354 ret = enable_intr_remapping(EIM_32BIT_APIC_ID);
1355 1355
1356 if (ret && x2apic_preenabled) { 1356 if (ret && x2apic_preenabled) {
1357 local_irq_restore(flags); 1357 local_irq_restore(flags);
@@ -1363,7 +1363,6 @@ void __init enable_IR_x2apic(void)
1363 1363
1364 if (!x2apic) { 1364 if (!x2apic) {
1365 x2apic = 1; 1365 x2apic = 1;
1366 apic_ops = &x2apic_ops;
1367 enable_x2apic(); 1366 enable_x2apic();
1368 } 1367 }
1369 1368
@@ -1372,14 +1371,14 @@ end_restore:
1372 /* 1371 /*
1373 * IR enabling failed 1372 * IR enabling failed
1374 */ 1373 */
1375 restore_IO_APIC_setup(); 1374 restore_IO_APIC_setup(ioapic_entries);
1376 else 1375 else
1377 reinit_intr_remapped_IO_APIC(x2apic_preenabled); 1376 reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries);
1378 1377
1379end:
1380 unmask_8259A(); 1378 unmask_8259A();
1381 local_irq_restore(flags); 1379 local_irq_restore(flags);
1382 1380
1381end:
1383 if (!ret) { 1382 if (!ret) {
1384 if (!x2apic_preenabled) 1383 if (!x2apic_preenabled)
1385 pr_info("Enabled x2apic and interrupt-remapping\n"); 1384 pr_info("Enabled x2apic and interrupt-remapping\n");
@@ -1387,6 +1386,8 @@ end:
1387 pr_info("Enabled Interrupt-remapping\n"); 1386 pr_info("Enabled Interrupt-remapping\n");
1388 } else 1387 } else
1389 pr_err("Failed to enable Interrupt-remapping and x2apic\n"); 1388 pr_err("Failed to enable Interrupt-remapping and x2apic\n");
1389 if (ioapic_entries)
1390 free_ioapic_entries(ioapic_entries);
1390#else 1391#else
1391 if (!cpu_has_x2apic) 1392 if (!cpu_has_x2apic)
1392 return; 1393 return;
@@ -1401,7 +1402,7 @@ end:
1401 1402
1402 return; 1403 return;
1403} 1404}
1404#endif /* HAVE_X2APIC */ 1405#endif /* CONFIG_X86_X2APIC */
1405 1406
1406#ifdef CONFIG_X86_64 1407#ifdef CONFIG_X86_64
1407/* 1408/*
@@ -1532,12 +1533,10 @@ void __init early_init_lapic_mapping(void)
1532 */ 1533 */
1533void __init init_apic_mappings(void) 1534void __init init_apic_mappings(void)
1534{ 1535{
1535#ifdef HAVE_X2APIC
1536 if (x2apic) { 1536 if (x2apic) {
1537 boot_cpu_physical_apicid = read_apic_id(); 1537 boot_cpu_physical_apicid = read_apic_id();
1538 return; 1538 return;
1539 } 1539 }
1540#endif
1541 1540
1542 /* 1541 /*
1543 * If no local APIC can be found then set up a fake all 1542 * If no local APIC can be found then set up a fake all
@@ -1570,11 +1569,11 @@ int apic_version[MAX_APICS];
1570 1569
1571int __init APIC_init_uniprocessor(void) 1570int __init APIC_init_uniprocessor(void)
1572{ 1571{
1573#ifdef CONFIG_X86_64
1574 if (disable_apic) { 1572 if (disable_apic) {
1575 pr_info("Apic disabled\n"); 1573 pr_info("Apic disabled\n");
1576 return -1; 1574 return -1;
1577 } 1575 }
1576#ifdef CONFIG_X86_64
1578 if (!cpu_has_apic) { 1577 if (!cpu_has_apic) {
1579 disable_apic = 1; 1578 disable_apic = 1;
1580 pr_info("Apic disabled by BIOS\n"); 1579 pr_info("Apic disabled by BIOS\n");
@@ -1596,11 +1595,9 @@ int __init APIC_init_uniprocessor(void)
1596 } 1595 }
1597#endif 1596#endif
1598 1597
1599#ifdef HAVE_X2APIC
1600 enable_IR_x2apic(); 1598 enable_IR_x2apic();
1601#endif
1602#ifdef CONFIG_X86_64 1599#ifdef CONFIG_X86_64
1603 setup_apic_routing(); 1600 default_setup_apic_routing();
1604#endif 1601#endif
1605 1602
1606 verify_local_APIC(); 1603 verify_local_APIC();
@@ -1621,35 +1618,31 @@ int __init APIC_init_uniprocessor(void)
1621 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 1618 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1622 setup_local_APIC(); 1619 setup_local_APIC();
1623 1620
1624#ifdef CONFIG_X86_64 1621#ifdef CONFIG_X86_IO_APIC
1625 /* 1622 /*
1626 * Now enable IO-APICs, actually call clear_IO_APIC 1623 * Now enable IO-APICs, actually call clear_IO_APIC
1627 * We need clear_IO_APIC before enabling vector on BP 1624 * We need clear_IO_APIC before enabling error vector
1628 */ 1625 */
1629 if (!skip_ioapic_setup && nr_ioapics) 1626 if (!skip_ioapic_setup && nr_ioapics)
1630 enable_IO_APIC(); 1627 enable_IO_APIC();
1631#endif 1628#endif
1632 1629
1633#ifdef CONFIG_X86_IO_APIC
1634 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
1635#endif
1636 localise_nmi_watchdog();
1637 end_local_APIC_setup(); 1630 end_local_APIC_setup();
1638 1631
1639#ifdef CONFIG_X86_IO_APIC 1632#ifdef CONFIG_X86_IO_APIC
1640 if (smp_found_config && !skip_ioapic_setup && nr_ioapics) 1633 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1641 setup_IO_APIC(); 1634 setup_IO_APIC();
1642# ifdef CONFIG_X86_64 1635 else {
1643 else
1644 nr_ioapics = 0; 1636 nr_ioapics = 0;
1645# endif 1637 localise_nmi_watchdog();
1638 }
1639#else
1640 localise_nmi_watchdog();
1646#endif 1641#endif
1647 1642
1643 setup_boot_clock();
1648#ifdef CONFIG_X86_64 1644#ifdef CONFIG_X86_64
1649 setup_boot_APIC_clock();
1650 check_nmi_watchdog(); 1645 check_nmi_watchdog();
1651#else
1652 setup_boot_clock();
1653#endif 1646#endif
1654 1647
1655 return 0; 1648 return 0;
@@ -1738,7 +1731,8 @@ void __init connect_bsp_APIC(void)
1738 outb(0x01, 0x23); 1731 outb(0x01, 0x23);
1739 } 1732 }
1740#endif 1733#endif
1741 enable_apic_mode(); 1734 if (apic->enable_apic_mode)
1735 apic->enable_apic_mode();
1742} 1736}
1743 1737
1744/** 1738/**
@@ -1876,29 +1870,39 @@ void __cpuinit generic_processor_info(int apicid, int version)
1876 } 1870 }
1877#endif 1871#endif
1878 1872
1879#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) 1873#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
1880 /* are we being called early in kernel startup? */ 1874 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1881 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1875 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1882 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
1883 u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1884
1885 cpu_to_apicid[cpu] = apicid;
1886 bios_cpu_apicid[cpu] = apicid;
1887 } else {
1888 per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1889 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1890 }
1891#endif 1876#endif
1892 1877
1893 set_cpu_possible(cpu, true); 1878 set_cpu_possible(cpu, true);
1894 set_cpu_present(cpu, true); 1879 set_cpu_present(cpu, true);
1895} 1880}
1896 1881
1897#ifdef CONFIG_X86_64
1898int hard_smp_processor_id(void) 1882int hard_smp_processor_id(void)
1899{ 1883{
1900 return read_apic_id(); 1884 return read_apic_id();
1901} 1885}
1886
1887void default_init_apic_ldr(void)
1888{
1889 unsigned long val;
1890
1891 apic_write(APIC_DFR, APIC_DFR_VALUE);
1892 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
1893 val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
1894 apic_write(APIC_LDR, val);
1895}
1896
1897#ifdef CONFIG_X86_32
1898int default_apicid_to_node(int logical_apicid)
1899{
1900#ifdef CONFIG_SMP
1901 return apicid_2_node[hard_smp_processor_id()];
1902#else
1903 return 0;
1904#endif
1905}
1902#endif 1906#endif
1903 1907
1904/* 1908/*
@@ -1959,6 +1963,10 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1959 1963
1960 local_irq_save(flags); 1964 local_irq_save(flags);
1961 disable_local_APIC(); 1965 disable_local_APIC();
1966#ifdef CONFIG_INTR_REMAP
1967 if (intr_remapping_enabled)
1968 disable_intr_remapping();
1969#endif
1962 local_irq_restore(flags); 1970 local_irq_restore(flags);
1963 return 0; 1971 return 0;
1964} 1972}
@@ -1969,19 +1977,42 @@ static int lapic_resume(struct sys_device *dev)
1969 unsigned long flags; 1977 unsigned long flags;
1970 int maxlvt; 1978 int maxlvt;
1971 1979
1980#ifdef CONFIG_INTR_REMAP
1981 int ret;
1982 struct IO_APIC_route_entry **ioapic_entries = NULL;
1983
1972 if (!apic_pm_state.active) 1984 if (!apic_pm_state.active)
1973 return 0; 1985 return 0;
1974 1986
1975 maxlvt = lapic_get_maxlvt();
1976
1977 local_irq_save(flags); 1987 local_irq_save(flags);
1988 if (x2apic) {
1989 ioapic_entries = alloc_ioapic_entries();
1990 if (!ioapic_entries) {
1991 WARN(1, "Alloc ioapic_entries in lapic resume failed.");
1992 return -ENOMEM;
1993 }
1978 1994
1979#ifdef HAVE_X2APIC 1995 ret = save_IO_APIC_setup(ioapic_entries);
1996 if (ret) {
1997 WARN(1, "Saving IO-APIC state failed: %d\n", ret);
1998 free_ioapic_entries(ioapic_entries);
1999 return ret;
2000 }
2001
2002 mask_IO_APIC_setup(ioapic_entries);
2003 mask_8259A();
2004 enable_x2apic();
2005 }
2006#else
2007 if (!apic_pm_state.active)
2008 return 0;
2009
2010 local_irq_save(flags);
1980 if (x2apic) 2011 if (x2apic)
1981 enable_x2apic(); 2012 enable_x2apic();
1982 else
1983#endif 2013#endif
1984 { 2014
2015 else {
1985 /* 2016 /*
1986 * Make sure the APICBASE points to the right address 2017 * Make sure the APICBASE points to the right address
1987 * 2018 *
@@ -1994,6 +2025,7 @@ static int lapic_resume(struct sys_device *dev)
1994 wrmsr(MSR_IA32_APICBASE, l, h); 2025 wrmsr(MSR_IA32_APICBASE, l, h);
1995 } 2026 }
1996 2027
2028 maxlvt = lapic_get_maxlvt();
1997 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 2029 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1998 apic_write(APIC_ID, apic_pm_state.apic_id); 2030 apic_write(APIC_ID, apic_pm_state.apic_id);
1999 apic_write(APIC_DFR, apic_pm_state.apic_dfr); 2031 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
@@ -2017,8 +2049,20 @@ static int lapic_resume(struct sys_device *dev)
2017 apic_write(APIC_ESR, 0); 2049 apic_write(APIC_ESR, 0);
2018 apic_read(APIC_ESR); 2050 apic_read(APIC_ESR);
2019 2051
2052#ifdef CONFIG_INTR_REMAP
2053 if (intr_remapping_enabled)
2054 reenable_intr_remapping(EIM_32BIT_APIC_ID);
2055
2056 if (x2apic) {
2057 unmask_8259A();
2058 restore_IO_APIC_setup(ioapic_entries);
2059 free_ioapic_entries(ioapic_entries);
2060 }
2061#endif
2062
2020 local_irq_restore(flags); 2063 local_irq_restore(flags);
2021 2064
2065
2022 return 0; 2066 return 0;
2023} 2067}
2024 2068
@@ -2056,7 +2100,9 @@ static int __init init_lapic_sysfs(void)
2056 error = sysdev_register(&device_lapic); 2100 error = sysdev_register(&device_lapic);
2057 return error; 2101 return error;
2058} 2102}
2059device_initcall(init_lapic_sysfs); 2103
2104/* local apic needs to resume before other devices access its registers. */
2105core_initcall(init_lapic_sysfs);
2060 2106
2061#else /* CONFIG_PM */ 2107#else /* CONFIG_PM */
2062 2108
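
The calibrate_by_pmtimer() rework above applies one correction to both counters: if the PM timer saw more or less than 100ms worth of ticks, the lapic and TSC deltas are rescaled by pm_100ms / deltapm. A standalone sketch of that arithmetic, assuming the ACPI PM timer's 3.579545 MHz rate; the sample deltas are invented and do_div() is replaced by plain 64-bit division:

#include <stdio.h>
#include <stdint.h>

#define PMTMR_TICKS_PER_SEC     3579545 /* ACPI PM timer rate, 3.579545 MHz */

int main(void)
{
        const long pm_100ms  = PMTMR_TICKS_PER_SEC / 10;
        const long pm_thresh = pm_100ms / 100;          /* +/- 1% tolerance */
        long deltapm  = 350000;         /* hypothetical PM ticks observed */
        long delta    = 2500000;        /* hypothetical lapic ticks, same window */
        long deltatsc = 240000000;      /* hypothetical TSC ticks, same window */

        if (deltapm > pm_100ms - pm_thresh && deltapm < pm_100ms + pm_thresh) {
                printf("PM-Timer result ok, no correction\n");
                return 0;
        }

        /* The window was not exactly 100ms: rescale both counters to the
         * values they would have reached in a true 100ms interval. */
        delta    = (long)((uint64_t)delta    * pm_100ms / deltapm);
        deltatsc = (long)((uint64_t)deltatsc * pm_100ms / deltapm);

        printf("lapic delta -> %ld, tsc delta -> %ld\n", delta, deltatsc);
        return 0;
}

This is why calibrate_APIC_clock() now computes deltatsc before the call and prints the CPU clock speed from the corrected value.
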
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 34185488e4fb..0014714ea97b 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -17,9 +17,8 @@
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h> 18#include <linux/hardirq.h>
19#include <asm/smp.h> 19#include <asm/smp.h>
20#include <asm/apic.h>
20#include <asm/ipi.h> 21#include <asm/ipi.h>
21#include <asm/genapic.h>
22#include <mach_apicdef.h>
23 22
24#ifdef CONFIG_ACPI 23#ifdef CONFIG_ACPI
25#include <acpi/acpi_bus.h> 24#include <acpi/acpi_bus.h>
@@ -74,7 +73,7 @@ static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
74 unsigned long flags; 73 unsigned long flags;
75 74
76 local_irq_save(flags); 75 local_irq_save(flags);
77 __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); 76 __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
78 local_irq_restore(flags); 77 local_irq_restore(flags);
79} 78}
80 79
@@ -85,14 +84,15 @@ static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
85 _flat_send_IPI_mask(mask, vector); 84 _flat_send_IPI_mask(mask, vector);
86} 85}
87 86
88static void flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, 87static void
89 int vector) 88 flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
90{ 89{
91 unsigned long mask = cpumask_bits(cpumask)[0]; 90 unsigned long mask = cpumask_bits(cpumask)[0];
92 int cpu = smp_processor_id(); 91 int cpu = smp_processor_id();
93 92
94 if (cpu < BITS_PER_LONG) 93 if (cpu < BITS_PER_LONG)
95 clear_bit(cpu, &mask); 94 clear_bit(cpu, &mask);
95
96 _flat_send_IPI_mask(mask, vector); 96 _flat_send_IPI_mask(mask, vector);
97} 97}
98 98
@@ -114,23 +114,27 @@ static void flat_send_IPI_allbutself(int vector)
114 _flat_send_IPI_mask(mask, vector); 114 _flat_send_IPI_mask(mask, vector);
115 } 115 }
116 } else if (num_online_cpus() > 1) { 116 } else if (num_online_cpus() > 1) {
117 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); 117 __default_send_IPI_shortcut(APIC_DEST_ALLBUT,
118 vector, apic->dest_logical);
118 } 119 }
119} 120}
120 121
121static void flat_send_IPI_all(int vector) 122static void flat_send_IPI_all(int vector)
122{ 123{
123 if (vector == NMI_VECTOR) 124 if (vector == NMI_VECTOR) {
124 flat_send_IPI_mask(cpu_online_mask, vector); 125 flat_send_IPI_mask(cpu_online_mask, vector);
125 else 126 } else {
126 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); 127 __default_send_IPI_shortcut(APIC_DEST_ALLINC,
128 vector, apic->dest_logical);
129 }
127} 130}
128 131
129static unsigned int get_apic_id(unsigned long x) 132static unsigned int flat_get_apic_id(unsigned long x)
130{ 133{
131 unsigned int id; 134 unsigned int id;
132 135
133 id = (((x)>>24) & 0xFFu); 136 id = (((x)>>24) & 0xFFu);
137
134 return id; 138 return id;
135} 139}
136 140
@@ -146,7 +150,7 @@ static unsigned int read_xapic_id(void)
146{ 150{
147 unsigned int id; 151 unsigned int id;
148 152
149 id = get_apic_id(apic_read(APIC_ID)); 153 id = flat_get_apic_id(apic_read(APIC_ID));
150 return id; 154 return id;
151} 155}
152 156
@@ -155,45 +159,67 @@ static int flat_apic_id_registered(void)
155 return physid_isset(read_xapic_id(), phys_cpu_present_map); 159 return physid_isset(read_xapic_id(), phys_cpu_present_map);
156} 160}
157 161
158static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask) 162static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
159{
160 return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
161}
162
163static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
164 const struct cpumask *andmask)
165{
166 unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
167 unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS;
168
169 return mask1 & mask2;
170}
171
172static unsigned int phys_pkg_id(int index_msb)
173{ 163{
174 return hard_smp_processor_id() >> index_msb; 164 return hard_smp_processor_id() >> index_msb;
175} 165}
176 166
177struct genapic apic_flat = { 167struct apic apic_flat = {
178 .name = "flat", 168 .name = "flat",
179 .acpi_madt_oem_check = flat_acpi_madt_oem_check, 169 .probe = NULL,
180 .int_delivery_mode = dest_LowestPrio, 170 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
181 .int_dest_mode = (APIC_DEST_LOGICAL != 0), 171 .apic_id_registered = flat_apic_id_registered,
182 .target_cpus = flat_target_cpus, 172
183 .vector_allocation_domain = flat_vector_allocation_domain, 173 .irq_delivery_mode = dest_LowestPrio,
184 .apic_id_registered = flat_apic_id_registered, 174 .irq_dest_mode = 1, /* logical */
185 .init_apic_ldr = flat_init_apic_ldr, 175
186 .send_IPI_all = flat_send_IPI_all, 176 .target_cpus = flat_target_cpus,
187 .send_IPI_allbutself = flat_send_IPI_allbutself, 177 .disable_esr = 0,
188 .send_IPI_mask = flat_send_IPI_mask, 178 .dest_logical = APIC_DEST_LOGICAL,
189 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, 179 .check_apicid_used = NULL,
190 .send_IPI_self = apic_send_IPI_self, 180 .check_apicid_present = NULL,
191 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, 181
192 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, 182 .vector_allocation_domain = flat_vector_allocation_domain,
193 .phys_pkg_id = phys_pkg_id, 183 .init_apic_ldr = flat_init_apic_ldr,
194 .get_apic_id = get_apic_id, 184
195 .set_apic_id = set_apic_id, 185 .ioapic_phys_id_map = NULL,
196 .apic_id_mask = (0xFFu<<24), 186 .setup_apic_routing = NULL,
187 .multi_timer_check = NULL,
188 .apicid_to_node = NULL,
189 .cpu_to_logical_apicid = NULL,
190 .cpu_present_to_apicid = default_cpu_present_to_apicid,
191 .apicid_to_cpu_present = NULL,
192 .setup_portio_remap = NULL,
193 .check_phys_apicid_present = default_check_phys_apicid_present,
194 .enable_apic_mode = NULL,
195 .phys_pkg_id = flat_phys_pkg_id,
196 .mps_oem_check = NULL,
197
198 .get_apic_id = flat_get_apic_id,
199 .set_apic_id = set_apic_id,
200 .apic_id_mask = 0xFFu << 24,
201
202 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
203 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
204
205 .send_IPI_mask = flat_send_IPI_mask,
206 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
207 .send_IPI_allbutself = flat_send_IPI_allbutself,
208 .send_IPI_all = flat_send_IPI_all,
209 .send_IPI_self = apic_send_IPI_self,
210
211 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
212 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
213 .wait_for_init_deassert = NULL,
214 .smp_callin_clear_local_apic = NULL,
215 .inquire_remote_apic = NULL,
216
217 .read = native_apic_mem_read,
218 .write = native_apic_mem_write,
219 .icr_read = native_apic_icr_read,
220 .icr_write = native_apic_icr_write,
221 .wait_icr_idle = native_apic_wait_icr_idle,
222 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
197}; 223};
198 224
199/* 225/*
@@ -232,18 +258,18 @@ static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
232 258
233static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) 259static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
234{ 260{
235 send_IPI_mask_sequence(cpumask, vector); 261 default_send_IPI_mask_sequence_phys(cpumask, vector);
236} 262}
237 263
238static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask, 264static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
239 int vector) 265 int vector)
240{ 266{
241 send_IPI_mask_allbutself(cpumask, vector); 267 default_send_IPI_mask_allbutself_phys(cpumask, vector);
242} 268}
243 269
244static void physflat_send_IPI_allbutself(int vector) 270static void physflat_send_IPI_allbutself(int vector)
245{ 271{
246 send_IPI_mask_allbutself(cpu_online_mask, vector); 272 default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
247} 273}
248 274
249static void physflat_send_IPI_all(int vector) 275static void physflat_send_IPI_all(int vector)
@@ -276,32 +302,72 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
276 * We're using fixed IRQ delivery, can only return one phys APIC ID. 302 * We're using fixed IRQ delivery, can only return one phys APIC ID.
277 * May as well be the first. 303 * May as well be the first.
278 */ 304 */
279 for_each_cpu_and(cpu, cpumask, andmask) 305 for_each_cpu_and(cpu, cpumask, andmask) {
280 if (cpumask_test_cpu(cpu, cpu_online_mask)) 306 if (cpumask_test_cpu(cpu, cpu_online_mask))
281 break; 307 break;
308 }
282 if (cpu < nr_cpu_ids) 309 if (cpu < nr_cpu_ids)
283 return per_cpu(x86_cpu_to_apicid, cpu); 310 return per_cpu(x86_cpu_to_apicid, cpu);
311
284 return BAD_APICID; 312 return BAD_APICID;
285} 313}
286 314
287struct genapic apic_physflat = { 315struct apic apic_physflat = {
288 .name = "physical flat", 316
289 .acpi_madt_oem_check = physflat_acpi_madt_oem_check, 317 .name = "physical flat",
290 .int_delivery_mode = dest_Fixed, 318 .probe = NULL,
291 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 319 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
292 .target_cpus = physflat_target_cpus, 320 .apic_id_registered = flat_apic_id_registered,
293 .vector_allocation_domain = physflat_vector_allocation_domain, 321
294 .apic_id_registered = flat_apic_id_registered, 322 .irq_delivery_mode = dest_Fixed,
295 .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ 323 .irq_dest_mode = 0, /* physical */
296 .send_IPI_all = physflat_send_IPI_all, 324
297 .send_IPI_allbutself = physflat_send_IPI_allbutself, 325 .target_cpus = physflat_target_cpus,
298 .send_IPI_mask = physflat_send_IPI_mask, 326 .disable_esr = 0,
299 .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, 327 .dest_logical = 0,
300 .send_IPI_self = apic_send_IPI_self, 328 .check_apicid_used = NULL,
301 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, 329 .check_apicid_present = NULL,
302 .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and, 330
303 .phys_pkg_id = phys_pkg_id, 331 .vector_allocation_domain = physflat_vector_allocation_domain,
304 .get_apic_id = get_apic_id, 332 /* not needed, but shouldn't hurt: */
305 .set_apic_id = set_apic_id, 333 .init_apic_ldr = flat_init_apic_ldr,
306 .apic_id_mask = (0xFFu<<24), 334
335 .ioapic_phys_id_map = NULL,
336 .setup_apic_routing = NULL,
337 .multi_timer_check = NULL,
338 .apicid_to_node = NULL,
339 .cpu_to_logical_apicid = NULL,
340 .cpu_present_to_apicid = default_cpu_present_to_apicid,
341 .apicid_to_cpu_present = NULL,
342 .setup_portio_remap = NULL,
343 .check_phys_apicid_present = default_check_phys_apicid_present,
344 .enable_apic_mode = NULL,
345 .phys_pkg_id = flat_phys_pkg_id,
346 .mps_oem_check = NULL,
347
348 .get_apic_id = flat_get_apic_id,
349 .set_apic_id = set_apic_id,
350 .apic_id_mask = 0xFFu << 24,
351
352 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
353 .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
354
355 .send_IPI_mask = physflat_send_IPI_mask,
356 .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
357 .send_IPI_allbutself = physflat_send_IPI_allbutself,
358 .send_IPI_all = physflat_send_IPI_all,
359 .send_IPI_self = apic_send_IPI_self,
360
361 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
362 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
363 .wait_for_init_deassert = NULL,
364 .smp_callin_clear_local_apic = NULL,
365 .inquire_remote_apic = NULL,
366
367 .read = native_apic_mem_read,
368 .write = native_apic_mem_write,
369 .icr_read = native_apic_icr_read,
370 .icr_write = native_apic_icr_write,
371 .wait_icr_idle = native_apic_wait_icr_idle,
372 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
307}; 373};
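
The flat driver works because in flat logical mode the ICR destination field is a plain bitmask, so _flat_send_IPI_mask() above can feed cpumask_bits(cpumask)[0] straight to the hardware and the allbutself variant merely clears the sender's bit. A userspace model of that mask handling; the online mask and CPU number are invented:

#include <stdio.h>

#define BITS_PER_LONG   ((int)(8 * sizeof(long)))

int main(void)
{
        unsigned long mask = 0x0fUL;    /* hypothetical: CPUs 0-3 online */
        int cpu = 2;                    /* hypothetical sending CPU */

        /* allbutself: drop our own bit before writing the destination.
         * The cpu < BITS_PER_LONG guard just keeps the shift defined;
         * the real limit is the 8-bit flat destination field. */
        if (cpu < BITS_PER_LONG)
                mask &= ~(1UL << cpu);

        printf("ICR logical destination: 0x%02lx\n", mask);
        return 0;
}

apic_physflat drops this trick entirely and sends one fixed-mode IPI per CPU via default_send_IPI_mask_sequence_phys(), which is what lets it scale past 8 processors.
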
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
new file mode 100644
index 000000000000..676cdac385c0
--- /dev/null
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -0,0 +1,267 @@
1/*
2 * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs.
3 *
4 * Drives the local APIC in "clustered mode".
5 */
6#include <linux/threads.h>
7#include <linux/cpumask.h>
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <linux/dmi.h>
11#include <linux/smp.h>
12
13#include <asm/apicdef.h>
14#include <asm/fixmap.h>
15#include <asm/mpspec.h>
16#include <asm/apic.h>
17#include <asm/ipi.h>
18
19static unsigned bigsmp_get_apic_id(unsigned long x)
20{
21 return (x >> 24) & 0xFF;
22}
23
24static int bigsmp_apic_id_registered(void)
25{
26 return 1;
27}
28
29static const struct cpumask *bigsmp_target_cpus(void)
30{
31#ifdef CONFIG_SMP
32 return cpu_online_mask;
33#else
34 return cpumask_of(0);
35#endif
36}
37
38static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid)
39{
40 return 0;
41}
42
43static unsigned long bigsmp_check_apicid_present(int bit)
44{
45 return 1;
46}
47
48static inline unsigned long calculate_ldr(int cpu)
49{
50 unsigned long val, id;
51
52 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
53 id = per_cpu(x86_bios_cpu_apicid, cpu);
54 val |= SET_APIC_LOGICAL_ID(id);
55
56 return val;
57}
58
59/*
60 * Set up the logical destination ID.
61 *
62 * Intel recommends to set DFR, LDR and TPR before enabling
63 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
64 * document number 292116). So here it goes...
65 */
66static void bigsmp_init_apic_ldr(void)
67{
68 unsigned long val;
69 int cpu = smp_processor_id();
70
71 apic_write(APIC_DFR, APIC_DFR_FLAT);
72 val = calculate_ldr(cpu);
73 apic_write(APIC_LDR, val);
74}
75
76static void bigsmp_setup_apic_routing(void)
77{
78 printk(KERN_INFO
79 "Enabling APIC mode: Physflat. Using %d I/O APICs\n",
80 nr_ioapics);
81}
82
83static int bigsmp_apicid_to_node(int logical_apicid)
84{
85 return apicid_2_node[hard_smp_processor_id()];
86}
87
88static int bigsmp_cpu_present_to_apicid(int mps_cpu)
89{
90 if (mps_cpu < nr_cpu_ids)
91 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
92
93 return BAD_APICID;
94}
95
96static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
97{
98 return physid_mask_of_physid(phys_apicid);
99}
100
101/* Mapping from cpu number to logical apicid */
102static inline int bigsmp_cpu_to_logical_apicid(int cpu)
103{
104 if (cpu >= nr_cpu_ids)
105 return BAD_APICID;
106 return cpu_physical_id(cpu);
107}
108
109static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)
110{
111 /* For clustered we don't have a good way to do this yet - hack */
112 return physids_promote(0xFFL);
113}
114
115static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid)
116{
117 return 1;
118}
119
120/* As we are using single CPU as destination, pick only one CPU here */
121static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
122{
123 return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask));
124}
125
126static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
127 const struct cpumask *andmask)
128{
129 int cpu;
130
131 /*
132 * We're using fixed IRQ delivery, can only return one phys APIC ID.
133 * May as well be the first.
134 */
135 for_each_cpu_and(cpu, cpumask, andmask) {
136 if (cpumask_test_cpu(cpu, cpu_online_mask))
137 break;
138 }
139 if (cpu < nr_cpu_ids)
140 return bigsmp_cpu_to_logical_apicid(cpu);
141
142 return BAD_APICID;
143}
144
145static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
146{
147 return cpuid_apic >> index_msb;
148}
149
150static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector)
151{
152 default_send_IPI_mask_sequence_phys(mask, vector);
153}
154
155static void bigsmp_send_IPI_allbutself(int vector)
156{
157 default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
158}
159
160static void bigsmp_send_IPI_all(int vector)
161{
162 bigsmp_send_IPI_mask(cpu_online_mask, vector);
163}
164
165static int dmi_bigsmp; /* can be set by dmi scanners */
166
167static int hp_ht_bigsmp(const struct dmi_system_id *d)
168{
169 printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
170 dmi_bigsmp = 1;
171
172 return 0;
173}
174
175
176static const struct dmi_system_id bigsmp_dmi_table[] = {
177 { hp_ht_bigsmp, "HP ProLiant DL760 G2",
178 { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
179 DMI_MATCH(DMI_BIOS_VERSION, "P44-"),
180 }
181 },
182
183 { hp_ht_bigsmp, "HP ProLiant DL740",
184 { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
185 DMI_MATCH(DMI_BIOS_VERSION, "P47-"),
186 }
187 },
188 { } /* NULL entry stops DMI scanning */
189};
190
191static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask)
192{
193 cpumask_clear(retmask);
194 cpumask_set_cpu(cpu, retmask);
195}
196
197static int probe_bigsmp(void)
198{
199 if (def_to_bigsmp)
200 dmi_bigsmp = 1;
201 else
202 dmi_check_system(bigsmp_dmi_table);
203
204 return dmi_bigsmp;
205}
206
207struct apic apic_bigsmp = {
208
209 .name = "bigsmp",
210 .probe = probe_bigsmp,
211 .acpi_madt_oem_check = NULL,
212 .apic_id_registered = bigsmp_apic_id_registered,
213
214 .irq_delivery_mode = dest_Fixed,
215 /* phys delivery to target CPU: */
216 .irq_dest_mode = 0,
217
218 .target_cpus = bigsmp_target_cpus,
219 .disable_esr = 1,
220 .dest_logical = 0,
221 .check_apicid_used = bigsmp_check_apicid_used,
222 .check_apicid_present = bigsmp_check_apicid_present,
223
224 .vector_allocation_domain = bigsmp_vector_allocation_domain,
225 .init_apic_ldr = bigsmp_init_apic_ldr,
226
227 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
228 .setup_apic_routing = bigsmp_setup_apic_routing,
229 .multi_timer_check = NULL,
230 .apicid_to_node = bigsmp_apicid_to_node,
231 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
232 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
233 .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present,
234 .setup_portio_remap = NULL,
235 .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
236 .enable_apic_mode = NULL,
237 .phys_pkg_id = bigsmp_phys_pkg_id,
238 .mps_oem_check = NULL,
239
240 .get_apic_id = bigsmp_get_apic_id,
241 .set_apic_id = NULL,
242 .apic_id_mask = 0xFF << 24,
243
244 .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid,
245 .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and,
246
247 .send_IPI_mask = bigsmp_send_IPI_mask,
248 .send_IPI_mask_allbutself = NULL,
249 .send_IPI_allbutself = bigsmp_send_IPI_allbutself,
250 .send_IPI_all = bigsmp_send_IPI_all,
251 .send_IPI_self = default_send_IPI_self,
252
253 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
254 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
255
256 .wait_for_init_deassert = default_wait_for_init_deassert,
257
258 .smp_callin_clear_local_apic = NULL,
259 .inquire_remote_apic = default_inquire_remote_apic,
260
261 .read = native_apic_mem_read,
262 .write = native_apic_mem_write,
263 .icr_read = native_apic_icr_read,
264 .icr_write = native_apic_icr_write,
265 .wait_icr_idle = native_apic_wait_icr_idle,
266 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
267};
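
calculate_ldr() above is the core of the bigsmp scheme: DFR is set to flat and the CPU's BIOS-reported APIC ID is installed as its logical ID in the top byte of the LDR. A sketch of the bit manipulation, assuming the SET_APIC_LOGICAL_ID(x) = (x) << 24 encoding from apicdef.h; the register contents and ID are invented:

#include <stdio.h>

#define APIC_LDR_MASK           (0xFFu << 24)
#define SET_APIC_LOGICAL_ID(x)  ((unsigned int)(x) << 24)

int main(void)
{
        unsigned int ldr = 0x05000000;  /* hypothetical current LDR value */
        unsigned int id  = 0x17;        /* hypothetical x86_bios_cpu_apicid */
        unsigned int val;

        val  = ldr & ~APIC_LDR_MASK;    /* clear the old logical ID byte */
        val |= SET_APIC_LOGICAL_ID(id); /* install this CPU's ID */

        printf("new LDR = 0x%08x\n", val);
        return 0;
}

Since the logical ID is just the physical ID here, bigsmp_cpu_to_logical_apicid() can return cpu_physical_id(cpu) directly.
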
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
new file mode 100644
index 000000000000..1c11b819f245
--- /dev/null
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -0,0 +1,781 @@
1/*
2 * Written by: Garry Forsgren, Unisys Corporation
3 * Natalie Protasevich, Unisys Corporation
4 *
5 * This file contains the code to configure and interface
6 * with Unisys ES7000 series hardware system manager.
7 *
8 * Copyright (c) 2003 Unisys Corporation.
9 * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
10 *
11 * All Rights Reserved.
12 *
13 * This program is free software; you can redistribute it and/or modify it
14 * under the terms of version 2 of the GNU General Public License as
15 * published by the Free Software Foundation.
16 *
17 * This program is distributed in the hope that it would be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with this program; if not, write the Free Software Foundation, Inc., 59
23 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
24 *
25 * Contact information: Unisys Corporation, Township Line & Union Meeting
26 * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
27 *
28 * http://www.unisys.com
29 */
30#include <linux/notifier.h>
31#include <linux/spinlock.h>
32#include <linux/cpumask.h>
33#include <linux/threads.h>
34#include <linux/kernel.h>
35#include <linux/module.h>
36#include <linux/reboot.h>
37#include <linux/string.h>
38#include <linux/types.h>
39#include <linux/errno.h>
40#include <linux/acpi.h>
41#include <linux/init.h>
42#include <linux/nmi.h>
43#include <linux/smp.h>
44#include <linux/io.h>
45
46#include <asm/apicdef.h>
47#include <asm/atomic.h>
48#include <asm/fixmap.h>
49#include <asm/mpspec.h>
50#include <asm/setup.h>
51#include <asm/apic.h>
52#include <asm/ipi.h>
53
54/*
55 * ES7000 chipsets
56 */
57
58#define NON_UNISYS 0
59#define ES7000_CLASSIC 1
60#define ES7000_ZORRO 2
61
62#define MIP_REG 1
63#define MIP_PSAI_REG 4
64
65#define MIP_BUSY 1
66#define MIP_SPIN 0xf0000
67#define MIP_VALID 0x0100000000000000ULL
68#define MIP_SW_APIC 0x1020b
69
70#define MIP_PORT(val) ((val >> 32) & 0xffff)
71
72#define MIP_RD_LO(val) (val & 0xffffffff)
73
74struct mip_reg {
75 unsigned long long off_0x00;
76 unsigned long long off_0x08;
77 unsigned long long off_0x10;
78 unsigned long long off_0x18;
79 unsigned long long off_0x20;
80 unsigned long long off_0x28;
81 unsigned long long off_0x30;
82 unsigned long long off_0x38;
83};
84
85struct mip_reg_info {
86 unsigned long long mip_info;
87 unsigned long long delivery_info;
88 unsigned long long host_reg;
89 unsigned long long mip_reg;
90};
91
92struct psai {
93 unsigned long long entry_type;
94 unsigned long long addr;
95 unsigned long long bep_addr;
96};
97
98#ifdef CONFIG_ACPI
99
100struct es7000_oem_table {
101 struct acpi_table_header Header;
102 u32 OEMTableAddr;
103 u32 OEMTableSize;
104};
105
106static unsigned long oem_addrX;
107static unsigned long oem_size;
108
109#endif
110
111/*
112 * ES7000 Globals
113 */
114
115static volatile unsigned long *psai;
116static struct mip_reg *mip_reg;
117static struct mip_reg *host_reg;
118static int mip_port;
119static unsigned long mip_addr;
120static unsigned long host_addr;
121
122int es7000_plat;
123
124/*
125 * GSI override for ES7000 platforms.
126 */
127
128static unsigned int base;
129
130static int
131es7000_rename_gsi(int ioapic, int gsi)
132{
133 if (es7000_plat == ES7000_ZORRO)
134 return gsi;
135
136 if (!base) {
137 int i;
138 for (i = 0; i < nr_ioapics; i++)
139 base += nr_ioapic_registers[i];
140 }
141
142 if (!ioapic && (gsi < 16))
143 gsi += base;
144
145 return gsi;
146}
147
148static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
149{
150 unsigned long vect = 0, psaival = 0;
151
152 if (psai == NULL)
153 return -1;
154
155 vect = ((unsigned long)__pa(eip)/0x1000) << 16;
156 psaival = (0x1000000 | vect | cpu);
157
158 while (*psai & 0x1000000)
159 ;
160
161 *psai = psaival;
162
163 return 0;
164}
165
166static int es7000_apic_is_cluster(void)
167{
168 /* MPENTIUMIII */
169 if (boot_cpu_data.x86 == 6 &&
170 (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11))
171 return 1;
172
173 return 0;
174}
175
176static void setup_unisys(void)
177{
178 /*
179 * Determine the generation of the ES7000 currently running.
180 *
181 * es7000_plat = 1 if the machine is a 5xx ES7000 box
182 * es7000_plat = 2 if the machine is an x86_64 ES7000 box
183 *
184 */
185 if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
186 es7000_plat = ES7000_ZORRO;
187 else
188 es7000_plat = ES7000_CLASSIC;
189 ioapic_renumber_irq = es7000_rename_gsi;
190}
191
192/*
193 * Parse the OEM Table:
194 */
195static int parse_unisys_oem(char *oemptr)
196{
197 int i;
198 int success = 0;
199 unsigned char type, size;
200 unsigned long val;
201 char *tp = NULL;
202 struct psai *psaip = NULL;
203 struct mip_reg_info *mi;
204 struct mip_reg *host, *mip;
205
206 tp = oemptr;
207
208 tp += 8;
209
210 for (i = 0; i <= 6; i++) {
211 type = *tp++;
212 size = *tp++;
213 tp -= 2;
214 switch (type) {
215 case MIP_REG:
216 mi = (struct mip_reg_info *)tp;
217 val = MIP_RD_LO(mi->host_reg);
218 host_addr = val;
219 host = (struct mip_reg *)val;
220 host_reg = __va(host);
221 val = MIP_RD_LO(mi->mip_reg);
222 mip_port = MIP_PORT(mi->mip_info);
223 mip_addr = val;
224 mip = (struct mip_reg *)val;
225 mip_reg = __va(mip);
226 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
227 (unsigned long)host_reg);
228 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
229 (unsigned long)mip_reg);
230 success++;
231 break;
232 case MIP_PSAI_REG:
233 psaip = (struct psai *)tp;
234 if (tp != NULL) {
235 if (psaip->addr)
236 psai = __va(psaip->addr);
237 else
238 psai = NULL;
239 success++;
240 }
241 break;
242 default:
243 break;
244 }
245 tp += size;
246 }
247
248 if (success < 2)
249 es7000_plat = NON_UNISYS;
250 else
251 setup_unisys();
252
253 return es7000_plat;
254}
255
256#ifdef CONFIG_ACPI
257static int find_unisys_acpi_oem_table(unsigned long *oem_addr)
258{
259 struct acpi_table_header *header = NULL;
260 struct es7000_oem_table *table;
261 acpi_size tbl_size;
262 acpi_status ret;
263 int i = 0;
264
265 for (;;) {
266 ret = acpi_get_table_with_size("OEM1", i++, &header, &tbl_size);
267 if (!ACPI_SUCCESS(ret))
268 return -1;
269
270 if (!memcmp((char *) &header->oem_id, "UNISYS", 6))
271 break;
272
273 early_acpi_os_unmap_memory(header, tbl_size);
274 }
275
276 table = (void *)header;
277
278 oem_addrX = table->OEMTableAddr;
279 oem_size = table->OEMTableSize;
280
281 early_acpi_os_unmap_memory(header, tbl_size);
282
283 *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, oem_size);
284
285 return 0;
286}
287
288static void unmap_unisys_acpi_oem_table(unsigned long oem_addr)
289{
290 if (!oem_addr)
291 return;
292
293 __acpi_unmap_table((char *)oem_addr, oem_size);
294}
295
296static int es7000_check_dsdt(void)
297{
298 struct acpi_table_header header;
299
300 if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
301 !strncmp(header.oem_id, "UNISYS", 6))
302 return 1;
303 return 0;
304}
305
306static int es7000_acpi_ret;
307
308/* Hook from generic ACPI tables.c */
309static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
310{
311 unsigned long oem_addr = 0;
312 int check_dsdt;
313 int ret = 0;
314
315 /* check dsdt at first to avoid clear fix_map for oem_addr */
316 check_dsdt = es7000_check_dsdt();
317
318 if (!find_unisys_acpi_oem_table(&oem_addr)) {
319 if (check_dsdt) {
320 ret = parse_unisys_oem((char *)oem_addr);
321 } else {
322 setup_unisys();
323 ret = 1;
324 }
325 /*
326 * we need to unmap it
327 */
328 unmap_unisys_acpi_oem_table(oem_addr);
329 }
330
331 es7000_acpi_ret = ret;
332
333 return ret && !es7000_apic_is_cluster();
334}
335
336static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
337{
338 int ret = es7000_acpi_ret;
339
340 return ret && es7000_apic_is_cluster();
341}
342
343#else /* !CONFIG_ACPI: */
344static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
345{
346 return 0;
347}
348
349static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
350{
351 return 0;
352}
353#endif /* !CONFIG_ACPI */
354
355static void es7000_spin(int n)
356{
357 int i = 0;
358
359 while (i++ < n)
360 rep_nop();
361}
362
363static int es7000_mip_write(struct mip_reg *mip_reg)
364{
365 int status = 0;
366 int spin;
367
368 spin = MIP_SPIN;
369 while ((host_reg->off_0x38 & MIP_VALID) != 0) {
370 if (--spin <= 0) {
371 WARN(1, "Timeout waiting for Host Valid Flag\n");
372 return -1;
373 }
374 es7000_spin(MIP_SPIN);
375 }
376
377 memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
378 outb(1, mip_port);
379
380 spin = MIP_SPIN;
381
382 while ((mip_reg->off_0x38 & MIP_VALID) == 0) {
383 if (--spin <= 0) {
384 WARN(1, "Timeout waiting for MIP Valid Flag\n");
385 return -1;
386 }
387 es7000_spin(MIP_SPIN);
388 }
389
390 status = (mip_reg->off_0x00 & 0xffff0000000000ULL) >> 48;
391 mip_reg->off_0x38 &= ~MIP_VALID;
392
393 return status;
394}
395
396static void es7000_enable_apic_mode(void)
397{
398 struct mip_reg es7000_mip_reg;
399 int mip_status;
400
401 if (!es7000_plat)
402 return;
403
404 printk(KERN_INFO "ES7000: Enabling APIC mode.\n");
405 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
406 es7000_mip_reg.off_0x00 = MIP_SW_APIC;
407 es7000_mip_reg.off_0x38 = MIP_VALID;
408
409 while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
410 WARN(1, "Command failed, status = %x\n", mip_status);
411}
412
413static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask)
414{
415 /* Careful. Some cpus do not strictly honor the set of cpus
416 * specified in the interrupt destination when using lowest
417 * priority interrupt delivery mode.
418 *
419 * In particular there was a hyperthreading cpu observed to
420 * deliver interrupts to the wrong hyperthread when only one
421 * hyperthread was specified in the interrupt destination.
422 */
423 cpumask_clear(retmask);
424 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
425}
426
427
428static void es7000_wait_for_init_deassert(atomic_t *deassert)
429{
430 while (!atomic_read(deassert))
431 cpu_relax();
432}
433
434static unsigned int es7000_get_apic_id(unsigned long x)
435{
436 return (x >> 24) & 0xFF;
437}
438
439static void es7000_send_IPI_mask(const struct cpumask *mask, int vector)
440{
441 default_send_IPI_mask_sequence_phys(mask, vector);
442}
443
444static void es7000_send_IPI_allbutself(int vector)
445{
446 default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
447}
448
449static void es7000_send_IPI_all(int vector)
450{
451 es7000_send_IPI_mask(cpu_online_mask, vector);
452}
453
454static int es7000_apic_id_registered(void)
455{
456 return 1;
457}
458
459static const struct cpumask *target_cpus_cluster(void)
460{
461 return cpu_all_mask;
462}
463
464static const struct cpumask *es7000_target_cpus(void)
465{
466 return cpumask_of(smp_processor_id());
467}
468
469static unsigned long
470es7000_check_apicid_used(physid_mask_t bitmap, int apicid)
471{
472 return 0;
473}
474static unsigned long es7000_check_apicid_present(int bit)
475{
476 return physid_isset(bit, phys_cpu_present_map);
477}
478
479static unsigned long calculate_ldr(int cpu)
480{
481 unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
482
483 return SET_APIC_LOGICAL_ID(id);
484}
485
486/*
487 * Set up the logical destination ID.
488 *
489 * Intel recommends to set DFR, LDR and TPR before enabling
490 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
491 * document number 292116). So here it goes...
492 */
493static void es7000_init_apic_ldr_cluster(void)
494{
495 unsigned long val;
496 int cpu = smp_processor_id();
497
498 apic_write(APIC_DFR, APIC_DFR_CLUSTER);
499 val = calculate_ldr(cpu);
500 apic_write(APIC_LDR, val);
501}
502
503static void es7000_init_apic_ldr(void)
504{
505 unsigned long val;
506 int cpu = smp_processor_id();
507
508 apic_write(APIC_DFR, APIC_DFR_FLAT);
509 val = calculate_ldr(cpu);
510 apic_write(APIC_LDR, val);
511}
512
513static void es7000_setup_apic_routing(void)
514{
515 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
516
517 printk(KERN_INFO
518 "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
519 (apic_version[apic] == 0x14) ?
520 "Physical Cluster" : "Logical Cluster",
521 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
522}
523
524static int es7000_apicid_to_node(int logical_apicid)
525{
526 return 0;
527}
528
529
530static int es7000_cpu_present_to_apicid(int mps_cpu)
531{
532 if (!mps_cpu)
533 return boot_cpu_physical_apicid;
534 else if (mps_cpu < nr_cpu_ids)
535 return per_cpu(x86_bios_cpu_apicid, mps_cpu);
536 else
537 return BAD_APICID;
538}
539
540static int cpu_id;
541
542static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid)
543{
544 physid_mask_t mask;
545
546 mask = physid_mask_of_physid(cpu_id);
547 ++cpu_id;
548
549 return mask;
550}
551
552/* Mapping from cpu number to logical apicid */
553static int es7000_cpu_to_logical_apicid(int cpu)
554{
555#ifdef CONFIG_SMP
556 if (cpu >= nr_cpu_ids)
557 return BAD_APICID;
558 return cpu_2_logical_apicid[cpu];
559#else
560 return logical_smp_processor_id();
561#endif
562}
563
564static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map)
565{
566 /* For clustered we don't have a good way to do this yet - hack */
567 return physids_promote(0xff);
568}
569
570static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
571{
572 boot_cpu_physical_apicid = read_apic_id();
573 return 1;
574}
575
576static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
577{
578 unsigned int round = 0;
579 int cpu, uninitialized_var(apicid);
580
581 /*
582 * The cpus in the mask must all be on the apic cluster.
583 */
584 for_each_cpu(cpu, cpumask) {
585 int new_apicid = es7000_cpu_to_logical_apicid(cpu);
586
587 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
588 WARN(1, "Not a valid mask!");
589
590 return BAD_APICID;
591 }
592 apicid = new_apicid;
593 round++;
594 }
595 return apicid;
596}
597
598static unsigned int
599es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
600 const struct cpumask *andmask)
601{
602 int apicid = es7000_cpu_to_logical_apicid(0);
603 cpumask_var_t cpumask;
604
605 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
606 return apicid;
607
608 cpumask_and(cpumask, inmask, andmask);
609 cpumask_and(cpumask, cpumask, cpu_online_mask);
610 apicid = es7000_cpu_mask_to_apicid(cpumask);
611
612 free_cpumask_var(cpumask);
613
614 return apicid;
615}
616
617static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
618{
619 return cpuid_apic >> index_msb;
620}
621
622static int probe_es7000(void)
623{
624 /* probed later in mptable/ACPI hooks */
625 return 0;
626}
627
628static int es7000_mps_ret;
629static int es7000_mps_oem_check(struct mpc_table *mpc, char *oem,
630 char *productid)
631{
632 int ret = 0;
633
634 if (mpc->oemptr) {
635 struct mpc_oemtable *oem_table =
636 (struct mpc_oemtable *)mpc->oemptr;
637
638 if (!strncmp(oem, "UNISYS", 6))
639 ret = parse_unisys_oem((char *)oem_table);
640 }
641
642 es7000_mps_ret = ret;
643
644 return ret && !es7000_apic_is_cluster();
645}
646
647static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
648 char *productid)
649{
650 int ret = es7000_mps_ret;
651
652 return ret && es7000_apic_is_cluster();
653}
654
655struct apic apic_es7000_cluster = {
656
657 .name = "es7000",
658 .probe = probe_es7000,
659 .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster,
660 .apic_id_registered = es7000_apic_id_registered,
661
662 .irq_delivery_mode = dest_LowestPrio,
663 /* logical delivery broadcast to all procs: */
664 .irq_dest_mode = 1,
665
666 .target_cpus = target_cpus_cluster,
667 .disable_esr = 1,
668 .dest_logical = 0,
669 .check_apicid_used = es7000_check_apicid_used,
670 .check_apicid_present = es7000_check_apicid_present,
671
672 .vector_allocation_domain = es7000_vector_allocation_domain,
673 .init_apic_ldr = es7000_init_apic_ldr_cluster,
674
675 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
676 .setup_apic_routing = es7000_setup_apic_routing,
677 .multi_timer_check = NULL,
678 .apicid_to_node = es7000_apicid_to_node,
679 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
680 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
681 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
682 .setup_portio_remap = NULL,
683 .check_phys_apicid_present = es7000_check_phys_apicid_present,
684 .enable_apic_mode = es7000_enable_apic_mode,
685 .phys_pkg_id = es7000_phys_pkg_id,
686 .mps_oem_check = es7000_mps_oem_check_cluster,
687
688 .get_apic_id = es7000_get_apic_id,
689 .set_apic_id = NULL,
690 .apic_id_mask = 0xFF << 24,
691
692 .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
693 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
694
695 .send_IPI_mask = es7000_send_IPI_mask,
696 .send_IPI_mask_allbutself = NULL,
697 .send_IPI_allbutself = es7000_send_IPI_allbutself,
698 .send_IPI_all = es7000_send_IPI_all,
699 .send_IPI_self = default_send_IPI_self,
700
701 .wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip,
702
703 .trampoline_phys_low = 0x467,
704 .trampoline_phys_high = 0x469,
705
706 .wait_for_init_deassert = NULL,
707
708 /* Nothing to do for most platforms, since cleared by the INIT cycle: */
709 .smp_callin_clear_local_apic = NULL,
710 .inquire_remote_apic = default_inquire_remote_apic,
711
712 .read = native_apic_mem_read,
713 .write = native_apic_mem_write,
714 .icr_read = native_apic_icr_read,
715 .icr_write = native_apic_icr_write,
716 .wait_icr_idle = native_apic_wait_icr_idle,
717 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
718};
719
720struct apic apic_es7000 = {
721
722 .name = "es7000",
723 .probe = probe_es7000,
724 .acpi_madt_oem_check = es7000_acpi_madt_oem_check,
725 .apic_id_registered = es7000_apic_id_registered,
726
727 .irq_delivery_mode = dest_Fixed,
728 /* phys delivery to target CPUs: */
729 .irq_dest_mode = 0,
730
731 .target_cpus = es7000_target_cpus,
732 .disable_esr = 1,
733 .dest_logical = 0,
734 .check_apicid_used = es7000_check_apicid_used,
735 .check_apicid_present = es7000_check_apicid_present,
736
737 .vector_allocation_domain = es7000_vector_allocation_domain,
738 .init_apic_ldr = es7000_init_apic_ldr,
739
740 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
741 .setup_apic_routing = es7000_setup_apic_routing,
742 .multi_timer_check = NULL,
743 .apicid_to_node = es7000_apicid_to_node,
744 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
745 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
746 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
747 .setup_portio_remap = NULL,
748 .check_phys_apicid_present = es7000_check_phys_apicid_present,
749 .enable_apic_mode = es7000_enable_apic_mode,
750 .phys_pkg_id = es7000_phys_pkg_id,
751 .mps_oem_check = es7000_mps_oem_check,
752
753 .get_apic_id = es7000_get_apic_id,
754 .set_apic_id = NULL,
755 .apic_id_mask = 0xFF << 24,
756
757 .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
758 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
759
760 .send_IPI_mask = es7000_send_IPI_mask,
761 .send_IPI_mask_allbutself = NULL,
762 .send_IPI_allbutself = es7000_send_IPI_allbutself,
763 .send_IPI_all = es7000_send_IPI_all,
764 .send_IPI_self = default_send_IPI_self,
765
766 .trampoline_phys_low = 0x467,
767 .trampoline_phys_high = 0x469,
768
769 .wait_for_init_deassert = es7000_wait_for_init_deassert,
770
771 /* Nothing to do for most platforms, since cleared by the INIT cycle: */
772 .smp_callin_clear_local_apic = NULL,
773 .inquire_remote_apic = default_inquire_remote_apic,
774
775 .read = native_apic_mem_read,
776 .write = native_apic_mem_write,
777 .icr_read = native_apic_icr_read,
778 .icr_write = native_apic_icr_write,
779 .wait_icr_idle = native_apic_wait_icr_idle,
780 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
781};
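Both tables leave several hooks NULL (multi_timer_check, set_apic_id, send_IPI_mask_allbutself, ...); the generic APIC code is expected to test such optional callbacks before invoking them, as the multi_timer_check call site later in this diff shows. A minimal sketch of the idiom, with illustrative names:

/* Sketch only; the struct and hook names are illustrative. */
struct apic_ops_sketch {
	int  (*optional_quirk)(int apic_id, int irq);	/* may be NULL */
	void (*required_op)(int vector);		/* always set */
};

static void dispatch(struct apic_ops_sketch *ops, int apic_id, int irq)
{
	/* NULL means "no quirk on this platform": skip silently */
	if (ops->optional_quirk && ops->optional_quirk(apic_id, irq))
		return;
	ops->required_op(irq);
}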
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index bc7ac4da90d7..767fe7e46d68 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Intel IO-APIC support for multi-Pentium hosts. 2 * Intel IO-APIC support for multi-Pentium hosts.
3 * 3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo 4 * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
5 * 5 *
6 * Many thanks to Stig Venaas for trying out countless experimental 6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently! 7 * patches and reporting/debugging problems patiently!
@@ -46,6 +46,7 @@
46#include <asm/idle.h> 46#include <asm/idle.h>
47#include <asm/io.h> 47#include <asm/io.h>
48#include <asm/smp.h> 48#include <asm/smp.h>
49#include <asm/cpu.h>
49#include <asm/desc.h> 50#include <asm/desc.h>
50#include <asm/proto.h> 51#include <asm/proto.h>
51#include <asm/acpi.h> 52#include <asm/acpi.h>
@@ -61,9 +62,7 @@
61#include <asm/uv/uv_hub.h> 62#include <asm/uv/uv_hub.h>
62#include <asm/uv/uv_irq.h> 63#include <asm/uv/uv_irq.h>
63 64
64#include <mach_ipi.h> 65#include <asm/apic.h>
65#include <mach_apic.h>
66#include <mach_apicdef.h>
67 66
68#define __apicdebuginit(type) static type __init 67#define __apicdebuginit(type) static type __init
69 68
@@ -82,11 +81,11 @@ static DEFINE_SPINLOCK(vector_lock);
82int nr_ioapic_registers[MAX_IO_APICS]; 81int nr_ioapic_registers[MAX_IO_APICS];
83 82
84/* I/O APIC entries */ 83/* I/O APIC entries */
85struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; 84struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
86int nr_ioapics; 85int nr_ioapics;
87 86
88/* MP IRQ source entries */ 87/* MP IRQ source entries */
89struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; 88struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
90 89
91/* # of MP IRQ source entries */ 90/* # of MP IRQ source entries */
92int mp_irq_entries; 91int mp_irq_entries;
@@ -99,10 +98,19 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
99 98
100int skip_ioapic_setup; 99int skip_ioapic_setup;
101 100
101void arch_disable_smp_support(void)
102{
103#ifdef CONFIG_PCI
104 noioapicquirk = 1;
105 noioapicreroute = -1;
106#endif
107 skip_ioapic_setup = 1;
108}
109
102static int __init parse_noapic(char *str) 110static int __init parse_noapic(char *str)
103{ 111{
104 /* disable IO-APIC */ 112 /* disable IO-APIC */
105 disable_ioapic_setup(); 113 arch_disable_smp_support();
106 return 0; 114 return 0;
107} 115}
108early_param("noapic", parse_noapic); 116early_param("noapic", parse_noapic);
@@ -356,7 +364,7 @@ set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
356 364
357 if (!cfg->move_in_progress) { 365 if (!cfg->move_in_progress) {
358 /* it means that the domain has not changed */ 366
359 if (!cpumask_intersects(&desc->affinity, mask)) 367 if (!cpumask_intersects(desc->affinity, mask))
360 cfg->move_desc_pending = 1; 368 cfg->move_desc_pending = 1;
361 } 369 }
362} 370}
@@ -381,12 +389,20 @@ struct io_apic {
381 unsigned int index; 389 unsigned int index;
382 unsigned int unused[3]; 390 unsigned int unused[3];
383 unsigned int data; 391 unsigned int data;
392 unsigned int unused2[11];
393 unsigned int eoi;
384}; 394};
385 395
386static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 396static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
387{ 397{
388 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) 398 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
389 + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); 399 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
400}
401
402static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
403{
404 struct io_apic __iomem *io_apic = io_apic_base(apic);
405 writel(vector, &io_apic->eoi);
390} 406}
391 407
392static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) 408static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
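The two new fields are pure padding plus the directed-EOI register: index sits at offset 0x00, data at 0x10, and unused2[11] pushes eoi to 0x40, the architectural offset of the EOI register on IO-APICs of version 0x20 and later. A compile-time check of that layout:

/* Stand-alone offset check for the struct io_apic layout above. */
#include <stddef.h>

struct io_apic_sketch {
	unsigned int index;		/* 0x00 */
	unsigned int unused[3];
	unsigned int data;		/* 0x10 */
	unsigned int unused2[11];
	unsigned int eoi;		/* 0x40: directed EOI register */
};

_Static_assert(offsetof(struct io_apic_sketch, data) == 0x10, "data at 0x10");
_Static_assert(offsetof(struct io_apic_sketch, eoi) == 0x40, "eoi at 0x40");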
@@ -478,7 +494,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
478 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 494 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
479} 495}
480 496
481static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 497void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
482{ 498{
483 unsigned long flags; 499 unsigned long flags;
484 spin_lock_irqsave(&ioapic_lock, flags); 500 spin_lock_irqsave(&ioapic_lock, flags);
@@ -513,11 +529,11 @@ static void send_cleanup_vector(struct irq_cfg *cfg)
513 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) 529 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
514 cfg->move_cleanup_count++; 530 cfg->move_cleanup_count++;
515 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) 531 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
516 send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); 532 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
517 } else { 533 } else {
518 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); 534 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
519 cfg->move_cleanup_count = cpumask_weight(cleanup_mask); 535 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
520 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); 536 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
521 free_cpumask_var(cleanup_mask); 537 free_cpumask_var(cleanup_mask);
522 } 538 }
523 cfg->move_in_progress = 0; 539 cfg->move_in_progress = 0;
@@ -538,16 +554,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
538 554
539 apic = entry->apic; 555 apic = entry->apic;
540 pin = entry->pin; 556 pin = entry->pin;
541#ifdef CONFIG_INTR_REMAP
542 /* 557 /*
543 * With interrupt-remapping, destination information comes 558 * With interrupt-remapping, destination information comes
544 * from interrupt-remapping table entry. 559 * from interrupt-remapping table entry.
545 */ 560 */
546 if (!irq_remapped(irq)) 561 if (!irq_remapped(irq))
547 io_apic_write(apic, 0x11 + pin*2, dest); 562 io_apic_write(apic, 0x11 + pin*2, dest);
548#else
549 io_apic_write(apic, 0x11 + pin*2, dest);
550#endif
551 reg = io_apic_read(apic, 0x10 + pin*2); 563 reg = io_apic_read(apic, 0x10 + pin*2);
552 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 564 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
553 reg |= vector; 565 reg |= vector;
@@ -562,8 +574,9 @@ static int
562assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); 574assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
563 575
564/* 576/*
565 * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid 577 * Either sets desc->affinity to a valid value, and returns
566 * of that, or returns BAD_APICID and leaves desc->affinity untouched. 578 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
579 * leaves desc->affinity untouched.
567 */ 580 */
568static unsigned int 581static unsigned int
569set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) 582set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
@@ -579,9 +592,12 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
579 if (assign_irq_vector(irq, cfg, mask)) 592 if (assign_irq_vector(irq, cfg, mask))
580 return BAD_APICID; 593 return BAD_APICID;
581 594
582 cpumask_and(&desc->affinity, cfg->domain, mask); 595 /* check that before desc->affinity gets updated */
583 set_extra_move_desc(desc, mask); 596 set_extra_move_desc(desc, mask);
584 return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); 597
598 cpumask_copy(desc->affinity, mask);
599
600 return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
585} 601}
586 602
587static void 603static void
@@ -796,23 +812,6 @@ static void clear_IO_APIC (void)
796 clear_IO_APIC_pin(apic, pin); 812 clear_IO_APIC_pin(apic, pin);
797} 813}
798 814
799#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32)
800void send_IPI_self(int vector)
801{
802 unsigned int cfg;
803
804 /*
805 * Wait for idle.
806 */
807 apic_wait_icr_idle();
808 cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
809 /*
810 * Send the IPI. The write to APIC_ICR fires this off.
811 */
812 apic_write(APIC_ICR, cfg);
813}
814#endif /* !CONFIG_SMP && CONFIG_X86_32*/
815
816#ifdef CONFIG_X86_32 815#ifdef CONFIG_X86_32
817/* 816/*
818 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to 817 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
@@ -820,8 +819,9 @@ void send_IPI_self(int vector)
820 */ 819 */
821 820
822#define MAX_PIRQS 8 821#define MAX_PIRQS 8
823static int pirq_entries [MAX_PIRQS]; 822static int pirq_entries[MAX_PIRQS] = {
824static int pirqs_enabled; 823 [0 ... MAX_PIRQS - 1] = -1
824};
825 825
826static int __init ioapic_pirq_setup(char *str) 826static int __init ioapic_pirq_setup(char *str)
827{ 827{
@@ -830,10 +830,6 @@ static int __init ioapic_pirq_setup(char *str)
830 830
831 get_options(str, ARRAY_SIZE(ints), ints); 831 get_options(str, ARRAY_SIZE(ints), ints);
832 832
833 for (i = 0; i < MAX_PIRQS; i++)
834 pirq_entries[i] = -1;
835
836 pirqs_enabled = 1;
837 apic_printk(APIC_VERBOSE, KERN_INFO 833 apic_printk(APIC_VERBOSE, KERN_INFO
838 "PIRQ redirection, working around broken MP-BIOS.\n"); 834 "PIRQ redirection, working around broken MP-BIOS.\n");
839 max = MAX_PIRQS; 835 max = MAX_PIRQS;
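The new initializer relies on GCC's range-designator extension, so the table is all -1 from the start and both the runtime init loop and the pirqs_enabled flag (removed here and in enable_IO_APIC() further down) become unnecessary. The syntax, stand-alone:

/* GCC extension: every element in [0, 7] is initialized to -1. */
static int table[8] = { [0 ... 7] = -1 };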
@@ -855,75 +851,106 @@ __setup("pirq=", ioapic_pirq_setup);
855#endif /* CONFIG_X86_32 */ 851#endif /* CONFIG_X86_32 */
856 852
857#ifdef CONFIG_INTR_REMAP 853#ifdef CONFIG_INTR_REMAP
858/* I/O APIC RTE contents at the OS boot up */ 854struct IO_APIC_route_entry **alloc_ioapic_entries(void)
859static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; 855{
856 int apic;
857 struct IO_APIC_route_entry **ioapic_entries;
858
859 ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
860 GFP_ATOMIC);
861 if (!ioapic_entries)
862 return 0;
863
864 for (apic = 0; apic < nr_ioapics; apic++) {
865 ioapic_entries[apic] =
866 kzalloc(sizeof(struct IO_APIC_route_entry) *
867 nr_ioapic_registers[apic], GFP_ATOMIC);
868 if (!ioapic_entries[apic])
869 goto nomem;
870 }
871
872 return ioapic_entries;
873
874nomem:
875 while (--apic >= 0)
876 kfree(ioapic_entries[apic]);
877 kfree(ioapic_entries);
878
879 return 0;
880}
860 881
861/* 882/*
862 * Saves and masks all the unmasked IO-APIC RTE's 883 * Saves all the IO-APIC RTE's
863 */ 884 */
864int save_mask_IO_APIC_setup(void) 885int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
865{ 886{
866 union IO_APIC_reg_01 reg_01;
867 unsigned long flags;
868 int apic, pin; 887 int apic, pin;
869 888
870 /* 889 if (!ioapic_entries)
871 * The number of IO-APIC IRQ registers (== #pins): 890 return -ENOMEM;
872 */ 891
873 for (apic = 0; apic < nr_ioapics; apic++) { 892 for (apic = 0; apic < nr_ioapics; apic++) {
874 spin_lock_irqsave(&ioapic_lock, flags); 893 if (!ioapic_entries[apic])
875 reg_01.raw = io_apic_read(apic, 1); 894 return -ENOMEM;
876 spin_unlock_irqrestore(&ioapic_lock, flags); 895
877 nr_ioapic_registers[apic] = reg_01.bits.entries+1; 896 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
897 ioapic_entries[apic][pin] =
898 ioapic_read_entry(apic, pin);
878 } 899 }
879 900
901 return 0;
902}
903
904/*
905 * Mask all IO APIC entries.
906 */
907void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
908{
909 int apic, pin;
910
911 if (!ioapic_entries)
912 return;
913
880 for (apic = 0; apic < nr_ioapics; apic++) { 914 for (apic = 0; apic < nr_ioapics; apic++) {
881 early_ioapic_entries[apic] = 915 if (!ioapic_entries[apic])
882 kzalloc(sizeof(struct IO_APIC_route_entry) * 916 break;
883 nr_ioapic_registers[apic], GFP_KERNEL);
884 if (!early_ioapic_entries[apic])
885 goto nomem;
886 }
887 917
888 for (apic = 0; apic < nr_ioapics; apic++)
889 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 918 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
890 struct IO_APIC_route_entry entry; 919 struct IO_APIC_route_entry entry;
891 920
892 entry = early_ioapic_entries[apic][pin] = 921 entry = ioapic_entries[apic][pin];
893 ioapic_read_entry(apic, pin);
894 if (!entry.mask) { 922 if (!entry.mask) {
895 entry.mask = 1; 923 entry.mask = 1;
896 ioapic_write_entry(apic, pin, entry); 924 ioapic_write_entry(apic, pin, entry);
897 } 925 }
898 } 926 }
899 927 }
900 return 0;
901
902nomem:
903 while (apic >= 0)
904 kfree(early_ioapic_entries[apic--]);
905 memset(early_ioapic_entries, 0,
906 ARRAY_SIZE(early_ioapic_entries));
907
908 return -ENOMEM;
909} 928}
910 929
911void restore_IO_APIC_setup(void) 930/*
931 * Restore IO APIC entries which was saved in ioapic_entries.
932 */
933int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
912{ 934{
913 int apic, pin; 935 int apic, pin;
914 936
937 if (!ioapic_entries)
938 return -ENOMEM;
939
915 for (apic = 0; apic < nr_ioapics; apic++) { 940 for (apic = 0; apic < nr_ioapics; apic++) {
916 if (!early_ioapic_entries[apic]) 941 if (!ioapic_entries[apic])
917 break; 942 return -ENOMEM;
943
918 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 944 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
919 ioapic_write_entry(apic, pin, 945 ioapic_write_entry(apic, pin,
920 early_ioapic_entries[apic][pin]); 946 ioapic_entries[apic][pin]);
921 kfree(early_ioapic_entries[apic]);
922 early_ioapic_entries[apic] = NULL;
923 } 947 }
948 return 0;
924} 949}
925 950
926void reinit_intr_remapped_IO_APIC(int intr_remapping) 951void reinit_intr_remapped_IO_APIC(int intr_remapping,
952 struct IO_APIC_route_entry **ioapic_entries)
953
927{ 954{
928 /* 955 /*
929 * for now plain restore of previous settings. 956 * for now plain restore of previous settings.
@@ -932,7 +959,17 @@ void reinit_intr_remapped_IO_APIC(int intr_remapping)
932 * table entries. for now, do a plain restore, and wait for 959 * table entries. for now, do a plain restore, and wait for
933 * the setup_IO_APIC_irqs() to do proper initialization. 960 * the setup_IO_APIC_irqs() to do proper initialization.
934 */ 961 */
935 restore_IO_APIC_setup(); 962 restore_IO_APIC_setup(ioapic_entries);
963}
964
965void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
966{
967 int apic;
968
969 for (apic = 0; apic < nr_ioapics; apic++)
970 kfree(ioapic_entries[apic]);
971
972 kfree(ioapic_entries);
936} 973}
937#endif 974#endif
938 975
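The four helpers above replace the old fixed early_ioapic_entries[] with caller-owned storage, which lets the interrupt-remapping code bracket its (re)initialization with a save/mask ... restore/free sequence. A hedged sketch of the intended call order (the wrapper name is illustrative; the real caller lives in the interrupt-remapping enable path):

/* Illustrative caller; error handling trimmed to the essentials. */
static int remap_with_ioapic_bracket(void)
{
	struct IO_APIC_route_entry **entries;
	int ret;

	entries = alloc_ioapic_entries();	/* per-APIC arrays, GFP_ATOMIC */
	if (!entries)
		return -ENOMEM;

	ret = save_IO_APIC_setup(entries);	/* snapshot every RTE */
	if (ret)
		goto out;
	mask_IO_APIC_setup(entries);		/* quiesce all pins */

	/* ... enable or disable interrupt-remapping here ... */

	ret = restore_IO_APIC_setup(entries);	/* put the RTEs back */
out:
	free_ioapic_entries(entries);		/* frees the per-APIC arrays too */
	return ret;
}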
@@ -944,10 +981,10 @@ static int find_irq_entry(int apic, int pin, int type)
944 int i; 981 int i;
945 982
946 for (i = 0; i < mp_irq_entries; i++) 983 for (i = 0; i < mp_irq_entries; i++)
947 if (mp_irqs[i].mp_irqtype == type && 984 if (mp_irqs[i].irqtype == type &&
948 (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || 985 (mp_irqs[i].dstapic == mp_ioapics[apic].apicid ||
949 mp_irqs[i].mp_dstapic == MP_APIC_ALL) && 986 mp_irqs[i].dstapic == MP_APIC_ALL) &&
950 mp_irqs[i].mp_dstirq == pin) 987 mp_irqs[i].dstirq == pin)
951 return i; 988 return i;
952 989
953 return -1; 990 return -1;
@@ -961,13 +998,13 @@ static int __init find_isa_irq_pin(int irq, int type)
961 int i; 998 int i;
962 999
963 for (i = 0; i < mp_irq_entries; i++) { 1000 for (i = 0; i < mp_irq_entries; i++) {
964 int lbus = mp_irqs[i].mp_srcbus; 1001 int lbus = mp_irqs[i].srcbus;
965 1002
966 if (test_bit(lbus, mp_bus_not_pci) && 1003 if (test_bit(lbus, mp_bus_not_pci) &&
967 (mp_irqs[i].mp_irqtype == type) && 1004 (mp_irqs[i].irqtype == type) &&
968 (mp_irqs[i].mp_srcbusirq == irq)) 1005 (mp_irqs[i].srcbusirq == irq))
969 1006
970 return mp_irqs[i].mp_dstirq; 1007 return mp_irqs[i].dstirq;
971 } 1008 }
972 return -1; 1009 return -1;
973} 1010}
@@ -977,17 +1014,17 @@ static int __init find_isa_irq_apic(int irq, int type)
977 int i; 1014 int i;
978 1015
979 for (i = 0; i < mp_irq_entries; i++) { 1016 for (i = 0; i < mp_irq_entries; i++) {
980 int lbus = mp_irqs[i].mp_srcbus; 1017 int lbus = mp_irqs[i].srcbus;
981 1018
982 if (test_bit(lbus, mp_bus_not_pci) && 1019 if (test_bit(lbus, mp_bus_not_pci) &&
983 (mp_irqs[i].mp_irqtype == type) && 1020 (mp_irqs[i].irqtype == type) &&
984 (mp_irqs[i].mp_srcbusirq == irq)) 1021 (mp_irqs[i].srcbusirq == irq))
985 break; 1022 break;
986 } 1023 }
987 if (i < mp_irq_entries) { 1024 if (i < mp_irq_entries) {
988 int apic; 1025 int apic;
989 for(apic = 0; apic < nr_ioapics; apic++) { 1026 for(apic = 0; apic < nr_ioapics; apic++) {
990 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) 1027 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic)
991 return apic; 1028 return apic;
992 } 1029 }
993 } 1030 }
@@ -1012,23 +1049,23 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
1012 return -1; 1049 return -1;
1013 } 1050 }
1014 for (i = 0; i < mp_irq_entries; i++) { 1051 for (i = 0; i < mp_irq_entries; i++) {
1015 int lbus = mp_irqs[i].mp_srcbus; 1052 int lbus = mp_irqs[i].srcbus;
1016 1053
1017 for (apic = 0; apic < nr_ioapics; apic++) 1054 for (apic = 0; apic < nr_ioapics; apic++)
1018 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || 1055 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
1019 mp_irqs[i].mp_dstapic == MP_APIC_ALL) 1056 mp_irqs[i].dstapic == MP_APIC_ALL)
1020 break; 1057 break;
1021 1058
1022 if (!test_bit(lbus, mp_bus_not_pci) && 1059 if (!test_bit(lbus, mp_bus_not_pci) &&
1023 !mp_irqs[i].mp_irqtype && 1060 !mp_irqs[i].irqtype &&
1024 (bus == lbus) && 1061 (bus == lbus) &&
1025 (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { 1062 (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
1026 int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); 1063 int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
1027 1064
1028 if (!(apic || IO_APIC_IRQ(irq))) 1065 if (!(apic || IO_APIC_IRQ(irq)))
1029 continue; 1066 continue;
1030 1067
1031 if (pin == (mp_irqs[i].mp_srcbusirq & 3)) 1068 if (pin == (mp_irqs[i].srcbusirq & 3))
1032 return irq; 1069 return irq;
1033 /* 1070 /*
1034 * Use the first all-but-pin matching entry as a 1071 * Use the first all-but-pin matching entry as a
@@ -1071,7 +1108,7 @@ static int EISA_ELCR(unsigned int irq)
1071 * EISA conforming in the MP table, that means its trigger type must 1108 * EISA conforming in the MP table, that means its trigger type must
1072 * be read in from the ELCR */ 1109 * be read in from the ELCR */
1073 1110
1074#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) 1111#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
1075#define default_EISA_polarity(idx) default_ISA_polarity(idx) 1112#define default_EISA_polarity(idx) default_ISA_polarity(idx)
1076 1113
1077/* PCI interrupts are always polarity one level triggered, 1114/* PCI interrupts are always polarity one level triggered,
@@ -1088,13 +1125,13 @@ static int EISA_ELCR(unsigned int irq)
1088 1125
1089static int MPBIOS_polarity(int idx) 1126static int MPBIOS_polarity(int idx)
1090{ 1127{
1091 int bus = mp_irqs[idx].mp_srcbus; 1128 int bus = mp_irqs[idx].srcbus;
1092 int polarity; 1129 int polarity;
1093 1130
1094 /* 1131 /*
1095 * Determine IRQ line polarity (high active or low active): 1132 * Determine IRQ line polarity (high active or low active):
1096 */ 1133 */
1097 switch (mp_irqs[idx].mp_irqflag & 3) 1134 switch (mp_irqs[idx].irqflag & 3)
1098 { 1135 {
1099 case 0: /* conforms, ie. bus-type dependent polarity */ 1136 case 0: /* conforms, ie. bus-type dependent polarity */
1100 if (test_bit(bus, mp_bus_not_pci)) 1137 if (test_bit(bus, mp_bus_not_pci))
@@ -1130,13 +1167,13 @@ static int MPBIOS_polarity(int idx)
1130 1167
1131static int MPBIOS_trigger(int idx) 1168static int MPBIOS_trigger(int idx)
1132{ 1169{
1133 int bus = mp_irqs[idx].mp_srcbus; 1170 int bus = mp_irqs[idx].srcbus;
1134 int trigger; 1171 int trigger;
1135 1172
1136 /* 1173 /*
1137 * Determine IRQ trigger mode (edge or level sensitive): 1174 * Determine IRQ trigger mode (edge or level sensitive):
1138 */ 1175 */
1139 switch ((mp_irqs[idx].mp_irqflag>>2) & 3) 1176 switch ((mp_irqs[idx].irqflag>>2) & 3)
1140 { 1177 {
1141 case 0: /* conforms, ie. bus-type dependent */ 1178 case 0: /* conforms, ie. bus-type dependent */
1142 if (test_bit(bus, mp_bus_not_pci)) 1179 if (test_bit(bus, mp_bus_not_pci))
@@ -1214,16 +1251,16 @@ int (*ioapic_renumber_irq)(int ioapic, int irq);
1214static int pin_2_irq(int idx, int apic, int pin) 1251static int pin_2_irq(int idx, int apic, int pin)
1215{ 1252{
1216 int irq, i; 1253 int irq, i;
1217 int bus = mp_irqs[idx].mp_srcbus; 1254 int bus = mp_irqs[idx].srcbus;
1218 1255
1219 /* 1256 /*
1220 * Debugging check, we are in big trouble if this message pops up! 1257 * Debugging check, we are in big trouble if this message pops up!
1221 */ 1258 */
1222 if (mp_irqs[idx].mp_dstirq != pin) 1259 if (mp_irqs[idx].dstirq != pin)
1223 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); 1260 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1224 1261
1225 if (test_bit(bus, mp_bus_not_pci)) { 1262 if (test_bit(bus, mp_bus_not_pci)) {
1226 irq = mp_irqs[idx].mp_srcbusirq; 1263 irq = mp_irqs[idx].srcbusirq;
1227 } else { 1264 } else {
1228 /* 1265 /*
1229 * PCI IRQs are mapped in order 1266 * PCI IRQs are mapped in order
@@ -1315,7 +1352,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1315 int new_cpu; 1352 int new_cpu;
1316 int vector, offset; 1353 int vector, offset;
1317 1354
1318 vector_allocation_domain(cpu, tmp_mask); 1355 apic->vector_allocation_domain(cpu, tmp_mask);
1319 1356
1320 vector = current_vector; 1357 vector = current_vector;
1321 offset = current_offset; 1358 offset = current_offset;
@@ -1421,9 +1458,7 @@ void __setup_vector_irq(int cpu)
1421} 1458}
1422 1459
1423static struct irq_chip ioapic_chip; 1460static struct irq_chip ioapic_chip;
1424#ifdef CONFIG_INTR_REMAP
1425static struct irq_chip ir_ioapic_chip; 1461static struct irq_chip ir_ioapic_chip;
1426#endif
1427 1462
1428#define IOAPIC_AUTO -1 1463#define IOAPIC_AUTO -1
1429#define IOAPIC_EDGE 0 1464#define IOAPIC_EDGE 0
@@ -1462,7 +1497,6 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1462 else 1497 else
1463 desc->status &= ~IRQ_LEVEL; 1498 desc->status &= ~IRQ_LEVEL;
1464 1499
1465#ifdef CONFIG_INTR_REMAP
1466 if (irq_remapped(irq)) { 1500 if (irq_remapped(irq)) {
1467 desc->status |= IRQ_MOVE_PCNTXT; 1501 desc->status |= IRQ_MOVE_PCNTXT;
1468 if (trigger) 1502 if (trigger)
@@ -1474,7 +1508,7 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1474 handle_edge_irq, "edge"); 1508 handle_edge_irq, "edge");
1475 return; 1509 return;
1476 } 1510 }
1477#endif 1511
1478 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1512 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1479 trigger == IOAPIC_LEVEL) 1513 trigger == IOAPIC_LEVEL)
1480 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1514 set_irq_chip_and_handler_name(irq, &ioapic_chip,
@@ -1485,37 +1519,43 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1485 handle_edge_irq, "edge"); 1519 handle_edge_irq, "edge");
1486} 1520}
1487 1521
1488static int setup_ioapic_entry(int apic, int irq, 1522int setup_ioapic_entry(int apic_id, int irq,
1489 struct IO_APIC_route_entry *entry, 1523 struct IO_APIC_route_entry *entry,
1490 unsigned int destination, int trigger, 1524 unsigned int destination, int trigger,
1491 int polarity, int vector) 1525 int polarity, int vector, int pin)
1492{ 1526{
1493 /* 1527 /*
1494 * add it to the IO-APIC irq-routing table: 1528 * add it to the IO-APIC irq-routing table:
1495 */ 1529 */
1496 memset(entry,0,sizeof(*entry)); 1530 memset(entry,0,sizeof(*entry));
1497 1531
1498#ifdef CONFIG_INTR_REMAP
1499 if (intr_remapping_enabled) { 1532 if (intr_remapping_enabled) {
1500 struct intel_iommu *iommu = map_ioapic_to_ir(apic); 1533 struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
1501 struct irte irte; 1534 struct irte irte;
1502 struct IR_IO_APIC_route_entry *ir_entry = 1535 struct IR_IO_APIC_route_entry *ir_entry =
1503 (struct IR_IO_APIC_route_entry *) entry; 1536 (struct IR_IO_APIC_route_entry *) entry;
1504 int index; 1537 int index;
1505 1538
1506 if (!iommu) 1539 if (!iommu)
1507 panic("No mapping iommu for ioapic %d\n", apic); 1540 panic("No mapping iommu for ioapic %d\n", apic_id);
1508 1541
1509 index = alloc_irte(iommu, irq, 1); 1542 index = alloc_irte(iommu, irq, 1);
1510 if (index < 0) 1543 if (index < 0)
1511 panic("Failed to allocate IRTE for ioapic %d\n", apic); 1544 panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
1512 1545
1513 memset(&irte, 0, sizeof(irte)); 1546 memset(&irte, 0, sizeof(irte));
1514 1547
1515 irte.present = 1; 1548 irte.present = 1;
1516 irte.dst_mode = INT_DEST_MODE; 1549 irte.dst_mode = apic->irq_dest_mode;
1517 irte.trigger_mode = trigger; 1550 /*
1518 irte.dlvry_mode = INT_DELIVERY_MODE; 1551 * Trigger mode in the IRTE will always be edge, and the
1552 * actual level or edge trigger will be set up in the IO-APIC
1553 * RTE. This will help simplify level triggered irq migration.
1554 * For more details, see the comments above explaining IO-APIC
1555 * irq migration in the presence of interrupt-remapping.
1556 */
1557 irte.trigger_mode = 0;
1558 irte.dlvry_mode = apic->irq_delivery_mode;
1519 irte.vector = vector; 1559 irte.vector = vector;
1520 irte.dest_id = IRTE_DEST(destination); 1560 irte.dest_id = IRTE_DEST(destination);
1521 1561
@@ -1525,18 +1565,21 @@ static int setup_ioapic_entry(int apic, int irq,
1525 ir_entry->zero = 0; 1565 ir_entry->zero = 0;
1526 ir_entry->format = 1; 1566 ir_entry->format = 1;
1527 ir_entry->index = (index & 0x7fff); 1567 ir_entry->index = (index & 0x7fff);
1528 } else 1568 /*
1529#endif 1569 * IO-APIC RTE will be configured with virtual vector.
1530 { 1570 * irq handler will do the explicit EOI to the io-apic.
1531 entry->delivery_mode = INT_DELIVERY_MODE; 1571 */
1532 entry->dest_mode = INT_DEST_MODE; 1572 ir_entry->vector = pin;
1573 } else {
1574 entry->delivery_mode = apic->irq_delivery_mode;
1575 entry->dest_mode = apic->irq_dest_mode;
1533 entry->dest = destination; 1576 entry->dest = destination;
1577 entry->vector = vector;
1534 } 1578 }
1535 1579
1536 entry->mask = 0; /* enable IRQ */ 1580 entry->mask = 0; /* enable IRQ */
1537 entry->trigger = trigger; 1581 entry->trigger = trigger;
1538 entry->polarity = polarity; 1582 entry->polarity = polarity;
1539 entry->vector = vector;
1540 1583
1541 /* Mask level triggered irqs. 1584 /* Mask level triggered irqs.
1542 * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 1585 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
@@ -1546,7 +1589,7 @@ static int setup_ioapic_entry(int apic, int irq,
1546 return 0; 1589 return 0;
1547} 1590}
1548 1591
1549static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc, 1592static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc,
1550 int trigger, int polarity) 1593 int trigger, int polarity)
1551{ 1594{
1552 struct irq_cfg *cfg; 1595 struct irq_cfg *cfg;
@@ -1558,22 +1601,22 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_de
1558 1601
1559 cfg = desc->chip_data; 1602 cfg = desc->chip_data;
1560 1603
1561 if (assign_irq_vector(irq, cfg, TARGET_CPUS)) 1604 if (assign_irq_vector(irq, cfg, apic->target_cpus()))
1562 return; 1605 return;
1563 1606
1564 dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); 1607 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
1565 1608
1566 apic_printk(APIC_VERBOSE,KERN_DEBUG 1609 apic_printk(APIC_VERBOSE,KERN_DEBUG
1567 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1610 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
1568 "IRQ %d Mode:%i Active:%i)\n", 1611 "IRQ %d Mode:%i Active:%i)\n",
1569 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, 1612 apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector,
1570 irq, trigger, polarity); 1613 irq, trigger, polarity);
1571 1614
1572 1615
1573 if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, 1616 if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
1574 dest, trigger, polarity, cfg->vector)) { 1617 dest, trigger, polarity, cfg->vector, pin)) {
1575 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1618 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1576 mp_ioapics[apic].mp_apicid, pin); 1619 mp_ioapics[apic_id].apicid, pin);
1577 __clear_irq_vector(irq, cfg); 1620 __clear_irq_vector(irq, cfg);
1578 return; 1621 return;
1579 } 1622 }
@@ -1582,12 +1625,12 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_de
1582 if (irq < NR_IRQS_LEGACY) 1625 if (irq < NR_IRQS_LEGACY)
1583 disable_8259A_irq(irq); 1626 disable_8259A_irq(irq);
1584 1627
1585 ioapic_write_entry(apic, pin, entry); 1628 ioapic_write_entry(apic_id, pin, entry);
1586} 1629}
1587 1630
1588static void __init setup_IO_APIC_irqs(void) 1631static void __init setup_IO_APIC_irqs(void)
1589{ 1632{
1590 int apic, pin, idx, irq; 1633 int apic_id, pin, idx, irq;
1591 int notcon = 0; 1634 int notcon = 0;
1592 struct irq_desc *desc; 1635 struct irq_desc *desc;
1593 struct irq_cfg *cfg; 1636 struct irq_cfg *cfg;
@@ -1595,21 +1638,19 @@ static void __init setup_IO_APIC_irqs(void)
1595 1638
1596 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1639 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1597 1640
1598 for (apic = 0; apic < nr_ioapics; apic++) { 1641 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
1599 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1642 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1600 1643
1601 idx = find_irq_entry(apic, pin, mp_INT); 1644 idx = find_irq_entry(apic_id, pin, mp_INT);
1602 if (idx == -1) { 1645 if (idx == -1) {
1603 if (!notcon) { 1646 if (!notcon) {
1604 notcon = 1; 1647 notcon = 1;
1605 apic_printk(APIC_VERBOSE, 1648 apic_printk(APIC_VERBOSE,
1606 KERN_DEBUG " %d-%d", 1649 KERN_DEBUG " %d-%d",
1607 mp_ioapics[apic].mp_apicid, 1650 mp_ioapics[apic_id].apicid, pin);
1608 pin);
1609 } else 1651 } else
1610 apic_printk(APIC_VERBOSE, " %d-%d", 1652 apic_printk(APIC_VERBOSE, " %d-%d",
1611 mp_ioapics[apic].mp_apicid, 1653 mp_ioapics[apic_id].apicid, pin);
1612 pin);
1613 continue; 1654 continue;
1614 } 1655 }
1615 if (notcon) { 1656 if (notcon) {
@@ -1618,20 +1659,25 @@ static void __init setup_IO_APIC_irqs(void)
1618 notcon = 0; 1659 notcon = 0;
1619 } 1660 }
1620 1661
1621 irq = pin_2_irq(idx, apic, pin); 1662 irq = pin_2_irq(idx, apic_id, pin);
1622#ifdef CONFIG_X86_32 1663
1623 if (multi_timer_check(apic, irq)) 1664 /*
1665 * Skip the timer IRQ if there's a quirk handler
1666 * installed and if it returns 1:
1667 */
1668 if (apic->multi_timer_check &&
1669 apic->multi_timer_check(apic_id, irq))
1624 continue; 1670 continue;
1625#endif 1671
1626 desc = irq_to_desc_alloc_cpu(irq, cpu); 1672 desc = irq_to_desc_alloc_cpu(irq, cpu);
1627 if (!desc) { 1673 if (!desc) {
1628 printk(KERN_INFO "can not get irq_desc for %d\n", irq); 1674 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1629 continue; 1675 continue;
1630 } 1676 }
1631 cfg = desc->chip_data; 1677 cfg = desc->chip_data;
1632 add_pin_to_irq_cpu(cfg, cpu, apic, pin); 1678 add_pin_to_irq_cpu(cfg, cpu, apic_id, pin);
1633 1679
1634 setup_IO_APIC_irq(apic, pin, irq, desc, 1680 setup_IO_APIC_irq(apic_id, pin, irq, desc,
1635 irq_trigger(idx), irq_polarity(idx)); 1681 irq_trigger(idx), irq_polarity(idx));
1636 } 1682 }
1637 } 1683 }
@@ -1644,15 +1690,13 @@ static void __init setup_IO_APIC_irqs(void)
1644/* 1690/*
1645 * Set up the timer pin, possibly with the 8259A-master behind. 1691 * Set up the timer pin, possibly with the 8259A-master behind.
1646 */ 1692 */
1647static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, 1693static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
1648 int vector) 1694 int vector)
1649{ 1695{
1650 struct IO_APIC_route_entry entry; 1696 struct IO_APIC_route_entry entry;
1651 1697
1652#ifdef CONFIG_INTR_REMAP
1653 if (intr_remapping_enabled) 1698 if (intr_remapping_enabled)
1654 return; 1699 return;
1655#endif
1656 1700
1657 memset(&entry, 0, sizeof(entry)); 1701 memset(&entry, 0, sizeof(entry));
1658 1702
@@ -1660,10 +1704,10 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1660 * We use logical delivery to get the timer IRQ 1704 * We use logical delivery to get the timer IRQ
1661 * to the first CPU. 1705 * to the first CPU.
1662 */ 1706 */
1663 entry.dest_mode = INT_DEST_MODE; 1707 entry.dest_mode = apic->irq_dest_mode;
1664 entry.mask = 1; /* mask IRQ now */ 1708 entry.mask = 0; /* don't mask IRQ for edge */
1665 entry.dest = cpu_mask_to_apicid(TARGET_CPUS); 1709 entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus());
1666 entry.delivery_mode = INT_DELIVERY_MODE; 1710 entry.delivery_mode = apic->irq_delivery_mode;
1667 entry.polarity = 0; 1711 entry.polarity = 0;
1668 entry.trigger = 0; 1712 entry.trigger = 0;
1669 entry.vector = vector; 1713 entry.vector = vector;
@@ -1677,7 +1721,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1677 /* 1721 /*
1678 * Add it to the IO-APIC irq-routing table: 1722 * Add it to the IO-APIC irq-routing table:
1679 */ 1723 */
1680 ioapic_write_entry(apic, pin, entry); 1724 ioapic_write_entry(apic_id, pin, entry);
1681} 1725}
1682 1726
1683 1727
@@ -1699,7 +1743,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1699 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1743 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1700 for (i = 0; i < nr_ioapics; i++) 1744 for (i = 0; i < nr_ioapics; i++)
1701 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1745 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1702 mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); 1746 mp_ioapics[i].apicid, nr_ioapic_registers[i]);
1703 1747
1704 /* 1748 /*
1705 * We are a bit conservative about what we expect. We have to 1749 * We are a bit conservative about what we expect. We have to
@@ -1719,7 +1763,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1719 spin_unlock_irqrestore(&ioapic_lock, flags); 1763 spin_unlock_irqrestore(&ioapic_lock, flags);
1720 1764
1721 printk("\n"); 1765 printk("\n");
1722 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); 1766 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
1723 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1767 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1724 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1768 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1725 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); 1769 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1980,13 +2024,6 @@ void __init enable_IO_APIC(void)
1980 int apic; 2024 int apic;
1981 unsigned long flags; 2025 unsigned long flags;
1982 2026
1983#ifdef CONFIG_X86_32
1984 int i;
1985 if (!pirqs_enabled)
1986 for (i = 0; i < MAX_PIRQS; i++)
1987 pirq_entries[i] = -1;
1988#endif
1989
1990 /* 2027 /*
1991 * The number of IO-APIC IRQ registers (== #pins): 2028 * The number of IO-APIC IRQ registers (== #pins):
1992 */ 2029 */
@@ -2054,8 +2091,13 @@ void disable_IO_APIC(void)
2054 * If the i8259 is routed through an IOAPIC 2091 * If the i8259 is routed through an IOAPIC
2055 * Put that IOAPIC in virtual wire mode 2092 * Put that IOAPIC in virtual wire mode
2056 * so legacy interrupts can be delivered. 2093 * so legacy interrupts can be delivered.
2094 *
2095 * With interrupt-remapping, for now we will use virtual wire A mode,
2096 * as virtual wire B is a little more complex (we need to configure both
2097 * the IOAPIC RTE as well as the interrupt-remapping table entry).
2098 * As this gets called during a crash dump, keep it simple for now.
2057 */ 2099 */
2058 if (ioapic_i8259.pin != -1) { 2100 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
2059 struct IO_APIC_route_entry entry; 2101 struct IO_APIC_route_entry entry;
2060 2102
2061 memset(&entry, 0, sizeof(entry)); 2103 memset(&entry, 0, sizeof(entry));
@@ -2075,7 +2117,10 @@ void disable_IO_APIC(void)
2075 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); 2117 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
2076 } 2118 }
2077 2119
2078 disconnect_bsp_APIC(ioapic_i8259.pin != -1); 2120 /*
2121 * Use virtual wire A mode when interrupt remapping is enabled.
2122 */
2123 disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1);
2079} 2124}
2080 2125
2081#ifdef CONFIG_X86_32 2126#ifdef CONFIG_X86_32
@@ -2090,7 +2135,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
2090{ 2135{
2091 union IO_APIC_reg_00 reg_00; 2136 union IO_APIC_reg_00 reg_00;
2092 physid_mask_t phys_id_present_map; 2137 physid_mask_t phys_id_present_map;
2093 int apic; 2138 int apic_id;
2094 int i; 2139 int i;
2095 unsigned char old_id; 2140 unsigned char old_id;
2096 unsigned long flags; 2141 unsigned long flags;
@@ -2109,26 +2154,26 @@ static void __init setup_ioapic_ids_from_mpc(void)
2109 * This is broken; anything with a real cpu count has to 2154 * This is broken; anything with a real cpu count has to
2110 * circumvent this idiocy regardless. 2155 * circumvent this idiocy regardless.
2111 */ 2156 */
2112 phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); 2157 phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
2113 2158
2114 /* 2159 /*
2115 * Set the IOAPIC ID to the value stored in the MPC table. 2160 * Set the IOAPIC ID to the value stored in the MPC table.
2116 */ 2161 */
2117 for (apic = 0; apic < nr_ioapics; apic++) { 2162 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
2118 2163
2119 /* Read the register 0 value */ 2164 /* Read the register 0 value */
2120 spin_lock_irqsave(&ioapic_lock, flags); 2165 spin_lock_irqsave(&ioapic_lock, flags);
2121 reg_00.raw = io_apic_read(apic, 0); 2166 reg_00.raw = io_apic_read(apic_id, 0);
2122 spin_unlock_irqrestore(&ioapic_lock, flags); 2167 spin_unlock_irqrestore(&ioapic_lock, flags);
2123 2168
2124 old_id = mp_ioapics[apic].mp_apicid; 2169 old_id = mp_ioapics[apic_id].apicid;
2125 2170
2126 if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { 2171 if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) {
2127 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", 2172 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
2128 apic, mp_ioapics[apic].mp_apicid); 2173 apic_id, mp_ioapics[apic_id].apicid);
2129 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 2174 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
2130 reg_00.bits.ID); 2175 reg_00.bits.ID);
2131 mp_ioapics[apic].mp_apicid = reg_00.bits.ID; 2176 mp_ioapics[apic_id].apicid = reg_00.bits.ID;
2132 } 2177 }
2133 2178
2134 /* 2179 /*
@@ -2136,10 +2181,10 @@ static void __init setup_ioapic_ids_from_mpc(void)
2136 * system must have a unique ID or we get lots of nice 2181 * system must have a unique ID or we get lots of nice
2137 * 'stuck on smp_invalidate_needed IPI wait' messages. 2182 * 'stuck on smp_invalidate_needed IPI wait' messages.
2138 */ 2183 */
2139 if (check_apicid_used(phys_id_present_map, 2184 if (apic->check_apicid_used(phys_id_present_map,
2140 mp_ioapics[apic].mp_apicid)) { 2185 mp_ioapics[apic_id].apicid)) {
2141 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 2186 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
2142 apic, mp_ioapics[apic].mp_apicid); 2187 apic_id, mp_ioapics[apic_id].apicid);
2143 for (i = 0; i < get_physical_broadcast(); i++) 2188 for (i = 0; i < get_physical_broadcast(); i++)
2144 if (!physid_isset(i, phys_id_present_map)) 2189 if (!physid_isset(i, phys_id_present_map))
2145 break; 2190 break;
@@ -2148,13 +2193,13 @@ static void __init setup_ioapic_ids_from_mpc(void)
2148 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 2193 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
2149 i); 2194 i);
2150 physid_set(i, phys_id_present_map); 2195 physid_set(i, phys_id_present_map);
2151 mp_ioapics[apic].mp_apicid = i; 2196 mp_ioapics[apic_id].apicid = i;
2152 } else { 2197 } else {
2153 physid_mask_t tmp; 2198 physid_mask_t tmp;
2154 tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); 2199 tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid);
2155 apic_printk(APIC_VERBOSE, "Setting %d in the " 2200 apic_printk(APIC_VERBOSE, "Setting %d in the "
2156 "phys_id_present_map\n", 2201 "phys_id_present_map\n",
2157 mp_ioapics[apic].mp_apicid); 2202 mp_ioapics[apic_id].apicid);
2158 physids_or(phys_id_present_map, phys_id_present_map, tmp); 2203 physids_or(phys_id_present_map, phys_id_present_map, tmp);
2159 } 2204 }
2160 2205
@@ -2163,11 +2208,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
2163 * We need to adjust the IRQ routing table 2208 * We need to adjust the IRQ routing table
2164 * if the ID changed. 2209 * if the ID changed.
2165 */ 2210 */
2166 if (old_id != mp_ioapics[apic].mp_apicid) 2211 if (old_id != mp_ioapics[apic_id].apicid)
2167 for (i = 0; i < mp_irq_entries; i++) 2212 for (i = 0; i < mp_irq_entries; i++)
2168 if (mp_irqs[i].mp_dstapic == old_id) 2213 if (mp_irqs[i].dstapic == old_id)
2169 mp_irqs[i].mp_dstapic 2214 mp_irqs[i].dstapic
2170 = mp_ioapics[apic].mp_apicid; 2215 = mp_ioapics[apic_id].apicid;
2171 2216
2172 /* 2217 /*
2173 * Read the right value from the MPC table and 2218 * Read the right value from the MPC table and
@@ -2175,20 +2220,20 @@ static void __init setup_ioapic_ids_from_mpc(void)
2175 */ 2220 */
2176 apic_printk(APIC_VERBOSE, KERN_INFO 2221 apic_printk(APIC_VERBOSE, KERN_INFO
2177 "...changing IO-APIC physical APIC ID to %d ...", 2222 "...changing IO-APIC physical APIC ID to %d ...",
2178 mp_ioapics[apic].mp_apicid); 2223 mp_ioapics[apic_id].apicid);
2179 2224
2180 reg_00.bits.ID = mp_ioapics[apic].mp_apicid; 2225 reg_00.bits.ID = mp_ioapics[apic_id].apicid;
2181 spin_lock_irqsave(&ioapic_lock, flags); 2226 spin_lock_irqsave(&ioapic_lock, flags);
2182 io_apic_write(apic, 0, reg_00.raw); 2227 io_apic_write(apic_id, 0, reg_00.raw);
2183 spin_unlock_irqrestore(&ioapic_lock, flags); 2228 spin_unlock_irqrestore(&ioapic_lock, flags);
2184 2229
2185 /* 2230 /*
2186 * Sanity check 2231 * Sanity check
2187 */ 2232 */
2188 spin_lock_irqsave(&ioapic_lock, flags); 2233 spin_lock_irqsave(&ioapic_lock, flags);
2189 reg_00.raw = io_apic_read(apic, 0); 2234 reg_00.raw = io_apic_read(apic_id, 0);
2190 spin_unlock_irqrestore(&ioapic_lock, flags); 2235 spin_unlock_irqrestore(&ioapic_lock, flags);
2191 if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) 2236 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
2192 printk("could not set ID!\n"); 2237 printk("could not set ID!\n");
2193 else 2238 else
2194 apic_printk(APIC_VERBOSE, " ok.\n"); 2239 apic_printk(APIC_VERBOSE, " ok.\n");
@@ -2291,7 +2336,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
2291 unsigned long flags; 2336 unsigned long flags;
2292 2337
2293 spin_lock_irqsave(&vector_lock, flags); 2338 spin_lock_irqsave(&vector_lock, flags);
2294 send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); 2339 apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
2295 spin_unlock_irqrestore(&vector_lock, flags); 2340 spin_unlock_irqrestore(&vector_lock, flags);
2296 2341
2297 return 1; 2342 return 1;
@@ -2299,7 +2344,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
2299#else 2344#else
2300static int ioapic_retrigger_irq(unsigned int irq) 2345static int ioapic_retrigger_irq(unsigned int irq)
2301{ 2346{
2302 send_IPI_self(irq_cfg(irq)->vector); 2347 apic->send_IPI_self(irq_cfg(irq)->vector);
2303 2348
2304 return 1; 2349 return 1;
2305} 2350}
@@ -2317,37 +2362,24 @@ static int ioapic_retrigger_irq(unsigned int irq)
2317#ifdef CONFIG_SMP 2362#ifdef CONFIG_SMP
2318 2363
2319#ifdef CONFIG_INTR_REMAP 2364#ifdef CONFIG_INTR_REMAP
2320static void ir_irq_migration(struct work_struct *work);
2321
2322static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
2323 2365
2324/* 2366/*
2325 * Migrate the IO-APIC irq in the presence of intr-remapping. 2367 * Migrate the IO-APIC irq in the presence of intr-remapping.
2326 * 2368 *
2327 * For edge triggered, irq migration is a simple atomic update (of vector 2369 * For both level and edge triggered, irq migration is a simple atomic
2328 * and cpu destination) of IRTE and flush the hardware cache. 2370 * update (of vector and cpu destination) of the IRTE and a flush of the hardware cache.
2329 *
2330 * For level triggered, we need to modify the io-apic RTE as well with the updated 2371 *
2331 * vector information, along with modifying the IRTE with vector and destination. 2372 * For level triggered, we eliminate the io-apic RTE modification (with the
2332 * So irq migration for level triggered is a little more complex compared to 2373 * updated vector information) by using a virtual vector (the io-apic pin number).
2333 * edge triggered migration. But the good news is, we use the same algorithm 2374 * The real vector used to interrupt the cpu comes from
2334 * for level triggered migration as we have today, only difference being,
2335 * we now initiate the irq migration from process context instead of the
2336 * interrupt context.
2337 * 2371 *
2338 * In future, when we do a directed EOI (combined with cpu EOI broadcast 2372 * For level triggered, we eliminate the io-apic RTE modification (with the
2339 * suppression) to the IO-APIC, level triggered irq migration will also be 2373 * updated vector information), by using a virtual vector (io-apic pin number).
2340 * as simple as edge triggered migration and we can do the irq migration 2374 * Real vector that is used for interrupting cpu will be coming from
2341 * with a simple atomic update to IO-APIC RTE. 2375 * the interrupt-remapping table entry.
2342 */ 2376 */
2343static void 2377static void
2344migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2378migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2345{ 2379{
2346 struct irq_cfg *cfg; 2380 struct irq_cfg *cfg;
2347 struct irte irte; 2381 struct irte irte;
2348 int modify_ioapic_rte;
2349 unsigned int dest; 2382 unsigned int dest;
2350 unsigned long flags;
2351 unsigned int irq; 2383 unsigned int irq;
2352 2384
2353 if (!cpumask_intersects(mask, cpu_online_mask)) 2385 if (!cpumask_intersects(mask, cpu_online_mask))
@@ -2363,14 +2395,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2363 2395
2364 set_extra_move_desc(desc, mask); 2396 set_extra_move_desc(desc, mask);
2365 2397
2366 dest = cpu_mask_to_apicid_and(cfg->domain, mask); 2398 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2367
2368 modify_ioapic_rte = desc->status & IRQ_LEVEL;
2369 if (modify_ioapic_rte) {
2370 spin_lock_irqsave(&ioapic_lock, flags);
2371 __target_IO_APIC_irq(irq, dest, cfg);
2372 spin_unlock_irqrestore(&ioapic_lock, flags);
2373 }
2374 2399
2375 irte.vector = cfg->vector; 2400 irte.vector = cfg->vector;
2376 irte.dest_id = IRTE_DEST(dest); 2401 irte.dest_id = IRTE_DEST(dest);
@@ -2383,61 +2408,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2383 if (cfg->move_in_progress) 2408 if (cfg->move_in_progress)
2384 send_cleanup_vector(cfg); 2409 send_cleanup_vector(cfg);
2385 2410
2386 cpumask_copy(&desc->affinity, mask); 2411 cpumask_copy(desc->affinity, mask);
2387}
2388
2389static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
2390{
2391 int ret = -1;
2392 struct irq_cfg *cfg = desc->chip_data;
2393
2394 mask_IO_APIC_irq_desc(desc);
2395
2396 if (io_apic_level_ack_pending(cfg)) {
2397 /*
2398 * Interrupt in progress. Migrating irq now will change the
2399 * vector information in the IO-APIC RTE and that will confuse
2400 * the EOI broadcast performed by cpu.
2401 * So, delay the irq migration to the next instance.
2402 */
2403 schedule_delayed_work(&ir_migration_work, 1);
2404 goto unmask;
2405 }
2406
2407 /* everything is clear, we have right of way */
2408 migrate_ioapic_irq_desc(desc, &desc->pending_mask);
2409
2410 ret = 0;
2411 desc->status &= ~IRQ_MOVE_PENDING;
2412 cpumask_clear(&desc->pending_mask);
2413
2414unmask:
2415 unmask_IO_APIC_irq_desc(desc);
2416
2417 return ret;
2418}
2419
2420static void ir_irq_migration(struct work_struct *work)
2421{
2422 unsigned int irq;
2423 struct irq_desc *desc;
2424
2425 for_each_irq_desc(irq, desc) {
2426 if (desc->status & IRQ_MOVE_PENDING) {
2427 unsigned long flags;
2428
2429 spin_lock_irqsave(&desc->lock, flags);
2430 if (!desc->chip->set_affinity ||
2431 !(desc->status & IRQ_MOVE_PENDING)) {
2432 desc->status &= ~IRQ_MOVE_PENDING;
2433 spin_unlock_irqrestore(&desc->lock, flags);
2434 continue;
2435 }
2436
2437 desc->chip->set_affinity(irq, &desc->pending_mask);
2438 spin_unlock_irqrestore(&desc->lock, flags);
2439 }
2440 }
2441} 2412}
2442 2413
2443/* 2414/*
@@ -2446,13 +2417,6 @@ static void ir_irq_migration(struct work_struct *work)
2446static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2417static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2447 const struct cpumask *mask) 2418 const struct cpumask *mask)
2448{ 2419{
2449 if (desc->status & IRQ_LEVEL) {
2450 desc->status |= IRQ_MOVE_PENDING;
2451 cpumask_copy(&desc->pending_mask, mask);
2452 migrate_irq_remapped_level_desc(desc);
2453 return;
2454 }
2455
2456 migrate_ioapic_irq_desc(desc, mask); 2420 migrate_ioapic_irq_desc(desc, mask);
2457} 2421}
2458static void set_ir_ioapic_affinity_irq(unsigned int irq, 2422static void set_ir_ioapic_affinity_irq(unsigned int irq,
@@ -2462,6 +2426,11 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq,
2462 2426
2463 set_ir_ioapic_affinity_irq_desc(desc, mask); 2427 set_ir_ioapic_affinity_irq_desc(desc, mask);
2464} 2428}
2429#else
2430static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2431 const struct cpumask *mask)
2432{
2433}
2465#endif 2434#endif
2466 2435
2467asmlinkage void smp_irq_move_cleanup_interrupt(void) 2436asmlinkage void smp_irq_move_cleanup_interrupt(void)
@@ -2475,6 +2444,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2475 me = smp_processor_id(); 2444 me = smp_processor_id();
2476 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 2445 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
2477 unsigned int irq; 2446 unsigned int irq;
2447 unsigned int irr;
2478 struct irq_desc *desc; 2448 struct irq_desc *desc;
2479 struct irq_cfg *cfg; 2449 struct irq_cfg *cfg;
2480 irq = __get_cpu_var(vector_irq)[vector]; 2450 irq = __get_cpu_var(vector_irq)[vector];
@@ -2494,6 +2464,18 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2494 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2464 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2495 goto unlock; 2465 goto unlock;
2496 2466
2467 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
2468 /*
2469 * Check if the vector that needs to be cleaned up is
2470 * registered in the cpu's IRR. If so, then this is not
2471 * the best time to clean it up. Let's clean it up in the
2472 * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
2473 * to myself.
2474 */
2475 if (irr & (1 << (vector % 32))) {
2476 apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
2477 goto unlock;
2478 }
2497 __get_cpu_var(vector_irq)[vector] = -1; 2479 __get_cpu_var(vector_irq)[vector] = -1;
2498 cfg->move_cleanup_count--; 2480 cfg->move_cleanup_count--;
2499unlock: 2481unlock:
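The new IRR test decodes the local APIC's request bitmap: the IRR is eight 32-bit registers spaced 0x10 apart starting at offset 0x200, and vector v maps to bit v % 32 of word v / 32. A small stand-alone check of that arithmetic:

/* Self-contained check of the IRR index math used above. */
#include <assert.h>

#define APIC_IRR 0x200	/* architectural offset of the first IRR word */

static unsigned int irr_reg(unsigned int vector)
{
	return APIC_IRR + (vector / 32) * 0x10;
}

static unsigned int irr_bit(unsigned int vector)
{
	return 1u << (vector % 32);
}

int main(void)
{
	assert(irr_reg(0x31) == 0x210);		/* vector 49: second IRR word */
	assert(irr_bit(0x31) == 1u << 17);	/* bit 49 % 32 = 17 */
	return 0;
}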
@@ -2516,7 +2498,7 @@ static void irq_complete_move(struct irq_desc **descp)
2516 2498
2517 /* domain has not changed, but affinity did */ 2499 /* domain has not changed, but affinity did */
2518 me = smp_processor_id(); 2500 me = smp_processor_id();
2519 if (cpu_isset(me, desc->affinity)) { 2501 if (cpumask_test_cpu(me, desc->affinity)) {
2520 *descp = desc = move_irq_desc(desc, me); 2502 *descp = desc = move_irq_desc(desc, me);
2521 /* get the new one */ 2503 /* get the new one */
2522 cfg = desc->chip_data; 2504 cfg = desc->chip_data;
@@ -2542,17 +2524,51 @@ static void irq_complete_move(struct irq_desc **descp)
2542static inline void irq_complete_move(struct irq_desc **descp) {} 2524static inline void irq_complete_move(struct irq_desc **descp) {}
2543#endif 2525#endif
2544 2526
2545#ifdef CONFIG_INTR_REMAP 2527#ifdef CONFIG_X86_X2APIC
2528static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2529{
2530 int apic, pin;
2531 struct irq_pin_list *entry;
2532
2533 entry = cfg->irq_2_pin;
2534 for (;;) {
2535
2536 if (!entry)
2537 break;
2538
2539 apic = entry->apic;
2540 pin = entry->pin;
2541 io_apic_eoi(apic, pin);
2542 entry = entry->next;
2543 }
2544}
2545
2546static void
2547eoi_ioapic_irq(struct irq_desc *desc)
2548{
2549 struct irq_cfg *cfg;
2550 unsigned long flags;
2551 unsigned int irq;
2552
2553 irq = desc->irq;
2554 cfg = desc->chip_data;
2555
2556 spin_lock_irqsave(&ioapic_lock, flags);
2557 __eoi_ioapic_irq(irq, cfg);
2558 spin_unlock_irqrestore(&ioapic_lock, flags);
2559}
2560
2546static void ack_x2apic_level(unsigned int irq) 2561static void ack_x2apic_level(unsigned int irq)
2547{ 2562{
2563 struct irq_desc *desc = irq_to_desc(irq);
2548 ack_x2APIC_irq(); 2564 ack_x2APIC_irq();
2565 eoi_ioapic_irq(desc);
2549} 2566}
2550 2567
2551static void ack_x2apic_edge(unsigned int irq) 2568static void ack_x2apic_edge(unsigned int irq)
2552{ 2569{
2553 ack_x2APIC_irq(); 2570 ack_x2APIC_irq();
2554} 2571}
2555
2556#endif 2572#endif
2557 2573
2558static void ack_apic_edge(unsigned int irq) 2574static void ack_apic_edge(unsigned int irq)
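With remapping, the IO-APIC RTE's vector field was programmed with the pin number in setup_ioapic_entry(), so retiring a level-triggered interrupt amounts to a local APIC EOI followed by a write of that pin to the IO-APIC's directed-EOI register, which clears the entry's Remote-IRR bit. A hedged sketch of that path for a single pin (the real code walks cfg->irq_2_pin via __eoi_ioapic_irq()):

/* Illustrative single-pin version of the retirement sequence above. */
static void ack_remapped_level_pin(unsigned int apic_id, unsigned int pin)
{
	ack_x2APIC_irq();		/* EOI the local APIC first */
	io_apic_eoi(apic_id, pin);	/* writel(pin, ->eoi): clears Remote-IRR */
}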
@@ -2663,6 +2679,26 @@ static void ack_apic_level(unsigned int irq)
2663#endif 2679#endif
2664} 2680}
2665 2681
2682#ifdef CONFIG_INTR_REMAP
2683static void ir_ack_apic_edge(unsigned int irq)
2684{
2685#ifdef CONFIG_X86_X2APIC
2686 if (x2apic_enabled())
2687 return ack_x2apic_edge(irq);
2688#endif
2689 return ack_apic_edge(irq);
2690}
2691
2692static void ir_ack_apic_level(unsigned int irq)
2693{
2694#ifdef CONFIG_X86_X2APIC
2695 if (x2apic_enabled())
2696 return ack_x2apic_level(irq);
2697#endif
2698 return ack_apic_level(irq);
2699}
2700#endif /* CONFIG_INTR_REMAP */
2701
2666static struct irq_chip ioapic_chip __read_mostly = { 2702static struct irq_chip ioapic_chip __read_mostly = {
2667 .name = "IO-APIC", 2703 .name = "IO-APIC",
2668 .startup = startup_ioapic_irq, 2704 .startup = startup_ioapic_irq,
@@ -2676,20 +2712,20 @@ static struct irq_chip ioapic_chip __read_mostly = {
2676 .retrigger = ioapic_retrigger_irq, 2712 .retrigger = ioapic_retrigger_irq,
2677}; 2713};
2678 2714
2679#ifdef CONFIG_INTR_REMAP
2680static struct irq_chip ir_ioapic_chip __read_mostly = { 2715static struct irq_chip ir_ioapic_chip __read_mostly = {
2681 .name = "IR-IO-APIC", 2716 .name = "IR-IO-APIC",
2682 .startup = startup_ioapic_irq, 2717 .startup = startup_ioapic_irq,
2683 .mask = mask_IO_APIC_irq, 2718 .mask = mask_IO_APIC_irq,
2684 .unmask = unmask_IO_APIC_irq, 2719 .unmask = unmask_IO_APIC_irq,
2685 .ack = ack_x2apic_edge, 2720#ifdef CONFIG_INTR_REMAP
2686 .eoi = ack_x2apic_level, 2721 .ack = ir_ack_apic_edge,
2722 .eoi = ir_ack_apic_level,
2687#ifdef CONFIG_SMP 2723#ifdef CONFIG_SMP
2688 .set_affinity = set_ir_ioapic_affinity_irq, 2724 .set_affinity = set_ir_ioapic_affinity_irq,
2689#endif 2725#endif
2726#endif
2690 .retrigger = ioapic_retrigger_irq, 2727 .retrigger = ioapic_retrigger_irq,
2691}; 2728};
2692#endif
2693 2729
2694static inline void init_IO_APIC_traps(void) 2730static inline void init_IO_APIC_traps(void)
2695{ 2731{
@@ -2867,19 +2903,15 @@ static inline void __init check_timer(void)
2867 int cpu = boot_cpu_id; 2903 int cpu = boot_cpu_id;
2868 int apic1, pin1, apic2, pin2; 2904 int apic1, pin1, apic2, pin2;
2869 unsigned long flags; 2905 unsigned long flags;
2870 unsigned int ver;
2871 int no_pin1 = 0; 2906 int no_pin1 = 0;
2872 2907
2873 local_irq_save(flags); 2908 local_irq_save(flags);
2874 2909
2875 ver = apic_read(APIC_LVR);
2876 ver = GET_APIC_VERSION(ver);
2877
2878 /* 2910 /*
2879 * get/set the timer IRQ vector: 2911 * get/set the timer IRQ vector:
2880 */ 2912 */
2881 disable_8259A_irq(0); 2913 disable_8259A_irq(0);
2882 assign_irq_vector(0, cfg, TARGET_CPUS); 2914 assign_irq_vector(0, cfg, apic->target_cpus());
2883 2915
2884 /* 2916 /*
2885 * As IRQ0 is to be enabled in the 8259A, the virtual 2917 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2893,7 +2925,13 @@ static inline void __init check_timer(void)
2893 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2925 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2894 init_8259A(1); 2926 init_8259A(1);
2895#ifdef CONFIG_X86_32 2927#ifdef CONFIG_X86_32
2896 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); 2928 {
2929 unsigned int ver;
2930
2931 ver = apic_read(APIC_LVR);
2932 ver = GET_APIC_VERSION(ver);
2933 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2934 }
2897#endif 2935#endif
2898 2936
2899 pin1 = find_isa_irq_pin(0, mp_INT); 2937 pin1 = find_isa_irq_pin(0, mp_INT);
@@ -2913,10 +2951,8 @@ static inline void __init check_timer(void)
2913 * 8259A. 2951 * 8259A.
2914 */ 2952 */
2915 if (pin1 == -1) { 2953 if (pin1 == -1) {
2916#ifdef CONFIG_INTR_REMAP
2917 if (intr_remapping_enabled) 2954 if (intr_remapping_enabled)
2918 panic("BIOS bug: timer not connected to IO-APIC"); 2955 panic("BIOS bug: timer not connected to IO-APIC");
2919#endif
2920 pin1 = pin2; 2956 pin1 = pin2;
2921 apic1 = apic2; 2957 apic1 = apic2;
2922 no_pin1 = 1; 2958 no_pin1 = 1;
@@ -2932,8 +2968,17 @@ static inline void __init check_timer(void)
2932 if (no_pin1) { 2968 if (no_pin1) {
2933 add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); 2969 add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
2934 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2970 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
2971 } else {
2972 /* For an edge-triggered timer, setup_IO_APIC_irq
2973 * already leaves it unmasked, so we only need to
2974 * unmask here if it is level-triggered. But do we
2975 * really have a level-triggered timer?
2976 */
2977 int idx;
2978 idx = find_irq_entry(apic1, pin1, mp_INT);
2979 if (idx != -1 && irq_trigger(idx))
2980 unmask_IO_APIC_irq_desc(desc);
2935 } 2981 }
2936 unmask_IO_APIC_irq_desc(desc);
2937 if (timer_irq_works()) { 2982 if (timer_irq_works()) {
2938 if (nmi_watchdog == NMI_IO_APIC) { 2983 if (nmi_watchdog == NMI_IO_APIC) {
2939 setup_nmi(); 2984 setup_nmi();
@@ -2943,10 +2988,9 @@ static inline void __init check_timer(void)
2943 clear_IO_APIC_pin(0, pin1); 2988 clear_IO_APIC_pin(0, pin1);
2944 goto out; 2989 goto out;
2945 } 2990 }
2946#ifdef CONFIG_INTR_REMAP
2947 if (intr_remapping_enabled) 2991 if (intr_remapping_enabled)
2948 panic("timer doesn't work through Interrupt-remapped IO-APIC"); 2992 panic("timer doesn't work through Interrupt-remapped IO-APIC");
2949#endif 2993 local_irq_disable();
2950 clear_IO_APIC_pin(apic1, pin1); 2994 clear_IO_APIC_pin(apic1, pin1);
2951 if (!no_pin1) 2995 if (!no_pin1)
2952 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " 2996 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -2961,7 +3005,6 @@ static inline void __init check_timer(void)
2961 */ 3005 */
2962 replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); 3006 replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
2963 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 3007 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
2964 unmask_IO_APIC_irq_desc(desc);
2965 enable_8259A_irq(0); 3008 enable_8259A_irq(0);
2966 if (timer_irq_works()) { 3009 if (timer_irq_works()) {
2967 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 3010 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2976,6 +3019,7 @@ static inline void __init check_timer(void)
2976 /* 3019 /*
2977 * Cleanup, just in case ... 3020 * Cleanup, just in case ...
2978 */ 3021 */
3022 local_irq_disable();
2979 disable_8259A_irq(0); 3023 disable_8259A_irq(0);
2980 clear_IO_APIC_pin(apic2, pin2); 3024 clear_IO_APIC_pin(apic2, pin2);
2981 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); 3025 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
@@ -3001,6 +3045,7 @@ static inline void __init check_timer(void)
3001 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); 3045 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
3002 goto out; 3046 goto out;
3003 } 3047 }
3048 local_irq_disable();
3004 disable_8259A_irq(0); 3049 disable_8259A_irq(0);
3005 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 3050 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
3006 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); 3051 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
@@ -3018,6 +3063,7 @@ static inline void __init check_timer(void)
3018 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); 3063 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
3019 goto out; 3064 goto out;
3020 } 3065 }
3066 local_irq_disable();
3021 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); 3067 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
3022 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " 3068 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
3023 "report. Then try booting with the 'noapic' option.\n"); 3069 "report. Then try booting with the 'noapic' option.\n");
@@ -3047,13 +3093,9 @@ out:
3047void __init setup_IO_APIC(void) 3093void __init setup_IO_APIC(void)
3048{ 3094{
3049 3095
3050#ifdef CONFIG_X86_32
3051 enable_IO_APIC();
3052#else
3053 /* 3096 /*
3054 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 3097 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
3055 */ 3098 */
3056#endif
3057 3099
3058 io_apic_irqs = ~PIC_IRQS; 3100 io_apic_irqs = ~PIC_IRQS;
3059 3101
@@ -3118,8 +3160,8 @@ static int ioapic_resume(struct sys_device *dev)
3118 3160
3119 spin_lock_irqsave(&ioapic_lock, flags); 3161 spin_lock_irqsave(&ioapic_lock, flags);
3120 reg_00.raw = io_apic_read(dev->id, 0); 3162 reg_00.raw = io_apic_read(dev->id, 0);
3121 if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { 3163 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
3122 reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; 3164 reg_00.bits.ID = mp_ioapics[dev->id].apicid;
3123 io_apic_write(dev->id, 0, reg_00.raw); 3165 io_apic_write(dev->id, 0, reg_00.raw);
3124 } 3166 }
3125 spin_unlock_irqrestore(&ioapic_lock, flags); 3167 spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -3169,6 +3211,7 @@ static int __init ioapic_init_sysfs(void)
3169 3211
3170device_initcall(ioapic_init_sysfs); 3212device_initcall(ioapic_init_sysfs);
3171 3213
3214static int nr_irqs_gsi = NR_IRQS_LEGACY;
3172/* 3215/*
3173 * Dynamic irq allocate and deallocation 3216 * Dynamic irq allocate and deallocation
3174 */ 3217 */
@@ -3183,11 +3226,11 @@ unsigned int create_irq_nr(unsigned int irq_want)
3183 struct irq_desc *desc_new = NULL; 3226 struct irq_desc *desc_new = NULL;
3184 3227
3185 irq = 0; 3228 irq = 0;
3186 spin_lock_irqsave(&vector_lock, flags); 3229 if (irq_want < nr_irqs_gsi)
3187 for (new = irq_want; new < NR_IRQS; new++) { 3230 irq_want = nr_irqs_gsi;
3188 if (platform_legacy_irq(new))
3189 continue;
3190 3231
3232 spin_lock_irqsave(&vector_lock, flags);
3233 for (new = irq_want; new < nr_irqs; new++) {
3191 desc_new = irq_to_desc_alloc_cpu(new, cpu); 3234 desc_new = irq_to_desc_alloc_cpu(new, cpu);
3192 if (!desc_new) { 3235 if (!desc_new) {
3193 printk(KERN_INFO "can not get irq_desc for %d\n", new); 3236 printk(KERN_INFO "can not get irq_desc for %d\n", new);
@@ -3197,7 +3240,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
3197 3240
3198 if (cfg_new->vector != 0) 3241 if (cfg_new->vector != 0)
3199 continue; 3242 continue;
3200 if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0) 3243 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
3201 irq = new; 3244 irq = new;
3202 break; 3245 break;
3203 } 3246 }
@@ -3212,7 +3255,6 @@ unsigned int create_irq_nr(unsigned int irq_want)
3212 return irq; 3255 return irq;
3213} 3256}
3214 3257
3215static int nr_irqs_gsi = NR_IRQS_LEGACY;
3216int create_irq(void) 3258int create_irq(void)
3217{ 3259{
3218 unsigned int irq_want; 3260 unsigned int irq_want;
@@ -3241,9 +3283,7 @@ void destroy_irq(unsigned int irq)
3241 if (desc) 3283 if (desc)
3242 desc->chip_data = cfg; 3284 desc->chip_data = cfg;
3243 3285
3244#ifdef CONFIG_INTR_REMAP
3245 free_irte(irq); 3286 free_irte(irq);
3246#endif
3247 spin_lock_irqsave(&vector_lock, flags); 3287 spin_lock_irqsave(&vector_lock, flags);
3248 __clear_irq_vector(irq, cfg); 3288 __clear_irq_vector(irq, cfg);
3249 spin_unlock_irqrestore(&vector_lock, flags); 3289 spin_unlock_irqrestore(&vector_lock, flags);
@@ -3259,14 +3299,16 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3259 int err; 3299 int err;
3260 unsigned dest; 3300 unsigned dest;
3261 3301
3302 if (disable_apic)
3303 return -ENXIO;
3304
3262 cfg = irq_cfg(irq); 3305 cfg = irq_cfg(irq);
3263 err = assign_irq_vector(irq, cfg, TARGET_CPUS); 3306 err = assign_irq_vector(irq, cfg, apic->target_cpus());
3264 if (err) 3307 if (err)
3265 return err; 3308 return err;
3266 3309
3267 dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); 3310 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
3268 3311
3269#ifdef CONFIG_INTR_REMAP
3270 if (irq_remapped(irq)) { 3312 if (irq_remapped(irq)) {
3271 struct irte irte; 3313 struct irte irte;
3272 int ir_index; 3314 int ir_index;
@@ -3278,9 +3320,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3278 memset (&irte, 0, sizeof(irte)); 3320 memset (&irte, 0, sizeof(irte));
3279 3321
3280 irte.present = 1; 3322 irte.present = 1;
3281 irte.dst_mode = INT_DEST_MODE; 3323 irte.dst_mode = apic->irq_dest_mode;
3282 irte.trigger_mode = 0; /* edge */ 3324 irte.trigger_mode = 0; /* edge */
3283 irte.dlvry_mode = INT_DELIVERY_MODE; 3325 irte.dlvry_mode = apic->irq_delivery_mode;
3284 irte.vector = cfg->vector; 3326 irte.vector = cfg->vector;
3285 irte.dest_id = IRTE_DEST(dest); 3327 irte.dest_id = IRTE_DEST(dest);
3286 3328
@@ -3292,16 +3334,19 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3292 MSI_ADDR_IR_SHV | 3334 MSI_ADDR_IR_SHV |
3293 MSI_ADDR_IR_INDEX1(ir_index) | 3335 MSI_ADDR_IR_INDEX1(ir_index) |
3294 MSI_ADDR_IR_INDEX2(ir_index); 3336 MSI_ADDR_IR_INDEX2(ir_index);
3295 } else 3337 } else {
3296#endif 3338 if (x2apic_enabled())
3297 { 3339 msg->address_hi = MSI_ADDR_BASE_HI |
3298 msg->address_hi = MSI_ADDR_BASE_HI; 3340 MSI_ADDR_EXT_DEST_ID(dest);
3341 else
3342 msg->address_hi = MSI_ADDR_BASE_HI;
3343
3299 msg->address_lo = 3344 msg->address_lo =
3300 MSI_ADDR_BASE_LO | 3345 MSI_ADDR_BASE_LO |
3301 ((INT_DEST_MODE == 0) ? 3346 ((apic->irq_dest_mode == 0) ?
3302 MSI_ADDR_DEST_MODE_PHYSICAL: 3347 MSI_ADDR_DEST_MODE_PHYSICAL:
3303 MSI_ADDR_DEST_MODE_LOGICAL) | 3348 MSI_ADDR_DEST_MODE_LOGICAL) |
3304 ((INT_DELIVERY_MODE != dest_LowestPrio) ? 3349 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3305 MSI_ADDR_REDIRECTION_CPU: 3350 MSI_ADDR_REDIRECTION_CPU:
3306 MSI_ADDR_REDIRECTION_LOWPRI) | 3351 MSI_ADDR_REDIRECTION_LOWPRI) |
3307 MSI_ADDR_DEST_ID(dest); 3352 MSI_ADDR_DEST_ID(dest);
@@ -3309,7 +3354,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3309 msg->data = 3354 msg->data =
3310 MSI_DATA_TRIGGER_EDGE | 3355 MSI_DATA_TRIGGER_EDGE |
3311 MSI_DATA_LEVEL_ASSERT | 3356 MSI_DATA_LEVEL_ASSERT |
3312 ((INT_DELIVERY_MODE != dest_LowestPrio) ? 3357 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3313 MSI_DATA_DELIVERY_FIXED: 3358 MSI_DATA_DELIVERY_FIXED:
3314 MSI_DATA_DELIVERY_LOWPRI) | 3359 MSI_DATA_DELIVERY_LOWPRI) |
3315 MSI_DATA_VECTOR(cfg->vector); 3360 MSI_DATA_VECTOR(cfg->vector);
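The x2APIC branch above exists because an MSI address carries only 8 bits (bits 12-19 of address_lo) for the destination APIC ID, while x2APIC IDs can be wider; the extra ID bits travel in address_hi via MSI_ADDR_EXT_DEST_ID. A hedged sketch of just the destination encoding (hypothetical helper; the exact masks are defined in asm/msidef.h):

	/* Sketch, assuming this tree's msidef.h encodings. */
	static void msi_encode_dest(struct msi_msg *msg, unsigned int dest)
	{
		/* APIC ID bits 0-7 go into address_lo bits 12-19 ... */
		msg->address_lo |= MSI_ADDR_DEST_ID(dest);
		/* ... and, with x2APIC, the remaining ID bits go into the
		 * extended destination ID field of address_hi. */
		if (x2apic_enabled())
			msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest);
	}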
@@ -3395,15 +3440,16 @@ static struct irq_chip msi_chip = {
3395 .retrigger = ioapic_retrigger_irq, 3440 .retrigger = ioapic_retrigger_irq,
3396}; 3441};
3397 3442
3398#ifdef CONFIG_INTR_REMAP
3399static struct irq_chip msi_ir_chip = { 3443static struct irq_chip msi_ir_chip = {
3400 .name = "IR-PCI-MSI", 3444 .name = "IR-PCI-MSI",
3401 .unmask = unmask_msi_irq, 3445 .unmask = unmask_msi_irq,
3402 .mask = mask_msi_irq, 3446 .mask = mask_msi_irq,
3403 .ack = ack_x2apic_edge, 3447#ifdef CONFIG_INTR_REMAP
3448 .ack = ir_ack_apic_edge,
3404#ifdef CONFIG_SMP 3449#ifdef CONFIG_SMP
3405 .set_affinity = ir_set_msi_irq_affinity, 3450 .set_affinity = ir_set_msi_irq_affinity,
3406#endif 3451#endif
3452#endif
3407 .retrigger = ioapic_retrigger_irq, 3453 .retrigger = ioapic_retrigger_irq,
3408}; 3454};
3409 3455
@@ -3433,7 +3479,6 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3433 } 3479 }
3434 return index; 3480 return index;
3435} 3481}
3436#endif
3437 3482
3438static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) 3483static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3439{ 3484{
@@ -3447,7 +3492,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3447 set_irq_msi(irq, msidesc); 3492 set_irq_msi(irq, msidesc);
3448 write_msi_msg(irq, &msg); 3493 write_msi_msg(irq, &msg);
3449 3494
3450#ifdef CONFIG_INTR_REMAP
3451 if (irq_remapped(irq)) { 3495 if (irq_remapped(irq)) {
3452 struct irq_desc *desc = irq_to_desc(irq); 3496 struct irq_desc *desc = irq_to_desc(irq);
3453 /* 3497 /*
@@ -3456,7 +3500,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3456 desc->status |= IRQ_MOVE_PCNTXT; 3500 desc->status |= IRQ_MOVE_PCNTXT;
3457 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); 3501 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
3458 } else 3502 } else
3459#endif
3460 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); 3503 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
3461 3504
3462 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); 3505 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
@@ -3464,60 +3507,26 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3464 return 0; 3507 return 0;
3465} 3508}
3466 3509
3467int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
3468{
3469 unsigned int irq;
3470 int ret;
3471 unsigned int irq_want;
3472
3473 irq_want = nr_irqs_gsi;
3474 irq = create_irq_nr(irq_want);
3475 if (irq == 0)
3476 return -1;
3477
3478#ifdef CONFIG_INTR_REMAP
3479 if (!intr_remapping_enabled)
3480 goto no_ir;
3481
3482 ret = msi_alloc_irte(dev, irq, 1);
3483 if (ret < 0)
3484 goto error;
3485no_ir:
3486#endif
3487 ret = setup_msi_irq(dev, msidesc, irq);
3488 if (ret < 0) {
3489 destroy_irq(irq);
3490 return ret;
3491 }
3492 return 0;
3493
3494#ifdef CONFIG_INTR_REMAP
3495error:
3496 destroy_irq(irq);
3497 return ret;
3498#endif
3499}
3500
3501int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 3510int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3502{ 3511{
3503 unsigned int irq; 3512 unsigned int irq;
3504 int ret, sub_handle; 3513 int ret, sub_handle;
3505 struct msi_desc *msidesc; 3514 struct msi_desc *msidesc;
3506 unsigned int irq_want; 3515 unsigned int irq_want;
3507 3516 struct intel_iommu *iommu = NULL;
3508#ifdef CONFIG_INTR_REMAP
3509 struct intel_iommu *iommu = 0;
3510 int index = 0; 3517 int index = 0;
3511#endif 3518
3519 /* x86 doesn't support multiple MSI yet */
3520 if (type == PCI_CAP_ID_MSI && nvec > 1)
3521 return 1;
3512 3522
3513 irq_want = nr_irqs_gsi; 3523 irq_want = nr_irqs_gsi;
3514 sub_handle = 0; 3524 sub_handle = 0;
3515 list_for_each_entry(msidesc, &dev->msi_list, list) { 3525 list_for_each_entry(msidesc, &dev->msi_list, list) {
3516 irq = create_irq_nr(irq_want); 3526 irq = create_irq_nr(irq_want);
3517 irq_want++;
3518 if (irq == 0) 3527 if (irq == 0)
3519 return -1; 3528 return -1;
3520#ifdef CONFIG_INTR_REMAP 3529 irq_want = irq + 1;
3521 if (!intr_remapping_enabled) 3530 if (!intr_remapping_enabled)
3522 goto no_ir; 3531 goto no_ir;
3523 3532
@@ -3545,7 +3554,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3545 set_irte_irq(irq, iommu, index, sub_handle); 3554 set_irte_irq(irq, iommu, index, sub_handle);
3546 } 3555 }
3547no_ir: 3556no_ir:
3548#endif
3549 ret = setup_msi_irq(dev, msidesc, irq); 3557 ret = setup_msi_irq(dev, msidesc, irq);
3550 if (ret < 0) 3558 if (ret < 0)
3551 goto error; 3559 goto error;
@@ -3563,7 +3571,7 @@ void arch_teardown_msi_irq(unsigned int irq)
3563 destroy_irq(irq); 3571 destroy_irq(irq);
3564} 3572}
3565 3573
3566#ifdef CONFIG_DMAR 3574#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
3567#ifdef CONFIG_SMP 3575#ifdef CONFIG_SMP
3568static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3576static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3569{ 3577{
@@ -3644,7 +3652,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3644 3652
3645#endif /* CONFIG_SMP */ 3653#endif /* CONFIG_SMP */
3646 3654
3647struct irq_chip hpet_msi_type = { 3655static struct irq_chip hpet_msi_type = {
3648 .name = "HPET_MSI", 3656 .name = "HPET_MSI",
3649 .unmask = hpet_msi_unmask, 3657 .unmask = hpet_msi_unmask,
3650 .mask = hpet_msi_mask, 3658 .mask = hpet_msi_mask,
@@ -3727,13 +3735,17 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3727 struct irq_cfg *cfg; 3735 struct irq_cfg *cfg;
3728 int err; 3736 int err;
3729 3737
3738 if (disable_apic)
3739 return -ENXIO;
3740
3730 cfg = irq_cfg(irq); 3741 cfg = irq_cfg(irq);
3731 err = assign_irq_vector(irq, cfg, TARGET_CPUS); 3742 err = assign_irq_vector(irq, cfg, apic->target_cpus());
3732 if (!err) { 3743 if (!err) {
3733 struct ht_irq_msg msg; 3744 struct ht_irq_msg msg;
3734 unsigned dest; 3745 unsigned dest;
3735 3746
3736 dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); 3747 dest = apic->cpu_mask_to_apicid_and(cfg->domain,
3748 apic->target_cpus());
3737 3749
3738 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); 3750 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
3739 3751
@@ -3741,11 +3753,11 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3741 HT_IRQ_LOW_BASE | 3753 HT_IRQ_LOW_BASE |
3742 HT_IRQ_LOW_DEST_ID(dest) | 3754 HT_IRQ_LOW_DEST_ID(dest) |
3743 HT_IRQ_LOW_VECTOR(cfg->vector) | 3755 HT_IRQ_LOW_VECTOR(cfg->vector) |
3744 ((INT_DEST_MODE == 0) ? 3756 ((apic->irq_dest_mode == 0) ?
3745 HT_IRQ_LOW_DM_PHYSICAL : 3757 HT_IRQ_LOW_DM_PHYSICAL :
3746 HT_IRQ_LOW_DM_LOGICAL) | 3758 HT_IRQ_LOW_DM_LOGICAL) |
3747 HT_IRQ_LOW_RQEOI_EDGE | 3759 HT_IRQ_LOW_RQEOI_EDGE |
3748 ((INT_DELIVERY_MODE != dest_LowestPrio) ? 3760 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3749 HT_IRQ_LOW_MT_FIXED : 3761 HT_IRQ_LOW_MT_FIXED :
3750 HT_IRQ_LOW_MT_ARBITRATED) | 3762 HT_IRQ_LOW_MT_ARBITRATED) |
3751 HT_IRQ_LOW_IRQ_MASKED; 3763 HT_IRQ_LOW_IRQ_MASKED;
@@ -3761,7 +3773,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3761} 3773}
3762#endif /* CONFIG_HT_IRQ */ 3774#endif /* CONFIG_HT_IRQ */
3763 3775
3764#ifdef CONFIG_X86_64 3776#ifdef CONFIG_X86_UV
3765/* 3777/*
3766 * Re-target the irq to the specified CPU and enable the specified MMR located 3778 * Re-target the irq to the specified CPU and enable the specified MMR located
3767 * on the specified blade to allow the sending of MSIs to the specified CPU. 3779 * on the specified blade to allow the sending of MSIs to the specified CPU.
@@ -3793,12 +3805,12 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3793 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); 3805 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3794 3806
3795 entry->vector = cfg->vector; 3807 entry->vector = cfg->vector;
3796 entry->delivery_mode = INT_DELIVERY_MODE; 3808 entry->delivery_mode = apic->irq_delivery_mode;
3797 entry->dest_mode = INT_DEST_MODE; 3809 entry->dest_mode = apic->irq_dest_mode;
3798 entry->polarity = 0; 3810 entry->polarity = 0;
3799 entry->trigger = 0; 3811 entry->trigger = 0;
3800 entry->mask = 0; 3812 entry->mask = 0;
3801 entry->dest = cpu_mask_to_apicid(eligible_cpu); 3813 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3802 3814
3803 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3815 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3804 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 3816 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3861,6 +3873,28 @@ void __init probe_nr_irqs_gsi(void)
3861 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); 3873 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
3862} 3874}
3863 3875
3876#ifdef CONFIG_SPARSE_IRQ
3877int __init arch_probe_nr_irqs(void)
3878{
3879 int nr;
3880
3881 if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
3882 nr_irqs = NR_VECTORS * nr_cpu_ids;
3883
3884 nr = nr_irqs_gsi + 8 * nr_cpu_ids;
3885#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
3886 /*
3887 * extra room for MSI and HT dynamic irqs
3888 */
3889 nr += nr_irqs_gsi * 16;
3890#endif
3891 if (nr < nr_irqs)
3892 nr_irqs = nr;
3893
3894 return 0;
3895}
3896#endif
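Worked example with illustrative numbers: on a machine with nr_irqs_gsi = 24 and nr_cpu_ids = 8, the baseline is 24 + 8 * 8 = 88; with CONFIG_PCI_MSI or CONFIG_HT_IRQ set, another 24 * 16 = 384 are reserved, so nr_irqs is trimmed to 472, comfortably below the NR_VECTORS * nr_cpu_ids ceiling of 256 * 8 = 2048.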
3897
3864/* -------------------------------------------------------------------------- 3898/* --------------------------------------------------------------------------
3865 ACPI-based IOAPIC Configuration 3899 ACPI-based IOAPIC Configuration
3866 -------------------------------------------------------------------------- */ 3900 -------------------------------------------------------------------------- */
@@ -3886,7 +3920,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3886 */ 3920 */
3887 3921
3888 if (physids_empty(apic_id_map)) 3922 if (physids_empty(apic_id_map))
3889 apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); 3923 apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
3890 3924
3891 spin_lock_irqsave(&ioapic_lock, flags); 3925 spin_lock_irqsave(&ioapic_lock, flags);
3892 reg_00.raw = io_apic_read(ioapic, 0); 3926 reg_00.raw = io_apic_read(ioapic, 0);
@@ -3902,10 +3936,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3902 * Every APIC in a system must have a unique ID or we get lots of nice 3936 * Every APIC in a system must have a unique ID or we get lots of nice
3903 * 'stuck on smp_invalidate_needed IPI wait' messages. 3937 * 'stuck on smp_invalidate_needed IPI wait' messages.
3904 */ 3938 */
3905 if (check_apicid_used(apic_id_map, apic_id)) { 3939 if (apic->check_apicid_used(apic_id_map, apic_id)) {
3906 3940
3907 for (i = 0; i < get_physical_broadcast(); i++) { 3941 for (i = 0; i < get_physical_broadcast(); i++) {
3908 if (!check_apicid_used(apic_id_map, i)) 3942 if (!apic->check_apicid_used(apic_id_map, i))
3909 break; 3943 break;
3910 } 3944 }
3911 3945
@@ -3918,7 +3952,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3918 apic_id = i; 3952 apic_id = i;
3919 } 3953 }
3920 3954
3921 tmp = apicid_to_cpu_present(apic_id); 3955 tmp = apic->apicid_to_cpu_present(apic_id);
3922 physids_or(apic_id_map, apic_id_map, tmp); 3956 physids_or(apic_id_map, apic_id_map, tmp);
3923 3957
3924 if (reg_00.bits.ID != apic_id) { 3958 if (reg_00.bits.ID != apic_id) {
@@ -3995,8 +4029,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
3995 return -1; 4029 return -1;
3996 4030
3997 for (i = 0; i < mp_irq_entries; i++) 4031 for (i = 0; i < mp_irq_entries; i++)
3998 if (mp_irqs[i].mp_irqtype == mp_INT && 4032 if (mp_irqs[i].irqtype == mp_INT &&
3999 mp_irqs[i].mp_srcbusirq == bus_irq) 4033 mp_irqs[i].srcbusirq == bus_irq)
4000 break; 4034 break;
4001 if (i >= mp_irq_entries) 4035 if (i >= mp_irq_entries)
4002 return -1; 4036 return -1;
@@ -4011,7 +4045,7 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4011/* 4045/*
4012 * This function currently is only a helper for the i386 smp boot process where 4046 * This function currently is only a helper for the i386 smp boot process where
4013 * we need to reprogram the ioredtbls to cater for the cpus which have come online 4047 * we need to reprogram the ioredtbls to cater for the cpus which have come online
4014 * so mask in all cases should simply be TARGET_CPUS 4048 * so mask in all cases should simply be apic->target_cpus()
4015 */ 4049 */
4016#ifdef CONFIG_SMP 4050#ifdef CONFIG_SMP
4017void __init setup_ioapic_dest(void) 4051void __init setup_ioapic_dest(void)
@@ -4050,15 +4084,13 @@ void __init setup_ioapic_dest(void)
4050 */ 4084 */
4051 if (desc->status & 4085 if (desc->status &
4052 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 4086 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
4053 mask = &desc->affinity; 4087 mask = desc->affinity;
4054 else 4088 else
4055 mask = TARGET_CPUS; 4089 mask = apic->target_cpus();
4056 4090
4057#ifdef CONFIG_INTR_REMAP
4058 if (intr_remapping_enabled) 4091 if (intr_remapping_enabled)
4059 set_ir_ioapic_affinity_irq_desc(desc, mask); 4092 set_ir_ioapic_affinity_irq_desc(desc, mask);
4060 else 4093 else
4061#endif
4062 set_ioapic_affinity_irq_desc(desc, mask); 4094 set_ioapic_affinity_irq_desc(desc, mask);
4063 } 4095 }
4064 4096
@@ -4111,7 +4143,7 @@ void __init ioapic_init_mappings(void)
4111 ioapic_res = ioapic_setup_resources(); 4143 ioapic_res = ioapic_setup_resources();
4112 for (i = 0; i < nr_ioapics; i++) { 4144 for (i = 0; i < nr_ioapics; i++) {
4113 if (smp_found_config) { 4145 if (smp_found_config) {
4114 ioapic_phys = mp_ioapics[i].mp_apicaddr; 4146 ioapic_phys = mp_ioapics[i].apicaddr;
4115#ifdef CONFIG_X86_32 4147#ifdef CONFIG_X86_32
4116 if (!ioapic_phys) { 4148 if (!ioapic_phys) {
4117 printk(KERN_ERR 4149 printk(KERN_ERR
@@ -4151,9 +4183,12 @@ static int __init ioapic_insert_resources(void)
4151 struct resource *r = ioapic_resources; 4183 struct resource *r = ioapic_resources;
4152 4184
4153 if (!r) { 4185 if (!r) {
4154 printk(KERN_ERR 4186 if (nr_ioapics > 0) {
4155 "IO APIC resources could be not be allocated.\n"); 4187 printk(KERN_ERR
4156 return -1; 4188 "IO APIC resources couldn't be allocated.\n");
4189 return -1;
4190 }
4191 return 0;
4157 } 4192 }
4158 4193
4159 for (i = 0; i < nr_ioapics; i++) { 4194 for (i = 0; i < nr_ioapics; i++) {
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
new file mode 100644
index 000000000000..dbf5445727a9
--- /dev/null
+++ b/arch/x86/kernel/apic/ipi.c
@@ -0,0 +1,164 @@
1#include <linux/cpumask.h>
2#include <linux/interrupt.h>
3#include <linux/init.h>
4
5#include <linux/mm.h>
6#include <linux/delay.h>
7#include <linux/spinlock.h>
8#include <linux/kernel_stat.h>
9#include <linux/mc146818rtc.h>
10#include <linux/cache.h>
11#include <linux/cpu.h>
12#include <linux/module.h>
13
14#include <asm/smp.h>
15#include <asm/mtrr.h>
16#include <asm/tlbflush.h>
17#include <asm/mmu_context.h>
18#include <asm/apic.h>
19#include <asm/proto.h>
20#include <asm/ipi.h>
21
22void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
23{
24 unsigned long query_cpu;
25 unsigned long flags;
26
27 /*
28 * Hack. The clustered APIC addressing mode doesn't allow us to send
29 * to an arbitrary mask, so I do a unicast to each CPU instead.
30 * - mbligh
31 */
32 local_irq_save(flags);
33 for_each_cpu(query_cpu, mask) {
34 __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
35 query_cpu), vector, APIC_DEST_PHYSICAL);
36 }
37 local_irq_restore(flags);
38}
39
40void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
41 int vector)
42{
43 unsigned int this_cpu = smp_processor_id();
44 unsigned int query_cpu;
45 unsigned long flags;
46
47 /* See Hack comment above */
48
49 local_irq_save(flags);
50 for_each_cpu(query_cpu, mask) {
51 if (query_cpu == this_cpu)
52 continue;
53 __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
54 query_cpu), vector, APIC_DEST_PHYSICAL);
55 }
56 local_irq_restore(flags);
57}
58
59void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
60 int vector)
61{
62 unsigned long flags;
63 unsigned int query_cpu;
64
65 /*
66 * Hack. The clustered APIC addressing mode doesn't allow us to send
67 * to an arbitrary mask, so I do a unicast to each CPU instead. This
68 * should be modified to do 1 message per cluster ID - mbligh
69 */
70
71 local_irq_save(flags);
72 for_each_cpu(query_cpu, mask)
73 __default_send_IPI_dest_field(
74 apic->cpu_to_logical_apicid(query_cpu), vector,
75 apic->dest_logical);
76 local_irq_restore(flags);
77}
78
79void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
80 int vector)
81{
82 unsigned long flags;
83 unsigned int query_cpu;
84 unsigned int this_cpu = smp_processor_id();
85
86 /* See Hack comment above */
87
88 local_irq_save(flags);
89 for_each_cpu(query_cpu, mask) {
90 if (query_cpu == this_cpu)
91 continue;
92 __default_send_IPI_dest_field(
93 apic->cpu_to_logical_apicid(query_cpu), vector,
94 apic->dest_logical);
95 }
96 local_irq_restore(flags);
97}
98
99#ifdef CONFIG_X86_32
100
101/*
102 * This is only used on smaller machines.
103 */
104void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
105{
106 unsigned long mask = cpumask_bits(cpumask)[0];
107 unsigned long flags;
108
109 local_irq_save(flags);
110 WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
111 __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
112 local_irq_restore(flags);
113}
114
115void default_send_IPI_allbutself(int vector)
116{
117 /*
118 * If there are no other CPUs in the system we would get an APIC send
119 * error if we tried to broadcast, so avoid sending IPIs in that case.
120 */
121 if (!(num_online_cpus() > 1))
122 return;
123
124 __default_local_send_IPI_allbutself(vector);
125}
126
127void default_send_IPI_all(int vector)
128{
129 __default_local_send_IPI_all(vector);
130}
131
132void default_send_IPI_self(int vector)
133{
134 __default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical);
135}
136
137/* must come after the send_IPI functions above for inlining */
138static int convert_apicid_to_cpu(int apic_id)
139{
140 int i;
141
142 for_each_possible_cpu(i) {
143 if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
144 return i;
145 }
146 return -1;
147}
148
149int safe_smp_processor_id(void)
150{
151 int apicid, cpuid;
152
153 if (!boot_cpu_has(X86_FEATURE_APIC))
154 return 0;
155
156 apicid = hard_smp_processor_id();
157 if (apicid == BAD_APICID)
158 return 0;
159
160 cpuid = convert_apicid_to_cpu(apicid);
161
162 return cpuid >= 0 ? cpuid : 0;
163}
164#endif
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/apic/nmi.c
index 7228979f1e7f..d6bd62407152 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -34,12 +34,12 @@
34 34
35#include <asm/mce.h> 35#include <asm/mce.h>
36 36
37#include <mach_traps.h> 37#include <asm/mach_traps.h>
38 38
39int unknown_nmi_panic; 39int unknown_nmi_panic;
40int nmi_watchdog_enabled; 40int nmi_watchdog_enabled;
41 41
42static cpumask_t backtrace_mask = CPU_MASK_NONE; 42static cpumask_var_t backtrace_mask;
43 43
44/* nmi_active: 44/* nmi_active:
45 * >0: the lapic NMI watchdog is active, but can be disabled 45 * >0: the lapic NMI watchdog is active, but can be disabled
@@ -61,11 +61,7 @@ static int endflag __initdata;
61 61
62static inline unsigned int get_nmi_count(int cpu) 62static inline unsigned int get_nmi_count(int cpu)
63{ 63{
64#ifdef CONFIG_X86_64 64 return per_cpu(irq_stat, cpu).__nmi_count;
65 return cpu_pda(cpu)->__nmi_count;
66#else
67 return nmi_count(cpu);
68#endif
69} 65}
70 66
71static inline int mce_in_progress(void) 67static inline int mce_in_progress(void)
@@ -82,12 +78,8 @@ static inline int mce_in_progress(void)
82 */ 78 */
83static inline unsigned int get_timer_irqs(int cpu) 79static inline unsigned int get_timer_irqs(int cpu)
84{ 80{
85#ifdef CONFIG_X86_64
86 return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
87#else
88 return per_cpu(irq_stat, cpu).apic_timer_irqs + 81 return per_cpu(irq_stat, cpu).apic_timer_irqs +
89 per_cpu(irq_stat, cpu).irq0_irqs; 82 per_cpu(irq_stat, cpu).irq0_irqs;
90#endif
91} 83}
92 84
93#ifdef CONFIG_SMP 85#ifdef CONFIG_SMP
@@ -146,6 +138,7 @@ int __init check_nmi_watchdog(void)
146 if (!prev_nmi_count) 138 if (!prev_nmi_count)
147 goto error; 139 goto error;
148 140
141 alloc_cpumask_var(&backtrace_mask, GFP_KERNEL);
149 printk(KERN_INFO "Testing NMI watchdog ... "); 142 printk(KERN_INFO "Testing NMI watchdog ... ");
150 143
151#ifdef CONFIG_SMP 144#ifdef CONFIG_SMP
@@ -421,14 +414,14 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
421 touched = 1; 414 touched = 1;
422 } 415 }
423 416
424 if (cpu_isset(cpu, backtrace_mask)) { 417 if (cpumask_test_cpu(cpu, backtrace_mask)) {
425 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 418 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
426 419
427 spin_lock(&lock); 420 spin_lock(&lock);
428 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 421 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
429 dump_stack(); 422 dump_stack();
430 spin_unlock(&lock); 423 spin_unlock(&lock);
431 cpu_clear(cpu, backtrace_mask); 424 cpumask_clear_cpu(cpu, backtrace_mask);
432 } 425 }
433 426
434 /* Could check oops_in_progress here too, but it's safer not to */ 427 /* Could check oops_in_progress here too, but it's safer not to */
@@ -562,10 +555,10 @@ void __trigger_all_cpu_backtrace(void)
562{ 555{
563 int i; 556 int i;
564 557
565 backtrace_mask = cpu_online_map; 558 cpumask_copy(backtrace_mask, cpu_online_mask);
566 /* Wait for up to 10 seconds for all CPUs to do the backtrace */ 559 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
567 for (i = 0; i < 10 * 1000; i++) { 560 for (i = 0; i < 10 * 1000; i++) {
568 if (cpus_empty(backtrace_mask)) 561 if (cpumask_empty(backtrace_mask))
569 break; 562 break;
570 mdelay(1); 563 mdelay(1);
571 } 564 }
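The cpumask_t to cpumask_var_t conversion above follows the standard pattern: with CONFIG_CPUMASK_OFFSTACK the variable is a pointer that must be allocated before first use. A minimal sketch of the idiom (hypothetical init function; note the patch itself does not check the allocation's return value):

	static cpumask_var_t example_mask;

	static int __init example_init(void)
	{
		/* A real allocation only happens with CONFIG_CPUMASK_OFFSTACK. */
		if (!alloc_cpumask_var(&example_mask, GFP_KERNEL))
			return -ENOMEM;

		cpumask_copy(example_mask, cpu_online_mask);
		if (cpumask_test_cpu(0, example_mask))
			cpumask_clear_cpu(0, example_mask);
		return 0;
	}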
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
new file mode 100644
index 000000000000..533e59c6fc82
--- /dev/null
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -0,0 +1,558 @@
1/*
2 * Written by: Patricia Gaughen, IBM Corporation
3 *
4 * Copyright (C) 2002, IBM Corp.
5 * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to <gone@us.ibm.com>
25 */
26#include <linux/nodemask.h>
27#include <linux/topology.h>
28#include <linux/bootmem.h>
29#include <linux/threads.h>
30#include <linux/cpumask.h>
31#include <linux/kernel.h>
32#include <linux/mmzone.h>
33#include <linux/module.h>
34#include <linux/string.h>
35#include <linux/init.h>
36#include <linux/numa.h>
37#include <linux/smp.h>
38#include <linux/io.h>
39#include <linux/mm.h>
40
41#include <asm/processor.h>
42#include <asm/fixmap.h>
43#include <asm/mpspec.h>
44#include <asm/numaq.h>
45#include <asm/setup.h>
46#include <asm/apic.h>
47#include <asm/e820.h>
48#include <asm/ipi.h>
49
50#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
51
52int found_numaq;
53
54/*
55 * Have to match translation table entries to main table entries by counter,
56 * hence the mpc_record variable .... can't see a less disgusting way of
57 * doing this ....
58 */
59struct mpc_trans {
60 unsigned char mpc_type;
61 unsigned char trans_len;
62 unsigned char trans_type;
63 unsigned char trans_quad;
64 unsigned char trans_global;
65 unsigned char trans_local;
66 unsigned short trans_reserved;
67};
68
69/* x86_quirks member */
70static int mpc_record;
71
72static struct mpc_trans *translation_table[MAX_MPC_ENTRY];
73
74int mp_bus_id_to_node[MAX_MP_BUSSES];
75int mp_bus_id_to_local[MAX_MP_BUSSES];
76int quad_local_to_mp_bus_id[NR_CPUS/4][4];
77
78
79static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
80{
81 struct eachquadmem *eq = scd->eq + node;
82
83 node_set_online(node);
84
85 /* Convert to pages */
86 node_start_pfn[node] =
87 MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size);
88
89 node_end_pfn[node] =
90 MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
91
92 e820_register_active_regions(node, node_start_pfn[node],
93 node_end_pfn[node]);
94
95 memory_present(node, node_start_pfn[node], node_end_pfn[node]);
96
97 node_remap_size[node] = node_memmap_size_bytes(node,
98 node_start_pfn[node],
99 node_end_pfn[node]);
100}
101
102/*
103 * Function: smp_dump_qct()
104 *
105 * Description: gets memory layout from the quad config table. This
106 * function also updates node_online_map with the nodes (quads) present.
107 */
108static void __init smp_dump_qct(void)
109{
110 struct sys_cfg_data *scd;
111 int node;
112
113 scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
114
115 nodes_clear(node_online_map);
116 for_each_node(node) {
117 if (scd->quads_present31_0 & (1 << node))
118 numaq_register_node(node, scd);
119 }
120}
121
122void __cpuinit numaq_tsc_disable(void)
123{
124 if (!found_numaq)
125 return;
126
127 if (num_online_nodes() > 1) {
128 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
129 setup_clear_cpu_cap(X86_FEATURE_TSC);
130 }
131}
132
133static int __init numaq_pre_time_init(void)
134{
135 numaq_tsc_disable();
136 return 0;
137}
138
139static inline int generate_logical_apicid(int quad, int phys_apicid)
140{
141 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
142}
143
144/* x86_quirks member */
145static int mpc_apic_id(struct mpc_cpu *m)
146{
147 int quad = translation_table[mpc_record]->trans_quad;
148 int logical_apicid = generate_logical_apicid(quad, m->apicid);
149
150 printk(KERN_DEBUG
151 "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
152 m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
153 (m->cpufeature & CPU_MODEL_MASK) >> 4,
154 m->apicver, quad, logical_apicid);
155
156 return logical_apicid;
157}
158
159/* x86_quirks member */
160static void mpc_oem_bus_info(struct mpc_bus *m, char *name)
161{
162 int quad = translation_table[mpc_record]->trans_quad;
163 int local = translation_table[mpc_record]->trans_local;
164
165 mp_bus_id_to_node[m->busid] = quad;
166 mp_bus_id_to_local[m->busid] = local;
167
168 printk(KERN_INFO "Bus #%d is %s (node %d)\n", m->busid, name, quad);
169}
170
171/* x86_quirks member */
172static void mpc_oem_pci_bus(struct mpc_bus *m)
173{
174 int quad = translation_table[mpc_record]->trans_quad;
175 int local = translation_table[mpc_record]->trans_local;
176
177 quad_local_to_mp_bus_id[quad][local] = m->busid;
178}
179
180static void __init MP_translation_info(struct mpc_trans *m)
181{
182 printk(KERN_INFO
183 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
184 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
185 m->trans_local);
186
187 if (mpc_record >= MAX_MPC_ENTRY)
188 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
189 else
190 translation_table[mpc_record] = m; /* stash this for later */
191
192 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
193 node_set_online(m->trans_quad);
194}
195
196static int __init mpf_checksum(unsigned char *mp, int len)
197{
198 int sum = 0;
199
200 while (len--)
201 sum += *mp++;
202
203 return sum & 0xFF;
204}
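In other words, a table passes when the byte sum of the whole thing is 0 modulo 256: for example, a 4-byte blob 0x10 0x20 0x30 0xA0 sums to 0x100, whose low byte is 0, so mpf_checksum() returns 0 and the table is accepted.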
205
206/*
207 * Read/parse the MPC oem tables
208 */
209static void __init
210 smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize)
211{
212 int count = sizeof(*oemtable); /* the header size */
213 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
214
215 mpc_record = 0;
216 printk(KERN_INFO
217 "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
218
219 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
220 printk(KERN_WARNING
221 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
222 oemtable->signature[0], oemtable->signature[1],
223 oemtable->signature[2], oemtable->signature[3]);
224 return;
225 }
226
227 if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) {
228 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
229 return;
230 }
231
232 while (count < oemtable->length) {
233 switch (*oemptr) {
234 case MP_TRANSLATION:
235 {
236 struct mpc_trans *m = (void *)oemptr;
237
238 MP_translation_info(m);
239 oemptr += sizeof(*m);
240 count += sizeof(*m);
241 ++mpc_record;
242 break;
243 }
244 default:
245 printk(KERN_WARNING
246 "Unrecognised OEM table entry type! - %d\n",
247 (int)*oemptr);
248 return;
249 }
250 }
251}
252
253static int __init numaq_setup_ioapic_ids(void)
254{
255 /* return 1 so the generic IO-APIC ID setup is skipped */
256 return 1;
257}
258
259static struct x86_quirks numaq_x86_quirks __initdata = {
260 .arch_pre_time_init = numaq_pre_time_init,
261 .arch_time_init = NULL,
262 .arch_pre_intr_init = NULL,
263 .arch_memory_setup = NULL,
264 .arch_intr_init = NULL,
265 .arch_trap_init = NULL,
266 .mach_get_smp_config = NULL,
267 .mach_find_smp_config = NULL,
268 .mpc_record = &mpc_record,
269 .mpc_apic_id = mpc_apic_id,
270 .mpc_oem_bus_info = mpc_oem_bus_info,
271 .mpc_oem_pci_bus = mpc_oem_pci_bus,
272 .smp_read_mpc_oem = smp_read_mpc_oem,
273 .setup_ioapic_ids = numaq_setup_ioapic_ids,
274};
275
276static __init void early_check_numaq(void)
277{
278 /*
279 * Find possible boot-time SMP configuration:
280 */
281 early_find_smp_config();
282
283 /*
284 * get boot-time SMP configuration:
285 */
286 if (smp_found_config)
287 early_get_smp_config();
288
289 if (found_numaq)
290 x86_quirks = &numaq_x86_quirks;
291}
292
293int __init get_memcfg_numaq(void)
294{
295 early_check_numaq();
296 if (!found_numaq)
297 return 0;
298 smp_dump_qct();
299
300 return 1;
301}
302
303#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
304
305static inline unsigned int numaq_get_apic_id(unsigned long x)
306{
307 return (x >> 24) & 0x0F;
308}
309
310static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector)
311{
312 default_send_IPI_mask_sequence_logical(mask, vector);
313}
314
315static inline void numaq_send_IPI_allbutself(int vector)
316{
317 default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
318}
319
320static inline void numaq_send_IPI_all(int vector)
321{
322 numaq_send_IPI_mask(cpu_online_mask, vector);
323}
324
325#define NUMAQ_TRAMPOLINE_PHYS_LOW (0x8)
326#define NUMAQ_TRAMPOLINE_PHYS_HIGH (0xa)
327
328/*
329 * Because we use NMIs rather than the INIT-STARTUP sequence to
330 * bootstrap the CPUs, the APIC may be in a weird state. Kick it:
331 */
332static inline void numaq_smp_callin_clear_local_apic(void)
333{
334 clear_local_APIC();
335}
336
337static inline const struct cpumask *numaq_target_cpus(void)
338{
339 return cpu_all_mask;
340}
341
342static inline unsigned long
343numaq_check_apicid_used(physid_mask_t bitmap, int apicid)
344{
345 return physid_isset(apicid, bitmap);
346}
347
348static inline unsigned long numaq_check_apicid_present(int bit)
349{
350 return physid_isset(bit, phys_cpu_present_map);
351}
352
353static inline int numaq_apic_id_registered(void)
354{
355 return 1;
356}
357
358static inline void numaq_init_apic_ldr(void)
359{
360 /* Already done in NUMA-Q firmware */
361}
362
363static inline void numaq_setup_apic_routing(void)
364{
365 printk(KERN_INFO
366 "Enabling APIC mode: NUMA-Q. Using %d I/O APICs\n",
367 nr_ioapics);
368}
369
370/*
371 * Skip adding the timer int on secondary nodes, which causes
372 * a small but painful rift in the time-space continuum.
373 */
374static inline int numaq_multi_timer_check(int apic, int irq)
375{
376 return apic != 0 && irq == 0;
377}
378
379static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map)
380{
381 /* We don't have a good way to do this yet - hack */
382 return physids_promote(0xFUL);
383}
384
385static inline int numaq_cpu_to_logical_apicid(int cpu)
386{
387 if (cpu >= nr_cpu_ids)
388 return BAD_APICID;
389 return cpu_2_logical_apicid[cpu];
390}
391
392/*
393 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
394 * cpu to APIC ID relation to properly interact with the intelligent
395 * mode of the cluster controller.
396 */
397static inline int numaq_cpu_present_to_apicid(int mps_cpu)
398{
399 if (mps_cpu < 60)
400 return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
401 else
402 return BAD_APICID;
403}
404
405static inline int numaq_apicid_to_node(int logical_apicid)
406{
407 return logical_apicid >> 4;
408}
409
410static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid)
411{
412 int node = numaq_apicid_to_node(logical_apicid);
413 int cpu = __ffs(logical_apicid & 0xf);
414
415 return physid_mask_of_physid(cpu + 4*node);
416}
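Worked round trip of this encoding: for mps_cpu = 5, numaq_cpu_present_to_apicid() yields ((5 >> 2) << 4) | (1 << (5 & 0x3)) = 0x10 | 0x02 = 0x12; numaq_apicid_to_node(0x12) recovers quad 1, and numaq_apicid_to_cpu_present() takes __ffs(0x12 & 0xf) = 1 and returns physid 1 + 4 * 1 = 5, matching the original CPU number.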
417
418/* Where the IO area was mapped on multiquad, always 0 otherwise */
419void *xquad_portio;
420
421static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid)
422{
423 return 1;
424}
425
426/*
427 * We use physical apicids here, not logical, so just return the default
428 * physical broadcast to stop people from breaking us
429 */
430static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask)
431{
432 return 0x0F;
433}
434
435static inline unsigned int
436numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
437 const struct cpumask *andmask)
438{
439 return 0x0F;
440}
441
442/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
443static inline int numaq_phys_pkg_id(int cpuid_apic, int index_msb)
444{
445 return cpuid_apic >> index_msb;
446}
447
448static int
449numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
450{
451 if (strncmp(oem, "IBM NUMA", 8))
452 printk(KERN_ERR "Warning! Not a NUMA-Q system!\n");
453 else
454 found_numaq = 1;
455
456 return found_numaq;
457}
458
459static int probe_numaq(void)
460{
461 /* already know from get_memcfg_numaq() */
462 return found_numaq;
463}
464
465static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask)
466{
467 /* Careful. Some cpus do not strictly honor the set of cpus
468 * specified in the interrupt destination when using lowest
469 * priority interrupt delivery mode.
470 *
471 * In particular there was a hyperthreading cpu observed to
472 * deliver interrupts to the wrong hyperthread when only one
473 * hyperthread was specified in the interrupt destination.
474 */
475 cpumask_clear(retmask);
476 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
477}
478
479static void numaq_setup_portio_remap(void)
480{
481 int num_quads = num_online_nodes();
482
483 if (num_quads <= 1)
484 return;
485
486 printk(KERN_INFO
487 "Remapping cross-quad port I/O for %d quads\n", num_quads);
488
489 xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
490
491 printk(KERN_INFO
492 "xquad_portio vaddr 0x%08lx, len %08lx\n",
493 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
494}
495
496struct apic apic_numaq = {
497
498 .name = "NUMAQ",
499 .probe = probe_numaq,
500 .acpi_madt_oem_check = NULL,
501 .apic_id_registered = numaq_apic_id_registered,
502
503 .irq_delivery_mode = dest_LowestPrio,
504 /* physical delivery on LOCAL quad: */
505 .irq_dest_mode = 0,
506
507 .target_cpus = numaq_target_cpus,
508 .disable_esr = 1,
509 .dest_logical = APIC_DEST_LOGICAL,
510 .check_apicid_used = numaq_check_apicid_used,
511 .check_apicid_present = numaq_check_apicid_present,
512
513 .vector_allocation_domain = numaq_vector_allocation_domain,
514 .init_apic_ldr = numaq_init_apic_ldr,
515
516 .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
517 .setup_apic_routing = numaq_setup_apic_routing,
518 .multi_timer_check = numaq_multi_timer_check,
519 .apicid_to_node = numaq_apicid_to_node,
520 .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid,
521 .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
522 .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
523 .setup_portio_remap = numaq_setup_portio_remap,
524 .check_phys_apicid_present = numaq_check_phys_apicid_present,
525 .enable_apic_mode = NULL,
526 .phys_pkg_id = numaq_phys_pkg_id,
527 .mps_oem_check = numaq_mps_oem_check,
528
529 .get_apic_id = numaq_get_apic_id,
530 .set_apic_id = NULL,
531 .apic_id_mask = 0x0F << 24,
532
533 .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid,
534 .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and,
535
536 .send_IPI_mask = numaq_send_IPI_mask,
537 .send_IPI_mask_allbutself = NULL,
538 .send_IPI_allbutself = numaq_send_IPI_allbutself,
539 .send_IPI_all = numaq_send_IPI_all,
540 .send_IPI_self = default_send_IPI_self,
541
542 .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi,
543 .trampoline_phys_low = NUMAQ_TRAMPOLINE_PHYS_LOW,
544 .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH,
545
546 /* We don't do anything here because we use NMIs to boot instead */
547 .wait_for_init_deassert = NULL,
548
549 .smp_callin_clear_local_apic = numaq_smp_callin_clear_local_apic,
550 .inquire_remote_apic = NULL,
551
552 .read = native_apic_mem_read,
553 .write = native_apic_mem_write,
554 .icr_read = native_apic_icr_read,
555 .icr_write = native_apic_icr_write,
556 .wait_icr_idle = native_apic_wait_icr_idle,
557 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
558};
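Generic code reaches all of these driver methods through the global apic pointer rather than calling them directly, which is what lets the kernel pick NUMA-Q, bigsmp, etc. at boot. A minimal sketch of the calling convention (illustrative function, not in the patch):

	/* Dispatch goes through the currently selected driver. */
	static void example_kick_all(int vector)
	{
		/* Resolves to numaq_send_IPI_mask() when apic == &apic_numaq. */
		apic->send_IPI_mask(cpu_online_mask, vector);
	}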
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
new file mode 100644
index 000000000000..01eda2ac65e4
--- /dev/null
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -0,0 +1,285 @@
1/*
2 * Default generic APIC driver. This handles up to 8 CPUs.
3 *
4 * Copyright 2003 Andi Kleen, SuSE Labs.
5 * Subject to the GNU Public License, v.2
6 *
7 * Generic x86 APIC driver probe layer.
8 */
9#include <linux/threads.h>
10#include <linux/cpumask.h>
11#include <linux/module.h>
12#include <linux/string.h>
13#include <linux/kernel.h>
14#include <linux/ctype.h>
15#include <linux/init.h>
16#include <linux/errno.h>
17#include <asm/fixmap.h>
18#include <asm/mpspec.h>
19#include <asm/apicdef.h>
20#include <asm/apic.h>
21#include <asm/setup.h>
22
23#include <linux/threads.h>
24#include <linux/cpumask.h>
25#include <asm/mpspec.h>
26#include <asm/fixmap.h>
27#include <asm/apicdef.h>
28#include <linux/kernel.h>
29#include <linux/string.h>
30#include <linux/smp.h>
31#include <linux/init.h>
32#include <asm/ipi.h>
33
34#include <linux/smp.h>
35#include <linux/init.h>
36#include <linux/interrupt.h>
37#include <asm/acpi.h>
38#include <asm/e820.h>
39#include <asm/setup.h>
40
41#ifdef CONFIG_HOTPLUG_CPU
42#define DEFAULT_SEND_IPI (1)
43#else
44#define DEFAULT_SEND_IPI (0)
45#endif
46
47int no_broadcast = DEFAULT_SEND_IPI;
48
49static __init int no_ipi_broadcast(char *str)
50{
51 get_option(&str, &no_broadcast);
52 pr_info("Using %s mode\n",
53 no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
54 return 1;
55}
56__setup("no_ipi_broadcast=", no_ipi_broadcast);
57
58static int __init print_ipi_mode(void)
59{
60 pr_info("Using IPI %s mode\n",
61 no_broadcast ? "No-Shortcut" : "Shortcut");
62 return 0;
63}
64late_initcall(print_ipi_mode);
65
66void default_setup_apic_routing(void)
67{
68#ifdef CONFIG_X86_IO_APIC
69 printk(KERN_INFO
70 "Enabling APIC mode: Flat. Using %d I/O APICs\n",
71 nr_ioapics);
72#endif
73}
74
75static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
76{
77 /*
78 * Careful. Some cpus do not strictly honor the set of cpus
79 * specified in the interrupt destination when using lowest
80 * priority interrupt delivery mode.
81 *
82 * In particular there was a hyperthreading cpu observed to
83 * deliver interrupts to the wrong hyperthread when only one
84 * hyperthread was specified in the interrupt destination.
85 */
86 cpumask_clear(retmask);
87 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
88}
89
90/* should be called last. */
91static int probe_default(void)
92{
93 return 1;
94}
95
96struct apic apic_default = {
97
98 .name = "default",
99 .probe = probe_default,
100 .acpi_madt_oem_check = NULL,
101 .apic_id_registered = default_apic_id_registered,
102
103 .irq_delivery_mode = dest_LowestPrio,
104 /* logical delivery broadcast to all CPUs: */
105 .irq_dest_mode = 1,
106
107 .target_cpus = default_target_cpus,
108 .disable_esr = 0,
109 .dest_logical = APIC_DEST_LOGICAL,
110 .check_apicid_used = default_check_apicid_used,
111 .check_apicid_present = default_check_apicid_present,
112
113 .vector_allocation_domain = default_vector_allocation_domain,
114 .init_apic_ldr = default_init_apic_ldr,
115
116 .ioapic_phys_id_map = default_ioapic_phys_id_map,
117 .setup_apic_routing = default_setup_apic_routing,
118 .multi_timer_check = NULL,
119 .apicid_to_node = default_apicid_to_node,
120 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
121 .cpu_present_to_apicid = default_cpu_present_to_apicid,
122 .apicid_to_cpu_present = default_apicid_to_cpu_present,
123 .setup_portio_remap = NULL,
124 .check_phys_apicid_present = default_check_phys_apicid_present,
125 .enable_apic_mode = NULL,
126 .phys_pkg_id = default_phys_pkg_id,
127 .mps_oem_check = NULL,
128
129 .get_apic_id = default_get_apic_id,
130 .set_apic_id = NULL,
131 .apic_id_mask = 0x0F << 24,
132
133 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
134 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
135
136 .send_IPI_mask = default_send_IPI_mask_logical,
137 .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
138 .send_IPI_allbutself = default_send_IPI_allbutself,
139 .send_IPI_all = default_send_IPI_all,
140 .send_IPI_self = default_send_IPI_self,
141
142 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
143 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
144
145 .wait_for_init_deassert = default_wait_for_init_deassert,
146
147 .smp_callin_clear_local_apic = NULL,
148 .inquire_remote_apic = default_inquire_remote_apic,
149
150 .read = native_apic_mem_read,
151 .write = native_apic_mem_write,
152 .icr_read = native_apic_icr_read,
153 .icr_write = native_apic_icr_write,
154 .wait_icr_idle = native_apic_wait_icr_idle,
155 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
156};
157
158extern struct apic apic_numaq;
159extern struct apic apic_summit;
160extern struct apic apic_bigsmp;
161extern struct apic apic_es7000;
162extern struct apic apic_es7000_cluster;
163extern struct apic apic_default;
164
165struct apic *apic = &apic_default;
166EXPORT_SYMBOL_GPL(apic);
167
168static struct apic *apic_probe[] __initdata = {
169#ifdef CONFIG_X86_NUMAQ
170 &apic_numaq,
171#endif
172#ifdef CONFIG_X86_SUMMIT
173 &apic_summit,
174#endif
175#ifdef CONFIG_X86_BIGSMP
176 &apic_bigsmp,
177#endif
178#ifdef CONFIG_X86_ES7000
179 &apic_es7000,
180 &apic_es7000_cluster,
181#endif
182 &apic_default, /* must be last */
183 NULL,
184};
185
186static int cmdline_apic __initdata;
187static int __init parse_apic(char *arg)
188{
189 int i;
190
191 if (!arg)
192 return -EINVAL;
193
194 for (i = 0; apic_probe[i]; i++) {
195 if (!strcmp(apic_probe[i]->name, arg)) {
196 apic = apic_probe[i];
197 cmdline_apic = 1;
198 return 0;
199 }
200 }
201
202 /* Parsed again by __setup for debug/verbose */
203 return 0;
204}
205early_param("apic", parse_apic);
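
Since the match is a plain strcmp() against each entry's ->name, the strings accepted by the apic= parameter are exactly those in apic_probe[] above. For example, booting with a command line such as (paths hypothetical):

	vmlinuz ro root=/dev/sda1 apic=bigsmp

pins the bigsmp driver and sets cmdline_apic, which the probe and OEM-check paths below consult before overriding the selection.
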
206
207void __init generic_bigsmp_probe(void)
208{
209#ifdef CONFIG_X86_BIGSMP
210 /*
211 * This routine is used to switch to bigsmp mode when
212 * - There is no apic= option specified by the user
213 * - generic_apic_probe() has chosen apic_default as the sub_arch
214 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
215 */
216
217 if (!cmdline_apic && apic == &apic_default) {
218 if (apic_bigsmp.probe()) {
219 apic = &apic_bigsmp;
220 printk(KERN_INFO "Overriding APIC driver with %s\n",
221 apic->name);
222 }
223 }
224#endif
225}
226
227void __init generic_apic_probe(void)
228{
229 if (!cmdline_apic) {
230 int i;
231 for (i = 0; apic_probe[i]; i++) {
232 if (apic_probe[i]->probe()) {
233 apic = apic_probe[i];
234 break;
235 }
236 }
237 /* Not visible without early console */
238 if (!apic_probe[i])
239 panic("Didn't find an APIC driver");
240 }
241 printk(KERN_INFO "Using APIC driver %s\n", apic->name);
242}
243
244/* These functions can switch the APIC even after the initial ->probe() */
245
246int __init
247generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
248{
249 int i;
250
251 for (i = 0; apic_probe[i]; ++i) {
252 if (!apic_probe[i]->mps_oem_check)
253 continue;
254 if (!apic_probe[i]->mps_oem_check(mpc, oem, productid))
255 continue;
256
257 if (!cmdline_apic) {
258 apic = apic_probe[i];
259 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
260 apic->name);
261 }
262 return 1;
263 }
264 return 0;
265}
266
267int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
268{
269 int i;
270
271 for (i = 0; apic_probe[i]; ++i) {
272 if (!apic_probe[i]->acpi_madt_oem_check)
273 continue;
274 if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id))
275 continue;
276
277 if (!cmdline_apic) {
278 apic = apic_probe[i];
279 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
280 apic->name);
281 }
282 return 1;
283 }
284 return 0;
285}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
new file mode 100644
index 000000000000..1783652bb0e5
--- /dev/null
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -0,0 +1,100 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Generic APIC sub-arch probe layer.
6 *
7 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
8 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
9 * James Cleverdon.
10 */
11#include <linux/threads.h>
12#include <linux/cpumask.h>
13#include <linux/string.h>
14#include <linux/module.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h>
17#include <linux/init.h>
18#include <linux/hardirq.h>
19#include <linux/dmar.h>
20
21#include <asm/smp.h>
22#include <asm/apic.h>
23#include <asm/ipi.h>
24#include <asm/setup.h>
25
26extern struct apic apic_flat;
27extern struct apic apic_physflat;
28extern struct apic apic_x2apic_uv_x;
29extern struct apic apic_x2apic_phys;
30extern struct apic apic_x2apic_cluster;
31
32struct apic __read_mostly *apic = &apic_flat;
33EXPORT_SYMBOL_GPL(apic);
34
35static struct apic *apic_probe[] __initdata = {
36#ifdef CONFIG_X86_UV
37 &apic_x2apic_uv_x,
38#endif
39#ifdef CONFIG_X86_X2APIC
40 &apic_x2apic_phys,
41 &apic_x2apic_cluster,
42#endif
43 &apic_physflat,
44 NULL,
45};
46
47/*
48 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
49 */
50void __init default_setup_apic_routing(void)
51{
52#ifdef CONFIG_X86_X2APIC
53 if (x2apic && (apic != &apic_x2apic_phys &&
54#ifdef CONFIG_X86_UV
55 apic != &apic_x2apic_uv_x &&
56#endif
57 apic != &apic_x2apic_cluster)) {
58 if (x2apic_phys)
59 apic = &apic_x2apic_phys;
60 else
61 apic = &apic_x2apic_cluster;
62 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
63 }
64#endif
65
66 if (apic == &apic_flat) {
67 if (max_physical_apicid >= 8)
68 apic = &apic_physflat;
69 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
70 }
71
72 /*
73 * Now that apic routing model is selected, configure the
74 * fault handling for intr remapping.
75 */
76 if (intr_remapping_enabled)
77 enable_drhd_fault_handling();
78}
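
The selection above reduces to a small decision table: x2apic hardware picks physical or cluster mode according to the x2apic_phys boot flag, and flat mode falls back to physical flat once an APIC ID of 8 or more is seen, since logical flat delivery can only address 8 CPUs. A standalone sketch of that table, with the kernel globals turned into parameters (names and driver strings illustrative):

#include <stdio.h>

static const char *pick_apic_driver(int x2apic, int x2apic_phys,
				    int max_physical_apicid)
{
	if (x2apic)
		return x2apic_phys ? "physical x2apic" : "cluster x2apic";
	if (max_physical_apicid >= 8)
		return "physical flat";	/* logical flat addresses only 8 CPUs */
	return "flat";
}

int main(void)
{
	printf("%s\n", pick_apic_driver(0, 0, 4));	/* flat */
	printf("%s\n", pick_apic_driver(0, 0, 32));	/* physical flat */
	printf("%s\n", pick_apic_driver(1, 0, 32));	/* cluster x2apic */
	return 0;
}
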
79
80/* Same for both flat and physical. */
81
82void apic_send_IPI_self(int vector)
83{
84 __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
85}
86
87int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
88{
89 int i;
90
91 for (i = 0; apic_probe[i]; ++i) {
92 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
93 apic = apic_probe[i];
94 printk(KERN_INFO "Setting APIC routing to %s.\n",
95 apic->name);
96 return 1;
97 }
98 }
99 return 0;
100}
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
new file mode 100644
index 000000000000..9cfe1f415d81
--- /dev/null
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -0,0 +1,576 @@
1/*
2 * IBM Summit-Specific Code
3 *
4 * Written By: Matthew Dobson, IBM Corporation
5 *
6 * Copyright (c) 2003 IBM Corp.
7 *
8 * All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or (at
13 * your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
18 * NON INFRINGEMENT. See the GNU General Public License for more
19 * details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 * Send feedback to <colpatch@us.ibm.com>
26 *
27 */
28
29#include <linux/mm.h>
30#include <linux/init.h>
31#include <asm/io.h>
32#include <asm/bios_ebda.h>
33
34/*
35 * APIC driver for the IBM "Summit" chipset.
36 */
37#include <linux/threads.h>
38#include <linux/cpumask.h>
39#include <asm/mpspec.h>
40#include <asm/apic.h>
41#include <asm/smp.h>
42#include <asm/fixmap.h>
43#include <asm/apicdef.h>
44#include <asm/ipi.h>
45#include <linux/kernel.h>
46#include <linux/string.h>
47#include <linux/init.h>
48#include <linux/gfp.h>
49#include <linux/smp.h>
50
51static unsigned summit_get_apic_id(unsigned long x)
52{
53 return (x >> 24) & 0xFF;
54}
55
56static inline void summit_send_IPI_mask(const struct cpumask *mask, int vector)
57{
58 default_send_IPI_mask_sequence_logical(mask, vector);
59}
60
61static void summit_send_IPI_allbutself(int vector)
62{
63 default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
64}
65
66static void summit_send_IPI_all(int vector)
67{
68 summit_send_IPI_mask(cpu_online_mask, vector);
69}
70
71#include <asm/tsc.h>
72
73extern int use_cyclone;
74
75#ifdef CONFIG_X86_SUMMIT_NUMA
76static void setup_summit(void);
77#else
78static inline void setup_summit(void) {}
79#endif
80
81static int summit_mps_oem_check(struct mpc_table *mpc, char *oem,
82 char *productid)
83{
84 if (!strncmp(oem, "IBM ENSW", 8) &&
85 (!strncmp(productid, "VIGIL SMP", 9)
86 || !strncmp(productid, "EXA", 3)
87 || !strncmp(productid, "RUTHLESS SMP", 12))){
88 mark_tsc_unstable("Summit based system");
89 use_cyclone = 1; /*enable cyclone-timer*/
90 setup_summit();
91 return 1;
92 }
93 return 0;
94}
95
96/* Hook from generic ACPI tables.c */
97static int summit_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
98{
99 if (!strncmp(oem_id, "IBM", 3) &&
100 (!strncmp(oem_table_id, "SERVIGIL", 8)
101 || !strncmp(oem_table_id, "EXA", 3))){
102 mark_tsc_unstable("Summit based system");
103 use_cyclone = 1; /*enable cyclone-timer*/
104 setup_summit();
105 return 1;
106 }
107 return 0;
108}
109
110struct rio_table_hdr {
111 unsigned char version; /* Version number of this data structure */
112 /* Version 3 adds chassis_num & WP_index */
113 unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
114 unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
115} __attribute__((packed));
116
117struct scal_detail {
118 unsigned char node_id; /* Scalability Node ID */
119 unsigned long CBAR; /* Address of 1MB register space */
120 unsigned char port0node; /* Node ID port connected to: 0xFF=None */
121 unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
122 unsigned char port1node; /* Node ID port connected to: 0xFF = None */
123 unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
124 unsigned char port2node; /* Node ID port connected to: 0xFF = None */
125 unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
126 unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
127} __attribute__((packed));
128
129struct rio_detail {
130 unsigned char node_id; /* RIO Node ID */
131 unsigned long BBAR; /* Address of 1MB register space */
132 unsigned char type; /* Type of device */
133 unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
134 /* For CYC: Node ID of Twister that owns this CYC */
135 unsigned char port0node; /* Node ID port connected to: 0xFF=None */
136 unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
137 unsigned char port1node; /* Node ID port connected to: 0xFF=None */
138 unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
139 unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
140 /* For CYC: 0 */
141 unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
142 /* = 0 : the XAPIC is not used, ie:*/
143 /* ints fwded to another XAPIC */
144 /* Bits1:7 Reserved */
145 /* For CYC: Bits0:7 Reserved */
146 unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
147 /* lower slot numbers/PCI bus numbers */
148 /* For CYC: No meaning */
149 unsigned char chassis_num; /* 1 based Chassis number */
150 /* For LookOut WPEGs this field indicates the */
151 /* Expansion Chassis #, enumerated from Boot */
152 /* Node WPEG external port, then Boot Node CYC */
153 /* external port, then Next Vigil chassis WPEG */
154 /* external port, etc. */
155 /* Shared Lookouts have only 1 chassis number (the */
156 /* first one assigned) */
157} __attribute__((packed));
158
159
160typedef enum {
161 CompatTwister = 0, /* Compatibility Twister */
162 AltTwister = 1, /* Alternate Twister of internal 8-way */
163 CompatCyclone = 2, /* Compatibility Cyclone */
164 AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
165 CompatWPEG = 4, /* Compatibility WPEG */
166 AltWPEG = 5, /* Second Planar WPEG */
167 LookOutAWPEG = 6, /* LookOut WPEG */
168 LookOutBWPEG = 7, /* LookOut WPEG */
169} node_type;
170
171static inline int is_WPEG(struct rio_detail *rio){
172 return (rio->type == CompatWPEG || rio->type == AltWPEG ||
173 rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
174}
175
176
177/* In clustered mode, the high nibble of APIC ID is a cluster number.
178 * The low nibble is a 4-bit bitmap. */
179#define XAPIC_DEST_CPUS_SHIFT 4
180#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
181#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
182
183#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
184
185static const struct cpumask *summit_target_cpus(void)
186{
187 /* CPU_MASK_ALL (0xff) has undefined behaviour with
188 * dest_LowestPrio mode logical clustered apic interrupt routing.
189 * Just start on cpu 0; IRQ balancing will spread the load.
190 */
191 return cpumask_of(0);
192}
193
194static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid)
195{
196 return 0;
197}
198
199/* we don't use the phys_cpu_present_map to indicate apicid presence */
200static unsigned long summit_check_apicid_present(int bit)
201{
202 return 1;
203}
204
205static void summit_init_apic_ldr(void)
206{
207 unsigned long val, id;
208 int count = 0;
209 u8 my_id = (u8)hard_smp_processor_id();
210 u8 my_cluster = APIC_CLUSTER(my_id);
211#ifdef CONFIG_SMP
212 u8 lid;
213 int i;
214
215 /* Create logical APIC IDs by counting CPUs already in cluster. */
216 for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
217 lid = cpu_2_logical_apicid[i];
218 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
219 ++count;
220 }
221#endif
222 /* We only have a 4-bit-wide bitmap in cluster mode. If a deranged
223 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
224 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
225 id = my_cluster | (1UL << count);
226 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
227 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
228 val |= SET_APIC_LOGICAL_ID(id);
229 apic_write(APIC_LDR, val);
230}
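
Concretely: with the masks defined above, a CPU whose hardware APIC ID is 0x52 belongs to cluster 5 (the high nibble), and if two CPUs of that cluster have already been counted, it takes bit 2 of the low-nibble bitmap. A worked sketch, assuming APIC_CLUSTER() simply keeps the high nibble in place (the hardware ID and count are made up):

#include <stdio.h>

#define XAPIC_DEST_CPUS_SHIFT	4
#define APIC_CLUSTER(id)	((id) & (0xFu << XAPIC_DEST_CPUS_SHIFT))  /* assumed */

int main(void)
{
	unsigned int hard_id = 0x52;	/* hypothetical ID latched at reset */
	unsigned int count   = 2;	/* CPUs already counted in this cluster */
	unsigned int ldr     = APIC_CLUSTER(hard_id) | (1u << count);

	printf("logical APIC ID = 0x%02x\n", ldr);	/* 0x54: cluster 5, bit 2 */
	return 0;
}
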
231
232static int summit_apic_id_registered(void)
233{
234 return 1;
235}
236
237static void summit_setup_apic_routing(void)
238{
239 printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
240 nr_ioapics);
241}
242
243static int summit_apicid_to_node(int logical_apicid)
244{
245#ifdef CONFIG_SMP
246 return apicid_2_node[hard_smp_processor_id()];
247#else
248 return 0;
249#endif
250}
251
252/* Mapping from cpu number to logical apicid */
253static inline int summit_cpu_to_logical_apicid(int cpu)
254{
255#ifdef CONFIG_SMP
256 if (cpu >= nr_cpu_ids)
257 return BAD_APICID;
258 return cpu_2_logical_apicid[cpu];
259#else
260 return logical_smp_processor_id();
261#endif
262}
263
264static int summit_cpu_present_to_apicid(int mps_cpu)
265{
266 if (mps_cpu < nr_cpu_ids)
267 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
268 else
269 return BAD_APICID;
270}
271
272static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map)
273{
274 /* For clustered we don't have a good way to do this yet - hack */
275 return physids_promote(0x0F);
276}
277
278static physid_mask_t summit_apicid_to_cpu_present(int apicid)
279{
280 return physid_mask_of_physid(0);
281}
282
283static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid)
284{
285 return 1;
286}
287
288static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
289{
290 unsigned int round = 0;
291 int cpu, apicid = 0;
292
293 /*
294 * The cpus in the mask must all be in the same APIC cluster.
295 */
296 for_each_cpu(cpu, cpumask) {
297 int new_apicid = summit_cpu_to_logical_apicid(cpu);
298
299 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
300 printk("%s: Not a valid mask!\n", __func__);
301 return BAD_APICID;
302 }
303 apicid |= new_apicid;
304 round++;
305 }
306 return apicid;
307}
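
For example, logical IDs 0x51 and 0x54 (both cluster 5) OR together into the valid destination 0x55, whereas mixing 0x51 with 0x61 trips the cluster check on the second pass through the loop and yields BAD_APICID.
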
308
309static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
310 const struct cpumask *andmask)
311{
312 int apicid = summit_cpu_to_logical_apicid(0);
313 cpumask_var_t cpumask;
314
315 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
316 return apicid;
317
318 cpumask_and(cpumask, inmask, andmask);
319 cpumask_and(cpumask, cpumask, cpu_online_mask);
320 apicid = summit_cpu_mask_to_apicid(cpumask);
321
322 free_cpumask_var(cpumask);
323
324 return apicid;
325}
326
327/*
328 * cpuid returns the value latched in the HW at reset, not the APIC ID
329 * register's value. For any box whose BIOS changes APIC IDs, like
330 * clustered APIC systems, we must use hard_smp_processor_id.
331 *
332 * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
333 */
334static int summit_phys_pkg_id(int cpuid_apic, int index_msb)
335{
336 return hard_smp_processor_id() >> index_msb;
337}
338
339static int probe_summit(void)
340{
341 /* probed later in mptable/ACPI hooks */
342 return 0;
343}
344
345static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask)
346{
347 /* Careful. Some cpus do not strictly honor the set of cpus
348 * specified in the interrupt destination when using lowest
349 * priority interrupt delivery mode.
350 *
351 * In particular there was a hyperthreading cpu observed to
352 * deliver interrupts to the wrong hyperthread when only one
353 * hyperthread was specified in the interrupt destination.
354 */
355 cpumask_clear(retmask);
356 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
357}
358
359#ifdef CONFIG_X86_SUMMIT_NUMA
360static struct rio_table_hdr *rio_table_hdr;
361static struct scal_detail *scal_devs[MAX_NUMNODES];
362static struct rio_detail *rio_devs[MAX_NUMNODES*4];
363
364#ifndef CONFIG_X86_NUMAQ
365static int mp_bus_id_to_node[MAX_MP_BUSSES];
366#endif
367
368static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
369{
370 int twister = 0, node = 0;
371 int i, bus, num_buses;
372
373 for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
374 if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) {
375 twister = rio_devs[i]->owner_id;
376 break;
377 }
378 }
379 if (i == rio_table_hdr->num_rio_dev) {
380 printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__);
381 return last_bus;
382 }
383
384 for (i = 0; i < rio_table_hdr->num_scal_dev; i++) {
385 if (scal_devs[i]->node_id == twister) {
386 node = scal_devs[i]->node_id;
387 break;
388 }
389 }
390 if (i == rio_table_hdr->num_scal_dev) {
391 printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__);
392 return last_bus;
393 }
394
395 switch (rio_devs[wpeg_num]->type) {
396 case CompatWPEG:
397 /*
398 * The Compatibility Winnipeg controls the 2 legacy buses,
399 * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
400 * a PCI-PCI bridge card is used in either slot: total 5 buses.
401 */
402 num_buses = 5;
403 break;
404 case AltWPEG:
405 /*
406 * The Alternate Winnipeg controls the 2 133MHz buses [1 slot
407 * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
408 * the "extra" buses for each of those slots: total 7 buses.
409 */
410 num_buses = 7;
411 break;
412 case LookOutAWPEG:
413 case LookOutBWPEG:
414 /*
415 * A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
416 * & the "extra" buses for each of those slots: total 9 buses.
417 */
418 num_buses = 9;
419 break;
420 default:
421 printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__);
422 return last_bus;
423 }
424
425 for (bus = last_bus; bus < last_bus + num_buses; bus++)
426 mp_bus_id_to_node[bus] = node;
427 return bus;
428}
429
430static int build_detail_arrays(void)
431{
432 unsigned long ptr;
433 int i, scal_detail_size, rio_detail_size;
434
435 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
436 printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
437 return 0;
438 }
439
440 switch (rio_table_hdr->version) {
441 default:
442 printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version);
443 return 0;
444 case 2:
445 scal_detail_size = 11;
446 rio_detail_size = 13;
447 break;
448 case 3:
449 scal_detail_size = 12;
450 rio_detail_size = 15;
451 break;
452 }
453
454 ptr = (unsigned long)rio_table_hdr + 3;
455 for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
456 scal_devs[i] = (struct scal_detail *)ptr;
457
458 for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
459 rio_devs[i] = (struct rio_detail *)ptr;
460
461 return 1;
462}
463
464void setup_summit(void)
465{
466 unsigned long ptr;
467 unsigned short offset;
468 int i, next_wpeg, next_bus = 0;
469
470 /* The pointer to the EBDA is stored in the word at phys 0x40E (40:0E) */
471 ptr = get_bios_ebda();
472 ptr = (unsigned long)phys_to_virt(ptr);
473
474 rio_table_hdr = NULL;
475 offset = 0x180;
476 while (offset) {
477 /* The block id is stored in the 2nd word */
478 if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) {
479 /* set the pointer past the offset & block id */
480 rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
481 break;
482 }
483 /* The next offset is stored in the 1st word. 0 means no more */
484 offset = *((unsigned short *)(ptr + offset));
485 }
486 if (!rio_table_hdr) {
487 printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__);
488 return;
489 }
490
491 if (!build_detail_arrays())
492 return;
493
494 /* The first Winnipeg we're looking for has an index of 0 */
495 next_wpeg = 0;
496 do {
497 for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
498 if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) {
499 /* It's the Winnipeg we're looking for! */
500 next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
501 next_wpeg++;
502 break;
503 }
504 }
505 /*
506 * If we go through all Rio devices and don't find one with
507 * the next index, it means we've found all the Winnipegs,
508 * and thus all the PCI buses.
509 */
510 if (i == rio_table_hdr->num_rio_dev)
511 next_wpeg = 0;
512 } while (next_wpeg != 0);
513}
514#endif
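
The EBDA scan above walks a simple singly linked chain: each block begins with a 16-bit offset to the next block (0 terminates) followed by a 16-bit block id, and the Rio Grande header sits 4 bytes past the block whose id word is 0x4752 (the bytes 'R','G' on little-endian x86). A standalone sketch of the same walk over a fabricated two-block buffer (the offsets and the dummy id are made up):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

static uint16_t rd16(const uint8_t *p)
{
	uint16_t v;
	memcpy(&v, p, 2);	/* avoid unaligned-access pitfalls */
	return v;
}

static void wr16(uint8_t *p, uint16_t v)
{
	memcpy(p, &v, 2);
}

int main(void)
{
	uint8_t ebda[0x400] = { 0 };
	uint16_t off;

	wr16(ebda + 0x180,     0x200);	/* next-offset word: chain to 0x200 */
	wr16(ebda + 0x180 + 2, 0x1234);	/* some other block id (made up)    */
	wr16(ebda + 0x200,     0);	/* end of chain                     */
	wr16(ebda + 0x200 + 2, 0x4752);	/* "RG": the Rio Grande table block */

	for (off = 0x180; off; off = rd16(ebda + off)) {
		if (rd16(ebda + off + 2) == 0x4752) {
			printf("rio_table_hdr at offset 0x%x\n", off + 4);
			break;
		}
	}
	return 0;
}
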
515
516struct apic apic_summit = {
517
518 .name = "summit",
519 .probe = probe_summit,
520 .acpi_madt_oem_check = summit_acpi_madt_oem_check,
521 .apic_id_registered = summit_apic_id_registered,
522
523 .irq_delivery_mode = dest_LowestPrio,
524 /* logical delivery broadcast to all CPUs: */
525 .irq_dest_mode = 1,
526
527 .target_cpus = summit_target_cpus,
528 .disable_esr = 1,
529 .dest_logical = APIC_DEST_LOGICAL,
530 .check_apicid_used = summit_check_apicid_used,
531 .check_apicid_present = summit_check_apicid_present,
532
533 .vector_allocation_domain = summit_vector_allocation_domain,
534 .init_apic_ldr = summit_init_apic_ldr,
535
536 .ioapic_phys_id_map = summit_ioapic_phys_id_map,
537 .setup_apic_routing = summit_setup_apic_routing,
538 .multi_timer_check = NULL,
539 .apicid_to_node = summit_apicid_to_node,
540 .cpu_to_logical_apicid = summit_cpu_to_logical_apicid,
541 .cpu_present_to_apicid = summit_cpu_present_to_apicid,
542 .apicid_to_cpu_present = summit_apicid_to_cpu_present,
543 .setup_portio_remap = NULL,
544 .check_phys_apicid_present = summit_check_phys_apicid_present,
545 .enable_apic_mode = NULL,
546 .phys_pkg_id = summit_phys_pkg_id,
547 .mps_oem_check = summit_mps_oem_check,
548
549 .get_apic_id = summit_get_apic_id,
550 .set_apic_id = NULL,
551 .apic_id_mask = 0xFF << 24,
552
553 .cpu_mask_to_apicid = summit_cpu_mask_to_apicid,
554 .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and,
555
556 .send_IPI_mask = summit_send_IPI_mask,
557 .send_IPI_mask_allbutself = NULL,
558 .send_IPI_allbutself = summit_send_IPI_allbutself,
559 .send_IPI_all = summit_send_IPI_all,
560 .send_IPI_self = default_send_IPI_self,
561
562 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
563 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
564
565 .wait_for_init_deassert = default_wait_for_init_deassert,
566
567 .smp_callin_clear_local_apic = NULL,
568 .inquire_remote_apic = default_inquire_remote_apic,
569
570 .read = native_apic_mem_read,
571 .write = native_apic_mem_write,
572 .icr_read = native_apic_icr_read,
573 .icr_write = native_apic_icr_write,
574 .wait_icr_idle = native_apic_wait_icr_idle,
575 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
576};
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
new file mode 100644
index 000000000000..4a903e2f0d17
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -0,0 +1,245 @@
1#include <linux/threads.h>
2#include <linux/cpumask.h>
3#include <linux/string.h>
4#include <linux/kernel.h>
5#include <linux/ctype.h>
6#include <linux/init.h>
7#include <linux/dmar.h>
8
9#include <asm/smp.h>
10#include <asm/apic.h>
11#include <asm/ipi.h>
12
13DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{
17 return x2apic_enabled();
18}
19
20/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
21
22static const struct cpumask *x2apic_target_cpus(void)
23{
24 return cpumask_of(0);
25}
26
27/*
28 * For now, each logical cpu is in its own vector allocation domain.
29 */
30static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
31{
32 cpumask_clear(retmask);
33 cpumask_set_cpu(cpu, retmask);
34}
35
36static void
37 __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
38{
39 unsigned long cfg;
40
41 cfg = __prepare_ICR(0, vector, dest);
42
43 /*
44 * send the IPI.
45 */
46 native_x2apic_icr_write(cfg, apicid);
47}
48
49/*
50 * For now, we send the IPIs one by one in the cpumask.
51 * TBD: based on the cpu mask, we could send the IPIs to a whole cluster
52 * group at once; there are 16 cpus in a cluster. This would minimize
53 * IPI register writes (sketched after this function).
54 */
55static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
56{
57 unsigned long query_cpu;
58 unsigned long flags;
59
60 x2apic_wrmsr_fence();
61
62 local_irq_save(flags);
63 for_each_cpu(query_cpu, mask) {
64 __x2apic_send_IPI_dest(
65 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
66 vector, apic->dest_logical);
67 }
68 local_irq_restore(flags);
69}
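
The batching the TBD comment wishes for is possible because an x2apic cluster logical ID packs the cluster number into bits 31:16 and a one-hot position (one of 16) into bits 15:0, so destinations within one cluster can simply be OR-ed together and posted with a single ICR write. A standalone sketch of that accumulation, with a printf standing in for the MSR write and the per-cpu IDs fabricated:

#include <stdio.h>

#define NR_CPUS		8
#define MAX_CLUSTERS	2	/* enough for the fabricated IDs below */

static void send_ipi(unsigned int dest, int vector)
{
	printf("ICR write: dest=0x%08x vector=0x%x\n", dest, vector);
}

int main(void)
{
	/* fabricated x86_cpu_to_logical_apicid values: two clusters of four */
	unsigned int logical_id[NR_CPUS] = {
		0x00001, 0x00002, 0x00004, 0x00008,	/* cluster 0 */
		0x10001, 0x10002, 0x10004, 0x10008,	/* cluster 1 */
	};
	unsigned int pending[MAX_CLUSTERS] = { 0 };
	int cpu, c, vector = 0xf2;

	for (cpu = 0; cpu < NR_CPUS; cpu++)	/* accumulate per cluster */
		pending[logical_id[cpu] >> 16] |= logical_id[cpu];

	for (c = 0; c < MAX_CLUSTERS; c++)	/* one write per cluster */
		if (pending[c])
			send_ipi(pending[c], vector);
	return 0;
}

Eight CPUs collapse into two ICR writes, one per cluster.
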
70
71static void
72 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
73{
74 unsigned long this_cpu = smp_processor_id();
75 unsigned long query_cpu;
76 unsigned long flags;
77
78 x2apic_wrmsr_fence();
79
80 local_irq_save(flags);
81 for_each_cpu(query_cpu, mask) {
82 if (query_cpu == this_cpu)
83 continue;
84 __x2apic_send_IPI_dest(
85 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
86 vector, apic->dest_logical);
87 }
88 local_irq_restore(flags);
89}
90
91static void x2apic_send_IPI_allbutself(int vector)
92{
93 unsigned long this_cpu = smp_processor_id();
94 unsigned long query_cpu;
95 unsigned long flags;
96
97 x2apic_wrmsr_fence();
98
99 local_irq_save(flags);
100 for_each_online_cpu(query_cpu) {
101 if (query_cpu == this_cpu)
102 continue;
103 __x2apic_send_IPI_dest(
104 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
105 vector, apic->dest_logical);
106 }
107 local_irq_restore(flags);
108}
109
110static void x2apic_send_IPI_all(int vector)
111{
112 x2apic_send_IPI_mask(cpu_online_mask, vector);
113}
114
115static int x2apic_apic_id_registered(void)
116{
117 return 1;
118}
119
120static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
121{
122 /*
123 * We're using fixed IRQ delivery, can only return one logical APIC ID.
124 * May as well be the first.
125 */
126 int cpu = cpumask_first(cpumask);
127
128 if ((unsigned)cpu < nr_cpu_ids)
129 return per_cpu(x86_cpu_to_logical_apicid, cpu);
130 else
131 return BAD_APICID;
132}
133
134static unsigned int
135x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
136 const struct cpumask *andmask)
137{
138 int cpu;
139
140 /*
141 * We're using fixed IRQ delivery, can only return one logical APIC ID.
142 * May as well be the first.
143 */
144 for_each_cpu_and(cpu, cpumask, andmask) {
145 if (cpumask_test_cpu(cpu, cpu_online_mask))
146 break;
147 }
148
149 if (cpu < nr_cpu_ids)
150 return per_cpu(x86_cpu_to_logical_apicid, cpu);
151
152 return BAD_APICID;
153}
154
155static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
156{
157 unsigned int id;
158
159 id = x;
160 return id;
161}
162
163static unsigned long set_apic_id(unsigned int id)
164{
165 unsigned long x;
166
167 x = id;
168 return x;
169}
170
171static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
172{
173 return current_cpu_data.initial_apicid >> index_msb;
174}
175
176static void x2apic_send_IPI_self(int vector)
177{
178 apic_write(APIC_SELF_IPI, vector);
179}
180
181static void init_x2apic_ldr(void)
182{
183 int cpu = smp_processor_id();
184
185 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
186}
187
188struct apic apic_x2apic_cluster = {
189
190 .name = "cluster x2apic",
191 .probe = NULL,
192 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
193 .apic_id_registered = x2apic_apic_id_registered,
194
195 .irq_delivery_mode = dest_LowestPrio,
196 .irq_dest_mode = 1, /* logical */
197
198 .target_cpus = x2apic_target_cpus,
199 .disable_esr = 0,
200 .dest_logical = APIC_DEST_LOGICAL,
201 .check_apicid_used = NULL,
202 .check_apicid_present = NULL,
203
204 .vector_allocation_domain = x2apic_vector_allocation_domain,
205 .init_apic_ldr = init_x2apic_ldr,
206
207 .ioapic_phys_id_map = NULL,
208 .setup_apic_routing = NULL,
209 .multi_timer_check = NULL,
210 .apicid_to_node = NULL,
211 .cpu_to_logical_apicid = NULL,
212 .cpu_present_to_apicid = default_cpu_present_to_apicid,
213 .apicid_to_cpu_present = NULL,
214 .setup_portio_remap = NULL,
215 .check_phys_apicid_present = default_check_phys_apicid_present,
216 .enable_apic_mode = NULL,
217 .phys_pkg_id = x2apic_cluster_phys_pkg_id,
218 .mps_oem_check = NULL,
219
220 .get_apic_id = x2apic_cluster_phys_get_apic_id,
221 .set_apic_id = set_apic_id,
222 .apic_id_mask = 0xFFFFFFFFu,
223
224 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
225 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
226
227 .send_IPI_mask = x2apic_send_IPI_mask,
228 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
229 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
230 .send_IPI_all = x2apic_send_IPI_all,
231 .send_IPI_self = x2apic_send_IPI_self,
232
233 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
234 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
235 .wait_for_init_deassert = NULL,
236 .smp_callin_clear_local_apic = NULL,
237 .inquire_remote_apic = NULL,
238
239 .read = native_apic_msr_read,
240 .write = native_apic_msr_write,
241 .icr_read = native_x2apic_icr_read,
242 .icr_write = native_x2apic_icr_write,
243 .wait_icr_idle = native_x2apic_wait_icr_idle,
244 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
245};
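
Note the accessor swap at the bottom of the table relative to apic_default: the x2apic drivers reach the APIC through MSRs (native_apic_msr_read/write, native_x2apic_icr_*) rather than the memory-mapped window the xAPIC drivers use, which is also why __x2apic_send_IPI_dest above can post destination and command in one 64-bit ICR write.
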
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 21bcc0e098ba..a284359627e7 100644
--- a/arch/x86/kernel/genx2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -7,10 +7,10 @@
7#include <linux/dmar.h> 7#include <linux/dmar.h>
8 8
9#include <asm/smp.h> 9#include <asm/smp.h>
10#include <asm/apic.h>
10#include <asm/ipi.h> 11#include <asm/ipi.h>
11#include <asm/genapic.h>
12 12
13static int x2apic_phys; 13int x2apic_phys;
14 14
15static int set_x2apic_phys_mode(char *arg) 15static int set_x2apic_phys_mode(char *arg)
16{ 16{
@@ -21,10 +21,10 @@ early_param("x2apic_phys", set_x2apic_phys_mode);
21 21
22static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 22static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
23{ 23{
24 if (cpu_has_x2apic && x2apic_phys) 24 if (x2apic_phys)
25 return 1; 25 return x2apic_enabled();
26 26 else
27 return 0; 27 return 0;
28} 28}
29 29
30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
@@ -50,13 +50,15 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
50 /* 50 /*
51 * send the IPI. 51 * send the IPI.
52 */ 52 */
53 x2apic_icr_write(cfg, apicid); 53 native_x2apic_icr_write(cfg, apicid);
54} 54}
55 55
56static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) 56static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
57{ 57{
58 unsigned long flags;
59 unsigned long query_cpu; 58 unsigned long query_cpu;
59 unsigned long flags;
60
61 x2apic_wrmsr_fence();
60 62
61 local_irq_save(flags); 63 local_irq_save(flags);
62 for_each_cpu(query_cpu, mask) { 64 for_each_cpu(query_cpu, mask) {
@@ -66,12 +68,14 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
66 local_irq_restore(flags); 68 local_irq_restore(flags);
67} 69}
68 70
69static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, 71static void
70 int vector) 72 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
71{ 73{
72 unsigned long flags;
73 unsigned long query_cpu;
74 unsigned long this_cpu = smp_processor_id(); 74 unsigned long this_cpu = smp_processor_id();
75 unsigned long query_cpu;
76 unsigned long flags;
77
78 x2apic_wrmsr_fence();
75 79
76 local_irq_save(flags); 80 local_irq_save(flags);
77 for_each_cpu(query_cpu, mask) { 81 for_each_cpu(query_cpu, mask) {
@@ -85,16 +89,19 @@ static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
85 89
86static void x2apic_send_IPI_allbutself(int vector) 90static void x2apic_send_IPI_allbutself(int vector)
87{ 91{
88 unsigned long flags;
89 unsigned long query_cpu;
90 unsigned long this_cpu = smp_processor_id(); 92 unsigned long this_cpu = smp_processor_id();
93 unsigned long query_cpu;
94 unsigned long flags;
95
96 x2apic_wrmsr_fence();
91 97
92 local_irq_save(flags); 98 local_irq_save(flags);
93 for_each_online_cpu(query_cpu) 99 for_each_online_cpu(query_cpu) {
94 if (query_cpu != this_cpu) 100 if (query_cpu == this_cpu)
95 __x2apic_send_IPI_dest( 101 continue;
96 per_cpu(x86_cpu_to_apicid, query_cpu), 102 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
97 vector, APIC_DEST_PHYSICAL); 103 vector, APIC_DEST_PHYSICAL);
104 }
98 local_irq_restore(flags); 105 local_irq_restore(flags);
99} 106}
100 107
@@ -110,21 +117,21 @@ static int x2apic_apic_id_registered(void)
110 117
111static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) 118static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
112{ 119{
113 int cpu;
114
115 /* 120 /*
116 * We're using fixed IRQ delivery, can only return one phys APIC ID. 121 * We're using fixed IRQ delivery, can only return one phys APIC ID.
117 * May as well be the first. 122 * May as well be the first.
118 */ 123 */
119 cpu = cpumask_first(cpumask); 124 int cpu = cpumask_first(cpumask);
125
120 if ((unsigned)cpu < nr_cpu_ids) 126 if ((unsigned)cpu < nr_cpu_ids)
121 return per_cpu(x86_cpu_to_apicid, cpu); 127 return per_cpu(x86_cpu_to_apicid, cpu);
122 else 128 else
123 return BAD_APICID; 129 return BAD_APICID;
124} 130}
125 131
126static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 132static unsigned int
127 const struct cpumask *andmask) 133x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
134 const struct cpumask *andmask)
128{ 135{
129 int cpu; 136 int cpu;
130 137
@@ -132,31 +139,28 @@ static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
132 * We're using fixed IRQ delivery, can only return one phys APIC ID. 139 * We're using fixed IRQ delivery, can only return one phys APIC ID.
133 * May as well be the first. 140 * May as well be the first.
134 */ 141 */
135 for_each_cpu_and(cpu, cpumask, andmask) 142 for_each_cpu_and(cpu, cpumask, andmask) {
136 if (cpumask_test_cpu(cpu, cpu_online_mask)) 143 if (cpumask_test_cpu(cpu, cpu_online_mask))
137 break; 144 break;
145 }
146
138 if (cpu < nr_cpu_ids) 147 if (cpu < nr_cpu_ids)
139 return per_cpu(x86_cpu_to_apicid, cpu); 148 return per_cpu(x86_cpu_to_apicid, cpu);
149
140 return BAD_APICID; 150 return BAD_APICID;
141} 151}
142 152
143static unsigned int get_apic_id(unsigned long x) 153static unsigned int x2apic_phys_get_apic_id(unsigned long x)
144{ 154{
145 unsigned int id; 155 return x;
146
147 id = x;
148 return id;
149} 156}
150 157
151static unsigned long set_apic_id(unsigned int id) 158static unsigned long set_apic_id(unsigned int id)
152{ 159{
153 unsigned long x; 160 return id;
154
155 x = id;
156 return x;
157} 161}
158 162
159static unsigned int phys_pkg_id(int index_msb) 163static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
160{ 164{
161 return current_cpu_data.initial_apicid >> index_msb; 165 return current_cpu_data.initial_apicid >> index_msb;
162} 166}
@@ -168,27 +172,63 @@ static void x2apic_send_IPI_self(int vector)
168 172
169static void init_x2apic_ldr(void) 173static void init_x2apic_ldr(void)
170{ 174{
171 return; 175}
172} 176
173 177struct apic apic_x2apic_phys = {
174struct genapic apic_x2apic_phys = { 178
175 .name = "physical x2apic", 179 .name = "physical x2apic",
176 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 180 .probe = NULL,
177 .int_delivery_mode = dest_Fixed, 181 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
178 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 182 .apic_id_registered = x2apic_apic_id_registered,
179 .target_cpus = x2apic_target_cpus, 183
180 .vector_allocation_domain = x2apic_vector_allocation_domain, 184 .irq_delivery_mode = dest_Fixed,
181 .apic_id_registered = x2apic_apic_id_registered, 185 .irq_dest_mode = 0, /* physical */
182 .init_apic_ldr = init_x2apic_ldr, 186
183 .send_IPI_all = x2apic_send_IPI_all, 187 .target_cpus = x2apic_target_cpus,
184 .send_IPI_allbutself = x2apic_send_IPI_allbutself, 188 .disable_esr = 0,
185 .send_IPI_mask = x2apic_send_IPI_mask, 189 .dest_logical = 0,
186 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, 190 .check_apicid_used = NULL,
187 .send_IPI_self = x2apic_send_IPI_self, 191 .check_apicid_present = NULL,
188 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 192
189 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, 193 .vector_allocation_domain = x2apic_vector_allocation_domain,
190 .phys_pkg_id = phys_pkg_id, 194 .init_apic_ldr = init_x2apic_ldr,
191 .get_apic_id = get_apic_id, 195
192 .set_apic_id = set_apic_id, 196 .ioapic_phys_id_map = NULL,
193 .apic_id_mask = (0xFFFFFFFFu), 197 .setup_apic_routing = NULL,
198 .multi_timer_check = NULL,
199 .apicid_to_node = NULL,
200 .cpu_to_logical_apicid = NULL,
201 .cpu_present_to_apicid = default_cpu_present_to_apicid,
202 .apicid_to_cpu_present = NULL,
203 .setup_portio_remap = NULL,
204 .check_phys_apicid_present = default_check_phys_apicid_present,
205 .enable_apic_mode = NULL,
206 .phys_pkg_id = x2apic_phys_pkg_id,
207 .mps_oem_check = NULL,
208
209 .get_apic_id = x2apic_phys_get_apic_id,
210 .set_apic_id = set_apic_id,
211 .apic_id_mask = 0xFFFFFFFFu,
212
213 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
214 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
215
216 .send_IPI_mask = x2apic_send_IPI_mask,
217 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
218 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
219 .send_IPI_all = x2apic_send_IPI_all,
220 .send_IPI_self = x2apic_send_IPI_self,
221
222 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
223 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
224 .wait_for_init_deassert = NULL,
225 .smp_callin_clear_local_apic = NULL,
226 .inquire_remote_apic = NULL,
227
228 .read = native_apic_msr_read,
229 .write = native_apic_msr_write,
230 .icr_read = native_x2apic_icr_read,
231 .icr_write = native_x2apic_icr_write,
232 .wait_icr_idle = native_x2apic_wait_icr_idle,
233 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
194}; 234};
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index b193e082f6ce..1248318436e8 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -7,27 +7,28 @@
7 * 7 *
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10
11#include <linux/kernel.h>
12#include <linux/threads.h>
13#include <linux/cpu.h>
14#include <linux/cpumask.h> 10#include <linux/cpumask.h>
11#include <linux/hardirq.h>
12#include <linux/proc_fs.h>
13#include <linux/threads.h>
14#include <linux/kernel.h>
15#include <linux/module.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/ctype.h> 17#include <linux/ctype.h>
17#include <linux/init.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/module.h>
20#include <linux/hardirq.h>
21#include <linux/timer.h> 19#include <linux/timer.h>
22#include <linux/proc_fs.h> 20#include <linux/cpu.h>
23#include <asm/current.h> 21#include <linux/init.h>
24#include <asm/smp.h> 22
25#include <asm/ipi.h>
26#include <asm/genapic.h>
27#include <asm/pgtable.h>
28#include <asm/uv/uv_mmrs.h> 23#include <asm/uv/uv_mmrs.h>
29#include <asm/uv/uv_hub.h> 24#include <asm/uv/uv_hub.h>
25#include <asm/current.h>
26#include <asm/pgtable.h>
30#include <asm/uv/bios.h> 27#include <asm/uv/bios.h>
28#include <asm/uv/uv.h>
29#include <asm/apic.h>
30#include <asm/ipi.h>
31#include <asm/smp.h>
31 32
32DEFINE_PER_CPU(int, x2apic_extra_bits); 33DEFINE_PER_CPU(int, x2apic_extra_bits);
33 34
@@ -90,40 +91,39 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
90 cpumask_set_cpu(cpu, retmask); 91 cpumask_set_cpu(cpu, retmask);
91} 92}
92 93
93int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) 94static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
94{ 95{
96#ifdef CONFIG_SMP
95 unsigned long val; 97 unsigned long val;
96 int pnode; 98 int pnode;
97 99
98 pnode = uv_apicid_to_pnode(phys_apicid); 100 pnode = uv_apicid_to_pnode(phys_apicid);
99 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 101 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
100 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | 102 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
101 (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | 103 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
102 APIC_DM_INIT; 104 APIC_DM_INIT;
103 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 105 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
104 mdelay(10); 106 mdelay(10);
105 107
106 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 108 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
107 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | 109 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
108 (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | 110 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
109 APIC_DM_STARTUP; 111 APIC_DM_STARTUP;
110 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 112 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
113
114 atomic_set(&init_deasserted, 1);
115#endif
111 return 0; 116 return 0;
112} 117}
113 118
114static void uv_send_IPI_one(int cpu, int vector) 119static void uv_send_IPI_one(int cpu, int vector)
115{ 120{
116 unsigned long val, apicid, lapicid; 121 unsigned long apicid;
117 int pnode; 122 int pnode;
118 123
119 apicid = per_cpu(x86_cpu_to_apicid, cpu); 124 apicid = per_cpu(x86_cpu_to_apicid, cpu);
120 lapicid = apicid & 0x3f; /* ZZZ macro needed */
121 pnode = uv_apicid_to_pnode(apicid); 125 pnode = uv_apicid_to_pnode(apicid);
122 val = 126 uv_hub_send_ipi(pnode, apicid, vector);
123 (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid <<
124 UVH_IPI_INT_APIC_ID_SHFT) |
125 (vector << UVH_IPI_INT_VECTOR_SHFT);
126 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
127} 127}
128 128
129static void uv_send_IPI_mask(const struct cpumask *mask, int vector) 129static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
@@ -136,22 +136,24 @@ static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
136 136
137static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) 137static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
138{ 138{
139 unsigned int cpu;
140 unsigned int this_cpu = smp_processor_id(); 139 unsigned int this_cpu = smp_processor_id();
140 unsigned int cpu;
141 141
142 for_each_cpu(cpu, mask) 142 for_each_cpu(cpu, mask) {
143 if (cpu != this_cpu) 143 if (cpu != this_cpu)
144 uv_send_IPI_one(cpu, vector); 144 uv_send_IPI_one(cpu, vector);
145 }
145} 146}
146 147
147static void uv_send_IPI_allbutself(int vector) 148static void uv_send_IPI_allbutself(int vector)
148{ 149{
149 unsigned int cpu;
150 unsigned int this_cpu = smp_processor_id(); 150 unsigned int this_cpu = smp_processor_id();
151 unsigned int cpu;
151 152
152 for_each_online_cpu(cpu) 153 for_each_online_cpu(cpu) {
153 if (cpu != this_cpu) 154 if (cpu != this_cpu)
154 uv_send_IPI_one(cpu, vector); 155 uv_send_IPI_one(cpu, vector);
156 }
155} 157}
156 158
157static void uv_send_IPI_all(int vector) 159static void uv_send_IPI_all(int vector)
@@ -170,21 +172,21 @@ static void uv_init_apic_ldr(void)
170 172
171static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) 173static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
172{ 174{
173 int cpu;
174
175 /* 175 /*
176 * We're using fixed IRQ delivery, can only return one phys APIC ID. 176 * We're using fixed IRQ delivery, can only return one phys APIC ID.
177 * May as well be the first. 177 * May as well be the first.
178 */ 178 */
179 cpu = cpumask_first(cpumask); 179 int cpu = cpumask_first(cpumask);
180
180 if ((unsigned)cpu < nr_cpu_ids) 181 if ((unsigned)cpu < nr_cpu_ids)
181 return per_cpu(x86_cpu_to_apicid, cpu); 182 return per_cpu(x86_cpu_to_apicid, cpu);
182 else 183 else
183 return BAD_APICID; 184 return BAD_APICID;
184} 185}
185 186
186static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 187static unsigned int
187 const struct cpumask *andmask) 188uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
189 const struct cpumask *andmask)
188{ 190{
189 int cpu; 191 int cpu;
190 192
@@ -192,15 +194,17 @@ static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
192 * We're using fixed IRQ delivery, can only return one phys APIC ID. 194 * We're using fixed IRQ delivery, can only return one phys APIC ID.
193 * May as well be the first. 195 * May as well be the first.
194 */ 196 */
195 for_each_cpu_and(cpu, cpumask, andmask) 197 for_each_cpu_and(cpu, cpumask, andmask) {
196 if (cpumask_test_cpu(cpu, cpu_online_mask)) 198 if (cpumask_test_cpu(cpu, cpu_online_mask))
197 break; 199 break;
200 }
198 if (cpu < nr_cpu_ids) 201 if (cpu < nr_cpu_ids)
199 return per_cpu(x86_cpu_to_apicid, cpu); 202 return per_cpu(x86_cpu_to_apicid, cpu);
203
200 return BAD_APICID; 204 return BAD_APICID;
201} 205}
202 206
203static unsigned int get_apic_id(unsigned long x) 207static unsigned int x2apic_get_apic_id(unsigned long x)
204{ 208{
205 unsigned int id; 209 unsigned int id;
206 210
@@ -222,10 +226,10 @@ static unsigned long set_apic_id(unsigned int id)
222static unsigned int uv_read_apic_id(void) 226static unsigned int uv_read_apic_id(void)
223{ 227{
224 228
225 return get_apic_id(apic_read(APIC_ID)); 229 return x2apic_get_apic_id(apic_read(APIC_ID));
226} 230}
227 231
228static unsigned int phys_pkg_id(int index_msb) 232static int uv_phys_pkg_id(int initial_apicid, int index_msb)
229{ 233{
230 return uv_read_apic_id() >> index_msb; 234 return uv_read_apic_id() >> index_msb;
231} 235}
@@ -235,26 +239,64 @@ static void uv_send_IPI_self(int vector)
235 apic_write(APIC_SELF_IPI, vector); 239 apic_write(APIC_SELF_IPI, vector);
236} 240}
237 241
238struct genapic apic_x2apic_uv_x = { 242struct apic apic_x2apic_uv_x = {
239 .name = "UV large system", 243
240 .acpi_madt_oem_check = uv_acpi_madt_oem_check, 244 .name = "UV large system",
241 .int_delivery_mode = dest_Fixed, 245 .probe = NULL,
242 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 246 .acpi_madt_oem_check = uv_acpi_madt_oem_check,
243 .target_cpus = uv_target_cpus, 247 .apic_id_registered = uv_apic_id_registered,
244 .vector_allocation_domain = uv_vector_allocation_domain, 248
245 .apic_id_registered = uv_apic_id_registered, 249 .irq_delivery_mode = dest_Fixed,
246 .init_apic_ldr = uv_init_apic_ldr, 250 .irq_dest_mode = 1, /* logical */
247 .send_IPI_all = uv_send_IPI_all, 251
248 .send_IPI_allbutself = uv_send_IPI_allbutself, 252 .target_cpus = uv_target_cpus,
249 .send_IPI_mask = uv_send_IPI_mask, 253 .disable_esr = 0,
250 .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, 254 .dest_logical = APIC_DEST_LOGICAL,
251 .send_IPI_self = uv_send_IPI_self, 255 .check_apicid_used = NULL,
252 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, 256 .check_apicid_present = NULL,
253 .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, 257
254 .phys_pkg_id = phys_pkg_id, 258 .vector_allocation_domain = uv_vector_allocation_domain,
255 .get_apic_id = get_apic_id, 259 .init_apic_ldr = uv_init_apic_ldr,
256 .set_apic_id = set_apic_id, 260
257 .apic_id_mask = (0xFFFFFFFFu), 261 .ioapic_phys_id_map = NULL,
262 .setup_apic_routing = NULL,
263 .multi_timer_check = NULL,
264 .apicid_to_node = NULL,
265 .cpu_to_logical_apicid = NULL,
266 .cpu_present_to_apicid = default_cpu_present_to_apicid,
267 .apicid_to_cpu_present = NULL,
268 .setup_portio_remap = NULL,
269 .check_phys_apicid_present = default_check_phys_apicid_present,
270 .enable_apic_mode = NULL,
271 .phys_pkg_id = uv_phys_pkg_id,
272 .mps_oem_check = NULL,
273
274 .get_apic_id = x2apic_get_apic_id,
275 .set_apic_id = set_apic_id,
276 .apic_id_mask = 0xFFFFFFFFu,
277
278 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
279 .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
280
281 .send_IPI_mask = uv_send_IPI_mask,
282 .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
283 .send_IPI_allbutself = uv_send_IPI_allbutself,
284 .send_IPI_all = uv_send_IPI_all,
285 .send_IPI_self = uv_send_IPI_self,
286
287 .wakeup_secondary_cpu = uv_wakeup_secondary,
288 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
289 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
290 .wait_for_init_deassert = NULL,
291 .smp_callin_clear_local_apic = NULL,
292 .inquire_remote_apic = NULL,
293
294 .read = native_apic_msr_read,
295 .write = native_apic_msr_write,
296 .icr_read = native_x2apic_icr_read,
297 .icr_write = native_x2apic_icr_write,
298 .wait_icr_idle = native_x2apic_wait_icr_idle,
299 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
258}; 300};
259 301
260static __cpuinit void set_x2apic_extra_bits(int pnode) 302static __cpuinit void set_x2apic_extra_bits(int pnode)
@@ -322,7 +364,7 @@ static __init void map_high(char *id, unsigned long base, int shift,
322 paddr = base << shift; 364 paddr = base << shift;
323 bytes = (1UL << shift) * (max_pnode + 1); 365 bytes = (1UL << shift) * (max_pnode + 1);
324 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, 366 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
325 paddr + bytes); 367 paddr + bytes);
326 if (map_type == map_uc) 368 if (map_type == map_uc)
327 init_extra_mapping_uc(paddr, bytes); 369 init_extra_mapping_uc(paddr, bytes);
328 else 370 else
@@ -485,7 +527,7 @@ late_initcall(uv_init_heartbeat);
485 527
486/* 528/*
487 * Called on each cpu to initialize the per_cpu UV data area. 529 * Called on each cpu to initialize the per_cpu UV data area.
488 * ZZZ hotplug not supported yet 530 * FIXME: hotplug not supported yet
489 */ 531 */
490void __cpuinit uv_cpu_init(void) 532void __cpuinit uv_cpu_init(void)
491{ 533{
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 98807bb095ad..49e0939bac42 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -301,7 +301,7 @@ extern int (*console_blank_hook)(int);
301 */ 301 */
302#define APM_ZERO_SEGS 302#define APM_ZERO_SEGS
303 303
304#include "apm.h" 304#include <asm/apm.h>
305 305
306/* 306/*
307 * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend. 307 * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
@@ -466,7 +466,7 @@ static const lookup_t error_table[] = {
466 * @err: APM BIOS return code 466 * @err: APM BIOS return code
467 * 467 *
468 * Write a meaningful log entry to the kernel log in the event of 468 * Write a meaningful log entry to the kernel log in the event of
469 * an APM error. 469 * an APM error. Note that this also handles (negative) kernel errors.
470 */ 470 */
471 471
472static void apm_error(char *str, int err) 472static void apm_error(char *str, int err)
@@ -478,43 +478,14 @@ static void apm_error(char *str, int err)
478 break; 478 break;
479 if (i < ERROR_COUNT) 479 if (i < ERROR_COUNT)
480 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); 480 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
481 else if (err < 0)
482 printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err);
481 else 483 else
482 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", 484 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
483 str, err); 485 str, err);
484} 486}
485 487
486/* 488/*
487 * Lock APM functionality to physical CPU 0
488 */
489
490#ifdef CONFIG_SMP
491
492static cpumask_t apm_save_cpus(void)
493{
494 cpumask_t x = current->cpus_allowed;
495 /* Some bioses don't like being called from CPU != 0 */
496 set_cpus_allowed(current, cpumask_of_cpu(0));
497 BUG_ON(smp_processor_id() != 0);
498 return x;
499}
500
501static inline void apm_restore_cpus(cpumask_t mask)
502{
503 set_cpus_allowed(current, mask);
504}
505
506#else
507
508/*
509 * No CPU lockdown needed on a uniprocessor
510 */
511
512#define apm_save_cpus() (current->cpus_allowed)
513#define apm_restore_cpus(x) (void)(x)
514
515#endif
516
517/*
518 * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and 489 * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and
519 * apm_info.allow_ints, we are being really paranoid here! Not only 490 * apm_info.allow_ints, we are being really paranoid here! Not only
520 * are interrupts disabled, but all the segment registers (except SS) 491 * are interrupts disabled, but all the segment registers (except SS)
@@ -568,16 +539,23 @@ static inline void apm_irq_restore(unsigned long flags)
568# define APM_DO_RESTORE_SEGS 539# define APM_DO_RESTORE_SEGS
569#endif 540#endif
570 541
542struct apm_bios_call {
543 u32 func;
544 /* In and out */
545 u32 ebx;
546 u32 ecx;
547 /* Out only */
548 u32 eax;
549 u32 edx;
550 u32 esi;
551
552 /* Error: -ENOMEM, or bits 8-15 of eax */
553 int err;
554};
555
571/** 556/**
572 * apm_bios_call - Make an APM BIOS 32bit call 557 * __apm_bios_call - Make an APM BIOS 32bit call
573 * @func: APM function to execute 558 * @_call: pointer to struct apm_bios_call.
574 * @ebx_in: EBX register for call entry
575 * @ecx_in: ECX register for call entry
576 * @eax: EAX register return
577 * @ebx: EBX register return
578 * @ecx: ECX register return
579 * @edx: EDX register return
580 * @esi: ESI register return
581 * 559 *
582 * Make an APM call using the 32bit protected mode interface. The 560 * Make an APM call using the 32bit protected mode interface. The
583 * caller is responsible for knowing if APM BIOS is configured and 561 * caller is responsible for knowing if APM BIOS is configured and
@@ -586,80 +564,142 @@ static inline void apm_irq_restore(unsigned long flags)
586 * flag is loaded into AL. If there is an error, then the error 564 * flag is loaded into AL. If there is an error, then the error
587 * code is returned in AH (bits 8-15 of eax) and this function 565 * code is returned in AH (bits 8-15 of eax) and this function
588 * returns non-zero. 566 * returns non-zero.
567 *
568 * Note: this makes the call on the current CPU.
589 */ 569 */
590 570static long __apm_bios_call(void *_call)
591static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
592 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi)
593{ 571{
594 APM_DECL_SEGS 572 APM_DECL_SEGS
595 unsigned long flags; 573 unsigned long flags;
596 cpumask_t cpus;
597 int cpu; 574 int cpu;
598 struct desc_struct save_desc_40; 575 struct desc_struct save_desc_40;
599 struct desc_struct *gdt; 576 struct desc_struct *gdt;
600 577 struct apm_bios_call *call = _call;
601 cpus = apm_save_cpus();
602 578
603 cpu = get_cpu(); 579 cpu = get_cpu();
580 BUG_ON(cpu != 0);
604 gdt = get_cpu_gdt_table(cpu); 581 gdt = get_cpu_gdt_table(cpu);
605 save_desc_40 = gdt[0x40 / 8]; 582 save_desc_40 = gdt[0x40 / 8];
606 gdt[0x40 / 8] = bad_bios_desc; 583 gdt[0x40 / 8] = bad_bios_desc;
607 584
608 apm_irq_save(flags); 585 apm_irq_save(flags);
609 APM_DO_SAVE_SEGS; 586 APM_DO_SAVE_SEGS;
610 apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi); 587 apm_bios_call_asm(call->func, call->ebx, call->ecx,
588 &call->eax, &call->ebx, &call->ecx, &call->edx,
589 &call->esi);
611 APM_DO_RESTORE_SEGS; 590 APM_DO_RESTORE_SEGS;
612 apm_irq_restore(flags); 591 apm_irq_restore(flags);
613 gdt[0x40 / 8] = save_desc_40; 592 gdt[0x40 / 8] = save_desc_40;
614 put_cpu(); 593 put_cpu();
615 apm_restore_cpus(cpus);
616 594
617 return *eax & 0xff; 595 return call->eax & 0xff;
596}
597
598/* Run __apm_bios_call or __apm_bios_call_simple on CPU 0 */
599static int on_cpu0(long (*fn)(void *), struct apm_bios_call *call)
600{
601 int ret;
602
603 /* Don't bother with work_on_cpu in the common case, so we don't
604 * have to worry about OOM or overhead. */
605 if (get_cpu() == 0) {
606 ret = fn(call);
607 put_cpu();
608 } else {
609 put_cpu();
610 ret = work_on_cpu(0, fn, call);
611 }
612
613 /* work_on_cpu can fail with -ENOMEM */
614 if (ret < 0)
615 call->err = ret;
616 else
617 call->err = (call->eax >> 8) & 0xff;
618
619 return ret;
618} 620}
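
The removed apm_save_cpus()/apm_restore_cpus() pair achieved the same CPU-0 pinning by migrating the calling task and restoring its mask afterwards; on_cpu0() instead hands the call off with work_on_cpu() when the caller is elsewhere. A user-space analogue of the old migrate-and-restore style, for flavor (Linux-specific; error handling omitted):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

static long bios_ish_call(void *arg)
{
	printf("running on cpu %d, arg=%ld\n", sched_getcpu(), (long)arg);
	return 0;
}

static long on_cpu0(long (*fn)(void *), void *arg)
{
	cpu_set_t old, cpu0;
	long ret;

	sched_getaffinity(0, sizeof(old), &old);	/* remember current mask */
	CPU_ZERO(&cpu0);
	CPU_SET(0, &cpu0);
	sched_setaffinity(0, sizeof(cpu0), &cpu0);	/* migrate to CPU 0 */
	ret = fn(arg);
	sched_setaffinity(0, sizeof(old), &old);	/* restore */
	return ret;
}

int main(void)
{
	return (int)on_cpu0(bios_ish_call, (void *)42L);
}
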
619 621
620/** 622/**
621 * apm_bios_call_simple - make a simple APM BIOS 32bit call 623 * apm_bios_call - Make an APM BIOS 32bit call (on CPU 0)
622 * @func: APM function to invoke 624 * @call: the apm_bios_call registers.
623 * @ebx_in: EBX register value for BIOS call 625 *
624 * @ecx_in: ECX register value for BIOS call 626 * If there is an error, it is returned in @call.err.
625 * @eax: EAX register on return from the BIOS call 627 */
628static int apm_bios_call(struct apm_bios_call *call)
629{
630 return on_cpu0(__apm_bios_call, call);
631}
632
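A sketch of how a caller is expected to drive this interface (the function below is hypothetical; only the struct-in/struct-out convention and the err handling are from the patch):

/* Hypothetical caller: one APM query via the new interface. */
static int example_apm_query(u32 func, u32 *result)
{
	struct apm_bios_call call = {
		.func = func,		/* an APM_FUNC_* code */
		.ebx  = 0,
		.ecx  = 0,
	};

	if (apm_bios_call(&call))	/* runs on CPU 0 via on_cpu0() */
		return call.err;	/* -ENOMEM or AH error code */

	*result = call.eax;		/* out registers come back in call */
	return 0;
}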
633/**
634 * __apm_bios_call_simple - Make an APM BIOS 32bit call (on CPU 0)
635 * @_call: pointer to struct apm_bios_call.
626 * 636 *
627 * Make a BIOS call that returns one value only, or just status. 637 * Make a BIOS call that returns one value only, or just status.
628 * If there is an error, then the error code is returned in AH 638 * If there is an error, then the error code is returned in AH
629 * (bits 8-15 of eax) and this function returns non-zero. This is 639 * (bits 8-15 of eax) and this function returns non-zero (it can
630 * used for simpler BIOS operations. This call may hold interrupts 640 * also return -ENOMEM). This is used for simpler BIOS operations.
631 * off for a long time on some laptops. 641 * This call may hold interrupts off for a long time on some laptops.
642 *
643 * Note: this makes the call on the current CPU.
632 */ 644 */
633 645static long __apm_bios_call_simple(void *_call)
634static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
635{ 646{
636 u8 error; 647 u8 error;
637 APM_DECL_SEGS 648 APM_DECL_SEGS
638 unsigned long flags; 649 unsigned long flags;
639 cpumask_t cpus;
640 int cpu; 650 int cpu;
641 struct desc_struct save_desc_40; 651 struct desc_struct save_desc_40;
642 struct desc_struct *gdt; 652 struct desc_struct *gdt;
643 653 struct apm_bios_call *call = _call;
644 cpus = apm_save_cpus();
645 654
646 cpu = get_cpu(); 655 cpu = get_cpu();
656 BUG_ON(cpu != 0);
647 gdt = get_cpu_gdt_table(cpu); 657 gdt = get_cpu_gdt_table(cpu);
648 save_desc_40 = gdt[0x40 / 8]; 658 save_desc_40 = gdt[0x40 / 8];
649 gdt[0x40 / 8] = bad_bios_desc; 659 gdt[0x40 / 8] = bad_bios_desc;
650 660
651 apm_irq_save(flags); 661 apm_irq_save(flags);
652 APM_DO_SAVE_SEGS; 662 APM_DO_SAVE_SEGS;
653 error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax); 663 error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx,
664 &call->eax);
654 APM_DO_RESTORE_SEGS; 665 APM_DO_RESTORE_SEGS;
655 apm_irq_restore(flags); 666 apm_irq_restore(flags);
656 gdt[0x40 / 8] = save_desc_40; 667 gdt[0x40 / 8] = save_desc_40;
657 put_cpu(); 668 put_cpu();
658 apm_restore_cpus(cpus);
659 return error; 669 return error;
660} 670}
661 671
662/** 672/**
673 * apm_bios_call_simple - make a simple APM BIOS 32bit call
674 * @func: APM function to invoke
675 * @ebx_in: EBX register value for BIOS call
676 * @ecx_in: ECX register value for BIOS call
677 * @eax: EAX register on return from the BIOS call
 678 * @err: error code on return: -ENOMEM, or bits 8-15 of eax

679 *
680 * Make a BIOS call that returns one value only, or just status.
681 * If there is an error, then the error code is returned in @err
682 * and this function returns non-zero. This is used for simpler
683 * BIOS operations. This call may hold interrupts off for a long
684 * time on some laptops.
685 */
686static int apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax,
687 int *err)
688{
689 struct apm_bios_call call;
690 int ret;
691
692 call.func = func;
693 call.ebx = ebx_in;
694 call.ecx = ecx_in;
695
696 ret = on_cpu0(__apm_bios_call_simple, &call);
697 *eax = call.eax;
698 *err = call.err;
699 return ret;
700}
701
702/**
663 * apm_driver_version - APM driver version 703 * apm_driver_version - APM driver version
664 * @val: loaded with the APM version on return 704 * @val: loaded with the APM version on return
665 * 705 *
@@ -678,9 +718,10 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
678static int apm_driver_version(u_short *val) 718static int apm_driver_version(u_short *val)
679{ 719{
680 u32 eax; 720 u32 eax;
721 int err;
681 722
682 if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax)) 723 if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax, &err))
683 return (eax >> 8) & 0xff; 724 return err;
684 *val = eax; 725 *val = eax;
685 return APM_SUCCESS; 726 return APM_SUCCESS;
686} 727}
@@ -701,22 +742,21 @@ static int apm_driver_version(u_short *val)
701 * that APM 1.2 is in use. If no messages are pending the value 0x80 742 * that APM 1.2 is in use. If no messages are pending the value 0x80
702 * is returned (No power management events pending). 743 * is returned (No power management events pending).
703 */ 744 */
704
705static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) 745static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
706{ 746{
707 u32 eax; 747 struct apm_bios_call call;
708 u32 ebx;
709 u32 ecx;
710 u32 dummy;
711 748
712 if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx, 749 call.func = APM_FUNC_GET_EVENT;
713 &dummy, &dummy)) 750 call.ebx = call.ecx = 0;
714 return (eax >> 8) & 0xff; 751
715 *event = ebx; 752 if (apm_bios_call(&call))
753 return call.err;
754
755 *event = call.ebx;
716 if (apm_info.connection_version < 0x0102) 756 if (apm_info.connection_version < 0x0102)
717 *info = ~0; /* indicate info not valid */ 757 *info = ~0; /* indicate info not valid */
718 else 758 else
719 *info = ecx; 759 *info = call.ecx;
720 return APM_SUCCESS; 760 return APM_SUCCESS;
721} 761}
722 762
@@ -737,9 +777,10 @@ static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
737static int set_power_state(u_short what, u_short state) 777static int set_power_state(u_short what, u_short state)
738{ 778{
739 u32 eax; 779 u32 eax;
780 int err;
740 781
741 if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax)) 782 if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax, &err))
742 return (eax >> 8) & 0xff; 783 return err;
743 return APM_SUCCESS; 784 return APM_SUCCESS;
744} 785}
745 786
@@ -770,6 +811,7 @@ static int apm_do_idle(void)
770 u8 ret = 0; 811 u8 ret = 0;
771 int idled = 0; 812 int idled = 0;
772 int polling; 813 int polling;
814 int err;
773 815
774 polling = !!(current_thread_info()->status & TS_POLLING); 816 polling = !!(current_thread_info()->status & TS_POLLING);
775 if (polling) { 817 if (polling) {
@@ -782,7 +824,7 @@ static int apm_do_idle(void)
782 } 824 }
783 if (!need_resched()) { 825 if (!need_resched()) {
784 idled = 1; 826 idled = 1;
785 ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax); 827 ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
786 } 828 }
787 if (polling) 829 if (polling)
788 current_thread_info()->status |= TS_POLLING; 830 current_thread_info()->status |= TS_POLLING;
@@ -797,8 +839,7 @@ static int apm_do_idle(void)
797 * Only report the failure the first 5 times. 839 * Only report the failure the first 5 times.
798 */ 840 */
799 if (++t < 5) { 841 if (++t < 5) {
800 printk(KERN_DEBUG "apm_do_idle failed (%d)\n", 842 printk(KERN_DEBUG "apm_do_idle failed (%d)\n", err);
801 (eax >> 8) & 0xff);
802 t = jiffies; 843 t = jiffies;
803 } 844 }
804 return -1; 845 return -1;
@@ -816,9 +857,10 @@ static int apm_do_idle(void)
816static void apm_do_busy(void) 857static void apm_do_busy(void)
817{ 858{
818 u32 dummy; 859 u32 dummy;
860 int err;
819 861
820 if (clock_slowed || ALWAYS_CALL_BUSY) { 862 if (clock_slowed || ALWAYS_CALL_BUSY) {
821 (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); 863 (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy, &err);
822 clock_slowed = 0; 864 clock_slowed = 0;
823 } 865 }
824} 866}
@@ -937,7 +979,7 @@ static void apm_power_off(void)
937 979
 938 /* Some BIOSes don't like being called from CPU != 0 */ 980 /* Some BIOSes don't like being called from CPU != 0 */
939 if (apm_info.realmode_power_off) { 981 if (apm_info.realmode_power_off) {
940 (void)apm_save_cpus(); 982 set_cpus_allowed_ptr(current, cpumask_of(0));
941 machine_real_restart(po_bios_call, sizeof(po_bios_call)); 983 machine_real_restart(po_bios_call, sizeof(po_bios_call));
942 } else { 984 } else {
943 (void)set_system_power_state(APM_STATE_OFF); 985 (void)set_system_power_state(APM_STATE_OFF);
@@ -956,12 +998,13 @@ static void apm_power_off(void)
956static int apm_enable_power_management(int enable) 998static int apm_enable_power_management(int enable)
957{ 999{
958 u32 eax; 1000 u32 eax;
1001 int err;
959 1002
960 if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED)) 1003 if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
961 return APM_NOT_ENGAGED; 1004 return APM_NOT_ENGAGED;
962 if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL, 1005 if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
963 enable, &eax)) 1006 enable, &eax, &err))
964 return (eax >> 8) & 0xff; 1007 return err;
965 if (enable) 1008 if (enable)
966 apm_info.bios.flags &= ~APM_BIOS_DISABLED; 1009 apm_info.bios.flags &= ~APM_BIOS_DISABLED;
967 else 1010 else
@@ -986,24 +1029,23 @@ static int apm_enable_power_management(int enable)
986 1029
987static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) 1030static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
988{ 1031{
989 u32 eax; 1032 struct apm_bios_call call;
990 u32 ebx; 1033
991 u32 ecx; 1034 call.func = APM_FUNC_GET_STATUS;
992 u32 edx; 1035 call.ebx = APM_DEVICE_ALL;
993 u32 dummy; 1036 call.ecx = 0;
994 1037
995 if (apm_info.get_power_status_broken) 1038 if (apm_info.get_power_status_broken)
996 return APM_32_UNSUPPORTED; 1039 return APM_32_UNSUPPORTED;
997 if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0, 1040 if (apm_bios_call(&call))
998 &eax, &ebx, &ecx, &edx, &dummy)) 1041 return call.err;
999 return (eax >> 8) & 0xff; 1042 *status = call.ebx;
1000 *status = ebx; 1043 *bat = call.ecx;
1001 *bat = ecx;
1002 if (apm_info.get_power_status_swabinminutes) { 1044 if (apm_info.get_power_status_swabinminutes) {
1003 *life = swab16((u16)edx); 1045 *life = swab16((u16)call.edx);
1004 *life |= 0x8000; 1046 *life |= 0x8000;
1005 } else 1047 } else
1006 *life = edx; 1048 *life = call.edx;
1007 return APM_SUCCESS; 1049 return APM_SUCCESS;
1008} 1050}
1009 1051
@@ -1048,12 +1090,14 @@ static int apm_get_battery_status(u_short which, u_short *status,
1048static int apm_engage_power_management(u_short device, int enable) 1090static int apm_engage_power_management(u_short device, int enable)
1049{ 1091{
1050 u32 eax; 1092 u32 eax;
1093 int err;
1051 1094
1052 if ((enable == 0) && (device == APM_DEVICE_ALL) 1095 if ((enable == 0) && (device == APM_DEVICE_ALL)
1053 && (apm_info.bios.flags & APM_BIOS_DISABLED)) 1096 && (apm_info.bios.flags & APM_BIOS_DISABLED))
1054 return APM_DISABLED; 1097 return APM_DISABLED;
1055 if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, &eax)) 1098 if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable,
1056 return (eax >> 8) & 0xff; 1099 &eax, &err))
1100 return err;
1057 if (device == APM_DEVICE_ALL) { 1101 if (device == APM_DEVICE_ALL) {
1058 if (enable) 1102 if (enable)
1059 apm_info.bios.flags &= ~APM_BIOS_DISENGAGED; 1103 apm_info.bios.flags &= ~APM_BIOS_DISENGAGED;
@@ -1190,9 +1234,12 @@ static int suspend(int vetoable)
1190 struct apm_user *as; 1234 struct apm_user *as;
1191 1235
1192 device_suspend(PMSG_SUSPEND); 1236 device_suspend(PMSG_SUSPEND);
1193 local_irq_disable(); 1237
1194 device_power_down(PMSG_SUSPEND); 1238 device_power_down(PMSG_SUSPEND);
1195 1239
1240 local_irq_disable();
1241 sysdev_suspend(PMSG_SUSPEND);
1242
1196 local_irq_enable(); 1243 local_irq_enable();
1197 1244
1198 save_processor_state(); 1245 save_processor_state();
@@ -1208,8 +1255,12 @@ static int suspend(int vetoable)
1208 if (err != APM_SUCCESS) 1255 if (err != APM_SUCCESS)
1209 apm_error("suspend", err); 1256 apm_error("suspend", err);
1210 err = (err == APM_SUCCESS) ? 0 : -EIO; 1257 err = (err == APM_SUCCESS) ? 0 : -EIO;
1211 device_power_up(PMSG_RESUME); 1258
1259 sysdev_resume();
1212 local_irq_enable(); 1260 local_irq_enable();
1261
1262 device_power_up(PMSG_RESUME);
1263
1213 device_resume(PMSG_RESUME); 1264 device_resume(PMSG_RESUME);
1214 queue_event(APM_NORMAL_RESUME, NULL); 1265 queue_event(APM_NORMAL_RESUME, NULL);
1215 spin_lock(&user_list_lock); 1266 spin_lock(&user_list_lock);
@@ -1226,8 +1277,10 @@ static void standby(void)
1226{ 1277{
1227 int err; 1278 int err;
1228 1279
1229 local_irq_disable();
1230 device_power_down(PMSG_SUSPEND); 1280 device_power_down(PMSG_SUSPEND);
1281
1282 local_irq_disable();
1283 sysdev_suspend(PMSG_SUSPEND);
1231 local_irq_enable(); 1284 local_irq_enable();
1232 1285
1233 err = set_system_power_state(APM_STATE_STANDBY); 1286 err = set_system_power_state(APM_STATE_STANDBY);
@@ -1235,8 +1288,10 @@ static void standby(void)
1235 apm_error("standby", err); 1288 apm_error("standby", err);
1236 1289
1237 local_irq_disable(); 1290 local_irq_disable();
1238 device_power_up(PMSG_RESUME); 1291 sysdev_resume();
1239 local_irq_enable(); 1292 local_irq_enable();
1293
1294 device_power_up(PMSG_RESUME);
1240} 1295}
1241 1296
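Condensed, the reordered standby() path now brackets the firmware call like this (error handling elided; the suspend() path has the same shape with the processor-state save/restore added):

	device_power_down(PMSG_SUSPEND);	/* late power-down, IRQs still on */
	local_irq_disable();
	sysdev_suspend(PMSG_SUSPEND);		/* system devices, IRQs off */
	local_irq_enable();

	set_system_power_state(APM_STATE_STANDBY);	/* the BIOS call */

	local_irq_disable();
	sysdev_resume();
	local_irq_enable();
	device_power_up(PMSG_RESUME);		/* mirrors power_down, IRQs on */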
1242static apm_event_t get_event(void) 1297static apm_event_t get_event(void)
@@ -1678,16 +1733,14 @@ static int apm(void *unused)
1678 char *power_stat; 1733 char *power_stat;
1679 char *bat_stat; 1734 char *bat_stat;
1680 1735
1681#ifdef CONFIG_SMP
1682 /* 2002/08/01 - WT 1736 /* 2002/08/01 - WT
1683 * This is to avoid random crashes at boot time during initialization 1737 * This is to avoid random crashes at boot time during initialization
1684 * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D. 1738 * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D.
 1685 * Some BIOSes don't like being called from CPU != 0. 1739 * Some BIOSes don't like being called from CPU != 0.
1686 * Method suggested by Ingo Molnar. 1740 * Method suggested by Ingo Molnar.
1687 */ 1741 */
1688 set_cpus_allowed(current, cpumask_of_cpu(0)); 1742 set_cpus_allowed_ptr(current, cpumask_of(0));
1689 BUG_ON(smp_processor_id() != 0); 1743 BUG_ON(smp_processor_id() != 0);
1690#endif
1691 1744
1692 if (apm_info.connection_version == 0) { 1745 if (apm_info.connection_version == 0) {
1693 apm_info.connection_version = apm_info.bios.version; 1746 apm_info.connection_version = apm_info.bios.version;
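The #ifdef CONFIG_SMP guard can go because cpumask_of() and set_cpus_allowed_ptr() exist on UP builds as well (there set_cpus_allowed_ptr() reduces to a trivial inline), so the pinning pattern is unconditional:

	/* Pin the APM thread to CPU 0 before any BIOS traffic. */
	set_cpus_allowed_ptr(current, cpumask_of(0));
	BUG_ON(smp_processor_id() != 0);	/* BIOS is only safe on CPU 0 */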
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index ee4df08feee6..5a6aa1c1162f 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -18,6 +18,7 @@
18#include <asm/thread_info.h> 18#include <asm/thread_info.h>
19#include <asm/bootparam.h> 19#include <asm/bootparam.h>
20#include <asm/elf.h> 20#include <asm/elf.h>
21#include <asm/suspend.h>
21 22
22#include <xen/interface/xen.h> 23#include <xen/interface/xen.h>
23 24
@@ -75,6 +76,7 @@ void foo(void)
75 OFFSET(PT_DS, pt_regs, ds); 76 OFFSET(PT_DS, pt_regs, ds);
76 OFFSET(PT_ES, pt_regs, es); 77 OFFSET(PT_ES, pt_regs, es);
77 OFFSET(PT_FS, pt_regs, fs); 78 OFFSET(PT_FS, pt_regs, fs);
79 OFFSET(PT_GS, pt_regs, gs);
78 OFFSET(PT_ORIG_EAX, pt_regs, orig_ax); 80 OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
79 OFFSET(PT_EIP, pt_regs, ip); 81 OFFSET(PT_EIP, pt_regs, ip);
80 OFFSET(PT_CS, pt_regs, cs); 82 OFFSET(PT_CS, pt_regs, cs);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 1d41d3f1edbc..e72f062fb4b5 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -11,12 +11,12 @@
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/kbuild.h> 13#include <linux/kbuild.h>
14#include <asm/pda.h>
15#include <asm/processor.h> 14#include <asm/processor.h>
16#include <asm/segment.h> 15#include <asm/segment.h>
17#include <asm/thread_info.h> 16#include <asm/thread_info.h>
18#include <asm/ia32.h> 17#include <asm/ia32.h>
19#include <asm/bootparam.h> 18#include <asm/bootparam.h>
19#include <asm/suspend.h>
20 20
21#include <xen/interface/xen.h> 21#include <xen/interface/xen.h>
22 22
@@ -48,16 +48,6 @@ int main(void)
48#endif 48#endif
49 BLANK(); 49 BLANK();
50#undef ENTRY 50#undef ENTRY
51#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
52 ENTRY(kernelstack);
53 ENTRY(oldrsp);
54 ENTRY(pcurrent);
55 ENTRY(irqcount);
56 ENTRY(cpunumber);
57 ENTRY(irqstackptr);
58 ENTRY(data_offset);
59 BLANK();
60#undef ENTRY
61#ifdef CONFIG_PARAVIRT 51#ifdef CONFIG_PARAVIRT
62 BLANK(); 52 BLANK();
63 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); 53 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 2ac0ab71412a..fc999e6fc46a 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -83,15 +83,15 @@ void __init setup_bios_corruption_check(void)
83 u64 size; 83 u64 size;
84 addr = find_e820_area_size(addr, &size, PAGE_SIZE); 84 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
85 85
86 if (addr == 0) 86 if (!(addr + 1))
87 break;
88
89 if (addr >= corruption_check_size)
87 break; 90 break;
88 91
89 if ((addr + size) > corruption_check_size) 92 if ((addr + size) > corruption_check_size)
90 size = corruption_check_size - addr; 93 size = corruption_check_size - addr;
91 94
92 if (size == 0)
93 break;
94
95 e820_update_range(addr, size, E820_RAM, E820_RESERVED); 95 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
96 scan_areas[num_scan_areas].addr = addr; 96 scan_areas[num_scan_areas].addr = addr;
97 scan_areas[num_scan_areas].size = size; 97 scan_areas[num_scan_areas].size = size;
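The new exit test assumes find_e820_area_size() reports "nothing found" as an all-ones address rather than 0, since 0 is itself a scannable address; a standalone model of the idiom:

#include <stdint.h>

/* !(addr + 1) is true exactly when addr == (uint64_t)-1, the assumed
 * "not found" sentinel; address 0 stays a valid scan target. */
static int scan_done(uint64_t addr)
{
	return !(addr + 1);
}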
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82db7f45e2de..4e242f9a06e4 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,11 +14,12 @@ obj-y += vmware.o hypervisor.o
14obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 14obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
15obj-$(CONFIG_X86_64) += bugs_64.o 15obj-$(CONFIG_X86_64) += bugs_64.o
16 16
17obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
18
17obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 19obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
18obj-$(CONFIG_CPU_SUP_AMD) += amd.o 20obj-$(CONFIG_CPU_SUP_AMD) += amd.o
19obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o 21obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
20obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o 22obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
21obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
22obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
23obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
24 25
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 2cf23634b6d9..8220ae69849d 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -7,7 +7,7 @@
7#include <asm/pat.h> 7#include <asm/pat.h>
8#include <asm/processor.h> 8#include <asm/processor.h>
9 9
10#include <mach_apic.h> 10#include <asm/apic.h>
11 11
12struct cpuid_bit { 12struct cpuid_bit {
13 u16 feature; 13 u16 feature;
@@ -29,7 +29,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
29 u32 regs[4]; 29 u32 regs[4];
30 const struct cpuid_bit *cb; 30 const struct cpuid_bit *cb;
31 31
32 static const struct cpuid_bit cpuid_bits[] = { 32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, 33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { 0, 0, 0, 0 } 34 { 0, 0, 0, 0 }
35 }; 35 };
@@ -69,7 +69,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
69 */ 69 */
70void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) 70void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
71{ 71{
72#ifdef CONFIG_X86_SMP 72#ifdef CONFIG_SMP
73 unsigned int eax, ebx, ecx, edx, sub_index; 73 unsigned int eax, ebx, ecx, edx, sub_index;
74 unsigned int ht_mask_width, core_plus_mask_width; 74 unsigned int ht_mask_width, core_plus_mask_width;
75 unsigned int core_select_mask, core_level_siblings; 75 unsigned int core_select_mask, core_level_siblings;
@@ -116,22 +116,14 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
116 116
117 core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; 117 core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
118 118
119#ifdef CONFIG_X86_32 119 c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width)
120 c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
121 & core_select_mask; 120 & core_select_mask;
122 c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width); 121 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width);
123 /* 122 /*
124 * Reinit the apicid, now that we have extended initial_apicid. 123 * Reinit the apicid, now that we have extended initial_apicid.
125 */ 124 */
126 c->apicid = phys_pkg_id(c->initial_apicid, 0); 125 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
127#else 126
128 c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
129 c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
130 /*
131 * Reinit the apicid, now that we have extended initial_apicid.
132 */
133 c->apicid = phys_pkg_id(0);
134#endif
135 c->x86_max_cores = (core_level_siblings / smp_num_siblings); 127 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
136 128
137 129
@@ -143,37 +135,3 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
143 return; 135 return;
144#endif 136#endif
145} 137}
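A worked instance of the mask arithmetic above, assuming the flat apic->phys_pkg_id(), which shifts the APIC ID right by the given width (the widths are illustrative):

/*
 * ht_mask_width = 1, core_plus_mask_width = 3:
 *
 *   core_select_mask = (~(-1 << 3)) >> 1 = 0x7 >> 1 = 0x3
 *   cpu_core_id      = (initial_apicid >> 1) & 0x3    core in package
 *   phys_proc_id     =  initial_apicid >> 3           package id
 *   apicid           =  initial_apicid >> 0           unchanged
 */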
146
147#ifdef CONFIG_X86_PAT
148void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
149{
150 if (!cpu_has_pat)
151 pat_disable("PAT not supported by CPU.");
152
153 switch (c->x86_vendor) {
154 case X86_VENDOR_INTEL:
155 /*
156 * There is a known erratum on Pentium III and Core Solo
157 * and Core Duo CPUs.
158 * " Page with PAT set to WC while associated MTRR is UC
159 * may consolidate to UC "
160 * Because of this erratum, it is better to stick with
161 * setting WC in MTRR rather than using PAT on these CPUs.
162 *
163 * Enable PAT WC only on P4, Core 2 or later CPUs.
164 */
165 if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15))
166 return;
167
168 pat_disable("PAT WC disabled due to known CPU erratum.");
169 return;
170
171 case X86_VENDOR_AMD:
172 case X86_VENDOR_CENTAUR:
173 case X86_VENDOR_TRANSMETA:
174 return;
175 }
176
177 pat_disable("PAT disabled. Not yet verified on this CPU type.");
178}
179#endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7c878f6aa919..7e4a459daa64 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
5#include <asm/io.h> 5#include <asm/io.h>
6#include <asm/processor.h> 6#include <asm/processor.h>
7#include <asm/apic.h> 7#include <asm/apic.h>
8#include <asm/cpu.h>
8 9
9#ifdef CONFIG_X86_64 10#ifdef CONFIG_X86_64
10# include <asm/numa_64.h> 11# include <asm/numa_64.h>
@@ -12,8 +13,6 @@
12# include <asm/cacheflush.h> 13# include <asm/cacheflush.h>
13#endif 14#endif
14 15
15#include <mach_apic.h>
16
17#include "cpu.h" 16#include "cpu.h"
18 17
19#ifdef CONFIG_X86_32 18#ifdef CONFIG_X86_32
@@ -143,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
143 } 142 }
144} 143}
145 144
145static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
146{
147#ifdef CONFIG_SMP
 148 /* called from identify_secondary_cpu()? */
149 if (c->cpu_index == boot_cpu_id)
150 return;
151
152 /*
153 * Certain Athlons might work (for various values of 'work') in SMP
154 * but they are not certified as MP capable.
155 */
156 /* Athlon 660/661 is valid. */
157 if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
158 (c->x86_mask == 1)))
159 goto valid_k7;
160
161 /* Duron 670 is valid */
162 if ((c->x86_model == 7) && (c->x86_mask == 0))
163 goto valid_k7;
164
165 /*
 166 * Athlon 662, Duron 671, and Athlons above model 7 have the MP
 167 * capability bit. Note that the A5 stepping (662) of some
 168 * Athlon XPs has the MP bit set.
169 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
170 * more.
171 */
172 if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
173 ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
174 (c->x86_model > 7))
175 if (cpu_has_mp)
176 goto valid_k7;
177
178 /* If we get here, not a certified SMP capable AMD system. */
179
180 /*
 181 * Don't taint if we are running an SMP kernel on a single
 182 * non-MP-approved Athlon
183 */
184 WARN_ONCE(1, "WARNING: This combination of AMD"
185 "processors is not suitable for SMP.\n");
186 if (!test_taint(TAINT_UNSAFE_SMP))
187 add_taint(TAINT_UNSAFE_SMP);
188
189valid_k7:
190 ;
191#endif
192}
193
146static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) 194static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
147{ 195{
148 u32 l, h; 196 u32 l, h;
@@ -177,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
177 } 225 }
178 226
179 set_cpu_cap(c, X86_FEATURE_K7); 227 set_cpu_cap(c, X86_FEATURE_K7);
228
229 amd_k7_smp_check(c);
180} 230}
181#endif 231#endif
182 232
@@ -452,7 +502,7 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
452} 502}
453#endif 503#endif
454 504
455static struct cpu_dev amd_cpu_dev __cpuinitdata = { 505static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
456 .c_vendor = "AMD", 506 .c_vendor = "AMD",
457 .c_ident = { "AuthenticAMD" }, 507 .c_ident = { "AuthenticAMD" },
458#ifdef CONFIG_X86_32 508#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 89bfdd9cacc6..c95e831bb095 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,11 +1,11 @@
1#include <linux/bitops.h>
1#include <linux/kernel.h> 2#include <linux/kernel.h>
2#include <linux/init.h> 3#include <linux/init.h>
3#include <linux/bitops.h>
4 4
5#include <asm/processor.h> 5#include <asm/processor.h>
6#include <asm/msr.h>
7#include <asm/e820.h> 6#include <asm/e820.h>
8#include <asm/mtrr.h> 7#include <asm/mtrr.h>
8#include <asm/msr.h>
9 9
10#include "cpu.h" 10#include "cpu.h"
11 11
@@ -276,7 +276,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
276 */ 276 */
277 c->x86_capability[5] = cpuid_edx(0xC0000001); 277 c->x86_capability[5] = cpuid_edx(0xC0000001);
278 } 278 }
279 279#ifdef CONFIG_X86_32
280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ 280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */
281 if (c->x86_model >= 6 && c->x86_model <= 9) { 281 if (c->x86_model >= 6 && c->x86_model <= 9) {
282 rdmsr(MSR_VIA_FCR, lo, hi); 282 rdmsr(MSR_VIA_FCR, lo, hi);
@@ -288,6 +288,11 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
288 /* Before Nehemiah, the C3's had 3dNOW! */ 288 /* Before Nehemiah, the C3's had 3dNOW! */
289 if (c->x86_model >= 6 && c->x86_model < 9) 289 if (c->x86_model >= 6 && c->x86_model < 9)
290 set_cpu_cap(c, X86_FEATURE_3DNOW); 290 set_cpu_cap(c, X86_FEATURE_3DNOW);
291#endif
292 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
293 c->x86_cache_alignment = c->x86_clflush_size * 2;
294 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
295 }
291 296
292 display_cacheinfo(c); 297 display_cacheinfo(c);
293} 298}
@@ -316,16 +321,25 @@ enum {
316static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) 321static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
317{ 322{
318 switch (c->x86) { 323 switch (c->x86) {
324#ifdef CONFIG_X86_32
319 case 5: 325 case 5:
320 /* Emulate MTRRs using Centaur's MCR. */ 326 /* Emulate MTRRs using Centaur's MCR. */
321 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); 327 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
322 break; 328 break;
329#endif
330 case 6:
331 if (c->x86_model >= 0xf)
332 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
333 break;
323 } 334 }
335#ifdef CONFIG_X86_64
336 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
337#endif
324} 338}
325 339
326static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 340static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
327{ 341{
328 342#ifdef CONFIG_X86_32
329 char *name; 343 char *name;
330 u32 fcr_set = 0; 344 u32 fcr_set = 0;
331 u32 fcr_clr = 0; 345 u32 fcr_clr = 0;
@@ -337,8 +351,10 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
337 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway 351 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
338 */ 352 */
339 clear_cpu_cap(c, 0*32+31); 353 clear_cpu_cap(c, 0*32+31);
340 354#endif
355 early_init_centaur(c);
341 switch (c->x86) { 356 switch (c->x86) {
357#ifdef CONFIG_X86_32
342 case 5: 358 case 5:
343 switch (c->x86_model) { 359 switch (c->x86_model) {
344 case 4: 360 case 4:
@@ -442,16 +458,20 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
442 } 458 }
443 sprintf(c->x86_model_id, "WinChip %s", name); 459 sprintf(c->x86_model_id, "WinChip %s", name);
444 break; 460 break;
445 461#endif
446 case 6: 462 case 6:
447 init_c3(c); 463 init_c3(c);
448 break; 464 break;
449 } 465 }
466#ifdef CONFIG_X86_64
467 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
468#endif
450} 469}
451 470
452static unsigned int __cpuinit 471static unsigned int __cpuinit
453centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) 472centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
454{ 473{
474#ifdef CONFIG_X86_32
455 /* VIA C3 CPUs (670-68F) need further shifting. */ 475 /* VIA C3 CPUs (670-68F) need further shifting. */
456 if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) 476 if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
457 size >>= 8; 477 size >>= 8;
@@ -464,11 +484,11 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
464 if ((c->x86 == 6) && (c->x86_model == 9) && 484 if ((c->x86 == 6) && (c->x86_model == 9) &&
465 (c->x86_mask == 1) && (size == 65)) 485 (c->x86_mask == 1) && (size == 65))
466 size -= 1; 486 size -= 1;
467 487#endif
468 return size; 488 return size;
469} 489}
470 490
471static struct cpu_dev centaur_cpu_dev __cpuinitdata = { 491static const struct cpu_dev __cpuinitconst centaur_cpu_dev = {
472 .c_vendor = "Centaur", 492 .c_vendor = "Centaur",
473 .c_ident = { "CentaurHauls" }, 493 .c_ident = { "CentaurHauls" },
474 .c_early_init = early_init_centaur, 494 .c_early_init = early_init_centaur,
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
deleted file mode 100644
index a1625f5a1e78..000000000000
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ /dev/null
@@ -1,37 +0,0 @@
1#include <linux/init.h>
2#include <linux/smp.h>
3
4#include <asm/cpufeature.h>
5#include <asm/processor.h>
6
7#include "cpu.h"
8
9static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
10{
11 if (c->x86 == 0x6 && c->x86_model >= 0xf)
12 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
13
14 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
15}
16
17static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
18{
19 early_init_centaur(c);
20
21 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
22 c->x86_cache_alignment = c->x86_clflush_size * 2;
23 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
24 }
25 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
26}
27
28static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
29 .c_vendor = "Centaur",
30 .c_ident = { "CentaurHauls" },
31 .c_early_init = early_init_centaur,
32 .c_init = init_centaur,
33 .c_x86_vendor = X86_VENDOR_CENTAUR,
34};
35
36cpu_dev_register(centaur_cpu_dev);
37
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 83492b1f93b1..c4f667896c28 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,118 +1,117 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
4#include <linux/string.h>
5#include <linux/bootmem.h> 1#include <linux/bootmem.h>
2#include <linux/linkage.h>
6#include <linux/bitops.h> 3#include <linux/bitops.h>
4#include <linux/kernel.h>
7#include <linux/module.h> 5#include <linux/module.h>
8#include <linux/kgdb.h> 6#include <linux/percpu.h>
9#include <linux/topology.h> 7#include <linux/string.h>
10#include <linux/delay.h> 8#include <linux/delay.h>
9#include <linux/sched.h>
10#include <linux/init.h>
11#include <linux/kgdb.h>
11#include <linux/smp.h> 12#include <linux/smp.h>
12#include <linux/percpu.h> 13#include <linux/io.h>
13#include <asm/i387.h> 14
14#include <asm/msr.h> 15#include <asm/stackprotector.h>
15#include <asm/io.h>
16#include <asm/linkage.h>
17#include <asm/mmu_context.h> 16#include <asm/mmu_context.h>
17#include <asm/hypervisor.h>
18#include <asm/processor.h>
19#include <asm/sections.h>
20#include <asm/topology.h>
21#include <asm/cpumask.h>
22#include <asm/pgtable.h>
23#include <asm/atomic.h>
24#include <asm/proto.h>
25#include <asm/setup.h>
26#include <asm/apic.h>
27#include <asm/desc.h>
28#include <asm/i387.h>
18#include <asm/mtrr.h> 29#include <asm/mtrr.h>
30#include <asm/numa.h>
31#include <asm/asm.h>
32#include <asm/cpu.h>
19#include <asm/mce.h> 33#include <asm/mce.h>
34#include <asm/msr.h>
20#include <asm/pat.h> 35#include <asm/pat.h>
21#include <asm/asm.h>
22#include <asm/numa.h>
23#include <asm/smp.h> 36#include <asm/smp.h>
37
24#ifdef CONFIG_X86_LOCAL_APIC 38#ifdef CONFIG_X86_LOCAL_APIC
25#include <asm/mpspec.h> 39#include <asm/uv/uv.h>
26#include <asm/apic.h>
27#include <mach_apic.h>
28#include <asm/genapic.h>
29#endif 40#endif
30 41
31#include <asm/pda.h>
32#include <asm/pgtable.h>
33#include <asm/processor.h>
34#include <asm/desc.h>
35#include <asm/atomic.h>
36#include <asm/proto.h>
37#include <asm/sections.h>
38#include <asm/setup.h>
39#include <asm/hypervisor.h>
40
41#include "cpu.h" 42#include "cpu.h"
42 43
43#ifdef CONFIG_X86_64
44
45/* all of these masks are initialized in setup_cpu_local_masks() */ 44/* all of these masks are initialized in setup_cpu_local_masks() */
46cpumask_var_t cpu_callin_mask;
47cpumask_var_t cpu_callout_mask;
48cpumask_var_t cpu_initialized_mask; 45cpumask_var_t cpu_initialized_mask;
46cpumask_var_t cpu_callout_mask;
47cpumask_var_t cpu_callin_mask;
49 48
50/* representing cpus for which sibling maps can be computed */ 49/* representing cpus for which sibling maps can be computed */
51cpumask_var_t cpu_sibling_setup_mask; 50cpumask_var_t cpu_sibling_setup_mask;
52 51
53#else /* CONFIG_X86_32 */ 52/* correctly size the local cpu masks */
54 53void __init setup_cpu_local_masks(void)
55cpumask_t cpu_callin_map; 54{
56cpumask_t cpu_callout_map; 55 alloc_bootmem_cpumask_var(&cpu_initialized_mask);
57cpumask_t cpu_initialized; 56 alloc_bootmem_cpumask_var(&cpu_callin_mask);
58cpumask_t cpu_sibling_setup_map; 57 alloc_bootmem_cpumask_var(&cpu_callout_mask);
59 58 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
60#endif /* CONFIG_X86_32 */ 59}
61
62 60
63static struct cpu_dev *this_cpu __cpuinitdata; 61static const struct cpu_dev *this_cpu __cpuinitdata;
64 62
63DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
65#ifdef CONFIG_X86_64 64#ifdef CONFIG_X86_64
66/* We need valid kernel segments for data and code in long mode too 65 /*
67 * IRET will check the segment types kkeil 2000/10/28 66 * We need valid kernel segments for data and code in long mode too
68 * Also sysret mandates a special GDT layout 67 * IRET will check the segment types kkeil 2000/10/28
69 */ 68 * Also sysret mandates a special GDT layout
70/* The TLS descriptors are currently at a different place compared to i386. 69 *
71 Hopefully nobody expects them at a fixed place (Wine?) */ 70 * TLS descriptors are currently at a different place compared to i386.
72DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { 71 * Hopefully nobody expects them at a fixed place (Wine?)
73 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, 72 */
74 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, 73 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
75 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, 74 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
76 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, 75 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
77 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, 76 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
78 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, 77 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
79} }; 78 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
80#else 79#else
81DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { 80 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
82 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 81 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
83 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 82 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
84 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, 83 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
85 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
86 /* 84 /*
87 * Segments used for calling PnP BIOS have byte granularity. 85 * Segments used for calling PnP BIOS have byte granularity.
 88 * Their code and data segments have fixed 64k limits, 86 * Their code and data segments have fixed 64k limits,
89 * the transfer segment sizes are set at run time. 87 * the transfer segment sizes are set at run time.
90 */ 88 */
91 /* 32-bit code */ 89 /* 32-bit code */
92 [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, 90 [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
93 /* 16-bit code */ 91 /* 16-bit code */
94 [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, 92 [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
95 /* 16-bit data */ 93 /* 16-bit data */
96 [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, 94 [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
97 /* 16-bit data */ 95 /* 16-bit data */
98 [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, 96 [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
99 /* 16-bit data */ 97 /* 16-bit data */
100 [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, 98 [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
101 /* 99 /*
102 * The APM segments have byte granularity and their bases 100 * The APM segments have byte granularity and their bases
103 * are set at run time. All have 64k limits. 101 * are set at run time. All have 64k limits.
104 */ 102 */
105 /* 32-bit code */ 103 /* 32-bit code */
106 [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, 104 [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
107 /* 16-bit code */ 105 /* 16-bit code */
108 [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, 106 [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
109 /* data */ 107 /* data */
110 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, 108 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
111 109
112 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 110 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
113 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, 111 [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
114} }; 112 GDT_STACK_CANARY_INIT
115#endif 113#endif
114} };
116EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 115EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
117 116
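One entry decoded, for reference while reading the table (standard x86 descriptor layout, unchanged by the patch):

/*
 * [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } } unpacks as:
 *   base   = 0x00000000
 *   limit  = 0xfffff in 4 KiB units  ->  flat 4 GiB segment
 *   access = 0x9a  (present, DPL 0, code, execute/read)
 *   flags  = 0xc   (4 KiB granularity, 32-bit default size)
 */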
118#ifdef CONFIG_X86_32 117#ifdef CONFIG_X86_32
@@ -153,16 +152,17 @@ static inline int flag_is_changeable_p(u32 flag)
153 * the CPUID. Add "volatile" to not allow gcc to 152 * the CPUID. Add "volatile" to not allow gcc to
154 * optimize the subsequent calls to this function. 153 * optimize the subsequent calls to this function.
155 */ 154 */
156 asm volatile ("pushfl\n\t" 155 asm volatile ("pushfl \n\t"
157 "pushfl\n\t" 156 "pushfl \n\t"
158 "popl %0\n\t" 157 "popl %0 \n\t"
159 "movl %0,%1\n\t" 158 "movl %0, %1 \n\t"
160 "xorl %2,%0\n\t" 159 "xorl %2, %0 \n\t"
161 "pushl %0\n\t" 160 "pushl %0 \n\t"
162 "popfl\n\t" 161 "popfl \n\t"
163 "pushfl\n\t" 162 "pushfl \n\t"
164 "popl %0\n\t" 163 "popl %0 \n\t"
165 "popfl\n\t" 164 "popfl \n\t"
165
166 : "=&r" (f1), "=&r" (f2) 166 : "=&r" (f1), "=&r" (f2)
167 : "ir" (flag)); 167 : "ir" (flag));
168 168
@@ -177,18 +177,22 @@ static int __cpuinit have_cpuid_p(void)
177 177
178static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) 178static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
179{ 179{
180 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { 180 unsigned long lo, hi;
181 /* Disable processor serial number */ 181
182 unsigned long lo, hi; 182 if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
183 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); 183 return;
184 lo |= 0x200000; 184
185 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); 185 /* Disable processor serial number: */
186 printk(KERN_NOTICE "CPU serial number disabled.\n"); 186
187 clear_cpu_cap(c, X86_FEATURE_PN); 187 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
188 188 lo |= 0x200000;
189 /* Disabling the serial number may affect the cpuid level */ 189 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
190 c->cpuid_level = cpuid_eax(0); 190
191 } 191 printk(KERN_NOTICE "CPU serial number disabled.\n");
192 clear_cpu_cap(c, X86_FEATURE_PN);
193
194 /* Disabling the serial number may affect the cpuid level */
195 c->cpuid_level = cpuid_eax(0);
192} 196}
193 197
194static int __init x86_serial_nr_setup(char *s) 198static int __init x86_serial_nr_setup(char *s)
@@ -213,16 +217,64 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
213#endif 217#endif
214 218
215/* 219/*
220 * Some CPU features depend on higher CPUID levels, which may not always
221 * be available due to CPUID level capping or broken virtualization
222 * software. Add those features to this table to auto-disable them.
223 */
224struct cpuid_dependent_feature {
225 u32 feature;
226 u32 level;
227};
228
229static const struct cpuid_dependent_feature __cpuinitconst
230cpuid_dependent_features[] = {
231 { X86_FEATURE_MWAIT, 0x00000005 },
232 { X86_FEATURE_DCA, 0x00000009 },
233 { X86_FEATURE_XSAVE, 0x0000000d },
234 { 0, 0 }
235};
236
237static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
238{
239 const struct cpuid_dependent_feature *df;
240
241 for (df = cpuid_dependent_features; df->feature; df++) {
242
243 if (!cpu_has(c, df->feature))
244 continue;
245 /*
246 * Note: cpuid_level is set to -1 if unavailable, but
 247 * extended_cpuid_level is set to 0 if unavailable
248 * and the legitimate extended levels are all negative
249 * when signed; hence the weird messing around with
250 * signs here...
251 */
252 if (!((s32)df->level < 0 ?
253 (u32)df->level > (u32)c->extended_cpuid_level :
254 (s32)df->level > (s32)c->cpuid_level))
255 continue;
256
257 clear_cpu_cap(c, df->feature);
258 if (!warn)
259 continue;
260
261 printk(KERN_WARNING
262 "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
263 x86_cap_flags[df->feature], df->level);
264 }
265}
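The sign games in that level test are easier to see in isolation; a standalone model in plain C (names are invented):

#include <stdbool.h>
#include <stdint.h>

/* Model of the filter_cpuid_features() level check above. */
static bool level_too_low(int32_t cpuid_level, uint32_t ext_level,
			  uint32_t need)
{
	if ((int32_t)need < 0)			/* 0x8000xxxx: extended leaf */
		return need > ext_level;	/* compare as unsigned */

	return (int32_t)need > cpuid_level;	/* signed: level -1 always fails */
}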
266
267/*
216 * Naming convention should be: <Name> [(<Codename>)] 268 * Naming convention should be: <Name> [(<Codename>)]
 217 * This table is only used if init_<vendor>() below doesn't set it; 269 * This table is only used if init_<vendor>() below doesn't set it;
218 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used 270 * in particular, if CPUID levels 0x80000002..4 are supported, this
219 * 271 * isn't used
220 */ 272 */
221 273
222/* Look up CPU names by table lookup. */ 274/* Look up CPU names by table lookup. */
223static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) 275static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
224{ 276{
225 struct cpu_model_info *info; 277 const struct cpu_model_info *info;
226 278
227 if (c->x86_model >= 16) 279 if (c->x86_model >= 16)
228 return NULL; /* Range check */ 280 return NULL; /* Range check */
@@ -242,21 +294,34 @@ static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
242 294
243__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 295__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
244 296
245/* Current gdt points %fs at the "master" per-cpu area: after this, 297void load_percpu_segment(int cpu)
246 * it's on the real one. */ 298{
247void switch_to_new_gdt(void) 299#ifdef CONFIG_X86_32
300 loadsegment(fs, __KERNEL_PERCPU);
301#else
302 loadsegment(gs, 0);
303 wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
304#endif
305 load_stack_canary_segment();
306}
307
308/*
309 * Current gdt points %fs at the "master" per-cpu area: after this,
310 * it's on the real one.
311 */
312void switch_to_new_gdt(int cpu)
248{ 313{
249 struct desc_ptr gdt_descr; 314 struct desc_ptr gdt_descr;
250 315
251 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); 316 gdt_descr.address = (long)get_cpu_gdt_table(cpu);
252 gdt_descr.size = GDT_SIZE - 1; 317 gdt_descr.size = GDT_SIZE - 1;
253 load_gdt(&gdt_descr); 318 load_gdt(&gdt_descr);
254#ifdef CONFIG_X86_32 319 /* Reload the per-cpu base */
255 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); 320
256#endif 321 load_percpu_segment(cpu);
257} 322}
258 323
259static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; 324static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
260 325
261static void __cpuinit default_init(struct cpuinfo_x86 *c) 326static void __cpuinit default_init(struct cpuinfo_x86 *c)
262{ 327{
@@ -275,7 +340,7 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
275#endif 340#endif
276} 341}
277 342
278static struct cpu_dev __cpuinitdata default_cpu = { 343static const struct cpu_dev __cpuinitconst default_cpu = {
279 .c_init = default_init, 344 .c_init = default_init,
280 .c_vendor = "Unknown", 345 .c_vendor = "Unknown",
281 .c_x86_vendor = X86_VENDOR_UNKNOWN, 346 .c_x86_vendor = X86_VENDOR_UNKNOWN,
@@ -289,22 +354,24 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
289 if (c->extended_cpuid_level < 0x80000004) 354 if (c->extended_cpuid_level < 0x80000004)
290 return; 355 return;
291 356
292 v = (unsigned int *) c->x86_model_id; 357 v = (unsigned int *)c->x86_model_id;
293 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); 358 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
294 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); 359 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
295 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); 360 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
296 c->x86_model_id[48] = 0; 361 c->x86_model_id[48] = 0;
297 362
298 /* Intel chips right-justify this string for some dumb reason; 363 /*
299 undo that brain damage */ 364 * Intel chips right-justify this string for some dumb reason;
365 * undo that brain damage:
366 */
300 p = q = &c->x86_model_id[0]; 367 p = q = &c->x86_model_id[0];
301 while (*p == ' ') 368 while (*p == ' ')
302 p++; 369 p++;
303 if (p != q) { 370 if (p != q) {
304 while (*p) 371 while (*p)
305 *q++ = *p++; 372 *q++ = *p++;
306 while (q <= &c->x86_model_id[48]) 373 while (q <= &c->x86_model_id[48])
307 *q++ = '\0'; /* Zero-pad the rest */ 374 *q++ = '\0'; /* Zero-pad the rest */
308 } 375 }
309} 376}
310 377
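The trim loop is self-contained enough to test in isolation; a userspace model (assuming the 48-byte model-id field as above):

#include <stdio.h>

/* Standalone model of the left-justify loop in get_model_name(). */
static void left_justify(char *s, int len)
{
	char *p = s, *q = s;

	while (*p == ' ')			/* skip BIOS padding */
		p++;
	if (p != q) {
		while (*p)			/* shift string left */
			*q++ = *p++;
		while (q <= &s[len])
			*q++ = '\0';		/* zero-pad the tail */
	}
}

int main(void)
{
	char id[49] = "        Intel(R) Example CPU";

	left_justify(id, 48);
	printf("'%s'\n", id);			/* 'Intel(R) Example CPU' */
	return 0;
}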
@@ -373,36 +440,30 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
373 440
374 if (smp_num_siblings == 1) { 441 if (smp_num_siblings == 1) {
375 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 442 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
376 } else if (smp_num_siblings > 1) { 443 goto out;
444 }
377 445
378 if (smp_num_siblings > nr_cpu_ids) { 446 if (smp_num_siblings <= 1)
379 printk(KERN_WARNING "CPU: Unsupported number of siblings %d", 447 goto out;
380 smp_num_siblings);
381 smp_num_siblings = 1;
382 return;
383 }
384 448
385 index_msb = get_count_order(smp_num_siblings); 449 if (smp_num_siblings > nr_cpu_ids) {
386#ifdef CONFIG_X86_64 450 pr_warning("CPU: Unsupported number of siblings %d",
387 c->phys_proc_id = phys_pkg_id(index_msb); 451 smp_num_siblings);
388#else 452 smp_num_siblings = 1;
389 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); 453 return;
390#endif 454 }
391 455
392 smp_num_siblings = smp_num_siblings / c->x86_max_cores; 456 index_msb = get_count_order(smp_num_siblings);
457 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
393 458
394 index_msb = get_count_order(smp_num_siblings); 459 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
395 460
396 core_bits = get_count_order(c->x86_max_cores); 461 index_msb = get_count_order(smp_num_siblings);
397 462
398#ifdef CONFIG_X86_64 463 core_bits = get_count_order(c->x86_max_cores);
399 c->cpu_core_id = phys_pkg_id(index_msb) & 464
400 ((1 << core_bits) - 1); 465 c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
401#else 466 ((1 << core_bits) - 1);
402 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
403 ((1 << core_bits) - 1);
404#endif
405 }
406 467
407out: 468out:
408 if ((c->x86_max_cores * smp_num_siblings) > 1) { 469 if ((c->x86_max_cores * smp_num_siblings) > 1) {
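Concretely, for a package reporting smp_num_siblings = 4 and x86_max_cores = 2 (again assuming a right-shift phys_pkg_id):

/*
 *   index_msb        = get_count_order(4) = 2
 *   phys_proc_id     = initial_apicid >> 2
 *   smp_num_siblings = 4 / 2 = 2            threads per core
 *   index_msb        = get_count_order(2) = 1
 *   core_bits        = get_count_order(2) = 1
 *   cpu_core_id      = (initial_apicid >> 1) & 0x1
 */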
@@ -417,8 +478,8 @@ out:
417static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) 478static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
418{ 479{
419 char *v = c->x86_vendor_id; 480 char *v = c->x86_vendor_id;
420 int i;
421 static int printed; 481 static int printed;
482 int i;
422 483
423 for (i = 0; i < X86_VENDOR_NUM; i++) { 484 for (i = 0; i < X86_VENDOR_NUM; i++) {
424 if (!cpu_devs[i]) 485 if (!cpu_devs[i])
@@ -427,6 +488,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
427 if (!strcmp(v, cpu_devs[i]->c_ident[0]) || 488 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
428 (cpu_devs[i]->c_ident[1] && 489 (cpu_devs[i]->c_ident[1] &&
429 !strcmp(v, cpu_devs[i]->c_ident[1]))) { 490 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
491
430 this_cpu = cpu_devs[i]; 492 this_cpu = cpu_devs[i];
431 c->x86_vendor = this_cpu->c_x86_vendor; 493 c->x86_vendor = this_cpu->c_x86_vendor;
432 return; 494 return;
@@ -435,7 +497,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
435 497
436 if (!printed) { 498 if (!printed) {
437 printed++; 499 printed++;
438 printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v); 500 printk(KERN_ERR
501 "CPU: vendor_id '%s' unknown, using generic init.\n", v);
502
439 printk(KERN_ERR "CPU: Your system may be unstable.\n"); 503 printk(KERN_ERR "CPU: Your system may be unstable.\n");
440 } 504 }
441 505
@@ -455,14 +519,17 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
455 /* Intel-defined flags: level 0x00000001 */ 519 /* Intel-defined flags: level 0x00000001 */
456 if (c->cpuid_level >= 0x00000001) { 520 if (c->cpuid_level >= 0x00000001) {
457 u32 junk, tfms, cap0, misc; 521 u32 junk, tfms, cap0, misc;
522
458 cpuid(0x00000001, &tfms, &misc, &junk, &cap0); 523 cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
459 c->x86 = (tfms >> 8) & 0xf; 524 c->x86 = (tfms >> 8) & 0xf;
460 c->x86_model = (tfms >> 4) & 0xf; 525 c->x86_model = (tfms >> 4) & 0xf;
461 c->x86_mask = tfms & 0xf; 526 c->x86_mask = tfms & 0xf;
527
462 if (c->x86 == 0xf) 528 if (c->x86 == 0xf)
463 c->x86 += (tfms >> 20) & 0xff; 529 c->x86 += (tfms >> 20) & 0xff;
464 if (c->x86 >= 0x6) 530 if (c->x86 >= 0x6)
465 c->x86_model += ((tfms >> 16) & 0xf) << 4; 531 c->x86_model += ((tfms >> 16) & 0xf) << 4;
532
466 if (cap0 & (1<<19)) { 533 if (cap0 & (1<<19)) {
467 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; 534 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
468 c->x86_cache_alignment = c->x86_clflush_size; 535 c->x86_cache_alignment = c->x86_clflush_size;
@@ -478,6 +545,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
478 /* Intel-defined flags: level 0x00000001 */ 545 /* Intel-defined flags: level 0x00000001 */
479 if (c->cpuid_level >= 0x00000001) { 546 if (c->cpuid_level >= 0x00000001) {
480 u32 capability, excap; 547 u32 capability, excap;
548
481 cpuid(0x00000001, &tfms, &ebx, &excap, &capability); 549 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
482 c->x86_capability[0] = capability; 550 c->x86_capability[0] = capability;
483 c->x86_capability[4] = excap; 551 c->x86_capability[4] = excap;
@@ -486,6 +554,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
486 /* AMD-defined flags: level 0x80000001 */ 554 /* AMD-defined flags: level 0x80000001 */
487 xlvl = cpuid_eax(0x80000000); 555 xlvl = cpuid_eax(0x80000000);
488 c->extended_cpuid_level = xlvl; 556 c->extended_cpuid_level = xlvl;
557
489 if ((xlvl & 0xffff0000) == 0x80000000) { 558 if ((xlvl & 0xffff0000) == 0x80000000) {
490 if (xlvl >= 0x80000001) { 559 if (xlvl >= 0x80000001) {
491 c->x86_capability[1] = cpuid_edx(0x80000001); 560 c->x86_capability[1] = cpuid_edx(0x80000001);
@@ -493,13 +562,15 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
493 } 562 }
494 } 563 }
495 564
496#ifdef CONFIG_X86_64
497 if (c->extended_cpuid_level >= 0x80000008) { 565 if (c->extended_cpuid_level >= 0x80000008) {
498 u32 eax = cpuid_eax(0x80000008); 566 u32 eax = cpuid_eax(0x80000008);
499 567
500 c->x86_virt_bits = (eax >> 8) & 0xff; 568 c->x86_virt_bits = (eax >> 8) & 0xff;
501 c->x86_phys_bits = eax & 0xff; 569 c->x86_phys_bits = eax & 0xff;
502 } 570 }
571#ifdef CONFIG_X86_32
572 else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
573 c->x86_phys_bits = 36;
503#endif 574#endif
504 575
505 if (c->extended_cpuid_level >= 0x80000007) 576 if (c->extended_cpuid_level >= 0x80000007)
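For an illustrative value, cpuid_eax(0x80000008) == 0x00003028 would yield 48 virtual and 40 physical bits; the new 32-bit fallback instead infers 36 physical bits from PAE/PSE36:

	/* eax = 0x00003028 (illustrative):                                */
	c->x86_virt_bits = (eax >> 8) & 0xff;	/* (0x3028 >> 8) & 0xff = 48 */
	c->x86_phys_bits = eax & 0xff;		/*  0x3028 & 0xff       = 40 */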
@@ -546,8 +617,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
546{ 617{
547#ifdef CONFIG_X86_64 618#ifdef CONFIG_X86_64
548 c->x86_clflush_size = 64; 619 c->x86_clflush_size = 64;
620 c->x86_phys_bits = 36;
621 c->x86_virt_bits = 48;
549#else 622#else
550 c->x86_clflush_size = 32; 623 c->x86_clflush_size = 32;
624 c->x86_phys_bits = 32;
625 c->x86_virt_bits = 32;
551#endif 626#endif
552 c->x86_cache_alignment = c->x86_clflush_size; 627 c->x86_cache_alignment = c->x86_clflush_size;
553 628
@@ -570,21 +645,20 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
570 if (this_cpu->c_early_init) 645 if (this_cpu->c_early_init)
571 this_cpu->c_early_init(c); 646 this_cpu->c_early_init(c);
572 647
573 validate_pat_support(c);
574
575#ifdef CONFIG_SMP 648#ifdef CONFIG_SMP
576 c->cpu_index = boot_cpu_id; 649 c->cpu_index = boot_cpu_id;
577#endif 650#endif
651 filter_cpuid_features(c, false);
578} 652}
579 653
580void __init early_cpu_init(void) 654void __init early_cpu_init(void)
581{ 655{
582 struct cpu_dev **cdev; 656 const struct cpu_dev *const *cdev;
583 int count = 0; 657 int count = 0;
584 658
585 printk("KERNEL supported cpus:\n"); 659 printk(KERN_INFO "KERNEL supported cpus:\n");
586 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { 660 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
587 struct cpu_dev *cpudev = *cdev; 661 const struct cpu_dev *cpudev = *cdev;
588 unsigned int j; 662 unsigned int j;
589 663
590 if (count >= X86_VENDOR_NUM) 664 if (count >= X86_VENDOR_NUM)
@@ -595,7 +669,7 @@ void __init early_cpu_init(void)
595 for (j = 0; j < 2; j++) { 669 for (j = 0; j < 2; j++) {
596 if (!cpudev->c_ident[j]) 670 if (!cpudev->c_ident[j])
597 continue; 671 continue;
598 printk(" %s %s\n", cpudev->c_vendor, 672 printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
599 cpudev->c_ident[j]); 673 cpudev->c_ident[j]);
600 } 674 }
601 } 675 }
@@ -637,7 +711,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
637 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; 711 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
638#ifdef CONFIG_X86_32 712#ifdef CONFIG_X86_32
639# ifdef CONFIG_X86_HT 713# ifdef CONFIG_X86_HT
640 c->apicid = phys_pkg_id(c->initial_apicid, 0); 714 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
641# else 715# else
642 c->apicid = c->initial_apicid; 716 c->apicid = c->initial_apicid;
643# endif 717# endif
@@ -671,9 +745,13 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
671 c->x86_coreid_bits = 0; 745 c->x86_coreid_bits = 0;
672#ifdef CONFIG_X86_64 746#ifdef CONFIG_X86_64
673 c->x86_clflush_size = 64; 747 c->x86_clflush_size = 64;
748 c->x86_phys_bits = 36;
749 c->x86_virt_bits = 48;
674#else 750#else
675 c->cpuid_level = -1; /* CPUID not detected */ 751 c->cpuid_level = -1; /* CPUID not detected */
676 c->x86_clflush_size = 32; 752 c->x86_clflush_size = 32;
753 c->x86_phys_bits = 32;
754 c->x86_virt_bits = 32;
677#endif 755#endif
678 c->x86_cache_alignment = c->x86_clflush_size; 756 c->x86_cache_alignment = c->x86_clflush_size;
679 memset(&c->x86_capability, 0, sizeof c->x86_capability); 757 memset(&c->x86_capability, 0, sizeof c->x86_capability);
@@ -684,7 +762,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
684 this_cpu->c_identify(c); 762 this_cpu->c_identify(c);
685 763
686#ifdef CONFIG_X86_64 764#ifdef CONFIG_X86_64
687 c->apicid = phys_pkg_id(0); 765 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
688#endif 766#endif
689 767
690 /* 768 /*
@@ -704,13 +782,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
704 squash_the_stupid_serial_number(c); 782 squash_the_stupid_serial_number(c);
705 783
706 /* 784 /*
707 * The vendor-specific functions might have changed features. Now 785 * The vendor-specific functions might have changed features.
708 * we do "generic changes." 786 * Now we do "generic changes."
709 */ 787 */
710 788
789 /* Filter out anything that depends on CPUID levels we don't have */
790 filter_cpuid_features(c, true);
791
711 /* If the model name is still unset, do table lookup. */ 792 /* If the model name is still unset, do table lookup. */
712 if (!c->x86_model_id[0]) { 793 if (!c->x86_model_id[0]) {
713 char *p; 794 const char *p;
714 p = table_lookup_model(c); 795 p = table_lookup_model(c);
715 if (p) 796 if (p)
716 strcpy(c->x86_model_id, p); 797 strcpy(c->x86_model_id, p);
@@ -766,6 +847,7 @@ static void vgetcpu_set_mode(void)
766void __init identify_boot_cpu(void) 847void __init identify_boot_cpu(void)
767{ 848{
768 identify_cpu(&boot_cpu_data); 849 identify_cpu(&boot_cpu_data);
850 init_c1e_mask();
769#ifdef CONFIG_X86_32 851#ifdef CONFIG_X86_32
770 sysenter_setup(); 852 sysenter_setup();
771 enable_sep_cpu(); 853 enable_sep_cpu();
@@ -785,11 +867,11 @@ void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
785} 867}
786 868
787struct msr_range { 869struct msr_range {
788 unsigned min; 870 unsigned min;
789 unsigned max; 871 unsigned max;
790}; 872};
791 873
792static struct msr_range msr_range_array[] __cpuinitdata = { 874static const struct msr_range msr_range_array[] __cpuinitconst = {
793 { 0x00000000, 0x00000418}, 875 { 0x00000000, 0x00000418},
794 { 0xc0000000, 0xc000040b}, 876 { 0xc0000000, 0xc000040b},
795 { 0xc0010000, 0xc0010142}, 877 { 0xc0010000, 0xc0010142},
@@ -798,14 +880,15 @@ static struct msr_range msr_range_array[] __cpuinitdata = {
798 880
799static void __cpuinit print_cpu_msr(void) 881static void __cpuinit print_cpu_msr(void)
800{ 882{
883 unsigned index_min, index_max;
801 unsigned index; 884 unsigned index;
802 u64 val; 885 u64 val;
803 int i; 886 int i;
804 unsigned index_min, index_max;
805 887
806 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { 888 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
807 index_min = msr_range_array[i].min; 889 index_min = msr_range_array[i].min;
808 index_max = msr_range_array[i].max; 890 index_max = msr_range_array[i].max;
891
809 for (index = index_min; index < index_max; index++) { 892 for (index = index_min; index < index_max; index++) {
810 if (rdmsrl_amd_safe(index, &val)) 893 if (rdmsrl_amd_safe(index, &val))
811 continue; 894 continue;
@@ -815,6 +898,7 @@ static void __cpuinit print_cpu_msr(void)
815} 898}
816 899
817static int show_msr __cpuinitdata; 900static int show_msr __cpuinitdata;
901
818static __init int setup_show_msr(char *arg) 902static __init int setup_show_msr(char *arg)
819{ 903{
820 int num; 904 int num;
@@ -836,12 +920,14 @@ __setup("noclflush", setup_noclflush);
836 920
837void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) 921void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
838{ 922{
839 char *vendor = NULL; 923 const char *vendor = NULL;
840 924
841 if (c->x86_vendor < X86_VENDOR_NUM) 925 if (c->x86_vendor < X86_VENDOR_NUM) {
842 vendor = this_cpu->c_vendor; 926 vendor = this_cpu->c_vendor;
843 else if (c->cpuid_level >= 0) 927 } else {
844 vendor = c->x86_vendor_id; 928 if (c->cpuid_level >= 0)
929 vendor = c->x86_vendor_id;
930 }
845 931
846 if (vendor && !strstr(c->x86_model_id, vendor)) 932 if (vendor && !strstr(c->x86_model_id, vendor))
847 printk(KERN_CONT "%s ", vendor); 933 printk(KERN_CONT "%s ", vendor);
@@ -868,65 +954,45 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
868static __init int setup_disablecpuid(char *arg) 954static __init int setup_disablecpuid(char *arg)
869{ 955{
870 int bit; 956 int bit;
957
871 if (get_option(&arg, &bit) && bit < NCAPINTS*32) 958 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
872 setup_clear_cpu_cap(bit); 959 setup_clear_cpu_cap(bit);
873 else 960 else
874 return 0; 961 return 0;
962
875 return 1; 963 return 1;
876} 964}
877__setup("clearcpuid=", setup_disablecpuid); 965__setup("clearcpuid=", setup_disablecpuid);
878 966
879#ifdef CONFIG_X86_64 967#ifdef CONFIG_X86_64
880struct x8664_pda **_cpu_pda __read_mostly;
881EXPORT_SYMBOL(_cpu_pda);
882
883struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; 968struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
884 969
885static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; 970DEFINE_PER_CPU_FIRST(union irq_stack_union,
971 irq_stack_union) __aligned(PAGE_SIZE);
886 972
887void __cpuinit pda_init(int cpu) 973DEFINE_PER_CPU(char *, irq_stack_ptr) =
888{ 974 init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
889 struct x8664_pda *pda = cpu_pda(cpu);
890 975
 891 /* Set up data that may be needed in __get_free_pages early */ 976DEFINE_PER_CPU(unsigned long, kernel_stack) =
892 loadsegment(fs, 0); 977 (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
893 loadsegment(gs, 0); 978EXPORT_PER_CPU_SYMBOL(kernel_stack);
 894 /* Memory clobbers used to order PDA accesses */
895 mb();
896 wrmsrl(MSR_GS_BASE, pda);
897 mb();
898
899 pda->cpunumber = cpu;
900 pda->irqcount = -1;
901 pda->kernelstack = (unsigned long)stack_thread_info() -
902 PDA_STACKOFFSET + THREAD_SIZE;
903 pda->active_mm = &init_mm;
904 pda->mmu_state = 0;
905
906 if (cpu == 0) {
907 /* others are initialized in smpboot.c */
908 pda->pcurrent = &init_task;
909 pda->irqstackptr = boot_cpu_stack;
910 pda->irqstackptr += IRQSTACKSIZE - 64;
911 } else {
912 if (!pda->irqstackptr) {
913 pda->irqstackptr = (char *)
914 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
915 if (!pda->irqstackptr)
916 panic("cannot allocate irqstack for cpu %d",
917 cpu);
918 pda->irqstackptr += IRQSTACKSIZE - 64;
919 }
920 979
921 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) 980DEFINE_PER_CPU(unsigned int, irq_count) = -1;
922 pda->nodenumber = cpu_to_node(cpu);
923 }
924}
925 981
926static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + 982/*
927 DEBUG_STKSZ] __page_aligned_bss; 983 * Special IST stacks which the CPU switches to when it calls
984 * an IST-marked descriptor entry. Up to 7 stacks (hardware
985 * limit), all of them are 4K, except the debug stack which
986 * is 8K.
987 */
988static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
989 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
990 [DEBUG_STACK - 1] = DEBUG_STKSZ
991};
928 992
929extern asmlinkage void ignore_sysret(void); 993static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
994 [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
995 __aligned(PAGE_SIZE);
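The comment above maps directly onto the size table plus the carve loop cpu_init() gains below (new lines 1120-1127): each IST slot ends up pointing at the top of its slice of this flat per-CPU block, replacing the old per-CPU __get_free_pages() allocations. A standalone GCC sketch of the same arithmetic; N_EXCEPTION_STACKS, DEBUG_STACK and the sizes mirror this era's kernel but are best read as assumptions:

/*
 * Carve one flat block into IST stacks via a range-designated size
 * table. Constants are assumed (4K stacks, 8K debug stack).
 */
#include <stdio.h>

#define N_EXCEPTION_STACKS	5
#define DEBUG_STACK		4	/* 1-based IST index, assumed */
#define EXCEPTION_STKSZ		4096
#define DEBUG_STKSZ		8192

static const unsigned int stack_sizes[N_EXCEPTION_STACKS] = {
	[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
	[DEBUG_STACK - 1] = DEBUG_STKSZ,	/* overrides one slot */
};

static char stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];

int main(void)
{
	char *p = stacks;
	int v;

	for (v = 0; v < N_EXCEPTION_STACKS; v++) {
		p += stack_sizes[v];	/* an IST entry points at the top */
		printf("IST%d top at offset %ld\n", v + 1, (long)(p - stacks));
	}
	return 0;
}

The [first ... last] range designator is a GCC extension; the explicit [DEBUG_STACK - 1] entry that follows simply overrides one slot of the blanket initialization.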
930 996
931/* May not be marked __init: used by software suspend */ 997/* May not be marked __init: used by software suspend */
932void syscall_init(void) 998void syscall_init(void)
@@ -957,16 +1023,38 @@ unsigned long kernel_eflags;
957 */ 1023 */
958DEFINE_PER_CPU(struct orig_ist, orig_ist); 1024DEFINE_PER_CPU(struct orig_ist, orig_ist);
959 1025
960#else 1026#else /* CONFIG_X86_64 */
1027
1028#ifdef CONFIG_CC_STACKPROTECTOR
1029DEFINE_PER_CPU(unsigned long, stack_canary);
1030#endif
961 1031
962/* Make sure %fs is initialized properly in idle threads */ 1032/* Make sure %fs and %gs are initialized properly in idle threads */
963struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) 1033struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
964{ 1034{
965 memset(regs, 0, sizeof(struct pt_regs)); 1035 memset(regs, 0, sizeof(struct pt_regs));
966 regs->fs = __KERNEL_PERCPU; 1036 regs->fs = __KERNEL_PERCPU;
1037 regs->gs = __KERNEL_STACK_CANARY;
1038
967 return regs; 1039 return regs;
968} 1040}
969#endif 1041#endif /* CONFIG_X86_64 */
1042
1043/*
1044 * Clear all 6 debug registers:
1045 */
1046static void clear_all_debug_regs(void)
1047{
1048 int i;
1049
1050 for (i = 0; i < 8; i++) {
 1051 /* Ignore db4, db5: reserved, alias db6/db7 */
1052 if ((i == 4) || (i == 5))
1053 continue;
1054
1055 set_debugreg(0, i);
1056 }
1057}
970 1058
971/* 1059/*
972 * cpu_init() initializes state that is per-CPU. Some data is already 1060 * cpu_init() initializes state that is per-CPU. Some data is already
@@ -976,21 +1064,25 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
976 * A lot of state is already set up in PDA init for 64 bit 1064 * A lot of state is already set up in PDA init for 64 bit
977 */ 1065 */
978#ifdef CONFIG_X86_64 1066#ifdef CONFIG_X86_64
1067
979void __cpuinit cpu_init(void) 1068void __cpuinit cpu_init(void)
980{ 1069{
981 int cpu = stack_smp_processor_id(); 1070 struct orig_ist *orig_ist;
982 struct tss_struct *t = &per_cpu(init_tss, cpu);
983 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
984 unsigned long v;
985 char *estacks = NULL;
986 struct task_struct *me; 1071 struct task_struct *me;
1072 struct tss_struct *t;
1073 unsigned long v;
1074 int cpu;
987 int i; 1075 int i;
988 1076
989 /* CPU 0 is initialised in head64.c */ 1077 cpu = stack_smp_processor_id();
990 if (cpu != 0) 1078 t = &per_cpu(init_tss, cpu);
991 pda_init(cpu); 1079 orig_ist = &per_cpu(orig_ist, cpu);
992 else 1080
993 estacks = boot_exception_stacks; 1081#ifdef CONFIG_NUMA
1082 if (cpu != 0 && percpu_read(node_number) == 0 &&
1083 cpu_to_node(cpu) != NUMA_NO_NODE)
1084 percpu_write(node_number, cpu_to_node(cpu));
1085#endif
994 1086
995 me = current; 1087 me = current;
996 1088
@@ -1006,7 +1098,9 @@ void __cpuinit cpu_init(void)
1006 * and set up the GDT descriptor: 1098 * and set up the GDT descriptor:
1007 */ 1099 */
1008 1100
1009 switch_to_new_gdt(); 1101 switch_to_new_gdt(cpu);
1102 loadsegment(fs, 0);
1103
1010 load_idt((const struct desc_ptr *)&idt_descr); 1104 load_idt((const struct desc_ptr *)&idt_descr);
1011 1105
1012 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); 1106 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
@@ -1017,31 +1111,24 @@ void __cpuinit cpu_init(void)
1017 barrier(); 1111 barrier();
1018 1112
1019 check_efer(); 1113 check_efer();
1020 if (cpu != 0 && x2apic) 1114 if (cpu != 0)
1021 enable_x2apic(); 1115 enable_x2apic();
1022 1116
1023 /* 1117 /*
1024 * set up and load the per-CPU TSS 1118 * set up and load the per-CPU TSS
1025 */ 1119 */
1026 if (!orig_ist->ist[0]) { 1120 if (!orig_ist->ist[0]) {
1027 static const unsigned int order[N_EXCEPTION_STACKS] = { 1121 char *estacks = per_cpu(exception_stacks, cpu);
1028 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, 1122
1029 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
1030 };
1031 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 1123 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1032 if (cpu) { 1124 estacks += exception_stack_sizes[v];
1033 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1034 if (!estacks)
1035 panic("Cannot allocate exception "
1036 "stack %ld %d\n", v, cpu);
1037 }
1038 estacks += PAGE_SIZE << order[v];
1039 orig_ist->ist[v] = t->x86_tss.ist[v] = 1125 orig_ist->ist[v] = t->x86_tss.ist[v] =
1040 (unsigned long)estacks; 1126 (unsigned long)estacks;
1041 } 1127 }
1042 } 1128 }
1043 1129
1044 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); 1130 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1131
1045 /* 1132 /*
1046 * <= is required because the CPU will access up to 1133 * <= is required because the CPU will access up to
1047 * 8 bits beyond the end of the IO permission bitmap. 1134 * 8 bits beyond the end of the IO permission bitmap.
@@ -1051,8 +1138,7 @@ void __cpuinit cpu_init(void)
1051 1138
1052 atomic_inc(&init_mm.mm_count); 1139 atomic_inc(&init_mm.mm_count);
1053 me->active_mm = &init_mm; 1140 me->active_mm = &init_mm;
1054 if (me->mm) 1141 BUG_ON(me->mm);
1055 BUG();
1056 enter_lazy_tlb(&init_mm, me); 1142 enter_lazy_tlb(&init_mm, me);
1057 1143
1058 load_sp0(t, &current->thread); 1144 load_sp0(t, &current->thread);
@@ -1069,22 +1155,9 @@ void __cpuinit cpu_init(void)
1069 */ 1155 */
1070 if (kgdb_connected && arch_kgdb_ops.correct_hw_break) 1156 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1071 arch_kgdb_ops.correct_hw_break(); 1157 arch_kgdb_ops.correct_hw_break();
1072 else { 1158 else
1073#endif
1074 /*
1075 * Clear all 6 debug registers:
1076 */
1077
1078 set_debugreg(0UL, 0);
1079 set_debugreg(0UL, 1);
1080 set_debugreg(0UL, 2);
1081 set_debugreg(0UL, 3);
1082 set_debugreg(0UL, 6);
1083 set_debugreg(0UL, 7);
1084#ifdef CONFIG_KGDB
1085 /* If the kgdb is connected no debug regs should be altered. */
1086 }
1087#endif 1159#endif
1160 clear_all_debug_regs();
1088 1161
1089 fpu_init(); 1162 fpu_init();
1090 1163
@@ -1105,7 +1178,8 @@ void __cpuinit cpu_init(void)
1105 1178
1106 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { 1179 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
1107 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); 1180 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
1108 for (;;) local_irq_enable(); 1181 for (;;)
1182 local_irq_enable();
1109 } 1183 }
1110 1184
1111 printk(KERN_INFO "Initializing CPU#%d\n", cpu); 1185 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
@@ -1114,15 +1188,14 @@ void __cpuinit cpu_init(void)
1114 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 1188 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1115 1189
1116 load_idt(&idt_descr); 1190 load_idt(&idt_descr);
1117 switch_to_new_gdt(); 1191 switch_to_new_gdt(cpu);
1118 1192
1119 /* 1193 /*
1120 * Set up and load the per-CPU TSS and LDT 1194 * Set up and load the per-CPU TSS and LDT
1121 */ 1195 */
1122 atomic_inc(&init_mm.mm_count); 1196 atomic_inc(&init_mm.mm_count);
1123 curr->active_mm = &init_mm; 1197 curr->active_mm = &init_mm;
1124 if (curr->mm) 1198 BUG_ON(curr->mm);
1125 BUG();
1126 enter_lazy_tlb(&init_mm, curr); 1199 enter_lazy_tlb(&init_mm, curr);
1127 1200
1128 load_sp0(t, thread); 1201 load_sp0(t, thread);
@@ -1135,16 +1208,7 @@ void __cpuinit cpu_init(void)
1135 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); 1208 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
1136#endif 1209#endif
1137 1210
1138 /* Clear %gs. */ 1211 clear_all_debug_regs();
1139 asm volatile ("mov %0, %%gs" : : "r" (0));
1140
1141 /* Clear all 6 debug registers: */
1142 set_debugreg(0, 0);
1143 set_debugreg(0, 1);
1144 set_debugreg(0, 2);
1145 set_debugreg(0, 3);
1146 set_debugreg(0, 6);
1147 set_debugreg(0, 7);
1148 1212
1149 /* 1213 /*
1150 * Force FPU initialization: 1214 * Force FPU initialization:
@@ -1164,6 +1228,4 @@ void __cpuinit cpu_init(void)
1164 1228
1165 xsave_init(); 1229 xsave_init();
1166} 1230}
1167
1168
1169#endif 1231#endif
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index de4094a39210..6de9a908e400 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -3,33 +3,34 @@
3#define ARCH_X86_CPU_H 3#define ARCH_X86_CPU_H
4 4
5struct cpu_model_info { 5struct cpu_model_info {
6 int vendor; 6 int vendor;
7 int family; 7 int family;
8 char *model_names[16]; 8 const char *model_names[16];
9}; 9};
10 10
11/* attempt to consolidate cpu attributes */ 11/* attempt to consolidate cpu attributes */
12struct cpu_dev { 12struct cpu_dev {
13 char * c_vendor; 13 const char *c_vendor;
14 14
15 /* some have two possibilities for cpuid string */ 15 /* some have two possibilities for cpuid string */
16 char * c_ident[2]; 16 const char *c_ident[2];
17 17
18 struct cpu_model_info c_models[4]; 18 struct cpu_model_info c_models[4];
19 19
20 void (*c_early_init)(struct cpuinfo_x86 *c); 20 void (*c_early_init)(struct cpuinfo_x86 *);
21 void (*c_init)(struct cpuinfo_x86 * c); 21 void (*c_init)(struct cpuinfo_x86 *);
22 void (*c_identify)(struct cpuinfo_x86 * c); 22 void (*c_identify)(struct cpuinfo_x86 *);
23 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); 23 unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
24 int c_x86_vendor; 24 int c_x86_vendor;
25}; 25};
26 26
27#define cpu_dev_register(cpu_devX) \ 27#define cpu_dev_register(cpu_devX) \
28 static struct cpu_dev *__cpu_dev_##cpu_devX __used \ 28 static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
29 __attribute__((__section__(".x86_cpu_dev.init"))) = \ 29 __attribute__((__section__(".x86_cpu_dev.init"))) = \
30 &cpu_devX; 30 &cpu_devX;
31 31
32extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[]; 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[];
33 34
34extern void display_cacheinfo(struct cpuinfo_x86 *c); 35extern void display_cacheinfo(struct cpuinfo_x86 *c);
35 36
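With cpu_dev, its strings and the registration pointers all constified, each vendor file now registers a read-only descriptor. A minimal sketch of the pattern after this change — everything named "demo" is hypothetical, and the snippet only builds in-tree with this header:

/* Hypothetical vendor registration against the constified API. */
#include "cpu.h"

static void demo_early_init(struct cpuinfo_x86 *c) { /* early quirks */ }
static void demo_init(struct cpuinfo_x86 *c) { /* full setup */ }

static const struct cpu_dev demo_cpu_dev = {
	.c_vendor	= "Demo",
	.c_ident	= { "GenuineDemo" },
	.c_early_init	= demo_early_init,
	.c_init		= demo_init,
	.c_x86_vendor	= X86_VENDOR_UNKNOWN,
};

cpu_dev_register(demo_cpu_dev);

cpu_dev_register() plants a const pointer in the .x86_cpu_dev.init section, and early_cpu_init() walks those pointers between __x86_cpu_dev_start and __x86_cpu_dev_end.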
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
new file mode 100755
index 000000000000..46e29ab96c6a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -0,0 +1,901 @@
1/*
2 * CPU x86 architecture debug code
3 *
4 * Copyright(C) 2009 Jaswinder Singh Rajput
5 *
 6 * For licensing details see kernel-base/COPYING
7 */
8
9#include <linux/interrupt.h>
10#include <linux/compiler.h>
11#include <linux/seq_file.h>
12#include <linux/debugfs.h>
13#include <linux/kprobes.h>
14#include <linux/uaccess.h>
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/percpu.h>
18#include <linux/signal.h>
19#include <linux/errno.h>
20#include <linux/sched.h>
21#include <linux/types.h>
22#include <linux/init.h>
23#include <linux/slab.h>
24#include <linux/smp.h>
25
26#include <asm/cpu_debug.h>
27#include <asm/paravirt.h>
28#include <asm/system.h>
29#include <asm/traps.h>
30#include <asm/apic.h>
31#include <asm/desc.h>
32
33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
35static DEFINE_PER_CPU(unsigned, cpu_modelflag);
36static DEFINE_PER_CPU(int, cpu_priv_count);
37static DEFINE_PER_CPU(unsigned, cpu_model);
38
39static DEFINE_MUTEX(cpu_debug_lock);
40
41static struct dentry *cpu_debugfs_dir;
42
43static struct cpu_debug_base cpu_base[] = {
44 { "mc", CPU_MC, 0 },
45 { "monitor", CPU_MONITOR, 0 },
46 { "time", CPU_TIME, 0 },
47 { "pmc", CPU_PMC, 1 },
48 { "platform", CPU_PLATFORM, 0 },
49 { "apic", CPU_APIC, 0 },
50 { "poweron", CPU_POWERON, 0 },
51 { "control", CPU_CONTROL, 0 },
52 { "features", CPU_FEATURES, 0 },
53 { "lastbranch", CPU_LBRANCH, 0 },
54 { "bios", CPU_BIOS, 0 },
55 { "freq", CPU_FREQ, 0 },
56 { "mtrr", CPU_MTRR, 0 },
57 { "perf", CPU_PERF, 0 },
58 { "cache", CPU_CACHE, 0 },
59 { "sysenter", CPU_SYSENTER, 0 },
60 { "therm", CPU_THERM, 0 },
61 { "misc", CPU_MISC, 0 },
62 { "debug", CPU_DEBUG, 0 },
63 { "pat", CPU_PAT, 0 },
64 { "vmx", CPU_VMX, 0 },
65 { "call", CPU_CALL, 0 },
66 { "base", CPU_BASE, 0 },
67 { "ver", CPU_VER, 0 },
68 { "conf", CPU_CONF, 0 },
69 { "smm", CPU_SMM, 0 },
70 { "svm", CPU_SVM, 0 },
71 { "osvm", CPU_OSVM, 0 },
72 { "tss", CPU_TSS, 0 },
73 { "cr", CPU_CR, 0 },
74 { "dt", CPU_DT, 0 },
75 { "registers", CPU_REG_ALL, 0 },
76};
77
78static struct cpu_file_base cpu_file[] = {
79 { "index", CPU_REG_ALL, 0 },
80 { "value", CPU_REG_ALL, 1 },
81};
82
 83/* Intel register ranges */
84static struct cpu_debug_range cpu_intel_range[] = {
85 { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL },
86 { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE },
87 { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL },
88 { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM },
89 { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE },
90 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE },
91
92 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE },
93 { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON },
94 { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON },
95 { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE },
96
97 { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE },
98 { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT },
99 { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT },
100 { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM },
101
102 { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE },
103 { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 },
104 { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE },
105 { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON },
106
107 { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT },
108 { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT },
109 { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT },
110 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE },
111
112 { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 },
113 { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 },
114 { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX },
115 { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 },
116 { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT },
117
118 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE },
119 { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE },
120 { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE },
121 { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT },
122 { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE },
123 { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE },
124 { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE },
125 { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE },
126
127 { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT },
128 { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON },
129 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE },
130 { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON },
131 { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE },
132 { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 },
133 { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE },
134 { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 },
135
136 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE },
137 { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE },
138 { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE },
139 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE },
140 { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE },
141 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE },
142
143 { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON },
144 { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE },
145 { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON },
146 { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT },
147 { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON },
148 { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT },
149 { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON },
150 { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON },
151 { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON },
152 { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON },
153 { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE },
154 { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON },
155
156 { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE },
157 { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON },
158 { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE },
159 { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON },
160 { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE },
161 { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON },
162 { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE },
163 { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON },
164 { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE },
165 { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE },
166 { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
167
168 { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
169 { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
170 { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
171
172 { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
173
174 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
175 { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
176 { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
177 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
178};
179
 180/* AMD register ranges */
181static struct cpu_debug_range cpu_amd_range[] = {
182 { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, },
183 { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, },
184 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, },
185 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS },
186 { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS },
187 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, },
188
189 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, },
190 { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, },
191 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, },
192 { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, },
193
194 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, },
195 { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, },
196 { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, },
197 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, },
198 { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, },
199 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, },
200
201 { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, },
202
203 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, },
204 { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, },
205 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, },
206 { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, },
207
208 { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, },
209 { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, },
210 { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, },
211 { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, },
212 { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, },
213 { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, },
214 { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, },
215 { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, },
216 { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, },
217 { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, },
218 { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, },
219 { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, },
220 { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, },
221 { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, },
222 { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, },
223 { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, },
224 { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, },
225 { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, },
226};
227
228
229/* Intel */
230static int get_intel_modelflag(unsigned model)
231{
232 int flag;
233
234 switch (model) {
235 case 0x0501:
236 case 0x0502:
237 case 0x0504:
238 flag = CPU_INTEL_PENTIUM;
239 break;
240 case 0x0601:
241 case 0x0603:
242 case 0x0605:
243 case 0x0607:
244 case 0x0608:
245 case 0x060A:
246 case 0x060B:
247 flag = CPU_INTEL_P6;
248 break;
249 case 0x0609:
250 case 0x060D:
251 flag = CPU_INTEL_PENTIUM_M;
252 break;
253 case 0x060E:
254 flag = CPU_INTEL_CORE;
255 break;
256 case 0x060F:
257 case 0x0617:
258 flag = CPU_INTEL_CORE2;
259 break;
260 case 0x061C:
261 flag = CPU_INTEL_ATOM;
262 break;
263 case 0x0F00:
264 case 0x0F01:
265 case 0x0F02:
266 case 0x0F03:
267 case 0x0F04:
268 flag = CPU_INTEL_XEON_P4;
269 break;
270 case 0x0F06:
271 flag = CPU_INTEL_XEON_MP;
272 break;
273 default:
274 flag = CPU_NONE;
275 break;
276 }
277
278 return flag;
279}
280
281/* AMD */
282static int get_amd_modelflag(unsigned model)
283{
284 int flag;
285
286 switch (model >> 8) {
287 case 0x6:
288 flag = CPU_AMD_K6;
289 break;
290 case 0x7:
291 flag = CPU_AMD_K7;
292 break;
293 case 0x8:
294 flag = CPU_AMD_K8;
295 break;
296 case 0xf:
297 flag = CPU_AMD_0F;
298 break;
299 case 0x10:
300 flag = CPU_AMD_10;
301 break;
302 case 0x11:
303 flag = CPU_AMD_11;
304 break;
305 default:
306 flag = CPU_NONE;
307 break;
308 }
309
310 return flag;
311}
312
313static int get_cpu_modelflag(unsigned cpu)
314{
315 int flag;
316
317 flag = per_cpu(cpu_model, cpu);
318
319 switch (flag >> 16) {
320 case X86_VENDOR_INTEL:
321 flag = get_intel_modelflag(flag);
322 break;
323 case X86_VENDOR_AMD:
324 flag = get_amd_modelflag(flag & 0xffff);
325 break;
326 default:
327 flag = CPU_NONE;
328 break;
329 }
330
331 return flag;
332}
333
334static int get_cpu_range_count(unsigned cpu)
335{
336 int index;
337
338 switch (per_cpu(cpu_model, cpu) >> 16) {
339 case X86_VENDOR_INTEL:
340 index = ARRAY_SIZE(cpu_intel_range);
341 break;
342 case X86_VENDOR_AMD:
343 index = ARRAY_SIZE(cpu_amd_range);
344 break;
345 default:
346 index = 0;
347 break;
348 }
349
350 return index;
351}
352
353static int is_typeflag_valid(unsigned cpu, unsigned flag)
354{
355 unsigned vendor, modelflag;
356 int i, index;
357
 358 /* Standard registers should always be valid */
359 if (flag >= CPU_TSS)
360 return 1;
361
362 modelflag = per_cpu(cpu_modelflag, cpu);
363 vendor = per_cpu(cpu_model, cpu) >> 16;
364 index = get_cpu_range_count(cpu);
365
366 for (i = 0; i < index; i++) {
367 switch (vendor) {
368 case X86_VENDOR_INTEL:
369 if ((cpu_intel_range[i].model & modelflag) &&
370 (cpu_intel_range[i].flag & flag))
371 return 1;
372 break;
373 case X86_VENDOR_AMD:
374 if ((cpu_amd_range[i].model & modelflag) &&
375 (cpu_amd_range[i].flag & flag))
376 return 1;
377 break;
378 }
379 }
380
381 /* Invalid */
382 return 0;
383}
384
385static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
386 int index, unsigned flag)
387{
388 unsigned modelflag;
389
390 modelflag = per_cpu(cpu_modelflag, cpu);
391 *max = 0;
392 switch (per_cpu(cpu_model, cpu) >> 16) {
393 case X86_VENDOR_INTEL:
394 if ((cpu_intel_range[index].model & modelflag) &&
395 (cpu_intel_range[index].flag & flag)) {
396 *min = cpu_intel_range[index].min;
397 *max = cpu_intel_range[index].max;
398 }
399 break;
400 case X86_VENDOR_AMD:
401 if ((cpu_amd_range[index].model & modelflag) &&
402 (cpu_amd_range[index].flag & flag)) {
403 *min = cpu_amd_range[index].min;
404 *max = cpu_amd_range[index].max;
405 }
406 break;
407 }
408
409 return *max;
410}
411
412/* This function can also be called with seq = NULL for printk */
413static void print_cpu_data(struct seq_file *seq, unsigned type,
414 u32 low, u32 high)
415{
416 struct cpu_private *priv;
417 u64 val = high;
418
419 if (seq) {
420 priv = seq->private;
421 if (priv->file) {
422 val = (val << 32) | low;
423 seq_printf(seq, "0x%llx\n", val);
424 } else
425 seq_printf(seq, " %08x: %08x_%08x\n",
426 type, high, low);
427 } else
428 printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
429}
430
431/* This function can also be called with seq = NULL for printk */
432static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
433{
434 unsigned msr, msr_min, msr_max;
435 struct cpu_private *priv;
436 u32 low, high;
437 int i, range;
438
439 if (seq) {
440 priv = seq->private;
441 if (priv->file) {
442 if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
443 &low, &high))
444 print_cpu_data(seq, priv->reg, low, high);
445 return;
446 }
447 }
448
449 range = get_cpu_range_count(cpu);
450
451 for (i = 0; i < range; i++) {
452 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
453 continue;
454
455 for (msr = msr_min; msr <= msr_max; msr++) {
456 if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
457 continue;
458 print_cpu_data(seq, msr, low, high);
459 }
460 }
461}
462
463static void print_tss(void *arg)
464{
465 struct pt_regs *regs = task_pt_regs(current);
466 struct seq_file *seq = arg;
467 unsigned int seg;
468
469 seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
470 seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
471 seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
472 seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
473
474 seq_printf(seq, " RSI\t: %016lx\n", regs->si);
475 seq_printf(seq, " RDI\t: %016lx\n", regs->di);
476 seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
477 seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
478
479#ifdef CONFIG_X86_64
480 seq_printf(seq, " R08\t: %016lx\n", regs->r8);
481 seq_printf(seq, " R09\t: %016lx\n", regs->r9);
482 seq_printf(seq, " R10\t: %016lx\n", regs->r10);
483 seq_printf(seq, " R11\t: %016lx\n", regs->r11);
484 seq_printf(seq, " R12\t: %016lx\n", regs->r12);
485 seq_printf(seq, " R13\t: %016lx\n", regs->r13);
486 seq_printf(seq, " R14\t: %016lx\n", regs->r14);
487 seq_printf(seq, " R15\t: %016lx\n", regs->r15);
488#endif
489
490 asm("movl %%cs,%0" : "=r" (seg));
491 seq_printf(seq, " CS\t: %04x\n", seg);
492 asm("movl %%ds,%0" : "=r" (seg));
493 seq_printf(seq, " DS\t: %04x\n", seg);
494 seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
495 asm("movl %%es,%0" : "=r" (seg));
496 seq_printf(seq, " ES\t: %04x\n", seg);
497 asm("movl %%fs,%0" : "=r" (seg));
498 seq_printf(seq, " FS\t: %04x\n", seg);
499 asm("movl %%gs,%0" : "=r" (seg));
500 seq_printf(seq, " GS\t: %04x\n", seg);
501
502 seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
503
504 seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
505}
506
507static void print_cr(void *arg)
508{
509 struct seq_file *seq = arg;
510
511 seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
512 seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
513 seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
514 seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
515#ifdef CONFIG_X86_64
516 seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
517#endif
518}
519
520static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
521{
522 seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
523}
524
525static void print_dt(void *seq)
526{
527 struct desc_ptr dt;
528 unsigned long ldt;
529
530 /* IDT */
531 store_idt((struct desc_ptr *)&dt);
532 print_desc_ptr("IDT", seq, dt);
533
534 /* GDT */
535 store_gdt((struct desc_ptr *)&dt);
536 print_desc_ptr("GDT", seq, dt);
537
538 /* LDT */
539 store_ldt(ldt);
540 seq_printf(seq, " LDT\t: %016lx\n", ldt);
541
542 /* TR */
543 store_tr(ldt);
544 seq_printf(seq, " TR\t: %016lx\n", ldt);
545}
546
547static void print_dr(void *arg)
548{
549 struct seq_file *seq = arg;
550 unsigned long dr;
551 int i;
552
553 for (i = 0; i < 8; i++) {
 554 /* Ignore db4, db5: reserved, alias db6/db7 */
555 if ((i == 4) || (i == 5))
556 continue;
557 get_debugreg(dr, i);
558 seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
559 }
560
561 seq_printf(seq, "\n MSR\t:\n");
562}
563
564static void print_apic(void *arg)
565{
566 struct seq_file *seq = arg;
567
568#ifdef CONFIG_X86_LOCAL_APIC
569 seq_printf(seq, " LAPIC\t:\n");
570 seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
571 seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
572 seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
573 seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
574 seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
575 seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
576 seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
577 seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
578 seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
579 seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
580 seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
581 seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
582 seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
583 seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
584 seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
585 seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
586 seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
587 seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
588 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
589 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
590 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
591#endif /* CONFIG_X86_LOCAL_APIC */
592
593 seq_printf(seq, "\n MSR\t:\n");
594}
595
596static int cpu_seq_show(struct seq_file *seq, void *v)
597{
598 struct cpu_private *priv = seq->private;
599
600 if (priv == NULL)
601 return -EINVAL;
602
603 switch (cpu_base[priv->type].flag) {
604 case CPU_TSS:
605 smp_call_function_single(priv->cpu, print_tss, seq, 1);
606 break;
607 case CPU_CR:
608 smp_call_function_single(priv->cpu, print_cr, seq, 1);
609 break;
610 case CPU_DT:
611 smp_call_function_single(priv->cpu, print_dt, seq, 1);
612 break;
613 case CPU_DEBUG:
614 if (priv->file == CPU_INDEX_BIT)
615 smp_call_function_single(priv->cpu, print_dr, seq, 1);
616 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
617 break;
618 case CPU_APIC:
619 if (priv->file == CPU_INDEX_BIT)
620 smp_call_function_single(priv->cpu, print_apic, seq, 1);
621 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
622 break;
623
624 default:
625 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
626 break;
627 }
628 seq_printf(seq, "\n");
629
630 return 0;
631}
632
633static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
634{
635 if (*pos == 0) /* One time is enough ;-) */
636 return seq;
637
638 return NULL;
639}
640
641static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
642{
643 (*pos)++;
644
645 return cpu_seq_start(seq, pos);
646}
647
648static void cpu_seq_stop(struct seq_file *seq, void *v)
649{
650}
651
652static const struct seq_operations cpu_seq_ops = {
653 .start = cpu_seq_start,
654 .next = cpu_seq_next,
655 .stop = cpu_seq_stop,
656 .show = cpu_seq_show,
657};
658
659static int cpu_seq_open(struct inode *inode, struct file *file)
660{
661 struct cpu_private *priv = inode->i_private;
662 struct seq_file *seq;
663 int err;
664
665 err = seq_open(file, &cpu_seq_ops);
666 if (!err) {
667 seq = file->private_data;
668 seq->private = priv;
669 }
670
671 return err;
672}
673
674static int write_msr(struct cpu_private *priv, u64 val)
675{
676 u32 low, high;
677
678 high = (val >> 32) & 0xffffffff;
679 low = val & 0xffffffff;
680
681 if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
682 return 0;
683
684 return -EPERM;
685}
686
687static int write_cpu_register(struct cpu_private *priv, const char *buf)
688{
689 int ret = -EPERM;
690 u64 val;
691
692 ret = strict_strtoull(buf, 0, &val);
693 if (ret < 0)
694 return ret;
695
696 /* Supporting only MSRs */
697 if (priv->type < CPU_TSS_BIT)
698 return write_msr(priv, val);
699
700 return ret;
701}
702
703static ssize_t cpu_write(struct file *file, const char __user *ubuf,
704 size_t count, loff_t *off)
705{
706 struct seq_file *seq = file->private_data;
707 struct cpu_private *priv = seq->private;
708 char buf[19];
709
710 if ((priv == NULL) || (count >= sizeof(buf)))
711 return -EINVAL;
712
713 if (copy_from_user(&buf, ubuf, count))
714 return -EFAULT;
715
716 buf[count] = 0;
717
718 if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
719 if (!write_cpu_register(priv, buf))
720 return count;
721
722 return -EACCES;
723}
724
725static const struct file_operations cpu_fops = {
726 .owner = THIS_MODULE,
727 .open = cpu_seq_open,
728 .read = seq_read,
729 .write = cpu_write,
730 .llseek = seq_lseek,
731 .release = seq_release,
732};
733
734static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
735 unsigned file, struct dentry *dentry)
736{
737 struct cpu_private *priv = NULL;
738
 739 /* Already initialized */
740 if (file == CPU_INDEX_BIT)
741 if (per_cpu(cpu_arr[type].init, cpu))
742 return 0;
743
744 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
745 if (priv == NULL)
746 return -ENOMEM;
747
748 priv->cpu = cpu;
749 priv->type = type;
750 priv->reg = reg;
751 priv->file = file;
752 mutex_lock(&cpu_debug_lock);
753 per_cpu(priv_arr[type], cpu) = priv;
754 per_cpu(cpu_priv_count, cpu)++;
755 mutex_unlock(&cpu_debug_lock);
756
757 if (file)
758 debugfs_create_file(cpu_file[file].name, S_IRUGO,
759 dentry, (void *)priv, &cpu_fops);
760 else {
761 debugfs_create_file(cpu_base[type].name, S_IRUGO,
762 per_cpu(cpu_arr[type].dentry, cpu),
763 (void *)priv, &cpu_fops);
764 mutex_lock(&cpu_debug_lock);
765 per_cpu(cpu_arr[type].init, cpu) = 1;
766 mutex_unlock(&cpu_debug_lock);
767 }
768
769 return 0;
770}
771
772static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
773 struct dentry *dentry)
774{
775 unsigned file;
776 int err = 0;
777
778 for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
779 err = cpu_create_file(cpu, type, reg, file, dentry);
780 if (err)
781 return err;
782 }
783
784 return err;
785}
786
787static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
788{
789 struct dentry *cpu_dentry = NULL;
790 unsigned reg, reg_min, reg_max;
791 int i, range, err = 0;
792 char reg_dir[12];
793 u32 low, high;
794
795 range = get_cpu_range_count(cpu);
796
797 for (i = 0; i < range; i++) {
798 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
799 cpu_base[type].flag))
800 continue;
801
802 for (reg = reg_min; reg <= reg_max; reg++) {
803 if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
804 continue;
805
806 sprintf(reg_dir, "0x%x", reg);
807 cpu_dentry = debugfs_create_dir(reg_dir, dentry);
808 err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
809 if (err)
810 return err;
811 }
812 }
813
814 return err;
815}
816
817static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
818{
819 struct dentry *cpu_dentry = NULL;
820 unsigned type;
821 int err = 0;
822
823 for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
824 if (!is_typeflag_valid(cpu, cpu_base[type].flag))
825 continue;
826 cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
827 per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
828
829 if (type < CPU_TSS_BIT)
830 err = cpu_init_msr(cpu, type, cpu_dentry);
831 else
832 err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
833 cpu_dentry);
834 if (err)
835 return err;
836 }
837
838 return err;
839}
840
841static int cpu_init_cpu(void)
842{
843 struct dentry *cpu_dentry = NULL;
844 struct cpuinfo_x86 *cpui;
845 char cpu_dir[12];
846 unsigned cpu;
847 int err = 0;
848
849 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
850 cpui = &cpu_data(cpu);
851 if (!cpu_has(cpui, X86_FEATURE_MSR))
852 continue;
853 per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
854 (cpui->x86 << 8) |
855 (cpui->x86_model));
856 per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
857
858 sprintf(cpu_dir, "cpu%d", cpu);
859 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
860 err = cpu_init_allreg(cpu, cpu_dentry);
861
862 pr_info("cpu%d(%d) debug files %d\n",
863 cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
864 if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
865 pr_err("Register files count %d exceeds limit %d\n",
866 per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
867 per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
868 err = -ENFILE;
869 }
870 if (err)
871 return err;
872 }
873
874 return err;
875}
876
877static int __init cpu_debug_init(void)
878{
879 cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
880
881 return cpu_init_cpu();
882}
883
884static void __exit cpu_debug_exit(void)
885{
886 int i, cpu;
887
888 if (cpu_debugfs_dir)
889 debugfs_remove_recursive(cpu_debugfs_dir);
890
891 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
892 for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
893 kfree(per_cpu(priv_arr[i], cpu));
894}
895
896module_init(cpu_debug_init);
897module_exit(cpu_debug_exit);
898
899MODULE_AUTHOR("Jaswinder Singh Rajput");
900MODULE_DESCRIPTION("CPU Debug module");
901MODULE_LICENSE("GPL");
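Assuming the standard debugfs mount and the "x86" arch directory this hangs off, the files land along the lines of /sys/kernel/debug/x86/cpu/cpu0/<class>/0x<msr>/{index,value}, one directory per cpu_base[] class that is_typeflag_valid() accepts; cpu_write() only accepts stores for classes flagged writable in the table ("pmc" alone here). The read path boils down to rdmsr_safe_on_cpu(); a minimal kernel-side sketch with an arbitrary MSR number:

/*
 * Sketch of the module's core read path: fetch one MSR on one CPU,
 * tolerating a #GP fault on registers the CPU does not implement.
 */
#include <linux/kernel.h>
#include <asm/msr.h>

static void demo_read_one_msr(unsigned int cpu, u32 msr)
{
	u32 low, high;

	if (rdmsr_safe_on_cpu(cpu, msr, &low, &high)) {
		pr_info("cpu%u: MSR 0x%x not readable\n", cpu, msr);
		return;
	}
	pr_info("cpu%u: MSR 0x%x = %08x_%08x\n", cpu, msr, high, low);
}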
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index 65792c2cc462..52c839875478 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -87,30 +87,15 @@ config X86_POWERNOW_K7_ACPI
87config X86_POWERNOW_K8 87config X86_POWERNOW_K8
88 tristate "AMD Opteron/Athlon64 PowerNow!" 88 tristate "AMD Opteron/Athlon64 PowerNow!"
89 select CPU_FREQ_TABLE 89 select CPU_FREQ_TABLE
90 depends on ACPI && ACPI_PROCESSOR
90 help 91 help
91 This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors. 92 This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors.
92 93
93 To compile this driver as a module, choose M here: the 94 To compile this driver as a module, choose M here: the
94 module will be called powernow-k8. 95 module will be called powernow-k8.
95 96
96 For details, take a look at <file:Documentation/cpu-freq/>. 97 For details, take a look at <file:Documentation/cpu-freq/>.
97 98
98 If in doubt, say N.
99
100config X86_POWERNOW_K8_ACPI
101 bool
102 prompt "ACPI Support" if X86_32
103 depends on ACPI && X86_POWERNOW_K8 && ACPI_PROCESSOR
104 depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m)
105 default y
106 help
107 This provides access to the K8s Processor Performance States via ACPI.
108 This driver is probably required for CPUFreq to work with multi-socket and
109 SMP systems. It is not required on at least some single-socket yet
110 multi-core systems, even if SMP is enabled.
111
112 It is safe to say Y here.
113
114config X86_GX_SUSPMOD 99config X86_GX_SUSPMOD
115 tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation" 100 tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
116 depends on X86_32 && PCI 101 depends on X86_32 && PCI
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 560f7760dae5..509296df294d 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -1,6 +1,11 @@
1# Link order matters. K8 is preferred to ACPI because of firmware bugs in early
2# K8 systems. ACPI is preferred to all other hardware-specific drivers.
3# speedstep-* is preferred over p4-clockmod.
4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
1obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o 7obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
2obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o 8obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
3obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
4obj-$(CONFIG_X86_LONGHAUL) += longhaul.o 9obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
5obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o 10obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o
6obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o 11obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
@@ -10,7 +15,6 @@ obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o
10obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o 15obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o
11obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o 16obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
12obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o 17obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o
13obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
14obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o 18obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
15obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o 19obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
16obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o 20obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319d30c3..19f6b9d27e83 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * acpi-cpufreq.c - ACPI Processor P-States Driver ($Revision: 1.4 $) 2 * acpi-cpufreq.c - ACPI Processor P-States Driver
3 * 3 *
4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> 4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> 5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
@@ -33,19 +33,21 @@
33#include <linux/cpufreq.h> 33#include <linux/cpufreq.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <linux/ftrace.h> 36#include <trace/power.h>
37 37
38#include <linux/acpi.h> 38#include <linux/acpi.h>
39#include <linux/io.h>
40#include <linux/delay.h>
41#include <linux/uaccess.h>
42
39#include <acpi/processor.h> 43#include <acpi/processor.h>
40 44
41#include <asm/io.h>
42#include <asm/msr.h> 45#include <asm/msr.h>
43#include <asm/processor.h> 46#include <asm/processor.h>
44#include <asm/cpufeature.h> 47#include <asm/cpufeature.h>
45#include <asm/delay.h>
46#include <asm/uaccess.h>
47 48
48#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "acpi-cpufreq", msg) 49#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
50 "acpi-cpufreq", msg)
49 51
50MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski"); 52MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
51MODULE_DESCRIPTION("ACPI Processor P-States Driver"); 53MODULE_DESCRIPTION("ACPI Processor P-States Driver");
@@ -70,6 +72,8 @@ struct acpi_cpufreq_data {
70 72
71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); 73static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
72 74
75DEFINE_TRACE(power_mark);
76
73/* acpi_perf_data is a pointer to percpu data. */ 77/* acpi_perf_data is a pointer to percpu data. */
74static struct acpi_processor_performance *acpi_perf_data; 78static struct acpi_processor_performance *acpi_perf_data;
75 79
@@ -95,7 +99,7 @@ static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
95 99
96 perf = data->acpi_data; 100 perf = data->acpi_data;
97 101
98 for (i=0; i<perf->state_count; i++) { 102 for (i = 0; i < perf->state_count; i++) {
99 if (value == perf->states[i].status) 103 if (value == perf->states[i].status)
100 return data->freq_table[i].frequency; 104 return data->freq_table[i].frequency;
101 } 105 }
@@ -110,7 +114,7 @@ static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
110 msr &= INTEL_MSR_RANGE; 114 msr &= INTEL_MSR_RANGE;
111 perf = data->acpi_data; 115 perf = data->acpi_data;
112 116
113 for (i=0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) { 117 for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
114 if (msr == perf->states[data->freq_table[i].index].status) 118 if (msr == perf->states[data->freq_table[i].index].status)
115 return data->freq_table[i].frequency; 119 return data->freq_table[i].frequency;
116 } 120 }
@@ -138,15 +142,13 @@ struct io_addr {
138 u8 bit_width; 142 u8 bit_width;
139}; 143};
140 144
141typedef union {
142 struct msr_addr msr;
143 struct io_addr io;
144} drv_addr_union;
145
146struct drv_cmd { 145struct drv_cmd {
147 unsigned int type; 146 unsigned int type;
148 const struct cpumask *mask; 147 const struct cpumask *mask;
149 drv_addr_union addr; 148 union {
149 struct msr_addr msr;
150 struct io_addr io;
151 } addr;
150 u32 val; 152 u32 val;
151}; 153};
152 154
@@ -369,7 +371,7 @@ static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
369 unsigned int cur_freq; 371 unsigned int cur_freq;
370 unsigned int i; 372 unsigned int i;
371 373
372 for (i=0; i<100; i++) { 374 for (i = 0; i < 100; i++) {
373 cur_freq = extract_freq(get_cur_val(mask), data); 375 cur_freq = extract_freq(get_cur_val(mask), data);
374 if (cur_freq == freq) 376 if (cur_freq == freq)
375 return 1; 377 return 1;
@@ -494,7 +496,7 @@ acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
494 unsigned long freq; 496 unsigned long freq;
495 unsigned long freqn = perf->states[0].core_frequency * 1000; 497 unsigned long freqn = perf->states[0].core_frequency * 1000;
496 498
497 for (i=0; i<(perf->state_count-1); i++) { 499 for (i = 0; i < (perf->state_count-1); i++) {
498 freq = freqn; 500 freq = freqn;
499 freqn = perf->states[i+1].core_frequency * 1000; 501 freqn = perf->states[i+1].core_frequency * 1000;
500 if ((2 * cpu_khz) > (freqn + freq)) { 502 if ((2 * cpu_khz) > (freqn + freq)) {
@@ -601,7 +603,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
601 if (!data) 603 if (!data)
602 return -ENOMEM; 604 return -ENOMEM;
603 605
604 data->acpi_data = percpu_ptr(acpi_perf_data, cpu); 606 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
605 per_cpu(drv_data, cpu) = data; 607 per_cpu(drv_data, cpu) = data;
606 608
607 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) 609 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
@@ -673,17 +675,29 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
673 675
674 /* detect transition latency */ 676 /* detect transition latency */
675 policy->cpuinfo.transition_latency = 0; 677 policy->cpuinfo.transition_latency = 0;
676 for (i=0; i<perf->state_count; i++) { 678 for (i = 0; i < perf->state_count; i++) {
677 if ((perf->states[i].transition_latency * 1000) > 679 if ((perf->states[i].transition_latency * 1000) >
678 policy->cpuinfo.transition_latency) 680 policy->cpuinfo.transition_latency)
679 policy->cpuinfo.transition_latency = 681 policy->cpuinfo.transition_latency =
680 perf->states[i].transition_latency * 1000; 682 perf->states[i].transition_latency * 1000;
681 } 683 }
682 684
 685 /* Check for high latency (>20 us) from buggy BIOSes, like on T42 */
686 if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
687 policy->cpuinfo.transition_latency > 20 * 1000) {
688 static int print_once;
689 policy->cpuinfo.transition_latency = 20 * 1000;
690 if (!print_once) {
691 print_once = 1;
 692 printk(KERN_INFO "Capping off P-state transition latency"
 693 " at 20 us\n");
694 }
695 }
696
683 data->max_freq = perf->states[0].core_frequency * 1000; 697 data->max_freq = perf->states[0].core_frequency * 1000;
684 /* table init */ 698 /* table init */
685 for (i=0; i<perf->state_count; i++) { 699 for (i = 0; i < perf->state_count; i++) {
686 if (i>0 && perf->states[i].core_frequency >= 700 if (i > 0 && perf->states[i].core_frequency >=
687 data->freq_table[valid_states-1].frequency / 1000) 701 data->freq_table[valid_states-1].frequency / 1000)
688 continue; 702 continue;
689 703
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
index 965ea52767ac..733093d60436 100644
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
@@ -32,7 +32,7 @@
  * nforce2_chipset:
  * FSB is changed using the chipset
  */
-static struct pci_dev *nforce2_chipset_dev;
+static struct pci_dev *nforce2_dev;
 
 /* fid:
  * multiplier * 10
@@ -56,7 +56,9 @@ MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
 MODULE_PARM_DESC(min_fsb,
 		"Minimum FSB to use, if not defined: current FSB - 50");
 
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg)
+#define PFX "cpufreq-nforce2: "
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+		"cpufreq-nforce2", msg)
 
 /**
  * nforce2_calc_fsb - calculate FSB
@@ -118,11 +120,11 @@ static void nforce2_write_pll(int pll)
 	int temp;
 
 	/* Set the pll addr. to 0x00 */
-	pci_write_config_dword(nforce2_chipset_dev, NFORCE2_PLLADR, 0);
+	pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0);
 
 	/* Now write the value in all 64 registers */
 	for (temp = 0; temp <= 0x3f; temp++)
-		pci_write_config_dword(nforce2_chipset_dev, NFORCE2_PLLREG, pll);
+		pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll);
 
 	return;
 }
@@ -139,8 +141,8 @@ static unsigned int nforce2_fsb_read(int bootfsb)
 	u32 fsb, temp = 0;
 
 	/* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
-	nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
-			0x01EF, PCI_ANY_ID, PCI_ANY_ID, NULL);
+	nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF,
+				PCI_ANY_ID, PCI_ANY_ID, NULL);
 	if (!nforce2_sub5)
 		return 0;
 
@@ -148,13 +150,13 @@ static unsigned int nforce2_fsb_read(int bootfsb)
 	fsb /= 1000000;
 
 	/* Check if PLL register is already set */
-	pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
+	pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
 
 	if (bootfsb || !temp)
 		return fsb;
 
 	/* Use PLL register FSB value */
-	pci_read_config_dword(nforce2_chipset_dev, NFORCE2_PLLREG, &temp);
+	pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp);
 	fsb = nforce2_calc_fsb(temp);
 
 	return fsb;
@@ -174,18 +176,18 @@ static int nforce2_set_fsb(unsigned int fsb)
 	int pll = 0;
 
 	if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) {
-		printk(KERN_ERR "cpufreq: FSB %d is out of range!\n", fsb);
+		printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb);
 		return -EINVAL;
 	}
 
 	tfsb = nforce2_fsb_read(0);
 	if (!tfsb) {
-		printk(KERN_ERR "cpufreq: Error while reading the FSB\n");
+		printk(KERN_ERR PFX "Error while reading the FSB\n");
 		return -EINVAL;
 	}
 
 	/* First write? Then set actual value */
-	pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
+	pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
 	if (!temp) {
 		pll = nforce2_calc_pll(tfsb);
 
@@ -197,7 +199,7 @@ static int nforce2_set_fsb(unsigned int fsb)
 
 	/* Enable write access */
 	temp = 0x01;
-	pci_write_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8)temp);
+	pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp);
 
 	diff = tfsb - fsb;
 
@@ -222,7 +224,7 @@ static int nforce2_set_fsb(unsigned int fsb)
 	}
 
 	temp = 0x40;
-	pci_write_config_byte(nforce2_chipset_dev, NFORCE2_PLLADR, (u8)temp);
+	pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp);
 
 	return 0;
 }
@@ -244,7 +246,8 @@ static unsigned int nforce2_get(unsigned int cpu)
  * nforce2_target - set a new CPUFreq policy
  * @policy: new policy
  * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
+ * @relation: how that frequency relates to achieved frequency
+ *	(CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
  *
  * Sets a new CPUFreq policy.
  */
@@ -276,7 +279,7 @@ static int nforce2_target(struct cpufreq_policy *policy,
 	/* local_irq_save(flags); */
 
 	if (nforce2_set_fsb(target_fsb) < 0)
-		printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n",
+		printk(KERN_ERR PFX "Changing FSB to %d failed\n",
 			target_fsb);
 	else
 		dprintk("Changed FSB successfully to %d\n",
@@ -327,8 +330,8 @@ static int nforce2_cpu_init(struct cpufreq_policy *policy)
 	/* FIX: Get FID from CPU */
 	if (!fid) {
 		if (!cpu_khz) {
-			printk(KERN_WARNING
-				"cpufreq: cpu_khz not set, can't calculate multiplier!\n");
+			printk(KERN_WARNING PFX
+				"cpu_khz not set, can't calculate multiplier!\n");
 			return -ENODEV;
 		}
 
@@ -343,7 +346,7 @@ static int nforce2_cpu_init(struct cpufreq_policy *policy)
 		}
 	}
 
-	printk(KERN_INFO "cpufreq: FSB currently at %i MHz, FID %d.%d\n", fsb,
+	printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb,
 		fid / 10, fid % 10);
 
 	/* Set maximum FSB to FSB at boot time */
@@ -392,17 +395,18 @@ static struct cpufreq_driver nforce2_driver = {
  */
 static unsigned int nforce2_detect_chipset(void)
 {
-	nforce2_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
+	nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
 		PCI_DEVICE_ID_NVIDIA_NFORCE2,
 		PCI_ANY_ID, PCI_ANY_ID, NULL);
 
-	if (nforce2_chipset_dev == NULL)
+	if (nforce2_dev == NULL)
 		return -ENODEV;
 
-	printk(KERN_INFO "cpufreq: Detected nForce2 chipset revision %X\n",
-		nforce2_chipset_dev->revision);
-	printk(KERN_INFO
-		"cpufreq: FSB changing is maybe unstable and can lead to crashes and data loss.\n");
+	printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n",
+		nforce2_dev->revision);
+	printk(KERN_INFO PFX
+		"FSB changing is maybe unstable and can lead to "
+		"crashes and data loss.\n");
 
 	return 0;
 }
@@ -420,7 +424,7 @@ static int __init nforce2_init(void)
 
 	/* detect chipset */
 	if (nforce2_detect_chipset()) {
-		printk(KERN_ERR "cpufreq: No nForce2 chipset.\n");
+		printk(KERN_INFO PFX "No nForce2 chipset.\n");
 		return -ENODEV;
 	}
 
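
The nforce2 hunks above are dominated by one pattern: every hand-written "cpufreq: " string becomes the single PFX macro, so all of the driver's messages share one grep-able prefix. A minimal sketch of the idiom, using only the driver's own names plus standard C string-literal concatenation (the call site is illustrative):

	#define PFX "cpufreq-nforce2: "

	/* KERN_ERR, PFX and the format are adjacent literals, merged
	 * by the compiler into a single string at build time */
	printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb);
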
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
index c2f930d86640..35a257dd4bb7 100644
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
@@ -12,12 +12,12 @@
 #include <linux/cpufreq.h>
 #include <linux/ioport.h>
 #include <linux/slab.h>
+#include <linux/timex.h>
+#include <linux/io.h>
+#include <linux/delay.h>
 
 #include <asm/msr.h>
 #include <asm/tsc.h>
-#include <asm/timex.h>
-#include <asm/io.h>
-#include <asm/delay.h>
 
 #define EPS_BRAND_C7M 0
 #define EPS_BRAND_C7 1
@@ -184,7 +184,7 @@ static int eps_cpu_init(struct cpufreq_policy *policy)
 		break;
 	}
 
-	switch(brand) {
+	switch (brand) {
 	case EPS_BRAND_C7M:
 		printk(KERN_CONT "C7-M\n");
 		break;
@@ -204,12 +204,12 @@ static int eps_cpu_init(struct cpufreq_policy *policy)
 	}
 	/* Enable Enhanced PowerSaver */
 	rdmsrl(MSR_IA32_MISC_ENABLE, val);
-	if (!(val & 1 << 16)) {
-		val |= 1 << 16;
+	if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
+		val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
 		wrmsrl(MSR_IA32_MISC_ENABLE, val);
 		/* Can be locked at 0 */
 		rdmsrl(MSR_IA32_MISC_ENABLE, val);
-		if (!(val & 1 << 16)) {
+		if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
 			printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n");
 			return -ENODEV;
 		}
@@ -218,17 +218,20 @@ static int eps_cpu_init(struct cpufreq_policy *policy)
 	/* Print voltage and multiplier */
 	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
 	current_voltage = lo & 0xff;
-	printk(KERN_INFO "eps: Current voltage = %dmV\n", current_voltage * 16 + 700);
+	printk(KERN_INFO "eps: Current voltage = %dmV\n",
+			current_voltage * 16 + 700);
 	current_multiplier = (lo >> 8) & 0xff;
 	printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier);
 
 	/* Print limits */
 	max_voltage = hi & 0xff;
-	printk(KERN_INFO "eps: Highest voltage = %dmV\n", max_voltage * 16 + 700);
+	printk(KERN_INFO "eps: Highest voltage = %dmV\n",
+			max_voltage * 16 + 700);
 	max_multiplier = (hi >> 8) & 0xff;
 	printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier);
 	min_voltage = (hi >> 16) & 0xff;
-	printk(KERN_INFO "eps: Lowest voltage = %dmV\n", min_voltage * 16 + 700);
+	printk(KERN_INFO "eps: Lowest voltage = %dmV\n",
+			min_voltage * 16 + 700);
 	min_multiplier = (hi >> 24) & 0xff;
 	printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier);
 
@@ -318,7 +321,7 @@ static int eps_cpu_exit(struct cpufreq_policy *policy)
 	return 0;
 }
 
-static struct freq_attr* eps_attr[] = {
+static struct freq_attr *eps_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
 	NULL,
 };
@@ -356,7 +359,7 @@ static void __exit eps_exit(void)
 	cpufreq_unregister_driver(&eps_driver);
 }
 
-MODULE_AUTHOR("Rafał Bilski <rafalbilski@interia.pl>");
+MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>");
 MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's.");
 MODULE_LICENSE("GPL");
 
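
The key functional-looking change in e_powersaver.c is also cosmetic: the magic 1 << 16 becomes the named MSR bit. A sketch of the enable-and-verify idiom the driver relies on, assuming the constant from the x86 MSR headers of this era (rdmsrl/wrmsrl are the real kernel accessors):

	u64 val;

	rdmsrl(MSR_IA32_MISC_ENABLE, val);
	if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
		val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
		wrmsrl(MSR_IA32_MISC_ENABLE, val);
		/* firmware may lock the bit at 0, so read back to verify */
		rdmsrl(MSR_IA32_MISC_ENABLE, val);
		if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP))
			return -ENODEV;
	}
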
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index fe613c93b366..006b278b0d5d 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -184,7 +184,8 @@ static int elanfreq_target(struct cpufreq_policy *policy,
 {
 	unsigned int newstate = 0;
 
-	if (cpufreq_frequency_table_target(policy, &elanfreq_table[0], target_freq, relation, &newstate))
+	if (cpufreq_frequency_table_target(policy, &elanfreq_table[0],
+			target_freq, relation, &newstate))
 		return -EINVAL;
 
 	elanfreq_set_cpu_state(newstate);
@@ -301,7 +302,8 @@ static void __exit elanfreq_exit(void)
 module_param(max_freq, int, 0444);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>");
+MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, "
+		"Sven Geggus <sven@geggus.net>");
 MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs");
 
 module_init(elanfreq_init);
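
Both elanfreq hunks are pure 80-column fixes and show the two wrapping styles used throughout this commit: long calls gain a continuation line, and long string arguments are split across adjacent literals that the compiler concatenates. Both fragments below are taken directly from the hunks above:

	if (cpufreq_frequency_table_target(policy, &elanfreq_table[0],
			target_freq, relation, &newstate))
		return -EINVAL;

	MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, "
			"Sven Geggus <sven@geggus.net>");
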
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index 9d9eae82e60f..ac27ec2264d5 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -79,8 +79,9 @@
 #include <linux/smp.h>
 #include <linux/cpufreq.h>
 #include <linux/pci.h>
+#include <linux/errno.h>
+
 #include <asm/processor-cyrix.h>
-#include <asm/errno.h>
 
 /* PCI config registers, all at F0 */
 #define PCI_PMER1 0x80 /* power management enable register 1 */
@@ -122,8 +123,8 @@ static struct gxfreq_params *gx_params;
 static int stock_freq;
 
 /* PCI bus clock - defaults to 30.000 if cpu_khz is not available */
-static int pci_busclk = 0;
-module_param (pci_busclk, int, 0444);
+static int pci_busclk;
+module_param(pci_busclk, int, 0444);
 
 /* maximum duration for which the cpu may be suspended
  * (32us * MAX_DURATION). If no parameter is given, this defaults
@@ -132,7 +133,7 @@ module_param (pci_busclk, int, 0444);
  * is suspended -- processing power is just 0.39% of what it used to be,
  * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */
 static int max_duration = 255;
-module_param (max_duration, int, 0444);
+module_param(max_duration, int, 0444);
 
 /* For the default policy, we want at least some processing power
  * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV)
@@ -140,7 +141,8 @@ module_param (max_duration, int, 0444);
 #define POLICY_MIN_DIV 20
 
 
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "gx-suspmod", msg)
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+		"gx-suspmod", msg)
 
 /**
  * we can detect a core multipiler from dir0_lsb
@@ -166,12 +168,20 @@ static int gx_freq_mult[16] = {
  * Low Level chipset interface *
  ****************************************************************/
 static struct pci_device_id gx_chipset_tbl[] __initdata = {
-	{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, PCI_ANY_ID, PCI_ANY_ID },
-	{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, PCI_ANY_ID, PCI_ANY_ID },
-	{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510, PCI_ANY_ID, PCI_ANY_ID },
+	{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY,
+		PCI_ANY_ID, PCI_ANY_ID },
+	{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520,
+		PCI_ANY_ID, PCI_ANY_ID },
+	{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510,
+		PCI_ANY_ID, PCI_ANY_ID },
 	{ 0, },
 };
 
+static void gx_write_byte(int reg, int value)
+{
+	pci_write_config_byte(gx_params->cs55x0, reg, value);
+}
+
 /**
  * gx_detect_chipset:
  *
@@ -200,7 +210,8 @@ static __init struct pci_dev *gx_detect_chipset(void)
 /**
  * gx_get_cpuspeed:
  *
- * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi Geode CPU runs.
+ * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi
+ * Geode CPU runs.
  */
 static unsigned int gx_get_cpuspeed(unsigned int cpu)
 {
@@ -217,17 +228,18 @@ static unsigned int gx_get_cpuspeed(unsigned int cpu)
  *
  **/
 
-static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration, u8 *off_duration)
+static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration,
+		u8 *off_duration)
 {
 	unsigned int i;
 	u8 tmp_on, tmp_off;
 	int old_tmp_freq = stock_freq;
 	int tmp_freq;
 
-	*off_duration=1;
-	*on_duration=0;
+	*off_duration = 1;
+	*on_duration = 0;
 
-	for (i=max_duration; i>0; i--) {
+	for (i = max_duration; i > 0; i--) {
 		tmp_off = ((khz * i) / stock_freq) & 0xff;
 		tmp_on = i - tmp_off;
 		tmp_freq = (stock_freq * tmp_off) / i;
@@ -259,26 +271,34 @@ static void gx_set_cpuspeed(unsigned int khz)
 	freqs.cpu = 0;
 	freqs.old = gx_get_cpuspeed(0);
 
-	new_khz = gx_validate_speed(khz, &gx_params->on_duration, &gx_params->off_duration);
+	new_khz = gx_validate_speed(khz, &gx_params->on_duration,
+			&gx_params->off_duration);
 
 	freqs.new = new_khz;
 
 	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 	local_irq_save(flags);
 
-	if (new_khz != stock_freq) { /* if new khz == 100% of CPU speed, it is special case */
+
+
+	if (new_khz != stock_freq) {
+		/* if new khz == 100% of CPU speed, it is special case */
 		switch (gx_params->cs55x0->device) {
 		case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
 			pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP;
 			/* FIXME: need to test other values -- Zwane,Miura */
-			pci_write_config_byte(gx_params->cs55x0, PCI_IRQTC, 4); /* typical 2 to 4ms */
-			pci_write_config_byte(gx_params->cs55x0, PCI_VIDTC, 100);/* typical 50 to 100ms */
-			pci_write_config_byte(gx_params->cs55x0, PCI_PMER1, pmer1);
-
-			if (gx_params->cs55x0->revision < 0x10) { /* CS5530(rev 1.2, 1.3) */
-				suscfg = gx_params->pci_suscfg | SUSMOD;
-			} else { /* CS5530A,B.. */
-				suscfg = gx_params->pci_suscfg | SUSMOD | PWRSVE;
+			/* typical 2 to 4ms */
+			gx_write_byte(PCI_IRQTC, 4);
+			/* typical 50 to 100ms */
+			gx_write_byte(PCI_VIDTC, 100);
+			gx_write_byte(PCI_PMER1, pmer1);
+
+			if (gx_params->cs55x0->revision < 0x10) {
+				/* CS5530(rev 1.2, 1.3) */
+				suscfg = gx_params->pci_suscfg|SUSMOD;
+			} else {
+				/* CS5530A,B.. */
+				suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE;
 			}
 			break;
 		case PCI_DEVICE_ID_CYRIX_5520:
@@ -294,13 +314,13 @@ static void gx_set_cpuspeed(unsigned int khz)
 		suscfg = gx_params->pci_suscfg & ~(SUSMOD);
 		gx_params->off_duration = 0;
 		gx_params->on_duration = 0;
-		dprintk("suspend modulation disabled: cpu runs 100 percent speed.\n");
+		dprintk("suspend modulation disabled: cpu runs 100%% speed.\n");
 	}
 
-	pci_write_config_byte(gx_params->cs55x0, PCI_MODOFF, gx_params->off_duration);
-	pci_write_config_byte(gx_params->cs55x0, PCI_MODON, gx_params->on_duration);
+	gx_write_byte(PCI_MODOFF, gx_params->off_duration);
+	gx_write_byte(PCI_MODON, gx_params->on_duration);
 
-	pci_write_config_byte(gx_params->cs55x0, PCI_SUSCFG, suscfg);
+	gx_write_byte(PCI_SUSCFG, suscfg);
 	pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg);
 
 	local_irq_restore(flags);
@@ -334,7 +354,8 @@ static int cpufreq_gx_verify(struct cpufreq_policy *policy)
 		return -EINVAL;
 
 	policy->cpu = 0;
-	cpufreq_verify_within_limits(policy, (stock_freq / max_duration), stock_freq);
+	cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
+			stock_freq);
 
 	/* it needs to be assured that at least one supported frequency is
 	 * within policy->min and policy->max. If it is not, policy->max
@@ -354,7 +375,8 @@ static int cpufreq_gx_verify(struct cpufreq_policy *policy)
 	policy->max = tmp_freq;
 	if (policy->max < policy->min)
 		policy->max = policy->min;
-	cpufreq_verify_within_limits(policy, (stock_freq / max_duration), stock_freq);
+	cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
+			stock_freq);
 
 	return 0;
 }
@@ -398,18 +420,18 @@ static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy)
 		return -ENODEV;
 
 	/* determine maximum frequency */
-	if (pci_busclk) {
+	if (pci_busclk)
 		maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
-	} else if (cpu_khz) {
+	else if (cpu_khz)
 		maxfreq = cpu_khz;
-	} else {
+	else
 		maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
-	}
+
 	stock_freq = maxfreq;
 	curfreq = gx_get_cpuspeed(0);
 
 	dprintk("cpu max frequency is %d.\n", maxfreq);
-	dprintk("cpu current frequency is %dkHz.\n",curfreq);
+	dprintk("cpu current frequency is %dkHz.\n", curfreq);
 
 	/* setup basic struct for cpufreq API */
 	policy->cpu = 0;
@@ -447,7 +469,8 @@ static int __init cpufreq_gx_init(void)
 	struct pci_dev *gx_pci;
 
 	/* Test if we have the right hardware */
-	if ((gx_pci = gx_detect_chipset()) == NULL)
+	gx_pci = gx_detect_chipset();
+	if (gx_pci == NULL)
 		return -ENODEV;
 
 	/* check whether module parameters are sane */
@@ -468,9 +491,11 @@ static int __init cpufreq_gx_init(void)
 	pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1));
 	pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
 	pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
-	pci_read_config_byte(params->cs55x0, PCI_MODOFF, &(params->off_duration));
+	pci_read_config_byte(params->cs55x0, PCI_MODOFF,
+			&(params->off_duration));
 
-	if ((ret = cpufreq_register_driver(&gx_suspmod_driver))) {
+	ret = cpufreq_register_driver(&gx_suspmod_driver);
+	if (ret) {
 		kfree(params);
 		return ret; /* register error! */
 	}
@@ -485,9 +510,9 @@ static void __exit cpufreq_gx_exit(void)
 	kfree(gx_params);
 }
 
-MODULE_AUTHOR ("Hiroshi Miura <miura@da-cha.org>");
-MODULE_DESCRIPTION ("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
-MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Hiroshi Miura <miura@da-cha.org>");
+MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
+MODULE_LICENSE("GPL");
 
 module_init(cpufreq_gx_init);
 module_exit(cpufreq_gx_exit);
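
The one structural addition in gx-suspmod.c is the small gx_write_byte() helper; it only factors out the repeated pci_write_config_byte(gx_params->cs55x0, ...) boilerplate and changes no behaviour. Helper and a converted call site, as introduced above:

	static void gx_write_byte(int reg, int value)
	{
		pci_write_config_byte(gx_params->cs55x0, reg, value);
	}

	/* was: pci_write_config_byte(gx_params->cs55x0, PCI_MODON, ...); */
	gx_write_byte(PCI_MODON, gx_params->on_duration);
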
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index a4cff5d6e380..0bd48e65a0ca 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -30,12 +30,12 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/delay.h>
+#include <linux/timex.h>
+#include <linux/io.h>
+#include <linux/acpi.h>
+#include <linux/kernel.h>
 
 #include <asm/msr.h>
-#include <asm/timex.h>
-#include <asm/io.h>
-#include <asm/acpi.h>
-#include <linux/acpi.h>
 #include <acpi/processor.h>
 
 #include "longhaul.h"
@@ -58,7 +58,7 @@
 #define USE_NORTHBRIDGE (1 << 2)
 
 static int cpu_model;
-static unsigned int numscales=16;
+static unsigned int numscales = 16;
 static unsigned int fsb;
 
 static const struct mV_pos *vrm_mV_table;
@@ -67,8 +67,8 @@ static const unsigned char *mV_vrm_table;
 static unsigned int highest_speed, lowest_speed; /* kHz */
 static unsigned int minmult, maxmult;
 static int can_scale_voltage;
-static struct acpi_processor *pr = NULL;
-static struct acpi_processor_cx *cx = NULL;
+static struct acpi_processor *pr;
+static struct acpi_processor_cx *cx;
 static u32 acpi_regs_addr;
 static u8 longhaul_flags;
 static unsigned int longhaul_index;
@@ -78,12 +78,13 @@ static int scale_voltage;
 static int disable_acpi_c3;
 static int revid_errata;
 
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg)
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+		"longhaul", msg)
 
 
 /* Clock ratios multiplied by 10 */
-static int clock_ratio[32];
-static int eblcr_table[32];
+static int mults[32];
+static int eblcr[32];
 static int longhaul_version;
 static struct cpufreq_frequency_table *longhaul_table;
 
@@ -93,7 +94,7 @@ static char speedbuffer[8];
 static char *print_speed(int speed)
 {
 	if (speed < 1000) {
-		snprintf(speedbuffer, sizeof(speedbuffer),"%dMHz", speed);
+		snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed);
 		return speedbuffer;
 	}
 
@@ -122,27 +123,28 @@ static unsigned int calc_speed(int mult)
 
 static int longhaul_get_cpu_mult(void)
 {
-	unsigned long invalue=0,lo, hi;
+	unsigned long invalue = 0, lo, hi;
 
-	rdmsr (MSR_IA32_EBL_CR_POWERON, lo, hi);
-	invalue = (lo & (1<<22|1<<23|1<<24|1<<25)) >>22;
-	if (longhaul_version==TYPE_LONGHAUL_V2 || longhaul_version==TYPE_POWERSAVER) {
+	rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi);
+	invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22;
+	if (longhaul_version == TYPE_LONGHAUL_V2 ||
+			longhaul_version == TYPE_POWERSAVER) {
 		if (lo & (1<<27))
-			invalue+=16;
+			invalue += 16;
 	}
-	return eblcr_table[invalue];
+	return eblcr[invalue];
 }
 
 /* For processor with BCR2 MSR */
 
-static void do_longhaul1(unsigned int clock_ratio_index)
+static void do_longhaul1(unsigned int mults_index)
 {
 	union msr_bcr2 bcr2;
 
 	rdmsrl(MSR_VIA_BCR2, bcr2.val);
 	/* Enable software clock multiplier */
 	bcr2.bits.ESOFTBF = 1;
-	bcr2.bits.CLOCKMUL = clock_ratio_index & 0xff;
+	bcr2.bits.CLOCKMUL = mults_index & 0xff;
 
 	/* Sync to timer tick */
 	safe_halt();
@@ -161,7 +163,7 @@ static void do_longhaul1(unsigned int clock_ratio_index)
 
 /* For processor with Longhaul MSR */
 
-static void do_powersaver(int cx_address, unsigned int clock_ratio_index,
+static void do_powersaver(int cx_address, unsigned int mults_index,
 		unsigned int dir)
 {
 	union msr_longhaul longhaul;
@@ -173,11 +175,11 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index,
 		longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
 	else
 		longhaul.bits.RevisionKey = 0;
-	longhaul.bits.SoftBusRatio = clock_ratio_index & 0xf;
-	longhaul.bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4;
+	longhaul.bits.SoftBusRatio = mults_index & 0xf;
+	longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4;
 	/* Setup new voltage */
 	if (can_scale_voltage)
-		longhaul.bits.SoftVID = (clock_ratio_index >> 8) & 0x1f;
+		longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f;
 	/* Sync to timer tick */
 	safe_halt();
 	/* Raise voltage if necessary */
@@ -240,14 +242,14 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index,
 
 /**
  * longhaul_set_cpu_frequency()
- * @clock_ratio_index : bitpattern of the new multiplier.
+ * @mults_index : bitpattern of the new multiplier.
  *
  * Sets a new clock ratio.
  */
 
 static void longhaul_setstate(unsigned int table_index)
 {
-	unsigned int clock_ratio_index;
+	unsigned int mults_index;
 	int speed, mult;
 	struct cpufreq_freqs freqs;
 	unsigned long flags;
@@ -256,9 +258,9 @@ static void longhaul_setstate(unsigned int table_index)
 	u32 bm_timeout = 1000;
 	unsigned int dir = 0;
 
-	clock_ratio_index = longhaul_table[table_index].index;
+	mults_index = longhaul_table[table_index].index;
 	/* Safety precautions */
-	mult = clock_ratio[clock_ratio_index & 0x1f];
+	mult = mults[mults_index & 0x1f];
 	if (mult == -1)
 		return;
 	speed = calc_speed(mult);
@@ -274,7 +276,7 @@ static void longhaul_setstate(unsigned int table_index)
 
 	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 
-	dprintk ("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
+	dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
 			fsb, mult/10, mult%10, print_speed(speed/1000));
 retry_loop:
 	preempt_disable();
@@ -282,8 +284,8 @@ retry_loop:
 
 	pic2_mask = inb(0xA1);
 	pic1_mask = inb(0x21); /* works on C3. save mask. */
-	outb(0xFF,0xA1); /* Overkill */
-	outb(0xFE,0x21); /* TMR0 only */
+	outb(0xFF, 0xA1); /* Overkill */
+	outb(0xFE, 0x21); /* TMR0 only */
 
 	/* Wait while PCI bus is busy. */
 	if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE
@@ -303,7 +305,7 @@ retry_loop:
 		outb(3, 0x22);
 	} else if ((pr != NULL) && pr->flags.bm_control) {
 		/* Disable bus master arbitration */
-		acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
+		acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1);
 	}
 	switch (longhaul_version) {
 
@@ -312,7 +314,7 @@ retry_loop:
 	 * Software controlled multipliers only.
 	 */
 	case TYPE_LONGHAUL_V1:
-		do_longhaul1(clock_ratio_index);
+		do_longhaul1(mults_index);
 		break;
 
 	/*
@@ -326,10 +328,10 @@ retry_loop:
 	case TYPE_POWERSAVER:
 		if (longhaul_flags & USE_ACPI_C3) {
 			/* Don't allow wakeup */
-			acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
-			do_powersaver(cx->address, clock_ratio_index, dir);
+			acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
+			do_powersaver(cx->address, mults_index, dir);
 		} else {
-			do_powersaver(0, clock_ratio_index, dir);
+			do_powersaver(0, mults_index, dir);
 		}
 		break;
 	}
@@ -339,10 +341,10 @@ retry_loop:
 		outb(0, 0x22);
 	} else if ((pr != NULL) && pr->flags.bm_control) {
 		/* Enable bus master arbitration */
-		acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
+		acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0);
 	}
-	outb(pic2_mask,0xA1); /* restore mask */
-	outb(pic1_mask,0x21);
+	outb(pic2_mask, 0xA1); /* restore mask */
+	outb(pic1_mask, 0x21);
 
 	local_irq_restore(flags);
 	preempt_enable();
@@ -392,7 +394,8 @@ retry_loop:
 	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 
 	if (!bm_timeout)
-		printk(KERN_INFO PFX "Warning: Timeout while waiting for idle PCI bus.\n");
+		printk(KERN_INFO PFX "Warning: Timeout while waiting for "
+				"idle PCI bus.\n");
 }
 
 /*
@@ -458,31 +461,32 @@ static int __init longhaul_get_ranges(void)
 		break;
 	}
 
-	dprintk ("MinMult:%d.%dx MaxMult:%d.%dx\n",
+	dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n",
 		 minmult/10, minmult%10, maxmult/10, maxmult%10);
 
 	highest_speed = calc_speed(maxmult);
 	lowest_speed = calc_speed(minmult);
-	dprintk ("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
+	dprintk("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
 		 print_speed(lowest_speed/1000),
 		 print_speed(highest_speed/1000));
 
 	if (lowest_speed == highest_speed) {
-		printk (KERN_INFO PFX "highestspeed == lowest, aborting.\n");
+		printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n");
 		return -EINVAL;
 	}
 	if (lowest_speed > highest_speed) {
-		printk (KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
+		printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
 			lowest_speed, highest_speed);
 		return -EINVAL;
 	}
 
-	longhaul_table = kmalloc((numscales + 1) * sizeof(struct cpufreq_frequency_table), GFP_KERNEL);
-	if(!longhaul_table)
+	longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table),
+			GFP_KERNEL);
+	if (!longhaul_table)
 		return -ENOMEM;
 
 	for (j = 0; j < numscales; j++) {
-		ratio = clock_ratio[j];
+		ratio = mults[j];
 		if (ratio == -1)
 			continue;
 		if (ratio > maxmult || ratio < minmult)
@@ -507,13 +511,10 @@ static int __init longhaul_get_ranges(void)
 			}
 		}
 		if (min_i != j) {
-			unsigned int temp;
-			temp = longhaul_table[j].frequency;
-			longhaul_table[j].frequency = longhaul_table[min_i].frequency;
-			longhaul_table[min_i].frequency = temp;
-			temp = longhaul_table[j].index;
-			longhaul_table[j].index = longhaul_table[min_i].index;
-			longhaul_table[min_i].index = temp;
+			swap(longhaul_table[j].frequency,
+			     longhaul_table[min_i].frequency);
+			swap(longhaul_table[j].index,
+			     longhaul_table[min_i].index);
 		}
 	}
 
@@ -521,7 +522,7 @@ static int __init longhaul_get_ranges(void)
 
 	/* Find index we are running on */
 	for (j = 0; j < k; j++) {
-		if (clock_ratio[longhaul_table[j].index & 0x1f] == mult) {
+		if (mults[longhaul_table[j].index & 0x1f] == mult) {
 			longhaul_index = j;
 			break;
 		}
@@ -559,20 +560,22 @@ static void __init longhaul_setup_voltagescaling(void)
 	maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
 
 	if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
-		printk (KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
+		printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
 					"Voltage scaling disabled.\n",
-					minvid.mV/1000, minvid.mV%1000, maxvid.mV/1000, maxvid.mV%1000);
+					minvid.mV/1000, minvid.mV%1000,
+					maxvid.mV/1000, maxvid.mV%1000);
 		return;
 	}
 
 	if (minvid.mV == maxvid.mV) {
-		printk (KERN_INFO PFX "Claims to support voltage scaling but min & max are "
-				"both %d.%03d. Voltage scaling disabled\n",
+		printk(KERN_INFO PFX "Claims to support voltage scaling but "
+				"min & max are both %d.%03d. "
+				"Voltage scaling disabled\n",
 				maxvid.mV/1000, maxvid.mV%1000);
 		return;
 	}
 
-	/* How many voltage steps */
+	/* How many voltage steps*/
 	numvscales = maxvid.pos - minvid.pos + 1;
 	printk(KERN_INFO PFX
 		"Max VID=%d.%03d "
@@ -586,7 +589,7 @@ static void __init longhaul_setup_voltagescaling(void)
 	j = longhaul.bits.MinMHzBR;
 	if (longhaul.bits.MinMHzBR4)
 		j += 16;
-	min_vid_speed = eblcr_table[j];
+	min_vid_speed = eblcr[j];
 	if (min_vid_speed == -1)
 		return;
 	switch (longhaul.bits.MinMHzFSB) {
@@ -617,7 +620,8 @@ static void __init longhaul_setup_voltagescaling(void)
 			pos = minvid.pos;
 		longhaul_table[j].index |= mV_vrm_table[pos] << 8;
 		vid = vrm_mV_table[mV_vrm_table[pos]];
-		printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n", speed, j, vid.mV);
+		printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n",
+				speed, j, vid.mV);
 		j++;
 	}
 
@@ -640,7 +644,8 @@ static int longhaul_target(struct cpufreq_policy *policy,
 	unsigned int dir = 0;
 	u8 vid, current_vid;
 
-	if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq, relation, &table_index))
+	if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq,
+			relation, &table_index))
 		return -EINVAL;
 
 	/* Don't set same frequency again */
@@ -656,7 +661,8 @@ static int longhaul_target(struct cpufreq_policy *policy,
 	 * this in hardware, C3 is old and we need to do this
 	 * in software. */
 	i = longhaul_index;
-	current_vid = (longhaul_table[longhaul_index].index >> 8) & 0x1f;
+	current_vid = (longhaul_table[longhaul_index].index >> 8);
+	current_vid &= 0x1f;
 	if (table_index > longhaul_index)
 		dir = 1;
 	while (i != table_index) {
@@ -691,9 +697,9 @@ static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
 {
 	struct acpi_device *d;
 
-	if ( acpi_bus_get_device(obj_handle, &d) ) {
+	if (acpi_bus_get_device(obj_handle, &d))
 		return 0;
-	}
+
 	*return_value = acpi_driver_data(d);
 	return 1;
 }
@@ -750,7 +756,7 @@ static int longhaul_setup_southbridge(void)
 	/* Find VT8235 southbridge */
 	dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
 	if (dev == NULL)
-	/* Find VT8237 southbridge */
+		/* Find VT8237 southbridge */
 		dev = pci_get_device(PCI_VENDOR_ID_VIA,
 				PCI_DEVICE_ID_VIA_8237, NULL);
 	if (dev != NULL) {
@@ -769,7 +775,8 @@ static int longhaul_setup_southbridge(void)
 	if (pci_cmd & 1 << 7) {
 		pci_read_config_dword(dev, 0x88, &acpi_regs_addr);
 		acpi_regs_addr &= 0xff00;
-		printk(KERN_INFO PFX "ACPI I/O at 0x%x\n", acpi_regs_addr);
+		printk(KERN_INFO PFX "ACPI I/O at 0x%x\n",
+				acpi_regs_addr);
 	}
 
 	pci_dev_put(dev);
@@ -781,7 +788,7 @@ static int longhaul_setup_southbridge(void)
 static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 {
 	struct cpuinfo_x86 *c = &cpu_data(0);
-	char *cpuname=NULL;
+	char *cpuname = NULL;
 	int ret;
 	u32 lo, hi;
 
@@ -791,8 +798,8 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 		cpu_model = CPU_SAMUEL;
 		cpuname = "C3 'Samuel' [C5A]";
 		longhaul_version = TYPE_LONGHAUL_V1;
-		memcpy (clock_ratio, samuel1_clock_ratio, sizeof(samuel1_clock_ratio));
-		memcpy (eblcr_table, samuel1_eblcr, sizeof(samuel1_eblcr));
+		memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
+		memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr));
 		break;
 
 	case 7:
@@ -803,10 +810,8 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 			cpuname = "C3 'Samuel 2' [C5B]";
 			/* Note, this is not a typo, early Samuel2's had
 			 * Samuel1 ratios. */
-			memcpy(clock_ratio, samuel1_clock_ratio,
-				sizeof(samuel1_clock_ratio));
-			memcpy(eblcr_table, samuel2_eblcr,
-				sizeof(samuel2_eblcr));
+			memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
+			memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
 			break;
 		case 1 ... 15:
 			longhaul_version = TYPE_LONGHAUL_V1;
@@ -817,10 +822,8 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 				cpu_model = CPU_EZRA;
 				cpuname = "C3 'Ezra' [C5C]";
 			}
-			memcpy(clock_ratio, ezra_clock_ratio,
-				sizeof(ezra_clock_ratio));
-			memcpy(eblcr_table, ezra_eblcr,
-				sizeof(ezra_eblcr));
+			memcpy(mults, ezra_mults, sizeof(ezra_mults));
+			memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr));
 			break;
 		}
 		break;
@@ -829,18 +832,16 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 		cpu_model = CPU_EZRA_T;
 		cpuname = "C3 'Ezra-T' [C5M]";
 		longhaul_version = TYPE_POWERSAVER;
-		numscales=32;
-		memcpy (clock_ratio, ezrat_clock_ratio, sizeof(ezrat_clock_ratio));
-		memcpy (eblcr_table, ezrat_eblcr, sizeof(ezrat_eblcr));
+		numscales = 32;
+		memcpy(mults, ezrat_mults, sizeof(ezrat_mults));
+		memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr));
 		break;
 
 	case 9:
 		longhaul_version = TYPE_POWERSAVER;
 		numscales = 32;
-		memcpy(clock_ratio,
-		       nehemiah_clock_ratio,
-		       sizeof(nehemiah_clock_ratio));
-		memcpy(eblcr_table, nehemiah_eblcr, sizeof(nehemiah_eblcr));
+		memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults));
+		memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr));
 		switch (c->x86_mask) {
 		case 0 ... 1:
 			cpu_model = CPU_NEHEMIAH;
@@ -869,14 +870,14 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 		longhaul_version = TYPE_LONGHAUL_V1;
 	}
 
-	printk (KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
+	printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
 	switch (longhaul_version) {
 	case TYPE_LONGHAUL_V1:
 	case TYPE_LONGHAUL_V2:
-		printk ("Longhaul v%d supported.\n", longhaul_version);
+		printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version);
 		break;
 	case TYPE_POWERSAVER:
-		printk ("Powersaver supported.\n");
+		printk(KERN_CONT "Powersaver supported.\n");
 		break;
 	};
 
@@ -940,7 +941,7 @@ static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
 	return 0;
 }
 
-static struct freq_attr* longhaul_attr[] = {
+static struct freq_attr *longhaul_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
 	NULL,
 };
@@ -966,13 +967,15 @@ static int __init longhaul_init(void)
 
 #ifdef CONFIG_SMP
 	if (num_online_cpus() > 1) {
-		printk(KERN_ERR PFX "More than 1 CPU detected, longhaul disabled.\n");
+		printk(KERN_ERR PFX "More than 1 CPU detected, "
+				"longhaul disabled.\n");
 		return -ENODEV;
 	}
 #endif
 #ifdef CONFIG_X86_IO_APIC
 	if (cpu_has_apic) {
-		printk(KERN_ERR PFX "APIC detected. Longhaul is currently broken in this configuration.\n");
+		printk(KERN_ERR PFX "APIC detected. Longhaul is currently "
+				"broken in this configuration.\n");
 		return -ENODEV;
 	}
 #endif
@@ -993,8 +996,8 @@ static void __exit longhaul_exit(void)
 {
 	int i;
 
-	for (i=0; i < numscales; i++) {
-		if (clock_ratio[i] == maxmult) {
+	for (i = 0; i < numscales; i++) {
+		if (mults[i] == maxmult) {
 			longhaul_setstate(i);
 			break;
 		}
@@ -1007,11 +1010,11 @@ static void __exit longhaul_exit(void)
 /* Even if BIOS is exporting ACPI C3 state, and it is used
  * with success when CPU is idle, this state doesn't
  * trigger frequency transition in some cases. */
-module_param (disable_acpi_c3, int, 0644);
+module_param(disable_acpi_c3, int, 0644);
 MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
 /* Change CPU voltage with frequency. Very usefull to save
 * power, but most VIA C3 processors aren't supporting it. */
-module_param (scale_voltage, int, 0644);
+module_param(scale_voltage, int, 0644);
 MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
 /* Force revision key to 0 for processors which doesn't
 * support voltage scaling, but are introducing itself as
@@ -1019,9 +1022,9 @@ MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
 module_param(revid_errata, int, 0644);
 MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
 
-MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
-MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
-MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
+MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors.");
+MODULE_LICENSE("GPL");
 
 late_initcall(longhaul_init);
 module_exit(longhaul_exit);
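
Besides the clock_ratio -> mults and eblcr_table -> eblcr renames, longhaul.c now sorts its frequency table with the kernel's swap() helper instead of an open-coded temporary; that is why linux/kernel.h joins the includes. A sketch of roughly what swap() expands to (the macro lives in linux/kernel.h):

	#define swap(a, b) \
		do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

so swap(longhaul_table[j].frequency, longhaul_table[min_i].frequency) exchanges the two entries in place, with __tmp scoped inside the do/while block.
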
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
index 4fcc320997df..e2360a469f79 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h
@@ -49,14 +49,14 @@ union msr_longhaul {
 
 /*
  * Clock ratio tables. Div/Mod by 10 to get ratio.
- * The eblcr ones specify the ratio read from the CPU.
- * The clock_ratio ones specify what to write to the CPU.
+ * The eblcr values specify the ratio read from the CPU.
+ * The mults values specify what to write to the CPU.
  */
 
 /*
 * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
 */
-static const int __initdata samuel1_clock_ratio[16] = {
+static const int __initdata samuel1_mults[16] = {
 	-1, /* 0000 -> RESERVED */
 	30, /* 0001 -> 3.0x */
 	40, /* 0010 -> 4.0x */
@@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = {
 /*
 * VIA C3 Ezra
 */
-static const int __initdata ezra_clock_ratio[16] = {
+static const int __initdata ezra_mults[16] = {
 	100, /* 0000 -> 10.0x */
 	30, /* 0001 -> 3.0x */
 	40, /* 0010 -> 4.0x */
@@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = {
 /*
 * VIA C3 (Ezra-T) [C5M].
 */
-static const int __initdata ezrat_clock_ratio[32] = {
+static const int __initdata ezrat_mults[32] = {
 	100, /* 0000 -> 10.0x */
 	30, /* 0001 -> 3.0x */
 	40, /* 0010 -> 4.0x */
@@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = {
 /*
 * VIA C3 Nehemiah */
 
-static const int __initdata nehemiah_clock_ratio[32] = {
+static const int __initdata nehemiah_mults[32] = {
 	100, /* 0000 -> 10.0x */
 	-1, /* 0001 -> 16.0x */
 	40, /* 0010 -> 4.0x */
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index 777a7ff075de..da5f70fcb766 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -11,12 +11,13 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/cpufreq.h>
+#include <linux/timex.h>
 
 #include <asm/msr.h>
 #include <asm/processor.h>
-#include <asm/timex.h>
 
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longrun", msg)
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+		"longrun", msg)
 
 static struct cpufreq_driver longrun_driver;
 
@@ -51,7 +52,7 @@ static void __init longrun_get_policy(struct cpufreq_policy *policy)
 	msr_lo &= 0x0000007F;
 	msr_hi &= 0x0000007F;
 
-	if ( longrun_high_freq <= longrun_low_freq ) {
+	if (longrun_high_freq <= longrun_low_freq) {
 		/* Assume degenerate Longrun table */
 		policy->min = policy->max = longrun_high_freq;
 	} else {
@@ -79,7 +80,7 @@ static int longrun_set_policy(struct cpufreq_policy *policy)
 	if (!policy)
 		return -EINVAL;
 
-	if ( longrun_high_freq <= longrun_low_freq ) {
+	if (longrun_high_freq <= longrun_low_freq) {
 		/* Assume degenerate Longrun table */
 		pctg_lo = pctg_hi = 100;
 	} else {
@@ -152,7 +153,7 @@ static unsigned int longrun_get(unsigned int cpu)
 	cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
 	dprintk("cpuid eax is %u\n", eax);
 
-	return (eax * 1000);
+	return eax * 1000;
 }
 
 /**
@@ -196,7 +197,8 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
 		rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
 		*high_freq = msr_lo * 1000; /* to kHz */
 
-		dprintk("longrun table interface told %u - %u kHz\n", *low_freq, *high_freq);
+		dprintk("longrun table interface told %u - %u kHz\n",
+				*low_freq, *high_freq);
 
 		if (*low_freq > *high_freq)
 			*low_freq = *high_freq;
@@ -219,7 +221,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
 	cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
 	/* try decreasing in 10% steps, some processors react only
 	 * on some barrier values */
-	for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -=10) {
+	for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) {
 		/* set to 0 to try_hi perf_pctg */
 		msr_lo &= 0xFFFFFF80;
 		msr_hi &= 0xFFFFFF80;
@@ -236,7 +238,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
 
 	/* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
	 * eqals
-	 * low_freq * ( 1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
+	 * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
 	 *
	 * high_freq * perf_pctg is stored tempoarily into "ebx".
 	 */
@@ -317,9 +319,10 @@ static void __exit longrun_exit(void)
 }
 
 
-MODULE_AUTHOR ("Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION ("LongRun driver for Transmeta Crusoe and Efficeon processors.");
-MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
+MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and "
+		"Efficeon processors.");
+MODULE_LICENSE("GPL");
 
 module_init(longrun_init);
 module_exit(longrun_exit);
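
Every driver in this series rewraps its dprintk macro the same way: a backslash continuation keeps the definition under 80 columns while it still expands to a single cpufreq_debug_printk() call. A sketch of the pattern with longrun's own names (cpufreq_debug_printk() is the cpufreq debug API of this era and, as far as the editor can tell, compiles away when CONFIG_CPU_FREQ_DEBUG is disabled):

	#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
			"longrun", msg)

	dprintk("cpuid eax is %u\n", eax);
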
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index b585e04cbc9e..6ac55bd341ae 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -27,15 +27,17 @@
27#include <linux/cpufreq.h> 27#include <linux/cpufreq.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/cpumask.h> 29#include <linux/cpumask.h>
30#include <linux/timex.h>
30 31
31#include <asm/processor.h> 32#include <asm/processor.h>
32#include <asm/msr.h> 33#include <asm/msr.h>
33#include <asm/timex.h> 34#include <asm/timer.h>
34 35
35#include "speedstep-lib.h" 36#include "speedstep-lib.h"
36 37
37#define PFX "p4-clockmod: " 38#define PFX "p4-clockmod: "
38#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "p4-clockmod", msg) 39#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
40 "p4-clockmod", msg)
39 41
40/* 42/*
41 * Duty Cycle (3bits), note DC_DISABLE is not specified in 43 * Duty Cycle (3bits), note DC_DISABLE is not specified in
@@ -58,7 +60,8 @@ static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
58{ 60{
59 u32 l, h; 61 u32 l, h;
60 62
61 if (!cpu_online(cpu) || (newstate > DC_DISABLE) || (newstate == DC_RESV)) 63 if (!cpu_online(cpu) ||
64 (newstate > DC_DISABLE) || (newstate == DC_RESV))
62 return -EINVAL; 65 return -EINVAL;
63 66
64 rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h); 67 rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h);
@@ -66,7 +69,8 @@ static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
66 if (l & 0x01) 69 if (l & 0x01)
67 dprintk("CPU#%d currently thermal throttled\n", cpu); 70 dprintk("CPU#%d currently thermal throttled\n", cpu);
68 71
69 if (has_N44_O17_errata[cpu] && (newstate == DC_25PT || newstate == DC_DFLT)) 72 if (has_N44_O17_errata[cpu] &&
73 (newstate == DC_25PT || newstate == DC_DFLT))
70 newstate = DC_38PT; 74 newstate = DC_38PT;
71 75
72 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); 76 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
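For context, the body of cpufreq_p4_setdc() (only partially visible in these hunks) programs on-demand clock modulation through MSR_IA32_THERM_CONTROL. A rough sketch of the write side; the bit layout (duty cycle in bits 3:1, enable flag in bit 4) is one reading of the IA-32 SDM, not text from the patch:

        /* Sketch: enable clock modulation at duty-cycle step 1..7,
         * or turn it off for DC_DISABLE. Bit layout assumed from the
         * IA-32 SDM description of IA32_CLOCK_MODULATION. */
        static void setdc_sketch(unsigned int cpu, unsigned int newstate)
        {
                u32 l, h;

                rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
                if (newstate == DC_DISABLE)
                        l &= ~(1 << 4);         /* modulation off */
                else
                        l = (l & ~0x1eU) | (newstate << 1) | (1 << 4);
                wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h);
        }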
@@ -112,7 +116,8 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
112 struct cpufreq_freqs freqs; 116 struct cpufreq_freqs freqs;
113 int i; 117 int i;
114 118
115 if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0], target_freq, relation, &newstate)) 119 if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0],
120 target_freq, relation, &newstate))
116 return -EINVAL; 121 return -EINVAL;
117 122
118 freqs.old = cpufreq_p4_get(policy->cpu); 123 freqs.old = cpufreq_p4_get(policy->cpu);
@@ -127,7 +132,8 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 132 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128 } 133 }
129 134
130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software 135 /* run on each logical CPU,
136 * see section 13.15.3 of IA32 Intel Architecture Software
131 * Developer's Manual, Volume 3 137 * Developer's Manual, Volume 3
132 */ 138 */
133 for_each_cpu(i, policy->cpus) 139 for_each_cpu(i, policy->cpus)
@@ -153,28 +159,30 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
153{ 159{
154 if (c->x86 == 0x06) { 160 if (c->x86 == 0x06) {
155 if (cpu_has(c, X86_FEATURE_EST)) 161 if (cpu_has(c, X86_FEATURE_EST))
156 printk(KERN_WARNING PFX "Warning: EST-capable CPU detected. " 162 printk(KERN_WARNING PFX "Warning: EST-capable CPU "
157 "The acpi-cpufreq module offers voltage scaling" 163 "detected. The acpi-cpufreq module offers "
158 " in addition of frequency scaling. You should use " 164 "voltage scaling in addition of frequency "
159 "that instead of p4-clockmod, if possible.\n"); 165 "scaling. You should use that instead of "
166 "p4-clockmod, if possible.\n");
160 switch (c->x86_model) { 167 switch (c->x86_model) {
161 case 0x0E: /* Core */ 168 case 0x0E: /* Core */
162 case 0x0F: /* Core Duo */ 169 case 0x0F: /* Core Duo */
163 case 0x16: /* Celeron Core */ 170 case 0x16: /* Celeron Core */
164 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; 171 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
165 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PCORE); 172 return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
166 case 0x0D: /* Pentium M (Dothan) */ 173 case 0x0D: /* Pentium M (Dothan) */
167 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; 174 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
168 /* fall through */ 175 /* fall through */
169 case 0x09: /* Pentium M (Banias) */ 176 case 0x09: /* Pentium M (Banias) */
170 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PM); 177 return speedstep_get_frequency(SPEEDSTEP_CPU_PM);
171 } 178 }
172 } 179 }
173 180
174 if (c->x86 != 0xF) { 181 if (c->x86 != 0xF) {
175 if (!cpu_has(c, X86_FEATURE_EST)) 182 if (!cpu_has(c, X86_FEATURE_EST))
176 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. " 183 printk(KERN_WARNING PFX "Unknown CPU. "
177 "Please send an e-mail to <cpufreq@vger.kernel.org>\n"); 184 "Please send an e-mail to "
185 "<cpufreq@vger.kernel.org>\n");
178 return 0; 186 return 0;
179 } 187 }
180 188
@@ -182,16 +190,16 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
182 * throttling is active or not. */ 190 * throttling is active or not. */
183 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; 191 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
184 192
185 if (speedstep_detect_processor() == SPEEDSTEP_PROCESSOR_P4M) { 193 if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) {
186 printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. " 194 printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. "
187 "The speedstep-ich or acpi cpufreq modules offer " 195 "The speedstep-ich or acpi cpufreq modules offer "
188 "voltage scaling in addition of frequency scaling. " 196 "voltage scaling in addition of frequency scaling. "
189 "You should use either one instead of p4-clockmod, " 197 "You should use either one instead of p4-clockmod, "
190 "if possible.\n"); 198 "if possible.\n");
191 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_P4M); 199 return speedstep_get_frequency(SPEEDSTEP_CPU_P4M);
192 } 200 }
193 201
194 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_P4D); 202 return speedstep_get_frequency(SPEEDSTEP_CPU_P4D);
195} 203}
196 204
197 205
@@ -203,7 +211,7 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
203 unsigned int i; 211 unsigned int i;
204 212
205#ifdef CONFIG_SMP 213#ifdef CONFIG_SMP
206 cpumask_copy(policy->cpus, &per_cpu(cpu_sibling_map, policy->cpu)); 214 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
207#endif 215#endif
208 216
209 /* Errata workaround */ 217 /* Errata workaround */
@@ -217,14 +225,20 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
217 dprintk("has errata -- disabling low frequencies\n"); 225 dprintk("has errata -- disabling low frequencies\n");
218 } 226 }
219 227
228 if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D &&
229 c->x86_model < 2) {
230 /* switch to maximum frequency and measure result */
231 cpufreq_p4_setdc(policy->cpu, DC_DISABLE);
232 recalibrate_cpu_khz();
233 }
220 /* get max frequency */ 234 /* get max frequency */
221 stock_freq = cpufreq_p4_get_frequency(c); 235 stock_freq = cpufreq_p4_get_frequency(c);
222 if (!stock_freq) 236 if (!stock_freq)
223 return -EINVAL; 237 return -EINVAL;
224 238
225 /* table init */ 239 /* table init */
226 for (i=1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) { 240 for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
227 if ((i<2) && (has_N44_O17_errata[policy->cpu])) 241 if ((i < 2) && (has_N44_O17_errata[policy->cpu]))
228 p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID; 242 p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
229 else 243 else
230 p4clockmod_table[i].frequency = (stock_freq * i)/8; 244 p4clockmod_table[i].frequency = (stock_freq * i)/8;
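The table initialisation above yields a simple ladder: entry i advertises i/8 of the stock frequency, i.e. 12.5 % steps, with the lowest step marked CPUFREQ_ENTRY_INVALID on parts carrying the N44/O17 errata. A worked example, assuming a hypothetical 2.4 GHz part and a table running i = 1..8:

        /* Assumed stock_freq = 2400000 kHz, for illustration only:
         *   i = 1 -> 300000 kHz  (12.5 %; invalid with N44/O17 errata)
         *   i = 2 -> 600000 kHz  (25 %)
         *   ...
         *   i = 8 -> 2400000 kHz (100 %, i.e. unthrottled)
         */
        static unsigned int p4_step_khz(unsigned int stock_khz, unsigned int i)
        {
                return stock_khz * i / 8;
        }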
@@ -232,7 +246,10 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
232 cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu); 246 cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu);
233 247
234 /* cpuinfo and default policy values */ 248 /* cpuinfo and default policy values */
235 policy->cpuinfo.transition_latency = 1000000; /* assumed */ 249
250 /* the transition latency is set to be 1 higher than the maximum
251 * transition latency of the ondemand governor */
252 policy->cpuinfo.transition_latency = 10000001;
236 policy->cur = stock_freq; 253 policy->cur = stock_freq;
237 254
238 return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]); 255 return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]);
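On the 10000001 value: the ondemand governor refuses drivers whose advertised transition latency exceeds its internal limit, 10 ms (10000000 ns) in kernels of this era, so one nanosecond above that limit quietly opts p4-clockmod out of ondemand while other governors remain usable. A sketch of the governor-side check this number is aimed at (constant name per cpufreq convention; treat the exact form as an assumption):

        #define TRANSITION_LATENCY_LIMIT (10 * 1000 * 1000) /* 10 ms, in ns */

        /* Assumed shape of the check in the ondemand governor: */
        static int ondemand_start_check(struct cpufreq_policy *policy)
        {
                if (policy->cpuinfo.transition_latency > TRANSITION_LATENCY_LIMIT)
                        return -EINVAL; /* governor declines this CPU */
                return 0;
        }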
@@ -258,12 +275,12 @@ static unsigned int cpufreq_p4_get(unsigned int cpu)
258 l = DC_DISABLE; 275 l = DC_DISABLE;
259 276
260 if (l != DC_DISABLE) 277 if (l != DC_DISABLE)
261 return (stock_freq * l / 8); 278 return stock_freq * l / 8;
262 279
263 return stock_freq; 280 return stock_freq;
264} 281}
265 282
266static struct freq_attr* p4clockmod_attr[] = { 283static struct freq_attr *p4clockmod_attr[] = {
267 &cpufreq_freq_attr_scaling_available_freqs, 284 &cpufreq_freq_attr_scaling_available_freqs,
268 NULL, 285 NULL,
269}; 286};
@@ -277,7 +294,6 @@ static struct cpufreq_driver p4clockmod_driver = {
277 .name = "p4-clockmod", 294 .name = "p4-clockmod",
278 .owner = THIS_MODULE, 295 .owner = THIS_MODULE,
279 .attr = p4clockmod_attr, 296 .attr = p4clockmod_attr,
280 .hide_interface = 1,
281}; 297};
282 298
283 299
@@ -299,9 +315,10 @@ static int __init cpufreq_p4_init(void)
299 315
300 ret = cpufreq_register_driver(&p4clockmod_driver); 316 ret = cpufreq_register_driver(&p4clockmod_driver);
301 if (!ret) 317 if (!ret)
302 printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock Modulation available\n"); 318 printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock "
319 "Modulation available\n");
303 320
304 return (ret); 321 return ret;
305} 322}
306 323
307 324
@@ -311,9 +328,9 @@ static void __exit cpufreq_p4_exit(void)
311} 328}
312 329
313 330
314MODULE_AUTHOR ("Zwane Mwaikambo <zwane@commfireservices.com>"); 331MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>");
315MODULE_DESCRIPTION ("cpufreq driver for Pentium(TM) 4/Xeon(TM)"); 332MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
316MODULE_LICENSE ("GPL"); 333MODULE_LICENSE("GPL");
317 334
318late_initcall(cpufreq_p4_init); 335late_initcall(cpufreq_p4_init);
319module_exit(cpufreq_p4_exit); 336module_exit(cpufreq_p4_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index c1ac5790c63e..f10dea409f40 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * This file was based upon code in Powertweak Linux (http://powertweak.sf.net) 2 * This file was based upon code in Powertweak Linux (http://powertweak.sf.net)
3 * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä, Dominik Brodowski. 3 * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä,
4 * Dominik Brodowski.
4 * 5 *
5 * Licensed under the terms of the GNU GPL License version 2. 6 * Licensed under the terms of the GNU GPL License version 2.
6 * 7 *
@@ -13,14 +14,15 @@
13#include <linux/cpufreq.h> 14#include <linux/cpufreq.h>
14#include <linux/ioport.h> 15#include <linux/ioport.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16
17#include <asm/msr.h>
18#include <linux/timex.h> 17#include <linux/timex.h>
19#include <linux/io.h> 18#include <linux/io.h>
20 19
20#include <asm/msr.h>
21
21#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long 22#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
22 as it is unused */ 23 as it is unused */
23 24
25#define PFX "powernow-k6: "
24static unsigned int busfreq; /* FSB, in 10 kHz */ 26static unsigned int busfreq; /* FSB, in 10 kHz */
25static unsigned int max_multiplier; 27static unsigned int max_multiplier;
26 28
@@ -47,8 +49,8 @@ static struct cpufreq_frequency_table clock_ratio[] = {
47 */ 49 */
48static int powernow_k6_get_cpu_multiplier(void) 50static int powernow_k6_get_cpu_multiplier(void)
49{ 51{
50 u64 invalue = 0; 52 u64 invalue = 0;
51 u32 msrval; 53 u32 msrval;
52 54
53 msrval = POWERNOW_IOPORT + 0x1; 55 msrval = POWERNOW_IOPORT + 0x1;
54 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 56 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
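The enable write above is half of the handshake; the rest of powernow_k6_get_cpu_multiplier() reads the multiplier back through the PowerNow I/O port block and disables it again. The sketch below paraphrases that flow from memory of this driver, so treat the port offsets and bit positions as assumptions:

        /* Hedged sketch of the full read-back sequence. */
        static int k6_multiplier_sketch(void)
        {
                unsigned long invalue;
                u32 msrval;

                msrval = POWERNOW_IOPORT + 0x1;
                wrmsr(MSR_K6_EPMR, msrval, 0);         /* enable PMU port */
                invalue = inl(POWERNOW_IOPORT + 0x8);  /* read state */
                msrval = POWERNOW_IOPORT + 0x0;
                wrmsr(MSR_K6_EPMR, msrval, 0);         /* disable again */

                return clock_ratio[(invalue >> 5) & 7].index;
        }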
@@ -68,12 +70,12 @@ static int powernow_k6_get_cpu_multiplier(void)
68 */ 70 */
69static void powernow_k6_set_state(unsigned int best_i) 71static void powernow_k6_set_state(unsigned int best_i)
70{ 72{
71 unsigned long outvalue = 0, invalue = 0; 73 unsigned long outvalue = 0, invalue = 0;
72 unsigned long msrval; 74 unsigned long msrval;
73 struct cpufreq_freqs freqs; 75 struct cpufreq_freqs freqs;
74 76
75 if (clock_ratio[best_i].index > max_multiplier) { 77 if (clock_ratio[best_i].index > max_multiplier) {
76 printk(KERN_ERR "cpufreq: invalid target frequency\n"); 78 printk(KERN_ERR PFX "invalid target frequency\n");
77 return; 79 return;
78 } 80 }
79 81
@@ -119,7 +121,8 @@ static int powernow_k6_verify(struct cpufreq_policy *policy)
119 * powernow_k6_setpolicy - sets a new CPUFreq policy 121 * powernow_k6_setpolicy - sets a new CPUFreq policy
120 * @policy: new policy 122 * @policy: new policy
121 * @target_freq: the target frequency 123 * @target_freq: the target frequency
122 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) 124 * @relation: how that frequency relates to achieved frequency
125 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
123 * 126 *
124 * sets a new CPUFreq policy 127 * sets a new CPUFreq policy
125 */ 128 */
@@ -127,9 +130,10 @@ static int powernow_k6_target(struct cpufreq_policy *policy,
127 unsigned int target_freq, 130 unsigned int target_freq,
128 unsigned int relation) 131 unsigned int relation)
129{ 132{
130 unsigned int newstate = 0; 133 unsigned int newstate = 0;
131 134
132 if (cpufreq_frequency_table_target(policy, &clock_ratio[0], target_freq, relation, &newstate)) 135 if (cpufreq_frequency_table_target(policy, &clock_ratio[0],
136 target_freq, relation, &newstate))
133 return -EINVAL; 137 return -EINVAL;
134 138
135 powernow_k6_set_state(newstate); 139 powernow_k6_set_state(newstate);
@@ -140,7 +144,7 @@ static int powernow_k6_target(struct cpufreq_policy *policy,
140 144
141static int powernow_k6_cpu_init(struct cpufreq_policy *policy) 145static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
142{ 146{
143 unsigned int i; 147 unsigned int i, f;
144 int result; 148 int result;
145 149
146 if (policy->cpu != 0) 150 if (policy->cpu != 0)
@@ -152,10 +156,11 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
152 156
153 /* table init */ 157 /* table init */
154 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { 158 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
155 if (clock_ratio[i].index > max_multiplier) 159 f = clock_ratio[i].index;
160 if (f > max_multiplier)
156 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; 161 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
157 else 162 else
158 clock_ratio[i].frequency = busfreq * clock_ratio[i].index; 163 clock_ratio[i].frequency = busfreq * f;
159 } 164 }
160 165
161 /* cpuinfo and default policy values */ 166 /* cpuinfo and default policy values */
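A note on units, since the product above only works because of two conventions: busfreq is kept in 10 kHz units and clock_ratio[i].index stores ten times the multiplier, so busfreq * f lands directly in kHz. A worked example for a hypothetical 100 MHz FSB part at 4.5x:

        unsigned int busfreq = 10000;   /* 100 MHz expressed in 10 kHz units */
        unsigned int f = 45;            /* 4.5x, encoded as in clock_ratio[] */
        unsigned int khz = busfreq * f; /* 450000 kHz == 450 MHz */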
@@ -185,7 +190,9 @@ static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
185 190
186static unsigned int powernow_k6_get(unsigned int cpu) 191static unsigned int powernow_k6_get(unsigned int cpu)
187{ 192{
188 return busfreq * powernow_k6_get_cpu_multiplier(); 193 unsigned int ret;
194 ret = (busfreq * powernow_k6_get_cpu_multiplier());
195 return ret;
189} 196}
190 197
191static struct freq_attr *powernow_k6_attr[] = { 198static struct freq_attr *powernow_k6_attr[] = {
@@ -221,7 +228,7 @@ static int __init powernow_k6_init(void)
221 return -ENODEV; 228 return -ENODEV;
222 229
223 if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) { 230 if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) {
224 printk("cpufreq: PowerNow IOPORT region already used.\n"); 231 printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n");
225 return -EIO; 232 return -EIO;
226 } 233 }
227 234
@@ -246,7 +253,8 @@ static void __exit powernow_k6_exit(void)
246} 253}
247 254
248 255
249MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>"); 256MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, "
257 "Dominik Brodowski <linux@brodo.de>");
250MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); 258MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
251MODULE_LICENSE("GPL"); 259MODULE_LICENSE("GPL");
252 260
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 1b446d79a8fd..3c28ccd49742 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -6,10 +6,12 @@
6 * Licensed under the terms of the GNU GPL License version 2. 6 * Licensed under the terms of the GNU GPL License version 2.
7 * Based upon datasheets & sample CPUs kindly provided by AMD. 7 * Based upon datasheets & sample CPUs kindly provided by AMD.
8 * 8 *
9 * Errata 5: Processor may fail to execute a FID/VID change in presence of interrupt. 9 * Errata 5:
10 * - We cli/sti on stepping A0 CPUs around the FID/VID transition. 10 * CPU may fail to execute a FID/VID change in presence of interrupt.
11 * Errata 15: Processors with half frequency multipliers may hang upon wakeup from disconnect. 11 * - We cli/sti on stepping A0 CPUs around the FID/VID transition.
12 * - We disable half multipliers if ACPI is used on A0 stepping CPUs. 12 * Errata 15:
13 * CPU with half frequency multipliers may hang upon wakeup from disconnect.
14 * - We disable half multipliers if ACPI is used on A0 stepping CPUs.
13 */ 15 */
14 16
15#include <linux/kernel.h> 17#include <linux/kernel.h>
@@ -20,11 +22,11 @@
20#include <linux/slab.h> 22#include <linux/slab.h>
21#include <linux/string.h> 23#include <linux/string.h>
22#include <linux/dmi.h> 24#include <linux/dmi.h>
25#include <linux/timex.h>
26#include <linux/io.h>
23 27
28#include <asm/timer.h> /* Needed for recalibrate_cpu_khz() */
24#include <asm/msr.h> 29#include <asm/msr.h>
25#include <asm/timer.h>
26#include <asm/timex.h>
27#include <asm/io.h>
28#include <asm/system.h> 30#include <asm/system.h>
29 31
30#ifdef CONFIG_X86_POWERNOW_K7_ACPI 32#ifdef CONFIG_X86_POWERNOW_K7_ACPI
@@ -58,9 +60,9 @@ struct pst_s {
58union powernow_acpi_control_t { 60union powernow_acpi_control_t {
59 struct { 61 struct {
60 unsigned long fid:5, 62 unsigned long fid:5,
61 vid:5, 63 vid:5,
62 sgtc:20, 64 sgtc:20,
63 res1:2; 65 res1:2;
64 } bits; 66 } bits;
65 unsigned long val; 67 unsigned long val;
66}; 68};
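The union above exists so the ACPI _PSS control word can be unpacked by field rather than by shifting; later in this file the driver does essentially the following (a condensed sketch, where state stands for one ACPI performance-state entry):

        union powernow_acpi_control_t pc;

        pc.val = (unsigned long) state->control;
        fid  = pc.bits.fid;     /* bits 4:0 */
        vid  = pc.bits.vid;     /* bits 9:5 */
        sgtc = pc.bits.sgtc;    /* bits 29:10 */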
@@ -94,14 +96,15 @@ static struct cpufreq_frequency_table *powernow_table;
94 96
95static unsigned int can_scale_bus; 97static unsigned int can_scale_bus;
96static unsigned int can_scale_vid; 98static unsigned int can_scale_vid;
97static unsigned int minimum_speed=-1; 99static unsigned int minimum_speed = -1;
98static unsigned int maximum_speed; 100static unsigned int maximum_speed;
99static unsigned int number_scales; 101static unsigned int number_scales;
100static unsigned int fsb; 102static unsigned int fsb;
101static unsigned int latency; 103static unsigned int latency;
102static char have_a0; 104static char have_a0;
103 105
104#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k7", msg) 106#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
107 "powernow-k7", msg)
105 108
106static int check_fsb(unsigned int fsbspeed) 109static int check_fsb(unsigned int fsbspeed)
107{ 110{
@@ -109,7 +112,7 @@ static int check_fsb(unsigned int fsbspeed)
109 unsigned int f = fsb / 1000; 112 unsigned int f = fsb / 1000;
110 113
111 delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed; 114 delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
112 return (delta < 5); 115 return delta < 5;
113} 116}
114 117
115static int check_powernow(void) 118static int check_powernow(void)
@@ -117,24 +120,26 @@ static int check_powernow(void)
117 struct cpuinfo_x86 *c = &cpu_data(0); 120 struct cpuinfo_x86 *c = &cpu_data(0);
118 unsigned int maxei, eax, ebx, ecx, edx; 121 unsigned int maxei, eax, ebx, ecx, edx;
119 122
120 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 !=6)) { 123 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) {
121#ifdef MODULE 124#ifdef MODULE
122 printk (KERN_INFO PFX "This module only works with AMD K7 CPUs\n"); 125 printk(KERN_INFO PFX "This module only works with "
126 "AMD K7 CPUs\n");
123#endif 127#endif
124 return 0; 128 return 0;
125 } 129 }
126 130
127 /* Get maximum capabilities */ 131 /* Get maximum capabilities */
128 maxei = cpuid_eax (0x80000000); 132 maxei = cpuid_eax(0x80000000);
129 if (maxei < 0x80000007) { /* Any powernow info ? */ 133 if (maxei < 0x80000007) { /* Any powernow info ? */
130#ifdef MODULE 134#ifdef MODULE
131 printk (KERN_INFO PFX "No powernow capabilities detected\n"); 135 printk(KERN_INFO PFX "No powernow capabilities detected\n");
132#endif 136#endif
133 return 0; 137 return 0;
134 } 138 }
135 139
136 if ((c->x86_model == 6) && (c->x86_mask == 0)) { 140 if ((c->x86_model == 6) && (c->x86_mask == 0)) {
137 printk (KERN_INFO PFX "K7 660[A0] core detected, enabling errata workarounds\n"); 141 printk(KERN_INFO PFX "K7 660[A0] core detected, "
142 "enabling errata workarounds\n");
138 have_a0 = 1; 143 have_a0 = 1;
139 } 144 }
140 145
@@ -144,37 +149,42 @@ static int check_powernow(void)
144 if (!(edx & (1 << 1 | 1 << 2))) 149 if (!(edx & (1 << 1 | 1 << 2)))
145 return 0; 150 return 0;
146 151
147 printk (KERN_INFO PFX "PowerNOW! Technology present. Can scale: "); 152 printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
148 153
149 if (edx & 1 << 1) { 154 if (edx & 1 << 1) {
150 printk ("frequency"); 155 printk("frequency");
151 can_scale_bus=1; 156 can_scale_bus = 1;
152 } 157 }
153 158
154 if ((edx & (1 << 1 | 1 << 2)) == 0x6) 159 if ((edx & (1 << 1 | 1 << 2)) == 0x6)
155 printk (" and "); 160 printk(" and ");
156 161
157 if (edx & 1 << 2) { 162 if (edx & 1 << 2) {
158 printk ("voltage"); 163 printk("voltage");
159 can_scale_vid=1; 164 can_scale_vid = 1;
160 } 165 }
161 166
162 printk (".\n"); 167 printk(".\n");
163 return 1; 168 return 1;
164} 169}
165 170
171static void invalidate_entry(unsigned int entry)
172{
173 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
174}
166 175
167static int get_ranges (unsigned char *pst) 176static int get_ranges(unsigned char *pst)
168{ 177{
169 unsigned int j; 178 unsigned int j;
170 unsigned int speed; 179 unsigned int speed;
171 u8 fid, vid; 180 u8 fid, vid;
172 181
173 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * (number_scales + 1)), GFP_KERNEL); 182 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
183 (number_scales + 1)), GFP_KERNEL);
174 if (!powernow_table) 184 if (!powernow_table)
175 return -ENOMEM; 185 return -ENOMEM;
176 186
177 for (j=0 ; j < number_scales; j++) { 187 for (j = 0 ; j < number_scales; j++) {
178 fid = *pst++; 188 fid = *pst++;
179 189
180 powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10; 190 powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
@@ -182,10 +192,10 @@ static int get_ranges (unsigned char *pst)
182 192
183 speed = powernow_table[j].frequency; 193 speed = powernow_table[j].frequency;
184 194
185 if ((fid_codes[fid] % 10)==5) { 195 if ((fid_codes[fid] % 10) == 5) {
186#ifdef CONFIG_X86_POWERNOW_K7_ACPI 196#ifdef CONFIG_X86_POWERNOW_K7_ACPI
187 if (have_a0 == 1) 197 if (have_a0 == 1)
188 powernow_table[j].frequency = CPUFREQ_ENTRY_INVALID; 198 invalidate_entry(j);
189#endif 199#endif
190 } 200 }
191 201
@@ -197,7 +207,7 @@ static int get_ranges (unsigned char *pst)
197 vid = *pst++; 207 vid = *pst++;
198 powernow_table[j].index |= (vid << 8); /* upper 8 bits */ 208 powernow_table[j].index |= (vid << 8); /* upper 8 bits */
199 209
200 dprintk (" FID: 0x%x (%d.%dx [%dMHz]) " 210 dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
201 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, 211 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
202 fid_codes[fid] % 10, speed/1000, vid, 212 fid_codes[fid] % 10, speed/1000, vid,
203 mobile_vid_table[vid]/1000, 213 mobile_vid_table[vid]/1000,
@@ -214,13 +224,13 @@ static void change_FID(int fid)
214{ 224{
215 union msr_fidvidctl fidvidctl; 225 union msr_fidvidctl fidvidctl;
216 226
217 rdmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val); 227 rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
218 if (fidvidctl.bits.FID != fid) { 228 if (fidvidctl.bits.FID != fid) {
219 fidvidctl.bits.SGTC = latency; 229 fidvidctl.bits.SGTC = latency;
220 fidvidctl.bits.FID = fid; 230 fidvidctl.bits.FID = fid;
221 fidvidctl.bits.VIDC = 0; 231 fidvidctl.bits.VIDC = 0;
222 fidvidctl.bits.FIDC = 1; 232 fidvidctl.bits.FIDC = 1;
223 wrmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val); 233 wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
224 } 234 }
225} 235}
226 236
@@ -229,18 +239,18 @@ static void change_VID(int vid)
229{ 239{
230 union msr_fidvidctl fidvidctl; 240 union msr_fidvidctl fidvidctl;
231 241
232 rdmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val); 242 rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
233 if (fidvidctl.bits.VID != vid) { 243 if (fidvidctl.bits.VID != vid) {
234 fidvidctl.bits.SGTC = latency; 244 fidvidctl.bits.SGTC = latency;
235 fidvidctl.bits.VID = vid; 245 fidvidctl.bits.VID = vid;
236 fidvidctl.bits.FIDC = 0; 246 fidvidctl.bits.FIDC = 0;
237 fidvidctl.bits.VIDC = 1; 247 fidvidctl.bits.VIDC = 1;
238 wrmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val); 248 wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
239 } 249 }
240} 250}
241 251
242 252
243static void change_speed (unsigned int index) 253static void change_speed(unsigned int index)
244{ 254{
245 u8 fid, vid; 255 u8 fid, vid;
246 struct cpufreq_freqs freqs; 256 struct cpufreq_freqs freqs;
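The change_FID()/change_VID() pair cleaned up above share one shape: read MSR_K7_FID_VID_CTL, update either the FID or the VID field, arm exactly one of the FIDC/VIDC commit bits, and write the register back, so frequency and voltage are always stepped in separate transactions. Condensed into a single hypothetical helper:

        /* Sketch of the common shape of change_FID()/change_VID(). */
        static void change_field_sketch(int is_fid, int val)
        {
                union msr_fidvidctl ctl;

                rdmsrl(MSR_K7_FID_VID_CTL, ctl.val);
                ctl.bits.SGTC = latency;    /* stop-grant timeout count */
                if (is_fid) {
                        ctl.bits.FID = val;
                        ctl.bits.FIDC = 1;  /* commit frequency only */
                        ctl.bits.VIDC = 0;
                } else {
                        ctl.bits.VID = val;
                        ctl.bits.VIDC = 1;  /* commit voltage only */
                        ctl.bits.FIDC = 0;
                }
                wrmsrl(MSR_K7_FID_VID_CTL, ctl.val);
        }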
@@ -257,7 +267,7 @@ static void change_speed (unsigned int index)
257 267
258 freqs.cpu = 0; 268 freqs.cpu = 0;
259 269
260 rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val); 270 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
261 cfid = fidvidstatus.bits.CFID; 271 cfid = fidvidstatus.bits.CFID;
262 freqs.old = fsb * fid_codes[cfid] / 10; 272 freqs.old = fsb * fid_codes[cfid] / 10;
263 273
@@ -321,12 +331,14 @@ static int powernow_acpi_init(void)
321 goto err1; 331 goto err1;
322 } 332 }
323 333
324 if (acpi_processor_perf->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) { 334 if (acpi_processor_perf->control_register.space_id !=
335 ACPI_ADR_SPACE_FIXED_HARDWARE) {
325 retval = -ENODEV; 336 retval = -ENODEV;
326 goto err2; 337 goto err2;
327 } 338 }
328 339
329 if (acpi_processor_perf->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) { 340 if (acpi_processor_perf->status_register.space_id !=
341 ACPI_ADR_SPACE_FIXED_HARDWARE) {
330 retval = -ENODEV; 342 retval = -ENODEV;
331 goto err2; 343 goto err2;
332 } 344 }
@@ -338,7 +350,8 @@ static int powernow_acpi_init(void)
338 goto err2; 350 goto err2;
339 } 351 }
340 352
341 powernow_table = kzalloc((number_scales + 1) * (sizeof(struct cpufreq_frequency_table)), GFP_KERNEL); 353 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
354 (number_scales + 1)), GFP_KERNEL);
342 if (!powernow_table) { 355 if (!powernow_table) {
343 retval = -ENOMEM; 356 retval = -ENOMEM;
344 goto err2; 357 goto err2;
@@ -352,7 +365,7 @@ static int powernow_acpi_init(void)
352 unsigned int speed, speed_mhz; 365 unsigned int speed, speed_mhz;
353 366
354 pc.val = (unsigned long) state->control; 367 pc.val = (unsigned long) state->control;
355 dprintk ("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n", 368 dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
356 i, 369 i,
357 (u32) state->core_frequency, 370 (u32) state->core_frequency,
358 (u32) state->power, 371 (u32) state->power,
@@ -381,12 +394,12 @@ static int powernow_acpi_init(void)
381 if (speed % 1000 > 0) 394 if (speed % 1000 > 0)
382 speed_mhz++; 395 speed_mhz++;
383 396
384 if ((fid_codes[fid] % 10)==5) { 397 if ((fid_codes[fid] % 10) == 5) {
385 if (have_a0 == 1) 398 if (have_a0 == 1)
386 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; 399 invalidate_entry(i);
387 } 400 }
388 401
389 dprintk (" FID: 0x%x (%d.%dx [%dMHz]) " 402 dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
390 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, 403 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
391 fid_codes[fid] % 10, speed_mhz, vid, 404 fid_codes[fid] % 10, speed_mhz, vid,
392 mobile_vid_table[vid]/1000, 405 mobile_vid_table[vid]/1000,
@@ -422,7 +435,8 @@ err1:
422err05: 435err05:
423 kfree(acpi_processor_perf); 436 kfree(acpi_processor_perf);
424err0: 437err0:
425 printk(KERN_WARNING PFX "ACPI perflib cannot be used in this platform\n"); 438 printk(KERN_WARNING PFX "ACPI perflib cannot be used on "
439 "this platform\n");
426 acpi_processor_perf = NULL; 440 acpi_processor_perf = NULL;
427 return retval; 441 return retval;
428} 442}
@@ -435,7 +449,14 @@ static int powernow_acpi_init(void)
435} 449}
436#endif 450#endif
437 451
438static int powernow_decode_bios (int maxfid, int startvid) 452static void print_pst_entry(struct pst_s *pst, unsigned int j)
453{
454 dprintk("PST:%d (@%p)\n", j, pst);
455 dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n",
456 pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
457}
458
459static int powernow_decode_bios(int maxfid, int startvid)
439{ 460{
440 struct psb_s *psb; 461 struct psb_s *psb;
441 struct pst_s *pst; 462 struct pst_s *pst;
@@ -446,61 +467,67 @@ static int powernow_decode_bios (int maxfid, int startvid)
446 467
447 etuple = cpuid_eax(0x80000001); 468 etuple = cpuid_eax(0x80000001);
448 469
449 for (i=0xC0000; i < 0xffff0 ; i+=16) { 470 for (i = 0xC0000; i < 0xffff0 ; i += 16) {
450 471
451 p = phys_to_virt(i); 472 p = phys_to_virt(i);
452 473
453 if (memcmp(p, "AMDK7PNOW!", 10) == 0){ 474 if (memcmp(p, "AMDK7PNOW!", 10) == 0) {
454 dprintk ("Found PSB header at %p\n", p); 475 dprintk("Found PSB header at %p\n", p);
455 psb = (struct psb_s *) p; 476 psb = (struct psb_s *) p;
456 dprintk ("Table version: 0x%x\n", psb->tableversion); 477 dprintk("Table version: 0x%x\n", psb->tableversion);
457 if (psb->tableversion != 0x12) { 478 if (psb->tableversion != 0x12) {
458 printk (KERN_INFO PFX "Sorry, only v1.2 tables supported right now\n"); 479 printk(KERN_INFO PFX "Sorry, only v1.2 tables"
480 " supported right now\n");
459 return -ENODEV; 481 return -ENODEV;
460 } 482 }
461 483
462 dprintk ("Flags: 0x%x\n", psb->flags); 484 dprintk("Flags: 0x%x\n", psb->flags);
463 if ((psb->flags & 1)==0) { 485 if ((psb->flags & 1) == 0)
464 dprintk ("Mobile voltage regulator\n"); 486 dprintk("Mobile voltage regulator\n");
465 } else { 487 else
466 dprintk ("Desktop voltage regulator\n"); 488 dprintk("Desktop voltage regulator\n");
467 }
468 489
469 latency = psb->settlingtime; 490 latency = psb->settlingtime;
470 if (latency < 100) { 491 if (latency < 100) {
471 printk(KERN_INFO PFX "BIOS set settling time to %d microseconds. " 492 printk(KERN_INFO PFX "BIOS set settling time "
472 "Should be at least 100. Correcting.\n", latency); 493 "to %d microseconds. "
494 "Should be at least 100. "
495 "Correcting.\n", latency);
473 latency = 100; 496 latency = 100;
474 } 497 }
475 dprintk ("Settling Time: %d microseconds.\n", psb->settlingtime); 498 dprintk("Settling Time: %d microseconds.\n",
476 dprintk ("Has %d PST tables. (Only dumping ones relevant to this CPU).\n", psb->numpst); 499 psb->settlingtime);
500 dprintk("Has %d PST tables. (Only dumping ones "
501 "relevant to this CPU).\n",
502 psb->numpst);
477 503
478 p += sizeof (struct psb_s); 504 p += sizeof(struct psb_s);
479 505
480 pst = (struct pst_s *) p; 506 pst = (struct pst_s *) p;
481 507
482 for (j=0; j<psb->numpst; j++) { 508 for (j = 0; j < psb->numpst; j++) {
483 pst = (struct pst_s *) p; 509 pst = (struct pst_s *) p;
484 number_scales = pst->numpstates; 510 number_scales = pst->numpstates;
485 511
486 if ((etuple == pst->cpuid) && check_fsb(pst->fsbspeed) && 512 if ((etuple == pst->cpuid) &&
487 (maxfid==pst->maxfid) && (startvid==pst->startvid)) 513 check_fsb(pst->fsbspeed) &&
488 { 514 (maxfid == pst->maxfid) &&
489 dprintk ("PST:%d (@%p)\n", j, pst); 515 (startvid == pst->startvid)) {
490 dprintk (" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n", 516 print_pst_entry(pst, j);
491 pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid); 517 p = (char *)pst + sizeof(struct pst_s);
492 518 ret = get_ranges(p);
493 ret = get_ranges ((char *) pst + sizeof (struct pst_s));
494 return ret; 519 return ret;
495 } else { 520 } else {
496 unsigned int k; 521 unsigned int k;
497 p = (char *) pst + sizeof (struct pst_s); 522 p = (char *)pst + sizeof(struct pst_s);
498 for (k=0; k<number_scales; k++) 523 for (k = 0; k < number_scales; k++)
499 p+=2; 524 p += 2;
500 } 525 }
501 } 526 }
502 printk (KERN_INFO PFX "No PST tables match this cpuid (0x%x)\n", etuple); 527 printk(KERN_INFO PFX "No PST tables match this cpuid "
503 printk (KERN_INFO PFX "This is indicative of a broken BIOS.\n"); 528 "(0x%x)\n", etuple);
529 printk(KERN_INFO PFX "This is indicative of a broken "
530 "BIOS.\n");
504 531
505 return -EINVAL; 532 return -EINVAL;
506 } 533 }
@@ -511,13 +538,14 @@ static int powernow_decode_bios (int maxfid, int startvid)
511} 538}
512 539
513 540
514static int powernow_target (struct cpufreq_policy *policy, 541static int powernow_target(struct cpufreq_policy *policy,
515 unsigned int target_freq, 542 unsigned int target_freq,
516 unsigned int relation) 543 unsigned int relation)
517{ 544{
518 unsigned int newstate; 545 unsigned int newstate;
519 546
520 if (cpufreq_frequency_table_target(policy, powernow_table, target_freq, relation, &newstate)) 547 if (cpufreq_frequency_table_target(policy, powernow_table, target_freq,
548 relation, &newstate))
521 return -EINVAL; 549 return -EINVAL;
522 550
523 change_speed(newstate); 551 change_speed(newstate);
@@ -526,7 +554,7 @@ static int powernow_target (struct cpufreq_policy *policy,
526} 554}
527 555
528 556
529static int powernow_verify (struct cpufreq_policy *policy) 557static int powernow_verify(struct cpufreq_policy *policy)
530{ 558{
531 return cpufreq_frequency_table_verify(policy, powernow_table); 559 return cpufreq_frequency_table_verify(policy, powernow_table);
532} 560}
@@ -566,18 +594,23 @@ static unsigned int powernow_get(unsigned int cpu)
566 594
567 if (cpu) 595 if (cpu)
568 return 0; 596 return 0;
569 rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val); 597 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
570 cfid = fidvidstatus.bits.CFID; 598 cfid = fidvidstatus.bits.CFID;
571 599
572 return (fsb * fid_codes[cfid] / 10); 600 return fsb * fid_codes[cfid] / 10;
573} 601}
574 602
575 603
576static int __init acer_cpufreq_pst(const struct dmi_system_id *d) 604static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
577{ 605{
578 printk(KERN_WARNING "%s laptop with broken PST tables in BIOS detected.\n", d->ident); 606 printk(KERN_WARNING PFX
579 printk(KERN_WARNING "You need to downgrade to 3A21 (09/09/2002), or try a newer BIOS than 3A71 (01/20/2003)\n"); 607 "%s laptop with broken PST tables in BIOS detected.\n",
580 printk(KERN_WARNING "cpufreq scaling has been disabled as a result of this.\n"); 608 d->ident);
609 printk(KERN_WARNING PFX
610 "You need to downgrade to 3A21 (09/09/2002), or try a newer "
611 "BIOS than 3A71 (01/20/2003)\n");
612 printk(KERN_WARNING PFX
613 "cpufreq scaling has been disabled as a result of this.\n");
581 return 0; 614 return 0;
582} 615}
583 616
@@ -598,7 +631,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = {
598 { } 631 { }
599}; 632};
600 633
601static int __init powernow_cpu_init (struct cpufreq_policy *policy) 634static int __init powernow_cpu_init(struct cpufreq_policy *policy)
602{ 635{
603 union msr_fidvidstatus fidvidstatus; 636 union msr_fidvidstatus fidvidstatus;
604 int result; 637 int result;
@@ -606,7 +639,7 @@ static int __init powernow_cpu_init (struct cpufreq_policy *policy)
606 if (policy->cpu != 0) 639 if (policy->cpu != 0)
607 return -ENODEV; 640 return -ENODEV;
608 641
609 rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val); 642 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
610 643
611 recalibrate_cpu_khz(); 644 recalibrate_cpu_khz();
612 645
@@ -618,19 +651,21 @@ static int __init powernow_cpu_init (struct cpufreq_policy *policy)
618 dprintk("FSB: %3dMHz\n", fsb/1000); 651 dprintk("FSB: %3dMHz\n", fsb/1000);
619 652
620 if (dmi_check_system(powernow_dmi_table) || acpi_force) { 653 if (dmi_check_system(powernow_dmi_table) || acpi_force) {
621 printk (KERN_INFO PFX "PSB/PST known to be broken. Trying ACPI instead\n"); 654 printk(KERN_INFO PFX "PSB/PST known to be broken. "
655 "Trying ACPI instead\n");
622 result = powernow_acpi_init(); 656 result = powernow_acpi_init();
623 } else { 657 } else {
624 result = powernow_decode_bios(fidvidstatus.bits.MFID, fidvidstatus.bits.SVID); 658 result = powernow_decode_bios(fidvidstatus.bits.MFID,
659 fidvidstatus.bits.SVID);
625 if (result) { 660 if (result) {
626 printk (KERN_INFO PFX "Trying ACPI perflib\n"); 661 printk(KERN_INFO PFX "Trying ACPI perflib\n");
627 maximum_speed = 0; 662 maximum_speed = 0;
628 minimum_speed = -1; 663 minimum_speed = -1;
629 latency = 0; 664 latency = 0;
630 result = powernow_acpi_init(); 665 result = powernow_acpi_init();
631 if (result) { 666 if (result) {
632 printk (KERN_INFO PFX "ACPI and legacy methods failed\n"); 667 printk(KERN_INFO PFX
633 printk (KERN_INFO PFX "See http://www.codemonkey.org.uk/projects/cpufreq/powernow-k7.html\n"); 668 "ACPI and legacy methods failed\n");
634 } 669 }
635 } else { 670 } else {
636 /* SGTC uses the bus clock as timer */ 671
@@ -642,10 +677,11 @@ static int __init powernow_cpu_init (struct cpufreq_policy *policy)
642 if (result) 677 if (result)
643 return result; 678 return result;
644 679
645 printk (KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n", 680 printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
646 minimum_speed/1000, maximum_speed/1000); 681 minimum_speed/1000, maximum_speed/1000);
647 682
648 policy->cpuinfo.transition_latency = cpufreq_scale(2000000UL, fsb, latency); 683 policy->cpuinfo.transition_latency =
684 cpufreq_scale(2000000UL, fsb, latency);
649 685
650 policy->cur = powernow_get(0); 686 policy->cur = powernow_get(0);
651 687
@@ -654,7 +690,8 @@ static int __init powernow_cpu_init (struct cpufreq_policy *policy)
654 return cpufreq_frequency_table_cpuinfo(policy, powernow_table); 690 return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
655} 691}
656 692
657static int powernow_cpu_exit (struct cpufreq_policy *policy) { 693static int powernow_cpu_exit(struct cpufreq_policy *policy)
694{
658 cpufreq_frequency_table_put_attr(policy->cpu); 695 cpufreq_frequency_table_put_attr(policy->cpu);
659 696
660#ifdef CONFIG_X86_POWERNOW_K7_ACPI 697#ifdef CONFIG_X86_POWERNOW_K7_ACPI
@@ -669,7 +706,7 @@ static int powernow_cpu_exit (struct cpufreq_policy *policy) {
669 return 0; 706 return 0;
670} 707}
671 708
672static struct freq_attr* powernow_table_attr[] = { 709static struct freq_attr *powernow_table_attr[] = {
673 &cpufreq_freq_attr_scaling_available_freqs, 710 &cpufreq_freq_attr_scaling_available_freqs,
674 NULL, 711 NULL,
675}; 712};
@@ -685,15 +722,15 @@ static struct cpufreq_driver powernow_driver = {
685 .attr = powernow_table_attr, 722 .attr = powernow_table_attr,
686}; 723};
687 724
688static int __init powernow_init (void) 725static int __init powernow_init(void)
689{ 726{
690 if (check_powernow()==0) 727 if (check_powernow() == 0)
691 return -ENODEV; 728 return -ENODEV;
692 return cpufreq_register_driver(&powernow_driver); 729 return cpufreq_register_driver(&powernow_driver);
693} 730}
694 731
695 732
696static void __exit powernow_exit (void) 733static void __exit powernow_exit(void)
697{ 734{
698 cpufreq_unregister_driver(&powernow_driver); 735 cpufreq_unregister_driver(&powernow_driver);
699} 736}
@@ -701,9 +738,9 @@ static void __exit powernow_exit (void)
701module_param(acpi_force, int, 0444); 738module_param(acpi_force, int, 0444);
702MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); 739MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
703 740
704MODULE_AUTHOR ("Dave Jones <davej@redhat.com>"); 741MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
705MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors."); 742MODULE_DESCRIPTION("Powernow driver for AMD K7 processors.");
706MODULE_LICENSE ("GPL"); 743MODULE_LICENSE("GPL");
707 744
708late_initcall(powernow_init); 745late_initcall(powernow_init);
709module_exit(powernow_exit); 746module_exit(powernow_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index fb039cd345d8..4709ead2db52 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -33,16 +33,14 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <linux/cpumask.h> 34#include <linux/cpumask.h>
35#include <linux/sched.h> /* for current / set_cpus_allowed() */ 35#include <linux/sched.h> /* for current / set_cpus_allowed() */
36#include <linux/io.h>
37#include <linux/delay.h>
36 38
37#include <asm/msr.h> 39#include <asm/msr.h>
38#include <asm/io.h>
39#include <asm/delay.h>
40 40
41#ifdef CONFIG_X86_POWERNOW_K8_ACPI
42#include <linux/acpi.h> 41#include <linux/acpi.h>
43#include <linux/mutex.h> 42#include <linux/mutex.h>
44#include <acpi/processor.h> 43#include <acpi/processor.h>
45#endif
46 44
47#define PFX "powernow-k8: " 45#define PFX "powernow-k8: "
48#define VERSION "version 2.20.00" 46#define VERSION "version 2.20.00"
@@ -56,7 +54,10 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
56static int cpu_family = CPU_OPTERON; 54static int cpu_family = CPU_OPTERON;
57 55
58#ifndef CONFIG_SMP 56#ifndef CONFIG_SMP
59DEFINE_PER_CPU(cpumask_t, cpu_core_map); 57static inline const struct cpumask *cpu_core_mask(int cpu)
58{
59 return cpumask_of(0);
60}
60#endif 61#endif
61 62
62/* Return a frequency in MHz, given an input fid */ 63/* Return a frequency in MHz, given an input fid */
@@ -71,7 +72,8 @@ static u32 find_khz_freq_from_fid(u32 fid)
71 return 1000 * find_freq_from_fid(fid); 72 return 1000 * find_freq_from_fid(fid);
72} 73}
73 74
74static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 pstate) 75static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data,
76 u32 pstate)
75{ 77{
76 return data[pstate].frequency; 78 return data[pstate].frequency;
77} 79}
@@ -186,7 +188,9 @@ static int write_new_fid(struct powernow_k8_data *data, u32 fid)
186 return 1; 188 return 1;
187 } 189 }
188 190
189 lo = fid | (data->currvid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID; 191 lo = fid;
192 lo |= (data->currvid << MSR_C_LO_VID_SHIFT);
193 lo |= MSR_C_LO_INIT_FID_VID;
190 194
191 dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n", 195 dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
192 fid, lo, data->plllock * PLL_LOCK_CONVERSION); 196 fid, lo, data->plllock * PLL_LOCK_CONVERSION);
@@ -194,7 +198,9 @@ static int write_new_fid(struct powernow_k8_data *data, u32 fid)
194 do { 198 do {
195 wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION); 199 wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
196 if (i++ > 100) { 200 if (i++ > 100) {
197 printk(KERN_ERR PFX "Hardware error - pending bit very stuck - no further pstate changes possible\n"); 201 printk(KERN_ERR PFX
202 "Hardware error - pending bit very stuck - "
203 "no further pstate changes possible\n");
198 return 1; 204 return 1;
199 } 205 }
200 } while (query_current_values_with_pending_wait(data)); 206 } while (query_current_values_with_pending_wait(data));
@@ -202,14 +208,16 @@ static int write_new_fid(struct powernow_k8_data *data, u32 fid)
202 count_off_irt(data); 208 count_off_irt(data);
203 209
204 if (savevid != data->currvid) { 210 if (savevid != data->currvid) {
205 printk(KERN_ERR PFX "vid change on fid trans, old 0x%x, new 0x%x\n", 211 printk(KERN_ERR PFX
206 savevid, data->currvid); 212 "vid change on fid trans, old 0x%x, new 0x%x\n",
213 savevid, data->currvid);
207 return 1; 214 return 1;
208 } 215 }
209 216
210 if (fid != data->currfid) { 217 if (fid != data->currfid) {
211 printk(KERN_ERR PFX "fid trans failed, fid 0x%x, curr 0x%x\n", fid, 218 printk(KERN_ERR PFX
212 data->currfid); 219 "fid trans failed, fid 0x%x, curr 0x%x\n", fid,
220 data->currfid);
213 return 1; 221 return 1;
214 } 222 }
215 223
@@ -228,7 +236,9 @@ static int write_new_vid(struct powernow_k8_data *data, u32 vid)
228 return 1; 236 return 1;
229 } 237 }
230 238
231 lo = data->currfid | (vid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID; 239 lo = data->currfid;
240 lo |= (vid << MSR_C_LO_VID_SHIFT);
241 lo |= MSR_C_LO_INIT_FID_VID;
232 242
233 dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n", 243 dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
234 vid, lo, STOP_GRANT_5NS); 244 vid, lo, STOP_GRANT_5NS);
@@ -236,20 +246,24 @@ static int write_new_vid(struct powernow_k8_data *data, u32 vid)
236 do { 246 do {
237 wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS); 247 wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
238 if (i++ > 100) { 248 if (i++ > 100) {
239 printk(KERN_ERR PFX "internal error - pending bit very stuck - no further pstate changes possible\n"); 249 printk(KERN_ERR PFX "internal error - pending bit "
250 "very stuck - no further pstate "
251 "changes possible\n");
240 return 1; 252 return 1;
241 } 253 }
242 } while (query_current_values_with_pending_wait(data)); 254 } while (query_current_values_with_pending_wait(data));
243 255
244 if (savefid != data->currfid) { 256 if (savefid != data->currfid) {
245 printk(KERN_ERR PFX "fid changed on vid trans, old 0x%x new 0x%x\n", 257 printk(KERN_ERR PFX "fid changed on vid trans, old "
258 "0x%x new 0x%x\n",
246 savefid, data->currfid); 259 savefid, data->currfid);
247 return 1; 260 return 1;
248 } 261 }
249 262
250 if (vid != data->currvid) { 263 if (vid != data->currvid) {
251 printk(KERN_ERR PFX "vid trans failed, vid 0x%x, curr 0x%x\n", vid, 264 printk(KERN_ERR PFX "vid trans failed, vid 0x%x, "
252 data->currvid); 265 "curr 0x%x\n",
266 vid, data->currvid);
253 return 1; 267 return 1;
254 } 268 }
255 269
@@ -261,7 +275,8 @@ static int write_new_vid(struct powernow_k8_data *data, u32 vid)
261 * Decreasing vid codes represent increasing voltages: 275 * Decreasing vid codes represent increasing voltages:
262 * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off. 276 * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off.
263 */ 277 */
264static int decrease_vid_code_by_step(struct powernow_k8_data *data, u32 reqvid, u32 step) 278static int decrease_vid_code_by_step(struct powernow_k8_data *data,
279 u32 reqvid, u32 step)
265{ 280{
266 if ((data->currvid - reqvid) > step) 281 if ((data->currvid - reqvid) > step)
267 reqvid = data->currvid - step; 282 reqvid = data->currvid - step;
@@ -283,7 +298,8 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
283} 298}
284 299
285/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */ 300/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
286static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 reqvid) 301static int transition_fid_vid(struct powernow_k8_data *data,
302 u32 reqfid, u32 reqvid)
287{ 303{
288 if (core_voltage_pre_transition(data, reqvid)) 304 if (core_voltage_pre_transition(data, reqvid))
289 return 1; 305 return 1;
@@ -298,7 +314,8 @@ static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 req
298 return 1; 314 return 1;
299 315
300 if ((reqfid != data->currfid) || (reqvid != data->currvid)) { 316 if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
301 printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, curr 0x%x 0x%x\n", 317 printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, "
318 "curr 0x%x 0x%x\n",
302 smp_processor_id(), 319 smp_processor_id(),
303 reqfid, reqvid, data->currfid, data->currvid); 320 reqfid, reqvid, data->currfid, data->currvid);
304 return 1; 321 return 1;
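transition_fid_vid() is the top of a three-phase protocol spelled out in the functions that follow: phase 1 raises the core voltage toward the target (plus the rvo headroom) before the frequency moves, phase 2 steps the fid through VCO-safe intermediate values, and phase 3 settles the voltage at its final value. Schematically:

        /* Outline of the three phases, condensed from this file's flow. */
        static int transition_outline(struct powernow_k8_data *data,
                                      u32 reqfid, u32 reqvid)
        {
                if (core_voltage_pre_transition(data, reqvid))  /* phase 1 */
                        return 1;
                if (core_frequency_transition(data, reqfid))    /* phase 2 */
                        return 1;
                if (core_voltage_post_transition(data, reqvid)) /* phase 3 */
                        return 1;
                return 0;
        }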
@@ -311,13 +328,15 @@ static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 req
311} 328}
312 329
313/* Phase 1 - core voltage transition ... setup voltage */ 330/* Phase 1 - core voltage transition ... setup voltage */
314static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid) 331static int core_voltage_pre_transition(struct powernow_k8_data *data,
332 u32 reqvid)
315{ 333{
316 u32 rvosteps = data->rvo; 334 u32 rvosteps = data->rvo;
317 u32 savefid = data->currfid; 335 u32 savefid = data->currfid;
318 u32 maxvid, lo; 336 u32 maxvid, lo;
319 337
320 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, reqvid 0x%x, rvo 0x%x\n", 338 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
339 "reqvid 0x%x, rvo 0x%x\n",
321 smp_processor_id(), 340 smp_processor_id(),
322 data->currfid, data->currvid, reqvid, data->rvo); 341 data->currfid, data->currvid, reqvid, data->rvo);
323 342
@@ -340,7 +359,7 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid
340 } else { 359 } else {
341 dprintk("ph1: changing vid for rvo, req 0x%x\n", 360 dprintk("ph1: changing vid for rvo, req 0x%x\n",
342 data->currvid - 1); 361 data->currvid - 1);
343 if (decrease_vid_code_by_step(data, data->currvid - 1, 1)) 362 if (decrease_vid_code_by_step(data, data->currvid-1, 1))
344 return 1; 363 return 1;
345 rvosteps--; 364 rvosteps--;
346 } 365 }
@@ -350,7 +369,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid
350 return 1; 369 return 1;
351 370
352 if (savefid != data->currfid) { 371 if (savefid != data->currfid) {
353 printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n", data->currfid); 372 printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n",
373 data->currfid);
354 return 1; 374 return 1;
355 } 375 }
356 376
@@ -363,20 +383,24 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid
363/* Phase 2 - core frequency transition */ 383/* Phase 2 - core frequency transition */
364static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) 384static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
365{ 385{
366 u32 vcoreqfid, vcocurrfid, vcofiddiff, fid_interval, savevid = data->currvid; 386 u32 vcoreqfid, vcocurrfid, vcofiddiff;
387 u32 fid_interval, savevid = data->currvid;
367 388
368 if ((reqfid < HI_FID_TABLE_BOTTOM) && (data->currfid < HI_FID_TABLE_BOTTOM)) { 389 if ((reqfid < HI_FID_TABLE_BOTTOM) &&
369 printk(KERN_ERR PFX "ph2: illegal lo-lo transition 0x%x 0x%x\n", 390 (data->currfid < HI_FID_TABLE_BOTTOM)) {
370 reqfid, data->currfid); 391 printk(KERN_ERR PFX "ph2: illegal lo-lo transition "
392 "0x%x 0x%x\n", reqfid, data->currfid);
371 return 1; 393 return 1;
372 } 394 }
373 395
374 if (data->currfid == reqfid) { 396 if (data->currfid == reqfid) {
375 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", data->currfid); 397 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
398 data->currfid);
376 return 0; 399 return 0;
377 } 400 }
378 401
379 dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, reqfid 0x%x\n", 402 dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, "
403 "reqfid 0x%x\n",
380 smp_processor_id(), 404 smp_processor_id(),
381 data->currfid, data->currvid, reqfid); 405 data->currfid, data->currvid, reqfid);
382 406
@@ -390,14 +414,14 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
390 414
391 if (reqfid > data->currfid) { 415 if (reqfid > data->currfid) {
392 if (data->currfid > LO_FID_TABLE_TOP) { 416 if (data->currfid > LO_FID_TABLE_TOP) {
393 if (write_new_fid(data, data->currfid + fid_interval)) { 417 if (write_new_fid(data,
418 data->currfid + fid_interval))
394 return 1; 419 return 1;
395 }
396 } else { 420 } else {
397 if (write_new_fid 421 if (write_new_fid
398 (data, 2 + convert_fid_to_vco_fid(data->currfid))) { 422 (data,
423 2 + convert_fid_to_vco_fid(data->currfid)))
399 return 1; 424 return 1;
400 }
401 } 425 }
402 } else { 426 } else {
403 if (write_new_fid(data, data->currfid - fid_interval)) 427 if (write_new_fid(data, data->currfid - fid_interval))
@@ -417,7 +441,8 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
417 441
418 if (data->currfid != reqfid) { 442 if (data->currfid != reqfid) {
419 printk(KERN_ERR PFX 443 printk(KERN_ERR PFX
420 "ph2: mismatch, failed fid transition, curr 0x%x, req 0x%x\n", 444 "ph2: mismatch, failed fid transition, "
445 "curr 0x%x, req 0x%x\n",
421 data->currfid, reqfid); 446 data->currfid, reqfid);
422 return 1; 447 return 1;
423 } 448 }
@@ -435,7 +460,8 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
435} 460}
436 461
437/* Phase 3 - core voltage transition flow ... jump to the final vid. */ 462/* Phase 3 - core voltage transition flow ... jump to the final vid. */
438static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid) 463static int core_voltage_post_transition(struct powernow_k8_data *data,
464 u32 reqvid)
439{ 465{
440 u32 savefid = data->currfid; 466 u32 savefid = data->currfid;
441 u32 savereqvid = reqvid; 467 u32 savereqvid = reqvid;
@@ -457,7 +483,8 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvi
457 483
458 if (data->currvid != reqvid) { 484 if (data->currvid != reqvid) {
459 printk(KERN_ERR PFX 485 printk(KERN_ERR PFX
460 "ph3: failed vid transition\n, req 0x%x, curr 0x%x", 486 "ph3: failed vid transition\n, "
487 "req 0x%x, curr 0x%x",
461 reqvid, data->currvid); 488 reqvid, data->currvid);
462 return 1; 489 return 1;
463 } 490 }
@@ -508,7 +535,8 @@ static int check_supported_cpu(unsigned int cpu)
508 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { 535 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
509 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || 536 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
510 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { 537 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
511 printk(KERN_INFO PFX "Processor cpuid %x not supported\n", eax); 538 printk(KERN_INFO PFX
539 "Processor cpuid %x not supported\n", eax);
512 goto out; 540 goto out;
513 } 541 }
514 542
@@ -520,8 +548,10 @@ static int check_supported_cpu(unsigned int cpu)
520 } 548 }
521 549
522 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); 550 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
523 if ((edx & P_STATE_TRANSITION_CAPABLE) != P_STATE_TRANSITION_CAPABLE) { 551 if ((edx & P_STATE_TRANSITION_CAPABLE)
524 printk(KERN_INFO PFX "Power state transitions not supported\n"); 552 != P_STATE_TRANSITION_CAPABLE) {
553 printk(KERN_INFO PFX
554 "Power state transitions not supported\n");
525 goto out; 555 goto out;
526 } 556 }
527 } else { /* must be a HW Pstate capable processor */ 557 } else { /* must be a HW Pstate capable processor */
@@ -539,7 +569,8 @@ out:
539 return rc; 569 return rc;
540} 570}
541 571
542static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid) 572static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
573 u8 maxvid)
543{ 574{
544 unsigned int j; 575 unsigned int j;
545 u8 lastfid = 0xff; 576 u8 lastfid = 0xff;
@@ -550,12 +581,14 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8
550 j, pst[j].vid); 581 j, pst[j].vid);
551 return -EINVAL; 582 return -EINVAL;
552 } 583 }
553 if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */ 584 if (pst[j].vid < data->rvo) {
585 /* vid + rvo >= 0 */
554 printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate" 586 printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
555 " %d\n", j); 587 " %d\n", j);
556 return -ENODEV; 588 return -ENODEV;
557 } 589 }
558 if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */ 590 if (pst[j].vid < maxvid + data->rvo) {
591 /* vid + rvo >= maxvid */
559 printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate" 592 printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
560 " %d\n", j); 593 " %d\n", j);
561 return -ENODEV; 594 return -ENODEV;
@@ -579,23 +612,31 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8
579 return -EINVAL; 612 return -EINVAL;
580 } 613 }
581 if (lastfid > LO_FID_TABLE_TOP) 614 if (lastfid > LO_FID_TABLE_TOP)
582 printk(KERN_INFO FW_BUG PFX "first fid not from lo freq table\n"); 615 printk(KERN_INFO FW_BUG PFX
616 "first fid not from lo freq table\n");
583 617
584 return 0; 618 return 0;
585} 619}
586 620
621static void invalidate_entry(struct powernow_k8_data *data, unsigned int entry)
622{
623 data->powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
624}
625
587static void print_basics(struct powernow_k8_data *data) 626static void print_basics(struct powernow_k8_data *data)
588{ 627{
589 int j; 628 int j;
590 for (j = 0; j < data->numps; j++) { 629 for (j = 0; j < data->numps; j++) {
591 if (data->powernow_table[j].frequency != CPUFREQ_ENTRY_INVALID) { 630 if (data->powernow_table[j].frequency !=
631 CPUFREQ_ENTRY_INVALID) {
592 if (cpu_family == CPU_HW_PSTATE) { 632 if (cpu_family == CPU_HW_PSTATE) {
593 printk(KERN_INFO PFX " %d : pstate %d (%d MHz)\n", 633 printk(KERN_INFO PFX
594 j, 634 " %d : pstate %d (%d MHz)\n", j,
595 data->powernow_table[j].index, 635 data->powernow_table[j].index,
596 data->powernow_table[j].frequency/1000); 636 data->powernow_table[j].frequency/1000);
597 } else { 637 } else {
598 printk(KERN_INFO PFX " %d : fid 0x%x (%d MHz), vid 0x%x\n", 638 printk(KERN_INFO PFX
639 " %d : fid 0x%x (%d MHz), vid 0x%x\n",
599 j, 640 j,
600 data->powernow_table[j].index & 0xff, 641 data->powernow_table[j].index & 0xff,
601 data->powernow_table[j].frequency/1000, 642 data->powernow_table[j].frequency/1000,
@@ -604,20 +645,25 @@ static void print_basics(struct powernow_k8_data *data)
604 } 645 }
605 } 646 }
606 if (data->batps) 647 if (data->batps)
607 printk(KERN_INFO PFX "Only %d pstates on battery\n", data->batps); 648 printk(KERN_INFO PFX "Only %d pstates on battery\n",
649 data->batps);
608} 650}
609 651
610static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid) 652static int fill_powernow_table(struct powernow_k8_data *data,
653 struct pst_s *pst, u8 maxvid)
611{ 654{
612 struct cpufreq_frequency_table *powernow_table; 655 struct cpufreq_frequency_table *powernow_table;
613 unsigned int j; 656 unsigned int j;
614 657
615 if (data->batps) { /* use ACPI support to get full speed on mains power */ 658 if (data->batps) {
616 printk(KERN_WARNING PFX "Only %d pstates usable (use ACPI driver for full range)\n", data->batps); 659 /* use ACPI support to get full speed on mains power */
 660 printk(KERN_WARNING PFX
 661 "Only %d pstates usable (use ACPI driver for full "
 662 "range)\n", data->batps);
617 data->numps = data->batps; 663 data->numps = data->batps;
618 } 664 }
619 665
620 for ( j=1; j<data->numps; j++ ) { 666 for (j = 1; j < data->numps; j++) {
621 if (pst[j-1].fid >= pst[j].fid) { 667 if (pst[j-1].fid >= pst[j].fid) {
622 printk(KERN_ERR PFX "PST out of sequence\n"); 668 printk(KERN_ERR PFX "PST out of sequence\n");
623 return -EINVAL; 669 return -EINVAL;
@@ -640,9 +686,11 @@ static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst,
640 } 686 }
641 687
642 for (j = 0; j < data->numps; j++) { 688 for (j = 0; j < data->numps; j++) {
689 int freq;
643 powernow_table[j].index = pst[j].fid; /* lower 8 bits */ 690 powernow_table[j].index = pst[j].fid; /* lower 8 bits */
644 powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */ 691 powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
645 powernow_table[j].frequency = find_khz_freq_from_fid(pst[j].fid); 692 freq = find_khz_freq_from_fid(pst[j].fid);
693 powernow_table[j].frequency = freq;
646 } 694 }
647 powernow_table[data->numps].frequency = CPUFREQ_TABLE_END; 695 powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
648 powernow_table[data->numps].index = 0; 696 powernow_table[data->numps].index = 0;
@@ -654,11 +702,12 @@ static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst,
654 702
655 dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); 703 dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
656 data->powernow_table = powernow_table; 704 data->powernow_table = powernow_table;
657 if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu) 705 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
658 print_basics(data); 706 print_basics(data);
659 707
660 for (j = 0; j < data->numps; j++) 708 for (j = 0; j < data->numps; j++)
661 if ((pst[j].fid==data->currfid) && (pst[j].vid==data->currvid)) 709 if ((pst[j].fid == data->currfid) &&
710 (pst[j].vid == data->currvid))
662 return 0; 711 return 0;
663 712
664 dprintk("currfid/vid do not match PST, ignoring\n"); 713 dprintk("currfid/vid do not match PST, ignoring\n");
@@ -698,7 +747,8 @@ static int find_psb_table(struct powernow_k8_data *data)
698 } 747 }
699 748
700 data->vstable = psb->vstable; 749 data->vstable = psb->vstable;
701 dprintk("voltage stabilization time: %d(*20us)\n", data->vstable); 750 dprintk("voltage stabilization time: %d(*20us)\n",
751 data->vstable);
702 752
703 dprintk("flags2: 0x%x\n", psb->flags2); 753 dprintk("flags2: 0x%x\n", psb->flags2);
704 data->rvo = psb->flags2 & 3; 754 data->rvo = psb->flags2 & 3;
@@ -713,11 +763,12 @@ static int find_psb_table(struct powernow_k8_data *data)
713 763
714 dprintk("numpst: 0x%x\n", psb->num_tables); 764 dprintk("numpst: 0x%x\n", psb->num_tables);
715 cpst = psb->num_tables; 765 cpst = psb->num_tables;
716 if ((psb->cpuid == 0x00000fc0) || (psb->cpuid == 0x00000fe0) ){ 766 if ((psb->cpuid == 0x00000fc0) ||
767 (psb->cpuid == 0x00000fe0)) {
717 thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); 768 thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
718 if ((thiscpuid == 0x00000fc0) || (thiscpuid == 0x00000fe0) ) { 769 if ((thiscpuid == 0x00000fc0) ||
770 (thiscpuid == 0x00000fe0))
719 cpst = 1; 771 cpst = 1;
720 }
721 } 772 }
722 if (cpst != 1) { 773 if (cpst != 1) {
723 printk(KERN_ERR FW_BUG PFX "numpst must be 1\n"); 774 printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
@@ -732,7 +783,8 @@ static int find_psb_table(struct powernow_k8_data *data)
732 783
733 data->numps = psb->numps; 784 data->numps = psb->numps;
734 dprintk("numpstates: 0x%x\n", data->numps); 785 dprintk("numpstates: 0x%x\n", data->numps);
735 return fill_powernow_table(data, (struct pst_s *)(psb+1), maxvid); 786 return fill_powernow_table(data,
787 (struct pst_s *)(psb+1), maxvid);
736 } 788 }
737 /* 789 /*
738 * If you see this message, complain to BIOS manufacturer. If 790 * If you see this message, complain to BIOS manufacturer. If
@@ -745,28 +797,31 @@ static int find_psb_table(struct powernow_k8_data *data)
745 * BIOS and Kernel Developer's Guide, which is available on 797 * BIOS and Kernel Developer's Guide, which is available on
746 * www.amd.com 798 * www.amd.com
747 */ 799 */
748 printk(KERN_ERR PFX "BIOS error - no PSB or ACPI _PSS objects\n"); 800 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
749 return -ENODEV; 801 return -ENODEV;
750} 802}
751 803
752#ifdef CONFIG_X86_POWERNOW_K8_ACPI 804static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
753static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) 805 unsigned int index)
754{ 806{
807 acpi_integer control;
808
755 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) 809 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
756 return; 810 return;
757 811
758 data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK; 812 control = data->acpi_data.states[index].control;
759 data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK; 813 data->irt = (control >> IRT_SHIFT) & IRT_MASK;
760 data->exttype = (data->acpi_data.states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; 814 data->rvo = (control >> RVO_SHIFT) & RVO_MASK;
761 data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; 815 data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
762 data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK); 816 data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
763 data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK; 817 data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
764} 818 data->vstable = (control >> VST_SHIFT) & VST_MASK; }
765 819
766static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) 820static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
767{ 821{
768 struct cpufreq_frequency_table *powernow_table; 822 struct cpufreq_frequency_table *powernow_table;
769 int ret_val = -ENODEV; 823 int ret_val = -ENODEV;
824 acpi_integer space_id;
770 825
771 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 826 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
772 dprintk("register performance failed: bad ACPI data\n"); 827 dprintk("register performance failed: bad ACPI data\n");
@@ -779,11 +834,12 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
779 goto err_out; 834 goto err_out;
780 } 835 }
781 836
782 if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || 837 space_id = data->acpi_data.control_register.space_id;
783 (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { 838 if ((space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
 839 (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
784 dprintk("Invalid control/status registers (%x - %x)\n", 840 dprintk("Invalid control/status registers (%x - %x)\n",
785 data->acpi_data.control_register.space_id, 841 space_id,
786 data->acpi_data.status_register.space_id); 842 data->acpi_data.status_register.space_id);
787 goto err_out; 843 goto err_out;
788 } 844 }
789 845
@@ -802,13 +858,14 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
802 if (ret_val) 858 if (ret_val)
803 goto err_out_mem; 859 goto err_out_mem;
804 860
805 powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END; 861 powernow_table[data->acpi_data.state_count].frequency =
862 CPUFREQ_TABLE_END;
806 powernow_table[data->acpi_data.state_count].index = 0; 863 powernow_table[data->acpi_data.state_count].index = 0;
807 data->powernow_table = powernow_table; 864 data->powernow_table = powernow_table;
808 865
809 /* fill in data */ 866 /* fill in data */
810 data->numps = data->acpi_data.state_count; 867 data->numps = data->acpi_data.state_count;
811 if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu) 868 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
812 print_basics(data); 869 print_basics(data);
813 powernow_k8_acpi_pst_values(data, 0); 870 powernow_k8_acpi_pst_values(data, 0);
814 871
@@ -830,13 +887,15 @@ err_out_mem:
830err_out: 887err_out:
831 acpi_processor_unregister_performance(&data->acpi_data, data->cpu); 888 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
832 889
833 /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */ 890 /* data->acpi_data.state_count informs us at ->exit()
891 * whether ACPI was used */
834 data->acpi_data.state_count = 0; 892 data->acpi_data.state_count = 0;
835 893
836 return ret_val; 894 return ret_val;
837} 895}
838 896
839static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table) 897static int fill_powernow_table_pstate(struct powernow_k8_data *data,
898 struct cpufreq_frequency_table *powernow_table)
840{ 899{
841 int i; 900 int i;
842 u32 hi = 0, lo = 0; 901 u32 hi = 0, lo = 0;
@@ -848,84 +907,101 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf
848 907
849 index = data->acpi_data.states[i].control & HW_PSTATE_MASK; 908 index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
850 if (index > data->max_hw_pstate) { 909 if (index > data->max_hw_pstate) {
851 printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index); 910 printk(KERN_ERR PFX "invalid pstate %d - "
852 printk(KERN_ERR PFX "Please report to BIOS manufacturer\n"); 911 "bad value %d.\n", i, index);
853 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; 912 printk(KERN_ERR PFX "Please report to BIOS "
913 "manufacturer\n");
914 invalidate_entry(data, i);
854 continue; 915 continue;
855 } 916 }
856 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); 917 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
857 if (!(hi & HW_PSTATE_VALID_MASK)) { 918 if (!(hi & HW_PSTATE_VALID_MASK)) {
858 dprintk("invalid pstate %d, ignoring\n", index); 919 dprintk("invalid pstate %d, ignoring\n", index);
859 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; 920 invalidate_entry(data, i);
860 continue; 921 continue;
861 } 922 }
862 923
863 powernow_table[i].index = index; 924 powernow_table[i].index = index;
864 925
865 powernow_table[i].frequency = data->acpi_data.states[i].core_frequency * 1000; 926 powernow_table[i].frequency =
927 data->acpi_data.states[i].core_frequency * 1000;
866 } 928 }
867 return 0; 929 return 0;
868} 930}
869 931
870static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table) 932static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
933 struct cpufreq_frequency_table *powernow_table)
871{ 934{
872 int i; 935 int i;
873 int cntlofreq = 0; 936 int cntlofreq = 0;
937
874 for (i = 0; i < data->acpi_data.state_count; i++) { 938 for (i = 0; i < data->acpi_data.state_count; i++) {
875 u32 fid; 939 u32 fid;
876 u32 vid; 940 u32 vid;
941 u32 freq, index;
942 acpi_integer status, control;
877 943
878 if (data->exttype) { 944 if (data->exttype) {
879 fid = data->acpi_data.states[i].status & EXT_FID_MASK; 945 status = data->acpi_data.states[i].status;
880 vid = (data->acpi_data.states[i].status >> VID_SHIFT) & EXT_VID_MASK; 946 fid = status & EXT_FID_MASK;
947 vid = (status >> VID_SHIFT) & EXT_VID_MASK;
881 } else { 948 } else {
882 fid = data->acpi_data.states[i].control & FID_MASK; 949 control = data->acpi_data.states[i].control;
883 vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK; 950 fid = control & FID_MASK;
951 vid = (control >> VID_SHIFT) & VID_MASK;
884 } 952 }
885 953
886 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); 954 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
887 955
888 powernow_table[i].index = fid; /* lower 8 bits */ 956 index = fid | (vid<<8);
889 powernow_table[i].index |= (vid << 8); /* upper 8 bits */ 957 powernow_table[i].index = index;
890 powernow_table[i].frequency = find_khz_freq_from_fid(fid); 958
959 freq = find_khz_freq_from_fid(fid);
960 powernow_table[i].frequency = freq;
891 961
892 /* verify frequency is OK */ 962 /* verify frequency is OK */
893 if ((powernow_table[i].frequency > (MAX_FREQ * 1000)) || 963 if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
894 (powernow_table[i].frequency < (MIN_FREQ * 1000))) { 964 dprintk("invalid freq %u kHz, ignoring\n", freq);
895 dprintk("invalid freq %u kHz, ignoring\n", powernow_table[i].frequency); 965 invalidate_entry(data, i);
896 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
897 continue; 966 continue;
898 } 967 }
899 968
900 /* verify voltage is OK - BIOSs are using "off" to indicate invalid */ 969 /* verify voltage is OK -
970 * BIOSs are using "off" to indicate invalid */
901 if (vid == VID_OFF) { 971 if (vid == VID_OFF) {
902 dprintk("invalid vid %u, ignoring\n", vid); 972 dprintk("invalid vid %u, ignoring\n", vid);
903 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; 973 invalidate_entry(data, i);
904 continue; 974 continue;
905 } 975 }
906 976
907 /* verify only 1 entry from the lo frequency table */ 977 /* verify only 1 entry from the lo frequency table */
908 if (fid < HI_FID_TABLE_BOTTOM) { 978 if (fid < HI_FID_TABLE_BOTTOM) {
909 if (cntlofreq) { 979 if (cntlofreq) {
910 /* if both entries are the same, ignore this one ... */ 980 /* if both entries are the same,
911 if ((powernow_table[i].frequency != powernow_table[cntlofreq].frequency) || 981 * ignore this one ... */
912 (powernow_table[i].index != powernow_table[cntlofreq].index)) { 982 if ((freq != powernow_table[cntlofreq].frequency) ||
913 printk(KERN_ERR PFX "Too many lo freq table entries\n"); 983 (index != powernow_table[cntlofreq].index)) {
984 printk(KERN_ERR PFX
985 "Too many lo freq table "
986 "entries\n");
914 return 1; 987 return 1;
915 } 988 }
916 989
917 dprintk("double low frequency table entry, ignoring it.\n"); 990 dprintk("double low frequency table entry, "
918 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; 991 "ignoring it.\n");
992 invalidate_entry(data, i);
919 continue; 993 continue;
920 } else 994 } else
921 cntlofreq = i; 995 cntlofreq = i;
922 } 996 }
923 997
924 if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) { 998 if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
925 printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n", 999 printk(KERN_INFO PFX "invalid freq entries "
926 powernow_table[i].frequency, 1000 "%u kHz vs. %u kHz\n", freq,
927 (unsigned int) (data->acpi_data.states[i].core_frequency * 1000)); 1001 (unsigned int)
928 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; 1002 (data->acpi_data.states[i].core_frequency
1003 * 1000));
1004 invalidate_entry(data, i);
929 continue; 1005 continue;
930 } 1006 }
931 } 1007 }
@@ -935,7 +1011,8 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf
935static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) 1011static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
936{ 1012{
937 if (data->acpi_data.state_count) 1013 if (data->acpi_data.state_count)
938 acpi_processor_unregister_performance(&data->acpi_data, data->cpu); 1014 acpi_processor_unregister_performance(&data->acpi_data,
1015 data->cpu);
939 free_cpumask_var(data->acpi_data.shared_cpu_map); 1016 free_cpumask_var(data->acpi_data.shared_cpu_map);
940} 1017}
941 1018
@@ -953,15 +1030,9 @@ static int get_transition_latency(struct powernow_k8_data *data)
953 return 1000 * max_latency; 1030 return 1000 * max_latency;
954} 1031}
955 1032
956#else
957static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; }
958static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; }
959static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; }
960static int get_transition_latency(struct powernow_k8_data *data) { return 0; }
961#endif /* CONFIG_X86_POWERNOW_K8_ACPI */
962
963/* Take a frequency, and issue the fid/vid transition command */ 1033/* Take a frequency, and issue the fid/vid transition command */
964static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned int index) 1034static int transition_frequency_fidvid(struct powernow_k8_data *data,
1035 unsigned int index)
965{ 1036{
966 u32 fid = 0; 1037 u32 fid = 0;
967 u32 vid = 0; 1038 u32 vid = 0;
@@ -989,7 +1060,8 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
989 return 0; 1060 return 0;
990 } 1061 }
991 1062
992 if ((fid < HI_FID_TABLE_BOTTOM) && (data->currfid < HI_FID_TABLE_BOTTOM)) { 1063 if ((fid < HI_FID_TABLE_BOTTOM) &&
1064 (data->currfid < HI_FID_TABLE_BOTTOM)) {
993 printk(KERN_ERR PFX 1065 printk(KERN_ERR PFX
994 "ignoring illegal change in lo freq table-%x to 0x%x\n", 1066 "ignoring illegal change in lo freq table-%x to 0x%x\n",
995 data->currfid, fid); 1067 data->currfid, fid);
@@ -1017,7 +1089,8 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
1017} 1089}
1018 1090
1019/* Take a frequency, and issue the hardware pstate transition command */ 1091/* Take a frequency, and issue the hardware pstate transition command */
1020static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned int index) 1092static int transition_frequency_pstate(struct powernow_k8_data *data,
1093 unsigned int index)
1021{ 1094{
1022 u32 pstate = 0; 1095 u32 pstate = 0;
1023 int res, i; 1096 int res, i;
@@ -1029,7 +1102,8 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1029 pstate = index & HW_PSTATE_MASK; 1102 pstate = index & HW_PSTATE_MASK;
1030 if (pstate > data->max_hw_pstate) 1103 if (pstate > data->max_hw_pstate)
1031 return 0; 1104 return 0;
1032 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); 1105 freqs.old = find_khz_freq_from_pstate(data->powernow_table,
1106 data->currpstate);
1033 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1107 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1034 1108
1035 for_each_cpu_mask_nr(i, *(data->available_cores)) { 1109 for_each_cpu_mask_nr(i, *(data->available_cores)) {
@@ -1048,7 +1122,8 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1048} 1122}
1049 1123
1050/* Driver entry point to switch to the target frequency */ 1124/* Driver entry point to switch to the target frequency */
1051static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) 1125static int powernowk8_target(struct cpufreq_policy *pol,
1126 unsigned targfreq, unsigned relation)
1052{ 1127{
1053 cpumask_t oldmask; 1128 cpumask_t oldmask;
1054 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1129 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
@@ -1087,14 +1162,18 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
1087 dprintk("targ: curr fid 0x%x, vid 0x%x\n", 1162 dprintk("targ: curr fid 0x%x, vid 0x%x\n",
1088 data->currfid, data->currvid); 1163 data->currfid, data->currvid);
1089 1164
1090 if ((checkvid != data->currvid) || (checkfid != data->currfid)) { 1165 if ((checkvid != data->currvid) ||
1166 (checkfid != data->currfid)) {
1091 printk(KERN_INFO PFX 1167 printk(KERN_INFO PFX
1092 "error - out of sync, fix 0x%x 0x%x, vid 0x%x 0x%x\n", 1168 "error - out of sync, fix 0x%x 0x%x, "
1093 checkfid, data->currfid, checkvid, data->currvid); 1169 "vid 0x%x 0x%x\n",
1170 checkfid, data->currfid,
1171 checkvid, data->currvid);
1094 } 1172 }
1095 } 1173 }
1096 1174
1097 if (cpufreq_frequency_table_target(pol, data->powernow_table, targfreq, relation, &newstate)) 1175 if (cpufreq_frequency_table_target(pol, data->powernow_table,
1176 targfreq, relation, &newstate))
1098 goto err_out; 1177 goto err_out;
1099 1178
1100 mutex_lock(&fidvid_mutex); 1179 mutex_lock(&fidvid_mutex);
@@ -1114,7 +1193,8 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
1114 mutex_unlock(&fidvid_mutex); 1193 mutex_unlock(&fidvid_mutex);
1115 1194
1116 if (cpu_family == CPU_HW_PSTATE) 1195 if (cpu_family == CPU_HW_PSTATE)
1117 pol->cur = find_khz_freq_from_pstate(data->powernow_table, newstate); 1196 pol->cur = find_khz_freq_from_pstate(data->powernow_table,
1197 newstate);
1118 else 1198 else
1119 pol->cur = find_khz_freq_from_fid(data->currfid); 1199 pol->cur = find_khz_freq_from_fid(data->currfid);
1120 ret = 0; 1200 ret = 0;
@@ -1141,6 +1221,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1141 struct powernow_k8_data *data; 1221 struct powernow_k8_data *data;
1142 cpumask_t oldmask; 1222 cpumask_t oldmask;
1143 int rc; 1223 int rc;
1224 static int print_once;
1144 1225
1145 if (!cpu_online(pol->cpu)) 1226 if (!cpu_online(pol->cpu))
1146 return -ENODEV; 1227 return -ENODEV;
@@ -1157,25 +1238,25 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1157 data->cpu = pol->cpu; 1238 data->cpu = pol->cpu;
1158 data->currpstate = HW_PSTATE_INVALID; 1239 data->currpstate = HW_PSTATE_INVALID;
1159 1240
1160 rc = powernow_k8_cpu_init_acpi(data); 1241 if (powernow_k8_cpu_init_acpi(data)) {
1161 if (rc) {
1162 /* 1242 /*
1163 * Use the PSB BIOS structure. This is only available on 1243 * Use the PSB BIOS structure. This is only available on
1164 * an UP version, and is deprecated by AMD. 1244 * an UP version, and is deprecated by AMD.
1165 */ 1245 */
1166 if (num_online_cpus() != 1) { 1246 if (num_online_cpus() != 1) {
1167#ifndef CONFIG_ACPI_PROCESSOR 1247 /*
1168 printk(KERN_ERR PFX "ACPI Processor support is required " 1248 * Replace this one with print_once as soon as such a
1169 "for SMP systems but is absent. Please load the " 1249 * thing gets introduced
1170 "ACPI Processor module before starting this " 1250 */
1171 "driver.\n"); 1251 if (!print_once) {
1172#else 1252 WARN_ONCE(1, KERN_ERR FW_BUG PFX "Your BIOS "
1173 printk(KERN_ERR FW_BUG PFX "Your BIOS does not provide" 1253 "does not provide ACPI _PSS objects "
1174 " ACPI _PSS objects in a way that Linux " 1254 "in a way that Linux understands. "
1175 "understands. Please report this to the Linux " 1255 "Please report this to the Linux ACPI"
1176 "ACPI maintainers and complain to your BIOS " 1256 " maintainers and complain to your "
1177 "vendor.\n"); 1257 "BIOS vendor.\n");
1178#endif 1258 print_once++;
1259 }
1179 goto err_out; 1260 goto err_out;
1180 } 1261 }
1181 if (pol->cpu != 0) { 1262 if (pol->cpu != 0) {
@@ -1185,9 +1266,9 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1185 goto err_out; 1266 goto err_out;
1186 } 1267 }
1187 rc = find_psb_table(data); 1268 rc = find_psb_table(data);
1188 if (rc) { 1269 if (rc)
1189 goto err_out; 1270 goto err_out;
1190 } 1271
1191 /* Take a crude guess here. 1272 /* Take a crude guess here.
1192 * That guess was in microseconds, so multiply by 1000 */ 1273 * That guess was in microseconds, so multiply by 1000 */
1193 pol->cpuinfo.transition_latency = ( 1274 pol->cpuinfo.transition_latency = (
@@ -1202,16 +1283,16 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1202 1283
1203 if (smp_processor_id() != pol->cpu) { 1284 if (smp_processor_id() != pol->cpu) {
1204 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1285 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
1205 goto err_out; 1286 goto err_out_unmask;
1206 } 1287 }
1207 1288
1208 if (pending_bit_stuck()) { 1289 if (pending_bit_stuck()) {
1209 printk(KERN_ERR PFX "failing init, change pending bit set\n"); 1290 printk(KERN_ERR PFX "failing init, change pending bit set\n");
1210 goto err_out; 1291 goto err_out_unmask;
1211 } 1292 }
1212 1293
1213 if (query_current_values_with_pending_wait(data)) 1294 if (query_current_values_with_pending_wait(data))
1214 goto err_out; 1295 goto err_out_unmask;
1215 1296
1216 if (cpu_family == CPU_OPTERON) 1297 if (cpu_family == CPU_OPTERON)
1217 fidvid_msr_init(); 1298 fidvid_msr_init();
@@ -1222,11 +1303,12 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1222 if (cpu_family == CPU_HW_PSTATE) 1303 if (cpu_family == CPU_HW_PSTATE)
1223 cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); 1304 cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
1224 else 1305 else
1225 cpumask_copy(pol->cpus, &per_cpu(cpu_core_map, pol->cpu)); 1306 cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu));
1226 data->available_cores = pol->cpus; 1307 data->available_cores = pol->cpus;
1227 1308
1228 if (cpu_family == CPU_HW_PSTATE) 1309 if (cpu_family == CPU_HW_PSTATE)
1229 pol->cur = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); 1310 pol->cur = find_khz_freq_from_pstate(data->powernow_table,
1311 data->currpstate);
1230 else 1312 else
1231 pol->cur = find_khz_freq_from_fid(data->currfid); 1313 pol->cur = find_khz_freq_from_fid(data->currfid);
1232 dprintk("policy current frequency %d kHz\n", pol->cur); 1314 dprintk("policy current frequency %d kHz\n", pol->cur);
@@ -1243,7 +1325,8 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1243 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); 1325 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1244 1326
1245 if (cpu_family == CPU_HW_PSTATE) 1327 if (cpu_family == CPU_HW_PSTATE)
1246 dprintk("cpu_init done, current pstate 0x%x\n", data->currpstate); 1328 dprintk("cpu_init done, current pstate 0x%x\n",
1329 data->currpstate);
1247 else 1330 else
1248 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n", 1331 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
1249 data->currfid, data->currvid); 1332 data->currfid, data->currvid);
@@ -1252,15 +1335,16 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1252 1335
1253 return 0; 1336 return 0;
1254 1337
1255err_out: 1338err_out_unmask:
1256 set_cpus_allowed_ptr(current, &oldmask); 1339 set_cpus_allowed_ptr(current, &oldmask);
1257 powernow_k8_cpu_exit_acpi(data); 1340 powernow_k8_cpu_exit_acpi(data);
1258 1341
1342err_out:
1259 kfree(data); 1343 kfree(data);
1260 return -ENODEV; 1344 return -ENODEV;
1261} 1345}
1262 1346
1263static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol) 1347static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1264{ 1348{
1265 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1349 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1266 1350
@@ -1277,14 +1361,14 @@ static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol)
1277 return 0; 1361 return 0;
1278} 1362}
1279 1363
1280static unsigned int powernowk8_get (unsigned int cpu) 1364static unsigned int powernowk8_get(unsigned int cpu)
1281{ 1365{
1282 struct powernow_k8_data *data; 1366 struct powernow_k8_data *data;
1283 cpumask_t oldmask = current->cpus_allowed; 1367 cpumask_t oldmask = current->cpus_allowed;
1284 unsigned int khz = 0; 1368 unsigned int khz = 0;
1285 unsigned int first; 1369 unsigned int first;
1286 1370
1287 first = first_cpu(per_cpu(cpu_core_map, cpu)); 1371 first = cpumask_first(cpu_core_mask(cpu));
1288 data = per_cpu(powernow_data, first); 1372 data = per_cpu(powernow_data, first);
1289 1373
1290 if (!data) 1374 if (!data)
@@ -1313,7 +1397,7 @@ out:
1313 return khz; 1397 return khz;
1314} 1398}
1315 1399
1316static struct freq_attr* powernow_k8_attr[] = { 1400static struct freq_attr *powernow_k8_attr[] = {
1317 &cpufreq_freq_attr_scaling_available_freqs, 1401 &cpufreq_freq_attr_scaling_available_freqs,
1318 NULL, 1402 NULL,
1319}; 1403};
@@ -1358,7 +1442,8 @@ static void __exit powernowk8_exit(void)
1358 cpufreq_unregister_driver(&cpufreq_amd64_driver); 1442 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1359} 1443}
1360 1444
1361MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and Mark Langsdorf <mark.langsdorf@amd.com>"); 1445MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and "
1446 "Mark Langsdorf <mark.langsdorf@amd.com>");
1362MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver."); 1447MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
1363MODULE_LICENSE("GPL"); 1448MODULE_LICENSE("GPL");
1364 1449
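The powernow-k8.c hunks above repeatedly replace the open-coded powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID assignment with the new invalidate_entry() helper. A minimal userspace sketch of that pattern follows; the struct, sentinel, and plausibility bounds are simplified stand-ins, not the kernel's cpufreq definitions:

/*
 * Sketch of the invalidate_entry() pattern: every validation failure
 * marks its table slot through one helper instead of an open-coded
 * assignment at each call site.
 */
#include <stdio.h>

#define ENTRY_INVALID	(~0u)		/* plays the role of CPUFREQ_ENTRY_INVALID */
#define MIN_KHZ		800000u		/* assumed plausibility bounds */
#define MAX_KHZ		3200000u

struct freq_entry {
	unsigned int index;
	unsigned int frequency;		/* kHz, or ENTRY_INVALID */
};

static void invalidate_entry(struct freq_entry *table, unsigned int entry)
{
	table[entry].frequency = ENTRY_INVALID;
}

int main(void)
{
	struct freq_entry table[] = {
		{ 0, 2400000 }, { 1, 123 }, { 2, 1800000 }, { 3, 0 },
	};
	unsigned int i, n = sizeof(table) / sizeof(table[0]);

	/* every failed check goes through the same helper */
	for (i = 0; i < n; i++)
		if (table[i].frequency < MIN_KHZ ||
		    table[i].frequency > MAX_KHZ)
			invalidate_entry(table, i);

	for (i = 0; i < n; i++)
		printf("%u: %s\n", i, table[i].frequency == ENTRY_INVALID
		       ? "invalid" : "ok");
	return 0;
}

Centralizing the sentinel assignment keeps every failure path identical and leaves a single place to change if the marker value ever does.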
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 8ecc75b6c7c3..6c6698feade1 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -45,11 +45,10 @@ struct powernow_k8_data {
45 * frequency is in kHz */ 45 * frequency is in kHz */
46 struct cpufreq_frequency_table *powernow_table; 46 struct cpufreq_frequency_table *powernow_table;
47 47
48#ifdef CONFIG_X86_POWERNOW_K8_ACPI
49 /* the acpi table needs to be kept. it's only available if ACPI was 48 /* the acpi table needs to be kept. it's only available if ACPI was
50 * used to determine valid frequency/vid/fid states */ 49 * used to determine valid frequency/vid/fid states */
51 struct acpi_processor_performance acpi_data; 50 struct acpi_processor_performance acpi_data;
52#endif 51
53 /* we need to keep track of associated cores, but let cpufreq 52 /* we need to keep track of associated cores, but let cpufreq
54 * handle hotplug events - so just point at cpufreq pol->cpus 53 * handle hotplug events - so just point at cpufreq pol->cpus
55 * structure */ 54 * structure */
@@ -222,10 +221,8 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
222 221
223static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index); 222static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
224 223
225#ifdef CONFIG_X86_POWERNOW_K8_ACPI
226static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); 224static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
227static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); 225static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
228#endif
229 226
230#ifdef CONFIG_SMP 227#ifdef CONFIG_SMP
231static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[]) 228static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
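The header hunk above drops the CONFIG_X86_POWERNOW_K8_ACPI guards, matching the deletion of the #else stub functions in powernow-k8.c now that the driver depends on ACPI unconditionally. A toy example of the stub pattern being retired; HAVE_ACPI is a hypothetical stand-in for the Kconfig symbol:

/*
 * When a config option is off, inline stubs keep the callers free of
 * #ifdef clutter at the cost of dead code paths; making the dependency
 * unconditional lets both the stubs and the guards go away.
 */
#include <stdio.h>

#define HAVE_ACPI 1	/* local stand-in for the Kconfig symbol */

struct pn_data { int state_count; };

#if HAVE_ACPI
static int init_acpi(struct pn_data *d)
{
	d->state_count = 4;	/* pretend firmware handed us 4 P-states */
	return 0;
}
#else
static int init_acpi(struct pn_data *d)
{
	(void)d;
	return -1;		/* stub: feature compiled out */
}
#endif

int main(void)
{
	struct pn_data d = { 0 };

	printf("init_acpi: %d, states: %d\n", init_acpi(&d), d.state_count);
	return 0;
}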
diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
index 42da9bd677d6..435a996a613a 100644
--- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
+++ b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
@@ -19,17 +19,19 @@
19 19
20#include <linux/delay.h> 20#include <linux/delay.h>
21#include <linux/cpufreq.h> 21#include <linux/cpufreq.h>
22#include <linux/timex.h>
23#include <linux/io.h>
22 24
23#include <asm/msr.h> 25#include <asm/msr.h>
24#include <asm/timex.h>
25#include <asm/io.h>
26 26
27#define MMCR_BASE 0xfffef000 /* The default base address */ 27#define MMCR_BASE 0xfffef000 /* The default base address */
28#define OFFS_CPUCTL 0x2 /* CPU Control Register */ 28#define OFFS_CPUCTL 0x2 /* CPU Control Register */
29 29
30static __u8 __iomem *cpuctl; 30static __u8 __iomem *cpuctl;
31 31
32#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "sc520_freq", msg) 32#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
33 "sc520_freq", msg)
34#define PFX "sc520_freq: "
33 35
34static struct cpufreq_frequency_table sc520_freq_table[] = { 36static struct cpufreq_frequency_table sc520_freq_table[] = {
35 {0x01, 100000}, 37 {0x01, 100000},
@@ -43,7 +45,8 @@ static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
43 45
44 switch (clockspeed_reg & 0x03) { 46 switch (clockspeed_reg & 0x03) {
45 default: 47 default:
46 printk(KERN_ERR "sc520_freq: error: cpuctl register has unexpected value %02x\n", clockspeed_reg); 48 printk(KERN_ERR PFX "error: cpuctl register has unexpected "
49 "value %02x\n", clockspeed_reg);
47 case 0x01: 50 case 0x01:
48 return 100000; 51 return 100000;
49 case 0x02: 52 case 0x02:
@@ -51,7 +54,7 @@ static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
51 } 54 }
52} 55}
53 56
54static void sc520_freq_set_cpu_state (unsigned int state) 57static void sc520_freq_set_cpu_state(unsigned int state)
55{ 58{
56 59
57 struct cpufreq_freqs freqs; 60 struct cpufreq_freqs freqs;
@@ -76,18 +79,19 @@ static void sc520_freq_set_cpu_state (unsigned int state)
76 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 79 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
77}; 80};
78 81
79static int sc520_freq_verify (struct cpufreq_policy *policy) 82static int sc520_freq_verify(struct cpufreq_policy *policy)
80{ 83{
81 return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]); 84 return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]);
82} 85}
83 86
84static int sc520_freq_target (struct cpufreq_policy *policy, 87static int sc520_freq_target(struct cpufreq_policy *policy,
85 unsigned int target_freq, 88 unsigned int target_freq,
86 unsigned int relation) 89 unsigned int relation)
87{ 90{
88 unsigned int newstate = 0; 91 unsigned int newstate = 0;
89 92
90 if (cpufreq_frequency_table_target(policy, sc520_freq_table, target_freq, relation, &newstate)) 93 if (cpufreq_frequency_table_target(policy, sc520_freq_table,
94 target_freq, relation, &newstate))
91 return -EINVAL; 95 return -EINVAL;
92 96
93 sc520_freq_set_cpu_state(newstate); 97 sc520_freq_set_cpu_state(newstate);
@@ -116,7 +120,7 @@ static int sc520_freq_cpu_init(struct cpufreq_policy *policy)
116 120
117 result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table); 121 result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table);
118 if (result) 122 if (result)
119 return (result); 123 return result;
120 124
121 cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu); 125 cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu);
122 126
@@ -131,7 +135,7 @@ static int sc520_freq_cpu_exit(struct cpufreq_policy *policy)
131} 135}
132 136
133 137
134static struct freq_attr* sc520_freq_attr[] = { 138static struct freq_attr *sc520_freq_attr[] = {
135 &cpufreq_freq_attr_scaling_available_freqs, 139 &cpufreq_freq_attr_scaling_available_freqs,
136 NULL, 140 NULL,
137}; 141};
@@ -155,13 +159,13 @@ static int __init sc520_freq_init(void)
155 int err; 159 int err;
156 160
157 /* Test if we have the right hardware */ 161 /* Test if we have the right hardware */
158 if(c->x86_vendor != X86_VENDOR_AMD || 162 if (c->x86_vendor != X86_VENDOR_AMD ||
159 c->x86 != 4 || c->x86_model != 9) { 163 c->x86 != 4 || c->x86_model != 9) {
160 dprintk("no Elan SC520 processor found!\n"); 164 dprintk("no Elan SC520 processor found!\n");
161 return -ENODEV; 165 return -ENODEV;
162 } 166 }
163 cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1); 167 cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1);
164 if(!cpuctl) { 168 if (!cpuctl) {
165 printk(KERN_ERR "sc520_freq: error: failed to remap memory\n"); 169 printk(KERN_ERR "sc520_freq: error: failed to remap memory\n");
166 return -ENOMEM; 170 return -ENOMEM;
167 } 171 }
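Most of the long-line fixes in sc520_freq.c (and the other drivers here) lean on the same language rule: adjacent string literals concatenate at compile time, so a printk format can be split to fit 80 columns without changing the emitted text. A small standalone illustration, assuming only the PFX macro the patch adds:

/*
 * "sc520_freq: error: cpuctl register has unexpected value 7f" is
 * printed exactly as if the format were one long literal.
 */
#include <stdio.h>

#define PFX "sc520_freq: "

int main(void)
{
	unsigned int clockspeed_reg = 0x7f;	/* made-up register value */

	printf(PFX "error: cpuctl register has unexpected "
	       "value %02x\n", clockspeed_reg);
	return 0;
}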
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index f08998278a3a..c9f1fdc02830 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -390,14 +390,14 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
390 enable it if not. */ 390 enable it if not. */
391 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 391 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
392 392
393 if (!(l & (1<<16))) { 393 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
394 l |= (1<<16); 394 l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
395 dprintk("trying to enable Enhanced SpeedStep (%x)\n", l); 395 dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
396 wrmsr(MSR_IA32_MISC_ENABLE, l, h); 396 wrmsr(MSR_IA32_MISC_ENABLE, l, h);
397 397
398 /* check to see if it stuck */ 398 /* check to see if it stuck */
399 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 399 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
400 if (!(l & (1<<16))) { 400 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
401 printk(KERN_INFO PFX 401 printk(KERN_INFO PFX
402 "couldn't enable Enhanced SpeedStep\n"); 402 "couldn't enable Enhanced SpeedStep\n");
403 return -ENODEV; 403 return -ENODEV;
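The speedstep-centrino.c hunk swaps the magic (1<<16) for the named MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP bit. A userspace sketch of the same test-set-verify idiom; the constant is a local stand-in for the kernel's definition, and the MSR accesses are left as comments since rdmsr/wrmsr need ring-0 (or /dev/cpu/*/msr) access:

#include <stdio.h>
#include <stdint.h>

/* hypothetical stand-in: bit 16 of IA32_MISC_ENABLE */
#define MISC_ENABLE_ENHANCED_SPEEDSTEP	(1u << 16)

int main(void)
{
	uint32_t l = 0;		/* pretend this came from rdmsr() */

	if (!(l & MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
		l |= MISC_ENABLE_ENHANCED_SPEEDSTEP;
		/* wrmsr(); rdmsr(); then re-check whether the bit stuck */
	}
	printf("MISC_ENABLE now 0x%08x\n", l);
	return 0;
}

Naming the bit documents intent at every use and keeps the enable and the re-check from drifting apart.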
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index dedc1e98f168..016c1a4fa3fc 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -39,7 +39,7 @@ static struct pci_dev *speedstep_chipset_dev;
39 39
40/* speedstep_processor 40/* speedstep_processor
41 */ 41 */
42static unsigned int speedstep_processor = 0; 42static unsigned int speedstep_processor;
43 43
44static u32 pmbase; 44static u32 pmbase;
45 45
@@ -54,7 +54,8 @@ static struct cpufreq_frequency_table speedstep_freqs[] = {
54}; 54};
55 55
56 56
57#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-ich", msg) 57#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
58 "speedstep-ich", msg)
58 59
59 60
60/** 61/**
@@ -62,7 +63,7 @@ static struct cpufreq_frequency_table speedstep_freqs[] = {
62 * 63 *
63 * Returns: -ENODEV if no register could be found 64 * Returns: -ENODEV if no register could be found
64 */ 65 */
65static int speedstep_find_register (void) 66static int speedstep_find_register(void)
66{ 67{
67 if (!speedstep_chipset_dev) 68 if (!speedstep_chipset_dev)
68 return -ENODEV; 69 return -ENODEV;
@@ -90,7 +91,7 @@ static int speedstep_find_register (void)
90 * 91 *
91 * Tries to change the SpeedStep state. 92 * Tries to change the SpeedStep state.
92 */ 93 */
93static void speedstep_set_state (unsigned int state) 94static void speedstep_set_state(unsigned int state)
94{ 95{
95 u8 pm2_blk; 96 u8 pm2_blk;
96 u8 value; 97 u8 value;
@@ -133,11 +134,11 @@ static void speedstep_set_state (unsigned int state)
133 134
134 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); 135 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
135 136
136 if (state == (value & 0x1)) { 137 if (state == (value & 0x1))
137 dprintk("change to %u MHz succeeded\n", (speedstep_get_processor_frequency(speedstep_processor) / 1000)); 138 dprintk("change to %u MHz succeeded\n",
138 } else { 139 speedstep_get_frequency(speedstep_processor) / 1000);
139 printk (KERN_ERR "cpufreq: change failed - I/O error\n"); 140 else
140 } 141 printk(KERN_ERR "cpufreq: change failed - I/O error\n");
141 142
142 return; 143 return;
143} 144}
@@ -149,7 +150,7 @@ static void speedstep_set_state (unsigned int state)
149 * Tries to activate the SpeedStep status and control registers. 150 * Tries to activate the SpeedStep status and control registers.
150 * Returns -EINVAL on an unsupported chipset, and zero on success. 151 * Returns -EINVAL on an unsupported chipset, and zero on success.
151 */ 152 */
152static int speedstep_activate (void) 153static int speedstep_activate(void)
153{ 154{
154 u16 value = 0; 155 u16 value = 0;
155 156
@@ -175,20 +176,18 @@ static int speedstep_activate (void)
175 * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected 176 * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected
176 * chipset, or zero on failure. 177 * chipset, or zero on failure.
177 */ 178 */
178static unsigned int speedstep_detect_chipset (void) 179static unsigned int speedstep_detect_chipset(void)
179{ 180{
180 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, 181 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
181 PCI_DEVICE_ID_INTEL_82801DB_12, 182 PCI_DEVICE_ID_INTEL_82801DB_12,
182 PCI_ANY_ID, 183 PCI_ANY_ID, PCI_ANY_ID,
183 PCI_ANY_ID,
184 NULL); 184 NULL);
185 if (speedstep_chipset_dev) 185 if (speedstep_chipset_dev)
186 return 4; /* 4-M */ 186 return 4; /* 4-M */
187 187
188 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, 188 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
189 PCI_DEVICE_ID_INTEL_82801CA_12, 189 PCI_DEVICE_ID_INTEL_82801CA_12,
190 PCI_ANY_ID, 190 PCI_ANY_ID, PCI_ANY_ID,
191 PCI_ANY_ID,
192 NULL); 191 NULL);
193 if (speedstep_chipset_dev) 192 if (speedstep_chipset_dev)
194 return 3; /* 3-M */ 193 return 3; /* 3-M */
@@ -196,8 +195,7 @@ static unsigned int speedstep_detect_chipset (void)
196 195
197 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, 196 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
198 PCI_DEVICE_ID_INTEL_82801BA_10, 197 PCI_DEVICE_ID_INTEL_82801BA_10,
199 PCI_ANY_ID, 198 PCI_ANY_ID, PCI_ANY_ID,
200 PCI_ANY_ID,
201 NULL); 199 NULL);
202 if (speedstep_chipset_dev) { 200 if (speedstep_chipset_dev) {
203 /* speedstep.c causes lockups on Dell Inspirons 8000 and 201 /* speedstep.c causes lockups on Dell Inspirons 8000 and
@@ -208,8 +206,7 @@ static unsigned int speedstep_detect_chipset (void)
208 206
209 hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL, 207 hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL,
210 PCI_DEVICE_ID_INTEL_82815_MC, 208 PCI_DEVICE_ID_INTEL_82815_MC,
211 PCI_ANY_ID, 209 PCI_ANY_ID, PCI_ANY_ID,
212 PCI_ANY_ID,
213 NULL); 210 NULL);
214 211
215 if (!hostbridge) 212 if (!hostbridge)
@@ -236,7 +233,7 @@ static unsigned int _speedstep_get(const struct cpumask *cpus)
236 233
237 cpus_allowed = current->cpus_allowed; 234 cpus_allowed = current->cpus_allowed;
238 set_cpus_allowed_ptr(current, cpus); 235 set_cpus_allowed_ptr(current, cpus);
239 speed = speedstep_get_processor_frequency(speedstep_processor); 236 speed = speedstep_get_frequency(speedstep_processor);
240 set_cpus_allowed_ptr(current, &cpus_allowed); 237 set_cpus_allowed_ptr(current, &cpus_allowed);
241 dprintk("detected %u kHz as current frequency\n", speed); 238 dprintk("detected %u kHz as current frequency\n", speed);
242 return speed; 239 return speed;
@@ -251,11 +248,12 @@ static unsigned int speedstep_get(unsigned int cpu)
251 * speedstep_target - set a new CPUFreq policy 248 * speedstep_target - set a new CPUFreq policy
252 * @policy: new policy 249 * @policy: new policy
253 * @target_freq: the target frequency 250 * @target_freq: the target frequency
254 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) 251 * @relation: how that frequency relates to achieved frequency
252 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
255 * 253 *
256 * Sets a new CPUFreq policy. 254 * Sets a new CPUFreq policy.
257 */ 255 */
258static int speedstep_target (struct cpufreq_policy *policy, 256static int speedstep_target(struct cpufreq_policy *policy,
259 unsigned int target_freq, 257 unsigned int target_freq,
260 unsigned int relation) 258 unsigned int relation)
261{ 259{
@@ -264,7 +262,8 @@ static int speedstep_target (struct cpufreq_policy *policy,
264 cpumask_t cpus_allowed; 262 cpumask_t cpus_allowed;
265 int i; 263 int i;
266 264
267 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate)) 265 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
266 target_freq, relation, &newstate))
268 return -EINVAL; 267 return -EINVAL;
269 268
270 freqs.old = _speedstep_get(policy->cpus); 269 freqs.old = _speedstep_get(policy->cpus);
@@ -308,7 +307,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
308 * Limit must be within speedstep_low_freq and speedstep_high_freq, with 307 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
309 * at least one border included. 308 * at least one border included.
310 */ 309 */
311static int speedstep_verify (struct cpufreq_policy *policy) 310static int speedstep_verify(struct cpufreq_policy *policy)
312{ 311{
313 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); 312 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
314} 313}
@@ -322,7 +321,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
322 321
323 /* only run on CPU to be set, or on its sibling */ 322 /* only run on CPU to be set, or on its sibling */
324#ifdef CONFIG_SMP 323#ifdef CONFIG_SMP
325 cpumask_copy(policy->cpus, &per_cpu(cpu_sibling_map, policy->cpu)); 324 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
326#endif 325#endif
327 326
328 cpus_allowed = current->cpus_allowed; 327 cpus_allowed = current->cpus_allowed;
@@ -344,7 +343,8 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
344 return -EIO; 343 return -EIO;
345 344
346 dprintk("currently at %s speed setting - %i MHz\n", 345 dprintk("currently at %s speed setting - %i MHz\n",
347 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? "low" : "high", 346 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
347 ? "low" : "high",
348 (speed / 1000)); 348 (speed / 1000));
349 349
350 /* cpuinfo and default policy values */ 350 /* cpuinfo and default policy values */
@@ -352,9 +352,9 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
352 352
353 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); 353 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
354 if (result) 354 if (result)
355 return (result); 355 return result;
356 356
357 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); 357 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
358 358
359 return 0; 359 return 0;
360} 360}
@@ -366,7 +366,7 @@ static int speedstep_cpu_exit(struct cpufreq_policy *policy)
366 return 0; 366 return 0;
367} 367}
368 368
369static struct freq_attr* speedstep_attr[] = { 369static struct freq_attr *speedstep_attr[] = {
370 &cpufreq_freq_attr_scaling_available_freqs, 370 &cpufreq_freq_attr_scaling_available_freqs,
371 NULL, 371 NULL,
372}; 372};
@@ -396,13 +396,15 @@ static int __init speedstep_init(void)
396 /* detect processor */ 396 /* detect processor */
397 speedstep_processor = speedstep_detect_processor(); 397 speedstep_processor = speedstep_detect_processor();
398 if (!speedstep_processor) { 398 if (!speedstep_processor) {
399 dprintk("Intel(R) SpeedStep(TM) capable processor not found\n"); 399 dprintk("Intel(R) SpeedStep(TM) capable processor "
400 "not found\n");
400 return -ENODEV; 401 return -ENODEV;
401 } 402 }
402 403
403 /* detect chipset */ 404 /* detect chipset */
404 if (!speedstep_detect_chipset()) { 405 if (!speedstep_detect_chipset()) {
405 dprintk("Intel(R) SpeedStep(TM) for this chipset not (yet) available.\n"); 406 dprintk("Intel(R) SpeedStep(TM) for this chipset not "
407 "(yet) available.\n");
406 return -ENODEV; 408 return -ENODEV;
407 } 409 }
408 410
@@ -431,9 +433,11 @@ static void __exit speedstep_exit(void)
431} 433}
432 434
433 435
434MODULE_AUTHOR ("Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>"); 436MODULE_AUTHOR("Dave Jones <davej@redhat.com>, "
435MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges."); 437 "Dominik Brodowski <linux@brodo.de>");
436MODULE_LICENSE ("GPL"); 438MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets "
439 "with ICH-M southbridges.");
440MODULE_LICENSE("GPL");
437 441
438module_init(speedstep_init); 442module_init(speedstep_init);
439module_exit(speedstep_exit); 443module_exit(speedstep_exit);
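Several hunks in this series move from first_cpu(per_cpu(cpu_core_map, cpu)) to cpumask_first(cpu_core_mask(cpu)), the struct-cpumask API that passes masks by pointer instead of copying whole bitmaps on the stack. A toy model of the lookup both spellings perform; an unsigned long stands in for a real cpumask, and real code must also handle the empty-mask case (the kernel returns nr_cpu_ids there):

#include <stdio.h>

/* index of the lowest set CPU bit, as cpumask_first() computes */
static unsigned int toy_cpumask_first(unsigned long mask)
{
	unsigned int cpu = 0;

	while (mask && !(mask & 1ul)) {
		mask >>= 1;
		cpu++;
	}
	return cpu;
}

int main(void)
{
	unsigned long core_mask = 0xcul;	/* CPUs 2 and 3 share a core */

	printf("first sibling: %u\n", toy_cpumask_first(core_mask));
	return 0;
}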
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index cdac7d62369b..2e3c6862657b 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -16,12 +16,16 @@
16#include <linux/slab.h> 16#include <linux/slab.h>
17 17
18#include <asm/msr.h> 18#include <asm/msr.h>
19#include <asm/tsc.h>
19#include "speedstep-lib.h" 20#include "speedstep-lib.h"
20 21
21#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-lib", msg) 22#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
23 "speedstep-lib", msg)
24
25#define PFX "speedstep-lib: "
22 26
23#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK 27#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
24static int relaxed_check = 0; 28static int relaxed_check;
25#else 29#else
26#define relaxed_check 0 30#define relaxed_check 0
27#endif 31#endif
@@ -30,14 +34,14 @@ static int relaxed_check = 0;
30 * GET PROCESSOR CORE SPEED IN KHZ * 34 * GET PROCESSOR CORE SPEED IN KHZ *
31 *********************************************************************/ 35 *********************************************************************/
32 36
33static unsigned int pentium3_get_frequency (unsigned int processor) 37static unsigned int pentium3_get_frequency(unsigned int processor)
34{ 38{
35 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ 39 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
36 struct { 40 struct {
37 unsigned int ratio; /* Frequency Multiplier (x10) */ 41 unsigned int ratio; /* Frequency Multiplier (x10) */
38 u8 bitmap; /* power on configuration bits 42 u8 bitmap; /* power on configuration bits
39 [27, 25:22] (in MSR 0x2a) */ 43 [27, 25:22] (in MSR 0x2a) */
40 } msr_decode_mult [] = { 44 } msr_decode_mult[] = {
41 { 30, 0x01 }, 45 { 30, 0x01 },
42 { 35, 0x05 }, 46 { 35, 0x05 },
43 { 40, 0x02 }, 47 { 40, 0x02 },
@@ -52,7 +56,7 @@ static unsigned int pentium3_get_frequency (unsigned int processor)
52 { 85, 0x26 }, 56 { 85, 0x26 },
53 { 90, 0x20 }, 57 { 90, 0x20 },
54 { 100, 0x2b }, 58 { 100, 0x2b },
55 { 0, 0xff } /* error or unknown value */ 59 { 0, 0xff } /* error or unknown value */
56 }; 60 };
57 61
58 /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */ 62 /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
@@ -60,7 +64,7 @@ static unsigned int pentium3_get_frequency (unsigned int processor)
60 unsigned int value; /* Front Side Bus speed in MHz */ 64 unsigned int value; /* Front Side Bus speed in MHz */
61 u8 bitmap; /* power on configuration bits [18: 19] 65 u8 bitmap; /* power on configuration bits [18: 19]
62 (in MSR 0x2a) */ 66 (in MSR 0x2a) */
63 } msr_decode_fsb [] = { 67 } msr_decode_fsb[] = {
64 { 66, 0x0 }, 68 { 66, 0x0 },
65 { 100, 0x2 }, 69 { 100, 0x2 },
66 { 133, 0x1 }, 70 { 133, 0x1 },
@@ -85,7 +89,7 @@ static unsigned int pentium3_get_frequency (unsigned int processor)
85 } 89 }
86 90
87 /* decode the multiplier */ 91 /* decode the multiplier */
88 if (processor == SPEEDSTEP_PROCESSOR_PIII_C_EARLY) { 92 if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) {
89 dprintk("workaround for early PIIIs\n"); 93 dprintk("workaround for early PIIIs\n");
90 msr_lo &= 0x03c00000; 94 msr_lo &= 0x03c00000;
91 } else 95 } else
@@ -97,9 +101,10 @@ static unsigned int pentium3_get_frequency (unsigned int processor)
97 j++; 101 j++;
98 } 102 }
99 103
100 dprintk("speed is %u\n", (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100)); 104 dprintk("speed is %u\n",
105 (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
101 106
102 return (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100); 107 return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100;
103} 108}
104 109
105 110
@@ -112,20 +117,23 @@ static unsigned int pentiumM_get_frequency(void)
112 117
113 /* see table B-2 of 24547212.pdf */ 118 /* see table B-2 of 24547212.pdf */
114 if (msr_lo & 0x00040000) { 119 if (msr_lo & 0x00040000) {
115 printk(KERN_DEBUG "speedstep-lib: PM - invalid FSB: 0x%x 0x%x\n", msr_lo, msr_tmp); 120 printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n",
121 msr_lo, msr_tmp);
116 return 0; 122 return 0;
117 } 123 }
118 124
119 msr_tmp = (msr_lo >> 22) & 0x1f; 125 msr_tmp = (msr_lo >> 22) & 0x1f;
120 dprintk("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * 100 * 1000)); 126 dprintk("bits 22-26 are 0x%x, speed is %u\n",
127 msr_tmp, (msr_tmp * 100 * 1000));
121 128
122 return (msr_tmp * 100 * 1000); 129 return msr_tmp * 100 * 1000;
123} 130}
124 131
125static unsigned int pentium_core_get_frequency(void) 132static unsigned int pentium_core_get_frequency(void)
126{ 133{
127 u32 fsb = 0; 134 u32 fsb = 0;
128 u32 msr_lo, msr_tmp; 135 u32 msr_lo, msr_tmp;
136 int ret;
129 137
130 rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp); 138 rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
131 /* see table B-2 of 25366920.pdf */ 139 /* see table B-2 of 25366920.pdf */
@@ -153,12 +161,15 @@ static unsigned int pentium_core_get_frequency(void)
153 } 161 }
154 162
155 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); 163 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
156 dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); 164 dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n",
165 msr_lo, msr_tmp);
157 166
158 msr_tmp = (msr_lo >> 22) & 0x1f; 167 msr_tmp = (msr_lo >> 22) & 0x1f;
159 dprintk("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * fsb)); 168 dprintk("bits 22-26 are 0x%x, speed is %u\n",
169 msr_tmp, (msr_tmp * fsb));
160 170
161 return (msr_tmp * fsb); 171 ret = (msr_tmp * fsb);
172 return ret;
162} 173}
163 174
164 175
@@ -167,6 +178,16 @@ static unsigned int pentium4_get_frequency(void)
167 struct cpuinfo_x86 *c = &boot_cpu_data; 178 struct cpuinfo_x86 *c = &boot_cpu_data;
168 u32 msr_lo, msr_hi, mult; 179 u32 msr_lo, msr_hi, mult;
169 unsigned int fsb = 0; 180 unsigned int fsb = 0;
181 unsigned int ret;
182 u8 fsb_code;
183
184 /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency
185 * to System Bus Frequency Ratio Field in the Processor Frequency
186 * Configuration Register of the MSR. Therefore the current
187 * frequency cannot be calculated and has to be measured.
188 */
189 if (c->x86_model < 2)
190 return cpu_khz;
170 191
171 rdmsr(0x2c, msr_lo, msr_hi); 192 rdmsr(0x2c, msr_lo, msr_hi);
172 193
@@ -177,62 +198,61 @@ static unsigned int pentium4_get_frequency(void)
177 * revision #12 in Table B-1: MSRs in the Pentium 4 and 198 * revision #12 in Table B-1: MSRs in the Pentium 4 and
178 * Intel Xeon Processors, on page B-4 and B-5. 199 * Intel Xeon Processors, on page B-4 and B-5.
179 */ 200 */
180 if (c->x86_model < 2) 201 fsb_code = (msr_lo >> 16) & 0x7;
202 switch (fsb_code) {
203 case 0:
181 fsb = 100 * 1000; 204 fsb = 100 * 1000;
182 else { 205 break;
183 u8 fsb_code = (msr_lo >> 16) & 0x7; 206 case 1:
184 switch (fsb_code) { 207 fsb = 13333 * 10;
185 case 0: 208 break;
186 fsb = 100 * 1000; 209 case 2:
187 break; 210 fsb = 200 * 1000;
188 case 1: 211 break;
189 fsb = 13333 * 10;
190 break;
191 case 2:
192 fsb = 200 * 1000;
193 break;
194 }
195 } 212 }
196 213
197 if (!fsb) 214 if (!fsb)
198 printk(KERN_DEBUG "speedstep-lib: couldn't detect FSB speed. Please send an e-mail to <linux@brodo.de>\n"); 215 printk(KERN_DEBUG PFX "couldn't detect FSB speed. "
216 "Please send an e-mail to <linux@brodo.de>\n");
199 217
200 /* Multiplier. */ 218 /* Multiplier. */
201 mult = msr_lo >> 24; 219 mult = msr_lo >> 24;
202 220
203 dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", fsb, mult, (fsb * mult)); 221 dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n",
222 fsb, mult, (fsb * mult));
204 223
205 return (fsb * mult); 224 ret = (fsb * mult);
225 return ret;
206} 226}
207 227
208 228
209unsigned int speedstep_get_processor_frequency(unsigned int processor) 229unsigned int speedstep_get_frequency(unsigned int processor)
210{ 230{
211 switch (processor) { 231 switch (processor) {
212 case SPEEDSTEP_PROCESSOR_PCORE: 232 case SPEEDSTEP_CPU_PCORE:
213 return pentium_core_get_frequency(); 233 return pentium_core_get_frequency();
214 case SPEEDSTEP_PROCESSOR_PM: 234 case SPEEDSTEP_CPU_PM:
215 return pentiumM_get_frequency(); 235 return pentiumM_get_frequency();
216 case SPEEDSTEP_PROCESSOR_P4D: 236 case SPEEDSTEP_CPU_P4D:
217 case SPEEDSTEP_PROCESSOR_P4M: 237 case SPEEDSTEP_CPU_P4M:
218 return pentium4_get_frequency(); 238 return pentium4_get_frequency();
219 case SPEEDSTEP_PROCESSOR_PIII_T: 239 case SPEEDSTEP_CPU_PIII_T:
220 case SPEEDSTEP_PROCESSOR_PIII_C: 240 case SPEEDSTEP_CPU_PIII_C:
221 case SPEEDSTEP_PROCESSOR_PIII_C_EARLY: 241 case SPEEDSTEP_CPU_PIII_C_EARLY:
222 return pentium3_get_frequency(processor); 242 return pentium3_get_frequency(processor);
223 default: 243 default:
224 return 0; 244 return 0;
225 }; 245 };
226 return 0; 246 return 0;
227} 247}
228EXPORT_SYMBOL_GPL(speedstep_get_processor_frequency); 248EXPORT_SYMBOL_GPL(speedstep_get_frequency);
229 249
230 250
231/********************************************************************* 251/*********************************************************************
232 * DETECT SPEEDSTEP-CAPABLE PROCESSOR * 252 * DETECT SPEEDSTEP-CAPABLE PROCESSOR *
233 *********************************************************************/ 253 *********************************************************************/
234 254
235unsigned int speedstep_detect_processor (void) 255unsigned int speedstep_detect_processor(void)
236{ 256{
237 struct cpuinfo_x86 *c = &cpu_data(0); 257 struct cpuinfo_x86 *c = &cpu_data(0);
238 u32 ebx, msr_lo, msr_hi; 258 u32 ebx, msr_lo, msr_hi;
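The hunk above derives the P4 core clock from two fields of MSR 0x2c: bits 16-18 select the scalable-bus (FSB) speed and bits 24 and up hold the bus-to-core multiplier; models 0 and 1 lack the ratio field entirely, hence the early cpu_khz return. A minimal user-space sketch of the same decode, assuming the raw msr_lo word is already in hand (actually reading it, e.g. through /dev/cpu/0/msr, is left out; the sample value in main() is hypothetical):

	#include <stdio.h>

	/* Sketch only: decode a Pentium 4 core clock from a raw MSR 0x2c
	 * low word. Field layout mirrors the switch in the hunk above. */
	static unsigned int p4_khz_from_msr(unsigned int msr_lo)
	{
		unsigned int fsb = 0;
		unsigned char fsb_code = (msr_lo >> 16) & 0x7;
		unsigned int mult = msr_lo >> 24;

		switch (fsb_code) {
		case 0:
			fsb = 100 * 1000;	/* 100 MHz bus */
			break;
		case 1:
			fsb = 13333 * 10;	/* 133 MHz bus */
			break;
		case 2:
			fsb = 200 * 1000;	/* 200 MHz bus */
			break;
		default:
			return 0;		/* unknown encoding */
		}
		return fsb * mult;
	}

	int main(void)
	{
		/* hypothetical raw value: fsb_code 2, multiplier 14 -> 2.8 GHz */
		printf("%u kHz\n", p4_khz_from_msr((14u << 24) | (2u << 16)));
		return 0;
	}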
@@ -261,7 +281,7 @@ unsigned int speedstep_detect_processor (void)
261 * sample has ebx = 0x0f, production has 0x0e. 281 * sample has ebx = 0x0f, production has 0x0e.
262 */ 282 */
263 if ((ebx == 0x0e) || (ebx == 0x0f)) 283 if ((ebx == 0x0e) || (ebx == 0x0f))
264 return SPEEDSTEP_PROCESSOR_P4M; 284 return SPEEDSTEP_CPU_P4M;
265 break; 285 break;
266 case 7: 286 case 7:
267 /* 287 /*
@@ -272,7 +292,7 @@ unsigned int speedstep_detect_processor (void)
272 * samples are only of B-stepping... 292 * samples are only of B-stepping...
273 */ 293 */
274 if (ebx == 0x0e) 294 if (ebx == 0x0e)
275 return SPEEDSTEP_PROCESSOR_P4M; 295 return SPEEDSTEP_CPU_P4M;
276 break; 296 break;
277 case 9: 297 case 9:
278 /* 298 /*
@@ -288,10 +308,13 @@ unsigned int speedstep_detect_processor (void)
288 * M-P4-Ms may have either ebx=0xe or 0xf [see above] 308 * M-P4-Ms may have either ebx=0xe or 0xf [see above]
289 * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf] 309 * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
290 * also, M-P4M HTs have ebx=0x8, too 310 * also, M-P4M HTs have ebx=0x8, too
291 * For now, they are distinguished by the model_id string 311 * For now, they are distinguished by the model_id
312 * string
292 */ 313 */
293 if ((ebx == 0x0e) || (strstr(c->x86_model_id,"Mobile Intel(R) Pentium(R) 4") != NULL)) 314 if ((ebx == 0x0e) ||
294 return SPEEDSTEP_PROCESSOR_P4M; 315 (strstr(c->x86_model_id,
316 "Mobile Intel(R) Pentium(R) 4") != NULL))
317 return SPEEDSTEP_CPU_P4M;
295 break; 318 break;
296 default: 319 default:
297 break; 320 break;
@@ -301,7 +324,8 @@ unsigned int speedstep_detect_processor (void)
301 324
302 switch (c->x86_model) { 325 switch (c->x86_model) {
303 case 0x0B: /* Intel PIII [Tualatin] */ 326 case 0x0B: /* Intel PIII [Tualatin] */
304 /* cpuid_ebx(1) is 0x04 for desktop PIII, 0x06 for mobile PIII-M */ 327 /* cpuid_ebx(1) is 0x04 for desktop PIII,
328 * 0x06 for mobile PIII-M */
305 ebx = cpuid_ebx(0x00000001); 329 ebx = cpuid_ebx(0x00000001);
306 dprintk("ebx is %x\n", ebx); 330 dprintk("ebx is %x\n", ebx);
307 331
@@ -313,14 +337,15 @@ unsigned int speedstep_detect_processor (void)
313 /* So far all PIII-M processors support SpeedStep. See 337 /* So far all PIII-M processors support SpeedStep. See
314 * Intel's 24540640.pdf of June 2003 338 * Intel's 24540640.pdf of June 2003
315 */ 339 */
316 return SPEEDSTEP_PROCESSOR_PIII_T; 340 return SPEEDSTEP_CPU_PIII_T;
317 341
318 case 0x08: /* Intel PIII [Coppermine] */ 342 case 0x08: /* Intel PIII [Coppermine] */
319 343
320 /* all mobile PIII Coppermines have FSB 100 MHz 344 /* all mobile PIII Coppermines have FSB 100 MHz
321 * ==> sort out a few desktop PIIIs. */ 345 * ==> sort out a few desktop PIIIs. */
322 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi); 346 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
323 dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", msr_lo, msr_hi); 347 dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n",
348 msr_lo, msr_hi);
324 msr_lo &= 0x00c0000; 349 msr_lo &= 0x00c0000;
325 if (msr_lo != 0x0080000) 350 if (msr_lo != 0x0080000)
326 return 0; 351 return 0;
@@ -332,13 +357,15 @@ unsigned int speedstep_detect_processor (void)
332 * bit 56 or 57 is set 357 * bit 56 or 57 is set
333 */ 358 */
334 rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi); 359 rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
335 dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n", msr_lo, msr_hi); 360 dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n",
336 if ((msr_hi & (1<<18)) && (relaxed_check ? 1 : (msr_hi & (3<<24)))) { 361 msr_lo, msr_hi);
362 if ((msr_hi & (1<<18)) &&
363 (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
337 if (c->x86_mask == 0x01) { 364 if (c->x86_mask == 0x01) {
338 dprintk("early PIII version\n"); 365 dprintk("early PIII version\n");
339 return SPEEDSTEP_PROCESSOR_PIII_C_EARLY; 366 return SPEEDSTEP_CPU_PIII_C_EARLY;
340 } else 367 } else
341 return SPEEDSTEP_PROCESSOR_PIII_C; 368 return SPEEDSTEP_CPU_PIII_C;
342 } 369 }
343 370
344 default: 371 default:
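Both Coppermine gates above are plain bit tests on 64-bit MSRs split into lo/hi halves: MSR_IA32_EBL_CR_POWERON bits 18-19 must match the 100 MHz FSB pattern (mask 0x00c0000 against value 0x0080000), and in MSR_IA32_PLATFORM_ID the mobile flag is bit 18 of the high word (MSR bit 50), with bits 24-25 of the high word (MSR bits 56-57) also required unless relaxed_check is set. A small sketch of the same tests on already-read register halves; the values in main() are hypothetical:

	#include <stdio.h>

	/* Sketch: the two Coppermine gates from the hunks above, applied to
	 * caller-supplied MSR halves. relaxed_check mirrors the module option. */
	static int coppermine_is_mobile(unsigned int poweron_lo,
					unsigned int platid_hi, int relaxed_check)
	{
		if ((poweron_lo & 0x00c0000) != 0x0080000)
			return 0;	/* not a 100 MHz FSB part */
		if (!(platid_hi & (1 << 18)))
			return 0;	/* MSR bit 50 clear: not mobile */
		if (!relaxed_check && !(platid_hi & (3 << 24)))
			return 0;	/* neither MSR bit 56 nor 57 set */
		return 1;
	}

	int main(void)
	{
		printf("%d\n", coppermine_is_mobile(0x0080000,
				(1u << 18) | (1u << 24), 0));
		return 0;
	}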
@@ -369,7 +396,7 @@ unsigned int speedstep_get_freqs(unsigned int processor,
369 dprintk("trying to determine both speeds\n"); 396 dprintk("trying to determine both speeds\n");
370 397
371 /* get current speed */ 398 /* get current speed */
372 prev_speed = speedstep_get_processor_frequency(processor); 399 prev_speed = speedstep_get_frequency(processor);
373 if (!prev_speed) 400 if (!prev_speed)
374 return -EIO; 401 return -EIO;
375 402
@@ -379,7 +406,7 @@ unsigned int speedstep_get_freqs(unsigned int processor,
379 406
380 /* switch to low state */ 407 /* switch to low state */
381 set_state(SPEEDSTEP_LOW); 408 set_state(SPEEDSTEP_LOW);
382 *low_speed = speedstep_get_processor_frequency(processor); 409 *low_speed = speedstep_get_frequency(processor);
383 if (!*low_speed) { 410 if (!*low_speed) {
384 ret = -EIO; 411 ret = -EIO;
385 goto out; 412 goto out;
@@ -398,7 +425,7 @@ unsigned int speedstep_get_freqs(unsigned int processor,
398 if (transition_latency) 425 if (transition_latency)
399 do_gettimeofday(&tv2); 426 do_gettimeofday(&tv2);
400 427
401 *high_speed = speedstep_get_processor_frequency(processor); 428 *high_speed = speedstep_get_frequency(processor);
402 if (!*high_speed) { 429 if (!*high_speed) {
403 ret = -EIO; 430 ret = -EIO;
404 goto out; 431 goto out;
@@ -426,9 +453,12 @@ unsigned int speedstep_get_freqs(unsigned int processor,
426 /* check if the latency measurement is too high or too low 453 /* check if the latency measurement is too high or too low
427 * and set it to a safe value (500uSec) in that case 454 * and set it to a safe value (500uSec) in that case
428 */ 455 */
429 if (*transition_latency > 10000000 || *transition_latency < 50000) { 456 if (*transition_latency > 10000000 ||
430 printk (KERN_WARNING "speedstep: frequency transition measured seems out of " 457 *transition_latency < 50000) {
431 "range (%u nSec), falling back to a safe one of %u nSec.\n", 458 printk(KERN_WARNING PFX "frequency transition "
459 "measured seems out of range (%u "
                                                                                                   460 			"nSec), falling back to a safe one of "
461 "%u nSec.\n",
432 *transition_latency, 500000); 462 *transition_latency, 500000);
433 *transition_latency = 500000; 463 *transition_latency = 500000;
434 } 464 }
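The measurement above brackets one set_state() call between two do_gettimeofday() samples and then rejects implausible results. A standalone sketch of the same derive-and-clamp step, assuming tv1/tv2 were taken immediately around the transition as in the hunk (sub-second deltas only):

	#include <stdio.h>
	#include <sys/time.h>

	/* Sketch: turn two gettimeofday() samples into a transition latency
	 * in nanoseconds and clamp nonsense values to 500 us, as above. */
	static unsigned int clamp_latency_ns(struct timeval tv1, struct timeval tv2)
	{
		long long ns = (tv2.tv_sec - tv1.tv_sec) * 1000000000LL +
			       (tv2.tv_usec - tv1.tv_usec) * 1000LL;

		if (ns > 10000000 || ns < 50000)	/* outside 50 us .. 10 ms */
			ns = 500000;			/* safe fallback: 500 us */
		return (unsigned int)ns;
	}

	int main(void)
	{
		struct timeval a = { 0, 100 }, b = { 0, 300 };

		printf("%u ns\n", clamp_latency_ns(a, b));	/* 200000 */
		return 0;
	}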
@@ -436,15 +466,16 @@ unsigned int speedstep_get_freqs(unsigned int processor,
436 466
437out: 467out:
438 local_irq_restore(flags); 468 local_irq_restore(flags);
439 return (ret); 469 return ret;
440} 470}
441EXPORT_SYMBOL_GPL(speedstep_get_freqs); 471EXPORT_SYMBOL_GPL(speedstep_get_freqs);
442 472
443#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK 473#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
444module_param(relaxed_check, int, 0444); 474module_param(relaxed_check, int, 0444);
445MODULE_PARM_DESC(relaxed_check, "Don't do all checks for speedstep capability."); 475MODULE_PARM_DESC(relaxed_check,
476 "Don't do all checks for speedstep capability.");
446#endif 477#endif
447 478
448MODULE_AUTHOR ("Dominik Brodowski <linux@brodo.de>"); 479MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
449MODULE_DESCRIPTION ("Library for Intel SpeedStep 1 or 2 cpufreq drivers."); 480MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
450MODULE_LICENSE ("GPL"); 481MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
index b11bcc608cac..2b6c04e5a304 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
@@ -12,17 +12,17 @@
12 12
13/* processors */ 13/* processors */
14 14
15#define SPEEDSTEP_PROCESSOR_PIII_C_EARLY 0x00000001 /* Coppermine core */ 15#define SPEEDSTEP_CPU_PIII_C_EARLY 0x00000001 /* Coppermine core */
16#define SPEEDSTEP_PROCESSOR_PIII_C 0x00000002 /* Coppermine core */ 16#define SPEEDSTEP_CPU_PIII_C 0x00000002 /* Coppermine core */
17#define SPEEDSTEP_PROCESSOR_PIII_T 0x00000003 /* Tualatin core */ 17#define SPEEDSTEP_CPU_PIII_T 0x00000003 /* Tualatin core */
18#define SPEEDSTEP_PROCESSOR_P4M 0x00000004 /* P4-M */ 18#define SPEEDSTEP_CPU_P4M 0x00000004 /* P4-M */
19 19
20/* the following processors are not speedstep-capable and are not auto-detected 20/* the following processors are not speedstep-capable and are not auto-detected
21 * in speedstep_detect_processor(). However, their speed can be detected using 21 * in speedstep_detect_processor(). However, their speed can be detected using
22 * the speedstep_get_processor_frequency() call. */ 22 * the speedstep_get_frequency() call. */
23#define SPEEDSTEP_PROCESSOR_PM 0xFFFFFF03 /* Pentium M */ 23#define SPEEDSTEP_CPU_PM 0xFFFFFF03 /* Pentium M */
24#define SPEEDSTEP_PROCESSOR_P4D 0xFFFFFF04 /* desktop P4 */ 24#define SPEEDSTEP_CPU_P4D 0xFFFFFF04 /* desktop P4 */
25#define SPEEDSTEP_PROCESSOR_PCORE 0xFFFFFF05 /* Core */ 25#define SPEEDSTEP_CPU_PCORE 0xFFFFFF05 /* Core */
26 26
27/* speedstep states -- only two of them */ 27/* speedstep states -- only two of them */
28 28
@@ -34,7 +34,7 @@
34extern unsigned int speedstep_detect_processor (void); 34extern unsigned int speedstep_detect_processor (void);
35 35
36/* detect the current speed (in khz) of the processor */ 36/* detect the current speed (in khz) of the processor */
37extern unsigned int speedstep_get_processor_frequency(unsigned int processor); 37extern unsigned int speedstep_get_frequency(unsigned int processor);
38 38
39 39
40/* detect the low and high speeds of the processor. The callback 40/* detect the low and high speeds of the processor. The callback
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index 8a85c93bd62a..befea088e4f5 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -19,8 +19,8 @@
19#include <linux/cpufreq.h> 19#include <linux/cpufreq.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/delay.h> 21#include <linux/delay.h>
22#include <linux/io.h>
22#include <asm/ist.h> 23#include <asm/ist.h>
23#include <asm/io.h>
24 24
25#include "speedstep-lib.h" 25#include "speedstep-lib.h"
26 26
@@ -30,12 +30,12 @@
30 * If user gives it, these are used. 30 * If user gives it, these are used.
31 * 31 *
32 */ 32 */
33static int smi_port = 0; 33static int smi_port;
34static int smi_cmd = 0; 34static int smi_cmd;
35static unsigned int smi_sig = 0; 35static unsigned int smi_sig;
36 36
37/* info about the processor */ 37/* info about the processor */
38static unsigned int speedstep_processor = 0; 38static unsigned int speedstep_processor;
39 39
40/* 40/*
41 * There are only two frequency states for each processor. Values 41 * There are only two frequency states for each processor. Values
@@ -56,12 +56,13 @@ static struct cpufreq_frequency_table speedstep_freqs[] = {
56 * of DMA activity going on? */ 56 * of DMA activity going on? */
57#define SMI_TRIES 5 57#define SMI_TRIES 5
58 58
59#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-smi", msg) 59#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
60 "speedstep-smi", msg)
60 61
61/** 62/**
62 * speedstep_smi_ownership 63 * speedstep_smi_ownership
63 */ 64 */
64static int speedstep_smi_ownership (void) 65static int speedstep_smi_ownership(void)
65{ 66{
66 u32 command, result, magic, dummy; 67 u32 command, result, magic, dummy;
67 u32 function = GET_SPEEDSTEP_OWNER; 68 u32 function = GET_SPEEDSTEP_OWNER;
@@ -70,16 +71,18 @@ static int speedstep_smi_ownership (void)
70 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); 71 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
71 magic = virt_to_phys(magic_data); 72 magic = virt_to_phys(magic_data);
72 73
73 dprintk("trying to obtain ownership with command %x at port %x\n", command, smi_port); 74 dprintk("trying to obtain ownership with command %x at port %x\n",
75 command, smi_port);
74 76
75 __asm__ __volatile__( 77 __asm__ __volatile__(
76 "push %%ebp\n" 78 "push %%ebp\n"
77 "out %%al, (%%dx)\n" 79 "out %%al, (%%dx)\n"
78 "pop %%ebp\n" 80 "pop %%ebp\n"
79 : "=D" (result), "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy), 81 : "=D" (result),
80 "=S" (dummy) 82 "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
83 "=S" (dummy)
81 : "a" (command), "b" (function), "c" (0), "d" (smi_port), 84 : "a" (command), "b" (function), "c" (0), "d" (smi_port),
82 "D" (0), "S" (magic) 85 "D" (0), "S" (magic)
83 : "memory" 86 : "memory"
84 ); 87 );
85 88
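All four SMI entry points in this file share one calling convention: the command word (signature | command byte) goes in EAX, the function code in EBX, an argument in ECX, the IST port in EDX, and a single "out %al, (%dx)" traps into the BIOS, which hands results back in registers that vary per call (EDI for the ownership check above). EBP is saved by hand because the BIOS handler may clobber it. A hedged, 32-bit-only sketch of that convention; it builds with gcc -m32 but needs ring 0 or iopl() to actually execute the OUT, and the port/command values come from the BIOS:

	/* Sketch of the IST/SMI register convention used above. i386 only;
	 * do not run without I/O privilege. */
	static unsigned int smi_call(unsigned int command, unsigned int function,
				     unsigned int arg, unsigned int port)
	{
		unsigned int result, dummy;

		__asm__ __volatile__(
			"push %%ebp\n"		/* BIOS may clobber EBP */
			"out %%al, (%%dx)\n"	/* write AL to IST port -> SMI */
			"pop %%ebp\n"
			: "=D" (result), "=a" (dummy), "=b" (dummy),
			  "=c" (dummy), "=d" (dummy)
			: "a" (command), "b" (function), "c" (arg), "d" (port),
			  "D" (0)
			: "memory");
		return result;
	}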
@@ -97,10 +100,10 @@ static int speedstep_smi_ownership (void)
97 * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing 100 * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
98 * shows that the latter occurs if !(ist_info.event & 0xFFFF). 101 * shows that the latter occurs if !(ist_info.event & 0xFFFF).
99 */ 102 */
100static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high) 103static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high)
101{ 104{
102 u32 command, result = 0, edi, high_mhz, low_mhz, dummy; 105 u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
103 u32 state=0; 106 u32 state = 0;
104 u32 function = GET_SPEEDSTEP_FREQS; 107 u32 function = GET_SPEEDSTEP_FREQS;
105 108
106 if (!(ist_info.event & 0xFFFF)) { 109 if (!(ist_info.event & 0xFFFF)) {
@@ -110,17 +113,25 @@ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
110 113
111 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); 114 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
112 115
113 dprintk("trying to determine frequencies with command %x at port %x\n", command, smi_port); 116 dprintk("trying to determine frequencies with command %x at port %x\n",
117 command, smi_port);
114 118
115 __asm__ __volatile__( 119 __asm__ __volatile__(
116 "push %%ebp\n" 120 "push %%ebp\n"
117 "out %%al, (%%dx)\n" 121 "out %%al, (%%dx)\n"
118 "pop %%ebp" 122 "pop %%ebp"
119 : "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi), "=S" (dummy) 123 : "=a" (result),
120 : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0) 124 "=b" (high_mhz),
125 "=c" (low_mhz),
126 "=d" (state), "=D" (edi), "=S" (dummy)
127 : "a" (command),
128 "b" (function),
129 "c" (state),
130 "d" (smi_port), "S" (0), "D" (0)
121 ); 131 );
122 132
123 dprintk("result %x, low_freq %u, high_freq %u\n", result, low_mhz, high_mhz); 133 dprintk("result %x, low_freq %u, high_freq %u\n",
134 result, low_mhz, high_mhz);
124 135
125 /* abort if results are obviously incorrect... */ 136 /* abort if results are obviously incorrect... */
126 if ((high_mhz + low_mhz) < 600) 137 if ((high_mhz + low_mhz) < 600)
@@ -137,26 +148,30 @@ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
137 * @state: processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) 148 * @state: processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
138 * 149 *
139 */ 150 */
140static int speedstep_get_state (void) 151static int speedstep_get_state(void)
141{ 152{
142 u32 function=GET_SPEEDSTEP_STATE; 153 u32 function = GET_SPEEDSTEP_STATE;
143 u32 result, state, edi, command, dummy; 154 u32 result, state, edi, command, dummy;
144 155
145 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); 156 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
146 157
147 dprintk("trying to determine current setting with command %x at port %x\n", command, smi_port); 158 dprintk("trying to determine current setting with command %x "
159 "at port %x\n", command, smi_port);
148 160
149 __asm__ __volatile__( 161 __asm__ __volatile__(
150 "push %%ebp\n" 162 "push %%ebp\n"
151 "out %%al, (%%dx)\n" 163 "out %%al, (%%dx)\n"
152 "pop %%ebp\n" 164 "pop %%ebp\n"
153 : "=a" (result), "=b" (state), "=D" (edi), "=c" (dummy), "=d" (dummy), "=S" (dummy) 165 : "=a" (result),
154 : "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0), "D" (0) 166 "=b" (state), "=D" (edi),
167 "=c" (dummy), "=d" (dummy), "=S" (dummy)
168 : "a" (command), "b" (function), "c" (0),
169 "d" (smi_port), "S" (0), "D" (0)
155 ); 170 );
156 171
157 dprintk("state is %x, result is %x\n", state, result); 172 dprintk("state is %x, result is %x\n", state, result);
158 173
159 return (state & 1); 174 return state & 1;
160} 175}
161 176
162 177
@@ -165,11 +180,11 @@ static int speedstep_get_state (void)
165 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) 180 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
166 * 181 *
167 */ 182 */
168static void speedstep_set_state (unsigned int state) 183static void speedstep_set_state(unsigned int state)
169{ 184{
170 unsigned int result = 0, command, new_state, dummy; 185 unsigned int result = 0, command, new_state, dummy;
171 unsigned long flags; 186 unsigned long flags;
172 unsigned int function=SET_SPEEDSTEP_STATE; 187 unsigned int function = SET_SPEEDSTEP_STATE;
173 unsigned int retry = 0; 188 unsigned int retry = 0;
174 189
175 if (state > 0x1) 190 if (state > 0x1)
@@ -180,11 +195,14 @@ static void speedstep_set_state (unsigned int state)
180 195
181 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); 196 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
182 197
183 dprintk("trying to set frequency to state %u with command %x at port %x\n", state, command, smi_port); 198 dprintk("trying to set frequency to state %u "
199 "with command %x at port %x\n",
200 state, command, smi_port);
184 201
185 do { 202 do {
186 if (retry) { 203 if (retry) {
187 dprintk("retry %u, previous result %u, waiting...\n", retry, result); 204 dprintk("retry %u, previous result %u, waiting...\n",
205 retry, result);
188 mdelay(retry * 50); 206 mdelay(retry * 50);
189 } 207 }
190 retry++; 208 retry++;
@@ -192,20 +210,26 @@ static void speedstep_set_state (unsigned int state)
192 "push %%ebp\n" 210 "push %%ebp\n"
193 "out %%al, (%%dx)\n" 211 "out %%al, (%%dx)\n"
194 "pop %%ebp" 212 "pop %%ebp"
195 : "=b" (new_state), "=D" (result), "=c" (dummy), "=a" (dummy), 213 : "=b" (new_state), "=D" (result),
196 "=d" (dummy), "=S" (dummy) 214 "=c" (dummy), "=a" (dummy),
197 : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0) 215 "=d" (dummy), "=S" (dummy)
216 : "a" (command), "b" (function), "c" (state),
217 "d" (smi_port), "S" (0), "D" (0)
198 ); 218 );
199 } while ((new_state != state) && (retry <= SMI_TRIES)); 219 } while ((new_state != state) && (retry <= SMI_TRIES));
200 220
201 /* enable IRQs */ 221 /* enable IRQs */
202 local_irq_restore(flags); 222 local_irq_restore(flags);
203 223
204 if (new_state == state) { 224 if (new_state == state)
205 dprintk("change to %u MHz succeeded after %u tries with result %u\n", (speedstep_freqs[new_state].frequency / 1000), retry, result); 225 dprintk("change to %u MHz succeeded after %u tries "
206 } else { 226 "with result %u\n",
207 printk(KERN_ERR "cpufreq: change to state %u failed with new_state %u and result %u\n", state, new_state, result); 227 (speedstep_freqs[new_state].frequency / 1000),
208 } 228 retry, result);
229 else
230 printk(KERN_ERR "cpufreq: change to state %u "
231 "failed with new_state %u and result %u\n",
232 state, new_state, result);
209 233
210 return; 234 return;
211} 235}
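The loop above is a bounded retry with a linearly growing delay: up to SMI_TRIES extra attempts, waiting retry * 50 ms before each repeat, in case chipset DMA traffic made the SMI fail. The same shape as a user-space sketch; usleep() stands in for mdelay(), and try_set() is a hypothetical placeholder for the SMI call:

	#include <unistd.h>

	#define SMI_TRIES 5

	/* Sketch: bounded retry with linear backoff, mirroring the loop above.
	 * try_set() returns nonzero once the state change stuck. */
	static int set_state_with_retry(int (*try_set)(unsigned int),
					unsigned int state)
	{
		unsigned int retry = 0;
		int ok;

		do {
			if (retry)
				usleep(retry * 50 * 1000);	/* mdelay(retry * 50) */
			retry++;
			ok = try_set(state);
		} while (!ok && retry <= SMI_TRIES);

		return ok ? 0 : -1;
	}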
@@ -219,13 +243,14 @@ static void speedstep_set_state (unsigned int state)
219 * 243 *
220 * Sets a new CPUFreq policy/freq. 244 * Sets a new CPUFreq policy/freq.
221 */ 245 */
222static int speedstep_target (struct cpufreq_policy *policy, 246static int speedstep_target(struct cpufreq_policy *policy,
223 unsigned int target_freq, unsigned int relation) 247 unsigned int target_freq, unsigned int relation)
224{ 248{
225 unsigned int newstate = 0; 249 unsigned int newstate = 0;
226 struct cpufreq_freqs freqs; 250 struct cpufreq_freqs freqs;
227 251
228 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate)) 252 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
253 target_freq, relation, &newstate))
229 return -EINVAL; 254 return -EINVAL;
230 255
231 freqs.old = speedstep_freqs[speedstep_get_state()].frequency; 256 freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
@@ -250,7 +275,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
250 * Limit must be within speedstep_low_freq and speedstep_high_freq, with 275 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
251 * at least one border included. 276 * at least one border included.
252 */ 277 */
253static int speedstep_verify (struct cpufreq_policy *policy) 278static int speedstep_verify(struct cpufreq_policy *policy)
254{ 279{
255 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); 280 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
256} 281}
@@ -259,7 +284,8 @@ static int speedstep_verify (struct cpufreq_policy *policy)
259static int speedstep_cpu_init(struct cpufreq_policy *policy) 284static int speedstep_cpu_init(struct cpufreq_policy *policy)
260{ 285{
261 int result; 286 int result;
262 unsigned int speed,state; 287 unsigned int speed, state;
288 unsigned int *low, *high;
263 289
264 /* capability check */ 290 /* capability check */
265 if (policy->cpu != 0) 291 if (policy->cpu != 0)
@@ -272,19 +298,23 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
272 } 298 }
273 299
274 /* detect low and high frequency */ 300 /* detect low and high frequency */
275 result = speedstep_smi_get_freqs(&speedstep_freqs[SPEEDSTEP_LOW].frequency, 301 low = &speedstep_freqs[SPEEDSTEP_LOW].frequency;
276 &speedstep_freqs[SPEEDSTEP_HIGH].frequency); 302 high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency;
303
304 result = speedstep_smi_get_freqs(low, high);
277 if (result) { 305 if (result) {
 278 		/* fall back to speedstep_lib.c detection mechanism: try both states out */ 306 		/* fall back to speedstep_lib.c detection mechanism:
279 dprintk("could not detect low and high frequencies by SMI call.\n"); 307 * try both states out */
308 dprintk("could not detect low and high frequencies "
309 "by SMI call.\n");
280 result = speedstep_get_freqs(speedstep_processor, 310 result = speedstep_get_freqs(speedstep_processor,
281 &speedstep_freqs[SPEEDSTEP_LOW].frequency, 311 low, high,
282 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
283 NULL, 312 NULL,
284 &speedstep_set_state); 313 &speedstep_set_state);
285 314
286 if (result) { 315 if (result) {
287 dprintk("could not detect two different speeds -- aborting.\n"); 316 dprintk("could not detect two different speeds"
317 " -- aborting.\n");
288 return result; 318 return result;
289 } else 319 } else
290 dprintk("workaround worked.\n"); 320 dprintk("workaround worked.\n");
@@ -295,7 +325,8 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
295 speed = speedstep_freqs[state].frequency; 325 speed = speedstep_freqs[state].frequency;
296 326
297 dprintk("currently at %s speed setting - %i MHz\n", 327 dprintk("currently at %s speed setting - %i MHz\n",
298 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? "low" : "high", 328 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
329 ? "low" : "high",
299 (speed / 1000)); 330 (speed / 1000));
300 331
301 /* cpuinfo and default policy values */ 332 /* cpuinfo and default policy values */
@@ -304,7 +335,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
304 335
305 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); 336 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
306 if (result) 337 if (result)
307 return (result); 338 return result;
308 339
309 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); 340 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
310 341
@@ -321,7 +352,7 @@ static unsigned int speedstep_get(unsigned int cpu)
321{ 352{
322 if (cpu) 353 if (cpu)
323 return -ENODEV; 354 return -ENODEV;
324 return speedstep_get_processor_frequency(speedstep_processor); 355 return speedstep_get_frequency(speedstep_processor);
325} 356}
326 357
327 358
@@ -335,7 +366,7 @@ static int speedstep_resume(struct cpufreq_policy *policy)
335 return result; 366 return result;
336} 367}
337 368
338static struct freq_attr* speedstep_attr[] = { 369static struct freq_attr *speedstep_attr[] = {
339 &cpufreq_freq_attr_scaling_available_freqs, 370 &cpufreq_freq_attr_scaling_available_freqs,
340 NULL, 371 NULL,
341}; 372};
@@ -364,21 +395,23 @@ static int __init speedstep_init(void)
364 speedstep_processor = speedstep_detect_processor(); 395 speedstep_processor = speedstep_detect_processor();
365 396
366 switch (speedstep_processor) { 397 switch (speedstep_processor) {
367 case SPEEDSTEP_PROCESSOR_PIII_T: 398 case SPEEDSTEP_CPU_PIII_T:
368 case SPEEDSTEP_PROCESSOR_PIII_C: 399 case SPEEDSTEP_CPU_PIII_C:
369 case SPEEDSTEP_PROCESSOR_PIII_C_EARLY: 400 case SPEEDSTEP_CPU_PIII_C_EARLY:
370 break; 401 break;
371 default: 402 default:
372 speedstep_processor = 0; 403 speedstep_processor = 0;
373 } 404 }
374 405
375 if (!speedstep_processor) { 406 if (!speedstep_processor) {
376 dprintk ("No supported Intel CPU detected.\n"); 407 dprintk("No supported Intel CPU detected.\n");
377 return -ENODEV; 408 return -ENODEV;
378 } 409 }
379 410
380 dprintk("signature:0x%.8lx, command:0x%.8lx, event:0x%.8lx, perf_level:0x%.8lx.\n", 411 dprintk("signature:0x%.8lx, command:0x%.8lx, "
381 ist_info.signature, ist_info.command, ist_info.event, ist_info.perf_level); 412 "event:0x%.8lx, perf_level:0x%.8lx.\n",
413 ist_info.signature, ist_info.command,
414 ist_info.event, ist_info.perf_level);
382 415
383 /* Error if no IST-SMI BIOS or no PARM 416 /* Error if no IST-SMI BIOS or no PARM
384 sig= 'ISGE' aka 'Intel Speedstep Gate E' */ 417 sig= 'ISGE' aka 'Intel Speedstep Gate E' */
@@ -416,17 +449,20 @@ static void __exit speedstep_exit(void)
416 cpufreq_unregister_driver(&speedstep_driver); 449 cpufreq_unregister_driver(&speedstep_driver);
417} 450}
418 451
419module_param(smi_port, int, 0444); 452module_param(smi_port, int, 0444);
420module_param(smi_cmd, int, 0444); 453module_param(smi_cmd, int, 0444);
421module_param(smi_sig, uint, 0444); 454module_param(smi_sig, uint, 0444);
422 455
423MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value -- Intel's default setting is 0xb2"); 456MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value "
424MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value -- Intel's default setting is 0x82"); 457 "-- Intel's default setting is 0xb2");
425MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the SMI interface."); 458MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value "
459 "-- Intel's default setting is 0x82");
460MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the "
461 "SMI interface.");
426 462
427MODULE_AUTHOR ("Hiroshi Miura"); 463MODULE_AUTHOR("Hiroshi Miura");
428MODULE_DESCRIPTION ("Speedstep driver for IST applet SMI interface."); 464MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface.");
429MODULE_LICENSE ("GPL"); 465MODULE_LICENSE("GPL");
430 466
431module_init(speedstep_init); 467module_init(speedstep_init);
432module_exit(speedstep_exit); 468module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index ffd0f5ed071a..593171e967ef 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -61,23 +61,23 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
61 */ 61 */
62static unsigned char Cx86_dir0_msb __cpuinitdata = 0; 62static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
63 63
64static char Cx86_model[][9] __cpuinitdata = { 64static const char __cpuinitconst Cx86_model[][9] = {
65 "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", 65 "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
66 "M II ", "Unknown" 66 "M II ", "Unknown"
67}; 67};
68static char Cx486_name[][5] __cpuinitdata = { 68static const char __cpuinitconst Cx486_name[][5] = {
69 "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", 69 "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
70 "SRx2", "DRx2" 70 "SRx2", "DRx2"
71}; 71};
72static char Cx486S_name[][4] __cpuinitdata = { 72static const char __cpuinitconst Cx486S_name[][4] = {
73 "S", "S2", "Se", "S2e" 73 "S", "S2", "Se", "S2e"
74}; 74};
75static char Cx486D_name[][4] __cpuinitdata = { 75static const char __cpuinitconst Cx486D_name[][4] = {
76 "DX", "DX2", "?", "?", "?", "DX4" 76 "DX", "DX2", "?", "?", "?", "DX4"
77}; 77};
78static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock"; 78static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
79static char cyrix_model_mult1[] __cpuinitdata = "12??43"; 79static const char __cpuinitconst cyrix_model_mult1[] = "12??43";
80static char cyrix_model_mult2[] __cpuinitdata = "12233445"; 80static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
81 81
82/* 82/*
83 * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old 83 * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -435,7 +435,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
435 } 435 }
436} 436}
437 437
438static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { 438static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
439 .c_vendor = "Cyrix", 439 .c_vendor = "Cyrix",
440 .c_ident = { "CyrixInstead" }, 440 .c_ident = { "CyrixInstead" },
441 .c_early_init = early_init_cyrix, 441 .c_early_init = early_init_cyrix,
@@ -446,7 +446,7 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
446 446
447cpu_dev_register(cyrix_cpu_dev); 447cpu_dev_register(cyrix_cpu_dev);
448 448
449static struct cpu_dev nsc_cpu_dev __cpuinitdata = { 449static const struct cpu_dev __cpuinitconst nsc_cpu_dev = {
450 .c_vendor = "NSC", 450 .c_vendor = "NSC",
451 .c_ident = { "Geode by NSC" }, 451 .c_ident = { "Geode by NSC" },
452 .c_init = init_nsc, 452 .c_init = init_nsc,
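The cyrix.c hunks are a pure annotation change: read-only boot-time tables move from __cpuinitdata (writable init data) to const plus __cpuinitconst, so they land in an init rodata section and can be discarded once CPU bringup is done. A rough stand-in for what such a marker expands to, assuming a section name in the spirit of the 2.6.29-era definition (illustrative, not taken from this diff; the real macro lives in <linux/init.h>):

	/* Sketch: a const init-section marker. Section name is illustrative. */
	#define my_cpuinitconst __attribute__((__section__(".cpuinit.rodata")))

	static const char my_cpuinitconst demo_names[][9] = {
		"Cx486", "5x86 ", "6x86",
	};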
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 24ff26a38ade..7437fa133c02 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -4,6 +4,7 @@
4#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/bitops.h> 5#include <linux/bitops.h>
6#include <linux/smp.h> 6#include <linux/smp.h>
7#include <linux/sched.h>
7#include <linux/thread_info.h> 8#include <linux/thread_info.h>
8#include <linux/module.h> 9#include <linux/module.h>
9 10
@@ -13,6 +14,7 @@
13#include <asm/uaccess.h> 14#include <asm/uaccess.h>
14#include <asm/ds.h> 15#include <asm/ds.h>
15#include <asm/bugs.h> 16#include <asm/bugs.h>
17#include <asm/cpu.h>
16 18
17#ifdef CONFIG_X86_64 19#ifdef CONFIG_X86_64
18#include <asm/topology.h> 20#include <asm/topology.h>
@@ -24,7 +26,6 @@
24#ifdef CONFIG_X86_LOCAL_APIC 26#ifdef CONFIG_X86_LOCAL_APIC
25#include <asm/mpspec.h> 27#include <asm/mpspec.h>
26#include <asm/apic.h> 28#include <asm/apic.h>
27#include <mach_apic.h>
28#endif 29#endif
29 30
30static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 31static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
@@ -54,15 +55,37 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
54 c->x86_cache_alignment = 128; 55 c->x86_cache_alignment = 128;
55#endif 56#endif
56 57
58 /* CPUID workaround for 0F33/0F34 CPU */
59 if (c->x86 == 0xF && c->x86_model == 0x3
60 && (c->x86_mask == 0x3 || c->x86_mask == 0x4))
61 c->x86_phys_bits = 36;
62
57 /* 63 /*
58 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate 64 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
59 * with P/T states and does not stop in deep C-states 65 * with P/T states and does not stop in deep C-states.
66 *
67 * It is also reliable across cores and sockets. (but not across
68 * cabinets - we turn it off in that case explicitly.)
60 */ 69 */
61 if (c->x86_power & (1 << 8)) { 70 if (c->x86_power & (1 << 8)) {
62 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
63 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
73 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
74 sched_clock_stable = 1;
64 } 75 }
65 76
77 /*
78 * There is a known erratum on Pentium III and Core Solo
79 * and Core Duo CPUs.
80 * " Page with PAT set to WC while associated MTRR is UC
81 * may consolidate to UC "
82 * Because of this erratum, it is better to stick with
83 * setting WC in MTRR rather than using PAT on these CPUs.
84 *
85 * Enable PAT WC only on P4, Core 2 or later CPUs.
86 */
87 if (c->x86 == 6 && c->x86_model < 15)
88 clear_cpu_cap(c, X86_FEATURE_PAT);
66} 89}
67 90
68#ifdef CONFIG_X86_32 91#ifdef CONFIG_X86_32
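The TSC hunk keys off CPUID leaf 0x80000007: EDX bit 8 advertises a TSC that ticks at a constant rate through P-/T-state changes and deep C-states, which is what lets the patch mark it reliable and set sched_clock_stable. The same bit can be inspected from user space; a small sketch using GCC's <cpuid.h> helper:

	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		/* Leaf 0x80000007, EDX bit 8: invariant ("constant") TSC. */
		if (__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx))
			printf("constant TSC: %s\n",
			       (edx & (1u << 8)) ? "yes" : "no");
		else
			printf("leaf 0x80000007 not supported\n");
		return 0;
	}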
@@ -99,6 +122,28 @@ static void __cpuinit trap_init_f00f_bug(void)
99} 122}
100#endif 123#endif
101 124
125static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
126{
127#ifdef CONFIG_SMP
 128	/* called from identify_secondary_cpu()? */
129 if (c->cpu_index == boot_cpu_id)
130 return;
131
132 /*
133 * Mask B, Pentium, but not Pentium MMX
134 */
135 if (c->x86 == 5 &&
136 c->x86_mask >= 1 && c->x86_mask <= 4 &&
137 c->x86_model <= 3) {
138 /*
139 * Remember we have B step Pentia with bugs
140 */
 141 		WARN_ONCE(1, "WARNING: SMP operation may be unreliable "
142 "with B stepping processors.\n");
143 }
144#endif
145}
146
102static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) 147static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
103{ 148{
104 unsigned long lo, hi; 149 unsigned long lo, hi;
@@ -135,10 +180,10 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
135 */ 180 */
136 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { 181 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
137 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); 182 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
138 if ((lo & (1<<9)) == 0) { 183 if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) {
139 printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); 184 printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
140 printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); 185 printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
141 lo |= (1<<9); /* Disable hw prefetching */ 186 lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
142 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); 187 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
143 } 188 }
144 } 189 }
@@ -175,6 +220,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
175#ifdef CONFIG_X86_NUMAQ 220#ifdef CONFIG_X86_NUMAQ
176 numaq_tsc_disable(); 221 numaq_tsc_disable();
177#endif 222#endif
223
224 intel_smp_check(c);
178} 225}
179#else 226#else
180static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) 227static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
@@ -374,7 +421,7 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
374} 421}
375#endif 422#endif
376 423
377static struct cpu_dev intel_cpu_dev __cpuinitdata = { 424static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
378 .c_vendor = "Intel", 425 .c_vendor = "Intel",
379 .c_ident = { "GenuineIntel" }, 426 .c_ident = { "GenuineIntel" },
380#ifdef CONFIG_X86_32 427#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index da299eb85fc0..483eda96e102 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -32,7 +32,7 @@ struct _cache_table
32}; 32};
33 33
34/* all the cache descriptor types we care about (no TLB or trace cache entries) */ 34/* all the cache descriptor types we care about (no TLB or trace cache entries) */
35static struct _cache_table cache_table[] __cpuinitdata = 35static const struct _cache_table __cpuinitconst cache_table[] =
36{ 36{
37 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ 37 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
38 { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ 38 { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
@@ -147,10 +147,19 @@ struct _cpuid4_info {
147 union _cpuid4_leaf_ecx ecx; 147 union _cpuid4_leaf_ecx ecx;
148 unsigned long size; 148 unsigned long size;
149 unsigned long can_disable; 149 unsigned long can_disable;
150 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ 150 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
151}; 151};
152 152
153#ifdef CONFIG_PCI 153/* subset of above _cpuid4_info w/o shared_cpu_map */
154struct _cpuid4_info_regs {
155 union _cpuid4_leaf_eax eax;
156 union _cpuid4_leaf_ebx ebx;
157 union _cpuid4_leaf_ecx ecx;
158 unsigned long size;
159 unsigned long can_disable;
160};
161
162#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS)
154static struct pci_device_id k8_nb_id[] = { 163static struct pci_device_id k8_nb_id[] = {
155 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, 164 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
156 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, 165 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
@@ -197,15 +206,15 @@ union l3_cache {
197 unsigned val; 206 unsigned val;
198}; 207};
199 208
200static unsigned short assocs[] __cpuinitdata = { 209static const unsigned short __cpuinitconst assocs[] = {
201 [1] = 1, [2] = 2, [4] = 4, [6] = 8, 210 [1] = 1, [2] = 2, [4] = 4, [6] = 8,
202 [8] = 16, [0xa] = 32, [0xb] = 48, 211 [8] = 16, [0xa] = 32, [0xb] = 48,
203 [0xc] = 64, 212 [0xc] = 64,
204 [0xf] = 0xffff // ?? 213 [0xf] = 0xffff // ??
205}; 214};
206 215
207static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; 216static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
208static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; 217static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 };
209 218
210static void __cpuinit 219static void __cpuinit
211amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, 220amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
@@ -278,7 +287,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
278} 287}
279 288
280static void __cpuinit 289static void __cpuinit
281amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf) 290amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
282{ 291{
283 if (index < 3) 292 if (index < 3)
284 return; 293 return;
@@ -286,7 +295,8 @@ amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
286} 295}
287 296
288static int 297static int
289__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) 298__cpuinit cpuid4_cache_lookup_regs(int index,
299 struct _cpuid4_info_regs *this_leaf)
290{ 300{
291 union _cpuid4_leaf_eax eax; 301 union _cpuid4_leaf_eax eax;
292 union _cpuid4_leaf_ebx ebx; 302 union _cpuid4_leaf_ebx ebx;
@@ -353,11 +363,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
353 * parameters cpuid leaf to find the cache details 363 * parameters cpuid leaf to find the cache details
354 */ 364 */
355 for (i = 0; i < num_cache_leaves; i++) { 365 for (i = 0; i < num_cache_leaves; i++) {
356 struct _cpuid4_info this_leaf; 366 struct _cpuid4_info_regs this_leaf;
357
358 int retval; 367 int retval;
359 368
360 retval = cpuid4_cache_lookup(i, &this_leaf); 369 retval = cpuid4_cache_lookup_regs(i, &this_leaf);
361 if (retval >= 0) { 370 if (retval >= 0) {
362 switch(this_leaf.eax.split.level) { 371 switch(this_leaf.eax.split.level) {
363 case 1: 372 case 1:
@@ -490,6 +499,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
490 return l2; 499 return l2;
491} 500}
492 501
502#ifdef CONFIG_SYSFS
503
493/* pointer to _cpuid4_info array (for each cache leaf) */ 504/* pointer to _cpuid4_info array (for each cache leaf) */
494static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); 505static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
495#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) 506#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y]))
@@ -506,17 +517,20 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
506 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; 517 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
507 518
508 if (num_threads_sharing == 1) 519 if (num_threads_sharing == 1)
509 cpu_set(cpu, this_leaf->shared_cpu_map); 520 cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
510 else { 521 else {
511 index_msb = get_count_order(num_threads_sharing); 522 index_msb = get_count_order(num_threads_sharing);
512 523
513 for_each_online_cpu(i) { 524 for_each_online_cpu(i) {
514 if (cpu_data(i).apicid >> index_msb == 525 if (cpu_data(i).apicid >> index_msb ==
515 c->apicid >> index_msb) { 526 c->apicid >> index_msb) {
516 cpu_set(i, this_leaf->shared_cpu_map); 527 cpumask_set_cpu(i,
528 to_cpumask(this_leaf->shared_cpu_map));
517 if (i != cpu && per_cpu(cpuid4_info, i)) { 529 if (i != cpu && per_cpu(cpuid4_info, i)) {
518 sibling_leaf = CPUID4_INFO_IDX(i, index); 530 sibling_leaf =
519 cpu_set(cpu, sibling_leaf->shared_cpu_map); 531 CPUID4_INFO_IDX(i, index);
532 cpumask_set_cpu(cpu, to_cpumask(
533 sibling_leaf->shared_cpu_map));
520 } 534 }
521 } 535 }
522 } 536 }
@@ -528,9 +542,10 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
528 int sibling; 542 int sibling;
529 543
530 this_leaf = CPUID4_INFO_IDX(cpu, index); 544 this_leaf = CPUID4_INFO_IDX(cpu, index);
531 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { 545 for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
532 sibling_leaf = CPUID4_INFO_IDX(sibling, index); 546 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
533 cpu_clear(cpu, sibling_leaf->shared_cpu_map); 547 cpumask_clear_cpu(cpu,
548 to_cpumask(sibling_leaf->shared_cpu_map));
534 } 549 }
535} 550}
536#else 551#else
@@ -549,6 +564,15 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
549 per_cpu(cpuid4_info, cpu) = NULL; 564 per_cpu(cpuid4_info, cpu) = NULL;
550} 565}
551 566
567static int
568__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
569{
570 struct _cpuid4_info_regs *leaf_regs =
571 (struct _cpuid4_info_regs *)this_leaf;
572
573 return cpuid4_cache_lookup_regs(index, leaf_regs);
574}
575
552static void __cpuinit get_cpu_leaves(void *_retval) 576static void __cpuinit get_cpu_leaves(void *_retval)
553{ 577{
554 int j, *retval = _retval, cpu = smp_processor_id(); 578 int j, *retval = _retval, cpu = smp_processor_id();
@@ -590,8 +614,6 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
590 return retval; 614 return retval;
591} 615}
592 616
593#ifdef CONFIG_SYSFS
594
595#include <linux/kobject.h> 617#include <linux/kobject.h>
596#include <linux/sysfs.h> 618#include <linux/sysfs.h>
597 619
@@ -635,8 +657,9 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
635 int n = 0; 657 int n = 0;
636 658
637 if (len > 1) { 659 if (len > 1) {
638 cpumask_t *mask = &this_leaf->shared_cpu_map; 660 const struct cpumask *mask;
639 661
662 mask = to_cpumask(this_leaf->shared_cpu_map);
640 n = type? 663 n = type?
641 cpulist_scnprintf(buf, len-2, mask) : 664 cpulist_scnprintf(buf, len-2, mask) :
642 cpumask_scnprintf(buf, len-2, mask); 665 cpumask_scnprintf(buf, len-2, mask);
@@ -699,7 +722,8 @@ static struct pci_dev *get_k8_northbridge(int node)
699 722
700static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) 723static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
701{ 724{
702 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); 725 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
726 int node = cpu_to_node(cpumask_first(mask));
703 struct pci_dev *dev = NULL; 727 struct pci_dev *dev = NULL;
704 ssize_t ret = 0; 728 ssize_t ret = 0;
705 int i; 729 int i;
@@ -733,7 +757,8 @@ static ssize_t
733store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, 757store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
734 size_t count) 758 size_t count)
735{ 759{
736 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); 760 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
761 int node = cpu_to_node(cpumask_first(mask));
737 struct pci_dev *dev = NULL; 762 struct pci_dev *dev = NULL;
738 unsigned int ret, index, val; 763 unsigned int ret, index, val;
739 764
@@ -878,7 +903,7 @@ err_out:
878 return -ENOMEM; 903 return -ENOMEM;
879} 904}
880 905
881static cpumask_t cache_dev_map = CPU_MASK_NONE; 906static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
882 907
883/* Add/Remove cache interface for CPU device */ 908/* Add/Remove cache interface for CPU device */
884static int __cpuinit cache_add_dev(struct sys_device * sys_dev) 909static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
@@ -918,7 +943,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
918 } 943 }
919 kobject_uevent(&(this_object->kobj), KOBJ_ADD); 944 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
920 } 945 }
921 cpu_set(cpu, cache_dev_map); 946 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
922 947
923 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 948 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
924 return 0; 949 return 0;
@@ -931,9 +956,9 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
931 956
932 if (per_cpu(cpuid4_info, cpu) == NULL) 957 if (per_cpu(cpuid4_info, cpu) == NULL)
933 return; 958 return;
934 if (!cpu_isset(cpu, cache_dev_map)) 959 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
935 return; 960 return;
936 cpu_clear(cpu, cache_dev_map); 961 cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
937 962
938 for (i = 0; i < num_cache_leaves; i++) 963 for (i = 0; i < num_cache_leaves; i++)
939 kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); 964 kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323bbb69..b2f89829bbe8 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o 4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o 5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o 6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
7obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633e..3552119b091d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
60 } 60 }
61} 61}
62 62
63static unsigned long old_cr4 __initdata;
64
65void __init stop_mce(void)
66{
67 old_cr4 = read_cr4();
68 clear_in_cr4(X86_CR4_MCE);
69}
70
71void __init restart_mce(void)
72{
73 if (old_cr4 & X86_CR4_MCE)
74 set_in_cr4(X86_CR4_MCE);
75}
76
77static int __init mcheck_disable(char *str) 63static int __init mcheck_disable(char *str)
78{ 64{
79 mce_disabled = 1; 65 mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 1c838032fd37..863f89568b1a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s). 4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it. 5 * 2004 Andi Kleen. Rewrote most of it.
6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen
6 */ 8 */
7 9
8#include <linux/init.h> 10#include <linux/init.h>
@@ -24,6 +26,9 @@
24#include <linux/ctype.h> 26#include <linux/ctype.h>
25#include <linux/kmod.h> 27#include <linux/kmod.h>
26#include <linux/kdebug.h> 28#include <linux/kdebug.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31#include <linux/ratelimit.h>
27#include <asm/processor.h> 32#include <asm/processor.h>
28#include <asm/msr.h> 33#include <asm/msr.h>
29#include <asm/mce.h> 34#include <asm/mce.h>
@@ -32,7 +37,6 @@
32#include <asm/idle.h> 37#include <asm/idle.h>
33 38
34#define MISC_MCELOG_MINOR 227 39#define MISC_MCELOG_MINOR 227
35#define NR_SYSFS_BANKS 6
36 40
37atomic_t mce_entry; 41atomic_t mce_entry;
38 42
@@ -47,7 +51,7 @@ static int mce_dont_init;
47 */ 51 */
48static int tolerant = 1; 52static int tolerant = 1;
49static int banks; 53static int banks;
50static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; 54static u64 *bank;
51static unsigned long notify_user; 55static unsigned long notify_user;
52static int rip_msr; 56static int rip_msr;
53static int mce_bootlog = -1; 57static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
58 62
59static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 63static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
60 64
65/* MCA banks polled by the period polling timer for corrected events */
66DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
68};
69
70/* Do initial initialization of a struct mce */
71void mce_setup(struct mce *m)
72{
73 memset(m, 0, sizeof(struct mce));
74 m->cpu = smp_processor_id();
75 rdtscll(m->tsc);
76}
77
61/* 78/*
62 * Lockless MCE logging infrastructure. 79 * Lockless MCE logging infrastructure.
63 * This avoids deadlocks on printk locks without having to break locks. Also 80 * This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
119 print_symbol("{%s}", m->ip); 136 print_symbol("{%s}", m->ip);
120 printk("\n"); 137 printk("\n");
121 } 138 }
122 printk(KERN_EMERG "TSC %Lx ", m->tsc); 139 printk(KERN_EMERG "TSC %llx ", m->tsc);
123 if (m->addr) 140 if (m->addr)
124 printk("ADDR %Lx ", m->addr); 141 printk("ADDR %llx ", m->addr);
125 if (m->misc) 142 if (m->misc)
126 printk("MISC %Lx ", m->misc); 143 printk("MISC %llx ", m->misc);
127 printk("\n"); 144 printk("\n");
128 printk(KERN_EMERG "This is not a software problem!\n"); 145 printk(KERN_EMERG "This is not a software problem!\n");
129 printk(KERN_EMERG "Run through mcelog --ascii to decode " 146 printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
149 panic(msg); 166 panic(msg);
150} 167}
151 168
152static int mce_available(struct cpuinfo_x86 *c) 169int mce_available(struct cpuinfo_x86 *c)
153{ 170{
171 if (mce_dont_init)
172 return 0;
154 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
155} 174}
156 175
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
172} 191}
173 192
174/* 193/*
175 * The actual machine check handler 194 * Poll for corrected events or events that happened before reset.
195 * Those are just logged through /dev/mcelog.
196 *
197 * This is executed in standard interrupt context.
198 */
199void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
200{
201 struct mce m;
202 int i;
203
204 mce_setup(&m);
205
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 for (i = 0; i < banks; i++) {
208 if (!bank[i] || !test_bit(i, *b))
209 continue;
210
211 m.misc = 0;
212 m.addr = 0;
213 m.bank = i;
214 m.tsc = 0;
215
216 barrier();
217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
218 if (!(m.status & MCI_STATUS_VAL))
219 continue;
220
221 /*
222 * Uncorrected events are handled by the exception handler
 222 	 * when it is enabled. But when the exception is disabled, log
224 * everything.
225 *
226 * TBD do the same check for MCI_STATUS_EN here?
227 */
228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
229 continue;
230
231 if (m.status & MCI_STATUS_MISCV)
232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
233 if (m.status & MCI_STATUS_ADDRV)
234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235
236 if (!(flags & MCP_TIMESTAMP))
237 m.tsc = 0;
238 /*
239 * Don't get the IP here because it's unlikely to
240 * have anything to do with the actual error location.
241 */
242
243 mce_log(&m);
244 add_taint(TAINT_MACHINE_CHECK);
245
246 /*
247 * Clear state for this bank.
248 */
249 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
250 }
251
252 /*
253 * Don't clear MCG_STATUS here because it's only defined for
254 * exceptions.
255 */
256}
257
258/*
259 * The actual machine check handler. This only handles real
260 * exceptions when something got corrupted coming in through int 18.
261 *
262 * This is executed in NMI context not subject to normal locking rules. This
263 * implies that most kernel services cannot be safely used. Don't even
264 * think about putting a printk in there!
176 */ 265 */
177void do_machine_check(struct pt_regs * regs, long error_code) 266void do_machine_check(struct pt_regs * regs, long error_code)
178{ 267{
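machine_check_poll() above is the new corrected-error path: it walks the enabled banks from a caller-supplied mce_banks_t, logs valid non-UC records, and clears only the per-bank status registers, never MCG_STATUS. A hedged sketch of a caller, assuming the periodic-timer wiring that this hunk does not show (the name mce_timer_fn is illustrative; mce_poll_banks is the per-CPU bitmap declared earlier in the patch):

	/* Sketch only: how a per-CPU poll tick might invoke the routine above.
	 * The timer setup itself is outside this hunk. */
	static void mce_timer_fn(unsigned long data)
	{
		/* Timestamped poll over the banks this CPU polls. */
		machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks));
	}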
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
190 * error. 279 * error.
191 */ 280 */
192 int kill_it = 0; 281 int kill_it = 0;
282 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
193 283
194 atomic_inc(&mce_entry); 284 atomic_inc(&mce_entry);
195 285
196 if ((regs 286 if (notify_die(DIE_NMI, "machine check", regs, error_code,
197 && notify_die(DIE_NMI, "machine check", regs, error_code,
198 18, SIGKILL) == NOTIFY_STOP) 287 18, SIGKILL) == NOTIFY_STOP)
199 || !banks) 288 goto out2;
289 if (!banks)
200 goto out2; 290 goto out2;
201 291
202 memset(&m, 0, sizeof(struct mce)); 292 mce_setup(&m);
203 m.cpu = smp_processor_id(); 293
204 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); 294 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
205 /* if the restart IP is not valid, we're done for */ 295 /* if the restart IP is not valid, we're done for */
206 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 296 if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
210 barrier(); 300 barrier();
211 301
212 for (i = 0; i < banks; i++) { 302 for (i = 0; i < banks; i++) {
213 if (i < NR_SYSFS_BANKS && !bank[i]) 303 __clear_bit(i, toclear);
304 if (!bank[i])
214 continue; 305 continue;
215 306
216 m.misc = 0; 307 m.misc = 0;
217 m.addr = 0; 308 m.addr = 0;
218 m.bank = i; 309 m.bank = i;
219 m.tsc = 0;
220 310
221 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); 311 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
222 if ((m.status & MCI_STATUS_VAL) == 0) 312 if ((m.status & MCI_STATUS_VAL) == 0)
223 continue; 313 continue;
224 314
315 /*
 316 		 * Non-uncorrected errors are handled by machine_check_poll.
317 * Leave them alone.
318 */
319 if ((m.status & MCI_STATUS_UC) == 0)
320 continue;
321
322 /*
323 * Set taint even when machine check was not enabled.
324 */
325 add_taint(TAINT_MACHINE_CHECK);
326
327 __set_bit(i, toclear);
328
225 if (m.status & MCI_STATUS_EN) { 329 if (m.status & MCI_STATUS_EN) {
226 /* if PCC was set, there's no way out */ 330 /* if PCC was set, there's no way out */
227 no_way_out |= !!(m.status & MCI_STATUS_PCC); 331 no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
235 no_way_out = 1; 339 no_way_out = 1;
236 kill_it = 1; 340 kill_it = 1;
237 } 341 }
342 } else {
343 /*
344 * Machine check event was not enabled. Clear, but
345 * ignore.
346 */
347 continue;
238 } 348 }
239 349
240 if (m.status & MCI_STATUS_MISCV) 350 if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
243 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); 353 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
244 354
245 mce_get_rip(&m, regs); 355 mce_get_rip(&m, regs);
246 if (error_code >= 0) 356 mce_log(&m);
247 rdtscll(m.tsc);
248 if (error_code != -2)
249 mce_log(&m);
250 357
251 /* Did this bank cause the exception? */ 358 /* Did this bank cause the exception? */
252 /* Assume that the bank with uncorrectable errors did it, 359 /* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
255 panicm = m; 362 panicm = m;
256 panicm_found = 1; 363 panicm_found = 1;
257 } 364 }
258
259 add_taint(TAINT_MACHINE_CHECK);
260 } 365 }
261 366
262 /* Never do anything final in the polling timer */
263 if (!regs)
264 goto out;
265
266 /* If we didn't find an uncorrectable error, pick 367 /* If we didn't find an uncorrectable error, pick
267 the last one (shouldn't happen, just being safe). */ 368 the last one (shouldn't happen, just being safe). */
268 if (!panicm_found) 369 if (!panicm_found)
@@ -295,11 +396,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
295 * If we know that the error was in user space, send a 396 * If we know that the error was in user space, send a
296 * SIGBUS. Otherwise, panic if tolerance is low. 397 * SIGBUS. Otherwise, panic if tolerance is low.
297 * 398 *
298 * do_exit() takes an awful lot of locks and has a slight 399 * force_sig() takes an awful lot of locks and has a slight
299 * risk of deadlocking. 400 * risk of deadlocking.
300 */ 401 */
301 if (user_space) { 402 if (user_space) {
302 do_exit(SIGBUS); 403 force_sig(SIGBUS, current);
303 } else if (panic_on_oops || tolerant < 2) { 404 } else if (panic_on_oops || tolerant < 2) {
304 mce_panic("Uncorrected machine check", 405 mce_panic("Uncorrected machine check",
305 &panicm, mcestart); 406 &panicm, mcestart);
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
309 /* notify userspace ASAP */ 410 /* notify userspace ASAP */
310 set_thread_flag(TIF_MCE_NOTIFY); 411 set_thread_flag(TIF_MCE_NOTIFY);
311 412
312 out:
313 /* the last thing we do is clear state */ 413 /* the last thing we do is clear state */
314 for (i = 0; i < banks; i++) 414 for (i = 0; i < banks; i++) {
315 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 415 if (test_bit(i, toclear))
416 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
417 }
316 wrmsrl(MSR_IA32_MCG_STATUS, 0); 418 wrmsrl(MSR_IA32_MCG_STATUS, 0);
317 out2: 419 out2:
318 atomic_dec(&mce_entry); 420 atomic_dec(&mce_entry);
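The DECLARE_BITMAP(toclear, MAX_NR_BANKS) introduced in this hunk is the key change: the exception handler now clears only the bank STATUS registers it actually consumed, so events belonging to the polling path stay latched. A standalone sketch of the idea (the bank count and bit helpers are stand-ins for the kernel's, kept deliberately simple):

    #include <stdio.h>
    #include <string.h>

    #define MAX_NR_BANKS 128
    #define BITS_PER_LONG (8 * sizeof(unsigned long))
    #define BITMAP_LONGS ((MAX_NR_BANKS + BITS_PER_LONG - 1) / BITS_PER_LONG)

    static void set_bit(int i, unsigned long *map)
    {
        map[i / BITS_PER_LONG] |= 1UL << (i % BITS_PER_LONG);
    }

    static int test_bit(int i, const unsigned long *map)
    {
        return (map[i / BITS_PER_LONG] >> (i % BITS_PER_LONG)) & 1;
    }

    int main(void)
    {
        unsigned long toclear[BITMAP_LONGS];
        int i;

        memset(toclear, 0, sizeof(toclear));
        set_bit(3, toclear);    /* pretend banks 3 and 65 held UC events */
        set_bit(65, toclear);

        /* the "last thing we do is clear state" loop, clearing only owned banks */
        for (i = 0; i < MAX_NR_BANKS; i++)
            if (test_bit(i, toclear))
                printf("would clear MC%d_STATUS\n", i);
        return 0;
    }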
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
332 * and historically has been the register value of the 434 * and historically has been the register value of the
333 * MSR_IA32_THERMAL_STATUS (Intel) msr. 435 * MSR_IA32_THERMAL_STATUS (Intel) msr.
334 */ 436 */
335void mce_log_therm_throt_event(unsigned int cpu, __u64 status) 437void mce_log_therm_throt_event(__u64 status)
336{ 438{
337 struct mce m; 439 struct mce m;
338 440
339 memset(&m, 0, sizeof(m)); 441 mce_setup(&m);
340 m.cpu = cpu;
341 m.bank = MCE_THERMAL_BANK; 442 m.bank = MCE_THERMAL_BANK;
342 m.status = status; 443 m.status = status;
343 rdtscll(m.tsc);
344 mce_log(&m); 444 mce_log(&m);
345} 445}
346#endif /* CONFIG_X86_MCE_INTEL */ 446#endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
353 453
354static int check_interval = 5 * 60; /* 5 minutes */ 454static int check_interval = 5 * 60; /* 5 minutes */
355static int next_interval; /* in jiffies */ 455static int next_interval; /* in jiffies */
356static void mcheck_timer(struct work_struct *work); 456static void mcheck_timer(unsigned long);
357static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); 457static DEFINE_PER_CPU(struct timer_list, mce_timer);
358 458
359static void mcheck_check_cpu(void *info) 459static void mcheck_timer(unsigned long data)
360{ 460{
361 if (mce_available(&current_cpu_data)) 461 struct timer_list *t = &per_cpu(mce_timer, data);
362 do_machine_check(NULL, 0);
363}
364 462
365static void mcheck_timer(struct work_struct *work) 463 WARN_ON(smp_processor_id() != data);
366{ 464
367 on_each_cpu(mcheck_check_cpu, NULL, 1); 465 if (mce_available(&current_cpu_data))
466 machine_check_poll(MCP_TIMESTAMP,
467 &__get_cpu_var(mce_poll_banks));
368 468
369 /* 469 /*
370 * Alert userspace if needed. If we logged an MCE, reduce the 470 * Alert userspace if needed. If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
377 (int)round_jiffies_relative(check_interval*HZ)); 477 (int)round_jiffies_relative(check_interval*HZ));
378 } 478 }
379 479
380 schedule_delayed_work(&mcheck_work, next_interval); 480 t->expires = jiffies + next_interval;
481 add_timer(t);
482}
483
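The per-CPU timer above re-arms itself with a variable next_interval: polling speeds up while events keep arriving and decays back toward check_interval when things are quiet. A rough standalone model of that policy (the halving/doubling step and the floor value are illustrative assumptions, not lifted from the patch):

    #include <stdio.h>

    #define HZ 1000

    static long adapt(long cur, int saw_event, long floor, long ceiling)
    {
        if (saw_event)
            cur /= 2;           /* speed up while errors keep arriving */
        else
            cur *= 2;           /* otherwise decay back to the slow rate */
        if (cur < floor)   cur = floor;
        if (cur > ceiling) cur = ceiling;
        return cur;
    }

    int main(void)
    {
        long ceiling = 5 * 60 * HZ, floor = HZ;   /* check_interval as the cap */
        long next_interval = ceiling;

        next_interval = adapt(next_interval, 1, floor, ceiling);
        printf("after an event: %ld jiffies\n", next_interval);
        next_interval = adapt(next_interval, 0, floor, ceiling);
        printf("after a quiet pass: %ld jiffies\n", next_interval);
        return 0;
    }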
484static void mce_do_trigger(struct work_struct *work)
485{
486 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
381} 487}
382 488
489static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
490
383/* 491/*
384 * This is only called from process context. This is where we do 492 * Notify the user(s) about new machine check events.
385 * anything we need to alert userspace about new MCEs. This is called 493 * Can be called from interrupt context, but not from machine check/NMI
386 * directly from the poller and also from entry.S and idle, thanks to 494 * context.
387 * TIF_MCE_NOTIFY.
388 */ 495 */
389int mce_notify_user(void) 496int mce_notify_user(void)
390{ 497{
498 /* Not more than two messages every minute */
499 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
500
391 clear_thread_flag(TIF_MCE_NOTIFY); 501 clear_thread_flag(TIF_MCE_NOTIFY);
392 if (test_and_clear_bit(0, &notify_user)) { 502 if (test_and_clear_bit(0, &notify_user)) {
393 static unsigned long last_print;
394 unsigned long now = jiffies;
395
396 wake_up_interruptible(&mce_wait); 503 wake_up_interruptible(&mce_wait);
397 if (trigger[0])
398 call_usermodehelper(trigger, trigger_argv, NULL,
399 UMH_NO_WAIT);
400 504
401 if (time_after_eq(now, last_print + (check_interval*HZ))) { 505 /*
402 last_print = now; 506 * There is no risk of missing notifications because
507 * work_pending is always cleared before the function is
508 * executed.
509 */
510 if (trigger[0] && !work_pending(&mce_trigger_work))
511 schedule_work(&mce_trigger_work);
512
513 if (__ratelimit(&ratelimit))
403 printk(KERN_INFO "Machine check events logged\n"); 514 printk(KERN_INFO "Machine check events logged\n");
404 }
405 515
406 return 1; 516 return 1;
407 } 517 }
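The DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2) used above caps console noise at two messages per minute, replacing the old hand-rolled last_print check. A simplified user-space model of that interval/burst behaviour (a sketch, not the kernel's ratelimit implementation):

    #include <stdio.h>
    #include <time.h>

    struct ratelimit { time_t begin; int interval; int burst; int printed; };

    static int ratelimit_ok(struct ratelimit *rs, time_t now)
    {
        if (now - rs->begin >= rs->interval) {  /* new window: reset the count */
            rs->begin = now;
            rs->printed = 0;
        }
        return rs->printed++ < rs->burst;       /* allow at most 'burst' per window */
    }

    int main(void)
    {
        struct ratelimit rs = { .interval = 60, .burst = 2 };
        time_t t = time(NULL);
        for (int i = 0; i < 4; i++)
            printf("msg %d: %s\n", i, ratelimit_ok(&rs, t) ? "logged" : "suppressed");
        return 0;
    }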
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
425 535
426static __init int periodic_mcheck_init(void) 536static __init int periodic_mcheck_init(void)
427{ 537{
428 next_interval = check_interval * HZ; 538 idle_notifier_register(&mce_idle_notifier);
429 if (next_interval) 539 return 0;
430 schedule_delayed_work(&mcheck_work,
431 round_jiffies_relative(next_interval));
432 idle_notifier_register(&mce_idle_notifier);
433 return 0;
434} 540}
435__initcall(periodic_mcheck_init); 541__initcall(periodic_mcheck_init);
436 542
437
438/* 543/*
439 * Initialize Machine Checks for a CPU. 544 * Initialize Machine Checks for a CPU.
440 */ 545 */
441static void mce_init(void *dummy) 546static int mce_cap_init(void)
442{ 547{
443 u64 cap; 548 u64 cap;
444 int i; 549 unsigned b;
445 550
446 rdmsrl(MSR_IA32_MCG_CAP, cap); 551 rdmsrl(MSR_IA32_MCG_CAP, cap);
447 banks = cap & 0xff; 552 b = cap & 0xff;
448 if (banks > MCE_EXTENDED_BANK) { 553 if (b > MAX_NR_BANKS) {
449 banks = MCE_EXTENDED_BANK; 554 printk(KERN_WARNING
450 printk(KERN_INFO "MCE: warning: using only %d banks\n", 555 "MCE: Using only %u machine check banks out of %u\n",
451 MCE_EXTENDED_BANK); 556 MAX_NR_BANKS, b);
557 b = MAX_NR_BANKS;
452 } 558 }
559
560 /* Don't support asymmetric configurations today */
561 WARN_ON(banks != 0 && b != banks);
562 banks = b;
563 if (!bank) {
564 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
565 if (!bank)
566 return -ENOMEM;
567 memset(bank, 0xff, banks * sizeof(u64));
568 }
569
453 /* Use accurate RIP reporting if available. */ 570 /* Use accurate RIP reporting if available. */
454 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) 571 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
455 rip_msr = MSR_IA32_MCG_EIP; 572 rip_msr = MSR_IA32_MCG_EIP;
456 573
457 /* Log the machine checks left over from the previous reset. 574 return 0;
458 This also clears all registers */ 575}
459 do_machine_check(NULL, mce_bootlog ? -1 : -2); 576
577static void mce_init(void *dummy)
578{
579 u64 cap;
580 int i;
581 mce_banks_t all_banks;
582
583 /*
584 * Log the machine checks left over from the previous reset.
585 */
586 bitmap_fill(all_banks, MAX_NR_BANKS);
587 machine_check_poll(MCP_UC, &all_banks);
460 588
461 set_in_cr4(X86_CR4_MCE); 589 set_in_cr4(X86_CR4_MCE);
462 590
591 rdmsrl(MSR_IA32_MCG_CAP, cap);
463 if (cap & MCG_CTL_P) 592 if (cap & MCG_CTL_P)
464 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 593 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
465 594
466 for (i = 0; i < banks; i++) { 595 for (i = 0; i < banks; i++) {
467 if (i < NR_SYSFS_BANKS) 596 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
468 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
469 else
470 wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
471
472 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 597 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
473 } 598 }
474} 599}
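mce_cap_init above decodes MSR_IA32_MCG_CAP: the low byte is the bank count, bit 8 is MCG_CTL_P, and bits 16-23 hold the extended register count used in the accurate-RIP test together with bit 9. A standalone decode of a made-up capability value (field positions per the Intel SDM; the sample value is invented):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t cap = 0x90306;   /* hypothetical MCG_CAP reading */

        unsigned banks    = cap & 0xff;          /* number of MC banks */
        int      mcg_ctl  = !!(cap & (1u << 8)); /* MCG_CTL_P */
        unsigned ext_regs = (cap >> 16) & 0xff;  /* extended register count */
        int      rip_msr  = (cap & (1u << 9)) && ext_regs >= 9;

        printf("banks=%u mcg_ctl_p=%d ext=%u accurate-rip=%d\n",
               banks, mcg_ctl, ext_regs, rip_msr);
        return 0;
    }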
475 600
476/* Add per CPU specific workarounds here */ 601/* Add per CPU specific workarounds here */
477static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) 602static void mce_cpu_quirks(struct cpuinfo_x86 *c)
478{ 603{
479 /* This should be disabled by the BIOS, but isn't always */ 604 /* This should be disabled by the BIOS, but isn't always */
480 if (c->x86_vendor == X86_VENDOR_AMD) { 605 if (c->x86_vendor == X86_VENDOR_AMD) {
481 if(c->x86 == 15) 606 if (c->x86 == 15 && banks > 4)
482 /* disable GART TBL walk error reporting, which trips off 607 /* disable GART TBL walk error reporting, which trips off
483 incorrectly with the IOMMU & 3ware & Cerberus. */ 608 incorrectly with the IOMMU & 3ware & Cerberus. */
484 clear_bit(10, &bank[4]); 609 clear_bit(10, (unsigned long *)&bank[4]);
485 if (c->x86 <= 17 && mce_bootlog < 0) 610 if (c->x86 <= 17 && mce_bootlog < 0)
486 /* Lots of broken BIOS around that don't clear them 611 /* Lots of broken BIOS around that don't clear them
487 by default and leave crap in there. Don't log. */ 612 by default and leave crap in there. Don't log. */
@@ -490,7 +615,7 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
490 615
491} 616}
492 617
493static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) 618static void mce_cpu_features(struct cpuinfo_x86 *c)
494{ 619{
495 switch (c->x86_vendor) { 620 switch (c->x86_vendor) {
496 case X86_VENDOR_INTEL: 621 case X86_VENDOR_INTEL:
@@ -504,20 +629,38 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
504 } 629 }
505} 630}
506 631
632static void mce_init_timer(void)
633{
634 struct timer_list *t = &__get_cpu_var(mce_timer);
635
636 /* data race harmless because everyone sets it to the same value */
637 if (!next_interval)
638 next_interval = check_interval * HZ;
639 if (!next_interval)
640 return;
641 setup_timer(t, mcheck_timer, smp_processor_id());
642 t->expires = round_jiffies(jiffies + next_interval);
643 add_timer(t);
644}
645
507/* 646/*
508 * Called for each booted CPU to set up machine checks. 647 * Called for each booted CPU to set up machine checks.
509 * Must be called with preempt off. 648 * Must be called with preempt off.
510 */ 649 */
511void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 650void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
512{ 651{
513 mce_cpu_quirks(c); 652 if (!mce_available(c))
653 return;
514 654
515 if (mce_dont_init || 655 if (mce_cap_init() < 0) {
516 !mce_available(c)) 656 mce_dont_init = 1;
517 return; 657 return;
658 }
659 mce_cpu_quirks(c);
518 660
519 mce_init(NULL); 661 mce_init(NULL);
520 mce_cpu_features(c); 662 mce_cpu_features(c);
663 mce_init_timer();
521} 664}
522 665
523/* 666/*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
573{ 716{
574 unsigned long *cpu_tsc; 717 unsigned long *cpu_tsc;
575 static DEFINE_MUTEX(mce_read_mutex); 718 static DEFINE_MUTEX(mce_read_mutex);
576 unsigned next; 719 unsigned prev, next;
577 char __user *buf = ubuf; 720 char __user *buf = ubuf;
578 int i, err; 721 int i, err;
579 722
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
592 } 735 }
593 736
594 err = 0; 737 err = 0;
595 for (i = 0; i < next; i++) { 738 prev = 0;
596 unsigned long start = jiffies; 739 do {
597 740 for (i = prev; i < next; i++) {
598 while (!mcelog.entry[i].finished) { 741 unsigned long start = jiffies;
599 if (time_after_eq(jiffies, start + 2)) { 742
600 memset(mcelog.entry + i,0, sizeof(struct mce)); 743 while (!mcelog.entry[i].finished) {
601 goto timeout; 744 if (time_after_eq(jiffies, start + 2)) {
745 memset(mcelog.entry + i, 0,
746 sizeof(struct mce));
747 goto timeout;
748 }
749 cpu_relax();
602 } 750 }
603 cpu_relax(); 751 smp_rmb();
752 err |= copy_to_user(buf, mcelog.entry + i,
753 sizeof(struct mce));
754 buf += sizeof(struct mce);
755timeout:
756 ;
604 } 757 }
605 smp_rmb();
606 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
607 buf += sizeof(struct mce);
608 timeout:
609 ;
610 }
611 758
612 memset(mcelog.entry, 0, next * sizeof(struct mce)); 759 memset(mcelog.entry + prev, 0,
613 mcelog.next = 0; 760 (next - prev) * sizeof(struct mce));
761 prev = next;
762 next = cmpxchg(&mcelog.next, prev, 0);
763 } while (next != prev);
614 764
615 synchronize_sched(); 765 synchronize_sched();
616 766
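The restructured read loop above drains records up to 'next', then tries to swing mcelog.next back to zero with cmpxchg; if the swap fails, new records arrived in the meantime and the outer loop drains those too. A user-space model of the same consumer, using the GCC/Clang __atomic builtins in place of the kernel's cmpxchg:

    #include <stdio.h>

    static unsigned next_idx = 5;   /* stand-in for mcelog.next; 5 records buffered */

    int main(void)
    {
        unsigned prev = 0;
        unsigned next = __atomic_load_n(&next_idx, __ATOMIC_SEQ_CST);

        do {
            for (unsigned i = prev; i < next; i++)
                printf("copy record %u to user\n", i);
            prev = next;
            /* reset the index to 0 only if nothing new was logged meanwhile */
            next = prev;
            __atomic_compare_exchange_n(&next_idx, &next, 0, 0,
                                        __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
            /* on failure, 'next' now holds the newer producer index */
        } while (next != prev);
        return 0;
    }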
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
680 &mce_chrdev_ops, 830 &mce_chrdev_ops,
681}; 831};
682 832
683static unsigned long old_cr4 __initdata;
684
685void __init stop_mce(void)
686{
687 old_cr4 = read_cr4();
688 clear_in_cr4(X86_CR4_MCE);
689}
690
691void __init restart_mce(void)
692{
693 if (old_cr4 & X86_CR4_MCE)
694 set_in_cr4(X86_CR4_MCE);
695}
696
697/* 833/*
698 * Old style boot options parsing. Only for compatibility. 834 * Old style boot options parsing. Only for compatibility.
699 */ 835 */
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
703 return 1; 839 return 1;
704} 840}
705 841
706/* mce=off disables machine check. Note you can re-enable it later 842/* mce=off disables machine check.
707 using sysfs.
708 mce=TOLERANCELEVEL (number, see above) 843 mce=TOLERANCELEVEL (number, see above)
709 mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 844 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
710 mce=nobootlog Don't log MCEs from before booting. */ 845 mce=nobootlog Don't log MCEs from before booting. */
@@ -728,29 +863,57 @@ __setup("mce=", mcheck_enable);
728 * Sysfs support 863 * Sysfs support
729 */ 864 */
730 865
866/*
867 * Disable machine checks on suspend and shutdown. We can't really handle
868 * them later.
869 */
870static int mce_disable(void)
871{
872 int i;
873
874 for (i = 0; i < banks; i++)
875 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
876 return 0;
877}
878
879static int mce_suspend(struct sys_device *dev, pm_message_t state)
880{
881 return mce_disable();
882}
883
884static int mce_shutdown(struct sys_device *dev)
885{
886 return mce_disable();
887}
888
731/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. 889/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
732 Only one CPU is active at this time, the others get readded later using 890 Only one CPU is active at this time, the others get readded later using
733 CPU hotplug. */ 891 CPU hotplug. */
734static int mce_resume(struct sys_device *dev) 892static int mce_resume(struct sys_device *dev)
735{ 893{
736 mce_init(NULL); 894 mce_init(NULL);
895 mce_cpu_features(&current_cpu_data);
737 return 0; 896 return 0;
738} 897}
739 898
899static void mce_cpu_restart(void *data)
900{
901 del_timer_sync(&__get_cpu_var(mce_timer));
902 if (mce_available(&current_cpu_data))
903 mce_init(NULL);
904 mce_init_timer();
905}
906
740/* Reinit MCEs after user configuration changes */ 907/* Reinit MCEs after user configuration changes */
741static void mce_restart(void) 908static void mce_restart(void)
742{ 909{
743 if (next_interval)
744 cancel_delayed_work(&mcheck_work);
745 /* Timer race is harmless here */
746 on_each_cpu(mce_init, NULL, 1);
747 next_interval = check_interval * HZ; 910 next_interval = check_interval * HZ;
748 if (next_interval) 911 on_each_cpu(mce_cpu_restart, NULL, 1);
749 schedule_delayed_work(&mcheck_work,
750 round_jiffies_relative(next_interval));
751} 912}
752 913
753static struct sysdev_class mce_sysclass = { 914static struct sysdev_class mce_sysclass = {
915 .suspend = mce_suspend,
916 .shutdown = mce_shutdown,
754 .resume = mce_resume, 917 .resume = mce_resume,
755 .name = "machinecheck", 918 .name = "machinecheck",
756}; 919};
@@ -777,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
777 } \ 940 } \
778 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); 941 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
779 942
780/* 943static struct sysdev_attribute *bank_attrs;
781 * TBD should generate these dynamically based on number of available banks. 944
782 * Have only 6 control banks in /sysfs until then. 945static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
783 */ 946 char *buf)
784ACCESSOR(bank0ctl,bank[0],mce_restart()) 947{
785ACCESSOR(bank1ctl,bank[1],mce_restart()) 948 u64 b = bank[attr - bank_attrs];
786ACCESSOR(bank2ctl,bank[2],mce_restart()) 949 return sprintf(buf, "%llx\n", b);
787ACCESSOR(bank3ctl,bank[3],mce_restart()) 950}
788ACCESSOR(bank4ctl,bank[4],mce_restart()) 951
789ACCESSOR(bank5ctl,bank[5],mce_restart()) 952static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
953 const char *buf, size_t siz)
954{
955 char *end;
956 u64 new = simple_strtoull(buf, &end, 0);
957 if (end == buf)
958 return -EINVAL;
959 bank[attr - bank_attrs] = new;
960 mce_restart();
961 return end-buf;
962}
790 963
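show_bank/set_bank above recover the bank number from the attribute pointer itself: every bank attribute lives in the single contiguous bank_attrs array, so 'attr - bank_attrs' is the index by plain pointer arithmetic. A standalone illustration with a stand-in attribute type (names are invented for the sketch):

    #include <stdio.h>

    struct attr { const char *name; };   /* stand-in for sysdev_attribute */

    static struct attr bank_attrs[6];
    static unsigned long long bank[6] = { 0, 0x11, 0x22, 0x33, 0x44, 0x55 };

    static unsigned long long show_bank(struct attr *attr)
    {
        return bank[attr - bank_attrs];  /* index = element offset in the array */
    }

    int main(void)
    {
        printf("bank4 ctl = %llx\n", show_bank(&bank_attrs[4]));  /* 44 */
        return 0;
    }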
791static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, 964static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
792 char *buf) 965 char *buf)
@@ -813,13 +986,11 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
813static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 986static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
814ACCESSOR(check_interval,check_interval,mce_restart()) 987ACCESSOR(check_interval,check_interval,mce_restart())
815static struct sysdev_attribute *mce_attributes[] = { 988static struct sysdev_attribute *mce_attributes[] = {
816 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
817 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
818 &attr_tolerant.attr, &attr_check_interval, &attr_trigger, 989 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
819 NULL 990 NULL
820}; 991};
821 992
822static cpumask_t mce_device_initialized = CPU_MASK_NONE; 993static cpumask_var_t mce_device_initialized;
823 994
824/* Per cpu sysdev init. All of the cpus still share the same ctl bank */ 995/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
825static __cpuinit int mce_create_device(unsigned int cpu) 996static __cpuinit int mce_create_device(unsigned int cpu)
@@ -844,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
844 if (err) 1015 if (err)
845 goto error; 1016 goto error;
846 } 1017 }
847 cpu_set(cpu, mce_device_initialized); 1018 for (i = 0; i < banks; i++) {
1019 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1020 &bank_attrs[i]);
1021 if (err)
1022 goto error2;
1023 }
1024 cpumask_set_cpu(cpu, mce_device_initialized);
848 1025
849 return 0; 1026 return 0;
1027error2:
1028 while (--i >= 0) {
1029 sysdev_remove_file(&per_cpu(device_mce, cpu),
1030 &bank_attrs[i]);
1031 }
850error: 1032error:
851 while (i--) { 1033 while (--i >= 0) {
852 sysdev_remove_file(&per_cpu(device_mce,cpu), 1034 sysdev_remove_file(&per_cpu(device_mce,cpu),
853 mce_attributes[i]); 1035 mce_attributes[i]);
854 } 1036 }
@@ -861,14 +1043,44 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
861{ 1043{
862 int i; 1044 int i;
863 1045
864 if (!cpu_isset(cpu, mce_device_initialized)) 1046 if (!cpumask_test_cpu(cpu, mce_device_initialized))
865 return; 1047 return;
866 1048
867 for (i = 0; mce_attributes[i]; i++) 1049 for (i = 0; mce_attributes[i]; i++)
868 sysdev_remove_file(&per_cpu(device_mce,cpu), 1050 sysdev_remove_file(&per_cpu(device_mce,cpu),
869 mce_attributes[i]); 1051 mce_attributes[i]);
1052 for (i = 0; i < banks; i++)
1053 sysdev_remove_file(&per_cpu(device_mce, cpu),
1054 &bank_attrs[i]);
870 sysdev_unregister(&per_cpu(device_mce,cpu)); 1055 sysdev_unregister(&per_cpu(device_mce,cpu));
871 cpu_clear(cpu, mce_device_initialized); 1056 cpumask_clear_cpu(cpu, mce_device_initialized);
1057}
1058
1059/* Make sure there are no machine checks on offlined CPUs. */
1060static void mce_disable_cpu(void *h)
1061{
1062 int i;
1063 unsigned long action = *(unsigned long *)h;
1064
1065 if (!mce_available(&current_cpu_data))
1066 return;
1067 if (!(action & CPU_TASKS_FROZEN))
1068 cmci_clear();
1069 for (i = 0; i < banks; i++)
1070 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1071}
1072
1073static void mce_reenable_cpu(void *h)
1074{
1075 int i;
1076 unsigned long action = *(unsigned long *)h;
1077
1078 if (!mce_available(&current_cpu_data))
1079 return;
1080 if (!(action & CPU_TASKS_FROZEN))
1081 cmci_reenable();
1082 for (i = 0; i < banks; i++)
1083 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
872} 1084}
873 1085
874/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 1086/* Get notified when a cpu comes on/off. Be hotplug friendly. */
@@ -876,6 +1088,7 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
876 unsigned long action, void *hcpu) 1088 unsigned long action, void *hcpu)
877{ 1089{
878 unsigned int cpu = (unsigned long)hcpu; 1090 unsigned int cpu = (unsigned long)hcpu;
1091 struct timer_list *t = &per_cpu(mce_timer, cpu);
879 1092
880 switch (action) { 1093 switch (action) {
881 case CPU_ONLINE: 1094 case CPU_ONLINE:
@@ -890,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
890 threshold_cpu_callback(action, cpu); 1103 threshold_cpu_callback(action, cpu);
891 mce_remove_device(cpu); 1104 mce_remove_device(cpu);
892 break; 1105 break;
1106 case CPU_DOWN_PREPARE:
1107 case CPU_DOWN_PREPARE_FROZEN:
1108 del_timer_sync(t);
1109 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1110 break;
1111 case CPU_DOWN_FAILED:
1112 case CPU_DOWN_FAILED_FROZEN:
1113 t->expires = round_jiffies(jiffies + next_interval);
1114 add_timer_on(t, cpu);
1115 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1116 break;
1117 case CPU_POST_DEAD:
1118 /* intentionally ignoring frozen here */
1119 cmci_rediscover(cpu);
1120 break;
893 } 1121 }
894 return NOTIFY_OK; 1122 return NOTIFY_OK;
895} 1123}
@@ -898,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
898 .notifier_call = mce_cpu_callback, 1126 .notifier_call = mce_cpu_callback,
899}; 1127};
900 1128
1129static __init int mce_init_banks(void)
1130{
1131 int i;
1132
1133 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1134 GFP_KERNEL);
1135 if (!bank_attrs)
1136 return -ENOMEM;
1137
1138 for (i = 0; i < banks; i++) {
1139 struct sysdev_attribute *a = &bank_attrs[i];
1140 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1141 if (!a->attr.name)
1142 goto nomem;
1143 a->attr.mode = 0644;
1144 a->show = show_bank;
1145 a->store = set_bank;
1146 }
1147 return 0;
1148
1149nomem:
1150 while (--i >= 0)
1151 kfree(bank_attrs[i].attr.name);
1152 kfree(bank_attrs);
1153 bank_attrs = NULL;
1154 return -ENOMEM;
1155}
1156
901static __init int mce_init_device(void) 1157static __init int mce_init_device(void)
902{ 1158{
903 int err; 1159 int err;
@@ -905,6 +1161,13 @@ static __init int mce_init_device(void)
905 1161
906 if (!mce_available(&boot_cpu_data)) 1162 if (!mce_available(&boot_cpu_data))
907 return -EIO; 1163 return -EIO;
1164
1165 alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
1166
1167 err = mce_init_banks();
1168 if (err)
1169 return err;
1170
908 err = sysdev_class_register(&mce_sysclass); 1171 err = sysdev_class_register(&mce_sysclass);
909 if (err) 1172 if (err)
910 return err; 1173 return err;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 8ae8c4ff094d..56dde9c4bc96 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -67,7 +67,7 @@ static struct threshold_block threshold_defaults = {
67struct threshold_bank { 67struct threshold_bank {
68 struct kobject *kobj; 68 struct kobject *kobj;
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
73 73
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
79 79
80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ 80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
81 81
82static void amd_threshold_interrupt(void);
83
82/* 84/*
83 * CPU Initialization 85 * CPU Initialization
84 */ 86 */
@@ -90,7 +92,8 @@ struct thresh_restart {
90}; 92};
91 93
92/* must be called with correct cpu affinity */ 94/* must be called with correct cpu affinity */
93static long threshold_restart_bank(void *_tr) 95/* Called via smp_call_function_single() */
96static void threshold_restart_bank(void *_tr)
94{ 97{
95 struct thresh_restart *tr = _tr; 98 struct thresh_restart *tr = _tr;
96 u32 mci_misc_hi, mci_misc_lo; 99 u32 mci_misc_hi, mci_misc_lo;
@@ -117,11 +120,10 @@ static long threshold_restart_bank(void *_tr)
117 120
118 mci_misc_hi |= MASK_COUNT_EN_HI; 121 mci_misc_hi |= MASK_COUNT_EN_HI;
119 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); 122 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
120 return 0;
121} 123}
122 124
123/* cpu init entry point, called from mce.c with preempt off */ 125/* cpu init entry point, called from mce.c with preempt off */
124void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) 126void mce_amd_feature_init(struct cpuinfo_x86 *c)
125{ 127{
126 unsigned int bank, block; 128 unsigned int bank, block;
127 unsigned int cpu = smp_processor_id(); 129 unsigned int cpu = smp_processor_id();
@@ -174,6 +176,8 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
174 tr.reset = 0; 176 tr.reset = 0;
175 tr.old_limit = 0; 177 tr.old_limit = 0;
176 threshold_restart_bank(&tr); 178 threshold_restart_bank(&tr);
179
180 mce_threshold_vector = amd_threshold_interrupt;
177 } 181 }
178 } 182 }
179} 183}
@@ -187,19 +191,13 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
187 * the interrupt goes off when error_count reaches threshold_limit. 191 * the interrupt goes off when error_count reaches threshold_limit.
188 * the handler will simply log mcelog w/ software defined bank number. 192 * the handler will simply log mcelog w/ software defined bank number.
189 */ 193 */
190asmlinkage void mce_threshold_interrupt(void) 194static void amd_threshold_interrupt(void)
191{ 195{
192 unsigned int bank, block; 196 unsigned int bank, block;
193 struct mce m; 197 struct mce m;
194 u32 low = 0, high = 0, address = 0; 198 u32 low = 0, high = 0, address = 0;
195 199
196 ack_APIC_irq(); 200 mce_setup(&m);
197 exit_idle();
198 irq_enter();
199
200 memset(&m, 0, sizeof(m));
201 rdtscll(m.tsc);
202 m.cpu = smp_processor_id();
203 201
204 /* assume first bank caused it */ 202 /* assume first bank caused it */
205 for (bank = 0; bank < NR_BANKS; ++bank) { 203 for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
233 231
234 /* Log the machine check that caused the threshold 232 /* Log the machine check that caused the threshold
235 event. */ 233 event. */
236 do_machine_check(NULL, 0); 234 machine_check_poll(MCP_TIMESTAMP,
235 &__get_cpu_var(mce_poll_banks));
237 236
238 if (high & MASK_OVERFLOW_HI) { 237 if (high & MASK_OVERFLOW_HI) {
239 rdmsrl(address, m.misc); 238 rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
243 + bank * NR_BLOCKS 242 + bank * NR_BLOCKS
244 + block; 243 + block;
245 mce_log(&m); 244 mce_log(&m);
246 goto out; 245 return;
247 } 246 }
248 } 247 }
249 } 248 }
250out:
251 inc_irq_stat(irq_threshold_count);
252 irq_exit();
253} 249}
254 250
255/* 251/*
@@ -283,7 +279,7 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
283 tr.b = b; 279 tr.b = b;
284 tr.reset = 0; 280 tr.reset = 0;
285 tr.old_limit = 0; 281 tr.old_limit = 0;
286 work_on_cpu(b->cpu, threshold_restart_bank, &tr); 282 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
287 283
288 return end - buf; 284 return end - buf;
289} 285}
@@ -305,23 +301,32 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
305 tr.b = b; 301 tr.b = b;
306 tr.reset = 0; 302 tr.reset = 0;
307 303
308 work_on_cpu(b->cpu, threshold_restart_bank, &tr); 304 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
309 305
310 return end - buf; 306 return end - buf;
311} 307}
312 308
313static long local_error_count(void *_b) 309struct threshold_block_cross_cpu {
310 struct threshold_block *tb;
311 long retval;
312};
313
314static void local_error_count_handler(void *_tbcc)
314{ 315{
315 struct threshold_block *b = _b; 316 struct threshold_block_cross_cpu *tbcc = _tbcc;
317 struct threshold_block *b = tbcc->tb;
316 u32 low, high; 318 u32 low, high;
317 319
318 rdmsr(b->address, low, high); 320 rdmsr(b->address, low, high);
319 return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); 321 tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
320} 322}
321 323
322static ssize_t show_error_count(struct threshold_block *b, char *buf) 324static ssize_t show_error_count(struct threshold_block *b, char *buf)
323{ 325{
324 return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b)); 326 struct threshold_block_cross_cpu tbcc = { .tb = b, };
327
328 smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1);
329 return sprintf(buf, "%lx\n", tbcc.retval);
325} 330}
326 331
327static ssize_t store_error_count(struct threshold_block *b, 332static ssize_t store_error_count(struct threshold_block *b,
@@ -329,7 +334,7 @@ static ssize_t store_error_count(struct threshold_block *b,
329{ 334{
330 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; 335 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
331 336
332 work_on_cpu(b->cpu, threshold_restart_bank, &tr); 337 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
333 return 1; 338 return 1;
334} 339}
335 340
@@ -398,7 +403,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
398 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) 403 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
399 return 0; 404 return 0;
400 405
401 if (rdmsr_safe(address, &low, &high)) 406 if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
402 return 0; 407 return 0;
403 408
404 if (!(high & MASK_VALID_HI)) { 409 if (!(high & MASK_VALID_HI)) {
@@ -462,12 +467,11 @@ out_free:
462 return err; 467 return err;
463} 468}
464 469
465static __cpuinit long local_allocate_threshold_blocks(void *_bank) 470static __cpuinit long
471local_allocate_threshold_blocks(int cpu, unsigned int bank)
466{ 472{
467 unsigned int *bank = _bank; 473 return allocate_threshold_blocks(cpu, bank, 0,
468 474 MSR_IA32_MC0_MISC + bank * 4);
469 return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
470 MSR_IA32_MC0_MISC + *bank * 4);
471} 475}
472 476
473/* symlinks sibling shared banks to first core. first core owns dir/files. */ 477/* symlinks sibling shared banks to first core. first core owns dir/files. */
@@ -481,7 +485,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
481 485
482#ifdef CONFIG_SMP 486#ifdef CONFIG_SMP
483 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 487 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
484 i = first_cpu(per_cpu(cpu_core_map, cpu)); 488 i = cpumask_first(cpu_core_mask(cpu));
485 489
486 /* first core not up yet */ 490 /* first core not up yet */
487 if (cpu_data(i).cpu_core_id) 491 if (cpu_data(i).cpu_core_id)
@@ -501,7 +505,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
501 if (err) 505 if (err)
502 goto out; 506 goto out;
503 507
504 b->cpus = per_cpu(cpu_core_map, cpu); 508 cpumask_copy(b->cpus, cpu_core_mask(cpu));
505 per_cpu(threshold_banks, cpu)[bank] = b; 509 per_cpu(threshold_banks, cpu)[bank] = b;
506 goto out; 510 goto out;
507 } 511 }
@@ -512,24 +516,29 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
512 err = -ENOMEM; 516 err = -ENOMEM;
513 goto out; 517 goto out;
514 } 518 }
519 if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
520 kfree(b);
521 err = -ENOMEM;
522 goto out;
523 }
515 524
516 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); 525 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);
517 if (!b->kobj) 526 if (!b->kobj)
518 goto out_free; 527 goto out_free;
519 528
520#ifndef CONFIG_SMP 529#ifndef CONFIG_SMP
521 b->cpus = CPU_MASK_ALL; 530 cpumask_setall(b->cpus);
522#else 531#else
523 b->cpus = per_cpu(cpu_core_map, cpu); 532 cpumask_copy(b->cpus, cpu_core_mask(cpu));
524#endif 533#endif
525 534
526 per_cpu(threshold_banks, cpu)[bank] = b; 535 per_cpu(threshold_banks, cpu)[bank] = b;
527 536
528 err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank); 537 err = local_allocate_threshold_blocks(cpu, bank);
529 if (err) 538 if (err)
530 goto out_free; 539 goto out_free;
531 540
532 for_each_cpu_mask_nr(i, b->cpus) { 541 for_each_cpu(i, b->cpus) {
533 if (i == cpu) 542 if (i == cpu)
534 continue; 543 continue;
535 544
@@ -545,6 +554,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
545 554
546out_free: 555out_free:
547 per_cpu(threshold_banks, cpu)[bank] = NULL; 556 per_cpu(threshold_banks, cpu)[bank] = NULL;
557 free_cpumask_var(b->cpus);
548 kfree(b); 558 kfree(b);
549out: 559out:
550 return err; 560 return err;
@@ -619,7 +629,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
619#endif 629#endif
620 630
621 /* remove all sibling symlinks before unregistering */ 631 /* remove all sibling symlinks before unregistering */
622 for_each_cpu_mask_nr(i, b->cpus) { 632 for_each_cpu(i, b->cpus) {
623 if (i == cpu) 633 if (i == cpu)
624 continue; 634 continue;
625 635
@@ -632,6 +642,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
632free_out: 642free_out:
633 kobject_del(b->kobj); 643 kobject_del(b->kobj);
634 kobject_put(b->kobj); 644 kobject_put(b->kobj);
645 free_cpumask_var(b->cpus);
635 kfree(b); 646 kfree(b);
636 per_cpu(threshold_banks, cpu)[bank] = NULL; 647 per_cpu(threshold_banks, cpu)[bank] = NULL;
637} 648}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 4b48f251fd39..d6b72df89d69 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,17 +1,21 @@
1/* 1/*
2 * Intel specific MCE features. 2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> 3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 * Copyright (C) 2008, 2009 Intel Corporation
5 * Author: Andi Kleen
4 */ 6 */
5 7
6#include <linux/init.h> 8#include <linux/init.h>
7#include <linux/interrupt.h> 9#include <linux/interrupt.h>
8#include <linux/percpu.h> 10#include <linux/percpu.h>
9#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/apic.h>
10#include <asm/msr.h> 13#include <asm/msr.h>
11#include <asm/mce.h> 14#include <asm/mce.h>
12#include <asm/hw_irq.h> 15#include <asm/hw_irq.h>
13#include <asm/idle.h> 16#include <asm/idle.h>
14#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
18#include <asm/apic.h>
15 19
16asmlinkage void smp_thermal_interrupt(void) 20asmlinkage void smp_thermal_interrupt(void)
17{ 21{
@@ -24,13 +28,13 @@ asmlinkage void smp_thermal_interrupt(void)
24 28
25 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 29 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
26 if (therm_throt_process(msr_val & 1)) 30 if (therm_throt_process(msr_val & 1))
27 mce_log_therm_throt_event(smp_processor_id(), msr_val); 31 mce_log_therm_throt_event(msr_val);
28 32
29 inc_irq_stat(irq_thermal_count); 33 inc_irq_stat(irq_thermal_count);
30 irq_exit(); 34 irq_exit();
31} 35}
32 36
33static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c) 37static void intel_init_thermal(struct cpuinfo_x86 *c)
34{ 38{
35 u32 l, h; 39 u32 l, h;
36 int tm2 = 0; 40 int tm2 = 0;
@@ -48,13 +52,13 @@ static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
48 */ 52 */
49 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 53 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
50 h = apic_read(APIC_LVTTHMR); 54 h = apic_read(APIC_LVTTHMR);
51 if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { 55 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
52 printk(KERN_DEBUG 56 printk(KERN_DEBUG
53 "CPU%d: Thermal monitoring handled by SMI\n", cpu); 57 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
54 return; 58 return;
55 } 59 }
56 60
57 if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) 61 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
58 tm2 = 1; 62 tm2 = 1;
59 63
60 if (h & APIC_VECTOR_MASK) { 64 if (h & APIC_VECTOR_MASK) {
@@ -72,7 +76,7 @@ static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
72 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); 76 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
73 77
74 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 78 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
75 wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); 79 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
76 80
77 l = apic_read(APIC_LVTTHMR); 81 l = apic_read(APIC_LVTTHMR);
78 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 82 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
@@ -84,7 +88,209 @@ static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
84 return; 88 return;
85} 89}
86 90
87void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c) 91/*
92 * Support for Intel Corrected Machine Check Interrupts. This allows
93 * the CPU to raise an interrupt when a corrected machine check happened.
94 * Normally we pick those up using a regular polling timer.
95 * Also supports reliable discovery of shared banks.
96 */
97
98static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
99
100/*
101 * cmci_discover_lock protects against parallel discovery attempts
102 * which could race against each other.
103 */
104static DEFINE_SPINLOCK(cmci_discover_lock);
105
106#define CMCI_THRESHOLD 1
107
108static int cmci_supported(int *banks)
109{
110 u64 cap;
111
112 /*
113 * The vendor check is not strictly needed since the
114 * initialization is vendor keyed, but it makes sure
115 * these code paths are never entered on other vendors.
116 */
117 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
118 return 0;
119 if (!cpu_has_apic || lapic_get_maxlvt() < 6)
120 return 0;
121 rdmsrl(MSR_IA32_MCG_CAP, cap);
122 *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
123 return !!(cap & MCG_CMCI_P);
124}
125
126/*
127 * The interrupt handler. This is called on every event.
128 * Just call the poller directly to log any events.
129 * This could in theory increase the threshold under high load,
130 * but doesn't for now.
131 */
132static void intel_threshold_interrupt(void)
133{
134 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
135 mce_notify_user();
136}
137
138static void print_update(char *type, int *hdr, int num)
139{
140 if (*hdr == 0)
141 printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
142 *hdr = 1;
143 printk(KERN_CONT " %s:%d", type, num);
144}
145
146/*
147 * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
148 * on this CPU. Use the algorithm recommended in the SDM to discover shared
149 * banks.
150 */
151static void cmci_discover(int banks, int boot)
152{
153 unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
154 int hdr = 0;
155 int i;
156
157 spin_lock(&cmci_discover_lock);
158 for (i = 0; i < banks; i++) {
159 u64 val;
160
161 if (test_bit(i, owned))
162 continue;
163
164 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
165
166 /* Already owned by someone else? */
167 if (val & CMCI_EN) {
168 if (test_and_clear_bit(i, owned) || boot)
169 print_update("SHD", &hdr, i);
170 __clear_bit(i, __get_cpu_var(mce_poll_banks));
171 continue;
172 }
173
174 val |= CMCI_EN | CMCI_THRESHOLD;
175 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
176 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
177
178 /* Did the enable bit stick? -- the bank supports CMCI */
179 if (val & CMCI_EN) {
180 if (!test_and_set_bit(i, owned) || boot)
181 print_update("CMCI", &hdr, i);
182 __clear_bit(i, __get_cpu_var(mce_poll_banks));
183 } else {
184 WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
185 }
186 }
187 spin_unlock(&cmci_discover_lock);
188 if (hdr)
189 printk(KERN_CONT "\n");
190}
191
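cmci_discover's ownership test is write-then-read-back: set CMCI_EN, and if the bit sticks the bank supports CMCI and is removed from this CPU's polling set. A toy model of the probe (CMCI_EN is bit 30 of IA32_MCi_CTL2 per the SDM; the fake MSR array simulating banks that do or don't implement the bit is purely illustrative):

    #include <stdint.h>
    #include <stdio.h>

    #define CMCI_EN (1ULL << 30)

    static uint64_t fake_ctl2[4];
    static int supports_cmci[4] = { 1, 0, 1, 0 };

    /* a bank that lacks CMCI silently drops the enable bit on write */
    static void wrmsr(int i, uint64_t v)
    {
        fake_ctl2[i] = supports_cmci[i] ? v : (v & ~CMCI_EN);
    }

    static uint64_t rdmsr(int i) { return fake_ctl2[i]; }

    int main(void)
    {
        for (int i = 0; i < 4; i++) {
            wrmsr(i, rdmsr(i) | CMCI_EN);
            if (rdmsr(i) & CMCI_EN)      /* bit stuck: bank does CMCI */
                printf("bank %d: CMCI\n", i);
            else                         /* bit cleared: keep polling it */
                printf("bank %d: poll\n", i);
        }
        return 0;
    }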
192/*
193 * Just in case we missed an event during initialization check
194 * all the CMCI owned banks.
195 */
196void cmci_recheck(void)
197{
198 unsigned long flags;
199 int banks;
200
201 if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
202 return;
203 local_irq_save(flags);
204 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
205 local_irq_restore(flags);
206}
207
208/*
209 * Disable CMCI on this CPU for all banks it owns when it goes down.
210 * This allows other CPUs to claim the banks on rediscovery.
211 */
212void cmci_clear(void)
213{
214 int i;
215 int banks;
216 u64 val;
217
218 if (!cmci_supported(&banks))
219 return;
220 spin_lock(&cmci_discover_lock);
221 for (i = 0; i < banks; i++) {
222 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
223 continue;
224 /* Disable CMCI */
225 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
226 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
227 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
228 __clear_bit(i, __get_cpu_var(mce_banks_owned));
229 }
230 spin_unlock(&cmci_discover_lock);
231}
232
233/*
234 * After a CPU went down, cycle through all the others and rediscover.
235 * Must run in process context.
236 */
237void cmci_rediscover(int dying)
238{
239 int banks;
240 int cpu;
241 cpumask_var_t old;
242
243 if (!cmci_supported(&banks))
244 return;
245 if (!alloc_cpumask_var(&old, GFP_KERNEL))
246 return;
247 cpumask_copy(old, &current->cpus_allowed);
248
249 for_each_online_cpu (cpu) {
250 if (cpu == dying)
251 continue;
252 if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
253 continue;
254 /* Recheck in case CPUs don't all have the same set of banks */
255 if (cmci_supported(&banks))
256 cmci_discover(banks, 0);
257 }
258
259 set_cpus_allowed_ptr(current, old);
260 free_cpumask_var(old);
261}
262
263/*
264 * Reenable CMCI on this CPU in case a CPU down failed.
265 */
266void cmci_reenable(void)
267{
268 int banks;
269 if (cmci_supported(&banks))
270 cmci_discover(banks, 0);
271}
272
273static void intel_init_cmci(void)
274{
275 int banks;
276
277 if (!cmci_supported(&banks))
278 return;
279
280 mce_threshold_vector = intel_threshold_interrupt;
281 cmci_discover(banks, 1);
282 /*
283 * For CPU #0 this runs while the APIC is still disabled, but
284 * that's ok because only the vector is set up. We still do
285 * another check of the banks later for CPU #0 just to make
286 * sure no events are missed.
287 */
288 apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
289 cmci_recheck();
290}
291
292void mce_intel_feature_init(struct cpuinfo_x86 *c)
88{ 293{
89 intel_init_thermal(c); 294 intel_init_thermal(c);
295 intel_init_cmci();
90} 296}
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index 9b60fce09f75..f53bdcbaf382 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -85,7 +85,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
85 */ 85 */
86 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 86 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
87 h = apic_read(APIC_LVTTHMR); 87 h = apic_read(APIC_LVTTHMR);
88 if ((l & (1<<3)) && (h & APIC_DM_SMI)) { 88 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
89 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", 89 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
90 cpu); 90 cpu);
91 return; /* -EBUSY */ 91 return; /* -EBUSY */
@@ -111,7 +111,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
111 vendor_thermal_interrupt = intel_thermal_interrupt; 111 vendor_thermal_interrupt = intel_thermal_interrupt;
112 112
113 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 113 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); 114 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
115 115
116 l = apic_read(APIC_LVTTHMR); 116 l = apic_read(APIC_LVTTHMR);
117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 000000000000..23ee9e730f78
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,29 @@
1/*
2 * Common corrected MCE threshold handler code:
3 */
4#include <linux/interrupt.h>
5#include <linux/kernel.h>
6
7#include <asm/irq_vectors.h>
8#include <asm/apic.h>
9#include <asm/idle.h>
10#include <asm/mce.h>
11
12static void default_threshold_interrupt(void)
13{
14 printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
15 THRESHOLD_APIC_VECTOR);
16}
17
18void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19
20asmlinkage void mce_threshold_interrupt(void)
21{
22 exit_idle();
23 irq_enter();
24 inc_irq_stat(irq_threshold_count);
25 mce_threshold_vector();
26 irq_exit();
27 /* Ack only at the end to avoid potential reentry */
28 ack_APIC_irq();
29}
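The new threshold.c routes the shared APIC threshold vector through a function pointer with a complaining default; vendor init code (AMD thresholding or Intel CMCI, both in this commit) installs the real handler at feature-init time. A standalone sketch of that dispatch pattern:

    #include <stdio.h>

    static void default_threshold_interrupt(void)
    {
        printf("Unexpected threshold interrupt\n");
    }

    /* overridden by whichever vendor init claims the vector */
    void (*mce_threshold_vector)(void) = default_threshold_interrupt;

    static void intel_threshold_interrupt(void) { printf("poll CMCI banks\n"); }

    int main(void)
    {
        mce_threshold_vector();                            /* default handler */
        mce_threshold_vector = intel_threshold_interrupt;  /* vendor init */
        mce_threshold_vector();                            /* now Intel's */
        return 0;
    }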
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index 191fc0533649..f4361b56f8e9 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
1obj-y := main.o if.o generic.o state.o 1obj-y := main.o if.o generic.o state.o cleanup.o
2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3 3
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
new file mode 100644
index 000000000000..ce0fe4b5c04f
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -0,0 +1,1101 @@
1/* MTRR (Memory Type Range Register) cleanup
2
3 Copyright (C) 2009 Yinghai Lu
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public
7 License as published by the Free Software Foundation; either
8 version 2 of the License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
14
15 You should have received a copy of the GNU Library General Public
16 License along with this library; if not, write to the Free
17 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18*/
19
20#include <linux/module.h>
21#include <linux/init.h>
22#include <linux/pci.h>
23#include <linux/smp.h>
24#include <linux/cpu.h>
25#include <linux/mutex.h>
26#include <linux/sort.h>
27
28#include <asm/e820.h>
29#include <asm/mtrr.h>
30#include <asm/uaccess.h>
31#include <asm/processor.h>
32#include <asm/msr.h>
33#include <asm/kvm_para.h>
34#include "mtrr.h"
35
36 /* should be related to the number of MTRR_VAR_RANGES */
37#define RANGE_NUM 256
38
39struct res_range {
40 unsigned long start;
41 unsigned long end;
42};
43
44static int __init
45add_range(struct res_range *range, int nr_range, unsigned long start,
46 unsigned long end)
47{
48 /* out of slots */
49 if (nr_range >= RANGE_NUM)
50 return nr_range;
51
52 range[nr_range].start = start;
53 range[nr_range].end = end;
54
55 nr_range++;
56
57 return nr_range;
58}
59
60static int __init
61add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
62 unsigned long end)
63{
64 int i;
65
66 /* try to merge it with old one */
67 for (i = 0; i < nr_range; i++) {
68 unsigned long final_start, final_end;
69 unsigned long common_start, common_end;
70
71 if (!range[i].end)
72 continue;
73
74 common_start = max(range[i].start, start);
75 common_end = min(range[i].end, end);
76 if (common_start > common_end + 1)
77 continue;
78
79 final_start = min(range[i].start, start);
80 final_end = max(range[i].end, end);
81
82 range[i].start = final_start;
83 range[i].end = final_end;
84 return nr_range;
85 }
86
87 /* could not merge: add a new range */
88 return add_range(range, nr_range, start, end);
89}
90
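add_range_with_merge above treats two ranges as mergeable when they overlap or are merely adjacent, i.e. when common_start <= common_end + 1. A quick standalone check of just that predicate (reimplemented here for illustration):

    #include <stdio.h>

    static int mergeable(unsigned long s1, unsigned long e1,
                         unsigned long s2, unsigned long e2)
    {
        unsigned long cs = s1 > s2 ? s1 : s2;   /* common_start */
        unsigned long ce = e1 < e2 ? e1 : e2;   /* common_end */
        return cs <= ce + 1;                    /* adjacent counts as mergeable */
    }

    int main(void)
    {
        printf("%d\n", mergeable(0, 9, 10, 20));  /* 1: adjacent */
        printf("%d\n", mergeable(0, 9, 5, 20));   /* 1: overlap  */
        printf("%d\n", mergeable(0, 9, 11, 20));  /* 0: gap      */
        return 0;
    }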
91static void __init
92subtract_range(struct res_range *range, unsigned long start, unsigned long end)
93{
94 int i, j;
95
96 for (j = 0; j < RANGE_NUM; j++) {
97 if (!range[j].end)
98 continue;
99
100 if (start <= range[j].start && end >= range[j].end) {
101 range[j].start = 0;
102 range[j].end = 0;
103 continue;
104 }
105
106 if (start <= range[j].start && end < range[j].end &&
107 range[j].start < end + 1) {
108 range[j].start = end + 1;
109 continue;
110 }
111
112
113 if (start > range[j].start && end >= range[j].end &&
114 range[j].end > start - 1) {
115 range[j].end = start - 1;
116 continue;
117 }
118
119 if (start > range[j].start && end < range[j].end) {
120 /* find the new spare */
121 for (i = 0; i < RANGE_NUM; i++) {
122 if (range[i].end == 0)
123 break;
124 }
125 if (i < RANGE_NUM) {
126 range[i].end = range[j].end;
127 range[i].start = end + 1;
128 } else {
129 printk(KERN_ERR "ran out of slots in ranges\n");
130 }
131 range[j].end = start - 1;
132 continue;
133 }
134 }
135}
136
137static int __init cmp_range(const void *x1, const void *x2)
138{
139 const struct res_range *r1 = x1;
140 const struct res_range *r2 = x2;
141 long start1, start2;
142
143 start1 = r1->start;
144 start2 = r2->start;
145
146 return start1 - start2;
147}
148
149struct var_mtrr_range_state {
150 unsigned long base_pfn;
151 unsigned long size_pfn;
152 mtrr_type type;
153};
154
155static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
156static int __initdata debug_print;
157
158static int __init
159x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
160 unsigned long extra_remove_base,
161 unsigned long extra_remove_size)
162{
163 unsigned long base, size;
164 mtrr_type type;
165 int i;
166
167 for (i = 0; i < num_var_ranges; i++) {
168 type = range_state[i].type;
169 if (type != MTRR_TYPE_WRBACK)
170 continue;
171 base = range_state[i].base_pfn;
172 size = range_state[i].size_pfn;
173 nr_range = add_range_with_merge(range, nr_range, base,
174 base + size - 1);
175 }
176 if (debug_print) {
177 printk(KERN_DEBUG "After WB checking\n");
178 for (i = 0; i < nr_range; i++)
179 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
180 range[i].start, range[i].end + 1);
181 }
182
183 /* take out UC ranges */
184 for (i = 0; i < num_var_ranges; i++) {
185 type = range_state[i].type;
186 if (type != MTRR_TYPE_UNCACHABLE &&
187 type != MTRR_TYPE_WRPROT)
188 continue;
189 size = range_state[i].size_pfn;
190 if (!size)
191 continue;
192 base = range_state[i].base_pfn;
193 if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
194 (mtrr_state.enabled & 1)) {
195 /* Var MTRR contains UC entry below 1M? Skip it: */
196 printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d "
197 "contains strange UC entry under 1M, check "
198 "with your system vendor!\n", i);
199 if (base + size <= (1<<(20-PAGE_SHIFT)))
200 continue;
201 size -= (1<<(20-PAGE_SHIFT)) - base;
202 base = 1<<(20-PAGE_SHIFT);
203 }
204 subtract_range(range, base, base + size - 1);
205 }
206 if (extra_remove_size)
207 subtract_range(range, extra_remove_base,
208 extra_remove_base + extra_remove_size - 1);
209
210 /* get new range num */
211 nr_range = 0;
212 for (i = 0; i < RANGE_NUM; i++) {
213 if (!range[i].end)
214 continue;
215 nr_range++;
216 }
217 if (debug_print) {
218 printk(KERN_DEBUG "After UC checking\n");
219 for (i = 0; i < nr_range; i++)
220 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
221 range[i].start, range[i].end + 1);
222 }
223
224 /* sort the ranges */
225 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
226 if (debug_print) {
227 printk(KERN_DEBUG "After sorting\n");
228 for (i = 0; i < nr_range; i++)
229 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
230 range[i].start, range[i].end + 1);
231 }
232
233 /* clear the entries that are not used */
234 for (i = nr_range; i < RANGE_NUM; i++)
235 memset(&range[i], 0, sizeof(range[i]));
236
237 return nr_range;
238}
239
240static struct res_range __initdata range[RANGE_NUM];
241static int __initdata nr_range;
242
243#ifdef CONFIG_MTRR_SANITIZER
244
245static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
246{
247 unsigned long sum;
248 int i;
249
250 sum = 0;
251 for (i = 0; i < nr_range; i++)
252 sum += range[i].end + 1 - range[i].start;
253
254 return sum;
255}
256
257static int enable_mtrr_cleanup __initdata =
258 CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
259
260static int __init disable_mtrr_cleanup_setup(char *str)
261{
262 enable_mtrr_cleanup = 0;
263 return 0;
264}
265early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
266
267static int __init enable_mtrr_cleanup_setup(char *str)
268{
269 enable_mtrr_cleanup = 1;
270 return 0;
271}
272early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
273
274static int __init mtrr_cleanup_debug_setup(char *str)
275{
276 debug_print = 1;
277 return 0;
278}
279early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
280
281struct var_mtrr_state {
282 unsigned long range_startk;
283 unsigned long range_sizek;
284 unsigned long chunk_sizek;
285 unsigned long gran_sizek;
286 unsigned int reg;
287};
288
289static void __init
290set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
291 unsigned char type, unsigned int address_bits)
292{
293 u32 base_lo, base_hi, mask_lo, mask_hi;
294 u64 base, mask;
295
296 if (!sizek) {
297 fill_mtrr_var_range(reg, 0, 0, 0, 0);
298 return;
299 }
300
301 mask = (1ULL << address_bits) - 1;
302 mask &= ~((((u64)sizek) << 10) - 1);
303
304 base = ((u64)basek) << 10;
305
306 base |= type;
307 mask |= 0x800;
308
309 base_lo = base & ((1ULL<<32) - 1);
310 base_hi = base >> 32;
311
312 mask_lo = mask & ((1ULL<<32) - 1);
313 mask_hi = mask >> 32;
314
315 fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
316}
317
318static void __init
319save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
320 unsigned char type)
321{
322 range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
323 range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
324 range_state[reg].type = type;
325}
326
327static void __init
328set_var_mtrr_all(unsigned int address_bits)
329{
330 unsigned long basek, sizek;
331 unsigned char type;
332 unsigned int reg;
333
334 for (reg = 0; reg < num_var_ranges; reg++) {
335 basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
336 sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
337 type = range_state[reg].type;
338
339 set_var_mtrr(reg, basek, sizek, type, address_bits);
340 }
341}
342
343static unsigned long to_size_factor(unsigned long sizek, char *factorp)
344{
345 char factor;
346 unsigned long base = sizek;
347
348 if (base & ((1<<10) - 1)) {
349 /* not MB alignment */
350 factor = 'K';
351 } else if (base & ((1<<20) - 1)) {
352 factor = 'M';
353 base >>= 10;
354 } else {
355 factor = 'G';
356 base >>= 20;
357 }
358
359 *factorp = factor;
360
361 return base;
362}
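
to_size_factor() picks the largest unit that divides the size-in-KB evenly, which the debug output below relies on; illustrative calls, assuming the function above is in scope:

static void size_factor_demo(void)
{
	char f;
	unsigned long b;

	b = to_size_factor(1536, &f);		/* b == 1536, f == 'K' */
	b = to_size_factor(512 * 1024, &f);	/* b == 512,  f == 'M' */
	b = to_size_factor(2048 * 1024, &f);	/* b == 2,    f == 'G' */
	(void)b;
}
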
363
364static unsigned int __init
365range_to_mtrr(unsigned int reg, unsigned long range_startk,
366 unsigned long range_sizek, unsigned char type)
367{
368 if (!range_sizek || (reg >= num_var_ranges))
369 return reg;
370
371 while (range_sizek) {
372 unsigned long max_align, align;
373 unsigned long sizek;
374
375		/* Compute the maximum size of range I can make */
376 if (range_startk)
377 max_align = ffs(range_startk) - 1;
378 else
379 max_align = 32;
380 align = fls(range_sizek) - 1;
381 if (align > max_align)
382 align = max_align;
383
384 sizek = 1 << align;
385 if (debug_print) {
386 char start_factor = 'K', size_factor = 'K';
387 unsigned long start_base, size_base;
388
389			start_base = to_size_factor(range_startk,
390					&start_factor);
391			size_base = to_size_factor(sizek, &size_factor);
392
393 printk(KERN_DEBUG "Setting variable MTRR %d, "
394 "base: %ld%cB, range: %ld%cB, type %s\n",
395 reg, start_base, start_factor,
396 size_base, size_factor,
397 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
398 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")
399 );
400 }
401 save_var_mtrr(reg++, range_startk, sizek, type);
402 range_startk += sizek;
403 range_sizek -= sizek;
404 if (reg >= num_var_ranges)
405 break;
406 }
407 return reg;
408}
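
The loop above greedily emits the largest block that is both a power of two and naturally aligned at the current start. For example, a 3MB range at 1MB (startk=1024, sizek=3072) becomes 1MB@1MB plus 2MB@2MB, costing two registers. A standalone model of that split (ffs() is POSIX, from <strings.h>; fls() is open-coded here since it is kernel-only):

#include <stdio.h>
#include <strings.h>

static int fls_ul(unsigned long x)	/* highest set bit, 1-based */
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

int main(void)
{
	unsigned long startk = 1024, sizek = 3072;	/* 1MB base, 3MB size */

	while (sizek) {
		unsigned long max_align = startk ? ffs(startk) - 1 : 32;
		unsigned long align = fls_ul(sizek) - 1;
		unsigned long chunk;

		if (align > max_align)
			align = max_align;
		chunk = 1UL << align;
		printf("reg: base %luK, size %luK\n", startk, chunk);
		startk += chunk;
		sizek -= chunk;
	}
	return 0;
}
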
409
410static unsigned __init
411range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
412 unsigned long sizek)
413{
414 unsigned long hole_basek, hole_sizek;
415 unsigned long second_basek, second_sizek;
416 unsigned long range0_basek, range0_sizek;
417 unsigned long range_basek, range_sizek;
418 unsigned long chunk_sizek;
419 unsigned long gran_sizek;
420
421 hole_basek = 0;
422 hole_sizek = 0;
423 second_basek = 0;
424 second_sizek = 0;
425 chunk_sizek = state->chunk_sizek;
426 gran_sizek = state->gran_sizek;
427
428	/* align to the gran size, to prevent small blocks from using up MTRRs */
429 range_basek = ALIGN(state->range_startk, gran_sizek);
430 if ((range_basek > basek) && basek)
431 return second_sizek;
432 state->range_sizek -= (range_basek - state->range_startk);
433 range_sizek = ALIGN(state->range_sizek, gran_sizek);
434
435 while (range_sizek > state->range_sizek) {
436 range_sizek -= gran_sizek;
437 if (!range_sizek)
438 return 0;
439 }
440 state->range_sizek = range_sizek;
441
442	/* try to append a small hole */
443 range0_basek = state->range_startk;
444 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
445
446 /* no increase */
447 if (range0_sizek == state->range_sizek) {
448 if (debug_print)
449 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
450 range0_basek<<10,
451 (range0_basek + state->range_sizek)<<10);
452 state->reg = range_to_mtrr(state->reg, range0_basek,
453 state->range_sizek, MTRR_TYPE_WRBACK);
454 return 0;
455 }
456
457	/* only cut back when it is not the last range */
458 if (sizek) {
459 while (range0_basek + range0_sizek > (basek + sizek)) {
460 if (range0_sizek >= chunk_sizek)
461 range0_sizek -= chunk_sizek;
462 else
463 range0_sizek = 0;
464
465 if (!range0_sizek)
466 break;
467 }
468 }
469
470second_try:
471 range_basek = range0_basek + range0_sizek;
472
473 /* one hole in the middle */
474 if (range_basek > basek && range_basek <= (basek + sizek))
475 second_sizek = range_basek - basek;
476
477 if (range0_sizek > state->range_sizek) {
478
479 /* one hole in middle or at end */
480 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
481
482 /* hole size should be less than half of range0 size */
483 if (hole_sizek >= (range0_sizek >> 1) &&
484 range0_sizek >= chunk_sizek) {
485 range0_sizek -= chunk_sizek;
486 second_sizek = 0;
487 hole_sizek = 0;
488
489 goto second_try;
490 }
491 }
492
493 if (range0_sizek) {
494 if (debug_print)
495 printk(KERN_DEBUG "range0: %016lx - %016lx\n",
496 range0_basek<<10,
497 (range0_basek + range0_sizek)<<10);
498 state->reg = range_to_mtrr(state->reg, range0_basek,
499 range0_sizek, MTRR_TYPE_WRBACK);
500 }
501
502 if (range0_sizek < state->range_sizek) {
503 /* need to handle left over */
504 range_sizek = state->range_sizek - range0_sizek;
505
506 if (debug_print)
507 printk(KERN_DEBUG "range: %016lx - %016lx\n",
508 range_basek<<10,
509 (range_basek + range_sizek)<<10);
510 state->reg = range_to_mtrr(state->reg, range_basek,
511 range_sizek, MTRR_TYPE_WRBACK);
512 }
513
514 if (hole_sizek) {
515 hole_basek = range_basek - hole_sizek - second_sizek;
516 if (debug_print)
517 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
518 hole_basek<<10,
519 (hole_basek + hole_sizek)<<10);
520 state->reg = range_to_mtrr(state->reg, hole_basek,
521 hole_sizek, MTRR_TYPE_UNCACHABLE);
522 }
523
524 return second_sizek;
525}
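
The WB-plus-UC-hole strategy is where the cleanup wins registers. Take 1984MB of RAM at base 0: pure WB blocks cost five registers (1G + 512M + 256M + 128M + 64M), while rounding coverage up to the default 256MB chunk yields a single 2GB WB register plus one 64MB UC hole, i.e. two registers. A quick illustrative tally (one WB register per set bit of the size works here because the range starts at 0; __builtin_popcountl is a GCC builtin):

#include <stdio.h>

int main(void)
{
	unsigned long sizek = 1984UL << 10;		/* 1984MB in KB */
	int wb_only = __builtin_popcountl(sizek);	/* 5 registers  */
	int wb_plus_hole = 1 + 1;	/* WB 0-2GB + UC hole at 1984MB */

	printf("pure WB: %d regs, WB + UC hole: %d regs\n",
	       wb_only, wb_plus_hole);
	return 0;
}
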
526
527static void __init
528set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
529 unsigned long size_pfn)
530{
531 unsigned long basek, sizek;
532 unsigned long second_sizek = 0;
533
534 if (state->reg >= num_var_ranges)
535 return;
536
537 basek = base_pfn << (PAGE_SHIFT - 10);
538 sizek = size_pfn << (PAGE_SHIFT - 10);
539
540 /* See if I can merge with the last range */
541 if ((basek <= 1024) ||
542 (state->range_startk + state->range_sizek == basek)) {
543 unsigned long endk = basek + sizek;
544 state->range_sizek = endk - state->range_startk;
545 return;
546 }
547 /* Write the range mtrrs */
548 if (state->range_sizek != 0)
549 second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
550
551 /* Allocate an msr */
552 state->range_startk = basek + second_sizek;
553 state->range_sizek = sizek - second_sizek;
554}
555
556/* minimum size of an MTRR block that can take a hole */
557static u64 mtrr_chunk_size __initdata = (256ULL<<20);
558
559static int __init parse_mtrr_chunk_size_opt(char *p)
560{
561 if (!p)
562 return -EINVAL;
563 mtrr_chunk_size = memparse(p, &p);
564 return 0;
565}
566early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
567
568/* granularity of an MTRR block */
569static u64 mtrr_gran_size __initdata;
570
571static int __init parse_mtrr_gran_size_opt(char *p)
572{
573 if (!p)
574 return -EINVAL;
575 mtrr_gran_size = memparse(p, &p);
576 return 0;
577}
578early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
579
580static int nr_mtrr_spare_reg __initdata =
581 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
582
583static int __init parse_mtrr_spare_reg(char *arg)
584{
585 if (arg)
586 nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
587 return 0;
588}
589
590early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
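
All of the knobs above are early kernel command-line parameters; the two size parameters go through memparse(), so they accept K/M/G suffixes. A plausible boot line forcing the cleanup with a fixed layout and verbose output (values purely illustrative):

	enable_mtrr_cleanup mtrr_gran_size=64M mtrr_chunk_size=128M \
	mtrr_spare_reg_nr=1 mtrr_cleanup_debug
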
591
592static int __init
593x86_setup_var_mtrrs(struct res_range *range, int nr_range,
594 u64 chunk_size, u64 gran_size)
595{
596 struct var_mtrr_state var_state;
597 int i;
598 int num_reg;
599
600 var_state.range_startk = 0;
601 var_state.range_sizek = 0;
602 var_state.reg = 0;
603 var_state.chunk_sizek = chunk_size >> 10;
604 var_state.gran_sizek = gran_size >> 10;
605
606 memset(range_state, 0, sizeof(range_state));
607
608 /* Write the range etc */
609 for (i = 0; i < nr_range; i++)
610 set_var_mtrr_range(&var_state, range[i].start,
611 range[i].end - range[i].start + 1);
612
613 /* Write the last range */
614 if (var_state.range_sizek != 0)
615 range_to_mtrr_with_hole(&var_state, 0, 0);
616
617 num_reg = var_state.reg;
618	/* Clear out the extra MTRRs */
619 while (var_state.reg < num_var_ranges) {
620 save_var_mtrr(var_state.reg, 0, 0, 0);
621 var_state.reg++;
622 }
623
624 return num_reg;
625}
626
627struct mtrr_cleanup_result {
628 unsigned long gran_sizek;
629 unsigned long chunk_sizek;
630 unsigned long lose_cover_sizek;
631 unsigned int num_reg;
632 int bad;
633};
634
635/*
636 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G (16 values)
637 * chunk_size: gran_size, ..., 2G
638 * so we need 16 + 15 + ... + 1 = (1+16)*16/2 = 136 combinations
639 */
640#define NUM_RESULT 136
641#define PSHIFT (PAGE_SHIFT - 10)
642
643static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
644static unsigned long __initdata min_loss_pfn[RANGE_NUM];
645
646static void __init print_out_mtrr_range_state(void)
647{
648 int i;
649 char start_factor = 'K', size_factor = 'K';
650 unsigned long start_base, size_base;
651 mtrr_type type;
652
653 for (i = 0; i < num_var_ranges; i++) {
654
655 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
656 if (!size_base)
657 continue;
658
659		size_base = to_size_factor(size_base, &size_factor);
660		start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
661		start_base = to_size_factor(start_base, &start_factor);
662 type = range_state[i].type;
663
664 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
665 i, start_base, start_factor,
666 size_base, size_factor,
667 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
668 ((type == MTRR_TYPE_WRPROT) ? "WP" :
669 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
670 );
671 }
672}
673
674static int __init mtrr_need_cleanup(void)
675{
676 int i;
677 mtrr_type type;
678 unsigned long size;
679	/* one extra slot to count the size-0 (empty) entries */
680 int num[MTRR_NUM_TYPES + 1];
681
682	/* count the entries of each type */
683 memset(num, 0, sizeof(num));
684 for (i = 0; i < num_var_ranges; i++) {
685 type = range_state[i].type;
686 size = range_state[i].size_pfn;
687 if (type >= MTRR_NUM_TYPES)
688 continue;
689 if (!size)
690 type = MTRR_NUM_TYPES;
691 if (type == MTRR_TYPE_WRPROT)
692 type = MTRR_TYPE_UNCACHABLE;
693 num[type]++;
694 }
695
696 /* check if we got UC entries */
697 if (!num[MTRR_TYPE_UNCACHABLE])
698 return 0;
699
700 /* check if we only had WB and UC */
701 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
702 num_var_ranges - num[MTRR_NUM_TYPES])
703 return 0;
704
705 return 1;
706}
707
708static unsigned long __initdata range_sums;
709static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
710 unsigned long extra_remove_base,
711 unsigned long extra_remove_size,
712 int i)
713{
714 int num_reg;
715 static struct res_range range_new[RANGE_NUM];
716 static int nr_range_new;
717 unsigned long range_sums_new;
718
719 /* convert ranges to var ranges state */
720 num_reg = x86_setup_var_mtrrs(range, nr_range,
721 chunk_size, gran_size);
722
723 /* we got new setting in range_state, check it */
724 memset(range_new, 0, sizeof(range_new));
725 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
726 extra_remove_base, extra_remove_size);
727 range_sums_new = sum_ranges(range_new, nr_range_new);
728
729 result[i].chunk_sizek = chunk_size >> 10;
730 result[i].gran_sizek = gran_size >> 10;
731 result[i].num_reg = num_reg;
732 if (range_sums < range_sums_new) {
733 result[i].lose_cover_sizek =
734 (range_sums_new - range_sums) << PSHIFT;
735 result[i].bad = 1;
736 } else
737 result[i].lose_cover_sizek =
738 (range_sums - range_sums_new) << PSHIFT;
739
740 /* double check it */
741 if (!result[i].bad && !result[i].lose_cover_sizek) {
742 if (nr_range_new != nr_range ||
743 memcmp(range, range_new, sizeof(range)))
744 result[i].bad = 1;
745 }
746
747 if (!result[i].bad && (range_sums - range_sums_new <
748 min_loss_pfn[num_reg])) {
749 min_loss_pfn[num_reg] =
750 range_sums - range_sums_new;
751 }
752}
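
The << PSHIFT rescaling above converts a page count to KB (PAGE_SHIFT - 10); with 4K pages that is a shift by 2, so a 256-page coverage shortfall records as a 1024K loss. In sketch form (assuming PAGE_SHIFT == 12):

#include <stdio.h>

#define PAGE_SHIFT	12		/* assumed: 4K pages */
#define PSHIFT		(PAGE_SHIFT - 10)

int main(void)
{
	unsigned long lost_pfn = 256;	/* pages no longer covered */

	printf("lose_cover_sizek = %lu\n", lost_pfn << PSHIFT);	/* 1024 */
	return 0;
}
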
753
754static void __init mtrr_print_out_one_result(int i)
755{
756 char gran_factor, chunk_factor, lose_factor;
757 unsigned long gran_base, chunk_base, lose_base;
758
759	gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
760	chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
761	lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
762 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
763 result[i].bad ? "*BAD*" : " ",
764 gran_base, gran_factor, chunk_base, chunk_factor);
765 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
766 result[i].num_reg, result[i].bad ? "-" : "",
767 lose_base, lose_factor);
768}
769
770static int __init mtrr_search_optimal_index(void)
771{
772 int i;
773 int num_reg_good;
774 int index_good;
775
776 if (nr_mtrr_spare_reg >= num_var_ranges)
777 nr_mtrr_spare_reg = num_var_ranges - 1;
778 num_reg_good = -1;
779 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
780 if (!min_loss_pfn[i])
781 num_reg_good = i;
782 }
783
784 index_good = -1;
785 if (num_reg_good != -1) {
786 for (i = 0; i < NUM_RESULT; i++) {
787 if (!result[i].bad &&
788 result[i].num_reg == num_reg_good &&
789 !result[i].lose_cover_sizek) {
790 index_good = i;
791 break;
792 }
793 }
794 }
795
796 return index_good;
797}
798
799
800int __init mtrr_cleanup(unsigned address_bits)
801{
802 unsigned long extra_remove_base, extra_remove_size;
803 unsigned long base, size, def, dummy;
804 mtrr_type type;
805 u64 chunk_size, gran_size;
806 int index_good;
807 int i;
808
809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
810 return 0;
811 rdmsr(MTRRdefType_MSR, def, dummy);
812 def &= 0xff;
813 if (def != MTRR_TYPE_UNCACHABLE)
814 return 0;
815
816 /* get it and store it aside */
817 memset(range_state, 0, sizeof(range_state));
818 for (i = 0; i < num_var_ranges; i++) {
819 mtrr_if->get(i, &base, &size, &type);
820 range_state[i].base_pfn = base;
821 range_state[i].size_pfn = size;
822 range_state[i].type = type;
823 }
824
825	/* check if we need to handle it and can handle it */
826 if (!mtrr_need_cleanup())
827 return 0;
828
829 /* print original var MTRRs at first, for debugging: */
830 printk(KERN_DEBUG "original variable MTRRs\n");
831 print_out_mtrr_range_state();
832
833 memset(range, 0, sizeof(range));
834 extra_remove_size = 0;
835 extra_remove_base = 1 << (32 - PAGE_SHIFT);
836 if (mtrr_tom2)
837 extra_remove_size =
838 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
839 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
840 extra_remove_size);
841	/*
842	 * [0, 1M) should always be covered by a WB var MTRR;
843	 * the fixed MTRRs take effect there before the var MTRRs anyway
844	 */
845 nr_range = add_range_with_merge(range, nr_range, 0,
846 (1ULL<<(20 - PAGE_SHIFT)) - 1);
847 /* sort the ranges */
848 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
849
850 range_sums = sum_ranges(range, nr_range);
851	printk(KERN_INFO "total RAM covered: %ldM\n",
852 range_sums >> (20 - PAGE_SHIFT));
853
854 if (mtrr_chunk_size && mtrr_gran_size) {
855 i = 0;
856 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
857 extra_remove_base, extra_remove_size, i);
858
859 mtrr_print_out_one_result(i);
860
861 if (!result[i].bad) {
862 set_var_mtrr_all(address_bits);
863 printk(KERN_DEBUG "New variable MTRRs\n");
864 print_out_mtrr_range_state();
865 return 1;
866 }
867		printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
868			"will find an optimal one\n");
869 }
870
871 i = 0;
872 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
873 memset(result, 0, sizeof(result));
874 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
875
876 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
877 chunk_size <<= 1) {
878
879 if (i >= NUM_RESULT)
880 continue;
881
882 mtrr_calc_range_state(chunk_size, gran_size,
883 extra_remove_base, extra_remove_size, i);
884 if (debug_print) {
885 mtrr_print_out_one_result(i);
886 printk(KERN_INFO "\n");
887 }
888
889 i++;
890 }
891 }
892
893 /* try to find the optimal index */
894 index_good = mtrr_search_optimal_index();
895
896 if (index_good != -1) {
897		printk(KERN_INFO "Found optimal setting for MTRR cleanup\n");
898 i = index_good;
899 mtrr_print_out_one_result(i);
900
901 /* convert ranges to var ranges state */
902 chunk_size = result[i].chunk_sizek;
903 chunk_size <<= 10;
904 gran_size = result[i].gran_sizek;
905 gran_size <<= 10;
906 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
907 set_var_mtrr_all(address_bits);
908 printk(KERN_DEBUG "New variable MTRRs\n");
909 print_out_mtrr_range_state();
910 return 1;
911 } else {
912 /* print out all */
913 for (i = 0; i < NUM_RESULT; i++)
914 mtrr_print_out_one_result(i);
915 }
916
917	printk(KERN_INFO "mtrr_cleanup: cannot find an optimal value\n");
918 printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
919
920 return 0;
921}
922#else
923int __init mtrr_cleanup(unsigned address_bits)
924{
925 return 0;
926}
927#endif
928
929static int disable_mtrr_trim;
930
931static int __init disable_mtrr_trim_setup(char *str)
932{
933 disable_mtrr_trim = 1;
934 return 0;
935}
936early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
937
938/*
939 * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
940 * for memory >4GB. Check for that here.
941 * Note this won't check whether the MTRRs below 4GB, where the magic bit
942 * doesn't apply, are wrong, but so far we don't know of any such case in the wild.
943 */
944#define Tom2Enabled (1U << 21)
945#define Tom2ForceMemTypeWB (1U << 22)
946
947int __init amd_special_default_mtrr(void)
948{
949 u32 l, h;
950
951 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
952 return 0;
953 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
954 return 0;
955 /* In case some hypervisor doesn't pass SYSCFG through */
956 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
957 return 0;
958 /*
959 * Memory between 4GB and top of mem is forced WB by this magic bit.
960 * Reserved before K8RevF, but should be zero there.
961 */
962 if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
963 (Tom2Enabled | Tom2ForceMemTypeWB))
964 return 1;
965 return 0;
966}
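
For reference, the same two SYSCFG bits can be peeked from user space through the msr driver; a hedged sketch, assuming the msr module is loaded and using the kernel's MSR_K8_SYSCFG address 0xc0010010, with error handling kept minimal:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	uint64_t syscfg;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0 || pread(fd, &syscfg, 8, 0xc0010010) != 8)
		return 1;
	printf("Tom2Enabled=%d Tom2ForceMemTypeWB=%d\n",
	       !!(syscfg & (1ULL << 21)), !!(syscfg & (1ULL << 22)));
	close(fd);
	return 0;
}
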
967
968static u64 __init real_trim_memory(unsigned long start_pfn,
969 unsigned long limit_pfn)
970{
971 u64 trim_start, trim_size;
972 trim_start = start_pfn;
973 trim_start <<= PAGE_SHIFT;
974 trim_size = limit_pfn;
975 trim_size <<= PAGE_SHIFT;
976 trim_size -= trim_start;
977
978 return e820_update_range(trim_start, trim_size, E820_RAM,
979 E820_RESERVED);
980}
981/**
982 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
983 * @end_pfn: ending page frame number
984 *
985 * Some buggy BIOSes don't set up the MTRRs properly for systems with certain
986 * memory configurations. This routine checks that the highest MTRR matches
987 * the end of memory, to make sure the write-back MTRRs cover all of the
988 * memory the kernel is intending to use. If not, it'll trim any
989 * memory off the end by adjusting end_pfn, removing it from the kernel's
990 * allocation pools, and warn the user with an obnoxious message.
991 */
992int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
993{
994 unsigned long i, base, size, highest_pfn = 0, def, dummy;
995 mtrr_type type;
996 u64 total_trim_size;
997
998	/* one extra slot to count the size-0 (empty) entries */
999 int num[MTRR_NUM_TYPES + 1];
1000 /*
1001 * Make sure we only trim uncachable memory on machines that
1002 * support the Intel MTRR architecture:
1003 */
1004 if (!is_cpu(INTEL) || disable_mtrr_trim)
1005 return 0;
1006 rdmsr(MTRRdefType_MSR, def, dummy);
1007 def &= 0xff;
1008 if (def != MTRR_TYPE_UNCACHABLE)
1009 return 0;
1010
1011 /* get it and store it aside */
1012 memset(range_state, 0, sizeof(range_state));
1013 for (i = 0; i < num_var_ranges; i++) {
1014 mtrr_if->get(i, &base, &size, &type);
1015 range_state[i].base_pfn = base;
1016 range_state[i].size_pfn = size;
1017 range_state[i].type = type;
1018 }
1019
1020 /* Find highest cached pfn */
1021 for (i = 0; i < num_var_ranges; i++) {
1022 type = range_state[i].type;
1023 if (type != MTRR_TYPE_WRBACK)
1024 continue;
1025 base = range_state[i].base_pfn;
1026 size = range_state[i].size_pfn;
1027 if (highest_pfn < base + size)
1028 highest_pfn = base + size;
1029 }
1030
1031	/* kvm/qemu doesn't set the MTRRs up right; don't trim everything away */
1032 if (!highest_pfn) {
1033 printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
1034 return 0;
1035 }
1036
1037	/* count the entries of each type */
1038 memset(num, 0, sizeof(num));
1039 for (i = 0; i < num_var_ranges; i++) {
1040 type = range_state[i].type;
1041 if (type >= MTRR_NUM_TYPES)
1042 continue;
1043 size = range_state[i].size_pfn;
1044 if (!size)
1045 type = MTRR_NUM_TYPES;
1046 num[type]++;
1047 }
1048
1049 /* no entry for WB? */
1050 if (!num[MTRR_TYPE_WRBACK])
1051 return 0;
1052
1053 /* check if we only had WB and UC */
1054 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1055 num_var_ranges - num[MTRR_NUM_TYPES])
1056 return 0;
1057
1058 memset(range, 0, sizeof(range));
1059 nr_range = 0;
1060 if (mtrr_tom2) {
1061 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1062 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
1063 if (highest_pfn < range[nr_range].end + 1)
1064 highest_pfn = range[nr_range].end + 1;
1065 nr_range++;
1066 }
1067 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
1068
1069 total_trim_size = 0;
1070 /* check the head */
1071 if (range[0].start)
1072 total_trim_size += real_trim_memory(0, range[0].start);
1073 /* check the holes */
1074 for (i = 0; i < nr_range - 1; i++) {
1075 if (range[i].end + 1 < range[i+1].start)
1076 total_trim_size += real_trim_memory(range[i].end + 1,
1077 range[i+1].start);
1078 }
1079 /* check the top */
1080 i = nr_range - 1;
1081 if (range[i].end + 1 < end_pfn)
1082 total_trim_size += real_trim_memory(range[i].end + 1,
1083 end_pfn);
1084
1085 if (total_trim_size) {
1086 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
1087 " all of memory, losing %lluMB of RAM.\n",
1088 total_trim_size >> 20);
1089
1090 if (!changed_by_mtrr_cleanup)
1091 WARN_ON(1);
1092
1093 printk(KERN_INFO "update e820 for mtrr\n");
1094 update_e820();
1095
1096 return 1;
1097 }
1098
1099 return 0;
1100}
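
The head/holes/top walk above reduces to summing whatever [0, end_pfn) expects but the sorted WB coverage map does not supply; a condensed user-space model of the same accounting (types and sample ranges are illustrative):

#include <stdio.h>

struct res_range { unsigned long start, end; };	/* inclusive pfn range */

static unsigned long trimmed_pages(struct res_range *r, int n,
				   unsigned long end_pfn)
{
	unsigned long lost = 0;
	int i;

	if (r[0].start)					/* the head */
		lost += r[0].start;
	for (i = 0; i < n - 1; i++)			/* the holes */
		if (r[i].end + 1 < r[i + 1].start)
			lost += r[i + 1].start - (r[i].end + 1);
	if (r[n - 1].end + 1 < end_pfn)			/* the top */
		lost += end_pfn - (r[n - 1].end + 1);
	return lost;
}

int main(void)
{
	/* WB covers [0, 0x7ffff] and [0x90000, 0xbffff]; RAM ends at 0xc8000 */
	struct res_range map[] = { { 0, 0x7ffff }, { 0x90000, 0xbffff } };

	printf("lost %lu pages\n", trimmed_pages(map, 2, 0xc8000));
	return 0;
}
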
1101
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0c0a455fe95c..0b776c09aff3 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -33,13 +33,31 @@ u64 mtrr_tom2;
33struct mtrr_state_type mtrr_state = {}; 33struct mtrr_state_type mtrr_state = {};
34EXPORT_SYMBOL_GPL(mtrr_state); 34EXPORT_SYMBOL_GPL(mtrr_state);
35 35
36static int __initdata mtrr_show; 36/**
37static int __init mtrr_debug(char *opt) 37 * BIOS is expected to clear MtrrFixDramModEn bit, see for example
38 * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
39 * Opteron Processors" (26094 Rev. 3.30 February 2006), section
40 * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
 41 * to 1 during BIOS initialization of the fixed MTRRs, then cleared to
42 * 0 for operation."
43 */
44static inline void k8_check_syscfg_dram_mod_en(void)
38{ 45{
39 mtrr_show = 1; 46 u32 lo, hi;
40 return 0; 47
48 if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
49 (boot_cpu_data.x86 >= 0x0f)))
50 return;
51
52 rdmsr(MSR_K8_SYSCFG, lo, hi);
53 if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
54 printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
55 " not cleared by BIOS, clearing this bit\n",
56 smp_processor_id());
57 lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
58 mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
59 }
41} 60}
42early_param("mtrr.show", mtrr_debug);
43 61
44/* 62/*
45 * Returns the effective MTRR type for the region 63 * Returns the effective MTRR type for the region
@@ -174,6 +192,8 @@ get_fixed_ranges(mtrr_type * frs)
174 unsigned int *p = (unsigned int *) frs; 192 unsigned int *p = (unsigned int *) frs;
175 int i; 193 int i;
176 194
195 k8_check_syscfg_dram_mod_en();
196
177 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); 197 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
178 198
179 for (i = 0; i < 2; i++) 199 for (i = 0; i < 2; i++)
@@ -188,18 +208,94 @@ void mtrr_save_fixed_ranges(void *info)
188 get_fixed_ranges(mtrr_state.fixed_ranges); 208 get_fixed_ranges(mtrr_state.fixed_ranges);
189} 209}
190 210
191static void print_fixed(unsigned base, unsigned step, const mtrr_type*types) 211static unsigned __initdata last_fixed_start;
212static unsigned __initdata last_fixed_end;
213static mtrr_type __initdata last_fixed_type;
214
215static void __init print_fixed_last(void)
216{
217 if (!last_fixed_end)
218 return;
219
220 printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start,
221 last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
222
223 last_fixed_end = 0;
224}
225
226static void __init update_fixed_last(unsigned base, unsigned end,
227 mtrr_type type)
228{
229 last_fixed_start = base;
230 last_fixed_end = end;
231 last_fixed_type = type;
232}
233
234static void __init print_fixed(unsigned base, unsigned step,
235 const mtrr_type *types)
192{ 236{
193 unsigned i; 237 unsigned i;
194 238
195 for (i = 0; i < 8; ++i, ++types, base += step) 239 for (i = 0; i < 8; ++i, ++types, base += step) {
196 printk(KERN_INFO "MTRR %05X-%05X %s\n", 240 if (last_fixed_end == 0) {
197 base, base + step - 1, mtrr_attrib_to_str(*types)); 241 update_fixed_last(base, base + step, *types);
242 continue;
243 }
244 if (last_fixed_end == base && last_fixed_type == *types) {
245 last_fixed_end = base + step;
246 continue;
247 }
248 /* new segments: gap or different type */
249 print_fixed_last();
250 update_fixed_last(base, base + step, *types);
251 }
198} 252}
199 253
200static void prepare_set(void); 254static void prepare_set(void);
201static void post_set(void); 255static void post_set(void);
202 256
257static void __init print_mtrr_state(void)
258{
259 unsigned int i;
260 int high_width;
261
262 printk(KERN_DEBUG "MTRR default type: %s\n",
263 mtrr_attrib_to_str(mtrr_state.def_type));
264 if (mtrr_state.have_fixed) {
265 printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n",
266 mtrr_state.enabled & 1 ? "en" : "dis");
267 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
268 for (i = 0; i < 2; ++i)
269 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
270 for (i = 0; i < 8; ++i)
271 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
272
273 /* tail */
274 print_fixed_last();
275 }
276 printk(KERN_DEBUG "MTRR variable ranges %sabled:\n",
277 mtrr_state.enabled & 2 ? "en" : "dis");
278 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
279 for (i = 0; i < num_var_ranges; ++i) {
280 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
281 printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n",
282 i,
283 high_width,
284 mtrr_state.var_ranges[i].base_hi,
285 mtrr_state.var_ranges[i].base_lo >> 12,
286 high_width,
287 mtrr_state.var_ranges[i].mask_hi,
288 mtrr_state.var_ranges[i].mask_lo >> 12,
289 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
290 else
291 printk(KERN_DEBUG " %u disabled\n", i);
292 }
293 if (mtrr_tom2) {
294 printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n",
295 mtrr_tom2, mtrr_tom2>>20);
296 }
297}
298
203/* Grab all of the MTRR state for this CPU into *state */ 299/* Grab all of the MTRR state for this CPU into *state */
204void __init get_mtrr_state(void) 300void __init get_mtrr_state(void)
205{ 301{
@@ -231,41 +327,9 @@ void __init get_mtrr_state(void)
231 mtrr_tom2 |= low; 327 mtrr_tom2 |= low;
232 mtrr_tom2 &= 0xffffff800000ULL; 328 mtrr_tom2 &= 0xffffff800000ULL;
233 } 329 }
234 if (mtrr_show) { 330
235 int high_width; 331 print_mtrr_state();
236 332
237 printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
238 if (mtrr_state.have_fixed) {
239 printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
240 mtrr_state.enabled & 1 ? "en" : "dis");
241 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
242 for (i = 0; i < 2; ++i)
243 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
244 for (i = 0; i < 8; ++i)
245 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
246 }
247 printk(KERN_INFO "MTRR variable ranges %sabled:\n",
248 mtrr_state.enabled & 2 ? "en" : "dis");
249 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
250 for (i = 0; i < num_var_ranges; ++i) {
251 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
252 printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
253 i,
254 high_width,
255 mtrr_state.var_ranges[i].base_hi,
256 mtrr_state.var_ranges[i].base_lo >> 12,
257 high_width,
258 mtrr_state.var_ranges[i].mask_hi,
259 mtrr_state.var_ranges[i].mask_lo >> 12,
260 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
261 else
262 printk(KERN_INFO "MTRR %u disabled\n", i);
263 }
264 if (mtrr_tom2) {
265 printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
266 mtrr_tom2, mtrr_tom2>>20);
267 }
268 }
269 mtrr_state_set = 1; 333 mtrr_state_set = 1;
270 334
271 /* PAT setup for BP. We need to go through sync steps here */ 335 /* PAT setup for BP. We need to go through sync steps here */
@@ -308,27 +372,10 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
308} 372}
309 373
310/** 374/**
311 * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs
312 * see AMD publication no. 24593, chapter 3.2.1 for more information
313 */
314static inline void k8_enable_fixed_iorrs(void)
315{
316 unsigned lo, hi;
317
318 rdmsr(MSR_K8_SYSCFG, lo, hi);
319 mtrr_wrmsr(MSR_K8_SYSCFG, lo
320 | K8_MTRRFIXRANGE_DRAM_ENABLE
321 | K8_MTRRFIXRANGE_DRAM_MODIFY, hi);
322}
323
324/**
325 * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have 375 * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have
326 * @msr: MSR address of the MTTR which should be checked and updated 376 * @msr: MSR address of the MTTR which should be checked and updated
327 * @changed: pointer which indicates whether the MTRR needed to be changed 377 * @changed: pointer which indicates whether the MTRR needed to be changed
328 * @msrwords: pointer to the MSR values which the MSR should have 378 * @msrwords: pointer to the MSR values which the MSR should have
329 *
330 * If K8 extentions are wanted, update the K8 SYSCFG MSR also.
331 * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information.
332 */ 379 */
333static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) 380static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
334{ 381{
@@ -337,10 +384,6 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
337 rdmsr(msr, lo, hi); 384 rdmsr(msr, lo, hi);
338 385
339 if (lo != msrwords[0] || hi != msrwords[1]) { 386 if (lo != msrwords[0] || hi != msrwords[1]) {
340 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
341 (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
342 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
343 k8_enable_fixed_iorrs();
344 mtrr_wrmsr(msr, msrwords[0], msrwords[1]); 387 mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
345 *changed = true; 388 *changed = true;
346 } 389 }
@@ -376,22 +419,31 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
376{ 419{
377 unsigned int mask_lo, mask_hi, base_lo, base_hi; 420 unsigned int mask_lo, mask_hi, base_lo, base_hi;
378 unsigned int tmp, hi; 421 unsigned int tmp, hi;
422 int cpu;
423
424 /*
425 * get_mtrr doesn't need to update mtrr_state, also it could be called
426 * from any cpu, so try to print it out directly.
427 */
428 cpu = get_cpu();
379 429
380 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 430 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
431
381 if ((mask_lo & 0x800) == 0) { 432 if ((mask_lo & 0x800) == 0) {
382 /* Invalid (i.e. free) range */ 433 /* Invalid (i.e. free) range */
383 *base = 0; 434 *base = 0;
384 *size = 0; 435 *size = 0;
385 *type = 0; 436 *type = 0;
386 return; 437 goto out_put_cpu;
387 } 438 }
388 439
389 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); 440 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
390 441
391 /* Work out the shifted address mask. */ 442 /* Work out the shifted address mask: */
392 tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; 443 tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
393 mask_lo = size_or_mask | tmp; 444 mask_lo = size_or_mask | tmp;
394 /* Expand tmp with high bits to all 1s*/ 445
446 /* Expand tmp with high bits to all 1s: */
395 hi = fls(tmp); 447 hi = fls(tmp);
396 if (hi > 0) { 448 if (hi > 0) {
397 tmp |= ~((1<<(hi - 1)) - 1); 449 tmp |= ~((1<<(hi - 1)) - 1);
@@ -402,11 +454,16 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
402 } 454 }
403 } 455 }
404 456
405 /* This works correctly if size is a power of two, i.e. a 457 /*
406 contiguous range. */ 458 * This works correctly if size is a power of two, i.e. a
459 * contiguous range:
460 */
407 *size = -mask_lo; 461 *size = -mask_lo;
408 *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; 462 *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
409 *type = base_lo & 0xff; 463 *type = base_lo & 0xff;
464
465out_put_cpu:
466 put_cpu();
410} 467}
411 468
412/** 469/**
@@ -419,6 +476,8 @@ static int set_fixed_ranges(mtrr_type * frs)
419 bool changed = false; 476 bool changed = false;
420 int block=-1, range; 477 int block=-1, range;
421 478
479 k8_check_syscfg_dram_mod_en();
480
422 while (fixed_range_blocks[++block].ranges) 481 while (fixed_range_blocks[++block].ranges)
423 for (range=0; range < fixed_range_blocks[block].ranges; range++) 482 for (range=0; range < fixed_range_blocks[block].ranges; range++)
424 set_fixed_range(fixed_range_blocks[block].base_msr + range, 483 set_fixed_range(fixed_range_blocks[block].base_msr + range,
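
The print_fixed()/print_fixed_last() rework added above coalesces consecutive fixed-range slots of the same type into one log line instead of one line per slot; a minimal model of the merge logic (the ranges and type letters are made up):

#include <stdio.h>

static unsigned last_start, last_end;
static char last_type;

static void flush_last(void)
{
	if (!last_end)
		return;
	printf("  %05X-%05X %c\n", last_start, last_end - 1, last_type);
	last_end = 0;
}

static void add_slot(unsigned base, unsigned end, char type)
{
	if (last_end == base && last_type == type) {	/* extend the run */
		last_end = end;
		return;
	}
	flush_last();				/* gap or type change */
	last_start = base;
	last_end = end;
	last_type = type;
}

int main(void)
{
	/* illustrative: four 64K fixed slots, three WB then one UC */
	add_slot(0x00000, 0x10000, 'W');
	add_slot(0x10000, 0x20000, 'W');
	add_slot(0x20000, 0x30000, 'W');
	add_slot(0x30000, 0x40000, 'U');
	flush_last();	/* prints 00000-2FFFF W, then 30000-3FFFF U */
	return 0;
}
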
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 4c4214690dd1..fb73a52913a4 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -377,10 +377,6 @@ static const struct file_operations mtrr_fops = {
377 .release = mtrr_close, 377 .release = mtrr_close,
378}; 378};
379 379
380
381static struct proc_dir_entry *proc_root_mtrr;
382
383
384static int mtrr_seq_show(struct seq_file *seq, void *offset) 380static int mtrr_seq_show(struct seq_file *seq, void *offset)
385{ 381{
386 char factor; 382 char factor;
@@ -423,11 +419,7 @@ static int __init mtrr_if_init(void)
423 (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) 419 (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
424 return -ENODEV; 420 return -ENODEV;
425 421
426 proc_root_mtrr = 422 proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
427 proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
428
429 if (proc_root_mtrr)
430 proc_root_mtrr->owner = THIS_MODULE;
431 return 0; 423 return 0;
432} 424}
433 425
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 236a401b8259..03cda01f57c7 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -574,7 +574,7 @@ struct mtrr_value {
574 unsigned long lsize; 574 unsigned long lsize;
575}; 575};
576 576
577static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES]; 577static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
578 578
579static int mtrr_save(struct sys_device * sysdev, pm_message_t state) 579static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
580{ 580{
@@ -582,9 +582,9 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
582 582
583 for (i = 0; i < num_var_ranges; i++) { 583 for (i = 0; i < num_var_ranges; i++) {
584 mtrr_if->get(i, 584 mtrr_if->get(i,
585 &mtrr_state[i].lbase, 585 &mtrr_value[i].lbase,
586 &mtrr_state[i].lsize, 586 &mtrr_value[i].lsize,
587 &mtrr_state[i].ltype); 587 &mtrr_value[i].ltype);
588 } 588 }
589 return 0; 589 return 0;
590} 590}
@@ -594,11 +594,11 @@ static int mtrr_restore(struct sys_device * sysdev)
594 int i; 594 int i;
595 595
596 for (i = 0; i < num_var_ranges; i++) { 596 for (i = 0; i < num_var_ranges; i++) {
597 if (mtrr_state[i].lsize) 597 if (mtrr_value[i].lsize)
598 set_mtrr(i, 598 set_mtrr(i,
599 mtrr_state[i].lbase, 599 mtrr_value[i].lbase,
600 mtrr_state[i].lsize, 600 mtrr_value[i].lsize,
601 mtrr_state[i].ltype); 601 mtrr_value[i].ltype);
602 } 602 }
603 return 0; 603 return 0;
604} 604}
@@ -610,1058 +610,7 @@ static struct sysdev_driver mtrr_sysdev_driver = {
610 .resume = mtrr_restore, 610 .resume = mtrr_restore,
611}; 611};
612 612
613/* should be related to MTRR_VAR_RANGES nums */ 613int __initdata changed_by_mtrr_cleanup;
614#define RANGE_NUM 256
615
616struct res_range {
617 unsigned long start;
618 unsigned long end;
619};
620
621static int __init
622add_range(struct res_range *range, int nr_range, unsigned long start,
623 unsigned long end)
624{
625 /* out of slots */
626 if (nr_range >= RANGE_NUM)
627 return nr_range;
628
629 range[nr_range].start = start;
630 range[nr_range].end = end;
631
632 nr_range++;
633
634 return nr_range;
635}
636
637static int __init
638add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
639 unsigned long end)
640{
641 int i;
642
643 /* try to merge it with old one */
644 for (i = 0; i < nr_range; i++) {
645 unsigned long final_start, final_end;
646 unsigned long common_start, common_end;
647
648 if (!range[i].end)
649 continue;
650
651 common_start = max(range[i].start, start);
652 common_end = min(range[i].end, end);
653 if (common_start > common_end + 1)
654 continue;
655
656 final_start = min(range[i].start, start);
657 final_end = max(range[i].end, end);
658
659 range[i].start = final_start;
660 range[i].end = final_end;
661 return nr_range;
662 }
663
664 /* need to add that */
665 return add_range(range, nr_range, start, end);
666}
667
668static void __init
669subtract_range(struct res_range *range, unsigned long start, unsigned long end)
670{
671 int i, j;
672
673 for (j = 0; j < RANGE_NUM; j++) {
674 if (!range[j].end)
675 continue;
676
677 if (start <= range[j].start && end >= range[j].end) {
678 range[j].start = 0;
679 range[j].end = 0;
680 continue;
681 }
682
683 if (start <= range[j].start && end < range[j].end &&
684 range[j].start < end + 1) {
685 range[j].start = end + 1;
686 continue;
687 }
688
689
690 if (start > range[j].start && end >= range[j].end &&
691 range[j].end > start - 1) {
692 range[j].end = start - 1;
693 continue;
694 }
695
696 if (start > range[j].start && end < range[j].end) {
697 /* find the new spare */
698 for (i = 0; i < RANGE_NUM; i++) {
699 if (range[i].end == 0)
700 break;
701 }
702 if (i < RANGE_NUM) {
703 range[i].end = range[j].end;
704 range[i].start = end + 1;
705 } else {
706 printk(KERN_ERR "run of slot in ranges\n");
707 }
708 range[j].end = start - 1;
709 continue;
710 }
711 }
712}
713
714static int __init cmp_range(const void *x1, const void *x2)
715{
716 const struct res_range *r1 = x1;
717 const struct res_range *r2 = x2;
718 long start1, start2;
719
720 start1 = r1->start;
721 start2 = r2->start;
722
723 return start1 - start2;
724}
725
726struct var_mtrr_range_state {
727 unsigned long base_pfn;
728 unsigned long size_pfn;
729 mtrr_type type;
730};
731
732static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
733static int __initdata debug_print;
734
735static int __init
736x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
737 unsigned long extra_remove_base,
738 unsigned long extra_remove_size)
739{
740 unsigned long i, base, size;
741 mtrr_type type;
742
743 for (i = 0; i < num_var_ranges; i++) {
744 type = range_state[i].type;
745 if (type != MTRR_TYPE_WRBACK)
746 continue;
747 base = range_state[i].base_pfn;
748 size = range_state[i].size_pfn;
749 nr_range = add_range_with_merge(range, nr_range, base,
750 base + size - 1);
751 }
752 if (debug_print) {
753 printk(KERN_DEBUG "After WB checking\n");
754 for (i = 0; i < nr_range; i++)
755 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
756 range[i].start, range[i].end + 1);
757 }
758
759 /* take out UC ranges */
760 for (i = 0; i < num_var_ranges; i++) {
761 type = range_state[i].type;
762 if (type != MTRR_TYPE_UNCACHABLE &&
763 type != MTRR_TYPE_WRPROT)
764 continue;
765 size = range_state[i].size_pfn;
766 if (!size)
767 continue;
768 base = range_state[i].base_pfn;
769 subtract_range(range, base, base + size - 1);
770 }
771 if (extra_remove_size)
772 subtract_range(range, extra_remove_base,
773 extra_remove_base + extra_remove_size - 1);
774
775 /* get new range num */
776 nr_range = 0;
777 for (i = 0; i < RANGE_NUM; i++) {
778 if (!range[i].end)
779 continue;
780 nr_range++;
781 }
782 if (debug_print) {
783 printk(KERN_DEBUG "After UC checking\n");
784 for (i = 0; i < nr_range; i++)
785 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
786 range[i].start, range[i].end + 1);
787 }
788
789 /* sort the ranges */
790 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
791 if (debug_print) {
792 printk(KERN_DEBUG "After sorting\n");
793 for (i = 0; i < nr_range; i++)
794 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
795 range[i].start, range[i].end + 1);
796 }
797
798 /* clear those is not used */
799 for (i = nr_range; i < RANGE_NUM; i++)
800 memset(&range[i], 0, sizeof(range[i]));
801
802 return nr_range;
803}
804
805static struct res_range __initdata range[RANGE_NUM];
806static int __initdata nr_range;
807
808#ifdef CONFIG_MTRR_SANITIZER
809
810static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
811{
812 unsigned long sum;
813 int i;
814
815 sum = 0;
816 for (i = 0; i < nr_range; i++)
817 sum += range[i].end + 1 - range[i].start;
818
819 return sum;
820}
821
822static int enable_mtrr_cleanup __initdata =
823 CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
824
825static int __init disable_mtrr_cleanup_setup(char *str)
826{
827 enable_mtrr_cleanup = 0;
828 return 0;
829}
830early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
831
832static int __init enable_mtrr_cleanup_setup(char *str)
833{
834 enable_mtrr_cleanup = 1;
835 return 0;
836}
837early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
838
839static int __init mtrr_cleanup_debug_setup(char *str)
840{
841 debug_print = 1;
842 return 0;
843}
844early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
845
846struct var_mtrr_state {
847 unsigned long range_startk;
848 unsigned long range_sizek;
849 unsigned long chunk_sizek;
850 unsigned long gran_sizek;
851 unsigned int reg;
852};
853
854static void __init
855set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
856 unsigned char type, unsigned int address_bits)
857{
858 u32 base_lo, base_hi, mask_lo, mask_hi;
859 u64 base, mask;
860
861 if (!sizek) {
862 fill_mtrr_var_range(reg, 0, 0, 0, 0);
863 return;
864 }
865
866 mask = (1ULL << address_bits) - 1;
867 mask &= ~((((u64)sizek) << 10) - 1);
868
869 base = ((u64)basek) << 10;
870
871 base |= type;
872 mask |= 0x800;
873
874 base_lo = base & ((1ULL<<32) - 1);
875 base_hi = base >> 32;
876
877 mask_lo = mask & ((1ULL<<32) - 1);
878 mask_hi = mask >> 32;
879
880 fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
881}
882
883static void __init
884save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
885 unsigned char type)
886{
887 range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
888 range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
889 range_state[reg].type = type;
890}
891
892static void __init
893set_var_mtrr_all(unsigned int address_bits)
894{
895 unsigned long basek, sizek;
896 unsigned char type;
897 unsigned int reg;
898
899 for (reg = 0; reg < num_var_ranges; reg++) {
900 basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
901 sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
902 type = range_state[reg].type;
903
904 set_var_mtrr(reg, basek, sizek, type, address_bits);
905 }
906}
907
908static unsigned long to_size_factor(unsigned long sizek, char *factorp)
909{
910 char factor;
911 unsigned long base = sizek;
912
913 if (base & ((1<<10) - 1)) {
914 /* not MB alignment */
915 factor = 'K';
916 } else if (base & ((1<<20) - 1)){
917 factor = 'M';
918 base >>= 10;
919 } else {
920 factor = 'G';
921 base >>= 20;
922 }
923
924 *factorp = factor;
925
926 return base;
927}
928
929static unsigned int __init
930range_to_mtrr(unsigned int reg, unsigned long range_startk,
931 unsigned long range_sizek, unsigned char type)
932{
933 if (!range_sizek || (reg >= num_var_ranges))
934 return reg;
935
936 while (range_sizek) {
937 unsigned long max_align, align;
938 unsigned long sizek;
939
940 /* Compute the maximum size I can make a range */
941 if (range_startk)
942 max_align = ffs(range_startk) - 1;
943 else
944 max_align = 32;
945 align = fls(range_sizek) - 1;
946 if (align > max_align)
947 align = max_align;
948
949 sizek = 1 << align;
950 if (debug_print) {
951 char start_factor = 'K', size_factor = 'K';
952 unsigned long start_base, size_base;
953
954 start_base = to_size_factor(range_startk, &start_factor),
955 size_base = to_size_factor(sizek, &size_factor),
956
957 printk(KERN_DEBUG "Setting variable MTRR %d, "
958 "base: %ld%cB, range: %ld%cB, type %s\n",
959 reg, start_base, start_factor,
960 size_base, size_factor,
961 (type == MTRR_TYPE_UNCACHABLE)?"UC":
962 ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
963 );
964 }
965 save_var_mtrr(reg++, range_startk, sizek, type);
966 range_startk += sizek;
967 range_sizek -= sizek;
968 if (reg >= num_var_ranges)
969 break;
970 }
971 return reg;
972}
973
974static unsigned __init
975range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
976 unsigned long sizek)
977{
978 unsigned long hole_basek, hole_sizek;
979 unsigned long second_basek, second_sizek;
980 unsigned long range0_basek, range0_sizek;
981 unsigned long range_basek, range_sizek;
982 unsigned long chunk_sizek;
983 unsigned long gran_sizek;
984
985 hole_basek = 0;
986 hole_sizek = 0;
987 second_basek = 0;
988 second_sizek = 0;
989 chunk_sizek = state->chunk_sizek;
990 gran_sizek = state->gran_sizek;
991
992 /* align with gran size, prevent small block used up MTRRs */
993 range_basek = ALIGN(state->range_startk, gran_sizek);
994 if ((range_basek > basek) && basek)
995 return second_sizek;
996 state->range_sizek -= (range_basek - state->range_startk);
997 range_sizek = ALIGN(state->range_sizek, gran_sizek);
998
999 while (range_sizek > state->range_sizek) {
1000 range_sizek -= gran_sizek;
1001 if (!range_sizek)
1002 return 0;
1003 }
1004 state->range_sizek = range_sizek;
1005
1006 /* try to append some small hole */
1007 range0_basek = state->range_startk;
1008 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
1009
1010 /* no increase */
1011 if (range0_sizek == state->range_sizek) {
1012 if (debug_print)
1013 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
1014 range0_basek<<10,
1015 (range0_basek + state->range_sizek)<<10);
1016 state->reg = range_to_mtrr(state->reg, range0_basek,
1017 state->range_sizek, MTRR_TYPE_WRBACK);
1018 return 0;
1019 }
1020
1021 /* only cut back, when it is not the last */
1022 if (sizek) {
1023 while (range0_basek + range0_sizek > (basek + sizek)) {
1024 if (range0_sizek >= chunk_sizek)
1025 range0_sizek -= chunk_sizek;
1026 else
1027 range0_sizek = 0;
1028
1029 if (!range0_sizek)
1030 break;
1031 }
1032 }
1033
1034second_try:
1035 range_basek = range0_basek + range0_sizek;
1036
1037 /* one hole in the middle */
1038 if (range_basek > basek && range_basek <= (basek + sizek))
1039 second_sizek = range_basek - basek;
1040
1041 if (range0_sizek > state->range_sizek) {
1042
1043 /* one hole in middle or at end */
1044 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
1045
1046 /* hole size should be less than half of range0 size */
1047 if (hole_sizek >= (range0_sizek >> 1) &&
1048 range0_sizek >= chunk_sizek) {
1049 range0_sizek -= chunk_sizek;
1050 second_sizek = 0;
1051 hole_sizek = 0;
1052
1053 goto second_try;
1054 }
1055 }
1056
1057 if (range0_sizek) {
1058 if (debug_print)
1059 printk(KERN_DEBUG "range0: %016lx - %016lx\n",
1060 range0_basek<<10,
1061 (range0_basek + range0_sizek)<<10);
1062 state->reg = range_to_mtrr(state->reg, range0_basek,
1063 range0_sizek, MTRR_TYPE_WRBACK);
1064 }
1065
1066 if (range0_sizek < state->range_sizek) {
1067 /* need to handle left over */
1068 range_sizek = state->range_sizek - range0_sizek;
1069
1070 if (debug_print)
1071 printk(KERN_DEBUG "range: %016lx - %016lx\n",
1072 range_basek<<10,
1073 (range_basek + range_sizek)<<10);
1074 state->reg = range_to_mtrr(state->reg, range_basek,
1075 range_sizek, MTRR_TYPE_WRBACK);
1076 }
1077
1078 if (hole_sizek) {
1079 hole_basek = range_basek - hole_sizek - second_sizek;
1080 if (debug_print)
1081 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
1082 hole_basek<<10,
1083 (hole_basek + hole_sizek)<<10);
1084 state->reg = range_to_mtrr(state->reg, hole_basek,
1085 hole_sizek, MTRR_TYPE_UNCACHABLE);
1086 }
1087
1088 return second_sizek;
1089}
1090
1091static void __init
1092set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
1093 unsigned long size_pfn)
1094{
1095 unsigned long basek, sizek;
1096 unsigned long second_sizek = 0;
1097
1098 if (state->reg >= num_var_ranges)
1099 return;
1100
1101 basek = base_pfn << (PAGE_SHIFT - 10);
1102 sizek = size_pfn << (PAGE_SHIFT - 10);
1103
1104 /* See if I can merge with the last range */
1105 if ((basek <= 1024) ||
1106 (state->range_startk + state->range_sizek == basek)) {
1107 unsigned long endk = basek + sizek;
1108 state->range_sizek = endk - state->range_startk;
1109 return;
1110 }
1111 /* Write the range mtrrs */
1112 if (state->range_sizek != 0)
1113 second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
1114
1115 /* Allocate an msr */
1116 state->range_startk = basek + second_sizek;
1117 state->range_sizek = sizek - second_sizek;
1118}
1119
1120/* mininum size of mtrr block that can take hole */
1121static u64 mtrr_chunk_size __initdata = (256ULL<<20);
1122
1123static int __init parse_mtrr_chunk_size_opt(char *p)
1124{
1125 if (!p)
1126 return -EINVAL;
1127 mtrr_chunk_size = memparse(p, &p);
1128 return 0;
1129}
1130early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
1131
1132/* granity of mtrr of block */
1133static u64 mtrr_gran_size __initdata;
1134
1135static int __init parse_mtrr_gran_size_opt(char *p)
1136{
1137 if (!p)
1138 return -EINVAL;
1139 mtrr_gran_size = memparse(p, &p);
1140 return 0;
1141}
1142early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
1143
1144static int nr_mtrr_spare_reg __initdata =
1145 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
1146
1147static int __init parse_mtrr_spare_reg(char *arg)
1148{
1149 if (arg)
1150 nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
1151 return 0;
1152}
1153
1154early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
1155
1156static int __init
1157x86_setup_var_mtrrs(struct res_range *range, int nr_range,
1158 u64 chunk_size, u64 gran_size)
1159{
1160 struct var_mtrr_state var_state;
1161 int i;
1162 int num_reg;
1163
1164 var_state.range_startk = 0;
1165 var_state.range_sizek = 0;
1166 var_state.reg = 0;
1167 var_state.chunk_sizek = chunk_size >> 10;
1168 var_state.gran_sizek = gran_size >> 10;
1169
1170 memset(range_state, 0, sizeof(range_state));
1171
1172 /* Write the range etc */
1173 for (i = 0; i < nr_range; i++)
1174 set_var_mtrr_range(&var_state, range[i].start,
1175 range[i].end - range[i].start + 1);
1176
1177 /* Write the last range */
1178 if (var_state.range_sizek != 0)
1179 range_to_mtrr_with_hole(&var_state, 0, 0);
1180
1181 num_reg = var_state.reg;
1182 /* Clear out the extra MTRR's */
1183 while (var_state.reg < num_var_ranges) {
1184 save_var_mtrr(var_state.reg, 0, 0, 0);
1185 var_state.reg++;
1186 }
1187
1188 return num_reg;
1189}
1190
1191struct mtrr_cleanup_result {
1192 unsigned long gran_sizek;
1193 unsigned long chunk_sizek;
1194 unsigned long lose_cover_sizek;
1195 unsigned int num_reg;
1196 int bad;
1197};
1198
1199/*
1200 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
1201 * chunk size: gran_size, ..., 2G
1202 * so we need (1+16)*8
1203 */
1204#define NUM_RESULT 136
1205#define PSHIFT (PAGE_SHIFT - 10)
1206
1207static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
1208static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1209
1210static void __init print_out_mtrr_range_state(void)
1211{
1212 int i;
1213 char start_factor = 'K', size_factor = 'K';
1214 unsigned long start_base, size_base;
1215 mtrr_type type;
1216
1217 for (i = 0; i < num_var_ranges; i++) {
1218
1219 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1220 if (!size_base)
1221 continue;
1222
1223 size_base = to_size_factor(size_base, &size_factor),
1224 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1225 start_base = to_size_factor(start_base, &start_factor),
1226 type = range_state[i].type;
1227
1228 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1229 i, start_base, start_factor,
1230 size_base, size_factor,
1231 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1232 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1233 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1234 );
1235 }
1236}
1237
1238static int __init mtrr_need_cleanup(void)
1239{
1240 int i;
1241 mtrr_type type;
1242 unsigned long size;
1243 /* extra one for all 0 */
1244 int num[MTRR_NUM_TYPES + 1];
1245
1246 /* check entries number */
1247 memset(num, 0, sizeof(num));
1248 for (i = 0; i < num_var_ranges; i++) {
1249 type = range_state[i].type;
1250 size = range_state[i].size_pfn;
1251 if (type >= MTRR_NUM_TYPES)
1252 continue;
1253 if (!size)
1254 type = MTRR_NUM_TYPES;
1255 if (type == MTRR_TYPE_WRPROT)
1256 type = MTRR_TYPE_UNCACHABLE;
1257 num[type]++;
1258 }
1259
1260 /* check if we got UC entries */
1261 if (!num[MTRR_TYPE_UNCACHABLE])
1262 return 0;
1263
1264 /* check if we only had WB and UC */
1265 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1266 num_var_ranges - num[MTRR_NUM_TYPES])
1267 return 0;
1268
1269 return 1;
1270}
1271
1272static unsigned long __initdata range_sums;
1273static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
1274 unsigned long extra_remove_base,
1275 unsigned long extra_remove_size,
1276 int i)
1277{
1278 int num_reg;
1279 static struct res_range range_new[RANGE_NUM];
1280 static int nr_range_new;
1281 unsigned long range_sums_new;
1282
1283 /* convert ranges to var ranges state */
1284 num_reg = x86_setup_var_mtrrs(range, nr_range,
1285 chunk_size, gran_size);
1286
1287 /* we got new setting in range_state, check it */
1288 memset(range_new, 0, sizeof(range_new));
1289 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1290 extra_remove_base, extra_remove_size);
1291 range_sums_new = sum_ranges(range_new, nr_range_new);
1292
1293 result[i].chunk_sizek = chunk_size >> 10;
1294 result[i].gran_sizek = gran_size >> 10;
1295 result[i].num_reg = num_reg;
1296 if (range_sums < range_sums_new) {
1297 result[i].lose_cover_sizek =
1298 (range_sums_new - range_sums) << PSHIFT;
1299 result[i].bad = 1;
1300 } else
1301 result[i].lose_cover_sizek =
1302 (range_sums - range_sums_new) << PSHIFT;
1303
1304 /* double check it */
1305 if (!result[i].bad && !result[i].lose_cover_sizek) {
1306 if (nr_range_new != nr_range ||
1307 memcmp(range, range_new, sizeof(range)))
1308 result[i].bad = 1;
1309 }
1310
1311 if (!result[i].bad && (range_sums - range_sums_new <
1312 min_loss_pfn[num_reg])) {
1313 min_loss_pfn[num_reg] =
1314 range_sums - range_sums_new;
1315 }
1316}
1317
1318static void __init mtrr_print_out_one_result(int i)
1319{
1320 char gran_factor, chunk_factor, lose_factor;
1321 unsigned long gran_base, chunk_base, lose_base;
1322
1323 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1324 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1325 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1326 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1327 result[i].bad ? "*BAD*" : " ",
1328 gran_base, gran_factor, chunk_base, chunk_factor);
1329 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1330 result[i].num_reg, result[i].bad ? "-" : "",
1331 lose_base, lose_factor);
1332}
1333
1334static int __init mtrr_search_optimal_index(void)
1335{
1336 int i;
1337 int num_reg_good;
1338 int index_good;
1339
1340 if (nr_mtrr_spare_reg >= num_var_ranges)
1341 nr_mtrr_spare_reg = num_var_ranges - 1;
1342 num_reg_good = -1;
1343 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1344 if (!min_loss_pfn[i])
1345 num_reg_good = i;
1346 }
1347
1348 index_good = -1;
1349 if (num_reg_good != -1) {
1350 for (i = 0; i < NUM_RESULT; i++) {
1351 if (!result[i].bad &&
1352 result[i].num_reg == num_reg_good &&
1353 !result[i].lose_cover_sizek) {
1354 index_good = i;
1355 break;
1356 }
1357 }
1358 }
1359
1360 return index_good;
1361}
1362
1363
1364static int __init mtrr_cleanup(unsigned address_bits)
1365{
1366 unsigned long extra_remove_base, extra_remove_size;
1367 unsigned long base, size, def, dummy;
1368 mtrr_type type;
1369 u64 chunk_size, gran_size;
1370 int index_good;
1371 int i;
1372
1373 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
1374 return 0;
1375 rdmsr(MTRRdefType_MSR, def, dummy);
1376 def &= 0xff;
1377 if (def != MTRR_TYPE_UNCACHABLE)
1378 return 0;
1379
1380 /* get it and store it aside */
1381 memset(range_state, 0, sizeof(range_state));
1382 for (i = 0; i < num_var_ranges; i++) {
1383 mtrr_if->get(i, &base, &size, &type);
1384 range_state[i].base_pfn = base;
1385 range_state[i].size_pfn = size;
1386 range_state[i].type = type;
1387 }
1388
1389 /* check if we need handle it and can handle it */
1390 if (!mtrr_need_cleanup())
1391 return 0;
1392
1393 /* print out the original variable MTRRs first, for debugging: */
1394 printk(KERN_DEBUG "original variable MTRRs\n");
1395 print_out_mtrr_range_state();
1396
1397 memset(range, 0, sizeof(range));
1398 extra_remove_size = 0;
1399 extra_remove_base = 1 << (32 - PAGE_SHIFT);
1400 if (mtrr_tom2)
1401 extra_remove_size =
1402 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
1403 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
1404 extra_remove_size);
1405 /*
1406 * [0, 1M) should always be covered by a WB variable MTRR,
1407 * and fixed MTRRs take effect before variable MTRRs there
1408 */
1409 nr_range = add_range_with_merge(range, nr_range, 0,
1410 (1ULL<<(20 - PAGE_SHIFT)) - 1);
1411 /* sort the ranges */
1412 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
1413
1414 range_sums = sum_ranges(range, nr_range);
1415 printk(KERN_INFO "total RAM covered: %ldM\n",
1416 range_sums >> (20 - PAGE_SHIFT));
1417
1418 if (mtrr_chunk_size && mtrr_gran_size) {
1419 i = 0;
1420 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
1421 extra_remove_base, extra_remove_size, i);
1422
1423 mtrr_print_out_one_result(i);
1424
1425 if (!result[i].bad) {
1426 set_var_mtrr_all(address_bits);
1427 return 1;
1428 }
1429 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1430 "will search for an optimal one\n");
1431 }
1432
1433 i = 0;
1434 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1435 memset(result, 0, sizeof(result));
1436 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1437
1438 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1439 chunk_size <<= 1) {
1440
1441 if (i >= NUM_RESULT)
1442 continue;
1443
1444 mtrr_calc_range_state(chunk_size, gran_size,
1445 extra_remove_base, extra_remove_size, i);
1446 if (debug_print) {
1447 mtrr_print_out_one_result(i);
1448 printk(KERN_INFO "\n");
1449 }
1450
1451 i++;
1452 }
1453 }
1454
1455 /* try to find the optimal index */
1456 index_good = mtrr_search_optimal_index();
1457
1458 if (index_good != -1) {
1459 printk(KERN_INFO "Found optimal setting for mtrr cleanup\n");
1460 i = index_good;
1461 mtrr_print_out_one_result(i);
1462
1463 /* convert ranges to var ranges state */
1464 chunk_size = result[i].chunk_sizek;
1465 chunk_size <<= 10;
1466 gran_size = result[i].gran_sizek;
1467 gran_size <<= 10;
1468 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1469 set_var_mtrr_all(address_bits);
1470 printk(KERN_DEBUG "New variable MTRRs\n");
1471 print_out_mtrr_range_state();
1472 return 1;
1473 } else {
1474 /* print out all */
1475 for (i = 0; i < NUM_RESULT; i++)
1476 mtrr_print_out_one_result(i);
1477 }
1478
1479 printk(KERN_INFO "mtrr_cleanup: cannot find an optimal value\n");
1480 printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
1481
1482 return 0;
1483}
1484#else
1485static int __init mtrr_cleanup(unsigned address_bits)
1486{
1487 return 0;
1488}
1489#endif
1490
1491static int __initdata changed_by_mtrr_cleanup;
1492
1493static int disable_mtrr_trim;
1494
1495static int __init disable_mtrr_trim_setup(char *str)
1496{
1497 disable_mtrr_trim = 1;
1498 return 0;
1499}
1500early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
1501
1502/*
1503 * Newer AMD K8s and later CPUs have a special magic MSR bit to force WB
1504 * for memory >4GB. Check for that here.
1505 * Note this won't check whether the MTRRs below 4GB, where the magic bit
1506 * doesn't apply, are wrong; so far we don't know of any such case in the wild.
1507 */
1508#define Tom2Enabled (1U << 21)
1509#define Tom2ForceMemTypeWB (1U << 22)
1510
1511int __init amd_special_default_mtrr(void)
1512{
1513 u32 l, h;
1514
1515 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
1516 return 0;
1517 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
1518 return 0;
1519 /* In case some hypervisor doesn't pass SYSCFG through */
1520 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
1521 return 0;
1522 /*
1523 * Memory between 4GB and top of mem is forced WB by this magic bit.
1524 * Reserved before K8RevF, but should be zero there.
1525 */
1526 if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
1527 (Tom2Enabled | Tom2ForceMemTypeWB))
1528 return 1;
1529 return 0;
1530}
1531
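Worked bit check for the function above, with illustrative MSR values: Tom2Enabled | Tom2ForceMemTypeWB == (1 << 21) | (1 << 22) == 0x00600000, so l == 0x00600000 satisfies the mask equality and the function returns 1, while l == 0x00200000 (Tom2Enabled alone) fails it and the function returns 0.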
1532static u64 __init real_trim_memory(unsigned long start_pfn,
1533 unsigned long limit_pfn)
1534{
1535 u64 trim_start, trim_size;
1536 trim_start = start_pfn;
1537 trim_start <<= PAGE_SHIFT;
1538 trim_size = limit_pfn;
1539 trim_size <<= PAGE_SHIFT;
1540 trim_size -= trim_start;
1541
1542 return e820_update_range(trim_start, trim_size, E820_RAM,
1543 E820_RESERVED);
1544}
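A worked example of the pfn-to-byte conversion above, assuming 4 KiB pages (PAGE_SHIFT == 12) and hypothetical inputs:

/* trim pfns [0x100000, 0x140000):
 *   trim_start = 0x100000 << 12 = 0x100000000 (4 GiB)
 *   trim_size  = (0x140000 << 12) - trim_start = 0x40000000 (1 GiB)
 * so 1 GiB starting at 4 GiB is flipped from E820_RAM to E820_RESERVED. */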
1545/**
1546 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
1547 * @end_pfn: ending page frame number
1548 *
1549 * Some buggy BIOSes don't set up the MTRRs properly for systems with certain
1550 * memory configurations. This routine checks that the highest MTRR matches
1551 * the end of memory, to make sure that write-back MTRRs cover all of the
1552 * memory the kernel intends to use. If not, it trims any memory off the end
1553 * by adjusting end_pfn, removing it from the kernel's allocation pools, and
1554 * warns the user with an obnoxious message.
1555 */
1556int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1557{
1558 unsigned long i, base, size, highest_pfn = 0, def, dummy;
1559 mtrr_type type;
1560 u64 total_trim_size;
1561
1562 /* one extra slot counts entries with size 0 */
1563 int num[MTRR_NUM_TYPES + 1];
1564 /*
1565 * Make sure we only trim uncachable memory on machines that
1566 * support the Intel MTRR architecture:
1567 */
1568 if (!is_cpu(INTEL) || disable_mtrr_trim)
1569 return 0;
1570 rdmsr(MTRRdefType_MSR, def, dummy);
1571 def &= 0xff;
1572 if (def != MTRR_TYPE_UNCACHABLE)
1573 return 0;
1574
1575 /* get it and store it aside */
1576 memset(range_state, 0, sizeof(range_state));
1577 for (i = 0; i < num_var_ranges; i++) {
1578 mtrr_if->get(i, &base, &size, &type);
1579 range_state[i].base_pfn = base;
1580 range_state[i].size_pfn = size;
1581 range_state[i].type = type;
1582 }
1583
1584 /* Find highest cached pfn */
1585 for (i = 0; i < num_var_ranges; i++) {
1586 type = range_state[i].type;
1587 if (type != MTRR_TYPE_WRBACK)
1588 continue;
1589 base = range_state[i].base_pfn;
1590 size = range_state[i].size_pfn;
1591 if (highest_pfn < base + size)
1592 highest_pfn = base + size;
1593 }
1594
1595 /* kvm/qemu may leave the MTRRs unset; don't trim all of memory */
1596 if (!highest_pfn) {
1597 printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
1598 return 0;
1599 }
1600
1601 /* count the entries of each type */
1602 memset(num, 0, sizeof(num));
1603 for (i = 0; i < num_var_ranges; i++) {
1604 type = range_state[i].type;
1605 if (type >= MTRR_NUM_TYPES)
1606 continue;
1607 size = range_state[i].size_pfn;
1608 if (!size)
1609 type = MTRR_NUM_TYPES;
1610 num[type]++;
1611 }
1612
1613 /* no entry for WB? */
1614 if (!num[MTRR_TYPE_WRBACK])
1615 return 0;
1616
1617 /* check if we only had WB and UC */
1618 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1619 num_var_ranges - num[MTRR_NUM_TYPES])
1620 return 0;
1621
1622 memset(range, 0, sizeof(range));
1623 nr_range = 0;
1624 if (mtrr_tom2) {
1625 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1626 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
1627 if (highest_pfn < range[nr_range].end + 1)
1628 highest_pfn = range[nr_range].end + 1;
1629 nr_range++;
1630 }
1631 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
1632
1633 total_trim_size = 0;
1634 /* check the head */
1635 if (range[0].start)
1636 total_trim_size += real_trim_memory(0, range[0].start);
1637 /* check the holes */
1638 for (i = 0; i < nr_range - 1; i++) {
1639 if (range[i].end + 1 < range[i+1].start)
1640 total_trim_size += real_trim_memory(range[i].end + 1,
1641 range[i+1].start);
1642 }
1643 /* check the top */
1644 i = nr_range - 1;
1645 if (range[i].end + 1 < end_pfn)
1646 total_trim_size += real_trim_memory(range[i].end + 1,
1647 end_pfn);
1648
1649 if (total_trim_size) {
1650 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
1651 " all of memory, losing %lluMB of RAM.\n",
1652 total_trim_size >> 20);
1653
1654 if (!changed_by_mtrr_cleanup)
1655 WARN_ON(1);
1656
1657 printk(KERN_INFO "update e820 for mtrr\n");
1658 update_e820();
1659
1660 return 1;
1661 }
1662
1663 return 0;
1664}
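A hedged sketch of how the boot path consumes the return value (the caller below is an assumption modeled on the x86 setup code, not part of this diff): when trimming changed the e820 map, the end-of-RAM pfn has to be recomputed from the updated map.

	/* assumed caller shape: */
	if (mtrr_trim_uncached_memory(max_pfn))
		max_pfn = e820_end_of_ram_pfn();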
1665 614
1666/** 615/**
1667 * mtrr_bp_init - initialize mtrrs on the boot CPU 616 * mtrr_bp_init - initialize mtrrs on the boot CPU
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index ffd60409cc6d..77f67f7b347a 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -79,6 +79,7 @@ extern struct mtrr_ops * mtrr_if;
79 79
80extern unsigned int num_var_ranges; 80extern unsigned int num_var_ranges;
81extern u64 mtrr_tom2; 81extern u64 mtrr_tom2;
82extern struct mtrr_state_type mtrr_state;
82 83
83void mtrr_state_warn(void); 84void mtrr_state_warn(void);
84const char *mtrr_attrib_to_str(int x); 85const char *mtrr_attrib_to_str(int x);
@@ -88,3 +89,6 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned);
88int amd_init_mtrr(void); 89int amd_init_mtrr(void);
89int cyrix_init_mtrr(void); 90int cyrix_init_mtrr(void);
90int centaur_init_mtrr(void); 91int centaur_init_mtrr(void);
92
93extern int changed_by_mtrr_cleanup;
94extern int mtrr_cleanup(unsigned address_bits);
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9abd48b22674..f6c70a164e32 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,7 +19,7 @@
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/apic.h> 22#include <asm/genapic.h>
23#include <asm/intel_arch_perfmon.h> 23#include <asm/intel_arch_perfmon.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 01b1244ef1c0..f93047fed791 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -7,15 +7,14 @@
7/* 7/*
8 * Get CPU information for use by the procfs. 8 * Get CPU information for use by the procfs.
9 */ 9 */
10#ifdef CONFIG_X86_32
11static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, 10static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
12 unsigned int cpu) 11 unsigned int cpu)
13{ 12{
14#ifdef CONFIG_X86_HT 13#ifdef CONFIG_SMP
15 if (c->x86_max_cores * smp_num_siblings > 1) { 14 if (c->x86_max_cores * smp_num_siblings > 1) {
16 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); 15 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
17 seq_printf(m, "siblings\t: %d\n", 16 seq_printf(m, "siblings\t: %d\n",
18 cpus_weight(per_cpu(cpu_core_map, cpu))); 17 cpumask_weight(cpu_sibling_mask(cpu)));
19 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); 18 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
20 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); 19 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
21 seq_printf(m, "apicid\t\t: %d\n", c->apicid); 20 seq_printf(m, "apicid\t\t: %d\n", c->apicid);
@@ -24,6 +23,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
24#endif 23#endif
25} 24}
26 25
26#ifdef CONFIG_X86_32
27static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) 27static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
28{ 28{
29 /* 29 /*
@@ -50,22 +50,6 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
50 c->wp_works_ok ? "yes" : "no"); 50 c->wp_works_ok ? "yes" : "no");
51} 51}
52#else 52#else
53static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
54 unsigned int cpu)
55{
56#ifdef CONFIG_SMP
57 if (c->x86_max_cores * smp_num_siblings > 1) {
58 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
59 seq_printf(m, "siblings\t: %d\n",
60 cpus_weight(per_cpu(cpu_core_map, cpu)));
61 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
62 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
63 seq_printf(m, "apicid\t\t: %d\n", c->apicid);
64 seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
65 }
66#endif
67}
68
69static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) 53static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
70{ 54{
71 seq_printf(m, 55 seq_printf(m,
@@ -159,9 +143,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
159static void *c_start(struct seq_file *m, loff_t *pos) 143static void *c_start(struct seq_file *m, loff_t *pos)
160{ 144{
161 if (*pos == 0) /* just in case, cpu 0 is not the first */ 145 if (*pos == 0) /* just in case, cpu 0 is not the first */
162 *pos = first_cpu(cpu_online_map); 146 *pos = cpumask_first(cpu_online_mask);
163 else 147 else
164 *pos = next_cpu_nr(*pos - 1, cpu_online_map); 148 *pos = cpumask_next(*pos - 1, cpu_online_mask);
165 if ((*pos) < nr_cpu_ids) 149 if ((*pos) < nr_cpu_ids)
166 return &cpu_data(*pos); 150 return &cpu_data(*pos);
167 return NULL; 151 return NULL;
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 52b3fefbd5af..bb62b3e5caad 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -98,7 +98,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
98#endif 98#endif
99} 99}
100 100
101static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { 101static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = {
102 .c_vendor = "Transmeta", 102 .c_vendor = "Transmeta",
103 .c_ident = { "GenuineTMx86", "TransmetaCPU" }, 103 .c_ident = { "GenuineTMx86", "TransmetaCPU" },
104 .c_early_init = early_init_transmeta, 104 .c_early_init = early_init_transmeta,
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index e777f79e0960..fd2c37bf7acb 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -8,7 +8,7 @@
8 * so no special init takes place. 8 * so no special init takes place.
9 */ 9 */
10 10
11static struct cpu_dev umc_cpu_dev __cpuinitdata = { 11static const struct cpu_dev __cpuinitconst umc_cpu_dev = {
12 .c_vendor = "UMC", 12 .c_vendor = "UMC",
13 .c_ident = { "UMC UMC UMC" }, 13 .c_ident = { "UMC UMC UMC" },
14 .c_models = { 14 .c_models = {
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c689d19e35ab..ff958248e61d 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -24,12 +24,10 @@
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/hpet.h> 25#include <asm/hpet.h>
26#include <linux/kdebug.h> 26#include <linux/kdebug.h>
27#include <asm/smp.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30 30
31#include <mach_ipi.h>
32
33 31
34#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 32#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
35 33
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 169a120587be..87b67e3a765a 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -729,7 +729,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
729 729
730 spin_unlock_irqrestore(&ds_lock, irq); 730 spin_unlock_irqrestore(&ds_lock, irq);
731 731
732 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); 732 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
733 ds_resume_pebs(tracer); 733 ds_resume_pebs(tracer);
734 734
735 return tracer; 735 return tracer;
@@ -1029,5 +1029,4 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
1029 1029
1030void ds_exit_thread(struct task_struct *tsk) 1030void ds_exit_thread(struct task_struct *tsk)
1031{ 1031{
1032 WARN_ON(tsk->thread.ds_ctx);
1033} 1032}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6b1f6f6f8661..95ea5fa7d444 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -10,10 +10,12 @@
10#include <linux/kdebug.h> 10#include <linux/kdebug.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/ptrace.h> 12#include <linux/ptrace.h>
13#include <linux/ftrace.h>
13#include <linux/kexec.h> 14#include <linux/kexec.h>
14#include <linux/bug.h> 15#include <linux/bug.h>
15#include <linux/nmi.h> 16#include <linux/nmi.h>
16#include <linux/sysfs.h> 17#include <linux/sysfs.h>
18#include <linux/ftrace.h>
17 19
18#include <asm/stacktrace.h> 20#include <asm/stacktrace.h>
19 21
@@ -99,7 +101,7 @@ print_context_stack(struct thread_info *tinfo,
99 frame = frame->next_frame; 101 frame = frame->next_frame;
100 bp = (unsigned long) frame; 102 bp = (unsigned long) frame;
101 } else { 103 } else {
102 ops->address(data, addr, bp == 0); 104 ops->address(data, addr, 0);
103 } 105 }
104 print_ftrace_graph_addr(addr, data, ops, tinfo, graph); 106 print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
105 } 107 }
@@ -195,6 +197,11 @@ unsigned __kprobes long oops_begin(void)
195 int cpu; 197 int cpu;
196 unsigned long flags; 198 unsigned long flags;
197 199
200 /* notify the hw-branch tracer so it may disable tracing and
201 add the last trace to the trace buffer -
202 the earlier this happens, the more useful the trace. */
203 trace_hw_branch_oops();
204
198 oops_enter(); 205 oops_enter();
199 206
200 /* racy, but better than risking deadlock. */ 207 /* racy, but better than risking deadlock. */
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index c302d0707048..d35db5993fd6 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -106,7 +106,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
106 const struct stacktrace_ops *ops, void *data) 106 const struct stacktrace_ops *ops, void *data)
107{ 107{
108 const unsigned cpu = get_cpu(); 108 const unsigned cpu = get_cpu();
109 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; 109 unsigned long *irq_stack_end =
110 (unsigned long *)per_cpu(irq_stack_ptr, cpu);
110 unsigned used = 0; 111 unsigned used = 0;
111 struct thread_info *tinfo; 112 struct thread_info *tinfo;
112 int graph = 0; 113 int graph = 0;
@@ -160,23 +161,23 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
160 stack = (unsigned long *) estack_end[-2]; 161 stack = (unsigned long *) estack_end[-2];
161 continue; 162 continue;
162 } 163 }
163 if (irqstack_end) { 164 if (irq_stack_end) {
164 unsigned long *irqstack; 165 unsigned long *irq_stack;
165 irqstack = irqstack_end - 166 irq_stack = irq_stack_end -
166 (IRQSTACKSIZE - 64) / sizeof(*irqstack); 167 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
167 168
168 if (stack >= irqstack && stack < irqstack_end) { 169 if (stack >= irq_stack && stack < irq_stack_end) {
169 if (ops->stack(data, "IRQ") < 0) 170 if (ops->stack(data, "IRQ") < 0)
170 break; 171 break;
171 bp = print_context_stack(tinfo, stack, bp, 172 bp = print_context_stack(tinfo, stack, bp,
172 ops, data, irqstack_end, &graph); 173 ops, data, irq_stack_end, &graph);
173 /* 174 /*
174 * We link to the next stack (which would be 175 * We link to the next stack (which would be
175 * the process stack normally) the last 176 * the process stack normally) the last
176 * pointer (index -1 to end) in the IRQ stack: 177 * pointer (index -1 to end) in the IRQ stack:
177 */ 178 */
178 stack = (unsigned long *) (irqstack_end[-1]); 179 stack = (unsigned long *) (irq_stack_end[-1]);
179 irqstack_end = NULL; 180 irq_stack_end = NULL;
180 ops->stack(data, "EOI"); 181 ops->stack(data, "EOI");
181 continue; 182 continue;
182 } 183 }
@@ -199,10 +200,10 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
199 unsigned long *stack; 200 unsigned long *stack;
200 int i; 201 int i;
201 const int cpu = smp_processor_id(); 202 const int cpu = smp_processor_id();
202 unsigned long *irqstack_end = 203 unsigned long *irq_stack_end =
203 (unsigned long *) (cpu_pda(cpu)->irqstackptr); 204 (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
204 unsigned long *irqstack = 205 unsigned long *irq_stack =
205 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); 206 (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
206 207
207 /* 208 /*
208 * debugging aid: "show_stack(NULL, NULL);" prints the 209 * debugging aid: "show_stack(NULL, NULL);" prints the
@@ -218,9 +219,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
218 219
219 stack = sp; 220 stack = sp;
220 for (i = 0; i < kstack_depth_to_print; i++) { 221 for (i = 0; i < kstack_depth_to_print; i++) {
221 if (stack >= irqstack && stack <= irqstack_end) { 222 if (stack >= irq_stack && stack <= irq_stack_end) {
222 if (stack == irqstack_end) { 223 if (stack == irq_stack_end) {
223 stack = (unsigned long *) (irqstack_end[-1]); 224 stack = (unsigned long *) (irq_stack_end[-1]);
224 printk(" <EOI> "); 225 printk(" <EOI> ");
225 } 226 }
226 } else { 227 } else {
@@ -241,7 +242,7 @@ void show_registers(struct pt_regs *regs)
241 int i; 242 int i;
242 unsigned long sp; 243 unsigned long sp;
243 const int cpu = smp_processor_id(); 244 const int cpu = smp_processor_id();
244 struct task_struct *cur = cpu_pda(cpu)->pcurrent; 245 struct task_struct *cur = current;
245 246
246 sp = regs->sp; 247 sp = regs->sp;
247 printk("CPU %d ", cpu); 248 printk("CPU %d ", cpu);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e85826829cf2..ef2c3563357d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -110,19 +110,50 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
110/* 110/*
111 * Add a memory region to the kernel e820 map. 111 * Add a memory region to the kernel e820 map.
112 */ 112 */
113void __init e820_add_region(u64 start, u64 size, int type) 113static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
114 int type)
114{ 115{
115 int x = e820.nr_map; 116 int x = e820x->nr_map;
116 117
117 if (x == ARRAY_SIZE(e820.map)) { 118 if (x == ARRAY_SIZE(e820x->map)) {
118 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); 119 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
119 return; 120 return;
120 } 121 }
121 122
122 e820.map[x].addr = start; 123 e820x->map[x].addr = start;
123 e820.map[x].size = size; 124 e820x->map[x].size = size;
124 e820.map[x].type = type; 125 e820x->map[x].type = type;
125 e820.nr_map++; 126 e820x->nr_map++;
127}
128
129void __init e820_add_region(u64 start, u64 size, int type)
130{
131 __e820_add_region(&e820, start, size, type);
132}
133
134static void __init e820_print_type(u32 type)
135{
136 switch (type) {
137 case E820_RAM:
138 case E820_RESERVED_KERN:
139 printk(KERN_CONT "(usable)");
140 break;
141 case E820_RESERVED:
142 printk(KERN_CONT "(reserved)");
143 break;
144 case E820_ACPI:
145 printk(KERN_CONT "(ACPI data)");
146 break;
147 case E820_NVS:
148 printk(KERN_CONT "(ACPI NVS)");
149 break;
150 case E820_UNUSABLE:
151 printk(KERN_CONT "(unusable)");
152 break;
153 default:
154 printk(KERN_CONT "type %u", type);
155 break;
156 }
126} 157}
127 158
128void __init e820_print_map(char *who) 159void __init e820_print_map(char *who)
@@ -134,27 +165,8 @@ void __init e820_print_map(char *who)
134 (unsigned long long) e820.map[i].addr, 165 (unsigned long long) e820.map[i].addr,
135 (unsigned long long) 166 (unsigned long long)
136 (e820.map[i].addr + e820.map[i].size)); 167 (e820.map[i].addr + e820.map[i].size));
137 switch (e820.map[i].type) { 168 e820_print_type(e820.map[i].type);
138 case E820_RAM: 169 printk(KERN_CONT "\n");
139 case E820_RESERVED_KERN:
140 printk(KERN_CONT "(usable)\n");
141 break;
142 case E820_RESERVED:
143 printk(KERN_CONT "(reserved)\n");
144 break;
145 case E820_ACPI:
146 printk(KERN_CONT "(ACPI data)\n");
147 break;
148 case E820_NVS:
149 printk(KERN_CONT "(ACPI NVS)\n");
150 break;
151 case E820_UNUSABLE:
152 printk("(unusable)\n");
153 break;
154 default:
155 printk(KERN_CONT "type %u\n", e820.map[i].type);
156 break;
157 }
158 } 170 }
159} 171}
160 172
@@ -221,7 +233,7 @@ void __init e820_print_map(char *who)
221 */ 233 */
222 234
223int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, 235int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
224 int *pnr_map) 236 u32 *pnr_map)
225{ 237{
226 struct change_member { 238 struct change_member {
227 struct e820entry *pbios; /* pointer to original bios entry */ 239 struct e820entry *pbios; /* pointer to original bios entry */
@@ -417,11 +429,12 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
417 return __append_e820_map(biosmap, nr_map); 429 return __append_e820_map(biosmap, nr_map);
418} 430}
419 431
420static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, 432static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
421 u64 size, unsigned old_type, 433 u64 size, unsigned old_type,
422 unsigned new_type) 434 unsigned new_type)
423{ 435{
424 int i; 436 u64 end;
437 unsigned int i;
425 u64 real_updated_size = 0; 438 u64 real_updated_size = 0;
426 439
427 BUG_ON(old_type == new_type); 440 BUG_ON(old_type == new_type);
@@ -429,27 +442,55 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
429 if (size > (ULLONG_MAX - start)) 442 if (size > (ULLONG_MAX - start))
430 size = ULLONG_MAX - start; 443 size = ULLONG_MAX - start;
431 444
432 for (i = 0; i < e820.nr_map; i++) { 445 end = start + size;
446 printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
447 (unsigned long long) start,
448 (unsigned long long) end);
449 e820_print_type(old_type);
450 printk(KERN_CONT " ==> ");
451 e820_print_type(new_type);
452 printk(KERN_CONT "\n");
453
454 for (i = 0; i < e820x->nr_map; i++) {
433 struct e820entry *ei = &e820x->map[i]; 455 struct e820entry *ei = &e820x->map[i];
434 u64 final_start, final_end; 456 u64 final_start, final_end;
457 u64 ei_end;
458
435 if (ei->type != old_type) 459 if (ei->type != old_type)
436 continue; 460 continue;
437 /* totally covered? */ 461
438 if (ei->addr >= start && 462 ei_end = ei->addr + ei->size;
439 (ei->addr + ei->size) <= (start + size)) { 463 /* totally covered by new range? */
464 if (ei->addr >= start && ei_end <= end) {
440 ei->type = new_type; 465 ei->type = new_type;
441 real_updated_size += ei->size; 466 real_updated_size += ei->size;
442 continue; 467 continue;
443 } 468 }
469
470 /* new range is totally covered? */
471 if (ei->addr < start && ei_end > end) {
472 __e820_add_region(e820x, start, size, new_type);
473 __e820_add_region(e820x, end, ei_end - end, ei->type);
474 ei->size = start - ei->addr;
475 real_updated_size += size;
476 continue;
477 }
478
444 /* partially covered */ 479 /* partially covered */
445 final_start = max(start, ei->addr); 480 final_start = max(start, ei->addr);
446 final_end = min(start + size, ei->addr + ei->size); 481 final_end = min(end, ei_end);
447 if (final_start >= final_end) 482 if (final_start >= final_end)
448 continue; 483 continue;
449 e820_add_region(final_start, final_end - final_start, 484
450 new_type); 485 __e820_add_region(e820x, final_start, final_end - final_start,
486 new_type);
487
451 real_updated_size += final_end - final_start; 488 real_updated_size += final_end - final_start;
452 489
490 /*
491 * left range could be head or tail, so need to update
492 * size at first.
493 */
453 ei->size -= final_end - final_start; 494 ei->size -= final_end - final_start;
454 if (ei->addr < final_start) 495 if (ei->addr < final_start)
455 continue; 496 continue;
@@ -461,13 +502,13 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
461u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, 502u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
462 unsigned new_type) 503 unsigned new_type)
463{ 504{
464 return e820_update_range_map(&e820, start, size, old_type, new_type); 505 return __e820_update_range(&e820, start, size, old_type, new_type);
465} 506}
466 507
467static u64 __init e820_update_range_saved(u64 start, u64 size, 508static u64 __init e820_update_range_saved(u64 start, u64 size,
468 unsigned old_type, unsigned new_type) 509 unsigned old_type, unsigned new_type)
469{ 510{
470 return e820_update_range_map(&e820_saved, start, size, old_type, 511 return __e820_update_range(&e820_saved, start, size, old_type,
471 new_type); 512 new_type);
472} 513}
473 514
@@ -511,7 +552,7 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
511 552
512void __init update_e820(void) 553void __init update_e820(void)
513{ 554{
514 int nr_map; 555 u32 nr_map;
515 556
516 nr_map = e820.nr_map; 557 nr_map = e820.nr_map;
517 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) 558 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
@@ -522,7 +563,7 @@ void __init update_e820(void)
522} 563}
523static void __init update_e820_saved(void) 564static void __init update_e820_saved(void)
524{ 565{
525 int nr_map; 566 u32 nr_map;
526 567
527 nr_map = e820_saved.nr_map; 568 nr_map = e820_saved.nr_map;
528 if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map)) 569 if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
@@ -858,6 +899,9 @@ void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
858 */ 899 */
859void __init reserve_early(u64 start, u64 end, char *name) 900void __init reserve_early(u64 start, u64 end, char *name)
860{ 901{
902 if (start >= end)
903 return;
904
861 drop_overlaps_that_are_ok(start, end); 905 drop_overlaps_that_are_ok(start, end);
862 __reserve_early(start, end, name, 0); 906 __reserve_early(start, end, name, 0);
863} 907}
@@ -1017,8 +1061,8 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
1017 continue; 1061 continue;
1018 return addr; 1062 return addr;
1019 } 1063 }
1020 return -1UL;
1021 1064
1065 return -1ULL;
1022} 1066}
1023 1067
1024/* 1068/*
@@ -1031,13 +1075,22 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
1031 u64 start; 1075 u64 start;
1032 1076
1033 start = startt; 1077 start = startt;
1034 while (size < sizet) 1078 while (size < sizet && (start + 1))
1035 start = find_e820_area_size(start, &size, align); 1079 start = find_e820_area_size(start, &size, align);
1036 1080
1037 if (size < sizet) 1081 if (size < sizet)
1038 return 0; 1082 return 0;
1039 1083
1084#ifdef CONFIG_X86_32
1085 if (start >= MAXMEM)
1086 return 0;
1087 if (start + size > MAXMEM)
1088 size = MAXMEM - start;
1089#endif
1090
1040 addr = round_down(start + size - sizet, align); 1091 addr = round_down(start + size - sizet, align);
1092 if (addr < start)
1093 return 0;
1041 e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); 1094 e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
1042 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); 1095 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
1043 printk(KERN_INFO "update e820 for early_reserve_e820\n"); 1096 printk(KERN_INFO "update e820 for early_reserve_e820\n");
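One subtlety worth spelling out: the new "(start + 1)" loop condition above guards against find_e820_area_size() returning its -1ULL failure sentinel (start + 1 then wraps to 0 and stops the loop), and the CONFIG_X86_32 clamp keeps the reservation below MAXMEM, the highest physical address the 32-bit kernel can use.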
@@ -1250,7 +1303,7 @@ early_param("memmap", parse_memmap_opt);
1250void __init finish_e820_parsing(void) 1303void __init finish_e820_parsing(void)
1251{ 1304{
1252 if (userdef) { 1305 if (userdef) {
1253 int nr = e820.nr_map; 1306 u32 nr = e820.nr_map;
1254 1307
1255 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) 1308 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
1256 early_panic("Invalid user supplied memory map"); 1309 early_panic("Invalid user supplied memory map");
@@ -1333,7 +1386,7 @@ void __init e820_reserve_resources_late(void)
1333char *__init default_machine_specific_memory_setup(void) 1386char *__init default_machine_specific_memory_setup(void)
1334{ 1387{
1335 char *who = "BIOS-e820"; 1388 char *who = "BIOS-e820";
1336 int new_nr; 1389 u32 new_nr;
1337 /* 1390 /*
1338 * Try to copy the BIOS-supplied E820-map. 1391 * Try to copy the BIOS-supplied E820-map.
1339 * 1392 *
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 504ad198e4ad..335f049d110f 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -13,8 +13,8 @@
13#include <asm/setup.h> 13#include <asm/setup.h>
14#include <xen/hvc-console.h> 14#include <xen/hvc-console.h>
15#include <asm/pci-direct.h> 15#include <asm/pci-direct.h>
16#include <asm/pgtable.h>
17#include <asm/fixmap.h> 16#include <asm/fixmap.h>
17#include <asm/pgtable.h>
18#include <linux/usb/ehci_def.h> 18#include <linux/usb/ehci_def.h>
19 19
20/* Simple VGA output */ 20/* Simple VGA output */
@@ -250,7 +250,7 @@ static int dbgp_wait_until_complete(void)
250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); 250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
251} 251}
252 252
253static void dbgp_mdelay(int ms) 253static void __init dbgp_mdelay(int ms)
254{ 254{
255 int i; 255 int i;
256 256
@@ -311,7 +311,7 @@ static void dbgp_set_data(const void *buf, int size)
311 writel(hi, &ehci_debug->data47); 311 writel(hi, &ehci_debug->data47);
312} 312}
313 313
314static void dbgp_get_data(void *buf, int size) 314static void __init dbgp_get_data(void *buf, int size)
315{ 315{
316 unsigned char *bytes = buf; 316 unsigned char *bytes = buf;
317 u32 lo, hi; 317 u32 lo, hi;
@@ -355,7 +355,7 @@ static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
355 return ret; 355 return ret;
356} 356}
357 357
358static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, 358static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
359 int size) 359 int size)
360{ 360{
361 u32 pids, addr, ctrl; 361 u32 pids, addr, ctrl;
@@ -386,8 +386,8 @@ static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
386 return ret; 386 return ret;
387} 387}
388 388
389static int dbgp_control_msg(unsigned devnum, int requesttype, int request, 389static int __init dbgp_control_msg(unsigned devnum, int requesttype,
390 int value, int index, void *data, int size) 390 int request, int value, int index, void *data, int size)
391{ 391{
392 u32 pids, addr, ctrl; 392 u32 pids, addr, ctrl;
393 struct usb_ctrlrequest req; 393 struct usb_ctrlrequest req;
@@ -489,7 +489,7 @@ static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
489 return 0; 489 return 0;
490} 490}
491 491
492static int ehci_reset_port(int port) 492static int __init ehci_reset_port(int port)
493{ 493{
494 u32 portsc; 494 u32 portsc;
495 u32 delay_time, delay; 495 u32 delay_time, delay;
@@ -532,7 +532,7 @@ static int ehci_reset_port(int port)
532 return -EBUSY; 532 return -EBUSY;
533} 533}
534 534
535static int ehci_wait_for_port(int port) 535static int __init ehci_wait_for_port(int port)
536{ 536{
537 u32 status; 537 u32 status;
538 int ret, reps; 538 int ret, reps;
@@ -557,13 +557,13 @@ static inline void dbgp_printk(const char *fmt, ...) { }
557 557
558typedef void (*set_debug_port_t)(int port); 558typedef void (*set_debug_port_t)(int port);
559 559
560static void default_set_debug_port(int port) 560static void __init default_set_debug_port(int port)
561{ 561{
562} 562}
563 563
564static set_debug_port_t set_debug_port = default_set_debug_port; 564static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
565 565
566static void nvidia_set_debug_port(int port) 566static void __init nvidia_set_debug_port(int port)
567{ 567{
568 u32 dword; 568 u32 dword;
569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 1119d247fe11..1736acc4d7aa 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -366,10 +366,12 @@ void __init efi_init(void)
366 SMBIOS_TABLE_GUID)) { 366 SMBIOS_TABLE_GUID)) {
367 efi.smbios = config_tables[i].table; 367 efi.smbios = config_tables[i].table;
368 printk(" SMBIOS=0x%lx ", config_tables[i].table); 368 printk(" SMBIOS=0x%lx ", config_tables[i].table);
369#ifdef CONFIG_X86_UV
369 } else if (!efi_guidcmp(config_tables[i].guid, 370 } else if (!efi_guidcmp(config_tables[i].guid,
370 UV_SYSTEM_TABLE_GUID)) { 371 UV_SYSTEM_TABLE_GUID)) {
371 efi.uv_systab = config_tables[i].table; 372 efi.uv_systab = config_tables[i].table;
372 printk(" UVsystab=0x%lx ", config_tables[i].table); 373 printk(" UVsystab=0x%lx ", config_tables[i].table);
374#endif
373 } else if (!efi_guidcmp(config_tables[i].guid, 375 } else if (!efi_guidcmp(config_tables[i].guid,
374 HCDP_TABLE_GUID)) { 376 HCDP_TABLE_GUID)) {
375 efi.hcdp = config_tables[i].table; 377 efi.hcdp = config_tables[i].table;
@@ -467,7 +469,7 @@ void __init efi_enter_virtual_mode(void)
467 efi_memory_desc_t *md; 469 efi_memory_desc_t *md;
468 efi_status_t status; 470 efi_status_t status;
469 unsigned long size; 471 unsigned long size;
470 u64 end, systab, addr, npages; 472 u64 end, systab, addr, npages, end_pfn;
471 void *p, *va; 473 void *p, *va;
472 474
473 efi.systab = NULL; 475 efi.systab = NULL;
@@ -479,7 +481,10 @@ void __init efi_enter_virtual_mode(void)
479 size = md->num_pages << EFI_PAGE_SHIFT; 481 size = md->num_pages << EFI_PAGE_SHIFT;
480 end = md->phys_addr + size; 482 end = md->phys_addr + size;
481 483
482 if (PFN_UP(end) <= max_low_pfn_mapped) 484 end_pfn = PFN_UP(end);
485 if (end_pfn <= max_low_pfn_mapped
486 || (end_pfn > (1UL << (32 - PAGE_SHIFT))
487 && end_pfn <= max_pfn_mapped))
483 va = __va(md->phys_addr); 488 va = __va(md->phys_addr);
484 else 489 else
485 va = efi_ioremap(md->phys_addr, size); 490 va = efi_ioremap(md->phys_addr, size);
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 652c5287215f..22c3b7828c50 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -36,6 +36,7 @@
36#include <asm/proto.h> 36#include <asm/proto.h>
37#include <asm/efi.h> 37#include <asm/efi.h>
38#include <asm/cacheflush.h> 38#include <asm/cacheflush.h>
39#include <asm/fixmap.h>
39 40
40static pgd_t save_pgd __initdata; 41static pgd_t save_pgd __initdata;
41static unsigned long efi_flags __initdata; 42static unsigned long efi_flags __initdata;
@@ -99,24 +100,11 @@ void __init efi_call_phys_epilog(void)
99 100
100void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) 101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
101{ 102{
102 static unsigned pages_mapped __initdata; 103 unsigned long last_map_pfn;
103 unsigned i, pages;
104 unsigned long offset;
105 104
106 pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr); 105 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
107 offset = phys_addr & ~PAGE_MASK; 106 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
108 phys_addr &= PAGE_MASK;
109
110 if (pages_mapped + pages > MAX_EFI_IO_PAGES)
111 return NULL; 107 return NULL;
112 108
113 for (i = 0; i < pages; i++) { 109 return (void __iomem *)__va(phys_addr);
114 __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped,
115 phys_addr, PAGE_KERNEL);
116 phys_addr += PAGE_SIZE;
117 pages_mapped++;
118 }
119
120 return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \
121 (pages_mapped - pages)) + offset;
122} 110}
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S
index ef00bb77d7e4..fbe66e626c09 100644
--- a/arch/x86/kernel/efi_stub_32.S
+++ b/arch/x86/kernel/efi_stub_32.S
@@ -6,7 +6,7 @@
6 */ 6 */
7 7
8#include <linux/linkage.h> 8#include <linux/linkage.h>
9#include <asm/page.h> 9#include <asm/page_types.h>
10 10
11/* 11/*
12 * efi_call_phys(void *, ...) is a function with variable parameters. 12 * efi_call_phys(void *, ...) is a function with variable parameters.
@@ -113,6 +113,7 @@ ENTRY(efi_call_phys)
113 movl (%edx), %ecx 113 movl (%edx), %ecx
114 pushl %ecx 114 pushl %ecx
115 ret 115 ret
116ENDPROC(efi_call_phys)
116.previous 117.previous
117 118
118.data 119.data
diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S
index 99b47d48c9f4..4c07ccab8146 100644
--- a/arch/x86/kernel/efi_stub_64.S
+++ b/arch/x86/kernel/efi_stub_64.S
@@ -41,6 +41,7 @@ ENTRY(efi_call0)
41 addq $32, %rsp 41 addq $32, %rsp
42 RESTORE_XMM 42 RESTORE_XMM
43 ret 43 ret
44ENDPROC(efi_call0)
44 45
45ENTRY(efi_call1) 46ENTRY(efi_call1)
46 SAVE_XMM 47 SAVE_XMM
@@ -50,6 +51,7 @@ ENTRY(efi_call1)
50 addq $32, %rsp 51 addq $32, %rsp
51 RESTORE_XMM 52 RESTORE_XMM
52 ret 53 ret
54ENDPROC(efi_call1)
53 55
54ENTRY(efi_call2) 56ENTRY(efi_call2)
55 SAVE_XMM 57 SAVE_XMM
@@ -59,6 +61,7 @@ ENTRY(efi_call2)
59 addq $32, %rsp 61 addq $32, %rsp
60 RESTORE_XMM 62 RESTORE_XMM
61 ret 63 ret
64ENDPROC(efi_call2)
62 65
63ENTRY(efi_call3) 66ENTRY(efi_call3)
64 SAVE_XMM 67 SAVE_XMM
@@ -69,6 +72,7 @@ ENTRY(efi_call3)
69 addq $32, %rsp 72 addq $32, %rsp
70 RESTORE_XMM 73 RESTORE_XMM
71 ret 74 ret
75ENDPROC(efi_call3)
72 76
73ENTRY(efi_call4) 77ENTRY(efi_call4)
74 SAVE_XMM 78 SAVE_XMM
@@ -80,6 +84,7 @@ ENTRY(efi_call4)
80 addq $32, %rsp 84 addq $32, %rsp
81 RESTORE_XMM 85 RESTORE_XMM
82 ret 86 ret
87ENDPROC(efi_call4)
83 88
84ENTRY(efi_call5) 89ENTRY(efi_call5)
85 SAVE_XMM 90 SAVE_XMM
@@ -92,6 +97,7 @@ ENTRY(efi_call5)
92 addq $48, %rsp 97 addq $48, %rsp
93 RESTORE_XMM 98 RESTORE_XMM
94 ret 99 ret
100ENDPROC(efi_call5)
95 101
96ENTRY(efi_call6) 102ENTRY(efi_call6)
97 SAVE_XMM 103 SAVE_XMM
@@ -107,3 +113,4 @@ ENTRY(efi_call6)
107 addq $48, %rsp 113 addq $48, %rsp
108 RESTORE_XMM 114 RESTORE_XMM
109 ret 115 ret
116ENDPROC(efi_call6)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 46469029e9d3..c929add475c9 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -30,12 +30,13 @@
30 * 1C(%esp) - %ds 30 * 1C(%esp) - %ds
31 * 20(%esp) - %es 31 * 20(%esp) - %es
32 * 24(%esp) - %fs 32 * 24(%esp) - %fs
33 * 28(%esp) - orig_eax 33 * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
34 * 2C(%esp) - %eip 34 * 2C(%esp) - orig_eax
35 * 30(%esp) - %cs 35 * 30(%esp) - %eip
36 * 34(%esp) - %eflags 36 * 34(%esp) - %cs
37 * 38(%esp) - %oldesp 37 * 38(%esp) - %eflags
38 * 3C(%esp) - %oldss 38 * 3C(%esp) - %oldesp
39 * 40(%esp) - %oldss
39 * 40 *
40 * "current" is in register %ebx during any slow entries. 41 * "current" is in register %ebx during any slow entries.
41 */ 42 */
@@ -46,7 +47,7 @@
46#include <asm/errno.h> 47#include <asm/errno.h>
47#include <asm/segment.h> 48#include <asm/segment.h>
48#include <asm/smp.h> 49#include <asm/smp.h>
49#include <asm/page.h> 50#include <asm/page_types.h>
50#include <asm/desc.h> 51#include <asm/desc.h>
51#include <asm/percpu.h> 52#include <asm/percpu.h>
52#include <asm/dwarf2.h> 53#include <asm/dwarf2.h>
@@ -101,121 +102,221 @@
101#define resume_userspace_sig resume_userspace 102#define resume_userspace_sig resume_userspace
102#endif 103#endif
103 104
104#define SAVE_ALL \ 105/*
105 cld; \ 106 * User gs save/restore
106 pushl %fs; \ 107 *
107 CFI_ADJUST_CFA_OFFSET 4;\ 108 * %gs is used for userland TLS and kernel only uses it for stack
108 /*CFI_REL_OFFSET fs, 0;*/\ 109 * canary which is required to be at %gs:20 by gcc. Read the comment
109 pushl %es; \ 110 * at the top of stackprotector.h for more info.
110 CFI_ADJUST_CFA_OFFSET 4;\ 111 *
111 /*CFI_REL_OFFSET es, 0;*/\ 112 * Local labels 98 and 99 are used.
112 pushl %ds; \ 113 */
113 CFI_ADJUST_CFA_OFFSET 4;\ 114#ifdef CONFIG_X86_32_LAZY_GS
114 /*CFI_REL_OFFSET ds, 0;*/\ 115
115 pushl %eax; \ 116 /* unfortunately push/pop can't be no-op */
116 CFI_ADJUST_CFA_OFFSET 4;\ 117.macro PUSH_GS
117 CFI_REL_OFFSET eax, 0;\ 118 pushl $0
118 pushl %ebp; \ 119 CFI_ADJUST_CFA_OFFSET 4
119 CFI_ADJUST_CFA_OFFSET 4;\ 120.endm
120 CFI_REL_OFFSET ebp, 0;\ 121.macro POP_GS pop=0
121 pushl %edi; \ 122 addl $(4 + \pop), %esp
122 CFI_ADJUST_CFA_OFFSET 4;\ 123 CFI_ADJUST_CFA_OFFSET -(4 + \pop)
123 CFI_REL_OFFSET edi, 0;\ 124.endm
124 pushl %esi; \ 125.macro POP_GS_EX
125 CFI_ADJUST_CFA_OFFSET 4;\ 126.endm
126 CFI_REL_OFFSET esi, 0;\ 127
127 pushl %edx; \ 128 /* all the rest are no-op */
128 CFI_ADJUST_CFA_OFFSET 4;\ 129.macro PTGS_TO_GS
129 CFI_REL_OFFSET edx, 0;\ 130.endm
130 pushl %ecx; \ 131.macro PTGS_TO_GS_EX
131 CFI_ADJUST_CFA_OFFSET 4;\ 132.endm
132 CFI_REL_OFFSET ecx, 0;\ 133.macro GS_TO_REG reg
133 pushl %ebx; \ 134.endm
134 CFI_ADJUST_CFA_OFFSET 4;\ 135.macro REG_TO_PTGS reg
135 CFI_REL_OFFSET ebx, 0;\ 136.endm
136 movl $(__USER_DS), %edx; \ 137.macro SET_KERNEL_GS reg
137 movl %edx, %ds; \ 138.endm
138 movl %edx, %es; \ 139
139 movl $(__KERNEL_PERCPU), %edx; \ 140#else /* CONFIG_X86_32_LAZY_GS */
141
142.macro PUSH_GS
143 pushl %gs
144 CFI_ADJUST_CFA_OFFSET 4
145 /*CFI_REL_OFFSET gs, 0*/
146.endm
147
148.macro POP_GS pop=0
14998: popl %gs
150 CFI_ADJUST_CFA_OFFSET -4
151 /*CFI_RESTORE gs*/
152 .if \pop <> 0
153 add $\pop, %esp
154 CFI_ADJUST_CFA_OFFSET -\pop
155 .endif
156.endm
157.macro POP_GS_EX
158.pushsection .fixup, "ax"
15999: movl $0, (%esp)
160 jmp 98b
161.section __ex_table, "a"
162 .align 4
163 .long 98b, 99b
164.popsection
165.endm
166
167.macro PTGS_TO_GS
16898: mov PT_GS(%esp), %gs
169.endm
170.macro PTGS_TO_GS_EX
171.pushsection .fixup, "ax"
17299: movl $0, PT_GS(%esp)
173 jmp 98b
174.section __ex_table, "a"
175 .align 4
176 .long 98b, 99b
177.popsection
178.endm
179
180.macro GS_TO_REG reg
181 movl %gs, \reg
182 /*CFI_REGISTER gs, \reg*/
183.endm
184.macro REG_TO_PTGS reg
185 movl \reg, PT_GS(%esp)
186 /*CFI_REL_OFFSET gs, PT_GS*/
187.endm
188.macro SET_KERNEL_GS reg
189 movl $(__KERNEL_STACK_CANARY), \reg
190 movl \reg, %gs
191.endm
192
193#endif /* CONFIG_X86_32_LAZY_GS */
194
195.macro SAVE_ALL
196 cld
197 PUSH_GS
198 pushl %fs
199 CFI_ADJUST_CFA_OFFSET 4
200 /*CFI_REL_OFFSET fs, 0;*/
201 pushl %es
202 CFI_ADJUST_CFA_OFFSET 4
203 /*CFI_REL_OFFSET es, 0;*/
204 pushl %ds
205 CFI_ADJUST_CFA_OFFSET 4
206 /*CFI_REL_OFFSET ds, 0;*/
207 pushl %eax
208 CFI_ADJUST_CFA_OFFSET 4
209 CFI_REL_OFFSET eax, 0
210 pushl %ebp
211 CFI_ADJUST_CFA_OFFSET 4
212 CFI_REL_OFFSET ebp, 0
213 pushl %edi
214 CFI_ADJUST_CFA_OFFSET 4
215 CFI_REL_OFFSET edi, 0
216 pushl %esi
217 CFI_ADJUST_CFA_OFFSET 4
218 CFI_REL_OFFSET esi, 0
219 pushl %edx
220 CFI_ADJUST_CFA_OFFSET 4
221 CFI_REL_OFFSET edx, 0
222 pushl %ecx
223 CFI_ADJUST_CFA_OFFSET 4
224 CFI_REL_OFFSET ecx, 0
225 pushl %ebx
226 CFI_ADJUST_CFA_OFFSET 4
227 CFI_REL_OFFSET ebx, 0
228 movl $(__USER_DS), %edx
229 movl %edx, %ds
230 movl %edx, %es
231 movl $(__KERNEL_PERCPU), %edx
140 movl %edx, %fs 232 movl %edx, %fs
233 SET_KERNEL_GS %edx
234.endm
141 235
142#define RESTORE_INT_REGS \ 236.macro RESTORE_INT_REGS
143 popl %ebx; \ 237 popl %ebx
144 CFI_ADJUST_CFA_OFFSET -4;\ 238 CFI_ADJUST_CFA_OFFSET -4
145 CFI_RESTORE ebx;\ 239 CFI_RESTORE ebx
146 popl %ecx; \ 240 popl %ecx
147 CFI_ADJUST_CFA_OFFSET -4;\ 241 CFI_ADJUST_CFA_OFFSET -4
148 CFI_RESTORE ecx;\ 242 CFI_RESTORE ecx
149 popl %edx; \ 243 popl %edx
150 CFI_ADJUST_CFA_OFFSET -4;\ 244 CFI_ADJUST_CFA_OFFSET -4
151 CFI_RESTORE edx;\ 245 CFI_RESTORE edx
152 popl %esi; \ 246 popl %esi
153 CFI_ADJUST_CFA_OFFSET -4;\ 247 CFI_ADJUST_CFA_OFFSET -4
154 CFI_RESTORE esi;\ 248 CFI_RESTORE esi
155 popl %edi; \ 249 popl %edi
156 CFI_ADJUST_CFA_OFFSET -4;\ 250 CFI_ADJUST_CFA_OFFSET -4
157 CFI_RESTORE edi;\ 251 CFI_RESTORE edi
158 popl %ebp; \ 252 popl %ebp
159 CFI_ADJUST_CFA_OFFSET -4;\ 253 CFI_ADJUST_CFA_OFFSET -4
160 CFI_RESTORE ebp;\ 254 CFI_RESTORE ebp
161 popl %eax; \ 255 popl %eax
162 CFI_ADJUST_CFA_OFFSET -4;\ 256 CFI_ADJUST_CFA_OFFSET -4
163 CFI_RESTORE eax 257 CFI_RESTORE eax
258.endm
164 259
165#define RESTORE_REGS \ 260.macro RESTORE_REGS pop=0
166 RESTORE_INT_REGS; \ 261 RESTORE_INT_REGS
1671: popl %ds; \ 2621: popl %ds
168 CFI_ADJUST_CFA_OFFSET -4;\ 263 CFI_ADJUST_CFA_OFFSET -4
169 /*CFI_RESTORE ds;*/\ 264 /*CFI_RESTORE ds;*/
1702: popl %es; \ 2652: popl %es
171 CFI_ADJUST_CFA_OFFSET -4;\ 266 CFI_ADJUST_CFA_OFFSET -4
172 /*CFI_RESTORE es;*/\ 267 /*CFI_RESTORE es;*/
1733: popl %fs; \ 2683: popl %fs
174 CFI_ADJUST_CFA_OFFSET -4;\ 269 CFI_ADJUST_CFA_OFFSET -4
175 /*CFI_RESTORE fs;*/\ 270 /*CFI_RESTORE fs;*/
176.pushsection .fixup,"ax"; \ 271 POP_GS \pop
1774: movl $0,(%esp); \ 272.pushsection .fixup, "ax"
178 jmp 1b; \ 2734: movl $0, (%esp)
1795: movl $0,(%esp); \ 274 jmp 1b
180 jmp 2b; \ 2755: movl $0, (%esp)
1816: movl $0,(%esp); \ 276 jmp 2b
182 jmp 3b; \ 2776: movl $0, (%esp)
183.section __ex_table,"a";\ 278 jmp 3b
184 .align 4; \ 279.section __ex_table, "a"
185 .long 1b,4b; \ 280 .align 4
186 .long 2b,5b; \ 281 .long 1b, 4b
187 .long 3b,6b; \ 282 .long 2b, 5b
283 .long 3b, 6b
188.popsection 284.popsection
285 POP_GS_EX
286.endm
189 287
190#define RING0_INT_FRAME \ 288.macro RING0_INT_FRAME
191 CFI_STARTPROC simple;\ 289 CFI_STARTPROC simple
192 CFI_SIGNAL_FRAME;\ 290 CFI_SIGNAL_FRAME
193 CFI_DEF_CFA esp, 3*4;\ 291 CFI_DEF_CFA esp, 3*4
194 /*CFI_OFFSET cs, -2*4;*/\ 292 /*CFI_OFFSET cs, -2*4;*/
195 CFI_OFFSET eip, -3*4 293 CFI_OFFSET eip, -3*4
294.endm
196 295
197#define RING0_EC_FRAME \ 296.macro RING0_EC_FRAME
198 CFI_STARTPROC simple;\ 297 CFI_STARTPROC simple
199 CFI_SIGNAL_FRAME;\ 298 CFI_SIGNAL_FRAME
200 CFI_DEF_CFA esp, 4*4;\ 299 CFI_DEF_CFA esp, 4*4
201 /*CFI_OFFSET cs, -2*4;*/\ 300 /*CFI_OFFSET cs, -2*4;*/
202 CFI_OFFSET eip, -3*4 301 CFI_OFFSET eip, -3*4
302.endm
203 303
204#define RING0_PTREGS_FRAME \ 304.macro RING0_PTREGS_FRAME
205 CFI_STARTPROC simple;\ 305 CFI_STARTPROC simple
206 CFI_SIGNAL_FRAME;\ 306 CFI_SIGNAL_FRAME
207 CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\ 307 CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
208 /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\ 308 /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
209 CFI_OFFSET eip, PT_EIP-PT_OLDESP;\ 309 CFI_OFFSET eip, PT_EIP-PT_OLDESP
210 /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\ 310 /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
211 /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\ 311 /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
212 CFI_OFFSET eax, PT_EAX-PT_OLDESP;\ 312 CFI_OFFSET eax, PT_EAX-PT_OLDESP
213 CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\ 313 CFI_OFFSET ebp, PT_EBP-PT_OLDESP
214 CFI_OFFSET edi, PT_EDI-PT_OLDESP;\ 314 CFI_OFFSET edi, PT_EDI-PT_OLDESP
215 CFI_OFFSET esi, PT_ESI-PT_OLDESP;\ 315 CFI_OFFSET esi, PT_ESI-PT_OLDESP
216 CFI_OFFSET edx, PT_EDX-PT_OLDESP;\ 316 CFI_OFFSET edx, PT_EDX-PT_OLDESP
217 CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\ 317 CFI_OFFSET ecx, PT_ECX-PT_OLDESP
218 CFI_OFFSET ebx, PT_EBX-PT_OLDESP 318 CFI_OFFSET ebx, PT_EBX-PT_OLDESP
319.endm
219 320
220ENTRY(ret_from_fork) 321ENTRY(ret_from_fork)
221 CFI_STARTPROC 322 CFI_STARTPROC
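The user-gs comment above notes that gcc hardwires the 32-bit stack-protector canary at %gs:20. A hedged C illustration of that constraint (the helper name is invented):

/* Illustrative only: how 32-bit stack-protected code reaches the canary.
 * The offset 20 is fixed by the compiler, so the kernel must keep a valid
 * canary at %gs:20 whenever such code can run. */
static inline unsigned long read_stack_canary(void)
{
	unsigned long canary;

	asm("movl %%gs:20, %0" : "=r" (canary));
	return canary;
}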
@@ -341,8 +442,7 @@ sysenter_past_esp:
341 442
342 GET_THREAD_INFO(%ebp) 443 GET_THREAD_INFO(%ebp)
343 444
344 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 445 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
345 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
346 jnz sysenter_audit 446 jnz sysenter_audit
347sysenter_do_call: 447sysenter_do_call:
348 cmpl $(nr_syscalls), %eax 448 cmpl $(nr_syscalls), %eax
@@ -353,7 +453,7 @@ sysenter_do_call:
353 DISABLE_INTERRUPTS(CLBR_ANY) 453 DISABLE_INTERRUPTS(CLBR_ANY)
354 TRACE_IRQS_OFF 454 TRACE_IRQS_OFF
355 movl TI_flags(%ebp), %ecx 455 movl TI_flags(%ebp), %ecx
356 testw $_TIF_ALLWORK_MASK, %cx 456 testl $_TIF_ALLWORK_MASK, %ecx
357 jne sysexit_audit 457 jne sysexit_audit
358sysenter_exit: 458sysenter_exit:
359/* if something modifies registers it must also disable sysexit */ 459/* if something modifies registers it must also disable sysexit */
@@ -362,11 +462,12 @@ sysenter_exit:
362 xorl %ebp,%ebp 462 xorl %ebp,%ebp
363 TRACE_IRQS_ON 463 TRACE_IRQS_ON
3641: mov PT_FS(%esp), %fs 4641: mov PT_FS(%esp), %fs
465 PTGS_TO_GS
365 ENABLE_INTERRUPTS_SYSEXIT 466 ENABLE_INTERRUPTS_SYSEXIT
366 467
367#ifdef CONFIG_AUDITSYSCALL 468#ifdef CONFIG_AUDITSYSCALL
368sysenter_audit: 469sysenter_audit:
369 testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 470 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
370 jnz syscall_trace_entry 471 jnz syscall_trace_entry
371 addl $4,%esp 472 addl $4,%esp
372 CFI_ADJUST_CFA_OFFSET -4 473 CFI_ADJUST_CFA_OFFSET -4
@@ -383,7 +484,7 @@ sysenter_audit:
383 jmp sysenter_do_call 484 jmp sysenter_do_call
384 485
385sysexit_audit: 486sysexit_audit:
386 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx 487 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
387 jne syscall_exit_work 488 jne syscall_exit_work
388 TRACE_IRQS_ON 489 TRACE_IRQS_ON
389 ENABLE_INTERRUPTS(CLBR_ANY) 490 ENABLE_INTERRUPTS(CLBR_ANY)
@@ -396,7 +497,7 @@ sysexit_audit:
396 DISABLE_INTERRUPTS(CLBR_ANY) 497 DISABLE_INTERRUPTS(CLBR_ANY)
397 TRACE_IRQS_OFF 498 TRACE_IRQS_OFF
398 movl TI_flags(%ebp), %ecx 499 movl TI_flags(%ebp), %ecx
399 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx 500 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
400 jne syscall_exit_work 501 jne syscall_exit_work
401 movl PT_EAX(%esp),%eax /* reload syscall return value */ 502 movl PT_EAX(%esp),%eax /* reload syscall return value */
402 jmp sysenter_exit 503 jmp sysenter_exit
@@ -410,6 +511,7 @@ sysexit_audit:
410 .align 4 511 .align 4
411 .long 1b,2b 512 .long 1b,2b
412.popsection 513.popsection
514 PTGS_TO_GS_EX
413ENDPROC(ia32_sysenter_target) 515ENDPROC(ia32_sysenter_target)
414 516
415 # system call handler stub 517 # system call handler stub
@@ -420,8 +522,7 @@ ENTRY(system_call)
420 SAVE_ALL 522 SAVE_ALL
421 GET_THREAD_INFO(%ebp) 523 GET_THREAD_INFO(%ebp)
422 # system call tracing in operation / emulation 524 # system call tracing in operation / emulation
423 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 525 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
424 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
425 jnz syscall_trace_entry 526 jnz syscall_trace_entry
426 cmpl $(nr_syscalls), %eax 527 cmpl $(nr_syscalls), %eax
427 jae syscall_badsys 528 jae syscall_badsys
@@ -435,7 +536,7 @@ syscall_exit:
435 # between sampling and the iret 536 # between sampling and the iret
436 TRACE_IRQS_OFF 537 TRACE_IRQS_OFF
437 movl TI_flags(%ebp), %ecx 538 movl TI_flags(%ebp), %ecx
438 testw $_TIF_ALLWORK_MASK, %cx # current->work 539 testl $_TIF_ALLWORK_MASK, %ecx # current->work
439 jne syscall_exit_work 540 jne syscall_exit_work
440 541
441restore_all: 542restore_all:
@@ -452,8 +553,7 @@ restore_all:
452restore_nocheck: 553restore_nocheck:
453 TRACE_IRQS_IRET 554 TRACE_IRQS_IRET
454restore_nocheck_notrace: 555restore_nocheck_notrace:
455 RESTORE_REGS 556 RESTORE_REGS 4 # skip orig_eax/error_code
456 addl $4, %esp # skip orig_eax/error_code
457 CFI_ADJUST_CFA_OFFSET -4 557 CFI_ADJUST_CFA_OFFSET -4
458irq_return: 558irq_return:
459 INTERRUPT_RETURN 559 INTERRUPT_RETURN
@@ -571,7 +671,7 @@ END(syscall_trace_entry)
571 # perform syscall exit tracing 671 # perform syscall exit tracing
572 ALIGN 672 ALIGN
573syscall_exit_work: 673syscall_exit_work:
574 testb $_TIF_WORK_SYSCALL_EXIT, %cl 674 testl $_TIF_WORK_SYSCALL_EXIT, %ecx
575 jz work_pending 675 jz work_pending
576 TRACE_IRQS_ON 676 TRACE_IRQS_ON
577 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call 677 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
@@ -595,28 +695,50 @@ syscall_badsys:
595END(syscall_badsys) 695END(syscall_badsys)
596 CFI_ENDPROC 696 CFI_ENDPROC
597 697
598#define FIXUP_ESPFIX_STACK \ 698/*
599 /* since we are on a wrong stack, we cant make it a C code :( */ \ 699 * System calls that need a pt_regs pointer.
600 PER_CPU(gdt_page, %ebx); \ 700 */
601 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ 701#define PTREGSCALL(name) \
602 addl %esp, %eax; \ 702 ALIGN; \
603 pushl $__KERNEL_DS; \ 703ptregs_##name: \
604 CFI_ADJUST_CFA_OFFSET 4; \ 704 leal 4(%esp),%eax; \
605 pushl %eax; \ 705 jmp sys_##name;
606 CFI_ADJUST_CFA_OFFSET 4; \ 706
607 lss (%esp), %esp; \ 707PTREGSCALL(iopl)
608 CFI_ADJUST_CFA_OFFSET -8; 708PTREGSCALL(fork)
609#define UNWIND_ESPFIX_STACK \ 709PTREGSCALL(clone)
610 movl %ss, %eax; \ 710PTREGSCALL(vfork)
611 /* see if on espfix stack */ \ 711PTREGSCALL(execve)
612 cmpw $__ESPFIX_SS, %ax; \ 712PTREGSCALL(sigaltstack)
613 jne 27f; \ 713PTREGSCALL(sigreturn)
614 movl $__KERNEL_DS, %eax; \ 714PTREGSCALL(rt_sigreturn)
615 movl %eax, %ds; \ 715PTREGSCALL(vm86)
616 movl %eax, %es; \ 716PTREGSCALL(vm86old)
617 /* switch to normal stack */ \ 717
618 FIXUP_ESPFIX_STACK; \ 718.macro FIXUP_ESPFIX_STACK
61927:; 719 /* since we are on the wrong stack, we can't make this C code :( */
720 PER_CPU(gdt_page, %ebx)
721 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
722 addl %esp, %eax
723 pushl $__KERNEL_DS
724 CFI_ADJUST_CFA_OFFSET 4
725 pushl %eax
726 CFI_ADJUST_CFA_OFFSET 4
727 lss (%esp), %esp
728 CFI_ADJUST_CFA_OFFSET -8
729.endm
730.macro UNWIND_ESPFIX_STACK
731 movl %ss, %eax
732 /* see if on espfix stack */
733 cmpw $__ESPFIX_SS, %ax
734 jne 27f
735 movl $__KERNEL_DS, %eax
736 movl %eax, %ds
737 movl %eax, %es
738 /* switch to normal stack */
739 FIXUP_ESPFIX_STACK
74027:
741.endm
620 742
621/* 743/*
622 * Build the entry stubs and pointer table with some assembler magic. 744 * Build the entry stubs and pointer table with some assembler magic.
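The PTREGSCALL stubs introduced above pair with C syscall implementations that take a struct pt_regs pointer as their first argument; under the 32-bit kernel's -mregparm=3 convention that argument arrives in %eax, which each stub points at the register frame SAVE_ALL built ("leal 4(%esp),%eax" skips the dispatch call's return address). An assumed C-side shape, for illustration only:

/* Assumed prototype shape for the stubs above; not part of this diff. */
int sys_vfork(struct pt_regs *regs);	/* regs == the SAVE_ALL frame */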
@@ -672,7 +794,7 @@ common_interrupt:
672ENDPROC(common_interrupt) 794ENDPROC(common_interrupt)
673 CFI_ENDPROC 795 CFI_ENDPROC
674 796
675#define BUILD_INTERRUPT(name, nr) \ 797#define BUILD_INTERRUPT3(name, nr, fn) \
676ENTRY(name) \ 798ENTRY(name) \
677 RING0_INT_FRAME; \ 799 RING0_INT_FRAME; \
678 pushl $~(nr); \ 800 pushl $~(nr); \
@@ -680,13 +802,15 @@ ENTRY(name) \
680 SAVE_ALL; \ 802 SAVE_ALL; \
681 TRACE_IRQS_OFF \ 803 TRACE_IRQS_OFF \
682 movl %esp,%eax; \ 804 movl %esp,%eax; \
683 call smp_##name; \ 805 call fn; \
684 jmp ret_from_intr; \ 806 jmp ret_from_intr; \
685 CFI_ENDPROC; \ 807 CFI_ENDPROC; \
686ENDPROC(name) 808ENDPROC(name)
687 809
810#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name)
811
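The refactor makes the handler an explicit third parameter and keeps the old two-argument form as a thin wrapper that pastes on the conventional smp_ prefix, so the common case stays unchanged while odd vectors can name their handler directly. The same shape in plain C preprocessor, as a runnable sketch (all names illustrative):

    #include <stdio.h>

    #define BUILD_INTERRUPT3(name, nr, fn) \
        void name(void) { printf("vector %d: ", (nr)); fn(); }

    #define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name)

    static void smp_timer(void)       { puts("smp_timer"); }
    static void special_handler(void) { puts("special_handler"); }

    BUILD_INTERRUPT(timer, 32)                      /* handler defaults to smp_timer */
    BUILD_INTERRUPT3(spurious, 39, special_handler) /* handler named explicitly      */

    int main(void)
    {
        timer();
        spurious();
        return 0;
    }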
688/* The include is where all of the SMP etc. interrupts come from */ 812/* The include is where all of the SMP etc. interrupts come from */
689#include "entry_arch.h" 813#include <asm/entry_arch.h>
690 814
691ENTRY(coprocessor_error) 815ENTRY(coprocessor_error)
692 RING0_INT_FRAME 816 RING0_INT_FRAME
@@ -1068,7 +1192,10 @@ ENTRY(page_fault)
1068 CFI_ADJUST_CFA_OFFSET 4 1192 CFI_ADJUST_CFA_OFFSET 4
1069 ALIGN 1193 ALIGN
1070error_code: 1194error_code:
1071 /* the function address is in %fs's slot on the stack */ 1195 /* the function address is in %gs's slot on the stack */
1196 pushl %fs
1197 CFI_ADJUST_CFA_OFFSET 4
1198 /*CFI_REL_OFFSET fs, 0*/
1072 pushl %es 1199 pushl %es
1073 CFI_ADJUST_CFA_OFFSET 4 1200 CFI_ADJUST_CFA_OFFSET 4
1074 /*CFI_REL_OFFSET es, 0*/ 1201 /*CFI_REL_OFFSET es, 0*/
@@ -1097,20 +1224,15 @@ error_code:
1097 CFI_ADJUST_CFA_OFFSET 4 1224 CFI_ADJUST_CFA_OFFSET 4
1098 CFI_REL_OFFSET ebx, 0 1225 CFI_REL_OFFSET ebx, 0
1099 cld 1226 cld
1100 pushl %fs
1101 CFI_ADJUST_CFA_OFFSET 4
1102 /*CFI_REL_OFFSET fs, 0*/
1103 movl $(__KERNEL_PERCPU), %ecx 1227 movl $(__KERNEL_PERCPU), %ecx
1104 movl %ecx, %fs 1228 movl %ecx, %fs
1105 UNWIND_ESPFIX_STACK 1229 UNWIND_ESPFIX_STACK
1106 popl %ecx 1230 GS_TO_REG %ecx
1107 CFI_ADJUST_CFA_OFFSET -4 1231 movl PT_GS(%esp), %edi # get the function address
1108 /*CFI_REGISTER es, ecx*/
1109 movl PT_FS(%esp), %edi # get the function address
1110 movl PT_ORIG_EAX(%esp), %edx # get the error code 1232 movl PT_ORIG_EAX(%esp), %edx # get the error code
1111 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart 1233 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
1112 mov %ecx, PT_FS(%esp) 1234 REG_TO_PTGS %ecx
1113 /*CFI_REL_OFFSET fs, ES*/ 1235 SET_KERNEL_GS %ecx
1114 movl $(__USER_DS), %ecx 1236 movl $(__USER_DS), %ecx
1115 movl %ecx, %ds 1237 movl %ecx, %ds
1116 movl %ecx, %es 1238 movl %ecx, %es
@@ -1134,26 +1256,27 @@ END(page_fault)
1134 * by hand onto the new stack - while updating the return eip past 1256 * by hand onto the new stack - while updating the return eip past
1135 * the instruction that would have done it for sysenter. 1257 * the instruction that would have done it for sysenter.
1136 */ 1258 */
1137#define FIX_STACK(offset, ok, label) \ 1259.macro FIX_STACK offset ok label
1138 cmpw $__KERNEL_CS,4(%esp); \ 1260 cmpw $__KERNEL_CS, 4(%esp)
1139 jne ok; \ 1261 jne \ok
1140label: \ 1262\label:
1141 movl TSS_sysenter_sp0+offset(%esp),%esp; \ 1263 movl TSS_sysenter_sp0 + \offset(%esp), %esp
1142 CFI_DEF_CFA esp, 0; \ 1264 CFI_DEF_CFA esp, 0
1143 CFI_UNDEFINED eip; \ 1265 CFI_UNDEFINED eip
1144 pushfl; \ 1266 pushfl
1145 CFI_ADJUST_CFA_OFFSET 4; \ 1267 CFI_ADJUST_CFA_OFFSET 4
1146 pushl $__KERNEL_CS; \ 1268 pushl $__KERNEL_CS
1147 CFI_ADJUST_CFA_OFFSET 4; \ 1269 CFI_ADJUST_CFA_OFFSET 4
1148 pushl $sysenter_past_esp; \ 1270 pushl $sysenter_past_esp
1149 CFI_ADJUST_CFA_OFFSET 4; \ 1271 CFI_ADJUST_CFA_OFFSET 4
1150 CFI_REL_OFFSET eip, 0 1272 CFI_REL_OFFSET eip, 0
1273.endm
1151 1274
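Like the espfix helpers earlier in this file, FIX_STACK moves from a cpp #define to a gas .macro: no trailing-backslash continuations, no forcing the whole body onto one logical line, and arguments are referenced as \offset, \ok, \label instead of being textually pasted. For contrast, this small runnable C fragment shows the pre-conversion constraints the preprocessor imposes (names invented for illustration):

    #include <stdio.h>

    /* the old style: one logical line, backslash continuations,
     * positional textual pasting -- the same constraints FIX_STACK had */
    #define REPORT_FIX(offset, msg)                              \
        do {                                                     \
            printf("fixing stack at +%d: %s\n", (offset), (msg)); \
        } while (0)

    int main(void)
    {
        REPORT_FIX(12, "sysenter frame");
        return 0;
    }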
1152ENTRY(debug) 1275ENTRY(debug)
1153 RING0_INT_FRAME 1276 RING0_INT_FRAME
1154 cmpl $ia32_sysenter_target,(%esp) 1277 cmpl $ia32_sysenter_target,(%esp)
1155 jne debug_stack_correct 1278 jne debug_stack_correct
1156 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) 1279 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
1157debug_stack_correct: 1280debug_stack_correct:
1158 pushl $-1 # mark this as an int 1281 pushl $-1 # mark this as an int
1159 CFI_ADJUST_CFA_OFFSET 4 1282 CFI_ADJUST_CFA_OFFSET 4
@@ -1211,7 +1334,7 @@ nmi_stack_correct:
1211 1334
1212nmi_stack_fixup: 1335nmi_stack_fixup:
1213 RING0_INT_FRAME 1336 RING0_INT_FRAME
1214 FIX_STACK(12,nmi_stack_correct, 1) 1337 FIX_STACK 12, nmi_stack_correct, 1
1215 jmp nmi_stack_correct 1338 jmp nmi_stack_correct
1216 1339
1217nmi_debug_stack_check: 1340nmi_debug_stack_check:
@@ -1222,7 +1345,7 @@ nmi_debug_stack_check:
1222 jb nmi_stack_correct 1345 jb nmi_stack_correct
1223 cmpl $debug_esp_fix_insn,(%esp) 1346 cmpl $debug_esp_fix_insn,(%esp)
1224 ja nmi_stack_correct 1347 ja nmi_stack_correct
1225 FIX_STACK(24,nmi_stack_correct, 1) 1348 FIX_STACK 24, nmi_stack_correct, 1
1226 jmp nmi_stack_correct 1349 jmp nmi_stack_correct
1227 1350
1228nmi_espfix_stack: 1351nmi_espfix_stack:
@@ -1234,7 +1357,7 @@ nmi_espfix_stack:
1234 CFI_ADJUST_CFA_OFFSET 4 1357 CFI_ADJUST_CFA_OFFSET 4
1235 pushl %esp 1358 pushl %esp
1236 CFI_ADJUST_CFA_OFFSET 4 1359 CFI_ADJUST_CFA_OFFSET 4
1237 addw $4, (%esp) 1360 addl $4, (%esp)
1238 /* copy the iret frame of 12 bytes */ 1361 /* copy the iret frame of 12 bytes */
1239 .rept 3 1362 .rept 3
1240 pushl 16(%esp) 1363 pushl 16(%esp)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a1346217e43c..a331ec38af9e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -48,10 +48,11 @@
48#include <asm/unistd.h> 48#include <asm/unistd.h>
49#include <asm/thread_info.h> 49#include <asm/thread_info.h>
50#include <asm/hw_irq.h> 50#include <asm/hw_irq.h>
51#include <asm/page.h> 51#include <asm/page_types.h>
52#include <asm/irqflags.h> 52#include <asm/irqflags.h>
53#include <asm/paravirt.h> 53#include <asm/paravirt.h>
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/percpu.h>
55 56
56/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
57#include <linux/elf-em.h> 58#include <linux/elf-em.h>
@@ -76,20 +77,17 @@ ENTRY(ftrace_caller)
76 movq 8(%rbp), %rsi 77 movq 8(%rbp), %rsi
77 subq $MCOUNT_INSN_SIZE, %rdi 78 subq $MCOUNT_INSN_SIZE, %rdi
78 79
79.globl ftrace_call 80GLOBAL(ftrace_call)
80ftrace_call:
81 call ftrace_stub 81 call ftrace_stub
82 82
83 MCOUNT_RESTORE_FRAME 83 MCOUNT_RESTORE_FRAME
84 84
85#ifdef CONFIG_FUNCTION_GRAPH_TRACER 85#ifdef CONFIG_FUNCTION_GRAPH_TRACER
86.globl ftrace_graph_call 86GLOBAL(ftrace_graph_call)
87ftrace_graph_call:
88 jmp ftrace_stub 87 jmp ftrace_stub
89#endif 88#endif
90 89
91.globl ftrace_stub 90GLOBAL(ftrace_stub)
92ftrace_stub:
93 retq 91 retq
94END(ftrace_caller) 92END(ftrace_caller)
95 93
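GLOBAL() collapses the recurring ".globl name" / "name:" pair into a single line for symbols that need global visibility but none of the function prologue bookkeeping that ENTRY() adds. The helper's definition is not shown in this hunk; presumably (an assumption, not visible in this diff) it lives in <linux/linkage.h> as something like:

    /* assumed definition -- not part of this diff */
    #define GLOBAL(name)  \
            .globl name;  \
            name: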
@@ -109,8 +107,7 @@ ENTRY(mcount)
109 jnz ftrace_graph_caller 107 jnz ftrace_graph_caller
110#endif 108#endif
111 109
112.globl ftrace_stub 110GLOBAL(ftrace_stub)
113ftrace_stub:
114 retq 111 retq
115 112
116trace: 113trace:
@@ -147,9 +144,7 @@ ENTRY(ftrace_graph_caller)
147 retq 144 retq
148END(ftrace_graph_caller) 145END(ftrace_graph_caller)
149 146
150 147GLOBAL(return_to_handler)
151.globl return_to_handler
152return_to_handler:
153 subq $80, %rsp 148 subq $80, %rsp
154 149
155 movq %rax, (%rsp) 150 movq %rax, (%rsp)
@@ -187,6 +182,7 @@ return_to_handler:
187ENTRY(native_usergs_sysret64) 182ENTRY(native_usergs_sysret64)
188 swapgs 183 swapgs
189 sysretq 184 sysretq
185ENDPROC(native_usergs_sysret64)
190#endif /* CONFIG_PARAVIRT */ 186#endif /* CONFIG_PARAVIRT */
191 187
192 188
@@ -209,7 +205,7 @@ ENTRY(native_usergs_sysret64)
209 205
210 /* %rsp:at FRAMEEND */ 206 /* %rsp:at FRAMEEND */
211 .macro FIXUP_TOP_OF_STACK tmp offset=0 207 .macro FIXUP_TOP_OF_STACK tmp offset=0
212 movq %gs:pda_oldrsp,\tmp 208 movq PER_CPU_VAR(old_rsp),\tmp
213 movq \tmp,RSP+\offset(%rsp) 209 movq \tmp,RSP+\offset(%rsp)
214 movq $__USER_DS,SS+\offset(%rsp) 210 movq $__USER_DS,SS+\offset(%rsp)
215 movq $__USER_CS,CS+\offset(%rsp) 211 movq $__USER_CS,CS+\offset(%rsp)
@@ -220,7 +216,7 @@ ENTRY(native_usergs_sysret64)
220 216
221 .macro RESTORE_TOP_OF_STACK tmp offset=0 217 .macro RESTORE_TOP_OF_STACK tmp offset=0
222 movq RSP+\offset(%rsp),\tmp 218 movq RSP+\offset(%rsp),\tmp
223 movq \tmp,%gs:pda_oldrsp 219 movq \tmp,PER_CPU_VAR(old_rsp)
224 movq EFLAGS+\offset(%rsp),\tmp 220 movq EFLAGS+\offset(%rsp),\tmp
225 movq \tmp,R11+\offset(%rsp) 221 movq \tmp,R11+\offset(%rsp)
226 .endm 222 .endm
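Both macros now go through PER_CPU_VAR() instead of hard-coded %gs:pda_* fields: the old x8664_pda members become ordinary per-cpu variables addressed off the per-cpu segment base, which is what lets this series delete the PDA. A rough sketch of what the accessor presumably expands to (an assumption; the real definition in <asm/percpu.h> also handles the UP and 32-bit cases):

    /* assumed shape, for illustration only */
    #ifdef CONFIG_SMP
    # define PER_CPU_VAR(var)  %gs:per_cpu__##var   /* %fs on 32-bit */
    #else
    # define PER_CPU_VAR(var)  per_cpu__##var
    #endif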
@@ -336,15 +332,15 @@ ENTRY(save_args)
336 je 1f 332 je 1f
337 SWAPGS 333 SWAPGS
338 /* 334 /*
339 * irqcount is used to check if a CPU is already on an interrupt stack 335 * irq_count is used to check if a CPU is already on an interrupt stack
340 * or not. While this is essentially redundant with preempt_count it is 336 * or not. While this is essentially redundant with preempt_count it is
341 * a little cheaper to use a separate counter in the PDA (short of 337 * a little cheaper to use a separate counter in the PDA (short of
342 * moving irq_enter into assembly, which would be too much work) 338 * moving irq_enter into assembly, which would be too much work)
343 */ 339 */
3441: incl %gs:pda_irqcount 3401: incl PER_CPU_VAR(irq_count)
345 jne 2f 341 jne 2f
346 popq_cfi %rax /* move return address... */ 342 popq_cfi %rax /* move return address... */
347 mov %gs:pda_irqstackptr,%rsp 343 mov PER_CPU_VAR(irq_stack_ptr),%rsp
348 EMPTY_FRAME 0 344 EMPTY_FRAME 0
349 pushq_cfi %rbp /* backlink for unwinder */ 345 pushq_cfi %rbp /* backlink for unwinder */
350 pushq_cfi %rax /* ... to the new stack */ 346 pushq_cfi %rax /* ... to the new stack */
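The comment above explains the trick: irq_count only records whether this CPU is already on its interrupt stack, so only the outermost interrupt performs the stack switch (the counter starts at -1, and the incl/jne pair switches exactly when the increment lands on zero). A toy user-space model of that logic, not the kernel's exact mechanics (the real exit path restores %rsp through the saved frame pointer):

    #include <stdio.h>

    static int irq_count = -1;      /* -1: not on the interrupt stack */
    static int on_irq_stack;        /* stand-in for the %rsp switch   */

    static void irq_enter_model(void)
    {
        if (++irq_count == 0)       /* outermost interrupt only...    */
            on_irq_stack = 1;       /* ...switches to the IRQ stack   */
    }

    static void irq_exit_model(void)
    {
        if (irq_count-- == 0)
            on_irq_stack = 0;
    }

    int main(void)
    {
        irq_enter_model();          /* switches          */
        irq_enter_model();          /* nested: no switch */
        printf("nested, on_irq_stack=%d\n", on_irq_stack);
        irq_exit_model();
        irq_exit_model();
        printf("done,   on_irq_stack=%d\n", on_irq_stack);
        return 0;
    }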
@@ -372,6 +368,7 @@ ENTRY(save_rest)
372END(save_rest) 368END(save_rest)
373 369
374/* save complete stack frame */ 370/* save complete stack frame */
371 .pushsection .kprobes.text, "ax"
375ENTRY(save_paranoid) 372ENTRY(save_paranoid)
376 XCPT_FRAME 1 RDI+8 373 XCPT_FRAME 1 RDI+8
377 cld 374 cld
@@ -400,6 +397,7 @@ ENTRY(save_paranoid)
4001: ret 3971: ret
401 CFI_ENDPROC 398 CFI_ENDPROC
402END(save_paranoid) 399END(save_paranoid)
400 .popsection
403 401
404/* 402/*
405 * A newly forked process directly context switches into this address. 403 * A newly forked process directly context switches into this address.
@@ -409,6 +407,8 @@ END(save_paranoid)
409ENTRY(ret_from_fork) 407ENTRY(ret_from_fork)
410 DEFAULT_FRAME 408 DEFAULT_FRAME
411 409
410 LOCK ; btr $TIF_FORK,TI_flags(%r8)
411
412 push kernel_eflags(%rip) 412 push kernel_eflags(%rip)
413 CFI_ADJUST_CFA_OFFSET 8 413 CFI_ADJUST_CFA_OFFSET 8
414 popf # reset kernel eflags 414 popf # reset kernel eflags
@@ -418,7 +418,6 @@ ENTRY(ret_from_fork)
418 418
419 GET_THREAD_INFO(%rcx) 419 GET_THREAD_INFO(%rcx)
420 420
421 CFI_REMEMBER_STATE
422 RESTORE_REST 421 RESTORE_REST
423 422
424 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? 423 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
@@ -430,7 +429,6 @@ ENTRY(ret_from_fork)
430 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET 429 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
431 jmp ret_from_sys_call # go to the SYSRET fastpath 430 jmp ret_from_sys_call # go to the SYSRET fastpath
432 431
433 CFI_RESTORE_STATE
434 CFI_ENDPROC 432 CFI_ENDPROC
435END(ret_from_fork) 433END(ret_from_fork)
436 434
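The added "LOCK ; btr" clears the child's TIF_FORK bit atomically on its first pass through ret_from_fork; the lock prefix is needed because other flag bits in the same word can be updated concurrently. The C11 equivalent of that one instruction, as a runnable sketch (the bit position here is illustrative, not the real value):

    #include <stdatomic.h>
    #include <stdio.h>

    #define TIF_FORK 18u   /* illustrative bit position */

    static atomic_uint ti_flags;

    int main(void)
    {
        atomic_store(&ti_flags, 1u << TIF_FORK);

        /* "lock btr $TIF_FORK, flags": one atomic read-modify-write
         * that clears exactly one bit and yields the old value */
        unsigned old = atomic_fetch_and(&ti_flags, ~(1u << TIF_FORK));

        printf("was set: %d, now: %#x\n",
               !!(old & (1u << TIF_FORK)), atomic_load(&ti_flags));
        return 0;
    }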
@@ -468,7 +466,7 @@ END(ret_from_fork)
468ENTRY(system_call) 466ENTRY(system_call)
469 CFI_STARTPROC simple 467 CFI_STARTPROC simple
470 CFI_SIGNAL_FRAME 468 CFI_SIGNAL_FRAME
471 CFI_DEF_CFA rsp,PDA_STACKOFFSET 469 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
472 CFI_REGISTER rip,rcx 470 CFI_REGISTER rip,rcx
473 /*CFI_REGISTER rflags,r11*/ 471 /*CFI_REGISTER rflags,r11*/
474 SWAPGS_UNSAFE_STACK 472 SWAPGS_UNSAFE_STACK
@@ -479,8 +477,8 @@ ENTRY(system_call)
479 */ 477 */
480ENTRY(system_call_after_swapgs) 478ENTRY(system_call_after_swapgs)
481 479
482 movq %rsp,%gs:pda_oldrsp 480 movq %rsp,PER_CPU_VAR(old_rsp)
483 movq %gs:pda_kernelstack,%rsp 481 movq PER_CPU_VAR(kernel_stack),%rsp
484 /* 482 /*
485 * No need to follow this irqs off/on section - it's straight 483 * No need to follow this irqs off/on section - it's straight
486 * and short: 484 * and short:
@@ -523,7 +521,7 @@ sysret_check:
523 CFI_REGISTER rip,rcx 521 CFI_REGISTER rip,rcx
524 RESTORE_ARGS 0,-ARG_SKIP,1 522 RESTORE_ARGS 0,-ARG_SKIP,1
525 /*CFI_REGISTER rflags,r11*/ 523 /*CFI_REGISTER rflags,r11*/
526 movq %gs:pda_oldrsp, %rsp 524 movq PER_CPU_VAR(old_rsp), %rsp
527 USERGS_SYSRET64 525 USERGS_SYSRET64
528 526
529 CFI_RESTORE_STATE 527 CFI_RESTORE_STATE
@@ -630,16 +628,14 @@ tracesys:
630 * Syscall return path ending with IRET. 628 * Syscall return path ending with IRET.
631 * Has correct top of stack, but partial stack frame. 629 * Has correct top of stack, but partial stack frame.
632 */ 630 */
633 .globl int_ret_from_sys_call 631GLOBAL(int_ret_from_sys_call)
634 .globl int_with_check
635int_ret_from_sys_call:
636 DISABLE_INTERRUPTS(CLBR_NONE) 632 DISABLE_INTERRUPTS(CLBR_NONE)
637 TRACE_IRQS_OFF 633 TRACE_IRQS_OFF
638 testl $3,CS-ARGOFFSET(%rsp) 634 testl $3,CS-ARGOFFSET(%rsp)
639 je retint_restore_args 635 je retint_restore_args
640 movl $_TIF_ALLWORK_MASK,%edi 636 movl $_TIF_ALLWORK_MASK,%edi
641 /* edi: mask to check */ 637 /* edi: mask to check */
642int_with_check: 638GLOBAL(int_with_check)
643 LOCKDEP_SYS_EXIT_IRQ 639 LOCKDEP_SYS_EXIT_IRQ
644 GET_THREAD_INFO(%rcx) 640 GET_THREAD_INFO(%rcx)
645 movl TI_flags(%rcx),%edx 641 movl TI_flags(%rcx),%edx
@@ -833,11 +829,11 @@ common_interrupt:
833 XCPT_FRAME 829 XCPT_FRAME
834 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ 830 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
835 interrupt do_IRQ 831 interrupt do_IRQ
836 /* 0(%rsp): oldrsp-ARGOFFSET */ 832 /* 0(%rsp): old_rsp-ARGOFFSET */
837ret_from_intr: 833ret_from_intr:
838 DISABLE_INTERRUPTS(CLBR_NONE) 834 DISABLE_INTERRUPTS(CLBR_NONE)
839 TRACE_IRQS_OFF 835 TRACE_IRQS_OFF
840 decl %gs:pda_irqcount 836 decl PER_CPU_VAR(irq_count)
841 leaveq 837 leaveq
842 CFI_DEF_CFA_REGISTER rsp 838 CFI_DEF_CFA_REGISTER rsp
843 CFI_ADJUST_CFA_OFFSET -8 839 CFI_ADJUST_CFA_OFFSET -8
@@ -982,10 +978,14 @@ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
982 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt 978 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
983#endif 979#endif
984 980
981#ifdef CONFIG_X86_UV
985apicinterrupt UV_BAU_MESSAGE \ 982apicinterrupt UV_BAU_MESSAGE \
986 uv_bau_message_intr1 uv_bau_message_interrupt 983 uv_bau_message_intr1 uv_bau_message_interrupt
984#endif
987apicinterrupt LOCAL_TIMER_VECTOR \ 985apicinterrupt LOCAL_TIMER_VECTOR \
988 apic_timer_interrupt smp_apic_timer_interrupt 986 apic_timer_interrupt smp_apic_timer_interrupt
987apicinterrupt GENERIC_INTERRUPT_VECTOR \
988 generic_interrupt smp_generic_interrupt
989 989
990#ifdef CONFIG_SMP 990#ifdef CONFIG_SMP
991apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 991apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
@@ -1073,10 +1073,10 @@ ENTRY(\sym)
1073 TRACE_IRQS_OFF 1073 TRACE_IRQS_OFF
1074 movq %rsp,%rdi /* pt_regs pointer */ 1074 movq %rsp,%rdi /* pt_regs pointer */
1075 xorl %esi,%esi /* no error code */ 1075 xorl %esi,%esi /* no error code */
1076 movq %gs:pda_data_offset, %rbp 1076 PER_CPU(init_tss, %rbp)
1077 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) 1077 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
1078 call \do_sym 1078 call \do_sym
1079 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) 1079 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
1080 jmp paranoid_exit /* %ebx: no swapgs flag */ 1080 jmp paranoid_exit /* %ebx: no swapgs flag */
1081 CFI_ENDPROC 1081 CFI_ENDPROC
1082END(\sym) 1082END(\sym)
@@ -1138,7 +1138,7 @@ ENTRY(native_load_gs_index)
1138 CFI_STARTPROC 1138 CFI_STARTPROC
1139 pushf 1139 pushf
1140 CFI_ADJUST_CFA_OFFSET 8 1140 CFI_ADJUST_CFA_OFFSET 8
1141 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) 1141 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
1142 SWAPGS 1142 SWAPGS
1143gs_change: 1143gs_change:
1144 movl %edi,%gs 1144 movl %edi,%gs
@@ -1260,14 +1260,14 @@ ENTRY(call_softirq)
1260 CFI_REL_OFFSET rbp,0 1260 CFI_REL_OFFSET rbp,0
1261 mov %rsp,%rbp 1261 mov %rsp,%rbp
1262 CFI_DEF_CFA_REGISTER rbp 1262 CFI_DEF_CFA_REGISTER rbp
1263 incl %gs:pda_irqcount 1263 incl PER_CPU_VAR(irq_count)
1264 cmove %gs:pda_irqstackptr,%rsp 1264 cmove PER_CPU_VAR(irq_stack_ptr),%rsp
1265 push %rbp # backlink for old unwinder 1265 push %rbp # backlink for old unwinder
1266 call __do_softirq 1266 call __do_softirq
1267 leaveq 1267 leaveq
1268 CFI_DEF_CFA_REGISTER rsp 1268 CFI_DEF_CFA_REGISTER rsp
1269 CFI_ADJUST_CFA_OFFSET -8 1269 CFI_ADJUST_CFA_OFFSET -8
1270 decl %gs:pda_irqcount 1270 decl PER_CPU_VAR(irq_count)
1271 ret 1271 ret
1272 CFI_ENDPROC 1272 CFI_ENDPROC
1273END(call_softirq) 1273END(call_softirq)
@@ -1297,15 +1297,15 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1297 movq %rdi, %rsp # we don't return, adjust the stack frame 1297 movq %rdi, %rsp # we don't return, adjust the stack frame
1298 CFI_ENDPROC 1298 CFI_ENDPROC
1299 DEFAULT_FRAME 1299 DEFAULT_FRAME
130011: incl %gs:pda_irqcount 130011: incl PER_CPU_VAR(irq_count)
1301 movq %rsp,%rbp 1301 movq %rsp,%rbp
1302 CFI_DEF_CFA_REGISTER rbp 1302 CFI_DEF_CFA_REGISTER rbp
1303 cmovzq %gs:pda_irqstackptr,%rsp 1303 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
1304 pushq %rbp # backlink for old unwinder 1304 pushq %rbp # backlink for old unwinder
1305 call xen_evtchn_do_upcall 1305 call xen_evtchn_do_upcall
1306 popq %rsp 1306 popq %rsp
1307 CFI_DEF_CFA_REGISTER rsp 1307 CFI_DEF_CFA_REGISTER rsp
1308 decl %gs:pda_irqcount 1308 decl PER_CPU_VAR(irq_count)
1309 jmp error_exit 1309 jmp error_exit
1310 CFI_ENDPROC 1310 CFI_ENDPROC
1311END(do_hypervisor_callback) 1311END(do_hypervisor_callback)
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
deleted file mode 100644
index 53699c931ad4..000000000000
--- a/arch/x86/kernel/es7000_32.c
+++ /dev/null
@@ -1,378 +0,0 @@
1/*
2 * Written by: Garry Forsgren, Unisys Corporation
3 * Natalie Protasevich, Unisys Corporation
4 * This file contains the code to configure and interface
5 * with Unisys ES7000 series hardware system manager.
6 *
7 * Copyright (c) 2003 Unisys Corporation. All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of version 2 of the GNU General Public License as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it would be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write the Free Software Foundation, Inc., 59
19 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
20 *
21 * Contact information: Unisys Corporation, Township Line & Union Meeting
22 * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
23 *
24 * http://www.unisys.com
25 */
26
27#include <linux/module.h>
28#include <linux/types.h>
29#include <linux/kernel.h>
30#include <linux/smp.h>
31#include <linux/string.h>
32#include <linux/spinlock.h>
33#include <linux/errno.h>
34#include <linux/notifier.h>
35#include <linux/reboot.h>
36#include <linux/init.h>
37#include <linux/acpi.h>
38#include <asm/io.h>
39#include <asm/nmi.h>
40#include <asm/smp.h>
41#include <asm/atomic.h>
42#include <asm/apicdef.h>
43#include <mach_mpparse.h>
44#include <asm/genapic.h>
45#include <asm/setup.h>
46
47/*
48 * ES7000 chipsets
49 */
50
51#define NON_UNISYS 0
52#define ES7000_CLASSIC 1
53#define ES7000_ZORRO 2
54
55
56#define MIP_REG 1
57#define MIP_PSAI_REG 4
58
59#define MIP_BUSY 1
60#define MIP_SPIN 0xf0000
61#define MIP_VALID 0x0100000000000000ULL
62#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
63
64#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
65
66struct mip_reg_info {
67 unsigned long long mip_info;
68 unsigned long long delivery_info;
69 unsigned long long host_reg;
70 unsigned long long mip_reg;
71};
72
73struct part_info {
74 unsigned char type;
75 unsigned char length;
76 unsigned char part_id;
77 unsigned char apic_mode;
78 unsigned long snum;
79 char ptype[16];
80 char sname[64];
81 char pname[64];
82};
83
84struct psai {
85 unsigned long long entry_type;
86 unsigned long long addr;
87 unsigned long long bep_addr;
88};
89
90struct es7000_mem_info {
91 unsigned char type;
92 unsigned char length;
93 unsigned char resv[6];
94 unsigned long long start;
95 unsigned long long size;
96};
97
98struct es7000_oem_table {
99 unsigned long long hdr;
100 struct mip_reg_info mip;
101 struct part_info pif;
102 struct es7000_mem_info shm;
103 struct psai psai;
104};
105
106#ifdef CONFIG_ACPI
107
108struct oem_table {
109 struct acpi_table_header Header;
110 u32 OEMTableAddr;
111 u32 OEMTableSize;
112};
113
114extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
115extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
116#endif
117
118struct mip_reg {
119 unsigned long long off_0;
120 unsigned long long off_8;
121 unsigned long long off_10;
122 unsigned long long off_18;
123 unsigned long long off_20;
124 unsigned long long off_28;
125 unsigned long long off_30;
126 unsigned long long off_38;
127};
128
129#define MIP_SW_APIC 0x1020b
130#define MIP_FUNC(VALUE) (VALUE & 0xff)
131
132/*
133 * ES7000 Globals
134 */
135
136static volatile unsigned long *psai = NULL;
137static struct mip_reg *mip_reg;
138static struct mip_reg *host_reg;
139static int mip_port;
140static unsigned long mip_addr, host_addr;
141
142int es7000_plat;
143
144/*
145 * GSI override for ES7000 platforms.
146 */
147
148static unsigned int base;
149
150static int
151es7000_rename_gsi(int ioapic, int gsi)
152{
153 if (es7000_plat == ES7000_ZORRO)
154 return gsi;
155
156 if (!base) {
157 int i;
158 for (i = 0; i < nr_ioapics; i++)
159 base += nr_ioapic_registers[i];
160 }
161
162 if (!ioapic && (gsi < 16))
163 gsi += base;
164 return gsi;
165}
166
167static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
168{
169 unsigned long vect = 0, psaival = 0;
170
171 if (psai == NULL)
172 return -1;
173
174 vect = ((unsigned long)__pa(eip)/0x1000) << 16;
175 psaival = (0x1000000 | vect | cpu);
176
177 while (*psai & 0x1000000)
178 ;
179
180 *psai = psaival;
181
182 return 0;
183}
184
185static void noop_wait_for_deassert(atomic_t *deassert_not_used)
186{
187}
188
189static int __init es7000_update_genapic(void)
190{
191 genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
192
193 /* MPENTIUMIII */
194 if (boot_cpu_data.x86 == 6 &&
195 (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) {
196 es7000_update_genapic_to_cluster();
197 genapic->wait_for_init_deassert = noop_wait_for_deassert;
198 genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
199 }
200
201 return 0;
202}
203
204void __init
205setup_unisys(void)
206{
207 /*
208 * Determine the generation of the ES7000 currently running.
209 *
210 * es7000_plat = 1 if the machine is a 5xx ES7000 box
211 * es7000_plat = 2 if the machine is a x86_64 ES7000 box
212 *
213 */
214 if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
215 es7000_plat = ES7000_ZORRO;
216 else
217 es7000_plat = ES7000_CLASSIC;
218 ioapic_renumber_irq = es7000_rename_gsi;
219
220 x86_quirks->update_genapic = es7000_update_genapic;
221}
222
223/*
224 * Parse the OEM Table
225 */
226
227int __init
228parse_unisys_oem (char *oemptr)
229{
230 int i;
231 int success = 0;
232 unsigned char type, size;
233 unsigned long val;
234 char *tp = NULL;
235 struct psai *psaip = NULL;
236 struct mip_reg_info *mi;
237 struct mip_reg *host, *mip;
238
239 tp = oemptr;
240
241 tp += 8;
242
243 for (i=0; i <= 6; i++) {
244 type = *tp++;
245 size = *tp++;
246 tp -= 2;
247 switch (type) {
248 case MIP_REG:
249 mi = (struct mip_reg_info *)tp;
250 val = MIP_RD_LO(mi->host_reg);
251 host_addr = val;
252 host = (struct mip_reg *)val;
253 host_reg = __va(host);
254 val = MIP_RD_LO(mi->mip_reg);
255 mip_port = MIP_PORT(mi->mip_info);
256 mip_addr = val;
257 mip = (struct mip_reg *)val;
258 mip_reg = __va(mip);
259 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
260 (unsigned long)host_reg);
261 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
262 (unsigned long)mip_reg);
263 success++;
264 break;
265 case MIP_PSAI_REG:
266 psaip = (struct psai *)tp;
267 if (tp != NULL) {
268 if (psaip->addr)
269 psai = __va(psaip->addr);
270 else
271 psai = NULL;
272 success++;
273 }
274 break;
275 default:
276 break;
277 }
278 tp += size;
279 }
280
281 if (success < 2) {
282 es7000_plat = NON_UNISYS;
283 } else
284 setup_unisys();
285 return es7000_plat;
286}
287
288#ifdef CONFIG_ACPI
289static unsigned long oem_addrX;
290static unsigned long oem_size;
291int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
292{
293 struct acpi_table_header *header = NULL;
294 int i = 0;
295
296 while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) {
297 if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
298 struct oem_table *t = (struct oem_table *)header;
299
300 oem_addrX = t->OEMTableAddr;
301 oem_size = t->OEMTableSize;
302
303 *oem_addr = (unsigned long)__acpi_map_table(oem_addrX,
304 oem_size);
305 return 0;
306 }
307 }
308 return -1;
309}
310
311void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
312{
313}
314#endif
315
316static void
317es7000_spin(int n)
318{
319 int i = 0;
320
321 while (i++ < n)
322 rep_nop();
323}
324
325static int __init
326es7000_mip_write(struct mip_reg *mip_reg)
327{
328 int status = 0;
329 int spin;
330
331 spin = MIP_SPIN;
332 while (((unsigned long long)host_reg->off_38 &
333 (unsigned long long)MIP_VALID) != 0) {
334 if (--spin <= 0) {
335 printk("es7000_mip_write: Timeout waiting for Host Valid Flag");
336 return -1;
337 }
338 es7000_spin(MIP_SPIN);
339 }
340
341 memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
342 outb(1, mip_port);
343
344 spin = MIP_SPIN;
345
346 while (((unsigned long long)mip_reg->off_38 &
347 (unsigned long long)MIP_VALID) == 0) {
348 if (--spin <= 0) {
349 printk("es7000_mip_write: Timeout waiting for MIP Valid Flag");
350 return -1;
351 }
352 es7000_spin(MIP_SPIN);
353 }
354
355 status = ((unsigned long long)mip_reg->off_0 &
356 (unsigned long long)0xffff0000000000ULL) >> 48;
357 mip_reg->off_38 = ((unsigned long long)mip_reg->off_38 &
358 (unsigned long long)~MIP_VALID);
359 return status;
360}
361
362void __init
363es7000_sw_apic(void)
364{
365 if (es7000_plat) {
366 int mip_status;
367 struct mip_reg es7000_mip_reg;
368
369 printk("ES7000: Enabling APIC mode.\n");
370 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
371 es7000_mip_reg.off_0 = MIP_SW_APIC;
372 es7000_mip_reg.off_38 = (MIP_VALID);
373 while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
374 printk("es7000_sw_apic: command failed, status = %x\n",
375 mip_status);
376 return;
377 }
378}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 231bdd3c5b1c..61df77532120 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -18,6 +18,7 @@
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/list.h> 19#include <linux/list.h>
20 20
21#include <asm/cacheflush.h>
21#include <asm/ftrace.h> 22#include <asm/ftrace.h>
22#include <linux/ftrace.h> 23#include <linux/ftrace.h>
23#include <asm/nops.h> 24#include <asm/nops.h>
@@ -26,6 +27,18 @@
26 27
27#ifdef CONFIG_DYNAMIC_FTRACE 28#ifdef CONFIG_DYNAMIC_FTRACE
28 29
30int ftrace_arch_code_modify_prepare(void)
31{
32 set_kernel_text_rw();
33 return 0;
34}
35
36int ftrace_arch_code_modify_post_process(void)
37{
38 set_kernel_text_ro();
39 return 0;
40}
41
29union ftrace_code_union { 42union ftrace_code_union {
30 char code[MCOUNT_INSN_SIZE]; 43 char code[MCOUNT_INSN_SIZE];
31 struct { 44 struct {
@@ -66,11 +79,11 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
66 * 79 *
67 * 1) Put the instruction pointer into the IP buffer 80 * 1) Put the instruction pointer into the IP buffer
68 * and the new code into the "code" buffer. 81 * and the new code into the "code" buffer.
69 * 2) Set a flag that says we are modifying code 82 * 2) Wait for any running NMIs to finish and set a flag that says
70 * 3) Wait for any running NMIs to finish. 83 * we are modifying code; this is done in one atomic operation.
71 * 4) Write the code 84 * 3) Write the code
72 * 5) clear the flag. 85 * 4) clear the flag.
73 * 6) Wait for any running NMIs to finish. 86 * 5) Wait for any running NMIs to finish.
74 * 87 *
75 * If an NMI is executed, the first thing it does is to call 88 * If an NMI is executed, the first thing it does is to call
76 * "ftrace_nmi_enter". This will check if the flag is set to write 89 * "ftrace_nmi_enter". This will check if the flag is set to write
@@ -82,9 +95,9 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
82 * are the same as what exists. 95 * are the same as what exists.
83 */ 96 */
84 97
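The rewrite below folds the old separate mod_code_write flag into the top bit of the NMI counter itself, so "an NMI is running" and "a write is pending" live in one atomic word and are sampled and updated by a single read-modify-write; that appears to remove the extra barriers and the read-flag/bump-counter window of the old two-variable protocol. A compilable single-threaded sketch of the state transitions, with C11 atomics standing in for the kernel's atomic_t:

    #include <stdatomic.h>
    #include <stdio.h>

    #define MOD_CODE_WRITE_FLAG (1u << 31)  /* top bit of the NMI counter */

    static atomic_uint nmi_running;
    static int code_patched;

    static void nmi_enter(void)
    {
        /* one atomic RMW both announces this NMI and samples the flag */
        if ((atomic_fetch_add(&nmi_running, 1) + 1) & MOD_CODE_WRITE_FLAG)
            code_patched = 1;           /* stand-in for ftrace_mod_code() */
    }

    static void nmi_exit(void)
    {
        atomic_fetch_sub(&nmi_running, 1);
    }

    static void wait_for_nmi_and_set_mod_flag(void)
    {
        unsigned expected = 0;
        /* claim the flag only when the count is exactly zero, as the
         * kernel's cmpxchg loop does; otherwise spin until NMIs drain */
        while (!atomic_compare_exchange_weak(&nmi_running, &expected,
                                             MOD_CODE_WRITE_FLAG))
            expected = 0;
    }

    int main(void)
    {
        wait_for_nmi_and_set_mod_flag();    /* writer claims the word */
        nmi_enter();                        /* NMI performs the write */
        nmi_exit();
        atomic_fetch_and(&nmi_running, ~MOD_CODE_WRITE_FLAG);
        /* the kernel's clear_mod_flag() loops on cmpxchg instead,
         * since it has no atomic AND primitive to lean on */
        printf("patched from NMI path: %d\n", code_patched);  /* 1 */
        return 0;
    }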
85static atomic_t in_nmi = ATOMIC_INIT(0); 98#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */
99static atomic_t nmi_running = ATOMIC_INIT(0);
86static int mod_code_status; /* holds return value of text write */ 100static int mod_code_status; /* holds return value of text write */
87static int mod_code_write; /* set when NMI should do the write */
88static void *mod_code_ip; /* holds the IP to write to */ 101static void *mod_code_ip; /* holds the IP to write to */
89static void *mod_code_newcode; /* holds the text to write to the IP */ 102static void *mod_code_newcode; /* holds the text to write to the IP */
90 103
@@ -101,6 +114,20 @@ int ftrace_arch_read_dyn_info(char *buf, int size)
101 return r; 114 return r;
102} 115}
103 116
117static void clear_mod_flag(void)
118{
119 int old = atomic_read(&nmi_running);
120
121 for (;;) {
122 int new = old & ~MOD_CODE_WRITE_FLAG;
123
124 if (old == new)
125 break;
126
127 old = atomic_cmpxchg(&nmi_running, old, new);
128 }
129}
130
104static void ftrace_mod_code(void) 131static void ftrace_mod_code(void)
105{ 132{
106 /* 133 /*
@@ -111,37 +138,52 @@ static void ftrace_mod_code(void)
111 */ 138 */
112 mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, 139 mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
113 MCOUNT_INSN_SIZE); 140 MCOUNT_INSN_SIZE);
141
142 /* if we fail, then kill any new writers */
143 if (mod_code_status)
144 clear_mod_flag();
114} 145}
115 146
116void ftrace_nmi_enter(void) 147void ftrace_nmi_enter(void)
117{ 148{
118 atomic_inc(&in_nmi); 149 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
119 /* Must have in_nmi seen before reading write flag */ 150 smp_rmb();
120 smp_mb();
121 if (mod_code_write) {
122 ftrace_mod_code(); 151 ftrace_mod_code();
123 atomic_inc(&nmi_update_count); 152 atomic_inc(&nmi_update_count);
124 } 153 }
154 /* Must have previous changes seen before executions */
155 smp_mb();
125} 156}
126 157
127void ftrace_nmi_exit(void) 158void ftrace_nmi_exit(void)
128{ 159{
129 /* Finish all executions before clearing in_nmi */ 160 /* Finish all executions before clearing nmi_running */
130 smp_wmb(); 161 smp_mb();
131 atomic_dec(&in_nmi); 162 atomic_dec(&nmi_running);
163}
164
165static void wait_for_nmi_and_set_mod_flag(void)
166{
167 if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
168 return;
169
170 do {
171 cpu_relax();
172 } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
173
174 nmi_wait_count++;
132} 175}
133 176
134static void wait_for_nmi(void) 177static void wait_for_nmi(void)
135{ 178{
136 int waited = 0; 179 if (!atomic_read(&nmi_running))
180 return;
137 181
138 while (atomic_read(&in_nmi)) { 182 do {
139 waited = 1;
140 cpu_relax(); 183 cpu_relax();
141 } 184 } while (atomic_read(&nmi_running));
142 185
143 if (waited) 186 nmi_wait_count++;
144 nmi_wait_count++;
145} 187}
146 188
147static int 189static int
@@ -151,14 +193,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
151 mod_code_newcode = new_code; 193 mod_code_newcode = new_code;
152 194
153 /* The buffers need to be visible before we let NMIs write them */ 195 /* The buffers need to be visible before we let NMIs write them */
154 smp_wmb();
155
156 mod_code_write = 1;
157
158 /* Make sure write bit is visible before we wait on NMIs */
159 smp_mb(); 196 smp_mb();
160 197
161 wait_for_nmi(); 198 wait_for_nmi_and_set_mod_flag();
162 199
163 /* Make sure all running NMIs have finished before we write the code */ 200 /* Make sure all running NMIs have finished before we write the code */
164 smp_mb(); 201 smp_mb();
@@ -166,13 +203,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
166 ftrace_mod_code(); 203 ftrace_mod_code();
167 204
168 /* Make sure the write happens before clearing the bit */ 205 /* Make sure the write happens before clearing the bit */
169 smp_wmb();
170
171 mod_code_write = 0;
172
173 /* make sure NMIs see the cleared bit */
174 smp_mb(); 206 smp_mb();
175 207
208 clear_mod_flag();
176 wait_for_nmi(); 209 wait_for_nmi();
177 210
178 return mod_code_status; 211 return mod_code_status;
@@ -368,100 +401,8 @@ int ftrace_disable_ftrace_graph_caller(void)
368 return ftrace_mod_jmp(ip, old_offset, new_offset); 401 return ftrace_mod_jmp(ip, old_offset, new_offset);
369} 402}
370 403
371#else /* CONFIG_DYNAMIC_FTRACE */
372
373/*
374 * These functions are picked from those used on
375 * this page for dynamic ftrace. They have been
376 * simplified to ignore all traces in NMI context.
377 */
378static atomic_t in_nmi;
379
380void ftrace_nmi_enter(void)
381{
382 atomic_inc(&in_nmi);
383}
384
385void ftrace_nmi_exit(void)
386{
387 atomic_dec(&in_nmi);
388}
389
390#endif /* !CONFIG_DYNAMIC_FTRACE */ 404#endif /* !CONFIG_DYNAMIC_FTRACE */
391 405
392/* Add a function return address to the trace stack on thread info.*/
393static int push_return_trace(unsigned long ret, unsigned long long time,
394 unsigned long func, int *depth)
395{
396 int index;
397
398 if (!current->ret_stack)
399 return -EBUSY;
400
401 /* The return trace stack is full */
402 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
403 atomic_inc(&current->trace_overrun);
404 return -EBUSY;
405 }
406
407 index = ++current->curr_ret_stack;
408 barrier();
409 current->ret_stack[index].ret = ret;
410 current->ret_stack[index].func = func;
411 current->ret_stack[index].calltime = time;
412 *depth = index;
413
414 return 0;
415}
416
417/* Retrieve a function return address to the trace stack on thread info.*/
418static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
419{
420 int index;
421
422 index = current->curr_ret_stack;
423
424 if (unlikely(index < 0)) {
425 ftrace_graph_stop();
426 WARN_ON(1);
427 /* Might as well panic, otherwise we have no where to go */
428 *ret = (unsigned long)panic;
429 return;
430 }
431
432 *ret = current->ret_stack[index].ret;
433 trace->func = current->ret_stack[index].func;
434 trace->calltime = current->ret_stack[index].calltime;
435 trace->overrun = atomic_read(&current->trace_overrun);
436 trace->depth = index;
437 barrier();
438 current->curr_ret_stack--;
439
440}
441
442/*
443 * Send the trace to the ring-buffer.
444 * @return the original return address.
445 */
446unsigned long ftrace_return_to_handler(void)
447{
448 struct ftrace_graph_ret trace;
449 unsigned long ret;
450
451 pop_return_trace(&trace, &ret);
452 trace.rettime = cpu_clock(raw_smp_processor_id());
453 ftrace_graph_return(&trace);
454
455 if (unlikely(!ret)) {
456 ftrace_graph_stop();
457 WARN_ON(1);
458 /* Might as well panic. What else to do? */
459 ret = (unsigned long)panic;
460 }
461
462 return ret;
463}
464
465/* 406/*
466 * Hook the return address and push it in the stack of return addrs 407 * Hook the return address and push it in the stack of return addrs
467 * in current thread info. 408 * in current thread info.
@@ -469,14 +410,13 @@ unsigned long ftrace_return_to_handler(void)
469void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) 410void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
470{ 411{
471 unsigned long old; 412 unsigned long old;
472 unsigned long long calltime;
473 int faulted; 413 int faulted;
474 struct ftrace_graph_ent trace; 414 struct ftrace_graph_ent trace;
475 unsigned long return_hooker = (unsigned long) 415 unsigned long return_hooker = (unsigned long)
476 &return_to_handler; 416 &return_to_handler;
477 417
478 /* NMIs are currently unsupported */ 418 /* NMIs are currently unsupported */
479 if (unlikely(atomic_read(&in_nmi))) 419 if (unlikely(in_nmi()))
480 return; 420 return;
481 421
482 if (unlikely(atomic_read(&current->tracing_graph_pause))) 422 if (unlikely(atomic_read(&current->tracing_graph_pause)))
@@ -512,17 +452,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
512 return; 452 return;
513 } 453 }
514 454
515 if (unlikely(!__kernel_text_address(old))) { 455 if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) {
516 ftrace_graph_stop();
517 *parent = old;
518 WARN_ON(1);
519 return;
520 }
521
522 calltime = cpu_clock(raw_smp_processor_id());
523
524 if (push_return_trace(old, calltime,
525 self_addr, &trace.depth) == -EBUSY) {
526 *parent = old; 456 *parent = old;
527 return; 457 return;
528 } 458 }
@@ -536,3 +466,66 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
536 } 466 }
537} 467}
538#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 468#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
469
470#ifdef CONFIG_FTRACE_SYSCALLS
471
472extern unsigned long __start_syscalls_metadata[];
473extern unsigned long __stop_syscalls_metadata[];
474extern unsigned long *sys_call_table;
475
476static struct syscall_metadata **syscalls_metadata;
477
478static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
479{
480 struct syscall_metadata *start;
481 struct syscall_metadata *stop;
482 char str[KSYM_SYMBOL_LEN];
483
484
485 start = (struct syscall_metadata *)__start_syscalls_metadata;
486 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
487 kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
488
489 for ( ; start < stop; start++) {
490 if (start->name && !strcmp(start->name, str))
491 return start;
492 }
493 return NULL;
494}
495
496struct syscall_metadata *syscall_nr_to_meta(int nr)
497{
498 if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0)
499 return NULL;
500
501 return syscalls_metadata[nr];
502}
503
504void arch_init_ftrace_syscalls(void)
505{
506 int i;
507 struct syscall_metadata *meta;
508 unsigned long **psys_syscall_table = &sys_call_table;
509 static atomic_t refs;
510
511 if (atomic_inc_return(&refs) != 1)
512 goto end;
513
514 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
515 FTRACE_SYSCALL_MAX, GFP_KERNEL);
516 if (!syscalls_metadata) {
517 WARN_ON(1);
518 return;
519 }
520
521 for (i = 0; i < FTRACE_SYSCALL_MAX; i++) {
522 meta = find_syscall_meta(psys_syscall_table[i]);
523 syscalls_metadata[i] = meta;
524 }
525 return;
526
527 /* Paranoid: avoid overflow */
528end:
529 atomic_dec(&refs);
530}
531#endif
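arch_init_ftrace_syscalls() guards its one-time table allocation with an atomic reference bump: only the caller that sees the count go 0 -> 1 builds the table, and every later caller backs out through the end: label, decrementing so the counter cannot creep toward overflow. The same idiom in portable C, as a runnable sketch:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int refs;
    static int table_built;

    static void init_once(void)
    {
        if (atomic_fetch_add(&refs, 1) + 1 != 1) {
            /* paranoid: undo our bump so the counter cannot overflow */
            atomic_fetch_sub(&refs, 1);
            return;
        }
        table_built = 1;    /* stand-in for allocating syscalls_metadata */
    }

    int main(void)
    {
        init_once();
        init_once();        /* no-op: someone already initialized */
        printf("built=%d refs=%d\n", table_built, atomic_load(&refs));
        return 0;
    }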
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
deleted file mode 100644
index 2bced78b0b8e..000000000000
--- a/arch/x86/kernel/genapic_64.c
+++ /dev/null
@@ -1,82 +0,0 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Generic APIC sub-arch probe layer.
6 *
7 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
8 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
9 * James Cleverdon.
10 */
11#include <linux/threads.h>
12#include <linux/cpumask.h>
13#include <linux/string.h>
14#include <linux/module.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h>
17#include <linux/init.h>
18#include <linux/hardirq.h>
19#include <linux/dmar.h>
20
21#include <asm/smp.h>
22#include <asm/ipi.h>
23#include <asm/genapic.h>
24#include <asm/setup.h>
25
26extern struct genapic apic_flat;
27extern struct genapic apic_physflat;
28extern struct genapic apic_x2apic_uv_x;
29extern struct genapic apic_x2apic_phys;
30extern struct genapic apic_x2apic_cluster;
31
32struct genapic __read_mostly *genapic = &apic_flat;
33
34static struct genapic *apic_probe[] __initdata = {
35 &apic_x2apic_uv_x,
36 &apic_x2apic_phys,
37 &apic_x2apic_cluster,
38 &apic_physflat,
39 NULL,
40};
41
42/*
43 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
44 */
45void __init setup_apic_routing(void)
46{
47 if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) {
48 if (!intr_remapping_enabled)
49 genapic = &apic_flat;
50 }
51
52 if (genapic == &apic_flat) {
53 if (max_physical_apicid >= 8)
54 genapic = &apic_physflat;
55 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
56 }
57
58 if (x86_quirks->update_genapic)
59 x86_quirks->update_genapic();
60}
61
62/* Same for both flat and physical. */
63
64void apic_send_IPI_self(int vector)
65{
66 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
67}
68
69int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
70{
71 int i;
72
73 for (i = 0; apic_probe[i]; ++i) {
74 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
75 genapic = apic_probe[i];
76 printk(KERN_INFO "Setting APIC routing to %s.\n",
77 genapic->name);
78 return 1;
79 }
80 }
81 return 0;
82}
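The deleted file above is the 64-bit APIC probe layer that this series relocates under arch/x86/kernel/apic/ (see the rename list in the diffstat): a NULL-terminated array of candidate drivers is walked and the first one whose acpi_madt_oem_check() accepts the platform wins. The pattern, reduced to a runnable C sketch with invented driver names:

    #include <stdio.h>

    struct driver {
        const char *name;
        int (*probe)(void);
    };

    static int never_matches(void) { return 0; }
    static int matches(void)       { return 1; }

    static struct driver *active;

    static struct driver probe_table[] = {
        { "uv-like",   never_matches },
        { "phys-like", matches       },
        { NULL, NULL }                  /* NULL-terminated, like apic_probe[] */
    };

    int main(void)
    {
        for (struct driver *d = probe_table; d->name; d++) {
            if (d->probe()) {
                active = d;
                printf("Setting routing to %s.\n", d->name);
                break;
            }
        }
        return active ? 0 : 1;
    }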
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
deleted file mode 100644
index 6ce497cc372d..000000000000
--- a/arch/x86/kernel/genx2apic_cluster.c
+++ /dev/null
@@ -1,198 +0,0 @@
1#include <linux/threads.h>
2#include <linux/cpumask.h>
3#include <linux/string.h>
4#include <linux/kernel.h>
5#include <linux/ctype.h>
6#include <linux/init.h>
7#include <linux/dmar.h>
8
9#include <asm/smp.h>
10#include <asm/ipi.h>
11#include <asm/genapic.h>
12
13DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{
17 if (cpu_has_x2apic)
18 return 1;
19
20 return 0;
21}
22
23/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
24
25static const struct cpumask *x2apic_target_cpus(void)
26{
27 return cpumask_of(0);
28}
29
30/*
31 * for now each logical cpu is in its own vector allocation domain.
32 */
33static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
34{
35 cpumask_clear(retmask);
36 cpumask_set_cpu(cpu, retmask);
37}
38
39static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
40 unsigned int dest)
41{
42 unsigned long cfg;
43
44 cfg = __prepare_ICR(0, vector, dest);
45
46 /*
47 * send the IPI.
48 */
49 x2apic_icr_write(cfg, apicid);
50}
51
52/*
53 * for now, we send the IPI's one by one in the cpumask.
54 * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
55 * at once. We have 16 cpu's in a cluster. This will minimize IPI register
56 * writes.
57 */
58static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
59{
60 unsigned long flags;
61 unsigned long query_cpu;
62
63 local_irq_save(flags);
64 for_each_cpu(query_cpu, mask)
65 __x2apic_send_IPI_dest(
66 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
67 vector, APIC_DEST_LOGICAL);
68 local_irq_restore(flags);
69}
70
71static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
72 int vector)
73{
74 unsigned long flags;
75 unsigned long query_cpu;
76 unsigned long this_cpu = smp_processor_id();
77
78 local_irq_save(flags);
79 for_each_cpu(query_cpu, mask)
80 if (query_cpu != this_cpu)
81 __x2apic_send_IPI_dest(
82 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
83 vector, APIC_DEST_LOGICAL);
84 local_irq_restore(flags);
85}
86
87static void x2apic_send_IPI_allbutself(int vector)
88{
89 unsigned long flags;
90 unsigned long query_cpu;
91 unsigned long this_cpu = smp_processor_id();
92
93 local_irq_save(flags);
94 for_each_online_cpu(query_cpu)
95 if (query_cpu != this_cpu)
96 __x2apic_send_IPI_dest(
97 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
98 vector, APIC_DEST_LOGICAL);
99 local_irq_restore(flags);
100}
101
102static void x2apic_send_IPI_all(int vector)
103{
104 x2apic_send_IPI_mask(cpu_online_mask, vector);
105}
106
107static int x2apic_apic_id_registered(void)
108{
109 return 1;
110}
111
112static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
113{
114 int cpu;
115
116 /*
117 * We're using fixed IRQ delivery, can only return one logical APIC ID.
118 * May as well be the first.
119 */
120 cpu = cpumask_first(cpumask);
121 if ((unsigned)cpu < nr_cpu_ids)
122 return per_cpu(x86_cpu_to_logical_apicid, cpu);
123 else
124 return BAD_APICID;
125}
126
127static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
128 const struct cpumask *andmask)
129{
130 int cpu;
131
132 /*
133 * We're using fixed IRQ delivery, can only return one logical APIC ID.
134 * May as well be the first.
135 */
136 for_each_cpu_and(cpu, cpumask, andmask)
137 if (cpumask_test_cpu(cpu, cpu_online_mask))
138 break;
139 if (cpu < nr_cpu_ids)
140 return per_cpu(x86_cpu_to_logical_apicid, cpu);
141 return BAD_APICID;
142}
143
144static unsigned int get_apic_id(unsigned long x)
145{
146 unsigned int id;
147
148 id = x;
149 return id;
150}
151
152static unsigned long set_apic_id(unsigned int id)
153{
154 unsigned long x;
155
156 x = id;
157 return x;
158}
159
160static unsigned int phys_pkg_id(int index_msb)
161{
162 return current_cpu_data.initial_apicid >> index_msb;
163}
164
165static void x2apic_send_IPI_self(int vector)
166{
167 apic_write(APIC_SELF_IPI, vector);
168}
169
170static void init_x2apic_ldr(void)
171{
172 int cpu = smp_processor_id();
173
174 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
175 return;
176}
177
178struct genapic apic_x2apic_cluster = {
179 .name = "cluster x2apic",
180 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
181 .int_delivery_mode = dest_LowestPrio,
182 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
183 .target_cpus = x2apic_target_cpus,
184 .vector_allocation_domain = x2apic_vector_allocation_domain,
185 .apic_id_registered = x2apic_apic_id_registered,
186 .init_apic_ldr = init_x2apic_ldr,
187 .send_IPI_all = x2apic_send_IPI_all,
188 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
189 .send_IPI_mask = x2apic_send_IPI_mask,
190 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
191 .send_IPI_self = x2apic_send_IPI_self,
192 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
193 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
194 .phys_pkg_id = phys_pkg_id,
195 .get_apic_id = get_apic_id,
196 .set_apic_id = set_apic_id,
197 .apic_id_mask = (0xFFFFFFFFu),
198};
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index ac108d1fe182..3f8579f8d42c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -18,7 +18,7 @@ void __init i386_start_kernel(void)
18{ 18{
19 reserve_trampoline_memory(); 19 reserve_trampoline_memory();
20 20
21 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 21 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
22 22
23#ifdef CONFIG_BLK_DEV_INITRD 23#ifdef CONFIG_BLK_DEV_INITRD
24 /* Reserve INITRD */ 24 /* Reserve INITRD */
@@ -29,9 +29,6 @@ void __init i386_start_kernel(void)
29 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 29 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
30 } 30 }
31#endif 31#endif
32 reserve_early(init_pg_tables_start, init_pg_tables_end,
33 "INIT_PG_TABLE");
34
35 reserve_ebda_region(); 32 reserve_ebda_region();
36 33
37 /* 34 /*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b9a4d8c4b935..70eaa852c732 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,27 +26,6 @@
26#include <asm/bios_ebda.h> 26#include <asm/bios_ebda.h>
27#include <asm/trampoline.h> 27#include <asm/trampoline.h>
28 28
29/* boot cpu pda */
30static struct x8664_pda _boot_cpu_pda;
31
32#ifdef CONFIG_SMP
33/*
34 * We install an empty cpu_pda pointer table to indicate to early users
35 * (numa_set_node) that the cpu_pda pointer table for cpus other than
36 * the boot cpu is not yet setup.
37 */
38static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
39#else
40static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
41#endif
42
43void __init x86_64_init_pda(void)
44{
45 _cpu_pda = __cpu_pda;
46 cpu_pda(0) = &_boot_cpu_pda;
47 pda_init(0);
48}
49
50static void __init zap_identity_mappings(void) 29static void __init zap_identity_mappings(void)
51{ 30{
52 pgd_t *pgd = pgd_offset_k(0UL); 31 pgd_t *pgd = pgd_offset_k(0UL);
@@ -112,8 +91,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
112 if (console_loglevel == 10) 91 if (console_loglevel == 10)
113 early_printk("Kernel alive\n"); 92 early_printk("Kernel alive\n");
114 93
115 x86_64_init_pda();
116
117 x86_64_start_reservations(real_mode_data); 94 x86_64_start_reservations(real_mode_data);
118} 95}
119 96
@@ -123,7 +100,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
123 100
124 reserve_trampoline_memory(); 101 reserve_trampoline_memory();
125 102
126 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 103 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
127 104
128#ifdef CONFIG_BLK_DEV_INITRD 105#ifdef CONFIG_BLK_DEV_INITRD
129 /* Reserve INITRD */ 106 /* Reserve INITRD */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index e835b4eea70b..30683883e0cd 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -11,14 +11,15 @@
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/linkage.h> 12#include <linux/linkage.h>
13#include <asm/segment.h> 13#include <asm/segment.h>
14#include <asm/page.h> 14#include <asm/page_types.h>
15#include <asm/pgtable.h> 15#include <asm/pgtable_types.h>
16#include <asm/desc.h> 16#include <asm/desc.h>
17#include <asm/cache.h> 17#include <asm/cache.h>
18#include <asm/thread_info.h> 18#include <asm/thread_info.h>
19#include <asm/asm-offsets.h> 19#include <asm/asm-offsets.h>
20#include <asm/setup.h> 20#include <asm/setup.h>
21#include <asm/processor-flags.h> 21#include <asm/processor-flags.h>
22#include <asm/percpu.h>
22 23
23/* Physical address */ 24/* Physical address */
24#define pa(X) ((X) - __PAGE_OFFSET) 25#define pa(X) ((X) - __PAGE_OFFSET)
@@ -37,42 +38,40 @@
37#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id 38#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
38 39
39/* 40/*
40 * This is how much memory *in addition to the memory covered up to 41 * This is how much memory in addition to the memory covered up to
41 * and including _end* we need mapped initially. 42 * and including _end we need mapped initially.
42 * We need: 43 * We need:
43 * - one bit for each possible page, but only in low memory, which means 44 * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
44 * 2^32/4096/8 = 128K worst case (4G/4G split.) 45 * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
45 * - enough space to map all low memory, which means
46 * (2^32/4096) / 1024 pages (worst case, non PAE)
47 * (2^32/4096) / 512 + 4 pages (worst case for PAE)
48 * - a few pages for allocator use before the kernel pagetable has
49 * been set up
50 * 46 *
51 * Modulo rounding, each megabyte assigned here requires a kilobyte of 47 * Modulo rounding, each megabyte assigned here requires a kilobyte of
52 * memory, which is currently unreclaimed. 48 * memory, which is currently unreclaimed.
53 * 49 *
54 * This should be a multiple of a page. 50 * This should be a multiple of a page.
51 *
52 * KERNEL_IMAGE_SIZE should be greater than pa(_end)
53 * and smaller than max_low_pfn, otherwise we will waste some page table entries
55 */ 54 */
56LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
57
58/*
59 * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
60 * pagetables from above the 16MB DMA limit, so we'll have to set
61 * up pagetables 16MB more (worst-case):
62 */
63#ifdef CONFIG_DEBUG_PAGEALLOC
64LOW_PAGES = LOW_PAGES + 0x1000000
65#endif
66 55
67#if PTRS_PER_PMD > 1 56#if PTRS_PER_PMD > 1
68PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD 57#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
69#else 58#else
70PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) 59#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
71#endif 60#endif
72BOOTBITMAP_SIZE = LOW_PAGES / 8
73ALLOCATOR_SLOP = 4
74 61
75INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm 62/* Enough space to fit pagetables for the low memory linear map */
63MAPPING_BEYOND_END = \
64 PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
65
66/*
67 * Worst-case size of the kernel mapping we need to make:
68 * the worst-case size of the kernel itself, plus the extra we need
69 * to map for the linear map.
70 */
71KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT
72
73INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
74RESERVE_BRK(pagetables, INIT_MAP_SIZE)
76 75
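The rewritten comment replaces the old "map all of low memory plus boot-bitmap slop" arithmetic with sizes derived from KERNEL_IMAGE_SIZE, and its page-table cost formulas are easy to sanity-check. A quick calculation matching the two worst cases the comment quotes (mapping a full 4 GiB here, purely for illustration):

    #include <stdio.h>

    /* pages of page tables needed to map `pages` 4 KiB pages:
     * non-PAE: 1024 PTEs per table page; PAE: 512 PTEs plus 4 PMD pages */
    static unsigned long long pt_pages(unsigned long long pages, int pae)
    {
        return pae ? pages / 512 + 4 : pages / 1024;
    }

    int main(void)
    {
        unsigned long long pages = (1ull << 32) / 4096;   /* map 4 GiB */
        printf("non-PAE: %llu table pages (%llu KiB)\n",
               pt_pages(pages, 0), pt_pages(pages, 0) * 4);
        printf("PAE:     %llu table pages (%llu KiB)\n",
               pt_pages(pages, 1), pt_pages(pages, 1) * 4);
        return 0;
    }

This also confirms the retained "each megabyte assigned here requires a kilobyte" rule of thumb: 256 PTEs per MiB is a quarter of a non-PAE table page, i.e. one kilobyte, modulo rounding.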
77/* 76/*
78 * 32-bit kernel entrypoint; only used by the boot CPU. On entry, 77 * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
@@ -165,10 +164,10 @@ num_subarch_entries = (. - subarch_entries) / 4
165 164
166/* 165/*
167 * Initialize page tables. This creates a PDE and a set of page 166 * Initialize page tables. This creates a PDE and a set of page
168 * tables, which are located immediately beyond _end. The variable 167 * tables, which are located immediately beyond __brk_base. The variable
169 * init_pg_tables_end is set up to point to the first "safe" location. 168 * _brk_end is set up to point to the first "safe" location.
170 * Mappings are created both at virtual address 0 (identity mapping) 169 * Mappings are created both at virtual address 0 (identity mapping)
171 * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. 170 * and PAGE_OFFSET for up to _end.
172 * 171 *
173 * Note that the stack is not yet set up! 172 * Note that the stack is not yet set up!
174 */ 173 */
@@ -189,8 +188,7 @@ default_entry:
189 188
190 xorl %ebx,%ebx /* %ebx is kept at zero */ 189 xorl %ebx,%ebx /* %ebx is kept at zero */
191 190
192 movl $pa(pg0), %edi 191 movl $pa(__brk_base), %edi
193 movl %edi, pa(init_pg_tables_start)
194 movl $pa(swapper_pg_pmd), %edx 192 movl $pa(swapper_pg_pmd), %edx
195 movl $PTE_IDENT_ATTR, %eax 193 movl $PTE_IDENT_ATTR, %eax
19610: 19410:
@@ -208,14 +206,14 @@ default_entry:
208 loop 11b 206 loop 11b
209 207
210 /* 208 /*
211 * End condition: we must map up to and including INIT_MAP_BEYOND_END 209 * End condition: we must map up to the end + MAPPING_BEYOND_END.
212 * bytes beyond the end of our own page tables.
213 */ 210 */
214 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp 211 movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
215 cmpl %ebp,%eax 212 cmpl %ebp,%eax
216 jb 10b 213 jb 10b
2171: 2141:
218 movl %edi,pa(init_pg_tables_end) 215 addl $__PAGE_OFFSET, %edi
216 movl %edi, pa(_brk_end)
219 shrl $12, %eax 217 shrl $12, %eax
220 movl %eax, pa(max_pfn_mapped) 218 movl %eax, pa(max_pfn_mapped)
221 219
@@ -226,8 +224,7 @@ default_entry:
226 224
227page_pde_offset = (__PAGE_OFFSET >> 20); 225page_pde_offset = (__PAGE_OFFSET >> 20);
228 226
229 movl $pa(pg0), %edi 227 movl $pa(__brk_base), %edi
230 movl %edi, pa(init_pg_tables_start)
231 movl $pa(swapper_pg_dir), %edx 228 movl $pa(swapper_pg_dir), %edx
232 movl $PTE_IDENT_ATTR, %eax 229 movl $PTE_IDENT_ATTR, %eax
23310: 23010:
@@ -241,14 +238,13 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
241 addl $0x1000,%eax 238 addl $0x1000,%eax
242 loop 11b 239 loop 11b
243 /* 240 /*
244 * End condition: we must map up to and including INIT_MAP_BEYOND_END 241 * End condition: we must map up to the end + MAPPING_BEYOND_END.
245 * bytes beyond the end of our own page tables; the +0x007 is
246 * the attribute bits
247 */ 242 */
248 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp 243 movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
249 cmpl %ebp,%eax 244 cmpl %ebp,%eax
250 jb 10b 245 jb 10b
251 movl %edi,pa(init_pg_tables_end) 246 addl $__PAGE_OFFSET, %edi
247 movl %edi, pa(_brk_end)
252 shrl $12, %eax 248 shrl $12, %eax
253 movl %eax, pa(max_pfn_mapped) 249 movl %eax, pa(max_pfn_mapped)
254 250
@@ -429,14 +425,34 @@ is386: movl $2,%ecx # set MP
429 ljmp $(__KERNEL_CS),$1f 425 ljmp $(__KERNEL_CS),$1f
4301: movl $(__KERNEL_DS),%eax # reload all the segment registers 4261: movl $(__KERNEL_DS),%eax # reload all the segment registers
431 movl %eax,%ss # after changing gdt. 427 movl %eax,%ss # after changing gdt.
432 movl %eax,%fs # gets reset once there's real percpu
433 428
434 movl $(__USER_DS),%eax # DS/ES contains default USER segment 429 movl $(__USER_DS),%eax # DS/ES contains default USER segment
435 movl %eax,%ds 430 movl %eax,%ds
436 movl %eax,%es 431 movl %eax,%es
437 432
438 xorl %eax,%eax # Clear GS and LDT 433 movl $(__KERNEL_PERCPU), %eax
434 movl %eax,%fs # set this cpu's percpu
435
436#ifdef CONFIG_CC_STACKPROTECTOR
437 /*
438 * The linker can't handle this by relocation. Manually set
439 * base address in stack canary segment descriptor.
440 */
441 cmpb $0,ready
442 jne 1f
443 movl $per_cpu__gdt_page,%eax
444 movl $per_cpu__stack_canary,%ecx
445 subl $20, %ecx
446 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
447 shrl $16, %ecx
448 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
449 movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
4501:
451#endif
452 movl $(__KERNEL_STACK_CANARY),%eax
439 movl %eax,%gs 453 movl %eax,%gs
454
455 xorl %eax,%eax # Clear LDT
440 lldt %ax 456 lldt %ax
441 457
442 cld # gcc2 wants the direction flag cleared at all times 458 cld # gcc2 wants the direction flag cleared at all times
@@ -446,8 +462,6 @@ is386: movl $2,%ecx # set MP
446 movb $1, ready 462 movb $1, ready
447 cmpb $0,%cl # the first CPU calls start_kernel 463 cmpb $0,%cl # the first CPU calls start_kernel
448 je 1f 464 je 1f
449 movl $(__KERNEL_PERCPU), %eax
450 movl %eax,%fs # set this cpu's percpu
451 movl (stack_start), %esp 465 movl (stack_start), %esp
4521: 4661:
453#endif /* CONFIG_SMP */ 467#endif /* CONFIG_SMP */
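
The CONFIG_CC_STACKPROTECTOR block added above patches a segment descriptor by hand: bytes 2-3 of a GDT entry hold base bits 0-15, byte 4 holds bits 16-23, and byte 7 holds bits 24-31. The "subl $20, %ecx" is there because gcc's i386 stack protector fetches the canary from %gs:20, so the segment base must sit 20 bytes below the canary variable. A standalone C sketch of the same packing (names are ours, not the kernel's):

    #include <stdint.h>
    #include <stdio.h>

    /* store a 32-bit base into an 8-byte descriptor, as the
     * movw/movb sequence above does */
    static void set_desc_base(uint8_t desc[8], uint32_t base)
    {
            desc[2] = base & 0xff;
            desc[3] = (base >> 8) & 0xff;
            desc[4] = (base >> 16) & 0xff;
            desc[7] = (base >> 24) & 0xff;
    }

    int main(void)
    {
            uint8_t desc[8] = {0};
            uint32_t canary = 0xc1234568;  /* sample address */

            set_desc_base(desc, canary - 20);  /* canary lands at %gs:20 */
            printf("base bytes: %02x %02x %02x %02x\n",
                   desc[7], desc[4], desc[3], desc[2]);
            return 0;
    }
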
@@ -548,12 +562,8 @@ early_fault:
548 pushl %eax 562 pushl %eax
549 pushl %edx /* trapno */ 563 pushl %edx /* trapno */
550 pushl $fault_msg 564 pushl $fault_msg
551#ifdef CONFIG_EARLY_PRINTK
552 call early_printk
553#else
554 call printk 565 call printk
555#endif 566#endif
556#endif
557 call dump_stack 567 call dump_stack
558hlt_loop: 568hlt_loop:
559 hlt 569 hlt
@@ -580,11 +590,10 @@ ignore_int:
580 pushl 32(%esp) 590 pushl 32(%esp)
581 pushl 40(%esp) 591 pushl 40(%esp)
582 pushl $int_msg 592 pushl $int_msg
583#ifdef CONFIG_EARLY_PRINTK
584 call early_printk
585#else
586 call printk 593 call printk
587#endif 594
595 call dump_stack
596
588 addl $(5*4),%esp 597 addl $(5*4),%esp
589 popl %ds 598 popl %ds
590 popl %es 599 popl %es
@@ -622,6 +631,7 @@ swapper_pg_fixmap:
622 .fill 1024,4,0 631 .fill 1024,4,0
623ENTRY(empty_zero_page) 632ENTRY(empty_zero_page)
624 .fill 4096,1,0 633 .fill 4096,1,0
634
625/* 635/*
626 * This starts the data section. 636 * This starts the data section.
627 */ 637 */
@@ -660,7 +670,7 @@ early_recursion_flag:
660 .long 0 670 .long 0
661 671
662int_msg: 672int_msg:
663 .asciz "Unknown interrupt or fault at EIP %p %p %p\n" 673 .asciz "Unknown interrupt or fault at: %p %p %p\n"
664 674
665fault_msg: 675fault_msg:
666/* fault info: */ 676/* fault info: */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 0e275d495563..54b29bb24e71 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,6 +19,7 @@
19#include <asm/msr.h> 19#include <asm/msr.h>
20#include <asm/cache.h> 20#include <asm/cache.h>
21#include <asm/processor-flags.h> 21#include <asm/processor-flags.h>
22#include <asm/percpu.h>
22 23
23#ifdef CONFIG_PARAVIRT 24#ifdef CONFIG_PARAVIRT
24#include <asm/asm-offsets.h> 25#include <asm/asm-offsets.h>
@@ -226,12 +227,15 @@ ENTRY(secondary_startup_64)
226 movl %eax,%fs 227 movl %eax,%fs
227 movl %eax,%gs 228 movl %eax,%gs
228 229
229 /* 230 /* Set up %gs.
230 * Setup up a dummy PDA. this is just for some early bootup code 231 *
231 * that does in_interrupt() 232 * The base of %gs always points to the bottom of the irqstack
232 */ 233 * union. If the stack protector canary is enabled, it is
234 * located at %gs:40. Note that, on SMP, the boot cpu uses
235 * init data section till per cpu areas are set up.
236 */
233 movl $MSR_GS_BASE,%ecx 237 movl $MSR_GS_BASE,%ecx
234 movq $empty_zero_page,%rax 238 movq initial_gs(%rip),%rax
235 movq %rax,%rdx 239 movq %rax,%rdx
236 shrq $32,%rdx 240 shrq $32,%rdx
237 wrmsr 241 wrmsr
@@ -257,6 +261,8 @@ ENTRY(secondary_startup_64)
257 .align 8 261 .align 8
258 ENTRY(initial_code) 262 ENTRY(initial_code)
259 .quad x86_64_start_kernel 263 .quad x86_64_start_kernel
264 ENTRY(initial_gs)
265 .quad INIT_PER_CPU_VAR(irq_stack_union)
260 __FINITDATA 266 __FINITDATA
261 267
262 ENTRY(stack_start) 268 ENTRY(stack_start)
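
The movq/shrq pair above exists because wrmsr takes its 64-bit payload split across %edx:%eax. A minimal sketch of that split (wrmsrl() is the kernel helper that hides it; the value below is a made-up kernel address):

    #include <stdint.h>
    #include <stdio.h>

    static void split_msr(uint64_t val, uint32_t *lo, uint32_t *hi)
    {
            *lo = (uint32_t)val;         /* goes in %eax */
            *hi = (uint32_t)(val >> 32); /* goes in %edx ("shrq $32,%rdx") */
    }

    int main(void)
    {
            uint32_t lo, hi;

            split_msr(0xffffffff80a1b2c3ull, &lo, &hi);
            printf("eax=%08x edx=%08x\n", lo, hi);
            return 0;
    }
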
@@ -323,8 +329,6 @@ early_idt_ripmsg:
323#endif /* CONFIG_EARLY_PRINTK */ 329#endif /* CONFIG_EARLY_PRINTK */
324 .previous 330 .previous
325 331
326.balign PAGE_SIZE
327
328#define NEXT_PAGE(name) \ 332#define NEXT_PAGE(name) \
329 .balign PAGE_SIZE; \ 333 .balign PAGE_SIZE; \
330ENTRY(name) 334ENTRY(name)
@@ -401,7 +405,8 @@ NEXT_PAGE(level2_spare_pgt)
401 .globl early_gdt_descr 405 .globl early_gdt_descr
402early_gdt_descr: 406early_gdt_descr:
403 .word GDT_ENTRIES*8-1 407 .word GDT_ENTRIES*8-1
404 .quad per_cpu__gdt_page 408early_gdt_descr_base:
409 .quad INIT_PER_CPU_VAR(gdt_page)
405 410
406ENTRY(phys_base) 411ENTRY(phys_base)
407 /* This must match the first entry in level2_kernel_pgt */ 412 /* This must match the first entry in level2_kernel_pgt */
@@ -412,7 +417,7 @@ ENTRY(phys_base)
412 .section .bss, "aw", @nobits 417 .section .bss, "aw", @nobits
413 .align L1_CACHE_BYTES 418 .align L1_CACHE_BYTES
414ENTRY(idt_table) 419ENTRY(idt_table)
415 .skip 256 * 16 420 .skip IDT_ENTRIES * 16
416 421
417 .section .bss.page_aligned, "aw", @nobits 422 .section .bss.page_aligned, "aw", @nobits
418 .align PAGE_SIZE 423 .align PAGE_SIZE
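
The IDT_ENTRIES * 16 sizing reflects the long-mode gate format: each 64-bit IDT entry is 16 bytes, with the handler offset scattered across three fields. A C11 sketch of the layout (field names are ours; the layout itself is from the architecture manuals):

    #include <stdint.h>
    #include <assert.h>

    struct idt_gate64 {
            uint16_t offset_low;   /* handler address bits 0-15 */
            uint16_t selector;     /* code segment selector */
            uint8_t  ist;          /* interrupt stack table index */
            uint8_t  type_attr;    /* gate type, DPL, present bit */
            uint16_t offset_mid;   /* address bits 16-31 */
            uint32_t offset_high;  /* address bits 32-63 */
            uint32_t reserved;
    };

    static_assert(sizeof(struct idt_gate64) == 16, "gates are 16 bytes");

    int main(void) { return 0; }
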
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 388254f69a2a..648b3a2a3a44 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -80,6 +80,7 @@ static inline void hpet_clear_mapping(void)
80 */ 80 */
81static int boot_hpet_disable; 81static int boot_hpet_disable;
82int hpet_force_user; 82int hpet_force_user;
83static int hpet_verbose;
83 84
84static int __init hpet_setup(char *str) 85static int __init hpet_setup(char *str)
85{ 86{
@@ -88,6 +89,8 @@ static int __init hpet_setup(char *str)
88 boot_hpet_disable = 1; 89 boot_hpet_disable = 1;
89 if (!strncmp("force", str, 5)) 90 if (!strncmp("force", str, 5))
90 hpet_force_user = 1; 91 hpet_force_user = 1;
92 if (!strncmp("verbose", str, 7))
93 hpet_verbose = 1;
91 } 94 }
92 return 1; 95 return 1;
93} 96}
@@ -119,6 +122,43 @@ int is_hpet_enabled(void)
119} 122}
120EXPORT_SYMBOL_GPL(is_hpet_enabled); 123EXPORT_SYMBOL_GPL(is_hpet_enabled);
121 124
125static void _hpet_print_config(const char *function, int line)
126{
127 u32 i, timers, l, h;
128 printk(KERN_INFO "hpet: %s(%d):\n", function, line);
129 l = hpet_readl(HPET_ID);
130 h = hpet_readl(HPET_PERIOD);
131 timers = ((l & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
132 printk(KERN_INFO "hpet: ID: 0x%x, PERIOD: 0x%x\n", l, h);
133 l = hpet_readl(HPET_CFG);
134 h = hpet_readl(HPET_STATUS);
135 printk(KERN_INFO "hpet: CFG: 0x%x, STATUS: 0x%x\n", l, h);
136 l = hpet_readl(HPET_COUNTER);
137 h = hpet_readl(HPET_COUNTER+4);
138 printk(KERN_INFO "hpet: COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h);
139
140 for (i = 0; i < timers; i++) {
141 l = hpet_readl(HPET_Tn_CFG(i));
142 h = hpet_readl(HPET_Tn_CFG(i)+4);
143 printk(KERN_INFO "hpet: T%d: CFG_l: 0x%x, CFG_h: 0x%x\n",
144 i, l, h);
145 l = hpet_readl(HPET_Tn_CMP(i));
146 h = hpet_readl(HPET_Tn_CMP(i)+4);
147 printk(KERN_INFO "hpet: T%d: CMP_l: 0x%x, CMP_h: 0x%x\n",
148 i, l, h);
149 l = hpet_readl(HPET_Tn_ROUTE(i));
150 h = hpet_readl(HPET_Tn_ROUTE(i)+4);
151 printk(KERN_INFO "hpet: T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n",
152 i, l, h);
153 }
154}
155
156#define hpet_print_config() \
157do { \
158 if (hpet_verbose) \
159 _hpet_print_config(__FUNCTION__, __LINE__); \
160} while (0)
161
122/* 162/*
123 * When the hpet driver (/dev/hpet) is enabled, we need to reserve 163 * When the hpet driver (/dev/hpet) is enabled, we need to reserve
124 * timer 0 and timer 1 in case of RTC emulation. 164 * timer 0 and timer 1 in case of RTC emulation.
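
The dump above is gated on the hpet=verbose boot option parsed further up. Two details of the register decoding: the timer count is stored as "N - 1" in bits 8-12 of HPET_ID, and 64-bit registers are printed as two halves because hpet_readl() is a 32-bit accessor. Standalone sketch with a made-up ID value:

    #include <stdint.h>
    #include <stdio.h>

    #define HPET_ID_NUMBER       0x00001f00u
    #define HPET_ID_NUMBER_SHIFT 8

    int main(void)
    {
            uint32_t id = 0x8086a201;  /* sample HPET_ID contents */
            uint32_t timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;

            /* COUNTER and COUNTER+4 reads, recombined */
            uint32_t lo = 0x12345678, hi = 0x00000009;
            uint64_t counter = ((uint64_t)hi << 32) | lo;

            printf("timers=%u counter=%#llx\n",
                   timers, (unsigned long long)counter);
            return 0;
    }
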
@@ -191,27 +231,37 @@ static struct clock_event_device hpet_clockevent = {
191 .rating = 50, 231 .rating = 50,
192}; 232};
193 233
194static void hpet_start_counter(void) 234static void hpet_stop_counter(void)
195{ 235{
196 unsigned long cfg = hpet_readl(HPET_CFG); 236 unsigned long cfg = hpet_readl(HPET_CFG);
197
198 cfg &= ~HPET_CFG_ENABLE; 237 cfg &= ~HPET_CFG_ENABLE;
199 hpet_writel(cfg, HPET_CFG); 238 hpet_writel(cfg, HPET_CFG);
200 hpet_writel(0, HPET_COUNTER); 239 hpet_writel(0, HPET_COUNTER);
201 hpet_writel(0, HPET_COUNTER + 4); 240 hpet_writel(0, HPET_COUNTER + 4);
241}
242
243static void hpet_start_counter(void)
244{
245 unsigned long cfg = hpet_readl(HPET_CFG);
202 cfg |= HPET_CFG_ENABLE; 246 cfg |= HPET_CFG_ENABLE;
203 hpet_writel(cfg, HPET_CFG); 247 hpet_writel(cfg, HPET_CFG);
204} 248}
205 249
250static void hpet_restart_counter(void)
251{
252 hpet_stop_counter();
253 hpet_start_counter();
254}
255
206static void hpet_resume_device(void) 256static void hpet_resume_device(void)
207{ 257{
208 force_hpet_resume(); 258 force_hpet_resume();
209} 259}
210 260
211static void hpet_restart_counter(void) 261static void hpet_resume_counter(void)
212{ 262{
213 hpet_resume_device(); 263 hpet_resume_device();
214 hpet_start_counter(); 264 hpet_restart_counter();
215} 265}
216 266
217static void hpet_enable_legacy_int(void) 267static void hpet_enable_legacy_int(void)
@@ -259,27 +309,23 @@ static int hpet_setup_msi_irq(unsigned int irq);
259static void hpet_set_mode(enum clock_event_mode mode, 309static void hpet_set_mode(enum clock_event_mode mode,
260 struct clock_event_device *evt, int timer) 310 struct clock_event_device *evt, int timer)
261{ 311{
262 unsigned long cfg, cmp, now; 312 unsigned long cfg;
263 uint64_t delta; 313 uint64_t delta;
264 314
265 switch (mode) { 315 switch (mode) {
266 case CLOCK_EVT_MODE_PERIODIC: 316 case CLOCK_EVT_MODE_PERIODIC:
317 hpet_stop_counter();
267 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; 318 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
268 delta >>= evt->shift; 319 delta >>= evt->shift;
269 now = hpet_readl(HPET_COUNTER);
270 cmp = now + (unsigned long) delta;
271 cfg = hpet_readl(HPET_Tn_CFG(timer)); 320 cfg = hpet_readl(HPET_Tn_CFG(timer));
321 /* Make sure we use edge triggered interrupts */
322 cfg &= ~HPET_TN_LEVEL;
272 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | 323 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
273 HPET_TN_SETVAL | HPET_TN_32BIT; 324 HPET_TN_SETVAL | HPET_TN_32BIT;
274 hpet_writel(cfg, HPET_Tn_CFG(timer)); 325 hpet_writel(cfg, HPET_Tn_CFG(timer));
275 /*
276 * The first write after writing TN_SETVAL to the
277 * config register sets the counter value, the second
278 * write sets the period.
279 */
280 hpet_writel(cmp, HPET_Tn_CMP(timer));
281 udelay(1);
282 hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); 326 hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
327 hpet_start_counter();
328 hpet_print_config();
283 break; 329 break;
284 330
285 case CLOCK_EVT_MODE_ONESHOT: 331 case CLOCK_EVT_MODE_ONESHOT:
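
The periodic setup above converts the tick length to HPET cycles with the standard clockevents mult/shift pair, then programs that delta into the comparator while the main counter is stopped. Worked example with hypothetical calibration values (roughly a 14.318 MHz HPET with shift 32):

    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000ull
    #define HZ 250

    int main(void)
    {
            uint32_t mult = 61496110;  /* sample: (cycles/ns) << shift */
            uint32_t shift = 32;

            /* same arithmetic as the CLOCK_EVT_MODE_PERIODIC case */
            uint64_t delta = (NSEC_PER_SEC / HZ) * mult >> shift;

            printf("comparator delta = %llu cycles per tick\n",
                   (unsigned long long)delta);  /* ~57272 for a 4 ms tick */
            return 0;
    }
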
@@ -306,6 +352,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
306 irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); 352 irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
307 enable_irq(hdev->irq); 353 enable_irq(hdev->irq);
308 } 354 }
355 hpet_print_config();
309 break; 356 break;
310 } 357 }
311} 358}
@@ -524,6 +571,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
524 571
525 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); 572 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
526 num_timers++; /* Value read out starts from 0 */ 573 num_timers++; /* Value read out starts from 0 */
574 hpet_print_config();
527 575
528 hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); 576 hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
529 if (!hpet_devs) 577 if (!hpet_devs)
@@ -693,7 +741,7 @@ static struct clocksource clocksource_hpet = {
693 .mask = HPET_MASK, 741 .mask = HPET_MASK,
694 .shift = HPET_SHIFT, 742 .shift = HPET_SHIFT,
695 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 743 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
696 .resume = hpet_restart_counter, 744 .resume = hpet_resume_counter,
697#ifdef CONFIG_X86_64 745#ifdef CONFIG_X86_64
698 .vread = vread_hpet, 746 .vread = vread_hpet,
699#endif 747#endif
@@ -705,7 +753,7 @@ static int hpet_clocksource_register(void)
705 cycle_t t1; 753 cycle_t t1;
706 754
707 /* Start the counter */ 755 /* Start the counter */
708 hpet_start_counter(); 756 hpet_restart_counter();
709 757
710 /* Verify whether hpet counter works */ 758 /* Verify whether hpet counter works */
711 t1 = read_hpet(); 759 t1 = read_hpet();
@@ -791,6 +839,7 @@ int __init hpet_enable(void)
791 * information and the number of channels 839 * information and the number of channels
792 */ 840 */
793 id = hpet_readl(HPET_ID); 841 id = hpet_readl(HPET_ID);
842 hpet_print_config();
794 843
795#ifdef CONFIG_HPET_EMULATE_RTC 844#ifdef CONFIG_HPET_EMULATE_RTC
796 /* 845 /*
@@ -843,6 +892,7 @@ static __init int hpet_late_init(void)
843 return -ENODEV; 892 return -ENODEV;
844 893
845 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 894 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
895 hpet_print_config();
846 896
847 for_each_online_cpu(cpu) { 897 for_each_online_cpu(cpu) {
848 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); 898 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index b0f61f0dcd0a..f2f8540a7f3d 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk)
136#ifdef CONFIG_X86_32 136#ifdef CONFIG_X86_32
137 if (!HAVE_HWFP) { 137 if (!HAVE_HWFP) {
138 memset(tsk->thread.xstate, 0, xstate_size); 138 memset(tsk->thread.xstate, 0, xstate_size);
139 finit(); 139 finit_task(tsk);
140 set_stopped_child_used_math(tsk); 140 set_stopped_child_used_math(tsk);
141 return 0; 141 return 0;
142 } 142 }
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 10f92fb532f3..3475440baa54 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -3,17 +3,17 @@
3 * 3 *
4 */ 4 */
5#include <linux/clockchips.h> 5#include <linux/clockchips.h>
6#include <linux/init.h>
7#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/spinlock.h>
8#include <linux/jiffies.h> 8#include <linux/jiffies.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/spinlock.h> 10#include <linux/delay.h>
11#include <linux/init.h>
12#include <linux/io.h>
11 13
12#include <asm/smp.h>
13#include <asm/delay.h>
14#include <asm/i8253.h> 14#include <asm/i8253.h>
15#include <asm/io.h>
16#include <asm/hpet.h> 15#include <asm/hpet.h>
16#include <asm/smp.h>
17 17
18DEFINE_SPINLOCK(i8253_lock); 18DEFINE_SPINLOCK(i8253_lock);
19EXPORT_SYMBOL(i8253_lock); 19EXPORT_SYMBOL(i8253_lock);
@@ -40,7 +40,7 @@ static void init_pit_timer(enum clock_event_mode mode,
40{ 40{
41 spin_lock(&i8253_lock); 41 spin_lock(&i8253_lock);
42 42
43 switch(mode) { 43 switch (mode) {
44 case CLOCK_EVT_MODE_PERIODIC: 44 case CLOCK_EVT_MODE_PERIODIC:
45 /* binary, mode 2, LSB/MSB, ch 0 */ 45 /* binary, mode 2, LSB/MSB, ch 0 */
46 outb_pit(0x34, PIT_MODE); 46 outb_pit(0x34, PIT_MODE);
@@ -95,7 +95,7 @@ static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
95 * registered. This mechanism replaces the previous #ifdef LOCAL_APIC - 95 * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
96 * !using_apic_timer decisions in do_timer_interrupt_hook() 96 * !using_apic_timer decisions in do_timer_interrupt_hook()
97 */ 97 */
98static struct clock_event_device pit_clockevent = { 98static struct clock_event_device pit_ce = {
99 .name = "pit", 99 .name = "pit",
100 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, 100 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
101 .set_mode = init_pit_timer, 101 .set_mode = init_pit_timer,
@@ -114,15 +114,13 @@ void __init setup_pit_timer(void)
114 * Start pit with the boot cpu mask and make it global after the 114 * Start pit with the boot cpu mask and make it global after the
115 * IO_APIC has been initialized. 115 * IO_APIC has been initialized.
116 */ 116 */
117 pit_clockevent.cpumask = cpumask_of(smp_processor_id()); 117 pit_ce.cpumask = cpumask_of(smp_processor_id());
118 pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 118 pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
119 pit_clockevent.shift); 119 pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
120 pit_clockevent.max_delta_ns = 120 pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
121 clockevent_delta2ns(0x7FFF, &pit_clockevent); 121
122 pit_clockevent.min_delta_ns = 122 clockevents_register_device(&pit_ce);
123 clockevent_delta2ns(0xF, &pit_clockevent); 123 global_clock_event = &pit_ce;
124 clockevents_register_device(&pit_clockevent);
125 global_clock_event = &pit_clockevent;
126} 124}
127 125
128#ifndef CONFIG_X86_64 126#ifndef CONFIG_X86_64
@@ -133,11 +131,11 @@ void __init setup_pit_timer(void)
133 */ 131 */
134static cycle_t pit_read(void) 132static cycle_t pit_read(void)
135{ 133{
134 static int old_count;
135 static u32 old_jifs;
136 unsigned long flags; 136 unsigned long flags;
137 int count; 137 int count;
138 u32 jifs; 138 u32 jifs;
139 static int old_count;
140 static u32 old_jifs;
141 139
142 spin_lock_irqsave(&i8253_lock, flags); 140 spin_lock_irqsave(&i8253_lock, flags);
143 /* 141 /*
@@ -179,9 +177,9 @@ static cycle_t pit_read(void)
179 * Previous attempts to handle these cases intelligently were 177 * Previous attempts to handle these cases intelligently were
180 * buggy, so we just do the simple thing now. 178 * buggy, so we just do the simple thing now.
181 */ 179 */
182 if (count > old_count && jifs == old_jifs) { 180 if (count > old_count && jifs == old_jifs)
183 count = old_count; 181 count = old_count;
184 } 182
185 old_count = count; 183 old_count = count;
186 old_jifs = jifs; 184 old_jifs = jifs;
187 185
@@ -192,13 +190,13 @@ static cycle_t pit_read(void)
192 return (cycle_t)(jifs * LATCH) + count; 190 return (cycle_t)(jifs * LATCH) + count;
193} 191}
194 192
195static struct clocksource clocksource_pit = { 193static struct clocksource pit_cs = {
196 .name = "pit", 194 .name = "pit",
197 .rating = 110, 195 .rating = 110,
198 .read = pit_read, 196 .read = pit_read,
199 .mask = CLOCKSOURCE_MASK(32), 197 .mask = CLOCKSOURCE_MASK(32),
200 .mult = 0, 198 .mult = 0,
201 .shift = 20, 199 .shift = 20,
202}; 200};
203 201
204static void pit_disable_clocksource(void) 202static void pit_disable_clocksource(void)
@@ -206,9 +204,9 @@ static void pit_disable_clocksource(void)
206 /* 204 /*
207 * Use mult to check whether it is registered or not 205 * Use mult to check whether it is registered or not
208 */ 206 */
209 if (clocksource_pit.mult) { 207 if (pit_cs.mult) {
210 clocksource_unregister(&clocksource_pit); 208 clocksource_unregister(&pit_cs);
211 clocksource_pit.mult = 0; 209 pit_cs.mult = 0;
212 } 210 }
213} 211}
214 212
@@ -222,13 +220,13 @@ static int __init init_pit_clocksource(void)
222 * - when local APIC timer is active (PIT is switched off) 220 * - when local APIC timer is active (PIT is switched off)
223 */ 221 */
224 if (num_possible_cpus() > 1 || is_hpet_enabled() || 222 if (num_possible_cpus() > 1 || is_hpet_enabled() ||
225 pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC) 223 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
226 return 0; 224 return 0;
227 225
228 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 226 pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift);
229 clocksource_pit.shift); 227
230 return clocksource_register(&clocksource_pit); 228 return clocksource_register(&pit_cs);
231} 229}
232arch_initcall(init_pit_clocksource); 230arch_initcall(init_pit_clocksource);
233 231
234#endif 232#endif /* !CONFIG_X86_64 */
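
The value pit_read() hands to the clocksource core is jifs * LATCH + count: whole PIT periods accounted by jiffies, plus the position within the current period (the raw down-count is flipped into an up-count just before the return). A standalone model of that composition, assuming HZ=250:

    #include <stdint.h>
    #include <stdio.h>

    #define PIT_TICK_RATE 1193182u  /* PIT input clock in Hz */
    #define HZ 250u
    #define LATCH ((PIT_TICK_RATE + HZ / 2) / HZ)

    int main(void)
    {
            uint32_t jifs = 100000;  /* sample: ticks since boot */
            uint32_t count = 1234;   /* sample: cycles into this tick */
            uint64_t cycles = (uint64_t)jifs * LATCH + count;

            printf("LATCH=%u cycles=%llu\n",
                   LATCH, (unsigned long long)cycles);
            return 0;
    }
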
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 11d5093eb281..df89102bef80 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -22,7 +22,6 @@
22#include <asm/pgtable.h> 22#include <asm/pgtable.h>
23#include <asm/desc.h> 23#include <asm/desc.h>
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/arch_hooks.h>
26#include <asm/i8259.h> 25#include <asm/i8259.h>
27 26
28/* 27/*
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 720d2607aacb..a979b5bd2fc0 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -7,10 +7,10 @@
7 */ 7 */
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/init.h>
11#include <linux/delay.h> 10#include <linux/delay.h>
11#include <linux/init.h>
12#include <linux/dmi.h> 12#include <linux/dmi.h>
13#include <asm/io.h> 13#include <linux/io.h>
14 14
15int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE; 15int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE;
16 16
@@ -47,8 +47,7 @@ EXPORT_SYMBOL(native_io_delay);
47static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) 47static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
48{ 48{
49 if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) { 49 if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) {
50 printk(KERN_NOTICE "%s: using 0xed I/O delay port\n", 50 pr_notice("%s: using 0xed I/O delay port\n", id->ident);
51 id->ident);
52 io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; 51 io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
53 } 52 }
54 53
@@ -64,40 +63,40 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
64 .callback = dmi_io_delay_0xed_port, 63 .callback = dmi_io_delay_0xed_port,
65 .ident = "Compaq Presario V6000", 64 .ident = "Compaq Presario V6000",
66 .matches = { 65 .matches = {
67 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), 66 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
68 DMI_MATCH(DMI_BOARD_NAME, "30B7") 67 DMI_MATCH(DMI_BOARD_NAME, "30B7")
69 } 68 }
70 }, 69 },
71 { 70 {
72 .callback = dmi_io_delay_0xed_port, 71 .callback = dmi_io_delay_0xed_port,
73 .ident = "HP Pavilion dv9000z", 72 .ident = "HP Pavilion dv9000z",
74 .matches = { 73 .matches = {
75 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), 74 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
76 DMI_MATCH(DMI_BOARD_NAME, "30B9") 75 DMI_MATCH(DMI_BOARD_NAME, "30B9")
77 } 76 }
78 }, 77 },
79 { 78 {
80 .callback = dmi_io_delay_0xed_port, 79 .callback = dmi_io_delay_0xed_port,
81 .ident = "HP Pavilion dv6000", 80 .ident = "HP Pavilion dv6000",
82 .matches = { 81 .matches = {
83 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), 82 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
84 DMI_MATCH(DMI_BOARD_NAME, "30B8") 83 DMI_MATCH(DMI_BOARD_NAME, "30B8")
85 } 84 }
86 }, 85 },
87 { 86 {
88 .callback = dmi_io_delay_0xed_port, 87 .callback = dmi_io_delay_0xed_port,
89 .ident = "HP Pavilion tx1000", 88 .ident = "HP Pavilion tx1000",
90 .matches = { 89 .matches = {
91 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), 90 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
92 DMI_MATCH(DMI_BOARD_NAME, "30BF") 91 DMI_MATCH(DMI_BOARD_NAME, "30BF")
93 } 92 }
94 }, 93 },
95 { 94 {
96 .callback = dmi_io_delay_0xed_port, 95 .callback = dmi_io_delay_0xed_port,
97 .ident = "Presario F700", 96 .ident = "Presario F700",
98 .matches = { 97 .matches = {
99 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), 98 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
100 DMI_MATCH(DMI_BOARD_NAME, "30D3") 99 DMI_MATCH(DMI_BOARD_NAME, "30D3")
101 } 100 }
102 }, 101 },
103 { } 102 { }
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index b12208f4dfee..99c4d308f16b 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -85,19 +85,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
85 85
86 t->io_bitmap_max = bytes; 86 t->io_bitmap_max = bytes;
87 87
88#ifdef CONFIG_X86_32
89 /*
90 * Sets the lazy trigger so that the next I/O operation will
91 * reload the correct bitmap.
92 * Reset the owner so that a process switch will not set
93 * tss->io_bitmap_base to IO_BITMAP_OFFSET.
94 */
95 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
96 tss->io_bitmap_owner = NULL;
97#else
98 /* Update the TSS: */ 88 /* Update the TSS: */
99 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); 89 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
100#endif
101 90
102 put_cpu(); 91 put_cpu();
103 92
@@ -131,9 +120,8 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)
131} 120}
132 121
133#ifdef CONFIG_X86_32 122#ifdef CONFIG_X86_32
134asmlinkage long sys_iopl(unsigned long regsp) 123long sys_iopl(struct pt_regs *regs)
135{ 124{
136 struct pt_regs *regs = (struct pt_regs *)&regsp;
137 unsigned int level = regs->bx; 125 unsigned int level = regs->bx;
138 struct thread_struct *t = &current->thread; 126 struct thread_struct *t = &current->thread;
139 int rc; 127 int rc;
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
deleted file mode 100644
index 285bbf8831fa..000000000000
--- a/arch/x86/kernel/ipi.c
+++ /dev/null
@@ -1,190 +0,0 @@
1#include <linux/cpumask.h>
2#include <linux/interrupt.h>
3#include <linux/init.h>
4
5#include <linux/mm.h>
6#include <linux/delay.h>
7#include <linux/spinlock.h>
8#include <linux/kernel_stat.h>
9#include <linux/mc146818rtc.h>
10#include <linux/cache.h>
11#include <linux/cpu.h>
12#include <linux/module.h>
13
14#include <asm/smp.h>
15#include <asm/mtrr.h>
16#include <asm/tlbflush.h>
17#include <asm/mmu_context.h>
18#include <asm/apic.h>
19#include <asm/proto.h>
20
21#ifdef CONFIG_X86_32
22#include <mach_apic.h>
23#include <mach_ipi.h>
24
25/*
26 * the following functions deal with sending IPIs between CPUs.
27 *
28 * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
29 */
30
31static inline int __prepare_ICR(unsigned int shortcut, int vector)
32{
33 unsigned int icr = shortcut | APIC_DEST_LOGICAL;
34
35 switch (vector) {
36 default:
37 icr |= APIC_DM_FIXED | vector;
38 break;
39 case NMI_VECTOR:
40 icr |= APIC_DM_NMI;
41 break;
42 }
43 return icr;
44}
45
46static inline int __prepare_ICR2(unsigned int mask)
47{
48 return SET_APIC_DEST_FIELD(mask);
49}
50
51void __send_IPI_shortcut(unsigned int shortcut, int vector)
52{
53 /*
54 * Subtle. In the case of the 'never do double writes' workaround
55 * we have to lock out interrupts to be safe. As we don't care
56 * of the value read we use an atomic rmw access to avoid costly
57 * cli/sti. Otherwise we use an even cheaper single atomic write
58 * to the APIC.
59 */
60 unsigned int cfg;
61
62 /*
63 * Wait for idle.
64 */
65 apic_wait_icr_idle();
66
67 /*
68 * No need to touch the target chip field
69 */
70 cfg = __prepare_ICR(shortcut, vector);
71
72 /*
73 * Send the IPI. The write to APIC_ICR fires this off.
74 */
75 apic_write(APIC_ICR, cfg);
76}
77
78void send_IPI_self(int vector)
79{
80 __send_IPI_shortcut(APIC_DEST_SELF, vector);
81}
82
83/*
84 * This is used to send an IPI with no shorthand notation (the destination is
85 * specified in bits 56 to 63 of the ICR).
86 */
87static inline void __send_IPI_dest_field(unsigned long mask, int vector)
88{
89 unsigned long cfg;
90
91 /*
92 * Wait for idle.
93 */
94 if (unlikely(vector == NMI_VECTOR))
95 safe_apic_wait_icr_idle();
96 else
97 apic_wait_icr_idle();
98
99 /*
100 * prepare target chip field
101 */
102 cfg = __prepare_ICR2(mask);
103 apic_write(APIC_ICR2, cfg);
104
105 /*
106 * program the ICR
107 */
108 cfg = __prepare_ICR(0, vector);
109
110 /*
111 * Send the IPI. The write to APIC_ICR fires this off.
112 */
113 apic_write(APIC_ICR, cfg);
114}
115
116/*
117 * This is only used on smaller machines.
118 */
119void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector)
120{
121 unsigned long mask = cpumask_bits(cpumask)[0];
122 unsigned long flags;
123
124 local_irq_save(flags);
125 WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
126 __send_IPI_dest_field(mask, vector);
127 local_irq_restore(flags);
128}
129
130void send_IPI_mask_sequence(const struct cpumask *mask, int vector)
131{
132 unsigned long flags;
133 unsigned int query_cpu;
134
135 /*
136 * Hack. The clustered APIC addressing mode doesn't allow us to send
137 * to an arbitrary mask, so I do a unicasts to each CPU instead. This
138 * should be modified to do 1 message per cluster ID - mbligh
139 */
140
141 local_irq_save(flags);
142 for_each_cpu(query_cpu, mask)
143 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector);
144 local_irq_restore(flags);
145}
146
147void send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
148{
149 unsigned long flags;
150 unsigned int query_cpu;
151 unsigned int this_cpu = smp_processor_id();
152
153 /* See Hack comment above */
154
155 local_irq_save(flags);
156 for_each_cpu(query_cpu, mask)
157 if (query_cpu != this_cpu)
158 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
159 vector);
160 local_irq_restore(flags);
161}
162
163/* must come after the send_IPI functions above for inlining */
164static int convert_apicid_to_cpu(int apic_id)
165{
166 int i;
167
168 for_each_possible_cpu(i) {
169 if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
170 return i;
171 }
172 return -1;
173}
174
175int safe_smp_processor_id(void)
176{
177 int apicid, cpuid;
178
179 if (!boot_cpu_has(X86_FEATURE_APIC))
180 return 0;
181
182 apicid = hard_smp_processor_id();
183 if (apicid == BAD_APICID)
184 return 0;
185
186 cpuid = convert_apicid_to_cpu(apicid);
187
188 return cpuid >= 0 ? cpuid : 0;
189}
190#endif
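
The file's contents move to arch/x86/kernel/apic/ipi.c, but the ICR protocol it implements is worth an illustration: the destination is placed in bits 24-31 of ICR2, the delivery mode and vector in ICR, and the ICR write itself fires the IPI. Sketch with constants mirroring apicdef.h (the actual MMIO writes are elided):

    #include <stdint.h>
    #include <stdio.h>

    #define APIC_DEST_LOGICAL 0x00800u
    #define APIC_DM_FIXED     0x00000u
    #define APIC_DM_NMI       0x00400u
    #define NMI_VECTOR        0x02

    static uint32_t prepare_icr(unsigned int shortcut, int vector)
    {
            uint32_t icr = shortcut | APIC_DEST_LOGICAL;

            if (vector == NMI_VECTOR)
                    icr |= APIC_DM_NMI;  /* NMI ignores the vector field */
            else
                    icr |= APIC_DM_FIXED | vector;
            return icr;
    }

    static uint32_t prepare_icr2(unsigned int mask)
    {
            return mask << 24;  /* SET_APIC_DEST_FIELD() */
    }

    int main(void)
    {
            /* e.g. fixed IPI, vector 0xfd, logical destination 0x02 */
            printf("ICR2=%08x ICR=%08x\n",
                   prepare_icr2(0x02), prepare_icr(0, 0xfd));
            return 0;
    }
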
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3973e2df7f87..3aaf7b9e3a8b 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -6,13 +6,18 @@
6#include <linux/kernel_stat.h> 6#include <linux/kernel_stat.h>
7#include <linux/seq_file.h> 7#include <linux/seq_file.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9#include <linux/ftrace.h>
9 10
10#include <asm/apic.h> 11#include <asm/apic.h>
11#include <asm/io_apic.h> 12#include <asm/io_apic.h>
12#include <asm/irq.h> 13#include <asm/irq.h>
14#include <asm/idle.h>
13 15
14atomic_t irq_err_count; 16atomic_t irq_err_count;
15 17
18/* Function pointer for generic interrupt vector handling */
19void (*generic_interrupt_extension)(void) = NULL;
20
16/* 21/*
17 * 'what should we do if we get a hw irq event on an illegal vector'. 22 * 'what should we do if we get a hw irq event on an illegal vector'.
18 * each architecture has to answer this themselves. 23 * each architecture has to answer this themselves.
@@ -36,63 +41,64 @@ void ack_bad_irq(unsigned int irq)
36#endif 41#endif
37} 42}
38 43
39#ifdef CONFIG_X86_32 44#define irq_stats(x) (&per_cpu(irq_stat, x))
40# define irq_stats(x) (&per_cpu(irq_stat, x))
41#else
42# define irq_stats(x) cpu_pda(x)
43#endif
44/* 45/*
45 * /proc/interrupts printing: 46 * /proc/interrupts printing:
46 */ 47 */
47static int show_other_interrupts(struct seq_file *p) 48static int show_other_interrupts(struct seq_file *p, int prec)
48{ 49{
49 int j; 50 int j;
50 51
51 seq_printf(p, "NMI: "); 52 seq_printf(p, "%*s: ", prec, "NMI");
52 for_each_online_cpu(j) 53 for_each_online_cpu(j)
53 seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); 54 seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
54 seq_printf(p, " Non-maskable interrupts\n"); 55 seq_printf(p, " Non-maskable interrupts\n");
55#ifdef CONFIG_X86_LOCAL_APIC 56#ifdef CONFIG_X86_LOCAL_APIC
56 seq_printf(p, "LOC: "); 57 seq_printf(p, "%*s: ", prec, "LOC");
57 for_each_online_cpu(j) 58 for_each_online_cpu(j)
58 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); 59 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
59 seq_printf(p, " Local timer interrupts\n"); 60 seq_printf(p, " Local timer interrupts\n");
61
62 seq_printf(p, "%*s: ", prec, "SPU");
63 for_each_online_cpu(j)
64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
65 seq_printf(p, " Spurious interrupts\n");
60#endif 66#endif
67 if (generic_interrupt_extension) {
68 seq_printf(p, "PLT: ");
69 for_each_online_cpu(j)
70 seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
71 seq_printf(p, " Platform interrupts\n");
72 }
61#ifdef CONFIG_SMP 73#ifdef CONFIG_SMP
62 seq_printf(p, "RES: "); 74 seq_printf(p, "%*s: ", prec, "RES");
63 for_each_online_cpu(j) 75 for_each_online_cpu(j)
64 seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); 76 seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
65 seq_printf(p, " Rescheduling interrupts\n"); 77 seq_printf(p, " Rescheduling interrupts\n");
66 seq_printf(p, "CAL: "); 78 seq_printf(p, "%*s: ", prec, "CAL");
67 for_each_online_cpu(j) 79 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); 80 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
69 seq_printf(p, " Function call interrupts\n"); 81 seq_printf(p, " Function call interrupts\n");
70 seq_printf(p, "TLB: "); 82 seq_printf(p, "%*s: ", prec, "TLB");
71 for_each_online_cpu(j) 83 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); 84 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
73 seq_printf(p, " TLB shootdowns\n"); 85 seq_printf(p, " TLB shootdowns\n");
74#endif 86#endif
75#ifdef CONFIG_X86_MCE 87#ifdef CONFIG_X86_MCE
76 seq_printf(p, "TRM: "); 88 seq_printf(p, "%*s: ", prec, "TRM");
77 for_each_online_cpu(j) 89 for_each_online_cpu(j)
78 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 90 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
79 seq_printf(p, " Thermal event interrupts\n"); 91 seq_printf(p, " Thermal event interrupts\n");
80# ifdef CONFIG_X86_64 92# ifdef CONFIG_X86_64
81 seq_printf(p, "THR: "); 93 seq_printf(p, "%*s: ", prec, "THR");
82 for_each_online_cpu(j) 94 for_each_online_cpu(j)
83 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 95 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
84 seq_printf(p, " Threshold APIC interrupts\n"); 96 seq_printf(p, " Threshold APIC interrupts\n");
85# endif 97# endif
86#endif 98#endif
87#ifdef CONFIG_X86_LOCAL_APIC 99 seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
88 seq_printf(p, "SPU: ");
89 for_each_online_cpu(j)
90 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
91 seq_printf(p, " Spurious interrupts\n");
92#endif
93 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
94#if defined(CONFIG_X86_IO_APIC) 100#if defined(CONFIG_X86_IO_APIC)
95 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); 101 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
96#endif 102#endif
97 return 0; 103 return 0;
98} 104}
@@ -100,19 +106,22 @@ static int show_other_interrupts(struct seq_file *p)
100int show_interrupts(struct seq_file *p, void *v) 106int show_interrupts(struct seq_file *p, void *v)
101{ 107{
102 unsigned long flags, any_count = 0; 108 unsigned long flags, any_count = 0;
103 int i = *(loff_t *) v, j; 109 int i = *(loff_t *) v, j, prec;
104 struct irqaction *action; 110 struct irqaction *action;
105 struct irq_desc *desc; 111 struct irq_desc *desc;
106 112
107 if (i > nr_irqs) 113 if (i > nr_irqs)
108 return 0; 114 return 0;
109 115
116 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
117 j *= 10;
118
110 if (i == nr_irqs) 119 if (i == nr_irqs)
111 return show_other_interrupts(p); 120 return show_other_interrupts(p, prec);
112 121
113 /* print header */ 122 /* print header */
114 if (i == 0) { 123 if (i == 0) {
115 seq_printf(p, " "); 124 seq_printf(p, "%*s", prec + 8, "");
116 for_each_online_cpu(j) 125 for_each_online_cpu(j)
117 seq_printf(p, "CPU%-8d", j); 126 seq_printf(p, "CPU%-8d", j);
118 seq_putc(p, '\n'); 127 seq_putc(p, '\n');
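
The new prec computation sizes the label column to the number of decimal digits in nr_irqs (minimum three, capped at ten), so large NR_IRQS configurations no longer break /proc/interrupts alignment. It behaves like this standalone version:

    #include <stdio.h>

    static int irq_prec(int nr_irqs)
    {
            int prec, j;

            for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
                    j *= 10;
            return prec;
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   irq_prec(224), irq_prec(4096), irq_prec(100000));
            /* prints: 3 4 6 */
            return 0;
    }
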
@@ -123,23 +132,15 @@ int show_interrupts(struct seq_file *p, void *v)
123 return 0; 132 return 0;
124 133
125 spin_lock_irqsave(&desc->lock, flags); 134 spin_lock_irqsave(&desc->lock, flags);
126#ifndef CONFIG_SMP
127 any_count = kstat_irqs(i);
128#else
129 for_each_online_cpu(j) 135 for_each_online_cpu(j)
130 any_count |= kstat_irqs_cpu(i, j); 136 any_count |= kstat_irqs_cpu(i, j);
131#endif
132 action = desc->action; 137 action = desc->action;
133 if (!action && !any_count) 138 if (!action && !any_count)
134 goto out; 139 goto out;
135 140
136 seq_printf(p, "%3d: ", i); 141 seq_printf(p, "%*d: ", prec, i);
137#ifndef CONFIG_SMP
138 seq_printf(p, "%10u ", kstat_irqs(i));
139#else
140 for_each_online_cpu(j) 142 for_each_online_cpu(j)
141 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); 143 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
142#endif
143 seq_printf(p, " %8s", desc->chip->name); 144 seq_printf(p, " %8s", desc->chip->name);
144 seq_printf(p, "-%-8s", desc->name); 145 seq_printf(p, "-%-8s", desc->name);
145 146
@@ -164,7 +165,10 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
164 165
165#ifdef CONFIG_X86_LOCAL_APIC 166#ifdef CONFIG_X86_LOCAL_APIC
166 sum += irq_stats(cpu)->apic_timer_irqs; 167 sum += irq_stats(cpu)->apic_timer_irqs;
168 sum += irq_stats(cpu)->irq_spurious_count;
167#endif 169#endif
170 if (generic_interrupt_extension)
171 sum += irq_stats(cpu)->generic_irqs;
168#ifdef CONFIG_SMP 172#ifdef CONFIG_SMP
169 sum += irq_stats(cpu)->irq_resched_count; 173 sum += irq_stats(cpu)->irq_resched_count;
170 sum += irq_stats(cpu)->irq_call_count; 174 sum += irq_stats(cpu)->irq_call_count;
@@ -176,9 +180,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
176 sum += irq_stats(cpu)->irq_threshold_count; 180 sum += irq_stats(cpu)->irq_threshold_count;
177#endif 181#endif
178#endif 182#endif
179#ifdef CONFIG_X86_LOCAL_APIC
180 sum += irq_stats(cpu)->irq_spurious_count;
181#endif
182 return sum; 183 return sum;
183} 184}
184 185
@@ -192,4 +193,63 @@ u64 arch_irq_stat(void)
192 return sum; 193 return sum;
193} 194}
194 195
196
197/*
198 * do_IRQ handles all normal device IRQ's (the special
199 * SMP cross-CPU interrupts have their own specific
200 * handlers).
201 */
202unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
203{
204 struct pt_regs *old_regs = set_irq_regs(regs);
205
206 /* high bit used in ret_from_ code */
207 unsigned vector = ~regs->orig_ax;
208 unsigned irq;
209
210 exit_idle();
211 irq_enter();
212
213 irq = __get_cpu_var(vector_irq)[vector];
214
215 if (!handle_irq(irq, regs)) {
216#ifdef CONFIG_X86_64
217 if (!disable_apic)
218 ack_APIC_irq();
219#endif
220
221 if (printk_ratelimit())
222 printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n",
223 __func__, smp_processor_id(), vector, irq);
224 }
225
226 irq_exit();
227
228 set_irq_regs(old_regs);
229 return 1;
230}
231
232/*
233 * Handler for GENERIC_INTERRUPT_VECTOR.
234 */
235void smp_generic_interrupt(struct pt_regs *regs)
236{
237 struct pt_regs *old_regs = set_irq_regs(regs);
238
239 ack_APIC_irq();
240
241 exit_idle();
242
243 irq_enter();
244
245 inc_irq_stat(generic_irqs);
246
247 if (generic_interrupt_extension)
248 generic_interrupt_extension();
249
250 irq_exit();
251
252 set_irq_regs(old_regs);
253}
254
195EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 255EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
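
One subtlety in the consolidated do_IRQ(): the interrupt entry stubs push the vector number bitwise-complemented into orig_ax (per the comment above, the high bit is used by the ret_from_ code), and ~regs->orig_ax recovers it. Tiny illustration:

    #include <stdio.h>

    int main(void)
    {
            long orig_ax = ~0x31L;  /* what the stub pushes for vector 0x31 */
            unsigned int vector = (unsigned int)~orig_ax;

            printf("orig_ax=%ld -> vector=%#x\n", orig_ax, vector);
            return 0;
    }
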
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 74b9ff7341e9..3b09634a5153 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -16,6 +16,7 @@
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/uaccess.h> 18#include <linux/uaccess.h>
19#include <linux/percpu.h>
19 20
20#include <asm/apic.h> 21#include <asm/apic.h>
21 22
@@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
55union irq_ctx { 56union irq_ctx {
56 struct thread_info tinfo; 57 struct thread_info tinfo;
57 u32 stack[THREAD_SIZE/sizeof(u32)]; 58 u32 stack[THREAD_SIZE/sizeof(u32)];
58}; 59} __attribute__((aligned(PAGE_SIZE)));
59 60
60static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 61static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
61static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 62static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
62 63
63static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; 64static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
64static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; 65static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
65 66
66static void call_on_stack(void *func, void *stack) 67static void call_on_stack(void *func, void *stack)
67{ 68{
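
The union irq_ctx these per-cpu stacks use keeps a thread_info in the same storage as the bottom of the stack, which is what lets the 32-bit kernel recover it by masking the stack pointer. A rough standalone sketch with stub types (THREAD_SIZE alignment assumed here so the mask works; the hunk above aligns the union to PAGE_SIZE):

    #include <stdint.h>
    #include <stdio.h>

    #define THREAD_SIZE 8192u

    struct thread_info_stub { int cpu; int preempt_count; };

    union irq_ctx {
            struct thread_info_stub tinfo;
            uint32_t stack[THREAD_SIZE / sizeof(uint32_t)];
    } __attribute__((aligned(THREAD_SIZE)));

    static union irq_ctx ctx;

    int main(void)
    {
            /* any address inside the stack masks back to the union base */
            uintptr_t sp = (uintptr_t)&ctx.stack[1000];
            union irq_ctx *found =
                    (union irq_ctx *)(sp & ~(uintptr_t)(THREAD_SIZE - 1));

            printf("%s\n", found == &ctx ? "recovered thread_info" : "bug");
            return 0;
    }
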
@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
81 u32 *isp, arg1, arg2; 82 u32 *isp, arg1, arg2;
82 83
83 curctx = (union irq_ctx *) current_thread_info(); 84 curctx = (union irq_ctx *) current_thread_info();
84 irqctx = hardirq_ctx[smp_processor_id()]; 85 irqctx = __get_cpu_var(hardirq_ctx);
85 86
86 /* 87 /*
87 * this is where we switch to the IRQ stack. However, if we are 88 * this is where we switch to the IRQ stack. However, if we are
@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
125{ 126{
126 union irq_ctx *irqctx; 127 union irq_ctx *irqctx;
127 128
128 if (hardirq_ctx[cpu]) 129 if (per_cpu(hardirq_ctx, cpu))
129 return; 130 return;
130 131
131 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; 132 irqctx = &per_cpu(hardirq_stack, cpu);
132 irqctx->tinfo.task = NULL; 133 irqctx->tinfo.task = NULL;
133 irqctx->tinfo.exec_domain = NULL; 134 irqctx->tinfo.exec_domain = NULL;
134 irqctx->tinfo.cpu = cpu; 135 irqctx->tinfo.cpu = cpu;
135 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 136 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
136 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 137 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
137 138
138 hardirq_ctx[cpu] = irqctx; 139 per_cpu(hardirq_ctx, cpu) = irqctx;
139 140
140 irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; 141 irqctx = &per_cpu(softirq_stack, cpu);
141 irqctx->tinfo.task = NULL; 142 irqctx->tinfo.task = NULL;
142 irqctx->tinfo.exec_domain = NULL; 143 irqctx->tinfo.exec_domain = NULL;
143 irqctx->tinfo.cpu = cpu; 144 irqctx->tinfo.cpu = cpu;
144 irqctx->tinfo.preempt_count = 0; 145 irqctx->tinfo.preempt_count = 0;
145 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 146 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
146 147
147 softirq_ctx[cpu] = irqctx; 148 per_cpu(softirq_ctx, cpu) = irqctx;
148 149
149 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", 150 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
150 cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); 151 cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
151} 152}
152 153
153void irq_ctx_exit(int cpu) 154void irq_ctx_exit(int cpu)
154{ 155{
155 hardirq_ctx[cpu] = NULL; 156 per_cpu(hardirq_ctx, cpu) = NULL;
156} 157}
157 158
158asmlinkage void do_softirq(void) 159asmlinkage void do_softirq(void)
@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
169 170
170 if (local_softirq_pending()) { 171 if (local_softirq_pending()) {
171 curctx = current_thread_info(); 172 curctx = current_thread_info();
172 irqctx = softirq_ctx[smp_processor_id()]; 173 irqctx = __get_cpu_var(softirq_ctx);
173 irqctx->tinfo.task = curctx->task; 174 irqctx->tinfo.task = curctx->task;
174 irqctx->tinfo.previous_esp = current_stack_pointer; 175 irqctx->tinfo.previous_esp = current_stack_pointer;
175 176
@@ -191,33 +192,16 @@ static inline int
191execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; } 192execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
192#endif 193#endif
193 194
194/* 195bool handle_irq(unsigned irq, struct pt_regs *regs)
195 * do_IRQ handles all normal device IRQ's (the special
196 * SMP cross-CPU interrupts have their own specific
197 * handlers).
198 */
199unsigned int do_IRQ(struct pt_regs *regs)
200{ 196{
201 struct pt_regs *old_regs;
202 /* high bit used in ret_from_ code */
203 int overflow;
204 unsigned vector = ~regs->orig_ax;
205 struct irq_desc *desc; 197 struct irq_desc *desc;
206 unsigned irq; 198 int overflow;
207
208
209 old_regs = set_irq_regs(regs);
210 irq_enter();
211 irq = __get_cpu_var(vector_irq)[vector];
212 199
213 overflow = check_stack_overflow(); 200 overflow = check_stack_overflow();
214 201
215 desc = irq_to_desc(irq); 202 desc = irq_to_desc(irq);
216 if (unlikely(!desc)) { 203 if (unlikely(!desc))
217 printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n", 204 return false;
218 __func__, irq, vector, smp_processor_id());
219 BUG();
220 }
221 205
222 if (!execute_on_irq_stack(overflow, desc, irq)) { 206 if (!execute_on_irq_stack(overflow, desc, irq)) {
223 if (unlikely(overflow)) 207 if (unlikely(overflow))
@@ -225,13 +209,10 @@ unsigned int do_IRQ(struct pt_regs *regs)
225 desc->handle_irq(irq, desc); 209 desc->handle_irq(irq, desc);
226 } 210 }
227 211
228 irq_exit(); 212 return true;
229 set_irq_regs(old_regs);
230 return 1;
231} 213}
232 214
233#ifdef CONFIG_HOTPLUG_CPU 215#ifdef CONFIG_HOTPLUG_CPU
234#include <mach_apic.h>
235 216
236/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ 217/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
237void fixup_irqs(void) 218void fixup_irqs(void)
@@ -248,7 +229,7 @@ void fixup_irqs(void)
248 if (irq == 2) 229 if (irq == 2)
249 continue; 230 continue;
250 231
251 affinity = &desc->affinity; 232 affinity = desc->affinity;
252 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { 233 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
253 printk("Breaking affinity for irq %i\n", irq); 234 printk("Breaking affinity for irq %i\n", irq);
254 affinity = cpu_all_mask; 235 affinity = cpu_all_mask;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 63c88e6ec025..977d8b43a0dd 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,6 +18,13 @@
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <asm/io_apic.h> 19#include <asm/io_apic.h>
20#include <asm/idle.h> 20#include <asm/idle.h>
21#include <asm/apic.h>
22
23DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
24EXPORT_PER_CPU_SYMBOL(irq_stat);
25
26DEFINE_PER_CPU(struct pt_regs *, irq_regs);
27EXPORT_PER_CPU_SYMBOL(irq_regs);
21 28
22/* 29/*
23 * Probabilistic stack overflow check: 30 * Probabilistic stack overflow check:
@@ -41,42 +48,18 @@ static inline void stack_overflow_check(struct pt_regs *regs)
41#endif 48#endif
42} 49}
43 50
44/* 51bool handle_irq(unsigned irq, struct pt_regs *regs)
45 * do_IRQ handles all normal device IRQ's (the special
46 * SMP cross-CPU interrupts have their own specific
47 * handlers).
48 */
49asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
50{ 52{
51 struct pt_regs *old_regs = set_irq_regs(regs);
52 struct irq_desc *desc; 53 struct irq_desc *desc;
53 54
54 /* high bit used in ret_from_ code */
55 unsigned vector = ~regs->orig_ax;
56 unsigned irq;
57
58 exit_idle();
59 irq_enter();
60 irq = __get_cpu_var(vector_irq)[vector];
61
62 stack_overflow_check(regs); 55 stack_overflow_check(regs);
63 56
64 desc = irq_to_desc(irq); 57 desc = irq_to_desc(irq);
65 if (likely(desc)) 58 if (unlikely(!desc))
66 generic_handle_irq_desc(irq, desc); 59 return false;
67 else {
68 if (!disable_apic)
69 ack_APIC_irq();
70
71 if (printk_ratelimit())
72 printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
73 __func__, smp_processor_id(), vector);
74 }
75
76 irq_exit();
77 60
78 set_irq_regs(old_regs); 61 generic_handle_irq_desc(irq, desc);
79 return 1; 62 return true;
80} 63}
81 64
82#ifdef CONFIG_HOTPLUG_CPU 65#ifdef CONFIG_HOTPLUG_CPU
@@ -100,7 +83,7 @@ void fixup_irqs(void)
100 /* interrupt's are disabled at this point */ 83 /* interrupt's are disabled at this point */
101 spin_lock(&desc->lock); 84 spin_lock(&desc->lock);
102 85
103 affinity = &desc->affinity; 86 affinity = desc->affinity;
104 if (!irq_has_action(irq) || 87 if (!irq_has_action(irq) ||
105 cpumask_equal(affinity, cpu_online_mask)) { 88 cpumask_equal(affinity, cpu_online_mask)) {
106 spin_unlock(&desc->lock); 89 spin_unlock(&desc->lock);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 10a09c2f1828..368b0a8836f9 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -18,7 +18,7 @@
18#include <asm/pgtable.h> 18#include <asm/pgtable.h>
19#include <asm/desc.h> 19#include <asm/desc.h>
20#include <asm/apic.h> 20#include <asm/apic.h>
21#include <asm/arch_hooks.h> 21#include <asm/setup.h>
22#include <asm/i8259.h> 22#include <asm/i8259.h>
23#include <asm/traps.h> 23#include <asm/traps.h>
24 24
@@ -50,7 +50,6 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
50 */ 50 */
51static struct irqaction fpu_irq = { 51static struct irqaction fpu_irq = {
52 .handler = math_error_irq, 52 .handler = math_error_irq,
53 .mask = CPU_MASK_NONE,
54 .name = "fpu", 53 .name = "fpu",
55}; 54};
56 55
@@ -78,6 +77,14 @@ void __init init_ISA_irqs(void)
78 } 77 }
79} 78}
80 79
80/*
81 * IRQ2 is cascade interrupt to second interrupt controller
82 */
83static struct irqaction irq2 = {
84 .handler = no_action,
85 .name = "cascade",
86};
87
81DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 88DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
82 [0 ... IRQ0_VECTOR - 1] = -1, 89 [0 ... IRQ0_VECTOR - 1] = -1,
83 [IRQ0_VECTOR] = 0, 90 [IRQ0_VECTOR] = 0,
@@ -118,8 +125,8 @@ void __init native_init_IRQ(void)
118{ 125{
119 int i; 126 int i;
120 127
121 /* all the set up before the call gates are initialised */ 128 /* Execute any quirks before the call gates are initialised: */
122 pre_intr_init_hook(); 129 x86_quirk_pre_intr_init();
123 130
124 /* 131 /*
125 * Cover the whole vector space, no vector can escape 132 * Cover the whole vector space, no vector can escape
@@ -140,8 +147,15 @@ void __init native_init_IRQ(void)
140 */ 147 */
141 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 148 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
142 149
143 /* IPI for invalidation */ 150 /* IPIs for invalidation */
144 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); 151 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
152 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
153 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
154 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
155 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
156 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
157 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
158 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
145 159
146 /* IPI for generic function call */ 160 /* IPI for generic function call */
147 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 161 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
@@ -159,6 +173,9 @@ void __init native_init_IRQ(void)
159 /* self generated IPI for local APIC timer */ 173 /* self generated IPI for local APIC timer */
160 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 174 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
161 175
176 /* generic IPI for platform specific use */
177 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
178
162 /* IPI vectors for APIC spurious and error interrupts */ 179 /* IPI vectors for APIC spurious and error interrupts */
163 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 180 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
164 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 181 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
@@ -169,10 +186,14 @@ void __init native_init_IRQ(void)
169 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 186 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
170#endif 187#endif
171 188
172 /* setup after call gates are initialised (usually add in 189 if (!acpi_ioapic)
173 * the architecture specific gates) 190 setup_irq(2, &irq2);
191
192 /*
193 * Call quirks after call gates are initialised (usually add in
194 * the architecture specific gates):
174 */ 195 */
175 intr_init_hook(); 196 x86_quirk_intr_init();
176 197
177 /* 198 /*
178 * External FPU? Set up irq13 if so, for 199 * External FPU? Set up irq13 if so, for
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index da481a1e3f30..8cd10537fd46 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -45,7 +45,6 @@
45 45
46static struct irqaction irq2 = { 46static struct irqaction irq2 = {
47 .handler = no_action, 47 .handler = no_action,
48 .mask = CPU_MASK_NONE,
49 .name = "cascade", 48 .name = "cascade",
50}; 49};
51DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 50DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
@@ -147,6 +146,9 @@ static void __init apic_intr_init(void)
147 /* self generated IPI for local APIC timer */ 146 /* self generated IPI for local APIC timer */
148 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 147 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
149 148
149 /* generic IPI for platform specific use */
150 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
151
150 /* IPI vectors for APIC spurious and error interrupts */ 152 /* IPI vectors for APIC spurious and error interrupts */
151 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 153 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
152 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 154 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index ff7d3b0124f1..e444357375ce 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -8,11 +8,11 @@
8 */ 8 */
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/stat.h> 11#include <linux/module.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/stat.h>
13#include <linux/io.h> 14#include <linux/io.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/module.h>
16 16
17#include <asm/setup.h> 17#include <asm/setup.h>
18 18
@@ -26,9 +26,8 @@ struct setup_data_node {
26 u32 len; 26 u32 len;
27}; 27};
28 28
29static ssize_t 29static ssize_t setup_data_read(struct file *file, char __user *user_buf,
30setup_data_read(struct file *file, char __user *user_buf, size_t count, 30 size_t count, loff_t *ppos)
31 loff_t *ppos)
32{ 31{
33 struct setup_data_node *node = file->private_data; 32 struct setup_data_node *node = file->private_data;
34 unsigned long remain; 33 unsigned long remain;
@@ -39,20 +38,21 @@ setup_data_read(struct file *file, char __user *user_buf, size_t count,
39 38
40 if (pos < 0) 39 if (pos < 0)
41 return -EINVAL; 40 return -EINVAL;
41
42 if (pos >= node->len) 42 if (pos >= node->len)
43 return 0; 43 return 0;
44 44
45 if (count > node->len - pos) 45 if (count > node->len - pos)
46 count = node->len - pos; 46 count = node->len - pos;
47
47 pa = node->paddr + sizeof(struct setup_data) + pos; 48 pa = node->paddr + sizeof(struct setup_data) + pos;
48 pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); 49 pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
49 if (PageHighMem(pg)) { 50 if (PageHighMem(pg)) {
50 p = ioremap_cache(pa, count); 51 p = ioremap_cache(pa, count);
51 if (!p) 52 if (!p)
52 return -ENXIO; 53 return -ENXIO;
53 } else { 54 } else
54 p = __va(pa); 55 p = __va(pa);
55 }
56 56
57 remain = copy_to_user(user_buf, p, count); 57 remain = copy_to_user(user_buf, p, count);
58 58
@@ -70,12 +70,13 @@ setup_data_read(struct file *file, char __user *user_buf, size_t count,
70static int setup_data_open(struct inode *inode, struct file *file) 70static int setup_data_open(struct inode *inode, struct file *file)
71{ 71{
72 file->private_data = inode->i_private; 72 file->private_data = inode->i_private;
73
73 return 0; 74 return 0;
74} 75}
75 76
76static const struct file_operations fops_setup_data = { 77static const struct file_operations fops_setup_data = {
77 .read = setup_data_read, 78 .read = setup_data_read,
78 .open = setup_data_open, 79 .open = setup_data_open,
79}; 80};
80 81
81static int __init 82static int __init
@@ -84,57 +85,50 @@ create_setup_data_node(struct dentry *parent, int no,
84{ 85{
85 struct dentry *d, *type, *data; 86 struct dentry *d, *type, *data;
86 char buf[16]; 87 char buf[16];
87 int error;
88 88
89 sprintf(buf, "%d", no); 89 sprintf(buf, "%d", no);
90 d = debugfs_create_dir(buf, parent); 90 d = debugfs_create_dir(buf, parent);
91 if (!d) { 91 if (!d)
92 error = -ENOMEM; 92 return -ENOMEM;
93 goto err_return; 93
94 }
95 type = debugfs_create_x32("type", S_IRUGO, d, &node->type); 94 type = debugfs_create_x32("type", S_IRUGO, d, &node->type);
96 if (!type) { 95 if (!type)
97 error = -ENOMEM;
98 goto err_dir; 96 goto err_dir;
99 } 97
100 data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data); 98 data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
101 if (!data) { 99 if (!data)
102 error = -ENOMEM;
103 goto err_type; 100 goto err_type;
104 } 101
105 return 0; 102 return 0;
106 103
107err_type: 104err_type:
108 debugfs_remove(type); 105 debugfs_remove(type);
109err_dir: 106err_dir:
110 debugfs_remove(d); 107 debugfs_remove(d);
111err_return: 108 return -ENOMEM;
112 return error;
113} 109}
114 110
115static int __init create_setup_data_nodes(struct dentry *parent) 111static int __init create_setup_data_nodes(struct dentry *parent)
116{ 112{
117 struct setup_data_node *node; 113 struct setup_data_node *node;
118 struct setup_data *data; 114 struct setup_data *data;
119 int error, no = 0; 115 int error = -ENOMEM;
120 struct dentry *d; 116 struct dentry *d;
121 struct page *pg; 117 struct page *pg;
122 u64 pa_data; 118 u64 pa_data;
119 int no = 0;
123 120
124 d = debugfs_create_dir("setup_data", parent); 121 d = debugfs_create_dir("setup_data", parent);
125 if (!d) { 122 if (!d)
126 error = -ENOMEM; 123 return -ENOMEM;
127 goto err_return;
128 }
129 124
130 pa_data = boot_params.hdr.setup_data; 125 pa_data = boot_params.hdr.setup_data;
131 126
132 while (pa_data) { 127 while (pa_data) {
133 node = kmalloc(sizeof(*node), GFP_KERNEL); 128 node = kmalloc(sizeof(*node), GFP_KERNEL);
134 if (!node) { 129 if (!node)
135 error = -ENOMEM;
136 goto err_dir; 130 goto err_dir;
137 } 131
138 pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); 132 pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
139 if (PageHighMem(pg)) { 133 if (PageHighMem(pg)) {
140 data = ioremap_cache(pa_data, sizeof(*data)); 134 data = ioremap_cache(pa_data, sizeof(*data));
@@ -143,9 +137,8 @@ static int __init create_setup_data_nodes(struct dentry *parent)
143 error = -ENXIO; 137 error = -ENXIO;
144 goto err_dir; 138 goto err_dir;
145 } 139 }
146 } else { 140 } else
147 data = __va(pa_data); 141 data = __va(pa_data);
148 }
149 142
150 node->paddr = pa_data; 143 node->paddr = pa_data;
151 node->type = data->type; 144 node->type = data->type;
@@ -159,11 +152,11 @@ static int __init create_setup_data_nodes(struct dentry *parent)
159 goto err_dir; 152 goto err_dir;
160 no++; 153 no++;
161 } 154 }
155
162 return 0; 156 return 0;
163 157
164err_dir: 158err_dir:
165 debugfs_remove(d); 159 debugfs_remove(d);
166err_return:
167 return error; 160 return error;
168} 161}
169 162
@@ -175,28 +168,26 @@ static struct debugfs_blob_wrapper boot_params_blob = {
175static int __init boot_params_kdebugfs_init(void) 168static int __init boot_params_kdebugfs_init(void)
176{ 169{
177 struct dentry *dbp, *version, *data; 170 struct dentry *dbp, *version, *data;
178 int error; 171 int error = -ENOMEM;
179 172
180 dbp = debugfs_create_dir("boot_params", NULL); 173 dbp = debugfs_create_dir("boot_params", NULL);
181 if (!dbp) { 174 if (!dbp)
182 error = -ENOMEM; 175 return -ENOMEM;
183 goto err_return; 176
184 }
185 version = debugfs_create_x16("version", S_IRUGO, dbp, 177 version = debugfs_create_x16("version", S_IRUGO, dbp,
186 &boot_params.hdr.version); 178 &boot_params.hdr.version);
187 if (!version) { 179 if (!version)
188 error = -ENOMEM;
189 goto err_dir; 180 goto err_dir;
190 } 181
191 data = debugfs_create_blob("data", S_IRUGO, dbp, 182 data = debugfs_create_blob("data", S_IRUGO, dbp,
192 &boot_params_blob); 183 &boot_params_blob);
193 if (!data) { 184 if (!data)
194 error = -ENOMEM;
195 goto err_version; 185 goto err_version;
196 } 186
197 error = create_setup_data_nodes(dbp); 187 error = create_setup_data_nodes(dbp);
198 if (error) 188 if (error)
199 goto err_data; 189 goto err_data;
190
200 return 0; 191 return 0;
201 192
202err_data: 193err_data:
@@ -205,10 +196,9 @@ err_version:
205 debugfs_remove(version); 196 debugfs_remove(version);
206err_dir: 197err_dir:
207 debugfs_remove(dbp); 198 debugfs_remove(dbp);
208err_return:
209 return error; 199 return error;
210} 200}
211#endif 201#endif /* CONFIG_DEBUG_BOOT_PARAMS */
212 202
213static int __init arch_kdebugfs_init(void) 203static int __init arch_kdebugfs_init(void)
214{ 204{
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 10435a120d22..eedfaebe1063 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -46,7 +46,7 @@
46#include <asm/apicdef.h> 46#include <asm/apicdef.h>
47#include <asm/system.h> 47#include <asm/system.h>
48 48
49#include <mach_ipi.h> 49#include <asm/apic.h>
50 50
51/* 51/*
52 * Put the error code here just in case the user cares: 52 * Put the error code here just in case the user cares:
@@ -347,7 +347,7 @@ void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
347 */ 347 */
348void kgdb_roundup_cpus(unsigned long flags) 348void kgdb_roundup_cpus(unsigned long flags)
349{ 349{
350 send_IPI_allbutself(APIC_DM_NMI); 350 apic->send_IPI_allbutself(APIC_DM_NMI);
351} 351}
352#endif 352#endif
353 353
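
Note: the mach_ipi.h include and the bare send_IPI_allbutself() call go away because this kernel folds every APIC flavor into a single struct apic of function pointers, selected once at boot; kgdb then calls through the pointer. A reduced sketch of that indirection (hypothetical names; the real struct apic has dozens of members):

struct apic_sketch {
	const char	*name;
	void		(*send_IPI_allbutself)(int vector);
};

static void flat_send_IPI_allbutself(int vector)
{
	/* would broadcast the vector to every CPU but the current one */
}

static struct apic_sketch apic_flat_sketch = {
	.name			= "flat",
	.send_IPI_allbutself	= flat_send_IPI_allbutself,
};

/* boot code picks the model once ... */
static struct apic_sketch *apic = &apic_flat_sketch;

/* ... and callers such as kgdb_roundup_cpus() stay model-agnostic: */
static void roundup_sketch(void)
{
	apic->send_IPI_allbutself(0x02 /* NMI delivery, schematically */);
}
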
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index e948b28a5a9a..7b5169d2b000 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -193,6 +193,9 @@ static int __kprobes can_boost(kprobe_opcode_t *opcodes)
193 kprobe_opcode_t opcode; 193 kprobe_opcode_t opcode;
194 kprobe_opcode_t *orig_opcodes = opcodes; 194 kprobe_opcode_t *orig_opcodes = opcodes;
195 195
196 if (search_exception_tables((unsigned long)opcodes))
197 return 0; /* Page fault may occur on this address. */
198
196retry: 199retry:
197 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) 200 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
198 return 0; 201 return 0;
@@ -635,13 +638,13 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
635#else 638#else
636 " pushf\n" 639 " pushf\n"
637 /* 640 /*
638 * Skip cs, ip, orig_ax. 641 * Skip cs, ip, orig_ax and gs.
639 * trampoline_handler() will plug in these values 642 * trampoline_handler() will plug in these values
640 */ 643 */
641 " subl $12, %esp\n" 644 " subl $16, %esp\n"
642 " pushl %fs\n" 645 " pushl %fs\n"
643 " pushl %ds\n"
644 " pushl %es\n" 646 " pushl %es\n"
647 " pushl %ds\n"
645 " pushl %eax\n" 648 " pushl %eax\n"
646 " pushl %ebp\n" 649 " pushl %ebp\n"
647 " pushl %edi\n" 650 " pushl %edi\n"
@@ -652,10 +655,10 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
652 " movl %esp, %eax\n" 655 " movl %esp, %eax\n"
653 " call trampoline_handler\n" 656 " call trampoline_handler\n"
654 /* Move flags to cs */ 657 /* Move flags to cs */
655 " movl 52(%esp), %edx\n" 658 " movl 56(%esp), %edx\n"
656 " movl %edx, 48(%esp)\n" 659 " movl %edx, 52(%esp)\n"
657 /* Replace saved flags with true return address. */ 660 /* Replace saved flags with true return address. */
658 " movl %eax, 52(%esp)\n" 661 " movl %eax, 56(%esp)\n"
659 " popl %ebx\n" 662 " popl %ebx\n"
660 " popl %ecx\n" 663 " popl %ecx\n"
661 " popl %edx\n" 664 " popl %edx\n"
@@ -663,8 +666,8 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
663 " popl %edi\n" 666 " popl %edi\n"
664 " popl %ebp\n" 667 " popl %ebp\n"
665 " popl %eax\n" 668 " popl %eax\n"
666 /* Skip ip, orig_ax, es, ds, fs */ 669 /* Skip ds, es, fs, gs, orig_ax and ip */
667 " addl $20, %esp\n" 670 " addl $24, %esp\n"
668 " popf\n" 671 " popf\n"
669#endif 672#endif
670 " ret\n"); 673 " ret\n");
@@ -688,6 +691,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
688 regs->cs = __KERNEL_CS; 691 regs->cs = __KERNEL_CS;
689#else 692#else
690 regs->cs = __KERNEL_CS | get_kernel_rpl(); 693 regs->cs = __KERNEL_CS | get_kernel_rpl();
694 regs->gs = 0;
691#endif 695#endif
692 regs->ip = trampoline_address; 696 regs->ip = trampoline_address;
693 regs->orig_ax = ~0UL; 697 regs->orig_ax = ~0UL;
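
Note: the offset changes in the 32-bit kretprobe trampoline (subl $12 -> $16, 52 -> 56, addl $20 -> $24) all follow from pt_regs growing a gs slot. A quick user-space check of the arithmetic, assuming the full push sequence (the esi/edx/ecx/ebx pushes sit in context elided from the hunk):

#include <stdio.h>

int main(void)
{
	int slot    = 4;		/* 32-bit stack slots */
	int skipped = 4 * slot;		/* cs, ip, orig_ax and now gs */
	int segs    = 3 * slot;		/* pushl %fs, %es, %ds */
	int gpregs  = 7 * slot;		/* eax, ebp, edi, esi, edx, ecx, ebx */

	/* pushf ran first, so the saved flags sit above all of that */
	printf("flags at %d(%%esp)\n", skipped + segs + gpregs);	/* 56 */
	/* the final cleanup skips ds, es, fs, gs, orig_ax and ip */
	printf("addl $%d, %%esp\n", 6 * slot);				/* 24 */
	return 0;
}
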
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 478bca986eca..33019ddb56b4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -138,12 +138,6 @@ static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
138 kvm_mmu_write(ptep, pte_val(pte)); 138 kvm_mmu_write(ptep, pte_val(pte));
139} 139}
140 140
141static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
142 pte_t *ptep, pte_t pte)
143{
144 kvm_mmu_write(ptep, pte_val(pte));
145}
146
147static void kvm_pte_clear(struct mm_struct *mm, 141static void kvm_pte_clear(struct mm_struct *mm,
148 unsigned long addr, pte_t *ptep) 142 unsigned long addr, pte_t *ptep)
149{ 143{
@@ -220,7 +214,6 @@ static void paravirt_ops_setup(void)
220#if PAGETABLE_LEVELS >= 3 214#if PAGETABLE_LEVELS >= 3
221#ifdef CONFIG_X86_PAE 215#ifdef CONFIG_X86_PAE
222 pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic; 216 pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
223 pv_mmu_ops.set_pte_present = kvm_set_pte_present;
224 pv_mmu_ops.pte_clear = kvm_pte_clear; 217 pv_mmu_ops.pte_clear = kvm_pte_clear;
225 pv_mmu_ops.pmd_clear = kvm_pmd_clear; 218 pv_mmu_ops.pmd_clear = kvm_pmd_clear;
226#endif 219#endif
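
Note: set_pte_present can be dropped wholesale because it lives in pv_mmu_ops — the paravirt pattern where an ops table of function pointers starts out native and a guest overrides only the hooks it accelerates. A reduced, hypothetical sketch of that pattern (the real pv_mmu_ops is far larger):

#include <linux/mm_types.h>
#include <asm/pgtable.h>

struct mmu_ops_sketch {
	void (*set_pte)(pte_t *ptep, pte_t pte);
	void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
			  pte_t *ptep);
};

static void kvm_set_pte_sketch(pte_t *ptep, pte_t pte)
{
	/* a KVM-style hook would also notify the hypervisor here */
	*ptep = pte;
}

static struct mmu_ops_sketch mmu_ops_sketch; /* native defaults elsewhere */

static void paravirt_setup_sketch(void)
{
	mmu_ops_sketch.set_pte = kvm_set_pte_sketch; /* override one hook */
}
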
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 652fce6d2cce..137f2e8132df 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -19,7 +19,6 @@
19#include <linux/clocksource.h> 19#include <linux/clocksource.h>
20#include <linux/kvm_para.h> 20#include <linux/kvm_para.h>
21#include <asm/pvclock.h> 21#include <asm/pvclock.h>
22#include <asm/arch_hooks.h>
23#include <asm/msr.h> 22#include <asm/msr.h>
24#include <asm/apic.h> 23#include <asm/apic.h>
25#include <linux/percpu.h> 24#include <linux/percpu.h>
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 37f420018a41..e7368c1da01d 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -14,12 +14,12 @@
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15#include <linux/suspend.h> 15#include <linux/suspend.h>
16#include <linux/gfp.h> 16#include <linux/gfp.h>
17#include <linux/io.h>
17 18
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19#include <asm/pgalloc.h> 20#include <asm/pgalloc.h>
20#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
21#include <asm/mmu_context.h> 22#include <asm/mmu_context.h>
22#include <asm/io.h>
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <asm/cpufeature.h> 24#include <asm/cpufeature.h>
25#include <asm/desc.h> 25#include <asm/desc.h>
@@ -63,7 +63,7 @@ static void load_segments(void)
63 "\tmovl %%eax,%%fs\n" 63 "\tmovl %%eax,%%fs\n"
64 "\tmovl %%eax,%%gs\n" 64 "\tmovl %%eax,%%gs\n"
65 "\tmovl %%eax,%%ss\n" 65 "\tmovl %%eax,%%ss\n"
66 ::: "eax", "memory"); 66 : : : "eax", "memory");
67#undef STR 67#undef STR
68#undef __STR 68#undef __STR
69} 69}
@@ -121,7 +121,7 @@ static void machine_kexec_page_table_set_one(
121static void machine_kexec_prepare_page_tables(struct kimage *image) 121static void machine_kexec_prepare_page_tables(struct kimage *image)
122{ 122{
123 void *control_page; 123 void *control_page;
124 pmd_t *pmd = 0; 124 pmd_t *pmd = NULL;
125 125
126 control_page = page_address(image->control_code_page); 126 control_page = page_address(image->control_code_page);
127#ifdef CONFIG_X86_PAE 127#ifdef CONFIG_X86_PAE
@@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image)
205 205
206 if (image->preserve_context) { 206 if (image->preserve_context) {
207#ifdef CONFIG_X86_IO_APIC 207#ifdef CONFIG_X86_IO_APIC
208 /* We need to put APICs in legacy mode so that we can 208 /*
209 * We need to put APICs in legacy mode so that we can
209 * get timer interrupts in second kernel. kexec/kdump 210 * get timer interrupts in second kernel. kexec/kdump
210 * paths already have calls to disable_IO_APIC() in 211 * paths already have calls to disable_IO_APIC() in
211 * one form or other. kexec jump path also need 212 * one form or other. kexec jump path also need
@@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image)
227 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) 228 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
228 << PAGE_SHIFT); 229 << PAGE_SHIFT);
229 230
230 /* The segment registers are funny things, they have both a 231 /*
232 * The segment registers are funny things, they have both a
231 * visible and an invisible part. Whenever the visible part is 233 * visible and an invisible part. Whenever the visible part is
232 * set to a specific selector, the invisible part is loaded 234 * set to a specific selector, the invisible part is loaded
233 * with from a table in memory. At no other time is the 235 * with from a table in memory. At no other time is the
@@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image)
237 * segments, before I zap the gdt with an invalid value. 239 * segments, before I zap the gdt with an invalid value.
238 */ 240 */
239 load_segments(); 241 load_segments();
240 /* The gdt & idt are now invalid. 242 /*
243 * The gdt & idt are now invalid.
241 * If you want to load them you must set up your own idt & gdt. 244 * If you want to load them you must set up your own idt & gdt.
242 */ 245 */
243 set_gdt(phys_to_virt(0),0); 246 set_gdt(phys_to_virt(0), 0);
244 set_idt(phys_to_virt(0),0); 247 set_idt(phys_to_virt(0), 0);
245 248
246 /* now call it */ 249 /* now call it */
247 image->start = relocate_kernel_ptr((unsigned long)image->head, 250 image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index c43caa3a91f3..89cea4d44679 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -12,20 +12,47 @@
12#include <linux/reboot.h> 12#include <linux/reboot.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15#include <linux/io.h>
16#include <linux/suspend.h>
15 17
16#include <asm/pgtable.h> 18#include <asm/pgtable.h>
17#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
18#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
19#include <asm/io.h>
20 21
21#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) 22static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
22static u64 kexec_pgd[512] PAGE_ALIGNED; 23 unsigned long addr)
23static u64 kexec_pud0[512] PAGE_ALIGNED; 24{
24static u64 kexec_pmd0[512] PAGE_ALIGNED; 25 pud_t *pud;
25static u64 kexec_pte0[512] PAGE_ALIGNED; 26 pmd_t *pmd;
26static u64 kexec_pud1[512] PAGE_ALIGNED; 27 struct page *page;
27static u64 kexec_pmd1[512] PAGE_ALIGNED; 28 int result = -ENOMEM;
28static u64 kexec_pte1[512] PAGE_ALIGNED; 29
30 addr &= PMD_MASK;
31 pgd += pgd_index(addr);
32 if (!pgd_present(*pgd)) {
33 page = kimage_alloc_control_pages(image, 0);
34 if (!page)
35 goto out;
36 pud = (pud_t *)page_address(page);
37 memset(pud, 0, PAGE_SIZE);
38 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
39 }
40 pud = pud_offset(pgd, addr);
41 if (!pud_present(*pud)) {
42 page = kimage_alloc_control_pages(image, 0);
43 if (!page)
44 goto out;
45 pmd = (pmd_t *)page_address(page);
46 memset(pmd, 0, PAGE_SIZE);
47 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
48 }
49 pmd = pmd_offset(pud, addr);
50 if (!pmd_present(*pmd))
51 set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
52 result = 0;
53out:
54 return result;
55}
29 56
30static void init_level2_page(pmd_t *level2p, unsigned long addr) 57static void init_level2_page(pmd_t *level2p, unsigned long addr)
31{ 58{
@@ -92,9 +119,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p,
92 } 119 }
93 level3p = (pud_t *)page_address(page); 120 level3p = (pud_t *)page_address(page);
94 result = init_level3_page(image, level3p, addr, last_addr); 121 result = init_level3_page(image, level3p, addr, last_addr);
95 if (result) { 122 if (result)
96 goto out; 123 goto out;
97 }
98 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); 124 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
99 addr += PGDIR_SIZE; 125 addr += PGDIR_SIZE;
100 } 126 }
@@ -107,12 +133,72 @@ out:
107 return result; 133 return result;
108} 134}
109 135
136static void free_transition_pgtable(struct kimage *image)
137{
138 free_page((unsigned long)image->arch.pud);
139 free_page((unsigned long)image->arch.pmd);
140 free_page((unsigned long)image->arch.pte);
141}
142
143static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
144{
145 pud_t *pud;
146 pmd_t *pmd;
147 pte_t *pte;
148 unsigned long vaddr, paddr;
149 int result = -ENOMEM;
150
151 vaddr = (unsigned long)relocate_kernel;
152 paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
153 pgd += pgd_index(vaddr);
154 if (!pgd_present(*pgd)) {
155 pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
156 if (!pud)
157 goto err;
158 image->arch.pud = pud;
159 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
160 }
161 pud = pud_offset(pgd, vaddr);
162 if (!pud_present(*pud)) {
163 pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
164 if (!pmd)
165 goto err;
166 image->arch.pmd = pmd;
167 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
168 }
169 pmd = pmd_offset(pud, vaddr);
170 if (!pmd_present(*pmd)) {
171 pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
172 if (!pte)
173 goto err;
174 image->arch.pte = pte;
175 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
176 }
177 pte = pte_offset_kernel(pmd, vaddr);
178 set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
179 return 0;
180err:
181 free_transition_pgtable(image);
182 return result;
183}
184
110 185
111static int init_pgtable(struct kimage *image, unsigned long start_pgtable) 186static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
112{ 187{
113 pgd_t *level4p; 188 pgd_t *level4p;
189 int result;
114 level4p = (pgd_t *)__va(start_pgtable); 190 level4p = (pgd_t *)__va(start_pgtable);
115 return init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); 191 result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
192 if (result)
193 return result;
194 /*
195 * image->start may be outside 0 ~ max_pfn, for example when
196 * jump back to original kernel from kexeced kernel
197 */
198 result = init_one_level2_page(image, level4p, image->start);
199 if (result)
200 return result;
201 return init_transition_pgtable(image, level4p);
116} 202}
117 203
118static void set_idt(void *newidt, u16 limit) 204static void set_idt(void *newidt, u16 limit)
@@ -174,7 +260,7 @@ int machine_kexec_prepare(struct kimage *image)
174 260
175void machine_kexec_cleanup(struct kimage *image) 261void machine_kexec_cleanup(struct kimage *image)
176{ 262{
177 return; 263 free_transition_pgtable(image);
178} 264}
179 265
180/* 266/*
@@ -185,36 +271,45 @@ void machine_kexec(struct kimage *image)
185{ 271{
186 unsigned long page_list[PAGES_NR]; 272 unsigned long page_list[PAGES_NR];
187 void *control_page; 273 void *control_page;
274 int save_ftrace_enabled;
188 275
189 tracer_disable(); 276#ifdef CONFIG_KEXEC_JUMP
277 if (kexec_image->preserve_context)
278 save_processor_state();
279#endif
280
281 save_ftrace_enabled = __ftrace_enabled_save();
190 282
191 /* Interrupts aren't acceptable while we reboot */ 283 /* Interrupts aren't acceptable while we reboot */
192 local_irq_disable(); 284 local_irq_disable();
193 285
286 if (image->preserve_context) {
287#ifdef CONFIG_X86_IO_APIC
288 /*
289 * We need to put APICs in legacy mode so that we can
290 * get timer interrupts in second kernel. kexec/kdump
291 * paths already have calls to disable_IO_APIC() in
292 * one form or other. kexec jump path also need
293 * one.
294 */
295 disable_IO_APIC();
296#endif
297 }
298
194 control_page = page_address(image->control_code_page) + PAGE_SIZE; 299 control_page = page_address(image->control_code_page) + PAGE_SIZE;
195 memcpy(control_page, relocate_kernel, PAGE_SIZE); 300 memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
196 301
197 page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); 302 page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
198 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; 303 page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
199 page_list[PA_PGD] = virt_to_phys(&kexec_pgd);
200 page_list[VA_PGD] = (unsigned long)kexec_pgd;
201 page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0);
202 page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
203 page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0);
204 page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
205 page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0);
206 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
207 page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1);
208 page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
209 page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1);
210 page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
211 page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1);
212 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
213
214 page_list[PA_TABLE_PAGE] = 304 page_list[PA_TABLE_PAGE] =
215 (unsigned long)__pa(page_address(image->control_code_page)); 305 (unsigned long)__pa(page_address(image->control_code_page));
216 306
217 /* The segment registers are funny things, they have both a 307 if (image->type == KEXEC_TYPE_DEFAULT)
308 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
309 << PAGE_SHIFT);
310
311 /*
312 * The segment registers are funny things, they have both a
218 * visible and an invisible part. Whenever the visible part is 313 * visible and an invisible part. Whenever the visible part is
219 * set to a specific selector, the invisible part is loaded 314 * set to a specific selector, the invisible part is loaded
220 * with from a table in memory. At no other time is the 315 * with from a table in memory. At no other time is the
@@ -224,15 +319,25 @@ void machine_kexec(struct kimage *image)
224 * segments, before I zap the gdt with an invalid value. 319 * segments, before I zap the gdt with an invalid value.
225 */ 320 */
226 load_segments(); 321 load_segments();
227 /* The gdt & idt are now invalid. 322 /*
323 * The gdt & idt are now invalid.
228 * If you want to load them you must set up your own idt & gdt. 324 * If you want to load them you must set up your own idt & gdt.
229 */ 325 */
230 set_gdt(phys_to_virt(0),0); 326 set_gdt(phys_to_virt(0), 0);
231 set_idt(phys_to_virt(0),0); 327 set_idt(phys_to_virt(0), 0);
232 328
233 /* now call it */ 329 /* now call it */
234 relocate_kernel((unsigned long)image->head, (unsigned long)page_list, 330 image->start = relocate_kernel((unsigned long)image->head,
235 image->start); 331 (unsigned long)page_list,
332 image->start,
333 image->preserve_context);
334
335#ifdef CONFIG_KEXEC_JUMP
336 if (kexec_image->preserve_context)
337 restore_processor_state();
338#endif
339
340 __ftrace_enabled_restore(save_ftrace_enabled);
236} 341}
237 342
238void arch_crash_save_vmcoreinfo(void) 343void arch_crash_save_vmcoreinfo(void)
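
Note: init_one_level2_page() exists because image->start can lie outside the 0..max_pfn identity map (the jump-back-to-the-original-kernel case), so one extra 2 MiB PMD mapping is stitched in for it. The descent is pure index arithmetic; a user-space illustration of where an address lands, using the architectural x86-64 shifts (the address is a made-up example):

#include <stdio.h>
#include <stdint.h>

#define PMD_SHIFT 21	/* 2 MiB large-page mappings */
#define PUD_SHIFT 30
#define PGD_SHIFT 39
#define IDX(a, s) (((a) >> (s)) & 0x1ff)	/* 512 entries per table */

int main(void)
{
	uint64_t addr = 0x100000000ULL;	/* hypothetical start above 4 GiB */
	uint64_t base = addr & ~((1ULL << PMD_SHIFT) - 1); /* addr &= PMD_MASK */

	printf("pgd[%llu] -> pud[%llu] -> pmd[%llu] maps 2 MiB at %#llx\n",
	       (unsigned long long)IDX(addr, PGD_SHIFT),
	       (unsigned long long)IDX(addr, PUD_SHIFT),
	       (unsigned long long)IDX(addr, PMD_SHIFT),
	       (unsigned long long)base);
	return 0;
}
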
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 2dc183758be3..845d80ce1ef1 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -51,7 +51,6 @@
51#include <linux/ioport.h> 51#include <linux/ioport.h>
52#include <asm/uaccess.h> 52#include <asm/uaccess.h>
53#include <linux/init.h> 53#include <linux/init.h>
54#include <asm/arch_hooks.h>
55 54
56static unsigned char which_scsi; 55static unsigned char which_scsi;
57 56
@@ -474,6 +473,4 @@ void __kprobes mca_handle_nmi(void)
474 * adapter was responsible for the error. 473 * adapter was responsible for the error.
475 */ 474 */
476 bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback); 475 bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
477 476}
478 mca_nmi_hook();
479} /* mca_handle_nmi */
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 8815f3c7fec7..846510b78a09 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -348,7 +348,6 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id)
348static struct irqaction mfgptirq = { 348static struct irqaction mfgptirq = {
349 .handler = mfgpt_tick, 349 .handler = mfgpt_tick,
350 .flags = IRQF_DISABLED | IRQF_NOBALANCING, 350 .flags = IRQF_DISABLED | IRQF_NOBALANCING,
351 .mask = CPU_MASK_NONE,
352 .name = "mfgpt-timer" 351 .name = "mfgpt-timer"
353}; 352};
354 353
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index c25fdb382292..453b5795a5c6 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -12,31 +12,30 @@
12 * 12 *
13 * Licensed under the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15*/ 15 */
16 16#include <linux/platform_device.h>
17#include <linux/capability.h> 17#include <linux/capability.h>
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/sched.h>
21#include <linux/cpumask.h>
22#include <linux/module.h>
23#include <linux/slab.h>
24#include <linux/vmalloc.h>
25#include <linux/miscdevice.h> 18#include <linux/miscdevice.h>
19#include <linux/firmware.h>
26#include <linux/spinlock.h> 20#include <linux/spinlock.h>
27#include <linux/mm.h> 21#include <linux/cpumask.h>
28#include <linux/fs.h> 22#include <linux/pci_ids.h>
23#include <linux/uaccess.h>
24#include <linux/vmalloc.h>
25#include <linux/kernel.h>
26#include <linux/module.h>
29#include <linux/mutex.h> 27#include <linux/mutex.h>
28#include <linux/sched.h>
29#include <linux/init.h>
30#include <linux/slab.h>
30#include <linux/cpu.h> 31#include <linux/cpu.h>
31#include <linux/firmware.h>
32#include <linux/platform_device.h>
33#include <linux/pci.h> 32#include <linux/pci.h>
34#include <linux/pci_ids.h> 33#include <linux/fs.h>
35#include <linux/uaccess.h> 34#include <linux/mm.h>
36 35
37#include <asm/msr.h>
38#include <asm/processor.h>
39#include <asm/microcode.h> 36#include <asm/microcode.h>
37#include <asm/processor.h>
38#include <asm/msr.h>
40 39
41MODULE_DESCRIPTION("AMD Microcode Update Driver"); 40MODULE_DESCRIPTION("AMD Microcode Update Driver");
42MODULE_AUTHOR("Peter Oruba"); 41MODULE_AUTHOR("Peter Oruba");
@@ -72,8 +71,8 @@ struct microcode_header_amd {
72} __attribute__((packed)); 71} __attribute__((packed));
73 72
74struct microcode_amd { 73struct microcode_amd {
75 struct microcode_header_amd hdr; 74 struct microcode_header_amd hdr;
76 unsigned int mpb[0]; 75 unsigned int mpb[0];
77}; 76};
78 77
79#define UCODE_MAX_SIZE 2048 78#define UCODE_MAX_SIZE 2048
@@ -184,8 +183,8 @@ static int get_ucode_data(void *to, const u8 *from, size_t n)
184 return 0; 183 return 0;
185} 184}
186 185
187static void *get_next_ucode(const u8 *buf, unsigned int size, 186static void *
188 unsigned int *mc_size) 187get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
189{ 188{
190 unsigned int total_size; 189 unsigned int total_size;
191 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 190 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
@@ -223,7 +222,6 @@ static void *get_next_ucode(const u8 *buf, unsigned int size,
223 return mc; 222 return mc;
224} 223}
225 224
226
227static int install_equiv_cpu_table(const u8 *buf) 225static int install_equiv_cpu_table(const u8 *buf)
228{ 226{
229 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; 227 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
@@ -372,4 +370,3 @@ struct microcode_ops * __init init_amd_microcode(void)
372{ 370{
373 return &microcode_amd_ops; 371 return &microcode_amd_ops;
374} 372}
375
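
Note: struct microcode_amd above keeps the mpb[0] trailing member — the GNU zero-length-array idiom for a fixed header followed by a variable payload in a single allocation. A user-space illustration of the same idiom:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct blob {
	unsigned int len;
	unsigned char data[0];	/* GNU C; plain C99 would use data[] */
};

int main(void)
{
	const char payload[] = "microcode bits";
	struct blob *b = malloc(sizeof(*b) + sizeof(payload));

	if (!b)
		return 1;
	b->len = sizeof(payload);
	memcpy(b->data, payload, b->len);	/* payload follows the header */
	printf("%u bytes stored inline after the header\n", b->len);
	free(b);
	return 0;
}
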
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index c9b721ba968c..a0f3851ef310 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -70,67 +70,78 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73#include <linux/platform_device.h>
73#include <linux/capability.h> 74#include <linux/capability.h>
74#include <linux/kernel.h> 75#include <linux/miscdevice.h>
75#include <linux/init.h> 76#include <linux/firmware.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h> 77#include <linux/smp_lock.h>
78#include <linux/spinlock.h>
78#include <linux/cpumask.h> 79#include <linux/cpumask.h>
79#include <linux/module.h> 80#include <linux/uaccess.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h> 81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h> 82#include <linux/kernel.h>
83#include <linux/spinlock.h> 83#include <linux/module.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h> 84#include <linux/mutex.h>
85#include <linux/sched.h>
86#include <linux/init.h>
87#include <linux/slab.h>
87#include <linux/cpu.h> 88#include <linux/cpu.h>
88#include <linux/firmware.h> 89#include <linux/fs.h>
89#include <linux/platform_device.h> 90#include <linux/mm.h>
90 91
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h> 92#include <asm/microcode.h>
93#include <asm/processor.h>
94#include <asm/msr.h>
95 95
96MODULE_DESCRIPTION("Microcode Update Driver"); 96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
98MODULE_LICENSE("GPL"); 98MODULE_LICENSE("GPL");
99 99
100#define MICROCODE_VERSION "2.00" 100#define MICROCODE_VERSION "2.00"
101 101
102static struct microcode_ops *microcode_ops; 102static struct microcode_ops *microcode_ops;
103 103
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ 104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
105static DEFINE_MUTEX(microcode_mutex); 105static DEFINE_MUTEX(microcode_mutex);
106 106
107struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; 107struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
108EXPORT_SYMBOL_GPL(ucode_cpu_info); 108EXPORT_SYMBOL_GPL(ucode_cpu_info);
109 109
110#ifdef CONFIG_MICROCODE_OLD_INTERFACE 110#ifdef CONFIG_MICROCODE_OLD_INTERFACE
111struct update_for_cpu {
112 const void __user *buf;
113 size_t size;
114};
115
116static long update_for_cpu(void *_ufc)
117{
118 struct update_for_cpu *ufc = _ufc;
119 int error;
120
121 error = microcode_ops->request_microcode_user(smp_processor_id(),
122 ufc->buf, ufc->size);
123 if (error < 0)
124 return error;
125 if (!error)
126 microcode_ops->apply_microcode(smp_processor_id());
127 return error;
128}
129
111static int do_microcode_update(const void __user *buf, size_t size) 130static int do_microcode_update(const void __user *buf, size_t size)
112{ 131{
113 cpumask_t old;
114 int error = 0; 132 int error = 0;
115 int cpu; 133 int cpu;
116 134 struct update_for_cpu ufc = { .buf = buf, .size = size };
117 old = current->cpus_allowed;
118 135
119 for_each_online_cpu(cpu) { 136 for_each_online_cpu(cpu) {
120 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 137 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
121 138
122 if (!uci->valid) 139 if (!uci->valid)
123 continue; 140 continue;
124 141 error = work_on_cpu(cpu, update_for_cpu, &ufc);
125 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
126 error = microcode_ops->request_microcode_user(cpu, buf, size);
127 if (error < 0) 142 if (error < 0)
128 goto out; 143 break;
129 if (!error)
130 microcode_ops->apply_microcode(cpu);
131 } 144 }
132out:
133 set_cpus_allowed_ptr(current, &old);
134 return error; 145 return error;
135} 146}
136 147
@@ -198,18 +209,33 @@ static void microcode_dev_exit(void)
198 209
199MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); 210MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
200#else 211#else
201#define microcode_dev_init() 0 212#define microcode_dev_init() 0
202#define microcode_dev_exit() do { } while (0) 213#define microcode_dev_exit() do { } while (0)
203#endif 214#endif
204 215
205/* fake device for request_firmware */ 216/* fake device for request_firmware */
206static struct platform_device *microcode_pdev; 217static struct platform_device *microcode_pdev;
218
219static long reload_for_cpu(void *unused)
220{
221 struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
222 int err = 0;
223
224 mutex_lock(&microcode_mutex);
225 if (uci->valid) {
226 err = microcode_ops->request_microcode_fw(smp_processor_id(),
227 &microcode_pdev->dev);
228 if (!err)
229 microcode_ops->apply_microcode(smp_processor_id());
230 }
231 mutex_unlock(&microcode_mutex);
232 return err;
233}
207 234
208static ssize_t reload_store(struct sys_device *dev, 235static ssize_t reload_store(struct sys_device *dev,
209 struct sysdev_attribute *attr, 236 struct sysdev_attribute *attr,
210 const char *buf, size_t sz) 237 const char *buf, size_t sz)
211{ 238{
212 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
213 char *end; 239 char *end;
214 unsigned long val = simple_strtoul(buf, &end, 0); 240 unsigned long val = simple_strtoul(buf, &end, 0);
215 int err = 0; 241 int err = 0;
@@ -218,21 +244,9 @@ static ssize_t reload_store(struct sys_device *dev,
218 if (end == buf) 244 if (end == buf)
219 return -EINVAL; 245 return -EINVAL;
220 if (val == 1) { 246 if (val == 1) {
221 cpumask_t old = current->cpus_allowed;
222
223 get_online_cpus(); 247 get_online_cpus();
224 if (cpu_online(cpu)) { 248 if (cpu_online(cpu))
225 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 249 err = work_on_cpu(cpu, reload_for_cpu, NULL);
226 mutex_lock(&microcode_mutex);
227 if (uci->valid) {
228 err = microcode_ops->request_microcode_fw(cpu,
229 &microcode_pdev->dev);
230 if (!err)
231 microcode_ops->apply_microcode(cpu);
232 }
233 mutex_unlock(&microcode_mutex);
234 set_cpus_allowed_ptr(current, &old);
235 }
236 put_online_cpus(); 250 put_online_cpus();
237 } 251 }
238 if (err) 252 if (err)
@@ -268,8 +282,8 @@ static struct attribute *mc_default_attrs[] = {
268}; 282};
269 283
270static struct attribute_group mc_attr_group = { 284static struct attribute_group mc_attr_group = {
271 .attrs = mc_default_attrs, 285 .attrs = mc_default_attrs,
272 .name = "microcode", 286 .name = "microcode",
273}; 287};
274 288
275static void __microcode_fini_cpu(int cpu) 289static void __microcode_fini_cpu(int cpu)
@@ -328,9 +342,9 @@ static int microcode_resume_cpu(int cpu)
328 return 0; 342 return 0;
329} 343}
330 344
331static void microcode_update_cpu(int cpu) 345static long microcode_update_cpu(void *unused)
332{ 346{
333 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 347 struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
334 int err = 0; 348 int err = 0;
335 349
336 /* 350 /*
@@ -338,30 +352,27 @@ static void microcode_update_cpu(int cpu)
338 * otherwise just request a firmware: 352 * otherwise just request a firmware:
339 */ 353 */
340 if (uci->valid) { 354 if (uci->valid) {
341 err = microcode_resume_cpu(cpu); 355 err = microcode_resume_cpu(smp_processor_id());
342 } else { 356 } else {
343 collect_cpu_info(cpu); 357 collect_cpu_info(smp_processor_id());
344 if (uci->valid && system_state == SYSTEM_RUNNING) 358 if (uci->valid && system_state == SYSTEM_RUNNING)
345 err = microcode_ops->request_microcode_fw(cpu, 359 err = microcode_ops->request_microcode_fw(
360 smp_processor_id(),
346 &microcode_pdev->dev); 361 &microcode_pdev->dev);
347 } 362 }
348 if (!err) 363 if (!err)
349 microcode_ops->apply_microcode(cpu); 364 microcode_ops->apply_microcode(smp_processor_id());
365 return err;
350} 366}
351 367
352static void microcode_init_cpu(int cpu) 368static int microcode_init_cpu(int cpu)
353{ 369{
354 cpumask_t old = current->cpus_allowed; 370 int err;
355
356 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
357 /* We should bind the task to the CPU */
358 BUG_ON(raw_smp_processor_id() != cpu);
359
360 mutex_lock(&microcode_mutex); 371 mutex_lock(&microcode_mutex);
361 microcode_update_cpu(cpu); 372 err = work_on_cpu(cpu, microcode_update_cpu, NULL);
362 mutex_unlock(&microcode_mutex); 373 mutex_unlock(&microcode_mutex);
363 374
364 set_cpus_allowed_ptr(current, &old); 375 return err;
365} 376}
366 377
367static int mc_sysdev_add(struct sys_device *sys_dev) 378static int mc_sysdev_add(struct sys_device *sys_dev)
@@ -379,8 +390,11 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
379 if (err) 390 if (err)
380 return err; 391 return err;
381 392
382 microcode_init_cpu(cpu); 393 err = microcode_init_cpu(cpu);
383 return 0; 394 if (err)
395 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
396
397 return err;
384} 398}
385 399
386static int mc_sysdev_remove(struct sys_device *sys_dev) 400static int mc_sysdev_remove(struct sys_device *sys_dev)
@@ -404,14 +418,14 @@ static int mc_sysdev_resume(struct sys_device *dev)
404 return 0; 418 return 0;
405 419
406 /* only CPU 0 will apply ucode here */ 420 /* only CPU 0 will apply ucode here */
407 microcode_update_cpu(0); 421 microcode_update_cpu(NULL);
408 return 0; 422 return 0;
409} 423}
410 424
411static struct sysdev_driver mc_sysdev_driver = { 425static struct sysdev_driver mc_sysdev_driver = {
412 .add = mc_sysdev_add, 426 .add = mc_sysdev_add,
413 .remove = mc_sysdev_remove, 427 .remove = mc_sysdev_remove,
414 .resume = mc_sysdev_resume, 428 .resume = mc_sysdev_resume,
415}; 429};
416 430
417static __cpuinit int 431static __cpuinit int
@@ -424,7 +438,9 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
424 switch (action) { 438 switch (action) {
425 case CPU_ONLINE: 439 case CPU_ONLINE:
426 case CPU_ONLINE_FROZEN: 440 case CPU_ONLINE_FROZEN:
427 microcode_init_cpu(cpu); 441 if (microcode_init_cpu(cpu))
442 printk(KERN_ERR "microcode: failed to init CPU%d\n",
443 cpu);
428 case CPU_DOWN_FAILED: 444 case CPU_DOWN_FAILED:
429 case CPU_DOWN_FAILED_FROZEN: 445 case CPU_DOWN_FAILED_FROZEN:
430 pr_debug("microcode: CPU%d added\n", cpu); 446 pr_debug("microcode: CPU%d added\n", cpu);
@@ -448,7 +464,7 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
448} 464}
449 465
450static struct notifier_block __refdata mc_cpu_notifier = { 466static struct notifier_block __refdata mc_cpu_notifier = {
451 .notifier_call = mc_cpu_callback, 467 .notifier_call = mc_cpu_callback,
452}; 468};
453 469
454static int __init microcode_init(void) 470static int __init microcode_init(void)
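
Note: the recurring change in microcode_core.c is mechanical — every save-cpus_allowed / set_cpus_allowed_ptr(target) / work / restore dance becomes a callback handed to work_on_cpu(), which runs it on the target CPU and waits for its return value. A minimal sketch with a hypothetical callback, assuming the caller already holds get_online_cpus() as the hunks above do:

#include <linux/workqueue.h>
#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/smp.h>

static long ping_cpu(void *data)
{
	/* executes on the CPU given to work_on_cpu() */
	pr_info("hello from CPU%d\n", smp_processor_id());
	return 0;
}

static int ping_all_online(void)
{
	int cpu, err = 0;

	for_each_online_cpu(cpu) {
		err = work_on_cpu(cpu, ping_cpu, NULL);
		if (err < 0)
			break;
	}
	return err;
}
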
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index b7f4c929e615..149b9ec7c1ab 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,28 +70,28 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73#include <linux/platform_device.h>
73#include <linux/capability.h> 74#include <linux/capability.h>
74#include <linux/kernel.h> 75#include <linux/miscdevice.h>
75#include <linux/init.h> 76#include <linux/firmware.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h> 77#include <linux/smp_lock.h>
78#include <linux/spinlock.h>
78#include <linux/cpumask.h> 79#include <linux/cpumask.h>
79#include <linux/module.h> 80#include <linux/uaccess.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h> 81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h> 82#include <linux/kernel.h>
83#include <linux/spinlock.h> 83#include <linux/module.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h> 84#include <linux/mutex.h>
85#include <linux/sched.h>
86#include <linux/init.h>
87#include <linux/slab.h>
87#include <linux/cpu.h> 88#include <linux/cpu.h>
88#include <linux/firmware.h> 89#include <linux/fs.h>
89#include <linux/platform_device.h> 90#include <linux/mm.h>
90 91
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h> 92#include <asm/microcode.h>
93#include <asm/processor.h>
94#include <asm/msr.h>
95 95
96MODULE_DESCRIPTION("Microcode Update Driver"); 96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -129,12 +129,13 @@ struct extended_sigtable {
129 struct extended_signature sigs[0]; 129 struct extended_signature sigs[0];
130}; 130};
131 131
132#define DEFAULT_UCODE_DATASIZE (2000) 132#define DEFAULT_UCODE_DATASIZE (2000)
133#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) 133#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
134#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) 134#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
135#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) 135#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
136#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) 136#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
137#define DWSIZE (sizeof(u32)) 137#define DWSIZE (sizeof(u32))
138
138#define get_totalsize(mc) \ 139#define get_totalsize(mc) \
139 (((struct microcode_intel *)mc)->hdr.totalsize ? \ 140 (((struct microcode_intel *)mc)->hdr.totalsize ? \
140 ((struct microcode_intel *)mc)->hdr.totalsize : \ 141 ((struct microcode_intel *)mc)->hdr.totalsize : \
@@ -196,31 +197,32 @@ static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
196 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1; 197 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
197} 198}
198 199
199static inline int 200static inline int
200update_match_revision(struct microcode_header_intel *mc_header, int rev) 201update_match_revision(struct microcode_header_intel *mc_header, int rev)
201{ 202{
202 return (mc_header->rev <= rev) ? 0 : 1; 203 return (mc_header->rev <= rev) ? 0 : 1;
203} 204}
204 205
205static int microcode_sanity_check(void *mc) 206static int microcode_sanity_check(void *mc)
206{ 207{
208 unsigned long total_size, data_size, ext_table_size;
207 struct microcode_header_intel *mc_header = mc; 209 struct microcode_header_intel *mc_header = mc;
208 struct extended_sigtable *ext_header = NULL; 210 struct extended_sigtable *ext_header = NULL;
209 struct extended_signature *ext_sig;
210 unsigned long total_size, data_size, ext_table_size;
211 int sum, orig_sum, ext_sigcount = 0, i; 211 int sum, orig_sum, ext_sigcount = 0, i;
212 struct extended_signature *ext_sig;
212 213
213 total_size = get_totalsize(mc_header); 214 total_size = get_totalsize(mc_header);
214 data_size = get_datasize(mc_header); 215 data_size = get_datasize(mc_header);
216
215 if (data_size + MC_HEADER_SIZE > total_size) { 217 if (data_size + MC_HEADER_SIZE > total_size) {
216 printk(KERN_ERR "microcode: error! " 218 printk(KERN_ERR "microcode: error! "
217 "Bad data size in microcode data file\n"); 219 "Bad data size in microcode data file\n");
218 return -EINVAL; 220 return -EINVAL;
219 } 221 }
220 222
221 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { 223 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
222 printk(KERN_ERR "microcode: error! " 224 printk(KERN_ERR "microcode: error! "
223 "Unknown microcode update format\n"); 225 "Unknown microcode update format\n");
224 return -EINVAL; 226 return -EINVAL;
225 } 227 }
226 ext_table_size = total_size - (MC_HEADER_SIZE + data_size); 228 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
@@ -318,11 +320,15 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
318 320
319static void apply_microcode(int cpu) 321static void apply_microcode(int cpu)
320{ 322{
323 struct microcode_intel *mc_intel;
324 struct ucode_cpu_info *uci;
321 unsigned long flags; 325 unsigned long flags;
322 unsigned int val[2]; 326 unsigned int val[2];
323 int cpu_num = raw_smp_processor_id(); 327 int cpu_num;
324 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 328
325 struct microcode_intel *mc_intel = uci->mc; 329 cpu_num = raw_smp_processor_id();
330 uci = ucode_cpu_info + cpu;
331 mc_intel = uci->mc;
326 332
327 /* We should bind the task to the CPU */ 333 /* We should bind the task to the CPU */
328 BUG_ON(cpu_num != cpu); 334 BUG_ON(cpu_num != cpu);
@@ -348,15 +354,17 @@ static void apply_microcode(int cpu)
348 spin_unlock_irqrestore(&microcode_update_lock, flags); 354 spin_unlock_irqrestore(&microcode_update_lock, flags);
349 if (val[1] != mc_intel->hdr.rev) { 355 if (val[1] != mc_intel->hdr.rev) {
350 printk(KERN_ERR "microcode: CPU%d update from revision " 356 printk(KERN_ERR "microcode: CPU%d update from revision "
351 "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]); 357 "0x%x to 0x%x failed\n",
358 cpu_num, uci->cpu_sig.rev, val[1]);
352 return; 359 return;
353 } 360 }
354 printk(KERN_INFO "microcode: CPU%d updated from revision " 361 printk(KERN_INFO "microcode: CPU%d updated from revision "
355 "0x%x to 0x%x, date = %04x-%02x-%02x \n", 362 "0x%x to 0x%x, date = %04x-%02x-%02x \n",
356 cpu_num, uci->cpu_sig.rev, val[1], 363 cpu_num, uci->cpu_sig.rev, val[1],
357 mc_intel->hdr.date & 0xffff, 364 mc_intel->hdr.date & 0xffff,
358 mc_intel->hdr.date >> 24, 365 mc_intel->hdr.date >> 24,
359 (mc_intel->hdr.date >> 16) & 0xff); 366 (mc_intel->hdr.date >> 16) & 0xff);
367
360 uci->cpu_sig.rev = val[1]; 368 uci->cpu_sig.rev = val[1];
361} 369}
362 370
@@ -404,18 +412,23 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
404 leftover -= mc_size; 412 leftover -= mc_size;
405 } 413 }
406 414
407 if (new_mc) { 415 if (!new_mc)
408 if (!leftover) { 416 goto out;
409 if (uci->mc) 417
410 vfree(uci->mc); 418 if (leftover) {
411 uci->mc = (struct microcode_intel *)new_mc; 419 vfree(new_mc);
412 pr_debug("microcode: CPU%d found a matching microcode update with" 420 goto out;
413 " version 0x%x (current=0x%x)\n",
414 cpu, new_rev, uci->cpu_sig.rev);
415 } else
416 vfree(new_mc);
417 } 421 }
418 422
423 if (uci->mc)
424 vfree(uci->mc);
425 uci->mc = (struct microcode_intel *)new_mc;
426
427 pr_debug("microcode: CPU%d found a matching microcode update with"
428 " version 0x%x (current=0x%x)\n",
429 cpu, new_rev, uci->cpu_sig.rev);
430
431 out:
419 return (int)leftover; 432 return (int)leftover;
420} 433}
421 434
@@ -442,8 +455,8 @@ static int request_microcode_fw(int cpu, struct device *device)
442 return ret; 455 return ret;
443 } 456 }
444 457
445 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, 458 ret = generic_load_microcode(cpu, (void *)firmware->data,
446 &get_ucode_fw); 459 firmware->size, &get_ucode_fw);
447 460
448 release_firmware(firmware); 461 release_firmware(firmware);
449 462
@@ -460,7 +473,7 @@ static int request_microcode_user(int cpu, const void __user *buf, size_t size)
460 /* We should bind the task to the CPU */ 473 /* We should bind the task to the CPU */
461 BUG_ON(cpu != raw_smp_processor_id()); 474 BUG_ON(cpu != raw_smp_processor_id());
462 475
463 return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user); 476 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
464} 477}
465 478
466static void microcode_fini_cpu(int cpu) 479static void microcode_fini_cpu(int cpu)
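
Note: one detail worth decoding in apply_microcode() above — the Intel microcode header packs its date BCD-style as MMDDYYYY, which is why the printk prints year, month and day with %x rather than %u. A user-space check with a made-up date:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t date = 0x09032009;	/* hypothetical: 2009-09-03 */

	printf("%04x-%02x-%02x\n",
	       date & 0xffff,		/* year  */
	       date >> 24,		/* month */
	       (date >> 16) & 0xff);	/* day   */
	return 0;			/* prints 2009-09-03 */
}
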
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 666e43df51f9..712d15fdc416 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -226,7 +226,7 @@ static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
226 return 0; 226 return 0;
227} 227}
228 228
229static struct dmi_system_id __devinitdata mmconf_dmi_table[] = { 229static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
230 { 230 {
231 .callback = set_check_enable_amd_mmconf, 231 .callback = set_check_enable_amd_mmconf,
232 .ident = "Sun Microsystems Machine", 232 .ident = "Sun Microsystems Machine",
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
index 3db0a5442eb1..0edd819050e7 100644
--- a/arch/x86/kernel/module_32.c
+++ b/arch/x86/kernel/module_32.c
@@ -42,7 +42,7 @@ void module_free(struct module *mod, void *module_region)
42{ 42{
43 vfree(module_region); 43 vfree(module_region);
44 /* FIXME: If module_region == mod->init_region, trim exception 44 /* FIXME: If module_region == mod->init_region, trim exception
45 table entries. */ 45 table entries. */
46} 46}
47 47
48/* We don't need anything special. */ 48/* We don't need anything special. */
@@ -113,13 +113,13 @@ int module_finalize(const Elf_Ehdr *hdr,
113 *para = NULL; 113 *para = NULL;
114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
115 115
116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
117 if (!strcmp(".text", secstrings + s->sh_name)) 117 if (!strcmp(".text", secstrings + s->sh_name))
118 text = s; 118 text = s;
119 if (!strcmp(".altinstructions", secstrings + s->sh_name)) 119 if (!strcmp(".altinstructions", secstrings + s->sh_name))
120 alt = s; 120 alt = s;
121 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 121 if (!strcmp(".smp_locks", secstrings + s->sh_name))
122 locks= s; 122 locks = s;
123 if (!strcmp(".parainstructions", secstrings + s->sh_name)) 123 if (!strcmp(".parainstructions", secstrings + s->sh_name))
124 para = s; 124 para = s;
125 } 125 }
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index 6ba87830d4b1..c23880b90b5c 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -30,14 +30,14 @@
30#include <asm/page.h> 30#include <asm/page.h>
31#include <asm/pgtable.h> 31#include <asm/pgtable.h>
32 32
33#define DEBUGP(fmt...) 33#define DEBUGP(fmt...)
34 34
35#ifndef CONFIG_UML 35#ifndef CONFIG_UML
36void module_free(struct module *mod, void *module_region) 36void module_free(struct module *mod, void *module_region)
37{ 37{
38 vfree(module_region); 38 vfree(module_region);
39 /* FIXME: If module_region == mod->init_region, trim exception 39 /* FIXME: If module_region == mod->init_region, trim exception
40 table entries. */ 40 table entries. */
41} 41}
42 42
43void *module_alloc(unsigned long size) 43void *module_alloc(unsigned long size)
@@ -77,7 +77,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
77 Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; 77 Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
78 Elf64_Sym *sym; 78 Elf64_Sym *sym;
79 void *loc; 79 void *loc;
80 u64 val; 80 u64 val;
81 81
82 DEBUGP("Applying relocate section %u to %u\n", relsec, 82 DEBUGP("Applying relocate section %u to %u\n", relsec,
83 sechdrs[relsec].sh_info); 83 sechdrs[relsec].sh_info);
@@ -91,11 +91,11 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
91 sym = (Elf64_Sym *)sechdrs[symindex].sh_addr 91 sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
92 + ELF64_R_SYM(rel[i].r_info); 92 + ELF64_R_SYM(rel[i].r_info);
93 93
94 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", 94 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
95 (int)ELF64_R_TYPE(rel[i].r_info), 95 (int)ELF64_R_TYPE(rel[i].r_info),
96 sym->st_value, rel[i].r_addend, (u64)loc); 96 sym->st_value, rel[i].r_addend, (u64)loc);
97 97
98 val = sym->st_value + rel[i].r_addend; 98 val = sym->st_value + rel[i].r_addend;
99 99
100 switch (ELF64_R_TYPE(rel[i].r_info)) { 100 switch (ELF64_R_TYPE(rel[i].r_info)) {
101 case R_X86_64_NONE: 101 case R_X86_64_NONE:
@@ -113,16 +113,16 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
113 if ((s64)val != *(s32 *)loc) 113 if ((s64)val != *(s32 *)loc)
114 goto overflow; 114 goto overflow;
115 break; 115 break;
116 case R_X86_64_PC32: 116 case R_X86_64_PC32:
117 val -= (u64)loc; 117 val -= (u64)loc;
118 *(u32 *)loc = val; 118 *(u32 *)loc = val;
119#if 0 119#if 0
120 if ((s64)val != *(s32 *)loc) 120 if ((s64)val != *(s32 *)loc)
121 goto overflow; 121 goto overflow;
122#endif 122#endif
123 break; 123 break;
124 default: 124 default:
125 printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n", 125 printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n",
126 me->name, ELF64_R_TYPE(rel[i].r_info)); 126 me->name, ELF64_R_TYPE(rel[i].r_info));
127 return -ENOEXEC; 127 return -ENOEXEC;
128 } 128 }
@@ -130,7 +130,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
130 return 0; 130 return 0;
131 131
132overflow: 132overflow:
133 printk(KERN_ERR "overflow in relocation type %d val %Lx\n", 133 printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
134 (int)ELF64_R_TYPE(rel[i].r_info), val); 134 (int)ELF64_R_TYPE(rel[i].r_info), val);
135 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", 135 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
136 me->name); 136 me->name);
@@ -143,13 +143,13 @@ int apply_relocate(Elf_Shdr *sechdrs,
143 unsigned int relsec, 143 unsigned int relsec,
144 struct module *me) 144 struct module *me)
145{ 145{
146 printk("non add relocation not supported\n"); 146 printk(KERN_ERR "non add relocation not supported\n");
147 return -ENOSYS; 147 return -ENOSYS;
148} 148}
149 149
150int module_finalize(const Elf_Ehdr *hdr, 150int module_finalize(const Elf_Ehdr *hdr,
151 const Elf_Shdr *sechdrs, 151 const Elf_Shdr *sechdrs,
152 struct module *me) 152 struct module *me)
153{ 153{
154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, 154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
155 *para = NULL; 155 *para = NULL;
@@ -161,7 +161,7 @@ int module_finalize(const Elf_Ehdr *hdr,
161 if (!strcmp(".altinstructions", secstrings + s->sh_name)) 161 if (!strcmp(".altinstructions", secstrings + s->sh_name))
162 alt = s; 162 alt = s;
163 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 163 if (!strcmp(".smp_locks", secstrings + s->sh_name))
164 locks= s; 164 locks = s;
165 if (!strcmp(".parainstructions", secstrings + s->sh_name)) 165 if (!strcmp(".parainstructions", secstrings + s->sh_name))
166 para = s; 166 para = s;
167 } 167 }
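
Note: apply_relocate_add() above is the standard ELF formula — val = S + A for the absolute types, then val -= P for PC-relative R_X86_64_PC32 — and the "likely not compiled with -mcmodel=kernel" message fires when the result no longer fits a signed 32-bit displacement. Toy arithmetic in user space (all values invented):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t S = 0xffffffff81000000ULL;	/* symbol value */
	int64_t  A = -4;			/* addend */
	uint64_t P = 0xffffffffa0000010ULL;	/* location being patched */

	uint64_t val = S + A;			/* absolute (R_X86_64_64 etc.) */
	val -= P;				/* PC-relative: R_X86_64_PC32 */

	/* fits iff it round-trips through a signed 32-bit value */
	int fits = ((int64_t)val == (int64_t)(int32_t)val);
	printf("val = %#llx, fits s32: %s\n",
	       (unsigned long long)val, fits ? "yes" : "no");
	return 0;
}
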
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index a649a4ccad43..dce99dca6cf8 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -3,7 +3,7 @@
3 * compliant MP-table parsing routines. 3 * compliant MP-table parsing routines.
4 * 4 *
5 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> 5 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
6 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> 6 * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
7 * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de> 7 * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
8 */ 8 */
9 9
@@ -29,12 +29,7 @@
29#include <asm/setup.h> 29#include <asm/setup.h>
30#include <asm/smp.h> 30#include <asm/smp.h>
31 31
32#include <mach_apic.h> 32#include <asm/apic.h>
33#ifdef CONFIG_X86_32
34#include <mach_apicdef.h>
35#include <mach_mpparse.h>
36#endif
37
38/* 33/*
39 * Checksum an MP configuration block. 34 * Checksum an MP configuration block.
40 */ 35 */
@@ -114,9 +109,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
114 } else 109 } else
115 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 110 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
116} 111}
117#endif
118
119#ifdef CONFIG_X86_IO_APIC
120 112
121static int bad_ioapic(unsigned long address) 113static int bad_ioapic(unsigned long address)
122{ 114{
@@ -144,11 +136,11 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)
144 if (bad_ioapic(m->apicaddr)) 136 if (bad_ioapic(m->apicaddr))
145 return; 137 return;
146 138
147 mp_ioapics[nr_ioapics].mp_apicaddr = m->apicaddr; 139 mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
148 mp_ioapics[nr_ioapics].mp_apicid = m->apicid; 140 mp_ioapics[nr_ioapics].apicid = m->apicid;
149 mp_ioapics[nr_ioapics].mp_type = m->type; 141 mp_ioapics[nr_ioapics].type = m->type;
150 mp_ioapics[nr_ioapics].mp_apicver = m->apicver; 142 mp_ioapics[nr_ioapics].apicver = m->apicver;
151 mp_ioapics[nr_ioapics].mp_flags = m->flags; 143 mp_ioapics[nr_ioapics].flags = m->flags;
152 nr_ioapics++; 144 nr_ioapics++;
153} 145}
154 146
@@ -160,55 +152,55 @@ static void print_MP_intsrc_info(struct mpc_intsrc *m)
160 m->srcbusirq, m->dstapic, m->dstirq); 152 m->srcbusirq, m->dstapic, m->dstirq);
161} 153}
162 154
163static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) 155static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
164{ 156{
165 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," 157 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
166 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 158 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
167 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, 159 mp_irq->irqtype, mp_irq->irqflag & 3,
168 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, 160 (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
169 mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq); 161 mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
170} 162}
171 163
172static void __init assign_to_mp_irq(struct mpc_intsrc *m, 164static void __init assign_to_mp_irq(struct mpc_intsrc *m,
173 struct mp_config_intsrc *mp_irq) 165 struct mpc_intsrc *mp_irq)
174{ 166{
175 mp_irq->mp_dstapic = m->dstapic; 167 mp_irq->dstapic = m->dstapic;
176 mp_irq->mp_type = m->type; 168 mp_irq->type = m->type;
177 mp_irq->mp_irqtype = m->irqtype; 169 mp_irq->irqtype = m->irqtype;
178 mp_irq->mp_irqflag = m->irqflag; 170 mp_irq->irqflag = m->irqflag;
179 mp_irq->mp_srcbus = m->srcbus; 171 mp_irq->srcbus = m->srcbus;
180 mp_irq->mp_srcbusirq = m->srcbusirq; 172 mp_irq->srcbusirq = m->srcbusirq;
181 mp_irq->mp_dstirq = m->dstirq; 173 mp_irq->dstirq = m->dstirq;
182} 174}
183 175
184static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq, 176static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
185 struct mpc_intsrc *m) 177 struct mpc_intsrc *m)
186{ 178{
187 m->dstapic = mp_irq->mp_dstapic; 179 m->dstapic = mp_irq->dstapic;
188 m->type = mp_irq->mp_type; 180 m->type = mp_irq->type;
189 m->irqtype = mp_irq->mp_irqtype; 181 m->irqtype = mp_irq->irqtype;
190 m->irqflag = mp_irq->mp_irqflag; 182 m->irqflag = mp_irq->irqflag;
191 m->srcbus = mp_irq->mp_srcbus; 183 m->srcbus = mp_irq->srcbus;
192 m->srcbusirq = mp_irq->mp_srcbusirq; 184 m->srcbusirq = mp_irq->srcbusirq;
193 m->dstirq = mp_irq->mp_dstirq; 185 m->dstirq = mp_irq->dstirq;
194} 186}
195 187
196static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq, 188static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
197 struct mpc_intsrc *m) 189 struct mpc_intsrc *m)
198{ 190{
199 if (mp_irq->mp_dstapic != m->dstapic) 191 if (mp_irq->dstapic != m->dstapic)
200 return 1; 192 return 1;
201 if (mp_irq->mp_type != m->type) 193 if (mp_irq->type != m->type)
202 return 2; 194 return 2;
203 if (mp_irq->mp_irqtype != m->irqtype) 195 if (mp_irq->irqtype != m->irqtype)
204 return 3; 196 return 3;
205 if (mp_irq->mp_irqflag != m->irqflag) 197 if (mp_irq->irqflag != m->irqflag)
206 return 4; 198 return 4;
207 if (mp_irq->mp_srcbus != m->srcbus) 199 if (mp_irq->srcbus != m->srcbus)
208 return 5; 200 return 5;
209 if (mp_irq->mp_srcbusirq != m->srcbusirq) 201 if (mp_irq->srcbusirq != m->srcbusirq)
210 return 6; 202 return 6;
211 if (mp_irq->mp_dstirq != m->dstirq) 203 if (mp_irq->dstirq != m->dstirq)
212 return 7; 204 return 7;
213 205
214 return 0; 206 return 0;
@@ -229,8 +221,12 @@ static void __init MP_intsrc_info(struct mpc_intsrc *m)
229 if (++mp_irq_entries == MAX_IRQ_SOURCES) 221 if (++mp_irq_entries == MAX_IRQ_SOURCES)
230 panic("Max # of irq sources exceeded!!\n"); 222 panic("Max # of irq sources exceeded!!\n");
231} 223}
224#else /* CONFIG_X86_IO_APIC */
225static inline void __init MP_bus_info(struct mpc_bus *m) {}
226static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
227static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
228#endif /* CONFIG_X86_IO_APIC */
232 229
233#endif
234 230
235static void __init MP_lintsrc_info(struct mpc_lintsrc *m) 231static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
236{ 232{
@@ -280,6 +276,20 @@ static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
280 return 1; 276 return 1;
281} 277}
282 278
279static void skip_entry(unsigned char **ptr, int *count, int size)
280{
281 *ptr += size;
282 *count += size;
283}
284
285static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
286{
287 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
288 "type %x\n", *mpt);
289 print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
290 1, mpc, mpc->length, 1);
291}
292
283static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) 293static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
284{ 294{
285 char str[16]; 295 char str[16];
@@ -292,16 +302,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
292 return 0; 302 return 0;
293 303
294#ifdef CONFIG_X86_32 304#ifdef CONFIG_X86_32
295 /* 305 generic_mps_oem_check(mpc, oem, str);
296 * need to make sure summit and es7000's mps_oem_check is safe to be
297 * called early via genericarch 's mps_oem_check
298 */
299 if (early) {
300#ifdef CONFIG_X86_NUMAQ
301 numaq_mps_oem_check(mpc, oem, str);
302#endif
303 } else
304 mps_oem_check(mpc, oem, str);
305#endif 306#endif
306 /* save the local APIC address, it might be non-default */ 307 /* save the local APIC address, it might be non-default */
307 if (!acpi_lapic) 308 if (!acpi_lapic)
@@ -324,61 +325,30 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
324 while (count < mpc->length) { 325 while (count < mpc->length) {
325 switch (*mpt) { 326 switch (*mpt) {
326 case MP_PROCESSOR: 327 case MP_PROCESSOR:
327 { 328 /* ACPI may have already provided this data */
328 struct mpc_cpu *m = (struct mpc_cpu *)mpt; 329 if (!acpi_lapic)
329 /* ACPI may have already provided this data */ 330 MP_processor_info((struct mpc_cpu *)mpt);
330 if (!acpi_lapic) 331 skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
331 MP_processor_info(m); 332 break;
332 mpt += sizeof(*m);
333 count += sizeof(*m);
334 break;
335 }
336 case MP_BUS: 333 case MP_BUS:
337 { 334 MP_bus_info((struct mpc_bus *)mpt);
338 struct mpc_bus *m = (struct mpc_bus *)mpt; 335 skip_entry(&mpt, &count, sizeof(struct mpc_bus));
339#ifdef CONFIG_X86_IO_APIC 336 break;
340 MP_bus_info(m);
341#endif
342 mpt += sizeof(*m);
343 count += sizeof(*m);
344 break;
345 }
346 case MP_IOAPIC: 337 case MP_IOAPIC:
347 { 338 MP_ioapic_info((struct mpc_ioapic *)mpt);
348#ifdef CONFIG_X86_IO_APIC 339 skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
349 struct mpc_ioapic *m = (struct mpc_ioapic *)mpt; 340 break;
350 MP_ioapic_info(m);
351#endif
352 mpt += sizeof(struct mpc_ioapic);
353 count += sizeof(struct mpc_ioapic);
354 break;
355 }
356 case MP_INTSRC: 341 case MP_INTSRC:
357 { 342 MP_intsrc_info((struct mpc_intsrc *)mpt);
358#ifdef CONFIG_X86_IO_APIC 343 skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
359 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 344 break;
360
361 MP_intsrc_info(m);
362#endif
363 mpt += sizeof(struct mpc_intsrc);
364 count += sizeof(struct mpc_intsrc);
365 break;
366 }
367 case MP_LINTSRC: 345 case MP_LINTSRC:
368 { 346 MP_lintsrc_info((struct mpc_lintsrc *)mpt);
369 struct mpc_lintsrc *m = 347 skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
370 (struct mpc_lintsrc *)mpt; 348 break;
371 MP_lintsrc_info(m);
372 mpt += sizeof(*m);
373 count += sizeof(*m);
374 break;
375 }
376 default: 349 default:
377 /* wrong mptable */ 350 /* wrong mptable */
378 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); 351 smp_dump_mptable(mpc, mpt);
379 printk(KERN_ERR "type %x\n", *mpt);
380 print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
381 1, mpc, mpc->length, 1);
382 count = mpc->length; 352 count = mpc->length;
383 break; 353 break;
384 } 354 }
@@ -386,13 +356,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
386 (*x86_quirks->mpc_record)++; 356 (*x86_quirks->mpc_record)++;
387 } 357 }
388 358
389#ifdef CONFIG_X86_GENERICARCH 359#ifdef CONFIG_X86_BIGSMP
390 generic_bigsmp_probe(); 360 generic_bigsmp_probe();
391#endif 361#endif
392 362
393#ifdef CONFIG_X86_32 363 if (apic->setup_apic_routing)
394 setup_apic_routing(); 364 apic->setup_apic_routing();
395#endif 365
396 if (!num_processors) 366 if (!num_processors)
397 printk(KERN_ERR "MPTABLE: no processors registered!\n"); 367 printk(KERN_ERR "MPTABLE: no processors registered!\n");
398 return num_processors; 368 return num_processors;
@@ -417,7 +387,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
417 intsrc.type = MP_INTSRC; 387 intsrc.type = MP_INTSRC;
418 intsrc.irqflag = 0; /* conforming */ 388 intsrc.irqflag = 0; /* conforming */
419 intsrc.srcbus = 0; 389 intsrc.srcbus = 0;
420 intsrc.dstapic = mp_ioapics[0].mp_apicid; 390 intsrc.dstapic = mp_ioapics[0].apicid;
421 391
422 intsrc.irqtype = mp_INT; 392 intsrc.irqtype = mp_INT;
423 393
@@ -570,14 +540,76 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
570 } 540 }
571} 541}
572 542
573static struct intel_mp_floating *mpf_found; 543static struct mpf_intel *mpf_found;
544
545static unsigned long __init get_mpc_size(unsigned long physptr)
546{
547 struct mpc_table *mpc;
548 unsigned long size;
549
550 mpc = early_ioremap(physptr, PAGE_SIZE);
551 size = mpc->length;
552 early_iounmap(mpc, PAGE_SIZE);
553 apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
554
555 return size;
556}
557
558static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
559{
560 struct mpc_table *mpc;
561 unsigned long size;
562
563 size = get_mpc_size(mpf->physptr);
564 mpc = early_ioremap(mpf->physptr, size);
565 /*
566 * Read the physical hardware table. Anything here will
567 * override the defaults.
568 */
569 if (!smp_read_mpc(mpc, early)) {
570#ifdef CONFIG_X86_LOCAL_APIC
571 smp_found_config = 0;
572#endif
573 printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
574 "... disabling SMP support. (tell your hw vendor)\n");
575 early_iounmap(mpc, size);
576 return -1;
577 }
578 early_iounmap(mpc, size);
579
580 if (early)
581 return -1;
582
583#ifdef CONFIG_X86_IO_APIC
584 /*
585 * If there are no explicit MP IRQ entries, then we are
586 * broken. We set up most of the low 16 IO-APIC pins to
587 * ISA defaults and hope it will work.
588 */
589 if (!mp_irq_entries) {
590 struct mpc_bus bus;
591
592 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
593 "using default mptable. (tell your hw vendor)\n");
594
595 bus.type = MP_BUS;
596 bus.busid = 0;
597 memcpy(bus.bustype, "ISA ", 6);
598 MP_bus_info(&bus);
599
600 construct_default_ioirq_mptable(0);
601 }
602#endif
603
604 return 0;
605}
574 606
575/* 607/*
576 * Scan the memory blocks for an SMP configuration block. 608 * Scan the memory blocks for an SMP configuration block.
577 */ 609 */
578static void __init __get_smp_config(unsigned int early) 610static void __init __get_smp_config(unsigned int early)
579{ 611{
580 struct intel_mp_floating *mpf = mpf_found; 612 struct mpf_intel *mpf = mpf_found;
581 613
582 if (!mpf) 614 if (!mpf)
583 return; 615 return;
@@ -598,9 +630,9 @@ static void __init __get_smp_config(unsigned int early)
598 } 630 }
599 631
600 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", 632 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
601 mpf->mpf_specification); 633 mpf->specification);
602#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) 634#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
603 if (mpf->mpf_feature2 & (1 << 7)) { 635 if (mpf->feature2 & (1 << 7)) {
604 printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); 636 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
605 pic_mode = 1; 637 pic_mode = 1;
606 } else { 638 } else {
@@ -611,7 +643,7 @@ static void __init __get_smp_config(unsigned int early)
611 /* 643 /*
612 * Now see if we need to read further. 644 * Now see if we need to read further.
613 */ 645 */
614 if (mpf->mpf_feature1 != 0) { 646 if (mpf->feature1 != 0) {
615 if (early) { 647 if (early) {
616 /* 648 /*
617 * local APIC has default address 649 * local APIC has default address
@@ -621,49 +653,12 @@ static void __init __get_smp_config(unsigned int early)
621 } 653 }
622 654
623 printk(KERN_INFO "Default MP configuration #%d\n", 655 printk(KERN_INFO "Default MP configuration #%d\n",
624 mpf->mpf_feature1); 656 mpf->feature1);
625 construct_default_ISA_mptable(mpf->mpf_feature1); 657 construct_default_ISA_mptable(mpf->feature1);
626
627 } else if (mpf->mpf_physptr) {
628 658
629 /* 659 } else if (mpf->physptr) {
630 * Read the physical hardware table. Anything here will 660 if (check_physptr(mpf, early))
631 * override the defaults.
632 */
633 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) {
634#ifdef CONFIG_X86_LOCAL_APIC
635 smp_found_config = 0;
636#endif
637 printk(KERN_ERR
638 "BIOS bug, MP table errors detected!...\n");
639 printk(KERN_ERR "... disabling SMP support. "
640 "(tell your hw vendor)\n");
641 return;
642 }
643
644 if (early)
645 return; 661 return;
646#ifdef CONFIG_X86_IO_APIC
647 /*
648 * If there are no explicit MP IRQ entries, then we are
649 * broken. We set up most of the low 16 IO-APIC pins to
650 * ISA defaults and hope it will work.
651 */
652 if (!mp_irq_entries) {
653 struct mpc_bus bus;
654
655 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
656 "using default mptable. "
657 "(tell your hw vendor)\n");
658
659 bus.type = MP_BUS;
660 bus.busid = 0;
661 memcpy(bus.bustype, "ISA ", 6);
662 MP_bus_info(&bus);
663
664 construct_default_ioirq_mptable(0);
665 }
666#endif
667 } else 662 } else
668 BUG(); 663 BUG();
669 664
@@ -684,54 +679,62 @@ void __init get_smp_config(void)
684 __get_smp_config(0); 679 __get_smp_config(0);
685} 680}
686 681
682static void smp_reserve_bootmem(struct mpf_intel *mpf)
683{
684 unsigned long size = get_mpc_size(mpf->physptr);
685#ifdef CONFIG_X86_32
686 /*
687 * We cannot access the MPC table to compute the table size yet,
688 * as only a few megabytes from the bottom are mapped at this point.
689 * The PC-9800's MPC table is placed at the very end of physical
690 * memory, so blindly reserving PAGE_SIZE from mpf->physptr would
691 * trigger a BUG() in reserve_bootmem.
692 * We also need to make sure physptr is below max_low_pfn;
693 * there is no need to reserve anything above max_low_pfn.
694 */
695 unsigned long end = max_low_pfn * PAGE_SIZE;
696
697 if (mpf->physptr < end) {
698 if (mpf->physptr + size > end)
699 size = end - mpf->physptr;
700 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
701 }
702#else
703 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
704#endif
705}
706
687static int __init smp_scan_config(unsigned long base, unsigned long length, 707static int __init smp_scan_config(unsigned long base, unsigned long length,
688 unsigned reserve) 708 unsigned reserve)
689{ 709{
690 unsigned int *bp = phys_to_virt(base); 710 unsigned int *bp = phys_to_virt(base);
691 struct intel_mp_floating *mpf; 711 struct mpf_intel *mpf;
692 712
693 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", 713 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
694 bp, length); 714 bp, length);
695 BUILD_BUG_ON(sizeof(*mpf) != 16); 715 BUILD_BUG_ON(sizeof(*mpf) != 16);
696 716
697 while (length > 0) { 717 while (length > 0) {
698 mpf = (struct intel_mp_floating *)bp; 718 mpf = (struct mpf_intel *)bp;
699 if ((*bp == SMP_MAGIC_IDENT) && 719 if ((*bp == SMP_MAGIC_IDENT) &&
700 (mpf->mpf_length == 1) && 720 (mpf->length == 1) &&
701 !mpf_checksum((unsigned char *)bp, 16) && 721 !mpf_checksum((unsigned char *)bp, 16) &&
702 ((mpf->mpf_specification == 1) 722 ((mpf->specification == 1)
703 || (mpf->mpf_specification == 4))) { 723 || (mpf->specification == 4))) {
704#ifdef CONFIG_X86_LOCAL_APIC 724#ifdef CONFIG_X86_LOCAL_APIC
705 smp_found_config = 1; 725 smp_found_config = 1;
706#endif 726#endif
707 mpf_found = mpf; 727 mpf_found = mpf;
708 728
709 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", 729 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
710 mpf, virt_to_phys(mpf)); 730 mpf, (u64)virt_to_phys(mpf));
711 731
712 if (!reserve) 732 if (!reserve)
713 return 1; 733 return 1;
714 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, 734 reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
715 BOOTMEM_DEFAULT);
716 if (mpf->mpf_physptr) {
717 unsigned long size = PAGE_SIZE;
718#ifdef CONFIG_X86_32
719 /*
720 * We cannot access to MPC table to compute
721 * table size yet, as only few megabytes from
722 * the bottom is mapped now.
723 * PC-9800's MPC table places on the very last
724 * of physical memory; so that simply reserving
725 * PAGE_SIZE from mpg->mpf_physptr yields BUG()
726 * in reserve_bootmem.
727 */
728 unsigned long end = max_low_pfn * PAGE_SIZE;
729 if (mpf->mpf_physptr + size > end)
730 size = end - mpf->mpf_physptr;
731#endif
732 reserve_bootmem_generic(mpf->mpf_physptr, size,
733 BOOTMEM_DEFAULT); 735 BOOTMEM_DEFAULT);
734 } 736 if (mpf->physptr)
737 smp_reserve_bootmem(mpf);
735 738
736 return 1; 739 return 1;
737 } 740 }
@@ -809,15 +812,15 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
809 /* not legacy */ 812 /* not legacy */
810 813
811 for (i = 0; i < mp_irq_entries; i++) { 814 for (i = 0; i < mp_irq_entries; i++) {
812 if (mp_irqs[i].mp_irqtype != mp_INT) 815 if (mp_irqs[i].irqtype != mp_INT)
813 continue; 816 continue;
814 817
815 if (mp_irqs[i].mp_irqflag != 0x0f) 818 if (mp_irqs[i].irqflag != 0x0f)
816 continue; 819 continue;
817 820
818 if (mp_irqs[i].mp_srcbus != m->srcbus) 821 if (mp_irqs[i].srcbus != m->srcbus)
819 continue; 822 continue;
820 if (mp_irqs[i].mp_srcbusirq != m->srcbusirq) 823 if (mp_irqs[i].srcbusirq != m->srcbusirq)
821 continue; 824 continue;
822 if (irq_used[i]) { 825 if (irq_used[i]) {
823 /* already claimed */ 826 /* already claimed */
@@ -834,7 +837,57 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
834#define SPARE_SLOT_NUM 20 837#define SPARE_SLOT_NUM 20
835 838
836static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; 839static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
837#endif 840
841static void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
842{
843 int i;
844
845 apic_printk(APIC_VERBOSE, "OLD ");
846 print_MP_intsrc_info(m);
847
848 i = get_MP_intsrc_index(m);
849 if (i > 0) {
850 assign_to_mpc_intsrc(&mp_irqs[i], m);
851 apic_printk(APIC_VERBOSE, "NEW ");
852 print_mp_irq_info(&mp_irqs[i]);
853 return;
854 }
855 if (!i) {
856 /* legacy, do nothing */
857 return;
858 }
859 if (*nr_m_spare < SPARE_SLOT_NUM) {
860 /*
861 * not found (-1) or duplicated (-2) entries are invalid;
862 * remember the slot so it can be reused later
863 */
864 m_spare[*nr_m_spare] = m;
865 *nr_m_spare += 1;
866 }
867}
868#else /* CONFIG_X86_IO_APIC */
869static inline void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
870#endif /* CONFIG_X86_IO_APIC */
871
872static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
873 int count)
874{
875 if (!mpc_new_phys) {
876 pr_info("No spare slots, trying to append at your own risk, "
877 "new mpc_length %x\n", count);
878 } else {
879 if (count <= mpc_new_length)
880 pr_info("No spare slots, trying to append, "
881 "new mpc_length %x\n", count);
882 else {
883 pr_err("mpc_new_length %lx is too small\n",
884 mpc_new_length);
885 return -1;
886 }
887 }
888
889 return 0;
890}
838 891
839static int __init replace_intsrc_all(struct mpc_table *mpc, 892static int __init replace_intsrc_all(struct mpc_table *mpc,
840 unsigned long mpc_new_phys, 893 unsigned long mpc_new_phys,
@@ -842,77 +895,33 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
842{ 895{
843#ifdef CONFIG_X86_IO_APIC 896#ifdef CONFIG_X86_IO_APIC
844 int i; 897 int i;
845 int nr_m_spare = 0;
846#endif 898#endif
847
848 int count = sizeof(*mpc); 899 int count = sizeof(*mpc);
900 int nr_m_spare = 0;
849 unsigned char *mpt = ((unsigned char *)mpc) + count; 901 unsigned char *mpt = ((unsigned char *)mpc) + count;
850 902
851 printk(KERN_INFO "mpc_length %x\n", mpc->length); 903 printk(KERN_INFO "mpc_length %x\n", mpc->length);
852 while (count < mpc->length) { 904 while (count < mpc->length) {
853 switch (*mpt) { 905 switch (*mpt) {
854 case MP_PROCESSOR: 906 case MP_PROCESSOR:
855 { 907 skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
856 struct mpc_cpu *m = (struct mpc_cpu *)mpt; 908 break;
857 mpt += sizeof(*m);
858 count += sizeof(*m);
859 break;
860 }
861 case MP_BUS: 909 case MP_BUS:
862 { 910 skip_entry(&mpt, &count, sizeof(struct mpc_bus));
863 struct mpc_bus *m = (struct mpc_bus *)mpt; 911 break;
864 mpt += sizeof(*m);
865 count += sizeof(*m);
866 break;
867 }
868 case MP_IOAPIC: 912 case MP_IOAPIC:
869 { 913 skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
870 mpt += sizeof(struct mpc_ioapic); 914 break;
871 count += sizeof(struct mpc_ioapic);
872 break;
873 }
874 case MP_INTSRC: 915 case MP_INTSRC:
875 { 916 check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare);
876#ifdef CONFIG_X86_IO_APIC 917 skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
877 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 918 break;
878
879 printk(KERN_INFO "OLD ");
880 print_MP_intsrc_info(m);
881 i = get_MP_intsrc_index(m);
882 if (i > 0) {
883 assign_to_mpc_intsrc(&mp_irqs[i], m);
884 printk(KERN_INFO "NEW ");
885 print_mp_irq_info(&mp_irqs[i]);
886 } else if (!i) {
887 /* legacy, do nothing */
888 } else if (nr_m_spare < SPARE_SLOT_NUM) {
889 /*
890 * not found (-1), or duplicated (-2)
891 * are invalid entries,
892 * we need to use the slot later
893 */
894 m_spare[nr_m_spare] = m;
895 nr_m_spare++;
896 }
897#endif
898 mpt += sizeof(struct mpc_intsrc);
899 count += sizeof(struct mpc_intsrc);
900 break;
901 }
902 case MP_LINTSRC: 919 case MP_LINTSRC:
903 { 920 skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
904 struct mpc_lintsrc *m = 921 break;
905 (struct mpc_lintsrc *)mpt;
906 mpt += sizeof(*m);
907 count += sizeof(*m);
908 break;
909 }
910 default: 922 default:
911 /* wrong mptable */ 923 /* wrong mptable */
912 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); 924 smp_dump_mptable(mpc, mpt);
913 printk(KERN_ERR "type %x\n", *mpt);
914 print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
915 1, mpc, mpc->length, 1);
916 goto out; 925 goto out;
917 } 926 }
918 } 927 }
@@ -922,30 +931,22 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
922 if (irq_used[i]) 931 if (irq_used[i])
923 continue; 932 continue;
924 933
925 if (mp_irqs[i].mp_irqtype != mp_INT) 934 if (mp_irqs[i].irqtype != mp_INT)
926 continue; 935 continue;
927 936
928 if (mp_irqs[i].mp_irqflag != 0x0f) 937 if (mp_irqs[i].irqflag != 0x0f)
929 continue; 938 continue;
930 939
931 if (nr_m_spare > 0) { 940 if (nr_m_spare > 0) {
932 printk(KERN_INFO "*NEW* found "); 941 apic_printk(APIC_VERBOSE, "*NEW* found\n");
933 nr_m_spare--; 942 nr_m_spare--;
934 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); 943 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
935 m_spare[nr_m_spare] = NULL; 944 m_spare[nr_m_spare] = NULL;
936 } else { 945 } else {
937 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 946 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
938 count += sizeof(struct mpc_intsrc); 947 count += sizeof(struct mpc_intsrc);
939 if (!mpc_new_phys) { 948 if (!check_slot(mpc_new_phys, mpc_new_length, count))
940 printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count); 949 goto out;
941 } else {
942 if (count <= mpc_new_length)
943 printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
944 else {
945 printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
946 goto out;
947 }
948 }
949 assign_to_mpc_intsrc(&mp_irqs[i], m); 950 assign_to_mpc_intsrc(&mp_irqs[i], m);
950 mpc->length = count; 951 mpc->length = count;
951 mpt += sizeof(struct mpc_intsrc); 952 mpt += sizeof(struct mpc_intsrc);
@@ -1001,7 +1002,7 @@ static int __init update_mp_table(void)
1001{ 1002{
1002 char str[16]; 1003 char str[16];
1003 char oem[10]; 1004 char oem[10];
1004 struct intel_mp_floating *mpf; 1005 struct mpf_intel *mpf;
1005 struct mpc_table *mpc, *mpc_new; 1006 struct mpc_table *mpc, *mpc_new;
1006 1007
1007 if (!enable_update_mptable) 1008 if (!enable_update_mptable)
@@ -1014,19 +1015,19 @@ static int __init update_mp_table(void)
1014 /* 1015 /*
1015 * Now see if we need to go further. 1016 * Now see if we need to go further.
1016 */ 1017 */
1017 if (mpf->mpf_feature1 != 0) 1018 if (mpf->feature1 != 0)
1018 return 0; 1019 return 0;
1019 1020
1020 if (!mpf->mpf_physptr) 1021 if (!mpf->physptr)
1021 return 0; 1022 return 0;
1022 1023
1023 mpc = phys_to_virt(mpf->mpf_physptr); 1024 mpc = phys_to_virt(mpf->physptr);
1024 1025
1025 if (!smp_check_mpc(mpc, oem, str)) 1026 if (!smp_check_mpc(mpc, oem, str))
1026 return 0; 1027 return 0;
1027 1028
1028 printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf)); 1029 printk(KERN_INFO "mpf: %llx\n", (u64)virt_to_phys(mpf));
1029 printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr); 1030 printk(KERN_INFO "physptr: %x\n", mpf->physptr);
1030 1031
1031 if (mpc_new_phys && mpc->length > mpc_new_length) { 1032 if (mpc_new_phys && mpc->length > mpc_new_length) {
1032 mpc_new_phys = 0; 1033 mpc_new_phys = 0;
@@ -1047,23 +1048,23 @@ static int __init update_mp_table(void)
1047 } 1048 }
1048 printk(KERN_INFO "use in-position replacing\n"); 1049 printk(KERN_INFO "use in-position replacing\n");
1049 } else { 1050 } else {
1050 mpf->mpf_physptr = mpc_new_phys; 1051 mpf->physptr = mpc_new_phys;
1051 mpc_new = phys_to_virt(mpc_new_phys); 1052 mpc_new = phys_to_virt(mpc_new_phys);
1052 memcpy(mpc_new, mpc, mpc->length); 1053 memcpy(mpc_new, mpc, mpc->length);
1053 mpc = mpc_new; 1054 mpc = mpc_new;
1054 /* check if we can modify that */ 1055 /* check if we can modify that */
1055 if (mpc_new_phys - mpf->mpf_physptr) { 1056 if (mpc_new_phys - mpf->physptr) {
1056 struct intel_mp_floating *mpf_new; 1057 struct mpf_intel *mpf_new;
1057 /* steal 16 bytes from [0, 1k) */ 1058 /* steal 16 bytes from [0, 1k) */
1058 printk(KERN_INFO "mpf new: %x\n", 0x400 - 16); 1059 printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
1059 mpf_new = phys_to_virt(0x400 - 16); 1060 mpf_new = phys_to_virt(0x400 - 16);
1060 memcpy(mpf_new, mpf, 16); 1061 memcpy(mpf_new, mpf, 16);
1061 mpf = mpf_new; 1062 mpf = mpf_new;
1062 mpf->mpf_physptr = mpc_new_phys; 1063 mpf->physptr = mpc_new_phys;
1063 } 1064 }
1064 mpf->mpf_checksum = 0; 1065 mpf->checksum = 0;
1065 mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16); 1066 mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
1066 printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr); 1067 printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
1067 } 1068 }
1068 1069
1069 /* 1070 /*
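The mpparse.c changes above repeatedly collapse open-coded pointer/counter
bumps into the new skip_entry() helper. A minimal stand-alone sketch of that
walk pattern (entry types and sizes are invented for illustration, not the
kernel's mpc_* layouts):

#include <stdio.h>

static void skip_entry(unsigned char **ptr, int *count, int size)
{
	*ptr += size;		/* advance the cursor ...           */
	*count += size;		/* ... and the bytes-consumed count */
}

static void walk_table(unsigned char *table, int length)
{
	unsigned char *mpt = table;
	int count = 0;

	while (count < length) {
		switch (*mpt) {
		case 0: skip_entry(&mpt, &count, 20); break;	/* cpu-like */
		case 1: skip_entry(&mpt, &count, 8);  break;	/* bus-like */
		default:
			printf("bad entry type %x\n", *mpt);
			return;		/* bail out, as smp_read_mpc() does */
		}
	}
}

Keeping the cursor and the counter in step inside one helper is what lets
every case in smp_read_mpc() and replace_intsrc_all() shrink to two lines.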
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 726266695b2c..3cf3413ec626 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -35,10 +35,10 @@
35#include <linux/device.h> 35#include <linux/device.h>
36#include <linux/cpu.h> 36#include <linux/cpu.h>
37#include <linux/notifier.h> 37#include <linux/notifier.h>
38#include <linux/uaccess.h>
38 39
39#include <asm/processor.h> 40#include <asm/processor.h>
40#include <asm/msr.h> 41#include <asm/msr.h>
41#include <asm/uaccess.h>
42#include <asm/system.h> 42#include <asm/system.h>
43 43
44static struct class *msr_class; 44static struct class *msr_class;
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
deleted file mode 100644
index f2191d4f2717..000000000000
--- a/arch/x86/kernel/numaq_32.c
+++ /dev/null
@@ -1,293 +0,0 @@
1/*
2 * Written by: Patricia Gaughen, IBM Corporation
3 *
4 * Copyright (C) 2002, IBM Corp.
5 *
6 * All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
16 * NON INFRINGEMENT. See the GNU General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * Send feedback to <gone@us.ibm.com>
24 */
25
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/module.h>
30#include <linux/nodemask.h>
31#include <asm/numaq.h>
32#include <asm/topology.h>
33#include <asm/processor.h>
34#include <asm/genapic.h>
35#include <asm/e820.h>
36#include <asm/setup.h>
37
38#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
39
40/*
41 * Function: smp_dump_qct()
42 *
43 * Description: gets memory layout from the quad config table. This
44 * function also updates node_online_map with the nodes (quads) present.
45 */
46static void __init smp_dump_qct(void)
47{
48 int node;
49 struct eachquadmem *eq;
50 struct sys_cfg_data *scd =
51 (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR);
52
53 nodes_clear(node_online_map);
54 for_each_node(node) {
55 if (scd->quads_present31_0 & (1 << node)) {
56 node_set_online(node);
57 eq = &scd->eq[node];
58 /* Convert to pages */
59 node_start_pfn[node] = MB_TO_PAGES(
60 eq->hi_shrd_mem_start - eq->priv_mem_size);
61 node_end_pfn[node] = MB_TO_PAGES(
62 eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
63
64 e820_register_active_regions(node, node_start_pfn[node],
65 node_end_pfn[node]);
66 memory_present(node,
67 node_start_pfn[node], node_end_pfn[node]);
68 node_remap_size[node] = node_memmap_size_bytes(node,
69 node_start_pfn[node],
70 node_end_pfn[node]);
71 }
72 }
73}
74
75
76void __cpuinit numaq_tsc_disable(void)
77{
78 if (!found_numaq)
79 return;
80
81 if (num_online_nodes() > 1) {
82 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
83 setup_clear_cpu_cap(X86_FEATURE_TSC);
84 }
85}
86
87static int __init numaq_pre_time_init(void)
88{
89 numaq_tsc_disable();
90 return 0;
91}
92
93int found_numaq;
94/*
95 * Have to match translation table entries to main table entries by counter
96 * hence the mpc_record variable .... can't see a less disgusting way of
97 * doing this ....
98 */
99struct mpc_config_translation {
100 unsigned char mpc_type;
101 unsigned char trans_len;
102 unsigned char trans_type;
103 unsigned char trans_quad;
104 unsigned char trans_global;
105 unsigned char trans_local;
106 unsigned short trans_reserved;
107};
108
109/* x86_quirks member */
110static int mpc_record;
111static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
112 __cpuinitdata;
113
114static inline int generate_logical_apicid(int quad, int phys_apicid)
115{
116 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
117}
118
119/* x86_quirks member */
120static int mpc_apic_id(struct mpc_cpu *m)
121{
122 int quad = translation_table[mpc_record]->trans_quad;
123 int logical_apicid = generate_logical_apicid(quad, m->apicid);
124
125 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
126 m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
127 (m->cpufeature & CPU_MODEL_MASK) >> 4,
128 m->apicver, quad, logical_apicid);
129 return logical_apicid;
130}
131
132int mp_bus_id_to_node[MAX_MP_BUSSES];
133
134int mp_bus_id_to_local[MAX_MP_BUSSES];
135
136/* x86_quirks member */
137static void mpc_oem_bus_info(struct mpc_bus *m, char *name)
138{
139 int quad = translation_table[mpc_record]->trans_quad;
140 int local = translation_table[mpc_record]->trans_local;
141
142 mp_bus_id_to_node[m->busid] = quad;
143 mp_bus_id_to_local[m->busid] = local;
144 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
145 m->busid, name, quad);
146}
147
148int quad_local_to_mp_bus_id [NR_CPUS/4][4];
149
150/* x86_quirks member */
151static void mpc_oem_pci_bus(struct mpc_bus *m)
152{
153 int quad = translation_table[mpc_record]->trans_quad;
154 int local = translation_table[mpc_record]->trans_local;
155
156 quad_local_to_mp_bus_id[quad][local] = m->busid;
157}
158
159static void __init MP_translation_info(struct mpc_config_translation *m)
160{
161 printk(KERN_INFO
162 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
163 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
164 m->trans_local);
165
166 if (mpc_record >= MAX_MPC_ENTRY)
167 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
168 else
169 translation_table[mpc_record] = m; /* stash this for later */
170 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
171 node_set_online(m->trans_quad);
172}
173
174static int __init mpf_checksum(unsigned char *mp, int len)
175{
176 int sum = 0;
177
178 while (len--)
179 sum += *mp++;
180
181 return sum & 0xFF;
182}
183
184/*
185 * Read/parse the MPC oem tables
186 */
187
188static void __init smp_read_mpc_oem(struct mpc_oemtable *oemtable,
189 unsigned short oemsize)
190{
191 int count = sizeof(*oemtable); /* the header size */
192 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
193
194 mpc_record = 0;
195 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
196 oemtable);
197 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
198 printk(KERN_WARNING
199 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
200 oemtable->signature[0], oemtable->signature[1],
201 oemtable->signature[2], oemtable->signature[3]);
202 return;
203 }
204 if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) {
205 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
206 return;
207 }
208 while (count < oemtable->length) {
209 switch (*oemptr) {
210 case MP_TRANSLATION:
211 {
212 struct mpc_config_translation *m =
213 (struct mpc_config_translation *)oemptr;
214 MP_translation_info(m);
215 oemptr += sizeof(*m);
216 count += sizeof(*m);
217 ++mpc_record;
218 break;
219 }
220 default:
221 {
222 printk(KERN_WARNING
223 "Unrecognised OEM table entry type! - %d\n",
224 (int)*oemptr);
225 return;
226 }
227 }
228 }
229}
230
231static int __init numaq_setup_ioapic_ids(void)
232{
233 /* so can skip it */
234 return 1;
235}
236
237static int __init numaq_update_genapic(void)
238{
239 genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi;
240
241 return 0;
242}
243
244static struct x86_quirks numaq_x86_quirks __initdata = {
245 .arch_pre_time_init = numaq_pre_time_init,
246 .arch_time_init = NULL,
247 .arch_pre_intr_init = NULL,
248 .arch_memory_setup = NULL,
249 .arch_intr_init = NULL,
250 .arch_trap_init = NULL,
251 .mach_get_smp_config = NULL,
252 .mach_find_smp_config = NULL,
253 .mpc_record = &mpc_record,
254 .mpc_apic_id = mpc_apic_id,
255 .mpc_oem_bus_info = mpc_oem_bus_info,
256 .mpc_oem_pci_bus = mpc_oem_pci_bus,
257 .smp_read_mpc_oem = smp_read_mpc_oem,
258 .setup_ioapic_ids = numaq_setup_ioapic_ids,
259 .update_genapic = numaq_update_genapic,
260};
261
262void numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
263{
264 if (strncmp(oem, "IBM NUMA", 8))
265 printk("Warning! Not a NUMA-Q system!\n");
266 else
267 found_numaq = 1;
268}
269
270static __init void early_check_numaq(void)
271{
272 /*
273 * Find possible boot-time SMP configuration:
274 */
275 early_find_smp_config();
276 /*
277 * get boot-time SMP configuration:
278 */
279 if (smp_found_config)
280 early_get_smp_config();
281
282 if (found_numaq)
283 x86_quirks = &numaq_x86_quirks;
284}
285
286int __init get_memcfg_numaq(void)
287{
288 early_check_numaq();
289 if (!found_numaq)
290 return 0;
291 smp_dump_qct();
292 return 1;
293}
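Both the deleted NUMA-Q parser and mpparse.c use the same MP-specification
checksum rule seen in mpf_checksum() above: all bytes of a structure must
sum to zero modulo 256, which is also why update_mp_table() recomputes the
field as zero minus the byte sum. A stand-alone sketch (the offset of the
checksum byte is an assumption for illustration, not taken from the headers):

#include <assert.h>

static int mpf_checksum(unsigned char *mp, int len)
{
	int sum = 0;

	while (len--)
		sum += *mp++;
	return sum & 0xFF;
}

int main(void)
{
	unsigned char mpf[16] = { '_', 'M', 'P', '_' };	/* rest zeroed */

	mpf[10] = 0;				/* assumed checksum byte */
	mpf[10] -= mpf_checksum(mpf, 16);	/* force the sum to zero */
	assert(mpf_checksum(mpf, 16) == 0);
	return 0;
}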
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 7a13fac63a1f..4006c522adc7 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
203static void __init platform_detect(void) 203static void __init platform_detect(void)
204{ 204{
205 /* stopgap until OFW support is added to the kernel */ 205 /* stopgap until OFW support is added to the kernel */
206 olpc_platform_info.boardrev = 0xc2; 206 olpc_platform_info.boardrev = olpc_board(0xc2);
207} 207}
208#endif 208#endif
209 209
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 95777b0faa73..3a7c5a44082e 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -26,13 +26,3 @@ struct pv_lock_ops pv_lock_ops = {
26}; 26};
27EXPORT_SYMBOL(pv_lock_ops); 27EXPORT_SYMBOL(pv_lock_ops);
28 28
29void __init paravirt_use_bytelocks(void)
30{
31#ifdef CONFIG_SMP
32 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
33 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
34 pv_lock_ops.spin_lock = __byte_spin_lock;
35 pv_lock_ops.spin_trylock = __byte_spin_trylock;
36 pv_lock_ops.spin_unlock = __byte_spin_unlock;
37#endif
38}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e4c8fb608873..8e45f4464880 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -28,7 +28,6 @@
28#include <asm/paravirt.h> 28#include <asm/paravirt.h>
29#include <asm/desc.h> 29#include <asm/desc.h>
30#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/arch_hooks.h>
32#include <asm/pgtable.h> 31#include <asm/pgtable.h>
33#include <asm/time.h> 32#include <asm/time.h>
34#include <asm/pgalloc.h> 33#include <asm/pgalloc.h>
@@ -44,6 +43,17 @@ void _paravirt_nop(void)
44{ 43{
45} 44}
46 45
46/* identity function, which can be inlined */
47u32 _paravirt_ident_32(u32 x)
48{
49 return x;
50}
51
52u64 _paravirt_ident_64(u64 x)
53{
54 return x;
55}
56
47static void __init default_banner(void) 57static void __init default_banner(void)
48{ 58{
49 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 59 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
@@ -138,9 +148,16 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
138 if (opfunc == NULL) 148 if (opfunc == NULL)
139 /* If there's no function, patch it with a ud2a (BUG) */ 149 /* If there's no function, patch it with a ud2a (BUG) */
140 ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); 150 ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
141 else if (opfunc == paravirt_nop) 151 else if (opfunc == _paravirt_nop)
142 /* If the operation is a nop, then nop the callsite */ 152 /* If the operation is a nop, then nop the callsite */
143 ret = paravirt_patch_nop(); 153 ret = paravirt_patch_nop();
154
155 /* identity functions just return their single argument */
156 else if (opfunc == _paravirt_ident_32)
157 ret = paravirt_patch_ident_32(insnbuf, len);
158 else if (opfunc == _paravirt_ident_64)
159 ret = paravirt_patch_ident_64(insnbuf, len);
160
144 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || 161 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
145 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || 162 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
146 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || 163 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
@@ -268,6 +285,32 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
268 return __get_cpu_var(paravirt_lazy_mode); 285 return __get_cpu_var(paravirt_lazy_mode);
269} 286}
270 287
288void arch_flush_lazy_mmu_mode(void)
289{
290 preempt_disable();
291
292 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
293 WARN_ON(preempt_count() == 1);
294 arch_leave_lazy_mmu_mode();
295 arch_enter_lazy_mmu_mode();
296 }
297
298 preempt_enable();
299}
300
301void arch_flush_lazy_cpu_mode(void)
302{
303 preempt_disable();
304
305 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
306 WARN_ON(preempt_count() == 1);
307 arch_leave_lazy_cpu_mode();
308 arch_enter_lazy_cpu_mode();
309 }
310
311 preempt_enable();
312}
313
271struct pv_info pv_info = { 314struct pv_info pv_info = {
272 .name = "bare hardware", 315 .name = "bare hardware",
273 .paravirt_enabled = 0, 316 .paravirt_enabled = 0,
@@ -292,10 +335,10 @@ struct pv_time_ops pv_time_ops = {
292 335
293struct pv_irq_ops pv_irq_ops = { 336struct pv_irq_ops pv_irq_ops = {
294 .init_IRQ = native_init_IRQ, 337 .init_IRQ = native_init_IRQ,
295 .save_fl = native_save_fl, 338 .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
296 .restore_fl = native_restore_fl, 339 .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
297 .irq_disable = native_irq_disable, 340 .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
298 .irq_enable = native_irq_enable, 341 .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
299 .safe_halt = native_safe_halt, 342 .safe_halt = native_safe_halt,
300 .halt = native_halt, 343 .halt = native_halt,
301#ifdef CONFIG_X86_64 344#ifdef CONFIG_X86_64
@@ -373,6 +416,14 @@ struct pv_apic_ops pv_apic_ops = {
373#endif 416#endif
374}; 417};
375 418
419#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
420/* 32-bit pagetable entries */
421#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32)
422#else
423/* 64-bit pagetable entries */
424#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
425#endif
426
376struct pv_mmu_ops pv_mmu_ops = { 427struct pv_mmu_ops pv_mmu_ops = {
377#ifndef CONFIG_X86_64 428#ifndef CONFIG_X86_64
378 .pagetable_setup_start = native_pagetable_setup_start, 429 .pagetable_setup_start = native_pagetable_setup_start,
@@ -419,27 +470,27 @@ struct pv_mmu_ops pv_mmu_ops = {
419#if PAGETABLE_LEVELS >= 3 470#if PAGETABLE_LEVELS >= 3
420#ifdef CONFIG_X86_PAE 471#ifdef CONFIG_X86_PAE
421 .set_pte_atomic = native_set_pte_atomic, 472 .set_pte_atomic = native_set_pte_atomic,
422 .set_pte_present = native_set_pte_present,
423 .pte_clear = native_pte_clear, 473 .pte_clear = native_pte_clear,
424 .pmd_clear = native_pmd_clear, 474 .pmd_clear = native_pmd_clear,
425#endif 475#endif
426 .set_pud = native_set_pud, 476 .set_pud = native_set_pud,
427 .pmd_val = native_pmd_val, 477
428 .make_pmd = native_make_pmd, 478 .pmd_val = PTE_IDENT,
479 .make_pmd = PTE_IDENT,
429 480
430#if PAGETABLE_LEVELS == 4 481#if PAGETABLE_LEVELS == 4
431 .pud_val = native_pud_val, 482 .pud_val = PTE_IDENT,
432 .make_pud = native_make_pud, 483 .make_pud = PTE_IDENT,
484
433 .set_pgd = native_set_pgd, 485 .set_pgd = native_set_pgd,
434#endif 486#endif
435#endif /* PAGETABLE_LEVELS >= 3 */ 487#endif /* PAGETABLE_LEVELS >= 3 */
436 488
437 .pte_val = native_pte_val, 489 .pte_val = PTE_IDENT,
438 .pte_flags = native_pte_flags, 490 .pgd_val = PTE_IDENT,
439 .pgd_val = native_pgd_val,
440 491
441 .make_pte = native_make_pte, 492 .make_pte = PTE_IDENT,
442 .make_pgd = native_make_pgd, 493 .make_pgd = PTE_IDENT,
443 494
444 .dup_mmap = paravirt_nop, 495 .dup_mmap = paravirt_nop,
445 .exit_mmap = paravirt_nop, 496 .exit_mmap = paravirt_nop,
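The point of the new _paravirt_ident_32/64 helpers above is that they are
recognizable by address: on native hardware pte_val(), make_pte() and
friends are pure identity conversions, so the patcher can spot the
well-known function pointer and inline a single register move instead of an
indirect call. A toy model of that address-comparison dispatch (the shadow
op and the printed decisions are illustrative only, not kernel code):

#include <stdio.h>

typedef unsigned long long u64;

static u64 ident64(u64 x)  { return x; }	/* like _paravirt_ident_64 */
static u64 shadow64(u64 x) { return x ^ 1; }	/* stand-in hypervisor op  */

static const char *patch_decision(u64 (*op)(u64))
{
	if (op == ident64)
		return "inline 'mov %rdi,%rax'";	/* identity: one move */
	return "keep the indirect call";		/* generic fallback   */
}

int main(void)
{
	printf("native pte_val: %s\n", patch_decision(ident64));
	printf("shadow pte_val: %s\n", patch_decision(shadow64));
	return 0;
}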
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 9fe644f4861d..d9f32e6d6ab6 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,18 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
12DEF_NATIVE(pv_cpu_ops, clts, "clts"); 12DEF_NATIVE(pv_cpu_ops, clts, "clts");
13DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); 13DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
14 14
15unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
16{
17 /* arg in %eax, return in %eax */
18 return 0;
19}
20
21unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
22{
23 /* arg in %edx:%eax, return in %edx:%eax */
24 return 0;
25}
26
15unsigned native_patch(u8 type, u16 clobbers, void *ibuf, 27unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
16 unsigned long addr, unsigned len) 28 unsigned long addr, unsigned len)
17{ 29{
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 061d01df9ae6..3f08f34f93eb 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -19,6 +19,21 @@ DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
19DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); 19DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
20DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); 20DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
21 21
22DEF_NATIVE(, mov32, "mov %edi, %eax");
23DEF_NATIVE(, mov64, "mov %rdi, %rax");
24
25unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
26{
27 return paravirt_patch_insns(insnbuf, len,
28 start__mov32, end__mov32);
29}
30
31unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
32{
33 return paravirt_patch_insns(insnbuf, len,
34 start__mov64, end__mov64);
35}
36
22unsigned native_patch(u8 type, u16 clobbers, void *ibuf, 37unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
23 unsigned long addr, unsigned len) 38 unsigned long addr, unsigned len)
24{ 39{
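The mov32/mov64 templates above feed paravirt_patch_insns(), whose job is to
copy a short native instruction sequence over the call site when it fits.
A guess at that contract, sketched stand-alone (simplified: the real code
also handles nop padding and a NULL template):

#include <stdio.h>
#include <string.h>

static unsigned patch_insns(unsigned char *site, unsigned site_len,
			    const unsigned char *start,
			    const unsigned char *end)
{
	unsigned insn_len = (unsigned)(end - start);

	if (insn_len > site_len)
		return site_len;	/* does not fit: leave the call alone */
	memcpy(site, start, insn_len);	/* overwrite the call site */
	return insn_len;		/* caller pads the rest with nops */
}

int main(void)
{
	const unsigned char mov64[] = { 0x48, 0x89, 0xf8 };	/* mov %rdi,%rax */
	unsigned char site[8] = { 0 };

	printf("patched %u bytes\n",
	       patch_insns(site, sizeof(site), mov64, mov64 + sizeof(mov64)));
	return 0;
}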
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index d28bbdc35e4e..755c21e906f3 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -380,8 +380,9 @@ static inline struct iommu_table *find_iommu_table(struct device *dev)
380 return tbl; 380 return tbl;
381} 381}
382 382
383static void calgary_unmap_sg(struct device *dev, 383static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist,
384 struct scatterlist *sglist, int nelems, int direction) 384 int nelems,enum dma_data_direction dir,
385 struct dma_attrs *attrs)
385{ 386{
386 struct iommu_table *tbl = find_iommu_table(dev); 387 struct iommu_table *tbl = find_iommu_table(dev);
387 struct scatterlist *s; 388 struct scatterlist *s;
@@ -404,7 +405,8 @@ static void calgary_unmap_sg(struct device *dev,
404} 405}
405 406
406static int calgary_map_sg(struct device *dev, struct scatterlist *sg, 407static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
407 int nelems, int direction) 408 int nelems, enum dma_data_direction dir,
409 struct dma_attrs *attrs)
408{ 410{
409 struct iommu_table *tbl = find_iommu_table(dev); 411 struct iommu_table *tbl = find_iommu_table(dev);
410 struct scatterlist *s; 412 struct scatterlist *s;
@@ -429,15 +431,14 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
429 s->dma_address = (entry << PAGE_SHIFT) | s->offset; 431 s->dma_address = (entry << PAGE_SHIFT) | s->offset;
430 432
431 /* insert into HW table */ 433 /* insert into HW table */
432 tce_build(tbl, entry, npages, vaddr & PAGE_MASK, 434 tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir);
433 direction);
434 435
435 s->dma_length = s->length; 436 s->dma_length = s->length;
436 } 437 }
437 438
438 return nelems; 439 return nelems;
439error: 440error:
440 calgary_unmap_sg(dev, sg, nelems, direction); 441 calgary_unmap_sg(dev, sg, nelems, dir, NULL);
441 for_each_sg(sg, s, nelems, i) { 442 for_each_sg(sg, s, nelems, i) {
442 sg->dma_address = bad_dma_address; 443 sg->dma_address = bad_dma_address;
443 sg->dma_length = 0; 444 sg->dma_length = 0;
@@ -445,10 +446,12 @@ error:
445 return 0; 446 return 0;
446} 447}
447 448
448static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, 449static dma_addr_t calgary_map_page(struct device *dev, struct page *page,
449 size_t size, int direction) 450 unsigned long offset, size_t size,
451 enum dma_data_direction dir,
452 struct dma_attrs *attrs)
450{ 453{
451 void *vaddr = phys_to_virt(paddr); 454 void *vaddr = page_address(page) + offset;
452 unsigned long uaddr; 455 unsigned long uaddr;
453 unsigned int npages; 456 unsigned int npages;
454 struct iommu_table *tbl = find_iommu_table(dev); 457 struct iommu_table *tbl = find_iommu_table(dev);
@@ -456,17 +459,18 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
456 uaddr = (unsigned long)vaddr; 459 uaddr = (unsigned long)vaddr;
457 npages = iommu_num_pages(uaddr, size, PAGE_SIZE); 460 npages = iommu_num_pages(uaddr, size, PAGE_SIZE);
458 461
459 return iommu_alloc(dev, tbl, vaddr, npages, direction); 462 return iommu_alloc(dev, tbl, vaddr, npages, dir);
460} 463}
461 464
462static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, 465static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr,
463 size_t size, int direction) 466 size_t size, enum dma_data_direction dir,
467 struct dma_attrs *attrs)
464{ 468{
465 struct iommu_table *tbl = find_iommu_table(dev); 469 struct iommu_table *tbl = find_iommu_table(dev);
466 unsigned int npages; 470 unsigned int npages;
467 471
468 npages = iommu_num_pages(dma_handle, size, PAGE_SIZE); 472 npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
469 iommu_free(tbl, dma_handle, npages); 473 iommu_free(tbl, dma_addr, npages);
470} 474}
471 475
472static void* calgary_alloc_coherent(struct device *dev, size_t size, 476static void* calgary_alloc_coherent(struct device *dev, size_t size,
@@ -515,13 +519,13 @@ static void calgary_free_coherent(struct device *dev, size_t size,
515 free_pages((unsigned long)vaddr, get_order(size)); 519 free_pages((unsigned long)vaddr, get_order(size));
516} 520}
517 521
518static struct dma_mapping_ops calgary_dma_ops = { 522static struct dma_map_ops calgary_dma_ops = {
519 .alloc_coherent = calgary_alloc_coherent, 523 .alloc_coherent = calgary_alloc_coherent,
520 .free_coherent = calgary_free_coherent, 524 .free_coherent = calgary_free_coherent,
521 .map_single = calgary_map_single,
522 .unmap_single = calgary_unmap_single,
523 .map_sg = calgary_map_sg, 525 .map_sg = calgary_map_sg,
524 .unmap_sg = calgary_unmap_sg, 526 .unmap_sg = calgary_unmap_sg,
527 .map_page = calgary_map_page,
528 .unmap_page = calgary_unmap_page,
525}; 529};
526 530
527static inline void __iomem * busno_to_bbar(unsigned char num) 531static inline void __iomem * busno_to_bbar(unsigned char num)
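The Calgary conversion above shows the general pattern of this series: the
old map_single/unmap_single hooks disappear and map_page/unmap_page take
over, since a physical address is just a (page, offset) pair. A toy
illustration of why nothing is lost (PAGE_SHIFT, the 1:1 mapping and the
helper names are simplified stand-ins, not the kernel's definitions):

#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT 12
#define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

typedef unsigned long dma_addr_t;
typedef unsigned long phys_addr_t;

/* new-style hook: page number plus offset within the page */
static dma_addr_t map_page(unsigned long pfn, unsigned long offset,
			   size_t size)
{
	return (pfn << PAGE_SHIFT) + offset;	/* nommu-style identity map */
}

/* old-style entry point, expressed in terms of the new one */
static dma_addr_t map_single(phys_addr_t paddr, size_t size)
{
	return map_page(paddr >> PAGE_SHIFT, paddr & ~PAGE_MASK, size);
}

int main(void)
{
	printf("%#lx\n", map_single(0x12345678UL, 64));	/* prints 0x12345678 */
	return 0;
}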
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index b25428533141..90f5b9ef5def 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -1,4 +1,5 @@
1#include <linux/dma-mapping.h> 1#include <linux/dma-mapping.h>
2#include <linux/dma-debug.h>
2#include <linux/dmar.h> 3#include <linux/dmar.h>
3#include <linux/bootmem.h> 4#include <linux/bootmem.h>
4#include <linux/pci.h> 5#include <linux/pci.h>
@@ -12,7 +13,7 @@
12 13
13static int forbid_dac __read_mostly; 14static int forbid_dac __read_mostly;
14 15
15struct dma_mapping_ops *dma_ops; 16struct dma_map_ops *dma_ops;
16EXPORT_SYMBOL(dma_ops); 17EXPORT_SYMBOL(dma_ops);
17 18
18static int iommu_sac_force __read_mostly; 19static int iommu_sac_force __read_mostly;
@@ -44,6 +45,9 @@ struct device x86_dma_fallback_dev = {
44}; 45};
45EXPORT_SYMBOL(x86_dma_fallback_dev); 46EXPORT_SYMBOL(x86_dma_fallback_dev);
46 47
48/* Number of entries preallocated for DMA-API debugging */
49#define PREALLOC_DMA_DEBUG_ENTRIES 32768
50
47int dma_set_mask(struct device *dev, u64 mask) 51int dma_set_mask(struct device *dev, u64 mask)
48{ 52{
49 if (!dev->dma_mask || !dma_supported(dev, mask)) 53 if (!dev->dma_mask || !dma_supported(dev, mask))
@@ -224,7 +228,7 @@ early_param("iommu", iommu_setup);
224 228
225int dma_supported(struct device *dev, u64 mask) 229int dma_supported(struct device *dev, u64 mask)
226{ 230{
227 struct dma_mapping_ops *ops = get_dma_ops(dev); 231 struct dma_map_ops *ops = get_dma_ops(dev);
228 232
229#ifdef CONFIG_PCI 233#ifdef CONFIG_PCI
230 if (mask > 0xffffffff && forbid_dac > 0) { 234 if (mask > 0xffffffff && forbid_dac > 0) {
@@ -265,6 +269,12 @@ EXPORT_SYMBOL(dma_supported);
265 269
266static int __init pci_iommu_init(void) 270static int __init pci_iommu_init(void)
267{ 271{
272 dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
273
274#ifdef CONFIG_PCI
275 dma_debug_add_bus(&pci_bus_type);
276#endif
277
268 calgary_iommu_init(); 278 calgary_iommu_init();
269 279
270 intel_iommu_init(); 280 intel_iommu_init();
@@ -290,8 +300,7 @@ fs_initcall(pci_iommu_init);
290static __devinit void via_no_dac(struct pci_dev *dev) 300static __devinit void via_no_dac(struct pci_dev *dev)
291{ 301{
292 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { 302 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
293 printk(KERN_INFO 303 dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
294 "PCI: VIA PCI bridge detected. Disabling DAC.\n");
295 forbid_dac = 1; 304 forbid_dac = 1;
296 } 305 }
297} 306}
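The dma_debug_init() call added above preallocates PREALLOC_DMA_DEBUG_ENTRIES
tracking slots so that every mapping can later be matched against its unmap.
A toy model of what those entries buy (the messages and the tiny table are
illustrative, not the dma-debug implementation):

#include <stdio.h>

#define MAX_ENTRIES 4	/* stands in for PREALLOC_DMA_DEBUG_ENTRIES */

struct entry { unsigned long addr, size; int used; };
static struct entry entries[MAX_ENTRIES];

static void debug_map(unsigned long addr, unsigned long size)
{
	int i;

	for (i = 0; i < MAX_ENTRIES; i++) {
		if (!entries[i].used) {
			entries[i].addr = addr;
			entries[i].size = size;
			entries[i].used = 1;
			return;
		}
	}
	printf("out of preallocated entries\n");
}

static void debug_unmap(unsigned long addr, unsigned long size)
{
	int i;

	for (i = 0; i < MAX_ENTRIES; i++) {
		if (entries[i].used && entries[i].addr == addr) {
			if (entries[i].size != size)
				printf("unmap size mismatch!\n");
			entries[i].used = 0;
			return;
		}
	}
	printf("unmap without a matching map!\n");
}

int main(void)
{
	debug_map(0x1000, 4096);
	debug_unmap(0x1000, 2048);	/* flags the size mismatch */
	return 0;
}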
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index d5768b1af080..b284b58c035c 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -255,10 +255,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
255} 255}
256 256
257/* Map a single area into the IOMMU */ 257/* Map a single area into the IOMMU */
258static dma_addr_t 258static dma_addr_t gart_map_page(struct device *dev, struct page *page,
259gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) 259 unsigned long offset, size_t size,
260 enum dma_data_direction dir,
261 struct dma_attrs *attrs)
260{ 262{
261 unsigned long bus; 263 unsigned long bus;
264 phys_addr_t paddr = page_to_phys(page) + offset;
262 265
263 if (!dev) 266 if (!dev)
264 dev = &x86_dma_fallback_dev; 267 dev = &x86_dma_fallback_dev;
@@ -275,8 +278,9 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
275/* 278/*
276 * Free a DMA mapping. 279 * Free a DMA mapping.
277 */ 280 */
278static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, 281static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
279 size_t size, int direction) 282 size_t size, enum dma_data_direction dir,
283 struct dma_attrs *attrs)
280{ 284{
281 unsigned long iommu_page; 285 unsigned long iommu_page;
282 int npages; 286 int npages;
@@ -298,8 +302,8 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
298/* 302/*
299 * Wrapper for pci_unmap_single working with scatterlists. 303 * Wrapper for pci_unmap_single working with scatterlists.
300 */ 304 */
301static void 305static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
302gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) 306 enum dma_data_direction dir, struct dma_attrs *attrs)
303{ 307{
304 struct scatterlist *s; 308 struct scatterlist *s;
305 int i; 309 int i;
@@ -307,7 +311,7 @@ gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
307 for_each_sg(sg, s, nents, i) { 311 for_each_sg(sg, s, nents, i) {
308 if (!s->dma_length || !s->length) 312 if (!s->dma_length || !s->length)
309 break; 313 break;
310 gart_unmap_single(dev, s->dma_address, s->dma_length, dir); 314 gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL);
311 } 315 }
312} 316}
313 317
@@ -329,7 +333,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
329 addr = dma_map_area(dev, addr, s->length, dir, 0); 333 addr = dma_map_area(dev, addr, s->length, dir, 0);
330 if (addr == bad_dma_address) { 334 if (addr == bad_dma_address) {
331 if (i > 0) 335 if (i > 0)
332 gart_unmap_sg(dev, sg, i, dir); 336 gart_unmap_sg(dev, sg, i, dir, NULL);
333 nents = 0; 337 nents = 0;
334 sg[0].dma_length = 0; 338 sg[0].dma_length = 0;
335 break; 339 break;
@@ -400,8 +404,8 @@ dma_map_cont(struct device *dev, struct scatterlist *start, int nelems,
400 * DMA map all entries in a scatterlist. 404 * DMA map all entries in a scatterlist.
401 * Merge chunks that have page aligned sizes into a continuous mapping. 405 * Merge chunks that have page aligned sizes into a continuous mapping.
402 */ 406 */
403static int 407static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
404gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) 408 enum dma_data_direction dir, struct dma_attrs *attrs)
405{ 409{
406 struct scatterlist *s, *ps, *start_sg, *sgmap; 410 struct scatterlist *s, *ps, *start_sg, *sgmap;
407 int need = 0, nextneed, i, out, start; 411 int need = 0, nextneed, i, out, start;
@@ -468,7 +472,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
468 472
469error: 473error:
470 flush_gart(); 474 flush_gart();
471 gart_unmap_sg(dev, sg, out, dir); 475 gart_unmap_sg(dev, sg, out, dir, NULL);
472 476
473 /* When it was forced or merged try again in a dumb way */ 477 /* When it was forced or merged try again in a dumb way */
474 if (force_iommu || iommu_merge) { 478 if (force_iommu || iommu_merge) {
@@ -521,7 +525,7 @@ static void
521gart_free_coherent(struct device *dev, size_t size, void *vaddr, 525gart_free_coherent(struct device *dev, size_t size, void *vaddr,
522 dma_addr_t dma_addr) 526 dma_addr_t dma_addr)
523{ 527{
524 gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL); 528 gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
525 free_pages((unsigned long)vaddr, get_order(size)); 529 free_pages((unsigned long)vaddr, get_order(size));
526} 530}
527 531
@@ -707,11 +711,11 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
707 return -1; 711 return -1;
708} 712}
709 713
710static struct dma_mapping_ops gart_dma_ops = { 714static struct dma_map_ops gart_dma_ops = {
711 .map_single = gart_map_single,
712 .unmap_single = gart_unmap_single,
713 .map_sg = gart_map_sg, 715 .map_sg = gart_map_sg,
714 .unmap_sg = gart_unmap_sg, 716 .unmap_sg = gart_unmap_sg,
717 .map_page = gart_map_page,
718 .unmap_page = gart_unmap_page,
715 .alloc_coherent = gart_alloc_coherent, 719 .alloc_coherent = gart_alloc_coherent,
716 .free_coherent = gart_free_coherent, 720 .free_coherent = gart_free_coherent,
717}; 721};
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index c70ab5a5d4c8..c6d703b39326 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -1,14 +1,14 @@
1/* Fallback functions when the main IOMMU code is not compiled in. This 1/* Fallback functions when the main IOMMU code is not compiled in. This
2 code is roughly equivalent to i386. */ 2 code is roughly equivalent to i386. */
3#include <linux/mm.h>
4#include <linux/init.h>
5#include <linux/pci.h>
6#include <linux/string.h>
7#include <linux/dma-mapping.h> 3#include <linux/dma-mapping.h>
8#include <linux/scatterlist.h> 4#include <linux/scatterlist.h>
5#include <linux/string.h>
6#include <linux/init.h>
7#include <linux/pci.h>
8#include <linux/mm.h>
9 9
10#include <asm/iommu.h>
11#include <asm/processor.h> 10#include <asm/processor.h>
11#include <asm/iommu.h>
12#include <asm/dma.h> 12#include <asm/dma.h>
13 13
14static int 14static int
@@ -25,19 +25,19 @@ check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
25 return 1; 25 return 1;
26} 26}
27 27
28static dma_addr_t 28static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
29nommu_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, 29 unsigned long offset, size_t size,
30 int direction) 30 enum dma_data_direction dir,
31 struct dma_attrs *attrs)
31{ 32{
32 dma_addr_t bus = paddr; 33 dma_addr_t bus = page_to_phys(page) + offset;
33 WARN_ON(size == 0); 34 WARN_ON(size == 0);
34 if (!check_addr("map_single", hwdev, bus, size)) 35 if (!check_addr("map_single", dev, bus, size))
35 return bad_dma_address; 36 return bad_dma_address;
36 flush_write_buffers(); 37 flush_write_buffers();
37 return bus; 38 return bus;
38} 39}
39 40
40
41/* Map a set of buffers described by scatterlist in streaming 41/* Map a set of buffers described by scatterlist in streaming
42 * mode for DMA. This is the scatter-gather version of the 42 * mode for DMA. This is the scatter-gather version of the
43 * above pci_map_single interface. Here the scatter gather list 43 * above pci_map_single interface. Here the scatter gather list
@@ -54,7 +54,8 @@ nommu_map_single(struct device *hwdev, phys_addr_t paddr, size_t size,
54 * the same here. 54 * the same here.
55 */ 55 */
56static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, 56static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
57 int nents, int direction) 57 int nents, enum dma_data_direction dir,
58 struct dma_attrs *attrs)
58{ 59{
59 struct scatterlist *s; 60 struct scatterlist *s;
60 int i; 61 int i;
@@ -78,12 +79,12 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
78 free_pages((unsigned long)vaddr, get_order(size)); 79 free_pages((unsigned long)vaddr, get_order(size));
79} 80}
80 81
81struct dma_mapping_ops nommu_dma_ops = { 82struct dma_map_ops nommu_dma_ops = {
82 .alloc_coherent = dma_generic_alloc_coherent, 83 .alloc_coherent = dma_generic_alloc_coherent,
83 .free_coherent = nommu_free_coherent, 84 .free_coherent = nommu_free_coherent,
84 .map_single = nommu_map_single, 85 .map_sg = nommu_map_sg,
85 .map_sg = nommu_map_sg, 86 .map_page = nommu_map_page,
86 .is_phys = 1, 87 .is_phys = 1,
87}; 88};
88 89
89void __init no_iommu_init(void) 90void __init no_iommu_init(void)
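
With no IOMMU there is nothing to set up: bus addresses equal physical addresses, so a nommu map_sg only validates and publishes them. A minimal sketch of that idea (illustrative, not the file's verbatim body):

static int nommu_map_sg_sketch(struct device *hwdev, struct scatterlist *sg,
			       int nents, enum dma_data_direction dir,
			       struct dma_attrs *attrs)
{
	struct scatterlist *s;
	int i;

	for_each_sg(sg, s, nents, i) {
		s->dma_address = sg_phys(s);	/* identity bus mapping */
		if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
			return 0;
		s->dma_length = s->length;
	}
	flush_write_buffers();
	return nents;
}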
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb.c
index d59c91747665..34f12e9996ed 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -33,18 +33,11 @@ phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
33 return baddr; 33 return baddr;
34} 34}
35 35
36int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size) 36int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
37{ 37{
38 return 0; 38 return 0;
39} 39}
40 40
41static dma_addr_t
42swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
43 int direction)
44{
45 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
46}
47
48static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, 41static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
49 dma_addr_t *dma_handle, gfp_t flags) 42 dma_addr_t *dma_handle, gfp_t flags)
50{ 43{
@@ -57,20 +50,20 @@ static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
57 return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); 50 return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
58} 51}
59 52
60struct dma_mapping_ops swiotlb_dma_ops = { 53struct dma_map_ops swiotlb_dma_ops = {
61 .mapping_error = swiotlb_dma_mapping_error, 54 .mapping_error = swiotlb_dma_mapping_error,
62 .alloc_coherent = x86_swiotlb_alloc_coherent, 55 .alloc_coherent = x86_swiotlb_alloc_coherent,
63 .free_coherent = swiotlb_free_coherent, 56 .free_coherent = swiotlb_free_coherent,
64 .map_single = swiotlb_map_single_phys,
65 .unmap_single = swiotlb_unmap_single,
66 .sync_single_for_cpu = swiotlb_sync_single_for_cpu, 57 .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
67 .sync_single_for_device = swiotlb_sync_single_for_device, 58 .sync_single_for_device = swiotlb_sync_single_for_device,
68 .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, 59 .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
69 .sync_single_range_for_device = swiotlb_sync_single_range_for_device, 60 .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
70 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, 61 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
71 .sync_sg_for_device = swiotlb_sync_sg_for_device, 62 .sync_sg_for_device = swiotlb_sync_sg_for_device,
72 .map_sg = swiotlb_map_sg, 63 .map_sg = swiotlb_map_sg_attrs,
73 .unmap_sg = swiotlb_unmap_sg, 64 .unmap_sg = swiotlb_unmap_sg_attrs,
65 .map_page = swiotlb_map_page,
66 .unmap_page = swiotlb_unmap_page,
74 .dma_supported = NULL, 67 .dma_supported = NULL,
75}; 68};
76 69
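
The struct dma_attrs * argument threaded through every method above comes from the _attrs variants of the DMA API; swiotlb already exported those, which is why .map_sg can point straight at swiotlb_map_sg_attrs. A caller opts in roughly like this (illustrative use of the dma-attrs API of this era):

static int map_with_barrier(struct device *dev, struct scatterlist *sgl,
			    int nents)
{
	DEFINE_DMA_ATTRS(attrs);		/* declares and zeroes attrs */

	dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
	return dma_map_sg_attrs(dev, sgl, nents, DMA_TO_DEVICE, &attrs);
}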
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms_32.c
index 675a48c404a5..071e7fea42e5 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms_32.c
@@ -18,7 +18,7 @@
18#include <asm/setup.h> 18#include <asm/setup.h>
19#include <asm/sections.h> 19#include <asm/sections.h>
20#include <asm/io.h> 20#include <asm/io.h>
21#include <setup_arch.h> 21#include <asm/setup_arch.h>
22 22
23static struct resource system_rom_resource = { 23static struct resource system_rom_resource = {
 24 .name = "System ROM", 24 .name = "System ROM",
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6d12f7e37f8c..ca989158e847 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,16 +1,19 @@
1#include <linux/errno.h> 1#include <linux/errno.h>
2#include <linux/kernel.h> 2#include <linux/kernel.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4#include <asm/idle.h>
5#include <linux/smp.h> 4#include <linux/smp.h>
5#include <linux/prctl.h>
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/ftrace.h> 11#include <trace/power.h>
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/apic.h> 13#include <asm/apic.h>
14#include <asm/idle.h>
15#include <asm/uaccess.h>
16#include <asm/i387.h>
14 17
15unsigned long idle_halt; 18unsigned long idle_halt;
16EXPORT_SYMBOL(idle_halt); 19EXPORT_SYMBOL(idle_halt);
@@ -19,6 +22,9 @@ EXPORT_SYMBOL(idle_nomwait);
19 22
20struct kmem_cache *task_xstate_cachep; 23struct kmem_cache *task_xstate_cachep;
21 24
25DEFINE_TRACE(power_start);
26DEFINE_TRACE(power_end);
27
22int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 28int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
23{ 29{
24 *dst = *src; 30 *dst = *src;
@@ -56,6 +62,193 @@ void arch_task_cache_init(void)
56} 62}
57 63
58/* 64/*
65 * Free current thread data structures etc..
66 */
67void exit_thread(void)
68{
69 struct task_struct *me = current;
70 struct thread_struct *t = &me->thread;
71 unsigned long *bp = t->io_bitmap_ptr;
72
73 if (bp) {
74 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
75
76 t->io_bitmap_ptr = NULL;
77 clear_thread_flag(TIF_IO_BITMAP);
78 /*
79 * Careful, clear this in the TSS too:
80 */
81 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
82 t->io_bitmap_max = 0;
83 put_cpu();
84 kfree(bp);
85 }
86
87 ds_exit_thread(current);
88}
89
90void flush_thread(void)
91{
92 struct task_struct *tsk = current;
93
94#ifdef CONFIG_X86_64
95 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
96 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
97 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
98 clear_tsk_thread_flag(tsk, TIF_IA32);
99 } else {
100 set_tsk_thread_flag(tsk, TIF_IA32);
101 current_thread_info()->status |= TS_COMPAT;
102 }
103 }
104#endif
105
106 clear_tsk_thread_flag(tsk, TIF_DEBUG);
107
108 tsk->thread.debugreg0 = 0;
109 tsk->thread.debugreg1 = 0;
110 tsk->thread.debugreg2 = 0;
111 tsk->thread.debugreg3 = 0;
112 tsk->thread.debugreg6 = 0;
113 tsk->thread.debugreg7 = 0;
114 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
115 /*
116 * Forget coprocessor state..
117 */
118 tsk->fpu_counter = 0;
119 clear_fpu(tsk);
120 clear_used_math();
121}
122
123static void hard_disable_TSC(void)
124{
125 write_cr4(read_cr4() | X86_CR4_TSD);
126}
127
128void disable_TSC(void)
129{
130 preempt_disable();
131 if (!test_and_set_thread_flag(TIF_NOTSC))
132 /*
133 * Must flip the CPU state synchronously with
134 * TIF_NOTSC in the current running context.
135 */
136 hard_disable_TSC();
137 preempt_enable();
138}
139
140static void hard_enable_TSC(void)
141{
142 write_cr4(read_cr4() & ~X86_CR4_TSD);
143}
144
145static void enable_TSC(void)
146{
147 preempt_disable();
148 if (test_and_clear_thread_flag(TIF_NOTSC))
149 /*
150 * Must flip the CPU state synchronously with
151 * TIF_NOTSC in the current running context.
152 */
153 hard_enable_TSC();
154 preempt_enable();
155}
156
157int get_tsc_mode(unsigned long adr)
158{
159 unsigned int val;
160
161 if (test_thread_flag(TIF_NOTSC))
162 val = PR_TSC_SIGSEGV;
163 else
164 val = PR_TSC_ENABLE;
165
166 return put_user(val, (unsigned int __user *)adr);
167}
168
169int set_tsc_mode(unsigned int val)
170{
171 if (val == PR_TSC_SIGSEGV)
172 disable_TSC();
173 else if (val == PR_TSC_ENABLE)
174 enable_TSC();
175 else
176 return -EINVAL;
177
178 return 0;
179}
180
181void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
182 struct tss_struct *tss)
183{
184 struct thread_struct *prev, *next;
185
186 prev = &prev_p->thread;
187 next = &next_p->thread;
188
189 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
190 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
191 ds_switch_to(prev_p, next_p);
192 else if (next->debugctlmsr != prev->debugctlmsr)
193 update_debugctlmsr(next->debugctlmsr);
194
195 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
196 set_debugreg(next->debugreg0, 0);
197 set_debugreg(next->debugreg1, 1);
198 set_debugreg(next->debugreg2, 2);
199 set_debugreg(next->debugreg3, 3);
200 /* no 4 and 5 */
201 set_debugreg(next->debugreg6, 6);
202 set_debugreg(next->debugreg7, 7);
203 }
204
205 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
206 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
207 /* prev and next are different */
208 if (test_tsk_thread_flag(next_p, TIF_NOTSC))
209 hard_disable_TSC();
210 else
211 hard_enable_TSC();
212 }
213
214 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
215 /*
216 * Copy the relevant range of the IO bitmap.
217 * Normally this is 128 bytes or less:
218 */
219 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
220 max(prev->io_bitmap_max, next->io_bitmap_max));
221 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
222 /*
223 * Clear any possible leftover bits:
224 */
225 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
226 }
227}
228
229int sys_fork(struct pt_regs *regs)
230{
231 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
232}
233
234/*
235 * This is trivial, and on the face of it looks like it
236 * could equally well be done in user mode.
237 *
238 * Not so, for quite unobvious reasons - register pressure.
239 * In user mode vfork() cannot have a stack frame, and if
240 * done by calling the "clone()" system call directly, you
241 * do not have enough call-clobbered registers to hold all
242 * the information you need.
243 */
244int sys_vfork(struct pt_regs *regs)
245{
246 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
247 NULL, NULL);
248}
249
250
251/*
59 * Idle related variables and functions 252 * Idle related variables and functions
60 */ 253 */
61unsigned long boot_option_idle_override = 0; 254unsigned long boot_option_idle_override = 0;
@@ -135,7 +328,7 @@ void stop_this_cpu(void *dummy)
135 /* 328 /*
136 * Remove this CPU: 329 * Remove this CPU:
137 */ 330 */
138 cpu_clear(smp_processor_id(), cpu_online_map); 331 set_cpu_online(smp_processor_id(), false);
139 disable_local_APIC(); 332 disable_local_APIC();
140 333
141 for (;;) { 334 for (;;) {
@@ -285,12 +478,13 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
285 return 1; 478 return 1;
286} 479}
287 480
288static cpumask_t c1e_mask = CPU_MASK_NONE; 481static cpumask_var_t c1e_mask;
289static int c1e_detected; 482static int c1e_detected;
290 483
291void c1e_remove_cpu(int cpu) 484void c1e_remove_cpu(int cpu)
292{ 485{
293 cpu_clear(cpu, c1e_mask); 486 if (c1e_mask != NULL)
487 cpumask_clear_cpu(cpu, c1e_mask);
294} 488}
295 489
296/* 490/*
@@ -319,8 +513,8 @@ static void c1e_idle(void)
319 if (c1e_detected) { 513 if (c1e_detected) {
320 int cpu = smp_processor_id(); 514 int cpu = smp_processor_id();
321 515
322 if (!cpu_isset(cpu, c1e_mask)) { 516 if (!cpumask_test_cpu(cpu, c1e_mask)) {
323 cpu_set(cpu, c1e_mask); 517 cpumask_set_cpu(cpu, c1e_mask);
 324 /* 518 /*
 325 * Force broadcast so ACPI can not interfere. Needs 519 * Force broadcast so ACPI can not interfere. Needs
 326 * to run with interrupts enabled as it uses 520 * to run with interrupts enabled as it uses
@@ -350,7 +544,7 @@ static void c1e_idle(void)
350 544
351void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) 545void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
352{ 546{
353#ifdef CONFIG_X86_SMP 547#ifdef CONFIG_SMP
354 if (pm_idle == poll_idle && smp_num_siblings > 1) { 548 if (pm_idle == poll_idle && smp_num_siblings > 1) {
355 printk(KERN_WARNING "WARNING: polling idle and HT enabled," 549 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
356 " performance may degrade.\n"); 550 " performance may degrade.\n");
@@ -372,6 +566,15 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
372 pm_idle = default_idle; 566 pm_idle = default_idle;
373} 567}
374 568
569void __init init_c1e_mask(void)
570{
571 /* If we're using c1e_idle, we need to allocate c1e_mask. */
572 if (pm_idle == c1e_idle) {
573 alloc_cpumask_var(&c1e_mask, GFP_KERNEL);
574 cpumask_clear(c1e_mask);
575 }
576}
577
375static int __init idle_setup(char *str) 578static int __init idle_setup(char *str)
376{ 579{
 377 if (!str) 580 if (!str)
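
get_tsc_mode()/set_tsc_mode(), now shared by both word sizes above, are the backends of prctl(PR_GET_TSC)/prctl(PR_SET_TSC). A small runnable userspace sketch:

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	int mode = 0;

	if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0))
		perror("PR_SET_TSC");
	if (prctl(PR_GET_TSC, &mode, 0, 0, 0) == 0)
		printf("rdtsc is now %s\n", mode == PR_TSC_SIGSEGV ?
		       "trapped (SIGSEGV)" : "allowed");
	return 0;	/* executing rdtsc here would deliver SIGSEGV */
}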
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a546f55c77b4..76f8f84043a2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -11,6 +11,7 @@
11 11
12#include <stdarg.h> 12#include <stdarg.h>
13 13
14#include <linux/stackprotector.h>
14#include <linux/cpu.h> 15#include <linux/cpu.h>
15#include <linux/errno.h> 16#include <linux/errno.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
@@ -66,9 +67,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
66DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; 67DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
67EXPORT_PER_CPU_SYMBOL(current_task); 68EXPORT_PER_CPU_SYMBOL(current_task);
68 69
69DEFINE_PER_CPU(int, cpu_number);
70EXPORT_PER_CPU_SYMBOL(cpu_number);
71
72/* 70/*
73 * Return saved PC of a blocked thread. 71 * Return saved PC of a blocked thread.
74 */ 72 */
@@ -94,6 +92,15 @@ void cpu_idle(void)
94{ 92{
95 int cpu = smp_processor_id(); 93 int cpu = smp_processor_id();
96 94
95 /*
96 * If we're the non-boot CPU, nothing set the stack canary up
97 * for us. CPU0 already has it initialized but no harm in
98 * doing it again. This is a good place for updating it, as
 99 * for us. CPU0 already has it initialized but no harm in
 99 * we won't ever return from this function (so the invalid
 100 * canaries already on the stack won't ever trigger).
101 */
102 boot_init_stack_canary();
103
97 current_thread_info()->status |= TS_POLLING; 104 current_thread_info()->status |= TS_POLLING;
98 105
99 /* endless idle loop with no priority at all */ 106 /* endless idle loop with no priority at all */
@@ -104,14 +111,10 @@ void cpu_idle(void)
104 check_pgt_cache(); 111 check_pgt_cache();
105 rmb(); 112 rmb();
106 113
107 if (rcu_pending(cpu))
108 rcu_check_callbacks(cpu, 0);
109
110 if (cpu_is_offline(cpu)) 114 if (cpu_is_offline(cpu))
111 play_dead(); 115 play_dead();
112 116
113 local_irq_disable(); 117 local_irq_disable();
114 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
115 /* Don't trace irqs off for idle */ 118 /* Don't trace irqs off for idle */
116 stop_critical_timings(); 119 stop_critical_timings();
117 pm_idle(); 120 pm_idle();
@@ -135,7 +138,7 @@ void __show_regs(struct pt_regs *regs, int all)
135 if (user_mode_vm(regs)) { 138 if (user_mode_vm(regs)) {
136 sp = regs->sp; 139 sp = regs->sp;
137 ss = regs->ss & 0xffff; 140 ss = regs->ss & 0xffff;
138 savesegment(gs, gs); 141 gs = get_user_gs(regs);
139 } else { 142 } else {
140 sp = (unsigned long) (&regs->sp); 143 sp = (unsigned long) (&regs->sp);
141 savesegment(ss, ss); 144 savesegment(ss, ss);
@@ -216,6 +219,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
216 regs.ds = __USER_DS; 219 regs.ds = __USER_DS;
217 regs.es = __USER_DS; 220 regs.es = __USER_DS;
218 regs.fs = __KERNEL_PERCPU; 221 regs.fs = __KERNEL_PERCPU;
222 regs.gs = __KERNEL_STACK_CANARY;
219 regs.orig_ax = -1; 223 regs.orig_ax = -1;
220 regs.ip = (unsigned long) kernel_thread_helper; 224 regs.ip = (unsigned long) kernel_thread_helper;
221 regs.cs = __KERNEL_CS | get_kernel_rpl(); 225 regs.cs = __KERNEL_CS | get_kernel_rpl();
@@ -226,55 +230,6 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
226} 230}
227EXPORT_SYMBOL(kernel_thread); 231EXPORT_SYMBOL(kernel_thread);
228 232
229/*
230 * Free current thread data structures etc..
231 */
232void exit_thread(void)
233{
234 /* The process may have allocated an io port bitmap... nuke it. */
235 if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
236 struct task_struct *tsk = current;
237 struct thread_struct *t = &tsk->thread;
238 int cpu = get_cpu();
239 struct tss_struct *tss = &per_cpu(init_tss, cpu);
240
241 kfree(t->io_bitmap_ptr);
242 t->io_bitmap_ptr = NULL;
243 clear_thread_flag(TIF_IO_BITMAP);
244 /*
245 * Careful, clear this in the TSS too:
246 */
247 memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
248 t->io_bitmap_max = 0;
249 tss->io_bitmap_owner = NULL;
250 tss->io_bitmap_max = 0;
251 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
252 put_cpu();
253 }
254
255 ds_exit_thread(current);
256}
257
258void flush_thread(void)
259{
260 struct task_struct *tsk = current;
261
262 tsk->thread.debugreg0 = 0;
263 tsk->thread.debugreg1 = 0;
264 tsk->thread.debugreg2 = 0;
265 tsk->thread.debugreg3 = 0;
266 tsk->thread.debugreg6 = 0;
267 tsk->thread.debugreg7 = 0;
268 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
269 clear_tsk_thread_flag(tsk, TIF_DEBUG);
270 /*
271 * Forget coprocessor state..
272 */
273 tsk->fpu_counter = 0;
274 clear_fpu(tsk);
275 clear_used_math();
276}
277
278void release_thread(struct task_struct *dead_task) 233void release_thread(struct task_struct *dead_task)
279{ 234{
280 BUG_ON(dead_task->mm); 235 BUG_ON(dead_task->mm);
@@ -290,7 +245,7 @@ void prepare_to_copy(struct task_struct *tsk)
290 unlazy_fpu(tsk); 245 unlazy_fpu(tsk);
291} 246}
292 247
293int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, 248int copy_thread(unsigned long clone_flags, unsigned long sp,
294 unsigned long unused, 249 unsigned long unused,
295 struct task_struct *p, struct pt_regs *regs) 250 struct task_struct *p, struct pt_regs *regs)
296{ 251{
@@ -308,7 +263,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
308 263
309 p->thread.ip = (unsigned long) ret_from_fork; 264 p->thread.ip = (unsigned long) ret_from_fork;
310 265
311 savesegment(gs, p->thread.gs); 266 task_user_gs(p) = get_user_gs(regs);
312 267
313 tsk = current; 268 tsk = current;
314 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 269 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -346,7 +301,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
346void 301void
347start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 302start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
348{ 303{
349 __asm__("movl %0, %%gs" : : "r"(0)); 304 set_user_gs(regs, 0);
350 regs->fs = 0; 305 regs->fs = 0;
351 set_fs(USER_DS); 306 set_fs(USER_DS);
352 regs->ds = __USER_DS; 307 regs->ds = __USER_DS;
@@ -362,127 +317,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
362} 317}
363EXPORT_SYMBOL_GPL(start_thread); 318EXPORT_SYMBOL_GPL(start_thread);
364 319
365static void hard_disable_TSC(void)
366{
367 write_cr4(read_cr4() | X86_CR4_TSD);
368}
369
370void disable_TSC(void)
371{
372 preempt_disable();
373 if (!test_and_set_thread_flag(TIF_NOTSC))
374 /*
375 * Must flip the CPU state synchronously with
376 * TIF_NOTSC in the current running context.
377 */
378 hard_disable_TSC();
379 preempt_enable();
380}
381
382static void hard_enable_TSC(void)
383{
384 write_cr4(read_cr4() & ~X86_CR4_TSD);
385}
386
387static void enable_TSC(void)
388{
389 preempt_disable();
390 if (test_and_clear_thread_flag(TIF_NOTSC))
391 /*
392 * Must flip the CPU state synchronously with
393 * TIF_NOTSC in the current running context.
394 */
395 hard_enable_TSC();
396 preempt_enable();
397}
398
399int get_tsc_mode(unsigned long adr)
400{
401 unsigned int val;
402
403 if (test_thread_flag(TIF_NOTSC))
404 val = PR_TSC_SIGSEGV;
405 else
406 val = PR_TSC_ENABLE;
407
408 return put_user(val, (unsigned int __user *)adr);
409}
410
411int set_tsc_mode(unsigned int val)
412{
413 if (val == PR_TSC_SIGSEGV)
414 disable_TSC();
415 else if (val == PR_TSC_ENABLE)
416 enable_TSC();
417 else
418 return -EINVAL;
419
420 return 0;
421}
422
423static noinline void
424__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
425 struct tss_struct *tss)
426{
427 struct thread_struct *prev, *next;
428
429 prev = &prev_p->thread;
430 next = &next_p->thread;
431
432 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
433 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
434 ds_switch_to(prev_p, next_p);
435 else if (next->debugctlmsr != prev->debugctlmsr)
436 update_debugctlmsr(next->debugctlmsr);
437
438 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
439 set_debugreg(next->debugreg0, 0);
440 set_debugreg(next->debugreg1, 1);
441 set_debugreg(next->debugreg2, 2);
442 set_debugreg(next->debugreg3, 3);
443 /* no 4 and 5 */
444 set_debugreg(next->debugreg6, 6);
445 set_debugreg(next->debugreg7, 7);
446 }
447
448 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
449 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
450 /* prev and next are different */
451 if (test_tsk_thread_flag(next_p, TIF_NOTSC))
452 hard_disable_TSC();
453 else
454 hard_enable_TSC();
455 }
456
457 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
458 /*
459 * Disable the bitmap via an invalid offset. We still cache
460 * the previous bitmap owner and the IO bitmap contents:
461 */
462 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
463 return;
464 }
465
466 if (likely(next == tss->io_bitmap_owner)) {
467 /*
468 * Previous owner of the bitmap (hence the bitmap content)
 469 * matches the next task, we don't have to do anything but
470 * to set a valid offset in the TSS:
471 */
472 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
473 return;
474 }
475 /*
476 * Lazy TSS's I/O bitmap copy. We set an invalid offset here
 477 * and we let the task get a GPF in case an I/O instruction
 478 * is performed. The handler of the GPF will verify that the
 479 * faulting task has a valid I/O bitmap and, if true, does the
480 * real copy and restart the instruction. This will save us
481 * redundant copies when the currently switched task does not
482 * perform any I/O during its timeslice.
483 */
484 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
485}
486 320
 487/* 321/*
 488 * switch_to(x,y) should switch tasks from x to y. 322 * switch_to(x,y) should switch tasks from x to y.
@@ -543,7 +377,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
543 * used %fs or %gs (it does not today), or if the kernel is 377 * used %fs or %gs (it does not today), or if the kernel is
544 * running inside of a hypervisor layer. 378 * running inside of a hypervisor layer.
545 */ 379 */
546 savesegment(gs, prev->gs); 380 lazy_save_gs(prev->gs);
547 381
548 /* 382 /*
549 * Load the per-thread Thread-Local Storage descriptor. 383 * Load the per-thread Thread-Local Storage descriptor.
@@ -589,64 +423,44 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
589 * Restore %gs if needed (which is common) 423 * Restore %gs if needed (which is common)
590 */ 424 */
591 if (prev->gs | next->gs) 425 if (prev->gs | next->gs)
592 loadsegment(gs, next->gs); 426 lazy_load_gs(next->gs);
593 427
594 x86_write_percpu(current_task, next_p); 428 percpu_write(current_task, next_p);
595 429
596 return prev_p; 430 return prev_p;
597} 431}
598 432
599asmlinkage int sys_fork(struct pt_regs regs) 433int sys_clone(struct pt_regs *regs)
600{
601 return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
602}
603
604asmlinkage int sys_clone(struct pt_regs regs)
605{ 434{
606 unsigned long clone_flags; 435 unsigned long clone_flags;
607 unsigned long newsp; 436 unsigned long newsp;
608 int __user *parent_tidptr, *child_tidptr; 437 int __user *parent_tidptr, *child_tidptr;
609 438
610 clone_flags = regs.bx; 439 clone_flags = regs->bx;
611 newsp = regs.cx; 440 newsp = regs->cx;
612 parent_tidptr = (int __user *)regs.dx; 441 parent_tidptr = (int __user *)regs->dx;
613 child_tidptr = (int __user *)regs.di; 442 child_tidptr = (int __user *)regs->di;
614 if (!newsp) 443 if (!newsp)
615 newsp = regs.sp; 444 newsp = regs->sp;
616 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); 445 return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
617}
618
619/*
620 * This is trivial, and on the face of it looks like it
621 * could equally well be done in user mode.
622 *
623 * Not so, for quite unobvious reasons - register pressure.
624 * In user mode vfork() cannot have a stack frame, and if
625 * done by calling the "clone()" system call directly, you
626 * do not have enough call-clobbered registers to hold all
627 * the information you need.
628 */
629asmlinkage int sys_vfork(struct pt_regs regs)
630{
631 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
632} 446}
633 447
634/* 448/*
635 * sys_execve() executes a new program. 449 * sys_execve() executes a new program.
636 */ 450 */
637asmlinkage int sys_execve(struct pt_regs regs) 451int sys_execve(struct pt_regs *regs)
638{ 452{
639 int error; 453 int error;
640 char *filename; 454 char *filename;
641 455
642 filename = getname((char __user *) regs.bx); 456 filename = getname((char __user *) regs->bx);
643 error = PTR_ERR(filename); 457 error = PTR_ERR(filename);
644 if (IS_ERR(filename)) 458 if (IS_ERR(filename))
645 goto out; 459 goto out;
646 error = do_execve(filename, 460 error = do_execve(filename,
647 (char __user * __user *) regs.cx, 461 (char __user * __user *) regs->cx,
648 (char __user * __user *) regs.dx, 462 (char __user * __user *) regs->dx,
649 &regs); 463 regs);
650 if (error == 0) { 464 if (error == 0) {
651 /* Make sure we don't return using sysenter.. */ 465 /* Make sure we don't return using sysenter.. */
 652 set_thread_flag(TIF_IRET); 466 set_thread_flag(TIF_IRET);
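
The get_user_gs()/set_user_gs()/lazy_load_gs() helpers this file now uses abstract where the user %gs lives on 32-bit; a paraphrased sketch of the two configurations (from the asm headers of this series, details may differ slightly):

#ifdef CONFIG_X86_32_LAZY_GS
/* user %gs is left live in the register / thread_struct */
#define get_user_gs(regs)	(u16)({ unsigned long v; savesegment(gs, v); v; })
#define set_user_gs(regs, v)	loadsegment(gs, (unsigned long)(v))
#define task_user_gs(tsk)	((tsk)->thread.gs)
#else
/* stack-protector needs %gs for the canary, so user %gs is kept in pt_regs */
#define get_user_gs(regs)	(u16)((regs)->gs)
#define set_user_gs(regs, v)	do { (regs)->gs = (v); } while (0)
#define task_user_gs(tsk)	(task_pt_regs(tsk)->gs)
#endif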
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 85b4cb5c1980..b751a41392b1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -16,6 +16,7 @@
16 16
17#include <stdarg.h> 17#include <stdarg.h>
18 18
19#include <linux/stackprotector.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <linux/errno.h> 21#include <linux/errno.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
@@ -47,7 +48,6 @@
47#include <asm/processor.h> 48#include <asm/processor.h>
48#include <asm/i387.h> 49#include <asm/i387.h>
49#include <asm/mmu_context.h> 50#include <asm/mmu_context.h>
50#include <asm/pda.h>
51#include <asm/prctl.h> 51#include <asm/prctl.h>
52#include <asm/desc.h> 52#include <asm/desc.h>
53#include <asm/proto.h> 53#include <asm/proto.h>
@@ -58,6 +58,12 @@
58 58
59asmlinkage extern void ret_from_fork(void); 59asmlinkage extern void ret_from_fork(void);
60 60
61DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
62EXPORT_PER_CPU_SYMBOL(current_task);
63
64DEFINE_PER_CPU(unsigned long, old_rsp);
65static DEFINE_PER_CPU(unsigned char, is_idle);
66
61unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; 67unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
62 68
63static ATOMIC_NOTIFIER_HEAD(idle_notifier); 69static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -76,13 +82,13 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
76 82
77void enter_idle(void) 83void enter_idle(void)
78{ 84{
79 write_pda(isidle, 1); 85 percpu_write(is_idle, 1);
80 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); 86 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
81} 87}
82 88
83static void __exit_idle(void) 89static void __exit_idle(void)
84{ 90{
85 if (test_and_clear_bit_pda(0, isidle) == 0) 91 if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
86 return; 92 return;
87 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); 93 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
88} 94}
@@ -112,6 +118,16 @@ static inline void play_dead(void)
112void cpu_idle(void) 118void cpu_idle(void)
113{ 119{
114 current_thread_info()->status |= TS_POLLING; 120 current_thread_info()->status |= TS_POLLING;
121
122 /*
123 * If we're the non-boot CPU, nothing set the stack canary up
124 * for us. CPU0 already has it initialized but no harm in
125 * doing it again. This is a good place for updating it, as
 126 * we won't ever return from this function (so the invalid
 127 * canaries already on the stack won't ever trigger).
128 */
129 boot_init_stack_canary();
130
115 /* endless idle loop with no priority at all */ 131 /* endless idle loop with no priority at all */
116 while (1) { 132 while (1) {
117 tick_nohz_stop_sched_tick(1); 133 tick_nohz_stop_sched_tick(1);
@@ -221,61 +237,6 @@ void show_regs(struct pt_regs *regs)
221 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 237 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
222} 238}
223 239
224/*
225 * Free current thread data structures etc..
226 */
227void exit_thread(void)
228{
229 struct task_struct *me = current;
230 struct thread_struct *t = &me->thread;
231
232 if (me->thread.io_bitmap_ptr) {
233 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
234
235 kfree(t->io_bitmap_ptr);
236 t->io_bitmap_ptr = NULL;
237 clear_thread_flag(TIF_IO_BITMAP);
238 /*
239 * Careful, clear this in the TSS too:
240 */
241 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
242 t->io_bitmap_max = 0;
243 put_cpu();
244 }
245
246 ds_exit_thread(current);
247}
248
249void flush_thread(void)
250{
251 struct task_struct *tsk = current;
252
253 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
254 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
255 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
256 clear_tsk_thread_flag(tsk, TIF_IA32);
257 } else {
258 set_tsk_thread_flag(tsk, TIF_IA32);
259 current_thread_info()->status |= TS_COMPAT;
260 }
261 }
262 clear_tsk_thread_flag(tsk, TIF_DEBUG);
263
264 tsk->thread.debugreg0 = 0;
265 tsk->thread.debugreg1 = 0;
266 tsk->thread.debugreg2 = 0;
267 tsk->thread.debugreg3 = 0;
268 tsk->thread.debugreg6 = 0;
269 tsk->thread.debugreg7 = 0;
270 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
271 /*
272 * Forget coprocessor state..
273 */
274 tsk->fpu_counter = 0;
275 clear_fpu(tsk);
276 clear_used_math();
277}
278
279void release_thread(struct task_struct *dead_task) 240void release_thread(struct task_struct *dead_task)
280{ 241{
281 if (dead_task->mm) { 242 if (dead_task->mm) {
@@ -317,7 +278,7 @@ void prepare_to_copy(struct task_struct *tsk)
317 unlazy_fpu(tsk); 278 unlazy_fpu(tsk);
318} 279}
319 280
320int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, 281int copy_thread(unsigned long clone_flags, unsigned long sp,
321 unsigned long unused, 282 unsigned long unused,
322 struct task_struct *p, struct pt_regs *regs) 283 struct task_struct *p, struct pt_regs *regs)
323{ 284{
@@ -397,7 +358,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
397 load_gs_index(0); 358 load_gs_index(0);
398 regs->ip = new_ip; 359 regs->ip = new_ip;
399 regs->sp = new_sp; 360 regs->sp = new_sp;
400 write_pda(oldrsp, new_sp); 361 percpu_write(old_rsp, new_sp);
401 regs->cs = __USER_CS; 362 regs->cs = __USER_CS;
402 regs->ss = __USER_DS; 363 regs->ss = __USER_DS;
403 regs->flags = 0x200; 364 regs->flags = 0x200;
@@ -409,118 +370,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
409} 370}
410EXPORT_SYMBOL_GPL(start_thread); 371EXPORT_SYMBOL_GPL(start_thread);
411 372
412static void hard_disable_TSC(void)
413{
414 write_cr4(read_cr4() | X86_CR4_TSD);
415}
416
417void disable_TSC(void)
418{
419 preempt_disable();
420 if (!test_and_set_thread_flag(TIF_NOTSC))
421 /*
422 * Must flip the CPU state synchronously with
423 * TIF_NOTSC in the current running context.
424 */
425 hard_disable_TSC();
426 preempt_enable();
427}
428
429static void hard_enable_TSC(void)
430{
431 write_cr4(read_cr4() & ~X86_CR4_TSD);
432}
433
434static void enable_TSC(void)
435{
436 preempt_disable();
437 if (test_and_clear_thread_flag(TIF_NOTSC))
438 /*
439 * Must flip the CPU state synchronously with
440 * TIF_NOTSC in the current running context.
441 */
442 hard_enable_TSC();
443 preempt_enable();
444}
445
446int get_tsc_mode(unsigned long adr)
447{
448 unsigned int val;
449
450 if (test_thread_flag(TIF_NOTSC))
451 val = PR_TSC_SIGSEGV;
452 else
453 val = PR_TSC_ENABLE;
454
455 return put_user(val, (unsigned int __user *)adr);
456}
457
458int set_tsc_mode(unsigned int val)
459{
460 if (val == PR_TSC_SIGSEGV)
461 disable_TSC();
462 else if (val == PR_TSC_ENABLE)
463 enable_TSC();
464 else
465 return -EINVAL;
466
467 return 0;
468}
469
470/*
471 * This special macro can be used to load a debugging register
472 */
473#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
474
475static inline void __switch_to_xtra(struct task_struct *prev_p,
476 struct task_struct *next_p,
477 struct tss_struct *tss)
478{
479 struct thread_struct *prev, *next;
480
481 prev = &prev_p->thread,
482 next = &next_p->thread;
483
484 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
485 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
486 ds_switch_to(prev_p, next_p);
487 else if (next->debugctlmsr != prev->debugctlmsr)
488 update_debugctlmsr(next->debugctlmsr);
489
490 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
491 loaddebug(next, 0);
492 loaddebug(next, 1);
493 loaddebug(next, 2);
494 loaddebug(next, 3);
495 /* no 4 and 5 */
496 loaddebug(next, 6);
497 loaddebug(next, 7);
498 }
499
500 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
501 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
502 /* prev and next are different */
503 if (test_tsk_thread_flag(next_p, TIF_NOTSC))
504 hard_disable_TSC();
505 else
506 hard_enable_TSC();
507 }
508
509 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
510 /*
511 * Copy the relevant range of the IO bitmap.
512 * Normally this is 128 bytes or less:
513 */
514 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
515 max(prev->io_bitmap_max, next->io_bitmap_max));
516 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
517 /*
518 * Clear any possible leftover bits:
519 */
520 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
521 }
522}
523
524/* 373/*
525 * switch_to(x,y) should switch tasks from x to y. 374 * switch_to(x,y) should switch tasks from x to y.
526 * 375 *
@@ -618,21 +467,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 618 /* 467 /*
 619 * Switch the PDA and FPU contexts. 468 * Switch the PDA and FPU contexts.
 620 */ 469 */
621 prev->usersp = read_pda(oldrsp); 470 prev->usersp = percpu_read(old_rsp);
622 write_pda(oldrsp, next->usersp); 471 percpu_write(old_rsp, next->usersp);
623 write_pda(pcurrent, next_p); 472 percpu_write(current_task, next_p);
624 473
625 write_pda(kernelstack, 474 percpu_write(kernel_stack,
626 (unsigned long)task_stack_page(next_p) + 475 (unsigned long)task_stack_page(next_p) +
627 THREAD_SIZE - PDA_STACKOFFSET); 476 THREAD_SIZE - KERNEL_STACK_OFFSET);
628#ifdef CONFIG_CC_STACKPROTECTOR
629 write_pda(stack_canary, next_p->stack_canary);
630 /*
631 * Build time only check to make sure the stack_canary is at
632 * offset 40 in the pda; this is a gcc ABI requirement
633 */
634 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
635#endif
636 477
637 /* 478 /*
638 * Now maybe reload the debug registers and handle I/O bitmaps 479 * Now maybe reload the debug registers and handle I/O bitmaps
@@ -686,11 +527,6 @@ void set_personality_64bit(void)
686 current->personality &= ~READ_IMPLIES_EXEC; 527 current->personality &= ~READ_IMPLIES_EXEC;
687} 528}
688 529
689asmlinkage long sys_fork(struct pt_regs *regs)
690{
691 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
692}
693
694asmlinkage long 530asmlinkage long
695sys_clone(unsigned long clone_flags, unsigned long newsp, 531sys_clone(unsigned long clone_flags, unsigned long newsp,
696 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) 532 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
@@ -700,22 +536,6 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
700 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); 536 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
701} 537}
702 538
703/*
704 * This is trivial, and on the face of it looks like it
705 * could equally well be done in user mode.
706 *
707 * Not so, for quite unobvious reasons - register pressure.
708 * In user mode vfork() cannot have a stack frame, and if
709 * done by calling the "clone()" system call directly, you
710 * do not have enough call-clobbered registers to hold all
711 * the information you need.
712 */
713asmlinkage long sys_vfork(struct pt_regs *regs)
714{
715 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
716 NULL, NULL);
717}
718
719unsigned long get_wchan(struct task_struct *p) 539unsigned long get_wchan(struct task_struct *p)
720{ 540{
 721 unsigned long stack; 541 unsigned long stack;
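
The read_pda()/write_pda() accesses above become plain per-cpu variables; the conversion pattern, schematically:

DEFINE_PER_CPU(unsigned long, old_rsp);		/* was a struct x8664_pda field */

static void pda_to_percpu_sketch(unsigned long next_usersp)
{
	unsigned long prev_usersp;

	prev_usersp = percpu_read(old_rsp);	/* was read_pda(oldrsp) */
	percpu_write(old_rsp, next_usersp);	/* was write_pda(oldrsp, ...) */
	(void)prev_usersp;
}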
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0a5df5f82fb9..fe9345c967de 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
21#include <linux/audit.h> 21#include <linux/audit.h>
22#include <linux/seccomp.h> 22#include <linux/seccomp.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/ftrace.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include <asm/pgtable.h> 27#include <asm/pgtable.h>
@@ -75,10 +76,7 @@ static inline bool invalid_selector(u16 value)
75static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) 76static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
76{ 77{
77 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); 78 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
78 regno >>= 2; 79 return &regs->bx + (regno >> 2);
79 if (regno > FS)
80 --regno;
81 return &regs->bx + regno;
82} 80}
83 81
84static u16 get_segment_reg(struct task_struct *task, unsigned long offset) 82static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
@@ -90,9 +88,10 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
90 if (offset != offsetof(struct user_regs_struct, gs)) 88 if (offset != offsetof(struct user_regs_struct, gs))
91 retval = *pt_regs_access(task_pt_regs(task), offset); 89 retval = *pt_regs_access(task_pt_regs(task), offset);
92 else { 90 else {
93 retval = task->thread.gs;
94 if (task == current) 91 if (task == current)
95 savesegment(gs, retval); 92 retval = get_user_gs(task_pt_regs(task));
93 else
94 retval = task_user_gs(task);
96 } 95 }
97 return retval; 96 return retval;
98} 97}
@@ -126,13 +125,10 @@ static int set_segment_reg(struct task_struct *task,
126 break; 125 break;
127 126
128 case offsetof(struct user_regs_struct, gs): 127 case offsetof(struct user_regs_struct, gs):
129 task->thread.gs = value;
130 if (task == current) 128 if (task == current)
131 /* 129 set_user_gs(task_pt_regs(task), value);
132 * The user-mode %gs is not affected by 130 else
133 * kernel entry, so we must update the CPU. 131 task_user_gs(task) = value;
134 */
135 loadsegment(gs, value);
136 } 132 }
137 133
138 return 0; 134 return 0;
@@ -273,7 +269,7 @@ static unsigned long debugreg_addr_limit(struct task_struct *task)
273 if (test_tsk_thread_flag(task, TIF_IA32)) 269 if (test_tsk_thread_flag(task, TIF_IA32))
274 return IA32_PAGE_OFFSET - 3; 270 return IA32_PAGE_OFFSET - 3;
275#endif 271#endif
276 return TASK_SIZE64 - 7; 272 return TASK_SIZE_MAX - 7;
277} 273}
278 274
279#endif /* CONFIG_X86_32 */ 275#endif /* CONFIG_X86_32 */
@@ -690,9 +686,8 @@ static int ptrace_bts_config(struct task_struct *child,
690 if (!cfg.signal) 686 if (!cfg.signal)
691 return -EINVAL; 687 return -EINVAL;
692 688
693 return -EOPNOTSUPP;
694
695 child->thread.bts_ovfl_signal = cfg.signal; 689 child->thread.bts_ovfl_signal = cfg.signal;
690 return -EOPNOTSUPP;
696 } 691 }
697 692
698 if ((cfg.flags & PTRACE_BTS_O_ALLOC) && 693 if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
@@ -810,12 +805,16 @@ static void ptrace_bts_untrace(struct task_struct *child)
810 805
811static void ptrace_bts_detach(struct task_struct *child) 806static void ptrace_bts_detach(struct task_struct *child)
812{ 807{
813 if (unlikely(child->bts)) { 808 /*
814 ds_release_bts(child->bts); 809 * Ptrace_detach() races with ptrace_untrace() in case
815 child->bts = NULL; 810 * the child dies and is reaped by another thread.
816 811 *
817 ptrace_bts_free_buffer(child); 812 * We only do the memory accounting at this point and
818 } 813 * leave the buffer deallocation and the bts tracer
814 * release to ptrace_bts_untrace() which will be called
815 * later on with tasklist_lock held.
816 */
817 release_locked_buffer(child->bts_buffer, child->bts_size);
819} 818}
820#else 819#else
821static inline void ptrace_bts_fork(struct task_struct *tsk) {} 820static inline void ptrace_bts_fork(struct task_struct *tsk) {}
@@ -1384,7 +1383,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1384#ifdef CONFIG_X86_32 1383#ifdef CONFIG_X86_32
1385# define IS_IA32 1 1384# define IS_IA32 1
1386#elif defined CONFIG_IA32_EMULATION 1385#elif defined CONFIG_IA32_EMULATION
1387# define IS_IA32 test_thread_flag(TIF_IA32) 1386# define IS_IA32 is_compat_task()
1388#else 1387#else
1389# define IS_IA32 0 1388# define IS_IA32 0
1390#endif 1389#endif
@@ -1417,6 +1416,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1417 tracehook_report_syscall_entry(regs)) 1416 tracehook_report_syscall_entry(regs))
1418 ret = -1L; 1417 ret = -1L;
1419 1418
1419 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
1420 ftrace_syscall_enter(regs);
1421
1420 if (unlikely(current->audit_context)) { 1422 if (unlikely(current->audit_context)) {
1421 if (IS_IA32) 1423 if (IS_IA32)
 1422 audit_syscall_entry(AUDIT_ARCH_I386, 1424 audit_syscall_entry(AUDIT_ARCH_I386,
@@ -1440,6 +1442,9 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1440 if (unlikely(current->audit_context)) 1442 if (unlikely(current->audit_context))
1441 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1443 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1442 1444
1445 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
1446 ftrace_syscall_exit(regs);
1447
1443 if (test_thread_flag(TIF_SYSCALL_TRACE)) 1448 if (test_thread_flag(TIF_SYSCALL_TRACE))
1444 tracehook_report_syscall_exit(regs, 0); 1449 tracehook_report_syscall_exit(regs, 0);
1445 1450
@@ -1457,6 +1462,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1457 * system call instruction. 1462 * system call instruction.
1458 */ 1463 */
1459 if (test_thread_flag(TIF_SINGLESTEP) && 1464 if (test_thread_flag(TIF_SINGLESTEP) &&
1460 tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL)) 1465 tracehook_consider_fatal_signal(current, SIGTRAP))
1461 send_sigtrap(current, regs, 0, TRAP_BRKPT); 1466 send_sigtrap(current, regs, 0, TRAP_BRKPT);
1462} 1467}
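
get_segment_reg()/set_segment_reg() patched above sit at the end of the PTRACE_PEEKUSER/POKEUSER path, where userspace addresses a register by its byte offset in struct user_regs_struct. Illustrative fragment (error handling omitted; note glibc spells the 32-bit segment fields xgs etc., while the kernel struct uses gs):

#include <stddef.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

static long read_tracee_gs(pid_t pid)
{
	/* the byte offset selects the register; gs lands in get_segment_reg() */
	return ptrace(PTRACE_PEEKUSER, pid,
		      (void *)offsetof(struct user_regs_struct, gs), NULL);
}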
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 309949e9e1c1..e95022e4f5d5 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -74,8 +74,7 @@ static void ich_force_hpet_resume(void)
74 if (!force_hpet_address) 74 if (!force_hpet_address)
75 return; 75 return;
76 76
77 if (rcba_base == NULL) 77 BUG_ON(rcba_base == NULL);
78 BUG();
79 78
80 /* read the Function Disable register, dword mode only */ 79 /* read the Function Disable register, dword mode only */
81 val = readl(rcba_base + 0x3404); 80 val = readl(rcba_base + 0x3404);
@@ -172,7 +171,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4,
172 ich_force_enable_hpet); 171 ich_force_enable_hpet);
173DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, 172DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
174 ich_force_enable_hpet); 173 ich_force_enable_hpet);
175 174DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x3a16, /* ICH10 */
175 ich_force_enable_hpet);
176 176
177static struct pci_dev *cached_dev; 177static struct pci_dev *cached_dev;
178 178
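
DECLARE_PCI_FIXUP_HEADER() drops a struct pci_fixup entry into a dedicated section that pci_fixup_device() scans while the device's config header is parsed, so extending the quirk to another chipset is a single line. For illustration only (0x1234 is a made-up device ID, not a real quirk):

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1234,	/* hypothetical ID */
			 ich_force_enable_hpet);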
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 2b46eb41643b..2aef36d8aca2 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -14,6 +14,7 @@
14#include <asm/reboot.h> 14#include <asm/reboot.h>
15#include <asm/pci_x86.h> 15#include <asm/pci_x86.h>
16#include <asm/virtext.h> 16#include <asm/virtext.h>
17#include <asm/cpu.h>
17 18
18#ifdef CONFIG_X86_32 19#ifdef CONFIG_X86_32
19# include <linux/dmi.h> 20# include <linux/dmi.h>
@@ -23,8 +24,6 @@
23# include <asm/iommu.h> 24# include <asm/iommu.h>
24#endif 25#endif
25 26
26#include <mach_ipi.h>
27
28/* 27/*
29 * Power off function, if any 28 * Power off function, if any
30 */ 29 */
@@ -217,6 +216,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 217 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), 216 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
218 }, 217 },
219 }, 218 },
219 { /* Handle problems with rebooting on Dell XPS710 */
220 .callback = set_bios_reboot,
221 .ident = "Dell XPS710",
222 .matches = {
223 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
224 DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
225 },
226 },
220 { } 227 { }
221}; 228};
222 229
@@ -650,7 +657,7 @@ static int crash_nmi_callback(struct notifier_block *self,
650 657
651static void smp_send_nmi_allbutself(void) 658static void smp_send_nmi_allbutself(void)
652{ 659{
653 send_IPI_allbutself(NMI_VECTOR); 660 apic->send_IPI_allbutself(NMI_VECTOR);
654} 661}
655 662
656static struct notifier_block crash_nmi_nb = { 663static struct notifier_block crash_nmi_nb = {
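
The apic-> indirection above reflects the apic/ directory reshuffle visible in the diffstat: IPI helpers are now methods of the boot-time-selected apic driver instead of mach_ipi.h macros. Rough shape of the ops structure (fields abridged and approximate):

struct apic {
	const char *name;
	void (*send_IPI_mask)(const struct cpumask *mask, int vector);
	void (*send_IPI_allbutself)(int vector);
	void (*send_IPI_all)(int vector);
	void (*send_IPI_self)(int vector);
	/* ... probe, setup and APIC addressing ops omitted ... */
};
extern struct apic *apic;	/* set once by the probe code */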
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index a160f3119725..41235531b11c 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -7,7 +7,7 @@
7 */ 7 */
8 8
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <asm/page.h> 10#include <asm/page_types.h>
11#include <asm/kexec.h> 11#include <asm/kexec.h>
12#include <asm/processor-flags.h> 12#include <asm/processor-flags.h>
13 13
@@ -17,7 +17,8 @@
17 17
18#define PTR(x) (x << 2) 18#define PTR(x) (x << 2)
19 19
20/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE 20/*
21 * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
21 * ~ control_page + PAGE_SIZE are used as data storage and stack for 22 * ~ control_page + PAGE_SIZE are used as data storage and stack for
22 * jumping back 23 * jumping back
23 */ 24 */
@@ -76,8 +77,10 @@ relocate_kernel:
76 movl %eax, CP_PA_SWAP_PAGE(%edi) 77 movl %eax, CP_PA_SWAP_PAGE(%edi)
77 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) 78 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
78 79
79 /* get physical address of control page now */ 80 /*
80 /* this is impossible after page table switch */ 81 * get physical address of control page now
82 * this is impossible after page table switch
83 */
81 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi 84 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
82 85
83 /* switch to new set of page tables */ 86 /* switch to new set of page tables */
@@ -97,7 +100,8 @@ identity_mapped:
97 /* store the start address on the stack */ 100 /* store the start address on the stack */
98 pushl %edx 101 pushl %edx
99 102
100 /* Set cr0 to a known state: 103 /*
104 * Set cr0 to a known state:
101 * - Paging disabled 105 * - Paging disabled
102 * - Alignment check disabled 106 * - Alignment check disabled
103 * - Write protect disabled 107 * - Write protect disabled
@@ -113,7 +117,8 @@ identity_mapped:
113 /* clear cr4 if applicable */ 117 /* clear cr4 if applicable */
114 testl %ecx, %ecx 118 testl %ecx, %ecx
115 jz 1f 119 jz 1f
116 /* Set cr4 to a known state: 120 /*
121 * Set cr4 to a known state:
117 * Setting everything to zero seems safe. 122 * Setting everything to zero seems safe.
118 */ 123 */
119 xorl %eax, %eax 124 xorl %eax, %eax
@@ -132,15 +137,18 @@ identity_mapped:
132 call swap_pages 137 call swap_pages
133 addl $8, %esp 138 addl $8, %esp
134 139
135 /* To be certain of avoiding problems with self-modifying code 140 /*
141 * To be certain of avoiding problems with self-modifying code
136 * I need to execute a serializing instruction here. 142 * I need to execute a serializing instruction here.
137 * So I flush the TLB, it's handy, and not processor dependent. 143 * So I flush the TLB, it's handy, and not processor dependent.
138 */ 144 */
139 xorl %eax, %eax 145 xorl %eax, %eax
140 movl %eax, %cr3 146 movl %eax, %cr3
141 147
142 /* set all of the registers to known values */ 148 /*
143 /* leave %esp alone */ 149 * set all of the registers to known values
150 * leave %esp alone
151 */
144 152
 145 testl %esi, %esi 153 testl %esi, %esi
 146 jnz 1f 154 jnz 1f
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index f5afe665a82b..4de8f5b3d476 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -7,10 +7,10 @@
7 */ 7 */
8 8
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <asm/page.h> 10#include <asm/page_types.h>
11#include <asm/kexec.h> 11#include <asm/kexec.h>
12#include <asm/processor-flags.h> 12#include <asm/processor-flags.h>
13#include <asm/pgtable.h> 13#include <asm/pgtable_types.h>
14 14
15/* 15/*
16 * Must be relocatable PIC code callable as a C function 16 * Must be relocatable PIC code callable as a C function
@@ -19,145 +19,76 @@
19#define PTR(x) (x << 3) 19#define PTR(x) (x << 3)
20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) 20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
21 21
22/*
23 * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
24 * ~ control_page + PAGE_SIZE are used as data storage and stack for
25 * jumping back
26 */
27#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
28
29/* Minimal CPU state */
30#define RSP DATA(0x0)
31#define CR0 DATA(0x8)
32#define CR3 DATA(0x10)
33#define CR4 DATA(0x18)
34
35/* other data */
36#define CP_PA_TABLE_PAGE DATA(0x20)
37#define CP_PA_SWAP_PAGE DATA(0x28)
38#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
39
22 .text 40 .text
23 .align PAGE_SIZE 41 .align PAGE_SIZE
24 .code64 42 .code64
25 .globl relocate_kernel 43 .globl relocate_kernel
26relocate_kernel: 44relocate_kernel:
27 /* %rdi indirection_page 45 /*
46 * %rdi indirection_page
28 * %rsi page_list 47 * %rsi page_list
29 * %rdx start address 48 * %rdx start address
49 * %rcx preserve_context
30 */ 50 */
31 51
32 /* map the control page at its virtual address */ 52 /* Save the CPU context, used for jumping back */
33 53 pushq %rbx
34 movq $0x0000ff8000000000, %r10 /* mask */ 54 pushq %rbp
35 mov $(39 - 3), %cl /* bits to shift */ 55 pushq %r12
36 movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ 56 pushq %r13
37 57 pushq %r14
38 movq %r11, %r9 58 pushq %r15
39 andq %r10, %r9 59 pushf
40 shrq %cl, %r9 60
41 61 movq PTR(VA_CONTROL_PAGE)(%rsi), %r11
42 movq PTR(VA_PGD)(%rsi), %r8 62 movq %rsp, RSP(%r11)
43 addq %r8, %r9 63 movq %cr0, %rax
44 movq PTR(PA_PUD_0)(%rsi), %r8 64 movq %rax, CR0(%r11)
45 orq $PAGE_ATTR, %r8 65 movq %cr3, %rax
46 movq %r8, (%r9) 66 movq %rax, CR3(%r11)
47 67 movq %cr4, %rax
48 shrq $9, %r10 68 movq %rax, CR4(%r11)
49 sub $9, %cl
50
51 movq %r11, %r9
52 andq %r10, %r9
53 shrq %cl, %r9
54
55 movq PTR(VA_PUD_0)(%rsi), %r8
56 addq %r8, %r9
57 movq PTR(PA_PMD_0)(%rsi), %r8
58 orq $PAGE_ATTR, %r8
59 movq %r8, (%r9)
60
61 shrq $9, %r10
62 sub $9, %cl
63
64 movq %r11, %r9
65 andq %r10, %r9
66 shrq %cl, %r9
67
68 movq PTR(VA_PMD_0)(%rsi), %r8
69 addq %r8, %r9
70 movq PTR(PA_PTE_0)(%rsi), %r8
71 orq $PAGE_ATTR, %r8
72 movq %r8, (%r9)
73
74 shrq $9, %r10
75 sub $9, %cl
76
77 movq %r11, %r9
78 andq %r10, %r9
79 shrq %cl, %r9
80
81 movq PTR(VA_PTE_0)(%rsi), %r8
82 addq %r8, %r9
83 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
84 orq $PAGE_ATTR, %r8
85 movq %r8, (%r9)
86
87 /* identity map the control page at its physical address */
88
89 movq $0x0000ff8000000000, %r10 /* mask */
90 mov $(39 - 3), %cl /* bits to shift */
91 movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
92
93 movq %r11, %r9
94 andq %r10, %r9
95 shrq %cl, %r9
96
97 movq PTR(VA_PGD)(%rsi), %r8
98 addq %r8, %r9
99 movq PTR(PA_PUD_1)(%rsi), %r8
100 orq $PAGE_ATTR, %r8
101 movq %r8, (%r9)
102
103 shrq $9, %r10
104 sub $9, %cl
105
106 movq %r11, %r9
107 andq %r10, %r9
108 shrq %cl, %r9
109
110 movq PTR(VA_PUD_1)(%rsi), %r8
111 addq %r8, %r9
112 movq PTR(PA_PMD_1)(%rsi), %r8
113 orq $PAGE_ATTR, %r8
114 movq %r8, (%r9)
115
116 shrq $9, %r10
117 sub $9, %cl
118
119 movq %r11, %r9
120 andq %r10, %r9
121 shrq %cl, %r9
122
123 movq PTR(VA_PMD_1)(%rsi), %r8
124 addq %r8, %r9
125 movq PTR(PA_PTE_1)(%rsi), %r8
126 orq $PAGE_ATTR, %r8
127 movq %r8, (%r9)
128
129 shrq $9, %r10
130 sub $9, %cl
131
132 movq %r11, %r9
133 andq %r10, %r9
134 shrq %cl, %r9
135
136 movq PTR(VA_PTE_1)(%rsi), %r8
137 addq %r8, %r9
138 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
139 orq $PAGE_ATTR, %r8
140 movq %r8, (%r9)
141
142relocate_new_kernel:
143 /* %rdi indirection_page
144 * %rsi page_list
145 * %rdx start address
146 */
147 69
148 /* zero out flags, and disable interrupts */ 70 /* zero out flags, and disable interrupts */
149 pushq $0 71 pushq $0
150 popfq 72 popfq
151 73
152 /* get physical address of control page now */ 74 /*
153 /* this is impossible after page table switch */ 75 * get physical address of control page now
76 * this is impossible after page table switch
77 */
154 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 78 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
155 79
156 /* get physical address of page table now too */ 80 /* get physical address of page table now too */
157 movq PTR(PA_TABLE_PAGE)(%rsi), %rcx 81 movq PTR(PA_TABLE_PAGE)(%rsi), %r9
82
83 /* get physical address of swap page now */
84 movq PTR(PA_SWAP_PAGE)(%rsi), %r10
158 85
159 /* switch to new set of page tables */ 86 /* save some information for jumping back */
160 movq PTR(PA_PGD)(%rsi), %r9 87 movq %r9, CP_PA_TABLE_PAGE(%r11)
88 movq %r10, CP_PA_SWAP_PAGE(%r11)
89 movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
90
91 /* Switch to the identity mapped page tables */
161 movq %r9, %cr3 92 movq %r9, %cr3
162 93
163 /* setup a new stack at the end of the physical control page */ 94 /* setup a new stack at the end of the physical control page */
@@ -172,7 +103,8 @@ identity_mapped:
172 /* store the start address on the stack */ 103 /* store the start address on the stack */
173 pushq %rdx 104 pushq %rdx
174 105
175 /* Set cr0 to a known state: 106 /*
107 * Set cr0 to a known state:
176 * - Paging enabled 108 * - Paging enabled
177 * - Alignment check disabled 109 * - Alignment check disabled
178 * - Write protect disabled 110 * - Write protect disabled
@@ -185,7 +117,8 @@ identity_mapped:
185 orl $(X86_CR0_PG | X86_CR0_PE), %eax 117 orl $(X86_CR0_PG | X86_CR0_PE), %eax
186 movq %rax, %cr0 118 movq %rax, %cr0
187 119
188 /* Set cr4 to a known state: 120 /*
121 * Set cr4 to a known state:
189 * - physical address extension enabled 122 * - physical address extension enabled
190 */ 123 */
191 movq $X86_CR4_PAE, %rax 124 movq $X86_CR4_PAE, %rax
@@ -194,12 +127,88 @@ identity_mapped:
194 jmp 1f 127 jmp 1f
1951: 1281:
196 129
197 /* Switch to the identity mapped page tables, 130 /* Flush the TLB (needed?) */
198 * and flush the TLB. 131 movq %r9, %cr3
199 */ 132
200 movq %rcx, %cr3 133 movq %rcx, %r11
134 call swap_pages
135
136 /*
137 * To be certain of avoiding problems with self-modifying code
138 * I need to execute a serializing instruction here.
 139 * So I flush the TLB by reloading %cr3 here; it's handy,
140 * and not processor dependent.
141 */
142 movq %cr3, %rax
143 movq %rax, %cr3
144
145 /*
146 * set all of the registers to known values
147 * leave %rsp alone
148 */
149
150 testq %r11, %r11
151 jnz 1f
152 xorq %rax, %rax
153 xorq %rbx, %rbx
154 xorq %rcx, %rcx
155 xorq %rdx, %rdx
156 xorq %rsi, %rsi
157 xorq %rdi, %rdi
158 xorq %rbp, %rbp
159 xorq %r8, %r8
160 xorq %r9, %r9
 161 xorq %r10, %r10
162 xorq %r11, %r11
163 xorq %r12, %r12
164 xorq %r13, %r13
165 xorq %r14, %r14
166 xorq %r15, %r15
167
168 ret
169
1701:
171 popq %rdx
172 leaq PAGE_SIZE(%r10), %rsp
173 call *%rdx
174
175 /* get the re-entry point of the peer system */
176 movq 0(%rsp), %rbp
177 call 1f
1781:
179 popq %r8
180 subq $(1b - relocate_kernel), %r8
181 movq CP_PA_SWAP_PAGE(%r8), %r10
182 movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
183 movq CP_PA_TABLE_PAGE(%r8), %rax
184 movq %rax, %cr3
185 lea PAGE_SIZE(%r8), %rsp
186 call swap_pages
187 movq $virtual_mapped, %rax
188 pushq %rax
189 ret
190
191virtual_mapped:
192 movq RSP(%r8), %rsp
193 movq CR4(%r8), %rax
194 movq %rax, %cr4
195 movq CR3(%r8), %rax
196 movq CR0(%r8), %r8
197 movq %rax, %cr3
198 movq %r8, %cr0
199 movq %rbp, %rax
200
201 popf
202 popq %r15
203 popq %r14
204 popq %r13
205 popq %r12
206 popq %rbp
207 popq %rbx
208 ret
201 209
202 /* Do the copies */ 210 /* Do the copies */
211swap_pages:
203 movq %rdi, %rcx /* Put the page_list in %rcx */ 212 movq %rdi, %rcx /* Put the page_list in %rcx */
204 xorq %rdi, %rdi 213 xorq %rdi, %rdi
205 xorq %rsi, %rsi 214 xorq %rsi, %rsi
@@ -231,36 +240,27 @@ identity_mapped:
 231 movq %rcx, %rsi /* For every source page do a copy */ 240 movq %rcx, %rsi /* For every source page do a copy */
232 andq $0xfffffffffffff000, %rsi 241 andq $0xfffffffffffff000, %rsi
233 242
243 movq %rdi, %rdx
244 movq %rsi, %rax
245
246 movq %r10, %rdi
234 movq $512, %rcx 247 movq $512, %rcx
235 rep ; movsq 248 rep ; movsq
236 jmp 0b
2373:
238
239 /* To be certain of avoiding problems with self-modifying code
240 * I need to execute a serializing instruction here.
241 * So I flush the TLB by reloading %cr3 here, it's handy,
242 * and not processor dependent.
243 */
244 movq %cr3, %rax
245 movq %rax, %cr3
246 249
247 /* set all of the registers to known values */ 250 movq %rax, %rdi
248 /* leave %rsp alone */ 251 movq %rdx, %rsi
252 movq $512, %rcx
253 rep ; movsq
249 254
250 xorq %rax, %rax 255 movq %rdx, %rdi
251 xorq %rbx, %rbx 256 movq %r10, %rsi
252 xorq %rcx, %rcx 257 movq $512, %rcx
253 xorq %rdx, %rdx 258 rep ; movsq
254 xorq %rsi, %rsi
255 xorq %rdi, %rdi
256 xorq %rbp, %rbp
257 xorq %r8, %r8
258 xorq %r9, %r9
259 xorq %r10, %r9
260 xorq %r11, %r11
261 xorq %r12, %r12
262 xorq %r13, %r13
263 xorq %r14, %r14
264 xorq %r15, %r15
265 259
260 lea PAGE_SIZE(%rax), %rsi
261 jmp 0b
2623:
266 ret 263 ret
264
265 .globl kexec_control_code_size
266.set kexec_control_code_size, . - relocate_kernel
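
For reference, swap_pages above exchanges each destination page with its source through the scratch page held in %r10: three rep movsq runs of 512 quadwords, i.e. one 4 KiB page each. The same exchange in C, as a sketch with illustrative names:

	#include <string.h>

	#define PAGE_SIZE 4096

	static void exchange_page(void *dst, void *src, void *scratch)
	{
		memcpy(scratch, src, PAGE_SIZE);	/* src     -> scratch */
		memcpy(src, dst, PAGE_SIZE);		/* dst     -> src     */
		memcpy(dst, scratch, PAGE_SIZE);	/* scratch -> dst     */
	}

Swapping rather than plainly copying is what lets the kexec-jump return path, which calls swap_pages a second time, find the original kernel's pages intact.
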
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index dd6f2b71561b..5d465b207e72 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -1,14 +1,14 @@
1/* 1/*
2 * RTC related functions 2 * RTC related functions
3 */ 3 */
4#include <linux/platform_device.h>
5#include <linux/mc146818rtc.h>
4#include <linux/acpi.h> 6#include <linux/acpi.h>
5#include <linux/bcd.h> 7#include <linux/bcd.h>
6#include <linux/mc146818rtc.h>
7#include <linux/platform_device.h>
8#include <linux/pnp.h> 8#include <linux/pnp.h>
9 9
10#include <asm/time.h>
11#include <asm/vsyscall.h> 10#include <asm/vsyscall.h>
11#include <asm/time.h>
12 12
13#ifdef CONFIG_X86_32 13#ifdef CONFIG_X86_32
14/* 14/*
@@ -16,9 +16,9 @@
16 * register we are working with. It is required for NMI access to the 16 * register we are working with. It is required for NMI access to the
17 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. 17 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
18 */ 18 */
19volatile unsigned long cmos_lock = 0; 19volatile unsigned long cmos_lock;
20EXPORT_SYMBOL(cmos_lock); 20EXPORT_SYMBOL(cmos_lock);
21#endif 21#endif /* CONFIG_X86_32 */
22 22
23/* For two digit years assume time is always after that */ 23/* For two digit years assume time is always after that */
24#define CMOS_YEARS_OFFS 2000 24#define CMOS_YEARS_OFFS 2000
@@ -38,9 +38,9 @@ EXPORT_SYMBOL(rtc_lock);
38 */ 38 */
39int mach_set_rtc_mmss(unsigned long nowtime) 39int mach_set_rtc_mmss(unsigned long nowtime)
40{ 40{
41 int retval = 0;
42 int real_seconds, real_minutes, cmos_minutes; 41 int real_seconds, real_minutes, cmos_minutes;
43 unsigned char save_control, save_freq_select; 42 unsigned char save_control, save_freq_select;
43 int retval = 0;
44 44
45 /* tell the clock it's being set */ 45 /* tell the clock it's being set */
46 save_control = CMOS_READ(RTC_CONTROL); 46 save_control = CMOS_READ(RTC_CONTROL);
@@ -72,8 +72,8 @@ int mach_set_rtc_mmss(unsigned long nowtime)
72 real_seconds = bin2bcd(real_seconds); 72 real_seconds = bin2bcd(real_seconds);
73 real_minutes = bin2bcd(real_minutes); 73 real_minutes = bin2bcd(real_minutes);
74 } 74 }
75 CMOS_WRITE(real_seconds,RTC_SECONDS); 75 CMOS_WRITE(real_seconds, RTC_SECONDS);
76 CMOS_WRITE(real_minutes,RTC_MINUTES); 76 CMOS_WRITE(real_minutes, RTC_MINUTES);
77 } else { 77 } else {
78 printk(KERN_WARNING 78 printk(KERN_WARNING
79 "set_rtc_mmss: can't update from %d to %d\n", 79 "set_rtc_mmss: can't update from %d to %d\n",
@@ -151,6 +151,7 @@ unsigned char rtc_cmos_read(unsigned char addr)
151 outb(addr, RTC_PORT(0)); 151 outb(addr, RTC_PORT(0));
152 val = inb(RTC_PORT(1)); 152 val = inb(RTC_PORT(1));
153 lock_cmos_suffix(addr); 153 lock_cmos_suffix(addr);
154
154 return val; 155 return val;
155} 156}
156EXPORT_SYMBOL(rtc_cmos_read); 157EXPORT_SYMBOL(rtc_cmos_read);
@@ -166,8 +167,8 @@ EXPORT_SYMBOL(rtc_cmos_write);
166 167
167static int set_rtc_mmss(unsigned long nowtime) 168static int set_rtc_mmss(unsigned long nowtime)
168{ 169{
169 int retval;
170 unsigned long flags; 170 unsigned long flags;
171 int retval;
171 172
172 spin_lock_irqsave(&rtc_lock, flags); 173 spin_lock_irqsave(&rtc_lock, flags);
173 retval = set_wallclock(nowtime); 174 retval = set_wallclock(nowtime);
@@ -242,6 +243,7 @@ static __init int add_rtc_cmos(void)
242 platform_device_register(&rtc_device); 243 platform_device_register(&rtc_device);
243 dev_info(&rtc_device.dev, 244 dev_info(&rtc_device.dev,
244 "registered platform RTC device (no PNP device found)\n"); 245 "registered platform RTC device (no PNP device found)\n");
246
245 return 0; 247 return 0;
246} 248}
247device_initcall(add_rtc_cmos); 249device_initcall(add_rtc_cmos);
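
mach_set_rtc_mmss() above converts the minutes and seconds to BCD before writing unless the RTC runs in binary mode. The bin2bcd()/bcd2bin() helpers it uses amount to the following (a sketch of their semantics, not the kernel's exact definitions):

	static unsigned char bin2bcd(unsigned char val)
	{
		return ((val / 10) << 4) + val % 10;	/* e.g. 59 -> 0x59 */
	}

	static unsigned char bcd2bin(unsigned char val)
	{
		return (val >> 4) * 10 + (val & 0x0f);	/* e.g. 0x59 -> 59 */
	}
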
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c461f6d69074..b4158439bf63 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -74,14 +74,15 @@
74#include <asm/e820.h> 74#include <asm/e820.h>
75#include <asm/mpspec.h> 75#include <asm/mpspec.h>
76#include <asm/setup.h> 76#include <asm/setup.h>
77#include <asm/arch_hooks.h>
78#include <asm/efi.h> 77#include <asm/efi.h>
78#include <asm/timer.h>
79#include <asm/i8259.h>
79#include <asm/sections.h> 80#include <asm/sections.h>
80#include <asm/dmi.h> 81#include <asm/dmi.h>
81#include <asm/io_apic.h> 82#include <asm/io_apic.h>
82#include <asm/ist.h> 83#include <asm/ist.h>
83#include <asm/vmi.h> 84#include <asm/vmi.h>
84#include <setup_arch.h> 85#include <asm/setup_arch.h>
85#include <asm/bios_ebda.h> 86#include <asm/bios_ebda.h>
86#include <asm/cacheflush.h> 87#include <asm/cacheflush.h>
87#include <asm/processor.h> 88#include <asm/processor.h>
@@ -89,7 +90,7 @@
89 90
90#include <asm/system.h> 91#include <asm/system.h>
91#include <asm/vsyscall.h> 92#include <asm/vsyscall.h>
92#include <asm/smp.h> 93#include <asm/cpu.h>
93#include <asm/desc.h> 94#include <asm/desc.h>
94#include <asm/dma.h> 95#include <asm/dma.h>
95#include <asm/iommu.h> 96#include <asm/iommu.h>
@@ -97,7 +98,6 @@
97#include <asm/mmu_context.h> 98#include <asm/mmu_context.h>
98#include <asm/proto.h> 99#include <asm/proto.h>
99 100
100#include <mach_apic.h>
101#include <asm/paravirt.h> 101#include <asm/paravirt.h>
102#include <asm/hypervisor.h> 102#include <asm/hypervisor.h>
103 103
@@ -112,6 +112,25 @@
112#define ARCH_SETUP 112#define ARCH_SETUP
113#endif 113#endif
114 114
115RESERVE_BRK(dmi_alloc, 65536);
116
117unsigned int boot_cpu_id __read_mostly;
118
119static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
120unsigned long _brk_end = (unsigned long)__brk_base;
121
122#ifdef CONFIG_X86_64
123int default_cpu_present_to_apicid(int mps_cpu)
124{
125 return __default_cpu_present_to_apicid(mps_cpu);
126}
127
128int default_check_phys_apicid_present(int boot_cpu_physical_apicid)
129{
130 return __default_check_phys_apicid_present(boot_cpu_physical_apicid);
131}
132#endif
133
115#ifndef CONFIG_DEBUG_BOOT_PARAMS 134#ifndef CONFIG_DEBUG_BOOT_PARAMS
116struct boot_params __initdata boot_params; 135struct boot_params __initdata boot_params;
117#else 136#else
@@ -144,12 +163,6 @@ static struct resource bss_resource = {
144 163
145 164
146#ifdef CONFIG_X86_32 165#ifdef CONFIG_X86_32
147/* This value is set up by the early boot code to point to the value
148 immediately after the boot time page tables. It contains a *physical*
149 address, and must not be in the .bss segment! */
150unsigned long init_pg_tables_start __initdata = ~0UL;
151unsigned long init_pg_tables_end __initdata = ~0UL;
152
153static struct resource video_ram_resource = { 166static struct resource video_ram_resource = {
154 .name = "Video RAM area", 167 .name = "Video RAM area",
155 .start = 0xa0000, 168 .start = 0xa0000,
@@ -188,7 +201,9 @@ struct ist_info ist_info;
188#endif 201#endif
189 202
190#else 203#else
191struct cpuinfo_x86 boot_cpu_data __read_mostly; 204struct cpuinfo_x86 boot_cpu_data __read_mostly = {
205 .x86_phys_bits = MAX_PHYSMEM_BITS,
206};
192EXPORT_SYMBOL(boot_cpu_data); 207EXPORT_SYMBOL(boot_cpu_data);
193#endif 208#endif
194 209
@@ -203,12 +218,6 @@ unsigned long mmu_cr4_features = X86_CR4_PAE;
203int bootloader_type; 218int bootloader_type;
204 219
205/* 220/*
206 * Early DMI memory
207 */
208int dmi_alloc_index;
209char dmi_alloc_data[DMI_MAX_DATA];
210
211/*
212 * Setup options 221 * Setup options
213 */ 222 */
214struct screen_info screen_info; 223struct screen_info screen_info;
@@ -253,6 +262,35 @@ static inline void copy_edd(void)
253} 262}
254#endif 263#endif
255 264
265void * __init extend_brk(size_t size, size_t align)
266{
267 size_t mask = align - 1;
268 void *ret;
269
270 BUG_ON(_brk_start == 0);
271 BUG_ON(align & mask);
272
273 _brk_end = (_brk_end + mask) & ~mask;
274 BUG_ON((char *)(_brk_end + size) > __brk_limit);
275
276 ret = (void *)_brk_end;
277 _brk_end += size;
278
279 memset(ret, 0, size);
280
281 return ret;
282}
283
284static void __init reserve_brk(void)
285{
286 if (_brk_end > _brk_start)
287 reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
288
289 /* Mark brk area as locked down and no longer taking any
290 new allocations */
291 _brk_start = 0;
292}
293
256#ifdef CONFIG_BLK_DEV_INITRD 294#ifdef CONFIG_BLK_DEV_INITRD
257 295
258#ifdef CONFIG_X86_32 296#ifdef CONFIG_X86_32
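
extend_brk() above rounds the break up with the usual power-of-two alignment trick before handing out memory. A standalone check of that arithmetic (align_up() is an illustrative name, not a kernel symbol):

	#include <assert.h>
	#include <stddef.h>

	static size_t align_up(size_t addr, size_t align)
	{
		size_t mask = align - 1;	/* align must be a power of two */
		return (addr + mask) & ~mask;
	}

	int main(void)
	{
		assert(align_up(0x1001, 0x1000) == 0x2000);
		assert(align_up(0x2000, 0x1000) == 0x2000);	/* already aligned */
		return 0;
	}
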
@@ -586,20 +624,7 @@ static int __init setup_elfcorehdr(char *arg)
586early_param("elfcorehdr", setup_elfcorehdr); 624early_param("elfcorehdr", setup_elfcorehdr);
587#endif 625#endif
588 626
589static int __init default_update_genapic(void) 627static struct x86_quirks default_x86_quirks __initdata;
590{
591#ifdef CONFIG_X86_SMP
592# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64)
593 genapic->wakeup_cpu = wakeup_secondary_cpu_via_init;
594# endif
595#endif
596
597 return 0;
598}
599
600static struct x86_quirks default_x86_quirks __initdata = {
601 .update_genapic = default_update_genapic,
602};
603 628
604struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; 629struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
605 630
@@ -656,7 +681,6 @@ void __init setup_arch(char **cmdline_p)
656#ifdef CONFIG_X86_32 681#ifdef CONFIG_X86_32
657 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 682 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
658 visws_early_detect(); 683 visws_early_detect();
659 pre_setup_arch_hook();
660#else 684#else
661 printk(KERN_INFO "Command line: %s\n", boot_command_line); 685 printk(KERN_INFO "Command line: %s\n", boot_command_line);
662#endif 686#endif
@@ -715,11 +739,7 @@ void __init setup_arch(char **cmdline_p)
715 init_mm.start_code = (unsigned long) _text; 739 init_mm.start_code = (unsigned long) _text;
716 init_mm.end_code = (unsigned long) _etext; 740 init_mm.end_code = (unsigned long) _etext;
717 init_mm.end_data = (unsigned long) _edata; 741 init_mm.end_data = (unsigned long) _edata;
718#ifdef CONFIG_X86_32 742 init_mm.brk = _brk_end;
719 init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
720#else
721 init_mm.brk = (unsigned long) &_end;
722#endif
723 743
724 code_resource.start = virt_to_phys(_text); 744 code_resource.start = virt_to_phys(_text);
725 code_resource.end = virt_to_phys(_etext)-1; 745 code_resource.end = virt_to_phys(_etext)-1;
@@ -770,6 +790,9 @@ void __init setup_arch(char **cmdline_p)
770 790
771 finish_e820_parsing(); 791 finish_e820_parsing();
772 792
793 if (efi_enabled)
794 efi_init();
795
773 dmi_scan_machine(); 796 dmi_scan_machine();
774 797
775 dmi_check_system(bad_bios_dmi_table); 798 dmi_check_system(bad_bios_dmi_table);
@@ -789,8 +812,6 @@ void __init setup_arch(char **cmdline_p)
789 insert_resource(&iomem_resource, &data_resource); 812 insert_resource(&iomem_resource, &data_resource);
790 insert_resource(&iomem_resource, &bss_resource); 813 insert_resource(&iomem_resource, &bss_resource);
791 814
792 if (efi_enabled)
793 efi_init();
794 815
795#ifdef CONFIG_X86_32 816#ifdef CONFIG_X86_32
796 if (ppro_with_ram_bug()) { 817 if (ppro_with_ram_bug()) {
@@ -823,8 +844,7 @@ void __init setup_arch(char **cmdline_p)
823#else 844#else
824 num_physpages = max_pfn; 845 num_physpages = max_pfn;
825 846
826 if (cpu_has_x2apic) 847 check_x2apic();
827 check_x2apic();
828 848
829 /* How many end-of-memory variables you have, grandma! */ 849 /* How many end-of-memory variables you have, grandma! */
830 /* need this before calling reserve_initrd */ 850 /* need this before calling reserve_initrd */
@@ -840,6 +860,8 @@ void __init setup_arch(char **cmdline_p)
840 setup_bios_corruption_check(); 860 setup_bios_corruption_check();
841#endif 861#endif
842 862
863 reserve_brk();
864
843 /* max_pfn_mapped is updated here */ 865 /* max_pfn_mapped is updated here */
844 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); 866 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
845 max_pfn_mapped = max_low_pfn_mapped; 867 max_pfn_mapped = max_low_pfn_mapped;
@@ -864,9 +886,7 @@ void __init setup_arch(char **cmdline_p)
864 886
865 reserve_initrd(); 887 reserve_initrd();
866 888
867#ifdef CONFIG_X86_64
868 vsmp_init(); 889 vsmp_init();
869#endif
870 890
871 io_delay_init(); 891 io_delay_init();
872 892
@@ -892,12 +912,11 @@ void __init setup_arch(char **cmdline_p)
892 */ 912 */
893 acpi_reserve_bootmem(); 913 acpi_reserve_bootmem();
894#endif 914#endif
895#ifdef CONFIG_X86_FIND_SMP_CONFIG
896 /* 915 /*
897 * Find and reserve possible boot-time SMP configuration: 916 * Find and reserve possible boot-time SMP configuration:
898 */ 917 */
899 find_smp_config(); 918 find_smp_config();
900#endif 919
901 reserve_crashkernel(); 920 reserve_crashkernel();
902 921
903#ifdef CONFIG_X86_64 922#ifdef CONFIG_X86_64
@@ -924,9 +943,7 @@ void __init setup_arch(char **cmdline_p)
924 map_vsyscall(); 943 map_vsyscall();
925#endif 944#endif
926 945
927#ifdef CONFIG_X86_GENERICARCH
928 generic_apic_probe(); 946 generic_apic_probe();
929#endif
930 947
931 early_quirks(); 948 early_quirks();
932 949
@@ -977,4 +994,94 @@ void __init setup_arch(char **cmdline_p)
977#endif 994#endif
978} 995}
979 996
997#ifdef CONFIG_X86_32
998
999/**
1000 * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
1001 *
1002 * Description:
1003 * Perform any necessary interrupt initialisation prior to setting up
1004 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
1005 * interrupts should be initialised here if the machine emulates a PC
1006 * in any way.
1007 **/
1008void __init x86_quirk_pre_intr_init(void)
1009{
1010 if (x86_quirks->arch_pre_intr_init) {
1011 if (x86_quirks->arch_pre_intr_init())
1012 return;
1013 }
1014 init_ISA_irqs();
1015}
1016
1017/**
1018 * x86_quirk_intr_init - post gate setup interrupt initialisation
1019 *
1020 * Description:
1021 * Fill in any interrupts that may have been left out by the general
1022 * init_IRQ() routine. Interrupts having to do with the machine rather
1023 * than the devices on the I/O bus (like APIC interrupts in Intel MP
1024 * systems) are started here.
1025 **/
1026void __init x86_quirk_intr_init(void)
1027{
1028 if (x86_quirks->arch_intr_init) {
1029 if (x86_quirks->arch_intr_init())
1030 return;
1031 }
1032}
980 1033
1034/**
1035 * x86_quirk_trap_init - initialise system specific traps
1036 *
1037 * Description:
1038 * Called as the final act of trap_init(). Used in VISWS to initialise
1039 * the various board specific APIC traps.
1040 **/
1041void __init x86_quirk_trap_init(void)
1042{
1043 if (x86_quirks->arch_trap_init) {
1044 if (x86_quirks->arch_trap_init())
1045 return;
1046 }
1047}
1048
1049static struct irqaction irq0 = {
1050 .handler = timer_interrupt,
1051 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
1052 .name = "timer"
1053};
1054
1055/**
1056 * x86_quirk_pre_time_init - do any specific initialisations before the system timer init.
1057 *
1058 **/
1059void __init x86_quirk_pre_time_init(void)
1060{
1061 if (x86_quirks->arch_pre_time_init)
1062 x86_quirks->arch_pre_time_init();
1063}
1064
1065/**
1066 * x86_quirk_time_init - do any specific initialisations for the system timer.
1067 *
1068 * Description:
1069 * Must plug the system timer interrupt source at HZ into the IRQ listed
1070 * in irq_vectors.h:TIMER_IRQ
1071 **/
1072void __init x86_quirk_time_init(void)
1073{
1074 if (x86_quirks->arch_time_init) {
1075 /*
1076 * A nonzero return code does not mean failure, it means
1077 * that the architecture quirk does not want any
1078 * generic (timer) setup to be performed after this:
1079 */
1080 if (x86_quirks->arch_time_init())
1081 return;
1082 }
1083
1084 irq0.mask = cpumask_of_cpu(0);
1085 setup_irq(0, &irq0);
1086}
1087#endif /* CONFIG_X86_32 */
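
The x86_quirk_*() wrappers above all share one dispatch shape: run the hook if a sub-architecture installed one, and skip the generic path when the hook's nonzero return says it handled everything. Reduced to a sketch with hypothetical names:

	static int (*arch_hook)(void);	/* hypothetical hook slot */

	static void run_with_quirk(void (*generic_init)(void))
	{
		if (arch_hook && arch_hook())
			return;		/* quirk replaced the generic setup */
		generic_init();
	}
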
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 01161077a49c..3a97a4cf1872 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -7,402 +7,425 @@
7#include <linux/crash_dump.h> 7#include <linux/crash_dump.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9#include <linux/topology.h> 9#include <linux/topology.h>
10#include <linux/pfn.h>
10#include <asm/sections.h> 11#include <asm/sections.h>
11#include <asm/processor.h> 12#include <asm/processor.h>
12#include <asm/setup.h> 13#include <asm/setup.h>
13#include <asm/mpspec.h> 14#include <asm/mpspec.h>
14#include <asm/apicdef.h> 15#include <asm/apicdef.h>
15#include <asm/highmem.h> 16#include <asm/highmem.h>
17#include <asm/proto.h>
18#include <asm/cpumask.h>
19#include <asm/cpu.h>
20#include <asm/stackprotector.h>
16 21
17#ifdef CONFIG_X86_LOCAL_APIC 22#ifdef CONFIG_DEBUG_PER_CPU_MAPS
18unsigned int num_processors; 23# define DBG(x...) printk(KERN_DEBUG x)
19unsigned disabled_cpus __cpuinitdata; 24#else
20/* Processor that is doing the boot up */ 25# define DBG(x...)
21unsigned int boot_cpu_physical_apicid = -1U;
22EXPORT_SYMBOL(boot_cpu_physical_apicid);
23unsigned int max_physical_apicid;
24
25/* Bitmask of physically existing CPUs */
26physid_mask_t phys_cpu_present_map;
27#endif 26#endif
28 27
29/* map cpu index to physical APIC ID */ 28DEFINE_PER_CPU(int, cpu_number);
30DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); 29EXPORT_PER_CPU_SYMBOL(cpu_number);
31DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
32EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
33EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
34
35#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
36#define X86_64_NUMA 1
37
38/* map cpu index to node index */
39DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
40EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
41
42/* which logical CPUs are on which nodes */
43cpumask_t *node_to_cpumask_map;
44EXPORT_SYMBOL(node_to_cpumask_map);
45
46/* setup node_to_cpumask_map */
47static void __init setup_node_to_cpumask_map(void);
48 30
31#ifdef CONFIG_X86_64
32#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
49#else 33#else
50static inline void setup_node_to_cpumask_map(void) { } 34#define BOOT_PERCPU_OFFSET 0
51#endif 35#endif
52 36
53#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) 37DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
54/* 38EXPORT_PER_CPU_SYMBOL(this_cpu_off);
55 * Copy data used in early init routines from the initial arrays to the
56 * per cpu data areas. These arrays then become expendable and the
57 * *_early_ptr's are zeroed indicating that the static arrays are gone.
58 */
59static void __init setup_per_cpu_maps(void)
60{
61 int cpu;
62
63 for_each_possible_cpu(cpu) {
64 per_cpu(x86_cpu_to_apicid, cpu) =
65 early_per_cpu_map(x86_cpu_to_apicid, cpu);
66 per_cpu(x86_bios_cpu_apicid, cpu) =
67 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
68#ifdef X86_64_NUMA
69 per_cpu(x86_cpu_to_node_map, cpu) =
70 early_per_cpu_map(x86_cpu_to_node_map, cpu);
71#endif
72 }
73 39
74 /* indicate the early static arrays will soon be gone */ 40unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
75 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; 41 [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
76 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; 42};
77#ifdef X86_64_NUMA 43EXPORT_SYMBOL(__per_cpu_offset);
78 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
79#endif
80}
81 44
82#ifdef CONFIG_X86_32
83/* 45/*
84 * Great future not-so-futuristic plan: make i386 and x86_64 do it 46 * On x86_64 symbols referenced from code should be reachable using
85 * the same way 47 * 32bit relocations. Reserve space for static percpu variables in
48 * modules so that they are always served from the first chunk which
49 * is located at the percpu segment base. On x86_32, anything can
50 * address anywhere. No need to reserve space in the first chunk.
86 */ 51 */
87unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 52#ifdef CONFIG_X86_64
88EXPORT_SYMBOL(__per_cpu_offset); 53#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE
89static inline void setup_cpu_pda_map(void) { } 54#else
90 55#define PERCPU_FIRST_CHUNK_RESERVE 0
91#elif !defined(CONFIG_SMP) 56#endif
92static inline void setup_cpu_pda_map(void) { }
93
94#else /* CONFIG_SMP && CONFIG_X86_64 */
95 57
96/* 58/**
 97 * Allocate cpu_pda pointer table and array via alloc_bootmem. 59 * pcpu_need_numa - determine whether percpu allocation needs to consider NUMA
60 *
61 * If NUMA is not configured or there is only one NUMA node available,
62 * there is no reason to consider NUMA. This function determines
63 * whether percpu allocation should consider NUMA or not.
64 *
65 * RETURNS:
66 * true if NUMA should be considered; otherwise, false.
98 */ 67 */
99static void __init setup_cpu_pda_map(void) 68static bool __init pcpu_need_numa(void)
100{ 69{
101 char *pda; 70#ifdef CONFIG_NEED_MULTIPLE_NODES
102 struct x8664_pda **new_cpu_pda; 71 pg_data_t *last = NULL;
103 unsigned long size; 72 unsigned int cpu;
104 int cpu;
105
106 size = roundup(sizeof(struct x8664_pda), cache_line_size());
107 73
108 /* allocate cpu_pda array and pointer table */
109 {
110 unsigned long tsize = nr_cpu_ids * sizeof(void *);
111 unsigned long asize = size * (nr_cpu_ids - 1);
112
113 tsize = roundup(tsize, cache_line_size());
114 new_cpu_pda = alloc_bootmem(tsize + asize);
115 pda = (char *)new_cpu_pda + tsize;
116 }
117
118 /* initialize pointer table to static pda's */
119 for_each_possible_cpu(cpu) { 74 for_each_possible_cpu(cpu) {
120 if (cpu == 0) { 75 int node = early_cpu_to_node(cpu);
121 /* leave boot cpu pda in place */
122 new_cpu_pda[0] = cpu_pda(0);
123 continue;
124 }
125 new_cpu_pda[cpu] = (struct x8664_pda *)pda;
126 new_cpu_pda[cpu]->in_bootmem = 1;
127 pda += size;
128 }
129
130 /* point to new pointer table */
131 _cpu_pda = new_cpu_pda;
132}
133
134#endif /* CONFIG_SMP && CONFIG_X86_64 */
135
136#ifdef CONFIG_X86_64
137
138/* correctly size the local cpu masks */
139static void __init setup_cpu_local_masks(void)
140{
141 alloc_bootmem_cpumask_var(&cpu_initialized_mask);
142 alloc_bootmem_cpumask_var(&cpu_callin_mask);
143 alloc_bootmem_cpumask_var(&cpu_callout_mask);
144 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
145}
146 76
147#else /* CONFIG_X86_32 */ 77 if (node_online(node) && NODE_DATA(node) &&
78 last && last != NODE_DATA(node))
79 return true;
148 80
149static inline void setup_cpu_local_masks(void) 81 last = NODE_DATA(node);
150{ 82 }
83#endif
84 return false;
151} 85}
152 86
153#endif /* CONFIG_X86_32 */ 87/**
154 88 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
155/* 89 * @cpu: cpu to allocate for
156 * Great future plan: 90 * @size: size allocation in bytes
157 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 91 * @align: alignment
158 * Always point %gs to its beginning 92 *
93 * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
94 * does the right thing for NUMA regardless of the current
95 * configuration.
96 *
97 * RETURNS:
98 * Pointer to the allocated area on success, NULL on failure.
159 */ 99 */
160void __init setup_per_cpu_areas(void) 100static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
101 unsigned long align)
161{ 102{
162 ssize_t size, old_size; 103 const unsigned long goal = __pa(MAX_DMA_ADDRESS);
163 char *ptr; 104#ifdef CONFIG_NEED_MULTIPLE_NODES
164 int cpu; 105 int node = early_cpu_to_node(cpu);
165 unsigned long align = 1; 106 void *ptr;
166 107
167 /* Setup cpu_pda map */ 108 if (!node_online(node) || !NODE_DATA(node)) {
168 setup_cpu_pda_map(); 109 ptr = __alloc_bootmem_nopanic(size, align, goal);
169 110 pr_info("cpu %d has no node %d or node-local memory\n",
170 /* Copy section for each CPU (we discard the original) */ 111 cpu, node);
171 old_size = PERCPU_ENOUGH_ROOM; 112 pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
172 align = max_t(unsigned long, PAGE_SIZE, align); 113 cpu, size, __pa(ptr));
173 size = roundup(old_size, align); 114 } else {
174 115 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
175 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 116 size, align, goal);
176 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 117 pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
177 118 "%016lx\n", cpu, size, node, __pa(ptr));
178 pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); 119 }
179 120 return ptr;
180 for_each_possible_cpu(cpu) {
181#ifndef CONFIG_NEED_MULTIPLE_NODES
182 ptr = __alloc_bootmem(size, align,
183 __pa(MAX_DMA_ADDRESS));
184#else 121#else
185 int node = early_cpu_to_node(cpu); 122 return __alloc_bootmem_nopanic(size, align, goal);
186 if (!node_online(node) || !NODE_DATA(node)) {
187 ptr = __alloc_bootmem(size, align,
188 __pa(MAX_DMA_ADDRESS));
189 pr_info("cpu %d has no node %d or node-local memory\n",
190 cpu, node);
191 pr_debug("per cpu data for cpu%d at %016lx\n",
192 cpu, __pa(ptr));
193 } else {
194 ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
195 __pa(MAX_DMA_ADDRESS));
196 pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
197 cpu, node, __pa(ptr));
198 }
199#endif 123#endif
200 per_cpu_offset(cpu) = ptr - __per_cpu_start;
201 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
202 }
203
204 /* Setup percpu data maps */
205 setup_per_cpu_maps();
206
207 /* Setup node to cpumask map */
208 setup_node_to_cpumask_map();
209
210 /* Setup cpu initialized, callin, callout masks */
211 setup_cpu_local_masks();
212} 124}
213 125
214#endif
215
216#ifdef X86_64_NUMA
217
218/* 126/*
219 * Allocate node_to_cpumask_map based on number of available nodes 127 * Remap allocator
220 * Requires node_possible_map to be valid. 128 *
 129 * This allocator uses a PMD page as its unit. A PMD page is allocated for
 130 * each cpu and each is remapped into the vmalloc area using a PMD mapping.
 131 * As a PMD page is quite large, only part of it is used for the first
 132 * chunk. The unused part is returned to the bootmem allocator.
221 * 133 *
 222 * Note: node_to_cpumask() is not valid until after this is done. 134 * So, the PMD pages are mapped twice - once in the physical mapping
 135 * and once in the vmalloc area for the first percpu chunk. The double
 136 * mapping adds one more PMD TLB entry's worth of pressure, but that is
 223 */ 137 * still much better than using only 4k mappings while staying NUMA friendly.
223 */ 138 */
224static void __init setup_node_to_cpumask_map(void) 139#ifdef CONFIG_NEED_MULTIPLE_NODES
225{ 140static size_t pcpur_size __initdata;
226 unsigned int node, num = 0; 141static void **pcpur_ptrs __initdata;
227 cpumask_t *map;
228
229 /* setup nr_node_ids if not done yet */
230 if (nr_node_ids == MAX_NUMNODES) {
231 for_each_node_mask(node, node_possible_map)
232 num = node;
233 nr_node_ids = num + 1;
234 }
235 142
236 /* allocate the map */ 143static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
237 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); 144{
145 size_t off = (size_t)pageno << PAGE_SHIFT;
238 146
239 pr_debug("Node to cpumask map at %p for %d nodes\n", 147 if (off >= pcpur_size)
240 map, nr_node_ids); 148 return NULL;
241 149
242 /* node_to_cpumask() will now work */ 150 return virt_to_page(pcpur_ptrs[cpu] + off);
243 node_to_cpumask_map = map;
244} 151}
245 152
246void __cpuinit numa_set_node(int cpu, int node) 153static ssize_t __init setup_pcpu_remap(size_t static_size)
247{ 154{
248 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); 155 static struct vm_struct vm;
249 156 size_t ptrs_size, dyn_size;
250 if (cpu_pda(cpu) && node != NUMA_NO_NODE) 157 unsigned int cpu;
251 cpu_pda(cpu)->nodenumber = node; 158 ssize_t ret;
159
160 /*
161 * If large page isn't supported, there's no benefit in doing
162 * this. Also, on non-NUMA, embedding is better.
163 */
164 if (!cpu_has_pse || !pcpu_need_numa())
165 return -EINVAL;
166
167 /*
168 * Currently supports only single page. Supporting multiple
169 * pages won't be too difficult if it ever becomes necessary.
170 */
171 pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
172 PERCPU_DYNAMIC_RESERVE);
173 if (pcpur_size > PMD_SIZE) {
174 pr_warning("PERCPU: static data is larger than large page, "
175 "can't use large page\n");
176 return -EINVAL;
177 }
178 dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
252 179
253 if (cpu_to_node_map) 180 /* allocate pointer array and alloc large pages */
254 cpu_to_node_map[cpu] = node; 181 ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
182 pcpur_ptrs = alloc_bootmem(ptrs_size);
255 183
256 else if (per_cpu_offset(cpu)) 184 for_each_possible_cpu(cpu) {
257 per_cpu(x86_cpu_to_node_map, cpu) = node; 185 pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
186 if (!pcpur_ptrs[cpu])
187 goto enomem;
188
189 /*
190 * Only use pcpur_size bytes and give back the rest.
191 *
192 * Ingo: The 2MB up-rounding bootmem is needed to make
193 * sure the partial 2MB page is still fully RAM - it's
194 * not well-specified to have a PAT-incompatible area
195 * (unmapped RAM, device memory, etc.) in that hole.
196 */
197 free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
198 PMD_SIZE - pcpur_size);
199
200 memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
201 }
258 202
259 else 203 /* allocate address and map */
260 pr_debug("Setting node for non-present cpu %d\n", cpu); 204 vm.flags = VM_ALLOC;
261} 205 vm.size = num_possible_cpus() * PMD_SIZE;
206 vm_area_register_early(&vm, PMD_SIZE);
262 207
263void __cpuinit numa_clear_node(int cpu) 208 for_each_possible_cpu(cpu) {
264{ 209 pmd_t *pmd;
265 numa_set_node(cpu, NUMA_NO_NODE);
266}
267 210
268#ifndef CONFIG_DEBUG_PER_CPU_MAPS 211 pmd = populate_extra_pmd((unsigned long)vm.addr
212 + cpu * PMD_SIZE);
213 set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
214 PAGE_KERNEL_LARGE));
215 }
269 216
270void __cpuinit numa_add_cpu(int cpu) 217 /* we're ready, commit */
271{ 218 pr_info("PERCPU: Remapped at %p with large pages, static data "
272 cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 219 "%zu bytes\n", vm.addr, static_size);
220
221 ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
222 PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
223 PMD_SIZE, vm.addr, NULL);
224 goto out_free_ar;
225
226enomem:
227 for_each_possible_cpu(cpu)
228 if (pcpur_ptrs[cpu])
229 free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
230 ret = -ENOMEM;
231out_free_ar:
232 free_bootmem(__pa(pcpur_ptrs), ptrs_size);
233 return ret;
273} 234}
274 235#else
275void __cpuinit numa_remove_cpu(int cpu) 236static ssize_t __init setup_pcpu_remap(size_t static_size)
276{ 237{
277 cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); 238 return -EINVAL;
278} 239}
279 240#endif
280#else /* CONFIG_DEBUG_PER_CPU_MAPS */
281 241
282/* 242/*
283 * --------- debug versions of the numa functions --------- 243 * Embedding allocator
244 *
245 * The first chunk is sized to just contain the static area plus
246 * module and dynamic reserves and embedded into linear physical
247 * mapping so that it can use PMD mapping without additional TLB
248 * pressure.
284 */ 249 */
285static void __cpuinit numa_set_cpumask(int cpu, int enable) 250static ssize_t __init setup_pcpu_embed(size_t static_size)
286{ 251{
287 int node = cpu_to_node(cpu); 252 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
288 cpumask_t *mask; 253
289 char buf[64]; 254 /*
290 255 * If large page isn't supported, there's no benefit in doing
291 if (node_to_cpumask_map == NULL) { 256 * this. Also, embedding allocation doesn't play well with
292 printk(KERN_ERR "node_to_cpumask_map NULL\n"); 257 * NUMA.
293 dump_stack(); 258 */
294 return; 259 if (!cpu_has_pse || pcpu_need_numa())
295 } 260 return -EINVAL;
296 261
297 mask = &node_to_cpumask_map[node]; 262 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
298 if (enable) 263 reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
299 cpu_set(cpu, *mask);
300 else
301 cpu_clear(cpu, *mask);
302
303 cpulist_scnprintf(buf, sizeof(buf), mask);
304 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
305 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
306} 264}
307 265
308void __cpuinit numa_add_cpu(int cpu) 266/*
309{ 267 * 4k page allocator
310 numa_set_cpumask(cpu, 1); 268 *
311} 269 * This is the basic allocator. Static percpu area is allocated
270 * page-by-page and most of initialization is done by the generic
271 * setup function.
272 */
273static struct page **pcpu4k_pages __initdata;
274static int pcpu4k_nr_static_pages __initdata;
312 275
313void __cpuinit numa_remove_cpu(int cpu) 276static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
314{ 277{
315 numa_set_cpumask(cpu, 0); 278 if (pageno < pcpu4k_nr_static_pages)
279 return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
280 return NULL;
316} 281}
317 282
318int cpu_to_node(int cpu) 283static void __init pcpu4k_populate_pte(unsigned long addr)
319{ 284{
320 if (early_per_cpu_ptr(x86_cpu_to_node_map)) { 285 populate_extra_pte(addr);
321 printk(KERN_WARNING
322 "cpu_to_node(%d): usage too early!\n", cpu);
323 dump_stack();
324 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
325 }
326 return per_cpu(x86_cpu_to_node_map, cpu);
327} 286}
328EXPORT_SYMBOL(cpu_to_node);
329 287
330/* 288static ssize_t __init setup_pcpu_4k(size_t static_size)
331 * Same function as cpu_to_node() but used if called before the
332 * per_cpu areas are setup.
333 */
334int early_cpu_to_node(int cpu)
335{ 289{
336 if (early_per_cpu_ptr(x86_cpu_to_node_map)) 290 size_t pages_size;
337 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; 291 unsigned int cpu;
338 292 int i, j;
339 if (!per_cpu_offset(cpu)) { 293 ssize_t ret;
340 printk(KERN_WARNING 294
341 "early_cpu_to_node(%d): no per_cpu area!\n", cpu); 295 pcpu4k_nr_static_pages = PFN_UP(static_size);
342 dump_stack(); 296
343 return NUMA_NO_NODE; 297 /* unaligned allocations can't be freed, round up to page size */
344 } 298 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
345 return per_cpu(x86_cpu_to_node_map, cpu); 299 * sizeof(pcpu4k_pages[0]));
346} 300 pcpu4k_pages = alloc_bootmem(pages_size);
347 301
302 /* allocate and copy */
303 j = 0;
304 for_each_possible_cpu(cpu)
305 for (i = 0; i < pcpu4k_nr_static_pages; i++) {
306 void *ptr;
307
308 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
309 if (!ptr)
310 goto enomem;
311
312 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
313 pcpu4k_pages[j++] = virt_to_page(ptr);
314 }
348 315
349/* empty cpumask */ 316 /* we're ready, commit */
350static const cpumask_t cpu_mask_none; 317 pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
318 pcpu4k_nr_static_pages, static_size);
319
320 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
321 PERCPU_FIRST_CHUNK_RESERVE, -1,
322 -1, NULL, pcpu4k_populate_pte);
323 goto out_free_ar;
324
325enomem:
326 while (--j >= 0)
327 free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
328 ret = -ENOMEM;
329out_free_ar:
330 free_bootmem(__pa(pcpu4k_pages), pages_size);
331 return ret;
332}
351 333
352/* 334static inline void setup_percpu_segment(int cpu)
353 * Returns a pointer to the bitmask of CPUs on Node 'node'.
354 */
355const cpumask_t *cpumask_of_node(int node)
356{ 335{
357 if (node_to_cpumask_map == NULL) { 336#ifdef CONFIG_X86_32
358 printk(KERN_WARNING 337 struct desc_struct gdt;
359 "cpumask_of_node(%d): no node_to_cpumask_map!\n", 338
360 node); 339 pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
361 dump_stack(); 340 0x2 | DESCTYPE_S, 0x8);
362 return (const cpumask_t *)&cpu_online_map; 341 gdt.s = 1;
363 } 342 write_gdt_entry(get_cpu_gdt_table(cpu),
364 if (node >= nr_node_ids) { 343 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
365 printk(KERN_WARNING 344#endif
366 "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
367 node, nr_node_ids);
368 dump_stack();
369 return &cpu_mask_none;
370 }
371 return &node_to_cpumask_map[node];
372} 345}
373EXPORT_SYMBOL(cpumask_of_node);
374 346
375/* 347/*
376 * Returns a bitmask of CPUs on Node 'node'. 348 * Great future plan:
377 * 349 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
378 * Side note: this function creates the returned cpumask on the stack 350 * Always point %gs to its beginning
379 * so with a high NR_CPUS count, excessive stack space is used. The
380 * node_to_cpumask_ptr function should be used whenever possible.
381 */ 351 */
382cpumask_t node_to_cpumask(int node) 352void __init setup_per_cpu_areas(void)
383{ 353{
384 if (node_to_cpumask_map == NULL) { 354 size_t static_size = __per_cpu_end - __per_cpu_start;
385 printk(KERN_WARNING 355 unsigned int cpu;
386 "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); 356 unsigned long delta;
387 dump_stack(); 357 size_t pcpu_unit_size;
388 return cpu_online_map; 358 ssize_t ret;
389 }
390 if (node >= nr_node_ids) {
391 printk(KERN_WARNING
392 "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
393 node, nr_node_ids);
394 dump_stack();
395 return cpu_mask_none;
396 }
397 return node_to_cpumask_map[node];
398}
399EXPORT_SYMBOL(node_to_cpumask);
400 359
401/* 360 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
402 * --------- end of debug versions of the numa functions --------- 361 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
403 */
404 362
405#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ 363 /*
364 * Allocate percpu area. If PSE is supported, try to make use
365 * of large page mappings. Please read comments on top of
366 * each allocator for details.
367 */
368 ret = setup_pcpu_remap(static_size);
369 if (ret < 0)
370 ret = setup_pcpu_embed(static_size);
371 if (ret < 0)
372 ret = setup_pcpu_4k(static_size);
373 if (ret < 0)
374 panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
375 static_size, ret);
376
377 pcpu_unit_size = ret;
378
379 /* alrighty, percpu areas up and running */
380 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
381 for_each_possible_cpu(cpu) {
382 per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
383 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
384 per_cpu(cpu_number, cpu) = cpu;
385 setup_percpu_segment(cpu);
386 setup_stack_canary_segment(cpu);
387 /*
388 * Copy data used in early init routines from the
389 * initial arrays to the per cpu data areas. These
390 * arrays then become expendable and the *_early_ptr's
391 * are zeroed indicating that the static arrays are
392 * gone.
393 */
394#ifdef CONFIG_X86_LOCAL_APIC
395 per_cpu(x86_cpu_to_apicid, cpu) =
396 early_per_cpu_map(x86_cpu_to_apicid, cpu);
397 per_cpu(x86_bios_cpu_apicid, cpu) =
398 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
399#endif
400#ifdef CONFIG_X86_64
401 per_cpu(irq_stack_ptr, cpu) =
402 per_cpu(irq_stack_union.irq_stack, cpu) +
403 IRQ_STACK_SIZE - 64;
404#ifdef CONFIG_NUMA
405 per_cpu(x86_cpu_to_node_map, cpu) =
406 early_per_cpu_map(x86_cpu_to_node_map, cpu);
407#endif
408#endif
409 /*
410 * Up to this point, the boot CPU has been using .data.init
411 * area. Reload any changed state for the boot CPU.
412 */
413 if (cpu == boot_cpu_id)
414 switch_to_new_gdt(cpu);
415 }
406 416
407#endif /* X86_64_NUMA */ 417 /* indicate the early static arrays will soon be gone */
418#ifdef CONFIG_X86_LOCAL_APIC
419 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
420 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
421#endif
422#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
423 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
424#endif
408 425
426 /* Setup node to cpumask map */
427 setup_node_to_cpumask_map();
428
429 /* Setup cpu initialized, callin, callout masks */
430 setup_cpu_local_masks();
431}
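
Once setup_per_cpu_areas() has filled __per_cpu_offset[], a per-cpu access is plain pointer arithmetic: the variable's link-time address plus the owning cpu's offset. A sketch with an illustrative accessor name (the real accessors live in the percpu headers):

	extern unsigned long __per_cpu_offset[];

	static inline void *cpu_var_ptr(void *link_addr, unsigned int cpu)
	{
		/* delta + cpu * pcpu_unit_size was folded into
		 * __per_cpu_offset[cpu] at boot */
		return (char *)link_addr + __per_cpu_offset[cpu];
	}
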
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index df0587f24c54..14425166b8e3 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -50,27 +50,23 @@
50# define FIX_EFLAGS __FIX_EFLAGS 50# define FIX_EFLAGS __FIX_EFLAGS
51#endif 51#endif
52 52
53#define COPY(x) { \ 53#define COPY(x) do { \
54 err |= __get_user(regs->x, &sc->x); \ 54 get_user_ex(regs->x, &sc->x); \
55} 55} while (0)
56 56
57#define COPY_SEG(seg) { \ 57#define GET_SEG(seg) ({ \
58 unsigned short tmp; \ 58 unsigned short tmp; \
59 err |= __get_user(tmp, &sc->seg); \ 59 get_user_ex(tmp, &sc->seg); \
60 regs->seg = tmp; \ 60 tmp; \
61} 61})
62 62
63#define COPY_SEG_CPL3(seg) { \ 63#define COPY_SEG(seg) do { \
64 unsigned short tmp; \ 64 regs->seg = GET_SEG(seg); \
65 err |= __get_user(tmp, &sc->seg); \ 65} while (0)
66 regs->seg = tmp | 3; \
67}
68 66
69#define GET_SEG(seg) { \ 67#define COPY_SEG_CPL3(seg) do { \
70 unsigned short tmp; \ 68 regs->seg = GET_SEG(seg) | 3; \
71 err |= __get_user(tmp, &sc->seg); \ 69} while (0)
72 loadsegment(seg, tmp); \
73}
74 70
75static int 71static int
76restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 72restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
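
The macro rewrite above replaces per-access err |= __get_user() accumulation with the get_user_try/get_user_ex/get_user_catch pattern, where faults are collected once at the catch site. As a rough behavioural sketch only (the real macros rely on exception-table fixups, not per-access checks):

	/* illustrative expansion, not the real uaccess definitions */
	#define get_user_try		do { int __gu_err = 0;
	#define get_user_ex(x, ptr)	do {				\
			if (__get_user((x), (ptr)))		\
				__gu_err = -EFAULT;		\
		} while (0)
	#define get_user_catch(err)	(err) |= __gu_err; } while (0)
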
@@ -83,45 +79,49 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
83 /* Always make any pending restarted system calls return -EINTR */ 79 /* Always make any pending restarted system calls return -EINTR */
84 current_thread_info()->restart_block.fn = do_no_restart_syscall; 80 current_thread_info()->restart_block.fn = do_no_restart_syscall;
85 81
82 get_user_try {
83
86#ifdef CONFIG_X86_32 84#ifdef CONFIG_X86_32
87 GET_SEG(gs); 85 set_user_gs(regs, GET_SEG(gs));
88 COPY_SEG(fs); 86 COPY_SEG(fs);
89 COPY_SEG(es); 87 COPY_SEG(es);
90 COPY_SEG(ds); 88 COPY_SEG(ds);
91#endif /* CONFIG_X86_32 */ 89#endif /* CONFIG_X86_32 */
92 90
93 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 91 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
94 COPY(dx); COPY(cx); COPY(ip); 92 COPY(dx); COPY(cx); COPY(ip);
95 93
96#ifdef CONFIG_X86_64 94#ifdef CONFIG_X86_64
97 COPY(r8); 95 COPY(r8);
98 COPY(r9); 96 COPY(r9);
99 COPY(r10); 97 COPY(r10);
100 COPY(r11); 98 COPY(r11);
101 COPY(r12); 99 COPY(r12);
102 COPY(r13); 100 COPY(r13);
103 COPY(r14); 101 COPY(r14);
104 COPY(r15); 102 COPY(r15);
105#endif /* CONFIG_X86_64 */ 103#endif /* CONFIG_X86_64 */
106 104
107#ifdef CONFIG_X86_32 105#ifdef CONFIG_X86_32
108 COPY_SEG_CPL3(cs); 106 COPY_SEG_CPL3(cs);
109 COPY_SEG_CPL3(ss); 107 COPY_SEG_CPL3(ss);
110#else /* !CONFIG_X86_32 */ 108#else /* !CONFIG_X86_32 */
111 /* Kernel saves and restores only the CS segment register on signals, 109 /* Kernel saves and restores only the CS segment register on signals,
112 * which is the bare minimum needed to allow mixed 32/64-bit code. 110 * which is the bare minimum needed to allow mixed 32/64-bit code.
113 * App's signal handler can save/restore other segments if needed. */ 111 * App's signal handler can save/restore other segments if needed. */
114 COPY_SEG_CPL3(cs); 112 COPY_SEG_CPL3(cs);
115#endif /* CONFIG_X86_32 */ 113#endif /* CONFIG_X86_32 */
116 114
117 err |= __get_user(tmpflags, &sc->flags); 115 get_user_ex(tmpflags, &sc->flags);
118 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); 116 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
119 regs->orig_ax = -1; /* disable syscall checks */ 117 regs->orig_ax = -1; /* disable syscall checks */
120 118
121 err |= __get_user(buf, &sc->fpstate); 119 get_user_ex(buf, &sc->fpstate);
122 err |= restore_i387_xstate(buf); 120 err |= restore_i387_xstate(buf);
121
122 get_user_ex(*pax, &sc->ax);
123 } get_user_catch(err);
123 124
124 err |= __get_user(*pax, &sc->ax);
125 return err; 125 return err;
126} 126}
127 127
@@ -131,57 +131,55 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
131{ 131{
132 int err = 0; 132 int err = 0;
133 133
134#ifdef CONFIG_X86_32 134 put_user_try {
135 {
136 unsigned int tmp;
137 135
138 savesegment(gs, tmp); 136#ifdef CONFIG_X86_32
139 err |= __put_user(tmp, (unsigned int __user *)&sc->gs); 137 put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs);
140 } 138 put_user_ex(regs->fs, (unsigned int __user *)&sc->fs);
141 err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); 139 put_user_ex(regs->es, (unsigned int __user *)&sc->es);
142 err |= __put_user(regs->es, (unsigned int __user *)&sc->es); 140 put_user_ex(regs->ds, (unsigned int __user *)&sc->ds);
143 err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds);
144#endif /* CONFIG_X86_32 */ 141#endif /* CONFIG_X86_32 */
145 142
146 err |= __put_user(regs->di, &sc->di); 143 put_user_ex(regs->di, &sc->di);
147 err |= __put_user(regs->si, &sc->si); 144 put_user_ex(regs->si, &sc->si);
148 err |= __put_user(regs->bp, &sc->bp); 145 put_user_ex(regs->bp, &sc->bp);
149 err |= __put_user(regs->sp, &sc->sp); 146 put_user_ex(regs->sp, &sc->sp);
150 err |= __put_user(regs->bx, &sc->bx); 147 put_user_ex(regs->bx, &sc->bx);
151 err |= __put_user(regs->dx, &sc->dx); 148 put_user_ex(regs->dx, &sc->dx);
152 err |= __put_user(regs->cx, &sc->cx); 149 put_user_ex(regs->cx, &sc->cx);
153 err |= __put_user(regs->ax, &sc->ax); 150 put_user_ex(regs->ax, &sc->ax);
154#ifdef CONFIG_X86_64 151#ifdef CONFIG_X86_64
155 err |= __put_user(regs->r8, &sc->r8); 152 put_user_ex(regs->r8, &sc->r8);
156 err |= __put_user(regs->r9, &sc->r9); 153 put_user_ex(regs->r9, &sc->r9);
157 err |= __put_user(regs->r10, &sc->r10); 154 put_user_ex(regs->r10, &sc->r10);
158 err |= __put_user(regs->r11, &sc->r11); 155 put_user_ex(regs->r11, &sc->r11);
159 err |= __put_user(regs->r12, &sc->r12); 156 put_user_ex(regs->r12, &sc->r12);
160 err |= __put_user(regs->r13, &sc->r13); 157 put_user_ex(regs->r13, &sc->r13);
161 err |= __put_user(regs->r14, &sc->r14); 158 put_user_ex(regs->r14, &sc->r14);
162 err |= __put_user(regs->r15, &sc->r15); 159 put_user_ex(regs->r15, &sc->r15);
163#endif /* CONFIG_X86_64 */ 160#endif /* CONFIG_X86_64 */
164 161
165 err |= __put_user(current->thread.trap_no, &sc->trapno); 162 put_user_ex(current->thread.trap_no, &sc->trapno);
166 err |= __put_user(current->thread.error_code, &sc->err); 163 put_user_ex(current->thread.error_code, &sc->err);
167 err |= __put_user(regs->ip, &sc->ip); 164 put_user_ex(regs->ip, &sc->ip);
168#ifdef CONFIG_X86_32 165#ifdef CONFIG_X86_32
169 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); 166 put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
170 err |= __put_user(regs->flags, &sc->flags); 167 put_user_ex(regs->flags, &sc->flags);
171 err |= __put_user(regs->sp, &sc->sp_at_signal); 168 put_user_ex(regs->sp, &sc->sp_at_signal);
172 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); 169 put_user_ex(regs->ss, (unsigned int __user *)&sc->ss);
173#else /* !CONFIG_X86_32 */ 170#else /* !CONFIG_X86_32 */
174 err |= __put_user(regs->flags, &sc->flags); 171 put_user_ex(regs->flags, &sc->flags);
175 err |= __put_user(regs->cs, &sc->cs); 172 put_user_ex(regs->cs, &sc->cs);
176 err |= __put_user(0, &sc->gs); 173 put_user_ex(0, &sc->gs);
177 err |= __put_user(0, &sc->fs); 174 put_user_ex(0, &sc->fs);
178#endif /* CONFIG_X86_32 */ 175#endif /* CONFIG_X86_32 */
179 176
180 err |= __put_user(fpstate, &sc->fpstate); 177 put_user_ex(fpstate, &sc->fpstate);
181 178
182 /* non-iBCS2 extensions.. */ 179 /* non-iBCS2 extensions.. */
183 err |= __put_user(mask, &sc->oldmask); 180 put_user_ex(mask, &sc->oldmask);
184 err |= __put_user(current->thread.cr2, &sc->cr2); 181 put_user_ex(current->thread.cr2, &sc->cr2);
182 } put_user_catch(err);
185 183
186 return err; 184 return err;
187} 185}
@@ -189,6 +187,77 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
189/* 187/*
190 * Set up a signal frame. 188 * Set up a signal frame.
191 */ 189 */
190
191/*
 192 * Determine which stack to use.
193 */
194static unsigned long align_sigframe(unsigned long sp)
195{
196#ifdef CONFIG_X86_32
197 /*
198 * Align the stack pointer according to the i386 ABI,
199 * i.e. so that on function entry ((sp + 4) & 15) == 0.
200 */
201 sp = ((sp + 4) & -16ul) - 4;
202#else /* !CONFIG_X86_32 */
203 sp = round_down(sp, 16) - 8;
204#endif
205 return sp;
206}
207
208static inline void __user *
209get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
210 void __user **fpstate)
211{
212 /* Default to using normal stack */
213 unsigned long sp = regs->sp;
214 int onsigstack = on_sig_stack(sp);
215
216#ifdef CONFIG_X86_64
217 /* redzone */
218 sp -= 128;
219#endif /* CONFIG_X86_64 */
220
221 if (!onsigstack) {
222 /* This is the X/Open sanctioned signal stack switching. */
223 if (ka->sa.sa_flags & SA_ONSTACK) {
224 if (current->sas_ss_size)
225 sp = current->sas_ss_sp + current->sas_ss_size;
226 } else {
227#ifdef CONFIG_X86_32
228 /* This is the legacy signal stack switching. */
229 if ((regs->ss & 0xffff) != __USER_DS &&
230 !(ka->sa.sa_flags & SA_RESTORER) &&
231 ka->sa.sa_restorer)
232 sp = (unsigned long) ka->sa.sa_restorer;
233#endif /* CONFIG_X86_32 */
234 }
235 }
236
237 if (used_math()) {
238 sp -= sig_xstate_size;
239#ifdef CONFIG_X86_64
240 sp = round_down(sp, 64);
241#endif /* CONFIG_X86_64 */
242 *fpstate = (void __user *)sp;
243 }
244
245 sp = align_sigframe(sp - frame_size);
246
247 /*
248 * If we are on the alternate signal stack and would overflow it, don't.
249 * Return an always-bogus address instead so we will die with SIGSEGV.
250 */
251 if (onsigstack && !likely(on_sig_stack(sp)))
252 return (void __user *)-1L;
253
254 /* save i387 state */
255 if (used_math() && save_i387_xstate(*fpstate) < 0)
256 return (void __user *)-1L;
257
258 return (void __user *)sp;
259}
260
192#ifdef CONFIG_X86_32 261#ifdef CONFIG_X86_32
193static const struct { 262static const struct {
194 u16 poplmovl; 263 u16 poplmovl;
@@ -212,54 +281,6 @@ static const struct {
212 0 281 0
213}; 282};
214 283
215/*
216 * Determine which stack to use..
217 */
218static inline void __user *
219get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
220 void **fpstate)
221{
222 unsigned long sp;
223
224 /* Default to using normal stack */
225 sp = regs->sp;
226
227 /*
228 * If we are on the alternate signal stack and would overflow it, don't.
229 * Return an always-bogus address instead so we will die with SIGSEGV.
230 */
231 if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size)))
232 return (void __user *) -1L;
233
234 /* This is the X/Open sanctioned signal stack switching. */
235 if (ka->sa.sa_flags & SA_ONSTACK) {
236 if (sas_ss_flags(sp) == 0)
237 sp = current->sas_ss_sp + current->sas_ss_size;
238 } else {
239 /* This is the legacy signal stack switching. */
240 if ((regs->ss & 0xffff) != __USER_DS &&
241 !(ka->sa.sa_flags & SA_RESTORER) &&
242 ka->sa.sa_restorer)
243 sp = (unsigned long) ka->sa.sa_restorer;
244 }
245
246 if (used_math()) {
247 sp = sp - sig_xstate_size;
248 *fpstate = (struct _fpstate *) sp;
249 if (save_i387_xstate(*fpstate) < 0)
250 return (void __user *)-1L;
251 }
252
253 sp -= frame_size;
254 /*
255 * Align the stack pointer according to the i386 ABI,
256 * i.e. so that on function entry ((sp + 4) & 15) == 0.
257 */
258 sp = ((sp + 4) & -16ul) - 4;
259
260 return (void __user *) sp;
261}
262
263static int 284static int
264__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, 285__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
265 struct pt_regs *regs) 286 struct pt_regs *regs)
@@ -336,43 +357,41 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
336 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 357 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
337 return -EFAULT; 358 return -EFAULT;
338 359
339 err |= __put_user(sig, &frame->sig); 360 put_user_try {
340 err |= __put_user(&frame->info, &frame->pinfo); 361 put_user_ex(sig, &frame->sig);
341 err |= __put_user(&frame->uc, &frame->puc); 362 put_user_ex(&frame->info, &frame->pinfo);
342 err |= copy_siginfo_to_user(&frame->info, info); 363 put_user_ex(&frame->uc, &frame->puc);
343 if (err) 364 err |= copy_siginfo_to_user(&frame->info, info);
344 return -EFAULT;
345
346 /* Create the ucontext. */
347 if (cpu_has_xsave)
348 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
349 else
350 err |= __put_user(0, &frame->uc.uc_flags);
351 err |= __put_user(0, &frame->uc.uc_link);
352 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
353 err |= __put_user(sas_ss_flags(regs->sp),
354 &frame->uc.uc_stack.ss_flags);
355 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
356 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
357 regs, set->sig[0]);
358 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
359 if (err)
360 return -EFAULT;
361 365
362 /* Set up to return from userspace. */ 366 /* Create the ucontext. */
363 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); 367 if (cpu_has_xsave)
364 if (ka->sa.sa_flags & SA_RESTORER) 368 put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
365 restorer = ka->sa.sa_restorer; 369 else
366 err |= __put_user(restorer, &frame->pretcode); 370 put_user_ex(0, &frame->uc.uc_flags);
371 put_user_ex(0, &frame->uc.uc_link);
372 put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
373 put_user_ex(sas_ss_flags(regs->sp),
374 &frame->uc.uc_stack.ss_flags);
375 put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
376 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
377 regs, set->sig[0]);
378 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
379
380 /* Set up to return from userspace. */
381 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
382 if (ka->sa.sa_flags & SA_RESTORER)
383 restorer = ka->sa.sa_restorer;
384 put_user_ex(restorer, &frame->pretcode);
367 385
368 /* 386 /*
369 * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 387 * This is movl $__NR_rt_sigreturn, %ax ; int $0x80
370 * 388 *
371 * WE DO NOT USE IT ANY MORE! It's only left here for historical 389 * WE DO NOT USE IT ANY MORE! It's only left here for historical
372 * reasons and because gdb uses it as a signature to notice 390 * reasons and because gdb uses it as a signature to notice
373 * signal handler stack frames. 391 * signal handler stack frames.
374 */ 392 */
375 err |= __put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode); 393 put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
394 } put_user_catch(err);
376 395
377 if (err) 396 if (err)
378 return -EFAULT; 397 return -EFAULT;
@@ -392,24 +411,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
392 return 0; 411 return 0;
393} 412}
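
The put_user_try { put_user_ex(...); } put_user_catch(err) construct
used above batches the user-space stores: instead of accumulating
err |= __put_user(...) after every call, each put_user_ex() store is
covered by an exception-table entry, and a fault diverts control so
that err is set once at the catch. A rough user-space model of that
control flow, with setjmp/longjmp standing in for the exception table
(a sketch only; the real macros are asm-based in the x86 uaccess
headers):

#include <setjmp.h>
#include <stdio.h>

static jmp_buf fault_env;

#define put_user_try		if (!setjmp(fault_env)) {
#define put_user_catch(err)	} else { (err) = -14; /* -EFAULT */ }

static void put_user_ex(int val, volatile int *uaddr)
{
	if (!uaddr)			/* stand-in for a page fault */
		longjmp(fault_env, 1);
	*uaddr = val;
}

int main(void)
{
	volatile int ok = 0;
	int err = 0;

	put_user_try {
		put_user_ex(1, &ok);
		put_user_ex(2, NULL);	/* "faults": jumps to the catch */
	} put_user_catch(err);

	printf("ok=%d err=%d\n", ok, err);	/* ok=1 err=-14 */
	return 0;
}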
394#else /* !CONFIG_X86_32 */ 413#else /* !CONFIG_X86_32 */
395/*
396 * Determine which stack to use..
397 */
398static void __user *
399get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size)
400{
401 /* Default to using normal stack - redzone*/
402 sp -= 128;
403
404 /* This is the X/Open sanctioned signal stack switching. */
405 if (ka->sa.sa_flags & SA_ONSTACK) {
406 if (sas_ss_flags(sp) == 0)
407 sp = current->sas_ss_sp + current->sas_ss_size;
408 }
409
410 return (void __user *)round_down(sp - size, 64);
411}
412
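
The sp -= 128 in the helper deleted above accounts for the x86-64
System V ABI's "red zone": 128 bytes below the user %rsp that a leaf
function may use without adjusting the stack pointer, so the kernel
must not place a signal frame there. The constraint in outline (a
user-space sketch with illustrative numbers):

#include <stdio.h>

#define REDZONE 128	/* x86-64 SysV ABI scratch area below %rsp */

static unsigned long frame_start(unsigned long rsp, unsigned long size)
{
	unsigned long sp = rsp - REDZONE;	/* never touch the red zone */

	sp -= size;				/* room for the signal frame */
	return sp & ~15ul;			/* align down */
}

int main(void)
{
	unsigned long rsp = 0x7fffffffe000ul;

	printf("frame may start at %#lx\n", frame_start(rsp, 512));
	return 0;
}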
413static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 414static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
414 sigset_t *set, struct pt_regs *regs) 415 sigset_t *set, struct pt_regs *regs)
415{ 416{
@@ -418,15 +419,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
418 int err = 0; 419 int err = 0;
419 struct task_struct *me = current; 420 struct task_struct *me = current;
420 421
421 if (used_math()) { 422 frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp);
422 fp = get_stack(ka, regs->sp, sig_xstate_size);
423 frame = (void __user *)round_down(
424 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
425
426 if (save_i387_xstate(fp) < 0)
427 return -EFAULT;
428 } else
429 frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8;
430 423
431 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 424 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
432 return -EFAULT; 425 return -EFAULT;
@@ -436,28 +429,30 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
436 return -EFAULT; 429 return -EFAULT;
437 } 430 }
438 431
439 /* Create the ucontext. */ 432 put_user_try {
440 if (cpu_has_xsave) 433 /* Create the ucontext. */
441 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); 434 if (cpu_has_xsave)
442 else 435 put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
443 err |= __put_user(0, &frame->uc.uc_flags); 436 else
444 err |= __put_user(0, &frame->uc.uc_link); 437 put_user_ex(0, &frame->uc.uc_flags);
445 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 438 put_user_ex(0, &frame->uc.uc_link);
446 err |= __put_user(sas_ss_flags(regs->sp), 439 put_user_ex(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
447 &frame->uc.uc_stack.ss_flags); 440 put_user_ex(sas_ss_flags(regs->sp),
448 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); 441 &frame->uc.uc_stack.ss_flags);
449 err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); 442 put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
450 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 443 err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
451 444 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
452 /* Set up to return from userspace. If provided, use a stub 445
453 already in userspace. */ 446 /* Set up to return from userspace. If provided, use a stub
454 /* x86-64 should always use SA_RESTORER. */ 447 already in userspace. */
455 if (ka->sa.sa_flags & SA_RESTORER) { 448 /* x86-64 should always use SA_RESTORER. */
456 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); 449 if (ka->sa.sa_flags & SA_RESTORER) {
457 } else { 450 put_user_ex(ka->sa.sa_restorer, &frame->pretcode);
458 /* could use a vstub here */ 451 } else {
459 return -EFAULT; 452 /* could use a vstub here */
460 } 453 err |= -EFAULT;
454 }
455 } put_user_catch(err);
461 456
462 if (err) 457 if (err)
463 return -EFAULT; 458 return -EFAULT;
@@ -509,31 +504,41 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
509 struct old_sigaction __user *oact) 504 struct old_sigaction __user *oact)
510{ 505{
511 struct k_sigaction new_ka, old_ka; 506 struct k_sigaction new_ka, old_ka;
512 int ret; 507 int ret = 0;
513 508
514 if (act) { 509 if (act) {
515 old_sigset_t mask; 510 old_sigset_t mask;
516 511
517 if (!access_ok(VERIFY_READ, act, sizeof(*act)) || 512 if (!access_ok(VERIFY_READ, act, sizeof(*act)))
518 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
519 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
520 return -EFAULT; 513 return -EFAULT;
521 514
522 __get_user(new_ka.sa.sa_flags, &act->sa_flags); 515 get_user_try {
523 __get_user(mask, &act->sa_mask); 516 get_user_ex(new_ka.sa.sa_handler, &act->sa_handler);
517 get_user_ex(new_ka.sa.sa_flags, &act->sa_flags);
518 get_user_ex(mask, &act->sa_mask);
519 get_user_ex(new_ka.sa.sa_restorer, &act->sa_restorer);
520 } get_user_catch(ret);
521
522 if (ret)
523 return -EFAULT;
524 siginitset(&new_ka.sa.sa_mask, mask); 524 siginitset(&new_ka.sa.sa_mask, mask);
525 } 525 }
526 526
527 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); 527 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
528 528
529 if (!ret && oact) { 529 if (!ret && oact) {
530 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || 530 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)))
531 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
532 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
533 return -EFAULT; 531 return -EFAULT;
534 532
535 __put_user(old_ka.sa.sa_flags, &oact->sa_flags); 533 put_user_try {
536 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); 534 put_user_ex(old_ka.sa.sa_handler, &oact->sa_handler);
535 put_user_ex(old_ka.sa.sa_flags, &oact->sa_flags);
536 put_user_ex(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
537 put_user_ex(old_ka.sa.sa_restorer, &oact->sa_restorer);
538 } put_user_catch(ret);
539
540 if (ret)
541 return -EFAULT;
537 } 542 }
538 543
539 return ret; 544 return ret;
@@ -541,14 +546,9 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
541#endif /* CONFIG_X86_32 */ 546#endif /* CONFIG_X86_32 */
542 547
543#ifdef CONFIG_X86_32 548#ifdef CONFIG_X86_32
544asmlinkage int sys_sigaltstack(unsigned long bx) 549int sys_sigaltstack(struct pt_regs *regs)
545{ 550{
546 /* 551 const stack_t __user *uss = (const stack_t __user *)regs->bx;
547 * This is needed to make gcc realize it doesn't own the
548 * "struct pt_regs"
549 */
550 struct pt_regs *regs = (struct pt_regs *)&bx;
551 const stack_t __user *uss = (const stack_t __user *)bx;
552 stack_t __user *uoss = (stack_t __user *)regs->cx; 552 stack_t __user *uoss = (stack_t __user *)regs->cx;
553 553
554 return do_sigaltstack(uss, uoss, regs->sp); 554 return do_sigaltstack(uss, uoss, regs->sp);
@@ -566,14 +566,12 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
566 * Do a signal return; undo the signal stack. 566 * Do a signal return; undo the signal stack.
567 */ 567 */
568#ifdef CONFIG_X86_32 568#ifdef CONFIG_X86_32
569asmlinkage unsigned long sys_sigreturn(unsigned long __unused) 569unsigned long sys_sigreturn(struct pt_regs *regs)
570{ 570{
571 struct sigframe __user *frame; 571 struct sigframe __user *frame;
572 struct pt_regs *regs;
573 unsigned long ax; 572 unsigned long ax;
574 sigset_t set; 573 sigset_t set;
575 574
576 regs = (struct pt_regs *) &__unused;
577 frame = (struct sigframe __user *)(regs->sp - 8); 575 frame = (struct sigframe __user *)(regs->sp - 8);
578 576
579 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 577 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
@@ -600,7 +598,7 @@ badframe:
600} 598}
601#endif /* CONFIG_X86_32 */ 599#endif /* CONFIG_X86_32 */
602 600
603static long do_rt_sigreturn(struct pt_regs *regs) 601long sys_rt_sigreturn(struct pt_regs *regs)
604{ 602{
605 struct rt_sigframe __user *frame; 603 struct rt_sigframe __user *frame;
606 unsigned long ax; 604 unsigned long ax;
@@ -631,25 +629,6 @@ badframe:
631 return 0; 629 return 0;
632} 630}
633 631
634#ifdef CONFIG_X86_32
635/*
636 * Note: do not pass in pt_regs directly as with tail-call optimization
637 * GCC will incorrectly stomp on the caller's frame and corrupt user-space
638 * register state:
639 */
640asmlinkage int sys_rt_sigreturn(unsigned long __unused)
641{
642 struct pt_regs *regs = (struct pt_regs *)&__unused;
643
644 return do_rt_sigreturn(regs);
645}
646#else /* !CONFIG_X86_32 */
647asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
648{
649 return do_rt_sigreturn(regs);
650}
651#endif /* CONFIG_X86_32 */
652
653/* 632/*
654 * OK, we're invoking a handler: 633 * OK, we're invoking a handler:
655 */ 634 */
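
The wrappers removed above recovered struct pt_regs by taking the
address of a dummy stack argument, and, as the deleted comment warns,
that breaks once GCC tail-call-optimizes and reuses the caller's frame.
The replacement convention, visible in the syscall_table_32.S hunk
later in this patch, has small assembly stubs (the ptregs_* entries)
hand pt_regs to the C handlers explicitly. Modeled in plain C (an
illustrative sketch; all names here are made up):

#include <stdio.h>

struct fake_regs { long bx, cx, sp; };

/* the C handler receives an explicit, well-defined pointer ... */
static long my_sigreturn(struct fake_regs *regs)
{
	return regs->bx + regs->cx;
}

/* ... built by a stub, instead of the handler guessing at the
 * caller's stack layout via the address of a dummy argument */
static long ptregs_stub(long bx, long cx, long sp)
{
	struct fake_regs regs = { .bx = bx, .cx = cx, .sp = sp };

	return my_sigreturn(&regs);
}

int main(void)
{
	printf("%ld\n", ptregs_stub(1, 2, 3));	/* 3 */
	return 0;
}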
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index e6faa3316bd2..13f33ea8ccaa 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -2,7 +2,7 @@
2 * Intel SMP support routines. 2 * Intel SMP support routines.
3 * 3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> 4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
5 * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> 5 * (c) 1998-99, 2000, 2009 Ingo Molnar <mingo@redhat.com>
6 * (c) 2002,2003 Andi Kleen, SuSE Labs. 6 * (c) 2002,2003 Andi Kleen, SuSE Labs.
7 * 7 *
8 * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com> 8 * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
@@ -26,8 +26,7 @@
26#include <asm/tlbflush.h> 26#include <asm/tlbflush.h>
27#include <asm/mmu_context.h> 27#include <asm/mmu_context.h>
28#include <asm/proto.h> 28#include <asm/proto.h>
29#include <mach_ipi.h> 29#include <asm/apic.h>
30#include <mach_apic.h>
31/* 30/*
32 * Some notes on x86 processor bugs affecting SMP operation: 31 * Some notes on x86 processor bugs affecting SMP operation:
33 * 32 *
@@ -118,12 +117,12 @@ static void native_smp_send_reschedule(int cpu)
118 WARN_ON(1); 117 WARN_ON(1);
119 return; 118 return;
120 } 119 }
121 send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); 120 apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
122} 121}
123 122
124void native_send_call_func_single_ipi(int cpu) 123void native_send_call_func_single_ipi(int cpu)
125{ 124{
126 send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); 125 apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
127} 126}
128 127
129void native_send_call_func_ipi(const struct cpumask *mask) 128void native_send_call_func_ipi(const struct cpumask *mask)
@@ -131,7 +130,7 @@ void native_send_call_func_ipi(const struct cpumask *mask)
131 cpumask_var_t allbutself; 130 cpumask_var_t allbutself;
132 131
133 if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) { 132 if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) {
134 send_IPI_mask(mask, CALL_FUNCTION_VECTOR); 133 apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
135 return; 134 return;
136 } 135 }
137 136
@@ -140,9 +139,9 @@ void native_send_call_func_ipi(const struct cpumask *mask)
140 139
141 if (cpumask_equal(mask, allbutself) && 140 if (cpumask_equal(mask, allbutself) &&
142 cpumask_equal(cpu_online_mask, cpu_callout_mask)) 141 cpumask_equal(cpu_online_mask, cpu_callout_mask))
143 send_IPI_allbutself(CALL_FUNCTION_VECTOR); 142 apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR);
144 else 143 else
145 send_IPI_mask(mask, CALL_FUNCTION_VECTOR); 144 apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
146 145
147 free_cpumask_var(allbutself); 146 free_cpumask_var(allbutself);
148} 147}
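
The pattern behind every hunk in this file, send_IPI_mask() becoming
apic->send_IPI_mask(), is the replacement of the old compile-time
mach_ipi.h/mach_apic.h selection with a structure of function pointers
chosen at boot (the struct apic drivers now under
arch/x86/kernel/apic/). Reduced to its essentials (a sketch; the field
set shown here is illustrative):

#include <stdio.h>

struct cpumask;				/* opaque for this sketch */

/* one driver per APIC flavor; boot code picks one */
struct apic_ops {
	const char *name;
	void (*send_IPI_mask)(const struct cpumask *mask, int vector);
	void (*send_IPI_allbutself)(int vector);
};

static void flat_send_IPI_mask(const struct cpumask *mask, int vector)
{
	printf("flat: IPI vector %d to mask\n", vector);
}

static void flat_send_IPI_allbutself(int vector)
{
	printf("flat: IPI vector %d to all but self\n", vector);
}

static struct apic_ops apic_flat = {
	.name			= "flat",
	.send_IPI_mask		= flat_send_IPI_mask,
	.send_IPI_allbutself	= flat_send_IPI_allbutself,
};

/* the single global everything else calls through */
static struct apic_ops *apic = &apic_flat;

int main(void)
{
	apic->send_IPI_mask(NULL, 0xfd);
	return 0;
}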
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bb1a3b1fc87f..58d24ef917d8 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -2,7 +2,7 @@
2 * x86 SMP booting functions 2 * x86 SMP booting functions
3 * 3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> 4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> 5 * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
6 * Copyright 2001 Andi Kleen, SuSE Labs. 6 * Copyright 2001 Andi Kleen, SuSE Labs.
7 * 7 *
8 * Much of the core SMP work is based on previous work by Thomas Radke, to 8 * Much of the core SMP work is based on previous work by Thomas Radke, to
@@ -53,7 +53,6 @@
53#include <asm/nmi.h> 53#include <asm/nmi.h>
54#include <asm/irq.h> 54#include <asm/irq.h>
55#include <asm/idle.h> 55#include <asm/idle.h>
56#include <asm/smp.h>
57#include <asm/trampoline.h> 56#include <asm/trampoline.h>
58#include <asm/cpu.h> 57#include <asm/cpu.h>
59#include <asm/numa.h> 58#include <asm/numa.h>
@@ -61,13 +60,12 @@
61#include <asm/tlbflush.h> 60#include <asm/tlbflush.h>
62#include <asm/mtrr.h> 61#include <asm/mtrr.h>
63#include <asm/vmi.h> 62#include <asm/vmi.h>
64#include <asm/genapic.h> 63#include <asm/apic.h>
65#include <asm/setup.h> 64#include <asm/setup.h>
65#include <asm/uv/uv.h>
66#include <linux/mc146818rtc.h> 66#include <linux/mc146818rtc.h>
67 67
68#include <mach_apic.h> 68#include <asm/smpboot_hooks.h>
69#include <mach_wakecpu.h>
70#include <smpboot_hooks.h>
71 69
72#ifdef CONFIG_X86_32 70#ifdef CONFIG_X86_32
73u8 apicid_2_node[MAX_APICID]; 71u8 apicid_2_node[MAX_APICID];
@@ -103,29 +101,20 @@ EXPORT_SYMBOL(smp_num_siblings);
103DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; 101DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
104 102
105/* representing HT siblings of each logical CPU */ 103/* representing HT siblings of each logical CPU */
106DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); 104DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
107EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); 105EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
108 106
109/* representing HT and core siblings of each logical CPU */ 107/* representing HT and core siblings of each logical CPU */
110DEFINE_PER_CPU(cpumask_t, cpu_core_map); 108DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
111EXPORT_PER_CPU_SYMBOL(cpu_core_map); 109EXPORT_PER_CPU_SYMBOL(cpu_core_map);
112 110
113/* Per CPU bogomips and other parameters */ 111/* Per CPU bogomips and other parameters */
114DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 112DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
115EXPORT_PER_CPU_SYMBOL(cpu_info); 113EXPORT_PER_CPU_SYMBOL(cpu_info);
116 114
117static atomic_t init_deasserted; 115atomic_t init_deasserted;
118
119
120/* Set if we find a B stepping CPU */
121static int __cpuinitdata smp_b_stepping;
122 116
123#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) 117#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
124
125/* which logical CPUs are on which nodes */
126cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly =
127 { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
128EXPORT_SYMBOL(node_to_cpumask_map);
129/* which node each logical CPU is on */ 118/* which node each logical CPU is on */
130int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; 119int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
131EXPORT_SYMBOL(cpu_to_node_map); 120EXPORT_SYMBOL(cpu_to_node_map);
@@ -134,7 +123,7 @@ EXPORT_SYMBOL(cpu_to_node_map);
134static void map_cpu_to_node(int cpu, int node) 123static void map_cpu_to_node(int cpu, int node)
135{ 124{
136 printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node); 125 printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
137 cpumask_set_cpu(cpu, &node_to_cpumask_map[node]); 126 cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
138 cpu_to_node_map[cpu] = node; 127 cpu_to_node_map[cpu] = node;
139} 128}
140 129
@@ -145,7 +134,7 @@ static void unmap_cpu_to_node(int cpu)
145 134
146 printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu); 135 printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
147 for (node = 0; node < MAX_NUMNODES; node++) 136 for (node = 0; node < MAX_NUMNODES; node++)
148 cpumask_clear_cpu(cpu, &node_to_cpumask_map[node]); 137 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
149 cpu_to_node_map[cpu] = 0; 138 cpu_to_node_map[cpu] = 0;
150} 139}
151#else /* !(CONFIG_NUMA && CONFIG_X86_32) */ 140#else /* !(CONFIG_NUMA && CONFIG_X86_32) */
@@ -163,7 +152,7 @@ static void map_cpu_to_logical_apicid(void)
163{ 152{
164 int cpu = smp_processor_id(); 153 int cpu = smp_processor_id();
165 int apicid = logical_smp_processor_id(); 154 int apicid = logical_smp_processor_id();
166 int node = apicid_to_node(apicid); 155 int node = apic->apicid_to_node(apicid);
167 156
168 if (!node_online(node)) 157 if (!node_online(node))
169 node = first_online_node; 158 node = first_online_node;
@@ -196,7 +185,8 @@ static void __cpuinit smp_callin(void)
196 * our local APIC. We have to wait for the IPI or we'll 185 * our local APIC. We have to wait for the IPI or we'll
197 * lock up on an APIC access. 186 * lock up on an APIC access.
198 */ 187 */
199 wait_for_init_deassert(&init_deasserted); 188 if (apic->wait_for_init_deassert)
189 apic->wait_for_init_deassert(&init_deasserted);
200 190
201 /* 191 /*
202 * (This works even if the APIC is not enabled.) 192 * (This works even if the APIC is not enabled.)
@@ -243,7 +233,8 @@ static void __cpuinit smp_callin(void)
243 */ 233 */
244 234
245 pr_debug("CALLIN, before setup_local_APIC().\n"); 235 pr_debug("CALLIN, before setup_local_APIC().\n");
246 smp_callin_clear_local_apic(); 236 if (apic->smp_callin_clear_local_apic)
237 apic->smp_callin_clear_local_apic();
247 setup_local_APIC(); 238 setup_local_APIC();
248 end_local_APIC_setup(); 239 end_local_APIC_setup();
249 map_cpu_to_logical_apicid(); 240 map_cpu_to_logical_apicid();
@@ -271,8 +262,6 @@ static void __cpuinit smp_callin(void)
271 cpumask_set_cpu(cpuid, cpu_callin_mask); 262 cpumask_set_cpu(cpuid, cpu_callin_mask);
272} 263}
273 264
274static int __cpuinitdata unsafe_smp;
275
276/* 265/*
277 * Activate a secondary processor. 266 * Activate a secondary processor.
278 */ 267 */
@@ -307,7 +296,7 @@ notrace static void __cpuinit start_secondary(void *unused)
307 __flush_tlb_all(); 296 __flush_tlb_all();
308#endif 297#endif
309 298
310 /* This must be done before setting cpu_online_map */ 299 /* This must be done before setting cpu_online_mask */
311 set_cpu_sibling_map(raw_smp_processor_id()); 300 set_cpu_sibling_map(raw_smp_processor_id());
312 wmb(); 301 wmb();
313 302
@@ -340,75 +329,22 @@ notrace static void __cpuinit start_secondary(void *unused)
340 cpu_idle(); 329 cpu_idle();
341} 330}
342 331
343static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) 332#ifdef CONFIG_CPUMASK_OFFSTACK
333/* In this case, llc_shared_map is a pointer to a cpumask. */
334static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
335 const struct cpuinfo_x86 *src)
344{ 336{
345 /* 337 struct cpumask *llc = dst->llc_shared_map;
346 * Mask B, Pentium, but not Pentium MMX 338 *dst = *src;
347 */ 339 dst->llc_shared_map = llc;
348 if (c->x86_vendor == X86_VENDOR_INTEL &&
349 c->x86 == 5 &&
350 c->x86_mask >= 1 && c->x86_mask <= 4 &&
351 c->x86_model <= 3)
352 /*
353 * Remember we have B step Pentia with bugs
354 */
355 smp_b_stepping = 1;
356
357 /*
358 * Certain Athlons might work (for various values of 'work') in SMP
359 * but they are not certified as MP capable.
360 */
361 if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
362
363 if (num_possible_cpus() == 1)
364 goto valid_k7;
365
366 /* Athlon 660/661 is valid. */
367 if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
368 (c->x86_mask == 1)))
369 goto valid_k7;
370
371 /* Duron 670 is valid */
372 if ((c->x86_model == 7) && (c->x86_mask == 0))
373 goto valid_k7;
374
375 /*
376 * Athlon 662, Duron 671, and Athlon >model 7 have capability
377 * bit. It's worth noting that the A5 stepping (662) of some
378 * Athlon XP's have the MP bit set.
379 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
380 * more.
381 */
382 if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
383 ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
384 (c->x86_model > 7))
385 if (cpu_has_mp)
386 goto valid_k7;
387
388 /* If we get here, not a certified SMP capable AMD system. */
389 unsafe_smp = 1;
390 }
391
392valid_k7:
393 ;
394} 340}
395 341#else
396static void __cpuinit smp_checks(void) 342static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
343 const struct cpuinfo_x86 *src)
397{ 344{
398 if (smp_b_stepping) 345 *dst = *src;
399 printk(KERN_WARNING "WARNING: SMP operation may be unreliable "
400 "with B stepping processors.\n");
401
402 /*
403 * Don't taint if we are running SMP kernel on a single non-MP
404 * approved Athlon
405 */
406 if (unsafe_smp && num_online_cpus() > 1) {
407 printk(KERN_INFO "WARNING: This combination of AMD "
408 "processors is not suitable for SMP.\n");
409 add_taint(TAINT_UNSAFE_SMP);
410 }
411} 346}
347#endif /* CONFIG_CPUMASK_OFFSTACK */
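
copy_cpuinfo_x86() above guards against a subtle aliasing bug: with
CONFIG_CPUMASK_OFFSTACK, llc_shared_map inside struct cpuinfo_x86 is a
pointer to separately allocated storage, so a plain structure
assignment would overwrite the destination CPU's pointer with the
source's, making both CPUs share one mask and leaking the other
allocation. The save/copy/restore idiom in miniature (illustrative
types):

#include <stdio.h>
#include <stdlib.h>

struct info {
	int family, model;
	unsigned long *llc_shared_map;	/* separately allocated mask */
};

static void copy_info(struct info *dst, const struct info *src)
{
	unsigned long *llc = dst->llc_shared_map;	/* save ours */

	*dst = *src;				/* bulk field copy */
	dst->llc_shared_map = llc;		/* keep our own storage */
}

int main(void)
{
	struct info boot = { 6, 23, calloc(1, sizeof(unsigned long)) };
	struct info ap   = { 0, 0,  calloc(1, sizeof(unsigned long)) };

	copy_info(&ap, &boot);
	printf("family=%d, masks distinct: %d\n", ap.family,
	       ap.llc_shared_map != boot.llc_shared_map);	/* 6, 1 */

	free(boot.llc_shared_map);
	free(ap.llc_shared_map);
	return 0;
}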
412 348
413/* 349/*
414 * The bootstrap kernel entry code has set these up. Save them for 350 * The bootstrap kernel entry code has set these up. Save them for
@@ -419,11 +355,10 @@ void __cpuinit smp_store_cpu_info(int id)
419{ 355{
420 struct cpuinfo_x86 *c = &cpu_data(id); 356 struct cpuinfo_x86 *c = &cpu_data(id);
421 357
422 *c = boot_cpu_data; 358 copy_cpuinfo_x86(c, &boot_cpu_data);
423 c->cpu_index = id; 359 c->cpu_index = id;
424 if (id != 0) 360 if (id != 0)
425 identify_secondary_cpu(c); 361 identify_secondary_cpu(c);
426 smp_apply_quirks(c);
427} 362}
428 363
429 364
@@ -444,15 +379,15 @@ void __cpuinit set_cpu_sibling_map(int cpu)
444 cpumask_set_cpu(cpu, cpu_sibling_mask(i)); 379 cpumask_set_cpu(cpu, cpu_sibling_mask(i));
445 cpumask_set_cpu(i, cpu_core_mask(cpu)); 380 cpumask_set_cpu(i, cpu_core_mask(cpu));
446 cpumask_set_cpu(cpu, cpu_core_mask(i)); 381 cpumask_set_cpu(cpu, cpu_core_mask(i));
447 cpumask_set_cpu(i, &c->llc_shared_map); 382 cpumask_set_cpu(i, c->llc_shared_map);
448 cpumask_set_cpu(cpu, &o->llc_shared_map); 383 cpumask_set_cpu(cpu, o->llc_shared_map);
449 } 384 }
450 } 385 }
451 } else { 386 } else {
452 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); 387 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
453 } 388 }
454 389
455 cpumask_set_cpu(cpu, &c->llc_shared_map); 390 cpumask_set_cpu(cpu, c->llc_shared_map);
456 391
457 if (current_cpu_data.x86_max_cores == 1) { 392 if (current_cpu_data.x86_max_cores == 1) {
458 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); 393 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
@@ -463,8 +398,8 @@ void __cpuinit set_cpu_sibling_map(int cpu)
463 for_each_cpu(i, cpu_sibling_setup_mask) { 398 for_each_cpu(i, cpu_sibling_setup_mask) {
464 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 399 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
465 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 400 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
466 cpumask_set_cpu(i, &c->llc_shared_map); 401 cpumask_set_cpu(i, c->llc_shared_map);
467 cpumask_set_cpu(cpu, &cpu_data(i).llc_shared_map); 402 cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map);
468 } 403 }
469 if (c->phys_proc_id == cpu_data(i).phys_proc_id) { 404 if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
470 cpumask_set_cpu(i, cpu_core_mask(cpu)); 405 cpumask_set_cpu(i, cpu_core_mask(cpu));
@@ -502,12 +437,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
502 if (sched_mc_power_savings || sched_smt_power_savings) 437 if (sched_mc_power_savings || sched_smt_power_savings)
503 return cpu_core_mask(cpu); 438 return cpu_core_mask(cpu);
504 else 439 else
505 return &c->llc_shared_map; 440 return c->llc_shared_map;
506}
507
508cpumask_t cpu_coregroup_map(int cpu)
509{
510 return *cpu_coregroup_mask(cpu);
511} 441}
512 442
513static void impress_friends(void) 443static void impress_friends(void)
@@ -583,7 +513,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
583 /* Target chip */ 513 /* Target chip */
584 /* Boot on the stack */ 514 /* Boot on the stack */
585 /* Kick the second */ 515 /* Kick the second */
586 apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid); 516 apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid);
587 517
588 pr_debug("Waiting for send to finish...\n"); 518 pr_debug("Waiting for send to finish...\n");
589 send_status = safe_apic_wait_icr_idle(); 519 send_status = safe_apic_wait_icr_idle();
@@ -614,12 +544,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
614 unsigned long send_status, accept_status = 0; 544 unsigned long send_status, accept_status = 0;
615 int maxlvt, num_starts, j; 545 int maxlvt, num_starts, j;
616 546
617 if (get_uv_system_type() == UV_NON_UNIQUE_APIC) {
618 send_status = uv_wakeup_secondary(phys_apicid, start_eip);
619 atomic_set(&init_deasserted, 1);
620 return send_status;
621 }
622
623 maxlvt = lapic_get_maxlvt(); 547 maxlvt = lapic_get_maxlvt();
624 548
625 /* 549 /*
@@ -745,78 +669,23 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
745 complete(&c_idle->done); 669 complete(&c_idle->done);
746} 670}
747 671
748#ifdef CONFIG_X86_64
749
750/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
751static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
752{
753 if (!after_bootmem)
754 free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
755}
756
757/*
758 * Allocate node local memory for the AP pda.
759 *
760 * Must be called after the _cpu_pda pointer table is initialized.
761 */
762int __cpuinit get_local_pda(int cpu)
763{
764 struct x8664_pda *oldpda, *newpda;
765 unsigned long size = sizeof(struct x8664_pda);
766 int node = cpu_to_node(cpu);
767
768 if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
769 return 0;
770
771 oldpda = cpu_pda(cpu);
772 newpda = kmalloc_node(size, GFP_ATOMIC, node);
773 if (!newpda) {
774 printk(KERN_ERR "Could not allocate node local PDA "
775 "for CPU %d on node %d\n", cpu, node);
776
777 if (oldpda)
778 return 0; /* have a usable pda */
779 else
780 return -1;
781 }
782
783 if (oldpda) {
784 memcpy(newpda, oldpda, size);
785 free_bootmem_pda(oldpda);
786 }
787
788 newpda->in_bootmem = 0;
789 cpu_pda(cpu) = newpda;
790 return 0;
791}
792#endif /* CONFIG_X86_64 */
793
794static int __cpuinit do_boot_cpu(int apicid, int cpu)
795/* 672/*
796 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 673 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
797 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 674 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
798 * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu. 675 * Returns zero if CPU booted OK, else error code from
676 * ->wakeup_secondary_cpu.
799 */ 677 */
678static int __cpuinit do_boot_cpu(int apicid, int cpu)
800{ 679{
801 unsigned long boot_error = 0; 680 unsigned long boot_error = 0;
802 int timeout;
803 unsigned long start_ip; 681 unsigned long start_ip;
804 unsigned short nmi_high = 0, nmi_low = 0; 682 int timeout;
805 struct create_idle c_idle = { 683 struct create_idle c_idle = {
806 .cpu = cpu, 684 .cpu = cpu,
807 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), 685 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
808 }; 686 };
809 INIT_WORK(&c_idle.work, do_fork_idle);
810 687
811#ifdef CONFIG_X86_64 688 INIT_WORK(&c_idle.work, do_fork_idle);
812 /* Allocate node local memory for AP pdas */
813 if (cpu > 0) {
814 boot_error = get_local_pda(cpu);
815 if (boot_error)
816 goto restore_state;
817 /* if can't get pda memory, can't start cpu */
818 }
819#endif
820 689
821 alternatives_smp_switch(1); 690 alternatives_smp_switch(1);
822 691
@@ -847,14 +716,16 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
847 716
848 set_idle_for_cpu(cpu, c_idle.idle); 717 set_idle_for_cpu(cpu, c_idle.idle);
849do_rest: 718do_rest:
850#ifdef CONFIG_X86_32
851 per_cpu(current_task, cpu) = c_idle.idle; 719 per_cpu(current_task, cpu) = c_idle.idle;
852 init_gdt(cpu); 720#ifdef CONFIG_X86_32
853 /* Stack for startup_32 can be just as for start_secondary onwards */ 721 /* Stack for startup_32 can be just as for start_secondary onwards */
854 irq_ctx_init(cpu); 722 irq_ctx_init(cpu);
855#else 723#else
856 cpu_pda(cpu)->pcurrent = c_idle.idle;
857 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 724 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
725 initial_gs = per_cpu_offset(cpu);
726 per_cpu(kernel_stack, cpu) =
727 (unsigned long)task_stack_page(c_idle.idle) -
728 KERNEL_STACK_OFFSET + THREAD_SIZE;
858#endif 729#endif
859 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 730 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
860 initial_code = (unsigned long)start_secondary; 731 initial_code = (unsigned long)start_secondary;
@@ -878,8 +749,6 @@ do_rest:
878 749
879 pr_debug("Setting warm reset code and vector.\n"); 750 pr_debug("Setting warm reset code and vector.\n");
880 751
881 store_NMI_vector(&nmi_high, &nmi_low);
882
883 smpboot_setup_warm_reset_vector(start_ip); 752 smpboot_setup_warm_reset_vector(start_ip);
884 /* 753 /*
885 * Be paranoid about clearing APIC errors. 754 * Be paranoid about clearing APIC errors.
@@ -891,9 +760,13 @@ do_rest:
891 } 760 }
892 761
893 /* 762 /*
894 * Starting actual IPI sequence... 763 * Kick the secondary CPU. Use the method in the APIC driver
764 * if it's defined - or use an INIT boot APIC message otherwise:
895 */ 765 */
896 boot_error = wakeup_secondary_cpu(apicid, start_ip); 766 if (apic->wakeup_secondary_cpu)
767 boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
768 else
769 boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
897 770
898 if (!boot_error) { 771 if (!boot_error) {
899 /* 772 /*
@@ -927,13 +800,11 @@ do_rest:
927 else 800 else
928 /* trampoline code not run */ 801 /* trampoline code not run */
929 printk(KERN_ERR "Not responding.\n"); 802 printk(KERN_ERR "Not responding.\n");
930 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) 803 if (apic->inquire_remote_apic)
931 inquire_remote_apic(apicid); 804 apic->inquire_remote_apic(apicid);
932 } 805 }
933 } 806 }
934#ifdef CONFIG_X86_64 807
935restore_state:
936#endif
937 if (boot_error) { 808 if (boot_error) {
938 /* Try to put things back the way they were before ... */ 809 /* Try to put things back the way they were before ... */
939 numa_remove_cpu(cpu); /* was set by numa_add_cpu */ 810 numa_remove_cpu(cpu); /* was set by numa_add_cpu */
@@ -961,7 +832,7 @@ restore_state:
961 832
962int __cpuinit native_cpu_up(unsigned int cpu) 833int __cpuinit native_cpu_up(unsigned int cpu)
963{ 834{
964 int apicid = cpu_present_to_apicid(cpu); 835 int apicid = apic->cpu_present_to_apicid(cpu);
965 unsigned long flags; 836 unsigned long flags;
966 int err; 837 int err;
967 838
@@ -1033,9 +904,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
1033 */ 904 */
1034static __init void disable_smp(void) 905static __init void disable_smp(void)
1035{ 906{
1036 /* use the read/write pointers to the present and possible maps */ 907 init_cpu_present(cpumask_of(0));
1037 cpumask_copy(&cpu_present_map, cpumask_of(0)); 908 init_cpu_possible(cpumask_of(0));
1038 cpumask_copy(&cpu_possible_map, cpumask_of(0));
1039 smpboot_clear_io_apic_irqs(); 909 smpboot_clear_io_apic_irqs();
1040 910
1041 if (smp_found_config) 911 if (smp_found_config)
@@ -1054,14 +924,14 @@ static int __init smp_sanity_check(unsigned max_cpus)
1054{ 924{
1055 preempt_disable(); 925 preempt_disable();
1056 926
1057#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32) 927#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32)
1058 if (def_to_bigsmp && nr_cpu_ids > 8) { 928 if (def_to_bigsmp && nr_cpu_ids > 8) {
1059 unsigned int cpu; 929 unsigned int cpu;
1060 unsigned nr; 930 unsigned nr;
1061 931
1062 printk(KERN_WARNING 932 printk(KERN_WARNING
1063 "More than 8 CPUs detected - skipping them.\n" 933 "More than 8 CPUs detected - skipping them.\n"
1064 "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n"); 934 "Use CONFIG_X86_BIGSMP.\n");
1065 935
1066 nr = 0; 936 nr = 0;
1067 for_each_present_cpu(cpu) { 937 for_each_present_cpu(cpu) {
@@ -1107,7 +977,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1107 * Should not be necessary because the MP table should list the boot 977 * Should not be necessary because the MP table should list the boot
1108 * CPU too, but we do it for the sake of robustness anyway. 978 * CPU too, but we do it for the sake of robustness anyway.
1109 */ 979 */
1110 if (!check_phys_apicid_present(boot_cpu_physical_apicid)) { 980 if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
1111 printk(KERN_NOTICE 981 printk(KERN_NOTICE
1112 "weird, boot CPU (#%d) not listed by the BIOS.\n", 982 "weird, boot CPU (#%d) not listed by the BIOS.\n",
1113 boot_cpu_physical_apicid); 983 boot_cpu_physical_apicid);
@@ -1125,6 +995,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1125 printk(KERN_ERR "... forcing use of dummy APIC emulation. " 995 printk(KERN_ERR "... forcing use of dummy APIC emulation. "
1126 "(tell your hw vendor)\n"); 996 "(tell your hw vendor)\n");
1127 smpboot_clear_io_apic(); 997 smpboot_clear_io_apic();
998 arch_disable_smp_support();
1128 return -1; 999 return -1;
1129 } 1000 }
1130 1001
@@ -1166,6 +1037,8 @@ static void __init smp_cpu_index_default(void)
1166 */ 1037 */
1167void __init native_smp_prepare_cpus(unsigned int max_cpus) 1038void __init native_smp_prepare_cpus(unsigned int max_cpus)
1168{ 1039{
1040 unsigned int i;
1041
1169 preempt_disable(); 1042 preempt_disable();
1170 smp_cpu_index_default(); 1043 smp_cpu_index_default();
1171 current_cpu_data = boot_cpu_data; 1044 current_cpu_data = boot_cpu_data;
@@ -1179,11 +1052,19 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1179 boot_cpu_logical_apicid = logical_smp_processor_id(); 1052 boot_cpu_logical_apicid = logical_smp_processor_id();
1180#endif 1053#endif
1181 current_thread_info()->cpu = 0; /* needed? */ 1054 current_thread_info()->cpu = 0; /* needed? */
1055 for_each_possible_cpu(i) {
1056 alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1057 alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1058 alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
1059 cpumask_clear(per_cpu(cpu_core_map, i));
1060 cpumask_clear(per_cpu(cpu_sibling_map, i));
1061 cpumask_clear(cpu_data(i).llc_shared_map);
1062 }
1182 set_cpu_sibling_map(0); 1063 set_cpu_sibling_map(0);
1183 1064
1184#ifdef CONFIG_X86_64
1185 enable_IR_x2apic(); 1065 enable_IR_x2apic();
1186 setup_apic_routing(); 1066#ifdef CONFIG_X86_64
1067 default_setup_apic_routing();
1187#endif 1068#endif
1188 1069
1189 if (smp_sanity_check(max_cpus) < 0) { 1070 if (smp_sanity_check(max_cpus) < 0) {
@@ -1207,18 +1088,18 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1207 */ 1088 */
1208 setup_local_APIC(); 1089 setup_local_APIC();
1209 1090
1210#ifdef CONFIG_X86_64
1211 /* 1091 /*
1212 * Enable IO APIC before setting up error vector 1092 * Enable IO APIC before setting up error vector
1213 */ 1093 */
1214 if (!skip_ioapic_setup && nr_ioapics) 1094 if (!skip_ioapic_setup && nr_ioapics)
1215 enable_IO_APIC(); 1095 enable_IO_APIC();
1216#endif 1096
1217 end_local_APIC_setup(); 1097 end_local_APIC_setup();
1218 1098
1219 map_cpu_to_logical_apicid(); 1099 map_cpu_to_logical_apicid();
1220 1100
1221 setup_portio_remap(); 1101 if (apic->setup_portio_remap)
1102 apic->setup_portio_remap();
1222 1103
1223 smpboot_setup_io_apic(); 1104 smpboot_setup_io_apic();
1224 /* 1105 /*
@@ -1240,10 +1121,7 @@ out:
1240void __init native_smp_prepare_boot_cpu(void) 1121void __init native_smp_prepare_boot_cpu(void)
1241{ 1122{
1242 int me = smp_processor_id(); 1123 int me = smp_processor_id();
1243#ifdef CONFIG_X86_32 1124 switch_to_new_gdt(me);
1244 init_gdt(me);
1245#endif
1246 switch_to_new_gdt();
1247 /* already set me in cpu_online_mask in boot_cpu_init() */ 1125 /* already set me in cpu_online_mask in boot_cpu_init() */
1248 cpumask_set_cpu(me, cpu_callout_mask); 1126 cpumask_set_cpu(me, cpu_callout_mask);
1249 per_cpu(cpu_state, me) = CPU_ONLINE; 1127 per_cpu(cpu_state, me) = CPU_ONLINE;
@@ -1254,7 +1132,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1254 pr_debug("Boot done.\n"); 1132 pr_debug("Boot done.\n");
1255 1133
1256 impress_friends(); 1134 impress_friends();
1257 smp_checks();
1258#ifdef CONFIG_X86_IO_APIC 1135#ifdef CONFIG_X86_IO_APIC
1259 setup_ioapic_dest(); 1136 setup_ioapic_dest();
1260#endif 1137#endif
@@ -1271,11 +1148,11 @@ early_param("possible_cpus", _setup_possible_cpus);
1271 1148
1272 1149
1273/* 1150/*
1274 * cpu_possible_map should be static, it cannot change as CPUs 1151 * cpu_possible_mask should be static, it cannot change as CPUs
1275 * are onlined, or offlined. The reason is per-cpu data-structures 1152 * are onlined, or offlined. The reason is per-cpu data-structures
1276 * are allocated by some modules at init time, and don't expect to 1153 * are allocated by some modules at init time, and don't expect to
1277 * do this dynamically on cpu arrival/departure. 1154 * do this dynamically on cpu arrival/departure.
1278 * cpu_present_map on the other hand can change dynamically. 1155 * cpu_present_mask on the other hand can change dynamically.
1279 * In case when cpu_hotplug is not compiled, then we resort to current 1156 * In case when cpu_hotplug is not compiled, then we resort to current
1280 * behaviour, which is cpu_possible == cpu_present. 1157 * behaviour, which is cpu_possible == cpu_present.
1281 * - Ashok Raj 1158 * - Ashok Raj
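
Several hunks above convert cpumask_t members and per-CPU maps to
cpumask_var_t plus alloc_cpumask_var(). The point of that API is that
with CONFIG_CPUMASK_OFFSTACK the type is a pointer and the allocation
is real, while without it the type is a one-element array and the
"allocation" is free, so the same call sites serve both large and
small NR_CPUS configurations. The dual-definition trick, schematically
(a sketch; the real version lives in linux/cpumask.h):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct bitmask { unsigned long bits[1]; };	/* stand-in for the mask */

#ifdef OFFSTACK
typedef struct bitmask *mask_var_t;		/* pointer: heap-backed */

static int alloc_mask_var(mask_var_t *m)
{
	*m = calloc(1, sizeof(struct bitmask));
	return *m != NULL;
}
#else
typedef struct bitmask mask_var_t[1];		/* array: no allocation */

static int alloc_mask_var(mask_var_t *m)
{
	memset(*m, 0, sizeof(struct bitmask));	/* nothing to allocate */
	return 1;
}
#endif

int main(void)
{
	mask_var_t m;

	if (!alloc_mask_var(&m))
		return 1;
	/* either way, m now refers to usable mask storage */
	printf("mask at %p\n", (void *)m);
	return 0;
}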
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
deleted file mode 100644
index 397e309839dd..000000000000
--- a/arch/x86/kernel/smpcommon.c
+++ /dev/null
@@ -1,30 +0,0 @@
1/*
2 * SMP stuff which is common to all sub-architectures.
3 */
4#include <linux/module.h>
5#include <asm/smp.h>
6
7#ifdef CONFIG_X86_32
8DEFINE_PER_CPU(unsigned long, this_cpu_off);
9EXPORT_PER_CPU_SYMBOL(this_cpu_off);
10
11/*
12 * Initialize the CPU's GDT. This is either the boot CPU doing itself
13 * (still using the master per-cpu area), or a CPU doing it for a
14 * secondary which will soon come up.
15 */
16__cpuinit void init_gdt(int cpu)
17{
18 struct desc_struct gdt;
19
20 pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
21 0x2 | DESCTYPE_S, 0x8);
22 gdt.s = 1;
23
24 write_gdt_entry(get_cpu_gdt_table(cpu),
25 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
26
27 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
28 per_cpu(cpu_number, cpu) = cpu;
29}
30#endif
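
The deleted init_gdt() packed a GDT descriptor whose base is the CPU's
per-cpu offset, so that segment-relative addressing (%fs on 32-bit)
reaches per-CPU variables; after the percpu rework in this merge, that
setup happens in switch_to_new_gdt(), which now takes the CPU number
(see the smpboot.c hunk above). The addressing model itself, stripped
of segmentation (a conceptual sketch only):

#include <stdio.h>

#define NR_CPUS 4

/* each CPU's private copy of all per-CPU variables */
static int percpu_area[NR_CPUS][16];
static long this_cpu_off[NR_CPUS];	/* what init_gdt effectively recorded */

#define MY_VAR_SLOT 4			/* a hypothetical variable's slot */

static int *per_cpu_var(int cpu)
{
	/* "segment base" (the CPU's area) + the variable's offset */
	return &percpu_area[cpu][MY_VAR_SLOT];
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		this_cpu_off[cpu] = (long)&percpu_area[cpu][0];
		*per_cpu_var(cpu) = cpu * 100;
	}
	printf("cpu2 var = %d\n", *per_cpu_var(2));	/* 200 */
	return 0;
}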
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 10786af95545..f7bddc2e37d1 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Stack trace management functions 2 * Stack trace management functions
3 * 3 *
4 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 4 * Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 */ 5 */
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/stacktrace.h> 7#include <linux/stacktrace.h>
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
deleted file mode 100644
index 7b987852e876..000000000000
--- a/arch/x86/kernel/summit_32.c
+++ /dev/null
@@ -1,188 +0,0 @@
1/*
2 * IBM Summit-Specific Code
3 *
4 * Written By: Matthew Dobson, IBM Corporation
5 *
6 * Copyright (c) 2003 IBM Corp.
7 *
8 * All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or (at
13 * your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
18 * NON INFRINGEMENT. See the GNU General Public License for more
19 * details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 * Send feedback to <colpatch@us.ibm.com>
26 *
27 */
28
29#include <linux/mm.h>
30#include <linux/init.h>
31#include <asm/io.h>
32#include <asm/bios_ebda.h>
33#include <asm/summit/mpparse.h>
34
35static struct rio_table_hdr *rio_table_hdr __initdata;
36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
37static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
38
39#ifndef CONFIG_X86_NUMAQ
40static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata;
41#endif
42
43static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
44{
45 int twister = 0, node = 0;
46 int i, bus, num_buses;
47
48 for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
49 if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) {
50 twister = rio_devs[i]->owner_id;
51 break;
52 }
53 }
54 if (i == rio_table_hdr->num_rio_dev) {
55 printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__);
56 return last_bus;
57 }
58
59 for (i = 0; i < rio_table_hdr->num_scal_dev; i++) {
60 if (scal_devs[i]->node_id == twister) {
61 node = scal_devs[i]->node_id;
62 break;
63 }
64 }
65 if (i == rio_table_hdr->num_scal_dev) {
66 printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__);
67 return last_bus;
68 }
69
70 switch (rio_devs[wpeg_num]->type) {
71 case CompatWPEG:
72 /*
73 * The Compatibility Winnipeg controls the 2 legacy buses,
74 * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
75 * a PCI-PCI bridge card is used in either slot: total 5 buses.
76 */
77 num_buses = 5;
78 break;
79 case AltWPEG:
80 /*
81 * The Alternate Winnipeg controls the 2 133MHz buses [1 slot
82 * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
83 * the "extra" buses for each of those slots: total 7 buses.
84 */
85 num_buses = 7;
86 break;
87 case LookOutAWPEG:
88 case LookOutBWPEG:
89 /*
90 * A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
91 * & the "extra" buses for each of those slots: total 9 buses.
92 */
93 num_buses = 9;
94 break;
95 default:
96 printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__);
97 return last_bus;
98 }
99
100 for (bus = last_bus; bus < last_bus + num_buses; bus++)
101 mp_bus_id_to_node[bus] = node;
102 return bus;
103}
104
105static int __init build_detail_arrays(void)
106{
107 unsigned long ptr;
108 int i, scal_detail_size, rio_detail_size;
109
110 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
111 printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
112 return 0;
113 }
114
115 switch (rio_table_hdr->version) {
116 default:
117 printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version);
118 return 0;
119 case 2:
120 scal_detail_size = 11;
121 rio_detail_size = 13;
122 break;
123 case 3:
124 scal_detail_size = 12;
125 rio_detail_size = 15;
126 break;
127 }
128
129 ptr = (unsigned long)rio_table_hdr + 3;
130 for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
131 scal_devs[i] = (struct scal_detail *)ptr;
132
133 for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
134 rio_devs[i] = (struct rio_detail *)ptr;
135
136 return 1;
137}
138
139void __init setup_summit(void)
140{
141 unsigned long ptr;
142 unsigned short offset;
143 int i, next_wpeg, next_bus = 0;
144
145 /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
146 ptr = get_bios_ebda();
147 ptr = (unsigned long)phys_to_virt(ptr);
148
149 rio_table_hdr = NULL;
150 offset = 0x180;
151 while (offset) {
152 /* The block id is stored in the 2nd word */
153 if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) {
154 /* set the pointer past the offset & block id */
155 rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
156 break;
157 }
158 /* The next offset is stored in the 1st word. 0 means no more */
159 offset = *((unsigned short *)(ptr + offset));
160 }
161 if (!rio_table_hdr) {
162 printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__);
163 return;
164 }
165
166 if (!build_detail_arrays())
167 return;
168
169 /* The first Winnipeg we're looking for has an index of 0 */
170 next_wpeg = 0;
171 do {
172 for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
173 if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) {
174 /* It's the Winnipeg we're looking for! */
175 next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
176 next_wpeg++;
177 break;
178 }
179 }
180 /*
181 * If we go through all Rio devices and don't find one with
182 * the next index, it means we've found all the Winnipegs,
183 * and thus all the PCI buses.
184 */
185 if (i == rio_table_hdr->num_rio_dev)
186 next_wpeg = 0;
187 } while (next_wpeg != 0);
188}
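
setup_summit() above walks a simple chained layout in the EBDA: each
block begins with a 16-bit offset to the next block (zero terminates
the chain) followed by a 16-bit block id, and the scan stops on id
0x4752, the Rio Grande table header. The traversal extracted into a
standalone form over a fake buffer (sketch):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* walk { u16 next_offset; u16 block_id; payload... } records and
 * return the payload offset of the block with the wanted id */
static int find_block(const uint8_t *ebda, uint16_t start, uint16_t want)
{
	uint16_t offset = start;

	while (offset) {
		uint16_t next, id;

		memcpy(&next, ebda + offset, 2);	/* 1st word: next */
		memcpy(&id, ebda + offset + 2, 2);	/* 2nd word: id */
		if (id == want)
			return offset + 4;		/* skip offset + id */
		offset = next;				/* 0 ends the chain */
	}
	return -1;
}

int main(void)
{
	uint8_t ebda[1024] = { 0 };
	uint16_t off = 0x200, id1 = 0x1111, end = 0, id2 = 0x4752;

	/* fake chain: block at 0x180 -> block at 0x200 (the one we want) */
	memcpy(ebda + 0x180, &off, 2); memcpy(ebda + 0x182, &id1, 2);
	memcpy(ebda + 0x200, &end, 2); memcpy(ebda + 0x202, &id2, 2);

	printf("payload at %#x\n", find_block(ebda, 0x180, 0x4752));
	return 0;
}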
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index e2e86a08f31d..ff5c8736b491 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -1,7 +1,7 @@
1ENTRY(sys_call_table) 1ENTRY(sys_call_table)
2 .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ 2 .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
3 .long sys_exit 3 .long sys_exit
4 .long sys_fork 4 .long ptregs_fork
5 .long sys_read 5 .long sys_read
6 .long sys_write 6 .long sys_write
7 .long sys_open /* 5 */ 7 .long sys_open /* 5 */
@@ -10,7 +10,7 @@ ENTRY(sys_call_table)
10 .long sys_creat 10 .long sys_creat
11 .long sys_link 11 .long sys_link
12 .long sys_unlink /* 10 */ 12 .long sys_unlink /* 10 */
13 .long sys_execve 13 .long ptregs_execve
14 .long sys_chdir 14 .long sys_chdir
15 .long sys_time 15 .long sys_time
16 .long sys_mknod 16 .long sys_mknod
@@ -109,17 +109,17 @@ ENTRY(sys_call_table)
109 .long sys_newlstat 109 .long sys_newlstat
110 .long sys_newfstat 110 .long sys_newfstat
111 .long sys_uname 111 .long sys_uname
112 .long sys_iopl /* 110 */ 112 .long ptregs_iopl /* 110 */
113 .long sys_vhangup 113 .long sys_vhangup
114 .long sys_ni_syscall /* old "idle" system call */ 114 .long sys_ni_syscall /* old "idle" system call */
115 .long sys_vm86old 115 .long ptregs_vm86old
116 .long sys_wait4 116 .long sys_wait4
117 .long sys_swapoff /* 115 */ 117 .long sys_swapoff /* 115 */
118 .long sys_sysinfo 118 .long sys_sysinfo
119 .long sys_ipc 119 .long sys_ipc
120 .long sys_fsync 120 .long sys_fsync
121 .long sys_sigreturn 121 .long ptregs_sigreturn
122 .long sys_clone /* 120 */ 122 .long ptregs_clone /* 120 */
123 .long sys_setdomainname 123 .long sys_setdomainname
124 .long sys_newuname 124 .long sys_newuname
125 .long sys_modify_ldt 125 .long sys_modify_ldt
@@ -165,14 +165,14 @@ ENTRY(sys_call_table)
165 .long sys_mremap 165 .long sys_mremap
166 .long sys_setresuid16 166 .long sys_setresuid16
167 .long sys_getresuid16 /* 165 */ 167 .long sys_getresuid16 /* 165 */
168 .long sys_vm86 168 .long ptregs_vm86
169 .long sys_ni_syscall /* Old sys_query_module */ 169 .long sys_ni_syscall /* Old sys_query_module */
170 .long sys_poll 170 .long sys_poll
171 .long sys_nfsservctl 171 .long sys_nfsservctl
172 .long sys_setresgid16 /* 170 */ 172 .long sys_setresgid16 /* 170 */
173 .long sys_getresgid16 173 .long sys_getresgid16
174 .long sys_prctl 174 .long sys_prctl
175 .long sys_rt_sigreturn 175 .long ptregs_rt_sigreturn
176 .long sys_rt_sigaction 176 .long sys_rt_sigaction
177 .long sys_rt_sigprocmask /* 175 */ 177 .long sys_rt_sigprocmask /* 175 */
178 .long sys_rt_sigpending 178 .long sys_rt_sigpending
@@ -185,11 +185,11 @@ ENTRY(sys_call_table)
185 .long sys_getcwd 185 .long sys_getcwd
186 .long sys_capget 186 .long sys_capget
187 .long sys_capset /* 185 */ 187 .long sys_capset /* 185 */
188 .long sys_sigaltstack 188 .long ptregs_sigaltstack
189 .long sys_sendfile 189 .long sys_sendfile
190 .long sys_ni_syscall /* reserved for streams1 */ 190 .long sys_ni_syscall /* reserved for streams1 */
191 .long sys_ni_syscall /* reserved for streams2 */ 191 .long sys_ni_syscall /* reserved for streams2 */
192 .long sys_vfork /* 190 */ 192 .long ptregs_vfork /* 190 */
193 .long sys_getrlimit 193 .long sys_getrlimit
194 .long sys_mmap2 194 .long sys_mmap2
195 .long sys_truncate64 195 .long sys_truncate64
@@ -332,3 +332,5 @@ ENTRY(sys_call_table)
332 .long sys_dup3 /* 330 */ 332 .long sys_dup3 /* 330 */
333 .long sys_pipe2 333 .long sys_pipe2
334 .long sys_inotify_init1 334 .long sys_inotify_init1
335 .long sys_preadv
336 .long sys_pwritev
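
Besides switching the state-touching entries over to their ptregs_*
stubs, this hunk appends the new preadv/pwritev system calls at slots
333 and 334. The table itself is nothing more than an array of handler
addresses indexed by syscall number; as a toy model (not the kernel's
assembly table):

#include <stdio.h>

typedef long (*syscall_fn)(long, long, long);

static long toy_read(long fd, long buf, long n)  { return n; }
static long toy_write(long fd, long buf, long n) { return n; }

static const syscall_fn sys_call_table[] = { toy_read, toy_write };

static long do_syscall(unsigned int nr, long a, long b, long c)
{
	if (nr >= sizeof(sys_call_table) / sizeof(sys_call_table[0]))
		return -38;	/* -ENOSYS */
	return sys_call_table[nr](a, b, c);
}

int main(void)
{
	printf("%ld\n", do_syscall(1, 0, 0, 5));	/* toy_write: 5 */
	return 0;
}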
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 3985cac0ed47..5c5d87f0b2e1 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -33,12 +33,12 @@
33#include <linux/time.h> 33#include <linux/time.h>
34#include <linux/mca.h> 34#include <linux/mca.h>
35 35
36#include <asm/arch_hooks.h> 36#include <asm/setup.h>
37#include <asm/hpet.h> 37#include <asm/hpet.h>
38#include <asm/time.h> 38#include <asm/time.h>
39#include <asm/timer.h> 39#include <asm/timer.h>
40 40
41#include "do_timer.h" 41#include <asm/do_timer.h>
42 42
43int timer_ack; 43int timer_ack;
44 44
@@ -118,7 +118,7 @@ void __init hpet_time_init(void)
118{ 118{
119 if (!hpet_enable()) 119 if (!hpet_enable())
120 setup_pit_timer(); 120 setup_pit_timer();
121 time_init_hook(); 121 x86_quirk_time_init();
122} 122}
123 123
124/* 124/*
@@ -131,7 +131,7 @@ void __init hpet_time_init(void)
131 */ 131 */
132void __init time_init(void) 132void __init time_init(void)
133{ 133{
134 pre_time_init_hook(); 134 x86_quirk_pre_time_init();
135 tsc_init(); 135 tsc_init();
136 late_time_init = choose_time_init(); 136 late_time_init = choose_time_init();
137} 137}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index e6e695acd725..5ba343e61844 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -115,8 +115,7 @@ unsigned long __init calibrate_cpu(void)
115 115
116static struct irqaction irq0 = { 116static struct irqaction irq0 = {
117 .handler = timer_interrupt, 117 .handler = timer_interrupt,
118 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, 118 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER,
119 .mask = CPU_MASK_NONE,
120 .name = "timer" 119 .name = "timer"
121}; 120};
122 121
@@ -125,7 +124,6 @@ void __init hpet_time_init(void)
125 if (!hpet_enable()) 124 if (!hpet_enable())
126 setup_pit_timer(); 125 setup_pit_timer();
127 126
128 irq0.mask = cpumask_of_cpu(0);
129 setup_irq(0, &irq0); 127 setup_irq(0, &irq0);
130} 128}
131 129
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
deleted file mode 100644
index ce5054642247..000000000000
--- a/arch/x86/kernel/tlb_32.c
+++ /dev/null
@@ -1,256 +0,0 @@
1#include <linux/spinlock.h>
2#include <linux/cpu.h>
3#include <linux/interrupt.h>
4
5#include <asm/tlbflush.h>
6
7DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate)
8 ____cacheline_aligned = { &init_mm, 0, };
9
10/* must come after the send_IPI functions above for inlining */
11#include <mach_ipi.h>
12
13/*
14 * Smarter SMP flushing macros.
15 * c/o Linus Torvalds.
16 *
17 * These mean you can really definitely utterly forget about
18 * writing to user space from interrupts. (It's not allowed anyway.)
19 *
20 * Optimizations Manfred Spraul <manfred@colorfullife.com>
21 */
22
23static cpumask_t flush_cpumask;
24static struct mm_struct *flush_mm;
25static unsigned long flush_va;
26static DEFINE_SPINLOCK(tlbstate_lock);
27
28/*
29 * We cannot call mmdrop() because we are in interrupt context,
30 * instead update mm->cpu_vm_mask.
31 *
32 * We need to reload %cr3 since the page tables may be going
33 * away from under us..
34 */
35void leave_mm(int cpu)
36{
37 BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
38 cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
39 load_cr3(swapper_pg_dir);
40}
41EXPORT_SYMBOL_GPL(leave_mm);
42
43/*
44 *
45 * The flush IPI assumes that a thread switch happens in this order:
46 * [cpu0: the cpu that switches]
47 * 1) switch_mm() either 1a) or 1b)
48 * 1a) thread switch to a different mm
49 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
50 * Stop ipi delivery for the old mm. This is not synchronized with
51 * the other cpus, but smp_invalidate_interrupt ignores flush ipis
52 * for the wrong mm, and in the worst case we perform a superfluous
53 * tlb flush.
54 * 1a2) set cpu_tlbstate to TLBSTATE_OK
55 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
56 * was in lazy tlb mode.
57 * 1a3) update cpu_tlbstate[].active_mm
58 * Now cpu0 accepts tlb flushes for the new mm.
59 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
60 * Now the other cpus will send tlb flush ipis.
61 * 1a4) change cr3.
62 * 1b) thread switch without mm change
63 * cpu_tlbstate[].active_mm is correct, cpu0 already handles
64 * flush ipis.
65 * 1b1) set cpu_tlbstate to TLBSTATE_OK
66 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
67 * Atomically set the bit [other cpus will start sending flush ipis],
68 * and test the bit.
69 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
70 * 2) switch %%esp, ie current
71 *
72 * The interrupt must handle 2 special cases:
73 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
74 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
75 * runs in kernel space, the cpu could load tlb entries for user space
76 * pages.
77 *
78 * The good news is that cpu_tlbstate is local to each cpu, no
79 * write/read ordering problems.
80 */
81
82/*
83 * TLB flush IPI:
84 *
85 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
86 * 2) Leave the mm if we are in the lazy tlb mode.
87 */
88
89void smp_invalidate_interrupt(struct pt_regs *regs)
90{
91 unsigned long cpu;
92
93 cpu = get_cpu();
94
95 if (!cpu_isset(cpu, flush_cpumask))
96 goto out;
97 /*
98 * This was a BUG() but until someone can quote me the
99 * line from the intel manual that guarantees an IPI to
100 * multiple CPUs is retried _only_ on the erroring CPUs
101 * it's staying as a return
102 *
103 * BUG();
104 */
105
106 if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
107 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
108 if (flush_va == TLB_FLUSH_ALL)
109 local_flush_tlb();
110 else
111 __flush_tlb_one(flush_va);
112 } else
113 leave_mm(cpu);
114 }
115 ack_APIC_irq();
116 smp_mb__before_clear_bit();
117 cpu_clear(cpu, flush_cpumask);
118 smp_mb__after_clear_bit();
119out:
120 put_cpu_no_resched();
121 inc_irq_stat(irq_tlb_count);
122}
123
124void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
125 unsigned long va)
126{
127 cpumask_t cpumask = *cpumaskp;
128
129 /*
130 * A couple of (to be removed) sanity checks:
131 *
132 * - current CPU must not be in mask
133 * - mask must exist :)
134 */
135 BUG_ON(cpus_empty(cpumask));
136 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
137 BUG_ON(!mm);
138
139#ifdef CONFIG_HOTPLUG_CPU
140 /* If a CPU which we ran on has gone down, OK. */
141 cpus_and(cpumask, cpumask, cpu_online_map);
142 if (unlikely(cpus_empty(cpumask)))
143 return;
144#endif
145
146 /*
147 * I'm not happy about this global shared spinlock in the
148 * MM hot path, but we'll see how contended it is.
149 * AK: x86-64 has a faster method that could be ported.
150 */
151 spin_lock(&tlbstate_lock);
152
153 flush_mm = mm;
154 flush_va = va;
155 cpus_or(flush_cpumask, cpumask, flush_cpumask);
156
157 /*
158 * Make the above memory operations globally visible before
159 * sending the IPI.
160 */
161 smp_mb();
162 /*
163 * We have to send the IPI only to
164 * CPUs affected.
165 */
166 send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
167
168 while (!cpus_empty(flush_cpumask))
169 /* nothing. lockup detection does not belong here */
170 cpu_relax();
171
172 flush_mm = NULL;
173 flush_va = 0;
174 spin_unlock(&tlbstate_lock);
175}
176
177void flush_tlb_current_task(void)
178{
179 struct mm_struct *mm = current->mm;
180 cpumask_t cpu_mask;
181
182 preempt_disable();
183 cpu_mask = mm->cpu_vm_mask;
184 cpu_clear(smp_processor_id(), cpu_mask);
185
186 local_flush_tlb();
187 if (!cpus_empty(cpu_mask))
188 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
189 preempt_enable();
190}
191
192void flush_tlb_mm(struct mm_struct *mm)
193{
194 cpumask_t cpu_mask;
195
196 preempt_disable();
197 cpu_mask = mm->cpu_vm_mask;
198 cpu_clear(smp_processor_id(), cpu_mask);
199
200 if (current->active_mm == mm) {
201 if (current->mm)
202 local_flush_tlb();
203 else
204 leave_mm(smp_processor_id());
205 }
206 if (!cpus_empty(cpu_mask))
207 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
208
209 preempt_enable();
210}
211
212void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
213{
214 struct mm_struct *mm = vma->vm_mm;
215 cpumask_t cpu_mask;
216
217 preempt_disable();
218 cpu_mask = mm->cpu_vm_mask;
219 cpu_clear(smp_processor_id(), cpu_mask);
220
221 if (current->active_mm == mm) {
222 if (current->mm)
223 __flush_tlb_one(va);
224 else
225 leave_mm(smp_processor_id());
226 }
227
228 if (!cpus_empty(cpu_mask))
229 flush_tlb_others(cpu_mask, mm, va);
230
231 preempt_enable();
232}
233EXPORT_SYMBOL(flush_tlb_page);
234
235static void do_flush_tlb_all(void *info)
236{
237 unsigned long cpu = smp_processor_id();
238
239 __flush_tlb_all();
240 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
241 leave_mm(cpu);
242}
243
244void flush_tlb_all(void)
245{
246 on_each_cpu(do_flush_tlb_all, NULL, 1);
247}
248
249void reset_lazy_tlbstate(void)
250{
251 int cpu = raw_smp_processor_id();
252
253 per_cpu(cpu_tlbstate, cpu).state = 0;
254 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
255}
256
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
deleted file mode 100644
index f8be6f1d2e48..000000000000
--- a/arch/x86/kernel/tlb_64.c
+++ /dev/null
@@ -1,284 +0,0 @@
1#include <linux/init.h>
2
3#include <linux/mm.h>
4#include <linux/delay.h>
5#include <linux/spinlock.h>
6#include <linux/smp.h>
7#include <linux/kernel_stat.h>
8#include <linux/mc146818rtc.h>
9#include <linux/interrupt.h>
10
11#include <asm/mtrr.h>
12#include <asm/pgalloc.h>
13#include <asm/tlbflush.h>
14#include <asm/mmu_context.h>
15#include <asm/proto.h>
16#include <asm/apicdef.h>
17#include <asm/idle.h>
18#include <asm/uv/uv_hub.h>
19#include <asm/uv/uv_bau.h>
20
21#include <mach_ipi.h>
22/*
23 * Smarter SMP flushing macros.
24 * c/o Linus Torvalds.
25 *
26 * These mean you can really definitely utterly forget about
27 * writing to user space from interrupts. (It's not allowed anyway.)
28 *
29 * Optimizations by Manfred Spraul <manfred@colorfullife.com>
30 *
31 * More scalable flush, from Andi Kleen
32 *
33 * To avoid global state use 8 different call vectors.
34 * Each CPU uses a specific vector to trigger flushes on other
35 * CPUs. Depending on the received vector the target CPUs look into
36 * the right per cpu variable for the flush data.
37 *
38 * With more than 8 CPUs they are hashed to the 8 available
39 * vectors. The limited global vector space forces us to this right now.
40 * In future when interrupts are split into per CPU domains this could be
41 * fixed, at the cost of triggering multiple IPIs in some cases.
42 */
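The vector hashing described above can be seen in miniature below; a
sketch distilled from the sender and receiver code later in this file.
The helper names are invented for illustration, and the comment's value
of 8 is assumed for NUM_INVALIDATE_TLB_VECTORS:

/* Sender side: hash this CPU onto one of the 8 flush vectors. */
static int pick_flush_vector(int cpu)
{
	int sender = cpu % NUM_INVALIDATE_TLB_VECTORS;	/* 0..7 */
	return INVALIDATE_TLB_VECTOR_START + sender;
}

/* Receiver side: recover the sender slot from the vector that fired,
 * and with it the matching per-cpu flush_state instance. */
static union smp_flush_state *sender_state(struct pt_regs *regs)
{
	int sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
	return &per_cpu(flush_state, sender);
}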
43
44union smp_flush_state {
45 struct {
46 cpumask_t flush_cpumask;
47 struct mm_struct *flush_mm;
48 unsigned long flush_va;
49 spinlock_t tlbstate_lock;
50 };
51 char pad[SMP_CACHE_BYTES];
52} ____cacheline_aligned;
53
54/* State is put into the per CPU data section, but padded
55 to a full cache line because other CPUs can access it and we don't
56 want false sharing in the per cpu data segment. */
57static DEFINE_PER_CPU(union smp_flush_state, flush_state);
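Given the padding comment above, the layout could be pinned with a
compile-time check; a sketch (not in the patch) that assumes the struct
members fit within one cache line, which small-NR_CPUS configurations
satisfy:

static inline void smp_flush_state_size_check(void)
{
	/* sizeof() collapses to the pad unless the struct outgrows it */
	BUILD_BUG_ON(sizeof(union smp_flush_state) != SMP_CACHE_BYTES);
}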
58
59/*
60 * We cannot call mmdrop() because we are in interrupt context;
61 * instead we update mm->cpu_vm_mask.
62 */
63void leave_mm(int cpu)
64{
65 if (read_pda(mmu_state) == TLBSTATE_OK)
66 BUG();
67 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
68 load_cr3(swapper_pg_dir);
69}
70EXPORT_SYMBOL_GPL(leave_mm);
71
72/*
73 *
74 * The flush IPI assumes that a thread switch happens in this order:
75 * [cpu0: the cpu that switches]
76 * 1) switch_mm() either 1a) or 1b)
77 * 1a) thread switch to a different mm
78 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
79 * Stop ipi delivery for the old mm. This is not synchronized with
80 * the other cpus, but smp_invalidate_interrupt ignores flush ipis
81 * for the wrong mm, and in the worst case we perform a superfluous
82 * tlb flush.
83 * 1a2) set cpu mmu_state to TLBSTATE_OK
84 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
85 * was in lazy tlb mode.
86 * 1a3) update cpu active_mm
87 * Now cpu0 accepts tlb flushes for the new mm.
88 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
89 * Now the other cpus will send tlb flush ipis.
90 * 1a5) change cr3.
91 * 1b) thread switch without mm change
92 * cpu active_mm is correct, cpu0 already handles
93 * flush ipis.
94 * 1b1) set cpu mmu_state to TLBSTATE_OK
95 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
96 * Atomically set the bit [other cpus will start sending flush ipis],
97 * and test the bit.
98 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
99 * 2) switch %%esp, ie current
100 *
101 * The interrupt must handle 2 special cases:
102 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
103 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
104 * runs in kernel space, the cpu could load tlb entries for user space
105 * pages.
106 *
107 * The good news is that cpu mmu_state is local to each cpu, no
108 * write/read ordering problems.
109 */
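Case 1b) above compresses to a few lines; a sketch written against the
comment's own step numbers, using the cpumask and PDA accessors of this
era rather than the real switch_mm() body:

static void switch_mm_same_mm_sketch(struct mm_struct *mm, int cpu)
{
	write_pda(mmu_state, TLBSTATE_OK);		/* 1b1 */
	if (!cpu_test_and_set(cpu, mm->cpu_vm_mask))	/* 1b2 */
		local_flush_tlb();			/* 1b3: bit was 0 */
}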
110
111/*
112 * TLB flush IPI:
113 *
114 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
115 * 2) Leave the mm if we are in the lazy tlb mode.
116 *
117 * Interrupts are disabled.
118 */
119
120asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
121{
122 int cpu;
123 int sender;
124 union smp_flush_state *f;
125
126 cpu = smp_processor_id();
127 /*
128 * orig_rax contains the bitwise complement of the interrupt vector.
129 * Use that to determine where the sender put the data.
130 */
131 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
132 f = &per_cpu(flush_state, sender);
133
134 if (!cpu_isset(cpu, f->flush_cpumask))
135 goto out;
136 /*
137 * This was a BUG() but until someone can quote me the
138 * line from the Intel manual that guarantees an IPI to
139 * multiple CPUs is retried _only_ on the erroring CPUs,
140 * it's staying as a return.
141 *
142 * BUG();
143 */
144
145 if (f->flush_mm == read_pda(active_mm)) {
146 if (read_pda(mmu_state) == TLBSTATE_OK) {
147 if (f->flush_va == TLB_FLUSH_ALL)
148 local_flush_tlb();
149 else
150 __flush_tlb_one(f->flush_va);
151 } else
152 leave_mm(cpu);
153 }
154out:
155 ack_APIC_irq();
156 cpu_clear(cpu, f->flush_cpumask);
157 inc_irq_stat(irq_tlb_count);
158}
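A worked instance of the sender recovery above, with an illustrative
slot number:

/*
 * Example: the sender hashed to slot 3, so the IPI arrived on vector
 * INVALIDATE_TLB_VECTOR_START + 3. The entry stub stores the bitwise
 * complement of the vector in orig_ax, hence
 *
 *	~orig_ax = INVALIDATE_TLB_VECTOR_START + 3
 *	sender   = ~orig_ax - INVALIDATE_TLB_VECTOR_START = 3
 *
 * which selects per_cpu(flush_state, 3), the instance the sender filled.
 */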
159
160void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
161 unsigned long va)
162{
163 int sender;
164 union smp_flush_state *f;
165 cpumask_t cpumask = *cpumaskp;
166
167 if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va))
168 return;
169
170 /* Caller has disabled preemption */
171 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
172 f = &per_cpu(flush_state, sender);
173
174 /*
175 * Could avoid this lock when
176 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
177 * probably not worth checking this for a cache-hot lock.
178 */
179 spin_lock(&f->tlbstate_lock);
180
181 f->flush_mm = mm;
182 f->flush_va = va;
183 cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
184
185 /*
186 * Make the above memory operations globally visible before
187 * sending the IPI.
188 */
189 smp_mb();
190 /*
191 * We have to send the IPI only to
192 * the affected CPUs.
193 */
194 send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender);
195
196 while (!cpus_empty(f->flush_cpumask))
197 cpu_relax();
198
199 f->flush_mm = NULL;
200 f->flush_va = 0;
201 spin_unlock(&f->tlbstate_lock);
202}
203
204static int __cpuinit init_smp_flush(void)
205{
206 int i;
207
208 for_each_possible_cpu(i)
209 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
210
211 return 0;
212}
213core_initcall(init_smp_flush);
214
215void flush_tlb_current_task(void)
216{
217 struct mm_struct *mm = current->mm;
218 cpumask_t cpu_mask;
219
220 preempt_disable();
221 cpu_mask = mm->cpu_vm_mask;
222 cpu_clear(smp_processor_id(), cpu_mask);
223
224 local_flush_tlb();
225 if (!cpus_empty(cpu_mask))
226 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
227 preempt_enable();
228}
229
230void flush_tlb_mm(struct mm_struct *mm)
231{
232 cpumask_t cpu_mask;
233
234 preempt_disable();
235 cpu_mask = mm->cpu_vm_mask;
236 cpu_clear(smp_processor_id(), cpu_mask);
237
238 if (current->active_mm == mm) {
239 if (current->mm)
240 local_flush_tlb();
241 else
242 leave_mm(smp_processor_id());
243 }
244 if (!cpus_empty(cpu_mask))
245 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
246
247 preempt_enable();
248}
249
250void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
251{
252 struct mm_struct *mm = vma->vm_mm;
253 cpumask_t cpu_mask;
254
255 preempt_disable();
256 cpu_mask = mm->cpu_vm_mask;
257 cpu_clear(smp_processor_id(), cpu_mask);
258
259 if (current->active_mm == mm) {
260 if (current->mm)
261 __flush_tlb_one(va);
262 else
263 leave_mm(smp_processor_id());
264 }
265
266 if (!cpus_empty(cpu_mask))
267 flush_tlb_others(cpu_mask, mm, va);
268
269 preempt_enable();
270}
271
272static void do_flush_tlb_all(void *info)
273{
274 unsigned long cpu = smp_processor_id();
275
276 __flush_tlb_all();
277 if (read_pda(mmu_state) == TLBSTATE_LAZY)
278 leave_mm(cpu);
279}
280
281void flush_tlb_all(void)
282{
283 on_each_cpu(do_flush_tlb_all, NULL, 1);
284}
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 6812b829ed83..deb5ebb32c3b 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -11,16 +11,15 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12 12
13#include <asm/mmu_context.h> 13#include <asm/mmu_context.h>
14#include <asm/uv/uv.h>
14#include <asm/uv/uv_mmrs.h> 15#include <asm/uv/uv_mmrs.h>
15#include <asm/uv/uv_hub.h> 16#include <asm/uv/uv_hub.h>
16#include <asm/uv/uv_bau.h> 17#include <asm/uv/uv_bau.h>
17#include <asm/genapic.h> 18#include <asm/apic.h>
18#include <asm/idle.h> 19#include <asm/idle.h>
19#include <asm/tsc.h> 20#include <asm/tsc.h>
20#include <asm/irq_vectors.h> 21#include <asm/irq_vectors.h>
21 22
22#include <mach_apic.h>
23
24static struct bau_control **uv_bau_table_bases __read_mostly; 23static struct bau_control **uv_bau_table_bases __read_mostly;
25static int uv_bau_retry_limit __read_mostly; 24static int uv_bau_retry_limit __read_mostly;
26 25
@@ -210,14 +209,15 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
210 * 209 *
211 * Send a broadcast and wait for a broadcast message to complete. 210 * Send a broadcast and wait for a broadcast message to complete.
212 * 211 *
213 * The cpumaskp mask contains the cpus the broadcast was sent to. 212 * The flush_mask contains the cpus the broadcast was sent to.
214 * 213 *
215 * Returns 1 if all remote flushing was done. The mask is zeroed. 214 * Returns NULL if all remote flushing was done. The mask is zeroed.
216 * Returns 0 if some remote flushing remains to be done. The mask is left 215 * Returns @flush_mask if some remote flushing remains to be done. The
217 * unchanged. 216 * mask will have some bits still set.
218 */ 217 */
219int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc, 218const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
220 cpumask_t *cpumaskp) 219 struct bau_desc *bau_desc,
220 struct cpumask *flush_mask)
221{ 221{
222 int completion_status = 0; 222 int completion_status = 0;
223 int right_shift; 223 int right_shift;
@@ -257,66 +257,75 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
257 * the cpu's, all of which are still in the mask. 257 * the cpu's, all of which are still in the mask.
258 */ 258 */
259 __get_cpu_var(ptcstats).ptc_i++; 259 __get_cpu_var(ptcstats).ptc_i++;
260 return 0; 260 return flush_mask;
261 } 261 }
262 262
263 /* 263 /*
264 * Success, so clear the remote cpu's from the mask so we don't 264 * Success, so clear the remote cpu's from the mask so we don't
265 * use the IPI method of shootdown on them. 265 * use the IPI method of shootdown on them.
266 */ 266 */
267 for_each_cpu_mask(bit, *cpumaskp) { 267 for_each_cpu(bit, flush_mask) {
268 blade = uv_cpu_to_blade_id(bit); 268 blade = uv_cpu_to_blade_id(bit);
269 if (blade == this_blade) 269 if (blade == this_blade)
270 continue; 270 continue;
271 cpu_clear(bit, *cpumaskp); 271 cpumask_clear_cpu(bit, flush_mask);
272 } 272 }
273 if (!cpus_empty(*cpumaskp)) 273 if (!cpumask_empty(flush_mask))
274 return 0; 274 return flush_mask;
275 return 1; 275 return NULL;
276} 276}
277 277
278static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
279
278/** 280/**
279 * uv_flush_tlb_others - globally purge translation cache of a virtual 281 * uv_flush_tlb_others - globally purge translation cache of a virtual
280 * address or all TLB's 282 * address or all TLB's
281 * @cpumaskp: mask of all cpu's in which the address is to be removed 283 * @cpumask: mask of all cpu's in which the address is to be removed
282 * @mm: mm_struct containing virtual address range 284 * @mm: mm_struct containing virtual address range
283 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) 285 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
286 * @cpu: the current cpu
284 * 287 *
285 * This is the entry point for initiating any UV global TLB shootdown. 288 * This is the entry point for initiating any UV global TLB shootdown.
286 * 289 *
287 * Purges the translation caches of all specified processors of the given 290 * Purges the translation caches of all specified processors of the given
288 * virtual address, or purges all TLB's on specified processors. 291 * virtual address, or purges all TLB's on specified processors.
289 * 292 *
290 * The caller has derived the cpumaskp from the mm_struct and has subtracted 293 * The caller has derived the cpumask from the mm_struct. This function
291 * the local cpu from the mask. This function is called only if there 294 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
292 * are bits set in the mask. (e.g. flush_tlb_page())
293 * 295 *
294 * The cpumaskp is converted into a nodemask of the nodes containing 296 * The cpumask is converted into a nodemask of the nodes containing
295 * the cpus. 297 * the cpus.
296 * 298 *
297 * Returns 1 if all remote flushing was done. 299 * Note that this function should be called with preemption disabled.
298 * Returns 0 if some remote flushing remains to be done. 300 *
301 * Returns NULL if all remote flushing was done.
302 * Returns pointer to cpumask if some remote flushing remains to be
303 * done. The returned pointer is valid till preemption is re-enabled.
299 */ 304 */
300int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, 305const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
301 unsigned long va) 306 struct mm_struct *mm,
307 unsigned long va, unsigned int cpu)
302{ 308{
309 struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask);
303 int i; 310 int i;
304 int bit; 311 int bit;
305 int blade; 312 int blade;
306 int cpu; 313 int uv_cpu;
307 int this_blade; 314 int this_blade;
308 int locals = 0; 315 int locals = 0;
309 struct bau_desc *bau_desc; 316 struct bau_desc *bau_desc;
310 317
311 cpu = uv_blade_processor_id(); 318 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
319
320 uv_cpu = uv_blade_processor_id();
312 this_blade = uv_numa_blade_id(); 321 this_blade = uv_numa_blade_id();
313 bau_desc = __get_cpu_var(bau_control).descriptor_base; 322 bau_desc = __get_cpu_var(bau_control).descriptor_base;
314 bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu; 323 bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
315 324
316 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 325 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
317 326
318 i = 0; 327 i = 0;
319 for_each_cpu_mask(bit, *cpumaskp) { 328 for_each_cpu(bit, flush_mask) {
320 blade = uv_cpu_to_blade_id(bit); 329 blade = uv_cpu_to_blade_id(bit);
321 BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1)); 330 BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
322 if (blade == this_blade) { 331 if (blade == this_blade) {
@@ -331,17 +340,17 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
331 * no off_node flushing; return status for local node 340 * no off_node flushing; return status for local node
332 */ 341 */
333 if (locals) 342 if (locals)
334 return 0; 343 return flush_mask;
335 else 344 else
336 return 1; 345 return NULL;
337 } 346 }
338 __get_cpu_var(ptcstats).requestor++; 347 __get_cpu_var(ptcstats).requestor++;
339 __get_cpu_var(ptcstats).ntargeted += i; 348 __get_cpu_var(ptcstats).ntargeted += i;
340 349
341 bau_desc->payload.address = va; 350 bau_desc->payload.address = va;
342 bau_desc->payload.sending_cpu = smp_processor_id(); 351 bau_desc->payload.sending_cpu = cpu;
343 352
344 return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp); 353 return uv_flush_send_and_wait(uv_cpu, this_blade, bau_desc, flush_mask);
345} 354}
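The NULL-versus-mask return convention documented above leaves the
fallback decision to the caller; a hedged sketch of the intended call
site, where flush_others_by_ipi() is a hypothetical stand-in for the
IPI-based shootdown path:

	const struct cpumask *remaining;

	remaining = uv_flush_tlb_others(cpumask, mm, va, smp_processor_id());
	if (remaining)
		/* the BAU could not reach these cpus */
		flush_others_by_ipi(remaining, mm, va);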
346 355
347/* 356/*
@@ -742,16 +751,21 @@ static int __init uv_bau_init(void)
742 int node; 751 int node;
743 int nblades; 752 int nblades;
744 int last_blade; 753 int last_blade;
745 int cur_cpu = 0; 754 int cur_cpu;
746 755
747 if (!is_uv_system()) 756 if (!is_uv_system())
748 return 0; 757 return 0;
749 758
759 for_each_possible_cpu(cur_cpu)
760 alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
761 GFP_KERNEL, cpu_to_node(cur_cpu));
762
750 uv_bau_retry_limit = 1; 763 uv_bau_retry_limit = 1;
751 uv_nshift = uv_hub_info->n_val; 764 uv_nshift = uv_hub_info->n_val;
752 uv_mmask = (1UL << uv_hub_info->n_val) - 1; 765 uv_mmask = (1UL << uv_hub_info->n_val) - 1;
753 nblades = 0; 766 nblades = 0;
754 last_blade = -1; 767 last_blade = -1;
768 cur_cpu = 0;
755 for_each_online_node(node) { 769 for_each_online_node(node) {
756 blade = uv_node_to_blade_id(node); 770 blade = uv_node_to_blade_id(node);
757 if (blade == last_blade) 771 if (blade == last_blade)
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 0fcc95a354f7..7e4515957a1c 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -25,10 +25,10 @@
25 * 25 *
26 * Send feedback to <colpatch@us.ibm.com> 26 * Send feedback to <colpatch@us.ibm.com>
27 */ 27 */
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/nodemask.h> 28#include <linux/nodemask.h>
31#include <linux/mmzone.h> 29#include <linux/mmzone.h>
30#include <linux/init.h>
31#include <linux/smp.h>
32#include <asm/cpu.h> 32#include <asm/cpu.h>
33 33
34static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); 34static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
@@ -47,6 +47,7 @@ int __ref arch_register_cpu(int num)
47 */ 47 */
48 if (num) 48 if (num)
49 per_cpu(cpu_devices, num).cpu.hotpluggable = 1; 49 per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
50
50 return register_cpu(&per_cpu(cpu_devices, num).cpu, num); 51 return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
51} 52}
52EXPORT_SYMBOL(arch_register_cpu); 53EXPORT_SYMBOL(arch_register_cpu);
@@ -56,12 +57,13 @@ void arch_unregister_cpu(int num)
56 unregister_cpu(&per_cpu(cpu_devices, num).cpu); 57 unregister_cpu(&per_cpu(cpu_devices, num).cpu);
57} 58}
58EXPORT_SYMBOL(arch_unregister_cpu); 59EXPORT_SYMBOL(arch_unregister_cpu);
59#else 60#else /* CONFIG_HOTPLUG_CPU */
61
60static int __init arch_register_cpu(int num) 62static int __init arch_register_cpu(int num)
61{ 63{
62 return register_cpu(&per_cpu(cpu_devices, num).cpu, num); 64 return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
63} 65}
64#endif /*CONFIG_HOTPLUG_CPU*/ 66#endif /* CONFIG_HOTPLUG_CPU */
65 67
66static int __init topology_init(void) 68static int __init topology_init(void)
67{ 69{
@@ -70,11 +72,11 @@ static int __init topology_init(void)
70#ifdef CONFIG_NUMA 72#ifdef CONFIG_NUMA
71 for_each_online_node(i) 73 for_each_online_node(i)
72 register_one_node(i); 74 register_one_node(i);
73#endif /* CONFIG_NUMA */ 75#endif
74 76
75 for_each_present_cpu(i) 77 for_each_present_cpu(i)
76 arch_register_cpu(i); 78 arch_register_cpu(i);
79
77 return 0; 80 return 0;
78} 81}
79
80subsys_initcall(topology_init); 82subsys_initcall(topology_init);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index d8ccc3c6552f..66d874e5404c 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -29,7 +29,7 @@
29 29
30#include <linux/linkage.h> 30#include <linux/linkage.h>
31#include <asm/segment.h> 31#include <asm/segment.h>
32#include <asm/page.h> 32#include <asm/page_types.h>
33 33
34/* We can free up trampoline after bootup if cpu hotplug is not supported. */ 34/* We can free up trampoline after bootup if cpu hotplug is not supported. */
35#ifndef CONFIG_HOTPLUG_CPU 35#ifndef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 894293c598db..cddfb8d386b9 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -25,10 +25,11 @@
25 */ 25 */
26 26
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <asm/pgtable.h> 28#include <asm/pgtable_types.h>
29#include <asm/page.h> 29#include <asm/page_types.h>
30#include <asm/msr.h> 30#include <asm/msr.h>
31#include <asm/segment.h> 31#include <asm/segment.h>
32#include <asm/processor-flags.h>
32 33
33.section .rodata, "a", @progbits 34.section .rodata, "a", @progbits
34 35
@@ -37,7 +38,7 @@
37ENTRY(trampoline_data) 38ENTRY(trampoline_data)
38r_base = . 39r_base = .
39 cli # We should be safe anyway 40 cli # We should be safe anyway
40 wbinvd 41 wbinvd
41 mov %cs, %ax # Code and data in the same place 42 mov %cs, %ax # Code and data in the same place
42 mov %ax, %ds 43 mov %ax, %ds
43 mov %ax, %es 44 mov %ax, %es
@@ -73,9 +74,8 @@ r_base = .
73 lidtl tidt - r_base # load idt with 0, 0 74 lidtl tidt - r_base # load idt with 0, 0
74 lgdtl tgdt - r_base # load gdt with whatever is appropriate 75 lgdtl tgdt - r_base # load gdt with whatever is appropriate
75 76
76 xor %ax, %ax 77 mov $X86_CR0_PE, %ax # protected mode (PE) bit
77 inc %ax # protected mode (PE) bit 78 lmsw %ax # into protected mode
78 lmsw %ax # into protected mode
79 79
80 # flush prefetch and jump to startup_32 80 # flush prefetch and jump to startup_32
81 ljmpl *(startup_32_vector - r_base) 81 ljmpl *(startup_32_vector - r_base)
@@ -86,9 +86,8 @@ startup_32:
86 movl $__KERNEL_DS, %eax # Initialize the %ds segment register 86 movl $__KERNEL_DS, %eax # Initialize the %ds segment register
87 movl %eax, %ds 87 movl %eax, %ds
88 88
89 xorl %eax, %eax 89 movl $X86_CR4_PAE, %eax
90 btsl $5, %eax # Enable PAE mode 90 movl %eax, %cr4 # Enable PAE mode
91 movl %eax, %cr4
92 91
93 # Setup trampoline 4 level pagetables 92 # Setup trampoline 4 level pagetables
94 leal (trampoline_level4_pgt - r_base)(%esi), %eax 93 leal (trampoline_level4_pgt - r_base)(%esi), %eax
@@ -99,9 +98,9 @@ startup_32:
99 xorl %edx, %edx 98 xorl %edx, %edx
100 wrmsr 99 wrmsr
101 100
102 xorl %eax, %eax 101 # Enable paging and in turn activate Long Mode
103 btsl $31, %eax # Enable paging and in turn activate Long Mode 102 # Enable protected mode
104 btsl $0, %eax # Enable protected mode 103 movl $(X86_CR0_PG | X86_CR0_PE), %eax
105 movl %eax, %cr0 104 movl %eax, %cr0
106 105
107 /* 106 /*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7932338d7cb3..a1d288327ff0 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -54,15 +54,14 @@
54#include <asm/desc.h> 54#include <asm/desc.h>
55#include <asm/i387.h> 55#include <asm/i387.h>
56 56
57#include <mach_traps.h> 57#include <asm/mach_traps.h>
58 58
59#ifdef CONFIG_X86_64 59#ifdef CONFIG_X86_64
60#include <asm/pgalloc.h> 60#include <asm/pgalloc.h>
61#include <asm/proto.h> 61#include <asm/proto.h>
62#include <asm/pda.h>
63#else 62#else
64#include <asm/processor-flags.h> 63#include <asm/processor-flags.h>
65#include <asm/arch_hooks.h> 64#include <asm/setup.h>
66#include <asm/traps.h> 65#include <asm/traps.h>
67 66
68#include "cpu/mcheck/mce.h" 67#include "cpu/mcheck/mce.h"
@@ -99,6 +98,12 @@ static inline void preempt_conditional_sti(struct pt_regs *regs)
99 local_irq_enable(); 98 local_irq_enable();
100} 99}
101 100
101static inline void conditional_cli(struct pt_regs *regs)
102{
103 if (regs->flags & X86_EFLAGS_IF)
104 local_irq_disable();
105}
106
102static inline void preempt_conditional_cli(struct pt_regs *regs) 107static inline void preempt_conditional_cli(struct pt_regs *regs)
103{ 108{
104 if (regs->flags & X86_EFLAGS_IF) 109 if (regs->flags & X86_EFLAGS_IF)
@@ -113,47 +118,6 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err)
113 if (!user_mode_vm(regs)) 118 if (!user_mode_vm(regs))
114 die(str, regs, err); 119 die(str, regs, err);
115} 120}
116
117/*
118 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
119 * invalid offset set (the LAZY one) and the faulting thread has
120 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS,
121 * we set the offset field correctly and return 1.
122 */
123static int lazy_iobitmap_copy(void)
124{
125 struct thread_struct *thread;
126 struct tss_struct *tss;
127 int cpu;
128
129 cpu = get_cpu();
130 tss = &per_cpu(init_tss, cpu);
131 thread = &current->thread;
132
133 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
134 thread->io_bitmap_ptr) {
135 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
136 thread->io_bitmap_max);
137 /*
138 * If the previously set map was extending to higher ports
139 * than the current one, pad extra space with 0xff (no access).
140 */
141 if (thread->io_bitmap_max < tss->io_bitmap_max) {
142 memset((char *) tss->io_bitmap +
143 thread->io_bitmap_max, 0xff,
144 tss->io_bitmap_max - thread->io_bitmap_max);
145 }
146 tss->io_bitmap_max = thread->io_bitmap_max;
147 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
148 tss->io_bitmap_owner = thread;
149 put_cpu();
150
151 return 1;
152 }
153 put_cpu();
154
155 return 0;
156}
157#endif 121#endif
158 122
159static void __kprobes 123static void __kprobes
@@ -304,11 +268,6 @@ do_general_protection(struct pt_regs *regs, long error_code)
304 conditional_sti(regs); 268 conditional_sti(regs);
305 269
306#ifdef CONFIG_X86_32 270#ifdef CONFIG_X86_32
307 if (lazy_iobitmap_copy()) {
308 /* restart the faulting instruction */
309 return;
310 }
311
312 if (regs->flags & X86_VM_MASK) 271 if (regs->flags & X86_VM_MASK)
313 goto gp_in_vm86; 272 goto gp_in_vm86;
314#endif 273#endif
@@ -626,8 +585,10 @@ clear_dr7:
626 585
627#ifdef CONFIG_X86_32 586#ifdef CONFIG_X86_32
628debug_vm86: 587debug_vm86:
588 /* reenable preemption: handle_vm86_trap() might sleep */
589 dec_preempt_count();
629 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); 590 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
630 preempt_conditional_cli(regs); 591 conditional_cli(regs);
631 return; 592 return;
632#endif 593#endif
633 594
@@ -906,19 +867,20 @@ void math_emulate(struct math_emu_info *info)
906} 867}
907#endif /* CONFIG_MATH_EMULATION */ 868#endif /* CONFIG_MATH_EMULATION */
908 869
909dotraplinkage void __kprobes do_device_not_available(struct pt_regs regs) 870dotraplinkage void __kprobes
871do_device_not_available(struct pt_regs *regs, long error_code)
910{ 872{
911#ifdef CONFIG_X86_32 873#ifdef CONFIG_X86_32
912 if (read_cr0() & X86_CR0_EM) { 874 if (read_cr0() & X86_CR0_EM) {
913 struct math_emu_info info = { }; 875 struct math_emu_info info = { };
914 876
915 conditional_sti(&regs); 877 conditional_sti(regs);
916 878
917 info.regs = &regs; 879 info.regs = regs;
918 math_emulate(&info); 880 math_emulate(&info);
919 } else { 881 } else {
920 math_state_restore(); /* interrupts still off */ 882 math_state_restore(); /* interrupts still off */
921 conditional_sti(&regs); 883 conditional_sti(regs);
922 } 884 }
923#else 885#else
924 math_state_restore(); 886 math_state_restore();
@@ -934,7 +896,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
934 info.si_signo = SIGILL; 896 info.si_signo = SIGILL;
935 info.si_errno = 0; 897 info.si_errno = 0;
936 info.si_code = ILL_BADSTK; 898 info.si_code = ILL_BADSTK;
937 info.si_addr = 0; 899 info.si_addr = NULL;
938 if (notify_die(DIE_TRAP, "iret exception", 900 if (notify_die(DIE_TRAP, "iret exception",
939 regs, error_code, 32, SIGILL) == NOTIFY_STOP) 901 regs, error_code, 32, SIGILL) == NOTIFY_STOP)
940 return; 902 return;
@@ -1018,6 +980,6 @@ void __init trap_init(void)
1018 cpu_init(); 980 cpu_init();
1019 981
1020#ifdef CONFIG_X86_32 982#ifdef CONFIG_X86_32
1021 trap_init_hook(); 983 x86_quirk_trap_init();
1022#endif 984#endif
1023} 985}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 599e58168631..7a567ebe6361 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -17,20 +17,21 @@
17#include <asm/delay.h> 17#include <asm/delay.h>
18#include <asm/hypervisor.h> 18#include <asm/hypervisor.h>
19 19
20unsigned int cpu_khz; /* TSC clocks / usec, not used here */ 20unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
21EXPORT_SYMBOL(cpu_khz); 21EXPORT_SYMBOL(cpu_khz);
22unsigned int tsc_khz; 22
23unsigned int __read_mostly tsc_khz;
23EXPORT_SYMBOL(tsc_khz); 24EXPORT_SYMBOL(tsc_khz);
24 25
25/* 26/*
26 * TSC can be unstable due to cpufreq or due to unsynced TSCs 27 * TSC can be unstable due to cpufreq or due to unsynced TSCs
27 */ 28 */
28static int tsc_unstable; 29static int __read_mostly tsc_unstable;
29 30
30/* native_sched_clock() is called before tsc_init(), so 31/* native_sched_clock() is called before tsc_init(), so
31 we must start with the TSC soft disabled to prevent 32 we must start with the TSC soft disabled to prevent
32 erroneous rdtsc usage on !cpu_has_tsc processors */ 33 erroneous rdtsc usage on !cpu_has_tsc processors */
33static int tsc_disabled = -1; 34static int __read_mostly tsc_disabled = -1;
34 35
35static int tsc_clocksource_reliable; 36static int tsc_clocksource_reliable;
36/* 37/*
@@ -273,30 +274,43 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
273 * use the TSC value at the transitions to calculate a pretty 274 * use the TSC value at the transitions to calculate a pretty
274 * good value for the TSC frequency. 275 * good value for the TSC frequency.
275 */ 276 */
276static inline int pit_expect_msb(unsigned char val) 277static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
277{ 278{
278 int count = 0; 279 int count;
280 u64 tsc = 0;
279 281
280 for (count = 0; count < 50000; count++) { 282 for (count = 0; count < 50000; count++) {
281 /* Ignore LSB */ 283 /* Ignore LSB */
282 inb(0x42); 284 inb(0x42);
283 if (inb(0x42) != val) 285 if (inb(0x42) != val)
284 break; 286 break;
287 tsc = get_cycles();
285 } 288 }
286 return count > 50; 289 *deltap = get_cycles() - tsc;
290 *tscp = tsc;
291
292 /*
293 * We require _some_ success, but the quality control
294 * will be based on the error terms on the TSC values.
295 */
296 return count > 5;
287} 297}
288 298
289/* 299/*
290 * How many MSB values do we want to see? We aim for a 300 * How many MSB values do we want to see? We aim for
291 * 15ms calibration, which assuming a 2us counter read 301 * a maximum error rate of 500ppm (in practice the
292 * error should give us roughly 150 ppm precision for 302 * real error is much smaller), but refuse to spend
293 * the calibration. 303 * more than 25ms on it.
294 */ 304 */
295#define QUICK_PIT_MS 15 305#define MAX_QUICK_PIT_MS 25
296#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) 306#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
297 307
298static unsigned long quick_pit_calibrate(void) 308static unsigned long quick_pit_calibrate(void)
299{ 309{
310 int i;
311 u64 tsc, delta;
312 unsigned long d1, d2;
313
300 /* Set the Gate high, disable speaker */ 314 /* Set the Gate high, disable speaker */
301 outb((inb(0x61) & ~0x02) | 0x01, 0x61); 315 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
302 316
@@ -315,45 +329,52 @@ static unsigned long quick_pit_calibrate(void)
315 outb(0xff, 0x42); 329 outb(0xff, 0x42);
316 outb(0xff, 0x42); 330 outb(0xff, 0x42);
317 331
318 if (pit_expect_msb(0xff)) { 332 /*
319 int i; 333 * The PIT starts counting at the next edge, so we
320 u64 t1, t2, delta; 334 * need to delay for a microsecond. The easiest way
321 unsigned char expect = 0xfe; 335 * to do that is to just read back the 16-bit counter
322 336 * once from the PIT.
323 t1 = get_cycles(); 337 */
324 for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) { 338 inb(0x42);
325 if (!pit_expect_msb(expect)) 339 inb(0x42);
326 goto failed; 340
341 if (pit_expect_msb(0xff, &tsc, &d1)) {
342 for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
343 if (!pit_expect_msb(0xff-i, &delta, &d2))
344 break;
345
346 /*
347 * Iterate until the error is less than 500 ppm
348 */
349 delta -= tsc;
350 if (d1+d2 < delta >> 11)
351 goto success;
327 } 352 }
328 t2 = get_cycles();
329
330 /*
331 * Make sure we can rely on the second TSC timestamp:
332 */
333 if (!pit_expect_msb(expect))
334 goto failed;
335
336 /*
337 * Ok, if we get here, then we've seen the
338 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
339 * times, and each MSB had many hits, so we never
340 * had any sudden jumps.
341 *
342 * As a result, we can depend on there not being
343 * any odd delays anywhere, and the TSC reads are
344 * reliable.
345 *
346 * kHz = ticks / time-in-seconds / 1000;
347 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
348 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
349 */
350 delta = (t2 - t1)*PIT_TICK_RATE;
351 do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
352 printk("Fast TSC calibration using PIT\n");
353 return delta;
354 } 353 }
355failed: 354 printk("Fast TSC calibration failed\n");
356 return 0; 355 return 0;
356
357success:
358 /*
359 * Ok, if we get here, then we've seen the
360 * MSB of the PIT decrement 'i' times, and the
361 * error has shrunk to less than 500 ppm.
362 *
363 * As a result, we can depend on there not being
364 * any odd delays anywhere, and the TSC reads are
365 * reliable (within the error). We also adjust the
366 * delta to the middle of the error bars, just
367 * because it looks nicer.
368 *
369 * kHz = ticks / time-in-seconds / 1000;
370 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
371 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
372 */
373 delta += (long)(d2 - d1)/2;
374 delta *= PIT_TICK_RATE;
375 do_div(delta, i*256*1000);
376 printk("Fast TSC calibration using PIT\n");
377 return delta;
357} 378}
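A worked pass through the formula and the 500 ppm gate above, with
invented numbers:

/*
 * Suppose the loop saw the PIT MSB step i = 70 times (70 * 256 = 17920
 * PIT ticks, i.e. 17920 / 1193182 s ~= 15.0 ms) while the TSC advanced
 * delta = 30,000,000 cycles:
 *
 *	kHz = delta * PIT_TICK_RATE / (i * 256 * 1000)
 *	    = 30000000 * 1193182 / (70 * 256 * 1000) ~= 1997514
 *
 * i.e. a ~2.0 GHz TSC. The acceptance test d1+d2 < delta >> 11 bounds
 * the combined read uncertainty to delta/2048, roughly 490 ppm.
 */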
358 379
359/** 380/**
@@ -523,8 +544,6 @@ unsigned long native_calibrate_tsc(void)
523 return tsc_pit_min; 544 return tsc_pit_min;
524} 545}
525 546
526#ifdef CONFIG_X86_32
527/* Only called from the Powernow K7 cpu freq driver */
528int recalibrate_cpu_khz(void) 547int recalibrate_cpu_khz(void)
529{ 548{
530#ifndef CONFIG_SMP 549#ifndef CONFIG_SMP
@@ -546,7 +565,6 @@ int recalibrate_cpu_khz(void)
546 565
547EXPORT_SYMBOL(recalibrate_cpu_khz); 566EXPORT_SYMBOL(recalibrate_cpu_khz);
548 567
549#endif /* CONFIG_X86_32 */
550 568
551/* Accelerators for sched_clock() 569/* Accelerators for sched_clock()
552 * convert from cycles(64bits) => nanoseconds (64bits) 570 * convert from cycles(64bits) => nanoseconds (64bits)
@@ -773,7 +791,7 @@ __cpuinit int unsynchronized_tsc(void)
773 if (!cpu_has_tsc || tsc_unstable) 791 if (!cpu_has_tsc || tsc_unstable)
774 return 1; 792 return 1;
775 793
776#ifdef CONFIG_X86_SMP 794#ifdef CONFIG_SMP
777 if (apic_is_clustered_box()) 795 if (apic_is_clustered_box())
778 return 1; 796 return 1;
779#endif 797#endif
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
new file mode 100644
index 000000000000..2ffb6c53326e
--- /dev/null
+++ b/arch/x86/kernel/uv_time.c
@@ -0,0 +1,393 @@
1/*
2 * SGI RTC clock/timer routines.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Dimitri Sivanich
20 */
21#include <linux/clockchips.h>
22
23#include <asm/uv/uv_mmrs.h>
24#include <asm/uv/uv_hub.h>
25#include <asm/uv/bios.h>
26#include <asm/uv/uv.h>
27#include <asm/apic.h>
28#include <asm/cpu.h>
29
30#define RTC_NAME "sgi_rtc"
31
32static cycle_t uv_read_rtc(void);
33static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
34static void uv_rtc_timer_setup(enum clock_event_mode,
35 struct clock_event_device *);
36
37static struct clocksource clocksource_uv = {
38 .name = RTC_NAME,
39 .rating = 400,
40 .read = uv_read_rtc,
41 .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
42 .shift = 10,
43 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
44};
45
46static struct clock_event_device clock_event_device_uv = {
47 .name = RTC_NAME,
48 .features = CLOCK_EVT_FEAT_ONESHOT,
49 .shift = 20,
50 .rating = 400,
51 .irq = -1,
52 .set_next_event = uv_rtc_next_event,
53 .set_mode = uv_rtc_timer_setup,
54 .event_handler = NULL,
55};
56
57static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
58
59/* There is one of these allocated per node */
60struct uv_rtc_timer_head {
61 spinlock_t lock;
62 /* next cpu waiting for timer, local node relative: */
63 int next_cpu;
64 /* number of cpus on this node: */
65 int ncpus;
66 struct {
67 int lcpu; /* systemwide logical cpu number */
68 u64 expires; /* next timer expiration for this cpu */
69 } cpu[1];
70};
71
72/*
73 * Access to uv_rtc_timer_head via blade id.
74 */
75static struct uv_rtc_timer_head **blade_info __read_mostly;
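The addressing pattern used throughout this file, condensed into one
helper; a sketch only, built from the identifiers in the code below:

/* Map a systemwide cpu to its node's timer head and to its blade-local
 * slot within head->cpu[]. */
static struct uv_rtc_timer_head *uv_rtc_cpu_head(int cpu, int *bcpu)
{
	*bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
	return blade_info[uv_cpu_to_blade_id(cpu)];
}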
76
77static int uv_rtc_enable;
78
79/*
80 * Hardware interface routines
81 */
82
83/* Send IPIs to another node */
84static void uv_rtc_send_IPI(int cpu)
85{
86 unsigned long apicid, val;
87 int pnode;
88
89 apicid = cpu_physical_id(cpu);
90 pnode = uv_apicid_to_pnode(apicid);
91 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
93 (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
94
95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
96}
97
98/* Check for an RTC interrupt pending */
99static int uv_intr_pending(int pnode)
100{
101 return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
102 UVH_EVENT_OCCURRED0_RTC1_MASK;
103}
104
105/* Setup interrupt and return non-zero if early expiration occurred. */
106static int uv_setup_intr(int cpu, u64 expires)
107{
108 u64 val;
109 int pnode = uv_cpu_to_pnode(cpu);
110
111 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
112 UVH_RTC1_INT_CONFIG_M_MASK);
113 uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
114
115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
116 UVH_EVENT_OCCURRED0_RTC1_MASK);
117
118 val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
120
121 /* Set configuration */
122 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
123 /* Initialize comparator value */
124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
125
126 return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
127}
128
129/*
130 * Per-cpu timer tracking routines
131 */
132
133static __init void uv_rtc_deallocate_timers(void)
134{
135 int bid;
136
137 for_each_possible_blade(bid) {
138 kfree(blade_info[bid]);
139 }
140 kfree(blade_info);
141}
142
143/* Allocate per-node list of cpu timer expiration times. */
144static __init int uv_rtc_allocate_timers(void)
145{
146 int cpu;
147
148 blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL);
149 if (!blade_info)
150 return -ENOMEM;
151 memset(blade_info, 0, uv_possible_blades * sizeof(void *));
152
153 for_each_present_cpu(cpu) {
154 int nid = cpu_to_node(cpu);
155 int bid = uv_cpu_to_blade_id(cpu);
156 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
157 struct uv_rtc_timer_head *head = blade_info[bid];
158
159 if (!head) {
160 head = kmalloc_node(sizeof(struct uv_rtc_timer_head) +
161 (uv_blade_nr_possible_cpus(bid) *
162 2 * sizeof(u64)),
163 GFP_KERNEL, nid);
164 if (!head) {
165 uv_rtc_deallocate_timers();
166 return -ENOMEM;
167 }
168 spin_lock_init(&head->lock);
169 head->ncpus = uv_blade_nr_possible_cpus(bid);
170 head->next_cpu = -1;
171 blade_info[bid] = head;
172 }
173
174 head->cpu[bcpu].lcpu = cpu;
175 head->cpu[bcpu].expires = ULLONG_MAX;
176 }
177
178 return 0;
179}
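The kmalloc_node() size above deserves a gloss; assuming cpu[1] is the
usual one-element trailing-array idiom:

/*
 * Each entry is { int lcpu; u64 expires; }, which pads to
 * 2 * sizeof(u64) = 16 bytes on x86_64, so
 *
 *	sizeof(struct uv_rtc_timer_head) + N * 2 * sizeof(u64)
 *
 * leaves room for N entries beyond the header. One entry is already
 * embedded in the struct, so this over-allocates by a slot rather than
 * risking an overflow.
 */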
180
181/* Find and set the next expiring timer. */
182static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode)
183{
184 u64 lowest = ULLONG_MAX;
185 int c, bcpu = -1;
186
187 head->next_cpu = -1;
188 for (c = 0; c < head->ncpus; c++) {
189 u64 exp = head->cpu[c].expires;
190 if (exp < lowest) {
191 bcpu = c;
192 lowest = exp;
193 }
194 }
195 if (bcpu >= 0) {
196 head->next_cpu = bcpu;
197 c = head->cpu[bcpu].lcpu;
198 if (uv_setup_intr(c, lowest))
199 /* If we didn't set it up in time, trigger */
200 uv_rtc_send_IPI(c);
201 } else {
202 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
203 UVH_RTC1_INT_CONFIG_M_MASK);
204 }
205}
206
207/*
208 * Set expiration time for current cpu.
209 *
210 * Returns 1 if we missed the expiration time.
211 */
212static int uv_rtc_set_timer(int cpu, u64 expires)
213{
214 int pnode = uv_cpu_to_pnode(cpu);
215 int bid = uv_cpu_to_blade_id(cpu);
216 struct uv_rtc_timer_head *head = blade_info[bid];
217 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
218 u64 *t = &head->cpu[bcpu].expires;
219 unsigned long flags;
220 int next_cpu;
221
222 spin_lock_irqsave(&head->lock, flags);
223
224 next_cpu = head->next_cpu;
225 *t = expires;
226 /* Will this one be next to go off? */
227 if (next_cpu < 0 || bcpu == next_cpu ||
228 expires < head->cpu[next_cpu].expires) {
229 head->next_cpu = bcpu;
230 if (uv_setup_intr(cpu, expires)) {
231 *t = ULLONG_MAX;
232 uv_rtc_find_next_timer(head, pnode);
233 spin_unlock_irqrestore(&head->lock, flags);
234 return 1;
235 }
236 }
237
238 spin_unlock_irqrestore(&head->lock, flags);
239 return 0;
240}
241
242/*
243 * Unset expiration time for current cpu.
244 *
245 * Returns 1 if this timer was pending.
246 */
247static int uv_rtc_unset_timer(int cpu)
248{
249 int pnode = uv_cpu_to_pnode(cpu);
250 int bid = uv_cpu_to_blade_id(cpu);
251 struct uv_rtc_timer_head *head = blade_info[bid];
252 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
253 u64 *t = &head->cpu[bcpu].expires;
254 unsigned long flags;
255 int rc = 0;
256
257 spin_lock_irqsave(&head->lock, flags);
258
259 if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
260 rc = 1;
261
262 *t = ULLONG_MAX;
263
264 /* Was the hardware set up for this timer? */
265 if (head->next_cpu == bcpu)
266 uv_rtc_find_next_timer(head, pnode);
267
268 spin_unlock_irqrestore(&head->lock, flags);
269
270 return rc;
271}
272
273
274/*
275 * Kernel interface routines.
276 */
277
278/*
279 * Read the RTC.
280 */
281static cycle_t uv_read_rtc(void)
282{
283 return (cycle_t)uv_read_local_mmr(UVH_RTC);
284}
285
286/*
287 * Program the next event, relative to now
288 */
289static int uv_rtc_next_event(unsigned long delta,
290 struct clock_event_device *ced)
291{
292 int ced_cpu = cpumask_first(ced->cpumask);
293
294 return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
295}
296
297/*
298 * Setup the RTC timer in oneshot mode
299 */
300static void uv_rtc_timer_setup(enum clock_event_mode mode,
301 struct clock_event_device *evt)
302{
303 int ced_cpu = cpumask_first(evt->cpumask);
304
305 switch (mode) {
306 case CLOCK_EVT_MODE_PERIODIC:
307 case CLOCK_EVT_MODE_ONESHOT:
308 case CLOCK_EVT_MODE_RESUME:
309 /* Nothing to do here yet */
310 break;
311 case CLOCK_EVT_MODE_UNUSED:
312 case CLOCK_EVT_MODE_SHUTDOWN:
313 uv_rtc_unset_timer(ced_cpu);
314 break;
315 }
316}
317
318static void uv_rtc_interrupt(void)
319{
320 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
321 int cpu = smp_processor_id();
322
323 if (!ced || !ced->event_handler)
324 return;
325
326 if (uv_rtc_unset_timer(cpu) != 1)
327 return;
328
329 ced->event_handler(ced);
330}
331
332static int __init uv_enable_rtc(char *str)
333{
334 uv_rtc_enable = 1;
335
336 return 1;
337}
338__setup("uvrtc", uv_enable_rtc);
339
340static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
341{
342 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
343
344 *ced = clock_event_device_uv;
345 ced->cpumask = cpumask_of(smp_processor_id());
346 clockevents_register_device(ced);
347}
348
349static __init int uv_rtc_setup_clock(void)
350{
351 int rc;
352
353 if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension)
354 return -ENODEV;
355
356 generic_interrupt_extension = uv_rtc_interrupt;
357
358 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
359 clocksource_uv.shift);
360
361 rc = clocksource_register(&clocksource_uv);
362 if (rc) {
363 generic_interrupt_extension = NULL;
364 return rc;
365 }
366
367 /* Setup and register clockevents */
368 rc = uv_rtc_allocate_timers();
369 if (rc) {
370 clocksource_unregister(&clocksource_uv);
371 generic_interrupt_extension = NULL;
372 return rc;
373 }
374
375 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
376 NSEC_PER_SEC, clock_event_device_uv.shift);
377
378 clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
379 sn_rtc_cycles_per_second;
380
381 clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
382 (NSEC_PER_SEC / sn_rtc_cycles_per_second);
383
384 rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
385 if (rc) {
386 clocksource_unregister(&clocksource_uv);
387 generic_interrupt_extension = NULL;
388 uv_rtc_deallocate_timers();
389 }
390
391 return rc;
392}
393arch_initcall(uv_rtc_setup_clock);
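A worked pass through the scaling above, with an invented RTC frequency
(the real sn_rtc_cycles_per_second comes from the UV BIOS):

/*
 * With sn_rtc_cycles_per_second = 100,000,000 (100 MHz, illustrative):
 *
 *	clocksource mult = (10^9 << 10) / 10^8  = 10240
 *	clockevent  mult = (10^8 << 20) / 10^9 ~= 104857
 *	min_delta_ns     = 10^9 / 10^8          = 10 ns
 *	max_delta_ns     = mask * 10 ns
 */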
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index d801d06af068..31ffc24eec4d 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -24,18 +24,14 @@
24 24
25#include <asm/visws/cobalt.h> 25#include <asm/visws/cobalt.h>
26#include <asm/visws/piix4.h> 26#include <asm/visws/piix4.h>
27#include <asm/arch_hooks.h>
28#include <asm/io_apic.h> 27#include <asm/io_apic.h>
29#include <asm/fixmap.h> 28#include <asm/fixmap.h>
30#include <asm/reboot.h> 29#include <asm/reboot.h>
31#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/apic.h>
32#include <asm/e820.h> 32#include <asm/e820.h>
33#include <asm/io.h> 33#include <asm/io.h>
34 34
35#include <mach_ipi.h>
36
37#include "mach_apic.h"
38
39#include <linux/kernel_stat.h> 35#include <linux/kernel_stat.h>
40 36
41#include <asm/i8259.h> 37#include <asm/i8259.h>
@@ -49,8 +45,6 @@
49 45
50extern int no_broadcast; 46extern int no_broadcast;
51 47
52#include <asm/apic.h>
53
54char visws_board_type = -1; 48char visws_board_type = -1;
55char visws_board_rev = -1; 49char visws_board_rev = -1;
56 50
@@ -200,7 +194,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
200 return; 194 return;
201 } 195 }
202 196
203 apic_cpus = apicid_to_cpu_present(m->apicid); 197 apic_cpus = apic->apicid_to_cpu_present(m->apicid);
204 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); 198 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
205 /* 199 /*
206 * Validate version 200 * Validate version
@@ -584,7 +578,7 @@ static struct irq_chip piix4_virtual_irq_type = {
584static irqreturn_t piix4_master_intr(int irq, void *dev_id) 578static irqreturn_t piix4_master_intr(int irq, void *dev_id)
585{ 579{
586 int realirq; 580 int realirq;
587 irq_desc_t *desc; 581 struct irq_desc *desc;
588 unsigned long flags; 582 unsigned long flags;
589 583
590 spin_lock_irqsave(&i8259A_lock, flags); 584 spin_lock_irqsave(&i8259A_lock, flags);
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 4eeb5cf9720d..d7ac84e7fc1c 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -158,7 +158,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
158 ret = KVM86->regs32; 158 ret = KVM86->regs32;
159 159
160 ret->fs = current->thread.saved_fs; 160 ret->fs = current->thread.saved_fs;
161 loadsegment(gs, current->thread.saved_gs); 161 set_user_gs(ret, current->thread.saved_gs);
162 162
163 return ret; 163 return ret;
164} 164}
@@ -197,9 +197,9 @@ out:
197static int do_vm86_irq_handling(int subfunction, int irqnumber); 197static int do_vm86_irq_handling(int subfunction, int irqnumber);
198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); 198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
199 199
200asmlinkage int sys_vm86old(struct pt_regs regs) 200int sys_vm86old(struct pt_regs *regs)
201{ 201{
202 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx; 202 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx;
203 struct kernel_vm86_struct info; /* declare this _on top_, 203 struct kernel_vm86_struct info; /* declare this _on top_,
204 * this avoids wasting of stack space. 204 * this avoids wasting of stack space.
205 * This remains on the stack until we 205 * This remains on the stack until we
@@ -218,7 +218,7 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
218 if (tmp) 218 if (tmp)
219 goto out; 219 goto out;
220 memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); 220 memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
221 info.regs32 = &regs; 221 info.regs32 = regs;
222 tsk->thread.vm86_info = v86; 222 tsk->thread.vm86_info = v86;
223 do_sys_vm86(&info, tsk); 223 do_sys_vm86(&info, tsk);
224 ret = 0; /* we never return here */ 224 ret = 0; /* we never return here */
@@ -227,7 +227,7 @@ out:
227} 227}
228 228
229 229
230asmlinkage int sys_vm86(struct pt_regs regs) 230int sys_vm86(struct pt_regs *regs)
231{ 231{
232 struct kernel_vm86_struct info; /* declare this _on top_, 232 struct kernel_vm86_struct info; /* declare this _on top_,
233 * this avoids wasting of stack space. 233 * this avoids wasting of stack space.
@@ -239,12 +239,12 @@ asmlinkage int sys_vm86(struct pt_regs regs)
239 struct vm86plus_struct __user *v86; 239 struct vm86plus_struct __user *v86;
240 240
241 tsk = current; 241 tsk = current;
242 switch (regs.bx) { 242 switch (regs->bx) {
243 case VM86_REQUEST_IRQ: 243 case VM86_REQUEST_IRQ:
244 case VM86_FREE_IRQ: 244 case VM86_FREE_IRQ:
245 case VM86_GET_IRQ_BITS: 245 case VM86_GET_IRQ_BITS:
246 case VM86_GET_AND_RESET_IRQ: 246 case VM86_GET_AND_RESET_IRQ:
247 ret = do_vm86_irq_handling(regs.bx, (int)regs.cx); 247 ret = do_vm86_irq_handling(regs->bx, (int)regs->cx);
248 goto out; 248 goto out;
249 case VM86_PLUS_INSTALL_CHECK: 249 case VM86_PLUS_INSTALL_CHECK:
250 /* 250 /*
@@ -261,14 +261,14 @@ asmlinkage int sys_vm86(struct pt_regs regs)
261 ret = -EPERM; 261 ret = -EPERM;
262 if (tsk->thread.saved_sp0) 262 if (tsk->thread.saved_sp0)
263 goto out; 263 goto out;
264 v86 = (struct vm86plus_struct __user *)regs.cx; 264 v86 = (struct vm86plus_struct __user *)regs->cx;
265 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 265 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
266 offsetof(struct kernel_vm86_struct, regs32) - 266 offsetof(struct kernel_vm86_struct, regs32) -
267 sizeof(info.regs)); 267 sizeof(info.regs));
268 ret = -EFAULT; 268 ret = -EFAULT;
269 if (tmp) 269 if (tmp)
270 goto out; 270 goto out;
271 info.regs32 = &regs; 271 info.regs32 = regs;
272 info.vm86plus.is_vm86pus = 1; 272 info.vm86plus.is_vm86pus = 1;
273 tsk->thread.vm86_info = (struct vm86_struct __user *)v86; 273 tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
274 do_sys_vm86(&info, tsk); 274 do_sys_vm86(&info, tsk);
@@ -323,7 +323,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
323 info->regs32->ax = 0; 323 info->regs32->ax = 0;
324 tsk->thread.saved_sp0 = tsk->thread.sp0; 324 tsk->thread.saved_sp0 = tsk->thread.sp0;
325 tsk->thread.saved_fs = info->regs32->fs; 325 tsk->thread.saved_fs = info->regs32->fs;
326 savesegment(gs, tsk->thread.saved_gs); 326 tsk->thread.saved_gs = get_user_gs(info->regs32);
327 327
328 tss = &per_cpu(init_tss, get_cpu()); 328 tss = &per_cpu(init_tss, get_cpu());
329 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; 329 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index bef58b4982db..95deb9f2211e 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -395,11 +395,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
395 vmi_ops.update_pte(ptep, VMI_PAGE_PT); 395 vmi_ops.update_pte(ptep, VMI_PAGE_PT);
396} 396}
397 397
398static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
399{
400 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
401}
402
403static void vmi_set_pud(pud_t *pudp, pud_t pudval) 398static void vmi_set_pud(pud_t *pudp, pud_t pudval)
404{ 399{
405 /* Um, eww */ 400 /* Um, eww */
@@ -680,10 +675,11 @@ static inline int __init activate_vmi(void)
680 para_fill(pv_mmu_ops.write_cr2, SetCR2); 675 para_fill(pv_mmu_ops.write_cr2, SetCR2);
681 para_fill(pv_mmu_ops.write_cr3, SetCR3); 676 para_fill(pv_mmu_ops.write_cr3, SetCR3);
682 para_fill(pv_cpu_ops.write_cr4, SetCR4); 677 para_fill(pv_cpu_ops.write_cr4, SetCR4);
683 para_fill(pv_irq_ops.save_fl, GetInterruptMask); 678
684 para_fill(pv_irq_ops.restore_fl, SetInterruptMask); 679 para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
685 para_fill(pv_irq_ops.irq_disable, DisableInterrupts); 680 para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
686 para_fill(pv_irq_ops.irq_enable, EnableInterrupts); 681 para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
682 para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
687 683
688 para_fill(pv_cpu_ops.wbinvd, WBINVD); 684 para_fill(pv_cpu_ops.wbinvd, WBINVD);
689 para_fill(pv_cpu_ops.read_tsc, RDTSC); 685 para_fill(pv_cpu_ops.read_tsc, RDTSC);
@@ -749,7 +745,6 @@ static inline int __init activate_vmi(void)
749 pv_mmu_ops.set_pmd = vmi_set_pmd; 745 pv_mmu_ops.set_pmd = vmi_set_pmd;
750#ifdef CONFIG_X86_PAE 746#ifdef CONFIG_X86_PAE
751 pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic; 747 pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
752 pv_mmu_ops.set_pte_present = vmi_set_pte_present;
753 pv_mmu_ops.set_pud = vmi_set_pud; 748 pv_mmu_ops.set_pud = vmi_set_pud;
754 pv_mmu_ops.pte_clear = vmi_pte_clear; 749 pv_mmu_ops.pte_clear = vmi_pte_clear;
755 pv_mmu_ops.pmd_clear = vmi_pmd_clear; 750 pv_mmu_ops.pmd_clear = vmi_pmd_clear;
@@ -797,8 +792,8 @@ static inline int __init activate_vmi(void)
797#endif 792#endif
798 793
799#ifdef CONFIG_X86_LOCAL_APIC 794#ifdef CONFIG_X86_LOCAL_APIC
800 para_fill(apic_ops->read, APICRead); 795 para_fill(apic->read, APICRead);
801 para_fill(apic_ops->write, APICWrite); 796 para_fill(apic->write, APICWrite);
802#endif 797#endif
803 798
804 /* 799 /*
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index c4c1f9e09402..d303369a7bad 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -28,7 +28,6 @@
28 28
29#include <asm/vmi.h> 29#include <asm/vmi.h>
30#include <asm/vmi_time.h> 30#include <asm/vmi_time.h>
31#include <asm/arch_hooks.h>
32#include <asm/apicdef.h> 31#include <asm/apicdef.h>
33#include <asm/apic.h> 32#include <asm/apic.h>
34#include <asm/timer.h> 33#include <asm/timer.h>
@@ -202,8 +201,7 @@ static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
202static struct irqaction vmi_clock_action = { 201static struct irqaction vmi_clock_action = {
203 .name = "vmi-timer", 202 .name = "vmi-timer",
204 .handler = vmi_timer_interrupt, 203 .handler = vmi_timer_interrupt,
205 .flags = IRQF_DISABLED | IRQF_NOBALANCING, 204 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
206 .mask = CPU_MASK_ALL,
207}; 205};
208 206
209static void __devinit vmi_time_init_clockevent(void) 207static void __devinit vmi_time_init_clockevent(void)
@@ -256,7 +254,7 @@ void __devinit vmi_time_bsp_init(void)
256 */ 254 */
257 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 255 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
258 local_irq_disable(); 256 local_irq_disable();
259#ifdef CONFIG_X86_SMP 257#ifdef CONFIG_SMP
260 /* 258 /*
261 * XXX handle_percpu_irq only defined for SMP; we need to switch over 259 * XXX handle_percpu_irq only defined for SMP; we need to switch over
262 * to using it, since this is a local interrupt, which each CPU must 260 * to using it, since this is a local interrupt, which each CPU must
@@ -283,10 +281,12 @@ void __devinit vmi_time_ap_init(void)
 #endif
 
 /** vmi clocksource */
+static struct clocksource clocksource_vmi;
 
 static cycle_t read_real_cycles(void)
 {
-	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
+	cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
+	return max(ret, clocksource_vmi.cycle_last);
 }
 
 static struct clocksource clocksource_vmi = {
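
The read_real_cycles() change above is a monotonicity guard: the VMI cycle
counter can apparently step backwards, so the readout is clamped to the last
value the timekeeping core accepted, clocksource_vmi.cycle_last. The forward
declaration of clocksource_vmi exists only so read_real_cycles() can
reference that field before the full definition. A self-contained sketch of
the pattern, assuming a 64-bit cycle type as a stand-in for this era's
cycle_t:

typedef unsigned long long cycle_t;	/* stand-in for the kernel's cycle_t */

/* Never report less than the last value handed back to the core; this is
 * what max(ret, clocksource_vmi.cycle_last) achieves above. */
static cycle_t clamp_monotonic(cycle_t hw, cycle_t last)
{
	return hw > last ? hw : last;
}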
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 82c67559dde7..62ad500d55f3 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -12,7 +12,7 @@
 
 #include <asm-generic/vmlinux.lds.h>
 #include <asm/thread_info.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
 #include <asm/cache.h>
 #include <asm/boot.h>
 
@@ -178,14 +178,7 @@ SECTIONS
 	__initramfs_end = .;
   }
 #endif
-  . = ALIGN(PAGE_SIZE);
-  .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
-	__per_cpu_start = .;
-	*(.data.percpu.page_aligned)
-	*(.data.percpu)
-	*(.data.percpu.shared_aligned)
-	__per_cpu_end = .;
-  }
+  PERCPU(PAGE_SIZE)
   . = ALIGN(PAGE_SIZE);
   /* freed after init ends here */
 
@@ -196,15 +189,24 @@ SECTIONS
 	*(.bss)
 	. = ALIGN(4);
 	__bss_stop = .;
-	_end = . ;
-	/* This is where the kernel creates the early boot page tables */
-	. = ALIGN(PAGE_SIZE);
-	pg0 = . ;
+  }
+
+  .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+	. = ALIGN(PAGE_SIZE);
+	__brk_base = . ;
+	. += 64 * 1024 ;	/* 64k alignment slop space */
+	*(.brk_reservation)	/* areas brk users have reserved */
+	__brk_limit = . ;
+  }
+
+  .end : AT(ADDR(.end) - LOAD_OFFSET) {
+	_end = . ;
   }
 
   /* Sections to be discarded */
   /DISCARD/ : {
 	*(.exitcall.exit)
+	*(.discard)
   }
 
   STABS_DEBUG
@@ -212,6 +214,12 @@ SECTIONS
 	DWARF_DEBUG
 }
 
+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+	"kernel image bigger than KERNEL_IMAGE_SIZE")
+
 #ifdef CONFIG_KEXEC
 /* Link time checks */
 #include <asm/kexec.h>
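
The new .brk output section (added to the 32-bit script above and, below, to
the 64-bit one) gives early boot code a simple bump allocator between
__brk_base and __brk_limit, replacing the hard-coded pg0 page-table area.
Users reserve space at link time and carve it out at boot; a sketch of the
intended usage, assuming the RESERVE_BRK()/extend_brk() helpers from
arch/x86/include/asm/setup.h that accompany this series:

#include <linux/init.h>
#include <asm/setup.h>

/* Link-time reservation: lands in *(.brk_reservation) above. */
RESERVE_BRK(early_tables, 64 * 1024);

static void __init brk_usage_sketch(void)
{
	/* Boot-time carve-out: 4K, 4K-aligned, from the brk area. */
	void *p = extend_brk(4096, 4096);
	(void)p;
}

The ". += 64 * 1024" in the section body is slop to absorb alignment padding
for such reservations.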
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 1a614c0e6bef..c8742507b030 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -5,7 +5,8 @@
 #define LOAD_OFFSET __START_KERNEL_map
 
 #include <asm-generic/vmlinux.lds.h>
-#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/page_types.h>
 
 #undef i386	/* in case the preprocessor is a 32bit one */
 
@@ -13,20 +14,23 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
 OUTPUT_ARCH(i386:x86-64)
 ENTRY(phys_startup_64)
 jiffies_64 = jiffies;
-_proxy_pda = 1;
 PHDRS {
 	text PT_LOAD FLAGS(5);	/* R_E */
 	data PT_LOAD FLAGS(7);	/* RWE */
 	user PT_LOAD FLAGS(7);	/* RWE */
 	data.init PT_LOAD FLAGS(7);	/* RWE */
+#ifdef CONFIG_SMP
+	percpu PT_LOAD FLAGS(7);	/* RWE */
+#endif
+	data.init2 PT_LOAD FLAGS(7);	/* RWE */
 	note PT_NOTE FLAGS(0);	/* ___ */
 }
 SECTIONS
 {
 	. = __START_KERNEL;
 	phys_startup_64 = startup_64 - LOAD_OFFSET;
-	_text = .;	/* Text and read-only data */
 	.text : AT(ADDR(.text) - LOAD_OFFSET) {
+		_text = .;	/* Text and read-only data */
 		/* First the code that has to be first for bootstrapping */
 		*(.text.head)
 		_stext = .;
@@ -57,13 +61,13 @@ SECTIONS
 	.data : AT(ADDR(.data) - LOAD_OFFSET) {
 		DATA_DATA
 		CONSTRUCTORS
+		_edata = .;	/* End of data section */
 	} :data
 
-	_edata = .;	/* End of data section */
 
-	. = ALIGN(PAGE_SIZE);
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
 	.data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
 		*(.data.cacheline_aligned)
 	}
 	. = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
@@ -121,29 +125,29 @@ SECTIONS
 #undef VVIRT_OFFSET
 #undef VVIRT
 
-	. = ALIGN(THREAD_SIZE);	/* init_task */
 	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+		. = ALIGN(THREAD_SIZE);	/* init_task */
 		*(.data.init_task)
 	}:data.init
 
-	. = ALIGN(PAGE_SIZE);
 	.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
 		*(.data.page_aligned)
 	}
 
-	/* might get freed after init */
-	. = ALIGN(PAGE_SIZE);
-	__smp_alt_begin = .;
-	__smp_locks = .;
 	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+		/* might get freed after init */
+		. = ALIGN(PAGE_SIZE);
+		__smp_alt_begin = .;
+		__smp_locks = .;
 		*(.smp_locks)
+		__smp_locks_end = .;
+		. = ALIGN(PAGE_SIZE);
+		__smp_alt_end = .;
 	}
-	__smp_locks_end = .;
-	. = ALIGN(PAGE_SIZE);
-	__smp_alt_end = .;
 
 	. = ALIGN(PAGE_SIZE);	/* Init code and data */
-	__init_begin = .;
+	__init_begin = .;	/* paired with __init_end */
 	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
 		_sinittext = .;
 		INIT_TEXT
@@ -155,40 +159,42 @@ SECTIONS
 		__initdata_end = .;
 	}
 
-	. = ALIGN(16);
-	__setup_start = .;
-	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
-	__setup_end = .;
-	__initcall_start = .;
+	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+		. = ALIGN(16);
+		__setup_start = .;
+		*(.init.setup)
+		__setup_end = .;
+	}
 	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+		__initcall_start = .;
 		INITCALLS
+		__initcall_end = .;
 	}
-	__initcall_end = .;
-	__con_initcall_start = .;
 	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+		__con_initcall_start = .;
 		*(.con_initcall.init)
+		__con_initcall_end = .;
 	}
-	__con_initcall_end = .;
-	__x86_cpu_dev_start = .;
 	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+		__x86_cpu_dev_start = .;
 		*(.x86_cpu_dev.init)
+		__x86_cpu_dev_end = .;
 	}
-	__x86_cpu_dev_end = .;
 	SECURITY_INIT
 
 	. = ALIGN(8);
 	.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
 		__parainstructions = .;
 		*(.parainstructions)
 		__parainstructions_end = .;
 	}
 
-	. = ALIGN(8);
-	__alt_instructions = .;
 	.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+		. = ALIGN(8);
+		__alt_instructions = .;
 		*(.altinstructions)
+		__alt_instructions_end = .;
 	}
-	__alt_instructions_end = .;
 	.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
 		*(.altinstr_replacement)
 	}
@@ -203,28 +209,53 @@ SECTIONS
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	. = ALIGN(PAGE_SIZE);
-	__initramfs_start = .;
-	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
-	__initramfs_end = .;
+	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+		__initramfs_start = .;
+		*(.init.ramfs)
+		__initramfs_end = .;
+	}
 #endif
 
+#ifdef CONFIG_SMP
+	/*
+	 * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
+	 * output PHDR, so the next output section - __data_nosave - should
+	 * start another section data.init2.  Also, pda should be at the head of
+	 * percpu area.  Preallocate it and define the percpu offset symbol
+	 * so that it can be accessed as a percpu variable.
+	 */
+	. = ALIGN(PAGE_SIZE);
+	PERCPU_VADDR(0, :percpu)
+#else
 	PERCPU(PAGE_SIZE)
+#endif
 
 	. = ALIGN(PAGE_SIZE);
 	__init_end = .;
 
-	. = ALIGN(PAGE_SIZE);
-	__nosave_begin = .;
-	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
-	. = ALIGN(PAGE_SIZE);
-	__nosave_end = .;
+	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		__nosave_begin = .;
+		*(.data.nosave)
+		. = ALIGN(PAGE_SIZE);
+		__nosave_end = .;
+	} :data.init2	/* use another section data.init2, see PERCPU_VADDR() above */
 
-	__bss_start = .;	/* BSS */
 	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		__bss_start = .;	/* BSS */
 		*(.bss.page_aligned)
 		*(.bss)
-	}
-	__bss_stop = .;
+		__bss_stop = .;
+	}
+
+	.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		__brk_base = . ;
+		. += 64 * 1024 ;	/* 64k alignment slop space */
+		*(.brk_reservation)	/* areas brk users have reserved */
+		__brk_limit = . ;
+	}
 
 	_end = . ;
 
@@ -232,6 +263,7 @@ SECTIONS
 	/DISCARD/ : {
 		*(.exitcall.exit)
 		*(.eh_frame)
+		*(.discard)
 	}
 
 	STABS_DEBUG
@@ -239,8 +271,28 @@ SECTIONS
 	DWARF_DEBUG
 }
 
+ /*
+  * Per-cpu symbols which need to be offset from __per_cpu_load
+  * for the boot processor.
+  */
+#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
+INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(irq_stack_union);
+
 /*
  * Build-time check on the image size:
  */
 ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
 	"kernel image bigger than KERNEL_IMAGE_SIZE")
+
+#ifdef CONFIG_SMP
+ASSERT((per_cpu__irq_stack_union == 0),
+	"irq_stack_union is not at start of per-cpu area");
+#endif
+
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+	"kexec control code size is too big")
+#endif
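
The percpu changes above implement zero-based percpu offsets on SMP:
PERCPU_VADDR(0, :percpu) links the percpu area at virtual address 0, so each
percpu symbol's link-time address is directly its offset within a CPU's area
(which is why the ASSERT insists per_cpu__irq_stack_union sit at offset 0).
The boot CPU's initial copy is loaded at __per_cpu_load, and INIT_PER_CPU()
precomputes image-relative addresses for symbols needed before the percpu
areas are set up. An illustrative sketch of that address arithmetic (names
follow the linker script; boot_image_copy is hypothetical, not a kernel API):

extern char __per_cpu_load[];

/* A zero-based percpu symbol address is really an offset; the boot CPU's
 * copy lives inside the image at __per_cpu_load + offset, which is exactly
 * what INIT_PER_CPU(x) precomputes at link time. */
static inline void *boot_image_copy(unsigned long percpu_offset)
{
	return __per_cpu_load + percpu_offset;
}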
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a688f3bfaec2..a1d804bcd483 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -37,6 +37,7 @@ static unsigned long vsmp_save_fl(void)
 	flags &= ~X86_EFLAGS_IF;
 	return flags;
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
 
 static void vsmp_restore_fl(unsigned long flags)
 {
@@ -46,6 +47,7 @@ static void vsmp_restore_fl(unsigned long flags)
 	flags |= X86_EFLAGS_AC;
 	native_restore_fl(flags);
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
 
 static void vsmp_irq_disable(void)
 {
@@ -53,6 +55,7 @@ static void vsmp_irq_disable(void)
 
 	native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
 
 static void vsmp_irq_enable(void)
 {
@@ -60,6 +63,7 @@ static void vsmp_irq_enable(void)
 
 	native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
 
 static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
 				  unsigned long addr, unsigned len)
@@ -90,10 +94,10 @@ static void __init set_vsmp_pv_ops(void)
 			cap, ctl);
 	if (cap & ctl & (1 << 4)) {
 		/* Setup irq ops and turn on vSMP IRQ fastpath handling */
-		pv_irq_ops.irq_disable = vsmp_irq_disable;
-		pv_irq_ops.irq_enable = vsmp_irq_enable;
-		pv_irq_ops.save_fl = vsmp_save_fl;
-		pv_irq_ops.restore_fl = vsmp_restore_fl;
+		pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
+		pv_irq_ops.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable);
+		pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
+		pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
 		pv_init_ops.patch = vsmp_patch;
 
 		ctl &= ~(1 << 4);
@@ -135,6 +139,7 @@ int is_vsmp_box(void)
 		return 0;
 	}
 }
+
 #else
 static void __init detect_vsmp_box(void)
 {
@@ -144,7 +149,6 @@ int is_vsmp_box(void)
 	return 0;
 }
 #endif
-
 void __init vsmp_init(void)
 {
 	detect_vsmp_box();
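
PV_CALLEE_SAVE_REGS_THUNK(f), used four times above, emits an assembly
wrapper named __raw_callee_save_f that pushes and pops all caller-clobbered
registers around a call to f; call sites can then invoke the op without
spilling those registers, which matters for hot paths like save_fl and
restore_fl. PV_CALLEE_SAVE(f) packages that thunk as the
struct paravirt_callee_save value pv_irq_ops now expects; roughly:

/* Roughly the helper used in the hunk above (see
 * arch/x86/include/asm/paravirt.h; the generated thunk is elided): */
#define PV_CALLEE_SAVE(func)						\
	((struct paravirt_callee_save) { __raw_callee_save_##func })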
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 695e426aa354..3909e3ba5ce3 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -58,5 +58,3 @@ EXPORT_SYMBOL(__memcpy);
 EXPORT_SYMBOL(empty_zero_page);
 EXPORT_SYMBOL(init_level4_pgt);
 EXPORT_SYMBOL(load_gs_index);
-
-EXPORT_SYMBOL(_proxy_pda);