Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 38
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 251
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 74
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 3
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 707
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 29
-rw-r--r--  arch/x86/kernel/aperture_64.c | 5
-rw-r--r--  arch/x86/kernel/apic.c | 386
-rw-r--r--  arch/x86/kernel/apm_32.c | 10
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 3
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 15
-rw-r--r--  arch/x86/kernel/bigsmp_32.c | 266
-rw-r--r--  arch/x86/kernel/bios_uv.c | 60
-rw-r--r--  arch/x86/kernel/check.c | 161
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 6
-rw-r--r--  arch/x86/kernel/cpu/addon_cpuid_features.c | 52
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 11
-rw-r--r--  arch/x86/kernel/cpu/common.c | 281
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Kconfig | 11
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 192
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 14
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 9
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 76
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 19
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 72
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 18
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 9
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 58
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 53
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 140
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_32.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_64.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 131
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p6.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 22
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 359
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.h | 18
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 112
-rw-r--r--  arch/x86/kernel/cpuid.c | 8
-rw-r--r--  arch/x86/kernel/crash.c | 22
-rw-r--r--  arch/x86/kernel/ds.c | 1143
-rw-r--r--  arch/x86/kernel/dumpstack.c | 351
-rw-r--r--  arch/x86/kernel/dumpstack.h | 39
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 307
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 322
-rw-r--r--  arch/x86/kernel/e820.c | 37
-rw-r--r--  arch/x86/kernel/early-quirks.c | 41
-rw-r--r--  arch/x86/kernel/early_printk.c | 51
-rw-r--r--  arch/x86/kernel/efi.c | 2
-rw-r--r--  arch/x86/kernel/efi_64.c | 1
-rw-r--r--  arch/x86/kernel/entry_32.S | 925
-rw-r--r--  arch/x86/kernel/entry_64.S | 1488
-rw-r--r--  arch/x86/kernel/es7000_32.c | 498
-rw-r--r--  arch/x86/kernel/ftrace.c | 391
-rw-r--r--  arch/x86/kernel/genapic_64.c | 28
-rw-r--r--  arch/x86/kernel/genapic_flat_64.c | 265
-rw-r--r--  arch/x86/kernel/genx2apic_cluster.c | 166
-rw-r--r--  arch/x86/kernel/genx2apic_phys.c | 167
-rw-r--r--  arch/x86/kernel/genx2apic_uv_x.c | 264
-rw-r--r--  arch/x86/kernel/head.c | 1
-rw-r--r--  arch/x86/kernel/head32.c | 3
-rw-r--r--  arch/x86/kernel/head64.c | 26
-rw-r--r--  arch/x86/kernel/head_32.S | 40
-rw-r--r--  arch/x86/kernel/head_64.S | 21
-rw-r--r--  arch/x86/kernel/hpet.c | 34
-rw-r--r--  arch/x86/kernel/i387.c | 2
-rw-r--r--  arch/x86/kernel/i8237.c | 17
-rw-r--r--  arch/x86/kernel/i8253.c | 2
-rw-r--r--  arch/x86/kernel/i8259.c | 8
-rw-r--r--  arch/x86/kernel/init_task.c | 2
-rw-r--r--  arch/x86/kernel/io_apic.c | 1410
-rw-r--r--  arch/x86/kernel/ioport.c | 7
-rw-r--r--  arch/x86/kernel/ipi.c | 176
-rw-r--r--  arch/x86/kernel/irq.c | 52
-rw-r--r--  arch/x86/kernel/irq_32.c | 56
-rw-r--r--  arch/x86/kernel/irq_64.c | 105
-rw-r--r--  arch/x86/kernel/irqinit_32.c | 44
-rw-r--r--  arch/x86/kernel/irqinit_64.c | 90
-rw-r--r--  arch/x86/kernel/kgdb.c | 4
-rw-r--r--  arch/x86/kernel/kprobes.c | 11
-rw-r--r--  arch/x86/kernel/kvmclock.c | 12
-rw-r--r--  arch/x86/kernel/ldt.c | 4
-rw-r--r--  arch/x86/kernel/mfgpt_32.c | 4
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 232
-rw-r--r--  arch/x86/kernel/microcode_core.c | 25
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 18
-rw-r--r--  arch/x86/kernel/mmconf-fam10h_64.c | 3
-rw-r--r--  arch/x86/kernel/module_32.c | 6
-rw-r--r--  arch/x86/kernel/module_64.c | 32
-rw-r--r--  arch/x86/kernel/mpparse.c | 501
-rw-r--r--  arch/x86/kernel/msr.c | 4
-rw-r--r--  arch/x86/kernel/nmi.c | 72
-rw-r--r--  arch/x86/kernel/numaq_32.c | 353
-rw-r--r--  arch/x86/kernel/paravirt-spinlocks.c | 13
-rw-r--r--  arch/x86/kernel/paravirt.c | 55
-rw-r--r--  arch/x86/kernel/paravirt_patch_32.c | 12
-rw-r--r--  arch/x86/kernel/paravirt_patch_64.c | 15
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 2
-rw-r--r--  arch/x86/kernel/pci-dma.c | 26
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 10
-rw-r--r--  arch/x86/kernel/pci-swiotlb_64.c | 29
-rw-r--r--  arch/x86/kernel/probe_32.c | 411
-rw-r--r--  arch/x86/kernel/probe_roms_32.c | 2
-rw-r--r--  arch/x86/kernel/process.c | 43
-rw-r--r--  arch/x86/kernel/process_32.c | 143
-rw-r--r--  arch/x86/kernel/process_64.c | 109
-rw-r--r--  arch/x86/kernel/ptrace.c | 451
-rw-r--r--  arch/x86/kernel/quirks.c | 2
-rw-r--r--  arch/x86/kernel/reboot.c | 116
-rw-r--r--  arch/x86/kernel/setup.c | 211
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 426
-rw-r--r--  arch/x86/kernel/sigframe.h | 42
-rw-r--r--  arch/x86/kernel/signal.c (renamed from arch/x86/kernel/signal_32.c) | 712
-rw-r--r--  arch/x86/kernel/signal_64.c | 516
-rw-r--r--  arch/x86/kernel/smp.c | 63
-rw-r--r--  arch/x86/kernel/smpboot.c | 320
-rw-r--r--  arch/x86/kernel/smpcommon.c | 30
-rw-r--r--  arch/x86/kernel/stacktrace.c | 66
-rw-r--r--  arch/x86/kernel/summit_32.c | 416
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 22
-rw-r--r--  arch/x86/kernel/time_32.c | 8
-rw-r--r--  arch/x86/kernel/time_64.c | 8
-rw-r--r--  arch/x86/kernel/tlb_32.c | 257
-rw-r--r--  arch/x86/kernel/tlb_64.c | 284
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 86
-rw-r--r--  arch/x86/kernel/trampoline.c | 19
-rw-r--r--  arch/x86/kernel/trampoline_64.S | 19
-rw-r--r--  arch/x86/kernel/traps.c | 93
-rw-r--r--  arch/x86/kernel/tsc.c | 44
-rw-r--r--  arch/x86/kernel/tsc_sync.c | 12
-rw-r--r--  arch/x86/kernel/visws_quirks.c | 36
-rw-r--r--  arch/x86/kernel/vm86_32.c | 20
-rw-r--r--  arch/x86/kernel/vmi_32.c | 155
-rw-r--r--  arch/x86/kernel/vmiclock_32.c | 4
-rw-r--r--  arch/x86/kernel/vmlinux_32.lds.S | 10
-rw-r--r--  arch/x86/kernel/vmlinux_64.lds.S | 36
-rw-r--r--  arch/x86/kernel/vsmp_64.c | 12
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 12
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 2
-rw-r--r--  arch/x86/kernel/xsave.c | 4
144 files changed, 10970 insertions, 7994 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e489ff9cb3e2..24f357e7557a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -12,6 +12,7 @@ CFLAGS_REMOVE_tsc.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
 CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
 CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_early_printk.o = -pg
 endif
 
 #
@@ -22,13 +23,14 @@ nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o := $(nostackp)
 CFLAGS_tsc.o := $(nostackp)
+CFLAGS_paravirt.o := $(nostackp)
 
-obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
+obj-y := process_$(BITS).o signal.o entry_$(BITS).o
 obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y += time_$(BITS).o ioport.o ldt.o
-obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
+obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
+obj-y += setup.o i8259.o irqinit_$(BITS).o
 obj-$(CONFIG_X86_VISWS) += visws_quirks.o
-obj-$(CONFIG_X86_32) += probe_roms_32.o
+obj-$(CONFIG_X86_32) += probe_32.o probe_roms_32.o
 obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
@@ -41,36 +43,38 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
 obj-y += process.o
 obj-y += i387.o xsave.o
 obj-y += ptrace.o
-obj-y += ds.o
+obj-$(CONFIG_X86_DS) += ds.o
 obj-$(CONFIG_X86_32) += tls.o
 obj-$(CONFIG_IA32_EMULATION) += tls.o
 obj-y += step.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += cpu/
 obj-y += acpi/
-obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
+obj-y += reboot.o
 obj-$(CONFIG_MCA) += mca_32.o
 obj-$(CONFIG_X86_MSR) += msr.o
 obj-$(CONFIG_X86_CPUID) += cpuid.o
 obj-$(CONFIG_PCI) += early-quirks.o
 apm-y := apm_32.o
 obj-$(CONFIG_APM) += apm.o
-obj-$(CONFIG_X86_SMP) += smp.o
-obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o tlb_$(BITS).o
-obj-$(CONFIG_X86_32_SMP) += smpcommon.o
-obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
+obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o ipi.o
+obj-$(CONFIG_SMP) += setup_percpu.o
+obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
 obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
 obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o ipi.o
 obj-$(CONFIG_X86_IO_APIC) += io_apic.o
 obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
+obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
 obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
 obj-$(CONFIG_X86_ES7000) += es7000_32.o
-obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
+obj-$(CONFIG_X86_SUMMIT) += summit_32.o
 obj-y += vsmp_64.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_MODULES) += module_$(BITS).o
@@ -105,20 +109,24 @@ microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
 microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
 obj-$(CONFIG_MICROCODE) += microcode.o
 
+obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+
+obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
-        obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
-        obj-y += bios_uv.o uv_irq.o uv_sysfs.o
+        obj-y += genapic_64.o genapic_flat_64.o
         obj-y += genx2apic_cluster.o
         obj-y += genx2apic_phys.o
+        obj-$(CONFIG_X86_UV) += genx2apic_uv_x.o tlb_uv.o
+        obj-$(CONFIG_X86_UV) += bios_uv.o uv_irq.o uv_sysfs.o
        obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
        obj-$(CONFIG_AUDIT) += audit_64.o
 
        obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
        obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
        obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
-        obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
 
        obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
 endif
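
Note on the ds.o change above: moving it to obj-$(CONFIG_X86_DS) means the object is only compiled when that option is enabled, which normally goes hand in hand with header-side stubs for the =n case. A minimal sketch of that common pattern, with a made-up function name rather than the real asm/ds.h interface:

struct task_struct;

#ifdef CONFIG_X86_DS
/* real implementation is compiled from ds.c via obj-$(CONFIG_X86_DS) += ds.o */
extern void ds_example_exit_thread(struct task_struct *tsk);
#else
/* static inline stub keeps callers compiling when CONFIG_X86_DS=n */
static inline void ds_example_exit_thread(struct task_struct *tsk) { }
#endif
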
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 8c1f76abae9e..956c1dee6fbe 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -42,12 +42,8 @@
42#include <asm/mpspec.h> 42#include <asm/mpspec.h>
43#include <asm/smp.h> 43#include <asm/smp.h>
44 44
45#ifdef CONFIG_X86_LOCAL_APIC
46# include <mach_apic.h>
47#endif
48
49static int __initdata acpi_force = 0; 45static int __initdata acpi_force = 0;
50 46u32 acpi_rsdt_forced;
51#ifdef CONFIG_ACPI 47#ifdef CONFIG_ACPI
52int acpi_disabled = 0; 48int acpi_disabled = 0;
53#else 49#else
@@ -56,16 +52,7 @@ int acpi_disabled = 1;
56EXPORT_SYMBOL(acpi_disabled); 52EXPORT_SYMBOL(acpi_disabled);
57 53
58#ifdef CONFIG_X86_64 54#ifdef CONFIG_X86_64
59 55# include <asm/proto.h>
60#include <asm/proto.h>
61
62#else /* X86 */
63
64#ifdef CONFIG_X86_LOCAL_APIC
65#include <mach_apic.h>
66#include <mach_mpparse.h>
67#endif /* CONFIG_X86_LOCAL_APIC */
68
69#endif /* X86 */ 56#endif /* X86 */
70 57
71#define BAD_MADT_ENTRY(entry, end) ( \ 58#define BAD_MADT_ENTRY(entry, end) ( \
@@ -121,35 +108,18 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
121 */ 108 */
122char *__init __acpi_map_table(unsigned long phys, unsigned long size) 109char *__init __acpi_map_table(unsigned long phys, unsigned long size)
123{ 110{
124 unsigned long base, offset, mapped_size;
125 int idx;
126 111
127 if (!phys || !size) 112 if (!phys || !size)
128 return NULL; 113 return NULL;
129 114
130 if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT)) 115 return early_ioremap(phys, size);
131 return __va(phys); 116}
132 117void __init __acpi_unmap_table(char *map, unsigned long size)
133 offset = phys & (PAGE_SIZE - 1); 118{
134 mapped_size = PAGE_SIZE - offset; 119 if (!map || !size)
135 clear_fixmap(FIX_ACPI_END); 120 return;
136 set_fixmap(FIX_ACPI_END, phys);
137 base = fix_to_virt(FIX_ACPI_END);
138
139 /*
140 * Most cases can be covered by the below.
141 */
142 idx = FIX_ACPI_END;
143 while (mapped_size < size) {
144 if (--idx < FIX_ACPI_BEGIN)
145 return NULL; /* cannot handle this */
146 phys += PAGE_SIZE;
147 clear_fixmap(idx);
148 set_fixmap(idx, phys);
149 mapped_size += PAGE_SIZE;
150 }
151 121
152 return ((unsigned char *)base + offset); 122 early_iounmap(map, size);
153} 123}
154 124
155#ifdef CONFIG_PCI_MMCONFIG 125#ifdef CONFIG_PCI_MMCONFIG
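
The rewritten __acpi_map_table()/__acpi_unmap_table() pair above simply defers to the generic early_ioremap()/early_iounmap() helpers instead of juggling FIX_ACPI_* fixmap slots by hand. A hedged sketch of a caller using that early mapping API (the function name is invented, and the declarations are assumed to come from the usual x86 headers):

#include <linux/init.h>
#include <linux/types.h>
#include <asm/io.h>
#include <asm/page.h>

/* Sketch: temporarily map a firmware table during early boot and read its
 * length field, then drop the mapping again.
 */
static u32 __init example_read_table_length(unsigned long phys)
{
	char *table = early_ioremap(phys, PAGE_SIZE);
	u32 length;

	if (!table)
		return 0;

	/* standard ACPI table headers keep a 32-bit length at offset 4 */
	length = *(u32 *)(table + 4);
	early_iounmap(table, PAGE_SIZE);

	return length;
}
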
@@ -239,7 +209,8 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
239 madt->address); 209 madt->address);
240 } 210 }
241 211
242 acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); 212 default_acpi_madt_oem_check(madt->header.oem_id,
213 madt->header.oem_table_id);
243 214
244 return 0; 215 return 0;
245} 216}
@@ -538,9 +509,10 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
538 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; 509 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
539 union acpi_object *obj; 510 union acpi_object *obj;
540 struct acpi_madt_local_apic *lapic; 511 struct acpi_madt_local_apic *lapic;
541 cpumask_t tmp_map, new_map; 512 cpumask_var_t tmp_map, new_map;
542 u8 physid; 513 u8 physid;
543 int cpu; 514 int cpu;
515 int retval = -ENOMEM;
544 516
545 if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) 517 if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
546 return -EINVAL; 518 return -EINVAL;
@@ -569,23 +541,37 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
569 buffer.length = ACPI_ALLOCATE_BUFFER; 541 buffer.length = ACPI_ALLOCATE_BUFFER;
570 buffer.pointer = NULL; 542 buffer.pointer = NULL;
571 543
572 tmp_map = cpu_present_map; 544 if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL))
545 goto out;
546
547 if (!alloc_cpumask_var(&new_map, GFP_KERNEL))
548 goto free_tmp_map;
549
550 cpumask_copy(tmp_map, cpu_present_mask);
573 acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED); 551 acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
574 552
575 /* 553 /*
576 * If mp_register_lapic successfully generates a new logical cpu 554 * If mp_register_lapic successfully generates a new logical cpu
577 * number, then the following will get us exactly what was mapped 555 * number, then the following will get us exactly what was mapped
578 */ 556 */
579 cpus_andnot(new_map, cpu_present_map, tmp_map); 557 cpumask_andnot(new_map, cpu_present_mask, tmp_map);
580 if (cpus_empty(new_map)) { 558 if (cpumask_empty(new_map)) {
581 printk ("Unable to map lapic to logical cpu number\n"); 559 printk ("Unable to map lapic to logical cpu number\n");
582 return -EINVAL; 560 retval = -EINVAL;
561 goto free_new_map;
583 } 562 }
584 563
585 cpu = first_cpu(new_map); 564 cpu = cpumask_first(new_map);
586 565
587 *pcpu = cpu; 566 *pcpu = cpu;
588 return 0; 567 retval = 0;
568
569free_new_map:
570 free_cpumask_var(new_map);
571free_tmp_map:
572 free_cpumask_var(tmp_map);
573out:
574 return retval;
589} 575}
590 576
591/* wrapper to silence section mismatch warning */ 577/* wrapper to silence section mismatch warning */
@@ -598,7 +584,7 @@ EXPORT_SYMBOL(acpi_map_lsapic);
598int acpi_unmap_lsapic(int cpu) 584int acpi_unmap_lsapic(int cpu)
599{ 585{
600 per_cpu(x86_cpu_to_apicid, cpu) = -1; 586 per_cpu(x86_cpu_to_apicid, cpu) = -1;
601 cpu_clear(cpu, cpu_present_map); 587 set_cpu_present(cpu, false);
602 num_processors--; 588 num_processors--;
603 589
604 return (0); 590 return (0);
@@ -869,7 +855,7 @@ static struct {
869 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); 855 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
870} mp_ioapic_routing[MAX_IO_APICS]; 856} mp_ioapic_routing[MAX_IO_APICS];
871 857
872static int mp_find_ioapic(int gsi) 858int mp_find_ioapic(int gsi)
873{ 859{
874 int i = 0; 860 int i = 0;
875 861
@@ -884,6 +870,16 @@ static int mp_find_ioapic(int gsi)
884 return -1; 870 return -1;
885} 871}
886 872
873int mp_find_ioapic_pin(int ioapic, int gsi)
874{
875 if (WARN_ON(ioapic == -1))
876 return -1;
877 if (WARN_ON(gsi > mp_ioapic_routing[ioapic].gsi_end))
878 return -1;
879
880 return gsi - mp_ioapic_routing[ioapic].gsi_base;
881}
882
887static u8 __init uniq_ioapic_id(u8 id) 883static u8 __init uniq_ioapic_id(u8 id)
888{ 884{
889#ifdef CONFIG_X86_32 885#ifdef CONFIG_X86_32
@@ -897,8 +893,8 @@ static u8 __init uniq_ioapic_id(u8 id)
897 DECLARE_BITMAP(used, 256); 893 DECLARE_BITMAP(used, 256);
898 bitmap_zero(used, 256); 894 bitmap_zero(used, 256);
899 for (i = 0; i < nr_ioapics; i++) { 895 for (i = 0; i < nr_ioapics; i++) {
900 struct mp_config_ioapic *ia = &mp_ioapics[i]; 896 struct mpc_ioapic *ia = &mp_ioapics[i];
901 __set_bit(ia->mp_apicid, used); 897 __set_bit(ia->apicid, used);
902 } 898 }
903 if (!test_bit(id, used)) 899 if (!test_bit(id, used))
904 return id; 900 return id;
@@ -930,47 +926,70 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
930 926
931 idx = nr_ioapics; 927 idx = nr_ioapics;
932 928
933 mp_ioapics[idx].mp_type = MP_IOAPIC; 929 mp_ioapics[idx].type = MP_IOAPIC;
934 mp_ioapics[idx].mp_flags = MPC_APIC_USABLE; 930 mp_ioapics[idx].flags = MPC_APIC_USABLE;
935 mp_ioapics[idx].mp_apicaddr = address; 931 mp_ioapics[idx].apicaddr = address;
936 932
937 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 933 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
938 mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id); 934 mp_ioapics[idx].apicid = uniq_ioapic_id(id);
939#ifdef CONFIG_X86_32 935#ifdef CONFIG_X86_32
940 mp_ioapics[idx].mp_apicver = io_apic_get_version(idx); 936 mp_ioapics[idx].apicver = io_apic_get_version(idx);
941#else 937#else
942 mp_ioapics[idx].mp_apicver = 0; 938 mp_ioapics[idx].apicver = 0;
943#endif 939#endif
944 /* 940 /*
945 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 941 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
946 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 942 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
947 */ 943 */
948 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid; 944 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
949 mp_ioapic_routing[idx].gsi_base = gsi_base; 945 mp_ioapic_routing[idx].gsi_base = gsi_base;
950 mp_ioapic_routing[idx].gsi_end = gsi_base + 946 mp_ioapic_routing[idx].gsi_end = gsi_base +
951 io_apic_get_redir_entries(idx); 947 io_apic_get_redir_entries(idx);
952 948
953 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " 949 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
954 "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid, 950 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
955 mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr, 951 mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
956 mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); 952 mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
957 953
958 nr_ioapics++; 954 nr_ioapics++;
959} 955}
960 956
961static void assign_to_mp_irq(struct mp_config_intsrc *m, 957int __init acpi_probe_gsi(void)
962 struct mp_config_intsrc *mp_irq)
963{ 958{
964 memcpy(mp_irq, m, sizeof(struct mp_config_intsrc)); 959 int idx;
960 int gsi;
961 int max_gsi = 0;
962
963 if (acpi_disabled)
964 return 0;
965
966 if (!acpi_ioapic)
967 return 0;
968
969 max_gsi = 0;
970 for (idx = 0; idx < nr_ioapics; idx++) {
971 gsi = mp_ioapic_routing[idx].gsi_end;
972
973 if (gsi > max_gsi)
974 max_gsi = gsi;
975 }
976
977 return max_gsi + 1;
965} 978}
966 979
967static int mp_irq_cmp(struct mp_config_intsrc *mp_irq, 980static void assign_to_mp_irq(struct mpc_intsrc *m,
968 struct mp_config_intsrc *m) 981 struct mpc_intsrc *mp_irq)
969{ 982{
970 return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc)); 983 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
971} 984}
972 985
973static void save_mp_irq(struct mp_config_intsrc *m) 986static int mp_irq_cmp(struct mpc_intsrc *mp_irq,
987 struct mpc_intsrc *m)
988{
989 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
990}
991
992static void save_mp_irq(struct mpc_intsrc *m)
974{ 993{
975 int i; 994 int i;
976 995
@@ -988,7 +1007,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
988{ 1007{
989 int ioapic; 1008 int ioapic;
990 int pin; 1009 int pin;
991 struct mp_config_intsrc mp_irq; 1010 struct mpc_intsrc mp_irq;
992 1011
993 /* 1012 /*
994 * Convert 'gsi' to 'ioapic.pin'. 1013 * Convert 'gsi' to 'ioapic.pin'.
@@ -996,7 +1015,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
996 ioapic = mp_find_ioapic(gsi); 1015 ioapic = mp_find_ioapic(gsi);
997 if (ioapic < 0) 1016 if (ioapic < 0)
998 return; 1017 return;
999 pin = gsi - mp_ioapic_routing[ioapic].gsi_base; 1018 pin = mp_find_ioapic_pin(ioapic, gsi);
1000 1019
1001 /* 1020 /*
1002 * TBD: This check is for faulty timer entries, where the override 1021 * TBD: This check is for faulty timer entries, where the override
@@ -1006,13 +1025,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
1006 if ((bus_irq == 0) && (trigger == 3)) 1025 if ((bus_irq == 0) && (trigger == 3))
1007 trigger = 1; 1026 trigger = 1;
1008 1027
1009 mp_irq.mp_type = MP_INTSRC; 1028 mp_irq.type = MP_INTSRC;
1010 mp_irq.mp_irqtype = mp_INT; 1029 mp_irq.irqtype = mp_INT;
1011 mp_irq.mp_irqflag = (trigger << 2) | polarity; 1030 mp_irq.irqflag = (trigger << 2) | polarity;
1012 mp_irq.mp_srcbus = MP_ISA_BUS; 1031 mp_irq.srcbus = MP_ISA_BUS;
1013 mp_irq.mp_srcbusirq = bus_irq; /* IRQ */ 1032 mp_irq.srcbusirq = bus_irq; /* IRQ */
1014 mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */ 1033 mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */
1015 mp_irq.mp_dstirq = pin; /* INTIN# */ 1034 mp_irq.dstirq = pin; /* INTIN# */
1016 1035
1017 save_mp_irq(&mp_irq); 1036 save_mp_irq(&mp_irq);
1018} 1037}
@@ -1022,7 +1041,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1022 int i; 1041 int i;
1023 int ioapic; 1042 int ioapic;
1024 unsigned int dstapic; 1043 unsigned int dstapic;
1025 struct mp_config_intsrc mp_irq; 1044 struct mpc_intsrc mp_irq;
1026 1045
1027#if defined (CONFIG_MCA) || defined (CONFIG_EISA) 1046#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
1028 /* 1047 /*
@@ -1047,7 +1066,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1047 ioapic = mp_find_ioapic(0); 1066 ioapic = mp_find_ioapic(0);
1048 if (ioapic < 0) 1067 if (ioapic < 0)
1049 return; 1068 return;
1050 dstapic = mp_ioapics[ioapic].mp_apicid; 1069 dstapic = mp_ioapics[ioapic].apicid;
1051 1070
1052 /* 1071 /*
1053 * Use the default configuration for the IRQs 0-15. Unless 1072 * Use the default configuration for the IRQs 0-15. Unless
@@ -1057,16 +1076,14 @@ void __init mp_config_acpi_legacy_irqs(void)
1057 int idx; 1076 int idx;
1058 1077
1059 for (idx = 0; idx < mp_irq_entries; idx++) { 1078 for (idx = 0; idx < mp_irq_entries; idx++) {
1060 struct mp_config_intsrc *irq = mp_irqs + idx; 1079 struct mpc_intsrc *irq = mp_irqs + idx;
1061 1080
1062 /* Do we already have a mapping for this ISA IRQ? */ 1081 /* Do we already have a mapping for this ISA IRQ? */
1063 if (irq->mp_srcbus == MP_ISA_BUS 1082 if (irq->srcbus == MP_ISA_BUS && irq->srcbusirq == i)
1064 && irq->mp_srcbusirq == i)
1065 break; 1083 break;
1066 1084
1067 /* Do we already have a mapping for this IOAPIC pin */ 1085 /* Do we already have a mapping for this IOAPIC pin */
1068 if (irq->mp_dstapic == dstapic && 1086 if (irq->dstapic == dstapic && irq->dstirq == i)
1069 irq->mp_dstirq == i)
1070 break; 1087 break;
1071 } 1088 }
1072 1089
@@ -1075,13 +1092,13 @@ void __init mp_config_acpi_legacy_irqs(void)
1075 continue; /* IRQ already used */ 1092 continue; /* IRQ already used */
1076 } 1093 }
1077 1094
1078 mp_irq.mp_type = MP_INTSRC; 1095 mp_irq.type = MP_INTSRC;
1079 mp_irq.mp_irqflag = 0; /* Conforming */ 1096 mp_irq.irqflag = 0; /* Conforming */
1080 mp_irq.mp_srcbus = MP_ISA_BUS; 1097 mp_irq.srcbus = MP_ISA_BUS;
1081 mp_irq.mp_dstapic = dstapic; 1098 mp_irq.dstapic = dstapic;
1082 mp_irq.mp_irqtype = mp_INT; 1099 mp_irq.irqtype = mp_INT;
1083 mp_irq.mp_srcbusirq = i; /* Identity mapped */ 1100 mp_irq.srcbusirq = i; /* Identity mapped */
1084 mp_irq.mp_dstirq = i; 1101 mp_irq.dstirq = i;
1085 1102
1086 save_mp_irq(&mp_irq); 1103 save_mp_irq(&mp_irq);
1087 } 1104 }
@@ -1118,7 +1135,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
1118 return gsi; 1135 return gsi;
1119 } 1136 }
1120 1137
1121 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; 1138 ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
1122 1139
1123#ifdef CONFIG_X86_32 1140#ifdef CONFIG_X86_32
1124 if (ioapic_renumber_irq) 1141 if (ioapic_renumber_irq)
@@ -1192,22 +1209,22 @@ int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
1192 u32 gsi, int triggering, int polarity) 1209 u32 gsi, int triggering, int polarity)
1193{ 1210{
1194#ifdef CONFIG_X86_MPPARSE 1211#ifdef CONFIG_X86_MPPARSE
1195 struct mp_config_intsrc mp_irq; 1212 struct mpc_intsrc mp_irq;
1196 int ioapic; 1213 int ioapic;
1197 1214
1198 if (!acpi_ioapic) 1215 if (!acpi_ioapic)
1199 return 0; 1216 return 0;
1200 1217
1201 /* print the entry should happen on mptable identically */ 1218 /* print the entry should happen on mptable identically */
1202 mp_irq.mp_type = MP_INTSRC; 1219 mp_irq.type = MP_INTSRC;
1203 mp_irq.mp_irqtype = mp_INT; 1220 mp_irq.irqtype = mp_INT;
1204 mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | 1221 mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
1205 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); 1222 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
1206 mp_irq.mp_srcbus = number; 1223 mp_irq.srcbus = number;
1207 mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); 1224 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1208 ioapic = mp_find_ioapic(gsi); 1225 ioapic = mp_find_ioapic(gsi);
1209 mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id; 1226 mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
1210 mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base; 1227 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
1211 1228
1212 save_mp_irq(&mp_irq); 1229 save_mp_irq(&mp_irq);
1213#endif 1230#endif
@@ -1334,7 +1351,7 @@ static void __init acpi_process_madt(void)
1334 if (!error) { 1351 if (!error) {
1335 acpi_lapic = 1; 1352 acpi_lapic = 1;
1336 1353
1337#ifdef CONFIG_X86_GENERICARCH 1354#ifdef CONFIG_X86_BIGSMP
1338 generic_bigsmp_probe(); 1355 generic_bigsmp_probe();
1339#endif 1356#endif
1340 /* 1357 /*
@@ -1343,13 +1360,11 @@ static void __init acpi_process_madt(void)
1343 error = acpi_parse_madt_ioapic_entries(); 1360 error = acpi_parse_madt_ioapic_entries();
1344 if (!error) { 1361 if (!error) {
1345 acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; 1362 acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
1346 acpi_irq_balance_set(NULL);
1347 acpi_ioapic = 1; 1363 acpi_ioapic = 1;
1348 1364
1349 smp_found_config = 1; 1365 smp_found_config = 1;
1350#ifdef CONFIG_X86_32 1366 if (apic->setup_apic_routing)
1351 setup_apic_routing(); 1367 apic->setup_apic_routing();
1352#endif
1353 } 1368 }
1354 } 1369 }
1355 if (error == -EINVAL) { 1370 if (error == -EINVAL) {
@@ -1360,7 +1375,29 @@ static void __init acpi_process_madt(void)
1360 "Invalid BIOS MADT, disabling ACPI\n"); 1375 "Invalid BIOS MADT, disabling ACPI\n");
1361 disable_acpi(); 1376 disable_acpi();
1362 } 1377 }
1378 } else {
1379 /*
1380 * ACPI found no MADT, and so ACPI wants UP PIC mode.
1381 * In the event an MPS table was found, forget it.
1382 * Boot with "acpi=off" to use MPS on such a system.
1383 */
1384 if (smp_found_config) {
1385 printk(KERN_WARNING PREFIX
1386 "No APIC-table, disabling MPS\n");
1387 smp_found_config = 0;
1388 }
1363 } 1389 }
1390
1391 /*
1392 * ACPI supports both logical (e.g. Hyper-Threading) and physical
1393 * processors, where MPS only supports physical.
1394 */
1395 if (acpi_lapic && acpi_ioapic)
1396 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
1397 "information\n");
1398 else if (acpi_lapic)
1399 printk(KERN_INFO "Using ACPI for processor (LAPIC) "
1400 "configuration information\n");
1364#endif 1401#endif
1365 return; 1402 return;
1366} 1403}
@@ -1784,6 +1821,10 @@ static int __init parse_acpi(char *arg)
1784 disable_acpi(); 1821 disable_acpi();
1785 acpi_ht = 1; 1822 acpi_ht = 1;
1786 } 1823 }
1824 /* acpi=rsdt use RSDT instead of XSDT */
1825 else if (strcmp(arg, "rsdt") == 0) {
1826 acpi_rsdt_forced = 1;
1827 }
1787 /* "acpi=noirq" disables ACPI interrupt routing */ 1828 /* "acpi=noirq" disables ACPI interrupt routing */
1788 else if (strcmp(arg, "noirq") == 0) { 1829 else if (strcmp(arg, "noirq") == 0) {
1789 acpi_noirq_set(); 1830 acpi_noirq_set();
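
The _acpi_map_lsapic() conversion above is the standard cpumask_var_t recipe: allocate the temporary masks, snapshot cpu_present_mask, diff it after registration, and free the masks on every exit path. A minimal sketch of that pattern (the helper name and the "register" step are placeholders, not the ACPI code):

#include <linux/cpumask.h>
#include <linux/gfp.h>

/* Sketch: find which CPU number appeared after some registration step. */
static int example_find_new_cpu(void)
{
	cpumask_var_t before, added;
	int cpu = -1;

	if (!alloc_cpumask_var(&before, GFP_KERNEL))
		return -1;
	if (!alloc_cpumask_var(&added, GFP_KERNEL))
		goto free_before;

	cpumask_copy(before, cpu_present_mask);

	/* ... something marks a new CPU present here ... */

	cpumask_andnot(added, cpu_present_mask, before);
	if (!cpumask_empty(added))
		cpu = cpumask_first(added);

	free_cpumask_var(added);
free_before:
	free_cpumask_var(before);
	return cpu;
}
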
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index c2502eb9aa83..bbbe4bbb6f34 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -56,6 +56,7 @@ static struct cstate_entry *cpu_cstate_entry; /* per CPU ptr */
56static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; 56static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
57 57
58#define MWAIT_SUBSTATE_MASK (0xf) 58#define MWAIT_SUBSTATE_MASK (0xf)
59#define MWAIT_CSTATE_MASK (0xf)
59#define MWAIT_SUBSTATE_SIZE (4) 60#define MWAIT_SUBSTATE_SIZE (4)
60 61
61#define CPUID_MWAIT_LEAF (5) 62#define CPUID_MWAIT_LEAF (5)
@@ -66,39 +67,20 @@ static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
66 67
67#define NATIVE_CSTATE_BEYOND_HALT (2) 68#define NATIVE_CSTATE_BEYOND_HALT (2)
68 69
69int acpi_processor_ffh_cstate_probe(unsigned int cpu, 70static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
70 struct acpi_processor_cx *cx, struct acpi_power_register *reg)
71{ 71{
72 struct cstate_entry *percpu_entry; 72 struct acpi_processor_cx *cx = _cx;
73 struct cpuinfo_x86 *c = &cpu_data(cpu); 73 long retval;
74
75 cpumask_t saved_mask;
76 int retval;
77 unsigned int eax, ebx, ecx, edx; 74 unsigned int eax, ebx, ecx, edx;
78 unsigned int edx_part; 75 unsigned int edx_part;
79 unsigned int cstate_type; /* C-state type and not ACPI C-state type */ 76 unsigned int cstate_type; /* C-state type and not ACPI C-state type */
80 unsigned int num_cstate_subtype; 77 unsigned int num_cstate_subtype;
81 78
82 if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF )
83 return -1;
84
85 if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)
86 return -1;
87
88 percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
89 percpu_entry->states[cx->index].eax = 0;
90 percpu_entry->states[cx->index].ecx = 0;
91
92 /* Make sure we are running on right CPU */
93 saved_mask = current->cpus_allowed;
94 retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
95 if (retval)
96 return -1;
97
98 cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); 79 cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
99 80
100 /* Check whether this particular cx_type (in CST) is supported or not */ 81 /* Check whether this particular cx_type (in CST) is supported or not */
101 cstate_type = (cx->address >> MWAIT_SUBSTATE_SIZE) + 1; 82 cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) &
83 MWAIT_CSTATE_MASK) + 1;
102 edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE); 84 edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
103 num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK; 85 num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
104 86
@@ -114,21 +96,45 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
114 retval = -1; 96 retval = -1;
115 goto out; 97 goto out;
116 } 98 }
117 percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK;
118
119 /* Use the hint in CST */
120 percpu_entry->states[cx->index].eax = cx->address;
121 99
122 if (!mwait_supported[cstate_type]) { 100 if (!mwait_supported[cstate_type]) {
123 mwait_supported[cstate_type] = 1; 101 mwait_supported[cstate_type] = 1;
124 printk(KERN_DEBUG "Monitor-Mwait will be used to enter C-%d " 102 printk(KERN_DEBUG
125 "state\n", cx->type); 103 "Monitor-Mwait will be used to enter C-%d "
104 "state\n", cx->type);
126 } 105 }
127 snprintf(cx->desc, ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x", 106 snprintf(cx->desc,
128 cx->address); 107 ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x",
129 108 cx->address);
130out: 109out:
131 set_cpus_allowed_ptr(current, &saved_mask); 110 return retval;
111}
112
113int acpi_processor_ffh_cstate_probe(unsigned int cpu,
114 struct acpi_processor_cx *cx, struct acpi_power_register *reg)
115{
116 struct cstate_entry *percpu_entry;
117 struct cpuinfo_x86 *c = &cpu_data(cpu);
118 long retval;
119
120 if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF)
121 return -1;
122
123 if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)
124 return -1;
125
126 percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
127 percpu_entry->states[cx->index].eax = 0;
128 percpu_entry->states[cx->index].ecx = 0;
129
130 /* Make sure we are running on right CPU */
131
132 retval = work_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx);
133 if (retval == 0) {
134 /* Use the hint in CST */
135 percpu_entry->states[cx->index].eax = cx->address;
136 percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK;
137 }
132 return retval; 138 return retval;
133} 139}
134EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); 140EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
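
The probe path above now pushes the CPUID query to the target processor with work_on_cpu() instead of temporarily rewriting the caller's cpus_allowed mask. A small sketch of that pattern (the callback and wrapper are illustrative, not the driver's code):

#include <linux/workqueue.h>
#include <asm/processor.h>

#define EXAMPLE_MWAIT_LEAF	5	/* same CPUID leaf the code above probes */

/* Runs on the CPU handed to work_on_cpu(), so CPUID reflects that CPU. */
static long example_read_mwait_edx(void *unused)
{
	unsigned int eax, ebx, ecx, edx;

	cpuid(EXAMPLE_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
	return edx;
}

static long example_probe_mwait(unsigned int cpu)
{
	/* blocks until the callback has run on 'cpu' and returns its result */
	return work_on_cpu(cpu, example_read_mwait_edx, NULL);
}
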
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 806b4e9051b4..7c243a2c5115 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,6 +101,7 @@ int acpi_save_state_mem(void)
 	stack_start.sp = temp_stack + sizeof(temp_stack);
 	early_gdt_descr.address =
 			(unsigned long)get_cpu_gdt_table(smp_processor_id());
+	initial_gs = per_cpu_offset(smp_processor_id());
 #endif
 	initial_code = (unsigned long)wakeup_long64;
 	saved_magic = 0x123456789abcdef0;
@@ -156,6 +157,8 @@ static int __init acpi_sleep_setup(char *str)
 #ifdef CONFIG_HIBERNATION
 	if (strncmp(str, "s4_nohwsig", 10) == 0)
 		acpi_no_s4_hw_signature();
+	if (strncmp(str, "s4_nonvs", 8) == 0)
+		acpi_s4_no_nvs();
 #endif
 	if (strncmp(str, "old_ordering", 12) == 0)
 		acpi_old_suspend_ordering();
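
acpi_sleep_setup() gains an "s4_nonvs" token here; the handler is the usual __setup()-style parser that walks the option string with strncmp() and flips flags. A hedged sketch of that shape with made-up option and flag names:

#include <linux/init.h>
#include <linux/string.h>
#include <linux/types.h>

static bool example_s4_nonvs;

/* Parses "example_sleep=s4_nonvs,..." from the kernel command line. */
static int __init example_sleep_setup(char *str)
{
	while (str && *str) {
		if (strncmp(str, "s4_nonvs", 8) == 0)
			example_s4_nonvs = true;
		str = strchr(str, ',');
		if (str)
			str++;		/* step past the ',' */
	}
	return 1;
}
__setup("example_sleep=", example_sleep_setup);
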
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 331b318304eb..5113c080f0c4 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -20,10 +20,15 @@
20#include <linux/pci.h> 20#include <linux/pci.h>
21#include <linux/gfp.h> 21#include <linux/gfp.h>
22#include <linux/bitops.h> 22#include <linux/bitops.h>
23#include <linux/debugfs.h>
23#include <linux/scatterlist.h> 24#include <linux/scatterlist.h>
24#include <linux/iommu-helper.h> 25#include <linux/iommu-helper.h>
26#ifdef CONFIG_IOMMU_API
27#include <linux/iommu.h>
28#endif
25#include <asm/proto.h> 29#include <asm/proto.h>
26#include <asm/iommu.h> 30#include <asm/iommu.h>
31#include <asm/gart.h>
27#include <asm/amd_iommu_types.h> 32#include <asm/amd_iommu_types.h>
28#include <asm/amd_iommu.h> 33#include <asm/amd_iommu.h>
29 34
@@ -37,6 +42,10 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock);
37static LIST_HEAD(iommu_pd_list); 42static LIST_HEAD(iommu_pd_list);
38static DEFINE_SPINLOCK(iommu_pd_list_lock); 43static DEFINE_SPINLOCK(iommu_pd_list_lock);
39 44
45#ifdef CONFIG_IOMMU_API
46static struct iommu_ops amd_iommu_ops;
47#endif
48
40/* 49/*
41 * general struct to manage commands send to an IOMMU 50 * general struct to manage commands send to an IOMMU
42 */ 51 */
@@ -46,6 +55,68 @@ struct iommu_cmd {
46 55
47static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 56static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
48 struct unity_map_entry *e); 57 struct unity_map_entry *e);
58static struct dma_ops_domain *find_protection_domain(u16 devid);
59
60
61#ifdef CONFIG_AMD_IOMMU_STATS
62
63/*
64 * Initialization code for statistics collection
65 */
66
67DECLARE_STATS_COUNTER(compl_wait);
68DECLARE_STATS_COUNTER(cnt_map_single);
69DECLARE_STATS_COUNTER(cnt_unmap_single);
70DECLARE_STATS_COUNTER(cnt_map_sg);
71DECLARE_STATS_COUNTER(cnt_unmap_sg);
72DECLARE_STATS_COUNTER(cnt_alloc_coherent);
73DECLARE_STATS_COUNTER(cnt_free_coherent);
74DECLARE_STATS_COUNTER(cross_page);
75DECLARE_STATS_COUNTER(domain_flush_single);
76DECLARE_STATS_COUNTER(domain_flush_all);
77DECLARE_STATS_COUNTER(alloced_io_mem);
78DECLARE_STATS_COUNTER(total_map_requests);
79
80static struct dentry *stats_dir;
81static struct dentry *de_isolate;
82static struct dentry *de_fflush;
83
84static void amd_iommu_stats_add(struct __iommu_counter *cnt)
85{
86 if (stats_dir == NULL)
87 return;
88
89 cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
90 &cnt->value);
91}
92
93static void amd_iommu_stats_init(void)
94{
95 stats_dir = debugfs_create_dir("amd-iommu", NULL);
96 if (stats_dir == NULL)
97 return;
98
99 de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
100 (u32 *)&amd_iommu_isolate);
101
102 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
103 (u32 *)&amd_iommu_unmap_flush);
104
105 amd_iommu_stats_add(&compl_wait);
106 amd_iommu_stats_add(&cnt_map_single);
107 amd_iommu_stats_add(&cnt_unmap_single);
108 amd_iommu_stats_add(&cnt_map_sg);
109 amd_iommu_stats_add(&cnt_unmap_sg);
110 amd_iommu_stats_add(&cnt_alloc_coherent);
111 amd_iommu_stats_add(&cnt_free_coherent);
112 amd_iommu_stats_add(&cross_page);
113 amd_iommu_stats_add(&domain_flush_single);
114 amd_iommu_stats_add(&domain_flush_all);
115 amd_iommu_stats_add(&alloced_io_mem);
116 amd_iommu_stats_add(&total_map_requests);
117}
118
119#endif
49 120
50/* returns !0 if the IOMMU is caching non-present entries in its TLB */ 121/* returns !0 if the IOMMU is caching non-present entries in its TLB */
51static int iommu_has_npcache(struct amd_iommu *iommu) 122static int iommu_has_npcache(struct amd_iommu *iommu)
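
The DECLARE_STATS_COUNTER/amd_iommu_stats_add() machinery added above boils down to exposing u64 counters read-only through debugfs. A stripped-down sketch of that idiom (directory and counter names are illustrative, not the driver's):

#include <linux/debugfs.h>
#include <linux/types.h>

static u64 example_map_requests;
static struct dentry *example_stats_dir;

/* Creates /sys/kernel/debug/example-iommu/map_requests as a read-only u64. */
static void example_stats_init(void)
{
	example_stats_dir = debugfs_create_dir("example-iommu", NULL);
	if (!example_stats_dir)
		return;

	debugfs_create_u64("map_requests", 0444, example_stats_dir,
			   &example_map_requests);
}

/* Hot paths then just bump the counter, e.g. example_map_requests++; */
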
@@ -187,12 +258,56 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
187 258
188 spin_lock_irqsave(&iommu->lock, flags); 259 spin_lock_irqsave(&iommu->lock, flags);
189 ret = __iommu_queue_command(iommu, cmd); 260 ret = __iommu_queue_command(iommu, cmd);
261 if (!ret)
262 iommu->need_sync = true;
190 spin_unlock_irqrestore(&iommu->lock, flags); 263 spin_unlock_irqrestore(&iommu->lock, flags);
191 264
192 return ret; 265 return ret;
193} 266}
194 267
195/* 268/*
269 * This function waits until an IOMMU has completed a completion
270 * wait command
271 */
272static void __iommu_wait_for_completion(struct amd_iommu *iommu)
273{
274 int ready = 0;
275 unsigned status = 0;
276 unsigned long i = 0;
277
278 INC_STATS_COUNTER(compl_wait);
279
280 while (!ready && (i < EXIT_LOOP_COUNT)) {
281 ++i;
282 /* wait for the bit to become one */
283 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
284 ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
285 }
286
287 /* set bit back to zero */
288 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
289 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
290
291 if (unlikely(i == EXIT_LOOP_COUNT))
292 panic("AMD IOMMU: Completion wait loop failed\n");
293}
294
295/*
296 * This function queues a completion wait command into the command
297 * buffer of an IOMMU
298 */
299static int __iommu_completion_wait(struct amd_iommu *iommu)
300{
301 struct iommu_cmd cmd;
302
303 memset(&cmd, 0, sizeof(cmd));
304 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
305 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
306
307 return __iommu_queue_command(iommu, &cmd);
308}
309
310/*
196 * This function is called whenever we need to ensure that the IOMMU has 311 * This function is called whenever we need to ensure that the IOMMU has
197 * completed execution of all commands we sent. It sends a 312 * completed execution of all commands we sent. It sends a
198 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs 313 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
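
The __iommu_wait_for_completion() helper factored out above is a bounded MMIO poll: spin on a status bit with readl(), then clear it with writel(). A generic sketch of that loop (register offset, bit mask, and loop limit are placeholders, not the AMD IOMMU layout):

#include <linux/io.h>
#include <linux/types.h>

#define EX_STATUS_OFFSET	0x2020
#define EX_DONE_MASK		0x01
#define EX_LOOP_LIMIT		2000000UL

/* Returns true once the status bit was seen and acknowledged. */
static bool example_wait_done(void __iomem *mmio_base)
{
	unsigned long i;
	u32 status = 0;

	for (i = 0; i < EX_LOOP_LIMIT; i++) {
		status = readl(mmio_base + EX_STATUS_OFFSET);
		if (status & EX_DONE_MASK)
			break;
	}

	if (!(status & EX_DONE_MASK))
		return false;

	/* clear the bit again, as the driver does before returning */
	writel(status & ~EX_DONE_MASK, mmio_base + EX_STATUS_OFFSET);
	return true;
}
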
@@ -201,37 +316,23 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
201 */ 316 */
202static int iommu_completion_wait(struct amd_iommu *iommu) 317static int iommu_completion_wait(struct amd_iommu *iommu)
203{ 318{
204 int ret = 0, ready = 0; 319 int ret = 0;
205 unsigned status = 0; 320 unsigned long flags;
206 struct iommu_cmd cmd;
207 unsigned long flags, i = 0;
208 321
209 memset(&cmd, 0, sizeof(cmd)); 322 spin_lock_irqsave(&iommu->lock, flags);
210 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
211 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
212 323
213 iommu->need_sync = 0; 324 if (!iommu->need_sync)
325 goto out;
214 326
215 spin_lock_irqsave(&iommu->lock, flags); 327 ret = __iommu_completion_wait(iommu);
216 328
217 ret = __iommu_queue_command(iommu, &cmd); 329 iommu->need_sync = false;
218 330
219 if (ret) 331 if (ret)
220 goto out; 332 goto out;
221 333
222 while (!ready && (i < EXIT_LOOP_COUNT)) { 334 __iommu_wait_for_completion(iommu);
223 ++i;
224 /* wait for the bit to become one */
225 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
226 ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
227 }
228
229 /* set bit back to zero */
230 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
231 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
232 335
233 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
234 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
235out: 336out:
236 spin_unlock_irqrestore(&iommu->lock, flags); 337 spin_unlock_irqrestore(&iommu->lock, flags);
237 338
@@ -254,11 +355,24 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
254 355
255 ret = iommu_queue_command(iommu, &cmd); 356 ret = iommu_queue_command(iommu, &cmd);
256 357
257 iommu->need_sync = 1;
258
259 return ret; 358 return ret;
260} 359}
261 360
361static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
362 u16 domid, int pde, int s)
363{
364 memset(cmd, 0, sizeof(*cmd));
365 address &= PAGE_MASK;
366 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
367 cmd->data[1] |= domid;
368 cmd->data[2] = lower_32_bits(address);
369 cmd->data[3] = upper_32_bits(address);
370 if (s) /* size bit - we flush more than one 4kb page */
371 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
372 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
373 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
374}
375
262/* 376/*
263 * Generic command send function for invalidaing TLB entries 377 * Generic command send function for invalidaing TLB entries
264 */ 378 */
@@ -268,21 +382,10 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
268 struct iommu_cmd cmd; 382 struct iommu_cmd cmd;
269 int ret; 383 int ret;
270 384
271 memset(&cmd, 0, sizeof(cmd)); 385 __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s);
272 address &= PAGE_MASK;
273 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
274 cmd.data[1] |= domid;
275 cmd.data[2] = lower_32_bits(address);
276 cmd.data[3] = upper_32_bits(address);
277 if (s) /* size bit - we flush more than one 4kb page */
278 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
279 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
280 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
281 386
282 ret = iommu_queue_command(iommu, &cmd); 387 ret = iommu_queue_command(iommu, &cmd);
283 388
284 iommu->need_sync = 1;
285
286 return ret; 389 return ret;
287} 390}
288 391
@@ -318,9 +421,35 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
318{ 421{
319 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 422 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
320 423
424 INC_STATS_COUNTER(domain_flush_single);
425
321 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); 426 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
322} 427}
323 428
429/*
430 * This function is used to flush the IO/TLB for a given protection domain
431 * on every IOMMU in the system
432 */
433static void iommu_flush_domain(u16 domid)
434{
435 unsigned long flags;
436 struct amd_iommu *iommu;
437 struct iommu_cmd cmd;
438
439 INC_STATS_COUNTER(domain_flush_all);
440
441 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
442 domid, 1, 1);
443
444 list_for_each_entry(iommu, &amd_iommu_list, list) {
445 spin_lock_irqsave(&iommu->lock, flags);
446 __iommu_queue_command(iommu, &cmd);
447 __iommu_completion_wait(iommu);
448 __iommu_wait_for_completion(iommu);
449 spin_unlock_irqrestore(&iommu->lock, flags);
450 }
451}
452
324/**************************************************************************** 453/****************************************************************************
325 * 454 *
326 * The functions below are used the create the page table mappings for 455 * The functions below are used the create the page table mappings for
@@ -335,15 +464,15 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
335 * supporting all features of AMD IOMMU page tables like level skipping 464 * supporting all features of AMD IOMMU page tables like level skipping
336 * and full 64 bit address spaces. 465 * and full 64 bit address spaces.
337 */ 466 */
338static int iommu_map(struct protection_domain *dom, 467static int iommu_map_page(struct protection_domain *dom,
339 unsigned long bus_addr, 468 unsigned long bus_addr,
340 unsigned long phys_addr, 469 unsigned long phys_addr,
341 int prot) 470 int prot)
342{ 471{
343 u64 __pte, *pte, *page; 472 u64 __pte, *pte, *page;
344 473
345 bus_addr = PAGE_ALIGN(bus_addr); 474 bus_addr = PAGE_ALIGN(bus_addr);
346 phys_addr = PAGE_ALIGN(bus_addr); 475 phys_addr = PAGE_ALIGN(phys_addr);
347 476
348 /* only support 512GB address spaces for now */ 477 /* only support 512GB address spaces for now */
349 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) 478 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
@@ -385,6 +514,28 @@ static int iommu_map(struct protection_domain *dom,
385 return 0; 514 return 0;
386} 515}
387 516
517static void iommu_unmap_page(struct protection_domain *dom,
518 unsigned long bus_addr)
519{
520 u64 *pte;
521
522 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
523
524 if (!IOMMU_PTE_PRESENT(*pte))
525 return;
526
527 pte = IOMMU_PTE_PAGE(*pte);
528 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
529
530 if (!IOMMU_PTE_PRESENT(*pte))
531 return;
532
533 pte = IOMMU_PTE_PAGE(*pte);
534 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
535
536 *pte = 0;
537}
538
388/* 539/*
389 * This function checks if a specific unity mapping entry is needed for 540 * This function checks if a specific unity mapping entry is needed for
390 * this specific IOMMU. 541 * this specific IOMMU.
@@ -437,7 +588,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
437 588
438 for (addr = e->address_start; addr < e->address_end; 589 for (addr = e->address_start; addr < e->address_end;
439 addr += PAGE_SIZE) { 590 addr += PAGE_SIZE) {
440 ret = iommu_map(&dma_dom->domain, addr, addr, e->prot); 591 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot);
441 if (ret) 592 if (ret)
442 return ret; 593 return ret;
443 /* 594 /*
@@ -537,7 +688,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
537 address >>= PAGE_SHIFT; 688 address >>= PAGE_SHIFT;
538 iommu_area_free(dom->bitmap, address, pages); 689 iommu_area_free(dom->bitmap, address, pages);
539 690
540 if (address + pages >= dom->next_bit) 691 if (address >= dom->next_bit)
541 dom->need_flush = true; 692 dom->need_flush = true;
542} 693}
543 694
@@ -568,6 +719,16 @@ static u16 domain_id_alloc(void)
568 return id; 719 return id;
569} 720}
570 721
722static void domain_id_free(int id)
723{
724 unsigned long flags;
725
726 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
727 if (id > 0 && id < MAX_DOMAIN_ID)
728 __clear_bit(id, amd_iommu_pd_alloc_bitmap);
729 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
730}
731
571/* 732/*
572 * Used to reserve address ranges in the aperture (e.g. for exclusion 733 * Used to reserve address ranges in the aperture (e.g. for exclusion
573 * ranges. 734 * ranges.
@@ -584,12 +745,12 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
584 iommu_area_reserve(dom->bitmap, start_page, pages); 745 iommu_area_reserve(dom->bitmap, start_page, pages);
585} 746}
586 747
587static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) 748static void free_pagetable(struct protection_domain *domain)
588{ 749{
589 int i, j; 750 int i, j;
590 u64 *p1, *p2, *p3; 751 u64 *p1, *p2, *p3;
591 752
592 p1 = dma_dom->domain.pt_root; 753 p1 = domain->pt_root;
593 754
594 if (!p1) 755 if (!p1)
595 return; 756 return;
@@ -599,7 +760,7 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
599 continue; 760 continue;
600 761
601 p2 = IOMMU_PTE_PAGE(p1[i]); 762 p2 = IOMMU_PTE_PAGE(p1[i]);
602 for (j = 0; j < 512; ++i) { 763 for (j = 0; j < 512; ++j) {
603 if (!IOMMU_PTE_PRESENT(p2[j])) 764 if (!IOMMU_PTE_PRESENT(p2[j]))
604 continue; 765 continue;
605 p3 = IOMMU_PTE_PAGE(p2[j]); 766 p3 = IOMMU_PTE_PAGE(p2[j]);
@@ -610,6 +771,8 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
610 } 771 }
611 772
612 free_page((unsigned long)p1); 773 free_page((unsigned long)p1);
774
775 domain->pt_root = NULL;
613} 776}
614 777
615/* 778/*
@@ -621,7 +784,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
621 if (!dom) 784 if (!dom)
622 return; 785 return;
623 786
624 dma_ops_free_pagetable(dom); 787 free_pagetable(&dom->domain);
625 788
626 kfree(dom->pte_pages); 789 kfree(dom->pte_pages);
627 790
@@ -660,6 +823,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
660 goto free_dma_dom; 823 goto free_dma_dom;
661 dma_dom->domain.mode = PAGE_MODE_3_LEVEL; 824 dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
662 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); 825 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
826 dma_dom->domain.flags = PD_DMA_OPS_MASK;
663 dma_dom->domain.priv = dma_dom; 827 dma_dom->domain.priv = dma_dom;
664 if (!dma_dom->domain.pt_root) 828 if (!dma_dom->domain.pt_root)
665 goto free_dma_dom; 829 goto free_dma_dom;
@@ -722,6 +886,15 @@ free_dma_dom:
722} 886}
723 887
724/* 888/*
889 * little helper function to check whether a given protection domain is a
890 * dma_ops domain
891 */
892static bool dma_ops_domain(struct protection_domain *domain)
893{
894 return domain->flags & PD_DMA_OPS_MASK;
895}
896
897/*
725 * Find out the protection domain structure for a given PCI device. This 898 * Find out the protection domain structure for a given PCI device. This
726 * will give us the pointer to the page table root for example. 899 * will give us the pointer to the page table root for example.
727 */ 900 */
@@ -741,14 +914,15 @@ static struct protection_domain *domain_for_device(u16 devid)
741 * If a device is not yet associated with a domain, this function does 914 * If a device is not yet associated with a domain, this function does
742 * assigns it visible for the hardware 915 * assigns it visible for the hardware
743 */ 916 */
744static void set_device_domain(struct amd_iommu *iommu, 917static void attach_device(struct amd_iommu *iommu,
745 struct protection_domain *domain, 918 struct protection_domain *domain,
746 u16 devid) 919 u16 devid)
747{ 920{
748 unsigned long flags; 921 unsigned long flags;
749
750 u64 pte_root = virt_to_phys(domain->pt_root); 922 u64 pte_root = virt_to_phys(domain->pt_root);
751 923
924 domain->dev_cnt += 1;
925
752 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) 926 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
753 << DEV_ENTRY_MODE_SHIFT; 927 << DEV_ENTRY_MODE_SHIFT;
754 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; 928 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
@@ -762,10 +936,118 @@ static void set_device_domain(struct amd_iommu *iommu,
762 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 936 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
763 937
764 iommu_queue_inv_dev_entry(iommu, devid); 938 iommu_queue_inv_dev_entry(iommu, devid);
939}
940
941/*
942 * Removes a device from a protection domain (unlocked)
943 */
944static void __detach_device(struct protection_domain *domain, u16 devid)
945{
946
947 /* lock domain */
948 spin_lock(&domain->lock);
949
950 /* remove domain from the lookup table */
951 amd_iommu_pd_table[devid] = NULL;
952
953 /* remove entry from the device table seen by the hardware */
954 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
955 amd_iommu_dev_table[devid].data[1] = 0;
956 amd_iommu_dev_table[devid].data[2] = 0;
957
958 /* decrease reference counter */
959 domain->dev_cnt -= 1;
960
961 /* ready */
962 spin_unlock(&domain->lock);
963}
964
965/*
966 * Removes a device from a protection domain (with devtable_lock held)
967 */
968static void detach_device(struct protection_domain *domain, u16 devid)
969{
970 unsigned long flags;
971
972 /* lock device table */
973 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
974 __detach_device(domain, devid);
975 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
976}
977
978static int device_change_notifier(struct notifier_block *nb,
979 unsigned long action, void *data)
980{
981 struct device *dev = data;
982 struct pci_dev *pdev = to_pci_dev(dev);
983 u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
984 struct protection_domain *domain;
985 struct dma_ops_domain *dma_domain;
986 struct amd_iommu *iommu;
987 int order = amd_iommu_aperture_order;
988 unsigned long flags;
989
990 if (devid > amd_iommu_last_bdf)
991 goto out;
992
993 devid = amd_iommu_alias_table[devid];
994
995 iommu = amd_iommu_rlookup_table[devid];
996 if (iommu == NULL)
997 goto out;
998
999 domain = domain_for_device(devid);
1000
1001 if (domain && !dma_ops_domain(domain))
1002 WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound "
1003 "to a non-dma-ops domain\n", dev_name(dev));
1004
1005 switch (action) {
1006 case BUS_NOTIFY_BOUND_DRIVER:
1007 if (domain)
1008 goto out;
1009 dma_domain = find_protection_domain(devid);
1010 if (!dma_domain)
1011 dma_domain = iommu->default_dom;
1012 attach_device(iommu, &dma_domain->domain, devid);
1013 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
1014 "device %s\n", dma_domain->domain.id, dev_name(dev));
1015 break;
1016 case BUS_NOTIFY_UNBIND_DRIVER:
1017 if (!domain)
1018 goto out;
1019 detach_device(domain, devid);
1020 break;
1021 case BUS_NOTIFY_ADD_DEVICE:
1022 /* allocate a protection domain if a device is added */
1023 dma_domain = find_protection_domain(devid);
1024 if (dma_domain)
1025 goto out;
1026 dma_domain = dma_ops_domain_alloc(iommu, order);
1027 if (!dma_domain)
1028 goto out;
1029 dma_domain->target_dev = devid;
1030
1031 spin_lock_irqsave(&iommu_pd_list_lock, flags);
1032 list_add_tail(&dma_domain->list, &iommu_pd_list);
1033 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1034
1035 break;
1036 default:
1037 goto out;
1038 }
1039
1040 iommu_queue_inv_dev_entry(iommu, devid);
1041 iommu_completion_wait(iommu);
765 1042
766 iommu->need_sync = 1; 1043out:
1044 return 0;
767} 1045}
768 1046
1047struct notifier_block device_nb = {
1048 .notifier_call = device_change_notifier,
1049};
1050
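The notifier above is hooked into the PCI bus later in this patch via bus_register_notifier(); a self-contained sketch of that pattern (example names only) looks like this:

	#include <linux/device.h>
	#include <linux/init.h>
	#include <linux/notifier.h>
	#include <linux/pci.h>

	static int example_notifier(struct notifier_block *nb,
				    unsigned long action, void *data)
	{
		struct device *dev = data;

		/* The driver core reports device/driver events as BUS_NOTIFY_* actions. */
		if (action == BUS_NOTIFY_BOUND_DRIVER)
			dev_info(dev, "driver bound\n");

		return 0;
	}

	static struct notifier_block example_nb = {
		.notifier_call = example_notifier,
	};

	static int __init example_init(void)
	{
		return bus_register_notifier(&pci_bus_type, &example_nb);
	}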
769/***************************************************************************** 1051/*****************************************************************************
770 * 1052 *
771 * The next functions belong to the dma_ops mapping/unmapping code. 1053 * The next functions belong to the dma_ops mapping/unmapping code.
@@ -801,7 +1083,6 @@ static struct dma_ops_domain *find_protection_domain(u16 devid)
801 list_for_each_entry(entry, &iommu_pd_list, list) { 1083 list_for_each_entry(entry, &iommu_pd_list, list) {
802 if (entry->target_dev == devid) { 1084 if (entry->target_dev == devid) {
803 ret = entry; 1085 ret = entry;
804 list_del(&ret->list);
805 break; 1086 break;
806 } 1087 }
807 } 1088 }
@@ -852,12 +1133,14 @@ static int get_device_resources(struct device *dev,
852 if (!dma_dom) 1133 if (!dma_dom)
853 dma_dom = (*iommu)->default_dom; 1134 dma_dom = (*iommu)->default_dom;
854 *domain = &dma_dom->domain; 1135 *domain = &dma_dom->domain;
855 set_device_domain(*iommu, *domain, *bdf); 1136 attach_device(*iommu, *domain, *bdf);
856 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " 1137 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
857 "device ", (*domain)->id); 1138 "device %s\n", (*domain)->id, dev_name(dev));
858 print_devid(_bdf, 1);
859 } 1139 }
860 1140
1141 if (domain_for_device(_bdf) == NULL)
1142 attach_device(*iommu, *domain, _bdf);
1143
861 return 1; 1144 return 1;
862} 1145}
863 1146
@@ -908,7 +1191,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
908 if (address >= dom->aperture_size) 1191 if (address >= dom->aperture_size)
909 return; 1192 return;
910 1193
911 WARN_ON(address & 0xfffULL || address > dom->aperture_size); 1194 WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size);
912 1195
913 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; 1196 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
914 pte += IOMMU_PTE_L0_INDEX(address); 1197 pte += IOMMU_PTE_L0_INDEX(address);
@@ -920,8 +1203,8 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
920 1203
921/* 1204/*
922 * This function contains common code for mapping of a physically 1205 * This function contains common code for mapping of a physically
923 * contiguous memory region into DMA address space. It is uses by all 1206 * contiguous memory region into DMA address space. It is used by all
924 * mapping functions provided by this IOMMU driver. 1207 * mapping functions provided with this IOMMU driver.
925 * Must be called with the domain lock held. 1208 * Must be called with the domain lock held.
926 */ 1209 */
927static dma_addr_t __map_single(struct device *dev, 1210static dma_addr_t __map_single(struct device *dev,
@@ -942,6 +1225,11 @@ static dma_addr_t __map_single(struct device *dev,
942 pages = iommu_num_pages(paddr, size, PAGE_SIZE); 1225 pages = iommu_num_pages(paddr, size, PAGE_SIZE);
943 paddr &= PAGE_MASK; 1226 paddr &= PAGE_MASK;
944 1227
1228 INC_STATS_COUNTER(total_map_requests);
1229
1230 if (pages > 1)
1231 INC_STATS_COUNTER(cross_page);
1232
945 if (align) 1233 if (align)
946 align_mask = (1UL << get_order(size)) - 1; 1234 align_mask = (1UL << get_order(size)) - 1;
947 1235
@@ -958,6 +1246,8 @@ static dma_addr_t __map_single(struct device *dev,
958 } 1246 }
959 address += offset; 1247 address += offset;
960 1248
1249 ADD_STATS_COUNTER(alloced_io_mem, size);
1250
961 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 1251 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
962 iommu_flush_tlb(iommu, dma_dom->domain.id); 1252 iommu_flush_tlb(iommu, dma_dom->domain.id);
963 dma_dom->need_flush = false; 1253 dma_dom->need_flush = false;
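The INC_STATS_COUNTER()/ADD_STATS_COUNTER() calls introduced here belong to new statistics infrastructure whose definitions are outside this hunk; a minimal sketch of what such counters could look like is given below. All names in the sketch are assumptions for illustration; the real counters are presumably exported by amd_iommu_stats_init(), e.g. through debugfs.

	#include <linux/types.h>

	struct example_iommu_counter {
		char	*name;
		u64	value;
	};

	#define DECLARE_EXAMPLE_COUNTER(nm)			\
		static struct example_iommu_counter nm = {	\
			.name = #nm,				\
		}

	#define INC_EXAMPLE_COUNTER(nm)		((nm).value += 1)
	#define ADD_EXAMPLE_COUNTER(nm, x)	((nm).value += (x))
	#define SUB_EXAMPLE_COUNTER(nm, x)	((nm).value -= (x))

	/* One counter per event, for example: */
	DECLARE_EXAMPLE_COUNTER(total_map_requests);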
@@ -981,7 +1271,8 @@ static void __unmap_single(struct amd_iommu *iommu,
981 dma_addr_t i, start; 1271 dma_addr_t i, start;
982 unsigned int pages; 1272 unsigned int pages;
983 1273
984 if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) 1274 if ((dma_addr == bad_dma_address) ||
1275 (dma_addr + size > dma_dom->aperture_size))
985 return; 1276 return;
986 1277
987 pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); 1278 pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
@@ -993,6 +1284,8 @@ static void __unmap_single(struct amd_iommu *iommu,
993 start += PAGE_SIZE; 1284 start += PAGE_SIZE;
994 } 1285 }
995 1286
1287 SUB_STATS_COUNTER(alloced_io_mem, size);
1288
996 dma_ops_free_addresses(dma_dom, dma_addr, pages); 1289 dma_ops_free_addresses(dma_dom, dma_addr, pages);
997 1290
998 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 1291 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
@@ -1014,6 +1307,8 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
1014 dma_addr_t addr; 1307 dma_addr_t addr;
1015 u64 dma_mask; 1308 u64 dma_mask;
1016 1309
1310 INC_STATS_COUNTER(cnt_map_single);
1311
1017 if (!check_device(dev)) 1312 if (!check_device(dev))
1018 return bad_dma_address; 1313 return bad_dma_address;
1019 1314
@@ -1025,14 +1320,16 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
1025 /* device not handled by any AMD IOMMU */ 1320 /* device not handled by any AMD IOMMU */
1026 return (dma_addr_t)paddr; 1321 return (dma_addr_t)paddr;
1027 1322
1323 if (!dma_ops_domain(domain))
1324 return bad_dma_address;
1325
1028 spin_lock_irqsave(&domain->lock, flags); 1326 spin_lock_irqsave(&domain->lock, flags);
1029 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, 1327 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
1030 dma_mask); 1328 dma_mask);
1031 if (addr == bad_dma_address) 1329 if (addr == bad_dma_address)
1032 goto out; 1330 goto out;
1033 1331
1034 if (unlikely(iommu->need_sync)) 1332 iommu_completion_wait(iommu);
1035 iommu_completion_wait(iommu);
1036 1333
1037out: 1334out:
1038 spin_unlock_irqrestore(&domain->lock, flags); 1335 spin_unlock_irqrestore(&domain->lock, flags);
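The new dma_ops_domain() checks guard the dma_ops paths against domains created through the generic IOMMU interface further below. Given that this patch marks the default domains with PD_DEFAULT_MASK in domain->flags, the helper plausibly reduces to a flag test along these lines (the flag name is an assumption, and the sketch relies on the patch's struct protection_domain):

	/* Sketch only: true if the protection domain is owned by the dma_ops
	 * mapping code rather than handed out via the iommu_ops interface. */
	static bool example_dma_ops_domain(struct protection_domain *domain)
	{
		return domain->flags & PD_DMA_OPS_MASK;
	}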
@@ -1051,17 +1348,21 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
1051 struct protection_domain *domain; 1348 struct protection_domain *domain;
1052 u16 devid; 1349 u16 devid;
1053 1350
1351 INC_STATS_COUNTER(cnt_unmap_single);
1352
1054 if (!check_device(dev) || 1353 if (!check_device(dev) ||
1055 !get_device_resources(dev, &iommu, &domain, &devid)) 1354 !get_device_resources(dev, &iommu, &domain, &devid))
1056 /* device not handled by any AMD IOMMU */ 1355 /* device not handled by any AMD IOMMU */
1057 return; 1356 return;
1058 1357
1358 if (!dma_ops_domain(domain))
1359 return;
1360
1059 spin_lock_irqsave(&domain->lock, flags); 1361 spin_lock_irqsave(&domain->lock, flags);
1060 1362
1061 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1363 __unmap_single(iommu, domain->priv, dma_addr, size, dir);
1062 1364
1063 if (unlikely(iommu->need_sync)) 1365 iommu_completion_wait(iommu);
1064 iommu_completion_wait(iommu);
1065 1366
1066 spin_unlock_irqrestore(&domain->lock, flags); 1367 spin_unlock_irqrestore(&domain->lock, flags);
1067} 1368}
@@ -1101,6 +1402,8 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1101 int mapped_elems = 0; 1402 int mapped_elems = 0;
1102 u64 dma_mask; 1403 u64 dma_mask;
1103 1404
1405 INC_STATS_COUNTER(cnt_map_sg);
1406
1104 if (!check_device(dev)) 1407 if (!check_device(dev))
1105 return 0; 1408 return 0;
1106 1409
@@ -1111,6 +1414,9 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1111 if (!iommu || !domain) 1414 if (!iommu || !domain)
1112 return map_sg_no_iommu(dev, sglist, nelems, dir); 1415 return map_sg_no_iommu(dev, sglist, nelems, dir);
1113 1416
1417 if (!dma_ops_domain(domain))
1418 return 0;
1419
1114 spin_lock_irqsave(&domain->lock, flags); 1420 spin_lock_irqsave(&domain->lock, flags);
1115 1421
1116 for_each_sg(sglist, s, nelems, i) { 1422 for_each_sg(sglist, s, nelems, i) {
@@ -1127,8 +1433,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1127 goto unmap; 1433 goto unmap;
1128 } 1434 }
1129 1435
1130 if (unlikely(iommu->need_sync)) 1436 iommu_completion_wait(iommu);
1131 iommu_completion_wait(iommu);
1132 1437
1133out: 1438out:
1134 spin_unlock_irqrestore(&domain->lock, flags); 1439 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1161,10 +1466,15 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
1161 u16 devid; 1466 u16 devid;
1162 int i; 1467 int i;
1163 1468
1469 INC_STATS_COUNTER(cnt_unmap_sg);
1470
1164 if (!check_device(dev) || 1471 if (!check_device(dev) ||
1165 !get_device_resources(dev, &iommu, &domain, &devid)) 1472 !get_device_resources(dev, &iommu, &domain, &devid))
1166 return; 1473 return;
1167 1474
1475 if (!dma_ops_domain(domain))
1476 return;
1477
1168 spin_lock_irqsave(&domain->lock, flags); 1478 spin_lock_irqsave(&domain->lock, flags);
1169 1479
1170 for_each_sg(sglist, s, nelems, i) { 1480 for_each_sg(sglist, s, nelems, i) {
@@ -1173,8 +1483,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
1173 s->dma_address = s->dma_length = 0; 1483 s->dma_address = s->dma_length = 0;
1174 } 1484 }
1175 1485
1176 if (unlikely(iommu->need_sync)) 1486 iommu_completion_wait(iommu);
1177 iommu_completion_wait(iommu);
1178 1487
1179 spin_unlock_irqrestore(&domain->lock, flags); 1488 spin_unlock_irqrestore(&domain->lock, flags);
1180} 1489}
@@ -1193,6 +1502,8 @@ static void *alloc_coherent(struct device *dev, size_t size,
1193 phys_addr_t paddr; 1502 phys_addr_t paddr;
1194 u64 dma_mask = dev->coherent_dma_mask; 1503 u64 dma_mask = dev->coherent_dma_mask;
1195 1504
1505 INC_STATS_COUNTER(cnt_alloc_coherent);
1506
1196 if (!check_device(dev)) 1507 if (!check_device(dev))
1197 return NULL; 1508 return NULL;
1198 1509
@@ -1211,6 +1522,9 @@ static void *alloc_coherent(struct device *dev, size_t size,
1211 return virt_addr; 1522 return virt_addr;
1212 } 1523 }
1213 1524
1525 if (!dma_ops_domain(domain))
1526 goto out_free;
1527
1214 if (!dma_mask) 1528 if (!dma_mask)
1215 dma_mask = *dev->dma_mask; 1529 dma_mask = *dev->dma_mask;
1216 1530
@@ -1219,19 +1533,20 @@ static void *alloc_coherent(struct device *dev, size_t size,
1219 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 1533 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
1220 size, DMA_BIDIRECTIONAL, true, dma_mask); 1534 size, DMA_BIDIRECTIONAL, true, dma_mask);
1221 1535
1222 if (*dma_addr == bad_dma_address) { 1536 if (*dma_addr == bad_dma_address)
1223 free_pages((unsigned long)virt_addr, get_order(size)); 1537 goto out_free;
1224 virt_addr = NULL;
1225 goto out;
1226 }
1227 1538
1228 if (unlikely(iommu->need_sync)) 1539 iommu_completion_wait(iommu);
1229 iommu_completion_wait(iommu);
1230 1540
1231out:
1232 spin_unlock_irqrestore(&domain->lock, flags); 1541 spin_unlock_irqrestore(&domain->lock, flags);
1233 1542
1234 return virt_addr; 1543 return virt_addr;
1544
1545out_free:
1546
1547 free_pages((unsigned long)virt_addr, get_order(size));
1548
1549 return NULL;
1235} 1550}
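alloc_coherent()/free_coherent() above are reached through the generic DMA API once dma_ops points at amd_iommu_dma_ops; a typical driver-side call that exercises them might look like this (device and sizes are illustrative):

	#include <linux/dma-mapping.h>
	#include <linux/pci.h>

	static int example_alloc_ring(struct pci_dev *pdev)
	{
		dma_addr_t ring_dma;
		void *ring;

		ring = dma_alloc_coherent(&pdev->dev, 4096, &ring_dma, GFP_KERNEL);
		if (!ring)
			return -ENOMEM;

		/* ... program ring_dma into the device, use 'ring' from the CPU ... */

		dma_free_coherent(&pdev->dev, 4096, ring, ring_dma);
		return 0;
	}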
1236 1551
1237/* 1552/*
@@ -1245,6 +1560,8 @@ static void free_coherent(struct device *dev, size_t size,
1245 struct protection_domain *domain; 1560 struct protection_domain *domain;
1246 u16 devid; 1561 u16 devid;
1247 1562
1563 INC_STATS_COUNTER(cnt_free_coherent);
1564
1248 if (!check_device(dev)) 1565 if (!check_device(dev))
1249 return; 1566 return;
1250 1567
@@ -1253,12 +1570,14 @@ static void free_coherent(struct device *dev, size_t size,
1253 if (!iommu || !domain) 1570 if (!iommu || !domain)
1254 goto free_mem; 1571 goto free_mem;
1255 1572
1573 if (!dma_ops_domain(domain))
1574 goto free_mem;
1575
1256 spin_lock_irqsave(&domain->lock, flags); 1576 spin_lock_irqsave(&domain->lock, flags);
1257 1577
1258 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 1578 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
1259 1579
1260 if (unlikely(iommu->need_sync)) 1580 iommu_completion_wait(iommu);
1261 iommu_completion_wait(iommu);
1262 1581
1263 spin_unlock_irqrestore(&domain->lock, flags); 1582 spin_unlock_irqrestore(&domain->lock, flags);
1264 1583
@@ -1297,7 +1616,7 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask)
1297 * we don't need to preallocate the protection domains anymore. 1616 * we don't need to preallocate the protection domains anymore.
1298 * For now we have to. 1617 * For now we have to.
1299 */ 1618 */
1300void prealloc_protection_domains(void) 1619static void prealloc_protection_domains(void)
1301{ 1620{
1302 struct pci_dev *dev = NULL; 1621 struct pci_dev *dev = NULL;
1303 struct dma_ops_domain *dma_dom; 1622 struct dma_ops_domain *dma_dom;
@@ -1306,7 +1625,7 @@ void prealloc_protection_domains(void)
1306 u16 devid; 1625 u16 devid;
1307 1626
1308 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1627 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
1309 devid = (dev->bus->number << 8) | dev->devfn; 1628 devid = calc_devid(dev->bus->number, dev->devfn);
1310 if (devid > amd_iommu_last_bdf) 1629 if (devid > amd_iommu_last_bdf)
1311 continue; 1630 continue;
1312 devid = amd_iommu_alias_table[devid]; 1631 devid = amd_iommu_alias_table[devid];
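calc_devid() replaces the open-coded '(dev->bus->number << 8) | dev->devfn' shown on the left-hand side, so its effect is to pack a PCI bus/devfn pair into the 16-bit requester ID; an equivalent sketch:

	#include <linux/types.h>

	static inline u16 example_calc_devid(u8 bus, u8 devfn)
	{
		/* bus number in the high byte, devfn (slot/function) in the low byte */
		return ((u16)bus << 8) | devfn;
	}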
@@ -1353,6 +1672,7 @@ int __init amd_iommu_init_dma_ops(void)
1353 iommu->default_dom = dma_ops_domain_alloc(iommu, order); 1672 iommu->default_dom = dma_ops_domain_alloc(iommu, order);
1354 if (iommu->default_dom == NULL) 1673 if (iommu->default_dom == NULL)
1355 return -ENOMEM; 1674 return -ENOMEM;
1675 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
1356 ret = iommu_init_unity_mappings(iommu); 1676 ret = iommu_init_unity_mappings(iommu);
1357 if (ret) 1677 if (ret)
1358 goto free_domains; 1678 goto free_domains;
@@ -1376,6 +1696,12 @@ int __init amd_iommu_init_dma_ops(void)
1376 /* Make the driver finally visible to the drivers */ 1696 /* Make the driver finally visible to the drivers */
1377 dma_ops = &amd_iommu_dma_ops; 1697 dma_ops = &amd_iommu_dma_ops;
1378 1698
1699 register_iommu(&amd_iommu_ops);
1700
1701 bus_register_notifier(&pci_bus_type, &device_nb);
1702
1703 amd_iommu_stats_init();
1704
1379 return 0; 1705 return 0;
1380 1706
1381free_domains: 1707free_domains:
@@ -1387,3 +1713,224 @@ free_domains:
1387 1713
1388 return ret; 1714 return ret;
1389} 1715}
1716
1717/*****************************************************************************
1718 *
1719 * The following functions belong to the exported interface of AMD IOMMU
1720 *
1721 * This interface allows access to lower level functions of the IOMMU
1722 * like protection domain handling and assignment of devices to domains
1723 * which is not possible with the dma_ops interface.
1724 *
1725 *****************************************************************************/
1726
1727static void cleanup_domain(struct protection_domain *domain)
1728{
1729 unsigned long flags;
1730 u16 devid;
1731
1732 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1733
1734 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid)
1735 if (amd_iommu_pd_table[devid] == domain)
1736 __detach_device(domain, devid);
1737
1738 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1739}
1740
1741static int amd_iommu_domain_init(struct iommu_domain *dom)
1742{
1743 struct protection_domain *domain;
1744
1745 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1746 if (!domain)
1747 return -ENOMEM;
1748
1749 spin_lock_init(&domain->lock);
1750 domain->mode = PAGE_MODE_3_LEVEL;
1751 domain->id = domain_id_alloc();
1752 if (!domain->id)
1753 goto out_free;
1754 domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1755 if (!domain->pt_root)
1756 goto out_free;
1757
1758 dom->priv = domain;
1759
1760 return 0;
1761
1762out_free:
1763 kfree(domain);
1764
1765 return -ENOMEM;
1766}
1767
1768static void amd_iommu_domain_destroy(struct iommu_domain *dom)
1769{
1770 struct protection_domain *domain = dom->priv;
1771
1772 if (!domain)
1773 return;
1774
1775 if (domain->dev_cnt > 0)
1776 cleanup_domain(domain);
1777
1778 BUG_ON(domain->dev_cnt != 0);
1779
1780 free_pagetable(domain);
1781
1782 domain_id_free(domain->id);
1783
1784 kfree(domain);
1785
1786 dom->priv = NULL;
1787}
1788
1789static void amd_iommu_detach_device(struct iommu_domain *dom,
1790 struct device *dev)
1791{
1792 struct protection_domain *domain = dom->priv;
1793 struct amd_iommu *iommu;
1794 struct pci_dev *pdev;
1795 u16 devid;
1796
1797 if (dev->bus != &pci_bus_type)
1798 return;
1799
1800 pdev = to_pci_dev(dev);
1801
1802 devid = calc_devid(pdev->bus->number, pdev->devfn);
1803
1804 if (devid > 0)
1805 detach_device(domain, devid);
1806
1807 iommu = amd_iommu_rlookup_table[devid];
1808 if (!iommu)
1809 return;
1810
1811 iommu_queue_inv_dev_entry(iommu, devid);
1812 iommu_completion_wait(iommu);
1813}
1814
1815static int amd_iommu_attach_device(struct iommu_domain *dom,
1816 struct device *dev)
1817{
1818 struct protection_domain *domain = dom->priv;
1819 struct protection_domain *old_domain;
1820 struct amd_iommu *iommu;
1821 struct pci_dev *pdev;
1822 u16 devid;
1823
1824 if (dev->bus != &pci_bus_type)
1825 return -EINVAL;
1826
1827 pdev = to_pci_dev(dev);
1828
1829 devid = calc_devid(pdev->bus->number, pdev->devfn);
1830
1831 if (devid >= amd_iommu_last_bdf ||
1832 devid != amd_iommu_alias_table[devid])
1833 return -EINVAL;
1834
1835 iommu = amd_iommu_rlookup_table[devid];
1836 if (!iommu)
1837 return -EINVAL;
1838
1839 old_domain = domain_for_device(devid);
1840 if (old_domain)
1841 return -EBUSY;
1842
1843 attach_device(iommu, domain, devid);
1844
1845 iommu_completion_wait(iommu);
1846
1847 return 0;
1848}
1849
1850static int amd_iommu_map_range(struct iommu_domain *dom,
1851 unsigned long iova, phys_addr_t paddr,
1852 size_t size, int iommu_prot)
1853{
1854 struct protection_domain *domain = dom->priv;
1855 unsigned long i, npages = iommu_num_pages(paddr, size, PAGE_SIZE);
1856 int prot = 0;
1857 int ret;
1858
1859 if (iommu_prot & IOMMU_READ)
1860 prot |= IOMMU_PROT_IR;
1861 if (iommu_prot & IOMMU_WRITE)
1862 prot |= IOMMU_PROT_IW;
1863
1864 iova &= PAGE_MASK;
1865 paddr &= PAGE_MASK;
1866
1867 for (i = 0; i < npages; ++i) {
1868 ret = iommu_map_page(domain, iova, paddr, prot);
1869 if (ret)
1870 return ret;
1871
1872 iova += PAGE_SIZE;
1873 paddr += PAGE_SIZE;
1874 }
1875
1876 return 0;
1877}
1878
1879static void amd_iommu_unmap_range(struct iommu_domain *dom,
1880 unsigned long iova, size_t size)
1881{
1882
1883 struct protection_domain *domain = dom->priv;
1884 unsigned long i, npages = iommu_num_pages(iova, size, PAGE_SIZE);
1885
1886 iova &= PAGE_MASK;
1887
1888 for (i = 0; i < npages; ++i) {
1889 iommu_unmap_page(domain, iova);
1890 iova += PAGE_SIZE;
1891 }
1892
1893 iommu_flush_domain(domain->id);
1894}
1895
1896static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
1897 unsigned long iova)
1898{
1899 struct protection_domain *domain = dom->priv;
1900 unsigned long offset = iova & ~PAGE_MASK;
1901 phys_addr_t paddr;
1902 u64 *pte;
1903
1904 pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)];
1905
1906 if (!IOMMU_PTE_PRESENT(*pte))
1907 return 0;
1908
1909 pte = IOMMU_PTE_PAGE(*pte);
1910 pte = &pte[IOMMU_PTE_L1_INDEX(iova)];
1911
1912 if (!IOMMU_PTE_PRESENT(*pte))
1913 return 0;
1914
1915 pte = IOMMU_PTE_PAGE(*pte);
1916 pte = &pte[IOMMU_PTE_L0_INDEX(iova)];
1917
1918 if (!IOMMU_PTE_PRESENT(*pte))
1919 return 0;
1920
1921 paddr = *pte & IOMMU_PAGE_MASK;
1922 paddr |= offset;
1923
1924 return paddr;
1925}
1926
1927static struct iommu_ops amd_iommu_ops = {
1928 .domain_init = amd_iommu_domain_init,
1929 .domain_destroy = amd_iommu_domain_destroy,
1930 .attach_dev = amd_iommu_attach_device,
1931 .detach_dev = amd_iommu_detach_device,
1932 .map = amd_iommu_map_range,
1933 .unmap = amd_iommu_unmap_range,
1934 .iova_to_phys = amd_iommu_iova_to_phys,
1935};
1936
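amd_iommu_ops is what register_iommu() earlier in this patch plugs into the generic IOMMU layer; a consumer such as KVM device assignment would then reach these callbacks through the linux/iommu.h wrappers, roughly as sketched below. The wrapper names (iommu_domain_alloc() and friends) are assumed to map one-to-one onto the ops table above.

	#include <linux/device.h>
	#include <linux/errno.h>
	#include <linux/iommu.h>

	static int example_assign_device(struct device *dev)
	{
		struct iommu_domain *dom;
		int ret;

		dom = iommu_domain_alloc();		/* -> amd_iommu_domain_init()    */
		if (!dom)
			return -ENOMEM;

		ret = iommu_attach_device(dom, dev);	/* -> amd_iommu_attach_device()  */
		if (ret)
			goto out_free;

		/* Map the first megabyte 1:1, readable and writable. */
		ret = iommu_map_range(dom, 0, 0, 1 << 20, IOMMU_READ | IOMMU_WRITE);
		if (ret)
			goto out_detach;

		return 0;

	out_detach:
		iommu_detach_device(dom, dev);		/* -> amd_iommu_detach_device()  */
	out_free:
		iommu_domain_free(dom);			/* -> amd_iommu_domain_destroy() */
		return ret;
	}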
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 0cdcda35a05f..42c33cebf00f 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -28,6 +28,7 @@
28#include <asm/amd_iommu_types.h> 28#include <asm/amd_iommu_types.h>
29#include <asm/amd_iommu.h> 29#include <asm/amd_iommu.h>
30#include <asm/iommu.h> 30#include <asm/iommu.h>
31#include <asm/gart.h>
31 32
32/* 33/*
33 * definitions for the ACPI scanning code 34 * definitions for the ACPI scanning code
@@ -121,7 +122,8 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
121LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings 122LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
122 we find in ACPI */ 123 we find in ACPI */
123unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ 124unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
124int amd_iommu_isolate; /* if 1, device isolation is enabled */ 125bool amd_iommu_isolate = true; /* if true, device isolation is
126 enabled */
125bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ 127bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
126 128
127LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 129LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
@@ -242,20 +244,16 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
242} 244}
243 245
244/* Function to enable the hardware */ 246/* Function to enable the hardware */
245void __init iommu_enable(struct amd_iommu *iommu) 247static void __init iommu_enable(struct amd_iommu *iommu)
246{ 248{
247 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU " 249 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
248 "at %02x:%02x.%x cap 0x%hx\n", 250 dev_name(&iommu->dev->dev), iommu->cap_ptr);
249 iommu->dev->bus->number,
250 PCI_SLOT(iommu->dev->devfn),
251 PCI_FUNC(iommu->dev->devfn),
252 iommu->cap_ptr);
253 251
254 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 252 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
255} 253}
256 254
257/* Function to enable IOMMU event logging and event interrupts */ 255/* Function to enable IOMMU event logging and event interrupts */
258void __init iommu_enable_event_logging(struct amd_iommu *iommu) 256static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
259{ 257{
260 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); 258 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
261 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); 259 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
@@ -427,6 +425,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
427 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 425 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
428 &entry, sizeof(entry)); 426 &entry, sizeof(entry));
429 427
428 /* set head and tail to zero manually */
429 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
430 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
431
430 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); 432 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
431 433
432 return cmd_buf; 434 return cmd_buf;
@@ -1074,7 +1076,8 @@ int __init amd_iommu_init(void)
1074 goto free; 1076 goto free;
1075 1077
1076 /* IOMMU rlookup table - find the IOMMU for a specific device */ 1078 /* IOMMU rlookup table - find the IOMMU for a specific device */
1077 amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL, 1079 amd_iommu_rlookup_table = (void *)__get_free_pages(
1080 GFP_KERNEL | __GFP_ZERO,
1078 get_order(rlookup_table_size)); 1081 get_order(rlookup_table_size));
1079 if (amd_iommu_rlookup_table == NULL) 1082 if (amd_iommu_rlookup_table == NULL)
1080 goto free; 1083 goto free;
@@ -1212,8 +1215,10 @@ static int __init parse_amd_iommu_options(char *str)
1212{ 1215{
1213 for (; *str; ++str) { 1216 for (; *str; ++str) {
1214 if (strncmp(str, "isolate", 7) == 0) 1217 if (strncmp(str, "isolate", 7) == 0)
1215 amd_iommu_isolate = 1; 1218 amd_iommu_isolate = true;
1216 if (strncmp(str, "fullflush", 11) == 0) 1219 if (strncmp(str, "share", 5) == 0)
1220 amd_iommu_isolate = false;
1221 if (strncmp(str, "fullflush", 9) == 0)
1217 amd_iommu_unmap_flush = true; 1222 amd_iommu_unmap_flush = true;
1218 } 1223 }
1219 1224
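With this hunk the option parser understands three keywords; assuming it remains hooked to the existing amd_iommu= early parameter, the corresponding kernel command lines would be:

	amd_iommu=isolate     - force per-device protection domains (the new default)
	amd_iommu=share       - let devices share the default domain again
	amd_iommu=fullflush   - flush IO/TLB entries on every unmap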
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9a32b37ee2ee..676debfc1702 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -1,8 +1,9 @@
1/* 1/*
2 * Firmware replacement code. 2 * Firmware replacement code.
3 * 3 *
4 * Work around broken BIOSes that don't set an aperture or only set the 4 * Work around broken BIOSes that don't set an aperture, only set the
5 * aperture in the AGP bridge. 5 * aperture in the AGP bridge, or set too small an aperture.
6 *
6 * If all fails map the aperture over some low memory. This is cheaper than 7 * If all fails map the aperture over some low memory. This is cheaper than
7 * doing bounce buffering. The memory is lost. This is done at early boot 8 * doing bounce buffering. The memory is lost. This is done at early boot
8 * because only the bootmem allocator can allocate 32+MB. 9 * because only the bootmem allocator can allocate 32+MB.
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 04a7f960bbc0..cf2ca19e62da 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Local APIC handling, local APIC timers 2 * Local APIC handling, local APIC timers
3 * 3 *
4 * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> 4 * (c) 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
5 * 5 *
6 * Fixes 6 * Fixes
7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs; 7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
@@ -14,49 +14,71 @@
14 * Mikael Pettersson : PM converted to driver model. 14 * Mikael Pettersson : PM converted to driver model.
15 */ 15 */
16 16
17#include <linux/init.h>
18
19#include <linux/mm.h>
20#include <linux/delay.h>
21#include <linux/bootmem.h>
22#include <linux/interrupt.h>
23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
25#include <linux/sysdev.h> 18#include <linux/mc146818rtc.h>
26#include <linux/ioport.h>
27#include <linux/cpu.h>
28#include <linux/clockchips.h>
29#include <linux/acpi_pmtmr.h> 19#include <linux/acpi_pmtmr.h>
20#include <linux/clockchips.h>
21#include <linux/interrupt.h>
22#include <linux/bootmem.h>
23#include <linux/ftrace.h>
24#include <linux/ioport.h>
30#include <linux/module.h> 25#include <linux/module.h>
31#include <linux/dmi.h> 26#include <linux/sysdev.h>
27#include <linux/delay.h>
28#include <linux/timex.h>
32#include <linux/dmar.h> 29#include <linux/dmar.h>
30#include <linux/init.h>
31#include <linux/cpu.h>
32#include <linux/dmi.h>
33#include <linux/nmi.h>
34#include <linux/smp.h>
35#include <linux/mm.h>
33 36
34#include <asm/atomic.h>
35#include <asm/smp.h>
36#include <asm/mtrr.h>
37#include <asm/mpspec.h>
38#include <asm/desc.h>
39#include <asm/arch_hooks.h> 37#include <asm/arch_hooks.h>
40#include <asm/hpet.h>
41#include <asm/pgalloc.h> 38#include <asm/pgalloc.h>
39#include <asm/genapic.h>
40#include <asm/atomic.h>
41#include <asm/mpspec.h>
42#include <asm/i8253.h> 42#include <asm/i8253.h>
43#include <asm/nmi.h> 43#include <asm/i8259.h>
44#include <asm/idle.h>
45#include <asm/proto.h> 44#include <asm/proto.h>
46#include <asm/timex.h>
47#include <asm/apic.h> 45#include <asm/apic.h>
48#include <asm/i8259.h> 46#include <asm/desc.h>
47#include <asm/hpet.h>
48#include <asm/idle.h>
49#include <asm/mtrr.h>
50#include <asm/smp.h>
51
52unsigned int num_processors;
53
54unsigned disabled_cpus __cpuinitdata;
55
56/* Processor that is doing the boot up */
57unsigned int boot_cpu_physical_apicid = -1U;
49 58
50#include <mach_apic.h> 59/*
51#include <mach_apicdef.h> 60 * The highest APIC ID seen during enumeration.
52#include <mach_ipi.h> 61 *
62 * This determines the messaging protocol we can use: if all APIC IDs
63 * are in the 0 ... 7 range, then we can use logical addressing which
64 * has some performance advantages (better broadcasting).
65 *
66 * If there's an APIC ID of 8 or above, we use physical addressing.
67 */
68unsigned int max_physical_apicid;
53 69
54/* 70/*
55 * Sanity check 71 * Bitmask of physically existing CPUs:
56 */ 72 */
57#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) 73physid_mask_t phys_cpu_present_map;
58# error SPURIOUS_APIC_VECTOR definition error 74
59#endif 75/*
76 * Map cpu index to physical APIC ID
77 */
78DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
79DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
80EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
81EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
60 82
61#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
62/* 84/*
@@ -97,8 +119,8 @@ __setup("apicpmtimer", setup_apicpmtimer);
97#ifdef HAVE_X2APIC 119#ifdef HAVE_X2APIC
98int x2apic; 120int x2apic;
99/* x2apic enabled before OS handover */ 121/* x2apic enabled before OS handover */
100int x2apic_preenabled; 122static int x2apic_preenabled;
101int disable_x2apic; 123static int disable_x2apic;
102static __init int setup_nox2apic(char *str) 124static __init int setup_nox2apic(char *str)
103{ 125{
104 disable_x2apic = 1; 126 disable_x2apic = 1;
@@ -118,8 +140,6 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
118 140
119int first_system_vector = 0xfe; 141int first_system_vector = 0xfe;
120 142
121char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
122
123/* 143/*
124 * Debug level, exported for io_apic.c 144 * Debug level, exported for io_apic.c
125 */ 145 */
@@ -141,7 +161,7 @@ static int lapic_next_event(unsigned long delta,
141 struct clock_event_device *evt); 161 struct clock_event_device *evt);
142static void lapic_timer_setup(enum clock_event_mode mode, 162static void lapic_timer_setup(enum clock_event_mode mode,
143 struct clock_event_device *evt); 163 struct clock_event_device *evt);
144static void lapic_timer_broadcast(cpumask_t mask); 164static void lapic_timer_broadcast(const struct cpumask *mask);
145static void apic_pm_activate(void); 165static void apic_pm_activate(void);
146 166
147/* 167/*
@@ -227,7 +247,7 @@ void xapic_icr_write(u32 low, u32 id)
227 apic_write(APIC_ICR, low); 247 apic_write(APIC_ICR, low);
228} 248}
229 249
230u64 xapic_icr_read(void) 250static u64 xapic_icr_read(void)
231{ 251{
232 u32 icr1, icr2; 252 u32 icr1, icr2;
233 253
@@ -267,7 +287,7 @@ void x2apic_icr_write(u32 low, u32 id)
267 wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); 287 wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
268} 288}
269 289
270u64 x2apic_icr_read(void) 290static u64 x2apic_icr_read(void)
271{ 291{
272 unsigned long val; 292 unsigned long val;
273 293
@@ -441,6 +461,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
441 v = apic_read(APIC_LVTT); 461 v = apic_read(APIC_LVTT);
442 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 462 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
443 apic_write(APIC_LVTT, v); 463 apic_write(APIC_LVTT, v);
464 apic_write(APIC_TMICT, 0xffffffff);
444 break; 465 break;
445 case CLOCK_EVT_MODE_RESUME: 466 case CLOCK_EVT_MODE_RESUME:
446 /* Nothing to do here */ 467 /* Nothing to do here */
@@ -453,10 +474,10 @@ static void lapic_timer_setup(enum clock_event_mode mode,
453/* 474/*
454 * Local APIC timer broadcast function 475 * Local APIC timer broadcast function
455 */ 476 */
456static void lapic_timer_broadcast(cpumask_t mask) 477static void lapic_timer_broadcast(const struct cpumask *mask)
457{ 478{
458#ifdef CONFIG_SMP 479#ifdef CONFIG_SMP
459 send_IPI_mask(mask, LOCAL_TIMER_VECTOR); 480 apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
460#endif 481#endif
461} 482}
462 483
@@ -469,7 +490,7 @@ static void __cpuinit setup_APIC_timer(void)
469 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 490 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
470 491
471 memcpy(levt, &lapic_clockevent, sizeof(*levt)); 492 memcpy(levt, &lapic_clockevent, sizeof(*levt));
472 levt->cpumask = cpumask_of_cpu(smp_processor_id()); 493 levt->cpumask = cpumask_of(smp_processor_id());
473 494
474 clockevents_register_device(levt); 495 clockevents_register_device(levt);
475} 496}
@@ -534,7 +555,8 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
534 } 555 }
535} 556}
536 557
537static int __init calibrate_by_pmtimer(long deltapm, long *delta) 558static int __init
559calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
538{ 560{
539 const long pm_100ms = PMTMR_TICKS_PER_SEC / 10; 561 const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
540 const long pm_thresh = pm_100ms / 100; 562 const long pm_thresh = pm_100ms / 100;
@@ -545,7 +567,7 @@ static int __init calibrate_by_pmtimer(long deltapm, long *delta)
545 return -1; 567 return -1;
546#endif 568#endif
547 569
548 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); 570 apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm);
549 571
550 /* Check, if the PM timer is available */ 572 /* Check, if the PM timer is available */
551 if (!deltapm) 573 if (!deltapm)
@@ -555,19 +577,30 @@ static int __init calibrate_by_pmtimer(long deltapm, long *delta)
555 577
556 if (deltapm > (pm_100ms - pm_thresh) && 578 if (deltapm > (pm_100ms - pm_thresh) &&
557 deltapm < (pm_100ms + pm_thresh)) { 579 deltapm < (pm_100ms + pm_thresh)) {
558 apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); 580 apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n");
559 } else { 581 return 0;
560 res = (((u64)deltapm) * mult) >> 22; 582 }
561 do_div(res, 1000000); 583
562 printk(KERN_WARNING "APIC calibration not consistent " 584 res = (((u64)deltapm) * mult) >> 22;
563 "with PM Timer: %ldms instead of 100ms\n", 585 do_div(res, 1000000);
564 (long)res); 586 pr_warning("APIC calibration not consistent "
565 /* Correct the lapic counter value */ 587 "with PM-Timer: %ldms instead of 100ms\n",(long)res);
566 res = (((u64)(*delta)) * pm_100ms); 588
589 /* Correct the lapic counter value */
590 res = (((u64)(*delta)) * pm_100ms);
591 do_div(res, deltapm);
592 pr_info("APIC delta adjusted to PM-Timer: "
593 "%lu (%ld)\n", (unsigned long)res, *delta);
594 *delta = (long)res;
595
596 /* Correct the tsc counter value */
597 if (cpu_has_tsc) {
598 res = (((u64)(*deltatsc)) * pm_100ms);
567 do_div(res, deltapm); 599 do_div(res, deltapm);
568 printk(KERN_INFO "APIC delta adjusted to PM-Timer: " 600 apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
569 "%lu (%ld)\n", (unsigned long)res, *delta); 601 "PM-Timer: %lu (%ld) \n",
570 *delta = (long)res; 602 (unsigned long)res, *deltatsc);
603 *deltatsc = (long)res;
571 } 604 }
572 605
573 return 0; 606 return 0;
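The rescaling added here is applied identically to the lapic delta and, when cpu_has_tsc, to the new TSC delta; stripped of the printing it is just the following proportionality (sketch):

	#include <linux/types.h>
	#include <asm/div64.h>

	/* Rescale a measured delta so it corresponds to exactly 100 ms worth of
	 * ACPI PM timer ticks (pm_100ms) instead of the measured deltapm. */
	static long example_rescale(long delta, long deltapm, long pm_100ms)
	{
		u64 res = (u64)delta * pm_100ms;

		do_div(res, deltapm);
		return (long)res;
	}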
@@ -578,7 +611,7 @@ static int __init calibrate_APIC_clock(void)
578 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 611 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
579 void (*real_handler)(struct clock_event_device *dev); 612 void (*real_handler)(struct clock_event_device *dev);
580 unsigned long deltaj; 613 unsigned long deltaj;
581 long delta; 614 long delta, deltatsc;
582 int pm_referenced = 0; 615 int pm_referenced = 0;
583 616
584 local_irq_disable(); 617 local_irq_disable();
@@ -608,9 +641,11 @@ static int __init calibrate_APIC_clock(void)
608 delta = lapic_cal_t1 - lapic_cal_t2; 641 delta = lapic_cal_t1 - lapic_cal_t2;
609 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); 642 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
610 643
644 deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
645
611 /* we trust the PM based calibration if possible */ 646 /* we trust the PM based calibration if possible */
612 pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, 647 pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
613 &delta); 648 &delta, &deltatsc);
614 649
615 /* Calculate the scaled math multiplication factor */ 650 /* Calculate the scaled math multiplication factor */
616 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 651 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
@@ -628,11 +663,10 @@ static int __init calibrate_APIC_clock(void)
628 calibration_result); 663 calibration_result);
629 664
630 if (cpu_has_tsc) { 665 if (cpu_has_tsc) {
631 delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
632 apic_printk(APIC_VERBOSE, "..... CPU clock speed is " 666 apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
633 "%ld.%04ld MHz.\n", 667 "%ld.%04ld MHz.\n",
634 (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), 668 (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
635 (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); 669 (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
636 } 670 }
637 671
638 apic_printk(APIC_VERBOSE, "..... host bus clock speed is " 672 apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
@@ -645,8 +679,7 @@ static int __init calibrate_APIC_clock(void)
645 */ 679 */
646 if (calibration_result < (1000000 / HZ)) { 680 if (calibration_result < (1000000 / HZ)) {
647 local_irq_enable(); 681 local_irq_enable();
648 printk(KERN_WARNING 682 pr_warning("APIC frequency too slow, disabling apic timer\n");
649 "APIC frequency too slow, disabling apic timer\n");
650 return -1; 683 return -1;
651 } 684 }
652 685
@@ -672,13 +705,9 @@ static int __init calibrate_APIC_clock(void)
672 while (lapic_cal_loops <= LAPIC_CAL_LOOPS) 705 while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
673 cpu_relax(); 706 cpu_relax();
674 707
675 local_irq_disable();
676
677 /* Stop the lapic timer */ 708 /* Stop the lapic timer */
678 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); 709 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
679 710
680 local_irq_enable();
681
682 /* Jiffies delta */ 711 /* Jiffies delta */
683 deltaj = lapic_cal_j2 - lapic_cal_j1; 712 deltaj = lapic_cal_j2 - lapic_cal_j1;
684 apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); 713 apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
@@ -692,8 +721,7 @@ static int __init calibrate_APIC_clock(void)
692 local_irq_enable(); 721 local_irq_enable();
693 722
694 if (levt->features & CLOCK_EVT_FEAT_DUMMY) { 723 if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
695 printk(KERN_WARNING 724 pr_warning("APIC timer disabled due to verification failure\n");
696 "APIC timer disabled due to verification failure.\n");
697 return -1; 725 return -1;
698 } 726 }
699 727
@@ -714,7 +742,7 @@ void __init setup_boot_APIC_clock(void)
714 * broadcast mechanism is used. On UP systems simply ignore it. 742 * broadcast mechanism is used. On UP systems simply ignore it.
715 */ 743 */
716 if (disable_apic_timer) { 744 if (disable_apic_timer) {
717 printk(KERN_INFO "Disabling APIC timer\n"); 745 pr_info("Disabling APIC timer\n");
718 /* No broadcast on UP ! */ 746 /* No broadcast on UP ! */
719 if (num_possible_cpus() > 1) { 747 if (num_possible_cpus() > 1) {
720 lapic_clockevent.mult = 1; 748 lapic_clockevent.mult = 1;
@@ -741,7 +769,7 @@ void __init setup_boot_APIC_clock(void)
741 if (nmi_watchdog != NMI_IO_APIC) 769 if (nmi_watchdog != NMI_IO_APIC)
742 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; 770 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
743 else 771 else
744 printk(KERN_WARNING "APIC timer registered as dummy," 772 pr_warning("APIC timer registered as dummy,"
745 " due to nmi_watchdog=%d!\n", nmi_watchdog); 773 " due to nmi_watchdog=%d!\n", nmi_watchdog);
746 774
747 /* Setup the lapic or request the broadcast */ 775 /* Setup the lapic or request the broadcast */
@@ -773,8 +801,7 @@ static void local_apic_timer_interrupt(void)
773 * spurious. 801 * spurious.
774 */ 802 */
775 if (!evt->event_handler) { 803 if (!evt->event_handler) {
776 printk(KERN_WARNING 804 pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
777 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
778 /* Switch it off */ 805 /* Switch it off */
779 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); 806 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
780 return; 807 return;
@@ -783,11 +810,7 @@ static void local_apic_timer_interrupt(void)
783 /* 810 /*
784 * the NMI deadlock-detector uses this. 811 * the NMI deadlock-detector uses this.
785 */ 812 */
786#ifdef CONFIG_X86_64 813 inc_irq_stat(apic_timer_irqs);
787 add_pda(apic_timer_irqs, 1);
788#else
789 per_cpu(irq_stat, cpu).apic_timer_irqs++;
790#endif
791 814
792 evt->event_handler(evt); 815 evt->event_handler(evt);
793} 816}
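inc_irq_stat() folds the two removed variants into one helper; judging purely from the code deleted here, its definition is presumably along these lines (sketch, exact location and spelling assumed):

	#ifdef CONFIG_X86_64
	# define example_inc_irq_stat(member)	add_pda(member, 1)
	#else
	# define example_inc_irq_stat(member)	(__get_cpu_var(irq_stat).member++)
	#endif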
@@ -800,7 +823,7 @@ static void local_apic_timer_interrupt(void)
800 * [ if a single-CPU system runs an SMP kernel then we call the local 823 * [ if a single-CPU system runs an SMP kernel then we call the local
801 * interrupt as well. Thus we cannot inline the local irq ... ] 824 * interrupt as well. Thus we cannot inline the local irq ... ]
802 */ 825 */
803void smp_apic_timer_interrupt(struct pt_regs *regs) 826void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
804{ 827{
805 struct pt_regs *old_regs = set_irq_regs(regs); 828 struct pt_regs *old_regs = set_irq_regs(regs);
806 829
@@ -814,9 +837,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
814 * Besides, if we don't, timer interrupts ignore the global 837 * Besides, if we don't, timer interrupts ignore the global
815 * interrupt lock, which is the WrongThing (tm) to do. 838 * interrupt lock, which is the WrongThing (tm) to do.
816 */ 839 */
817#ifdef CONFIG_X86_64
818 exit_idle(); 840 exit_idle();
819#endif
820 irq_enter(); 841 irq_enter();
821 local_apic_timer_interrupt(); 842 local_apic_timer_interrupt();
822 irq_exit(); 843 irq_exit();
@@ -907,6 +928,10 @@ void disable_local_APIC(void)
907{ 928{
908 unsigned int value; 929 unsigned int value;
909 930
931 /* APIC hasn't been mapped yet */
932 if (!apic_phys)
933 return;
934
910 clear_local_APIC(); 935 clear_local_APIC();
911 936
912 /* 937 /*
@@ -999,11 +1024,11 @@ int __init verify_local_APIC(void)
999 */ 1024 */
1000 reg0 = apic_read(APIC_ID); 1025 reg0 = apic_read(APIC_ID);
1001 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); 1026 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
1002 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); 1027 apic_write(APIC_ID, reg0 ^ apic->apic_id_mask);
1003 reg1 = apic_read(APIC_ID); 1028 reg1 = apic_read(APIC_ID);
1004 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); 1029 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
1005 apic_write(APIC_ID, reg0); 1030 apic_write(APIC_ID, reg0);
1006 if (reg1 != (reg0 ^ APIC_ID_MASK)) 1031 if (reg1 != (reg0 ^ apic->apic_id_mask))
1007 return 0; 1032 return 0;
1008 1033
1009 /* 1034 /*
@@ -1093,18 +1118,18 @@ static void __cpuinit lapic_setup_esr(void)
1093 unsigned int oldvalue, value, maxlvt; 1118 unsigned int oldvalue, value, maxlvt;
1094 1119
1095 if (!lapic_is_integrated()) { 1120 if (!lapic_is_integrated()) {
1096 printk(KERN_INFO "No ESR for 82489DX.\n"); 1121 pr_info("No ESR for 82489DX.\n");
1097 return; 1122 return;
1098 } 1123 }
1099 1124
1100 if (esr_disable) { 1125 if (apic->disable_esr) {
1101 /* 1126 /*
1102 * Something untraceable is creating bad interrupts on 1127 * Something untraceable is creating bad interrupts on
1103 * secondary quads ... for the moment, just leave the 1128 * secondary quads ... for the moment, just leave the
1104 * ESR disabled - we can't do anything useful with the 1129 * ESR disabled - we can't do anything useful with the
1105 * errors anyway - mbligh 1130 * errors anyway - mbligh
1106 */ 1131 */
1107 printk(KERN_INFO "Leaving ESR disabled.\n"); 1132 pr_info("Leaving ESR disabled.\n");
1108 return; 1133 return;
1109 } 1134 }
1110 1135
@@ -1138,9 +1163,14 @@ void __cpuinit setup_local_APIC(void)
1138 unsigned int value; 1163 unsigned int value;
1139 int i, j; 1164 int i, j;
1140 1165
1166 if (disable_apic) {
1167 arch_disable_smp_support();
1168 return;
1169 }
1170
1141#ifdef CONFIG_X86_32 1171#ifdef CONFIG_X86_32
1142 /* Pound the ESR really hard over the head with a big hammer - mbligh */ 1172 /* Pound the ESR really hard over the head with a big hammer - mbligh */
1143 if (lapic_is_integrated() && esr_disable) { 1173 if (lapic_is_integrated() && apic->disable_esr) {
1144 apic_write(APIC_ESR, 0); 1174 apic_write(APIC_ESR, 0);
1145 apic_write(APIC_ESR, 0); 1175 apic_write(APIC_ESR, 0);
1146 apic_write(APIC_ESR, 0); 1176 apic_write(APIC_ESR, 0);
@@ -1154,7 +1184,7 @@ void __cpuinit setup_local_APIC(void)
1154 * Double-check whether this APIC is really registered. 1184 * Double-check whether this APIC is really registered.
1155 * This is meaningless in clustered apic mode, so we skip it. 1185 * This is meaningless in clustered apic mode, so we skip it.
1156 */ 1186 */
1157 if (!apic_id_registered()) 1187 if (!apic->apic_id_registered())
1158 BUG(); 1188 BUG();
1159 1189
1160 /* 1190 /*
@@ -1162,7 +1192,7 @@ void __cpuinit setup_local_APIC(void)
1162 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel 1192 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
1163 * document number 292116). So here it goes... 1193 * document number 292116). So here it goes...
1164 */ 1194 */
1165 init_apic_ldr(); 1195 apic->init_apic_ldr();
1166 1196
1167 /* 1197 /*
1168 * Set Task Priority to 'accept all'. We never change this 1198 * Set Task Priority to 'accept all'. We never change this
@@ -1298,7 +1328,7 @@ void check_x2apic(void)
1298 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1328 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1299 1329
1300 if (msr & X2APIC_ENABLE) { 1330 if (msr & X2APIC_ENABLE) {
1301 printk("x2apic enabled by BIOS, switching to x2apic ops\n"); 1331 pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
1302 x2apic_preenabled = x2apic = 1; 1332 x2apic_preenabled = x2apic = 1;
1303 apic_ops = &x2apic_ops; 1333 apic_ops = &x2apic_ops;
1304 } 1334 }
@@ -1310,12 +1340,12 @@ void enable_x2apic(void)
1310 1340
1311 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1341 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1312 if (!(msr & X2APIC_ENABLE)) { 1342 if (!(msr & X2APIC_ENABLE)) {
1313 printk("Enabling x2apic\n"); 1343 pr_info("Enabling x2apic\n");
1314 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); 1344 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1315 } 1345 }
1316} 1346}
1317 1347
1318void enable_IR_x2apic(void) 1348void __init enable_IR_x2apic(void)
1319{ 1349{
1320#ifdef CONFIG_INTR_REMAP 1350#ifdef CONFIG_INTR_REMAP
1321 int ret; 1351 int ret;
@@ -1325,9 +1355,8 @@ void enable_IR_x2apic(void)
1325 return; 1355 return;
1326 1356
1327 if (!x2apic_preenabled && disable_x2apic) { 1357 if (!x2apic_preenabled && disable_x2apic) {
1328 printk(KERN_INFO 1358 pr_info("Skipped enabling x2apic and Interrupt-remapping "
1329 "Skipped enabling x2apic and Interrupt-remapping " 1359 "because of nox2apic\n");
1330 "because of nox2apic\n");
1331 return; 1360 return;
1332 } 1361 }
1333 1362
@@ -1335,22 +1364,19 @@ void enable_IR_x2apic(void)
1335 panic("Bios already enabled x2apic, can't enforce nox2apic"); 1364 panic("Bios already enabled x2apic, can't enforce nox2apic");
1336 1365
1337 if (!x2apic_preenabled && skip_ioapic_setup) { 1366 if (!x2apic_preenabled && skip_ioapic_setup) {
1338 printk(KERN_INFO 1367 pr_info("Skipped enabling x2apic and Interrupt-remapping "
1339 "Skipped enabling x2apic and Interrupt-remapping " 1368 "because of skipping io-apic setup\n");
1340 "because of skipping io-apic setup\n");
1341 return; 1369 return;
1342 } 1370 }
1343 1371
1344 ret = dmar_table_init(); 1372 ret = dmar_table_init();
1345 if (ret) { 1373 if (ret) {
1346 printk(KERN_INFO 1374 pr_info("dmar_table_init() failed with %d:\n", ret);
1347 "dmar_table_init() failed with %d:\n", ret);
1348 1375
1349 if (x2apic_preenabled) 1376 if (x2apic_preenabled)
1350 panic("x2apic enabled by bios. But IR enabling failed"); 1377 panic("x2apic enabled by bios. But IR enabling failed");
1351 else 1378 else
1352 printk(KERN_INFO 1379 pr_info("Not enabling x2apic,Intr-remapping\n");
1353 "Not enabling x2apic,Intr-remapping\n");
1354 return; 1380 return;
1355 } 1381 }
1356 1382
@@ -1359,7 +1385,7 @@ void enable_IR_x2apic(void)
1359 1385
1360 ret = save_mask_IO_APIC_setup(); 1386 ret = save_mask_IO_APIC_setup();
1361 if (ret) { 1387 if (ret) {
1362 printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret); 1388 pr_info("Saving IO-APIC state failed: %d\n", ret);
1363 goto end; 1389 goto end;
1364 } 1390 }
1365 1391
@@ -1394,14 +1420,11 @@ end:
1394 1420
1395 if (!ret) { 1421 if (!ret) {
1396 if (!x2apic_preenabled) 1422 if (!x2apic_preenabled)
1397 printk(KERN_INFO 1423 pr_info("Enabled x2apic and interrupt-remapping\n");
1398 "Enabled x2apic and interrupt-remapping\n");
1399 else 1424 else
1400 printk(KERN_INFO 1425 pr_info("Enabled Interrupt-remapping\n");
1401 "Enabled Interrupt-remapping\n");
1402 } else 1426 } else
1403 printk(KERN_ERR 1427 pr_err("Failed to enable Interrupt-remapping and x2apic\n");
1404 "Failed to enable Interrupt-remapping and x2apic\n");
1405#else 1428#else
1406 if (!cpu_has_x2apic) 1429 if (!cpu_has_x2apic)
1407 return; 1430 return;
@@ -1410,8 +1433,8 @@ end:
1410 panic("x2apic enabled prior OS handover," 1433 panic("x2apic enabled prior OS handover,"
1411 " enable CONFIG_INTR_REMAP"); 1434 " enable CONFIG_INTR_REMAP");
1412 1435
1413 printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping " 1436 pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
1414 " and x2apic\n"); 1437 " and x2apic\n");
1415#endif 1438#endif
1416 1439
1417 return; 1440 return;
@@ -1428,7 +1451,7 @@ end:
1428static int __init detect_init_APIC(void) 1451static int __init detect_init_APIC(void)
1429{ 1452{
1430 if (!cpu_has_apic) { 1453 if (!cpu_has_apic) {
1431 printk(KERN_INFO "No local APIC present\n"); 1454 pr_info("No local APIC present\n");
1432 return -1; 1455 return -1;
1433 } 1456 }
1434 1457
@@ -1451,7 +1474,7 @@ static int __init detect_init_APIC(void)
1451 switch (boot_cpu_data.x86_vendor) { 1474 switch (boot_cpu_data.x86_vendor) {
1452 case X86_VENDOR_AMD: 1475 case X86_VENDOR_AMD:
1453 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || 1476 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
1454 (boot_cpu_data.x86 == 15)) 1477 (boot_cpu_data.x86 >= 15))
1455 break; 1478 break;
1456 goto no_apic; 1479 goto no_apic;
1457 case X86_VENDOR_INTEL: 1480 case X86_VENDOR_INTEL:
@@ -1469,8 +1492,8 @@ static int __init detect_init_APIC(void)
1469 * "lapic" specified. 1492 * "lapic" specified.
1470 */ 1493 */
1471 if (!force_enable_local_apic) { 1494 if (!force_enable_local_apic) {
1472 printk(KERN_INFO "Local APIC disabled by BIOS -- " 1495 pr_info("Local APIC disabled by BIOS -- "
1473 "you can enable it with \"lapic\"\n"); 1496 "you can enable it with \"lapic\"\n");
1474 return -1; 1497 return -1;
1475 } 1498 }
1476 /* 1499 /*
@@ -1480,8 +1503,7 @@ static int __init detect_init_APIC(void)
1480 */ 1503 */
1481 rdmsr(MSR_IA32_APICBASE, l, h); 1504 rdmsr(MSR_IA32_APICBASE, l, h);
1482 if (!(l & MSR_IA32_APICBASE_ENABLE)) { 1505 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1483 printk(KERN_INFO 1506 pr_info("Local APIC disabled by BIOS -- reenabling.\n");
1484 "Local APIC disabled by BIOS -- reenabling.\n");
1485 l &= ~MSR_IA32_APICBASE_BASE; 1507 l &= ~MSR_IA32_APICBASE_BASE;
1486 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; 1508 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
1487 wrmsr(MSR_IA32_APICBASE, l, h); 1509 wrmsr(MSR_IA32_APICBASE, l, h);
@@ -1494,7 +1516,7 @@ static int __init detect_init_APIC(void)
1494 */ 1516 */
1495 features = cpuid_edx(1); 1517 features = cpuid_edx(1);
1496 if (!(features & (1 << X86_FEATURE_APIC))) { 1518 if (!(features & (1 << X86_FEATURE_APIC))) {
1497 printk(KERN_WARNING "Could not enable APIC!\n"); 1519 pr_warning("Could not enable APIC!\n");
1498 return -1; 1520 return -1;
1499 } 1521 }
1500 set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1522 set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
@@ -1505,14 +1527,14 @@ static int __init detect_init_APIC(void)
1505 if (l & MSR_IA32_APICBASE_ENABLE) 1527 if (l & MSR_IA32_APICBASE_ENABLE)
1506 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; 1528 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1507 1529
1508 printk(KERN_INFO "Found and enabled local APIC!\n"); 1530 pr_info("Found and enabled local APIC!\n");
1509 1531
1510 apic_pm_activate(); 1532 apic_pm_activate();
1511 1533
1512 return 0; 1534 return 0;
1513 1535
1514no_apic: 1536no_apic:
1515 printk(KERN_INFO "No local APIC present or hardware disabled\n"); 1537 pr_info("No local APIC present or hardware disabled\n");
1516 return -1; 1538 return -1;
1517} 1539}
1518#endif 1540#endif
@@ -1586,14 +1608,14 @@ int apic_version[MAX_APICS];
1586 1608
1587int __init APIC_init_uniprocessor(void) 1609int __init APIC_init_uniprocessor(void)
1588{ 1610{
1589#ifdef CONFIG_X86_64
1590 if (disable_apic) { 1611 if (disable_apic) {
1591 printk(KERN_INFO "Apic disabled\n"); 1612 pr_info("Apic disabled\n");
1592 return -1; 1613 return -1;
1593 } 1614 }
1615#ifdef CONFIG_X86_64
1594 if (!cpu_has_apic) { 1616 if (!cpu_has_apic) {
1595 disable_apic = 1; 1617 disable_apic = 1;
1596 printk(KERN_INFO "Apic disabled by BIOS\n"); 1618 pr_info("Apic disabled by BIOS\n");
1597 return -1; 1619 return -1;
1598 } 1620 }
1599#else 1621#else
@@ -1605,8 +1627,8 @@ int __init APIC_init_uniprocessor(void)
1605 */ 1627 */
1606 if (!cpu_has_apic && 1628 if (!cpu_has_apic &&
1607 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { 1629 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1608 printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n", 1630 pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
1609 boot_cpu_physical_apicid); 1631 boot_cpu_physical_apicid);
1610 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1632 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1611 return -1; 1633 return -1;
1612 } 1634 }
@@ -1616,7 +1638,7 @@ int __init APIC_init_uniprocessor(void)
1616 enable_IR_x2apic(); 1638 enable_IR_x2apic();
1617#endif 1639#endif
1618#ifdef CONFIG_X86_64 1640#ifdef CONFIG_X86_64
1619 setup_apic_routing(); 1641 default_setup_apic_routing();
1620#endif 1642#endif
1621 1643
1622 verify_local_APIC(); 1644 verify_local_APIC();
@@ -1682,9 +1704,7 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1682{ 1704{
1683 u32 v; 1705 u32 v;
1684 1706
1685#ifdef CONFIG_X86_64
1686 exit_idle(); 1707 exit_idle();
1687#endif
1688 irq_enter(); 1708 irq_enter();
1689 /* 1709 /*
1690 * Check if this really is a spurious interrupt and ACK it 1710 * Check if this really is a spurious interrupt and ACK it
@@ -1695,14 +1715,11 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1695 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) 1715 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1696 ack_APIC_irq(); 1716 ack_APIC_irq();
1697 1717
1698#ifdef CONFIG_X86_64 1718 inc_irq_stat(irq_spurious_count);
1699 add_pda(irq_spurious_count, 1); 1719
1700#else
1701 /* see sw-dev-man vol 3, chapter 7.4.13.5 */ 1720 /* see sw-dev-man vol 3, chapter 7.4.13.5 */
1702 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " 1721 pr_info("spurious APIC interrupt on CPU#%d, "
1703 "should never happen.\n", smp_processor_id()); 1722 "should never happen.\n", smp_processor_id());
1704 __get_cpu_var(irq_stat).irq_spurious_count++;
1705#endif
1706 irq_exit(); 1723 irq_exit();
1707} 1724}
1708 1725
@@ -1713,9 +1730,7 @@ void smp_error_interrupt(struct pt_regs *regs)
1713{ 1730{
1714 u32 v, v1; 1731 u32 v, v1;
1715 1732
1716#ifdef CONFIG_X86_64
1717 exit_idle(); 1733 exit_idle();
1718#endif
1719 irq_enter(); 1734 irq_enter();
1720 /* First tickle the hardware, only then report what went on. -- REW */ 1735 /* First tickle the hardware, only then report what went on. -- REW */
1721 v = apic_read(APIC_ESR); 1736 v = apic_read(APIC_ESR);
@@ -1724,17 +1739,18 @@ void smp_error_interrupt(struct pt_regs *regs)
1724 ack_APIC_irq(); 1739 ack_APIC_irq();
1725 atomic_inc(&irq_err_count); 1740 atomic_inc(&irq_err_count);
1726 1741
1727 /* Here is what the APIC error bits mean: 1742 /*
1728 0: Send CS error 1743 * Here is what the APIC error bits mean:
1729 1: Receive CS error 1744 * 0: Send CS error
1730 2: Send accept error 1745 * 1: Receive CS error
1731 3: Receive accept error 1746 * 2: Send accept error
1732 4: Reserved 1747 * 3: Receive accept error
1733 5: Send illegal vector 1748 * 4: Reserved
1734 6: Received illegal vector 1749 * 5: Send illegal vector
1735 7: Illegal register address 1750 * 6: Received illegal vector
1736 */ 1751 * 7: Illegal register address
1737 printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", 1752 */
1753 pr_debug("APIC error on CPU%d: %02x(%02x)\n",
1738 smp_processor_id(), v , v1); 1754 smp_processor_id(), v , v1);
1739 irq_exit(); 1755 irq_exit();
1740} 1756}
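
The reflowed comment above only lists the meaning of each APIC Error Status Register bit. A minimal user-space sketch (plain C, not kernel code; the raw ESR value is assumed to come from elsewhere, e.g. a debug dump) that decodes a value according to that table:

#include <stdio.h>

/* Bit meanings as documented in the APIC ESR comment above. */
static const char *esr_bits[8] = {
	"Send CS error",
	"Receive CS error",
	"Send accept error",
	"Receive accept error",
	"Reserved",
	"Send illegal vector",
	"Received illegal vector",
	"Illegal register address",
};

static void decode_esr(unsigned int v)
{
	int i;

	printf("ESR = 0x%02x\n", v & 0xff);
	for (i = 0; i < 8; i++)
		if (v & (1u << i))
			printf("  bit %d: %s\n", i, esr_bits[i]);
}

int main(void)
{
	decode_esr(0x40);	/* example value: received illegal vector */
	return 0;
}
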
@@ -1760,7 +1776,8 @@ void __init connect_bsp_APIC(void)
1760 outb(0x01, 0x23); 1776 outb(0x01, 0x23);
1761 } 1777 }
1762#endif 1778#endif
1763 enable_apic_mode(); 1779 if (apic->enable_apic_mode)
1780 apic->enable_apic_mode();
1764} 1781}
1765 1782
1766/** 1783/**
@@ -1832,28 +1849,37 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1832void __cpuinit generic_processor_info(int apicid, int version) 1849void __cpuinit generic_processor_info(int apicid, int version)
1833{ 1850{
1834 int cpu; 1851 int cpu;
1835 cpumask_t tmp_map;
1836 1852
1837 /* 1853 /*
1838 * Validate version 1854 * Validate version
1839 */ 1855 */
1840 if (version == 0x0) { 1856 if (version == 0x0) {
1841 printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " 1857 pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
1842 "fixing up to 0x10. (tell your hw vendor)\n", 1858 "fixing up to 0x10. (tell your hw vendor)\n",
1843 version); 1859 version);
1844 version = 0x10; 1860 version = 0x10;
1845 } 1861 }
1846 apic_version[apicid] = version; 1862 apic_version[apicid] = version;
1847 1863
1848 if (num_processors >= NR_CPUS) { 1864 if (num_processors >= nr_cpu_ids) {
1849 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1865 int max = nr_cpu_ids;
1850 " Processor ignored.\n", NR_CPUS); 1866 int thiscpu = max + disabled_cpus;
1867
1868 pr_warning(
1869 "ACPI: NR_CPUS/possible_cpus limit of %i reached."
1870 " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
1871
1872 disabled_cpus++;
1851 return; 1873 return;
1852 } 1874 }
1853 1875
1854 num_processors++; 1876 num_processors++;
1855 cpus_complement(tmp_map, cpu_present_map); 1877 cpu = cpumask_next_zero(-1, cpu_present_mask);
1856 cpu = first_cpu(tmp_map); 1878
1879 if (version != apic_version[boot_cpu_physical_apicid])
1880 WARN_ONCE(1,
1881 "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n",
1882 apic_version[boot_cpu_physical_apicid], cpu, version);
1857 1883
1858 physid_set(apicid, phys_cpu_present_map); 1884 physid_set(apicid, phys_cpu_present_map);
1859 if (apicid == boot_cpu_physical_apicid) { 1885 if (apicid == boot_cpu_physical_apicid) {
@@ -1889,29 +1915,39 @@ void __cpuinit generic_processor_info(int apicid, int version)
1889 } 1915 }
1890#endif 1916#endif
1891 1917
1892#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) 1918#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
1893 /* are we being called early in kernel startup? */ 1919 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1894 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1920 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1895 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
1896 u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1897
1898 cpu_to_apicid[cpu] = apicid;
1899 bios_cpu_apicid[cpu] = apicid;
1900 } else {
1901 per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1902 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1903 }
1904#endif 1921#endif
1905 1922
1906 cpu_set(cpu, cpu_possible_map); 1923 set_cpu_possible(cpu, true);
1907 cpu_set(cpu, cpu_present_map); 1924 set_cpu_present(cpu, true);
1908} 1925}
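
The rewritten generic_processor_info() caps registration at nr_cpu_ids and picks the first free slot with cpumask_next_zero() instead of building a temporary complement mask. A rough user-space sketch of that allocation pattern, with a plain bit mask standing in for cpu_present_mask and an invented slot limit:

#include <stdio.h>

#define NR_SLOTS 8			/* stands in for nr_cpu_ids */

static unsigned int present;		/* stands in for cpu_present_mask */
static int num_processors, disabled_cpus;

/* Return the first clear bit, i.e. the next free "cpu" slot. */
static int next_free_slot(void)
{
	int i;

	for (i = 0; i < NR_SLOTS; i++)
		if (!(present & (1u << i)))
			return i;
	return NR_SLOTS;
}

static void register_cpu(int apicid)
{
	int cpu;

	if (num_processors >= NR_SLOTS) {
		printf("limit of %d reached, apicid 0x%x ignored\n",
		       NR_SLOTS, apicid);
		disabled_cpus++;
		return;
	}
	num_processors++;
	cpu = next_free_slot();
	present |= 1u << cpu;
	printf("apicid 0x%x -> cpu %d\n", apicid, cpu);
}

int main(void)
{
	int i;

	for (i = 0; i < 10; i++)
		register_cpu(0x10 + i);
	return 0;
}
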
1909 1926
1910#ifdef CONFIG_X86_64
1911int hard_smp_processor_id(void) 1927int hard_smp_processor_id(void)
1912{ 1928{
1913 return read_apic_id(); 1929 return read_apic_id();
1914} 1930}
1931
1932void default_init_apic_ldr(void)
1933{
1934 unsigned long val;
1935
1936 apic_write(APIC_DFR, APIC_DFR_VALUE);
1937 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
1938 val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
1939 apic_write(APIC_LDR, val);
1940}
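
default_init_apic_ldr() above programs the flat logical-destination convention: one bit per CPU, shifted into the top byte of the LDR. The arithmetic alone can be shown standalone (the 24-bit shift mirrors SET_APIC_LOGICAL_ID for flat mode; the register writes themselves obviously cannot be reproduced here):

#include <stdio.h>

/* Flat model: logical ID is (1 << cpu) placed in LDR bits 24-31. */
static unsigned int flat_logical_id(int cpu)
{
	return (1u << cpu) << 24;
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < 8; cpu++)
		printf("cpu %d -> LDR 0x%08x\n", cpu, flat_logical_id(cpu));
	return 0;
}
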
1941
1942#ifdef CONFIG_X86_32
1943int default_apicid_to_node(int logical_apicid)
1944{
1945#ifdef CONFIG_SMP
1946 return apicid_2_node[hard_smp_processor_id()];
1947#else
1948 return 0;
1949#endif
1950}
1915#endif 1951#endif
1916 1952
1917/* 1953/*
@@ -2106,18 +2142,16 @@ __cpuinit int apic_is_clustered_box(void)
2106 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); 2142 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
2107 bitmap_zero(clustermap, NUM_APIC_CLUSTERS); 2143 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
2108 2144
2109 for (i = 0; i < NR_CPUS; i++) { 2145 for (i = 0; i < nr_cpu_ids; i++) {
2110 /* are we being called early in kernel startup? */ 2146 /* are we being called early in kernel startup? */
2111 if (bios_cpu_apicid) { 2147 if (bios_cpu_apicid) {
2112 id = bios_cpu_apicid[i]; 2148 id = bios_cpu_apicid[i];
2113 } 2149 } else if (i < nr_cpu_ids) {
2114 else if (i < nr_cpu_ids) {
2115 if (cpu_present(i)) 2150 if (cpu_present(i))
2116 id = per_cpu(x86_bios_cpu_apicid, i); 2151 id = per_cpu(x86_bios_cpu_apicid, i);
2117 else 2152 else
2118 continue; 2153 continue;
2119 } 2154 } else
2120 else
2121 break; 2155 break;
2122 2156
2123 if (id != BAD_APICID) 2157 if (id != BAD_APICID)
@@ -2209,7 +2243,7 @@ static int __init apic_set_verbosity(char *arg)
2209 else if (strcmp("verbose", arg) == 0) 2243 else if (strcmp("verbose", arg) == 0)
2210 apic_verbosity = APIC_VERBOSE; 2244 apic_verbosity = APIC_VERBOSE;
2211 else { 2245 else {
2212 printk(KERN_WARNING "APIC Verbosity level %s not recognised" 2246 pr_warning("APIC Verbosity level %s not recognised"
2213 " use apic=verbose or apic=debug\n", arg); 2247 " use apic=verbose or apic=debug\n", arg);
2214 return -EINVAL; 2248 return -EINVAL;
2215 } 2249 }
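
For reference, the verbosity levels parsed above are selected on the kernel command line, e.g. (illustrative boot entry, any bootloader syntax works):

	linux /boot/vmlinuz root=/dev/sda1 apic=debug
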
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5145a6e72bbb..37ba5f85b718 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -160,9 +160,9 @@
160 * Work around byte swap bug in one of the Vaio's BIOS's 160 * Work around byte swap bug in one of the Vaio's BIOS's
161 * (Marc Boucher <marc@mbsi.ca>). 161 * (Marc Boucher <marc@mbsi.ca>).
162 * Exposed the disable flag to dmi so that we can handle known 162 * Exposed the disable flag to dmi so that we can handle known
163 * broken APM (Alan Cox <alan@redhat.com>). 163 * broken APM (Alan Cox <alan@lxorguk.ukuu.org.uk>).
164 * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin 164 * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin
165 * calling it - instead idle. (Alan Cox <alan@redhat.com>) 165 * calling it - instead idle. (Alan Cox <alan@lxorguk.ukuu.org.uk>)
166 * If an APM idle fails log it and idle sensibly 166 * If an APM idle fails log it and idle sensibly
167 * 1.15: Don't queue events to clients who open the device O_WRONLY. 167 * 1.15: Don't queue events to clients who open the device O_WRONLY.
168 * Don't expect replies from clients who open the device O_RDONLY. 168 * Don't expect replies from clients who open the device O_RDONLY.
@@ -301,7 +301,7 @@ extern int (*console_blank_hook)(int);
301 */ 301 */
302#define APM_ZERO_SEGS 302#define APM_ZERO_SEGS
303 303
304#include "apm.h" 304#include <asm/apm.h>
305 305
306/* 306/*
307 * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend. 307 * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
@@ -391,11 +391,7 @@ static int power_off;
391#else 391#else
392static int power_off = 1; 392static int power_off = 1;
393#endif 393#endif
394#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
395static int realmode_power_off = 1;
396#else
397static int realmode_power_off; 394static int realmode_power_off;
398#endif
399#ifdef CONFIG_APM_ALLOW_INTS 395#ifdef CONFIG_APM_ALLOW_INTS
400static int allow_ints = 1; 396static int allow_ints = 1;
401#else 397#else
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 6649d09ad88f..fbf2f33e3080 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -11,7 +11,7 @@
11#include <linux/suspend.h> 11#include <linux/suspend.h>
12#include <linux/kbuild.h> 12#include <linux/kbuild.h>
13#include <asm/ucontext.h> 13#include <asm/ucontext.h>
14#include "sigframe.h" 14#include <asm/sigframe.h>
15#include <asm/pgtable.h> 15#include <asm/pgtable.h>
16#include <asm/fixmap.h> 16#include <asm/fixmap.h>
17#include <asm/processor.h> 17#include <asm/processor.h>
@@ -75,6 +75,7 @@ void foo(void)
75 OFFSET(PT_DS, pt_regs, ds); 75 OFFSET(PT_DS, pt_regs, ds);
76 OFFSET(PT_ES, pt_regs, es); 76 OFFSET(PT_ES, pt_regs, es);
77 OFFSET(PT_FS, pt_regs, fs); 77 OFFSET(PT_FS, pt_regs, fs);
78 OFFSET(PT_GS, pt_regs, gs);
78 OFFSET(PT_ORIG_EAX, pt_regs, orig_ax); 79 OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
79 OFFSET(PT_EIP, pt_regs, ip); 80 OFFSET(PT_EIP, pt_regs, ip);
80 OFFSET(PT_CS, pt_regs, cs); 81 OFFSET(PT_CS, pt_regs, cs);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 7fcf63d22f8b..8793ab33e2c1 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -11,7 +11,6 @@
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/kbuild.h> 13#include <linux/kbuild.h>
14#include <asm/pda.h>
15#include <asm/processor.h> 14#include <asm/processor.h>
16#include <asm/segment.h> 15#include <asm/segment.h>
17#include <asm/thread_info.h> 16#include <asm/thread_info.h>
@@ -20,6 +19,8 @@
20 19
21#include <xen/interface/xen.h> 20#include <xen/interface/xen.h>
22 21
22#include <asm/sigframe.h>
23
23#define __NO_STUBS 1 24#define __NO_STUBS 1
24#undef __SYSCALL 25#undef __SYSCALL
25#undef _ASM_X86_UNISTD_64_H 26#undef _ASM_X86_UNISTD_64_H
@@ -46,16 +47,6 @@ int main(void)
46#endif 47#endif
47 BLANK(); 48 BLANK();
48#undef ENTRY 49#undef ENTRY
49#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
50 ENTRY(kernelstack);
51 ENTRY(oldrsp);
52 ENTRY(pcurrent);
53 ENTRY(irqcount);
54 ENTRY(cpunumber);
55 ENTRY(irqstackptr);
56 ENTRY(data_offset);
57 BLANK();
58#undef ENTRY
59#ifdef CONFIG_PARAVIRT 50#ifdef CONFIG_PARAVIRT
60 BLANK(); 51 BLANK();
61 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); 52 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
@@ -87,7 +78,7 @@ int main(void)
87 BLANK(); 78 BLANK();
88#undef ENTRY 79#undef ENTRY
89 DEFINE(IA32_RT_SIGFRAME_sigcontext, 80 DEFINE(IA32_RT_SIGFRAME_sigcontext,
90 offsetof (struct rt_sigframe32, uc.uc_mcontext)); 81 offsetof (struct rt_sigframe_ia32, uc.uc_mcontext));
91 BLANK(); 82 BLANK();
92#endif 83#endif
93 DEFINE(pbe_address, offsetof(struct pbe, address)); 84 DEFINE(pbe_address, offsetof(struct pbe, address));
diff --git a/arch/x86/kernel/bigsmp_32.c b/arch/x86/kernel/bigsmp_32.c
new file mode 100644
index 000000000000..47a62f46afdb
--- /dev/null
+++ b/arch/x86/kernel/bigsmp_32.c
@@ -0,0 +1,266 @@
1/*
2 * APIC driver for "bigsmp" XAPIC machines with more than 8 virtual CPUs.
3 * Drives the local APIC in "clustered mode".
4 */
5#define APIC_DEFINITION 1
6#include <linux/threads.h>
7#include <linux/cpumask.h>
8#include <asm/mpspec.h>
9#include <asm/genapic.h>
10#include <asm/fixmap.h>
11#include <asm/apicdef.h>
12#include <asm/ipi.h>
13#include <linux/kernel.h>
14#include <linux/init.h>
15#include <linux/dmi.h>
16#include <linux/smp.h>
17
18
19static inline unsigned bigsmp_get_apic_id(unsigned long x)
20{
21 return (x >> 24) & 0xFF;
22}
23
24#define xapic_phys_to_log_apicid(cpu) (per_cpu(x86_bios_cpu_apicid, cpu))
25
26static inline int bigsmp_apic_id_registered(void)
27{
28 return 1;
29}
30
31static inline const cpumask_t *bigsmp_target_cpus(void)
32{
33#ifdef CONFIG_SMP
34 return &cpu_online_map;
35#else
36 return &cpumask_of_cpu(0);
37#endif
38}
39
40#define APIC_DFR_VALUE (APIC_DFR_FLAT)
41
42static inline unsigned long
43bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid)
44{
45 return 0;
46}
47
48static inline unsigned long bigsmp_check_apicid_present(int bit)
49{
50 return 1;
51}
52
53static inline unsigned long calculate_ldr(int cpu)
54{
55 unsigned long val, id;
56 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
57 id = xapic_phys_to_log_apicid(cpu);
58 val |= SET_APIC_LOGICAL_ID(id);
59 return val;
60}
61
62/*
63 * Set up the logical destination ID.
64 *
65 * Intel recommends to set DFR, LDR and TPR before enabling
66 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
67 * document number 292116). So here it goes...
68 */
69static inline void bigsmp_init_apic_ldr(void)
70{
71 unsigned long val;
72 int cpu = smp_processor_id();
73
74 apic_write(APIC_DFR, APIC_DFR_VALUE);
75 val = calculate_ldr(cpu);
76 apic_write(APIC_LDR, val);
77}
78
79static inline void bigsmp_setup_apic_routing(void)
80{
81 printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
82 "Physflat", nr_ioapics);
83}
84
85static inline int bigsmp_apicid_to_node(int logical_apicid)
86{
87 return apicid_2_node[hard_smp_processor_id()];
88}
89
90static inline int bigsmp_cpu_present_to_apicid(int mps_cpu)
91{
92 if (mps_cpu < nr_cpu_ids)
93 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
94
95 return BAD_APICID;
96}
97
98static inline physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
99{
100 return physid_mask_of_physid(phys_apicid);
101}
102
103extern u8 cpu_2_logical_apicid[];
104/* Mapping from cpu number to logical apicid */
105static inline int bigsmp_cpu_to_logical_apicid(int cpu)
106{
107 if (cpu >= nr_cpu_ids)
108 return BAD_APICID;
109 return cpu_physical_id(cpu);
110}
111
112static inline physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)
113{
114 /* For clustered we don't have a good way to do this yet - hack */
115 return physids_promote(0xFFL);
116}
117
118static inline void bigsmp_setup_portio_remap(void)
119{
120}
121
122static inline int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid)
123{
124 return 1;
125}
126
127/* As we are using single CPU as destination, pick only one CPU here */
128static inline unsigned int bigsmp_cpu_mask_to_apicid(const cpumask_t *cpumask)
129{
130 return bigsmp_cpu_to_logical_apicid(first_cpu(*cpumask));
131}
132
133static inline unsigned int
134bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
135 const struct cpumask *andmask)
136{
137 int cpu;
138
139 /*
140 * We're using fixed IRQ delivery, can only return one phys APIC ID.
141 * May as well be the first.
142 */
143 for_each_cpu_and(cpu, cpumask, andmask) {
144 if (cpumask_test_cpu(cpu, cpu_online_mask))
145 break;
146 }
147 if (cpu < nr_cpu_ids)
148 return bigsmp_cpu_to_logical_apicid(cpu);
149
150 return BAD_APICID;
151}
152
153static inline int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
154{
155 return cpuid_apic >> index_msb;
156}
157
158static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector)
159{
160 default_send_IPI_mask_sequence_phys(mask, vector);
161}
162
163static inline void bigsmp_send_IPI_allbutself(int vector)
164{
165 default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
166}
167
168static inline void bigsmp_send_IPI_all(int vector)
169{
170 bigsmp_send_IPI_mask(cpu_online_mask, vector);
171}
172
173static int dmi_bigsmp; /* can be set by dmi scanners */
174
175static int hp_ht_bigsmp(const struct dmi_system_id *d)
176{
177 printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
178 dmi_bigsmp = 1;
179 return 0;
180}
181
182
183static const struct dmi_system_id bigsmp_dmi_table[] = {
184 { hp_ht_bigsmp, "HP ProLiant DL760 G2",
185 { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
186 DMI_MATCH(DMI_BIOS_VERSION, "P44-"),}
187 },
188
189 { hp_ht_bigsmp, "HP ProLiant DL740",
190 { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
191 DMI_MATCH(DMI_BIOS_VERSION, "P47-"),}
192 },
193 { }
194};
195
196static void bigsmp_vector_allocation_domain(int cpu, cpumask_t *retmask)
197{
198 cpus_clear(*retmask);
199 cpu_set(cpu, *retmask);
200}
201
202static int probe_bigsmp(void)
203{
204 if (def_to_bigsmp)
205 dmi_bigsmp = 1;
206 else
207 dmi_check_system(bigsmp_dmi_table);
208 return dmi_bigsmp;
209}
210
211struct genapic apic_bigsmp = {
212
213 .name = "bigsmp",
214 .probe = probe_bigsmp,
215 .acpi_madt_oem_check = NULL,
216 .apic_id_registered = bigsmp_apic_id_registered,
217
218 .irq_delivery_mode = dest_Fixed,
219 /* phys delivery to target CPU: */
220 .irq_dest_mode = 0,
221
222 .target_cpus = bigsmp_target_cpus,
223 .disable_esr = 1,
224 .dest_logical = 0,
225 .check_apicid_used = bigsmp_check_apicid_used,
226 .check_apicid_present = bigsmp_check_apicid_present,
227
228 .vector_allocation_domain = bigsmp_vector_allocation_domain,
229 .init_apic_ldr = bigsmp_init_apic_ldr,
230
231 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
232 .setup_apic_routing = bigsmp_setup_apic_routing,
233 .multi_timer_check = NULL,
234 .apicid_to_node = bigsmp_apicid_to_node,
235 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
236 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
237 .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present,
238 .setup_portio_remap = NULL,
239 .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
240 .enable_apic_mode = NULL,
241 .phys_pkg_id = bigsmp_phys_pkg_id,
242 .mps_oem_check = NULL,
243
244 .get_apic_id = bigsmp_get_apic_id,
245 .set_apic_id = NULL,
246 .apic_id_mask = 0xFF << 24,
247
248 .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid,
249 .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and,
250
251 .send_IPI_mask = bigsmp_send_IPI_mask,
252 .send_IPI_mask_allbutself = NULL,
253 .send_IPI_allbutself = bigsmp_send_IPI_allbutself,
254 .send_IPI_all = bigsmp_send_IPI_all,
255 .send_IPI_self = default_send_IPI_self,
256
257 .wakeup_cpu = NULL,
258 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
259 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
260
261 .wait_for_init_deassert = default_wait_for_init_deassert,
262
263 .smp_callin_clear_local_apic = NULL,
264 .store_NMI_vector = NULL,
265 .inquire_remote_apic = default_inquire_remote_apic,
266};
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index f0dfe6f17e7e..f63882728d91 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -25,7 +25,7 @@
25#include <asm/uv/bios.h> 25#include <asm/uv/bios.h>
26#include <asm/uv/uv_hub.h> 26#include <asm/uv/uv_hub.h>
27 27
28struct uv_systab uv_systab; 28static struct uv_systab uv_systab;
29 29
30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) 30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
31{ 31{
@@ -69,10 +69,10 @@ s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
69 69
70long sn_partition_id; 70long sn_partition_id;
71EXPORT_SYMBOL_GPL(sn_partition_id); 71EXPORT_SYMBOL_GPL(sn_partition_id);
72long uv_coherency_id; 72long sn_coherency_id;
73EXPORT_SYMBOL_GPL(uv_coherency_id); 73EXPORT_SYMBOL_GPL(sn_coherency_id);
74long uv_region_size; 74long sn_region_size;
75EXPORT_SYMBOL_GPL(uv_region_size); 75EXPORT_SYMBOL_GPL(sn_region_size);
76int uv_type; 76int uv_type;
77 77
78 78
@@ -100,6 +100,56 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
100 return ret; 100 return ret;
101} 101}
102 102
103int
104uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size,
105 unsigned long *intr_mmr_offset)
106{
107 union uv_watchlist_u size_blade;
108 u64 watchlist;
109 s64 ret;
110
111 size_blade.size = mq_size;
112 size_blade.blade = blade;
113
114 /*
115 * bios returns watchlist number or negative error number.
116 */
117 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
118 size_blade.val, (u64)intr_mmr_offset,
119 (u64)&watchlist, 0);
120 if (ret < BIOS_STATUS_SUCCESS)
121 return ret;
122
123 return watchlist;
124}
125EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc);
126
127int
128uv_bios_mq_watchlist_free(int blade, int watchlist_num)
129{
130 return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE,
131 blade, watchlist_num, 0, 0, 0);
132}
133EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free);
134
135s64
136uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms)
137{
138 return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len,
139 perms, 0, 0);
140}
141EXPORT_SYMBOL_GPL(uv_bios_change_memprotect);
142
143s64
144uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len)
145{
146 s64 ret;
147
148 ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie,
149 (u64)addr, buf, (u64)len, 0);
150 return ret;
151}
152EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa);
103 153
104s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) 154s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
105{ 155{
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
new file mode 100644
index 000000000000..2ac0ab71412a
--- /dev/null
+++ b/arch/x86/kernel/check.c
@@ -0,0 +1,161 @@
1#include <linux/module.h>
2#include <linux/sched.h>
3#include <linux/kthread.h>
4#include <linux/workqueue.h>
5#include <asm/e820.h>
6#include <asm/proto.h>
7
8/*
9 * Some BIOSes seem to corrupt the low 64k of memory during events
10 * like suspend/resume and unplugging an HDMI cable. Reserve all
11 * remaining free memory in that area and fill it with a distinct
12 * pattern.
13 */
14#define MAX_SCAN_AREAS 8
15
16static int __read_mostly memory_corruption_check = -1;
17
18static unsigned __read_mostly corruption_check_size = 64*1024;
19static unsigned __read_mostly corruption_check_period = 60; /* seconds */
20
21static struct e820entry scan_areas[MAX_SCAN_AREAS];
22static int num_scan_areas;
23
24
25static __init int set_corruption_check(char *arg)
26{
27 char *end;
28
29 memory_corruption_check = simple_strtol(arg, &end, 10);
30
31 return (*end == 0) ? 0 : -EINVAL;
32}
33early_param("memory_corruption_check", set_corruption_check);
34
35static __init int set_corruption_check_period(char *arg)
36{
37 char *end;
38
39 corruption_check_period = simple_strtoul(arg, &end, 10);
40
41 return (*end == 0) ? 0 : -EINVAL;
42}
43early_param("memory_corruption_check_period", set_corruption_check_period);
44
45static __init int set_corruption_check_size(char *arg)
46{
47 char *end;
48 unsigned size;
49
50 size = memparse(arg, &end);
51
52 if (*end == '\0')
53 corruption_check_size = size;
54
55 return (size == corruption_check_size) ? 0 : -EINVAL;
56}
57early_param("memory_corruption_check_size", set_corruption_check_size);
58
59
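The three early_param() hooks above give boot-time control over the scanner. An illustrative command line (values invented; the size accepts memparse suffixes such as K/M) that enables the check over the default 64k window and scans every five minutes:

	memory_corruption_check=1 memory_corruption_check_size=64K memory_corruption_check_period=300
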
60void __init setup_bios_corruption_check(void)
61{
62 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
63
64 if (memory_corruption_check == -1) {
65 memory_corruption_check =
66#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
67 1
68#else
69 0
70#endif
71 ;
72 }
73
74 if (corruption_check_size == 0)
75 memory_corruption_check = 0;
76
77 if (!memory_corruption_check)
78 return;
79
80 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
81
82 while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
83 u64 size;
84 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
85
86 if (addr == 0)
87 break;
88
89 if ((addr + size) > corruption_check_size)
90 size = corruption_check_size - addr;
91
92 if (size == 0)
93 break;
94
95 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
96 scan_areas[num_scan_areas].addr = addr;
97 scan_areas[num_scan_areas].size = size;
98 num_scan_areas++;
99
100 /* Assume we've already mapped this early memory */
101 memset(__va(addr), 0, size);
102
103 addr += size;
104 }
105
106 printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
107 num_scan_areas);
108 update_e820();
109}
110
111
112void check_for_bios_corruption(void)
113{
114 int i;
115 int corruption = 0;
116
117 if (!memory_corruption_check)
118 return;
119
120 for (i = 0; i < num_scan_areas; i++) {
121 unsigned long *addr = __va(scan_areas[i].addr);
122 unsigned long size = scan_areas[i].size;
123
124 for (; size; addr++, size -= sizeof(unsigned long)) {
125 if (!*addr)
126 continue;
127 printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
128 addr, __pa(addr), *addr);
129 corruption = 1;
130 *addr = 0;
131 }
132 }
133
134 WARN_ONCE(corruption, KERN_ERR "Memory corruption detected in low memory\n");
135}
136
137static void check_corruption(struct work_struct *dummy);
138static DECLARE_DELAYED_WORK(bios_check_work, check_corruption);
139
140static void check_corruption(struct work_struct *dummy)
141{
142 check_for_bios_corruption();
143 schedule_delayed_work(&bios_check_work,
144 round_jiffies_relative(corruption_check_period*HZ));
145}
146
147static int start_periodic_check_for_corruption(void)
148{
149 if (!memory_corruption_check || corruption_check_period == 0)
150 return 0;
151
152 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
153 corruption_check_period);
154
155 /* First time we run the checks right away */
156 schedule_delayed_work(&bios_check_work, 0);
157 return 0;
158}
159
160module_init(start_periodic_check_for_corruption);
161
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82ec6075c057..82db7f45e2de 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -2,8 +2,14 @@
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details and quirks
3# 3#
4 4
5# Don't trace early stages of a secondary CPU boot
6ifdef CONFIG_FUNCTION_TRACER
7CFLAGS_REMOVE_common.o = -pg
8endif
9
5obj-y := intel_cacheinfo.o addon_cpuid_features.o 10obj-y := intel_cacheinfo.o addon_cpuid_features.o
6obj-y += proc.o capflags.o powerflags.o common.o 11obj-y += proc.o capflags.o powerflags.o common.o
12obj-y += vmware.o hypervisor.o
7 13
8obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 14obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
9obj-$(CONFIG_X86_64) += bugs_64.o 15obj-$(CONFIG_X86_64) += bugs_64.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index ef8f831af823..e48640cfac0c 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -7,7 +7,7 @@
7#include <asm/pat.h> 7#include <asm/pat.h>
8#include <asm/processor.h> 8#include <asm/processor.h>
9 9
10#include <mach_apic.h> 10#include <asm/genapic.h>
11 11
12struct cpuid_bit { 12struct cpuid_bit {
13 u16 feature; 13 u16 feature;
@@ -69,7 +69,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
69 */ 69 */
70void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) 70void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
71{ 71{
72#ifdef CONFIG_X86_SMP 72#ifdef CONFIG_SMP
73 unsigned int eax, ebx, ecx, edx, sub_index; 73 unsigned int eax, ebx, ecx, edx, sub_index;
74 unsigned int ht_mask_width, core_plus_mask_width; 74 unsigned int ht_mask_width, core_plus_mask_width;
75 unsigned int core_select_mask, core_level_siblings; 75 unsigned int core_select_mask, core_level_siblings;
@@ -116,14 +116,14 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
116 116
117 core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; 117 core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
118 118
119#ifdef CONFIG_X86_32 119 c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width)
120 c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
121 & core_select_mask; 120 & core_select_mask;
122 c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width); 121 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width);
123#else 122 /*
124 c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask; 123 * Reinit the apicid, now that we have extended initial_apicid.
125 c->phys_proc_id = phys_pkg_id(core_plus_mask_width); 124 */
126#endif 125 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
126
127 c->x86_max_cores = (core_level_siblings / smp_num_siblings); 127 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
128 128
129 129
@@ -135,37 +135,3 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
135 return; 135 return;
136#endif 136#endif
137} 137}
138
139#ifdef CONFIG_X86_PAT
140void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
141{
142 if (!cpu_has_pat)
143 pat_disable("PAT not supported by CPU.");
144
145 switch (c->x86_vendor) {
146 case X86_VENDOR_INTEL:
147 /*
148 * There is a known erratum on Pentium III and Core Solo
149 * and Core Duo CPUs.
150 * " Page with PAT set to WC while associated MTRR is UC
151 * may consolidate to UC "
152 * Because of this erratum, it is better to stick with
153 * setting WC in MTRR rather than using PAT on these CPUs.
154 *
155 * Enable PAT WC only on P4, Core 2 or later CPUs.
156 */
157 if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15))
158 return;
159
160 pat_disable("PAT WC disabled due to known CPU erratum.");
161 return;
162
163 case X86_VENDOR_AMD:
164 case X86_VENDOR_CENTAUR:
165 case X86_VENDOR_TRANSMETA:
166 return;
167 }
168
169 pat_disable("PAT disabled. Not yet verified on this CPU type.");
170}
171#endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 8f1e31db2ad5..ff4d7b9e32e4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -12,7 +12,7 @@
12# include <asm/cacheflush.h> 12# include <asm/cacheflush.h>
13#endif 13#endif
14 14
15#include <mach_apic.h> 15#include <asm/genapic.h>
16 16
17#include "cpu.h" 17#include "cpu.h"
18 18
@@ -283,9 +283,14 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
283{ 283{
284 early_init_amd_mc(c); 284 early_init_amd_mc(c);
285 285
286 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ 286 /*
287 if (c->x86_power & (1<<8)) 287 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
288 * with P/T states and does not stop in deep C-states
289 */
290 if (c->x86_power & (1 << 8)) {
288 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 291 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
292 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
293 }
289 294
290#ifdef CONFIG_X86_64 295#ifdef CONFIG_X86_64
291 set_cpu_cap(c, X86_FEATURE_SYSCALL32); 296 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
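
The hunk above keys both CONSTANT_TSC and the new NONSTOP_TSC flag off bit 8 of CPUID leaf 0x80000007 EDX (the "invariant TSC" bit). The same bit can be inspected from user space with the compiler's <cpuid.h> helper (GCC/Clang); a small sketch:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 0x80000007: Advanced Power Management information. */
	if (!__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx)) {
		printf("CPUID leaf 0x80000007 not supported\n");
		return 1;
	}
	printf("invariant TSC (EDX bit 8): %s\n",
	       (edx & (1u << 8)) ? "yes" : "no");
	return 0;
}
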
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b9c9ea0217a9..e8f4a386bd9d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -21,14 +21,16 @@
21#include <asm/asm.h> 21#include <asm/asm.h>
22#include <asm/numa.h> 22#include <asm/numa.h>
23#include <asm/smp.h> 23#include <asm/smp.h>
24#include <asm/cpu.h>
25#include <asm/cpumask.h>
24#ifdef CONFIG_X86_LOCAL_APIC 26#ifdef CONFIG_X86_LOCAL_APIC
25#include <asm/mpspec.h> 27#include <asm/mpspec.h>
26#include <asm/apic.h> 28#include <asm/apic.h>
27#include <mach_apic.h>
28#include <asm/genapic.h> 29#include <asm/genapic.h>
30#include <asm/genapic.h>
31#include <asm/uv/uv.h>
29#endif 32#endif
30 33
31#include <asm/pda.h>
32#include <asm/pgtable.h> 34#include <asm/pgtable.h>
33#include <asm/processor.h> 35#include <asm/processor.h>
34#include <asm/desc.h> 36#include <asm/desc.h>
@@ -36,28 +38,59 @@
36#include <asm/proto.h> 38#include <asm/proto.h>
37#include <asm/sections.h> 39#include <asm/sections.h>
38#include <asm/setup.h> 40#include <asm/setup.h>
41#include <asm/hypervisor.h>
42#include <asm/stackprotector.h>
39 43
40#include "cpu.h" 44#include "cpu.h"
41 45
46#ifdef CONFIG_X86_64
47
48/* all of these masks are initialized in setup_cpu_local_masks() */
49cpumask_var_t cpu_callin_mask;
50cpumask_var_t cpu_callout_mask;
51cpumask_var_t cpu_initialized_mask;
52
53/* representing cpus for which sibling maps can be computed */
54cpumask_var_t cpu_sibling_setup_mask;
55
56/* correctly size the local cpu masks */
57void __init setup_cpu_local_masks(void)
58{
59 alloc_bootmem_cpumask_var(&cpu_initialized_mask);
60 alloc_bootmem_cpumask_var(&cpu_callin_mask);
61 alloc_bootmem_cpumask_var(&cpu_callout_mask);
62 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
63}
64
65#else /* CONFIG_X86_32 */
66
67cpumask_t cpu_callin_map;
68cpumask_t cpu_callout_map;
69cpumask_t cpu_initialized;
70cpumask_t cpu_sibling_setup_map;
71
72#endif /* CONFIG_X86_32 */
73
74
42static struct cpu_dev *this_cpu __cpuinitdata; 75static struct cpu_dev *this_cpu __cpuinitdata;
43 76
77DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
44#ifdef CONFIG_X86_64 78#ifdef CONFIG_X86_64
45/* We need valid kernel segments for data and code in long mode too 79 /*
46 * IRET will check the segment types kkeil 2000/10/28 80 * We need valid kernel segments for data and code in long mode too
47 * Also sysret mandates a special GDT layout 81 * IRET will check the segment types kkeil 2000/10/28
48 */ 82 * Also sysret mandates a special GDT layout
49/* The TLS descriptors are currently at a different place compared to i386. 83 *
50 Hopefully nobody expects them at a fixed place (Wine?) */ 84 * The TLS descriptors are currently at a different place compared to i386.
51DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { 85 * Hopefully nobody expects them at a fixed place (Wine?)
86 */
52 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, 87 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
53 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, 88 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
54 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, 89 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
55 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, 90 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
56 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, 91 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
57 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, 92 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
58} };
59#else 93#else
60DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
61 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 94 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
62 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 95 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
63 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, 96 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -89,9 +122,10 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
89 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, 122 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
90 123
91 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 124 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
92 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, 125 [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
93} }; 126 GDT_STACK_CANARY_INIT
94#endif 127#endif
128} };
95EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 129EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
96 130
97#ifdef CONFIG_X86_32 131#ifdef CONFIG_X86_32
@@ -192,6 +226,49 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
192#endif 226#endif
193 227
194/* 228/*
229 * Some CPU features depend on higher CPUID levels, which may not always
230 * be available due to CPUID level capping or broken virtualization
231 * software. Add those features to this table to auto-disable them.
232 */
233struct cpuid_dependent_feature {
234 u32 feature;
235 u32 level;
236};
237static const struct cpuid_dependent_feature __cpuinitconst
238cpuid_dependent_features[] = {
239 { X86_FEATURE_MWAIT, 0x00000005 },
240 { X86_FEATURE_DCA, 0x00000009 },
241 { X86_FEATURE_XSAVE, 0x0000000d },
242 { 0, 0 }
243};
244
245static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
246{
247 const struct cpuid_dependent_feature *df;
248 for (df = cpuid_dependent_features; df->feature; df++) {
249 /*
250 * Note: cpuid_level is set to -1 if unavailable, but
 251 * extended_cpuid_level is set to 0 if unavailable
252 * and the legitimate extended levels are all negative
253 * when signed; hence the weird messing around with
254 * signs here...
255 */
256 if (cpu_has(c, df->feature) &&
 257 ((s32)df->level < 0 ?
 258 (u32)df->level > (u32)c->extended_cpuid_level :
 259 (s32)df->level > (s32)c->cpuid_level)) {
260 clear_cpu_cap(c, df->feature);
261 if (warn)
262 printk(KERN_WARNING
263 "CPU: CPU feature %s disabled "
264 "due to lack of CPUID level 0x%x\n",
265 x86_cap_flags[df->feature],
266 df->level);
267 }
268 }
269}
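
The comment in filter_cpuid_features() is terse: extended CPUID leaves (0x8000000x) are negative when viewed as s32, which is how one comparison can be checked against either the standard or the extended maximum level. A user-space sketch of just that comparison, with invented maximum-level values:

#include <stdio.h>
#include <stdint.h>

#define CPUID_LEVEL	0x0000000b	/* invented max standard leaf */
#define EXT_CPUID_LEVEL	0x80000008	/* invented max extended leaf */

/* Does the CPU report at least the required leaf? */
static int level_ok(uint32_t required)
{
	if ((int32_t)required < 0)	/* 0x8000000x: extended range */
		return required <= (uint32_t)EXT_CPUID_LEVEL;
	return (int32_t)required <= (int32_t)CPUID_LEVEL;
}

int main(void)
{
	printf("0x0000000d available: %d\n", level_ok(0x0000000d));
	printf("0x80000007 available: %d\n", level_ok(0x80000007));
	return 0;
}
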
270
271/*
195 * Naming convention should be: <Name> [(<Codename>)] 272 * Naming convention should be: <Name> [(<Codename>)]
196 * This table only is used unless init_<vendor>() below doesn't set it; 273 * This table only is used unless init_<vendor>() below doesn't set it;
197 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used 274 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
@@ -221,18 +298,29 @@ static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
221 298
222__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 299__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
223 300
301void load_percpu_segment(int cpu)
302{
303#ifdef CONFIG_X86_32
304 loadsegment(fs, __KERNEL_PERCPU);
305#else
306 loadsegment(gs, 0);
307 wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
308#endif
309 load_stack_canary_segment();
310}
311
224/* Current gdt points %fs at the "master" per-cpu area: after this, 312/* Current gdt points %fs at the "master" per-cpu area: after this,
225 * it's on the real one. */ 313 * it's on the real one. */
226void switch_to_new_gdt(void) 314void switch_to_new_gdt(int cpu)
227{ 315{
228 struct desc_ptr gdt_descr; 316 struct desc_ptr gdt_descr;
229 317
230 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); 318 gdt_descr.address = (long)get_cpu_gdt_table(cpu);
231 gdt_descr.size = GDT_SIZE - 1; 319 gdt_descr.size = GDT_SIZE - 1;
232 load_gdt(&gdt_descr); 320 load_gdt(&gdt_descr);
233#ifdef CONFIG_X86_32 321 /* Reload the per-cpu base */
234 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); 322
235#endif 323 load_percpu_segment(cpu);
236} 324}
237 325
238static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; 326static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
@@ -354,7 +442,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
354 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 442 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
355 } else if (smp_num_siblings > 1) { 443 } else if (smp_num_siblings > 1) {
356 444
357 if (smp_num_siblings > NR_CPUS) { 445 if (smp_num_siblings > nr_cpu_ids) {
358 printk(KERN_WARNING "CPU: Unsupported number of siblings %d", 446 printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
359 smp_num_siblings); 447 smp_num_siblings);
360 smp_num_siblings = 1; 448 smp_num_siblings = 1;
@@ -362,11 +450,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
362 } 450 }
363 451
364 index_msb = get_count_order(smp_num_siblings); 452 index_msb = get_count_order(smp_num_siblings);
365#ifdef CONFIG_X86_64 453 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
366 c->phys_proc_id = phys_pkg_id(index_msb);
367#else
368 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
369#endif
370 454
371 smp_num_siblings = smp_num_siblings / c->x86_max_cores; 455 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
372 456
@@ -374,13 +458,8 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
374 458
375 core_bits = get_count_order(c->x86_max_cores); 459 core_bits = get_count_order(c->x86_max_cores);
376 460
377#ifdef CONFIG_X86_64 461 c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
378 c->cpu_core_id = phys_pkg_id(index_msb) &
379 ((1 << core_bits) - 1); 462 ((1 << core_bits) - 1);
380#else
381 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
382 ((1 << core_bits) - 1);
383#endif
384 } 463 }
385 464
386out: 465out:
@@ -549,11 +628,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
549 if (this_cpu->c_early_init) 628 if (this_cpu->c_early_init)
550 this_cpu->c_early_init(c); 629 this_cpu->c_early_init(c);
551 630
552 validate_pat_support(c);
553
554#ifdef CONFIG_SMP 631#ifdef CONFIG_SMP
555 c->cpu_index = boot_cpu_id; 632 c->cpu_index = boot_cpu_id;
556#endif 633#endif
634 filter_cpuid_features(c, false);
557} 635}
558 636
559void __init early_cpu_init(void) 637void __init early_cpu_init(void)
@@ -616,7 +694,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
616 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; 694 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
617#ifdef CONFIG_X86_32 695#ifdef CONFIG_X86_32
618# ifdef CONFIG_X86_HT 696# ifdef CONFIG_X86_HT
619 c->apicid = phys_pkg_id(c->initial_apicid, 0); 697 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
620# else 698# else
621 c->apicid = c->initial_apicid; 699 c->apicid = c->initial_apicid;
622# endif 700# endif
@@ -663,7 +741,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
663 this_cpu->c_identify(c); 741 this_cpu->c_identify(c);
664 742
665#ifdef CONFIG_X86_64 743#ifdef CONFIG_X86_64
666 c->apicid = phys_pkg_id(0); 744 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
667#endif 745#endif
668 746
669 /* 747 /*
@@ -687,6 +765,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
687 * we do "generic changes." 765 * we do "generic changes."
688 */ 766 */
689 767
768 /* Filter out anything that depends on CPUID levels we don't have */
769 filter_cpuid_features(c, true);
770
690 /* If the model name is still unset, do table lookup. */ 771 /* If the model name is still unset, do table lookup. */
691 if (!c->x86_model_id[0]) { 772 if (!c->x86_model_id[0]) {
692 char *p; 773 char *p;
@@ -703,6 +784,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
703 detect_ht(c); 784 detect_ht(c);
704#endif 785#endif
705 786
787 init_hypervisor(c);
706 /* 788 /*
707 * On SMP, boot_cpu_data holds the common feature set between 789 * On SMP, boot_cpu_data holds the common feature set between
708 * all CPUs; so make sure that we indicate which features are 790 * all CPUs; so make sure that we indicate which features are
@@ -854,57 +936,23 @@ static __init int setup_disablecpuid(char *arg)
854} 936}
855__setup("clearcpuid=", setup_disablecpuid); 937__setup("clearcpuid=", setup_disablecpuid);
856 938
857cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
858
859#ifdef CONFIG_X86_64 939#ifdef CONFIG_X86_64
860struct x8664_pda **_cpu_pda __read_mostly;
861EXPORT_SYMBOL(_cpu_pda);
862
863struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; 940struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
864 941
865char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; 942DEFINE_PER_CPU_FIRST(union irq_stack_union,
943 irq_stack_union) __aligned(PAGE_SIZE);
944DEFINE_PER_CPU(char *, irq_stack_ptr) =
945 init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
866 946
867void __cpuinit pda_init(int cpu) 947DEFINE_PER_CPU(unsigned long, kernel_stack) =
868{ 948 (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
869 struct x8664_pda *pda = cpu_pda(cpu); 949EXPORT_PER_CPU_SYMBOL(kernel_stack);
870 950
871 /* Setup up data that may be needed in __get_free_pages early */ 951DEFINE_PER_CPU(unsigned int, irq_count) = -1;
872 loadsegment(fs, 0);
873 loadsegment(gs, 0);
874 /* Memory clobbers used to order PDA accessed */
875 mb();
876 wrmsrl(MSR_GS_BASE, pda);
877 mb();
878
879 pda->cpunumber = cpu;
880 pda->irqcount = -1;
881 pda->kernelstack = (unsigned long)stack_thread_info() -
882 PDA_STACKOFFSET + THREAD_SIZE;
883 pda->active_mm = &init_mm;
884 pda->mmu_state = 0;
885
886 if (cpu == 0) {
887 /* others are initialized in smpboot.c */
888 pda->pcurrent = &init_task;
889 pda->irqstackptr = boot_cpu_stack;
890 pda->irqstackptr += IRQSTACKSIZE - 64;
891 } else {
892 if (!pda->irqstackptr) {
893 pda->irqstackptr = (char *)
894 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
895 if (!pda->irqstackptr)
896 panic("cannot allocate irqstack for cpu %d",
897 cpu);
898 pda->irqstackptr += IRQSTACKSIZE - 64;
899 }
900 952
901 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) 953static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
902 pda->nodenumber = cpu_to_node(cpu); 954 [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
903 } 955 __aligned(PAGE_SIZE);
904}
905
906char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
907 DEBUG_STKSZ] __page_aligned_bss;
908 956
909extern asmlinkage void ignore_sysret(void); 957extern asmlinkage void ignore_sysret(void);
910 958
@@ -937,16 +985,21 @@ unsigned long kernel_eflags;
937 */ 985 */
938DEFINE_PER_CPU(struct orig_ist, orig_ist); 986DEFINE_PER_CPU(struct orig_ist, orig_ist);
939 987
940#else 988#else /* x86_64 */
989
990#ifdef CONFIG_CC_STACKPROTECTOR
991DEFINE_PER_CPU(unsigned long, stack_canary);
992#endif
941 993
942/* Make sure %fs is initialized properly in idle threads */ 994/* Make sure %fs and %gs are initialized properly in idle threads */
943struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) 995struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
944{ 996{
945 memset(regs, 0, sizeof(struct pt_regs)); 997 memset(regs, 0, sizeof(struct pt_regs));
946 regs->fs = __KERNEL_PERCPU; 998 regs->fs = __KERNEL_PERCPU;
999 regs->gs = __KERNEL_STACK_CANARY;
947 return regs; 1000 return regs;
948} 1001}
949#endif 1002#endif /* x86_64 */
950 1003
951/* 1004/*
952 * cpu_init() initializes state that is per-CPU. Some data is already 1005 * cpu_init() initializes state that is per-CPU. Some data is already
@@ -962,19 +1015,18 @@ void __cpuinit cpu_init(void)
962 struct tss_struct *t = &per_cpu(init_tss, cpu); 1015 struct tss_struct *t = &per_cpu(init_tss, cpu);
963 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); 1016 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
964 unsigned long v; 1017 unsigned long v;
965 char *estacks = NULL;
966 struct task_struct *me; 1018 struct task_struct *me;
967 int i; 1019 int i;
968 1020
969 /* CPU 0 is initialised in head64.c */ 1021#ifdef CONFIG_NUMA
970 if (cpu != 0) 1022 if (cpu != 0 && percpu_read(node_number) == 0 &&
971 pda_init(cpu); 1023 cpu_to_node(cpu) != NUMA_NO_NODE)
972 else 1024 percpu_write(node_number, cpu_to_node(cpu));
973 estacks = boot_exception_stacks; 1025#endif
974 1026
975 me = current; 1027 me = current;
976 1028
977 if (cpu_test_and_set(cpu, cpu_initialized)) 1029 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
978 panic("CPU#%d already initialized!\n", cpu); 1030 panic("CPU#%d already initialized!\n", cpu);
979 1031
980 printk(KERN_INFO "Initializing CPU#%d\n", cpu); 1032 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
@@ -986,7 +1038,9 @@ void __cpuinit cpu_init(void)
986 * and set up the GDT descriptor: 1038 * and set up the GDT descriptor:
987 */ 1039 */
988 1040
989 switch_to_new_gdt(); 1041 switch_to_new_gdt(cpu);
1042 loadsegment(fs, 0);
1043
990 load_idt((const struct desc_ptr *)&idt_descr); 1044 load_idt((const struct desc_ptr *)&idt_descr);
991 1045
992 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); 1046 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
@@ -1004,18 +1058,13 @@ void __cpuinit cpu_init(void)
1004 * set up and load the per-CPU TSS 1058 * set up and load the per-CPU TSS
1005 */ 1059 */
1006 if (!orig_ist->ist[0]) { 1060 if (!orig_ist->ist[0]) {
1007 static const unsigned int order[N_EXCEPTION_STACKS] = { 1061 static const unsigned int sizes[N_EXCEPTION_STACKS] = {
1008 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, 1062 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
1009 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER 1063 [DEBUG_STACK - 1] = DEBUG_STKSZ
1010 }; 1064 };
1065 char *estacks = per_cpu(exception_stacks, cpu);
1011 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 1066 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1012 if (cpu) { 1067 estacks += sizes[v];
1013 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1014 if (!estacks)
1015 panic("Cannot allocate exception "
1016 "stack %ld %d\n", v, cpu);
1017 }
1018 estacks += PAGE_SIZE << order[v];
1019 orig_ist->ist[v] = t->x86_tss.ist[v] = 1068 orig_ist->ist[v] = t->x86_tss.ist[v] =
1020 (unsigned long)estacks; 1069 (unsigned long)estacks;
1021 } 1070 }
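
The reworked loop above carves the static per-cpu exception_stacks area into the IST entries, each EXCEPTION_STKSZ bytes except the larger DEBUG stack. The pointer arithmetic boils down to the following sketch (sizes and indices invented; it uses the same GCC range-designator initializer as the kernel source, so build with gcc):

#include <stdio.h>

#define N_STACKS	5
#define STKSZ		4096
#define DEBUG_STKSZ	(4 * 4096)
#define DEBUG_STACK	4		/* 1-based index, as in the kernel enum */

static char area[(N_STACKS - 1) * STKSZ + DEBUG_STKSZ];

int main(void)
{
	static const unsigned int sizes[N_STACKS] = {
		[0 ... N_STACKS - 1] = STKSZ,
		[DEBUG_STACK - 1] = DEBUG_STKSZ,
	};
	char *p = area;
	int v;

	for (v = 0; v < N_STACKS; v++) {
		p += sizes[v];
		/* each IST entry points at the top of its stack */
		printf("ist[%d] = area + %td\n", v, p - area);
	}
	return 0;
}
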
@@ -1049,22 +1098,19 @@ void __cpuinit cpu_init(void)
1049 */ 1098 */
1050 if (kgdb_connected && arch_kgdb_ops.correct_hw_break) 1099 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1051 arch_kgdb_ops.correct_hw_break(); 1100 arch_kgdb_ops.correct_hw_break();
1052 else { 1101 else
1053#endif 1102#endif
1054 /* 1103 {
1055 * Clear all 6 debug registers: 1104 /*
1056 */ 1105 * Clear all 6 debug registers:
1057 1106 */
1058 set_debugreg(0UL, 0); 1107 set_debugreg(0UL, 0);
1059 set_debugreg(0UL, 1); 1108 set_debugreg(0UL, 1);
1060 set_debugreg(0UL, 2); 1109 set_debugreg(0UL, 2);
1061 set_debugreg(0UL, 3); 1110 set_debugreg(0UL, 3);
1062 set_debugreg(0UL, 6); 1111 set_debugreg(0UL, 6);
1063 set_debugreg(0UL, 7); 1112 set_debugreg(0UL, 7);
1064#ifdef CONFIG_KGDB
1065 /* If the kgdb is connected no debug regs should be altered. */
1066 } 1113 }
1067#endif
1068 1114
1069 fpu_init(); 1115 fpu_init();
1070 1116
@@ -1083,7 +1129,7 @@ void __cpuinit cpu_init(void)
1083 struct tss_struct *t = &per_cpu(init_tss, cpu); 1129 struct tss_struct *t = &per_cpu(init_tss, cpu);
1084 struct thread_struct *thread = &curr->thread; 1130 struct thread_struct *thread = &curr->thread;
1085 1131
1086 if (cpu_test_and_set(cpu, cpu_initialized)) { 1132 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
1087 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); 1133 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
1088 for (;;) local_irq_enable(); 1134 for (;;) local_irq_enable();
1089 } 1135 }
@@ -1094,7 +1140,7 @@ void __cpuinit cpu_init(void)
1094 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 1140 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1095 1141
1096 load_idt(&idt_descr); 1142 load_idt(&idt_descr);
1097 switch_to_new_gdt(); 1143 switch_to_new_gdt(cpu);
1098 1144
1099 /* 1145 /*
1100 * Set up and load the per-CPU TSS and LDT 1146 * Set up and load the per-CPU TSS and LDT
@@ -1115,9 +1161,6 @@ void __cpuinit cpu_init(void)
1115 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); 1161 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
1116#endif 1162#endif
1117 1163
1118 /* Clear %gs. */
1119 asm volatile ("mov %0, %%gs" : : "r" (0));
1120
1121 /* Clear all 6 debug registers: */ 1164 /* Clear all 6 debug registers: */
1122 set_debugreg(0, 0); 1165 set_debugreg(0, 0);
1123 set_debugreg(0, 1); 1166 set_debugreg(0, 1);
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index efae3b22a0ff..65792c2cc462 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -245,17 +245,6 @@ config X86_E_POWERSAVER
245 245
246comment "shared options" 246comment "shared options"
247 247
248config X86_ACPI_CPUFREQ_PROC_INTF
249 bool "/proc/acpi/processor/../performance interface (deprecated)"
250 depends on PROC_FS
251 depends on X86_ACPI_CPUFREQ || X86_POWERNOW_K7_ACPI || X86_POWERNOW_K8_ACPI
252 help
253 This enables the deprecated /proc/acpi/processor/../performance
254 interface. While it is helpful for debugging, the generic,
255 cross-architecture cpufreq interfaces should be used.
256
257 If in doubt, say N.
258
259config X86_SPEEDSTEP_LIB 248config X86_SPEEDSTEP_LIB
260 tristate 249 tristate
261 default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD) 250 default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD)
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 8e48c5d4467d..4b1c319d30c3 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
33#include <linux/cpufreq.h> 33#include <linux/cpufreq.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <linux/ftrace.h>
36 37
37#include <linux/acpi.h> 38#include <linux/acpi.h>
38#include <acpi/processor.h> 39#include <acpi/processor.h>
@@ -144,13 +145,14 @@ typedef union {
144 145
145struct drv_cmd { 146struct drv_cmd {
146 unsigned int type; 147 unsigned int type;
147 cpumask_t mask; 148 const struct cpumask *mask;
148 drv_addr_union addr; 149 drv_addr_union addr;
149 u32 val; 150 u32 val;
150}; 151};
151 152
152static void do_drv_read(struct drv_cmd *cmd) 153static long do_drv_read(void *_cmd)
153{ 154{
155 struct drv_cmd *cmd = _cmd;
154 u32 h; 156 u32 h;
155 157
156 switch (cmd->type) { 158 switch (cmd->type) {
@@ -165,10 +167,12 @@ static void do_drv_read(struct drv_cmd *cmd)
165 default: 167 default:
166 break; 168 break;
167 } 169 }
170 return 0;
168} 171}
169 172
170static void do_drv_write(struct drv_cmd *cmd) 173static long do_drv_write(void *_cmd)
171{ 174{
175 struct drv_cmd *cmd = _cmd;
172 u32 lo, hi; 176 u32 lo, hi;
173 177
174 switch (cmd->type) { 178 switch (cmd->type) {
@@ -185,48 +189,41 @@ static void do_drv_write(struct drv_cmd *cmd)
185 default: 189 default:
186 break; 190 break;
187 } 191 }
192 return 0;
188} 193}
189 194
190static void drv_read(struct drv_cmd *cmd) 195static void drv_read(struct drv_cmd *cmd)
191{ 196{
192 cpumask_t saved_mask = current->cpus_allowed;
193 cmd->val = 0; 197 cmd->val = 0;
194 198
195 set_cpus_allowed_ptr(current, &cmd->mask); 199 work_on_cpu(cpumask_any(cmd->mask), do_drv_read, cmd);
196 do_drv_read(cmd);
197 set_cpus_allowed_ptr(current, &saved_mask);
198} 200}
199 201
200static void drv_write(struct drv_cmd *cmd) 202static void drv_write(struct drv_cmd *cmd)
201{ 203{
202 cpumask_t saved_mask = current->cpus_allowed;
203 unsigned int i; 204 unsigned int i;
204 205
205 for_each_cpu_mask_nr(i, cmd->mask) { 206 for_each_cpu(i, cmd->mask) {
206 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); 207 work_on_cpu(i, do_drv_write, cmd);
207 do_drv_write(cmd);
208 } 208 }
209
210 set_cpus_allowed_ptr(current, &saved_mask);
211 return;
212} 209}
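
The old drv_read()/drv_write() bounced the whole task onto the target CPU with set_cpus_allowed_ptr(); the new code hands the register access to work_on_cpu() instead. The underlying requirement (the MSR or I/O port access must execute on the CPU that owns it) can be mimicked from user space with sched_setaffinity(); a rough Linux-only sketch, not the kernel mechanism itself:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

/* Run fn(arg) while pinned to "cpu", then restore the old affinity. */
static long run_on_cpu(int cpu, long (*fn)(void *), void *arg)
{
	cpu_set_t old, target;
	long ret;

	if (sched_getaffinity(0, sizeof(old), &old))
		return -1;
	CPU_ZERO(&target);
	CPU_SET(cpu, &target);
	if (sched_setaffinity(0, sizeof(target), &target))
		return -1;
	ret = fn(arg);				/* now executing on "cpu" */
	sched_setaffinity(0, sizeof(old), &old);
	return ret;
}

static long where_am_i(void *unused)
{
	(void)unused;
	return sched_getcpu();
}

int main(void)
{
	printf("ran on CPU %ld\n", run_on_cpu(0, where_am_i, NULL));
	return 0;
}
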
213 210
214static u32 get_cur_val(const cpumask_t *mask) 211static u32 get_cur_val(const struct cpumask *mask)
215{ 212{
216 struct acpi_processor_performance *perf; 213 struct acpi_processor_performance *perf;
217 struct drv_cmd cmd; 214 struct drv_cmd cmd;
218 215
219 if (unlikely(cpus_empty(*mask))) 216 if (unlikely(cpumask_empty(mask)))
220 return 0; 217 return 0;
221 218
222 switch (per_cpu(drv_data, first_cpu(*mask))->cpu_feature) { 219 switch (per_cpu(drv_data, cpumask_first(mask))->cpu_feature) {
223 case SYSTEM_INTEL_MSR_CAPABLE: 220 case SYSTEM_INTEL_MSR_CAPABLE:
224 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 221 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
225 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; 222 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
226 break; 223 break;
227 case SYSTEM_IO_CAPABLE: 224 case SYSTEM_IO_CAPABLE:
228 cmd.type = SYSTEM_IO_CAPABLE; 225 cmd.type = SYSTEM_IO_CAPABLE;
229 perf = per_cpu(drv_data, first_cpu(*mask))->acpi_data; 226 perf = per_cpu(drv_data, cpumask_first(mask))->acpi_data;
230 cmd.addr.io.port = perf->control_register.address; 227 cmd.addr.io.port = perf->control_register.address;
231 cmd.addr.io.bit_width = perf->control_register.bit_width; 228 cmd.addr.io.bit_width = perf->control_register.bit_width;
232 break; 229 break;
@@ -234,8 +231,7 @@ static u32 get_cur_val(const cpumask_t *mask)
234 return 0; 231 return 0;
235 } 232 }
236 233
237 cmd.mask = *mask; 234 cmd.mask = mask;
238
239 drv_read(&cmd); 235 drv_read(&cmd);
240 236
241 dprintk("get_cur_val = %u\n", cmd.val); 237 dprintk("get_cur_val = %u\n", cmd.val);
@@ -243,6 +239,30 @@ static u32 get_cur_val(const cpumask_t *mask)
243 return cmd.val; 239 return cmd.val;
244} 240}
245 241
242struct perf_cur {
243 union {
244 struct {
245 u32 lo;
246 u32 hi;
247 } split;
248 u64 whole;
249 } aperf_cur, mperf_cur;
250};
251
252
253static long read_measured_perf_ctrs(void *_cur)
254{
255 struct perf_cur *cur = _cur;
256
257 rdmsr(MSR_IA32_APERF, cur->aperf_cur.split.lo, cur->aperf_cur.split.hi);
258 rdmsr(MSR_IA32_MPERF, cur->mperf_cur.split.lo, cur->mperf_cur.split.hi);
259
260 wrmsr(MSR_IA32_APERF, 0, 0);
261 wrmsr(MSR_IA32_MPERF, 0, 0);
262
263 return 0;
264}
265
246/* 266/*
247 * Return the measured active (C0) frequency on this CPU since last call 267 * Return the measured active (C0) frequency on this CPU since last call
248 * to this function. 268 * to this function.
@@ -259,31 +279,12 @@ static u32 get_cur_val(const cpumask_t *mask)
259static unsigned int get_measured_perf(struct cpufreq_policy *policy, 279static unsigned int get_measured_perf(struct cpufreq_policy *policy,
260 unsigned int cpu) 280 unsigned int cpu)
261{ 281{
262 union { 282 struct perf_cur cur;
263 struct {
264 u32 lo;
265 u32 hi;
266 } split;
267 u64 whole;
268 } aperf_cur, mperf_cur;
269
270 cpumask_t saved_mask;
271 unsigned int perf_percent; 283 unsigned int perf_percent;
272 unsigned int retval; 284 unsigned int retval;
273 285
274 saved_mask = current->cpus_allowed; 286 if (!work_on_cpu(cpu, read_measured_perf_ctrs, &cur))
275 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
276 if (get_cpu() != cpu) {
277 /* We were not able to run on requested processor */
278 put_cpu();
279 return 0; 287 return 0;
280 }
281
282 rdmsr(MSR_IA32_APERF, aperf_cur.split.lo, aperf_cur.split.hi);
283 rdmsr(MSR_IA32_MPERF, mperf_cur.split.lo, mperf_cur.split.hi);
284
285 wrmsr(MSR_IA32_APERF, 0,0);
286 wrmsr(MSR_IA32_MPERF, 0,0);
287 288
288#ifdef __i386__ 289#ifdef __i386__
289 /* 290 /*
@@ -291,37 +292,39 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
291 * Get an approximate value. Return failure in case we cannot get 292 * Get an approximate value. Return failure in case we cannot get
292 * an approximate value. 293 * an approximate value.
293 */ 294 */
294 if (unlikely(aperf_cur.split.hi || mperf_cur.split.hi)) { 295 if (unlikely(cur.aperf_cur.split.hi || cur.mperf_cur.split.hi)) {
295 int shift_count; 296 int shift_count;
296 u32 h; 297 u32 h;
297 298
298 h = max_t(u32, aperf_cur.split.hi, mperf_cur.split.hi); 299 h = max_t(u32, cur.aperf_cur.split.hi, cur.mperf_cur.split.hi);
299 shift_count = fls(h); 300 shift_count = fls(h);
300 301
301 aperf_cur.whole >>= shift_count; 302 cur.aperf_cur.whole >>= shift_count;
302 mperf_cur.whole >>= shift_count; 303 cur.mperf_cur.whole >>= shift_count;
303 } 304 }
304 305
305 if (((unsigned long)(-1) / 100) < aperf_cur.split.lo) { 306 if (((unsigned long)(-1) / 100) < cur.aperf_cur.split.lo) {
306 int shift_count = 7; 307 int shift_count = 7;
307 aperf_cur.split.lo >>= shift_count; 308 cur.aperf_cur.split.lo >>= shift_count;
308 mperf_cur.split.lo >>= shift_count; 309 cur.mperf_cur.split.lo >>= shift_count;
309 } 310 }
310 311
311 if (aperf_cur.split.lo && mperf_cur.split.lo) 312 if (cur.aperf_cur.split.lo && cur.mperf_cur.split.lo)
312 perf_percent = (aperf_cur.split.lo * 100) / mperf_cur.split.lo; 313 perf_percent = (cur.aperf_cur.split.lo * 100) /
314 cur.mperf_cur.split.lo;
313 else 315 else
314 perf_percent = 0; 316 perf_percent = 0;
315 317
316#else 318#else
317 if (unlikely(((unsigned long)(-1) / 100) < aperf_cur.whole)) { 319 if (unlikely(((unsigned long)(-1) / 100) < cur.aperf_cur.whole)) {
318 int shift_count = 7; 320 int shift_count = 7;
319 aperf_cur.whole >>= shift_count; 321 cur.aperf_cur.whole >>= shift_count;
320 mperf_cur.whole >>= shift_count; 322 cur.mperf_cur.whole >>= shift_count;
321 } 323 }
322 324
323 if (aperf_cur.whole && mperf_cur.whole) 325 if (cur.aperf_cur.whole && cur.mperf_cur.whole)
324 perf_percent = (aperf_cur.whole * 100) / mperf_cur.whole; 326 perf_percent = (cur.aperf_cur.whole * 100) /
327 cur.mperf_cur.whole;
325 else 328 else
326 perf_percent = 0; 329 perf_percent = 0;
327 330
@@ -329,10 +332,6 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
329 332
330 retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100; 333 retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100;
331 334
332 put_cpu();
333 set_cpus_allowed_ptr(current, &saved_mask);
334
335 dprintk("cpu %d: performance percent %d\n", cpu, perf_percent);
336 return retval; 335 return retval;
337} 336}
338 337
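
get_measured_perf() now receives APERF/MPERF through struct perf_cur, filled on the target CPU by read_measured_perf_ctrs(); the ratio of the two counters is the fraction of time the core ran at its maximum frequency, and the shifts only guard the multiply-by-100 against overflow. A standalone sketch of that arithmetic (userspace, 64-bit path only, example numbers rather than real MSR reads):

    /* Sketch: APERF/MPERF ratio -> effective frequency, 64-bit path only. */
    #include <stdio.h>
    #include <stdint.h>

    static unsigned int perf_percent(uint64_t aperf, uint64_t mperf)
    {
            /* Keep aperf * 100 from overflowing 64 bits. */
            if (aperf > UINT64_MAX / 100) {
                    aperf >>= 7;
                    mperf >>= 7;
            }
            return mperf ? (unsigned int)(aperf * 100 / mperf) : 0;
    }

    int main(void)
    {
            uint64_t aperf = 1800000000ULL, mperf = 2000000000ULL; /* example deltas */
            unsigned int pct = perf_percent(aperf, mperf);
            unsigned int max_khz = 2400000;                        /* assumed P0 frequency */

            printf("effective frequency: %u kHz (%u%%)\n",
                   max_khz * pct / 100, pct);
            return 0;
    }
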
@@ -350,7 +349,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
350 } 349 }
351 350
352 cached_freq = data->freq_table[data->acpi_data->state].frequency; 351 cached_freq = data->freq_table[data->acpi_data->state].frequency;
353 freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data); 352 freq = extract_freq(get_cur_val(cpumask_of(cpu)), data);
354 if (freq != cached_freq) { 353 if (freq != cached_freq) {
355 /* 354 /*
356 * The dreaded BIOS frequency change behind our back. 355 * The dreaded BIOS frequency change behind our back.
@@ -364,7 +363,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
364 return freq; 363 return freq;
365} 364}
366 365
367static unsigned int check_freqs(const cpumask_t *mask, unsigned int freq, 366static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
368 struct acpi_cpufreq_data *data) 367 struct acpi_cpufreq_data *data)
369{ 368{
370 unsigned int cur_freq; 369 unsigned int cur_freq;
@@ -385,12 +384,12 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
385 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 384 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
386 struct acpi_processor_performance *perf; 385 struct acpi_processor_performance *perf;
387 struct cpufreq_freqs freqs; 386 struct cpufreq_freqs freqs;
388 cpumask_t online_policy_cpus;
389 struct drv_cmd cmd; 387 struct drv_cmd cmd;
390 unsigned int next_state = 0; /* Index into freq_table */ 388 unsigned int next_state = 0; /* Index into freq_table */
391 unsigned int next_perf_state = 0; /* Index into perf table */ 389 unsigned int next_perf_state = 0; /* Index into perf table */
392 unsigned int i; 390 unsigned int i;
393 int result = 0; 391 int result = 0;
392 struct power_trace it;
394 393
395 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); 394 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
396 395
@@ -404,15 +403,10 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
404 data->freq_table, 403 data->freq_table,
405 target_freq, 404 target_freq,
406 relation, &next_state); 405 relation, &next_state);
407 if (unlikely(result)) 406 if (unlikely(result)) {
408 return -ENODEV; 407 result = -ENODEV;
409 408 goto out;
410#ifdef CONFIG_HOTPLUG_CPU 409 }
411 /* cpufreq holds the hotplug lock, so we are safe from here on */
412 cpus_and(online_policy_cpus, cpu_online_map, policy->cpus);
413#else
414 online_policy_cpus = policy->cpus;
415#endif
416 410
417 next_perf_state = data->freq_table[next_state].index; 411 next_perf_state = data->freq_table[next_state].index;
418 if (perf->state == next_perf_state) { 412 if (perf->state == next_perf_state) {
@@ -423,10 +417,12 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
423 } else { 417 } else {
424 dprintk("Already at target state (P%d)\n", 418 dprintk("Already at target state (P%d)\n",
425 next_perf_state); 419 next_perf_state);
426 return 0; 420 goto out;
427 } 421 }
428 } 422 }
429 423
424 trace_power_mark(&it, POWER_PSTATE, next_perf_state);
425
430 switch (data->cpu_feature) { 426 switch (data->cpu_feature) {
431 case SYSTEM_INTEL_MSR_CAPABLE: 427 case SYSTEM_INTEL_MSR_CAPABLE:
432 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 428 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
@@ -440,19 +436,19 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
440 cmd.val = (u32) perf->states[next_perf_state].control; 436 cmd.val = (u32) perf->states[next_perf_state].control;
441 break; 437 break;
442 default: 438 default:
443 return -ENODEV; 439 result = -ENODEV;
440 goto out;
444 } 441 }
445 442
446 cpus_clear(cmd.mask); 443 /* cpufreq holds the hotplug lock, so we are safe from here on */
447
448 if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY) 444 if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
449 cmd.mask = online_policy_cpus; 445 cmd.mask = policy->cpus;
450 else 446 else
451 cpu_set(policy->cpu, cmd.mask); 447 cmd.mask = cpumask_of(policy->cpu);
452 448
453 freqs.old = perf->states[perf->state].core_frequency * 1000; 449 freqs.old = perf->states[perf->state].core_frequency * 1000;
454 freqs.new = data->freq_table[next_state].frequency; 450 freqs.new = data->freq_table[next_state].frequency;
455 for_each_cpu_mask_nr(i, cmd.mask) { 451 for_each_cpu(i, cmd.mask) {
456 freqs.cpu = i; 452 freqs.cpu = i;
457 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 453 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
458 } 454 }
@@ -460,19 +456,21 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
460 drv_write(&cmd); 456 drv_write(&cmd);
461 457
462 if (acpi_pstate_strict) { 458 if (acpi_pstate_strict) {
463 if (!check_freqs(&cmd.mask, freqs.new, data)) { 459 if (!check_freqs(cmd.mask, freqs.new, data)) {
464 dprintk("acpi_cpufreq_target failed (%d)\n", 460 dprintk("acpi_cpufreq_target failed (%d)\n",
465 policy->cpu); 461 policy->cpu);
466 return -EAGAIN; 462 result = -EAGAIN;
463 goto out;
467 } 464 }
468 } 465 }
469 466
470 for_each_cpu_mask_nr(i, cmd.mask) { 467 for_each_cpu(i, cmd.mask) {
471 freqs.cpu = i; 468 freqs.cpu = i;
472 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 469 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
473 } 470 }
474 perf->state = next_perf_state; 471 perf->state = next_perf_state;
475 472
473out:
476 return result; 474 return result;
477} 475}
478 476
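
With cmd.mask now a const struct cpumask * instead of a cpumask_t copied by value, acpi_cpufreq_target() can simply point it at policy->cpus or cpumask_of(policy->cpu), and every failure path funnels through the single out: label. A small sketch of the borrowed-pointer idiom with a hypothetical command structure:

    #include <linux/kernel.h>
    #include <linux/cpumask.h>
    #include <linux/cpufreq.h>

    struct freq_cmd {                       /* hypothetical example type */
            const struct cpumask *mask;     /* CPUs the command applies to */
            u32 val;
    };

    static void setup_cmd(struct freq_cmd *cmd, struct cpufreq_policy *policy,
                          bool shared)
    {
            /* No copy is made: the command just borrows the policy's mask. */
            cmd->mask = shared ? policy->cpus : cpumask_of(policy->cpu);
    }

    static void notify_all(struct freq_cmd *cmd)
    {
            unsigned int i;

            for_each_cpu(i, cmd->mask)
                    pr_debug("would notify CPU %u of value %u\n", i, cmd->val);
    }
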
@@ -513,6 +511,17 @@ acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
513 } 511 }
514} 512}
515 513
514static void free_acpi_perf_data(void)
515{
516 unsigned int i;
517
518 /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */
519 for_each_possible_cpu(i)
520 free_cpumask_var(per_cpu_ptr(acpi_perf_data, i)
521 ->shared_cpu_map);
522 free_percpu(acpi_perf_data);
523}
524
516/* 525/*
517 * acpi_cpufreq_early_init - initialize ACPI P-States library 526 * acpi_cpufreq_early_init - initialize ACPI P-States library
518 * 527 *
@@ -523,6 +532,7 @@ acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
523 */ 532 */
524static int __init acpi_cpufreq_early_init(void) 533static int __init acpi_cpufreq_early_init(void)
525{ 534{
535 unsigned int i;
526 dprintk("acpi_cpufreq_early_init\n"); 536 dprintk("acpi_cpufreq_early_init\n");
527 537
528 acpi_perf_data = alloc_percpu(struct acpi_processor_performance); 538 acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
@@ -530,6 +540,16 @@ static int __init acpi_cpufreq_early_init(void)
530 dprintk("Memory allocation error for acpi_perf_data.\n"); 540 dprintk("Memory allocation error for acpi_perf_data.\n");
531 return -ENOMEM; 541 return -ENOMEM;
532 } 542 }
543 for_each_possible_cpu(i) {
544 if (!alloc_cpumask_var_node(
545 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
546 GFP_KERNEL, cpu_to_node(i))) {
547
548 /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */
549 free_acpi_perf_data();
550 return -ENOMEM;
551 }
552 }
533 553
534 /* Do initialization in ACPI core */ 554 /* Do initialization in ACPI core */
535 acpi_processor_preregister_performance(acpi_perf_data); 555 acpi_processor_preregister_performance(acpi_perf_data);
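
acpi_cpufreq_early_init() now gives every possible CPU its own shared_cpu_map via alloc_cpumask_var_node(), and free_acpi_perf_data() relies on free_cpumask_var() being harmless for a never-allocated (NULL) mask, so a partially failed loop can be unwound unconditionally. A sketch of that allocate-all-or-free-all pattern around a hypothetical per-CPU structure:

    #include <linux/cpumask.h>
    #include <linux/percpu.h>
    #include <linux/topology.h>
    #include <linux/gfp.h>
    #include <linux/errno.h>

    struct perf_state {                     /* hypothetical per-CPU payload */
            cpumask_var_t shared_map;
    };

    static struct perf_state *states;

    static void free_states(void)
    {
            unsigned int cpu;

            /* free_cpumask_var() on a NULL mask is a no-op, and alloc_percpu()
             * zeroes, so this is safe even after a partial allocation. */
            for_each_possible_cpu(cpu)
                    free_cpumask_var(per_cpu_ptr(states, cpu)->shared_map);
            free_percpu(states);
    }

    static int alloc_states(void)
    {
            unsigned int cpu;

            states = alloc_percpu(struct perf_state);
            if (!states)
                    return -ENOMEM;

            for_each_possible_cpu(cpu) {
                    if (!alloc_cpumask_var_node(&per_cpu_ptr(states, cpu)->shared_map,
                                                GFP_KERNEL, cpu_to_node(cpu))) {
                            free_states();
                            return -ENOMEM;
                    }
            }
            return 0;
    }
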
@@ -600,15 +620,15 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
600 */ 620 */
601 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || 621 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
602 policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { 622 policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
603 policy->cpus = perf->shared_cpu_map; 623 cpumask_copy(policy->cpus, perf->shared_cpu_map);
604 } 624 }
605 policy->related_cpus = perf->shared_cpu_map; 625 cpumask_copy(policy->related_cpus, perf->shared_cpu_map);
606 626
607#ifdef CONFIG_SMP 627#ifdef CONFIG_SMP
608 dmi_check_system(sw_any_bug_dmi_table); 628 dmi_check_system(sw_any_bug_dmi_table);
609 if (bios_with_sw_any_bug && cpus_weight(policy->cpus) == 1) { 629 if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) {
610 policy->shared_type = CPUFREQ_SHARED_TYPE_ALL; 630 policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
611 policy->cpus = per_cpu(cpu_core_map, cpu); 631 cpumask_copy(policy->cpus, cpu_core_mask(cpu));
612 } 632 }
613#endif 633#endif
614 634
@@ -791,7 +811,7 @@ static int __init acpi_cpufreq_init(void)
791 811
792 ret = cpufreq_register_driver(&acpi_cpufreq_driver); 812 ret = cpufreq_register_driver(&acpi_cpufreq_driver);
793 if (ret) 813 if (ret)
794 free_percpu(acpi_perf_data); 814 free_acpi_perf_data();
795 815
796 return ret; 816 return ret;
797} 817}
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index b0461856acfb..a4cff5d6e380 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -982,7 +982,7 @@ static int __init longhaul_init(void)
982 case 10: 982 case 10:
983 printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n"); 983 printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n");
984 default: 984 default:
985 ;; 985 ;
986 } 986 }
987 987
988 return -ENODEV; 988 return -ENODEV;
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index b8e05ee4f736..b585e04cbc9e 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
122 return 0; 122 return 0;
123 123
124 /* notifiers */ 124 /* notifiers */
125 for_each_cpu_mask_nr(i, policy->cpus) { 125 for_each_cpu(i, policy->cpus) {
126 freqs.cpu = i; 126 freqs.cpu = i;
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128 } 128 }
@@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software 130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software
131 * Developer's Manual, Volume 3 131 * Developer's Manual, Volume 3
132 */ 132 */
133 for_each_cpu_mask_nr(i, policy->cpus) 133 for_each_cpu(i, policy->cpus)
134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); 134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
135 135
136 /* notifiers */ 136 /* notifiers */
137 for_each_cpu_mask_nr(i, policy->cpus) { 137 for_each_cpu(i, policy->cpus) {
138 freqs.cpu = i; 138 freqs.cpu = i;
139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
140 } 140 }
@@ -160,6 +160,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
160 switch (c->x86_model) { 160 switch (c->x86_model) {
161 case 0x0E: /* Core */ 161 case 0x0E: /* Core */
162 case 0x0F: /* Core Duo */ 162 case 0x0F: /* Core Duo */
163 case 0x16: /* Celeron Core */
163 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; 164 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
164 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PCORE); 165 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PCORE);
165 case 0x0D: /* Pentium M (Dothan) */ 166 case 0x0D: /* Pentium M (Dothan) */
@@ -171,7 +172,9 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
171 } 172 }
172 173
173 if (c->x86 != 0xF) { 174 if (c->x86 != 0xF) {
174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n"); 175 if (!cpu_has(c, X86_FEATURE_EST))
176 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. "
177 "Please send an e-mail to <cpufreq@vger.kernel.org>\n");
175 return 0; 178 return 0;
176 } 179 }
177 180
@@ -200,7 +203,7 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
200 unsigned int i; 203 unsigned int i;
201 204
202#ifdef CONFIG_SMP 205#ifdef CONFIG_SMP
203 policy->cpus = per_cpu(cpu_sibling_map, policy->cpu); 206 cpumask_copy(policy->cpus, &per_cpu(cpu_sibling_map, policy->cpu));
204#endif 207#endif
205 208
206 /* Errata workaround */ 209 /* Errata workaround */
@@ -274,6 +277,7 @@ static struct cpufreq_driver p4clockmod_driver = {
274 .name = "p4-clockmod", 277 .name = "p4-clockmod",
275 .owner = THIS_MODULE, 278 .owner = THIS_MODULE,
276 .attr = p4clockmod_attr, 279 .attr = p4clockmod_attr,
280 .hide_interface = 1,
277}; 281};
278 282
279 283
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 7c7d56b43136..1b446d79a8fd 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -310,6 +310,12 @@ static int powernow_acpi_init(void)
310 goto err0; 310 goto err0;
311 } 311 }
312 312
313 if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
314 GFP_KERNEL)) {
315 retval = -ENOMEM;
316 goto err05;
317 }
318
313 if (acpi_processor_register_performance(acpi_processor_perf, 0)) { 319 if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
314 retval = -EIO; 320 retval = -EIO;
315 goto err1; 321 goto err1;
@@ -412,6 +418,8 @@ static int powernow_acpi_init(void)
412err2: 418err2:
413 acpi_processor_unregister_performance(acpi_processor_perf, 0); 419 acpi_processor_unregister_performance(acpi_processor_perf, 0);
414err1: 420err1:
421 free_cpumask_var(acpi_processor_perf->shared_cpu_map);
422err05:
415 kfree(acpi_processor_perf); 423 kfree(acpi_processor_perf);
416err0: 424err0:
417 printk(KERN_WARNING PFX "ACPI perflib cannot be used on this platform\n"); 425
@@ -652,6 +660,7 @@ static int powernow_cpu_exit (struct cpufreq_policy *policy) {
652#ifdef CONFIG_X86_POWERNOW_K7_ACPI 660#ifdef CONFIG_X86_POWERNOW_K7_ACPI
653 if (acpi_processor_perf) { 661 if (acpi_processor_perf) {
654 acpi_processor_unregister_performance(acpi_processor_perf, 0); 662 acpi_processor_unregister_performance(acpi_processor_perf, 0);
663 free_cpumask_var(acpi_processor_perf->shared_cpu_map);
655 kfree(acpi_processor_perf); 664 kfree(acpi_processor_perf);
656 } 665 }
657#endif 666#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index d3dcd58b87cd..fb039cd345d8 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -115,9 +115,20 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
115 u32 i = 0; 115 u32 i = 0;
116 116
117 if (cpu_family == CPU_HW_PSTATE) { 117 if (cpu_family == CPU_HW_PSTATE) {
118 rdmsr(MSR_PSTATE_STATUS, lo, hi); 118 if (data->currpstate == HW_PSTATE_INVALID) {
119 i = lo & HW_PSTATE_MASK; 119 /* read (initial) hw pstate if not yet set */
120 data->currpstate = i; 120 rdmsr(MSR_PSTATE_STATUS, lo, hi);
121 i = lo & HW_PSTATE_MASK;
122
123 /*
124 * a workaround for family 11h erratum 311 might cause
125 * an out-of-range P-state reading if the core is in P-state 0
126 */
127 if (i >= data->numps)
128 data->currpstate = HW_PSTATE_0;
129 else
130 data->currpstate = i;
131 }
121 return 0; 132 return 0;
122 } 133 }
123 do { 134 do {
@@ -755,7 +766,7 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned
755static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) 766static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
756{ 767{
757 struct cpufreq_frequency_table *powernow_table; 768 struct cpufreq_frequency_table *powernow_table;
758 int ret_val; 769 int ret_val = -ENODEV;
759 770
760 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 771 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
761 dprintk("register performance failed: bad ACPI data\n"); 772 dprintk("register performance failed: bad ACPI data\n");
@@ -804,6 +815,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
804 /* notify BIOS that we exist */ 815 /* notify BIOS that we exist */
805 acpi_processor_notify_smm(THIS_MODULE); 816 acpi_processor_notify_smm(THIS_MODULE);
806 817
818 if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
819 printk(KERN_ERR PFX
820 "unable to alloc powernow_k8_data cpumask\n");
821 ret_val = -ENOMEM;
822 goto err_out_mem;
823 }
824
807 return 0; 825 return 0;
808 826
809err_out_mem: 827err_out_mem:
@@ -815,7 +833,7 @@ err_out:
815 /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */ 833 /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */
816 data->acpi_data.state_count = 0; 834 data->acpi_data.state_count = 0;
817 835
818 return -ENODEV; 836 return ret_val;
819} 837}
820 838
821static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table) 839static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table)
@@ -918,12 +936,28 @@ static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
918{ 936{
919 if (data->acpi_data.state_count) 937 if (data->acpi_data.state_count)
920 acpi_processor_unregister_performance(&data->acpi_data, data->cpu); 938 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
939 free_cpumask_var(data->acpi_data.shared_cpu_map);
940}
941
942static int get_transition_latency(struct powernow_k8_data *data)
943{
944 int max_latency = 0;
945 int i;
946 for (i = 0; i < data->acpi_data.state_count; i++) {
947 int cur_latency = data->acpi_data.states[i].transition_latency
948 + data->acpi_data.states[i].bus_master_latency;
949 if (cur_latency > max_latency)
950 max_latency = cur_latency;
951 }
952 /* value in usecs, needs to be in nanoseconds */
953 return 1000 * max_latency;
921} 954}
922 955
923#else 956#else
924static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; } 957static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; }
925static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; } 958static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; }
926static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; } 959static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; }
960static int get_transition_latency(struct powernow_k8_data *data) { return 0; }
927#endif /* CONFIG_X86_POWERNOW_K8_ACPI */ 961#endif /* CONFIG_X86_POWERNOW_K8_ACPI */
928 962
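
get_transition_latency() above walks every ACPI _PSS state and returns the worst-case transition-plus-bus-master latency, converted from microseconds to the nanoseconds cpufreq expects in cpuinfo.transition_latency. The same walk as a standalone sketch over a stand-in state array:

    /* Sketch: worst-case P-state transition latency, usecs -> nsecs. */
    #include <stdio.h>

    struct pstate_info {                      /* stand-in for ACPI _PSS data */
            unsigned int transition_latency;  /* usec */
            unsigned int bus_master_latency;  /* usec */
    };

    static unsigned int max_transition_latency_ns(const struct pstate_info *s,
                                                  int count)
    {
            unsigned int max = 0;
            int i;

            for (i = 0; i < count; i++) {
                    unsigned int cur = s[i].transition_latency +
                                       s[i].bus_master_latency;
                    if (cur > max)
                            max = cur;
            }
            return max * 1000;                /* cpufreq wants nanoseconds */
    }

    int main(void)
    {
            struct pstate_info states[] = { { 10, 10 }, { 100, 10 }, { 40, 5 } };

            printf("latency: %u ns\n", max_transition_latency_ns(states, 3));
            return 0;
    }
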
929/* Take a frequency, and issue the fid/vid transition command */ 963/* Take a frequency, and issue the fid/vid transition command */
@@ -1121,8 +1155,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1121 } 1155 }
1122 1156
1123 data->cpu = pol->cpu; 1157 data->cpu = pol->cpu;
1158 data->currpstate = HW_PSTATE_INVALID;
1124 1159
1125 if (powernow_k8_cpu_init_acpi(data)) { 1160 rc = powernow_k8_cpu_init_acpi(data);
1161 if (rc) {
1126 /* 1162 /*
1127 * Use the PSB BIOS structure. This is only available on 1163
1128 * an UP version, and is deprecated by AMD. 1164 * an UP version, and is deprecated by AMD.
@@ -1140,22 +1176,25 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1140 "ACPI maintainers and complain to your BIOS " 1176 "ACPI maintainers and complain to your BIOS "
1141 "vendor.\n"); 1177 "vendor.\n");
1142#endif 1178#endif
1143 kfree(data); 1179 goto err_out;
1144 return -ENODEV;
1145 } 1180 }
1146 if (pol->cpu != 0) { 1181 if (pol->cpu != 0) {
1147 printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " 1182 printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
1148 "CPU other than CPU0. Complain to your BIOS " 1183 "CPU other than CPU0. Complain to your BIOS "
1149 "vendor.\n"); 1184 "vendor.\n");
1150 kfree(data); 1185 goto err_out;
1151 return -ENODEV;
1152 } 1186 }
1153 rc = find_psb_table(data); 1187 rc = find_psb_table(data);
1154 if (rc) { 1188 if (rc) {
1155 kfree(data); 1189 goto err_out;
1156 return -ENODEV;
1157 } 1190 }
1158 } 1191 /* Take a crude guess here.
1192 * That guess was in microseconds, so multiply by 1000 */
1193 pol->cpuinfo.transition_latency = (
1194 ((data->rvo + 8) * data->vstable * VST_UNITS_20US) +
1195 ((1 << data->irt) * 30)) * 1000;
1196 } else /* ACPI _PSS objects available */
1197 pol->cpuinfo.transition_latency = get_transition_latency(data);
1159 1198
1160 /* only run on specific CPU from here on */ 1199 /* only run on specific CPU from here on */
1161 oldmask = current->cpus_allowed; 1200 oldmask = current->cpus_allowed;
@@ -1181,15 +1220,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1181 set_cpus_allowed_ptr(current, &oldmask); 1220 set_cpus_allowed_ptr(current, &oldmask);
1182 1221
1183 if (cpu_family == CPU_HW_PSTATE) 1222 if (cpu_family == CPU_HW_PSTATE)
1184 pol->cpus = cpumask_of_cpu(pol->cpu); 1223 cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
1185 else 1224 else
1186 pol->cpus = per_cpu(cpu_core_map, pol->cpu); 1225 cpumask_copy(pol->cpus, &per_cpu(cpu_core_map, pol->cpu));
1187 data->available_cores = &(pol->cpus); 1226 data->available_cores = pol->cpus;
1188
1189 /* Take a crude guess here.
1190 * That guess was in microseconds, so multiply with 1000 */
1191 pol->cpuinfo.transition_latency = (((data->rvo + 8) * data->vstable * VST_UNITS_20US)
1192 + (3 * (1 << data->irt) * 10)) * 1000;
1193 1227
1194 if (cpu_family == CPU_HW_PSTATE) 1228 if (cpu_family == CPU_HW_PSTATE)
1195 pol->cur = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); 1229 pol->cur = find_khz_freq_from_pstate(data->powernow_table, data->currpstate);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index ab48cfed4d96..8ecc75b6c7c3 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,6 +5,19 @@
5 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
6 */ 6 */
7 7
8
9enum pstate {
10 HW_PSTATE_INVALID = 0xff,
11 HW_PSTATE_0 = 0,
12 HW_PSTATE_1 = 1,
13 HW_PSTATE_2 = 2,
14 HW_PSTATE_3 = 3,
15 HW_PSTATE_4 = 4,
16 HW_PSTATE_5 = 5,
17 HW_PSTATE_6 = 6,
18 HW_PSTATE_7 = 7,
19};
20
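
HW_PSTATE_INVALID gives currpstate a "not read yet" sentinel, so query_current_values_with_pending_wait() touches MSR_PSTATE_STATUS only once and can clamp an erratum-311 out-of-range value to HW_PSTATE_0 on that first read. A minimal sketch of the lazy-read-with-sentinel idiom, using hypothetical names rather than the driver's:

    enum pstate {
            HW_PSTATE_INVALID = 0xff,    /* "not read yet" sentinel */
            HW_PSTATE_0 = 0,
    };

    struct pstate_cache {                /* hypothetical example type */
            enum pstate curr;
            unsigned int numps;          /* number of valid P-states */
    };

    static enum pstate current_pstate(struct pstate_cache *c,
                                      unsigned int hw_value)
    {
            if (c->curr == HW_PSTATE_INVALID) {
                    /* Clamp an out-of-range hardware value (erratum 311)
                     * instead of trusting it. */
                    c->curr = hw_value >= c->numps ? HW_PSTATE_0
                                                   : (enum pstate)hw_value;
            }
            return c->curr;
    }
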
8struct powernow_k8_data { 21struct powernow_k8_data {
9 unsigned int cpu; 22 unsigned int cpu;
10 23
@@ -23,7 +36,9 @@ struct powernow_k8_data {
23 u32 exttype; /* extended interface = 1 */ 36 u32 exttype; /* extended interface = 1 */
24 37
25 /* keep track of the current fid / vid or pstate */ 38 /* keep track of the current fid / vid or pstate */
26 u32 currvid, currfid, currpstate; 39 u32 currvid;
40 u32 currfid;
41 enum pstate currpstate;
27 42
28 /* the powernow_table includes all frequency and vid/fid pairings: 43 /* the powernow_table includes all frequency and vid/fid pairings:
29 * fid are the lower 8 bits of the index, vid are the upper 8 bits. 44 * fid are the lower 8 bits of the index, vid are the upper 8 bits.
@@ -38,7 +53,7 @@ struct powernow_k8_data {
38 /* we need to keep track of associated cores, but let cpufreq 53 /* we need to keep track of associated cores, but let cpufreq
39 * handle hotplug events - so just point at cpufreq pol->cpus 54 * handle hotplug events - so just point at cpufreq pol->cpus
40 * structure */ 55 * structure */
41 cpumask_t *available_cores; 56 struct cpumask *available_cores;
42}; 57};
43 58
44 59
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 3b5f06423e77..f08998278a3a 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -458,13 +458,6 @@ static int centrino_verify (struct cpufreq_policy *policy)
458 * 458 *
459 * Sets a new CPUFreq policy. 459 * Sets a new CPUFreq policy.
460 */ 460 */
461struct allmasks {
462 cpumask_t online_policy_cpus;
463 cpumask_t saved_mask;
464 cpumask_t set_mask;
465 cpumask_t covered_cpus;
466};
467
468static int centrino_target (struct cpufreq_policy *policy, 461static int centrino_target (struct cpufreq_policy *policy,
469 unsigned int target_freq, 462 unsigned int target_freq,
470 unsigned int relation) 463 unsigned int relation)
@@ -474,14 +467,15 @@ static int centrino_target (struct cpufreq_policy *policy,
474 struct cpufreq_freqs freqs; 467 struct cpufreq_freqs freqs;
475 int retval = 0; 468 int retval = 0;
476 unsigned int j, k, first_cpu, tmp; 469 unsigned int j, k, first_cpu, tmp;
477 CPUMASK_ALLOC(allmasks); 470 cpumask_var_t saved_mask, covered_cpus;
478 CPUMASK_PTR(online_policy_cpus, allmasks);
479 CPUMASK_PTR(saved_mask, allmasks);
480 CPUMASK_PTR(set_mask, allmasks);
481 CPUMASK_PTR(covered_cpus, allmasks);
482 471
483 if (unlikely(allmasks == NULL)) 472 if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL)))
473 return -ENOMEM;
474 if (unlikely(!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))) {
475 free_cpumask_var(saved_mask);
484 return -ENOMEM; 476 return -ENOMEM;
477 }
478 cpumask_copy(saved_mask, &current->cpus_allowed);
485 479
486 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { 480 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
487 retval = -ENODEV; 481 retval = -ENODEV;
@@ -497,30 +491,26 @@ static int centrino_target (struct cpufreq_policy *policy,
497 goto out; 491 goto out;
498 } 492 }
499 493
500#ifdef CONFIG_HOTPLUG_CPU
501 /* cpufreq holds the hotplug lock, so we are safe from here on */
502 cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus);
503#else
504 *online_policy_cpus = policy->cpus;
505#endif
506
507 *saved_mask = current->cpus_allowed;
508 first_cpu = 1; 494 first_cpu = 1;
509 cpus_clear(*covered_cpus); 495 for_each_cpu(j, policy->cpus) {
510 for_each_cpu_mask_nr(j, *online_policy_cpus) { 496 const struct cpumask *mask;
497
498 /* cpufreq holds the hotplug lock, so we are safe here */
499 if (!cpu_online(j))
500 continue;
501
511 /* 502 /*
512 * Support for SMP systems. 503 * Support for SMP systems.
513 * Make sure we are running on CPU that wants to change freq 504 * Make sure we are running on CPU that wants to change freq
514 */ 505 */
515 cpus_clear(*set_mask);
516 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) 506 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
517 cpus_or(*set_mask, *set_mask, *online_policy_cpus); 507 mask = policy->cpus;
518 else 508 else
519 cpu_set(j, *set_mask); 509 mask = cpumask_of(j);
520 510
521 set_cpus_allowed_ptr(current, set_mask); 511 set_cpus_allowed_ptr(current, mask);
522 preempt_disable(); 512 preempt_disable();
523 if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) { 513 if (unlikely(!cpu_isset(smp_processor_id(), *mask))) {
524 dprintk("couldn't limit to CPUs in this domain\n"); 514 dprintk("couldn't limit to CPUs in this domain\n");
525 retval = -EAGAIN; 515 retval = -EAGAIN;
526 if (first_cpu) { 516 if (first_cpu) {
@@ -548,7 +538,9 @@ static int centrino_target (struct cpufreq_policy *policy,
548 dprintk("target=%dkHz old=%d new=%d msr=%04x\n", 538 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
549 target_freq, freqs.old, freqs.new, msr); 539 target_freq, freqs.old, freqs.new, msr);
550 540
551 for_each_cpu_mask_nr(k, *online_policy_cpus) { 541 for_each_cpu(k, policy->cpus) {
542 if (!cpu_online(k))
543 continue;
552 freqs.cpu = k; 544 freqs.cpu = k;
553 cpufreq_notify_transition(&freqs, 545 cpufreq_notify_transition(&freqs,
554 CPUFREQ_PRECHANGE); 546 CPUFREQ_PRECHANGE);
@@ -571,7 +563,9 @@ static int centrino_target (struct cpufreq_policy *policy,
571 preempt_enable(); 563 preempt_enable();
572 } 564 }
573 565
574 for_each_cpu_mask_nr(k, *online_policy_cpus) { 566 for_each_cpu(k, policy->cpus) {
567 if (!cpu_online(k))
568 continue;
575 freqs.cpu = k; 569 freqs.cpu = k;
576 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 570 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
577 } 571 }
@@ -584,18 +578,17 @@ static int centrino_target (struct cpufreq_policy *policy,
584 * Best effort undo.. 578 * Best effort undo..
585 */ 579 */
586 580
587 if (!cpus_empty(*covered_cpus)) 581 for_each_cpu_mask_nr(j, *covered_cpus) {
588 for_each_cpu_mask_nr(j, *covered_cpus) { 582 set_cpus_allowed_ptr(current, &cpumask_of_cpu(j));
589 set_cpus_allowed_ptr(current, 583 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
590 &cpumask_of_cpu(j)); 584 }
591 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
592 }
593 585
594 tmp = freqs.new; 586 tmp = freqs.new;
595 freqs.new = freqs.old; 587 freqs.new = freqs.old;
596 freqs.old = tmp; 588 freqs.old = tmp;
597 for_each_cpu_mask_nr(j, *online_policy_cpus) { 589 for_each_cpu(j, policy->cpus) {
598 freqs.cpu = j; 590 if (!cpu_online(j))
591 continue;
599 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 592 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
600 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 593 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
601 } 594 }
@@ -608,7 +601,8 @@ migrate_end:
608 preempt_enable(); 601 preempt_enable();
609 set_cpus_allowed_ptr(current, saved_mask); 602 set_cpus_allowed_ptr(current, saved_mask);
610out: 603out:
611 CPUMASK_FREE(allmasks); 604 free_cpumask_var(saved_mask);
605 free_cpumask_var(covered_cpus);
612 return retval; 606 return retval;
613} 607}
614 608
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 04d0376b64b0..dedc1e98f168 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -229,7 +229,7 @@ static unsigned int speedstep_detect_chipset (void)
229 return 0; 229 return 0;
230} 230}
231 231
232static unsigned int _speedstep_get(const cpumask_t *cpus) 232static unsigned int _speedstep_get(const struct cpumask *cpus)
233{ 233{
234 unsigned int speed; 234 unsigned int speed;
235 cpumask_t cpus_allowed; 235 cpumask_t cpus_allowed;
@@ -244,7 +244,7 @@ static unsigned int _speedstep_get(const cpumask_t *cpus)
244 244
245static unsigned int speedstep_get(unsigned int cpu) 245static unsigned int speedstep_get(unsigned int cpu)
246{ 246{
247 return _speedstep_get(&cpumask_of_cpu(cpu)); 247 return _speedstep_get(cpumask_of(cpu));
248} 248}
249 249
250/** 250/**
@@ -267,7 +267,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
267 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate)) 267 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate))
268 return -EINVAL; 268 return -EINVAL;
269 269
270 freqs.old = _speedstep_get(&policy->cpus); 270 freqs.old = _speedstep_get(policy->cpus);
271 freqs.new = speedstep_freqs[newstate].frequency; 271 freqs.new = speedstep_freqs[newstate].frequency;
272 freqs.cpu = policy->cpu; 272 freqs.cpu = policy->cpu;
273 273
@@ -279,20 +279,20 @@ static int speedstep_target (struct cpufreq_policy *policy,
279 279
280 cpus_allowed = current->cpus_allowed; 280 cpus_allowed = current->cpus_allowed;
281 281
282 for_each_cpu_mask_nr(i, policy->cpus) { 282 for_each_cpu(i, policy->cpus) {
283 freqs.cpu = i; 283 freqs.cpu = i;
284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
285 } 285 }
286 286
287 /* switch to physical CPU where state is to be changed */ 287 /* switch to physical CPU where state is to be changed */
288 set_cpus_allowed_ptr(current, &policy->cpus); 288 set_cpus_allowed_ptr(current, policy->cpus);
289 289
290 speedstep_set_state(newstate); 290 speedstep_set_state(newstate);
291 291
292 /* allow to be run on all CPUs */ 292 /* allow to be run on all CPUs */
293 set_cpus_allowed_ptr(current, &cpus_allowed); 293 set_cpus_allowed_ptr(current, &cpus_allowed);
294 294
295 for_each_cpu_mask_nr(i, policy->cpus) { 295 for_each_cpu(i, policy->cpus) {
296 freqs.cpu = i; 296 freqs.cpu = i;
297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
298 } 298 }
@@ -322,11 +322,11 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
322 322
323 /* only run on CPU to be set, or on its sibling */ 323 /* only run on CPU to be set, or on its sibling */
324#ifdef CONFIG_SMP 324#ifdef CONFIG_SMP
325 policy->cpus = per_cpu(cpu_sibling_map, policy->cpu); 325 cpumask_copy(policy->cpus, &per_cpu(cpu_sibling_map, policy->cpu));
326#endif 326#endif
327 327
328 cpus_allowed = current->cpus_allowed; 328 cpus_allowed = current->cpus_allowed;
329 set_cpus_allowed_ptr(current, &policy->cpus); 329 set_cpus_allowed_ptr(current, policy->cpus);
330 330
331 /* detect low and high frequency and transition latency */ 331 /* detect low and high frequency and transition latency */
332 result = speedstep_get_freqs(speedstep_processor, 332 result = speedstep_get_freqs(speedstep_processor,
@@ -339,7 +339,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
339 return result; 339 return result;
340 340
341 /* get current speed setting */ 341 /* get current speed setting */
342 speed = _speedstep_get(&policy->cpus); 342 speed = _speedstep_get(policy->cpus);
343 if (!speed) 343 if (!speed)
344 return -EIO; 344 return -EIO;
345 345
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index 98d4fdb7dc04..cdac7d62369b 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -139,6 +139,15 @@ static unsigned int pentium_core_get_frequency(void)
139 case 3: 139 case 3:
140 fsb = 166667; 140 fsb = 166667;
141 break; 141 break;
142 case 2:
143 fsb = 200000;
144 break;
145 case 0:
146 fsb = 266667;
147 break;
148 case 4:
149 fsb = 333333;
150 break;
142 default: 151 default:
143 printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value"); 152 printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value");
144 } 153 }
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
new file mode 100644
index 000000000000..fb5b86af0b01
--- /dev/null
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -0,0 +1,58 @@
1/*
2 * Common hypervisor code
3 *
4 * Copyright (C) 2008, VMware, Inc.
5 * Author : Alok N Kataria <akataria@vmware.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
15 * NON INFRINGEMENT. See the GNU General Public License for more
16 * details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 */
23
24#include <asm/processor.h>
25#include <asm/vmware.h>
26#include <asm/hypervisor.h>
27
28static inline void __cpuinit
29detect_hypervisor_vendor(struct cpuinfo_x86 *c)
30{
31 if (vmware_platform()) {
32 c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
33 } else {
34 c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
35 }
36}
37
38unsigned long get_hypervisor_tsc_freq(void)
39{
40 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
41 return vmware_get_tsc_khz();
42 return 0;
43}
44
45static inline void __cpuinit
46hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
47{
48 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) {
49 vmware_set_feature_bits(c);
50 return;
51 }
52}
53
54void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
55{
56 detect_hypervisor_vendor(c);
57 hypervisor_set_feature_bits(c);
58}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index cce0b6118d55..1f137a87d4bd 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -11,7 +11,6 @@
11#include <asm/pgtable.h> 11#include <asm/pgtable.h>
12#include <asm/msr.h> 12#include <asm/msr.h>
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14#include <asm/ptrace.h>
15#include <asm/ds.h> 14#include <asm/ds.h>
16#include <asm/bugs.h> 15#include <asm/bugs.h>
17 16
@@ -25,11 +24,24 @@
25#ifdef CONFIG_X86_LOCAL_APIC 24#ifdef CONFIG_X86_LOCAL_APIC
26#include <asm/mpspec.h> 25#include <asm/mpspec.h>
27#include <asm/apic.h> 26#include <asm/apic.h>
28#include <mach_apic.h> 27#include <asm/genapic.h>
29#endif 28#endif
30 29
31static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 30static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
32{ 31{
32 /* Unmask CPUID levels if masked: */
33 if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
34 u64 misc_enable;
35
36 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
37
38 if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
39 misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
40 wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
41 c->cpuid_level = cpuid_eax(0);
42 }
43 }
44
33 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 45 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
34 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 46 (c->x86 == 0x6 && c->x86_model >= 0x0e))
35 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 47 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
@@ -41,6 +53,28 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
41 if (c->x86 == 15 && c->x86_cache_alignment == 64) 53 if (c->x86 == 15 && c->x86_cache_alignment == 64)
42 c->x86_cache_alignment = 128; 54 c->x86_cache_alignment = 128;
43#endif 55#endif
56
57 /*
58 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
59 * with P/T states and does not stop in deep C-states
60 */
61 if (c->x86_power & (1 << 8)) {
62 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
63 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
64 }
65
66 /*
67 * There is a known erratum on Pentium III and Core Solo
68 * and Core Duo CPUs.
69 * " Page with PAT set to WC while associated MTRR is UC
70 * may consolidate to UC "
71 * Because of this erratum, it is better to stick with
72 * setting WC in MTRR rather than using PAT on these CPUs.
73 *
74 * Enable PAT WC only on P4, Core 2 or later CPUs.
75 */
76 if (c->x86 == 6 && c->x86_model < 15)
77 clear_cpu_cap(c, X86_FEATURE_PAT);
44} 78}
45 79
46#ifdef CONFIG_X86_32 80#ifdef CONFIG_X86_32
@@ -242,6 +276,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
242 276
243 intel_workarounds(c); 277 intel_workarounds(c);
244 278
279 /*
280 * Detect the extended topology information if available. This
281 * will reinitialise the initial_apicid which will be used
282 * in init_intel_cacheinfo()
283 */
284 detect_extended_topology(c);
285
245 l2 = init_intel_cacheinfo(c); 286 l2 = init_intel_cacheinfo(c);
246 if (c->cpuid_level > 9) { 287 if (c->cpuid_level > 9) {
247 unsigned eax = cpuid_eax(10); 288 unsigned eax = cpuid_eax(10);
@@ -262,6 +303,9 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
262 ds_init_intel(c); 303 ds_init_intel(c);
263 } 304 }
264 305
306 if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
307 set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);
308
265#ifdef CONFIG_X86_64 309#ifdef CONFIG_X86_64
266 if (c->x86 == 15) 310 if (c->x86 == 15)
267 c->x86_cache_alignment = c->x86_clflush_size * 2; 311 c->x86_cache_alignment = c->x86_clflush_size * 2;
@@ -307,13 +351,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
307 set_cpu_cap(c, X86_FEATURE_P4); 351 set_cpu_cap(c, X86_FEATURE_P4);
308 if (c->x86 == 6) 352 if (c->x86 == 6)
309 set_cpu_cap(c, X86_FEATURE_P3); 353 set_cpu_cap(c, X86_FEATURE_P3);
310
311 if (cpu_has_bts)
312 ptrace_bts_init_intel(c);
313
314#endif 354#endif
315 355
316 detect_extended_topology(c);
317 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { 356 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
318 /* 357 /*
319 * let's use the legacy cpuid vector 0x1 and 0x4 for topology 358 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 3f46afbb1cf1..7293508d8f5c 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -36,8 +36,11 @@ static struct _cache_table cache_table[] __cpuinitdata =
36{ 36{
37 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ 37 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
38 { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ 38 { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
39 { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */
39 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ 40 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
40 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ 41 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
42 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
43 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
41 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 44 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
42 { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 45 { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */
43 { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 46 { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */
@@ -85,6 +88,18 @@ static struct _cache_table cache_table[] __cpuinitdata =
85 { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */ 88 { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */
86 { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ 89 { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
87 { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */ 90 { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */
91 { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */
92 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */
93 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */
94 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */
95 { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */
96 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
97 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */
98 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
99 { 0xde, LVL_3, 8192 }, /* 12-way set assoc, 64 byte line size */
100 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */
101 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */
102 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
88 { 0x00, 0, 0} 103 { 0x00, 0, 0}
89}; 104};
90 105
@@ -132,7 +147,16 @@ struct _cpuid4_info {
132 union _cpuid4_leaf_ecx ecx; 147 union _cpuid4_leaf_ecx ecx;
133 unsigned long size; 148 unsigned long size;
134 unsigned long can_disable; 149 unsigned long can_disable;
135 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ 150 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
151};
152
153/* subset of above _cpuid4_info w/o shared_cpu_map */
154struct _cpuid4_info_regs {
155 union _cpuid4_leaf_eax eax;
156 union _cpuid4_leaf_ebx ebx;
157 union _cpuid4_leaf_ecx ecx;
158 unsigned long size;
159 unsigned long can_disable;
136}; 160};
137 161
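
shared_cpu_map is now raw DECLARE_BITMAP() storage converted with to_cpumask() at each use, and _cpuid4_info_regs is the bitmap-free prefix of _cpuid4_info that can be filled without touching the mask. A short sketch of the bitmap-plus-to_cpumask() idiom with a hypothetical leaf structure:

    #include <linux/types.h>
    #include <linux/cpumask.h>

    struct cache_leaf {                              /* hypothetical type */
            unsigned long size;
            DECLARE_BITMAP(shared_cpu_map, NR_CPUS); /* raw bitmap storage */
    };

    static void mark_sharer(struct cache_leaf *leaf, unsigned int cpu)
    {
            /* to_cpumask() lets the cpumask_* API operate on the bitmap. */
            cpumask_set_cpu(cpu, to_cpumask(leaf->shared_cpu_map));
    }

    static bool shares_cache(struct cache_leaf *leaf, unsigned int cpu)
    {
            return cpumask_test_cpu(cpu, to_cpumask(leaf->shared_cpu_map));
    }
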
138#ifdef CONFIG_PCI 162#ifdef CONFIG_PCI
@@ -263,7 +287,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
263} 287}
264 288
265static void __cpuinit 289static void __cpuinit
266amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf) 290amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
267{ 291{
268 if (index < 3) 292 if (index < 3)
269 return; 293 return;
@@ -271,7 +295,8 @@ amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
271} 295}
272 296
273static int 297static int
274__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) 298__cpuinit cpuid4_cache_lookup_regs(int index,
299 struct _cpuid4_info_regs *this_leaf)
275{ 300{
276 union _cpuid4_leaf_eax eax; 301 union _cpuid4_leaf_eax eax;
277 union _cpuid4_leaf_ebx ebx; 302 union _cpuid4_leaf_ebx ebx;
@@ -299,6 +324,15 @@ __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
299 return 0; 324 return 0;
300} 325}
301 326
327static int
328__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
329{
330 struct _cpuid4_info_regs *leaf_regs =
331 (struct _cpuid4_info_regs *)this_leaf;
332
333 return cpuid4_cache_lookup_regs(index, leaf_regs);
334}
335
302static int __cpuinit find_num_cache_leaves(void) 336static int __cpuinit find_num_cache_leaves(void)
303{ 337{
304 unsigned int eax, ebx, ecx, edx; 338 unsigned int eax, ebx, ecx, edx;
@@ -338,11 +372,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
338 * parameters cpuid leaf to find the cache details 372 * parameters cpuid leaf to find the cache details
339 */ 373 */
340 for (i = 0; i < num_cache_leaves; i++) { 374 for (i = 0; i < num_cache_leaves; i++) {
341 struct _cpuid4_info this_leaf; 375 struct _cpuid4_info_regs this_leaf;
342
343 int retval; 376 int retval;
344 377
345 retval = cpuid4_cache_lookup(i, &this_leaf); 378 retval = cpuid4_cache_lookup_regs(i, &this_leaf);
346 if (retval >= 0) { 379 if (retval >= 0) {
347 switch(this_leaf.eax.split.level) { 380 switch(this_leaf.eax.split.level) {
348 case 1: 381 case 1:
@@ -491,17 +524,20 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
491 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; 524 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
492 525
493 if (num_threads_sharing == 1) 526 if (num_threads_sharing == 1)
494 cpu_set(cpu, this_leaf->shared_cpu_map); 527 cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
495 else { 528 else {
496 index_msb = get_count_order(num_threads_sharing); 529 index_msb = get_count_order(num_threads_sharing);
497 530
498 for_each_online_cpu(i) { 531 for_each_online_cpu(i) {
499 if (cpu_data(i).apicid >> index_msb == 532 if (cpu_data(i).apicid >> index_msb ==
500 c->apicid >> index_msb) { 533 c->apicid >> index_msb) {
501 cpu_set(i, this_leaf->shared_cpu_map); 534 cpumask_set_cpu(i,
535 to_cpumask(this_leaf->shared_cpu_map));
502 if (i != cpu && per_cpu(cpuid4_info, i)) { 536 if (i != cpu && per_cpu(cpuid4_info, i)) {
503 sibling_leaf = CPUID4_INFO_IDX(i, index); 537 sibling_leaf =
504 cpu_set(cpu, sibling_leaf->shared_cpu_map); 538 CPUID4_INFO_IDX(i, index);
539 cpumask_set_cpu(cpu, to_cpumask(
540 sibling_leaf->shared_cpu_map));
505 } 541 }
506 } 542 }
507 } 543 }
@@ -513,9 +549,10 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
513 int sibling; 549 int sibling;
514 550
515 this_leaf = CPUID4_INFO_IDX(cpu, index); 551 this_leaf = CPUID4_INFO_IDX(cpu, index);
516 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { 552 for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
517 sibling_leaf = CPUID4_INFO_IDX(sibling, index); 553 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
518 cpu_clear(cpu, sibling_leaf->shared_cpu_map); 554 cpumask_clear_cpu(cpu,
555 to_cpumask(sibling_leaf->shared_cpu_map));
519 } 556 }
520} 557}
521#else 558#else
@@ -534,31 +571,16 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
534 per_cpu(cpuid4_info, cpu) = NULL; 571 per_cpu(cpuid4_info, cpu) = NULL;
535} 572}
536 573
537static int __cpuinit detect_cache_attributes(unsigned int cpu) 574static void __cpuinit get_cpu_leaves(void *_retval)
538{ 575{
539 struct _cpuid4_info *this_leaf; 576 int j, *retval = _retval, cpu = smp_processor_id();
540 unsigned long j;
541 int retval;
542 cpumask_t oldmask;
543
544 if (num_cache_leaves == 0)
545 return -ENOENT;
546
547 per_cpu(cpuid4_info, cpu) = kzalloc(
548 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
549 if (per_cpu(cpuid4_info, cpu) == NULL)
550 return -ENOMEM;
551
552 oldmask = current->cpus_allowed;
553 retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
554 if (retval)
555 goto out;
556 577
557 /* Do cpuid and store the results */ 578 /* Do cpuid and store the results */
558 for (j = 0; j < num_cache_leaves; j++) { 579 for (j = 0; j < num_cache_leaves; j++) {
580 struct _cpuid4_info *this_leaf;
559 this_leaf = CPUID4_INFO_IDX(cpu, j); 581 this_leaf = CPUID4_INFO_IDX(cpu, j);
560 retval = cpuid4_cache_lookup(j, this_leaf); 582 *retval = cpuid4_cache_lookup(j, this_leaf);
561 if (unlikely(retval < 0)) { 583 if (unlikely(*retval < 0)) {
562 int i; 584 int i;
563 585
564 for (i = 0; i < j; i++) 586 for (i = 0; i < j; i++)
@@ -567,9 +589,21 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
567 } 589 }
568 cache_shared_cpu_map_setup(cpu, j); 590 cache_shared_cpu_map_setup(cpu, j);
569 } 591 }
570 set_cpus_allowed_ptr(current, &oldmask); 592}
593
594static int __cpuinit detect_cache_attributes(unsigned int cpu)
595{
596 int retval;
597
598 if (num_cache_leaves == 0)
599 return -ENOENT;
571 600
572out: 601 per_cpu(cpuid4_info, cpu) = kzalloc(
602 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
603 if (per_cpu(cpuid4_info, cpu) == NULL)
604 return -ENOMEM;
605
606 smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
573 if (retval) { 607 if (retval) {
574 kfree(per_cpu(cpuid4_info, cpu)); 608 kfree(per_cpu(cpuid4_info, cpu));
575 per_cpu(cpuid4_info, cpu) = NULL; 609 per_cpu(cpuid4_info, cpu) = NULL;
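
detect_cache_attributes() no longer migrates current to the target CPU; get_cpu_leaves() now runs there via smp_call_function_single() and reports back through the pointer it was handed. A minimal sketch of that call-and-collect pattern with a hypothetical callback (wait=1, so the result is valid as soon as the call returns):

    #include <linux/smp.h>
    #include <linux/errno.h>
    #include <asm/processor.h>

    struct cpuid_result {                /* hypothetical result holder */
            unsigned int eax;
            int status;
    };

    static void read_leaf_on_cpu(void *_res)
    {
            struct cpuid_result *res = _res;

            /* Runs on the CPU passed to smp_call_function_single(), in IPI
             * context, so no sleeping allocations belong here. */
            res->eax = cpuid_eax(4);
            res->status = 0;
    }

    static int query_cache_leaf(unsigned int cpu, struct cpuid_result *res)
    {
            int err;

            res->status = -ENODEV;
            err = smp_call_function_single(cpu, read_leaf_on_cpu, res, 1);
            return err ? err : res->status;
    }
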
@@ -623,11 +657,12 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
623 int n = 0; 657 int n = 0;
624 658
625 if (len > 1) { 659 if (len > 1) {
626 cpumask_t *mask = &this_leaf->shared_cpu_map; 660 const struct cpumask *mask;
627 661
662 mask = to_cpumask(this_leaf->shared_cpu_map);
628 n = type? 663 n = type?
629 cpulist_scnprintf(buf, len-2, *mask): 664 cpulist_scnprintf(buf, len-2, mask) :
630 cpumask_scnprintf(buf, len-2, *mask); 665 cpumask_scnprintf(buf, len-2, mask);
631 buf[n++] = '\n'; 666 buf[n++] = '\n';
632 buf[n] = '\0'; 667 buf[n] = '\0';
633 } 668 }
@@ -644,20 +679,17 @@ static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf)
644 return show_shared_cpu_map_func(leaf, 1, buf); 679 return show_shared_cpu_map_func(leaf, 1, buf);
645} 680}
646 681
647static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { 682static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
648 switch(this_leaf->eax.split.type) { 683{
649 case CACHE_TYPE_DATA: 684 switch (this_leaf->eax.split.type) {
685 case CACHE_TYPE_DATA:
650 return sprintf(buf, "Data\n"); 686 return sprintf(buf, "Data\n");
651 break; 687 case CACHE_TYPE_INST:
652 case CACHE_TYPE_INST:
653 return sprintf(buf, "Instruction\n"); 688 return sprintf(buf, "Instruction\n");
654 break; 689 case CACHE_TYPE_UNIFIED:
655 case CACHE_TYPE_UNIFIED:
656 return sprintf(buf, "Unified\n"); 690 return sprintf(buf, "Unified\n");
657 break; 691 default:
658 default:
659 return sprintf(buf, "Unknown\n"); 692 return sprintf(buf, "Unknown\n");
660 break;
661 } 693 }
662} 694}
663 695
@@ -690,7 +722,8 @@ static struct pci_dev *get_k8_northbridge(int node)
690 722
691static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) 723static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
692{ 724{
693 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); 725 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
726 int node = cpu_to_node(cpumask_first(mask));
694 struct pci_dev *dev = NULL; 727 struct pci_dev *dev = NULL;
695 ssize_t ret = 0; 728 ssize_t ret = 0;
696 int i; 729 int i;
@@ -724,7 +757,8 @@ static ssize_t
724store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, 757store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
725 size_t count) 758 size_t count)
726{ 759{
727 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); 760 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
761 int node = cpu_to_node(cpumask_first(mask));
728 struct pci_dev *dev = NULL; 762 struct pci_dev *dev = NULL;
729 unsigned int ret, index, val; 763 unsigned int ret, index, val;
730 764
@@ -869,7 +903,7 @@ err_out:
869 return -ENOMEM; 903 return -ENOMEM;
870} 904}
871 905
872static cpumask_t cache_dev_map = CPU_MASK_NONE; 906static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
873 907
874/* Add/Remove cache interface for CPU device */ 908/* Add/Remove cache interface for CPU device */
875static int __cpuinit cache_add_dev(struct sys_device * sys_dev) 909static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
@@ -909,7 +943,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
909 } 943 }
910 kobject_uevent(&(this_object->kobj), KOBJ_ADD); 944 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
911 } 945 }
912 cpu_set(cpu, cache_dev_map); 946 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
913 947
914 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 948 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
915 return 0; 949 return 0;
@@ -922,9 +956,9 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
922 956
923 if (per_cpu(cpuid4_info, cpu) == NULL) 957 if (per_cpu(cpuid4_info, cpu) == NULL)
924 return; 958 return;
925 if (!cpu_isset(cpu, cache_dev_map)) 959 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
926 return; 960 return;
927 cpu_clear(cpu, cache_dev_map); 961 cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
928 962
929 for (i = 0; i < num_cache_leaves; i++) 963 for (i = 0; i < num_cache_leaves; i++)
930 kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); 964 kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
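Note: the intel_cacheinfo.c hunks above swap the static cpumask_t cache_dev_map for a raw bitmap wrapped by to_cpumask(), and hand cpulist_scnprintf()/cpumask_scnprintf() a const struct cpumask * instead of a cpumask_t passed by value. A minimal sketch of that bitmap idiom, using hypothetical names and assuming the cpumask helpers exactly as they appear in the hunks:

    #include <linux/types.h>
    #include <linux/cpumask.h>

    /* One bit per possible CPU; to_cpumask() turns the unsigned-long
     * array into a struct cpumask * usable with the normal helpers. */
    static DECLARE_BITMAP(example_dev_map, NR_CPUS);

    static void example_mark(unsigned int cpu)
    {
        cpumask_set_cpu(cpu, to_cpumask(example_dev_map));
    }

    static bool example_is_marked(unsigned int cpu)
    {
        return cpumask_test_cpu(cpu, to_cpumask(example_dev_map));
    }

    static void example_unmark(unsigned int cpu)
    {
        cpumask_clear_cpu(cpu, to_cpumask(example_dev_map));
    }

The point of the conversion, here and in show_shared_cpu_map_func(), is to stop passing cpumask_t by value and to funnel all accesses through const struct cpumask *, which is what the newer cpumask API expects.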
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index 0ebf3fc6a610..dfaebce3633e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * mce.c - x86 Machine Check Exception Reporting 2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@redhat.com> 3 * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com>
4 */ 4 */
5 5
6#include <linux/init.h> 6#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 4b031a4ac856..1c838032fd37 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -510,12 +510,9 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
510 */ 510 */
511void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 511void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
512{ 512{
513 static cpumask_t mce_cpus = CPU_MASK_NONE;
514
515 mce_cpu_quirks(c); 513 mce_cpu_quirks(c);
516 514
517 if (mce_dont_init || 515 if (mce_dont_init ||
518 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
519 !mce_available(c)) 516 !mce_available(c))
520 return; 517 return;
521 518
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 5eb390a4b2e9..4772e91e8246 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -67,7 +67,7 @@ static struct threshold_block threshold_defaults = {
67struct threshold_bank { 67struct threshold_bank {
68 struct kobject *kobj; 68 struct kobject *kobj;
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
73 73
@@ -83,34 +83,41 @@ static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
83 * CPU Initialization 83 * CPU Initialization
84 */ 84 */
85 85
86struct thresh_restart {
87 struct threshold_block *b;
88 int reset;
89 u16 old_limit;
90};
91
86/* must be called with correct cpu affinity */ 92/* must be called with correct cpu affinity */
87static void threshold_restart_bank(struct threshold_block *b, 93static long threshold_restart_bank(void *_tr)
88 int reset, u16 old_limit)
89{ 94{
95 struct thresh_restart *tr = _tr;
90 u32 mci_misc_hi, mci_misc_lo; 96 u32 mci_misc_hi, mci_misc_lo;
91 97
92 rdmsr(b->address, mci_misc_lo, mci_misc_hi); 98 rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
93 99
94 if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) 100 if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
95 reset = 1; /* limit cannot be lower than err count */ 101 tr->reset = 1; /* limit cannot be lower than err count */
96 102
97 if (reset) { /* reset err count and overflow bit */ 103 if (tr->reset) { /* reset err count and overflow bit */
98 mci_misc_hi = 104 mci_misc_hi =
99 (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | 105 (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
100 (THRESHOLD_MAX - b->threshold_limit); 106 (THRESHOLD_MAX - tr->b->threshold_limit);
101 } else if (old_limit) { /* change limit w/o reset */ 107 } else if (tr->old_limit) { /* change limit w/o reset */
102 int new_count = (mci_misc_hi & THRESHOLD_MAX) + 108 int new_count = (mci_misc_hi & THRESHOLD_MAX) +
103 (old_limit - b->threshold_limit); 109 (tr->old_limit - tr->b->threshold_limit);
104 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | 110 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
105 (new_count & THRESHOLD_MAX); 111 (new_count & THRESHOLD_MAX);
106 } 112 }
107 113
108 b->interrupt_enable ? 114 tr->b->interrupt_enable ?
109 (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : 115 (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
110 (mci_misc_hi &= ~MASK_INT_TYPE_HI); 116 (mci_misc_hi &= ~MASK_INT_TYPE_HI);
111 117
112 mci_misc_hi |= MASK_COUNT_EN_HI; 118 mci_misc_hi |= MASK_COUNT_EN_HI;
113 wrmsr(b->address, mci_misc_lo, mci_misc_hi); 119 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
120 return 0;
114} 121}
115 122
116/* cpu init entry point, called from mce.c with preempt off */ 123/* cpu init entry point, called from mce.c with preempt off */
@@ -120,6 +127,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
120 unsigned int cpu = smp_processor_id(); 127 unsigned int cpu = smp_processor_id();
121 u8 lvt_off; 128 u8 lvt_off;
122 u32 low = 0, high = 0, address = 0; 129 u32 low = 0, high = 0, address = 0;
130 struct thresh_restart tr;
123 131
124 for (bank = 0; bank < NR_BANKS; ++bank) { 132 for (bank = 0; bank < NR_BANKS; ++bank) {
125 for (block = 0; block < NR_BLOCKS; ++block) { 133 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -162,7 +170,10 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
162 wrmsr(address, low, high); 170 wrmsr(address, low, high);
163 171
164 threshold_defaults.address = address; 172 threshold_defaults.address = address;
165 threshold_restart_bank(&threshold_defaults, 0, 0); 173 tr.b = &threshold_defaults;
174 tr.reset = 0;
175 tr.old_limit = 0;
176 threshold_restart_bank(&tr);
166 } 177 }
167 } 178 }
168} 179}
@@ -237,7 +248,7 @@ asmlinkage void mce_threshold_interrupt(void)
237 } 248 }
238 } 249 }
239out: 250out:
240 add_pda(irq_threshold_count, 1); 251 inc_irq_stat(irq_threshold_count);
241 irq_exit(); 252 irq_exit();
242} 253}
243 254
@@ -251,20 +262,6 @@ struct threshold_attr {
251 ssize_t(*store) (struct threshold_block *, const char *, size_t count); 262 ssize_t(*store) (struct threshold_block *, const char *, size_t count);
252}; 263};
253 264
254static void affinity_set(unsigned int cpu, cpumask_t *oldmask,
255 cpumask_t *newmask)
256{
257 *oldmask = current->cpus_allowed;
258 cpus_clear(*newmask);
259 cpu_set(cpu, *newmask);
260 set_cpus_allowed_ptr(current, newmask);
261}
262
263static void affinity_restore(const cpumask_t *oldmask)
264{
265 set_cpus_allowed_ptr(current, oldmask);
266}
267
268#define SHOW_FIELDS(name) \ 265#define SHOW_FIELDS(name) \
269static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ 266static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
270{ \ 267{ \
@@ -277,15 +274,16 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
277 const char *buf, size_t count) 274 const char *buf, size_t count)
278{ 275{
279 char *end; 276 char *end;
280 cpumask_t oldmask, newmask; 277 struct thresh_restart tr;
281 unsigned long new = simple_strtoul(buf, &end, 0); 278 unsigned long new = simple_strtoul(buf, &end, 0);
282 if (end == buf) 279 if (end == buf)
283 return -EINVAL; 280 return -EINVAL;
284 b->interrupt_enable = !!new; 281 b->interrupt_enable = !!new;
285 282
286 affinity_set(b->cpu, &oldmask, &newmask); 283 tr.b = b;
287 threshold_restart_bank(b, 0, 0); 284 tr.reset = 0;
288 affinity_restore(&oldmask); 285 tr.old_limit = 0;
286 work_on_cpu(b->cpu, threshold_restart_bank, &tr);
289 287
290 return end - buf; 288 return end - buf;
291} 289}
@@ -294,8 +292,7 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
294 const char *buf, size_t count) 292 const char *buf, size_t count)
295{ 293{
296 char *end; 294 char *end;
297 cpumask_t oldmask, newmask; 295 struct thresh_restart tr;
298 u16 old;
299 unsigned long new = simple_strtoul(buf, &end, 0); 296 unsigned long new = simple_strtoul(buf, &end, 0);
300 if (end == buf) 297 if (end == buf)
301 return -EINVAL; 298 return -EINVAL;
@@ -303,34 +300,36 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
303 new = THRESHOLD_MAX; 300 new = THRESHOLD_MAX;
304 if (new < 1) 301 if (new < 1)
305 new = 1; 302 new = 1;
306 old = b->threshold_limit; 303 tr.old_limit = b->threshold_limit;
307 b->threshold_limit = new; 304 b->threshold_limit = new;
305 tr.b = b;
306 tr.reset = 0;
308 307
309 affinity_set(b->cpu, &oldmask, &newmask); 308 work_on_cpu(b->cpu, threshold_restart_bank, &tr);
310 threshold_restart_bank(b, 0, old);
311 affinity_restore(&oldmask);
312 309
313 return end - buf; 310 return end - buf;
314} 311}
315 312
316static ssize_t show_error_count(struct threshold_block *b, char *buf) 313static long local_error_count(void *_b)
317{ 314{
318 u32 high, low; 315 struct threshold_block *b = _b;
319 cpumask_t oldmask, newmask; 316 u32 low, high;
320 affinity_set(b->cpu, &oldmask, &newmask); 317
321 rdmsr(b->address, low, high); 318 rdmsr(b->address, low, high);
322 affinity_restore(&oldmask); 319 return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
323 return sprintf(buf, "%x\n", 320}
324 (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); 321
322static ssize_t show_error_count(struct threshold_block *b, char *buf)
323{
324 return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b));
325} 325}
326 326
327static ssize_t store_error_count(struct threshold_block *b, 327static ssize_t store_error_count(struct threshold_block *b,
328 const char *buf, size_t count) 328 const char *buf, size_t count)
329{ 329{
330 cpumask_t oldmask, newmask; 330 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
331 affinity_set(b->cpu, &oldmask, &newmask); 331
332 threshold_restart_bank(b, 1, 0); 332 work_on_cpu(b->cpu, threshold_restart_bank, &tr);
333 affinity_restore(&oldmask);
334 return 1; 333 return 1;
335} 334}
336 335
@@ -463,19 +462,26 @@ out_free:
463 return err; 462 return err;
464} 463}
465 464
465static __cpuinit long local_allocate_threshold_blocks(void *_bank)
466{
467 unsigned int *bank = _bank;
468
469 return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
470 MSR_IA32_MC0_MISC + *bank * 4);
471}
472
466/* symlinks sibling shared banks to first core. first core owns dir/files. */ 473/* symlinks sibling shared banks to first core. first core owns dir/files. */
467static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) 474static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
468{ 475{
469 int i, err = 0; 476 int i, err = 0;
470 struct threshold_bank *b = NULL; 477 struct threshold_bank *b = NULL;
471 cpumask_t oldmask, newmask;
472 char name[32]; 478 char name[32];
473 479
474 sprintf(name, "threshold_bank%i", bank); 480 sprintf(name, "threshold_bank%i", bank);
475 481
476#ifdef CONFIG_SMP 482#ifdef CONFIG_SMP
477 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 483 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
478 i = first_cpu(per_cpu(cpu_core_map, cpu)); 484 i = cpumask_first(&per_cpu(cpu_core_map, cpu));
479 485
480 /* first core not up yet */ 486 /* first core not up yet */
481 if (cpu_data(i).cpu_core_id) 487 if (cpu_data(i).cpu_core_id)
@@ -495,7 +501,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
495 if (err) 501 if (err)
496 goto out; 502 goto out;
497 503
498 b->cpus = per_cpu(cpu_core_map, cpu); 504 cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
499 per_cpu(threshold_banks, cpu)[bank] = b; 505 per_cpu(threshold_banks, cpu)[bank] = b;
500 goto out; 506 goto out;
501 } 507 }
@@ -506,28 +512,29 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
506 err = -ENOMEM; 512 err = -ENOMEM;
507 goto out; 513 goto out;
508 } 514 }
515 if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
516 kfree(b);
517 err = -ENOMEM;
518 goto out;
519 }
509 520
510 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); 521 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);
511 if (!b->kobj) 522 if (!b->kobj)
512 goto out_free; 523 goto out_free;
513 524
514#ifndef CONFIG_SMP 525#ifndef CONFIG_SMP
515 b->cpus = CPU_MASK_ALL; 526 cpumask_setall(b->cpus);
516#else 527#else
517 b->cpus = per_cpu(cpu_core_map, cpu); 528 cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
518#endif 529#endif
519 530
520 per_cpu(threshold_banks, cpu)[bank] = b; 531 per_cpu(threshold_banks, cpu)[bank] = b;
521 532
522 affinity_set(cpu, &oldmask, &newmask); 533 err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank);
523 err = allocate_threshold_blocks(cpu, bank, 0,
524 MSR_IA32_MC0_MISC + bank * 4);
525 affinity_restore(&oldmask);
526
527 if (err) 534 if (err)
528 goto out_free; 535 goto out_free;
529 536
530 for_each_cpu_mask_nr(i, b->cpus) { 537 for_each_cpu(i, b->cpus) {
531 if (i == cpu) 538 if (i == cpu)
532 continue; 539 continue;
533 540
@@ -543,6 +550,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
543 550
544out_free: 551out_free:
545 per_cpu(threshold_banks, cpu)[bank] = NULL; 552 per_cpu(threshold_banks, cpu)[bank] = NULL;
553 free_cpumask_var(b->cpus);
546 kfree(b); 554 kfree(b);
547out: 555out:
548 return err; 556 return err;
@@ -617,7 +625,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
617#endif 625#endif
618 626
619 /* remove all sibling symlinks before unregistering */ 627 /* remove all sibling symlinks before unregistering */
620 for_each_cpu_mask_nr(i, b->cpus) { 628 for_each_cpu(i, b->cpus) {
621 if (i == cpu) 629 if (i == cpu)
622 continue; 630 continue;
623 631
@@ -630,6 +638,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
630free_out: 638free_out:
631 kobject_del(b->kobj); 639 kobject_del(b->kobj);
632 kobject_put(b->kobj); 640 kobject_put(b->kobj);
641 free_cpumask_var(b->cpus);
633 kfree(b); 642 kfree(b);
634 per_cpu(threshold_banks, cpu)[bank] = NULL; 643 per_cpu(threshold_banks, cpu)[bank] = NULL;
635} 644}
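Note: the mce_amd_64.c changes above replace the affinity_set()/affinity_restore() trick (temporarily rebinding current to the target CPU so that rdmsr/wrmsr hit the right bank) with work_on_cpu(), which queues a callback on the chosen CPU and returns its long result. Because the callback receives only one void * argument, the former parameter list is bundled into struct thresh_restart. A stripped-down sketch of the same pattern, with hypothetical names and under the assumption that work_on_cpu() has the long (*fn)(void *) signature used in the hunks:

    #include <linux/types.h>
    #include <linux/workqueue.h>
    #include <asm/msr.h>

    /* All arguments for the remote call packed into one structure,
     * because work_on_cpu() passes a single void * to the callback. */
    struct example_msr_write {
        unsigned int msr;
        u32 lo, hi;
    };

    /* Runs in a workqueue bound to the target CPU, so the MSR access
     * is guaranteed to happen on that CPU. */
    static long example_do_write(void *arg)
    {
        struct example_msr_write *w = arg;

        wrmsr(w->msr, w->lo, w->hi);
        return 0;
    }

    static long example_write_on_cpu(unsigned int cpu, unsigned int msr,
                                     u32 lo, u32 hi)
    {
        struct example_msr_write w = { .msr = msr, .lo = lo, .hi = hi };

        /* Replaces the old save/set/restore of current->cpus_allowed. */
        return work_on_cpu(cpu, example_do_write, &w);
    }

The same file also turns struct threshold_bank's cpus member into a cpumask_var_t, which must be obtained with alloc_cpumask_var() and released with free_cpumask_var() on both the error and removal paths, as the later hunks show.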
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index c17eaf5dd6dd..5e8c79e748a6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -7,6 +7,7 @@
7#include <linux/interrupt.h> 7#include <linux/interrupt.h>
8#include <linux/percpu.h> 8#include <linux/percpu.h>
9#include <asm/processor.h> 9#include <asm/processor.h>
10#include <asm/apic.h>
10#include <asm/msr.h> 11#include <asm/msr.h>
11#include <asm/mce.h> 12#include <asm/mce.h>
12#include <asm/hw_irq.h> 13#include <asm/hw_irq.h>
@@ -26,7 +27,7 @@ asmlinkage void smp_thermal_interrupt(void)
26 if (therm_throt_process(msr_val & 1)) 27 if (therm_throt_process(msr_val & 1))
27 mce_log_therm_throt_event(smp_processor_id(), msr_val); 28 mce_log_therm_throt_event(smp_processor_id(), msr_val);
28 29
29 add_pda(irq_thermal_count, 1); 30 inc_irq_stat(irq_thermal_count);
30 irq_exit(); 31 irq_exit();
31} 32}
32 33
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index bfa5817afdda..c9f77ea69edc 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * P5 specific Machine Check Exception Reporting 2 * P5 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@redhat.com> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5 5
6#include <linux/init.h> 6#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index 62efc9c2b3af..2ac52d7b434b 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * P6 specific Machine Check Exception Reporting 2 * P6 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@redhat.com> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5 5
6#include <linux/init.h> 6#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index f2be3e190c6b..2a043d89811d 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * IDT Winchip specific Machine Check Exception Reporting 2 * IDT Winchip specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@redhat.com> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5 5
6#include <linux/init.h> 6#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 4e8d77f01eeb..0c0a455fe95c 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -14,14 +14,6 @@
14#include <asm/pat.h> 14#include <asm/pat.h>
15#include "mtrr.h" 15#include "mtrr.h"
16 16
17struct mtrr_state {
18 struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
19 mtrr_type fixed_ranges[NUM_FIXED_RANGES];
20 unsigned char enabled;
21 unsigned char have_fixed;
22 mtrr_type def_type;
23};
24
25struct fixed_range_block { 17struct fixed_range_block {
26 int base_msr; /* start address of an MTRR block */ 18 int base_msr; /* start address of an MTRR block */
27 int ranges; /* number of MTRRs in this block */ 19 int ranges; /* number of MTRRs in this block */
@@ -35,15 +27,19 @@ static struct fixed_range_block fixed_range_blocks[] = {
35}; 27};
36 28
37static unsigned long smp_changes_mask; 29static unsigned long smp_changes_mask;
38static struct mtrr_state mtrr_state = {};
39static int mtrr_state_set; 30static int mtrr_state_set;
40u64 mtrr_tom2; 31u64 mtrr_tom2;
41 32
42#undef MODULE_PARAM_PREFIX 33struct mtrr_state_type mtrr_state = {};
43#define MODULE_PARAM_PREFIX "mtrr." 34EXPORT_SYMBOL_GPL(mtrr_state);
44 35
45static int mtrr_show; 36static int __initdata mtrr_show;
46module_param_named(show, mtrr_show, bool, 0); 37static int __init mtrr_debug(char *opt)
38{
39 mtrr_show = 1;
40 return 0;
41}
42early_param("mtrr.show", mtrr_debug);
47 43
48/* 44/*
49 * Returns the effective MTRR type for the region 45 * Returns the effective MTRR type for the region
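Note: in generic.c the "show the MTRRs at boot" switch moves from module_param_named(show, ...) to an early_param() handler, so mtrr.show is parsed by parse_early_param() well before normal parameter handling runs, and the flag itself can live in __initdata. A minimal sketch of that boot-option idiom, with a hypothetical option name:

    #include <linux/init.h>

    static int __initdata example_show;

    /* Called from parse_early_param() during early boot; returning 0
     * tells the parser the option was handled. */
    static int __init example_show_setup(char *arg)
    {
        example_show = 1;
        return 0;
    }
    early_param("example.show", example_show_setup);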
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index c78c04821ea1..236a401b8259 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -49,7 +49,7 @@
49 49
50u32 num_var_ranges = 0; 50u32 num_var_ranges = 0;
51 51
52unsigned int mtrr_usage_table[MAX_VAR_RANGES]; 52unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
53static DEFINE_MUTEX(mtrr_mutex); 53static DEFINE_MUTEX(mtrr_mutex);
54 54
55u64 size_or_mask, size_and_mask; 55u64 size_or_mask, size_and_mask;
@@ -574,7 +574,7 @@ struct mtrr_value {
574 unsigned long lsize; 574 unsigned long lsize;
575}; 575};
576 576
577static struct mtrr_value mtrr_state[MAX_VAR_RANGES]; 577static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES];
578 578
579static int mtrr_save(struct sys_device * sysdev, pm_message_t state) 579static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
580{ 580{
@@ -803,6 +803,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
803} 803}
804 804
805static struct res_range __initdata range[RANGE_NUM]; 805static struct res_range __initdata range[RANGE_NUM];
806static int __initdata nr_range;
806 807
807#ifdef CONFIG_MTRR_SANITIZER 808#ifdef CONFIG_MTRR_SANITIZER
808 809
@@ -823,16 +824,14 @@ static int enable_mtrr_cleanup __initdata =
823 824
824static int __init disable_mtrr_cleanup_setup(char *str) 825static int __init disable_mtrr_cleanup_setup(char *str)
825{ 826{
826 if (enable_mtrr_cleanup != -1) 827 enable_mtrr_cleanup = 0;
827 enable_mtrr_cleanup = 0;
828 return 0; 828 return 0;
829} 829}
830early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); 830early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
831 831
832static int __init enable_mtrr_cleanup_setup(char *str) 832static int __init enable_mtrr_cleanup_setup(char *str)
833{ 833{
834 if (enable_mtrr_cleanup != -1) 834 enable_mtrr_cleanup = 1;
835 enable_mtrr_cleanup = 1;
836 return 0; 835 return 0;
837} 836}
838early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); 837early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
@@ -1206,39 +1205,43 @@ struct mtrr_cleanup_result {
1206#define PSHIFT (PAGE_SHIFT - 10) 1205#define PSHIFT (PAGE_SHIFT - 10)
1207 1206
1208static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; 1207static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
1209static struct res_range __initdata range_new[RANGE_NUM];
1210static unsigned long __initdata min_loss_pfn[RANGE_NUM]; 1208static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1211 1209
1212static int __init mtrr_cleanup(unsigned address_bits) 1210static void __init print_out_mtrr_range_state(void)
1213{ 1211{
1214 unsigned long extra_remove_base, extra_remove_size;
1215 unsigned long base, size, def, dummy;
1216 mtrr_type type;
1217 int nr_range, nr_range_new;
1218 u64 chunk_size, gran_size;
1219 unsigned long range_sums, range_sums_new;
1220 int index_good;
1221 int num_reg_good;
1222 int i; 1212 int i;
1213 char start_factor = 'K', size_factor = 'K';
1214 unsigned long start_base, size_base;
1215 mtrr_type type;
1223 1216
1224 /* extra one for all 0 */ 1217 for (i = 0; i < num_var_ranges; i++) {
1225 int num[MTRR_NUM_TYPES + 1];
1226 1218
1227 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) 1219 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1228 return 0; 1220 if (!size_base)
1229 rdmsr(MTRRdefType_MSR, def, dummy); 1221 continue;
1230 def &= 0xff;
1231 if (def != MTRR_TYPE_UNCACHABLE)
1232 return 0;
1233 1222
1234 /* get it and store it aside */ 1223 size_base = to_size_factor(size_base, &size_factor),
1235 memset(range_state, 0, sizeof(range_state)); 1224 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1236 for (i = 0; i < num_var_ranges; i++) { 1225 start_base = to_size_factor(start_base, &start_factor),
1237 mtrr_if->get(i, &base, &size, &type); 1226 type = range_state[i].type;
1238 range_state[i].base_pfn = base; 1227
1239 range_state[i].size_pfn = size; 1228 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1240 range_state[i].type = type; 1229 i, start_base, start_factor,
1230 size_base, size_factor,
1231 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1232 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1233 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1234 );
1241 } 1235 }
1236}
1237
1238static int __init mtrr_need_cleanup(void)
1239{
1240 int i;
1241 mtrr_type type;
1242 unsigned long size;
1243 /* extra one for all 0 */
1244 int num[MTRR_NUM_TYPES + 1];
1242 1245
1243 /* check entries number */ 1246 /* check entries number */
1244 memset(num, 0, sizeof(num)); 1247 memset(num, 0, sizeof(num));
@@ -1263,29 +1266,133 @@ static int __init mtrr_cleanup(unsigned address_bits)
1263 num_var_ranges - num[MTRR_NUM_TYPES]) 1266 num_var_ranges - num[MTRR_NUM_TYPES])
1264 return 0; 1267 return 0;
1265 1268
1266 /* print original var MTRRs at first, for debugging: */ 1269 return 1;
1267 printk(KERN_DEBUG "original variable MTRRs\n"); 1270}
1268 for (i = 0; i < num_var_ranges; i++) {
1269 char start_factor = 'K', size_factor = 'K';
1270 unsigned long start_base, size_base;
1271 1271
1272 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); 1272static unsigned long __initdata range_sums;
1273 if (!size_base) 1273static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
1274 continue; 1274 unsigned long extra_remove_base,
1275 unsigned long extra_remove_size,
1276 int i)
1277{
1278 int num_reg;
1279 static struct res_range range_new[RANGE_NUM];
1280 static int nr_range_new;
1281 unsigned long range_sums_new;
1282
1283 /* convert ranges to var ranges state */
1284 num_reg = x86_setup_var_mtrrs(range, nr_range,
1285 chunk_size, gran_size);
1286
1287 /* we got new setting in range_state, check it */
1288 memset(range_new, 0, sizeof(range_new));
1289 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1290 extra_remove_base, extra_remove_size);
1291 range_sums_new = sum_ranges(range_new, nr_range_new);
1292
1293 result[i].chunk_sizek = chunk_size >> 10;
1294 result[i].gran_sizek = gran_size >> 10;
1295 result[i].num_reg = num_reg;
1296 if (range_sums < range_sums_new) {
1297 result[i].lose_cover_sizek =
1298 (range_sums_new - range_sums) << PSHIFT;
1299 result[i].bad = 1;
1300 } else
1301 result[i].lose_cover_sizek =
1302 (range_sums - range_sums_new) << PSHIFT;
1275 1303
1276 size_base = to_size_factor(size_base, &size_factor), 1304 /* double check it */
1277 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); 1305 if (!result[i].bad && !result[i].lose_cover_sizek) {
1278 start_base = to_size_factor(start_base, &start_factor), 1306 if (nr_range_new != nr_range ||
1279 type = range_state[i].type; 1307 memcmp(range, range_new, sizeof(range)))
1308 result[i].bad = 1;
1309 }
1280 1310
1281 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", 1311 if (!result[i].bad && (range_sums - range_sums_new <
1282 i, start_base, start_factor, 1312 min_loss_pfn[num_reg])) {
1283 size_base, size_factor, 1313 min_loss_pfn[num_reg] =
1284 (type == MTRR_TYPE_UNCACHABLE) ? "UC" : 1314 range_sums - range_sums_new;
1285 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1286 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1287 );
1288 } 1315 }
1316}
1317
1318static void __init mtrr_print_out_one_result(int i)
1319{
1320 char gran_factor, chunk_factor, lose_factor;
1321 unsigned long gran_base, chunk_base, lose_base;
1322
1323 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1324 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1325 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1326 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1327 result[i].bad ? "*BAD*" : " ",
1328 gran_base, gran_factor, chunk_base, chunk_factor);
1329 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1330 result[i].num_reg, result[i].bad ? "-" : "",
1331 lose_base, lose_factor);
1332}
1333
1334static int __init mtrr_search_optimal_index(void)
1335{
1336 int i;
1337 int num_reg_good;
1338 int index_good;
1339
1340 if (nr_mtrr_spare_reg >= num_var_ranges)
1341 nr_mtrr_spare_reg = num_var_ranges - 1;
1342 num_reg_good = -1;
1343 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1344 if (!min_loss_pfn[i])
1345 num_reg_good = i;
1346 }
1347
1348 index_good = -1;
1349 if (num_reg_good != -1) {
1350 for (i = 0; i < NUM_RESULT; i++) {
1351 if (!result[i].bad &&
1352 result[i].num_reg == num_reg_good &&
1353 !result[i].lose_cover_sizek) {
1354 index_good = i;
1355 break;
1356 }
1357 }
1358 }
1359
1360 return index_good;
1361}
1362
1363
1364static int __init mtrr_cleanup(unsigned address_bits)
1365{
1366 unsigned long extra_remove_base, extra_remove_size;
1367 unsigned long base, size, def, dummy;
1368 mtrr_type type;
1369 u64 chunk_size, gran_size;
1370 int index_good;
1371 int i;
1372
1373 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
1374 return 0;
1375 rdmsr(MTRRdefType_MSR, def, dummy);
1376 def &= 0xff;
1377 if (def != MTRR_TYPE_UNCACHABLE)
1378 return 0;
1379
1380 /* get it and store it aside */
1381 memset(range_state, 0, sizeof(range_state));
1382 for (i = 0; i < num_var_ranges; i++) {
1383 mtrr_if->get(i, &base, &size, &type);
1384 range_state[i].base_pfn = base;
1385 range_state[i].size_pfn = size;
1386 range_state[i].type = type;
1387 }
1388
1389 /* check if we need handle it and can handle it */
1390 if (!mtrr_need_cleanup())
1391 return 0;
1392
1393 /* print original var MTRRs at first, for debugging: */
1394 printk(KERN_DEBUG "original variable MTRRs\n");
1395 print_out_mtrr_range_state();
1289 1396
1290 memset(range, 0, sizeof(range)); 1397 memset(range, 0, sizeof(range));
1291 extra_remove_size = 0; 1398 extra_remove_size = 0;
@@ -1309,176 +1416,64 @@ static int __init mtrr_cleanup(unsigned address_bits)
1309 range_sums >> (20 - PAGE_SHIFT)); 1416 range_sums >> (20 - PAGE_SHIFT));
1310 1417
1311 if (mtrr_chunk_size && mtrr_gran_size) { 1418 if (mtrr_chunk_size && mtrr_gran_size) {
1312 int num_reg; 1419 i = 0;
1313 char gran_factor, chunk_factor, lose_factor; 1420 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
1314 unsigned long gran_base, chunk_base, lose_base; 1421 extra_remove_base, extra_remove_size, i);
1315
1316 debug_print++;
1317 /* convert ranges to var ranges state */
1318 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
1319 mtrr_gran_size);
1320 1422
1321 /* we got new setting in range_state, check it */ 1423 mtrr_print_out_one_result(i);
1322 memset(range_new, 0, sizeof(range_new));
1323 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1324 extra_remove_base,
1325 extra_remove_size);
1326 range_sums_new = sum_ranges(range_new, nr_range_new);
1327 1424
1328 i = 0;
1329 result[i].chunk_sizek = mtrr_chunk_size >> 10;
1330 result[i].gran_sizek = mtrr_gran_size >> 10;
1331 result[i].num_reg = num_reg;
1332 if (range_sums < range_sums_new) {
1333 result[i].lose_cover_sizek =
1334 (range_sums_new - range_sums) << PSHIFT;
1335 result[i].bad = 1;
1336 } else
1337 result[i].lose_cover_sizek =
1338 (range_sums - range_sums_new) << PSHIFT;
1339
1340 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1341 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1342 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1343 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1344 result[i].bad?"*BAD*":" ",
1345 gran_base, gran_factor, chunk_base, chunk_factor);
1346 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1347 result[i].num_reg, result[i].bad?"-":"",
1348 lose_base, lose_factor);
1349 if (!result[i].bad) { 1425 if (!result[i].bad) {
1350 set_var_mtrr_all(address_bits); 1426 set_var_mtrr_all(address_bits);
1351 return 1; 1427 return 1;
1352 } 1428 }
1353 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " 1429 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1354 "will find optimal one\n"); 1430 "will find optimal one\n");
1355 debug_print--;
1356 memset(result, 0, sizeof(result[0]));
1357 } 1431 }
1358 1432
1359 i = 0; 1433 i = 0;
1360 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); 1434 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1361 memset(result, 0, sizeof(result)); 1435 memset(result, 0, sizeof(result));
1362 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { 1436 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1363 char gran_factor;
1364 unsigned long gran_base;
1365
1366 if (debug_print)
1367 gran_base = to_size_factor(gran_size >> 10, &gran_factor);
1368 1437
1369 for (chunk_size = gran_size; chunk_size < (1ULL<<32); 1438 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1370 chunk_size <<= 1) { 1439 chunk_size <<= 1) {
1371 int num_reg;
1372 1440
1373 if (debug_print) {
1374 char chunk_factor;
1375 unsigned long chunk_base;
1376
1377 chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
1378 printk(KERN_INFO "\n");
1379 printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
1380 gran_base, gran_factor, chunk_base, chunk_factor);
1381 }
1382 if (i >= NUM_RESULT) 1441 if (i >= NUM_RESULT)
1383 continue; 1442 continue;
1384 1443
1385 /* convert ranges to var ranges state */ 1444 mtrr_calc_range_state(chunk_size, gran_size,
1386 num_reg = x86_setup_var_mtrrs(range, nr_range, 1445 extra_remove_base, extra_remove_size, i);
1387 chunk_size, gran_size); 1446 if (debug_print) {
1388 1447 mtrr_print_out_one_result(i);
1389 /* we got new setting in range_state, check it */ 1448 printk(KERN_INFO "\n");
1390 memset(range_new, 0, sizeof(range_new));
1391 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1392 extra_remove_base, extra_remove_size);
1393 range_sums_new = sum_ranges(range_new, nr_range_new);
1394
1395 result[i].chunk_sizek = chunk_size >> 10;
1396 result[i].gran_sizek = gran_size >> 10;
1397 result[i].num_reg = num_reg;
1398 if (range_sums < range_sums_new) {
1399 result[i].lose_cover_sizek =
1400 (range_sums_new - range_sums) << PSHIFT;
1401 result[i].bad = 1;
1402 } else
1403 result[i].lose_cover_sizek =
1404 (range_sums - range_sums_new) << PSHIFT;
1405
1406 /* double check it */
1407 if (!result[i].bad && !result[i].lose_cover_sizek) {
1408 if (nr_range_new != nr_range ||
1409 memcmp(range, range_new, sizeof(range)))
1410 result[i].bad = 1;
1411 } 1449 }
1412 1450
1413 if (!result[i].bad && (range_sums - range_sums_new <
1414 min_loss_pfn[num_reg])) {
1415 min_loss_pfn[num_reg] =
1416 range_sums - range_sums_new;
1417 }
1418 i++; 1451 i++;
1419 } 1452 }
1420 } 1453 }
1421 1454
1422 /* print out all */
1423 for (i = 0; i < NUM_RESULT; i++) {
1424 char gran_factor, chunk_factor, lose_factor;
1425 unsigned long gran_base, chunk_base, lose_base;
1426
1427 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1428 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1429 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1430 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1431 result[i].bad?"*BAD*":" ",
1432 gran_base, gran_factor, chunk_base, chunk_factor);
1433 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1434 result[i].num_reg, result[i].bad?"-":"",
1435 lose_base, lose_factor);
1436 }
1437
1438 /* try to find the optimal index */ 1455 /* try to find the optimal index */
1439 if (nr_mtrr_spare_reg >= num_var_ranges) 1456 index_good = mtrr_search_optimal_index();
1440 nr_mtrr_spare_reg = num_var_ranges - 1;
1441 num_reg_good = -1;
1442 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1443 if (!min_loss_pfn[i])
1444 num_reg_good = i;
1445 }
1446
1447 index_good = -1;
1448 if (num_reg_good != -1) {
1449 for (i = 0; i < NUM_RESULT; i++) {
1450 if (!result[i].bad &&
1451 result[i].num_reg == num_reg_good &&
1452 !result[i].lose_cover_sizek) {
1453 index_good = i;
1454 break;
1455 }
1456 }
1457 }
1458 1457
1459 if (index_good != -1) { 1458 if (index_good != -1) {
1460 char gran_factor, chunk_factor, lose_factor;
1461 unsigned long gran_base, chunk_base, lose_base;
1462
1463 printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); 1459 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1464 i = index_good; 1460 i = index_good;
1465 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), 1461 mtrr_print_out_one_result(i);
1466 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), 1462
1467 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1468 printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
1469 gran_base, gran_factor, chunk_base, chunk_factor);
1470 printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
1471 result[i].num_reg, lose_base, lose_factor);
1472 /* convert ranges to var ranges state */ 1463 /* convert ranges to var ranges state */
1473 chunk_size = result[i].chunk_sizek; 1464 chunk_size = result[i].chunk_sizek;
1474 chunk_size <<= 10; 1465 chunk_size <<= 10;
1475 gran_size = result[i].gran_sizek; 1466 gran_size = result[i].gran_sizek;
1476 gran_size <<= 10; 1467 gran_size <<= 10;
1477 debug_print++;
1478 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); 1468 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1479 debug_print--;
1480 set_var_mtrr_all(address_bits); 1469 set_var_mtrr_all(address_bits);
1470 printk(KERN_DEBUG "New variable MTRRs\n");
1471 print_out_mtrr_range_state();
1481 return 1; 1472 return 1;
1473 } else {
1474 /* print out all */
1475 for (i = 0; i < NUM_RESULT; i++)
1476 mtrr_print_out_one_result(i);
1482 } 1477 }
1483 1478
1484 printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); 1479 printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
@@ -1562,7 +1557,6 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1562{ 1557{
1563 unsigned long i, base, size, highest_pfn = 0, def, dummy; 1558 unsigned long i, base, size, highest_pfn = 0, def, dummy;
1564 mtrr_type type; 1559 mtrr_type type;
1565 int nr_range;
1566 u64 total_trim_size; 1560 u64 total_trim_size;
1567 1561
1568 /* extra one for all 0 */ 1562 /* extra one for all 0 */
@@ -1600,8 +1594,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1600 1594
1601 /* kvm/qemu doesn't have mtrr set right, don't trim them all */ 1595 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
1602 if (!highest_pfn) { 1596 if (!highest_pfn) {
1603 WARN(!kvm_para_available(), KERN_WARNING 1597 printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
1604 "WARNING: strange, CPU MTRRs all blank?\n");
1605 return 0; 1598 return 0;
1606 } 1599 }
1607 1600
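Note: the refactored mtrr_cleanup() path above prints every variable range and every candidate result through print_out_mtrr_range_state() and mtrr_print_out_one_result(), both of which rely on a to_size_factor()-style helper to render a size in KiB as the largest whole unit (K, M or G). The kernel's own helper is not shown in this hunk, so the following is only a userspace approximation of that formatting step; the exact scaling rules are an assumption:

    #include <stdio.h>

    /* Scale a size given in KiB down to the largest unit that divides
     * it exactly, returning the scaled value and the unit letter. */
    static unsigned long to_size_factor_demo(unsigned long sizek, char *factor)
    {
        if (sizek & ((1UL << 10) - 1)) {
            *factor = 'K';          /* not a whole number of MiB */
            return sizek;
        }
        if (sizek & ((1UL << 20) - 1)) {
            *factor = 'M';
            return sizek >> 10;
        }
        *factor = 'G';
        return sizek >> 20;
    }

    int main(void)
    {
        char f;
        unsigned long v = to_size_factor_demo(256 * 1024, &f); /* 256 MiB */

        printf("reg 0, base: %lu%cB\n", v, f);  /* prints "reg 0, base: 256MB" */
        return 0;
    }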
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2dc4ec656b23..ffd60409cc6d 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -8,11 +8,6 @@
8#define MTRRcap_MSR 0x0fe 8#define MTRRcap_MSR 0x0fe
9#define MTRRdefType_MSR 0x2ff 9#define MTRRdefType_MSR 0x2ff
10 10
11#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
12#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
13
14#define NUM_FIXED_RANGES 88
15#define MAX_VAR_RANGES 256
16#define MTRRfix64K_00000_MSR 0x250 11#define MTRRfix64K_00000_MSR 0x250
17#define MTRRfix16K_80000_MSR 0x258 12#define MTRRfix16K_80000_MSR 0x258
18#define MTRRfix16K_A0000_MSR 0x259 13#define MTRRfix16K_A0000_MSR 0x259
@@ -29,11 +24,7 @@
29#define MTRR_CHANGE_MASK_VARIABLE 0x02 24#define MTRR_CHANGE_MASK_VARIABLE 0x02
30#define MTRR_CHANGE_MASK_DEFTYPE 0x04 25#define MTRR_CHANGE_MASK_DEFTYPE 0x04
31 26
32/* In the Intel processor's MTRR interface, the MTRR type is always held in 27extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
33 an 8 bit field: */
34typedef u8 mtrr_type;
35
36extern unsigned int mtrr_usage_table[MAX_VAR_RANGES];
37 28
38struct mtrr_ops { 29struct mtrr_ops {
39 u32 vendor; 30 u32 vendor;
@@ -70,13 +61,6 @@ struct set_mtrr_context {
70 u32 ccr3; 61 u32 ccr3;
71}; 62};
72 63
73struct mtrr_var_range {
74 u32 base_lo;
75 u32 base_hi;
76 u32 mask_lo;
77 u32 mask_hi;
78};
79
80void set_mtrr_done(struct set_mtrr_context *ctxt); 64void set_mtrr_done(struct set_mtrr_context *ctxt);
81void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); 65void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
82void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); 66void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
new file mode 100644
index 000000000000..284c399e3234
--- /dev/null
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -0,0 +1,112 @@
1/*
2 * VMware Detection code.
3 *
4 * Copyright (C) 2008, VMware, Inc.
5 * Author : Alok N Kataria <akataria@vmware.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
15 * NON INFRINGEMENT. See the GNU General Public License for more
16 * details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 */
23
24#include <linux/dmi.h>
25#include <asm/div64.h>
26#include <asm/vmware.h>
27
28#define CPUID_VMWARE_INFO_LEAF 0x40000000
29#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
30#define VMWARE_HYPERVISOR_PORT 0x5658
31
32#define VMWARE_PORT_CMD_GETVERSION 10
33#define VMWARE_PORT_CMD_GETHZ 45
34
35#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
36 __asm__("inl (%%dx)" : \
37 "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
38 "0"(VMWARE_HYPERVISOR_MAGIC), \
39 "1"(VMWARE_PORT_CMD_##cmd), \
40 "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
41 "memory");
42
43static inline int __vmware_platform(void)
44{
45 uint32_t eax, ebx, ecx, edx;
46 VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx);
47 return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
48}
49
50static unsigned long __vmware_get_tsc_khz(void)
51{
52 uint64_t tsc_hz;
53 uint32_t eax, ebx, ecx, edx;
54
55 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
56
57 if (ebx == UINT_MAX)
58 return 0;
59 tsc_hz = eax | (((uint64_t)ebx) << 32);
60 do_div(tsc_hz, 1000);
61 BUG_ON(tsc_hz >> 32);
62 return tsc_hz;
63}
64
65/*
66 * While checking the dmi string information, just checking the product
67 * serial key should be enough, as this will always have a VMware
68 * specific string when running under VMware hypervisor.
69 */
70int vmware_platform(void)
71{
72 if (cpu_has_hypervisor) {
73 unsigned int eax, ebx, ecx, edx;
74 char hyper_vendor_id[13];
75
76 cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx);
77 memcpy(hyper_vendor_id + 0, &ebx, 4);
78 memcpy(hyper_vendor_id + 4, &ecx, 4);
79 memcpy(hyper_vendor_id + 8, &edx, 4);
80 hyper_vendor_id[12] = '\0';
81 if (!strcmp(hyper_vendor_id, "VMwareVMware"))
82 return 1;
83 } else if (dmi_available && dmi_name_in_serial("VMware") &&
84 __vmware_platform())
85 return 1;
86
87 return 0;
88}
89
90unsigned long vmware_get_tsc_khz(void)
91{
92 BUG_ON(!vmware_platform());
93 return __vmware_get_tsc_khz();
94}
95
96/*
97 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
98 * Still, due to timing difference when running on virtual cpus, the TSC can
99 * be marked as unstable in some cases. For example, the TSC sync check at
100 * bootup can fail due to a marginal offset between vcpus' TSCs (though the
101 * TSCs do not drift from each other). Also, the ACPI PM timer clocksource
102 * is not suitable as a watchdog when running on a hypervisor because the
103 * kernel may miss a wrap of the counter if the vcpu is descheduled for a
104 * long time. To skip these checks at runtime we set these capability bits,
105 * so that the kernel could just trust the hypervisor with providing a
106 * reliable virtual TSC that is suitable for timekeeping.
107 */
108void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c)
109{
110 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
111 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
112}
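Note: vmware_platform() in the new vmware.c leans on two conventions: CPUID.1:ECX bit 31 (surfaced in the kernel as cpu_has_hypervisor) advertises that a hypervisor is present, and the software-defined leaf 0x40000000 then returns a 12-byte vendor signature in EBX/ECX/EDX, "VMwareVMware" in this case, with the DMI-serial plus backdoor-port probe kept only as a fallback. A small userspace illustration of reading that signature; the helper below is hypothetical and x86-specific:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void cpuid(uint32_t leaf, uint32_t *a, uint32_t *b,
                      uint32_t *c, uint32_t *d)
    {
        __asm__ volatile("cpuid"
                         : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
                         : "a"(leaf), "c"(0));
    }

    int main(void)
    {
        uint32_t eax, ebx, ecx, edx;
        char sig[13];

        /* ECX bit 31 of leaf 1 is the "running under a hypervisor" bit. */
        cpuid(1, &eax, &ebx, &ecx, &edx);
        if (!(ecx & (1u << 31))) {
            puts("no hypervisor CPUID leaf advertised");
            return 0;
        }

        /* Leaf 0x40000000: vendor signature in EBX, ECX, EDX. */
        cpuid(0x40000000, &eax, &ebx, &ecx, &edx);
        memcpy(sig + 0, &ebx, 4);
        memcpy(sig + 4, &ecx, 4);
        memcpy(sig + 8, &edx, 4);
        sig[12] = '\0';

        printf("hypervisor signature: %s\n", sig);
        return 0;
    }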
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 72cefd1e649b..2ac1f0c2beb3 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -39,10 +39,10 @@
39#include <linux/device.h> 39#include <linux/device.h>
40#include <linux/cpu.h> 40#include <linux/cpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/uaccess.h>
42 43
43#include <asm/processor.h> 44#include <asm/processor.h>
44#include <asm/msr.h> 45#include <asm/msr.h>
45#include <asm/uaccess.h>
46#include <asm/system.h> 46#include <asm/system.h>
47 47
48static struct class *cpuid_class; 48static struct class *cpuid_class;
@@ -82,7 +82,7 @@ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
82} 82}
83 83
84static ssize_t cpuid_read(struct file *file, char __user *buf, 84static ssize_t cpuid_read(struct file *file, char __user *buf,
85 size_t count, loff_t * ppos) 85 size_t count, loff_t *ppos)
86{ 86{
87 char __user *tmp = buf; 87 char __user *tmp = buf;
88 struct cpuid_regs cmd; 88 struct cpuid_regs cmd;
@@ -117,11 +117,11 @@ static int cpuid_open(struct inode *inode, struct file *file)
117 unsigned int cpu; 117 unsigned int cpu;
118 struct cpuinfo_x86 *c; 118 struct cpuinfo_x86 *c;
119 int ret = 0; 119 int ret = 0;
120 120
121 lock_kernel(); 121 lock_kernel();
122 122
123 cpu = iminor(file->f_path.dentry->d_inode); 123 cpu = iminor(file->f_path.dentry->d_inode);
124 if (cpu >= NR_CPUS || !cpu_online(cpu)) { 124 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
125 ret = -ENXIO; /* No such CPU */ 125 ret = -ENXIO; /* No such CPU */
126 goto out; 126 goto out;
127 } 127 }
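Note: cpuid_open() now rejects minors at nr_cpu_ids, the number of CPU ids actually possible on this boot, rather than the compile-time NR_CPUS ceiling; with per-CPU data only set up for possible CPUs, the runtime bound is the one that matters. The check reduces to something like this sketch (hypothetical wrapper name):

    #include <linux/cpumask.h>
    #include <linux/errno.h>

    static int example_validate_cpu(unsigned int cpu)
    {
        /* nr_cpu_ids is the runtime limit; NR_CPUS is only the build-time cap. */
        if (cpu >= nr_cpu_ids || !cpu_online(cpu))
            return -ENXIO;  /* no such CPU */
        return 0;
    }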
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index d84a852e4cd7..ad7f2a696f4a 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -24,10 +24,11 @@
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/hpet.h> 25#include <asm/hpet.h>
26#include <linux/kdebug.h> 26#include <linux/kdebug.h>
27#include <asm/smp.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h>
29 30
30#include <mach_ipi.h> 31#include <asm/genapic.h>
31 32
32 33
33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 34#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
@@ -49,6 +50,15 @@ static void kdump_nmi_callback(int cpu, struct die_args *args)
49#endif 50#endif
50 crash_save_cpu(regs, cpu); 51 crash_save_cpu(regs, cpu);
51 52
53 /* Disable VMX or SVM if needed.
54 *
55 * We need to disable virtualization on all CPUs.
56 * Having VMX or SVM enabled on any CPU may break rebooting
57 * after the kdump kernel has finished its task.
58 */
59 cpu_emergency_vmxoff();
60 cpu_emergency_svm_disable();
61
52 disable_local_APIC(); 62 disable_local_APIC();
53} 63}
54 64
@@ -80,6 +90,14 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
80 local_irq_disable(); 90 local_irq_disable();
81 91
82 kdump_nmi_shootdown_cpus(); 92 kdump_nmi_shootdown_cpus();
93
94 /* Booting kdump kernel with VMX or SVM enabled won't work,
95 * because (among other limitations) we can't disable paging
96 * with the virt flags.
97 */
98 cpu_emergency_vmxoff();
99 cpu_emergency_svm_disable();
100
83 lapic_shutdown(); 101 lapic_shutdown();
84#if defined(CONFIG_X86_IO_APIC) 102#if defined(CONFIG_X86_IO_APIC)
85 disable_IO_APIC(); 103 disable_IO_APIC();
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 2b69994fd3a8..169a120587be 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -6,23 +6,20 @@
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * It manages: 8 * It manages:
9 * - per-thread and per-cpu allocation of BTS and PEBS 9 * - DS and BTS hardware configuration
10 * - buffer memory allocation (optional) 10 * - buffer overflow handling (to be done)
11 * - buffer overflow handling
12 * - buffer access 11 * - buffer access
13 * 12 *
14 * It assumes: 13 * It does not do:
15 * - get_task_struct on all parameter tasks 14 * - security checking (is the caller allowed to trace the task)
16 * - current is allowed to trace parameter tasks 15 * - buffer allocation (memory accounting)
17 * 16 *
18 * 17 *
19 * Copyright (C) 2007-2008 Intel Corporation. 18 * Copyright (C) 2007-2009 Intel Corporation.
20 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008 19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
21 */ 20 */
22 21
23 22
24#ifdef CONFIG_X86_DS
25
26#include <asm/ds.h> 23#include <asm/ds.h>
27 24
28#include <linux/errno.h> 25#include <linux/errno.h>
@@ -30,22 +27,69 @@
30#include <linux/slab.h> 27#include <linux/slab.h>
31#include <linux/sched.h> 28#include <linux/sched.h>
32#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/kernel.h>
33 31
34 32
35/* 33/*
36 * The configuration for a particular DS hardware implementation. 34 * The configuration for a particular DS hardware implementation.
37 */ 35 */
38struct ds_configuration { 36struct ds_configuration {
39 /* the size of the DS structure in bytes */ 37 /* the name of the configuration */
40 unsigned char sizeof_ds; 38 const char *name;
41 /* the size of one pointer-typed field in the DS structure in bytes; 39 /* the size of one pointer-typed field in the DS structure and
42 this covers the first 8 fields related to buffer management. */ 40 in the BTS and PEBS buffers in bytes;
41 this covers the first 8 DS fields related to buffer management. */
43 unsigned char sizeof_field; 42 unsigned char sizeof_field;
44 /* the size of a BTS/PEBS record in bytes */ 43 /* the size of a BTS/PEBS record in bytes */
45 unsigned char sizeof_rec[2]; 44 unsigned char sizeof_rec[2];
45 /* a series of bit-masks to control various features indexed
46 * by enum ds_feature */
47 unsigned long ctl[dsf_ctl_max];
48};
49static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
50
51#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
52
53#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */
54#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */
55#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
56
57#define BTS_CONTROL \
58 (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
59 ds_cfg.ctl[dsf_bts_overflow])
60
61
62/*
63 * A BTS or PEBS tracer.
64 *
65 * This holds the configuration of the tracer and serves as a handle
66 * to identify tracers.
67 */
68struct ds_tracer {
69 /* the DS context (partially) owned by this tracer */
70 struct ds_context *context;
71 /* the buffer provided on ds_request() and its size in bytes */
72 void *buffer;
73 size_t size;
74};
75
76struct bts_tracer {
77 /* the common DS part */
78 struct ds_tracer ds;
79 /* the trace including the DS configuration */
80 struct bts_trace trace;
81 /* buffer overflow notification function */
82 bts_ovfl_callback_t ovfl;
46}; 83};
47static struct ds_configuration ds_cfg;
48 84
85struct pebs_tracer {
86 /* the common DS part */
87 struct ds_tracer ds;
88 /* the trace including the DS configuration */
89 struct pebs_trace trace;
90 /* buffer overflow notification function */
91 pebs_ovfl_callback_t ovfl;
92};
49 93
50/* 94/*
51 * Debug Store (DS) save area configuration (see Intel64 and IA32 95 * Debug Store (DS) save area configuration (see Intel64 and IA32
@@ -111,32 +155,9 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
111 155
112 156
113/* 157/*
114 * Locking is done only for allocating BTS or PEBS resources and for 158 * Locking is done only for allocating BTS or PEBS resources.
115 * guarding context and buffer memory allocation.
116 *
117 * Most functions require the current task to own the ds context part
118 * they are going to access. All the locking is done when validating
119 * access to the context.
120 */ 159 */
121static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); 160static DEFINE_SPINLOCK(ds_lock);
122
123/*
124 * Validate that the current task is allowed to access the BTS/PEBS
125 * buffer of the parameter task.
126 *
127 * Returns 0, if access is granted; -Eerrno, otherwise.
128 */
129static inline int ds_validate_access(struct ds_context *context,
130 enum ds_qualifier qual)
131{
132 if (!context)
133 return -EPERM;
134
135 if (context->owner[qual] == current)
136 return 0;
137
138 return -EPERM;
139}
140 161
141 162
142/* 163/*
@@ -152,27 +173,32 @@ static inline int ds_validate_access(struct ds_context *context,
152 * >0 number of per-thread tracers 173 * >0 number of per-thread tracers
153 * <0 number of per-cpu tracers 174 * <0 number of per-cpu tracers
154 * 175 *
155 * The below functions to get and put tracers and to check the
156 * allocation type require the ds_lock to be held by the caller.
157 *
158 * Tracers essentially gives the number of ds contexts for a certain 176 * Tracers essentially gives the number of ds contexts for a certain
159 * type of allocation. 177 * type of allocation.
160 */ 178 */
161static long tracers; 179static atomic_t tracers = ATOMIC_INIT(0);
162 180
163static inline void get_tracer(struct task_struct *task) 181static inline void get_tracer(struct task_struct *task)
164{ 182{
165 tracers += (task ? 1 : -1); 183 if (task)
184 atomic_inc(&tracers);
185 else
186 atomic_dec(&tracers);
166} 187}
167 188
168static inline void put_tracer(struct task_struct *task) 189static inline void put_tracer(struct task_struct *task)
169{ 190{
170 tracers -= (task ? 1 : -1); 191 if (task)
192 atomic_dec(&tracers);
193 else
194 atomic_inc(&tracers);
171} 195}
172 196
173static inline int check_tracer(struct task_struct *task) 197static inline int check_tracer(struct task_struct *task)
174{ 198{
175 return (task ? (tracers >= 0) : (tracers <= 0)); 199 return task ?
200 (atomic_read(&tracers) >= 0) :
201 (atomic_read(&tracers) <= 0);
176} 202}
177 203
178 204
@@ -185,100 +211,83 @@ static inline int check_tracer(struct task_struct *task)
185 * 211 *
186 * Contexts are use-counted. They are allocated on first access and 212 * Contexts are use-counted. They are allocated on first access and
187 * deallocated when the last user puts the context. 213 * deallocated when the last user puts the context.
188 *
189 * We distinguish between an allocating and a non-allocating get of a
190 * context:
191 * - the allocating get is used for requesting BTS/PEBS resources. It
192 * requires the caller to hold the global ds_lock.
193 * - the non-allocating get is used for all other cases. A
194 * non-existing context indicates an error. It acquires and releases
195 * the ds_lock itself for obtaining the context.
196 *
197 * A context and its DS configuration are allocated and deallocated
198 * together. A context always has a DS configuration of the
199 * appropriate size.
200 */
201static DEFINE_PER_CPU(struct ds_context *, system_context);
202
203#define this_system_context per_cpu(system_context, smp_processor_id())
204
205/*
206 * Returns the pointer to the parameter task's context or to the
207 * system-wide context, if task is NULL.
208 *
209 * Increases the use count of the returned context, if not NULL.
210 */ 214 */
211static inline struct ds_context *ds_get_context(struct task_struct *task) 215struct ds_context {
212{ 216 /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
213 struct ds_context *context; 217 unsigned char ds[MAX_SIZEOF_DS];
218 /* the owner of the BTS and PEBS configuration, respectively */
219 struct bts_tracer *bts_master;
220 struct pebs_tracer *pebs_master;
221 /* use count */
222 unsigned long count;
223 /* a pointer to the context location inside the thread_struct
224 * or the per_cpu context array */
225 struct ds_context **this;
226 /* a pointer to the task owning this context, or NULL, if the
227 * context is owned by a cpu */
228 struct task_struct *task;
229};
214 230
215 spin_lock(&ds_lock); 231static DEFINE_PER_CPU(struct ds_context *, system_context_array);
216 232
217 context = (task ? task->thread.ds_ctx : this_system_context); 233#define system_context per_cpu(system_context_array, smp_processor_id())
218 if (context)
219 context->count++;
220 234
221 spin_unlock(&ds_lock);
222 235
223 return context; 236static inline struct ds_context *ds_get_context(struct task_struct *task)
224}
225
226/*
227 * Same as ds_get_context, but allocates the context and it's DS
228 * structure, if necessary; returns NULL; if out of memory.
229 *
230 * pre: requires ds_lock to be held
231 */
232static inline struct ds_context *ds_alloc_context(struct task_struct *task)
233{ 237{
234 struct ds_context **p_context = 238 struct ds_context **p_context =
235 (task ? &task->thread.ds_ctx : &this_system_context); 239 (task ? &task->thread.ds_ctx : &system_context);
236 struct ds_context *context = *p_context; 240 struct ds_context *context = NULL;
237 241 struct ds_context *new_context = NULL;
238 if (!context) { 242 unsigned long irq;
239 context = kzalloc(sizeof(*context), GFP_KERNEL); 243
240 244 /* Chances are small that we already have a context. */
241 if (!context) 245 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
242 return NULL; 246 if (!new_context)
247 return NULL;
243 248
244 context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); 249 spin_lock_irqsave(&ds_lock, irq);
245 if (!context->ds) {
246 kfree(context);
247 return NULL;
248 }
249 250
250 *p_context = context; 251 context = *p_context;
252 if (!context) {
253 context = new_context;
251 254
252 context->this = p_context; 255 context->this = p_context;
253 context->task = task; 256 context->task = task;
257 context->count = 0;
254 258
255 if (task) 259 if (task)
256 set_tsk_thread_flag(task, TIF_DS_AREA_MSR); 260 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
257 261
258 if (!task || (task == current)) 262 if (!task || (task == current))
259 wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0); 263 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
260 264
261 get_tracer(task); 265 *p_context = context;
262 } 266 }
263 267
264 context->count++; 268 context->count++;
265 269
270 spin_unlock_irqrestore(&ds_lock, irq);
271
272 if (context != new_context)
273 kfree(new_context);
274
266 return context; 275 return context;
267} 276}
268 277
269/*
270 * Decreases the use count of the parameter context, if not NULL.
271 * Deallocates the context, if the use count reaches zero.
272 */
273static inline void ds_put_context(struct ds_context *context) 278static inline void ds_put_context(struct ds_context *context)
274{ 279{
280 unsigned long irq;
281
275 if (!context) 282 if (!context)
276 return; 283 return;
277 284
278 spin_lock(&ds_lock); 285 spin_lock_irqsave(&ds_lock, irq);
279 286
280 if (--context->count) 287 if (--context->count) {
281 goto out; 288 spin_unlock_irqrestore(&ds_lock, irq);
289 return;
290 }
282 291
283 *(context->this) = NULL; 292 *(context->this) = NULL;
284 293
@@ -288,132 +297,263 @@ static inline void ds_put_context(struct ds_context *context)
288 if (!context->task || (context->task == current)) 297 if (!context->task || (context->task == current))
289 wrmsrl(MSR_IA32_DS_AREA, 0); 298 wrmsrl(MSR_IA32_DS_AREA, 0);
290 299
291 put_tracer(context->task); 300 spin_unlock_irqrestore(&ds_lock, irq);
292 301
293 /* free any leftover buffers from tracers that did not
294 * deallocate them properly. */
295 kfree(context->buffer[ds_bts]);
296 kfree(context->buffer[ds_pebs]);
297 kfree(context->ds);
298 kfree(context); 302 kfree(context);
299 out:
300 spin_unlock(&ds_lock);
301} 303}
302 304
303 305
304/* 306/*
305 * Handle a buffer overflow 307 * Call the tracer's callback on a buffer overflow.
306 * 308 *
307 * task: the task whose buffers are overflowing;
308 * NULL for a buffer overflow on the current cpu
309 * context: the ds context 309 * context: the ds context
310 * qual: the buffer type 310 * qual: the buffer type
311 */ 311 */
312static void ds_overflow(struct task_struct *task, struct ds_context *context, 312static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
313 enum ds_qualifier qual)
314{ 313{
315 if (!context) 314 switch (qual) {
316 return; 315 case ds_bts:
317 316 if (context->bts_master &&
318 if (context->callback[qual]) 317 context->bts_master->ovfl)
319 (*context->callback[qual])(task); 318 context->bts_master->ovfl(context->bts_master);
320 319 break;
321 /* todo: do some more overflow handling */ 320 case ds_pebs:
321 if (context->pebs_master &&
322 context->pebs_master->ovfl)
323 context->pebs_master->ovfl(context->pebs_master);
324 break;
325 }
322} 326}
323 327
324 328
325/* 329/*
326 * Allocate a non-pageable buffer of the parameter size. 330 * Write raw data into the BTS or PEBS buffer.
327 * Checks the memory and the locked memory rlimit.
328 * 331 *
329 * Returns the buffer, if successful; 332 * The remainder of any partially written record is zeroed out.
330 * NULL, if out of memory or rlimit exceeded.
331 * 333 *
332 * size: the requested buffer size in bytes 334 * context: the DS context
333 * pages (out): if not NULL, contains the number of pages reserved 335 * qual: the buffer type
336 * record: the data to write
337 * size: the size of the data
334 */ 338 */
335static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) 339static int ds_write(struct ds_context *context, enum ds_qualifier qual,
340 const void *record, size_t size)
336{ 341{
337 unsigned long rlim, vm, pgsz; 342 int bytes_written = 0;
338 void *buffer;
339 343
340 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; 344 if (!record)
345 return -EINVAL;
341 346
342 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; 347 while (size) {
343 vm = current->mm->total_vm + pgsz; 348 unsigned long base, index, end, write_end, int_th;
344 if (rlim < vm) 349 unsigned long write_size, adj_write_size;
345 return NULL;
346 350
347 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 351 /*
348 vm = current->mm->locked_vm + pgsz; 352 * write as much as possible without producing an
349 if (rlim < vm) 353 * overflow interrupt.
350 return NULL; 354 *
355 * interrupt_threshold must either be
356 * - bigger than absolute_maximum or
357 * - point to a record between buffer_base and absolute_maximum
358 *
359 * index points to a valid record.
360 */
361 base = ds_get(context->ds, qual, ds_buffer_base);
362 index = ds_get(context->ds, qual, ds_index);
363 end = ds_get(context->ds, qual, ds_absolute_maximum);
364 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
351 365
352 buffer = kzalloc(size, GFP_KERNEL); 366 write_end = min(end, int_th);
353 if (!buffer) 367
354 return NULL; 368 /* if we are already beyond the interrupt threshold,
369 * we fill the entire buffer */
370 if (write_end <= index)
371 write_end = end;
355 372
356 current->mm->total_vm += pgsz; 373 if (write_end <= index)
357 current->mm->locked_vm += pgsz; 374 break;
358 375
359 if (pages) 376 write_size = min((unsigned long) size, write_end - index);
360 *pages = pgsz; 377 memcpy((void *)index, record, write_size);
361 378
362 return buffer; 379 record = (const char *)record + write_size;
380 size -= write_size;
381 bytes_written += write_size;
382
383 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
384 adj_write_size *= ds_cfg.sizeof_rec[qual];
385
386 /* zero out trailing bytes */
387 memset((char *)index + write_size, 0,
388 adj_write_size - write_size);
389 index += adj_write_size;
390
391 if (index >= end)
392 index = base;
393 ds_set(context->ds, qual, ds_index, index);
394
395 if (index >= int_th)
396 ds_overflow(context, qual);
397 }
398
399 return bytes_written;
363} 400}
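ds_write() above treats the area between ds_index and min(ds_absolute_maximum, ds_interrupt_threshold) as a ring of fixed-size records: it copies as much as fits, zeroes out the remainder of a partially written record as the comment describes, wraps the index back to the buffer base, and fires the overflow callback once the threshold is crossed. A simplified userspace sketch of just the ring arithmetic, with thresholds and callbacks left out; the round-up here implements what the zero-padding is for:

/*
 * Simplified userspace sketch of the record-ring arithmetic in ds_write():
 * fixed-size records, zero-pad a partial trailing record, wrap the write
 * position back to the buffer base.
 */
#include <stdio.h>
#include <string.h>

#define REC_SIZE 8
#define N_RECS   4

static unsigned char ring[N_RECS * REC_SIZE];
static size_t write_pos;		/* byte offset of the next record */

static void ring_write(const void *data, size_t size)
{
	while (size) {
		size_t room  = sizeof(ring) - write_pos;
		size_t chunk = size < room ? size : room;
		size_t padded;

		memcpy(ring + write_pos, data, chunk);

		/* round up to a whole record and zero the remainder */
		padded = ((chunk + REC_SIZE - 1) / REC_SIZE) * REC_SIZE;
		memset(ring + write_pos + chunk, 0, padded - chunk);

		write_pos += padded;
		if (write_pos >= sizeof(ring))
			write_pos = 0;	/* wrap, like ds_index -> buffer_base */

		data  = (const char *)data + chunk;
		size -= chunk;
	}
}

int main(void)
{
	ring_write("abcdefghij", 10);	/* one full record, one padded one */
	printf("next record at offset %zu\n", write_pos);
	return 0;
}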
364 401
365static int ds_request(struct task_struct *task, void *base, size_t size, 402
366 ds_ovfl_callback_t ovfl, enum ds_qualifier qual) 403/*
404 * Branch Trace Store (BTS) uses the following format. Different
405 * architectures vary in the size of those fields.
406 * - source linear address
407 * - destination linear address
408 * - flags
409 *
410 * Later architectures use 64bit pointers throughout, whereas earlier
411 * architectures use 32bit pointers in 32bit mode.
412 *
413 * We compute the base address for the first 8 fields based on:
414 * - the field size stored in the DS configuration
415 * - the relative field position
416 *
417 * In order to store additional information in the BTS buffer, we use
418 * a special source address to indicate that the record requires
419 * special interpretation.
420 *
421 * Netburst indicated via a bit in the flags field whether the branch
422 * was predicted; this is ignored.
423 *
424 * We use two levels of abstraction:
425 * - the raw data level defined here
426 * - an arch-independent level defined in ds.h
427 */
428
429enum bts_field {
430 bts_from,
431 bts_to,
432 bts_flags,
433
434 bts_qual = bts_from,
435 bts_jiffies = bts_to,
436 bts_pid = bts_flags,
437
438 bts_qual_mask = (bts_qual_max - 1),
439 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
440};
441
442static inline unsigned long bts_get(const char *base, enum bts_field field)
367{ 443{
368 struct ds_context *context; 444 base += (ds_cfg.sizeof_field * field);
369 unsigned long buffer, adj; 445 return *(unsigned long *)base;
370 const unsigned long alignment = (1 << 3); 446}
371 int error = 0;
372 447
373 if (!ds_cfg.sizeof_ds) 448static inline void bts_set(char *base, enum bts_field field, unsigned long val)
374 return -EOPNOTSUPP; 449{
 450	base += (ds_cfg.sizeof_field * field);
451 (*(unsigned long *)base) = val;
452}
375 453
376 /* we require some space to do alignment adjustments below */ 454
377 if (size < (alignment + ds_cfg.sizeof_rec[qual])) 455/*
456 * The raw BTS data is architecture dependent.
457 *
458 * For higher-level users, we give an arch-independent view.
459 * - ds.h defines struct bts_struct
460 * - bts_read translates one raw bts record into a bts_struct
461 * - bts_write translates one bts_struct into the raw format and
462 * writes it into the top of the parameter tracer's buffer.
463 *
464 * return: bytes read/written on success; -Eerrno, otherwise
465 */
466static int bts_read(struct bts_tracer *tracer, const void *at,
467 struct bts_struct *out)
468{
469 if (!tracer)
378 return -EINVAL; 470 return -EINVAL;
379 471
380 /* buffer overflow notification is not yet implemented */ 472 if (at < tracer->trace.ds.begin)
381 if (ovfl) 473 return -EINVAL;
382 return -EOPNOTSUPP;
383 474
475 if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
476 return -EINVAL;
384 477
385 spin_lock(&ds_lock); 478 memset(out, 0, sizeof(*out));
479 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
480 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
481 out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
482 out->variant.timestamp.pid = bts_get(at, bts_pid);
483 } else {
484 out->qualifier = bts_branch;
485 out->variant.lbr.from = bts_get(at, bts_from);
486 out->variant.lbr.to = bts_get(at, bts_to);
487
488 if (!out->variant.lbr.from && !out->variant.lbr.to)
489 out->qualifier = bts_invalid;
490 }
386 491
387 if (!check_tracer(task)) 492 return ds_cfg.sizeof_rec[ds_bts];
388 return -EPERM; 493}
389 494
390 error = -ENOMEM; 495static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
391 context = ds_alloc_context(task); 496{
392 if (!context) 497 unsigned char raw[MAX_SIZEOF_BTS];
393 goto out_unlock;
394 498
395 error = -EALREADY; 499 if (!tracer)
396 if (context->owner[qual] == current) 500 return -EINVAL;
397 goto out_unlock;
398 error = -EPERM;
399 if (context->owner[qual] != NULL)
400 goto out_unlock;
401 context->owner[qual] = current;
402 501
403 spin_unlock(&ds_lock); 502 if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
503 return -EOVERFLOW;
404 504
505 switch (in->qualifier) {
506 case bts_invalid:
507 bts_set(raw, bts_from, 0);
508 bts_set(raw, bts_to, 0);
509 bts_set(raw, bts_flags, 0);
510 break;
511 case bts_branch:
512 bts_set(raw, bts_from, in->variant.lbr.from);
513 bts_set(raw, bts_to, in->variant.lbr.to);
514 bts_set(raw, bts_flags, 0);
515 break;
516 case bts_task_arrives:
517 case bts_task_departs:
518 bts_set(raw, bts_qual, (bts_escape | in->qualifier));
519 bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
520 bts_set(raw, bts_pid, in->variant.timestamp.pid);
521 break;
522 default:
523 return -EINVAL;
524 }
405 525
406 error = -ENOMEM; 526 return ds_write(tracer->ds.context, ds_bts, raw,
407 if (!base) { 527 ds_cfg.sizeof_rec[ds_bts]);
408 base = ds_allocate_buffer(size, &context->pages[qual]); 528}
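bts_read() and bts_write() above distinguish ordinary branch records from "escape" records (timestamps and pid markers) by packing a small qualifier into the low bits of the from field and setting every bit above it. A standalone sketch of that encoding with the field size fixed at 8 bytes; FLD_*, QUAL_MAX and ESCAPE are local to the example, not taken from ds.h:

/*
 * Standalone sketch of the escape-record encoding handled by bts_read()
 * and bts_write() above, with an 8-byte field size.
 */
#include <stdio.h>

enum { FLD_FROM, FLD_TO, FLD_FLAGS, N_FLDS };

#define QUAL_MAX	4UL
#define QUAL_MASK	(QUAL_MAX - 1)
#define ESCAPE		(~QUAL_MASK)	/* every bit above the qualifier */

int main(void)
{
	unsigned long rec[N_FLDS];

	/* encode a timestamp-style record: escape marker plus qualifier in
	 * the "from" slot, timestamp and pid reusing the other two slots */
	rec[FLD_FROM]  = ESCAPE | 2UL;
	rec[FLD_TO]    = 123456UL;	/* jiffies-like timestamp */
	rec[FLD_FLAGS] = 4711UL;	/* pid */

	/* decode: the high bits tell escape records and branches apart */
	if ((rec[FLD_FROM] & ~QUAL_MASK) == ESCAPE)
		printf("escape record: qual=%lu ts=%lu pid=%lu\n",
		       rec[FLD_FROM] & QUAL_MASK, rec[FLD_TO], rec[FLD_FLAGS]);
	else
		printf("branch: from=%#lx to=%#lx\n",
		       rec[FLD_FROM], rec[FLD_TO]);

	return 0;
}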
409 if (!base)
410 goto out_release;
411 529
412 context->buffer[qual] = base;
413 }
414 error = 0;
415 530
416 context->callback[qual] = ovfl; 531static void ds_write_config(struct ds_context *context,
532 struct ds_trace *cfg, enum ds_qualifier qual)
533{
534 unsigned char *ds = context->ds;
535
536 ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
537 ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
538 ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
539 ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
540}
541
542static void ds_read_config(struct ds_context *context,
543 struct ds_trace *cfg, enum ds_qualifier qual)
544{
545 unsigned char *ds = context->ds;
546
547 cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
548 cfg->top = (void *)ds_get(ds, qual, ds_index);
549 cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
550 cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
551}
552
553static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
554 void *base, size_t size, size_t ith,
555 unsigned int flags) {
556 unsigned long buffer, adj;
417 557
418 /* adjust the buffer address and size to meet alignment 558 /* adjust the buffer address and size to meet alignment
419 * constraints: 559 * constraints:
@@ -425,395 +565,383 @@ static int ds_request(struct task_struct *task, void *base, size_t size,
425 */ 565 */
426 buffer = (unsigned long)base; 566 buffer = (unsigned long)base;
427 567
428 adj = ALIGN(buffer, alignment) - buffer; 568 adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
429 buffer += adj; 569 buffer += adj;
430 size -= adj; 570 size -= adj;
431 571
432 size /= ds_cfg.sizeof_rec[qual]; 572 trace->n = size / ds_cfg.sizeof_rec[qual];
433 size *= ds_cfg.sizeof_rec[qual]; 573 trace->size = ds_cfg.sizeof_rec[qual];
434
435 ds_set(context->ds, qual, ds_buffer_base, buffer);
436 ds_set(context->ds, qual, ds_index, buffer);
437 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
438 574
439 if (ovfl) { 575 size = (trace->n * trace->size);
440 /* todo: select a suitable interrupt threshold */
441 } else
442 ds_set(context->ds, qual,
443 ds_interrupt_threshold, buffer + size + 1);
444 576
445 /* we keep the context until ds_release */ 577 trace->begin = (void *)buffer;
446 return error; 578 trace->top = trace->begin;
447 579 trace->end = (void *)(buffer + size);
448 out_release: 580 /* The value for 'no threshold' is -1, which will set the
449 context->owner[qual] = NULL; 581 * threshold outside of the buffer, just like we want it.
450 ds_put_context(context); 582 */
451 return error; 583 trace->ith = (void *)(buffer + size - ith);
452
453 out_unlock:
454 spin_unlock(&ds_lock);
455 ds_put_context(context);
456 return error;
457}
458 584
459int ds_request_bts(struct task_struct *task, void *base, size_t size, 585 trace->flags = flags;
460 ds_ovfl_callback_t ovfl)
461{
462 return ds_request(task, base, size, ovfl, ds_bts);
463} 586}
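ds_init_ds_trace() aligns the caller's buffer, truncates it to a whole number of records, and places the interrupt threshold ith bytes before the end, with (size_t)-1 landing one byte past the buffer so it can never trigger. A worked numeric example of that layout, assuming DS_ALIGNMENT is 8 and 24-byte BTS records (sizeof(long) * 3 on 64-bit); the input values are arbitrary:

/*
 * Worked example of the buffer layout computed by ds_init_ds_trace().
 */
#include <stdio.h>

#define ALIGNMENT	8UL
#define REC_SIZE	24UL

int main(void)
{
	unsigned long base      = 0x1004;	/* caller's, not yet aligned */
	unsigned long size      = 1000;
	unsigned long ith_bytes = (unsigned long)-1;	/* "no threshold" */

	/* ALIGN(buffer, DS_ALIGNMENT) - buffer */
	unsigned long adj = ((base + ALIGNMENT - 1) & ~(ALIGNMENT - 1)) - base;
	unsigned long begin, end, ith, n;

	base += adj;			/* 0x1008 */
	size -= adj;			/* 996    */

	n    = size / REC_SIZE;		/* 41 whole records */
	size = n * REC_SIZE;		/* 984 usable bytes */

	begin = base;
	end   = base + size;
	ith   = base + size - ith_bytes;	/* end + 1: never reached */

	printf("begin=%#lx end=%#lx records=%lu\n", begin, end, n);
	printf("threshold sits %s the buffer\n",
	       (begin <= ith && ith < end) ? "inside" : "outside");
	return 0;
}

A real threshold of, say, two records would instead be passed as 48 bytes and land 48 bytes before end.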
464 587
465int ds_request_pebs(struct task_struct *task, void *base, size_t size,
466 ds_ovfl_callback_t ovfl)
467{
468 return ds_request(task, base, size, ovfl, ds_pebs);
469}
470 588
471static int ds_release(struct task_struct *task, enum ds_qualifier qual) 589static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
590 enum ds_qualifier qual, struct task_struct *task,
591 void *base, size_t size, size_t th, unsigned int flags)
472{ 592{
473 struct ds_context *context; 593 struct ds_context *context;
474 int error; 594 int error;
475 595
476 context = ds_get_context(task); 596 error = -EINVAL;
477 error = ds_validate_access(context, qual); 597 if (!base)
478 if (error < 0)
479 goto out; 598 goto out;
480 599
481 kfree(context->buffer[qual]); 600 /* we require some space to do alignment adjustments below */
482 context->buffer[qual] = NULL; 601 error = -EINVAL;
483 602 if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
484 current->mm->total_vm -= context->pages[qual]; 603 goto out;
485 current->mm->locked_vm -= context->pages[qual];
486 context->pages[qual] = 0;
487 context->owner[qual] = NULL;
488
489 /*
490 * we put the context twice:
491 * once for the ds_get_context
492 * once for the corresponding ds_request
493 */
494 ds_put_context(context);
495 out:
496 ds_put_context(context);
497 return error;
498}
499 604
500int ds_release_bts(struct task_struct *task) 605 if (th != (size_t)-1) {
501{ 606 th *= ds_cfg.sizeof_rec[qual];
502 return ds_release(task, ds_bts);
503}
504 607
505int ds_release_pebs(struct task_struct *task) 608 error = -EINVAL;
506{ 609 if (size <= th)
507 return ds_release(task, ds_pebs); 610 goto out;
508} 611 }
509 612
510static int ds_get_index(struct task_struct *task, size_t *pos, 613 tracer->buffer = base;
511 enum ds_qualifier qual) 614 tracer->size = size;
512{
513 struct ds_context *context;
514 unsigned long base, index;
515 int error;
516 615
616 error = -ENOMEM;
517 context = ds_get_context(task); 617 context = ds_get_context(task);
518 error = ds_validate_access(context, qual); 618 if (!context)
519 if (error < 0)
520 goto out; 619 goto out;
620 tracer->context = context;
521 621
522 base = ds_get(context->ds, qual, ds_buffer_base); 622 ds_init_ds_trace(trace, qual, base, size, th, flags);
523 index = ds_get(context->ds, qual, ds_index);
524 623
525 error = ((index - base) / ds_cfg.sizeof_rec[qual]); 624 error = 0;
526 if (pos)
527 *pos = error;
528 out: 625 out:
529 ds_put_context(context);
530 return error; 626 return error;
531} 627}
532 628
533int ds_get_bts_index(struct task_struct *task, size_t *pos) 629struct bts_tracer *ds_request_bts(struct task_struct *task,
630 void *base, size_t size,
631 bts_ovfl_callback_t ovfl, size_t th,
632 unsigned int flags)
534{ 633{
535 return ds_get_index(task, pos, ds_bts); 634 struct bts_tracer *tracer;
536} 635 unsigned long irq;
537
538int ds_get_pebs_index(struct task_struct *task, size_t *pos)
539{
540 return ds_get_index(task, pos, ds_pebs);
541}
542
543static int ds_get_end(struct task_struct *task, size_t *pos,
544 enum ds_qualifier qual)
545{
546 struct ds_context *context;
547 unsigned long base, end;
548 int error; 636 int error;
549 637
550 context = ds_get_context(task); 638 error = -EOPNOTSUPP;
551 error = ds_validate_access(context, qual); 639 if (!ds_cfg.ctl[dsf_bts])
552 if (error < 0)
553 goto out; 640 goto out;
554 641
555 base = ds_get(context->ds, qual, ds_buffer_base); 642 /* buffer overflow notification is not yet implemented */
556 end = ds_get(context->ds, qual, ds_absolute_maximum); 643 error = -EOPNOTSUPP;
644 if (ovfl)
645 goto out;
557 646
558 error = ((end - base) / ds_cfg.sizeof_rec[qual]); 647 error = -ENOMEM;
559 if (pos) 648 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
560 *pos = error; 649 if (!tracer)
561 out: 650 goto out;
562 ds_put_context(context); 651 tracer->ovfl = ovfl;
563 return error;
564}
565 652
566int ds_get_bts_end(struct task_struct *task, size_t *pos) 653 error = ds_request(&tracer->ds, &tracer->trace.ds,
567{ 654 ds_bts, task, base, size, th, flags);
568 return ds_get_end(task, pos, ds_bts); 655 if (error < 0)
569} 656 goto out_tracer;
570 657
571int ds_get_pebs_end(struct task_struct *task, size_t *pos)
572{
573 return ds_get_end(task, pos, ds_pebs);
574}
575 658
576static int ds_access(struct task_struct *task, size_t index, 659 spin_lock_irqsave(&ds_lock, irq);
577 const void **record, enum ds_qualifier qual)
578{
579 struct ds_context *context;
580 unsigned long base, idx;
581 int error;
582 660
583 if (!record) 661 error = -EPERM;
584 return -EINVAL; 662 if (!check_tracer(task))
663 goto out_unlock;
664 get_tracer(task);
585 665
586 context = ds_get_context(task); 666 error = -EPERM;
587 error = ds_validate_access(context, qual); 667 if (tracer->ds.context->bts_master)
588 if (error < 0) 668 goto out_put_tracer;
589 goto out; 669 tracer->ds.context->bts_master = tracer;
590 670
591 base = ds_get(context->ds, qual, ds_buffer_base); 671 spin_unlock_irqrestore(&ds_lock, irq);
592 idx = base + (index * ds_cfg.sizeof_rec[qual]);
593 672
594 error = -EINVAL;
595 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
596 goto out;
597 673
598 *record = (const void *)idx; 674 tracer->trace.read = bts_read;
599 error = ds_cfg.sizeof_rec[qual]; 675 tracer->trace.write = bts_write;
600 out:
601 ds_put_context(context);
602 return error;
603}
604 676
605int ds_access_bts(struct task_struct *task, size_t index, const void **record) 677 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
606{ 678 ds_resume_bts(tracer);
607 return ds_access(task, index, record, ds_bts);
608}
609 679
610int ds_access_pebs(struct task_struct *task, size_t index, const void **record) 680 return tracer;
611{ 681
612 return ds_access(task, index, record, ds_pebs); 682 out_put_tracer:
683 put_tracer(task);
684 out_unlock:
685 spin_unlock_irqrestore(&ds_lock, irq);
686 ds_put_context(tracer->ds.context);
687 out_tracer:
688 kfree(tracer);
689 out:
690 return ERR_PTR(error);
613} 691}
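With this patch the BTS interface is driven through tracer objects rather than per-task calls. A hedged usage sketch of how a kernel caller might use it; start_self_tracing(), stop_self_tracing() and the PAGE_SIZE buffer are hypothetical, and only ds_request_bts(), ds_read_bts(), ds_release_bts() and the BTS_KERNEL/BTS_USER flags come from the patch itself:

/*
 * Hedged usage sketch for the tracer API introduced by this patch
 * (kernel context assumed, error handling trimmed).
 */
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <asm/ds.h>

static struct bts_tracer *tracer;
static void *bts_buffer;

static int start_self_tracing(void)
{
	bts_buffer = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!bts_buffer)
		return -ENOMEM;

	/* trace current, no overflow callback, no interrupt threshold */
	tracer = ds_request_bts(current, bts_buffer, PAGE_SIZE,
				NULL, (size_t)-1, BTS_KERNEL | BTS_USER);
	if (IS_ERR(tracer)) {
		kfree(bts_buffer);
		return PTR_ERR(tracer);
	}

	return 0;
}

static void stop_self_tracing(void)
{
	const struct bts_trace *trace = ds_read_bts(tracer);

	/* trace->ds.begin .. trace->ds.top holds the raw BTS records */
	(void)trace;

	ds_release_bts(tracer);
	kfree(bts_buffer);
}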
614 692
615static int ds_write(struct task_struct *task, const void *record, size_t size, 693struct pebs_tracer *ds_request_pebs(struct task_struct *task,
616 enum ds_qualifier qual, int force) 694 void *base, size_t size,
695 pebs_ovfl_callback_t ovfl, size_t th,
696 unsigned int flags)
617{ 697{
618 struct ds_context *context; 698 struct pebs_tracer *tracer;
699 unsigned long irq;
619 int error; 700 int error;
620 701
621 if (!record) 702 /* buffer overflow notification is not yet implemented */
622 return -EINVAL; 703 error = -EOPNOTSUPP;
704 if (ovfl)
705 goto out;
623 706
624 error = -EPERM; 707 error = -ENOMEM;
625 context = ds_get_context(task); 708 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
626 if (!context) 709 if (!tracer)
627 goto out; 710 goto out;
711 tracer->ovfl = ovfl;
628 712
629 if (!force) { 713 error = ds_request(&tracer->ds, &tracer->trace.ds,
630 error = ds_validate_access(context, qual); 714 ds_pebs, task, base, size, th, flags);
631 if (error < 0) 715 if (error < 0)
632 goto out; 716 goto out_tracer;
633 }
634 717
635 error = 0; 718 spin_lock_irqsave(&ds_lock, irq);
636 while (size) {
637 unsigned long base, index, end, write_end, int_th;
638 unsigned long write_size, adj_write_size;
639 719
640 /* 720 error = -EPERM;
641 * write as much as possible without producing an 721 if (!check_tracer(task))
642 * overflow interrupt. 722 goto out_unlock;
643 * 723 get_tracer(task);
644 * interrupt_threshold must either be
645 * - bigger than absolute_maximum or
646 * - point to a record between buffer_base and absolute_maximum
647 *
648 * index points to a valid record.
649 */
650 base = ds_get(context->ds, qual, ds_buffer_base);
651 index = ds_get(context->ds, qual, ds_index);
652 end = ds_get(context->ds, qual, ds_absolute_maximum);
653 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
654 724
655 write_end = min(end, int_th); 725 error = -EPERM;
726 if (tracer->ds.context->pebs_master)
727 goto out_put_tracer;
728 tracer->ds.context->pebs_master = tracer;
656 729
657 /* if we are already beyond the interrupt threshold, 730 spin_unlock_irqrestore(&ds_lock, irq);
658 * we fill the entire buffer */
659 if (write_end <= index)
660 write_end = end;
661 731
 662 if (write_end <= index) 732 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
663 goto out; 733 ds_resume_pebs(tracer);
664 734
665 write_size = min((unsigned long) size, write_end - index); 735 return tracer;
666 memcpy((void *)index, record, write_size);
667 736
668 record = (const char *)record + write_size; 737 out_put_tracer:
669 size -= write_size; 738 put_tracer(task);
670 error += write_size; 739 out_unlock:
740 spin_unlock_irqrestore(&ds_lock, irq);
741 ds_put_context(tracer->ds.context);
742 out_tracer:
743 kfree(tracer);
744 out:
745 return ERR_PTR(error);
746}
671 747
672 adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; 748void ds_release_bts(struct bts_tracer *tracer)
673 adj_write_size *= ds_cfg.sizeof_rec[qual]; 749{
750 if (!tracer)
751 return;
674 752
675 /* zero out trailing bytes */ 753 ds_suspend_bts(tracer);
676 memset((char *)index + write_size, 0,
677 adj_write_size - write_size);
678 index += adj_write_size;
679 754
680 if (index >= end) 755 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
681 index = base; 756 tracer->ds.context->bts_master = NULL;
682 ds_set(context->ds, qual, ds_index, index);
683 757
684 if (index >= int_th) 758 put_tracer(tracer->ds.context->task);
685 ds_overflow(task, context, qual); 759 ds_put_context(tracer->ds.context);
686 }
687 760
688 out: 761 kfree(tracer);
689 ds_put_context(context);
690 return error;
691} 762}
692 763
693int ds_write_bts(struct task_struct *task, const void *record, size_t size) 764void ds_suspend_bts(struct bts_tracer *tracer)
694{ 765{
695 return ds_write(task, record, size, ds_bts, /* force = */ 0); 766 struct task_struct *task;
696}
697 767
698int ds_write_pebs(struct task_struct *task, const void *record, size_t size) 768 if (!tracer)
699{ 769 return;
700 return ds_write(task, record, size, ds_pebs, /* force = */ 0);
701}
702 770
703int ds_unchecked_write_bts(struct task_struct *task, 771 task = tracer->ds.context->task;
704 const void *record, size_t size)
705{
706 return ds_write(task, record, size, ds_bts, /* force = */ 1);
707}
708 772
709int ds_unchecked_write_pebs(struct task_struct *task, 773 if (!task || (task == current))
710 const void *record, size_t size) 774 update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
711{ 775
712 return ds_write(task, record, size, ds_pebs, /* force = */ 1); 776 if (task) {
777 task->thread.debugctlmsr &= ~BTS_CONTROL;
778
779 if (!task->thread.debugctlmsr)
780 clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
781 }
713} 782}
714 783
715static int ds_reset_or_clear(struct task_struct *task, 784void ds_resume_bts(struct bts_tracer *tracer)
716 enum ds_qualifier qual, int clear)
717{ 785{
718 struct ds_context *context; 786 struct task_struct *task;
719 unsigned long base, end; 787 unsigned long control;
720 int error;
721 788
722 context = ds_get_context(task); 789 if (!tracer)
723 error = ds_validate_access(context, qual); 790 return;
724 if (error < 0)
725 goto out;
726 791
727 base = ds_get(context->ds, qual, ds_buffer_base); 792 task = tracer->ds.context->task;
728 end = ds_get(context->ds, qual, ds_absolute_maximum);
729 793
730 if (clear) 794 control = ds_cfg.ctl[dsf_bts];
731 memset((void *)base, 0, end - base); 795 if (!(tracer->trace.ds.flags & BTS_KERNEL))
796 control |= ds_cfg.ctl[dsf_bts_kernel];
797 if (!(tracer->trace.ds.flags & BTS_USER))
798 control |= ds_cfg.ctl[dsf_bts_user];
732 799
733 ds_set(context->ds, qual, ds_index, base); 800 if (task) {
801 task->thread.debugctlmsr |= control;
802 set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
803 }
734 804
735 error = 0; 805 if (!task || (task == current))
736 out: 806 update_debugctlmsr(get_debugctlmsr() | control);
737 ds_put_context(context);
738 return error;
739} 807}
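ds_resume_bts() builds the DEBUGCTL value from the per-family enable bits plus a suppression bit for whichever of kernel and user tracing was not requested. A worked example using the Core 2 values from ds_cfg_core2_atom above; the numeric BTS_KERNEL/BTS_USER flag values here are placeholders, not the definitions from ds.h:

/*
 * Worked example of the DEBUGCTL bits chosen by ds_resume_bts() for a
 * Core 2 style part.
 */
#include <stdio.h>

#define CTL_BTS		((1u << 6) | (1u << 7))	/* TR + BTS enable      */
#define CTL_NO_KERNEL	(1u << 9)		/* drop kernel branches */
#define CTL_NO_USER	(1u << 10)		/* drop user branches   */

#define BTS_KERNEL	0x1u	/* placeholder flag values */
#define BTS_USER	0x2u

int main(void)
{
	unsigned int flags   = BTS_USER;	/* user-space branches only */
	unsigned int control = CTL_BTS;

	if (!(flags & BTS_KERNEL))
		control |= CTL_NO_KERNEL;
	if (!(flags & BTS_USER))
		control |= CTL_NO_USER;

	printf("debugctl |= %#x\n", control);	/* 0x2c0: bits 6, 7 and 9 */
	return 0;
}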
740 808
741int ds_reset_bts(struct task_struct *task) 809void ds_release_pebs(struct pebs_tracer *tracer)
742{ 810{
743 return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); 811 if (!tracer)
812 return;
813
814 ds_suspend_pebs(tracer);
815
816 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
817 tracer->ds.context->pebs_master = NULL;
818
819 put_tracer(tracer->ds.context->task);
820 ds_put_context(tracer->ds.context);
821
822 kfree(tracer);
744} 823}
745 824
746int ds_reset_pebs(struct task_struct *task) 825void ds_suspend_pebs(struct pebs_tracer *tracer)
747{ 826{
748 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); 827
749} 828}
750 829
751int ds_clear_bts(struct task_struct *task) 830void ds_resume_pebs(struct pebs_tracer *tracer)
752{ 831{
753 return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); 832
754} 833}
755 834
756int ds_clear_pebs(struct task_struct *task) 835const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
757{ 836{
758 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); 837 if (!tracer)
838 return NULL;
839
840 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
841 return &tracer->trace;
759} 842}
760 843
761int ds_get_pebs_reset(struct task_struct *task, u64 *value) 844const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
762{ 845{
763 struct ds_context *context; 846 if (!tracer)
764 int error; 847 return NULL;
848
849 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
850 tracer->trace.reset_value =
851 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
765 852
766 if (!value) 853 return &tracer->trace;
854}
855
856int ds_reset_bts(struct bts_tracer *tracer)
857{
858 if (!tracer)
767 return -EINVAL; 859 return -EINVAL;
768 860
769 context = ds_get_context(task); 861 tracer->trace.ds.top = tracer->trace.ds.begin;
770 error = ds_validate_access(context, ds_pebs);
771 if (error < 0)
772 goto out;
773 862
774 *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); 863 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
864 (unsigned long)tracer->trace.ds.top);
775 865
776 error = 0; 866 return 0;
777 out:
778 ds_put_context(context);
779 return error;
780} 867}
781 868
782int ds_set_pebs_reset(struct task_struct *task, u64 value) 869int ds_reset_pebs(struct pebs_tracer *tracer)
783{ 870{
784 struct ds_context *context; 871 if (!tracer)
785 int error; 872 return -EINVAL;
786 873
787 context = ds_get_context(task); 874 tracer->trace.ds.top = tracer->trace.ds.begin;
788 error = ds_validate_access(context, ds_pebs);
789 if (error < 0)
790 goto out;
791 875
 792 *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; 876 ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
877 (unsigned long)tracer->trace.ds.top);
793 878
794 error = 0; 879 return 0;
795 out: 880}
796 ds_put_context(context); 881
797 return error; 882int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
883{
884 if (!tracer)
885 return -EINVAL;
886
887 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
888
889 return 0;
798} 890}
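ds_read_pebs() and ds_set_pebs_reset() reach the PEBS counter reset value at field index 8 of the DS save area, right after the four BTS and four PEBS management pointers; ds_configure() above warns if the area cannot hold 12 fields. A small layout sketch assuming 8-byte fields:

/*
 * Layout sketch of the DS save area offsets assumed by ds_read_pebs()
 * and ds_set_pebs_reset(), with an 8-byte field size.
 */
#include <stdio.h>

int main(void)
{
	static const char *const name[] = {
		"bts buffer base",  "bts index",
		"bts abs maximum",  "bts interrupt threshold",
		"pebs buffer base", "pebs index",
		"pebs abs maximum", "pebs interrupt threshold",
		"pebs counter reset",
	};
	const unsigned int sizeof_field = 8;
	unsigned int f;

	for (f = 0; f < sizeof(name) / sizeof(name[0]); f++)
		printf("field %u at offset %2u: %s\n",
		       f, f * sizeof_field, name[f]);
	return 0;
}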
799 891
800static const struct ds_configuration ds_cfg_var = { 892static const struct ds_configuration ds_cfg_netburst = {
801 .sizeof_ds = sizeof(long) * 12, 893 .name = "Netburst",
802 .sizeof_field = sizeof(long), 894 .ctl[dsf_bts] = (1 << 2) | (1 << 3),
803 .sizeof_rec[ds_bts] = sizeof(long) * 3, 895 .ctl[dsf_bts_kernel] = (1 << 5),
804 .sizeof_rec[ds_pebs] = sizeof(long) * 10 896 .ctl[dsf_bts_user] = (1 << 6),
897
898 .sizeof_field = sizeof(long),
899 .sizeof_rec[ds_bts] = sizeof(long) * 3,
900#ifdef __i386__
901 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
902#else
903 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
904#endif
905};
906static const struct ds_configuration ds_cfg_pentium_m = {
907 .name = "Pentium M",
908 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
909
910 .sizeof_field = sizeof(long),
911 .sizeof_rec[ds_bts] = sizeof(long) * 3,
912#ifdef __i386__
913 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
914#else
915 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
916#endif
805}; 917};
806static const struct ds_configuration ds_cfg_64 = { 918static const struct ds_configuration ds_cfg_core2_atom = {
807 .sizeof_ds = 8 * 12, 919 .name = "Core 2/Atom",
808 .sizeof_field = 8, 920 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
809 .sizeof_rec[ds_bts] = 8 * 3, 921 .ctl[dsf_bts_kernel] = (1 << 9),
810 .sizeof_rec[ds_pebs] = 8 * 10 922 .ctl[dsf_bts_user] = (1 << 10),
923
924 .sizeof_field = 8,
925 .sizeof_rec[ds_bts] = 8 * 3,
926 .sizeof_rec[ds_pebs] = 8 * 18,
811}; 927};
812 928
813static inline void 929static void
814ds_configure(const struct ds_configuration *cfg) 930ds_configure(const struct ds_configuration *cfg)
815{ 931{
932 memset(&ds_cfg, 0, sizeof(ds_cfg));
816 ds_cfg = *cfg; 933 ds_cfg = *cfg;
934
935 printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
936
937 if (!cpu_has_bts) {
938 ds_cfg.ctl[dsf_bts] = 0;
939 printk(KERN_INFO "[ds] bts not available\n");
940 }
941 if (!cpu_has_pebs)
942 printk(KERN_INFO "[ds] pebs not available\n");
943
944 WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
817} 945}
818 946
819void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) 947void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -821,25 +949,27 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
821 switch (c->x86) { 949 switch (c->x86) {
822 case 0x6: 950 case 0x6:
823 switch (c->x86_model) { 951 switch (c->x86_model) {
824 case 0xD: 952 case 0x9:
825 case 0xE: /* Pentium M */ 953 case 0xd: /* Pentium M */
826 ds_configure(&ds_cfg_var); 954 ds_configure(&ds_cfg_pentium_m);
827 break; 955 break;
828 case 0xF: /* Core2 */ 956 case 0xf:
829 case 0x1C: /* Atom */ 957 case 0x17: /* Core2 */
830 ds_configure(&ds_cfg_64); 958 case 0x1c: /* Atom */
959 ds_configure(&ds_cfg_core2_atom);
831 break; 960 break;
961 case 0x1a: /* i7 */
832 default: 962 default:
833 /* sorry, don't know about them */ 963 /* sorry, don't know about them */
834 break; 964 break;
835 } 965 }
836 break; 966 break;
837 case 0xF: 967 case 0xf:
838 switch (c->x86_model) { 968 switch (c->x86_model) {
839 case 0x0: 969 case 0x0:
840 case 0x1: 970 case 0x1:
841 case 0x2: /* Netburst */ 971 case 0x2: /* Netburst */
842 ds_configure(&ds_cfg_var); 972 ds_configure(&ds_cfg_netburst);
843 break; 973 break;
844 default: 974 default:
845 /* sorry, don't know about them */ 975 /* sorry, don't know about them */
@@ -852,13 +982,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
852 } 982 }
853} 983}
854 984
855void ds_free(struct ds_context *context) 985/*
986 * Change the DS configuration from tracing prev to tracing next.
987 */
988void ds_switch_to(struct task_struct *prev, struct task_struct *next)
989{
990 struct ds_context *prev_ctx = prev->thread.ds_ctx;
991 struct ds_context *next_ctx = next->thread.ds_ctx;
992
993 if (prev_ctx) {
994 update_debugctlmsr(0);
995
996 if (prev_ctx->bts_master &&
997 (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
998 struct bts_struct ts = {
999 .qualifier = bts_task_departs,
1000 .variant.timestamp.jiffies = jiffies_64,
1001 .variant.timestamp.pid = prev->pid
1002 };
1003 bts_write(prev_ctx->bts_master, &ts);
1004 }
1005 }
1006
1007 if (next_ctx) {
1008 if (next_ctx->bts_master &&
1009 (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
1010 struct bts_struct ts = {
1011 .qualifier = bts_task_arrives,
1012 .variant.timestamp.jiffies = jiffies_64,
1013 .variant.timestamp.pid = next->pid
1014 };
1015 bts_write(next_ctx->bts_master, &ts);
1016 }
1017
1018 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
1019 }
1020
1021 update_debugctlmsr(next->thread.debugctlmsr);
1022}
1023
1024void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
1025{
1026 clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
1027 tsk->thread.ds_ctx = NULL;
1028}
1029
1030void ds_exit_thread(struct task_struct *tsk)
856{ 1031{
857 /* This is called when the task owning the parameter context 1032 WARN_ON(tsk->thread.ds_ctx);
858 * is dying. There should not be any user of that context left
859 * to disturb us, anymore. */
860 unsigned long leftovers = context->count;
861 while (leftovers--)
862 ds_put_context(context);
863} 1033}
864#endif /* CONFIG_X86_DS */
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
new file mode 100644
index 000000000000..87d103ded1c3
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.c
@@ -0,0 +1,351 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16#include <linux/sysfs.h>
17
18#include <asm/stacktrace.h>
19
20#include "dumpstack.h"
21
22int panic_on_unrecovered_nmi;
23unsigned int code_bytes = 64;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static int die_counter;
26
27void printk_address(unsigned long address, int reliable)
28{
29 printk(" [<%p>] %s%pS\n", (void *) address,
30 reliable ? "" : "? ", (void *) address);
31}
32
33#ifdef CONFIG_FUNCTION_GRAPH_TRACER
34static void
35print_ftrace_graph_addr(unsigned long addr, void *data,
36 const struct stacktrace_ops *ops,
37 struct thread_info *tinfo, int *graph)
38{
39 struct task_struct *task = tinfo->task;
40 unsigned long ret_addr;
41 int index = task->curr_ret_stack;
42
43 if (addr != (unsigned long)return_to_handler)
44 return;
45
46 if (!task->ret_stack || index < *graph)
47 return;
48
49 index -= *graph;
50 ret_addr = task->ret_stack[index].ret;
51
52 ops->address(data, ret_addr, 1);
53
54 (*graph)++;
55}
56#else
57static inline void
58print_ftrace_graph_addr(unsigned long addr, void *data,
59 const struct stacktrace_ops *ops,
60 struct thread_info *tinfo, int *graph)
61{ }
62#endif
63
64/*
65 * x86-64 can have up to three kernel stacks:
66 * process stack
67 * interrupt stack
68 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
69 */
70
71static inline int valid_stack_ptr(struct thread_info *tinfo,
72 void *p, unsigned int size, void *end)
73{
74 void *t = tinfo;
75 if (end) {
76 if (p < end && p >= (end-THREAD_SIZE))
77 return 1;
78 else
79 return 0;
80 }
81 return p > t && p < t + THREAD_SIZE - size;
82}
83
84unsigned long
85print_context_stack(struct thread_info *tinfo,
86 unsigned long *stack, unsigned long bp,
87 const struct stacktrace_ops *ops, void *data,
88 unsigned long *end, int *graph)
89{
90 struct stack_frame *frame = (struct stack_frame *)bp;
91
92 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
93 unsigned long addr;
94
95 addr = *stack;
96 if (__kernel_text_address(addr)) {
97 if ((unsigned long) stack == bp + sizeof(long)) {
98 ops->address(data, addr, 1);
99 frame = frame->next_frame;
100 bp = (unsigned long) frame;
101 } else {
102 ops->address(data, addr, 0);
103 }
104 print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
105 }
106 stack++;
107 }
108 return bp;
109}
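print_context_stack() follows the frame-pointer chain: an address is reported as reliable only when it sits one long above the saved frame pointer, and the walk steps through struct stack_frame links. A standalone userspace sketch of the same walk reusing the two-field frame layout from dumpstack.h; it depends on the gcc builtin __builtin_frame_address() and on frame pointers being kept (-fno-omit-frame-pointer), and a depth cap replaces the valid_stack_ptr() bounds checks:

/*
 * Standalone sketch of the frame-pointer walk done by
 * print_context_stack().
 */
#include <stdio.h>

struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

static void backtrace_here(void)
{
	struct stack_frame *frame = __builtin_frame_address(0);
	int depth;

	for (depth = 0; frame && depth < 8; depth++) {
		printf("  return address %#lx\n", frame->return_address);
		frame = frame->next_frame;	/* follow the saved %rbp/%ebp */
	}
}

static __attribute__((noinline)) void level2(void) { backtrace_here(); }
static __attribute__((noinline)) void level1(void) { level2(); }

int main(void)
{
	level1();
	return 0;
}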
110
111
112static void
113print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
114{
115 printk(data);
116 print_symbol(msg, symbol);
117 printk("\n");
118}
119
120static void print_trace_warning(void *data, char *msg)
121{
122 printk("%s%s\n", (char *)data, msg);
123}
124
125static int print_trace_stack(void *data, char *name)
126{
127 printk("%s <%s> ", (char *)data, name);
128 return 0;
129}
130
131/*
132 * Print one address/symbol entries per line.
133 */
134static void print_trace_address(void *data, unsigned long addr, int reliable)
135{
136 touch_nmi_watchdog();
137 printk(data);
138 printk_address(addr, reliable);
139}
140
141static const struct stacktrace_ops print_trace_ops = {
142 .warning = print_trace_warning,
143 .warning_symbol = print_trace_warning_symbol,
144 .stack = print_trace_stack,
145 .address = print_trace_address,
146};
147
148void
149show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
150 unsigned long *stack, unsigned long bp, char *log_lvl)
151{
152 printk("%sCall Trace:\n", log_lvl);
153 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
154}
155
156void show_trace(struct task_struct *task, struct pt_regs *regs,
157 unsigned long *stack, unsigned long bp)
158{
159 show_trace_log_lvl(task, regs, stack, bp, "");
160}
161
162void show_stack(struct task_struct *task, unsigned long *sp)
163{
164 show_stack_log_lvl(task, NULL, sp, 0, "");
165}
166
167/*
168 * The architecture-independent dump_stack generator
169 */
170void dump_stack(void)
171{
172 unsigned long bp = 0;
173 unsigned long stack;
174
175#ifdef CONFIG_FRAME_POINTER
176 if (!bp)
177 get_bp(bp);
178#endif
179
180 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
181 current->pid, current->comm, print_tainted(),
182 init_utsname()->release,
183 (int)strcspn(init_utsname()->version, " "),
184 init_utsname()->version);
185 show_trace(NULL, NULL, &stack, bp);
186}
187EXPORT_SYMBOL(dump_stack);
188
189static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
190static int die_owner = -1;
191static unsigned int die_nest_count;
192
193unsigned __kprobes long oops_begin(void)
194{
195 int cpu;
196 unsigned long flags;
197
198 oops_enter();
199
200 /* racy, but better than risking deadlock. */
201 raw_local_irq_save(flags);
202 cpu = smp_processor_id();
203 if (!__raw_spin_trylock(&die_lock)) {
204 if (cpu == die_owner)
205 /* nested oops. should stop eventually */;
206 else
207 __raw_spin_lock(&die_lock);
208 }
209 die_nest_count++;
210 die_owner = cpu;
211 console_verbose();
212 bust_spinlocks(1);
213 return flags;
214}
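oops_begin() takes die_lock with a trylock and only spins when a different CPU owns it, so a nested oops on the same CPU cannot deadlock, while die_nest_count makes sure only the outermost oops_end() releases the lock. A deliberately racy userspace sketch of that nested-owner pattern (racy by design, as the comment above admits), with pthread_self() standing in for the CPU number:

/*
 * Userspace sketch of the nested-owner locking in oops_begin()/oops_end().
 */
#include <pthread.h>

static pthread_mutex_t die_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t die_owner;
static int die_owner_valid;
static unsigned int die_nest_count;

static void oops_begin_sketch(void)
{
	if (pthread_mutex_trylock(&die_lock) != 0) {
		if (die_owner_valid && pthread_equal(die_owner, pthread_self()))
			;	/* nested on the same thread: keep going */
		else
			pthread_mutex_lock(&die_lock);
	}
	die_nest_count++;
	die_owner = pthread_self();
	die_owner_valid = 1;
}

static void oops_end_sketch(void)
{
	die_owner_valid = 0;
	if (--die_nest_count == 0)
		pthread_mutex_unlock(&die_lock);
}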
215
216void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
217{
218 if (regs && kexec_should_crash(current))
219 crash_kexec(regs);
220
221 bust_spinlocks(0);
222 die_owner = -1;
223 add_taint(TAINT_DIE);
224 die_nest_count--;
225 if (!die_nest_count)
226 /* Nest count reaches zero, release the lock. */
227 __raw_spin_unlock(&die_lock);
228 raw_local_irq_restore(flags);
229 oops_exit();
230
231 if (!signr)
232 return;
233 if (in_interrupt())
234 panic("Fatal exception in interrupt");
235 if (panic_on_oops)
236 panic("Fatal exception");
237 do_exit(signr);
238}
239
240int __kprobes __die(const char *str, struct pt_regs *regs, long err)
241{
242#ifdef CONFIG_X86_32
243 unsigned short ss;
244 unsigned long sp;
245#endif
246 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
247#ifdef CONFIG_PREEMPT
248 printk("PREEMPT ");
249#endif
250#ifdef CONFIG_SMP
251 printk("SMP ");
252#endif
253#ifdef CONFIG_DEBUG_PAGEALLOC
254 printk("DEBUG_PAGEALLOC");
255#endif
256 printk("\n");
257 sysfs_printk_last_file();
258 if (notify_die(DIE_OOPS, str, regs, err,
259 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
260 return 1;
261
262 show_registers(regs);
263#ifdef CONFIG_X86_32
264 sp = (unsigned long) (&regs->sp);
265 savesegment(ss, ss);
266 if (user_mode(regs)) {
267 sp = regs->sp;
268 ss = regs->ss & 0xffff;
269 }
270 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
271 print_symbol("%s", regs->ip);
272 printk(" SS:ESP %04x:%08lx\n", ss, sp);
273#else
274 /* Executive summary in case the oops scrolled away */
275 printk(KERN_ALERT "RIP ");
276 printk_address(regs->ip, 1);
277 printk(" RSP <%016lx>\n", regs->sp);
278#endif
279 return 0;
280}
281
282/*
283 * This is gone through when something in the kernel has done something bad
284 * and is about to be terminated:
285 */
286void die(const char *str, struct pt_regs *regs, long err)
287{
288 unsigned long flags = oops_begin();
289 int sig = SIGSEGV;
290
291 if (!user_mode_vm(regs))
292 report_bug(regs->ip, regs);
293
294 if (__die(str, regs, err))
295 sig = 0;
296 oops_end(flags, regs, sig);
297}
298
299void notrace __kprobes
300die_nmi(char *str, struct pt_regs *regs, int do_panic)
301{
302 unsigned long flags;
303
304 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
305 return;
306
307 /*
308 * We are in trouble anyway, lets at least try
309 * to get a message out.
310 */
311 flags = oops_begin();
312 printk(KERN_EMERG "%s", str);
313 printk(" on CPU%d, ip %08lx, registers:\n",
314 smp_processor_id(), regs->ip);
315 show_registers(regs);
316 oops_end(flags, regs, 0);
317 if (do_panic || panic_on_oops)
318 panic("Non maskable interrupt");
319 nmi_exit();
320 local_irq_enable();
321 do_exit(SIGBUS);
322}
323
324static int __init oops_setup(char *s)
325{
326 if (!s)
327 return -EINVAL;
328 if (!strcmp(s, "panic"))
329 panic_on_oops = 1;
330 return 0;
331}
332early_param("oops", oops_setup);
333
334static int __init kstack_setup(char *s)
335{
336 if (!s)
337 return -EINVAL;
338 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
339 return 0;
340}
341early_param("kstack", kstack_setup);
342
343static int __init code_bytes_setup(char *s)
344{
345 code_bytes = simple_strtoul(s, NULL, 0);
346 if (code_bytes > 8192)
347 code_bytes = 8192;
348
349 return 1;
350}
351__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
new file mode 100644
index 000000000000..da87590b8698
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5
6#ifndef DUMPSTACK_H
7#define DUMPSTACK_H
8
9#ifdef CONFIG_X86_32
10#define STACKSLOTS_PER_LINE 8
11#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
12#else
13#define STACKSLOTS_PER_LINE 4
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif
16
17extern unsigned long
18print_context_stack(struct thread_info *tinfo,
19 unsigned long *stack, unsigned long bp,
20 const struct stacktrace_ops *ops, void *data,
21 unsigned long *end, int *graph);
22
23extern void
24show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
25 unsigned long *stack, unsigned long bp, char *log_lvl);
26
27extern void
28show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
29 unsigned long *sp, unsigned long bp, char *log_lvl);
30
31extern unsigned int code_bytes;
32extern int kstack_depth_to_print;
33
34/* The form of the top of the frame on the stack */
35struct stack_frame {
36 struct stack_frame *next_frame;
37 unsigned long return_address;
38};
39#endif
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index b3614752197b..d593cd1f58dc 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,69 +17,14 @@
17 17
18#include <asm/stacktrace.h> 18#include <asm/stacktrace.h>
19 19
20#define STACKSLOTS_PER_LINE 8 20#include "dumpstack.h"
21#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33
34static inline int valid_stack_ptr(struct thread_info *tinfo,
35 void *p, unsigned int size, void *end)
36{
37 void *t = tinfo;
38 if (end) {
39 if (p < end && p >= (end-THREAD_SIZE))
40 return 1;
41 else
42 return 0;
43 }
44 return p > t && p < t + THREAD_SIZE - size;
45}
46
47/* The form of the top of the frame on the stack */
48struct stack_frame {
49 struct stack_frame *next_frame;
50 unsigned long return_address;
51};
52
53static inline unsigned long
54print_context_stack(struct thread_info *tinfo,
55 unsigned long *stack, unsigned long bp,
56 const struct stacktrace_ops *ops, void *data,
57 unsigned long *end)
58{
59 struct stack_frame *frame = (struct stack_frame *)bp;
60
61 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
62 unsigned long addr;
63
64 addr = *stack;
65 if (__kernel_text_address(addr)) {
66 if ((unsigned long) stack == bp + sizeof(long)) {
67 ops->address(data, addr, 1);
68 frame = frame->next_frame;
69 bp = (unsigned long) frame;
70 } else {
71 ops->address(data, addr, bp == 0);
72 }
73 }
74 stack++;
75 }
76 return bp;
77}
78 21
79void dump_trace(struct task_struct *task, struct pt_regs *regs, 22void dump_trace(struct task_struct *task, struct pt_regs *regs,
80 unsigned long *stack, unsigned long bp, 23 unsigned long *stack, unsigned long bp,
81 const struct stacktrace_ops *ops, void *data) 24 const struct stacktrace_ops *ops, void *data)
82{ 25{
26 int graph = 0;
27
83 if (!task) 28 if (!task)
84 task = current; 29 task = current;
85 30
@@ -107,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
107 52
108 context = (struct thread_info *) 53 context = (struct thread_info *)
109 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 54 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
110 bp = print_context_stack(context, stack, bp, ops, data, NULL); 55 bp = print_context_stack(context, stack, bp, ops,
56 data, NULL, &graph);
111 57
112 stack = (unsigned long *)context->previous_esp; 58 stack = (unsigned long *)context->previous_esp;
113 if (!stack) 59 if (!stack)
@@ -119,57 +65,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
119} 65}
120EXPORT_SYMBOL(dump_trace); 66EXPORT_SYMBOL(dump_trace);
121 67
122static void 68void
123print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
124{
125 printk(data);
126 print_symbol(msg, symbol);
127 printk("\n");
128}
129
130static void print_trace_warning(void *data, char *msg)
131{
132 printk("%s%s\n", (char *)data, msg);
133}
134
135static int print_trace_stack(void *data, char *name)
136{
137 printk("%s <%s> ", (char *)data, name);
138 return 0;
139}
140
141/*
142 * Print one address/symbol entries per line.
143 */
144static void print_trace_address(void *data, unsigned long addr, int reliable)
145{
146 touch_nmi_watchdog();
147 printk(data);
148 printk_address(addr, reliable);
149}
150
151static const struct stacktrace_ops print_trace_ops = {
152 .warning = print_trace_warning,
153 .warning_symbol = print_trace_warning_symbol,
154 .stack = print_trace_stack,
155 .address = print_trace_address,
156};
157
158static void
159show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
160 unsigned long *stack, unsigned long bp, char *log_lvl)
161{
162 printk("%sCall Trace:\n", log_lvl);
163 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
164}
165
166void show_trace(struct task_struct *task, struct pt_regs *regs,
167 unsigned long *stack, unsigned long bp)
168{
169 show_trace_log_lvl(task, regs, stack, bp, "");
170}
171
172static void
173show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 69show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
174 unsigned long *sp, unsigned long bp, char *log_lvl) 70 unsigned long *sp, unsigned long bp, char *log_lvl)
175{ 71{
@@ -196,33 +92,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
196 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 92 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
197} 93}
198 94
199void show_stack(struct task_struct *task, unsigned long *sp)
200{
201 show_stack_log_lvl(task, NULL, sp, 0, "");
202}
203
204/*
205 * The architecture-independent dump_stack generator
206 */
207void dump_stack(void)
208{
209 unsigned long bp = 0;
210 unsigned long stack;
211
212#ifdef CONFIG_FRAME_POINTER
213 if (!bp)
214 get_bp(bp);
215#endif
216
217 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
218 current->pid, current->comm, print_tainted(),
219 init_utsname()->release,
220 (int)strcspn(init_utsname()->version, " "),
221 init_utsname()->version);
222 show_trace(NULL, NULL, &stack, bp);
223}
224
225EXPORT_SYMBOL(dump_stack);
226 95
227void show_registers(struct pt_regs *regs) 96void show_registers(struct pt_regs *regs)
228{ 97{
@@ -283,167 +152,3 @@ int is_valid_bugaddr(unsigned long ip)
283 return ud2 == 0x0b0f; 152 return ud2 == 0x0b0f;
284} 153}
285 154
286static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
287static int die_owner = -1;
288static unsigned int die_nest_count;
289
290unsigned __kprobes long oops_begin(void)
291{
292 unsigned long flags;
293
294 oops_enter();
295
296 if (die_owner != raw_smp_processor_id()) {
297 console_verbose();
298 raw_local_irq_save(flags);
299 __raw_spin_lock(&die_lock);
300 die_owner = smp_processor_id();
301 die_nest_count = 0;
302 bust_spinlocks(1);
303 } else {
304 raw_local_irq_save(flags);
305 }
306 die_nest_count++;
307 return flags;
308}
309
310void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
311{
312 bust_spinlocks(0);
313 die_owner = -1;
314 add_taint(TAINT_DIE);
315 __raw_spin_unlock(&die_lock);
316 raw_local_irq_restore(flags);
317
318 if (!regs)
319 return;
320
321 if (kexec_should_crash(current))
322 crash_kexec(regs);
323 if (in_interrupt())
324 panic("Fatal exception in interrupt");
325 if (panic_on_oops)
326 panic("Fatal exception");
327 oops_exit();
328 do_exit(signr);
329}
330
331int __kprobes __die(const char *str, struct pt_regs *regs, long err)
332{
333 unsigned short ss;
334 unsigned long sp;
335
336 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
337#ifdef CONFIG_PREEMPT
338 printk("PREEMPT ");
339#endif
340#ifdef CONFIG_SMP
341 printk("SMP ");
342#endif
343#ifdef CONFIG_DEBUG_PAGEALLOC
344 printk("DEBUG_PAGEALLOC");
345#endif
346 printk("\n");
347 sysfs_printk_last_file();
348 if (notify_die(DIE_OOPS, str, regs, err,
349 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
350 return 1;
351
352 show_registers(regs);
353 /* Executive summary in case the oops scrolled away */
354 sp = (unsigned long) (&regs->sp);
355 savesegment(ss, ss);
356 if (user_mode(regs)) {
357 sp = regs->sp;
358 ss = regs->ss & 0xffff;
359 }
360 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
361 print_symbol("%s", regs->ip);
362 printk(" SS:ESP %04x:%08lx\n", ss, sp);
363 return 0;
364}
365
366/*
367 * This is gone through when something in the kernel has done something bad
368 * and is about to be terminated:
369 */
370void die(const char *str, struct pt_regs *regs, long err)
371{
372 unsigned long flags = oops_begin();
373
374 if (die_nest_count < 3) {
375 report_bug(regs->ip, regs);
376
377 if (__die(str, regs, err))
378 regs = NULL;
379 } else {
380 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
381 }
382
383 oops_end(flags, regs, SIGSEGV);
384}
385
386static DEFINE_SPINLOCK(nmi_print_lock);
387
388void notrace __kprobes
389die_nmi(char *str, struct pt_regs *regs, int do_panic)
390{
391 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
392 return;
393
394 spin_lock(&nmi_print_lock);
395 /*
396 * We are in trouble anyway, lets at least try
397 * to get a message out:
398 */
399 bust_spinlocks(1);
400 printk(KERN_EMERG "%s", str);
401 printk(" on CPU%d, ip %08lx, registers:\n",
402 smp_processor_id(), regs->ip);
403 show_registers(regs);
404 if (do_panic)
405 panic("Non maskable interrupt");
406 console_silent();
407 spin_unlock(&nmi_print_lock);
408
409 /*
410 * If we are in kernel we are probably nested up pretty bad
 411 * and might as well get out now while we still can:
412 */
413 if (!user_mode_vm(regs)) {
414 current->thread.trap_no = 2;
415 crash_kexec(regs);
416 }
417
418 bust_spinlocks(0);
419 do_exit(SIGSEGV);
420}
421
422static int __init oops_setup(char *s)
423{
424 if (!s)
425 return -EINVAL;
426 if (!strcmp(s, "panic"))
427 panic_on_oops = 1;
428 return 0;
429}
430early_param("oops", oops_setup);
431
432static int __init kstack_setup(char *s)
433{
434 if (!s)
435 return -EINVAL;
436 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
437 return 0;
438}
439early_param("kstack", kstack_setup);
440
441static int __init code_bytes_setup(char *s)
442{
443 code_bytes = simple_strtoul(s, NULL, 0);
444 if (code_bytes > 8192)
445 code_bytes = 8192;
446
447 return 1;
448}
449__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 96a5db7da8a7..d35db5993fd6 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -17,19 +17,7 @@
17 17
18#include <asm/stacktrace.h> 18#include <asm/stacktrace.h>
19 19
20#define STACKSLOTS_PER_LINE 4 20#include "dumpstack.h"
21#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33 21
34static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 22static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
35 unsigned *usedp, char **idp) 23 unsigned *usedp, char **idp)
@@ -113,59 +101,16 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
113 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 101 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
114 */ 102 */
115 103
116static inline int valid_stack_ptr(struct thread_info *tinfo,
117 void *p, unsigned int size, void *end)
118{
119 void *t = tinfo;
120 if (end) {
121 if (p < end && p >= (end-THREAD_SIZE))
122 return 1;
123 else
124 return 0;
125 }
126 return p > t && p < t + THREAD_SIZE - size;
127}
128
129/* The form of the top of the frame on the stack */
130struct stack_frame {
131 struct stack_frame *next_frame;
132 unsigned long return_address;
133};
134
135static inline unsigned long
136print_context_stack(struct thread_info *tinfo,
137 unsigned long *stack, unsigned long bp,
138 const struct stacktrace_ops *ops, void *data,
139 unsigned long *end)
140{
141 struct stack_frame *frame = (struct stack_frame *)bp;
142
143 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
144 unsigned long addr;
145
146 addr = *stack;
147 if (__kernel_text_address(addr)) {
148 if ((unsigned long) stack == bp + sizeof(long)) {
149 ops->address(data, addr, 1);
150 frame = frame->next_frame;
151 bp = (unsigned long) frame;
152 } else {
153 ops->address(data, addr, bp == 0);
154 }
155 }
156 stack++;
157 }
158 return bp;
159}
160
161void dump_trace(struct task_struct *task, struct pt_regs *regs, 104void dump_trace(struct task_struct *task, struct pt_regs *regs,
162 unsigned long *stack, unsigned long bp, 105 unsigned long *stack, unsigned long bp,
163 const struct stacktrace_ops *ops, void *data) 106 const struct stacktrace_ops *ops, void *data)
164{ 107{
165 const unsigned cpu = get_cpu(); 108 const unsigned cpu = get_cpu();
166 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; 109 unsigned long *irq_stack_end =
110 (unsigned long *)per_cpu(irq_stack_ptr, cpu);
167 unsigned used = 0; 111 unsigned used = 0;
168 struct thread_info *tinfo; 112 struct thread_info *tinfo;
113 int graph = 0;
169 114
170 if (!task) 115 if (!task)
171 task = current; 116 task = current;
@@ -206,7 +151,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
206 break; 151 break;
207 152
208 bp = print_context_stack(tinfo, stack, bp, ops, 153 bp = print_context_stack(tinfo, stack, bp, ops,
209 data, estack_end); 154 data, estack_end, &graph);
210 ops->stack(data, "<EOE>"); 155 ops->stack(data, "<EOE>");
211 /* 156 /*
212 * We link to the next stack via the 157 * We link to the next stack via the
@@ -216,23 +161,23 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
216 stack = (unsigned long *) estack_end[-2]; 161 stack = (unsigned long *) estack_end[-2];
217 continue; 162 continue;
218 } 163 }
219 if (irqstack_end) { 164 if (irq_stack_end) {
220 unsigned long *irqstack; 165 unsigned long *irq_stack;
221 irqstack = irqstack_end - 166 irq_stack = irq_stack_end -
222 (IRQSTACKSIZE - 64) / sizeof(*irqstack); 167 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
223 168
224 if (stack >= irqstack && stack < irqstack_end) { 169 if (stack >= irq_stack && stack < irq_stack_end) {
225 if (ops->stack(data, "IRQ") < 0) 170 if (ops->stack(data, "IRQ") < 0)
226 break; 171 break;
227 bp = print_context_stack(tinfo, stack, bp, 172 bp = print_context_stack(tinfo, stack, bp,
228 ops, data, irqstack_end); 173 ops, data, irq_stack_end, &graph);
229 /* 174 /*
230 * We link to the next stack (which would be 175 * We link to the next stack (which would be
231 * the process stack normally) the last 176 * the process stack normally) the last
232 * pointer (index -1 to end) in the IRQ stack: 177 * pointer (index -1 to end) in the IRQ stack:
233 */ 178 */
234 stack = (unsigned long *) (irqstack_end[-1]); 179 stack = (unsigned long *) (irq_stack_end[-1]);
235 irqstack_end = NULL; 180 irq_stack_end = NULL;
236 ops->stack(data, "EOI"); 181 ops->stack(data, "EOI");
237 continue; 182 continue;
238 } 183 }
@@ -243,72 +188,22 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
243 /* 188 /*
244 * This handles the process stack: 189 * This handles the process stack:
245 */ 190 */
246 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); 191 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
247 put_cpu(); 192 put_cpu();
248} 193}
249EXPORT_SYMBOL(dump_trace); 194EXPORT_SYMBOL(dump_trace);
250 195
251static void 196void
252print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
253{
254 printk(data);
255 print_symbol(msg, symbol);
256 printk("\n");
257}
258
259static void print_trace_warning(void *data, char *msg)
260{
261 printk("%s%s\n", (char *)data, msg);
262}
263
264static int print_trace_stack(void *data, char *name)
265{
266 printk("%s <%s> ", (char *)data, name);
267 return 0;
268}
269
270/*
271 * Print one address/symbol entries per line.
272 */
273static void print_trace_address(void *data, unsigned long addr, int reliable)
274{
275 touch_nmi_watchdog();
276 printk(data);
277 printk_address(addr, reliable);
278}
279
280static const struct stacktrace_ops print_trace_ops = {
281 .warning = print_trace_warning,
282 .warning_symbol = print_trace_warning_symbol,
283 .stack = print_trace_stack,
284 .address = print_trace_address,
285};
286
287static void
288show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
289 unsigned long *stack, unsigned long bp, char *log_lvl)
290{
291 printk("%sCall Trace:\n", log_lvl);
292 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
293}
294
295void show_trace(struct task_struct *task, struct pt_regs *regs,
296 unsigned long *stack, unsigned long bp)
297{
298 show_trace_log_lvl(task, regs, stack, bp, "");
299}
300
301static void
302show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 197show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
303 unsigned long *sp, unsigned long bp, char *log_lvl) 198 unsigned long *sp, unsigned long bp, char *log_lvl)
304{ 199{
305 unsigned long *stack; 200 unsigned long *stack;
306 int i; 201 int i;
307 const int cpu = smp_processor_id(); 202 const int cpu = smp_processor_id();
308 unsigned long *irqstack_end = 203 unsigned long *irq_stack_end =
309 (unsigned long *) (cpu_pda(cpu)->irqstackptr); 204 (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
310 unsigned long *irqstack = 205 unsigned long *irq_stack =
311 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); 206 (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
312 207
313 /* 208 /*
314 * debugging aid: "show_stack(NULL, NULL);" prints the 209 * debugging aid: "show_stack(NULL, NULL);" prints the
@@ -324,9 +219,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
324 219
325 stack = sp; 220 stack = sp;
326 for (i = 0; i < kstack_depth_to_print; i++) { 221 for (i = 0; i < kstack_depth_to_print; i++) {
327 if (stack >= irqstack && stack <= irqstack_end) { 222 if (stack >= irq_stack && stack <= irq_stack_end) {
328 if (stack == irqstack_end) { 223 if (stack == irq_stack_end) {
329 stack = (unsigned long *) (irqstack_end[-1]); 224 stack = (unsigned long *) (irq_stack_end[-1]);
330 printk(" <EOI> "); 225 printk(" <EOI> ");
331 } 226 }
332 } else { 227 } else {
@@ -342,39 +237,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
342 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 237 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
343} 238}
344 239
345void show_stack(struct task_struct *task, unsigned long *sp)
346{
347 show_stack_log_lvl(task, NULL, sp, 0, "");
348}
349
350/*
351 * The architecture-independent dump_stack generator
352 */
353void dump_stack(void)
354{
355 unsigned long bp = 0;
356 unsigned long stack;
357
358#ifdef CONFIG_FRAME_POINTER
359 if (!bp)
360 get_bp(bp);
361#endif
362
363 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
364 current->pid, current->comm, print_tainted(),
365 init_utsname()->release,
366 (int)strcspn(init_utsname()->version, " "),
367 init_utsname()->version);
368 show_trace(NULL, NULL, &stack, bp);
369}
370EXPORT_SYMBOL(dump_stack);
371
372void show_registers(struct pt_regs *regs) 240void show_registers(struct pt_regs *regs)
373{ 241{
374 int i; 242 int i;
375 unsigned long sp; 243 unsigned long sp;
376 const int cpu = smp_processor_id(); 244 const int cpu = smp_processor_id();
377 struct task_struct *cur = cpu_pda(cpu)->pcurrent; 245 struct task_struct *cur = current;
378 246
379 sp = regs->sp; 247 sp = regs->sp;
380 printk("CPU %d ", cpu); 248 printk("CPU %d ", cpu);
@@ -429,147 +297,3 @@ int is_valid_bugaddr(unsigned long ip)
429 return ud2 == 0x0b0f; 297 return ud2 == 0x0b0f;
430} 298}
431 299
432static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
433static int die_owner = -1;
434static unsigned int die_nest_count;
435
436unsigned __kprobes long oops_begin(void)
437{
438 int cpu;
439 unsigned long flags;
440
441 oops_enter();
442
443 /* racy, but better than risking deadlock. */
444 raw_local_irq_save(flags);
445 cpu = smp_processor_id();
446 if (!__raw_spin_trylock(&die_lock)) {
447 if (cpu == die_owner)
448 /* nested oops. should stop eventually */;
449 else
450 __raw_spin_lock(&die_lock);
451 }
452 die_nest_count++;
453 die_owner = cpu;
454 console_verbose();
455 bust_spinlocks(1);
456 return flags;
457}
458
459void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
460{
461 die_owner = -1;
462 bust_spinlocks(0);
463 die_nest_count--;
464 if (!die_nest_count)
465 /* Nest count reaches zero, release the lock. */
466 __raw_spin_unlock(&die_lock);
467 raw_local_irq_restore(flags);
468 if (!regs) {
469 oops_exit();
470 return;
471 }
472 if (in_interrupt())
473 panic("Fatal exception in interrupt");
474 if (panic_on_oops)
475 panic("Fatal exception");
476 oops_exit();
477 do_exit(signr);
478}
479
480int __kprobes __die(const char *str, struct pt_regs *regs, long err)
481{
482 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
483#ifdef CONFIG_PREEMPT
484 printk("PREEMPT ");
485#endif
486#ifdef CONFIG_SMP
487 printk("SMP ");
488#endif
489#ifdef CONFIG_DEBUG_PAGEALLOC
490 printk("DEBUG_PAGEALLOC");
491#endif
492 printk("\n");
493 sysfs_printk_last_file();
494 if (notify_die(DIE_OOPS, str, regs, err,
495 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
496 return 1;
497
498 show_registers(regs);
499 add_taint(TAINT_DIE);
500 /* Executive summary in case the oops scrolled away */
501 printk(KERN_ALERT "RIP ");
502 printk_address(regs->ip, 1);
503 printk(" RSP <%016lx>\n", regs->sp);
504 if (kexec_should_crash(current))
505 crash_kexec(regs);
506 return 0;
507}
508
509void die(const char *str, struct pt_regs *regs, long err)
510{
511 unsigned long flags = oops_begin();
512
513 if (!user_mode(regs))
514 report_bug(regs->ip, regs);
515
516 if (__die(str, regs, err))
517 regs = NULL;
518 oops_end(flags, regs, SIGSEGV);
519}
520
521notrace __kprobes void
522die_nmi(char *str, struct pt_regs *regs, int do_panic)
523{
524 unsigned long flags;
525
526 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
527 return;
528
529 flags = oops_begin();
530 /*
531 * We are in trouble anyway, lets at least try
532 * to get a message out.
533 */
534 printk(KERN_EMERG "%s", str);
535 printk(" on CPU%d, ip %08lx, registers:\n",
536 smp_processor_id(), regs->ip);
537 show_registers(regs);
538 if (kexec_should_crash(current))
539 crash_kexec(regs);
540 if (do_panic || panic_on_oops)
541 panic("Non maskable interrupt");
542 oops_end(flags, NULL, SIGBUS);
543 nmi_exit();
544 local_irq_enable();
545 do_exit(SIGBUS);
546}
547
548static int __init oops_setup(char *s)
549{
550 if (!s)
551 return -EINVAL;
552 if (!strcmp(s, "panic"))
553 panic_on_oops = 1;
554 return 0;
555}
556early_param("oops", oops_setup);
557
558static int __init kstack_setup(char *s)
559{
560 if (!s)
561 return -EINVAL;
562 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
563 return 0;
564}
565early_param("kstack", kstack_setup);
566
567static int __init code_bytes_setup(char *s)
568{
569 code_bytes = simple_strtoul(s, NULL, 0);
570 if (code_bytes > 8192)
571 code_bytes = 8192;
572
573 return 1;
574}
575__setup("code_bytes=", code_bytes_setup);
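
Editor's note: one piece of the dump_trace() flow worth spelling out is the IRQ-stack test that survives the pda-to-per_cpu conversion above. A minimal C sketch of that bounds check follows; IRQ_STACK_SIZE is assumed to be 16 KB here and on_irq_stack() is a stand-in name, not a kernel function.

/* Sketch of the check dump_trace() uses to decide whether a saved
 * stack pointer lies on the per-CPU IRQ stack (simplified). */
#define IRQ_STACK_SIZE (16 * 1024)	/* assumed size for the sketch */

static int on_irq_stack(unsigned long *sp, unsigned long *irq_stack_end)
{
	/* the top 64 bytes of the IRQ stack are skipped, as in the hunk above */
	unsigned long *irq_stack =
		irq_stack_end - (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);

	return sp >= irq_stack && sp < irq_stack_end;
}
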
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7aafeb5263ef..e85826829cf2 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -665,6 +665,27 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
665} 665}
666#endif 666#endif
667 667
668#ifdef CONFIG_HIBERNATION
669/**
670 * Mark ACPI NVS memory region, so that we can save/restore it during
671 * hibernation and the subsequent resume.
672 */
673static int __init e820_mark_nvs_memory(void)
674{
675 int i;
676
677 for (i = 0; i < e820.nr_map; i++) {
678 struct e820entry *ei = &e820.map[i];
679
680 if (ei->type == E820_NVS)
681 hibernate_nvs_register(ei->addr, ei->size);
682 }
683
684 return 0;
685}
686core_initcall(e820_mark_nvs_memory);
687#endif
688
668/* 689/*
669 * Early reserved memory areas. 690 * Early reserved memory areas.
670 */ 691 */
@@ -677,22 +698,6 @@ struct early_res {
677}; 698};
678static struct early_res early_res[MAX_EARLY_RES] __initdata = { 699static struct early_res early_res[MAX_EARLY_RES] __initdata = {
679 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ 700 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
680#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
681 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
682#endif
683#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
684 /*
685 * But first pinch a few for the stack/trampoline stuff
686 * FIXME: Don't need the extra page at 4K, but need to fix
687 * trampoline before removing it. (see the GDT stuff)
688 */
689 { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
690 /*
691 * Has to be in very low memory so we can execute
692 * real-mode AP code.
693 */
694 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
695#endif
696 {} 701 {}
697}; 702};
698 703
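
Editor's note: the hibernation hunk above boils down to a single filtered walk over the e820 map. A self-contained sketch of that walk, using stand-in types and a callback in place of hibernate_nvs_register():

#include <stdint.h>

#define E820_NVS 4	/* type value used by the e820 code */

struct e820_entry_sketch {
	uint64_t addr;
	uint64_t size;
	uint32_t type;
};

/* Hand every ACPI NVS range to a registration callback. */
static void mark_nvs(const struct e820_entry_sketch *map, int nr,
		     void (*nvs_register)(uint64_t addr, uint64_t size))
{
	int i;

	for (i = 0; i < nr; i++)
		if (map[i].type == E820_NVS)
			nvs_register(map[i].addr, map[i].size);
}
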
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 3ce029ffaa55..76b8cd953dee 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -17,6 +17,7 @@
17#include <asm/io_apic.h> 17#include <asm/io_apic.h>
18#include <asm/apic.h> 18#include <asm/apic.h>
19#include <asm/iommu.h> 19#include <asm/iommu.h>
20#include <asm/gart.h>
20 21
21static void __init fix_hypertransport_config(int num, int slot, int func) 22static void __init fix_hypertransport_config(int num, int slot, int func)
22{ 23{
@@ -188,20 +189,6 @@ static void __init ati_bugs_contd(int num, int slot, int func)
188} 189}
189#endif 190#endif
190 191
191#ifdef CONFIG_DMAR
192static void __init intel_g33_dmar(int num, int slot, int func)
193{
194 struct acpi_table_header *dmar_tbl;
195 acpi_status status;
196
197 status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
198 if (ACPI_SUCCESS(status)) {
199 printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
200 dmar_disabled = 1;
201 }
202}
203#endif
204
205#define QFLAG_APPLY_ONCE 0x1 192#define QFLAG_APPLY_ONCE 0x1
206#define QFLAG_APPLIED 0x2 193#define QFLAG_APPLIED 0x2
207#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 194#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -214,6 +201,12 @@ struct chipset {
214 void (*f)(int num, int slot, int func); 201 void (*f)(int num, int slot, int func);
215}; 202};
216 203
204/*
205 * Only works for devices on the root bus. If you add any devices
206 * not on bus 0 readd another loop level in early_quirks(). But
207 * be careful because at least the Nvidia quirk here relies on
208 * only matching on bus 0.
209 */
217static struct chipset early_qrk[] __initdata = { 210static struct chipset early_qrk[] __initdata = {
218 { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, 211 { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
219 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, 212 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
@@ -225,10 +218,6 @@ static struct chipset early_qrk[] __initdata = {
225 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, 218 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
226 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, 219 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
227 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, 220 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
228#ifdef CONFIG_DMAR
229 { PCI_VENDOR_ID_INTEL, 0x29c0,
230 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
231#endif
232 {} 221 {}
233}; 222};
234 223
@@ -284,17 +273,17 @@ static int __init check_dev_quirk(int num, int slot, int func)
284 273
285void __init early_quirks(void) 274void __init early_quirks(void)
286{ 275{
287 int num, slot, func; 276 int slot, func;
288 277
289 if (!early_pci_allowed()) 278 if (!early_pci_allowed())
290 return; 279 return;
291 280
292 /* Poor man's PCI discovery */ 281 /* Poor man's PCI discovery */
293 for (num = 0; num < 32; num++) 282 /* Only scan the root bus */
294 for (slot = 0; slot < 32; slot++) 283 for (slot = 0; slot < 32; slot++)
295 for (func = 0; func < 8; func++) { 284 for (func = 0; func < 8; func++) {
296 /* Only probe function 0 on single fn devices */ 285 /* Only probe function 0 on single fn devices */
297 if (check_dev_quirk(num, slot, func)) 286 if (check_dev_quirk(0, slot, func))
298 break; 287 break;
299 } 288 }
300} 289}
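
Editor's note: as the new comment above early_qrk[] says, quirks are only matched on the root bus, and the rewritten early_quirks() loop reflects that. A sketch of the resulting scan shape, with check_quirk() standing in for check_dev_quirk():

/* Root-bus-only "poor man's" PCI scan: the bus number is fixed at 0,
 * so the outer bus loop from the old code is gone. */
static void scan_root_bus(int (*check_quirk)(int bus, int slot, int func))
{
	int slot, func;

	for (slot = 0; slot < 32; slot++)
		for (func = 0; func < 8; func++) {
			/* non-zero return: single-function device,
			 * skip the remaining functions of this slot */
			if (check_quirk(0, slot, func))
				break;
		}
}
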
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 34ad997d3834..639ad98238a2 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -13,8 +13,8 @@
13#include <asm/setup.h> 13#include <asm/setup.h>
14#include <xen/hvc-console.h> 14#include <xen/hvc-console.h>
15#include <asm/pci-direct.h> 15#include <asm/pci-direct.h>
16#include <asm/pgtable.h>
17#include <asm/fixmap.h> 16#include <asm/fixmap.h>
17#include <asm/pgtable.h>
18#include <linux/usb/ehci_def.h> 18#include <linux/usb/ehci_def.h>
19 19
20/* Simple VGA output */ 20/* Simple VGA output */
@@ -875,49 +875,6 @@ static struct console early_dbgp_console = {
875}; 875};
876#endif 876#endif
877 877
878/* Console interface to a host file on AMD's SimNow! */
879
880static int simnow_fd;
881
882enum {
883 MAGIC1 = 0xBACCD00A,
884 MAGIC2 = 0xCA110000,
885 XOPEN = 5,
886 XWRITE = 4,
887};
888
889static noinline long simnow(long cmd, long a, long b, long c)
890{
891 long ret;
892
893 asm volatile("cpuid" :
894 "=a" (ret) :
895 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
896 return ret;
897}
898
899static void __init simnow_init(char *str)
900{
901 char *fn = "klog";
902
903 if (*str == '=')
904 fn = ++str;
905 /* error ignored */
906 simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
907}
908
909static void simnow_write(struct console *con, const char *s, unsigned n)
910{
911 simnow(XWRITE, simnow_fd, (unsigned long)s, n);
912}
913
914static struct console simnow_console = {
915 .name = "simnow",
916 .write = simnow_write,
917 .flags = CON_PRINTBUFFER,
918 .index = -1,
919};
920
921/* Direct interface for emergencies */ 878/* Direct interface for emergencies */
922static struct console *early_console = &early_vga_console; 879static struct console *early_console = &early_vga_console;
923static int __initdata early_console_initialized; 880static int __initdata early_console_initialized;
@@ -929,7 +886,7 @@ asmlinkage void early_printk(const char *fmt, ...)
929 va_list ap; 886 va_list ap;
930 887
931 va_start(ap, fmt); 888 va_start(ap, fmt);
932 n = vscnprintf(buf, 512, fmt, ap); 889 n = vscnprintf(buf, sizeof(buf), fmt, ap);
933 early_console->write(early_console, buf, n); 890 early_console->write(early_console, buf, n);
934 va_end(ap); 891 va_end(ap);
935} 892}
@@ -960,10 +917,6 @@ static int __init setup_early_printk(char *buf)
960 max_ypos = boot_params.screen_info.orig_video_lines; 917 max_ypos = boot_params.screen_info.orig_video_lines;
961 current_ypos = boot_params.screen_info.orig_y; 918 current_ypos = boot_params.screen_info.orig_y;
962 early_console = &early_vga_console; 919 early_console = &early_vga_console;
963 } else if (!strncmp(buf, "simnow", 6)) {
964 simnow_init(buf + 6);
965 early_console = &simnow_console;
966 keep_early = 1;
967#ifdef CONFIG_EARLY_PRINTK_DBGP 920#ifdef CONFIG_EARLY_PRINTK_DBGP
968 } else if (!strncmp(buf, "dbgp", 4)) { 921 } else if (!strncmp(buf, "dbgp", 4)) {
969 if (early_dbgp_init(buf+4) < 0) 922 if (early_dbgp_init(buf+4) < 0)
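
Editor's note: besides dropping the SimNow console, the early_printk() change ties the format bound to the buffer itself instead of repeating the literal 512. A user-space sketch of the same pattern, with vsnprintf() standing in for the kernel's vscnprintf():

#include <stdarg.h>
#include <stdio.h>

static void early_printk_sketch(const char *fmt, ...)
{
	char buf[512];
	va_list ap;
	int n;

	va_start(ap, fmt);
	/* bound follows the array, so resizing buf cannot reintroduce an overflow */
	n = vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	if (n > 0)
		fputs(buf, stderr);
}
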
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 1119d247fe11..b205272ad394 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -366,10 +366,12 @@ void __init efi_init(void)
366 SMBIOS_TABLE_GUID)) { 366 SMBIOS_TABLE_GUID)) {
367 efi.smbios = config_tables[i].table; 367 efi.smbios = config_tables[i].table;
368 printk(" SMBIOS=0x%lx ", config_tables[i].table); 368 printk(" SMBIOS=0x%lx ", config_tables[i].table);
369#ifdef CONFIG_X86_UV
369 } else if (!efi_guidcmp(config_tables[i].guid, 370 } else if (!efi_guidcmp(config_tables[i].guid,
370 UV_SYSTEM_TABLE_GUID)) { 371 UV_SYSTEM_TABLE_GUID)) {
371 efi.uv_systab = config_tables[i].table; 372 efi.uv_systab = config_tables[i].table;
372 printk(" UVsystab=0x%lx ", config_tables[i].table); 373 printk(" UVsystab=0x%lx ", config_tables[i].table);
374#endif
373 } else if (!efi_guidcmp(config_tables[i].guid, 375 } else if (!efi_guidcmp(config_tables[i].guid,
374 HCDP_TABLE_GUID)) { 376 HCDP_TABLE_GUID)) {
375 efi.hcdp = config_tables[i].table; 377 efi.hcdp = config_tables[i].table;
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 652c5287215f..a4ee29127fdf 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -36,6 +36,7 @@
36#include <asm/proto.h> 36#include <asm/proto.h>
37#include <asm/efi.h> 37#include <asm/efi.h>
38#include <asm/cacheflush.h> 38#include <asm/cacheflush.h>
39#include <asm/fixmap.h>
39 40
40static pgd_t save_pgd __initdata; 41static pgd_t save_pgd __initdata;
41static unsigned long efi_flags __initdata; 42static unsigned long efi_flags __initdata;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 28b597ef9ca1..e99206831459 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -30,12 +30,13 @@
30 * 1C(%esp) - %ds 30 * 1C(%esp) - %ds
31 * 20(%esp) - %es 31 * 20(%esp) - %es
32 * 24(%esp) - %fs 32 * 24(%esp) - %fs
33 * 28(%esp) - orig_eax 33 * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
34 * 2C(%esp) - %eip 34 * 2C(%esp) - orig_eax
35 * 30(%esp) - %cs 35 * 30(%esp) - %eip
36 * 34(%esp) - %eflags 36 * 34(%esp) - %cs
37 * 38(%esp) - %oldesp 37 * 38(%esp) - %eflags
38 * 3C(%esp) - %oldss 38 * 3C(%esp) - %oldesp
39 * 40(%esp) - %oldss
39 * 40 *
40 * "current" is in register %ebx during any slow entries. 41 * "current" is in register %ebx during any slow entries.
41 */ 42 */
@@ -101,121 +102,221 @@
101#define resume_userspace_sig resume_userspace 102#define resume_userspace_sig resume_userspace
102#endif 103#endif
103 104
104#define SAVE_ALL \ 105/*
105 cld; \ 106 * User gs save/restore
106 pushl %fs; \ 107 *
107 CFI_ADJUST_CFA_OFFSET 4;\ 108 * %gs is used for userland TLS and kernel only uses it for stack
108 /*CFI_REL_OFFSET fs, 0;*/\ 109 * canary which is required to be at %gs:20 by gcc. Read the comment
109 pushl %es; \ 110 * at the top of stackprotector.h for more info.
110 CFI_ADJUST_CFA_OFFSET 4;\ 111 *
111 /*CFI_REL_OFFSET es, 0;*/\ 112 * Local labels 98 and 99 are used.
112 pushl %ds; \ 113 */
113 CFI_ADJUST_CFA_OFFSET 4;\ 114#ifdef CONFIG_X86_32_LAZY_GS
114 /*CFI_REL_OFFSET ds, 0;*/\ 115
115 pushl %eax; \ 116 /* unfortunately push/pop can't be no-op */
116 CFI_ADJUST_CFA_OFFSET 4;\ 117.macro PUSH_GS
117 CFI_REL_OFFSET eax, 0;\ 118 pushl $0
118 pushl %ebp; \ 119 CFI_ADJUST_CFA_OFFSET 4
119 CFI_ADJUST_CFA_OFFSET 4;\ 120.endm
120 CFI_REL_OFFSET ebp, 0;\ 121.macro POP_GS pop=0
121 pushl %edi; \ 122 addl $(4 + \pop), %esp
122 CFI_ADJUST_CFA_OFFSET 4;\ 123 CFI_ADJUST_CFA_OFFSET -(4 + \pop)
123 CFI_REL_OFFSET edi, 0;\ 124.endm
124 pushl %esi; \ 125.macro POP_GS_EX
125 CFI_ADJUST_CFA_OFFSET 4;\ 126.endm
126 CFI_REL_OFFSET esi, 0;\ 127
127 pushl %edx; \ 128 /* all the rest are no-op */
128 CFI_ADJUST_CFA_OFFSET 4;\ 129.macro PTGS_TO_GS
129 CFI_REL_OFFSET edx, 0;\ 130.endm
130 pushl %ecx; \ 131.macro PTGS_TO_GS_EX
131 CFI_ADJUST_CFA_OFFSET 4;\ 132.endm
132 CFI_REL_OFFSET ecx, 0;\ 133.macro GS_TO_REG reg
133 pushl %ebx; \ 134.endm
134 CFI_ADJUST_CFA_OFFSET 4;\ 135.macro REG_TO_PTGS reg
135 CFI_REL_OFFSET ebx, 0;\ 136.endm
136 movl $(__USER_DS), %edx; \ 137.macro SET_KERNEL_GS reg
137 movl %edx, %ds; \ 138.endm
138 movl %edx, %es; \ 139
139 movl $(__KERNEL_PERCPU), %edx; \ 140#else /* CONFIG_X86_32_LAZY_GS */
141
142.macro PUSH_GS
143 pushl %gs
144 CFI_ADJUST_CFA_OFFSET 4
145 /*CFI_REL_OFFSET gs, 0*/
146.endm
147
148.macro POP_GS pop=0
14998: popl %gs
150 CFI_ADJUST_CFA_OFFSET -4
151 /*CFI_RESTORE gs*/
152 .if \pop <> 0
153 add $\pop, %esp
154 CFI_ADJUST_CFA_OFFSET -\pop
155 .endif
156.endm
157.macro POP_GS_EX
158.pushsection .fixup, "ax"
15999: movl $0, (%esp)
160 jmp 98b
161.section __ex_table, "a"
162 .align 4
163 .long 98b, 99b
164.popsection
165.endm
166
167.macro PTGS_TO_GS
16898: mov PT_GS(%esp), %gs
169.endm
170.macro PTGS_TO_GS_EX
171.pushsection .fixup, "ax"
17299: movl $0, PT_GS(%esp)
173 jmp 98b
174.section __ex_table, "a"
175 .align 4
176 .long 98b, 99b
177.popsection
178.endm
179
180.macro GS_TO_REG reg
181 movl %gs, \reg
182 /*CFI_REGISTER gs, \reg*/
183.endm
184.macro REG_TO_PTGS reg
185 movl \reg, PT_GS(%esp)
186 /*CFI_REL_OFFSET gs, PT_GS*/
187.endm
188.macro SET_KERNEL_GS reg
189 movl $(__KERNEL_STACK_CANARY), \reg
190 movl \reg, %gs
191.endm
192
193#endif /* CONFIG_X86_32_LAZY_GS */
194
195.macro SAVE_ALL
196 cld
197 PUSH_GS
198 pushl %fs
199 CFI_ADJUST_CFA_OFFSET 4
200 /*CFI_REL_OFFSET fs, 0;*/
201 pushl %es
202 CFI_ADJUST_CFA_OFFSET 4
203 /*CFI_REL_OFFSET es, 0;*/
204 pushl %ds
205 CFI_ADJUST_CFA_OFFSET 4
206 /*CFI_REL_OFFSET ds, 0;*/
207 pushl %eax
208 CFI_ADJUST_CFA_OFFSET 4
209 CFI_REL_OFFSET eax, 0
210 pushl %ebp
211 CFI_ADJUST_CFA_OFFSET 4
212 CFI_REL_OFFSET ebp, 0
213 pushl %edi
214 CFI_ADJUST_CFA_OFFSET 4
215 CFI_REL_OFFSET edi, 0
216 pushl %esi
217 CFI_ADJUST_CFA_OFFSET 4
218 CFI_REL_OFFSET esi, 0
219 pushl %edx
220 CFI_ADJUST_CFA_OFFSET 4
221 CFI_REL_OFFSET edx, 0
222 pushl %ecx
223 CFI_ADJUST_CFA_OFFSET 4
224 CFI_REL_OFFSET ecx, 0
225 pushl %ebx
226 CFI_ADJUST_CFA_OFFSET 4
227 CFI_REL_OFFSET ebx, 0
228 movl $(__USER_DS), %edx
229 movl %edx, %ds
230 movl %edx, %es
231 movl $(__KERNEL_PERCPU), %edx
140 movl %edx, %fs 232 movl %edx, %fs
233 SET_KERNEL_GS %edx
234.endm
141 235
142#define RESTORE_INT_REGS \ 236.macro RESTORE_INT_REGS
143 popl %ebx; \ 237 popl %ebx
144 CFI_ADJUST_CFA_OFFSET -4;\ 238 CFI_ADJUST_CFA_OFFSET -4
145 CFI_RESTORE ebx;\ 239 CFI_RESTORE ebx
146 popl %ecx; \ 240 popl %ecx
147 CFI_ADJUST_CFA_OFFSET -4;\ 241 CFI_ADJUST_CFA_OFFSET -4
148 CFI_RESTORE ecx;\ 242 CFI_RESTORE ecx
149 popl %edx; \ 243 popl %edx
150 CFI_ADJUST_CFA_OFFSET -4;\ 244 CFI_ADJUST_CFA_OFFSET -4
151 CFI_RESTORE edx;\ 245 CFI_RESTORE edx
152 popl %esi; \ 246 popl %esi
153 CFI_ADJUST_CFA_OFFSET -4;\ 247 CFI_ADJUST_CFA_OFFSET -4
154 CFI_RESTORE esi;\ 248 CFI_RESTORE esi
155 popl %edi; \ 249 popl %edi
156 CFI_ADJUST_CFA_OFFSET -4;\ 250 CFI_ADJUST_CFA_OFFSET -4
157 CFI_RESTORE edi;\ 251 CFI_RESTORE edi
158 popl %ebp; \ 252 popl %ebp
159 CFI_ADJUST_CFA_OFFSET -4;\ 253 CFI_ADJUST_CFA_OFFSET -4
160 CFI_RESTORE ebp;\ 254 CFI_RESTORE ebp
161 popl %eax; \ 255 popl %eax
162 CFI_ADJUST_CFA_OFFSET -4;\ 256 CFI_ADJUST_CFA_OFFSET -4
163 CFI_RESTORE eax 257 CFI_RESTORE eax
258.endm
164 259
165#define RESTORE_REGS \ 260.macro RESTORE_REGS pop=0
166 RESTORE_INT_REGS; \ 261 RESTORE_INT_REGS
1671: popl %ds; \ 2621: popl %ds
168 CFI_ADJUST_CFA_OFFSET -4;\ 263 CFI_ADJUST_CFA_OFFSET -4
169 /*CFI_RESTORE ds;*/\ 264 /*CFI_RESTORE ds;*/
1702: popl %es; \ 2652: popl %es
171 CFI_ADJUST_CFA_OFFSET -4;\ 266 CFI_ADJUST_CFA_OFFSET -4
172 /*CFI_RESTORE es;*/\ 267 /*CFI_RESTORE es;*/
1733: popl %fs; \ 2683: popl %fs
174 CFI_ADJUST_CFA_OFFSET -4;\ 269 CFI_ADJUST_CFA_OFFSET -4
175 /*CFI_RESTORE fs;*/\ 270 /*CFI_RESTORE fs;*/
176.pushsection .fixup,"ax"; \ 271 POP_GS \pop
1774: movl $0,(%esp); \ 272.pushsection .fixup, "ax"
178 jmp 1b; \ 2734: movl $0, (%esp)
1795: movl $0,(%esp); \ 274 jmp 1b
180 jmp 2b; \ 2755: movl $0, (%esp)
1816: movl $0,(%esp); \ 276 jmp 2b
182 jmp 3b; \ 2776: movl $0, (%esp)
183.section __ex_table,"a";\ 278 jmp 3b
184 .align 4; \ 279.section __ex_table, "a"
185 .long 1b,4b; \ 280 .align 4
186 .long 2b,5b; \ 281 .long 1b, 4b
187 .long 3b,6b; \ 282 .long 2b, 5b
283 .long 3b, 6b
188.popsection 284.popsection
285 POP_GS_EX
286.endm
189 287
190#define RING0_INT_FRAME \ 288.macro RING0_INT_FRAME
191 CFI_STARTPROC simple;\ 289 CFI_STARTPROC simple
192 CFI_SIGNAL_FRAME;\ 290 CFI_SIGNAL_FRAME
193 CFI_DEF_CFA esp, 3*4;\ 291 CFI_DEF_CFA esp, 3*4
194 /*CFI_OFFSET cs, -2*4;*/\ 292 /*CFI_OFFSET cs, -2*4;*/
195 CFI_OFFSET eip, -3*4 293 CFI_OFFSET eip, -3*4
294.endm
196 295
197#define RING0_EC_FRAME \ 296.macro RING0_EC_FRAME
198 CFI_STARTPROC simple;\ 297 CFI_STARTPROC simple
199 CFI_SIGNAL_FRAME;\ 298 CFI_SIGNAL_FRAME
200 CFI_DEF_CFA esp, 4*4;\ 299 CFI_DEF_CFA esp, 4*4
201 /*CFI_OFFSET cs, -2*4;*/\ 300 /*CFI_OFFSET cs, -2*4;*/
202 CFI_OFFSET eip, -3*4 301 CFI_OFFSET eip, -3*4
302.endm
203 303
204#define RING0_PTREGS_FRAME \ 304.macro RING0_PTREGS_FRAME
205 CFI_STARTPROC simple;\ 305 CFI_STARTPROC simple
206 CFI_SIGNAL_FRAME;\ 306 CFI_SIGNAL_FRAME
207 CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\ 307 CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
208 /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\ 308 /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
209 CFI_OFFSET eip, PT_EIP-PT_OLDESP;\ 309 CFI_OFFSET eip, PT_EIP-PT_OLDESP
210 /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\ 310 /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
211 /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\ 311 /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
212 CFI_OFFSET eax, PT_EAX-PT_OLDESP;\ 312 CFI_OFFSET eax, PT_EAX-PT_OLDESP
213 CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\ 313 CFI_OFFSET ebp, PT_EBP-PT_OLDESP
214 CFI_OFFSET edi, PT_EDI-PT_OLDESP;\ 314 CFI_OFFSET edi, PT_EDI-PT_OLDESP
215 CFI_OFFSET esi, PT_ESI-PT_OLDESP;\ 315 CFI_OFFSET esi, PT_ESI-PT_OLDESP
216 CFI_OFFSET edx, PT_EDX-PT_OLDESP;\ 316 CFI_OFFSET edx, PT_EDX-PT_OLDESP
217 CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\ 317 CFI_OFFSET ecx, PT_ECX-PT_OLDESP
218 CFI_OFFSET ebx, PT_EBX-PT_OLDESP 318 CFI_OFFSET ebx, PT_EBX-PT_OLDESP
319.endm
219 320
220ENTRY(ret_from_fork) 321ENTRY(ret_from_fork)
221 CFI_STARTPROC 322 CFI_STARTPROC
@@ -362,6 +463,7 @@ sysenter_exit:
362 xorl %ebp,%ebp 463 xorl %ebp,%ebp
363 TRACE_IRQS_ON 464 TRACE_IRQS_ON
3641: mov PT_FS(%esp), %fs 4651: mov PT_FS(%esp), %fs
466 PTGS_TO_GS
365 ENABLE_INTERRUPTS_SYSEXIT 467 ENABLE_INTERRUPTS_SYSEXIT
366 468
367#ifdef CONFIG_AUDITSYSCALL 469#ifdef CONFIG_AUDITSYSCALL
@@ -410,6 +512,7 @@ sysexit_audit:
410 .align 4 512 .align 4
411 .long 1b,2b 513 .long 1b,2b
412.popsection 514.popsection
515 PTGS_TO_GS_EX
413ENDPROC(ia32_sysenter_target) 516ENDPROC(ia32_sysenter_target)
414 517
415 # system call handler stub 518 # system call handler stub
@@ -452,8 +555,7 @@ restore_all:
452restore_nocheck: 555restore_nocheck:
453 TRACE_IRQS_IRET 556 TRACE_IRQS_IRET
454restore_nocheck_notrace: 557restore_nocheck_notrace:
455 RESTORE_REGS 558 RESTORE_REGS 4 # skip orig_eax/error_code
456 addl $4, %esp # skip orig_eax/error_code
457 CFI_ADJUST_CFA_OFFSET -4 559 CFI_ADJUST_CFA_OFFSET -4
458irq_return: 560irq_return:
459 INTERRUPT_RETURN 561 INTERRUPT_RETURN
@@ -595,52 +697,83 @@ syscall_badsys:
595END(syscall_badsys) 697END(syscall_badsys)
596 CFI_ENDPROC 698 CFI_ENDPROC
597 699
598#define FIXUP_ESPFIX_STACK \ 700/*
599 /* since we are on a wrong stack, we cant make it a C code :( */ \ 701 * System calls that need a pt_regs pointer.
600 PER_CPU(gdt_page, %ebx); \ 702 */
601 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ 703#define PTREGSCALL(name) \
602 addl %esp, %eax; \ 704 ALIGN; \
603 pushl $__KERNEL_DS; \ 705ptregs_##name: \
604 CFI_ADJUST_CFA_OFFSET 4; \ 706 leal 4(%esp),%eax; \
605 pushl %eax; \ 707 jmp sys_##name;
606 CFI_ADJUST_CFA_OFFSET 4; \ 708
607 lss (%esp), %esp; \ 709PTREGSCALL(iopl)
608 CFI_ADJUST_CFA_OFFSET -8; 710PTREGSCALL(fork)
609#define UNWIND_ESPFIX_STACK \ 711PTREGSCALL(clone)
610 movl %ss, %eax; \ 712PTREGSCALL(vfork)
611 /* see if on espfix stack */ \ 713PTREGSCALL(execve)
612 cmpw $__ESPFIX_SS, %ax; \ 714PTREGSCALL(sigaltstack)
613 jne 27f; \ 715PTREGSCALL(sigreturn)
614 movl $__KERNEL_DS, %eax; \ 716PTREGSCALL(rt_sigreturn)
615 movl %eax, %ds; \ 717PTREGSCALL(vm86)
616 movl %eax, %es; \ 718PTREGSCALL(vm86old)
617 /* switch to normal stack */ \ 719
618 FIXUP_ESPFIX_STACK; \ 720.macro FIXUP_ESPFIX_STACK
61927:; 721 /* since we are on a wrong stack, we cant make it a C code :( */
722 PER_CPU(gdt_page, %ebx)
723 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
724 addl %esp, %eax
725 pushl $__KERNEL_DS
726 CFI_ADJUST_CFA_OFFSET 4
727 pushl %eax
728 CFI_ADJUST_CFA_OFFSET 4
729 lss (%esp), %esp
730 CFI_ADJUST_CFA_OFFSET -8
731.endm
732.macro UNWIND_ESPFIX_STACK
733 movl %ss, %eax
734 /* see if on espfix stack */
735 cmpw $__ESPFIX_SS, %ax
736 jne 27f
737 movl $__KERNEL_DS, %eax
738 movl %eax, %ds
739 movl %eax, %es
740 /* switch to normal stack */
741 FIXUP_ESPFIX_STACK
74227:
743.endm
620 744
621/* 745/*
622 * Build the entry stubs and pointer table with 746 * Build the entry stubs and pointer table with some assembler magic.
623 * some assembler magic. 747 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
748 * single cache line on all modern x86 implementations.
624 */ 749 */
625.section .rodata,"a" 750.section .init.rodata,"a"
626ENTRY(interrupt) 751ENTRY(interrupt)
627.text 752.text
628 753 .p2align 5
754 .p2align CONFIG_X86_L1_CACHE_SHIFT
629ENTRY(irq_entries_start) 755ENTRY(irq_entries_start)
630 RING0_INT_FRAME 756 RING0_INT_FRAME
631vector=0 757vector=FIRST_EXTERNAL_VECTOR
632.rept NR_VECTORS 758.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
633 ALIGN 759 .balign 32
634 .if vector 760 .rept 7
761 .if vector < NR_VECTORS
762 .if vector <> FIRST_EXTERNAL_VECTOR
635 CFI_ADJUST_CFA_OFFSET -4 763 CFI_ADJUST_CFA_OFFSET -4
636 .endif 764 .endif
6371: pushl $~(vector) 7651: pushl $(~vector+0x80) /* Note: always in signed byte range */
638 CFI_ADJUST_CFA_OFFSET 4 766 CFI_ADJUST_CFA_OFFSET 4
639 jmp common_interrupt 767 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
640 .previous 768 jmp 2f
769 .endif
770 .previous
641 .long 1b 771 .long 1b
642 .text 772 .text
643vector=vector+1 773vector=vector+1
774 .endif
775 .endr
7762: jmp common_interrupt
644.endr 777.endr
645END(irq_entries_start) 778END(irq_entries_start)
646 779
@@ -652,8 +785,9 @@ END(interrupt)
652 * the CPU automatically disables interrupts when executing an IRQ vector, 785 * the CPU automatically disables interrupts when executing an IRQ vector,
653 * so IRQ-flags tracing has to follow that: 786 * so IRQ-flags tracing has to follow that:
654 */ 787 */
655 ALIGN 788 .p2align CONFIG_X86_L1_CACHE_SHIFT
656common_interrupt: 789common_interrupt:
790 addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
657 SAVE_ALL 791 SAVE_ALL
658 TRACE_IRQS_OFF 792 TRACE_IRQS_OFF
659 movl %esp,%eax 793 movl %esp,%eax
@@ -662,7 +796,7 @@ common_interrupt:
662ENDPROC(common_interrupt) 796ENDPROC(common_interrupt)
663 CFI_ENDPROC 797 CFI_ENDPROC
664 798
665#define BUILD_INTERRUPT(name, nr) \ 799#define BUILD_INTERRUPT3(name, nr, fn) \
666ENTRY(name) \ 800ENTRY(name) \
667 RING0_INT_FRAME; \ 801 RING0_INT_FRAME; \
668 pushl $~(nr); \ 802 pushl $~(nr); \
@@ -670,72 +804,15 @@ ENTRY(name) \
670 SAVE_ALL; \ 804 SAVE_ALL; \
671 TRACE_IRQS_OFF \ 805 TRACE_IRQS_OFF \
672 movl %esp,%eax; \ 806 movl %esp,%eax; \
673 call smp_##name; \ 807 call fn; \
674 jmp ret_from_intr; \ 808 jmp ret_from_intr; \
675 CFI_ENDPROC; \ 809 CFI_ENDPROC; \
676ENDPROC(name) 810ENDPROC(name)
677 811
678/* The include is where all of the SMP etc. interrupts come from */ 812#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name)
679#include "entry_arch.h"
680 813
681KPROBE_ENTRY(page_fault) 814/* The include is where all of the SMP etc. interrupts come from */
682 RING0_EC_FRAME 815#include <asm/entry_arch.h>
683 pushl $do_page_fault
684 CFI_ADJUST_CFA_OFFSET 4
685 ALIGN
686error_code:
687 /* the function address is in %fs's slot on the stack */
688 pushl %es
689 CFI_ADJUST_CFA_OFFSET 4
690 /*CFI_REL_OFFSET es, 0*/
691 pushl %ds
692 CFI_ADJUST_CFA_OFFSET 4
693 /*CFI_REL_OFFSET ds, 0*/
694 pushl %eax
695 CFI_ADJUST_CFA_OFFSET 4
696 CFI_REL_OFFSET eax, 0
697 pushl %ebp
698 CFI_ADJUST_CFA_OFFSET 4
699 CFI_REL_OFFSET ebp, 0
700 pushl %edi
701 CFI_ADJUST_CFA_OFFSET 4
702 CFI_REL_OFFSET edi, 0
703 pushl %esi
704 CFI_ADJUST_CFA_OFFSET 4
705 CFI_REL_OFFSET esi, 0
706 pushl %edx
707 CFI_ADJUST_CFA_OFFSET 4
708 CFI_REL_OFFSET edx, 0
709 pushl %ecx
710 CFI_ADJUST_CFA_OFFSET 4
711 CFI_REL_OFFSET ecx, 0
712 pushl %ebx
713 CFI_ADJUST_CFA_OFFSET 4
714 CFI_REL_OFFSET ebx, 0
715 cld
716 pushl %fs
717 CFI_ADJUST_CFA_OFFSET 4
718 /*CFI_REL_OFFSET fs, 0*/
719 movl $(__KERNEL_PERCPU), %ecx
720 movl %ecx, %fs
721 UNWIND_ESPFIX_STACK
722 popl %ecx
723 CFI_ADJUST_CFA_OFFSET -4
724 /*CFI_REGISTER es, ecx*/
725 movl PT_FS(%esp), %edi # get the function address
726 movl PT_ORIG_EAX(%esp), %edx # get the error code
727 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
728 mov %ecx, PT_FS(%esp)
729 /*CFI_REL_OFFSET fs, ES*/
730 movl $(__USER_DS), %ecx
731 movl %ecx, %ds
732 movl %ecx, %es
733 TRACE_IRQS_OFF
734 movl %esp,%eax # pt_regs pointer
735 call *%edi
736 jmp ret_from_exception
737 CFI_ENDPROC
738KPROBE_END(page_fault)
739 816
740ENTRY(coprocessor_error) 817ENTRY(coprocessor_error)
741 RING0_INT_FRAME 818 RING0_INT_FRAME
@@ -767,140 +844,6 @@ ENTRY(device_not_available)
767 CFI_ENDPROC 844 CFI_ENDPROC
768END(device_not_available) 845END(device_not_available)
769 846
770/*
771 * Debug traps and NMI can happen at the one SYSENTER instruction
772 * that sets up the real kernel stack. Check here, since we can't
773 * allow the wrong stack to be used.
774 *
775 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
776 * already pushed 3 words if it hits on the sysenter instruction:
777 * eflags, cs and eip.
778 *
779 * We just load the right stack, and push the three (known) values
780 * by hand onto the new stack - while updating the return eip past
781 * the instruction that would have done it for sysenter.
782 */
783#define FIX_STACK(offset, ok, label) \
784 cmpw $__KERNEL_CS,4(%esp); \
785 jne ok; \
786label: \
787 movl TSS_sysenter_sp0+offset(%esp),%esp; \
788 CFI_DEF_CFA esp, 0; \
789 CFI_UNDEFINED eip; \
790 pushfl; \
791 CFI_ADJUST_CFA_OFFSET 4; \
792 pushl $__KERNEL_CS; \
793 CFI_ADJUST_CFA_OFFSET 4; \
794 pushl $sysenter_past_esp; \
795 CFI_ADJUST_CFA_OFFSET 4; \
796 CFI_REL_OFFSET eip, 0
797
798KPROBE_ENTRY(debug)
799 RING0_INT_FRAME
800 cmpl $ia32_sysenter_target,(%esp)
801 jne debug_stack_correct
802 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
803debug_stack_correct:
804 pushl $-1 # mark this as an int
805 CFI_ADJUST_CFA_OFFSET 4
806 SAVE_ALL
807 TRACE_IRQS_OFF
808 xorl %edx,%edx # error code 0
809 movl %esp,%eax # pt_regs pointer
810 call do_debug
811 jmp ret_from_exception
812 CFI_ENDPROC
813KPROBE_END(debug)
814
815/*
816 * NMI is doubly nasty. It can happen _while_ we're handling
817 * a debug fault, and the debug fault hasn't yet been able to
818 * clear up the stack. So we first check whether we got an
819 * NMI on the sysenter entry path, but after that we need to
820 * check whether we got an NMI on the debug path where the debug
821 * fault happened on the sysenter path.
822 */
823KPROBE_ENTRY(nmi)
824 RING0_INT_FRAME
825 pushl %eax
826 CFI_ADJUST_CFA_OFFSET 4
827 movl %ss, %eax
828 cmpw $__ESPFIX_SS, %ax
829 popl %eax
830 CFI_ADJUST_CFA_OFFSET -4
831 je nmi_espfix_stack
832 cmpl $ia32_sysenter_target,(%esp)
833 je nmi_stack_fixup
834 pushl %eax
835 CFI_ADJUST_CFA_OFFSET 4
836 movl %esp,%eax
837 /* Do not access memory above the end of our stack page,
838 * it might not exist.
839 */
840 andl $(THREAD_SIZE-1),%eax
841 cmpl $(THREAD_SIZE-20),%eax
842 popl %eax
843 CFI_ADJUST_CFA_OFFSET -4
844 jae nmi_stack_correct
845 cmpl $ia32_sysenter_target,12(%esp)
846 je nmi_debug_stack_check
847nmi_stack_correct:
848 /* We have a RING0_INT_FRAME here */
849 pushl %eax
850 CFI_ADJUST_CFA_OFFSET 4
851 SAVE_ALL
852 TRACE_IRQS_OFF
853 xorl %edx,%edx # zero error code
854 movl %esp,%eax # pt_regs pointer
855 call do_nmi
856 jmp restore_nocheck_notrace
857 CFI_ENDPROC
858
859nmi_stack_fixup:
860 RING0_INT_FRAME
861 FIX_STACK(12,nmi_stack_correct, 1)
862 jmp nmi_stack_correct
863
864nmi_debug_stack_check:
865 /* We have a RING0_INT_FRAME here */
866 cmpw $__KERNEL_CS,16(%esp)
867 jne nmi_stack_correct
868 cmpl $debug,(%esp)
869 jb nmi_stack_correct
870 cmpl $debug_esp_fix_insn,(%esp)
871 ja nmi_stack_correct
872 FIX_STACK(24,nmi_stack_correct, 1)
873 jmp nmi_stack_correct
874
875nmi_espfix_stack:
876 /* We have a RING0_INT_FRAME here.
877 *
878 * create the pointer to lss back
879 */
880 pushl %ss
881 CFI_ADJUST_CFA_OFFSET 4
882 pushl %esp
883 CFI_ADJUST_CFA_OFFSET 4
884 addw $4, (%esp)
885 /* copy the iret frame of 12 bytes */
886 .rept 3
887 pushl 16(%esp)
888 CFI_ADJUST_CFA_OFFSET 4
889 .endr
890 pushl %eax
891 CFI_ADJUST_CFA_OFFSET 4
892 SAVE_ALL
893 TRACE_IRQS_OFF
894 FIXUP_ESPFIX_STACK # %eax == %esp
895 xorl %edx,%edx # zero error code
896 call do_nmi
897 RESTORE_REGS
898 lss 12+4(%esp), %esp # back to espfix stack
899 CFI_ADJUST_CFA_OFFSET -24
900 jmp irq_return
901 CFI_ENDPROC
902KPROBE_END(nmi)
903
904#ifdef CONFIG_PARAVIRT 847#ifdef CONFIG_PARAVIRT
905ENTRY(native_iret) 848ENTRY(native_iret)
906 iret 849 iret
@@ -916,19 +859,6 @@ ENTRY(native_irq_enable_sysexit)
916END(native_irq_enable_sysexit) 859END(native_irq_enable_sysexit)
917#endif 860#endif
918 861
919KPROBE_ENTRY(int3)
920 RING0_INT_FRAME
921 pushl $-1 # mark this as an int
922 CFI_ADJUST_CFA_OFFSET 4
923 SAVE_ALL
924 TRACE_IRQS_OFF
925 xorl %edx,%edx # zero error code
926 movl %esp,%eax # pt_regs pointer
927 call do_int3
928 jmp ret_from_exception
929 CFI_ENDPROC
930KPROBE_END(int3)
931
932ENTRY(overflow) 862ENTRY(overflow)
933 RING0_INT_FRAME 863 RING0_INT_FRAME
934 pushl $0 864 pushl $0
@@ -993,14 +923,6 @@ ENTRY(stack_segment)
993 CFI_ENDPROC 923 CFI_ENDPROC
994END(stack_segment) 924END(stack_segment)
995 925
996KPROBE_ENTRY(general_protection)
997 RING0_EC_FRAME
998 pushl $do_general_protection
999 CFI_ADJUST_CFA_OFFSET 4
1000 jmp error_code
1001 CFI_ENDPROC
1002KPROBE_END(general_protection)
1003
1004ENTRY(alignment_check) 926ENTRY(alignment_check)
1005 RING0_EC_FRAME 927 RING0_EC_FRAME
1006 pushl $do_alignment_check 928 pushl $do_alignment_check
@@ -1051,6 +973,7 @@ ENTRY(kernel_thread_helper)
1051 push %eax 973 push %eax
1052 CFI_ADJUST_CFA_OFFSET 4 974 CFI_ADJUST_CFA_OFFSET 4
1053 call do_exit 975 call do_exit
976 ud2 # padding for call trace
1054 CFI_ENDPROC 977 CFI_ENDPROC
1055ENDPROC(kernel_thread_helper) 978ENDPROC(kernel_thread_helper)
1056 979
@@ -1157,6 +1080,9 @@ ENTRY(mcount)
1157END(mcount) 1080END(mcount)
1158 1081
1159ENTRY(ftrace_caller) 1082ENTRY(ftrace_caller)
1083 cmpl $0, function_trace_stop
1084 jne ftrace_stub
1085
1160 pushl %eax 1086 pushl %eax
1161 pushl %ecx 1087 pushl %ecx
1162 pushl %edx 1088 pushl %edx
@@ -1171,6 +1097,11 @@ ftrace_call:
1171 popl %edx 1097 popl %edx
1172 popl %ecx 1098 popl %ecx
1173 popl %eax 1099 popl %eax
1100#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1101.globl ftrace_graph_call
1102ftrace_graph_call:
1103 jmp ftrace_stub
1104#endif
1174 1105
1175.globl ftrace_stub 1106.globl ftrace_stub
1176ftrace_stub: 1107ftrace_stub:
@@ -1180,8 +1111,18 @@ END(ftrace_caller)
1180#else /* ! CONFIG_DYNAMIC_FTRACE */ 1111#else /* ! CONFIG_DYNAMIC_FTRACE */
1181 1112
1182ENTRY(mcount) 1113ENTRY(mcount)
1114 cmpl $0, function_trace_stop
1115 jne ftrace_stub
1116
1183 cmpl $ftrace_stub, ftrace_trace_function 1117 cmpl $ftrace_stub, ftrace_trace_function
1184 jnz trace 1118 jnz trace
1119#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1120 cmpl $ftrace_stub, ftrace_graph_return
1121 jnz ftrace_graph_caller
1122
1123 cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
1124 jnz ftrace_graph_caller
1125#endif
1185.globl ftrace_stub 1126.globl ftrace_stub
1186ftrace_stub: 1127ftrace_stub:
1187 ret 1128 ret
@@ -1200,13 +1141,265 @@ trace:
1200 popl %edx 1141 popl %edx
1201 popl %ecx 1142 popl %ecx
1202 popl %eax 1143 popl %eax
1203
1204 jmp ftrace_stub 1144 jmp ftrace_stub
1205END(mcount) 1145END(mcount)
1206#endif /* CONFIG_DYNAMIC_FTRACE */ 1146#endif /* CONFIG_DYNAMIC_FTRACE */
1207#endif /* CONFIG_FUNCTION_TRACER */ 1147#endif /* CONFIG_FUNCTION_TRACER */
1208 1148
1149#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1150ENTRY(ftrace_graph_caller)
1151 cmpl $0, function_trace_stop
1152 jne ftrace_stub
1153
1154 pushl %eax
1155 pushl %ecx
1156 pushl %edx
1157 movl 0xc(%esp), %edx
1158 lea 0x4(%ebp), %eax
1159 subl $MCOUNT_INSN_SIZE, %edx
1160 call prepare_ftrace_return
1161 popl %edx
1162 popl %ecx
1163 popl %eax
1164 ret
1165END(ftrace_graph_caller)
1166
1167.globl return_to_handler
1168return_to_handler:
1169 pushl $0
1170 pushl %eax
1171 pushl %ecx
1172 pushl %edx
1173 call ftrace_return_to_handler
1174 movl %eax, 0xc(%esp)
1175 popl %edx
1176 popl %ecx
1177 popl %eax
1178 ret
1179#endif
1180
1209.section .rodata,"a" 1181.section .rodata,"a"
1210#include "syscall_table_32.S" 1182#include "syscall_table_32.S"
1211 1183
1212syscall_table_size=(.-sys_call_table) 1184syscall_table_size=(.-sys_call_table)
1185
1186/*
1187 * Some functions should be protected against kprobes
1188 */
1189 .pushsection .kprobes.text, "ax"
1190
1191ENTRY(page_fault)
1192 RING0_EC_FRAME
1193 pushl $do_page_fault
1194 CFI_ADJUST_CFA_OFFSET 4
1195 ALIGN
1196error_code:
1197 /* the function address is in %gs's slot on the stack */
1198 pushl %fs
1199 CFI_ADJUST_CFA_OFFSET 4
1200 /*CFI_REL_OFFSET fs, 0*/
1201 pushl %es
1202 CFI_ADJUST_CFA_OFFSET 4
1203 /*CFI_REL_OFFSET es, 0*/
1204 pushl %ds
1205 CFI_ADJUST_CFA_OFFSET 4
1206 /*CFI_REL_OFFSET ds, 0*/
1207 pushl %eax
1208 CFI_ADJUST_CFA_OFFSET 4
1209 CFI_REL_OFFSET eax, 0
1210 pushl %ebp
1211 CFI_ADJUST_CFA_OFFSET 4
1212 CFI_REL_OFFSET ebp, 0
1213 pushl %edi
1214 CFI_ADJUST_CFA_OFFSET 4
1215 CFI_REL_OFFSET edi, 0
1216 pushl %esi
1217 CFI_ADJUST_CFA_OFFSET 4
1218 CFI_REL_OFFSET esi, 0
1219 pushl %edx
1220 CFI_ADJUST_CFA_OFFSET 4
1221 CFI_REL_OFFSET edx, 0
1222 pushl %ecx
1223 CFI_ADJUST_CFA_OFFSET 4
1224 CFI_REL_OFFSET ecx, 0
1225 pushl %ebx
1226 CFI_ADJUST_CFA_OFFSET 4
1227 CFI_REL_OFFSET ebx, 0
1228 cld
1229 movl $(__KERNEL_PERCPU), %ecx
1230 movl %ecx, %fs
1231 UNWIND_ESPFIX_STACK
1232 GS_TO_REG %ecx
1233 movl PT_GS(%esp), %edi # get the function address
1234 movl PT_ORIG_EAX(%esp), %edx # get the error code
1235 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
1236 REG_TO_PTGS %ecx
1237 SET_KERNEL_GS %ecx
1238 movl $(__USER_DS), %ecx
1239 movl %ecx, %ds
1240 movl %ecx, %es
1241 TRACE_IRQS_OFF
1242 movl %esp,%eax # pt_regs pointer
1243 call *%edi
1244 jmp ret_from_exception
1245 CFI_ENDPROC
1246END(page_fault)
1247
1248/*
1249 * Debug traps and NMI can happen at the one SYSENTER instruction
1250 * that sets up the real kernel stack. Check here, since we can't
1251 * allow the wrong stack to be used.
1252 *
1253 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
1254 * already pushed 3 words if it hits on the sysenter instruction:
1255 * eflags, cs and eip.
1256 *
1257 * We just load the right stack, and push the three (known) values
1258 * by hand onto the new stack - while updating the return eip past
1259 * the instruction that would have done it for sysenter.
1260 */
1261.macro FIX_STACK offset ok label
1262 cmpw $__KERNEL_CS, 4(%esp)
1263 jne \ok
1264\label:
1265 movl TSS_sysenter_sp0 + \offset(%esp), %esp
1266 CFI_DEF_CFA esp, 0
1267 CFI_UNDEFINED eip
1268 pushfl
1269 CFI_ADJUST_CFA_OFFSET 4
1270 pushl $__KERNEL_CS
1271 CFI_ADJUST_CFA_OFFSET 4
1272 pushl $sysenter_past_esp
1273 CFI_ADJUST_CFA_OFFSET 4
1274 CFI_REL_OFFSET eip, 0
1275.endm
1276
1277ENTRY(debug)
1278 RING0_INT_FRAME
1279 cmpl $ia32_sysenter_target,(%esp)
1280 jne debug_stack_correct
1281 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
1282debug_stack_correct:
1283 pushl $-1 # mark this as an int
1284 CFI_ADJUST_CFA_OFFSET 4
1285 SAVE_ALL
1286 TRACE_IRQS_OFF
1287 xorl %edx,%edx # error code 0
1288 movl %esp,%eax # pt_regs pointer
1289 call do_debug
1290 jmp ret_from_exception
1291 CFI_ENDPROC
1292END(debug)
1293
1294/*
1295 * NMI is doubly nasty. It can happen _while_ we're handling
1296 * a debug fault, and the debug fault hasn't yet been able to
1297 * clear up the stack. So we first check whether we got an
1298 * NMI on the sysenter entry path, but after that we need to
1299 * check whether we got an NMI on the debug path where the debug
1300 * fault happened on the sysenter path.
1301 */
1302ENTRY(nmi)
1303 RING0_INT_FRAME
1304 pushl %eax
1305 CFI_ADJUST_CFA_OFFSET 4
1306 movl %ss, %eax
1307 cmpw $__ESPFIX_SS, %ax
1308 popl %eax
1309 CFI_ADJUST_CFA_OFFSET -4
1310 je nmi_espfix_stack
1311 cmpl $ia32_sysenter_target,(%esp)
1312 je nmi_stack_fixup
1313 pushl %eax
1314 CFI_ADJUST_CFA_OFFSET 4
1315 movl %esp,%eax
1316 /* Do not access memory above the end of our stack page,
1317 * it might not exist.
1318 */
1319 andl $(THREAD_SIZE-1),%eax
1320 cmpl $(THREAD_SIZE-20),%eax
1321 popl %eax
1322 CFI_ADJUST_CFA_OFFSET -4
1323 jae nmi_stack_correct
1324 cmpl $ia32_sysenter_target,12(%esp)
1325 je nmi_debug_stack_check
1326nmi_stack_correct:
1327 /* We have a RING0_INT_FRAME here */
1328 pushl %eax
1329 CFI_ADJUST_CFA_OFFSET 4
1330 SAVE_ALL
1331 xorl %edx,%edx # zero error code
1332 movl %esp,%eax # pt_regs pointer
1333 call do_nmi
1334 jmp restore_nocheck_notrace
1335 CFI_ENDPROC
1336
1337nmi_stack_fixup:
1338 RING0_INT_FRAME
1339 FIX_STACK 12, nmi_stack_correct, 1
1340 jmp nmi_stack_correct
1341
1342nmi_debug_stack_check:
1343 /* We have a RING0_INT_FRAME here */
1344 cmpw $__KERNEL_CS,16(%esp)
1345 jne nmi_stack_correct
1346 cmpl $debug,(%esp)
1347 jb nmi_stack_correct
1348 cmpl $debug_esp_fix_insn,(%esp)
1349 ja nmi_stack_correct
1350 FIX_STACK 24, nmi_stack_correct, 1
1351 jmp nmi_stack_correct
1352
1353nmi_espfix_stack:
1354 /* We have a RING0_INT_FRAME here.
1355 *
1356 * create the pointer to lss back
1357 */
1358 pushl %ss
1359 CFI_ADJUST_CFA_OFFSET 4
1360 pushl %esp
1361 CFI_ADJUST_CFA_OFFSET 4
1362 addw $4, (%esp)
1363 /* copy the iret frame of 12 bytes */
1364 .rept 3
1365 pushl 16(%esp)
1366 CFI_ADJUST_CFA_OFFSET 4
1367 .endr
1368 pushl %eax
1369 CFI_ADJUST_CFA_OFFSET 4
1370 SAVE_ALL
1371 FIXUP_ESPFIX_STACK # %eax == %esp
1372 xorl %edx,%edx # zero error code
1373 call do_nmi
1374 RESTORE_REGS
1375 lss 12+4(%esp), %esp # back to espfix stack
1376 CFI_ADJUST_CFA_OFFSET -24
1377 jmp irq_return
1378 CFI_ENDPROC
1379END(nmi)
1380
1381ENTRY(int3)
1382 RING0_INT_FRAME
1383 pushl $-1 # mark this as an int
1384 CFI_ADJUST_CFA_OFFSET 4
1385 SAVE_ALL
1386 TRACE_IRQS_OFF
1387 xorl %edx,%edx # zero error code
1388 movl %esp,%eax # pt_regs pointer
1389 call do_int3
1390 jmp ret_from_exception
1391 CFI_ENDPROC
1392END(int3)
1393
1394ENTRY(general_protection)
1395 RING0_EC_FRAME
1396 pushl $do_general_protection
1397 CFI_ADJUST_CFA_OFFSET 4
1398 jmp error_code
1399 CFI_ENDPROC
1400END(general_protection)
1401
1402/*
1403 * End of kprobes section
1404 */
1405 .popsection
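
Editor's note: the irq_entries_start rework above leans on two bits of arithmetic: seven stubs per 32-byte chunk, and a vector encoding of ~vector + 0x80 that always fits a signed byte, so the short push form can be used and common_interrupt undoes the bias with addl $-0x80,(%esp). A small C check of both, assuming the usual FIRST_EXTERNAL_VECTOR = 0x20 and NR_VECTORS = 256:

#include <stdio.h>

#define FIRST_EXTERNAL_VECTOR 0x20	/* usual values, assumed here */
#define NR_VECTORS            256

int main(void)
{
	/* same expression as the .rept count in irq_entries_start */
	int chunks = (NR_VECTORS - FIRST_EXTERNAL_VECTOR + 6) / 7;
	int vector;

	printf("%d chunks of 7 stubs cover vectors 0x%02x..0x%02x\n",
	       chunks, FIRST_EXTERNAL_VECTOR, NR_VECTORS - 1);

	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
		int pushed = ~vector + 0x80;	/* value pushed by the stub */
		int adjusted = pushed - 0x80;	/* addl $-0x80 in common_interrupt */
		int recovered = ~adjusted & 0xff;	/* back to the original vector */

		if (pushed < -128 || pushed > 127 || recovered != vector)
			printf("vector 0x%02x does not round-trip\n", vector);
	}
	return 0;
}
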
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b86f332c96a6..fbcf96b295ff 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -11,15 +11,15 @@
11 * 11 *
12 * NOTE: This code handles signal-recognition, which happens every time 12 * NOTE: This code handles signal-recognition, which happens every time
13 * after an interrupt and after each system call. 13 * after an interrupt and after each system call.
14 * 14 *
15 * Normal syscalls and interrupts don't save a full stack frame, this is 15 * Normal syscalls and interrupts don't save a full stack frame, this is
16 * only done for syscall tracing, signals or fork/exec et.al. 16 * only done for syscall tracing, signals or fork/exec et.al.
17 * 17 *
18 * A note on terminology: 18 * A note on terminology:
19 * - top of stack: Architecture defined interrupt frame from SS to RIP 19 * - top of stack: Architecture defined interrupt frame from SS to RIP
20 * at the top of the kernel process stack. 20 * at the top of the kernel process stack.
21 * - partial stack frame: partially saved registers upto R11. 21 * - partial stack frame: partially saved registers upto R11.
22 * - full stack frame: Like partial stack frame, but all register saved. 22 * - full stack frame: Like partial stack frame, but all register saved.
23 * 23 *
24 * Some macro usage: 24 * Some macro usage:
25 * - CFI macros are used to generate dwarf2 unwind information for better 25 * - CFI macros are used to generate dwarf2 unwind information for better
@@ -52,6 +52,7 @@
52#include <asm/irqflags.h> 52#include <asm/irqflags.h>
53#include <asm/paravirt.h> 53#include <asm/paravirt.h>
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/percpu.h>
55 56
56/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
57#include <linux/elf-em.h> 58#include <linux/elf-em.h>
@@ -60,7 +61,6 @@
60#define __AUDIT_ARCH_LE 0x40000000 61#define __AUDIT_ARCH_LE 0x40000000
61 62
62 .code64 63 .code64
63
64#ifdef CONFIG_FUNCTION_TRACER 64#ifdef CONFIG_FUNCTION_TRACER
65#ifdef CONFIG_DYNAMIC_FTRACE 65#ifdef CONFIG_DYNAMIC_FTRACE
66ENTRY(mcount) 66ENTRY(mcount)
@@ -68,16 +68,10 @@ ENTRY(mcount)
68END(mcount) 68END(mcount)
69 69
70ENTRY(ftrace_caller) 70ENTRY(ftrace_caller)
71 cmpl $0, function_trace_stop
72 jne ftrace_stub
71 73
72 /* taken from glibc */ 74 MCOUNT_SAVE_FRAME
73 subq $0x38, %rsp
74 movq %rax, (%rsp)
75 movq %rcx, 8(%rsp)
76 movq %rdx, 16(%rsp)
77 movq %rsi, 24(%rsp)
78 movq %rdi, 32(%rsp)
79 movq %r8, 40(%rsp)
80 movq %r9, 48(%rsp)
81 75
82 movq 0x38(%rsp), %rdi 76 movq 0x38(%rsp), %rdi
83 movq 8(%rbp), %rsi 77 movq 8(%rbp), %rsi
@@ -87,14 +81,13 @@ ENTRY(ftrace_caller)
87ftrace_call: 81ftrace_call:
88 call ftrace_stub 82 call ftrace_stub
89 83
90 movq 48(%rsp), %r9 84 MCOUNT_RESTORE_FRAME
91 movq 40(%rsp), %r8 85
92 movq 32(%rsp), %rdi 86#ifdef CONFIG_FUNCTION_GRAPH_TRACER
93 movq 24(%rsp), %rsi 87.globl ftrace_graph_call
94 movq 16(%rsp), %rdx 88ftrace_graph_call:
95 movq 8(%rsp), %rcx 89 jmp ftrace_stub
96 movq (%rsp), %rax 90#endif
97 addq $0x38, %rsp
98 91
99.globl ftrace_stub 92.globl ftrace_stub
100ftrace_stub: 93ftrace_stub:
@@ -103,15 +96,63 @@ END(ftrace_caller)
103 96
104#else /* ! CONFIG_DYNAMIC_FTRACE */ 97#else /* ! CONFIG_DYNAMIC_FTRACE */
105ENTRY(mcount) 98ENTRY(mcount)
99 cmpl $0, function_trace_stop
100 jne ftrace_stub
101
106 cmpq $ftrace_stub, ftrace_trace_function 102 cmpq $ftrace_stub, ftrace_trace_function
107 jnz trace 103 jnz trace
104
105#ifdef CONFIG_FUNCTION_GRAPH_TRACER
106 cmpq $ftrace_stub, ftrace_graph_return
107 jnz ftrace_graph_caller
108
109 cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
110 jnz ftrace_graph_caller
111#endif
112
108.globl ftrace_stub 113.globl ftrace_stub
109ftrace_stub: 114ftrace_stub:
110 retq 115 retq
111 116
112trace: 117trace:
113 /* taken from glibc */ 118 MCOUNT_SAVE_FRAME
114 subq $0x38, %rsp 119
120 movq 0x38(%rsp), %rdi
121 movq 8(%rbp), %rsi
122 subq $MCOUNT_INSN_SIZE, %rdi
123
124 call *ftrace_trace_function
125
126 MCOUNT_RESTORE_FRAME
127
128 jmp ftrace_stub
129END(mcount)
130#endif /* CONFIG_DYNAMIC_FTRACE */
131#endif /* CONFIG_FUNCTION_TRACER */
132
133#ifdef CONFIG_FUNCTION_GRAPH_TRACER
134ENTRY(ftrace_graph_caller)
135 cmpl $0, function_trace_stop
136 jne ftrace_stub
137
138 MCOUNT_SAVE_FRAME
139
140 leaq 8(%rbp), %rdi
141 movq 0x38(%rsp), %rsi
142 subq $MCOUNT_INSN_SIZE, %rsi
143
144 call prepare_ftrace_return
145
146 MCOUNT_RESTORE_FRAME
147
148 retq
149END(ftrace_graph_caller)
150
151
152.globl return_to_handler
153return_to_handler:
154 subq $80, %rsp
155
115 movq %rax, (%rsp) 156 movq %rax, (%rsp)
116 movq %rcx, 8(%rsp) 157 movq %rcx, 8(%rsp)
117 movq %rdx, 16(%rsp) 158 movq %rdx, 16(%rsp)
@@ -119,13 +160,14 @@ trace:
119 movq %rdi, 32(%rsp) 160 movq %rdi, 32(%rsp)
120 movq %r8, 40(%rsp) 161 movq %r8, 40(%rsp)
121 movq %r9, 48(%rsp) 162 movq %r9, 48(%rsp)
163 movq %r10, 56(%rsp)
164 movq %r11, 64(%rsp)
122 165
123 movq 0x38(%rsp), %rdi 166 call ftrace_return_to_handler
124 movq 8(%rbp), %rsi
125 subq $MCOUNT_INSN_SIZE, %rdi
126
127 call *ftrace_trace_function
128 167
168 movq %rax, 72(%rsp)
169 movq 64(%rsp), %r11
170 movq 56(%rsp), %r10
129 movq 48(%rsp), %r9 171 movq 48(%rsp), %r9
130 movq 40(%rsp), %r8 172 movq 40(%rsp), %r8
131 movq 32(%rsp), %rdi 173 movq 32(%rsp), %rdi
@@ -133,16 +175,14 @@ trace:
133 movq 16(%rsp), %rdx 175 movq 16(%rsp), %rdx
134 movq 8(%rsp), %rcx 176 movq 8(%rsp), %rcx
135 movq (%rsp), %rax 177 movq (%rsp), %rax
136 addq $0x38, %rsp 178 addq $72, %rsp
179 retq
180#endif
137 181
138 jmp ftrace_stub
139END(mcount)
140#endif /* CONFIG_DYNAMIC_FTRACE */
141#endif /* CONFIG_FUNCTION_TRACER */
142 182
143#ifndef CONFIG_PREEMPT 183#ifndef CONFIG_PREEMPT
144#define retint_kernel retint_restore_args 184#define retint_kernel retint_restore_args
145#endif 185#endif
146 186
147#ifdef CONFIG_PARAVIRT 187#ifdef CONFIG_PARAVIRT
148ENTRY(native_usergs_sysret64) 188ENTRY(native_usergs_sysret64)
@@ -161,29 +201,29 @@ ENTRY(native_usergs_sysret64)
161.endm 201.endm
162 202
163/* 203/*
164 * C code is not supposed to know about undefined top of stack. Every time 204 * C code is not supposed to know about undefined top of stack. Every time
165 * a C function with an pt_regs argument is called from the SYSCALL based 205 * a C function with an pt_regs argument is called from the SYSCALL based
166 * fast path FIXUP_TOP_OF_STACK is needed. 206 * fast path FIXUP_TOP_OF_STACK is needed.
167 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs 207 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
168 * manipulation. 208 * manipulation.
169 */ 209 */
170 210
171 /* %rsp:at FRAMEEND */ 211 /* %rsp:at FRAMEEND */
172 .macro FIXUP_TOP_OF_STACK tmp 212 .macro FIXUP_TOP_OF_STACK tmp offset=0
173 movq %gs:pda_oldrsp,\tmp 213 movq PER_CPU_VAR(old_rsp),\tmp
174 movq \tmp,RSP(%rsp) 214 movq \tmp,RSP+\offset(%rsp)
175 movq $__USER_DS,SS(%rsp) 215 movq $__USER_DS,SS+\offset(%rsp)
176 movq $__USER_CS,CS(%rsp) 216 movq $__USER_CS,CS+\offset(%rsp)
177 movq $-1,RCX(%rsp) 217 movq $-1,RCX+\offset(%rsp)
178 movq R11(%rsp),\tmp /* get eflags */ 218 movq R11+\offset(%rsp),\tmp /* get eflags */
179 movq \tmp,EFLAGS(%rsp) 219 movq \tmp,EFLAGS+\offset(%rsp)
180 .endm 220 .endm
181 221
182 .macro RESTORE_TOP_OF_STACK tmp,offset=0 222 .macro RESTORE_TOP_OF_STACK tmp offset=0
183 movq RSP-\offset(%rsp),\tmp 223 movq RSP+\offset(%rsp),\tmp
184 movq \tmp,%gs:pda_oldrsp 224 movq \tmp,PER_CPU_VAR(old_rsp)
185 movq EFLAGS-\offset(%rsp),\tmp 225 movq EFLAGS+\offset(%rsp),\tmp
186 movq \tmp,R11-\offset(%rsp) 226 movq \tmp,R11+\offset(%rsp)
187 .endm 227 .endm
188 228
189 .macro FAKE_STACK_FRAME child_rip 229 .macro FAKE_STACK_FRAME child_rip
@@ -195,7 +235,7 @@ ENTRY(native_usergs_sysret64)
195 pushq %rax /* rsp */ 235 pushq %rax /* rsp */
196 CFI_ADJUST_CFA_OFFSET 8 236 CFI_ADJUST_CFA_OFFSET 8
197 CFI_REL_OFFSET rsp,0 237 CFI_REL_OFFSET rsp,0
198 pushq $(1<<9) /* eflags - interrupts on */ 238 pushq $X86_EFLAGS_IF /* eflags - interrupts on */
199 CFI_ADJUST_CFA_OFFSET 8 239 CFI_ADJUST_CFA_OFFSET 8
200 /*CFI_REL_OFFSET rflags,0*/ 240 /*CFI_REL_OFFSET rflags,0*/
201 pushq $__KERNEL_CS /* cs */ 241 pushq $__KERNEL_CS /* cs */
@@ -213,62 +253,187 @@ ENTRY(native_usergs_sysret64)
213 CFI_ADJUST_CFA_OFFSET -(6*8) 253 CFI_ADJUST_CFA_OFFSET -(6*8)
214 .endm 254 .endm
215 255
216 .macro CFI_DEFAULT_STACK start=1 256/*
257 * initial frame state for interrupts (and exceptions without error code)
258 */
259 .macro EMPTY_FRAME start=1 offset=0
217 .if \start 260 .if \start
218 CFI_STARTPROC simple 261 CFI_STARTPROC simple
219 CFI_SIGNAL_FRAME 262 CFI_SIGNAL_FRAME
220 CFI_DEF_CFA rsp,SS+8 263 CFI_DEF_CFA rsp,8+\offset
221 .else 264 .else
222 CFI_DEF_CFA_OFFSET SS+8 265 CFI_DEF_CFA_OFFSET 8+\offset
223 .endif 266 .endif
224 CFI_REL_OFFSET r15,R15
225 CFI_REL_OFFSET r14,R14
226 CFI_REL_OFFSET r13,R13
227 CFI_REL_OFFSET r12,R12
228 CFI_REL_OFFSET rbp,RBP
229 CFI_REL_OFFSET rbx,RBX
230 CFI_REL_OFFSET r11,R11
231 CFI_REL_OFFSET r10,R10
232 CFI_REL_OFFSET r9,R9
233 CFI_REL_OFFSET r8,R8
234 CFI_REL_OFFSET rax,RAX
235 CFI_REL_OFFSET rcx,RCX
236 CFI_REL_OFFSET rdx,RDX
237 CFI_REL_OFFSET rsi,RSI
238 CFI_REL_OFFSET rdi,RDI
239 CFI_REL_OFFSET rip,RIP
240 /*CFI_REL_OFFSET cs,CS*/
241 /*CFI_REL_OFFSET rflags,EFLAGS*/
242 CFI_REL_OFFSET rsp,RSP
243 /*CFI_REL_OFFSET ss,SS*/
244 .endm 267 .endm
268
269/*
270 * initial frame state for interrupts (and exceptions without error code)
271 */
272 .macro INTR_FRAME start=1 offset=0
273 EMPTY_FRAME \start, SS+8+\offset-RIP
274 /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
275 CFI_REL_OFFSET rsp, RSP+\offset-RIP
276 /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
277 /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
278 CFI_REL_OFFSET rip, RIP+\offset-RIP
279 .endm
280
281/*
282 * initial frame state for exceptions with error code (and interrupts
283 * with vector already pushed)
284 */
285 .macro XCPT_FRAME start=1 offset=0
286 INTR_FRAME \start, RIP+\offset-ORIG_RAX
287 /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
288 .endm
289
290/*
291 * frame that enables calling into C.
292 */
293 .macro PARTIAL_FRAME start=1 offset=0
294 XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
295 CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
296 CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
297 CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
298 CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
299 CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
300 CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
301 CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
302 CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
303 CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
304 .endm
305
245/* 306/*
246 * A newly forked process directly context switches into this. 307 * frame that enables passing a complete pt_regs to a C function.
247 */ 308 */
248/* rdi: prev */ 309 .macro DEFAULT_FRAME start=1 offset=0
310 PARTIAL_FRAME \start, R11+\offset-R15
311 CFI_REL_OFFSET rbx, RBX+\offset
312 CFI_REL_OFFSET rbp, RBP+\offset
313 CFI_REL_OFFSET r12, R12+\offset
314 CFI_REL_OFFSET r13, R13+\offset
315 CFI_REL_OFFSET r14, R14+\offset
316 CFI_REL_OFFSET r15, R15+\offset
317 .endm
318
319/* save partial stack frame */
320ENTRY(save_args)
321 XCPT_FRAME
322 cld
323 movq_cfi rdi, RDI+16-ARGOFFSET
324 movq_cfi rsi, RSI+16-ARGOFFSET
325 movq_cfi rdx, RDX+16-ARGOFFSET
326 movq_cfi rcx, RCX+16-ARGOFFSET
327 movq_cfi rax, RAX+16-ARGOFFSET
328 movq_cfi r8, R8+16-ARGOFFSET
329 movq_cfi r9, R9+16-ARGOFFSET
330 movq_cfi r10, R10+16-ARGOFFSET
331 movq_cfi r11, R11+16-ARGOFFSET
332
333 leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
334 movq_cfi rbp, 8 /* push %rbp */
335 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
336 testl $3, CS(%rdi)
337 je 1f
338 SWAPGS
339 /*
340 * irq_count is used to check if a CPU is already on an interrupt stack
341 * or not. While this is essentially redundant with preempt_count it is
342 * a little cheaper to use a separate counter in the PDA (short of
343 * moving irq_enter into assembly, which would be too much work)
344 */
3451: incl PER_CPU_VAR(irq_count)
346 jne 2f
347 popq_cfi %rax /* move return address... */
348 mov PER_CPU_VAR(irq_stack_ptr),%rsp
349 EMPTY_FRAME 0
350 pushq_cfi %rbp /* backlink for unwinder */
351 pushq_cfi %rax /* ... to the new stack */
352 /*
353 * We entered an interrupt context - irqs are off:
354 */
3552: TRACE_IRQS_OFF
356 ret
357 CFI_ENDPROC
358END(save_args)
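
save_args above stores the caller-clobbered argument registers (rdi, rsi, rdx, rcx, rax, r8-r11) plus rbp as a frame marker, and then decides whether to move onto the per-CPU interrupt stack; as its comment notes, irq_count exists purely to make that decision cheap. A rough C sketch of the counter logic, with plain globals standing in for the per-CPU data and the counter assumed to idle at -1 (which is what makes the incl/jne pair above switch stacks exactly once, on the outermost entry):

#include <stdio.h>

static int  irq_count = -1;                /* assumed idle value of the per-CPU counter */
static char irq_stack[16 * 1024];          /* stand-in for the per-CPU IRQ stack        */
static char * const irq_stack_ptr = irq_stack + sizeof(irq_stack);

/* mirrors "incl PER_CPU_VAR(irq_count); jne 2f; ...switch to irq_stack_ptr..." */
static char *sketch_enter_irq(char *current_sp)
{
        if (++irq_count != 0)
                return current_sp;         /* nested: already on the IRQ stack */
        return irq_stack_ptr;              /* outermost entry: switch stacks   */
}

/* mirrors "decl PER_CPU_VAR(irq_count)" on the exit path */
static void sketch_leave_irq(void)
{
        --irq_count;
}

int main(void)
{
        char task_stack[256];
        char *sp  = sketch_enter_irq(task_stack + sizeof(task_stack));
        char *sp2 = sketch_enter_irq(sp - 128);      /* simulate a nested IRQ */
        printf("outer switched: %d, nested stayed put: %d\n",
               sp == irq_stack_ptr, sp2 == sp - 128);
        sketch_leave_irq();
        sketch_leave_irq();
        return 0;
}
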
359
360ENTRY(save_rest)
361 PARTIAL_FRAME 1 REST_SKIP+8
362 movq 5*8+16(%rsp), %r11 /* save return address */
363 movq_cfi rbx, RBX+16
364 movq_cfi rbp, RBP+16
365 movq_cfi r12, R12+16
366 movq_cfi r13, R13+16
367 movq_cfi r14, R14+16
368 movq_cfi r15, R15+16
369 movq %r11, 8(%rsp) /* return address */
370 FIXUP_TOP_OF_STACK %r11, 16
371 ret
372 CFI_ENDPROC
373END(save_rest)
374
375/* save complete stack frame */
376ENTRY(save_paranoid)
377 XCPT_FRAME 1 RDI+8
378 cld
379 movq_cfi rdi, RDI+8
380 movq_cfi rsi, RSI+8
381 movq_cfi rdx, RDX+8
382 movq_cfi rcx, RCX+8
383 movq_cfi rax, RAX+8
384 movq_cfi r8, R8+8
385 movq_cfi r9, R9+8
386 movq_cfi r10, R10+8
387 movq_cfi r11, R11+8
388 movq_cfi rbx, RBX+8
389 movq_cfi rbp, RBP+8
390 movq_cfi r12, R12+8
391 movq_cfi r13, R13+8
392 movq_cfi r14, R14+8
393 movq_cfi r15, R15+8
394 movl $1,%ebx
395 movl $MSR_GS_BASE,%ecx
396 rdmsr
397 testl %edx,%edx
398 js 1f /* negative -> in kernel */
399 SWAPGS
400 xorl %ebx,%ebx
4011: ret
402 CFI_ENDPROC
403END(save_paranoid)
404
405/*
406 * A newly forked process directly context switches into this address.
407 *
408 * rdi: prev task we switched from
409 */
249ENTRY(ret_from_fork) 410ENTRY(ret_from_fork)
250 CFI_DEFAULT_STACK 411 DEFAULT_FRAME
412
413 LOCK ; btr $TIF_FORK,TI_flags(%r8)
414
251 push kernel_eflags(%rip) 415 push kernel_eflags(%rip)
252 CFI_ADJUST_CFA_OFFSET 8 416 CFI_ADJUST_CFA_OFFSET 8
253 popf # reset kernel eflags 417 popf # reset kernel eflags
254 CFI_ADJUST_CFA_OFFSET -8 418 CFI_ADJUST_CFA_OFFSET -8
255 call schedule_tail 419
420 call schedule_tail # rdi: 'prev' task parameter
421
256 GET_THREAD_INFO(%rcx) 422 GET_THREAD_INFO(%rcx)
257 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) 423
258 jnz rff_trace 424 CFI_REMEMBER_STATE
259rff_action:
260 RESTORE_REST 425 RESTORE_REST
261 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? 426
427 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
262 je int_ret_from_sys_call 428 je int_ret_from_sys_call
263 testl $_TIF_IA32,TI_flags(%rcx) 429
430 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
264 jnz int_ret_from_sys_call 431 jnz int_ret_from_sys_call
265 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET 432
266 jmp ret_from_sys_call 433 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
267rff_trace: 434 jmp ret_from_sys_call # go to the SYSRET fastpath
268 movq %rsp,%rdi 435
269 call syscall_trace_leave 436 CFI_RESTORE_STATE
270 GET_THREAD_INFO(%rcx)
271 jmp rff_action
272 CFI_ENDPROC 437 CFI_ENDPROC
273END(ret_from_fork) 438END(ret_from_fork)
274 439
@@ -278,20 +443,20 @@ END(ret_from_fork)
278 * SYSCALL does not save anything on the stack and does not change the 443 * SYSCALL does not save anything on the stack and does not change the
279 * stack pointer. 444 * stack pointer.
280 */ 445 */
281 446
282/* 447/*
283 * Register setup: 448 * Register setup:
284 * rax system call number 449 * rax system call number
285 * rdi arg0 450 * rdi arg0
286 * rcx return address for syscall/sysret, C arg3 451 * rcx return address for syscall/sysret, C arg3
287 * rsi arg1 452 * rsi arg1
288 * rdx arg2 453 * rdx arg2
289 * r10 arg3 (--> moved to rcx for C) 454 * r10 arg3 (--> moved to rcx for C)
290 * r8 arg4 455 * r8 arg4
291 * r9 arg5 456 * r9 arg5
292 * r11 eflags for syscall/sysret, temporary for C 457 * r11 eflags for syscall/sysret, temporary for C
293 * r12-r15,rbp,rbx saved by C code, not touched. 458 * r12-r15,rbp,rbx saved by C code, not touched.
294 * 459 *
295 * Interrupts are off on entry. 460 * Interrupts are off on entry.
296 * Only called from user space. 461 * Only called from user space.
297 * 462 *
@@ -301,12 +466,12 @@ END(ret_from_fork)
301 * When the user can change the frames, always force IRET. That is because 466 * When the user can change the frames, always force IRET. That is because
302 * it deals with non-canonical addresses better. SYSRET has trouble 467 * it deals with non-canonical addresses better. SYSRET has trouble
303 * with them due to bugs in both AMD and Intel CPUs. 468 * with them due to bugs in both AMD and Intel CPUs.
304 */ 469 */
305 470
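
The register table above is the userspace-visible half of the contract. As a hedged illustration (ordinary userspace C with inline assembly, not kernel code), a raw SYSCALL can be issued like this, with rcx and r11 declared clobbered because the comment says they carry the return address and eflags for syscall/sysret:

#include <unistd.h>

/* minimal 3-argument syscall wrapper following the convention above */
static long raw_syscall3(long nr, long a0, long a1, long a2)
{
        long ret;
        __asm__ volatile ("syscall"
                          : "=a" (ret)                      /* rax: return value    */
                          : "a" (nr),                       /* rax: syscall number  */
                            "D" (a0), "S" (a1), "d" (a2)    /* rdi, rsi, rdx        */
                          : "rcx", "r11", "memory");        /* clobbered by SYSCALL */
        return ret;
}

int main(void)
{
        static const char msg[] = "write(2) via raw SYSCALL\n";
        /* 1 is __NR_write on x86-64; a fourth argument would go in r10, not rcx */
        return raw_syscall3(1, STDOUT_FILENO, (long)msg, sizeof(msg) - 1) < 0;
}

The table's note that r10 is "moved to rcx for C" reflects the mismatch between the SYSCALL ABI, which claims rcx for the return address, and the C calling convention, whose fourth argument register is rcx.
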
306ENTRY(system_call) 471ENTRY(system_call)
307 CFI_STARTPROC simple 472 CFI_STARTPROC simple
308 CFI_SIGNAL_FRAME 473 CFI_SIGNAL_FRAME
309 CFI_DEF_CFA rsp,PDA_STACKOFFSET 474 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
310 CFI_REGISTER rip,rcx 475 CFI_REGISTER rip,rcx
311 /*CFI_REGISTER rflags,r11*/ 476 /*CFI_REGISTER rflags,r11*/
312 SWAPGS_UNSAFE_STACK 477 SWAPGS_UNSAFE_STACK
@@ -317,15 +482,15 @@ ENTRY(system_call)
317 */ 482 */
318ENTRY(system_call_after_swapgs) 483ENTRY(system_call_after_swapgs)
319 484
320 movq %rsp,%gs:pda_oldrsp 485 movq %rsp,PER_CPU_VAR(old_rsp)
321 movq %gs:pda_kernelstack,%rsp 486 movq PER_CPU_VAR(kernel_stack),%rsp
322 /* 487 /*
323 * No need to follow this irqs off/on section - it's straight 488 * No need to follow this irqs off/on section - it's straight
324 * and short: 489 * and short:
325 */ 490 */
326 ENABLE_INTERRUPTS(CLBR_NONE) 491 ENABLE_INTERRUPTS(CLBR_NONE)
327 SAVE_ARGS 8,1 492 SAVE_ARGS 8,1
328 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 493 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
329 movq %rcx,RIP-ARGOFFSET(%rsp) 494 movq %rcx,RIP-ARGOFFSET(%rsp)
330 CFI_REL_OFFSET rip,RIP-ARGOFFSET 495 CFI_REL_OFFSET rip,RIP-ARGOFFSET
331 GET_THREAD_INFO(%rcx) 496 GET_THREAD_INFO(%rcx)
@@ -339,19 +504,19 @@ system_call_fastpath:
339 movq %rax,RAX-ARGOFFSET(%rsp) 504 movq %rax,RAX-ARGOFFSET(%rsp)
340/* 505/*
341 * Syscall return path ending with SYSRET (fast path) 506 * Syscall return path ending with SYSRET (fast path)
342 * Has incomplete stack frame and undefined top of stack. 507 * Has incomplete stack frame and undefined top of stack.
343 */ 508 */
344ret_from_sys_call: 509ret_from_sys_call:
345 movl $_TIF_ALLWORK_MASK,%edi 510 movl $_TIF_ALLWORK_MASK,%edi
346 /* edi: flagmask */ 511 /* edi: flagmask */
347sysret_check: 512sysret_check:
348 LOCKDEP_SYS_EXIT 513 LOCKDEP_SYS_EXIT
349 GET_THREAD_INFO(%rcx) 514 GET_THREAD_INFO(%rcx)
350 DISABLE_INTERRUPTS(CLBR_NONE) 515 DISABLE_INTERRUPTS(CLBR_NONE)
351 TRACE_IRQS_OFF 516 TRACE_IRQS_OFF
352 movl TI_flags(%rcx),%edx 517 movl TI_flags(%rcx),%edx
353 andl %edi,%edx 518 andl %edi,%edx
354 jnz sysret_careful 519 jnz sysret_careful
355 CFI_REMEMBER_STATE 520 CFI_REMEMBER_STATE
356 /* 521 /*
357 * sysretq will re-enable interrupts: 522 * sysretq will re-enable interrupts:
@@ -361,12 +526,12 @@ sysret_check:
361 CFI_REGISTER rip,rcx 526 CFI_REGISTER rip,rcx
362 RESTORE_ARGS 0,-ARG_SKIP,1 527 RESTORE_ARGS 0,-ARG_SKIP,1
363 /*CFI_REGISTER rflags,r11*/ 528 /*CFI_REGISTER rflags,r11*/
364 movq %gs:pda_oldrsp, %rsp 529 movq PER_CPU_VAR(old_rsp), %rsp
365 USERGS_SYSRET64 530 USERGS_SYSRET64
366 531
367 CFI_RESTORE_STATE 532 CFI_RESTORE_STATE
368 /* Handle reschedules */ 533 /* Handle reschedules */
369 /* edx: work, edi: workmask */ 534 /* edx: work, edi: workmask */
370sysret_careful: 535sysret_careful:
371 bt $TIF_NEED_RESCHED,%edx 536 bt $TIF_NEED_RESCHED,%edx
372 jnc sysret_signal 537 jnc sysret_signal
@@ -379,7 +544,7 @@ sysret_careful:
379 CFI_ADJUST_CFA_OFFSET -8 544 CFI_ADJUST_CFA_OFFSET -8
380 jmp sysret_check 545 jmp sysret_check
381 546
382 /* Handle a signal */ 547 /* Handle a signal */
383sysret_signal: 548sysret_signal:
384 TRACE_IRQS_ON 549 TRACE_IRQS_ON
385 ENABLE_INTERRUPTS(CLBR_NONE) 550 ENABLE_INTERRUPTS(CLBR_NONE)
@@ -388,17 +553,20 @@ sysret_signal:
388 jc sysret_audit 553 jc sysret_audit
389#endif 554#endif
390 /* edx: work flags (arg3) */ 555 /* edx: work flags (arg3) */
391 leaq do_notify_resume(%rip),%rax
392 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 556 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
393 xorl %esi,%esi # oldset -> arg2 557 xorl %esi,%esi # oldset -> arg2
394 call ptregscall_common 558 SAVE_REST
559 FIXUP_TOP_OF_STACK %r11
560 call do_notify_resume
561 RESTORE_TOP_OF_STACK %r11
562 RESTORE_REST
395 movl $_TIF_WORK_MASK,%edi 563 movl $_TIF_WORK_MASK,%edi
396 /* Use IRET because user could have changed frame. This 564 /* Use IRET because user could have changed frame. This
397 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ 565 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
398 DISABLE_INTERRUPTS(CLBR_NONE) 566 DISABLE_INTERRUPTS(CLBR_NONE)
399 TRACE_IRQS_OFF 567 TRACE_IRQS_OFF
400 jmp int_with_check 568 jmp int_with_check
401 569
402badsys: 570badsys:
403 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 571 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
404 jmp ret_from_sys_call 572 jmp ret_from_sys_call
@@ -437,7 +605,7 @@ sysret_audit:
437#endif /* CONFIG_AUDITSYSCALL */ 605#endif /* CONFIG_AUDITSYSCALL */
438 606
439 /* Do syscall tracing */ 607 /* Do syscall tracing */
440tracesys: 608tracesys:
441#ifdef CONFIG_AUDITSYSCALL 609#ifdef CONFIG_AUDITSYSCALL
442 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) 610 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
443 jz auditsys 611 jz auditsys
@@ -460,8 +628,8 @@ tracesys:
460 call *sys_call_table(,%rax,8) 628 call *sys_call_table(,%rax,8)
461 movq %rax,RAX-ARGOFFSET(%rsp) 629 movq %rax,RAX-ARGOFFSET(%rsp)
462 /* Use IRET because user could have changed frame */ 630 /* Use IRET because user could have changed frame */
463 631
464/* 632/*
465 * Syscall return path ending with IRET. 633 * Syscall return path ending with IRET.
466 * Has correct top of stack, but partial stack frame. 634 * Has correct top of stack, but partial stack frame.
467 */ 635 */
@@ -505,18 +673,18 @@ int_very_careful:
505 TRACE_IRQS_ON 673 TRACE_IRQS_ON
506 ENABLE_INTERRUPTS(CLBR_NONE) 674 ENABLE_INTERRUPTS(CLBR_NONE)
507 SAVE_REST 675 SAVE_REST
508 /* Check for syscall exit trace */ 676 /* Check for syscall exit trace */
509 testl $_TIF_WORK_SYSCALL_EXIT,%edx 677 testl $_TIF_WORK_SYSCALL_EXIT,%edx
510 jz int_signal 678 jz int_signal
511 pushq %rdi 679 pushq %rdi
512 CFI_ADJUST_CFA_OFFSET 8 680 CFI_ADJUST_CFA_OFFSET 8
513 leaq 8(%rsp),%rdi # &ptregs -> arg1 681 leaq 8(%rsp),%rdi # &ptregs -> arg1
514 call syscall_trace_leave 682 call syscall_trace_leave
515 popq %rdi 683 popq %rdi
516 CFI_ADJUST_CFA_OFFSET -8 684 CFI_ADJUST_CFA_OFFSET -8
517 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi 685 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
518 jmp int_restore_rest 686 jmp int_restore_rest
519 687
520int_signal: 688int_signal:
521 testl $_TIF_DO_NOTIFY_MASK,%edx 689 testl $_TIF_DO_NOTIFY_MASK,%edx
522 jz 1f 690 jz 1f
@@ -531,22 +699,24 @@ int_restore_rest:
531 jmp int_with_check 699 jmp int_with_check
532 CFI_ENDPROC 700 CFI_ENDPROC
533END(system_call) 701END(system_call)
534 702
535/* 703/*
536 * Certain special system calls that need to save a complete full stack frame. 704 * Certain special system calls that need to save a complete full stack frame.
537 */ 705 */
538
539 .macro PTREGSCALL label,func,arg 706 .macro PTREGSCALL label,func,arg
540 .globl \label 707ENTRY(\label)
541\label: 708 PARTIAL_FRAME 1 8 /* offset 8: return address */
542 leaq \func(%rip),%rax 709 subq $REST_SKIP, %rsp
543 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ 710 CFI_ADJUST_CFA_OFFSET REST_SKIP
544 jmp ptregscall_common 711 call save_rest
712 DEFAULT_FRAME 0 8 /* offset 8: return address */
713 leaq 8(%rsp), \arg /* pt_regs pointer */
714 call \func
715 jmp ptregscall_common
716 CFI_ENDPROC
545END(\label) 717END(\label)
546 .endm 718 .endm
547 719
548 CFI_STARTPROC
549
550 PTREGSCALL stub_clone, sys_clone, %r8 720 PTREGSCALL stub_clone, sys_clone, %r8
551 PTREGSCALL stub_fork, sys_fork, %rdi 721 PTREGSCALL stub_fork, sys_fork, %rdi
552 PTREGSCALL stub_vfork, sys_vfork, %rdi 722 PTREGSCALL stub_vfork, sys_vfork, %rdi
@@ -554,25 +724,18 @@ END(\label)
554 PTREGSCALL stub_iopl, sys_iopl, %rsi 724 PTREGSCALL stub_iopl, sys_iopl, %rsi
555 725
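
The stubs generated above exist because these particular system calls cannot work from the partial frame the fast path keeps: clone/fork/vfork have to hand the child a complete register image, sigreturn has to restore one, and iopl edits the saved eflags. A loose C illustration of the idea (hypothetical names and a simplified layout; only the "handler receives a full pt_regs pointer" shape is taken from the assembly):

/* simplified stand-in for the full frame a PTREGSCALL stub materializes */
struct sketch_pt_regs {
        unsigned long r15, r14, r13, r12, rbp, rbx;              /* callee-saved */
        unsigned long r11, r10, r9, r8, rax, rcx, rdx, rsi, rdi; /* caller-saved */
        unsigned long orig_rax, rip, cs, eflags, rsp, ss;        /* trap frame   */
};

/* hypothetical fork-style handler: it needs every register, not just arguments */
static long sketch_sys_fork(struct sketch_pt_regs *regs)
{
        struct sketch_pt_regs child = *regs;  /* child starts from the full image */
        child.rax = 0;                        /* child's fork() returns 0         */
        /* ...hand child off to the scheduler (elided)... */
        (void)child;
        return 42;                            /* parent's fork() returns the pid  */
}

int main(void)
{
        struct sketch_pt_regs regs = { .rax = 57 };   /* pretend syscall number */
        return sketch_sys_fork(&regs) > 0 ? 0 : 1;
}
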
556ENTRY(ptregscall_common) 726ENTRY(ptregscall_common)
557 popq %r11 727 DEFAULT_FRAME 1 8 /* offset 8: return address */
558 CFI_ADJUST_CFA_OFFSET -8 728 RESTORE_TOP_OF_STACK %r11, 8
559 CFI_REGISTER rip, r11 729 movq_cfi_restore R15+8, r15
560 SAVE_REST 730 movq_cfi_restore R14+8, r14
561 movq %r11, %r15 731 movq_cfi_restore R13+8, r13
562 CFI_REGISTER rip, r15 732 movq_cfi_restore R12+8, r12
563 FIXUP_TOP_OF_STACK %r11 733 movq_cfi_restore RBP+8, rbp
564 call *%rax 734 movq_cfi_restore RBX+8, rbx
565 RESTORE_TOP_OF_STACK %r11 735 ret $REST_SKIP /* pop extended registers */
566 movq %r15, %r11
567 CFI_REGISTER rip, r11
568 RESTORE_REST
569 pushq %r11
570 CFI_ADJUST_CFA_OFFSET 8
571 CFI_REL_OFFSET rip, 0
572 ret
573 CFI_ENDPROC 736 CFI_ENDPROC
574END(ptregscall_common) 737END(ptregscall_common)
575 738
576ENTRY(stub_execve) 739ENTRY(stub_execve)
577 CFI_STARTPROC 740 CFI_STARTPROC
578 popq %r11 741 popq %r11
@@ -588,11 +751,11 @@ ENTRY(stub_execve)
588 jmp int_ret_from_sys_call 751 jmp int_ret_from_sys_call
589 CFI_ENDPROC 752 CFI_ENDPROC
590END(stub_execve) 753END(stub_execve)
591 754
592/* 755/*
593 * sigreturn is special because it needs to restore all registers on return. 756 * sigreturn is special because it needs to restore all registers on return.
594 * This cannot be done with SYSRET, so use the IRET return path instead. 757 * This cannot be done with SYSRET, so use the IRET return path instead.
595 */ 758 */
596ENTRY(stub_rt_sigreturn) 759ENTRY(stub_rt_sigreturn)
597 CFI_STARTPROC 760 CFI_STARTPROC
598 addq $8, %rsp 761 addq $8, %rsp
@@ -608,76 +771,76 @@ ENTRY(stub_rt_sigreturn)
608END(stub_rt_sigreturn) 771END(stub_rt_sigreturn)
609 772
610/* 773/*
611 * initial frame state for interrupts and exceptions 774 * Build the entry stubs and pointer table with some assembler magic.
775 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
776 * single cache line on all modern x86 implementations.
612 */ 777 */
613 .macro _frame ref 778 .section .init.rodata,"a"
614 CFI_STARTPROC simple 779ENTRY(interrupt)
615 CFI_SIGNAL_FRAME 780 .text
616 CFI_DEF_CFA rsp,SS+8-\ref 781 .p2align 5
617 /*CFI_REL_OFFSET ss,SS-\ref*/ 782 .p2align CONFIG_X86_L1_CACHE_SHIFT
618 CFI_REL_OFFSET rsp,RSP-\ref 783ENTRY(irq_entries_start)
619 /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ 784 INTR_FRAME
620 /*CFI_REL_OFFSET cs,CS-\ref*/ 785vector=FIRST_EXTERNAL_VECTOR
621 CFI_REL_OFFSET rip,RIP-\ref 786.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
622 .endm 787 .balign 32
788 .rept 7
789 .if vector < NR_VECTORS
790 .if vector <> FIRST_EXTERNAL_VECTOR
791 CFI_ADJUST_CFA_OFFSET -8
792 .endif
7931: pushq $(~vector+0x80) /* Note: always in signed byte range */
794 CFI_ADJUST_CFA_OFFSET 8
795 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
796 jmp 2f
797 .endif
798 .previous
799 .quad 1b
800 .text
801vector=vector+1
802 .endif
803 .endr
8042: jmp common_interrupt
805.endr
806 CFI_ENDPROC
807END(irq_entries_start)
623 808
624/* initial frame state for interrupts (and exceptions without error code) */ 809.previous
625#define INTR_FRAME _frame RIP 810END(interrupt)
626/* initial frame state for exceptions with error code (and interrupts with 811.previous
627 vector already pushed) */
628#define XCPT_FRAME _frame ORIG_RAX
629 812
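
The stub table above is dense enough that its arithmetic is worth checking separately: each generated stub pushes ~vector+0x80, chosen so the immediate always fits in a signed byte (the short pushq encoding), and the common_interrupt path further down adds -0x80 so the saved value lands in [-256,-1], from which the original vector is recovered by complementing again. A standalone C check of those claims (FIRST_EXTERNAL_VECTOR and NR_VECTORS are assumed to be 0x20 and 256, as on x86):

#include <assert.h>
#include <stdio.h>

#define FIRST_EXTERNAL_VECTOR 0x20   /* assumed value */
#define NR_VECTORS            256    /* assumed value */

int main(void)
{
        for (int vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
                long pushed = ~vector + 0x80;             /* pushq $(~vector+0x80)         */
                assert(pushed >= -128 && pushed <= 127);  /* "always in signed byte range" */

                long orig_ax = pushed - 0x80;             /* addq $-0x80,(%rsp)            */
                assert(orig_ax >= -256 && orig_ax <= -1); /* "[-256,-1] range"             */

                assert((int)~orig_ax == vector);          /* complementing recovers vector */
        }
        printf("%d stubs, 7 per chunk -> %d chunks of 32 bytes\n",
               NR_VECTORS - FIRST_EXTERNAL_VECTOR,
               (NR_VECTORS - FIRST_EXTERNAL_VECTOR + 6) / 7);
        return 0;
}

The byte-sized immediate is also what makes the "7 stubs per 32-byte chunk" packing work: six 2-byte pushq / 2-byte short-jmp pairs plus a final pushq and the near jmp to common_interrupt fit within one aligned 32-byte chunk.
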
630/* 813/*
631 * Interrupt entry/exit. 814 * Interrupt entry/exit.
632 * 815 *
633 * Interrupt entry points save only callee clobbered registers in fast path. 816 * Interrupt entry points save only callee clobbered registers in fast path.
634 * 817 *
635 * Entry runs with interrupts off. 818 * Entry runs with interrupts off.
636 */ 819 */
637 820
638/* 0(%rsp): interrupt number */ 821/* 0(%rsp): ~(interrupt number) */
639 .macro interrupt func 822 .macro interrupt func
640 cld 823 subq $10*8, %rsp
641 SAVE_ARGS 824 CFI_ADJUST_CFA_OFFSET 10*8
642 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler 825 call save_args
643 pushq %rbp 826 PARTIAL_FRAME 0
644 /*
645 * Save rbp twice: One is for marking the stack frame, as usual, and the
646 * other, to fill pt_regs properly. This is because bx comes right
647 * before the last saved register in that structure, and not bp. If the
648 * base pointer were in the place bx is today, this would not be needed.
649 */
650 movq %rbp, -8(%rsp)
651 CFI_ADJUST_CFA_OFFSET 8
652 CFI_REL_OFFSET rbp, 0
653 movq %rsp,%rbp
654 CFI_DEF_CFA_REGISTER rbp
655 testl $3,CS(%rdi)
656 je 1f
657 SWAPGS
658 /* irqcount is used to check if a CPU is already on an interrupt
659 stack or not. While this is essentially redundant with preempt_count
660 it is a little cheaper to use a separate counter in the PDA
661 (short of moving irq_enter into assembly, which would be too
662 much work) */
6631: incl %gs:pda_irqcount
664 cmoveq %gs:pda_irqstackptr,%rsp
665 push %rbp # backlink for old unwinder
666 /*
667 * We entered an interrupt context - irqs are off:
668 */
669 TRACE_IRQS_OFF
670 call \func 827 call \func
671 .endm 828 .endm
672 829
673ENTRY(common_interrupt) 830 /*
831 * The interrupt stubs push (~vector+0x80) onto the stack and
832 * then jump to common_interrupt.
833 */
834 .p2align CONFIG_X86_L1_CACHE_SHIFT
835common_interrupt:
674 XCPT_FRAME 836 XCPT_FRAME
837 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
675 interrupt do_IRQ 838 interrupt do_IRQ
676 /* 0(%rsp): oldrsp-ARGOFFSET */ 839 /* 0(%rsp): old_rsp-ARGOFFSET */
677ret_from_intr: 840ret_from_intr:
678 DISABLE_INTERRUPTS(CLBR_NONE) 841 DISABLE_INTERRUPTS(CLBR_NONE)
679 TRACE_IRQS_OFF 842 TRACE_IRQS_OFF
680 decl %gs:pda_irqcount 843 decl PER_CPU_VAR(irq_count)
681 leaveq 844 leaveq
682 CFI_DEF_CFA_REGISTER rsp 845 CFI_DEF_CFA_REGISTER rsp
683 CFI_ADJUST_CFA_OFFSET -8 846 CFI_ADJUST_CFA_OFFSET -8
@@ -685,12 +848,12 @@ exit_intr:
685 GET_THREAD_INFO(%rcx) 848 GET_THREAD_INFO(%rcx)
686 testl $3,CS-ARGOFFSET(%rsp) 849 testl $3,CS-ARGOFFSET(%rsp)
687 je retint_kernel 850 je retint_kernel
688 851
689 /* Interrupt came from user space */ 852 /* Interrupt came from user space */
690 /* 853 /*
691 * Has a correct top of stack, but a partial stack frame 854 * Has a correct top of stack, but a partial stack frame
692 * %rcx: thread info. Interrupts off. 855 * %rcx: thread info. Interrupts off.
693 */ 856 */
694retint_with_reschedule: 857retint_with_reschedule:
695 movl $_TIF_WORK_MASK,%edi 858 movl $_TIF_WORK_MASK,%edi
696retint_check: 859retint_check:
@@ -763,20 +926,20 @@ retint_careful:
763 pushq %rdi 926 pushq %rdi
764 CFI_ADJUST_CFA_OFFSET 8 927 CFI_ADJUST_CFA_OFFSET 8
765 call schedule 928 call schedule
766 popq %rdi 929 popq %rdi
767 CFI_ADJUST_CFA_OFFSET -8 930 CFI_ADJUST_CFA_OFFSET -8
768 GET_THREAD_INFO(%rcx) 931 GET_THREAD_INFO(%rcx)
769 DISABLE_INTERRUPTS(CLBR_NONE) 932 DISABLE_INTERRUPTS(CLBR_NONE)
770 TRACE_IRQS_OFF 933 TRACE_IRQS_OFF
771 jmp retint_check 934 jmp retint_check
772 935
773retint_signal: 936retint_signal:
774 testl $_TIF_DO_NOTIFY_MASK,%edx 937 testl $_TIF_DO_NOTIFY_MASK,%edx
775 jz retint_swapgs 938 jz retint_swapgs
776 TRACE_IRQS_ON 939 TRACE_IRQS_ON
777 ENABLE_INTERRUPTS(CLBR_NONE) 940 ENABLE_INTERRUPTS(CLBR_NONE)
778 SAVE_REST 941 SAVE_REST
779 movq $-1,ORIG_RAX(%rsp) 942 movq $-1,ORIG_RAX(%rsp)
780 xorl %esi,%esi # oldset 943 xorl %esi,%esi # oldset
781 movq %rsp,%rdi # &pt_regs 944 movq %rsp,%rdi # &pt_regs
782 call do_notify_resume 945 call do_notify_resume
@@ -798,324 +961,213 @@ ENTRY(retint_kernel)
798 jnc retint_restore_args 961 jnc retint_restore_args
799 call preempt_schedule_irq 962 call preempt_schedule_irq
800 jmp exit_intr 963 jmp exit_intr
801#endif 964#endif
802 965
803 CFI_ENDPROC 966 CFI_ENDPROC
804END(common_interrupt) 967END(common_interrupt)
805 968
806/* 969/*
807 * APIC interrupts. 970 * APIC interrupts.
808 */ 971 */
809 .macro apicinterrupt num,func 972.macro apicinterrupt num sym do_sym
973ENTRY(\sym)
810 INTR_FRAME 974 INTR_FRAME
811 pushq $~(\num) 975 pushq $~(\num)
812 CFI_ADJUST_CFA_OFFSET 8 976 CFI_ADJUST_CFA_OFFSET 8
813 interrupt \func 977 interrupt \do_sym
814 jmp ret_from_intr 978 jmp ret_from_intr
815 CFI_ENDPROC 979 CFI_ENDPROC
816 .endm 980END(\sym)
817 981.endm
818ENTRY(thermal_interrupt)
819 apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
820END(thermal_interrupt)
821
822ENTRY(threshold_interrupt)
823 apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
824END(threshold_interrupt)
825
826#ifdef CONFIG_SMP
827ENTRY(reschedule_interrupt)
828 apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
829END(reschedule_interrupt)
830
831 .macro INVALIDATE_ENTRY num
832ENTRY(invalidate_interrupt\num)
833 apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
834END(invalidate_interrupt\num)
835 .endm
836 982
837 INVALIDATE_ENTRY 0 983#ifdef CONFIG_SMP
838 INVALIDATE_ENTRY 1 984apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
839 INVALIDATE_ENTRY 2 985 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
840 INVALIDATE_ENTRY 3
841 INVALIDATE_ENTRY 4
842 INVALIDATE_ENTRY 5
843 INVALIDATE_ENTRY 6
844 INVALIDATE_ENTRY 7
845
846ENTRY(call_function_interrupt)
847 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
848END(call_function_interrupt)
849ENTRY(call_function_single_interrupt)
850 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
851END(call_function_single_interrupt)
852ENTRY(irq_move_cleanup_interrupt)
853 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
854END(irq_move_cleanup_interrupt)
855#endif 986#endif
856 987
857ENTRY(apic_timer_interrupt) 988#ifdef CONFIG_X86_UV
858 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt 989apicinterrupt UV_BAU_MESSAGE \
859END(apic_timer_interrupt) 990 uv_bau_message_intr1 uv_bau_message_interrupt
991#endif
992apicinterrupt LOCAL_TIMER_VECTOR \
993 apic_timer_interrupt smp_apic_timer_interrupt
994
995#ifdef CONFIG_SMP
996apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
997 invalidate_interrupt0 smp_invalidate_interrupt
998apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \
999 invalidate_interrupt1 smp_invalidate_interrupt
1000apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
1001 invalidate_interrupt2 smp_invalidate_interrupt
1002apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
1003 invalidate_interrupt3 smp_invalidate_interrupt
1004apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
1005 invalidate_interrupt4 smp_invalidate_interrupt
1006apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
1007 invalidate_interrupt5 smp_invalidate_interrupt
1008apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
1009 invalidate_interrupt6 smp_invalidate_interrupt
1010apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
1011 invalidate_interrupt7 smp_invalidate_interrupt
1012#endif
860 1013
861ENTRY(uv_bau_message_intr1) 1014apicinterrupt THRESHOLD_APIC_VECTOR \
862 apicinterrupt 220,uv_bau_message_interrupt 1015 threshold_interrupt mce_threshold_interrupt
863END(uv_bau_message_intr1) 1016apicinterrupt THERMAL_APIC_VECTOR \
1017 thermal_interrupt smp_thermal_interrupt
1018
1019#ifdef CONFIG_SMP
1020apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
1021 call_function_single_interrupt smp_call_function_single_interrupt
1022apicinterrupt CALL_FUNCTION_VECTOR \
1023 call_function_interrupt smp_call_function_interrupt
1024apicinterrupt RESCHEDULE_VECTOR \
1025 reschedule_interrupt smp_reschedule_interrupt
1026#endif
864 1027
865ENTRY(error_interrupt) 1028apicinterrupt ERROR_APIC_VECTOR \
866 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt 1029 error_interrupt smp_error_interrupt
867END(error_interrupt) 1030apicinterrupt SPURIOUS_APIC_VECTOR \
1031 spurious_interrupt smp_spurious_interrupt
868 1032
869ENTRY(spurious_interrupt)
870 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
871END(spurious_interrupt)
872
873/* 1033/*
874 * Exception entry points. 1034 * Exception entry points.
875 */ 1035 */
876 .macro zeroentry sym 1036.macro zeroentry sym do_sym
1037ENTRY(\sym)
877 INTR_FRAME 1038 INTR_FRAME
878 PARAVIRT_ADJUST_EXCEPTION_FRAME 1039 PARAVIRT_ADJUST_EXCEPTION_FRAME
879 pushq $0 /* push error code/oldrax */ 1040 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
880 CFI_ADJUST_CFA_OFFSET 8 1041 subq $15*8,%rsp
881 pushq %rax /* push real oldrax to the rdi slot */ 1042 CFI_ADJUST_CFA_OFFSET 15*8
882 CFI_ADJUST_CFA_OFFSET 8 1043 call error_entry
883 CFI_REL_OFFSET rax,0 1044 DEFAULT_FRAME 0
884 leaq \sym(%rip),%rax 1045 movq %rsp,%rdi /* pt_regs pointer */
885 jmp error_entry 1046 xorl %esi,%esi /* no error code */
1047 call \do_sym
1048 jmp error_exit /* %ebx: no swapgs flag */
886 CFI_ENDPROC 1049 CFI_ENDPROC
887 .endm 1050END(\sym)
1051.endm
888 1052
889 .macro errorentry sym 1053.macro paranoidzeroentry sym do_sym
890 XCPT_FRAME 1054ENTRY(\sym)
1055 INTR_FRAME
891 PARAVIRT_ADJUST_EXCEPTION_FRAME 1056 PARAVIRT_ADJUST_EXCEPTION_FRAME
892 pushq %rax 1057 pushq $-1 /* ORIG_RAX: no syscall to restart */
893 CFI_ADJUST_CFA_OFFSET 8 1058 CFI_ADJUST_CFA_OFFSET 8
894 CFI_REL_OFFSET rax,0 1059 subq $15*8, %rsp
895 leaq \sym(%rip),%rax 1060 call save_paranoid
896 jmp error_entry 1061 TRACE_IRQS_OFF
1062 movq %rsp,%rdi /* pt_regs pointer */
1063 xorl %esi,%esi /* no error code */
1064 call \do_sym
1065 jmp paranoid_exit /* %ebx: no swapgs flag */
897 CFI_ENDPROC 1066 CFI_ENDPROC
898 .endm 1067END(\sym)
1068.endm
899 1069
900 /* error code is on the stack already */ 1070.macro paranoidzeroentry_ist sym do_sym ist
901 /* handle NMI like exceptions that can happen everywhere */ 1071ENTRY(\sym)
902 .macro paranoidentry sym, ist=0, irqtrace=1 1072 INTR_FRAME
903 SAVE_ALL 1073 PARAVIRT_ADJUST_EXCEPTION_FRAME
904 cld 1074 pushq $-1 /* ORIG_RAX: no syscall to restart */
905 movl $1,%ebx 1075 CFI_ADJUST_CFA_OFFSET 8
906 movl $MSR_GS_BASE,%ecx 1076 subq $15*8, %rsp
907 rdmsr 1077 call save_paranoid
908 testl %edx,%edx
909 js 1f
910 SWAPGS
911 xorl %ebx,%ebx
9121:
913 .if \ist
914 movq %gs:pda_data_offset, %rbp
915 .endif
916 .if \irqtrace
917 TRACE_IRQS_OFF
918 .endif
919 movq %rsp,%rdi
920 movq ORIG_RAX(%rsp),%rsi
921 movq $-1,ORIG_RAX(%rsp)
922 .if \ist
923 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
924 .endif
925 call \sym
926 .if \ist
927 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
928 .endif
929 DISABLE_INTERRUPTS(CLBR_NONE)
930 .if \irqtrace
931 TRACE_IRQS_OFF 1078 TRACE_IRQS_OFF
932 .endif 1079 movq %rsp,%rdi /* pt_regs pointer */
933 .endm 1080 xorl %esi,%esi /* no error code */
1081 PER_CPU(init_tss, %rbp)
1082 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
1083 call \do_sym
1084 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
1085 jmp paranoid_exit /* %ebx: no swapgs flag */
1086 CFI_ENDPROC
1087END(\sym)
1088.endm
934 1089
935 /* 1090.macro errorentry sym do_sym
936 * "Paranoid" exit path from exception stack. 1091ENTRY(\sym)
937 * Paranoid because this is used by NMIs and cannot take 1092 XCPT_FRAME
938 * any kernel state for granted. 1093 PARAVIRT_ADJUST_EXCEPTION_FRAME
939 * We don't do kernel preemption checks here, because only 1094 subq $15*8,%rsp
940 * NMI should be common and it does not enable IRQs and 1095 CFI_ADJUST_CFA_OFFSET 15*8
941 * cannot get reschedule ticks. 1096 call error_entry
942 * 1097 DEFAULT_FRAME 0
943 * "trace" is 0 for the NMI handler only, because irq-tracing 1098 movq %rsp,%rdi /* pt_regs pointer */
944 * is fundamentally NMI-unsafe. (we cannot change the soft and 1099 movq ORIG_RAX(%rsp),%rsi /* get error code */
945 * hard flags at once, atomically) 1100 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
946 */ 1101 call \do_sym
947 .macro paranoidexit trace=1 1102 jmp error_exit /* %ebx: no swapgs flag */
948 /* ebx: no swapgs flag */
949paranoid_exit\trace:
950 testl %ebx,%ebx /* swapgs needed? */
951 jnz paranoid_restore\trace
952 testl $3,CS(%rsp)
953 jnz paranoid_userspace\trace
954paranoid_swapgs\trace:
955 .if \trace
956 TRACE_IRQS_IRETQ 0
957 .endif
958 SWAPGS_UNSAFE_STACK
959paranoid_restore\trace:
960 RESTORE_ALL 8
961 jmp irq_return
962paranoid_userspace\trace:
963 GET_THREAD_INFO(%rcx)
964 movl TI_flags(%rcx),%ebx
965 andl $_TIF_WORK_MASK,%ebx
966 jz paranoid_swapgs\trace
967 movq %rsp,%rdi /* &pt_regs */
968 call sync_regs
969 movq %rax,%rsp /* switch stack for scheduling */
970 testl $_TIF_NEED_RESCHED,%ebx
971 jnz paranoid_schedule\trace
972 movl %ebx,%edx /* arg3: thread flags */
973 .if \trace
974 TRACE_IRQS_ON
975 .endif
976 ENABLE_INTERRUPTS(CLBR_NONE)
977 xorl %esi,%esi /* arg2: oldset */
978 movq %rsp,%rdi /* arg1: &pt_regs */
979 call do_notify_resume
980 DISABLE_INTERRUPTS(CLBR_NONE)
981 .if \trace
982 TRACE_IRQS_OFF
983 .endif
984 jmp paranoid_userspace\trace
985paranoid_schedule\trace:
986 .if \trace
987 TRACE_IRQS_ON
988 .endif
989 ENABLE_INTERRUPTS(CLBR_ANY)
990 call schedule
991 DISABLE_INTERRUPTS(CLBR_ANY)
992 .if \trace
993 TRACE_IRQS_OFF
994 .endif
995 jmp paranoid_userspace\trace
996 CFI_ENDPROC 1103 CFI_ENDPROC
997 .endm 1104END(\sym)
1105.endm
998 1106
999/* 1107 /* error code is on the stack already */
1000 * Exception entry point. This expects an error code/orig_rax on the stack 1108.macro paranoiderrorentry sym do_sym
1001 * and the exception handler in %rax. 1109ENTRY(\sym)
1002 */ 1110 XCPT_FRAME
1003KPROBE_ENTRY(error_entry) 1111 PARAVIRT_ADJUST_EXCEPTION_FRAME
1004 _frame RDI 1112 subq $15*8,%rsp
1005 CFI_REL_OFFSET rax,0 1113 CFI_ADJUST_CFA_OFFSET 15*8
1006 /* rdi slot contains rax, oldrax contains error code */ 1114 call save_paranoid
1007 cld 1115 DEFAULT_FRAME 0
1008 subq $14*8,%rsp
1009 CFI_ADJUST_CFA_OFFSET (14*8)
1010 movq %rsi,13*8(%rsp)
1011 CFI_REL_OFFSET rsi,RSI
1012 movq 14*8(%rsp),%rsi /* load rax from rdi slot */
1013 CFI_REGISTER rax,rsi
1014 movq %rdx,12*8(%rsp)
1015 CFI_REL_OFFSET rdx,RDX
1016 movq %rcx,11*8(%rsp)
1017 CFI_REL_OFFSET rcx,RCX
1018 movq %rsi,10*8(%rsp) /* store rax */
1019 CFI_REL_OFFSET rax,RAX
1020 movq %r8, 9*8(%rsp)
1021 CFI_REL_OFFSET r8,R8
1022 movq %r9, 8*8(%rsp)
1023 CFI_REL_OFFSET r9,R9
1024 movq %r10,7*8(%rsp)
1025 CFI_REL_OFFSET r10,R10
1026 movq %r11,6*8(%rsp)
1027 CFI_REL_OFFSET r11,R11
1028 movq %rbx,5*8(%rsp)
1029 CFI_REL_OFFSET rbx,RBX
1030 movq %rbp,4*8(%rsp)
1031 CFI_REL_OFFSET rbp,RBP
1032 movq %r12,3*8(%rsp)
1033 CFI_REL_OFFSET r12,R12
1034 movq %r13,2*8(%rsp)
1035 CFI_REL_OFFSET r13,R13
1036 movq %r14,1*8(%rsp)
1037 CFI_REL_OFFSET r14,R14
1038 movq %r15,(%rsp)
1039 CFI_REL_OFFSET r15,R15
1040 xorl %ebx,%ebx
1041 testl $3,CS(%rsp)
1042 je error_kernelspace
1043error_swapgs:
1044 SWAPGS
1045error_sti:
1046 TRACE_IRQS_OFF
1047 movq %rdi,RDI(%rsp)
1048 CFI_REL_OFFSET rdi,RDI
1049 movq %rsp,%rdi
1050 movq ORIG_RAX(%rsp),%rsi /* get error code */
1051 movq $-1,ORIG_RAX(%rsp)
1052 call *%rax
1053 /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
1054error_exit:
1055 movl %ebx,%eax
1056 RESTORE_REST
1057 DISABLE_INTERRUPTS(CLBR_NONE)
1058 TRACE_IRQS_OFF 1116 TRACE_IRQS_OFF
1059 GET_THREAD_INFO(%rcx) 1117 movq %rsp,%rdi /* pt_regs pointer */
1060 testl %eax,%eax 1118 movq ORIG_RAX(%rsp),%rsi /* get error code */
1061 jne retint_kernel 1119 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
1062 LOCKDEP_SYS_EXIT_IRQ 1120 call \do_sym
1063 movl TI_flags(%rcx),%edx 1121 jmp paranoid_exit /* %ebx: no swapgs flag */
1064 movl $_TIF_WORK_MASK,%edi
1065 andl %edi,%edx
1066 jnz retint_careful
1067 jmp retint_swapgs
1068 CFI_ENDPROC 1122 CFI_ENDPROC
1123END(\sym)
1124.endm
1069 1125
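
All of the entry macros above funnel into C handlers with the same two-argument shape, differing only in where the error code comes from: the zeroentry variants pass a literal 0 and store -1 in ORIG_RAX up front, while the errorentry variants read the CPU-pushed code out of ORIG_RAX and then overwrite that slot with -1 so the frame cannot later be mistaken for a restartable system call. A small C sketch of just that convention (the struct and handler bodies are placeholders, not kernel definitions):

struct sketch_pt_regs { long orig_rax; /* other slots elided */ };

typedef void (*sketch_handler)(struct sketch_pt_regs *regs, long error_code);

/* zeroentry path: no hardware error code, ORIG_RAX already holds -1 */
static void sketch_zeroentry(sketch_handler do_sym, struct sketch_pt_regs *regs)
{
        do_sym(regs, 0);                       /* xorl %esi,%esi */
}

/* errorentry path: consume the pushed code, then poison the slot */
static void sketch_errorentry(sketch_handler do_sym, struct sketch_pt_regs *regs)
{
        long error_code = regs->orig_rax;      /* movq ORIG_RAX(%rsp),%rsi */
        regs->orig_rax = -1;                   /* movq $-1,ORIG_RAX(%rsp)  */
        do_sym(regs, error_code);
}

static void sketch_do_invalid_TSS(struct sketch_pt_regs *regs, long error_code)
{
        (void)regs; (void)error_code;          /* real handler elided */
}

int main(void)
{
        struct sketch_pt_regs regs = { .orig_rax = 0x10 };   /* pretend selector error */
        sketch_errorentry(sketch_do_invalid_TSS, &regs);
        return regs.orig_rax == -1 ? 0 : 1;
}

The paranoid variants keep the same calling convention; what they add is the save_paranoid/paranoid_exit bracketing so the handler can run even when the GS base may still be the user's.
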
1070error_kernelspace: 1126zeroentry divide_error do_divide_error
1071 incl %ebx 1127zeroentry overflow do_overflow
1072 /* There are two places in the kernel that can potentially fault with 1128zeroentry bounds do_bounds
1073 usergs. Handle them here. The exception handlers after 1129zeroentry invalid_op do_invalid_op
1074 iret run with kernel gs again, so don't set the user space flag. 1130zeroentry device_not_available do_device_not_available
1075 B stepping K8s sometimes report a truncated RIP for IRET 1131paranoiderrorentry double_fault do_double_fault
1076 exceptions returning to compat mode. Check for these here too. */ 1132zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
1077 leaq irq_return(%rip),%rcx 1133errorentry invalid_TSS do_invalid_TSS
1078 cmpq %rcx,RIP(%rsp) 1134errorentry segment_not_present do_segment_not_present
1079 je error_swapgs 1135zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
1080 movl %ecx,%ecx /* zero extend */ 1136zeroentry coprocessor_error do_coprocessor_error
1081 cmpq %rcx,RIP(%rsp) 1137errorentry alignment_check do_alignment_check
1082 je error_swapgs 1138zeroentry simd_coprocessor_error do_simd_coprocessor_error
1083 cmpq $gs_change,RIP(%rsp) 1139
1084 je error_swapgs 1140 /* Reload gs selector with exception handling */
1085 jmp error_sti 1141 /* edi: new selector */
1086KPROBE_END(error_entry)
1087
1088 /* Reload gs selector with exception handling */
1089 /* edi: new selector */
1090ENTRY(native_load_gs_index) 1142ENTRY(native_load_gs_index)
1091 CFI_STARTPROC 1143 CFI_STARTPROC
1092 pushf 1144 pushf
1093 CFI_ADJUST_CFA_OFFSET 8 1145 CFI_ADJUST_CFA_OFFSET 8
1094 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) 1146 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
1095 SWAPGS 1147 SWAPGS
1096gs_change: 1148gs_change:
1097 movl %edi,%gs 1149 movl %edi,%gs
10982: mfence /* workaround */ 11502: mfence /* workaround */
1099 SWAPGS 1151 SWAPGS
1100 popf 1152 popf
1101 CFI_ADJUST_CFA_OFFSET -8 1153 CFI_ADJUST_CFA_OFFSET -8
1102 ret 1154 ret
1103 CFI_ENDPROC 1155 CFI_ENDPROC
1104ENDPROC(native_load_gs_index) 1156END(native_load_gs_index)
1105 1157
1106 .section __ex_table,"a" 1158 .section __ex_table,"a"
1107 .align 8 1159 .align 8
1108 .quad gs_change,bad_gs 1160 .quad gs_change,bad_gs
1109 .previous 1161 .previous
1110 .section .fixup,"ax" 1162 .section .fixup,"ax"
1111 /* running with kernelgs */ 1163 /* running with kernelgs */
1112bad_gs: 1164bad_gs:
1113 SWAPGS /* switch back to user gs */ 1165 SWAPGS /* switch back to user gs */
1114 xorl %eax,%eax 1166 xorl %eax,%eax
1115 movl %eax,%gs 1167 movl %eax,%gs
1116 jmp 2b 1168 jmp 2b
1117 .previous 1169 .previous
1118 1170
1119/* 1171/*
1120 * Create a kernel thread. 1172 * Create a kernel thread.
1121 * 1173 *
@@ -1138,7 +1190,7 @@ ENTRY(kernel_thread)
1138 1190
1139 xorl %r8d,%r8d 1191 xorl %r8d,%r8d
1140 xorl %r9d,%r9d 1192 xorl %r9d,%r9d
1141 1193
1142 # clone now 1194 # clone now
1143 call do_fork 1195 call do_fork
1144 movq %rax,RAX(%rsp) 1196 movq %rax,RAX(%rsp)
@@ -1149,15 +1201,15 @@ ENTRY(kernel_thread)
1149 * so internally to the x86_64 port you can rely on kernel_thread() 1201 * so internally to the x86_64 port you can rely on kernel_thread()
1150 * not to reschedule the child before returning, this avoids the need 1202 * not to reschedule the child before returning, this avoids the need
1151 * of hacks for example to fork off the per-CPU idle tasks. 1203 * of hacks for example to fork off the per-CPU idle tasks.
1152 * [Hopefully no generic code relies on the reschedule -AK] 1204 * [Hopefully no generic code relies on the reschedule -AK]
1153 */ 1205 */
1154 RESTORE_ALL 1206 RESTORE_ALL
1155 UNFAKE_STACK_FRAME 1207 UNFAKE_STACK_FRAME
1156 ret 1208 ret
1157 CFI_ENDPROC 1209 CFI_ENDPROC
1158ENDPROC(kernel_thread) 1210END(kernel_thread)
1159 1211
1160child_rip: 1212ENTRY(child_rip)
1161 pushq $0 # fake return address 1213 pushq $0 # fake return address
1162 CFI_STARTPROC 1214 CFI_STARTPROC
1163 /* 1215 /*
@@ -1170,8 +1222,9 @@ child_rip:
1170 # exit 1222 # exit
1171 mov %eax, %edi 1223 mov %eax, %edi
1172 call do_exit 1224 call do_exit
1225 ud2 # padding for call trace
1173 CFI_ENDPROC 1226 CFI_ENDPROC
1174ENDPROC(child_rip) 1227END(child_rip)
1175 1228
1176/* 1229/*
1177 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 1230 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
@@ -1191,10 +1244,10 @@ ENDPROC(child_rip)
1191ENTRY(kernel_execve) 1244ENTRY(kernel_execve)
1192 CFI_STARTPROC 1245 CFI_STARTPROC
1193 FAKE_STACK_FRAME $0 1246 FAKE_STACK_FRAME $0
1194 SAVE_ALL 1247 SAVE_ALL
1195 movq %rsp,%rcx 1248 movq %rsp,%rcx
1196 call sys_execve 1249 call sys_execve
1197 movq %rax, RAX(%rsp) 1250 movq %rax, RAX(%rsp)
1198 RESTORE_REST 1251 RESTORE_REST
1199 testq %rax,%rax 1252 testq %rax,%rax
1200 je int_ret_from_sys_call 1253 je int_ret_from_sys_call
@@ -1202,129 +1255,7 @@ ENTRY(kernel_execve)
1202 UNFAKE_STACK_FRAME 1255 UNFAKE_STACK_FRAME
1203 ret 1256 ret
1204 CFI_ENDPROC 1257 CFI_ENDPROC
1205ENDPROC(kernel_execve) 1258END(kernel_execve)
1206
1207KPROBE_ENTRY(page_fault)
1208 errorentry do_page_fault
1209KPROBE_END(page_fault)
1210
1211ENTRY(coprocessor_error)
1212 zeroentry do_coprocessor_error
1213END(coprocessor_error)
1214
1215ENTRY(simd_coprocessor_error)
1216 zeroentry do_simd_coprocessor_error
1217END(simd_coprocessor_error)
1218
1219ENTRY(device_not_available)
1220 zeroentry do_device_not_available
1221END(device_not_available)
1222
1223 /* runs on exception stack */
1224KPROBE_ENTRY(debug)
1225 INTR_FRAME
1226 PARAVIRT_ADJUST_EXCEPTION_FRAME
1227 pushq $0
1228 CFI_ADJUST_CFA_OFFSET 8
1229 paranoidentry do_debug, DEBUG_STACK
1230 paranoidexit
1231KPROBE_END(debug)
1232
1233 /* runs on exception stack */
1234KPROBE_ENTRY(nmi)
1235 INTR_FRAME
1236 PARAVIRT_ADJUST_EXCEPTION_FRAME
1237 pushq $-1
1238 CFI_ADJUST_CFA_OFFSET 8
1239 paranoidentry do_nmi, 0, 0
1240#ifdef CONFIG_TRACE_IRQFLAGS
1241 paranoidexit 0
1242#else
1243 jmp paranoid_exit1
1244 CFI_ENDPROC
1245#endif
1246KPROBE_END(nmi)
1247
1248KPROBE_ENTRY(int3)
1249 INTR_FRAME
1250 PARAVIRT_ADJUST_EXCEPTION_FRAME
1251 pushq $0
1252 CFI_ADJUST_CFA_OFFSET 8
1253 paranoidentry do_int3, DEBUG_STACK
1254 jmp paranoid_exit1
1255 CFI_ENDPROC
1256KPROBE_END(int3)
1257
1258ENTRY(overflow)
1259 zeroentry do_overflow
1260END(overflow)
1261
1262ENTRY(bounds)
1263 zeroentry do_bounds
1264END(bounds)
1265
1266ENTRY(invalid_op)
1267 zeroentry do_invalid_op
1268END(invalid_op)
1269
1270ENTRY(coprocessor_segment_overrun)
1271 zeroentry do_coprocessor_segment_overrun
1272END(coprocessor_segment_overrun)
1273
1274 /* runs on exception stack */
1275ENTRY(double_fault)
1276 XCPT_FRAME
1277 PARAVIRT_ADJUST_EXCEPTION_FRAME
1278 paranoidentry do_double_fault
1279 jmp paranoid_exit1
1280 CFI_ENDPROC
1281END(double_fault)
1282
1283ENTRY(invalid_TSS)
1284 errorentry do_invalid_TSS
1285END(invalid_TSS)
1286
1287ENTRY(segment_not_present)
1288 errorentry do_segment_not_present
1289END(segment_not_present)
1290
1291 /* runs on exception stack */
1292ENTRY(stack_segment)
1293 XCPT_FRAME
1294 PARAVIRT_ADJUST_EXCEPTION_FRAME
1295 paranoidentry do_stack_segment
1296 jmp paranoid_exit1
1297 CFI_ENDPROC
1298END(stack_segment)
1299
1300KPROBE_ENTRY(general_protection)
1301 errorentry do_general_protection
1302KPROBE_END(general_protection)
1303
1304ENTRY(alignment_check)
1305 errorentry do_alignment_check
1306END(alignment_check)
1307
1308ENTRY(divide_error)
1309 zeroentry do_divide_error
1310END(divide_error)
1311
1312ENTRY(spurious_interrupt_bug)
1313 zeroentry do_spurious_interrupt_bug
1314END(spurious_interrupt_bug)
1315
1316#ifdef CONFIG_X86_MCE
1317 /* runs on exception stack */
1318ENTRY(machine_check)
1319 INTR_FRAME
1320 PARAVIRT_ADJUST_EXCEPTION_FRAME
1321 pushq $0
1322 CFI_ADJUST_CFA_OFFSET 8
1323 paranoidentry do_machine_check
1324 jmp paranoid_exit1
1325 CFI_ENDPROC
1326END(machine_check)
1327#endif
1328 1259
1329/* Call softirq on interrupt stack. Interrupts are off. */ 1260/* Call softirq on interrupt stack. Interrupts are off. */
1330ENTRY(call_softirq) 1261ENTRY(call_softirq)
@@ -1334,81 +1265,77 @@ ENTRY(call_softirq)
1334 CFI_REL_OFFSET rbp,0 1265 CFI_REL_OFFSET rbp,0
1335 mov %rsp,%rbp 1266 mov %rsp,%rbp
1336 CFI_DEF_CFA_REGISTER rbp 1267 CFI_DEF_CFA_REGISTER rbp
1337 incl %gs:pda_irqcount 1268 incl PER_CPU_VAR(irq_count)
1338 cmove %gs:pda_irqstackptr,%rsp 1269 cmove PER_CPU_VAR(irq_stack_ptr),%rsp
1339 push %rbp # backlink for old unwinder 1270 push %rbp # backlink for old unwinder
1340 call __do_softirq 1271 call __do_softirq
1341 leaveq 1272 leaveq
1342 CFI_DEF_CFA_REGISTER rsp 1273 CFI_DEF_CFA_REGISTER rsp
1343 CFI_ADJUST_CFA_OFFSET -8 1274 CFI_ADJUST_CFA_OFFSET -8
1344 decl %gs:pda_irqcount 1275 decl PER_CPU_VAR(irq_count)
1345 ret 1276 ret
1346 CFI_ENDPROC 1277 CFI_ENDPROC
1347ENDPROC(call_softirq) 1278END(call_softirq)
1348
1349KPROBE_ENTRY(ignore_sysret)
1350 CFI_STARTPROC
1351 mov $-ENOSYS,%eax
1352 sysret
1353 CFI_ENDPROC
1354ENDPROC(ignore_sysret)
1355 1279
1356#ifdef CONFIG_XEN 1280#ifdef CONFIG_XEN
1357ENTRY(xen_hypervisor_callback) 1281zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
1358 zeroentry xen_do_hypervisor_callback
1359END(xen_hypervisor_callback)
1360 1282
1361/* 1283/*
1362# A note on the "critical region" in our callback handler. 1284 * A note on the "critical region" in our callback handler.
1363# We want to avoid stacking callback handlers due to events occurring 1285 * We want to avoid stacking callback handlers due to events occurring
1364# during handling of the last event. To do this, we keep events disabled 1286 * during handling of the last event. To do this, we keep events disabled
1365# until we've done all processing. HOWEVER, we must enable events before 1287 * until we've done all processing. HOWEVER, we must enable events before
1366# popping the stack frame (can't be done atomically) and so it would still 1288 * popping the stack frame (can't be done atomically) and so it would still
1367# be possible to get enough handler activations to overflow the stack. 1289 * be possible to get enough handler activations to overflow the stack.
1368# Although unlikely, bugs of that kind are hard to track down, so we'd 1290 * Although unlikely, bugs of that kind are hard to track down, so we'd
1369# like to avoid the possibility. 1291 * like to avoid the possibility.
1370# So, on entry to the handler we detect whether we interrupted an 1292 * So, on entry to the handler we detect whether we interrupted an
1371# existing activation in its critical region -- if so, we pop the current 1293 * existing activation in its critical region -- if so, we pop the current
1372# activation and restart the handler using the previous one. 1294 * activation and restart the handler using the previous one.
1373*/ 1295 */
1374ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) 1296ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1375 CFI_STARTPROC 1297 CFI_STARTPROC
1376/* Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will 1298/*
1377 see the correct pointer to the pt_regs */ 1299 * Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will
1300 * see the correct pointer to the pt_regs
1301 */
1378 movq %rdi, %rsp # we don't return, adjust the stack frame 1302 movq %rdi, %rsp # we don't return, adjust the stack frame
1379 CFI_ENDPROC 1303 CFI_ENDPROC
1380 CFI_DEFAULT_STACK 1304 DEFAULT_FRAME
138111: incl %gs:pda_irqcount 130511: incl PER_CPU_VAR(irq_count)
1382 movq %rsp,%rbp 1306 movq %rsp,%rbp
1383 CFI_DEF_CFA_REGISTER rbp 1307 CFI_DEF_CFA_REGISTER rbp
1384 cmovzq %gs:pda_irqstackptr,%rsp 1308 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
1385 pushq %rbp # backlink for old unwinder 1309 pushq %rbp # backlink for old unwinder
1386 call xen_evtchn_do_upcall 1310 call xen_evtchn_do_upcall
1387 popq %rsp 1311 popq %rsp
1388 CFI_DEF_CFA_REGISTER rsp 1312 CFI_DEF_CFA_REGISTER rsp
1389 decl %gs:pda_irqcount 1313 decl PER_CPU_VAR(irq_count)
1390 jmp error_exit 1314 jmp error_exit
1391 CFI_ENDPROC 1315 CFI_ENDPROC
1392END(do_hypervisor_callback) 1316END(do_hypervisor_callback)
1393 1317
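
The "critical region" note above boils down to a masking discipline: event delivery stays disabled for the entire activation, and pending work is re-checked before unmasking at the very end; the residual window (events must be re-enabled before the frame is popped) is exactly what the interrupted-critical-region check is there to handle. A generic C sketch of the masking half only (plain C11 atomics standing in for the hypervisor's shared flags; this is not Xen code):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int upcall_mask;      /* nonzero: no new activations are delivered */
static atomic_int upcall_pending;   /* set by the (simulated) event source       */

static void sketch_process_pending(void)
{
        /* ...drain whatever event channels are currently pending (elided)... */
}

static void sketch_hypervisor_callback(void)
{
        atomic_store(&upcall_mask, 1);             /* events stay off while we work */
        do {
                atomic_store(&upcall_pending, 0);
                sketch_process_pending();
        } while (atomic_load(&upcall_pending));    /* anything arrive while masked? */
        atomic_store(&upcall_mask, 0);             /* the unavoidable unmask-before-
                                                      return window lives here      */
}

int main(void)
{
        atomic_store(&upcall_pending, 1);
        sketch_hypervisor_callback();
        puts("single activation, no stacking");
        return 0;
}
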
1394/* 1318/*
1395# Hypervisor uses this for application faults while it executes. 1319 * Hypervisor uses this for application faults while it executes.
1396# We get here for two reasons: 1320 * We get here for two reasons:
1397# 1. Fault while reloading DS, ES, FS or GS 1321 * 1. Fault while reloading DS, ES, FS or GS
1398# 2. Fault while executing IRET 1322 * 2. Fault while executing IRET
1399# Category 1 we do not need to fix up as Xen has already reloaded all segment 1323 * Category 1 we do not need to fix up as Xen has already reloaded all segment
1400# registers that could be reloaded and zeroed the others. 1324 * registers that could be reloaded and zeroed the others.
1401# Category 2 we fix up by killing the current process. We cannot use the 1325 * Category 2 we fix up by killing the current process. We cannot use the
1402# normal Linux return path in this case because if we use the IRET hypercall 1326 * normal Linux return path in this case because if we use the IRET hypercall
1403# to pop the stack frame we end up in an infinite loop of failsafe callbacks. 1327 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1404# We distinguish between categories by comparing each saved segment register 1328 * We distinguish between categories by comparing each saved segment register
1405# with its current contents: any discrepancy means we are in category 1. 1329 * with its current contents: any discrepancy means we are in category 1.
1406*/ 1330 */
1407ENTRY(xen_failsafe_callback) 1331ENTRY(xen_failsafe_callback)
1408 framesz = (RIP-0x30) /* workaround buggy gas */ 1332 INTR_FRAME 1 (6*8)
1409 _frame framesz 1333 /*CFI_REL_OFFSET gs,GS*/
1410 CFI_REL_OFFSET rcx, 0 1334 /*CFI_REL_OFFSET fs,FS*/
1411 CFI_REL_OFFSET r11, 8 1335 /*CFI_REL_OFFSET es,ES*/
1336 /*CFI_REL_OFFSET ds,DS*/
1337 CFI_REL_OFFSET r11,8
1338 CFI_REL_OFFSET rcx,0
1412 movw %ds,%cx 1339 movw %ds,%cx
1413 cmpw %cx,0x10(%rsp) 1340 cmpw %cx,0x10(%rsp)
1414 CFI_REMEMBER_STATE 1341 CFI_REMEMBER_STATE
@@ -1429,12 +1356,9 @@ ENTRY(xen_failsafe_callback)
1429 CFI_RESTORE r11 1356 CFI_RESTORE r11
1430 addq $0x30,%rsp 1357 addq $0x30,%rsp
1431 CFI_ADJUST_CFA_OFFSET -0x30 1358 CFI_ADJUST_CFA_OFFSET -0x30
1432 pushq $0 1359 pushq_cfi $0 /* RIP */
1433 CFI_ADJUST_CFA_OFFSET 8 1360 pushq_cfi %r11
1434 pushq %r11 1361 pushq_cfi %rcx
1435 CFI_ADJUST_CFA_OFFSET 8
1436 pushq %rcx
1437 CFI_ADJUST_CFA_OFFSET 8
1438 jmp general_protection 1362 jmp general_protection
1439 CFI_RESTORE_STATE 1363 CFI_RESTORE_STATE
14401: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ 13641: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
@@ -1444,11 +1368,223 @@ ENTRY(xen_failsafe_callback)
1444 CFI_RESTORE r11 1368 CFI_RESTORE r11
1445 addq $0x30,%rsp 1369 addq $0x30,%rsp
1446 CFI_ADJUST_CFA_OFFSET -0x30 1370 CFI_ADJUST_CFA_OFFSET -0x30
1447 pushq $0 1371 pushq_cfi $0
1448 CFI_ADJUST_CFA_OFFSET 8
1449 SAVE_ALL 1372 SAVE_ALL
1450 jmp error_exit 1373 jmp error_exit
1451 CFI_ENDPROC 1374 CFI_ENDPROC
1452END(xen_failsafe_callback) 1375END(xen_failsafe_callback)
1453 1376
1454#endif /* CONFIG_XEN */ 1377#endif /* CONFIG_XEN */
1378
1379/*
1380 * Some functions should be protected against kprobes
1381 */
1382 .pushsection .kprobes.text, "ax"
1383
1384paranoidzeroentry_ist debug do_debug DEBUG_STACK
1385paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
1386paranoiderrorentry stack_segment do_stack_segment
1387errorentry general_protection do_general_protection
1388errorentry page_fault do_page_fault
1389#ifdef CONFIG_X86_MCE
1390paranoidzeroentry machine_check do_machine_check
1391#endif
1392
1393 /*
1394 * "Paranoid" exit path from exception stack.
1395 * Paranoid because this is used by NMIs and cannot take
1396 * any kernel state for granted.
1397 * We don't do kernel preemption checks here, because only
1398 * NMI should be common and it does not enable IRQs and
1399 * cannot get reschedule ticks.
1400 *
1401 * "trace" is 0 for the NMI handler only, because irq-tracing
1402 * is fundamentally NMI-unsafe. (we cannot change the soft and
1403 * hard flags at once, atomically)
1404 */
1405
1406 /* ebx: no swapgs flag */
1407ENTRY(paranoid_exit)
1408 INTR_FRAME
1409 DISABLE_INTERRUPTS(CLBR_NONE)
1410 TRACE_IRQS_OFF
1411 testl %ebx,%ebx /* swapgs needed? */
1412 jnz paranoid_restore
1413 testl $3,CS(%rsp)
1414 jnz paranoid_userspace
1415paranoid_swapgs:
1416 TRACE_IRQS_IRETQ 0
1417 SWAPGS_UNSAFE_STACK
1418paranoid_restore:
1419 RESTORE_ALL 8
1420 jmp irq_return
1421paranoid_userspace:
1422 GET_THREAD_INFO(%rcx)
1423 movl TI_flags(%rcx),%ebx
1424 andl $_TIF_WORK_MASK,%ebx
1425 jz paranoid_swapgs
1426 movq %rsp,%rdi /* &pt_regs */
1427 call sync_regs
1428 movq %rax,%rsp /* switch stack for scheduling */
1429 testl $_TIF_NEED_RESCHED,%ebx
1430 jnz paranoid_schedule
1431 movl %ebx,%edx /* arg3: thread flags */
1432 TRACE_IRQS_ON
1433 ENABLE_INTERRUPTS(CLBR_NONE)
1434 xorl %esi,%esi /* arg2: oldset */
1435 movq %rsp,%rdi /* arg1: &pt_regs */
1436 call do_notify_resume
1437 DISABLE_INTERRUPTS(CLBR_NONE)
1438 TRACE_IRQS_OFF
1439 jmp paranoid_userspace
1440paranoid_schedule:
1441 TRACE_IRQS_ON
1442 ENABLE_INTERRUPTS(CLBR_ANY)
1443 call schedule
1444 DISABLE_INTERRUPTS(CLBR_ANY)
1445 TRACE_IRQS_OFF
1446 jmp paranoid_userspace
1447 CFI_ENDPROC
1448END(paranoid_exit)
1449
1450/*
1451 * Exception entry point. This expects an error code/orig_rax on the stack.
1452 * returns in "no swapgs flag" in %ebx.
1453 */
1454ENTRY(error_entry)
1455 XCPT_FRAME
1456 CFI_ADJUST_CFA_OFFSET 15*8
1457 /* oldrax contains error code */
1458 cld
1459 movq_cfi rdi, RDI+8
1460 movq_cfi rsi, RSI+8
1461 movq_cfi rdx, RDX+8
1462 movq_cfi rcx, RCX+8
1463 movq_cfi rax, RAX+8
1464 movq_cfi r8, R8+8
1465 movq_cfi r9, R9+8
1466 movq_cfi r10, R10+8
1467 movq_cfi r11, R11+8
1468 movq_cfi rbx, RBX+8
1469 movq_cfi rbp, RBP+8
1470 movq_cfi r12, R12+8
1471 movq_cfi r13, R13+8
1472 movq_cfi r14, R14+8
1473 movq_cfi r15, R15+8
1474 xorl %ebx,%ebx
1475 testl $3,CS+8(%rsp)
1476 je error_kernelspace
1477error_swapgs:
1478 SWAPGS
1479error_sti:
1480 TRACE_IRQS_OFF
1481 ret
1482 CFI_ENDPROC
1483
1484/*
1485 * There are two places in the kernel that can potentially fault with
1486 * usergs. Handle them here. The exception handlers after iret run with
1487 * kernel gs again, so don't set the user space flag. B stepping K8s
1488 * sometimes report a truncated RIP for IRET exceptions returning to
1489 * compat mode. Check for these here too.
1490 */
1491error_kernelspace:
1492 incl %ebx
1493 leaq irq_return(%rip),%rcx
1494 cmpq %rcx,RIP+8(%rsp)
1495 je error_swapgs
1496 movl %ecx,%ecx /* zero extend */
1497 cmpq %rcx,RIP+8(%rsp)
1498 je error_swapgs
1499 cmpq $gs_change,RIP+8(%rsp)
1500 je error_swapgs
1501 jmp error_sti
1502END(error_entry)
1503
1504
1505/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
1506ENTRY(error_exit)
1507 DEFAULT_FRAME
1508 movl %ebx,%eax
1509 RESTORE_REST
1510 DISABLE_INTERRUPTS(CLBR_NONE)
1511 TRACE_IRQS_OFF
1512 GET_THREAD_INFO(%rcx)
1513 testl %eax,%eax
1514 jne retint_kernel
1515 LOCKDEP_SYS_EXIT_IRQ
1516 movl TI_flags(%rcx),%edx
1517 movl $_TIF_WORK_MASK,%edi
1518 andl %edi,%edx
1519 jnz retint_careful
1520 jmp retint_swapgs
1521 CFI_ENDPROC
1522END(error_exit)
1523
1524
1525 /* runs on exception stack */
1526ENTRY(nmi)
1527 INTR_FRAME
1528 PARAVIRT_ADJUST_EXCEPTION_FRAME
1529 pushq_cfi $-1
1530 subq $15*8, %rsp
1531 CFI_ADJUST_CFA_OFFSET 15*8
1532 call save_paranoid
1533 DEFAULT_FRAME 0
1534 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1535 movq %rsp,%rdi
1536 movq $-1,%rsi
1537 call do_nmi
1538#ifdef CONFIG_TRACE_IRQFLAGS
1539 /* paranoidexit; without TRACE_IRQS_OFF */
1540 /* ebx: no swapgs flag */
1541 DISABLE_INTERRUPTS(CLBR_NONE)
1542 testl %ebx,%ebx /* swapgs needed? */
1543 jnz nmi_restore
1544 testl $3,CS(%rsp)
1545 jnz nmi_userspace
1546nmi_swapgs:
1547 SWAPGS_UNSAFE_STACK
1548nmi_restore:
1549 RESTORE_ALL 8
1550 jmp irq_return
1551nmi_userspace:
1552 GET_THREAD_INFO(%rcx)
1553 movl TI_flags(%rcx),%ebx
1554 andl $_TIF_WORK_MASK,%ebx
1555 jz nmi_swapgs
1556 movq %rsp,%rdi /* &pt_regs */
1557 call sync_regs
1558 movq %rax,%rsp /* switch stack for scheduling */
1559 testl $_TIF_NEED_RESCHED,%ebx
1560 jnz nmi_schedule
1561 movl %ebx,%edx /* arg3: thread flags */
1562 ENABLE_INTERRUPTS(CLBR_NONE)
1563 xorl %esi,%esi /* arg2: oldset */
1564 movq %rsp,%rdi /* arg1: &pt_regs */
1565 call do_notify_resume
1566 DISABLE_INTERRUPTS(CLBR_NONE)
1567 jmp nmi_userspace
1568nmi_schedule:
1569 ENABLE_INTERRUPTS(CLBR_ANY)
1570 call schedule
1571 DISABLE_INTERRUPTS(CLBR_ANY)
1572 jmp nmi_userspace
1573 CFI_ENDPROC
1574#else
1575 jmp paranoid_exit
1576 CFI_ENDPROC
1577#endif
1578END(nmi)
1579
1580ENTRY(ignore_sysret)
1581 CFI_STARTPROC
1582 mov $-ENOSYS,%eax
1583 sysret
1584 CFI_ENDPROC
1585END(ignore_sysret)
1586
1587/*
1588 * End of kprobes section
1589 */
1590 .popsection
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
index f454c78fcef6..55515d73d9c2 100644
--- a/arch/x86/kernel/es7000_32.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -38,8 +38,10 @@
38#include <asm/io.h> 38#include <asm/io.h>
39#include <asm/nmi.h> 39#include <asm/nmi.h>
40#include <asm/smp.h> 40#include <asm/smp.h>
41#include <asm/atomic.h>
41#include <asm/apicdef.h> 42#include <asm/apicdef.h>
42#include <mach_mpparse.h> 43#include <asm/genapic.h>
44#include <asm/setup.h>
43 45
44/* 46/*
45 * ES7000 chipsets 47 * ES7000 chipsets
@@ -161,6 +163,39 @@ es7000_rename_gsi(int ioapic, int gsi)
161 return gsi; 163 return gsi;
162} 164}
163 165
166static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
167{
168 unsigned long vect = 0, psaival = 0;
169
170 if (psai == NULL)
171 return -1;
172
173 vect = ((unsigned long)__pa(eip)/0x1000) << 16;
174 psaival = (0x1000000 | vect | cpu);
175
176 while (*psai & 0x1000000)
177 ;
178
179 *psai = psaival;
180
181 return 0;
182}
183
184static int __init es7000_update_genapic(void)
185{
186 apic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
187
188 /* MPENTIUMIII */
189 if (boot_cpu_data.x86 == 6 &&
190 (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11)) {
191 es7000_update_genapic_to_cluster();
192 apic->wait_for_init_deassert = NULL;
193 apic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
194 }
195
196 return 0;
197}
198
164void __init 199void __init
165setup_unisys(void) 200setup_unisys(void)
166{ 201{
@@ -176,6 +211,8 @@ setup_unisys(void)
176 else 211 else
177 es7000_plat = ES7000_CLASSIC; 212 es7000_plat = ES7000_CLASSIC;
178 ioapic_renumber_irq = es7000_rename_gsi; 213 ioapic_renumber_irq = es7000_rename_gsi;
214
215 x86_quirks->update_genapic = es7000_update_genapic;
179} 216}
180 217
181/* 218/*
@@ -324,40 +361,449 @@ es7000_mip_write(struct mip_reg *mip_reg)
324 return status; 361 return status;
325} 362}
326 363
327int 364void __init es7000_enable_apic_mode(void)
328es7000_start_cpu(int cpu, unsigned long eip)
329{ 365{
330 unsigned long vect = 0, psaival = 0; 366 struct mip_reg es7000_mip_reg;
367 int mip_status;
331 368
332 if (psai == NULL) 369 if (!es7000_plat)
333 return -1; 370 return;
334 371
335 vect = ((unsigned long)__pa(eip)/0x1000) << 16; 372 printk("ES7000: Enabling APIC mode.\n");
336 psaival = (0x1000000 | vect | cpu); 373 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
374 es7000_mip_reg.off_0 = MIP_SW_APIC;
375 es7000_mip_reg.off_38 = MIP_VALID;
337 376
338 while (*psai & 0x1000000) 377 while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0) {
339 ; 378 printk("es7000_enable_apic_mode: command failed, status = %x\n",
379 mip_status);
380 }
381}
382
383/*
384 * APIC driver for the Unisys ES7000 chipset.
385 */
386#define APIC_DEFINITION 1
387#include <linux/threads.h>
388#include <linux/cpumask.h>
389#include <asm/mpspec.h>
390#include <asm/genapic.h>
391#include <asm/fixmap.h>
392#include <asm/apicdef.h>
393#include <linux/kernel.h>
394#include <linux/string.h>
395#include <linux/init.h>
396#include <linux/acpi.h>
397#include <linux/smp.h>
398#include <asm/ipi.h>
399
400#define APIC_DFR_VALUE_CLUSTER (APIC_DFR_CLUSTER)
401#define INT_DELIVERY_MODE_CLUSTER (dest_LowestPrio)
402#define INT_DEST_MODE_CLUSTER (1) /* logical delivery broadcast to all procs */
403
404#define APIC_DFR_VALUE (APIC_DFR_FLAT)
405
406extern void es7000_enable_apic_mode(void);
407extern int apic_version [MAX_APICS];
408extern u8 cpu_2_logical_apicid[];
409extern unsigned int boot_cpu_physical_apicid;
410
411extern int parse_unisys_oem (char *oemptr);
412extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
413extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
414extern void setup_unisys(void);
415
416#define apicid_cluster(apicid) (apicid & 0xF0)
417#define xapic_phys_to_log_apicid(cpu) per_cpu(x86_bios_cpu_apicid, cpu)
418
419static void es7000_vector_allocation_domain(int cpu, cpumask_t *retmask)
420{
421 /* Careful. Some cpus do not strictly honor the set of cpus
422 * specified in the interrupt destination when using lowest
423 * priority interrupt delivery mode.
424 *
425 * In particular there was a hyperthreading cpu observed to
426 * deliver interrupts to the wrong hyperthread when only one
427 * hyperthread was specified in the interrupt destination.
428 */
429 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
430}
340 431
341 *psai = psaival;
342 432
433static void es7000_wait_for_init_deassert(atomic_t *deassert)
434{
435#ifndef CONFIG_ES7000_CLUSTERED_APIC
436 while (!atomic_read(deassert))
437 cpu_relax();
438#endif
439 return;
440}
441
442static unsigned int es7000_get_apic_id(unsigned long x)
443{
444 return (x >> 24) & 0xFF;
445}
446
447#ifdef CONFIG_ACPI
448static int es7000_check_dsdt(void)
449{
450 struct acpi_table_header header;
451
452 if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
453 !strncmp(header.oem_id, "UNISYS", 6))
454 return 1;
343 return 0; 455 return 0;
456}
457#endif
344 458
459static void es7000_send_IPI_mask(const struct cpumask *mask, int vector)
460{
461 default_send_IPI_mask_sequence_phys(mask, vector);
345} 462}
346 463
347void __init 464static void es7000_send_IPI_allbutself(int vector)
348es7000_sw_apic(void) 465{
349{ 466 default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
350 if (es7000_plat) { 467}
351 int mip_status; 468
352 struct mip_reg es7000_mip_reg; 469static void es7000_send_IPI_all(int vector)
353 470{
354 printk("ES7000: Enabling APIC mode.\n"); 471 es7000_send_IPI_mask(cpu_online_mask, vector);
355 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); 472}
356 es7000_mip_reg.off_0 = MIP_SW_APIC; 473
357 es7000_mip_reg.off_38 = (MIP_VALID); 474static int es7000_apic_id_registered(void)
358 while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0) 475{
359 printk("es7000_sw_apic: command failed, status = %x\n", 476 return 1;
360 mip_status); 477}
361 return; 478
479static const cpumask_t *target_cpus_cluster(void)
480{
481 return &CPU_MASK_ALL;
482}
483
484static const cpumask_t *es7000_target_cpus(void)
485{
486 return &cpumask_of_cpu(smp_processor_id());
487}
488
489static unsigned long
490es7000_check_apicid_used(physid_mask_t bitmap, int apicid)
491{
492 return 0;
493}
494static unsigned long es7000_check_apicid_present(int bit)
495{
496 return physid_isset(bit, phys_cpu_present_map);
497}
498
499static unsigned long calculate_ldr(int cpu)
500{
501 unsigned long id = xapic_phys_to_log_apicid(cpu);
502
503 return (SET_APIC_LOGICAL_ID(id));
504}
505
506/*
507 * Set up the logical destination ID.
508 *
509 * Intel recommends setting DFR, LDR and TPR before enabling
510 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
511 * document number 292116). So here it goes...
512 */
513static void es7000_init_apic_ldr_cluster(void)
514{
515 unsigned long val;
516 int cpu = smp_processor_id();
517
518 apic_write(APIC_DFR, APIC_DFR_VALUE_CLUSTER);
519 val = calculate_ldr(cpu);
520 apic_write(APIC_LDR, val);
521}
522
523static void es7000_init_apic_ldr(void)
524{
525 unsigned long val;
526 int cpu = smp_processor_id();
527
528 apic_write(APIC_DFR, APIC_DFR_VALUE);
529 val = calculate_ldr(cpu);
530 apic_write(APIC_LDR, val);
531}
532
533static void es7000_setup_apic_routing(void)
534{
535 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
536 printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
537 (apic_version[apic] == 0x14) ?
538 "Physical Cluster" : "Logical Cluster",
539 nr_ioapics, cpus_addr(*es7000_target_cpus())[0]);
540}
541
542static int es7000_apicid_to_node(int logical_apicid)
543{
544 return 0;
545}
546
547
548static int es7000_cpu_present_to_apicid(int mps_cpu)
549{
550 if (!mps_cpu)
551 return boot_cpu_physical_apicid;
552 else if (mps_cpu < nr_cpu_ids)
553 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
554 else
555 return BAD_APICID;
556}
557
558static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid)
559{
560 static int id = 0;
561 physid_mask_t mask;
562
563 mask = physid_mask_of_physid(id);
564 ++id;
565
566 return mask;
567}
568
569/* Mapping from cpu number to logical apicid */
570static int es7000_cpu_to_logical_apicid(int cpu)
571{
572#ifdef CONFIG_SMP
573 if (cpu >= nr_cpu_ids)
574 return BAD_APICID;
575 return (int)cpu_2_logical_apicid[cpu];
576#else
577 return logical_smp_processor_id();
578#endif
579}
580
581static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map)
582{
583 /* For clustered we don't have a good way to do this yet - hack */
584 return physids_promote(0xff);
585}
586
587static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
588{
589 boot_cpu_physical_apicid = read_apic_id();
590 return (1);
591}
592
593static unsigned int
594es7000_cpu_mask_to_apicid_cluster(const struct cpumask *cpumask)
595{
596 int cpus_found = 0;
597 int num_bits_set;
598 int apicid;
599 int cpu;
600
601 num_bits_set = cpumask_weight(cpumask);
602 /* Return id to all */
603 if (num_bits_set == nr_cpu_ids)
604 return 0xFF;
605 /*
606 * The cpus in the mask must all be in the same apic cluster. If they are
607 * not on the same apicid cluster, return the default value of target_cpus():
608 */
609 cpu = cpumask_first(cpumask);
610 apicid = es7000_cpu_to_logical_apicid(cpu);
611
612 while (cpus_found < num_bits_set) {
613 if (cpumask_test_cpu(cpu, cpumask)) {
614 int new_apicid = es7000_cpu_to_logical_apicid(cpu);
615
616 if (apicid_cluster(apicid) !=
617 apicid_cluster(new_apicid)) {
618 printk ("%s: Not a valid mask!\n", __func__);
619
620 return 0xFF;
621 }
622 apicid = new_apicid;
623 cpus_found++;
624 }
625 cpu++;
626 }
627 return apicid;
628}
629
630static unsigned int es7000_cpu_mask_to_apicid(const cpumask_t *cpumask)
631{
632 int cpus_found = 0;
633 int num_bits_set;
634 int apicid;
635 int cpu;
636
637 num_bits_set = cpus_weight(*cpumask);
638 /* Return id to all */
639 if (num_bits_set == nr_cpu_ids)
640 return es7000_cpu_to_logical_apicid(0);
641 /*
642 * The cpus in the mask must all be in the same apic cluster. If they are
643 * not on the same apicid cluster, return the default value of target_cpus():
644 */
645 cpu = first_cpu(*cpumask);
646 apicid = es7000_cpu_to_logical_apicid(cpu);
647 while (cpus_found < num_bits_set) {
648 if (cpu_isset(cpu, *cpumask)) {
649 int new_apicid = es7000_cpu_to_logical_apicid(cpu);
650
651 if (apicid_cluster(apicid) !=
652 apicid_cluster(new_apicid)) {
653 printk ("%s: Not a valid mask!\n", __func__);
654
655 return es7000_cpu_to_logical_apicid(0);
656 }
657 apicid = new_apicid;
658 cpus_found++;
659 }
660 cpu++;
362 } 661 }
662 return apicid;
363} 663}
664
665static unsigned int
666es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
667 const struct cpumask *andmask)
668{
669 int apicid = es7000_cpu_to_logical_apicid(0);
670 cpumask_var_t cpumask;
671
672 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
673 return apicid;
674
675 cpumask_and(cpumask, inmask, andmask);
676 cpumask_and(cpumask, cpumask, cpu_online_mask);
677 apicid = es7000_cpu_mask_to_apicid(cpumask);
678
679 free_cpumask_var(cpumask);
680
681 return apicid;
682}
683
684static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
685{
686 return cpuid_apic >> index_msb;
687}
688
689void __init es7000_update_genapic_to_cluster(void)
690{
691 apic->target_cpus = target_cpus_cluster;
692 apic->irq_delivery_mode = INT_DELIVERY_MODE_CLUSTER;
693 apic->irq_dest_mode = INT_DEST_MODE_CLUSTER;
694
695 apic->init_apic_ldr = es7000_init_apic_ldr_cluster;
696
697 apic->cpu_mask_to_apicid = es7000_cpu_mask_to_apicid_cluster;
698}
699
700static int probe_es7000(void)
701{
702 /* probed later in mptable/ACPI hooks */
703 return 0;
704}
705
706static __init int
707es7000_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
708{
709 if (mpc->oemptr) {
710 struct mpc_oemtable *oem_table =
711 (struct mpc_oemtable *)mpc->oemptr;
712
713 if (!strncmp(oem, "UNISYS", 6))
714 return parse_unisys_oem((char *)oem_table);
715 }
716 return 0;
717}
718
719#ifdef CONFIG_ACPI
720/* Hook from generic ACPI tables.c */
721static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
722{
723 unsigned long oem_addr = 0;
724 int check_dsdt;
725 int ret = 0;
726
727 /* check dsdt at first to avoid clear fix_map for oem_addr */
728 check_dsdt = es7000_check_dsdt();
729
730 if (!find_unisys_acpi_oem_table(&oem_addr)) {
731 if (check_dsdt)
732 ret = parse_unisys_oem((char *)oem_addr);
733 else {
734 setup_unisys();
735 ret = 1;
736 }
737 /*
738 * we need to unmap it
739 */
740 unmap_unisys_acpi_oem_table(oem_addr);
741 }
742 return ret;
743}
744#else
745static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
746{
747 return 0;
748}
749#endif
750
751
752struct genapic apic_es7000 = {
753
754 .name = "es7000",
755 .probe = probe_es7000,
756 .acpi_madt_oem_check = es7000_acpi_madt_oem_check,
757 .apic_id_registered = es7000_apic_id_registered,
758
759 .irq_delivery_mode = dest_Fixed,
760 /* phys delivery to target CPUs: */
761 .irq_dest_mode = 0,
762
763 .target_cpus = es7000_target_cpus,
764 .disable_esr = 1,
765 .dest_logical = 0,
766 .check_apicid_used = es7000_check_apicid_used,
767 .check_apicid_present = es7000_check_apicid_present,
768
769 .vector_allocation_domain = es7000_vector_allocation_domain,
770 .init_apic_ldr = es7000_init_apic_ldr,
771
772 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
773 .setup_apic_routing = es7000_setup_apic_routing,
774 .multi_timer_check = NULL,
775 .apicid_to_node = es7000_apicid_to_node,
776 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
777 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
778 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
779 .setup_portio_remap = NULL,
780 .check_phys_apicid_present = es7000_check_phys_apicid_present,
781 .enable_apic_mode = es7000_enable_apic_mode,
782 .phys_pkg_id = es7000_phys_pkg_id,
783 .mps_oem_check = es7000_mps_oem_check,
784
785 .get_apic_id = es7000_get_apic_id,
786 .set_apic_id = NULL,
787 .apic_id_mask = 0xFF << 24,
788
789 .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
790 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
791
792 .send_IPI_mask = es7000_send_IPI_mask,
793 .send_IPI_mask_allbutself = NULL,
794 .send_IPI_allbutself = es7000_send_IPI_allbutself,
795 .send_IPI_all = es7000_send_IPI_all,
796 .send_IPI_self = default_send_IPI_self,
797
798 .wakeup_cpu = NULL,
799
800 .trampoline_phys_low = 0x467,
801 .trampoline_phys_high = 0x469,
802
803 .wait_for_init_deassert = es7000_wait_for_init_deassert,
804
805 /* Nothing to do for most platforms, since cleared by the INIT cycle: */
806 .smp_callin_clear_local_apic = NULL,
807 .store_NMI_vector = NULL,
808 .inquire_remote_apic = default_inquire_remote_apic,
809};
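
es7000_cpu_mask_to_apicid() and its cluster variant above require every logical APIC ID in the mask to sit in a single APIC cluster, identified by bits 7:4 (apicid & 0xF0). A small standalone C illustration of that constraint (not part of the patch; the helper names and sample IDs are made up):

#include <stdio.h>

#define APICID_CLUSTER(id)	((id) & 0xF0)	/* same grouping as apicid_cluster() */

/* Returns 1 if all logical APIC IDs share one cluster, as the code above demands. */
static int same_cluster(const unsigned char *apicids, int n)
{
	int i;

	for (i = 1; i < n; i++)
		if (APICID_CLUSTER(apicids[i]) != APICID_CLUSTER(apicids[0]))
			return 0;
	return 1;
}

int main(void)
{
	unsigned char ok[]  = { 0x11, 0x12, 0x13 };	/* all in cluster 0x10 */
	unsigned char bad[] = { 0x11, 0x21 };		/* clusters 0x10 and 0x20 */

	printf("%d %d\n", same_cluster(ok, 3), same_cluster(bad, 2));
	return 0;
}
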
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 50ea0ac8c9bf..231bdd3c5b1c 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -14,14 +14,17 @@
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/sched.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/list.h> 19#include <linux/list.h>
19 20
20#include <asm/ftrace.h> 21#include <asm/ftrace.h>
22#include <linux/ftrace.h>
21#include <asm/nops.h> 23#include <asm/nops.h>
24#include <asm/nmi.h>
22 25
23 26
24static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; 27#ifdef CONFIG_DYNAMIC_FTRACE
25 28
26union ftrace_code_union { 29union ftrace_code_union {
27 char code[MCOUNT_INSN_SIZE]; 30 char code[MCOUNT_INSN_SIZE];
@@ -31,18 +34,12 @@ union ftrace_code_union {
31 } __attribute__((packed)); 34 } __attribute__((packed));
32}; 35};
33 36
34
35static int ftrace_calc_offset(long ip, long addr) 37static int ftrace_calc_offset(long ip, long addr)
36{ 38{
37 return (int)(addr - ip); 39 return (int)(addr - ip);
38} 40}
39 41
40unsigned char *ftrace_nop_replace(void) 42static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
41{
42 return ftrace_nop;
43}
44
45unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
46{ 43{
47 static union ftrace_code_union calc; 44 static union ftrace_code_union calc;
48 45
@@ -56,7 +53,142 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
56 return calc.code; 53 return calc.code;
57} 54}
58 55
59int 56/*
57 * Modifying code must take extra care. On an SMP machine, if
58 * the code being modified is also being executed on another CPU,
59 * that CPU will have undefined results and possibly take a GPF.
60 * We use kstop_machine to stop other CPUs from executing code.
61 * But this does not stop NMIs from happening. We still need
62 * to protect against that. We separate out the modification of
63 * the code to take care of this.
64 *
65 * Two buffers are added: An IP buffer and a "code" buffer.
66 *
67 * 1) Put the instruction pointer into the IP buffer
68 * and the new code into the "code" buffer.
69 * 2) Set a flag that says we are modifying code
70 * 3) Wait for any running NMIs to finish.
71 * 4) Write the code.
72 * 5) Clear the flag.
73 * 6) Wait for any running NMIs to finish.
74 *
75 * If an NMI is executed, the first thing it does is to call
76 * "ftrace_nmi_enter". This will check if the flag is set to write
77 * and if it is, it will write what is in the IP and "code" buffers.
78 *
79 * The trick is, it does not matter if everyone is writing the same
80 * content to the code location. Also, if a CPU is executing code
81 * it is OK to write to that code location if the contents being written
82 * are the same as what exists.
83 */
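
A rough, self-contained userspace model of the six-step handshake described above (not part of the patch): a plain thread stands in for the NMI handler and C11 seq_cst atomics stand in for the explicit barriers used in the code below. Both writers deliberately store identical bytes, which is the trick the comment relies on (strictly a data race in C11, tolerated here only for illustration).

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <string.h>

static atomic_int in_nmi;
static atomic_int mod_code_write;	/* "we are modifying code" flag */
static char code[8];			/* stands in for the patched text */
static const char *newcode;		/* stands in for mod_code_newcode */

static void nmi_enter(void)
{
	atomic_fetch_add(&in_nmi, 1);
	if (atomic_load(&mod_code_write))
		memcpy(code, newcode, sizeof(code));	/* NMI does the write too */
}

static void nmi_exit(void)
{
	atomic_fetch_sub(&in_nmi, 1);
}

static void wait_for_nmi(void)
{
	while (atomic_load(&in_nmi))
		;	/* spin, like cpu_relax() */
}

static void modify_code(const char *new)
{
	newcode = new;				/* 1) fill the buffers */
	atomic_store(&mod_code_write, 1);	/* 2) flag the pending write */
	wait_for_nmi();				/* 3) let running NMIs drain */
	memcpy(code, new, sizeof(code));	/* 4) write the code ourselves */
	atomic_store(&mod_code_write, 0);	/* 5) clear the flag */
	wait_for_nmi();				/* 6) drain again */
}

static void *fake_nmi(void *arg)
{
	(void)arg;
	nmi_enter();
	nmi_exit();
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, fake_nmi, NULL);
	modify_code("patched!");
	pthread_join(t, NULL);
	printf("%.8s\n", code);
	return 0;
}
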
84
85static atomic_t in_nmi = ATOMIC_INIT(0);
86static int mod_code_status; /* holds return value of text write */
87static int mod_code_write; /* set when NMI should do the write */
88static void *mod_code_ip; /* holds the IP to write to */
89static void *mod_code_newcode; /* holds the text to write to the IP */
90
91static unsigned nmi_wait_count;
92static atomic_t nmi_update_count = ATOMIC_INIT(0);
93
94int ftrace_arch_read_dyn_info(char *buf, int size)
95{
96 int r;
97
98 r = snprintf(buf, size, "%u %u",
99 nmi_wait_count,
100 atomic_read(&nmi_update_count));
101 return r;
102}
103
104static void ftrace_mod_code(void)
105{
106 /*
107 * Yes, more than one CPU can be writing to mod_code_status.
108 * (and the code itself)
109 * But if one were to fail, then they all should, and if one were
110 * to succeed, then they all should.
111 */
112 mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
113 MCOUNT_INSN_SIZE);
114}
115
116void ftrace_nmi_enter(void)
117{
118 atomic_inc(&in_nmi);
119 /* The in_nmi increment must be visible before we read the write flag */
120 smp_mb();
121 if (mod_code_write) {
122 ftrace_mod_code();
123 atomic_inc(&nmi_update_count);
124 }
125}
126
127void ftrace_nmi_exit(void)
128{
129 /* Finish all executions before clearing in_nmi */
130 smp_wmb();
131 atomic_dec(&in_nmi);
132}
133
134static void wait_for_nmi(void)
135{
136 int waited = 0;
137
138 while (atomic_read(&in_nmi)) {
139 waited = 1;
140 cpu_relax();
141 }
142
143 if (waited)
144 nmi_wait_count++;
145}
146
147static int
148do_ftrace_mod_code(unsigned long ip, void *new_code)
149{
150 mod_code_ip = (void *)ip;
151 mod_code_newcode = new_code;
152
153 /* The buffers need to be visible before we let NMIs write them */
154 smp_wmb();
155
156 mod_code_write = 1;
157
158 /* Make sure write bit is visible before we wait on NMIs */
159 smp_mb();
160
161 wait_for_nmi();
162
163 /* Make sure all running NMIs have finished before we write the code */
164 smp_mb();
165
166 ftrace_mod_code();
167
168 /* Make sure the write happens before clearing the bit */
169 smp_wmb();
170
171 mod_code_write = 0;
172
173 /* make sure NMIs see the cleared bit */
174 smp_mb();
175
176 wait_for_nmi();
177
178 return mod_code_status;
179}
180
181
182
183
184static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
185
186static unsigned char *ftrace_nop_replace(void)
187{
188 return ftrace_nop;
189}
190
191static int
60ftrace_modify_code(unsigned long ip, unsigned char *old_code, 192ftrace_modify_code(unsigned long ip, unsigned char *old_code,
61 unsigned char *new_code) 193 unsigned char *new_code)
62{ 194{
@@ -81,7 +213,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
81 return -EINVAL; 213 return -EINVAL;
82 214
83 /* replace the text with the new text */ 215 /* replace the text with the new text */
84 if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) 216 if (do_ftrace_mod_code(ip, new_code))
85 return -EPERM; 217 return -EPERM;
86 218
87 sync_core(); 219 sync_core();
@@ -89,6 +221,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
89 return 0; 221 return 0;
90} 222}
91 223
224int ftrace_make_nop(struct module *mod,
225 struct dyn_ftrace *rec, unsigned long addr)
226{
227 unsigned char *new, *old;
228 unsigned long ip = rec->ip;
229
230 old = ftrace_call_replace(ip, addr);
231 new = ftrace_nop_replace();
232
233 return ftrace_modify_code(rec->ip, old, new);
234}
235
236int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
237{
238 unsigned char *new, *old;
239 unsigned long ip = rec->ip;
240
241 old = ftrace_nop_replace();
242 new = ftrace_call_replace(ip, addr);
243
244 return ftrace_modify_code(rec->ip, old, new);
245}
246
92int ftrace_update_ftrace_func(ftrace_func_t func) 247int ftrace_update_ftrace_func(ftrace_func_t func)
93{ 248{
94 unsigned long ip = (unsigned long)(&ftrace_call); 249 unsigned long ip = (unsigned long)(&ftrace_call);
@@ -165,3 +320,219 @@ int __init ftrace_dyn_arch_init(void *data)
165 320
166 return 0; 321 return 0;
167} 322}
323#endif
324
325#ifdef CONFIG_FUNCTION_GRAPH_TRACER
326
327#ifdef CONFIG_DYNAMIC_FTRACE
328extern void ftrace_graph_call(void);
329
330static int ftrace_mod_jmp(unsigned long ip,
331 int old_offset, int new_offset)
332{
333 unsigned char code[MCOUNT_INSN_SIZE];
334
335 if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
336 return -EFAULT;
337
338 if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
339 return -EINVAL;
340
341 *(int *)(&code[1]) = new_offset;
342
343 if (do_ftrace_mod_code(ip, &code))
344 return -EPERM;
345
346 return 0;
347}
348
349int ftrace_enable_ftrace_graph_caller(void)
350{
351 unsigned long ip = (unsigned long)(&ftrace_graph_call);
352 int old_offset, new_offset;
353
354 old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
355 new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
356
357 return ftrace_mod_jmp(ip, old_offset, new_offset);
358}
359
360int ftrace_disable_ftrace_graph_caller(void)
361{
362 unsigned long ip = (unsigned long)(&ftrace_graph_call);
363 int old_offset, new_offset;
364
365 old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
366 new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
367
368 return ftrace_mod_jmp(ip, old_offset, new_offset);
369}
370
371#else /* CONFIG_DYNAMIC_FTRACE */
372
373/*
374 * These functions are picked from those used on
375 * this page for dynamic ftrace. They have been
376 * simplified to ignore all traces in NMI context.
377 */
378static atomic_t in_nmi;
379
380void ftrace_nmi_enter(void)
381{
382 atomic_inc(&in_nmi);
383}
384
385void ftrace_nmi_exit(void)
386{
387 atomic_dec(&in_nmi);
388}
389
390#endif /* !CONFIG_DYNAMIC_FTRACE */
391
392/* Add a function return address to the trace stack on thread info. */
393static int push_return_trace(unsigned long ret, unsigned long long time,
394 unsigned long func, int *depth)
395{
396 int index;
397
398 if (!current->ret_stack)
399 return -EBUSY;
400
401 /* The return trace stack is full */
402 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
403 atomic_inc(&current->trace_overrun);
404 return -EBUSY;
405 }
406
407 index = ++current->curr_ret_stack;
408 barrier();
409 current->ret_stack[index].ret = ret;
410 current->ret_stack[index].func = func;
411 current->ret_stack[index].calltime = time;
412 *depth = index;
413
414 return 0;
415}
416
417/* Retrieve a function return address from the trace stack on thread info. */
418static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
419{
420 int index;
421
422 index = current->curr_ret_stack;
423
424 if (unlikely(index < 0)) {
425 ftrace_graph_stop();
426 WARN_ON(1);
427 /* Might as well panic, otherwise we have nowhere to go */
428 *ret = (unsigned long)panic;
429 return;
430 }
431
432 *ret = current->ret_stack[index].ret;
433 trace->func = current->ret_stack[index].func;
434 trace->calltime = current->ret_stack[index].calltime;
435 trace->overrun = atomic_read(&current->trace_overrun);
436 trace->depth = index;
437 barrier();
438 current->curr_ret_stack--;
439
440}
441
442/*
443 * Send the trace to the ring-buffer.
444 * @return the original return address.
445 */
446unsigned long ftrace_return_to_handler(void)
447{
448 struct ftrace_graph_ret trace;
449 unsigned long ret;
450
451 pop_return_trace(&trace, &ret);
452 trace.rettime = cpu_clock(raw_smp_processor_id());
453 ftrace_graph_return(&trace);
454
455 if (unlikely(!ret)) {
456 ftrace_graph_stop();
457 WARN_ON(1);
458 /* Might as well panic. What else to do? */
459 ret = (unsigned long)panic;
460 }
461
462 return ret;
463}
464
465/*
466 * Hook the return address and push it in the stack of return addrs
467 * in current thread info.
468 */
469void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
470{
471 unsigned long old;
472 unsigned long long calltime;
473 int faulted;
474 struct ftrace_graph_ent trace;
475 unsigned long return_hooker = (unsigned long)
476 &return_to_handler;
477
478 /* NMIs are currently unsupported */
479 if (unlikely(atomic_read(&in_nmi)))
480 return;
481
482 if (unlikely(atomic_read(&current->tracing_graph_pause)))
483 return;
484
485 /*
486 * Protect against a fault, even if it shouldn't
487 * happen. This tool is too intrusive to
488 * ignore such a protection.
489 */
490 asm volatile(
491 "1: " _ASM_MOV " (%[parent]), %[old]\n"
492 "2: " _ASM_MOV " %[return_hooker], (%[parent])\n"
493 " movl $0, %[faulted]\n"
494 "3:\n"
495
496 ".section .fixup, \"ax\"\n"
497 "4: movl $1, %[faulted]\n"
498 " jmp 3b\n"
499 ".previous\n"
500
501 _ASM_EXTABLE(1b, 4b)
502 _ASM_EXTABLE(2b, 4b)
503
504 : [old] "=r" (old), [faulted] "=r" (faulted)
505 : [parent] "r" (parent), [return_hooker] "r" (return_hooker)
506 : "memory"
507 );
508
509 if (unlikely(faulted)) {
510 ftrace_graph_stop();
511 WARN_ON(1);
512 return;
513 }
514
515 if (unlikely(!__kernel_text_address(old))) {
516 ftrace_graph_stop();
517 *parent = old;
518 WARN_ON(1);
519 return;
520 }
521
522 calltime = cpu_clock(raw_smp_processor_id());
523
524 if (push_return_trace(old, calltime,
525 self_addr, &trace.depth) == -EBUSY) {
526 *parent = old;
527 return;
528 }
529
530 trace.func = self_addr;
531
532 /* Only trace if the calling function expects to */
533 if (!ftrace_graph_entry(&trace)) {
534 current->curr_ret_stack--;
535 *parent = old;
536 }
537}
538#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
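
prepare_ftrace_return() above swaps the real return address for return_to_handler and parks the original on a per-task stack; ftrace_return_to_handler() later pops it back. A toy C model of that push/pop bookkeeping (not part of the patch; the depth, field set and helper names are simplified assumptions):

#include <stdio.h>

#define RET_STACK_DEPTH 4	/* stand-in for FTRACE_RETFUNC_DEPTH */

struct ret_entry {
	unsigned long ret;	/* original return address */
	unsigned long func;	/* traced function */
};

static struct ret_entry ret_stack[RET_STACK_DEPTH];
static int curr_ret = -1;	/* mirrors current->curr_ret_stack */

static int push_ret(unsigned long ret, unsigned long func)
{
	if (curr_ret == RET_STACK_DEPTH - 1)
		return -1;		/* full: the real code counts an overrun */
	curr_ret++;
	ret_stack[curr_ret].ret = ret;
	ret_stack[curr_ret].func = func;
	return 0;
}

static unsigned long pop_ret(void)
{
	unsigned long ret = ret_stack[curr_ret].ret;

	curr_ret--;
	return ret;		/* hand the original address back to the exit hook */
}

int main(void)
{
	push_ret(0x1000, 0x2000);		/* entry hook saves the real return */
	printf("return to %#lx\n", pop_ret());	/* exit hook restores it */
	return 0;
}
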
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 6c9bfc9e1e95..820dea5d0ebe 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -21,6 +21,7 @@
21#include <asm/smp.h> 21#include <asm/smp.h>
22#include <asm/ipi.h> 22#include <asm/ipi.h>
23#include <asm/genapic.h> 23#include <asm/genapic.h>
24#include <asm/setup.h>
24 25
25extern struct genapic apic_flat; 26extern struct genapic apic_flat;
26extern struct genapic apic_physflat; 27extern struct genapic apic_physflat;
@@ -28,10 +29,12 @@ extern struct genapic apic_x2xpic_uv_x;
28extern struct genapic apic_x2apic_phys; 29extern struct genapic apic_x2apic_phys;
29extern struct genapic apic_x2apic_cluster; 30extern struct genapic apic_x2apic_cluster;
30 31
31struct genapic __read_mostly *genapic = &apic_flat; 32struct genapic __read_mostly *apic = &apic_flat;
32 33
33static struct genapic *apic_probe[] __initdata = { 34static struct genapic *apic_probe[] __initdata = {
35#ifdef CONFIG_X86_UV
34 &apic_x2apic_uv_x, 36 &apic_x2apic_uv_x,
37#endif
35 &apic_x2apic_phys, 38 &apic_x2apic_phys,
36 &apic_x2apic_cluster, 39 &apic_x2apic_cluster,
37 &apic_physflat, 40 &apic_physflat,
@@ -41,36 +44,39 @@ static struct genapic *apic_probe[] __initdata = {
41/* 44/*
42 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. 45 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
43 */ 46 */
44void __init setup_apic_routing(void) 47void __init default_setup_apic_routing(void)
45{ 48{
46 if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) { 49 if (apic == &apic_x2apic_phys || apic == &apic_x2apic_cluster) {
47 if (!intr_remapping_enabled) 50 if (!intr_remapping_enabled)
48 genapic = &apic_flat; 51 apic = &apic_flat;
49 } 52 }
50 53
51 if (genapic == &apic_flat) { 54 if (apic == &apic_flat) {
52 if (max_physical_apicid >= 8) 55 if (max_physical_apicid >= 8)
53 genapic = &apic_physflat; 56 apic = &apic_physflat;
54 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); 57 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
55 } 58 }
59
60 if (x86_quirks->update_genapic)
61 x86_quirks->update_genapic();
56} 62}
57 63
58/* Same for both flat and physical. */ 64/* Same for both flat and physical. */
59 65
60void apic_send_IPI_self(int vector) 66void apic_send_IPI_self(int vector)
61{ 67{
62 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); 68 __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
63} 69}
64 70
65int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) 71int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
66{ 72{
67 int i; 73 int i;
68 74
69 for (i = 0; apic_probe[i]; ++i) { 75 for (i = 0; apic_probe[i]; ++i) {
70 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { 76 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
71 genapic = apic_probe[i]; 77 apic = apic_probe[i];
72 printk(KERN_INFO "Setting APIC routing to %s.\n", 78 printk(KERN_INFO "Setting APIC routing to %s.\n",
73 genapic->name); 79 apic->name);
74 return 1; 80 return 1;
75 } 81 }
76 } 82 }
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index c0262791bda4..249d2d3c034c 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -19,7 +19,6 @@
19#include <asm/smp.h> 19#include <asm/smp.h>
20#include <asm/ipi.h> 20#include <asm/ipi.h>
21#include <asm/genapic.h> 21#include <asm/genapic.h>
22#include <mach_apicdef.h>
23 22
24#ifdef CONFIG_ACPI 23#ifdef CONFIG_ACPI
25#include <acpi/acpi_bus.h> 24#include <acpi/acpi_bus.h>
@@ -30,12 +29,12 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
30 return 1; 29 return 1;
31} 30}
32 31
33static cpumask_t flat_target_cpus(void) 32static const struct cpumask *flat_target_cpus(void)
34{ 33{
35 return cpu_online_map; 34 return cpu_online_mask;
36} 35}
37 36
38static cpumask_t flat_vector_allocation_domain(int cpu) 37static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
39{ 38{
40 /* Careful. Some cpus do not strictly honor the set of cpus 39 /* Careful. Some cpus do not strictly honor the set of cpus
41 * specified in the interrupt destination when using lowest 40 * specified in the interrupt destination when using lowest
@@ -45,8 +44,8 @@ static cpumask_t flat_vector_allocation_domain(int cpu)
45 * deliver interrupts to the wrong hyperthread when only one 44 * deliver interrupts to the wrong hyperthread when only one
46 * hyperthread was specified in the interrupt destination. 45 * hyperthread was specified in the interrupt destination.
47 */ 46 */
48 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; 47 cpumask_clear(retmask);
49 return domain; 48 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
50} 49}
51 50
52/* 51/*
@@ -69,48 +68,73 @@ static void flat_init_apic_ldr(void)
69 apic_write(APIC_LDR, val); 68 apic_write(APIC_LDR, val);
70} 69}
71 70
72static void flat_send_IPI_mask(cpumask_t cpumask, int vector) 71static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
73{ 72{
74 unsigned long mask = cpus_addr(cpumask)[0];
75 unsigned long flags; 73 unsigned long flags;
76 74
77 local_irq_save(flags); 75 local_irq_save(flags);
78 __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); 76 __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
79 local_irq_restore(flags); 77 local_irq_restore(flags);
80} 78}
81 79
80static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
81{
82 unsigned long mask = cpumask_bits(cpumask)[0];
83
84 _flat_send_IPI_mask(mask, vector);
85}
86
87static void
88 flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
89{
90 unsigned long mask = cpumask_bits(cpumask)[0];
91 int cpu = smp_processor_id();
92
93 if (cpu < BITS_PER_LONG)
94 clear_bit(cpu, &mask);
95
96 _flat_send_IPI_mask(mask, vector);
97}
98
82static void flat_send_IPI_allbutself(int vector) 99static void flat_send_IPI_allbutself(int vector)
83{ 100{
101 int cpu = smp_processor_id();
84#ifdef CONFIG_HOTPLUG_CPU 102#ifdef CONFIG_HOTPLUG_CPU
85 int hotplug = 1; 103 int hotplug = 1;
86#else 104#else
87 int hotplug = 0; 105 int hotplug = 0;
88#endif 106#endif
89 if (hotplug || vector == NMI_VECTOR) { 107 if (hotplug || vector == NMI_VECTOR) {
90 cpumask_t allbutme = cpu_online_map; 108 if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
109 unsigned long mask = cpumask_bits(cpu_online_mask)[0];
91 110
92 cpu_clear(smp_processor_id(), allbutme); 111 if (cpu < BITS_PER_LONG)
112 clear_bit(cpu, &mask);
93 113
94 if (!cpus_empty(allbutme)) 114 _flat_send_IPI_mask(mask, vector);
95 flat_send_IPI_mask(allbutme, vector); 115 }
96 } else if (num_online_cpus() > 1) { 116 } else if (num_online_cpus() > 1) {
97 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); 117 __default_send_IPI_shortcut(APIC_DEST_ALLBUT,
118 vector, apic->dest_logical);
98 } 119 }
99} 120}
100 121
101static void flat_send_IPI_all(int vector) 122static void flat_send_IPI_all(int vector)
102{ 123{
103 if (vector == NMI_VECTOR) 124 if (vector == NMI_VECTOR) {
104 flat_send_IPI_mask(cpu_online_map, vector); 125 flat_send_IPI_mask(cpu_online_mask, vector);
105 else 126 } else {
106 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); 127 __default_send_IPI_shortcut(APIC_DEST_ALLINC,
128 vector, apic->dest_logical);
129 }
107} 130}
108 131
109static unsigned int get_apic_id(unsigned long x) 132static unsigned int flat_get_apic_id(unsigned long x)
110{ 133{
111 unsigned int id; 134 unsigned int id;
112 135
113 id = (((x)>>24) & 0xFFu); 136 id = (((x)>>24) & 0xFFu);
137
114 return id; 138 return id;
115} 139}
116 140
@@ -126,7 +150,7 @@ static unsigned int read_xapic_id(void)
126{ 150{
127 unsigned int id; 151 unsigned int id;
128 152
129 id = get_apic_id(apic_read(APIC_ID)); 153 id = flat_get_apic_id(apic_read(APIC_ID));
130 return id; 154 return id;
131} 155}
132 156
@@ -135,34 +159,76 @@ static int flat_apic_id_registered(void)
135 return physid_isset(read_xapic_id(), phys_cpu_present_map); 159 return physid_isset(read_xapic_id(), phys_cpu_present_map);
136} 160}
137 161
138static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) 162static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask)
163{
164 return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
165}
166
167static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
168 const struct cpumask *andmask)
139{ 169{
140 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; 170 unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
171 unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS;
172
173 return mask1 & mask2;
141} 174}
142 175
143static unsigned int phys_pkg_id(int index_msb) 176static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
144{ 177{
145 return hard_smp_processor_id() >> index_msb; 178 return hard_smp_processor_id() >> index_msb;
146} 179}
147 180
148struct genapic apic_flat = { 181struct genapic apic_flat = {
149 .name = "flat", 182 .name = "flat",
150 .acpi_madt_oem_check = flat_acpi_madt_oem_check, 183 .probe = NULL,
151 .int_delivery_mode = dest_LowestPrio, 184 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
152 .int_dest_mode = (APIC_DEST_LOGICAL != 0), 185 .apic_id_registered = flat_apic_id_registered,
153 .target_cpus = flat_target_cpus, 186
154 .vector_allocation_domain = flat_vector_allocation_domain, 187 .irq_delivery_mode = dest_LowestPrio,
155 .apic_id_registered = flat_apic_id_registered, 188 .irq_dest_mode = 1, /* logical */
156 .init_apic_ldr = flat_init_apic_ldr, 189
157 .send_IPI_all = flat_send_IPI_all, 190 .target_cpus = flat_target_cpus,
158 .send_IPI_allbutself = flat_send_IPI_allbutself, 191 .disable_esr = 0,
159 .send_IPI_mask = flat_send_IPI_mask, 192 .dest_logical = APIC_DEST_LOGICAL,
160 .send_IPI_self = apic_send_IPI_self, 193 .check_apicid_used = NULL,
161 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, 194 .check_apicid_present = NULL,
162 .phys_pkg_id = phys_pkg_id, 195
163 .get_apic_id = get_apic_id, 196 .vector_allocation_domain = flat_vector_allocation_domain,
164 .set_apic_id = set_apic_id, 197 .init_apic_ldr = flat_init_apic_ldr,
165 .apic_id_mask = (0xFFu<<24), 198
199 .ioapic_phys_id_map = NULL,
200 .setup_apic_routing = NULL,
201 .multi_timer_check = NULL,
202 .apicid_to_node = NULL,
203 .cpu_to_logical_apicid = NULL,
204 .cpu_present_to_apicid = default_cpu_present_to_apicid,
205 .apicid_to_cpu_present = NULL,
206 .setup_portio_remap = NULL,
207 .check_phys_apicid_present = default_check_phys_apicid_present,
208 .enable_apic_mode = NULL,
209 .phys_pkg_id = flat_phys_pkg_id,
210 .mps_oem_check = NULL,
211
212 .get_apic_id = flat_get_apic_id,
213 .set_apic_id = set_apic_id,
214 .apic_id_mask = 0xFFu << 24,
215
216 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
217 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
218
219 .send_IPI_mask = flat_send_IPI_mask,
220 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
221 .send_IPI_allbutself = flat_send_IPI_allbutself,
222 .send_IPI_all = flat_send_IPI_all,
223 .send_IPI_self = apic_send_IPI_self,
224
225 .wakeup_cpu = NULL,
226 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
227 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
228 .wait_for_init_deassert = NULL,
229 .smp_callin_clear_local_apic = NULL,
230 .store_NMI_vector = NULL,
231 .inquire_remote_apic = NULL,
166}; 232};
167 233
168/* 234/*
@@ -188,35 +254,39 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
188 return 0; 254 return 0;
189} 255}
190 256
191static cpumask_t physflat_target_cpus(void) 257static const struct cpumask *physflat_target_cpus(void)
192{ 258{
193 return cpu_online_map; 259 return cpu_online_mask;
194} 260}
195 261
196static cpumask_t physflat_vector_allocation_domain(int cpu) 262static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
197{ 263{
198 return cpumask_of_cpu(cpu); 264 cpumask_clear(retmask);
265 cpumask_set_cpu(cpu, retmask);
199} 266}
200 267
201static void physflat_send_IPI_mask(cpumask_t cpumask, int vector) 268static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
202{ 269{
203 send_IPI_mask_sequence(cpumask, vector); 270 default_send_IPI_mask_sequence_phys(cpumask, vector);
204} 271}
205 272
206static void physflat_send_IPI_allbutself(int vector) 273static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
274 int vector)
207{ 275{
208 cpumask_t allbutme = cpu_online_map; 276 default_send_IPI_mask_allbutself_phys(cpumask, vector);
277}
209 278
210 cpu_clear(smp_processor_id(), allbutme); 279static void physflat_send_IPI_allbutself(int vector)
211 physflat_send_IPI_mask(allbutme, vector); 280{
281 default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
212} 282}
213 283
214static void physflat_send_IPI_all(int vector) 284static void physflat_send_IPI_all(int vector)
215{ 285{
216 physflat_send_IPI_mask(cpu_online_map, vector); 286 physflat_send_IPI_mask(cpu_online_mask, vector);
217} 287}
218 288
219static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) 289static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
220{ 290{
221 int cpu; 291 int cpu;
222 292
@@ -224,29 +294,84 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
224 * We're using fixed IRQ delivery, can only return one phys APIC ID. 294 * We're using fixed IRQ delivery, can only return one phys APIC ID.
225 * May as well be the first. 295 * May as well be the first.
226 */ 296 */
227 cpu = first_cpu(cpumask); 297 cpu = cpumask_first(cpumask);
228 if ((unsigned)cpu < nr_cpu_ids) 298 if ((unsigned)cpu < nr_cpu_ids)
229 return per_cpu(x86_cpu_to_apicid, cpu); 299 return per_cpu(x86_cpu_to_apicid, cpu);
230 else 300 else
231 return BAD_APICID; 301 return BAD_APICID;
232} 302}
233 303
304static unsigned int
305physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
306 const struct cpumask *andmask)
307{
308 int cpu;
309
310 /*
311 * We're using fixed IRQ delivery, can only return one phys APIC ID.
312 * May as well be the first.
313 */
314 for_each_cpu_and(cpu, cpumask, andmask) {
315 if (cpumask_test_cpu(cpu, cpu_online_mask))
316 break;
317 }
318 if (cpu < nr_cpu_ids)
319 return per_cpu(x86_cpu_to_apicid, cpu);
320
321 return BAD_APICID;
322}
323
234struct genapic apic_physflat = { 324struct genapic apic_physflat = {
235 .name = "physical flat", 325
236 .acpi_madt_oem_check = physflat_acpi_madt_oem_check, 326 .name = "physical flat",
237 .int_delivery_mode = dest_Fixed, 327 .probe = NULL,
238 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 328 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
239 .target_cpus = physflat_target_cpus, 329 .apic_id_registered = flat_apic_id_registered,
240 .vector_allocation_domain = physflat_vector_allocation_domain, 330
241 .apic_id_registered = flat_apic_id_registered, 331 .irq_delivery_mode = dest_Fixed,
242 .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ 332 .irq_dest_mode = 0, /* physical */
243 .send_IPI_all = physflat_send_IPI_all, 333
244 .send_IPI_allbutself = physflat_send_IPI_allbutself, 334 .target_cpus = physflat_target_cpus,
245 .send_IPI_mask = physflat_send_IPI_mask, 335 .disable_esr = 0,
246 .send_IPI_self = apic_send_IPI_self, 336 .dest_logical = 0,
247 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, 337 .check_apicid_used = NULL,
248 .phys_pkg_id = phys_pkg_id, 338 .check_apicid_present = NULL,
249 .get_apic_id = get_apic_id, 339
250 .set_apic_id = set_apic_id, 340 .vector_allocation_domain = physflat_vector_allocation_domain,
251 .apic_id_mask = (0xFFu<<24), 341 /* not needed, but shouldn't hurt: */
342 .init_apic_ldr = flat_init_apic_ldr,
343
344 .ioapic_phys_id_map = NULL,
345 .setup_apic_routing = NULL,
346 .multi_timer_check = NULL,
347 .apicid_to_node = NULL,
348 .cpu_to_logical_apicid = NULL,
349 .cpu_present_to_apicid = default_cpu_present_to_apicid,
350 .apicid_to_cpu_present = NULL,
351 .setup_portio_remap = NULL,
352 .check_phys_apicid_present = default_check_phys_apicid_present,
353 .enable_apic_mode = NULL,
354 .phys_pkg_id = flat_phys_pkg_id,
355 .mps_oem_check = NULL,
356
357 .get_apic_id = flat_get_apic_id,
358 .set_apic_id = set_apic_id,
359 .apic_id_mask = 0xFFu << 24,
360
361 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
362 .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
363
364 .send_IPI_mask = physflat_send_IPI_mask,
365 .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
366 .send_IPI_allbutself = physflat_send_IPI_allbutself,
367 .send_IPI_all = physflat_send_IPI_all,
368 .send_IPI_self = apic_send_IPI_self,
369
370 .wakeup_cpu = NULL,
371 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
372 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
373 .wait_for_init_deassert = NULL,
374 .smp_callin_clear_local_apic = NULL,
375 .store_NMI_vector = NULL,
376 .inquire_remote_apic = NULL,
252}; 377};
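
In flat mode each of the first eight CPUs owns one bit of the 8-bit logical destination field (APIC_ALL_CPUS == 0xFF), so flat_cpu_mask_to_apicid_and() above reduces two cpumasks to a single destination by masking and ANDing their first word. A worked standalone example (not part of the patch; flat_dest is a hypothetical helper):

#include <stdio.h>

#define APIC_ALL_CPUS	0xFFu	/* only the first 8 CPUs fit in flat mode */

static unsigned int flat_dest(unsigned long mask, unsigned long andmask)
{
	return (mask & APIC_ALL_CPUS) & (andmask & APIC_ALL_CPUS);
}

int main(void)
{
	/* CPUs 0,1,3 requested; CPUs 1,2,3 allowed: the IPI targets CPUs 1 and 3. */
	printf("dest = %#x\n", flat_dest(0x0b, 0x0e));	/* prints dest = 0xa */
	return 0;
}
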
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
index f6a2c8eb48a6..7c87156b6411 100644
--- a/arch/x86/kernel/genx2apic_cluster.c
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -22,23 +22,22 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
22 22
23/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 23/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
24 24
25static cpumask_t x2apic_target_cpus(void) 25static const struct cpumask *x2apic_target_cpus(void)
26{ 26{
27 return cpumask_of_cpu(0); 27 return cpumask_of(0);
28} 28}
29 29
30/* 30/*
31 * for now each logical cpu is in its own vector allocation domain. 31 * for now each logical cpu is in its own vector allocation domain.
32 */ 32 */
33static cpumask_t x2apic_vector_allocation_domain(int cpu) 33static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
34{ 34{
35 cpumask_t domain = CPU_MASK_NONE; 35 cpumask_clear(retmask);
36 cpu_set(cpu, domain); 36 cpumask_set_cpu(cpu, retmask);
37 return domain;
38} 37}
39 38
40static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, 39static void
41 unsigned int dest) 40 __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
42{ 41{
43 unsigned long cfg; 42 unsigned long cfg;
44 43
@@ -56,32 +55,58 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
56 * at once. We have 16 cpus in a cluster. This will minimize IPI register 55 * at once. We have 16 cpus in a cluster. This will minimize IPI register
57 * writes. 56 * writes.
58 */ 57 */
59static void x2apic_send_IPI_mask(cpumask_t mask, int vector) 58static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
60{ 59{
60 unsigned long query_cpu;
61 unsigned long flags; 61 unsigned long flags;
62
63 local_irq_save(flags);
64 for_each_cpu(query_cpu, mask) {
65 __x2apic_send_IPI_dest(
66 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
67 vector, apic->dest_logical);
68 }
69 local_irq_restore(flags);
70}
71
72static void
73 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
74{
75 unsigned long this_cpu = smp_processor_id();
62 unsigned long query_cpu; 76 unsigned long query_cpu;
77 unsigned long flags;
63 78
64 local_irq_save(flags); 79 local_irq_save(flags);
65 for_each_cpu_mask(query_cpu, mask) { 80 for_each_cpu(query_cpu, mask) {
66 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu), 81 if (query_cpu == this_cpu)
67 vector, APIC_DEST_LOGICAL); 82 continue;
83 __x2apic_send_IPI_dest(
84 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
85 vector, apic->dest_logical);
68 } 86 }
69 local_irq_restore(flags); 87 local_irq_restore(flags);
70} 88}
71 89
72static void x2apic_send_IPI_allbutself(int vector) 90static void x2apic_send_IPI_allbutself(int vector)
73{ 91{
74 cpumask_t mask = cpu_online_map; 92 unsigned long this_cpu = smp_processor_id();
75 93 unsigned long query_cpu;
76 cpu_clear(smp_processor_id(), mask); 94 unsigned long flags;
77 95
78 if (!cpus_empty(mask)) 96 local_irq_save(flags);
79 x2apic_send_IPI_mask(mask, vector); 97 for_each_online_cpu(query_cpu) {
98 if (query_cpu == this_cpu)
99 continue;
100 __x2apic_send_IPI_dest(
101 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
102 vector, apic->dest_logical);
103 }
104 local_irq_restore(flags);
80} 105}
81 106
82static void x2apic_send_IPI_all(int vector) 107static void x2apic_send_IPI_all(int vector)
83{ 108{
84 x2apic_send_IPI_mask(cpu_online_map, vector); 109 x2apic_send_IPI_mask(cpu_online_mask, vector);
85} 110}
86 111
87static int x2apic_apic_id_registered(void) 112static int x2apic_apic_id_registered(void)
@@ -89,22 +114,42 @@ static int x2apic_apic_id_registered(void)
89 return 1; 114 return 1;
90} 115}
91 116
92static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) 117static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
93{ 118{
94 int cpu;
95
96 /* 119 /*
97 * We're using fixed IRQ delivery, can only return one phys APIC ID. 120 * We're using fixed IRQ delivery, can only return one logical APIC ID.
98 * May as well be the first. 121 * May as well be the first.
99 */ 122 */
100 cpu = first_cpu(cpumask); 123 int cpu = cpumask_first(cpumask);
101 if ((unsigned)cpu < NR_CPUS) 124
125 if ((unsigned)cpu < nr_cpu_ids)
102 return per_cpu(x86_cpu_to_logical_apicid, cpu); 126 return per_cpu(x86_cpu_to_logical_apicid, cpu);
103 else 127 else
104 return BAD_APICID; 128 return BAD_APICID;
105} 129}
106 130
107static unsigned int get_apic_id(unsigned long x) 131static unsigned int
132x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
133 const struct cpumask *andmask)
134{
135 int cpu;
136
137 /*
138 * We're using fixed IRQ delivery, can only return one logical APIC ID.
139 * May as well be the first.
140 */
141 for_each_cpu_and(cpu, cpumask, andmask) {
142 if (cpumask_test_cpu(cpu, cpu_online_mask))
143 break;
144 }
145
146 if (cpu < nr_cpu_ids)
147 return per_cpu(x86_cpu_to_logical_apicid, cpu);
148
149 return BAD_APICID;
150}
151
152static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
108{ 153{
109 unsigned int id; 154 unsigned int id;
110 155
@@ -120,7 +165,7 @@ static unsigned long set_apic_id(unsigned int id)
120 return x; 165 return x;
121} 166}
122 167
123static unsigned int phys_pkg_id(int index_msb) 168static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
124{ 169{
125 return current_cpu_data.initial_apicid >> index_msb; 170 return current_cpu_data.initial_apicid >> index_msb;
126} 171}
@@ -135,25 +180,58 @@ static void init_x2apic_ldr(void)
135 int cpu = smp_processor_id(); 180 int cpu = smp_processor_id();
136 181
137 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); 182 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
138 return;
139} 183}
140 184
141struct genapic apic_x2apic_cluster = { 185struct genapic apic_x2apic_cluster = {
142 .name = "cluster x2apic", 186
143 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 187 .name = "cluster x2apic",
144 .int_delivery_mode = dest_LowestPrio, 188 .probe = NULL,
145 .int_dest_mode = (APIC_DEST_LOGICAL != 0), 189 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
146 .target_cpus = x2apic_target_cpus, 190 .apic_id_registered = x2apic_apic_id_registered,
147 .vector_allocation_domain = x2apic_vector_allocation_domain, 191
148 .apic_id_registered = x2apic_apic_id_registered, 192 .irq_delivery_mode = dest_LowestPrio,
149 .init_apic_ldr = init_x2apic_ldr, 193 .irq_dest_mode = 1, /* logical */
150 .send_IPI_all = x2apic_send_IPI_all, 194
151 .send_IPI_allbutself = x2apic_send_IPI_allbutself, 195 .target_cpus = x2apic_target_cpus,
152 .send_IPI_mask = x2apic_send_IPI_mask, 196 .disable_esr = 0,
153 .send_IPI_self = x2apic_send_IPI_self, 197 .dest_logical = APIC_DEST_LOGICAL,
154 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 198 .check_apicid_used = NULL,
155 .phys_pkg_id = phys_pkg_id, 199 .check_apicid_present = NULL,
156 .get_apic_id = get_apic_id, 200
157 .set_apic_id = set_apic_id, 201 .vector_allocation_domain = x2apic_vector_allocation_domain,
158 .apic_id_mask = (0xFFFFFFFFu), 202 .init_apic_ldr = init_x2apic_ldr,
203
204 .ioapic_phys_id_map = NULL,
205 .setup_apic_routing = NULL,
206 .multi_timer_check = NULL,
207 .apicid_to_node = NULL,
208 .cpu_to_logical_apicid = NULL,
209 .cpu_present_to_apicid = default_cpu_present_to_apicid,
210 .apicid_to_cpu_present = NULL,
211 .setup_portio_remap = NULL,
212 .check_phys_apicid_present = default_check_phys_apicid_present,
213 .enable_apic_mode = NULL,
214 .phys_pkg_id = x2apic_cluster_phys_pkg_id,
215 .mps_oem_check = NULL,
216
217 .get_apic_id = x2apic_cluster_phys_get_apic_id,
218 .set_apic_id = set_apic_id,
219 .apic_id_mask = 0xFFFFFFFFu,
220
221 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
222 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
223
224 .send_IPI_mask = x2apic_send_IPI_mask,
225 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
226 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
227 .send_IPI_all = x2apic_send_IPI_all,
228 .send_IPI_self = x2apic_send_IPI_self,
229
230 .wakeup_cpu = NULL,
231 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
232 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
233 .wait_for_init_deassert = NULL,
234 .smp_callin_clear_local_apic = NULL,
235 .store_NMI_vector = NULL,
236 .inquire_remote_apic = NULL,
159}; 237};
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
index d042211768b7..5cbae8aa0408 100644
--- a/arch/x86/kernel/genx2apic_phys.c
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -29,16 +29,15 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
29 29
30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
31 31
32static cpumask_t x2apic_target_cpus(void) 32static const struct cpumask *x2apic_target_cpus(void)
33{ 33{
34 return cpumask_of_cpu(0); 34 return cpumask_of(0);
35} 35}
36 36
37static cpumask_t x2apic_vector_allocation_domain(int cpu) 37static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
38{ 38{
39 cpumask_t domain = CPU_MASK_NONE; 39 cpumask_clear(retmask);
40 cpu_set(cpu, domain); 40 cpumask_set_cpu(cpu, retmask);
41 return domain;
42} 41}
43 42
44static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, 43static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
@@ -54,32 +53,55 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
54 x2apic_icr_write(cfg, apicid); 53 x2apic_icr_write(cfg, apicid);
55} 54}
56 55
57static void x2apic_send_IPI_mask(cpumask_t mask, int vector) 56static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
58{ 57{
59 unsigned long flags;
60 unsigned long query_cpu; 58 unsigned long query_cpu;
59 unsigned long flags;
61 60
62 local_irq_save(flags); 61 local_irq_save(flags);
63 for_each_cpu_mask(query_cpu, mask) { 62 for_each_cpu(query_cpu, mask) {
64 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), 63 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
65 vector, APIC_DEST_PHYSICAL); 64 vector, APIC_DEST_PHYSICAL);
66 } 65 }
67 local_irq_restore(flags); 66 local_irq_restore(flags);
68} 67}
69 68
70static void x2apic_send_IPI_allbutself(int vector) 69static void
70 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
71{ 71{
72 cpumask_t mask = cpu_online_map; 72 unsigned long this_cpu = smp_processor_id();
73 unsigned long query_cpu;
74 unsigned long flags;
75
76 local_irq_save(flags);
77 for_each_cpu(query_cpu, mask) {
78 if (query_cpu != this_cpu)
79 __x2apic_send_IPI_dest(
80 per_cpu(x86_cpu_to_apicid, query_cpu),
81 vector, APIC_DEST_PHYSICAL);
82 }
83 local_irq_restore(flags);
84}
73 85
74 cpu_clear(smp_processor_id(), mask); 86static void x2apic_send_IPI_allbutself(int vector)
87{
88 unsigned long this_cpu = smp_processor_id();
89 unsigned long query_cpu;
90 unsigned long flags;
75 91
76 if (!cpus_empty(mask)) 92 local_irq_save(flags);
77 x2apic_send_IPI_mask(mask, vector); 93 for_each_online_cpu(query_cpu) {
94 if (query_cpu == this_cpu)
95 continue;
96 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
97 vector, APIC_DEST_PHYSICAL);
98 }
99 local_irq_restore(flags);
78} 100}
79 101
80static void x2apic_send_IPI_all(int vector) 102static void x2apic_send_IPI_all(int vector)
81{ 103{
82 x2apic_send_IPI_mask(cpu_online_map, vector); 104 x2apic_send_IPI_mask(cpu_online_mask, vector);
83} 105}
84 106
85static int x2apic_apic_id_registered(void) 107static int x2apic_apic_id_registered(void)
@@ -87,68 +109,115 @@ static int x2apic_apic_id_registered(void)
87 return 1; 109 return 1;
88} 110}
89 111
90static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) 112static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
91{ 113{
92 int cpu;
93
94 /* 114 /*
95 * We're using fixed IRQ delivery, can only return one phys APIC ID. 115 * We're using fixed IRQ delivery, can only return one phys APIC ID.
96 * May as well be the first. 116 * May as well be the first.
97 */ 117 */
98 cpu = first_cpu(cpumask); 118 int cpu = cpumask_first(cpumask);
99 if ((unsigned)cpu < NR_CPUS) 119
120 if ((unsigned)cpu < nr_cpu_ids)
100 return per_cpu(x86_cpu_to_apicid, cpu); 121 return per_cpu(x86_cpu_to_apicid, cpu);
101 else 122 else
102 return BAD_APICID; 123 return BAD_APICID;
103} 124}
104 125
105static unsigned int get_apic_id(unsigned long x) 126static unsigned int
127x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
128 const struct cpumask *andmask)
106{ 129{
107 unsigned int id; 130 int cpu;
108 131
109 id = x; 132 /*
110 return id; 133 * We're using fixed IRQ delivery, can only return one phys APIC ID.
134 * May as well be the first.
135 */
136 for_each_cpu_and(cpu, cpumask, andmask) {
137 if (cpumask_test_cpu(cpu, cpu_online_mask))
138 break;
139 }
140
141 if (cpu < nr_cpu_ids)
142 return per_cpu(x86_cpu_to_apicid, cpu);
143
144 return BAD_APICID;
111} 145}
112 146
113static unsigned long set_apic_id(unsigned int id) 147static unsigned int x2apic_phys_get_apic_id(unsigned long x)
114{ 148{
115 unsigned long x;
116
117 x = id;
118 return x; 149 return x;
119} 150}
120 151
121static unsigned int phys_pkg_id(int index_msb) 152static unsigned long set_apic_id(unsigned int id)
153{
154 return id;
155}
156
157static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
122{ 158{
123 return current_cpu_data.initial_apicid >> index_msb; 159 return current_cpu_data.initial_apicid >> index_msb;
124} 160}
125 161
126void x2apic_send_IPI_self(int vector) 162static void x2apic_send_IPI_self(int vector)
127{ 163{
128 apic_write(APIC_SELF_IPI, vector); 164 apic_write(APIC_SELF_IPI, vector);
129} 165}
130 166
131void init_x2apic_ldr(void) 167static void init_x2apic_ldr(void)
132{ 168{
133 return;
134} 169}
135 170
136struct genapic apic_x2apic_phys = { 171struct genapic apic_x2apic_phys = {
137 .name = "physical x2apic", 172
138 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 173 .name = "physical x2apic",
139 .int_delivery_mode = dest_Fixed, 174 .probe = NULL,
140 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 175 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
141 .target_cpus = x2apic_target_cpus, 176 .apic_id_registered = x2apic_apic_id_registered,
142 .vector_allocation_domain = x2apic_vector_allocation_domain, 177
143 .apic_id_registered = x2apic_apic_id_registered, 178 .irq_delivery_mode = dest_Fixed,
144 .init_apic_ldr = init_x2apic_ldr, 179 .irq_dest_mode = 0, /* physical */
145 .send_IPI_all = x2apic_send_IPI_all, 180
146 .send_IPI_allbutself = x2apic_send_IPI_allbutself, 181 .target_cpus = x2apic_target_cpus,
147 .send_IPI_mask = x2apic_send_IPI_mask, 182 .disable_esr = 0,
148 .send_IPI_self = x2apic_send_IPI_self, 183 .dest_logical = 0,
149 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 184 .check_apicid_used = NULL,
150 .phys_pkg_id = phys_pkg_id, 185 .check_apicid_present = NULL,
151 .get_apic_id = get_apic_id, 186
152 .set_apic_id = set_apic_id, 187 .vector_allocation_domain = x2apic_vector_allocation_domain,
153 .apic_id_mask = (0xFFFFFFFFu), 188 .init_apic_ldr = init_x2apic_ldr,
189
190 .ioapic_phys_id_map = NULL,
191 .setup_apic_routing = NULL,
192 .multi_timer_check = NULL,
193 .apicid_to_node = NULL,
194 .cpu_to_logical_apicid = NULL,
195 .cpu_present_to_apicid = default_cpu_present_to_apicid,
196 .apicid_to_cpu_present = NULL,
197 .setup_portio_remap = NULL,
198 .check_phys_apicid_present = default_check_phys_apicid_present,
199 .enable_apic_mode = NULL,
200 .phys_pkg_id = x2apic_phys_pkg_id,
201 .mps_oem_check = NULL,
202
203 .get_apic_id = x2apic_phys_get_apic_id,
204 .set_apic_id = set_apic_id,
205 .apic_id_mask = 0xFFFFFFFFu,
206
207 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
208 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
209
210 .send_IPI_mask = x2apic_send_IPI_mask,
211 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
212 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
213 .send_IPI_all = x2apic_send_IPI_all,
214 .send_IPI_self = x2apic_send_IPI_self,
215
216 .wakeup_cpu = NULL,
217 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
218 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
219 .wait_for_init_deassert = NULL,
220 .smp_callin_clear_local_apic = NULL,
221 .store_NMI_vector = NULL,
222 .inquire_remote_apic = NULL,
154}; 223};
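
The new cpu_mask_to_apicid_and() callback wired up above is the subtle part of this conversion: it must honour two masks at once and still guarantee the chosen CPU is online. The following is a condensed, readable restatement of that logic, offered only as an illustrative sketch and not part of the patch; pick_apicid() is a made-up name, while x86_cpu_to_apicid, nr_cpu_ids, BAD_APICID and cpu_online_mask are the real kernel symbols it relies on.

/* Illustrative sketch only: mirrors x2apic_cpu_mask_to_apicid_and() above. */
static unsigned int pick_apicid(const struct cpumask *cpumask,
				const struct cpumask *andmask)
{
	int cpu;

	/* first CPU that is in both masks and currently online */
	for_each_cpu_and(cpu, cpumask, andmask) {
		if (cpumask_test_cpu(cpu, cpu_online_mask))
			break;
	}
	if (cpu < nr_cpu_ids)		/* loop leaves cpu >= nr_cpu_ids if none matched */
		return per_cpu(x86_cpu_to_apicid, cpu);

	return BAD_APICID;		/* empty or fully-offline intersection */
}
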
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 2c7dbdb98278..6adb5e6f4d92 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/threads.h> 12#include <linux/threads.h>
13#include <linux/cpu.h>
13#include <linux/cpumask.h> 14#include <linux/cpumask.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/ctype.h> 16#include <linux/ctype.h>
@@ -17,10 +18,14 @@
17#include <linux/sched.h> 18#include <linux/sched.h>
18#include <linux/module.h> 19#include <linux/module.h>
19#include <linux/hardirq.h> 20#include <linux/hardirq.h>
21#include <linux/timer.h>
22#include <linux/proc_fs.h>
23#include <asm/current.h>
20#include <asm/smp.h> 24#include <asm/smp.h>
21#include <asm/ipi.h> 25#include <asm/ipi.h>
22#include <asm/genapic.h> 26#include <asm/genapic.h>
23#include <asm/pgtable.h> 27#include <asm/pgtable.h>
28#include <asm/uv/uv.h>
24#include <asm/uv/uv_mmrs.h> 29#include <asm/uv/uv_mmrs.h>
25#include <asm/uv/uv_hub.h> 30#include <asm/uv/uv_hub.h>
26#include <asm/uv/bios.h> 31#include <asm/uv/bios.h>
@@ -75,16 +80,15 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second);
75 80
76/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 81/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
77 82
78static cpumask_t uv_target_cpus(void) 83static const struct cpumask *uv_target_cpus(void)
79{ 84{
80 return cpumask_of_cpu(0); 85 return cpumask_of(0);
81} 86}
82 87
83static cpumask_t uv_vector_allocation_domain(int cpu) 88static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
84{ 89{
85 cpumask_t domain = CPU_MASK_NONE; 90 cpumask_clear(retmask);
86 cpu_set(cpu, domain); 91 cpumask_set_cpu(cpu, retmask);
87 return domain;
88} 92}
89 93
90int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) 94int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
@@ -114,37 +118,49 @@ static void uv_send_IPI_one(int cpu, int vector)
114 int pnode; 118 int pnode;
115 119
116 apicid = per_cpu(x86_cpu_to_apicid, cpu); 120 apicid = per_cpu(x86_cpu_to_apicid, cpu);
117 lapicid = apicid & 0x3f; /* ZZZ macro needed */ 121 lapicid = apicid & 0x3f; /* ZZZ macro needed */
118 pnode = uv_apicid_to_pnode(apicid); 122 pnode = uv_apicid_to_pnode(apicid);
119 val = 123
120 (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid << 124 val = ( 1UL << UVH_IPI_INT_SEND_SHFT ) |
121 UVH_IPI_INT_APIC_ID_SHFT) | 125 ( lapicid << UVH_IPI_INT_APIC_ID_SHFT ) |
122 (vector << UVH_IPI_INT_VECTOR_SHFT); 126 ( vector << UVH_IPI_INT_VECTOR_SHFT );
127
123 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 128 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
124} 129}
125 130
126static void uv_send_IPI_mask(cpumask_t mask, int vector) 131static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
127{ 132{
128 unsigned int cpu; 133 unsigned int cpu;
129 134
130 for_each_possible_cpu(cpu) 135 for_each_cpu(cpu, mask)
131 if (cpu_isset(cpu, mask)) 136 uv_send_IPI_one(cpu, vector);
137}
138
139static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
140{
141 unsigned int this_cpu = smp_processor_id();
142 unsigned int cpu;
143
144 for_each_cpu(cpu, mask) {
145 if (cpu != this_cpu)
132 uv_send_IPI_one(cpu, vector); 146 uv_send_IPI_one(cpu, vector);
147 }
133} 148}
134 149
135static void uv_send_IPI_allbutself(int vector) 150static void uv_send_IPI_allbutself(int vector)
136{ 151{
137 cpumask_t mask = cpu_online_map; 152 unsigned int this_cpu = smp_processor_id();
138 153 unsigned int cpu;
139 cpu_clear(smp_processor_id(), mask);
140 154
141 if (!cpus_empty(mask)) 155 for_each_online_cpu(cpu) {
142 uv_send_IPI_mask(mask, vector); 156 if (cpu != this_cpu)
157 uv_send_IPI_one(cpu, vector);
158 }
143} 159}
144 160
145static void uv_send_IPI_all(int vector) 161static void uv_send_IPI_all(int vector)
146{ 162{
147 uv_send_IPI_mask(cpu_online_map, vector); 163 uv_send_IPI_mask(cpu_online_mask, vector);
148} 164}
149 165
150static int uv_apic_id_registered(void) 166static int uv_apic_id_registered(void)
@@ -156,22 +172,41 @@ static void uv_init_apic_ldr(void)
156{ 172{
157} 173}
158 174
159static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) 175static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
160{ 176{
161 int cpu;
162
163 /* 177 /*
164 * We're using fixed IRQ delivery, can only return one phys APIC ID. 178 * We're using fixed IRQ delivery, can only return one phys APIC ID.
165 * May as well be the first. 179 * May as well be the first.
166 */ 180 */
167 cpu = first_cpu(cpumask); 181 int cpu = cpumask_first(cpumask);
182
168 if ((unsigned)cpu < nr_cpu_ids) 183 if ((unsigned)cpu < nr_cpu_ids)
169 return per_cpu(x86_cpu_to_apicid, cpu); 184 return per_cpu(x86_cpu_to_apicid, cpu);
170 else 185 else
171 return BAD_APICID; 186 return BAD_APICID;
172} 187}
173 188
174static unsigned int get_apic_id(unsigned long x) 189static unsigned int
190uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
191 const struct cpumask *andmask)
192{
193 int cpu;
194
195 /*
196 * We're using fixed IRQ delivery, can only return one phys APIC ID.
197 * May as well be the first.
198 */
199 for_each_cpu_and(cpu, cpumask, andmask) {
200 if (cpumask_test_cpu(cpu, cpu_online_mask))
201 break;
202 }
203 if (cpu < nr_cpu_ids)
204 return per_cpu(x86_cpu_to_apicid, cpu);
205
206 return BAD_APICID;
207}
208
209static unsigned int x2apic_get_apic_id(unsigned long x)
175{ 210{
176 unsigned int id; 211 unsigned int id;
177 212
@@ -193,10 +228,10 @@ static unsigned long set_apic_id(unsigned int id)
193static unsigned int uv_read_apic_id(void) 228static unsigned int uv_read_apic_id(void)
194{ 229{
195 230
196 return get_apic_id(apic_read(APIC_ID)); 231 return x2apic_get_apic_id(apic_read(APIC_ID));
197} 232}
198 233
199static unsigned int phys_pkg_id(int index_msb) 234static int uv_phys_pkg_id(int initial_apicid, int index_msb)
200{ 235{
201 return uv_read_apic_id() >> index_msb; 236 return uv_read_apic_id() >> index_msb;
202} 237}
@@ -207,23 +242,57 @@ static void uv_send_IPI_self(int vector)
207} 242}
208 243
209struct genapic apic_x2apic_uv_x = { 244struct genapic apic_x2apic_uv_x = {
210 .name = "UV large system", 245
211 .acpi_madt_oem_check = uv_acpi_madt_oem_check, 246 .name = "UV large system",
212 .int_delivery_mode = dest_Fixed, 247 .probe = NULL,
213 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 248 .acpi_madt_oem_check = uv_acpi_madt_oem_check,
214 .target_cpus = uv_target_cpus, 249 .apic_id_registered = uv_apic_id_registered,
215 .vector_allocation_domain = uv_vector_allocation_domain, 250
216 .apic_id_registered = uv_apic_id_registered, 251 .irq_delivery_mode = dest_Fixed,
217 .init_apic_ldr = uv_init_apic_ldr, 252 .irq_dest_mode = 1, /* logical */
218 .send_IPI_all = uv_send_IPI_all, 253
219 .send_IPI_allbutself = uv_send_IPI_allbutself, 254 .target_cpus = uv_target_cpus,
220 .send_IPI_mask = uv_send_IPI_mask, 255 .disable_esr = 0,
221 .send_IPI_self = uv_send_IPI_self, 256 .dest_logical = APIC_DEST_LOGICAL,
222 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, 257 .check_apicid_used = NULL,
223 .phys_pkg_id = phys_pkg_id, 258 .check_apicid_present = NULL,
224 .get_apic_id = get_apic_id, 259
225 .set_apic_id = set_apic_id, 260 .vector_allocation_domain = uv_vector_allocation_domain,
226 .apic_id_mask = (0xFFFFFFFFu), 261 .init_apic_ldr = uv_init_apic_ldr,
262
263 .ioapic_phys_id_map = NULL,
264 .setup_apic_routing = NULL,
265 .multi_timer_check = NULL,
266 .apicid_to_node = NULL,
267 .cpu_to_logical_apicid = NULL,
268 .cpu_present_to_apicid = default_cpu_present_to_apicid,
269 .apicid_to_cpu_present = NULL,
270 .setup_portio_remap = NULL,
271 .check_phys_apicid_present = default_check_phys_apicid_present,
272 .enable_apic_mode = NULL,
273 .phys_pkg_id = uv_phys_pkg_id,
274 .mps_oem_check = NULL,
275
276 .get_apic_id = x2apic_get_apic_id,
277 .set_apic_id = set_apic_id,
278 .apic_id_mask = 0xFFFFFFFFu,
279
280 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
281 .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
282
283 .send_IPI_mask = uv_send_IPI_mask,
284 .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
285 .send_IPI_allbutself = uv_send_IPI_allbutself,
286 .send_IPI_all = uv_send_IPI_all,
287 .send_IPI_self = uv_send_IPI_self,
288
289 .wakeup_cpu = NULL,
290 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
291 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
292 .wait_for_init_deassert = NULL,
293 .smp_callin_clear_local_apic = NULL,
294 .store_NMI_vector = NULL,
295 .inquire_remote_apic = NULL,
227}; 296};
228 297
229static __cpuinit void set_x2apic_extra_bits(int pnode) 298static __cpuinit void set_x2apic_extra_bits(int pnode)
@@ -356,6 +425,103 @@ static __init void uv_rtc_init(void)
356} 425}
357 426
358/* 427/*
428 * percpu heartbeat timer
429 */
430static void uv_heartbeat(unsigned long ignored)
431{
432 struct timer_list *timer = &uv_hub_info->scir.timer;
433 unsigned char bits = uv_hub_info->scir.state;
434
435 /* flip heartbeat bit */
436 bits ^= SCIR_CPU_HEARTBEAT;
437
438 /* is this cpu idle? */
439 if (idle_cpu(raw_smp_processor_id()))
440 bits &= ~SCIR_CPU_ACTIVITY;
441 else
442 bits |= SCIR_CPU_ACTIVITY;
443
444 /* update system controller interface reg */
445 uv_set_scir_bits(bits);
446
447 /* enable next timer period */
448 mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
449}
450
451static void __cpuinit uv_heartbeat_enable(int cpu)
452{
453 if (!uv_cpu_hub_info(cpu)->scir.enabled) {
454 struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
455
456 uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
457 setup_timer(timer, uv_heartbeat, cpu);
458 timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
459 add_timer_on(timer, cpu);
460 uv_cpu_hub_info(cpu)->scir.enabled = 1;
461 }
462
463 /* check boot cpu */
464 if (!uv_cpu_hub_info(0)->scir.enabled)
465 uv_heartbeat_enable(0);
466}
467
468#ifdef CONFIG_HOTPLUG_CPU
469static void __cpuinit uv_heartbeat_disable(int cpu)
470{
471 if (uv_cpu_hub_info(cpu)->scir.enabled) {
472 uv_cpu_hub_info(cpu)->scir.enabled = 0;
473 del_timer(&uv_cpu_hub_info(cpu)->scir.timer);
474 }
475 uv_set_cpu_scir_bits(cpu, 0xff);
476}
477
478/*
479 * cpu hotplug notifier
480 */
481static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self,
482 unsigned long action, void *hcpu)
483{
484 long cpu = (long)hcpu;
485
486 switch (action) {
487 case CPU_ONLINE:
488 uv_heartbeat_enable(cpu);
489 break;
490 case CPU_DOWN_PREPARE:
491 uv_heartbeat_disable(cpu);
492 break;
493 default:
494 break;
495 }
496 return NOTIFY_OK;
497}
498
499static __init void uv_scir_register_cpu_notifier(void)
500{
501 hotcpu_notifier(uv_scir_cpu_notify, 0);
502}
503
504#else /* !CONFIG_HOTPLUG_CPU */
505
506static __init void uv_scir_register_cpu_notifier(void)
507{
508}
509
510static __init int uv_init_heartbeat(void)
511{
512 int cpu;
513
514 if (is_uv_system())
515 for_each_online_cpu(cpu)
516 uv_heartbeat_enable(cpu);
517 return 0;
518}
519
520late_initcall(uv_init_heartbeat);
521
522#endif /* !CONFIG_HOTPLUG_CPU */
523
524/*
359 * Called on each cpu to initialize the per_cpu UV data area. 525 * Called on each cpu to initialize the per_cpu UV data area.
360 * ZZZ hotplug not supported yet 526 * ZZZ hotplug not supported yet
361 */ 527 */
@@ -428,7 +594,7 @@ void __init uv_system_init(void)
428 594
429 uv_bios_init(); 595 uv_bios_init();
430 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, 596 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
431 &uv_coherency_id, &uv_region_size); 597 &sn_coherency_id, &sn_region_size);
432 uv_rtc_init(); 598 uv_rtc_init();
433 599
434 for_each_present_cpu(cpu) { 600 for_each_present_cpu(cpu) {
@@ -439,8 +605,7 @@ void __init uv_system_init(void)
439 uv_blade_info[blade].nr_possible_cpus++; 605 uv_blade_info[blade].nr_possible_cpus++;
440 606
441 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; 607 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
442 uv_cpu_hub_info(cpu)->lowmem_remap_top = 608 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
443 lowmem_redir_base + lowmem_redir_size;
444 uv_cpu_hub_info(cpu)->m_val = m_val; 609 uv_cpu_hub_info(cpu)->m_val = m_val;
445 uv_cpu_hub_info(cpu)->n_val = m_val; 610 uv_cpu_hub_info(cpu)->n_val = m_val;
446 uv_cpu_hub_info(cpu)->numa_blade_id = blade; 611 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
@@ -450,7 +615,8 @@ void __init uv_system_init(void)
450 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 615 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
451 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 616 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
452 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 617 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
453 uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; 618 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
619 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
454 uv_node_to_blade[nid] = blade; 620 uv_node_to_blade[nid] = blade;
455 uv_cpu_to_blade[cpu] = blade; 621 uv_cpu_to_blade[cpu] = blade;
456 max_pnode = max(pnode, max_pnode); 622 max_pnode = max(pnode, max_pnode);
@@ -467,4 +633,6 @@ void __init uv_system_init(void)
467 map_mmioh_high(max_pnode); 633 map_mmioh_high(max_pnode);
468 634
469 uv_cpu_init(); 635 uv_cpu_init();
636 uv_scir_register_cpu_notifier();
637 proc_mkdir("sgi_uv", NULL);
470} 638}
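
The SCIR heartbeat added above is built from stock kernel facilities: a per-cpu timer pinned with add_timer_on() that re-arms itself with mod_timer(), plus a hotplug notifier to start and stop it. The sketch below shows just the pinned, self-re-arming timer under the timer API this kernel generation uses (setup_timer() taking an unsigned long cookie); it is illustrative only, and my_timers, my_beat and my_beat_start are made-up names.

#include <linux/timer.h>
#include <linux/percpu.h>
#include <linux/jiffies.h>

static DEFINE_PER_CPU(struct timer_list, my_timers);

static void my_beat(unsigned long cpu)
{
	/* ...per-cpu work goes here... */
	mod_timer(&per_cpu(my_timers, cpu), jiffies + HZ);	/* re-arm for the next period */
}

static void my_beat_start(int cpu)
{
	struct timer_list *t = &per_cpu(my_timers, cpu);

	setup_timer(t, my_beat, cpu);
	t->expires = jiffies + HZ;
	add_timer_on(t, cpu);		/* make the timer fire on that cpu */
}
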
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 1dcb0f13897e..3e66bd364a9d 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -35,7 +35,6 @@ void __init reserve_ebda_region(void)
35 35
36 /* start of EBDA area */ 36 /* start of EBDA area */
37 ebda_addr = get_bios_ebda(); 37 ebda_addr = get_bios_ebda();
38 printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
39 38
40 /* Fixup: bios puts an EBDA in the top 64K segment */ 39 /* Fixup: bios puts an EBDA in the top 64K segment */
41 /* of conventional memory, but does not adjust lowmem. */ 40 /* of conventional memory, but does not adjust lowmem. */
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index fa1d25dd83e3..ac108d1fe182 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -12,9 +12,12 @@
12#include <asm/sections.h> 12#include <asm/sections.h>
13#include <asm/e820.h> 13#include <asm/e820.h>
14#include <asm/bios_ebda.h> 14#include <asm/bios_ebda.h>
15#include <asm/trampoline.h>
15 16
16void __init i386_start_kernel(void) 17void __init i386_start_kernel(void)
17{ 18{
19 reserve_trampoline_memory();
20
18 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 21 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
19 22
20#ifdef CONFIG_BLK_DEV_INITRD 23#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index d16084f90649..f5b272247690 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -24,27 +24,7 @@
24#include <asm/kdebug.h> 24#include <asm/kdebug.h>
25#include <asm/e820.h> 25#include <asm/e820.h>
26#include <asm/bios_ebda.h> 26#include <asm/bios_ebda.h>
27 27#include <asm/trampoline.h>
28/* boot cpu pda */
29static struct x8664_pda _boot_cpu_pda __read_mostly;
30
31#ifdef CONFIG_SMP
32/*
33 * We install an empty cpu_pda pointer table to indicate to early users
34 * (numa_set_node) that the cpu_pda pointer table for cpus other than
35 * the boot cpu is not yet setup.
36 */
37static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
38#else
39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
40#endif
41
42void __init x86_64_init_pda(void)
43{
44 _cpu_pda = __cpu_pda;
45 cpu_pda(0) = &_boot_cpu_pda;
46 pda_init(0);
47}
48 28
49static void __init zap_identity_mappings(void) 29static void __init zap_identity_mappings(void)
50{ 30{
@@ -111,8 +91,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
111 if (console_loglevel == 10) 91 if (console_loglevel == 10)
112 early_printk("Kernel alive\n"); 92 early_printk("Kernel alive\n");
113 93
114 x86_64_init_pda();
115
116 x86_64_start_reservations(real_mode_data); 94 x86_64_start_reservations(real_mode_data);
117} 95}
118 96
@@ -120,6 +98,8 @@ void __init x86_64_start_reservations(char *real_mode_data)
120{ 98{
121 copy_bootdata(__va(real_mode_data)); 99 copy_bootdata(__va(real_mode_data));
122 100
101 reserve_trampoline_memory();
102
123 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 103 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
124 104
125#ifdef CONFIG_BLK_DEV_INITRD 105#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index e835b4eea70b..2a0aad7718d5 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -19,6 +19,7 @@
19#include <asm/asm-offsets.h> 19#include <asm/asm-offsets.h>
20#include <asm/setup.h> 20#include <asm/setup.h>
21#include <asm/processor-flags.h> 21#include <asm/processor-flags.h>
22#include <asm/percpu.h>
22 23
23/* Physical address */ 24/* Physical address */
24#define pa(X) ((X) - __PAGE_OFFSET) 25#define pa(X) ((X) - __PAGE_OFFSET)
@@ -429,14 +430,34 @@ is386: movl $2,%ecx # set MP
429 ljmp $(__KERNEL_CS),$1f 430 ljmp $(__KERNEL_CS),$1f
4301: movl $(__KERNEL_DS),%eax # reload all the segment registers 4311: movl $(__KERNEL_DS),%eax # reload all the segment registers
431 movl %eax,%ss # after changing gdt. 432 movl %eax,%ss # after changing gdt.
432 movl %eax,%fs # gets reset once there's real percpu
433 433
434 movl $(__USER_DS),%eax # DS/ES contains default USER segment 434 movl $(__USER_DS),%eax # DS/ES contains default USER segment
435 movl %eax,%ds 435 movl %eax,%ds
436 movl %eax,%es 436 movl %eax,%es
437 437
438 xorl %eax,%eax # Clear GS and LDT 438 movl $(__KERNEL_PERCPU), %eax
439 movl %eax,%fs # set this cpu's percpu
440
441#ifdef CONFIG_CC_STACKPROTECTOR
442 /*
443 * The linker can't handle this by relocation. Manually set
444 * base address in stack canary segment descriptor.
445 */
446 cmpb $0,ready
447 jne 1f
448 movl $per_cpu__gdt_page,%eax
449 movl $per_cpu__stack_canary,%ecx
450 subl $20, %ecx
451 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
452 shrl $16, %ecx
453 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
454 movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
4551:
456#endif
457 movl $(__KERNEL_STACK_CANARY),%eax
439 movl %eax,%gs 458 movl %eax,%gs
459
460 xorl %eax,%eax # Clear LDT
440 lldt %ax 461 lldt %ax
441 462
442 cld # gcc2 wants the direction flag cleared at all times 463 cld # gcc2 wants the direction flag cleared at all times
@@ -446,8 +467,6 @@ is386: movl $2,%ecx # set MP
446 movb $1, ready 467 movb $1, ready
447 cmpb $0,%cl # the first CPU calls start_kernel 468 cmpb $0,%cl # the first CPU calls start_kernel
448 je 1f 469 je 1f
449 movl $(__KERNEL_PERCPU), %eax
450 movl %eax,%fs # set this cpu's percpu
451 movl (stack_start), %esp 470 movl (stack_start), %esp
4521: 4711:
453#endif /* CONFIG_SMP */ 472#endif /* CONFIG_SMP */
@@ -548,12 +567,8 @@ early_fault:
548 pushl %eax 567 pushl %eax
549 pushl %edx /* trapno */ 568 pushl %edx /* trapno */
550 pushl $fault_msg 569 pushl $fault_msg
551#ifdef CONFIG_EARLY_PRINTK
552 call early_printk
553#else
554 call printk 570 call printk
555#endif 571#endif
556#endif
557 call dump_stack 572 call dump_stack
558hlt_loop: 573hlt_loop:
559 hlt 574 hlt
@@ -580,11 +595,10 @@ ignore_int:
580 pushl 32(%esp) 595 pushl 32(%esp)
581 pushl 40(%esp) 596 pushl 40(%esp)
582 pushl $int_msg 597 pushl $int_msg
583#ifdef CONFIG_EARLY_PRINTK
584 call early_printk
585#else
586 call printk 598 call printk
587#endif 599
600 call dump_stack
601
588 addl $(5*4),%esp 602 addl $(5*4),%esp
589 popl %ds 603 popl %ds
590 popl %es 604 popl %es
@@ -660,7 +674,7 @@ early_recursion_flag:
660 .long 0 674 .long 0
661 675
662int_msg: 676int_msg:
663 .asciz "Unknown interrupt or fault at EIP %p %p %p\n" 677 .asciz "Unknown interrupt or fault at: %p %p %p\n"
664 678
665fault_msg: 679fault_msg:
666/* fault info: */ 680/* fault info: */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 26cfdc1d7c7f..2e648e3a5ea4 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,6 +19,7 @@
19#include <asm/msr.h> 19#include <asm/msr.h>
20#include <asm/cache.h> 20#include <asm/cache.h>
21#include <asm/processor-flags.h> 21#include <asm/processor-flags.h>
22#include <asm/percpu.h>
22 23
23#ifdef CONFIG_PARAVIRT 24#ifdef CONFIG_PARAVIRT
24#include <asm/asm-offsets.h> 25#include <asm/asm-offsets.h>
@@ -226,12 +227,15 @@ ENTRY(secondary_startup_64)
226 movl %eax,%fs 227 movl %eax,%fs
227 movl %eax,%gs 228 movl %eax,%gs
228 229
229 /* 230 /* Set up %gs.
230 * Setup up a dummy PDA. this is just for some early bootup code 231 *
231 * that does in_interrupt() 232 * The base of %gs always points to the bottom of the irqstack
232 */ 233 * union. If the stack protector canary is enabled, it is
234 * located at %gs:40. Note that, on SMP, the boot cpu uses
235 * init data section till per cpu areas are set up.
236 */
233 movl $MSR_GS_BASE,%ecx 237 movl $MSR_GS_BASE,%ecx
234 movq $empty_zero_page,%rax 238 movq initial_gs(%rip),%rax
235 movq %rax,%rdx 239 movq %rax,%rdx
236 shrq $32,%rdx 240 shrq $32,%rdx
237 wrmsr 241 wrmsr
@@ -257,6 +261,8 @@ ENTRY(secondary_startup_64)
257 .align 8 261 .align 8
258 ENTRY(initial_code) 262 ENTRY(initial_code)
259 .quad x86_64_start_kernel 263 .quad x86_64_start_kernel
264 ENTRY(initial_gs)
265 .quad INIT_PER_CPU_VAR(irq_stack_union)
260 __FINITDATA 266 __FINITDATA
261 267
262 ENTRY(stack_start) 268 ENTRY(stack_start)
@@ -305,7 +311,7 @@ ENTRY(early_idt_handler)
305 call dump_stack 311 call dump_stack
306#ifdef CONFIG_KALLSYMS 312#ifdef CONFIG_KALLSYMS
307 leaq early_idt_ripmsg(%rip),%rdi 313 leaq early_idt_ripmsg(%rip),%rdi
308 movq 8(%rsp),%rsi # get rip again 314 movq 0(%rsp),%rsi # get rip again
309 call __print_symbol 315 call __print_symbol
310#endif 316#endif
311#endif /* EARLY_PRINTK */ 317#endif /* EARLY_PRINTK */
@@ -401,7 +407,8 @@ NEXT_PAGE(level2_spare_pgt)
401 .globl early_gdt_descr 407 .globl early_gdt_descr
402early_gdt_descr: 408early_gdt_descr:
403 .word GDT_ENTRIES*8-1 409 .word GDT_ENTRIES*8-1
404 .quad per_cpu__gdt_page 410early_gdt_descr_base:
411 .quad INIT_PER_CPU_VAR(gdt_page)
405 412
406ENTRY(phys_base) 413ENTRY(phys_base)
407 /* This must match the first entry in level2_kernel_pgt */ 414 /* This must match the first entry in level2_kernel_pgt */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 067d8de913f6..388254f69a2a 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -33,7 +33,9 @@
33 * HPET address is set in acpi/boot.c, when an ACPI entry exists 33 * HPET address is set in acpi/boot.c, when an ACPI entry exists
34 */ 34 */
35unsigned long hpet_address; 35unsigned long hpet_address;
36unsigned long hpet_num_timers; 36#ifdef CONFIG_PCI_MSI
37static unsigned long hpet_num_timers;
38#endif
37static void __iomem *hpet_virt_address; 39static void __iomem *hpet_virt_address;
38 40
39struct hpet_dev { 41struct hpet_dev {
@@ -246,7 +248,7 @@ static void hpet_legacy_clockevent_register(void)
246 * Start hpet with the boot cpu mask and make it 248 * Start hpet with the boot cpu mask and make it
247 * global after the IO_APIC has been initialized. 249 * global after the IO_APIC has been initialized.
248 */ 250 */
249 hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); 251 hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
250 clockevents_register_device(&hpet_clockevent); 252 clockevents_register_device(&hpet_clockevent);
251 global_clock_event = &hpet_clockevent; 253 global_clock_event = &hpet_clockevent;
252 printk(KERN_DEBUG "hpet clockevent registered\n"); 254 printk(KERN_DEBUG "hpet clockevent registered\n");
@@ -301,7 +303,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
301 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); 303 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
302 hpet_setup_msi_irq(hdev->irq); 304 hpet_setup_msi_irq(hdev->irq);
303 disable_irq(hdev->irq); 305 disable_irq(hdev->irq);
304 irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu)); 306 irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
305 enable_irq(hdev->irq); 307 enable_irq(hdev->irq);
306 } 308 }
307 break; 309 break;
@@ -449,7 +451,7 @@ static int hpet_setup_irq(struct hpet_dev *dev)
449 return -1; 451 return -1;
450 452
451 disable_irq(dev->irq); 453 disable_irq(dev->irq);
452 irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu)); 454 irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
453 enable_irq(dev->irq); 455 enable_irq(dev->irq);
454 456
455 printk(KERN_DEBUG "hpet: %s irq %d for MSI\n", 457 printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
@@ -500,7 +502,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
500 /* 5 usec minimum reprogramming delta. */ 502 /* 5 usec minimum reprogramming delta. */
501 evt->min_delta_ns = 5000; 503 evt->min_delta_ns = 5000;
502 504
503 evt->cpumask = cpumask_of_cpu(hdev->cpu); 505 evt->cpumask = cpumask_of(hdev->cpu);
504 clockevents_register_device(evt); 506 clockevents_register_device(evt);
505} 507}
506 508
@@ -626,11 +628,12 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
626 628
627 switch (action & 0xf) { 629 switch (action & 0xf) {
628 case CPU_ONLINE: 630 case CPU_ONLINE:
629 INIT_DELAYED_WORK(&work.work, hpet_work); 631 INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work);
630 init_completion(&work.complete); 632 init_completion(&work.complete);
631 /* FIXME: add schedule_work_on() */ 633 /* FIXME: add schedule_work_on() */
632 schedule_delayed_work_on(cpu, &work.work, 0); 634 schedule_delayed_work_on(cpu, &work.work, 0);
633 wait_for_completion(&work.complete); 635 wait_for_completion(&work.complete);
636 destroy_timer_on_stack(&work.work.timer);
634 break; 637 break;
635 case CPU_DEAD: 638 case CPU_DEAD:
636 if (hdev) { 639 if (hdev) {
@@ -811,7 +814,7 @@ int __init hpet_enable(void)
811 814
812out_nohpet: 815out_nohpet:
813 hpet_clear_mapping(); 816 hpet_clear_mapping();
814 boot_hpet_disable = 1; 817 hpet_address = 0;
815 return 0; 818 return 0;
816} 819}
817 820
@@ -834,10 +837,11 @@ static __init int hpet_late_init(void)
834 837
835 hpet_address = force_hpet_address; 838 hpet_address = force_hpet_address;
836 hpet_enable(); 839 hpet_enable();
837 if (!hpet_virt_address)
838 return -ENODEV;
839 } 840 }
840 841
842 if (!hpet_virt_address)
843 return -ENODEV;
844
841 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 845 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
842 846
843 for_each_online_cpu(cpu) { 847 for_each_online_cpu(cpu) {
@@ -893,7 +897,7 @@ static unsigned long hpet_rtc_flags;
893static int hpet_prev_update_sec; 897static int hpet_prev_update_sec;
894static struct rtc_time hpet_alarm_time; 898static struct rtc_time hpet_alarm_time;
895static unsigned long hpet_pie_count; 899static unsigned long hpet_pie_count;
896static unsigned long hpet_t1_cmp; 900static u32 hpet_t1_cmp;
897static unsigned long hpet_default_delta; 901static unsigned long hpet_default_delta;
898static unsigned long hpet_pie_delta; 902static unsigned long hpet_pie_delta;
899static unsigned long hpet_pie_limit; 903static unsigned long hpet_pie_limit;
@@ -901,6 +905,14 @@ static unsigned long hpet_pie_limit;
901static rtc_irq_handler irq_handler; 905static rtc_irq_handler irq_handler;
902 906
903/* 907/*
908 * Check whether the hpet counter c1 is ahead of c2

909 */
910static inline int hpet_cnt_ahead(u32 c1, u32 c2)
911{
912 return (s32)(c2 - c1) < 0;
913}
914
915/*
904 * Registers an IRQ handler. 916
905 */ 917 */
906int hpet_register_irq_handler(rtc_irq_handler handler) 918int hpet_register_irq_handler(rtc_irq_handler handler)
@@ -1071,7 +1083,7 @@ static void hpet_rtc_timer_reinit(void)
1071 hpet_t1_cmp += delta; 1083 hpet_t1_cmp += delta;
1072 hpet_writel(hpet_t1_cmp, HPET_T1_CMP); 1084 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
1073 lost_ints++; 1085 lost_ints++;
1074 } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0); 1086 } while (!hpet_cnt_ahead(hpet_t1_cmp, hpet_readl(HPET_COUNTER)));
1075 1087
1076 if (lost_ints) { 1088 if (lost_ints) {
1077 if (hpet_rtc_flags & RTC_PIE) 1089 if (hpet_rtc_flags & RTC_PIE)
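
The switch to hpet_cnt_ahead() above matters because HPET_COUNTER and hpet_t1_cmp are free-running 32-bit values: a plain unsigned comparison gives the wrong answer once the counter wraps, while the signed difference of two 32-bit values stays correct as long as they are less than 2^31 ticks apart. The stand-alone illustration below uses arbitrary example values.

#include <stdint.h>
#include <stdio.h>

/* Same test as hpet_cnt_ahead(): true if c1 is (modularly) ahead of c2. */
static int cnt_ahead(uint32_t c1, uint32_t c2)
{
	return (int32_t)(c2 - c1) < 0;
}

int main(void)
{
	uint32_t cmp     = 0xfffffff0u;		/* comparator armed just before the wrap */
	uint32_t counter = 0x00000010u;		/* counter has wrapped and passed it */

	printf("unsigned compare thinks cmp is still ahead: %d\n", cmp > counter);	  /* 1, wrong */
	printf("cnt_ahead(cmp, counter):                    %d\n", cnt_ahead(cmp, counter)); /* 0, correct */
	return 0;
}
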
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 1f20608d4ca8..b0f61f0dcd0a 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -58,7 +58,7 @@ void __cpuinit mxcsr_feature_mask_init(void)
58 stts(); 58 stts();
59} 59}
60 60
61void __init init_thread_xstate(void) 61void __cpuinit init_thread_xstate(void)
62{ 62{
63 if (!HAVE_HWFP) { 63 if (!HAVE_HWFP) {
64 xstate_size = sizeof(struct i387_soft_struct); 64 xstate_size = sizeof(struct i387_soft_struct);
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
index dbd6c1d1b638..b42ca694dc68 100644
--- a/arch/x86/kernel/i8237.c
+++ b/arch/x86/kernel/i8237.c
@@ -28,10 +28,10 @@ static int i8237A_resume(struct sys_device *dev)
28 28
29 flags = claim_dma_lock(); 29 flags = claim_dma_lock();
30 30
31 dma_outb(DMA1_RESET_REG, 0); 31 dma_outb(0, DMA1_RESET_REG);
32 dma_outb(DMA2_RESET_REG, 0); 32 dma_outb(0, DMA2_RESET_REG);
33 33
34 for (i = 0;i < 8;i++) { 34 for (i = 0; i < 8; i++) {
35 set_dma_addr(i, 0x000000); 35 set_dma_addr(i, 0x000000);
36 /* DMA count is a bit weird so this is not 0 */ 36 /* DMA count is a bit weird so this is not 0 */
37 set_dma_count(i, 1); 37 set_dma_count(i, 1);
@@ -51,14 +51,14 @@ static int i8237A_suspend(struct sys_device *dev, pm_message_t state)
51} 51}
52 52
53static struct sysdev_class i8237_sysdev_class = { 53static struct sysdev_class i8237_sysdev_class = {
54 .name = "i8237", 54 .name = "i8237",
55 .suspend = i8237A_suspend, 55 .suspend = i8237A_suspend,
56 .resume = i8237A_resume, 56 .resume = i8237A_resume,
57}; 57};
58 58
59static struct sys_device device_i8237A = { 59static struct sys_device device_i8237A = {
60 .id = 0, 60 .id = 0,
61 .cls = &i8237_sysdev_class, 61 .cls = &i8237_sysdev_class,
62}; 62};
63 63
64static int __init i8237A_init_sysfs(void) 64static int __init i8237A_init_sysfs(void)
@@ -68,5 +68,4 @@ static int __init i8237A_init_sysfs(void)
68 error = sysdev_register(&device_i8237A); 68 error = sysdev_register(&device_i8237A);
69 return error; 69 return error;
70} 70}
71
72device_initcall(i8237A_init_sysfs); 71device_initcall(i8237A_init_sysfs);
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index c1b5e3ece1f2..10f92fb532f3 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -114,7 +114,7 @@ void __init setup_pit_timer(void)
114 * Start pit with the boot cpu mask and make it global after the 114 * Start pit with the boot cpu mask and make it global after the
115 * IO_APIC has been initialized. 115 * IO_APIC has been initialized.
116 */ 116 */
117 pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); 117 pit_clockevent.cpumask = cpumask_of(smp_processor_id());
118 pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 118 pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC,
119 pit_clockevent.shift); 119 pit_clockevent.shift);
120 pit_clockevent.max_delta_ns = 120 pit_clockevent.max_delta_ns =
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 4b8a53d841f7..11d5093eb281 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -11,15 +11,15 @@
11#include <linux/kernel_stat.h> 11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h> 12#include <linux/sysdev.h>
13#include <linux/bitops.h> 13#include <linux/bitops.h>
14#include <linux/acpi.h>
15#include <linux/io.h>
16#include <linux/delay.h>
14 17
15#include <asm/acpi.h>
16#include <asm/atomic.h> 18#include <asm/atomic.h>
17#include <asm/system.h> 19#include <asm/system.h>
18#include <asm/io.h>
19#include <asm/timer.h> 20#include <asm/timer.h>
20#include <asm/hw_irq.h> 21#include <asm/hw_irq.h>
21#include <asm/pgtable.h> 22#include <asm/pgtable.h>
22#include <asm/delay.h>
23#include <asm/desc.h> 23#include <asm/desc.h>
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/arch_hooks.h> 25#include <asm/arch_hooks.h>
@@ -323,7 +323,7 @@ void init_8259A(int auto_eoi)
323 outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ 323 outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
324 324
325 /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64, 325 /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64,
326 to 0x20-0x27 on i386 */ 326 to 0x20-0x27 on i386 */
327 outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); 327 outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
328 328
329 /* 8259A-1 (the master) has a slave on IR2 */ 329 /* 8259A-1 (the master) has a slave on IR2 */
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index a4f93b4120c1..df3bf269beab 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -10,11 +10,9 @@
10#include <asm/pgtable.h> 10#include <asm/pgtable.h>
11#include <asm/desc.h> 11#include <asm/desc.h>
12 12
13static struct fs_struct init_fs = INIT_FS;
14static struct signal_struct init_signals = INIT_SIGNALS(init_signals); 13static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
15static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); 14static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
16struct mm_struct init_mm = INIT_MM(init_mm); 15struct mm_struct init_mm = INIT_MM(init_mm);
17EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
18 16
19/* 17/*
20 * Initial thread structure. 18 * Initial thread structure.
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 7a3f2028e2eb..7248ca11bdcd 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Intel IO-APIC support for multi-Pentium hosts. 2 * Intel IO-APIC support for multi-Pentium hosts.
3 * 3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo 4 * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
5 * 5 *
6 * Many thanks to Stig Venaas for trying out countless experimental 6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently! 7 * patches and reporting/debugging problems patiently!
@@ -46,6 +46,7 @@
46#include <asm/idle.h> 46#include <asm/idle.h>
47#include <asm/io.h> 47#include <asm/io.h>
48#include <asm/smp.h> 48#include <asm/smp.h>
49#include <asm/cpu.h>
49#include <asm/desc.h> 50#include <asm/desc.h>
50#include <asm/proto.h> 51#include <asm/proto.h>
51#include <asm/acpi.h> 52#include <asm/acpi.h>
@@ -61,9 +62,7 @@
61#include <asm/uv/uv_hub.h> 62#include <asm/uv/uv_hub.h>
62#include <asm/uv/uv_irq.h> 63#include <asm/uv/uv_irq.h>
63 64
64#include <mach_ipi.h> 65#include <asm/genapic.h>
65#include <mach_apic.h>
66#include <mach_apicdef.h>
67 66
68#define __apicdebuginit(type) static type __init 67#define __apicdebuginit(type) static type __init
69 68
@@ -82,11 +81,11 @@ static DEFINE_SPINLOCK(vector_lock);
82int nr_ioapic_registers[MAX_IO_APICS]; 81int nr_ioapic_registers[MAX_IO_APICS];
83 82
84/* I/O APIC entries */ 83/* I/O APIC entries */
85struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; 84struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
86int nr_ioapics; 85int nr_ioapics;
87 86
88/* MP IRQ source entries */ 87/* MP IRQ source entries */
89struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; 88struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
90 89
91/* # of MP IRQ source entries */ 90/* # of MP IRQ source entries */
92int mp_irq_entries; 91int mp_irq_entries;
@@ -99,103 +98,293 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
99 98
100int skip_ioapic_setup; 99int skip_ioapic_setup;
101 100
101void arch_disable_smp_support(void)
102{
103#ifdef CONFIG_PCI
104 noioapicquirk = 1;
105 noioapicreroute = -1;
106#endif
107 skip_ioapic_setup = 1;
108}
109
102static int __init parse_noapic(char *str) 110static int __init parse_noapic(char *str)
103{ 111{
104 /* disable IO-APIC */ 112 /* disable IO-APIC */
105 disable_ioapic_setup(); 113 arch_disable_smp_support();
106 return 0; 114 return 0;
107} 115}
108early_param("noapic", parse_noapic); 116early_param("noapic", parse_noapic);
109 117
110struct irq_pin_list; 118struct irq_pin_list;
119
120/*
121 * This is performance-critical, we want to do it O(1)
122 *
123 * the indexing order of this array favors 1:1 mappings
124 * between pins and IRQs.
125 */
126
127struct irq_pin_list {
128 int apic, pin;
129 struct irq_pin_list *next;
130};
131
132static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
133{
134 struct irq_pin_list *pin;
135 int node;
136
137 node = cpu_to_node(cpu);
138
139 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
140
141 return pin;
142}
143
111struct irq_cfg { 144struct irq_cfg {
112 unsigned int irq;
113 struct irq_pin_list *irq_2_pin; 145 struct irq_pin_list *irq_2_pin;
114 cpumask_t domain; 146 cpumask_var_t domain;
115 cpumask_t old_domain; 147 cpumask_var_t old_domain;
116 unsigned move_cleanup_count; 148 unsigned move_cleanup_count;
117 u8 vector; 149 u8 vector;
118 u8 move_in_progress : 1; 150 u8 move_in_progress : 1;
151#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
152 u8 move_desc_pending : 1;
153#endif
119}; 154};
120 155
121/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 156/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
157#ifdef CONFIG_SPARSE_IRQ
158static struct irq_cfg irq_cfgx[] = {
159#else
122static struct irq_cfg irq_cfgx[NR_IRQS] = { 160static struct irq_cfg irq_cfgx[NR_IRQS] = {
123 [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, 161#endif
124 [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, 162 [0] = { .vector = IRQ0_VECTOR, },
125 [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, 163 [1] = { .vector = IRQ1_VECTOR, },
126 [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, 164 [2] = { .vector = IRQ2_VECTOR, },
127 [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, 165 [3] = { .vector = IRQ3_VECTOR, },
128 [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, 166 [4] = { .vector = IRQ4_VECTOR, },
129 [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, 167 [5] = { .vector = IRQ5_VECTOR, },
130 [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, 168 [6] = { .vector = IRQ6_VECTOR, },
131 [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, 169 [7] = { .vector = IRQ7_VECTOR, },
132 [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, 170 [8] = { .vector = IRQ8_VECTOR, },
133 [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, 171 [9] = { .vector = IRQ9_VECTOR, },
134 [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, 172 [10] = { .vector = IRQ10_VECTOR, },
135 [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, 173 [11] = { .vector = IRQ11_VECTOR, },
136 [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, 174 [12] = { .vector = IRQ12_VECTOR, },
137 [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, 175 [13] = { .vector = IRQ13_VECTOR, },
138 [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, 176 [14] = { .vector = IRQ14_VECTOR, },
177 [15] = { .vector = IRQ15_VECTOR, },
139}; 178};
140 179
141#define for_each_irq_cfg(irq, cfg) \ 180int __init arch_early_irq_init(void)
142 for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) 181{
182 struct irq_cfg *cfg;
183 struct irq_desc *desc;
184 int count;
185 int i;
186
187 cfg = irq_cfgx;
188 count = ARRAY_SIZE(irq_cfgx);
143 189
190 for (i = 0; i < count; i++) {
191 desc = irq_to_desc(i);
192 desc->chip_data = &cfg[i];
193 alloc_bootmem_cpumask_var(&cfg[i].domain);
194 alloc_bootmem_cpumask_var(&cfg[i].old_domain);
195 if (i < NR_IRQS_LEGACY)
196 cpumask_setall(cfg[i].domain);
197 }
198
199 return 0;
200}
201
202#ifdef CONFIG_SPARSE_IRQ
144static struct irq_cfg *irq_cfg(unsigned int irq) 203static struct irq_cfg *irq_cfg(unsigned int irq)
145{ 204{
146 return irq < nr_irqs ? irq_cfgx + irq : NULL; 205 struct irq_cfg *cfg = NULL;
206 struct irq_desc *desc;
207
208 desc = irq_to_desc(irq);
209 if (desc)
210 cfg = desc->chip_data;
211
212 return cfg;
147} 213}
148 214
149static struct irq_cfg *irq_cfg_alloc(unsigned int irq) 215static struct irq_cfg *get_one_free_irq_cfg(int cpu)
150{ 216{
151 return irq_cfg(irq); 217 struct irq_cfg *cfg;
218 int node;
219
220 node = cpu_to_node(cpu);
221
222 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
223 if (cfg) {
224 if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
225 kfree(cfg);
226 cfg = NULL;
227 } else if (!alloc_cpumask_var_node(&cfg->old_domain,
228 GFP_ATOMIC, node)) {
229 free_cpumask_var(cfg->domain);
230 kfree(cfg);
231 cfg = NULL;
232 } else {
233 cpumask_clear(cfg->domain);
234 cpumask_clear(cfg->old_domain);
235 }
236 }
237
238 return cfg;
152} 239}
153 240
154/* 241int arch_init_chip_data(struct irq_desc *desc, int cpu)
155 * Rough estimation of how many shared IRQs there are, can be changed 242{
156 * anytime. 243 struct irq_cfg *cfg;
157 */
158#define MAX_PLUS_SHARED_IRQS NR_IRQS
159#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
160 244
161/* 245 cfg = desc->chip_data;
162 * This is performance-critical, we want to do it O(1) 246 if (!cfg) {
163 * 247 desc->chip_data = get_one_free_irq_cfg(cpu);
164 * the indexing order of this array favors 1:1 mappings 248 if (!desc->chip_data) {
165 * between pins and IRQs. 249 printk(KERN_ERR "can not alloc irq_cfg\n");
166 */ 250 BUG_ON(1);
251 }
252 }
167 253
168struct irq_pin_list { 254 return 0;
169 int apic, pin; 255}
170 struct irq_pin_list *next;
171};
172 256
173static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; 257#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
174static struct irq_pin_list *irq_2_pin_ptr;
175 258
176static void __init irq_2_pin_init(void) 259static void
260init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
177{ 261{
178 struct irq_pin_list *pin = irq_2_pin_head; 262 struct irq_pin_list *old_entry, *head, *tail, *entry;
179 int i;
180 263
181 for (i = 1; i < PIN_MAP_SIZE; i++) 264 cfg->irq_2_pin = NULL;
182 pin[i-1].next = &pin[i]; 265 old_entry = old_cfg->irq_2_pin;
266 if (!old_entry)
267 return;
183 268
184 irq_2_pin_ptr = &pin[0]; 269 entry = get_one_free_irq_2_pin(cpu);
270 if (!entry)
271 return;
272
273 entry->apic = old_entry->apic;
274 entry->pin = old_entry->pin;
275 head = entry;
276 tail = entry;
277 old_entry = old_entry->next;
278 while (old_entry) {
279 entry = get_one_free_irq_2_pin(cpu);
280 if (!entry) {
281 entry = head;
282 while (entry) {
283 head = entry->next;
284 kfree(entry);
285 entry = head;
286 }
287 /* still use the old one */
288 return;
289 }
290 entry->apic = old_entry->apic;
291 entry->pin = old_entry->pin;
292 tail->next = entry;
293 tail = entry;
294 old_entry = old_entry->next;
295 }
296
297 tail->next = NULL;
298 cfg->irq_2_pin = head;
185} 299}
186 300
187static struct irq_pin_list *get_one_free_irq_2_pin(void) 301static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
188{ 302{
189 struct irq_pin_list *pin = irq_2_pin_ptr; 303 struct irq_pin_list *entry, *next;
190 304
191 if (!pin) 305 if (old_cfg->irq_2_pin == cfg->irq_2_pin)
192 panic("can not get more irq_2_pin\n"); 306 return;
193 307
194 irq_2_pin_ptr = pin->next; 308 entry = old_cfg->irq_2_pin;
195 pin->next = NULL; 309
196 return pin; 310 while (entry) {
311 next = entry->next;
312 kfree(entry);
313 entry = next;
314 }
315 old_cfg->irq_2_pin = NULL;
316}
317
318void arch_init_copy_chip_data(struct irq_desc *old_desc,
319 struct irq_desc *desc, int cpu)
320{
321 struct irq_cfg *cfg;
322 struct irq_cfg *old_cfg;
323
324 cfg = get_one_free_irq_cfg(cpu);
325
326 if (!cfg)
327 return;
328
329 desc->chip_data = cfg;
330
331 old_cfg = old_desc->chip_data;
332
333 memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
334
335 init_copy_irq_2_pin(old_cfg, cfg, cpu);
336}
337
338static void free_irq_cfg(struct irq_cfg *old_cfg)
339{
340 kfree(old_cfg);
341}
342
343void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
344{
345 struct irq_cfg *old_cfg, *cfg;
346
347 old_cfg = old_desc->chip_data;
348 cfg = desc->chip_data;
349
350 if (old_cfg == cfg)
351 return;
352
353 if (old_cfg) {
354 free_irq_2_pin(old_cfg, cfg);
355 free_irq_cfg(old_cfg);
356 old_desc->chip_data = NULL;
357 }
197} 358}
198 359
360static void
361set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
362{
363 struct irq_cfg *cfg = desc->chip_data;
364
365 if (!cfg->move_in_progress) {
366 /* it means that domain is not changed */
367 if (!cpumask_intersects(desc->affinity, mask))
368 cfg->move_desc_pending = 1;
369 }
370}
371#endif
372
373#else
374static struct irq_cfg *irq_cfg(unsigned int irq)
375{
376 return irq < nr_irqs ? irq_cfgx + irq : NULL;
377}
378
379#endif
380
381#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
382static inline void
383set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
384{
385}
386#endif
387
199struct io_apic { 388struct io_apic {
200 unsigned int index; 389 unsigned int index;
201 unsigned int unused[3]; 390 unsigned int unused[3];
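
struct irq_cfg now embeds cpumask_var_t fields, so every allocation has to pair kzalloc_node() with alloc_cpumask_var_node() and unwind cleanly when either step fails, as get_one_free_irq_cfg() does above. Here is a minimal sketch of that pattern, not taken from the patch; my_obj and its helpers are made-up names, and with CONFIG_CPUMASK_OFFSTACK unset the cpumask calls compile away and always succeed.

#include <linux/cpumask.h>
#include <linux/slab.h>

struct my_obj {
	cpumask_var_t domain;
};

static struct my_obj *my_obj_alloc(int node)
{
	struct my_obj *obj = kzalloc_node(sizeof(*obj), GFP_ATOMIC, node);

	if (!obj)
		return NULL;
	if (!alloc_cpumask_var_node(&obj->domain, GFP_ATOMIC, node)) {
		kfree(obj);			/* undo the partial allocation */
		return NULL;
	}
	cpumask_clear(obj->domain);
	return obj;
}

static void my_obj_free(struct my_obj *obj)
{
	free_cpumask_var(obj->domain);		/* no-op unless offstack masks are enabled */
	kfree(obj);
}
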
@@ -205,7 +394,7 @@ struct io_apic {
205static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 394static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
206{ 395{
207 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) 396 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
208 + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); 397 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
209} 398}
210 399
211static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) 400static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -237,11 +426,10 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned
237 writel(value, &io_apic->data); 426 writel(value, &io_apic->data);
238} 427}
239 428
240static bool io_apic_level_ack_pending(unsigned int irq) 429static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
241{ 430{
242 struct irq_pin_list *entry; 431 struct irq_pin_list *entry;
243 unsigned long flags; 432 unsigned long flags;
244 struct irq_cfg *cfg = irq_cfg(irq);
245 433
246 spin_lock_irqsave(&ioapic_lock, flags); 434 spin_lock_irqsave(&ioapic_lock, flags);
247 entry = cfg->irq_2_pin; 435 entry = cfg->irq_2_pin;
@@ -298,7 +486,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
298 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 486 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
299} 487}
300 488
301static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 489void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
302{ 490{
303 unsigned long flags; 491 unsigned long flags;
304 spin_lock_irqsave(&ioapic_lock, flags); 492 spin_lock_irqsave(&ioapic_lock, flags);
@@ -323,13 +511,32 @@ static void ioapic_mask_entry(int apic, int pin)
323} 511}
324 512
325#ifdef CONFIG_SMP 513#ifdef CONFIG_SMP
326static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) 514static void send_cleanup_vector(struct irq_cfg *cfg)
515{
516 cpumask_var_t cleanup_mask;
517
518 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
519 unsigned int i;
520 cfg->move_cleanup_count = 0;
521 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
522 cfg->move_cleanup_count++;
523 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
524 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
525 } else {
526 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
527 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
528 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
529 free_cpumask_var(cleanup_mask);
530 }
531 cfg->move_in_progress = 0;
532}
533
534static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
327{ 535{
328 int apic, pin; 536 int apic, pin;
329 struct irq_cfg *cfg;
330 struct irq_pin_list *entry; 537 struct irq_pin_list *entry;
538 u8 vector = cfg->vector;
331 539
332 cfg = irq_cfg(irq);
333 entry = cfg->irq_2_pin; 540 entry = cfg->irq_2_pin;
334 for (;;) { 541 for (;;) {
335 unsigned int reg; 542 unsigned int reg;
@@ -359,36 +566,63 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
359 } 566 }
360} 567}
361 568
362static int assign_irq_vector(int irq, cpumask_t mask); 569static int
570assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
571
572/*
573 * Either sets desc->affinity to a valid value, and returns
574 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
575 * leaves desc->affinity untouched.
576 */
577static unsigned int
578set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
579{
580 struct irq_cfg *cfg;
581 unsigned int irq;
363 582
364static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) 583 if (!cpumask_intersects(mask, cpu_online_mask))
584 return BAD_APICID;
585
586 irq = desc->irq;
587 cfg = desc->chip_data;
588 if (assign_irq_vector(irq, cfg, mask))
589 return BAD_APICID;
590
591 cpumask_and(desc->affinity, cfg->domain, mask);
592 set_extra_move_desc(desc, mask);
593
594 return apic->cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
595}
596
597static void
598set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
365{ 599{
366 struct irq_cfg *cfg; 600 struct irq_cfg *cfg;
367 unsigned long flags; 601 unsigned long flags;
368 unsigned int dest; 602 unsigned int dest;
369 cpumask_t tmp; 603 unsigned int irq;
370 struct irq_desc *desc;
371 604
372 cpus_and(tmp, mask, cpu_online_map); 605 irq = desc->irq;
373 if (cpus_empty(tmp)) 606 cfg = desc->chip_data;
374 return;
375 607
376 cfg = irq_cfg(irq); 608 spin_lock_irqsave(&ioapic_lock, flags);
377 if (assign_irq_vector(irq, mask)) 609 dest = set_desc_affinity(desc, mask);
378 return; 610 if (dest != BAD_APICID) {
611 /* Only the high 8 bits are valid. */
612 dest = SET_APIC_LOGICAL_ID(dest);
613 __target_IO_APIC_irq(irq, dest, cfg);
614 }
615 spin_unlock_irqrestore(&ioapic_lock, flags);
616}
379 617
380 cpus_and(tmp, cfg->domain, mask); 618static void
381 dest = cpu_mask_to_apicid(tmp); 619set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
382 /* 620{
383 * Only the high 8 bits are valid. 621 struct irq_desc *desc;
384 */
385 dest = SET_APIC_LOGICAL_ID(dest);
386 622
387 desc = irq_to_desc(irq); 623 desc = irq_to_desc(irq);
388 spin_lock_irqsave(&ioapic_lock, flags); 624
389 __target_IO_APIC_irq(irq, dest, cfg->vector); 625 set_ioapic_affinity_irq_desc(desc, mask);
390 desc->affinity = mask;
391 spin_unlock_irqrestore(&ioapic_lock, flags);
392} 626}
393#endif /* CONFIG_SMP */ 627#endif /* CONFIG_SMP */
394 628
@@ -397,16 +631,18 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
397 * shared ISA-space IRQs, so we have to support them. We are super 631 * shared ISA-space IRQs, so we have to support them. We are super
398 * fast in the common case, and fast for shared ISA-space IRQs. 632 * fast in the common case, and fast for shared ISA-space IRQs.
399 */ 633 */
400static void add_pin_to_irq(unsigned int irq, int apic, int pin) 634static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
401{ 635{
402 struct irq_cfg *cfg;
403 struct irq_pin_list *entry; 636 struct irq_pin_list *entry;
404 637
405 /* first time to refer irq_cfg, so with new */
406 cfg = irq_cfg_alloc(irq);
407 entry = cfg->irq_2_pin; 638 entry = cfg->irq_2_pin;
408 if (!entry) { 639 if (!entry) {
409 entry = get_one_free_irq_2_pin(); 640 entry = get_one_free_irq_2_pin(cpu);
641 if (!entry) {
642 printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
643 apic, pin);
644 return;
645 }
410 cfg->irq_2_pin = entry; 646 cfg->irq_2_pin = entry;
411 entry->apic = apic; 647 entry->apic = apic;
412 entry->pin = pin; 648 entry->pin = pin;
@@ -421,7 +657,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
421 entry = entry->next; 657 entry = entry->next;
422 } 658 }
423 659
424 entry->next = get_one_free_irq_2_pin(); 660 entry->next = get_one_free_irq_2_pin(cpu);
425 entry = entry->next; 661 entry = entry->next;
426 entry->apic = apic; 662 entry->apic = apic;
427 entry->pin = pin; 663 entry->pin = pin;
@@ -430,11 +666,10 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
430/* 666/*
431 * Reroute an IRQ to a different pin. 667 * Reroute an IRQ to a different pin.
432 */ 668 */
433static void __init replace_pin_at_irq(unsigned int irq, 669static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
434 int oldapic, int oldpin, 670 int oldapic, int oldpin,
435 int newapic, int newpin) 671 int newapic, int newpin)
436{ 672{
437 struct irq_cfg *cfg = irq_cfg(irq);
438 struct irq_pin_list *entry = cfg->irq_2_pin; 673 struct irq_pin_list *entry = cfg->irq_2_pin;
439 int replaced = 0; 674 int replaced = 0;
440 675
@@ -451,18 +686,16 @@ static void __init replace_pin_at_irq(unsigned int irq,
451 686
452 /* why? call replace before add? */ 687 /* why? call replace before add? */
453 if (!replaced) 688 if (!replaced)
454 add_pin_to_irq(irq, newapic, newpin); 689 add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
455} 690}
456 691
457static inline void io_apic_modify_irq(unsigned int irq, 692static inline void io_apic_modify_irq(struct irq_cfg *cfg,
458 int mask_and, int mask_or, 693 int mask_and, int mask_or,
459 void (*final)(struct irq_pin_list *entry)) 694 void (*final)(struct irq_pin_list *entry))
460{ 695{
461 int pin; 696 int pin;
462 struct irq_cfg *cfg;
463 struct irq_pin_list *entry; 697 struct irq_pin_list *entry;
464 698
465 cfg = irq_cfg(irq);
466 for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { 699 for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
467 unsigned int reg; 700 unsigned int reg;
468 pin = entry->pin; 701 pin = entry->pin;
@@ -475,13 +708,13 @@ static inline void io_apic_modify_irq(unsigned int irq,
475 } 708 }
476} 709}
477 710
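/*
 * Editor's sketch (illustrative only): io_apic_modify_irq() above walks the
 * irq_2_pin list and rewrites each redirection entry as
 * reg = (reg & mask_and) | mask_or, optionally running a "final" hook per
 * entry (the 64-bit mask path passes a read-back sync).  A userspace model
 * of that clear-bits/set-bits helper, with invented bit values:
 */
#include <stdio.h>

#define REDIR_MASKED		(1u << 16)
#define REDIR_LEVEL_TRIGGER	(1u << 15)

struct fake_rte { unsigned int reg; struct fake_rte *next; };

static void modify_all(struct fake_rte *head, unsigned int mask_and,
		       unsigned int mask_or, void (*final)(struct fake_rte *))
{
	for (struct fake_rte *e = head; e; e = e->next) {
		e->reg = (e->reg & mask_and) | mask_or;	/* clear, then set */
		if (final)
			final(e);			/* e.g. a read-back */
	}
}

static void sync_entry(struct fake_rte *e)
{
	(void)e->reg;	/* stand-in for the flushing register read */
}

int main(void)
{
	struct fake_rte b = { REDIR_LEVEL_TRIGGER, NULL };
	struct fake_rte a = { 0, &b };

	/* "mask": keep everything, set the masked bit, with a sync hook */
	modify_all(&a, ~0u, REDIR_MASKED, sync_entry);
	/* "unmask": clear only the masked bit, no hook needed */
	modify_all(&a, ~REDIR_MASKED, 0, NULL);

	printf("a=%#x b=%#x\n", a.reg, b.reg);
	return 0;
}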
478static void __unmask_IO_APIC_irq(unsigned int irq) 711static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
479{ 712{
480 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); 713 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
481} 714}
482 715
483#ifdef CONFIG_X86_64 716#ifdef CONFIG_X86_64
484void io_apic_sync(struct irq_pin_list *entry) 717static void io_apic_sync(struct irq_pin_list *entry)
485{ 718{
486 /* 719 /*
487 * Synchronize the IO-APIC and the CPU by doing 720 * Synchronize the IO-APIC and the CPU by doing
@@ -492,47 +725,64 @@ void io_apic_sync(struct irq_pin_list *entry)
492 readl(&io_apic->data); 725 readl(&io_apic->data);
493} 726}
494 727
495static void __mask_IO_APIC_irq(unsigned int irq) 728static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
496{ 729{
497 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 730 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
498} 731}
499#else /* CONFIG_X86_32 */ 732#else /* CONFIG_X86_32 */
500static void __mask_IO_APIC_irq(unsigned int irq) 733static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
501{ 734{
502 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); 735 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
503} 736}
504 737
505static void __mask_and_edge_IO_APIC_irq(unsigned int irq) 738static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
506{ 739{
507 io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, 740 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
508 IO_APIC_REDIR_MASKED, NULL); 741 IO_APIC_REDIR_MASKED, NULL);
509} 742}
510 743
511static void __unmask_and_level_IO_APIC_irq(unsigned int irq) 744static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
512{ 745{
513 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 746 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
514 IO_APIC_REDIR_LEVEL_TRIGGER, NULL); 747 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
515} 748}
516#endif /* CONFIG_X86_32 */ 749#endif /* CONFIG_X86_32 */
517 750
518static void mask_IO_APIC_irq (unsigned int irq) 751static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
519{ 752{
753 struct irq_cfg *cfg = desc->chip_data;
520 unsigned long flags; 754 unsigned long flags;
521 755
756 BUG_ON(!cfg);
757
522 spin_lock_irqsave(&ioapic_lock, flags); 758 spin_lock_irqsave(&ioapic_lock, flags);
523 __mask_IO_APIC_irq(irq); 759 __mask_IO_APIC_irq(cfg);
524 spin_unlock_irqrestore(&ioapic_lock, flags); 760 spin_unlock_irqrestore(&ioapic_lock, flags);
525} 761}
526 762
527static void unmask_IO_APIC_irq (unsigned int irq) 763static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
528{ 764{
765 struct irq_cfg *cfg = desc->chip_data;
529 unsigned long flags; 766 unsigned long flags;
530 767
531 spin_lock_irqsave(&ioapic_lock, flags); 768 spin_lock_irqsave(&ioapic_lock, flags);
532 __unmask_IO_APIC_irq(irq); 769 __unmask_IO_APIC_irq(cfg);
533 spin_unlock_irqrestore(&ioapic_lock, flags); 770 spin_unlock_irqrestore(&ioapic_lock, flags);
534} 771}
535 772
773static void mask_IO_APIC_irq(unsigned int irq)
774{
775 struct irq_desc *desc = irq_to_desc(irq);
776
777 mask_IO_APIC_irq_desc(desc);
778}
779static void unmask_IO_APIC_irq(unsigned int irq)
780{
781 struct irq_desc *desc = irq_to_desc(irq);
782
783 unmask_IO_APIC_irq_desc(desc);
784}
785
536static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) 786static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
537{ 787{
538 struct IO_APIC_route_entry entry; 788 struct IO_APIC_route_entry entry;
@@ -556,23 +806,6 @@ static void clear_IO_APIC (void)
556 clear_IO_APIC_pin(apic, pin); 806 clear_IO_APIC_pin(apic, pin);
557} 807}
558 808
559#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32)
560void send_IPI_self(int vector)
561{
562 unsigned int cfg;
563
564 /*
565 * Wait for idle.
566 */
567 apic_wait_icr_idle();
568 cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
569 /*
570 * Send the IPI. The write to APIC_ICR fires this off.
571 */
572 apic_write(APIC_ICR, cfg);
573}
574#endif /* !CONFIG_SMP && CONFIG_X86_32*/
575
576#ifdef CONFIG_X86_32 809#ifdef CONFIG_X86_32
577/* 810/*
578 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to 811 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
@@ -704,10 +937,10 @@ static int find_irq_entry(int apic, int pin, int type)
704 int i; 937 int i;
705 938
706 for (i = 0; i < mp_irq_entries; i++) 939 for (i = 0; i < mp_irq_entries; i++)
707 if (mp_irqs[i].mp_irqtype == type && 940 if (mp_irqs[i].irqtype == type &&
708 (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || 941 (mp_irqs[i].dstapic == mp_ioapics[apic].apicid ||
709 mp_irqs[i].mp_dstapic == MP_APIC_ALL) && 942 mp_irqs[i].dstapic == MP_APIC_ALL) &&
710 mp_irqs[i].mp_dstirq == pin) 943 mp_irqs[i].dstirq == pin)
711 return i; 944 return i;
712 945
713 return -1; 946 return -1;
@@ -721,13 +954,13 @@ static int __init find_isa_irq_pin(int irq, int type)
721 int i; 954 int i;
722 955
723 for (i = 0; i < mp_irq_entries; i++) { 956 for (i = 0; i < mp_irq_entries; i++) {
724 int lbus = mp_irqs[i].mp_srcbus; 957 int lbus = mp_irqs[i].srcbus;
725 958
726 if (test_bit(lbus, mp_bus_not_pci) && 959 if (test_bit(lbus, mp_bus_not_pci) &&
727 (mp_irqs[i].mp_irqtype == type) && 960 (mp_irqs[i].irqtype == type) &&
728 (mp_irqs[i].mp_srcbusirq == irq)) 961 (mp_irqs[i].srcbusirq == irq))
729 962
730 return mp_irqs[i].mp_dstirq; 963 return mp_irqs[i].dstirq;
731 } 964 }
732 return -1; 965 return -1;
733} 966}
@@ -737,17 +970,17 @@ static int __init find_isa_irq_apic(int irq, int type)
737 int i; 970 int i;
738 971
739 for (i = 0; i < mp_irq_entries; i++) { 972 for (i = 0; i < mp_irq_entries; i++) {
740 int lbus = mp_irqs[i].mp_srcbus; 973 int lbus = mp_irqs[i].srcbus;
741 974
742 if (test_bit(lbus, mp_bus_not_pci) && 975 if (test_bit(lbus, mp_bus_not_pci) &&
743 (mp_irqs[i].mp_irqtype == type) && 976 (mp_irqs[i].irqtype == type) &&
744 (mp_irqs[i].mp_srcbusirq == irq)) 977 (mp_irqs[i].srcbusirq == irq))
745 break; 978 break;
746 } 979 }
747 if (i < mp_irq_entries) { 980 if (i < mp_irq_entries) {
748 int apic; 981 int apic;
749 for(apic = 0; apic < nr_ioapics; apic++) { 982 for(apic = 0; apic < nr_ioapics; apic++) {
750 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) 983 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic)
751 return apic; 984 return apic;
752 } 985 }
753 } 986 }
@@ -772,23 +1005,23 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
772 return -1; 1005 return -1;
773 } 1006 }
774 for (i = 0; i < mp_irq_entries; i++) { 1007 for (i = 0; i < mp_irq_entries; i++) {
775 int lbus = mp_irqs[i].mp_srcbus; 1008 int lbus = mp_irqs[i].srcbus;
776 1009
777 for (apic = 0; apic < nr_ioapics; apic++) 1010 for (apic = 0; apic < nr_ioapics; apic++)
778 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || 1011 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
779 mp_irqs[i].mp_dstapic == MP_APIC_ALL) 1012 mp_irqs[i].dstapic == MP_APIC_ALL)
780 break; 1013 break;
781 1014
782 if (!test_bit(lbus, mp_bus_not_pci) && 1015 if (!test_bit(lbus, mp_bus_not_pci) &&
783 !mp_irqs[i].mp_irqtype && 1016 !mp_irqs[i].irqtype &&
784 (bus == lbus) && 1017 (bus == lbus) &&
785 (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { 1018 (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
786 int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); 1019 int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
787 1020
788 if (!(apic || IO_APIC_IRQ(irq))) 1021 if (!(apic || IO_APIC_IRQ(irq)))
789 continue; 1022 continue;
790 1023
791 if (pin == (mp_irqs[i].mp_srcbusirq & 3)) 1024 if (pin == (mp_irqs[i].srcbusirq & 3))
792 return irq; 1025 return irq;
793 /* 1026 /*
794 * Use the first all-but-pin matching entry as a 1027 * Use the first all-but-pin matching entry as a
@@ -809,7 +1042,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
809 */ 1042 */
810static int EISA_ELCR(unsigned int irq) 1043static int EISA_ELCR(unsigned int irq)
811{ 1044{
812 if (irq < 16) { 1045 if (irq < NR_IRQS_LEGACY) {
813 unsigned int port = 0x4d0 + (irq >> 3); 1046 unsigned int port = 0x4d0 + (irq >> 3);
814 return (inb(port) >> (irq & 7)) & 1; 1047 return (inb(port) >> (irq & 7)) & 1;
815 } 1048 }
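/*
 * Editor's sketch: the EISA_ELCR() hunk above reads the edge/level control
 * registers at I/O ports 0x4d0/0x4d1 -- one bit per legacy IRQ, eight IRQs
 * per port.  The port/bit arithmetic, pulled out on its own:
 */
#include <stdio.h>

int main(void)
{
	for (unsigned int irq = 0; irq < 16; irq++) {
		unsigned int port = 0x4d0 + (irq >> 3);	/* 0x4d0 for 0-7, 0x4d1 for 8-15 */
		unsigned int bit  = irq & 7;		/* bit index within that port */

		printf("IRQ %2u -> port %#x bit %u\n", irq, port, bit);
	}
	return 0;
}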
@@ -831,7 +1064,7 @@ static int EISA_ELCR(unsigned int irq)
831 * EISA conforming in the MP table, that means its trigger type must 1064 * EISA conforming in the MP table, that means its trigger type must
832 * be read in from the ELCR */ 1065 * be read in from the ELCR */
833 1066
834#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) 1067#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
835#define default_EISA_polarity(idx) default_ISA_polarity(idx) 1068#define default_EISA_polarity(idx) default_ISA_polarity(idx)
836 1069
837/* PCI interrupts are always polarity one level triggered, 1070/* PCI interrupts are always polarity one level triggered,
@@ -848,13 +1081,13 @@ static int EISA_ELCR(unsigned int irq)
848 1081
849static int MPBIOS_polarity(int idx) 1082static int MPBIOS_polarity(int idx)
850{ 1083{
851 int bus = mp_irqs[idx].mp_srcbus; 1084 int bus = mp_irqs[idx].srcbus;
852 int polarity; 1085 int polarity;
853 1086
854 /* 1087 /*
855 * Determine IRQ line polarity (high active or low active): 1088 * Determine IRQ line polarity (high active or low active):
856 */ 1089 */
857 switch (mp_irqs[idx].mp_irqflag & 3) 1090 switch (mp_irqs[idx].irqflag & 3)
858 { 1091 {
859 case 0: /* conforms, ie. bus-type dependent polarity */ 1092 case 0: /* conforms, ie. bus-type dependent polarity */
860 if (test_bit(bus, mp_bus_not_pci)) 1093 if (test_bit(bus, mp_bus_not_pci))
@@ -890,13 +1123,13 @@ static int MPBIOS_polarity(int idx)
890 1123
891static int MPBIOS_trigger(int idx) 1124static int MPBIOS_trigger(int idx)
892{ 1125{
893 int bus = mp_irqs[idx].mp_srcbus; 1126 int bus = mp_irqs[idx].srcbus;
894 int trigger; 1127 int trigger;
895 1128
896 /* 1129 /*
897 * Determine IRQ trigger mode (edge or level sensitive): 1130 * Determine IRQ trigger mode (edge or level sensitive):
898 */ 1131 */
899 switch ((mp_irqs[idx].mp_irqflag>>2) & 3) 1132 switch ((mp_irqs[idx].irqflag>>2) & 3)
900 { 1133 {
901 case 0: /* conforms, ie. bus-type dependent */ 1134 case 0: /* conforms, ie. bus-type dependent */
902 if (test_bit(bus, mp_bus_not_pci)) 1135 if (test_bit(bus, mp_bus_not_pci))
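/*
 * Editor's sketch: MPBIOS_polarity()/MPBIOS_trigger() above decode the MP
 * table irqflag word -- polarity in bits 0-1, trigger mode in bits 2-3,
 * with 0 meaning "conforms to the bus" (the usual MP-spec encoding).  A
 * tiny stand-alone decoder for illustration:
 */
#include <stdio.h>

static const char *pol_name[4] = {
	"conforms to bus", "active high", "reserved", "active low"
};
static const char *trig_name[4] = {
	"conforms to bus", "edge", "reserved", "level"
};

static void decode_irqflag(unsigned int irqflag)
{
	unsigned int polarity = irqflag & 3;		/* bits 0-1 */
	unsigned int trigger  = (irqflag >> 2) & 3;	/* bits 2-3 */

	printf("irqflag %#x: polarity=%s, trigger=%s\n",
	       irqflag, pol_name[polarity], trig_name[trigger]);
}

int main(void)
{
	decode_irqflag(0x0);	/* both conform to the bus default */
	decode_irqflag(0xf);	/* active low, level triggered (typical PCI) */
	return 0;
}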
@@ -974,16 +1207,16 @@ int (*ioapic_renumber_irq)(int ioapic, int irq);
974static int pin_2_irq(int idx, int apic, int pin) 1207static int pin_2_irq(int idx, int apic, int pin)
975{ 1208{
976 int irq, i; 1209 int irq, i;
977 int bus = mp_irqs[idx].mp_srcbus; 1210 int bus = mp_irqs[idx].srcbus;
978 1211
979 /* 1212 /*
980 * Debugging check, we are in big trouble if this message pops up! 1213 * Debugging check, we are in big trouble if this message pops up!
981 */ 1214 */
982 if (mp_irqs[idx].mp_dstirq != pin) 1215 if (mp_irqs[idx].dstirq != pin)
983 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); 1216 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
984 1217
985 if (test_bit(bus, mp_bus_not_pci)) { 1218 if (test_bit(bus, mp_bus_not_pci)) {
986 irq = mp_irqs[idx].mp_srcbusirq; 1219 irq = mp_irqs[idx].srcbusirq;
987 } else { 1220 } else {
988 /* 1221 /*
989 * PCI IRQs are mapped in order 1222 * PCI IRQs are mapped in order
@@ -1034,7 +1267,8 @@ void unlock_vector_lock(void)
1034 spin_unlock(&vector_lock); 1267 spin_unlock(&vector_lock);
1035} 1268}
1036 1269
1037static int __assign_irq_vector(int irq, cpumask_t mask) 1270static int
1271__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1038{ 1272{
1039 /* 1273 /*
1040 * NOTE! The local APIC isn't very good at handling 1274 * NOTE! The local APIC isn't very good at handling
@@ -1049,52 +1283,49 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
1049 */ 1283 */
1050 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; 1284 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
1051 unsigned int old_vector; 1285 unsigned int old_vector;
1052 int cpu; 1286 int cpu, err;
1053 struct irq_cfg *cfg; 1287 cpumask_var_t tmp_mask;
1054
1055 cfg = irq_cfg(irq);
1056
1057 /* Only try and allocate irqs on cpus that are present */
1058 cpus_and(mask, mask, cpu_online_map);
1059 1288
1060 if ((cfg->move_in_progress) || cfg->move_cleanup_count) 1289 if ((cfg->move_in_progress) || cfg->move_cleanup_count)
1061 return -EBUSY; 1290 return -EBUSY;
1062 1291
1292 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
1293 return -ENOMEM;
1294
1063 old_vector = cfg->vector; 1295 old_vector = cfg->vector;
1064 if (old_vector) { 1296 if (old_vector) {
1065 cpumask_t tmp; 1297 cpumask_and(tmp_mask, mask, cpu_online_mask);
1066 cpus_and(tmp, cfg->domain, mask); 1298 cpumask_and(tmp_mask, cfg->domain, tmp_mask);
1067 if (!cpus_empty(tmp)) 1299 if (!cpumask_empty(tmp_mask)) {
1300 free_cpumask_var(tmp_mask);
1068 return 0; 1301 return 0;
1302 }
1069 } 1303 }
1070 1304
1071 for_each_cpu_mask_nr(cpu, mask) { 1305 /* Only try and allocate irqs on cpus that are present */
1072 cpumask_t domain, new_mask; 1306 err = -ENOSPC;
1307 for_each_cpu_and(cpu, mask, cpu_online_mask) {
1073 int new_cpu; 1308 int new_cpu;
1074 int vector, offset; 1309 int vector, offset;
1075 1310
1076 domain = vector_allocation_domain(cpu); 1311 apic->vector_allocation_domain(cpu, tmp_mask);
1077 cpus_and(new_mask, domain, cpu_online_map);
1078 1312
1079 vector = current_vector; 1313 vector = current_vector;
1080 offset = current_offset; 1314 offset = current_offset;
1081next: 1315next:
1082 vector += 8; 1316 vector += 8;
1083 if (vector >= first_system_vector) { 1317 if (vector >= first_system_vector) {
1084 /* If we run out of vectors on large boxen, must share them. */ 1318 /* If out of vectors on large boxen, must share them. */
1085 offset = (offset + 1) % 8; 1319 offset = (offset + 1) % 8;
1086 vector = FIRST_DEVICE_VECTOR + offset; 1320 vector = FIRST_DEVICE_VECTOR + offset;
1087 } 1321 }
1088 if (unlikely(current_vector == vector)) 1322 if (unlikely(current_vector == vector))
1089 continue; 1323 continue;
1090#ifdef CONFIG_X86_64 1324
1091 if (vector == IA32_SYSCALL_VECTOR) 1325 if (test_bit(vector, used_vectors))
1092 goto next;
1093#else
1094 if (vector == SYSCALL_VECTOR)
1095 goto next; 1326 goto next;
1096#endif 1327
1097 for_each_cpu_mask_nr(new_cpu, new_mask) 1328 for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
1098 if (per_cpu(vector_irq, new_cpu)[vector] != -1) 1329 if (per_cpu(vector_irq, new_cpu)[vector] != -1)
1099 goto next; 1330 goto next;
1100 /* Found one! */ 1331 /* Found one! */
@@ -1102,44 +1333,56 @@ next:
1102 current_offset = offset; 1333 current_offset = offset;
1103 if (old_vector) { 1334 if (old_vector) {
1104 cfg->move_in_progress = 1; 1335 cfg->move_in_progress = 1;
1105 cfg->old_domain = cfg->domain; 1336 cpumask_copy(cfg->old_domain, cfg->domain);
1106 } 1337 }
1107 for_each_cpu_mask_nr(new_cpu, new_mask) 1338 for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
1108 per_cpu(vector_irq, new_cpu)[vector] = irq; 1339 per_cpu(vector_irq, new_cpu)[vector] = irq;
1109 cfg->vector = vector; 1340 cfg->vector = vector;
1110 cfg->domain = domain; 1341 cpumask_copy(cfg->domain, tmp_mask);
1111 return 0; 1342 err = 0;
1343 break;
1112 } 1344 }
1113 return -ENOSPC; 1345 free_cpumask_var(tmp_mask);
1346 return err;
1114} 1347}
1115 1348
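/*
 * Editor's sketch (simplified, with invented demo constants): the
 * __assign_irq_vector() loop above hands out vectors in steps of 8 so
 * successive allocations avoid piling into the same interrupt priority
 * level, wraps onto a new offset when it runs past the last usable vector,
 * and skips any vector already claimed in the per-CPU table.  Stripped of
 * the cpumask handling, the search looks roughly like this:
 */
#include <stdio.h>

#define DEMO_FIRST_VECTOR	0x41	/* assumed start, not the kernel value */
#define DEMO_LAST_VECTOR	0xef	/* assumed end of the device range */
#define NVEC			256

static int vector_in_use[NVEC];

static int demo_alloc_vector(void)
{
	static int current_vector = DEMO_FIRST_VECTOR, current_offset;
	int vector = current_vector, offset = current_offset;

	for (;;) {
		vector += 8;				/* next priority class */
		if (vector > DEMO_LAST_VECTOR) {
			offset = (offset + 1) % 8;	/* wrap to a new column */
			vector = DEMO_FIRST_VECTOR + offset;
		}
		if (vector == current_vector)		/* full circle: give up */
			return -1;
		if (vector_in_use[vector])
			continue;
		vector_in_use[vector] = 1;		/* found a free one */
		current_vector = vector;
		current_offset = offset;
		return vector;
	}
}

int main(void)
{
	for (int i = 0; i < 5; i++) {
		int v = demo_alloc_vector();

		printf("allocated vector %#x\n", (unsigned int)v);
	}
	return 0;
}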
1116static int assign_irq_vector(int irq, cpumask_t mask) 1349static int
1350assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1117{ 1351{
1118 int err; 1352 int err;
1119 unsigned long flags; 1353 unsigned long flags;
1120 1354
1121 spin_lock_irqsave(&vector_lock, flags); 1355 spin_lock_irqsave(&vector_lock, flags);
1122 err = __assign_irq_vector(irq, mask); 1356 err = __assign_irq_vector(irq, cfg, mask);
1123 spin_unlock_irqrestore(&vector_lock, flags); 1357 spin_unlock_irqrestore(&vector_lock, flags);
1124 return err; 1358 return err;
1125} 1359}
1126 1360
1127static void __clear_irq_vector(int irq) 1361static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
1128{ 1362{
1129 struct irq_cfg *cfg;
1130 cpumask_t mask;
1131 int cpu, vector; 1363 int cpu, vector;
1132 1364
1133 cfg = irq_cfg(irq);
1134 BUG_ON(!cfg->vector); 1365 BUG_ON(!cfg->vector);
1135 1366
1136 vector = cfg->vector; 1367 vector = cfg->vector;
1137 cpus_and(mask, cfg->domain, cpu_online_map); 1368 for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
1138 for_each_cpu_mask_nr(cpu, mask)
1139 per_cpu(vector_irq, cpu)[vector] = -1; 1369 per_cpu(vector_irq, cpu)[vector] = -1;
1140 1370
1141 cfg->vector = 0; 1371 cfg->vector = 0;
1142 cpus_clear(cfg->domain); 1372 cpumask_clear(cfg->domain);
1373
1374 if (likely(!cfg->move_in_progress))
1375 return;
1376 for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
1377 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
1378 vector++) {
1379 if (per_cpu(vector_irq, cpu)[vector] != irq)
1380 continue;
1381 per_cpu(vector_irq, cpu)[vector] = -1;
1382 break;
1383 }
1384 }
1385 cfg->move_in_progress = 0;
1143} 1386}
1144 1387
1145void __setup_vector_irq(int cpu) 1388void __setup_vector_irq(int cpu)
@@ -1148,10 +1391,12 @@ void __setup_vector_irq(int cpu)
1148 /* This function must be called with vector_lock held */ 1391 /* This function must be called with vector_lock held */
1149 int irq, vector; 1392 int irq, vector;
1150 struct irq_cfg *cfg; 1393 struct irq_cfg *cfg;
1394 struct irq_desc *desc;
1151 1395
1152 /* Mark the inuse vectors */ 1396 /* Mark the inuse vectors */
1153 for_each_irq_cfg(irq, cfg) { 1397 for_each_irq_desc(irq, desc) {
1154 if (!cpu_isset(cpu, cfg->domain)) 1398 cfg = desc->chip_data;
1399 if (!cpumask_test_cpu(cpu, cfg->domain))
1155 continue; 1400 continue;
1156 vector = cfg->vector; 1401 vector = cfg->vector;
1157 per_cpu(vector_irq, cpu)[vector] = irq; 1402 per_cpu(vector_irq, cpu)[vector] = irq;
@@ -1163,7 +1408,7 @@ void __setup_vector_irq(int cpu)
1163 continue; 1408 continue;
1164 1409
1165 cfg = irq_cfg(irq); 1410 cfg = irq_cfg(irq);
1166 if (!cpu_isset(cpu, cfg->domain)) 1411 if (!cpumask_test_cpu(cpu, cfg->domain))
1167 per_cpu(vector_irq, cpu)[vector] = -1; 1412 per_cpu(vector_irq, cpu)[vector] = -1;
1168 } 1413 }
1169} 1414}
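/*
 * Editor's sketch: __setup_vector_irq() above rebuilds one CPU's
 * vector -> irq translation table: every IRQ whose domain includes the CPU
 * gets its vector marked, and any stale slot whose IRQ no longer targets
 * this CPU is reset to -1.  A toy model with a bitmask standing in for the
 * cpumask domain:
 */
#include <stdio.h>

#define NVEC	16
#define NIRQ	4

struct toy_cfg { int vector; unsigned int domain; };	/* domain: cpu bitmask */

static struct toy_cfg cfgs[NIRQ] = {
	{ 3, 0x1 }, { 5, 0x2 }, { 7, 0x3 }, { 9, 0x1 },
};
static int vector_irq[2][NVEC];		/* per-cpu vector -> irq table */

static void setup_vector_irq(int cpu)
{
	/* mark the vectors of every irq routed to this cpu */
	for (int irq = 0; irq < NIRQ; irq++) {
		if (!(cfgs[irq].domain & (1u << cpu)))
			continue;
		vector_irq[cpu][cfgs[irq].vector] = irq;
	}
	/* drop stale entries whose irq has since moved away */
	for (int vector = 0; vector < NVEC; vector++) {
		int irq = vector_irq[cpu][vector];

		if (irq < 0)
			continue;
		if (!(cfgs[irq].domain & (1u << cpu)))
			vector_irq[cpu][vector] = -1;
	}
}

int main(void)
{
	for (int cpu = 0; cpu < 2; cpu++)
		for (int v = 0; v < NVEC; v++)
			vector_irq[cpu][v] = -1;

	setup_vector_irq(1);
	for (int v = 0; v < NVEC; v++)
		if (vector_irq[1][v] >= 0)
			printf("cpu1: vector %d -> irq %d\n", v, vector_irq[1][v]);
	return 0;
}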
@@ -1201,11 +1446,8 @@ static inline int IO_APIC_irq_trigger(int irq)
1201} 1446}
1202#endif 1447#endif
1203 1448
1204static void ioapic_register_intr(int irq, unsigned long trigger) 1449static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
1205{ 1450{
1206 struct irq_desc *desc;
1207
1208 desc = irq_to_desc(irq);
1209 1451
1210 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1452 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1211 trigger == IOAPIC_LEVEL) 1453 trigger == IOAPIC_LEVEL)
@@ -1236,10 +1478,10 @@ static void ioapic_register_intr(int irq, unsigned long trigger)
1236 handle_edge_irq, "edge"); 1478 handle_edge_irq, "edge");
1237} 1479}
1238 1480
1239static int setup_ioapic_entry(int apic, int irq, 1481int setup_ioapic_entry(int apic_id, int irq,
1240 struct IO_APIC_route_entry *entry, 1482 struct IO_APIC_route_entry *entry,
1241 unsigned int destination, int trigger, 1483 unsigned int destination, int trigger,
1242 int polarity, int vector) 1484 int polarity, int vector)
1243{ 1485{
1244 /* 1486 /*
1245 * add it to the IO-APIC irq-routing table: 1487 * add it to the IO-APIC irq-routing table:
@@ -1248,25 +1490,25 @@ static int setup_ioapic_entry(int apic, int irq,
1248 1490
1249#ifdef CONFIG_INTR_REMAP 1491#ifdef CONFIG_INTR_REMAP
1250 if (intr_remapping_enabled) { 1492 if (intr_remapping_enabled) {
1251 struct intel_iommu *iommu = map_ioapic_to_ir(apic); 1493 struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
1252 struct irte irte; 1494 struct irte irte;
1253 struct IR_IO_APIC_route_entry *ir_entry = 1495 struct IR_IO_APIC_route_entry *ir_entry =
1254 (struct IR_IO_APIC_route_entry *) entry; 1496 (struct IR_IO_APIC_route_entry *) entry;
1255 int index; 1497 int index;
1256 1498
1257 if (!iommu) 1499 if (!iommu)
1258 panic("No mapping iommu for ioapic %d\n", apic); 1500 panic("No mapping iommu for ioapic %d\n", apic_id);
1259 1501
1260 index = alloc_irte(iommu, irq, 1); 1502 index = alloc_irte(iommu, irq, 1);
1261 if (index < 0) 1503 if (index < 0)
1262 panic("Failed to allocate IRTE for ioapic %d\n", apic); 1504 panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
1263 1505
1264 memset(&irte, 0, sizeof(irte)); 1506 memset(&irte, 0, sizeof(irte));
1265 1507
1266 irte.present = 1; 1508 irte.present = 1;
1267 irte.dst_mode = INT_DEST_MODE; 1509 irte.dst_mode = apic->irq_dest_mode;
1268 irte.trigger_mode = trigger; 1510 irte.trigger_mode = trigger;
1269 irte.dlvry_mode = INT_DELIVERY_MODE; 1511 irte.dlvry_mode = apic->irq_delivery_mode;
1270 irte.vector = vector; 1512 irte.vector = vector;
1271 irte.dest_id = IRTE_DEST(destination); 1513 irte.dest_id = IRTE_DEST(destination);
1272 1514
@@ -1279,8 +1521,8 @@ static int setup_ioapic_entry(int apic, int irq,
1279 } else 1521 } else
1280#endif 1522#endif
1281 { 1523 {
1282 entry->delivery_mode = INT_DELIVERY_MODE; 1524 entry->delivery_mode = apic->irq_delivery_mode;
1283 entry->dest_mode = INT_DEST_MODE; 1525 entry->dest_mode = apic->irq_dest_mode;
1284 entry->dest = destination; 1526 entry->dest = destination;
1285 } 1527 }
1286 1528
@@ -1297,69 +1539,68 @@ static int setup_ioapic_entry(int apic, int irq,
1297 return 0; 1539 return 0;
1298} 1540}
1299 1541
1300static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, 1542static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc,
1301 int trigger, int polarity) 1543 int trigger, int polarity)
1302{ 1544{
1303 struct irq_cfg *cfg; 1545 struct irq_cfg *cfg;
1304 struct IO_APIC_route_entry entry; 1546 struct IO_APIC_route_entry entry;
1305 cpumask_t mask; 1547 unsigned int dest;
1306 1548
1307 if (!IO_APIC_IRQ(irq)) 1549 if (!IO_APIC_IRQ(irq))
1308 return; 1550 return;
1309 1551
1310 cfg = irq_cfg(irq); 1552 cfg = desc->chip_data;
1311 1553
1312 mask = TARGET_CPUS; 1554 if (assign_irq_vector(irq, cfg, apic->target_cpus()))
1313 if (assign_irq_vector(irq, mask))
1314 return; 1555 return;
1315 1556
1316 cpus_and(mask, cfg->domain, mask); 1557 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
1317 1558
1318 apic_printk(APIC_VERBOSE,KERN_DEBUG 1559 apic_printk(APIC_VERBOSE,KERN_DEBUG
1319 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1560 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
1320 "IRQ %d Mode:%i Active:%i)\n", 1561 "IRQ %d Mode:%i Active:%i)\n",
1321 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, 1562 apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector,
1322 irq, trigger, polarity); 1563 irq, trigger, polarity);
1323 1564
1324 1565
1325 if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, 1566 if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
1326 cpu_mask_to_apicid(mask), trigger, polarity, 1567 dest, trigger, polarity, cfg->vector)) {
1327 cfg->vector)) {
1328 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1568 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1329 mp_ioapics[apic].mp_apicid, pin); 1569 mp_ioapics[apic_id].apicid, pin);
1330 __clear_irq_vector(irq); 1570 __clear_irq_vector(irq, cfg);
1331 return; 1571 return;
1332 } 1572 }
1333 1573
1334 ioapic_register_intr(irq, trigger); 1574 ioapic_register_intr(irq, desc, trigger);
1335 if (irq < 16) 1575 if (irq < NR_IRQS_LEGACY)
1336 disable_8259A_irq(irq); 1576 disable_8259A_irq(irq);
1337 1577
1338 ioapic_write_entry(apic, pin, entry); 1578 ioapic_write_entry(apic_id, pin, entry);
1339} 1579}
1340 1580
1341static void __init setup_IO_APIC_irqs(void) 1581static void __init setup_IO_APIC_irqs(void)
1342{ 1582{
1343 int apic, pin, idx, irq; 1583 int apic_id, pin, idx, irq;
1344 int notcon = 0; 1584 int notcon = 0;
1585 struct irq_desc *desc;
1586 struct irq_cfg *cfg;
1587 int cpu = boot_cpu_id;
1345 1588
1346 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1589 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1347 1590
1348 for (apic = 0; apic < nr_ioapics; apic++) { 1591 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
1349 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1592 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1350 1593
1351 idx = find_irq_entry(apic, pin, mp_INT); 1594 idx = find_irq_entry(apic_id, pin, mp_INT);
1352 if (idx == -1) { 1595 if (idx == -1) {
1353 if (!notcon) { 1596 if (!notcon) {
1354 notcon = 1; 1597 notcon = 1;
1355 apic_printk(APIC_VERBOSE, 1598 apic_printk(APIC_VERBOSE,
1356 KERN_DEBUG " %d-%d", 1599 KERN_DEBUG " %d-%d",
1357 mp_ioapics[apic].mp_apicid, 1600 mp_ioapics[apic_id].apicid, pin);
1358 pin);
1359 } else 1601 } else
1360 apic_printk(APIC_VERBOSE, " %d-%d", 1602 apic_printk(APIC_VERBOSE, " %d-%d",
1361 mp_ioapics[apic].mp_apicid, 1603 mp_ioapics[apic_id].apicid, pin);
1362 pin);
1363 continue; 1604 continue;
1364 } 1605 }
1365 if (notcon) { 1606 if (notcon) {
@@ -1368,14 +1609,25 @@ static void __init setup_IO_APIC_irqs(void)
1368 notcon = 0; 1609 notcon = 0;
1369 } 1610 }
1370 1611
1371 irq = pin_2_irq(idx, apic, pin); 1612 irq = pin_2_irq(idx, apic_id, pin);
1372#ifdef CONFIG_X86_32 1613
1373 if (multi_timer_check(apic, irq)) 1614 /*
1615 * Skip the timer IRQ if there's a quirk handler
1616 * installed and if it returns 1:
1617 */
1618 if (apic->multi_timer_check &&
1619 apic->multi_timer_check(apic_id, irq))
1374 continue; 1620 continue;
1375#endif
1376 add_pin_to_irq(irq, apic, pin);
1377 1621
1378 setup_IO_APIC_irq(apic, pin, irq, 1622 desc = irq_to_desc_alloc_cpu(irq, cpu);
1623 if (!desc) {
1624 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1625 continue;
1626 }
1627 cfg = desc->chip_data;
1628 add_pin_to_irq_cpu(cfg, cpu, apic_id, pin);
1629
1630 setup_IO_APIC_irq(apic_id, pin, irq, desc,
1379 irq_trigger(idx), irq_polarity(idx)); 1631 irq_trigger(idx), irq_polarity(idx));
1380 } 1632 }
1381 } 1633 }
@@ -1388,7 +1640,7 @@ static void __init setup_IO_APIC_irqs(void)
1388/* 1640/*
1389 * Set up the timer pin, possibly with the 8259A-master behind. 1641 * Set up the timer pin, possibly with the 8259A-master behind.
1390 */ 1642 */
1391static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, 1643static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
1392 int vector) 1644 int vector)
1393{ 1645{
1394 struct IO_APIC_route_entry entry; 1646 struct IO_APIC_route_entry entry;
@@ -1404,10 +1656,10 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1404 * We use logical delivery to get the timer IRQ 1656 * We use logical delivery to get the timer IRQ
1405 * to the first CPU. 1657 * to the first CPU.
1406 */ 1658 */
1407 entry.dest_mode = INT_DEST_MODE; 1659 entry.dest_mode = apic->irq_dest_mode;
1408 entry.mask = 1; /* mask IRQ now */ 1660 entry.mask = 0; /* don't mask IRQ for edge */
1409 entry.dest = cpu_mask_to_apicid(TARGET_CPUS); 1661 entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus());
1410 entry.delivery_mode = INT_DELIVERY_MODE; 1662 entry.delivery_mode = apic->irq_delivery_mode;
1411 entry.polarity = 0; 1663 entry.polarity = 0;
1412 entry.trigger = 0; 1664 entry.trigger = 0;
1413 entry.vector = vector; 1665 entry.vector = vector;
@@ -1421,7 +1673,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1421 /* 1673 /*
1422 * Add it to the IO-APIC irq-routing table: 1674 * Add it to the IO-APIC irq-routing table:
1423 */ 1675 */
1424 ioapic_write_entry(apic, pin, entry); 1676 ioapic_write_entry(apic_id, pin, entry);
1425} 1677}
1426 1678
1427 1679
@@ -1434,6 +1686,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1434 union IO_APIC_reg_03 reg_03; 1686 union IO_APIC_reg_03 reg_03;
1435 unsigned long flags; 1687 unsigned long flags;
1436 struct irq_cfg *cfg; 1688 struct irq_cfg *cfg;
1689 struct irq_desc *desc;
1437 unsigned int irq; 1690 unsigned int irq;
1438 1691
1439 if (apic_verbosity == APIC_QUIET) 1692 if (apic_verbosity == APIC_QUIET)
@@ -1442,7 +1695,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1442 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1695 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1443 for (i = 0; i < nr_ioapics; i++) 1696 for (i = 0; i < nr_ioapics; i++)
1444 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1697 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1445 mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); 1698 mp_ioapics[i].apicid, nr_ioapic_registers[i]);
1446 1699
1447 /* 1700 /*
1448 * We are a bit conservative about what we expect. We have to 1701 * We are a bit conservative about what we expect. We have to
@@ -1462,7 +1715,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1462 spin_unlock_irqrestore(&ioapic_lock, flags); 1715 spin_unlock_irqrestore(&ioapic_lock, flags);
1463 1716
1464 printk("\n"); 1717 printk("\n");
1465 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); 1718 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
1466 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1719 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1467 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1720 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1468 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); 1721 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1523,8 +1776,11 @@ __apicdebuginit(void) print_IO_APIC(void)
1523 } 1776 }
1524 } 1777 }
1525 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1778 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1526 for_each_irq_cfg(irq, cfg) { 1779 for_each_irq_desc(irq, desc) {
1527 struct irq_pin_list *entry = cfg->irq_2_pin; 1780 struct irq_pin_list *entry;
1781
1782 cfg = desc->chip_data;
1783 entry = cfg->irq_2_pin;
1528 if (!entry) 1784 if (!entry)
1529 continue; 1785 continue;
1530 printk(KERN_DEBUG "IRQ%d ", irq); 1786 printk(KERN_DEBUG "IRQ%d ", irq);
@@ -1830,7 +2086,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
1830{ 2086{
1831 union IO_APIC_reg_00 reg_00; 2087 union IO_APIC_reg_00 reg_00;
1832 physid_mask_t phys_id_present_map; 2088 physid_mask_t phys_id_present_map;
1833 int apic; 2089 int apic_id;
1834 int i; 2090 int i;
1835 unsigned char old_id; 2091 unsigned char old_id;
1836 unsigned long flags; 2092 unsigned long flags;
@@ -1849,26 +2105,26 @@ static void __init setup_ioapic_ids_from_mpc(void)
1849 * This is broken; anything with a real cpu count has to 2105 * This is broken; anything with a real cpu count has to
1850 * circumvent this idiocy regardless. 2106 * circumvent this idiocy regardless.
1851 */ 2107 */
1852 phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); 2108 phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
1853 2109
1854 /* 2110 /*
1855 * Set the IOAPIC ID to the value stored in the MPC table. 2111 * Set the IOAPIC ID to the value stored in the MPC table.
1856 */ 2112 */
1857 for (apic = 0; apic < nr_ioapics; apic++) { 2113 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
1858 2114
1859 /* Read the register 0 value */ 2115 /* Read the register 0 value */
1860 spin_lock_irqsave(&ioapic_lock, flags); 2116 spin_lock_irqsave(&ioapic_lock, flags);
1861 reg_00.raw = io_apic_read(apic, 0); 2117 reg_00.raw = io_apic_read(apic_id, 0);
1862 spin_unlock_irqrestore(&ioapic_lock, flags); 2118 spin_unlock_irqrestore(&ioapic_lock, flags);
1863 2119
1864 old_id = mp_ioapics[apic].mp_apicid; 2120 old_id = mp_ioapics[apic_id].apicid;
1865 2121
1866 if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { 2122 if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) {
1867 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", 2123 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1868 apic, mp_ioapics[apic].mp_apicid); 2124 apic_id, mp_ioapics[apic_id].apicid);
1869 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 2125 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1870 reg_00.bits.ID); 2126 reg_00.bits.ID);
1871 mp_ioapics[apic].mp_apicid = reg_00.bits.ID; 2127 mp_ioapics[apic_id].apicid = reg_00.bits.ID;
1872 } 2128 }
1873 2129
1874 /* 2130 /*
@@ -1876,10 +2132,10 @@ static void __init setup_ioapic_ids_from_mpc(void)
1876 * system must have a unique ID or we get lots of nice 2132 * system must have a unique ID or we get lots of nice
1877 * 'stuck on smp_invalidate_needed IPI wait' messages. 2133 * 'stuck on smp_invalidate_needed IPI wait' messages.
1878 */ 2134 */
1879 if (check_apicid_used(phys_id_present_map, 2135 if (apic->check_apicid_used(phys_id_present_map,
1880 mp_ioapics[apic].mp_apicid)) { 2136 mp_ioapics[apic_id].apicid)) {
1881 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 2137 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1882 apic, mp_ioapics[apic].mp_apicid); 2138 apic_id, mp_ioapics[apic_id].apicid);
1883 for (i = 0; i < get_physical_broadcast(); i++) 2139 for (i = 0; i < get_physical_broadcast(); i++)
1884 if (!physid_isset(i, phys_id_present_map)) 2140 if (!physid_isset(i, phys_id_present_map))
1885 break; 2141 break;
@@ -1888,13 +2144,13 @@ static void __init setup_ioapic_ids_from_mpc(void)
1888 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 2144 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1889 i); 2145 i);
1890 physid_set(i, phys_id_present_map); 2146 physid_set(i, phys_id_present_map);
1891 mp_ioapics[apic].mp_apicid = i; 2147 mp_ioapics[apic_id].apicid = i;
1892 } else { 2148 } else {
1893 physid_mask_t tmp; 2149 physid_mask_t tmp;
1894 tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); 2150 tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid);
1895 apic_printk(APIC_VERBOSE, "Setting %d in the " 2151 apic_printk(APIC_VERBOSE, "Setting %d in the "
1896 "phys_id_present_map\n", 2152 "phys_id_present_map\n",
1897 mp_ioapics[apic].mp_apicid); 2153 mp_ioapics[apic_id].apicid);
1898 physids_or(phys_id_present_map, phys_id_present_map, tmp); 2154 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1899 } 2155 }
1900 2156
@@ -1903,11 +2159,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
1903 * We need to adjust the IRQ routing table 2159 * We need to adjust the IRQ routing table
1904 * if the ID changed. 2160 * if the ID changed.
1905 */ 2161 */
1906 if (old_id != mp_ioapics[apic].mp_apicid) 2162 if (old_id != mp_ioapics[apic_id].apicid)
1907 for (i = 0; i < mp_irq_entries; i++) 2163 for (i = 0; i < mp_irq_entries; i++)
1908 if (mp_irqs[i].mp_dstapic == old_id) 2164 if (mp_irqs[i].dstapic == old_id)
1909 mp_irqs[i].mp_dstapic 2165 mp_irqs[i].dstapic
1910 = mp_ioapics[apic].mp_apicid; 2166 = mp_ioapics[apic_id].apicid;
1911 2167
1912 /* 2168 /*
1913 * Read the right value from the MPC table and 2169 * Read the right value from the MPC table and
@@ -1915,20 +2171,20 @@ static void __init setup_ioapic_ids_from_mpc(void)
1915 */ 2171 */
1916 apic_printk(APIC_VERBOSE, KERN_INFO 2172 apic_printk(APIC_VERBOSE, KERN_INFO
1917 "...changing IO-APIC physical APIC ID to %d ...", 2173 "...changing IO-APIC physical APIC ID to %d ...",
1918 mp_ioapics[apic].mp_apicid); 2174 mp_ioapics[apic_id].apicid);
1919 2175
1920 reg_00.bits.ID = mp_ioapics[apic].mp_apicid; 2176 reg_00.bits.ID = mp_ioapics[apic_id].apicid;
1921 spin_lock_irqsave(&ioapic_lock, flags); 2177 spin_lock_irqsave(&ioapic_lock, flags);
1922 io_apic_write(apic, 0, reg_00.raw); 2178 io_apic_write(apic_id, 0, reg_00.raw);
1923 spin_unlock_irqrestore(&ioapic_lock, flags); 2179 spin_unlock_irqrestore(&ioapic_lock, flags);
1924 2180
1925 /* 2181 /*
1926 * Sanity check 2182 * Sanity check
1927 */ 2183 */
1928 spin_lock_irqsave(&ioapic_lock, flags); 2184 spin_lock_irqsave(&ioapic_lock, flags);
1929 reg_00.raw = io_apic_read(apic, 0); 2185 reg_00.raw = io_apic_read(apic_id, 0);
1930 spin_unlock_irqrestore(&ioapic_lock, flags); 2186 spin_unlock_irqrestore(&ioapic_lock, flags);
1931 if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) 2187 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
1932 printk("could not set ID!\n"); 2188 printk("could not set ID!\n");
1933 else 2189 else
1934 apic_printk(APIC_VERBOSE, " ok.\n"); 2190 apic_printk(APIC_VERBOSE, " ok.\n");
@@ -2008,14 +2264,16 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
2008{ 2264{
2009 int was_pending = 0; 2265 int was_pending = 0;
2010 unsigned long flags; 2266 unsigned long flags;
2267 struct irq_cfg *cfg;
2011 2268
2012 spin_lock_irqsave(&ioapic_lock, flags); 2269 spin_lock_irqsave(&ioapic_lock, flags);
2013 if (irq < 16) { 2270 if (irq < NR_IRQS_LEGACY) {
2014 disable_8259A_irq(irq); 2271 disable_8259A_irq(irq);
2015 if (i8259A_irq_pending(irq)) 2272 if (i8259A_irq_pending(irq))
2016 was_pending = 1; 2273 was_pending = 1;
2017 } 2274 }
2018 __unmask_IO_APIC_irq(irq); 2275 cfg = irq_cfg(irq);
2276 __unmask_IO_APIC_irq(cfg);
2019 spin_unlock_irqrestore(&ioapic_lock, flags); 2277 spin_unlock_irqrestore(&ioapic_lock, flags);
2020 2278
2021 return was_pending; 2279 return was_pending;
@@ -2029,7 +2287,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
2029 unsigned long flags; 2287 unsigned long flags;
2030 2288
2031 spin_lock_irqsave(&vector_lock, flags); 2289 spin_lock_irqsave(&vector_lock, flags);
2032 send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); 2290 apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
2033 spin_unlock_irqrestore(&vector_lock, flags); 2291 spin_unlock_irqrestore(&vector_lock, flags);
2034 2292
2035 return 1; 2293 return 1;
@@ -2037,7 +2295,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
2037#else 2295#else
2038static int ioapic_retrigger_irq(unsigned int irq) 2296static int ioapic_retrigger_irq(unsigned int irq)
2039{ 2297{
2040 send_IPI_self(irq_cfg(irq)->vector); 2298 apic->send_IPI_self(irq_cfg(irq)->vector);
2041 2299
2042 return 1; 2300 return 1;
2043} 2301}
@@ -2078,35 +2336,35 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
2078 * as simple as edge triggered migration and we can do the irq migration 2336 * as simple as edge triggered migration and we can do the irq migration
2079 * with a simple atomic update to IO-APIC RTE. 2337 * with a simple atomic update to IO-APIC RTE.
2080 */ 2338 */
2081static void migrate_ioapic_irq(int irq, cpumask_t mask) 2339static void
2340migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2082{ 2341{
2083 struct irq_cfg *cfg; 2342 struct irq_cfg *cfg;
2084 struct irq_desc *desc;
2085 cpumask_t tmp, cleanup_mask;
2086 struct irte irte; 2343 struct irte irte;
2087 int modify_ioapic_rte; 2344 int modify_ioapic_rte;
2088 unsigned int dest; 2345 unsigned int dest;
2089 unsigned long flags; 2346 unsigned long flags;
2347 unsigned int irq;
2090 2348
2091 cpus_and(tmp, mask, cpu_online_map); 2349 if (!cpumask_intersects(mask, cpu_online_mask))
2092 if (cpus_empty(tmp))
2093 return; 2350 return;
2094 2351
2352 irq = desc->irq;
2095 if (get_irte(irq, &irte)) 2353 if (get_irte(irq, &irte))
2096 return; 2354 return;
2097 2355
2098 if (assign_irq_vector(irq, mask)) 2356 cfg = desc->chip_data;
2357 if (assign_irq_vector(irq, cfg, mask))
2099 return; 2358 return;
2100 2359
2101 cfg = irq_cfg(irq); 2360 set_extra_move_desc(desc, mask);
2102 cpus_and(tmp, cfg->domain, mask); 2361
2103 dest = cpu_mask_to_apicid(tmp); 2362 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2104 2363
2105 desc = irq_to_desc(irq);
2106 modify_ioapic_rte = desc->status & IRQ_LEVEL; 2364 modify_ioapic_rte = desc->status & IRQ_LEVEL;
2107 if (modify_ioapic_rte) { 2365 if (modify_ioapic_rte) {
2108 spin_lock_irqsave(&ioapic_lock, flags); 2366 spin_lock_irqsave(&ioapic_lock, flags);
2109 __target_IO_APIC_irq(irq, dest, cfg->vector); 2367 __target_IO_APIC_irq(irq, dest, cfg);
2110 spin_unlock_irqrestore(&ioapic_lock, flags); 2368 spin_unlock_irqrestore(&ioapic_lock, flags);
2111 } 2369 }
2112 2370
@@ -2118,24 +2376,20 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask)
2118 */ 2376 */
2119 modify_irte(irq, &irte); 2377 modify_irte(irq, &irte);
2120 2378
2121 if (cfg->move_in_progress) { 2379 if (cfg->move_in_progress)
2122 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); 2380 send_cleanup_vector(cfg);
2123 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
2124 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2125 cfg->move_in_progress = 0;
2126 }
2127 2381
2128 desc->affinity = mask; 2382 cpumask_copy(desc->affinity, mask);
2129} 2383}
2130 2384
2131static int migrate_irq_remapped_level(int irq) 2385static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
2132{ 2386{
2133 int ret = -1; 2387 int ret = -1;
2134 struct irq_desc *desc = irq_to_desc(irq); 2388 struct irq_cfg *cfg = desc->chip_data;
2135 2389
2136 mask_IO_APIC_irq(irq); 2390 mask_IO_APIC_irq_desc(desc);
2137 2391
2138 if (io_apic_level_ack_pending(irq)) { 2392 if (io_apic_level_ack_pending(cfg)) {
2139 /* 2393 /*
2140 * Interrupt in progress. Migrating irq now will change the 2394 * Interrupt in progress. Migrating irq now will change the
2141 * vector information in the IO-APIC RTE and that will confuse 2395 * vector information in the IO-APIC RTE and that will confuse
@@ -2147,14 +2401,15 @@ static int migrate_irq_remapped_level(int irq)
2147 } 2401 }
2148 2402
2149 /* everything is clear. we have right of way */ 2403
2150 migrate_ioapic_irq(irq, desc->pending_mask); 2404 migrate_ioapic_irq_desc(desc, desc->pending_mask);
2151 2405
2152 ret = 0; 2406 ret = 0;
2153 desc->status &= ~IRQ_MOVE_PENDING; 2407 desc->status &= ~IRQ_MOVE_PENDING;
2154 cpus_clear(desc->pending_mask); 2408 cpumask_clear(desc->pending_mask);
2155 2409
2156unmask: 2410unmask:
2157 unmask_IO_APIC_irq(irq); 2411 unmask_IO_APIC_irq_desc(desc);
2412
2158 return ret; 2413 return ret;
2159} 2414}
2160 2415
@@ -2184,28 +2439,33 @@ static void ir_irq_migration(struct work_struct *work)
2184/* 2439/*
2185 * Migrates the IRQ destination in the process context. 2440 * Migrates the IRQ destination in the process context.
2186 */ 2441 */
2187static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) 2442static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2443 const struct cpumask *mask)
2188{ 2444{
2189 struct irq_desc *desc = irq_to_desc(irq);
2190
2191 if (desc->status & IRQ_LEVEL) { 2445 if (desc->status & IRQ_LEVEL) {
2192 desc->status |= IRQ_MOVE_PENDING; 2446 desc->status |= IRQ_MOVE_PENDING;
2193 desc->pending_mask = mask; 2447 cpumask_copy(desc->pending_mask, mask);
2194 migrate_irq_remapped_level(irq); 2448 migrate_irq_remapped_level_desc(desc);
2195 return; 2449 return;
2196 } 2450 }
2197 2451
2198 migrate_ioapic_irq(irq, mask); 2452 migrate_ioapic_irq_desc(desc, mask);
2453}
2454static void set_ir_ioapic_affinity_irq(unsigned int irq,
2455 const struct cpumask *mask)
2456{
2457 struct irq_desc *desc = irq_to_desc(irq);
2458
2459 set_ir_ioapic_affinity_irq_desc(desc, mask);
2199} 2460}
2200#endif 2461#endif
2201 2462
2202asmlinkage void smp_irq_move_cleanup_interrupt(void) 2463asmlinkage void smp_irq_move_cleanup_interrupt(void)
2203{ 2464{
2204 unsigned vector, me; 2465 unsigned vector, me;
2466
2205 ack_APIC_irq(); 2467 ack_APIC_irq();
2206#ifdef CONFIG_X86_64
2207 exit_idle(); 2468 exit_idle();
2208#endif
2209 irq_enter(); 2469 irq_enter();
2210 2470
2211 me = smp_processor_id(); 2471 me = smp_processor_id();
@@ -2215,6 +2475,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2215 struct irq_cfg *cfg; 2475 struct irq_cfg *cfg;
2216 irq = __get_cpu_var(vector_irq)[vector]; 2476 irq = __get_cpu_var(vector_irq)[vector];
2217 2477
2478 if (irq == -1)
2479 continue;
2480
2218 desc = irq_to_desc(irq); 2481 desc = irq_to_desc(irq);
2219 if (!desc) 2482 if (!desc)
2220 continue; 2483 continue;
@@ -2224,7 +2487,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2224 if (!cfg->move_cleanup_count) 2487 if (!cfg->move_cleanup_count)
2225 goto unlock; 2488 goto unlock;
2226 2489
2227 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) 2490 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2228 goto unlock; 2491 goto unlock;
2229 2492
2230 __get_cpu_var(vector_irq)[vector] = -1; 2493 __get_cpu_var(vector_irq)[vector] = -1;
@@ -2236,28 +2499,45 @@ unlock:
2236 irq_exit(); 2499 irq_exit();
2237} 2500}
2238 2501
2239static void irq_complete_move(unsigned int irq) 2502static void irq_complete_move(struct irq_desc **descp)
2240{ 2503{
2241 struct irq_cfg *cfg = irq_cfg(irq); 2504 struct irq_desc *desc = *descp;
2505 struct irq_cfg *cfg = desc->chip_data;
2242 unsigned vector, me; 2506 unsigned vector, me;
2243 2507
2244 if (likely(!cfg->move_in_progress)) 2508 if (likely(!cfg->move_in_progress)) {
2509#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
2510 if (likely(!cfg->move_desc_pending))
2511 return;
2512
2513 /* domain has not changed, but affinity did */
2514 me = smp_processor_id();
2515 if (cpumask_test_cpu(me, desc->affinity)) {
2516 *descp = desc = move_irq_desc(desc, me);
2517 /* get the new one */
2518 cfg = desc->chip_data;
2519 cfg->move_desc_pending = 0;
2520 }
2521#endif
2245 return; 2522 return;
2523 }
2246 2524
2247 vector = ~get_irq_regs()->orig_ax; 2525 vector = ~get_irq_regs()->orig_ax;
2248 me = smp_processor_id(); 2526 me = smp_processor_id();
2249 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
2250 cpumask_t cleanup_mask;
2251 2527
2252 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); 2528 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) {
2253 cfg->move_cleanup_count = cpus_weight(cleanup_mask); 2529#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
2254 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); 2530 *descp = desc = move_irq_desc(desc, me);
2255 cfg->move_in_progress = 0; 2531 /* get the new one */
2532 cfg = desc->chip_data;
2533#endif
2534 send_cleanup_vector(cfg);
2256 } 2535 }
2257} 2536}
2258#else 2537#else
2259static inline void irq_complete_move(unsigned int irq) {} 2538static inline void irq_complete_move(struct irq_desc **descp) {}
2260#endif 2539#endif
2540
2261#ifdef CONFIG_INTR_REMAP 2541#ifdef CONFIG_INTR_REMAP
2262static void ack_x2apic_level(unsigned int irq) 2542static void ack_x2apic_level(unsigned int irq)
2263{ 2543{
@@ -2268,11 +2548,14 @@ static void ack_x2apic_edge(unsigned int irq)
2268{ 2548{
2269 ack_x2APIC_irq(); 2549 ack_x2APIC_irq();
2270} 2550}
2551
2271#endif 2552#endif
2272 2553
2273static void ack_apic_edge(unsigned int irq) 2554static void ack_apic_edge(unsigned int irq)
2274{ 2555{
2275 irq_complete_move(irq); 2556 struct irq_desc *desc = irq_to_desc(irq);
2557
2558 irq_complete_move(&desc);
2276 move_native_irq(irq); 2559 move_native_irq(irq);
2277 ack_APIC_irq(); 2560 ack_APIC_irq();
2278} 2561}
@@ -2281,18 +2564,21 @@ atomic_t irq_mis_count;
2281 2564
2282static void ack_apic_level(unsigned int irq) 2565static void ack_apic_level(unsigned int irq)
2283{ 2566{
2567 struct irq_desc *desc = irq_to_desc(irq);
2568
2284#ifdef CONFIG_X86_32 2569#ifdef CONFIG_X86_32
2285 unsigned long v; 2570 unsigned long v;
2286 int i; 2571 int i;
2287#endif 2572#endif
2573 struct irq_cfg *cfg;
2288 int do_unmask_irq = 0; 2574 int do_unmask_irq = 0;
2289 2575
2290 irq_complete_move(irq); 2576 irq_complete_move(&desc);
2291#ifdef CONFIG_GENERIC_PENDING_IRQ 2577#ifdef CONFIG_GENERIC_PENDING_IRQ
2292 /* If we are moving the irq we need to mask it */ 2578 /* If we are moving the irq we need to mask it */
2293 if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { 2579 if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
2294 do_unmask_irq = 1; 2580 do_unmask_irq = 1;
2295 mask_IO_APIC_irq(irq); 2581 mask_IO_APIC_irq_desc(desc);
2296 } 2582 }
2297#endif 2583#endif
2298 2584
@@ -2316,7 +2602,8 @@ static void ack_apic_level(unsigned int irq)
2316 * operation to prevent an edge-triggered interrupt escaping meanwhile. 2602 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2317 * The idea is from Manfred Spraul. --macro 2603 * The idea is from Manfred Spraul. --macro
2318 */ 2604 */
2319 i = irq_cfg(irq)->vector; 2605 cfg = desc->chip_data;
2606 i = cfg->vector;
2320 2607
2321 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); 2608 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2322#endif 2609#endif
@@ -2355,17 +2642,18 @@ static void ack_apic_level(unsigned int irq)
2355 * accurate and is causing problems then it is a hardware bug 2642 * accurate and is causing problems then it is a hardware bug
2356 * and you can go talk to the chipset vendor about it. 2643 * and you can go talk to the chipset vendor about it.
2357 */ 2644 */
2358 if (!io_apic_level_ack_pending(irq)) 2645 cfg = desc->chip_data;
2646 if (!io_apic_level_ack_pending(cfg))
2359 move_masked_irq(irq); 2647 move_masked_irq(irq);
2360 unmask_IO_APIC_irq(irq); 2648 unmask_IO_APIC_irq_desc(desc);
2361 } 2649 }
2362 2650
2363#ifdef CONFIG_X86_32 2651#ifdef CONFIG_X86_32
2364 if (!(v & (1 << (i & 0x1f)))) { 2652 if (!(v & (1 << (i & 0x1f)))) {
2365 atomic_inc(&irq_mis_count); 2653 atomic_inc(&irq_mis_count);
2366 spin_lock(&ioapic_lock); 2654 spin_lock(&ioapic_lock);
2367 __mask_and_edge_IO_APIC_irq(irq); 2655 __mask_and_edge_IO_APIC_irq(cfg);
2368 __unmask_and_level_IO_APIC_irq(irq); 2656 __unmask_and_level_IO_APIC_irq(cfg);
2369 spin_unlock(&ioapic_lock); 2657 spin_unlock(&ioapic_lock);
2370 } 2658 }
2371#endif 2659#endif
@@ -2416,20 +2704,19 @@ static inline void init_IO_APIC_traps(void)
2416 * Also, we've got to be careful not to trash gate 2704 * Also, we've got to be careful not to trash gate
2417 * 0x80, because int 0x80 is hm, kind of importantish. ;) 2705 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2418 */ 2706 */
2419 for_each_irq_cfg(irq, cfg) { 2707 for_each_irq_desc(irq, desc) {
2420 if (IO_APIC_IRQ(irq) && !cfg->vector) { 2708 cfg = desc->chip_data;
2709 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
2421 /* 2710 /*
2422 * Hmm.. We don't have an entry for this, 2711 * Hmm.. We don't have an entry for this,
2423 * so default to an old-fashioned 8259 2712 * so default to an old-fashioned 8259
2424 * interrupt if we can.. 2713 * interrupt if we can..
2425 */ 2714 */
2426 if (irq < 16) 2715 if (irq < NR_IRQS_LEGACY)
2427 make_8259A_irq(irq); 2716 make_8259A_irq(irq);
2428 else { 2717 else
2429 desc = irq_to_desc(irq);
2430 /* Strange. Oh, well.. */ 2718 /* Strange. Oh, well.. */
2431 desc->chip = &no_irq_chip; 2719 desc->chip = &no_irq_chip;
2432 }
2433 } 2720 }
2434 } 2721 }
2435} 2722}
@@ -2454,7 +2741,7 @@ static void unmask_lapic_irq(unsigned int irq)
2454 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); 2741 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2455} 2742}
2456 2743
2457static void ack_lapic_irq (unsigned int irq) 2744static void ack_lapic_irq(unsigned int irq)
2458{ 2745{
2459 ack_APIC_irq(); 2746 ack_APIC_irq();
2460} 2747}
@@ -2466,11 +2753,8 @@ static struct irq_chip lapic_chip __read_mostly = {
2466 .ack = ack_lapic_irq, 2753 .ack = ack_lapic_irq,
2467}; 2754};
2468 2755
2469static void lapic_register_intr(int irq) 2756static void lapic_register_intr(int irq, struct irq_desc *desc)
2470{ 2757{
2471 struct irq_desc *desc;
2472
2473 desc = irq_to_desc(irq);
2474 desc->status &= ~IRQ_LEVEL; 2758 desc->status &= ~IRQ_LEVEL;
2475 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, 2759 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2476 "edge"); 2760 "edge");
@@ -2574,22 +2858,20 @@ int timer_through_8259 __initdata;
2574 */ 2858 */
2575static inline void __init check_timer(void) 2859static inline void __init check_timer(void)
2576{ 2860{
2577 struct irq_cfg *cfg = irq_cfg(0); 2861 struct irq_desc *desc = irq_to_desc(0);
2862 struct irq_cfg *cfg = desc->chip_data;
2863 int cpu = boot_cpu_id;
2578 int apic1, pin1, apic2, pin2; 2864 int apic1, pin1, apic2, pin2;
2579 unsigned long flags; 2865 unsigned long flags;
2580 unsigned int ver;
2581 int no_pin1 = 0; 2866 int no_pin1 = 0;
2582 2867
2583 local_irq_save(flags); 2868 local_irq_save(flags);
2584 2869
2585 ver = apic_read(APIC_LVR);
2586 ver = GET_APIC_VERSION(ver);
2587
2588 /* 2870 /*
2589 * get/set the timer IRQ vector: 2871 * get/set the timer IRQ vector:
2590 */ 2872 */
2591 disable_8259A_irq(0); 2873 disable_8259A_irq(0);
2592 assign_irq_vector(0, TARGET_CPUS); 2874 assign_irq_vector(0, cfg, apic->target_cpus());
2593 2875
2594 /* 2876 /*
2595 * As IRQ0 is to be enabled in the 8259A, the virtual 2877 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2603,7 +2885,13 @@ static inline void __init check_timer(void)
2603 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2885 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2604 init_8259A(1); 2886 init_8259A(1);
2605#ifdef CONFIG_X86_32 2887#ifdef CONFIG_X86_32
2606 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); 2888 {
2889 unsigned int ver;
2890
2891 ver = apic_read(APIC_LVR);
2892 ver = GET_APIC_VERSION(ver);
2893 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2894 }
2607#endif 2895#endif
2608 2896
2609 pin1 = find_isa_irq_pin(0, mp_INT); 2897 pin1 = find_isa_irq_pin(0, mp_INT);
@@ -2640,10 +2928,19 @@ static inline void __init check_timer(void)
2640 * Ok, does IRQ0 through the IOAPIC work? 2928 * Ok, does IRQ0 through the IOAPIC work?
2641 */ 2929 */
2642 if (no_pin1) { 2930 if (no_pin1) {
2643 add_pin_to_irq(0, apic1, pin1); 2931 add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
2644 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2932 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
2933 } else {
2934 /* for edge trigger, setup_IO_APIC_irq already
2935 * leaves it unmasked, so we only need to
2936 * unmask it here if it is level-triggered.
2937 * do we really have a level-triggered timer?
2938 */
2939 int idx;
2940 idx = find_irq_entry(apic1, pin1, mp_INT);
2941 if (idx != -1 && irq_trigger(idx))
2942 unmask_IO_APIC_irq_desc(desc);
2645 } 2943 }
2646 unmask_IO_APIC_irq(0);
2647 if (timer_irq_works()) { 2944 if (timer_irq_works()) {
2648 if (nmi_watchdog == NMI_IO_APIC) { 2945 if (nmi_watchdog == NMI_IO_APIC) {
2649 setup_nmi(); 2946 setup_nmi();
@@ -2657,6 +2954,7 @@ static inline void __init check_timer(void)
2657 if (intr_remapping_enabled) 2954 if (intr_remapping_enabled)
2658 panic("timer doesn't work through Interrupt-remapped IO-APIC"); 2955 panic("timer doesn't work through Interrupt-remapped IO-APIC");
2659#endif 2956#endif
2957 local_irq_disable();
2660 clear_IO_APIC_pin(apic1, pin1); 2958 clear_IO_APIC_pin(apic1, pin1);
2661 if (!no_pin1) 2959 if (!no_pin1)
2662 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " 2960 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -2669,9 +2967,8 @@ static inline void __init check_timer(void)
2669 /* 2967 /*
2670 * legacy devices should be connected to IO APIC #0 2968 * legacy devices should be connected to IO APIC #0
2671 */ 2969 */
2672 replace_pin_at_irq(0, apic1, pin1, apic2, pin2); 2970 replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
2673 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 2971 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
2674 unmask_IO_APIC_irq(0);
2675 enable_8259A_irq(0); 2972 enable_8259A_irq(0);
2676 if (timer_irq_works()) { 2973 if (timer_irq_works()) {
2677 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 2974 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2686,6 +2983,7 @@ static inline void __init check_timer(void)
2686 /* 2983 /*
2687 * Cleanup, just in case ... 2984 * Cleanup, just in case ...
2688 */ 2985 */
2986 local_irq_disable();
2689 disable_8259A_irq(0); 2987 disable_8259A_irq(0);
2690 clear_IO_APIC_pin(apic2, pin2); 2988 clear_IO_APIC_pin(apic2, pin2);
2691 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); 2989 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
@@ -2703,7 +3001,7 @@ static inline void __init check_timer(void)
2703 apic_printk(APIC_QUIET, KERN_INFO 3001 apic_printk(APIC_QUIET, KERN_INFO
2704 "...trying to set up timer as Virtual Wire IRQ...\n"); 3002 "...trying to set up timer as Virtual Wire IRQ...\n");
2705 3003
2706 lapic_register_intr(0); 3004 lapic_register_intr(0, desc);
2707 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 3005 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
2708 enable_8259A_irq(0); 3006 enable_8259A_irq(0);
2709 3007
@@ -2711,6 +3009,7 @@ static inline void __init check_timer(void)
2711 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); 3009 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2712 goto out; 3010 goto out;
2713 } 3011 }
3012 local_irq_disable();
2714 disable_8259A_irq(0); 3013 disable_8259A_irq(0);
2715 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 3014 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
2716 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); 3015 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
@@ -2728,6 +3027,7 @@ static inline void __init check_timer(void)
2728 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); 3027 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2729 goto out; 3028 goto out;
2730 } 3029 }
3030 local_irq_disable();
2731 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); 3031 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
2732 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " 3032 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2733 "report. Then try booting with the 'noapic' option.\n"); 3033 "report. Then try booting with the 'noapic' option.\n");
@@ -2828,8 +3128,8 @@ static int ioapic_resume(struct sys_device *dev)
2828 3128
2829 spin_lock_irqsave(&ioapic_lock, flags); 3129 spin_lock_irqsave(&ioapic_lock, flags);
2830 reg_00.raw = io_apic_read(dev->id, 0); 3130 reg_00.raw = io_apic_read(dev->id, 0);
2831 if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { 3131 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
2832 reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; 3132 reg_00.bits.ID = mp_ioapics[dev->id].apicid;
2833 io_apic_write(dev->id, 0, reg_00.raw); 3133 io_apic_write(dev->id, 0, reg_00.raw);
2834 } 3134 }
2835 spin_unlock_irqrestore(&ioapic_lock, flags); 3135 spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2879,6 +3179,7 @@ static int __init ioapic_init_sysfs(void)
2879 3179
2880device_initcall(ioapic_init_sysfs); 3180device_initcall(ioapic_init_sysfs);
2881 3181
3182static int nr_irqs_gsi = NR_IRQS_LEGACY;
2882/* 3183/*
2883 * Dynamic irq allocate and deallocation 3184 * Dynamic irq allocate and deallocation
2884 */ 3185 */
@@ -2888,22 +3189,26 @@ unsigned int create_irq_nr(unsigned int irq_want)
2888 unsigned int irq; 3189 unsigned int irq;
2889 unsigned int new; 3190 unsigned int new;
2890 unsigned long flags; 3191 unsigned long flags;
2891 struct irq_cfg *cfg_new; 3192 struct irq_cfg *cfg_new = NULL;
2892 3193 int cpu = boot_cpu_id;
2893 irq_want = nr_irqs - 1; 3194 struct irq_desc *desc_new = NULL;
2894 3195
2895 irq = 0; 3196 irq = 0;
3197 if (irq_want < nr_irqs_gsi)
3198 irq_want = nr_irqs_gsi;
3199
2896 spin_lock_irqsave(&vector_lock, flags); 3200 spin_lock_irqsave(&vector_lock, flags);
2897 for (new = irq_want; new > 0; new--) { 3201 for (new = irq_want; new < nr_irqs; new++) {
2898 if (platform_legacy_irq(new)) 3202 desc_new = irq_to_desc_alloc_cpu(new, cpu);
3203 if (!desc_new) {
3204 printk(KERN_INFO "can not get irq_desc for %d\n", new);
2899 continue; 3205 continue;
2900 cfg_new = irq_cfg(new); 3206 }
2901 if (cfg_new && cfg_new->vector != 0) 3207 cfg_new = desc_new->chip_data;
3208
3209 if (cfg_new->vector != 0)
2902 continue; 3210 continue;
2903 /* check if need to create one */ 3211 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
2904 if (!cfg_new)
2905 cfg_new = irq_cfg_alloc(new);
2906 if (__assign_irq_vector(new, TARGET_CPUS) == 0)
2907 irq = new; 3212 irq = new;
2908 break; 3213 break;
2909 } 3214 }
@@ -2911,15 +3216,20 @@ unsigned int create_irq_nr(unsigned int irq_want)
2911 3216
2912 if (irq > 0) { 3217 if (irq > 0) {
2913 dynamic_irq_init(irq); 3218 dynamic_irq_init(irq);
 3219 /* restore it, in case dynamic_irq_init clears it */
3220 if (desc_new)
3221 desc_new->chip_data = cfg_new;
2914 } 3222 }
2915 return irq; 3223 return irq;
2916} 3224}
2917 3225
2918int create_irq(void) 3226int create_irq(void)
2919{ 3227{
3228 unsigned int irq_want;
2920 int irq; 3229 int irq;
2921 3230
2922 irq = create_irq_nr(nr_irqs - 1); 3231 irq_want = nr_irqs_gsi;
3232 irq = create_irq_nr(irq_want);
2923 3233
2924 if (irq == 0) 3234 if (irq == 0)
2925 irq = -1; 3235 irq = -1;
@@ -2930,14 +3240,22 @@ int create_irq(void)
2930void destroy_irq(unsigned int irq) 3240void destroy_irq(unsigned int irq)
2931{ 3241{
2932 unsigned long flags; 3242 unsigned long flags;
3243 struct irq_cfg *cfg;
3244 struct irq_desc *desc;
2933 3245
 3246 /* store it, in case dynamic_irq_cleanup clears it */
3247 desc = irq_to_desc(irq);
3248 cfg = desc->chip_data;
2934 dynamic_irq_cleanup(irq); 3249 dynamic_irq_cleanup(irq);
3250 /* connect back irq_cfg */
3251 if (desc)
3252 desc->chip_data = cfg;
2935 3253
2936#ifdef CONFIG_INTR_REMAP 3254#ifdef CONFIG_INTR_REMAP
2937 free_irte(irq); 3255 free_irte(irq);
2938#endif 3256#endif
2939 spin_lock_irqsave(&vector_lock, flags); 3257 spin_lock_irqsave(&vector_lock, flags);
2940 __clear_irq_vector(irq); 3258 __clear_irq_vector(irq, cfg);
2941 spin_unlock_irqrestore(&vector_lock, flags); 3259 spin_unlock_irqrestore(&vector_lock, flags);
2942} 3260}
2943 3261
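The create_irq_nr()/destroy_irq() hunks above share one idiom: with sparse irq_desc the per-IRQ irq_cfg now lives in desc->chip_data, and the generic dynamic_irq_init()/dynamic_irq_cleanup() helpers wipe chip_data, so the callers save and restore it around the call. A minimal sketch of that idiom, using only helpers visible in the hunks; the function name sketch_teardown_dynamic_irq is illustrative, not kernel code:

/*
 * Sketch only: preserve the per-IRQ chip_data (struct irq_cfg) across
 * the generic dynamic-IRQ helpers, which clear desc->chip_data.
 * Mirrors the pattern in destroy_irq() above; locking omitted.
 */
static void sketch_teardown_dynamic_irq(unsigned int irq)
{
	struct irq_desc *desc = irq_to_desc(irq);
	struct irq_cfg *cfg = desc ? desc->chip_data : NULL;

	dynamic_irq_cleanup(irq);	/* clears desc->chip_data ... */

	if (desc)
		desc->chip_data = cfg;	/* ... so hook the irq_cfg back up */
}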
@@ -2950,16 +3268,16 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
2950 struct irq_cfg *cfg; 3268 struct irq_cfg *cfg;
2951 int err; 3269 int err;
2952 unsigned dest; 3270 unsigned dest;
2953 cpumask_t tmp;
2954 3271
2955 tmp = TARGET_CPUS; 3272 if (disable_apic)
2956 err = assign_irq_vector(irq, tmp); 3273 return -ENXIO;
3274
3275 cfg = irq_cfg(irq);
3276 err = assign_irq_vector(irq, cfg, apic->target_cpus());
2957 if (err) 3277 if (err)
2958 return err; 3278 return err;
2959 3279
2960 cfg = irq_cfg(irq); 3280 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
2961 cpus_and(tmp, cfg->domain, tmp);
2962 dest = cpu_mask_to_apicid(tmp);
2963 3281
2964#ifdef CONFIG_INTR_REMAP 3282#ifdef CONFIG_INTR_REMAP
2965 if (irq_remapped(irq)) { 3283 if (irq_remapped(irq)) {
@@ -2973,9 +3291,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
2973 memset (&irte, 0, sizeof(irte)); 3291 memset (&irte, 0, sizeof(irte));
2974 3292
2975 irte.present = 1; 3293 irte.present = 1;
2976 irte.dst_mode = INT_DEST_MODE; 3294 irte.dst_mode = apic->irq_dest_mode;
2977 irte.trigger_mode = 0; /* edge */ 3295 irte.trigger_mode = 0; /* edge */
2978 irte.dlvry_mode = INT_DELIVERY_MODE; 3296 irte.dlvry_mode = apic->irq_delivery_mode;
2979 irte.vector = cfg->vector; 3297 irte.vector = cfg->vector;
2980 irte.dest_id = IRTE_DEST(dest); 3298 irte.dest_id = IRTE_DEST(dest);
2981 3299
@@ -2993,10 +3311,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
2993 msg->address_hi = MSI_ADDR_BASE_HI; 3311 msg->address_hi = MSI_ADDR_BASE_HI;
2994 msg->address_lo = 3312 msg->address_lo =
2995 MSI_ADDR_BASE_LO | 3313 MSI_ADDR_BASE_LO |
2996 ((INT_DEST_MODE == 0) ? 3314 ((apic->irq_dest_mode == 0) ?
2997 MSI_ADDR_DEST_MODE_PHYSICAL: 3315 MSI_ADDR_DEST_MODE_PHYSICAL:
2998 MSI_ADDR_DEST_MODE_LOGICAL) | 3316 MSI_ADDR_DEST_MODE_LOGICAL) |
2999 ((INT_DELIVERY_MODE != dest_LowestPrio) ? 3317 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3000 MSI_ADDR_REDIRECTION_CPU: 3318 MSI_ADDR_REDIRECTION_CPU:
3001 MSI_ADDR_REDIRECTION_LOWPRI) | 3319 MSI_ADDR_REDIRECTION_LOWPRI) |
3002 MSI_ADDR_DEST_ID(dest); 3320 MSI_ADDR_DEST_ID(dest);
@@ -3004,7 +3322,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3004 msg->data = 3322 msg->data =
3005 MSI_DATA_TRIGGER_EDGE | 3323 MSI_DATA_TRIGGER_EDGE |
3006 MSI_DATA_LEVEL_ASSERT | 3324 MSI_DATA_LEVEL_ASSERT |
3007 ((INT_DELIVERY_MODE != dest_LowestPrio) ? 3325 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3008 MSI_DATA_DELIVERY_FIXED: 3326 MSI_DATA_DELIVERY_FIXED:
3009 MSI_DATA_DELIVERY_LOWPRI) | 3327 MSI_DATA_DELIVERY_LOWPRI) |
3010 MSI_DATA_VECTOR(cfg->vector); 3328 MSI_DATA_VECTOR(cfg->vector);
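The message built above encodes the destination and the delivery policy in address_lo and the vector in data. A condensed sketch of just that composition, reusing the MSI_* macros and the apic->irq_dest_mode / apic->irq_delivery_mode fields that appear in the hunk; the helper name and the omission of the interrupt-remapping branch are simplifications:

/* Sketch: fill in an MSI message for a given destination/vector pair. */
static void sketch_compose_msi(struct msi_msg *msg, unsigned int dest,
			       unsigned int vector)
{
	msg->address_hi = MSI_ADDR_BASE_HI;

	/* base address, destination APIC ID, physical vs. logical mode */
	msg->address_lo = MSI_ADDR_BASE_LO |
		((apic->irq_dest_mode == 0) ?
			MSI_ADDR_DEST_MODE_PHYSICAL :
			MSI_ADDR_DEST_MODE_LOGICAL) |
		((apic->irq_delivery_mode != dest_LowestPrio) ?
			MSI_ADDR_REDIRECTION_CPU :
			MSI_ADDR_REDIRECTION_LOWPRI) |
		MSI_ADDR_DEST_ID(dest);

	/* edge-triggered, fixed (or lowest-priority) delivery of 'vector' */
	msg->data = MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT |
		((apic->irq_delivery_mode != dest_LowestPrio) ?
			MSI_DATA_DELIVERY_FIXED :
			MSI_DATA_DELIVERY_LOWPRI) |
		MSI_DATA_VECTOR(vector);
}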
@@ -3013,64 +3331,48 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3013} 3331}
3014 3332
3015#ifdef CONFIG_SMP 3333#ifdef CONFIG_SMP
3016static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) 3334static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3017{ 3335{
3336 struct irq_desc *desc = irq_to_desc(irq);
3018 struct irq_cfg *cfg; 3337 struct irq_cfg *cfg;
3019 struct msi_msg msg; 3338 struct msi_msg msg;
3020 unsigned int dest; 3339 unsigned int dest;
3021 cpumask_t tmp;
3022 struct irq_desc *desc;
3023
3024 cpus_and(tmp, mask, cpu_online_map);
3025 if (cpus_empty(tmp))
3026 return;
3027 3340
3028 if (assign_irq_vector(irq, mask)) 3341 dest = set_desc_affinity(desc, mask);
3342 if (dest == BAD_APICID)
3029 return; 3343 return;
3030 3344
3031 cfg = irq_cfg(irq); 3345 cfg = desc->chip_data;
3032 cpus_and(tmp, cfg->domain, mask);
3033 dest = cpu_mask_to_apicid(tmp);
3034 3346
3035 read_msi_msg(irq, &msg); 3347 read_msi_msg_desc(desc, &msg);
3036 3348
3037 msg.data &= ~MSI_DATA_VECTOR_MASK; 3349 msg.data &= ~MSI_DATA_VECTOR_MASK;
3038 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3350 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3039 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3351 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3040 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3352 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3041 3353
3042 write_msi_msg(irq, &msg); 3354 write_msi_msg_desc(desc, &msg);
3043 desc = irq_to_desc(irq);
3044 desc->affinity = mask;
3045} 3355}
3046
3047#ifdef CONFIG_INTR_REMAP 3356#ifdef CONFIG_INTR_REMAP
3048/* 3357/*
3049 * Migrate the MSI irq to another cpumask. This migration is 3358 * Migrate the MSI irq to another cpumask. This migration is
3050 * done in the process context using interrupt-remapping hardware. 3359 * done in the process context using interrupt-remapping hardware.
3051 */ 3360 */
3052static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) 3361static void
3362ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3053{ 3363{
3054 struct irq_cfg *cfg; 3364 struct irq_desc *desc = irq_to_desc(irq);
3365 struct irq_cfg *cfg = desc->chip_data;
3055 unsigned int dest; 3366 unsigned int dest;
3056 cpumask_t tmp, cleanup_mask;
3057 struct irte irte; 3367 struct irte irte;
3058 struct irq_desc *desc;
3059
3060 cpus_and(tmp, mask, cpu_online_map);
3061 if (cpus_empty(tmp))
3062 return;
3063 3368
3064 if (get_irte(irq, &irte)) 3369 if (get_irte(irq, &irte))
3065 return; 3370 return;
3066 3371
3067 if (assign_irq_vector(irq, mask)) 3372 dest = set_desc_affinity(desc, mask);
3373 if (dest == BAD_APICID)
3068 return; 3374 return;
3069 3375
3070 cfg = irq_cfg(irq);
3071 cpus_and(tmp, cfg->domain, mask);
3072 dest = cpu_mask_to_apicid(tmp);
3073
3074 irte.vector = cfg->vector; 3376 irte.vector = cfg->vector;
3075 irte.dest_id = IRTE_DEST(dest); 3377 irte.dest_id = IRTE_DEST(dest);
3076 3378
@@ -3084,16 +3386,10 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
3084 * at the new destination. So, time to cleanup the previous 3386 * at the new destination. So, time to cleanup the previous
3085 * vector allocation. 3387 * vector allocation.
3086 */ 3388 */
3087 if (cfg->move_in_progress) { 3389 if (cfg->move_in_progress)
3088 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); 3390 send_cleanup_vector(cfg);
3089 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
3090 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
3091 cfg->move_in_progress = 0;
3092 }
3093
3094 desc = irq_to_desc(irq);
3095 desc->affinity = mask;
3096} 3391}
3392
3097#endif 3393#endif
3098#endif /* CONFIG_SMP */ 3394#endif /* CONFIG_SMP */
3099 3395
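set_msi_irq_affinity() above, and the dmar/hpet/HT variants further down, now converge on one shape: set_desc_affinity() validates the mask, updates desc->affinity and returns the target APIC ID (or BAD_APICID), the irq_cfg is read from desc->chip_data, and only the device-specific reprogramming differs. A sketch of that shared skeleton; the reprogram_fn callback is an illustrative stand-in for read/write_msi_msg_desc(), dmar_msi_write(), target_ht_irq() and friends:

/* Sketch: the common shape of the ->set_affinity handlers in this file. */
static void sketch_set_affinity(unsigned int irq, const struct cpumask *mask,
				void (*reprogram_fn)(struct irq_desc *desc,
						     struct irq_cfg *cfg,
						     unsigned int dest))
{
	struct irq_desc *desc = irq_to_desc(irq);
	struct irq_cfg *cfg;
	unsigned int dest;

	/* reassigns the vector, records the affinity, picks the APIC ID */
	dest = set_desc_affinity(desc, mask);
	if (dest == BAD_APICID)
		return;

	cfg = desc->chip_data;

	/* device-specific part: rewrite the MSI message, HT message, IRTE, ... */
	reprogram_fn(desc, cfg, dest);
}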
@@ -3152,7 +3448,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3152} 3448}
3153#endif 3449#endif
3154 3450
3155static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) 3451static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3156{ 3452{
3157 int ret; 3453 int ret;
3158 struct msi_msg msg; 3454 struct msi_msg msg;
@@ -3161,7 +3457,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
3161 if (ret < 0) 3457 if (ret < 0)
3162 return ret; 3458 return ret;
3163 3459
3164 set_irq_msi(irq, desc); 3460 set_irq_msi(irq, msidesc);
3165 write_msi_msg(irq, &msg); 3461 write_msi_msg(irq, &msg);
3166 3462
3167#ifdef CONFIG_INTR_REMAP 3463#ifdef CONFIG_INTR_REMAP
@@ -3181,58 +3477,11 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
3181 return 0; 3477 return 0;
3182} 3478}
3183 3479
3184static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
3185{
3186 unsigned int irq;
3187
3188 irq = dev->bus->number;
3189 irq <<= 8;
3190 irq |= dev->devfn;
3191 irq <<= 12;
3192
3193 return irq;
3194}
3195
3196int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
3197{
3198 unsigned int irq;
3199 int ret;
3200 unsigned int irq_want;
3201
3202 irq_want = build_irq_for_pci_dev(dev) + 0x100;
3203
3204 irq = create_irq_nr(irq_want);
3205 if (irq == 0)
3206 return -1;
3207
3208#ifdef CONFIG_INTR_REMAP
3209 if (!intr_remapping_enabled)
3210 goto no_ir;
3211
3212 ret = msi_alloc_irte(dev, irq, 1);
3213 if (ret < 0)
3214 goto error;
3215no_ir:
3216#endif
3217 ret = setup_msi_irq(dev, desc, irq);
3218 if (ret < 0) {
3219 destroy_irq(irq);
3220 return ret;
3221 }
3222 return 0;
3223
3224#ifdef CONFIG_INTR_REMAP
3225error:
3226 destroy_irq(irq);
3227 return ret;
3228#endif
3229}
3230
3231int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 3480int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3232{ 3481{
3233 unsigned int irq; 3482 unsigned int irq;
3234 int ret, sub_handle; 3483 int ret, sub_handle;
3235 struct msi_desc *desc; 3484 struct msi_desc *msidesc;
3236 unsigned int irq_want; 3485 unsigned int irq_want;
3237 3486
3238#ifdef CONFIG_INTR_REMAP 3487#ifdef CONFIG_INTR_REMAP
@@ -3240,12 +3489,13 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3240 int index = 0; 3489 int index = 0;
3241#endif 3490#endif
3242 3491
3243 irq_want = build_irq_for_pci_dev(dev) + 0x100; 3492 irq_want = nr_irqs_gsi;
3244 sub_handle = 0; 3493 sub_handle = 0;
3245 list_for_each_entry(desc, &dev->msi_list, list) { 3494 list_for_each_entry(msidesc, &dev->msi_list, list) {
3246 irq = create_irq_nr(irq_want--); 3495 irq = create_irq_nr(irq_want);
3247 if (irq == 0) 3496 if (irq == 0)
3248 return -1; 3497 return -1;
3498 irq_want = irq + 1;
3249#ifdef CONFIG_INTR_REMAP 3499#ifdef CONFIG_INTR_REMAP
3250 if (!intr_remapping_enabled) 3500 if (!intr_remapping_enabled)
3251 goto no_ir; 3501 goto no_ir;
@@ -3275,7 +3525,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3275 } 3525 }
3276no_ir: 3526no_ir:
3277#endif 3527#endif
3278 ret = setup_msi_irq(dev, desc, irq); 3528 ret = setup_msi_irq(dev, msidesc, irq);
3279 if (ret < 0) 3529 if (ret < 0)
3280 goto error; 3530 goto error;
3281 sub_handle++; 3531 sub_handle++;
@@ -3294,24 +3544,18 @@ void arch_teardown_msi_irq(unsigned int irq)
3294 3544
3295#ifdef CONFIG_DMAR 3545#ifdef CONFIG_DMAR
3296#ifdef CONFIG_SMP 3546#ifdef CONFIG_SMP
3297static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) 3547static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3298{ 3548{
3549 struct irq_desc *desc = irq_to_desc(irq);
3299 struct irq_cfg *cfg; 3550 struct irq_cfg *cfg;
3300 struct msi_msg msg; 3551 struct msi_msg msg;
3301 unsigned int dest; 3552 unsigned int dest;
3302 cpumask_t tmp;
3303 struct irq_desc *desc;
3304 3553
3305 cpus_and(tmp, mask, cpu_online_map); 3554 dest = set_desc_affinity(desc, mask);
3306 if (cpus_empty(tmp)) 3555 if (dest == BAD_APICID)
3307 return; 3556 return;
3308 3557
3309 if (assign_irq_vector(irq, mask)) 3558 cfg = desc->chip_data;
3310 return;
3311
3312 cfg = irq_cfg(irq);
3313 cpus_and(tmp, cfg->domain, mask);
3314 dest = cpu_mask_to_apicid(tmp);
3315 3559
3316 dmar_msi_read(irq, &msg); 3560 dmar_msi_read(irq, &msg);
3317 3561
@@ -3321,9 +3565,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
3321 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3565 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3322 3566
3323 dmar_msi_write(irq, &msg); 3567 dmar_msi_write(irq, &msg);
3324 desc = irq_to_desc(irq);
3325 desc->affinity = mask;
3326} 3568}
3569
3327#endif /* CONFIG_SMP */ 3570#endif /* CONFIG_SMP */
3328 3571
3329struct irq_chip dmar_msi_type = { 3572struct irq_chip dmar_msi_type = {
@@ -3355,24 +3598,18 @@ int arch_setup_dmar_msi(unsigned int irq)
3355#ifdef CONFIG_HPET_TIMER 3598#ifdef CONFIG_HPET_TIMER
3356 3599
3357#ifdef CONFIG_SMP 3600#ifdef CONFIG_SMP
3358static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) 3601static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3359{ 3602{
3603 struct irq_desc *desc = irq_to_desc(irq);
3360 struct irq_cfg *cfg; 3604 struct irq_cfg *cfg;
3361 struct irq_desc *desc;
3362 struct msi_msg msg; 3605 struct msi_msg msg;
3363 unsigned int dest; 3606 unsigned int dest;
3364 cpumask_t tmp;
3365 3607
3366 cpus_and(tmp, mask, cpu_online_map); 3608 dest = set_desc_affinity(desc, mask);
3367 if (cpus_empty(tmp)) 3609 if (dest == BAD_APICID)
3368 return; 3610 return;
3369 3611
3370 if (assign_irq_vector(irq, mask)) 3612 cfg = desc->chip_data;
3371 return;
3372
3373 cfg = irq_cfg(irq);
3374 cpus_and(tmp, cfg->domain, mask);
3375 dest = cpu_mask_to_apicid(tmp);
3376 3613
3377 hpet_msi_read(irq, &msg); 3614 hpet_msi_read(irq, &msg);
3378 3615
@@ -3382,9 +3619,8 @@ static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
3382 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3619 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3383 3620
3384 hpet_msi_write(irq, &msg); 3621 hpet_msi_write(irq, &msg);
3385 desc = irq_to_desc(irq);
3386 desc->affinity = mask;
3387} 3622}
3623
3388#endif /* CONFIG_SMP */ 3624#endif /* CONFIG_SMP */
3389 3625
3390struct irq_chip hpet_msi_type = { 3626struct irq_chip hpet_msi_type = {
@@ -3437,28 +3673,21 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3437 write_ht_irq_msg(irq, &msg); 3673 write_ht_irq_msg(irq, &msg);
3438} 3674}
3439 3675
3440static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) 3676static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3441{ 3677{
3678 struct irq_desc *desc = irq_to_desc(irq);
3442 struct irq_cfg *cfg; 3679 struct irq_cfg *cfg;
3443 unsigned int dest; 3680 unsigned int dest;
3444 cpumask_t tmp;
3445 struct irq_desc *desc;
3446 3681
3447 cpus_and(tmp, mask, cpu_online_map); 3682 dest = set_desc_affinity(desc, mask);
3448 if (cpus_empty(tmp)) 3683 if (dest == BAD_APICID)
3449 return; 3684 return;
3450 3685
3451 if (assign_irq_vector(irq, mask)) 3686 cfg = desc->chip_data;
3452 return;
3453
3454 cfg = irq_cfg(irq);
3455 cpus_and(tmp, cfg->domain, mask);
3456 dest = cpu_mask_to_apicid(tmp);
3457 3687
3458 target_ht_irq(irq, dest, cfg->vector); 3688 target_ht_irq(irq, dest, cfg->vector);
3459 desc = irq_to_desc(irq);
3460 desc->affinity = mask;
3461} 3689}
3690
3462#endif 3691#endif
3463 3692
3464static struct irq_chip ht_irq_chip = { 3693static struct irq_chip ht_irq_chip = {
@@ -3476,17 +3705,18 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3476{ 3705{
3477 struct irq_cfg *cfg; 3706 struct irq_cfg *cfg;
3478 int err; 3707 int err;
3479 cpumask_t tmp;
3480 3708
3481 tmp = TARGET_CPUS; 3709 if (disable_apic)
3482 err = assign_irq_vector(irq, tmp); 3710 return -ENXIO;
3711
3712 cfg = irq_cfg(irq);
3713 err = assign_irq_vector(irq, cfg, apic->target_cpus());
3483 if (!err) { 3714 if (!err) {
3484 struct ht_irq_msg msg; 3715 struct ht_irq_msg msg;
3485 unsigned dest; 3716 unsigned dest;
3486 3717
3487 cfg = irq_cfg(irq); 3718 dest = apic->cpu_mask_to_apicid_and(cfg->domain,
3488 cpus_and(tmp, cfg->domain, tmp); 3719 apic->target_cpus());
3489 dest = cpu_mask_to_apicid(tmp);
3490 3720
3491 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); 3721 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
3492 3722
@@ -3494,11 +3724,11 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3494 HT_IRQ_LOW_BASE | 3724 HT_IRQ_LOW_BASE |
3495 HT_IRQ_LOW_DEST_ID(dest) | 3725 HT_IRQ_LOW_DEST_ID(dest) |
3496 HT_IRQ_LOW_VECTOR(cfg->vector) | 3726 HT_IRQ_LOW_VECTOR(cfg->vector) |
3497 ((INT_DEST_MODE == 0) ? 3727 ((apic->irq_dest_mode == 0) ?
3498 HT_IRQ_LOW_DM_PHYSICAL : 3728 HT_IRQ_LOW_DM_PHYSICAL :
3499 HT_IRQ_LOW_DM_LOGICAL) | 3729 HT_IRQ_LOW_DM_LOGICAL) |
3500 HT_IRQ_LOW_RQEOI_EDGE | 3730 HT_IRQ_LOW_RQEOI_EDGE |
3501 ((INT_DELIVERY_MODE != dest_LowestPrio) ? 3731 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3502 HT_IRQ_LOW_MT_FIXED : 3732 HT_IRQ_LOW_MT_FIXED :
3503 HT_IRQ_LOW_MT_ARBITRATED) | 3733 HT_IRQ_LOW_MT_ARBITRATED) |
3504 HT_IRQ_LOW_IRQ_MASKED; 3734 HT_IRQ_LOW_IRQ_MASKED;
@@ -3514,7 +3744,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3514} 3744}
3515#endif /* CONFIG_HT_IRQ */ 3745#endif /* CONFIG_HT_IRQ */
3516 3746
3517#ifdef CONFIG_X86_64 3747#ifdef CONFIG_X86_UV
3518/* 3748/*
3519 * Re-target the irq to the specified CPU and enable the specified MMR located 3749 * Re-target the irq to the specified CPU and enable the specified MMR located
3520 * on the specified blade to allow the sending of MSIs to the specified CPU. 3750 * on the specified blade to allow the sending of MSIs to the specified CPU.
@@ -3522,7 +3752,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3522int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, 3752int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3523 unsigned long mmr_offset) 3753 unsigned long mmr_offset)
3524{ 3754{
3525 const cpumask_t *eligible_cpu = get_cpu_mask(cpu); 3755 const struct cpumask *eligible_cpu = cpumask_of(cpu);
3526 struct irq_cfg *cfg; 3756 struct irq_cfg *cfg;
3527 int mmr_pnode; 3757 int mmr_pnode;
3528 unsigned long mmr_value; 3758 unsigned long mmr_value;
@@ -3530,7 +3760,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3530 unsigned long flags; 3760 unsigned long flags;
3531 int err; 3761 int err;
3532 3762
3533 err = assign_irq_vector(irq, *eligible_cpu); 3763 cfg = irq_cfg(irq);
3764
3765 err = assign_irq_vector(irq, cfg, eligible_cpu);
3534 if (err != 0) 3766 if (err != 0)
3535 return err; 3767 return err;
3536 3768
@@ -3539,19 +3771,17 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3539 irq_name); 3771 irq_name);
3540 spin_unlock_irqrestore(&vector_lock, flags); 3772 spin_unlock_irqrestore(&vector_lock, flags);
3541 3773
3542 cfg = irq_cfg(irq);
3543
3544 mmr_value = 0; 3774 mmr_value = 0;
3545 entry = (struct uv_IO_APIC_route_entry *)&mmr_value; 3775 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3546 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); 3776 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3547 3777
3548 entry->vector = cfg->vector; 3778 entry->vector = cfg->vector;
3549 entry->delivery_mode = INT_DELIVERY_MODE; 3779 entry->delivery_mode = apic->irq_delivery_mode;
3550 entry->dest_mode = INT_DEST_MODE; 3780 entry->dest_mode = apic->irq_dest_mode;
3551 entry->polarity = 0; 3781 entry->polarity = 0;
3552 entry->trigger = 0; 3782 entry->trigger = 0;
3553 entry->mask = 0; 3783 entry->mask = 0;
3554 entry->dest = cpu_mask_to_apicid(*eligible_cpu); 3784 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3555 3785
3556 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3786 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3557 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 3787 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3592,31 +3822,50 @@ int __init io_apic_get_redir_entries (int ioapic)
3592 return reg_01.bits.entries; 3822 return reg_01.bits.entries;
3593} 3823}
3594 3824
3595int __init probe_nr_irqs(void) 3825void __init probe_nr_irqs_gsi(void)
3596{ 3826{
3597 int idx;
3598 int nr = 0; 3827 int nr = 0;
3599#ifndef CONFIG_XEN
3600 int nr_min = 32;
3601#else
3602 int nr_min = NR_IRQS;
3603#endif
3604 3828
3605 for (idx = 0; idx < nr_ioapics; idx++) 3829 nr = acpi_probe_gsi();
3606 nr += io_apic_get_redir_entries(idx) + 1; 3830 if (nr > nr_irqs_gsi) {
3831 nr_irqs_gsi = nr;
3832 } else {
3833 /* for acpi=off or acpi is not compiled in */
3834 int idx;
3607 3835
3608 /* double it for hotplug and msi and nmi */ 3836 nr = 0;
3609 nr <<= 1; 3837 for (idx = 0; idx < nr_ioapics; idx++)
3838 nr += io_apic_get_redir_entries(idx) + 1;
3610 3839
3611 /* something wrong ? */ 3840 if (nr > nr_irqs_gsi)
3612 if (nr < nr_min) 3841 nr_irqs_gsi = nr;
3613 nr = nr_min; 3842 }
3614 if (WARN_ON(nr > NR_IRQS))
3615 nr = NR_IRQS;
3616 3843
3617 return nr; 3844 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
3618} 3845}
3619 3846
3847#ifdef CONFIG_SPARSE_IRQ
3848int __init arch_probe_nr_irqs(void)
3849{
3850 int nr;
3851
3852 if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
3853 nr_irqs = NR_VECTORS * nr_cpu_ids;
3854
3855 nr = nr_irqs_gsi + 8 * nr_cpu_ids;
3856#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
3857 /*
3858 * for MSI and HT dyn irq
3859 */
3860 nr += nr_irqs_gsi * 16;
3861#endif
3862 if (nr < nr_irqs)
3863 nr_irqs = nr;
3864
3865 return 0;
3866}
3867#endif
3868
3620/* -------------------------------------------------------------------------- 3869/* --------------------------------------------------------------------------
3621 ACPI-based IOAPIC Configuration 3870 ACPI-based IOAPIC Configuration
3622 -------------------------------------------------------------------------- */ 3871 -------------------------------------------------------------------------- */
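The sizing in arch_probe_nr_irqs() is easiest to see with concrete numbers. A standalone illustration with made-up inputs (nr_cpu_ids = 8, nr_irqs_gsi = 48, NR_VECTORS = 256, and an invented starting nr_irqs), performing the same cap and estimate as the function above:

#include <stdio.h>

int main(void)
{
	/* illustrative values only */
	int nr_cpu_ids = 8, nr_irqs_gsi = 48;
	int nr_vectors = 256;			/* NR_VECTORS */
	int nr_irqs = 4096;			/* pretend initial value */
	int nr;

	/* hard upper bound: one full vector space per possible CPU */
	if (nr_irqs > nr_vectors * nr_cpu_ids)
		nr_irqs = nr_vectors * nr_cpu_ids;	/* 2048 here */

	/* GSIs plus a per-CPU allowance, plus room for MSI/HT dynamic IRQs */
	nr = nr_irqs_gsi + 8 * nr_cpu_ids;		/* 48 + 64  = 112 */
	nr += nr_irqs_gsi * 16;				/* + 768    = 880 */

	if (nr < nr_irqs)
		nr_irqs = nr;

	printf("nr_irqs = %d\n", nr_irqs);		/* prints 880 */
	return 0;
}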
@@ -3642,7 +3891,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3642 */ 3891 */
3643 3892
3644 if (physids_empty(apic_id_map)) 3893 if (physids_empty(apic_id_map))
3645 apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); 3894 apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
3646 3895
3647 spin_lock_irqsave(&ioapic_lock, flags); 3896 spin_lock_irqsave(&ioapic_lock, flags);
3648 reg_00.raw = io_apic_read(ioapic, 0); 3897 reg_00.raw = io_apic_read(ioapic, 0);
@@ -3658,10 +3907,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3658 * Every APIC in a system must have a unique ID or we get lots of nice 3907 * Every APIC in a system must have a unique ID or we get lots of nice
3659 * 'stuck on smp_invalidate_needed IPI wait' messages. 3908 * 'stuck on smp_invalidate_needed IPI wait' messages.
3660 */ 3909 */
3661 if (check_apicid_used(apic_id_map, apic_id)) { 3910 if (apic->check_apicid_used(apic_id_map, apic_id)) {
3662 3911
3663 for (i = 0; i < get_physical_broadcast(); i++) { 3912 for (i = 0; i < get_physical_broadcast(); i++) {
3664 if (!check_apicid_used(apic_id_map, i)) 3913 if (!apic->check_apicid_used(apic_id_map, i))
3665 break; 3914 break;
3666 } 3915 }
3667 3916
@@ -3674,7 +3923,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3674 apic_id = i; 3923 apic_id = i;
3675 } 3924 }
3676 3925
3677 tmp = apicid_to_cpu_present(apic_id); 3926 tmp = apic->apicid_to_cpu_present(apic_id);
3678 physids_or(apic_id_map, apic_id_map, tmp); 3927 physids_or(apic_id_map, apic_id_map, tmp);
3679 3928
3680 if (reg_00.bits.ID != apic_id) { 3929 if (reg_00.bits.ID != apic_id) {
@@ -3713,19 +3962,31 @@ int __init io_apic_get_version(int ioapic)
3713 3962
3714int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) 3963int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
3715{ 3964{
3965 struct irq_desc *desc;
3966 struct irq_cfg *cfg;
3967 int cpu = boot_cpu_id;
3968
3716 if (!IO_APIC_IRQ(irq)) { 3969 if (!IO_APIC_IRQ(irq)) {
3717 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", 3970 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
3718 ioapic); 3971 ioapic);
3719 return -EINVAL; 3972 return -EINVAL;
3720 } 3973 }
3721 3974
3975 desc = irq_to_desc_alloc_cpu(irq, cpu);
3976 if (!desc) {
3977 printk(KERN_INFO "can not get irq_desc %d\n", irq);
3978 return 0;
3979 }
3980
3722 /* 3981 /*
3723 * IRQs < 16 are already in the irq_2_pin[] map 3982 * IRQs < 16 are already in the irq_2_pin[] map
3724 */ 3983 */
3725 if (irq >= 16) 3984 if (irq >= NR_IRQS_LEGACY) {
3726 add_pin_to_irq(irq, ioapic, pin); 3985 cfg = desc->chip_data;
3986 add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
3987 }
3727 3988
3728 setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); 3989 setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
3729 3990
3730 return 0; 3991 return 0;
3731} 3992}
@@ -3739,8 +4000,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
3739 return -1; 4000 return -1;
3740 4001
3741 for (i = 0; i < mp_irq_entries; i++) 4002 for (i = 0; i < mp_irq_entries; i++)
3742 if (mp_irqs[i].mp_irqtype == mp_INT && 4003 if (mp_irqs[i].irqtype == mp_INT &&
3743 mp_irqs[i].mp_srcbusirq == bus_irq) 4004 mp_irqs[i].srcbusirq == bus_irq)
3744 break; 4005 break;
3745 if (i >= mp_irq_entries) 4006 if (i >= mp_irq_entries)
3746 return -1; 4007 return -1;
@@ -3755,13 +4016,15 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
3755/* 4016/*
3756 * This function currently is only a helper for the i386 smp boot process where 4017 * This function currently is only a helper for the i386 smp boot process where
3757 * we need to reprogram the ioredtbls to cater for the cpus which have come online 4018 * we need to reprogram the ioredtbls to cater for the cpus which have come online
3758 * so mask in all cases should simply be TARGET_CPUS 4019 * so mask in all cases should simply be apic->target_cpus()
3759 */ 4020 */
3760#ifdef CONFIG_SMP 4021#ifdef CONFIG_SMP
3761void __init setup_ioapic_dest(void) 4022void __init setup_ioapic_dest(void)
3762{ 4023{
3763 int pin, ioapic, irq, irq_entry; 4024 int pin, ioapic, irq, irq_entry;
4025 struct irq_desc *desc;
3764 struct irq_cfg *cfg; 4026 struct irq_cfg *cfg;
4027 const struct cpumask *mask;
3765 4028
3766 if (skip_ioapic_setup == 1) 4029 if (skip_ioapic_setup == 1)
3767 return; 4030 return;
@@ -3777,17 +4040,31 @@ void __init setup_ioapic_dest(void)
3777 * when you have too many devices, because at that time only boot 4040 * when you have too many devices, because at that time only boot
3778 * cpu is online. 4041 * cpu is online.
3779 */ 4042 */
3780 cfg = irq_cfg(irq); 4043 desc = irq_to_desc(irq);
3781 if (!cfg->vector) 4044 cfg = desc->chip_data;
3782 setup_IO_APIC_irq(ioapic, pin, irq, 4045 if (!cfg->vector) {
4046 setup_IO_APIC_irq(ioapic, pin, irq, desc,
3783 irq_trigger(irq_entry), 4047 irq_trigger(irq_entry),
3784 irq_polarity(irq_entry)); 4048 irq_polarity(irq_entry));
4049 continue;
4050
4051 }
4052
4053 /*
4054 * Honour affinities which have been set in early boot
4055 */
4056 if (desc->status &
4057 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
4058 mask = desc->affinity;
4059 else
4060 mask = apic->target_cpus();
4061
3785#ifdef CONFIG_INTR_REMAP 4062#ifdef CONFIG_INTR_REMAP
3786 else if (intr_remapping_enabled) 4063 if (intr_remapping_enabled)
3787 set_ir_ioapic_affinity_irq(irq, TARGET_CPUS); 4064 set_ir_ioapic_affinity_irq_desc(desc, mask);
3788#endif
3789 else 4065 else
3790 set_ioapic_affinity_irq(irq, TARGET_CPUS); 4066#endif
4067 set_ioapic_affinity_irq_desc(desc, mask);
3791 } 4068 }
3792 4069
3793 } 4070 }
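The new branch in setup_ioapic_dest() decides which mask to program: an affinity pinned during early boot wins over the default target_cpus(). A compact sketch of just that decision, using the flags and accessors shown in the hunk; the helper name is illustrative:

/* Sketch: pick the destination mask for an already-configured IO-APIC IRQ. */
static const struct cpumask *sketch_ioapic_dest_mask(struct irq_desc *desc)
{
	/*
	 * honour an affinity that was explicitly set (or marked as not to
	 * be balanced) during early boot; otherwise use the APIC default
	 */
	if (desc->status & (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
		return desc->affinity;

	return apic->target_cpus();
}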
@@ -3836,11 +4113,10 @@ void __init ioapic_init_mappings(void)
3836 struct resource *ioapic_res; 4113 struct resource *ioapic_res;
3837 int i; 4114 int i;
3838 4115
3839 irq_2_pin_init();
3840 ioapic_res = ioapic_setup_resources(); 4116 ioapic_res = ioapic_setup_resources();
3841 for (i = 0; i < nr_ioapics; i++) { 4117 for (i = 0; i < nr_ioapics; i++) {
3842 if (smp_found_config) { 4118 if (smp_found_config) {
3843 ioapic_phys = mp_ioapics[i].mp_apicaddr; 4119 ioapic_phys = mp_ioapics[i].apicaddr;
3844#ifdef CONFIG_X86_32 4120#ifdef CONFIG_X86_32
3845 if (!ioapic_phys) { 4121 if (!ioapic_phys) {
3846 printk(KERN_ERR 4122 printk(KERN_ERR
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 191914302744..e41980a373ab 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -35,8 +35,8 @@ static void set_bitmap(unsigned long *bitmap, unsigned int base,
35 */ 35 */
36asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) 36asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
37{ 37{
38 struct thread_struct * t = &current->thread; 38 struct thread_struct *t = &current->thread;
39 struct tss_struct * tss; 39 struct tss_struct *tss;
40 unsigned int i, max_long, bytes, bytes_updated; 40 unsigned int i, max_long, bytes, bytes_updated;
41 41
42 if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) 42 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
@@ -131,9 +131,8 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)
131} 131}
132 132
133#ifdef CONFIG_X86_32 133#ifdef CONFIG_X86_32
134asmlinkage long sys_iopl(unsigned long regsp) 134long sys_iopl(struct pt_regs *regs)
135{ 135{
136 struct pt_regs *regs = (struct pt_regs *)&regsp;
137 unsigned int level = regs->bx; 136 unsigned int level = regs->bx;
138 struct thread_struct *t = &current->thread; 137 struct thread_struct *t = &current->thread;
139 int rc; 138 int rc;
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index f1c688e46f35..dbf5445727a9 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -17,135 +17,121 @@
17#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
18#include <asm/apic.h> 18#include <asm/apic.h>
19#include <asm/proto.h> 19#include <asm/proto.h>
20#include <asm/ipi.h>
20 21
21#ifdef CONFIG_X86_32 22void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
22#include <mach_apic.h>
23#include <mach_ipi.h>
24
25/*
26 * the following functions deal with sending IPIs between CPUs.
27 *
28 * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
29 */
30
31static inline int __prepare_ICR(unsigned int shortcut, int vector)
32{ 23{
33 unsigned int icr = shortcut | APIC_DEST_LOGICAL; 24 unsigned long query_cpu;
34 25 unsigned long flags;
35 switch (vector) { 26
36 default: 27 /*
37 icr |= APIC_DM_FIXED | vector; 28 * Hack. The clustered APIC addressing mode doesn't allow us to send
38 break; 29 * to an arbitrary mask, so I do a unicast to each CPU instead.
39 case NMI_VECTOR: 30 * - mbligh
40 icr |= APIC_DM_NMI; 31 */
41 break; 32 local_irq_save(flags);
33 for_each_cpu(query_cpu, mask) {
34 __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
35 query_cpu), vector, APIC_DEST_PHYSICAL);
42 } 36 }
43 return icr; 37 local_irq_restore(flags);
44} 38}
45 39
46static inline int __prepare_ICR2(unsigned int mask) 40void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
41 int vector)
47{ 42{
48 return SET_APIC_DEST_FIELD(mask); 43 unsigned int this_cpu = smp_processor_id();
44 unsigned int query_cpu;
45 unsigned long flags;
46
47 /* See Hack comment above */
48
49 local_irq_save(flags);
50 for_each_cpu(query_cpu, mask) {
51 if (query_cpu == this_cpu)
52 continue;
53 __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
54 query_cpu), vector, APIC_DEST_PHYSICAL);
55 }
56 local_irq_restore(flags);
49} 57}
50 58
51void __send_IPI_shortcut(unsigned int shortcut, int vector) 59void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
60 int vector)
52{ 61{
53 /* 62 unsigned long flags;
54 * Subtle. In the case of the 'never do double writes' workaround 63 unsigned int query_cpu;
55 * we have to lock out interrupts to be safe. As we don't care
56 * of the value read we use an atomic rmw access to avoid costly
57 * cli/sti. Otherwise we use an even cheaper single atomic write
58 * to the APIC.
59 */
60 unsigned int cfg;
61
62 /*
63 * Wait for idle.
64 */
65 apic_wait_icr_idle();
66 64
67 /* 65 /*
68 * No need to touch the target chip field 66 * Hack. The clustered APIC addressing mode doesn't allow us to send
 67 * to an arbitrary mask, so I do unicasts to each CPU instead. This
68 * should be modified to do 1 message per cluster ID - mbligh
69 */ 69 */
70 cfg = __prepare_ICR(shortcut, vector);
71 70
72 /* 71 local_irq_save(flags);
73 * Send the IPI. The write to APIC_ICR fires this off. 72 for_each_cpu(query_cpu, mask)
74 */ 73 __default_send_IPI_dest_field(
75 apic_write(APIC_ICR, cfg); 74 apic->cpu_to_logical_apicid(query_cpu), vector,
75 apic->dest_logical);
76 local_irq_restore(flags);
76} 77}
77 78
78void send_IPI_self(int vector) 79void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
80 int vector)
79{ 81{
80 __send_IPI_shortcut(APIC_DEST_SELF, vector); 82 unsigned long flags;
81} 83 unsigned int query_cpu;
82 84 unsigned int this_cpu = smp_processor_id();
83/*
84 * This is used to send an IPI with no shorthand notation (the destination is
85 * specified in bits 56 to 63 of the ICR).
86 */
87static inline void __send_IPI_dest_field(unsigned long mask, int vector)
88{
89 unsigned long cfg;
90
91 /*
92 * Wait for idle.
93 */
94 if (unlikely(vector == NMI_VECTOR))
95 safe_apic_wait_icr_idle();
96 else
97 apic_wait_icr_idle();
98
99 /*
100 * prepare target chip field
101 */
102 cfg = __prepare_ICR2(mask);
103 apic_write(APIC_ICR2, cfg);
104 85
105 /* 86 /* See Hack comment above */
106 * program the ICR
107 */
108 cfg = __prepare_ICR(0, vector);
109 87
110 /* 88 local_irq_save(flags);
111 * Send the IPI. The write to APIC_ICR fires this off. 89 for_each_cpu(query_cpu, mask) {
112 */ 90 if (query_cpu == this_cpu)
113 apic_write(APIC_ICR, cfg); 91 continue;
92 __default_send_IPI_dest_field(
93 apic->cpu_to_logical_apicid(query_cpu), vector,
94 apic->dest_logical);
95 }
96 local_irq_restore(flags);
114} 97}
115 98
99#ifdef CONFIG_X86_32
100
116/* 101/*
117 * This is only used on smaller machines. 102 * This is only used on smaller machines.
118 */ 103 */
119void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) 104void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
120{ 105{
121 unsigned long mask = cpus_addr(cpumask)[0]; 106 unsigned long mask = cpumask_bits(cpumask)[0];
122 unsigned long flags; 107 unsigned long flags;
123 108
124 local_irq_save(flags); 109 local_irq_save(flags);
125 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); 110 WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
126 __send_IPI_dest_field(mask, vector); 111 __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
127 local_irq_restore(flags); 112 local_irq_restore(flags);
128} 113}
129 114
130void send_IPI_mask_sequence(cpumask_t mask, int vector) 115void default_send_IPI_allbutself(int vector)
131{ 116{
132 unsigned long flags;
133 unsigned int query_cpu;
134
135 /* 117 /*
136 * Hack. The clustered APIC addressing mode doesn't allow us to send 118 * if there are no other CPUs in the system then we get an APIC send
137 * to an arbitrary mask, so I do a unicasts to each CPU instead. This 119 * error if we try to broadcast, thus avoid sending IPIs in this case.
138 * should be modified to do 1 message per cluster ID - mbligh
139 */ 120 */
121 if (!(num_online_cpus() > 1))
122 return;
140 123
141 local_irq_save(flags); 124 __default_local_send_IPI_allbutself(vector);
142 for_each_possible_cpu(query_cpu) { 125}
143 if (cpu_isset(query_cpu, mask)) { 126
144 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), 127void default_send_IPI_all(int vector)
145 vector); 128{
146 } 129 __default_local_send_IPI_all(vector);
147 } 130}
148 local_irq_restore(flags); 131
132void default_send_IPI_self(int vector)
133{
134 __default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical);
149} 135}
150 136
151/* must come after the send_IPI functions above for inlining */ 137/* must come after the send_IPI functions above for inlining */
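The four default_send_IPI_mask_* helpers above are variations on one loop: clustered APIC addressing has no single ICR write that reaches an arbitrary mask, so the IPI is unicast to each CPU in turn with interrupts disabled around the sequence. A stripped-down sketch of the physical-mode variant, structurally identical to the code above; the skip_self parameter is an illustrative way of folding in the allbutself case:

/* Sketch: unicast an IPI to every CPU in 'mask', optionally skipping self. */
static void sketch_send_IPI_mask_phys(const struct cpumask *mask, int vector,
				      bool skip_self)
{
	unsigned int this_cpu = smp_processor_id();
	unsigned long query_cpu;
	unsigned long flags;

	local_irq_save(flags);
	for_each_cpu(query_cpu, mask) {
		if (skip_self && query_cpu == this_cpu)
			continue;
		/* one ICR programming per target CPU */
		__default_send_IPI_dest_field(
			per_cpu(x86_cpu_to_apicid, query_cpu),
			vector, APIC_DEST_PHYSICAL);
	}
	local_irq_restore(flags);
}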
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc52f649..f13ca1650aaf 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -5,10 +5,13 @@
5#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel_stat.h> 6#include <linux/kernel_stat.h>
7#include <linux/seq_file.h> 7#include <linux/seq_file.h>
8#include <linux/smp.h>
9#include <linux/ftrace.h>
8 10
9#include <asm/apic.h> 11#include <asm/apic.h>
10#include <asm/io_apic.h> 12#include <asm/io_apic.h>
11#include <asm/smp.h> 13#include <asm/irq.h>
14#include <asm/idle.h>
12 15
13atomic_t irq_err_count; 16atomic_t irq_err_count;
14 17
@@ -35,11 +38,7 @@ void ack_bad_irq(unsigned int irq)
35#endif 38#endif
36} 39}
37 40
38#ifdef CONFIG_X86_32 41#define irq_stats(x) (&per_cpu(irq_stat, x))
39# define irq_stats(x) (&per_cpu(irq_stat, x))
40#else
41# define irq_stats(x) cpu_pda(x)
42#endif
43/* 42/*
44 * /proc/interrupts printing: 43 * /proc/interrupts printing:
45 */ 44 */
@@ -118,6 +117,9 @@ int show_interrupts(struct seq_file *p, void *v)
118 } 117 }
119 118
120 desc = irq_to_desc(i); 119 desc = irq_to_desc(i);
120 if (!desc)
121 return 0;
122
121 spin_lock_irqsave(&desc->lock, flags); 123 spin_lock_irqsave(&desc->lock, flags);
122#ifndef CONFIG_SMP 124#ifndef CONFIG_SMP
123 any_count = kstat_irqs(i); 125 any_count = kstat_irqs(i);
@@ -187,3 +189,41 @@ u64 arch_irq_stat(void)
187#endif 189#endif
188 return sum; 190 return sum;
189} 191}
192
193
194/*
195 * do_IRQ handles all normal device IRQ's (the special
196 * SMP cross-CPU interrupts have their own specific
197 * handlers).
198 */
199unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
200{
201 struct pt_regs *old_regs = set_irq_regs(regs);
202
203 /* high bit used in ret_from_ code */
204 unsigned vector = ~regs->orig_ax;
205 unsigned irq;
206
207 exit_idle();
208 irq_enter();
209
210 irq = __get_cpu_var(vector_irq)[vector];
211
212 if (!handle_irq(irq, regs)) {
213#ifdef CONFIG_X86_64
214 if (!disable_apic)
215 ack_APIC_irq();
216#endif
217
218 if (printk_ratelimit())
219 printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n",
220 __func__, smp_processor_id(), vector, irq);
221 }
222
223 irq_exit();
224
225 set_irq_regs(old_regs);
226 return 1;
227}
228
229EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
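The do_IRQ() added above is now the single architecture-neutral entry point: it demuxes the vector pushed by the entry stub through the per-CPU vector_irq[] table and hands off to a per-arch handle_irq(), which returns false for a spurious vector so do_IRQ() can ack it and rate-limit a warning. A sketch of what an implementation of that contract has to do, modelled on the 32/64-bit versions in the next two files; the name sketch_handle_irq is illustrative:

/* Sketch: the handle_irq() contract expected by the generic do_IRQ(). */
bool sketch_handle_irq(unsigned irq, struct pt_regs *regs)
{
	struct irq_desc *desc = irq_to_desc(irq);

	if (unlikely(!desc))
		return false;		/* spurious: let do_IRQ() complain */

	generic_handle_irq_desc(irq, desc);
	return true;
}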
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index a51382672de0..4beb9a13873d 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -15,9 +15,9 @@
15#include <linux/notifier.h> 15#include <linux/notifier.h>
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/uaccess.h>
18 19
19#include <asm/apic.h> 20#include <asm/apic.h>
20#include <asm/uaccess.h>
21 21
22DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); 22DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
23EXPORT_PER_CPU_SYMBOL(irq_stat); 23EXPORT_PER_CPU_SYMBOL(irq_stat);
@@ -93,7 +93,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
93 return 0; 93 return 0;
94 94
95 /* build the stack frame on the IRQ stack */ 95 /* build the stack frame on the IRQ stack */
96 isp = (u32 *) ((char*)irqctx + sizeof(*irqctx)); 96 isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
97 irqctx->tinfo.task = curctx->tinfo.task; 97 irqctx->tinfo.task = curctx->tinfo.task;
98 irqctx->tinfo.previous_esp = current_stack_pointer; 98 irqctx->tinfo.previous_esp = current_stack_pointer;
99 99
@@ -137,7 +137,7 @@ void __cpuinit irq_ctx_init(int cpu)
137 137
138 hardirq_ctx[cpu] = irqctx; 138 hardirq_ctx[cpu] = irqctx;
139 139
140 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; 140 irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE];
141 irqctx->tinfo.task = NULL; 141 irqctx->tinfo.task = NULL;
142 irqctx->tinfo.exec_domain = NULL; 142 irqctx->tinfo.exec_domain = NULL;
143 irqctx->tinfo.cpu = cpu; 143 irqctx->tinfo.cpu = cpu;
@@ -147,7 +147,7 @@ void __cpuinit irq_ctx_init(int cpu)
147 softirq_ctx[cpu] = irqctx; 147 softirq_ctx[cpu] = irqctx;
148 148
149 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", 149 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
150 cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); 150 cpu, hardirq_ctx[cpu], softirq_ctx[cpu]);
151} 151}
152 152
153void irq_ctx_exit(int cpu) 153void irq_ctx_exit(int cpu)
@@ -174,7 +174,7 @@ asmlinkage void do_softirq(void)
174 irqctx->tinfo.previous_esp = current_stack_pointer; 174 irqctx->tinfo.previous_esp = current_stack_pointer;
175 175
176 /* build the stack frame on the softirq stack */ 176 /* build the stack frame on the softirq stack */
177 isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); 177 isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
178 178
179 call_on_stack(__do_softirq, isp); 179 call_on_stack(__do_softirq, isp);
180 /* 180 /*
@@ -191,33 +191,16 @@ static inline int
191execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; } 191execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
192#endif 192#endif
193 193
194/* 194bool handle_irq(unsigned irq, struct pt_regs *regs)
195 * do_IRQ handles all normal device IRQ's (the special
196 * SMP cross-CPU interrupts have their own specific
197 * handlers).
198 */
199unsigned int do_IRQ(struct pt_regs *regs)
200{ 195{
201 struct pt_regs *old_regs;
202 /* high bit used in ret_from_ code */
203 int overflow;
204 unsigned vector = ~regs->orig_ax;
205 struct irq_desc *desc; 196 struct irq_desc *desc;
206 unsigned irq; 197 int overflow;
207
208
209 old_regs = set_irq_regs(regs);
210 irq_enter();
211 irq = __get_cpu_var(vector_irq)[vector];
212 198
213 overflow = check_stack_overflow(); 199 overflow = check_stack_overflow();
214 200
215 desc = irq_to_desc(irq); 201 desc = irq_to_desc(irq);
216 if (unlikely(!desc)) { 202 if (unlikely(!desc))
217 printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n", 203 return false;
218 __func__, irq, vector, smp_processor_id());
219 BUG();
220 }
221 204
222 if (!execute_on_irq_stack(overflow, desc, irq)) { 205 if (!execute_on_irq_stack(overflow, desc, irq)) {
223 if (unlikely(overflow)) 206 if (unlikely(overflow))
@@ -225,33 +208,34 @@ unsigned int do_IRQ(struct pt_regs *regs)
225 desc->handle_irq(irq, desc); 208 desc->handle_irq(irq, desc);
226 } 209 }
227 210
228 irq_exit(); 211 return true;
229 set_irq_regs(old_regs);
230 return 1;
231} 212}
232 213
233#ifdef CONFIG_HOTPLUG_CPU 214#ifdef CONFIG_HOTPLUG_CPU
234#include <mach_apic.h> 215#include <asm/genapic.h>
235 216
236void fixup_irqs(cpumask_t map) 217/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
218void fixup_irqs(void)
237{ 219{
238 unsigned int irq; 220 unsigned int irq;
239 static int warned; 221 static int warned;
240 struct irq_desc *desc; 222 struct irq_desc *desc;
241 223
242 for_each_irq_desc(irq, desc) { 224 for_each_irq_desc(irq, desc) {
243 cpumask_t mask; 225 const struct cpumask *affinity;
244 226
227 if (!desc)
228 continue;
245 if (irq == 2) 229 if (irq == 2)
246 continue; 230 continue;
247 231
248 cpus_and(mask, desc->affinity, map); 232 affinity = desc->affinity;
249 if (any_online_cpu(mask) == NR_CPUS) { 233 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
250 printk("Breaking affinity for irq %i\n", irq); 234 printk("Breaking affinity for irq %i\n", irq);
251 mask = map; 235 affinity = cpu_all_mask;
252 } 236 }
253 if (desc->chip->set_affinity) 237 if (desc->chip->set_affinity)
254 desc->chip->set_affinity(irq, mask); 238 desc->chip->set_affinity(irq, affinity);
255 else if (desc->action && !(warned++)) 239 else if (desc->action && !(warned++))
256 printk("Cannot set affinity for irq %i\n", irq); 240 printk("Cannot set affinity for irq %i\n", irq);
257 } 241 }
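fixup_irqs() above (and its 64-bit counterpart in the next file) runs after a CPU is taken offline: for each descriptor it checks whether the recorded affinity still intersects the online map and, if not, "breaks" the affinity by retargeting the IRQ at cpu_all_mask. A sketch of that core test, using the same cpumask helpers as the hunk; the helper name is illustrative:

/* Sketch: choose a usable affinity mask for one IRQ after CPU removal. */
static const struct cpumask *sketch_fixup_affinity(unsigned int irq,
						   struct irq_desc *desc)
{
	const struct cpumask *affinity = desc->affinity;

	/* no online CPU left in the old mask -> break the affinity */
	if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
		printk("Breaking affinity for irq %i\n", irq);
		affinity = cpu_all_mask;
	}

	return affinity;
}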
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 60eb84eb77a0..977d8b43a0dd 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -13,12 +13,19 @@
13#include <linux/seq_file.h> 13#include <linux/seq_file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/delay.h> 15#include <linux/delay.h>
16#include <asm/uaccess.h> 16#include <linux/ftrace.h>
17#include <linux/uaccess.h>
18#include <linux/smp.h>
17#include <asm/io_apic.h> 19#include <asm/io_apic.h>
18#include <asm/idle.h> 20#include <asm/idle.h>
19#include <asm/smp.h> 21#include <asm/apic.h>
22
23DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
24EXPORT_PER_CPU_SYMBOL(irq_stat);
25
26DEFINE_PER_CPU(struct pt_regs *, irq_regs);
27EXPORT_PER_CPU_SYMBOL(irq_regs);
20 28
21#ifdef CONFIG_DEBUG_STACKOVERFLOW
22/* 29/*
23 * Probabilistic stack overflow check: 30 * Probabilistic stack overflow check:
24 * 31 *
@@ -28,95 +35,71 @@
28 */ 35 */
29static inline void stack_overflow_check(struct pt_regs *regs) 36static inline void stack_overflow_check(struct pt_regs *regs)
30{ 37{
38#ifdef CONFIG_DEBUG_STACKOVERFLOW
31 u64 curbase = (u64)task_stack_page(current); 39 u64 curbase = (u64)task_stack_page(current);
32 static unsigned long warned = -60*HZ; 40
33 41 WARN_ONCE(regs->sp >= curbase &&
34 if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && 42 regs->sp <= curbase + THREAD_SIZE &&
35 regs->sp < curbase + sizeof(struct thread_info) + 128 && 43 regs->sp < curbase + sizeof(struct thread_info) +
36 time_after(jiffies, warned + 60*HZ)) { 44 sizeof(struct pt_regs) + 128,
37 printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", 45
38 current->comm, curbase, regs->sp); 46 "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
39 show_stack(NULL,NULL); 47 current->comm, curbase, regs->sp);
40 warned = jiffies;
41 }
42}
43#endif 48#endif
49}
44 50
45/* 51bool handle_irq(unsigned irq, struct pt_regs *regs)
46 * do_IRQ handles all normal device IRQ's (the special
47 * SMP cross-CPU interrupts have their own specific
48 * handlers).
49 */
50asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
51{ 52{
52 struct pt_regs *old_regs = set_irq_regs(regs);
53 struct irq_desc *desc; 53 struct irq_desc *desc;
54 54
55 /* high bit used in ret_from_ code */
56 unsigned vector = ~regs->orig_ax;
57 unsigned irq;
58
59 exit_idle();
60 irq_enter();
61 irq = __get_cpu_var(vector_irq)[vector];
62
63#ifdef CONFIG_DEBUG_STACKOVERFLOW
64 stack_overflow_check(regs); 55 stack_overflow_check(regs);
65#endif
66 56
67 desc = irq_to_desc(irq); 57 desc = irq_to_desc(irq);
68 if (likely(desc)) 58 if (unlikely(!desc))
69 generic_handle_irq_desc(irq, desc); 59 return false;
70 else {
71 if (!disable_apic)
72 ack_APIC_irq();
73
74 if (printk_ratelimit())
75 printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
76 __func__, smp_processor_id(), vector);
77 }
78 60
79 irq_exit(); 61 generic_handle_irq_desc(irq, desc);
80 62 return true;
81 set_irq_regs(old_regs);
82 return 1;
83} 63}
84 64
85#ifdef CONFIG_HOTPLUG_CPU 65#ifdef CONFIG_HOTPLUG_CPU
86void fixup_irqs(cpumask_t map) 66/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
67void fixup_irqs(void)
87{ 68{
88 unsigned int irq; 69 unsigned int irq;
89 static int warned; 70 static int warned;
90 struct irq_desc *desc; 71 struct irq_desc *desc;
91 72
92 for_each_irq_desc(irq, desc) { 73 for_each_irq_desc(irq, desc) {
93 cpumask_t mask;
94 int break_affinity = 0; 74 int break_affinity = 0;
95 int set_affinity = 1; 75 int set_affinity = 1;
76 const struct cpumask *affinity;
96 77
78 if (!desc)
79 continue;
97 if (irq == 2) 80 if (irq == 2)
98 continue; 81 continue;
99 82
 100 /* interrupts are disabled at this point */ 83 /* interrupts are disabled at this point */
101 spin_lock(&desc->lock); 84 spin_lock(&desc->lock);
102 85
86 affinity = desc->affinity;
103 if (!irq_has_action(irq) || 87 if (!irq_has_action(irq) ||
104 cpus_equal(desc->affinity, map)) { 88 cpumask_equal(affinity, cpu_online_mask)) {
105 spin_unlock(&desc->lock); 89 spin_unlock(&desc->lock);
106 continue; 90 continue;
107 } 91 }
108 92
109 cpus_and(mask, desc->affinity, map); 93 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
110 if (cpus_empty(mask)) {
111 break_affinity = 1; 94 break_affinity = 1;
112 mask = map; 95 affinity = cpu_all_mask;
113 } 96 }
114 97
115 if (desc->chip->mask) 98 if (desc->chip->mask)
116 desc->chip->mask(irq); 99 desc->chip->mask(irq);
117 100
118 if (desc->chip->set_affinity) 101 if (desc->chip->set_affinity)
119 desc->chip->set_affinity(irq, mask); 102 desc->chip->set_affinity(irq, affinity);
120 else if (!(warned++)) 103 else if (!(warned++))
121 set_affinity = 0; 104 set_affinity = 0;
122 105
@@ -142,18 +125,18 @@ extern void call_softirq(void);
142 125
143asmlinkage void do_softirq(void) 126asmlinkage void do_softirq(void)
144{ 127{
145 __u32 pending; 128 __u32 pending;
146 unsigned long flags; 129 unsigned long flags;
147 130
148 if (in_interrupt()) 131 if (in_interrupt())
149 return; 132 return;
150 133
151 local_irq_save(flags); 134 local_irq_save(flags);
152 pending = local_softirq_pending(); 135 pending = local_softirq_pending();
153 /* Switch to interrupt stack */ 136 /* Switch to interrupt stack */
154 if (pending) { 137 if (pending) {
155 call_softirq(); 138 call_softirq();
156 WARN_ON_ONCE(softirq_count()); 139 WARN_ON_ONCE(softirq_count());
157 } 140 }
158 local_irq_restore(flags); 141 local_irq_restore(flags);
159} 142}
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 845aa9803e80..bf629cadec1a 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -9,18 +9,18 @@
9#include <linux/kernel_stat.h> 9#include <linux/kernel_stat.h>
10#include <linux/sysdev.h> 10#include <linux/sysdev.h>
11#include <linux/bitops.h> 11#include <linux/bitops.h>
12#include <linux/io.h>
13#include <linux/delay.h>
12 14
13#include <asm/atomic.h> 15#include <asm/atomic.h>
14#include <asm/system.h> 16#include <asm/system.h>
15#include <asm/io.h>
16#include <asm/timer.h> 17#include <asm/timer.h>
17#include <asm/pgtable.h> 18#include <asm/pgtable.h>
18#include <asm/delay.h>
19#include <asm/desc.h> 19#include <asm/desc.h>
20#include <asm/apic.h> 20#include <asm/apic.h>
21#include <asm/arch_hooks.h> 21#include <asm/arch_hooks.h>
22#include <asm/i8259.h> 22#include <asm/i8259.h>
23 23#include <asm/traps.h>
24 24
25 25
26/* 26/*
@@ -34,12 +34,10 @@
34 * leads to races. IBM designers who came up with it should 34 * leads to races. IBM designers who came up with it should
35 * be shot. 35 * be shot.
36 */ 36 */
37
38 37
39static irqreturn_t math_error_irq(int cpl, void *dev_id) 38static irqreturn_t math_error_irq(int cpl, void *dev_id)
40{ 39{
41 extern void math_error(void __user *); 40 outb(0, 0xF0);
42 outb(0,0xF0);
43 if (ignore_fpu_irq || !boot_cpu_data.hard_math) 41 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
44 return IRQ_NONE; 42 return IRQ_NONE;
45 math_error((void __user *)get_irq_regs()->ip); 43 math_error((void __user *)get_irq_regs()->ip);
@@ -56,7 +54,7 @@ static struct irqaction fpu_irq = {
56 .name = "fpu", 54 .name = "fpu",
57}; 55};
58 56
59void __init init_ISA_irqs (void) 57void __init init_ISA_irqs(void)
60{ 58{
61 int i; 59 int i;
62 60
@@ -68,8 +66,7 @@ void __init init_ISA_irqs (void)
68 /* 66 /*
69 * 16 old-style INTA-cycle interrupts: 67 * 16 old-style INTA-cycle interrupts:
70 */ 68 */
71 for (i = 0; i < 16; i++) { 69 for (i = 0; i < NR_IRQS_LEGACY; i++) {
72 /* first time call this irq_desc */
73 struct irq_desc *desc = irq_to_desc(i); 70 struct irq_desc *desc = irq_to_desc(i);
74 71
75 desc->status = IRQ_DISABLED; 72 desc->status = IRQ_DISABLED;
@@ -111,6 +108,18 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
111 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 108 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
112}; 109};
113 110
111int vector_used_by_percpu_irq(unsigned int vector)
112{
113 int cpu;
114
115 for_each_online_cpu(cpu) {
116 if (per_cpu(vector_irq, cpu)[vector] != -1)
117 return 1;
118 }
119
120 return 0;
121}
122
114/* Overridden in paravirt.c */ 123/* Overridden in paravirt.c */
115void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 124void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
116 125
@@ -129,7 +138,7 @@ void __init native_init_IRQ(void)
129 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 138 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
130 /* SYSCALL_VECTOR was reserved in trap_init. */ 139 /* SYSCALL_VECTOR was reserved in trap_init. */
131 if (i != SYSCALL_VECTOR) 140 if (i != SYSCALL_VECTOR)
132 set_intr_gate(i, interrupt[i]); 141 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
133 } 142 }
134 143
135 144
@@ -140,17 +149,26 @@ void __init native_init_IRQ(void)
140 */ 149 */
141 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 150 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
142 151
143 /* IPI for invalidation */ 152 /* IPIs for invalidation */
144 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); 153 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
154 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
155 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
156 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
157 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
158 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
159 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
160 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
145 161
146 /* IPI for generic function call */ 162 /* IPI for generic function call */
147 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 163 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
148 164
149 /* IPI for single call function */ 165 /* IPI for single call function */
150 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); 166 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
167 call_function_single_interrupt);
151 168
152 /* Low priority IPI to cleanup after moving an irq */ 169 /* Low priority IPI to cleanup after moving an irq */
153 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 170 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
171 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
154#endif 172#endif
155 173
156#ifdef CONFIG_X86_LOCAL_APIC 174#ifdef CONFIG_X86_LOCAL_APIC
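
The vector_used_by_percpu_irq() helper added above walks each online CPU's vector_irq[] table and reports whether any CPU already maps the given vector to an IRQ. A minimal caller might use it as in the sketch below; claim_percpu_vector() is an illustrative name only and is not part of this patch.

static int __init claim_percpu_vector(unsigned int vector)
{
	/* hypothetical helper: refuse a vector some CPU already uses */
	if (vector_used_by_percpu_irq(vector))
		return -EBUSY;

	return 0;
}
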
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index ff0235391285..da481a1e3f30 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -11,54 +11,19 @@
11#include <linux/kernel_stat.h> 11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h> 12#include <linux/sysdev.h>
13#include <linux/bitops.h> 13#include <linux/bitops.h>
14#include <linux/acpi.h>
15#include <linux/io.h>
16#include <linux/delay.h>
14 17
15#include <asm/acpi.h>
16#include <asm/atomic.h> 18#include <asm/atomic.h>
17#include <asm/system.h> 19#include <asm/system.h>
18#include <asm/io.h>
19#include <asm/hw_irq.h> 20#include <asm/hw_irq.h>
20#include <asm/pgtable.h> 21#include <asm/pgtable.h>
21#include <asm/delay.h>
22#include <asm/desc.h> 22#include <asm/desc.h>
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <asm/i8259.h> 24#include <asm/i8259.h>
25 25
26/* 26/*
27 * Common place to define all x86 IRQ vectors
28 *
29 * This builds up the IRQ handler stubs using some ugly macros in irq.h
30 *
31 * These macros create the low-level assembly IRQ routines that save
32 * register context and call do_IRQ(). do_IRQ() then does all the
33 * operations that are needed to keep the AT (or SMP IOAPIC)
34 * interrupt-controller happy.
35 */
36
37#define IRQ_NAME2(nr) nr##_interrupt(void)
38#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
39
40/*
41 * SMP has a few special interrupts for IPI messages
42 */
43
44#define BUILD_IRQ(nr) \
45 asmlinkage void IRQ_NAME(nr); \
46 asm("\n.text\n.p2align\n" \
47 "IRQ" #nr "_interrupt:\n\t" \
48 "push $~(" #nr ") ; " \
49 "jmp common_interrupt\n" \
50 ".previous");
51
52#define BI(x,y) \
53 BUILD_IRQ(x##y)
54
55#define BUILD_16_IRQS(x) \
56 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
57 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
58 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
59 BI(x,c) BI(x,d) BI(x,e) BI(x,f)
60
61/*
62 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: 27 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
63 * (these are usually mapped to vectors 0x30-0x3f) 28 * (these are usually mapped to vectors 0x30-0x3f)
64 */ 29 */
@@ -73,37 +38,6 @@
73 * 38 *
74 * (these are usually mapped into the 0x30-0xff vector range) 39 * (these are usually mapped into the 0x30-0xff vector range)
75 */ 40 */
76 BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
77BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
78BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
79BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
80
81#undef BUILD_16_IRQS
82#undef BI
83
84
85#define IRQ(x,y) \
86 IRQ##x##y##_interrupt
87
88#define IRQLIST_16(x) \
89 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
90 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
91 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
92 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
93
94/* for the irq vectors */
95static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
96 IRQLIST_16(0x2), IRQLIST_16(0x3),
97 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
98 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
99 IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
100};
101
102#undef IRQ
103#undef IRQLIST_16
104
105
106
107 41
108/* 42/*
109 * IRQ2 is cascade interrupt to second interrupt controller 43 * IRQ2 is cascade interrupt to second interrupt controller
@@ -135,15 +69,26 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
135 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 69 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
136}; 70};
137 71
138void __init init_ISA_irqs(void) 72int vector_used_by_percpu_irq(unsigned int vector)
73{
74 int cpu;
75
76 for_each_online_cpu(cpu) {
77 if (per_cpu(vector_irq, cpu)[vector] != -1)
78 return 1;
79 }
80
81 return 0;
82}
83
84static void __init init_ISA_irqs(void)
139{ 85{
140 int i; 86 int i;
141 87
142 init_bsp_APIC(); 88 init_bsp_APIC();
143 init_8259A(0); 89 init_8259A(0);
144 90
145 for (i = 0; i < 16; i++) { 91 for (i = 0; i < NR_IRQS_LEGACY; i++) {
146 /* first time call this irq_desc */
147 struct irq_desc *desc = irq_to_desc(i); 92 struct irq_desc *desc = irq_to_desc(i);
148 93
149 desc->status = IRQ_DISABLED; 94 desc->status = IRQ_DISABLED;
@@ -188,6 +133,7 @@ static void __init smp_intr_init(void)
188 133
189 /* Low priority IPI to cleanup after moving an irq */ 134 /* Low priority IPI to cleanup after moving an irq */
190 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 135 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
136 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
191#endif 137#endif
192} 138}
193 139
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 10435a120d22..5c4f55483849 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -46,7 +46,7 @@
46#include <asm/apicdef.h> 46#include <asm/apicdef.h>
47#include <asm/system.h> 47#include <asm/system.h>
48 48
49#include <mach_ipi.h> 49#include <asm/genapic.h>
50 50
51/* 51/*
52 * Put the error code here just in case the user cares: 52 * Put the error code here just in case the user cares:
@@ -347,7 +347,7 @@ void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
347 */ 347 */
348void kgdb_roundup_cpus(unsigned long flags) 348void kgdb_roundup_cpus(unsigned long flags)
349{ 349{
350 send_IPI_allbutself(APIC_DM_NMI); 350 apic->send_IPI_allbutself(APIC_DM_NMI);
351} 351}
352#endif 352#endif
353 353
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 6c27679ec6aa..e948b28a5a9a 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -376,9 +376,10 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
376 376
377void __kprobes arch_remove_kprobe(struct kprobe *p) 377void __kprobes arch_remove_kprobe(struct kprobe *p)
378{ 378{
379 mutex_lock(&kprobe_mutex); 379 if (p->ainsn.insn) {
380 free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); 380 free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
381 mutex_unlock(&kprobe_mutex); 381 p->ainsn.insn = NULL;
382 }
382} 383}
383 384
384static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) 385static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
@@ -445,7 +446,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
445static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, 446static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
446 struct kprobe_ctlblk *kcb) 447 struct kprobe_ctlblk *kcb)
447{ 448{
448#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM) 449#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER)
449 if (p->ainsn.boostable == 1 && !p->post_handler) { 450 if (p->ainsn.boostable == 1 && !p->post_handler) {
450 /* Boost up -- we can execute copied instructions directly */ 451 /* Boost up -- we can execute copied instructions directly */
451 reset_current_kprobe(); 452 reset_current_kprobe();
@@ -694,7 +695,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
694 /* 695 /*
695 * It is possible to have multiple instances associated with a given 696 * It is possible to have multiple instances associated with a given
696 * task either because multiple functions in the call path have 697 * task either because multiple functions in the call path have
697 * return probes installed on them, and/or more then one 698 * return probes installed on them, and/or more than one
698 * return probe was registered for a target function. 699 * return probe was registered for a target function.
699 * 700 *
700 * We can handle this because: 701 * We can handle this because:
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 774ac4991568..652fce6d2cce 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -89,17 +89,17 @@ static cycle_t kvm_clock_read(void)
89 */ 89 */
90static unsigned long kvm_get_tsc_khz(void) 90static unsigned long kvm_get_tsc_khz(void)
91{ 91{
92 return preset_lpj; 92 struct pvclock_vcpu_time_info *src;
93 src = &per_cpu(hv_clock, 0);
94 return pvclock_tsc_khz(src);
93} 95}
94 96
95static void kvm_get_preset_lpj(void) 97static void kvm_get_preset_lpj(void)
96{ 98{
97 struct pvclock_vcpu_time_info *src;
98 unsigned long khz; 99 unsigned long khz;
99 u64 lpj; 100 u64 lpj;
100 101
101 src = &per_cpu(hv_clock, 0); 102 khz = kvm_get_tsc_khz();
102 khz = pvclock_tsc_khz(src);
103 103
104 lpj = ((u64)khz * 1000); 104 lpj = ((u64)khz * 1000);
105 do_div(lpj, HZ); 105 do_div(lpj, HZ);
@@ -128,7 +128,7 @@ static int kvm_register_clock(char *txt)
128} 128}
129 129
130#ifdef CONFIG_X86_LOCAL_APIC 130#ifdef CONFIG_X86_LOCAL_APIC
131static void kvm_setup_secondary_clock(void) 131static void __cpuinit kvm_setup_secondary_clock(void)
132{ 132{
133 /* 133 /*
134 * Now that the first cpu already had this clocksource initialized, 134 * Now that the first cpu already had this clocksource initialized,
@@ -194,5 +194,7 @@ void __init kvmclock_init(void)
194#endif 194#endif
195 kvm_get_preset_lpj(); 195 kvm_get_preset_lpj();
196 clocksource_register(&kvm_clock); 196 clocksource_register(&kvm_clock);
197 pv_info.paravirt_enabled = 1;
198 pv_info.name = "KVM";
197 } 199 }
198} 200}
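
With kvm_get_tsc_khz() now reading the frequency from the per-cpu pvclock structure, kvm_get_preset_lpj() reduces to the arithmetic sketched below. khz_to_lpj() is an illustrative name, not a function in the file; with a guest-reported TSC of 2,400,000 kHz and HZ == 1000 it yields 2,400,000 loops per jiffy.

static unsigned long khz_to_lpj(unsigned long khz)
{
	u64 lpj = (u64)khz * 1000;	/* kHz -> cycles per second */

	do_div(lpj, HZ);		/* cycles per timer tick */
	return (unsigned long)lpj;
}
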
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index eee32b43fee3..71f1d99a635d 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -12,8 +12,8 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/uaccess.h>
15 16
16#include <asm/uaccess.h>
17#include <asm/system.h> 17#include <asm/system.h>
18#include <asm/ldt.h> 18#include <asm/ldt.h>
19#include <asm/desc.h> 19#include <asm/desc.h>
@@ -93,7 +93,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
93 if (err < 0) 93 if (err < 0)
94 return err; 94 return err;
95 95
96 for(i = 0; i < old->size; i++) 96 for (i = 0; i < old->size; i++)
97 write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE); 97 write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
98 return 0; 98 return 0;
99} 99}
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 3b599518c322..8815f3c7fec7 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -252,7 +252,7 @@ EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
252/* 252/*
253 * The MFPGT timers on the CS5536 provide us with suitable timers to use 253 * The MFPGT timers on the CS5536 provide us with suitable timers to use
254 * as clock event sources - not as good as a HPET or APIC, but certainly 254 * as clock event sources - not as good as a HPET or APIC, but certainly
255 * better then the PIT. This isn't a general purpose MFGPT driver, but 255 * better than the PIT. This isn't a general purpose MFGPT driver, but
256 * a simplified one designed specifically to act as a clock event source. 256 * a simplified one designed specifically to act as a clock event source.
257 * For full details about the MFGPT, please consult the CS5536 data sheet. 257 * For full details about the MFGPT, please consult the CS5536 data sheet.
258 */ 258 */
@@ -287,7 +287,7 @@ static struct clock_event_device mfgpt_clockevent = {
287 .set_mode = mfgpt_set_mode, 287 .set_mode = mfgpt_set_mode,
288 .set_next_event = mfgpt_next_event, 288 .set_next_event = mfgpt_next_event,
289 .rating = 250, 289 .rating = 250,
290 .cpumask = CPU_MASK_ALL, 290 .cpumask = cpu_all_mask,
291 .shift = 32 291 .shift = 32
292}; 292};
293 293
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 5f8e5d75a254..c25fdb382292 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -10,7 +10,7 @@
10 * This driver allows to upgrade microcode on AMD 10 * This driver allows to upgrade microcode on AMD
11 * family 0x10 and 0x11 processors. 11 * family 0x10 and 0x11 processors.
12 * 12 *
13 * Licensed unter the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15*/ 15*/
16 16
@@ -32,9 +32,9 @@
32#include <linux/platform_device.h> 32#include <linux/platform_device.h>
33#include <linux/pci.h> 33#include <linux/pci.h>
34#include <linux/pci_ids.h> 34#include <linux/pci_ids.h>
35#include <linux/uaccess.h>
35 36
36#include <asm/msr.h> 37#include <asm/msr.h>
37#include <asm/uaccess.h>
38#include <asm/processor.h> 38#include <asm/processor.h>
39#include <asm/microcode.h> 39#include <asm/microcode.h>
40 40
@@ -47,43 +47,38 @@ MODULE_LICENSE("GPL v2");
47#define UCODE_UCODE_TYPE 0x00000001 47#define UCODE_UCODE_TYPE 0x00000001
48 48
49struct equiv_cpu_entry { 49struct equiv_cpu_entry {
50 unsigned int installed_cpu; 50 u32 installed_cpu;
51 unsigned int fixed_errata_mask; 51 u32 fixed_errata_mask;
52 unsigned int fixed_errata_compare; 52 u32 fixed_errata_compare;
53 unsigned int equiv_cpu; 53 u16 equiv_cpu;
54}; 54 u16 res;
55} __attribute__((packed));
55 56
56struct microcode_header_amd { 57struct microcode_header_amd {
57 unsigned int data_code; 58 u32 data_code;
58 unsigned int patch_id; 59 u32 patch_id;
59 unsigned char mc_patch_data_id[2]; 60 u16 mc_patch_data_id;
60 unsigned char mc_patch_data_len; 61 u8 mc_patch_data_len;
61 unsigned char init_flag; 62 u8 init_flag;
62 unsigned int mc_patch_data_checksum; 63 u32 mc_patch_data_checksum;
63 unsigned int nb_dev_id; 64 u32 nb_dev_id;
64 unsigned int sb_dev_id; 65 u32 sb_dev_id;
65 unsigned char processor_rev_id[2]; 66 u16 processor_rev_id;
66 unsigned char nb_rev_id; 67 u8 nb_rev_id;
67 unsigned char sb_rev_id; 68 u8 sb_rev_id;
68 unsigned char bios_api_rev; 69 u8 bios_api_rev;
69 unsigned char reserved1[3]; 70 u8 reserved1[3];
70 unsigned int match_reg[8]; 71 u32 match_reg[8];
71}; 72} __attribute__((packed));
72 73
73struct microcode_amd { 74struct microcode_amd {
74 struct microcode_header_amd hdr; 75 struct microcode_header_amd hdr;
75 unsigned int mpb[0]; 76 unsigned int mpb[0];
76}; 77};
77 78
78#define UCODE_MAX_SIZE (2048) 79#define UCODE_MAX_SIZE 2048
79#define DEFAULT_UCODE_DATASIZE (896) 80#define UCODE_CONTAINER_SECTION_HDR 8
80#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) 81#define UCODE_CONTAINER_HEADER_SIZE 12
81#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
82#define DWSIZE (sizeof(u32))
83/* For now we support a fixed ucode total size only */
84#define get_totalsize(mc) \
85 ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
86 + MC_HEADER_SIZE)
87 82
88/* serialize access to the physical write */ 83/* serialize access to the physical write */
89static DEFINE_SPINLOCK(microcode_update_lock); 84static DEFINE_SPINLOCK(microcode_update_lock);
@@ -93,31 +88,24 @@ static struct equiv_cpu_entry *equiv_cpu_table;
93static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 88static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
94{ 89{
95 struct cpuinfo_x86 *c = &cpu_data(cpu); 90 struct cpuinfo_x86 *c = &cpu_data(cpu);
91 u32 dummy;
96 92
97 memset(csig, 0, sizeof(*csig)); 93 memset(csig, 0, sizeof(*csig));
98
99 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { 94 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
100 printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n", 95 printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not "
101 cpu); 96 "supported\n", cpu, c->x86);
102 return -1; 97 return -1;
103 } 98 }
104 99 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
105 asm volatile("movl %1, %%ecx; rdmsr" 100 printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev);
106 : "=a" (csig->rev)
107 : "i" (0x0000008B) : "ecx");
108
109 printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n",
110 csig->rev);
111
112 return 0; 101 return 0;
113} 102}
114 103
115static int get_matching_microcode(int cpu, void *mc, int rev) 104static int get_matching_microcode(int cpu, void *mc, int rev)
116{ 105{
117 struct microcode_header_amd *mc_header = mc; 106 struct microcode_header_amd *mc_header = mc;
118 struct pci_dev *nb_pci_dev, *sb_pci_dev;
119 unsigned int current_cpu_id; 107 unsigned int current_cpu_id;
120 unsigned int equiv_cpu_id = 0x00; 108 u16 equiv_cpu_id = 0;
121 unsigned int i = 0; 109 unsigned int i = 0;
122 110
123 BUG_ON(equiv_cpu_table == NULL); 111 BUG_ON(equiv_cpu_table == NULL);
@@ -132,57 +120,25 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
132 } 120 }
133 121
134 if (!equiv_cpu_id) { 122 if (!equiv_cpu_id) {
135 printk(KERN_ERR "microcode: CPU%d cpu_id " 123 printk(KERN_WARNING "microcode: CPU%d: cpu revision "
136 "not found in equivalent cpu table \n", cpu); 124 "not listed in equivalent cpu table\n", cpu);
137 return 0; 125 return 0;
138 } 126 }
139 127
140 if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) { 128 if (mc_header->processor_rev_id != equiv_cpu_id) {
141 printk(KERN_ERR 129 printk(KERN_ERR "microcode: CPU%d: patch mismatch "
142 "microcode: CPU%d patch does not match " 130 "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
143 "(patch is %x, cpu extended is %x) \n", 131 cpu, mc_header->processor_rev_id, equiv_cpu_id);
144 cpu, mc_header->processor_rev_id[0],
145 (equiv_cpu_id & 0xff));
146 return 0; 132 return 0;
147 } 133 }
148 134
149 if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) { 135 /* ucode might be chipset specific -- currently we don't support this */
150 printk(KERN_ERR "microcode: CPU%d patch does not match " 136 if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
151 "(patch is %x, cpu base id is %x) \n", 137 printk(KERN_ERR "microcode: CPU%d: loading of chipset "
152 cpu, mc_header->processor_rev_id[1], 138 "specific code not yet supported\n", cpu);
153 ((equiv_cpu_id >> 16) & 0xff));
154
155 return 0; 139 return 0;
156 } 140 }
157 141
158 /* ucode may be northbridge specific */
159 if (mc_header->nb_dev_id) {
160 nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
161 (mc_header->nb_dev_id & 0xff),
162 NULL);
163 if ((!nb_pci_dev) ||
164 (mc_header->nb_rev_id != nb_pci_dev->revision)) {
165 printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu);
166 pci_dev_put(nb_pci_dev);
167 return 0;
168 }
169 pci_dev_put(nb_pci_dev);
170 }
171
172 /* ucode may be southbridge specific */
173 if (mc_header->sb_dev_id) {
174 sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
175 (mc_header->sb_dev_id & 0xff),
176 NULL);
177 if ((!sb_pci_dev) ||
178 (mc_header->sb_rev_id != sb_pci_dev->revision)) {
179 printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu);
180 pci_dev_put(sb_pci_dev);
181 return 0;
182 }
183 pci_dev_put(sb_pci_dev);
184 }
185
186 if (mc_header->patch_id <= rev) 142 if (mc_header->patch_id <= rev)
187 return 0; 143 return 0;
188 144
@@ -192,12 +148,10 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
192static void apply_microcode_amd(int cpu) 148static void apply_microcode_amd(int cpu)
193{ 149{
194 unsigned long flags; 150 unsigned long flags;
195 unsigned int eax, edx; 151 u32 rev, dummy;
196 unsigned int rev;
197 int cpu_num = raw_smp_processor_id(); 152 int cpu_num = raw_smp_processor_id();
198 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; 153 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
199 struct microcode_amd *mc_amd = uci->mc; 154 struct microcode_amd *mc_amd = uci->mc;
200 unsigned long addr;
201 155
202 /* We should bind the task to the CPU */ 156 /* We should bind the task to the CPU */
203 BUG_ON(cpu_num != cpu); 157 BUG_ON(cpu_num != cpu);
@@ -206,42 +160,34 @@ static void apply_microcode_amd(int cpu)
206 return; 160 return;
207 161
208 spin_lock_irqsave(&microcode_update_lock, flags); 162 spin_lock_irqsave(&microcode_update_lock, flags);
209 163 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
210 addr = (unsigned long)&mc_amd->hdr.data_code;
211 edx = (unsigned int)(((unsigned long)upper_32_bits(addr)));
212 eax = (unsigned int)(((unsigned long)lower_32_bits(addr)));
213
214 asm volatile("movl %0, %%ecx; wrmsr" :
215 : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx");
216
217 /* get patch id after patching */ 164 /* get patch id after patching */
218 asm volatile("movl %1, %%ecx; rdmsr" 165 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
219 : "=a" (rev)
220 : "i" (0x0000008B) : "ecx");
221
222 spin_unlock_irqrestore(&microcode_update_lock, flags); 166 spin_unlock_irqrestore(&microcode_update_lock, flags);
223 167
224 /* check current patch id and patch's id for match */ 168 /* check current patch id and patch's id for match */
225 if (rev != mc_amd->hdr.patch_id) { 169 if (rev != mc_amd->hdr.patch_id) {
226 printk(KERN_ERR "microcode: CPU%d update from revision " 170 printk(KERN_ERR "microcode: CPU%d: update failed "
227 "0x%x to 0x%x failed\n", cpu_num, 171 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
228 mc_amd->hdr.patch_id, rev);
229 return; 172 return;
230 } 173 }
231 174
232 printk(KERN_INFO "microcode: CPU%d updated from revision " 175 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
233 "0x%x to 0x%x \n", 176 cpu, rev);
234 cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
235 177
236 uci->cpu_sig.rev = rev; 178 uci->cpu_sig.rev = rev;
237} 179}
238 180
239static void * get_next_ucode(u8 *buf, unsigned int size, 181static int get_ucode_data(void *to, const u8 *from, size_t n)
240 int (*get_ucode_data)(void *, const void *, size_t), 182{
241 unsigned int *mc_size) 183 memcpy(to, from, n);
184 return 0;
185}
186
187static void *get_next_ucode(const u8 *buf, unsigned int size,
188 unsigned int *mc_size)
242{ 189{
243 unsigned int total_size; 190 unsigned int total_size;
244#define UCODE_CONTAINER_SECTION_HDR 8
245 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 191 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
246 void *mc; 192 void *mc;
247 193
@@ -249,39 +195,37 @@ static void * get_next_ucode(u8 *buf, unsigned int size,
249 return NULL; 195 return NULL;
250 196
251 if (section_hdr[0] != UCODE_UCODE_TYPE) { 197 if (section_hdr[0] != UCODE_UCODE_TYPE) {
252 printk(KERN_ERR "microcode: error! " 198 printk(KERN_ERR "microcode: error: invalid type field in "
253 "Wrong microcode payload type field\n"); 199 "container file section header\n");
254 return NULL; 200 return NULL;
255 } 201 }
256 202
257 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 203 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
258 204
259 printk(KERN_INFO "microcode: size %u, total_size %u\n", 205 printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
260 size, total_size); 206 size, total_size);
261 207
262 if (total_size > size || total_size > UCODE_MAX_SIZE) { 208 if (total_size > size || total_size > UCODE_MAX_SIZE) {
263 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); 209 printk(KERN_ERR "microcode: error: size mismatch\n");
264 return NULL; 210 return NULL;
265 } 211 }
266 212
267 mc = vmalloc(UCODE_MAX_SIZE); 213 mc = vmalloc(UCODE_MAX_SIZE);
268 if (mc) { 214 if (mc) {
269 memset(mc, 0, UCODE_MAX_SIZE); 215 memset(mc, 0, UCODE_MAX_SIZE);
270 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) { 216 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR,
217 total_size)) {
271 vfree(mc); 218 vfree(mc);
272 mc = NULL; 219 mc = NULL;
273 } else 220 } else
274 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; 221 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
275 } 222 }
276#undef UCODE_CONTAINER_SECTION_HDR
277 return mc; 223 return mc;
278} 224}
279 225
280 226
281static int install_equiv_cpu_table(u8 *buf, 227static int install_equiv_cpu_table(const u8 *buf)
282 int (*get_ucode_data)(void *, const void *, size_t))
283{ 228{
284#define UCODE_CONTAINER_HEADER_SIZE 12
285 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; 229 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
286 unsigned int *buf_pos = (unsigned int *)container_hdr; 230 unsigned int *buf_pos = (unsigned int *)container_hdr;
287 unsigned long size; 231 unsigned long size;
@@ -292,14 +236,15 @@ static int install_equiv_cpu_table(u8 *buf,
292 size = buf_pos[2]; 236 size = buf_pos[2];
293 237
294 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { 238 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
295 printk(KERN_ERR "microcode: error! " 239 printk(KERN_ERR "microcode: error: invalid type field in "
296 "Wrong microcode equivalnet cpu table\n"); 240 "container file section header\n");
297 return 0; 241 return 0;
298 } 242 }
299 243
300 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 244 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
301 if (!equiv_cpu_table) { 245 if (!equiv_cpu_table) {
302 printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n"); 246 printk(KERN_ERR "microcode: failed to allocate "
247 "equivalent CPU table\n");
303 return 0; 248 return 0;
304 } 249 }
305 250
@@ -310,7 +255,6 @@ static int install_equiv_cpu_table(u8 *buf,
310 } 255 }
311 256
312 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 257 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
313#undef UCODE_CONTAINER_HEADER_SIZE
314} 258}
315 259
316static void free_equiv_cpu_table(void) 260static void free_equiv_cpu_table(void)
@@ -321,18 +265,20 @@ static void free_equiv_cpu_table(void)
321 } 265 }
322} 266}
323 267
324static int generic_load_microcode(int cpu, void *data, size_t size, 268static int generic_load_microcode(int cpu, const u8 *data, size_t size)
325 int (*get_ucode_data)(void *, const void *, size_t))
326{ 269{
327 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 270 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
328 u8 *ucode_ptr = data, *new_mc = NULL, *mc; 271 const u8 *ucode_ptr = data;
272 void *new_mc = NULL;
273 void *mc;
329 int new_rev = uci->cpu_sig.rev; 274 int new_rev = uci->cpu_sig.rev;
330 unsigned int leftover; 275 unsigned int leftover;
331 unsigned long offset; 276 unsigned long offset;
332 277
333 offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data); 278 offset = install_equiv_cpu_table(ucode_ptr);
334 if (!offset) { 279 if (!offset) {
335 printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); 280 printk(KERN_ERR "microcode: failed to create "
281 "equivalent cpu table\n");
336 return -EINVAL; 282 return -EINVAL;
337 } 283 }
338 284
@@ -343,7 +289,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
343 unsigned int uninitialized_var(mc_size); 289 unsigned int uninitialized_var(mc_size);
344 struct microcode_header_amd *mc_header; 290 struct microcode_header_amd *mc_header;
345 291
346 mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size); 292 mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
347 if (!mc) 293 if (!mc)
348 break; 294 break;
349 295
@@ -353,7 +299,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
353 vfree(new_mc); 299 vfree(new_mc);
354 new_rev = mc_header->patch_id; 300 new_rev = mc_header->patch_id;
355 new_mc = mc; 301 new_mc = mc;
356 } else 302 } else
357 vfree(mc); 303 vfree(mc);
358 304
359 ucode_ptr += mc_size; 305 ucode_ptr += mc_size;
@@ -365,9 +311,9 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
365 if (uci->mc) 311 if (uci->mc)
366 vfree(uci->mc); 312 vfree(uci->mc);
367 uci->mc = new_mc; 313 uci->mc = new_mc;
368 pr_debug("microcode: CPU%d found a matching microcode update with" 314 pr_debug("microcode: CPU%d found a matching microcode "
369 " version 0x%x (current=0x%x)\n", 315 "update with version 0x%x (current=0x%x)\n",
370 cpu, new_rev, uci->cpu_sig.rev); 316 cpu, new_rev, uci->cpu_sig.rev);
371 } else 317 } else
372 vfree(new_mc); 318 vfree(new_mc);
373 } 319 }
@@ -377,12 +323,6 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
377 return (int)leftover; 323 return (int)leftover;
378} 324}
379 325
380static int get_ucode_fw(void *to, const void *from, size_t n)
381{
382 memcpy(to, from, n);
383 return 0;
384}
385
386static int request_microcode_fw(int cpu, struct device *device) 326static int request_microcode_fw(int cpu, struct device *device)
387{ 327{
388 const char *fw_name = "amd-ucode/microcode_amd.bin"; 328 const char *fw_name = "amd-ucode/microcode_amd.bin";
@@ -394,12 +334,11 @@ static int request_microcode_fw(int cpu, struct device *device)
394 334
395 ret = request_firmware(&firmware, fw_name, device); 335 ret = request_firmware(&firmware, fw_name, device);
396 if (ret) { 336 if (ret) {
397 printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name); 337 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
398 return ret; 338 return ret;
399 } 339 }
400 340
401 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, 341 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
402 &get_ucode_fw);
403 342
404 release_firmware(firmware); 343 release_firmware(firmware);
405 344
@@ -408,8 +347,8 @@ static int request_microcode_fw(int cpu, struct device *device)
408 347
409static int request_microcode_user(int cpu, const void __user *buf, size_t size) 348static int request_microcode_user(int cpu, const void __user *buf, size_t size)
410{ 349{
411 printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode" 350 printk(KERN_INFO "microcode: AMD microcode update via "
412 "is not supported\n"); 351 "/dev/cpu/microcode not supported\n");
413 return -1; 352 return -1;
414} 353}
415 354
@@ -433,3 +372,4 @@ struct microcode_ops * __init init_amd_microcode(void)
433{ 372{
434 return &microcode_amd_ops; 373 return &microcode_amd_ops;
435} 374}
375
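
The rewritten AMD apply path drops the open-coded wrmsr/rdmsr asm in favour of the architectural MSR helpers. Stripped of the locking and the patch-level comparison shown above, the interface it relies on is roughly the sketch below; amd_apply_patch() is an illustrative wrapper, not the driver's apply_microcode_amd().

static u32 amd_apply_patch(struct microcode_amd *mc_amd)
{
	u32 rev, dummy;

	/* MSR 0xc0010020 takes the linear address of the patch header */
	wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);

	/* MSR 0x0000008b reports the patch level now in effect */
	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);

	return rev;
}
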
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 82fb2809ce32..c9b721ba968c 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -99,7 +99,7 @@ MODULE_LICENSE("GPL");
99 99
100#define MICROCODE_VERSION "2.00" 100#define MICROCODE_VERSION "2.00"
101 101
102struct microcode_ops *microcode_ops; 102static struct microcode_ops *microcode_ops;
103 103
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ 104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
105static DEFINE_MUTEX(microcode_mutex); 105static DEFINE_MUTEX(microcode_mutex);
@@ -203,7 +203,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
203#endif 203#endif
204 204
205/* fake device for request_firmware */ 205/* fake device for request_firmware */
206struct platform_device *microcode_pdev; 206static struct platform_device *microcode_pdev;
207 207
208static ssize_t reload_store(struct sys_device *dev, 208static ssize_t reload_store(struct sys_device *dev,
209 struct sysdev_attribute *attr, 209 struct sysdev_attribute *attr,
@@ -272,13 +272,18 @@ static struct attribute_group mc_attr_group = {
272 .name = "microcode", 272 .name = "microcode",
273}; 273};
274 274
275static void microcode_fini_cpu(int cpu) 275static void __microcode_fini_cpu(int cpu)
276{ 276{
277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
278 278
279 mutex_lock(&microcode_mutex);
280 microcode_ops->microcode_fini_cpu(cpu); 279 microcode_ops->microcode_fini_cpu(cpu);
281 uci->valid = 0; 280 uci->valid = 0;
281}
282
283static void microcode_fini_cpu(int cpu)
284{
285 mutex_lock(&microcode_mutex);
286 __microcode_fini_cpu(cpu);
282 mutex_unlock(&microcode_mutex); 287 mutex_unlock(&microcode_mutex);
283} 288}
284 289
@@ -306,12 +311,16 @@ static int microcode_resume_cpu(int cpu)
306 * to this cpu (a bit of paranoia): 311 * to this cpu (a bit of paranoia):
307 */ 312 */
308 if (microcode_ops->collect_cpu_info(cpu, &nsig)) { 313 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
309 microcode_fini_cpu(cpu); 314 __microcode_fini_cpu(cpu);
315 printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
316 cpu);
310 return -1; 317 return -1;
311 } 318 }
312 319
313 if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { 320 if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) {
314 microcode_fini_cpu(cpu); 321 __microcode_fini_cpu(cpu);
322 printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n",
323 cpu);
315 /* Should we look for a new ucode here? */ 324 /* Should we look for a new ucode here? */
316 return 1; 325 return 1;
317 } 326 }
@@ -319,7 +328,7 @@ static int microcode_resume_cpu(int cpu)
319 return 0; 328 return 0;
320} 329}
321 330
322void microcode_update_cpu(int cpu) 331static void microcode_update_cpu(int cpu)
323{ 332{
324 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 333 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
325 int err = 0; 334 int err = 0;
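
The __microcode_fini_cpu()/microcode_fini_cpu() split above follows the usual locked/unlocked pairing: callers that already hold microcode_mutex (the resume path) call the double-underscore variant and avoid taking the lock twice. Reduced to its shape, with illustrative names rather than the driver's own:

static DEFINE_MUTEX(state_mutex);

static void __reset_cpu_state(int cpu)
{
	/* caller holds state_mutex */
	/* ... drop the cached microcode and mark the per-cpu state invalid ... */
}

static void reset_cpu_state(int cpu)
{
	mutex_lock(&state_mutex);
	__reset_cpu_state(cpu);
	mutex_unlock(&state_mutex);
}
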
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 622dc4a21784..5e9f4fc51385 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -87,9 +87,9 @@
87#include <linux/cpu.h> 87#include <linux/cpu.h>
88#include <linux/firmware.h> 88#include <linux/firmware.h>
89#include <linux/platform_device.h> 89#include <linux/platform_device.h>
90#include <linux/uaccess.h>
90 91
91#include <asm/msr.h> 92#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h> 93#include <asm/processor.h>
94#include <asm/microcode.h> 94#include <asm/microcode.h>
95 95
@@ -155,6 +155,7 @@ static DEFINE_SPINLOCK(microcode_update_lock);
155static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) 155static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
156{ 156{
157 struct cpuinfo_x86 *c = &cpu_data(cpu_num); 157 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
158 unsigned long flags;
158 unsigned int val[2]; 159 unsigned int val[2];
159 160
160 memset(csig, 0, sizeof(*csig)); 161 memset(csig, 0, sizeof(*csig));
@@ -174,11 +175,16 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
174 csig->pf = 1 << ((val[1] >> 18) & 7); 175 csig->pf = 1 << ((val[1] >> 18) & 7);
175 } 176 }
176 177
178 /* serialize access to the physical write to MSR 0x79 */
179 spin_lock_irqsave(&microcode_update_lock, flags);
180
177 wrmsr(MSR_IA32_UCODE_REV, 0, 0); 181 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
178 /* see notes above for revision 1.07. Apparent chip bug */ 182 /* see notes above for revision 1.07. Apparent chip bug */
179 sync_core(); 183 sync_core();
180 /* get the current revision from MSR 0x8B */ 184 /* get the current revision from MSR 0x8B */
181 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 185 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
186 spin_unlock_irqrestore(&microcode_update_lock, flags);
187
182 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", 188 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
183 csig->sig, csig->pf, csig->rev); 189 csig->sig, csig->pf, csig->rev);
184 190
@@ -190,7 +196,7 @@ static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
190 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1; 196 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
191} 197}
192 198
193static inline int 199static inline int
194update_match_revision(struct microcode_header_intel *mc_header, int rev) 200update_match_revision(struct microcode_header_intel *mc_header, int rev)
195{ 201{
196 return (mc_header->rev <= rev) ? 0 : 1; 202 return (mc_header->rev <= rev) ? 0 : 1;
@@ -436,8 +442,8 @@ static int request_microcode_fw(int cpu, struct device *device)
436 return ret; 442 return ret;
437 } 443 }
438 444
439 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, 445 ret = generic_load_microcode(cpu, (void *)firmware->data,
440 &get_ucode_fw); 446 firmware->size, &get_ucode_fw);
441 447
442 release_firmware(firmware); 448 release_firmware(firmware);
443 449
@@ -454,7 +460,7 @@ static int request_microcode_user(int cpu, const void __user *buf, size_t size)
454 /* We should bind the task to the CPU */ 460 /* We should bind the task to the CPU */
455 BUG_ON(cpu != raw_smp_processor_id()); 461 BUG_ON(cpu != raw_smp_processor_id());
456 462
457 return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user); 463 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
458} 464}
459 465
460static void microcode_fini_cpu(int cpu) 466static void microcode_fini_cpu(int cpu)
@@ -465,7 +471,7 @@ static void microcode_fini_cpu(int cpu)
465 uci->mc = NULL; 471 uci->mc = NULL;
466} 472}
467 473
468struct microcode_ops microcode_intel_ops = { 474static struct microcode_ops microcode_intel_ops = {
469 .request_microcode_user = request_microcode_user, 475 .request_microcode_user = request_microcode_user,
470 .request_microcode_fw = request_microcode_fw, 476 .request_microcode_fw = request_microcode_fw,
471 .collect_cpu_info = collect_cpu_info, 477 .collect_cpu_info = collect_cpu_info,
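
The spin_lock_irqsave() region added to collect_cpu_info() makes the revision read atomic with respect to a concurrent microcode write that takes the same lock: the write of 0 to MSR_IA32_UCODE_REV, the serializing sync_core() and the rdmsr() must execute as one indivisible sequence. As a standalone sketch (a helper of this name does not exist in the driver):

static u32 intel_ucode_rev(void)
{
	unsigned long flags;
	u32 lo, rev;

	spin_lock_irqsave(&microcode_update_lock, flags);
	wrmsr(MSR_IA32_UCODE_REV, 0, 0);	/* latch the current revision */
	sync_core();				/* required before reading it back */
	rdmsr(MSR_IA32_UCODE_REV, lo, rev);	/* revision is in the high half */
	spin_unlock_irqrestore(&microcode_update_lock, flags);

	return rev;
}
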
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index efc2f361fe85..666e43df51f9 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -13,8 +13,7 @@
13#include <asm/msr.h> 13#include <asm/msr.h>
14#include <asm/acpi.h> 14#include <asm/acpi.h>
15#include <asm/mmconfig.h> 15#include <asm/mmconfig.h>
16 16#include <asm/pci_x86.h>
17#include "../pci/pci.h"
18 17
19struct pci_hostbridge_probe { 18struct pci_hostbridge_probe {
20 u32 bus; 19 u32 bus;
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
index 3db0a5442eb1..0edd819050e7 100644
--- a/arch/x86/kernel/module_32.c
+++ b/arch/x86/kernel/module_32.c
@@ -42,7 +42,7 @@ void module_free(struct module *mod, void *module_region)
42{ 42{
43 vfree(module_region); 43 vfree(module_region);
44 /* FIXME: If module_region == mod->init_region, trim exception 44 /* FIXME: If module_region == mod->init_region, trim exception
45 table entries. */ 45 table entries. */
46} 46}
47 47
48/* We don't need anything special. */ 48/* We don't need anything special. */
@@ -113,13 +113,13 @@ int module_finalize(const Elf_Ehdr *hdr,
113 *para = NULL; 113 *para = NULL;
114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
115 115
116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
117 if (!strcmp(".text", secstrings + s->sh_name)) 117 if (!strcmp(".text", secstrings + s->sh_name))
118 text = s; 118 text = s;
119 if (!strcmp(".altinstructions", secstrings + s->sh_name)) 119 if (!strcmp(".altinstructions", secstrings + s->sh_name))
120 alt = s; 120 alt = s;
121 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 121 if (!strcmp(".smp_locks", secstrings + s->sh_name))
122 locks= s; 122 locks = s;
123 if (!strcmp(".parainstructions", secstrings + s->sh_name)) 123 if (!strcmp(".parainstructions", secstrings + s->sh_name))
124 para = s; 124 para = s;
125 } 125 }
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index 6ba87830d4b1..c23880b90b5c 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -30,14 +30,14 @@
30#include <asm/page.h> 30#include <asm/page.h>
31#include <asm/pgtable.h> 31#include <asm/pgtable.h>
32 32
33#define DEBUGP(fmt...) 33#define DEBUGP(fmt...)
34 34
35#ifndef CONFIG_UML 35#ifndef CONFIG_UML
36void module_free(struct module *mod, void *module_region) 36void module_free(struct module *mod, void *module_region)
37{ 37{
38 vfree(module_region); 38 vfree(module_region);
39 /* FIXME: If module_region == mod->init_region, trim exception 39 /* FIXME: If module_region == mod->init_region, trim exception
40 table entries. */ 40 table entries. */
41} 41}
42 42
43void *module_alloc(unsigned long size) 43void *module_alloc(unsigned long size)
@@ -77,7 +77,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
77 Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; 77 Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
78 Elf64_Sym *sym; 78 Elf64_Sym *sym;
79 void *loc; 79 void *loc;
80 u64 val; 80 u64 val;
81 81
82 DEBUGP("Applying relocate section %u to %u\n", relsec, 82 DEBUGP("Applying relocate section %u to %u\n", relsec,
83 sechdrs[relsec].sh_info); 83 sechdrs[relsec].sh_info);
@@ -91,11 +91,11 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
91 sym = (Elf64_Sym *)sechdrs[symindex].sh_addr 91 sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
92 + ELF64_R_SYM(rel[i].r_info); 92 + ELF64_R_SYM(rel[i].r_info);
93 93
94 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", 94 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
95 (int)ELF64_R_TYPE(rel[i].r_info), 95 (int)ELF64_R_TYPE(rel[i].r_info),
96 sym->st_value, rel[i].r_addend, (u64)loc); 96 sym->st_value, rel[i].r_addend, (u64)loc);
97 97
98 val = sym->st_value + rel[i].r_addend; 98 val = sym->st_value + rel[i].r_addend;
99 99
100 switch (ELF64_R_TYPE(rel[i].r_info)) { 100 switch (ELF64_R_TYPE(rel[i].r_info)) {
101 case R_X86_64_NONE: 101 case R_X86_64_NONE:
@@ -113,16 +113,16 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
113 if ((s64)val != *(s32 *)loc) 113 if ((s64)val != *(s32 *)loc)
114 goto overflow; 114 goto overflow;
115 break; 115 break;
116 case R_X86_64_PC32: 116 case R_X86_64_PC32:
117 val -= (u64)loc; 117 val -= (u64)loc;
118 *(u32 *)loc = val; 118 *(u32 *)loc = val;
119#if 0 119#if 0
120 if ((s64)val != *(s32 *)loc) 120 if ((s64)val != *(s32 *)loc)
121 goto overflow; 121 goto overflow;
122#endif 122#endif
123 break; 123 break;
124 default: 124 default:
125 printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n", 125 printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n",
126 me->name, ELF64_R_TYPE(rel[i].r_info)); 126 me->name, ELF64_R_TYPE(rel[i].r_info));
127 return -ENOEXEC; 127 return -ENOEXEC;
128 } 128 }
@@ -130,7 +130,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
130 return 0; 130 return 0;
131 131
132overflow: 132overflow:
133 printk(KERN_ERR "overflow in relocation type %d val %Lx\n", 133 printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
134 (int)ELF64_R_TYPE(rel[i].r_info), val); 134 (int)ELF64_R_TYPE(rel[i].r_info), val);
135 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", 135 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
136 me->name); 136 me->name);
@@ -143,13 +143,13 @@ int apply_relocate(Elf_Shdr *sechdrs,
143 unsigned int relsec, 143 unsigned int relsec,
144 struct module *me) 144 struct module *me)
145{ 145{
146 printk("non add relocation not supported\n"); 146 printk(KERN_ERR "non add relocation not supported\n");
147 return -ENOSYS; 147 return -ENOSYS;
148} 148}
149 149
150int module_finalize(const Elf_Ehdr *hdr, 150int module_finalize(const Elf_Ehdr *hdr,
151 const Elf_Shdr *sechdrs, 151 const Elf_Shdr *sechdrs,
152 struct module *me) 152 struct module *me)
153{ 153{
154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, 154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
155 *para = NULL; 155 *para = NULL;
@@ -161,7 +161,7 @@ int module_finalize(const Elf_Ehdr *hdr,
161 if (!strcmp(".altinstructions", secstrings + s->sh_name)) 161 if (!strcmp(".altinstructions", secstrings + s->sh_name))
162 alt = s; 162 alt = s;
163 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 163 if (!strcmp(".smp_locks", secstrings + s->sh_name))
164 locks= s; 164 locks = s;
165 if (!strcmp(".parainstructions", secstrings + s->sh_name)) 165 if (!strcmp(".parainstructions", secstrings + s->sh_name))
166 para = s; 166 para = s;
167 } 167 }
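
For the R_X86_64_PC32 case in apply_relocate_add() the arithmetic is val = sym->st_value + r_addend - loc, truncated to the 32-bit field at loc. The sketch below, with illustrative addresses, is not part of the file; pc32_value(0xffffffffa0001000, -4, 0xffffffffa0000ffc) evaluates to 0, which fits trivially.

static u32 pc32_value(u64 st_value, s64 addend, u64 loc)
{
	u64 val = st_value + addend;	/* symbol value plus addend */

	val -= loc;			/* PC-relative distance */
	return (u32)val;		/* what gets written to *(u32 *)loc */
}
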
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index f98f4e1dba09..200764453195 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -2,8 +2,8 @@
2 * Intel Multiprocessor Specification 1.1 and 1.4 2 * Intel Multiprocessor Specification 1.1 and 1.4
3 * compliant MP-table parsing routines. 3 * compliant MP-table parsing routines.
4 * 4 *
5 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> 5 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
6 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> 6 * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
7 * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de> 7 * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
8 */ 8 */
9 9
@@ -16,25 +16,20 @@
16#include <linux/bitops.h> 16#include <linux/bitops.h>
17#include <linux/acpi.h> 17#include <linux/acpi.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/smp.h>
19 20
20#include <asm/smp.h>
21#include <asm/mtrr.h> 21#include <asm/mtrr.h>
22#include <asm/mpspec.h> 22#include <asm/mpspec.h>
23#include <asm/pgalloc.h> 23#include <asm/pgalloc.h>
24#include <asm/io_apic.h> 24#include <asm/io_apic.h>
25#include <asm/proto.h> 25#include <asm/proto.h>
26#include <asm/acpi.h>
27#include <asm/bios_ebda.h> 26#include <asm/bios_ebda.h>
28#include <asm/e820.h> 27#include <asm/e820.h>
29#include <asm/trampoline.h> 28#include <asm/trampoline.h>
30#include <asm/setup.h> 29#include <asm/setup.h>
30#include <asm/smp.h>
31 31
32#include <mach_apic.h> 32#include <asm/genapic.h>
33#ifdef CONFIG_X86_32
34#include <mach_apicdef.h>
35#include <mach_mpparse.h>
36#endif
37
38/* 33/*
39 * Checksum an MP configuration block. 34 * Checksum an MP configuration block.
40 */ 35 */
@@ -49,12 +44,12 @@ static int __init mpf_checksum(unsigned char *mp, int len)
49 return sum & 0xFF; 44 return sum & 0xFF;
50} 45}
51 46
52static void __init MP_processor_info(struct mpc_config_processor *m) 47static void __init MP_processor_info(struct mpc_cpu *m)
53{ 48{
54 int apicid; 49 int apicid;
55 char *bootup_cpu = ""; 50 char *bootup_cpu = "";
56 51
57 if (!(m->mpc_cpuflag & CPU_ENABLED)) { 52 if (!(m->cpuflag & CPU_ENABLED)) {
58 disabled_cpus++; 53 disabled_cpus++;
59 return; 54 return;
60 } 55 }
@@ -62,54 +57,54 @@ static void __init MP_processor_info(struct mpc_config_processor *m)
62 if (x86_quirks->mpc_apic_id) 57 if (x86_quirks->mpc_apic_id)
63 apicid = x86_quirks->mpc_apic_id(m); 58 apicid = x86_quirks->mpc_apic_id(m);
64 else 59 else
65 apicid = m->mpc_apicid; 60 apicid = m->apicid;
66 61
67 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 62 if (m->cpuflag & CPU_BOOTPROCESSOR) {
68 bootup_cpu = " (Bootup-CPU)"; 63 bootup_cpu = " (Bootup-CPU)";
69 boot_cpu_physical_apicid = m->mpc_apicid; 64 boot_cpu_physical_apicid = m->apicid;
70 } 65 }
71 66
72 printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu); 67 printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu);
73 generic_processor_info(apicid, m->mpc_apicver); 68 generic_processor_info(apicid, m->apicver);
74} 69}
75 70
76#ifdef CONFIG_X86_IO_APIC 71#ifdef CONFIG_X86_IO_APIC
77static void __init MP_bus_info(struct mpc_config_bus *m) 72static void __init MP_bus_info(struct mpc_bus *m)
78{ 73{
79 char str[7]; 74 char str[7];
80 memcpy(str, m->mpc_bustype, 6); 75 memcpy(str, m->bustype, 6);
81 str[6] = 0; 76 str[6] = 0;
82 77
83 if (x86_quirks->mpc_oem_bus_info) 78 if (x86_quirks->mpc_oem_bus_info)
84 x86_quirks->mpc_oem_bus_info(m, str); 79 x86_quirks->mpc_oem_bus_info(m, str);
85 else 80 else
86 apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str); 81 apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
87 82
88#if MAX_MP_BUSSES < 256 83#if MAX_MP_BUSSES < 256
89 if (m->mpc_busid >= MAX_MP_BUSSES) { 84 if (m->busid >= MAX_MP_BUSSES) {
90 printk(KERN_WARNING "MP table busid value (%d) for bustype %s " 85 printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
91 " is too large, max. supported is %d\n", 86 " is too large, max. supported is %d\n",
92 m->mpc_busid, str, MAX_MP_BUSSES - 1); 87 m->busid, str, MAX_MP_BUSSES - 1);
93 return; 88 return;
94 } 89 }
95#endif 90#endif
96 91
97 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { 92 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
98 set_bit(m->mpc_busid, mp_bus_not_pci); 93 set_bit(m->busid, mp_bus_not_pci);
99#if defined(CONFIG_EISA) || defined (CONFIG_MCA) 94#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
100 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; 95 mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
101#endif 96#endif
102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 97 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
103 if (x86_quirks->mpc_oem_pci_bus) 98 if (x86_quirks->mpc_oem_pci_bus)
104 x86_quirks->mpc_oem_pci_bus(m); 99 x86_quirks->mpc_oem_pci_bus(m);
105 100
106 clear_bit(m->mpc_busid, mp_bus_not_pci); 101 clear_bit(m->busid, mp_bus_not_pci);
107#if defined(CONFIG_EISA) || defined (CONFIG_MCA) 102#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
108 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; 103 mp_bus_id_to_type[m->busid] = MP_BUS_PCI;
109 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { 104 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
110 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; 105 mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
111 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) { 106 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
112 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; 107 mp_bus_id_to_type[m->busid] = MP_BUS_MCA;
113#endif 108#endif
114 } else 109 } else
115 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 110 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
@@ -133,89 +128,88 @@ static int bad_ioapic(unsigned long address)
133 return 0; 128 return 0;
134} 129}
135 130
136static void __init MP_ioapic_info(struct mpc_config_ioapic *m) 131static void __init MP_ioapic_info(struct mpc_ioapic *m)
137{ 132{
138 if (!(m->mpc_flags & MPC_APIC_USABLE)) 133 if (!(m->flags & MPC_APIC_USABLE))
139 return; 134 return;
140 135
141 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", 136 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
142 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); 137 m->apicid, m->apicver, m->apicaddr);
143 138
144 if (bad_ioapic(m->mpc_apicaddr)) 139 if (bad_ioapic(m->apicaddr))
145 return; 140 return;
146 141
147 mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr; 142 mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
148 mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid; 143 mp_ioapics[nr_ioapics].apicid = m->apicid;
149 mp_ioapics[nr_ioapics].mp_type = m->mpc_type; 144 mp_ioapics[nr_ioapics].type = m->type;
150 mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver; 145 mp_ioapics[nr_ioapics].apicver = m->apicver;
151 mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags; 146 mp_ioapics[nr_ioapics].flags = m->flags;
152 nr_ioapics++; 147 nr_ioapics++;
153} 148}
154 149
155static void print_MP_intsrc_info(struct mpc_config_intsrc *m) 150static void print_MP_intsrc_info(struct mpc_intsrc *m)
156{ 151{
157 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," 152 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
158 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 153 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
159 m->mpc_irqtype, m->mpc_irqflag & 3, 154 m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
160 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, 155 m->srcbusirq, m->dstapic, m->dstirq);
161 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
162} 156}
163 157
164static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) 158static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
165{ 159{
166 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," 160 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
167 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 161 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
168 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, 162 mp_irq->irqtype, mp_irq->irqflag & 3,
169 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, 163 (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
170 mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq); 164 mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
171} 165}
172 166
173static void __init assign_to_mp_irq(struct mpc_config_intsrc *m, 167static void __init assign_to_mp_irq(struct mpc_intsrc *m,
174 struct mp_config_intsrc *mp_irq) 168 struct mpc_intsrc *mp_irq)
175{ 169{
176 mp_irq->mp_dstapic = m->mpc_dstapic; 170 mp_irq->dstapic = m->dstapic;
177 mp_irq->mp_type = m->mpc_type; 171 mp_irq->type = m->type;
178 mp_irq->mp_irqtype = m->mpc_irqtype; 172 mp_irq->irqtype = m->irqtype;
179 mp_irq->mp_irqflag = m->mpc_irqflag; 173 mp_irq->irqflag = m->irqflag;
180 mp_irq->mp_srcbus = m->mpc_srcbus; 174 mp_irq->srcbus = m->srcbus;
181 mp_irq->mp_srcbusirq = m->mpc_srcbusirq; 175 mp_irq->srcbusirq = m->srcbusirq;
182 mp_irq->mp_dstirq = m->mpc_dstirq; 176 mp_irq->dstirq = m->dstirq;
183} 177}
184 178
185static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq, 179static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
186 struct mpc_config_intsrc *m) 180 struct mpc_intsrc *m)
187{ 181{
188 m->mpc_dstapic = mp_irq->mp_dstapic; 182 m->dstapic = mp_irq->dstapic;
189 m->mpc_type = mp_irq->mp_type; 183 m->type = mp_irq->type;
190 m->mpc_irqtype = mp_irq->mp_irqtype; 184 m->irqtype = mp_irq->irqtype;
191 m->mpc_irqflag = mp_irq->mp_irqflag; 185 m->irqflag = mp_irq->irqflag;
192 m->mpc_srcbus = mp_irq->mp_srcbus; 186 m->srcbus = mp_irq->srcbus;
193 m->mpc_srcbusirq = mp_irq->mp_srcbusirq; 187 m->srcbusirq = mp_irq->srcbusirq;
194 m->mpc_dstirq = mp_irq->mp_dstirq; 188 m->dstirq = mp_irq->dstirq;
195} 189}
196 190
197static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq, 191static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
198 struct mpc_config_intsrc *m) 192 struct mpc_intsrc *m)
199{ 193{
200 if (mp_irq->mp_dstapic != m->mpc_dstapic) 194 if (mp_irq->dstapic != m->dstapic)
201 return 1; 195 return 1;
202 if (mp_irq->mp_type != m->mpc_type) 196 if (mp_irq->type != m->type)
203 return 2; 197 return 2;
204 if (mp_irq->mp_irqtype != m->mpc_irqtype) 198 if (mp_irq->irqtype != m->irqtype)
205 return 3; 199 return 3;
206 if (mp_irq->mp_irqflag != m->mpc_irqflag) 200 if (mp_irq->irqflag != m->irqflag)
207 return 4; 201 return 4;
208 if (mp_irq->mp_srcbus != m->mpc_srcbus) 202 if (mp_irq->srcbus != m->srcbus)
209 return 5; 203 return 5;
210 if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq) 204 if (mp_irq->srcbusirq != m->srcbusirq)
211 return 6; 205 return 6;
212 if (mp_irq->mp_dstirq != m->mpc_dstirq) 206 if (mp_irq->dstirq != m->dstirq)
213 return 7; 207 return 7;
214 208
215 return 0; 209 return 0;
216} 210}
217 211
218static void __init MP_intsrc_info(struct mpc_config_intsrc *m) 212static void __init MP_intsrc_info(struct mpc_intsrc *m)
219{ 213{
220 int i; 214 int i;
221 215
@@ -233,57 +227,55 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
233 227
234#endif 228#endif
235 229
236static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) 230static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
237{ 231{
238 apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," 232 apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
239 " IRQ %02x, APIC ID %x, APIC LINT %02x\n", 233 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
240 m->mpc_irqtype, m->mpc_irqflag & 3, 234 m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid,
241 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, 235 m->srcbusirq, m->destapic, m->destapiclint);
242 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
243} 236}
244 237
245/* 238/*
246 * Read/parse the MPC 239 * Read/parse the MPC
247 */ 240 */
248 241
249static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem, 242static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
250 char *str)
251{ 243{
252 244
253 if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) { 245 if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) {
254 printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", 246 printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
255 mpc->mpc_signature[0], mpc->mpc_signature[1], 247 mpc->signature[0], mpc->signature[1],
256 mpc->mpc_signature[2], mpc->mpc_signature[3]); 248 mpc->signature[2], mpc->signature[3]);
257 return 0; 249 return 0;
258 } 250 }
259 if (mpf_checksum((unsigned char *)mpc, mpc->mpc_length)) { 251 if (mpf_checksum((unsigned char *)mpc, mpc->length)) {
260 printk(KERN_ERR "MPTABLE: checksum error!\n"); 252 printk(KERN_ERR "MPTABLE: checksum error!\n");
261 return 0; 253 return 0;
262 } 254 }
263 if (mpc->mpc_spec != 0x01 && mpc->mpc_spec != 0x04) { 255 if (mpc->spec != 0x01 && mpc->spec != 0x04) {
264 printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", 256 printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
265 mpc->mpc_spec); 257 mpc->spec);
266 return 0; 258 return 0;
267 } 259 }
268 if (!mpc->mpc_lapic) { 260 if (!mpc->lapic) {
269 printk(KERN_ERR "MPTABLE: null local APIC address!\n"); 261 printk(KERN_ERR "MPTABLE: null local APIC address!\n");
270 return 0; 262 return 0;
271 } 263 }
272 memcpy(oem, mpc->mpc_oem, 8); 264 memcpy(oem, mpc->oem, 8);
273 oem[8] = 0; 265 oem[8] = 0;
274 printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem); 266 printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
275 267
276 memcpy(str, mpc->mpc_productid, 12); 268 memcpy(str, mpc->productid, 12);
277 str[12] = 0; 269 str[12] = 0;
278 270
279 printk(KERN_INFO "MPTABLE: Product ID: %s\n", str); 271 printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
280 272
281 printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic); 273 printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic);
282 274
283 return 1; 275 return 1;
284} 276}
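/*
 * For reference, a minimal sketch of the mpf_checksum() helper the
 * checks above rely on (assumed shape, not part of this hunk): the MP
 * spec requires every byte of a table to sum to zero modulo 256, so a
 * non-zero return means the table is corrupt.
 */
static int __init mpf_checksum(unsigned char *mp, int len)
{
	int sum = 0;

	while (len--)
		sum += *mp++;

	return sum & 0xFF;	/* 0 means the checksum is valid */
}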
285 277
286static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) 278static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
287{ 279{
288 char str[16]; 280 char str[16];
289 char oem[10]; 281 char oem[10];
@@ -295,27 +287,18 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
295 return 0; 287 return 0;
296 288
297#ifdef CONFIG_X86_32 289#ifdef CONFIG_X86_32
298 /* 290 generic_mps_oem_check(mpc, oem, str);
299 * need to make sure summit and es7000's mps_oem_check is safe to be
300 * called early via genericarch 's mps_oem_check
301 */
302 if (early) {
303#ifdef CONFIG_X86_NUMAQ
304 numaq_mps_oem_check(mpc, oem, str);
305#endif
306 } else
307 mps_oem_check(mpc, oem, str);
308#endif 291#endif
309 /* save the local APIC address, it might be non-default */ 292 /* save the local APIC address, it might be non-default */
310 if (!acpi_lapic) 293 if (!acpi_lapic)
311 mp_lapic_addr = mpc->mpc_lapic; 294 mp_lapic_addr = mpc->lapic;
312 295
313 if (early) 296 if (early)
314 return 1; 297 return 1;
315 298
316 if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) { 299 if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) {
317 struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr; 300 struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr;
318 x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize); 301 x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize);
319 } 302 }
320 303
321 /* 304 /*
@@ -324,12 +307,11 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
324 if (x86_quirks->mpc_record) 307 if (x86_quirks->mpc_record)
325 *x86_quirks->mpc_record = 0; 308 *x86_quirks->mpc_record = 0;
326 309
327 while (count < mpc->mpc_length) { 310 while (count < mpc->length) {
328 switch (*mpt) { 311 switch (*mpt) {
329 case MP_PROCESSOR: 312 case MP_PROCESSOR:
330 { 313 {
331 struct mpc_config_processor *m = 314 struct mpc_cpu *m = (struct mpc_cpu *)mpt;
332 (struct mpc_config_processor *)mpt;
333 /* ACPI may have already provided this data */ 315 /* ACPI may have already provided this data */
334 if (!acpi_lapic) 316 if (!acpi_lapic)
335 MP_processor_info(m); 317 MP_processor_info(m);
@@ -339,8 +321,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
339 } 321 }
340 case MP_BUS: 322 case MP_BUS:
341 { 323 {
342 struct mpc_config_bus *m = 324 struct mpc_bus *m = (struct mpc_bus *)mpt;
343 (struct mpc_config_bus *)mpt;
344#ifdef CONFIG_X86_IO_APIC 325#ifdef CONFIG_X86_IO_APIC
345 MP_bus_info(m); 326 MP_bus_info(m);
346#endif 327#endif
@@ -351,30 +332,28 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
351 case MP_IOAPIC: 332 case MP_IOAPIC:
352 { 333 {
353#ifdef CONFIG_X86_IO_APIC 334#ifdef CONFIG_X86_IO_APIC
354 struct mpc_config_ioapic *m = 335 struct mpc_ioapic *m = (struct mpc_ioapic *)mpt;
355 (struct mpc_config_ioapic *)mpt;
356 MP_ioapic_info(m); 336 MP_ioapic_info(m);
357#endif 337#endif
358 mpt += sizeof(struct mpc_config_ioapic); 338 mpt += sizeof(struct mpc_ioapic);
359 count += sizeof(struct mpc_config_ioapic); 339 count += sizeof(struct mpc_ioapic);
360 break; 340 break;
361 } 341 }
362 case MP_INTSRC: 342 case MP_INTSRC:
363 { 343 {
364#ifdef CONFIG_X86_IO_APIC 344#ifdef CONFIG_X86_IO_APIC
365 struct mpc_config_intsrc *m = 345 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
366 (struct mpc_config_intsrc *)mpt;
367 346
368 MP_intsrc_info(m); 347 MP_intsrc_info(m);
369#endif 348#endif
370 mpt += sizeof(struct mpc_config_intsrc); 349 mpt += sizeof(struct mpc_intsrc);
371 count += sizeof(struct mpc_config_intsrc); 350 count += sizeof(struct mpc_intsrc);
372 break; 351 break;
373 } 352 }
374 case MP_LINTSRC: 353 case MP_LINTSRC:
375 { 354 {
376 struct mpc_config_lintsrc *m = 355 struct mpc_lintsrc *m =
377 (struct mpc_config_lintsrc *)mpt; 356 (struct mpc_lintsrc *)mpt;
378 MP_lintsrc_info(m); 357 MP_lintsrc_info(m);
379 mpt += sizeof(*m); 358 mpt += sizeof(*m);
380 count += sizeof(*m); 359 count += sizeof(*m);
@@ -385,21 +364,21 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
385 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); 364 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
386 printk(KERN_ERR "type %x\n", *mpt); 365 printk(KERN_ERR "type %x\n", *mpt);
387 print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, 366 print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
388 1, mpc, mpc->mpc_length, 1); 367 1, mpc, mpc->length, 1);
389 count = mpc->mpc_length; 368 count = mpc->length;
390 break; 369 break;
391 } 370 }
392 if (x86_quirks->mpc_record) 371 if (x86_quirks->mpc_record)
393 (*x86_quirks->mpc_record)++; 372 (*x86_quirks->mpc_record)++;
394 } 373 }
395 374
396#ifdef CONFIG_X86_GENERICARCH 375#ifdef CONFIG_X86_BIGSMP
397 generic_bigsmp_probe(); 376 generic_bigsmp_probe();
398#endif 377#endif
399 378
400#ifdef CONFIG_X86_32 379 if (apic->setup_apic_routing)
401 setup_apic_routing(); 380 apic->setup_apic_routing();
402#endif 381
403 if (!num_processors) 382 if (!num_processors)
404 printk(KERN_ERR "MPTABLE: no processors registered!\n"); 383 printk(KERN_ERR "MPTABLE: no processors registered!\n");
405 return num_processors; 384 return num_processors;
@@ -417,16 +396,16 @@ static int __init ELCR_trigger(unsigned int irq)
417 396
418static void __init construct_default_ioirq_mptable(int mpc_default_type) 397static void __init construct_default_ioirq_mptable(int mpc_default_type)
419{ 398{
420 struct mpc_config_intsrc intsrc; 399 struct mpc_intsrc intsrc;
421 int i; 400 int i;
422 int ELCR_fallback = 0; 401 int ELCR_fallback = 0;
423 402
424 intsrc.mpc_type = MP_INTSRC; 403 intsrc.type = MP_INTSRC;
425 intsrc.mpc_irqflag = 0; /* conforming */ 404 intsrc.irqflag = 0; /* conforming */
426 intsrc.mpc_srcbus = 0; 405 intsrc.srcbus = 0;
427 intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid; 406 intsrc.dstapic = mp_ioapics[0].apicid;
428 407
429 intsrc.mpc_irqtype = mp_INT; 408 intsrc.irqtype = mp_INT;
430 409
431 /* 410 /*
432 * If true, we have an ISA/PCI system with no IRQ entries 411 * If true, we have an ISA/PCI system with no IRQ entries
@@ -469,30 +448,30 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
469 * irqflag field (level sensitive, active high polarity). 448 * irqflag field (level sensitive, active high polarity).
470 */ 449 */
471 if (ELCR_trigger(i)) 450 if (ELCR_trigger(i))
472 intsrc.mpc_irqflag = 13; 451 intsrc.irqflag = 13;
473 else 452 else
474 intsrc.mpc_irqflag = 0; 453 intsrc.irqflag = 0;
475 } 454 }
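		/*
		 * irqflag encoding per the MP spec: bits 1:0 carry the
		 * polarity and bits 3:2 the trigger mode, so 13 (binary
		 * 1101) means level-triggered, active-high, while 0 leaves
		 * both "conforming to the bus".
		 */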
476 455
477 intsrc.mpc_srcbusirq = i; 456 intsrc.srcbusirq = i;
478 intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ 457 intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
479 MP_intsrc_info(&intsrc); 458 MP_intsrc_info(&intsrc);
480 } 459 }
481 460
482 intsrc.mpc_irqtype = mp_ExtINT; 461 intsrc.irqtype = mp_ExtINT;
483 intsrc.mpc_srcbusirq = 0; 462 intsrc.srcbusirq = 0;
484 intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ 463 intsrc.dstirq = 0; /* 8259A to INTIN0 */
485 MP_intsrc_info(&intsrc); 464 MP_intsrc_info(&intsrc);
486} 465}
487 466
488 467
489static void __init construct_ioapic_table(int mpc_default_type) 468static void __init construct_ioapic_table(int mpc_default_type)
490{ 469{
491 struct mpc_config_ioapic ioapic; 470 struct mpc_ioapic ioapic;
492 struct mpc_config_bus bus; 471 struct mpc_bus bus;
493 472
494 bus.mpc_type = MP_BUS; 473 bus.type = MP_BUS;
495 bus.mpc_busid = 0; 474 bus.busid = 0;
496 switch (mpc_default_type) { 475 switch (mpc_default_type) {
497 default: 476 default:
498 printk(KERN_ERR "???\nUnknown standard configuration %d\n", 477 printk(KERN_ERR "???\nUnknown standard configuration %d\n",
@@ -500,29 +479,29 @@ static void __init construct_ioapic_table(int mpc_default_type)
500 /* fall through */ 479 /* fall through */
501 case 1: 480 case 1:
502 case 5: 481 case 5:
503 memcpy(bus.mpc_bustype, "ISA ", 6); 482 memcpy(bus.bustype, "ISA ", 6);
504 break; 483 break;
505 case 2: 484 case 2:
506 case 6: 485 case 6:
507 case 3: 486 case 3:
508 memcpy(bus.mpc_bustype, "EISA ", 6); 487 memcpy(bus.bustype, "EISA ", 6);
509 break; 488 break;
510 case 4: 489 case 4:
511 case 7: 490 case 7:
512 memcpy(bus.mpc_bustype, "MCA ", 6); 491 memcpy(bus.bustype, "MCA ", 6);
513 } 492 }
514 MP_bus_info(&bus); 493 MP_bus_info(&bus);
515 if (mpc_default_type > 4) { 494 if (mpc_default_type > 4) {
516 bus.mpc_busid = 1; 495 bus.busid = 1;
517 memcpy(bus.mpc_bustype, "PCI ", 6); 496 memcpy(bus.bustype, "PCI ", 6);
518 MP_bus_info(&bus); 497 MP_bus_info(&bus);
519 } 498 }
520 499
521 ioapic.mpc_type = MP_IOAPIC; 500 ioapic.type = MP_IOAPIC;
522 ioapic.mpc_apicid = 2; 501 ioapic.apicid = 2;
523 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; 502 ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
524 ioapic.mpc_flags = MPC_APIC_USABLE; 503 ioapic.flags = MPC_APIC_USABLE;
525 ioapic.mpc_apicaddr = 0xFEC00000; 504 ioapic.apicaddr = 0xFEC00000;
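	/*
	 * 0xFEC00000 is the architectural default I/O APIC base address;
	 * the version presumably follows the same convention as the CPU
	 * entries (0x10 for an integrated APIC, 0x01 for a discrete
	 * 82489DX).
	 */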
526 MP_ioapic_info(&ioapic); 505 MP_ioapic_info(&ioapic);
527 506
528 /* 507 /*
@@ -536,8 +515,8 @@ static inline void __init construct_ioapic_table(int mpc_default_type) { }
536 515
537static inline void __init construct_default_ISA_mptable(int mpc_default_type) 516static inline void __init construct_default_ISA_mptable(int mpc_default_type)
538{ 517{
539 struct mpc_config_processor processor; 518 struct mpc_cpu processor;
540 struct mpc_config_lintsrc lintsrc; 519 struct mpc_lintsrc lintsrc;
541 int linttypes[2] = { mp_ExtINT, mp_NMI }; 520 int linttypes[2] = { mp_ExtINT, mp_NMI };
542 int i; 521 int i;
543 522
@@ -549,65 +528,65 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
549 /* 528 /*
550 * 2 CPUs, numbered 0 & 1. 529 * 2 CPUs, numbered 0 & 1.
551 */ 530 */
552 processor.mpc_type = MP_PROCESSOR; 531 processor.type = MP_PROCESSOR;
553 /* Either an integrated APIC or a discrete 82489DX. */ 532 /* Either an integrated APIC or a discrete 82489DX. */
554 processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; 533 processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
555 processor.mpc_cpuflag = CPU_ENABLED; 534 processor.cpuflag = CPU_ENABLED;
556 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | 535 processor.cpufeature = (boot_cpu_data.x86 << 8) |
557 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; 536 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
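	/*
	 * CPUID-style signature: family in bits 11:8, model in bits 7:4,
	 * stepping (x86_mask) in bits 3:0, as the MP CPU entry expects.
	 */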
558 processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; 537 processor.featureflag = boot_cpu_data.x86_capability[0];
559 processor.mpc_reserved[0] = 0; 538 processor.reserved[0] = 0;
560 processor.mpc_reserved[1] = 0; 539 processor.reserved[1] = 0;
561 for (i = 0; i < 2; i++) { 540 for (i = 0; i < 2; i++) {
562 processor.mpc_apicid = i; 541 processor.apicid = i;
563 MP_processor_info(&processor); 542 MP_processor_info(&processor);
564 } 543 }
565 544
566 construct_ioapic_table(mpc_default_type); 545 construct_ioapic_table(mpc_default_type);
567 546
568 lintsrc.mpc_type = MP_LINTSRC; 547 lintsrc.type = MP_LINTSRC;
569 lintsrc.mpc_irqflag = 0; /* conforming */ 548 lintsrc.irqflag = 0; /* conforming */
570 lintsrc.mpc_srcbusid = 0; 549 lintsrc.srcbusid = 0;
571 lintsrc.mpc_srcbusirq = 0; 550 lintsrc.srcbusirq = 0;
572 lintsrc.mpc_destapic = MP_APIC_ALL; 551 lintsrc.destapic = MP_APIC_ALL;
573 for (i = 0; i < 2; i++) { 552 for (i = 0; i < 2; i++) {
574 lintsrc.mpc_irqtype = linttypes[i]; 553 lintsrc.irqtype = linttypes[i];
575 lintsrc.mpc_destapiclint = i; 554 lintsrc.destapiclint = i;
576 MP_lintsrc_info(&lintsrc); 555 MP_lintsrc_info(&lintsrc);
577 } 556 }
578} 557}
579 558
580static struct intel_mp_floating *mpf_found; 559static struct mpf_intel *mpf_found;
581 560
582/* 561/*
583 * Scan the memory blocks for an SMP configuration block. 562 * Scan the memory blocks for an SMP configuration block.
584 */ 563 */
585static void __init __get_smp_config(unsigned int early) 564static void __init __get_smp_config(unsigned int early)
586{ 565{
587 struct intel_mp_floating *mpf = mpf_found; 566 struct mpf_intel *mpf = mpf_found;
567
568 if (!mpf)
569 return;
588 570
589 if (x86_quirks->mach_get_smp_config) {
590 if (x86_quirks->mach_get_smp_config(early))
591 return;
592 }
593 if (acpi_lapic && early) 571 if (acpi_lapic && early)
594 return; 572 return;
573
595 /* 574 /*
 596 * ACPI supports both logical (e.g. Hyper-Threading) and physical 575 * MPS doesn't support hyperthreading; the MPS table
 597 * processors, where MPS only supports physical. 576 * lists only the thread 0 APIC ID of each core
598 */ 577 */
599 if (acpi_lapic && acpi_ioapic) { 578 if (acpi_lapic && acpi_ioapic)
600 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
601 "information\n");
602 return; 579 return;
603 } else if (acpi_lapic) 580
604 printk(KERN_INFO "Using ACPI for processor (LAPIC) " 581 if (x86_quirks->mach_get_smp_config) {
605 "configuration information\n"); 582 if (x86_quirks->mach_get_smp_config(early))
583 return;
584 }
606 585
607 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", 586 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
608 mpf->mpf_specification); 587 mpf->specification);
609#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) 588#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
610 if (mpf->mpf_feature2 & (1 << 7)) { 589 if (mpf->feature2 & (1 << 7)) {
611 printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); 590 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
612 pic_mode = 1; 591 pic_mode = 1;
613 } else { 592 } else {
@@ -618,7 +597,7 @@ static void __init __get_smp_config(unsigned int early)
618 /* 597 /*
619 * Now see if we need to read further. 598 * Now see if we need to read further.
620 */ 599 */
621 if (mpf->mpf_feature1 != 0) { 600 if (mpf->feature1 != 0) {
622 if (early) { 601 if (early) {
623 /* 602 /*
624 * local APIC has default address 603 * local APIC has default address
@@ -628,16 +607,16 @@ static void __init __get_smp_config(unsigned int early)
628 } 607 }
629 608
630 printk(KERN_INFO "Default MP configuration #%d\n", 609 printk(KERN_INFO "Default MP configuration #%d\n",
631 mpf->mpf_feature1); 610 mpf->feature1);
632 construct_default_ISA_mptable(mpf->mpf_feature1); 611 construct_default_ISA_mptable(mpf->feature1);
633 612
634 } else if (mpf->mpf_physptr) { 613 } else if (mpf->physptr) {
635 614
636 /* 615 /*
637 * Read the physical hardware table. Anything here will 616 * Read the physical hardware table. Anything here will
638 * override the defaults. 617 * override the defaults.
639 */ 618 */
640 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) { 619 if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) {
641#ifdef CONFIG_X86_LOCAL_APIC 620#ifdef CONFIG_X86_LOCAL_APIC
642 smp_found_config = 0; 621 smp_found_config = 0;
643#endif 622#endif
@@ -657,15 +636,15 @@ static void __init __get_smp_config(unsigned int early)
657 * ISA defaults and hope it will work. 636 * ISA defaults and hope it will work.
658 */ 637 */
659 if (!mp_irq_entries) { 638 if (!mp_irq_entries) {
660 struct mpc_config_bus bus; 639 struct mpc_bus bus;
661 640
662 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " 641 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
663 "using default mptable. " 642 "using default mptable. "
664 "(tell your hw vendor)\n"); 643 "(tell your hw vendor)\n");
665 644
666 bus.mpc_type = MP_BUS; 645 bus.type = MP_BUS;
667 bus.mpc_busid = 0; 646 bus.busid = 0;
668 memcpy(bus.mpc_bustype, "ISA ", 6); 647 memcpy(bus.bustype, "ISA ", 6);
669 MP_bus_info(&bus); 648 MP_bus_info(&bus);
670 649
671 construct_default_ioirq_mptable(0); 650 construct_default_ioirq_mptable(0);
@@ -695,32 +674,32 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
695 unsigned reserve) 674 unsigned reserve)
696{ 675{
697 unsigned int *bp = phys_to_virt(base); 676 unsigned int *bp = phys_to_virt(base);
698 struct intel_mp_floating *mpf; 677 struct mpf_intel *mpf;
699 678
700 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", 679 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
701 bp, length); 680 bp, length);
702 BUILD_BUG_ON(sizeof(*mpf) != 16); 681 BUILD_BUG_ON(sizeof(*mpf) != 16);
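	/*
	 * The MP floating pointer structure is exactly 16 bytes: its
	 * signature is the 4-byte string "_MP_" and its length field
	 * counts 16-byte paragraphs, hence the mpf->length == 1 test below.
	 */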
703 682
704 while (length > 0) { 683 while (length > 0) {
705 mpf = (struct intel_mp_floating *)bp; 684 mpf = (struct mpf_intel *)bp;
706 if ((*bp == SMP_MAGIC_IDENT) && 685 if ((*bp == SMP_MAGIC_IDENT) &&
707 (mpf->mpf_length == 1) && 686 (mpf->length == 1) &&
708 !mpf_checksum((unsigned char *)bp, 16) && 687 !mpf_checksum((unsigned char *)bp, 16) &&
709 ((mpf->mpf_specification == 1) 688 ((mpf->specification == 1)
710 || (mpf->mpf_specification == 4))) { 689 || (mpf->specification == 4))) {
711#ifdef CONFIG_X86_LOCAL_APIC 690#ifdef CONFIG_X86_LOCAL_APIC
712 smp_found_config = 1; 691 smp_found_config = 1;
713#endif 692#endif
714 mpf_found = mpf; 693 mpf_found = mpf;
715 694
716 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", 695 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
717 mpf, virt_to_phys(mpf)); 696 mpf, (u64)virt_to_phys(mpf));
718 697
719 if (!reserve) 698 if (!reserve)
720 return 1; 699 return 1;
721 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, 700 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
722 BOOTMEM_DEFAULT); 701 BOOTMEM_DEFAULT);
723 if (mpf->mpf_physptr) { 702 if (mpf->physptr) {
724 unsigned long size = PAGE_SIZE; 703 unsigned long size = PAGE_SIZE;
725#ifdef CONFIG_X86_32 704#ifdef CONFIG_X86_32
726 /* 705 /*
@@ -729,14 +708,14 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
729 * the bottom is mapped now. 708 * the bottom is mapped now.
 730 * PC-9800's MPC table is placed at the very end 709 * PC-9800's MPC table is placed at the very end
 731 * of physical memory, so simply reserving 710 * of physical memory, so simply reserving
 732 * PAGE_SIZE from mpf->mpf_physptr yields BUG() 711 * PAGE_SIZE from mpf->physptr yields BUG()
733 * in reserve_bootmem. 712 * in reserve_bootmem.
734 */ 713 */
735 unsigned long end = max_low_pfn * PAGE_SIZE; 714 unsigned long end = max_low_pfn * PAGE_SIZE;
736 if (mpf->mpf_physptr + size > end) 715 if (mpf->physptr + size > end)
737 size = end - mpf->mpf_physptr; 716 size = end - mpf->physptr;
738#endif 717#endif
739 reserve_bootmem_generic(mpf->mpf_physptr, size, 718 reserve_bootmem_generic(mpf->physptr, size,
740 BOOTMEM_DEFAULT); 719 BOOTMEM_DEFAULT);
741 } 720 }
742 721
@@ -803,28 +782,28 @@ void __init find_smp_config(void)
803#ifdef CONFIG_X86_IO_APIC 782#ifdef CONFIG_X86_IO_APIC
804static u8 __initdata irq_used[MAX_IRQ_SOURCES]; 783static u8 __initdata irq_used[MAX_IRQ_SOURCES];
805 784
806static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m) 785static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
807{ 786{
808 int i; 787 int i;
809 788
810 if (m->mpc_irqtype != mp_INT) 789 if (m->irqtype != mp_INT)
811 return 0; 790 return 0;
812 791
813 if (m->mpc_irqflag != 0x0f) 792 if (m->irqflag != 0x0f)
814 return 0; 793 return 0;
815 794
816 /* not legacy */ 795 /* not legacy */
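	/*
	 * An irqflag of 0x0f decodes to level-triggered (bits 3:2 == 11),
	 * active-low (bits 1:0 == 11), i.e. the usual PCI setting; only
	 * such entries are candidates for reuse here.
	 */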
817 796
818 for (i = 0; i < mp_irq_entries; i++) { 797 for (i = 0; i < mp_irq_entries; i++) {
819 if (mp_irqs[i].mp_irqtype != mp_INT) 798 if (mp_irqs[i].irqtype != mp_INT)
820 continue; 799 continue;
821 800
822 if (mp_irqs[i].mp_irqflag != 0x0f) 801 if (mp_irqs[i].irqflag != 0x0f)
823 continue; 802 continue;
824 803
825 if (mp_irqs[i].mp_srcbus != m->mpc_srcbus) 804 if (mp_irqs[i].srcbus != m->srcbus)
826 continue; 805 continue;
827 if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq) 806 if (mp_irqs[i].srcbusirq != m->srcbusirq)
828 continue; 807 continue;
829 if (irq_used[i]) { 808 if (irq_used[i]) {
830 /* already claimed */ 809 /* already claimed */
@@ -840,10 +819,10 @@ static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
840 819
841#define SPARE_SLOT_NUM 20 820#define SPARE_SLOT_NUM 20
842 821
843static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; 822static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
844#endif 823#endif
845 824
846static int __init replace_intsrc_all(struct mp_config_table *mpc, 825static int __init replace_intsrc_all(struct mpc_table *mpc,
847 unsigned long mpc_new_phys, 826 unsigned long mpc_new_phys,
848 unsigned long mpc_new_length) 827 unsigned long mpc_new_length)
849{ 828{
@@ -855,36 +834,33 @@ static int __init replace_intsrc_all(struct mp_config_table *mpc,
855 int count = sizeof(*mpc); 834 int count = sizeof(*mpc);
856 unsigned char *mpt = ((unsigned char *)mpc) + count; 835 unsigned char *mpt = ((unsigned char *)mpc) + count;
857 836
858 printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length); 837 printk(KERN_INFO "mpc_length %x\n", mpc->length);
859 while (count < mpc->mpc_length) { 838 while (count < mpc->length) {
860 switch (*mpt) { 839 switch (*mpt) {
861 case MP_PROCESSOR: 840 case MP_PROCESSOR:
862 { 841 {
863 struct mpc_config_processor *m = 842 struct mpc_cpu *m = (struct mpc_cpu *)mpt;
864 (struct mpc_config_processor *)mpt;
865 mpt += sizeof(*m); 843 mpt += sizeof(*m);
866 count += sizeof(*m); 844 count += sizeof(*m);
867 break; 845 break;
868 } 846 }
869 case MP_BUS: 847 case MP_BUS:
870 { 848 {
871 struct mpc_config_bus *m = 849 struct mpc_bus *m = (struct mpc_bus *)mpt;
872 (struct mpc_config_bus *)mpt;
873 mpt += sizeof(*m); 850 mpt += sizeof(*m);
874 count += sizeof(*m); 851 count += sizeof(*m);
875 break; 852 break;
876 } 853 }
877 case MP_IOAPIC: 854 case MP_IOAPIC:
878 { 855 {
879 mpt += sizeof(struct mpc_config_ioapic); 856 mpt += sizeof(struct mpc_ioapic);
880 count += sizeof(struct mpc_config_ioapic); 857 count += sizeof(struct mpc_ioapic);
881 break; 858 break;
882 } 859 }
883 case MP_INTSRC: 860 case MP_INTSRC:
884 { 861 {
885#ifdef CONFIG_X86_IO_APIC 862#ifdef CONFIG_X86_IO_APIC
886 struct mpc_config_intsrc *m = 863 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
887 (struct mpc_config_intsrc *)mpt;
888 864
889 printk(KERN_INFO "OLD "); 865 printk(KERN_INFO "OLD ");
890 print_MP_intsrc_info(m); 866 print_MP_intsrc_info(m);
@@ -905,14 +881,14 @@ static int __init replace_intsrc_all(struct mp_config_table *mpc,
905 nr_m_spare++; 881 nr_m_spare++;
906 } 882 }
907#endif 883#endif
908 mpt += sizeof(struct mpc_config_intsrc); 884 mpt += sizeof(struct mpc_intsrc);
909 count += sizeof(struct mpc_config_intsrc); 885 count += sizeof(struct mpc_intsrc);
910 break; 886 break;
911 } 887 }
912 case MP_LINTSRC: 888 case MP_LINTSRC:
913 { 889 {
914 struct mpc_config_lintsrc *m = 890 struct mpc_lintsrc *m =
915 (struct mpc_config_lintsrc *)mpt; 891 (struct mpc_lintsrc *)mpt;
916 mpt += sizeof(*m); 892 mpt += sizeof(*m);
917 count += sizeof(*m); 893 count += sizeof(*m);
918 break; 894 break;
@@ -922,7 +898,7 @@ static int __init replace_intsrc_all(struct mp_config_table *mpc,
922 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); 898 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
923 printk(KERN_ERR "type %x\n", *mpt); 899 printk(KERN_ERR "type %x\n", *mpt);
924 print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, 900 print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
925 1, mpc, mpc->mpc_length, 1); 901 1, mpc, mpc->length, 1);
926 goto out; 902 goto out;
927 } 903 }
928 } 904 }
@@ -932,10 +908,10 @@ static int __init replace_intsrc_all(struct mp_config_table *mpc,
932 if (irq_used[i]) 908 if (irq_used[i])
933 continue; 909 continue;
934 910
935 if (mp_irqs[i].mp_irqtype != mp_INT) 911 if (mp_irqs[i].irqtype != mp_INT)
936 continue; 912 continue;
937 913
938 if (mp_irqs[i].mp_irqflag != 0x0f) 914 if (mp_irqs[i].irqflag != 0x0f)
939 continue; 915 continue;
940 916
941 if (nr_m_spare > 0) { 917 if (nr_m_spare > 0) {
@@ -944,9 +920,8 @@ static int __init replace_intsrc_all(struct mp_config_table *mpc,
944 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); 920 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
945 m_spare[nr_m_spare] = NULL; 921 m_spare[nr_m_spare] = NULL;
946 } else { 922 } else {
947 struct mpc_config_intsrc *m = 923 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
948 (struct mpc_config_intsrc *)mpt; 924 count += sizeof(struct mpc_intsrc);
949 count += sizeof(struct mpc_config_intsrc);
950 if (!mpc_new_phys) { 925 if (!mpc_new_phys) {
951 printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count); 926 printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
952 } else { 927 } else {
@@ -958,17 +933,16 @@ static int __init replace_intsrc_all(struct mp_config_table *mpc,
958 } 933 }
959 } 934 }
960 assign_to_mpc_intsrc(&mp_irqs[i], m); 935 assign_to_mpc_intsrc(&mp_irqs[i], m);
961 mpc->mpc_length = count; 936 mpc->length = count;
962 mpt += sizeof(struct mpc_config_intsrc); 937 mpt += sizeof(struct mpc_intsrc);
963 } 938 }
964 print_mp_irq_info(&mp_irqs[i]); 939 print_mp_irq_info(&mp_irqs[i]);
965 } 940 }
966#endif 941#endif
967out: 942out:
968 /* update checksum */ 943 /* update checksum */
969 mpc->mpc_checksum = 0; 944 mpc->checksum = 0;
970 mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc, 945 mpc->checksum -= mpf_checksum((unsigned char *)mpc, mpc->length);
971 mpc->mpc_length);
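	/*
	 * With the field zeroed first, subtracting the byte sum makes the
	 * whole table sum to zero modulo 256 again, which is what the
	 * mpf_checksum() verification expects.
	 */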
972 946
973 return 0; 947 return 0;
974} 948}
@@ -1013,9 +987,8 @@ static int __init update_mp_table(void)
1013{ 987{
1014 char str[16]; 988 char str[16];
1015 char oem[10]; 989 char oem[10];
1016 struct intel_mp_floating *mpf; 990 struct mpf_intel *mpf;
1017 struct mp_config_table *mpc; 991 struct mpc_table *mpc, *mpc_new;
1018 struct mp_config_table *mpc_new;
1019 992
1020 if (!enable_update_mptable) 993 if (!enable_update_mptable)
1021 return 0; 994 return 0;
@@ -1027,21 +1000,21 @@ static int __init update_mp_table(void)
1027 /* 1000 /*
1028 * Now see if we need to go further. 1001 * Now see if we need to go further.
1029 */ 1002 */
1030 if (mpf->mpf_feature1 != 0) 1003 if (mpf->feature1 != 0)
1031 return 0; 1004 return 0;
1032 1005
1033 if (!mpf->mpf_physptr) 1006 if (!mpf->physptr)
1034 return 0; 1007 return 0;
1035 1008
1036 mpc = phys_to_virt(mpf->mpf_physptr); 1009 mpc = phys_to_virt(mpf->physptr);
1037 1010
1038 if (!smp_check_mpc(mpc, oem, str)) 1011 if (!smp_check_mpc(mpc, oem, str))
1039 return 0; 1012 return 0;
1040 1013
1041 printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf)); 1014 printk(KERN_INFO "mpf: %llx\n", (u64)virt_to_phys(mpf));
1042 printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr); 1015 printk(KERN_INFO "physptr: %x\n", mpf->physptr);
1043 1016
1044 if (mpc_new_phys && mpc->mpc_length > mpc_new_length) { 1017 if (mpc_new_phys && mpc->length > mpc_new_length) {
1045 mpc_new_phys = 0; 1018 mpc_new_phys = 0;
1046 printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n", 1019 printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
1047 mpc_new_length); 1020 mpc_new_length);
@@ -1050,33 +1023,33 @@ static int __init update_mp_table(void)
1050 if (!mpc_new_phys) { 1023 if (!mpc_new_phys) {
1051 unsigned char old, new; 1024 unsigned char old, new;
 1052 /* check if we can change the position */ 1025 /* check if we can change the position */
1053 mpc->mpc_checksum = 0; 1026 mpc->checksum = 0;
1054 old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length); 1027 old = mpf_checksum((unsigned char *)mpc, mpc->length);
1055 mpc->mpc_checksum = 0xff; 1028 mpc->checksum = 0xff;
1056 new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length); 1029 new = mpf_checksum((unsigned char *)mpc, mpc->length);
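		/*
		 * Two different values (0 and 0xff) were just written into
		 * the checksum byte; if the recomputed sums still match,
		 * the stores never landed and the table lives in read-only
		 * memory, so it cannot be patched in place.
		 */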
1057 if (old == new) { 1030 if (old == new) {
1058 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); 1031 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
1059 return 0; 1032 return 0;
1060 } 1033 }
1061 printk(KERN_INFO "use in-positon replacing\n"); 1034 printk(KERN_INFO "use in-positon replacing\n");
1062 } else { 1035 } else {
1063 mpf->mpf_physptr = mpc_new_phys; 1036 mpf->physptr = mpc_new_phys;
1064 mpc_new = phys_to_virt(mpc_new_phys); 1037 mpc_new = phys_to_virt(mpc_new_phys);
1065 memcpy(mpc_new, mpc, mpc->mpc_length); 1038 memcpy(mpc_new, mpc, mpc->length);
1066 mpc = mpc_new; 1039 mpc = mpc_new;
1067 /* check if we can modify that */ 1040 /* check if we can modify that */
1068 if (mpc_new_phys - mpf->mpf_physptr) { 1041 if (mpc_new_phys - mpf->physptr) {
1069 struct intel_mp_floating *mpf_new; 1042 struct mpf_intel *mpf_new;
1070 /* steal 16 bytes from [0, 1k) */ 1043 /* steal 16 bytes from [0, 1k) */
1071 printk(KERN_INFO "mpf new: %x\n", 0x400 - 16); 1044 printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
1072 mpf_new = phys_to_virt(0x400 - 16); 1045 mpf_new = phys_to_virt(0x400 - 16);
1073 memcpy(mpf_new, mpf, 16); 1046 memcpy(mpf_new, mpf, 16);
1074 mpf = mpf_new; 1047 mpf = mpf_new;
1075 mpf->mpf_physptr = mpc_new_phys; 1048 mpf->physptr = mpc_new_phys;
1076 } 1049 }
1077 mpf->mpf_checksum = 0; 1050 mpf->checksum = 0;
1078 mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16); 1051 mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
1079 printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr); 1052 printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
1080 } 1053 }
1081 1054
1082 /* 1055 /*
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 82a7c7ed6d45..3cf3413ec626 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -35,10 +35,10 @@
35#include <linux/device.h> 35#include <linux/device.h>
36#include <linux/cpu.h> 36#include <linux/cpu.h>
37#include <linux/notifier.h> 37#include <linux/notifier.h>
38#include <linux/uaccess.h>
38 39
39#include <asm/processor.h> 40#include <asm/processor.h>
40#include <asm/msr.h> 41#include <asm/msr.h>
41#include <asm/uaccess.h>
42#include <asm/system.h> 42#include <asm/system.h>
43 43
44static struct class *msr_class; 44static struct class *msr_class;
@@ -136,7 +136,7 @@ static int msr_open(struct inode *inode, struct file *file)
136 lock_kernel(); 136 lock_kernel();
137 cpu = iminor(file->f_path.dentry->d_inode); 137 cpu = iminor(file->f_path.dentry->d_inode);
138 138
139 if (cpu >= NR_CPUS || !cpu_online(cpu)) { 139 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
140 ret = -ENXIO; /* No such CPU */ 140 ret = -ENXIO; /* No such CPU */
141 goto out; 141 goto out;
142 } 142 }
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 2c97f07f1c2c..bdfad80c3cf1 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -29,14 +29,12 @@
29 29
30#include <asm/i8259.h> 30#include <asm/i8259.h>
31#include <asm/io_apic.h> 31#include <asm/io_apic.h>
32#include <asm/smp.h>
33#include <asm/nmi.h>
34#include <asm/proto.h> 32#include <asm/proto.h>
35#include <asm/timer.h> 33#include <asm/timer.h>
36 34
37#include <asm/mce.h> 35#include <asm/mce.h>
38 36
39#include <mach_traps.h> 37#include <asm/mach_traps.h>
40 38
41int unknown_nmi_panic; 39int unknown_nmi_panic;
42int nmi_watchdog_enabled; 40int nmi_watchdog_enabled;
@@ -63,11 +61,7 @@ static int endflag __initdata;
63 61
64static inline unsigned int get_nmi_count(int cpu) 62static inline unsigned int get_nmi_count(int cpu)
65{ 63{
66#ifdef CONFIG_X86_64 64 return per_cpu(irq_stat, cpu).__nmi_count;
67 return cpu_pda(cpu)->__nmi_count;
68#else
69 return nmi_count(cpu);
70#endif
71} 65}
72 66
73static inline int mce_in_progress(void) 67static inline int mce_in_progress(void)
@@ -84,12 +78,8 @@ static inline int mce_in_progress(void)
84 */ 78 */
85static inline unsigned int get_timer_irqs(int cpu) 79static inline unsigned int get_timer_irqs(int cpu)
86{ 80{
87#ifdef CONFIG_X86_64
88 return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
89#else
90 return per_cpu(irq_stat, cpu).apic_timer_irqs + 81 return per_cpu(irq_stat, cpu).apic_timer_irqs +
91 per_cpu(irq_stat, cpu).irq0_irqs; 82 per_cpu(irq_stat, cpu).irq0_irqs;
92#endif
93} 83}
94 84
95#ifdef CONFIG_SMP 85#ifdef CONFIG_SMP
@@ -131,6 +121,11 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count)
131 atomic_dec(&nmi_active); 121 atomic_dec(&nmi_active);
132} 122}
133 123
124static void __acpi_nmi_disable(void *__unused)
125{
126 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
127}
128
134int __init check_nmi_watchdog(void) 129int __init check_nmi_watchdog(void)
135{ 130{
136 unsigned int *prev_nmi_count; 131 unsigned int *prev_nmi_count;
@@ -179,8 +174,12 @@ int __init check_nmi_watchdog(void)
179 kfree(prev_nmi_count); 174 kfree(prev_nmi_count);
180 return 0; 175 return 0;
181error: 176error:
182 if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259) 177 if (nmi_watchdog == NMI_IO_APIC) {
183 disable_8259A_irq(0); 178 if (!timer_through_8259)
179 disable_8259A_irq(0);
180 on_each_cpu(__acpi_nmi_disable, NULL, 1);
181 }
182
184#ifdef CONFIG_X86_32 183#ifdef CONFIG_X86_32
185 timer_ack = 0; 184 timer_ack = 0;
186#endif 185#endif
@@ -199,12 +198,17 @@ static int __init setup_nmi_watchdog(char *str)
199 ++str; 198 ++str;
200 } 199 }
201 200
202 get_option(&str, &nmi); 201 if (!strncmp(str, "lapic", 5))
203 202 nmi_watchdog = NMI_LOCAL_APIC;
204 if (nmi >= NMI_INVALID) 203 else if (!strncmp(str, "ioapic", 6))
205 return 0; 204 nmi_watchdog = NMI_IO_APIC;
205 else {
206 get_option(&str, &nmi);
207 if (nmi >= NMI_INVALID)
208 return 0;
209 nmi_watchdog = nmi;
210 }
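	/*
	 * Both forms are accepted now: "nmi_watchdog=lapic" or
	 * "nmi_watchdog=ioapic" pick the backend by name, while a bare
	 * number keeps the original numeric interface working.
	 */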
206 211
207 nmi_watchdog = nmi;
208 return 1; 212 return 1;
209} 213}
210__setup("nmi_watchdog=", setup_nmi_watchdog); 214__setup("nmi_watchdog=", setup_nmi_watchdog);
@@ -285,11 +289,6 @@ void acpi_nmi_enable(void)
285 on_each_cpu(__acpi_nmi_enable, NULL, 1); 289 on_each_cpu(__acpi_nmi_enable, NULL, 1);
286} 290}
287 291
288static void __acpi_nmi_disable(void *__unused)
289{
290 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
291}
292
293/* 292/*
294 * Disable timer based NMIs on all CPUs: 293 * Disable timer based NMIs on all CPUs:
295 */ 294 */
@@ -340,6 +339,8 @@ void stop_apic_nmi_watchdog(void *unused)
340 return; 339 return;
341 if (nmi_watchdog == NMI_LOCAL_APIC) 340 if (nmi_watchdog == NMI_LOCAL_APIC)
342 lapic_watchdog_stop(); 341 lapic_watchdog_stop();
342 else
343 __acpi_nmi_disable(NULL);
343 __get_cpu_var(wd_enabled) = 0; 344 __get_cpu_var(wd_enabled) = 0;
344 atomic_dec(&nmi_active); 345 atomic_dec(&nmi_active);
345} 346}
@@ -465,6 +466,24 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
465 466
466#ifdef CONFIG_SYSCTL 467#ifdef CONFIG_SYSCTL
467 468
469static void enable_ioapic_nmi_watchdog_single(void *unused)
470{
471 __get_cpu_var(wd_enabled) = 1;
472 atomic_inc(&nmi_active);
473 __acpi_nmi_enable(NULL);
474}
475
476static void enable_ioapic_nmi_watchdog(void)
477{
478 on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
479 touch_nmi_watchdog();
480}
481
482static void disable_ioapic_nmi_watchdog(void)
483{
484 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
485}
486
468static int __init setup_unknown_nmi_panic(char *str) 487static int __init setup_unknown_nmi_panic(char *str)
469{ 488{
470 unknown_nmi_panic = 1; 489 unknown_nmi_panic = 1;
@@ -507,6 +526,11 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
507 enable_lapic_nmi_watchdog(); 526 enable_lapic_nmi_watchdog();
508 else 527 else
509 disable_lapic_nmi_watchdog(); 528 disable_lapic_nmi_watchdog();
529 } else if (nmi_watchdog == NMI_IO_APIC) {
530 if (nmi_watchdog_enabled)
531 enable_ioapic_nmi_watchdog();
532 else
533 disable_ioapic_nmi_watchdog();
510 } else { 534 } else {
511 printk(KERN_WARNING 535 printk(KERN_WARNING
512 "NMI watchdog doesn't know what hardware to touch\n"); 536 "NMI watchdog doesn't know what hardware to touch\n");
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index 4caff39078e0..0cc41a1d2550 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (C) 2002, IBM Corp. 4 * Copyright (C) 2002, IBM Corp.
5 * 5 *
6 * All rights reserved. 6 * All rights reserved.
7 * 7 *
8 * This program is free software; you can redistribute it and/or modify 8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by 9 * it under the terms of the GNU General Public License as published by
@@ -23,17 +23,18 @@
23 * Send feedback to <gone@us.ibm.com> 23 * Send feedback to <gone@us.ibm.com>
24 */ 24 */
25 25
26#include <linux/mm.h> 26#include <linux/nodemask.h>
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/mmzone.h> 28#include <linux/mmzone.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/nodemask.h> 30#include <linux/mm.h>
31#include <asm/numaq.h> 31
32#include <asm/topology.h>
33#include <asm/processor.h> 32#include <asm/processor.h>
34#include <asm/mpspec.h> 33#include <asm/topology.h>
35#include <asm/e820.h> 34#include <asm/genapic.h>
35#include <asm/numaq.h>
36#include <asm/setup.h> 36#include <asm/setup.h>
37#include <asm/e820.h>
37 38
38#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) 39#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
39 40
@@ -91,19 +92,20 @@ static int __init numaq_pre_time_init(void)
91} 92}
92 93
93int found_numaq; 94int found_numaq;
95
94/* 96/*
95 * Have to match translation table entries to main table entries by counter 97 * Have to match translation table entries to main table entries by counter
96 * hence the mpc_record variable .... can't see a less disgusting way of 98 * hence the mpc_record variable .... can't see a less disgusting way of
97 * doing this .... 99 * doing this ....
98 */ 100 */
99struct mpc_config_translation { 101struct mpc_config_translation {
100 unsigned char mpc_type; 102 unsigned char mpc_type;
101 unsigned char trans_len; 103 unsigned char trans_len;
102 unsigned char trans_type; 104 unsigned char trans_type;
103 unsigned char trans_quad; 105 unsigned char trans_quad;
104 unsigned char trans_global; 106 unsigned char trans_global;
105 unsigned char trans_local; 107 unsigned char trans_local;
106 unsigned short trans_reserved; 108 unsigned short trans_reserved;
107}; 109};
108 110
109/* x86_quirks member */ 111/* x86_quirks member */
@@ -117,16 +119,15 @@ static inline int generate_logical_apicid(int quad, int phys_apicid)
117} 119}
118 120
119/* x86_quirks member */ 121/* x86_quirks member */
120static int mpc_apic_id(struct mpc_config_processor *m) 122static int mpc_apic_id(struct mpc_cpu *m)
121{ 123{
122 int quad = translation_table[mpc_record]->trans_quad; 124 int quad = translation_table[mpc_record]->trans_quad;
123 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid); 125 int logical_apicid = generate_logical_apicid(quad, m->apicid);
124 126
125 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", 127 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
126 m->mpc_apicid, 128 m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
127 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, 129 (m->cpufeature & CPU_MODEL_MASK) >> 4,
128 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, 130 m->apicver, quad, logical_apicid);
129 m->mpc_apicver, quad, logical_apicid);
130 return logical_apicid; 131 return logical_apicid;
131} 132}
132 133
@@ -135,26 +136,26 @@ int mp_bus_id_to_node[MAX_MP_BUSSES];
135int mp_bus_id_to_local[MAX_MP_BUSSES]; 136int mp_bus_id_to_local[MAX_MP_BUSSES];
136 137
137/* x86_quirks member */ 138/* x86_quirks member */
138static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name) 139static void mpc_oem_bus_info(struct mpc_bus *m, char *name)
139{ 140{
140 int quad = translation_table[mpc_record]->trans_quad; 141 int quad = translation_table[mpc_record]->trans_quad;
141 int local = translation_table[mpc_record]->trans_local; 142 int local = translation_table[mpc_record]->trans_local;
142 143
143 mp_bus_id_to_node[m->mpc_busid] = quad; 144 mp_bus_id_to_node[m->busid] = quad;
144 mp_bus_id_to_local[m->mpc_busid] = local; 145 mp_bus_id_to_local[m->busid] = local;
145 printk(KERN_INFO "Bus #%d is %s (node %d)\n", 146 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
146 m->mpc_busid, name, quad); 147 m->busid, name, quad);
147} 148}
148 149
149int quad_local_to_mp_bus_id [NR_CPUS/4][4]; 150int quad_local_to_mp_bus_id [NR_CPUS/4][4];
150 151
151/* x86_quirks member */ 152/* x86_quirks member */
152static void mpc_oem_pci_bus(struct mpc_config_bus *m) 153static void mpc_oem_pci_bus(struct mpc_bus *m)
153{ 154{
154 int quad = translation_table[mpc_record]->trans_quad; 155 int quad = translation_table[mpc_record]->trans_quad;
155 int local = translation_table[mpc_record]->trans_local; 156 int local = translation_table[mpc_record]->trans_local;
156 157
157 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid; 158 quad_local_to_mp_bus_id[quad][local] = m->busid;
158} 159}
159 160
160static void __init MP_translation_info(struct mpc_config_translation *m) 161static void __init MP_translation_info(struct mpc_config_translation *m)
@@ -186,7 +187,7 @@ static int __init mpf_checksum(unsigned char *mp, int len)
186 * Read/parse the MPC oem tables 187 * Read/parse the MPC oem tables
187 */ 188 */
188 189
189static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, 190static void __init smp_read_mpc_oem(struct mpc_oemtable *oemtable,
190 unsigned short oemsize) 191 unsigned short oemsize)
191{ 192{
192 int count = sizeof(*oemtable); /* the header size */ 193 int count = sizeof(*oemtable); /* the header size */
@@ -195,18 +196,18 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
195 mpc_record = 0; 196 mpc_record = 0;
196 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", 197 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
197 oemtable); 198 oemtable);
198 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) { 199 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
199 printk(KERN_WARNING 200 printk(KERN_WARNING
200 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", 201 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
201 oemtable->oem_signature[0], oemtable->oem_signature[1], 202 oemtable->signature[0], oemtable->signature[1],
202 oemtable->oem_signature[2], oemtable->oem_signature[3]); 203 oemtable->signature[2], oemtable->signature[3]);
203 return; 204 return;
204 } 205 }
205 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) { 206 if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) {
206 printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); 207 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
207 return; 208 return;
208 } 209 }
209 while (count < oemtable->oem_length) { 210 while (count < oemtable->length) {
210 switch (*oemptr) { 211 switch (*oemptr) {
211 case MP_TRANSLATION: 212 case MP_TRANSLATION:
212 { 213 {
@@ -235,6 +236,13 @@ static int __init numaq_setup_ioapic_ids(void)
235 return 1; 236 return 1;
236} 237}
237 238
239static int __init numaq_update_genapic(void)
240{
241 apic->wakeup_cpu = wakeup_secondary_cpu_via_nmi;
242
243 return 0;
244}
245
238static struct x86_quirks numaq_x86_quirks __initdata = { 246static struct x86_quirks numaq_x86_quirks __initdata = {
239 .arch_pre_time_init = numaq_pre_time_init, 247 .arch_pre_time_init = numaq_pre_time_init,
240 .arch_time_init = NULL, 248 .arch_time_init = NULL,
@@ -250,10 +258,10 @@ static struct x86_quirks numaq_x86_quirks __initdata = {
250 .mpc_oem_pci_bus = mpc_oem_pci_bus, 258 .mpc_oem_pci_bus = mpc_oem_pci_bus,
251 .smp_read_mpc_oem = smp_read_mpc_oem, 259 .smp_read_mpc_oem = smp_read_mpc_oem,
252 .setup_ioapic_ids = numaq_setup_ioapic_ids, 260 .setup_ioapic_ids = numaq_setup_ioapic_ids,
261 .update_genapic = numaq_update_genapic,
253}; 262};
254 263
255void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, 264void numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
256 char *productid)
257{ 265{
258 if (strncmp(oem, "IBM NUMA", 8)) 266 if (strncmp(oem, "IBM NUMA", 8))
259 printk("Warning! Not a NUMA-Q system!\n"); 267 printk("Warning! Not a NUMA-Q system!\n");
@@ -285,3 +293,280 @@ int __init get_memcfg_numaq(void)
285 smp_dump_qct(); 293 smp_dump_qct();
286 return 1; 294 return 1;
287} 295}
296
297/*
298 * APIC driver for the IBM NUMAQ chipset.
299 */
300#define APIC_DEFINITION 1
301#include <linux/threads.h>
302#include <linux/cpumask.h>
303#include <asm/mpspec.h>
304#include <asm/genapic.h>
305#include <asm/fixmap.h>
306#include <asm/apicdef.h>
307#include <asm/ipi.h>
308#include <linux/kernel.h>
309#include <linux/string.h>
310#include <linux/init.h>
311#include <linux/numa.h>
312#include <linux/smp.h>
313#include <asm/numaq.h>
314#include <asm/io.h>
315#include <linux/mmzone.h>
316#include <linux/nodemask.h>
317
318#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
319
320static inline unsigned int numaq_get_apic_id(unsigned long x)
321{
322 return (x >> 24) & 0x0F;
323}
324
325static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector)
326{
327 default_send_IPI_mask_sequence_logical(mask, vector);
328}
329
330static inline void numaq_send_IPI_allbutself(int vector)
331{
332 default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
333}
334
335static inline void numaq_send_IPI_all(int vector)
336{
337 numaq_send_IPI_mask(cpu_online_mask, vector);
338}
339
340extern void numaq_mps_oem_check(struct mpc_table *, char *, char *);
341
342#define NUMAQ_TRAMPOLINE_PHYS_LOW (0x8)
343#define NUMAQ_TRAMPOLINE_PHYS_HIGH (0xa)
344
345/*
346 * Because we use NMIs rather than the INIT-STARTUP sequence to
347 * bootstrap the CPUs, the APIC may be in a weird state. Kick it:
348 */
349static inline void numaq_smp_callin_clear_local_apic(void)
350{
351 clear_local_APIC();
352}
353
354static inline void
355numaq_store_NMI_vector(unsigned short *high, unsigned short *low)
356{
357 printk("Storing NMI vector\n");
358 *high =
359 *((volatile unsigned short *)phys_to_virt(NUMAQ_TRAMPOLINE_PHYS_HIGH));
360 *low =
361 *((volatile unsigned short *)phys_to_virt(NUMAQ_TRAMPOLINE_PHYS_LOW));
362}
363
364static inline const cpumask_t *numaq_target_cpus(void)
365{
366 return &CPU_MASK_ALL;
367}
368
369static inline unsigned long
370numaq_check_apicid_used(physid_mask_t bitmap, int apicid)
371{
372 return physid_isset(apicid, bitmap);
373}
374
375static inline unsigned long numaq_check_apicid_present(int bit)
376{
377 return physid_isset(bit, phys_cpu_present_map);
378}
379
380#define apicid_cluster(apicid) (apicid & 0xF0)
381
382static inline int numaq_apic_id_registered(void)
383{
384 return 1;
385}
386
387static inline void numaq_init_apic_ldr(void)
388{
389 /* Already done in NUMA-Q firmware */
390}
391
392static inline void numaq_setup_apic_routing(void)
393{
394 printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
395 "NUMA-Q", nr_ioapics);
396}
397
398/*
399 * Skip adding the timer int on secondary nodes, which causes
400 * a small but painful rift in the time-space continuum.
401 */
402static inline int numaq_multi_timer_check(int apic, int irq)
403{
404 return apic != 0 && irq == 0;
405}
406
407static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map)
408{
409 /* We don't have a good way to do this yet - hack */
410 return physids_promote(0xFUL);
411}
412
413/* Mapping from cpu number to logical apicid */
414extern u8 cpu_2_logical_apicid[];
415
416static inline int numaq_cpu_to_logical_apicid(int cpu)
417{
418 if (cpu >= nr_cpu_ids)
419 return BAD_APICID;
420 return (int)cpu_2_logical_apicid[cpu];
421}
422
423/*
424 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
425 * cpu to APIC ID relation to properly interact with the intelligent
426 * mode of the cluster controller.
427 */
428static inline int numaq_cpu_present_to_apicid(int mps_cpu)
429{
430 if (mps_cpu < 60)
431 return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
432 else
433 return BAD_APICID;
434}
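/*
 * Example: mps_cpu 5 sits on quad 1 as local CPU 1, so it becomes APIC
 * ID (1 << 4) | (1 << 1) = 0x12: quad number in the high nibble, a
 * one-hot local-CPU bit in the low nibble.
 */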
435
436static inline int numaq_apicid_to_node(int logical_apicid)
437{
438 return logical_apicid >> 4;
439}
440
441static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid)
442{
443 int node = numaq_apicid_to_node(logical_apicid);
444 int cpu = __ffs(logical_apicid & 0xf);
445
446 return physid_mask_of_physid(cpu + 4*node);
447}
448
449/* Where the IO area was mapped on multiquad, always 0 otherwise */
450void *xquad_portio;
451
452static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid)
453{
454 return 1;
455}
456
457/*
458 * We use physical apicids here, not logical, so just return the default
459 * physical broadcast to stop people from breaking us
460 */
461static inline unsigned int numaq_cpu_mask_to_apicid(const cpumask_t *cpumask)
462{
463 return 0x0F;
464}
465
466static inline unsigned int
467numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
468 const struct cpumask *andmask)
469{
470 return 0x0F;
471}
472
473/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
474static inline int numaq_phys_pkg_id(int cpuid_apic, int index_msb)
475{
476 return cpuid_apic >> index_msb;
477}
478static int __numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
479{
480 numaq_mps_oem_check(mpc, oem, productid);
481 return found_numaq;
482}
483
484static int probe_numaq(void)
485{
486 /* already know from get_memcfg_numaq() */
487 return found_numaq;
488}
489
490static void numaq_vector_allocation_domain(int cpu, cpumask_t *retmask)
491{
492 /* Careful. Some cpus do not strictly honor the set of cpus
493 * specified in the interrupt destination when using lowest
494 * priority interrupt delivery mode.
495 *
496 * In particular there was a hyperthreading cpu observed to
497 * deliver interrupts to the wrong hyperthread when only one
 498 * hyperthread was specified in the interrupt destination.
499 */
500 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
501}
502
503static void numaq_setup_portio_remap(void)
504{
505 int num_quads = num_online_nodes();
506
507 if (num_quads <= 1)
508 return;
509
510 printk("Remapping cross-quad port I/O for %d quads\n", num_quads);
511 xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
512 printk("xquad_portio vaddr 0x%08lx, len %08lx\n",
513 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
514}
515
516struct genapic apic_numaq = {
517
518 .name = "NUMAQ",
519 .probe = probe_numaq,
520 .acpi_madt_oem_check = NULL,
521 .apic_id_registered = numaq_apic_id_registered,
522
523 .irq_delivery_mode = dest_LowestPrio,
524 /* physical delivery on LOCAL quad: */
525 .irq_dest_mode = 0,
526
527 .target_cpus = numaq_target_cpus,
528 .disable_esr = 1,
529 .dest_logical = APIC_DEST_LOGICAL,
530 .check_apicid_used = numaq_check_apicid_used,
531 .check_apicid_present = numaq_check_apicid_present,
532
533 .vector_allocation_domain = numaq_vector_allocation_domain,
534 .init_apic_ldr = numaq_init_apic_ldr,
535
536 .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
537 .setup_apic_routing = numaq_setup_apic_routing,
538 .multi_timer_check = numaq_multi_timer_check,
539 .apicid_to_node = numaq_apicid_to_node,
540 .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid,
541 .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
542 .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
543 .setup_portio_remap = numaq_setup_portio_remap,
544 .check_phys_apicid_present = numaq_check_phys_apicid_present,
545 .enable_apic_mode = NULL,
546 .phys_pkg_id = numaq_phys_pkg_id,
547 .mps_oem_check = __numaq_mps_oem_check,
548
549 .get_apic_id = numaq_get_apic_id,
550 .set_apic_id = NULL,
551 .apic_id_mask = 0x0F << 24,
552
553 .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid,
554 .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and,
555
556 .send_IPI_mask = numaq_send_IPI_mask,
557 .send_IPI_mask_allbutself = NULL,
558 .send_IPI_allbutself = numaq_send_IPI_allbutself,
559 .send_IPI_all = numaq_send_IPI_all,
560 .send_IPI_self = default_send_IPI_self,
561
562 .wakeup_cpu = NULL,
563 .trampoline_phys_low = NUMAQ_TRAMPOLINE_PHYS_LOW,
564 .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH,
565
566 /* We don't do anything here because we use NMI's to boot instead */
567 .wait_for_init_deassert = NULL,
568
569 .smp_callin_clear_local_apic = numaq_smp_callin_clear_local_apic,
570 .store_NMI_vector = numaq_store_NMI_vector,
571 .inquire_remote_apic = NULL,
572};
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 0e9f1982b1dd..3a7c5a44082e 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -7,7 +7,8 @@
7 7
8#include <asm/paravirt.h> 8#include <asm/paravirt.h>
9 9
10static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) 10static inline void
11default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
11{ 12{
12 __raw_spin_lock(lock); 13 __raw_spin_lock(lock);
13} 14}
@@ -25,13 +26,3 @@ struct pv_lock_ops pv_lock_ops = {
25}; 26};
26EXPORT_SYMBOL(pv_lock_ops); 27EXPORT_SYMBOL(pv_lock_ops);
27 28
28void __init paravirt_use_bytelocks(void)
29{
30#ifdef CONFIG_SMP
31 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
32 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
33 pv_lock_ops.spin_lock = __byte_spin_lock;
34 pv_lock_ops.spin_trylock = __byte_spin_trylock;
35 pv_lock_ops.spin_unlock = __byte_spin_unlock;
36#endif
37}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e4c8fb608873..cea11c8e3049 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -44,6 +44,17 @@ void _paravirt_nop(void)
44{ 44{
45} 45}
46 46
47/* identity function, which can be inlined */
48u32 _paravirt_ident_32(u32 x)
49{
50 return x;
51}
52
53u64 _paravirt_ident_64(u64 x)
54{
55 return x;
56}
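/*
 * On native hardware pte_val()/make_pte() and friends are pure identity
 * conversions, so routing them through these helpers (see the PTE_IDENT
 * uses further down) lets paravirt_patch_default() recognise the
 * callsites and patch them down to almost nothing.
 */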
57
47static void __init default_banner(void) 58static void __init default_banner(void)
48{ 59{
49 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 60 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
@@ -138,9 +149,16 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
138 if (opfunc == NULL) 149 if (opfunc == NULL)
139 /* If there's no function, patch it with a ud2a (BUG) */ 150 /* If there's no function, patch it with a ud2a (BUG) */
140 ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); 151 ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
141 else if (opfunc == paravirt_nop) 152 else if (opfunc == _paravirt_nop)
142 /* If the operation is a nop, then nop the callsite */ 153 /* If the operation is a nop, then nop the callsite */
143 ret = paravirt_patch_nop(); 154 ret = paravirt_patch_nop();
155
156 /* identity functions just return their single argument */
157 else if (opfunc == _paravirt_ident_32)
158 ret = paravirt_patch_ident_32(insnbuf, len);
159 else if (opfunc == _paravirt_ident_64)
160 ret = paravirt_patch_ident_64(insnbuf, len);
161
144 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || 162 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
145 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || 163 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
146 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || 164 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
@@ -292,10 +310,10 @@ struct pv_time_ops pv_time_ops = {
292 310
293struct pv_irq_ops pv_irq_ops = { 311struct pv_irq_ops pv_irq_ops = {
294 .init_IRQ = native_init_IRQ, 312 .init_IRQ = native_init_IRQ,
295 .save_fl = native_save_fl, 313 .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
296 .restore_fl = native_restore_fl, 314 .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
297 .irq_disable = native_irq_disable, 315 .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
298 .irq_enable = native_irq_enable, 316 .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
299 .safe_halt = native_safe_halt, 317 .safe_halt = native_safe_halt,
300 .halt = native_halt, 318 .halt = native_halt,
301#ifdef CONFIG_X86_64 319#ifdef CONFIG_X86_64
@@ -373,6 +391,14 @@ struct pv_apic_ops pv_apic_ops = {
373#endif 391#endif
374}; 392};
375 393
394#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
395/* 32-bit pagetable entries */
396#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32)
397#else
398/* 64-bit pagetable entries */
399#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
400#endif
401
376struct pv_mmu_ops pv_mmu_ops = { 402struct pv_mmu_ops pv_mmu_ops = {
377#ifndef CONFIG_X86_64 403#ifndef CONFIG_X86_64
378 .pagetable_setup_start = native_pagetable_setup_start, 404 .pagetable_setup_start = native_pagetable_setup_start,
@@ -424,22 +450,23 @@ struct pv_mmu_ops pv_mmu_ops = {
424 .pmd_clear = native_pmd_clear, 450 .pmd_clear = native_pmd_clear,
425#endif 451#endif
426 .set_pud = native_set_pud, 452 .set_pud = native_set_pud,
427 .pmd_val = native_pmd_val, 453
428 .make_pmd = native_make_pmd, 454 .pmd_val = PTE_IDENT,
455 .make_pmd = PTE_IDENT,
429 456
430#if PAGETABLE_LEVELS == 4 457#if PAGETABLE_LEVELS == 4
431 .pud_val = native_pud_val, 458 .pud_val = PTE_IDENT,
432 .make_pud = native_make_pud, 459 .make_pud = PTE_IDENT,
460
433 .set_pgd = native_set_pgd, 461 .set_pgd = native_set_pgd,
434#endif 462#endif
435#endif /* PAGETABLE_LEVELS >= 3 */ 463#endif /* PAGETABLE_LEVELS >= 3 */
436 464
437 .pte_val = native_pte_val, 465 .pte_val = PTE_IDENT,
438 .pte_flags = native_pte_flags, 466 .pgd_val = PTE_IDENT,
439 .pgd_val = native_pgd_val,
440 467
441 .make_pte = native_make_pte, 468 .make_pte = PTE_IDENT,
442 .make_pgd = native_make_pgd, 469 .make_pgd = PTE_IDENT,
443 470
444 .dup_mmap = paravirt_nop, 471 .dup_mmap = paravirt_nop,
445 .exit_mmap = paravirt_nop, 472 .exit_mmap = paravirt_nop,
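
The PTE_IDENT conversion above works because, on bare hardware, turning a pte_t into its underlying integer (and back) is a no-op, so the ops can point at _paravirt_ident_32/64 and the patcher can later remove the call entirely. A rough user-space illustration of the idea, with simplified types and invented names:

    #include <stdio.h>
    #include <stdint.h>

    /* Identity helper, analogous to _paravirt_ident_64 in the hunk above. */
    static uint64_t ident_64(uint64_t x)
    {
            return x;
    }

    /* Simplified stand-in for pv_mmu_ops: a table of conversion callbacks. */
    struct toy_mmu_ops {
            uint64_t (*pte_val)(uint64_t pte);
            uint64_t (*make_pte)(uint64_t val);
    };

    static struct toy_mmu_ops mmu_ops = {
            /* Native case: both directions are the identity, so the run-time
             * patcher can replace the indirect call with a register move. */
            .pte_val  = ident_64,
            .make_pte = ident_64,
    };

    int main(void)
    {
            uint64_t pte = mmu_ops.make_pte(0x8000000000000063ULL);

            printf("pte value: %#llx\n",
                   (unsigned long long)mmu_ops.pte_val(pte));
            return 0;
    }

A paravirtualized guest such as Xen overrides these slots with real conversion code, which is why the indirection exists at all.
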
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 9fe644f4861d..d9f32e6d6ab6 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,18 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
12DEF_NATIVE(pv_cpu_ops, clts, "clts"); 12DEF_NATIVE(pv_cpu_ops, clts, "clts");
13DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); 13DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
14 14
15unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
16{
17 /* arg in %eax, return in %eax */
18 return 0;
19}
20
21unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
22{
23 /* arg in %edx:%eax, return in %edx:%eax */
24 return 0;
25}
26
15unsigned native_patch(u8 type, u16 clobbers, void *ibuf, 27unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
16 unsigned long addr, unsigned len) 28 unsigned long addr, unsigned len)
17{ 29{
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 061d01df9ae6..3f08f34f93eb 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -19,6 +19,21 @@ DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
19DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); 19DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
20DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); 20DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
21 21
22DEF_NATIVE(, mov32, "mov %edi, %eax");
23DEF_NATIVE(, mov64, "mov %rdi, %rax");
24
25unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
26{
27 return paravirt_patch_insns(insnbuf, len,
28 start__mov32, end__mov32);
29}
30
31unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
32{
33 return paravirt_patch_insns(insnbuf, len,
34 start__mov64, end__mov64);
35}
36
22unsigned native_patch(u8 type, u16 clobbers, void *ibuf, 37unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
23 unsigned long addr, unsigned len) 38 unsigned long addr, unsigned len)
24{ 39{
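
paravirt_patch_ident_64 above leans on the generic template-copy step: a pre-assembled "mov %rdi, %rax" is copied over the call site when it fits. A hedged sketch of just that copy step, with invented names and none of the kernel's padding or error handling:

    #include <stdio.h>
    #include <string.h>

    /* Copy a pre-assembled instruction template into a patch site.
     * Returns the number of bytes written, or 0 if the site is too small. */
    static unsigned patch_copy_insns(void *site, unsigned site_len,
                                     const unsigned char *start,
                                     const unsigned char *end)
    {
            unsigned insn_len = (unsigned)(end - start);

            if (insn_len > site_len)
                    return 0;
            memcpy(site, start, insn_len);
            return insn_len;
    }

    int main(void)
    {
            /* x86-64 encoding of "mov %rdi, %rax" (48 89 f8), as a data blob. */
            static const unsigned char mov64[] = { 0x48, 0x89, 0xf8 };
            unsigned char site[8] = { 0 };
            unsigned n = patch_copy_insns(site, sizeof(site),
                                          mov64, mov64 + sizeof(mov64));

            printf("patched %u byte(s): %02x %02x %02x\n",
                   n, site[0], site[1], site[2]);
            return 0;
    }

The real patcher then pads any leftover bytes at the call site with NOPs.
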
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index e1e731d78f38..d28bbdc35e4e 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1567,7 +1567,7 @@ static int __init calgary_parse_options(char *p)
1567 ++p; 1567 ++p;
1568 if (*p == '\0') 1568 if (*p == '\0')
1569 break; 1569 break;
1570 bridge = simple_strtol(p, &endp, 0); 1570 bridge = simple_strtoul(p, &endp, 0);
1571 if (p == endp) 1571 if (p == endp)
1572 break; 1572 break;
1573 1573
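
The calgary_parse_options fix above switches to the unsigned conversion; the p == endp test is what detects that no digits were consumed. The same parsing idiom in plain C with the standard strtoul (the kernel's simple_strtoul behaves much the same way):

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            const char *p = "disable=3,5,notanumber";

            p += 8;         /* skip the "disable=" prefix for the example */

            while (*p) {
                    char *endp;
                    unsigned long bridge = strtoul(p, &endp, 0);

                    if (p == endp)          /* no digits consumed: stop */
                            break;
                    printf("disable calgary on bridge %lu\n", bridge);
                    p = (*endp == ',') ? endp + 1 : endp;
            }
            return 0;
    }
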
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 192624820217..b25428533141 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -6,6 +6,7 @@
6#include <asm/proto.h> 6#include <asm/proto.h>
7#include <asm/dma.h> 7#include <asm/dma.h>
8#include <asm/iommu.h> 8#include <asm/iommu.h>
9#include <asm/gart.h>
9#include <asm/calgary.h> 10#include <asm/calgary.h>
10#include <asm/amd_iommu.h> 11#include <asm/amd_iommu.h>
11 12
@@ -30,11 +31,6 @@ int no_iommu __read_mostly;
30/* Set this to 1 if there is a HW IOMMU in the system */ 31/* Set this to 1 if there is a HW IOMMU in the system */
31int iommu_detected __read_mostly = 0; 32int iommu_detected __read_mostly = 0;
32 33
33/* This tells the BIO block layer to assume merging. Default to off
34 because we cannot guarantee merging later. */
35int iommu_bio_merge __read_mostly = 0;
36EXPORT_SYMBOL(iommu_bio_merge);
37
38dma_addr_t bad_dma_address __read_mostly = 0; 34dma_addr_t bad_dma_address __read_mostly = 0;
39EXPORT_SYMBOL(bad_dma_address); 35EXPORT_SYMBOL(bad_dma_address);
40 36
@@ -42,7 +38,7 @@ EXPORT_SYMBOL(bad_dma_address);
42 be probably a smaller DMA mask, but this is bug-to-bug compatible 38 be probably a smaller DMA mask, but this is bug-to-bug compatible
43 to older i386. */ 39 to older i386. */
44struct device x86_dma_fallback_dev = { 40struct device x86_dma_fallback_dev = {
45 .bus_id = "fallback device", 41 .init_name = "fallback device",
46 .coherent_dma_mask = DMA_32BIT_MASK, 42 .coherent_dma_mask = DMA_32BIT_MASK,
47 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, 43 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
48}; 44};
@@ -105,11 +101,15 @@ static void __init dma32_free_bootmem(void)
105 dma32_bootmem_ptr = NULL; 101 dma32_bootmem_ptr = NULL;
106 dma32_bootmem_size = 0; 102 dma32_bootmem_size = 0;
107} 103}
104#endif
108 105
109void __init pci_iommu_alloc(void) 106void __init pci_iommu_alloc(void)
110{ 107{
108#ifdef CONFIG_X86_64
111 /* free the range so iommu could get some range less than 4G */ 109 /* free the range so iommu could get some range less than 4G */
112 dma32_free_bootmem(); 110 dma32_free_bootmem();
111#endif
112
113 /* 113 /*
114 * The order of these functions is important for 114 * The order of these functions is important for
115 * fall-back/fail-over reasons 115 * fall-back/fail-over reasons
@@ -125,15 +125,6 @@ void __init pci_iommu_alloc(void)
125 pci_swiotlb_init(); 125 pci_swiotlb_init();
126} 126}
127 127
128unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
129{
130 unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
131
132 return size >> PAGE_SHIFT;
133}
134EXPORT_SYMBOL(iommu_nr_pages);
135#endif
136
137void *dma_generic_alloc_coherent(struct device *dev, size_t size, 128void *dma_generic_alloc_coherent(struct device *dev, size_t size,
138 dma_addr_t *dma_addr, gfp_t flag) 129 dma_addr_t *dma_addr, gfp_t flag)
139{ 130{
@@ -188,7 +179,6 @@ static __init int iommu_setup(char *p)
188 } 179 }
189 180
190 if (!strncmp(p, "biomerge", 8)) { 181 if (!strncmp(p, "biomerge", 8)) {
191 iommu_bio_merge = 4096;
192 iommu_merge = 1; 182 iommu_merge = 1;
193 force_iommu = 1; 183 force_iommu = 1;
194 } 184 }
@@ -300,8 +290,8 @@ fs_initcall(pci_iommu_init);
300static __devinit void via_no_dac(struct pci_dev *dev) 290static __devinit void via_no_dac(struct pci_dev *dev)
301{ 291{
302 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { 292 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
303 printk(KERN_INFO "PCI: VIA PCI bridge detected." 293 printk(KERN_INFO
304 "Disabling DAC.\n"); 294 "PCI: VIA PCI bridge detected. Disabling DAC.\n");
305 forbid_dac = 1; 295 forbid_dac = 1;
306 } 296 }
307} 297}
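
The iommu_nr_pages helper removed above computes how many pages a byte range touches once the offset into the first page is accounted for. A standalone version of the same arithmetic, with the page size hard-coded to 4 KiB for the example:

    #include <stdio.h>

    #define PAGE_SHIFT      12
    #define PAGE_SIZE       (1UL << PAGE_SHIFT)
    #define PAGE_MASK       (~(PAGE_SIZE - 1))

    /* Round the (offset-into-page + length) up to whole pages. */
    static unsigned long nr_pages(unsigned long addr, unsigned long len)
    {
            unsigned long size = (addr & ~PAGE_MASK) + len;

            return (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
    }

    int main(void)
    {
            /* 0x10 bytes into a page, 0x2000 bytes long: spans 3 pages. */
            printf("%lu\n", nr_pages(0x1010, 0x2000));
            return 0;
    }
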
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index a42b02b4df68..d5768b1af080 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -5,7 +5,7 @@
 5 * This allows using PCI devices that only support 32bit addresses on systems 5 * This allows using PCI devices that only support 32bit addresses on systems
6 * with more than 4GB. 6 * with more than 4GB.
7 * 7 *
8 * See Documentation/DMA-mapping.txt for the interface specification. 8 * See Documentation/PCI/PCI-DMA-mapping.txt for the interface specification.
9 * 9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs. 10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 * Subject to the GNU General Public License v2 only. 11 * Subject to the GNU General Public License v2 only.
@@ -52,7 +52,7 @@ static u32 *iommu_gatt_base; /* Remapping table */
52 * to trigger bugs with some popular PCI cards, in particular 3ware (but 52 * to trigger bugs with some popular PCI cards, in particular 3ware (but
 53 * has been also seen with Qlogic at least). 53 * has been also seen with Qlogic at least).
54 */ 54 */
55int iommu_fullflush = 1; 55static int iommu_fullflush = 1;
56 56
57/* Allocation bitmap for the remapping area: */ 57/* Allocation bitmap for the remapping area: */
58static DEFINE_SPINLOCK(iommu_bitmap_lock); 58static DEFINE_SPINLOCK(iommu_bitmap_lock);
@@ -123,6 +123,8 @@ static void free_iommu(unsigned long offset, int size)
123 123
124 spin_lock_irqsave(&iommu_bitmap_lock, flags); 124 spin_lock_irqsave(&iommu_bitmap_lock, flags);
125 iommu_area_free(iommu_gart_bitmap, offset, size); 125 iommu_area_free(iommu_gart_bitmap, offset, size);
126 if (offset >= next_bit)
127 next_bit = offset + size;
126 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 128 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
127} 129}
128 130
@@ -743,10 +745,8 @@ void __init gart_iommu_init(void)
743 unsigned long scratch; 745 unsigned long scratch;
744 long i; 746 long i;
745 747
746 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { 748 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
747 printk(KERN_INFO "PCI-GART: No AMD GART found.\n");
748 return; 749 return;
749 }
750 750
751#ifndef CONFIG_AGP_AMD64 751#ifndef CONFIG_AGP_AMD64
752 no_agp = 1; 752 no_agp = 1;
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 3c539d111abb..d59c91747665 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -3,6 +3,8 @@
3#include <linux/pci.h> 3#include <linux/pci.h>
4#include <linux/cache.h> 4#include <linux/cache.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/swiotlb.h>
7#include <linux/bootmem.h>
6#include <linux/dma-mapping.h> 8#include <linux/dma-mapping.h>
7 9
8#include <asm/iommu.h> 10#include <asm/iommu.h>
@@ -11,6 +13,31 @@
11 13
12int swiotlb __read_mostly; 14int swiotlb __read_mostly;
13 15
16void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs)
17{
18 return alloc_bootmem_low_pages(size);
19}
20
21void *swiotlb_alloc(unsigned order, unsigned long nslabs)
22{
23 return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
24}
25
26dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
27{
28 return paddr;
29}
30
31phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
32{
33 return baddr;
34}
35
36int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
37{
38 return 0;
39}
40
14static dma_addr_t 41static dma_addr_t
15swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, 42swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
16 int direction) 43 int direction)
@@ -50,8 +77,10 @@ struct dma_mapping_ops swiotlb_dma_ops = {
50void __init pci_swiotlb_init(void) 77void __init pci_swiotlb_init(void)
51{ 78{
52 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 79 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
80#ifdef CONFIG_X86_64
53 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) 81 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
54 swiotlb = 1; 82 swiotlb = 1;
83#endif
55 if (swiotlb_force) 84 if (swiotlb_force)
56 swiotlb = 1; 85 swiotlb = 1;
57 if (swiotlb) { 86 if (swiotlb) {
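
swiotlb_arch_range_needs_mapping above is defined __weak, so another definition (for instance an architecture-specific one) can supply its own version and the linker will prefer it. A tiny gcc demonstration of the weak-symbol pattern, using a made-up function name:

    #include <stdio.h>

    /* Default, overridable implementation: says no extra mapping is needed. */
    __attribute__((weak)) int range_needs_mapping(void *ptr, unsigned long size)
    {
            (void)ptr;
            (void)size;
            return 0;
    }

    /* If another translation unit defines a non-weak range_needs_mapping(),
     * the linker picks that one instead of the weak default above. */

    int main(void)
    {
            char buf[64];

            printf("needs mapping: %d\n", range_needs_mapping(buf, sizeof(buf)));
            return 0;
    }
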
diff --git a/arch/x86/kernel/probe_32.c b/arch/x86/kernel/probe_32.c
new file mode 100644
index 000000000000..22337b75de62
--- /dev/null
+++ b/arch/x86/kernel/probe_32.c
@@ -0,0 +1,411 @@
1/*
2 * Default generic APIC driver. This handles up to 8 CPUs.
3 *
4 * Copyright 2003 Andi Kleen, SuSE Labs.
5 * Subject to the GNU Public License, v.2
6 *
7 * Generic x86 APIC driver probe layer.
8 */
9#include <linux/threads.h>
10#include <linux/cpumask.h>
11#include <linux/string.h>
12#include <linux/kernel.h>
13#include <linux/ctype.h>
14#include <linux/init.h>
15#include <linux/errno.h>
16#include <asm/fixmap.h>
17#include <asm/mpspec.h>
18#include <asm/apicdef.h>
19#include <asm/genapic.h>
20#include <asm/setup.h>
21
22#include <linux/threads.h>
23#include <linux/cpumask.h>
24#include <asm/mpspec.h>
25#include <asm/genapic.h>
26#include <asm/fixmap.h>
27#include <asm/apicdef.h>
28#include <linux/kernel.h>
29#include <linux/string.h>
30#include <linux/smp.h>
31#include <linux/init.h>
32#include <asm/genapic.h>
33#include <asm/ipi.h>
34
35#include <linux/smp.h>
36#include <linux/init.h>
37#include <linux/interrupt.h>
38#include <asm/acpi.h>
39#include <asm/arch_hooks.h>
40#include <asm/e820.h>
41#include <asm/setup.h>
42
43#include <asm/genapic.h>
44
45#ifdef CONFIG_HOTPLUG_CPU
46#define DEFAULT_SEND_IPI (1)
47#else
48#define DEFAULT_SEND_IPI (0)
49#endif
50
51int no_broadcast = DEFAULT_SEND_IPI;
52
53#ifdef CONFIG_X86_LOCAL_APIC
54
55static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
56{
57 /*
58 * Careful. Some cpus do not strictly honor the set of cpus
59 * specified in the interrupt destination when using lowest
60 * priority interrupt delivery mode.
61 *
62 * In particular there was a hyperthreading cpu observed to
63 * deliver interrupts to the wrong hyperthread when only one
64 * hyperthread was specified in the interrupt desitination.
65 */
66 *retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } };
67}
68
69/* should be called last. */
70static int probe_default(void)
71{
72 return 1;
73}
74
75struct genapic apic_default = {
76
77 .name = "default",
78 .probe = probe_default,
79 .acpi_madt_oem_check = NULL,
80 .apic_id_registered = default_apic_id_registered,
81
82 .irq_delivery_mode = dest_LowestPrio,
83 /* logical delivery broadcast to all CPUs: */
84 .irq_dest_mode = 1,
85
86 .target_cpus = default_target_cpus,
87 .disable_esr = 0,
88 .dest_logical = APIC_DEST_LOGICAL,
89 .check_apicid_used = default_check_apicid_used,
90 .check_apicid_present = default_check_apicid_present,
91
92 .vector_allocation_domain = default_vector_allocation_domain,
93 .init_apic_ldr = default_init_apic_ldr,
94
95 .ioapic_phys_id_map = default_ioapic_phys_id_map,
96 .setup_apic_routing = default_setup_apic_routing,
97 .multi_timer_check = NULL,
98 .apicid_to_node = default_apicid_to_node,
99 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
100 .cpu_present_to_apicid = default_cpu_present_to_apicid,
101 .apicid_to_cpu_present = default_apicid_to_cpu_present,
102 .setup_portio_remap = NULL,
103 .check_phys_apicid_present = default_check_phys_apicid_present,
104 .enable_apic_mode = NULL,
105 .phys_pkg_id = default_phys_pkg_id,
106 .mps_oem_check = NULL,
107
108 .get_apic_id = default_get_apic_id,
109 .set_apic_id = NULL,
110 .apic_id_mask = 0x0F << 24,
111
112 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
113 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
114
115 .send_IPI_mask = default_send_IPI_mask_logical,
116 .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
117 .send_IPI_allbutself = default_send_IPI_allbutself,
118 .send_IPI_all = default_send_IPI_all,
119 .send_IPI_self = default_send_IPI_self,
120
121 .wakeup_cpu = NULL,
122 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
123 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
124
125 .wait_for_init_deassert = default_wait_for_init_deassert,
126
127 .smp_callin_clear_local_apic = NULL,
128 .store_NMI_vector = NULL,
129 .inquire_remote_apic = default_inquire_remote_apic,
130};
131
132extern struct genapic apic_numaq;
133extern struct genapic apic_summit;
134extern struct genapic apic_bigsmp;
135extern struct genapic apic_es7000;
136extern struct genapic apic_default;
137
138struct genapic *apic = &apic_default;
139
140static struct genapic *apic_probe[] __initdata = {
141#ifdef CONFIG_X86_NUMAQ
142 &apic_numaq,
143#endif
144#ifdef CONFIG_X86_SUMMIT
145 &apic_summit,
146#endif
147#ifdef CONFIG_X86_BIGSMP
148 &apic_bigsmp,
149#endif
150#ifdef CONFIG_X86_ES7000
151 &apic_es7000,
152#endif
153 &apic_default, /* must be last */
154 NULL,
155};
156
157static int cmdline_apic __initdata;
158static int __init parse_apic(char *arg)
159{
160 int i;
161
162 if (!arg)
163 return -EINVAL;
164
165 for (i = 0; apic_probe[i]; i++) {
166 if (!strcmp(apic_probe[i]->name, arg)) {
167 apic = apic_probe[i];
168 cmdline_apic = 1;
169 return 0;
170 }
171 }
172
173 if (x86_quirks->update_genapic)
174 x86_quirks->update_genapic();
175
176 /* Parsed again by __setup for debug/verbose */
177 return 0;
178}
179early_param("apic", parse_apic);
180
181void __init generic_bigsmp_probe(void)
182{
183#ifdef CONFIG_X86_BIGSMP
184 /*
185 * This routine is used to switch to bigsmp mode when
186 * - There is no apic= option specified by the user
187 * - generic_apic_probe() has chosen apic_default as the sub_arch
188 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
189 */
190
191 if (!cmdline_apic && apic == &apic_default) {
192 if (apic_bigsmp.probe()) {
193 apic = &apic_bigsmp;
194 if (x86_quirks->update_genapic)
195 x86_quirks->update_genapic();
196 printk(KERN_INFO "Overriding APIC driver with %s\n",
197 apic->name);
198 }
199 }
200#endif
201}
202
203void __init generic_apic_probe(void)
204{
205 if (!cmdline_apic) {
206 int i;
207 for (i = 0; apic_probe[i]; i++) {
208 if (apic_probe[i]->probe()) {
209 apic = apic_probe[i];
210 break;
211 }
212 }
213 /* Not visible without early console */
214 if (!apic_probe[i])
215 panic("Didn't find an APIC driver");
216
217 if (x86_quirks->update_genapic)
218 x86_quirks->update_genapic();
219 }
220 printk(KERN_INFO "Using APIC driver %s\n", apic->name);
221}
222
223/* These functions can switch the APIC even after the initial ->probe() */
224
225int __init
226generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
227{
228 int i;
229
230 for (i = 0; apic_probe[i]; ++i) {
231 if (!apic_probe[i]->mps_oem_check)
232 continue;
233 if (!apic_probe[i]->mps_oem_check(mpc, oem, productid))
234 continue;
235
236 if (!cmdline_apic) {
237 apic = apic_probe[i];
238 if (x86_quirks->update_genapic)
239 x86_quirks->update_genapic();
240 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
241 apic->name);
242 }
243 return 1;
244 }
245 return 0;
246}
247
248int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
249{
250 int i;
251
252 for (i = 0; apic_probe[i]; ++i) {
253 if (!apic_probe[i]->acpi_madt_oem_check)
254 continue;
255 if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id))
256 continue;
257
258 if (!cmdline_apic) {
259 apic = apic_probe[i];
260 if (x86_quirks->update_genapic)
261 x86_quirks->update_genapic();
262 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
263 apic->name);
264 }
265 return 1;
266 }
267 return 0;
268}
269
270#endif /* CONFIG_X86_LOCAL_APIC */
271
272/**
273 * pre_intr_init_hook - initialisation prior to setting up interrupt vectors
274 *
275 * Description:
276 * Perform any necessary interrupt initialisation prior to setting up
277 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
278 * interrupts should be initialised here if the machine emulates a PC
279 * in any way.
280 **/
281void __init pre_intr_init_hook(void)
282{
283 if (x86_quirks->arch_pre_intr_init) {
284 if (x86_quirks->arch_pre_intr_init())
285 return;
286 }
287 init_ISA_irqs();
288}
289
290/**
291 * intr_init_hook - post gate setup interrupt initialisation
292 *
293 * Description:
294 * Fill in any interrupts that may have been left out by the general
295 * init_IRQ() routine. interrupts having to do with the machine rather
296 * than the devices on the I/O bus (like APIC interrupts in intel MP
297 * systems) are started here.
298 **/
299void __init intr_init_hook(void)
300{
301 if (x86_quirks->arch_intr_init) {
302 if (x86_quirks->arch_intr_init())
303 return;
304 }
305}
306
307/**
308 * pre_setup_arch_hook - hook called prior to any setup_arch() execution
309 *
310 * Description:
311 * generally used to activate any machine specific identification
312 * routines that may be needed before setup_arch() runs. On Voyager
313 * this is used to get the board revision and type.
314 **/
315void __init pre_setup_arch_hook(void)
316{
317}
318
319/**
320 * trap_init_hook - initialise system specific traps
321 *
322 * Description:
323 * Called as the final act of trap_init(). Used in VISWS to initialise
324 * the various board specific APIC traps.
325 **/
326void __init trap_init_hook(void)
327{
328 if (x86_quirks->arch_trap_init) {
329 if (x86_quirks->arch_trap_init())
330 return;
331 }
332}
333
334static struct irqaction irq0 = {
335 .handler = timer_interrupt,
336 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
337 .mask = CPU_MASK_NONE,
338 .name = "timer"
339};
340
341/**
342 * pre_time_init_hook - do any specific initialisations before.
343 *
344 **/
345void __init pre_time_init_hook(void)
346{
347 if (x86_quirks->arch_pre_time_init)
348 x86_quirks->arch_pre_time_init();
349}
350
351/**
352 * time_init_hook - do any specific initialisations for the system timer.
353 *
354 * Description:
355 * Must plug the system timer interrupt source at HZ into the IRQ listed
356 * in irq_vectors.h:TIMER_IRQ
357 **/
358void __init time_init_hook(void)
359{
360 if (x86_quirks->arch_time_init) {
361 /*
362 * A nonzero return code does not mean failure, it means
363 * that the architecture quirk does not want any
364 * generic (timer) setup to be performed after this:
365 */
366 if (x86_quirks->arch_time_init())
367 return;
368 }
369
370 irq0.mask = cpumask_of_cpu(0);
371 setup_irq(0, &irq0);
372}
373
374#ifdef CONFIG_MCA
375/**
376 * mca_nmi_hook - hook into MCA specific NMI chain
377 *
378 * Description:
379 * The MCA (Microchannel Architecture) has an NMI chain for NMI sources
380 * along the MCA bus. Use this to hook into that chain if you will need
381 * it.
382 **/
383void mca_nmi_hook(void)
384{
385 /*
386 * If I recall correctly, there's a whole bunch of other things that
387 * we can do to check for NMI problems, but that's all I know about
388 * at the moment.
389 */
390 pr_warning("NMI generated from unknown source!\n");
391}
392#endif
393
394static __init int no_ipi_broadcast(char *str)
395{
396 get_option(&str, &no_broadcast);
397 pr_info("Using %s mode\n",
398 no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
399 return 1;
400}
401__setup("no_ipi_broadcast=", no_ipi_broadcast);
402
403static int __init print_ipi_mode(void)
404{
405 pr_info("Using IPI %s mode\n",
406 no_broadcast ? "No-Shortcut" : "Shortcut");
407 return 0;
408}
409
410late_initcall(print_ipi_mode);
411
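
probe_32.c picks an APIC driver by walking a NULL-terminated array of descriptors and taking the first one whose probe() hook returns non-zero, unless an apic= name on the command line already chose one. The same selection pattern as a standalone sketch with toy driver structs:

    #include <stdio.h>
    #include <string.h>

    struct toy_apic {
            const char *name;
            int (*probe)(void);
    };

    static int probe_bigsmp(void)  { return 0; }   /* pretend: not detected */
    static int probe_default(void) { return 1; }   /* always matches, last */

    static struct toy_apic apic_bigsmp  = { "bigsmp",  probe_bigsmp  };
    static struct toy_apic apic_default = { "default", probe_default };

    static struct toy_apic *apic_probe[] = {
            &apic_bigsmp,
            &apic_default,          /* must be last */
            NULL,
    };

    int main(int argc, char **argv)
    {
            struct toy_apic *apic = NULL;
            int i;

            /* An "apic=<name>" style argument wins over probing. */
            for (i = 0; argc > 1 && apic_probe[i]; i++)
                    if (!strcmp(apic_probe[i]->name, argv[1]))
                            apic = apic_probe[i];

            /* Otherwise take the first driver whose probe() matches. */
            for (i = 0; !apic && apic_probe[i]; i++)
                    if (apic_probe[i]->probe())
                            apic = apic_probe[i];

            printf("Using APIC driver %s\n", apic ? apic->name : "(none)");
            return 0;
    }
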
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms_32.c
index 675a48c404a5..071e7fea42e5 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms_32.c
@@ -18,7 +18,7 @@
18#include <asm/setup.h> 18#include <asm/setup.h>
19#include <asm/sections.h> 19#include <asm/sections.h>
20#include <asm/io.h> 20#include <asm/io.h>
21#include <setup_arch.h> 21#include <asm/setup_arch.h>
22 22
23static struct resource system_rom_resource = { 23static struct resource system_rom_resource = {
24 .name = "System ROM", 24 .name = "System ROM",
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c622772744d8..87b69d4fac16 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,13 +1,16 @@
1#include <linux/errno.h> 1#include <linux/errno.h>
2#include <linux/kernel.h> 2#include <linux/kernel.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4#include <asm/idle.h>
4#include <linux/smp.h> 5#include <linux/smp.h>
5#include <linux/slab.h> 6#include <linux/slab.h>
6#include <linux/sched.h> 7#include <linux/sched.h>
7#include <linux/module.h> 8#include <linux/module.h>
8#include <linux/pm.h> 9#include <linux/pm.h>
9#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/ftrace.h>
10#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/apic.h>
11 14
12unsigned long idle_halt; 15unsigned long idle_halt;
13EXPORT_SYMBOL(idle_halt); 16EXPORT_SYMBOL(idle_halt);
@@ -100,6 +103,9 @@ static inline int hlt_use_halt(void)
100void default_idle(void) 103void default_idle(void)
101{ 104{
102 if (hlt_use_halt()) { 105 if (hlt_use_halt()) {
106 struct power_trace it;
107
108 trace_power_start(&it, POWER_CSTATE, 1);
103 current_thread_info()->status &= ~TS_POLLING; 109 current_thread_info()->status &= ~TS_POLLING;
104 /* 110 /*
105 * TS_POLLING-cleared state must be visible before we 111 * TS_POLLING-cleared state must be visible before we
@@ -112,6 +118,7 @@ void default_idle(void)
112 else 118 else
113 local_irq_enable(); 119 local_irq_enable();
114 current_thread_info()->status |= TS_POLLING; 120 current_thread_info()->status |= TS_POLLING;
121 trace_power_end(&it);
115 } else { 122 } else {
116 local_irq_enable(); 123 local_irq_enable();
117 /* loop is done by the caller */ 124 /* loop is done by the caller */
@@ -122,6 +129,21 @@ void default_idle(void)
122EXPORT_SYMBOL(default_idle); 129EXPORT_SYMBOL(default_idle);
123#endif 130#endif
124 131
132void stop_this_cpu(void *dummy)
133{
134 local_irq_disable();
135 /*
136 * Remove this CPU:
137 */
138 cpu_clear(smp_processor_id(), cpu_online_map);
139 disable_local_APIC();
140
141 for (;;) {
142 if (hlt_works(smp_processor_id()))
143 halt();
144 }
145}
146
125static void do_nothing(void *unused) 147static void do_nothing(void *unused)
126{ 148{
127} 149}
@@ -154,24 +176,37 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
154 */ 176 */
155void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 177void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
156{ 178{
179 struct power_trace it;
180
181 trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
157 if (!need_resched()) { 182 if (!need_resched()) {
183 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
184 clflush((void *)&current_thread_info()->flags);
185
158 __monitor((void *)&current_thread_info()->flags, 0, 0); 186 __monitor((void *)&current_thread_info()->flags, 0, 0);
159 smp_mb(); 187 smp_mb();
160 if (!need_resched()) 188 if (!need_resched())
161 __mwait(ax, cx); 189 __mwait(ax, cx);
162 } 190 }
191 trace_power_end(&it);
163} 192}
164 193
165/* Default MONITOR/MWAIT with no hints, used for default C1 state */ 194/* Default MONITOR/MWAIT with no hints, used for default C1 state */
166static void mwait_idle(void) 195static void mwait_idle(void)
167{ 196{
197 struct power_trace it;
168 if (!need_resched()) { 198 if (!need_resched()) {
199 trace_power_start(&it, POWER_CSTATE, 1);
200 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
201 clflush((void *)&current_thread_info()->flags);
202
169 __monitor((void *)&current_thread_info()->flags, 0, 0); 203 __monitor((void *)&current_thread_info()->flags, 0, 0);
170 smp_mb(); 204 smp_mb();
171 if (!need_resched()) 205 if (!need_resched())
172 __sti_mwait(0, 0); 206 __sti_mwait(0, 0);
173 else 207 else
174 local_irq_enable(); 208 local_irq_enable();
209 trace_power_end(&it);
175 } else 210 } else
176 local_irq_enable(); 211 local_irq_enable();
177} 212}
@@ -183,9 +218,13 @@ static void mwait_idle(void)
183 */ 218 */
184static void poll_idle(void) 219static void poll_idle(void)
185{ 220{
221 struct power_trace it;
222
223 trace_power_start(&it, POWER_CSTATE, 0);
186 local_irq_enable(); 224 local_irq_enable();
187 while (!need_resched()) 225 while (!need_resched())
188 cpu_relax(); 226 cpu_relax();
227 trace_power_end(&it);
189} 228}
190 229
191/* 230/*
@@ -270,7 +309,7 @@ static void c1e_idle(void)
270 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 309 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
271 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 310 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
272 c1e_detected = 1; 311 c1e_detected = 1;
273 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 312 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
274 mark_tsc_unstable("TSC halt in AMD C1E"); 313 mark_tsc_unstable("TSC halt in AMD C1E");
275 printk(KERN_INFO "System has AMD C1E enabled\n"); 314 printk(KERN_INFO "System has AMD C1E enabled\n");
276 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); 315 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
@@ -311,7 +350,7 @@ static void c1e_idle(void)
311 350
312void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) 351void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
313{ 352{
314#ifdef CONFIG_X86_SMP 353#ifdef CONFIG_SMP
315 if (pm_idle == poll_idle && smp_num_siblings > 1) { 354 if (pm_idle == poll_idle && smp_num_siblings > 1) {
316 printk(KERN_WARNING "WARNING: polling idle and HT enabled," 355 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
317 " performance may degrade.\n"); 356 " performance may degrade.\n");
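
process.c chooses between poll_idle, mwait_idle and default_idle by storing one of them in the pm_idle function pointer, which the per-CPU idle loop then calls. A compact user-space model of that selection, with stub idle routines and invented capability flags:

    #include <stdio.h>

    static void default_idle(void) { puts("hlt-based idle"); }
    static void mwait_idle(void)   { puts("monitor/mwait idle"); }
    static void poll_idle(void)    { puts("busy-poll idle"); }

    /* The chosen routine, called from the per-CPU idle loop. */
    static void (*pm_idle)(void);

    static void select_idle_routine(int cpu_has_mwait, int force_poll)
    {
            if (force_poll)
                    pm_idle = poll_idle;     /* lowest latency, burns power */
            else if (cpu_has_mwait)
                    pm_idle = mwait_idle;    /* preferred when supported */
            else
                    pm_idle = default_idle;
    }

    int main(void)
    {
            select_idle_routine(1, 0);
            pm_idle();
            return 0;
    }
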
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302fe6d45..fec79ad85dc6 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -11,6 +11,7 @@
11 11
12#include <stdarg.h> 12#include <stdarg.h>
13 13
14#include <linux/stackprotector.h>
14#include <linux/cpu.h> 15#include <linux/cpu.h>
15#include <linux/errno.h> 16#include <linux/errno.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
@@ -38,11 +39,13 @@
38#include <linux/percpu.h> 39#include <linux/percpu.h>
39#include <linux/prctl.h> 40#include <linux/prctl.h>
40#include <linux/dmi.h> 41#include <linux/dmi.h>
42#include <linux/ftrace.h>
43#include <linux/uaccess.h>
44#include <linux/io.h>
45#include <linux/kdebug.h>
41 46
42#include <asm/uaccess.h>
43#include <asm/pgtable.h> 47#include <asm/pgtable.h>
44#include <asm/system.h> 48#include <asm/system.h>
45#include <asm/io.h>
46#include <asm/ldt.h> 49#include <asm/ldt.h>
47#include <asm/processor.h> 50#include <asm/processor.h>
48#include <asm/i387.h> 51#include <asm/i387.h>
@@ -55,19 +58,15 @@
55 58
56#include <asm/tlbflush.h> 59#include <asm/tlbflush.h>
57#include <asm/cpu.h> 60#include <asm/cpu.h>
58#include <asm/kdebug.h>
59#include <asm/idle.h> 61#include <asm/idle.h>
60#include <asm/syscalls.h> 62#include <asm/syscalls.h>
61#include <asm/smp.h> 63#include <asm/ds.h>
62 64
63asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 65asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
64 66
65DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; 67DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
66EXPORT_PER_CPU_SYMBOL(current_task); 68EXPORT_PER_CPU_SYMBOL(current_task);
67 69
68DEFINE_PER_CPU(int, cpu_number);
69EXPORT_PER_CPU_SYMBOL(cpu_number);
70
71/* 70/*
72 * Return saved PC of a blocked thread. 71 * Return saved PC of a blocked thread.
73 */ 72 */
@@ -93,6 +92,15 @@ void cpu_idle(void)
93{ 92{
94 int cpu = smp_processor_id(); 93 int cpu = smp_processor_id();
95 94
95 /*
96 * If we're the non-boot CPU, nothing set the stack canary up
97 * for us. CPU0 already has it initialized but no harm in
98 * doing it again. This is a good place for updating it, as
 99	 * we won't ever return from this function (so the invalid
 100	 * canaries already on the stack won't ever trigger).
101 */
102 boot_init_stack_canary();
103
96 current_thread_info()->status |= TS_POLLING; 104 current_thread_info()->status |= TS_POLLING;
97 105
98 /* endless idle loop with no priority at all */ 106 /* endless idle loop with no priority at all */
@@ -110,7 +118,6 @@ void cpu_idle(void)
110 play_dead(); 118 play_dead();
111 119
112 local_irq_disable(); 120 local_irq_disable();
113 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
114 /* Don't trace irqs off for idle */ 121 /* Don't trace irqs off for idle */
115 stop_critical_timings(); 122 stop_critical_timings();
116 pm_idle(); 123 pm_idle();
@@ -134,7 +141,7 @@ void __show_regs(struct pt_regs *regs, int all)
134 if (user_mode_vm(regs)) { 141 if (user_mode_vm(regs)) {
135 sp = regs->sp; 142 sp = regs->sp;
136 ss = regs->ss & 0xffff; 143 ss = regs->ss & 0xffff;
137 savesegment(gs, gs); 144 gs = get_user_gs(regs);
138 } else { 145 } else {
139 sp = (unsigned long) (&regs->sp); 146 sp = (unsigned long) (&regs->sp);
140 savesegment(ss, ss); 147 savesegment(ss, ss);
@@ -203,7 +210,7 @@ extern void kernel_thread_helper(void);
203/* 210/*
204 * Create a kernel thread 211 * Create a kernel thread
205 */ 212 */
206int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) 213int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
207{ 214{
208 struct pt_regs regs; 215 struct pt_regs regs;
209 216
@@ -215,6 +222,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
215 regs.ds = __USER_DS; 222 regs.ds = __USER_DS;
216 regs.es = __USER_DS; 223 regs.es = __USER_DS;
217 regs.fs = __KERNEL_PERCPU; 224 regs.fs = __KERNEL_PERCPU;
225 regs.gs = __KERNEL_STACK_CANARY;
218 regs.orig_ax = -1; 226 regs.orig_ax = -1;
219 regs.ip = (unsigned long) kernel_thread_helper; 227 regs.ip = (unsigned long) kernel_thread_helper;
220 regs.cs = __KERNEL_CS | get_kernel_rpl(); 228 regs.cs = __KERNEL_CS | get_kernel_rpl();
@@ -250,14 +258,8 @@ void exit_thread(void)
250 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 258 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
251 put_cpu(); 259 put_cpu();
252 } 260 }
253#ifdef CONFIG_X86_DS 261
254 /* Free any DS contexts that have not been properly released. */ 262 ds_exit_thread(current);
255 if (unlikely(current->thread.ds_ctx)) {
256 /* we clear debugctl to make sure DS is not used. */
257 update_debugctlmsr(0);
258 ds_free(current->thread.ds_ctx);
259 }
260#endif /* CONFIG_X86_DS */
261} 263}
262 264
263void flush_thread(void) 265void flush_thread(void)
@@ -270,7 +272,7 @@ void flush_thread(void)
270 tsk->thread.debugreg3 = 0; 272 tsk->thread.debugreg3 = 0;
271 tsk->thread.debugreg6 = 0; 273 tsk->thread.debugreg6 = 0;
272 tsk->thread.debugreg7 = 0; 274 tsk->thread.debugreg7 = 0;
273 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 275 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
274 clear_tsk_thread_flag(tsk, TIF_DEBUG); 276 clear_tsk_thread_flag(tsk, TIF_DEBUG);
275 /* 277 /*
276 * Forget coprocessor state.. 278 * Forget coprocessor state..
@@ -297,9 +299,9 @@ void prepare_to_copy(struct task_struct *tsk)
297 299
298int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, 300int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
299 unsigned long unused, 301 unsigned long unused,
300 struct task_struct * p, struct pt_regs * regs) 302 struct task_struct *p, struct pt_regs *regs)
301{ 303{
302 struct pt_regs * childregs; 304 struct pt_regs *childregs;
303 struct task_struct *tsk; 305 struct task_struct *tsk;
304 int err; 306 int err;
305 307
@@ -313,7 +315,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
313 315
314 p->thread.ip = (unsigned long) ret_from_fork; 316 p->thread.ip = (unsigned long) ret_from_fork;
315 317
316 savesegment(gs, p->thread.gs); 318 task_user_gs(p) = get_user_gs(regs);
317 319
318 tsk = current; 320 tsk = current;
319 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 321 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -339,13 +341,19 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
339 kfree(p->thread.io_bitmap_ptr); 341 kfree(p->thread.io_bitmap_ptr);
340 p->thread.io_bitmap_max = 0; 342 p->thread.io_bitmap_max = 0;
341 } 343 }
344
345 ds_copy_thread(p, current);
346
347 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
348 p->thread.debugctlmsr = 0;
349
342 return err; 350 return err;
343} 351}
344 352
345void 353void
346start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 354start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
347{ 355{
348 __asm__("movl %0, %%gs" :: "r"(0)); 356 set_user_gs(regs, 0);
349 regs->fs = 0; 357 regs->fs = 0;
350 set_fs(USER_DS); 358 set_fs(USER_DS);
351 regs->ds = __USER_DS; 359 regs->ds = __USER_DS;
@@ -419,48 +427,19 @@ int set_tsc_mode(unsigned int val)
419 return 0; 427 return 0;
420} 428}
421 429
422#ifdef CONFIG_X86_DS
423static int update_debugctl(struct thread_struct *prev,
424 struct thread_struct *next, unsigned long debugctl)
425{
426 unsigned long ds_prev = 0;
427 unsigned long ds_next = 0;
428
429 if (prev->ds_ctx)
430 ds_prev = (unsigned long)prev->ds_ctx->ds;
431 if (next->ds_ctx)
432 ds_next = (unsigned long)next->ds_ctx->ds;
433
434 if (ds_next != ds_prev) {
435 /* we clear debugctl to make sure DS
436 * is not in use when we change it */
437 debugctl = 0;
438 update_debugctlmsr(0);
439 wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
440 }
441 return debugctl;
442}
443#else
444static int update_debugctl(struct thread_struct *prev,
445 struct thread_struct *next, unsigned long debugctl)
446{
447 return debugctl;
448}
449#endif /* CONFIG_X86_DS */
450
451static noinline void 430static noinline void
452__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 431__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
453 struct tss_struct *tss) 432 struct tss_struct *tss)
454{ 433{
455 struct thread_struct *prev, *next; 434 struct thread_struct *prev, *next;
456 unsigned long debugctl;
457 435
458 prev = &prev_p->thread; 436 prev = &prev_p->thread;
459 next = &next_p->thread; 437 next = &next_p->thread;
460 438
461 debugctl = update_debugctl(prev, next, prev->debugctlmsr); 439 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
462 440 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
463 if (next->debugctlmsr != debugctl) 441 ds_switch_to(prev_p, next_p);
442 else if (next->debugctlmsr != prev->debugctlmsr)
464 update_debugctlmsr(next->debugctlmsr); 443 update_debugctlmsr(next->debugctlmsr);
465 444
466 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 445 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -482,15 +461,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
482 hard_enable_TSC(); 461 hard_enable_TSC();
483 } 462 }
484 463
485#ifdef CONFIG_X86_PTRACE_BTS
486 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
487 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
488
489 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
490 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
491#endif /* CONFIG_X86_PTRACE_BTS */
492
493
494 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 464 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
495 /* 465 /*
496 * Disable the bitmap via an invalid offset. We still cache 466 * Disable the bitmap via an invalid offset. We still cache
@@ -548,7 +518,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
548 * the task-switch, and shows up in ret_from_fork in entry.S, 518 * the task-switch, and shows up in ret_from_fork in entry.S,
549 * for example. 519 * for example.
550 */ 520 */
551struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) 521__notrace_funcgraph struct task_struct *
522__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
552{ 523{
553 struct thread_struct *prev = &prev_p->thread, 524 struct thread_struct *prev = &prev_p->thread,
554 *next = &next_p->thread; 525 *next = &next_p->thread;
@@ -579,7 +550,7 @@ struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct
579 * used %fs or %gs (it does not today), or if the kernel is 550 * used %fs or %gs (it does not today), or if the kernel is
580 * running inside of a hypervisor layer. 551 * running inside of a hypervisor layer.
581 */ 552 */
582 savesegment(gs, prev->gs); 553 lazy_save_gs(prev->gs);
583 554
584 /* 555 /*
585 * Load the per-thread Thread-Local Storage descriptor. 556 * Load the per-thread Thread-Local Storage descriptor.
@@ -625,31 +596,31 @@ struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct
625 * Restore %gs if needed (which is common) 596 * Restore %gs if needed (which is common)
626 */ 597 */
627 if (prev->gs | next->gs) 598 if (prev->gs | next->gs)
628 loadsegment(gs, next->gs); 599 lazy_load_gs(next->gs);
629 600
630 x86_write_percpu(current_task, next_p); 601 percpu_write(current_task, next_p);
631 602
632 return prev_p; 603 return prev_p;
633} 604}
634 605
635asmlinkage int sys_fork(struct pt_regs regs) 606int sys_fork(struct pt_regs *regs)
636{ 607{
637 return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL); 608 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
638} 609}
639 610
640asmlinkage int sys_clone(struct pt_regs regs) 611int sys_clone(struct pt_regs *regs)
641{ 612{
642 unsigned long clone_flags; 613 unsigned long clone_flags;
643 unsigned long newsp; 614 unsigned long newsp;
644 int __user *parent_tidptr, *child_tidptr; 615 int __user *parent_tidptr, *child_tidptr;
645 616
646 clone_flags = regs.bx; 617 clone_flags = regs->bx;
647 newsp = regs.cx; 618 newsp = regs->cx;
648 parent_tidptr = (int __user *)regs.dx; 619 parent_tidptr = (int __user *)regs->dx;
649 child_tidptr = (int __user *)regs.di; 620 child_tidptr = (int __user *)regs->di;
650 if (!newsp) 621 if (!newsp)
651 newsp = regs.sp; 622 newsp = regs->sp;
652 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); 623 return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
653} 624}
654 625
655/* 626/*
@@ -662,27 +633,27 @@ asmlinkage int sys_clone(struct pt_regs regs)
662 * do not have enough call-clobbered registers to hold all 633 * do not have enough call-clobbered registers to hold all
663 * the information you need. 634 * the information you need.
664 */ 635 */
665asmlinkage int sys_vfork(struct pt_regs regs) 636int sys_vfork(struct pt_regs *regs)
666{ 637{
667 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL); 638 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, NULL, NULL);
668} 639}
669 640
670/* 641/*
671 * sys_execve() executes a new program. 642 * sys_execve() executes a new program.
672 */ 643 */
673asmlinkage int sys_execve(struct pt_regs regs) 644int sys_execve(struct pt_regs *regs)
674{ 645{
675 int error; 646 int error;
676 char * filename; 647 char *filename;
677 648
678 filename = getname((char __user *) regs.bx); 649 filename = getname((char __user *) regs->bx);
679 error = PTR_ERR(filename); 650 error = PTR_ERR(filename);
680 if (IS_ERR(filename)) 651 if (IS_ERR(filename))
681 goto out; 652 goto out;
682 error = do_execve(filename, 653 error = do_execve(filename,
683 (char __user * __user *) regs.cx, 654 (char __user * __user *) regs->cx,
684 (char __user * __user *) regs.dx, 655 (char __user * __user *) regs->dx,
685 &regs); 656 regs);
686 if (error == 0) { 657 if (error == 0) {
687 /* Make sure we don't return using sysenter.. */ 658 /* Make sure we don't return using sysenter.. */
688 set_thread_flag(TIF_IRET); 659 set_thread_flag(TIF_IRET);
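
The process_32.c conversion above hands syscall handlers a pointer to the saved register frame instead of a by-value copy, and the handler pulls its arguments out of that frame. A simplified model with a toy pt_regs layout (not the real structure):

    #include <stdio.h>

    /* Toy register frame; the real struct pt_regs has more fields. */
    struct toy_pt_regs {
            unsigned long bx, cx, dx, si, di, bp, ax, sp;
    };

    /* clone(2)-style handler: arguments live in the saved registers. */
    static long toy_sys_clone(struct toy_pt_regs *regs)
    {
            unsigned long clone_flags = regs->bx;
            unsigned long newsp       = regs->cx ? regs->cx : regs->sp;

            printf("clone: flags=%#lx newsp=%#lx\n", clone_flags, newsp);
            return 0;
    }

    int main(void)
    {
            struct toy_pt_regs regs = { .bx = 0x11, .cx = 0, .sp = 0xbffff000 };

            return (int)toy_sys_clone(&regs);
    }
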
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c958120fb1b6..836ef6575f01 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -16,6 +16,7 @@
16 16
17#include <stdarg.h> 17#include <stdarg.h>
18 18
19#include <linux/stackprotector.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <linux/errno.h> 21#include <linux/errno.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
@@ -39,22 +40,30 @@
39#include <linux/prctl.h> 40#include <linux/prctl.h>
40#include <linux/uaccess.h> 41#include <linux/uaccess.h>
41#include <linux/io.h> 42#include <linux/io.h>
43#include <linux/ftrace.h>
44#include <linux/dmi.h>
42 45
43#include <asm/pgtable.h> 46#include <asm/pgtable.h>
44#include <asm/system.h> 47#include <asm/system.h>
45#include <asm/processor.h> 48#include <asm/processor.h>
46#include <asm/i387.h> 49#include <asm/i387.h>
47#include <asm/mmu_context.h> 50#include <asm/mmu_context.h>
48#include <asm/pda.h>
49#include <asm/prctl.h> 51#include <asm/prctl.h>
50#include <asm/desc.h> 52#include <asm/desc.h>
51#include <asm/proto.h> 53#include <asm/proto.h>
52#include <asm/ia32.h> 54#include <asm/ia32.h>
53#include <asm/idle.h> 55#include <asm/idle.h>
54#include <asm/syscalls.h> 56#include <asm/syscalls.h>
57#include <asm/ds.h>
55 58
56asmlinkage extern void ret_from_fork(void); 59asmlinkage extern void ret_from_fork(void);
57 60
61DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
62EXPORT_PER_CPU_SYMBOL(current_task);
63
64DEFINE_PER_CPU(unsigned long, old_rsp);
65static DEFINE_PER_CPU(unsigned char, is_idle);
66
58unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; 67unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
59 68
60static ATOMIC_NOTIFIER_HEAD(idle_notifier); 69static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -73,13 +82,13 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
73 82
74void enter_idle(void) 83void enter_idle(void)
75{ 84{
76 write_pda(isidle, 1); 85 percpu_write(is_idle, 1);
77 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); 86 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
78} 87}
79 88
80static void __exit_idle(void) 89static void __exit_idle(void)
81{ 90{
82 if (test_and_clear_bit_pda(0, isidle) == 0) 91 if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
83 return; 92 return;
84 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); 93 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
85} 94}
@@ -109,6 +118,16 @@ static inline void play_dead(void)
109void cpu_idle(void) 118void cpu_idle(void)
110{ 119{
111 current_thread_info()->status |= TS_POLLING; 120 current_thread_info()->status |= TS_POLLING;
121
122 /*
123 * If we're the non-boot CPU, nothing set the stack canary up
124 * for us. CPU0 already has it initialized but no harm in
125 * doing it again. This is a good place for updating it, as
 126	 * we won't ever return from this function (so the invalid
 127	 * canaries already on the stack won't ever trigger).
128 */
129 boot_init_stack_canary();
130
112 /* endless idle loop with no priority at all */ 131 /* endless idle loop with no priority at all */
113 while (1) { 132 while (1) {
114 tick_nohz_stop_sched_tick(1); 133 tick_nohz_stop_sched_tick(1);
@@ -149,14 +168,18 @@ void __show_regs(struct pt_regs *regs, int all)
149 unsigned long d0, d1, d2, d3, d6, d7; 168 unsigned long d0, d1, d2, d3, d6, d7;
150 unsigned int fsindex, gsindex; 169 unsigned int fsindex, gsindex;
151 unsigned int ds, cs, es; 170 unsigned int ds, cs, es;
171 const char *board;
152 172
153 printk("\n"); 173 printk("\n");
154 print_modules(); 174 print_modules();
155 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n", 175 board = dmi_get_system_info(DMI_PRODUCT_NAME);
176 if (!board)
177 board = "";
178 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
156 current->pid, current->comm, print_tainted(), 179 current->pid, current->comm, print_tainted(),
157 init_utsname()->release, 180 init_utsname()->release,
158 (int)strcspn(init_utsname()->version, " "), 181 (int)strcspn(init_utsname()->version, " "),
159 init_utsname()->version); 182 init_utsname()->version, board);
160 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); 183 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
161 printk_address(regs->ip, 1); 184 printk_address(regs->ip, 1);
162 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, 185 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
@@ -235,14 +258,8 @@ void exit_thread(void)
235 t->io_bitmap_max = 0; 258 t->io_bitmap_max = 0;
236 put_cpu(); 259 put_cpu();
237 } 260 }
238#ifdef CONFIG_X86_DS 261
239 /* Free any DS contexts that have not been properly released. */ 262 ds_exit_thread(current);
240 if (unlikely(t->ds_ctx)) {
241 /* we clear debugctl to make sure DS is not used. */
242 update_debugctlmsr(0);
243 ds_free(t->ds_ctx);
244 }
245#endif /* CONFIG_X86_DS */
246} 263}
247 264
248void flush_thread(void) 265void flush_thread(void)
@@ -372,6 +389,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
372 if (err) 389 if (err)
373 goto out; 390 goto out;
374 } 391 }
392
393 ds_copy_thread(p, me);
394
395 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
396 p->thread.debugctlmsr = 0;
397
375 err = 0; 398 err = 0;
376out: 399out:
377 if (err && p->thread.io_bitmap_ptr) { 400 if (err && p->thread.io_bitmap_ptr) {
@@ -390,7 +413,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
390 load_gs_index(0); 413 load_gs_index(0);
391 regs->ip = new_ip; 414 regs->ip = new_ip;
392 regs->sp = new_sp; 415 regs->sp = new_sp;
393 write_pda(oldrsp, new_sp); 416 percpu_write(old_rsp, new_sp);
394 regs->cs = __USER_CS; 417 regs->cs = __USER_CS;
395 regs->ss = __USER_DS; 418 regs->ss = __USER_DS;
396 regs->flags = 0x200; 419 regs->flags = 0x200;
@@ -470,35 +493,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
470 struct tss_struct *tss) 493 struct tss_struct *tss)
471{ 494{
472 struct thread_struct *prev, *next; 495 struct thread_struct *prev, *next;
473 unsigned long debugctl;
474 496
475 prev = &prev_p->thread, 497 prev = &prev_p->thread,
476 next = &next_p->thread; 498 next = &next_p->thread;
477 499
478 debugctl = prev->debugctlmsr; 500 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
479 501 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
480#ifdef CONFIG_X86_DS 502 ds_switch_to(prev_p, next_p);
481 { 503 else if (next->debugctlmsr != prev->debugctlmsr)
482 unsigned long ds_prev = 0, ds_next = 0;
483
484 if (prev->ds_ctx)
485 ds_prev = (unsigned long)prev->ds_ctx->ds;
486 if (next->ds_ctx)
487 ds_next = (unsigned long)next->ds_ctx->ds;
488
489 if (ds_next != ds_prev) {
490 /*
491 * We clear debugctl to make sure DS
492 * is not in use when we change it:
493 */
494 debugctl = 0;
495 update_debugctlmsr(0);
496 wrmsrl(MSR_IA32_DS_AREA, ds_next);
497 }
498 }
499#endif /* CONFIG_X86_DS */
500
501 if (next->debugctlmsr != debugctl)
502 update_debugctlmsr(next->debugctlmsr); 504 update_debugctlmsr(next->debugctlmsr);
503 505
504 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 506 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -533,14 +535,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
533 */ 535 */
534 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 536 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
535 } 537 }
536
537#ifdef CONFIG_X86_PTRACE_BTS
538 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
539 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
540
541 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
542 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
543#endif /* CONFIG_X86_PTRACE_BTS */
544} 538}
545 539
546/* 540/*
@@ -551,8 +545,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
551 * - could test fs/gs bitsliced 545 * - could test fs/gs bitsliced
552 * 546 *
553 * Kprobes not supported here. Set the probe on schedule instead. 547 * Kprobes not supported here. Set the probe on schedule instead.
548 * Function graph tracer not supported too.
554 */ 549 */
555struct task_struct * 550__notrace_funcgraph struct task_struct *
556__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 551__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
557{ 552{
558 struct thread_struct *prev = &prev_p->thread; 553 struct thread_struct *prev = &prev_p->thread;
@@ -639,21 +634,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
639 /* 634 /*
640 * Switch the PDA and FPU contexts. 635 * Switch the PDA and FPU contexts.
641 */ 636 */
642 prev->usersp = read_pda(oldrsp); 637 prev->usersp = percpu_read(old_rsp);
643 write_pda(oldrsp, next->usersp); 638 percpu_write(old_rsp, next->usersp);
644 write_pda(pcurrent, next_p); 639 percpu_write(current_task, next_p);
645 640
646 write_pda(kernelstack, 641 percpu_write(kernel_stack,
647 (unsigned long)task_stack_page(next_p) + 642 (unsigned long)task_stack_page(next_p) +
648 THREAD_SIZE - PDA_STACKOFFSET); 643 THREAD_SIZE - KERNEL_STACK_OFFSET);
649#ifdef CONFIG_CC_STACKPROTECTOR
650 write_pda(stack_canary, next_p->stack_canary);
651 /*
652 * Build time only check to make sure the stack_canary is at
653 * offset 40 in the pda; this is a gcc ABI requirement
654 */
655 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
656#endif
657 644
658 /* 645 /*
659 * Now maybe reload the debug registers and handle I/O bitmaps 646 * Now maybe reload the debug registers and handle I/O bitmaps
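
process_64.c replaces PDA fields such as oldrsp and pcurrent with ordinary per-CPU variables accessed through percpu_read/percpu_write. A toy model of per-CPU storage indexed by CPU number; the kernel actually reaches its slot through %gs-relative addressing rather than an array lookup:

    #include <stdio.h>

    #define NR_CPUS 4

    /* One slot per CPU; the kernel reaches its slot via %gs-relative access. */
    static unsigned long old_rsp[NR_CPUS];

    #define percpu_write(var, cpu, val)  ((var)[(cpu)] = (val))
    #define percpu_read(var, cpu)        ((var)[(cpu)])

    int main(void)
    {
            int cpu = 2;    /* pretend we are running on CPU 2 */

            percpu_write(old_rsp, cpu, 0x7fffffffe000UL);
            printf("cpu%d old_rsp = %#lx\n", cpu, percpu_read(old_rsp, cpu));
            return 0;
    }
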
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0a6d8c12e10d..7ec39ab37a2d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -75,10 +75,7 @@ static inline bool invalid_selector(u16 value)
75static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) 75static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
76{ 76{
77 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); 77 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
78 regno >>= 2; 78 return &regs->bx + (regno >> 2);
79 if (regno > FS)
80 --regno;
81 return &regs->bx + regno;
82} 79}
83 80
84static u16 get_segment_reg(struct task_struct *task, unsigned long offset) 81static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
@@ -90,9 +87,10 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
90 if (offset != offsetof(struct user_regs_struct, gs)) 87 if (offset != offsetof(struct user_regs_struct, gs))
91 retval = *pt_regs_access(task_pt_regs(task), offset); 88 retval = *pt_regs_access(task_pt_regs(task), offset);
92 else { 89 else {
93 retval = task->thread.gs;
94 if (task == current) 90 if (task == current)
95 savesegment(gs, retval); 91 retval = get_user_gs(task_pt_regs(task));
92 else
93 retval = task_user_gs(task);
96 } 94 }
97 return retval; 95 return retval;
98} 96}
@@ -126,13 +124,10 @@ static int set_segment_reg(struct task_struct *task,
126 break; 124 break;
127 125
128 case offsetof(struct user_regs_struct, gs): 126 case offsetof(struct user_regs_struct, gs):
129 task->thread.gs = value;
130 if (task == current) 127 if (task == current)
131 /* 128 set_user_gs(task_pt_regs(task), value);
132 * The user-mode %gs is not affected by 129 else
133 * kernel entry, so we must update the CPU. 130 task_user_gs(task) = value;
134 */
135 loadsegment(gs, value);
136 } 131 }
137 132
138 return 0; 133 return 0;
@@ -581,158 +576,91 @@ static int ioperm_get(struct task_struct *target,
581} 576}
582 577
583#ifdef CONFIG_X86_PTRACE_BTS 578#ifdef CONFIG_X86_PTRACE_BTS
584/*
585 * The configuration for a particular BTS hardware implementation.
586 */
587struct bts_configuration {
588 /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
589 unsigned char sizeof_bts;
590 /* the size of a field in the BTS record in bytes */
591 unsigned char sizeof_field;
592 /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
593 unsigned long debugctl_mask;
594};
595static struct bts_configuration bts_cfg;
596
597#define BTS_MAX_RECORD_SIZE (8 * 3)
598
599
600/*
601 * Branch Trace Store (BTS) uses the following format. Different
602 * architectures vary in the size of those fields.
603 * - source linear address
604 * - destination linear address
605 * - flags
606 *
607 * Later architectures use 64bit pointers throughout, whereas earlier
608 * architectures use 32bit pointers in 32bit mode.
609 *
610 * We compute the base address for the first 8 fields based on:
611 * - the field size stored in the DS configuration
612 * - the relative field position
613 *
614 * In order to store additional information in the BTS buffer, we use
615 * a special source address to indicate that the record requires
616 * special interpretation.
617 *
618 * Netburst indicated via a bit in the flags field whether the branch
619 * was predicted; this is ignored.
620 */
621
622enum bts_field {
623 bts_from = 0,
624 bts_to,
625 bts_flags,
626
627 bts_escape = (unsigned long)-1,
628 bts_qual = bts_to,
629 bts_jiffies = bts_flags
630};
631
632static inline unsigned long bts_get(const char *base, enum bts_field field)
633{
634 base += (bts_cfg.sizeof_field * field);
635 return *(unsigned long *)base;
636}
637
638static inline void bts_set(char *base, enum bts_field field, unsigned long val)
639{
640 base += (bts_cfg.sizeof_field * field);;
641 (*(unsigned long *)base) = val;
642}
643
644/*
645 * Translate a BTS record from the raw format into the bts_struct format
646 *
647 * out (out): bts_struct interpretation
648 * raw: raw BTS record
649 */
650static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
651{
652 memset(out, 0, sizeof(*out));
653 if (bts_get(raw, bts_from) == bts_escape) {
654 out->qualifier = bts_get(raw, bts_qual);
655 out->variant.jiffies = bts_get(raw, bts_jiffies);
656 } else {
657 out->qualifier = BTS_BRANCH;
658 out->variant.lbr.from_ip = bts_get(raw, bts_from);
659 out->variant.lbr.to_ip = bts_get(raw, bts_to);
660 }
661}
662
663static int ptrace_bts_read_record(struct task_struct *child, size_t index, 579static int ptrace_bts_read_record(struct task_struct *child, size_t index,
664 struct bts_struct __user *out) 580 struct bts_struct __user *out)
665{ 581{
666 struct bts_struct ret; 582 const struct bts_trace *trace;
667 const void *bts_record; 583 struct bts_struct bts;
668 size_t bts_index, bts_end; 584 const unsigned char *at;
669 int error; 585 int error;
670 586
671 error = ds_get_bts_end(child, &bts_end); 587 trace = ds_read_bts(child->bts);
672 if (error < 0) 588 if (!trace)
673 return error; 589 return -EPERM;
674
675 if (bts_end <= index)
676 return -EINVAL;
677 590
678 error = ds_get_bts_index(child, &bts_index); 591 at = trace->ds.top - ((index + 1) * trace->ds.size);
679 if (error < 0) 592 if ((void *)at < trace->ds.begin)
680 return error; 593 at += (trace->ds.n * trace->ds.size);
681 594
682 /* translate the ptrace bts index into the ds bts index */ 595 if (!trace->read)
683 bts_index += bts_end - (index + 1); 596 return -EOPNOTSUPP;
684 if (bts_end <= bts_index)
685 bts_index -= bts_end;
686 597
687 error = ds_access_bts(child, bts_index, &bts_record); 598 error = trace->read(child->bts, at, &bts);
688 if (error < 0) 599 if (error < 0)
689 return error; 600 return error;
690 601
691 ptrace_bts_translate_record(&ret, bts_record); 602 if (copy_to_user(out, &bts, sizeof(bts)))
692
693 if (copy_to_user(out, &ret, sizeof(ret)))
694 return -EFAULT; 603 return -EFAULT;
695 604
696 return sizeof(ret); 605 return sizeof(bts);
697} 606}
698 607
699static int ptrace_bts_drain(struct task_struct *child, 608static int ptrace_bts_drain(struct task_struct *child,
700 long size, 609 long size,
701 struct bts_struct __user *out) 610 struct bts_struct __user *out)
702{ 611{
703 struct bts_struct ret; 612 const struct bts_trace *trace;
704 const unsigned char *raw; 613 const unsigned char *at;
705 size_t end, i; 614 int error, drained = 0;
706 int error;
707 615
708 error = ds_get_bts_index(child, &end); 616 trace = ds_read_bts(child->bts);
709 if (error < 0) 617 if (!trace)
710 return error; 618 return -EPERM;
619
620 if (!trace->read)
621 return -EOPNOTSUPP;
711 622
712 if (size < (end * sizeof(struct bts_struct))) 623 if (size < (trace->ds.top - trace->ds.begin))
713 return -EIO; 624 return -EIO;
714 625
715 error = ds_access_bts(child, 0, (const void **)&raw); 626 for (at = trace->ds.begin; (void *)at < trace->ds.top;
716 if (error < 0) 627 out++, drained++, at += trace->ds.size) {
717 return error; 628 struct bts_struct bts;
629 int error;
718 630
719 for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { 631 error = trace->read(child->bts, at, &bts);
720 ptrace_bts_translate_record(&ret, raw); 632 if (error < 0)
633 return error;
721 634
722 if (copy_to_user(out, &ret, sizeof(ret))) 635 if (copy_to_user(out, &bts, sizeof(bts)))
723 return -EFAULT; 636 return -EFAULT;
724 } 637 }
725 638
726 error = ds_clear_bts(child); 639 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
640
641 error = ds_reset_bts(child->bts);
727 if (error < 0) 642 if (error < 0)
728 return error; 643 return error;
729 644
730 return end; 645 return drained;
646}
647
648static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
649{
650 child->bts_buffer = alloc_locked_buffer(size);
651 if (!child->bts_buffer)
652 return -ENOMEM;
653
654 child->bts_size = size;
655
656 return 0;
731} 657}
732 658
733static void ptrace_bts_ovfl(struct task_struct *child) 659static void ptrace_bts_free_buffer(struct task_struct *child)
734{ 660{
735 send_sig(child->thread.bts_ovfl_signal, child, 0); 661 free_locked_buffer(child->bts_buffer, child->bts_size);
662 child->bts_buffer = NULL;
663 child->bts_size = 0;
736} 664}
737 665
738static int ptrace_bts_config(struct task_struct *child, 666static int ptrace_bts_config(struct task_struct *child,
@@ -740,114 +668,86 @@ static int ptrace_bts_config(struct task_struct *child,
740 const struct ptrace_bts_config __user *ucfg) 668 const struct ptrace_bts_config __user *ucfg)
741{ 669{
742 struct ptrace_bts_config cfg; 670 struct ptrace_bts_config cfg;
743 int error = 0; 671 unsigned int flags = 0;
744 672
745 error = -EOPNOTSUPP;
746 if (!bts_cfg.sizeof_bts)
747 goto errout;
748
749 error = -EIO;
750 if (cfg_size < sizeof(cfg)) 673 if (cfg_size < sizeof(cfg))
751 goto errout; 674 return -EIO;
752 675
753 error = -EFAULT;
754 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 676 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
755 goto errout; 677 return -EFAULT;
756 678
757 error = -EINVAL; 679 if (child->bts) {
758 if ((cfg.flags & PTRACE_BTS_O_SIGNAL) && 680 ds_release_bts(child->bts);
759 !(cfg.flags & PTRACE_BTS_O_ALLOC)) 681 child->bts = NULL;
760 goto errout; 682 }
761 683
762 if (cfg.flags & PTRACE_BTS_O_ALLOC) { 684 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
763 ds_ovfl_callback_t ovfl = NULL; 685 if (!cfg.signal)
764 unsigned int sig = 0; 686 return -EINVAL;
765 687
766 /* we ignore the error in case we were not tracing child */ 688 return -EOPNOTSUPP;
767 (void)ds_release_bts(child);
768 689
769 if (cfg.flags & PTRACE_BTS_O_SIGNAL) { 690 child->thread.bts_ovfl_signal = cfg.signal;
770 if (!cfg.signal) 691 }
771 goto errout;
772 692
773 sig = cfg.signal; 693 if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
774 ovfl = ptrace_bts_ovfl; 694 (cfg.size != child->bts_size)) {
775 } 695 int error;
776 696
777 error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); 697 ptrace_bts_free_buffer(child);
778 if (error < 0)
779 goto errout;
780 698
781 child->thread.bts_ovfl_signal = sig; 699 error = ptrace_bts_allocate_buffer(child, cfg.size);
700 if (error < 0)
701 return error;
782 } 702 }
783 703
784 error = -EINVAL;
785 if (!child->thread.ds_ctx && cfg.flags)
786 goto errout;
787
788 if (cfg.flags & PTRACE_BTS_O_TRACE) 704 if (cfg.flags & PTRACE_BTS_O_TRACE)
789 child->thread.debugctlmsr |= bts_cfg.debugctl_mask; 705 flags |= BTS_USER;
790 else
791 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
792 706
793 if (cfg.flags & PTRACE_BTS_O_SCHED) 707 if (cfg.flags & PTRACE_BTS_O_SCHED)
794 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 708 flags |= BTS_TIMESTAMPS;
795 else
796 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
797 709
798 error = sizeof(cfg); 710 child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
711 /* ovfl = */ NULL, /* th = */ (size_t)-1,
712 flags);
713 if (IS_ERR(child->bts)) {
714 int error = PTR_ERR(child->bts);
799 715
800out: 716 ptrace_bts_free_buffer(child);
801 if (child->thread.debugctlmsr) 717 child->bts = NULL;
802 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
803 else
804 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
805 718
806 return error; 719 return error;
720 }
807 721
808errout: 722 return sizeof(cfg);
809 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
810 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
811 goto out;
812} 723}
813 724
814static int ptrace_bts_status(struct task_struct *child, 725static int ptrace_bts_status(struct task_struct *child,
815 long cfg_size, 726 long cfg_size,
816 struct ptrace_bts_config __user *ucfg) 727 struct ptrace_bts_config __user *ucfg)
817{ 728{
729 const struct bts_trace *trace;
818 struct ptrace_bts_config cfg; 730 struct ptrace_bts_config cfg;
819 size_t end;
820 const void *base, *max;
821 int error;
822 731
823 if (cfg_size < sizeof(cfg)) 732 if (cfg_size < sizeof(cfg))
824 return -EIO; 733 return -EIO;
825 734
826 error = ds_get_bts_end(child, &end); 735 trace = ds_read_bts(child->bts);
827 if (error < 0) 736 if (!trace)
828 return error; 737 return -EPERM;
829
830 error = ds_access_bts(child, /* index = */ 0, &base);
831 if (error < 0)
832 return error;
833
834 error = ds_access_bts(child, /* index = */ end, &max);
835 if (error < 0)
836 return error;
837 738
838 memset(&cfg, 0, sizeof(cfg)); 739 memset(&cfg, 0, sizeof(cfg));
839 cfg.size = (max - base); 740 cfg.size = trace->ds.end - trace->ds.begin;
840 cfg.signal = child->thread.bts_ovfl_signal; 741 cfg.signal = child->thread.bts_ovfl_signal;
841 cfg.bts_size = sizeof(struct bts_struct); 742 cfg.bts_size = sizeof(struct bts_struct);
842 743
843 if (cfg.signal) 744 if (cfg.signal)
844 cfg.flags |= PTRACE_BTS_O_SIGNAL; 745 cfg.flags |= PTRACE_BTS_O_SIGNAL;
845 746
846 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && 747 if (trace->ds.flags & BTS_USER)
847 child->thread.debugctlmsr & bts_cfg.debugctl_mask)
848 cfg.flags |= PTRACE_BTS_O_TRACE; 748 cfg.flags |= PTRACE_BTS_O_TRACE;
849 749
850 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) 750 if (trace->ds.flags & BTS_TIMESTAMPS)
851 cfg.flags |= PTRACE_BTS_O_SCHED; 751 cfg.flags |= PTRACE_BTS_O_SCHED;
852 752
853 if (copy_to_user(ucfg, &cfg, sizeof(cfg))) 753 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
@@ -856,110 +756,77 @@ static int ptrace_bts_status(struct task_struct *child,
856 return sizeof(cfg); 756 return sizeof(cfg);
857} 757}
858 758
859static int ptrace_bts_write_record(struct task_struct *child, 759static int ptrace_bts_clear(struct task_struct *child)
860 const struct bts_struct *in)
861{ 760{
862 unsigned char bts_record[BTS_MAX_RECORD_SIZE]; 761 const struct bts_trace *trace;
863 762
864 BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts); 763 trace = ds_read_bts(child->bts);
764 if (!trace)
765 return -EPERM;
865 766
866 memset(bts_record, 0, bts_cfg.sizeof_bts); 767 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
867 switch (in->qualifier) {
868 case BTS_INVALID:
869 break;
870 768
871 case BTS_BRANCH: 769 return ds_reset_bts(child->bts);
872 bts_set(bts_record, bts_from, in->variant.lbr.from_ip); 770}
873 bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
874 break;
875 771
876 case BTS_TASK_ARRIVES: 772static int ptrace_bts_size(struct task_struct *child)
877 case BTS_TASK_DEPARTS: 773{
878 bts_set(bts_record, bts_from, bts_escape); 774 const struct bts_trace *trace;
879 bts_set(bts_record, bts_qual, in->qualifier);
880 bts_set(bts_record, bts_jiffies, in->variant.jiffies);
881 break;
882 775
883 default: 776 trace = ds_read_bts(child->bts);
884 return -EINVAL; 777 if (!trace)
885 } 778 return -EPERM;
886 779
887 /* The writing task will be the switched-to task on a context 780 return (trace->ds.top - trace->ds.begin) / trace->ds.size;
888 * switch. It needs to write into the switched-from task's BTS
889 * buffer. */
890 return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
891} 781}
892 782
893void ptrace_bts_take_timestamp(struct task_struct *tsk, 783static void ptrace_bts_fork(struct task_struct *tsk)
894 enum bts_qualifier qualifier)
895{ 784{
896 struct bts_struct rec = { 785 tsk->bts = NULL;
897 .qualifier = qualifier, 786 tsk->bts_buffer = NULL;
898 .variant.jiffies = jiffies_64 787 tsk->bts_size = 0;
899 }; 788 tsk->thread.bts_ovfl_signal = 0;
900
901 ptrace_bts_write_record(tsk, &rec);
902} 789}
903 790
904static const struct bts_configuration bts_cfg_netburst = { 791static void ptrace_bts_untrace(struct task_struct *child)
905 .sizeof_bts = sizeof(long) * 3, 792{
906 .sizeof_field = sizeof(long), 793 if (unlikely(child->bts)) {
907 .debugctl_mask = (1<<2)|(1<<3)|(1<<5) 794 ds_release_bts(child->bts);
908}; 795 child->bts = NULL;
796
797 /* We cannot update total_vm and locked_vm since
798 child's mm is already gone. But we can reclaim the
799 memory. */
800 kfree(child->bts_buffer);
801 child->bts_buffer = NULL;
802 child->bts_size = 0;
803 }
804}
909 805
910static const struct bts_configuration bts_cfg_pentium_m = { 806static void ptrace_bts_detach(struct task_struct *child)
911 .sizeof_bts = sizeof(long) * 3, 807{
912 .sizeof_field = sizeof(long), 808 if (unlikely(child->bts)) {
913 .debugctl_mask = (1<<6)|(1<<7) 809 ds_release_bts(child->bts);
914}; 810 child->bts = NULL;
915 811
916static const struct bts_configuration bts_cfg_core2 = { 812 ptrace_bts_free_buffer(child);
917 .sizeof_bts = 8 * 3, 813 }
918 .sizeof_field = 8, 814}
919 .debugctl_mask = (1<<6)|(1<<7)|(1<<9) 815#else
920}; 816static inline void ptrace_bts_fork(struct task_struct *tsk) {}
817static inline void ptrace_bts_detach(struct task_struct *child) {}
818static inline void ptrace_bts_untrace(struct task_struct *child) {}
819#endif /* CONFIG_X86_PTRACE_BTS */
921 820
922static inline void bts_configure(const struct bts_configuration *cfg) 821void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
923{ 822{
924 bts_cfg = *cfg; 823 ptrace_bts_fork(child);
925} 824}
926 825
927void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) 826void x86_ptrace_untrace(struct task_struct *child)
928{ 827{
929 switch (c->x86) { 828 ptrace_bts_untrace(child);
930 case 0x6:
931 switch (c->x86_model) {
932 case 0xD:
933 case 0xE: /* Pentium M */
934 bts_configure(&bts_cfg_pentium_m);
935 break;
936 case 0xF: /* Core2 */
937 case 0x1C: /* Atom */
938 bts_configure(&bts_cfg_core2);
939 break;
940 default:
941 /* sorry, don't know about them */
942 break;
943 }
944 break;
945 case 0xF:
946 switch (c->x86_model) {
947 case 0x0:
948 case 0x1:
949 case 0x2: /* Netburst */
950 bts_configure(&bts_cfg_netburst);
951 break;
952 default:
953 /* sorry, don't know about them */
954 break;
955 }
956 break;
957 default:
958 /* sorry, don't know about them */
959 break;
960 }
961} 829}
962#endif /* CONFIG_X86_PTRACE_BTS */
963 830
964/* 831/*
965 * Called by kernel/ptrace.c when detaching.. 832 * Called by kernel/ptrace.c when detaching..
@@ -972,15 +839,7 @@ void ptrace_disable(struct task_struct *child)
972#ifdef TIF_SYSCALL_EMU 839#ifdef TIF_SYSCALL_EMU
973 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 840 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
974#endif 841#endif
975#ifdef CONFIG_X86_PTRACE_BTS 842 ptrace_bts_detach(child);
976 (void)ds_release_bts(child);
977
978 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
979 if (!child->thread.debugctlmsr)
980 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
981
982 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
983#endif /* CONFIG_X86_PTRACE_BTS */
984} 843}
985 844
986#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 845#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1112,7 +971,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1112 break; 971 break;
1113 972
1114 case PTRACE_BTS_SIZE: 973 case PTRACE_BTS_SIZE:
1115 ret = ds_get_bts_index(child, /* pos = */ NULL); 974 ret = ptrace_bts_size(child);
1116 break; 975 break;
1117 976
1118 case PTRACE_BTS_GET: 977 case PTRACE_BTS_GET:
@@ -1121,7 +980,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1121 break; 980 break;
1122 981
1123 case PTRACE_BTS_CLEAR: 982 case PTRACE_BTS_CLEAR:
1124 ret = ds_clear_bts(child); 983 ret = ptrace_bts_clear(child);
1125 break; 984 break;
1126 985
1127 case PTRACE_BTS_DRAIN: 986 case PTRACE_BTS_DRAIN:
@@ -1384,6 +1243,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1384 1243
1385 case PTRACE_GET_THREAD_AREA: 1244 case PTRACE_GET_THREAD_AREA:
1386 case PTRACE_SET_THREAD_AREA: 1245 case PTRACE_SET_THREAD_AREA:
1246#ifdef CONFIG_X86_PTRACE_BTS
1247 case PTRACE_BTS_CONFIG:
1248 case PTRACE_BTS_STATUS:
1249 case PTRACE_BTS_SIZE:
1250 case PTRACE_BTS_GET:
1251 case PTRACE_BTS_CLEAR:
1252 case PTRACE_BTS_DRAIN:
1253#endif /* CONFIG_X86_PTRACE_BTS */
1387 return arch_ptrace(child, request, addr, data); 1254 return arch_ptrace(child, request, addr, data);
1388 1255
1389 default: 1256 default:
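For context on the interface all of these BTS hunks serve: a rough userspace sketch of driving the requests, assuming the PTRACE_BTS_* constants and struct ptrace_bts_config from asm/ptrace-abi.h. The argument placement (pointer in addr, size in data) follows arch_ptrace() here; error handling and the request-constant cast glibc may want are omitted.

#include <sys/ptrace.h>
#include <sys/types.h>
#include <asm/ptrace-abi.h>     /* PTRACE_BTS_*, struct ptrace_bts_config */

static long demo_bts_setup(pid_t child)
{
        struct ptrace_bts_config cfg = {
                .size  = 64 * 1024,     /* bytes of branch-trace buffer */
                .flags = PTRACE_BTS_O_ALLOC | PTRACE_BTS_O_TRACE,
        };

        /* configure and start branch tracing for the stopped child */
        ptrace(PTRACE_BTS_CONFIG, child, &cfg, (void *)sizeof(cfg));

        /* ... PTRACE_CONT, let the child run, stop it again ... */

        /* number of bts_struct records currently buffered */
        return ptrace(PTRACE_BTS_SIZE, child, NULL, NULL);
}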
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 67465ed89310..309949e9e1c1 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -168,6 +168,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31,
168 ich_force_enable_hpet); 168 ich_force_enable_hpet);
169DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, 169DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
170 ich_force_enable_hpet); 170 ich_force_enable_hpet);
171DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4,
172 ich_force_enable_hpet);
171DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, 173DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
172 ich_force_enable_hpet); 174 ich_force_enable_hpet);
173 175
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c3cd512484e5..32e8f0af292c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,9 @@
12#include <asm/proto.h> 12#include <asm/proto.h>
13#include <asm/reboot_fixups.h> 13#include <asm/reboot_fixups.h>
14#include <asm/reboot.h> 14#include <asm/reboot.h>
15#include <asm/pci_x86.h>
16#include <asm/virtext.h>
17#include <asm/cpu.h>
15 18
16#ifdef CONFIG_X86_32 19#ifdef CONFIG_X86_32
17# include <linux/dmi.h> 20# include <linux/dmi.h>
@@ -21,8 +24,7 @@
21# include <asm/iommu.h> 24# include <asm/iommu.h>
22#endif 25#endif
23 26
24#include <mach_ipi.h> 27#include <asm/genapic.h>
25
26 28
27/* 29/*
28 * Power off function, if any 30 * Power off function, if any
@@ -39,7 +41,16 @@ int reboot_force;
39static int reboot_cpu = -1; 41static int reboot_cpu = -1;
40#endif 42#endif
41 43
42/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] 44/* This is set if we need to go through the 'emergency' path.
 45 * When machine_emergency_restart() is called, we may be in
46 * an inconsistent state and won't be able to do a clean cleanup
47 */
48static int reboot_emergency;
49
50/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
51bool port_cf9_safe = false;
52
53/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
43 warm Don't set the cold reboot flag 54 warm Don't set the cold reboot flag
44 cold Set the cold reboot flag 55 cold Set the cold reboot flag
45 bios Reboot by jumping through the BIOS (only for X86_32) 56 bios Reboot by jumping through the BIOS (only for X86_32)
@@ -48,6 +59,7 @@ static int reboot_cpu = -1;
48 kbd Use the keyboard controller. cold reset (default) 59 kbd Use the keyboard controller. cold reset (default)
49 acpi Use the RESET_REG in the FADT 60 acpi Use the RESET_REG in the FADT
50 efi Use efi reset_system runtime service 61 efi Use efi reset_system runtime service
62 pci Use the so-called "PCI reset register", CF9
51 force Avoid anything that could hang. 63 force Avoid anything that could hang.
52 */ 64 */
53static int __init reboot_setup(char *str) 65static int __init reboot_setup(char *str)
@@ -82,6 +94,7 @@ static int __init reboot_setup(char *str)
82 case 'k': 94 case 'k':
83 case 't': 95 case 't':
84 case 'e': 96 case 'e':
97 case 'p':
85 reboot_type = *str; 98 reboot_type = *str;
86 break; 99 break;
87 100
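Usage note (not part of the patch): as with the other options, the new keyword is matched by its first letter, so booting with, for example, reboot=p or reboot=p,cold selects the CF9 "PCI reset register" path handled further down in native_machine_emergency_restart().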
@@ -172,6 +185,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
172 DMI_MATCH(DMI_BOARD_NAME, "0KW626"), 185 DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
173 }, 186 },
174 }, 187 },
 188 { /* Handle problems with rebooting on Dell OptiPlex 330 with 0KP561 */
189 .callback = set_bios_reboot,
190 .ident = "Dell OptiPlex 330",
191 .matches = {
192 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
193 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"),
194 DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
195 },
196 },
175 { /* Handle problems with rebooting on Dell 2400's */ 197 { /* Handle problems with rebooting on Dell 2400's */
176 .callback = set_bios_reboot, 198 .callback = set_bios_reboot,
177 .ident = "Dell PowerEdge 2400", 199 .ident = "Dell PowerEdge 2400",
@@ -354,6 +376,48 @@ static inline void kb_wait(void)
354 } 376 }
355} 377}
356 378
379static void vmxoff_nmi(int cpu, struct die_args *args)
380{
381 cpu_emergency_vmxoff();
382}
383
384/* Use NMIs as IPIs to tell all CPUs to disable virtualization
385 */
386static void emergency_vmx_disable_all(void)
387{
388 /* Just make sure we won't change CPUs while doing this */
389 local_irq_disable();
390
391 /* We need to disable VMX on all CPUs before rebooting, otherwise
 392 * we risk hanging up the machine, because the CPU ignores INIT
393 * signals when VMX is enabled.
394 *
 395 * We can't take any locks and we may be in an inconsistent
396 * state, so we use NMIs as IPIs to tell the other CPUs to disable
397 * VMX and halt.
398 *
399 * For safety, we will avoid running the nmi_shootdown_cpus()
400 * stuff unnecessarily, but we don't have a way to check
401 * if other CPUs have VMX enabled. So we will call it only if the
402 * CPU we are running on has VMX enabled.
403 *
404 * We will miss cases where VMX is not enabled on all CPUs. This
 405 * shouldn't do much harm because KVM always enables VMX on all
 406 * CPUs anyway. But we can miss it in the small window where KVM
407 * is still enabling VMX.
408 */
409 if (cpu_has_vmx() && cpu_vmx_enabled()) {
410 /* Disable VMX on this CPU.
411 */
412 cpu_vmxoff();
413
414 /* Halt and disable VMX on the other CPUs */
415 nmi_shootdown_cpus(vmxoff_nmi);
416
417 }
418}
419
420
357void __attribute__((weak)) mach_reboot_fixups(void) 421void __attribute__((weak)) mach_reboot_fixups(void)
358{ 422{
359} 423}
@@ -362,6 +426,9 @@ static void native_machine_emergency_restart(void)
362{ 426{
363 int i; 427 int i;
364 428
429 if (reboot_emergency)
430 emergency_vmx_disable_all();
431
365 /* Tell the BIOS if we want cold or warm reboot */ 432 /* Tell the BIOS if we want cold or warm reboot */
366 *((unsigned short *)__va(0x472)) = reboot_mode; 433 *((unsigned short *)__va(0x472)) = reboot_mode;
367 434
@@ -398,12 +465,27 @@ static void native_machine_emergency_restart(void)
398 reboot_type = BOOT_KBD; 465 reboot_type = BOOT_KBD;
399 break; 466 break;
400 467
401
402 case BOOT_EFI: 468 case BOOT_EFI:
403 if (efi_enabled) 469 if (efi_enabled)
404 efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD, 470 efi.reset_system(reboot_mode ?
471 EFI_RESET_WARM :
472 EFI_RESET_COLD,
405 EFI_SUCCESS, 0, NULL); 473 EFI_SUCCESS, 0, NULL);
474 reboot_type = BOOT_KBD;
475 break;
476
477 case BOOT_CF9:
478 port_cf9_safe = true;
479 /* fall through */
406 480
481 case BOOT_CF9_COND:
482 if (port_cf9_safe) {
483 u8 cf9 = inb(0xcf9) & ~6;
484 outb(cf9|2, 0xcf9); /* Request hard reset */
485 udelay(50);
486 outb(cf9|6, 0xcf9); /* Actually do the reset */
487 udelay(50);
488 }
407 reboot_type = BOOT_KBD; 489 reboot_type = BOOT_KBD;
408 break; 490 break;
409 } 491 }
@@ -420,7 +502,7 @@ void native_machine_shutdown(void)
420 502
421#ifdef CONFIG_X86_32 503#ifdef CONFIG_X86_32
422 /* See if there has been given a command line override */ 504 /* See if there has been given a command line override */
423 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && 505 if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) &&
424 cpu_online(reboot_cpu)) 506 cpu_online(reboot_cpu))
425 reboot_cpu_id = reboot_cpu; 507 reboot_cpu_id = reboot_cpu;
426#endif 508#endif
@@ -430,7 +512,7 @@ void native_machine_shutdown(void)
430 reboot_cpu_id = smp_processor_id(); 512 reboot_cpu_id = smp_processor_id();
431 513
432 /* Make certain I only run on the appropriate processor */ 514 /* Make certain I only run on the appropriate processor */
433 set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id)); 515 set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
434 516
435 /* O.K Now that I'm on the appropriate processor, 517 /* O.K Now that I'm on the appropriate processor,
436 * stop all of the others. 518 * stop all of the others.
@@ -453,17 +535,28 @@ void native_machine_shutdown(void)
453#endif 535#endif
454} 536}
455 537
538static void __machine_emergency_restart(int emergency)
539{
540 reboot_emergency = emergency;
541 machine_ops.emergency_restart();
542}
543
456static void native_machine_restart(char *__unused) 544static void native_machine_restart(char *__unused)
457{ 545{
458 printk("machine restart\n"); 546 printk("machine restart\n");
459 547
460 if (!reboot_force) 548 if (!reboot_force)
461 machine_shutdown(); 549 machine_shutdown();
462 machine_emergency_restart(); 550 __machine_emergency_restart(0);
463} 551}
464 552
465static void native_machine_halt(void) 553static void native_machine_halt(void)
466{ 554{
555 /* stop other cpus and apics */
556 machine_shutdown();
557
558 /* stop this cpu */
559 stop_this_cpu(NULL);
467} 560}
468 561
469static void native_machine_power_off(void) 562static void native_machine_power_off(void)
@@ -498,7 +591,7 @@ void machine_shutdown(void)
498 591
499void machine_emergency_restart(void) 592void machine_emergency_restart(void)
500{ 593{
501 machine_ops.emergency_restart(); 594 __machine_emergency_restart(1);
502} 595}
503 596
504void machine_restart(char *cmd) 597void machine_restart(char *cmd)
@@ -558,10 +651,7 @@ static int crash_nmi_callback(struct notifier_block *self,
558 651
559static void smp_send_nmi_allbutself(void) 652static void smp_send_nmi_allbutself(void)
560{ 653{
561 cpumask_t mask = cpu_online_map; 654 apic->send_IPI_allbutself(NMI_VECTOR);
562 cpu_clear(safe_smp_processor_id(), mask);
563 if (!cpus_empty(mask))
564 send_IPI_mask(mask, NMI_VECTOR);
565} 655}
566 656
567static struct notifier_block crash_nmi_nb = { 657static struct notifier_block crash_nmi_nb = {
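nmi_shootdown_cpus(), which the emergency VMX-disable path above relies on, takes a callback that is run on every other CPU from NMI context. A hedged sketch of another, purely hypothetical user mirroring vmxoff_nmi():

#include <linux/irqflags.h>
#include <linux/kdebug.h>       /* struct die_args */
#include <asm/reboot.h>         /* nmi_shootdown_cpus() */

/* Runs on each remote CPU in NMI context: no locks, no sleeping. */
static void demo_quiesce_nmi(int cpu, struct die_args *args)
{
        /* hypothetical per-CPU quiesce work would go here */
}

static void demo_quiesce_others(void)
{
        local_irq_disable();                    /* stay on this CPU */
        nmi_shootdown_cpus(demo_quiesce_nmi);   /* NMI and halt the rest */
}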
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0fa6790c1dd3..8fce6c714514 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -81,7 +81,7 @@
81#include <asm/io_apic.h> 81#include <asm/io_apic.h>
82#include <asm/ist.h> 82#include <asm/ist.h>
83#include <asm/vmi.h> 83#include <asm/vmi.h>
84#include <setup_arch.h> 84#include <asm/setup_arch.h>
85#include <asm/bios_ebda.h> 85#include <asm/bios_ebda.h>
86#include <asm/cacheflush.h> 86#include <asm/cacheflush.h>
87#include <asm/processor.h> 87#include <asm/processor.h>
@@ -89,15 +89,17 @@
89 89
90#include <asm/system.h> 90#include <asm/system.h>
91#include <asm/vsyscall.h> 91#include <asm/vsyscall.h>
92#include <asm/smp.h> 92#include <asm/cpu.h>
93#include <asm/desc.h> 93#include <asm/desc.h>
94#include <asm/dma.h> 94#include <asm/dma.h>
95#include <asm/iommu.h> 95#include <asm/iommu.h>
96#include <asm/gart.h>
96#include <asm/mmu_context.h> 97#include <asm/mmu_context.h>
97#include <asm/proto.h> 98#include <asm/proto.h>
98 99
99#include <mach_apic.h> 100#include <asm/genapic.h>
100#include <asm/paravirt.h> 101#include <asm/paravirt.h>
102#include <asm/hypervisor.h>
101 103
102#include <asm/percpu.h> 104#include <asm/percpu.h>
103#include <asm/topology.h> 105#include <asm/topology.h>
@@ -110,6 +112,20 @@
110#define ARCH_SETUP 112#define ARCH_SETUP
111#endif 113#endif
112 114
115unsigned int boot_cpu_id __read_mostly;
116
117#ifdef CONFIG_X86_64
118int default_cpu_present_to_apicid(int mps_cpu)
119{
120 return __default_cpu_present_to_apicid(mps_cpu);
121}
122
123int default_check_phys_apicid_present(int boot_cpu_physical_apicid)
124{
125 return __default_check_phys_apicid_present(boot_cpu_physical_apicid);
126}
127#endif
128
113#ifndef CONFIG_DEBUG_BOOT_PARAMS 129#ifndef CONFIG_DEBUG_BOOT_PARAMS
114struct boot_params __initdata boot_params; 130struct boot_params __initdata boot_params;
115#else 131#else
@@ -448,6 +464,7 @@ static void __init reserve_early_setup_data(void)
448 * @size: Size of the crashkernel memory to reserve. 464 * @size: Size of the crashkernel memory to reserve.
449 * Returns the base address on success, and -1ULL on failure. 465 * Returns the base address on success, and -1ULL on failure.
450 */ 466 */
467static
451unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) 468unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
452{ 469{
453 const unsigned long long alignment = 16<<20; /* 16M */ 470 const unsigned long long alignment = 16<<20; /* 16M */
@@ -583,165 +600,27 @@ static int __init setup_elfcorehdr(char *arg)
583early_param("elfcorehdr", setup_elfcorehdr); 600early_param("elfcorehdr", setup_elfcorehdr);
584#endif 601#endif
585 602
586static struct x86_quirks default_x86_quirks __initdata; 603static int __init default_update_genapic(void)
587
588struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
589
590/*
591 * Some BIOSes seem to corrupt the low 64k of memory during events
592 * like suspend/resume and unplugging an HDMI cable. Reserve all
593 * remaining free memory in that area and fill it with a distinct
594 * pattern.
595 */
596#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
597#define MAX_SCAN_AREAS 8
598
599static int __read_mostly memory_corruption_check = -1;
600
601static unsigned __read_mostly corruption_check_size = 64*1024;
602static unsigned __read_mostly corruption_check_period = 60; /* seconds */
603
604static struct e820entry scan_areas[MAX_SCAN_AREAS];
605static int num_scan_areas;
606
607
608static int set_corruption_check(char *arg)
609{ 604{
610 char *end; 605#ifdef CONFIG_SMP
611 606 if (!apic->wakeup_cpu)
612 memory_corruption_check = simple_strtol(arg, &end, 10); 607 apic->wakeup_cpu = wakeup_secondary_cpu_via_init;
613
614 return (*end == 0) ? 0 : -EINVAL;
615}
616early_param("memory_corruption_check", set_corruption_check);
617
618static int set_corruption_check_period(char *arg)
619{
620 char *end;
621
622 corruption_check_period = simple_strtoul(arg, &end, 10);
623
624 return (*end == 0) ? 0 : -EINVAL;
625}
626early_param("memory_corruption_check_period", set_corruption_check_period);
627
628static int set_corruption_check_size(char *arg)
629{
630 char *end;
631 unsigned size;
632
633 size = memparse(arg, &end);
634
635 if (*end == '\0')
636 corruption_check_size = size;
637
638 return (size == corruption_check_size) ? 0 : -EINVAL;
639}
640early_param("memory_corruption_check_size", set_corruption_check_size);
641
642
643static void __init setup_bios_corruption_check(void)
644{
645 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
646
647 if (memory_corruption_check == -1) {
648 memory_corruption_check =
649#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
650 1
651#else
652 0
653#endif 608#endif
654 ;
655 }
656
657 if (corruption_check_size == 0)
658 memory_corruption_check = 0;
659
660 if (!memory_corruption_check)
661 return;
662
663 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
664 609
665 while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { 610 return 0;
666 u64 size;
667 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
668
669 if (addr == 0)
670 break;
671
672 if ((addr + size) > corruption_check_size)
673 size = corruption_check_size - addr;
674
675 if (size == 0)
676 break;
677
678 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
679 scan_areas[num_scan_areas].addr = addr;
680 scan_areas[num_scan_areas].size = size;
681 num_scan_areas++;
682
683 /* Assume we've already mapped this early memory */
684 memset(__va(addr), 0, size);
685
686 addr += size;
687 }
688
689 printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
690 num_scan_areas);
691 update_e820();
692}
693
694static struct timer_list periodic_check_timer;
695
696void check_for_bios_corruption(void)
697{
698 int i;
699 int corruption = 0;
700
701 if (!memory_corruption_check)
702 return;
703
704 for(i = 0; i < num_scan_areas; i++) {
705 unsigned long *addr = __va(scan_areas[i].addr);
706 unsigned long size = scan_areas[i].size;
707
708 for(; size; addr++, size -= sizeof(unsigned long)) {
709 if (!*addr)
710 continue;
711 printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
712 addr, __pa(addr), *addr);
713 corruption = 1;
714 *addr = 0;
715 }
716 }
717
718 WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
719}
720
721static void periodic_check_for_corruption(unsigned long data)
722{
723 check_for_bios_corruption();
724 mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
725} 611}
726 612
727void start_periodic_check_for_corruption(void) 613static struct x86_quirks default_x86_quirks __initdata = {
728{ 614 .update_genapic = default_update_genapic,
729 if (!memory_corruption_check || corruption_check_period == 0) 615};
730 return;
731
732 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
733 corruption_check_period);
734 616
735 init_timer(&periodic_check_timer); 617struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
736 periodic_check_timer.function = &periodic_check_for_corruption;
737 periodic_check_for_corruption(0);
738}
739#endif
740 618
619#ifdef CONFIG_X86_RESERVE_LOW_64K
741static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) 620static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
742{ 621{
743 printk(KERN_NOTICE 622 printk(KERN_NOTICE
744 "%s detected: BIOS may corrupt low RAM, working it around.\n", 623 "%s detected: BIOS may corrupt low RAM, working around it.\n",
745 d->ident); 624 d->ident);
746 625
747 e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED); 626 e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
@@ -749,6 +628,7 @@ static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
749 628
750 return 0; 629 return 0;
751} 630}
631#endif
752 632
753/* List of systems that have known low memory corruption BIOS problems */ 633/* List of systems that have known low memory corruption BIOS problems */
754static struct dmi_system_id __initdata bad_bios_dmi_table[] = { 634static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
@@ -764,7 +644,7 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
764 .callback = dmi_low_memory_corruption, 644 .callback = dmi_low_memory_corruption,
765 .ident = "Phoenix BIOS", 645 .ident = "Phoenix BIOS",
766 .matches = { 646 .matches = {
767 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"), 647 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
768 }, 648 },
769 }, 649 },
770#endif 650#endif
@@ -794,6 +674,9 @@ void __init setup_arch(char **cmdline_p)
794 printk(KERN_INFO "Command line: %s\n", boot_command_line); 674 printk(KERN_INFO "Command line: %s\n", boot_command_line);
795#endif 675#endif
796 676
677 /* VMI may relocate the fixmap; do this before touching ioremap area */
678 vmi_init();
679
797 early_cpu_init(); 680 early_cpu_init();
798 early_ioremap_init(); 681 early_ioremap_init();
799 682
@@ -880,13 +763,8 @@ void __init setup_arch(char **cmdline_p)
880 check_efer(); 763 check_efer();
881#endif 764#endif
882 765
883#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) 766 /* Must be before kernel pagetables are setup */
884 /* 767 vmi_activate();
885 * Must be before kernel pagetables are setup
886 * or fixmap area is touched.
887 */
888 vmi_init();
889#endif
890 768
891 /* after early param, so could get panic from serial */ 769 /* after early param, so could get panic from serial */
892 reserve_early_setup_data(); 770 reserve_early_setup_data();
@@ -909,6 +787,12 @@ void __init setup_arch(char **cmdline_p)
909 787
910 dmi_check_system(bad_bios_dmi_table); 788 dmi_check_system(bad_bios_dmi_table);
911 789
790 /*
791 * VMware detection requires dmi to be available, so this
 792 * needs to be done after dmi_scan_machine, for the boot processor (BP).
793 */
794 init_hypervisor(&boot_cpu_data);
795
912#ifdef CONFIG_X86_32 796#ifdef CONFIG_X86_32
913 probe_roms(); 797 probe_roms();
914#endif 798#endif
@@ -1021,12 +905,11 @@ void __init setup_arch(char **cmdline_p)
1021 */ 905 */
1022 acpi_reserve_bootmem(); 906 acpi_reserve_bootmem();
1023#endif 907#endif
1024#ifdef CONFIG_X86_FIND_SMP_CONFIG
1025 /* 908 /*
1026 * Find and reserve possible boot-time SMP configuration: 909 * Find and reserve possible boot-time SMP configuration:
1027 */ 910 */
1028 find_smp_config(); 911 find_smp_config();
1029#endif 912
1030 reserve_crashkernel(); 913 reserve_crashkernel();
1031 914
1032#ifdef CONFIG_X86_64 915#ifdef CONFIG_X86_64
@@ -1053,9 +936,7 @@ void __init setup_arch(char **cmdline_p)
1053 map_vsyscall(); 936 map_vsyscall();
1054#endif 937#endif
1055 938
1056#ifdef CONFIG_X86_GENERICARCH
1057 generic_apic_probe(); 939 generic_apic_probe();
1058#endif
1059 940
1060 early_quirks(); 941 early_quirks();
1061 942
@@ -1082,7 +963,7 @@ void __init setup_arch(char **cmdline_p)
1082 ioapic_init_mappings(); 963 ioapic_init_mappings();
1083 964
1084 /* need to wait for io_apic is mapped */ 965 /* need to wait for io_apic is mapped */
1085 nr_irqs = probe_nr_irqs(); 966 probe_nr_irqs_gsi();
1086 967
1087 kvm_guest_init(); 968 kvm_guest_init();
1088 969
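The new update_genapic quirk above gives subarchitectures a hook to adjust the genapic before secondary CPUs are brought up. A hedged sketch of an override, following the default_update_genapic() pattern from the diff; the wakeup routine and its (apicid, start_eip) signature are assumptions for illustration only:

static int demo_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
{
        /* hypothetical non-INIT wakeup method would go here */
        return 0;
}

static int __init demo_update_genapic(void)
{
#ifdef CONFIG_SMP
        apic->wakeup_cpu = demo_wakeup_secondary_cpu;
#endif
        return 0;
}

static struct x86_quirks demo_quirks __initdata = {
        .update_genapic = demo_update_genapic,
};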
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ae0c0d3bb770..d992e6cff730 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -5,133 +5,54 @@
5#include <linux/percpu.h> 5#include <linux/percpu.h>
6#include <linux/kexec.h> 6#include <linux/kexec.h>
7#include <linux/crash_dump.h> 7#include <linux/crash_dump.h>
8#include <asm/smp.h> 8#include <linux/smp.h>
9#include <asm/percpu.h> 9#include <linux/topology.h>
10#include <asm/sections.h> 10#include <asm/sections.h>
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/setup.h> 12#include <asm/setup.h>
13#include <asm/topology.h>
14#include <asm/mpspec.h> 13#include <asm/mpspec.h>
15#include <asm/apicdef.h> 14#include <asm/apicdef.h>
16#include <asm/highmem.h> 15#include <asm/highmem.h>
16#include <asm/proto.h>
17#include <asm/cpumask.h>
18#include <asm/cpu.h>
19#include <asm/stackprotector.h>
17 20
18#ifdef CONFIG_X86_LOCAL_APIC 21#ifdef CONFIG_DEBUG_PER_CPU_MAPS
19unsigned int num_processors; 22# define DBG(x...) printk(KERN_DEBUG x)
20unsigned disabled_cpus __cpuinitdata;
21/* Processor that is doing the boot up */
22unsigned int boot_cpu_physical_apicid = -1U;
23unsigned int max_physical_apicid;
24EXPORT_SYMBOL(boot_cpu_physical_apicid);
25
26/* Bitmask of physically existing CPUs */
27physid_mask_t phys_cpu_present_map;
28#endif
29
30/* map cpu index to physical APIC ID */
31DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
32DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
33EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
34EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
35
36#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
37#define X86_64_NUMA 1
38
39/* map cpu index to node index */
40DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
41EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
42
43/* which logical CPUs are on which nodes */
44cpumask_t *node_to_cpumask_map;
45EXPORT_SYMBOL(node_to_cpumask_map);
46
47/* setup node_to_cpumask_map */
48static void __init setup_node_to_cpumask_map(void);
49
50#else 23#else
51static inline void setup_node_to_cpumask_map(void) { } 24# define DBG(x...)
52#endif 25#endif
53 26
54#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) 27DEFINE_PER_CPU(int, cpu_number);
55/* 28EXPORT_PER_CPU_SYMBOL(cpu_number);
56 * Copy data used in early init routines from the initial arrays to the
57 * per cpu data areas. These arrays then become expendable and the
58 * *_early_ptr's are zeroed indicating that the static arrays are gone.
59 */
60static void __init setup_per_cpu_maps(void)
61{
62 int cpu;
63 29
64 for_each_possible_cpu(cpu) { 30#ifdef CONFIG_X86_64
65 per_cpu(x86_cpu_to_apicid, cpu) = 31#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
66 early_per_cpu_map(x86_cpu_to_apicid, cpu); 32#else
67 per_cpu(x86_bios_cpu_apicid, cpu) = 33#define BOOT_PERCPU_OFFSET 0
68 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
69#ifdef X86_64_NUMA
70 per_cpu(x86_cpu_to_node_map, cpu) =
71 early_per_cpu_map(x86_cpu_to_node_map, cpu);
72#endif 34#endif
73 }
74 35
75 /* indicate the early static arrays will soon be gone */ 36DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
76 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; 37EXPORT_PER_CPU_SYMBOL(this_cpu_off);
77 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
78#ifdef X86_64_NUMA
79 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
80#endif
81}
82 38
83#ifdef CONFIG_X86_32 39unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
84/* 40 [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
85 * Great future not-so-futuristic plan: make i386 and x86_64 do it 41};
86 * the same way
87 */
88unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
89EXPORT_SYMBOL(__per_cpu_offset); 42EXPORT_SYMBOL(__per_cpu_offset);
90static inline void setup_cpu_pda_map(void) { }
91
92#elif !defined(CONFIG_SMP)
93static inline void setup_cpu_pda_map(void) { }
94
95#else /* CONFIG_SMP && CONFIG_X86_64 */
96 43
97/* 44static inline void setup_percpu_segment(int cpu)
98 * Allocate cpu_pda pointer table and array via alloc_bootmem.
99 */
100static void __init setup_cpu_pda_map(void)
101{ 45{
102 char *pda; 46#ifdef CONFIG_X86_32
103 struct x8664_pda **new_cpu_pda; 47 struct desc_struct gdt;
104 unsigned long size;
105 int cpu;
106
107 size = roundup(sizeof(struct x8664_pda), cache_line_size());
108
109 /* allocate cpu_pda array and pointer table */
110 {
111 unsigned long tsize = nr_cpu_ids * sizeof(void *);
112 unsigned long asize = size * (nr_cpu_ids - 1);
113
114 tsize = roundup(tsize, cache_line_size());
115 new_cpu_pda = alloc_bootmem(tsize + asize);
116 pda = (char *)new_cpu_pda + tsize;
117 }
118
119 /* initialize pointer table to static pda's */
120 for_each_possible_cpu(cpu) {
121 if (cpu == 0) {
122 /* leave boot cpu pda in place */
123 new_cpu_pda[0] = cpu_pda(0);
124 continue;
125 }
126 new_cpu_pda[cpu] = (struct x8664_pda *)pda;
127 new_cpu_pda[cpu]->in_bootmem = 1;
128 pda += size;
129 }
130 48
131 /* point to new pointer table */ 49 pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
132 _cpu_pda = new_cpu_pda; 50 0x2 | DESCTYPE_S, 0x8);
133} 51 gdt.s = 1;
52 write_gdt_entry(get_cpu_gdt_table(cpu),
53 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
134#endif 54#endif
55}
135 56
136/* 57/*
137 * Great future plan: 58 * Great future plan:
@@ -140,251 +61,86 @@ static void __init setup_cpu_pda_map(void)
140 */ 61 */
141void __init setup_per_cpu_areas(void) 62void __init setup_per_cpu_areas(void)
142{ 63{
143 ssize_t size, old_size; 64 ssize_t size;
144 char *ptr; 65 char *ptr;
145 int cpu; 66 int cpu;
146 unsigned long align = 1;
147
148 /* Setup cpu_pda map */
149 setup_cpu_pda_map();
150 67
151 /* Copy section for each CPU (we discard the original) */ 68 /* Copy section for each CPU (we discard the original) */
152 old_size = PERCPU_ENOUGH_ROOM; 69 size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
153 align = max_t(unsigned long, PAGE_SIZE, align); 70
154 size = roundup(old_size, align); 71 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
155 printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", 72 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
156 size); 73
74 pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);
157 75
158 for_each_possible_cpu(cpu) { 76 for_each_possible_cpu(cpu) {
159#ifndef CONFIG_NEED_MULTIPLE_NODES 77#ifndef CONFIG_NEED_MULTIPLE_NODES
160 ptr = __alloc_bootmem(size, align, 78 ptr = alloc_bootmem_pages(size);
161 __pa(MAX_DMA_ADDRESS));
162#else 79#else
163 int node = early_cpu_to_node(cpu); 80 int node = early_cpu_to_node(cpu);
164 if (!node_online(node) || !NODE_DATA(node)) { 81 if (!node_online(node) || !NODE_DATA(node)) {
165 ptr = __alloc_bootmem(size, align, 82 ptr = alloc_bootmem_pages(size);
166 __pa(MAX_DMA_ADDRESS)); 83 pr_info("cpu %d has no node %d or node-local memory\n",
167 printk(KERN_INFO
168 "cpu %d has no node %d or node-local memory\n",
169 cpu, node); 84 cpu, node);
170 if (ptr) 85 pr_debug("per cpu data for cpu%d at %016lx\n",
171 printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n", 86 cpu, __pa(ptr));
172 cpu, __pa(ptr)); 87 } else {
173 } 88 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
174 else { 89 pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
175 ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, 90 cpu, node, __pa(ptr));
176 __pa(MAX_DMA_ADDRESS));
177 if (ptr)
178 printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
179 cpu, node, __pa(ptr));
180 } 91 }
181#endif 92#endif
93
94 memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
182 per_cpu_offset(cpu) = ptr - __per_cpu_start; 95 per_cpu_offset(cpu) = ptr - __per_cpu_start;
183 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 96 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
97 per_cpu(cpu_number, cpu) = cpu;
98 setup_percpu_segment(cpu);
99 setup_stack_canary_segment(cpu);
100 /*
101 * Copy data used in early init routines from the
102 * initial arrays to the per cpu data areas. These
103 * arrays then become expendable and the *_early_ptr's
104 * are zeroed indicating that the static arrays are
105 * gone.
106 */
107#ifdef CONFIG_X86_LOCAL_APIC
108 per_cpu(x86_cpu_to_apicid, cpu) =
109 early_per_cpu_map(x86_cpu_to_apicid, cpu);
110 per_cpu(x86_bios_cpu_apicid, cpu) =
111 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
112#endif
113#ifdef CONFIG_X86_64
114 per_cpu(irq_stack_ptr, cpu) =
115 per_cpu(irq_stack_union.irq_stack, cpu) +
116 IRQ_STACK_SIZE - 64;
117#ifdef CONFIG_NUMA
118 per_cpu(x86_cpu_to_node_map, cpu) =
119 early_per_cpu_map(x86_cpu_to_node_map, cpu);
120#endif
121#endif
122 /*
 123 * Up to this point, the boot CPU has been using the .data.init
124 * area. Reload any changed state for the boot CPU.
125 */
126 if (cpu == boot_cpu_id)
127 switch_to_new_gdt(cpu);
128
129 DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
184 } 130 }
185 131
186 printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", 132 /* indicate the early static arrays will soon be gone */
187 NR_CPUS, nr_cpu_ids, nr_node_ids); 133#ifdef CONFIG_X86_LOCAL_APIC
188 134 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
189 /* Setup percpu data maps */ 135 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
190 setup_per_cpu_maps(); 136#endif
137#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
138 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
139#endif
191 140
192 /* Setup node to cpumask map */ 141 /* Setup node to cpumask map */
193 setup_node_to_cpumask_map(); 142 setup_node_to_cpumask_map();
194}
195
196#endif
197 143
198#ifdef X86_64_NUMA 144 /* Setup cpu initialized, callin, callout masks */
199 145 setup_cpu_local_masks();
200/*
201 * Allocate node_to_cpumask_map based on number of available nodes
202 * Requires node_possible_map to be valid.
203 *
204 * Note: node_to_cpumask() is not valid until after this is done.
205 */
206static void __init setup_node_to_cpumask_map(void)
207{
208 unsigned int node, num = 0;
209 cpumask_t *map;
210
211 /* setup nr_node_ids if not done yet */
212 if (nr_node_ids == MAX_NUMNODES) {
213 for_each_node_mask(node, node_possible_map)
214 num = node;
215 nr_node_ids = num + 1;
216 }
217
218 /* allocate the map */
219 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
220
221 pr_debug("Node to cpumask map at %p for %d nodes\n",
222 map, nr_node_ids);
223
224 /* node_to_cpumask() will now work */
225 node_to_cpumask_map = map;
226}
227
228void __cpuinit numa_set_node(int cpu, int node)
229{
230 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
231
232 if (cpu_pda(cpu) && node != NUMA_NO_NODE)
233 cpu_pda(cpu)->nodenumber = node;
234
235 if (cpu_to_node_map)
236 cpu_to_node_map[cpu] = node;
237
238 else if (per_cpu_offset(cpu))
239 per_cpu(x86_cpu_to_node_map, cpu) = node;
240
241 else
242 pr_debug("Setting node for non-present cpu %d\n", cpu);
243}
244
245void __cpuinit numa_clear_node(int cpu)
246{
247 numa_set_node(cpu, NUMA_NO_NODE);
248}
249
250#ifndef CONFIG_DEBUG_PER_CPU_MAPS
251
252void __cpuinit numa_add_cpu(int cpu)
253{
254 cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
255}
256
257void __cpuinit numa_remove_cpu(int cpu)
258{
259 cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
260}
261
262#else /* CONFIG_DEBUG_PER_CPU_MAPS */
263
264/*
265 * --------- debug versions of the numa functions ---------
266 */
267static void __cpuinit numa_set_cpumask(int cpu, int enable)
268{
269 int node = cpu_to_node(cpu);
270 cpumask_t *mask;
271 char buf[64];
272
273 if (node_to_cpumask_map == NULL) {
274 printk(KERN_ERR "node_to_cpumask_map NULL\n");
275 dump_stack();
276 return;
277 }
278
279 mask = &node_to_cpumask_map[node];
280 if (enable)
281 cpu_set(cpu, *mask);
282 else
283 cpu_clear(cpu, *mask);
284
285 cpulist_scnprintf(buf, sizeof(buf), *mask);
286 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
287 enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
288 }
289
290void __cpuinit numa_add_cpu(int cpu)
291{
292 numa_set_cpumask(cpu, 1);
293}
294
295void __cpuinit numa_remove_cpu(int cpu)
296{
297 numa_set_cpumask(cpu, 0);
298}
299
300int cpu_to_node(int cpu)
301{
302 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
303 printk(KERN_WARNING
304 "cpu_to_node(%d): usage too early!\n", cpu);
305 dump_stack();
306 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
307 }
308 return per_cpu(x86_cpu_to_node_map, cpu);
309}
310EXPORT_SYMBOL(cpu_to_node);
311
312/*
313 * Same function as cpu_to_node() but used if called before the
314 * per_cpu areas are setup.
315 */
316int early_cpu_to_node(int cpu)
317{
318 if (early_per_cpu_ptr(x86_cpu_to_node_map))
319 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
320
321 if (!per_cpu_offset(cpu)) {
322 printk(KERN_WARNING
323 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
324 dump_stack();
325 return NUMA_NO_NODE;
326 }
327 return per_cpu(x86_cpu_to_node_map, cpu);
328} 146}
329
330
331/* empty cpumask */
332static const cpumask_t cpu_mask_none;
333
334/*
335 * Returns a pointer to the bitmask of CPUs on Node 'node'.
336 */
337const cpumask_t *_node_to_cpumask_ptr(int node)
338{
339 if (node_to_cpumask_map == NULL) {
340 printk(KERN_WARNING
341 "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
342 node);
343 dump_stack();
344 return (const cpumask_t *)&cpu_online_map;
345 }
346 if (node >= nr_node_ids) {
347 printk(KERN_WARNING
348 "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
349 node, nr_node_ids);
350 dump_stack();
351 return &cpu_mask_none;
352 }
353 return &node_to_cpumask_map[node];
354}
355EXPORT_SYMBOL(_node_to_cpumask_ptr);
356
357/*
358 * Returns a bitmask of CPUs on Node 'node'.
359 *
360 * Side note: this function creates the returned cpumask on the stack
361 * so with a high NR_CPUS count, excessive stack space is used. The
362 * node_to_cpumask_ptr function should be used whenever possible.
363 */
364cpumask_t node_to_cpumask(int node)
365{
366 if (node_to_cpumask_map == NULL) {
367 printk(KERN_WARNING
368 "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
369 dump_stack();
370 return cpu_online_map;
371 }
372 if (node >= nr_node_ids) {
373 printk(KERN_WARNING
374 "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
375 node, nr_node_ids);
376 dump_stack();
377 return cpu_mask_none;
378 }
379 return node_to_cpumask_map[node];
380}
381EXPORT_SYMBOL(node_to_cpumask);
382
383/*
384 * --------- end of debug versions of the numa functions ---------
385 */
386
387#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
388
389#endif /* X86_64_NUMA */
390
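To show what the reworked setup_per_cpu_areas() above ultimately enables, a minimal sketch with a hypothetical per-CPU variable: remote accesses go through __per_cpu_offset[cpu], local ones through the segment-relative accessors fed by this_cpu_off.

#include <linux/percpu.h>

DEFINE_PER_CPU(int, demo_counter);      /* hypothetical per-CPU variable */

static void demo_percpu_access(int cpu)
{
        /* remote CPU: variable address plus __per_cpu_offset[cpu] */
        per_cpu(demo_counter, cpu)++;

        /* this CPU: %fs/%gs-relative access using this_cpu_off */
        percpu_write(demo_counter, 0);
}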
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
deleted file mode 100644
index cc673aa55ce4..000000000000
--- a/arch/x86/kernel/sigframe.h
+++ /dev/null
@@ -1,42 +0,0 @@
1#ifdef CONFIG_X86_32
2struct sigframe {
3 char __user *pretcode;
4 int sig;
5 struct sigcontext sc;
6 /*
7 * fpstate is unused. fpstate is moved/allocated after
8 * retcode[] below. This movement allows to have the FP state and the
9 * future state extensions (xsave) stay together.
10 * And at the same time retaining the unused fpstate, prevents changing
11 * the offset of extramask[] in the sigframe and thus prevent any
12 * legacy application accessing/modifying it.
13 */
14 struct _fpstate fpstate_unused;
15 unsigned long extramask[_NSIG_WORDS-1];
16 char retcode[8];
17 /* fp state follows here */
18};
19
20struct rt_sigframe {
21 char __user *pretcode;
22 int sig;
23 struct siginfo __user *pinfo;
24 void __user *puc;
25 struct siginfo info;
26 struct ucontext uc;
27 char retcode[8];
28 /* fp state follows here */
29};
30#else
31struct rt_sigframe {
32 char __user *pretcode;
33 struct ucontext uc;
34 struct siginfo info;
35 /* fp state follows here */
36};
37
38int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
39 sigset_t *set, struct pt_regs *regs);
40int ia32_setup_frame(int sig, struct k_sigaction *ka,
41 sigset_t *set, struct pt_regs *regs);
42#endif
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal.c
index d6dd057d0f22..7cdcd16885ed 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal.c
@@ -1,36 +1,41 @@
1/* 1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds 2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
3 * 4 *
4 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson 5 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
5 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen
6 */ 8 */
7#include <linux/list.h>
8 9
9#include <linux/personality.h> 10#include <linux/sched.h>
10#include <linux/binfmts.h> 11#include <linux/mm.h>
11#include <linux/suspend.h> 12#include <linux/smp.h>
12#include <linux/kernel.h> 13#include <linux/kernel.h>
13#include <linux/ptrace.h>
14#include <linux/signal.h> 14#include <linux/signal.h>
15#include <linux/stddef.h>
16#include <linux/unistd.h>
17#include <linux/errno.h> 15#include <linux/errno.h>
18#include <linux/sched.h>
19#include <linux/wait.h> 16#include <linux/wait.h>
17#include <linux/ptrace.h>
20#include <linux/tracehook.h> 18#include <linux/tracehook.h>
21#include <linux/elf.h> 19#include <linux/unistd.h>
22#include <linux/smp.h> 20#include <linux/stddef.h>
23#include <linux/mm.h> 21#include <linux/personality.h>
22#include <linux/uaccess.h>
24 23
25#include <asm/processor.h> 24#include <asm/processor.h>
26#include <asm/ucontext.h> 25#include <asm/ucontext.h>
27#include <asm/uaccess.h>
28#include <asm/i387.h> 26#include <asm/i387.h>
29#include <asm/vdso.h> 27#include <asm/vdso.h>
28
29#ifdef CONFIG_X86_64
30#include <asm/proto.h>
31#include <asm/ia32_unistd.h>
32#include <asm/mce.h>
33#endif /* CONFIG_X86_64 */
34
30#include <asm/syscall.h> 35#include <asm/syscall.h>
31#include <asm/syscalls.h> 36#include <asm/syscalls.h>
32 37
33#include "sigframe.h" 38#include <asm/sigframe.h>
34 39
35#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) 40#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
36 41
@@ -45,99 +50,24 @@
45# define FIX_EFLAGS __FIX_EFLAGS 50# define FIX_EFLAGS __FIX_EFLAGS
46#endif 51#endif
47 52
48/* 53#define COPY(x) do { \
49 * Atomically swap in the new signal mask, and wait for a signal. 54 get_user_ex(regs->x, &sc->x); \
50 */ 55} while (0)
51asmlinkage int
52sys_sigsuspend(int history0, int history1, old_sigset_t mask)
53{
54 mask &= _BLOCKABLE;
55 spin_lock_irq(&current->sighand->siglock);
56 current->saved_sigmask = current->blocked;
57 siginitset(&current->blocked, mask);
58 recalc_sigpending();
59 spin_unlock_irq(&current->sighand->siglock);
60
61 current->state = TASK_INTERRUPTIBLE;
62 schedule();
63 set_restore_sigmask();
64 56
65 return -ERESTARTNOHAND; 57#define GET_SEG(seg) ({ \
66} 58 unsigned short tmp; \
59 get_user_ex(tmp, &sc->seg); \
60 tmp; \
61})
67 62
68asmlinkage int 63#define COPY_SEG(seg) do { \
69sys_sigaction(int sig, const struct old_sigaction __user *act, 64 regs->seg = GET_SEG(seg); \
70 struct old_sigaction __user *oact) 65} while (0)
71{
72 struct k_sigaction new_ka, old_ka;
73 int ret;
74 66
75 if (act) { 67#define COPY_SEG_CPL3(seg) do { \
76 old_sigset_t mask; 68 regs->seg = GET_SEG(seg) | 3; \
69} while (0)
77 70
78 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
79 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
80 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
81 return -EFAULT;
82
83 __get_user(new_ka.sa.sa_flags, &act->sa_flags);
84 __get_user(mask, &act->sa_mask);
85 siginitset(&new_ka.sa.sa_mask, mask);
86 }
87
88 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
89
90 if (!ret && oact) {
91 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
92 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
93 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
94 return -EFAULT;
95
96 __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
97 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
98 }
99
100 return ret;
101}
102
103asmlinkage int sys_sigaltstack(unsigned long bx)
104{
105 /*
106 * This is needed to make gcc realize it doesn't own the
107 * "struct pt_regs"
108 */
109 struct pt_regs *regs = (struct pt_regs *)&bx;
110 const stack_t __user *uss = (const stack_t __user *)bx;
111 stack_t __user *uoss = (stack_t __user *)regs->cx;
112
113 return do_sigaltstack(uss, uoss, regs->sp);
114}
115
116#define COPY(x) { \
117 err |= __get_user(regs->x, &sc->x); \
118}
119
120#define COPY_SEG(seg) { \
121 unsigned short tmp; \
122 err |= __get_user(tmp, &sc->seg); \
123 regs->seg = tmp; \
124}
125
126#define COPY_SEG_STRICT(seg) { \
127 unsigned short tmp; \
128 err |= __get_user(tmp, &sc->seg); \
129 regs->seg = tmp | 3; \
130}
131
132#define GET_SEG(seg) { \
133 unsigned short tmp; \
134 err |= __get_user(tmp, &sc->seg); \
135 loadsegment(seg, tmp); \
136}
137
138/*
139 * Do a signal return; undo the signal stack.
140 */
141static int 71static int
142restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 72restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
143 unsigned long *pax) 73 unsigned long *pax)
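The new COPY()/COPY_SEG()/COPY_SEG_CPL3() helpers earlier in this hunk are wrapped in do { ... } while (0) so that each macro expands to exactly one statement. A small standalone C illustration of that idiom, with hypothetical macro names and nothing taken from the kernel:

#include <stdio.h>

/*
 * SWAP_BAD expands to three separate statements; used as the unbraced body
 * of an if it would not behave (or even compile) as intended.
 */
#define SWAP_BAD(a, b)  int tmp = (a); (a) = (b); (b) = tmp
/* The do/while(0) wrapper makes the macro a single statement. */
#define SWAP_OK(a, b)   do { int tmp = (a); (a) = (b); (b) = tmp; } while (0)

int main(void)
{
	int x = 1, y = 2;

	if (x < y)
		SWAP_OK(x, y);	/* behaves as one statement, even without braces */
	else
		printf("already ordered\n");

	printf("x=%d y=%d\n", x, y);
	return 0;
}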
@@ -149,150 +79,136 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
149 /* Always make any pending restarted system calls return -EINTR */ 79 /* Always make any pending restarted system calls return -EINTR */
150 current_thread_info()->restart_block.fn = do_no_restart_syscall; 80 current_thread_info()->restart_block.fn = do_no_restart_syscall;
151 81
152 GET_SEG(gs); 82 get_user_try {
153 COPY_SEG(fs);
154 COPY_SEG(es);
155 COPY_SEG(ds);
156 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
157 COPY(dx); COPY(cx); COPY(ip);
158 COPY_SEG_STRICT(cs);
159 COPY_SEG_STRICT(ss);
160
161 err |= __get_user(tmpflags, &sc->flags);
162 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
163 regs->orig_ax = -1; /* disable syscall checks */
164
165 err |= __get_user(buf, &sc->fpstate);
166 err |= restore_i387_xstate(buf);
167
168 err |= __get_user(*pax, &sc->ax);
169 return err;
170}
171 83
172asmlinkage unsigned long sys_sigreturn(unsigned long __unused) 84#ifdef CONFIG_X86_32
173{ 85 set_user_gs(regs, GET_SEG(gs));
174 struct sigframe __user *frame; 86 COPY_SEG(fs);
175 struct pt_regs *regs; 87 COPY_SEG(es);
176 unsigned long ax; 88 COPY_SEG(ds);
177 sigset_t set; 89#endif /* CONFIG_X86_32 */
178 90
179 regs = (struct pt_regs *) &__unused; 91 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
180 frame = (struct sigframe __user *)(regs->sp - 8); 92 COPY(dx); COPY(cx); COPY(ip);
181 93
182 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 94#ifdef CONFIG_X86_64
183 goto badframe; 95 COPY(r8);
184 if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 96 COPY(r9);
185 && __copy_from_user(&set.sig[1], &frame->extramask, 97 COPY(r10);
186 sizeof(frame->extramask)))) 98 COPY(r11);
187 goto badframe; 99 COPY(r12);
100 COPY(r13);
101 COPY(r14);
102 COPY(r15);
103#endif /* CONFIG_X86_64 */
188 104
189 sigdelsetmask(&set, ~_BLOCKABLE); 105#ifdef CONFIG_X86_32
190 spin_lock_irq(&current->sighand->siglock); 106 COPY_SEG_CPL3(cs);
191 current->blocked = set; 107 COPY_SEG_CPL3(ss);
192 recalc_sigpending(); 108#else /* !CONFIG_X86_32 */
193 spin_unlock_irq(&current->sighand->siglock); 109 /* Kernel saves and restores only the CS segment register on signals,
110 * which is the bare minimum needed to allow mixed 32/64-bit code.
111 * App's signal handler can save/restore other segments if needed. */
112 COPY_SEG_CPL3(cs);
113#endif /* CONFIG_X86_32 */
194 114
195 if (restore_sigcontext(regs, &frame->sc, &ax)) 115 get_user_ex(tmpflags, &sc->flags);
196 goto badframe; 116 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
197 return ax; 117 regs->orig_ax = -1; /* disable syscall checks */
198 118
199badframe: 119 get_user_ex(buf, &sc->fpstate);
200 if (show_unhandled_signals && printk_ratelimit()) { 120 err |= restore_i387_xstate(buf);
201 printk("%s%s[%d] bad frame in sigreturn frame:"
202 "%p ip:%lx sp:%lx oeax:%lx",
203 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
204 current->comm, task_pid_nr(current), frame, regs->ip,
205 regs->sp, regs->orig_ax);
206 print_vma_addr(" in ", regs->ip);
207 printk(KERN_CONT "\n");
208 }
209 121
210 force_sig(SIGSEGV, current); 122 get_user_ex(*pax, &sc->ax);
123 } get_user_catch(err);
211 124
212 return 0; 125 return err;
213} 126}
214 127
215static long do_rt_sigreturn(struct pt_regs *regs) 128static int
129setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
130 struct pt_regs *regs, unsigned long mask)
216{ 131{
217 struct rt_sigframe __user *frame; 132 int err = 0;
218 unsigned long ax;
219 sigset_t set;
220
221 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
222 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
223 goto badframe;
224 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
225 goto badframe;
226
227 sigdelsetmask(&set, ~_BLOCKABLE);
228 spin_lock_irq(&current->sighand->siglock);
229 current->blocked = set;
230 recalc_sigpending();
231 spin_unlock_irq(&current->sighand->siglock);
232 133
233 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 134 put_user_try {
234 goto badframe;
235 135
236 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) 136#ifdef CONFIG_X86_32
237 goto badframe; 137 put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs);
138 put_user_ex(regs->fs, (unsigned int __user *)&sc->fs);
139 put_user_ex(regs->es, (unsigned int __user *)&sc->es);
140 put_user_ex(regs->ds, (unsigned int __user *)&sc->ds);
141#endif /* CONFIG_X86_32 */
238 142
239 return ax; 143 put_user_ex(regs->di, &sc->di);
144 put_user_ex(regs->si, &sc->si);
145 put_user_ex(regs->bp, &sc->bp);
146 put_user_ex(regs->sp, &sc->sp);
147 put_user_ex(regs->bx, &sc->bx);
148 put_user_ex(regs->dx, &sc->dx);
149 put_user_ex(regs->cx, &sc->cx);
150 put_user_ex(regs->ax, &sc->ax);
151#ifdef CONFIG_X86_64
152 put_user_ex(regs->r8, &sc->r8);
153 put_user_ex(regs->r9, &sc->r9);
154 put_user_ex(regs->r10, &sc->r10);
155 put_user_ex(regs->r11, &sc->r11);
156 put_user_ex(regs->r12, &sc->r12);
157 put_user_ex(regs->r13, &sc->r13);
158 put_user_ex(regs->r14, &sc->r14);
159 put_user_ex(regs->r15, &sc->r15);
160#endif /* CONFIG_X86_64 */
161
162 put_user_ex(current->thread.trap_no, &sc->trapno);
163 put_user_ex(current->thread.error_code, &sc->err);
164 put_user_ex(regs->ip, &sc->ip);
165#ifdef CONFIG_X86_32
166 put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
167 put_user_ex(regs->flags, &sc->flags);
168 put_user_ex(regs->sp, &sc->sp_at_signal);
169 put_user_ex(regs->ss, (unsigned int __user *)&sc->ss);
170#else /* !CONFIG_X86_32 */
171 put_user_ex(regs->flags, &sc->flags);
172 put_user_ex(regs->cs, &sc->cs);
173 put_user_ex(0, &sc->gs);
174 put_user_ex(0, &sc->fs);
175#endif /* CONFIG_X86_32 */
240 176
241badframe: 177 put_user_ex(fpstate, &sc->fpstate);
242 signal_fault(regs, frame, "rt_sigreturn");
243 return 0;
244}
245 178
246asmlinkage int sys_rt_sigreturn(unsigned long __unused) 179 /* non-iBCS2 extensions.. */
247{ 180 put_user_ex(mask, &sc->oldmask);
248 struct pt_regs *regs = (struct pt_regs *)&__unused; 181 put_user_ex(current->thread.cr2, &sc->cr2);
182 } put_user_catch(err);
249 183
250 return do_rt_sigreturn(regs); 184 return err;
251} 185}
252 186
253/* 187/*
254 * Set up a signal frame. 188 * Set up a signal frame.
255 */ 189 */
256static int 190#ifdef CONFIG_X86_32
257setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, 191static const struct {
258 struct pt_regs *regs, unsigned long mask) 192 u16 poplmovl;
259{ 193 u32 val;
260 int tmp, err = 0; 194 u16 int80;
261 195} __attribute__((packed)) retcode = {
262 err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); 196 0xb858, /* popl %eax; movl $..., %eax */
263 savesegment(gs, tmp); 197 __NR_sigreturn,
264 err |= __put_user(tmp, (unsigned int __user *)&sc->gs); 198 0x80cd, /* int $0x80 */
265 199};
266 err |= __put_user(regs->es, (unsigned int __user *)&sc->es); 200
267 err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); 201static const struct {
268 err |= __put_user(regs->di, &sc->di); 202 u8 movl;
269 err |= __put_user(regs->si, &sc->si); 203 u32 val;
270 err |= __put_user(regs->bp, &sc->bp); 204 u16 int80;
271 err |= __put_user(regs->sp, &sc->sp); 205 u8 pad;
272 err |= __put_user(regs->bx, &sc->bx); 206} __attribute__((packed)) rt_retcode = {
273 err |= __put_user(regs->dx, &sc->dx); 207 0xb8, /* movl $..., %eax */
274 err |= __put_user(regs->cx, &sc->cx); 208 __NR_rt_sigreturn,
275 err |= __put_user(regs->ax, &sc->ax); 209 0x80cd, /* int $0x80 */
276 err |= __put_user(current->thread.trap_no, &sc->trapno); 210 0
277 err |= __put_user(current->thread.error_code, &sc->err); 211};
278 err |= __put_user(regs->ip, &sc->ip);
279 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs);
280 err |= __put_user(regs->flags, &sc->flags);
281 err |= __put_user(regs->sp, &sc->sp_at_signal);
282 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
283
284 tmp = save_i387_xstate(fpstate);
285 if (tmp < 0)
286 err = 1;
287 else
288 err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
289
290 /* non-iBCS2 extensions.. */
291 err |= __put_user(mask, &sc->oldmask);
292 err |= __put_user(current->thread.cr2, &sc->cr2);
293
294 return err;
295}
296 212
297/* 213/*
298 * Determine which stack to use.. 214 * Determine which stack to use..
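The packed retcode and rt_retcode structures above encode the historical "popl %eax; movl $..., %eax; int $0x80" and "movl $..., %eax; int $0x80" stubs as little-endian integers. A hedged userspace sketch that dumps the resulting bytes; the syscall number below is a placeholder, not the value from the kernel headers:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define FAKE_NR_SIGRETURN 119	/* placeholder for illustration only */

static const struct {
	uint16_t poplmovl;	/* 0xb858: 0x58 = popl %eax, 0xb8 = movl imm32, %eax */
	uint32_t val;		/* immediate operand of the movl */
	uint16_t int80;		/* 0x80cd: 0xcd 0x80 = int $0x80 */
} __attribute__((packed)) demo_retcode = {
	0xb858,
	FAKE_NR_SIGRETURN,
	0x80cd,
};

int main(void)
{
	unsigned char buf[sizeof(demo_retcode)];
	size_t i;

	/* On a little-endian machine this prints: 58 b8 77 00 00 00 cd 80 */
	memcpy(buf, &demo_retcode, sizeof(buf));
	for (i = 0; i < sizeof(buf); i++)
		printf("%02x ", buf[i]);
	printf("\n");
	return 0;
}

As the comments further down note, the stub is no longer executed (the vDSO restorer is used instead); gdb still pattern-matches these bytes to recognise signal frames, which is why the sequence is preserved.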
@@ -328,6 +244,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
328 if (used_math()) { 244 if (used_math()) {
329 sp = sp - sig_xstate_size; 245 sp = sp - sig_xstate_size;
330 *fpstate = (struct _fpstate *) sp; 246 *fpstate = (struct _fpstate *) sp;
247 if (save_i387_xstate(*fpstate) < 0)
248 return (void __user *)-1L;
331 } 249 }
332 250
333 sp -= frame_size; 251 sp -= frame_size;

@@ -383,9 +301,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
383 * reasons and because gdb uses it as a signature to notice 301 * reasons and because gdb uses it as a signature to notice
384 * signal handler stack frames. 302 * signal handler stack frames.
385 */ 303 */
386 err |= __put_user(0xb858, (short __user *)(frame->retcode+0)); 304 err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode);
387 err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
388 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
389 305
390 if (err) 306 if (err)
391 return -EFAULT; 307 return -EFAULT;
@@ -418,45 +334,41 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
418 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 334 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
419 return -EFAULT; 335 return -EFAULT;
420 336
421 err |= __put_user(sig, &frame->sig); 337 put_user_try {
422 err |= __put_user(&frame->info, &frame->pinfo); 338 put_user_ex(sig, &frame->sig);
423 err |= __put_user(&frame->uc, &frame->puc); 339 put_user_ex(&frame->info, &frame->pinfo);
424 err |= copy_siginfo_to_user(&frame->info, info); 340 put_user_ex(&frame->uc, &frame->puc);
425 if (err) 341 err |= copy_siginfo_to_user(&frame->info, info);
426 return -EFAULT;
427
428 /* Create the ucontext. */
429 if (cpu_has_xsave)
430 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
431 else
432 err |= __put_user(0, &frame->uc.uc_flags);
433 err |= __put_user(0, &frame->uc.uc_link);
434 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
435 err |= __put_user(sas_ss_flags(regs->sp),
436 &frame->uc.uc_stack.ss_flags);
437 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
438 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
439 regs, set->sig[0]);
440 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
441 if (err)
442 return -EFAULT;
443 342
444 /* Set up to return from userspace. */ 343 /* Create the ucontext. */
445 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); 344 if (cpu_has_xsave)
446 if (ka->sa.sa_flags & SA_RESTORER) 345 put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
447 restorer = ka->sa.sa_restorer; 346 else
448 err |= __put_user(restorer, &frame->pretcode); 347 put_user_ex(0, &frame->uc.uc_flags);
348 put_user_ex(0, &frame->uc.uc_link);
349 put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
350 put_user_ex(sas_ss_flags(regs->sp),
351 &frame->uc.uc_stack.ss_flags);
352 put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
353 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
354 regs, set->sig[0]);
355 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
356
357 /* Set up to return from userspace. */
358 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
359 if (ka->sa.sa_flags & SA_RESTORER)
360 restorer = ka->sa.sa_restorer;
361 put_user_ex(restorer, &frame->pretcode);
449 362
450 /* 363 /*
451 * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 364 * This is movl $__NR_rt_sigreturn, %ax ; int $0x80
452 * 365 *
453 * WE DO NOT USE IT ANY MORE! It's only left here for historical 366 * WE DO NOT USE IT ANY MORE! It's only left here for historical
454 * reasons and because gdb uses it as a signature to notice 367 * reasons and because gdb uses it as a signature to notice
455 * signal handler stack frames. 368 * signal handler stack frames.
456 */ 369 */
457 err |= __put_user(0xb8, (char __user *)(frame->retcode+0)); 370 put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
458 err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1)); 371 } put_user_catch(err);
459 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
460 372
461 if (err) 373 if (err)
462 return -EFAULT; 374 return -EFAULT;
@@ -475,23 +387,286 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
475 387
476 return 0; 388 return 0;
477} 389}
390#else /* !CONFIG_X86_32 */
391/*
392 * Determine which stack to use..
393 */
394static void __user *
395get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size)
396{
397 /* Default to using normal stack - redzone */
398 sp -= 128;
399
400 /* This is the X/Open sanctioned signal stack switching. */
401 if (ka->sa.sa_flags & SA_ONSTACK) {
402 if (sas_ss_flags(sp) == 0)
403 sp = current->sas_ss_sp + current->sas_ss_size;
404 }
405
406 return (void __user *)round_down(sp - size, 64);
407}
408
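get_stack() above skips the 128-byte red zone and honours the X/Open alternate signal stack when SA_ONSTACK is set. The userspace side of that contract is sigaltstack() plus SA_ONSTACK; a minimal sketch, assuming an ordinary POSIX environment:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static void on_sig(int sig)
{
	/* This handler runs on the alternate stack installed in main(). */
	static const char msg[] = "handler ran on the alternate stack\n";
	(void)sig;
	write(STDERR_FILENO, msg, sizeof(msg) - 1);
}

int main(void)
{
	stack_t ss;
	struct sigaction sa;

	/* Reserve a separate stack for signal delivery. */
	ss.ss_sp = malloc(SIGSTKSZ);
	ss.ss_size = SIGSTKSZ;
	ss.ss_flags = 0;
	if (ss.ss_sp == NULL || sigaltstack(&ss, NULL) < 0) {
		perror("sigaltstack");
		return 1;
	}

	/* SA_ONSTACK asks the kernel to build the signal frame on that stack. */
	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = on_sig;
	sa.sa_flags = SA_ONSTACK;
	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGUSR1, &sa, NULL) < 0) {
		perror("sigaction");
		return 1;
	}

	raise(SIGUSR1);
	return 0;
}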
409static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
410 sigset_t *set, struct pt_regs *regs)
411{
412 struct rt_sigframe __user *frame;
413 void __user *fp = NULL;
414 int err = 0;
415 struct task_struct *me = current;
416
417 if (used_math()) {
418 fp = get_stack(ka, regs->sp, sig_xstate_size);
419 frame = (void __user *)round_down(
420 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
421
422 if (save_i387_xstate(fp) < 0)
423 return -EFAULT;
424 } else
425 frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8;
426
427 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
428 return -EFAULT;
429
430 if (ka->sa.sa_flags & SA_SIGINFO) {
431 if (copy_siginfo_to_user(&frame->info, info))
432 return -EFAULT;
433 }
434
435 put_user_try {
436 /* Create the ucontext. */
437 if (cpu_has_xsave)
438 put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
439 else
440 put_user_ex(0, &frame->uc.uc_flags);
441 put_user_ex(0, &frame->uc.uc_link);
442 put_user_ex(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
443 put_user_ex(sas_ss_flags(regs->sp),
444 &frame->uc.uc_stack.ss_flags);
445 put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
446 err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
447 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
448
449 /* Set up to return from userspace. If provided, use a stub
450 already in userspace. */
451 /* x86-64 should always use SA_RESTORER. */
452 if (ka->sa.sa_flags & SA_RESTORER) {
453 put_user_ex(ka->sa.sa_restorer, &frame->pretcode);
454 } else {
455 /* could use a vstub here */
456 err |= -EFAULT;
457 }
458 } put_user_catch(err);
459
460 if (err)
461 return -EFAULT;
462
463 /* Set up registers for signal handler */
464 regs->di = sig;
465 /* In case the signal handler was declared without prototypes */
466 regs->ax = 0;
467
468 /* This also works for non-SA_SIGINFO handlers because they expect the
469 next argument after the signal number on the stack. */
470 regs->si = (unsigned long)&frame->info;
471 regs->dx = (unsigned long)&frame->uc;
472 regs->ip = (unsigned long) ka->sa.sa_handler;
473
474 regs->sp = (unsigned long)frame;
475
476 /* Set up the CS register to run signal handlers in 64-bit mode,
477 even if the handler happens to be interrupting 32-bit code. */
478 regs->cs = __USER_CS;
479
480 return 0;
481}
482#endif /* CONFIG_X86_32 */
483
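The register setup at the end of __setup_rt_frame() above (sig in di, &frame->info in si, &frame->uc in dx) is what userspace sees as the three-argument SA_SIGINFO handler prototype. A small sketch of that prototype from the userspace side, assuming POSIX signals:

#include <signal.h>
#include <stdio.h>
#include <string.h>

static volatile sig_atomic_t got_sig;
static siginfo_t saved_info;

static void handler(int sig, siginfo_t *info, void *ucontext)
{
	/* sig, info and ucontext arrive in the first three argument
	 * registers, matching the regs->di/si/dx setup done by the kernel. */
	saved_info = *info;
	got_sig = sig;
	(void)ucontext;
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = handler;	/* three-argument form */
	sa.sa_flags = SA_SIGINFO;	/* request siginfo + ucontext delivery */
	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGUSR1, &sa, NULL) < 0) {
		perror("sigaction");
		return 1;
	}

	raise(SIGUSR1);
	if (got_sig)
		printf("caught signal %d, si_code=%d\n",
		       (int)got_sig, saved_info.si_code);
	return 0;
}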
484#ifdef CONFIG_X86_32
485/*
486 * Atomically swap in the new signal mask, and wait for a signal.
487 */
488asmlinkage int
489sys_sigsuspend(int history0, int history1, old_sigset_t mask)
490{
491 mask &= _BLOCKABLE;
492 spin_lock_irq(&current->sighand->siglock);
493 current->saved_sigmask = current->blocked;
494 siginitset(&current->blocked, mask);
495 recalc_sigpending();
496 spin_unlock_irq(&current->sighand->siglock);
497
498 current->state = TASK_INTERRUPTIBLE;
499 schedule();
500 set_restore_sigmask();
501
502 return -ERESTARTNOHAND;
503}
504
505asmlinkage int
506sys_sigaction(int sig, const struct old_sigaction __user *act,
507 struct old_sigaction __user *oact)
508{
509 struct k_sigaction new_ka, old_ka;
510 int ret = 0;
511
512 if (act) {
513 old_sigset_t mask;
514
515 if (!access_ok(VERIFY_READ, act, sizeof(*act)))
516 return -EFAULT;
517
518 get_user_try {
519 get_user_ex(new_ka.sa.sa_handler, &act->sa_handler);
520 get_user_ex(new_ka.sa.sa_flags, &act->sa_flags);
521 get_user_ex(mask, &act->sa_mask);
522 get_user_ex(new_ka.sa.sa_restorer, &act->sa_restorer);
523 } get_user_catch(ret);
524
525 if (ret)
526 return -EFAULT;
527 siginitset(&new_ka.sa.sa_mask, mask);
528 }
529
530 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
531
532 if (!ret && oact) {
533 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)))
534 return -EFAULT;
535
536 put_user_try {
537 put_user_ex(old_ka.sa.sa_handler, &oact->sa_handler);
538 put_user_ex(old_ka.sa.sa_flags, &oact->sa_flags);
539 put_user_ex(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
540 put_user_ex(old_ka.sa.sa_restorer, &oact->sa_restorer);
541 } put_user_catch(ret);
542
543 if (ret)
544 return -EFAULT;
545 }
546
547 return ret;
548}
549#endif /* CONFIG_X86_32 */
550
551#ifdef CONFIG_X86_32
552int sys_sigaltstack(struct pt_regs *regs)
553{
554 const stack_t __user *uss = (const stack_t __user *)regs->bx;
555 stack_t __user *uoss = (stack_t __user *)regs->cx;
556
557 return do_sigaltstack(uss, uoss, regs->sp);
558}
559#else /* !CONFIG_X86_32 */
560asmlinkage long
561sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
562 struct pt_regs *regs)
563{
564 return do_sigaltstack(uss, uoss, regs->sp);
565}
566#endif /* CONFIG_X86_32 */
567
568/*
569 * Do a signal return; undo the signal stack.
570 */
571#ifdef CONFIG_X86_32
572unsigned long sys_sigreturn(struct pt_regs *regs)
573{
574 struct sigframe __user *frame;
575 unsigned long ax;
576 sigset_t set;
577
578 frame = (struct sigframe __user *)(regs->sp - 8);
579
580 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
581 goto badframe;
582 if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
583 && __copy_from_user(&set.sig[1], &frame->extramask,
584 sizeof(frame->extramask))))
585 goto badframe;
586
587 sigdelsetmask(&set, ~_BLOCKABLE);
588 spin_lock_irq(&current->sighand->siglock);
589 current->blocked = set;
590 recalc_sigpending();
591 spin_unlock_irq(&current->sighand->siglock);
592
593 if (restore_sigcontext(regs, &frame->sc, &ax))
594 goto badframe;
595 return ax;
596
597badframe:
598 signal_fault(regs, frame, "sigreturn");
599
600 return 0;
601}
602#endif /* CONFIG_X86_32 */
603
604long sys_rt_sigreturn(struct pt_regs *regs)
605{
606 struct rt_sigframe __user *frame;
607 unsigned long ax;
608 sigset_t set;
609
610 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
611 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
612 goto badframe;
613 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
614 goto badframe;
615
616 sigdelsetmask(&set, ~_BLOCKABLE);
617 spin_lock_irq(&current->sighand->siglock);
618 current->blocked = set;
619 recalc_sigpending();
620 spin_unlock_irq(&current->sighand->siglock);
621
622 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
623 goto badframe;
624
625 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
626 goto badframe;
627
628 return ax;
629
630badframe:
631 signal_fault(regs, frame, "rt_sigreturn");
632 return 0;
633}
478 634
479/* 635/*
480 * OK, we're invoking a handler: 636 * OK, we're invoking a handler:
481 */ 637 */
482static int signr_convert(int sig) 638static int signr_convert(int sig)
483{ 639{
640#ifdef CONFIG_X86_32
484 struct thread_info *info = current_thread_info(); 641 struct thread_info *info = current_thread_info();
485 642
486 if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) 643 if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
487 return info->exec_domain->signal_invmap[sig]; 644 return info->exec_domain->signal_invmap[sig];
645#endif /* CONFIG_X86_32 */
488 return sig; 646 return sig;
489} 647}
490 648
649#ifdef CONFIG_X86_32
650
491#define is_ia32 1 651#define is_ia32 1
492#define ia32_setup_frame __setup_frame 652#define ia32_setup_frame __setup_frame
493#define ia32_setup_rt_frame __setup_rt_frame 653#define ia32_setup_rt_frame __setup_rt_frame
494 654
655#else /* !CONFIG_X86_32 */
656
657#ifdef CONFIG_IA32_EMULATION
658#define is_ia32 test_thread_flag(TIF_IA32)
659#else /* !CONFIG_IA32_EMULATION */
660#define is_ia32 0
661#endif /* CONFIG_IA32_EMULATION */
662
663int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
664 sigset_t *set, struct pt_regs *regs);
665int ia32_setup_frame(int sig, struct k_sigaction *ka,
666 sigset_t *set, struct pt_regs *regs);
667
668#endif /* CONFIG_X86_32 */
669
495static int 670static int
496setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 671setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
497 sigset_t *set, struct pt_regs *regs) 672 sigset_t *set, struct pt_regs *regs)
@@ -592,7 +767,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
592 return 0; 767 return 0;
593} 768}
594 769
770#ifdef CONFIG_X86_32
595#define NR_restart_syscall __NR_restart_syscall 771#define NR_restart_syscall __NR_restart_syscall
772#else /* !CONFIG_X86_32 */
773#define NR_restart_syscall \
774 test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
775#endif /* CONFIG_X86_32 */
776
596/* 777/*
597 * Note that 'init' is a special process: it doesn't get signals it doesn't 778 * Note that 'init' is a special process: it doesn't get signals it doesn't
598 * want to handle. Thus you cannot kill init even with a SIGKILL even by 779 * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -704,8 +885,9 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
704 struct task_struct *me = current; 885 struct task_struct *me = current;
705 886
706 if (show_unhandled_signals && printk_ratelimit()) { 887 if (show_unhandled_signals && printk_ratelimit()) {
707 printk(KERN_INFO 888 printk("%s"
708 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", 889 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
890 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
709 me->comm, me->pid, where, frame, 891 me->comm, me->pid, where, frame,
710 regs->ip, regs->sp, regs->orig_ax); 892 regs->ip, regs->sp, regs->orig_ax);
711 print_vma_addr(" in ", regs->ip); 893 print_vma_addr(" in ", regs->ip);
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
deleted file mode 100644
index a5c9627f4db9..000000000000
--- a/arch/x86/kernel/signal_64.c
+++ /dev/null
@@ -1,516 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
4 *
5 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen
8 */
9
10#include <linux/sched.h>
11#include <linux/mm.h>
12#include <linux/smp.h>
13#include <linux/kernel.h>
14#include <linux/signal.h>
15#include <linux/errno.h>
16#include <linux/wait.h>
17#include <linux/ptrace.h>
18#include <linux/tracehook.h>
19#include <linux/unistd.h>
20#include <linux/stddef.h>
21#include <linux/personality.h>
22#include <linux/compiler.h>
23#include <linux/uaccess.h>
24
25#include <asm/processor.h>
26#include <asm/ucontext.h>
27#include <asm/i387.h>
28#include <asm/proto.h>
29#include <asm/ia32_unistd.h>
30#include <asm/mce.h>
31#include <asm/syscall.h>
32#include <asm/syscalls.h>
33#include "sigframe.h"
34
35#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
36
37#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
38 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
39 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
40 X86_EFLAGS_CF)
41
42#ifdef CONFIG_X86_32
43# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF)
44#else
45# define FIX_EFLAGS __FIX_EFLAGS
46#endif
47
48asmlinkage long
49sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
50 struct pt_regs *regs)
51{
52 return do_sigaltstack(uss, uoss, regs->sp);
53}
54
55#define COPY(x) { \
56 err |= __get_user(regs->x, &sc->x); \
57}
58
59#define COPY_SEG_STRICT(seg) { \
60 unsigned short tmp; \
61 err |= __get_user(tmp, &sc->seg); \
62 regs->seg = tmp | 3; \
63}
64
65/*
66 * Do a signal return; undo the signal stack.
67 */
68static int
69restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
70 unsigned long *pax)
71{
72 void __user *buf;
73 unsigned int tmpflags;
74 unsigned int err = 0;
75
76 /* Always make any pending restarted system calls return -EINTR */
77 current_thread_info()->restart_block.fn = do_no_restart_syscall;
78
79 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
80 COPY(dx); COPY(cx); COPY(ip);
81 COPY(r8);
82 COPY(r9);
83 COPY(r10);
84 COPY(r11);
85 COPY(r12);
86 COPY(r13);
87 COPY(r14);
88 COPY(r15);
89
90 /* Kernel saves and restores only the CS segment register on signals,
91 * which is the bare minimum needed to allow mixed 32/64-bit code.
92 * App's signal handler can save/restore other segments if needed. */
93 COPY_SEG_STRICT(cs);
94
95 err |= __get_user(tmpflags, &sc->flags);
96 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
97 regs->orig_ax = -1; /* disable syscall checks */
98
99 err |= __get_user(buf, &sc->fpstate);
100 err |= restore_i387_xstate(buf);
101
102 err |= __get_user(*pax, &sc->ax);
103 return err;
104}
105
106static long do_rt_sigreturn(struct pt_regs *regs)
107{
108 struct rt_sigframe __user *frame;
109 unsigned long ax;
110 sigset_t set;
111
112 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
113 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
114 goto badframe;
115 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
116 goto badframe;
117
118 sigdelsetmask(&set, ~_BLOCKABLE);
119 spin_lock_irq(&current->sighand->siglock);
120 current->blocked = set;
121 recalc_sigpending();
122 spin_unlock_irq(&current->sighand->siglock);
123
124 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
125 goto badframe;
126
127 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
128 goto badframe;
129
130 return ax;
131
132badframe:
133 signal_fault(regs, frame, "rt_sigreturn");
134 return 0;
135}
136
137asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
138{
139 return do_rt_sigreturn(regs);
140}
141
142/*
143 * Set up a signal frame.
144 */
145
146static inline int
147setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
148 unsigned long mask, struct task_struct *me)
149{
150 int err = 0;
151
152 err |= __put_user(regs->cs, &sc->cs);
153 err |= __put_user(0, &sc->gs);
154 err |= __put_user(0, &sc->fs);
155
156 err |= __put_user(regs->di, &sc->di);
157 err |= __put_user(regs->si, &sc->si);
158 err |= __put_user(regs->bp, &sc->bp);
159 err |= __put_user(regs->sp, &sc->sp);
160 err |= __put_user(regs->bx, &sc->bx);
161 err |= __put_user(regs->dx, &sc->dx);
162 err |= __put_user(regs->cx, &sc->cx);
163 err |= __put_user(regs->ax, &sc->ax);
164 err |= __put_user(regs->r8, &sc->r8);
165 err |= __put_user(regs->r9, &sc->r9);
166 err |= __put_user(regs->r10, &sc->r10);
167 err |= __put_user(regs->r11, &sc->r11);
168 err |= __put_user(regs->r12, &sc->r12);
169 err |= __put_user(regs->r13, &sc->r13);
170 err |= __put_user(regs->r14, &sc->r14);
171 err |= __put_user(regs->r15, &sc->r15);
172 err |= __put_user(me->thread.trap_no, &sc->trapno);
173 err |= __put_user(me->thread.error_code, &sc->err);
174 err |= __put_user(regs->ip, &sc->ip);
175 err |= __put_user(regs->flags, &sc->flags);
176 err |= __put_user(mask, &sc->oldmask);
177 err |= __put_user(me->thread.cr2, &sc->cr2);
178
179 return err;
180}
181
182/*
183 * Determine which stack to use..
184 */
185
186static void __user *
187get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
188{
189 unsigned long sp;
190
191 /* Default to using normal stack - redzone*/
192 sp = regs->sp - 128;
193
194 /* This is the X/Open sanctioned signal stack switching. */
195 if (ka->sa.sa_flags & SA_ONSTACK) {
196 if (sas_ss_flags(sp) == 0)
197 sp = current->sas_ss_sp + current->sas_ss_size;
198 }
199
200 return (void __user *)round_down(sp - size, 64);
201}
202
203static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
204 sigset_t *set, struct pt_regs *regs)
205{
206 struct rt_sigframe __user *frame;
207 void __user *fp = NULL;
208 int err = 0;
209 struct task_struct *me = current;
210
211 if (used_math()) {
212 fp = get_stack(ka, regs, sig_xstate_size);
213 frame = (void __user *)round_down(
214 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
215
216 if (save_i387_xstate(fp) < 0)
217 return -EFAULT;
218 } else
219 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
220
221 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
222 return -EFAULT;
223
224 if (ka->sa.sa_flags & SA_SIGINFO) {
225 if (copy_siginfo_to_user(&frame->info, info))
226 return -EFAULT;
227 }
228
229 /* Create the ucontext. */
230 if (cpu_has_xsave)
231 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
232 else
233 err |= __put_user(0, &frame->uc.uc_flags);
234 err |= __put_user(0, &frame->uc.uc_link);
235 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
236 err |= __put_user(sas_ss_flags(regs->sp),
237 &frame->uc.uc_stack.ss_flags);
238 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
239 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
240 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
241 if (sizeof(*set) == 16) {
242 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
243 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
244 } else
245 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
246
247 /* Set up to return from userspace. If provided, use a stub
248 already in userspace. */
249 /* x86-64 should always use SA_RESTORER. */
250 if (ka->sa.sa_flags & SA_RESTORER) {
251 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
252 } else {
253 /* could use a vstub here */
254 return -EFAULT;
255 }
256
257 if (err)
258 return -EFAULT;
259
260 /* Set up registers for signal handler */
261 regs->di = sig;
262 /* In case the signal handler was declared without prototypes */
263 regs->ax = 0;
264
265 /* This also works for non SA_SIGINFO handlers because they expect the
266 next argument after the signal number on the stack. */
267 regs->si = (unsigned long)&frame->info;
268 regs->dx = (unsigned long)&frame->uc;
269 regs->ip = (unsigned long) ka->sa.sa_handler;
270
271 regs->sp = (unsigned long)frame;
272
273 /* Set up the CS register to run signal handlers in 64-bit mode,
274 even if the handler happens to be interrupting 32-bit code. */
275 regs->cs = __USER_CS;
276
277 return 0;
278}
279
280/*
281 * OK, we're invoking a handler
282 */
283static int signr_convert(int sig)
284{
285 return sig;
286}
287
288#ifdef CONFIG_IA32_EMULATION
289#define is_ia32 test_thread_flag(TIF_IA32)
290#else
291#define is_ia32 0
292#endif
293
294static int
295setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
296 sigset_t *set, struct pt_regs *regs)
297{
298 int usig = signr_convert(sig);
299 int ret;
300
301 /* Set up the stack frame */
302 if (is_ia32) {
303 if (ka->sa.sa_flags & SA_SIGINFO)
304 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
305 else
306 ret = ia32_setup_frame(usig, ka, set, regs);
307 } else
308 ret = __setup_rt_frame(sig, ka, info, set, regs);
309
310 if (ret) {
311 force_sigsegv(sig, current);
312 return -EFAULT;
313 }
314
315 return ret;
316}
317
318static int
319handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
320 sigset_t *oldset, struct pt_regs *regs)
321{
322 int ret;
323
324 /* Are we from a system call? */
325 if (syscall_get_nr(current, regs) >= 0) {
326 /* If so, check system call restarting.. */
327 switch (syscall_get_error(current, regs)) {
328 case -ERESTART_RESTARTBLOCK:
329 case -ERESTARTNOHAND:
330 regs->ax = -EINTR;
331 break;
332
333 case -ERESTARTSYS:
334 if (!(ka->sa.sa_flags & SA_RESTART)) {
335 regs->ax = -EINTR;
336 break;
337 }
338 /* fallthrough */
339 case -ERESTARTNOINTR:
340 regs->ax = regs->orig_ax;
341 regs->ip -= 2;
342 break;
343 }
344 }
345
346 /*
347 * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
348 * flag so that register information in the sigcontext is correct.
349 */
350 if (unlikely(regs->flags & X86_EFLAGS_TF) &&
351 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
352 regs->flags &= ~X86_EFLAGS_TF;
353
354 ret = setup_rt_frame(sig, ka, info, oldset, regs);
355
356 if (ret)
357 return ret;
358
359#ifdef CONFIG_X86_64
360 /*
361 * This has nothing to do with segment registers,
362 * despite the name. This magic affects uaccess.h
363 * macros' behavior. Reset it to the normal setting.
364 */
365 set_fs(USER_DS);
366#endif
367
368 /*
369 * Clear the direction flag as per the ABI for function entry.
370 */
371 regs->flags &= ~X86_EFLAGS_DF;
372
373 /*
374 * Clear TF when entering the signal handler, but
375 * notify any tracer that was single-stepping it.
376 * The tracer may want to single-step inside the
377 * handler too.
378 */
379 regs->flags &= ~X86_EFLAGS_TF;
380
381 spin_lock_irq(&current->sighand->siglock);
382 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
383 if (!(ka->sa.sa_flags & SA_NODEFER))
384 sigaddset(&current->blocked, sig);
385 recalc_sigpending();
386 spin_unlock_irq(&current->sighand->siglock);
387
388 tracehook_signal_handler(sig, info, ka, regs,
389 test_thread_flag(TIF_SINGLESTEP));
390
391 return 0;
392}
393
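The restart handling in handle_signal() above only turns -ERESTARTSYS into -EINTR when the handler was installed without SA_RESTART, and that choice is directly observable from userspace. A hedged sketch, assuming POSIX signals and a blocking descriptor such as a terminal or pipe:

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void on_alarm(int sig)
{
	/* Empty handler: its only job is to interrupt the blocking read(). */
	(void)sig;
}

int main(void)
{
	struct sigaction sa;
	char buf[1];
	ssize_t n;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = on_alarm;
	sigemptyset(&sa.sa_mask);
	sa.sa_flags = 0;	/* set SA_RESTART here to see read() restarted instead */
	sigaction(SIGALRM, &sa, NULL);

	alarm(1);
	n = read(STDIN_FILENO, buf, sizeof(buf));	/* block until the alarm fires */
	if (n < 0 && errno == EINTR)
		printf("read() was interrupted (no SA_RESTART)\n");
	else
		printf("read() returned %zd\n", n);
	return 0;
}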
394#define NR_restart_syscall \
395 test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
396/*
397 * Note that 'init' is a special process: it doesn't get signals it doesn't
398 * want to handle. Thus you cannot kill init even with a SIGKILL even by
399 * mistake.
400 */
401static void do_signal(struct pt_regs *regs)
402{
403 struct k_sigaction ka;
404 siginfo_t info;
405 int signr;
406 sigset_t *oldset;
407
408 /*
409 * We want the common case to go fast, which is why we may in certain
410 * cases get here from kernel mode. Just return without doing anything
411 * if so.
412 * X86_32: vm86 regs switched out by assembly code before reaching
413 * here, so testing against kernel CS suffices.
414 */
415 if (!user_mode(regs))
416 return;
417
418 if (current_thread_info()->status & TS_RESTORE_SIGMASK)
419 oldset = &current->saved_sigmask;
420 else
421 oldset = &current->blocked;
422
423 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
424 if (signr > 0) {
425 /*
426 * Re-enable any watchpoints before delivering the
427 * signal to user space. The processor register will
428 * have been cleared if the watchpoint triggered
429 * inside the kernel.
430 */
431 if (current->thread.debugreg7)
432 set_debugreg(current->thread.debugreg7, 7);
433
434 /* Whee! Actually deliver the signal. */
435 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
436 /*
437 * A signal was successfully delivered; the saved
438 * sigmask will have been stored in the signal frame,
439 * and will be restored by sigreturn, so we can simply
440 * clear the TS_RESTORE_SIGMASK flag.
441 */
442 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
443 }
444 return;
445 }
446
447 /* Did we come from a system call? */
448 if (syscall_get_nr(current, regs) >= 0) {
449 /* Restart the system call - no handlers present */
450 switch (syscall_get_error(current, regs)) {
451 case -ERESTARTNOHAND:
452 case -ERESTARTSYS:
453 case -ERESTARTNOINTR:
454 regs->ax = regs->orig_ax;
455 regs->ip -= 2;
456 break;
457
458 case -ERESTART_RESTARTBLOCK:
459 regs->ax = NR_restart_syscall;
460 regs->ip -= 2;
461 break;
462 }
463 }
464
465 /*
466 * If there's no signal to deliver, we just put the saved sigmask
467 * back.
468 */
469 if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
470 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
471 sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
472 }
473}
474
475/*
476 * notification of userspace execution resumption
477 * - triggered by the TIF_WORK_MASK flags
478 */
479void
480do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
481{
482#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
483 /* notify userspace of pending MCEs */
484 if (thread_info_flags & _TIF_MCE_NOTIFY)
485 mce_notify_user();
486#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
487
488 /* deal with pending signal delivery */
489 if (thread_info_flags & _TIF_SIGPENDING)
490 do_signal(regs);
491
492 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
493 clear_thread_flag(TIF_NOTIFY_RESUME);
494 tracehook_notify_resume(regs);
495 }
496
497#ifdef CONFIG_X86_32
498 clear_thread_flag(TIF_IRET);
499#endif /* CONFIG_X86_32 */
500}
501
502void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
503{
504 struct task_struct *me = current;
505
506 if (show_unhandled_signals && printk_ratelimit()) {
507 printk(KERN_INFO
508 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
509 me->comm, me->pid, where, frame,
510 regs->ip, regs->sp, regs->orig_ax);
511 print_vma_addr(" in ", regs->ip);
512 printk(KERN_CONT "\n");
513 }
514
515 force_sig(SIGSEGV, me);
516}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 18f9b19f5f8f..eaaffae31cc0 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * Intel SMP support routines. 2 * Intel SMP support routines.
3 * 3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> 4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
5 * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> 5 * (c) 1998-99, 2000, 2009 Ingo Molnar <mingo@redhat.com>
6 * (c) 2002,2003 Andi Kleen, SuSE Labs. 6 * (c) 2002,2003 Andi Kleen, SuSE Labs.
7 * 7 *
8 * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com> 8 * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
@@ -26,8 +26,7 @@
26#include <asm/tlbflush.h> 26#include <asm/tlbflush.h>
27#include <asm/mmu_context.h> 27#include <asm/mmu_context.h>
28#include <asm/proto.h> 28#include <asm/proto.h>
29#include <mach_ipi.h> 29#include <asm/genapic.h>
30#include <mach_apic.h>
31/* 30/*
32 * Some notes on x86 processor bugs affecting SMP operation: 31 * Some notes on x86 processor bugs affecting SMP operation:
33 * 32 *
@@ -118,39 +117,33 @@ static void native_smp_send_reschedule(int cpu)
118 WARN_ON(1); 117 WARN_ON(1);
119 return; 118 return;
120 } 119 }
121 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); 120 apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
122} 121}
123 122
124void native_send_call_func_single_ipi(int cpu) 123void native_send_call_func_single_ipi(int cpu)
125{ 124{
126 send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR); 125 apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
127} 126}
128 127
129void native_send_call_func_ipi(cpumask_t mask) 128void native_send_call_func_ipi(const struct cpumask *mask)
130{ 129{
131 cpumask_t allbutself; 130 cpumask_var_t allbutself;
132 131
133 allbutself = cpu_online_map; 132 if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) {
134 cpu_clear(smp_processor_id(), allbutself); 133 apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
134 return;
135 }
136
137 cpumask_copy(allbutself, cpu_online_mask);
138 cpumask_clear_cpu(smp_processor_id(), allbutself);
135 139
136 if (cpus_equal(mask, allbutself) && 140 if (cpumask_equal(mask, allbutself) &&
137 cpus_equal(cpu_online_map, cpu_callout_map)) 141 cpumask_equal(cpu_online_mask, cpu_callout_mask))
138 send_IPI_allbutself(CALL_FUNCTION_VECTOR); 142 apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR);
139 else 143 else
140 send_IPI_mask(mask, CALL_FUNCTION_VECTOR); 144 apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
141}
142 145
143static void stop_this_cpu(void *dummy) 146 free_cpumask_var(allbutself);
144{
145 local_irq_disable();
146 /*
147 * Remove this CPU:
148 */
149 cpu_clear(smp_processor_id(), cpu_online_map);
150 disable_local_APIC();
151 if (hlt_works(smp_processor_id()))
152 for (;;) halt();
153 for (;;);
154} 147}
155 148
156/* 149/*
@@ -178,11 +171,7 @@ static void native_smp_send_stop(void)
178void smp_reschedule_interrupt(struct pt_regs *regs) 171void smp_reschedule_interrupt(struct pt_regs *regs)
179{ 172{
180 ack_APIC_irq(); 173 ack_APIC_irq();
181#ifdef CONFIG_X86_32 174 inc_irq_stat(irq_resched_count);
182 __get_cpu_var(irq_stat).irq_resched_count++;
183#else
184 add_pda(irq_resched_count, 1);
185#endif
186} 175}
187 176
188void smp_call_function_interrupt(struct pt_regs *regs) 177void smp_call_function_interrupt(struct pt_regs *regs)
@@ -190,11 +179,7 @@ void smp_call_function_interrupt(struct pt_regs *regs)
190 ack_APIC_irq(); 179 ack_APIC_irq();
191 irq_enter(); 180 irq_enter();
192 generic_smp_call_function_interrupt(); 181 generic_smp_call_function_interrupt();
193#ifdef CONFIG_X86_32 182 inc_irq_stat(irq_call_count);
194 __get_cpu_var(irq_stat).irq_call_count++;
195#else
196 add_pda(irq_call_count, 1);
197#endif
198 irq_exit(); 183 irq_exit();
199} 184}
200 185
@@ -203,11 +188,7 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
203 ack_APIC_irq(); 188 ack_APIC_irq();
204 irq_enter(); 189 irq_enter();
205 generic_smp_call_function_single_interrupt(); 190 generic_smp_call_function_single_interrupt();
206#ifdef CONFIG_X86_32 191 inc_irq_stat(irq_call_count);
207 __get_cpu_var(irq_stat).irq_call_count++;
208#else
209 add_pda(irq_call_count, 1);
210#endif
211 irq_exit(); 192 irq_exit();
212} 193}
213 194
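The cpumask_* and cpumask_var_t conversions in this file are kernel-internal, but the same bitmap-of-CPUs idea is exposed to userspace as cpu_set_t. Shown below purely as an analogue, using the GNU affinity API rather than anything introduced by this patch:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t mask;
	int cpu;

	/* Build a mask containing only CPU 0, then ask to run there. */
	CPU_ZERO(&mask);
	CPU_SET(0, &mask);
	if (sched_setaffinity(0 /* this task */, sizeof(mask), &mask) < 0) {
		perror("sched_setaffinity");
		return 1;
	}

	/* Read the mask back and print the CPUs it contains. */
	if (sched_getaffinity(0, sizeof(mask), &mask) < 0) {
		perror("sched_getaffinity");
		return 1;
	}
	for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
		if (CPU_ISSET(cpu, &mask))
			printf("allowed on CPU %d\n", cpu);
	return 0;
}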
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7b1093397319..af57f88186e7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * x86 SMP booting functions 2 * x86 SMP booting functions
3 * 3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> 4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> 5 * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
6 * Copyright 2001 Andi Kleen, SuSE Labs. 6 * Copyright 2001 Andi Kleen, SuSE Labs.
7 * 7 *
8 * Much of the core SMP work is based on previous work by Thomas Radke, to 8 * Much of the core SMP work is based on previous work by Thomas Radke, to
@@ -53,7 +53,6 @@
53#include <asm/nmi.h> 53#include <asm/nmi.h>
54#include <asm/irq.h> 54#include <asm/irq.h>
55#include <asm/idle.h> 55#include <asm/idle.h>
56#include <asm/smp.h>
57#include <asm/trampoline.h> 56#include <asm/trampoline.h>
58#include <asm/cpu.h> 57#include <asm/cpu.h>
59#include <asm/numa.h> 58#include <asm/numa.h>
@@ -62,11 +61,12 @@
62#include <asm/mtrr.h> 61#include <asm/mtrr.h>
63#include <asm/vmi.h> 62#include <asm/vmi.h>
64#include <asm/genapic.h> 63#include <asm/genapic.h>
64#include <asm/setup.h>
65#include <asm/uv/uv.h>
65#include <linux/mc146818rtc.h> 66#include <linux/mc146818rtc.h>
66 67
67#include <mach_apic.h> 68#include <asm/genapic.h>
68#include <mach_wakecpu.h> 69#include <asm/smpboot_hooks.h>
69#include <smpboot_hooks.h>
70 70
71#ifdef CONFIG_X86_32 71#ifdef CONFIG_X86_32
72u8 apicid_2_node[MAX_APICID]; 72u8 apicid_2_node[MAX_APICID];
@@ -101,15 +101,6 @@ EXPORT_SYMBOL(smp_num_siblings);
101/* Last level cache ID of each logical CPU */ 101/* Last level cache ID of each logical CPU */
102DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; 102DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
103 103
104/* bitmap of online cpus */
105cpumask_t cpu_online_map __read_mostly;
106EXPORT_SYMBOL(cpu_online_map);
107
108cpumask_t cpu_callin_map;
109cpumask_t cpu_callout_map;
110cpumask_t cpu_possible_map;
111EXPORT_SYMBOL(cpu_possible_map);
112
113/* representing HT siblings of each logical CPU */ 104/* representing HT siblings of each logical CPU */
114DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); 105DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
115EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); 106EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
@@ -125,9 +116,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
125static atomic_t init_deasserted; 116static atomic_t init_deasserted;
126 117
127 118
128/* representing cpus for which sibling maps can be computed */
129static cpumask_t cpu_sibling_setup_map;
130
131/* Set if we find a B stepping CPU */ 119/* Set if we find a B stepping CPU */
132static int __cpuinitdata smp_b_stepping; 120static int __cpuinitdata smp_b_stepping;
133 121
@@ -145,7 +133,7 @@ EXPORT_SYMBOL(cpu_to_node_map);
145static void map_cpu_to_node(int cpu, int node) 133static void map_cpu_to_node(int cpu, int node)
146{ 134{
147 printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node); 135 printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
148 cpu_set(cpu, node_to_cpumask_map[node]); 136 cpumask_set_cpu(cpu, &node_to_cpumask_map[node]);
149 cpu_to_node_map[cpu] = node; 137 cpu_to_node_map[cpu] = node;
150} 138}
151 139
@@ -156,7 +144,7 @@ static void unmap_cpu_to_node(int cpu)
156 144
157 printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu); 145 printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
158 for (node = 0; node < MAX_NUMNODES; node++) 146 for (node = 0; node < MAX_NUMNODES; node++)
159 cpu_clear(cpu, node_to_cpumask_map[node]); 147 cpumask_clear_cpu(cpu, &node_to_cpumask_map[node]);
160 cpu_to_node_map[cpu] = 0; 148 cpu_to_node_map[cpu] = 0;
161} 149}
162#else /* !(CONFIG_NUMA && CONFIG_X86_32) */ 150#else /* !(CONFIG_NUMA && CONFIG_X86_32) */
@@ -174,7 +162,7 @@ static void map_cpu_to_logical_apicid(void)
174{ 162{
175 int cpu = smp_processor_id(); 163 int cpu = smp_processor_id();
176 int apicid = logical_smp_processor_id(); 164 int apicid = logical_smp_processor_id();
177 int node = apicid_to_node(apicid); 165 int node = apic->apicid_to_node(apicid);
178 166
179 if (!node_online(node)) 167 if (!node_online(node))
180 node = first_online_node; 168 node = first_online_node;
@@ -207,14 +195,15 @@ static void __cpuinit smp_callin(void)
207 * our local APIC. We have to wait for the IPI or we'll 195 * our local APIC. We have to wait for the IPI or we'll
208 * lock up on an APIC access. 196 * lock up on an APIC access.
209 */ 197 */
210 wait_for_init_deassert(&init_deasserted); 198 if (apic->wait_for_init_deassert)
199 apic->wait_for_init_deassert(&init_deasserted);
211 200
212 /* 201 /*
213 * (This works even if the APIC is not enabled.) 202 * (This works even if the APIC is not enabled.)
214 */ 203 */
215 phys_id = read_apic_id(); 204 phys_id = read_apic_id();
216 cpuid = smp_processor_id(); 205 cpuid = smp_processor_id();
217 if (cpu_isset(cpuid, cpu_callin_map)) { 206 if (cpumask_test_cpu(cpuid, cpu_callin_mask)) {
218 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, 207 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
219 phys_id, cpuid); 208 phys_id, cpuid);
220 } 209 }
@@ -236,7 +225,7 @@ static void __cpuinit smp_callin(void)
236 /* 225 /*
237 * Has the boot CPU finished it's STARTUP sequence? 226 * Has the boot CPU finished it's STARTUP sequence?
238 */ 227 */
239 if (cpu_isset(cpuid, cpu_callout_map)) 228 if (cpumask_test_cpu(cpuid, cpu_callout_mask))
240 break; 229 break;
241 cpu_relax(); 230 cpu_relax();
242 } 231 }
@@ -254,7 +243,8 @@ static void __cpuinit smp_callin(void)
254 */ 243 */
255 244
256 pr_debug("CALLIN, before setup_local_APIC().\n"); 245 pr_debug("CALLIN, before setup_local_APIC().\n");
257 smp_callin_clear_local_apic(); 246 if (apic->smp_callin_clear_local_apic)
247 apic->smp_callin_clear_local_apic();
258 setup_local_APIC(); 248 setup_local_APIC();
259 end_local_APIC_setup(); 249 end_local_APIC_setup();
260 map_cpu_to_logical_apicid(); 250 map_cpu_to_logical_apicid();
@@ -279,7 +269,7 @@ static void __cpuinit smp_callin(void)
279 /* 269 /*
280 * Allow the master to continue. 270 * Allow the master to continue.
281 */ 271 */
282 cpu_set(cpuid, cpu_callin_map); 272 cpumask_set_cpu(cpuid, cpu_callin_mask);
283} 273}
284 274
285static int __cpuinitdata unsafe_smp; 275static int __cpuinitdata unsafe_smp;
@@ -287,16 +277,14 @@ static int __cpuinitdata unsafe_smp;
287/* 277/*
288 * Activate a secondary processor. 278 * Activate a secondary processor.
289 */ 279 */
290static void __cpuinit start_secondary(void *unused) 280notrace static void __cpuinit start_secondary(void *unused)
291{ 281{
292 /* 282 /*
293 * Don't put *anything* before cpu_init(), SMP booting is too 283 * Don't put *anything* before cpu_init(), SMP booting is too
294 * fragile that we want to limit the things done here to the 284 * fragile that we want to limit the things done here to the
295 * most necessary things. 285 * most necessary things.
296 */ 286 */
297#ifdef CONFIG_VMI
298 vmi_bringup(); 287 vmi_bringup();
299#endif
300 cpu_init(); 288 cpu_init();
301 preempt_disable(); 289 preempt_disable();
302 smp_callin(); 290 smp_callin();
@@ -339,7 +327,7 @@ static void __cpuinit start_secondary(void *unused)
339 ipi_call_lock(); 327 ipi_call_lock();
340 lock_vector_lock(); 328 lock_vector_lock();
341 __setup_vector_irq(smp_processor_id()); 329 __setup_vector_irq(smp_processor_id());
342 cpu_set(smp_processor_id(), cpu_online_map); 330 set_cpu_online(smp_processor_id(), true);
343 unlock_vector_lock(); 331 unlock_vector_lock();
344 ipi_call_unlock(); 332 ipi_call_unlock();
345 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 333 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
@@ -445,50 +433,52 @@ void __cpuinit set_cpu_sibling_map(int cpu)
445 int i; 433 int i;
446 struct cpuinfo_x86 *c = &cpu_data(cpu); 434 struct cpuinfo_x86 *c = &cpu_data(cpu);
447 435
448 cpu_set(cpu, cpu_sibling_setup_map); 436 cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
449 437
450 if (smp_num_siblings > 1) { 438 if (smp_num_siblings > 1) {
451 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) { 439 for_each_cpu(i, cpu_sibling_setup_mask) {
452 if (c->phys_proc_id == cpu_data(i).phys_proc_id && 440 struct cpuinfo_x86 *o = &cpu_data(i);
453 c->cpu_core_id == cpu_data(i).cpu_core_id) { 441
454 cpu_set(i, per_cpu(cpu_sibling_map, cpu)); 442 if (c->phys_proc_id == o->phys_proc_id &&
455 cpu_set(cpu, per_cpu(cpu_sibling_map, i)); 443 c->cpu_core_id == o->cpu_core_id) {
456 cpu_set(i, per_cpu(cpu_core_map, cpu)); 444 cpumask_set_cpu(i, cpu_sibling_mask(cpu));
457 cpu_set(cpu, per_cpu(cpu_core_map, i)); 445 cpumask_set_cpu(cpu, cpu_sibling_mask(i));
458 cpu_set(i, c->llc_shared_map); 446 cpumask_set_cpu(i, cpu_core_mask(cpu));
459 cpu_set(cpu, cpu_data(i).llc_shared_map); 447 cpumask_set_cpu(cpu, cpu_core_mask(i));
448 cpumask_set_cpu(i, &c->llc_shared_map);
449 cpumask_set_cpu(cpu, &o->llc_shared_map);
460 } 450 }
461 } 451 }
462 } else { 452 } else {
463 cpu_set(cpu, per_cpu(cpu_sibling_map, cpu)); 453 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
464 } 454 }
465 455
466 cpu_set(cpu, c->llc_shared_map); 456 cpumask_set_cpu(cpu, &c->llc_shared_map);
467 457
468 if (current_cpu_data.x86_max_cores == 1) { 458 if (current_cpu_data.x86_max_cores == 1) {
469 per_cpu(cpu_core_map, cpu) = per_cpu(cpu_sibling_map, cpu); 459 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
470 c->booted_cores = 1; 460 c->booted_cores = 1;
471 return; 461 return;
472 } 462 }
473 463
474 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) { 464 for_each_cpu(i, cpu_sibling_setup_mask) {
475 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 465 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
476 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 466 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
477 cpu_set(i, c->llc_shared_map); 467 cpumask_set_cpu(i, &c->llc_shared_map);
478 cpu_set(cpu, cpu_data(i).llc_shared_map); 468 cpumask_set_cpu(cpu, &cpu_data(i).llc_shared_map);
479 } 469 }
480 if (c->phys_proc_id == cpu_data(i).phys_proc_id) { 470 if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
481 cpu_set(i, per_cpu(cpu_core_map, cpu)); 471 cpumask_set_cpu(i, cpu_core_mask(cpu));
482 cpu_set(cpu, per_cpu(cpu_core_map, i)); 472 cpumask_set_cpu(cpu, cpu_core_mask(i));
483 /* 473 /*
484 * Does this new cpu bringup a new core? 474 * Does this new cpu bringup a new core?
485 */ 475 */
486 if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) { 476 if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) {
487 /* 477 /*
488 * for each core in package, increment 478 * for each core in package, increment
489 * the booted_cores for this new cpu 479 * the booted_cores for this new cpu
490 */ 480 */
491 if (first_cpu(per_cpu(cpu_sibling_map, i)) == i) 481 if (cpumask_first(cpu_sibling_mask(i)) == i)
492 c->booted_cores++; 482 c->booted_cores++;
493 /* 483 /*
494 * increment the core count for all 484 * increment the core count for all
@@ -503,7 +493,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
503} 493}
504 494
505/* maps the cpu to the sched domain representing multi-core */ 495/* maps the cpu to the sched domain representing multi-core */
506cpumask_t cpu_coregroup_map(int cpu) 496const struct cpumask *cpu_coregroup_mask(int cpu)
507{ 497{
508 struct cpuinfo_x86 *c = &cpu_data(cpu); 498 struct cpuinfo_x86 *c = &cpu_data(cpu);
509 /* 499 /*
@@ -511,9 +501,14 @@ cpumask_t cpu_coregroup_map(int cpu)
511 * And for power savings, we return cpu_core_map 501 * And for power savings, we return cpu_core_map
512 */ 502 */
513 if (sched_mc_power_savings || sched_smt_power_savings) 503 if (sched_mc_power_savings || sched_smt_power_savings)
514 return per_cpu(cpu_core_map, cpu); 504 return cpu_core_mask(cpu);
515 else 505 else
516 return c->llc_shared_map; 506 return &c->llc_shared_map;
507}
508
509cpumask_t cpu_coregroup_map(int cpu)
510{
511 return *cpu_coregroup_mask(cpu);
517} 512}
518 513
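cpu_coregroup_map() above is kept only as a by-value wrapper around the new cpu_coregroup_mask(), which hands back a pointer and avoids copying what can be a large bitmap when NR_CPUS is big. A generic C sketch of that trade-off, with hypothetical names:

#include <stdio.h>
#include <string.h>

/* Stand-in for a large per-CPU bitmap such as cpumask_t with a big NR_CPUS. */
struct big_mask {
	unsigned long bits[64];	/* 4096 bits */
};

static struct big_mask per_cpu_mask[4];

/* By-value interface: every call copies sizeof(struct big_mask) bytes. */
static struct big_mask get_mask_by_value(int cpu)
{
	return per_cpu_mask[cpu];
}

/* Pointer interface: no copy, callers promise not to modify the mask. */
static const struct big_mask *get_mask_ptr(int cpu)
{
	return &per_cpu_mask[cpu];
}

int main(void)
{
	struct big_mask copy;
	const struct big_mask *p;

	memset(per_cpu_mask, 0xff, sizeof(per_cpu_mask));

	copy = get_mask_by_value(0);	/* copies the whole bitmap onto the stack */
	p = get_mask_ptr(0);		/* just a pointer */

	printf("copy size: %zu bytes, pointer size: %zu bytes\n",
	       sizeof(copy), sizeof(p));
	return 0;
}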
519static void impress_friends(void) 514static void impress_friends(void)
@@ -525,7 +520,7 @@ static void impress_friends(void)
525 */ 520 */
526 pr_debug("Before bogomips.\n"); 521 pr_debug("Before bogomips.\n");
527 for_each_possible_cpu(cpu) 522 for_each_possible_cpu(cpu)
528 if (cpu_isset(cpu, cpu_callout_map)) 523 if (cpumask_test_cpu(cpu, cpu_callout_mask))
529 bogosum += cpu_data(cpu).loops_per_jiffy; 524 bogosum += cpu_data(cpu).loops_per_jiffy;
530 printk(KERN_INFO 525 printk(KERN_INFO
531 "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", 526 "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
@@ -536,7 +531,7 @@ static void impress_friends(void)
536 pr_debug("Before bogocount - setting activated=1.\n"); 531 pr_debug("Before bogocount - setting activated=1.\n");
537} 532}
538 533
539static inline void __inquire_remote_apic(int apicid) 534void __inquire_remote_apic(int apicid)
540{ 535{
541 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; 536 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
542 char *names[] = { "ID", "VERSION", "SPIV" }; 537 char *names[] = { "ID", "VERSION", "SPIV" };
@@ -575,14 +570,13 @@ static inline void __inquire_remote_apic(int apicid)
575 } 570 }
576} 571}
577 572
578#ifdef WAKE_SECONDARY_VIA_NMI
579/* 573/*
580 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal 574 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
581 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this 575 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
582 * won't ... remember to clear down the APIC, etc later. 576 * won't ... remember to clear down the APIC, etc later.
583 */ 577 */
584static int __devinit 578int __devinit
585wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) 579wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
586{ 580{
587 unsigned long send_status, accept_status = 0; 581 unsigned long send_status, accept_status = 0;
588 int maxlvt; 582 int maxlvt;
@@ -590,7 +584,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
590 /* Target chip */ 584 /* Target chip */
591 /* Boot on the stack */ 585 /* Boot on the stack */
592 /* Kick the second */ 586 /* Kick the second */
593 apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid); 587 apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid);
594 588
595 pr_debug("Waiting for send to finish...\n"); 589 pr_debug("Waiting for send to finish...\n");
596 send_status = safe_apic_wait_icr_idle(); 590 send_status = safe_apic_wait_icr_idle();
@@ -599,7 +593,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
599 * Give the other CPU some time to accept the IPI. 593 * Give the other CPU some time to accept the IPI.
600 */ 594 */
601 udelay(200); 595 udelay(200);
602 if (APIC_INTEGRATED(apic_version[phys_apicid])) { 596 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
603 maxlvt = lapic_get_maxlvt(); 597 maxlvt = lapic_get_maxlvt();
604 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 598 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
605 apic_write(APIC_ESR, 0); 599 apic_write(APIC_ESR, 0);
@@ -614,11 +608,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
614 608
615 return (send_status | accept_status); 609 return (send_status | accept_status);
616} 610}
617#endif /* WAKE_SECONDARY_VIA_NMI */
618 611
619#ifdef WAKE_SECONDARY_VIA_INIT 612int __devinit
620static int __devinit 613wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
621wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
622{ 614{
623 unsigned long send_status, accept_status = 0; 615 unsigned long send_status, accept_status = 0;
624 int maxlvt, num_starts, j; 616 int maxlvt, num_starts, j;
@@ -737,7 +729,6 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
737 729
738 return (send_status | accept_status); 730 return (send_status | accept_status);
739} 731}
740#endif /* WAKE_SECONDARY_VIA_INIT */
741 732
742struct create_idle { 733struct create_idle {
743 struct work_struct work; 734 struct work_struct work;
@@ -755,57 +746,11 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
755 complete(&c_idle->done); 746 complete(&c_idle->done);
756} 747}
757 748
758#ifdef CONFIG_X86_64
759
760/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
761static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
762{
763 if (!after_bootmem)
764 free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
765}
766
767/*
768 * Allocate node local memory for the AP pda.
769 *
770 * Must be called after the _cpu_pda pointer table is initialized.
771 */
772int __cpuinit get_local_pda(int cpu)
773{
774 struct x8664_pda *oldpda, *newpda;
775 unsigned long size = sizeof(struct x8664_pda);
776 int node = cpu_to_node(cpu);
777
778 if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
779 return 0;
780
781 oldpda = cpu_pda(cpu);
782 newpda = kmalloc_node(size, GFP_ATOMIC, node);
783 if (!newpda) {
784 printk(KERN_ERR "Could not allocate node local PDA "
785 "for CPU %d on node %d\n", cpu, node);
786
787 if (oldpda)
788 return 0; /* have a usable pda */
789 else
790 return -1;
791 }
792
793 if (oldpda) {
794 memcpy(newpda, oldpda, size);
795 free_bootmem_pda(oldpda);
796 }
797
798 newpda->in_bootmem = 0;
799 cpu_pda(cpu) = newpda;
800 return 0;
801}
802#endif /* CONFIG_X86_64 */
803
804static int __cpuinit do_boot_cpu(int apicid, int cpu) 749static int __cpuinit do_boot_cpu(int apicid, int cpu)
805/* 750/*
806 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 751 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
807 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 752 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
808 * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu. 753 * Returns zero if CPU booted OK, else error code from ->wakeup_cpu.
809 */ 754 */
810{ 755{
811 unsigned long boot_error = 0; 756 unsigned long boot_error = 0;
@@ -818,16 +763,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
818 }; 763 };
819 INIT_WORK(&c_idle.work, do_fork_idle); 764 INIT_WORK(&c_idle.work, do_fork_idle);
820 765
821#ifdef CONFIG_X86_64
822 /* Allocate node local memory for AP pdas */
823 if (cpu > 0) {
824 boot_error = get_local_pda(cpu);
825 if (boot_error)
826 goto restore_state;
827 /* if can't get pda memory, can't start cpu */
828 }
829#endif
830
831 alternatives_smp_switch(1); 766 alternatives_smp_switch(1);
832 767
833 c_idle.idle = get_idle_for_cpu(cpu); 768 c_idle.idle = get_idle_for_cpu(cpu);
@@ -857,14 +792,16 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
857 792
858 set_idle_for_cpu(cpu, c_idle.idle); 793 set_idle_for_cpu(cpu, c_idle.idle);
859do_rest: 794do_rest:
860#ifdef CONFIG_X86_32
861 per_cpu(current_task, cpu) = c_idle.idle; 795 per_cpu(current_task, cpu) = c_idle.idle;
862 init_gdt(cpu); 796#ifdef CONFIG_X86_32
863 /* Stack for startup_32 can be just as for start_secondary onwards */ 797 /* Stack for startup_32 can be just as for start_secondary onwards */
864 irq_ctx_init(cpu); 798 irq_ctx_init(cpu);
865#else 799#else
866 cpu_pda(cpu)->pcurrent = c_idle.idle;
867 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 800 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
801 initial_gs = per_cpu_offset(cpu);
802 per_cpu(kernel_stack, cpu) =
803 (unsigned long)task_stack_page(c_idle.idle) -
804 KERNEL_STACK_OFFSET + THREAD_SIZE;
868#endif 805#endif
869 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 806 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
870 initial_code = (unsigned long)start_secondary; 807 initial_code = (unsigned long)start_secondary;
@@ -888,7 +825,8 @@ do_rest:
888 825
889 pr_debug("Setting warm reset code and vector.\n"); 826 pr_debug("Setting warm reset code and vector.\n");
890 827
891 store_NMI_vector(&nmi_high, &nmi_low); 828 if (apic->store_NMI_vector)
829 apic->store_NMI_vector(&nmi_high, &nmi_low);
892 830
893 smpboot_setup_warm_reset_vector(start_ip); 831 smpboot_setup_warm_reset_vector(start_ip);
894 /* 832 /*
@@ -903,26 +841,26 @@ do_rest:
903 /* 841 /*
904 * Starting actual IPI sequence... 842 * Starting actual IPI sequence...
905 */ 843 */
906 boot_error = wakeup_secondary_cpu(apicid, start_ip); 844 boot_error = apic->wakeup_cpu(apicid, start_ip);
907 845
908 if (!boot_error) { 846 if (!boot_error) {
909 /* 847 /*
910 * allow APs to start initializing. 848 * allow APs to start initializing.
911 */ 849 */
912 pr_debug("Before Callout %d.\n", cpu); 850 pr_debug("Before Callout %d.\n", cpu);
913 cpu_set(cpu, cpu_callout_map); 851 cpumask_set_cpu(cpu, cpu_callout_mask);
914 pr_debug("After Callout %d.\n", cpu); 852 pr_debug("After Callout %d.\n", cpu);
915 853
916 /* 854 /*
917 * Wait 5s total for a response 855 * Wait 5s total for a response
918 */ 856 */
919 for (timeout = 0; timeout < 50000; timeout++) { 857 for (timeout = 0; timeout < 50000; timeout++) {
920 if (cpu_isset(cpu, cpu_callin_map)) 858 if (cpumask_test_cpu(cpu, cpu_callin_mask))
921 break; /* It has booted */ 859 break; /* It has booted */
922 udelay(100); 860 udelay(100);
923 } 861 }
924 862
925 if (cpu_isset(cpu, cpu_callin_map)) { 863 if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
926 /* number CPUs logically, starting from 1 (BSP is 0) */ 864 /* number CPUs logically, starting from 1 (BSP is 0) */
927 pr_debug("OK.\n"); 865 pr_debug("OK.\n");
928 printk(KERN_INFO "CPU%d: ", cpu); 866 printk(KERN_INFO "CPU%d: ", cpu);
@@ -937,19 +875,22 @@ do_rest:
937 else 875 else
938 /* trampoline code not run */ 876 /* trampoline code not run */
939 printk(KERN_ERR "Not responding.\n"); 877 printk(KERN_ERR "Not responding.\n");
940 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) 878 if (apic->inquire_remote_apic)
941 inquire_remote_apic(apicid); 879 apic->inquire_remote_apic(apicid);
942 } 880 }
943 } 881 }
944#ifdef CONFIG_X86_64 882
945restore_state:
946#endif
947 if (boot_error) { 883 if (boot_error) {
948 /* Try to put things back the way they were before ... */ 884 /* Try to put things back the way they were before ... */
949 numa_remove_cpu(cpu); /* was set by numa_add_cpu */ 885 numa_remove_cpu(cpu); /* was set by numa_add_cpu */
950 cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */ 886
951 cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ 887 /* was set by do_boot_cpu() */
952 cpu_clear(cpu, cpu_present_map); 888 cpumask_clear_cpu(cpu, cpu_callout_mask);
889
890 /* was set by cpu_init() */
891 cpumask_clear_cpu(cpu, cpu_initialized_mask);
892
893 set_cpu_present(cpu, false);
953 per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; 894 per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
954 } 895 }
955 896
@@ -966,7 +907,7 @@ restore_state:
966 907
967int __cpuinit native_cpu_up(unsigned int cpu) 908int __cpuinit native_cpu_up(unsigned int cpu)
968{ 909{
969 int apicid = cpu_present_to_apicid(cpu); 910 int apicid = apic->cpu_present_to_apicid(cpu);
970 unsigned long flags; 911 unsigned long flags;
971 int err; 912 int err;
972 913
@@ -983,7 +924,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
983 /* 924 /*
984 * Already booted CPU? 925 * Already booted CPU?
985 */ 926 */
986 if (cpu_isset(cpu, cpu_callin_map)) { 927 if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
987 pr_debug("do_boot_cpu %d Already started\n", cpu); 928 pr_debug("do_boot_cpu %d Already started\n", cpu);
988 return -ENOSYS; 929 return -ENOSYS;
989 } 930 }
@@ -1038,8 +979,9 @@ int __cpuinit native_cpu_up(unsigned int cpu)
1038 */ 979 */
1039static __init void disable_smp(void) 980static __init void disable_smp(void)
1040{ 981{
1041 cpu_present_map = cpumask_of_cpu(0); 982 /* use the read/write pointers to the present and possible maps */
1042 cpu_possible_map = cpumask_of_cpu(0); 983 cpumask_copy(&cpu_present_map, cpumask_of(0));
984 cpumask_copy(&cpu_possible_map, cpumask_of(0));
1043 smpboot_clear_io_apic_irqs(); 985 smpboot_clear_io_apic_irqs();
1044 986
1045 if (smp_found_config) 987 if (smp_found_config)
@@ -1047,8 +989,8 @@ static __init void disable_smp(void)
1047 else 989 else
1048 physid_set_mask_of_physid(0, &phys_cpu_present_map); 990 physid_set_mask_of_physid(0, &phys_cpu_present_map);
1049 map_cpu_to_logical_apicid(); 991 map_cpu_to_logical_apicid();
1050 cpu_set(0, per_cpu(cpu_sibling_map, 0)); 992 cpumask_set_cpu(0, cpu_sibling_mask(0));
1051 cpu_set(0, per_cpu(cpu_core_map, 0)); 993 cpumask_set_cpu(0, cpu_core_mask(0));
1052} 994}
1053 995
1054/* 996/*
@@ -1058,26 +1000,26 @@ static int __init smp_sanity_check(unsigned max_cpus)
1058{ 1000{
1059 preempt_disable(); 1001 preempt_disable();
1060 1002
1061#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32) 1003#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32)
1062 if (def_to_bigsmp && nr_cpu_ids > 8) { 1004 if (def_to_bigsmp && nr_cpu_ids > 8) {
1063 unsigned int cpu; 1005 unsigned int cpu;
1064 unsigned nr; 1006 unsigned nr;
1065 1007
1066 printk(KERN_WARNING 1008 printk(KERN_WARNING
1067 "More than 8 CPUs detected - skipping them.\n" 1009 "More than 8 CPUs detected - skipping them.\n"
1068 "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n"); 1010 "Use CONFIG_X86_BIGSMP.\n");
1069 1011
1070 nr = 0; 1012 nr = 0;
1071 for_each_present_cpu(cpu) { 1013 for_each_present_cpu(cpu) {
1072 if (nr >= 8) 1014 if (nr >= 8)
1073 cpu_clear(cpu, cpu_present_map); 1015 set_cpu_present(cpu, false);
1074 nr++; 1016 nr++;
1075 } 1017 }
1076 1018
1077 nr = 0; 1019 nr = 0;
1078 for_each_possible_cpu(cpu) { 1020 for_each_possible_cpu(cpu) {
1079 if (nr >= 8) 1021 if (nr >= 8)
1080 cpu_clear(cpu, cpu_possible_map); 1022 set_cpu_possible(cpu, false);
1081 nr++; 1023 nr++;
1082 } 1024 }
1083 1025
@@ -1086,8 +1028,10 @@ static int __init smp_sanity_check(unsigned max_cpus)
1086#endif 1028#endif
1087 1029
1088 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { 1030 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1089 printk(KERN_WARNING "weird, boot CPU (#%d) not listed" 1031 printk(KERN_WARNING
1090 "by the BIOS.\n", hard_smp_processor_id()); 1032 "weird, boot CPU (#%d) not listed by the BIOS.\n",
1033 hard_smp_processor_id());
1034
1091 physid_set(hard_smp_processor_id(), phys_cpu_present_map); 1035 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1092 } 1036 }
1093 1037
@@ -1109,7 +1053,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1109 * Should not be necessary because the MP table should list the boot 1053 * Should not be necessary because the MP table should list the boot
1110 * CPU too, but we do it for the sake of robustness anyway. 1054 * CPU too, but we do it for the sake of robustness anyway.
1111 */ 1055 */
1112 if (!check_phys_apicid_present(boot_cpu_physical_apicid)) { 1056 if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
1113 printk(KERN_NOTICE 1057 printk(KERN_NOTICE
1114 "weird, boot CPU (#%d) not listed by the BIOS.\n", 1058 "weird, boot CPU (#%d) not listed by the BIOS.\n",
1115 boot_cpu_physical_apicid); 1059 boot_cpu_physical_apicid);
@@ -1127,6 +1071,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1127 printk(KERN_ERR "... forcing use of dummy APIC emulation." 1071 printk(KERN_ERR "... forcing use of dummy APIC emulation."
1128 "(tell your hw vendor)\n"); 1072 "(tell your hw vendor)\n");
1129 smpboot_clear_io_apic(); 1073 smpboot_clear_io_apic();
1074 arch_disable_smp_support();
1130 return -1; 1075 return -1;
1131 } 1076 }
1132 1077
@@ -1158,7 +1103,7 @@ static void __init smp_cpu_index_default(void)
1158 for_each_possible_cpu(i) { 1103 for_each_possible_cpu(i) {
1159 c = &cpu_data(i); 1104 c = &cpu_data(i);
1160 /* mark all to hotplug */ 1105 /* mark all to hotplug */
1161 c->cpu_index = NR_CPUS; 1106 c->cpu_index = nr_cpu_ids;
1162 } 1107 }
1163} 1108}
1164 1109
@@ -1171,7 +1116,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1171 preempt_disable(); 1116 preempt_disable();
1172 smp_cpu_index_default(); 1117 smp_cpu_index_default();
1173 current_cpu_data = boot_cpu_data; 1118 current_cpu_data = boot_cpu_data;
1174 cpu_callin_map = cpumask_of_cpu(0); 1119 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1175 mb(); 1120 mb();
1176 /* 1121 /*
1177 * Setup boot CPU information 1122 * Setup boot CPU information
@@ -1185,7 +1130,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1185 1130
1186#ifdef CONFIG_X86_64 1131#ifdef CONFIG_X86_64
1187 enable_IR_x2apic(); 1132 enable_IR_x2apic();
1188 setup_apic_routing(); 1133 default_setup_apic_routing();
1189#endif 1134#endif
1190 1135
1191 if (smp_sanity_check(max_cpus) < 0) { 1136 if (smp_sanity_check(max_cpus) < 0) {
@@ -1220,7 +1165,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1220 1165
1221 map_cpu_to_logical_apicid(); 1166 map_cpu_to_logical_apicid();
1222 1167
1223 setup_portio_remap(); 1168 if (apic->setup_portio_remap)
1169 apic->setup_portio_remap();
1224 1170
1225 smpboot_setup_io_apic(); 1171 smpboot_setup_io_apic();
1226 /* 1172 /*
@@ -1242,12 +1188,9 @@ out:
1242void __init native_smp_prepare_boot_cpu(void) 1188void __init native_smp_prepare_boot_cpu(void)
1243{ 1189{
1244 int me = smp_processor_id(); 1190 int me = smp_processor_id();
1245#ifdef CONFIG_X86_32 1191 switch_to_new_gdt(me);
1246 init_gdt(me); 1192 /* already set me in cpu_online_mask in boot_cpu_init() */
1247#endif 1193 cpumask_set_cpu(me, cpu_callout_mask);
1248 switch_to_new_gdt();
1249 /* already set me in cpu_online_map in boot_cpu_init() */
1250 cpu_set(me, cpu_callout_map);
1251 per_cpu(cpu_state, me) = CPU_ONLINE; 1194 per_cpu(cpu_state, me) = CPU_ONLINE;
1252} 1195}
1253 1196
@@ -1263,6 +1206,15 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1263 check_nmi_watchdog(); 1206 check_nmi_watchdog();
1264} 1207}
1265 1208
1209static int __initdata setup_possible_cpus = -1;
1210static int __init _setup_possible_cpus(char *str)
1211{
1212 get_option(&str, &setup_possible_cpus);
1213 return 0;
1214}
1215early_param("possible_cpus", _setup_possible_cpus);
1216
1217
1266/* 1218/*
1267 * cpu_possible_map should be static, it cannot change as cpu's 1219 * cpu_possible_map should be static, it cannot change as cpu's
1268 * are onlined, or offlined. The reason is per-cpu data-structures 1220 * are onlined, or offlined. The reason is per-cpu data-structures
@@ -1275,7 +1227,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1275 * 1227 *
1276 * Three ways to find out the number of additional hotplug CPUs: 1228 * Three ways to find out the number of additional hotplug CPUs:
1277 * - If the BIOS specified disabled CPUs in ACPI/mptables use that. 1229 * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
1278 * - The user can overwrite it with additional_cpus=NUM 1230 * - The user can overwrite it with possible_cpus=NUM
1279 * - Otherwise don't reserve additional CPUs. 1231 * - Otherwise don't reserve additional CPUs.
1280 * We do this because additional CPUs waste a lot of memory. 1232 * We do this because additional CPUs waste a lot of memory.
1281 * -AK 1233 * -AK
@@ -1288,15 +1240,25 @@ __init void prefill_possible_map(void)
1288 if (!num_processors) 1240 if (!num_processors)
1289 num_processors = 1; 1241 num_processors = 1;
1290 1242
1291 possible = num_processors + disabled_cpus; 1243 if (setup_possible_cpus == -1)
1292 if (possible > NR_CPUS) 1244 possible = num_processors + disabled_cpus;
1293 possible = NR_CPUS; 1245 else
1246 possible = setup_possible_cpus;
1247
1248 total_cpus = max_t(int, possible, num_processors + disabled_cpus);
1249
1250 if (possible > CONFIG_NR_CPUS) {
1251 printk(KERN_WARNING
1252 "%d Processors exceeds NR_CPUS limit of %d\n",
1253 possible, CONFIG_NR_CPUS);
1254 possible = CONFIG_NR_CPUS;
1255 }
1294 1256
1295 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", 1257 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
1296 possible, max_t(int, possible - num_processors, 0)); 1258 possible, max_t(int, possible - num_processors, 0));
1297 1259
1298 for (i = 0; i < possible; i++) 1260 for (i = 0; i < possible; i++)
1299 cpu_set(i, cpu_possible_map); 1261 set_cpu_possible(i, true);
1300 1262
1301 nr_cpu_ids = possible; 1263 nr_cpu_ids = possible;
1302} 1264}
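prefill_possible_map() now honours a possible_cpus=NUM boot parameter, tracks total_cpus separately, and clamps the result against CONFIG_NR_CPUS. A small stand-alone sketch of just that sizing arithmetic (the input numbers and the NR_CPUS value below are made up for illustration):

#include <stdio.h>

#define NR_CPUS 8	/* stand-in for CONFIG_NR_CPUS */

static int prefill_possible(int num_processors, int disabled_cpus,
			    int setup_possible_cpus, int *total_cpus)
{
	int possible;

	if (setup_possible_cpus == -1)		/* no possible_cpus= on the command line */
		possible = num_processors + disabled_cpus;
	else
		possible = setup_possible_cpus;

	/* total_cpus is taken before clamping, as in the patch. */
	*total_cpus = possible > num_processors + disabled_cpus ?
		      possible : num_processors + disabled_cpus;

	if (possible > NR_CPUS) {
		printf("%d processors exceeds NR_CPUS limit of %d\n",
		       possible, NR_CPUS);
		possible = NR_CPUS;
	}
	return possible;
}

int main(void)
{
	int total;
	int possible = prefill_possible(4, 2, 16, &total);	/* e.g. possible_cpus=16 */

	printf("allowing %d CPUs, %d hotplug CPUs (total_cpus=%d)\n",
	       possible, possible - 4 > 0 ? possible - 4 : 0, total);
	return 0;
}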
@@ -1308,31 +1270,31 @@ static void remove_siblinginfo(int cpu)
1308 int sibling; 1270 int sibling;
1309 struct cpuinfo_x86 *c = &cpu_data(cpu); 1271 struct cpuinfo_x86 *c = &cpu_data(cpu);
1310 1272
1311 for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { 1273 for_each_cpu(sibling, cpu_core_mask(cpu)) {
1312 cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); 1274 cpumask_clear_cpu(cpu, cpu_core_mask(sibling));
1313 /*/ 1275 /*/
1314 * last thread sibling in this cpu core going down 1276 * last thread sibling in this cpu core going down
1315 */ 1277 */
1316 if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) 1278 if (cpumask_weight(cpu_sibling_mask(cpu)) == 1)
1317 cpu_data(sibling).booted_cores--; 1279 cpu_data(sibling).booted_cores--;
1318 } 1280 }
1319 1281
1320 for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) 1282 for_each_cpu(sibling, cpu_sibling_mask(cpu))
1321 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); 1283 cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling));
1322 cpus_clear(per_cpu(cpu_sibling_map, cpu)); 1284 cpumask_clear(cpu_sibling_mask(cpu));
1323 cpus_clear(per_cpu(cpu_core_map, cpu)); 1285 cpumask_clear(cpu_core_mask(cpu));
1324 c->phys_proc_id = 0; 1286 c->phys_proc_id = 0;
1325 c->cpu_core_id = 0; 1287 c->cpu_core_id = 0;
1326 cpu_clear(cpu, cpu_sibling_setup_map); 1288 cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
1327} 1289}
1328 1290
1329static void __ref remove_cpu_from_maps(int cpu) 1291static void __ref remove_cpu_from_maps(int cpu)
1330{ 1292{
1331 cpu_clear(cpu, cpu_online_map); 1293 set_cpu_online(cpu, false);
1332 cpu_clear(cpu, cpu_callout_map); 1294 cpumask_clear_cpu(cpu, cpu_callout_mask);
1333 cpu_clear(cpu, cpu_callin_map); 1295 cpumask_clear_cpu(cpu, cpu_callin_mask);
1334 /* was set by cpu_init() */ 1296 /* was set by cpu_init() */
1335 cpu_clear(cpu, cpu_initialized); 1297 cpumask_clear_cpu(cpu, cpu_initialized_mask);
1336 numa_remove_cpu(cpu); 1298 numa_remove_cpu(cpu);
1337} 1299}
1338 1300
@@ -1355,7 +1317,7 @@ void cpu_disable_common(void)
1355 lock_vector_lock(); 1317 lock_vector_lock();
1356 remove_cpu_from_maps(cpu); 1318 remove_cpu_from_maps(cpu);
1357 unlock_vector_lock(); 1319 unlock_vector_lock();
1358 fixup_irqs(cpu_online_map); 1320 fixup_irqs();
1359} 1321}
1360 1322
1361int native_cpu_disable(void) 1323int native_cpu_disable(void)
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
deleted file mode 100644
index 397e309839dd..000000000000
--- a/arch/x86/kernel/smpcommon.c
+++ /dev/null
@@ -1,30 +0,0 @@
1/*
2 * SMP stuff which is common to all sub-architectures.
3 */
4#include <linux/module.h>
5#include <asm/smp.h>
6
7#ifdef CONFIG_X86_32
8DEFINE_PER_CPU(unsigned long, this_cpu_off);
9EXPORT_PER_CPU_SYMBOL(this_cpu_off);
10
11/*
12 * Initialize the CPU's GDT. This is either the boot CPU doing itself
13 * (still using the master per-cpu area), or a CPU doing it for a
14 * secondary which will soon come up.
15 */
16__cpuinit void init_gdt(int cpu)
17{
18 struct desc_struct gdt;
19
20 pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
21 0x2 | DESCTYPE_S, 0x8);
22 gdt.s = 1;
23
24 write_gdt_entry(get_cpu_gdt_table(cpu),
25 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
26
27 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
28 per_cpu(cpu_number, cpu) = cpu;
29}
30#endif
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index a03e7f6d90c3..f7bddc2e37d1 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -1,11 +1,12 @@
1/* 1/*
2 * Stack trace management functions 2 * Stack trace management functions
3 * 3 *
4 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 4 * Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 */ 5 */
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/stacktrace.h> 7#include <linux/stacktrace.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/uaccess.h>
9#include <asm/stacktrace.h> 10#include <asm/stacktrace.h>
10 11
11static void save_stack_warning(void *data, char *msg) 12static void save_stack_warning(void *data, char *msg)
@@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
83 trace->entries[trace->nr_entries++] = ULONG_MAX; 84 trace->entries[trace->nr_entries++] = ULONG_MAX;
84} 85}
85EXPORT_SYMBOL_GPL(save_stack_trace_tsk); 86EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
87
88/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
89
90struct stack_frame {
91 const void __user *next_fp;
92 unsigned long ret_addr;
93};
94
95static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
96{
97 int ret;
98
99 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
100 return 0;
101
102 ret = 1;
103 pagefault_disable();
104 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
105 ret = 0;
106 pagefault_enable();
107
108 return ret;
109}
110
111static inline void __save_stack_trace_user(struct stack_trace *trace)
112{
113 const struct pt_regs *regs = task_pt_regs(current);
114 const void __user *fp = (const void __user *)regs->bp;
115
116 if (trace->nr_entries < trace->max_entries)
117 trace->entries[trace->nr_entries++] = regs->ip;
118
119 while (trace->nr_entries < trace->max_entries) {
120 struct stack_frame frame;
121
122 frame.next_fp = NULL;
123 frame.ret_addr = 0;
124 if (!copy_stack_frame(fp, &frame))
125 break;
126 if ((unsigned long)fp < regs->sp)
127 break;
128 if (frame.ret_addr) {
129 trace->entries[trace->nr_entries++] =
130 frame.ret_addr;
131 }
132 if (fp == frame.next_fp)
133 break;
134 fp = frame.next_fp;
135 }
136}
137
138void save_stack_trace_user(struct stack_trace *trace)
139{
140 /*
141 * Trace user stack if we are not a kernel thread
142 */
143 if (current->mm) {
144 __save_stack_trace_user(trace);
145 }
146 if (trace->nr_entries < trace->max_entries)
147 trace->entries[trace->nr_entries++] = ULONG_MAX;
148}
149
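The new user-mode walker above follows the saved frame-pointer chain that the compiler leaves on the stack: each frame begins with the caller's frame pointer followed by the return address, and the walk stops when the chain leaves the plausible stack range or stops making progress. A hedged user-space re-creation of the same walk using __builtin_frame_address(); it assumes x86-style stack layout and a build with frame pointers (e.g. gcc -O0 -fno-omit-frame-pointer), and all names are illustrative:

#include <stdio.h>
#include <stdint.h>

struct stack_frame {
	const struct stack_frame *next_fp;	/* saved caller frame pointer */
	unsigned long ret_addr;			/* saved return address */
};

static void __attribute__((noinline)) show_backtrace(void)
{
	const struct stack_frame *fp = __builtin_frame_address(0);
	int depth = 0;

	while (fp && depth < 16) {
		printf("#%d  fp=%p  ret=%#lx\n", depth, (const void *)fp, fp->ret_addr);
		/* Frames must move toward higher addresses; otherwise the chain is bogus. */
		if ((uintptr_t)fp->next_fp <= (uintptr_t)fp)
			break;
		fp = fp->next_fp;
		depth++;
	}
}

static void __attribute__((noinline)) level2(void) { show_backtrace(); }
static void __attribute__((noinline)) level1(void) { level2(); }

int main(void)
{
	level1();
	return 0;
}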
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index 7b987852e876..1e733eff9b33 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -30,8 +30,364 @@
30#include <linux/init.h> 30#include <linux/init.h>
31#include <asm/io.h> 31#include <asm/io.h>
32#include <asm/bios_ebda.h> 32#include <asm/bios_ebda.h>
33#include <asm/summit/mpparse.h>
34 33
34/*
35 * APIC driver for the IBM "Summit" chipset.
36 */
37#define APIC_DEFINITION 1
38#include <linux/threads.h>
39#include <linux/cpumask.h>
40#include <asm/mpspec.h>
41#include <asm/apic.h>
42#include <asm/smp.h>
43#include <asm/genapic.h>
44#include <asm/fixmap.h>
45#include <asm/apicdef.h>
46#include <asm/ipi.h>
47#include <linux/kernel.h>
48#include <linux/string.h>
49#include <linux/init.h>
50#include <linux/gfp.h>
51#include <linux/smp.h>
52
53static inline unsigned summit_get_apic_id(unsigned long x)
54{
55 return (x >> 24) & 0xFF;
56}
57
58static inline void summit_send_IPI_mask(const cpumask_t *mask, int vector)
59{
60 default_send_IPI_mask_sequence_logical(mask, vector);
61}
62
63static inline void summit_send_IPI_allbutself(int vector)
64{
65 cpumask_t mask = cpu_online_map;
66 cpu_clear(smp_processor_id(), mask);
67
68 if (!cpus_empty(mask))
69 summit_send_IPI_mask(&mask, vector);
70}
71
72static inline void summit_send_IPI_all(int vector)
73{
74 summit_send_IPI_mask(&cpu_online_map, vector);
75}
76
77#include <asm/tsc.h>
78
79extern int use_cyclone;
80
81#ifdef CONFIG_X86_SUMMIT_NUMA
82extern void setup_summit(void);
83#else
84#define setup_summit() {}
85#endif
86
87static inline int
88summit_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
89{
90 if (!strncmp(oem, "IBM ENSW", 8) &&
91 (!strncmp(productid, "VIGIL SMP", 9)
92 || !strncmp(productid, "EXA", 3)
93 || !strncmp(productid, "RUTHLESS SMP", 12))){
94 mark_tsc_unstable("Summit based system");
95 use_cyclone = 1; /*enable cyclone-timer*/
96 setup_summit();
97 return 1;
98 }
99 return 0;
100}
101
102/* Hook from generic ACPI tables.c */
103static inline int summit_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
104{
105 if (!strncmp(oem_id, "IBM", 3) &&
106 (!strncmp(oem_table_id, "SERVIGIL", 8)
107 || !strncmp(oem_table_id, "EXA", 3))){
108 mark_tsc_unstable("Summit based system");
109 use_cyclone = 1; /*enable cyclone-timer*/
110 setup_summit();
111 return 1;
112 }
113 return 0;
114}
115
116struct rio_table_hdr {
117 unsigned char version; /* Version number of this data structure */
118 /* Version 3 adds chassis_num & WP_index */
119 unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
120 unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
121} __attribute__((packed));
122
123struct scal_detail {
124 unsigned char node_id; /* Scalability Node ID */
125 unsigned long CBAR; /* Address of 1MB register space */
126 unsigned char port0node; /* Node ID port connected to: 0xFF=None */
127 unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
128 unsigned char port1node; /* Node ID port connected to: 0xFF = None */
129 unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
130 unsigned char port2node; /* Node ID port connected to: 0xFF = None */
131 unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
132 unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
133} __attribute__((packed));
134
135struct rio_detail {
136 unsigned char node_id; /* RIO Node ID */
137 unsigned long BBAR; /* Address of 1MB register space */
138 unsigned char type; /* Type of device */
139 unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
140 /* For CYC: Node ID of Twister that owns this CYC */
141 unsigned char port0node; /* Node ID port connected to: 0xFF=None */
142 unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
143 unsigned char port1node; /* Node ID port connected to: 0xFF=None */
144 unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
145 unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
146 /* For CYC: 0 */
147 unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
148 /* = 0 : the XAPIC is not used, ie:*/
149 /* ints fwded to another XAPIC */
150 /* Bits1:7 Reserved */
151 /* For CYC: Bits0:7 Reserved */
152 unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
153 /* lower slot numbers/PCI bus numbers */
154 /* For CYC: No meaning */
155 unsigned char chassis_num; /* 1 based Chassis number */
156 /* For LookOut WPEGs this field indicates the */
157 /* Expansion Chassis #, enumerated from Boot */
158 /* Node WPEG external port, then Boot Node CYC */
159 /* external port, then Next Vigil chassis WPEG */
160 /* external port, etc. */
161 /* Shared Lookouts have only 1 chassis number (the */
162 /* first one assigned) */
163} __attribute__((packed));
164
165
166typedef enum {
167 CompatTwister = 0, /* Compatibility Twister */
168 AltTwister = 1, /* Alternate Twister of internal 8-way */
169 CompatCyclone = 2, /* Compatibility Cyclone */
170 AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
171 CompatWPEG = 4, /* Compatibility WPEG */
172 AltWPEG = 5, /* Second Planar WPEG */
173 LookOutAWPEG = 6, /* LookOut WPEG */
174 LookOutBWPEG = 7, /* LookOut WPEG */
175} node_type;
176
177static inline int is_WPEG(struct rio_detail *rio){
178 return (rio->type == CompatWPEG || rio->type == AltWPEG ||
179 rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
180}
181
182
183/* In clustered mode, the high nibble of APIC ID is a cluster number.
184 * The low nibble is a 4-bit bitmap. */
185#define XAPIC_DEST_CPUS_SHIFT 4
186#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
187#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
188
189#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
190
191static inline const cpumask_t *summit_target_cpus(void)
192{
193 /* CPU_MASK_ALL (0xff) has undefined behaviour with
194 * dest_LowestPrio mode logical clustered apic interrupt routing
195 * Just start on cpu 0. IRQ balancing will spread load
196 */
197 return &cpumask_of_cpu(0);
198}
199
200static inline unsigned long
201summit_check_apicid_used(physid_mask_t bitmap, int apicid)
202{
203 return 0;
204}
205
206/* we don't use the phys_cpu_present_map to indicate apicid presence */
207static inline unsigned long summit_check_apicid_present(int bit)
208{
209 return 1;
210}
211
212#define apicid_cluster(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK)
213
214extern u8 cpu_2_logical_apicid[];
215
216static inline void summit_init_apic_ldr(void)
217{
218 unsigned long val, id;
219 int count = 0;
220 u8 my_id = (u8)hard_smp_processor_id();
221 u8 my_cluster = (u8)apicid_cluster(my_id);
222#ifdef CONFIG_SMP
223 u8 lid;
224 int i;
225
226 /* Create logical APIC IDs by counting CPUs already in cluster. */
227 for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
228 lid = cpu_2_logical_apicid[i];
229 if (lid != BAD_APICID && apicid_cluster(lid) == my_cluster)
230 ++count;
231 }
232#endif
233 /* We only have a 4 wide bitmap in cluster mode. If a deranged
234 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
235 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
236 id = my_cluster | (1UL << count);
237 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
238 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
239 val |= SET_APIC_LOGICAL_ID(id);
240 apic_write(APIC_LDR, val);
241}
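summit_init_apic_ldr() above (and summit_cpu_mask_to_apicid() further down) relies on the clustered xAPIC ID layout described in the earlier comment: the high nibble names the cluster and the low nibble is a one-hot position bitmap, so logical destinations inside one cluster can simply be ORed together. A small stand-alone illustration of that packing (the APIC ID values are made up):

#include <stdio.h>

#define XAPIC_DEST_CPUS_SHIFT	4
#define XAPIC_DEST_CPUS_MASK	((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
#define XAPIC_DEST_CLUSTER_MASK	(XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)

#define apicid_cluster(apicid)	((apicid) & XAPIC_DEST_CLUSTER_MASK)

int main(void)
{
	/* Three CPUs already assigned in cluster 0x2_: positions 0, 1, 2. */
	unsigned int assigned[] = { 0x21, 0x22, 0x24 };
	unsigned int my_cluster = 0x20;
	unsigned int count = 0;

	/* Count CPUs already in my cluster, as summit_init_apic_ldr() does. */
	for (unsigned int i = 0; i < sizeof(assigned) / sizeof(assigned[0]); i++)
		if (apicid_cluster(assigned[i]) == my_cluster)
			count++;

	unsigned int my_logical_id = my_cluster | (1u << count);
	printf("new logical APIC ID: 0x%02x\n", my_logical_id);	/* 0x28 */

	/* summit_cpu_mask_to_apicid() ORs the one-hot IDs of a single cluster. */
	unsigned int dest = assigned[0] | assigned[1] | assigned[2] | my_logical_id;
	printf("cluster destination bitmap: 0x%02x\n", dest);		/* 0x2f */
	return 0;
}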
242
243static inline int summit_apic_id_registered(void)
244{
245 return 1;
246}
247
248static inline void summit_setup_apic_routing(void)
249{
250 printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
251 nr_ioapics);
252}
253
254static inline int summit_apicid_to_node(int logical_apicid)
255{
256#ifdef CONFIG_SMP
257 return apicid_2_node[hard_smp_processor_id()];
258#else
259 return 0;
260#endif
261}
262
263/* Mapping from cpu number to logical apicid */
264static inline int summit_cpu_to_logical_apicid(int cpu)
265{
266#ifdef CONFIG_SMP
267 if (cpu >= nr_cpu_ids)
268 return BAD_APICID;
269 return (int)cpu_2_logical_apicid[cpu];
270#else
271 return logical_smp_processor_id();
272#endif
273}
274
275static inline int summit_cpu_present_to_apicid(int mps_cpu)
276{
277 if (mps_cpu < nr_cpu_ids)
278 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
279 else
280 return BAD_APICID;
281}
282
283static inline physid_mask_t
284summit_ioapic_phys_id_map(physid_mask_t phys_id_map)
285{
286 /* For clustered we don't have a good way to do this yet - hack */
287 return physids_promote(0x0F);
288}
289
290static inline physid_mask_t summit_apicid_to_cpu_present(int apicid)
291{
292 return physid_mask_of_physid(0);
293}
294
295static inline void summit_setup_portio_remap(void)
296{
297}
298
299static inline int summit_check_phys_apicid_present(int boot_cpu_physical_apicid)
300{
301 return 1;
302}
303
304static inline unsigned int summit_cpu_mask_to_apicid(const cpumask_t *cpumask)
305{
306 int cpus_found = 0;
307 int num_bits_set;
308 int apicid;
309 int cpu;
310
311 num_bits_set = cpus_weight(*cpumask);
312 /* Return id to all */
313 if (num_bits_set >= nr_cpu_ids)
314 return 0xFF;
315 /*
316 * The cpus in the mask must all be on the apic cluster. If they are not
317 * on the same apicid cluster return default value of target_cpus():
318 */
319 cpu = first_cpu(*cpumask);
320 apicid = summit_cpu_to_logical_apicid(cpu);
321
322 while (cpus_found < num_bits_set) {
323 if (cpu_isset(cpu, *cpumask)) {
324 int new_apicid = summit_cpu_to_logical_apicid(cpu);
325
326 if (apicid_cluster(apicid) !=
327 apicid_cluster(new_apicid)) {
328 printk ("%s: Not a valid mask!\n", __func__);
329
330 return 0xFF;
331 }
332 apicid = apicid | new_apicid;
333 cpus_found++;
334 }
335 cpu++;
336 }
337 return apicid;
338}
339
340static inline unsigned int
341summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
342 const struct cpumask *andmask)
343{
344 int apicid = summit_cpu_to_logical_apicid(0);
345 cpumask_var_t cpumask;
346
347 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
348 return apicid;
349
350 cpumask_and(cpumask, inmask, andmask);
351 cpumask_and(cpumask, cpumask, cpu_online_mask);
352 apicid = summit_cpu_mask_to_apicid(cpumask);
353
354 free_cpumask_var(cpumask);
355
356 return apicid;
357}
358
359/*
360 * cpuid returns the value latched in the HW at reset, not the APIC ID
361 * register's value. For any box whose BIOS changes APIC IDs, like
362 * clustered APIC systems, we must use hard_smp_processor_id.
363 *
364 * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
365 */
366static inline int summit_phys_pkg_id(int cpuid_apic, int index_msb)
367{
368 return hard_smp_processor_id() >> index_msb;
369}
370
371static int probe_summit(void)
372{
373 /* probed later in mptable/ACPI hooks */
374 return 0;
375}
376
377static void summit_vector_allocation_domain(int cpu, cpumask_t *retmask)
378{
379 /* Careful. Some cpus do not strictly honor the set of cpus
380 * specified in the interrupt destination when using lowest
381 * priority interrupt delivery mode.
382 *
383 * In particular there was a hyperthreading cpu observed to
384 * deliver interrupts to the wrong hyperthread when only one
385 * hyperthread was specified in the interrupt destination.
386 */
387 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
388}
389
390#ifdef CONFIG_X86_SUMMIT_NUMA
35static struct rio_table_hdr *rio_table_hdr __initdata; 391static struct rio_table_hdr *rio_table_hdr __initdata;
36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; 392static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
37static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata; 393static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
@@ -186,3 +542,61 @@ void __init setup_summit(void)
186 next_wpeg = 0; 542 next_wpeg = 0;
187 } while (next_wpeg != 0); 543 } while (next_wpeg != 0);
188} 544}
545#endif
546
547struct genapic apic_summit = {
548
549 .name = "summit",
550 .probe = probe_summit,
551 .acpi_madt_oem_check = summit_acpi_madt_oem_check,
552 .apic_id_registered = summit_apic_id_registered,
553
554 .irq_delivery_mode = dest_LowestPrio,
555 /* logical delivery broadcast to all CPUs: */
556 .irq_dest_mode = 1,
557
558 .target_cpus = summit_target_cpus,
559 .disable_esr = 1,
560 .dest_logical = APIC_DEST_LOGICAL,
561 .check_apicid_used = summit_check_apicid_used,
562 .check_apicid_present = summit_check_apicid_present,
563
564 .vector_allocation_domain = summit_vector_allocation_domain,
565 .init_apic_ldr = summit_init_apic_ldr,
566
567 .ioapic_phys_id_map = summit_ioapic_phys_id_map,
568 .setup_apic_routing = summit_setup_apic_routing,
569 .multi_timer_check = NULL,
570 .apicid_to_node = summit_apicid_to_node,
571 .cpu_to_logical_apicid = summit_cpu_to_logical_apicid,
572 .cpu_present_to_apicid = summit_cpu_present_to_apicid,
573 .apicid_to_cpu_present = summit_apicid_to_cpu_present,
574 .setup_portio_remap = NULL,
575 .check_phys_apicid_present = summit_check_phys_apicid_present,
576 .enable_apic_mode = NULL,
577 .phys_pkg_id = summit_phys_pkg_id,
578 .mps_oem_check = summit_mps_oem_check,
579
580 .get_apic_id = summit_get_apic_id,
581 .set_apic_id = NULL,
582 .apic_id_mask = 0xFF << 24,
583
584 .cpu_mask_to_apicid = summit_cpu_mask_to_apicid,
585 .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and,
586
587 .send_IPI_mask = summit_send_IPI_mask,
588 .send_IPI_mask_allbutself = NULL,
589 .send_IPI_allbutself = summit_send_IPI_allbutself,
590 .send_IPI_all = summit_send_IPI_all,
591 .send_IPI_self = default_send_IPI_self,
592
593 .wakeup_cpu = NULL,
594 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
595 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
596
597 .wait_for_init_deassert = default_wait_for_init_deassert,
598
599 .smp_callin_clear_local_apic = NULL,
600 .store_NMI_vector = NULL,
601 .inquire_remote_apic = default_inquire_remote_apic,
602};
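The apic_summit table above, together with the call sites changed earlier in smpboot.c (if (apic->store_NMI_vector) ..., apic->wakeup_cpu(...), if (apic->inquire_remote_apic) ...), follows one pattern: per-platform behaviour lives behind a structure of function pointers, and optional hooks are left NULL and tested before use. A minimal sketch of that pattern in plain C (the structure, field names and values here are illustrative, not the kernel's genapic definition):

#include <stdio.h>

/* Toy ops table: mandatory wakeup hook, optional inquire hook. */
struct toy_apic_ops {
	const char *name;
	int  (*wakeup_cpu)(int apicid, unsigned long start_ip);
	void (*inquire_remote_apic)(int apicid);	/* may be NULL */
};

static int toy_wakeup(int apicid, unsigned long start_ip)
{
	printf("waking APIC %d at %#lx\n", apicid, start_ip);
	return 0;
}

static const struct toy_apic_ops toy_summit = {
	.name			= "toy-summit",
	.wakeup_cpu		= toy_wakeup,
	.inquire_remote_apic	= NULL,		/* this platform has no such hook */
};

static void boot_one_cpu(const struct toy_apic_ops *apic, int apicid)
{
	if (apic->wakeup_cpu(apicid, 0x9f000UL)) {
		/* Only call optional hooks after a NULL check, as smpboot.c now does. */
		if (apic->inquire_remote_apic)
			apic->inquire_remote_apic(apicid);
	}
}

int main(void)
{
	boot_one_cpu(&toy_summit, 1);
	return 0;
}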
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395ff34c3..3bdb64829b82 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -1,7 +1,7 @@
1ENTRY(sys_call_table) 1ENTRY(sys_call_table)
2 .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ 2 .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
3 .long sys_exit 3 .long sys_exit
4 .long sys_fork 4 .long ptregs_fork
5 .long sys_read 5 .long sys_read
6 .long sys_write 6 .long sys_write
7 .long sys_open /* 5 */ 7 .long sys_open /* 5 */
@@ -10,7 +10,7 @@ ENTRY(sys_call_table)
10 .long sys_creat 10 .long sys_creat
11 .long sys_link 11 .long sys_link
12 .long sys_unlink /* 10 */ 12 .long sys_unlink /* 10 */
13 .long sys_execve 13 .long ptregs_execve
14 .long sys_chdir 14 .long sys_chdir
15 .long sys_time 15 .long sys_time
16 .long sys_mknod 16 .long sys_mknod
@@ -88,7 +88,7 @@ ENTRY(sys_call_table)
88 .long sys_uselib 88 .long sys_uselib
89 .long sys_swapon 89 .long sys_swapon
90 .long sys_reboot 90 .long sys_reboot
91 .long old_readdir 91 .long sys_old_readdir
92 .long old_mmap /* 90 */ 92 .long old_mmap /* 90 */
93 .long sys_munmap 93 .long sys_munmap
94 .long sys_truncate 94 .long sys_truncate
@@ -109,17 +109,17 @@ ENTRY(sys_call_table)
109 .long sys_newlstat 109 .long sys_newlstat
110 .long sys_newfstat 110 .long sys_newfstat
111 .long sys_uname 111 .long sys_uname
112 .long sys_iopl /* 110 */ 112 .long ptregs_iopl /* 110 */
113 .long sys_vhangup 113 .long sys_vhangup
114 .long sys_ni_syscall /* old "idle" system call */ 114 .long sys_ni_syscall /* old "idle" system call */
115 .long sys_vm86old 115 .long ptregs_vm86old
116 .long sys_wait4 116 .long sys_wait4
117 .long sys_swapoff /* 115 */ 117 .long sys_swapoff /* 115 */
118 .long sys_sysinfo 118 .long sys_sysinfo
119 .long sys_ipc 119 .long sys_ipc
120 .long sys_fsync 120 .long sys_fsync
121 .long sys_sigreturn 121 .long ptregs_sigreturn
122 .long sys_clone /* 120 */ 122 .long ptregs_clone /* 120 */
123 .long sys_setdomainname 123 .long sys_setdomainname
124 .long sys_newuname 124 .long sys_newuname
125 .long sys_modify_ldt 125 .long sys_modify_ldt
@@ -165,14 +165,14 @@ ENTRY(sys_call_table)
165 .long sys_mremap 165 .long sys_mremap
166 .long sys_setresuid16 166 .long sys_setresuid16
167 .long sys_getresuid16 /* 165 */ 167 .long sys_getresuid16 /* 165 */
168 .long sys_vm86 168 .long ptregs_vm86
169 .long sys_ni_syscall /* Old sys_query_module */ 169 .long sys_ni_syscall /* Old sys_query_module */
170 .long sys_poll 170 .long sys_poll
171 .long sys_nfsservctl 171 .long sys_nfsservctl
172 .long sys_setresgid16 /* 170 */ 172 .long sys_setresgid16 /* 170 */
173 .long sys_getresgid16 173 .long sys_getresgid16
174 .long sys_prctl 174 .long sys_prctl
175 .long sys_rt_sigreturn 175 .long ptregs_rt_sigreturn
176 .long sys_rt_sigaction 176 .long sys_rt_sigaction
177 .long sys_rt_sigprocmask /* 175 */ 177 .long sys_rt_sigprocmask /* 175 */
178 .long sys_rt_sigpending 178 .long sys_rt_sigpending
@@ -185,11 +185,11 @@ ENTRY(sys_call_table)
185 .long sys_getcwd 185 .long sys_getcwd
186 .long sys_capget 186 .long sys_capget
187 .long sys_capset /* 185 */ 187 .long sys_capset /* 185 */
188 .long sys_sigaltstack 188 .long ptregs_sigaltstack
189 .long sys_sendfile 189 .long sys_sendfile
190 .long sys_ni_syscall /* reserved for streams1 */ 190 .long sys_ni_syscall /* reserved for streams1 */
191 .long sys_ni_syscall /* reserved for streams2 */ 191 .long sys_ni_syscall /* reserved for streams2 */
192 .long sys_vfork /* 190 */ 192 .long ptregs_vfork /* 190 */
193 .long sys_getrlimit 193 .long sys_getrlimit
194 .long sys_mmap2 194 .long sys_mmap2
195 .long sys_truncate64 195 .long sys_truncate64
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 77b400f06ea2..764c74e871f2 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -38,7 +38,7 @@
38#include <asm/time.h> 38#include <asm/time.h>
39#include <asm/timer.h> 39#include <asm/timer.h>
40 40
41#include "do_timer.h" 41#include <asm/do_timer.h>
42 42
43int timer_ack; 43int timer_ack;
44 44
@@ -75,7 +75,7 @@ EXPORT_SYMBOL(profile_pc);
75irqreturn_t timer_interrupt(int irq, void *dev_id) 75irqreturn_t timer_interrupt(int irq, void *dev_id)
76{ 76{
77 /* Keep nmi watchdog up to date */ 77 /* Keep nmi watchdog up to date */
78 per_cpu(irq_stat, smp_processor_id()).irq0_irqs++; 78 inc_irq_stat(irq0_irqs);
79 79
80#ifdef CONFIG_X86_IO_APIC 80#ifdef CONFIG_X86_IO_APIC
81 if (timer_ack) { 81 if (timer_ack) {
@@ -105,8 +105,8 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
105 high bit of the PPI port B (0x61). Note that some PS/2s, 105 high bit of the PPI port B (0x61). Note that some PS/2s,
106 notably the 55SX, work fine if this is removed. */ 106 notably the 55SX, work fine if this is removed. */
107 107
108 u8 irq_v = inb_p( 0x61 ); /* read the current state */ 108 u8 irq_v = inb_p(0x61); /* read the current state */
109 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ 109 outb_p(irq_v | 0x80, 0x61); /* reset the IRQ */
110 } 110 }
111#endif 111#endif
112 112
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index cb19d650c216..e6e695acd725 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -17,10 +17,10 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/time.h> 18#include <linux/time.h>
19#include <linux/mca.h> 19#include <linux/mca.h>
20#include <linux/nmi.h>
20 21
21#include <asm/i8253.h> 22#include <asm/i8253.h>
22#include <asm/hpet.h> 23#include <asm/hpet.h>
23#include <asm/nmi.h>
24#include <asm/vgtod.h> 24#include <asm/vgtod.h>
25#include <asm/time.h> 25#include <asm/time.h>
26#include <asm/timer.h> 26#include <asm/timer.h>
@@ -49,9 +49,9 @@ unsigned long profile_pc(struct pt_regs *regs)
49} 49}
50EXPORT_SYMBOL(profile_pc); 50EXPORT_SYMBOL(profile_pc);
51 51
52irqreturn_t timer_interrupt(int irq, void *dev_id) 52static irqreturn_t timer_interrupt(int irq, void *dev_id)
53{ 53{
54 add_pda(irq0_irqs, 1); 54 inc_irq_stat(irq0_irqs);
55 55
56 global_clock_event->event_handler(global_clock_event); 56 global_clock_event->event_handler(global_clock_event);
57 57
@@ -80,6 +80,8 @@ unsigned long __init calibrate_cpu(void)
80 break; 80 break;
81 no_ctr_free = (i == 4); 81 no_ctr_free = (i == 4);
82 if (no_ctr_free) { 82 if (no_ctr_free) {
83 WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
84 "cpu_khz value may be incorrect.\n");
83 i = 3; 85 i = 3;
84 rdmsrl(MSR_K7_EVNTSEL3, evntsel3); 86 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
85 wrmsrl(MSR_K7_EVNTSEL3, 0); 87 wrmsrl(MSR_K7_EVNTSEL3, 0);
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
deleted file mode 100644
index f4049f3513b6..000000000000
--- a/arch/x86/kernel/tlb_32.c
+++ /dev/null
@@ -1,257 +0,0 @@
1#include <linux/spinlock.h>
2#include <linux/cpu.h>
3#include <linux/interrupt.h>
4
5#include <asm/tlbflush.h>
6
7DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate)
8 ____cacheline_aligned = { &init_mm, 0, };
9
10/* must come after the send_IPI functions above for inlining */
11#include <mach_ipi.h>
12
13/*
14 * Smarter SMP flushing macros.
15 * c/o Linus Torvalds.
16 *
17 * These mean you can really definitely utterly forget about
18 * writing to user space from interrupts. (Its not allowed anyway).
19 *
20 * Optimizations Manfred Spraul <manfred@colorfullife.com>
21 */
22
23static cpumask_t flush_cpumask;
24static struct mm_struct *flush_mm;
25static unsigned long flush_va;
26static DEFINE_SPINLOCK(tlbstate_lock);
27
28/*
29 * We cannot call mmdrop() because we are in interrupt context,
30 * instead update mm->cpu_vm_mask.
31 *
32 * We need to reload %cr3 since the page tables may be going
33 * away from under us..
34 */
35void leave_mm(int cpu)
36{
37 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
38 BUG();
39 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
40 load_cr3(swapper_pg_dir);
41}
42EXPORT_SYMBOL_GPL(leave_mm);
43
44/*
45 *
46 * The flush IPI assumes that a thread switch happens in this order:
47 * [cpu0: the cpu that switches]
48 * 1) switch_mm() either 1a) or 1b)
49 * 1a) thread switch to a different mm
50 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
51 * Stop ipi delivery for the old mm. This is not synchronized with
52 * the other cpus, but smp_invalidate_interrupt ignore flush ipis
53 * for the wrong mm, and in the worst case we perform a superfluous
54 * tlb flush.
55 * 1a2) set cpu_tlbstate to TLBSTATE_OK
56 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
57 * was in lazy tlb mode.
58 * 1a3) update cpu_tlbstate[].active_mm
59 * Now cpu0 accepts tlb flushes for the new mm.
60 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
61 * Now the other cpus will send tlb flush ipis.
62 * 1a4) change cr3.
63 * 1b) thread switch without mm change
64 * cpu_tlbstate[].active_mm is correct, cpu0 already handles
65 * flush ipis.
66 * 1b1) set cpu_tlbstate to TLBSTATE_OK
67 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
68 * Atomically set the bit [other cpus will start sending flush ipis],
69 * and test the bit.
70 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
71 * 2) switch %%esp, ie current
72 *
73 * The interrupt must handle 2 special cases:
74 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
75 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
76 * runs in kernel space, the cpu could load tlb entries for user space
77 * pages.
78 *
79 * The good news is that cpu_tlbstate is local to each cpu, no
80 * write/read ordering problems.
81 */
82
83/*
84 * TLB flush IPI:
85 *
86 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
87 * 2) Leave the mm if we are in the lazy tlb mode.
88 */
89
90void smp_invalidate_interrupt(struct pt_regs *regs)
91{
92 unsigned long cpu;
93
94 cpu = get_cpu();
95
96 if (!cpu_isset(cpu, flush_cpumask))
97 goto out;
98 /*
99 * This was a BUG() but until someone can quote me the
100 * line from the intel manual that guarantees an IPI to
101 * multiple CPUs is retried _only_ on the erroring CPUs
102 * its staying as a return
103 *
104 * BUG();
105 */
106
107 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
108 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
109 if (flush_va == TLB_FLUSH_ALL)
110 local_flush_tlb();
111 else
112 __flush_tlb_one(flush_va);
113 } else
114 leave_mm(cpu);
115 }
116 ack_APIC_irq();
117 smp_mb__before_clear_bit();
118 cpu_clear(cpu, flush_cpumask);
119 smp_mb__after_clear_bit();
120out:
121 put_cpu_no_resched();
122 __get_cpu_var(irq_stat).irq_tlb_count++;
123}
124
125void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
126 unsigned long va)
127{
128 cpumask_t cpumask = *cpumaskp;
129
130 /*
131 * A couple of (to be removed) sanity checks:
132 *
133 * - current CPU must not be in mask
134 * - mask must exist :)
135 */
136 BUG_ON(cpus_empty(cpumask));
137 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
138 BUG_ON(!mm);
139
140#ifdef CONFIG_HOTPLUG_CPU
141 /* If a CPU which we ran on has gone down, OK. */
142 cpus_and(cpumask, cpumask, cpu_online_map);
143 if (unlikely(cpus_empty(cpumask)))
144 return;
145#endif
146
147 /*
148 * i'm not happy about this global shared spinlock in the
149 * MM hot path, but we'll see how contended it is.
150 * AK: x86-64 has a faster method that could be ported.
151 */
152 spin_lock(&tlbstate_lock);
153
154 flush_mm = mm;
155 flush_va = va;
156 cpus_or(flush_cpumask, cpumask, flush_cpumask);
157
158 /*
159 * Make the above memory operations globally visible before
160 * sending the IPI.
161 */
162 smp_mb();
163 /*
164 * We have to send the IPI only to
165 * CPUs affected.
166 */
167 send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
168
169 while (!cpus_empty(flush_cpumask))
170 /* nothing. lockup detection does not belong here */
171 cpu_relax();
172
173 flush_mm = NULL;
174 flush_va = 0;
175 spin_unlock(&tlbstate_lock);
176}
177
178void flush_tlb_current_task(void)
179{
180 struct mm_struct *mm = current->mm;
181 cpumask_t cpu_mask;
182
183 preempt_disable();
184 cpu_mask = mm->cpu_vm_mask;
185 cpu_clear(smp_processor_id(), cpu_mask);
186
187 local_flush_tlb();
188 if (!cpus_empty(cpu_mask))
189 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
190 preempt_enable();
191}
192
193void flush_tlb_mm(struct mm_struct *mm)
194{
195 cpumask_t cpu_mask;
196
197 preempt_disable();
198 cpu_mask = mm->cpu_vm_mask;
199 cpu_clear(smp_processor_id(), cpu_mask);
200
201 if (current->active_mm == mm) {
202 if (current->mm)
203 local_flush_tlb();
204 else
205 leave_mm(smp_processor_id());
206 }
207 if (!cpus_empty(cpu_mask))
208 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
209
210 preempt_enable();
211}
212
213void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
214{
215 struct mm_struct *mm = vma->vm_mm;
216 cpumask_t cpu_mask;
217
218 preempt_disable();
219 cpu_mask = mm->cpu_vm_mask;
220 cpu_clear(smp_processor_id(), cpu_mask);
221
222 if (current->active_mm == mm) {
223 if (current->mm)
224 __flush_tlb_one(va);
225 else
226 leave_mm(smp_processor_id());
227 }
228
229 if (!cpus_empty(cpu_mask))
230 flush_tlb_others(cpu_mask, mm, va);
231
232 preempt_enable();
233}
234EXPORT_SYMBOL(flush_tlb_page);
235
236static void do_flush_tlb_all(void *info)
237{
238 unsigned long cpu = smp_processor_id();
239
240 __flush_tlb_all();
241 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
242 leave_mm(cpu);
243}
244
245void flush_tlb_all(void)
246{
247 on_each_cpu(do_flush_tlb_all, NULL, 1);
248}
249
250void reset_lazy_tlbstate(void)
251{
252 int cpu = raw_smp_processor_id();
253
254 per_cpu(cpu_tlbstate, cpu).state = 0;
255 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
256}
257
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
deleted file mode 100644
index 8f919ca69494..000000000000
--- a/arch/x86/kernel/tlb_64.c
+++ /dev/null
@@ -1,284 +0,0 @@
1#include <linux/init.h>
2
3#include <linux/mm.h>
4#include <linux/delay.h>
5#include <linux/spinlock.h>
6#include <linux/smp.h>
7#include <linux/kernel_stat.h>
8#include <linux/mc146818rtc.h>
9#include <linux/interrupt.h>
10
11#include <asm/mtrr.h>
12#include <asm/pgalloc.h>
13#include <asm/tlbflush.h>
14#include <asm/mmu_context.h>
15#include <asm/proto.h>
16#include <asm/apicdef.h>
17#include <asm/idle.h>
18#include <asm/uv/uv_hub.h>
19#include <asm/uv/uv_bau.h>
20
21#include <mach_ipi.h>
22/*
23 * Smarter SMP flushing macros.
24 * c/o Linus Torvalds.
25 *
26 * These mean you can really definitely utterly forget about
27 * writing to user space from interrupts. (Its not allowed anyway).
28 *
29 * Optimizations Manfred Spraul <manfred@colorfullife.com>
30 *
31 * More scalable flush, from Andi Kleen
32 *
33 * To avoid global state use 8 different call vectors.
34 * Each CPU uses a specific vector to trigger flushes on other
35 * CPUs. Depending on the received vector the target CPUs look into
36 * the right per cpu variable for the flush data.
37 *
38 * With more than 8 CPUs they are hashed to the 8 available
39 * vectors. The limited global vector space forces us to this right now.
40 * In future when interrupts are split into per CPU domains this could be
41 * fixed, at the cost of triggering multiple IPIs in some cases.
42 */
43
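The comment above (in this file being removed) describes how flush senders are spread over the eight invalidate vectors: with more CPUs than vectors, senders are hashed by a simple modulo, which is exactly the sender calculation used further down in native_flush_tlb_others(). A trivial stand-alone illustration of that hashing (the CPU count is arbitrary):

#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS 8

int main(void)
{
	/* Senders are hashed onto the 8 flush vectors by CPU number. */
	for (int cpu = 0; cpu < 12; cpu++) {
		int sender = cpu % NUM_INVALIDATE_TLB_VECTORS;
		printf("cpu %2d -> flush vector slot %d\n", cpu, sender);
	}
	return 0;
}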
44union smp_flush_state {
45 struct {
46 cpumask_t flush_cpumask;
47 struct mm_struct *flush_mm;
48 unsigned long flush_va;
49 spinlock_t tlbstate_lock;
50 };
51 char pad[SMP_CACHE_BYTES];
52} ____cacheline_aligned;
53
54/* State is put into the per CPU data section, but padded
55 to a full cache line because other CPUs can access it and we don't
56 want false sharing in the per cpu data segment. */
57static DEFINE_PER_CPU(union smp_flush_state, flush_state);
58
59/*
60 * We cannot call mmdrop() because we are in interrupt context,
61 * instead update mm->cpu_vm_mask.
62 */
63void leave_mm(int cpu)
64{
65 if (read_pda(mmu_state) == TLBSTATE_OK)
66 BUG();
67 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
68 load_cr3(swapper_pg_dir);
69}
70EXPORT_SYMBOL_GPL(leave_mm);
71
72/*
73 *
74 * The flush IPI assumes that a thread switch happens in this order:
75 * [cpu0: the cpu that switches]
76 * 1) switch_mm() either 1a) or 1b)
77 * 1a) thread switch to a different mm
78 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
79 * Stop ipi delivery for the old mm. This is not synchronized with
80 * the other cpus, but smp_invalidate_interrupt ignore flush ipis
81 * for the wrong mm, and in the worst case we perform a superfluous
82 * tlb flush.
83 * 1a2) set cpu mmu_state to TLBSTATE_OK
84 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
85 * was in lazy tlb mode.
86 * 1a3) update cpu active_mm
87 * Now cpu0 accepts tlb flushes for the new mm.
88 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
89 * Now the other cpus will send tlb flush ipis.
90 * 1a5) change cr3.
91 * 1b) thread switch without mm change
92 * cpu active_mm is correct, cpu0 already handles
93 * flush ipis.
94 * 1b1) set cpu mmu_state to TLBSTATE_OK
95 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
96 * Atomically set the bit [other cpus will start sending flush ipis],
97 * and test the bit.
98 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
99 * 2) switch %%esp, ie current
100 *
101 * The interrupt must handle 2 special cases:
102 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
103 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
104 * runs in kernel space, the cpu could load tlb entries for user space
105 * pages.
106 *
107 * The good news is that cpu mmu_state is local to each cpu, no
108 * write/read ordering problems.
109 */
110
111/*
112 * TLB flush IPI:
113 *
114 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
115 * 2) Leave the mm if we are in the lazy tlb mode.
116 *
117 * Interrupts are disabled.
118 */
119
120asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
121{
122 int cpu;
123 int sender;
124 union smp_flush_state *f;
125
126 cpu = smp_processor_id();
127 /*
128 * orig_rax contains the negated interrupt vector.
129 * Use that to determine where the sender put the data.
130 */
131 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
132 f = &per_cpu(flush_state, sender);
133
134 if (!cpu_isset(cpu, f->flush_cpumask))
135 goto out;
136 /*
137 * This was a BUG() but until someone can quote me the
138 * line from the intel manual that guarantees an IPI to
139 * multiple CPUs is retried _only_ on the erroring CPUs
140 * it's staying as a return
141 *
142 * BUG();
143 */
144
145 if (f->flush_mm == read_pda(active_mm)) {
146 if (read_pda(mmu_state) == TLBSTATE_OK) {
147 if (f->flush_va == TLB_FLUSH_ALL)
148 local_flush_tlb();
149 else
150 __flush_tlb_one(f->flush_va);
151 } else
152 leave_mm(cpu);
153 }
154out:
155 ack_APIC_irq();
156 cpu_clear(cpu, f->flush_cpumask);
157 add_pda(irq_tlb_count, 1);
158}
159
160void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
161 unsigned long va)
162{
163 int sender;
164 union smp_flush_state *f;
165 cpumask_t cpumask = *cpumaskp;
166
167 if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va))
168 return;
169
170 /* Caller has disabled preemption */
171 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
172 f = &per_cpu(flush_state, sender);
173
174 /*
175 * Could avoid this lock when
176 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
177 * probably not worth checking this for a cache-hot lock.
178 */
179 spin_lock(&f->tlbstate_lock);
180
181 f->flush_mm = mm;
182 f->flush_va = va;
183 cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
184
185 /*
186 * Make the above memory operations globally visible before
187 * sending the IPI.
188 */
189 smp_mb();
190 /*
191 * We have to send the IPI only to
192 * CPUs affected.
193 */
194 send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
195
196 while (!cpus_empty(f->flush_cpumask))
197 cpu_relax();
198
199 f->flush_mm = NULL;
200 f->flush_va = 0;
201 spin_unlock(&f->tlbstate_lock);
202}
203
204static int __cpuinit init_smp_flush(void)
205{
206 int i;
207
208 for_each_possible_cpu(i)
209 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
210
211 return 0;
212}
213core_initcall(init_smp_flush);
214
215void flush_tlb_current_task(void)
216{
217 struct mm_struct *mm = current->mm;
218 cpumask_t cpu_mask;
219
220 preempt_disable();
221 cpu_mask = mm->cpu_vm_mask;
222 cpu_clear(smp_processor_id(), cpu_mask);
223
224 local_flush_tlb();
225 if (!cpus_empty(cpu_mask))
226 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
227 preempt_enable();
228}
229
230void flush_tlb_mm(struct mm_struct *mm)
231{
232 cpumask_t cpu_mask;
233
234 preempt_disable();
235 cpu_mask = mm->cpu_vm_mask;
236 cpu_clear(smp_processor_id(), cpu_mask);
237
238 if (current->active_mm == mm) {
239 if (current->mm)
240 local_flush_tlb();
241 else
242 leave_mm(smp_processor_id());
243 }
244 if (!cpus_empty(cpu_mask))
245 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
246
247 preempt_enable();
248}
249
250void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
251{
252 struct mm_struct *mm = vma->vm_mm;
253 cpumask_t cpu_mask;
254
255 preempt_disable();
256 cpu_mask = mm->cpu_vm_mask;
257 cpu_clear(smp_processor_id(), cpu_mask);
258
259 if (current->active_mm == mm) {
260 if (current->mm)
261 __flush_tlb_one(va);
262 else
263 leave_mm(smp_processor_id());
264 }
265
266 if (!cpus_empty(cpu_mask))
267 flush_tlb_others(cpu_mask, mm, va);
268
269 preempt_enable();
270}
271
272static void do_flush_tlb_all(void *info)
273{
274 unsigned long cpu = smp_processor_id();
275
276 __flush_tlb_all();
277 if (read_pda(mmu_state) == TLBSTATE_LAZY)
278 leave_mm(cpu);
279}
280
281void flush_tlb_all(void)
282{
283 on_each_cpu(do_flush_tlb_all, NULL, 1);
284}
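
The deleted 64-bit file above is the origin of the multi-vector flush scheme that survives in the unified code: each sending CPU is hashed onto one of eight invalidate vectors, and every vector owns a private flush-state slot, so up to eight flushes can be in flight without contending on a single lock. A small stand-alone sketch of just that hashing step, assuming the eight vectors used above; the slot struct is a toy stand-in for union smp_flush_state.

#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS 8    /* eight vectors, as in the code above */

struct flush_slot {                     /* toy stand-in for union smp_flush_state */
	const void *flush_mm;
	unsigned long flush_va;
	/* the real slot also carries its own cpumask and spinlock */
};

static struct flush_slot flush_state[NUM_INVALIDATE_TLB_VECTORS];

/* Each sending CPU hashes onto one of the eight vectors/slots. */
static int pick_sender_slot(int cpu)
{
	return cpu % NUM_INVALIDATE_TLB_VECTORS;
}

int main(void)
{
	int cpu;

	/*
	 * CPUs 0..15 map onto the eight slots; CPUs 8..15 reuse slots 0..7,
	 * which is why each slot needs its own lock once there are more
	 * than eight CPUs.
	 */
	for (cpu = 0; cpu < 16; cpu++)
		printf("cpu %2d -> invalidate vector slot %d\n",
		       cpu, pick_sender_slot(cpu));

	(void)flush_state;
	return 0;
}
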
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 04431f34fd16..f396e61bcb34 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -11,6 +11,7 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12 12
13#include <asm/mmu_context.h> 13#include <asm/mmu_context.h>
14#include <asm/uv/uv.h>
14#include <asm/uv/uv_mmrs.h> 15#include <asm/uv/uv_mmrs.h>
15#include <asm/uv/uv_hub.h> 16#include <asm/uv/uv_hub.h>
16#include <asm/uv/uv_bau.h> 17#include <asm/uv/uv_bau.h>
@@ -19,7 +20,7 @@
19#include <asm/tsc.h> 20#include <asm/tsc.h>
20#include <asm/irq_vectors.h> 21#include <asm/irq_vectors.h>
21 22
22#include <mach_apic.h> 23#include <asm/genapic.h>
23 24
24static struct bau_control **uv_bau_table_bases __read_mostly; 25static struct bau_control **uv_bau_table_bases __read_mostly;
25static int uv_bau_retry_limit __read_mostly; 26static int uv_bau_retry_limit __read_mostly;
@@ -200,6 +201,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
200 destination_timeouts = 0; 201 destination_timeouts = 0;
201 } 202 }
202 } 203 }
204 cpu_relax();
203 } 205 }
204 return FLUSH_COMPLETE; 206 return FLUSH_COMPLETE;
205} 207}
@@ -209,14 +211,15 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
209 * 211 *
210 * Send a broadcast and wait for a broadcast message to complete. 212 * Send a broadcast and wait for a broadcast message to complete.
211 * 213 *
212 * The cpumaskp mask contains the cpus the broadcast was sent to. 214 * The flush_mask contains the cpus the broadcast was sent to.
213 * 215 *
214 * Returns 1 if all remote flushing was done. The mask is zeroed. 216 * Returns NULL if all remote flushing was done. The mask is zeroed.
215 * Returns 0 if some remote flushing remains to be done. The mask is left 217 * Returns @flush_mask if some remote flushing remains to be done. The
216 * unchanged. 218 * mask will have some bits still set.
217 */ 219 */
218int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc, 220const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
219 cpumask_t *cpumaskp) 221 struct bau_desc *bau_desc,
222 struct cpumask *flush_mask)
220{ 223{
221 int completion_status = 0; 224 int completion_status = 0;
222 int right_shift; 225 int right_shift;
@@ -256,66 +259,76 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
256 * the cpu's, all of which are still in the mask. 259 * the cpu's, all of which are still in the mask.
257 */ 260 */
258 __get_cpu_var(ptcstats).ptc_i++; 261 __get_cpu_var(ptcstats).ptc_i++;
259 return 0; 262 return flush_mask;
260 } 263 }
261 264
262 /* 265 /*
263 * Success, so clear the remote cpu's from the mask so we don't 266 * Success, so clear the remote cpu's from the mask so we don't
264 * use the IPI method of shootdown on them. 267 * use the IPI method of shootdown on them.
265 */ 268 */
266 for_each_cpu_mask(bit, *cpumaskp) { 269 for_each_cpu(bit, flush_mask) {
267 blade = uv_cpu_to_blade_id(bit); 270 blade = uv_cpu_to_blade_id(bit);
268 if (blade == this_blade) 271 if (blade == this_blade)
269 continue; 272 continue;
270 cpu_clear(bit, *cpumaskp); 273 cpumask_clear_cpu(bit, flush_mask);
271 } 274 }
272 if (!cpus_empty(*cpumaskp)) 275 if (!cpumask_empty(flush_mask))
273 return 0; 276 return flush_mask;
274 return 1; 277 return NULL;
275} 278}
276 279
277/** 280/**
278 * uv_flush_tlb_others - globally purge translation cache of a virtual 281 * uv_flush_tlb_others - globally purge translation cache of a virtual
279 * address or all TLB's 282 * address or all TLB's
280 * @cpumaskp: mask of all cpu's in which the address is to be removed 283 * @cpumask: mask of all cpu's in which the address is to be removed
281 * @mm: mm_struct containing virtual address range 284 * @mm: mm_struct containing virtual address range
282 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) 285 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
286 * @cpu: the current cpu
283 * 287 *
284 * This is the entry point for initiating any UV global TLB shootdown. 288 * This is the entry point for initiating any UV global TLB shootdown.
285 * 289 *
286 * Purges the translation caches of all specified processors of the given 290 * Purges the translation caches of all specified processors of the given
287 * virtual address, or purges all TLB's on specified processors. 291 * virtual address, or purges all TLB's on specified processors.
288 * 292 *
289 * The caller has derived the cpumaskp from the mm_struct and has subtracted 293 * The caller has derived the cpumask from the mm_struct. This function
290 * the local cpu from the mask. This function is called only if there 294 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
291 * are bits set in the mask. (e.g. flush_tlb_page())
292 * 295 *
293 * The cpumaskp is converted into a nodemask of the nodes containing 296 * The cpumask is converted into a nodemask of the nodes containing
294 * the cpus. 297 * the cpus.
295 * 298 *
296 * Returns 1 if all remote flushing was done. 299 * Note that this function should be called with preemption disabled.
297 * Returns 0 if some remote flushing remains to be done. 300 *
301 * Returns NULL if all remote flushing was done.
302 * Returns pointer to cpumask if some remote flushing remains to be
303 * done. The returned pointer is valid till preemption is re-enabled.
298 */ 304 */
299int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, 305const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
300 unsigned long va) 306 struct mm_struct *mm,
307 unsigned long va, unsigned int cpu)
301{ 308{
309 static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
310 struct cpumask *flush_mask = &__get_cpu_var(flush_tlb_mask);
302 int i; 311 int i;
303 int bit; 312 int bit;
304 int blade; 313 int blade;
305 int cpu; 314 int uv_cpu;
306 int this_blade; 315 int this_blade;
307 int locals = 0; 316 int locals = 0;
308 struct bau_desc *bau_desc; 317 struct bau_desc *bau_desc;
309 318
310 cpu = uv_blade_processor_id(); 319 WARN_ON(!in_atomic());
320
321 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
322
323 uv_cpu = uv_blade_processor_id();
311 this_blade = uv_numa_blade_id(); 324 this_blade = uv_numa_blade_id();
312 bau_desc = __get_cpu_var(bau_control).descriptor_base; 325 bau_desc = __get_cpu_var(bau_control).descriptor_base;
313 bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu; 326 bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
314 327
315 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 328 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
316 329
317 i = 0; 330 i = 0;
318 for_each_cpu_mask(bit, *cpumaskp) { 331 for_each_cpu(bit, flush_mask) {
319 blade = uv_cpu_to_blade_id(bit); 332 blade = uv_cpu_to_blade_id(bit);
320 BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1)); 333 BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
321 if (blade == this_blade) { 334 if (blade == this_blade) {
@@ -330,17 +343,17 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
330 * no off_node flushing; return status for local node 343 * no off_node flushing; return status for local node
331 */ 344 */
332 if (locals) 345 if (locals)
333 return 0; 346 return flush_mask;
334 else 347 else
335 return 1; 348 return NULL;
336 } 349 }
337 __get_cpu_var(ptcstats).requestor++; 350 __get_cpu_var(ptcstats).requestor++;
338 __get_cpu_var(ptcstats).ntargeted += i; 351 __get_cpu_var(ptcstats).ntargeted += i;
339 352
340 bau_desc->payload.address = va; 353 bau_desc->payload.address = va;
341 bau_desc->payload.sending_cpu = smp_processor_id(); 354 bau_desc->payload.sending_cpu = cpu;
342 355
343 return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp); 356 return uv_flush_send_and_wait(uv_cpu, this_blade, bau_desc, flush_mask);
344} 357}
345 358
346/* 359/*
@@ -566,14 +579,10 @@ static int __init uv_ptc_init(void)
566 if (!is_uv_system()) 579 if (!is_uv_system())
567 return 0; 580 return 0;
568 581
569 if (!proc_mkdir("sgi_uv", NULL))
570 return -EINVAL;
571
572 proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); 582 proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
573 if (!proc_uv_ptc) { 583 if (!proc_uv_ptc) {
574 printk(KERN_ERR "unable to create %s proc entry\n", 584 printk(KERN_ERR "unable to create %s proc entry\n",
575 UV_PTC_BASENAME); 585 UV_PTC_BASENAME);
576 remove_proc_entry("sgi_uv", NULL);
577 return -EINVAL; 586 return -EINVAL;
578 } 587 }
579 proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; 588 proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
@@ -586,7 +595,6 @@ static int __init uv_ptc_init(void)
586static struct bau_control * __init uv_table_bases_init(int blade, int node) 595static struct bau_control * __init uv_table_bases_init(int blade, int node)
587{ 596{
588 int i; 597 int i;
589 int *ip;
590 struct bau_msg_status *msp; 598 struct bau_msg_status *msp;
591 struct bau_control *bau_tabp; 599 struct bau_control *bau_tabp;
592 600
@@ -603,13 +611,6 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node)
603 bau_cpubits_clear(&msp->seen_by, (int) 611 bau_cpubits_clear(&msp->seen_by, (int)
604 uv_blade_nr_possible_cpus(blade)); 612 uv_blade_nr_possible_cpus(blade));
605 613
606 bau_tabp->watching =
607 kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node);
608 BUG_ON(!bau_tabp->watching);
609
610 for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++)
611 *ip = 0;
612
613 uv_bau_table_bases[blade] = bau_tabp; 614 uv_bau_table_bases[blade] = bau_tabp;
614 615
615 return bau_tabp; 616 return bau_tabp;
@@ -632,7 +633,6 @@ uv_table_bases_finish(int blade, int node, int cur_cpu,
632 bcp->bau_msg_head = bau_tablesp->va_queue_first; 633 bcp->bau_msg_head = bau_tablesp->va_queue_first;
633 bcp->va_queue_first = bau_tablesp->va_queue_first; 634 bcp->va_queue_first = bau_tablesp->va_queue_first;
634 bcp->va_queue_last = bau_tablesp->va_queue_last; 635 bcp->va_queue_last = bau_tablesp->va_queue_last;
635 bcp->watching = bau_tablesp->watching;
636 bcp->msg_statuses = bau_tablesp->msg_statuses; 636 bcp->msg_statuses = bau_tablesp->msg_statuses;
637 bcp->descriptor_base = adp; 637 bcp->descriptor_base = adp;
638 } 638 }
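
The signature change in uv_flush_tlb_others() above replaces the old 0/1 status with a pointer: NULL means the BAU handled every remote CPU, while a non-NULL cpumask lists the CPUs that still need the ordinary IPI path. A hedged sketch of how a caller would consume that contract; uv_flush_tlb_others_model() and flush_remaining_by_ipi() are illustrative stand-ins, not kernel functions.

#include <stdio.h>
#include <stddef.h>

struct cpumask { unsigned long bits; }; /* toy stand-in for struct cpumask */

/*
 * Models the new return contract: NULL when the BAU flushed everything,
 * otherwise the mask of CPUs that still need an IPI-based flush.
 */
static const struct cpumask *
uv_flush_tlb_others_model(struct cpumask *remaining, int bau_worked)
{
	if (bau_worked) {
		remaining->bits = 0;
		return NULL;
	}
	return remaining;                       /* some bits still set */
}

static void flush_remaining_by_ipi(const struct cpumask *mask)
{
	printf("IPI fallback for mask 0x%lx\n", mask->bits);
}

int main(void)
{
	struct cpumask pending = { .bits = 0xf0 };
	const struct cpumask *left;

	left = uv_flush_tlb_others_model(&pending, 0);
	if (left)                               /* NULL means the BAU did it all */
		flush_remaining_by_ipi(left);
	return 0;
}
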
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index 1106fac6024d..808031a5ba19 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,10 +1,26 @@
1#include <linux/io.h> 1#include <linux/io.h>
2 2
3#include <asm/trampoline.h> 3#include <asm/trampoline.h>
4#include <asm/e820.h>
4 5
5/* ready for x86_64 and x86 */ 6/* ready for x86_64 and x86 */
6unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); 7unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
7 8
9void __init reserve_trampoline_memory(void)
10{
11#ifdef CONFIG_X86_32
12 /*
13 * But first pinch a few for the stack/trampoline stuff
14 * FIXME: Don't need the extra page at 4K, but need to fix
15 * trampoline before removing it. (see the GDT stuff)
16 */
17 reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
18#endif
19 /* Has to be in very low memory so we can execute real-mode AP code. */
20 reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE,
21 "TRAMPOLINE");
22}
23
8/* 24/*
9 * Currently trivial. Write the real->protected mode 25 * Currently trivial. Write the real->protected mode
10 * bootstrap into the page concerned. The caller 26 * bootstrap into the page concerned. The caller
@@ -12,7 +28,6 @@ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
12 */ 28 */
13unsigned long setup_trampoline(void) 29unsigned long setup_trampoline(void)
14{ 30{
15 memcpy(trampoline_base, trampoline_data, 31 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
16 trampoline_end - trampoline_data);
17 return virt_to_phys(trampoline_base); 32 return virt_to_phys(trampoline_base);
18} 33}
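
The reservation added above exists because the AP startup trampoline is executed in real mode, which can only address the first 1 MiB of memory; TRAMPOLINE_BASE therefore has to be an address well below that limit. A quick sanity-check sketch of the constraint, using placeholder values for TRAMPOLINE_BASE and TRAMPOLINE_SIZE rather than the kernel's real ones.

#include <stdio.h>

#define PAGE_SIZE       4096UL
#define LOW_MEM_LIMIT   0x100000UL      /* 1 MiB: the real-mode addressable range */

/* Placeholder values for illustration only, not the kernel's. */
#define TRAMPOLINE_BASE 0x6000UL
#define TRAMPOLINE_SIZE PAGE_SIZE

int main(void)
{
	unsigned long start = TRAMPOLINE_BASE;
	unsigned long end   = TRAMPOLINE_BASE + TRAMPOLINE_SIZE;

	/*
	 * The trampoline must sit below 1 MiB so the real-mode AP startup
	 * code can reach it; page alignment is what the kernel uses in
	 * practice.
	 */
	if (end <= LOW_MEM_LIMIT && (start % PAGE_SIZE) == 0)
		printf("reserve [%#lx, %#lx) for TRAMPOLINE\n", start, end);
	else
		printf("trampoline range not usable from real mode\n");
	return 0;
}
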
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 894293c598db..95a012a4664e 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -29,6 +29,7 @@
29#include <asm/page.h> 29#include <asm/page.h>
30#include <asm/msr.h> 30#include <asm/msr.h>
31#include <asm/segment.h> 31#include <asm/segment.h>
32#include <asm/processor-flags.h>
32 33
33.section .rodata, "a", @progbits 34.section .rodata, "a", @progbits
34 35
@@ -37,7 +38,7 @@
37ENTRY(trampoline_data) 38ENTRY(trampoline_data)
38r_base = . 39r_base = .
39 cli # We should be safe anyway 40 cli # We should be safe anyway
40 wbinvd 41 wbinvd
41 mov %cs, %ax # Code and data in the same place 42 mov %cs, %ax # Code and data in the same place
42 mov %ax, %ds 43 mov %ax, %ds
43 mov %ax, %es 44 mov %ax, %es
@@ -73,9 +74,8 @@ r_base = .
73 lidtl tidt - r_base # load idt with 0, 0 74 lidtl tidt - r_base # load idt with 0, 0
74 lgdtl tgdt - r_base # load gdt with whatever is appropriate 75 lgdtl tgdt - r_base # load gdt with whatever is appropriate
75 76
76 xor %ax, %ax 77 mov $X86_CR0_PE, %ax # protected mode (PE) bit
77 inc %ax # protected mode (PE) bit 78 lmsw %ax # into protected mode
78 lmsw %ax # into protected mode
79 79
80 # flush prefetch and jump to startup_32 80 # flush prefetch and jump to startup_32
81 ljmpl *(startup_32_vector - r_base) 81 ljmpl *(startup_32_vector - r_base)
@@ -86,9 +86,8 @@ startup_32:
86 movl $__KERNEL_DS, %eax # Initialize the %ds segment register 86 movl $__KERNEL_DS, %eax # Initialize the %ds segment register
87 movl %eax, %ds 87 movl %eax, %ds
88 88
89 xorl %eax, %eax 89 movl $X86_CR4_PAE, %eax
90 btsl $5, %eax # Enable PAE mode 90 movl %eax, %cr4 # Enable PAE mode
91 movl %eax, %cr4
92 91
93 # Setup trampoline 4 level pagetables 92 # Setup trampoline 4 level pagetables
94 leal (trampoline_level4_pgt - r_base)(%esi), %eax 93 leal (trampoline_level4_pgt - r_base)(%esi), %eax
@@ -99,9 +98,9 @@ startup_32:
99 xorl %edx, %edx 98 xorl %edx, %edx
100 wrmsr 99 wrmsr
101 100
102 xorl %eax, %eax 101 # Enable paging and in turn activate Long Mode
103 btsl $31, %eax # Enable paging and in turn activate Long Mode 102 # Enable protected mode
104 btsl $0, %eax # Enable protected mode 103 movl $(X86_CR0_PG | X86_CR0_PE), %eax
105 movl %eax, %cr0 104 movl %eax, %cr0
106 105
107 /* 106 /*
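
The assembly changes above replace hand-built bit patterns (xor/inc, btsl) with the named constants from asm/processor-flags.h. The equivalence is easy to verify from the architectural bit positions: CR0.PE is bit 0, CR4.PAE is bit 5 and CR0.PG is bit 31. A small check program, with the flag values restated locally for illustration.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Architectural bit positions, restated locally for illustration
 * (they mirror asm/processor-flags.h).
 */
#define X86_CR0_PE  (1u << 0)   /* protection enable */
#define X86_CR4_PAE (1u << 5)   /* physical address extension */
#define X86_CR0_PG  (1u << 31)  /* paging enable */

int main(void)
{
	uint16_t ax;
	uint32_t eax;

	/* old: xor %ax,%ax; inc %ax          new: mov $X86_CR0_PE,%ax */
	ax = 0; ax++;
	assert(ax == X86_CR0_PE);

	/* old: xorl %eax,%eax; btsl $5,%eax  new: movl $X86_CR4_PAE,%eax */
	eax = 0; eax |= 1u << 5;
	assert(eax == X86_CR4_PAE);

	/* old: xorl; btsl $31; btsl $0       new: movl $(X86_CR0_PG|X86_CR0_PE),%eax */
	eax = 0; eax |= 1u << 31; eax |= 1u << 0;
	assert(eax == (X86_CR0_PG | X86_CR0_PE));

	printf("named CR0/CR4 constants match the old bit twiddling\n");
	return 0;
}
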
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 04d242ab0161..bde57f0f1616 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -20,7 +20,6 @@
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/ptrace.h> 21#include <linux/ptrace.h>
22#include <linux/string.h> 22#include <linux/string.h>
23#include <linux/unwind.h>
24#include <linux/delay.h> 23#include <linux/delay.h>
25#include <linux/errno.h> 24#include <linux/errno.h>
26#include <linux/kexec.h> 25#include <linux/kexec.h>
@@ -51,30 +50,22 @@
51#include <asm/debugreg.h> 50#include <asm/debugreg.h>
52#include <asm/atomic.h> 51#include <asm/atomic.h>
53#include <asm/system.h> 52#include <asm/system.h>
54#include <asm/unwind.h>
55#include <asm/traps.h> 53#include <asm/traps.h>
56#include <asm/desc.h> 54#include <asm/desc.h>
57#include <asm/i387.h> 55#include <asm/i387.h>
58 56
59#include <mach_traps.h> 57#include <asm/mach_traps.h>
60 58
61#ifdef CONFIG_X86_64 59#ifdef CONFIG_X86_64
62#include <asm/pgalloc.h> 60#include <asm/pgalloc.h>
63#include <asm/proto.h> 61#include <asm/proto.h>
64#include <asm/pda.h>
65#else 62#else
66#include <asm/processor-flags.h> 63#include <asm/processor-flags.h>
67#include <asm/arch_hooks.h> 64#include <asm/arch_hooks.h>
68#include <asm/nmi.h>
69#include <asm/smp.h>
70#include <asm/io.h>
71#include <asm/traps.h> 65#include <asm/traps.h>
72 66
73#include "cpu/mcheck/mce.h" 67#include "cpu/mcheck/mce.h"
74 68
75DECLARE_BITMAP(used_vectors, NR_VECTORS);
76EXPORT_SYMBOL_GPL(used_vectors);
77
78asmlinkage int system_call(void); 69asmlinkage int system_call(void);
79 70
80/* Do we ignore FPU interrupts ? */ 71/* Do we ignore FPU interrupts ? */
@@ -89,6 +80,9 @@ gate_desc idt_table[256]
89 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; 80 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
90#endif 81#endif
91 82
83DECLARE_BITMAP(used_vectors, NR_VECTORS);
84EXPORT_SYMBOL_GPL(used_vectors);
85
92static int ignore_nmis; 86static int ignore_nmis;
93 87
94static inline void conditional_sti(struct pt_regs *regs) 88static inline void conditional_sti(struct pt_regs *regs)
@@ -292,8 +286,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
292 tsk->thread.error_code = error_code; 286 tsk->thread.error_code = error_code;
293 tsk->thread.trap_no = 8; 287 tsk->thread.trap_no = 8;
294 288
295 /* This is always a kernel trap and never fixable (and thus must 289 /*
296 never return). */ 290 * This is always a kernel trap and never fixable (and thus must
291 * never return).
292 */
297 for (;;) 293 for (;;)
298 die(str, regs, error_code); 294 die(str, regs, error_code);
299} 295}
@@ -481,11 +477,7 @@ do_nmi(struct pt_regs *regs, long error_code)
481{ 477{
482 nmi_enter(); 478 nmi_enter();
483 479
484#ifdef CONFIG_X86_32 480 inc_irq_stat(__nmi_count);
485 { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
486#else
487 add_pda(__nmi_count, 1);
488#endif
489 481
490 if (!ignore_nmis) 482 if (!ignore_nmis)
491 default_do_nmi(regs); 483 default_do_nmi(regs);
@@ -524,9 +516,11 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
524} 516}
525 517
526#ifdef CONFIG_X86_64 518#ifdef CONFIG_X86_64
527/* Help handler running on IST stack to switch back to user stack 519/*
528 for scheduling or signal handling. The actual stack switch is done in 520 * Help handler running on IST stack to switch back to user stack
529 entry.S */ 521 * for scheduling or signal handling. The actual stack switch is done in
522 * entry.S
523 */
530asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) 524asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
531{ 525{
532 struct pt_regs *regs = eregs; 526 struct pt_regs *regs = eregs;
@@ -536,8 +530,10 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
536 /* Exception from user space */ 530 /* Exception from user space */
537 else if (user_mode(eregs)) 531 else if (user_mode(eregs))
538 regs = task_pt_regs(current); 532 regs = task_pt_regs(current);
539 /* Exception from kernel and interrupts are enabled. Move to 533 /*
540 kernel process stack. */ 534 * Exception from kernel and interrupts are enabled. Move to
535 * kernel process stack.
536 */
541 else if (eregs->flags & X86_EFLAGS_IF) 537 else if (eregs->flags & X86_EFLAGS_IF)
542 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); 538 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
543 if (eregs != regs) 539 if (eregs != regs)
@@ -664,7 +660,7 @@ void math_error(void __user *ip)
664{ 660{
665 struct task_struct *task; 661 struct task_struct *task;
666 siginfo_t info; 662 siginfo_t info;
667 unsigned short cwd, swd; 663 unsigned short cwd, swd, err;
668 664
669 /* 665 /*
670 * Save the info for the exception handler and clear the error. 666 * Save the info for the exception handler and clear the error.
@@ -675,7 +671,6 @@ void math_error(void __user *ip)
675 task->thread.error_code = 0; 671 task->thread.error_code = 0;
676 info.si_signo = SIGFPE; 672 info.si_signo = SIGFPE;
677 info.si_errno = 0; 673 info.si_errno = 0;
678 info.si_code = __SI_FAULT;
679 info.si_addr = ip; 674 info.si_addr = ip;
680 /* 675 /*
681 * (~cwd & swd) will mask out exceptions that are not set to unmasked 676 * (~cwd & swd) will mask out exceptions that are not set to unmasked
@@ -689,34 +684,30 @@ void math_error(void __user *ip)
689 */ 684 */
690 cwd = get_fpu_cwd(task); 685 cwd = get_fpu_cwd(task);
691 swd = get_fpu_swd(task); 686 swd = get_fpu_swd(task);
692 switch (swd & ~cwd & 0x3f) { 687
693 case 0x000: /* No unmasked exception */ 688 err = swd & ~cwd;
694#ifdef CONFIG_X86_32 689
695 return; 690 if (err & 0x001) { /* Invalid op */
696#endif
697 default: /* Multiple exceptions */
698 break;
699 case 0x001: /* Invalid Op */
700 /* 691 /*
701 * swd & 0x240 == 0x040: Stack Underflow 692 * swd & 0x240 == 0x040: Stack Underflow
702 * swd & 0x240 == 0x240: Stack Overflow 693 * swd & 0x240 == 0x240: Stack Overflow
703 * User must clear the SF bit (0x40) if set 694 * User must clear the SF bit (0x40) if set
704 */ 695 */
705 info.si_code = FPE_FLTINV; 696 info.si_code = FPE_FLTINV;
706 break; 697 } else if (err & 0x004) { /* Divide by Zero */
707 case 0x002: /* Denormalize */
708 case 0x010: /* Underflow */
709 info.si_code = FPE_FLTUND;
710 break;
711 case 0x004: /* Zero Divide */
712 info.si_code = FPE_FLTDIV; 698 info.si_code = FPE_FLTDIV;
713 break; 699 } else if (err & 0x008) { /* Overflow */
714 case 0x008: /* Overflow */
715 info.si_code = FPE_FLTOVF; 700 info.si_code = FPE_FLTOVF;
716 break; 701 } else if (err & 0x012) { /* Denormal, Underflow */
717 case 0x020: /* Precision */ 702 info.si_code = FPE_FLTUND;
703 } else if (err & 0x020) { /* Precision */
718 info.si_code = FPE_FLTRES; 704 info.si_code = FPE_FLTRES;
719 break; 705 } else {
706 /*
707 * If we're using IRQ 13, or supposedly even some trap 16
708 * implementations, it's possible we get a spurious trap...
709 */
710 return; /* Spurious trap, no error */
720 } 711 }
721 force_sig_info(SIGFPE, &info, task); 712 force_sig_info(SIGFPE, &info, task);
722} 713}
@@ -904,7 +895,7 @@ asmlinkage void math_state_restore(void)
904EXPORT_SYMBOL_GPL(math_state_restore); 895EXPORT_SYMBOL_GPL(math_state_restore);
905 896
906#ifndef CONFIG_MATH_EMULATION 897#ifndef CONFIG_MATH_EMULATION
907asmlinkage void math_emulate(long arg) 898void math_emulate(struct math_emu_info *info)
908{ 899{
909 printk(KERN_EMERG 900 printk(KERN_EMERG
910 "math-emulation not enabled and no coprocessor found.\n"); 901 "math-emulation not enabled and no coprocessor found.\n");
@@ -915,12 +906,16 @@ asmlinkage void math_emulate(long arg)
915#endif /* CONFIG_MATH_EMULATION */ 906#endif /* CONFIG_MATH_EMULATION */
916 907
917dotraplinkage void __kprobes 908dotraplinkage void __kprobes
918do_device_not_available(struct pt_regs *regs, long error) 909do_device_not_available(struct pt_regs *regs, long error_code)
919{ 910{
920#ifdef CONFIG_X86_32 911#ifdef CONFIG_X86_32
921 if (read_cr0() & X86_CR0_EM) { 912 if (read_cr0() & X86_CR0_EM) {
913 struct math_emu_info info = { };
914
922 conditional_sti(regs); 915 conditional_sti(regs);
923 math_emulate(0); 916
917 info.regs = regs;
918 math_emulate(&info);
924 } else { 919 } else {
925 math_state_restore(); /* interrupts still off */ 920 math_state_restore(); /* interrupts still off */
926 conditional_sti(regs); 921 conditional_sti(regs);
@@ -949,9 +944,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
949 944
950void __init trap_init(void) 945void __init trap_init(void)
951{ 946{
952#ifdef CONFIG_X86_32
953 int i; 947 int i;
954#endif
955 948
956#ifdef CONFIG_EISA 949#ifdef CONFIG_EISA
957 void __iomem *p = early_ioremap(0x0FFFD9, 4); 950 void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@ -1008,11 +1001,15 @@ void __init trap_init(void)
1008 } 1001 }
1009 1002
1010 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 1003 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
1004#endif
1011 1005
1012 /* Reserve all the builtin and the syscall vector: */ 1006 /* Reserve all the builtin and the syscall vector: */
1013 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) 1007 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
1014 set_bit(i, used_vectors); 1008 set_bit(i, used_vectors);
1015 1009
1010#ifdef CONFIG_X86_64
1011 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
1012#else
1016 set_bit(SYSCALL_VECTOR, used_vectors); 1013 set_bit(SYSCALL_VECTOR, used_vectors);
1017#endif 1014#endif
1018 /* 1015 /*
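
The rewritten math_error() above decodes the unmasked FPU exception bits (swd & ~cwd) in priority order rather than switching on an exact value, and treats anything unrecognised as a spurious trap. Pulled out on its own, the decode looks roughly like this; the FPE_* values are illustrative stand-ins for the siginfo codes.

#include <stdio.h>

/* Illustrative stand-ins for the FPE_* siginfo codes. */
enum { FPE_NONE, FPE_FLTINV, FPE_FLTDIV, FPE_FLTOVF, FPE_FLTUND, FPE_FLTRES };

/*
 * Same priority order as the rewritten math_error(): invalid op,
 * divide by zero, overflow, denormal/underflow, precision, and
 * anything else is treated as a spurious trap.
 */
static int decode_fpu_error(unsigned short swd, unsigned short cwd)
{
	unsigned short err = swd & ~cwd;        /* only unmasked exceptions */

	if (err & 0x001)
		return FPE_FLTINV;
	if (err & 0x004)
		return FPE_FLTDIV;
	if (err & 0x008)
		return FPE_FLTOVF;
	if (err & 0x012)
		return FPE_FLTUND;
	if (err & 0x020)
		return FPE_FLTRES;
	return FPE_NONE;                        /* spurious trap, no signal */
}

int main(void)
{
	/* invalid-op bit set and unmasked -> FPE_FLTINV (1) */
	printf("%d\n", decode_fpu_error(0x0001, 0x0000));
	/* every exception masked -> spurious, no signal (0) */
	printf("%d\n", decode_fpu_error(0x0020, 0x003f));
	return 0;
}
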
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 424093b157d3..83d53ce5d4c4 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -15,6 +15,7 @@
15#include <asm/vgtod.h> 15#include <asm/vgtod.h>
16#include <asm/time.h> 16#include <asm/time.h>
17#include <asm/delay.h> 17#include <asm/delay.h>
18#include <asm/hypervisor.h>
18 19
19unsigned int cpu_khz; /* TSC clocks / usec, not used here */ 20unsigned int cpu_khz; /* TSC clocks / usec, not used here */
20EXPORT_SYMBOL(cpu_khz); 21EXPORT_SYMBOL(cpu_khz);
@@ -31,6 +32,7 @@ static int tsc_unstable;
31 erroneous rdtsc usage on !cpu_has_tsc processors */ 32 erroneous rdtsc usage on !cpu_has_tsc processors */
32static int tsc_disabled = -1; 33static int tsc_disabled = -1;
33 34
35static int tsc_clocksource_reliable;
34/* 36/*
35 * Scheduler clock - returns current time in nanosec units. 37 * Scheduler clock - returns current time in nanosec units.
36 */ 38 */
@@ -98,6 +100,15 @@ int __init notsc_setup(char *str)
98 100
99__setup("notsc", notsc_setup); 101__setup("notsc", notsc_setup);
100 102
103static int __init tsc_setup(char *str)
104{
105 if (!strcmp(str, "reliable"))
106 tsc_clocksource_reliable = 1;
107 return 1;
108}
109
110__setup("tsc=", tsc_setup);
111
101#define MAX_RETRIES 5 112#define MAX_RETRIES 5
102#define SMI_TRESHOLD 50000 113#define SMI_TRESHOLD 50000
103 114
@@ -352,9 +363,15 @@ unsigned long native_calibrate_tsc(void)
352{ 363{
353 u64 tsc1, tsc2, delta, ref1, ref2; 364 u64 tsc1, tsc2, delta, ref1, ref2;
354 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 365 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
355 unsigned long flags, latch, ms, fast_calibrate; 366 unsigned long flags, latch, ms, fast_calibrate, tsc_khz;
356 int hpet = is_hpet_enabled(), i, loopmin; 367 int hpet = is_hpet_enabled(), i, loopmin;
357 368
369 tsc_khz = get_hypervisor_tsc_freq();
370 if (tsc_khz) {
371 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
372 return tsc_khz;
373 }
374
358 local_irq_save(flags); 375 local_irq_save(flags);
359 fast_calibrate = quick_pit_calibrate(); 376 fast_calibrate = quick_pit_calibrate();
360 local_irq_restore(flags); 377 local_irq_restore(flags);
@@ -731,24 +748,21 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
731 {} 748 {}
732}; 749};
733 750
734/* 751static void __init check_system_tsc_reliable(void)
735 * Geode_LX - the OLPC CPU has a possibly a very reliable TSC 752{
736 */
737#ifdef CONFIG_MGEODE_LX 753#ifdef CONFIG_MGEODE_LX
738/* RTSC counts during suspend */ 754 /* RTSC counts during suspend */
739#define RTSC_SUSP 0x100 755#define RTSC_SUSP 0x100
740
741static void __init check_geode_tsc_reliable(void)
742{
743 unsigned long res_low, res_high; 756 unsigned long res_low, res_high;
744 757
745 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); 758 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
759 /* Geode_LX - the OLPC CPU possibly has a very reliable TSC */
746 if (res_low & RTSC_SUSP) 760 if (res_low & RTSC_SUSP)
747 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; 761 tsc_clocksource_reliable = 1;
748}
749#else
750static inline void check_geode_tsc_reliable(void) { }
751#endif 762#endif
763 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
764 tsc_clocksource_reliable = 1;
765}
752 766
753/* 767/*
754 * Make an educated guess if the TSC is trustworthy and synchronized 768 * Make an educated guess if the TSC is trustworthy and synchronized
@@ -759,7 +773,7 @@ __cpuinit int unsynchronized_tsc(void)
759 if (!cpu_has_tsc || tsc_unstable) 773 if (!cpu_has_tsc || tsc_unstable)
760 return 1; 774 return 1;
761 775
762#ifdef CONFIG_X86_SMP 776#ifdef CONFIG_SMP
763 if (apic_is_clustered_box()) 777 if (apic_is_clustered_box())
764 return 1; 778 return 1;
765#endif 779#endif
@@ -783,6 +797,8 @@ static void __init init_tsc_clocksource(void)
783{ 797{
784 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, 798 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
785 clocksource_tsc.shift); 799 clocksource_tsc.shift);
800 if (tsc_clocksource_reliable)
801 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
786 /* lower the rating if we already know its unstable: */ 802 /* lower the rating if we already know its unstable: */
787 if (check_tsc_unstable()) { 803 if (check_tsc_unstable()) {
788 clocksource_tsc.rating = 0; 804 clocksource_tsc.rating = 0;
@@ -843,7 +859,7 @@ void __init tsc_init(void)
843 if (unsynchronized_tsc()) 859 if (unsynchronized_tsc())
844 mark_tsc_unstable("TSCs unsynchronized"); 860 mark_tsc_unstable("TSCs unsynchronized");
845 861
846 check_geode_tsc_reliable(); 862 check_system_tsc_reliable();
847 init_tsc_clocksource(); 863 init_tsc_clocksource();
848} 864}
849 865
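
The tsc_setup() handler added above introduces a "tsc=reliable" command-line switch that merely sets a flag, which later clears CLOCK_SOURCE_MUST_VERIFY and skips the sync checks. A minimal stand-alone sketch of that parsing step; in the kernel it is wired up through __setup("tsc=", tsc_setup), here it is called as an ordinary function.

#include <stdio.h>
#include <string.h>

static int tsc_clocksource_reliable;

/*
 * Models the __setup("tsc=", tsc_setup) handler: str is whatever
 * follows "tsc=" on the kernel command line.
 */
static int tsc_setup(const char *str)
{
	if (!strcmp(str, "reliable"))
		tsc_clocksource_reliable = 1;
	return 1;
}

int main(void)
{
	tsc_setup("reliable");
	printf("tsc_clocksource_reliable = %d\n", tsc_clocksource_reliable);
	return 0;
}
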
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 9ffb01c31c40..bf36328f6ef9 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -46,7 +46,9 @@ static __cpuinit void check_tsc_warp(void)
46 cycles_t start, now, prev, end; 46 cycles_t start, now, prev, end;
47 int i; 47 int i;
48 48
49 rdtsc_barrier();
49 start = get_cycles(); 50 start = get_cycles();
51 rdtsc_barrier();
50 /* 52 /*
51 * The measurement runs for 20 msecs: 53 * The measurement runs for 20 msecs:
52 */ 54 */
@@ -61,7 +63,9 @@ static __cpuinit void check_tsc_warp(void)
61 */ 63 */
62 __raw_spin_lock(&sync_lock); 64 __raw_spin_lock(&sync_lock);
63 prev = last_tsc; 65 prev = last_tsc;
66 rdtsc_barrier();
64 now = get_cycles(); 67 now = get_cycles();
68 rdtsc_barrier();
65 last_tsc = now; 69 last_tsc = now;
66 __raw_spin_unlock(&sync_lock); 70 __raw_spin_unlock(&sync_lock);
67 71
@@ -108,6 +112,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
108 if (unsynchronized_tsc()) 112 if (unsynchronized_tsc())
109 return; 113 return;
110 114
115 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
116 printk(KERN_INFO
117 "Skipping synchronization checks as TSC is reliable.\n");
118 return;
119 }
120
111 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", 121 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
112 smp_processor_id(), cpu); 122 smp_processor_id(), cpu);
113 123
@@ -161,7 +171,7 @@ void __cpuinit check_tsc_sync_target(void)
161{ 171{
162 int cpus = 2; 172 int cpus = 2;
163 173
164 if (unsynchronized_tsc()) 174 if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
165 return; 175 return;
166 176
167 /* 177 /*
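
The rdtsc_barrier() calls added above keep the TSC reads in check_tsc_warp() from being reordered around the surrounding code. The same idea in userspace is usually expressed by fencing the RDTSC instruction directly; a hedged sketch using GCC-style inline assembly on x86-64 (LFENCE here; the kernel's rdtsc_barrier() picks LFENCE or MFENCE depending on the CPU vendor).

#include <stdint.h>
#include <stdio.h>

/*
 * Read the TSC with fences on both sides so the read can neither drift
 * earlier nor later than the surrounding code (x86-64, GCC/Clang).
 */
static inline uint64_t rdtsc_ordered(void)
{
	uint32_t lo, hi;

	__asm__ __volatile__("lfence\n\trdtsc\n\tlfence"
			     : "=a"(lo), "=d"(hi) : : "memory");
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	uint64_t t1 = rdtsc_ordered();
	uint64_t t2 = rdtsc_ordered();

	printf("delta = %llu cycles\n", (unsigned long long)(t2 - t1));
	return 0;
}
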
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 0c9667f0752a..4fd646e6dd43 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -32,9 +32,9 @@
32#include <asm/e820.h> 32#include <asm/e820.h>
33#include <asm/io.h> 33#include <asm/io.h>
34 34
35#include <mach_ipi.h> 35#include <asm/genapic.h>
36 36
37#include "mach_apic.h" 37#include <asm/genapic.h>
38 38
39#include <linux/kernel_stat.h> 39#include <linux/kernel_stat.h>
40 40
@@ -176,33 +176,31 @@ static int __init visws_get_smp_config(unsigned int early)
176 * No problem for Linux. 176 * No problem for Linux.
177 */ 177 */
178 178
179static void __init MP_processor_info(struct mpc_config_processor *m) 179static void __init MP_processor_info(struct mpc_cpu *m)
180{ 180{
181 int ver, logical_apicid; 181 int ver, logical_apicid;
182 physid_mask_t apic_cpus; 182 physid_mask_t apic_cpus;
183 183
184 if (!(m->mpc_cpuflag & CPU_ENABLED)) 184 if (!(m->cpuflag & CPU_ENABLED))
185 return; 185 return;
186 186
187 logical_apicid = m->mpc_apicid; 187 logical_apicid = m->apicid;
188 printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n", 188 printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
189 m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "", 189 m->cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
190 m->mpc_apicid, 190 m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
191 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, 191 (m->cpufeature & CPU_MODEL_MASK) >> 4, m->apicver);
192 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
193 m->mpc_apicver);
194 192
195 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) 193 if (m->cpuflag & CPU_BOOTPROCESSOR)
196 boot_cpu_physical_apicid = m->mpc_apicid; 194 boot_cpu_physical_apicid = m->apicid;
197 195
198 ver = m->mpc_apicver; 196 ver = m->apicver;
199 if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) { 197 if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) {
200 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", 198 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
201 m->mpc_apicid, MAX_APICS); 199 m->apicid, MAX_APICS);
202 return; 200 return;
203 } 201 }
204 202
205 apic_cpus = apicid_to_cpu_present(m->mpc_apicid); 203 apic_cpus = apic->apicid_to_cpu_present(m->apicid);
206 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); 204 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
207 /* 205 /*
208 * Validate version 206 * Validate version
@@ -210,15 +208,15 @@ static void __init MP_processor_info(struct mpc_config_processor *m)
210 if (ver == 0x0) { 208 if (ver == 0x0) {
211 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! " 209 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
212 "fixing up to 0x10. (tell your hw vendor)\n", 210 "fixing up to 0x10. (tell your hw vendor)\n",
213 m->mpc_apicid); 211 m->apicid);
214 ver = 0x10; 212 ver = 0x10;
215 } 213 }
216 apic_version[m->mpc_apicid] = ver; 214 apic_version[m->apicid] = ver;
217} 215}
218 216
219static int __init visws_find_smp_config(unsigned int reserve) 217static int __init visws_find_smp_config(unsigned int reserve)
220{ 218{
221 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); 219 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
222 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 220 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
223 221
224 if (ncpus > CO_CPU_MAX) { 222 if (ncpus > CO_CPU_MAX) {
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 4eeb5cf9720d..d7ac84e7fc1c 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -158,7 +158,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
158 ret = KVM86->regs32; 158 ret = KVM86->regs32;
159 159
160 ret->fs = current->thread.saved_fs; 160 ret->fs = current->thread.saved_fs;
161 loadsegment(gs, current->thread.saved_gs); 161 set_user_gs(ret, current->thread.saved_gs);
162 162
163 return ret; 163 return ret;
164} 164}
@@ -197,9 +197,9 @@ out:
197static int do_vm86_irq_handling(int subfunction, int irqnumber); 197static int do_vm86_irq_handling(int subfunction, int irqnumber);
198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); 198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
199 199
200asmlinkage int sys_vm86old(struct pt_regs regs) 200int sys_vm86old(struct pt_regs *regs)
201{ 201{
202 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx; 202 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx;
203 struct kernel_vm86_struct info; /* declare this _on top_, 203 struct kernel_vm86_struct info; /* declare this _on top_,
204 * this avoids wasting of stack space. 204 * this avoids wasting of stack space.
205 * This remains on the stack until we 205 * This remains on the stack until we
@@ -218,7 +218,7 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
218 if (tmp) 218 if (tmp)
219 goto out; 219 goto out;
220 memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); 220 memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
221 info.regs32 = &regs; 221 info.regs32 = regs;
222 tsk->thread.vm86_info = v86; 222 tsk->thread.vm86_info = v86;
223 do_sys_vm86(&info, tsk); 223 do_sys_vm86(&info, tsk);
224 ret = 0; /* we never return here */ 224 ret = 0; /* we never return here */
@@ -227,7 +227,7 @@ out:
227} 227}
228 228
229 229
230asmlinkage int sys_vm86(struct pt_regs regs) 230int sys_vm86(struct pt_regs *regs)
231{ 231{
232 struct kernel_vm86_struct info; /* declare this _on top_, 232 struct kernel_vm86_struct info; /* declare this _on top_,
233 * this avoids wasting of stack space. 233 * this avoids wasting of stack space.
@@ -239,12 +239,12 @@ asmlinkage int sys_vm86(struct pt_regs regs)
239 struct vm86plus_struct __user *v86; 239 struct vm86plus_struct __user *v86;
240 240
241 tsk = current; 241 tsk = current;
242 switch (regs.bx) { 242 switch (regs->bx) {
243 case VM86_REQUEST_IRQ: 243 case VM86_REQUEST_IRQ:
244 case VM86_FREE_IRQ: 244 case VM86_FREE_IRQ:
245 case VM86_GET_IRQ_BITS: 245 case VM86_GET_IRQ_BITS:
246 case VM86_GET_AND_RESET_IRQ: 246 case VM86_GET_AND_RESET_IRQ:
247 ret = do_vm86_irq_handling(regs.bx, (int)regs.cx); 247 ret = do_vm86_irq_handling(regs->bx, (int)regs->cx);
248 goto out; 248 goto out;
249 case VM86_PLUS_INSTALL_CHECK: 249 case VM86_PLUS_INSTALL_CHECK:
250 /* 250 /*
@@ -261,14 +261,14 @@ asmlinkage int sys_vm86(struct pt_regs regs)
261 ret = -EPERM; 261 ret = -EPERM;
262 if (tsk->thread.saved_sp0) 262 if (tsk->thread.saved_sp0)
263 goto out; 263 goto out;
264 v86 = (struct vm86plus_struct __user *)regs.cx; 264 v86 = (struct vm86plus_struct __user *)regs->cx;
265 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 265 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
266 offsetof(struct kernel_vm86_struct, regs32) - 266 offsetof(struct kernel_vm86_struct, regs32) -
267 sizeof(info.regs)); 267 sizeof(info.regs));
268 ret = -EFAULT; 268 ret = -EFAULT;
269 if (tmp) 269 if (tmp)
270 goto out; 270 goto out;
271 info.regs32 = &regs; 271 info.regs32 = regs;
272 info.vm86plus.is_vm86pus = 1; 272 info.vm86plus.is_vm86pus = 1;
273 tsk->thread.vm86_info = (struct vm86_struct __user *)v86; 273 tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
274 do_sys_vm86(&info, tsk); 274 do_sys_vm86(&info, tsk);
@@ -323,7 +323,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
323 info->regs32->ax = 0; 323 info->regs32->ax = 0;
324 tsk->thread.saved_sp0 = tsk->thread.sp0; 324 tsk->thread.saved_sp0 = tsk->thread.sp0;
325 tsk->thread.saved_fs = info->regs32->fs; 325 tsk->thread.saved_fs = info->regs32->fs;
326 savesegment(gs, tsk->thread.saved_gs); 326 tsk->thread.saved_gs = get_user_gs(info->regs32);
327 327
328 tss = &per_cpu(init_tss, get_cpu()); 328 tss = &per_cpu(init_tss, get_cpu());
329 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; 329 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 8b6c393ab9fd..f052c84ecbe4 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -266,109 +266,6 @@ static void vmi_nop(void)
266{ 266{
267} 267}
268 268
269#ifdef CONFIG_DEBUG_PAGE_TYPE
270
271#ifdef CONFIG_X86_PAE
272#define MAX_BOOT_PTS (2048+4+1)
273#else
274#define MAX_BOOT_PTS (1024+1)
275#endif
276
277/*
278 * During boot, mem_map is not yet available in paging_init, so stash
279 * all the boot page allocations here.
280 */
281static struct {
282 u32 pfn;
283 int type;
284} boot_page_allocations[MAX_BOOT_PTS];
285static int num_boot_page_allocations;
286static int boot_allocations_applied;
287
288void vmi_apply_boot_page_allocations(void)
289{
290 int i;
291 BUG_ON(!mem_map);
292 for (i = 0; i < num_boot_page_allocations; i++) {
293 struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
294 page->type = boot_page_allocations[i].type;
295 page->type = boot_page_allocations[i].type &
296 ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
297 }
298 boot_allocations_applied = 1;
299}
300
301static void record_page_type(u32 pfn, int type)
302{
303 BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
304 boot_page_allocations[num_boot_page_allocations].pfn = pfn;
305 boot_page_allocations[num_boot_page_allocations].type = type;
306 num_boot_page_allocations++;
307}
308
309static void check_zeroed_page(u32 pfn, int type, struct page *page)
310{
311 u32 *ptr;
312 int i;
313 int limit = PAGE_SIZE / sizeof(int);
314
315 if (page_address(page))
316 ptr = (u32 *)page_address(page);
317 else
318 ptr = (u32 *)__va(pfn << PAGE_SHIFT);
319 /*
320 * When cloning the root in non-PAE mode, only the userspace
321 * pdes need to be zeroed.
322 */
323 if (type & VMI_PAGE_CLONE)
324 limit = KERNEL_PGD_BOUNDARY;
325 for (i = 0; i < limit; i++)
326 BUG_ON(ptr[i]);
327}
328
329/*
330 * We stash the page type into struct page so we can verify the page
331 * types are used properly.
332 */
333static void vmi_set_page_type(u32 pfn, int type)
334{
335 /* PAE can have multiple roots per page - don't track */
336 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
337 return;
338
339 if (boot_allocations_applied) {
340 struct page *page = pfn_to_page(pfn);
341 if (type != VMI_PAGE_NORMAL)
342 BUG_ON(page->type);
343 else
344 BUG_ON(page->type == VMI_PAGE_NORMAL);
345 page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
346 if (type & VMI_PAGE_ZEROED)
347 check_zeroed_page(pfn, type, page);
348 } else {
349 record_page_type(pfn, type);
350 }
351}
352
353static void vmi_check_page_type(u32 pfn, int type)
354{
355 /* PAE can have multiple roots per page - skip checks */
356 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
357 return;
358
359 type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
360 if (boot_allocations_applied) {
361 struct page *page = pfn_to_page(pfn);
362 BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
363 BUG_ON(type == VMI_PAGE_NORMAL && page->type);
364 BUG_ON((type & page->type) == 0);
365 }
366}
367#else
368#define vmi_set_page_type(p,t) do { } while (0)
369#define vmi_check_page_type(p,t) do { } while (0)
370#endif
371
372#ifdef CONFIG_HIGHPTE 269#ifdef CONFIG_HIGHPTE
373static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) 270static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
374{ 271{
@@ -395,7 +292,6 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
395 292
396static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) 293static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
397{ 294{
398 vmi_set_page_type(pfn, VMI_PAGE_L1);
399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 295 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
400} 296}
401 297
@@ -406,27 +302,32 @@ static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
406 * It is called only for swapper_pg_dir, which already has 302 * It is called only for swapper_pg_dir, which already has
407 * data on it. 303 * data on it.
408 */ 304 */
409 vmi_set_page_type(pfn, VMI_PAGE_L2);
410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); 305 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
411} 306}
412 307
413static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) 308static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
414{ 309{
415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
416 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); 310 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
418} 311}
419 312
420static void vmi_release_pte(unsigned long pfn) 313static void vmi_release_pte(unsigned long pfn)
421{ 314{
422 vmi_ops.release_page(pfn, VMI_PAGE_L1); 315 vmi_ops.release_page(pfn, VMI_PAGE_L1);
423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
424} 316}
425 317
426static void vmi_release_pmd(unsigned long pfn) 318static void vmi_release_pmd(unsigned long pfn)
427{ 319{
428 vmi_ops.release_page(pfn, VMI_PAGE_L2); 320 vmi_ops.release_page(pfn, VMI_PAGE_L2);
429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 321}
322
323/*
324 * We use the pgd_free hook for releasing the pgd page:
325 */
326static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd)
327{
328 unsigned long pfn = __pa(pgd) >> PAGE_SHIFT;
329
330 vmi_ops.release_page(pfn, VMI_PAGE_L2);
430} 331}
431 332
432/* 333/*
@@ -450,26 +351,22 @@ static void vmi_release_pmd(unsigned long pfn)
450 351
451static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 352static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
452{ 353{
453 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
454 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 354 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
455} 355}
456 356
457static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 357static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
458{ 358{
459 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
460 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); 359 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
461} 360}
462 361
463static void vmi_set_pte(pte_t *ptep, pte_t pte) 362static void vmi_set_pte(pte_t *ptep, pte_t pte)
464{ 363{
465 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */ 364 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
466 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
467 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); 365 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
468} 366}
469 367
470static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) 368static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
471{ 369{
472 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
473 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 370 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
474} 371}
475 372
@@ -477,10 +374,8 @@ static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
477{ 374{
478#ifdef CONFIG_X86_PAE 375#ifdef CONFIG_X86_PAE
479 const pte_t pte = { .pte = pmdval.pmd }; 376 const pte_t pte = { .pte = pmdval.pmd };
480 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
481#else 377#else
482 const pte_t pte = { pmdval.pud.pgd.pgd }; 378 const pte_t pte = { pmdval.pud.pgd.pgd };
483 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
484#endif 379#endif
485 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD); 380 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
486} 381}
@@ -502,7 +397,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
502 397
503static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) 398static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
504{ 399{
505 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
506 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1)); 400 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
507} 401}
508 402
@@ -510,21 +404,18 @@ static void vmi_set_pud(pud_t *pudp, pud_t pudval)
510{ 404{
511 /* Um, eww */ 405 /* Um, eww */
512 const pte_t pte = { .pte = pudval.pgd.pgd }; 406 const pte_t pte = { .pte = pudval.pgd.pgd };
513 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
514 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); 407 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
515} 408}
516 409
517static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 410static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
518{ 411{
519 const pte_t pte = { .pte = 0 }; 412 const pte_t pte = { .pte = 0 };
520 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
521 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 413 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
522} 414}
523 415
524static void vmi_pmd_clear(pmd_t *pmd) 416static void vmi_pmd_clear(pmd_t *pmd)
525{ 417{
526 const pte_t pte = { .pte = 0 }; 418 const pte_t pte = { .pte = 0 };
527 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
528 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); 419 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
529} 420}
530#endif 421#endif
@@ -789,10 +680,11 @@ static inline int __init activate_vmi(void)
789 para_fill(pv_mmu_ops.write_cr2, SetCR2); 680 para_fill(pv_mmu_ops.write_cr2, SetCR2);
790 para_fill(pv_mmu_ops.write_cr3, SetCR3); 681 para_fill(pv_mmu_ops.write_cr3, SetCR3);
791 para_fill(pv_cpu_ops.write_cr4, SetCR4); 682 para_fill(pv_cpu_ops.write_cr4, SetCR4);
792 para_fill(pv_irq_ops.save_fl, GetInterruptMask); 683
793 para_fill(pv_irq_ops.restore_fl, SetInterruptMask); 684 para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
794 para_fill(pv_irq_ops.irq_disable, DisableInterrupts); 685 para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
795 para_fill(pv_irq_ops.irq_enable, EnableInterrupts); 686 para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
687 para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
796 688
797 para_fill(pv_cpu_ops.wbinvd, WBINVD); 689 para_fill(pv_cpu_ops.wbinvd, WBINVD);
798 para_fill(pv_cpu_ops.read_tsc, RDTSC); 690 para_fill(pv_cpu_ops.read_tsc, RDTSC);
@@ -881,6 +773,7 @@ static inline int __init activate_vmi(void)
881 if (vmi_ops.release_page) { 773 if (vmi_ops.release_page) {
882 pv_mmu_ops.release_pte = vmi_release_pte; 774 pv_mmu_ops.release_pte = vmi_release_pte;
883 pv_mmu_ops.release_pmd = vmi_release_pmd; 775 pv_mmu_ops.release_pmd = vmi_release_pmd;
776 pv_mmu_ops.pgd_free = vmi_pgd_free;
884 } 777 }
885 778
886 /* Set linear is needed in all cases */ 779 /* Set linear is needed in all cases */
@@ -960,8 +853,6 @@ static inline int __init activate_vmi(void)
960 853
961void __init vmi_init(void) 854void __init vmi_init(void)
962{ 855{
963 unsigned long flags;
964
965 if (!vmi_rom) 856 if (!vmi_rom)
966 probe_vmi_rom(); 857 probe_vmi_rom();
967 else 858 else
@@ -973,13 +864,21 @@ void __init vmi_init(void)
973 864
974 reserve_top_address(-vmi_rom->virtual_top); 865 reserve_top_address(-vmi_rom->virtual_top);
975 866
976 local_irq_save(flags);
977 activate_vmi();
978
979#ifdef CONFIG_X86_IO_APIC 867#ifdef CONFIG_X86_IO_APIC
980 /* This is virtual hardware; timer routing is wired correctly */ 868 /* This is virtual hardware; timer routing is wired correctly */
981 no_timer_check = 1; 869 no_timer_check = 1;
982#endif 870#endif
871}
872
873void __init vmi_activate(void)
874{
875 unsigned long flags;
876
877 if (!vmi_rom)
878 return;
879
880 local_irq_save(flags);
881 activate_vmi();
983 local_irq_restore(flags & X86_EFLAGS_IF); 882 local_irq_restore(flags & X86_EFLAGS_IF);
984} 883}
985 884
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 254ee07f8635..a4791ef412d1 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void)
226 /* Upper bound is clockevent's use of ulong for cycle deltas. */ 226 /* Upper bound is clockevent's use of ulong for cycle deltas. */
227 evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt); 227 evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
228 evt->min_delta_ns = clockevent_delta2ns(1, evt); 228 evt->min_delta_ns = clockevent_delta2ns(1, evt);
229 evt->cpumask = cpumask_of_cpu(cpu); 229 evt->cpumask = cpumask_of(cpu);
230 230
231 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", 231 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
232 evt->name, evt->mult, evt->shift); 232 evt->name, evt->mult, evt->shift);
@@ -256,7 +256,7 @@ void __devinit vmi_time_bsp_init(void)
 	 */
 	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
 	local_irq_disable();
-#ifdef CONFIG_X86_SMP
+#ifdef CONFIG_SMP
 	/*
 	 * XXX handle_percpu_irq only defined for SMP; we need to switch over
 	 * to using it, since this is a local interrupt, which each CPU must
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index a9b8560adbc2..3eba7f7bac05 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -44,6 +44,7 @@ SECTIONS
 	SCHED_TEXT
 	LOCK_TEXT
 	KPROBES_TEXT
+	IRQENTRY_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	_etext = .;			/* End of text section */
@@ -177,14 +178,7 @@ SECTIONS
 	__initramfs_end = .;
   }
 #endif
-  . = ALIGN(PAGE_SIZE);
-  .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
-	__per_cpu_start = .;
-	*(.data.percpu.page_aligned)
-	*(.data.percpu)
-	*(.data.percpu.shared_aligned)
-	__per_cpu_end = .;
-  }
+  PERCPU(PAGE_SIZE)
   . = ALIGN(PAGE_SIZE);
   /* freed after init ends here */
 
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 46e05447405b..087a7f2c639b 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -5,6 +5,7 @@
 #define LOAD_OFFSET __START_KERNEL_map
 
 #include <asm-generic/vmlinux.lds.h>
+#include <asm/asm-offsets.h>
 #include <asm/page.h>
 
 #undef i386	/* in case the preprocessor is a 32bit one */
@@ -13,12 +14,15 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
 OUTPUT_ARCH(i386:x86-64)
 ENTRY(phys_startup_64)
 jiffies_64 = jiffies;
-_proxy_pda = 1;
 PHDRS {
 	text PT_LOAD FLAGS(5);	/* R_E */
 	data PT_LOAD FLAGS(7);	/* RWE */
 	user PT_LOAD FLAGS(7);	/* RWE */
 	data.init PT_LOAD FLAGS(7);	/* RWE */
+#ifdef CONFIG_SMP
+	percpu PT_LOAD FLAGS(7);	/* RWE */
+#endif
+	data.init2 PT_LOAD FLAGS(7);	/* RWE */
 	note PT_NOTE FLAGS(0);	/* ___ */
 }
 SECTIONS
@@ -35,6 +39,7 @@ SECTIONS
 	SCHED_TEXT
 	LOCK_TEXT
 	KPROBES_TEXT
+	IRQENTRY_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	_etext = .;			/* End of text section */
@@ -207,14 +212,28 @@ SECTIONS
 	__initramfs_end = .;
 #endif
 
+#ifdef CONFIG_SMP
+  /*
+   * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
+   * output PHDR, so the next output section - __data_nosave - should
+   * start another section data.init2.  Also, pda should be at the head of
+   * percpu area.  Preallocate it and define the percpu offset symbol
+   * so that it can be accessed as a percpu variable.
+   */
+  . = ALIGN(PAGE_SIZE);
+  PERCPU_VADDR(0, :percpu)
+#else
   PERCPU(PAGE_SIZE)
+#endif
 
   . = ALIGN(PAGE_SIZE);
   __init_end = .;
 
   . = ALIGN(PAGE_SIZE);
   __nosave_begin = .;
-  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
+  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+	*(.data.nosave)
+  } :data.init2	/* use another section data.init2, see PERCPU_VADDR() above */
   . = ALIGN(PAGE_SIZE);
   __nosave_end = .;
 
@@ -238,8 +257,21 @@ SECTIONS
   DWARF_DEBUG
 }
 
+/*
+ * Per-cpu symbols which need to be offset from __per_cpu_load
+ * for the boot processor.
+ */
+#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
+INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(irq_stack_union);
+
 /*
  * Build-time check on the image size:
  */
 ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
 	"kernel image bigger than KERNEL_IMAGE_SIZE")
+
+#ifdef CONFIG_SMP
+ASSERT((per_cpu__irq_stack_union == 0),
+        "irq_stack_union is not at start of per-cpu area");
+#endif
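Editor's note: because per-cpu offsets are zero-based on SMP in this layout, symbols the boot processor needs before the per-cpu areas exist get an init_per_cpu__ alias shifted by __per_cpu_load. INIT_PER_CPU() is plain token pasting, so its expansion is mechanical; a worked example using the definition added above:

/* Worked expansion of the INIT_PER_CPU() macro from the hunk above: */
INIT_PER_CPU(gdt_page);
/* expands to: */
init_per_cpu__gdt_page = per_cpu__gdt_page + __per_cpu_load;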
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a688f3bfaec2..c609205df594 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -37,6 +37,7 @@ static unsigned long vsmp_save_fl(void)
 	flags &= ~X86_EFLAGS_IF;
 	return flags;
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
 
 static void vsmp_restore_fl(unsigned long flags)
 {
@@ -46,6 +47,7 @@ static void vsmp_restore_fl(unsigned long flags)
 	flags |= X86_EFLAGS_AC;
 	native_restore_fl(flags);
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
 
 static void vsmp_irq_disable(void)
 {
@@ -53,6 +55,7 @@ static void vsmp_irq_disable(void)
 
 	native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
 
 static void vsmp_irq_enable(void)
 {
@@ -60,6 +63,7 @@ static void vsmp_irq_enable(void)
 
 	native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
 
 static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
 					    unsigned long addr, unsigned len)
@@ -90,10 +94,10 @@ static void __init set_vsmp_pv_ops(void)
 			cap, ctl);
 	if (cap & ctl & (1 << 4)) {
 		/* Setup irq ops and turn on vSMP IRQ fastpath handling */
-		pv_irq_ops.irq_disable = vsmp_irq_disable;
-		pv_irq_ops.irq_enable  = vsmp_irq_enable;
-		pv_irq_ops.save_fl     = vsmp_save_fl;
-		pv_irq_ops.restore_fl  = vsmp_restore_fl;
+		pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
+		pv_irq_ops.irq_enable  = PV_CALLEE_SAVE(vsmp_irq_enable);
+		pv_irq_ops.save_fl     = PV_CALLEE_SAVE(vsmp_save_fl);
+		pv_irq_ops.restore_fl  = PV_CALLEE_SAVE(vsmp_restore_fl);
 		pv_init_ops.patch = vsmp_patch;
 
 		ctl &= ~(1 << 4);
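Editor's note: this file shows the general pairing for the callee-save irq ops: each C handler gains a PV_CALLEE_SAVE_REGS_THUNK() companion, and assignments into pv_irq_ops go through PV_CALLEE_SAVE() rather than storing the bare function pointer. A hedged sketch of another user following the same pattern (my_save_fl and my_setup_irq_ops are made-up names; the macros and ops fields are the ones used in the hunks above):

#include <linux/init.h>
#include <asm/paravirt.h>
#include <asm/irqflags.h>

/* Hypothetical flag-save handler, shaped like vsmp_save_fl() above. */
static unsigned long my_save_fl(void)
{
	return native_save_fl() & X86_EFLAGS_IF;
}
/* Emit the register-saving asm stub that patched call sites expect. */
PV_CALLEE_SAVE_REGS_THUNK(my_save_fl);

static void __init my_setup_irq_ops(void)
{
	/* Store the callee-save wrapper, not the raw function pointer. */
	pv_irq_ops.save_fl = PV_CALLEE_SAVE(my_save_fl);
}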
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 0b8b6690a86d..44153afc9067 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -17,6 +17,9 @@
  * want per guest time just set the kernel.vsyscall64 sysctl to 0.
  */
 
+/* Disable profiling for userspace code: */
+#define DISABLE_BRANCH_PROFILING
+
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -128,7 +131,16 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
 		gettimeofday(tv,NULL);
 		return;
 	}
+
+	/*
+	 * Surround the RDTSC by barriers, to make sure it's not
+	 * speculated to outside the seqlock critical section and
+	 * does not cause time warps:
+	 */
+	rdtsc_barrier();
 	now = vread();
+	rdtsc_barrier();
+
 	base = __vsyscall_gtod_data.clock.cycle_last;
 	mask = __vsyscall_gtod_data.clock.mask;
 	mult = __vsyscall_gtod_data.clock.mult;
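Editor's note: the added comment states the intent; outside the kernel the same fencing idea looks like the sketch below. This is userspace C with GNU inline asm, not the kernel's rdtsc_barrier(), which selects LFENCE or MFENCE via alternatives at boot; plain LFENCE is assumed to be sufficient here.

/* Sketch: fence a TSC read so it cannot be hoisted above or sunk below the
 * surrounding critical section and produce apparent time warps. */
#include <stdint.h>

static inline uint64_t fenced_rdtsc(void)
{
	uint32_t lo, hi;

	asm volatile("lfence" ::: "memory");	/* keep RDTSC from moving up */
	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	asm volatile("lfence" ::: "memory");	/* keep later loads behind it */
	return ((uint64_t)hi << 32) | lo;
}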
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 695e426aa354..3909e3ba5ce3 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -58,5 +58,3 @@ EXPORT_SYMBOL(__memcpy);
 EXPORT_SYMBOL(empty_zero_page);
 EXPORT_SYMBOL(init_level4_pgt);
 EXPORT_SYMBOL(load_gs_index);
-
-EXPORT_SYMBOL(_proxy_pda);
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index b13acb75e822..2b54fe002e94 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -159,7 +159,7 @@ int save_i387_xstate(void __user *buf)
  * Restore the extended state if present. Otherwise, restore the FP/SSE
  * state.
  */
-int restore_user_xstate(void __user *buf)
+static int restore_user_xstate(void __user *buf)
 {
 	struct _fpx_sw_bytes fx_sw_user;
 	u64 mask;
@@ -310,7 +310,7 @@ static void __init setup_xstate_init(void)
 /*
  * Enable and initialize the xsave feature.
  */
-void __init xsave_cntxt_init(void)
+void __ref xsave_cntxt_init(void)
 {
 	unsigned int eax, ebx, ecx, edx;
 