aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
author Thomas Gleixner <tglx@linutronix.de> 2007-10-11 05:17:24 -0400
committer Thomas Gleixner <tglx@linutronix.de> 2007-10-11 05:17:24 -0400
commit 250c22777fe1ccd7ac588579a6c16db4c0161cc5 (patch)
tree 55c317efb7d792ec6fdae1d1937c67a502c48dec /arch/x86
parent 2db55d344e529492545cb3b755c7e9ba8e4fa94e (diff)
x86_64: move kernel
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/boot/compressed/head_64.S2
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/Makefile_322
-rw-r--r--arch/x86/kernel/Makefile_6454
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S2
-rw-r--r--arch/x86/kernel/aperture_64.c298
-rw-r--r--arch/x86/kernel/apic_64.c1253
-rw-r--r--arch/x86/kernel/asm-offsets_64.c85
-rw-r--r--arch/x86/kernel/audit_64.c81
-rw-r--r--arch/x86/kernel/bugs_64.c24
-rw-r--r--arch/x86/kernel/crash_64.c135
-rw-r--r--arch/x86/kernel/crash_dump_64.c47
-rw-r--r--arch/x86/kernel/e820_64.c725
-rw-r--r--arch/x86/kernel/early-quirks_64.c127
-rw-r--r--arch/x86/kernel/early_printk.c259
-rw-r--r--arch/x86/kernel/entry_64.S1172
-rw-r--r--arch/x86/kernel/genapic_64.c66
-rw-r--r--arch/x86/kernel/genapic_flat_64.c194
-rw-r--r--arch/x86/kernel/head64.c86
-rw-r--r--arch/x86/kernel/head_64.S416
-rw-r--r--arch/x86/kernel/hpet_64.c493
-rw-r--r--arch/x86/kernel/i387_64.c151
-rw-r--r--arch/x86/kernel/i8259_64.c544
-rw-r--r--arch/x86/kernel/init_task_64.c54
-rw-r--r--arch/x86/kernel/io_apic_64.c2202
-rw-r--r--arch/x86/kernel/ioport_64.c119
-rw-r--r--arch/x86/kernel/irq_64.c213
-rw-r--r--arch/x86/kernel/k8.c123
-rw-r--r--arch/x86/kernel/kprobes_64.c749
-rw-r--r--arch/x86/kernel/ldt_64.c252
-rw-r--r--arch/x86/kernel/machine_kexec_64.c259
-rw-r--r--arch/x86/kernel/mce_64.c875
-rw-r--r--arch/x86/kernel/mce_amd_64.c689
-rw-r--r--arch/x86/kernel/mce_intel_64.c89
-rw-r--r--arch/x86/kernel/module_64.c185
-rw-r--r--arch/x86/kernel/mpparse_64.c852
-rw-r--r--arch/x86/kernel/nmi_64.c483
-rw-r--r--arch/x86/kernel/pci-calgary_64.c1578
-rw-r--r--arch/x86/kernel/pci-dma_64.c346
-rw-r--r--arch/x86/kernel/pci-gart_64.c740
-rw-r--r--arch/x86/kernel/pci-nommu_64.c97
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c44
-rw-r--r--arch/x86/kernel/pmtimer_64.c69
-rw-r--r--arch/x86/kernel/process_64.c903
-rw-r--r--arch/x86/kernel/ptrace_64.c627
-rw-r--r--arch/x86/kernel/reboot_64.c171
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S276
-rw-r--r--arch/x86/kernel/setup64.c289
-rw-r--r--arch/x86/kernel/setup_64.c1117
-rw-r--r--arch/x86/kernel/signal_64.c495
-rw-r--r--arch/x86/kernel/smp_64.c523
-rw-r--r--arch/x86/kernel/smpboot_64.c1085
-rw-r--r--arch/x86/kernel/stacktrace.c54
-rw-r--r--arch/x86/kernel/suspend_64.c239
-rw-r--r--arch/x86/kernel/suspend_asm_64.S110
-rw-r--r--arch/x86/kernel/sys_x86_64.c159
-rw-r--r--arch/x86/kernel/syscall_64.c26
-rw-r--r--arch/x86/kernel/tce_64.c189
-rw-r--r--arch/x86/kernel/time_64.c447
-rw-r--r--arch/x86/kernel/trampoline_64.S166
-rw-r--r--arch/x86/kernel/traps_64.c1138
-rw-r--r--arch/x86/kernel/tsc_64.c207
-rw-r--r--arch/x86/kernel/tsc_sync.c188
-rw-r--r--arch/x86/kernel/verify_cpu_64.S105
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S235
-rw-r--r--arch/x86/kernel/vsmp_64.c49
-rw-r--r--arch/x86/kernel/vsyscall_64.c349
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c62
68 files changed, 25438 insertions, 7 deletions
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index cff3d1dc5dd4..49467640751f 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -174,7 +174,7 @@ no_longmode:
174 hlt 174 hlt
175 jmp 1b 175 jmp 1b
176 176
177#include "../../../x86_64/kernel/verify_cpu_64.S" 177#include "../../kernel/verify_cpu_64.S"
178 178
179 /* Be careful here startup_64 needs to be at a predictable 179 /* Be careful here startup_64 needs to be at a predictable
180 * address so I can export it in an ELF header. Bootloaders 180 * address so I can export it in an ELF header. Bootloaders
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 577d08f4b8bb..45855c97923e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -1,5 +1,5 @@
1ifeq ($(CONFIG_X86_32),y) 1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/kernel/Makefile_32 2include ${srctree}/arch/x86/kernel/Makefile_32
3else 3else
4include ${srctree}/arch/x86_64/kernel/Makefile_64 4include ${srctree}/arch/x86/kernel/Makefile_64
5endif 5endif
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
index 5096f486d389..cb25523026a6 100644
--- a/arch/x86/kernel/Makefile_32
+++ b/arch/x86/kernel/Makefile_32
@@ -83,6 +83,4 @@ $(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \
83 $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE 83 $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE
84 $(call if_changed,syscall) 84 $(call if_changed,syscall)
85 85
86k8-y += ../../x86_64/kernel/k8.o
87stacktrace-y += ../../x86_64/kernel/stacktrace.o
88 86
diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64
new file mode 100644
index 000000000000..6e6b5909e465
--- /dev/null
+++ b/arch/x86/kernel/Makefile_64
@@ -0,0 +1,54 @@
1#
2# Makefile for the linux kernel.
3#
4
# Boot/head objects, the init task and the linker script are listed in extra-y.
5extra-y := head_64.o head64.o init_task_64.o vmlinux.lds
6EXTRA_AFLAGS := -traditional
# Objects that are always part of the 64-bit build.
7obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \
8 ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \
9 x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \
10 setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \
11 pci-dma_64.o pci-nommu_64.o alternative.o hpet_64.o tsc_64.o bugs_64.o \
12 perfctr-watchdog.o
13
# Objects and subdirectories selected by the named CONFIG_ option.
14obj-$(CONFIG_STACKTRACE) += stacktrace.o
15obj-$(CONFIG_X86_MCE) += mce_64.o therm_throt.o
16obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
17obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
18obj-$(CONFIG_MTRR) += ../../x86/kernel/cpu/mtrr/
19obj-$(CONFIG_ACPI) += ../../x86/kernel/acpi/
20obj-$(CONFIG_X86_MSR) += msr.o
21obj-$(CONFIG_MICROCODE) += microcode.o
22obj-$(CONFIG_X86_CPUID) += cpuid.o
23obj-$(CONFIG_SMP) += smp_64.o smpboot_64.o trampoline_64.o tsc_sync.o
24obj-y += apic_64.o nmi_64.o
25obj-y += io_apic_64.o mpparse_64.o genapic_64.o genapic_flat_64.o
26obj-$(CONFIG_KEXEC) += machine_kexec_64.o relocate_kernel_64.o crash_64.o
27obj-$(CONFIG_CRASH_DUMP) += crash_dump_64.o
28obj-$(CONFIG_PM) += suspend_64.o
29obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o
30obj-$(CONFIG_CPU_FREQ) += ../../x86/kernel/cpu/cpufreq/
31obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
32obj-$(CONFIG_IOMMU) += pci-gart_64.o aperture_64.o
33obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
34obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
35obj-$(CONFIG_KPROBES) += kprobes_64.o
36obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
37obj-$(CONFIG_X86_VSMP) += vsmp_64.o
38obj-$(CONFIG_K8_NB) += k8.o
39obj-$(CONFIG_AUDIT) += audit_64.o
40
41obj-$(CONFIG_MODULES) += module_64.o
42obj-$(CONFIG_PCI) += early-quirks_64.o
43
44obj-y += topology.o
45obj-y += intel_cacheinfo.o
46obj-y += addon_cpuid_features.o
47obj-y += pcspeaker.o
48
49CFLAGS_vsyscall_64.o := $(PROFILING) -g0
50
# The objects below are mapped onto sources that live under arch/x86/kernel/cpu/.
51therm_throt-y += ../../x86/kernel/cpu/mcheck/therm_throt.o
52intel_cacheinfo-y += ../../x86/kernel/cpu/intel_cacheinfo.o
53addon_cpuid_features-y += ../../x86/kernel/cpu/addon_cpuid_features.o
54perfctr-watchdog-y += ../../x86/kernel/cpu/perfctr-watchdog.o
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 5e3b3f5496c5..8b4357e1efe0 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -269,7 +269,7 @@ no_longmode:
269 movb $0xbc,%al ; outb %al,$0x80 269 movb $0xbc,%al ; outb %al,$0x80
270 jmp no_longmode 270 jmp no_longmode
271 271
272#include "../../../x86_64/kernel/verify_cpu_64.S" 272#include "../verify_cpu_64.S"
273 273
274/* This code uses an extended set of video mode numbers. These include: 274/* This code uses an extended set of video mode numbers. These include:
275 * Aliases for standard modes 275 * Aliases for standard modes
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
new file mode 100644
index 000000000000..8f681cae7bf7
--- /dev/null
+++ b/arch/x86/kernel/aperture_64.c
@@ -0,0 +1,298 @@
1/*
2 * Firmware replacement code.
3 *
4 * Work around broken BIOSes that don't set an aperture or only set the
5 * aperture in the AGP bridge.
6 * If all fails map the aperture over some low memory. This is cheaper than
7 * doing bounce buffering. The memory is lost. This is done at early boot
8 * because only the bootmem allocator can allocate 32+MB.
9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 */
12#include <linux/kernel.h>
13#include <linux/types.h>
14#include <linux/init.h>
15#include <linux/bootmem.h>
16#include <linux/mmzone.h>
17#include <linux/pci_ids.h>
18#include <linux/pci.h>
19#include <linux/bitops.h>
20#include <linux/ioport.h>
21#include <asm/e820.h>
22#include <asm/io.h>
23#include <asm/iommu.h>
24#include <asm/pci-direct.h>
25#include <asm/dma.h>
26#include <asm/k8.h>
27
28int iommu_aperture;
29int iommu_aperture_disabled __initdata = 0;
30int iommu_aperture_allowed __initdata = 0;
31
32int fallback_aper_order __initdata = 1; /* 64MB */
33int fallback_aper_force __initdata = 0;
34
35int fix_aperture __initdata = 1;
36
37static struct resource gart_resource = {
38 .name = "GART",
39 .flags = IORESOURCE_MEM,
40};
41
42static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
43{
44 gart_resource.start = aper_base;
45 gart_resource.end = aper_base + aper_size - 1;
46 insert_resource(&iomem_resource, &gart_resource);
47}
48
49/* This code runs before the PCI subsystem is initialized, so just
50 access the northbridge directly. */
51
52static u32 __init allocate_aperture(void)
53{
54 u32 aper_size;
55 void *p;
56
57 if (fallback_aper_order > 7)
58 fallback_aper_order = 7;
59 aper_size = (32 * 1024 * 1024) << fallback_aper_order;
60
61 /*
62 * Aperture has to be naturally aligned. This means an 2GB aperture won't
63 * have much chance of finding a place in the lower 4GB of memory.
64 * Unfortunately we cannot move it up because that would make the
65 * IOMMU useless.
66 */
67 p = __alloc_bootmem_nopanic(aper_size, aper_size, 0);
68 if (!p || __pa(p)+aper_size > 0xffffffff) {
69 printk("Cannot allocate aperture memory hole (%p,%uK)\n",
70 p, aper_size>>10);
71 if (p)
72 free_bootmem(__pa(p), aper_size);
73 return 0;
74 }
75 printk("Mapping aperture over %d KB of RAM @ %lx\n",
76 aper_size >> 10, __pa(p));
77 insert_aperture_resource((u32)__pa(p), aper_size);
78 return (u32)__pa(p);
79}
80
81static int __init aperture_valid(u64 aper_base, u32 aper_size)
82{
83 if (!aper_base)
84 return 0;
85 if (aper_size < 64*1024*1024) {
86 printk("Aperture too small (%d MB)\n", aper_size>>20);
87 return 0;
88 }
89 if (aper_base + aper_size > 0x100000000UL) {
90 printk("Aperture beyond 4GB. Ignoring.\n");
91 return 0;
92 }
93 if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
94 printk("Aperture pointing to e820 RAM. Ignoring.\n");
95 return 0;
96 }
97 return 1;
98}
99
100/* Find a PCI capability */
101static __u32 __init find_cap(int num, int slot, int func, int cap)
102{
103 u8 pos;
104 int bytes;
105 if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST))
106 return 0;
107 pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST);
108 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
109 u8 id;
110 pos &= ~3;
111 id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID);
112 if (id == 0xff)
113 break;
114 if (id == cap)
115 return pos;
116 pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT);
117 }
118 return 0;
119}
120
121/* Read a standard AGPv3 bridge header */
122static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
123{
124 u32 apsize;
125 u32 apsizereg;
126 int nbits;
127 u32 aper_low, aper_hi;
128 u64 aper;
129
130 printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func);
131 apsizereg = read_pci_config_16(num,slot,func, cap + 0x14);
132 if (apsizereg == 0xffffffff) {
133 printk("APSIZE in AGP bridge unreadable\n");
134 return 0;
135 }
136
137 apsize = apsizereg & 0xfff;
138 /* Some BIOS use weird encodings not in the AGPv3 table. */
139 if (apsize & 0xff)
140 apsize |= 0xf00;
141 nbits = hweight16(apsize);
142 *order = 7 - nbits;
143 if ((int)*order < 0) /* < 32MB */
144 *order = 0;
145
146 aper_low = read_pci_config(num,slot,func, 0x10);
147 aper_hi = read_pci_config(num,slot,func,0x14);
148 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
149
150 printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
151 aper, 32 << *order, apsizereg);
152
153 if (!aperture_valid(aper, (32*1024*1024) << *order))
154 return 0;
155 return (u32)aper;
156}
157
158/* Look for an AGP bridge. Windows only expects the aperture in the
159 AGP bridge and some BIOS forget to initialize the Northbridge too.
160 Work around this here.
161
162 Do an PCI bus scan by hand because we're running before the PCI
163 subsystem.
164
165 All K8 AGP bridges are AGPv3 compliant, so we can do this scan
166 generically. It's probably overkill to always scan all slots because
167 the AGP bridges should be always an own bus on the HT hierarchy,
168 but do it here for future safety. */
169static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
170{
171 int num, slot, func;
172
173 /* Poor man's PCI discovery */
174 for (num = 0; num < 256; num++) {
175 for (slot = 0; slot < 32; slot++) {
176 for (func = 0; func < 8; func++) {
177 u32 class, cap;
178 u8 type;
179 class = read_pci_config(num,slot,func,
180 PCI_CLASS_REVISION);
181 if (class == 0xffffffff)
182 break;
183
184 switch (class >> 16) {
185 case PCI_CLASS_BRIDGE_HOST:
186 case PCI_CLASS_BRIDGE_OTHER: /* needed? */
187 /* AGP bridge? */
188 cap = find_cap(num,slot,func,PCI_CAP_ID_AGP);
189 if (!cap)
190 break;
191 *valid_agp = 1;
192 return read_agp(num,slot,func,cap,order);
193 }
194
195 /* No multi-function device? */
196 type = read_pci_config_byte(num,slot,func,
197 PCI_HEADER_TYPE);
198 if (!(type & 0x80))
199 break;
200 }
201 }
202 }
203 printk("No AGP bridge found\n");
204 return 0;
205}
206
207void __init iommu_hole_init(void)
208{
209 int fix, num;
210 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
211 u64 aper_base, last_aper_base = 0;
212 int valid_agp = 0;
213
214 if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed())
215 return;
216
217 printk(KERN_INFO "Checking aperture...\n");
218
219 fix = 0;
220 for (num = 24; num < 32; num++) {
221 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
222 continue;
223
224 iommu_detected = 1;
225 iommu_aperture = 1;
226
227 aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
228 aper_size = (32 * 1024 * 1024) << aper_order;
229 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
230 aper_base <<= 25;
231
232 printk("CPU %d: aperture @ %Lx size %u MB\n", num-24,
233 aper_base, aper_size>>20);
234
235 if (!aperture_valid(aper_base, aper_size)) {
236 fix = 1;
237 break;
238 }
239
240 if ((last_aper_order && aper_order != last_aper_order) ||
241 (last_aper_base && aper_base != last_aper_base)) {
242 fix = 1;
243 break;
244 }
245 last_aper_order = aper_order;
246 last_aper_base = aper_base;
247 }
248
249 if (!fix && !fallback_aper_force) {
250 if (last_aper_base) {
251 unsigned long n = (32 * 1024 * 1024) << last_aper_order;
252 insert_aperture_resource((u32)last_aper_base, n);
253 }
254 return;
255 }
256
257 if (!fallback_aper_force)
258 aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
259
260 if (aper_alloc) {
261 /* Got the aperture from the AGP bridge */
262 } else if (swiotlb && !valid_agp) {
263 /* Do nothing */
264 } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) ||
265 force_iommu ||
266 valid_agp ||
267 fallback_aper_force) {
268 printk("Your BIOS doesn't leave a aperture memory hole\n");
269 printk("Please enable the IOMMU option in the BIOS setup\n");
270 printk("This costs you %d MB of RAM\n",
271 32 << fallback_aper_order);
272
273 aper_order = fallback_aper_order;
274 aper_alloc = allocate_aperture();
275 if (!aper_alloc) {
276 /* Could disable AGP and IOMMU here, but it's probably
277 not worth it. But the later users cannot deal with
278 bad apertures and turning on the aperture over memory
279 causes very strange problems, so it's better to
280 panic early. */
281 panic("Not enough memory for aperture");
282 }
283 } else {
284 return;
285 }
286
287 /* Fix up the north bridges */
288 for (num = 24; num < 32; num++) {
289 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
290 continue;
291
292 /* Don't enable translation yet. That is done later.
293 Assume this BIOS didn't initialise the GART so
294 just overwrite all previous bits */
295 write_pci_config(0, num, 3, 0x90, aper_order<<1);
296 write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
297 }
298}
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
new file mode 100644
index 000000000000..925758dbca0c
--- /dev/null
+++ b/arch/x86/kernel/apic_64.c
@@ -0,0 +1,1253 @@
1/*
2 * Local APIC handling, local APIC timers
3 *
4 * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes
7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
8 * thanks to Eric Gilmore
9 * and Rolf G. Tews
10 * for testing these extensively.
11 * Maciej W. Rozycki : Various updates and fixes.
12 * Mikael Pettersson : Power Management for UP-APIC.
13 * Pavel Machek and
14 * Mikael Pettersson : PM converted to driver model.
15 */
16
17#include <linux/init.h>
18
19#include <linux/mm.h>
20#include <linux/delay.h>
21#include <linux/bootmem.h>
22#include <linux/interrupt.h>
23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h>
26#include <linux/module.h>
27#include <linux/ioport.h>
28
29#include <asm/atomic.h>
30#include <asm/smp.h>
31#include <asm/mtrr.h>
32#include <asm/mpspec.h>
33#include <asm/pgalloc.h>
34#include <asm/mach_apic.h>
35#include <asm/nmi.h>
36#include <asm/idle.h>
37#include <asm/proto.h>
38#include <asm/timex.h>
39#include <asm/hpet.h>
40#include <asm/apic.h>
41
42int apic_mapped;
43int apic_verbosity;
44int apic_runs_main_timer;
45int apic_calibrate_pmtmr __initdata;
46
47int disable_apic_timer __initdata;
48
49/* Local APIC timer works in C2? */
50int local_apic_timer_c2_ok;
51EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
52
53static struct resource *ioapic_resources;
54static struct resource lapic_resource = {
55 .name = "Local APIC",
56 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
57};
58
59/*
60 * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
61 * IPIs in place of local APIC timers
62 */
63static cpumask_t timer_interrupt_broadcast_ipi_mask;
64
65/* Using APIC to generate smp_local_timer_interrupt? */
66int using_apic_timer __read_mostly = 0;
67
68static void apic_pm_activate(void);
69
70void apic_wait_icr_idle(void)
71{
72 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
73 cpu_relax();
74}
75
76unsigned int safe_apic_wait_icr_idle(void)
77{
78 unsigned int send_status;
79 int timeout;
80
81 timeout = 0;
82 do {
83 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
84 if (!send_status)
85 break;
86 udelay(100);
87 } while (timeout++ < 1000);
88
89 return send_status;
90}
91
92void enable_NMI_through_LVT0 (void * dummy)
93{
94 unsigned int v;
95
96 /* unmask and set to NMI */
97 v = APIC_DM_NMI;
98 apic_write(APIC_LVT0, v);
99}
100
101int get_maxlvt(void)
102{
103 unsigned int v, maxlvt;
104
105 v = apic_read(APIC_LVR);
106 maxlvt = GET_APIC_MAXLVT(v);
107 return maxlvt;
108}
109
110/*
111 * 'what should we do if we get a hw irq event on an illegal vector'.
112 * each architecture has to answer this themselves.
113 */
114void ack_bad_irq(unsigned int irq)
115{
116 printk("unexpected IRQ trap at vector %02x\n", irq);
117 /*
118 * Currently unexpected vectors happen only on SMP and APIC.
119 * We _must_ ack these because every local APIC has only N
120 * irq slots per priority level, and a 'hanging, unacked' IRQ
121 * holds up an irq slot - in excessive cases (when multiple
122 * unexpected vectors occur) that might lock up the APIC
123 * completely.
124 * But don't ack when the APIC is disabled. -AK
125 */
126 if (!disable_apic)
127 ack_APIC_irq();
128}
129
130void clear_local_APIC(void)
131{
132 int maxlvt;
133 unsigned int v;
134
135 maxlvt = get_maxlvt();
136
137 /*
138 * Masking an LVT entry can trigger a local APIC error
139 * if the vector is zero. Mask LVTERR first to prevent this.
140 */
141 if (maxlvt >= 3) {
142 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
143 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
144 }
145 /*
146 * Careful: we have to set masks only first to deassert
147 * any level-triggered sources.
148 */
149 v = apic_read(APIC_LVTT);
150 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
151 v = apic_read(APIC_LVT0);
152 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
153 v = apic_read(APIC_LVT1);
154 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
155 if (maxlvt >= 4) {
156 v = apic_read(APIC_LVTPC);
157 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
158 }
159
160 /*
161 * Clean APIC state for other OSs:
162 */
163 apic_write(APIC_LVTT, APIC_LVT_MASKED);
164 apic_write(APIC_LVT0, APIC_LVT_MASKED);
165 apic_write(APIC_LVT1, APIC_LVT_MASKED);
166 if (maxlvt >= 3)
167 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
168 if (maxlvt >= 4)
169 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
170 apic_write(APIC_ESR, 0);
171 apic_read(APIC_ESR);
172}
173
174void disconnect_bsp_APIC(int virt_wire_setup)
175{
176 /* Go back to Virtual Wire compatibility mode */
177 unsigned long value;
178
179 /* For the spurious interrupt use vector F, and enable it */
180 value = apic_read(APIC_SPIV);
181 value &= ~APIC_VECTOR_MASK;
182 value |= APIC_SPIV_APIC_ENABLED;
183 value |= 0xf;
184 apic_write(APIC_SPIV, value);
185
186 if (!virt_wire_setup) {
187 /* For LVT0 make it edge triggered, active high, external and enabled */
188 value = apic_read(APIC_LVT0);
189 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
190 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
191 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
192 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
193 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
194 apic_write(APIC_LVT0, value);
195 } else {
196 /* Disable LVT0 */
197 apic_write(APIC_LVT0, APIC_LVT_MASKED);
198 }
199
200 /* For LVT1 make it edge triggered, active high, nmi and enabled */
201 value = apic_read(APIC_LVT1);
202 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
203 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
204 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
205 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
206 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
207 apic_write(APIC_LVT1, value);
208}
209
210void disable_local_APIC(void)
211{
212 unsigned int value;
213
214 clear_local_APIC();
215
216 /*
217 * Disable APIC (implies clearing of registers
218 * for 82489DX!).
219 */
220 value = apic_read(APIC_SPIV);
221 value &= ~APIC_SPIV_APIC_ENABLED;
222 apic_write(APIC_SPIV, value);
223}
224
225/*
226 * This is to verify that we're looking at a real local APIC.
227 * Check these against your board if the CPUs aren't getting
228 * started for no apparent reason.
229 */
230int __init verify_local_APIC(void)
231{
232 unsigned int reg0, reg1;
233
234 /*
235 * The version register is read-only in a real APIC.
236 */
237 reg0 = apic_read(APIC_LVR);
238 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
239 apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
240 reg1 = apic_read(APIC_LVR);
241 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
242
243 /*
244 * The two version reads above should print the same
245 * numbers. If the second one is different, then we
246 * poke at a non-APIC.
247 */
248 if (reg1 != reg0)
249 return 0;
250
251 /*
252 * Check if the version looks reasonably.
253 */
254 reg1 = GET_APIC_VERSION(reg0);
255 if (reg1 == 0x00 || reg1 == 0xff)
256 return 0;
257 reg1 = get_maxlvt();
258 if (reg1 < 0x02 || reg1 == 0xff)
259 return 0;
260
261 /*
262 * The ID register is read/write in a real APIC.
263 */
264 reg0 = apic_read(APIC_ID);
265 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
266 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
267 reg1 = apic_read(APIC_ID);
268 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
269 apic_write(APIC_ID, reg0);
270 if (reg1 != (reg0 ^ APIC_ID_MASK))
271 return 0;
272
273 /*
274 * The next two are just to see if we have sane values.
275 * They're only really relevant if we're in Virtual Wire
276 * compatibility mode, but most boxes are anymore.
277 */
278 reg0 = apic_read(APIC_LVT0);
279 apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0);
280 reg1 = apic_read(APIC_LVT1);
281 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
282
283 return 1;
284}
285
286void __init sync_Arb_IDs(void)
287{
288 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
289 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
290 if (ver >= 0x14) /* P4 or higher */
291 return;
292
293 /*
294 * Wait for idle.
295 */
296 apic_wait_icr_idle();
297
298 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
299 apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
300 | APIC_DM_INIT);
301}
302
303/*
304 * An initial setup of the virtual wire mode.
305 */
306void __init init_bsp_APIC(void)
307{
308 unsigned int value;
309
310 /*
311 * Don't do the setup now if we have a SMP BIOS as the
312 * through-I/O-APIC virtual wire mode might be active.
313 */
314 if (smp_found_config || !cpu_has_apic)
315 return;
316
317 value = apic_read(APIC_LVR);
318
319 /*
320 * Do not trust the local APIC being empty at bootup.
321 */
322 clear_local_APIC();
323
324 /*
325 * Enable APIC.
326 */
327 value = apic_read(APIC_SPIV);
328 value &= ~APIC_VECTOR_MASK;
329 value |= APIC_SPIV_APIC_ENABLED;
330 value |= APIC_SPIV_FOCUS_DISABLED;
331 value |= SPURIOUS_APIC_VECTOR;
332 apic_write(APIC_SPIV, value);
333
334 /*
335 * Set up the virtual wire mode.
336 */
337 apic_write(APIC_LVT0, APIC_DM_EXTINT);
338 value = APIC_DM_NMI;
339 apic_write(APIC_LVT1, value);
340}
341
342void __cpuinit setup_local_APIC (void)
343{
344 unsigned int value, maxlvt;
345 int i, j;
346
347 value = apic_read(APIC_LVR);
348
349 BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
350
351 /*
352 * Double-check whether this APIC is really registered.
353 * This is meaningless in clustered apic mode, so we skip it.
354 */
355 if (!apic_id_registered())
356 BUG();
357
358 /*
359 * Intel recommends to set DFR, LDR and TPR before enabling
360 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
361 * document number 292116). So here it goes...
362 */
363 init_apic_ldr();
364
365 /*
366 * Set Task Priority to 'accept all'. We never change this
367 * later on.
368 */
369 value = apic_read(APIC_TASKPRI);
370 value &= ~APIC_TPRI_MASK;
371 apic_write(APIC_TASKPRI, value);
372
373 /*
374 * After a crash, we no longer service the interrupts and a pending
375 * interrupt from previous kernel might still have ISR bit set.
376 *
377 * Most probably by now CPU has serviced that pending interrupt and
378 * it might not have done the ack_APIC_irq() because it thought,
379 * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
380 * does not clear the ISR bit and cpu thinks it has already serivced
381 * the interrupt. Hence a vector might get locked. It was noticed
382 * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
383 */
384 for (i = APIC_ISR_NR - 1; i >= 0; i--) {
385 value = apic_read(APIC_ISR + i*0x10);
386 for (j = 31; j >= 0; j--) {
387 if (value & (1<<j))
388 ack_APIC_irq();
389 }
390 }
391
392 /*
393 * Now that we are all set up, enable the APIC
394 */
395 value = apic_read(APIC_SPIV);
396 value &= ~APIC_VECTOR_MASK;
397 /*
398 * Enable APIC
399 */
400 value |= APIC_SPIV_APIC_ENABLED;
401
402 /* We always use processor focus */
403
404 /*
405 * Set spurious IRQ vector
406 */
407 value |= SPURIOUS_APIC_VECTOR;
408 apic_write(APIC_SPIV, value);
409
410 /*
411 * Set up LVT0, LVT1:
412 *
413 * set up through-local-APIC on the BP's LINT0. This is not
414 * strictly necessary in pure symmetric-IO mode, but sometimes
415 * we delegate interrupts to the 8259A.
416 */
417 /*
418 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
419 */
420 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
421 if (!smp_processor_id() && !value) {
422 value = APIC_DM_EXTINT;
423 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id());
424 } else {
425 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
426 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id());
427 }
428 apic_write(APIC_LVT0, value);
429
430 /*
431 * only the BP should see the LINT1 NMI signal, obviously.
432 */
433 if (!smp_processor_id())
434 value = APIC_DM_NMI;
435 else
436 value = APIC_DM_NMI | APIC_LVT_MASKED;
437 apic_write(APIC_LVT1, value);
438
439 {
440 unsigned oldvalue;
441 maxlvt = get_maxlvt();
442 oldvalue = apic_read(APIC_ESR);
443 value = ERROR_APIC_VECTOR; // enables sending errors
444 apic_write(APIC_LVTERR, value);
445 /*
446 * spec says clear errors after enabling vector.
447 */
448 if (maxlvt > 3)
449 apic_write(APIC_ESR, 0);
450 value = apic_read(APIC_ESR);
451 if (value != oldvalue)
452 apic_printk(APIC_VERBOSE,
453 "ESR value after enabling vector: %08x, after %08x\n",
454 oldvalue, value);
455 }
456
457 nmi_watchdog_default();
458 setup_apic_nmi_watchdog(NULL);
459 apic_pm_activate();
460}
461
462#ifdef CONFIG_PM
463
/* Local APIC register contents preserved across suspend/resume. */
static struct {
	/*
	 * 'active' is true if the local APIC was enabled by us and not
	 * the BIOS; in that case we are also responsible for disabling
	 * it before entering apm/acpi suspend.
	 */
	int active;

	/* Read/write APIC registers, saved by lapic_suspend(). */
	unsigned int apic_id, apic_taskpri, apic_ldr, apic_dfr, apic_spiv;
	unsigned int apic_lvtt, apic_lvtpc, apic_lvt0, apic_lvt1, apic_lvterr;
	unsigned int apic_tmict, apic_tdcr;
	unsigned int apic_thmr;
} apic_pm_state;
484
485static int lapic_suspend(struct sys_device *dev, pm_message_t state)
486{
487 unsigned long flags;
488 int maxlvt;
489
490 if (!apic_pm_state.active)
491 return 0;
492
493 maxlvt = get_maxlvt();
494
495 apic_pm_state.apic_id = apic_read(APIC_ID);
496 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
497 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
498 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
499 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
500 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
501 if (maxlvt >= 4)
502 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
503 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
504 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
505 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
506 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
507 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
508#ifdef CONFIG_X86_MCE_INTEL
509 if (maxlvt >= 5)
510 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
511#endif
512 local_irq_save(flags);
513 disable_local_APIC();
514 local_irq_restore(flags);
515 return 0;
516}
517
518static int lapic_resume(struct sys_device *dev)
519{
520 unsigned int l, h;
521 unsigned long flags;
522 int maxlvt;
523
524 if (!apic_pm_state.active)
525 return 0;
526
527 maxlvt = get_maxlvt();
528
529 local_irq_save(flags);
530 rdmsr(MSR_IA32_APICBASE, l, h);
531 l &= ~MSR_IA32_APICBASE_BASE;
532 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
533 wrmsr(MSR_IA32_APICBASE, l, h);
534 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
535 apic_write(APIC_ID, apic_pm_state.apic_id);
536 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
537 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
538 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
539 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
540 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
541 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
542#ifdef CONFIG_X86_MCE_INTEL
543 if (maxlvt >= 5)
544 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
545#endif
546 if (maxlvt >= 4)
547 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
548 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
549 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
550 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
551 apic_write(APIC_ESR, 0);
552 apic_read(APIC_ESR);
553 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
554 apic_write(APIC_ESR, 0);
555 apic_read(APIC_ESR);
556 local_irq_restore(flags);
557 return 0;
558}
559
560static struct sysdev_class lapic_sysclass = {
561 set_kset_name("lapic"),
562 .resume = lapic_resume,
563 .suspend = lapic_suspend,
564};
565
566static struct sys_device device_lapic = {
567 .id = 0,
568 .cls = &lapic_sysclass,
569};
570
571static void __cpuinit apic_pm_activate(void)
572{
573 apic_pm_state.active = 1;
574}
575
576static int __init init_lapic_sysfs(void)
577{
578 int error;
579 if (!cpu_has_apic)
580 return 0;
581 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
582 error = sysdev_class_register(&lapic_sysclass);
583 if (!error)
584 error = sysdev_register(&device_lapic);
585 return error;
586}
587device_initcall(init_lapic_sysfs);
588
589#else /* CONFIG_PM */
590
/* Without CONFIG_PM there is no saved APIC state to activate. */
static void apic_pm_activate(void)
{
}
592
593#endif /* CONFIG_PM */
594
595static int __init apic_set_verbosity(char *str)
596{
597 if (str == NULL) {
598 skip_ioapic_setup = 0;
599 ioapic_force = 1;
600 return 0;
601 }
602 if (strcmp("debug", str) == 0)
603 apic_verbosity = APIC_DEBUG;
604 else if (strcmp("verbose", str) == 0)
605 apic_verbosity = APIC_VERBOSE;
606 else {
607 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
608 " use apic=verbose or apic=debug\n", str);
609 return -EINVAL;
610 }
611
612 return 0;
613}
614early_param("apic", apic_set_verbosity);
615
616/*
617 * Detect and enable local APICs on non-SMP boards.
618 * Original code written by Keir Fraser.
619 * On AMD64 we trust the BIOS - if it says no APIC it is likely
620 * not correctly set up (usually the APIC timer won't work etc.)
621 */
622
623static int __init detect_init_APIC (void)
624{
625 if (!cpu_has_apic) {
626 printk(KERN_INFO "No local APIC present\n");
627 return -1;
628 }
629
630 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
631 boot_cpu_id = 0;
632 return 0;
633}
634
635#ifdef CONFIG_X86_IO_APIC
636static struct resource * __init ioapic_setup_resources(void)
637{
638#define IOAPIC_RESOURCE_NAME_SIZE 11
639 unsigned long n;
640 struct resource *res;
641 char *mem;
642 int i;
643
644 if (nr_ioapics <= 0)
645 return NULL;
646
647 n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
648 n *= nr_ioapics;
649
650 mem = alloc_bootmem(n);
651 res = (void *)mem;
652
653 if (mem != NULL) {
654 memset(mem, 0, n);
655 mem += sizeof(struct resource) * nr_ioapics;
656
657 for (i = 0; i < nr_ioapics; i++) {
658 res[i].name = mem;
659 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
660 sprintf(mem, "IOAPIC %u", i);
661 mem += IOAPIC_RESOURCE_NAME_SIZE;
662 }
663 }
664
665 ioapic_resources = res;
666
667 return res;
668}
669
670static int __init ioapic_insert_resources(void)
671{
672 int i;
673 struct resource *r = ioapic_resources;
674
675 if (!r) {
676 printk("IO APIC resources could be not be allocated.\n");
677 return -1;
678 }
679
680 for (i = 0; i < nr_ioapics; i++) {
681 insert_resource(&iomem_resource, r);
682 r++;
683 }
684
685 return 0;
686}
687
688/* Insert the IO APIC resources after PCI initialization has occurred to handle
689 * IO APICs that are mapped in on a BAR in PCI space. */
690late_initcall(ioapic_insert_resources);
691#endif
692
693void __init init_apic_mappings(void)
694{
695 unsigned long apic_phys;
696
697 /*
698 * If no local APIC can be found then set up a fake all
699 * zeroes page to simulate the local APIC and another
700 * one for the IO-APIC.
701 */
702 if (!smp_found_config && detect_init_APIC()) {
703 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
704 apic_phys = __pa(apic_phys);
705 } else
706 apic_phys = mp_lapic_addr;
707
708 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
709 apic_mapped = 1;
710 apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
711
712 /* Put local APIC into the resource map. */
713 lapic_resource.start = apic_phys;
714 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
715 insert_resource(&iomem_resource, &lapic_resource);
716
717 /*
718 * Fetch the APIC ID of the BSP in case we have a
719 * default configuration (or the MP table is broken).
720 */
721 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
722
723 {
724 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
725 int i;
726 struct resource *ioapic_res;
727
728 ioapic_res = ioapic_setup_resources();
729 for (i = 0; i < nr_ioapics; i++) {
730 if (smp_found_config) {
731 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
732 } else {
733 ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
734 ioapic_phys = __pa(ioapic_phys);
735 }
736 set_fixmap_nocache(idx, ioapic_phys);
737 apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
738 __fix_to_virt(idx), ioapic_phys);
739 idx++;
740
741 if (ioapic_res != NULL) {
742 ioapic_res->start = ioapic_phys;
743 ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
744 ioapic_res++;
745 }
746 }
747 }
748}
749
750/*
751 * This function sets up the local APIC timer, with a timeout of
752 * 'clocks' APIC bus clock. During calibration we actually call
753 * this function twice on the boot CPU, once with a bogus timeout
754 * value, second time for real. The other (noncalibrating) CPUs
755 * call this function only once, with the real, calibrated value.
756 *
757 * We do reads before writes even if unnecessary, to get around the
758 * P5 APIC double write bug.
759 */
760
761#define APIC_DIVISOR 16
762
763static void __setup_APIC_LVTT(unsigned int clocks)
764{
765 unsigned int lvtt_value, tmp_value;
766 int cpu = smp_processor_id();
767
768 lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
769
770 if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask))
771 lvtt_value |= APIC_LVT_MASKED;
772
773 apic_write(APIC_LVTT, lvtt_value);
774
775 /*
776 * Divide PICLK by 16
777 */
778 tmp_value = apic_read(APIC_TDCR);
779 apic_write(APIC_TDCR, (tmp_value
780 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
781 | APIC_TDR_DIV_16);
782
783 apic_write(APIC_TMICT, clocks/APIC_DIVISOR);
784}
785
786static void setup_APIC_timer(unsigned int clocks)
787{
788 unsigned long flags;
789
790 local_irq_save(flags);
791
792 /* wait for irq slice */
793 if (hpet_address && hpet_use_timer) {
794 u32 trigger = hpet_readl(HPET_T0_CMP);
795 while (hpet_readl(HPET_T0_CMP) == trigger)
796 /* do nothing */ ;
797 } else {
798 int c1, c2;
799 outb_p(0x00, 0x43);
800 c2 = inb_p(0x40);
801 c2 |= inb_p(0x40) << 8;
802 do {
803 c1 = c2;
804 outb_p(0x00, 0x43);
805 c2 = inb_p(0x40);
806 c2 |= inb_p(0x40) << 8;
807 } while (c2 - c1 < 300);
808 }
809 __setup_APIC_LVTT(clocks);
810 /* Turn off PIT interrupt if we use APIC timer as main timer.
811 Only works with the PM timer right now
812 TBD fix it for HPET too. */
813 if ((pmtmr_ioport != 0) &&
814 smp_processor_id() == boot_cpu_id &&
815 apic_runs_main_timer == 1 &&
816 !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) {
817 stop_timer_interrupt();
818 apic_runs_main_timer++;
819 }
820 local_irq_restore(flags);
821}
822
823/*
824 * In this function we calibrate APIC bus clocks to the external
825 * timer. Unfortunately we cannot use jiffies and the timer irq
826 * to calibrate, since some later bootup code depends on getting
827 * the first irq? Ugh.
828 *
829 * We want to do the calibration only once since we
830 * want to have local timer irqs synchronized. CPUs connected
831 * by the same APIC bus have the very same bus frequency.
832 * And we want to have irqs off anyways, no accidental
833 * APIC irq that way.
834 */
835
836#define TICK_COUNT 100000000
837
838static int __init calibrate_APIC_clock(void)
839{
840 unsigned apic, apic_start;
841 unsigned long tsc, tsc_start;
842 int result;
843 /*
844 * Put whatever arbitrary (but long enough) timeout
845 * value into the APIC clock, we just want to get the
846 * counter running for calibration.
847 */
848 __setup_APIC_LVTT(4000000000);
849
850 apic_start = apic_read(APIC_TMCCT);
851#ifdef CONFIG_X86_PM_TIMER
852 if (apic_calibrate_pmtmr && pmtmr_ioport) {
853 pmtimer_wait(5000); /* 5ms wait */
854 apic = apic_read(APIC_TMCCT);
855 result = (apic_start - apic) * 1000L / 5;
856 } else
857#endif
858 {
859 rdtscll(tsc_start);
860
861 do {
862 apic = apic_read(APIC_TMCCT);
863 rdtscll(tsc);
864 } while ((tsc - tsc_start) < TICK_COUNT &&
865 (apic_start - apic) < TICK_COUNT);
866
867 result = (apic_start - apic) * 1000L * tsc_khz /
868 (tsc - tsc_start);
869 }
870 printk("result %d\n", result);
871
872
873 printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
874 result / 1000 / 1000, result / 1000 % 1000);
875
876 return result * APIC_DIVISOR / HZ;
877}
878
879static unsigned int calibration_result;
880
881void __init setup_boot_APIC_clock (void)
882{
883 if (disable_apic_timer) {
884 printk(KERN_INFO "Disabling APIC timer\n");
885 return;
886 }
887
888 printk(KERN_INFO "Using local APIC timer interrupts.\n");
889 using_apic_timer = 1;
890
891 local_irq_disable();
892
893 calibration_result = calibrate_APIC_clock();
894 /*
895 * Now set up the timer for real.
896 */
897 setup_APIC_timer(calibration_result);
898
899 local_irq_enable();
900}
901
902void __cpuinit setup_secondary_APIC_clock(void)
903{
904 local_irq_disable(); /* FIXME: Do we need this? --RR */
905 setup_APIC_timer(calibration_result);
906 local_irq_enable();
907}
908
909void disable_APIC_timer(void)
910{
911 if (using_apic_timer) {
912 unsigned long v;
913
914 v = apic_read(APIC_LVTT);
915 /*
916 * When an illegal vector value (0-15) is written to an LVT
917 * entry and delivery mode is Fixed, the APIC may signal an
918 * illegal vector error, with out regard to whether the mask
919 * bit is set or whether an interrupt is actually seen on input.
920 *
921 * Boot sequence might call this function when the LVTT has
922 * '0' vector value. So make sure vector field is set to
923 * valid value.
924 */
925 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
926 apic_write(APIC_LVTT, v);
927 }
928}
929
930void enable_APIC_timer(void)
931{
932 int cpu = smp_processor_id();
933
934 if (using_apic_timer &&
935 !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
936 unsigned long v;
937
938 v = apic_read(APIC_LVTT);
939 apic_write(APIC_LVTT, v & ~APIC_LVT_MASKED);
940 }
941}
942
943void switch_APIC_timer_to_ipi(void *cpumask)
944{
945 cpumask_t mask = *(cpumask_t *)cpumask;
946 int cpu = smp_processor_id();
947
948 if (cpu_isset(cpu, mask) &&
949 !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
950 disable_APIC_timer();
951 cpu_set(cpu, timer_interrupt_broadcast_ipi_mask);
952 }
953}
954EXPORT_SYMBOL(switch_APIC_timer_to_ipi);
955
956void smp_send_timer_broadcast_ipi(void)
957{
958 int cpu = smp_processor_id();
959 cpumask_t mask;
960
961 cpus_and(mask, cpu_online_map, timer_interrupt_broadcast_ipi_mask);
962
963 if (cpu_isset(cpu, mask)) {
964 cpu_clear(cpu, mask);
965 add_pda(apic_timer_irqs, 1);
966 smp_local_timer_interrupt();
967 }
968
969 if (!cpus_empty(mask)) {
970 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
971 }
972}
973
974void switch_ipi_to_APIC_timer(void *cpumask)
975{
976 cpumask_t mask = *(cpumask_t *)cpumask;
977 int cpu = smp_processor_id();
978
979 if (cpu_isset(cpu, mask) &&
980 cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
981 cpu_clear(cpu, timer_interrupt_broadcast_ipi_mask);
982 enable_APIC_timer();
983 }
984}
985EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
986
/* Changing the profiling multiplier is not supported: always -EINVAL. */
int setup_profiling_timer(unsigned int multiplier)
{
	int rc = -EINVAL;

	return rc;
}
991
992void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector,
993 unsigned char msg_type, unsigned char mask)
994{
995 unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE;
996 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
997 apic_write(reg, v);
998}
999
1000#undef APIC_DIVISOR
1001
1002/*
1003 * Local timer interrupt handler. It does both profiling and
1004 * process statistics/rescheduling.
1005 *
1006 * We do profiling in every local tick, statistics/rescheduling
1007 * happen only every 'profiling multiplier' ticks. The default
1008 * multiplier is 1 and it can be changed by writing the new multiplier
1009 * value into /proc/profile.
1010 */
1011
1012void smp_local_timer_interrupt(void)
1013{
1014 profile_tick(CPU_PROFILING);
1015#ifdef CONFIG_SMP
1016 update_process_times(user_mode(get_irq_regs()));
1017#endif
1018 if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id)
1019 main_timer_handler();
1020 /*
1021 * We take the 'long' return path, and there every subsystem
1022 * grabs the appropriate locks (kernel lock/ irq lock).
1023 *
1024 * We might want to decouple profiling from the 'long path',
1025 * and do the profiling totally in assembly.
1026 *
1027 * Currently this isn't too much of an issue (performance wise),
1028 * we can take more than 100K local irqs per second on a 100 MHz P5.
1029 */
1030}
1031
1032/*
1033 * Local APIC timer interrupt. This is the most natural way for doing
1034 * local interrupts, but local timer interrupts can be emulated by
1035 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
1036 *
1037 * [ if a single-CPU system runs an SMP kernel then we call the local
1038 * interrupt as well. Thus we cannot inline the local irq ... ]
1039 */
1040void smp_apic_timer_interrupt(struct pt_regs *regs)
1041{
1042 struct pt_regs *old_regs = set_irq_regs(regs);
1043
1044 /*
1045 * the NMI deadlock-detector uses this.
1046 */
1047 add_pda(apic_timer_irqs, 1);
1048
1049 /*
1050 * NOTE! We'd better ACK the irq immediately,
1051 * because timer handling can be slow.
1052 */
1053 ack_APIC_irq();
1054 /*
1055 * update_process_times() expects us to have done irq_enter().
1056 * Besides, if we don't timer interrupts ignore the global
1057 * interrupt lock, which is the WrongThing (tm) to do.
1058 */
1059 exit_idle();
1060 irq_enter();
1061 smp_local_timer_interrupt();
1062 irq_exit();
1063 set_irq_regs(old_regs);
1064}
1065
1066/*
1067 * apic_is_clustered_box() -- Check if we can expect good TSC
1068 *
1069 * Thus far, the major user of this is IBM's Summit2 series:
1070 *
1071 * Clustered boxes may have unsynced TSC problems if they are
1072 * multi-chassis. Use available data to take a good guess.
1073 * If in doubt, go HPET.
1074 */
1075__cpuinit int apic_is_clustered_box(void)
1076{
1077 int i, clusters, zeros;
1078 unsigned id;
1079 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
1080
1081 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
1082
1083 for (i = 0; i < NR_CPUS; i++) {
1084 id = bios_cpu_apicid[i];
1085 if (id != BAD_APICID)
1086 __set_bit(APIC_CLUSTERID(id), clustermap);
1087 }
1088
1089 /* Problem: Partially populated chassis may not have CPUs in some of
1090 * the APIC clusters they have been allocated. Only present CPUs have
1091 * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since
1092 * clusters are allocated sequentially, count zeros only if they are
1093 * bounded by ones.
1094 */
1095 clusters = 0;
1096 zeros = 0;
1097 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
1098 if (test_bit(i, clustermap)) {
1099 clusters += 1 + zeros;
1100 zeros = 0;
1101 } else
1102 ++zeros;
1103 }
1104
1105 /*
1106 * If clusters > 2, then should be multi-chassis.
1107 * May have to revisit this when multi-core + hyperthreaded CPUs come
1108 * out, but AFAIK this will work even for them.
1109 */
1110 return (clusters > 2);
1111}
1112
1113/*
1114 * This interrupt should _never_ happen with our APIC/SMP architecture
1115 */
1116asmlinkage void smp_spurious_interrupt(void)
1117{
1118 unsigned int v;
1119 exit_idle();
1120 irq_enter();
1121 /*
1122 * Check if this really is a spurious interrupt and ACK it
1123 * if it is a vectored one. Just in case...
1124 * Spurious interrupts should not be ACKed.
1125 */
1126 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
1127 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1128 ack_APIC_irq();
1129
1130 irq_exit();
1131}
1132
1133/*
1134 * This interrupt should never happen with our APIC/SMP architecture
1135 */
1136
1137asmlinkage void smp_error_interrupt(void)
1138{
1139 unsigned int v, v1;
1140
1141 exit_idle();
1142 irq_enter();
1143 /* First tickle the hardware, only then report what went on. -- REW */
1144 v = apic_read(APIC_ESR);
1145 apic_write(APIC_ESR, 0);
1146 v1 = apic_read(APIC_ESR);
1147 ack_APIC_irq();
1148 atomic_inc(&irq_err_count);
1149
1150 /* Here is what the APIC error bits mean:
1151 0: Send CS error
1152 1: Receive CS error
1153 2: Send accept error
1154 3: Receive accept error
1155 4: Reserved
1156 5: Send illegal vector
1157 6: Received illegal vector
1158 7: Illegal register address
1159 */
1160 printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
1161 smp_processor_id(), v , v1);
1162 irq_exit();
1163}
1164
1165int disable_apic;
1166
1167/*
1168 * This initializes the IO-APIC and APIC hardware if this is
1169 * a UP kernel.
1170 */
1171int __init APIC_init_uniprocessor (void)
1172{
1173 if (disable_apic) {
1174 printk(KERN_INFO "Apic disabled\n");
1175 return -1;
1176 }
1177 if (!cpu_has_apic) {
1178 disable_apic = 1;
1179 printk(KERN_INFO "Apic disabled by BIOS\n");
1180 return -1;
1181 }
1182
1183 verify_local_APIC();
1184
1185 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
1186 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
1187
1188 setup_local_APIC();
1189
1190 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1191 setup_IO_APIC();
1192 else
1193 nr_ioapics = 0;
1194 setup_boot_APIC_clock();
1195 check_nmi_watchdog();
1196 return 0;
1197}
1198
1199static __init int setup_disableapic(char *str)
1200{
1201 disable_apic = 1;
1202 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1203 return 0;
1204}
1205early_param("disableapic", setup_disableapic);
1206
1207/* same as disableapic, for compatibility */
1208static __init int setup_nolapic(char *str)
1209{
1210 return setup_disableapic(str);
1211}
1212early_param("nolapic", setup_nolapic);
1213
1214static int __init parse_lapic_timer_c2_ok(char *arg)
1215{
1216 local_apic_timer_c2_ok = 1;
1217 return 0;
1218}
1219early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1220
1221static __init int setup_noapictimer(char *str)
1222{
1223 if (str[0] != ' ' && str[0] != 0)
1224 return 0;
1225 disable_apic_timer = 1;
1226 return 1;
1227}
1228
1229static __init int setup_apicmaintimer(char *str)
1230{
1231 apic_runs_main_timer = 1;
1232 nohpet = 1;
1233 return 1;
1234}
1235__setup("apicmaintimer", setup_apicmaintimer);
1236
1237static __init int setup_noapicmaintimer(char *str)
1238{
1239 apic_runs_main_timer = -1;
1240 return 1;
1241}
1242__setup("noapicmaintimer", setup_noapicmaintimer);
1243
1244static __init int setup_apicpmtimer(char *s)
1245{
1246 apic_calibrate_pmtmr = 1;
1247 notsc_setup(NULL);
1248 return setup_apicmaintimer(NULL);
1249}
1250__setup("apicpmtimer", setup_apicpmtimer);
1251
1252__setup("noapictimer", setup_noapictimer);
1253
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
new file mode 100644
index 000000000000..778953bc636c
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -0,0 +1,85 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6
7#include <linux/crypto.h>
8#include <linux/sched.h>
9#include <linux/stddef.h>
10#include <linux/errno.h>
11#include <linux/hardirq.h>
12#include <linux/suspend.h>
13#include <asm/pda.h>
14#include <asm/processor.h>
15#include <asm/segment.h>
16#include <asm/thread_info.h>
17#include <asm/ia32.h>
18
/*
 * DEFINE(sym, val): emit an asm line of the form "->sym <value> <expr>",
 * where <value> is the compile-time constant val (passed with the "i"
 * constraint) and <expr> is the source text of val.
 */
19#define DEFINE(sym, val) \
20 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
21
/* BLANK(): emit a bare "->" line with no symbol. */
22#define BLANK() asm volatile("\n->" : : )
23
/*
 * Redefine __SYSCALL so that including <asm/unistd.h> inside the array
 * initializer yields one "[nr] = 1," entry per __SYSCALL() use in that
 * header.  The header guard is undefined first, presumably so the header
 * can be included again here -- TODO confirm against asm/unistd.h.
 * sizeof(syscalls) is then the highest nr used plus one.
 */
24#define __NO_STUBS 1
25#undef __SYSCALL
26#undef _ASM_X86_64_UNISTD_H_
27#define __SYSCALL(nr, sym) [nr] = 1,
28static char syscalls[] = {
29#include <asm/unistd.h>
30};
31
/*
 * Emits, through DEFINE() and BLANK(), the structure offsets and constants
 * needed by assembly code.  Each group below redefines ENTRY() to prefix
 * the field name and take its offset in one particular structure.
 * Always returns 0.
 */
32int main(void)
33{
/* tsk_<field>: offsets into struct task_struct */
34#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
35 ENTRY(state);
36 ENTRY(flags);
37 ENTRY(thread);
38 ENTRY(pid);
39 BLANK();
40#undef ENTRY
/* threadinfo_<field>: offsets into struct thread_info */
41#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry))
42 ENTRY(flags);
43 ENTRY(addr_limit);
44 ENTRY(preempt_count);
45 ENTRY(status);
46 BLANK();
47#undef ENTRY
/* pda_<field>: offsets into struct x8664_pda */
48#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
49 ENTRY(kernelstack);
50 ENTRY(oldrsp);
51 ENTRY(pcurrent);
52 ENTRY(irqcount);
53 ENTRY(cpunumber);
54 ENTRY(irqstackptr);
55 ENTRY(data_offset);
56 BLANK();
57#undef ENTRY
58#ifdef CONFIG_IA32_EMULATION
/* IA32_SIGCONTEXT_<reg>: offsets into struct sigcontext_ia32 */
59#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
60 ENTRY(eax);
61 ENTRY(ebx);
62 ENTRY(ecx);
63 ENTRY(edx);
64 ENTRY(esi);
65 ENTRY(edi);
66 ENTRY(ebp);
67 ENTRY(esp);
68 ENTRY(eip);
69 BLANK();
70#undef ENTRY
71 DEFINE(IA32_RT_SIGFRAME_sigcontext,
72 offsetof (struct rt_sigframe32, uc.uc_mcontext));
73 BLANK();
74#endif
/* Offsets into struct pbe, struct tss_struct and struct crypto_tfm. */
75 DEFINE(pbe_address, offsetof(struct pbe, address));
76 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
77 DEFINE(pbe_next, offsetof(struct pbe, next));
78 BLANK();
79 DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
80 BLANK();
81 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
82 BLANK();
/* Last index of the syscalls[] table defined above in this file. */
83 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
84 return 0;
85}
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
new file mode 100644
index 000000000000..06d3e5a14d9d
--- /dev/null
+++ b/arch/x86/kernel/audit_64.c
@@ -0,0 +1,81 @@
1#include <linux/init.h>
2#include <linux/types.h>
3#include <linux/audit.h>
4#include <asm/unistd.h>
5
/*
 * Tables handed to audit_register_class() by audit_classes_init().
 * Each one takes its entries from the included asm-generic header
 * (presumably syscall numbers -- confirm against those headers) and
 * ends with a ~0U entry.
 */

/* Registered as AUDIT_CLASS_DIR_WRITE. */
6static unsigned dir_class[] = {
7#include <asm-generic/audit_dir_write.h>
8~0U
9};
10
/* Registered as AUDIT_CLASS_READ. */
11static unsigned read_class[] = {
12#include <asm-generic/audit_read.h>
13~0U
14};
15
/* Registered as AUDIT_CLASS_WRITE. */
16static unsigned write_class[] = {
17#include <asm-generic/audit_write.h>
18~0U
19};
20
/* Registered as AUDIT_CLASS_CHATTR. */
21static unsigned chattr_class[] = {
22#include <asm-generic/audit_change_attr.h>
23~0U
24};
25
/* Registered as AUDIT_CLASS_SIGNAL. */
26static unsigned signal_class[] = {
27#include <asm-generic/audit_signal.h>
28~0U
29};
30
/*
 * Returns 1 for AUDIT_ARCH_I386 when IA32 emulation is configured,
 * and 0 in every other case.
 */
int audit_classify_arch(int arch)
{
	int cls = 0;

#ifdef CONFIG_IA32_EMULATION
	if (arch == AUDIT_ARCH_I386)
		cls = 1;
#endif
	return cls;
}
39
40int audit_classify_syscall(int abi, unsigned syscall)
41{
42#ifdef CONFIG_IA32_EMULATION
43 extern int ia32_classify_syscall(unsigned);
44 if (abi == AUDIT_ARCH_I386)
45 return ia32_classify_syscall(syscall);
46#endif
47 switch(syscall) {
48 case __NR_open:
49 return 2;
50 case __NR_openat:
51 return 3;
52 case __NR_execve:
53 return 5;
54 default:
55 return 0;
56 }
57}
58
59static int __init audit_classes_init(void)
60{
61#ifdef CONFIG_IA32_EMULATION
62 extern __u32 ia32_dir_class[];
63 extern __u32 ia32_write_class[];
64 extern __u32 ia32_read_class[];
65 extern __u32 ia32_chattr_class[];
66 extern __u32 ia32_signal_class[];
67 audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
68 audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
69 audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
70 audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class);
71 audit_register_class(AUDIT_CLASS_SIGNAL_32, ia32_signal_class);
72#endif
73 audit_register_class(AUDIT_CLASS_WRITE, write_class);
74 audit_register_class(AUDIT_CLASS_READ, read_class);
75 audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
76 audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
77 audit_register_class(AUDIT_CLASS_SIGNAL, signal_class);
78 return 0;
79}
80
81__initcall(audit_classes_init);
diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c
new file mode 100644
index 000000000000..4e5e9d364d63
--- /dev/null
+++ b/arch/x86/kernel/bugs_64.c
@@ -0,0 +1,24 @@
1/*
2 * arch/x86_64/kernel/bugs.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 * Copyright (C) 2000 SuSE
6 */
7
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <asm/alternative.h>
11#include <asm/bugs.h>
12#include <asm/processor.h>
13#include <asm/mtrr.h>
14
15void __init check_bugs(void)
16{
17 identify_cpu(&boot_cpu_data);
18 mtrr_bp_init();
19#if !defined(CONFIG_SMP)
20 printk("CPU: ");
21 print_cpu_info(&boot_cpu_data);
22#endif
23 alternative_instructions();
24}
diff --git a/arch/x86/kernel/crash_64.c b/arch/x86/kernel/crash_64.c
new file mode 100644
index 000000000000..13432a1ae904
--- /dev/null
+++ b/arch/x86/kernel/crash_64.c
@@ -0,0 +1,135 @@
1/*
2 * Architecture specific (x86_64) functions for kexec based crash dumps.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 *
6 * Copyright (C) IBM Corporation, 2004. All rights reserved.
7 *
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/smp.h>
14#include <linux/irq.h>
15#include <linux/reboot.h>
16#include <linux/kexec.h>
17#include <linux/delay.h>
18#include <linux/elf.h>
19#include <linux/elfcore.h>
20#include <linux/kdebug.h>
21
22#include <asm/processor.h>
23#include <asm/hardirq.h>
24#include <asm/nmi.h>
25#include <asm/hw_irq.h>
26#include <asm/mach_apic.h>
27
28/* This keeps track of which cpu is the crashing one. */
29static int crashing_cpu;
30
31#ifdef CONFIG_SMP
32static atomic_t waiting_for_crash_ipi;
33
34static int crash_nmi_callback(struct notifier_block *self,
35 unsigned long val, void *data)
36{
37 struct pt_regs *regs;
38 int cpu;
39
40 if (val != DIE_NMI_IPI)
41 return NOTIFY_OK;
42
43 regs = ((struct die_args *)data)->regs;
44 cpu = raw_smp_processor_id();
45
46 /*
47 * Don't do anything if this handler is invoked on crashing cpu.
48 * Otherwise, system will completely hang. Crashing cpu can get
49 * an NMI if system was initially booted with nmi_watchdog parameter.
50 */
51 if (cpu == crashing_cpu)
52 return NOTIFY_STOP;
53 local_irq_disable();
54
55 crash_save_cpu(regs, cpu);
56 disable_local_APIC();
57 atomic_dec(&waiting_for_crash_ipi);
58 /* Assume hlt works */
59 for(;;)
60 halt();
61
62 return 1;
63}
64
65static void smp_send_nmi_allbutself(void)
66{
67 send_IPI_allbutself(NMI_VECTOR);
68}
69
70/*
71 * This code is a best effort heuristic to get the
72 * other cpus to stop executing. So races with
73 * cpu hotplug shouldn't matter.
74 */
75
/* Die-notifier block that routes events to crash_nmi_callback(). */
76static struct notifier_block crash_nmi_nb = {
77 .notifier_call = crash_nmi_callback,
78};
79
80static void nmi_shootdown_cpus(void)
81{
82 unsigned long msecs;
83
84 atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
85 if (register_die_notifier(&crash_nmi_nb))
86 return; /* return what? */
87
88 /*
89 * Ensure the new callback function is set before sending
90 * out the NMI
91 */
92 wmb();
93
94 smp_send_nmi_allbutself();
95
96 msecs = 1000; /* Wait at most a second for the other cpus to stop */
97 while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
98 mdelay(1);
99 msecs--;
100 }
101 /* Leave the nmi callback set */
102 disable_local_APIC();
103}
104#else
/* Non-SMP build: there are no other cpus to shoot down. */
static void nmi_shootdown_cpus(void)
{
}
109#endif
110
111void machine_crash_shutdown(struct pt_regs *regs)
112{
113 /*
114 * This function is only called after the system
115 * has panicked or is otherwise in a critical state.
116 * The minimum amount of code to allow a kexec'd kernel
117 * to run successfully needs to happen here.
118 *
119 * In practice this means shooting down the other cpus in
120 * an SMP system.
121 */
122 /* The kernel is broken so disable interrupts */
123 local_irq_disable();
124
125 /* Make a note of crashing cpu. Will be used in NMI callback.*/
126 crashing_cpu = smp_processor_id();
127 nmi_shootdown_cpus();
128
129 if(cpu_has_apic)
130 disable_local_APIC();
131
132 disable_IO_APIC();
133
134 crash_save_cpu(regs, smp_processor_id());
135}
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
new file mode 100644
index 000000000000..942deac4d43a
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -0,0 +1,47 @@
1/*
2 * kernel/crash_dump.c - Memory preserving reboot related code.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */
7
8#include <linux/errno.h>
9#include <linux/crash_dump.h>
10
11#include <asm/uaccess.h>
12#include <asm/io.h>
13
14/**
15 * copy_oldmem_page - copy one page from "oldmem"
16 * @pfn: page frame number to be copied
17 * @buf: target memory address for the copy; this can be in kernel address
18 * space or user address space (see @userbuf)
19 * @csize: number of bytes to copy
20 * @offset: offset in bytes into the page (based on pfn) to begin the copy
21 * @userbuf: if set, @buf is in user address space, use copy_to_user(),
22 * otherwise @buf is in kernel address space, use memcpy().
23 *
24 * Copy a page from "oldmem". For this page, there is no pte mapped
25 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
26 */
27ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
28 size_t csize, unsigned long offset, int userbuf)
29{
30 void *vaddr;
31
32 if (!csize)
33 return 0;
34
35 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
36
37 if (userbuf) {
38 if (copy_to_user(buf, (vaddr + offset), csize)) {
39 iounmap(vaddr);
40 return -EFAULT;
41 }
42 } else
43 memcpy(buf, (vaddr + offset), csize);
44
45 iounmap(vaddr);
46 return csize;
47}
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
new file mode 100644
index 000000000000..0f4d5e209e9b
--- /dev/null
+++ b/arch/x86/kernel/e820_64.c
@@ -0,0 +1,725 @@
1/*
2 * Handle the memory map.
3 * The functions here do the job until bootmem takes over.
4 *
5 * Getting sanitize_e820_map() in sync with i386 version by applying change:
6 * - Provisions for empty E820 memory regions (reported by certain BIOSes).
7 * Alex Achenbach <xela@slit.de>, December 2002.
8 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
9 *
10 */
11#include <linux/kernel.h>
12#include <linux/types.h>
13#include <linux/init.h>
14#include <linux/bootmem.h>
15#include <linux/ioport.h>
16#include <linux/string.h>
17#include <linux/kexec.h>
18#include <linux/module.h>
19#include <linux/mm.h>
20#include <linux/suspend.h>
21#include <linux/pfn.h>
22
23#include <asm/pgtable.h>
24#include <asm/page.h>
25#include <asm/e820.h>
26#include <asm/proto.h>
27#include <asm/bootsetup.h>
28#include <asm/sections.h>
29
30struct e820map e820;
31
32/*
33 * PFN of last memory page.
34 */
35unsigned long end_pfn;
36EXPORT_SYMBOL(end_pfn);
37
38/*
39 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
40 * The direct mapping extends to end_pfn_map, so that we can directly access
41 * apertures, ACPI and other tables without having to play with fixmaps.
42 */
43unsigned long end_pfn_map;
44
45/*
46 * Last pfn which the user wants to use.
47 */
48static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
49
50extern struct resource code_resource, data_resource;
51
52/* Check for some hardcoded bad areas that early boot is not allowed to touch */
53static inline int bad_addr(unsigned long *addrp, unsigned long size)
54{
55 unsigned long addr = *addrp, last = addr + size;
56
57 /* various gunk below that needed for SMP startup */
58 if (addr < 0x8000) {
59 *addrp = PAGE_ALIGN(0x8000);
60 return 1;
61 }
62
63 /* direct mapping tables of the kernel */
64 if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
65 *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
66 return 1;
67 }
68
69 /* initrd */
70#ifdef CONFIG_BLK_DEV_INITRD
71 if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
72 addr < INITRD_START+INITRD_SIZE) {
73 *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE);
74 return 1;
75 }
76#endif
77 /* kernel code */
78 if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
79 *addrp = PAGE_ALIGN(__pa_symbol(&_end));
80 return 1;
81 }
82
83 if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
84 *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
85 return 1;
86 }
87
88#ifdef CONFIG_NUMA
89 /* NUMA memory to node map */
90 if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
91 *addrp = nodemap_addr + nodemap_size;
92 return 1;
93 }
94#endif
95 /* XXX ramdisk image here? */
96 return 0;
97}
98
99/*
100 * This function checks if any part of the range <start,end> is mapped
101 * with type.
102 */
103int
104e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
105{
106 int i;
107 for (i = 0; i < e820.nr_map; i++) {
108 struct e820entry *ei = &e820.map[i];
109 if (type && ei->type != type)
110 continue;
111 if (ei->addr >= end || ei->addr + ei->size <= start)
112 continue;
113 return 1;
114 }
115 return 0;
116}
117EXPORT_SYMBOL_GPL(e820_any_mapped);
118
119/*
120 * This function checks if the entire range <start,end> is mapped with type.
121 *
122 * Note: this function only works correct if the e820 table is sorted and
123 * not-overlapping, which is the case
124 */
125int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
126{
127 int i;
128 for (i = 0; i < e820.nr_map; i++) {
129 struct e820entry *ei = &e820.map[i];
130 if (type && ei->type != type)
131 continue;
132 /* is the region (part) in overlap with the current region ?*/
133 if (ei->addr >= end || ei->addr + ei->size <= start)
134 continue;
135
136 /* if the region is at the beginning of <start,end> we move
137 * start to the end of the region since it's ok until there
138 */
139 if (ei->addr <= start)
140 start = ei->addr + ei->size;
141 /* if start is now at or beyond end, we're done, full coverage */
142 if (start >= end)
143 return 1; /* we're done */
144 }
145 return 0;
146}
147
148/*
149 * Find a free area in a specific range.
150 */
151unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
152{
153 int i;
154 for (i = 0; i < e820.nr_map; i++) {
155 struct e820entry *ei = &e820.map[i];
156 unsigned long addr = ei->addr, last;
157 if (ei->type != E820_RAM)
158 continue;
159 if (addr < start)
160 addr = start;
161 if (addr > ei->addr + ei->size)
162 continue;
163 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
164 ;
165 last = PAGE_ALIGN(addr) + size;
166 if (last > ei->addr + ei->size)
167 continue;
168 if (last > end)
169 continue;
170 return addr;
171 }
172 return -1UL;
173}
174
175/*
176 * Find the highest page frame number we have available
177 */
178unsigned long __init e820_end_of_ram(void)
179{
180 unsigned long end_pfn = 0;
181 end_pfn = find_max_pfn_with_active_regions();
182
183 if (end_pfn > end_pfn_map)
184 end_pfn_map = end_pfn;
185 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
186 end_pfn_map = MAXMEM>>PAGE_SHIFT;
187 if (end_pfn > end_user_pfn)
188 end_pfn = end_user_pfn;
189 if (end_pfn > end_pfn_map)
190 end_pfn = end_pfn_map;
191
192 printk("end_pfn_map = %lu\n", end_pfn_map);
193 return end_pfn;
194}
195
196/*
197 * Mark e820 reserved areas as busy for the resource manager.
198 */
199void __init e820_reserve_resources(void)
200{
201 int i;
202 for (i = 0; i < e820.nr_map; i++) {
203 struct resource *res;
204 res = alloc_bootmem_low(sizeof(struct resource));
205 switch (e820.map[i].type) {
206 case E820_RAM: res->name = "System RAM"; break;
207 case E820_ACPI: res->name = "ACPI Tables"; break;
208 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
209 default: res->name = "reserved";
210 }
211 res->start = e820.map[i].addr;
212 res->end = res->start + e820.map[i].size - 1;
213 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
214 request_resource(&iomem_resource, res);
215 if (e820.map[i].type == E820_RAM) {
216 /*
217 * We don't know which RAM region contains kernel data,
218 * so we try it repeatedly and let the resource manager
219 * test it.
220 */
221 request_resource(res, &code_resource);
222 request_resource(res, &data_resource);
223#ifdef CONFIG_KEXEC
224 request_resource(res, &crashk_res);
225#endif
226 }
227 }
228}
229
230/*
231 * Find the ranges of physical addresses that do not correspond to
232 * e820 RAM areas and mark the corresponding pages as nosave for software
233 * suspend and suspend to RAM.
234 *
235 * This function requires the e820 map to be sorted and without any
236 * overlapping entries and assumes the first e820 area to be RAM.
237 */
238void __init e820_mark_nosave_regions(void)
239{
240 int i;
241 unsigned long paddr;
242
243 paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
244 for (i = 1; i < e820.nr_map; i++) {
245 struct e820entry *ei = &e820.map[i];
246
247 if (paddr < ei->addr)
248 register_nosave_region(PFN_DOWN(paddr),
249 PFN_UP(ei->addr));
250
251 paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
252 if (ei->type != E820_RAM)
253 register_nosave_region(PFN_UP(ei->addr),
254 PFN_DOWN(paddr));
255
256 if (paddr >= (end_pfn << PAGE_SHIFT))
257 break;
258 }
259}
260
261/*
262 * Finds an active region in the address range from start_pfn to end_pfn and
263 * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
264 */
265static int __init e820_find_active_region(const struct e820entry *ei,
266 unsigned long start_pfn,
267 unsigned long end_pfn,
268 unsigned long *ei_startpfn,
269 unsigned long *ei_endpfn)
270{
271 *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
272 *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
273
274 /* Skip map entries smaller than a page */
275 if (*ei_startpfn >= *ei_endpfn)
276 return 0;
277
278 /* Check if end_pfn_map should be updated */
279 if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map)
280 end_pfn_map = *ei_endpfn;
281
282 /* Skip if map is outside the node */
283 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
284 *ei_startpfn >= end_pfn)
285 return 0;
286
287 /* Check for overlaps */
288 if (*ei_startpfn < start_pfn)
289 *ei_startpfn = start_pfn;
290 if (*ei_endpfn > end_pfn)
291 *ei_endpfn = end_pfn;
292
293 /* Obey end_user_pfn to save on memmap */
294 if (*ei_startpfn >= end_user_pfn)
295 return 0;
296 if (*ei_endpfn > end_user_pfn)
297 *ei_endpfn = end_user_pfn;
298
299 return 1;
300}
301
302/* Walk the e820 map and register active regions within a node */
303void __init
304e820_register_active_regions(int nid, unsigned long start_pfn,
305 unsigned long end_pfn)
306{
307 unsigned long ei_startpfn;
308 unsigned long ei_endpfn;
309 int i;
310
311 for (i = 0; i < e820.nr_map; i++)
312 if (e820_find_active_region(&e820.map[i],
313 start_pfn, end_pfn,
314 &ei_startpfn, &ei_endpfn))
315 add_active_range(nid, ei_startpfn, ei_endpfn);
316}
317
318/*
319 * Add a memory region to the kernel e820 map.
320 */
321void __init add_memory_region(unsigned long start, unsigned long size, int type)
322{
323 int x = e820.nr_map;
324
325 if (x == E820MAX) {
326 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
327 return;
328 }
329
330 e820.map[x].addr = start;
331 e820.map[x].size = size;
332 e820.map[x].type = type;
333 e820.nr_map++;
334}
335
336/*
337 * Find the hole size (in bytes) in the memory range.
338 * @start: starting address of the memory range to scan
339 * @end: ending address of the memory range to scan
340 */
341unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
342{
343 unsigned long start_pfn = start >> PAGE_SHIFT;
344 unsigned long end_pfn = end >> PAGE_SHIFT;
345 unsigned long ei_startpfn;
346 unsigned long ei_endpfn;
347 unsigned long ram = 0;
348 int i;
349
350 for (i = 0; i < e820.nr_map; i++) {
351 if (e820_find_active_region(&e820.map[i],
352 start_pfn, end_pfn,
353 &ei_startpfn, &ei_endpfn))
354 ram += ei_endpfn - ei_startpfn;
355 }
356 return end - start - (ram << PAGE_SHIFT);
357}
358
359void __init e820_print_map(char *who)
360{
361 int i;
362
363 for (i = 0; i < e820.nr_map; i++) {
364 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
365 (unsigned long long) e820.map[i].addr,
366 (unsigned long long) (e820.map[i].addr + e820.map[i].size));
367 switch (e820.map[i].type) {
368 case E820_RAM: printk("(usable)\n");
369 break;
370 case E820_RESERVED:
371 printk("(reserved)\n");
372 break;
373 case E820_ACPI:
374 printk("(ACPI data)\n");
375 break;
376 case E820_NVS:
377 printk("(ACPI NVS)\n");
378 break;
379 default: printk("type %u\n", e820.map[i].type);
380 break;
381 }
382 }
383}
384
385/*
386 * Sanitize the BIOS e820 map.
387 *
388 * Some e820 responses include overlapping entries. The following
389 * replaces the original e820 map with a new one, removing overlaps.
390 *
391 */
392static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
393{
394 struct change_member {
395 struct e820entry *pbios; /* pointer to original bios entry */
396 unsigned long long addr; /* address for this change point */
397 };
398 static struct change_member change_point_list[2*E820MAX] __initdata;
399 static struct change_member *change_point[2*E820MAX] __initdata;
400 static struct e820entry *overlap_list[E820MAX] __initdata;
401 static struct e820entry new_bios[E820MAX] __initdata;
402 struct change_member *change_tmp;
403 unsigned long current_type, last_type;
404 unsigned long long last_addr;
405 int chgidx, still_changing;
406 int overlap_entries;
407 int new_bios_entry;
408 int old_nr, new_nr, chg_nr;
409 int i;
410
411 /*
412 Visually we're performing the following (1,2,3,4 = memory types)...
413
414 Sample memory map (w/overlaps):
415 ____22__________________
416 ______________________4_
417 ____1111________________
418 _44_____________________
419 11111111________________
420 ____________________33__
421 ___________44___________
422 __________33333_________
423 ______________22________
424 ___________________2222_
425 _________111111111______
426 _____________________11_
427 _________________4______
428
429 Sanitized equivalent (no overlap):
430 1_______________________
431 _44_____________________
432 ___1____________________
433 ____22__________________
434 ______11________________
435 _________1______________
436 __________3_____________
437 ___________44___________
438 _____________33_________
439 _______________2________
440 ________________1_______
441 _________________4______
442 ___________________2____
443 ____________________33__
444 ______________________4_
445 */
446
447 /* if there's only one memory region, don't bother */
448 if (*pnr_map < 2)
449 return -1;
450
451 old_nr = *pnr_map;
452
453 /* bail out if we find any unreasonable addresses in bios map */
454 for (i=0; i<old_nr; i++)
455 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
456 return -1;
457
458 /* create pointers for initial change-point information (for sorting) */
459 for (i=0; i < 2*old_nr; i++)
460 change_point[i] = &change_point_list[i];
461
462 /* record all known change-points (starting and ending addresses),
463 omitting those that are for empty memory regions */
464 chgidx = 0;
465 for (i=0; i < old_nr; i++) {
466 if (biosmap[i].size != 0) {
467 change_point[chgidx]->addr = biosmap[i].addr;
468 change_point[chgidx++]->pbios = &biosmap[i];
469 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
470 change_point[chgidx++]->pbios = &biosmap[i];
471 }
472 }
473 chg_nr = chgidx;
474
475 /* sort change-point list by memory addresses (low -> high) */
476 still_changing = 1;
477 while (still_changing) {
478 still_changing = 0;
479 for (i=1; i < chg_nr; i++) {
480 /* if <current_addr> > <last_addr>, swap */
481 /* or, if current=<start_addr> & last=<end_addr>, swap */
482 if ((change_point[i]->addr < change_point[i-1]->addr) ||
483 ((change_point[i]->addr == change_point[i-1]->addr) &&
484 (change_point[i]->addr == change_point[i]->pbios->addr) &&
485 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
486 )
487 {
488 change_tmp = change_point[i];
489 change_point[i] = change_point[i-1];
490 change_point[i-1] = change_tmp;
491 still_changing=1;
492 }
493 }
494 }
495
496 /* create a new bios memory map, removing overlaps */
497 overlap_entries=0; /* number of entries in the overlap table */
498 new_bios_entry=0; /* index for creating new bios map entries */
499 last_type = 0; /* start with undefined memory type */
500 last_addr = 0; /* start with 0 as last starting address */
501 /* loop through change-points, determining affect on the new bios map */
502 for (chgidx=0; chgidx < chg_nr; chgidx++)
503 {
504 /* keep track of all overlapping bios entries */
505 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
506 {
507 /* add map entry to overlap list (> 1 entry implies an overlap) */
508 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
509 }
510 else
511 {
512 /* remove entry from list (order independent, so swap with last) */
513 for (i=0; i<overlap_entries; i++)
514 {
515 if (overlap_list[i] == change_point[chgidx]->pbios)
516 overlap_list[i] = overlap_list[overlap_entries-1];
517 }
518 overlap_entries--;
519 }
520 /* if there are overlapping entries, decide which "type" to use */
521 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
522 current_type = 0;
523 for (i=0; i<overlap_entries; i++)
524 if (overlap_list[i]->type > current_type)
525 current_type = overlap_list[i]->type;
526 /* continue building up new bios map based on this information */
527 if (current_type != last_type) {
528 if (last_type != 0) {
529 new_bios[new_bios_entry].size =
530 change_point[chgidx]->addr - last_addr;
531 /* move forward only if the new size was non-zero */
532 if (new_bios[new_bios_entry].size != 0)
533 if (++new_bios_entry >= E820MAX)
534 break; /* no more space left for new bios entries */
535 }
536 if (current_type != 0) {
537 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
538 new_bios[new_bios_entry].type = current_type;
539 last_addr=change_point[chgidx]->addr;
540 }
541 last_type = current_type;
542 }
543 }
544 new_nr = new_bios_entry; /* retain count for new bios entries */
545
546 /* copy new bios mapping into original location */
547 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
548 *pnr_map = new_nr;
549
550 return 0;
551}
552
553/*
554 * Copy the BIOS e820 map into a safe place.
555 *
556 * Sanity-check it while we're at it..
557 *
558 * If we're lucky and live on a modern system, the setup code
559 * will have given us a memory map that we can use to properly
560 * set up memory. If we aren't, we'll fake a memory map.
561 */
562static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
563{
564 /* Only one memory region (or negative)? Ignore it */
565 if (nr_map < 2)
566 return -1;
567
568 do {
569 unsigned long start = biosmap->addr;
570 unsigned long size = biosmap->size;
571 unsigned long end = start + size;
572 unsigned long type = biosmap->type;
573
574 /* Overflow in 64 bits? Ignore the memory map. */
575 if (start > end)
576 return -1;
577
578 add_memory_region(start, size, type);
579 } while (biosmap++,--nr_map);
580 return 0;
581}
582
/*
 * Print @msg through early_printk() and then panic with the same message.
 *
 * @msg is passed as an argument to a "%s" format rather than as the format
 * itself, so a '%' in the message cannot be interpreted as a conversion.
 */
void early_panic(char *msg)
{
	early_printk("%s", msg);
	panic("%s", msg);
}
588
589void __init setup_memory_region(void)
590{
591 /*
592 * Try to copy the BIOS-supplied E820-map.
593 *
594 * Otherwise fake a memory map; one section from 0k->640k,
595 * the next section from 1mb->appropriate_mem_k
596 */
597 sanitize_e820_map(E820_MAP, &E820_MAP_NR);
598 if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0)
599 early_panic("Cannot find a valid memory map");
600 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
601 e820_print_map("BIOS-e820");
602}
603
604static int __init parse_memopt(char *p)
605{
606 if (!p)
607 return -EINVAL;
608 end_user_pfn = memparse(p, &p);
609 end_user_pfn >>= PAGE_SHIFT;
610 return 0;
611}
612early_param("mem", parse_memopt);
613
614static int userdef __initdata;
615
616static int __init parse_memmap_opt(char *p)
617{
618 char *oldp;
619 unsigned long long start_at, mem_size;
620
621 if (!strcmp(p, "exactmap")) {
622#ifdef CONFIG_CRASH_DUMP
623 /* If we are doing a crash dump, we
624 * still need to know the real mem
625 * size before original memory map is
626 * reset.
627 */
628 e820_register_active_regions(0, 0, -1UL);
629 saved_max_pfn = e820_end_of_ram();
630 remove_all_active_ranges();
631#endif
632 end_pfn_map = 0;
633 e820.nr_map = 0;
634 userdef = 1;
635 return 0;
636 }
637
638 oldp = p;
639 mem_size = memparse(p, &p);
640 if (p == oldp)
641 return -EINVAL;
642 if (*p == '@') {
643 start_at = memparse(p+1, &p);
644 add_memory_region(start_at, mem_size, E820_RAM);
645 } else if (*p == '#') {
646 start_at = memparse(p+1, &p);
647 add_memory_region(start_at, mem_size, E820_ACPI);
648 } else if (*p == '$') {
649 start_at = memparse(p+1, &p);
650 add_memory_region(start_at, mem_size, E820_RESERVED);
651 } else {
652 end_user_pfn = (mem_size >> PAGE_SHIFT);
653 }
654 return *p == '\0' ? 0 : -EINVAL;
655}
656early_param("memmap", parse_memmap_opt);
657
658void __init finish_e820_parsing(void)
659{
660 if (userdef) {
661 printk(KERN_INFO "user-defined physical RAM map:\n");
662 e820_print_map("user");
663 }
664}
665
666unsigned long pci_mem_start = 0xaeedbabe;
667EXPORT_SYMBOL(pci_mem_start);
668
669/*
670 * Search for the biggest gap in the low 32 bits of the e820
671 * memory space. We pass this space to PCI to assign MMIO resources
672 * for hotplug or unconfigured devices in.
673 * Hopefully the BIOS let enough space left.
674 */
675__init void e820_setup_gap(void)
676{
677 unsigned long gapstart, gapsize, round;
678 unsigned long last;
679 int i;
680 int found = 0;
681
682 last = 0x100000000ull;
683 gapstart = 0x10000000;
684 gapsize = 0x400000;
685 i = e820.nr_map;
686 while (--i >= 0) {
687 unsigned long long start = e820.map[i].addr;
688 unsigned long long end = start + e820.map[i].size;
689
690 /*
691 * Since "last" is at most 4GB, we know we'll
692 * fit in 32 bits if this condition is true
693 */
694 if (last > end) {
695 unsigned long gap = last - end;
696
697 if (gap > gapsize) {
698 gapsize = gap;
699 gapstart = end;
700 found = 1;
701 }
702 }
703 if (start < last)
704 last = start;
705 }
706
707 if (!found) {
708 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
709 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
710 KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
711 }
712
713 /*
714 * See how much we want to round up: start off with
715 * rounding to the next 1MB area.
716 */
717 round = 0x100000;
718 while ((gapsize >> 4) > round)
719 round += round;
720 /* Fun with two's complement */
721 pci_mem_start = (gapstart + round) & -round;
722
723 printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
724 pci_mem_start, gapstart, gapsize);
725}
diff --git a/arch/x86/kernel/early-quirks_64.c b/arch/x86/kernel/early-quirks_64.c
new file mode 100644
index 000000000000..13aa4fd728f3
--- /dev/null
+++ b/arch/x86/kernel/early-quirks_64.c
@@ -0,0 +1,127 @@
1/* Various workarounds for chipset bugs.
2 This code runs very early and can't use the regular PCI subsystem
3 The entries are keyed to PCI bridges which usually identify chipsets
4 uniquely.
5 This is only for whole classes of chipsets with specific problems which
6 need early invasive action (e.g. before the timers are initialized).
7 Most PCI device specific workarounds can be done later and should be
8 in standard PCI quirks
9 Mainboard specific bugs should be handled by DMI entries.
10 CPU specific bugs in setup.c */
11
12#include <linux/pci.h>
13#include <linux/acpi.h>
14#include <linux/pci_ids.h>
15#include <asm/pci-direct.h>
16#include <asm/proto.h>
17#include <asm/iommu.h>
18#include <asm/dma.h>
19
20static void __init via_bugs(void)
21{
22#ifdef CONFIG_IOMMU
23 if ((end_pfn > MAX_DMA32_PFN || force_iommu) &&
24 !iommu_aperture_allowed) {
25 printk(KERN_INFO
26 "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n");
27 iommu_aperture_disabled = 1;
28 }
29#endif
30}
31
32#ifdef CONFIG_ACPI
33
34static int __init nvidia_hpet_check(struct acpi_table_header *header)
35{
36 return 0;
37}
38#endif
39
40static void __init nvidia_bugs(void)
41{
42#ifdef CONFIG_ACPI
43 /*
44 * All timer overrides on Nvidia are
45 * wrong unless HPET is enabled.
46 * Unfortunately that's not true on many Asus boards.
47 * We don't know yet how to detect this automatically, but
48 * at least allow a command line override.
49 */
50 if (acpi_use_timer_override)
51 return;
52
53 if (acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check)) {
54 acpi_skip_timer_override = 1;
55 printk(KERN_INFO "Nvidia board "
56 "detected. Ignoring ACPI "
57 "timer override.\n");
58 printk(KERN_INFO "If you got timer trouble "
59 "try acpi_use_timer_override\n");
60 }
61#endif
62 /* RED-PEN skip them on mptables too? */
63
64}
65
66static void __init ati_bugs(void)
67{
68 if (timer_over_8254 == 1) {
69 timer_over_8254 = 0;
70 printk(KERN_INFO
71 "ATI board detected. Disabling timer routing over 8254.\n");
72 }
73}
74
75struct chipset {
76 u16 vendor;
77 void (*f)(void);
78};
79
80static struct chipset early_qrk[] __initdata = {
81 { PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
82 { PCI_VENDOR_ID_VIA, via_bugs },
83 { PCI_VENDOR_ID_ATI, ati_bugs },
84 {}
85};
86
87void __init early_quirks(void)
88{
89 int num, slot, func;
90
91 if (!early_pci_allowed())
92 return;
93
94 /* Poor man's PCI discovery */
95 for (num = 0; num < 32; num++) {
96 for (slot = 0; slot < 32; slot++) {
97 for (func = 0; func < 8; func++) {
98 u32 class;
99 u32 vendor;
100 u8 type;
101 int i;
102 class = read_pci_config(num,slot,func,
103 PCI_CLASS_REVISION);
104 if (class == 0xffffffff)
105 break;
106
107 if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
108 continue;
109
110 vendor = read_pci_config(num, slot, func,
111 PCI_VENDOR_ID);
112 vendor &= 0xffff;
113
114 for (i = 0; early_qrk[i].f; i++)
115 if (early_qrk[i].vendor == vendor) {
116 early_qrk[i].f();
117 return;
118 }
119
120 type = read_pci_config_byte(num, slot, func,
121 PCI_HEADER_TYPE);
122 if (!(type & 0x80))
123 break;
124 }
125 }
126 }
127}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 92f812ba275c..fd9aff3f3890 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -1,2 +1,259 @@
1#include <linux/console.h>
2#include <linux/kernel.h>
3#include <linux/init.h>
4#include <linux/string.h>
5#include <linux/screen_info.h>
6#include <asm/io.h>
7#include <asm/processor.h>
8#include <asm/fcntl.h>
9#include <xen/hvc-console.h>
1 10
2#include "../../x86_64/kernel/early_printk.c" 11/* Simple VGA output */
12
13#ifdef __i386__
14#include <asm/setup.h>
15#else
16#include <asm/bootsetup.h>
17#endif
18#define VGABASE (__ISA_IO_base + 0xb8000)
19
20static int max_ypos = 25, max_xpos = 80;
21static int current_ypos = 25, current_xpos = 0;
22
23static void early_vga_write(struct console *con, const char *str, unsigned n)
24{
25 char c;
26 int i, k, j;
27
28 while ((c = *str++) != '\0' && n-- > 0) {
29 if (current_ypos >= max_ypos) {
30 /* scroll 1 line up */
31 for (k = 1, j = 0; k < max_ypos; k++, j++) {
32 for (i = 0; i < max_xpos; i++) {
33 writew(readw(VGABASE+2*(max_xpos*k+i)),
34 VGABASE + 2*(max_xpos*j + i));
35 }
36 }
37 for (i = 0; i < max_xpos; i++)
38 writew(0x720, VGABASE + 2*(max_xpos*j + i));
39 current_ypos = max_ypos-1;
40 }
41 if (c == '\n') {
42 current_xpos = 0;
43 current_ypos++;
44 } else if (c != '\r') {
45 writew(((0x7 << 8) | (unsigned short) c),
46 VGABASE + 2*(max_xpos*current_ypos +
47 current_xpos++));
48 if (current_xpos >= max_xpos) {
49 current_xpos = 0;
50 current_ypos++;
51 }
52 }
53 }
54}
55
56static struct console early_vga_console = {
57 .name = "earlyvga",
58 .write = early_vga_write,
59 .flags = CON_PRINTBUFFER,
60 .index = -1,
61};
62
63/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
64
65static int early_serial_base = 0x3f8; /* ttyS0 */
66
67#define XMTRDY 0x20
68
69#define DLAB 0x80
70
71#define TXR 0 /* Transmit register (WRITE) */
72#define RXR 0 /* Receive register (READ) */
73#define IER 1 /* Interrupt Enable */
74#define IIR 2 /* Interrupt ID */
75#define FCR 2 /* FIFO control */
76#define LCR 3 /* Line control */
77#define MCR 4 /* Modem control */
78#define LSR 5 /* Line Status */
79#define MSR 6 /* Modem Status */
80#define DLL 0 /* Divisor Latch Low */
81#define DLH 1 /* Divisor latch High */
82
83static int early_serial_putc(unsigned char ch)
84{
85 unsigned timeout = 0xffff;
86 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
87 cpu_relax();
88 outb(ch, early_serial_base + TXR);
89 return timeout ? 0 : -1;
90}
91
/*
 * Console write hook for the early serial port: send up to @n characters
 * of @s (stopping early at a NUL), emitting '\r' before every '\n'.
 */
static void early_serial_write(struct console *con, const char *s, unsigned n)
{
	for (; *s && n > 0; s++, n--) {
		if (*s == '\n')
			early_serial_putc('\r');
		early_serial_putc(*s);
	}
}
101
102#define DEFAULT_BAUD 9600
103
104static __init void early_serial_init(char *s)
105{
106 unsigned char c;
107 unsigned divisor;
108 unsigned baud = DEFAULT_BAUD;
109 char *e;
110
111 if (*s == ',')
112 ++s;
113
114 if (*s) {
115 unsigned port;
116 if (!strncmp(s,"0x",2)) {
117 early_serial_base = simple_strtoul(s, &e, 16);
118 } else {
119 static int bases[] = { 0x3f8, 0x2f8 };
120
121 if (!strncmp(s,"ttyS",4))
122 s += 4;
123 port = simple_strtoul(s, &e, 10);
124 if (port > 1 || s == e)
125 port = 0;
126 early_serial_base = bases[port];
127 }
128 s += strcspn(s, ",");
129 if (*s == ',')
130 s++;
131 }
132
133 outb(0x3, early_serial_base + LCR); /* 8n1 */
134 outb(0, early_serial_base + IER); /* no interrupt */
135 outb(0, early_serial_base + FCR); /* no fifo */
136 outb(0x3, early_serial_base + MCR); /* DTR + RTS */
137
138 if (*s) {
139 baud = simple_strtoul(s, &e, 0);
140 if (baud == 0 || s == e)
141 baud = DEFAULT_BAUD;
142 }
143
144 divisor = 115200 / baud;
145 c = inb(early_serial_base + LCR);
146 outb(c | DLAB, early_serial_base + LCR);
147 outb(divisor & 0xff, early_serial_base + DLL);
148 outb((divisor >> 8) & 0xff, early_serial_base + DLH);
149 outb(c & ~DLAB, early_serial_base + LCR);
150}
151
152static struct console early_serial_console = {
153 .name = "earlyser",
154 .write = early_serial_write,
155 .flags = CON_PRINTBUFFER,
156 .index = -1,
157};
158
159/* Console interface to a host file on AMD's SimNow! */
160
161static int simnow_fd;
162
163enum {
164 MAGIC1 = 0xBACCD00A,
165 MAGIC2 = 0xCA110000,
166 XOPEN = 5,
167 XWRITE = 4,
168};
169
170static noinline long simnow(long cmd, long a, long b, long c)
171{
172 long ret;
173 asm volatile("cpuid" :
174 "=a" (ret) :
175 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
176 return ret;
177}
178
179static void __init simnow_init(char *str)
180{
181 char *fn = "klog";
182 if (*str == '=')
183 fn = ++str;
184 /* error ignored */
185 simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
186}
187
188static void simnow_write(struct console *con, const char *s, unsigned n)
189{
190 simnow(XWRITE, simnow_fd, (unsigned long)s, n);
191}
192
193static struct console simnow_console = {
194 .name = "simnow",
195 .write = simnow_write,
196 .flags = CON_PRINTBUFFER,
197 .index = -1,
198};
199
200/* Direct interface for emergencies */
201struct console *early_console = &early_vga_console;
202static int early_console_initialized = 0;
203
204void early_printk(const char *fmt, ...)
205{
206 char buf[512];
207 int n;
208 va_list ap;
209
210 va_start(ap,fmt);
211 n = vscnprintf(buf,512,fmt,ap);
212 early_console->write(early_console,buf,n);
213 va_end(ap);
214}
215
216static int __initdata keep_early;
217
218static int __init setup_early_printk(char *buf)
219{
220 if (!buf)
221 return 0;
222
223 if (early_console_initialized)
224 return 0;
225 early_console_initialized = 1;
226
227 if (strstr(buf, "keep"))
228 keep_early = 1;
229
230 if (!strncmp(buf, "serial", 6)) {
231 early_serial_init(buf + 6);
232 early_console = &early_serial_console;
233 } else if (!strncmp(buf, "ttyS", 4)) {
234 early_serial_init(buf);
235 early_console = &early_serial_console;
236 } else if (!strncmp(buf, "vga", 3)
237 && SCREEN_INFO.orig_video_isVGA == 1) {
238 max_xpos = SCREEN_INFO.orig_video_cols;
239 max_ypos = SCREEN_INFO.orig_video_lines;
240 current_ypos = SCREEN_INFO.orig_y;
241 early_console = &early_vga_console;
242 } else if (!strncmp(buf, "simnow", 6)) {
243 simnow_init(buf + 6);
244 early_console = &simnow_console;
245 keep_early = 1;
246#ifdef CONFIG_HVC_XEN
247 } else if (!strncmp(buf, "xen", 3)) {
248 early_console = &xenboot_console;
249#endif
250 }
251
252 if (keep_early)
253 early_console->flags &= ~CON_BOOT;
254 else
255 early_console->flags |= CON_BOOT;
256 register_console(early_console);
257 return 0;
258}
259early_param("earlyprintk", setup_early_printk);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
new file mode 100644
index 000000000000..1d232e5f5658
--- /dev/null
+++ b/arch/x86/kernel/entry_64.S
@@ -0,0 +1,1172 @@
1/*
2 * linux/arch/x86_64/entry.S
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
7 */
8
9/*
10 * entry.S contains the system-call and fault low-level handling routines.
11 *
12 * NOTE: This code handles signal-recognition, which happens every time
13 * after an interrupt and after each system call.
14 *
15 * Normal syscalls and interrupts don't save a full stack frame, this is
16 * only done for syscall tracing, signals or fork/exec et.al.
17 *
18 * A note on terminology:
19 * - top of stack: Architecture defined interrupt frame from SS to RIP
20 * at the top of the kernel process stack.
21 * - partial stack frame: partially saved registers up to R11.
22 * - full stack frame: Like partial stack frame, but with all registers saved.
23 *
24 * Some macro usage:
25 * - CFI macros are used to generate dwarf2 unwind information for better
26 * backtraces. They don't change any code.
27 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
28 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
29 * There are unfortunately lots of special cases where some registers
30 * are not touched. The macro is a big mess that should be cleaned up.
31 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
32 * Gives a full stack frame.
33 * - ENTRY/END Define functions in the symbol table.
34 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
35 * frame that is otherwise undefined after a SYSCALL
36 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
37 * - errorentry/paranoidentry/zeroentry - Define exception entry points.
38 */
39
40#include <linux/linkage.h>
41#include <asm/segment.h>
42#include <asm/cache.h>
43#include <asm/errno.h>
44#include <asm/dwarf2.h>
45#include <asm/calling.h>
46#include <asm/asm-offsets.h>
47#include <asm/msr.h>
48#include <asm/unistd.h>
49#include <asm/thread_info.h>
50#include <asm/hw_irq.h>
51#include <asm/page.h>
52#include <asm/irqflags.h>
53
54 .code64
55
56#ifndef CONFIG_PREEMPT
57#define retint_kernel retint_restore_args
58#endif
59
60
61.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
62#ifdef CONFIG_TRACE_IRQFLAGS
63 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
64 jnc 1f
65 TRACE_IRQS_ON
661:
67#endif
68.endm
69
70/*
71 * C code is not supposed to know about undefined top of stack. Every time
72 * a C function with a pt_regs argument is called from the SYSCALL based
73 * fast path FIXUP_TOP_OF_STACK is needed.
74 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
75 * manipulation.
76 */
77
78 /* %rsp:at FRAMEEND */
79 .macro FIXUP_TOP_OF_STACK tmp
80 movq %gs:pda_oldrsp,\tmp
81 movq \tmp,RSP(%rsp)
82 movq $__USER_DS,SS(%rsp)
83 movq $__USER_CS,CS(%rsp)
84 movq $-1,RCX(%rsp)
85 movq R11(%rsp),\tmp /* get eflags */
86 movq \tmp,EFLAGS(%rsp)
87 .endm
88
89 .macro RESTORE_TOP_OF_STACK tmp,offset=0
90 movq RSP-\offset(%rsp),\tmp
91 movq \tmp,%gs:pda_oldrsp
92 movq EFLAGS-\offset(%rsp),\tmp
93 movq \tmp,R11-\offset(%rsp)
94 .endm
95
96 .macro FAKE_STACK_FRAME child_rip
97 /* push in order ss, rsp, eflags, cs, rip */
98 xorl %eax, %eax
99 pushq %rax /* ss */
100 CFI_ADJUST_CFA_OFFSET 8
101 /*CFI_REL_OFFSET ss,0*/
102 pushq %rax /* rsp */
103 CFI_ADJUST_CFA_OFFSET 8
104 CFI_REL_OFFSET rsp,0
105 pushq $(1<<9) /* eflags - interrupts on */
106 CFI_ADJUST_CFA_OFFSET 8
107 /*CFI_REL_OFFSET rflags,0*/
108 pushq $__KERNEL_CS /* cs */
109 CFI_ADJUST_CFA_OFFSET 8
110 /*CFI_REL_OFFSET cs,0*/
111 pushq \child_rip /* rip */
112 CFI_ADJUST_CFA_OFFSET 8
113 CFI_REL_OFFSET rip,0
114 pushq %rax /* orig rax */
115 CFI_ADJUST_CFA_OFFSET 8
116 .endm
117
118 .macro UNFAKE_STACK_FRAME
119 addq $8*6, %rsp
120 CFI_ADJUST_CFA_OFFSET -(6*8)
121 .endm
122
123 .macro CFI_DEFAULT_STACK start=1
124 .if \start
125 CFI_STARTPROC simple
126 CFI_SIGNAL_FRAME
127 CFI_DEF_CFA rsp,SS+8
128 .else
129 CFI_DEF_CFA_OFFSET SS+8
130 .endif
131 CFI_REL_OFFSET r15,R15
132 CFI_REL_OFFSET r14,R14
133 CFI_REL_OFFSET r13,R13
134 CFI_REL_OFFSET r12,R12
135 CFI_REL_OFFSET rbp,RBP
136 CFI_REL_OFFSET rbx,RBX
137 CFI_REL_OFFSET r11,R11
138 CFI_REL_OFFSET r10,R10
139 CFI_REL_OFFSET r9,R9
140 CFI_REL_OFFSET r8,R8
141 CFI_REL_OFFSET rax,RAX
142 CFI_REL_OFFSET rcx,RCX
143 CFI_REL_OFFSET rdx,RDX
144 CFI_REL_OFFSET rsi,RSI
145 CFI_REL_OFFSET rdi,RDI
146 CFI_REL_OFFSET rip,RIP
147 /*CFI_REL_OFFSET cs,CS*/
148 /*CFI_REL_OFFSET rflags,EFLAGS*/
149 CFI_REL_OFFSET rsp,RSP
150 /*CFI_REL_OFFSET ss,SS*/
151 .endm
152/*
153 * A newly forked process directly context switches into this.
154 */
155/* rdi: prev */
156ENTRY(ret_from_fork)
157 CFI_DEFAULT_STACK
158 push kernel_eflags(%rip) /* .code64: this push moves %rsp by 8 */
159 CFI_ADJUST_CFA_OFFSET 8
160 popf # reset kernel eflags
161 CFI_ADJUST_CFA_OFFSET -8
162 call schedule_tail /* rdi still holds prev */
163 GET_THREAD_INFO(%rcx)
164 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
165 jnz rff_trace /* tracing or audit active: report syscall exit first */
166rff_action:
167 RESTORE_REST
168 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
169 je int_ret_from_sys_call /* saved CS has CPL 0: take the IRET return path */
170 testl $_TIF_IA32,threadinfo_flags(%rcx)
171 jnz int_ret_from_sys_call /* _TIF_IA32 set: also take the IRET return path */
172 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET /* sync pda_oldrsp and the R11 slot from the frame */
173 jmp ret_from_sys_call /* SYSRET fast path */
174rff_trace:
175 movq %rsp,%rdi /* arg1: pointer to the saved registers */
176 call syscall_trace_leave
177 GET_THREAD_INFO(%rcx) /* reload: rcx is not preserved across the call */
178 jmp rff_action
179 CFI_ENDPROC
180END(ret_from_fork)
181
182/*
183 * System call entry. Up to 6 arguments in registers are supported.
184 *
185 * SYSCALL does not save anything on the stack and does not change the
186 * stack pointer.
187 */
188
189/*
190 * Register setup:
191 * rax system call number
192 * rdi arg0
193 * rcx return address for syscall/sysret, C arg3
194 * rsi arg1
195 * rdx arg2
196 * r10 arg3 (--> moved to rcx for C)
197 * r8 arg4
198 * r9 arg5
199 * r11 eflags for syscall/sysret, temporary for C
200 * r12-r15,rbp,rbx saved by C code, not touched.
201 *
202 * Interrupts are off on entry.
203 * Only called from user space.
204 *
205 * XXX if we had a free scratch register we could save the RSP into the stack frame
206 * and report it properly in ps. Unfortunately we haven't.
207 *
208 * When user can change the frames always force IRET. That is because
209 * it deals with uncanonical addresses better. SYSRET has trouble
210 * with them due to bugs in both AMD and Intel CPUs.
211 */
212
213ENTRY(system_call)
214 CFI_STARTPROC simple
215 CFI_SIGNAL_FRAME
216 CFI_DEF_CFA rsp,PDA_STACKOFFSET
217 CFI_REGISTER rip,rcx
218 /*CFI_REGISTER rflags,r11*/
219 swapgs
220 movq %rsp,%gs:pda_oldrsp
221 movq %gs:pda_kernelstack,%rsp
222 /*
223 * No need to follow this irqs off/on section - it's straight
224 * and short:
225 */
226 sti
227 SAVE_ARGS 8,1
228 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
229 movq %rcx,RIP-ARGOFFSET(%rsp)
230 CFI_REL_OFFSET rip,RIP-ARGOFFSET
231 GET_THREAD_INFO(%rcx)
232 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
233 jnz tracesys
234 cmpq $__NR_syscall_max,%rax
235 ja badsys
236 movq %r10,%rcx
237 call *sys_call_table(,%rax,8) # XXX: rip relative
238 movq %rax,RAX-ARGOFFSET(%rsp)
239/*
240 * Syscall return path ending with SYSRET (fast path)
241 * Has incomplete stack frame and undefined top of stack.
242 */
243ret_from_sys_call:
244 movl $_TIF_ALLWORK_MASK,%edi
245 /* edi: flagmask */
246sysret_check:
247 GET_THREAD_INFO(%rcx)
248 cli
249 TRACE_IRQS_OFF
250 movl threadinfo_flags(%rcx),%edx
251 andl %edi,%edx
252 jnz sysret_careful
253 CFI_REMEMBER_STATE
254 /*
255 * sysretq will re-enable interrupts:
256 */
257 TRACE_IRQS_ON
258 movq RIP-ARGOFFSET(%rsp),%rcx
259 CFI_REGISTER rip,rcx
260 RESTORE_ARGS 0,-ARG_SKIP,1
261 /*CFI_REGISTER rflags,r11*/
262 movq %gs:pda_oldrsp,%rsp
263 swapgs
264 sysretq
265
266 CFI_RESTORE_STATE
267 /* Handle reschedules */
268 /* edx: work, edi: workmask */
269sysret_careful:
270 bt $TIF_NEED_RESCHED,%edx
271 jnc sysret_signal
272 TRACE_IRQS_ON
273 sti
274 pushq %rdi
275 CFI_ADJUST_CFA_OFFSET 8
276 call schedule
277 popq %rdi
278 CFI_ADJUST_CFA_OFFSET -8
279 jmp sysret_check
280
281 /* Handle a signal */
282sysret_signal:
283 TRACE_IRQS_ON
284 sti
285 testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
286 jz 1f
287
288 /* Really a signal */
289 /* edx: work flags (arg3) */
290 leaq do_notify_resume(%rip),%rax
291 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
292 xorl %esi,%esi # oldset -> arg2
293 call ptregscall_common
2941: movl $_TIF_NEED_RESCHED,%edi
295 /* Use IRET because user could have changed frame. This
296 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
297 cli
298 TRACE_IRQS_OFF
299 jmp int_with_check
300
301badsys:
302 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
303 jmp ret_from_sys_call
304
305 /* Do syscall tracing */
306tracesys:
307 SAVE_REST
308 movq $-ENOSYS,RAX(%rsp)
309 FIXUP_TOP_OF_STACK %rdi
310 movq %rsp,%rdi
311 call syscall_trace_enter
312 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
313 RESTORE_REST
314 cmpq $__NR_syscall_max,%rax
315 movq $-ENOSYS,%rcx
316 cmova %rcx,%rax
317 ja 1f
318 movq %r10,%rcx /* fixup for C */
319 call *sys_call_table(,%rax,8)
3201: movq %rax,RAX-ARGOFFSET(%rsp)
321 /* Use IRET because user could have changed frame */
322
323/*
324 * Syscall return path ending with IRET.
325 * Has correct top of stack, but partial stack frame.
326 */
327 .globl int_ret_from_sys_call
328int_ret_from_sys_call:
329 cli
330 TRACE_IRQS_OFF
331 testl $3,CS-ARGOFFSET(%rsp)
332 je retint_restore_args
333 movl $_TIF_ALLWORK_MASK,%edi
334 /* edi: mask to check */
335int_with_check:
336 GET_THREAD_INFO(%rcx)
337 movl threadinfo_flags(%rcx),%edx
338 andl %edi,%edx
339 jnz int_careful
340 andl $~TS_COMPAT,threadinfo_status(%rcx)
341 jmp retint_swapgs
342
343 /* Either reschedule or signal or syscall exit tracking needed. */
344 /* First do a reschedule test. */
345 /* edx: work, edi: workmask */
346int_careful:
347 bt $TIF_NEED_RESCHED,%edx
348 jnc int_very_careful
349 TRACE_IRQS_ON
350 sti
351 pushq %rdi
352 CFI_ADJUST_CFA_OFFSET 8
353 call schedule
354 popq %rdi
355 CFI_ADJUST_CFA_OFFSET -8
356 cli
357 TRACE_IRQS_OFF
358 jmp int_with_check
359
360 /* handle signals and tracing -- both require a full stack frame */
361int_very_careful:
362 TRACE_IRQS_ON
363 sti
364 SAVE_REST
365 /* Check for syscall exit trace */
366 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
367 jz int_signal
368 pushq %rdi
369 CFI_ADJUST_CFA_OFFSET 8
370 leaq 8(%rsp),%rdi # &ptregs -> arg1
371 call syscall_trace_leave
372 popq %rdi
373 CFI_ADJUST_CFA_OFFSET -8
374 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
375 jmp int_restore_rest
376
377int_signal:
378 testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
379 jz 1f
380 movq %rsp,%rdi # &ptregs -> arg1
381 xorl %esi,%esi # oldset -> arg2
382 call do_notify_resume
3831: movl $_TIF_NEED_RESCHED,%edi
384int_restore_rest:
385 RESTORE_REST
386 cli
387 TRACE_IRQS_OFF
388 jmp int_with_check
389 CFI_ENDPROC
390END(system_call)
391
392/*
393 * Certain special system calls that need to save a complete full stack frame.
394 */
395
396 .macro PTREGSCALL label,func,arg
397 .globl \label
398\label:
399 leaq \func(%rip),%rax
400 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
401 jmp ptregscall_common
402END(\label)
403 .endm
404
405 CFI_STARTPROC
406
407 PTREGSCALL stub_clone, sys_clone, %r8
408 PTREGSCALL stub_fork, sys_fork, %rdi
409 PTREGSCALL stub_vfork, sys_vfork, %rdi
410 PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
411 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
412 PTREGSCALL stub_iopl, sys_iopl, %rsi
413
414ENTRY(ptregscall_common)
415 popq %r11
416 CFI_ADJUST_CFA_OFFSET -8
417 CFI_REGISTER rip, r11
418 SAVE_REST
419 movq %r11, %r15
420 CFI_REGISTER rip, r15
421 FIXUP_TOP_OF_STACK %r11
422 call *%rax
423 RESTORE_TOP_OF_STACK %r11
424 movq %r15, %r11
425 CFI_REGISTER rip, r11
426 RESTORE_REST
427 pushq %r11
428 CFI_ADJUST_CFA_OFFSET 8
429 CFI_REL_OFFSET rip, 0
430 ret
431 CFI_ENDPROC
432END(ptregscall_common)
433
434ENTRY(stub_execve)
435 CFI_STARTPROC
436 popq %r11
437 CFI_ADJUST_CFA_OFFSET -8
438 CFI_REGISTER rip, r11
439 SAVE_REST
440 FIXUP_TOP_OF_STACK %r11
441 call sys_execve
442 RESTORE_TOP_OF_STACK %r11
443 movq %rax,RAX(%rsp)
444 RESTORE_REST
445 jmp int_ret_from_sys_call
446 CFI_ENDPROC
447END(stub_execve)
448
449/*
450 * sigreturn is special because it needs to restore all registers on return.
451 * This cannot be done with SYSRET, so use the IRET return path instead.
452 */
453ENTRY(stub_rt_sigreturn)
454 CFI_STARTPROC
455 addq $8, %rsp
456 CFI_ADJUST_CFA_OFFSET -8
457 SAVE_REST
458 movq %rsp,%rdi
459 FIXUP_TOP_OF_STACK %r11
460 call sys_rt_sigreturn
461 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
462 RESTORE_REST
463 jmp int_ret_from_sys_call
464 CFI_ENDPROC
465END(stub_rt_sigreturn)
466
467/*
468 * initial frame state for interrupts and exceptions
469 */
470 .macro _frame ref
471 CFI_STARTPROC simple
472 CFI_SIGNAL_FRAME
473 CFI_DEF_CFA rsp,SS+8-\ref
474 /*CFI_REL_OFFSET ss,SS-\ref*/
475 CFI_REL_OFFSET rsp,RSP-\ref
476 /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
477 /*CFI_REL_OFFSET cs,CS-\ref*/
478 CFI_REL_OFFSET rip,RIP-\ref
479 .endm
480
481/* initial frame state for interrupts (and exceptions without error code) */
482#define INTR_FRAME _frame RIP
483/* initial frame state for exceptions with error code (and interrupts with
484 vector already pushed) */
485#define XCPT_FRAME _frame ORIG_RAX
486
487/*
488 * Interrupt entry/exit.
489 *
490 * Interrupt entry points save only callee clobbered registers in fast path.
491 *
492 * Entry runs with interrupts off.
493 */
494
495/* 0(%rsp): interrupt number */
496 .macro interrupt func
497 cld
498 SAVE_ARGS
499 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
500 pushq %rbp
501 CFI_ADJUST_CFA_OFFSET 8
502 CFI_REL_OFFSET rbp, 0
503 movq %rsp,%rbp
504 CFI_DEF_CFA_REGISTER rbp
505 testl $3,CS(%rdi)
506 je 1f
507 swapgs
508 /* irqcount is used to check if a CPU is already on an interrupt
509 stack or not. While this is essentially redundant with preempt_count
510 it is a little cheaper to use a separate counter in the PDA
511 (short of moving irq_enter into assembly, which would be too
512 much work) */
5131: incl %gs:pda_irqcount
514 cmoveq %gs:pda_irqstackptr,%rsp
515 push %rbp # backlink for old unwinder
516 /*
517 * We entered an interrupt context - irqs are off:
518 */
519 TRACE_IRQS_OFF
520 call \func
521 .endm
522
523ENTRY(common_interrupt)
524 XCPT_FRAME
525 interrupt do_IRQ
526 /* 0(%rsp): oldrsp-ARGOFFSET */
527ret_from_intr:
528 cli
529 TRACE_IRQS_OFF
530 decl %gs:pda_irqcount
531 leaveq
532 CFI_DEF_CFA_REGISTER rsp
533 CFI_ADJUST_CFA_OFFSET -8
534exit_intr:
535 GET_THREAD_INFO(%rcx)
536 testl $3,CS-ARGOFFSET(%rsp)
537 je retint_kernel
538
539 /* Interrupt came from user space */
540 /*
541 * Has a correct top of stack, but a partial stack frame
542 * %rcx: thread info. Interrupts off.
543 */
544retint_with_reschedule:
545 movl $_TIF_WORK_MASK,%edi
546retint_check:
547 movl threadinfo_flags(%rcx),%edx
548 andl %edi,%edx
549 CFI_REMEMBER_STATE
550 jnz retint_careful
551retint_swapgs:
552 /*
553 * The iretq could re-enable interrupts:
554 */
555 cli
556 TRACE_IRQS_IRETQ
557 swapgs
558 jmp restore_args
559
560retint_restore_args:
561 cli
562 /*
563 * The iretq could re-enable interrupts:
564 */
565 TRACE_IRQS_IRETQ
566restore_args:
567 RESTORE_ARGS 0,8,0
568iret_label:
569 iretq
570
571 .section __ex_table,"a"
572 .quad iret_label,bad_iret
573 .previous
574 .section .fixup,"ax"
575 /* force a signal here? this matches i386 behaviour */
576 /* running with kernel gs */
577bad_iret:
578 movq $11,%rdi /* SIGSEGV */
579 TRACE_IRQS_ON
580 sti
581 jmp do_exit
582 .previous
583
584 /* edi: workmask, edx: work */
585retint_careful:
586 CFI_RESTORE_STATE
587 bt $TIF_NEED_RESCHED,%edx
588 jnc retint_signal
589 TRACE_IRQS_ON
590 sti
591 pushq %rdi
592 CFI_ADJUST_CFA_OFFSET 8
593 call schedule
594 popq %rdi
595 CFI_ADJUST_CFA_OFFSET -8
596 GET_THREAD_INFO(%rcx)
597 cli
598 TRACE_IRQS_OFF
599 jmp retint_check
600
601retint_signal:
602 testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
603 jz retint_swapgs
604 TRACE_IRQS_ON
605 sti
606 SAVE_REST
607 movq $-1,ORIG_RAX(%rsp)
608 xorl %esi,%esi # oldset
609 movq %rsp,%rdi # &pt_regs
610 call do_notify_resume
611 RESTORE_REST
612 cli
613 TRACE_IRQS_OFF
614 movl $_TIF_NEED_RESCHED,%edi
615 GET_THREAD_INFO(%rcx)
616 jmp retint_check
617
618#ifdef CONFIG_PREEMPT
619 /* Returning to kernel space. Check if we need preemption */
620 /* rcx: threadinfo. interrupts off. */
621ENTRY(retint_kernel)
622 cmpl $0,threadinfo_preempt_count(%rcx)
623 jnz retint_restore_args
624 bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
625 jnc retint_restore_args
626 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
627 jnc retint_restore_args
628 call preempt_schedule_irq
629 jmp exit_intr
630#endif
631
632 CFI_ENDPROC
633END(common_interrupt)
634
635/*
636 * APIC interrupts.
637 */
638 .macro apicinterrupt num,func
639 INTR_FRAME
640 pushq $~(\num)
641 CFI_ADJUST_CFA_OFFSET 8
642 interrupt \func
643 jmp ret_from_intr
644 CFI_ENDPROC
645 .endm
646
647ENTRY(thermal_interrupt)
648 apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
649END(thermal_interrupt)
650
651ENTRY(threshold_interrupt)
652 apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
653END(threshold_interrupt)
654
655#ifdef CONFIG_SMP
656ENTRY(reschedule_interrupt)
657 apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
658END(reschedule_interrupt)
659
660 .macro INVALIDATE_ENTRY num
661ENTRY(invalidate_interrupt\num)
662 apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
663END(invalidate_interrupt\num)
664 .endm
665
666 INVALIDATE_ENTRY 0
667 INVALIDATE_ENTRY 1
668 INVALIDATE_ENTRY 2
669 INVALIDATE_ENTRY 3
670 INVALIDATE_ENTRY 4
671 INVALIDATE_ENTRY 5
672 INVALIDATE_ENTRY 6
673 INVALIDATE_ENTRY 7
674
675ENTRY(call_function_interrupt)
676 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
677END(call_function_interrupt)
678ENTRY(irq_move_cleanup_interrupt)
679 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
680END(irq_move_cleanup_interrupt)
681#endif
682
683ENTRY(apic_timer_interrupt)
684 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
685END(apic_timer_interrupt)
686
687ENTRY(error_interrupt)
688 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
689END(error_interrupt)
690
691ENTRY(spurious_interrupt)
692 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
693END(spurious_interrupt)
694
695/*
696 * Exception entry points.
697 */
698 .macro zeroentry sym
699 INTR_FRAME
700 pushq $0 /* push error code/oldrax */
701 CFI_ADJUST_CFA_OFFSET 8
702 pushq %rax /* push real oldrax to the rdi slot */
703 CFI_ADJUST_CFA_OFFSET 8
704 CFI_REL_OFFSET rax,0
705 leaq \sym(%rip),%rax
706 jmp error_entry
707 CFI_ENDPROC
708 .endm
709
710 .macro errorentry sym
711 XCPT_FRAME
712 pushq %rax
713 CFI_ADJUST_CFA_OFFSET 8
714 CFI_REL_OFFSET rax,0
715 leaq \sym(%rip),%rax
716 jmp error_entry
717 CFI_ENDPROC
718 .endm
719
720 /* error code is on the stack already */
721 /* handle NMI like exceptions that can happen everywhere */
722 .macro paranoidentry sym, ist=0, irqtrace=1
723 SAVE_ALL
724 cld
725 movl $1,%ebx
726 movl $MSR_GS_BASE,%ecx
727 rdmsr
728 testl %edx,%edx
729 js 1f
730 swapgs
731 xorl %ebx,%ebx
7321:
733 .if \ist
734 movq %gs:pda_data_offset, %rbp
735 .endif
736 movq %rsp,%rdi
737 movq ORIG_RAX(%rsp),%rsi
738 movq $-1,ORIG_RAX(%rsp)
739 .if \ist
740 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
741 .endif
742 call \sym
743 .if \ist
744 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
745 .endif
746 cli
747 .if \irqtrace
748 TRACE_IRQS_OFF
749 .endif
750 .endm
751
752 /*
753 * "Paranoid" exit path from exception stack.
754 * Paranoid because this is used by NMIs and cannot take
755 * any kernel state for granted.
756 * We don't do kernel preemption checks here, because only
757 * NMI should be common and it does not enable IRQs and
758 * cannot get reschedule ticks.
759 *
760 * "trace" is 0 for the NMI handler only, because irq-tracing
761 * is fundamentally NMI-unsafe. (we cannot change the soft and
762 * hard flags at once, atomically)
763 */
764 .macro paranoidexit trace=1
765 /* ebx: no swapgs flag */
766paranoid_exit\trace:
767 testl %ebx,%ebx /* swapgs needed? */
768 jnz paranoid_restore\trace
769 testl $3,CS(%rsp)
770 jnz paranoid_userspace\trace
771paranoid_swapgs\trace:
772 .if \trace
773 TRACE_IRQS_IRETQ 0
774 .endif
775 swapgs
776paranoid_restore\trace:
777 RESTORE_ALL 8
778 iretq
779paranoid_userspace\trace:
780 GET_THREAD_INFO(%rcx)
781 movl threadinfo_flags(%rcx),%ebx
782 andl $_TIF_WORK_MASK,%ebx
783 jz paranoid_swapgs\trace
784 movq %rsp,%rdi /* &pt_regs */
785 call sync_regs
786 movq %rax,%rsp /* switch stack for scheduling */
787 testl $_TIF_NEED_RESCHED,%ebx
788 jnz paranoid_schedule\trace
789 movl %ebx,%edx /* arg3: thread flags */
790 .if \trace
791 TRACE_IRQS_ON
792 .endif
793 sti
794 xorl %esi,%esi /* arg2: oldset */
795 movq %rsp,%rdi /* arg1: &pt_regs */
796 call do_notify_resume
797 cli
798 .if \trace
799 TRACE_IRQS_OFF
800 .endif
801 jmp paranoid_userspace\trace
802paranoid_schedule\trace:
803 .if \trace
804 TRACE_IRQS_ON
805 .endif
806 sti
807 call schedule
808 cli
809 .if \trace
810 TRACE_IRQS_OFF
811 .endif
812 jmp paranoid_userspace\trace
813 CFI_ENDPROC
814 .endm
815
816/*
817 * Exception entry point. This expects an error code/orig_rax on the stack
818 * and the exception handler in %rax.
819 */
820KPROBE_ENTRY(error_entry)
821 _frame RDI
822 CFI_REL_OFFSET rax,0
823 /* rdi slot contains rax, oldrax contains error code */
824 cld
825 subq $14*8,%rsp
826 CFI_ADJUST_CFA_OFFSET (14*8)
827 movq %rsi,13*8(%rsp)
828 CFI_REL_OFFSET rsi,RSI
829 movq 14*8(%rsp),%rsi /* load rax from rdi slot */
830 CFI_REGISTER rax,rsi
831 movq %rdx,12*8(%rsp)
832 CFI_REL_OFFSET rdx,RDX
833 movq %rcx,11*8(%rsp)
834 CFI_REL_OFFSET rcx,RCX
835 movq %rsi,10*8(%rsp) /* store rax */
836 CFI_REL_OFFSET rax,RAX
837 movq %r8, 9*8(%rsp)
838 CFI_REL_OFFSET r8,R8
839 movq %r9, 8*8(%rsp)
840 CFI_REL_OFFSET r9,R9
841 movq %r10,7*8(%rsp)
842 CFI_REL_OFFSET r10,R10
843 movq %r11,6*8(%rsp)
844 CFI_REL_OFFSET r11,R11
845 movq %rbx,5*8(%rsp)
846 CFI_REL_OFFSET rbx,RBX
847 movq %rbp,4*8(%rsp)
848 CFI_REL_OFFSET rbp,RBP
849 movq %r12,3*8(%rsp)
850 CFI_REL_OFFSET r12,R12
851 movq %r13,2*8(%rsp)
852 CFI_REL_OFFSET r13,R13
853 movq %r14,1*8(%rsp)
854 CFI_REL_OFFSET r14,R14
855 movq %r15,(%rsp)
856 CFI_REL_OFFSET r15,R15
857 xorl %ebx,%ebx
858 testl $3,CS(%rsp)
859 je error_kernelspace
860error_swapgs:
861 swapgs
862error_sti:
863 movq %rdi,RDI(%rsp)
864 CFI_REL_OFFSET rdi,RDI
865 movq %rsp,%rdi
866 movq ORIG_RAX(%rsp),%rsi /* get error code */
867 movq $-1,ORIG_RAX(%rsp)
868 call *%rax
869 /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
870error_exit:
871 movl %ebx,%eax
872 RESTORE_REST
873 cli
874 TRACE_IRQS_OFF
875 GET_THREAD_INFO(%rcx)
876 testl %eax,%eax
877 jne retint_kernel
878 movl threadinfo_flags(%rcx),%edx
879 movl $_TIF_WORK_MASK,%edi
880 andl %edi,%edx
881 jnz retint_careful
882 /*
883 * The iret might restore flags:
884 */
885 TRACE_IRQS_IRETQ
886 swapgs
887 RESTORE_ARGS 0,8,0
888 jmp iret_label
889 CFI_ENDPROC
890
891error_kernelspace:
892 incl %ebx
893 /* There are two places in the kernel that can potentially fault with
894 usergs. Handle them here. The exception handlers after
895 iret run with kernel gs again, so don't set the user space flag.
896 B stepping K8s sometimes report a truncated RIP for IRET
897 exceptions returning to compat mode. Check for these here too. */
898 leaq iret_label(%rip),%rbp
899 cmpq %rbp,RIP(%rsp)
900 je error_swapgs
901 movl %ebp,%ebp /* zero extend */
902 cmpq %rbp,RIP(%rsp)
903 je error_swapgs
904 cmpq $gs_change,RIP(%rsp)
905 je error_swapgs
906 jmp error_sti
907KPROBE_END(error_entry)
908
909 /* Reload gs selector with exception handling */
910 /* edi: new selector */
911ENTRY(load_gs_index)
912 CFI_STARTPROC
913 pushf /* save the flags; the popf below restores them */
914 CFI_ADJUST_CFA_OFFSET 8
915 cli /* interrupts off for the swapgs ... swapgs window */
916 swapgs
917gs_change:
918 movl %edi,%gs /* the load that the __ex_table entry below covers */
9192: mfence /* workaround */
920 swapgs
921 popf
922 CFI_ADJUST_CFA_OFFSET -8
923 ret
924 CFI_ENDPROC
925ENDPROC(load_gs_index)
926
927 .section __ex_table,"a"
928 .align 8
929 .quad gs_change,bad_gs /* pairs the gs_change address with the bad_gs fixup */
930 .previous
931 .section .fixup,"ax"
932 /* running with kernelgs */
933bad_gs:
934 swapgs /* switch back to user gs */
935 xorl %eax,%eax
936 movl %eax,%gs /* load selector 0 instead of the one in edi */
937 jmp 2b /* rejoin the normal path at the mfence */
938 .previous
939
940/*
941 * Create a kernel thread.
942 *
943 * C extern interface:
944 * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
945 *
946 * asm input arguments:
947 * rdi: fn, rsi: arg, rdx: flags
948 */
949ENTRY(kernel_thread)
950 CFI_STARTPROC
951 FAKE_STACK_FRAME $child_rip /* fake frame whose rip slot is child_rip */
952 SAVE_ALL
953
954 # rdi: flags, rsi: usp, rdx: will be &pt_regs
955 movq %rdx,%rdi
956 orq kernel_thread_flags(%rip),%rdi /* OR kernel_thread_flags into the caller's flags */
957 movq $-1, %rsi
958 movq %rsp, %rdx
959
960 xorl %r8d,%r8d /* remaining register arguments are zero */
961 xorl %r9d,%r9d
962
963 # clone now
964 call do_fork
965 movq %rax,RAX(%rsp) /* store do_fork's result in the frame's RAX slot */
966 xorl %edi,%edi
967
968 /*
969 * It isn't worth checking for a reschedule here,
970 * so internally to the x86_64 port you can rely on kernel_thread()
971 * not to reschedule the child before returning; this avoids the need
972 * for hacks, for example to fork off the per-CPU idle tasks.
973 * [Hopefully no generic code relies on the reschedule -AK]
974 */
975 RESTORE_ALL
976 UNFAKE_STACK_FRAME /* drop the six slots pushed by FAKE_STACK_FRAME */
977 ret
978 CFI_ENDPROC
979ENDPROC(kernel_thread)
980
981child_rip:
982 pushq $0 # fake return address
983 CFI_STARTPROC
984 /*
985 * Here we are in the child and the registers are set as they were
986 * at kernel_thread() invocation in the parent.
987 */
988 movq %rdi, %rax
989 movq %rsi, %rdi
990 call *%rax
991 # exit
992 xorl %edi, %edi
993 call do_exit
994 CFI_ENDPROC
995ENDPROC(child_rip)
996
997/*
998 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
999 *
1000 * C extern interface:
1001 * extern long execve(char *name, char **argv, char **envp)
1002 *
1003 * asm input arguments:
1004 * rdi: name, rsi: argv, rdx: envp
1005 *
1006 * We want to fallback into:
1007 * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
1008 *
1009 * do_sys_execve asm fallback arguments:
1010 * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
1011 */
1012ENTRY(kernel_execve)
1013 CFI_STARTPROC
1014 FAKE_STACK_FRAME $0
1015 SAVE_ALL
1016 call sys_execve
1017 movq %rax, RAX(%rsp)
1018 RESTORE_REST
1019 testq %rax,%rax
1020 je int_ret_from_sys_call
1021 RESTORE_ARGS
1022 UNFAKE_STACK_FRAME
1023 ret
1024 CFI_ENDPROC
1025ENDPROC(kernel_execve)
1026
1027KPROBE_ENTRY(page_fault)
1028 errorentry do_page_fault
1029KPROBE_END(page_fault)
1030
1031ENTRY(coprocessor_error)
1032 zeroentry do_coprocessor_error
1033END(coprocessor_error)
1034
1035ENTRY(simd_coprocessor_error)
1036 zeroentry do_simd_coprocessor_error
1037END(simd_coprocessor_error)
1038
1039ENTRY(device_not_available)
1040 zeroentry math_state_restore
1041END(device_not_available)
1042
1043 /* runs on exception stack */
1044KPROBE_ENTRY(debug)
1045 INTR_FRAME
1046 pushq $0
1047 CFI_ADJUST_CFA_OFFSET 8
1048 paranoidentry do_debug, DEBUG_STACK
1049 paranoidexit
1050KPROBE_END(debug)
1051
1052 /* runs on exception stack */
1053KPROBE_ENTRY(nmi)
1054 INTR_FRAME
1055 pushq $-1
1056 CFI_ADJUST_CFA_OFFSET 8
1057 paranoidentry do_nmi, 0, 0
1058#ifdef CONFIG_TRACE_IRQFLAGS
1059 paranoidexit 0
1060#else
1061 jmp paranoid_exit1
1062 CFI_ENDPROC
1063#endif
1064KPROBE_END(nmi)
1065
1066KPROBE_ENTRY(int3)
1067 INTR_FRAME
1068 pushq $0
1069 CFI_ADJUST_CFA_OFFSET 8
1070 paranoidentry do_int3, DEBUG_STACK
1071 jmp paranoid_exit1
1072 CFI_ENDPROC
1073KPROBE_END(int3)
1074
1075ENTRY(overflow)
1076 zeroentry do_overflow
1077END(overflow)
1078
1079ENTRY(bounds)
1080 zeroentry do_bounds
1081END(bounds)
1082
1083ENTRY(invalid_op)
1084 zeroentry do_invalid_op
1085END(invalid_op)
1086
1087ENTRY(coprocessor_segment_overrun)
1088 zeroentry do_coprocessor_segment_overrun
1089END(coprocessor_segment_overrun)
1090
1091ENTRY(reserved)
1092 zeroentry do_reserved
1093END(reserved)
1094
1095 /* runs on exception stack */
1096ENTRY(double_fault)
1097 XCPT_FRAME
1098 paranoidentry do_double_fault
1099 jmp paranoid_exit1
1100 CFI_ENDPROC
1101END(double_fault)
1102
1103ENTRY(invalid_TSS)
1104 errorentry do_invalid_TSS
1105END(invalid_TSS)
1106
1107ENTRY(segment_not_present)
1108 errorentry do_segment_not_present
1109END(segment_not_present)
1110
1111 /* runs on exception stack */
1112ENTRY(stack_segment)
1113 XCPT_FRAME
1114 paranoidentry do_stack_segment
1115 jmp paranoid_exit1
1116 CFI_ENDPROC
1117END(stack_segment)
1118
1119KPROBE_ENTRY(general_protection)
1120 errorentry do_general_protection
1121KPROBE_END(general_protection)
1122
1123ENTRY(alignment_check)
1124 errorentry do_alignment_check
1125END(alignment_check)
1126
1127ENTRY(divide_error)
1128 zeroentry do_divide_error
1129END(divide_error)
1130
1131ENTRY(spurious_interrupt_bug)
1132 zeroentry do_spurious_interrupt_bug
1133END(spurious_interrupt_bug)
1134
1135#ifdef CONFIG_X86_MCE
1136 /* runs on exception stack */
1137ENTRY(machine_check)
1138 INTR_FRAME
1139 pushq $0
1140 CFI_ADJUST_CFA_OFFSET 8
1141 paranoidentry do_machine_check
1142 jmp paranoid_exit1
1143 CFI_ENDPROC
1144END(machine_check)
1145#endif
1146
1147/* Call softirq on interrupt stack. Interrupts are off. */
1148ENTRY(call_softirq)
1149 CFI_STARTPROC
1150 push %rbp
1151 CFI_ADJUST_CFA_OFFSET 8
1152 CFI_REL_OFFSET rbp,0
1153 mov %rsp,%rbp /* keep the incoming stack pointer for leaveq */
1154 CFI_DEF_CFA_REGISTER rbp
1155 incl %gs:pda_irqcount /* ZF is set only if the count became 0 */
1156 cmove %gs:pda_irqstackptr,%rsp /* switch to the irq stack only when ZF is set */
1157 push %rbp # backlink for old unwinder
1158 call __do_softirq
1159 leaveq /* rsp = rbp, pop rbp: back on the incoming stack */
1160 CFI_DEF_CFA_REGISTER rsp
1161 CFI_ADJUST_CFA_OFFSET -8
1162 decl %gs:pda_irqcount /* undo the increment above */
1163 ret
1164 CFI_ENDPROC
1165ENDPROC(call_softirq)
1166
1167KPROBE_ENTRY(ignore_sysret)
1168 CFI_STARTPROC
1169 mov $-ENOSYS,%eax /* result: -ENOSYS */
1170 sysret /* return at once; nothing is saved or dispatched */
1171 CFI_ENDPROC
1172ENDPROC(ignore_sysret)
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
new file mode 100644
index 000000000000..47496a40e84f
--- /dev/null
+++ b/arch/x86/kernel/genapic_64.c
@@ -0,0 +1,66 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Generic APIC sub-arch probe layer.
6 *
7 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
8 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
9 * James Cleverdon.
10 */
11#include <linux/threads.h>
12#include <linux/cpumask.h>
13#include <linux/string.h>
14#include <linux/module.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h>
17#include <linux/init.h>
18
19#include <asm/smp.h>
20#include <asm/ipi.h>
21#include <asm/genapic.h>
22
23#ifdef CONFIG_ACPI
24#include <acpi/acpi_bus.h>
25#endif
26
27/* which logical CPU number maps to which CPU (physical APIC ID) */
28u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly
29 = { [0 ... NR_CPUS-1] = BAD_APICID };
30EXPORT_SYMBOL(x86_cpu_to_apicid);
31
32u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
33
34struct genapic __read_mostly *genapic = &apic_flat;
35
36/*
37 * Choose the APIC routing mode from the ACPI FADT flags and the possible-CPU count.
38 */
39void __init setup_apic_routing(void)
40{
41#ifdef CONFIG_ACPI
42 /*
43 * Quirk: some x86_64 machines can only use physical APIC mode
44 * regardless of how many processors are present (x86_64 ES7000
45 * is an example).
46 */
47 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
48 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
49 genapic = &apic_physflat;
50 else /* chains into the CPU-count test after the #endif */
51#endif
52
53 if (cpus_weight(cpu_possible_map) <= 8) /* at most 8 possible CPUs: flat mode */
54 genapic = &apic_flat;
55 else
56 genapic = &apic_physflat;
57
58 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
59}
60
61/* Same for both flat and physical. */
62
63void send_IPI_self(int vector)
64{ /* Send the given vector to the calling CPU via the APIC_DEST_SELF shortcut. */
65 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
66}
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
new file mode 100644
index 000000000000..ecb01eefdd27
--- /dev/null
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -0,0 +1,194 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Flat APIC subarch code.
6 *
7 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
8 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
9 * James Cleverdon.
10 */
11#include <linux/errno.h>
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/string.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h>
17#include <linux/init.h>
18#include <asm/smp.h>
19#include <asm/ipi.h>
20#include <asm/genapic.h>
21
22static cpumask_t flat_target_cpus(void)
23{
24 return cpu_online_map;
25}
26
27static cpumask_t flat_vector_allocation_domain(int cpu)
28{
29 /* Careful. Some cpus do not strictly honor the set of cpus
30 * specified in the interrupt destination when using lowest
31 * priority interrupt delivery mode.
32 *
33 * In particular there was a hyperthreading cpu observed to
34 * deliver interrupts to the wrong hyperthread when only one
35 * hyperthread was specified in the interrupt desitination.
36 */
37 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
38 return domain;
39}
40
41/*
42 * Set up the logical destination ID.
43 *
44 * Intel recommends to set DFR, LDR and TPR before enabling
45 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
46 * document number 292116). So here it goes...
47 */
48static void flat_init_apic_ldr(void)
49{
50 unsigned long val;
51 unsigned long num, id;
52
53 num = smp_processor_id();
54 id = 1UL << num;
55 x86_cpu_to_log_apicid[num] = id;
56 apic_write(APIC_DFR, APIC_DFR_FLAT);
57 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
58 val |= SET_APIC_LOGICAL_ID(id);
59 apic_write(APIC_LDR, val);
60}
61
62static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
63{
64 unsigned long mask = cpus_addr(cpumask)[0];
65 unsigned long flags;
66
67 local_irq_save(flags);
68 __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
69 local_irq_restore(flags);
70}
71
72static void flat_send_IPI_allbutself(int vector)
73{
74#ifdef CONFIG_HOTPLUG_CPU
75 int hotplug = 1;
76#else
77 int hotplug = 0;
78#endif
79 if (hotplug || vector == NMI_VECTOR) {
80 cpumask_t allbutme = cpu_online_map;
81
82 cpu_clear(smp_processor_id(), allbutme);
83
84 if (!cpus_empty(allbutme))
85 flat_send_IPI_mask(allbutme, vector);
86 } else if (num_online_cpus() > 1) {
87 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
88 }
89}
90
91static void flat_send_IPI_all(int vector)
92{
93 if (vector == NMI_VECTOR)
94 flat_send_IPI_mask(cpu_online_map, vector);
95 else
96 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
97}
98
99static int flat_apic_id_registered(void)
100{
101 return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
102}
103
104static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
105{
106 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
107}
108
/*
 * Package ID of the current CPU: its hardware processor ID with the low
 * @index_msb bits shifted out.
 */
static unsigned int phys_pkg_id(int index_msb)
{
	return hard_smp_processor_id() >> index_msb;
}
113
114struct genapic apic_flat = {
115 .name = "flat",
116 .int_delivery_mode = dest_LowestPrio,
117 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
118 .target_cpus = flat_target_cpus,
119 .vector_allocation_domain = flat_vector_allocation_domain,
120 .apic_id_registered = flat_apic_id_registered,
121 .init_apic_ldr = flat_init_apic_ldr,
122 .send_IPI_all = flat_send_IPI_all,
123 .send_IPI_allbutself = flat_send_IPI_allbutself,
124 .send_IPI_mask = flat_send_IPI_mask,
125 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
126 .phys_pkg_id = phys_pkg_id,
127};
128
129/*
130 * Physflat mode is used when there are more than 8 CPUs on an AMD system.
131 * We cannot use logical delivery in this case because the mask
132 * overflows, so use physical mode.
133 */
134
135static cpumask_t physflat_target_cpus(void)
136{
137 return cpu_online_map;
138}
139
140static cpumask_t physflat_vector_allocation_domain(int cpu)
141{
142 cpumask_t domain = CPU_MASK_NONE;
143 cpu_set(cpu, domain);
144 return domain;
145}
146
147
148static void physflat_send_IPI_mask(cpumask_t cpumask, int vector)
149{
150 send_IPI_mask_sequence(cpumask, vector);
151}
152
153static void physflat_send_IPI_allbutself(int vector)
154{
155 cpumask_t allbutme = cpu_online_map;
156
157 cpu_clear(smp_processor_id(), allbutme);
158 physflat_send_IPI_mask(allbutme, vector);
159}
160
161static void physflat_send_IPI_all(int vector)
162{
163 physflat_send_IPI_mask(cpu_online_map, vector);
164}
165
166static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
167{
168 int cpu;
169
170 /*
171 * We're using fixed IRQ delivery, can only return one phys APIC ID.
172 * May as well be the first.
173 */
174 cpu = first_cpu(cpumask);
175 if ((unsigned)cpu < NR_CPUS)
176 return x86_cpu_to_apicid[cpu];
177 else
178 return BAD_APICID;
179}
180
181struct genapic apic_physflat = {
182 .name = "physical flat",
183 .int_delivery_mode = dest_Fixed,
184 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
185 .target_cpus = physflat_target_cpus,
186 .vector_allocation_domain = physflat_vector_allocation_domain,
187 .apic_id_registered = flat_apic_id_registered,
188 .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/
189 .send_IPI_all = physflat_send_IPI_all,
190 .send_IPI_allbutself = physflat_send_IPI_allbutself,
191 .send_IPI_mask = physflat_send_IPI_mask,
192 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
193 .phys_pkg_id = phys_pkg_id,
194};
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
new file mode 100644
index 000000000000..6c34bdd22e26
--- /dev/null
+++ b/arch/x86/kernel/head64.c
@@ -0,0 +1,86 @@
1/*
2 * linux/arch/x86_64/kernel/head64.c -- prepare to run common code
3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 */
6
7#include <linux/init.h>
8#include <linux/linkage.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/string.h>
12#include <linux/percpu.h>
13
14#include <asm/processor.h>
15#include <asm/proto.h>
16#include <asm/smp.h>
17#include <asm/bootsetup.h>
18#include <asm/setup.h>
19#include <asm/desc.h>
20#include <asm/pgtable.h>
21#include <asm/tlbflush.h>
22#include <asm/sections.h>
23
24static void __init zap_identity_mappings(void)
25{
26 pgd_t *pgd = pgd_offset_k(0UL);
27 pgd_clear(pgd);
28 __flush_tlb();
29}
30
31/* Don't add a printk in there. printk relies on the PDA which is not initialized
32 yet. */
33static void __init clear_bss(void)
34{
35 memset(__bss_start, 0,
36 (unsigned long) __bss_stop - (unsigned long) __bss_start);
37}
38
39#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
40#define OLD_CL_MAGIC_ADDR 0x20
41#define OLD_CL_MAGIC 0xA33F
42#define OLD_CL_OFFSET 0x22
43
44static void __init copy_bootdata(char *real_mode_data)
45{
46 unsigned long new_data;
47 char * command_line;
48
49 memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
50 new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER);
51 if (!new_data) {
52 if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) {
53 return;
54 }
55 new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET);
56 }
57 command_line = __va(new_data);
58 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
59}
60
61void __init x86_64_start_kernel(char * real_mode_data)
62{
63 int i;
64
65 /* clear bss before set_intr_gate with early_idt_handler */
66 clear_bss();
67
68 /* Make NULL pointers segfault */
69 zap_identity_mappings();
70
71 for (i = 0; i < IDT_ENTRIES; i++)
72 set_intr_gate(i, early_idt_handler);
73 asm volatile("lidt %0" :: "m" (idt_descr));
74
75 early_printk("Kernel alive\n");
76
77 for (i = 0; i < NR_CPUS; i++)
78 cpu_pda(i) = &boot_cpu_pda[i];
79
80 pda_init(0);
81 copy_bootdata(__va(real_mode_data));
82#ifdef CONFIG_SMP
83 cpu_set(0, cpu_online_map);
84#endif
85 start_kernel();
86}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
new file mode 100644
index 000000000000..b6167fe3330e
--- /dev/null
+++ b/arch/x86/kernel/head_64.S
@@ -0,0 +1,416 @@
1/*
2 * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
7 * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
8 * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
9 */
10
11
12#include <linux/linkage.h>
13#include <linux/threads.h>
14#include <linux/init.h>
15#include <asm/desc.h>
16#include <asm/segment.h>
17#include <asm/pgtable.h>
18#include <asm/page.h>
19#include <asm/msr.h>
20#include <asm/cache.h>
21
22/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
23 * because we need identity-mapped pages.
24 *
25 */
26
27 .text
28 .section .text.head
29 .code64
30 .globl startup_64
/*
 * startup_64: 64-bit entry point used once at boot.
 *
 * Computes in %rbp the difference between the address the kernel was
 * compiled for and the address it is running at, rejects loads that are
 * not 2M aligned or are at/above PGDIR_SIZE, and adds that difference to
 * the physical addresses stored in the static page tables and in
 * phys_base.  It then continues in secondary_startup_64.
 *
 * NOTE(review): the comment below says CS.D = 1; the __KERNEL_CS
 * descriptor in cpu_gdt_table has L set and D clear -- confirm which
 * value is meant.
 */
31startup_64:
32
33 /*
34 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
35 * and someone has loaded an identity mapped page table
36 * for us. These identity mapped page tables map all of the
37 * kernel pages and possibly all of memory.
38 *
39 * %esi holds a physical pointer to real_mode_data.
40 *
41 * We come here either directly from a 64bit bootloader, or from
42 * arch/x86_64/boot/compressed/head.S.
43 *
44 * We only come here initially at boot nothing else comes here.
45 *
46 * Since we may be loaded at an address different from what we were
47 * compiled to run at we first fixup the physical addresses in our page
48 * tables and then reload them.
49 */
50
51 /* Compute the delta between the address I am compiled to run at and the
52 * address I am actually running at.
53 */
54 leaq _text(%rip), %rbp
55 subq $_text - __START_KERNEL_map, %rbp
56
57 /* Is the address not 2M aligned? */
58 movq %rbp, %rax
59 andl $~LARGE_PAGE_MASK, %eax
60 testl %eax, %eax
61 jnz bad_address
62
63 /* Is the address too large? */
64 leaq _text(%rip), %rdx
65 movq $PGDIR_SIZE, %rax
66 cmpq %rax, %rdx
67 jae bad_address
68
69 /* Fixup the physical addresses in the page table
70 */
/* Slots 0, 258 and 511 are the only populated init_level4_pgt entries. */
71 addq %rbp, init_level4_pgt + 0(%rip)
72 addq %rbp, init_level4_pgt + (258*8)(%rip)
73 addq %rbp, init_level4_pgt + (511*8)(%rip)
74
75 addq %rbp, level3_ident_pgt + 0(%rip)
76
77 addq %rbp, level3_kernel_pgt + (510*8)(%rip)
78 addq %rbp, level3_kernel_pgt + (511*8)(%rip)
79
80 addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
81
82 /* Add an Identity mapping if I am above 1G */
83 leaq _text(%rip), %rdi
84 andq $LARGE_PAGE_MASK, %rdi
85
/* %rax = PUD index of the load address; index 0 is already mapped. */
86 movq %rdi, %rax
87 shrq $PUD_SHIFT, %rax
88 andq $(PTRS_PER_PUD - 1), %rax
89 jz ident_complete
90
/* Hook level2_spare_pgt into level3_ident_pgt at that PUD index. */
91 leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
92 leaq level3_ident_pgt(%rip), %rbx
93 movq %rdx, 0(%rbx, %rax, 8)
94
/* Map the single 2M page holding _text in level2_spare_pgt. */
95 movq %rdi, %rax
96 shrq $PMD_SHIFT, %rax
97 andq $(PTRS_PER_PMD - 1), %rax
98 leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
99 leaq level2_spare_pgt(%rip), %rbx
100 movq %rdx, 0(%rbx, %rax, 8)
101ident_complete:
102
103 /* Fixup the kernel text+data virtual addresses
104 */
/* Walk all 512 entries (4096 bytes) of level2_kernel_pgt; %r8 = end. */
105 leaq level2_kernel_pgt(%rip), %rdi
106 leaq 4096(%rdi), %r8
107 /* See if it is a valid page table entry */
1081: testq $1, 0(%rdi)
109 jz 2f
110 addq %rbp, 0(%rdi)
111 /* Go to the next page */
1122: addq $8, %rdi
113 cmp %r8, %rdi
114 jne 1b
115
116 /* Fixup phys_base */
117 addq %rbp, phys_base(%rip)
118
119#ifdef CONFIG_SMP
120 addq %rbp, trampoline_level4_pgt + 0(%rip)
121 addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
122#endif
123#ifdef CONFIG_ACPI_SLEEP
124 addq %rbp, wakeup_level4_pgt + 0(%rip)
125 addq %rbp, wakeup_level4_pgt + (511*8)(%rip)
126#endif
127
128 /* Due to ENTRY(), sometimes the empty space gets filled with
129 * zeros. Better take a jmp than relying on empty space being
130 * filled with 0x90 (nop)
131 */
132 jmp secondary_startup_64
/*
 * secondary_startup_64: common 64-bit startup path.
 *
 * Programs CR4, loads CR3 with init_level4_pgt (offset by phys_base),
 * jumps to virtual addresses, sets up EFER and CR0, loads the boot stack
 * from init_rsp, clears EFLAGS, loads the GDT and the data segment
 * registers, points MSR_GS_BASE at empty_zero_page as a dummy PDA, and
 * finally far-returns to the address stored in initial_code with
 * %edi = %esi (pointer to the real mode data).
 */
133ENTRY(secondary_startup_64)
134 /*
135 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
136 * and someone has loaded a mapped page table.
137 *
138 * %esi holds a physical pointer to real_mode_data.
139 *
140 * We come here either from startup_64 (using physical addresses)
141 * or from trampoline.S (using virtual addresses).
142 *
143 * Using virtual addresses from trampoline.S removes the need
144 * to have any identity mapped pages in the kernel page table
145 * after the boot processor executes this code.
146 */
147
148 /* Enable PAE mode and PGE */
/* CR4 is written with only bits 5 and 7 set; all other bits are zero. */
149 xorq %rax, %rax
150 btsq $5, %rax
151 btsq $7, %rax
152 movq %rax, %cr4
153
154 /* Setup early boot stage 4 level pagetables. */
155 movq $(init_level4_pgt - __START_KERNEL_map), %rax
156 addq phys_base(%rip), %rax
157 movq %rax, %cr3
158
159 /* Ensure I am executing from virtual addresses */
160 movq $1f, %rax
161 jmp *%rax
1621:
163
164 /* Check if nx is implemented */
/* The CPUID 0x80000001 %edx value is kept in %edi for the bit 20 test. */
165 movl $0x80000001, %eax
166 cpuid
167 movl %edx,%edi
168
169 /* Setup EFER (Extended Feature Enable Register) */
170 movl $MSR_EFER, %ecx
171 rdmsr
172 btsl $_EFER_SCE, %eax /* Enable System Call */
173 btl $20,%edi /* No Execute supported? */
174 jnc 1f
175 btsl $_EFER_NX, %eax
1761: wrmsr /* Make changes effective */
177
178 /* Setup cr0 */
179#define CR0_PM 1 /* protected mode */
180#define CR0_MP (1<<1)
181#define CR0_ET (1<<4)
182#define CR0_NE (1<<5)
183#define CR0_WP (1<<16)
184#define CR0_AM (1<<18)
185#define CR0_PAGING (1<<31)
186 movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax
187 /* Make changes effective */
188 movq %rax, %cr0
189
190 /* Setup a boot time stack */
191 movq init_rsp(%rip),%rsp
192
193 /* zero EFLAGS after setting rsp */
194 pushq $0
195 popfq
196
197 /*
198 * We must switch to a new descriptor in kernel space for the GDT
199 * because soon the kernel won't have access anymore to the userspace
200 * addresses where we're currently running on. We have to do that here
201 * because in 32bit we couldn't load a 64bit linear address.
202 */
203 lgdt cpu_gdt_descr(%rip)
204
205 /* set up data segments. actually 0 would do too */
206 movl $__KERNEL_DS,%eax
207 movl %eax,%ds
208 movl %eax,%ss
209 movl %eax,%es
210
211 /*
212 * We don't really need to load %fs or %gs, but load them anyway
213 * to kill any stale realmode selectors. This allows execution
214 * under VT hardware.
215 */
216 movl %eax,%fs
217 movl %eax,%gs
218
219 /*
220 * Setup up a dummy PDA. this is just for some early bootup code
221 * that does in_interrupt()
222 */
/* wrmsr takes the value in %edx:%eax, hence the split of %rax. */
223 movl $MSR_GS_BASE,%ecx
224 movq $empty_zero_page,%rax
225 movq %rax,%rdx
226 shrq $32,%rdx
227 wrmsr
228
229 /* esi is pointer to real mode structure with interesting info.
230 pass it to C */
231 movl %esi, %edi
232
233 /* Finally jump to run C code and to be on real kernel address
234 * Since we are running on identity-mapped space we have to jump
235 * to the full 64bit address, this is only possible as indirect
236 * jump. In addition we need to ensure %cs is set so we make this
237 * a far return.
238 */
239 movq initial_code(%rip),%rax
240 pushq $0 # fake return address to stop unwinder
241 pushq $__KERNEL_CS # set correct cs
242 pushq %rax # target address in negative space
243 lretq
244
245 /* SMP bootup changes these two */
246#ifndef CONFIG_HOTPLUG_CPU
247 .pushsection .init.data
248#endif
249 .align 8
250 .globl initial_code
251initial_code:
252 .quad x86_64_start_kernel
253#ifndef CONFIG_HOTPLUG_CPU
254 .popsection
255#endif
256 .globl init_rsp
257init_rsp:
258 .quad init_thread_union+THREAD_SIZE-8
259
260bad_address:
261 jmp bad_address
262
/*
 * early_idt_handler: handler installed in every IDT slot by
 * x86_64_start_kernel().
 *
 * Prints early_idt_msg through early_printk, calls dump_stack and (with
 * CONFIG_KALLSYMS) __print_symbol, then halts forever.
 * early_recursion_flag is incremented on each entry; once it has reached
 * 2 the handler skips straight to the halt loop, which limits nested
 * faults taken while reporting.
 *
 * NOTE(review): the stack offsets used below treat (%rsp) as an error
 * code and 8(%rsp) as the faulting rip.  That layout looks correct only
 * for exceptions that push an error code -- confirm for other vectors.
 */
263ENTRY(early_idt_handler)
264 cmpl $2,early_recursion_flag(%rip)
265 jz 1f
266 incl early_recursion_flag(%rip)
/* %eax = 0 for the variadic call; %rdi/%rsi/%rdx/%rcx carry the args. */
267 xorl %eax,%eax
268 movq 8(%rsp),%rsi # get rip
269 movq (%rsp),%rdx
270 movq %cr2,%rcx
271 leaq early_idt_msg(%rip),%rdi
272 call early_printk
273 cmpl $2,early_recursion_flag(%rip)
274 jz 1f
275 call dump_stack
276#ifdef CONFIG_KALLSYMS
277 leaq early_idt_ripmsg(%rip),%rdi
278 movq 8(%rsp),%rsi # get rip again
279 call __print_symbol
280#endif
2811: hlt
282 jmp 1b
/* Entry counter for the handler above. */
283early_recursion_flag:
284 .long 0
285
286early_idt_msg:
287 .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n"
288early_idt_ripmsg:
289 .asciz "RIP %s\n"
290
291.balign PAGE_SIZE
292
293#define NEXT_PAGE(name) \
294 .balign PAGE_SIZE; \
295ENTRY(name)
296
297/* Automate the creation of 1 to 1 mapping pmd entries */
298#define PMDS(START, PERM, COUNT) \
299 i = 0 ; \
300 .rept (COUNT) ; \
301 .quad (START) + (i << 21) + (PERM) ; \
302 i = i + 1 ; \
303 .endr
304
305 /*
306 * This default setting generates an ident mapping at address 0x100000
307 * and a mapping for the kernel that precisely maps virtual address
308 * 0xffffffff80000000 to physical address 0x000000. (always using
309 * 2Mbyte large pages provided by PAE mode)
310 */
311NEXT_PAGE(init_level4_pgt)
312 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
313 .fill 257,8,0
314 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
315 .fill 252,8,0
316 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
317 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
318
319NEXT_PAGE(level3_ident_pgt)
320 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
321 .fill 511,8,0
322
323NEXT_PAGE(level3_kernel_pgt)
324 .fill 510,8,0
325 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
326 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
327 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
328
329NEXT_PAGE(level2_fixmap_pgt)
330 .fill 506,8,0
331 .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
332 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
333 .fill 5,8,0
334
335NEXT_PAGE(level1_fixmap_pgt)
336 .fill 512,8,0
337
338NEXT_PAGE(level2_ident_pgt)
339 /* Since I easily can, map the first 1G.
340 * Don't set NX because code runs from these pages.
341 */
342 PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
343
344NEXT_PAGE(level2_kernel_pgt)
345 /* 40MB kernel mapping. The kernel code cannot be bigger than that.
346 When you change this change KERNEL_TEXT_SIZE in page.h too. */
347 /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
348 PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE)
349 /* Module mapping starts here */
350 .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0
351
352NEXT_PAGE(level2_spare_pgt)
353 .fill 512,8,0
354
355#undef PMDS
356#undef NEXT_PAGE
357
358 .data
359 .align 16
360 .globl cpu_gdt_descr
361cpu_gdt_descr:
362 .word gdt_end-cpu_gdt_table-1
363gdt:
364 .quad cpu_gdt_table
365#ifdef CONFIG_SMP
366 .rept NR_CPUS-1
367 .word 0
368 .quad 0
369 .endr
370#endif
371
372ENTRY(phys_base)
373 /* This must match the first entry in level2_kernel_pgt */
374 .quad 0x0000000000000000
375
376/* We need valid kernel segments for data and code in long mode too
377 * IRET will check the segment types kkeil 2000/10/28
378 * Also sysret mandates a special GDT layout
379 */
380
381 .section .data.page_aligned, "aw"
382 .align PAGE_SIZE
383
384/* The TLS descriptors are currently at a different place compared to i386.
385 Hopefully nobody expects them at a fixed place (Wine?) */
386
387ENTRY(cpu_gdt_table)
388 .quad 0x0000000000000000 /* NULL descriptor */
389 .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
390 .quad 0x00af9b000000ffff /* __KERNEL_CS */
391 .quad 0x00cf93000000ffff /* __KERNEL_DS */
392 .quad 0x00cffb000000ffff /* __USER32_CS */
393 .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
394 .quad 0x00affb000000ffff /* __USER_CS */
395 .quad 0x0 /* unused */
396 .quad 0,0 /* TSS */
397 .quad 0,0 /* LDT */
398 .quad 0,0,0 /* three TLS descriptors */
399 .quad 0x0000f40000000000 /* node/CPU stored in limit */
400gdt_end:
401 /* asm/segment.h:GDT_ENTRIES must match this */
402 /* This should be a multiple of the cache line size */
403 /* GDTs of other CPUs are now dynamically allocated */
404
405 /* zero the remaining page */
406 .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
407
408 .section .bss, "aw", @nobits
409 .align L1_CACHE_BYTES
410ENTRY(idt_table)
411 .skip 256 * 16
412
413 .section .bss.page_aligned, "aw", @nobits
414 .align PAGE_SIZE
415ENTRY(empty_zero_page)
416 .skip PAGE_SIZE
diff --git a/arch/x86/kernel/hpet_64.c b/arch/x86/kernel/hpet_64.c
new file mode 100644
index 000000000000..e2d1b912e154
--- /dev/null
+++ b/arch/x86/kernel/hpet_64.c
@@ -0,0 +1,493 @@
1#include <linux/kernel.h>
2#include <linux/sched.h>
3#include <linux/init.h>
4#include <linux/mc146818rtc.h>
5#include <linux/time.h>
6#include <linux/clocksource.h>
7#include <linux/ioport.h>
8#include <linux/acpi.h>
9#include <linux/hpet.h>
10#include <asm/pgtable.h>
11#include <asm/vsyscall.h>
12#include <asm/timex.h>
13#include <asm/hpet.h>
14
15#define HPET_MASK 0xFFFFFFFF
16#define HPET_SHIFT 22
17
18/* FSEC = 10^-15 NSEC = 10^-9 */
19#define FSEC_PER_NSEC 1000000
20
/* Set to 1 by the "nohpet" boot parameter. */
21int nohpet __initdata;
22
/* Zero means no HPET is available; the code in this file checks for that. */
23unsigned long hpet_address;
24unsigned long hpet_period; /* fsecs / HPET clock */
25unsigned long hpet_tick; /* HPET clocks / interrupt */
26
27int hpet_use_timer; /* Use counter of hpet for time keeping,
28 * otherwise PIT
29 */
30
31#ifdef CONFIG_HPET
32static __init int late_hpet_init(void)
33{
34 struct hpet_data hd;
35 unsigned int ntimer;
36
37 if (!hpet_address)
38 return 0;
39
40 memset(&hd, 0, sizeof(hd));
41
42 ntimer = hpet_readl(HPET_ID);
43 ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
44 ntimer++;
45
46 /*
47 * Register with driver.
48 * Timer0 and Timer1 is used by platform.
49 */
50 hd.hd_phys_address = hpet_address;
51 hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
52 hd.hd_nirqs = ntimer;
53 hd.hd_flags = HPET_DATA_PLATFORM;
54 hpet_reserve_timer(&hd, 0);
55#ifdef CONFIG_HPET_EMULATE_RTC
56 hpet_reserve_timer(&hd, 1);
57#endif
58 hd.hd_irq[0] = HPET_LEGACY_8254;
59 hd.hd_irq[1] = HPET_LEGACY_RTC;
60 if (ntimer > 2) {
61 struct hpet *hpet;
62 struct hpet_timer *timer;
63 int i;
64
65 hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
66 timer = &hpet->hpet_timers[2];
67 for (i = 2; i < ntimer; timer++, i++)
68 hd.hd_irq[i] = (timer->hpet_config &
69 Tn_INT_ROUTE_CNF_MASK) >>
70 Tn_INT_ROUTE_CNF_SHIFT;
71
72 }
73
74 hpet_alloc(&hd);
75 return 0;
76}
77fs_initcall(late_hpet_init);
78#endif
79
80int hpet_timer_stop_set_go(unsigned long tick)
81{
82 unsigned int cfg;
83
84/*
85 * Stop the timers and reset the main counter.
86 */
87
88 cfg = hpet_readl(HPET_CFG);
89 cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
90 hpet_writel(cfg, HPET_CFG);
91 hpet_writel(0, HPET_COUNTER);
92 hpet_writel(0, HPET_COUNTER + 4);
93
94/*
95 * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
96 * and period also hpet_tick.
97 */
98 if (hpet_use_timer) {
99 hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
100 HPET_TN_32BIT, HPET_T0_CFG);
101 hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */
102 hpet_writel(hpet_tick, HPET_T0_CMP); /* period */
103 cfg |= HPET_CFG_LEGACY;
104 }
105/*
106 * Go!
107 */
108
109 cfg |= HPET_CFG_ENABLE;
110 hpet_writel(cfg, HPET_CFG);
111
112 return 0;
113}
114
115static cycle_t read_hpet(void)
116{
117 return (cycle_t)hpet_readl(HPET_COUNTER);
118}
119
120static cycle_t __vsyscall_fn vread_hpet(void)
121{
122 return readl((void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
123}
124
125struct clocksource clocksource_hpet = {
126 .name = "hpet",
127 .rating = 250,
128 .read = read_hpet,
129 .mask = (cycle_t)HPET_MASK,
130 .mult = 0, /* set below */
131 .shift = HPET_SHIFT,
132 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
133 .vread = vread_hpet,
134};
135
136int __init hpet_arch_init(void)
137{
138 unsigned int id;
139 u64 tmp;
140
141 if (!hpet_address)
142 return -1;
143 set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
144 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
145
146/*
147 * Read the period, compute tick and quotient.
148 */
149
150 id = hpet_readl(HPET_ID);
151
152 if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER))
153 return -1;
154
155 hpet_period = hpet_readl(HPET_PERIOD);
156 if (hpet_period < 100000 || hpet_period > 100000000)
157 return -1;
158
159 hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;
160
161 hpet_use_timer = (id & HPET_ID_LEGSUP);
162
163 /*
164 * hpet period is in femto seconds per cycle
165 * so we need to convert this to ns/cyc units
166 * aproximated by mult/2^shift
167 *
168 * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
169 * fsec/cyc * 1ns/1000000fsec * 2^shift = mult
170 * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
171 * (fsec/cyc << shift)/1000000 = mult
172 * (hpet_period << shift)/FSEC_PER_NSEC = mult
173 */
174 tmp = (u64)hpet_period << HPET_SHIFT;
175 do_div(tmp, FSEC_PER_NSEC);
176 clocksource_hpet.mult = (u32)tmp;
177 clocksource_register(&clocksource_hpet);
178
179 return hpet_timer_stop_set_go(hpet_tick);
180}
181
182int hpet_reenable(void)
183{
184 return hpet_timer_stop_set_go(hpet_tick);
185}
186
187/*
188 * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing
189 * it to the HPET timer of known frequency.
190 */
191
192#define TICK_COUNT 100000000
193#define SMI_THRESHOLD 50000
194#define MAX_TRIES 5
195
196/*
197 * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none
198 * occurs between the reads of the hpet & TSC.
199 */
200static void __init read_hpet_tsc(int *hpet, int *tsc)
201{
202 int tsc1, tsc2, hpet1, i;
203
204 for (i = 0; i < MAX_TRIES; i++) {
205 tsc1 = get_cycles_sync();
206 hpet1 = hpet_readl(HPET_COUNTER);
207 tsc2 = get_cycles_sync();
208 if ((tsc2 - tsc1) < SMI_THRESHOLD)
209 break;
210 }
211 *hpet = hpet1;
212 *tsc = tsc2;
213}
214
215unsigned int __init hpet_calibrate_tsc(void)
216{
217 int tsc_start, hpet_start;
218 int tsc_now, hpet_now;
219 unsigned long flags;
220
221 local_irq_save(flags);
222
223 read_hpet_tsc(&hpet_start, &tsc_start);
224
225 do {
226 local_irq_disable();
227 read_hpet_tsc(&hpet_now, &tsc_now);
228 local_irq_restore(flags);
229 } while ((tsc_now - tsc_start) < TICK_COUNT &&
230 (hpet_now - hpet_start) < TICK_COUNT);
231
232 return (tsc_now - tsc_start) * 1000000000L
233 / ((hpet_now - hpet_start) * hpet_period / 1000);
234}
235
236#ifdef CONFIG_HPET_EMULATE_RTC
237/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET
238 * is enabled, we support RTC interrupt functionality in software.
239 * RTC has 3 kinds of interrupts:
240 * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
241 * is updated
242 * 2) Alarm Interrupt - generate an interrupt at a specific time of day
243 * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
244 * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
245 * (1) and (2) above are implemented using polling at a frequency of
246 * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
247 * overhead. (DEFAULT_RTC_INT_FREQ)
248 * For (3), we use interrupts at 64Hz or user specified periodic
249 * frequency, whichever is higher.
250 */
251#include <linux/rtc.h>
252
253#define DEFAULT_RTC_INT_FREQ 64
254#define RTC_NUM_INTS 1
255
256static unsigned long UIE_on;
257static unsigned long prev_update_sec;
258
259static unsigned long AIE_on;
260static struct rtc_time alarm_time;
261
262static unsigned long PIE_on;
263static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
264static unsigned long PIE_count;
265
266static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
267static unsigned int hpet_t1_cmp; /* cached comparator register */
268
269int is_hpet_enabled(void)
270{
271 return hpet_address != 0;
272}
273
274/*
275 * Timer 1 for RTC, we do not use periodic interrupt feature,
276 * even if HPET supports periodic interrupts on Timer 1.
277 * The reason being, to set up a periodic interrupt in HPET, we need to
278 * stop the main counter. And if we do that everytime someone diables/enables
279 * RTC, we will have adverse effect on main kernel timer running on Timer 0.
280 * So, for the time being, simulate the periodic interrupt in software.
281 *
282 * hpet_rtc_timer_init() is called for the first time and during subsequent
283 * interuppts reinit happens through hpet_rtc_timer_reinit().
284 */
285int hpet_rtc_timer_init(void)
286{
287 unsigned int cfg, cnt;
288 unsigned long flags;
289
290 if (!is_hpet_enabled())
291 return 0;
292 /*
293 * Set the counter 1 and enable the interrupts.
294 */
295 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
296 hpet_rtc_int_freq = PIE_freq;
297 else
298 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
299
300 local_irq_save(flags);
301
302 cnt = hpet_readl(HPET_COUNTER);
303 cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
304 hpet_writel(cnt, HPET_T1_CMP);
305 hpet_t1_cmp = cnt;
306
307 cfg = hpet_readl(HPET_T1_CFG);
308 cfg &= ~HPET_TN_PERIODIC;
309 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
310 hpet_writel(cfg, HPET_T1_CFG);
311
312 local_irq_restore(flags);
313
314 return 1;
315}
316
317static void hpet_rtc_timer_reinit(void)
318{
319 unsigned int cfg, cnt, ticks_per_int, lost_ints;
320
321 if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
322 cfg = hpet_readl(HPET_T1_CFG);
323 cfg &= ~HPET_TN_ENABLE;
324 hpet_writel(cfg, HPET_T1_CFG);
325 return;
326 }
327
328 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
329 hpet_rtc_int_freq = PIE_freq;
330 else
331 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
332
333 /* It is more accurate to use the comparator value than current count.*/
334 ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
335 hpet_t1_cmp += ticks_per_int;
336 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
337
338 /*
339 * If the interrupt handler was delayed too long, the write above tries
340 * to schedule the next interrupt in the past and the hardware would
341 * not interrupt until the counter had wrapped around.
342 * So we have to check that the comparator wasn't set to a past time.
343 */
344 cnt = hpet_readl(HPET_COUNTER);
345 if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
346 lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
347 /* Make sure that, even with the time needed to execute
348 * this code, the next scheduled interrupt has been moved
349 * back to the future: */
350 lost_ints++;
351
352 hpet_t1_cmp += lost_ints * ticks_per_int;
353 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
354
355 if (PIE_on)
356 PIE_count += lost_ints;
357
358 if (printk_ratelimit())
359 printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
360 hpet_rtc_int_freq);
361 }
362}
363
364/*
365 * The functions below are called from rtc driver.
366 * Return 0 if HPET is not being used.
367 * Otherwise do the necessary changes and return 1.
368 */
369int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
370{
371 if (!is_hpet_enabled())
372 return 0;
373
374 if (bit_mask & RTC_UIE)
375 UIE_on = 0;
376 if (bit_mask & RTC_PIE)
377 PIE_on = 0;
378 if (bit_mask & RTC_AIE)
379 AIE_on = 0;
380
381 return 1;
382}
383
384int hpet_set_rtc_irq_bit(unsigned long bit_mask)
385{
386 int timer_init_reqd = 0;
387
388 if (!is_hpet_enabled())
389 return 0;
390
391 if (!(PIE_on | AIE_on | UIE_on))
392 timer_init_reqd = 1;
393
394 if (bit_mask & RTC_UIE) {
395 UIE_on = 1;
396 }
397 if (bit_mask & RTC_PIE) {
398 PIE_on = 1;
399 PIE_count = 0;
400 }
401 if (bit_mask & RTC_AIE) {
402 AIE_on = 1;
403 }
404
405 if (timer_init_reqd)
406 hpet_rtc_timer_init();
407
408 return 1;
409}
410
411int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
412{
413 if (!is_hpet_enabled())
414 return 0;
415
416 alarm_time.tm_hour = hrs;
417 alarm_time.tm_min = min;
418 alarm_time.tm_sec = sec;
419
420 return 1;
421}
422
423int hpet_set_periodic_freq(unsigned long freq)
424{
425 if (!is_hpet_enabled())
426 return 0;
427
428 PIE_freq = freq;
429 PIE_count = 0;
430
431 return 1;
432}
433
/* Returns 0 if the HPET is not in use, 1 otherwise. */
int hpet_rtc_dropped_irq(void)
{
	return is_hpet_enabled() ? 1 : 0;
}
441
442irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
443{
444 struct rtc_time curr_time;
445 unsigned long rtc_int_flag = 0;
446 int call_rtc_interrupt = 0;
447
448 hpet_rtc_timer_reinit();
449
450 if (UIE_on | AIE_on) {
451 rtc_get_rtc_time(&curr_time);
452 }
453 if (UIE_on) {
454 if (curr_time.tm_sec != prev_update_sec) {
455 /* Set update int info, call real rtc int routine */
456 call_rtc_interrupt = 1;
457 rtc_int_flag = RTC_UF;
458 prev_update_sec = curr_time.tm_sec;
459 }
460 }
461 if (PIE_on) {
462 PIE_count++;
463 if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
464 /* Set periodic int info, call real rtc int routine */
465 call_rtc_interrupt = 1;
466 rtc_int_flag |= RTC_PF;
467 PIE_count = 0;
468 }
469 }
470 if (AIE_on) {
471 if ((curr_time.tm_sec == alarm_time.tm_sec) &&
472 (curr_time.tm_min == alarm_time.tm_min) &&
473 (curr_time.tm_hour == alarm_time.tm_hour)) {
474 /* Set alarm int info, call real rtc int routine */
475 call_rtc_interrupt = 1;
476 rtc_int_flag |= RTC_AF;
477 }
478 }
479 if (call_rtc_interrupt) {
480 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
481 rtc_interrupt(rtc_int_flag, dev_id);
482 }
483 return IRQ_HANDLED;
484}
485#endif
486
487static int __init nohpet_setup(char *s)
488{
489 nohpet = 1;
490 return 1;
491}
492
493__setup("nohpet", nohpet_setup);
diff --git a/arch/x86/kernel/i387_64.c b/arch/x86/kernel/i387_64.c
new file mode 100644
index 000000000000..1d58c13bc6bc
--- /dev/null
+++ b/arch/x86/kernel/i387_64.c
@@ -0,0 +1,151 @@
1/*
2 * linux/arch/x86_64/kernel/i387.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 * Copyright (C) 2002 Andi Kleen, SuSE Labs
6 *
7 * Pentium III FXSR, SSE support
8 * General FPU state handling cleanups
9 * Gareth Hughes <gareth@valinux.com>, May 2000
10 *
11 * x86-64 rework 2002 Andi Kleen.
12 * Does direct fxsave in and out of user space now for signal handlers.
13 * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation,
14 * the 64bit user space sees a FXSAVE frame directly.
15 */
16
17#include <linux/sched.h>
18#include <linux/init.h>
19#include <asm/processor.h>
20#include <asm/i387.h>
21#include <asm/sigcontext.h>
22#include <asm/user.h>
23#include <asm/ptrace.h>
24#include <asm/uaccess.h>
25
26unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff;
27
28void mxcsr_feature_mask_init(void)
29{
30 unsigned int mask;
31 clts();
32 memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
33 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
34 mask = current->thread.i387.fxsave.mxcsr_mask;
35 if (mask == 0) mask = 0x0000ffbf;
36 mxcsr_feature_mask &= mask;
37 stts();
38}
39
40/*
41 * Called at bootup to set up the initial FPU state that is later cloned
42 * into all processes.
43 */
44void __cpuinit fpu_init(void)
45{
46 unsigned long oldcr0 = read_cr0();
47 extern void __bad_fxsave_alignment(void);
48
49 if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
50 __bad_fxsave_alignment();
51 set_in_cr4(X86_CR4_OSFXSR);
52 set_in_cr4(X86_CR4_OSXMMEXCPT);
53
54 write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
55
56 mxcsr_feature_mask_init();
57 /* clean state in init */
58 current_thread_info()->status = 0;
59 clear_used_math();
60}
61
62void init_fpu(struct task_struct *child)
63{
64 if (tsk_used_math(child)) {
65 if (child == current)
66 unlazy_fpu(child);
67 return;
68 }
69 memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
70 child->thread.i387.fxsave.cwd = 0x37f;
71 child->thread.i387.fxsave.mxcsr = 0x1f80;
72 /* only the device not available exception or ptrace can call init_fpu */
73 set_stopped_child_used_math(child);
74}
75
76/*
77 * Signal frame handlers.
78 */
79
80int save_i387(struct _fpstate __user *buf)
81{
82 struct task_struct *tsk = current;
83 int err = 0;
84
85 BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
86 sizeof(tsk->thread.i387.fxsave));
87
88 if ((unsigned long)buf % 16)
89 printk("save_i387: bad fpstate %p\n",buf);
90
91 if (!used_math())
92 return 0;
93 clear_used_math(); /* trigger finit */
94 if (task_thread_info(tsk)->status & TS_USEDFPU) {
95 err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
96 if (err) return err;
97 stts();
98 } else {
99 if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
100 sizeof(struct i387_fxsave_struct)))
101 return -1;
102 }
103 return 1;
104}
105
106/*
107 * ptrace request handlers.
108 */
109
110int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk)
111{
112 init_fpu(tsk);
113 return __copy_to_user(buf, &tsk->thread.i387.fxsave,
114 sizeof(struct user_i387_struct)) ? -EFAULT : 0;
115}
116
117int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf)
118{
119 if (__copy_from_user(&tsk->thread.i387.fxsave, buf,
120 sizeof(struct user_i387_struct)))
121 return -EFAULT;
122 return 0;
123}
124
125/*
126 * FPU state for core dumps.
127 */
128
129int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
130{
131 struct task_struct *tsk = current;
132
133 if (!used_math())
134 return 0;
135
136 unlazy_fpu(tsk);
137 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
138 return 1;
139}
140
141int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
142{
143 int fpvalid = !!tsk_used_math(tsk);
144
145 if (fpvalid) {
146 if (tsk == current)
147 unlazy_fpu(tsk);
148 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
149}
150 return fpvalid;
151}
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
new file mode 100644
index 000000000000..948cae646099
--- /dev/null
+++ b/arch/x86/kernel/i8259_64.c
@@ -0,0 +1,544 @@
1#include <linux/linkage.h>
2#include <linux/errno.h>
3#include <linux/signal.h>
4#include <linux/sched.h>
5#include <linux/ioport.h>
6#include <linux/interrupt.h>
7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h>
10#include <linux/init.h>
11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h>
13#include <linux/bitops.h>
14
15#include <asm/acpi.h>
16#include <asm/atomic.h>
17#include <asm/system.h>
18#include <asm/io.h>
19#include <asm/hw_irq.h>
20#include <asm/pgtable.h>
21#include <asm/delay.h>
22#include <asm/desc.h>
23#include <asm/apic.h>
24
25/*
26 * Common place to define all x86 IRQ vectors
27 *
28 * This builds up the IRQ handler stubs using some ugly macros in irq.h
29 *
30 * These macros create the low-level assembly IRQ routines that save
31 * register context and call do_IRQ(). do_IRQ() then does all the
32 * operations that are needed to keep the AT (or SMP IOAPIC)
33 * interrupt-controller happy.
34 */
35
36#define BI(x,y) \
37 BUILD_IRQ(x##y)
38
39#define BUILD_16_IRQS(x) \
40 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
41 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
42 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
43 BI(x,c) BI(x,d) BI(x,e) BI(x,f)
44
45/*
46 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
47 * (these are usually mapped to vectors 0x30-0x3f)
48 */
49
50/*
51 * The IO-APIC gives us many more interrupt sources. Most of these
52 * are unused but an SMP system is supposed to have enough memory ...
53 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
54 * across the spectrum, so we really want to be prepared to get all
55 * of these. Plus, more powerful systems might have more than 64
56 * IO-APIC registers.
57 *
58 * (these are usually mapped into the 0x30-0xff vector range)
59 */
60 BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
61BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
62BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
63BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
64
65#undef BUILD_16_IRQS
66#undef BI
67
68
69#define IRQ(x,y) \
70 IRQ##x##y##_interrupt
71
72#define IRQLIST_16(x) \
73 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
74 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
75 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
76 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
77
78/* for the irq vectors */
79static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
80 IRQLIST_16(0x2), IRQLIST_16(0x3),
81 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
82 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
83 IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
84};
85
86#undef IRQ
87#undef IRQLIST_16
88
89/*
90 * This is the 'legacy' 8259A Programmable Interrupt Controller,
91 * present in the majority of PC/AT boxes.
92 * plus some generic x86 specific things if generic specifics makes
93 * any sense at all.
94 * this file should become arch/i386/kernel/irq.c when the old irq.c
95 * moves to arch independent land
96 */
97
98static int i8259A_auto_eoi;
99DEFINE_SPINLOCK(i8259A_lock);
100static void mask_and_ack_8259A(unsigned int);
101
102static struct irq_chip i8259A_chip = {
103 .name = "XT-PIC",
104 .mask = disable_8259A_irq,
105 .disable = disable_8259A_irq,
106 .unmask = enable_8259A_irq,
107 .mask_ack = mask_and_ack_8259A,
108};
109
110/*
111 * 8259A PIC functions to handle ISA devices:
112 */
113
114/*
115 * This contains the irq mask for both 8259A irq controllers,
116 */
117static unsigned int cached_irq_mask = 0xffff;
118
119#define __byte(x,y) (((unsigned char *)&(y))[x])
120#define cached_21 (__byte(0,cached_irq_mask))
121#define cached_A1 (__byte(1,cached_irq_mask))
122
123/*
124 * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
125 * boards the timer interrupt is not really connected to any IO-APIC pin,
126 * it's fed to the master 8259A's IR0 line only.
127 *
128 * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
129 * this 'mixed mode' IRQ handling costs nothing because it's only used
130 * at IRQ setup time.
131 */
132unsigned long io_apic_irqs;
133
134void disable_8259A_irq(unsigned int irq)
135{
136 unsigned int mask = 1 << irq;
137 unsigned long flags;
138
139 spin_lock_irqsave(&i8259A_lock, flags);
140 cached_irq_mask |= mask;
141 if (irq & 8)
142 outb(cached_A1,0xA1);
143 else
144 outb(cached_21,0x21);
145 spin_unlock_irqrestore(&i8259A_lock, flags);
146}
147
148void enable_8259A_irq(unsigned int irq)
149{
150 unsigned int mask = ~(1 << irq);
151 unsigned long flags;
152
153 spin_lock_irqsave(&i8259A_lock, flags);
154 cached_irq_mask &= mask;
155 if (irq & 8)
156 outb(cached_A1,0xA1);
157 else
158 outb(cached_21,0x21);
159 spin_unlock_irqrestore(&i8259A_lock, flags);
160}
161
162int i8259A_irq_pending(unsigned int irq)
163{
164 unsigned int mask = 1<<irq;
165 unsigned long flags;
166 int ret;
167
168 spin_lock_irqsave(&i8259A_lock, flags);
169 if (irq < 8)
170 ret = inb(0x20) & mask;
171 else
172 ret = inb(0xA0) & (mask >> 8);
173 spin_unlock_irqrestore(&i8259A_lock, flags);
174
175 return ret;
176}
177
178void make_8259A_irq(unsigned int irq)
179{
180 disable_irq_nosync(irq);
181 io_apic_irqs &= ~(1<<irq);
182 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
183 "XT");
184 enable_irq(irq);
185}
186
/*
 * Check the in-service register (ISR) for @irq; returns non-zero when
 * the irq's bit is set.
 *
 * This is meant to be called rarely, since switching between 8259A
 * registers is slow.  The caller must hold the irq controller spinlock.
 */
static inline int i8259A_irq_real(unsigned int irq)
{
	int in_service;
	int bit = 1<<irq;
	int port = 0x20;

	if (irq >= 8) {
		/* second controller: its bits are numbered 0-7 as well */
		port = 0xA0;
		bit >>= 8;
	}

	outb(0x0B, port);	/* ISR register */
	in_service = inb(port) & bit;
	outb(0x0A, port);	/* back to the IRR register */

	return in_service;
}
209
210/*
211 * Careful! The 8259A is a fragile beast, it pretty
212 * much _has_ to be done exactly like this (mask it
213 * first, _then_ send the EOI, and the order of EOI
214 * to the two 8259s is important!
215 */
216static void mask_and_ack_8259A(unsigned int irq)
217{
218 unsigned int irqmask = 1 << irq;
219 unsigned long flags;
220
221 spin_lock_irqsave(&i8259A_lock, flags);
222 /*
223 * Lightweight spurious IRQ detection. We do not want
224 * to overdo spurious IRQ handling - it's usually a sign
225 * of hardware problems, so we only do the checks we can
226 * do without slowing down good hardware unnecessarily.
227 *
228 * Note that IRQ7 and IRQ15 (the two spurious IRQs
229 * usually resulting from the 8259A-1|2 PICs) occur
230 * even if the IRQ is masked in the 8259A. Thus we
231 * can check spurious 8259A IRQs without doing the
232 * quite slow i8259A_irq_real() call for every IRQ.
233 * This does not cover 100% of spurious interrupts,
234 * but should be enough to warn the user that there
235 * is something bad going on ...
236 */
237 if (cached_irq_mask & irqmask)
238 goto spurious_8259A_irq;
239 cached_irq_mask |= irqmask;
240
241handle_real_irq:
242 if (irq & 8) {
243 inb(0xA1); /* DUMMY - (do we need this?) */
244 outb(cached_A1,0xA1);
245 outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */
246 outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */
247 } else {
248 inb(0x21); /* DUMMY - (do we need this?) */
249 outb(cached_21,0x21);
250 outb(0x60+irq,0x20); /* 'Specific EOI' to master */
251 }
252 spin_unlock_irqrestore(&i8259A_lock, flags);
253 return;
254
255spurious_8259A_irq:
256 /*
257 * this is the slow path - should happen rarely.
258 */
259 if (i8259A_irq_real(irq))
260 /*
261 * oops, the IRQ _is_ in service according to the
262 * 8259A - not spurious, go handle it.
263 */
264 goto handle_real_irq;
265
266 {
267 static int spurious_irq_mask;
268 /*
269 * At this point we can be sure the IRQ is spurious,
270 * lets ACK and report it. [once per IRQ]
271 */
272 if (!(spurious_irq_mask & irqmask)) {
273 printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
274 spurious_irq_mask |= irqmask;
275 }
276 atomic_inc(&irq_err_count);
277 /*
278 * Theoretically we do not have to handle this IRQ,
279 * but in Linux this does not cause problems and is
280 * simpler for us.
281 */
282 goto handle_real_irq;
283 }
284}
285
286void init_8259A(int auto_eoi)
287{
288 unsigned long flags;
289
290 i8259A_auto_eoi = auto_eoi;
291
292 spin_lock_irqsave(&i8259A_lock, flags);
293
294 outb(0xff, 0x21); /* mask all of 8259A-1 */
295 outb(0xff, 0xA1); /* mask all of 8259A-2 */
296
297 /*
298 * outb_p - this has to work on a wide range of PC hardware.
299 */
300 outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */
301 outb_p(IRQ0_VECTOR, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
302 outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */
303 if (auto_eoi)
304 outb_p(0x03, 0x21); /* master does Auto EOI */
305 else
306 outb_p(0x01, 0x21); /* master expects normal EOI */
307
308 outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */
309 outb_p(IRQ8_VECTOR, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
310 outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */
311 outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode
312 is to be investigated) */
313
314 if (auto_eoi)
315 /*
316 * in AEOI mode we just have to mask the interrupt
317 * when acking.
318 */
319 i8259A_chip.mask_ack = disable_8259A_irq;
320 else
321 i8259A_chip.mask_ack = mask_and_ack_8259A;
322
323 udelay(100); /* wait for 8259A to initialize */
324
325 outb(cached_21, 0x21); /* restore master IRQ mask */
326 outb(cached_A1, 0xA1); /* restore slave IRQ mask */
327
328 spin_unlock_irqrestore(&i8259A_lock, flags);
329}
330
/* ELCR contents saved by save_ELCR(): [0] for port 0x4d0, [1] for 0x4d1 */
static char irq_trigger[2];

/*
 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ.
 * Write the two saved bytes in @trigger back to them.
 */
static void restore_ELCR(char *trigger)
{
	int i;

	for (i = 0; i < 2; i++)
		outb(trigger[i], 0x4d0 + i);
}
340
/*
 * Read both ELCR registers into @trigger, clearing the bits of the
 * IRQs marked as reserved (0, 1, 2, 8 and 13).
 */
static void save_ELCR(char *trigger)
{
	unsigned char low = inb(0x4d0);
	unsigned char high = inb(0x4d1);

	trigger[0] = low & 0xF8;	/* drop IRQ 0, 1, 2 */
	trigger[1] = high & 0xDE;	/* drop IRQ 8, 13 */
}
347
348static int i8259A_resume(struct sys_device *dev)
349{
350 init_8259A(i8259A_auto_eoi);
351 restore_ELCR(irq_trigger);
352 return 0;
353}
354
355static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
356{
357 save_ELCR(irq_trigger);
358 return 0;
359}
360
/*
 * sysdev shutdown hook.  Always returns 0.
 */
static int i8259A_shutdown(struct sys_device *dev)
{
	/*
	 * Put the i8259A into a quiescent state that the kernel
	 * initialization code can get it out of.
	 */
	outb(0xff, 0x21);	/* mask all of 8259A-1 */
	outb(0xff, 0xA1);	/* mask all of 8259A-2 */

	return 0;
}
371
372static struct sysdev_class i8259_sysdev_class = {
373 set_kset_name("i8259"),
374 .suspend = i8259A_suspend,
375 .resume = i8259A_resume,
376 .shutdown = i8259A_shutdown,
377};
378
379static struct sys_device device_i8259A = {
380 .id = 0,
381 .cls = &i8259_sysdev_class,
382};
383
384static int __init i8259A_init_sysfs(void)
385{
386 int error = sysdev_class_register(&i8259_sysdev_class);
387 if (!error)
388 error = sysdev_register(&device_i8259A);
389 return error;
390}
391
392device_initcall(i8259A_init_sysfs);
393
394/*
395 * IRQ2 is cascade interrupt to second interrupt controller
396 */
397
398static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
399DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
400 [0 ... IRQ0_VECTOR - 1] = -1,
401 [IRQ0_VECTOR] = 0,
402 [IRQ1_VECTOR] = 1,
403 [IRQ2_VECTOR] = 2,
404 [IRQ3_VECTOR] = 3,
405 [IRQ4_VECTOR] = 4,
406 [IRQ5_VECTOR] = 5,
407 [IRQ6_VECTOR] = 6,
408 [IRQ7_VECTOR] = 7,
409 [IRQ8_VECTOR] = 8,
410 [IRQ9_VECTOR] = 9,
411 [IRQ10_VECTOR] = 10,
412 [IRQ11_VECTOR] = 11,
413 [IRQ12_VECTOR] = 12,
414 [IRQ13_VECTOR] = 13,
415 [IRQ14_VECTOR] = 14,
416 [IRQ15_VECTOR] = 15,
417 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
418};
419
420void __init init_ISA_irqs (void)
421{
422 int i;
423
424 init_bsp_APIC();
425 init_8259A(0);
426
427 for (i = 0; i < NR_IRQS; i++) {
428 irq_desc[i].status = IRQ_DISABLED;
429 irq_desc[i].action = NULL;
430 irq_desc[i].depth = 1;
431
432 if (i < 16) {
433 /*
434 * 16 old-style INTA-cycle interrupts:
435 */
436 set_irq_chip_and_handler_name(i, &i8259A_chip,
437 handle_level_irq, "XT");
438 } else {
439 /*
440 * 'high' PCI IRQs filled in on demand
441 */
442 irq_desc[i].chip = &no_irq_chip;
443 }
444 }
445}
446
447static void setup_timer_hardware(void)
448{
449 outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */
450 udelay(10);
451 outb_p(LATCH & 0xff , 0x40); /* LSB */
452 udelay(10);
453 outb(LATCH >> 8 , 0x40); /* MSB */
454}
455
/*
 * sysdev resume hook for the timer: reprogram the timer hardware.
 * Always returns 0.
 */
static int timer_resume(struct sys_device *dev)
{
	setup_timer_hardware();

	return 0;
}
461
/*
 * Externally visible wrapper that reprograms the timer hardware.
 */
void i8254_timer_resume(void)
{
	setup_timer_hardware();
}
466
467static struct sysdev_class timer_sysclass = {
468 set_kset_name("timer_pit"),
469 .resume = timer_resume,
470};
471
472static struct sys_device device_timer = {
473 .id = 0,
474 .cls = &timer_sysclass,
475};
476
477static int __init init_timer_sysfs(void)
478{
479 int error = sysdev_class_register(&timer_sysclass);
480 if (!error)
481 error = sysdev_register(&device_timer);
482 return error;
483}
484
485device_initcall(init_timer_sysfs);
486
487void __init init_IRQ(void)
488{
489 int i;
490
491 init_ISA_irqs();
492 /*
493 * Cover the whole vector space, no vector can escape
494 * us. (some of these will be overridden and become
495 * 'special' SMP interrupts)
496 */
497 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
498 int vector = FIRST_EXTERNAL_VECTOR + i;
499 if (vector != IA32_SYSCALL_VECTOR)
500 set_intr_gate(vector, interrupt[i]);
501 }
502
503#ifdef CONFIG_SMP
504 /*
505 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
506 * IPI, driven by wakeup.
507 */
508 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
509
510 /* IPIs for invalidation */
511 set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
512 set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
513 set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
514 set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
515 set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
516 set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
517 set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
518 set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
519
520 /* IPI for generic function call */
521 set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
522
523 /* Low priority IPI to cleanup after moving an irq */
524 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
525#endif
526 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
527 set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
528
529 /* self generated IPI for local APIC timer */
530 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
531
532 /* IPI vectors for APIC spurious and error interrupts */
533 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
534 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
535
536 /*
537 * Set the clock to HZ Hz, we already have a valid
538 * vector now:
539 */
540 setup_timer_hardware();
541
542 if (!acpi_ioapic)
543 setup_irq(2, &irq2);
544}
diff --git a/arch/x86/kernel/init_task_64.c b/arch/x86/kernel/init_task_64.c
new file mode 100644
index 000000000000..4ff33d4f8551
--- /dev/null
+++ b/arch/x86/kernel/init_task_64.c
@@ -0,0 +1,54 @@
1#include <linux/mm.h>
2#include <linux/module.h>
3#include <linux/sched.h>
4#include <linux/init.h>
5#include <linux/init_task.h>
6#include <linux/fs.h>
7#include <linux/mqueue.h>
8
9#include <asm/uaccess.h>
10#include <asm/pgtable.h>
11#include <asm/desc.h>
12
13static struct fs_struct init_fs = INIT_FS;
14static struct files_struct init_files = INIT_FILES;
15static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
17struct mm_struct init_mm = INIT_MM(init_mm);
18
19EXPORT_SYMBOL(init_mm);
20
21/*
22 * Initial task structure.
23 *
24 * We need to make sure that this is 8192-byte aligned due to the
25 * way process stacks are handled. This is done by having a special
26 * "init_task" linker map entry..
27 */
28union thread_union init_thread_union
29 __attribute__((__section__(".data.init_task"))) =
30 { INIT_THREAD_INFO(init_task) };
31
32/*
33 * Initial task structure.
34 *
35 * All other task structs will be allocated on slabs in fork.c
36 */
37struct task_struct init_task = INIT_TASK(init_task);
38
39EXPORT_SYMBOL(init_task);
40/*
41 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
42 * no more per-task TSS's. The TSS size is kept cacheline-aligned
43 * so they are allowed to end up in the .data.cacheline_aligned
44 * section. Since TSS's are completely CPU-local, we want them
45 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
46 */
47DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
48
49/* Copies of the original ist values from the tss are only accessed during
50 * debugging, no special alignment required.
51 */
52DEFINE_PER_CPU(struct orig_ist, orig_ist);
53
54#define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
new file mode 100644
index 000000000000..966fa1062491
--- /dev/null
+++ b/arch/x86/kernel/io_apic_64.c
@@ -0,0 +1,2202 @@
1/*
2 * Intel IO-APIC support for multi-Pentium hosts.
3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
5 *
6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently!
8 *
9 * (c) 1999, Multiple IO-APIC support, developed by
10 * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
11 * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
12 * further tested and cleaned up by Zach Brown <zab@redhat.com>
13 * and Ingo Molnar <mingo@redhat.com>
14 *
15 * Fixes
16 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
17 * thanks to Eric Gilmore
18 * and Rolf G. Tews
19 * for testing these extensively
20 * Paul Diefenbaugh : Added full ACPI support
21 */
22
23#include <linux/mm.h>
24#include <linux/interrupt.h>
25#include <linux/init.h>
26#include <linux/delay.h>
27#include <linux/sched.h>
28#include <linux/pci.h>
29#include <linux/mc146818rtc.h>
30#include <linux/acpi.h>
31#include <linux/sysdev.h>
32#include <linux/msi.h>
33#include <linux/htirq.h>
34#ifdef CONFIG_ACPI
35#include <acpi/acpi_bus.h>
36#endif
37
38#include <asm/idle.h>
39#include <asm/io.h>
40#include <asm/smp.h>
41#include <asm/desc.h>
42#include <asm/proto.h>
43#include <asm/mach_apic.h>
44#include <asm/acpi.h>
45#include <asm/dma.h>
46#include <asm/nmi.h>
47#include <asm/msidef.h>
48#include <asm/hypertransport.h>
49
50struct irq_cfg {
51 cpumask_t domain;
52 cpumask_t old_domain;
53 unsigned move_cleanup_count;
54 u8 vector;
55 u8 move_in_progress : 1;
56};
57
58/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
59struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
60 [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
61 [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
62 [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
63 [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
64 [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
65 [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
66 [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
67 [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
68 [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
69 [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
70 [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
71 [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
72 [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
73 [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
74 [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
75 [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
76};
77
78static int assign_irq_vector(int irq, cpumask_t mask);
79
80#define __apicdebuginit __init
81
82int sis_apic_bug; /* not actually supported, dummy for compile */
83
84static int no_timer_check;
85
86static int disable_timer_pin_1 __initdata;
87
88int timer_over_8254 __initdata = 1;
89
90/* Where if anywhere is the i8259 connect in external int mode */
91static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
92
93static DEFINE_SPINLOCK(ioapic_lock);
94DEFINE_SPINLOCK(vector_lock);
95
96/*
97 * # of IRQ routing registers
98 */
99int nr_ioapic_registers[MAX_IO_APICS];
100
101/*
102 * Rough estimation of how many shared IRQs there are, can
103 * be changed anytime.
104 */
105#define MAX_PLUS_SHARED_IRQS NR_IRQS
106#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
107
108/*
109 * This is performance-critical, we want to do it O(1)
110 *
111 * the indexing order of this array favors 1:1 mappings
112 * between pins and IRQs.
113 */
114
115static struct irq_pin_list {
116 short apic, pin, next;
117} irq_2_pin[PIN_MAP_SIZE];
118
119struct io_apic {
120 unsigned int index;
121 unsigned int unused[3];
122 unsigned int data;
123};
124
125static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
126{
127 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
128 + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
129}
130
131static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
132{
133 struct io_apic __iomem *io_apic = io_apic_base(apic);
134 writel(reg, &io_apic->index);
135 return readl(&io_apic->data);
136}
137
138static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
139{
140 struct io_apic __iomem *io_apic = io_apic_base(apic);
141 writel(reg, &io_apic->index);
142 writel(value, &io_apic->data);
143}
144
145/*
146 * Re-write a value: to be used for read-modify-write
147 * cycles where the read already set up the index register.
148 */
149static inline void io_apic_modify(unsigned int apic, unsigned int value)
150{
151 struct io_apic __iomem *io_apic = io_apic_base(apic);
152 writel(value, &io_apic->data);
153}
154
155static int io_apic_level_ack_pending(unsigned int irq)
156{
157 struct irq_pin_list *entry;
158 unsigned long flags;
159 int pending = 0;
160
161 spin_lock_irqsave(&ioapic_lock, flags);
162 entry = irq_2_pin + irq;
163 for (;;) {
164 unsigned int reg;
165 int pin;
166
167 pin = entry->pin;
168 if (pin == -1)
169 break;
170 reg = io_apic_read(entry->apic, 0x10 + pin*2);
171 /* Is the remote IRR bit set? */
172 pending |= (reg >> 14) & 1;
173 if (!entry->next)
174 break;
175 entry = irq_2_pin + entry->next;
176 }
177 spin_unlock_irqrestore(&ioapic_lock, flags);
178 return pending;
179}
180
181/*
182 * Synchronize the IO-APIC and the CPU by doing
183 * a dummy read from the IO-APIC
184 */
185static inline void io_apic_sync(unsigned int apic)
186{
187 struct io_apic __iomem *io_apic = io_apic_base(apic);
188 readl(&io_apic->data);
189}
190
191#define __DO_ACTION(R, ACTION, FINAL) \
192 \
193{ \
194 int pin; \
195 struct irq_pin_list *entry = irq_2_pin + irq; \
196 \
197 BUG_ON(irq >= NR_IRQS); \
198 for (;;) { \
199 unsigned int reg; \
200 pin = entry->pin; \
201 if (pin == -1) \
202 break; \
203 reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
204 reg ACTION; \
205 io_apic_modify(entry->apic, reg); \
206 FINAL; \
207 if (!entry->next) \
208 break; \
209 entry = irq_2_pin + entry->next; \
210 } \
211}
212
213union entry_union {
214 struct { u32 w1, w2; };
215 struct IO_APIC_route_entry entry;
216};
217
218static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
219{
220 union entry_union eu;
221 unsigned long flags;
222 spin_lock_irqsave(&ioapic_lock, flags);
223 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
224 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
225 spin_unlock_irqrestore(&ioapic_lock, flags);
226 return eu.entry;
227}
228
229/*
230 * When we write a new IO APIC routing entry, we need to write the high
231 * word first! If the mask bit in the low word is clear, we will enable
232 * the interrupt, and we need to make sure the entry is fully populated
233 * before that happens.
234 */
235static void
236__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
237{
238 union entry_union eu;
239 eu.entry = e;
240 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
241 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
242}
243
244static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
245{
246 unsigned long flags;
247 spin_lock_irqsave(&ioapic_lock, flags);
248 __ioapic_write_entry(apic, pin, e);
249 spin_unlock_irqrestore(&ioapic_lock, flags);
250}
251
252/*
253 * When we mask an IO APIC routing entry, we need to write the low
254 * word first, in order to set the mask bit before we change the
255 * high bits!
256 */
257static void ioapic_mask_entry(int apic, int pin)
258{
259 unsigned long flags;
260 union entry_union eu = { .entry.mask = 1 };
261
262 spin_lock_irqsave(&ioapic_lock, flags);
263 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
264 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
265 spin_unlock_irqrestore(&ioapic_lock, flags);
266}
267
268#ifdef CONFIG_SMP
269static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
270{
271 int apic, pin;
272 struct irq_pin_list *entry = irq_2_pin + irq;
273
274 BUG_ON(irq >= NR_IRQS);
275 for (;;) {
276 unsigned int reg;
277 apic = entry->apic;
278 pin = entry->pin;
279 if (pin == -1)
280 break;
281 io_apic_write(apic, 0x11 + pin*2, dest);
282 reg = io_apic_read(apic, 0x10 + pin*2);
283 reg &= ~0x000000ff;
284 reg |= vector;
285 io_apic_modify(apic, reg);
286 if (!entry->next)
287 break;
288 entry = irq_2_pin + entry->next;
289 }
290}
291
292static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
293{
294 struct irq_cfg *cfg = irq_cfg + irq;
295 unsigned long flags;
296 unsigned int dest;
297 cpumask_t tmp;
298
299 cpus_and(tmp, mask, cpu_online_map);
300 if (cpus_empty(tmp))
301 return;
302
303 if (assign_irq_vector(irq, mask))
304 return;
305
306 cpus_and(tmp, cfg->domain, mask);
307 dest = cpu_mask_to_apicid(tmp);
308
309 /*
310 * Only the high 8 bits are valid.
311 */
312 dest = SET_APIC_LOGICAL_ID(dest);
313
314 spin_lock_irqsave(&ioapic_lock, flags);
315 __target_IO_APIC_irq(irq, dest, cfg->vector);
316 irq_desc[irq].affinity = mask;
317 spin_unlock_irqrestore(&ioapic_lock, flags);
318}
319#endif
320
321/*
322 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
323 * shared ISA-space IRQs, so we have to support them. We are super
324 * fast in the common case, and fast for shared ISA-space IRQs.
325 */
326static void add_pin_to_irq(unsigned int irq, int apic, int pin)
327{
328 static int first_free_entry = NR_IRQS;
329 struct irq_pin_list *entry = irq_2_pin + irq;
330
331 BUG_ON(irq >= NR_IRQS);
332 while (entry->next)
333 entry = irq_2_pin + entry->next;
334
335 if (entry->pin != -1) {
336 entry->next = first_free_entry;
337 entry = irq_2_pin + entry->next;
338 if (++first_free_entry >= PIN_MAP_SIZE)
339 panic("io_apic.c: ran out of irq_2_pin entries!");
340 }
341 entry->apic = apic;
342 entry->pin = pin;
343}
344
345
346#define DO_ACTION(name,R,ACTION, FINAL) \
347 \
348 static void name##_IO_APIC_irq (unsigned int irq) \
349 __DO_ACTION(R, ACTION, FINAL)
350
351DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
352 /* mask = 1 */
353DO_ACTION( __unmask, 0, &= 0xfffeffff, )
354 /* mask = 0 */
355
356static void mask_IO_APIC_irq (unsigned int irq)
357{
358 unsigned long flags;
359
360 spin_lock_irqsave(&ioapic_lock, flags);
361 __mask_IO_APIC_irq(irq);
362 spin_unlock_irqrestore(&ioapic_lock, flags);
363}
364
365static void unmask_IO_APIC_irq (unsigned int irq)
366{
367 unsigned long flags;
368
369 spin_lock_irqsave(&ioapic_lock, flags);
370 __unmask_IO_APIC_irq(irq);
371 spin_unlock_irqrestore(&ioapic_lock, flags);
372}
373
374static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
375{
376 struct IO_APIC_route_entry entry;
377
378 /* Check delivery_mode to be sure we're not clearing an SMI pin */
379 entry = ioapic_read_entry(apic, pin);
380 if (entry.delivery_mode == dest_SMI)
381 return;
382 /*
383 * Disable it in the IO-APIC irq-routing table:
384 */
385 ioapic_mask_entry(apic, pin);
386}
387
388static void clear_IO_APIC (void)
389{
390 int apic, pin;
391
392 for (apic = 0; apic < nr_ioapics; apic++)
393 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
394 clear_IO_APIC_pin(apic, pin);
395}
396
397int skip_ioapic_setup;
398int ioapic_force;
399
400static int __init parse_noapic(char *str)
401{
402 disable_ioapic_setup();
403 return 0;
404}
405early_param("noapic", parse_noapic);
406
407/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
408static int __init disable_timer_pin_setup(char *arg)
409{
410 disable_timer_pin_1 = 1;
411 return 1;
412}
413__setup("disable_timer_pin_1", disable_timer_pin_setup);
414
415static int __init setup_disable_8254_timer(char *s)
416{
417 timer_over_8254 = -1;
418 return 1;
419}
420static int __init setup_enable_8254_timer(char *s)
421{
422 timer_over_8254 = 2;
423 return 1;
424}
425
426__setup("disable_8254_timer", setup_disable_8254_timer);
427__setup("enable_8254_timer", setup_enable_8254_timer);
428
429
430/*
431 * Find the IRQ entry number of a certain pin.
432 */
433static int find_irq_entry(int apic, int pin, int type)
434{
435 int i;
436
437 for (i = 0; i < mp_irq_entries; i++)
438 if (mp_irqs[i].mpc_irqtype == type &&
439 (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
440 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
441 mp_irqs[i].mpc_dstirq == pin)
442 return i;
443
444 return -1;
445}
446
447/*
448 * Find the pin to which IRQ[irq] (ISA) is connected
449 */
450static int __init find_isa_irq_pin(int irq, int type)
451{
452 int i;
453
454 for (i = 0; i < mp_irq_entries; i++) {
455 int lbus = mp_irqs[i].mpc_srcbus;
456
457 if (test_bit(lbus, mp_bus_not_pci) &&
458 (mp_irqs[i].mpc_irqtype == type) &&
459 (mp_irqs[i].mpc_srcbusirq == irq))
460
461 return mp_irqs[i].mpc_dstirq;
462 }
463 return -1;
464}
465
466static int __init find_isa_irq_apic(int irq, int type)
467{
468 int i;
469
470 for (i = 0; i < mp_irq_entries; i++) {
471 int lbus = mp_irqs[i].mpc_srcbus;
472
473 if (test_bit(lbus, mp_bus_not_pci) &&
474 (mp_irqs[i].mpc_irqtype == type) &&
475 (mp_irqs[i].mpc_srcbusirq == irq))
476 break;
477 }
478 if (i < mp_irq_entries) {
479 int apic;
480 for(apic = 0; apic < nr_ioapics; apic++) {
481 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
482 return apic;
483 }
484 }
485
486 return -1;
487}
488
489/*
490 * Find a specific PCI IRQ entry.
491 * Not an __init, possibly needed by modules
492 */
493static int pin_2_irq(int idx, int apic, int pin);
494
495int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
496{
497 int apic, i, best_guess = -1;
498
499 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
500 bus, slot, pin);
501 if (mp_bus_id_to_pci_bus[bus] == -1) {
502 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
503 return -1;
504 }
505 for (i = 0; i < mp_irq_entries; i++) {
506 int lbus = mp_irqs[i].mpc_srcbus;
507
508 for (apic = 0; apic < nr_ioapics; apic++)
509 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
510 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
511 break;
512
513 if (!test_bit(lbus, mp_bus_not_pci) &&
514 !mp_irqs[i].mpc_irqtype &&
515 (bus == lbus) &&
516 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
517 int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
518
519 if (!(apic || IO_APIC_IRQ(irq)))
520 continue;
521
522 if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
523 return irq;
524 /*
525 * Use the first all-but-pin matching entry as a
526 * best-guess fuzzy result for broken mptables.
527 */
528 if (best_guess < 0)
529 best_guess = irq;
530 }
531 }
532 BUG_ON(best_guess >= NR_IRQS);
533 return best_guess;
534}
535
/*
 * Defaults used when the MP table lists an interrupt as "conforming".
 *
 * ISA interrupts are always polarity zero, edge triggered.
 */
#define default_ISA_trigger(idx)	(0)
#define default_ISA_polarity(idx)	(0)

/*
 * PCI interrupts are always polarity one, level triggered.
 */
#define default_PCI_trigger(idx)	(1)
#define default_PCI_polarity(idx)	(1)
547
548static int __init MPBIOS_polarity(int idx)
549{
550 int bus = mp_irqs[idx].mpc_srcbus;
551 int polarity;
552
553 /*
554 * Determine IRQ line polarity (high active or low active):
555 */
556 switch (mp_irqs[idx].mpc_irqflag & 3)
557 {
558 case 0: /* conforms, ie. bus-type dependent polarity */
559 if (test_bit(bus, mp_bus_not_pci))
560 polarity = default_ISA_polarity(idx);
561 else
562 polarity = default_PCI_polarity(idx);
563 break;
564 case 1: /* high active */
565 {
566 polarity = 0;
567 break;
568 }
569 case 2: /* reserved */
570 {
571 printk(KERN_WARNING "broken BIOS!!\n");
572 polarity = 1;
573 break;
574 }
575 case 3: /* low active */
576 {
577 polarity = 1;
578 break;
579 }
580 default: /* invalid */
581 {
582 printk(KERN_WARNING "broken BIOS!!\n");
583 polarity = 1;
584 break;
585 }
586 }
587 return polarity;
588}
589
590static int MPBIOS_trigger(int idx)
591{
592 int bus = mp_irqs[idx].mpc_srcbus;
593 int trigger;
594
595 /*
596 * Determine IRQ trigger mode (edge or level sensitive):
597 */
598 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
599 {
600 case 0: /* conforms, ie. bus-type dependent */
601 if (test_bit(bus, mp_bus_not_pci))
602 trigger = default_ISA_trigger(idx);
603 else
604 trigger = default_PCI_trigger(idx);
605 break;
606 case 1: /* edge */
607 {
608 trigger = 0;
609 break;
610 }
611 case 2: /* reserved */
612 {
613 printk(KERN_WARNING "broken BIOS!!\n");
614 trigger = 1;
615 break;
616 }
617 case 3: /* level */
618 {
619 trigger = 1;
620 break;
621 }
622 default: /* invalid */
623 {
624 printk(KERN_WARNING "broken BIOS!!\n");
625 trigger = 0;
626 break;
627 }
628 }
629 return trigger;
630}
631
/* Polarity of mp_irqs[@idx]; forwards to MPBIOS_polarity(). */
static inline int irq_polarity(int idx)
{
	int polarity = MPBIOS_polarity(idx);

	return polarity;
}
636
/* Trigger mode of mp_irqs[@idx]; forwards to MPBIOS_trigger(). */
static inline int irq_trigger(int idx)
{
	int trigger = MPBIOS_trigger(idx);

	return trigger;
}
641
642static int pin_2_irq(int idx, int apic, int pin)
643{
644 int irq, i;
645 int bus = mp_irqs[idx].mpc_srcbus;
646
647 /*
648 * Debugging check, we are in big trouble if this message pops up!
649 */
650 if (mp_irqs[idx].mpc_dstirq != pin)
651 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
652
653 if (test_bit(bus, mp_bus_not_pci)) {
654 irq = mp_irqs[idx].mpc_srcbusirq;
655 } else {
656 /*
657 * PCI IRQs are mapped in order
658 */
659 i = irq = 0;
660 while (i < apic)
661 irq += nr_ioapic_registers[i++];
662 irq += pin;
663 }
664 BUG_ON(irq >= NR_IRQS);
665 return irq;
666}
667
668static int __assign_irq_vector(int irq, cpumask_t mask)
669{
670 /*
671 * NOTE! The local APIC isn't very good at handling
672 * multiple interrupts at the same interrupt level.
673 * As the interrupt level is determined by taking the
674 * vector number and shifting that right by 4, we
675 * want to spread these out a bit so that they don't
676 * all fall in the same interrupt level.
677 *
678 * Also, we've got to be careful not to trash gate
679 * 0x80, because int 0x80 is hm, kind of importantish. ;)
680 */
681 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
682 unsigned int old_vector;
683 int cpu;
684 struct irq_cfg *cfg;
685
686 BUG_ON((unsigned)irq >= NR_IRQS);
687 cfg = &irq_cfg[irq];
688
689 /* Only try and allocate irqs on cpus that are present */
690 cpus_and(mask, mask, cpu_online_map);
691
692 if ((cfg->move_in_progress) || cfg->move_cleanup_count)
693 return -EBUSY;
694
695 old_vector = cfg->vector;
696 if (old_vector) {
697 cpumask_t tmp;
698 cpus_and(tmp, cfg->domain, mask);
699 if (!cpus_empty(tmp))
700 return 0;
701 }
702
703 for_each_cpu_mask(cpu, mask) {
704 cpumask_t domain, new_mask;
705 int new_cpu;
706 int vector, offset;
707
708 domain = vector_allocation_domain(cpu);
709 cpus_and(new_mask, domain, cpu_online_map);
710
711 vector = current_vector;
712 offset = current_offset;
713next:
714 vector += 8;
715 if (vector >= FIRST_SYSTEM_VECTOR) {
716 /* If we run out of vectors on large boxen, must share them. */
717 offset = (offset + 1) % 8;
718 vector = FIRST_DEVICE_VECTOR + offset;
719 }
720 if (unlikely(current_vector == vector))
721 continue;
722 if (vector == IA32_SYSCALL_VECTOR)
723 goto next;
724 for_each_cpu_mask(new_cpu, new_mask)
725 if (per_cpu(vector_irq, new_cpu)[vector] != -1)
726 goto next;
727 /* Found one! */
728 current_vector = vector;
729 current_offset = offset;
730 if (old_vector) {
731 cfg->move_in_progress = 1;
732 cfg->old_domain = cfg->domain;
733 }
734 for_each_cpu_mask(new_cpu, new_mask)
735 per_cpu(vector_irq, new_cpu)[vector] = irq;
736 cfg->vector = vector;
737 cfg->domain = domain;
738 return 0;
739 }
740 return -ENOSPC;
741}
742
743static int assign_irq_vector(int irq, cpumask_t mask)
744{
745 int err;
746 unsigned long flags;
747
748 spin_lock_irqsave(&vector_lock, flags);
749 err = __assign_irq_vector(irq, mask);
750 spin_unlock_irqrestore(&vector_lock, flags);
751 return err;
752}
753
754static void __clear_irq_vector(int irq)
755{
756 struct irq_cfg *cfg;
757 cpumask_t mask;
758 int cpu, vector;
759
760 BUG_ON((unsigned)irq >= NR_IRQS);
761 cfg = &irq_cfg[irq];
762 BUG_ON(!cfg->vector);
763
764 vector = cfg->vector;
765 cpus_and(mask, cfg->domain, cpu_online_map);
766 for_each_cpu_mask(cpu, mask)
767 per_cpu(vector_irq, cpu)[vector] = -1;
768
769 cfg->vector = 0;
770 cfg->domain = CPU_MASK_NONE;
771}
772
773void __setup_vector_irq(int cpu)
774{
775 /* Initialize vector_irq on a new cpu */
776 /* This function must be called with vector_lock held */
777 int irq, vector;
778
779 /* Mark the inuse vectors */
780 for (irq = 0; irq < NR_IRQS; ++irq) {
781 if (!cpu_isset(cpu, irq_cfg[irq].domain))
782 continue;
783 vector = irq_cfg[irq].vector;
784 per_cpu(vector_irq, cpu)[vector] = irq;
785 }
786 /* Mark the free vectors */
787 for (vector = 0; vector < NR_VECTORS; ++vector) {
788 irq = per_cpu(vector_irq, cpu)[vector];
789 if (irq < 0)
790 continue;
791 if (!cpu_isset(cpu, irq_cfg[irq].domain))
792 per_cpu(vector_irq, cpu)[vector] = -1;
793 }
794}
795
796
797static struct irq_chip ioapic_chip;
798
799static void ioapic_register_intr(int irq, unsigned long trigger)
800{
801 if (trigger) {
802 irq_desc[irq].status |= IRQ_LEVEL;
803 set_irq_chip_and_handler_name(irq, &ioapic_chip,
804 handle_fasteoi_irq, "fasteoi");
805 } else {
806 irq_desc[irq].status &= ~IRQ_LEVEL;
807 set_irq_chip_and_handler_name(irq, &ioapic_chip,
808 handle_edge_irq, "edge");
809 }
810}
811
812static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
813 int trigger, int polarity)
814{
815 struct irq_cfg *cfg = irq_cfg + irq;
816 struct IO_APIC_route_entry entry;
817 cpumask_t mask;
818
819 if (!IO_APIC_IRQ(irq))
820 return;
821
822 mask = TARGET_CPUS;
823 if (assign_irq_vector(irq, mask))
824 return;
825
826 cpus_and(mask, cfg->domain, mask);
827
828 apic_printk(APIC_VERBOSE,KERN_DEBUG
829 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
830 "IRQ %d Mode:%i Active:%i)\n",
831 apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
832 irq, trigger, polarity);
833
834 /*
835 * add it to the IO-APIC irq-routing table:
836 */
837 memset(&entry,0,sizeof(entry));
838
839 entry.delivery_mode = INT_DELIVERY_MODE;
840 entry.dest_mode = INT_DEST_MODE;
841 entry.dest = cpu_mask_to_apicid(mask);
842 entry.mask = 0; /* enable IRQ */
843 entry.trigger = trigger;
844 entry.polarity = polarity;
845 entry.vector = cfg->vector;
846
847 /* Mask level triggered irqs.
848 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
849 */
850 if (trigger)
851 entry.mask = 1;
852
853 ioapic_register_intr(irq, trigger);
854 if (irq < 16)
855 disable_8259A_irq(irq);
856
857 ioapic_write_entry(apic, pin, entry);
858}
859
860static void __init setup_IO_APIC_irqs(void)
861{
862 int apic, pin, idx, irq, first_notcon = 1;
863
864 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
865
866 for (apic = 0; apic < nr_ioapics; apic++) {
867 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
868
869 idx = find_irq_entry(apic,pin,mp_INT);
870 if (idx == -1) {
871 if (first_notcon) {
872 apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
873 first_notcon = 0;
874 } else
875 apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
876 continue;
877 }
878
879 irq = pin_2_irq(idx, apic, pin);
880 add_pin_to_irq(irq, apic, pin);
881
882 setup_IO_APIC_irq(apic, pin, irq,
883 irq_trigger(idx), irq_polarity(idx));
884 }
885 }
886
887 if (!first_notcon)
888 apic_printk(APIC_VERBOSE," not connected.\n");
889}
890
891/*
892 * Set up the 8259A-master output pin as broadcast to all
893 * CPUs.
894 */
895static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
896{
897 struct IO_APIC_route_entry entry;
898 unsigned long flags;
899
900 memset(&entry,0,sizeof(entry));
901
902 disable_8259A_irq(0);
903
904 /* mask LVT0 */
905 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
906
907 /*
908 * We use logical delivery to get the timer IRQ
909 * to the first CPU.
910 */
911 entry.dest_mode = INT_DEST_MODE;
912 entry.mask = 0; /* unmask IRQ now */
913 entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
914 entry.delivery_mode = INT_DELIVERY_MODE;
915 entry.polarity = 0;
916 entry.trigger = 0;
917 entry.vector = vector;
918
919 /*
920 * The timer IRQ doesn't have to know that behind the
921 * scene we have a 8259A-master in AEOI mode ...
922 */
923 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
924
925 /*
926 * Add it to the IO-APIC irq-routing table:
927 */
928 spin_lock_irqsave(&ioapic_lock, flags);
929 io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
930 io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
931 spin_unlock_irqrestore(&ioapic_lock, flags);
932
933 enable_8259A_irq(0);
934}
935
936void __apicdebuginit print_IO_APIC(void)
937{
938 int apic, i;
939 union IO_APIC_reg_00 reg_00;
940 union IO_APIC_reg_01 reg_01;
941 union IO_APIC_reg_02 reg_02;
942 unsigned long flags;
943
944 if (apic_verbosity == APIC_QUIET)
945 return;
946
947 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
948 for (i = 0; i < nr_ioapics; i++)
949 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
950 mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
951
952 /*
953 * We are a bit conservative about what we expect. We have to
954 * know about every hardware change ASAP.
955 */
956 printk(KERN_INFO "testing the IO APIC.......................\n");
957
958 for (apic = 0; apic < nr_ioapics; apic++) {
959
960 spin_lock_irqsave(&ioapic_lock, flags);
961 reg_00.raw = io_apic_read(apic, 0);
962 reg_01.raw = io_apic_read(apic, 1);
963 if (reg_01.bits.version >= 0x10)
964 reg_02.raw = io_apic_read(apic, 2);
965 spin_unlock_irqrestore(&ioapic_lock, flags);
966
967 printk("\n");
968 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
969 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
970 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
971
972 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
973 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
974
975 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
976 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
977
978 if (reg_01.bits.version >= 0x10) {
979 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
980 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
981 }
982
983 printk(KERN_DEBUG ".... IRQ redirection table:\n");
984
985 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
986 " Stat Dmod Deli Vect: \n");
987
988 for (i = 0; i <= reg_01.bits.entries; i++) {
989 struct IO_APIC_route_entry entry;
990
991 entry = ioapic_read_entry(apic, i);
992
993 printk(KERN_DEBUG " %02x %03X ",
994 i,
995 entry.dest
996 );
997
998 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
999 entry.mask,
1000 entry.trigger,
1001 entry.irr,
1002 entry.polarity,
1003 entry.delivery_status,
1004 entry.dest_mode,
1005 entry.delivery_mode,
1006 entry.vector
1007 );
1008 }
1009 }
1010 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1011 for (i = 0; i < NR_IRQS; i++) {
1012 struct irq_pin_list *entry = irq_2_pin + i;
1013 if (entry->pin < 0)
1014 continue;
1015 printk(KERN_DEBUG "IRQ%d ", i);
1016 for (;;) {
1017 printk("-> %d:%d", entry->apic, entry->pin);
1018 if (!entry->next)
1019 break;
1020 entry = irq_2_pin + entry->next;
1021 }
1022 printk("\n");
1023 }
1024
1025 printk(KERN_INFO ".................................... done.\n");
1026
1027 return;
1028}
1029
1030#if 0
1031
1032static __apicdebuginit void print_APIC_bitfield (int base)
1033{
1034 unsigned int v;
1035 int i, j;
1036
1037 if (apic_verbosity == APIC_QUIET)
1038 return;
1039
1040 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
1041 for (i = 0; i < 8; i++) {
1042 v = apic_read(base + i*0x10);
1043 for (j = 0; j < 32; j++) {
1044 if (v & (1<<j))
1045 printk("1");
1046 else
1047 printk("0");
1048 }
1049 printk("\n");
1050 }
1051}
1052
1053void __apicdebuginit print_local_APIC(void * dummy)
1054{
1055 unsigned int v, ver, maxlvt;
1056
1057 if (apic_verbosity == APIC_QUIET)
1058 return;
1059
1060 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1061 smp_processor_id(), hard_smp_processor_id());
1062 v = apic_read(APIC_ID);
1063 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
1064 v = apic_read(APIC_LVR);
1065 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1066 ver = GET_APIC_VERSION(v);
1067 maxlvt = get_maxlvt();
1068
1069 v = apic_read(APIC_TASKPRI);
1070 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
1071
1072 v = apic_read(APIC_ARBPRI);
1073 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
1074 v & APIC_ARBPRI_MASK);
1075 v = apic_read(APIC_PROCPRI);
1076 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
1077
1078 v = apic_read(APIC_EOI);
1079 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1080 v = apic_read(APIC_RRR);
1081 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1082 v = apic_read(APIC_LDR);
1083 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1084 v = apic_read(APIC_DFR);
1085 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1086 v = apic_read(APIC_SPIV);
1087 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1088
1089 printk(KERN_DEBUG "... APIC ISR field:\n");
1090 print_APIC_bitfield(APIC_ISR);
1091 printk(KERN_DEBUG "... APIC TMR field:\n");
1092 print_APIC_bitfield(APIC_TMR);
1093 printk(KERN_DEBUG "... APIC IRR field:\n");
1094 print_APIC_bitfield(APIC_IRR);
1095
1096 v = apic_read(APIC_ESR);
1097 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1098
1099 v = apic_read(APIC_ICR);
1100 printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
1101 v = apic_read(APIC_ICR2);
1102 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1103
1104 v = apic_read(APIC_LVTT);
1105 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
1106
1107 if (maxlvt > 3) { /* PC is LVT#4. */
1108 v = apic_read(APIC_LVTPC);
1109 printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
1110 }
1111 v = apic_read(APIC_LVT0);
1112 printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
1113 v = apic_read(APIC_LVT1);
1114 printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
1115
1116 if (maxlvt > 2) { /* ERR is LVT#3. */
1117 v = apic_read(APIC_LVTERR);
1118 printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
1119 }
1120
1121 v = apic_read(APIC_TMICT);
1122 printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
1123 v = apic_read(APIC_TMCCT);
1124 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1125 v = apic_read(APIC_TDCR);
1126 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1127 printk("\n");
1128}
1129
1130void print_all_local_APICs (void)
1131{
1132 on_each_cpu(print_local_APIC, NULL, 1, 1);
1133}
1134
1135void __apicdebuginit print_PIC(void)
1136{
1137 unsigned int v;
1138 unsigned long flags;
1139
1140 if (apic_verbosity == APIC_QUIET)
1141 return;
1142
1143 printk(KERN_DEBUG "\nprinting PIC contents\n");
1144
1145 spin_lock_irqsave(&i8259A_lock, flags);
1146
1147 v = inb(0xa1) << 8 | inb(0x21);
1148 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
1149
1150 v = inb(0xa0) << 8 | inb(0x20);
1151 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
1152
1153 outb(0x0b,0xa0);
1154 outb(0x0b,0x20);
1155 v = inb(0xa0) << 8 | inb(0x20);
1156 outb(0x0a,0xa0);
1157 outb(0x0a,0x20);
1158
1159 spin_unlock_irqrestore(&i8259A_lock, flags);
1160
1161 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1162
1163 v = inb(0x4d1) << 8 | inb(0x4d0);
1164 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1165}
1166
1167#endif /* 0 */
1168
1169static void __init enable_IO_APIC(void)
1170{
1171 union IO_APIC_reg_01 reg_01;
1172 int i8259_apic, i8259_pin;
1173 int i, apic;
1174 unsigned long flags;
1175
1176 for (i = 0; i < PIN_MAP_SIZE; i++) {
1177 irq_2_pin[i].pin = -1;
1178 irq_2_pin[i].next = 0;
1179 }
1180
1181 /*
1182 * The number of IO-APIC IRQ registers (== #pins):
1183 */
1184 for (apic = 0; apic < nr_ioapics; apic++) {
1185 spin_lock_irqsave(&ioapic_lock, flags);
1186 reg_01.raw = io_apic_read(apic, 1);
1187 spin_unlock_irqrestore(&ioapic_lock, flags);
1188 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1189 }
1190 for(apic = 0; apic < nr_ioapics; apic++) {
1191 int pin;
1192 /* See if any of the pins is in ExtINT mode */
1193 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1194 struct IO_APIC_route_entry entry;
1195 entry = ioapic_read_entry(apic, pin);
1196
1197 /* If the interrupt line is enabled and in ExtInt mode
1198 * I have found the pin where the i8259 is connected.
1199 */
1200 if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
1201 ioapic_i8259.apic = apic;
1202 ioapic_i8259.pin = pin;
1203 goto found_i8259;
1204 }
1205 }
1206 }
1207 found_i8259:
1208 /* Look to see what if the MP table has reported the ExtINT */
1209 i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
1210 i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
1211 /* Trust the MP table if nothing is setup in the hardware */
1212 if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
1213 printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
1214 ioapic_i8259.pin = i8259_pin;
1215 ioapic_i8259.apic = i8259_apic;
1216 }
1217 /* Complain if the MP table and the hardware disagree */
1218 if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
1219 (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
1220 {
1221 printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
1222 }
1223
1224 /*
1225 * Do not trust the IO-APIC being empty at bootup
1226 */
1227 clear_IO_APIC();
1228}
1229
1230/*
1231 * Not an __init, needed by the reboot code
1232 */
1233void disable_IO_APIC(void)
1234{
1235 /*
1236 * Clear the IO-APIC before rebooting:
1237 */
1238 clear_IO_APIC();
1239
1240 /*
1241 * If the i8259 is routed through an IOAPIC
1242 * Put that IOAPIC in virtual wire mode
1243 * so legacy interrupts can be delivered.
1244 */
1245 if (ioapic_i8259.pin != -1) {
1246 struct IO_APIC_route_entry entry;
1247
1248 memset(&entry, 0, sizeof(entry));
1249 entry.mask = 0; /* Enabled */
1250 entry.trigger = 0; /* Edge */
1251 entry.irr = 0;
1252 entry.polarity = 0; /* High */
1253 entry.delivery_status = 0;
1254 entry.dest_mode = 0; /* Physical */
1255 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1256 entry.vector = 0;
1257 entry.dest = GET_APIC_ID(apic_read(APIC_ID));
1258
1259 /*
1260 * Add it to the IO-APIC irq-routing table:
1261 */
1262 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
1263 }
1264
1265 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1266}
1267
1268/*
1269 * There is a nasty bug in some older SMP boards, their mptable lies
1270 * about the timer IRQ. We do the following to work around the situation:
1271 *
1272 * - timer IRQ defaults to IO-APIC IRQ
1273 * - if this function detects that timer IRQs are defunct, then we fall
1274 * back to ISA timer IRQs
1275 */
1276static int __init timer_irq_works(void)
1277{
1278 unsigned long t1 = jiffies;
1279
1280 local_irq_enable();
1281 /* Let ten ticks pass... */
1282 mdelay((10 * 1000) / HZ);
1283
1284 /*
1285 * Expect a few ticks at least, to be sure some possible
1286 * glue logic does not lock up after one or two first
1287 * ticks in a non-ExtINT mode. Also the local APIC
1288 * might have cached one ExtINT interrupt. Finally, at
1289 * least one tick may be lost due to delays.
1290 */
1291
1292 /* jiffies wrap? */
1293 if (jiffies - t1 > 4)
1294 return 1;
1295 return 0;
1296}
1297
1298/*
1299 * In the SMP+IOAPIC case it might happen that there are an unspecified
1300 * number of pending IRQ events unhandled. These cases are very rare,
1301 * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
1302 * better to do it this way as thus we do not have to be aware of
1303 * 'pending' interrupts in the IRQ path, except at this point.
1304 */
1305/*
1306 * Edge triggered needs to resend any interrupt
1307 * that was delayed but this is now handled in the device
1308 * independent code.
1309 */
1310
1311/*
1312 * Starting up a edge-triggered IO-APIC interrupt is
1313 * nasty - we need to make sure that we get the edge.
1314 * If it is already asserted for some reason, we need
1315 * return 1 to indicate that is was pending.
1316 *
1317 * This is not complete - we should be able to fake
1318 * an edge even if it isn't on the 8259A...
1319 */
1320
1321static unsigned int startup_ioapic_irq(unsigned int irq)
1322{
1323 int was_pending = 0;
1324 unsigned long flags;
1325
1326 spin_lock_irqsave(&ioapic_lock, flags);
1327 if (irq < 16) {
1328 disable_8259A_irq(irq);
1329 if (i8259A_irq_pending(irq))
1330 was_pending = 1;
1331 }
1332 __unmask_IO_APIC_irq(irq);
1333 spin_unlock_irqrestore(&ioapic_lock, flags);
1334
1335 return was_pending;
1336}
1337
1338static int ioapic_retrigger_irq(unsigned int irq)
1339{
1340 struct irq_cfg *cfg = &irq_cfg[irq];
1341 cpumask_t mask;
1342 unsigned long flags;
1343
1344 spin_lock_irqsave(&vector_lock, flags);
1345 cpus_clear(mask);
1346 cpu_set(first_cpu(cfg->domain), mask);
1347
1348 send_IPI_mask(mask, cfg->vector);
1349 spin_unlock_irqrestore(&vector_lock, flags);
1350
1351 return 1;
1352}
1353
1354/*
1355 * Level and edge triggered IO-APIC interrupts need different handling,
1356 * so we use two separate IRQ descriptors. Edge triggered IRQs can be
1357 * handled with the level-triggered descriptor, but that one has slightly
1358 * more overhead. Level-triggered interrupts cannot be handled with the
1359 * edge-triggered handler, without risking IRQ storms and other ugly
1360 * races.
1361 */
1362
1363#ifdef CONFIG_SMP
1364asmlinkage void smp_irq_move_cleanup_interrupt(void)
1365{
1366 unsigned vector, me;
1367 ack_APIC_irq();
1368 exit_idle();
1369 irq_enter();
1370
1371 me = smp_processor_id();
1372 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
1373 unsigned int irq;
1374 struct irq_desc *desc;
1375 struct irq_cfg *cfg;
1376 irq = __get_cpu_var(vector_irq)[vector];
1377 if (irq >= NR_IRQS)
1378 continue;
1379
1380 desc = irq_desc + irq;
1381 cfg = irq_cfg + irq;
1382 spin_lock(&desc->lock);
1383 if (!cfg->move_cleanup_count)
1384 goto unlock;
1385
1386 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
1387 goto unlock;
1388
1389 __get_cpu_var(vector_irq)[vector] = -1;
1390 cfg->move_cleanup_count--;
1391unlock:
1392 spin_unlock(&desc->lock);
1393 }
1394
1395 irq_exit();
1396}
1397
1398static void irq_complete_move(unsigned int irq)
1399{
1400 struct irq_cfg *cfg = irq_cfg + irq;
1401 unsigned vector, me;
1402
1403 if (likely(!cfg->move_in_progress))
1404 return;
1405
1406 vector = ~get_irq_regs()->orig_rax;
1407 me = smp_processor_id();
1408 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
1409 cpumask_t cleanup_mask;
1410
1411 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
1412 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
1413 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
1414 cfg->move_in_progress = 0;
1415 }
1416}
1417#else
/* Without CONFIG_SMP this is an empty stub. */
static inline void irq_complete_move(unsigned int irq)
{
}
1419#endif
1420
/*
 * ->ack handler of ioapic_chip: complete any pending vector move,
 * call move_native_irq(), then acknowledge the local APIC.
 */
static void ack_apic_edge(unsigned int irq)
{
	irq_complete_move(irq);
	move_native_irq(irq);

	ack_APIC_irq();
}
1427
/*
 * ->eoi handler of ioapic_chip.
 *
 * Completes any pending vector move and acknowledges the local APIC.
 * If a move of the irq is pending, the irq is masked around the
 * acknowledge and is only migrated once the IO-APIC no longer reports
 * a level ack as pending.
 */
static void ack_apic_level(unsigned int irq)
{
	int masked_for_move = 0;

	irq_complete_move(irq);
#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
	/* If we are moving the irq we need to mask it */
	if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
		masked_for_move = 1;
		mask_IO_APIC_irq(irq);
	}
#endif

	/*
	 * We must acknowledge the irq before we move it or the acknowledge
	 * will not propagate properly.
	 */
	ack_APIC_irq();

	/* Now we can move and re-enable the irq */
	if (unlikely(masked_for_move)) {
		/*
		 * Only migrate the irq if the ack has been received.
		 *
		 * On rare occasions the broadcast level triggered ack gets
		 * delayed going to ioapics, and if we reprogram the vector
		 * while Remote IRR is still set the irq will never fire
		 * again.
		 *
		 * To prevent this scenario we read the Remote IRR bit of
		 * the ioapic.  This has two effects:
		 *  - On any sane system the read of the ioapic will flush
		 *    writes (and acks) going to the ioapic from this cpu.
		 *  - We get to see if the ACK has actually been delivered.
		 *
		 * Based on failed experiments of reprogramming the ioapic
		 * entry from outside of irq context (masking the entry,
		 * then polling until Remote IRR was clear before
		 * reprogramming), I don't trust the Remote IRR bit to be
		 * completely accurate.
		 *
		 * However there appears to be no other way to plug this
		 * race, so if the Remote IRR bit is not accurate and is
		 * causing problems then it is a hardware bug and you can
		 * go talk to the chipset vendor about it.
		 */
		if (!io_apic_level_ack_pending(irq))
			move_masked_irq(irq);
		unmask_IO_APIC_irq(irq);
	}
}
1480
1481static struct irq_chip ioapic_chip __read_mostly = {
1482 .name = "IO-APIC",
1483 .startup = startup_ioapic_irq,
1484 .mask = mask_IO_APIC_irq,
1485 .unmask = unmask_IO_APIC_irq,
1486 .ack = ack_apic_edge,
1487 .eoi = ack_apic_level,
1488#ifdef CONFIG_SMP
1489 .set_affinity = set_ioapic_affinity_irq,
1490#endif
1491 .retrigger = ioapic_retrigger_irq,
1492};
1493
1494static inline void init_IO_APIC_traps(void)
1495{
1496 int irq;
1497
1498 /*
1499 * NOTE! The local APIC isn't very good at handling
1500 * multiple interrupts at the same interrupt level.
1501 * As the interrupt level is determined by taking the
1502 * vector number and shifting that right by 4, we
1503 * want to spread these out a bit so that they don't
1504 * all fall in the same interrupt level.
1505 *
1506 * Also, we've got to be careful not to trash gate
1507 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1508 */
1509 for (irq = 0; irq < NR_IRQS ; irq++) {
1510 int tmp = irq;
1511 if (IO_APIC_IRQ(tmp) && !irq_cfg[tmp].vector) {
1512 /*
1513 * Hmm.. We don't have an entry for this,
1514 * so default to an old-fashioned 8259
1515 * interrupt if we can..
1516 */
1517 if (irq < 16)
1518 make_8259A_irq(irq);
1519 else
1520 /* Strange. Oh, well.. */
1521 irq_desc[irq].chip = &no_irq_chip;
1522 }
1523 }
1524}
1525
1526static void enable_lapic_irq (unsigned int irq)
1527{
1528 unsigned long v;
1529
1530 v = apic_read(APIC_LVT0);
1531 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
1532}
1533
1534static void disable_lapic_irq (unsigned int irq)
1535{
1536 unsigned long v;
1537
1538 v = apic_read(APIC_LVT0);
1539 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
1540}
1541
/* .ack hook of lapic_irq_type: forwards to ack_APIC_irq(); @irq is not used. */
static void ack_lapic_irq (unsigned int irq)
{
	ack_APIC_irq();
}
1546
/* .end hook of lapic_irq_type: intentionally does nothing. */
static void end_lapic_irq (unsigned int i)
{
	/* nothing */
}
1548
/*
 * Interrupt type installed on IRQ0 by check_timer() when the timer is
 * driven through the local APIC LVT0 entry. startup/shutdown are left
 * NULL on purpose.
 */
static struct hw_interrupt_type lapic_irq_type __read_mostly = {
	.name = "local-APIC",
	.typename = "local-APIC-edge",
	.startup = NULL, /* startup_irq() not used for IRQ0 */
	.shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
	.enable = enable_lapic_irq,
	.disable = disable_lapic_irq,
	.ack = ack_lapic_irq,
	.end = end_lapic_irq,
};
1559
/*
 * Announce and activate the IO-APIC driven NMI watchdog by calling
 * enable_NMI_through_LVT0(). The 8259A mode setup mentioned below is not
 * performed in this function.
 */
static void setup_nmi (void)
{
	/*
	 * Dirty trick to enable the NMI watchdog ...
	 * We put the 8259A master into AEOI mode and
	 * unmask on all local APICs LVT0 as NMI.
	 *
	 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
	 * is from Maciej W. Rozycki - so we do not have to EOI from
	 * the NMI handler or the timer interrupt.
	 */
	printk(KERN_INFO "activating NMI Watchdog ...");

	enable_NMI_through_LVT0(NULL);

	printk(" done.\n");
}
1577
1578/*
1579 * This looks a bit hackish but it's about the only one way of sending
1580 * a few INTA cycles to 8259As and any associated glue logic. ICR does
1581 * not support the ExtINT mode, unfortunately. We need to send these
1582 * cycles as some i82489DX-based boards have glue logic that keeps the
1583 * 8259A interrupt line asserted until INTA. --macro
1584 */
1585static inline void unlock_ExtINT_logic(void)
1586{
1587 int apic, pin, i;
1588 struct IO_APIC_route_entry entry0, entry1;
1589 unsigned char save_control, save_freq_select;
1590 unsigned long flags;
1591
1592 pin = find_isa_irq_pin(8, mp_INT);
1593 apic = find_isa_irq_apic(8, mp_INT);
1594 if (pin == -1)
1595 return;
1596
1597 spin_lock_irqsave(&ioapic_lock, flags);
1598 *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
1599 *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
1600 spin_unlock_irqrestore(&ioapic_lock, flags);
1601 clear_IO_APIC_pin(apic, pin);
1602
1603 memset(&entry1, 0, sizeof(entry1));
1604
1605 entry1.dest_mode = 0; /* physical delivery */
1606 entry1.mask = 0; /* unmask IRQ now */
1607 entry1.dest = hard_smp_processor_id();
1608 entry1.delivery_mode = dest_ExtINT;
1609 entry1.polarity = entry0.polarity;
1610 entry1.trigger = 0;
1611 entry1.vector = 0;
1612
1613 spin_lock_irqsave(&ioapic_lock, flags);
1614 io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
1615 io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
1616 spin_unlock_irqrestore(&ioapic_lock, flags);
1617
1618 save_control = CMOS_READ(RTC_CONTROL);
1619 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
1620 CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
1621 RTC_FREQ_SELECT);
1622 CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
1623
1624 i = 100;
1625 while (i-- > 0) {
1626 mdelay(10);
1627 if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
1628 i -= 10;
1629 }
1630
1631 CMOS_WRITE(save_control, RTC_CONTROL);
1632 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
1633 clear_IO_APIC_pin(apic, pin);
1634
1635 spin_lock_irqsave(&ioapic_lock, flags);
1636 io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
1637 io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
1638 spin_unlock_irqrestore(&ioapic_lock, flags);
1639}
1640
1641/*
1642 * This code may look a bit paranoid, but it's supposed to cooperate with
1643 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
1644 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
1645 * fanatically on his truly buggy board.
1646 *
1647 * FIXME: really need to revamp this for modern platforms only.
1648 */
1649static inline void check_timer(void)
1650{
1651 struct irq_cfg *cfg = irq_cfg + 0;
1652 int apic1, pin1, apic2, pin2;
1653
1654 /*
1655 * get/set the timer IRQ vector:
1656 */
1657 disable_8259A_irq(0);
1658 assign_irq_vector(0, TARGET_CPUS);
1659
1660 /*
1661 * Subtle, code in do_timer_interrupt() expects an AEOI
1662 * mode for the 8259A whenever interrupts are routed
1663 * through I/O APICs. Also IRQ0 has to be enabled in
1664 * the 8259A which implies the virtual wire has to be
1665 * disabled in the local APIC.
1666 */
1667 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1668 init_8259A(1);
1669 if (timer_over_8254 > 0)
1670 enable_8259A_irq(0);
1671
1672 pin1 = find_isa_irq_pin(0, mp_INT);
1673 apic1 = find_isa_irq_apic(0, mp_INT);
1674 pin2 = ioapic_i8259.pin;
1675 apic2 = ioapic_i8259.apic;
1676
1677 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
1678 cfg->vector, apic1, pin1, apic2, pin2);
1679
1680 if (pin1 != -1) {
1681 /*
1682 * Ok, does IRQ0 through the IOAPIC work?
1683 */
1684 unmask_IO_APIC_irq(0);
1685 if (!no_timer_check && timer_irq_works()) {
1686 nmi_watchdog_default();
1687 if (nmi_watchdog == NMI_IO_APIC) {
1688 disable_8259A_irq(0);
1689 setup_nmi();
1690 enable_8259A_irq(0);
1691 }
1692 if (disable_timer_pin_1 > 0)
1693 clear_IO_APIC_pin(0, pin1);
1694 return;
1695 }
1696 clear_IO_APIC_pin(apic1, pin1);
1697 apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
1698 "connected to IO-APIC\n");
1699 }
1700
1701 apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
1702 "through the 8259A ... ");
1703 if (pin2 != -1) {
1704 apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
1705 apic2, pin2);
1706 /*
1707 * legacy devices should be connected to IO APIC #0
1708 */
1709 setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
1710 if (timer_irq_works()) {
1711 apic_printk(APIC_VERBOSE," works.\n");
1712 nmi_watchdog_default();
1713 if (nmi_watchdog == NMI_IO_APIC) {
1714 setup_nmi();
1715 }
1716 return;
1717 }
1718 /*
1719 * Cleanup, just in case ...
1720 */
1721 clear_IO_APIC_pin(apic2, pin2);
1722 }
1723 apic_printk(APIC_VERBOSE," failed.\n");
1724
1725 if (nmi_watchdog == NMI_IO_APIC) {
1726 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
1727 nmi_watchdog = 0;
1728 }
1729
1730 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
1731
1732 disable_8259A_irq(0);
1733 irq_desc[0].chip = &lapic_irq_type;
1734 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
1735 enable_8259A_irq(0);
1736
1737 if (timer_irq_works()) {
1738 apic_printk(APIC_VERBOSE," works.\n");
1739 return;
1740 }
1741 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
1742 apic_printk(APIC_VERBOSE," failed.\n");
1743
1744 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
1745
1746 init_8259A(0);
1747 make_8259A_irq(0);
1748 apic_write(APIC_LVT0, APIC_DM_EXTINT);
1749
1750 unlock_ExtINT_logic();
1751
1752 if (timer_irq_works()) {
1753 apic_printk(APIC_VERBOSE," works.\n");
1754 return;
1755 }
1756 apic_printk(APIC_VERBOSE," failed :(.\n");
1757 panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
1758}
1759
/*
 * Handler for the "no_timer_check" kernel parameter: sets no_timer_check,
 * which makes check_timer() skip the IRQ0-through-IO-APIC success path.
 * The argument string is ignored. Always returns 1.
 */
static int __init notimercheck(char *s)
{
	no_timer_check = 1;
	return 1;
}
__setup("no_timer_check", notimercheck);
1766
/*
 * IRQs that are handled by the PIC in the MPS IOAPIC case.
 * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
 *   Linux doesn't really care, as it's not actually used
 *   for any interrupt handling anyway.
 *
 * The value is a bitmask indexed by IRQ number (bit 2 == IRQ2).
 */
#define PIC_IRQS	(1<<2)
1775
1776void __init setup_IO_APIC(void)
1777{
1778 enable_IO_APIC();
1779
1780 if (acpi_ioapic)
1781 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
1782 else
1783 io_apic_irqs = ~PIC_IRQS;
1784
1785 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
1786
1787 sync_Arb_IDs();
1788 setup_IO_APIC_irqs();
1789 init_IO_APIC_traps();
1790 check_timer();
1791 if (!acpi_ioapic)
1792 print_IO_APIC();
1793}
1794
/*
 * Per-IO-APIC suspend/resume state: the sysdev handle followed by a
 * trailing array with one saved routing entry per pin. The array length
 * is chosen at allocation time (see ioapic_init_sysfs()).
 */
struct sysfs_ioapic_data {
	struct sys_device dev;
	struct IO_APIC_route_entry entry[0];
};
/* One slot per IO-APIC; a slot stays NULL if setup for that IO-APIC failed. */
static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
1800
1801static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
1802{
1803 struct IO_APIC_route_entry *entry;
1804 struct sysfs_ioapic_data *data;
1805 int i;
1806
1807 data = container_of(dev, struct sysfs_ioapic_data, dev);
1808 entry = data->entry;
1809 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
1810 *entry = ioapic_read_entry(dev->id, i);
1811
1812 return 0;
1813}
1814
1815static int ioapic_resume(struct sys_device *dev)
1816{
1817 struct IO_APIC_route_entry *entry;
1818 struct sysfs_ioapic_data *data;
1819 unsigned long flags;
1820 union IO_APIC_reg_00 reg_00;
1821 int i;
1822
1823 data = container_of(dev, struct sysfs_ioapic_data, dev);
1824 entry = data->entry;
1825
1826 spin_lock_irqsave(&ioapic_lock, flags);
1827 reg_00.raw = io_apic_read(dev->id, 0);
1828 if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
1829 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
1830 io_apic_write(dev->id, 0, reg_00.raw);
1831 }
1832 spin_unlock_irqrestore(&ioapic_lock, flags);
1833 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
1834 ioapic_write_entry(dev->id, i, entry[i]);
1835
1836 return 0;
1837}
1838
/* sysdev class "ioapic": hooks the IO-APIC save/restore into suspend/resume. */
static struct sysdev_class ioapic_sysdev_class = {
	set_kset_name("ioapic"),
	.suspend = ioapic_suspend,
	.resume = ioapic_resume,
};
1844
1845static int __init ioapic_init_sysfs(void)
1846{
1847 struct sys_device * dev;
1848 int i, size, error = 0;
1849
1850 error = sysdev_class_register(&ioapic_sysdev_class);
1851 if (error)
1852 return error;
1853
1854 for (i = 0; i < nr_ioapics; i++ ) {
1855 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
1856 * sizeof(struct IO_APIC_route_entry);
1857 mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
1858 if (!mp_ioapic_data[i]) {
1859 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
1860 continue;
1861 }
1862 memset(mp_ioapic_data[i], 0, size);
1863 dev = &mp_ioapic_data[i]->dev;
1864 dev->id = i;
1865 dev->cls = &ioapic_sysdev_class;
1866 error = sysdev_register(dev);
1867 if (error) {
1868 kfree(mp_ioapic_data[i]);
1869 mp_ioapic_data[i] = NULL;
1870 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
1871 continue;
1872 }
1873 }
1874
1875 return 0;
1876}
1877
1878device_initcall(ioapic_init_sysfs);
1879
1880/*
1881 * Dynamic irq allocation and deallocation
1882 */
1883int create_irq(void)
1884{
1885 /* Allocate an unused irq */
1886 int irq;
1887 int new;
1888 unsigned long flags;
1889
1890 irq = -ENOSPC;
1891 spin_lock_irqsave(&vector_lock, flags);
1892 for (new = (NR_IRQS - 1); new >= 0; new--) {
1893 if (platform_legacy_irq(new))
1894 continue;
1895 if (irq_cfg[new].vector != 0)
1896 continue;
1897 if (__assign_irq_vector(new, TARGET_CPUS) == 0)
1898 irq = new;
1899 break;
1900 }
1901 spin_unlock_irqrestore(&vector_lock, flags);
1902
1903 if (irq >= 0) {
1904 dynamic_irq_init(irq);
1905 }
1906 return irq;
1907}
1908
/*
 * Counterpart of create_irq(): run dynamic_irq_cleanup() on @irq and then
 * release its vector under vector_lock.
 */
void destroy_irq(unsigned int irq)
{
	unsigned long flags;

	dynamic_irq_cleanup(irq);

	/* __clear_irq_vector() is called with vector_lock held. */
	spin_lock_irqsave(&vector_lock, flags);
	__clear_irq_vector(irq);
	spin_unlock_irqrestore(&vector_lock, flags);
}
1919
1920/*
1921 * MSI message composition
1922 */
1923#ifdef CONFIG_PCI_MSI
1924static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
1925{
1926 struct irq_cfg *cfg = irq_cfg + irq;
1927 int err;
1928 unsigned dest;
1929 cpumask_t tmp;
1930
1931 tmp = TARGET_CPUS;
1932 err = assign_irq_vector(irq, tmp);
1933 if (!err) {
1934 cpus_and(tmp, cfg->domain, tmp);
1935 dest = cpu_mask_to_apicid(tmp);
1936
1937 msg->address_hi = MSI_ADDR_BASE_HI;
1938 msg->address_lo =
1939 MSI_ADDR_BASE_LO |
1940 ((INT_DEST_MODE == 0) ?
1941 MSI_ADDR_DEST_MODE_PHYSICAL:
1942 MSI_ADDR_DEST_MODE_LOGICAL) |
1943 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
1944 MSI_ADDR_REDIRECTION_CPU:
1945 MSI_ADDR_REDIRECTION_LOWPRI) |
1946 MSI_ADDR_DEST_ID(dest);
1947
1948 msg->data =
1949 MSI_DATA_TRIGGER_EDGE |
1950 MSI_DATA_LEVEL_ASSERT |
1951 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
1952 MSI_DATA_DELIVERY_FIXED:
1953 MSI_DATA_DELIVERY_LOWPRI) |
1954 MSI_DATA_VECTOR(cfg->vector);
1955 }
1956 return err;
1957}
1958
1959#ifdef CONFIG_SMP
1960static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
1961{
1962 struct irq_cfg *cfg = irq_cfg + irq;
1963 struct msi_msg msg;
1964 unsigned int dest;
1965 cpumask_t tmp;
1966
1967 cpus_and(tmp, mask, cpu_online_map);
1968 if (cpus_empty(tmp))
1969 return;
1970
1971 if (assign_irq_vector(irq, mask))
1972 return;
1973
1974 cpus_and(tmp, cfg->domain, mask);
1975 dest = cpu_mask_to_apicid(tmp);
1976
1977 read_msi_msg(irq, &msg);
1978
1979 msg.data &= ~MSI_DATA_VECTOR_MASK;
1980 msg.data |= MSI_DATA_VECTOR(cfg->vector);
1981 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
1982 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
1983
1984 write_msi_msg(irq, &msg);
1985 irq_desc[irq].affinity = mask;
1986}
1987#endif /* CONFIG_SMP */
1988
1989/*
1990 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
1991 * which implement the MSI or MSI-X Capability Structure.
1992 */
/*
 * irq_chip operations for MSI irqs. Only an edge-style .ack is provided
 * (no .eoi); affinity changes are available on SMP builds only.
 */
static struct irq_chip msi_chip = {
	.name		= "PCI-MSI",
	.unmask		= unmask_msi_irq,
	.mask		= mask_msi_irq,
	.ack		= ack_apic_edge,
#ifdef CONFIG_SMP
	.set_affinity	= set_msi_irq_affinity,
#endif
	.retrigger	= ioapic_retrigger_irq,
};
2003
2004int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
2005{
2006 struct msi_msg msg;
2007 int irq, ret;
2008 irq = create_irq();
2009 if (irq < 0)
2010 return irq;
2011
2012 ret = msi_compose_msg(dev, irq, &msg);
2013 if (ret < 0) {
2014 destroy_irq(irq);
2015 return ret;
2016 }
2017
2018 set_irq_msi(irq, desc);
2019 write_msi_msg(irq, &msg);
2020
2021 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
2022
2023 return 0;
2024}
2025
/* Undo arch_setup_msi_irq(): releases @irq via destroy_irq(). */
void arch_teardown_msi_irq(unsigned int irq)
{
	destroy_irq(irq);
}
2030
2031#endif /* CONFIG_PCI_MSI */
2032
2033/*
2034 * Hypertransport interrupt support
2035 */
2036#ifdef CONFIG_HT_IRQ
2037
2038#ifdef CONFIG_SMP
2039
2040static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
2041{
2042 struct ht_irq_msg msg;
2043 fetch_ht_irq_msg(irq, &msg);
2044
2045 msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
2046 msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
2047
2048 msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
2049 msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
2050
2051 write_ht_irq_msg(irq, &msg);
2052}
2053
2054static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2055{
2056 struct irq_cfg *cfg = irq_cfg + irq;
2057 unsigned int dest;
2058 cpumask_t tmp;
2059
2060 cpus_and(tmp, mask, cpu_online_map);
2061 if (cpus_empty(tmp))
2062 return;
2063
2064 if (assign_irq_vector(irq, mask))
2065 return;
2066
2067 cpus_and(tmp, cfg->domain, mask);
2068 dest = cpu_mask_to_apicid(tmp);
2069
2070 target_ht_irq(irq, dest, cfg->vector);
2071 irq_desc[irq].affinity = mask;
2072}
2073#endif
2074
/*
 * irq_chip operations for HyperTransport irqs. Only an edge-style .ack is
 * provided; affinity changes are available on SMP builds only.
 */
static struct irq_chip ht_irq_chip = {
	.name		= "PCI-HT",
	.mask		= mask_ht_irq,
	.unmask		= unmask_ht_irq,
	.ack		= ack_apic_edge,
#ifdef CONFIG_SMP
	.set_affinity	= set_ht_irq_affinity,
#endif
	.retrigger	= ioapic_retrigger_irq,
};
2085
2086int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2087{
2088 struct irq_cfg *cfg = irq_cfg + irq;
2089 int err;
2090 cpumask_t tmp;
2091
2092 tmp = TARGET_CPUS;
2093 err = assign_irq_vector(irq, tmp);
2094 if (!err) {
2095 struct ht_irq_msg msg;
2096 unsigned dest;
2097
2098 cpus_and(tmp, cfg->domain, tmp);
2099 dest = cpu_mask_to_apicid(tmp);
2100
2101 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
2102
2103 msg.address_lo =
2104 HT_IRQ_LOW_BASE |
2105 HT_IRQ_LOW_DEST_ID(dest) |
2106 HT_IRQ_LOW_VECTOR(cfg->vector) |
2107 ((INT_DEST_MODE == 0) ?
2108 HT_IRQ_LOW_DM_PHYSICAL :
2109 HT_IRQ_LOW_DM_LOGICAL) |
2110 HT_IRQ_LOW_RQEOI_EDGE |
2111 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2112 HT_IRQ_LOW_MT_FIXED :
2113 HT_IRQ_LOW_MT_ARBITRATED) |
2114 HT_IRQ_LOW_IRQ_MASKED;
2115
2116 write_ht_irq_msg(irq, &msg);
2117
2118 set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2119 handle_edge_irq, "edge");
2120 }
2121 return err;
2122}
2123#endif /* CONFIG_HT_IRQ */
2124
2125/* --------------------------------------------------------------------------
2126 ACPI-based IOAPIC Configuration
2127 -------------------------------------------------------------------------- */
2128
2129#ifdef CONFIG_ACPI
2130
2131#define IO_APIC_MAX_ID 0xFE
2132
/*
 * Read register 1 of IO-APIC @ioapic under ioapic_lock and return its
 * "entries" bit-field as-is.
 */
int __init io_apic_get_redir_entries (int ioapic)
{
	union IO_APIC_reg_01 reg_01;
	unsigned long flags;

	spin_lock_irqsave(&ioapic_lock, flags);
	reg_01.raw = io_apic_read(ioapic, 1);
	spin_unlock_irqrestore(&ioapic_lock, flags);

	return reg_01.bits.entries;
}
2144
2145
/*
 * Route @irq to @pin of IO-APIC @ioapic with the given trigger mode and
 * polarity.
 *
 * Returns 0 on success, -EINVAL if @irq is not an IO-APIC irq.
 */
int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
{
	if (!IO_APIC_IRQ(irq)) {
		/*
		 * NOTE(review): the message always says "IRQ 0" and prints the
		 * IO-APIC index, not the rejected irq number - confirm whether
		 * that is intended.
		 */
		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
			ioapic);
		return -EINVAL;
	}

	/*
	 * IRQs < 16 are already in the irq_2_pin[] map
	 */
	if (irq >= 16)
		add_pin_to_irq(irq, ioapic, pin);

	setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);

	return 0;
}
2164
2165#endif /* CONFIG_ACPI */
2166
2167
2168/*
2169 * This function currently is only a helper for the i386 smp boot process where
2170 * we need to reprogram the ioredtbls to cater for the cpus which have come online
2171 * so mask in all cases should simply be TARGET_CPUS
2172 */
2173#ifdef CONFIG_SMP
2174void __init setup_ioapic_dest(void)
2175{
2176 int pin, ioapic, irq, irq_entry;
2177
2178 if (skip_ioapic_setup == 1)
2179 return;
2180
2181 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
2182 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
2183 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
2184 if (irq_entry == -1)
2185 continue;
2186 irq = pin_2_irq(irq_entry, ioapic, pin);
2187
2188 /* setup_IO_APIC_irqs could fail to get vector for some device
2189 * when you have too many devices, because at that time only boot
2190 * cpu is online.
2191 */
2192 if (!irq_cfg[irq].vector)
2193 setup_IO_APIC_irq(ioapic, pin, irq,
2194 irq_trigger(irq_entry),
2195 irq_polarity(irq_entry));
2196 else
2197 set_ioapic_affinity_irq(irq, TARGET_CPUS);
2198 }
2199
2200 }
2201}
2202#endif
diff --git a/arch/x86/kernel/ioport_64.c b/arch/x86/kernel/ioport_64.c
new file mode 100644
index 000000000000..653efa30b0f4
--- /dev/null
+++ b/arch/x86/kernel/ioport_64.c
@@ -0,0 +1,119 @@
1/*
2 * linux/arch/x86_64/kernel/ioport.c
3 *
4 * This contains the io-permission bitmap code - written by obz, with changes
5 * by Linus.
6 */
7
8#include <linux/sched.h>
9#include <linux/kernel.h>
10#include <linux/capability.h>
11#include <linux/errno.h>
12#include <linux/types.h>
13#include <linux/ioport.h>
14#include <linux/smp.h>
15#include <linux/stddef.h>
16#include <linux/slab.h>
17#include <linux/thread_info.h>
18#include <linux/syscalls.h>
19
/*
 * Set EXTENT bits starting at BASE in BITMAP to value NEW_VALUE
 * (non-zero sets the bits, zero clears them).
 *
 * The loop index is unsigned like BASE and EXTENT, so the bound check
 * never mixes signed and unsigned operands.
 *
 * NOTE(review): the set path uses the non-atomic __set_bit() while the
 * clear path uses clear_bit(); confirm whether the asymmetry is intended.
 */
static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
{
	unsigned int i;

	if (new_value)
		for (i = base; i < base + extent; i++)
			__set_bit(i, bitmap);
	else
		for (i = base; i < base + extent; i++)
			clear_bit(i, bitmap);
}
31
32/*
33 * this changes the io permissions bitmap in the current task.
34 */
35asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
36{
37 unsigned int i, max_long, bytes, bytes_updated;
38 struct thread_struct * t = &current->thread;
39 struct tss_struct * tss;
40 unsigned long *bitmap;
41
42 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
43 return -EINVAL;
44 if (turn_on && !capable(CAP_SYS_RAWIO))
45 return -EPERM;
46
47 /*
48 * If it's the first ioperm() call in this thread's lifetime, set the
49 * IO bitmap up. ioperm() is much less timing critical than clone(),
50 * this is why we delay this operation until now:
51 */
52 if (!t->io_bitmap_ptr) {
53 bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
54 if (!bitmap)
55 return -ENOMEM;
56
57 memset(bitmap, 0xff, IO_BITMAP_BYTES);
58 t->io_bitmap_ptr = bitmap;
59 set_thread_flag(TIF_IO_BITMAP);
60 }
61
62 /*
63 * do it in the per-thread copy and in the TSS ...
64 *
65 * Disable preemption via get_cpu() - we must not switch away
66 * because the ->io_bitmap_max value must match the bitmap
67 * contents:
68 */
69 tss = &per_cpu(init_tss, get_cpu());
70
71 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
72
73 /*
74 * Search for a (possibly new) maximum. This is simple and stupid,
75 * to keep it obviously correct:
76 */
77 max_long = 0;
78 for (i = 0; i < IO_BITMAP_LONGS; i++)
79 if (t->io_bitmap_ptr[i] != ~0UL)
80 max_long = i;
81
82 bytes = (max_long + 1) * sizeof(long);
83 bytes_updated = max(bytes, t->io_bitmap_max);
84
85 t->io_bitmap_max = bytes;
86
87 /* Update the TSS: */
88 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
89
90 put_cpu();
91
92 return 0;
93}
94
95/*
96 * sys_iopl has to be used when you want to access the IO ports
97 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
98 * you'd need 8kB of bitmaps/process, which is a bit excessive.
99 *
100 * Here we just change the eflags value on the stack: we allow
101 * only the super-user to do it. This depends on the stack-layout
102 * on system-call entry - see also fork() and the signal handling
103 * code.
104 */
105
106asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
107{
108 unsigned int old = (regs->eflags >> 12) & 3;
109
110 if (level > 3)
111 return -EINVAL;
112 /* Trying to gain more privileges? */
113 if (level > old) {
114 if (!capable(CAP_SYS_RAWIO))
115 return -EPERM;
116 }
117 regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12);
118 return 0;
119}
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
new file mode 100644
index 000000000000..39cb3fa83ebb
--- /dev/null
+++ b/arch/x86/kernel/irq_64.c
@@ -0,0 +1,213 @@
1/*
2 * linux/arch/x86_64/kernel/irq.c
3 *
4 * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
5 *
6 * This file contains the lowest level x86_64-specific interrupt
7 * entry and irq statistics code. All the remaining irq logic is
8 * done by the generic kernel/irq/ code and in the
9 * x86_64-specific irq controller code. (e.g. i8259.c and
10 * io_apic.c.)
11 */
12
13#include <linux/kernel_stat.h>
14#include <linux/interrupt.h>
15#include <linux/seq_file.h>
16#include <linux/module.h>
17#include <linux/delay.h>
18#include <asm/uaccess.h>
19#include <asm/io_apic.h>
20#include <asm/idle.h>
21#include <asm/smp.h>
22
23atomic_t irq_err_count;
24
25#ifdef CONFIG_DEBUG_STACKOVERFLOW
26/*
27 * Probabilistic stack overflow check:
28 *
29 * Only check the stack in process context, because everything else
30 * runs on the big interrupt stacks. Checking reliably is too expensive,
31 * so we just check from interrupts.
32 */
33static inline void stack_overflow_check(struct pt_regs *regs)
34{
35 u64 curbase = (u64)task_stack_page(current);
36 static unsigned long warned = -60*HZ;
37
38 if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
39 regs->rsp < curbase + sizeof(struct thread_info) + 128 &&
40 time_after(jiffies, warned + 60*HZ)) {
41 printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
42 current->comm, curbase, regs->rsp);
43 show_stack(NULL,NULL);
44 warned = jiffies;
45 }
46}
47#endif
48
49/*
50 * Generic, controller-independent functions:
51 */
52
53int show_interrupts(struct seq_file *p, void *v)
54{
55 int i = *(loff_t *) v, j;
56 struct irqaction * action;
57 unsigned long flags;
58
59 if (i == 0) {
60 seq_printf(p, " ");
61 for_each_online_cpu(j)
62 seq_printf(p, "CPU%-8d",j);
63 seq_putc(p, '\n');
64 }
65
66 if (i < NR_IRQS) {
67 spin_lock_irqsave(&irq_desc[i].lock, flags);
68 action = irq_desc[i].action;
69 if (!action)
70 goto skip;
71 seq_printf(p, "%3d: ",i);
72#ifndef CONFIG_SMP
73 seq_printf(p, "%10u ", kstat_irqs(i));
74#else
75 for_each_online_cpu(j)
76 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
77#endif
78 seq_printf(p, " %8s", irq_desc[i].chip->name);
79 seq_printf(p, "-%-8s", irq_desc[i].name);
80
81 seq_printf(p, " %s", action->name);
82 for (action=action->next; action; action = action->next)
83 seq_printf(p, ", %s", action->name);
84 seq_putc(p, '\n');
85skip:
86 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
87 } else if (i == NR_IRQS) {
88 seq_printf(p, "NMI: ");
89 for_each_online_cpu(j)
90 seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
91 seq_putc(p, '\n');
92 seq_printf(p, "LOC: ");
93 for_each_online_cpu(j)
94 seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
95 seq_putc(p, '\n');
96 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
97 }
98 return 0;
99}
100
101/*
102 * do_IRQ handles all normal device IRQ's (the special
103 * SMP cross-CPU interrupts have their own specific
104 * handlers).
105 */
106asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
107{
108 struct pt_regs *old_regs = set_irq_regs(regs);
109
110 /* high bit used in ret_from_ code */
111 unsigned vector = ~regs->orig_rax;
112 unsigned irq;
113
114 exit_idle();
115 irq_enter();
116 irq = __get_cpu_var(vector_irq)[vector];
117
118#ifdef CONFIG_DEBUG_STACKOVERFLOW
119 stack_overflow_check(regs);
120#endif
121
122 if (likely(irq < NR_IRQS))
123 generic_handle_irq(irq);
124 else {
125 if (!disable_apic)
126 ack_APIC_irq();
127
128 if (printk_ratelimit())
129 printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
130 __func__, smp_processor_id(), vector);
131 }
132
133 irq_exit();
134
135 set_irq_regs(old_regs);
136 return 1;
137}
138
139#ifdef CONFIG_HOTPLUG_CPU
140void fixup_irqs(cpumask_t map)
141{
142 unsigned int irq;
143 static int warned;
144
145 for (irq = 0; irq < NR_IRQS; irq++) {
146 cpumask_t mask;
147 int break_affinity = 0;
148 int set_affinity = 1;
149
150 if (irq == 2)
151 continue;
152
153 /* interrupt's are disabled at this point */
154 spin_lock(&irq_desc[irq].lock);
155
156 if (!irq_has_action(irq) ||
157 cpus_equal(irq_desc[irq].affinity, map)) {
158 spin_unlock(&irq_desc[irq].lock);
159 continue;
160 }
161
162 cpus_and(mask, irq_desc[irq].affinity, map);
163 if (cpus_empty(mask)) {
164 break_affinity = 1;
165 mask = map;
166 }
167
168 if (irq_desc[irq].chip->mask)
169 irq_desc[irq].chip->mask(irq);
170
171 if (irq_desc[irq].chip->set_affinity)
172 irq_desc[irq].chip->set_affinity(irq, mask);
173 else if (!(warned++))
174 set_affinity = 0;
175
176 if (irq_desc[irq].chip->unmask)
177 irq_desc[irq].chip->unmask(irq);
178
179 spin_unlock(&irq_desc[irq].lock);
180
181 if (break_affinity && set_affinity)
182 printk("Broke affinity for irq %i\n", irq);
183 else if (!set_affinity)
184 printk("Cannot set affinity for irq %i\n", irq);
185 }
186
187 /* That doesn't seem sufficient. Give it 1ms. */
188 local_irq_enable();
189 mdelay(1);
190 local_irq_disable();
191}
192#endif
193
194extern void call_softirq(void);
195
196asmlinkage void do_softirq(void)
197{
198 __u32 pending;
199 unsigned long flags;
200
201 if (in_interrupt())
202 return;
203
204 local_irq_save(flags);
205 pending = local_softirq_pending();
206 /* Switch to interrupt stack */
207 if (pending) {
208 call_softirq();
209 WARN_ON_ONCE(softirq_count());
210 }
211 local_irq_restore(flags);
212}
213EXPORT_SYMBOL(do_softirq);
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
new file mode 100644
index 000000000000..7377ccb21335
--- /dev/null
+++ b/arch/x86/kernel/k8.c
@@ -0,0 +1,123 @@
1/*
2 * Shared support code for AMD K8 northbridges and derivatives.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */
5#include <linux/gfp.h>
6#include <linux/types.h>
7#include <linux/init.h>
8#include <linux/errno.h>
9#include <linux/module.h>
10#include <linux/spinlock.h>
11#include <asm/k8.h>
12
/* Number of northbridges found by cache_k8_northbridges(). */
int num_k8_northbridges;
EXPORT_SYMBOL(num_k8_northbridges);

/* Per-northbridge value of config register 0x9c, read at cache time. */
static u32 *flush_words;

/* PCI IDs recognised as K8 northbridges; terminated by an empty entry. */
struct pci_device_id k8_nb_ids[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
	{}
};
EXPORT_SYMBOL(k8_nb_ids);

/* NULL-terminated array of the cached northbridge devices. */
struct pci_dev **k8_northbridges;
EXPORT_SYMBOL(k8_northbridges);
27
28static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
29{
30 do {
31 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
32 if (!dev)
33 break;
34 } while (!pci_match_id(&k8_nb_ids[0], dev));
35 return dev;
36}
37
38int cache_k8_northbridges(void)
39{
40 int i;
41 struct pci_dev *dev;
42
43 if (num_k8_northbridges)
44 return 0;
45
46 dev = NULL;
47 while ((dev = next_k8_northbridge(dev)) != NULL)
48 num_k8_northbridges++;
49
50 k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
51 GFP_KERNEL);
52 if (!k8_northbridges)
53 return -ENOMEM;
54
55 if (!num_k8_northbridges) {
56 k8_northbridges[0] = NULL;
57 return 0;
58 }
59
60 flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
61 if (!flush_words) {
62 kfree(k8_northbridges);
63 return -ENOMEM;
64 }
65
66 dev = NULL;
67 i = 0;
68 while ((dev = next_k8_northbridge(dev)) != NULL) {
69 k8_northbridges[i] = dev;
70 pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
71 }
72 k8_northbridges[i] = NULL;
73 return 0;
74}
75EXPORT_SYMBOL_GPL(cache_k8_northbridges);
76
77/* Ignores subdevice/subvendor but as far as I can figure out
78 they're useless anyways */
79int __init early_is_k8_nb(u32 device)
80{
81 struct pci_device_id *id;
82 u32 vendor = device & 0xffff;
83 device >>= 16;
84 for (id = k8_nb_ids; id->vendor; id++)
85 if (vendor == id->vendor && device == id->device)
86 return 1;
87 return 0;
88}
89
90void k8_flush_garts(void)
91{
92 int flushed, i;
93 unsigned long flags;
94 static DEFINE_SPINLOCK(gart_lock);
95
96 /* Avoid races between AGP and IOMMU. In theory it's not needed
97 but I'm not sure if the hardware won't lose flush requests
98 when another is pending. This whole thing is so expensive anyways
99 that it doesn't matter to serialize more. -AK */
100 spin_lock_irqsave(&gart_lock, flags);
101 flushed = 0;
102 for (i = 0; i < num_k8_northbridges; i++) {
103 pci_write_config_dword(k8_northbridges[i], 0x9c,
104 flush_words[i]|1);
105 flushed++;
106 }
107 for (i = 0; i < num_k8_northbridges; i++) {
108 u32 w;
109 /* Make sure the hardware actually executed the flush*/
110 for (;;) {
111 pci_read_config_dword(k8_northbridges[i],
112 0x9c, &w);
113 if (!(w & 1))
114 break;
115 cpu_relax();
116 }
117 }
118 spin_unlock_irqrestore(&gart_lock, flags);
119 if (!flushed)
120 printk("nothing to flush?\n");
121}
122EXPORT_SYMBOL_GPL(k8_flush_garts);
123
diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c
new file mode 100644
index 000000000000..a30e004682e2
--- /dev/null
+++ b/arch/x86/kernel/kprobes_64.c
@@ -0,0 +1,749 @@
1/*
2 * Kernel Probes (KProbes)
3 * arch/x86/kernel/kprobes_64.c
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright (C) IBM Corporation, 2002, 2004
20 *
21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
22 * Probes initial implementation ( includes contributions from
23 * Rusty Russell).
24 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
25 * interface to access function arguments.
26 * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi
27 * <prasanna@in.ibm.com> adapted for x86_64
28 * 2005-Mar Roland McGrath <roland@redhat.com>
29 * Fixed to handle %rip-relative addressing mode correctly.
30 * 2005-May Rusty Lynch <rusty.lynch@intel.com>
31 * Added function return probes functionality
32 */
33
34#include <linux/kprobes.h>
35#include <linux/ptrace.h>
36#include <linux/string.h>
37#include <linux/slab.h>
38#include <linux/preempt.h>
39#include <linux/module.h>
40#include <linux/kdebug.h>
41
42#include <asm/pgtable.h>
43#include <asm/uaccess.h>
44#include <asm/alternative.h>
45
46void jprobe_return_end(void);
47static void __kprobes arch_copy_kprobe(struct kprobe *p);
48
49DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
50DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
51
52/*
53 * returns non-zero if opcode modifies the interrupt flag.
54 */
55static __always_inline int is_IF_modifier(kprobe_opcode_t *insn)
56{
57 switch (*insn) {
58 case 0xfa: /* cli */
59 case 0xfb: /* sti */
60 case 0xcf: /* iret/iretd */
61 case 0x9d: /* popf/popfd */
62 return 1;
63 }
64
65 if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf)
66 return 1;
67 return 0;
68}
69
70int __kprobes arch_prepare_kprobe(struct kprobe *p)
71{
72 /* insn: must be on special executable page on x86_64. */
73 p->ainsn.insn = get_insn_slot();
74 if (!p->ainsn.insn) {
75 return -ENOMEM;
76 }
77 arch_copy_kprobe(p);
78 return 0;
79}
80
81/*
82 * Determine if the instruction uses the %rip-relative addressing mode.
83 * If it does, return the address of the 32-bit displacement word.
84 * If not, return null.
85 */
86static s32 __kprobes *is_riprel(u8 *insn)
87{
88#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
89 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
90 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
91 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
92 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
93 << (row % 64))
94 static const u64 onebyte_has_modrm[256 / 64] = {
95 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
96 /* ------------------------------- */
97 W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
98 W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
99 W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
100 W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
101 W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
102 W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
103 W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
104 W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
105 W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
106 W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
107 W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
108 W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
109 W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
110 W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
111 W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
112 W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */
113 /* ------------------------------- */
114 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
115 };
116 static const u64 twobyte_has_modrm[256 / 64] = {
117 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
118 /* ------------------------------- */
119 W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
120 W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
121 W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
122 W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
123 W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
124 W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
125 W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
126 W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
127 W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
128 W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
129 W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
130 W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
131 W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
132 W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
133 W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
134 W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */
135 /* ------------------------------- */
136 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
137 };
138#undef W
139 int need_modrm;
140
141 /* Skip legacy instruction prefixes. */
142 while (1) {
143 switch (*insn) {
144 case 0x66:
145 case 0x67:
146 case 0x2e:
147 case 0x3e:
148 case 0x26:
149 case 0x64:
150 case 0x65:
151 case 0x36:
152 case 0xf0:
153 case 0xf3:
154 case 0xf2:
155 ++insn;
156 continue;
157 }
158 break;
159 }
160
161 /* Skip REX instruction prefix. */
162 if ((*insn & 0xf0) == 0x40)
163 ++insn;
164
165 if (*insn == 0x0f) { /* Two-byte opcode. */
166 ++insn;
167 need_modrm = test_bit(*insn, twobyte_has_modrm);
168 } else { /* One-byte opcode. */
169 need_modrm = test_bit(*insn, onebyte_has_modrm);
170 }
171
172 if (need_modrm) {
173 u8 modrm = *++insn;
174 if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
175 /* Displacement follows ModRM byte. */
176 return (s32 *) ++insn;
177 }
178 }
179
180 /* No %rip-relative addressing mode here. */
181 return NULL;
182}
183
184static void __kprobes arch_copy_kprobe(struct kprobe *p)
185{
186 s32 *ripdisp;
187 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
188 ripdisp = is_riprel(p->ainsn.insn);
189 if (ripdisp) {
190 /*
191 * The copied instruction uses the %rip-relative
192 * addressing mode. Adjust the displacement for the
193 * difference between the original location of this
194 * instruction and the location of the copy that will
195 * actually be run. The tricky bit here is making sure
196 * that the sign extension happens correctly in this
197 * calculation, since we need a signed 32-bit result to
198 * be sign-extended to 64 bits when it's added to the
199 * %rip value and yield the same 64-bit result that the
200 * sign-extension of the original signed 32-bit
201 * displacement would have given.
202 */
203 s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
204 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
205 *ripdisp = disp;
206 }
207 p->opcode = *p->addr;
208}
209
210void __kprobes arch_arm_kprobe(struct kprobe *p)
211{
212 text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
213}
214
215void __kprobes arch_disarm_kprobe(struct kprobe *p)
216{
217 text_poke(p->addr, &p->opcode, 1);
218}
219
220void __kprobes arch_remove_kprobe(struct kprobe *p)
221{
222 mutex_lock(&kprobe_mutex);
223 free_insn_slot(p->ainsn.insn, 0);
224 mutex_unlock(&kprobe_mutex);
225}
226
227static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
228{
229 kcb->prev_kprobe.kp = kprobe_running();
230 kcb->prev_kprobe.status = kcb->kprobe_status;
231 kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags;
232 kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags;
233}
234
235static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
236{
237 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
238 kcb->kprobe_status = kcb->prev_kprobe.status;
239 kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags;
240 kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags;
241}
242
243static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
244 struct kprobe_ctlblk *kcb)
245{
246 __get_cpu_var(current_kprobe) = p;
247 kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags
248 = (regs->eflags & (TF_MASK | IF_MASK));
249 if (is_IF_modifier(p->ainsn.insn))
250 kcb->kprobe_saved_rflags &= ~IF_MASK;
251}
252
253static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
254{
255 regs->eflags |= TF_MASK;
256 regs->eflags &= ~IF_MASK;
257 /*single step inline if the instruction is an int3*/
258 if (p->opcode == BREAKPOINT_INSTRUCTION)
259 regs->rip = (unsigned long)p->addr;
260 else
261 regs->rip = (unsigned long)p->ainsn.insn;
262}
263
264/* Called with kretprobe_lock held */
265void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
266 struct pt_regs *regs)
267{
268 unsigned long *sara = (unsigned long *)regs->rsp;
269
270 ri->ret_addr = (kprobe_opcode_t *) *sara;
271 /* Replace the return addr with trampoline addr */
272 *sara = (unsigned long) &kretprobe_trampoline;
273}
274
/*
 * Breakpoint-trap handler, invoked from kprobe_exceptions_notify() for
 * DIE_INT3. The address of the breakpoint is taken to be regs->rip minus
 * sizeof(kprobe_opcode_t).
 *
 * Preemption is disabled on entry. Every "return 1" statement below
 * (reentrant single-step, pre_handler returning non-zero, ss_probe)
 * leaves it disabled; only the no_kprobe exit re-enables it before
 * returning ret. The matching enable for the other paths is done in
 * post_kprobe_handler(), kprobe_fault_handler(), longjmp_break_handler()
 * or trampoline_probe_handler().
 *
 * Returns non-zero when the trap was consumed here (the notifier turns
 * that into NOTIFY_STOP), 0 when it is not one of ours.
 */
275int __kprobes kprobe_handler(struct pt_regs *regs)
276{
277 struct kprobe *p;
278 int ret = 0;
279 kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
280 struct kprobe_ctlblk *kcb;
281
282 /*
283 * We don't want to be preempted for the entire
284 * duration of kprobe processing
285 */
286 preempt_disable();
287 kcb = get_kprobe_ctlblk();
288
289 /* Check we're not actually recursing */
290 if (kprobe_running()) {
291 p = get_kprobe(addr);
292 if (p) {
 /*
  * A registered probe was hit while another probe is current.
  * First case: we were single-stepping and the copied
  * instruction is itself a breakpoint; restore the saved flags
  * and hand the trap on (ret is still 0).
  */
293 if (kcb->kprobe_status == KPROBE_HIT_SS &&
294 *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
295 regs->eflags &= ~TF_MASK;
296 regs->eflags |= kcb->kprobe_saved_rflags;
297 goto no_kprobe;
298 } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) {
299 /* TODO: Provide re-entrancy from
300 * post_kprobes_handler() and avoid exception
301 * stack corruption while single-stepping on
302 * the instruction of the new probe.
303 */
304 arch_disarm_kprobe(p);
305 regs->rip = (unsigned long)p->addr;
306 reset_current_kprobe();
307 ret = 1;
 /* falls out of the if/else chain to "goto no_kprobe" below */
308 } else {
309 /* We have reentered the kprobe_handler(), since
310 * another probe was hit while within the
311 * handler. We here save the original kprobe
312 * variables and just single step on instruction
313 * of the new probe without calling any user
314 * handlers.
315 */
316 save_previous_kprobe(kcb);
317 set_current_kprobe(p, regs, kcb);
318 kprobes_inc_nmissed_count(p);
319 prepare_singlestep(p, regs);
320 kcb->kprobe_status = KPROBE_REENTER;
321 return 1;
322 }
323 } else {
324 if (*addr != BREAKPOINT_INSTRUCTION) {
325 /* The breakpoint instruction was removed by
326 * another cpu right after we hit, no further
327 * handling of this interrupt is appropriate
328 */
329 regs->rip = (unsigned long)addr;
330 ret = 1;
331 goto no_kprobe;
332 }
 /*
  * A breakpoint with no registered probe while a probe is
  * current: offer it to the current probe's break_handler.
  */
333 p = __get_cpu_var(current_kprobe);
334 if (p->break_handler && p->break_handler(p, regs)) {
335 goto ss_probe;
336 }
337 }
338 goto no_kprobe;
339 }
340
 /* Not recursing: look up the probe registered at this address. */
341 p = get_kprobe(addr);
342 if (!p) {
343 if (*addr != BREAKPOINT_INSTRUCTION) {
344 /*
345 * The breakpoint instruction was removed right
346 * after we hit it. Another cpu has removed
347 * either a probepoint or a debugger breakpoint
348 * at this address. In either case, no further
349 * handling of this interrupt is appropriate.
350 * Back up over the (now missing) int3 and run
351 * the original instruction.
352 */
353 regs->rip = (unsigned long)addr;
354 ret = 1;
355 }
356 /* Not one of ours: let kernel handle it */
357 goto no_kprobe;
358 }
359
360 set_current_kprobe(p, regs, kcb);
361 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
362
363 if (p->pre_handler && p->pre_handler(p, regs))
364 /* handler has already set things up, so skip ss setup */
365 return 1;
366
 /* Arrange to single-step the probed instruction; preemption stays off. */
367ss_probe:
368 prepare_singlestep(p, regs);
369 kcb->kprobe_status = KPROBE_HIT_SS;
370 return 1;
371
 /* Common exit for every path that does not start a single-step. */
372no_kprobe:
373 preempt_enable_no_resched();
374 return ret;
375}
376
377/*
378 * For function-return probes, init_kprobes() establishes a probepoint
379 * here. When a retprobed function returns, this probe is hit and
380 * trampoline_probe_handler() runs, calling the kretprobe's handler.
 *
 * The C function is only a container: its body emits the global label
 * kretprobe_trampoline followed by a single nop. Other code in this file
 * refers to &kretprobe_trampoline, not to this function.
381 */
382 void kretprobe_trampoline_holder(void)
383 {
 /* Define the global symbol and give it one instruction to probe. */
384 asm volatile ( ".global kretprobe_trampoline\n"
385 "kretprobe_trampoline: \n"
386 "nop\n");
387 }
388
389/*
390 * Called when we hit the probe point at kretprobe_trampoline
391 */
392int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
393{
394 struct kretprobe_instance *ri = NULL;
395 struct hlist_head *head, empty_rp;
396 struct hlist_node *node, *tmp;
397 unsigned long flags, orig_ret_address = 0;
398 unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
399
400 INIT_HLIST_HEAD(&empty_rp);
401 spin_lock_irqsave(&kretprobe_lock, flags);
402 head = kretprobe_inst_table_head(current);
403
404 /*
405 * It is possible to have multiple instances associated with a given
406 * task either because an multiple functions in the call path
407 * have a return probe installed on them, and/or more then one return
408 * return probe was registered for a target function.
409 *
410 * We can handle this because:
411 * - instances are always inserted at the head of the list
412 * - when multiple return probes are registered for the same
413 * function, the first instance's ret_addr will point to the
414 * real return address, and all the rest will point to
415 * kretprobe_trampoline
416 */
417 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
418 if (ri->task != current)
419 /* another task is sharing our hash bucket */
420 continue;
421
422 if (ri->rp && ri->rp->handler)
423 ri->rp->handler(ri, regs);
424
425 orig_ret_address = (unsigned long)ri->ret_addr;
426 recycle_rp_inst(ri, &empty_rp);
427
428 if (orig_ret_address != trampoline_address)
429 /*
430 * This is the real return address. Any other
431 * instances associated with this task are for
432 * other calls deeper on the call stack
433 */
434 break;
435 }
436
437 kretprobe_assert(ri, orig_ret_address, trampoline_address);
438 regs->rip = orig_ret_address;
439
440 reset_current_kprobe();
441 spin_unlock_irqrestore(&kretprobe_lock, flags);
442 preempt_enable_no_resched();
443
444 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
445 hlist_del(&ri->hlist);
446 kfree(ri);
447 }
448 /*
449 * By returning a non-zero value, we are telling
450 * kprobe_handler() that we don't want the post_handler
451 * to run (and have re-enabled preemption)
452 */
453 return 1;
454}
455
456/*
457 * Called after single-stepping. p->addr is the address of the
458 * instruction whose first byte has been replaced by the "int 3"
459 * instruction. To avoid the SMP problems that can occur when we
460 * temporarily put back the original opcode to single-step, we
461 * single-stepped a copy of the instruction. The address of this
462 * copy is p->ainsn.insn.
463 *
464 * This function prepares to return from the post-single-step
465 * interrupt. We have to fix up the stack as follows:
466 *
467 * 0) Except in the case of absolute or indirect jump or call instructions,
468 * the new rip is relative to the copied instruction. We need to make
469 * it relative to the original instruction.
470 *
471 * 1) If the single-stepped instruction was pushfl, then the TF and IF
472 * flags are set in the just-pushed eflags, and may need to be cleared.
473 *
474 * 2) If the single-stepped instruction was a call, the return address
475 * that is atop the stack is the address following the copied instruction.
476 * We need to make it the address following the original instruction.
477 */
478static void __kprobes resume_execution(struct kprobe *p,
479 struct pt_regs *regs, struct kprobe_ctlblk *kcb)
480{
481 unsigned long *tos = (unsigned long *)regs->rsp;
482 unsigned long next_rip = 0;
483 unsigned long copy_rip = (unsigned long)p->ainsn.insn;
484 unsigned long orig_rip = (unsigned long)p->addr;
485 kprobe_opcode_t *insn = p->ainsn.insn;
486
487 /*skip the REX prefix*/
488 if (*insn >= 0x40 && *insn <= 0x4f)
489 insn++;
490
491 switch (*insn) {
492 case 0x9c: /* pushfl */
493 *tos &= ~(TF_MASK | IF_MASK);
494 *tos |= kcb->kprobe_old_rflags;
495 break;
496 case 0xc3: /* ret/lret */
497 case 0xcb:
498 case 0xc2:
499 case 0xca:
500 regs->eflags &= ~TF_MASK;
501 /* rip is already adjusted, no more changes required*/
502 return;
503 case 0xe8: /* call relative - Fix return addr */
504 *tos = orig_rip + (*tos - copy_rip);
505 break;
506 case 0xff:
507 if ((insn[1] & 0x30) == 0x10) {
508 /* call absolute, indirect */
509 /* Fix return addr; rip is correct. */
510 next_rip = regs->rip;
511 *tos = orig_rip + (*tos - copy_rip);
512 } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
513 ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
514 /* rip is correct. */
515 next_rip = regs->rip;
516 }
517 break;
518 case 0xea: /* jmp absolute -- rip is correct */
519 next_rip = regs->rip;
520 break;
521 default:
522 break;
523 }
524
525 regs->eflags &= ~TF_MASK;
526 if (next_rip) {
527 regs->rip = next_rip;
528 } else {
529 regs->rip = orig_rip + (regs->rip - copy_rip);
530 }
531}
532
533int __kprobes post_kprobe_handler(struct pt_regs *regs)
534{
535 struct kprobe *cur = kprobe_running();
536 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
537
538 if (!cur)
539 return 0;
540
541 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
542 kcb->kprobe_status = KPROBE_HIT_SSDONE;
543 cur->post_handler(cur, regs, 0);
544 }
545
546 resume_execution(cur, regs, kcb);
547 regs->eflags |= kcb->kprobe_saved_rflags;
548
549 /* Restore the original saved kprobes variables and continue. */
550 if (kcb->kprobe_status == KPROBE_REENTER) {
551 restore_previous_kprobe(kcb);
552 goto out;
553 }
554 reset_current_kprobe();
555out:
556 preempt_enable_no_resched();
557
558 /*
559 * if somebody else is singlestepping across a probe point, eflags
560 * will have TF set, in which case, continue the remaining processing
561 * of do_debug, as if this is not a probe hit.
562 */
563 if (regs->eflags & TF_MASK)
564 return 0;
565
566 return 1;
567}
568
569int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
570{
571 struct kprobe *cur = kprobe_running();
572 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
573 const struct exception_table_entry *fixup;
574
575 switch(kcb->kprobe_status) {
576 case KPROBE_HIT_SS:
577 case KPROBE_REENTER:
578 /*
579 * We are here because the instruction being single
580 * stepped caused a page fault. We reset the current
581 * kprobe and the rip points back to the probe address
582 * and allow the page fault handler to continue as a
583 * normal page fault.
584 */
585 regs->rip = (unsigned long)cur->addr;
586 regs->eflags |= kcb->kprobe_old_rflags;
587 if (kcb->kprobe_status == KPROBE_REENTER)
588 restore_previous_kprobe(kcb);
589 else
590 reset_current_kprobe();
591 preempt_enable_no_resched();
592 break;
593 case KPROBE_HIT_ACTIVE:
594 case KPROBE_HIT_SSDONE:
595 /*
596 * We increment the nmissed count for accounting,
597 * we can also use npre/npostfault count for accouting
598 * these specific fault cases.
599 */
600 kprobes_inc_nmissed_count(cur);
601
602 /*
603 * We come here because instructions in the pre/post
604 * handler caused the page_fault, this could happen
605 * if handler tries to access user space by
606 * copy_from_user(), get_user() etc. Let the
607 * user-specified handler try to fix it first.
608 */
609 if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
610 return 1;
611
612 /*
613 * In case the user-specified fault handler returned
614 * zero, try to fix up.
615 */
616 fixup = search_exception_tables(regs->rip);
617 if (fixup) {
618 regs->rip = fixup->fixup;
619 return 1;
620 }
621
622 /*
623 * fixup() could not handle it,
624 * Let do_page_fault() fix it.
625 */
626 break;
627 default:
628 break;
629 }
630 return 0;
631}
632
633/*
634 * Wrapper routine for handling exceptions.
635 */
636int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
637 unsigned long val, void *data)
638{
639 struct die_args *args = (struct die_args *)data;
640 int ret = NOTIFY_DONE;
641
642 if (args->regs && user_mode(args->regs))
643 return ret;
644
645 switch (val) {
646 case DIE_INT3:
647 if (kprobe_handler(args->regs))
648 ret = NOTIFY_STOP;
649 break;
650 case DIE_DEBUG:
651 if (post_kprobe_handler(args->regs))
652 ret = NOTIFY_STOP;
653 break;
654 case DIE_GPF:
655 case DIE_PAGE_FAULT:
656 /* kprobe_running() needs smp_processor_id() */
657 preempt_disable();
658 if (kprobe_running() &&
659 kprobe_fault_handler(args->regs, args->trapnr))
660 ret = NOTIFY_STOP;
661 preempt_enable();
662 break;
663 default:
664 break;
665 }
666 return ret;
667}
668
669int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
670{
671 struct jprobe *jp = container_of(p, struct jprobe, kp);
672 unsigned long addr;
673 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
674
675 kcb->jprobe_saved_regs = *regs;
676 kcb->jprobe_saved_rsp = (long *) regs->rsp;
677 addr = (unsigned long)(kcb->jprobe_saved_rsp);
678 /*
679 * As Linus pointed out, gcc assumes that the callee
680 * owns the argument space and could overwrite it, e.g.
681 * tailcall optimization. So, to be absolutely safe
682 * we also save and restore enough stack bytes to cover
683 * the argument area.
684 */
685 memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
686 MIN_STACK_SIZE(addr));
687 regs->eflags &= ~IF_MASK;
688 regs->rip = (unsigned long)(jp->entry);
689 return 1;
690}
691
/*
 * Raise the breakpoint that ends a jprobe: loads the stack pointer saved
 * by setjmp_pre_handler() (kcb->jprobe_saved_rsp) into %rbx, exchanges it
 * with %rsp and executes int3. The global label jprobe_return_end,
 * defined right after the int3, is what longjmp_break_handler() uses
 * together with the address of this function to recognize the trap.
 */
692void __kprobes jprobe_return(void)
693{
694 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
695
 /* "b" places the saved rsp in %rbx; "memory" is the clobber list. */
696 asm volatile (" xchg %%rbx,%%rsp \n"
697 " int3 \n"
698 " .globl jprobe_return_end \n"
699 " jprobe_return_end: \n"
700 " nop \n"::"b"
701 (kcb->jprobe_saved_rsp):"memory");
702}
703
704int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
705{
706 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
707 u8 *addr = (u8 *) (regs->rip - 1);
708 unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp);
709 struct jprobe *jp = container_of(p, struct jprobe, kp);
710
711 if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
712 if ((long *)regs->rsp != kcb->jprobe_saved_rsp) {
713 struct pt_regs *saved_regs =
714 container_of(kcb->jprobe_saved_rsp,
715 struct pt_regs, rsp);
716 printk("current rsp %p does not match saved rsp %p\n",
717 (long *)regs->rsp, kcb->jprobe_saved_rsp);
718 printk("Saved registers for jprobe %p\n", jp);
719 show_registers(saved_regs);
720 printk("Current registers\n");
721 show_registers(regs);
722 BUG();
723 }
724 *regs = kcb->jprobe_saved_regs;
725 memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
726 MIN_STACK_SIZE(stack_addr));
727 preempt_enable_no_resched();
728 return 1;
729 }
730 return 0;
731}
732
733static struct kprobe trampoline_p = {
734 .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
735 .pre_handler = trampoline_probe_handler
736};
737
738int __init arch_init_kprobes(void)
739{
740 return register_kprobe(&trampoline_p);
741}
742
743int __kprobes arch_trampoline_kprobe(struct kprobe *p)
744{
745 if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
746 return 1;
747
748 return 0;
749}
diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c
new file mode 100644
index 000000000000..bc9ffd5c19cc
--- /dev/null
+++ b/arch/x86/kernel/ldt_64.c
@@ -0,0 +1,252 @@
1/*
2 * linux/arch/x86/kernel/ldt_64.c
3 *
4 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
5 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
6 * Copyright (C) 2002 Andi Kleen
7 *
8 * This handles calls from both 32bit and 64bit mode.
9 */
10
11#include <linux/errno.h>
12#include <linux/sched.h>
13#include <linux/string.h>
14#include <linux/mm.h>
15#include <linux/smp.h>
16#include <linux/vmalloc.h>
17#include <linux/slab.h>
18
19#include <asm/uaccess.h>
20#include <asm/system.h>
21#include <asm/ldt.h>
22#include <asm/desc.h>
23#include <asm/proto.h>
24
#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
/*
 * Callback handed to smp_call_function() by alloc_ldt(): reload the LDT
 * from the current task's active_mm, if it has one. The argument is
 * unused.
 */
static void flush_ldt(void *null)
{
	struct mm_struct *active = current->active_mm;

	if (active)
		load_LDT(&active->context);
}
#endif
32
33static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
34{
35 void *oldldt;
36 void *newldt;
37 unsigned oldsize;
38
39 if (mincount <= (unsigned)pc->size)
40 return 0;
41 oldsize = pc->size;
42 mincount = (mincount+511)&(~511);
43 if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
44 newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
45 else
46 newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
47
48 if (!newldt)
49 return -ENOMEM;
50
51 if (oldsize)
52 memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
53 oldldt = pc->ldt;
54 memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
55 wmb();
56 pc->ldt = newldt;
57 wmb();
58 pc->size = mincount;
59 wmb();
60 if (reload) {
61#ifdef CONFIG_SMP
62 cpumask_t mask;
63
64 preempt_disable();
65 mask = cpumask_of_cpu(smp_processor_id());
66 load_LDT(pc);
67 if (!cpus_equal(current->mm->cpu_vm_mask, mask))
68 smp_call_function(flush_ldt, NULL, 1, 1);
69 preempt_enable();
70#else
71 load_LDT(pc);
72#endif
73 }
74 if (oldsize) {
75 if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
76 vfree(oldldt);
77 else
78 kfree(oldldt);
79 }
80 return 0;
81}
82
83static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
84{
85 int err = alloc_ldt(new, old->size, 0);
86 if (err < 0)
87 return err;
88 memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
89 return 0;
90}
91
92/*
93 * we do not have to muck with descriptors here, that is
94 * done in switch_mm() as needed.
95 */
96int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
97{
98 struct mm_struct * old_mm;
99 int retval = 0;
100
101 init_MUTEX(&mm->context.sem);
102 mm->context.size = 0;
103 old_mm = current->mm;
104 if (old_mm && old_mm->context.size > 0) {
105 down(&old_mm->context.sem);
106 retval = copy_ldt(&mm->context, &old_mm->context);
107 up(&old_mm->context.sem);
108 }
109 return retval;
110}
111
112/*
113 *
114 * Don't touch the LDT register - we're already in the next thread.
115 */
116void destroy_context(struct mm_struct *mm)
117{
118 if (mm->context.size) {
119 if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
120 vfree(mm->context.ldt);
121 else
122 kfree(mm->context.ldt);
123 mm->context.size = 0;
124 }
125}
126
127static int read_ldt(void __user * ptr, unsigned long bytecount)
128{
129 int err;
130 unsigned long size;
131 struct mm_struct * mm = current->mm;
132
133 if (!mm->context.size)
134 return 0;
135 if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
136 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
137
138 down(&mm->context.sem);
139 size = mm->context.size*LDT_ENTRY_SIZE;
140 if (size > bytecount)
141 size = bytecount;
142
143 err = 0;
144 if (copy_to_user(ptr, mm->context.ldt, size))
145 err = -EFAULT;
146 up(&mm->context.sem);
147 if (err < 0)
148 goto error_return;
149 if (size != bytecount) {
150 /* zero-fill the rest */
151 if (clear_user(ptr+size, bytecount-size) != 0) {
152 err = -EFAULT;
153 goto error_return;
154 }
155 }
156 return bytecount;
157error_return:
158 return err;
159}
160
161static int read_default_ldt(void __user * ptr, unsigned long bytecount)
162{
163 /* Arbitrary number */
164 /* x86-64 default LDT is all zeros */
165 if (bytecount > 128)
166 bytecount = 128;
167 if (clear_user(ptr, bytecount))
168 return -EFAULT;
169 return bytecount;
170}
171
172static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
173{
174 struct task_struct *me = current;
175 struct mm_struct * mm = me->mm;
176 __u32 entry_1, entry_2, *lp;
177 int error;
178 struct user_desc ldt_info;
179
180 error = -EINVAL;
181
182 if (bytecount != sizeof(ldt_info))
183 goto out;
184 error = -EFAULT;
185 if (copy_from_user(&ldt_info, ptr, bytecount))
186 goto out;
187
188 error = -EINVAL;
189 if (ldt_info.entry_number >= LDT_ENTRIES)
190 goto out;
191 if (ldt_info.contents == 3) {
192 if (oldmode)
193 goto out;
194 if (ldt_info.seg_not_present == 0)
195 goto out;
196 }
197
198 down(&mm->context.sem);
199 if (ldt_info.entry_number >= (unsigned)mm->context.size) {
200 error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
201 if (error < 0)
202 goto out_unlock;
203 }
204
205 lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
206
207 /* Allow LDTs to be cleared by the user. */
208 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
209 if (oldmode || LDT_empty(&ldt_info)) {
210 entry_1 = 0;
211 entry_2 = 0;
212 goto install;
213 }
214 }
215
216 entry_1 = LDT_entry_a(&ldt_info);
217 entry_2 = LDT_entry_b(&ldt_info);
218 if (oldmode)
219 entry_2 &= ~(1 << 20);
220
221 /* Install the new entry ... */
222install:
223 *lp = entry_1;
224 *(lp+1) = entry_2;
225 error = 0;
226
227out_unlock:
228 up(&mm->context.sem);
229out:
230 return error;
231}
232
233asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
234{
235 int ret = -ENOSYS;
236
237 switch (func) {
238 case 0:
239 ret = read_ldt(ptr, bytecount);
240 break;
241 case 1:
242 ret = write_ldt(ptr, bytecount, 1);
243 break;
244 case 2:
245 ret = read_default_ldt(ptr, bytecount);
246 break;
247 case 0x11:
248 ret = write_ldt(ptr, bytecount, 0);
249 break;
250 }
251 return ret;
252}
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
new file mode 100644
index 000000000000..c3a554703672
--- /dev/null
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -0,0 +1,259 @@
1/*
2 * machine_kexec.c - handle transition of Linux booting another kernel
3 * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/mm.h>
10#include <linux/kexec.h>
11#include <linux/string.h>
12#include <linux/reboot.h>
13#include <asm/pgtable.h>
14#include <asm/tlbflush.h>
15#include <asm/mmu_context.h>
16#include <asm/io.h>
17
18#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
19static u64 kexec_pgd[512] PAGE_ALIGNED;
20static u64 kexec_pud0[512] PAGE_ALIGNED;
21static u64 kexec_pmd0[512] PAGE_ALIGNED;
22static u64 kexec_pte0[512] PAGE_ALIGNED;
23static u64 kexec_pud1[512] PAGE_ALIGNED;
24static u64 kexec_pmd1[512] PAGE_ALIGNED;
25static u64 kexec_pte1[512] PAGE_ALIGNED;
26
27static void init_level2_page(pmd_t *level2p, unsigned long addr)
28{
29 unsigned long end_addr;
30
31 addr &= PAGE_MASK;
32 end_addr = addr + PUD_SIZE;
33 while (addr < end_addr) {
34 set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
35 addr += PMD_SIZE;
36 }
37}
38
39static int init_level3_page(struct kimage *image, pud_t *level3p,
40 unsigned long addr, unsigned long last_addr)
41{
42 unsigned long end_addr;
43 int result;
44
45 result = 0;
46 addr &= PAGE_MASK;
47 end_addr = addr + PGDIR_SIZE;
48 while ((addr < last_addr) && (addr < end_addr)) {
49 struct page *page;
50 pmd_t *level2p;
51
52 page = kimage_alloc_control_pages(image, 0);
53 if (!page) {
54 result = -ENOMEM;
55 goto out;
56 }
57 level2p = (pmd_t *)page_address(page);
58 init_level2_page(level2p, addr);
59 set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
60 addr += PUD_SIZE;
61 }
62 /* clear the unused entries */
63 while (addr < end_addr) {
64 pud_clear(level3p++);
65 addr += PUD_SIZE;
66 }
67out:
68 return result;
69}
70
71
72static int init_level4_page(struct kimage *image, pgd_t *level4p,
73 unsigned long addr, unsigned long last_addr)
74{
75 unsigned long end_addr;
76 int result;
77
78 result = 0;
79 addr &= PAGE_MASK;
80 end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
81 while ((addr < last_addr) && (addr < end_addr)) {
82 struct page *page;
83 pud_t *level3p;
84
85 page = kimage_alloc_control_pages(image, 0);
86 if (!page) {
87 result = -ENOMEM;
88 goto out;
89 }
90 level3p = (pud_t *)page_address(page);
91 result = init_level3_page(image, level3p, addr, last_addr);
92 if (result) {
93 goto out;
94 }
95 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
96 addr += PGDIR_SIZE;
97 }
98 /* clear the unused entries */
99 while (addr < end_addr) {
100 pgd_clear(level4p++);
101 addr += PGDIR_SIZE;
102 }
103out:
104 return result;
105}
106
107
108static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
109{
110 pgd_t *level4p;
111 level4p = (pgd_t *)__va(start_pgtable);
112 return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
113}
114
115static void set_idt(void *newidt, u16 limit)
116{
117 struct desc_ptr curidt;
118
119 /* x86-64 supports unaliged loads & stores */
120 curidt.size = limit;
121 curidt.address = (unsigned long)newidt;
122
123 __asm__ __volatile__ (
124 "lidtq %0\n"
125 : : "m" (curidt)
126 );
127};
128
129
130static void set_gdt(void *newgdt, u16 limit)
131{
132 struct desc_ptr curgdt;
133
134 /* x86-64 supports unaligned loads & stores */
135 curgdt.size = limit;
136 curgdt.address = (unsigned long)newgdt;
137
138 __asm__ __volatile__ (
139 "lgdtq %0\n"
140 : : "m" (curgdt)
141 );
142};
143
144static void load_segments(void)
145{
146 __asm__ __volatile__ (
147 "\tmovl %0,%%ds\n"
148 "\tmovl %0,%%es\n"
149 "\tmovl %0,%%ss\n"
150 "\tmovl %0,%%fs\n"
151 "\tmovl %0,%%gs\n"
152 : : "a" (__KERNEL_DS) : "memory"
153 );
154}
155
156int machine_kexec_prepare(struct kimage *image)
157{
158 unsigned long start_pgtable;
159 int result;
160
161 /* Calculate the offsets */
162 start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
163
164 /* Setup the identity mapped 64bit page table */
165 result = init_pgtable(image, start_pgtable);
166 if (result)
167 return result;
168
169 return 0;
170}
171
/* Counterpart of machine_kexec_prepare(); nothing needs to be undone. */
void machine_kexec_cleanup(struct kimage *image)
{
}
176
177/*
178 * Do not allocate memory (or fail in any way) in machine_kexec().
179 * We are past the point of no return, committed to rebooting now.
180 */
181NORET_TYPE void machine_kexec(struct kimage *image)
182{
183 unsigned long page_list[PAGES_NR];
184 void *control_page;
185
186 /* Interrupts aren't acceptable while we reboot */
187 local_irq_disable();
188
189 control_page = page_address(image->control_code_page) + PAGE_SIZE;
190 memcpy(control_page, relocate_kernel, PAGE_SIZE);
191
192 page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
193 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
194 page_list[PA_PGD] = virt_to_phys(&kexec_pgd);
195 page_list[VA_PGD] = (unsigned long)kexec_pgd;
196 page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0);
197 page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
198 page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0);
199 page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
200 page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0);
201 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
202 page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1);
203 page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
204 page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1);
205 page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
206 page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1);
207 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
208
209 page_list[PA_TABLE_PAGE] =
210 (unsigned long)__pa(page_address(image->control_code_page));
211
212 /* The segment registers are funny things, they have both a
213 * visible and an invisible part. Whenever the visible part is
214 * set to a specific selector, the invisible part is loaded
215 * with from a table in memory. At no other time is the
216 * descriptor table in memory accessed.
217 *
218 * I take advantage of this here by force loading the
219 * segments, before I zap the gdt with an invalid value.
220 */
221 load_segments();
222 /* The gdt & idt are now invalid.
223 * If you want to load them you must set up your own idt & gdt.
224 */
225 set_gdt(phys_to_virt(0),0);
226 set_idt(phys_to_virt(0),0);
227
228 /* now call it */
229 relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
230 image->start);
231}
232
233/* crashkernel=size@addr specifies the location to reserve for
234 * a crash kernel. By reserving this memory we guarantee
235 * that linux never set's it up as a DMA target.
236 * Useful for holding code to do something appropriate
237 * after a kernel panic.
238 */
239static int __init setup_crashkernel(char *arg)
240{
241 unsigned long size, base;
242 char *p;
243 if (!arg)
244 return -EINVAL;
245 size = memparse(arg, &p);
246 if (arg == p)
247 return -EINVAL;
248 if (*p == '@') {
249 base = memparse(p+1, &p);
250 /* FIXME: Do I want a sanity check to validate the
251 * memory range? Yes you do, but it's too early for
252 * e820 -AK */
253 crashk_res.start = base;
254 crashk_res.end = base + size - 1;
255 }
256 return 0;
257}
258early_param("crashkernel", setup_crashkernel);
259
diff --git a/arch/x86/kernel/mce_64.c b/arch/x86/kernel/mce_64.c
new file mode 100644
index 000000000000..a66d607f5b92
--- /dev/null
+++ b/arch/x86/kernel/mce_64.c
@@ -0,0 +1,875 @@
1/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 */
7
8#include <linux/init.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/string.h>
13#include <linux/rcupdate.h>
14#include <linux/kallsyms.h>
15#include <linux/sysdev.h>
16#include <linux/miscdevice.h>
17#include <linux/fs.h>
18#include <linux/capability.h>
19#include <linux/cpu.h>
20#include <linux/percpu.h>
21#include <linux/poll.h>
22#include <linux/thread_info.h>
23#include <linux/ctype.h>
24#include <linux/kmod.h>
25#include <linux/kdebug.h>
26#include <asm/processor.h>
27#include <asm/msr.h>
28#include <asm/mce.h>
29#include <asm/uaccess.h>
30#include <asm/smp.h>
31#include <asm/idle.h>
32
33#define MISC_MCELOG_MINOR 227
34#define NR_BANKS 6
35
36atomic_t mce_entry;
37
38static int mce_dont_init;
39
40/*
41 * Tolerant levels:
42 * 0: always panic on uncorrected errors, log corrected errors
43 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
44 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
45 * 3: never panic or SIGBUS, log all errors (for testing only)
46 */
47static int tolerant = 1;
48static int banks;
49static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
50static unsigned long notify_user;
51static int rip_msr;
52static int mce_bootlog = 1;
53static atomic_t mce_events;
54
55static char trigger[128];
56static char *trigger_argv[2] = { trigger, NULL };
57
58static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
59
60/*
61 * Lockless MCE logging infrastructure.
62 * This avoids deadlocks on printk locks without having to break locks. Also
63 * separate MCEs from kernel messages to avoid bogus bug reports.
64 */
65
66struct mce_log mcelog = {
67 MCE_LOG_SIGNATURE,
68 MCE_LOG_LEN,
69};
70
71void mce_log(struct mce *mce)
72{
73 unsigned next, entry;
74 atomic_inc(&mce_events);
75 mce->finished = 0;
76 wmb();
77 for (;;) {
78 entry = rcu_dereference(mcelog.next);
79 /* The rmb forces the compiler to reload next in each
80 iteration */
81 rmb();
82 for (;;) {
83 /* When the buffer fills up discard new entries. Assume
84 that the earlier errors are the more interesting. */
85 if (entry >= MCE_LOG_LEN) {
86 set_bit(MCE_OVERFLOW, &mcelog.flags);
87 return;
88 }
89 /* Old left over entry. Skip. */
90 if (mcelog.entry[entry].finished) {
91 entry++;
92 continue;
93 }
94 break;
95 }
96 smp_rmb();
97 next = entry + 1;
98 if (cmpxchg(&mcelog.next, entry, next) == entry)
99 break;
100 }
101 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
102 wmb();
103 mcelog.entry[entry].finished = 1;
104 wmb();
105
106 set_bit(0, &notify_user);
107}
108
109static void print_mce(struct mce *m)
110{
111 printk(KERN_EMERG "\n"
112 KERN_EMERG "HARDWARE ERROR\n"
113 KERN_EMERG
114 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
115 m->cpu, m->mcgstatus, m->bank, m->status);
116 if (m->rip) {
117 printk(KERN_EMERG
118 "RIP%s %02x:<%016Lx> ",
119 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
120 m->cs, m->rip);
121 if (m->cs == __KERNEL_CS)
122 print_symbol("{%s}", m->rip);
123 printk("\n");
124 }
125 printk(KERN_EMERG "TSC %Lx ", m->tsc);
126 if (m->addr)
127 printk("ADDR %Lx ", m->addr);
128 if (m->misc)
129 printk("MISC %Lx ", m->misc);
130 printk("\n");
131 printk(KERN_EMERG "This is not a software problem!\n");
132 printk(KERN_EMERG
133 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
134}
135
136static void mce_panic(char *msg, struct mce *backup, unsigned long start)
137{
138 int i;
139
140 oops_begin();
141 for (i = 0; i < MCE_LOG_LEN; i++) {
142 unsigned long tsc = mcelog.entry[i].tsc;
143 if (time_before(tsc, start))
144 continue;
145 print_mce(&mcelog.entry[i]);
146 if (backup && mcelog.entry[i].tsc == backup->tsc)
147 backup = NULL;
148 }
149 if (backup)
150 print_mce(backup);
151 panic(msg);
152}
153
154static int mce_available(struct cpuinfo_x86 *c)
155{
156 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
157}
158
159static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
160{
161 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
162 m->rip = regs->rip;
163 m->cs = regs->cs;
164 } else {
165 m->rip = 0;
166 m->cs = 0;
167 }
168 if (rip_msr) {
169 /* Assume the RIP in the MSR is exact. Is this true? */
170 m->mcgstatus |= MCG_STATUS_EIPV;
171 rdmsrl(rip_msr, m->rip);
172 m->cs = 0;
173 }
174}
175
176/*
177 * The actual machine check handler
178 */
179
180void do_machine_check(struct pt_regs * regs, long error_code)
181{
182 struct mce m, panicm;
183 u64 mcestart = 0;
184 int i;
185 int panicm_found = 0;
186 /*
187 * If no_way_out gets set, there is no safe way to recover from this
188 * MCE. If tolerant is cranked up, we'll try anyway.
189 */
190 int no_way_out = 0;
191 /*
192 * If kill_it gets set, there might be a way to recover from this
193 * error.
194 */
195 int kill_it = 0;
196
197 atomic_inc(&mce_entry);
198
199 if (regs)
200 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
201 if (!banks)
202 goto out2;
203
204 memset(&m, 0, sizeof(struct mce));
205 m.cpu = smp_processor_id();
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 /* if the restart IP is not valid, we're done for */
208 if (!(m.mcgstatus & MCG_STATUS_RIPV))
209 no_way_out = 1;
210
211 rdtscll(mcestart);
212 barrier();
213
214 for (i = 0; i < banks; i++) {
215 if (!bank[i])
216 continue;
217
218 m.misc = 0;
219 m.addr = 0;
220 m.bank = i;
221 m.tsc = 0;
222
223 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
224 if ((m.status & MCI_STATUS_VAL) == 0)
225 continue;
226
227 if (m.status & MCI_STATUS_EN) {
228 /* if PCC was set, there's no way out */
229 no_way_out |= !!(m.status & MCI_STATUS_PCC);
230 /*
231 * If this error was uncorrectable and there was
232 * an overflow, we're in trouble. If no overflow,
233 * we might get away with just killing a task.
234 */
235 if (m.status & MCI_STATUS_UC) {
236 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
237 no_way_out = 1;
238 kill_it = 1;
239 }
240 }
241
242 if (m.status & MCI_STATUS_MISCV)
243 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
244 if (m.status & MCI_STATUS_ADDRV)
245 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
246
247 mce_get_rip(&m, regs);
248 if (error_code >= 0)
249 rdtscll(m.tsc);
250 if (error_code != -2)
251 mce_log(&m);
252
253 /* Did this bank cause the exception? */
254 /* Assume that the bank with uncorrectable errors did it,
255 and that there is only a single one. */
256 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
257 panicm = m;
258 panicm_found = 1;
259 }
260
261 add_taint(TAINT_MACHINE_CHECK);
262 }
263
264 /* Never do anything final in the polling timer */
265 if (!regs)
266 goto out;
267
268 /* If we didn't find an uncorrectable error, pick
269 the last one (shouldn't happen, just being safe). */
270 if (!panicm_found)
271 panicm = m;
272
273 /*
274 * If we have decided that we just CAN'T continue, and the user
275 * has not set tolerant to an insane level, give up and die.
276 */
277 if (no_way_out && tolerant < 3)
278 mce_panic("Machine check", &panicm, mcestart);
279
280 /*
281 * If the error seems to be unrecoverable, something should be
282 * done. Try to kill as little as possible. If we can kill just
283 * one task, do that. If the user has set the tolerance very
284 * high, don't try to do anything at all.
285 */
286 if (kill_it && tolerant < 3) {
287 int user_space = 0;
288
289 /*
290 * If the EIPV bit is set, it means the saved IP is the
291 * instruction which caused the MCE.
292 */
293 if (m.mcgstatus & MCG_STATUS_EIPV)
294 user_space = panicm.rip && (panicm.cs & 3);
295
296 /*
297 * If we know that the error was in user space, send a
298 * SIGBUS. Otherwise, panic if tolerance is low.
299 *
300 * do_exit() takes an awful lot of locks and has a slight
301 * risk of deadlocking.
302 */
303 if (user_space) {
304 do_exit(SIGBUS);
305 } else if (panic_on_oops || tolerant < 2) {
306 mce_panic("Uncorrected machine check",
307 &panicm, mcestart);
308 }
309 }
310
311 /* notify userspace ASAP */
312 set_thread_flag(TIF_MCE_NOTIFY);
313
314 out:
315 /* the last thing we do is clear state */
316 for (i = 0; i < banks; i++)
317 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
318 wrmsrl(MSR_IA32_MCG_STATUS, 0);
319 out2:
320 atomic_dec(&mce_entry);
321}
322
323#ifdef CONFIG_X86_MCE_INTEL
324/***
325 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
326 * @cpu: The CPU on which the event occured.
327 * @status: Event status information
328 *
329 * This function should be called by the thermal interrupt after the
330 * event has been processed and the decision was made to log the event
331 * further.
332 *
333 * The status parameter will be saved to the 'status' field of 'struct mce'
334 * and historically has been the register value of the
335 * MSR_IA32_THERMAL_STATUS (Intel) msr.
336 */
337void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
338{
339 struct mce m;
340
341 memset(&m, 0, sizeof(m));
342 m.cpu = cpu;
343 m.bank = MCE_THERMAL_BANK;
344 m.status = status;
345 rdtscll(m.tsc);
346 mce_log(&m);
347}
348#endif /* CONFIG_X86_MCE_INTEL */
349
350/*
351 * Periodic polling timer for "silent" machine check errors. If the
352 * poller finds an MCE, poll 2x faster. When the poller finds no more
353 * errors, poll 2x slower (up to check_interval seconds).
354 */
355
356static int check_interval = 5 * 60; /* 5 minutes */
357static int next_interval; /* in jiffies */
358static void mcheck_timer(struct work_struct *work);
359static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
360
361static void mcheck_check_cpu(void *info)
362{
363 if (mce_available(&current_cpu_data))
364 do_machine_check(NULL, 0);
365}
366
367static void mcheck_timer(struct work_struct *work)
368{
369 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
370
371 /*
372 * Alert userspace if needed. If we logged an MCE, reduce the
373 * polling interval, otherwise increase the polling interval.
374 */
375 if (mce_notify_user()) {
376 next_interval = max(next_interval/2, HZ/100);
377 } else {
378 next_interval = min(next_interval*2,
379 (int)round_jiffies_relative(check_interval*HZ));
380 }
381
382 schedule_delayed_work(&mcheck_work, next_interval);
383}
384
385/*
386 * This is only called from process context. This is where we do
387 * anything we need to alert userspace about new MCEs. This is called
388 * directly from the poller and also from entry.S and idle, thanks to
389 * TIF_MCE_NOTIFY.
390 */
391int mce_notify_user(void)
392{
393 clear_thread_flag(TIF_MCE_NOTIFY);
394 if (test_and_clear_bit(0, &notify_user)) {
395 static unsigned long last_print;
396 unsigned long now = jiffies;
397
398 wake_up_interruptible(&mce_wait);
399 if (trigger[0])
400 call_usermodehelper(trigger, trigger_argv, NULL,
401 UMH_NO_WAIT);
402
403 if (time_after_eq(now, last_print + (check_interval*HZ))) {
404 last_print = now;
405 printk(KERN_INFO "Machine check events logged\n");
406 }
407
408 return 1;
409 }
410 return 0;
411}
412
413/* see if the idle task needs to notify userspace */
414static int
415mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
416{
417 /* IDLE_END should be safe - interrupts are back on */
418 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
419 mce_notify_user();
420
421 return NOTIFY_OK;
422}
423
424static struct notifier_block mce_idle_notifier = {
425 .notifier_call = mce_idle_callback,
426};
427
428static __init int periodic_mcheck_init(void)
429{
430 next_interval = check_interval * HZ;
431 if (next_interval)
432 schedule_delayed_work(&mcheck_work,
433 round_jiffies_relative(next_interval));
434 idle_notifier_register(&mce_idle_notifier);
435 return 0;
436}
437__initcall(periodic_mcheck_init);
438
439
440/*
441 * Initialize Machine Checks for a CPU.
442 */
443static void mce_init(void *dummy)
444{
445 u64 cap;
446 int i;
447
448 rdmsrl(MSR_IA32_MCG_CAP, cap);
449 banks = cap & 0xff;
450 if (banks > NR_BANKS) {
451 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
452 banks = NR_BANKS;
453 }
454 /* Use accurate RIP reporting if available. */
455 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
456 rip_msr = MSR_IA32_MCG_EIP;
457
458 /* Log the machine checks left over from the previous reset.
459 This also clears all registers */
460 do_machine_check(NULL, mce_bootlog ? -1 : -2);
461
462 set_in_cr4(X86_CR4_MCE);
463
464 if (cap & MCG_CTL_P)
465 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
466
467 for (i = 0; i < banks; i++) {
468 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
469 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
470 }
471}
472
473/* Add per CPU specific workarounds here */
474static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
475{
476 /* This should be disabled by the BIOS, but isn't always */
477 if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
478 /* disable GART TBL walk error reporting, which trips off
479 incorrectly with the IOMMU & 3ware & Cerberus. */
480 clear_bit(10, &bank[4]);
481 /* Lots of broken BIOS around that don't clear them
482 by default and leave crap in there. Don't log. */
483 mce_bootlog = 0;
484 }
485
486}
487
488static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
489{
490 switch (c->x86_vendor) {
491 case X86_VENDOR_INTEL:
492 mce_intel_feature_init(c);
493 break;
494 case X86_VENDOR_AMD:
495 mce_amd_feature_init(c);
496 break;
497 default:
498 break;
499 }
500}
501
502/*
503 * Called for each booted CPU to set up machine checks.
504 * Must be called with preempt off.
505 */
506void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
507{
508 static cpumask_t mce_cpus = CPU_MASK_NONE;
509
510 mce_cpu_quirks(c);
511
512 if (mce_dont_init ||
513 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
514 !mce_available(c))
515 return;
516
517 mce_init(NULL);
518 mce_cpu_features(c);
519}
520
521/*
522 * Character device to read and clear the MCE log.
523 */
524
525static DEFINE_SPINLOCK(mce_state_lock);
526static int open_count; /* #times opened */
527static int open_exclu; /* already open exclusive? */
528
529static int mce_open(struct inode *inode, struct file *file)
530{
531 spin_lock(&mce_state_lock);
532
533 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
534 spin_unlock(&mce_state_lock);
535 return -EBUSY;
536 }
537
538 if (file->f_flags & O_EXCL)
539 open_exclu = 1;
540 open_count++;
541
542 spin_unlock(&mce_state_lock);
543
544 return nonseekable_open(inode, file);
545}
546
547static int mce_release(struct inode *inode, struct file *file)
548{
549 spin_lock(&mce_state_lock);
550
551 open_count--;
552 open_exclu = 0;
553
554 spin_unlock(&mce_state_lock);
555
556 return 0;
557}
558
/* Store this CPU's TSC in the slot of @data indexed by the CPU number. */
static void collect_tscs(void *data)
{
	unsigned long *tsc_by_cpu = data;

	rdtscll(tsc_by_cpu[smp_processor_id()]);
}
564
565static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
566{
567 unsigned long *cpu_tsc;
568 static DECLARE_MUTEX(mce_read_sem);
569 unsigned next;
570 char __user *buf = ubuf;
571 int i, err;
572
573 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
574 if (!cpu_tsc)
575 return -ENOMEM;
576
577 down(&mce_read_sem);
578 next = rcu_dereference(mcelog.next);
579
580 /* Only supports full reads right now */
581 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
582 up(&mce_read_sem);
583 kfree(cpu_tsc);
584 return -EINVAL;
585 }
586
587 err = 0;
588 for (i = 0; i < next; i++) {
589 unsigned long start = jiffies;
590 while (!mcelog.entry[i].finished) {
591 if (time_after_eq(jiffies, start + 2)) {
592 memset(mcelog.entry + i,0, sizeof(struct mce));
593 goto timeout;
594 }
595 cpu_relax();
596 }
597 smp_rmb();
598 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
599 buf += sizeof(struct mce);
600 timeout:
601 ;
602 }
603
604 memset(mcelog.entry, 0, next * sizeof(struct mce));
605 mcelog.next = 0;
606
607 synchronize_sched();
608
609 /* Collect entries that were still getting written before the synchronize. */
610
611 on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
612 for (i = next; i < MCE_LOG_LEN; i++) {
613 if (mcelog.entry[i].finished &&
614 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
615 err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
616 smp_rmb();
617 buf += sizeof(struct mce);
618 memset(&mcelog.entry[i], 0, sizeof(struct mce));
619 }
620 }
621 up(&mce_read_sem);
622 kfree(cpu_tsc);
623 return err ? -EFAULT : buf - ubuf;
624}
625
626static unsigned int mce_poll(struct file *file, poll_table *wait)
627{
628 poll_wait(file, &mce_wait, wait);
629 if (rcu_dereference(mcelog.next))
630 return POLLIN | POLLRDNORM;
631 return 0;
632}
633
634static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
635{
636 int __user *p = (int __user *)arg;
637 if (!capable(CAP_SYS_ADMIN))
638 return -EPERM;
639 switch (cmd) {
640 case MCE_GET_RECORD_LEN:
641 return put_user(sizeof(struct mce), p);
642 case MCE_GET_LOG_LEN:
643 return put_user(MCE_LOG_LEN, p);
644 case MCE_GETCLEAR_FLAGS: {
645 unsigned flags;
646 do {
647 flags = mcelog.flags;
648 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
649 return put_user(flags, p);
650 }
651 default:
652 return -ENOTTY;
653 }
654}
655
656static const struct file_operations mce_chrdev_ops = {
657 .open = mce_open,
658 .release = mce_release,
659 .read = mce_read,
660 .poll = mce_poll,
661 .ioctl = mce_ioctl,
662};
663
664static struct miscdevice mce_log_device = {
665 MISC_MCELOG_MINOR,
666 "mcelog",
667 &mce_chrdev_ops,
668};
669
670static unsigned long old_cr4 __initdata;
671
672void __init stop_mce(void)
673{
674 old_cr4 = read_cr4();
675 clear_in_cr4(X86_CR4_MCE);
676}
677
678void __init restart_mce(void)
679{
680 if (old_cr4 & X86_CR4_MCE)
681 set_in_cr4(X86_CR4_MCE);
682}
683
684/*
685 * Old style boot options parsing. Only for compatibility.
686 */
687
688static int __init mcheck_disable(char *str)
689{
690 mce_dont_init = 1;
691 return 1;
692}
693
694/* mce=off disables machine check. Note you can reenable it later
695 using sysfs.
696 mce=TOLERANCELEVEL (number, see above)
697 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
698 mce=nobootlog Don't log MCEs from before booting. */
699static int __init mcheck_enable(char *str)
700{
701 if (*str == '=')
702 str++;
703 if (!strcmp(str, "off"))
704 mce_dont_init = 1;
705 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
706 mce_bootlog = str[0] == 'b';
707 else if (isdigit(str[0]))
708 get_option(&str, &tolerant);
709 else
710 printk("mce= argument %s ignored. Please use /sys", str);
711 return 1;
712}
713
714__setup("nomce", mcheck_disable);
715__setup("mce", mcheck_enable);
716
717/*
718 * Sysfs support
719 */
720
721/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
722 Only one CPU is active at this time, the others get readded later using
723 CPU hotplug. */
724static int mce_resume(struct sys_device *dev)
725{
726 mce_init(NULL);
727 return 0;
728}
729
730/* Reinit MCEs after user configuration changes */
731static void mce_restart(void)
732{
733 if (next_interval)
734 cancel_delayed_work(&mcheck_work);
735 /* Timer race is harmless here */
736 on_each_cpu(mce_init, NULL, 1, 1);
737 next_interval = check_interval * HZ;
738 if (next_interval)
739 schedule_delayed_work(&mcheck_work,
740 round_jiffies_relative(next_interval));
741}
742
743static struct sysdev_class mce_sysclass = {
744 .resume = mce_resume,
745 set_kset_name("machinecheck"),
746};
747
748DEFINE_PER_CPU(struct sys_device, device_mce);
749
750/* Why are there no generic functions for this? */
751#define ACCESSOR(name, var, start) \
752 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
753 return sprintf(buf, "%lx\n", (unsigned long)var); \
754 } \
755 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
756 char *end; \
757 unsigned long new = simple_strtoul(buf, &end, 0); \
758 if (end == buf) return -EINVAL; \
759 var = new; \
760 start; \
761 return end-buf; \
762 } \
763 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
764
765/* TBD should generate these dynamically based on number of available banks */
766ACCESSOR(bank0ctl,bank[0],mce_restart())
767ACCESSOR(bank1ctl,bank[1],mce_restart())
768ACCESSOR(bank2ctl,bank[2],mce_restart())
769ACCESSOR(bank3ctl,bank[3],mce_restart())
770ACCESSOR(bank4ctl,bank[4],mce_restart())
771ACCESSOR(bank5ctl,bank[5],mce_restart())
772
773static ssize_t show_trigger(struct sys_device *s, char *buf)
774{
775 strcpy(buf, trigger);
776 strcat(buf, "\n");
777 return strlen(trigger) + 1;
778}
779
780static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
781{
782 char *p;
783 int len;
784 strncpy(trigger, buf, sizeof(trigger));
785 trigger[sizeof(trigger)-1] = 0;
786 len = strlen(trigger);
787 p = strchr(trigger, '\n');
788 if (*p) *p = 0;
789 return len;
790}
791
792static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
793ACCESSOR(tolerant,tolerant,)
794ACCESSOR(check_interval,check_interval,mce_restart())
795static struct sysdev_attribute *mce_attributes[] = {
796 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
797 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
798 &attr_tolerant, &attr_check_interval, &attr_trigger,
799 NULL
800};
801
802/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
803static __cpuinit int mce_create_device(unsigned int cpu)
804{
805 int err;
806 int i;
807 if (!mce_available(&cpu_data[cpu]))
808 return -EIO;
809
810 per_cpu(device_mce,cpu).id = cpu;
811 per_cpu(device_mce,cpu).cls = &mce_sysclass;
812
813 err = sysdev_register(&per_cpu(device_mce,cpu));
814
815 if (!err) {
816 for (i = 0; mce_attributes[i]; i++)
817 sysdev_create_file(&per_cpu(device_mce,cpu),
818 mce_attributes[i]);
819 }
820 return err;
821}
822
823static void mce_remove_device(unsigned int cpu)
824{
825 int i;
826
827 for (i = 0; mce_attributes[i]; i++)
828 sysdev_remove_file(&per_cpu(device_mce,cpu),
829 mce_attributes[i]);
830 sysdev_unregister(&per_cpu(device_mce,cpu));
831 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
832}
833
834/* Get notified when a cpu comes on/off. Be hotplug friendly. */
835static int
836mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
837{
838 unsigned int cpu = (unsigned long)hcpu;
839
840 switch (action) {
841 case CPU_ONLINE:
842 case CPU_ONLINE_FROZEN:
843 mce_create_device(cpu);
844 break;
845 case CPU_DEAD:
846 case CPU_DEAD_FROZEN:
847 mce_remove_device(cpu);
848 break;
849 }
850 return NOTIFY_OK;
851}
852
853static struct notifier_block mce_cpu_notifier = {
854 .notifier_call = mce_cpu_callback,
855};
856
857static __init int mce_init_device(void)
858{
859 int err;
860 int i = 0;
861
862 if (!mce_available(&boot_cpu_data))
863 return -EIO;
864 err = sysdev_class_register(&mce_sysclass);
865
866 for_each_online_cpu(i) {
867 mce_create_device(i);
868 }
869
870 register_hotcpu_notifier(&mce_cpu_notifier);
871 misc_register(&mce_log_device);
872 return err;
873}
874
875device_initcall(mce_init_device);
diff --git a/arch/x86/kernel/mce_amd_64.c b/arch/x86/kernel/mce_amd_64.c
new file mode 100644
index 000000000000..2f8a7f18b0fe
--- /dev/null
+++ b/arch/x86/kernel/mce_amd_64.c
@@ -0,0 +1,689 @@
1/*
2 * (c) 2005, 2006 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 *
7 * Written by Jacob Shin - AMD, Inc.
8 *
9 * Support : jacob.shin@amd.com
10 *
11 * April 2006
12 * - added support for AMD Family 0x10 processors
13 *
14 * All MC4_MISCi registers are shared between multi-cores
15 */
16
17#include <linux/cpu.h>
18#include <linux/errno.h>
19#include <linux/init.h>
20#include <linux/interrupt.h>
21#include <linux/kobject.h>
22#include <linux/notifier.h>
23#include <linux/sched.h>
24#include <linux/smp.h>
25#include <linux/sysdev.h>
26#include <linux/sysfs.h>
27#include <asm/apic.h>
28#include <asm/mce.h>
29#include <asm/msr.h>
30#include <asm/percpu.h>
31#include <asm/idle.h>
32
33#define PFX "mce_threshold: "
34#define VERSION "version 1.1.1"
35#define NR_BANKS 6
36#define NR_BLOCKS 9
37#define THRESHOLD_MAX 0xFFF
38#define INT_TYPE_APIC 0x00020000
39#define MASK_VALID_HI 0x80000000
40#define MASK_CNTP_HI 0x40000000
41#define MASK_LOCKED_HI 0x20000000
42#define MASK_LVTOFF_HI 0x00F00000
43#define MASK_COUNT_EN_HI 0x00080000
44#define MASK_INT_TYPE_HI 0x00060000
45#define MASK_OVERFLOW_HI 0x00010000
46#define MASK_ERR_COUNT_HI 0x00000FFF
47#define MASK_BLKPTR_LO 0xFF000000
48#define MCG_XBLK_ADDR 0xC0000400
49
/* Software state of one threshold counter block inside an MCE bank. */
50struct threshold_block {
51 unsigned int block; /* block index in the bank; also names the "misc%i" kobject */
52 unsigned int bank; /* index of the bank this block belongs to */
53 unsigned int cpu; /* cpu the sysfs handlers pin themselves to for MSR access */
54 u32 address; /* MSR address read and written for this block */
55 u16 interrupt_enable; /* nonzero: INT_TYPE_APIC is programmed on restart */
56 u16 threshold_limit; /* sysfs store clamps this to 1..THRESHOLD_MAX */
57 struct kobject kobj; /* sysfs object carrying the attribute files */
58 struct list_head miscj; /* links the blocks of one bank */
59};
60
61/* defaults used early on boot */
/*
 * mce_amd_feature_init() fills in .address for each block it finds and then
 * passes this object to threshold_restart_bank().
 */
62static struct threshold_block threshold_defaults = {
63 .interrupt_enable = 0,
64 .threshold_limit = THRESHOLD_MAX,
65};
66
/* One threshold bank as it is represented in sysfs. */
67struct threshold_bank {
68 struct kobject kobj; /* "threshold_bank%i" directory object */
69 struct threshold_block *blocks; /* first allocated block; later ones hang off its miscj list */
70 cpumask_t cpus; /* set from cpu_core_map[cpu] (SMP) or CPU_MASK_ALL */
71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
73
74#ifdef CONFIG_SMP
/*
 * Nonzero marks a bank that is shared between cores; only index 4 is set.
 * For such banks, cores with a nonzero cpu_core_id do not program the
 * registers and get a symlink to the first core's sysfs directory.
 */
75static unsigned char shared_bank[NR_BANKS] = {
76 0, 0, 0, 0, 1
77};
78#endif
79
80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
81
82/*
83 * CPU Initialization
84 */
85
86/* must be called with correct cpu affinity */
87static void threshold_restart_bank(struct threshold_block *b,
88 int reset, u16 old_limit)
89{
90 u32 mci_misc_hi, mci_misc_lo;
91
92 rdmsr(b->address, mci_misc_lo, mci_misc_hi);
93
94 if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
95 reset = 1; /* limit cannot be lower than err count */
96
97 if (reset) { /* reset err count and overflow bit */
98 mci_misc_hi =
99 (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
100 (THRESHOLD_MAX - b->threshold_limit);
101 } else if (old_limit) { /* change limit w/o reset */
102 int new_count = (mci_misc_hi & THRESHOLD_MAX) +
103 (old_limit - b->threshold_limit);
104 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
105 (new_count & THRESHOLD_MAX);
106 }
107
108 b->interrupt_enable ?
109 (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
110 (mci_misc_hi &= ~MASK_INT_TYPE_HI);
111
112 mci_misc_hi |= MASK_COUNT_EN_HI;
113 wrmsr(b->address, mci_misc_lo, mci_misc_hi);
114}
115
116/* cpu init entry point, called from mce.c with preempt off */
117void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
118{
119 unsigned int bank, block;
120 unsigned int cpu = smp_processor_id();
121 u32 low = 0, high = 0, address = 0;
122
123 for (bank = 0; bank < NR_BANKS; ++bank) {
124 for (block = 0; block < NR_BLOCKS; ++block) {
125 if (block == 0)
126 address = MSR_IA32_MC0_MISC + bank * 4;
127 else if (block == 1) {
128 address = (low & MASK_BLKPTR_LO) >> 21;
129 if (!address)
130 break;
131 address += MCG_XBLK_ADDR;
132 }
133 else
134 ++address;
135
136 if (rdmsr_safe(address, &low, &high))
137 break;
138
139 if (!(high & MASK_VALID_HI)) {
140 if (block)
141 continue;
142 else
143 break;
144 }
145
146 if (!(high & MASK_CNTP_HI) ||
147 (high & MASK_LOCKED_HI))
148 continue;
149
150 if (!block)
151 per_cpu(bank_map, cpu) |= (1 << bank);
152#ifdef CONFIG_SMP
153 if (shared_bank[bank] && c->cpu_core_id)
154 break;
155#endif
156 high &= ~MASK_LVTOFF_HI;
157 high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20;
158 wrmsr(address, low, high);
159
160 setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD,
161 THRESHOLD_APIC_VECTOR,
162 K8_APIC_EXT_INT_MSG_FIX, 0);
163
164 threshold_defaults.address = address;
165 threshold_restart_bank(&threshold_defaults, 0, 0);
166 }
167 }
168}
169
170/*
171 * APIC Interrupt Handler
172 */
173
174/*
175 * threshold interrupt handler will service THRESHOLD_APIC_VECTOR.
176 * the interrupt goes off when error_count reaches threshold_limit.
177 * the handler will simply log mcelog w/ software defined bank number.
178 */
179asmlinkage void mce_threshold_interrupt(void)
180{
181 unsigned int bank, block;
182 struct mce m;
183 u32 low = 0, high = 0, address = 0;
184
185 ack_APIC_irq();
186 exit_idle();
187 irq_enter();
188
189 memset(&m, 0, sizeof(m));
190 rdtscll(m.tsc);
191 m.cpu = smp_processor_id();
192
193 /* assume first bank caused it */
194 for (bank = 0; bank < NR_BANKS; ++bank) {
195 if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
196 continue;
197 for (block = 0; block < NR_BLOCKS; ++block) {
198 if (block == 0)
199 address = MSR_IA32_MC0_MISC + bank * 4;
200 else if (block == 1) {
201 address = (low & MASK_BLKPTR_LO) >> 21;
202 if (!address)
203 break;
204 address += MCG_XBLK_ADDR;
205 }
206 else
207 ++address;
208
209 if (rdmsr_safe(address, &low, &high))
210 break;
211
212 if (!(high & MASK_VALID_HI)) {
213 if (block)
214 continue;
215 else
216 break;
217 }
218
219 if (!(high & MASK_CNTP_HI) ||
220 (high & MASK_LOCKED_HI))
221 continue;
222
223 /* Log the machine check that caused the threshold
224 event. */
225 do_machine_check(NULL, 0);
226
227 if (high & MASK_OVERFLOW_HI) {
228 rdmsrl(address, m.misc);
229 rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
230 m.status);
231 m.bank = K8_MCE_THRESHOLD_BASE
232 + bank * NR_BLOCKS
233 + block;
234 mce_log(&m);
235 goto out;
236 }
237 }
238 }
239out:
240 irq_exit();
241}
242
243/*
244 * Sysfs Interface
245 */
246
/*
 * sysfs attribute of a threshold block together with its typed handlers.
 * The generic show()/store() callbacks return -EIO when the corresponding
 * handler is NULL.
 */
247struct threshold_attr {
248 struct attribute attr;
249 ssize_t(*show) (struct threshold_block *, char *);
250 ssize_t(*store) (struct threshold_block *, const char *, size_t count);
251};
252
253static cpumask_t affinity_set(unsigned int cpu)
254{
255 cpumask_t oldmask = current->cpus_allowed;
256 cpumask_t newmask = CPU_MASK_NONE;
257 cpu_set(cpu, newmask);
258 set_cpus_allowed(current, newmask);
259 return oldmask;
260}
261
262static void affinity_restore(cpumask_t oldmask)
263{
264 set_cpus_allowed(current, oldmask);
265}
266
/*
 * Generates show_<name>(): formats the threshold_block field <name> into buf
 * as hexadecimal followed by a newline and returns sprintf()'s result.
 */
267#define SHOW_FIELDS(name) \
268static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
269{ \
270 return sprintf(buf, "%lx\n", (unsigned long) b->name); \
271}
272SHOW_FIELDS(interrupt_enable)
273SHOW_FIELDS(threshold_limit)
274
275static ssize_t store_interrupt_enable(struct threshold_block *b,
276 const char *buf, size_t count)
277{
278 char *end;
279 cpumask_t oldmask;
280 unsigned long new = simple_strtoul(buf, &end, 0);
281 if (end == buf)
282 return -EINVAL;
283 b->interrupt_enable = !!new;
284
285 oldmask = affinity_set(b->cpu);
286 threshold_restart_bank(b, 0, 0);
287 affinity_restore(oldmask);
288
289 return end - buf;
290}
291
292static ssize_t store_threshold_limit(struct threshold_block *b,
293 const char *buf, size_t count)
294{
295 char *end;
296 cpumask_t oldmask;
297 u16 old;
298 unsigned long new = simple_strtoul(buf, &end, 0);
299 if (end == buf)
300 return -EINVAL;
301 if (new > THRESHOLD_MAX)
302 new = THRESHOLD_MAX;
303 if (new < 1)
304 new = 1;
305 old = b->threshold_limit;
306 b->threshold_limit = new;
307
308 oldmask = affinity_set(b->cpu);
309 threshold_restart_bank(b, 0, old);
310 affinity_restore(oldmask);
311
312 return end - buf;
313}
314
315static ssize_t show_error_count(struct threshold_block *b, char *buf)
316{
317 u32 high, low;
318 cpumask_t oldmask;
319 oldmask = affinity_set(b->cpu);
320 rdmsr(b->address, low, high);
321 affinity_restore(oldmask);
322 return sprintf(buf, "%x\n",
323 (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
324}
325
326static ssize_t store_error_count(struct threshold_block *b,
327 const char *buf, size_t count)
328{
329 cpumask_t oldmask;
330 oldmask = affinity_set(b->cpu);
331 threshold_restart_bank(b, 1, 0);
332 affinity_restore(oldmask);
333 return 1;
334}
335
336#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \
337 .attr = {.name = __stringify(_name), .mode = _mode }, \
338 .show = _show, \
339 .store = _store, \
340};
341
342#define RW_ATTR(name) \
343static struct threshold_attr name = \
344 THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
345
346RW_ATTR(interrupt_enable);
347RW_ATTR(threshold_limit);
348RW_ATTR(error_count);
349
/* NULL-terminated attribute list installed through threshold_ktype. */
350static struct attribute *default_attrs[] = {
351 &interrupt_enable.attr,
352 &threshold_limit.attr,
353 &error_count.attr,
354 NULL
355};
356
357#define to_block(k) container_of(k, struct threshold_block, kobj)
358#define to_attr(a) container_of(a, struct threshold_attr, attr)
359
360static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
361{
362 struct threshold_block *b = to_block(kobj);
363 struct threshold_attr *a = to_attr(attr);
364 ssize_t ret;
365 ret = a->show ? a->show(b, buf) : -EIO;
366 return ret;
367}
368
369static ssize_t store(struct kobject *kobj, struct attribute *attr,
370 const char *buf, size_t count)
371{
372 struct threshold_block *b = to_block(kobj);
373 struct threshold_attr *a = to_attr(attr);
374 ssize_t ret;
375 ret = a->store ? a->store(b, buf, count) : -EIO;
376 return ret;
377}
378
/* Routes sysfs reads and writes to the generic show()/store() dispatchers. */
379static struct sysfs_ops threshold_ops = {
380 .show = show,
381 .store = store,
382};
383
/* kobject type assigned to every threshold block kobject. */
384static struct kobj_type threshold_ktype = {
385 .sysfs_ops = &threshold_ops,
386 .default_attrs = default_attrs,
387};
388
389static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
390 unsigned int bank,
391 unsigned int block,
392 u32 address)
393{
394 int err;
395 u32 low, high;
396 struct threshold_block *b = NULL;
397
398 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
399 return 0;
400
401 if (rdmsr_safe(address, &low, &high))
402 return 0;
403
404 if (!(high & MASK_VALID_HI)) {
405 if (block)
406 goto recurse;
407 else
408 return 0;
409 }
410
411 if (!(high & MASK_CNTP_HI) ||
412 (high & MASK_LOCKED_HI))
413 goto recurse;
414
415 b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
416 if (!b)
417 return -ENOMEM;
418
419 b->block = block;
420 b->bank = bank;
421 b->cpu = cpu;
422 b->address = address;
423 b->interrupt_enable = 0;
424 b->threshold_limit = THRESHOLD_MAX;
425
426 INIT_LIST_HEAD(&b->miscj);
427
428 if (per_cpu(threshold_banks, cpu)[bank]->blocks)
429 list_add(&b->miscj,
430 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
431 else
432 per_cpu(threshold_banks, cpu)[bank]->blocks = b;
433
434 kobject_set_name(&b->kobj, "misc%i", block);
435 b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj;
436 b->kobj.ktype = &threshold_ktype;
437 err = kobject_register(&b->kobj);
438 if (err)
439 goto out_free;
440recurse:
441 if (!block) {
442 address = (low & MASK_BLKPTR_LO) >> 21;
443 if (!address)
444 return 0;
445 address += MCG_XBLK_ADDR;
446 } else
447 ++address;
448
449 err = allocate_threshold_blocks(cpu, bank, ++block, address);
450 if (err)
451 goto out_free;
452
453 return err;
454
455out_free:
456 if (b) {
457 kobject_unregister(&b->kobj);
458 kfree(b);
459 }
460 return err;
461}
462
463/* symlinks sibling shared banks to first core. first core owns dir/files. */
464static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
465{
466 int i, err = 0;
467 struct threshold_bank *b = NULL;
468 cpumask_t oldmask = CPU_MASK_NONE;
469 char name[32];
470
471 sprintf(name, "threshold_bank%i", bank);
472
473#ifdef CONFIG_SMP
474 if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) { /* symlink */
475 i = first_cpu(cpu_core_map[cpu]);
476
477 /* first core not up yet */
478 if (cpu_data[i].cpu_core_id)
479 goto out;
480
481 /* already linked */
482 if (per_cpu(threshold_banks, cpu)[bank])
483 goto out;
484
485 b = per_cpu(threshold_banks, i)[bank];
486
487 if (!b)
488 goto out;
489
490 err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj,
491 &b->kobj, name);
492 if (err)
493 goto out;
494
495 b->cpus = cpu_core_map[cpu];
496 per_cpu(threshold_banks, cpu)[bank] = b;
497 goto out;
498 }
499#endif
500
501 b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
502 if (!b) {
503 err = -ENOMEM;
504 goto out;
505 }
506
507 kobject_set_name(&b->kobj, "threshold_bank%i", bank);
508 b->kobj.parent = &per_cpu(device_mce, cpu).kobj;
509#ifndef CONFIG_SMP
510 b->cpus = CPU_MASK_ALL;
511#else
512 b->cpus = cpu_core_map[cpu];
513#endif
514 err = kobject_register(&b->kobj);
515 if (err)
516 goto out_free;
517
518 per_cpu(threshold_banks, cpu)[bank] = b;
519
520 oldmask = affinity_set(cpu);
521 err = allocate_threshold_blocks(cpu, bank, 0,
522 MSR_IA32_MC0_MISC + bank * 4);
523 affinity_restore(oldmask);
524
525 if (err)
526 goto out_free;
527
528 for_each_cpu_mask(i, b->cpus) {
529 if (i == cpu)
530 continue;
531
532 err = sysfs_create_link(&per_cpu(device_mce, i).kobj,
533 &b->kobj, name);
534 if (err)
535 goto out;
536
537 per_cpu(threshold_banks, i)[bank] = b;
538 }
539
540 goto out;
541
542out_free:
543 per_cpu(threshold_banks, cpu)[bank] = NULL;
544 kfree(b);
545out:
546 return err;
547}
548
549/* create dir/files for all valid threshold banks */
550static __cpuinit int threshold_create_device(unsigned int cpu)
551{
552 unsigned int bank;
553 int err = 0;
554
555 for (bank = 0; bank < NR_BANKS; ++bank) {
556 if (!(per_cpu(bank_map, cpu) & 1 << bank))
557 continue;
558 err = threshold_create_bank(cpu, bank);
559 if (err)
560 goto out;
561 }
562out:
563 return err;
564}
565
566/*
567 * let's be hotplug friendly.
568 * in case of multiple core processors, the first core always takes ownership
569 * of shared sysfs dir/files, and rest of the cores will be symlinked to it.
570 */
571
572static void deallocate_threshold_block(unsigned int cpu,
573 unsigned int bank)
574{
575 struct threshold_block *pos = NULL;
576 struct threshold_block *tmp = NULL;
577 struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
578
579 if (!head)
580 return;
581
582 list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
583 kobject_unregister(&pos->kobj);
584 list_del(&pos->miscj);
585 kfree(pos);
586 }
587
588 kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
589 per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
590}
591
592static void threshold_remove_bank(unsigned int cpu, int bank)
593{
594 int i = 0;
595 struct threshold_bank *b;
596 char name[32];
597
598 b = per_cpu(threshold_banks, cpu)[bank];
599
600 if (!b)
601 return;
602
603 if (!b->blocks)
604 goto free_out;
605
606 sprintf(name, "threshold_bank%i", bank);
607
608#ifdef CONFIG_SMP
609 /* sibling symlink */
610 if (shared_bank[bank] && b->blocks->cpu != cpu) {
611 sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
612 per_cpu(threshold_banks, cpu)[bank] = NULL;
613 return;
614 }
615#endif
616
617 /* remove all sibling symlinks before unregistering */
618 for_each_cpu_mask(i, b->cpus) {
619 if (i == cpu)
620 continue;
621
622 sysfs_remove_link(&per_cpu(device_mce, i).kobj, name);
623 per_cpu(threshold_banks, i)[bank] = NULL;
624 }
625
626 deallocate_threshold_block(cpu, bank);
627
628free_out:
629 kobject_unregister(&b->kobj);
630 kfree(b);
631 per_cpu(threshold_banks, cpu)[bank] = NULL;
632}
633
634static void threshold_remove_device(unsigned int cpu)
635{
636 unsigned int bank;
637
638 for (bank = 0; bank < NR_BANKS; ++bank) {
639 if (!(per_cpu(bank_map, cpu) & 1 << bank))
640 continue;
641 threshold_remove_bank(cpu, bank);
642 }
643}
644
645/* get notified when a cpu comes on/off */
646static int threshold_cpu_callback(struct notifier_block *nfb,
647 unsigned long action, void *hcpu)
648{
649 /* cpu was unsigned int to begin with */
650 unsigned int cpu = (unsigned long)hcpu;
651
652 if (cpu >= NR_CPUS)
653 goto out;
654
655 switch (action) {
656 case CPU_ONLINE:
657 case CPU_ONLINE_FROZEN:
658 threshold_create_device(cpu);
659 break;
660 case CPU_DEAD:
661 case CPU_DEAD_FROZEN:
662 threshold_remove_device(cpu);
663 break;
664 default:
665 break;
666 }
667 out:
668 return NOTIFY_OK;
669}
670
/* Hotplug notifier block; registered by threshold_init_device(). */
671static struct notifier_block threshold_cpu_notifier = {
672 .notifier_call = threshold_cpu_callback,
673};
674
675static __init int threshold_init_device(void)
676{
677 unsigned lcpu = 0;
678
679 /* to hit CPUs online before the notifier is up */
680 for_each_online_cpu(lcpu) {
681 int err = threshold_create_device(lcpu);
682 if (err)
683 return err;
684 }
685 register_hotcpu_notifier(&threshold_cpu_notifier);
686 return 0;
687}
688
689device_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/mce_intel_64.c b/arch/x86/kernel/mce_intel_64.c
new file mode 100644
index 000000000000..6551505d8a2c
--- /dev/null
+++ b/arch/x86/kernel/mce_intel_64.c
@@ -0,0 +1,89 @@
1/*
2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 */
5
6#include <linux/init.h>
7#include <linux/interrupt.h>
8#include <linux/percpu.h>
9#include <asm/processor.h>
10#include <asm/msr.h>
11#include <asm/mce.h>
12#include <asm/hw_irq.h>
13#include <asm/idle.h>
14#include <asm/therm_throt.h>
15
16asmlinkage void smp_thermal_interrupt(void)
17{
18 __u64 msr_val;
19
20 ack_APIC_irq();
21
22 exit_idle();
23 irq_enter();
24
25 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
26 if (therm_throt_process(msr_val & 1))
27 mce_log_therm_throt_event(smp_processor_id(), msr_val);
28
29 irq_exit();
30}
31
32static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
33{
34 u32 l, h;
35 int tm2 = 0;
36 unsigned int cpu = smp_processor_id();
37
38 if (!cpu_has(c, X86_FEATURE_ACPI))
39 return;
40
41 if (!cpu_has(c, X86_FEATURE_ACC))
42 return;
43
44 /* first check if TM1 is already enabled by the BIOS, in which
45 * case there might be some SMM goo which handles it, so we can't even
46 * put a handler since it might be delivered via SMI already.
47 */
48 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
49 h = apic_read(APIC_LVTTHMR);
50 if ((l & (1 << 3)) && (h & APIC_DM_SMI)) {
51 printk(KERN_DEBUG
52 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
53 return;
54 }
55
56 if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
57 tm2 = 1;
58
59 if (h & APIC_VECTOR_MASK) {
60 printk(KERN_DEBUG
61 "CPU%d: Thermal LVT vector (%#x) already "
62 "installed\n", cpu, (h & APIC_VECTOR_MASK));
63 return;
64 }
65
66 h = THERMAL_APIC_VECTOR;
67 h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
68 apic_write(APIC_LVTTHMR, h);
69
70 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
71 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
72
73 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
74 wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h);
75
76 l = apic_read(APIC_LVTTHMR);
77 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
78 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
79 cpu, tm2 ? "TM2" : "TM1");
80
81 /* enable thermal throttle processing */
82 atomic_set(&therm_throt_en, 1);
83 return;
84}
85
86void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c)
87{
88 intel_init_thermal(c);
89}
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
new file mode 100644
index 000000000000..a888e67f5874
--- /dev/null
+++ b/arch/x86/kernel/module_64.c
@@ -0,0 +1,185 @@
1/* Kernel module help for x86-64
2 Copyright (C) 2001 Rusty Russell.
3 Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/
19#include <linux/moduleloader.h>
20#include <linux/elf.h>
21#include <linux/vmalloc.h>
22#include <linux/fs.h>
23#include <linux/string.h>
24#include <linux/kernel.h>
25#include <linux/slab.h>
26#include <linux/bug.h>
27
28#include <asm/system.h>
29#include <asm/page.h>
30#include <asm/pgtable.h>
31
32#define DEBUGP(fmt...)
33
34#ifndef CONFIG_UML
/*
 * Release a region obtained from module_alloc(). @mod is not used.
 */
void module_free(struct module *mod, void *module_region)
{
	vfree(module_region);
	/* FIXME: If module_region == mod->init_region, trim exception
	   table entries. */
}
41
42void *module_alloc(unsigned long size)
43{
44 struct vm_struct *area;
45
46 if (!size)
47 return NULL;
48 size = PAGE_ALIGN(size);
49 if (size > MODULES_LEN)
50 return NULL;
51
52 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
53 if (!area)
54 return NULL;
55
56 return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
57}
58#endif
59
60/* We don't need anything special. */
61int module_frob_arch_sections(Elf_Ehdr *hdr,
62 Elf_Shdr *sechdrs,
63 char *secstrings,
64 struct module *mod)
65{
66 return 0;
67}
68
69int apply_relocate_add(Elf64_Shdr *sechdrs,
70 const char *strtab,
71 unsigned int symindex,
72 unsigned int relsec,
73 struct module *me)
74{
75 unsigned int i;
76 Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
77 Elf64_Sym *sym;
78 void *loc;
79 u64 val;
80
81 DEBUGP("Applying relocate section %u to %u\n", relsec,
82 sechdrs[relsec].sh_info);
83 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
84 /* This is where to make the change */
85 loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
86 + rel[i].r_offset;
87
88 /* This is the symbol it is referring to. Note that all
89 undefined symbols have been resolved. */
90 sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
91 + ELF64_R_SYM(rel[i].r_info);
92
93 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
94 (int)ELF64_R_TYPE(rel[i].r_info),
95 sym->st_value, rel[i].r_addend, (u64)loc);
96
97 val = sym->st_value + rel[i].r_addend;
98
99 switch (ELF64_R_TYPE(rel[i].r_info)) {
100 case R_X86_64_NONE:
101 break;
102 case R_X86_64_64:
103 *(u64 *)loc = val;
104 break;
105 case R_X86_64_32:
106 *(u32 *)loc = val;
107 if (val != *(u32 *)loc)
108 goto overflow;
109 break;
110 case R_X86_64_32S:
111 *(s32 *)loc = val;
112 if ((s64)val != *(s32 *)loc)
113 goto overflow;
114 break;
115 case R_X86_64_PC32:
116 val -= (u64)loc;
117 *(u32 *)loc = val;
118#if 0
119 if ((s64)val != *(s32 *)loc)
120 goto overflow;
121#endif
122 break;
123 default:
124 printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n",
125 me->name, ELF64_R_TYPE(rel[i].r_info));
126 return -ENOEXEC;
127 }
128 }
129 return 0;
130
131overflow:
132 printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
133 (int)ELF64_R_TYPE(rel[i].r_info), val);
134 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
135 me->name);
136 return -ENOEXEC;
137}
138
139int apply_relocate(Elf_Shdr *sechdrs,
140 const char *strtab,
141 unsigned int symindex,
142 unsigned int relsec,
143 struct module *me)
144{
145 printk("non add relocation not supported\n");
146 return -ENOSYS;
147}
148
149int module_finalize(const Elf_Ehdr *hdr,
150 const Elf_Shdr *sechdrs,
151 struct module *me)
152{
153 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
154 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
155
156 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
157 if (!strcmp(".text", secstrings + s->sh_name))
158 text = s;
159 if (!strcmp(".altinstructions", secstrings + s->sh_name))
160 alt = s;
161 if (!strcmp(".smp_locks", secstrings + s->sh_name))
162 locks= s;
163 }
164
165 if (alt) {
166 /* patch .altinstructions */
167 void *aseg = (void *)alt->sh_addr;
168 apply_alternatives(aseg, aseg + alt->sh_size);
169 }
170 if (locks && text) {
171 void *lseg = (void *)locks->sh_addr;
172 void *tseg = (void *)text->sh_addr;
173 alternatives_smp_module_add(me, me->name,
174 lseg, lseg + locks->sh_size,
175 tseg, tseg + text->sh_size);
176 }
177
178 return module_bug_finalize(hdr, sechdrs, me);
179}
180
/*
 * Counterpart of module_finalize(): drops the module's SMP alternatives
 * registration and its bug table bookkeeping.
 */
void module_arch_cleanup(struct module *mod)
{
	alternatives_smp_module_del(mod);
	module_bug_cleanup(mod);
}
diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c
new file mode 100644
index 000000000000..8bf0ca03ac8e
--- /dev/null
+++ b/arch/x86/kernel/mpparse_64.c
@@ -0,0 +1,852 @@
1/*
2 * Intel Multiprocessor Specification 1.1 and 1.4
3 * compliant MP-table parsing routines.
4 *
5 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes
9 * Erich Boleyn : MP v1.4 and additional changes.
10 * Alan Cox : Added EBDA scanning
11 * Ingo Molnar : various cleanups and rewrites
12 * Maciej W. Rozycki: Bits for default MP configurations
13 * Paul Diefenbaugh: Added full ACPI support
14 */
15
16#include <linux/mm.h>
17#include <linux/init.h>
18#include <linux/delay.h>
19#include <linux/bootmem.h>
20#include <linux/kernel_stat.h>
21#include <linux/mc146818rtc.h>
22#include <linux/acpi.h>
23#include <linux/module.h>
24
25#include <asm/smp.h>
26#include <asm/mtrr.h>
27#include <asm/mpspec.h>
28#include <asm/pgalloc.h>
29#include <asm/io_apic.h>
30#include <asm/proto.h>
31#include <asm/acpi.h>
32
33/* Have we found an MP table */
34int smp_found_config;
35
36/*
37 * Various Linux-internal data structures created from the
38 * MP-table.
39 */
40DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
41int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
42
43static int mp_current_pci_id = 0;
44/* I/O APIC entries */
45struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
46
47/* MP IRQ source entries */
48struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
49
50/* # of MP IRQ source entries */
51int mp_irq_entries;
52
53int nr_ioapics;
54unsigned long mp_lapic_addr = 0;
55
56
57
58/* Processor that is doing the boot up */
59unsigned int boot_cpu_id = -1U;
60/* Internal processor count */
61unsigned int num_processors __cpuinitdata = 0;
62
63unsigned disabled_cpus __cpuinitdata;
64
65/* Bitmask of physically existing CPUs */
66physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
67
68u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
69
70
71/*
72 * Intel MP BIOS table parsing routines:
73 */
74
/*
 * Checksum an MP configuration block: adds up the @len bytes starting at @mp
 * and returns the low eight bits of the sum.
 */

static int __init mpf_checksum(unsigned char *mp, int len)
{
	int sum = 0;

	for (; len; --len, ++mp)
		sum += *mp;

	return sum & 0xFF;
}
88
89static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
90{
91 int cpu;
92 cpumask_t tmp_map;
93 char *bootup_cpu = "";
94
95 if (!(m->mpc_cpuflag & CPU_ENABLED)) {
96 disabled_cpus++;
97 return;
98 }
99 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
100 bootup_cpu = " (Bootup-CPU)";
101 boot_cpu_id = m->mpc_apicid;
102 }
103
104 printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
105
106 if (num_processors >= NR_CPUS) {
107 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
108 " Processor ignored.\n", NR_CPUS);
109 return;
110 }
111
112 num_processors++;
113 cpus_complement(tmp_map, cpu_present_map);
114 cpu = first_cpu(tmp_map);
115
116 physid_set(m->mpc_apicid, phys_cpu_present_map);
117 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
118 /*
119 * bios_cpu_apicid is required to have processors listed
120 * in same order as logical cpu numbers. Hence the first
121 * entry is BSP, and so on.
122 */
123 cpu = 0;
124 }
125 bios_cpu_apicid[cpu] = m->mpc_apicid;
126 x86_cpu_to_apicid[cpu] = m->mpc_apicid;
127
128 cpu_set(cpu, cpu_possible_map);
129 cpu_set(cpu, cpu_present_map);
130}
131
132static void __init MP_bus_info (struct mpc_config_bus *m)
133{
134 char str[7];
135
136 memcpy(str, m->mpc_bustype, 6);
137 str[6] = 0;
138 Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
139
140 if (strncmp(str, "ISA", 3) == 0) {
141 set_bit(m->mpc_busid, mp_bus_not_pci);
142 } else if (strncmp(str, "PCI", 3) == 0) {
143 clear_bit(m->mpc_busid, mp_bus_not_pci);
144 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
145 mp_current_pci_id++;
146 } else {
147 printk(KERN_ERR "Unknown bustype %s\n", str);
148 }
149}
150
151static int bad_ioapic(unsigned long address)
152{
153 if (nr_ioapics >= MAX_IO_APICS) {
154 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
155 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
156 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
157 }
158 if (!address) {
159 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
160 " found in table, skipping!\n");
161 return 1;
162 }
163 return 0;
164}
165
166static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
167{
168 if (!(m->mpc_flags & MPC_APIC_USABLE))
169 return;
170
171 printk("I/O APIC #%d at 0x%X.\n",
172 m->mpc_apicid, m->mpc_apicaddr);
173
174 if (bad_ioapic(m->mpc_apicaddr))
175 return;
176
177 mp_ioapics[nr_ioapics] = *m;
178 nr_ioapics++;
179}
180
181static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
182{
183 mp_irqs [mp_irq_entries] = *m;
184 Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
185 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
186 m->mpc_irqtype, m->mpc_irqflag & 3,
187 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
188 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
189 if (++mp_irq_entries >= MAX_IRQ_SOURCES)
190 panic("Max # of irq sources exceeded!!\n");
191}
192
193static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
194{
195 Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
196 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
197 m->mpc_irqtype, m->mpc_irqflag & 3,
198 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
199 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
200}
201
202/*
203 * Read/parse the MPC
204 */
205
206static int __init smp_read_mpc(struct mp_config_table *mpc)
207{
208 char str[16];
209 int count=sizeof(*mpc);
210 unsigned char *mpt=((unsigned char *)mpc)+count;
211
212 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
213 printk("MPTABLE: bad signature [%c%c%c%c]!\n",
214 mpc->mpc_signature[0],
215 mpc->mpc_signature[1],
216 mpc->mpc_signature[2],
217 mpc->mpc_signature[3]);
218 return 0;
219 }
220 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
221 printk("MPTABLE: checksum error!\n");
222 return 0;
223 }
224 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
225 printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
226 mpc->mpc_spec);
227 return 0;
228 }
229 if (!mpc->mpc_lapic) {
230 printk(KERN_ERR "MPTABLE: null local APIC address!\n");
231 return 0;
232 }
233 memcpy(str,mpc->mpc_oem,8);
234 str[8] = 0;
235 printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
236
237 memcpy(str,mpc->mpc_productid,12);
238 str[12] = 0;
239 printk("MPTABLE: Product ID: %s ",str);
240
241 printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
242
243 /* save the local APIC address, it might be non-default */
244 if (!acpi_lapic)
245 mp_lapic_addr = mpc->mpc_lapic;
246
247 /*
248 * Now process the configuration blocks.
249 */
250 while (count < mpc->mpc_length) {
251 switch(*mpt) {
252 case MP_PROCESSOR:
253 {
254 struct mpc_config_processor *m=
255 (struct mpc_config_processor *)mpt;
256 if (!acpi_lapic)
257 MP_processor_info(m);
258 mpt += sizeof(*m);
259 count += sizeof(*m);
260 break;
261 }
262 case MP_BUS:
263 {
264 struct mpc_config_bus *m=
265 (struct mpc_config_bus *)mpt;
266 MP_bus_info(m);
267 mpt += sizeof(*m);
268 count += sizeof(*m);
269 break;
270 }
271 case MP_IOAPIC:
272 {
273 struct mpc_config_ioapic *m=
274 (struct mpc_config_ioapic *)mpt;
275 MP_ioapic_info(m);
276 mpt += sizeof(*m);
277 count += sizeof(*m);
278 break;
279 }
280 case MP_INTSRC:
281 {
282 struct mpc_config_intsrc *m=
283 (struct mpc_config_intsrc *)mpt;
284
285 MP_intsrc_info(m);
286 mpt += sizeof(*m);
287 count += sizeof(*m);
288 break;
289 }
290 case MP_LINTSRC:
291 {
292 struct mpc_config_lintsrc *m=
293 (struct mpc_config_lintsrc *)mpt;
294 MP_lintsrc_info(m);
295 mpt += sizeof(*m);
296 count += sizeof(*m);
297 break;
298 }
299 }
300 }
301 setup_apic_routing();
302 if (!num_processors)
303 printk(KERN_ERR "MPTABLE: no processors registered!\n");
304 return num_processors;
305}
306
307static int __init ELCR_trigger(unsigned int irq)
308{
309 unsigned int port;
310
311 port = 0x4d0 + (irq >> 3);
312 return (inb(port) >> (irq & 7)) & 1;
313}
314
315static void __init construct_default_ioirq_mptable(int mpc_default_type)
316{
317 struct mpc_config_intsrc intsrc;
318 int i;
319 int ELCR_fallback = 0;
320
321 intsrc.mpc_type = MP_INTSRC;
322 intsrc.mpc_irqflag = 0; /* conforming */
323 intsrc.mpc_srcbus = 0;
324 intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
325
326 intsrc.mpc_irqtype = mp_INT;
327
328 /*
329 * If true, we have an ISA/PCI system with no IRQ entries
330 * in the MP table. To prevent the PCI interrupts from being set up
331 * incorrectly, we try to use the ELCR. The sanity check to see if
332 * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
333 * never be level sensitive, so we simply see if the ELCR agrees.
334 * If it does, we assume it's valid.
335 */
336 if (mpc_default_type == 5) {
337 printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
338
339 if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
340 printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
341 else {
342 printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
343 ELCR_fallback = 1;
344 }
345 }
346
347 for (i = 0; i < 16; i++) {
348 switch (mpc_default_type) {
349 case 2:
350 if (i == 0 || i == 13)
351 continue; /* IRQ0 & IRQ13 not connected */
352 /* fall through */
353 default:
354 if (i == 2)
355 continue; /* IRQ2 is never connected */
356 }
357
358 if (ELCR_fallback) {
359 /*
360 * If the ELCR indicates a level-sensitive interrupt, we
361 * copy that information over to the MP table in the
362 * irqflag field (level sensitive, active high polarity).
363 */
364 if (ELCR_trigger(i))
365 intsrc.mpc_irqflag = 13;
366 else
367 intsrc.mpc_irqflag = 0;
368 }
369
370 intsrc.mpc_srcbusirq = i;
371 intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
372 MP_intsrc_info(&intsrc);
373 }
374
375 intsrc.mpc_irqtype = mp_ExtINT;
376 intsrc.mpc_srcbusirq = 0;
377 intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
378 MP_intsrc_info(&intsrc);
379}
380
381static inline void __init construct_default_ISA_mptable(int mpc_default_type)
382{
383 struct mpc_config_processor processor;
384 struct mpc_config_bus bus;
385 struct mpc_config_ioapic ioapic;
386 struct mpc_config_lintsrc lintsrc;
387 int linttypes[2] = { mp_ExtINT, mp_NMI };
388 int i;
389
390 /*
391 * local APIC has default address
392 */
393 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
394
395 /*
396 * 2 CPUs, numbered 0 & 1.
397 */
398 processor.mpc_type = MP_PROCESSOR;
399 processor.mpc_apicver = 0;
400 processor.mpc_cpuflag = CPU_ENABLED;
401 processor.mpc_cpufeature = 0;
402 processor.mpc_featureflag = 0;
403 processor.mpc_reserved[0] = 0;
404 processor.mpc_reserved[1] = 0;
405 for (i = 0; i < 2; i++) {
406 processor.mpc_apicid = i;
407 MP_processor_info(&processor);
408 }
409
410 bus.mpc_type = MP_BUS;
411 bus.mpc_busid = 0;
412 switch (mpc_default_type) {
413 default:
414 printk(KERN_ERR "???\nUnknown standard configuration %d\n",
415 mpc_default_type);
416 /* fall through */
417 case 1:
418 case 5:
419 memcpy(bus.mpc_bustype, "ISA ", 6);
420 break;
421 }
422 MP_bus_info(&bus);
423 if (mpc_default_type > 4) {
424 bus.mpc_busid = 1;
425 memcpy(bus.mpc_bustype, "PCI ", 6);
426 MP_bus_info(&bus);
427 }
428
429 ioapic.mpc_type = MP_IOAPIC;
430 ioapic.mpc_apicid = 2;
431 ioapic.mpc_apicver = 0;
432 ioapic.mpc_flags = MPC_APIC_USABLE;
433 ioapic.mpc_apicaddr = 0xFEC00000;
434 MP_ioapic_info(&ioapic);
435
436 /*
437 * We set up most of the low 16 IO-APIC pins according to MPS rules.
438 */
439 construct_default_ioirq_mptable(mpc_default_type);
440
441 lintsrc.mpc_type = MP_LINTSRC;
442 lintsrc.mpc_irqflag = 0; /* conforming */
443 lintsrc.mpc_srcbusid = 0;
444 lintsrc.mpc_srcbusirq = 0;
445 lintsrc.mpc_destapic = MP_APIC_ALL;
446 for (i = 0; i < 2; i++) {
447 lintsrc.mpc_irqtype = linttypes[i];
448 lintsrc.mpc_destapiclint = i;
449 MP_lintsrc_info(&lintsrc);
450 }
451}
452
453static struct intel_mp_floating *mpf_found;
454
455/*
456 * Scan the memory blocks for an SMP configuration block.
457 */
458void __init get_smp_config (void)
459{
460 struct intel_mp_floating *mpf = mpf_found;
461
462 /*
463 * ACPI supports both logical (e.g. Hyper-Threading) and physical
464 * processors, where MPS only supports physical.
465 */
466 if (acpi_lapic && acpi_ioapic) {
467 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
468 return;
469 }
470 else if (acpi_lapic)
471 printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
472
473 printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
474
475 /*
476 * Now see if we need to read further.
477 */
478 if (mpf->mpf_feature1 != 0) {
479
480 printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
481 construct_default_ISA_mptable(mpf->mpf_feature1);
482
483 } else if (mpf->mpf_physptr) {
484
485 /*
486 * Read the physical hardware table. Anything here will
487 * override the defaults.
488 */
489 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
490 smp_found_config = 0;
491 printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
492 printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
493 return;
494 }
495 /*
496 * If there are no explicit MP IRQ entries, then we are
497 * broken. We set up most of the low 16 IO-APIC pins to
498 * ISA defaults and hope it will work.
499 */
500 if (!mp_irq_entries) {
501 struct mpc_config_bus bus;
502
503 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
504
505 bus.mpc_type = MP_BUS;
506 bus.mpc_busid = 0;
507 memcpy(bus.mpc_bustype, "ISA ", 6);
508 MP_bus_info(&bus);
509
510 construct_default_ioirq_mptable(0);
511 }
512
513 } else
514 BUG();
515
516 printk(KERN_INFO "Processors: %d\n", num_processors);
517 /*
518 * Only use the first configuration found.
519 */
520}
521
522static int __init smp_scan_config (unsigned long base, unsigned long length)
523{
524 extern void __bad_mpf_size(void);
525 unsigned int *bp = phys_to_virt(base);
526 struct intel_mp_floating *mpf;
527
528 Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
529 if (sizeof(*mpf) != 16)
530 __bad_mpf_size();
531
532 while (length > 0) {
533 mpf = (struct intel_mp_floating *)bp;
534 if ((*bp == SMP_MAGIC_IDENT) &&
535 (mpf->mpf_length == 1) &&
536 !mpf_checksum((unsigned char *)bp, 16) &&
537 ((mpf->mpf_specification == 1)
538 || (mpf->mpf_specification == 4)) ) {
539
540 smp_found_config = 1;
541 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
542 if (mpf->mpf_physptr)
543 reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE);
544 mpf_found = mpf;
545 return 1;
546 }
547 bp += 4;
548 length -= 16;
549 }
550 return 0;
551}
552
553void __init find_smp_config(void)
554{
555 unsigned int address;
556
557 /*
558 * FIXME: Linux assumes you have 640K of base ram..
559 * this continues the error...
560 *
561 * 1) Scan the bottom 1K for a signature
562 * 2) Scan the top 1K of base RAM
563 * 3) Scan the 64K of bios
564 */
565 if (smp_scan_config(0x0,0x400) ||
566 smp_scan_config(639*0x400,0x400) ||
567 smp_scan_config(0xF0000,0x10000))
568 return;
569 /*
570 * If it is an SMP machine we should know now.
571 *
572 * there is a real-mode segmented pointer pointing to the
573 * 4K EBDA area at 0x40E, calculate and scan it here.
574 *
575 * NOTE! There are Linux loaders that will corrupt the EBDA
576 * area, and as such this kind of SMP config may be less
577 * trustworthy, simply because the SMP table may have been
578 * stomped on during early boot. These loaders are buggy and
579 * should be fixed.
580 */
581
582 address = *(unsigned short *)phys_to_virt(0x40E);
583 address <<= 4;
584 if (smp_scan_config(address, 0x1000))
585 return;
586
587 /* If we have come this far, we did not find an MP table */
588 printk(KERN_INFO "No mptable found.\n");
589}
590
591/* --------------------------------------------------------------------------
592 ACPI-based MP Configuration
593 -------------------------------------------------------------------------- */
594
595#ifdef CONFIG_ACPI
596
597void __init mp_register_lapic_address(u64 address)
598{
599 mp_lapic_addr = (unsigned long) address;
600 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
601 if (boot_cpu_id == -1U)
602 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
603}
604
605void __cpuinit mp_register_lapic (u8 id, u8 enabled)
606{
607 struct mpc_config_processor processor;
608 int boot_cpu = 0;
609
610 if (id == boot_cpu_id)
611 boot_cpu = 1;
612
613 processor.mpc_type = MP_PROCESSOR;
614 processor.mpc_apicid = id;
615 processor.mpc_apicver = 0;
616 processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
617 processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
618 processor.mpc_cpufeature = 0;
619 processor.mpc_featureflag = 0;
620 processor.mpc_reserved[0] = 0;
621 processor.mpc_reserved[1] = 0;
622
623 MP_processor_info(&processor);
624}
625
626#define MP_ISA_BUS 0
627#define MP_MAX_IOAPIC_PIN 127
628
629static struct mp_ioapic_routing {
630 int apic_id;
631 int gsi_start;
632 int gsi_end;
633 u32 pin_programmed[4];
634} mp_ioapic_routing[MAX_IO_APICS];
635
636static int mp_find_ioapic(int gsi)
637{
638 int i = 0;
639
640 /* Find the IOAPIC that manages this GSI. */
641 for (i = 0; i < nr_ioapics; i++) {
642 if ((gsi >= mp_ioapic_routing[i].gsi_start)
643 && (gsi <= mp_ioapic_routing[i].gsi_end))
644 return i;
645 }
646
647 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
648 return -1;
649}
650
651static u8 uniq_ioapic_id(u8 id)
652{
653 int i;
654 DECLARE_BITMAP(used, 256);
655 bitmap_zero(used, 256);
656 for (i = 0; i < nr_ioapics; i++) {
657 struct mpc_config_ioapic *ia = &mp_ioapics[i];
658 __set_bit(ia->mpc_apicid, used);
659 }
660 if (!test_bit(id, used))
661 return id;
662 return find_first_zero_bit(used, 256);
663}
664
665void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
666{
667 int idx = 0;
668
669 if (bad_ioapic(address))
670 return;
671
672 idx = nr_ioapics;
673
674 mp_ioapics[idx].mpc_type = MP_IOAPIC;
675 mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
676 mp_ioapics[idx].mpc_apicaddr = address;
677
678 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
679 mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
680 mp_ioapics[idx].mpc_apicver = 0;
681
682 /*
683 * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
684 * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
685 */
686 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
687 mp_ioapic_routing[idx].gsi_start = gsi_base;
688 mp_ioapic_routing[idx].gsi_end = gsi_base +
689 io_apic_get_redir_entries(idx);
690
691 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
692 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
693 mp_ioapics[idx].mpc_apicaddr,
694 mp_ioapic_routing[idx].gsi_start,
695 mp_ioapic_routing[idx].gsi_end);
696
697 nr_ioapics++;
698}
699
700void __init
701mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
702{
703 struct mpc_config_intsrc intsrc;
704 int ioapic = -1;
705 int pin = -1;
706
707 /*
708 * Convert 'gsi' to 'ioapic.pin'.
709 */
710 ioapic = mp_find_ioapic(gsi);
711 if (ioapic < 0)
712 return;
713 pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
714
715 /*
716 * TBD: This check is for faulty timer entries, where the override
717 * erroneously sets the trigger to level, resulting in a HUGE
718 * increase of timer interrupts!
719 */
720 if ((bus_irq == 0) && (trigger == 3))
721 trigger = 1;
722
723 intsrc.mpc_type = MP_INTSRC;
724 intsrc.mpc_irqtype = mp_INT;
725 intsrc.mpc_irqflag = (trigger << 2) | polarity;
726 intsrc.mpc_srcbus = MP_ISA_BUS;
727 intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
728 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
729 intsrc.mpc_dstirq = pin; /* INTIN# */
730
731 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
732 intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
733 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
734 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
735
736 mp_irqs[mp_irq_entries] = intsrc;
737 if (++mp_irq_entries == MAX_IRQ_SOURCES)
738 panic("Max # of irq sources exceeded!\n");
739}
740
741void __init mp_config_acpi_legacy_irqs(void)
742{
743 struct mpc_config_intsrc intsrc;
744 int i = 0;
745 int ioapic = -1;
746
747 /*
748 * Fabricate the legacy ISA bus (bus #31).
749 */
750 set_bit(MP_ISA_BUS, mp_bus_not_pci);
751
752 /*
753 * Locate the IOAPIC that manages the ISA IRQs (0-15).
754 */
755 ioapic = mp_find_ioapic(0);
756 if (ioapic < 0)
757 return;
758
759 intsrc.mpc_type = MP_INTSRC;
760 intsrc.mpc_irqflag = 0; /* Conforming */
761 intsrc.mpc_srcbus = MP_ISA_BUS;
762 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
763
764 /*
765 * Use the default configuration for the IRQs 0-15. Unless
766 * overridden by (MADT) interrupt source override entries.
767 */
768 for (i = 0; i < 16; i++) {
769 int idx;
770
771 for (idx = 0; idx < mp_irq_entries; idx++) {
772 struct mpc_config_intsrc *irq = mp_irqs + idx;
773
774 /* Do we already have a mapping for this ISA IRQ? */
775 if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
776 break;
777
778 /* Do we already have a mapping for this IOAPIC pin */
779 if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
780 (irq->mpc_dstirq == i))
781 break;
782 }
783
784 if (idx != mp_irq_entries) {
785 printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
786 continue; /* IRQ already used */
787 }
788
789 intsrc.mpc_irqtype = mp_INT;
790 intsrc.mpc_srcbusirq = i; /* Identity mapped */
791 intsrc.mpc_dstirq = i;
792
793 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
794 "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
795 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
796 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
797 intsrc.mpc_dstirq);
798
799 mp_irqs[mp_irq_entries] = intsrc;
800 if (++mp_irq_entries == MAX_IRQ_SOURCES)
801 panic("Max # of irq sources exceeded!\n");
802 }
803}
804
805int mp_register_gsi(u32 gsi, int triggering, int polarity)
806{
807 int ioapic = -1;
808 int ioapic_pin = 0;
809 int idx, bit = 0;
810
811 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
812 return gsi;
813
814 /* Don't set up the ACPI SCI because it's already set up */
815 if (acpi_gbl_FADT.sci_interrupt == gsi)
816 return gsi;
817
818 ioapic = mp_find_ioapic(gsi);
819 if (ioapic < 0) {
820 printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
821 return gsi;
822 }
823
824 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
825
826 /*
827 * Avoid pin reprogramming. PRTs typically include entries
828 * with redundant pin->gsi mappings (but unique PCI devices);
829 * we only program the IOAPIC on the first.
830 */
831 bit = ioapic_pin % 32;
832 idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
833 if (idx > 3) {
834 printk(KERN_ERR "Invalid reference to IOAPIC pin "
835 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
836 ioapic_pin);
837 return gsi;
838 }
839 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
840 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
841 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
842 return gsi;
843 }
844
845 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
846
847 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
848 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
849 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
850 return gsi;
851}
852#endif /*CONFIG_ACPI*/
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
new file mode 100644
index 000000000000..0ec6d2ddb931
--- /dev/null
+++ b/arch/x86/kernel/nmi_64.c
@@ -0,0 +1,483 @@
1/*
2 * linux/arch/x86_64/nmi.c
3 *
4 * NMI watchdog support on APIC systems
5 *
6 * Started by Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes:
9 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
10 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
11 * Pavel Machek and
12 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
13 */
14
15#include <linux/nmi.h>
16#include <linux/mm.h>
17#include <linux/delay.h>
18#include <linux/interrupt.h>
19#include <linux/module.h>
20#include <linux/sysdev.h>
21#include <linux/sysctl.h>
22#include <linux/kprobes.h>
23#include <linux/cpumask.h>
24#include <linux/kdebug.h>
25
26#include <asm/smp.h>
27#include <asm/nmi.h>
28#include <asm/proto.h>
29#include <asm/mce.h>
30
31int unknown_nmi_panic;
32int nmi_watchdog_enabled;
33int panic_on_unrecovered_nmi;
34
35static cpumask_t backtrace_mask = CPU_MASK_NONE;
36
37/* nmi_active:
38 * >0: the lapic NMI watchdog is active, but can be disabled
39 * <0: the lapic NMI watchdog has not been set up, and cannot
40 * be enabled
41 * 0: the lapic NMI watchdog is disabled, but can be enabled
42 */
43atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
44int panic_on_timeout;
45
46unsigned int nmi_watchdog = NMI_DEFAULT;
47static unsigned int nmi_hz = HZ;
48
49static DEFINE_PER_CPU(short, wd_enabled);
50
51/* local prototypes */
52static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
53
54/* Run after command line and cpu_init init, but before all other checks */
55void nmi_watchdog_default(void)
56{
57 if (nmi_watchdog != NMI_DEFAULT)
58 return;
59 nmi_watchdog = NMI_NONE;
60}
61
62static int endflag __initdata = 0;
63
64#ifdef CONFIG_SMP
65/* The performance counters used by NMI_LOCAL_APIC don't trigger when
66 * the CPU is idle. To make sure the NMI watchdog really ticks on all
67 * CPUs during the test make them busy.
68 */
69static __init void nmi_cpu_busy(void *data)
70{
71 local_irq_enable_in_hardirq();
72 /* Intentionally don't use cpu_relax here. This is
73 to make sure that the performance counter really ticks,
74 even if there is a simulator or similar that catches the
75 pause instruction. On a real HT machine this is fine because
76 all other CPUs are busy with "useless" delay loops and don't
77 care if they get somewhat less cycles. */
78 while (endflag == 0)
79 mb();
80}
81#endif
82
83int __init check_nmi_watchdog (void)
84{
85 int *counts;
86 int cpu;
87
88 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
89 return 0;
90
91 if (!atomic_read(&nmi_active))
92 return 0;
93
94 counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
95 if (!counts)
96 return -1;
97
98 printk(KERN_INFO "testing NMI watchdog ... ");
99
100#ifdef CONFIG_SMP
101 if (nmi_watchdog == NMI_LOCAL_APIC)
102 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
103#endif
104
105 for (cpu = 0; cpu < NR_CPUS; cpu++)
106 counts[cpu] = cpu_pda(cpu)->__nmi_count;
107 local_irq_enable();
108 mdelay((20*1000)/nmi_hz); // wait 20 ticks
109
110 for_each_online_cpu(cpu) {
111 if (!per_cpu(wd_enabled, cpu))
112 continue;
113 if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
114 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
115 cpu,
116 counts[cpu],
117 cpu_pda(cpu)->__nmi_count);
118 per_cpu(wd_enabled, cpu) = 0;
119 atomic_dec(&nmi_active);
120 }
121 }
122 if (!atomic_read(&nmi_active)) {
123 kfree(counts);
124 atomic_set(&nmi_active, -1);
125 endflag = 1;
126 return -1;
127 }
128 endflag = 1;
129 printk("OK.\n");
130
131 /* now that we know it works we can reduce NMI frequency to
132 something more reasonable; makes a difference in some configs */
133 if (nmi_watchdog == NMI_LOCAL_APIC)
134 nmi_hz = lapic_adjust_nmi_hz(1);
135
136 kfree(counts);
137 return 0;
138}
139
140int __init setup_nmi_watchdog(char *str)
141{
142 int nmi;
143
144 if (!strncmp(str,"panic",5)) {
145 panic_on_timeout = 1;
146 str = strchr(str, ',');
147 if (!str)
148 return 1;
149 ++str;
150 }
151
152 get_option(&str, &nmi);
153
154 if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
155 return 0;
156
157 nmi_watchdog = nmi;
158 return 1;
159}
160
161__setup("nmi_watchdog=", setup_nmi_watchdog);
162
163
164static void __acpi_nmi_disable(void *__unused)
165{
166 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
167}
168
169/*
170 * Disable timer based NMIs on all CPUs:
171 */
172void acpi_nmi_disable(void)
173{
174 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
175 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
176}
177
178static void __acpi_nmi_enable(void *__unused)
179{
180 apic_write(APIC_LVT0, APIC_DM_NMI);
181}
182
183/*
184 * Enable timer based NMIs on all CPUs:
185 */
186void acpi_nmi_enable(void)
187{
188 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
189 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
190}
191#ifdef CONFIG_PM
192
193static int nmi_pm_active; /* nmi_active before suspend */
194
195static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
196{
197 /* only CPU0 goes here, other CPUs should be offline */
198 nmi_pm_active = atomic_read(&nmi_active);
199 stop_apic_nmi_watchdog(NULL);
200 BUG_ON(atomic_read(&nmi_active) != 0);
201 return 0;
202}
203
204static int lapic_nmi_resume(struct sys_device *dev)
205{
206 /* only CPU0 goes here, other CPUs should be offline */
207 if (nmi_pm_active > 0) {
208 setup_apic_nmi_watchdog(NULL);
209 touch_nmi_watchdog();
210 }
211 return 0;
212}
213
214static struct sysdev_class nmi_sysclass = {
215 set_kset_name("lapic_nmi"),
216 .resume = lapic_nmi_resume,
217 .suspend = lapic_nmi_suspend,
218};
219
220static struct sys_device device_lapic_nmi = {
221 .id = 0,
222 .cls = &nmi_sysclass,
223};
224
225static int __init init_lapic_nmi_sysfs(void)
226{
227 int error;
228
229 /* should really be a BUG_ON but b/c this is an
230 * init call, it just doesn't work. -dcz
231 */
232 if (nmi_watchdog != NMI_LOCAL_APIC)
233 return 0;
234
235 if ( atomic_read(&nmi_active) < 0 )
236 return 0;
237
238 error = sysdev_class_register(&nmi_sysclass);
239 if (!error)
240 error = sysdev_register(&device_lapic_nmi);
241 return error;
242}
243/* must come after the local APIC's device_initcall() */
244late_initcall(init_lapic_nmi_sysfs);
245
246#endif /* CONFIG_PM */
247
248void setup_apic_nmi_watchdog(void *unused)
249{
250 if (__get_cpu_var(wd_enabled) == 1)
251 return;
252
253 /* cheap hack to support suspend/resume */
254 /* if cpu0 is not active neither should the other cpus */
255 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
256 return;
257
258 switch (nmi_watchdog) {
259 case NMI_LOCAL_APIC:
260 __get_cpu_var(wd_enabled) = 1;
261 if (lapic_watchdog_init(nmi_hz) < 0) {
262 __get_cpu_var(wd_enabled) = 0;
263 return;
264 }
265 /* FALL THROUGH */
266 case NMI_IO_APIC:
267 __get_cpu_var(wd_enabled) = 1;
268 atomic_inc(&nmi_active);
269 }
270}
271
272void stop_apic_nmi_watchdog(void *unused)
273{
274 /* only support LOCAL and IO APICs for now */
275 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
276 (nmi_watchdog != NMI_IO_APIC))
277 return;
278 if (__get_cpu_var(wd_enabled) == 0)
279 return;
280 if (nmi_watchdog == NMI_LOCAL_APIC)
281 lapic_watchdog_stop();
282 __get_cpu_var(wd_enabled) = 0;
283 atomic_dec(&nmi_active);
284}
285
286/*
287 * the best way to detect whether a CPU has a 'hard lockup' problem
288 * is to check its local APIC timer IRQ counts. If they are not
289 * changing then that CPU has some problem.
290 *
291 * as these watchdog NMI IRQs are generated on every CPU, we only
292 * have to check the current processor.
293 */
294
295static DEFINE_PER_CPU(unsigned, last_irq_sum);
296static DEFINE_PER_CPU(local_t, alert_counter);
297static DEFINE_PER_CPU(int, nmi_touch);
298
299void touch_nmi_watchdog(void)
300{
301 if (nmi_watchdog > 0) {
302 unsigned cpu;
303
304 /*
305 * Tell other CPUs to reset their alert counters. We cannot
306 * do it ourselves because the alert count increase is not
307 * atomic.
308 */
309 for_each_present_cpu(cpu) {
310 if (per_cpu(nmi_touch, cpu) != 1)
311 per_cpu(nmi_touch, cpu) = 1;
312 }
313 }
314
315 touch_softlockup_watchdog();
316}
317
318int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
319{
320 int sum;
321 int touched = 0;
322 int cpu = smp_processor_id();
323 int rc = 0;
324
325 /* check for other users first */
326 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
327 == NOTIFY_STOP) {
328 rc = 1;
329 touched = 1;
330 }
331
332 sum = read_pda(apic_timer_irqs);
333 if (__get_cpu_var(nmi_touch)) {
334 __get_cpu_var(nmi_touch) = 0;
335 touched = 1;
336 }
337
338 if (cpu_isset(cpu, backtrace_mask)) {
339 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
340
341 spin_lock(&lock);
342 printk("NMI backtrace for cpu %d\n", cpu);
343 dump_stack();
344 spin_unlock(&lock);
345 cpu_clear(cpu, backtrace_mask);
346 }
347
348#ifdef CONFIG_X86_MCE
349 /* Could check oops_in_progress here too, but it's safer
350 not too */
351 if (atomic_read(&mce_entry) > 0)
352 touched = 1;
353#endif
354 /* if the apic timer isn't firing, this cpu isn't doing much */
355 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
356 /*
357 * Ayiee, looks like this CPU is stuck ...
358 * wait a few IRQs (5 seconds) before doing the oops ...
359 */
360 local_inc(&__get_cpu_var(alert_counter));
361 if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
362 die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs,
363 panic_on_timeout);
364 } else {
365 __get_cpu_var(last_irq_sum) = sum;
366 local_set(&__get_cpu_var(alert_counter), 0);
367 }
368
369 /* see if the nmi watchdog went off */
370 if (!__get_cpu_var(wd_enabled))
371 return rc;
372 switch (nmi_watchdog) {
373 case NMI_LOCAL_APIC:
374 rc |= lapic_wd_event(nmi_hz);
375 break;
376 case NMI_IO_APIC:
377 /* don't know how to accurately check for this.
378 * just assume it was a watchdog timer interrupt
379 * This matches the old behaviour.
380 */
381 rc = 1;
382 break;
383 }
384 return rc;
385}
386
387static unsigned ignore_nmis;
388
389asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
390{
391 nmi_enter();
392 add_pda(__nmi_count,1);
393 if (!ignore_nmis)
394 default_do_nmi(regs);
395 nmi_exit();
396}
397
/*
 * With CONFIG_SYSCTL and unknown_nmi_panic set, forward to
 * unknown_nmi_panic_callback(); otherwise return 0.
 */
int do_nmi_callback(struct pt_regs * regs, int cpu)
{
	int ret = 0;

#ifdef CONFIG_SYSCTL
	if (unknown_nmi_panic)
		ret = unknown_nmi_panic_callback(regs, cpu);
#endif
	return ret;
}
406
407void stop_nmi(void)
408{
409 acpi_nmi_disable();
410 ignore_nmis++;
411}
412
413void restart_nmi(void)
414{
415 ignore_nmis--;
416 acpi_nmi_enable();
417}
418
419#ifdef CONFIG_SYSCTL
420
/*
 * Format a message containing the NMI reason byte and pass it to
 * die_nmi() with the panic argument set to 1.
 */
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
{
	char msg[64];
	unsigned char why = get_nmi_reason();

	sprintf(msg, "NMI received for unknown reason %02x\n", why);
	die_nmi(msg, regs, 1);	/* Always panic here */
	return 0;
}
430
431/*
432 * proc handler for /proc/sys/kernel/nmi
433 */
434int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
435 void __user *buffer, size_t *length, loff_t *ppos)
436{
437 int old_state;
438
439 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
440 old_state = nmi_watchdog_enabled;
441 proc_dointvec(table, write, file, buffer, length, ppos);
442 if (!!old_state == !!nmi_watchdog_enabled)
443 return 0;
444
445 if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
446 printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
447 return -EIO;
448 }
449
450 /* if nmi_watchdog is not set yet, then set it */
451 nmi_watchdog_default();
452
453 if (nmi_watchdog == NMI_LOCAL_APIC) {
454 if (nmi_watchdog_enabled)
455 enable_lapic_nmi_watchdog();
456 else
457 disable_lapic_nmi_watchdog();
458 } else {
459 printk( KERN_WARNING
460 "NMI watchdog doesn't know what hardware to touch\n");
461 return -EIO;
462 }
463 return 0;
464}
465
466#endif
467
468void __trigger_all_cpu_backtrace(void)
469{
470 int i;
471
472 backtrace_mask = cpu_online_map;
473 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
474 for (i = 0; i < 10 * 1000; i++) {
475 if (cpus_empty(backtrace_mask))
476 break;
477 mdelay(1);
478 }
479}
480
481EXPORT_SYMBOL(nmi_active);
482EXPORT_SYMBOL(nmi_watchdog);
483EXPORT_SYMBOL(touch_nmi_watchdog);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
new file mode 100644
index 000000000000..71da01e73f03
--- /dev/null
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -0,0 +1,1578 @@
1/*
2 * Derived from arch/powerpc/kernel/iommu.c
3 *
4 * Copyright IBM Corporation, 2006-2007
5 * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us>
6 *
7 * Author: Jon Mason <jdmason@kudzu.us>
8 * Author: Muli Ben-Yehuda <muli@il.ibm.com>
9
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25#include <linux/kernel.h>
26#include <linux/init.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/mm.h>
30#include <linux/spinlock.h>
31#include <linux/string.h>
32#include <linux/dma-mapping.h>
33#include <linux/init.h>
34#include <linux/bitops.h>
35#include <linux/pci_ids.h>
36#include <linux/pci.h>
37#include <linux/delay.h>
38#include <asm/iommu.h>
39#include <asm/calgary.h>
40#include <asm/tce.h>
41#include <asm/pci-direct.h>
42#include <asm/system.h>
43#include <asm/dma.h>
44#include <asm/rio.h>
45
46#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
47int use_calgary __read_mostly = 1;
48#else
49int use_calgary __read_mostly = 0;
50#endif /* CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT */
51
52#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
53#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308
54
55/* register offsets inside the host bridge space */
56#define CALGARY_CONFIG_REG 0x0108
57#define PHB_CSR_OFFSET 0x0110 /* Channel Status */
58#define PHB_PLSSR_OFFSET 0x0120
59#define PHB_CONFIG_RW_OFFSET 0x0160
60#define PHB_IOBASE_BAR_LOW 0x0170
61#define PHB_IOBASE_BAR_HIGH 0x0180
62#define PHB_MEM_1_LOW 0x0190
63#define PHB_MEM_1_HIGH 0x01A0
64#define PHB_IO_ADDR_SIZE 0x01B0
65#define PHB_MEM_1_SIZE 0x01C0
66#define PHB_MEM_ST_OFFSET 0x01D0
67#define PHB_AER_OFFSET 0x0200
68#define PHB_CONFIG_0_HIGH 0x0220
69#define PHB_CONFIG_0_LOW 0x0230
70#define PHB_CONFIG_0_END 0x0240
71#define PHB_MEM_2_LOW 0x02B0
72#define PHB_MEM_2_HIGH 0x02C0
73#define PHB_MEM_2_SIZE_HIGH 0x02D0
74#define PHB_MEM_2_SIZE_LOW 0x02E0
75#define PHB_DOSHOLE_OFFSET 0x08E0
76
77/* CalIOC2 specific */
78#define PHB_SAVIOR_L2 0x0DB0
79#define PHB_PAGE_MIG_CTRL 0x0DA8
80#define PHB_PAGE_MIG_DEBUG 0x0DA0
81#define PHB_ROOT_COMPLEX_STATUS 0x0CB0
82
83/* PHB_CONFIG_RW */
84#define PHB_TCE_ENABLE 0x20000000
85#define PHB_SLOT_DISABLE 0x1C000000
86#define PHB_DAC_DISABLE 0x01000000
87#define PHB_MEM2_ENABLE 0x00400000
88#define PHB_MCSR_ENABLE 0x00100000
89/* TAR (Table Address Register) */
90#define TAR_SW_BITS 0x0000ffffffff800fUL
91#define TAR_VALID 0x0000000000000008UL
92/* CSR (Channel/DMA Status Register) */
93#define CSR_AGENT_MASK 0xffe0ffff
94/* CCR (Calgary Configuration Register) */
95#define CCR_2SEC_TIMEOUT 0x000000000000000EUL
96/* PMCR/PMDR (Page Migration Control/Debug Registers */
97#define PMR_SOFTSTOP 0x80000000
98#define PMR_SOFTSTOPFAULT 0x40000000
99#define PMR_HARDSTOP 0x20000000
100
101#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */
102#define MAX_NUM_CHASSIS 8 /* max number of chassis */
103/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */
104#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2)
105#define PHBS_PER_CALGARY 4
106
107/* register offsets in Calgary's internal register space */
108static const unsigned long tar_offsets[] = {
109 0x0580 /* TAR0 */,
110 0x0588 /* TAR1 */,
111 0x0590 /* TAR2 */,
112 0x0598 /* TAR3 */
113};
114
115static const unsigned long split_queue_offsets[] = {
116 0x4870 /* SPLIT QUEUE 0 */,
117 0x5870 /* SPLIT QUEUE 1 */,
118 0x6870 /* SPLIT QUEUE 2 */,
119 0x7870 /* SPLIT QUEUE 3 */
120};
121
122static const unsigned long phb_offsets[] = {
123 0x8000 /* PHB0 */,
124 0x9000 /* PHB1 */,
125 0xA000 /* PHB2 */,
126 0xB000 /* PHB3 */
127};
128
129/* PHB debug registers */
130
131static const unsigned long phb_debug_offsets[] = {
132 0x4000 /* PHB 0 DEBUG */,
133 0x5000 /* PHB 1 DEBUG */,
134 0x6000 /* PHB 2 DEBUG */,
135 0x7000 /* PHB 3 DEBUG */
136};
137
138/*
139 * STUFF register for each debug PHB,
140 * byte 1 = start bus number, byte 2 = end bus number
141 */
142
143#define PHB_DEBUG_STUFF_OFFSET 0x0020
144
145#define EMERGENCY_PAGES 32 /* = 128KB */
146
147unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
148static int translate_empty_slots __read_mostly = 0;
149static int calgary_detected __read_mostly = 0;
150
151static struct rio_table_hdr *rio_table_hdr __initdata;
152static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
153static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata;
154
155struct calgary_bus_info {
156 void *tce_space;
157 unsigned char translation_disabled;
158 signed char phbid;
159 void __iomem *bbar;
160};
161
162static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
163static void calgary_tce_cache_blast(struct iommu_table *tbl);
164static void calgary_dump_error_regs(struct iommu_table *tbl);
165static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
166static void calioc2_tce_cache_blast(struct iommu_table *tbl);
167static void calioc2_dump_error_regs(struct iommu_table *tbl);
168
169static struct cal_chipset_ops calgary_chip_ops = {
170 .handle_quirks = calgary_handle_quirks,
171 .tce_cache_blast = calgary_tce_cache_blast,
172 .dump_error_regs = calgary_dump_error_regs
173};
174
175static struct cal_chipset_ops calioc2_chip_ops = {
176 .handle_quirks = calioc2_handle_quirks,
177 .tce_cache_blast = calioc2_tce_cache_blast,
178 .dump_error_regs = calioc2_dump_error_regs
179};
180
181static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
182
183/* enable this to stress test the chip's TCE cache */
184#ifdef CONFIG_IOMMU_DEBUG
185int debugging __read_mostly = 1;
186
187static inline unsigned long verify_bit_range(unsigned long* bitmap,
188 int expected, unsigned long start, unsigned long end)
189{
190 unsigned long idx = start;
191
192 BUG_ON(start >= end);
193
194 while (idx < end) {
195 if (!!test_bit(idx, bitmap) != expected)
196 return idx;
197 ++idx;
198 }
199
200 /* all bits have the expected value */
201 return ~0UL;
202}
203#else /* debugging is disabled */
204int debugging __read_mostly = 0;
205
206static inline unsigned long verify_bit_range(unsigned long* bitmap,
207 int expected, unsigned long start, unsigned long end)
208{
209 return ~0UL;
210}
211
212#endif /* CONFIG_IOMMU_DEBUG */
213
214static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
215{
216 unsigned int npages;
217
218 npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK);
219 npages >>= PAGE_SHIFT;
220
221 return npages;
222}
223
224static inline int translate_phb(struct pci_dev* dev)
225{
226 int disabled = bus_info[dev->bus->number].translation_disabled;
227 return !disabled;
228}
229
230static void iommu_range_reserve(struct iommu_table *tbl,
231 unsigned long start_addr, unsigned int npages)
232{
233 unsigned long index;
234 unsigned long end;
235 unsigned long badbit;
236 unsigned long flags;
237
238 index = start_addr >> PAGE_SHIFT;
239
240 /* bail out if we're asked to reserve a region we don't cover */
241 if (index >= tbl->it_size)
242 return;
243
244 end = index + npages;
245 if (end > tbl->it_size) /* don't go off the table */
246 end = tbl->it_size;
247
248 spin_lock_irqsave(&tbl->it_lock, flags);
249
250 badbit = verify_bit_range(tbl->it_map, 0, index, end);
251 if (badbit != ~0UL) {
252 if (printk_ratelimit())
253 printk(KERN_ERR "Calgary: entry already allocated at "
254 "0x%lx tbl %p dma 0x%lx npages %u\n",
255 badbit, tbl, start_addr, npages);
256 }
257
258 set_bit_string(tbl->it_map, index, npages);
259
260 spin_unlock_irqrestore(&tbl->it_lock, flags);
261}
262
263static unsigned long iommu_range_alloc(struct iommu_table *tbl,
264 unsigned int npages)
265{
266 unsigned long flags;
267 unsigned long offset;
268
269 BUG_ON(npages == 0);
270
271 spin_lock_irqsave(&tbl->it_lock, flags);
272
273 offset = find_next_zero_string(tbl->it_map, tbl->it_hint,
274 tbl->it_size, npages);
275 if (offset == ~0UL) {
276 tbl->chip_ops->tce_cache_blast(tbl);
277 offset = find_next_zero_string(tbl->it_map, 0,
278 tbl->it_size, npages);
279 if (offset == ~0UL) {
280 printk(KERN_WARNING "Calgary: IOMMU full.\n");
281 spin_unlock_irqrestore(&tbl->it_lock, flags);
282 if (panic_on_overflow)
283 panic("Calgary: fix the allocator.\n");
284 else
285 return bad_dma_address;
286 }
287 }
288
289 set_bit_string(tbl->it_map, offset, npages);
290 tbl->it_hint = offset + npages;
291 BUG_ON(tbl->it_hint > tbl->it_size);
292
293 spin_unlock_irqrestore(&tbl->it_lock, flags);
294
295 return offset;
296}
297
298static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr,
299 unsigned int npages, int direction)
300{
301 unsigned long entry;
302 dma_addr_t ret = bad_dma_address;
303
304 entry = iommu_range_alloc(tbl, npages);
305
306 if (unlikely(entry == bad_dma_address))
307 goto error;
308
309 /* set the return dma address */
310 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
311
312 /* put the TCEs in the HW table */
313 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
314 direction);
315
316 return ret;
317
318error:
319 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
320 "iommu %p\n", npages, tbl);
321 return bad_dma_address;
322}
323
324static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
325 unsigned int npages)
326{
327 unsigned long entry;
328 unsigned long badbit;
329 unsigned long badend;
330 unsigned long flags;
331
332 /* were we called with bad_dma_address? */
333 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
334 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
335 printk(KERN_ERR "Calgary: driver tried unmapping bad DMA "
336 "address 0x%Lx\n", dma_addr);
337 WARN_ON(1);
338 return;
339 }
340
341 entry = dma_addr >> PAGE_SHIFT;
342
343 BUG_ON(entry + npages > tbl->it_size);
344
345 tce_free(tbl, entry, npages);
346
347 spin_lock_irqsave(&tbl->it_lock, flags);
348
349 badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
350 if (badbit != ~0UL) {
351 if (printk_ratelimit())
352 printk(KERN_ERR "Calgary: bit is off at 0x%lx "
353 "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
354 badbit, tbl, dma_addr, entry, npages);
355 }
356
357 __clear_bit_string(tbl->it_map, entry, npages);
358
359 spin_unlock_irqrestore(&tbl->it_lock, flags);
360}
361
362static inline struct iommu_table *find_iommu_table(struct device *dev)
363{
364 struct pci_dev *pdev;
365 struct pci_bus *pbus;
366 struct iommu_table *tbl;
367
368 pdev = to_pci_dev(dev);
369
370 pbus = pdev->bus;
371
372 /* is the device behind a bridge? Look for the root bus */
373 while (pbus->parent)
374 pbus = pbus->parent;
375
376 tbl = pci_iommu(pbus);
377
378 BUG_ON(tbl && (tbl->it_busno != pbus->number));
379
380 return tbl;
381}
382
383static void calgary_unmap_sg(struct device *dev,
384 struct scatterlist *sglist, int nelems, int direction)
385{
386 struct iommu_table *tbl = find_iommu_table(dev);
387
388 if (!translate_phb(to_pci_dev(dev)))
389 return;
390
391 while (nelems--) {
392 unsigned int npages;
393 dma_addr_t dma = sglist->dma_address;
394 unsigned int dmalen = sglist->dma_length;
395
396 if (dmalen == 0)
397 break;
398
399 npages = num_dma_pages(dma, dmalen);
400 iommu_free(tbl, dma, npages);
401 sglist++;
402 }
403}
404
405static int calgary_nontranslate_map_sg(struct device* dev,
406 struct scatterlist *sg, int nelems, int direction)
407{
408 int i;
409
410 for (i = 0; i < nelems; i++ ) {
411 struct scatterlist *s = &sg[i];
412 BUG_ON(!s->page);
413 s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
414 s->dma_length = s->length;
415 }
416 return nelems;
417}
418
419static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
420 int nelems, int direction)
421{
422 struct iommu_table *tbl = find_iommu_table(dev);
423 unsigned long vaddr;
424 unsigned int npages;
425 unsigned long entry;
426 int i;
427
428 if (!translate_phb(to_pci_dev(dev)))
429 return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
430
431 for (i = 0; i < nelems; i++ ) {
432 struct scatterlist *s = &sg[i];
433 BUG_ON(!s->page);
434
435 vaddr = (unsigned long)page_address(s->page) + s->offset;
436 npages = num_dma_pages(vaddr, s->length);
437
438 entry = iommu_range_alloc(tbl, npages);
439 if (entry == bad_dma_address) {
440 /* makes sure unmap knows to stop */
441 s->dma_length = 0;
442 goto error;
443 }
444
445 s->dma_address = (entry << PAGE_SHIFT) | s->offset;
446
447 /* insert into HW table */
448 tce_build(tbl, entry, npages, vaddr & PAGE_MASK,
449 direction);
450
451 s->dma_length = s->length;
452 }
453
454 return nelems;
455error:
456 calgary_unmap_sg(dev, sg, nelems, direction);
457 for (i = 0; i < nelems; i++) {
458 sg[i].dma_address = bad_dma_address;
459 sg[i].dma_length = 0;
460 }
461 return 0;
462}
463
464static dma_addr_t calgary_map_single(struct device *dev, void *vaddr,
465 size_t size, int direction)
466{
467 dma_addr_t dma_handle = bad_dma_address;
468 unsigned long uaddr;
469 unsigned int npages;
470 struct iommu_table *tbl = find_iommu_table(dev);
471
472 uaddr = (unsigned long)vaddr;
473 npages = num_dma_pages(uaddr, size);
474
475 if (translate_phb(to_pci_dev(dev)))
476 dma_handle = iommu_alloc(tbl, vaddr, npages, direction);
477 else
478 dma_handle = virt_to_bus(vaddr);
479
480 return dma_handle;
481}
482
483static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
484 size_t size, int direction)
485{
486 struct iommu_table *tbl = find_iommu_table(dev);
487 unsigned int npages;
488
489 if (!translate_phb(to_pci_dev(dev)))
490 return;
491
492 npages = num_dma_pages(dma_handle, size);
493 iommu_free(tbl, dma_handle, npages);
494}
495
496static void* calgary_alloc_coherent(struct device *dev, size_t size,
497 dma_addr_t *dma_handle, gfp_t flag)
498{
499 void *ret = NULL;
500 dma_addr_t mapping;
501 unsigned int npages, order;
502 struct iommu_table *tbl = find_iommu_table(dev);
503
504 size = PAGE_ALIGN(size); /* size rounded up to full pages */
505 npages = size >> PAGE_SHIFT;
506 order = get_order(size);
507
508 /* alloc enough pages (and possibly more) */
509 ret = (void *)__get_free_pages(flag, order);
510 if (!ret)
511 goto error;
512 memset(ret, 0, size);
513
514 if (translate_phb(to_pci_dev(dev))) {
515 /* set up tces to cover the allocated range */
516 mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL);
517 if (mapping == bad_dma_address)
518 goto free;
519
520 *dma_handle = mapping;
521 } else /* non translated slot */
522 *dma_handle = virt_to_bus(ret);
523
524 return ret;
525
526free:
527 free_pages((unsigned long)ret, get_order(size));
528 ret = NULL;
529error:
530 return ret;
531}
532
533static const struct dma_mapping_ops calgary_dma_ops = {
534 .alloc_coherent = calgary_alloc_coherent,
535 .map_single = calgary_map_single,
536 .unmap_single = calgary_unmap_single,
537 .map_sg = calgary_map_sg,
538 .unmap_sg = calgary_unmap_sg,
539};
540
541static inline void __iomem * busno_to_bbar(unsigned char num)
542{
543 return bus_info[num].bbar;
544}
545
546static inline int busno_to_phbid(unsigned char num)
547{
548 return bus_info[num].phbid;
549}
550
551static inline unsigned long split_queue_offset(unsigned char num)
552{
553 size_t idx = busno_to_phbid(num);
554
555 return split_queue_offsets[idx];
556}
557
558static inline unsigned long tar_offset(unsigned char num)
559{
560 size_t idx = busno_to_phbid(num);
561
562 return tar_offsets[idx];
563}
564
565static inline unsigned long phb_offset(unsigned char num)
566{
567 size_t idx = busno_to_phbid(num);
568
569 return phb_offsets[idx];
570}
571
572static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset)
573{
574 unsigned long target = ((unsigned long)bar) | offset;
575 return (void __iomem*)target;
576}
577
578static inline int is_calioc2(unsigned short device)
579{
580 return (device == PCI_DEVICE_ID_IBM_CALIOC2);
581}
582
583static inline int is_calgary(unsigned short device)
584{
585 return (device == PCI_DEVICE_ID_IBM_CALGARY);
586}
587
/* Non-zero when @device is either the Calgary or the CalIOC2 device id. */
static inline int is_cal_pci_dev(unsigned short device)
{
	if (is_calgary(device))
		return 1;

	return is_calioc2(device);
}
592
593static void calgary_tce_cache_blast(struct iommu_table *tbl)
594{
595 u64 val;
596 u32 aer;
597 int i = 0;
598 void __iomem *bbar = tbl->bbar;
599 void __iomem *target;
600
601 /* disable arbitration on the bus */
602 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
603 aer = readl(target);
604 writel(0, target);
605
606 /* read plssr to ensure it got there */
607 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
608 val = readl(target);
609
610 /* poll split queues until all DMA activity is done */
611 target = calgary_reg(bbar, split_queue_offset(tbl->it_busno));
612 do {
613 val = readq(target);
614 i++;
615 } while ((val & 0xff) != 0xff && i < 100);
616 if (i == 100)
617 printk(KERN_WARNING "Calgary: PCI bus not quiesced, "
618 "continuing anyway\n");
619
620 /* invalidate TCE cache */
621 target = calgary_reg(bbar, tar_offset(tbl->it_busno));
622 writeq(tbl->tar_val, target);
623
624 /* enable arbitration */
625 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
626 writel(aer, target);
627 (void)readl(target); /* flush */
628}
629
630static void calioc2_tce_cache_blast(struct iommu_table *tbl)
631{
632 void __iomem *bbar = tbl->bbar;
633 void __iomem *target;
634 u64 val64;
635 u32 val;
636 int i = 0;
637 int count = 1;
638 unsigned char bus = tbl->it_busno;
639
640begin:
641 printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast "
642 "sequence - count %d\n", bus, count);
643
644 /* 1. using the Page Migration Control reg set SoftStop */
645 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
646 val = be32_to_cpu(readl(target));
647 printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target);
648 val |= PMR_SOFTSTOP;
649 printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target);
650 writel(cpu_to_be32(val), target);
651
652 /* 2. poll split queues until all DMA activity is done */
653 printk(KERN_DEBUG "2a. starting to poll split queues\n");
654 target = calgary_reg(bbar, split_queue_offset(bus));
655 do {
656 val64 = readq(target);
657 i++;
658 } while ((val64 & 0xff) != 0xff && i < 100);
659 if (i == 100)
660 printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, "
661 "continuing anyway\n");
662
663 /* 3. poll Page Migration DEBUG for SoftStopFault */
664 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
665 val = be32_to_cpu(readl(target));
666 printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target);
667
668 /* 4. if SoftStopFault - goto (1) */
669 if (val & PMR_SOFTSTOPFAULT) {
670 if (++count < 100)
671 goto begin;
672 else {
673 printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, "
674 "aborting TCE cache flush sequence!\n");
675 return; /* pray for the best */
676 }
677 }
678
679 /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */
680 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
681 printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target);
682 val = be32_to_cpu(readl(target));
683 printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target);
684 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
685 val = be32_to_cpu(readl(target));
686 printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target);
687
688 /* 6. invalidate TCE cache */
689 printk(KERN_DEBUG "6. invalidating TCE cache\n");
690 target = calgary_reg(bbar, tar_offset(bus));
691 writeq(tbl->tar_val, target);
692
693 /* 7. Re-read PMCR */
694 printk(KERN_DEBUG "7a. Re-reading PMCR\n");
695 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
696 val = be32_to_cpu(readl(target));
697 printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target);
698
699 /* 8. Remove HardStop */
700 printk(KERN_DEBUG "8a. removing HardStop from PMCR\n");
701 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
702 val = 0;
703 printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target);
704 writel(cpu_to_be32(val), target);
705 val = be32_to_cpu(readl(target));
706 printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target);
707}
708
709static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start,
710 u64 limit)
711{
712 unsigned int numpages;
713
714 limit = limit | 0xfffff;
715 limit++;
716
717 numpages = ((limit - start) >> PAGE_SHIFT);
718 iommu_range_reserve(pci_iommu(dev->bus), start, numpages);
719}
720
721static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev)
722{
723 void __iomem *target;
724 u64 low, high, sizelow;
725 u64 start, limit;
726 struct iommu_table *tbl = pci_iommu(dev->bus);
727 unsigned char busnum = dev->bus->number;
728 void __iomem *bbar = tbl->bbar;
729
730 /* peripheral MEM_1 region */
731 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW);
732 low = be32_to_cpu(readl(target));
733 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH);
734 high = be32_to_cpu(readl(target));
735 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE);
736 sizelow = be32_to_cpu(readl(target));
737
738 start = (high << 32) | low;
739 limit = sizelow;
740
741 calgary_reserve_mem_region(dev, start, limit);
742}
743
744static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
745{
746 void __iomem *target;
747 u32 val32;
748 u64 low, high, sizelow, sizehigh;
749 u64 start, limit;
750 struct iommu_table *tbl = pci_iommu(dev->bus);
751 unsigned char busnum = dev->bus->number;
752 void __iomem *bbar = tbl->bbar;
753
754 /* is it enabled? */
755 target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
756 val32 = be32_to_cpu(readl(target));
757 if (!(val32 & PHB_MEM2_ENABLE))
758 return;
759
760 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW);
761 low = be32_to_cpu(readl(target));
762 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH);
763 high = be32_to_cpu(readl(target));
764 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW);
765 sizelow = be32_to_cpu(readl(target));
766 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH);
767 sizehigh = be32_to_cpu(readl(target));
768
769 start = (high << 32) | low;
770 limit = (sizehigh << 32) | sizelow;
771
772 calgary_reserve_mem_region(dev, start, limit);
773}
774
775/*
776 * some regions of the IO address space do not get translated, so we
777 * must not give devices IO addresses in those regions. The regions
778 * are the 640KB-1MB region and the two PCI peripheral memory holes.
779 * Reserve all of them in the IOMMU bitmap to avoid giving them out
780 * later.
781 */
782static void __init calgary_reserve_regions(struct pci_dev *dev)
783{
784 unsigned int npages;
785 u64 start;
786 struct iommu_table *tbl = pci_iommu(dev->bus);
787
788 /* reserve EMERGENCY_PAGES from bad_dma_address and up */
789 iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
790
791 /* avoid the BIOS/VGA first 640KB-1MB region */
792 /* for CalIOC2 - avoid the entire first MB */
793 if (is_calgary(dev->device)) {
794 start = (640 * 1024);
795 npages = ((1024 - 640) * 1024) >> PAGE_SHIFT;
796 } else { /* calioc2 */
797 start = 0;
798 npages = (1 * 1024 * 1024) >> PAGE_SHIFT;
799 }
800 iommu_range_reserve(tbl, start, npages);
801
802 /* reserve the two PCI peripheral memory regions in IO space */
803 calgary_reserve_peripheral_mem_1(dev);
804 calgary_reserve_peripheral_mem_2(dev);
805}
806
807static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
808{
809 u64 val64;
810 u64 table_phys;
811 void __iomem *target;
812 int ret;
813 struct iommu_table *tbl;
814
815 /* build TCE tables for each PHB */
816 ret = build_tce_table(dev, bbar);
817 if (ret)
818 return ret;
819
820 tbl = pci_iommu(dev->bus);
821 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
822 tce_free(tbl, 0, tbl->it_size);
823
824 if (is_calgary(dev->device))
825 tbl->chip_ops = &calgary_chip_ops;
826 else if (is_calioc2(dev->device))
827 tbl->chip_ops = &calioc2_chip_ops;
828 else
829 BUG();
830
831 calgary_reserve_regions(dev);
832
833 /* set TARs for each PHB */
834 target = calgary_reg(bbar, tar_offset(dev->bus->number));
835 val64 = be64_to_cpu(readq(target));
836
837 /* zero out all TAR bits under sw control */
838 val64 &= ~TAR_SW_BITS;
839 table_phys = (u64)__pa(tbl->it_base);
840
841 val64 |= table_phys;
842
843 BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M);
844 val64 |= (u64) specified_table_size;
845
846 tbl->tar_val = cpu_to_be64(val64);
847
848 writeq(tbl->tar_val, target);
849 readq(target); /* flush */
850
851 return 0;
852}
853
854static void __init calgary_free_bus(struct pci_dev *dev)
855{
856 u64 val64;
857 struct iommu_table *tbl = pci_iommu(dev->bus);
858 void __iomem *target;
859 unsigned int bitmapsz;
860
861 target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number));
862 val64 = be64_to_cpu(readq(target));
863 val64 &= ~TAR_SW_BITS;
864 writeq(cpu_to_be64(val64), target);
865 readq(target); /* flush */
866
867 bitmapsz = tbl->it_size / BITS_PER_BYTE;
868 free_pages((unsigned long)tbl->it_map, get_order(bitmapsz));
869 tbl->it_map = NULL;
870
871 kfree(tbl);
872
873 set_pci_iommu(dev->bus, NULL);
874
875 /* Can't free bootmem allocated memory after system is up :-( */
876 bus_info[dev->bus->number].tce_space = NULL;
877}
878
879static void calgary_dump_error_regs(struct iommu_table *tbl)
880{
881 void __iomem *bbar = tbl->bbar;
882 void __iomem *target;
883 u32 csr, plssr;
884
885 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
886 csr = be32_to_cpu(readl(target));
887
888 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
889 plssr = be32_to_cpu(readl(target));
890
891 /* If no error, the agent ID in the CSR is not valid */
892 printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, "
893 "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr);
894}
895
896static void calioc2_dump_error_regs(struct iommu_table *tbl)
897{
898 void __iomem *bbar = tbl->bbar;
899 u32 csr, csmr, plssr, mck, rcstat;
900 void __iomem *target;
901 unsigned long phboff = phb_offset(tbl->it_busno);
902 unsigned long erroff;
903 u32 errregs[7];
904 int i;
905
906 /* dump CSR */
907 target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET);
908 csr = be32_to_cpu(readl(target));
909 /* dump PLSSR */
910 target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET);
911 plssr = be32_to_cpu(readl(target));
912 /* dump CSMR */
913 target = calgary_reg(bbar, phboff | 0x290);
914 csmr = be32_to_cpu(readl(target));
915 /* dump mck */
916 target = calgary_reg(bbar, phboff | 0x800);
917 mck = be32_to_cpu(readl(target));
918
919 printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n",
920 tbl->it_busno);
921
922 printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
923 csr, plssr, csmr, mck);
924
925 /* dump rest of error regs */
926 printk(KERN_EMERG "Calgary: ");
927 for (i = 0; i < ARRAY_SIZE(errregs); i++) {
928 /* err regs are at 0x810 - 0x870 */
929 erroff = (0x810 + (i * 0x10));
930 target = calgary_reg(bbar, phboff | erroff);
931 errregs[i] = be32_to_cpu(readl(target));
932 printk("0x%08x@0x%lx ", errregs[i], erroff);
933 }
934 printk("\n");
935
936 /* root complex status */
937 target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
938 rcstat = be32_to_cpu(readl(target));
939 printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat,
940 PHB_ROOT_COMPLEX_STATUS);
941}
942
943static void calgary_watchdog(unsigned long data)
944{
945 struct pci_dev *dev = (struct pci_dev *)data;
946 struct iommu_table *tbl = pci_iommu(dev->bus);
947 void __iomem *bbar = tbl->bbar;
948 u32 val32;
949 void __iomem *target;
950
951 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
952 val32 = be32_to_cpu(readl(target));
953
954 /* If no error, the agent ID in the CSR is not valid */
955 if (val32 & CSR_AGENT_MASK) {
956 tbl->chip_ops->dump_error_regs(tbl);
957
958 /* reset error */
959 writel(0, target);
960
961 /* Disable bus that caused the error */
962 target = calgary_reg(bbar, phb_offset(tbl->it_busno) |
963 PHB_CONFIG_RW_OFFSET);
964 val32 = be32_to_cpu(readl(target));
965 val32 |= PHB_SLOT_DISABLE;
966 writel(cpu_to_be32(val32), target);
967 readl(target); /* flush */
968 } else {
969 /* Reset the timer */
970 mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ);
971 }
972}
973
974static void __init calgary_set_split_completion_timeout(void __iomem *bbar,
975 unsigned char busnum, unsigned long timeout)
976{
977 u64 val64;
978 void __iomem *target;
979 unsigned int phb_shift = ~0; /* silence gcc */
980 u64 mask;
981
982 switch (busno_to_phbid(busnum)) {
983 case 0: phb_shift = (63 - 19);
984 break;
985 case 1: phb_shift = (63 - 23);
986 break;
987 case 2: phb_shift = (63 - 27);
988 break;
989 case 3: phb_shift = (63 - 35);
990 break;
991 default:
992 BUG_ON(busno_to_phbid(busnum));
993 }
994
995 target = calgary_reg(bbar, CALGARY_CONFIG_REG);
996 val64 = be64_to_cpu(readq(target));
997
998 /* zero out this PHB's timer bits */
999 mask = ~(0xFUL << phb_shift);
1000 val64 &= mask;
1001 val64 |= (timeout << phb_shift);
1002 writeq(cpu_to_be64(val64), target);
1003 readq(target); /* flush */
1004}
1005
1006static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
1007{
1008 unsigned char busnum = dev->bus->number;
1009 void __iomem *bbar = tbl->bbar;
1010 void __iomem *target;
1011 u32 val;
1012
1013 /*
1014 * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1
1015 */
1016 target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2);
1017 val = cpu_to_be32(readl(target));
1018 val |= 0x00800000;
1019 writel(cpu_to_be32(val), target);
1020}
1021
1022static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
1023{
1024 unsigned char busnum = dev->bus->number;
1025
1026 /*
1027 * Give split completion a longer timeout on bus 1 for aic94xx
1028 * http://bugzilla.kernel.org/show_bug.cgi?id=7180
1029 */
1030 if (is_calgary(dev->device) && (busnum == 1))
1031 calgary_set_split_completion_timeout(tbl->bbar, busnum,
1032 CCR_2SEC_TIMEOUT);
1033}
1034
1035static void __init calgary_enable_translation(struct pci_dev *dev)
1036{
1037 u32 val32;
1038 unsigned char busnum;
1039 void __iomem *target;
1040 void __iomem *bbar;
1041 struct iommu_table *tbl;
1042
1043 busnum = dev->bus->number;
1044 tbl = pci_iommu(dev->bus);
1045 bbar = tbl->bbar;
1046
1047 /* enable TCE in PHB Config Register */
1048 target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
1049 val32 = be32_to_cpu(readl(target));
1050 val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE;
1051
1052 printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n",
1053 (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ?
1054 "Calgary" : "CalIOC2", busnum);
1055 printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this "
1056 "bus.\n");
1057
1058 writel(cpu_to_be32(val32), target);
1059 readl(target); /* flush */
1060
1061 init_timer(&tbl->watchdog_timer);
1062 tbl->watchdog_timer.function = &calgary_watchdog;
1063 tbl->watchdog_timer.data = (unsigned long)dev;
1064 mod_timer(&tbl->watchdog_timer, jiffies);
1065}
1066
1067static void __init calgary_disable_translation(struct pci_dev *dev)
1068{
1069 u32 val32;
1070 unsigned char busnum;
1071 void __iomem *target;
1072 void __iomem *bbar;
1073 struct iommu_table *tbl;
1074
1075 busnum = dev->bus->number;
1076 tbl = pci_iommu(dev->bus);
1077 bbar = tbl->bbar;
1078
1079 /* disable TCE in PHB Config Register */
1080 target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
1081 val32 = be32_to_cpu(readl(target));
1082 val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE);
1083
1084 printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum);
1085 writel(cpu_to_be32(val32), target);
1086 readl(target); /* flush */
1087
1088 del_timer_sync(&tbl->watchdog_timer);
1089}
1090
1091static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
1092{
1093 pci_dev_get(dev);
1094 set_pci_iommu(dev->bus, NULL);
1095
1096 /* is the device behind a bridge? */
1097 if (dev->bus->parent)
1098 dev->bus->parent->self = dev;
1099 else
1100 dev->bus->self = dev;
1101}
1102
1103static int __init calgary_init_one(struct pci_dev *dev)
1104{
1105 void __iomem *bbar;
1106 struct iommu_table *tbl;
1107 int ret;
1108
1109 BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM);
1110
1111 bbar = busno_to_bbar(dev->bus->number);
1112 ret = calgary_setup_tar(dev, bbar);
1113 if (ret)
1114 goto done;
1115
1116 pci_dev_get(dev);
1117
1118 if (dev->bus->parent) {
1119 if (dev->bus->parent->self)
1120 printk(KERN_WARNING "Calgary: IEEEE, dev %p has "
1121 "bus->parent->self!\n", dev);
1122 dev->bus->parent->self = dev;
1123 } else
1124 dev->bus->self = dev;
1125
1126 tbl = pci_iommu(dev->bus);
1127 tbl->chip_ops->handle_quirks(tbl, dev);
1128
1129 calgary_enable_translation(dev);
1130
1131 return 0;
1132
1133done:
1134 return ret;
1135}
1136
1137static int __init calgary_locate_bbars(void)
1138{
1139 int ret;
1140 int rioidx, phb, bus;
1141 void __iomem *bbar;
1142 void __iomem *target;
1143 unsigned long offset;
1144 u8 start_bus, end_bus;
1145 u32 val;
1146
1147 ret = -ENODATA;
1148 for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) {
1149 struct rio_detail *rio = rio_devs[rioidx];
1150
1151 if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY))
1152 continue;
1153
1154 /* map entire 1MB of Calgary config space */
1155 bbar = ioremap_nocache(rio->BBAR, 1024 * 1024);
1156 if (!bbar)
1157 goto error;
1158
1159 for (phb = 0; phb < PHBS_PER_CALGARY; phb++) {
1160 offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET;
1161 target = calgary_reg(bbar, offset);
1162
1163 val = be32_to_cpu(readl(target));
1164
1165 start_bus = (u8)((val & 0x00FF0000) >> 16);
1166 end_bus = (u8)((val & 0x0000FF00) >> 8);
1167
1168 if (end_bus) {
1169 for (bus = start_bus; bus <= end_bus; bus++) {
1170 bus_info[bus].bbar = bbar;
1171 bus_info[bus].phbid = phb;
1172 }
1173 } else {
1174 bus_info[start_bus].bbar = bbar;
1175 bus_info[start_bus].phbid = phb;
1176 }
1177 }
1178 }
1179
1180 return 0;
1181
1182error:
1183 /* scan bus_info and iounmap any bbars we previously ioremap'd */
1184 for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++)
1185 if (bus_info[bus].bbar)
1186 iounmap(bus_info[bus].bbar);
1187
1188 return ret;
1189}
1190
1191static int __init calgary_init(void)
1192{
1193 int ret;
1194 struct pci_dev *dev = NULL;
1195 void *tce_space;
1196
1197 ret = calgary_locate_bbars();
1198 if (ret)
1199 return ret;
1200
1201 do {
1202 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
1203 if (!dev)
1204 break;
1205 if (!is_cal_pci_dev(dev->device))
1206 continue;
1207 if (!translate_phb(dev)) {
1208 calgary_init_one_nontraslated(dev);
1209 continue;
1210 }
1211 tce_space = bus_info[dev->bus->number].tce_space;
1212 if (!tce_space && !translate_empty_slots)
1213 continue;
1214
1215 ret = calgary_init_one(dev);
1216 if (ret)
1217 goto error;
1218 } while (1);
1219
1220 return ret;
1221
1222error:
1223 do {
1224 dev = pci_get_device_reverse(PCI_VENDOR_ID_IBM,
1225 PCI_ANY_ID, dev);
1226 if (!dev)
1227 break;
1228 if (!is_cal_pci_dev(dev->device))
1229 continue;
1230 if (!translate_phb(dev)) {
1231 pci_dev_put(dev);
1232 continue;
1233 }
1234 if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots)
1235 continue;
1236
1237 calgary_disable_translation(dev);
1238 calgary_free_bus(dev);
1239 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
1240 } while (1);
1241
1242 return ret;
1243}
1244
1245static inline int __init determine_tce_table_size(u64 ram)
1246{
1247 int ret;
1248
1249 if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
1250 return specified_table_size;
1251
1252 /*
1253 * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
1254 * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
1255 * larger table size has twice as many entries, so shift the
1256 * max ram address by 13 to divide by 8K and then look at the
1257 * order of the result to choose between 0-7.
1258 */
1259 ret = get_order(ram >> 13);
1260 if (ret > TCE_TABLE_SIZE_8M)
1261 ret = TCE_TABLE_SIZE_8M;
1262
1263 return ret;
1264}
1265
1266static int __init build_detail_arrays(void)
1267{
1268 unsigned long ptr;
1269 int i, scal_detail_size, rio_detail_size;
1270
1271 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
1272 printk(KERN_WARNING
1273 "Calgary: MAX_NUMNODES too low! Defined as %d, "
1274 "but system has %d nodes.\n",
1275 MAX_NUMNODES, rio_table_hdr->num_scal_dev);
1276 return -ENODEV;
1277 }
1278
1279 switch (rio_table_hdr->version){
1280 case 2:
1281 scal_detail_size = 11;
1282 rio_detail_size = 13;
1283 break;
1284 case 3:
1285 scal_detail_size = 12;
1286 rio_detail_size = 15;
1287 break;
1288 default:
1289 printk(KERN_WARNING
1290 "Calgary: Invalid Rio Grande Table Version: %d\n",
1291 rio_table_hdr->version);
1292 return -EPROTO;
1293 }
1294
1295 ptr = ((unsigned long)rio_table_hdr) + 3;
1296 for (i = 0; i < rio_table_hdr->num_scal_dev;
1297 i++, ptr += scal_detail_size)
1298 scal_devs[i] = (struct scal_detail *)ptr;
1299
1300 for (i = 0; i < rio_table_hdr->num_rio_dev;
1301 i++, ptr += rio_detail_size)
1302 rio_devs[i] = (struct rio_detail *)ptr;
1303
1304 return 0;
1305}
1306
1307static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1308{
1309 int dev;
1310 u32 val;
1311
1312 if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
1313 /*
1314 * FIXME: properly scan for devices accross the
1315 * PCI-to-PCI bridge on every CalIOC2 port.
1316 */
1317 return 1;
1318 }
1319
1320 for (dev = 1; dev < 8; dev++) {
1321 val = read_pci_config(bus, dev, 0, 0);
1322 if (val != 0xffffffff)
1323 break;
1324 }
1325 return (val != 0xffffffff);
1326}
1327
1328void __init detect_calgary(void)
1329{
1330 int bus;
1331 void *tbl;
1332 int calgary_found = 0;
1333 unsigned long ptr;
1334 unsigned int offset, prev_offset;
1335 int ret;
1336
1337 /*
1338 * if the user specified iommu=off or iommu=soft or we found
1339 * another HW IOMMU already, bail out.
1340 */
1341 if (swiotlb || no_iommu || iommu_detected)
1342 return;
1343
1344 if (!use_calgary)
1345 return;
1346
1347 if (!early_pci_allowed())
1348 return;
1349
1350 printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
1351
1352 ptr = (unsigned long)phys_to_virt(get_bios_ebda());
1353
1354 rio_table_hdr = NULL;
1355 prev_offset = 0;
1356 offset = 0x180;
1357 /*
1358 * The next offset is stored in the 1st word.
1359 * Only parse up until the offset increases:
1360 */
1361 while (offset > prev_offset) {
1362 /* The block id is stored in the 2nd word */
1363 if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
1364 /* set the pointer past the offset & block id */
1365 rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
1366 break;
1367 }
1368 prev_offset = offset;
1369 offset = *((unsigned short *)(ptr + offset));
1370 }
1371 if (!rio_table_hdr) {
1372 printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
1373 "in EBDA - bailing!\n");
1374 return;
1375 }
1376
1377 ret = build_detail_arrays();
1378 if (ret) {
1379 printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
1380 return;
1381 }
1382
1383 specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
1384
1385 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1386 struct calgary_bus_info *info = &bus_info[bus];
1387 unsigned short pci_device;
1388 u32 val;
1389
1390 val = read_pci_config(bus, 0, 0, 0);
1391 pci_device = (val & 0xFFFF0000) >> 16;
1392
1393 if (!is_cal_pci_dev(pci_device))
1394 continue;
1395
1396 if (info->translation_disabled)
1397 continue;
1398
1399 if (calgary_bus_has_devices(bus, pci_device) ||
1400 translate_empty_slots) {
1401 tbl = alloc_tce_table();
1402 if (!tbl)
1403 goto cleanup;
1404 info->tce_space = tbl;
1405 calgary_found = 1;
1406 }
1407 }
1408
1409 printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n",
1410 calgary_found ? "found" : "not found");
1411
1412 if (calgary_found) {
1413 iommu_detected = 1;
1414 calgary_detected = 1;
1415 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
1416 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
1417 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
1418 debugging ? "enabled" : "disabled");
1419 }
1420 return;
1421
1422cleanup:
1423 for (--bus; bus >= 0; --bus) {
1424 struct calgary_bus_info *info = &bus_info[bus];
1425
1426 if (info->tce_space)
1427 free_tce_table(info->tce_space);
1428 }
1429}
1430
1431int __init calgary_iommu_init(void)
1432{
1433 int ret;
1434
1435 if (no_iommu || swiotlb)
1436 return -ENODEV;
1437
1438 if (!calgary_detected)
1439 return -ENODEV;
1440
1441 /* ok, we're trying to use Calgary - let's roll */
1442 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1443
1444 ret = calgary_init();
1445 if (ret) {
1446 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1447 "falling back to no_iommu\n", ret);
1448 if (end_pfn > MAX_DMA32_PFN)
1449 printk(KERN_ERR "WARNING more than 4GB of memory, "
1450 "32bit PCI may malfunction.\n");
1451 return ret;
1452 }
1453
1454 force_iommu = 1;
1455 bad_dma_address = 0x0;
1456 dma_ops = &calgary_dma_ops;
1457
1458 return 0;
1459}
1460
1461static int __init calgary_parse_options(char *p)
1462{
1463 unsigned int bridge;
1464 size_t len;
1465 char* endp;
1466
1467 while (*p) {
1468 if (!strncmp(p, "64k", 3))
1469 specified_table_size = TCE_TABLE_SIZE_64K;
1470 else if (!strncmp(p, "128k", 4))
1471 specified_table_size = TCE_TABLE_SIZE_128K;
1472 else if (!strncmp(p, "256k", 4))
1473 specified_table_size = TCE_TABLE_SIZE_256K;
1474 else if (!strncmp(p, "512k", 4))
1475 specified_table_size = TCE_TABLE_SIZE_512K;
1476 else if (!strncmp(p, "1M", 2))
1477 specified_table_size = TCE_TABLE_SIZE_1M;
1478 else if (!strncmp(p, "2M", 2))
1479 specified_table_size = TCE_TABLE_SIZE_2M;
1480 else if (!strncmp(p, "4M", 2))
1481 specified_table_size = TCE_TABLE_SIZE_4M;
1482 else if (!strncmp(p, "8M", 2))
1483 specified_table_size = TCE_TABLE_SIZE_8M;
1484
1485 len = strlen("translate_empty_slots");
1486 if (!strncmp(p, "translate_empty_slots", len))
1487 translate_empty_slots = 1;
1488
1489 len = strlen("disable");
1490 if (!strncmp(p, "disable", len)) {
1491 p += len;
1492 if (*p == '=')
1493 ++p;
1494 if (*p == '\0')
1495 break;
1496 bridge = simple_strtol(p, &endp, 0);
1497 if (p == endp)
1498 break;
1499
1500 if (bridge < MAX_PHB_BUS_NUM) {
1501 printk(KERN_INFO "Calgary: disabling "
1502 "translation for PHB %#x\n", bridge);
1503 bus_info[bridge].translation_disabled = 1;
1504 }
1505 }
1506
1507 p = strpbrk(p, ",");
1508 if (!p)
1509 break;
1510
1511 p++; /* skip ',' */
1512 }
1513 return 1;
1514}
1515__setup("calgary=", calgary_parse_options);
1516
1517static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
1518{
1519 struct iommu_table *tbl;
1520 unsigned int npages;
1521 int i;
1522
1523 tbl = pci_iommu(dev->bus);
1524
1525 for (i = 0; i < 4; i++) {
1526 struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i];
1527
1528 /* Don't give out TCEs that map MEM resources */
1529 if (!(r->flags & IORESOURCE_MEM))
1530 continue;
1531
1532 /* 0-based? we reserve the whole 1st MB anyway */
1533 if (!r->start)
1534 continue;
1535
1536 /* cover the whole region */
1537 npages = (r->end - r->start) >> PAGE_SHIFT;
1538 npages++;
1539
1540 iommu_range_reserve(tbl, r->start, npages);
1541 }
1542}
1543
1544static int __init calgary_fixup_tce_spaces(void)
1545{
1546 struct pci_dev *dev = NULL;
1547 void *tce_space;
1548
1549 if (no_iommu || swiotlb || !calgary_detected)
1550 return -ENODEV;
1551
1552 printk(KERN_DEBUG "Calgary: fixing up tce spaces\n");
1553
1554 do {
1555 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
1556 if (!dev)
1557 break;
1558 if (!is_cal_pci_dev(dev->device))
1559 continue;
1560 if (!translate_phb(dev))
1561 continue;
1562
1563 tce_space = bus_info[dev->bus->number].tce_space;
1564 if (!tce_space)
1565 continue;
1566
1567 calgary_fixup_one_tce_space(dev);
1568
1569 } while (1);
1570
1571 return 0;
1572}
1573
1574/*
1575 * We need to be called after pcibios_assign_resources (fs_initcall level)
1576 * and before device_initcall.
1577 */
1578rootfs_initcall(calgary_fixup_tce_spaces);
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c
new file mode 100644
index 000000000000..29711445c818
--- /dev/null
+++ b/arch/x86/kernel/pci-dma_64.c
@@ -0,0 +1,346 @@
1/*
2 * Dynamic DMA mapping support.
3 */
4
5#include <linux/types.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/pci.h>
9#include <linux/module.h>
10#include <asm/io.h>
11#include <asm/iommu.h>
12#include <asm/calgary.h>
13
/* Set to 1 on entry to iommu_setup(); cleared by "noforce" and "nomerge". */
int iommu_merge __read_mostly = 0;
EXPORT_SYMBOL(iommu_merge);

/* dma_alloc_coherent() treats a map_simple() result equal to this as failure. */
dma_addr_t bad_dma_address __read_mostly;
EXPORT_SYMBOL(bad_dma_address);

/* This tells the BIO block layer to assume merging. Default to off
   because we cannot guarantee merging later. */
int iommu_bio_merge __read_mostly = 0;
EXPORT_SYMBOL(iommu_bio_merge);

/* Set by "iommu=forcesac"; makes dma_supported() reject masks >= 40 bits. */
static int iommu_sac_force __read_mostly = 0;

/* Set by "iommu=off". */
int no_iommu __read_mostly;
/* With CONFIG_IOMMU_DEBUG both overflow panics and forced remapping
   default to on; "iommu=" options can still change them. */
#ifdef CONFIG_IOMMU_DEBUG
int panic_on_overflow __read_mostly = 1;
int force_iommu __read_mostly = 1;
#else
int panic_on_overflow __read_mostly = 0;
int force_iommu __read_mostly= 0;
#endif

/* Set this to 1 if there is a HW IOMMU in the system */
int iommu_detected __read_mostly = 0;

/* Dummy device used for NULL arguments (normally ISA). Better would
   be probably a smaller DMA mask, but this is bug-to-bug compatible
   to i386. */
struct device fallback_dev = {
	.bus_id = "fallback device",
	.coherent_dma_mask = DMA_32BIT_MASK,
	/* the streaming mask simply aliases the coherent mask above */
	.dma_mask = &fallback_dev.coherent_dma_mask,
};
47
48/* Allocate DMA memory on node near device */
49noinline static void *
50dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
51{
52 struct page *page;
53 int node;
54#ifdef CONFIG_PCI
55 if (dev->bus == &pci_bus_type)
56 node = pcibus_to_node(to_pci_dev(dev)->bus);
57 else
58#endif
59 node = numa_node_id();
60
61 if (node < first_node(node_online_map))
62 node = first_node(node_online_map);
63
64 page = alloc_pages_node(node, gfp, order);
65 return page ? page_address(page) : NULL;
66}
67
/*
 * Allocate memory for a coherent mapping.
 *
 * @dev:        device the memory is for; NULL selects fallback_dev
 * @size:       number of bytes wanted
 * @dma_handle: receives the bus address the device has to use
 * @gfp:        allocation flags; __GFP_NORETRY and zone bits are added here
 *
 * Returns the kernel virtual address of the zeroed buffer, or NULL if
 * the device cannot do DMA or no suitable memory could be obtained.
 * When the pages obtained directly are not usable as they are, the
 * request is handed to dma_ops->alloc_coherent or, failing that, the
 * pages are remapped with dma_ops->map_simple.
 */
void *
dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
		   gfp_t gfp)
{
	void *memory;
	unsigned long dma_mask = 0;
	u64 bus;

	if (!dev)
		dev = &fallback_dev;
	dma_mask = dev->coherent_dma_mask;
	if (dma_mask == 0)
		dma_mask = DMA_32BIT_MASK;

	/* Device not DMA able */
	if (dev->dma_mask == NULL)
		return NULL;

	/* Don't invoke OOM killer */
	gfp |= __GFP_NORETRY;

	/* Kludge to make it bug-to-bug compatible with i386. i386
	   uses the normal dma_mask for alloc_coherent. */
	dma_mask &= *dev->dma_mask;

	/* Why <=? Even when the mask is smaller than 4GB it is often
	   larger than 16MB and in this case we have a chance of
	   finding fitting memory in the next higher zone first. If
	   not retry with true GFP_DMA. -AK */
	if (dma_mask <= DMA_32BIT_MASK)
		gfp |= GFP_DMA32;

	/* Re-entered at most once, after switching from GFP_DMA32 to GFP_DMA. */
 again:
	memory = dma_alloc_pages(dev, gfp, get_order(size));
	if (memory == NULL)
		return NULL;

	{
		int high, mmu;
		bus = virt_to_bus(memory);
		/* high: the buffer reaches up to or beyond the mask */
		high = (bus + size) >= dma_mask;
		mmu = high;
		if (force_iommu && !(gfp & GFP_DMA))
			mmu = 1;
		else if (high) {
			/* The pages are unusable for this device: give
			   them back before trying anything else. */
			free_pages((unsigned long)memory,
				   get_order(size));

			/* Don't use the 16MB ZONE_DMA unless absolutely
			   needed. It's better to use remapping first. */
			if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
				gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
				goto again;
			}

			/* Let low level make its own zone decisions */
			gfp &= ~(GFP_DMA32|GFP_DMA);

			if (dma_ops->alloc_coherent)
				return dma_ops->alloc_coherent(dev, size,
							   dma_handle, gfp);
			return NULL;
		}

		memset(memory, 0, size);
		if (!mmu) {
			/* Directly addressable: no remapping needed. */
			*dma_handle = virt_to_bus(memory);
			return memory;
		}
	}

	/* From here on remapping is wanted (mmu is set). */
	if (dma_ops->alloc_coherent) {
		free_pages((unsigned long)memory, get_order(size));
		gfp &= ~(GFP_DMA|GFP_DMA32);
		return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
	}

	if (dma_ops->map_simple) {
		*dma_handle = dma_ops->map_simple(dev, memory,
					      size,
					      PCI_DMA_BIDIRECTIONAL);
		if (*dma_handle != bad_dma_address)
			return memory;
	}

	/* Neither path produced a mapping: panic if asked to, else fail. */
	if (panic_on_overflow)
		panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size);
	free_pages((unsigned long)memory, get_order(size));
	return NULL;
}
161EXPORT_SYMBOL(dma_alloc_coherent);
162
163/*
164 * Unmap coherent memory.
165 * The caller must ensure that the device has finished accessing the mapping.
166 */
167void dma_free_coherent(struct device *dev, size_t size,
168 void *vaddr, dma_addr_t bus)
169{
170 if (dma_ops->unmap_single)
171 dma_ops->unmap_single(dev, bus, size, 0);
172 free_pages((unsigned long)vaddr, get_order(size));
173}
174EXPORT_SYMBOL(dma_free_coherent);
175
176static int forbid_dac __read_mostly;
177
178int dma_supported(struct device *dev, u64 mask)
179{
180#ifdef CONFIG_PCI
181 if (mask > 0xffffffff && forbid_dac > 0) {
182
183
184
185 printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", dev->bus_id);
186 return 0;
187 }
188#endif
189
190 if (dma_ops->dma_supported)
191 return dma_ops->dma_supported(dev, mask);
192
193 /* Copied from i386. Doesn't make much sense, because it will
194 only work for pci_alloc_coherent.
195 The caller just has to use GFP_DMA in this case. */
196 if (mask < DMA_24BIT_MASK)
197 return 0;
198
199 /* Tell the device to use SAC when IOMMU force is on. This
200 allows the driver to use cheaper accesses in some cases.
201
202 Problem with this is that if we overflow the IOMMU area and
203 return DAC as fallback address the device may not handle it
204 correctly.
205
206 As a special case some controllers have a 39bit address
207 mode that is as efficient as 32bit (aic79xx). Don't force
208 SAC for these. Assume all masks <= 40 bits are of this
209 type. Normally this doesn't make any difference, but gives
210 more gentle handling of IOMMU overflow. */
211 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
212 printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
213 return 0;
214 }
215
216 return 1;
217}
218EXPORT_SYMBOL(dma_supported);
219
220int dma_set_mask(struct device *dev, u64 mask)
221{
222 if (!dev->dma_mask || !dma_supported(dev, mask))
223 return -EIO;
224 *dev->dma_mask = mask;
225 return 0;
226}
227EXPORT_SYMBOL(dma_set_mask);
228
229/*
230 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
231 * documentation.
232 */
233__init int iommu_setup(char *p)
234{
235 iommu_merge = 1;
236
237 if (!p)
238 return -EINVAL;
239
240 while (*p) {
241 if (!strncmp(p,"off",3))
242 no_iommu = 1;
243 /* gart_parse_options has more force support */
244 if (!strncmp(p,"force",5))
245 force_iommu = 1;
246 if (!strncmp(p,"noforce",7)) {
247 iommu_merge = 0;
248 force_iommu = 0;
249 }
250
251 if (!strncmp(p, "biomerge",8)) {
252 iommu_bio_merge = 4096;
253 iommu_merge = 1;
254 force_iommu = 1;
255 }
256 if (!strncmp(p, "panic",5))
257 panic_on_overflow = 1;
258 if (!strncmp(p, "nopanic",7))
259 panic_on_overflow = 0;
260 if (!strncmp(p, "merge",5)) {
261 iommu_merge = 1;
262 force_iommu = 1;
263 }
264 if (!strncmp(p, "nomerge",7))
265 iommu_merge = 0;
266 if (!strncmp(p, "forcesac",8))
267 iommu_sac_force = 1;
268 if (!strncmp(p, "allowdac", 8))
269 forbid_dac = 0;
270 if (!strncmp(p, "nodac", 5))
271 forbid_dac = -1;
272
273#ifdef CONFIG_SWIOTLB
274 if (!strncmp(p, "soft",4))
275 swiotlb = 1;
276#endif
277
278#ifdef CONFIG_IOMMU
279 gart_parse_options(p);
280#endif
281
282#ifdef CONFIG_CALGARY_IOMMU
283 if (!strncmp(p, "calgary", 7))
284 use_calgary = 1;
285#endif /* CONFIG_CALGARY_IOMMU */
286
287 p += strcspn(p, ",");
288 if (*p == ',')
289 ++p;
290 }
291 return 0;
292}
293early_param("iommu", iommu_setup);
294
/*
 * Run the early detection/allocation hook of every IOMMU flavour that
 * is configured in: the GART aperture first, then Calgary, then
 * swiotlb.
 */
void __init pci_iommu_alloc(void)
{
	/*
	 * The order of these functions is important for
	 * fall-back/fail-over reasons
	 */
#ifdef CONFIG_IOMMU
	iommu_hole_init();
#endif

#ifdef CONFIG_CALGARY_IOMMU
	detect_calgary();
#endif

#ifdef CONFIG_SWIOTLB
	pci_swiotlb_init();
#endif
}
313
/*
 * Initialize the configured IOMMU implementations: Calgary first, then
 * the GART, and finally no_iommu_init().  The return values of the
 * individual init functions are not looked at here; always returns 0.
 */
static int __init pci_iommu_init(void)
{
#ifdef CONFIG_CALGARY_IOMMU
	calgary_iommu_init();
#endif

#ifdef CONFIG_IOMMU
	gart_iommu_init();
#endif

	/* NOTE(review): presumably only takes effect when no IOMMU
	   installed its dma_ops above - confirm in no_iommu_init(). */
	no_iommu_init();
	return 0;
}
327
/*
 * Shut down the IOMMU.  Only the GART has a shutdown hook here.
 * NOTE(review): unlike the other GART calls in this file this one is
 * not guarded by CONFIG_IOMMU - presumably a stub exists for the
 * !CONFIG_IOMMU case; confirm in the header.
 */
void pci_iommu_shutdown(void)
{
	gart_iommu_shutdown();
}
332
333#ifdef CONFIG_PCI
334/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
335
336static __devinit void via_no_dac(struct pci_dev *dev)
337{
338 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
339 printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
340 forbid_dac = 1;
341 }
342}
343DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
344#endif
345/* Must execute after PCI subsystem */
346fs_initcall(pci_iommu_init);
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
new file mode 100644
index 000000000000..4918c575d582
--- /dev/null
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -0,0 +1,740 @@
1/*
2 * Dynamic DMA mapping support for AMD Hammer.
3 *
4 * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
5 * This allows to use PCI devices that only support 32bit addresses on systems
6 * with more than 4GB.
7 *
8 * See Documentation/DMA-mapping.txt for the interface specification.
9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 */
12
13#include <linux/types.h>
14#include <linux/ctype.h>
15#include <linux/agp_backend.h>
16#include <linux/init.h>
17#include <linux/mm.h>
18#include <linux/string.h>
19#include <linux/spinlock.h>
20#include <linux/pci.h>
21#include <linux/module.h>
22#include <linux/topology.h>
23#include <linux/interrupt.h>
24#include <linux/bitops.h>
25#include <linux/kdebug.h>
26#include <asm/atomic.h>
27#include <asm/io.h>
28#include <asm/mtrr.h>
29#include <asm/pgtable.h>
30#include <asm/proto.h>
31#include <asm/iommu.h>
32#include <asm/cacheflush.h>
33#include <asm/swiotlb.h>
34#include <asm/dma.h>
35#include <asm/k8.h>
36
37unsigned long iommu_bus_base; /* GART remapping area (physical) */
38static unsigned long iommu_size; /* size of remapping area bytes */
39static unsigned long iommu_pages; /* .. and in pages */
40
41u32 *iommu_gatt_base; /* Remapping table */
42
43/* If this is disabled the IOMMU will use an optimized flushing strategy
44 of only flushing when a mapping is reused. With it true the GART is flushed
45 for every mapping. Problem is that doing the lazy flush seems to trigger
46 bugs with some popular PCI cards, in particular 3ware (but has also been
47 seen with Qlogic at least). */
48int iommu_fullflush = 1;
49
50/* Allocation bitmap for the remapping area */
51static DEFINE_SPINLOCK(iommu_bitmap_lock);
52static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */
53
54static u32 gart_unmapped_entry;
55
/* Flag bits or'ed into every GART table entry built by GPTE_ENCODE(). */
#define GPTE_VALID 1
#define GPTE_COHERENT 2
/*
 * Build a 32bit GART table entry from physical address x: bits 12-31
 * of x stay in place, the bits of x above bit 31 are moved down so
 * that they start at bit 4, and both flag bits are set.
 */
#define GPTE_ENCODE(x) \
	(((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
/* Inverse of GPTE_ENCODE(): entry bits 4-11 become address bits 32-39. */
#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))

/*
 * Number of pages touched by an area of 'size' bytes starting at
 * 'addr'; only the part of addr selected by ~PAGE_MASK (presumably the
 * offset within the page) is taken into account.
 */
#define to_pages(addr,size) \
	(round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)

/* gart_unmap_single() ignores addresses within this many pages of the
   start of the remapping area. */
#define EMERGENCY_PAGES 32 /* = 128KB */
66
67#ifdef CONFIG_AGP
68#define AGPEXTERN extern
69#else
70#define AGPEXTERN
71#endif
72
73/* backdoor interface to AGP driver */
74AGPEXTERN int agp_memory_reserved;
75AGPEXTERN __u32 *agp_gatt_table;
76
77static unsigned long next_bit; /* protected by iommu_bitmap_lock */
78static int need_flush; /* global flush state. set for each gart wrap */
79
80static unsigned long alloc_iommu(int size)
81{
82 unsigned long offset, flags;
83
84 spin_lock_irqsave(&iommu_bitmap_lock, flags);
85 offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size);
86 if (offset == -1) {
87 need_flush = 1;
88 offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size);
89 }
90 if (offset != -1) {
91 set_bit_string(iommu_gart_bitmap, offset, size);
92 next_bit = offset+size;
93 if (next_bit >= iommu_pages) {
94 next_bit = 0;
95 need_flush = 1;
96 }
97 }
98 if (iommu_fullflush)
99 need_flush = 1;
100 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
101 return offset;
102}
103
104static void free_iommu(unsigned long offset, int size)
105{
106 unsigned long flags;
107 spin_lock_irqsave(&iommu_bitmap_lock, flags);
108 __clear_bit_string(iommu_gart_bitmap, offset, size);
109 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
110}
111
112/*
113 * Use global flush state to avoid races with multiple flushers.
114 */
115static void flush_gart(void)
116{
117 unsigned long flags;
118 spin_lock_irqsave(&iommu_bitmap_lock, flags);
119 if (need_flush) {
120 k8_flush_garts();
121 need_flush = 0;
122 }
123 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
124}
125
#ifdef CONFIG_IOMMU_LEAK

/* Record / forget the caller that owns GART page x; both do nothing
   while no leak table has been set up. */
#define SET_LEAK(x) if (iommu_leak_tab) \
			iommu_leak_tab[x] = __builtin_return_address(0);
#define CLEAR_LEAK(x) if (iommu_leak_tab) \
			iommu_leak_tab[x] = NULL;

/* Debugging aid for drivers that don't free their IOMMU tables */
static void **iommu_leak_tab;
static int leak_trace;
int iommu_leak_pages = 20;
/*
 * Print, once only, the recorded owners of pages at the end of the
 * IOMMU area together with a stack trace.
 */
void dump_leak(void)
{
	int i;
	static int dump;
	if (dump || !iommu_leak_tab) return;
	dump = 1;
	show_stack(NULL,NULL);
	/* Very crude. dump some from the end of the table too */
	printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages);
	/* NOTE(review): the first iteration (i == 0) reads index
	   iommu_pages, which looks like one past the last page -
	   confirm against the size the table is allocated with. */
	for (i = 0; i < iommu_leak_pages; i+=2) {
		printk("%lu: ", iommu_pages-i);
		printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]);
		/* NOTE(review): i is always even here, so this always
		   prints ' ' - verify whether '\n' was intended. */
		printk("%c", (i+1)%2 == 0 ? '\n' : ' ');
	}
	printk("\n");
}
#else
/* Leak tracking compiled out: the hooks expand to nothing. */
#define SET_LEAK(x)
#define CLEAR_LEAK(x)
#endif
157
158static void iommu_full(struct device *dev, size_t size, int dir)
159{
160 /*
161 * Ran out of IOMMU space for this operation. This is very bad.
162 * Unfortunately the drivers cannot handle this operation properly.
163 * Return some non mapped prereserved space in the aperture and
164 * let the Northbridge deal with it. This will result in garbage
165 * in the IO operation. When the size exceeds the prereserved space
166 * memory corruption will occur or random memory will be DMAed
167 * out. Hopefully no network devices use single mappings that big.
168 */
169
170 printk(KERN_ERR
171 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
172 size, dev->bus_id);
173
174 if (size > PAGE_SIZE*EMERGENCY_PAGES) {
175 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
176 panic("PCI-DMA: Memory would be corrupted\n");
177 if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
178 panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n");
179 }
180
181#ifdef CONFIG_IOMMU_LEAK
182 dump_leak();
183#endif
184}
185
186static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
187{
188 u64 mask = *dev->dma_mask;
189 int high = addr + size > mask;
190 int mmu = high;
191 if (force_iommu)
192 mmu = 1;
193 return mmu;
194}
195
196static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
197{
198 u64 mask = *dev->dma_mask;
199 int high = addr + size > mask;
200 int mmu = high;
201 return mmu;
202}
203
204/* Map a single continuous physical area into the IOMMU.
205 * Caller needs to check if the iommu is needed and flush.
206 */
207static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
208 size_t size, int dir)
209{
210 unsigned long npages = to_pages(phys_mem, size);
211 unsigned long iommu_page = alloc_iommu(npages);
212 int i;
213 if (iommu_page == -1) {
214 if (!nonforced_iommu(dev, phys_mem, size))
215 return phys_mem;
216 if (panic_on_overflow)
217 panic("dma_map_area overflow %lu bytes\n", size);
218 iommu_full(dev, size, dir);
219 return bad_dma_address;
220 }
221
222 for (i = 0; i < npages; i++) {
223 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
224 SET_LEAK(iommu_page + i);
225 phys_mem += PAGE_SIZE;
226 }
227 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
228}
229
230static dma_addr_t gart_map_simple(struct device *dev, char *buf,
231 size_t size, int dir)
232{
233 dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir);
234 flush_gart();
235 return map;
236}
237
238/* Map a single area into the IOMMU */
239static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
240{
241 unsigned long phys_mem, bus;
242
243 if (!dev)
244 dev = &fallback_dev;
245
246 phys_mem = virt_to_phys(addr);
247 if (!need_iommu(dev, phys_mem, size))
248 return phys_mem;
249
250 bus = gart_map_simple(dev, addr, size, dir);
251 return bus;
252}
253
254/*
255 * Free a DMA mapping.
256 */
257static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
258 size_t size, int direction)
259{
260 unsigned long iommu_page;
261 int npages;
262 int i;
263
264 if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
265 dma_addr >= iommu_bus_base + iommu_size)
266 return;
267 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
268 npages = to_pages(dma_addr, size);
269 for (i = 0; i < npages; i++) {
270 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
271 CLEAR_LEAK(iommu_page + i);
272 }
273 free_iommu(iommu_page, npages);
274}
275
276/*
277 * Wrapper for pci_unmap_single working with scatterlists.
278 */
279static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
280{
281 int i;
282
283 for (i = 0; i < nents; i++) {
284 struct scatterlist *s = &sg[i];
285 if (!s->dma_length || !s->length)
286 break;
287 gart_unmap_single(dev, s->dma_address, s->dma_length, dir);
288 }
289}
290
291/* Fallback for dma_map_sg in case of overflow */
292static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
293 int nents, int dir)
294{
295 int i;
296
297#ifdef CONFIG_IOMMU_DEBUG
298 printk(KERN_DEBUG "dma_map_sg overflow\n");
299#endif
300
301 for (i = 0; i < nents; i++ ) {
302 struct scatterlist *s = &sg[i];
303 unsigned long addr = page_to_phys(s->page) + s->offset;
304 if (nonforced_iommu(dev, addr, s->length)) {
305 addr = dma_map_area(dev, addr, s->length, dir);
306 if (addr == bad_dma_address) {
307 if (i > 0)
308 gart_unmap_sg(dev, sg, i, dir);
309 nents = 0;
310 sg[0].dma_length = 0;
311 break;
312 }
313 }
314 s->dma_address = addr;
315 s->dma_length = s->length;
316 }
317 flush_gart();
318 return nents;
319}
320
321/* Map multiple scatterlist entries continuous into the first. */
322static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
323 struct scatterlist *sout, unsigned long pages)
324{
325 unsigned long iommu_start = alloc_iommu(pages);
326 unsigned long iommu_page = iommu_start;
327 int i;
328
329 if (iommu_start == -1)
330 return -1;
331
332 for (i = start; i < stopat; i++) {
333 struct scatterlist *s = &sg[i];
334 unsigned long pages, addr;
335 unsigned long phys_addr = s->dma_address;
336
337 BUG_ON(i > start && s->offset);
338 if (i == start) {
339 *sout = *s;
340 sout->dma_address = iommu_bus_base;
341 sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
342 sout->dma_length = s->length;
343 } else {
344 sout->dma_length += s->length;
345 }
346
347 addr = phys_addr;
348 pages = to_pages(s->offset, s->length);
349 while (pages--) {
350 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
351 SET_LEAK(iommu_page);
352 addr += PAGE_SIZE;
353 iommu_page++;
354 }
355 }
356 BUG_ON(iommu_page - iommu_start != pages);
357 return 0;
358}
359
360static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat,
361 struct scatterlist *sout,
362 unsigned long pages, int need)
363{
364 if (!need) {
365 BUG_ON(stopat - start != 1);
366 *sout = sg[start];
367 sout->dma_length = sg[start].length;
368 return 0;
369 }
370 return __dma_map_cont(sg, start, stopat, sout, pages);
371}
372
373/*
374 * DMA map all entries in a scatterlist.
375 * Merge chunks that have page aligned sizes into a continuous mapping.
376 */
377int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
378{
379 int i;
380 int out;
381 int start;
382 unsigned long pages = 0;
383 int need = 0, nextneed;
384
385 if (nents == 0)
386 return 0;
387
388 if (!dev)
389 dev = &fallback_dev;
390
391 out = 0;
392 start = 0;
393 for (i = 0; i < nents; i++) {
394 struct scatterlist *s = &sg[i];
395 dma_addr_t addr = page_to_phys(s->page) + s->offset;
396 s->dma_address = addr;
397 BUG_ON(s->length == 0);
398
399 nextneed = need_iommu(dev, addr, s->length);
400
401 /* Handle the previous not yet processed entries */
402 if (i > start) {
403 struct scatterlist *ps = &sg[i-1];
404 /* Can only merge when the last chunk ends on a page
405 boundary and the new one doesn't have an offset. */
406 if (!iommu_merge || !nextneed || !need || s->offset ||
407 (ps->offset + ps->length) % PAGE_SIZE) {
408 if (dma_map_cont(sg, start, i, sg+out, pages,
409 need) < 0)
410 goto error;
411 out++;
412 pages = 0;
413 start = i;
414 }
415 }
416
417 need = nextneed;
418 pages += to_pages(s->offset, s->length);
419 }
420 if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
421 goto error;
422 out++;
423 flush_gart();
424 if (out < nents)
425 sg[out].dma_length = 0;
426 return out;
427
428error:
429 flush_gart();
430 gart_unmap_sg(dev, sg, nents, dir);
431 /* When it was forced or merged try again in a dumb way */
432 if (force_iommu || iommu_merge) {
433 out = dma_map_sg_nonforce(dev, sg, nents, dir);
434 if (out > 0)
435 return out;
436 }
437 if (panic_on_overflow)
438 panic("dma_map_sg: overflow on %lu pages\n", pages);
439 iommu_full(dev, pages << PAGE_SHIFT, dir);
440 for (i = 0; i < nents; i++)
441 sg[i].dma_address = bad_dma_address;
442 return 0;
443}
444
/*
 * Non-zero when the AGP driver is not used for the GART.  Set by the
 * "noagp" option in gart_parse_options() and by gart_iommu_init() (always
 * without CONFIG_AGP_AMD64, otherwise when agp_amd64_init() or
 * agp_copy_info() fails).
 */
445static int no_agp;
446
447static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
448{
449 unsigned long a;
450 if (!iommu_size) {
451 iommu_size = aper_size;
452 if (!no_agp)
453 iommu_size /= 2;
454 }
455
456 a = aper + iommu_size;
457 iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a;
458
459 if (iommu_size < 64*1024*1024)
460 printk(KERN_WARNING
461 "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20);
462
463 return iommu_size;
464}
465
466static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
467{
468 unsigned aper_size = 0, aper_base_32;
469 u64 aper_base;
470 unsigned aper_order;
471
472 pci_read_config_dword(dev, 0x94, &aper_base_32);
473 pci_read_config_dword(dev, 0x90, &aper_order);
474 aper_order = (aper_order >> 1) & 7;
475
476 aper_base = aper_base_32 & 0x7fff;
477 aper_base <<= 25;
478
479 aper_size = (32 * 1024 * 1024) << aper_order;
480 if (aper_base + aper_size > 0x100000000UL || !aper_size)
481 aper_base = 0;
482
483 *size = aper_size;
484 return aper_base;
485}
486
487/*
488 * Private Northbridge GATT initialization in case we cannot use the
489 * AGP driver for some reason.
490 */
491static __init int init_k8_gatt(struct agp_kern_info *info)
492{
493 struct pci_dev *dev;
494 void *gatt;
495 unsigned aper_base, new_aper_base;
496 unsigned aper_size, gatt_size, new_aper_size;
497 int i;
498
499 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
500 aper_size = aper_base = info->aper_size = 0;
501 dev = NULL;
502 for (i = 0; i < num_k8_northbridges; i++) {
503 dev = k8_northbridges[i];
504 new_aper_base = read_aperture(dev, &new_aper_size);
505 if (!new_aper_base)
506 goto nommu;
507
508 if (!aper_base) {
509 aper_size = new_aper_size;
510 aper_base = new_aper_base;
511 }
512 if (aper_size != new_aper_size || aper_base != new_aper_base)
513 goto nommu;
514 }
515 if (!aper_base)
516 goto nommu;
517 info->aper_base = aper_base;
518 info->aper_size = aper_size>>20;
519
520 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
521 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size));
522 if (!gatt)
523 panic("Cannot allocate GATT table");
524 if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE))
525 panic("Could not set GART PTEs to uncacheable pages");
526 global_flush_tlb();
527
528 memset(gatt, 0, gatt_size);
529 agp_gatt_table = gatt;
530
531 for (i = 0; i < num_k8_northbridges; i++) {
532 u32 ctl;
533 u32 gatt_reg;
534
535 dev = k8_northbridges[i];
536 gatt_reg = __pa(gatt) >> 12;
537 gatt_reg <<= 4;
538 pci_write_config_dword(dev, 0x98, gatt_reg);
539 pci_read_config_dword(dev, 0x90, &ctl);
540
541 ctl |= 1;
542 ctl &= ~((1<<4) | (1<<5));
543
544 pci_write_config_dword(dev, 0x90, ctl);
545 }
546 flush_gart();
547
548 printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10);
549 return 0;
550
551 nommu:
552 /* Should not happen anymore */
553 printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
554 KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n");
555 return -1;
556}
557
558extern int agp_amd64_init(void);
559
560static const struct dma_mapping_ops gart_dma_ops = {
561 .mapping_error = NULL,
562 .map_single = gart_map_single,
563 .map_simple = gart_map_simple,
564 .unmap_single = gart_unmap_single,
565 .sync_single_for_cpu = NULL,
566 .sync_single_for_device = NULL,
567 .sync_single_range_for_cpu = NULL,
568 .sync_single_range_for_device = NULL,
569 .sync_sg_for_cpu = NULL,
570 .sync_sg_for_device = NULL,
571 .map_sg = gart_map_sg,
572 .unmap_sg = gart_unmap_sg,
573};
574
575void gart_iommu_shutdown(void)
576{
577 struct pci_dev *dev;
578 int i;
579
580 if (no_agp && (dma_ops != &gart_dma_ops))
581 return;
582
583 for (i = 0; i < num_k8_northbridges; i++) {
584 u32 ctl;
585
586 dev = k8_northbridges[i];
587 pci_read_config_dword(dev, 0x90, &ctl);
588
589 ctl &= ~1;
590
591 pci_write_config_dword(dev, 0x90, ctl);
592 }
593}
594
595void __init gart_iommu_init(void)
596{
597 struct agp_kern_info info;
598 unsigned long aper_size;
599 unsigned long iommu_start;
600 unsigned long scratch;
601 long i;
602
603 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) {
604 printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n");
605 return;
606 }
607
608#ifndef CONFIG_AGP_AMD64
609 no_agp = 1;
610#else
611 /* Makefile puts PCI initialization via subsys_initcall first. */
612 /* Add other K8 AGP bridge drivers here */
613 no_agp = no_agp ||
614 (agp_amd64_init() < 0) ||
615 (agp_copy_info(agp_bridge, &info) < 0);
616#endif
617
618 if (swiotlb)
619 return;
620
621 /* Did we detect a different HW IOMMU? */
622 if (iommu_detected && !iommu_aperture)
623 return;
624
625 if (no_iommu ||
626 (!force_iommu && end_pfn <= MAX_DMA32_PFN) ||
627 !iommu_aperture ||
628 (no_agp && init_k8_gatt(&info) < 0)) {
629 if (end_pfn > MAX_DMA32_PFN) {
630 printk(KERN_ERR "WARNING more than 4GB of memory "
631 "but GART IOMMU not available.\n"
632 KERN_ERR "WARNING 32bit PCI may malfunction.\n");
633 }
634 return;
635 }
636
637 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
638 aper_size = info.aper_size * 1024 * 1024;
639 iommu_size = check_iommu_size(info.aper_base, aper_size);
640 iommu_pages = iommu_size >> PAGE_SHIFT;
641
642 iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL,
643 get_order(iommu_pages/8));
644 if (!iommu_gart_bitmap)
645 panic("Cannot allocate iommu bitmap\n");
646 memset(iommu_gart_bitmap, 0, iommu_pages/8);
647
648#ifdef CONFIG_IOMMU_LEAK
649 if (leak_trace) {
650 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL,
651 get_order(iommu_pages*sizeof(void *)));
652 if (iommu_leak_tab)
653 memset(iommu_leak_tab, 0, iommu_pages * 8);
654 else
655 printk("PCI-DMA: Cannot allocate leak trace area\n");
656 }
657#endif
658
659 /*
660 * Out of IOMMU space handling.
661 * Reserve some invalid pages at the beginning of the GART.
662 */
663 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
664
665 agp_memory_reserved = iommu_size;
666 printk(KERN_INFO
667 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
668 iommu_size>>20);
669
670 iommu_start = aper_size - iommu_size;
671 iommu_bus_base = info.aper_base + iommu_start;
672 bad_dma_address = iommu_bus_base;
673 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
674
675 /*
676 * Unmap the IOMMU part of the GART. The alias of the page is
677 * always mapped with cache enabled and there is no full cache
678 * coherency across the GART remapping. The unmapping avoids
679 * automatic prefetches from the CPU allocating cache lines in
680 * there. All CPU accesses are done via the direct mapping to
681 * the backing memory. The GART address is only used by PCI
682 * devices.
683 */
684 clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size);
685
686 /*
687 * Try to workaround a bug (thanks to BenH)
688 * Set unmapped entries to a scratch page instead of 0.
689 * Any prefetches that hit unmapped entries won't get an bus abort
690 * then.
691 */
692 scratch = get_zeroed_page(GFP_KERNEL);
693 if (!scratch)
694 panic("Cannot allocate iommu scratch page");
695 gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
696 for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
697 iommu_gatt_base[i] = gart_unmapped_entry;
698
699 flush_gart();
700 dma_ops = &gart_dma_ops;
701}
702
703void __init gart_parse_options(char *p)
704{
705 int arg;
706
707#ifdef CONFIG_IOMMU_LEAK
708 if (!strncmp(p,"leak",4)) {
709 leak_trace = 1;
710 p += 4;
711 if (*p == '=') ++p;
712 if (isdigit(*p) && get_option(&p, &arg))
713 iommu_leak_pages = arg;
714 }
715#endif
716 if (isdigit(*p) && get_option(&p, &arg))
717 iommu_size = arg;
718 if (!strncmp(p, "fullflush",8))
719 iommu_fullflush = 1;
720 if (!strncmp(p, "nofullflush",11))
721 iommu_fullflush = 0;
722 if (!strncmp(p,"noagp",5))
723 no_agp = 1;
724 if (!strncmp(p, "noaperture",10))
725 fix_aperture = 0;
726 /* duplicated from pci-dma.c */
727 if (!strncmp(p,"force",5))
728 iommu_aperture_allowed = 1;
729 if (!strncmp(p,"allowed",7))
730 iommu_aperture_allowed = 1;
731 if (!strncmp(p, "memaper", 7)) {
732 fallback_aper_force = 1;
733 p += 7;
734 if (*p == '=') {
735 ++p;
736 if (get_option(&p, &arg))
737 fallback_aper_order = arg;
738 }
739 }
740}
diff --git a/arch/x86/kernel/pci-nommu_64.c b/arch/x86/kernel/pci-nommu_64.c
new file mode 100644
index 000000000000..2a34c6c025a9
--- /dev/null
+++ b/arch/x86/kernel/pci-nommu_64.c
@@ -0,0 +1,97 @@
1/* Fallback functions when the main IOMMU code is not compiled in. This
2 code is roughly equivalent to i386. */
3#include <linux/mm.h>
4#include <linux/init.h>
5#include <linux/pci.h>
6#include <linux/string.h>
7#include <linux/dma-mapping.h>
8
9#include <asm/iommu.h>
10#include <asm/processor.h>
11#include <asm/dma.h>
12
13static int
14check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
15{
16 if (hwdev && bus + size > *hwdev->dma_mask) {
17 if (*hwdev->dma_mask >= DMA_32BIT_MASK)
18 printk(KERN_ERR
19 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
20 name, (long long)bus, size,
21 (long long)*hwdev->dma_mask);
22 return 0;
23 }
24 return 1;
25}
26
27static dma_addr_t
28nommu_map_single(struct device *hwdev, void *ptr, size_t size,
29 int direction)
30{
31 dma_addr_t bus = virt_to_bus(ptr);
32 if (!check_addr("map_single", hwdev, bus, size))
33 return bad_dma_address;
34 return bus;
35}
36
37static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
38 int direction)
39{
40}
41
42/* Map a set of buffers described by scatterlist in streaming
43 * mode for DMA. This is the scatter-gather version of the
44 * above pci_map_single interface. Here the scatter gather list
45 * elements are each tagged with the appropriate dma address
46 * and length. They are obtained via sg_dma_{address,length}(SG).
47 *
48 * NOTE: An implementation may be able to use a smaller number of
49 * DMA address/length pairs than there are SG table elements.
50 * (for example via virtual mapping capabilities)
51 * The routine returns the number of addr/length pairs actually
52 * used, at most nents.
53 *
54 * Device ownership issues as mentioned above for pci_map_single are
55 * the same here.
56 */
57static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
58 int nents, int direction)
59{
60 int i;
61
62 for (i = 0; i < nents; i++ ) {
63 struct scatterlist *s = &sg[i];
64 BUG_ON(!s->page);
65 s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
66 if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
67 return 0;
68 s->dma_length = s->length;
69 }
70 return nents;
71}
72
/* Unmap a set of streaming mode DMA translations.
 * Again, cpu read rules concerning calls here are the same as for
 * pci_unmap_single().  This implementation intentionally does nothing.
 */
static void nommu_unmap_sg(struct device *dev, struct scatterlist *sg,
			   int nents, int dir)
{
}
81
82const struct dma_mapping_ops nommu_dma_ops = {
83 .map_single = nommu_map_single,
84 .unmap_single = nommu_unmap_single,
85 .map_sg = nommu_map_sg,
86 .unmap_sg = nommu_unmap_sg,
87 .is_phys = 1,
88};
89
90void __init no_iommu_init(void)
91{
92 if (dma_ops)
93 return;
94
95 force_iommu = 0; /* no HW IOMMU */
96 dma_ops = &nommu_dma_ops;
97}
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
new file mode 100644
index 000000000000..b2f405ea7c85
--- /dev/null
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -0,0 +1,44 @@
1/* Glue code to lib/swiotlb.c */
2
3#include <linux/pci.h>
4#include <linux/cache.h>
5#include <linux/module.h>
6#include <linux/dma-mapping.h>
7
8#include <asm/iommu.h>
9#include <asm/swiotlb.h>
10#include <asm/dma.h>
11
12int swiotlb __read_mostly;
13EXPORT_SYMBOL(swiotlb);
14
15const struct dma_mapping_ops swiotlb_dma_ops = {
16 .mapping_error = swiotlb_dma_mapping_error,
17 .alloc_coherent = swiotlb_alloc_coherent,
18 .free_coherent = swiotlb_free_coherent,
19 .map_single = swiotlb_map_single,
20 .unmap_single = swiotlb_unmap_single,
21 .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
22 .sync_single_for_device = swiotlb_sync_single_for_device,
23 .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
24 .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
25 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
26 .sync_sg_for_device = swiotlb_sync_sg_for_device,
27 .map_sg = swiotlb_map_sg,
28 .unmap_sg = swiotlb_unmap_sg,
29 .dma_supported = NULL,
30};
31
32void __init pci_swiotlb_init(void)
33{
34 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
35 if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN)
36 swiotlb = 1;
37 if (swiotlb_force)
38 swiotlb = 1;
39 if (swiotlb) {
40 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
41 swiotlb_init();
42 dma_ops = &swiotlb_dma_ops;
43 }
44}
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
new file mode 100644
index 000000000000..ae8f91214f15
--- /dev/null
+++ b/arch/x86/kernel/pmtimer_64.c
@@ -0,0 +1,69 @@
1/* Ported over from i386 by AK, original copyright was:
2 *
3 * (C) Dominik Brodowski <linux@brodo.de> 2003
4 *
5 * Driver to use the Power Management Timer (PMTMR) available in some
6 * southbridges as primary timing source for the Linux kernel.
7 *
8 * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
9 * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
10 *
11 * This file is licensed under the GPL v2.
12 *
13 * Dropped all the hardware bug workarounds for now. Hopefully they
14 * are not needed on 64bit chipsets.
15 */
16
17#include <linux/jiffies.h>
18#include <linux/kernel.h>
19#include <linux/time.h>
20#include <linux/init.h>
21#include <linux/cpumask.h>
22#include <asm/io.h>
23#include <asm/proto.h>
24#include <asm/msr.h>
25#include <asm/vsyscall.h>
26
27#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
28
/*
 * Convert Power Management Timer ticks to microseconds.
 *
 * The timer runs at 3.579545 ticks per microsecond, so one tick lasts
 * 1 / 3.579545 == 0.27936511 us, approximated here as 286/1024
 * [error: 0.024%].
 *
 * Even with HZ = 100 a delta is at most 35796 ticks, so the
 * multiplication by 286 (=0x11E) stays far away from overflowing a u32.
 */
static inline u32 cyc2us(u32 cycles)
{
	return (cycles * 286) >> 10;
}
41
42static unsigned pmtimer_wait_tick(void)
43{
44 u32 a, b;
45 for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
46 a == b;
47 b = inl(pmtmr_ioport) & ACPI_PM_MASK)
48 cpu_relax();
49 return b;
50}
51
52/* note: wait time is rounded up to one tick */
53void pmtimer_wait(unsigned us)
54{
55 u32 a, b;
56 a = pmtimer_wait_tick();
57 do {
58 b = inl(pmtmr_ioport);
59 cpu_relax();
60 } while (cyc2us(b - a) < us);
61}
62
63static int __init nopmtimer_setup(char *s)
64{
65 pmtmr_ioport = 0;
66 return 1;
67}
68
69__setup("nopmtimer", nopmtimer_setup);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
new file mode 100644
index 000000000000..98956555450b
--- /dev/null
+++ b/arch/x86/kernel/process_64.c
@@ -0,0 +1,903 @@
1/*
2 * linux/arch/x86-64/kernel/process.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 *
9 * X86-64 port
10 * Andi Kleen.
11 *
12 * CPU hotplug support - ashok.raj@intel.com
13 */
14
15/*
16 * This file handles the architecture-dependent parts of process handling..
17 */
18
19#include <stdarg.h>
20
21#include <linux/cpu.h>
22#include <linux/errno.h>
23#include <linux/sched.h>
24#include <linux/kernel.h>
25#include <linux/mm.h>
26#include <linux/fs.h>
27#include <linux/elfcore.h>
28#include <linux/smp.h>
29#include <linux/slab.h>
30#include <linux/user.h>
31#include <linux/module.h>
32#include <linux/a.out.h>
33#include <linux/interrupt.h>
34#include <linux/delay.h>
35#include <linux/ptrace.h>
36#include <linux/utsname.h>
37#include <linux/random.h>
38#include <linux/notifier.h>
39#include <linux/kprobes.h>
40#include <linux/kdebug.h>
41
42#include <asm/uaccess.h>
43#include <asm/pgtable.h>
44#include <asm/system.h>
45#include <asm/io.h>
46#include <asm/processor.h>
47#include <asm/i387.h>
48#include <asm/mmu_context.h>
49#include <asm/pda.h>
50#include <asm/prctl.h>
51#include <asm/desc.h>
52#include <asm/proto.h>
53#include <asm/ia32.h>
54#include <asm/idle.h>
55
56asmlinkage extern void ret_from_fork(void);
57
58unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
59
60unsigned long boot_option_idle_override = 0;
61EXPORT_SYMBOL(boot_option_idle_override);
62
63/*
64 * Powermanagement idle function, if any..
65 */
66void (*pm_idle)(void);
67EXPORT_SYMBOL(pm_idle);
68static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
69
70static ATOMIC_NOTIFIER_HEAD(idle_notifier);
71
72void idle_notifier_register(struct notifier_block *n)
73{
74 atomic_notifier_chain_register(&idle_notifier, n);
75}
76EXPORT_SYMBOL_GPL(idle_notifier_register);
77
78void idle_notifier_unregister(struct notifier_block *n)
79{
80 atomic_notifier_chain_unregister(&idle_notifier, n);
81}
82EXPORT_SYMBOL(idle_notifier_unregister);
83
84void enter_idle(void)
85{
86 write_pda(isidle, 1);
87 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
88}
89
90static void __exit_idle(void)
91{
92 if (test_and_clear_bit_pda(0, isidle) == 0)
93 return;
94 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
95}
96
97/* Called from interrupts to signify idle end */
98void exit_idle(void)
99{
100 /* idle loop has pid 0 */
101 if (current->pid)
102 return;
103 __exit_idle();
104}
105
106/*
107 * We use this if we don't have any better
108 * idle routine..
109 */
110static void default_idle(void)
111{
112 current_thread_info()->status &= ~TS_POLLING;
113 /*
114 * TS_POLLING-cleared state must be visible before we
115 * test NEED_RESCHED:
116 */
117 smp_mb();
118 local_irq_disable();
119 if (!need_resched()) {
120 /* Enables interrupts one instruction before HLT.
121 x86 special cases this so there is no race. */
122 safe_halt();
123 } else
124 local_irq_enable();
125 current_thread_info()->status |= TS_POLLING;
126}
127
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 *
 * One call enables interrupts and executes a single cpu_relax().
 */
static void poll_idle (void)
{
	local_irq_enable();
	cpu_relax();
}
138
139void cpu_idle_wait(void)
140{
141 unsigned int cpu, this_cpu = get_cpu();
142 cpumask_t map, tmp = current->cpus_allowed;
143
144 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
145 put_cpu();
146
147 cpus_clear(map);
148 for_each_online_cpu(cpu) {
149 per_cpu(cpu_idle_state, cpu) = 1;
150 cpu_set(cpu, map);
151 }
152
153 __get_cpu_var(cpu_idle_state) = 0;
154
155 wmb();
156 do {
157 ssleep(1);
158 for_each_online_cpu(cpu) {
159 if (cpu_isset(cpu, map) &&
160 !per_cpu(cpu_idle_state, cpu))
161 cpu_clear(cpu, map);
162 }
163 cpus_and(map, map, cpu_online_map);
164 } while (!cpus_empty(map));
165
166 set_cpus_allowed(current, tmp);
167}
168EXPORT_SYMBOL_GPL(cpu_idle_wait);
169
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/*
 * We halt the CPU with physical CPU hotplug.
 * Marks this CPU as CPU_DEAD and then halts with interrupts disabled;
 * never returns.
 */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	for (;;)
		halt();
}
#else
/* Built without CONFIG_HOTPLUG_CPU: getting here is a bug. */
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
193
194/*
195 * The idle thread. There's no useful work to be
196 * done, so just try to conserve power and have a
197 * low exit latency (ie sit in a loop waiting for
198 * somebody to say that they'd like to reschedule)
199 */
200void cpu_idle (void)
201{
202 current_thread_info()->status |= TS_POLLING;
203 /* endless idle loop with no priority at all */
204 while (1) {
205 while (!need_resched()) {
206 void (*idle)(void);
207
208 if (__get_cpu_var(cpu_idle_state))
209 __get_cpu_var(cpu_idle_state) = 0;
210
211 rmb();
212 idle = pm_idle;
213 if (!idle)
214 idle = default_idle;
215 if (cpu_is_offline(smp_processor_id()))
216 play_dead();
217 /*
218 * Idle routines should keep interrupts disabled
219 * from here on, until they go to idle.
220 * Otherwise, idle callbacks can misfire.
221 */
222 local_irq_disable();
223 enter_idle();
224 idle();
225 /* In many cases the interrupt that ended idle
226 has already called exit_idle. But some idle
227 loops can be woken up without interrupt. */
228 __exit_idle();
229 }
230
231 preempt_enable_no_resched();
232 schedule();
233 preempt_disable();
234 }
235}
236
237/*
238 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
239 * which can obviate IPI to trigger checking of need_resched.
240 * We execute MONITOR against need_resched and enter optimized wait state
241 * through MWAIT. Whenever someone changes need_resched, we would be woken
242 * up from MWAIT (without an IPI).
243 *
244 * New with Core Duo processors, MWAIT can take some hints based on CPU
245 * capability.
246 */
247void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
248{
249 if (!need_resched()) {
250 __monitor((void *)&current_thread_info()->flags, 0, 0);
251 smp_mb();
252 if (!need_resched())
253 __mwait(eax, ecx);
254 }
255}
256
257/* Default MONITOR/MWAIT with no hints, used for default C1 state */
258static void mwait_idle(void)
259{
260 if (!need_resched()) {
261 __monitor((void *)&current_thread_info()->flags, 0, 0);
262 smp_mb();
263 if (!need_resched())
264 __sti_mwait(0, 0);
265 else
266 local_irq_enable();
267 } else {
268 local_irq_enable();
269 }
270}
271
272void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
273{
274 static int printed;
275 if (cpu_has(c, X86_FEATURE_MWAIT)) {
276 /*
277 * Skip, if setup has overridden idle.
278 * One CPU supports mwait => All CPUs supports mwait
279 */
280 if (!pm_idle) {
281 if (!printed) {
282 printk(KERN_INFO "using mwait in idle threads.\n");
283 printed = 1;
284 }
285 pm_idle = mwait_idle;
286 }
287 }
288}
289
290static int __init idle_setup (char *str)
291{
292 if (!strcmp(str, "poll")) {
293 printk("using polling idle threads.\n");
294 pm_idle = poll_idle;
295 } else if (!strcmp(str, "mwait"))
296 force_mwait = 1;
297 else
298 return -1;
299
300 boot_option_idle_override = 1;
301 return 0;
302}
303early_param("idle", idle_setup);
304
305/* Prints also some state that isn't saved in the pt_regs */
306void __show_regs(struct pt_regs * regs)
307{
308 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
309 unsigned long d0, d1, d2, d3, d6, d7;
310 unsigned int fsindex,gsindex;
311 unsigned int ds,cs,es;
312
313 printk("\n");
314 print_modules();
315 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
316 current->pid, current->comm, print_tainted(),
317 init_utsname()->release,
318 (int)strcspn(init_utsname()->version, " "),
319 init_utsname()->version);
320 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
321 printk_address(regs->rip);
322 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
323 regs->eflags);
324 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
325 regs->rax, regs->rbx, regs->rcx);
326 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
327 regs->rdx, regs->rsi, regs->rdi);
328 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
329 regs->rbp, regs->r8, regs->r9);
330 printk("R10: %016lx R11: %016lx R12: %016lx\n",
331 regs->r10, regs->r11, regs->r12);
332 printk("R13: %016lx R14: %016lx R15: %016lx\n",
333 regs->r13, regs->r14, regs->r15);
334
335 asm("movl %%ds,%0" : "=r" (ds));
336 asm("movl %%cs,%0" : "=r" (cs));
337 asm("movl %%es,%0" : "=r" (es));
338 asm("movl %%fs,%0" : "=r" (fsindex));
339 asm("movl %%gs,%0" : "=r" (gsindex));
340
341 rdmsrl(MSR_FS_BASE, fs);
342 rdmsrl(MSR_GS_BASE, gs);
343 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
344
345 cr0 = read_cr0();
346 cr2 = read_cr2();
347 cr3 = read_cr3();
348 cr4 = read_cr4();
349
350 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
351 fs,fsindex,gs,gsindex,shadowgs);
352 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
353 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
354
355 get_debugreg(d0, 0);
356 get_debugreg(d1, 1);
357 get_debugreg(d2, 2);
358 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
359 get_debugreg(d3, 3);
360 get_debugreg(d6, 6);
361 get_debugreg(d7, 7);
362 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
363}
364
365void show_regs(struct pt_regs *regs)
366{
367 printk("CPU %d:", smp_processor_id());
368 __show_regs(regs);
369 show_trace(NULL, regs, (void *)(regs + 1));
370}
371
372/*
373 * Free current thread data structures etc..
374 */
375void exit_thread(void)
376{
377 struct task_struct *me = current;
378 struct thread_struct *t = &me->thread;
379
380 if (me->thread.io_bitmap_ptr) {
381 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
382
383 kfree(t->io_bitmap_ptr);
384 t->io_bitmap_ptr = NULL;
385 clear_thread_flag(TIF_IO_BITMAP);
386 /*
387 * Careful, clear this in the TSS too:
388 */
389 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
390 t->io_bitmap_max = 0;
391 put_cpu();
392 }
393}
394
395void flush_thread(void)
396{
397 struct task_struct *tsk = current;
398
399 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
400 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
401 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
402 clear_tsk_thread_flag(tsk, TIF_IA32);
403 } else {
404 set_tsk_thread_flag(tsk, TIF_IA32);
405 current_thread_info()->status |= TS_COMPAT;
406 }
407 }
408 clear_tsk_thread_flag(tsk, TIF_DEBUG);
409
410 tsk->thread.debugreg0 = 0;
411 tsk->thread.debugreg1 = 0;
412 tsk->thread.debugreg2 = 0;
413 tsk->thread.debugreg3 = 0;
414 tsk->thread.debugreg6 = 0;
415 tsk->thread.debugreg7 = 0;
416 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
417 /*
418 * Forget coprocessor state..
419 */
420 clear_fpu(tsk);
421 clear_used_math();
422}
423
424void release_thread(struct task_struct *dead_task)
425{
426 if (dead_task->mm) {
427 if (dead_task->mm->context.size) {
428 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
429 dead_task->comm,
430 dead_task->mm->context.ldt,
431 dead_task->mm->context.size);
432 BUG();
433 }
434 }
435}
436
437static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
438{
439 struct user_desc ud = {
440 .base_addr = addr,
441 .limit = 0xfffff,
442 .seg_32bit = 1,
443 .limit_in_pages = 1,
444 .useable = 1,
445 };
446 struct n_desc_struct *desc = (void *)t->thread.tls_array;
447 desc += tls;
448 desc->a = LDT_entry_a(&ud);
449 desc->b = LDT_entry_b(&ud);
450}
451
452static inline u32 read_32bit_tls(struct task_struct *t, int tls)
453{
454 struct desc_struct *desc = (void *)t->thread.tls_array;
455 desc += tls;
456 return desc->base0 |
457 (((u32)desc->base1) << 16) |
458 (((u32)desc->base2) << 24);
459}
460
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.  It calls unlazy_fpu() on @tsk.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
469
470int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
471 unsigned long unused,
472 struct task_struct * p, struct pt_regs * regs)
473{
474 int err;
475 struct pt_regs * childregs;
476 struct task_struct *me = current;
477
478 childregs = ((struct pt_regs *)
479 (THREAD_SIZE + task_stack_page(p))) - 1;
480 *childregs = *regs;
481
482 childregs->rax = 0;
483 childregs->rsp = rsp;
484 if (rsp == ~0UL)
485 childregs->rsp = (unsigned long)childregs;
486
487 p->thread.rsp = (unsigned long) childregs;
488 p->thread.rsp0 = (unsigned long) (childregs+1);
489 p->thread.userrsp = me->thread.userrsp;
490
491 set_tsk_thread_flag(p, TIF_FORK);
492
493 p->thread.fs = me->thread.fs;
494 p->thread.gs = me->thread.gs;
495
496 asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
497 asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
498 asm("mov %%es,%0" : "=m" (p->thread.es));
499 asm("mov %%ds,%0" : "=m" (p->thread.ds));
500
501 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
502 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
503 if (!p->thread.io_bitmap_ptr) {
504 p->thread.io_bitmap_max = 0;
505 return -ENOMEM;
506 }
507 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
508 IO_BITMAP_BYTES);
509 set_tsk_thread_flag(p, TIF_IO_BITMAP);
510 }
511
512 /*
513 * Set a new TLS for the child thread?
514 */
515 if (clone_flags & CLONE_SETTLS) {
516#ifdef CONFIG_IA32_EMULATION
517 if (test_thread_flag(TIF_IA32))
518 err = ia32_child_tls(p, childregs);
519 else
520#endif
521 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
522 if (err)
523 goto out;
524 }
525 err = 0;
526out:
527 if (err && p->thread.io_bitmap_ptr) {
528 kfree(p->thread.io_bitmap_ptr);
529 p->thread.io_bitmap_max = 0;
530 }
531 return err;
532}
533
534/*
535 * This special macro can be used to load a debugging register
536 */
537#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
538
539static inline void __switch_to_xtra(struct task_struct *prev_p,
540 struct task_struct *next_p,
541 struct tss_struct *tss)
542{
543 struct thread_struct *prev, *next;
544
545 prev = &prev_p->thread,
546 next = &next_p->thread;
547
548 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
549 loaddebug(next, 0);
550 loaddebug(next, 1);
551 loaddebug(next, 2);
552 loaddebug(next, 3);
553 /* no 4 and 5 */
554 loaddebug(next, 6);
555 loaddebug(next, 7);
556 }
557
558 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
559 /*
560 * Copy the relevant range of the IO bitmap.
561 * Normally this is 128 bytes or less:
562 */
563 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
564 max(prev->io_bitmap_max, next->io_bitmap_max));
565 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
566 /*
567 * Clear any possible leftover bits:
568 */
569 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
570 }
571}
572
573/*
574 * switch_to(x,y) should switch tasks from x to y.
575 *
576 * This could still be optimized:
577 * - fold all the options into a flag word and test it with a single test.
578 * - could test fs/gs bitsliced
579 *
580 * Kprobes not supported here. Set the probe on schedule instead.
581 */
/*
 * prev_p is the task being switched out, next_p the task being switched in.
 * Updates the per-CPU TSS, the segment registers, the FS/GS bases, the PDA
 * fields and, when flagged, the debug registers and I/O bitmap.
 * Returns prev_p.
 */
582__kprobes struct task_struct *
583__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
584{
585 struct thread_struct *prev = &prev_p->thread,
586 *next = &next_p->thread;
587 int cpu = smp_processor_id();
588 struct tss_struct *tss = &per_cpu(init_tss, cpu);
589
590 /* we're going to use this soon, after a few expensive things */
591 if (next_p->fpu_counter>5)
592 prefetch(&next->i387.fxsave);
593
594 /*
595 * Reload esp0, LDT and the page table pointer:
596 */
 /* NOTE(review): only tss->rsp0 is written by the statement below; the
  * LDT and page table are not touched in this function. */
597 tss->rsp0 = next->rsp0;
598
599 /*
600 * Switch DS and ES.
601 * This won't pick up thread selector changes, but I guess that is ok.
602 */
 /* Save the live selector into prev, then reload only when either the
  * saved or the incoming selector is non-zero. */
603 asm volatile("mov %%es,%0" : "=m" (prev->es));
604 if (unlikely(next->es | prev->es))
605 loadsegment(es, next->es);
606
607 asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
608 if (unlikely(next->ds | prev->ds))
609 loadsegment(ds, next->ds);
610
611 load_TLS(next, cpu);
612
613 /*
614 * Switch FS and GS.
615 */
616 {
617 unsigned fsindex;
618 asm volatile("movl %%fs,%0" : "=r" (fsindex));
619 /* segment register != 0 always requires a reload.
620 also reload when it has changed.
621 when prev process used 64bit base always reload
622 to avoid an information leak. */
623 if (unlikely(fsindex | next->fsindex | prev->fs)) {
624 loadsegment(fs, next->fsindex);
625 /* check if the user used a selector != 0
626 * if yes clear 64bit base, since overloaded base
627 * is always mapped to the Null selector
628 */
629 if (fsindex)
630 prev->fs = 0;
631 }
632 /* when next process has a 64bit base use it */
633 if (next->fs)
634 wrmsrl(MSR_FS_BASE, next->fs);
 /* Remember the selector that was live when prev was switched out. */
635 prev->fsindex = fsindex;
636 }
 /* Same sequence for GS, using load_gs_index() and MSR_KERNEL_GS_BASE. */
637 {
638 unsigned gsindex;
639 asm volatile("movl %%gs,%0" : "=r" (gsindex));
640 if (unlikely(gsindex | next->gsindex | prev->gs)) {
641 load_gs_index(next->gsindex);
642 if (gsindex)
643 prev->gs = 0;
644 }
645 if (next->gs)
646 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
647 prev->gsindex = gsindex;
648 }
649
650 /* Must be after DS reload */
651 unlazy_fpu(prev_p);
652
653 /*
654 * Switch the PDA and FPU contexts.
655 */
 /* Swap the PDA oldrsp slot: save it for prev, install next's value. */
656 prev->userrsp = read_pda(oldrsp);
657 write_pda(oldrsp, next->userrsp);
658 write_pda(pcurrent, next_p);
659
660 write_pda(kernelstack,
661 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
662#ifdef CONFIG_CC_STACKPROTECTOR
663 write_pda(stack_canary, next_p->stack_canary);
664 /*
665 * Build time only check to make sure the stack_canary is at
666 * offset 40 in the pda; this is a gcc ABI requirement
667 */
668 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
669#endif
670
671 /*
672 * Now maybe reload the debug registers and handle I/O bitmaps
673 */
 /* Taken when next has any _TIF_WORK_CTXSW flag set, or when prev has an
  * I/O bitmap (TIF_IO_BITMAP). */
674 if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
675 || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
676 __switch_to_xtra(prev_p, next_p, tss);
677
678 /* If the task has used fpu the last 5 timeslices, just do a full
679 * restore of the math state immediately to avoid the trap; the
680 * chances of needing FPU soon are obviously high now
681 */
682 if (next_p->fpu_counter>5)
683 math_state_restore();
684 return prev_p;
685}
686
687/*
688 * sys_execve() executes a new program.
689 */
690asmlinkage
691long sys_execve(char __user *name, char __user * __user *argv,
692 char __user * __user *envp, struct pt_regs regs)
693{
694 long error;
695 char * filename;
696
697 filename = getname(name);
698 error = PTR_ERR(filename);
699 if (IS_ERR(filename))
700 return error;
701 error = do_execve(filename, argv, envp, &regs);
702 if (error == 0) {
703 task_lock(current);
704 current->ptrace &= ~PT_DTRACE;
705 task_unlock(current);
706 }
707 putname(filename);
708 return error;
709}
710
711void set_personality_64bit(void)
712{
713 /* inherit personality from parent */
714
715 /* Make sure to be in 64bit mode */
716 clear_thread_flag(TIF_IA32);
717
718 /* TBD: overwrites user setup. Should have two bits.
719 But 64bit processes have always behaved this way,
720 so it's not too bad. The main problem is just that
721 32bit childs are affected again. */
722 current->personality &= ~READ_IMPLIES_EXEC;
723}
724
725asmlinkage long sys_fork(struct pt_regs *regs)
726{
727 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
728}
729
730asmlinkage long
731sys_clone(unsigned long clone_flags, unsigned long newsp,
732 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
733{
734 if (!newsp)
735 newsp = regs->rsp;
736 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
737}
738
739/*
740 * This is trivial, and on the face of it looks like it
741 * could equally well be done in user mode.
742 *
743 * Not so, for quite unobvious reasons - register pressure.
744 * In user mode vfork() cannot have a stack frame, and if
745 * done by calling the "clone()" system call directly, you
746 * do not have enough call-clobbered registers to hold all
747 * the information you need.
748 */
749asmlinkage long sys_vfork(struct pt_regs *regs)
750{
751 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
752 NULL, NULL);
753}
754
755unsigned long get_wchan(struct task_struct *p)
756{
757 unsigned long stack;
758 u64 fp,rip;
759 int count = 0;
760
761 if (!p || p == current || p->state==TASK_RUNNING)
762 return 0;
763 stack = (unsigned long)task_stack_page(p);
764 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
765 return 0;
766 fp = *(u64 *)(p->thread.rsp);
767 do {
768 if (fp < (unsigned long)stack ||
769 fp > (unsigned long)stack+THREAD_SIZE)
770 return 0;
771 rip = *(u64 *)(fp+8);
772 if (!in_sched_functions(rip))
773 return rip;
774 fp = *(u64 *)fp;
775 } while (count++ < 16);
776 return 0;
777}
778
779long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
780{
781 int ret = 0;
782 int doit = task == current;
783 int cpu;
784
785 switch (code) {
786 case ARCH_SET_GS:
787 if (addr >= TASK_SIZE_OF(task))
788 return -EPERM;
789 cpu = get_cpu();
790 /* handle small bases via the GDT because that's faster to
791 switch. */
792 if (addr <= 0xffffffff) {
793 set_32bit_tls(task, GS_TLS, addr);
794 if (doit) {
795 load_TLS(&task->thread, cpu);
796 load_gs_index(GS_TLS_SEL);
797 }
798 task->thread.gsindex = GS_TLS_SEL;
799 task->thread.gs = 0;
800 } else {
801 task->thread.gsindex = 0;
802 task->thread.gs = addr;
803 if (doit) {
804 load_gs_index(0);
805 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
806 }
807 }
808 put_cpu();
809 break;
810 case ARCH_SET_FS:
811 /* Not strictly needed for fs, but do it for symmetry
812 with gs */
813 if (addr >= TASK_SIZE_OF(task))
814 return -EPERM;
815 cpu = get_cpu();
816 /* handle small bases via the GDT because that's faster to
817 switch. */
818 if (addr <= 0xffffffff) {
819 set_32bit_tls(task, FS_TLS, addr);
820 if (doit) {
821 load_TLS(&task->thread, cpu);
822 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
823 }
824 task->thread.fsindex = FS_TLS_SEL;
825 task->thread.fs = 0;
826 } else {
827 task->thread.fsindex = 0;
828 task->thread.fs = addr;
829 if (doit) {
830 /* set the selector to 0 to not confuse
831 __switch_to */
832 asm volatile("movl %0,%%fs" :: "r" (0));
833 ret = checking_wrmsrl(MSR_FS_BASE, addr);
834 }
835 }
836 put_cpu();
837 break;
838 case ARCH_GET_FS: {
839 unsigned long base;
840 if (task->thread.fsindex == FS_TLS_SEL)
841 base = read_32bit_tls(task, FS_TLS);
842 else if (doit)
843 rdmsrl(MSR_FS_BASE, base);
844 else
845 base = task->thread.fs;
846 ret = put_user(base, (unsigned long __user *)addr);
847 break;
848 }
849 case ARCH_GET_GS: {
850 unsigned long base;
851 unsigned gsindex;
852 if (task->thread.gsindex == GS_TLS_SEL)
853 base = read_32bit_tls(task, GS_TLS);
854 else if (doit) {
855 asm("movl %%gs,%0" : "=r" (gsindex));
856 if (gsindex)
857 rdmsrl(MSR_KERNEL_GS_BASE, base);
858 else
859 base = task->thread.gs;
860 }
861 else
862 base = task->thread.gs;
863 ret = put_user(base, (unsigned long __user *)addr);
864 break;
865 }
866
867 default:
868 ret = -EINVAL;
869 break;
870 }
871
872 return ret;
873}
874
875long sys_arch_prctl(int code, unsigned long addr)
876{
877 return do_arch_prctl(current, code, addr);
878}
879
880/*
881 * Capture the user space registers if the task is not running (in user space)
882 */
883int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
884{
885 struct pt_regs *pp, ptregs;
886
887 pp = task_pt_regs(tsk);
888
889 ptregs = *pp;
890 ptregs.cs &= 0xffff;
891 ptregs.ss &= 0xffff;
892
893 elf_core_copy_regs(regs, &ptregs);
894
895 return 1;
896}
897
898unsigned long arch_align_stack(unsigned long sp)
899{
900 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
901 sp -= get_random_int() % 8192;
902 return sp & ~0xf;
903}
diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c
new file mode 100644
index 000000000000..eea3702427b4
--- /dev/null
+++ b/arch/x86/kernel/ptrace_64.c
@@ -0,0 +1,627 @@
1/* ptrace.c */
2/* By Ross Biro 1/23/92 */
3/*
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * x86-64 port 2000-2002 Andi Kleen
8 */
9
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/mm.h>
13#include <linux/smp.h>
14#include <linux/errno.h>
15#include <linux/ptrace.h>
16#include <linux/user.h>
17#include <linux/security.h>
18#include <linux/audit.h>
19#include <linux/seccomp.h>
20#include <linux/signal.h>
21
22#include <asm/uaccess.h>
23#include <asm/pgtable.h>
24#include <asm/system.h>
25#include <asm/processor.h>
26#include <asm/i387.h>
27#include <asm/debugreg.h>
28#include <asm/ldt.h>
29#include <asm/desc.h>
30#include <asm/proto.h>
31#include <asm/ia32.h>
32
33/*
34 * does not yet catch signals sent when the child dies.
35 * in exit.c or in signal.c.
36 */
37
38/*
39 * Determines which flags the user has access to [1 = access, 0 = no access].
40 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
41 * Also masks reserved bits (63-22, 15, 5, 3, 1).
42 */
43#define FLAG_MASK 0x54dd5UL
44
45/* sets the trap flag. */
46#define TRAP_FLAG 0x100UL
47
48/*
49 * eflags and offset of eflags on child stack..
50 */
51#define EFLAGS offsetof(struct pt_regs, eflags)
52#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs)))
53
54/*
55 * this routine will get a word off of the processes privileged stack.
56 * the offset is how far from the base addr as stored in the TSS.
57 * this routine assumes that all the privileged stacks are in our
58 * data space.
59 */
60static inline unsigned long get_stack_long(struct task_struct *task, int offset)
61{
62 unsigned char *stack;
63
64 stack = (unsigned char *)task->thread.rsp0;
65 stack += offset;
66 return (*((unsigned long *)stack));
67}
68
69/*
70 * this routine will put a word on the processes privileged stack.
71 * the offset is how far from the base addr as stored in the TSS.
72 * this routine assumes that all the privileged stacks are in our
73 * data space.
74 */
75static inline long put_stack_long(struct task_struct *task, int offset,
76 unsigned long data)
77{
78 unsigned char * stack;
79
80 stack = (unsigned char *) task->thread.rsp0;
81 stack += offset;
82 *(unsigned long *) stack = data;
83 return 0;
84}
85
86#define LDT_SEGMENT 4
87
88unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs)
89{
90 unsigned long addr, seg;
91
92 addr = regs->rip;
93 seg = regs->cs & 0xffff;
94
95 /*
96 * We'll assume that the code segments in the GDT
97 * are all zero-based. That is largely true: the
98 * TLS segments are used for data, and the PNPBIOS
99 * and APM bios ones we just ignore here.
100 */
101 if (seg & LDT_SEGMENT) {
102 u32 *desc;
103 unsigned long base;
104
105 seg &= ~7UL;
106
107 down(&child->mm->context.sem);
108 if (unlikely((seg >> 3) >= child->mm->context.size))
109 addr = -1L; /* bogus selector, access would fault */
110 else {
111 desc = child->mm->context.ldt + seg;
112 base = ((desc[0] >> 16) |
113 ((desc[1] & 0xff) << 16) |
114 (desc[1] & 0xff000000));
115
116 /* 16-bit code segment? */
117 if (!((desc[1] >> 22) & 1))
118 addr &= 0xffff;
119 addr += base;
120 }
121 up(&child->mm->context.sem);
122 }
123
124 return addr;
125}
126
127static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
128{
129 int i, copied;
130 unsigned char opcode[15];
131 unsigned long addr = convert_rip_to_linear(child, regs);
132
133 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
134 for (i = 0; i < copied; i++) {
135 switch (opcode[i]) {
136 /* popf and iret */
137 case 0x9d: case 0xcf:
138 return 1;
139
140 /* CHECKME: 64 65 */
141
142 /* opcode and address size prefixes */
143 case 0x66: case 0x67:
144 continue;
145 /* irrelevant prefixes (segment overrides and repeats) */
146 case 0x26: case 0x2e:
147 case 0x36: case 0x3e:
148 case 0x64: case 0x65:
149 case 0xf2: case 0xf3:
150 continue;
151
152 case 0x40 ... 0x4f:
153 if (regs->cs != __USER_CS)
154 /* 32-bit mode: register increment */
155 return 0;
156 /* 64-bit mode: REX prefix */
157 continue;
158
159 /* CHECKME: f2, f3 */
160
161 /*
162 * pushf: NOTE! We should probably not let
163 * the user see the TF bit being set. But
164 * it's more pain than it's worth to avoid
165 * it, and a debugger could emulate this
166 * all in user space if it _really_ cares.
167 */
168 case 0x9c:
169 default:
170 return 0;
171 }
172 }
173 return 0;
174}
175
176static void set_singlestep(struct task_struct *child)
177{
178 struct pt_regs *regs = task_pt_regs(child);
179
180 /*
181 * Always set TIF_SINGLESTEP - this guarantees that
182 * we single-step system calls etc.. This will also
183 * cause us to set TF when returning to user mode.
184 */
185 set_tsk_thread_flag(child, TIF_SINGLESTEP);
186
187 /*
188 * If TF was already set, don't do anything else
189 */
190 if (regs->eflags & TRAP_FLAG)
191 return;
192
193 /* Set TF on the kernel stack.. */
194 regs->eflags |= TRAP_FLAG;
195
196 /*
197 * ..but if TF is changed by the instruction we will trace,
198 * don't mark it as being "us" that set it, so that we
199 * won't clear it by hand later.
200 */
201 if (is_setting_trap_flag(child, regs))
202 return;
203
204 child->ptrace |= PT_DTRACE;
205}
206
207static void clear_singlestep(struct task_struct *child)
208{
209 /* Always clear TIF_SINGLESTEP... */
210 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
211
212 /* But touch TF only if it was set by us.. */
213 if (child->ptrace & PT_DTRACE) {
214 struct pt_regs *regs = task_pt_regs(child);
215 regs->eflags &= ~TRAP_FLAG;
216 child->ptrace &= ~PT_DTRACE;
217 }
218}
219
/*
 * Called by kernel/ptrace.c when detaching..
 *
 * Ensures that single-stepping is switched off for @child.
 */
void ptrace_disable(struct task_struct *child)
{
	clear_singlestep(child);
}
229
230static int putreg(struct task_struct *child,
231 unsigned long regno, unsigned long value)
232{
233 unsigned long tmp;
234
235 switch (regno) {
236 case offsetof(struct user_regs_struct,fs):
237 if (value && (value & 3) != 3)
238 return -EIO;
239 child->thread.fsindex = value & 0xffff;
240 return 0;
241 case offsetof(struct user_regs_struct,gs):
242 if (value && (value & 3) != 3)
243 return -EIO;
244 child->thread.gsindex = value & 0xffff;
245 return 0;
246 case offsetof(struct user_regs_struct,ds):
247 if (value && (value & 3) != 3)
248 return -EIO;
249 child->thread.ds = value & 0xffff;
250 return 0;
251 case offsetof(struct user_regs_struct,es):
252 if (value && (value & 3) != 3)
253 return -EIO;
254 child->thread.es = value & 0xffff;
255 return 0;
256 case offsetof(struct user_regs_struct,ss):
257 if ((value & 3) != 3)
258 return -EIO;
259 value &= 0xffff;
260 return 0;
261 case offsetof(struct user_regs_struct,fs_base):
262 if (value >= TASK_SIZE_OF(child))
263 return -EIO;
264 child->thread.fs = value;
265 return 0;
266 case offsetof(struct user_regs_struct,gs_base):
267 if (value >= TASK_SIZE_OF(child))
268 return -EIO;
269 child->thread.gs = value;
270 return 0;
271 case offsetof(struct user_regs_struct, eflags):
272 value &= FLAG_MASK;
273 tmp = get_stack_long(child, EFL_OFFSET);
274 tmp &= ~FLAG_MASK;
275 value |= tmp;
276 break;
277 case offsetof(struct user_regs_struct,cs):
278 if ((value & 3) != 3)
279 return -EIO;
280 value &= 0xffff;
281 break;
282 }
283 put_stack_long(child, regno - sizeof(struct pt_regs), value);
284 return 0;
285}
286
287static unsigned long getreg(struct task_struct *child, unsigned long regno)
288{
289 unsigned long val;
290 switch (regno) {
291 case offsetof(struct user_regs_struct, fs):
292 return child->thread.fsindex;
293 case offsetof(struct user_regs_struct, gs):
294 return child->thread.gsindex;
295 case offsetof(struct user_regs_struct, ds):
296 return child->thread.ds;
297 case offsetof(struct user_regs_struct, es):
298 return child->thread.es;
299 case offsetof(struct user_regs_struct, fs_base):
300 return child->thread.fs;
301 case offsetof(struct user_regs_struct, gs_base):
302 return child->thread.gs;
303 default:
304 regno = regno - sizeof(struct pt_regs);
305 val = get_stack_long(child, regno);
306 if (test_tsk_thread_flag(child, TIF_IA32))
307 val &= 0xffffffff;
308 return val;
309 }
310
311}
312
313long arch_ptrace(struct task_struct *child, long request, long addr, long data)
314{
315 long i, ret;
316 unsigned ui;
317
318 switch (request) {
319 /* when I and D space are separate, these will need to be fixed. */
320 case PTRACE_PEEKTEXT: /* read word at location addr. */
321 case PTRACE_PEEKDATA:
322 ret = generic_ptrace_peekdata(child, addr, data);
323 break;
324
325 /* read the word at location addr in the USER area. */
326 case PTRACE_PEEKUSR: {
327 unsigned long tmp;
328
329 ret = -EIO;
330 if ((addr & 7) ||
331 addr > sizeof(struct user) - 7)
332 break;
333
334 switch (addr) {
335 case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
336 tmp = getreg(child, addr);
337 break;
338 case offsetof(struct user, u_debugreg[0]):
339 tmp = child->thread.debugreg0;
340 break;
341 case offsetof(struct user, u_debugreg[1]):
342 tmp = child->thread.debugreg1;
343 break;
344 case offsetof(struct user, u_debugreg[2]):
345 tmp = child->thread.debugreg2;
346 break;
347 case offsetof(struct user, u_debugreg[3]):
348 tmp = child->thread.debugreg3;
349 break;
350 case offsetof(struct user, u_debugreg[6]):
351 tmp = child->thread.debugreg6;
352 break;
353 case offsetof(struct user, u_debugreg[7]):
354 tmp = child->thread.debugreg7;
355 break;
356 default:
357 tmp = 0;
358 break;
359 }
360 ret = put_user(tmp,(unsigned long __user *) data);
361 break;
362 }
363
364 /* when I and D space are separate, this will have to be fixed. */
365 case PTRACE_POKETEXT: /* write the word at location addr. */
366 case PTRACE_POKEDATA:
367 ret = generic_ptrace_pokedata(child, addr, data);
368 break;
369
370 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
371 {
372 int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7;
373 ret = -EIO;
374 if ((addr & 7) ||
375 addr > sizeof(struct user) - 7)
376 break;
377
378 switch (addr) {
379 case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
380 ret = putreg(child, addr, data);
381 break;
382 /* Disallows to set a breakpoint into the vsyscall */
383 case offsetof(struct user, u_debugreg[0]):
384 if (data >= TASK_SIZE_OF(child) - dsize) break;
385 child->thread.debugreg0 = data;
386 ret = 0;
387 break;
388 case offsetof(struct user, u_debugreg[1]):
389 if (data >= TASK_SIZE_OF(child) - dsize) break;
390 child->thread.debugreg1 = data;
391 ret = 0;
392 break;
393 case offsetof(struct user, u_debugreg[2]):
394 if (data >= TASK_SIZE_OF(child) - dsize) break;
395 child->thread.debugreg2 = data;
396 ret = 0;
397 break;
398 case offsetof(struct user, u_debugreg[3]):
399 if (data >= TASK_SIZE_OF(child) - dsize) break;
400 child->thread.debugreg3 = data;
401 ret = 0;
402 break;
403 case offsetof(struct user, u_debugreg[6]):
404 if (data >> 32)
405 break;
406 child->thread.debugreg6 = data;
407 ret = 0;
408 break;
409 case offsetof(struct user, u_debugreg[7]):
410 /* See arch/i386/kernel/ptrace.c for an explanation of
411 * this awkward check.*/
412 data &= ~DR_CONTROL_RESERVED;
413 for(i=0; i<4; i++)
414 if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
415 break;
416 if (i == 4) {
417 child->thread.debugreg7 = data;
418 if (data)
419 set_tsk_thread_flag(child, TIF_DEBUG);
420 else
421 clear_tsk_thread_flag(child, TIF_DEBUG);
422 ret = 0;
423 }
424 break;
425 }
426 break;
427 }
428 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
429 case PTRACE_CONT: /* restart after signal. */
430
431 ret = -EIO;
432 if (!valid_signal(data))
433 break;
434 if (request == PTRACE_SYSCALL)
435 set_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
436 else
437 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
438 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
439 child->exit_code = data;
440 /* make sure the single step bit is not set. */
441 clear_singlestep(child);
442 wake_up_process(child);
443 ret = 0;
444 break;
445
446#ifdef CONFIG_IA32_EMULATION
447 /* This makes only sense with 32bit programs. Allow a
448 64bit debugger to fully examine them too. Better
449 don't use it against 64bit processes, use
450 PTRACE_ARCH_PRCTL instead. */
451 case PTRACE_SET_THREAD_AREA: {
452 struct user_desc __user *p;
453 int old;
454 p = (struct user_desc __user *)data;
455 get_user(old, &p->entry_number);
456 put_user(addr, &p->entry_number);
457 ret = do_set_thread_area(&child->thread, p);
458 put_user(old, &p->entry_number);
459 break;
460 case PTRACE_GET_THREAD_AREA:
461 p = (struct user_desc __user *)data;
462 get_user(old, &p->entry_number);
463 put_user(addr, &p->entry_number);
464 ret = do_get_thread_area(&child->thread, p);
465 put_user(old, &p->entry_number);
466 break;
467 }
468#endif
469 /* normal 64bit interface to access TLS data.
470 Works just like arch_prctl, except that the arguments
471 are reversed. */
472 case PTRACE_ARCH_PRCTL:
473 ret = do_arch_prctl(child, data, addr);
474 break;
475
476/*
477 * make the child exit. Best I can do is send it a sigkill.
478 * perhaps it should be put in the status that it wants to
479 * exit.
480 */
481 case PTRACE_KILL:
482 ret = 0;
483 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
484 break;
485 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
486 child->exit_code = SIGKILL;
487 /* make sure the single step bit is not set. */
488 clear_singlestep(child);
489 wake_up_process(child);
490 break;
491
492 case PTRACE_SINGLESTEP: /* set the trap flag. */
493 ret = -EIO;
494 if (!valid_signal(data))
495 break;
496 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
497 set_singlestep(child);
498 child->exit_code = data;
499 /* give it a chance to run. */
500 wake_up_process(child);
501 ret = 0;
502 break;
503
504 case PTRACE_DETACH:
505 /* detach a process that was attached. */
506 ret = ptrace_detach(child, data);
507 break;
508
509 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
510 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
511 sizeof(struct user_regs_struct))) {
512 ret = -EIO;
513 break;
514 }
515 ret = 0;
516 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
517 ret |= __put_user(getreg(child, ui),(unsigned long __user *) data);
518 data += sizeof(long);
519 }
520 break;
521 }
522
523 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
524 unsigned long tmp;
525 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
526 sizeof(struct user_regs_struct))) {
527 ret = -EIO;
528 break;
529 }
530 ret = 0;
531 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
532 ret = __get_user(tmp, (unsigned long __user *) data);
533 if (ret)
534 break;
535 ret = putreg(child, ui, tmp);
536 if (ret)
537 break;
538 data += sizeof(long);
539 }
540 break;
541 }
542
543 case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */
544 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
545 sizeof(struct user_i387_struct))) {
546 ret = -EIO;
547 break;
548 }
549 ret = get_fpregs((struct user_i387_struct __user *)data, child);
550 break;
551 }
552
553 case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */
554 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
555 sizeof(struct user_i387_struct))) {
556 ret = -EIO;
557 break;
558 }
559 set_stopped_child_used_math(child);
560 ret = set_fpregs(child, (struct user_i387_struct __user *)data);
561 break;
562 }
563
564 default:
565 ret = ptrace_request(child, request, addr, data);
566 break;
567 }
568 return ret;
569}
570
571static void syscall_trace(struct pt_regs *regs)
572{
573
574#if 0
575 printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n",
576 current->comm,
577 regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0),
578 current_thread_info()->flags, current->ptrace);
579#endif
580
581 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
582 ? 0x80 : 0));
583 /*
584 * this isn't the same as continuing with a signal, but it will do
585 * for normal use. strace only continues with a signal if the
586 * stopping signal is not SIGTRAP. -brl
587 */
588 if (current->exit_code) {
589 send_sig(current->exit_code, current, 1);
590 current->exit_code = 0;
591 }
592}
593
594asmlinkage void syscall_trace_enter(struct pt_regs *regs)
595{
596 /* do the secure computing check first */
597 secure_computing(regs->orig_rax);
598
599 if (test_thread_flag(TIF_SYSCALL_TRACE)
600 && (current->ptrace & PT_PTRACED))
601 syscall_trace(regs);
602
603 if (unlikely(current->audit_context)) {
604 if (test_thread_flag(TIF_IA32)) {
605 audit_syscall_entry(AUDIT_ARCH_I386,
606 regs->orig_rax,
607 regs->rbx, regs->rcx,
608 regs->rdx, regs->rsi);
609 } else {
610 audit_syscall_entry(AUDIT_ARCH_X86_64,
611 regs->orig_rax,
612 regs->rdi, regs->rsi,
613 regs->rdx, regs->r10);
614 }
615 }
616}
617
618asmlinkage void syscall_trace_leave(struct pt_regs *regs)
619{
620 if (unlikely(current->audit_context))
621 audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax);
622
623 if ((test_thread_flag(TIF_SYSCALL_TRACE)
624 || test_thread_flag(TIF_SINGLESTEP))
625 && (current->ptrace & PT_PTRACED))
626 syscall_trace(regs);
627}
diff --git a/arch/x86/kernel/reboot_64.c b/arch/x86/kernel/reboot_64.c
new file mode 100644
index 000000000000..368db2b9c5ac
--- /dev/null
+++ b/arch/x86/kernel/reboot_64.c
@@ -0,0 +1,171 @@
1/* Various gunk just to reboot the machine. */
2#include <linux/module.h>
3#include <linux/reboot.h>
4#include <linux/init.h>
5#include <linux/smp.h>
6#include <linux/kernel.h>
7#include <linux/ctype.h>
8#include <linux/string.h>
9#include <linux/pm.h>
10#include <linux/kdebug.h>
11#include <linux/sched.h>
12#include <asm/io.h>
13#include <asm/delay.h>
14#include <asm/hw_irq.h>
15#include <asm/system.h>
16#include <asm/pgtable.h>
17#include <asm/tlbflush.h>
18#include <asm/apic.h>
19#include <asm/iommu.h>
20
21/*
22 * Power off function, if any
23 */
24void (*pm_power_off)(void);
25EXPORT_SYMBOL(pm_power_off);
26
27static long no_idt[3];
28static enum {
29 BOOT_TRIPLE = 't',
30 BOOT_KBD = 'k'
31} reboot_type = BOOT_KBD;
32static int reboot_mode = 0;
33int reboot_force;
34
35/* reboot=t[riple] | k[bd] [, [w]arm | [c]old]
36 warm Don't set the cold reboot flag
37 cold Set the cold reboot flag
38 triple Force a triple fault (init)
39 kbd Use the keyboard controller. cold reset (default)
40 force Avoid anything that could hang.
41 */
42static int __init reboot_setup(char *str)
43{
44 for (;;) {
45 switch (*str) {
46 case 'w':
47 reboot_mode = 0x1234;
48 break;
49
50 case 'c':
51 reboot_mode = 0;
52 break;
53
54 case 't':
55 case 'b':
56 case 'k':
57 reboot_type = *str;
58 break;
59 case 'f':
60 reboot_force = 1;
61 break;
62 }
63 if((str = strchr(str,',')) != NULL)
64 str++;
65 else
66 break;
67 }
68 return 1;
69}
70
71__setup("reboot=", reboot_setup);
72
/*
 * Poll port 0x64 until bit 1 of the value read is clear, giving up after
 * 0x10000 reads.
 */
static inline void kb_wait(void)
{
	int tries = 0x10000;

	while (tries-- > 0) {
		if (!(inb_p(0x64) & 0x02))
			return;
	}
}
81
/*
 * Quiesce the machine before a restart or power-off: stop the other CPUs
 * (SMP builds), disable the APICs with interrupts off, then shut down
 * the PCI IOMMU.
 */
void machine_shutdown(void)
{
	unsigned long flags;

	/* Stop the cpus and apics */
#ifdef CONFIG_SMP
	/* The boot cpu is always logical cpu 0 */
	int reboot_cpu_id = 0;

	/* If cpu 0 is not online, stay on the cpu we are running on. */
	if (!cpu_isset(reboot_cpu_id, cpu_online_map))
		reboot_cpu_id = smp_processor_id();

	/* Make certain I only run on the appropriate processor */
	set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));

	/*
	 * O.K. Now that I'm on the appropriate processor,
	 * stop all of the others.
	 */
	smp_send_stop();
#endif

	local_irq_save(flags);

#ifndef CONFIG_SMP
	disable_local_APIC();
#endif

	disable_IO_APIC();

	local_irq_restore(flags);

	pci_iommu_shutdown();
}
119
120void machine_emergency_restart(void)
121{
122 int i;
123
124 /* Tell the BIOS if we want cold or warm reboot */
125 *((unsigned short *)__va(0x472)) = reboot_mode;
126
127 for (;;) {
128 /* Could also try the reset bit in the Hammer NB */
129 switch (reboot_type) {
130 case BOOT_KBD:
131 for (i=0; i<10; i++) {
132 kb_wait();
133 udelay(50);
134 outb(0xfe,0x64); /* pulse reset low */
135 udelay(50);
136 }
137
138 case BOOT_TRIPLE:
139 __asm__ __volatile__("lidt (%0)": :"r" (&no_idt));
140 __asm__ __volatile__("int3");
141
142 reboot_type = BOOT_KBD;
143 break;
144 }
145 }
146}
147
148void machine_restart(char * __unused)
149{
150 printk("machine restart\n");
151
152 if (!reboot_force) {
153 machine_shutdown();
154 }
155 machine_emergency_restart();
156}
157
/* Halt hook: does nothing. */
void machine_halt(void)
{
	return;
}
161
162void machine_power_off(void)
163{
164 if (pm_power_off) {
165 if (!reboot_force) {
166 machine_shutdown();
167 }
168 pm_power_off();
169 }
170}
171
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
new file mode 100644
index 000000000000..14e95872c6a3
--- /dev/null
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -0,0 +1,276 @@
1/*
2 * relocate_kernel.S - put the kernel image in place to boot
3 * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/linkage.h>
10#include <asm/page.h>
11#include <asm/kexec.h>
12
13/*
14 * Must be relocatable PIC code callable as a C function
15 */
16
17#define PTR(x) (x << 3)
18#define PAGE_ALIGNED (1 << PAGE_SHIFT)
19#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
20
21 .text
22 .align PAGE_ALIGNED
23 .code64
24 .globl relocate_kernel
 /*
  * relocate_kernel: fill in two sets of page-table entries for the control
  * page before execution falls through into relocate_new_kernel:
  *   - one mapping it at the address stored in page_list[VA_CONTROL_PAGE]
  *     (tables PUD_0 / PMD_0 / PTE_0), and
  *   - one mapping it at page_list[PA_CONTROL_PAGE], i.e. identity mapped
  *     (tables PUD_1 / PMD_1 / PTE_1).
  * Both walks write through the table addresses stored in the VA_* slots
  * and store the PA_* value of the next level OR-ed with PAGE_ATTR.
  * Registers used as scratch: %r8, %r9, %r10, %r11, %cl.
  */
25relocate_kernel:
26 /* %rdi indirection_page
27 * %rsi page_list
28 * %rdx start address
29 */
30
31 /* map the control page at its virtual address */
32
 /*
  * The mask selects address bits 47..39 (the top-level index).  Shifting
  * right by 39 - 3 leaves index * 8, the byte offset of the 8-byte entry.
  */
33 movq $0x0000ff8000000000, %r10 /* mask */
34 mov $(39 - 3), %cl /* bits to shift */
35 movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
36
37 movq %r11, %r9
38 andq %r10, %r9
39 shrq %cl, %r9
40
 /* PGD entry <- PA_PUD_0 | PAGE_ATTR */
41 movq PTR(VA_PGD)(%rsi), %r8
42 addq %r8, %r9
43 movq PTR(PA_PUD_0)(%rsi), %r8
44 orq $PAGE_ATTR, %r8
45 movq %r8, (%r9)
46
 /* next level: move mask and shift count down by 9 bits */
47 shrq $9, %r10
48 sub $9, %cl
49
50 movq %r11, %r9
51 andq %r10, %r9
52 shrq %cl, %r9
53
 /* PUD_0 entry <- PA_PMD_0 | PAGE_ATTR */
54 movq PTR(VA_PUD_0)(%rsi), %r8
55 addq %r8, %r9
56 movq PTR(PA_PMD_0)(%rsi), %r8
57 orq $PAGE_ATTR, %r8
58 movq %r8, (%r9)
59
60 shrq $9, %r10
61 sub $9, %cl
62
63 movq %r11, %r9
64 andq %r10, %r9
65 shrq %cl, %r9
66
 /* PMD_0 entry <- PA_PTE_0 | PAGE_ATTR */
67 movq PTR(VA_PMD_0)(%rsi), %r8
68 addq %r8, %r9
69 movq PTR(PA_PTE_0)(%rsi), %r8
70 orq $PAGE_ATTR, %r8
71 movq %r8, (%r9)
72
73 shrq $9, %r10
74 sub $9, %cl
75
76 movq %r11, %r9
77 andq %r10, %r9
78 shrq %cl, %r9
79
 /* PTE_0 entry <- PA_CONTROL_PAGE | PAGE_ATTR (the final page mapping) */
80 movq PTR(VA_PTE_0)(%rsi), %r8
81 addq %r8, %r9
82 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
83 orq $PAGE_ATTR, %r8
84 movq %r8, (%r9)
85
86 /* identity map the control page at its physical address */
87
 /* same four-level walk as above, restarted with the top-level mask/shift */
88 movq $0x0000ff8000000000, %r10 /* mask */
89 mov $(39 - 3), %cl /* bits to shift */
90 movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
91
92 movq %r11, %r9
93 andq %r10, %r9
94 shrq %cl, %r9
95
 /* PGD entry <- PA_PUD_1 | PAGE_ATTR */
96 movq PTR(VA_PGD)(%rsi), %r8
97 addq %r8, %r9
98 movq PTR(PA_PUD_1)(%rsi), %r8
99 orq $PAGE_ATTR, %r8
100 movq %r8, (%r9)
101
102 shrq $9, %r10
103 sub $9, %cl
104
105 movq %r11, %r9
106 andq %r10, %r9
107 shrq %cl, %r9
108
 /* PUD_1 entry <- PA_PMD_1 | PAGE_ATTR */
109 movq PTR(VA_PUD_1)(%rsi), %r8
110 addq %r8, %r9
111 movq PTR(PA_PMD_1)(%rsi), %r8
112 orq $PAGE_ATTR, %r8
113 movq %r8, (%r9)
114
115 shrq $9, %r10
116 sub $9, %cl
117
118 movq %r11, %r9
119 andq %r10, %r9
120 shrq %cl, %r9
121
 /* PMD_1 entry <- PA_PTE_1 | PAGE_ATTR */
122 movq PTR(VA_PMD_1)(%rsi), %r8
123 addq %r8, %r9
124 movq PTR(PA_PTE_1)(%rsi), %r8
125 orq $PAGE_ATTR, %r8
126 movq %r8, (%r9)
127
128 shrq $9, %r10
129 sub $9, %cl
130
131 movq %r11, %r9
132 andq %r10, %r9
133 shrq %cl, %r9
134
 /* PTE_1 entry <- PA_CONTROL_PAGE | PAGE_ATTR */
135 movq PTR(VA_PTE_1)(%rsi), %r8
136 addq %r8, %r9
137 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
138 orq $PAGE_ATTR, %r8
139 movq %r8, (%r9)
140
140
 /*
  * relocate_new_kernel: clear the flags, load %cr3 from page_list[PA_PGD],
  * move the stack to the top of the control page and continue at
  * identity_mapped through the control page's physical address.
  * On exit %r8 holds the address jumped to and %rcx holds
  * page_list[PA_TABLE_PAGE]; %rdi and %rdx are not modified here.
  */
141relocate_new_kernel:
142 /* %rdi indirection_page
143 * %rsi page_list
144 * %rdx start address
145 */
146
147 /* zero out flags, and disable interrupts */
148 pushq $0
149 popfq
150
151 /* get physical address of control page now */
152 /* this is impossible after page table switch */
153 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
154
155 /* get physical address of page table now too */
156 movq PTR(PA_TABLE_PAGE)(%rsi), %rcx
157
158 /* switch to new set of page tables */
159 movq PTR(PA_PGD)(%rsi), %r9
160 movq %r9, %cr3
161
162 /* setup a new stack at the end of the physical control page */
163 lea 4096(%r8), %rsp
164
165 /* jump to identity mapped page */
 /*
  * %r8 = control page physical base + offset of identity_mapped from
  * relocate_kernel; pushing it and executing ret transfers control there.
  */
166 addq $(identity_mapped - relocate_kernel), %r8
167 pushq %r8
168 ret
169
170identity_mapped:
171 /* store the start address on the stack */
172 pushq %rdx
173
174 /* Set cr0 to a known state:
175 * 31 1 == Paging enabled
176 * 18 0 == Alignment check disabled
177 * 16 0 == Write protect disabled
178 * 3 0 == No task switch
179 * 2 0 == Don't do FP software emulation.
180 * 0 1 == Proctected mode enabled
181 */
182 movq %cr0, %rax
183 andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
184 orl $((1<<31)|(1<<0)), %eax
185 movq %rax, %cr0
186
187 /* Set cr4 to a known state:
188 * 10 0 == xmm exceptions disabled
189 * 9 0 == xmm registers instructions disabled
190 * 8 0 == performance monitoring counter disabled
191 * 7 0 == page global disabled
192 * 6 0 == machine check exceptions disabled
193 * 5 1 == physical address extension enabled
194 * 4 0 == page size extensions disabled
195 * 3 0 == Debug extensions disabled
196 * 2 0 == Time stamp disable (disabled)
197 * 1 0 == Protected mode virtual interrupts disabled
198 * 0 0 == VME disabled
199 */
200
201 movq $((1<<5)), %rax
202 movq %rax, %cr4
203
204 jmp 1f
2051:
206
207 /* Switch to the identity mapped page tables,
208 * and flush the TLB.
209 */
210 movq %rcx, %cr3
211
212 /* Do the copies */
213 movq %rdi, %rcx /* Put the page_list in %rcx */
214 xorq %rdi, %rdi
215 xorq %rsi, %rsi
216 jmp 1f
217
2180: /* top, read another word for the indirection page */
219
220 movq (%rbx), %rcx
221 addq $8, %rbx
2221:
223 testq $0x1, %rcx /* is it a destination page? */
224 jz 2f
225 movq %rcx, %rdi
226 andq $0xfffffffffffff000, %rdi
227 jmp 0b
2282:
229 testq $0x2, %rcx /* is it an indirection page? */
230 jz 2f
231 movq %rcx, %rbx
232 andq $0xfffffffffffff000, %rbx
233 jmp 0b
2342:
235 testq $0x4, %rcx /* is it the done indicator? */
236 jz 2f
237 jmp 3f
2382:
239 testq $0x8, %rcx /* is it the source indicator? */
240 jz 0b /* Ignore it otherwise */
241 movq %rcx, %rsi /* For ever source page do a copy */
242 andq $0xfffffffffffff000, %rsi
243
244 movq $512, %rcx
245 rep ; movsq
246 jmp 0b
2473:
248
249 /* To be certain of avoiding problems with self-modifying code
250 * I need to execute a serializing instruction here.
251 * So I flush the TLB by reloading %cr3 here, it's handy,
252 * and not processor dependent.
253 */
254 movq %cr3, %rax
255 movq %rax, %cr3
256
257 /* set all of the registers to known values */
258 /* leave %rsp alone */
259
260 xorq %rax, %rax
261 xorq %rbx, %rbx
262 xorq %rcx, %rcx
263 xorq %rdx, %rdx
264 xorq %rsi, %rsi
265 xorq %rdi, %rdi
266 xorq %rbp, %rbp
267 xorq %r8, %r8
268 xorq %r9, %r9
269 xorq %r10, %r9
270 xorq %r11, %r11
271 xorq %r12, %r12
272 xorq %r13, %r13
273 xorq %r14, %r14
274 xorq %r15, %r15
275
276 ret
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
new file mode 100644
index 000000000000..1200aaac403e
--- /dev/null
+++ b/arch/x86/kernel/setup64.c
@@ -0,0 +1,289 @@
1/*
2 * X86-64 specific CPU setup.
3 * Copyright (C) 1995 Linus Torvalds
4 * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
5 * See setup.c for older changelog.
6 */
7#include <linux/init.h>
8#include <linux/kernel.h>
9#include <linux/sched.h>
10#include <linux/string.h>
11#include <linux/bootmem.h>
12#include <linux/bitops.h>
13#include <linux/module.h>
14#include <asm/bootsetup.h>
15#include <asm/pda.h>
16#include <asm/pgtable.h>
17#include <asm/processor.h>
18#include <asm/desc.h>
19#include <asm/atomic.h>
20#include <asm/mmu_context.h>
21#include <asm/smp.h>
22#include <asm/i387.h>
23#include <asm/percpu.h>
24#include <asm/proto.h>
25#include <asm/sections.h>
26
27char x86_boot_params[BOOT_PARAM_SIZE] __initdata;
28
29cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
30
31struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
32EXPORT_SYMBOL(_cpu_pda);
33struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
34
35struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
36
37char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
38
39unsigned long __supported_pte_mask __read_mostly = ~0UL;
40static int do_not_nx __cpuinitdata = 0;
41
42/* noexec=on|off
43Control non executable mappings for 64bit processes.
44
45on Enable(default)
46off Disable
47*/
48static int __init nonx_setup(char *str)
49{
50 if (!str)
51 return -EINVAL;
52 if (!strncmp(str, "on", 2)) {
53 __supported_pte_mask |= _PAGE_NX;
54 do_not_nx = 0;
55 } else if (!strncmp(str, "off", 3)) {
56 do_not_nx = 1;
57 __supported_pte_mask &= ~_PAGE_NX;
58 }
59 return 0;
60}
61early_param("noexec", nonx_setup);
62
63int force_personality32 = 0;
64
65/* noexec32=on|off
66Control non executable heap for 32bit processes.
67To control the stack too use noexec=off
68
69on PROT_READ does not imply PROT_EXEC for 32bit processes
70off PROT_READ implies PROT_EXEC (default)
71*/
72static int __init nonx32_setup(char *str)
73{
74 if (!strcmp(str, "on"))
75 force_personality32 &= ~READ_IMPLIES_EXEC;
76 else if (!strcmp(str, "off"))
77 force_personality32 |= READ_IMPLIES_EXEC;
78 return 1;
79}
80__setup("noexec32=", nonx32_setup);
81
82/*
83 * Great future plan:
84 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
85 * Always point %gs to its beginning
86 */
87void __init setup_per_cpu_areas(void)
88{
89 int i;
90 unsigned long size;
91
92#ifdef CONFIG_HOTPLUG_CPU
93 prefill_possible_map();
94#endif
95
96 /* Copy section for each CPU (we discard the original) */
97 size = PERCPU_ENOUGH_ROOM;
98
99 printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
100 for_each_cpu_mask (i, cpu_possible_map) {
101 char *ptr;
102
103 if (!NODE_DATA(cpu_to_node(i))) {
104 printk("cpu with no node %d, num_online_nodes %d\n",
105 i, num_online_nodes());
106 ptr = alloc_bootmem_pages(size);
107 } else {
108 ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
109 }
110 if (!ptr)
111 panic("Cannot allocate cpu data for CPU %d\n", i);
112 cpu_pda(i)->data_offset = ptr - __per_cpu_start;
113 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
114 }
115}
116
117void pda_init(int cpu)
118{
119 struct x8664_pda *pda = cpu_pda(cpu);
120
121 /* Setup up data that may be needed in __get_free_pages early */
122 asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
123 /* Memory clobbers used to order PDA accessed */
124 mb();
125 wrmsrl(MSR_GS_BASE, pda);
126 mb();
127
128 pda->cpunumber = cpu;
129 pda->irqcount = -1;
130 pda->kernelstack =
131 (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
132 pda->active_mm = &init_mm;
133 pda->mmu_state = 0;
134
135 if (cpu == 0) {
136 /* others are initialized in smpboot.c */
137 pda->pcurrent = &init_task;
138 pda->irqstackptr = boot_cpu_stack;
139 } else {
140 pda->irqstackptr = (char *)
141 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
142 if (!pda->irqstackptr)
143 panic("cannot allocate irqstack for cpu %d", cpu);
144 }
145
146
147 pda->irqstackptr += IRQSTACKSIZE-64;
148}
149
150char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
151__attribute__((section(".bss.page_aligned")));
152
153extern asmlinkage void ignore_sysret(void);
154
155/* May not be marked __init: used by software suspend */
156void syscall_init(void)
157{
158 /*
159 * LSTAR and STAR live in a bit strange symbiosis.
160 * They both write to the same internal register. STAR allows to set CS/DS
161 * but only a 32bit target. LSTAR sets the 64bit rip.
162 */
163 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
164 wrmsrl(MSR_LSTAR, system_call);
165 wrmsrl(MSR_CSTAR, ignore_sysret);
166
167#ifdef CONFIG_IA32_EMULATION
168 syscall32_cpu_init ();
169#endif
170
171 /* Flags to clear on syscall */
172 wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
173}
174
175void __cpuinit check_efer(void)
176{
177 unsigned long efer;
178
179 rdmsrl(MSR_EFER, efer);
180 if (!(efer & EFER_NX) || do_not_nx) {
181 __supported_pte_mask &= ~_PAGE_NX;
182 }
183}
184
185unsigned long kernel_eflags;
186
187/*
188 * cpu_init() initializes state that is per-CPU. Some data is already
189 * initialized (naturally) in the bootstrap process, such as the GDT
190 * and IDT. We reload them nevertheless, this function acts as a
191 * 'CPU state barrier', nothing should get across.
192 * A lot of state is already set up in PDA init.
193 */
194void __cpuinit cpu_init (void)
195{
196 int cpu = stack_smp_processor_id();
197 struct tss_struct *t = &per_cpu(init_tss, cpu);
198 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
199 unsigned long v;
200 char *estacks = NULL;
201 struct task_struct *me;
202 int i;
203
204 /* CPU 0 is initialised in head64.c */
 /* Secondary CPUs set up their PDA here; CPU 0 uses the static boot_exception_stacks. */
205 if (cpu != 0) {
206 pda_init(cpu);
207 } else
208 estacks = boot_exception_stacks;
209
210 me = current;
211
 /* Mark this CPU in cpu_initialized; finding it already set is fatal. */
212 if (cpu_test_and_set(cpu, cpu_initialized))
213 panic("CPU#%d already initialized!\n", cpu);
214
215 printk("Initializing CPU#%d\n", cpu);
216
217 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
218
219 /*
220 * Initialize the per-CPU GDT with the boot GDT,
221 * and set up the GDT descriptor:
222 */
223 if (cpu)
224 memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
225
226 cpu_gdt_descr[cpu].size = GDT_SIZE;
227 asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu]));
228 asm volatile("lidt %0" :: "m" (idt_descr));
229
 /* Zero the current task's TLS descriptors (8 bytes per entry). */
230 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
231 syscall_init();
232
233 wrmsrl(MSR_FS_BASE, 0);
234 wrmsrl(MSR_KERNEL_GS_BASE, 0);
235 barrier();
236
237 check_efer();
238
239 /*
240 * set up and load the per-CPU TSS
241 */
 /*
  * One IST stack per exception stack slot.  order[] holds the page order
  * of each slot: EXCEPTION_STACK_ORDER everywhere except the debug
  * stack slot, which uses DEBUG_STACK_ORDER.
  */
242 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
243 static const unsigned int order[N_EXCEPTION_STACKS] = {
244 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
245 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
246 };
 /* Secondary CPUs allocate each stack; CPU 0 keeps advancing through boot_exception_stacks. */
247 if (cpu) {
248 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
249 if (!estacks)
250 panic("Cannot allocate exception stack %ld %d\n",
251 v, cpu);
252 }
 /* The IST entry records the address just past the end of the stack area. */
253 estacks += PAGE_SIZE << order[v];
254 orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
255 }
256
257 t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
258 /*
259 * <= is required because the CPU will access up to
260 * 8 bits beyond the end of the IO permission bitmap.
261 */
262 for (i = 0; i <= IO_BITMAP_LONGS; i++)
263 t->io_bitmap[i] = ~0UL;
264
 /* Take a reference on init_mm and use it as the active mm; having an mm of our own here is a bug. */
265 atomic_inc(&init_mm.mm_count);
266 me->active_mm = &init_mm;
267 if (me->mm)
268 BUG();
269 enter_lazy_tlb(&init_mm, me);
270
271 set_tss_desc(cpu, t);
272 load_TR_desc();
273 load_LDT(&init_mm.context);
274
275 /*
276 * Clear all 6 debug registers:
277 */
278
279 set_debugreg(0UL, 0);
280 set_debugreg(0UL, 1);
281 set_debugreg(0UL, 2);
282 set_debugreg(0UL, 3);
283 set_debugreg(0UL, 6);
284 set_debugreg(0UL, 7);
285
286 fpu_init();
287
 /* Save the current flags into the global kernel_eflags. */
288 raw_local_save_flags(kernel_eflags);
289}
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
new file mode 100644
index 000000000000..af838f6b0b7f
--- /dev/null
+++ b/arch/x86/kernel/setup_64.c
@@ -0,0 +1,1117 @@
1/*
2 * linux/arch/x86-64/kernel/setup.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Nov 2001 Dave Jones <davej@suse.de>
7 * Forked from i386 setup code.
8 */
9
10/*
11 * This file handles the architecture-dependent parts of initialization
12 */
13
14#include <linux/errno.h>
15#include <linux/sched.h>
16#include <linux/kernel.h>
17#include <linux/mm.h>
18#include <linux/stddef.h>
19#include <linux/unistd.h>
20#include <linux/ptrace.h>
21#include <linux/slab.h>
22#include <linux/user.h>
23#include <linux/a.out.h>
24#include <linux/screen_info.h>
25#include <linux/ioport.h>
26#include <linux/delay.h>
27#include <linux/init.h>
28#include <linux/initrd.h>
29#include <linux/highmem.h>
30#include <linux/bootmem.h>
31#include <linux/module.h>
32#include <asm/processor.h>
33#include <linux/console.h>
34#include <linux/seq_file.h>
35#include <linux/crash_dump.h>
36#include <linux/root_dev.h>
37#include <linux/pci.h>
38#include <linux/acpi.h>
39#include <linux/kallsyms.h>
40#include <linux/edd.h>
41#include <linux/mmzone.h>
42#include <linux/kexec.h>
43#include <linux/cpufreq.h>
44#include <linux/dmi.h>
45#include <linux/dma-mapping.h>
46#include <linux/ctype.h>
47
48#include <asm/mtrr.h>
49#include <asm/uaccess.h>
50#include <asm/system.h>
51#include <asm/io.h>
52#include <asm/smp.h>
53#include <asm/msr.h>
54#include <asm/desc.h>
55#include <video/edid.h>
56#include <asm/e820.h>
57#include <asm/dma.h>
58#include <asm/mpspec.h>
59#include <asm/mmu_context.h>
60#include <asm/bootsetup.h>
61#include <asm/proto.h>
62#include <asm/setup.h>
63#include <asm/mach_apic.h>
64#include <asm/numa.h>
65#include <asm/sections.h>
66#include <asm/dmi.h>
67
68/*
69 * Machine setup..
70 */
71
72struct cpuinfo_x86 boot_cpu_data __read_mostly;
73EXPORT_SYMBOL(boot_cpu_data);
74
75unsigned long mmu_cr4_features;
76
77/* Boot loader ID as an integer, for the benefit of proc_dointvec */
78int bootloader_type;
79
80unsigned long saved_video_mode;
81
82int force_mwait __cpuinitdata;
83
84/*
85 * Early DMI memory
86 */
87int dmi_alloc_index;
88char dmi_alloc_data[DMI_MAX_DATA];
89
90/*
91 * Setup options
92 */
93struct screen_info screen_info;
94EXPORT_SYMBOL(screen_info);
95struct sys_desc_table_struct {
96 unsigned short length;
97 unsigned char table[0];
98};
99
100struct edid_info edid_info;
101EXPORT_SYMBOL_GPL(edid_info);
102
103extern int root_mountflags;
104
105char __initdata command_line[COMMAND_LINE_SIZE];
106
107struct resource standard_io_resources[] = {
108 { .name = "dma1", .start = 0x00, .end = 0x1f,
109 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
110 { .name = "pic1", .start = 0x20, .end = 0x21,
111 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
112 { .name = "timer0", .start = 0x40, .end = 0x43,
113 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
114 { .name = "timer1", .start = 0x50, .end = 0x53,
115 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
116 { .name = "keyboard", .start = 0x60, .end = 0x6f,
117 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
118 { .name = "dma page reg", .start = 0x80, .end = 0x8f,
119 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
120 { .name = "pic2", .start = 0xa0, .end = 0xa1,
121 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
122 { .name = "dma2", .start = 0xc0, .end = 0xdf,
123 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
124 { .name = "fpu", .start = 0xf0, .end = 0xff,
125 .flags = IORESOURCE_BUSY | IORESOURCE_IO }
126};
127
128#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
129
130struct resource data_resource = {
131 .name = "Kernel data",
132 .start = 0,
133 .end = 0,
134 .flags = IORESOURCE_RAM,
135};
136struct resource code_resource = {
137 .name = "Kernel code",
138 .start = 0,
139 .end = 0,
140 .flags = IORESOURCE_RAM,
141};
142
143#ifdef CONFIG_PROC_VMCORE
144/* elfcorehdr= specifies the location of elf core header
145 * stored by the crashed kernel. This option will be passed
146 * by kexec loader to the capture kernel.
147 */
148static int __init setup_elfcorehdr(char *arg)
149{
150 char *end;
151 if (!arg)
152 return -EINVAL;
153 elfcorehdr_addr = memparse(arg, &end);
154 return end > arg ? 0 : -EINVAL;
155}
156early_param("elfcorehdr", setup_elfcorehdr);
157#endif
158
159#ifndef CONFIG_NUMA
160static void __init
161contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
162{
163 unsigned long bootmap_size, bootmap;
164
165 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
166 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
167 if (bootmap == -1L)
168 panic("Cannot find bootmem map of size %ld\n",bootmap_size);
169 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
170 e820_register_active_regions(0, start_pfn, end_pfn);
171 free_bootmem_with_active_regions(0, end_pfn);
172 reserve_bootmem(bootmap, bootmap_size);
173}
174#endif
175
176#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
177struct edd edd;
178#ifdef CONFIG_EDD_MODULE
179EXPORT_SYMBOL(edd);
180#endif
181/**
182 * copy_edd() - Copy the BIOS EDD information
183 * from boot_params into a safe place.
184 *
185 */
186static inline void copy_edd(void)
187{
188 memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
189 memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
190 edd.mbr_signature_nr = EDD_MBR_SIG_NR;
191 edd.edd_info_nr = EDD_NR;
192}
193#else
/* Stub used when neither CONFIG_EDD nor CONFIG_EDD_MODULE is set: nothing to copy. */
194static inline void copy_edd(void)
195{
196}
197#endif
198
199#define EBDA_ADDR_POINTER 0x40E
200
201unsigned __initdata ebda_addr;
202unsigned __initdata ebda_size;
203
204static void discover_ebda(void)
205{
206 /*
207 * there is a real-mode segmented pointer pointing to the
208 * 4K EBDA area at 0x40E
209 */
210 ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
211 ebda_addr <<= 4;
212
213 ebda_size = *(unsigned short *)__va(ebda_addr);
214
215 /* Round EBDA up to pages */
216 if (ebda_size == 0)
217 ebda_size = 1;
218 ebda_size <<= 10;
219 ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
220 if (ebda_size > 64*1024)
221 ebda_size = 64*1024;
222}
223
/*
 * setup_arch - architecture specific boot-time setup.
 *
 * Stores a pointer to the kernel's private copy of the command line in
 * *cmdline_p.  The steps below run in a fixed order: boot parameters are
 * copied out, the memory map and early parameters are parsed, the direct
 * mapping is created, bootmem is initialised and the regions that must
 * not be handed out are reserved, then paging, ACPI/SMP configuration,
 * resources and the console are set up.
 */
224void __init setup_arch(char **cmdline_p)
225{
226 printk(KERN_INFO "Command line: %s\n", boot_command_line);
227
 /* Copy values supplied at boot (root device, screen/EDID info, video mode, loader type) into kernel globals. */
228 ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
229 screen_info = SCREEN_INFO;
230 edid_info = EDID_INFO;
231 saved_video_mode = SAVED_VIDEO_MODE;
232 bootloader_type = LOADER_TYPE;
233
234#ifdef CONFIG_BLK_DEV_RAM
235 rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
236 rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
237 rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
238#endif
239 setup_memory_region();
240 copy_edd();
241
242 if (!MOUNT_ROOT_RDONLY)
243 root_mountflags &= ~MS_RDONLY;
 /* Record the kernel image bounds in init_mm and in the code/data resources. */
244 init_mm.start_code = (unsigned long) &_text;
245 init_mm.end_code = (unsigned long) &_etext;
246 init_mm.end_data = (unsigned long) &_edata;
247 init_mm.brk = (unsigned long) &_end;
248
249 code_resource.start = virt_to_phys(&_text);
250 code_resource.end = virt_to_phys(&_etext)-1;
251 data_resource.start = virt_to_phys(&_etext);
252 data_resource.end = virt_to_phys(&_edata)-1;
253
254 early_identify_cpu(&boot_cpu_data);
255
 /* Keep a private copy of the command line and hand it back to the caller. */
256 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
257 *cmdline_p = command_line;
258
259 parse_early_param();
260
261 finish_e820_parsing();
262
263 e820_register_active_regions(0, 0, -1UL);
264 /*
265 * partially used pages are not usable - thus
266 * we are rounding upwards:
267 */
268 end_pfn = e820_end_of_ram();
269 num_physpages = end_pfn;
270
271 check_efer();
272
273 discover_ebda();
274
275 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
276
277 dmi_scan_machine();
278
279#ifdef CONFIG_ACPI
280 /*
281 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
282 * Call this early for SRAT node setup.
283 */
284 acpi_boot_table_init();
285#endif
286
287 /* How many end-of-memory variables you have, grandma! */
288 max_low_pfn = end_pfn;
289 max_pfn = end_pfn;
290 high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
291
292 /* Remove active ranges so rediscovery with NUMA-awareness happens */
293 remove_all_active_ranges();
294
295#ifdef CONFIG_ACPI_NUMA
296 /*
297 * Parse SRAT to discover nodes.
298 */
299 acpi_numa_init();
300#endif
301
302#ifdef CONFIG_NUMA
303 numa_initmem_init(0, end_pfn);
304#else
305 contig_initmem_init(0, end_pfn);
306#endif
307
308 /* Reserve direct mapping */
309 reserve_bootmem_generic(table_start << PAGE_SHIFT,
310 (table_end - table_start) << PAGE_SHIFT);
311
312 /* reserve kernel */
313 reserve_bootmem_generic(__pa_symbol(&_text),
314 __pa_symbol(&_end) - __pa_symbol(&_text));
315
316 /*
317 * reserve physical page 0 - it's a special BIOS page on many boxes,
318 * enabling clean reboots, SMP operation, laptop functions.
319 */
320 reserve_bootmem_generic(0, PAGE_SIZE);
321
322 /* reserve ebda region */
323 if (ebda_addr)
324 reserve_bootmem_generic(ebda_addr, ebda_size);
325#ifdef CONFIG_NUMA
326 /* reserve nodemap region */
327 if (nodemap_addr)
328 reserve_bootmem_generic(nodemap_addr, nodemap_size);
329#endif
330
331#ifdef CONFIG_SMP
332 /* Reserve SMP trampoline */
333 reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
334#endif
335
336#ifdef CONFIG_ACPI_SLEEP
337 /*
338 * Reserve low memory region for sleep support.
339 */
340 acpi_reserve_bootmem();
341#endif
342 /*
343 * Find and reserve possible boot-time SMP configuration:
344 */
345 find_smp_config();
346#ifdef CONFIG_BLK_DEV_INITRD
 /* Reserve the initrd only if it ends at or below the end of RAM; otherwise report it and disable the initrd. */
347 if (LOADER_TYPE && INITRD_START) {
348 if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
349 reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
350 initrd_start = INITRD_START + PAGE_OFFSET;
351 initrd_end = initrd_start+INITRD_SIZE;
352 }
353 else {
354 printk(KERN_ERR "initrd extends beyond end of memory "
355 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
356 (unsigned long)(INITRD_START + INITRD_SIZE),
357 (unsigned long)(end_pfn << PAGE_SHIFT));
358 initrd_start = 0;
359 }
360 }
361#endif
362#ifdef CONFIG_KEXEC
 /* Reserve the crashk_res range when it is non-empty (start != end). */
363 if (crashk_res.start != crashk_res.end) {
364 reserve_bootmem_generic(crashk_res.start,
365 crashk_res.end - crashk_res.start + 1);
366 }
367#endif
368
369 paging_init();
370
371#ifdef CONFIG_PCI
372 early_quirks();
373#endif
374
375 /*
376 * set this early, so we dont allocate cpu0
377 * if MADT list doesnt list BSP first
378 * mpparse.c/MP_processor_info() allocates logical cpu numbers.
379 */
380 cpu_set(0, cpu_present_map);
381#ifdef CONFIG_ACPI
382 /*
383 * Read APIC and some other early information from ACPI tables.
384 */
385 acpi_boot_init();
386#endif
387
388 init_cpu_to_node();
389
390 /*
391 * get boot-time SMP configuration:
392 */
393 if (smp_found_config)
394 get_smp_config();
395 init_apic_mappings();
396
397 /*
398 * We trust e820 completely. No explicit ROM probing in memory.
399 */
400 e820_reserve_resources();
401 e820_mark_nosave_regions();
402
403 {
404 unsigned i;
405 /* request I/O space for devices used on all i[345]86 PCs */
406 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
407 request_resource(&ioport_resource, &standard_io_resources[i]);
408 }
409
410 e820_setup_gap();
411
412#ifdef CONFIG_VT
 /* Pick the console switch: VGA console if configured, else the dummy console. */
413#if defined(CONFIG_VGA_CONSOLE)
414 conswitchp = &vga_con;
415#elif defined(CONFIG_DUMMY_CONSOLE)
416 conswitchp = &dummy_con;
417#endif
418#endif
419}
420
421static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
422{
423 unsigned int *v;
424
425 if (c->extended_cpuid_level < 0x80000004)
426 return 0;
427
428 v = (unsigned int *) c->x86_model_id;
429 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
430 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
431 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
432 c->x86_model_id[48] = 0;
433 return 1;
434}
435
436
437static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
438{
439 unsigned int n, dummy, eax, ebx, ecx, edx;
440
441 n = c->extended_cpuid_level;
442
443 if (n >= 0x80000005) {
444 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
445 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
446 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
447 c->x86_cache_size=(ecx>>24)+(edx>>24);
448 /* On K8 L1 TLB is inclusive, so don't count it */
449 c->x86_tlbsize = 0;
450 }
451
452 if (n >= 0x80000006) {
453 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
454 ecx = cpuid_ecx(0x80000006);
455 c->x86_cache_size = ecx >> 16;
456 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
457
458 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
459 c->x86_cache_size, ecx & 0xFF);
460 }
461
462 if (n >= 0x80000007)
463 cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
464 if (n >= 0x80000008) {
465 cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
466 c->x86_virt_bits = (eax >> 8) & 0xff;
467 c->x86_phys_bits = eax & 0xff;
468 }
469}
470
471#ifdef CONFIG_NUMA
472static int nearby_node(int apicid)
473{
474 int i;
475 for (i = apicid - 1; i >= 0; i--) {
476 int node = apicid_to_node[i];
477 if (node != NUMA_NO_NODE && node_online(node))
478 return node;
479 }
480 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
481 int node = apicid_to_node[i];
482 if (node != NUMA_NO_NODE && node_online(node))
483 return node;
484 }
485 return first_node(node_online_map); /* Shouldn't happen */
486}
487#endif
488
489/*
490 * On an AMD dual core setup the lower bits of the APIC id distinguish the cores.
491 * Assumes number of cores is a power of two.
492 */
493static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
494{
495#ifdef CONFIG_SMP
496 unsigned bits;
497#ifdef CONFIG_NUMA
498 int cpu = smp_processor_id();
499 int node = 0;
500 unsigned apicid = hard_smp_processor_id();
501#endif
502 unsigned ecx = cpuid_ecx(0x80000008);
503
504 c->x86_max_cores = (ecx & 0xff) + 1;
505
506 /* CPU telling us the core id bits shift? */
507 bits = (ecx >> 12) & 0xF;
508
509 /* Otherwise recompute */
510 if (bits == 0) {
511 while ((1 << bits) < c->x86_max_cores)
512 bits++;
513 }
514
515 /* Low order bits define the core id (index of core in socket) */
516 c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
517 /* Convert the APIC ID into the socket ID */
518 c->phys_proc_id = phys_pkg_id(bits);
519
520#ifdef CONFIG_NUMA
521 node = c->phys_proc_id;
522 if (apicid_to_node[apicid] != NUMA_NO_NODE)
523 node = apicid_to_node[apicid];
524 if (!node_online(node)) {
525 /* Two possibilities here:
526 - The CPU is missing memory and no node was created.
527 In that case try picking one from a nearby CPU
528 - The APIC IDs differ from the HyperTransport node IDs
529 which the K8 northbridge parsing fills in.
530 Assume they are all increased by a constant offset,
531 but in the same order as the HT nodeids.
532 If that doesn't result in a usable node fall back to the
533 path for the previous case. */
534 int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits);
535 if (ht_nodeid >= 0 &&
536 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
537 node = apicid_to_node[ht_nodeid];
538 /* Pick a nearby node */
539 if (!node_online(node))
540 node = nearby_node(apicid);
541 }
542 numa_set_node(cpu, node);
543
544 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
545#endif
546#endif
547}
548
549static void __cpuinit init_amd(struct cpuinfo_x86 *c)
550{
551 unsigned level;
552
553#ifdef CONFIG_SMP
554 unsigned long value;
555
556 /*
557 * Disable TLB flush filter by setting HWCR.FFDIS on K8
558 * bit 6 of msr C001_0015
559 *
560 * Errata 63 for SH-B3 steppings
561 * Errata 122 for all steppings (F+ have it disabled by default)
562 */
563 if (c->x86 == 15) {
564 rdmsrl(MSR_K8_HWCR, value);
565 value |= 1 << 6;
566 wrmsrl(MSR_K8_HWCR, value);
567 }
568#endif
569
570 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
571 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
572 clear_bit(0*32+31, &c->x86_capability);
573
574 /* On C+ stepping K8 rep microcode works well for copy/memset */
575 level = cpuid_eax(1);
576 if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
577 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
578 if (c->x86 == 0x10)
579 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
580
581 /* Enable workaround for FXSAVE leak */
582 if (c->x86 >= 6)
583 set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
584
585 level = get_model_name(c);
586 if (!level) {
587 switch (c->x86) {
588 case 15:
589 /* Should distinguish Models here, but this is only
590 a fallback anyways. */
591 strcpy(c->x86_model_id, "Hammer");
592 break;
593 }
594 }
595 display_cacheinfo(c);
596
597 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
598 if (c->x86_power & (1<<8))
599 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
600
601 /* Multi core CPU? */
602 if (c->extended_cpuid_level >= 0x80000008)
603 amd_detect_cmp(c);
604
605 if (c->extended_cpuid_level >= 0x80000006 &&
606 (cpuid_edx(0x80000006) & 0xf000))
607 num_cache_leaves = 4;
608 else
609 num_cache_leaves = 3;
610
611 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
612 set_bit(X86_FEATURE_K8, &c->x86_capability);
613
614 /* RDTSC can be speculated around */
615 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
616
617 /* Family 10 doesn't support C states in MWAIT so don't use it */
618 if (c->x86 == 0x10 && !force_mwait)
619 clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
620}
621
622static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
623{
624#ifdef CONFIG_SMP
625 u32 eax, ebx, ecx, edx;
626 int index_msb, core_bits;
627
628 cpuid(1, &eax, &ebx, &ecx, &edx);
629
630
631 if (!cpu_has(c, X86_FEATURE_HT))
632 return;
633 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
634 goto out;
635
636 smp_num_siblings = (ebx & 0xff0000) >> 16;
637
638 if (smp_num_siblings == 1) {
639 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
640 } else if (smp_num_siblings > 1 ) {
641
642 if (smp_num_siblings > NR_CPUS) {
643 printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
644 smp_num_siblings = 1;
645 return;
646 }
647
648 index_msb = get_count_order(smp_num_siblings);
649 c->phys_proc_id = phys_pkg_id(index_msb);
650
651 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
652
653 index_msb = get_count_order(smp_num_siblings) ;
654
655 core_bits = get_count_order(c->x86_max_cores);
656
657 c->cpu_core_id = phys_pkg_id(index_msb) &
658 ((1 << core_bits) - 1);
659 }
660out:
661 if ((c->x86_max_cores * smp_num_siblings) > 1) {
662 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
663 printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id);
664 }
665
666#endif
667}
668
669/*
670 * find out the number of processor cores on the die
671 */
672static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
673{
674 unsigned int eax, t;
675
676 if (c->cpuid_level < 4)
677 return 1;
678
679 cpuid_count(4, 0, &eax, &t, &t, &t);
680
681 if (eax & 0x1f)
682 return ((eax >> 26) + 1);
683 else
684 return 1;
685}
686
/*
 * Bind the currently running CPU to a NUMA node.
 *
 * The node is looked up in apicid_to_node[] by the hardware APIC id;
 * if that yields NUMA_NO_NODE the first online node is used instead.
 * Compiles to an empty function without CONFIG_NUMA.
 */
static void srat_detect_node(void)
{
#ifdef CONFIG_NUMA
	int cpu = smp_processor_id();
	int apicid = hard_smp_processor_id();
	unsigned node = apicid_to_node[apicid];

	/* Don't do the funky fallback heuristics the AMD version employs
	   for now. */
	if (node == NUMA_NO_NODE)
		node = first_node(node_online_map);

	numa_set_node(cpu, node);

	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
#endif
}
704
705static void __cpuinit init_intel(struct cpuinfo_x86 *c)
706{
707 /* Cache sizes */
708 unsigned n;
709
710 init_intel_cacheinfo(c);
711 if (c->cpuid_level > 9 ) {
712 unsigned eax = cpuid_eax(10);
713 /* Check for version and the number of counters */
714 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
715 set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
716 }
717
718 if (cpu_has_ds) {
719 unsigned int l1, l2;
720 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
721 if (!(l1 & (1<<11)))
722 set_bit(X86_FEATURE_BTS, c->x86_capability);
723 if (!(l1 & (1<<12)))
724 set_bit(X86_FEATURE_PEBS, c->x86_capability);
725 }
726
727 n = c->extended_cpuid_level;
728 if (n >= 0x80000008) {
729 unsigned eax = cpuid_eax(0x80000008);
730 c->x86_virt_bits = (eax >> 8) & 0xff;
731 c->x86_phys_bits = eax & 0xff;
732 /* CPUID workaround for Intel 0F34 CPU */
733 if (c->x86_vendor == X86_VENDOR_INTEL &&
734 c->x86 == 0xF && c->x86_model == 0x3 &&
735 c->x86_mask == 0x4)
736 c->x86_phys_bits = 36;
737 }
738
739 if (c->x86 == 15)
740 c->x86_cache_alignment = c->x86_clflush_size * 2;
741 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
742 (c->x86 == 0x6 && c->x86_model >= 0x0e))
743 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
744 if (c->x86 == 6)
745 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
746 if (c->x86 == 15)
747 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
748 else
749 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
750 c->x86_max_cores = intel_num_cpu_cores(c);
751
752 srat_detect_node();
753}
754
755static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
756{
757 char *v = c->x86_vendor_id;
758
759 if (!strcmp(v, "AuthenticAMD"))
760 c->x86_vendor = X86_VENDOR_AMD;
761 else if (!strcmp(v, "GenuineIntel"))
762 c->x86_vendor = X86_VENDOR_INTEL;
763 else
764 c->x86_vendor = X86_VENDOR_UNKNOWN;
765}
766
/*
 * Associates a vendor/family pair with up to 16 model name strings.
 * NOTE(review): presumably model_names[] is indexed by model number --
 * confirm against the users of this structure.
 */
struct cpu_model_info {
	int	vendor;			/* vendor identifier */
	int	family;			/* CPU family */
	char	*model_names[16];	/* one name string per slot */
};
772
773/* Do some early cpuid on the boot CPU to get some parameter that are
774 needed before check_bugs. Everything advanced is in identify_cpu
775 below. */
776void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
777{
778 u32 tfms;
779
780 c->loops_per_jiffy = loops_per_jiffy;
781 c->x86_cache_size = -1;
782 c->x86_vendor = X86_VENDOR_UNKNOWN;
783 c->x86_model = c->x86_mask = 0; /* So far unknown... */
784 c->x86_vendor_id[0] = '\0'; /* Unset */
785 c->x86_model_id[0] = '\0'; /* Unset */
786 c->x86_clflush_size = 64;
787 c->x86_cache_alignment = c->x86_clflush_size;
788 c->x86_max_cores = 1;
789 c->extended_cpuid_level = 0;
790 memset(&c->x86_capability, 0, sizeof c->x86_capability);
791
792 /* Get vendor name */
793 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
794 (unsigned int *)&c->x86_vendor_id[0],
795 (unsigned int *)&c->x86_vendor_id[8],
796 (unsigned int *)&c->x86_vendor_id[4]);
797
798 get_cpu_vendor(c);
799
800 /* Initialize the standard set of capabilities */
801 /* Note that the vendor-specific code below might override */
802
803 /* Intel-defined flags: level 0x00000001 */
804 if (c->cpuid_level >= 0x00000001) {
805 __u32 misc;
806 cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
807 &c->x86_capability[0]);
808 c->x86 = (tfms >> 8) & 0xf;
809 c->x86_model = (tfms >> 4) & 0xf;
810 c->x86_mask = tfms & 0xf;
811 if (c->x86 == 0xf)
812 c->x86 += (tfms >> 20) & 0xff;
813 if (c->x86 >= 0x6)
814 c->x86_model += ((tfms >> 16) & 0xF) << 4;
815 if (c->x86_capability[0] & (1<<19))
816 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
817 } else {
818 /* Have CPUID level 0 only - unheard of */
819 c->x86 = 4;
820 }
821
822#ifdef CONFIG_SMP
823 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
824#endif
825}
826
827/*
828 * This does the hard work of actually picking apart the CPU stuff...
829 */
830void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
831{
832 int i;
833 u32 xlvl;
834
835 early_identify_cpu(c);
836
837 /* AMD-defined flags: level 0x80000001 */
838 xlvl = cpuid_eax(0x80000000);
839 c->extended_cpuid_level = xlvl;
840 if ((xlvl & 0xffff0000) == 0x80000000) {
841 if (xlvl >= 0x80000001) {
842 c->x86_capability[1] = cpuid_edx(0x80000001);
843 c->x86_capability[6] = cpuid_ecx(0x80000001);
844 }
845 if (xlvl >= 0x80000004)
846 get_model_name(c); /* Default name */
847 }
848
849 /* Transmeta-defined flags: level 0x80860001 */
850 xlvl = cpuid_eax(0x80860000);
851 if ((xlvl & 0xffff0000) == 0x80860000) {
852 /* Don't set x86_cpuid_level here for now to not confuse. */
853 if (xlvl >= 0x80860001)
854 c->x86_capability[2] = cpuid_edx(0x80860001);
855 }
856
857 init_scattered_cpuid_features(c);
858
859 c->apicid = phys_pkg_id(0);
860
861 /*
862 * Vendor-specific initialization. In this section we
863 * canonicalize the feature flags, meaning if there are
864 * features a certain CPU supports which CPUID doesn't
865 * tell us, CPUID claiming incorrect flags, or other bugs,
866 * we handle them here.
867 *
868 * At the end of this section, c->x86_capability better
869 * indicate the features this CPU genuinely supports!
870 */
871 switch (c->x86_vendor) {
872 case X86_VENDOR_AMD:
873 init_amd(c);
874 break;
875
876 case X86_VENDOR_INTEL:
877 init_intel(c);
878 break;
879
880 case X86_VENDOR_UNKNOWN:
881 default:
882 display_cacheinfo(c);
883 break;
884 }
885
886 select_idle_routine(c);
887 detect_ht(c);
888
889 /*
890 * On SMP, boot_cpu_data holds the common feature set between
891 * all CPUs; so make sure that we indicate which features are
892 * common between the CPUs. The first time this routine gets
893 * executed, c == &boot_cpu_data.
894 */
895 if (c != &boot_cpu_data) {
896 /* AND the already accumulated flags with these */
897 for (i = 0 ; i < NCAPINTS ; i++)
898 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
899 }
900
901#ifdef CONFIG_X86_MCE
902 mcheck_init(c);
903#endif
904 if (c != &boot_cpu_data)
905 mtrr_ap_init();
906#ifdef CONFIG_NUMA
907 numa_add_cpu(smp_processor_id());
908#endif
909}
910
911
912void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
913{
914 if (c->x86_model_id[0])
915 printk("%s", c->x86_model_id);
916
917 if (c->x86_mask || c->cpuid_level >= 0)
918 printk(" stepping %02x\n", c->x86_mask);
919 else
920 printk("\n");
921}
922
923/*
924 * Get CPU information for use by the procfs.
925 */
926
927static int show_cpuinfo(struct seq_file *m, void *v)
928{
929 struct cpuinfo_x86 *c = v;
930
931 /*
932 * These flag bits must match the definitions in <asm/cpufeature.h>.
933 * NULL means this bit is undefined or reserved; either way it doesn't
934 * have meaning as far as Linux is concerned. Note that it's important
935 * to realize there is a difference between this table and CPUID -- if
936 * applications want to get the raw CPUID data, they should access
937 * /dev/cpu/<cpu_nr>/cpuid instead.
938 */
939 static char *x86_cap_flags[] = {
940 /* Intel-defined */
941 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
942 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
943 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
944 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
945
946 /* AMD-defined */
947 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
948 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
949 NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
950 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
951 "3dnowext", "3dnow",
952
953 /* Transmeta-defined */
954 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
955 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
956 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
957 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
958
959 /* Other (Linux-defined) */
960 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
961 NULL, NULL, NULL, NULL,
962 "constant_tsc", "up", NULL, "arch_perfmon",
963 "pebs", "bts", NULL, "sync_rdtsc",
964 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
965 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
966
967 /* Intel-defined (#2) */
968 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
969 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
970 NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
971 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
972
973 /* VIA/Cyrix/Centaur-defined */
974 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
975 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
976 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
977 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
978
979 /* AMD-defined (#2) */
980 "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy",
981 "altmovcr8", "abm", "sse4a",
982 "misalignsse", "3dnowprefetch",
983 "osvw", "ibs", NULL, NULL, NULL, NULL,
984 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
985 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
986
987 /* Auxiliary (Linux-defined) */
988 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
989 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
990 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
991 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
992 };
993 static char *x86_power_flags[] = {
994 "ts", /* temperature sensor */
995 "fid", /* frequency id control */
996 "vid", /* voltage id control */
997 "ttp", /* thermal trip */
998 "tm",
999 "stc",
1000 "100mhzsteps",
1001 "hwpstate",
1002 "", /* tsc invariant mapped to constant_tsc */
1003 /* nothing */
1004 };
1005
1006
1007#ifdef CONFIG_SMP
1008 if (!cpu_online(c-cpu_data))
1009 return 0;
1010#endif
1011
1012 seq_printf(m,"processor\t: %u\n"
1013 "vendor_id\t: %s\n"
1014 "cpu family\t: %d\n"
1015 "model\t\t: %d\n"
1016 "model name\t: %s\n",
1017 (unsigned)(c-cpu_data),
1018 c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
1019 c->x86,
1020 (int)c->x86_model,
1021 c->x86_model_id[0] ? c->x86_model_id : "unknown");
1022
1023 if (c->x86_mask || c->cpuid_level >= 0)
1024 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
1025 else
1026 seq_printf(m, "stepping\t: unknown\n");
1027
1028 if (cpu_has(c,X86_FEATURE_TSC)) {
1029 unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
1030 if (!freq)
1031 freq = cpu_khz;
1032 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
1033 freq / 1000, (freq % 1000));
1034 }
1035
1036 /* Cache size */
1037 if (c->x86_cache_size >= 0)
1038 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
1039
1040#ifdef CONFIG_SMP
1041 if (smp_num_siblings * c->x86_max_cores > 1) {
1042 int cpu = c - cpu_data;
1043 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
1044 seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
1045 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
1046 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
1047 }
1048#endif
1049
1050 seq_printf(m,
1051 "fpu\t\t: yes\n"
1052 "fpu_exception\t: yes\n"
1053 "cpuid level\t: %d\n"
1054 "wp\t\t: yes\n"
1055 "flags\t\t:",
1056 c->cpuid_level);
1057
1058 {
1059 int i;
1060 for ( i = 0 ; i < 32*NCAPINTS ; i++ )
1061 if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
1062 seq_printf(m, " %s", x86_cap_flags[i]);
1063 }
1064
1065 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
1066 c->loops_per_jiffy/(500000/HZ),
1067 (c->loops_per_jiffy/(5000/HZ)) % 100);
1068
1069 if (c->x86_tlbsize > 0)
1070 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
1071 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
1072 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
1073
1074 seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
1075 c->x86_phys_bits, c->x86_virt_bits);
1076
1077 seq_printf(m, "power management:");
1078 {
1079 unsigned i;
1080 for (i = 0; i < 32; i++)
1081 if (c->x86_power & (1 << i)) {
1082 if (i < ARRAY_SIZE(x86_power_flags) &&
1083 x86_power_flags[i])
1084 seq_printf(m, "%s%s",
1085 x86_power_flags[i][0]?" ":"",
1086 x86_power_flags[i]);
1087 else
1088 seq_printf(m, " [%d]", i);
1089 }
1090 }
1091
1092 seq_printf(m, "\n\n");
1093
1094 return 0;
1095}
1096
1097static void *c_start(struct seq_file *m, loff_t *pos)
1098{
1099 return *pos < NR_CPUS ? cpu_data + *pos : NULL;
1100}
1101
1102static void *c_next(struct seq_file *m, void *v, loff_t *pos)
1103{
1104 ++*pos;
1105 return c_start(m, pos);
1106}
1107
/* seq_file stop callback: does nothing. */
static void c_stop(struct seq_file *m, void *v)
{
	/* intentionally empty */
}
1111
1112struct seq_operations cpuinfo_op = {
1113 .start =c_start,
1114 .next = c_next,
1115 .stop = c_stop,
1116 .show = show_cpuinfo,
1117};
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
new file mode 100644
index 000000000000..739175b01e06
--- /dev/null
+++ b/arch/x86/kernel/signal_64.c
@@ -0,0 +1,495 @@
1/*
2 * linux/arch/x86_64/kernel/signal.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 *
7 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
8 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
9 * 2000-2002 x86-64 support by Andi Kleen
10 */
11
12#include <linux/sched.h>
13#include <linux/mm.h>
14#include <linux/smp.h>
15#include <linux/kernel.h>
16#include <linux/signal.h>
17#include <linux/errno.h>
18#include <linux/wait.h>
19#include <linux/ptrace.h>
20#include <linux/unistd.h>
21#include <linux/stddef.h>
22#include <linux/personality.h>
23#include <linux/compiler.h>
24#include <asm/ucontext.h>
25#include <asm/uaccess.h>
26#include <asm/i387.h>
27#include <asm/proto.h>
28#include <asm/ia32_unistd.h>
29#include <asm/mce.h>
30
31/* #define DEBUG_SIG 1 */
32
33#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
34
35int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
36 sigset_t *set, struct pt_regs * regs);
37int ia32_setup_frame(int sig, struct k_sigaction *ka,
38 sigset_t *set, struct pt_regs * regs);
39
40asmlinkage long
41sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
42 struct pt_regs *regs)
43{
44 return do_sigaltstack(uss, uoss, regs->rsp);
45}
46
47
48/*
49 * Do a signal return; undo the signal stack.
50 */
51
52struct rt_sigframe
53{
54 char __user *pretcode;
55 struct ucontext uc;
56 struct siginfo info;
57};
58
59static int
60restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax)
61{
62 unsigned int err = 0;
63
64 /* Always make any pending restarted system calls return -EINTR */
65 current_thread_info()->restart_block.fn = do_no_restart_syscall;
66
67#define COPY(x) err |= __get_user(regs->x, &sc->x)
68
69 COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx);
70 COPY(rdx); COPY(rcx); COPY(rip);
71 COPY(r8);
72 COPY(r9);
73 COPY(r10);
74 COPY(r11);
75 COPY(r12);
76 COPY(r13);
77 COPY(r14);
78 COPY(r15);
79
80 /* Kernel saves and restores only the CS segment register on signals,
81 * which is the bare minimum needed to allow mixed 32/64-bit code.
82 * App's signal handler can save/restore other segments if needed. */
83 {
84 unsigned cs;
85 err |= __get_user(cs, &sc->cs);
86 regs->cs = cs | 3; /* Force into user mode */
87 }
88
89 {
90 unsigned int tmpflags;
91 err |= __get_user(tmpflags, &sc->eflags);
92 regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
93 regs->orig_rax = -1; /* disable syscall checks */
94 }
95
96 {
97 struct _fpstate __user * buf;
98 err |= __get_user(buf, &sc->fpstate);
99
100 if (buf) {
101 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
102 goto badframe;
103 err |= restore_i387(buf);
104 } else {
105 struct task_struct *me = current;
106 if (used_math()) {
107 clear_fpu(me);
108 clear_used_math();
109 }
110 }
111 }
112
113 err |= __get_user(*prax, &sc->rax);
114 return err;
115
116badframe:
117 return 1;
118}
119
120asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
121{
122 struct rt_sigframe __user *frame;
123 sigset_t set;
124 unsigned long eax;
125
126 frame = (struct rt_sigframe __user *)(regs->rsp - 8);
127 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) {
128 goto badframe;
129 }
130 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) {
131 goto badframe;
132 }
133
134 sigdelsetmask(&set, ~_BLOCKABLE);
135 spin_lock_irq(&current->sighand->siglock);
136 current->blocked = set;
137 recalc_sigpending();
138 spin_unlock_irq(&current->sighand->siglock);
139
140 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
141 goto badframe;
142
143#ifdef DEBUG_SIG
144 printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax);
145#endif
146
147 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT)
148 goto badframe;
149
150 return eax;
151
152badframe:
153 signal_fault(regs,frame,"sigreturn");
154 return 0;
155}
156
157/*
158 * Set up a signal frame.
159 */
160
161static inline int
162setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
163{
164 int err = 0;
165
166 err |= __put_user(regs->cs, &sc->cs);
167 err |= __put_user(0, &sc->gs);
168 err |= __put_user(0, &sc->fs);
169
170 err |= __put_user(regs->rdi, &sc->rdi);
171 err |= __put_user(regs->rsi, &sc->rsi);
172 err |= __put_user(regs->rbp, &sc->rbp);
173 err |= __put_user(regs->rsp, &sc->rsp);
174 err |= __put_user(regs->rbx, &sc->rbx);
175 err |= __put_user(regs->rdx, &sc->rdx);
176 err |= __put_user(regs->rcx, &sc->rcx);
177 err |= __put_user(regs->rax, &sc->rax);
178 err |= __put_user(regs->r8, &sc->r8);
179 err |= __put_user(regs->r9, &sc->r9);
180 err |= __put_user(regs->r10, &sc->r10);
181 err |= __put_user(regs->r11, &sc->r11);
182 err |= __put_user(regs->r12, &sc->r12);
183 err |= __put_user(regs->r13, &sc->r13);
184 err |= __put_user(regs->r14, &sc->r14);
185 err |= __put_user(regs->r15, &sc->r15);
186 err |= __put_user(me->thread.trap_no, &sc->trapno);
187 err |= __put_user(me->thread.error_code, &sc->err);
188 err |= __put_user(regs->rip, &sc->rip);
189 err |= __put_user(regs->eflags, &sc->eflags);
190 err |= __put_user(mask, &sc->oldmask);
191 err |= __put_user(me->thread.cr2, &sc->cr2);
192
193 return err;
194}
195
196/*
197 * Determine which stack to use..
198 */
199
200static void __user *
201get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
202{
203 unsigned long rsp;
204
205 /* Default to using normal stack - redzone*/
206 rsp = regs->rsp - 128;
207
208 /* This is the X/Open sanctioned signal stack switching. */
209 if (ka->sa.sa_flags & SA_ONSTACK) {
210 if (sas_ss_flags(rsp) == 0)
211 rsp = current->sas_ss_sp + current->sas_ss_size;
212 }
213
214 return (void __user *)round_down(rsp - size, 16);
215}
216
217static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
218 sigset_t *set, struct pt_regs * regs)
219{
220 struct rt_sigframe __user *frame;
221 struct _fpstate __user *fp = NULL;
222 int err = 0;
223 struct task_struct *me = current;
224
225 if (used_math()) {
226 fp = get_stack(ka, regs, sizeof(struct _fpstate));
227 frame = (void __user *)round_down(
228 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
229
230 if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
231 goto give_sigsegv;
232
233 if (save_i387(fp) < 0)
234 err |= -1;
235 } else
236 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
237
238 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
239 goto give_sigsegv;
240
241 if (ka->sa.sa_flags & SA_SIGINFO) {
242 err |= copy_siginfo_to_user(&frame->info, info);
243 if (err)
244 goto give_sigsegv;
245 }
246
247 /* Create the ucontext. */
248 err |= __put_user(0, &frame->uc.uc_flags);
249 err |= __put_user(0, &frame->uc.uc_link);
250 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
251 err |= __put_user(sas_ss_flags(regs->rsp),
252 &frame->uc.uc_stack.ss_flags);
253 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
254 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
255 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
256 if (sizeof(*set) == 16) {
257 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
258 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
259 } else
260 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
261
262 /* Set up to return from userspace. If provided, use a stub
263 already in userspace. */
264 /* x86-64 should always use SA_RESTORER. */
265 if (ka->sa.sa_flags & SA_RESTORER) {
266 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
267 } else {
268 /* could use a vstub here */
269 goto give_sigsegv;
270 }
271
272 if (err)
273 goto give_sigsegv;
274
275#ifdef DEBUG_SIG
276 printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax);
277#endif
278
279 /* Set up registers for signal handler */
280 regs->rdi = sig;
281 /* In case the signal handler was declared without prototypes */
282 regs->rax = 0;
283
284 /* This also works for non SA_SIGINFO handlers because they expect the
285 next argument after the signal number on the stack. */
286 regs->rsi = (unsigned long)&frame->info;
287 regs->rdx = (unsigned long)&frame->uc;
288 regs->rip = (unsigned long) ka->sa.sa_handler;
289
290 regs->rsp = (unsigned long)frame;
291
292 /* Set up the CS register to run signal handlers in 64-bit mode,
293 even if the handler happens to be interrupting 32-bit code. */
294 regs->cs = __USER_CS;
295
296 /* This, by contrast, has nothing to do with segment registers -
297 see include/asm-x86_64/uaccess.h for details. */
298 set_fs(USER_DS);
299
300 regs->eflags &= ~TF_MASK;
301 if (test_thread_flag(TIF_SINGLESTEP))
302 ptrace_notify(SIGTRAP);
303#ifdef DEBUG_SIG
304 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n",
305 current->comm, current->pid, frame, regs->rip, frame->pretcode);
306#endif
307
308 return 0;
309
310give_sigsegv:
311 force_sigsegv(sig, current);
312 return -EFAULT;
313}
314
315/*
316 * OK, we're invoking a handler
317 */
318
319static int
320handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
321 sigset_t *oldset, struct pt_regs *regs)
322{
323 int ret;
324
325#ifdef DEBUG_SIG
326 printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n",
327 current->pid, sig,
328 regs->rip, regs->rsp, regs);
329#endif
330
331 /* Are we from a system call? */
332 if ((long)regs->orig_rax >= 0) {
333 /* If so, check system call restarting.. */
334 switch (regs->rax) {
335 case -ERESTART_RESTARTBLOCK:
336 case -ERESTARTNOHAND:
337 regs->rax = -EINTR;
338 break;
339
340 case -ERESTARTSYS:
341 if (!(ka->sa.sa_flags & SA_RESTART)) {
342 regs->rax = -EINTR;
343 break;
344 }
345 /* fallthrough */
346 case -ERESTARTNOINTR:
347 regs->rax = regs->orig_rax;
348 regs->rip -= 2;
349 break;
350 }
351 }
352
353 /*
354 * If TF is set due to a debugger (PT_DTRACE), clear the TF
355 * flag so that register information in the sigcontext is
356 * correct.
357 */
358 if (unlikely(regs->eflags & TF_MASK)) {
359 if (likely(current->ptrace & PT_DTRACE)) {
360 current->ptrace &= ~PT_DTRACE;
361 regs->eflags &= ~TF_MASK;
362 }
363 }
364
365#ifdef CONFIG_IA32_EMULATION
366 if (test_thread_flag(TIF_IA32)) {
367 if (ka->sa.sa_flags & SA_SIGINFO)
368 ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
369 else
370 ret = ia32_setup_frame(sig, ka, oldset, regs);
371 } else
372#endif
373 ret = setup_rt_frame(sig, ka, info, oldset, regs);
374
375 if (ret == 0) {
376 spin_lock_irq(&current->sighand->siglock);
377 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
378 if (!(ka->sa.sa_flags & SA_NODEFER))
379 sigaddset(&current->blocked,sig);
380 recalc_sigpending();
381 spin_unlock_irq(&current->sighand->siglock);
382 }
383
384 return ret;
385}
386
387/*
388 * Note that 'init' is a special process: it doesn't get signals it doesn't
389 * want to handle. Thus you cannot kill init even with a SIGKILL even by
390 * mistake.
391 */
392static void do_signal(struct pt_regs *regs)
393{
394 struct k_sigaction ka;
395 siginfo_t info;
396 int signr;
397 sigset_t *oldset;
398
399 /*
400 * We want the common case to go fast, which
401 * is why we may in certain cases get here from
402 * kernel mode. Just return without doing anything
403 * if so.
404 */
405 if (!user_mode(regs))
406 return;
407
408 if (test_thread_flag(TIF_RESTORE_SIGMASK))
409 oldset = &current->saved_sigmask;
410 else
411 oldset = &current->blocked;
412
413 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
414 if (signr > 0) {
415 /* Reenable any watchpoints before delivering the
416 * signal to user space. The processor register will
417 * have been cleared if the watchpoint triggered
418 * inside the kernel.
419 */
420 if (current->thread.debugreg7)
421 set_debugreg(current->thread.debugreg7, 7);
422
423 /* Whee! Actually deliver the signal. */
424 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
425 /* a signal was successfully delivered; the saved
426 * sigmask will have been stored in the signal frame,
427 * and will be restored by sigreturn, so we can simply
428 * clear the TIF_RESTORE_SIGMASK flag */
429 clear_thread_flag(TIF_RESTORE_SIGMASK);
430 }
431 return;
432 }
433
434 /* Did we come from a system call? */
435 if ((long)regs->orig_rax >= 0) {
436 /* Restart the system call - no handlers present */
437 long res = regs->rax;
438 switch (res) {
439 case -ERESTARTNOHAND:
440 case -ERESTARTSYS:
441 case -ERESTARTNOINTR:
442 regs->rax = regs->orig_rax;
443 regs->rip -= 2;
444 break;
445 case -ERESTART_RESTARTBLOCK:
446 regs->rax = test_thread_flag(TIF_IA32) ?
447 __NR_ia32_restart_syscall :
448 __NR_restart_syscall;
449 regs->rip -= 2;
450 break;
451 }
452 }
453
454 /* if there's no signal to deliver, we just put the saved sigmask
455 back. */
456 if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
457 clear_thread_flag(TIF_RESTORE_SIGMASK);
458 sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
459 }
460}
461
462void
463do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
464{
465#ifdef DEBUG_SIG
466 printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n",
467 thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current));
468#endif
469
470 /* Pending single-step? */
471 if (thread_info_flags & _TIF_SINGLESTEP) {
472 regs->eflags |= TF_MASK;
473 clear_thread_flag(TIF_SINGLESTEP);
474 }
475
476#ifdef CONFIG_X86_MCE
477 /* notify userspace of pending MCEs */
478 if (thread_info_flags & _TIF_MCE_NOTIFY)
479 mce_notify_user();
480#endif /* CONFIG_X86_MCE */
481
482 /* deal with pending signal delivery */
483 if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
484 do_signal(regs);
485}
486
487void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
488{
489 struct task_struct *me = current;
490 if (show_unhandled_signals && printk_ratelimit())
491 printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n",
492 me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax);
493
494 force_sig(SIGSEGV, me);
495}
diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/smp_64.c
new file mode 100644
index 000000000000..df4a82812adb
--- /dev/null
+++ b/arch/x86/kernel/smp_64.c
@@ -0,0 +1,523 @@
1/*
2 * Intel SMP support routines.
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
6 * (c) 2002,2003 Andi Kleen, SuSE Labs.
7 *
8 * This code is released under the GNU General Public License version 2 or
9 * later.
10 */
11
12#include <linux/init.h>
13
14#include <linux/mm.h>
15#include <linux/delay.h>
16#include <linux/spinlock.h>
17#include <linux/smp.h>
18#include <linux/kernel_stat.h>
19#include <linux/mc146818rtc.h>
20#include <linux/interrupt.h>
21
22#include <asm/mtrr.h>
23#include <asm/pgalloc.h>
24#include <asm/tlbflush.h>
25#include <asm/mach_apic.h>
26#include <asm/mmu_context.h>
27#include <asm/proto.h>
28#include <asm/apicdef.h>
29#include <asm/idle.h>
30
31/*
32 * Smarter SMP flushing macros.
33 * c/o Linus Torvalds.
34 *
35 * These mean you can really definitely utterly forget about
 * writing to user space from interrupts. (It's not allowed anyway).
37 *
38 * Optimizations Manfred Spraul <manfred@colorfullife.com>
39 *
40 * More scalable flush, from Andi Kleen
41 *
42 * To avoid global state use 8 different call vectors.
43 * Each CPU uses a specific vector to trigger flushes on other
44 * CPUs. Depending on the received vector the target CPUs look into
45 * the right per cpu variable for the flush data.
46 *
47 * With more than 8 CPUs they are hashed to the 8 available
48 * vectors. The limited global vector space forces us to this right now.
49 * In future when interrupts are split into per CPU domains this could be
50 * fixed, at the cost of triggering multiple IPIs in some cases.
51 */
52
53union smp_flush_state {
54 struct {
55 cpumask_t flush_cpumask;
56 struct mm_struct *flush_mm;
57 unsigned long flush_va;
58#define FLUSH_ALL -1ULL
59 spinlock_t tlbstate_lock;
60 };
61 char pad[SMP_CACHE_BYTES];
62} ____cacheline_aligned;
63
64/* State is put into the per CPU data section, but padded
65 to a full cache line because other CPUs can access it and we don't
66 want false sharing in the per cpu data segment. */
67static DEFINE_PER_CPU(union smp_flush_state, flush_state);
68
69/*
70 * We cannot call mmdrop() because we are in interrupt context,
71 * instead update mm->cpu_vm_mask.
72 */
73static inline void leave_mm(int cpu)
74{
75 if (read_pda(mmu_state) == TLBSTATE_OK)
76 BUG();
77 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
78 load_cr3(swapper_pg_dir);
79}
80
81/*
82 *
83 * The flush IPI assumes that a thread switch happens in this order:
84 * [cpu0: the cpu that switches]
85 * 1) switch_mm() either 1a) or 1b)
86 * 1a) thread switch to a different mm
87 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
88 * Stop ipi delivery for the old mm. This is not synchronized with
 * the other cpus, but smp_invalidate_interrupt ignores flush ipis
90 * for the wrong mm, and in the worst case we perform a superfluous
91 * tlb flush.
92 * 1a2) set cpu mmu_state to TLBSTATE_OK
93 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
94 * was in lazy tlb mode.
95 * 1a3) update cpu active_mm
96 * Now cpu0 accepts tlb flushes for the new mm.
97 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
98 * Now the other cpus will send tlb flush ipis.
 * 1a5) change cr3.
100 * 1b) thread switch without mm change
101 * cpu active_mm is correct, cpu0 already handles
102 * flush ipis.
103 * 1b1) set cpu mmu_state to TLBSTATE_OK
104 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
105 * Atomically set the bit [other cpus will start sending flush ipis],
106 * and test the bit.
107 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
108 * 2) switch %%esp, ie current
109 *
110 * The interrupt must handle 2 special cases:
111 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
112 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
113 * runs in kernel space, the cpu could load tlb entries for user space
114 * pages.
115 *
116 * The good news is that cpu mmu_state is local to each cpu, no
117 * write/read ordering problems.
118 */
119
/*
 * TLB flush IPI:
 *
 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
 * 2) Leave the mm if we are in the lazy tlb mode.
 *
 * Interrupts are disabled.
 *
 * @regs: interrupt frame; orig_rax identifies which flush vector fired.
 */

asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
{
	int cpu;
	int sender;
	union smp_flush_state *f;

	cpu = smp_processor_id();
	/*
	 * orig_rax contains the bitwise complement of the interrupt vector
	 * (hence the ~).  The offset from INVALIDATE_TLB_VECTOR_START
	 * selects the flush_state slot the sender filled in.
	 */
	sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
	f = &per_cpu(flush_state, sender);

	if (!cpu_isset(cpu, f->flush_cpumask))
		goto out;
		/*
		 * This was a BUG() but until someone can quote me the
		 * line from the intel manual that guarantees an IPI to
		 * multiple CPUs is retried _only_ on the erroring CPUs
		 * it's staying as a return
		 *
		 * BUG();
		 */

	/* Requests for an mm this CPU is not using are ignored. */
	if (f->flush_mm == read_pda(active_mm)) {
		if (read_pda(mmu_state) == TLBSTATE_OK) {
			if (f->flush_va == FLUSH_ALL)
				local_flush_tlb();
			else
				__flush_tlb_one(f->flush_va);
		} else
			leave_mm(cpu);
	}
out:
	ack_APIC_irq();
	/* The sender in flush_tlb_others() spins until this mask is empty. */
	cpu_clear(cpu, f->flush_cpumask);
}
167
/*
 * Ask the CPUs in @cpumask to flush @va (or everything when @va is
 * FLUSH_ALL) for @mm, and spin until all of them have responded.
 *
 * The request is published in the flush_state slot selected by
 * smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; the same index picks
 * the IPI vector, which is how the receivers find the slot.
 */
static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
						unsigned long va)
{
	int sender;
	union smp_flush_state *f;

	/* Caller has disabled preemption */
	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
	f = &per_cpu(flush_state, sender);

	/* Could avoid this lock when
	   num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
	   probably not worth checking this for a cache-hot lock. */
	spin_lock(&f->tlbstate_lock);

	/* Fill in the request before the target bits become visible. */
	f->flush_mm = mm;
	f->flush_va = va;
	cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);

	/*
	 * We have to send the IPI only to
	 * CPUs affected.
	 */
	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);

	/* Each receiver clears its own bit in smp_invalidate_interrupt(). */
	while (!cpus_empty(f->flush_cpumask))
		cpu_relax();

	f->flush_mm = NULL;
	f->flush_va = 0;
	spin_unlock(&f->tlbstate_lock);
}
200
201int __cpuinit init_smp_flush(void)
202{
203 int i;
204 for_each_cpu_mask(i, cpu_possible_map) {
205 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
206 }
207 return 0;
208}
209
210core_initcall(init_smp_flush);
211
212void flush_tlb_current_task(void)
213{
214 struct mm_struct *mm = current->mm;
215 cpumask_t cpu_mask;
216
217 preempt_disable();
218 cpu_mask = mm->cpu_vm_mask;
219 cpu_clear(smp_processor_id(), cpu_mask);
220
221 local_flush_tlb();
222 if (!cpus_empty(cpu_mask))
223 flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
224 preempt_enable();
225}
226EXPORT_SYMBOL(flush_tlb_current_task);
227
228void flush_tlb_mm (struct mm_struct * mm)
229{
230 cpumask_t cpu_mask;
231
232 preempt_disable();
233 cpu_mask = mm->cpu_vm_mask;
234 cpu_clear(smp_processor_id(), cpu_mask);
235
236 if (current->active_mm == mm) {
237 if (current->mm)
238 local_flush_tlb();
239 else
240 leave_mm(smp_processor_id());
241 }
242 if (!cpus_empty(cpu_mask))
243 flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
244
245 preempt_enable();
246}
247EXPORT_SYMBOL(flush_tlb_mm);
248
249void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
250{
251 struct mm_struct *mm = vma->vm_mm;
252 cpumask_t cpu_mask;
253
254 preempt_disable();
255 cpu_mask = mm->cpu_vm_mask;
256 cpu_clear(smp_processor_id(), cpu_mask);
257
258 if (current->active_mm == mm) {
259 if(current->mm)
260 __flush_tlb_one(va);
261 else
262 leave_mm(smp_processor_id());
263 }
264
265 if (!cpus_empty(cpu_mask))
266 flush_tlb_others(cpu_mask, mm, va);
267
268 preempt_enable();
269}
270EXPORT_SYMBOL(flush_tlb_page);
271
/*
 * Per-CPU worker for flush_tlb_all(): flush the whole local TLB and, if
 * this CPU is in TLBSTATE_LAZY, leave the mm it is still attached to.
 *
 * @info: not used.
 */
static void do_flush_tlb_all(void* info)
{
	unsigned long cpu = smp_processor_id();

	__flush_tlb_all();
	if (read_pda(mmu_state) == TLBSTATE_LAZY)
		leave_mm(cpu);
}
280
/*
 * Run do_flush_tlb_all() via on_each_cpu().
 * NOTE(review): the trailing "1, 1" are presumably on_each_cpu()'s
 * retry and wait arguments - confirm against its declaration.
 */
void flush_tlb_all(void)
{
	on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
}
285
/*
 * this function sends a 'reschedule' IPI to another CPU.
 * it goes straight through and wastes no time serializing
 * anything. Worst case is that we lose a reschedule ...
 *
 * @cpu: CPU that receives RESCHEDULE_VECTOR.
 */

void smp_send_reschedule(int cpu)
{
	send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
}
296
/*
 * Structure and data for smp_call_function(). This is designed to minimise
 * static memory requirements. It also looks cleaner.
 */
/* Taken around every cross-CPU function call issued from this file. */
static DEFINE_SPINLOCK(call_lock);

struct call_data_struct {
	void (*func) (void *info);	/* function the target CPUs run */
	void *info;			/* argument handed to func */
	atomic_t started;		/* bumped by each target once it has copied the fields */
	atomic_t finished;		/* bumped by each target after func returned (wait only) */
	int wait;			/* non-zero: initiator also waits for 'finished' */
};

/* Request currently in flight; points into the initiator's stack frame. */
static struct call_data_struct * call_data;
312
/* Take call_lock with interrupts disabled (spin_lock_irq). */
void lock_ipi_call_lock(void)
{
	spin_lock_irq(&call_lock);
}
317
/* Release call_lock and re-enable interrupts (spin_unlock_irq). */
void unlock_ipi_call_lock(void)
{
	spin_unlock_irq(&call_lock);
}
322
/*
 * this function sends a 'generic call function' IPI to one other CPU
 * in the system.
 *
 * cpu is a standard Linux logical CPU number.
 *
 * The request lives on this function's stack and is published through
 * call_data.  nonatomic is not used.  With wait == 0 the function returns
 * as soon as the target has picked up the request; otherwise it also
 * waits for func to finish there.
 */
static void
__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
				int nonatomic, int wait)
{
	struct call_data_struct data;
	int cpus = 1;

	data.func = func;
	data.info = info;
	atomic_set(&data.started, 0);
	data.wait = wait;
	if (wait)
		atomic_set(&data.finished, 0);

	call_data = &data;
	/* Make the request visible before the IPI is sent. */
	wmb();
	/* Send a message to the target CPU and wait for it to respond */
	send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);

	/* Wait for response: 'data' must stay alive until the target copied it */
	while (atomic_read(&data.started) != cpus)
		cpu_relax();

	if (!wait)
		return;

	while (atomic_read(&data.finished) != cpus)
		cpu_relax();
}
358
/*
 * smp_call_function_single - Run a function on a specific CPU
 * @cpu: The CPU to run @func on.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @nonatomic: Currently unused.
 * @wait: If true, wait until function has completed on the other CPU.
 *
 * Returns 0; this implementation has no failure path.
 *
 * Does not return until the remote CPU is nearly ready to execute <func>
 * or is or has executed.
 *
 * When @cpu is the calling CPU, @func is simply run locally with
 * interrupts disabled.
 */

int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
	int nonatomic, int wait)
{
	/* prevent preemption and reschedule on another processor */
	int me = get_cpu();

	/* Can deadlock when called with interrupts disabled */
	WARN_ON(irqs_disabled());

	if (cpu == me) {
		local_irq_disable();
		func(info);
		local_irq_enable();
		put_cpu();
		return 0;
	}

	spin_lock(&call_lock);
	__smp_call_function_single(cpu, func, info, nonatomic, wait);
	spin_unlock(&call_lock);
	put_cpu();
	return 0;
}
EXPORT_SYMBOL(smp_call_function_single);
396
/*
 * this function sends a 'generic call function' IPI to all other CPUs
 * in the system.
 *
 * Returns immediately when no other CPU is online.  The request lives on
 * this function's stack and is published through call_data.  nonatomic is
 * not used.  With wait == 0 the function returns once every target has
 * picked up the request; otherwise it also waits for func to finish
 * everywhere.
 */
static void __smp_call_function (void (*func) (void *info), void *info,
				int nonatomic, int wait)
{
	struct call_data_struct data;
	int cpus = num_online_cpus()-1;

	if (!cpus)
		return;

	data.func = func;
	data.info = info;
	atomic_set(&data.started, 0);
	data.wait = wait;
	if (wait)
		atomic_set(&data.finished, 0);

	call_data = &data;
	/* Make the request visible before the IPI is sent. */
	wmb();
	/* Send a message to all other CPUs and wait for them to respond */
	send_IPI_allbutself(CALL_FUNCTION_VECTOR);

	/* Wait for response: 'data' must stay alive until every target copied it */
	while (atomic_read(&data.started) != cpus)
		cpu_relax();

	if (!wait)
		return;

	while (atomic_read(&data.finished) != cpus)
		cpu_relax();
}
432
/*
 * smp_call_function - run a function on all other CPUs.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @nonatomic: currently unused.
 * @wait: If true, wait (atomically) until function has completed on other
 *        CPUs.
 *
 * Returns 0; this implementation has no failure path. Does not return until
 * remote CPUs are nearly ready to execute func or are or have executed.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler.
 * Actually there are a few legal cases, like panic.
 */
int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
			int wait)
{
	/* call_lock serializes users of the single call_data pointer. */
	spin_lock(&call_lock);
	__smp_call_function(func,info,nonatomic,wait);
	spin_unlock(&call_lock);
	return 0;
}
EXPORT_SYMBOL(smp_call_function);
457
/*
 * Take the calling CPU out of service: disable interrupts, clear it from
 * cpu_online_map, disable its local APIC and halt forever.  Does not
 * return.
 *
 * @dummy: not used.
 */
static void stop_this_cpu(void *dummy)
{
	local_irq_disable();
	/*
	 * Remove this CPU:
	 */
	cpu_clear(smp_processor_id(), cpu_online_map);
	disable_local_APIC();
	for (;;)
		halt();
}
469
/*
 * Make every other CPU run stop_this_cpu(), then disable the local APIC
 * of the calling CPU.  Does nothing when reboot_force is set.
 */
void smp_send_stop(void)
{
	int nolock;
	unsigned long flags;

	if (reboot_force)
		return;

	/*
	 * Don't deadlock on the call lock in panic: if it cannot be taken
	 * the request is sent without holding it.
	 */
	nolock = !spin_trylock(&call_lock);
	local_irq_save(flags);
	/* wait == 0: the targets never return from stop_this_cpu() */
	__smp_call_function(stop_this_cpu, NULL, 0, 0);
	if (!nolock)
		spin_unlock(&call_lock);
	disable_local_APIC();
	local_irq_restore(flags);
}
487
/*
 * Reschedule call back. Nothing to do,
 * all the work is done automatically when
 * we return from the interrupt.
 * The handler only acknowledges the IPI at the local APIC.
 */
asmlinkage void smp_reschedule_interrupt(void)
{
	ack_APIC_irq();
}
497
/*
 * Handler for the 'generic call function' IPI: run the function published
 * through call_data on this CPU and report progress to the initiator via
 * the started/finished counters.
 */
asmlinkage void smp_call_function_interrupt(void)
{
	/*
	 * Copy everything needed out of call_data first: once 'started' is
	 * incremented the initiator may return and its data go away.
	 */
	void (*func) (void *info) = call_data->func;
	void *info = call_data->info;
	int wait = call_data->wait;

	ack_APIC_irq();
	/*
	 * Notify initiating CPU that I've grabbed the data and am
	 * about to execute the function
	 */
	mb();
	atomic_inc(&call_data->started);
	/*
	 * At this point the info structure may be out of scope unless wait==1
	 */
	exit_idle();
	irq_enter();
	(*func)(info);
	irq_exit();
	if (wait) {
		/* Order the effects of func before the completion signal. */
		mb();
		atomic_inc(&call_data->finished);
	}
}
523
diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c
new file mode 100644
index 000000000000..32f50783edc8
--- /dev/null
+++ b/arch/x86/kernel/smpboot_64.c
@@ -0,0 +1,1085 @@
1/*
2 * x86 SMP booting functions
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6 * Copyright 2001 Andi Kleen, SuSE Labs.
7 *
8 * Much of the core SMP work is based on previous work by Thomas Radke, to
9 * whom a great many thanks are extended.
10 *
11 * Thanks to Intel for making available several different Pentium,
12 * Pentium Pro and Pentium-II/Xeon MP machines.
13 * Original development of Linux SMP code supported by Caldera.
14 *
15 * This code is released under the GNU General Public License version 2
16 *
17 * Fixes
18 * Felix Koop : NR_CPUS used properly
19 * Jose Renau : Handle single CPU case.
20 * Alan Cox : By repeated request 8) - Total BogoMIP report.
21 * Greg Wright : Fix for kernel stacks panic.
22 * Erich Boleyn : MP v1.4 and additional changes.
23 * Matthias Sattler : Changes for 2.1 kernel map.
24 * Michel Lespinasse : Changes for 2.1 kernel map.
25 * Michael Chastain : Change trampoline.S to gnu as.
26 * Alan Cox : Dumb bug: 'B' step PPro's are fine
27 * Ingo Molnar : Added APIC timers, based on code
28 * from Jose Renau
29 * Ingo Molnar : various cleanups and rewrites
30 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
31 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
32 * Andi Kleen : Changed for SMP boot into long mode.
33 * Rusty Russell : Hacked into shape for new "hotplug" boot process.
34 * Andi Kleen : Converted to new state machine.
35 * Various cleanups.
36 * Probably mostly hotplug CPU ready now.
37 * Ashok Raj : CPU hotplug support
38 */
39
40
41#include <linux/init.h>
42
43#include <linux/mm.h>
44#include <linux/kernel_stat.h>
45#include <linux/bootmem.h>
46#include <linux/thread_info.h>
47#include <linux/module.h>
48#include <linux/delay.h>
49#include <linux/mc146818rtc.h>
50#include <linux/smp.h>
51#include <linux/kdebug.h>
52
53#include <asm/mtrr.h>
54#include <asm/pgalloc.h>
55#include <asm/desc.h>
56#include <asm/tlbflush.h>
57#include <asm/proto.h>
58#include <asm/nmi.h>
59#include <asm/irq.h>
60#include <asm/hw_irq.h>
61#include <asm/numa.h>
62
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
EXPORT_SYMBOL(smp_num_siblings);

/* Last level cache ID of each logical CPU; BAD_APICID until it is known */
u8 cpu_llc_id[NR_CPUS] __cpuinitdata  = {[0 ... NR_CPUS-1] = BAD_APICID};

/* Bitmask of currently online CPUs */
cpumask_t cpu_online_map __read_mostly;

EXPORT_SYMBOL(cpu_online_map);

/*
 * Private maps to synchronize booting between AP and BP.
 * Probably not needed anymore, but it makes for easier debugging. -AK
 *
 * cpu_callin_map:  bit set by the AP itself at the end of smp_callin().
 * cpu_callout_map: bit set by do_boot_cpu() once the wakeup IPIs were sent.
 */
cpumask_t cpu_callin_map;
cpumask_t cpu_callout_map;
EXPORT_SYMBOL(cpu_callout_map);

/* Bitmask of possible CPUs; do_boot_cpu() clears a CPU that failed to boot */
cpumask_t cpu_possible_map;
EXPORT_SYMBOL(cpu_possible_map);

/* Per CPU bogomips and other parameters */
struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
EXPORT_SYMBOL(cpu_data);

/* Set when the idlers are all forked */
int smp_threads_ready;

/* representing HT siblings of each logical CPU */
cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(cpu_sibling_map);

/* representing HT and core siblings of each logical CPU */
cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(cpu_core_map);

/*
 * Trampoline 80x86 program as an array.
 * setup_trampoline() copies the bytes between these two symbols.
 */

extern unsigned char trampoline_data[];
extern unsigned char trampoline_end[];

/* State of each CPU */
DEFINE_PER_CPU(int, cpu_state) = { 0 };

/*
 * Store all idle threads, this can be reused instead of creating
 * a new thread. Also avoids complicated thread destroy functionality
 * for idle threads.
 */
struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;

/* Accessors for the idle thread slot of CPU x */
#define get_idle_for_cpu(x)     (idle_thread_array[(x)])
#define set_idle_for_cpu(x,p)   (idle_thread_array[(x)] = (p))
120
121/*
122 * Currently trivial. Write the real->protected mode
123 * bootstrap into the page concerned. The caller
124 * has made sure it's suitably aligned.
125 */
126
127static unsigned long __cpuinit setup_trampoline(void)
128{
129 void *tramp = __va(SMP_TRAMPOLINE_BASE);
130 memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);
131 return virt_to_phys(tramp);
132}
133
134/*
135 * The bootstrap kernel entry code has set these up. Save them for
136 * a given CPU
137 */
138
139static void __cpuinit smp_store_cpu_info(int id)
140{
141 struct cpuinfo_x86 *c = cpu_data + id;
142
143 *c = boot_cpu_data;
144 identify_cpu(c);
145 print_cpu_info(c);
146}
147
148static atomic_t init_deasserted __cpuinitdata;
149
150/*
151 * Report back to the Boot Processor.
152 * Running on AP.
153 */
154void __cpuinit smp_callin(void)
155{
156 int cpuid, phys_id;
157 unsigned long timeout;
158
159 /*
160 * If waken up by an INIT in an 82489DX configuration
161 * we may get here before an INIT-deassert IPI reaches
162 * our local APIC. We have to wait for the IPI or we'll
163 * lock up on an APIC access.
164 */
165 while (!atomic_read(&init_deasserted))
166 cpu_relax();
167
168 /*
169 * (This works even if the APIC is not enabled.)
170 */
171 phys_id = GET_APIC_ID(apic_read(APIC_ID));
172 cpuid = smp_processor_id();
173 if (cpu_isset(cpuid, cpu_callin_map)) {
174 panic("smp_callin: phys CPU#%d, CPU#%d already present??\n",
175 phys_id, cpuid);
176 }
177 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
178
179 /*
180 * STARTUP IPIs are fragile beasts as they might sometimes
181 * trigger some glue motherboard logic. Complete APIC bus
182 * silence for 1 second, this overestimates the time the
183 * boot CPU is spending to send the up to 2 STARTUP IPIs
184 * by a factor of two. This should be enough.
185 */
186
187 /*
188 * Waiting 2s total for startup (udelay is not yet working)
189 */
190 timeout = jiffies + 2*HZ;
191 while (time_before(jiffies, timeout)) {
192 /*
193 * Has the boot CPU finished it's STARTUP sequence?
194 */
195 if (cpu_isset(cpuid, cpu_callout_map))
196 break;
197 cpu_relax();
198 }
199
200 if (!time_before(jiffies, timeout)) {
201 panic("smp_callin: CPU%d started up but did not get a callout!\n",
202 cpuid);
203 }
204
205 /*
206 * the boot CPU has finished the init stage and is spinning
207 * on callin_map until we finish. We are free to set up this
208 * CPU, first the APIC. (this is probably redundant on most
209 * boards)
210 */
211
212 Dprintk("CALLIN, before setup_local_APIC().\n");
213 setup_local_APIC();
214
215 /*
216 * Get our bogomips.
217 *
218 * Need to enable IRQs because it can take longer and then
219 * the NMI watchdog might kill us.
220 */
221 local_irq_enable();
222 calibrate_delay();
223 local_irq_disable();
224 Dprintk("Stack at about %p\n",&cpuid);
225
226 disable_APIC_timer();
227
228 /*
229 * Save our processor parameters
230 */
231 smp_store_cpu_info(cpuid);
232
233 /*
234 * Allow the master to continue.
235 */
236 cpu_set(cpuid, cpu_callin_map);
237}
238
239/* maps the cpu to the sched domain representing multi-core */
240cpumask_t cpu_coregroup_map(int cpu)
241{
242 struct cpuinfo_x86 *c = cpu_data + cpu;
243 /*
244 * For perf, we return last level cache shared map.
245 * And for power savings, we return cpu_core_map
246 */
247 if (sched_mc_power_savings || sched_smt_power_savings)
248 return cpu_core_map[cpu];
249 else
250 return c->llc_shared_map;
251}
252
/* representing cpus for which sibling maps can be computed */
static cpumask_t cpu_sibling_setup_map;

/*
 * Add @cpu to the sibling, core and last-level-cache maps and keep the
 * booted_cores counters of its package up to date.  Only CPUs already in
 * cpu_sibling_setup_map (including @cpu itself) are considered.
 */
static inline void set_cpu_sibling_map(int cpu)
{
	int i;
	struct cpuinfo_x86 *c = cpu_data;

	cpu_set(cpu, cpu_sibling_setup_map);

	if (smp_num_siblings > 1) {
		/* Same package and same core id: link both ways in every map. */
		for_each_cpu_mask(i, cpu_sibling_setup_map) {
			if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
			    c[cpu].cpu_core_id == c[i].cpu_core_id) {
				cpu_set(i, cpu_sibling_map[cpu]);
				cpu_set(cpu, cpu_sibling_map[i]);
				cpu_set(i, cpu_core_map[cpu]);
				cpu_set(cpu, cpu_core_map[i]);
				cpu_set(i, c[cpu].llc_shared_map);
				cpu_set(cpu, c[i].llc_shared_map);
			}
		}
	} else {
		cpu_set(cpu, cpu_sibling_map[cpu]);
	}

	cpu_set(cpu, c[cpu].llc_shared_map);

	/* Single-core package: the core map is just the sibling map. */
	if (current_cpu_data.x86_max_cores == 1) {
		cpu_core_map[cpu] = cpu_sibling_map[cpu];
		c[cpu].booted_cores = 1;
		return;
	}

	for_each_cpu_mask(i, cpu_sibling_setup_map) {
		/* Equal, valid last level cache ids share the cache. */
		if (cpu_llc_id[cpu] != BAD_APICID &&
		    cpu_llc_id[cpu] == cpu_llc_id[i]) {
			cpu_set(i, c[cpu].llc_shared_map);
			cpu_set(cpu, c[i].llc_shared_map);
		}
		if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
			cpu_set(i, cpu_core_map[cpu]);
			cpu_set(cpu, cpu_core_map[i]);
			/*
			 *  Does this new cpu bringup a new core?
			 */
			if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
				/*
				 * for each core in package, increment
				 * the booted_cores for this new cpu
				 */
				if (first_cpu(cpu_sibling_map[i]) == i)
					c[cpu].booted_cores++;
				/*
				 * increment the core count for all
				 * the other cpus in this package
				 */
				if (i != cpu)
					c[i].booted_cores++;
			} else if (i != cpu && !c[cpu].booted_cores)
				/* Not a new core: inherit the package's count. */
				c[cpu].booted_cores = c[i].booted_cores;
		}
	}
}
317
/*
 * Setup code on secondary processor (after coming out of the trampoline)
 *
 * Calls in to the boot CPU, sets up the APIC clock and sibling maps,
 * marks the CPU online under call_lock and vector_lock, and then enters
 * cpu_idle().
 */
void __cpuinit start_secondary(void)
{
	/*
	 * Don't put anything before smp_callin(), SMP
	 * booting is too fragile that we want to limit the
	 * things done here to the most necessary things.
	 */
	cpu_init();
	preempt_disable();
	smp_callin();

	/* otherwise gcc will move up the smp_processor_id before the cpu_init */
	barrier();

	/*
  	 * Check TSC sync first:
 	 */
	check_tsc_sync_target();

	Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
	setup_secondary_APIC_clock();

	Dprintk("cpu %d: enabling apic timer\n", smp_processor_id());

	if (nmi_watchdog == NMI_IO_APIC) {
		disable_8259A_irq(0);
		enable_NMI_through_LVT0(NULL);
		enable_8259A_irq(0);
	}

	enable_APIC_timer();

	/*
	 * The sibling maps must be set before turning the online map on for
	 * this cpu
	 */
	set_cpu_sibling_map(smp_processor_id());

	/*
	 * We need to hold call_lock, so there is no inconsistency
	 * between the time smp_call_function() determines number of
	 * IPI recipients, and the time when the determination is made
	 * for which cpus receive the IPI in genapic_flat.c. Holding this
	 * lock helps us to not include this cpu in a currently in progress
	 * smp_call_function().
	 */
	lock_ipi_call_lock();
	spin_lock(&vector_lock);

	/* Setup the per cpu irq handling data structures */
	__setup_vector_irq(smp_processor_id());
	/*
	 * Allow the master to continue.
	 */
	cpu_set(smp_processor_id(), cpu_online_map);
	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
	spin_unlock(&vector_lock);

	unlock_ipi_call_lock();

	cpu_idle();
}
383
384extern volatile unsigned long init_rsp;
385extern void (*initial_code)(void);
386
#ifdef APIC_DEBUG
/*
 * Debug helper: query the ID, VERSION and SPIV registers of the APIC
 * @apicid with remote-read requests and print each result, or "failed"
 * when no valid answer arrives.
 */
static void inquire_remote_apic(int apicid)
{
	unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
	/* Parallel to regs[]; read-only string table. */
	static const char * const names[] = { "ID", "VERSION", "SPIV" };
	int timeout;
	unsigned int status;

	printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);

	for (i = 0; i < ARRAY_SIZE(regs); i++) {
		printk("... APIC #%d %s: ", apicid, names[i]);

		/*
		 * Wait for idle.
		 */
		status = safe_apic_wait_icr_idle();
		if (status)
			printk("a previous APIC delivery may have failed\n");

		apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
		apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);

		/* Poll in 100us steps while the remote read is in progress. */
		timeout = 0;
		do {
			udelay(100);
			status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
		} while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);

		switch (status) {
		case APIC_ICR_RR_VALID:
			status = apic_read(APIC_RRR);
			printk("%08x\n", status);
			break;
		default:
			printk("failed\n");
			break;
		}
	}
}
#endif
427
/*
 * Kick the secondary to wake up.
 *
 * Sends an INIT assert, waits 10ms, sends the INIT de-assert, sets
 * init_deasserted, and then issues up to two STARTUP IPIs.
 *
 * @phys_apicid: APIC id written to the ICR destination field
 * @start_rip:   start address for the AP; only (start_rip >> 12) is
 *               put into the STARTUP IPI
 *
 * Returns send_status | accept_status of the last attempt, i.e. 0 when
 * neither the send nor the accept step reported an error.
 */
static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
{
	unsigned long send_status, accept_status = 0;
	int maxlvt, num_starts, j;

	Dprintk("Asserting INIT.\n");

	/*
	 * Turn INIT on target chip
	 */
	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));

	/*
	 * Send IPI
	 */
	apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
				| APIC_DM_INIT);

	Dprintk("Waiting for send to finish...\n");
	send_status = safe_apic_wait_icr_idle();

	mdelay(10);

	Dprintk("Deasserting INIT.\n");

	/* Target chip */
	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));

	/* Send IPI */
	apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);

	Dprintk("Waiting for send to finish...\n");
	send_status = safe_apic_wait_icr_idle();

	/* Release the AP that is spinning on init_deasserted in smp_callin(). */
	mb();
	atomic_set(&init_deasserted, 1);

	num_starts = 2;

	/*
	 * Run STARTUP IPI loop.
	 */
	Dprintk("#startup loops: %d.\n", num_starts);

	maxlvt = get_maxlvt();

	for (j = 1; j <= num_starts; j++) {
		Dprintk("Sending STARTUP #%d.\n",j);
		/* Clear the error status register before the attempt. */
		apic_write(APIC_ESR, 0);
		apic_read(APIC_ESR);
		Dprintk("After apic_write.\n");

		/*
		 * STARTUP IPI
		 */

		/* Target chip */
		apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));

		/* Boot on the stack */
		/* Kick the second */
		apic_write(APIC_ICR, APIC_DM_STARTUP | (start_rip >> 12));

		/*
		 * Give the other CPU some time to accept the IPI.
		 */
		udelay(300);

		Dprintk("Startup point 1.\n");

		Dprintk("Waiting for send to finish...\n");
		send_status = safe_apic_wait_icr_idle();

		/*
		 * Give the other CPU some time to accept the IPI.
		 */
		udelay(200);
		/*
		 * Due to the Pentium erratum 3AP.
		 */
		if (maxlvt > 3) {
			apic_write(APIC_ESR, 0);
		}
		accept_status = (apic_read(APIC_ESR) & 0xEF);
		/* Stop retrying as soon as either step reports an error. */
		if (send_status || accept_status)
			break;
	}
	Dprintk("After Startup.\n");

	if (send_status)
		printk(KERN_ERR "APIC never delivered???\n");
	if (accept_status)
		printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);

	return (send_status | accept_status);
}
527
/* Request handed to do_fork_idle() to fork the idle task of one CPU. */
struct create_idle {
	struct work_struct work;	/* work item whose handler is do_fork_idle() */
	struct task_struct *idle;	/* result of fork_idle(); may be an ERR_PTR */
	struct completion done;		/* completed once 'idle' has been filled in */
	int cpu;			/* CPU the idle task is forked for */
};
534
535void do_fork_idle(struct work_struct *work)
536{
537 struct create_idle *c_idle =
538 container_of(work, struct create_idle, work);
539
540 c_idle->idle = fork_idle(c_idle->cpu);
541 complete(&c_idle->done);
542}
543
544/*
545 * Boot one CPU.
546 */
547static int __cpuinit do_boot_cpu(int cpu, int apicid)
548{
549 unsigned long boot_error;
550 int timeout;
551 unsigned long start_rip;
552 struct create_idle c_idle = {
553 .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle),
554 .cpu = cpu,
555 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
556 };
557
558 /* allocate memory for gdts of secondary cpus. Hotplug is considered */
559 if (!cpu_gdt_descr[cpu].address &&
560 !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
561 printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
562 return -1;
563 }
564
565 /* Allocate node local memory for AP pdas */
566 if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) {
567 struct x8664_pda *newpda, *pda;
568 int node = cpu_to_node(cpu);
569 pda = cpu_pda(cpu);
570 newpda = kmalloc_node(sizeof (struct x8664_pda), GFP_ATOMIC,
571 node);
572 if (newpda) {
573 memcpy(newpda, pda, sizeof (struct x8664_pda));
574 cpu_pda(cpu) = newpda;
575 } else
576 printk(KERN_ERR
577 "Could not allocate node local PDA for CPU %d on node %d\n",
578 cpu, node);
579 }
580
581 alternatives_smp_switch(1);
582
583 c_idle.idle = get_idle_for_cpu(cpu);
584
585 if (c_idle.idle) {
586 c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
587 (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
588 init_idle(c_idle.idle, cpu);
589 goto do_rest;
590 }
591
592 /*
593 * During cold boot process, keventd thread is not spun up yet.
594 * When we do cpu hot-add, we create idle threads on the fly, we should
595 * not acquire any attributes from the calling context. Hence the clean
596 * way to create kernel_threads() is to do that from keventd().
597 * We do the current_is_keventd() due to the fact that ACPI notifier
598 * was also queuing to keventd() and when the caller is already running
599 * in context of keventd(), we would end up with locking up the keventd
600 * thread.
601 */
602 if (!keventd_up() || current_is_keventd())
603 c_idle.work.func(&c_idle.work);
604 else {
605 schedule_work(&c_idle.work);
606 wait_for_completion(&c_idle.done);
607 }
608
609 if (IS_ERR(c_idle.idle)) {
610 printk("failed fork for CPU %d\n", cpu);
611 return PTR_ERR(c_idle.idle);
612 }
613
614 set_idle_for_cpu(cpu, c_idle.idle);
615
616do_rest:
617
618 cpu_pda(cpu)->pcurrent = c_idle.idle;
619
620 start_rip = setup_trampoline();
621
622 init_rsp = c_idle.idle->thread.rsp;
623 per_cpu(init_tss,cpu).rsp0 = init_rsp;
624 initial_code = start_secondary;
625 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
626
627 printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu,
628 cpus_weight(cpu_present_map),
629 apicid);
630
631 /*
632 * This grunge runs the startup process for
633 * the targeted processor.
634 */
635
636 atomic_set(&init_deasserted, 0);
637
638 Dprintk("Setting warm reset code and vector.\n");
639
640 CMOS_WRITE(0xa, 0xf);
641 local_flush_tlb();
642 Dprintk("1.\n");
643 *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4;
644 Dprintk("2.\n");
645 *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf;
646 Dprintk("3.\n");
647
648 /*
649 * Be paranoid about clearing APIC errors.
650 */
651 apic_write(APIC_ESR, 0);
652 apic_read(APIC_ESR);
653
654 /*
655 * Status is now clean
656 */
657 boot_error = 0;
658
659 /*
660 * Starting actual IPI sequence...
661 */
662 boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
663
664 if (!boot_error) {
665 /*
666 * allow APs to start initializing.
667 */
668 Dprintk("Before Callout %d.\n", cpu);
669 cpu_set(cpu, cpu_callout_map);
670 Dprintk("After Callout %d.\n", cpu);
671
672 /*
673 * Wait 5s total for a response
674 */
675 for (timeout = 0; timeout < 50000; timeout++) {
676 if (cpu_isset(cpu, cpu_callin_map))
677 break; /* It has booted */
678 udelay(100);
679 }
680
681 if (cpu_isset(cpu, cpu_callin_map)) {
682 /* number CPUs logically, starting from 1 (BSP is 0) */
683 Dprintk("CPU has booted.\n");
684 } else {
685 boot_error = 1;
686 if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE))
687 == 0xA5)
688 /* trampoline started but...? */
689 printk("Stuck ??\n");
690 else
691 /* trampoline code not run */
692 printk("Not responding.\n");
693#ifdef APIC_DEBUG
694 inquire_remote_apic(apicid);
695#endif
696 }
697 }
698 if (boot_error) {
699 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
700 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
701 clear_node_cpumask(cpu); /* was set by numa_add_cpu */
702 cpu_clear(cpu, cpu_present_map);
703 cpu_clear(cpu, cpu_possible_map);
704 x86_cpu_to_apicid[cpu] = BAD_APICID;
705 x86_cpu_to_log_apicid[cpu] = BAD_APICID;
706 return -EIO;
707 }
708
709 return 0;
710}
711
712cycles_t cacheflush_time;
713unsigned long cache_decay_ticks;
714
715/*
716 * Cleanup possible dangling ends...
717 */
718static __cpuinit void smp_cleanup_boot(void)
719{
720 /*
721 * Paranoid: Set warm reset code and vector here back
722 * to default values.
723 */
724 CMOS_WRITE(0, 0xf);
725
726 /*
727 * Reset trampoline flag
728 */
729 *((volatile int *) phys_to_virt(0x467)) = 0;
730}
731
732/*
733 * Fall back to non SMP mode after errors.
734 *
735 * RED-PEN audit/test this more. I bet there is more state messed up here.
736 */
737static __init void disable_smp(void)
738{
739 cpu_present_map = cpumask_of_cpu(0);
740 cpu_possible_map = cpumask_of_cpu(0);
741 if (smp_found_config)
742 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
743 else
744 phys_cpu_present_map = physid_mask_of_physid(0);
745 cpu_set(0, cpu_sibling_map[0]);
746 cpu_set(0, cpu_core_map[0]);
747}
748
749#ifdef CONFIG_HOTPLUG_CPU
750
751int additional_cpus __initdata = -1;
752
753/*
754 * cpu_possible_map should be static, it cannot change as cpu's
755 * are onlined, or offlined. The reason is per-cpu data-structures
756 * are allocated by some modules at init time, and dont expect to
757 * do this dynamically on cpu arrival/departure.
758 * cpu_present_map on the other hand can change dynamically.
759 * In case when cpu_hotplug is not compiled, then we resort to current
760 * behaviour, which is cpu_possible == cpu_present.
761 * - Ashok Raj
762 *
763 * Three ways to find out the number of additional hotplug CPUs:
764 * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
765 * - The user can overwrite it with additional_cpus=NUM
766 * - Otherwise don't reserve additional CPUs.
767 * We do this because additional CPUs waste a lot of memory.
768 * -AK
769 */
770__init void prefill_possible_map(void)
771{
772 int i;
773 int possible;
774
775 if (additional_cpus == -1) {
776 if (disabled_cpus > 0)
777 additional_cpus = disabled_cpus;
778 else
779 additional_cpus = 0;
780 }
781 possible = num_processors + additional_cpus;
782 if (possible > NR_CPUS)
783 possible = NR_CPUS;
784
785 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
786 possible,
787 max_t(int, possible - num_processors, 0));
788
789 for (i = 0; i < possible; i++)
790 cpu_set(i, cpu_possible_map);
791}
792#endif
793
794/*
795 * Various sanity checks.
796 */
797static int __init smp_sanity_check(unsigned max_cpus)
798{
799 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
800 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
801 hard_smp_processor_id());
802 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
803 }
804
805 /*
806 * If we couldn't find an SMP configuration at boot time,
807 * get out of here now!
808 */
809 if (!smp_found_config) {
810 printk(KERN_NOTICE "SMP motherboard not detected.\n");
811 disable_smp();
812 if (APIC_init_uniprocessor())
813 printk(KERN_NOTICE "Local APIC not detected."
814 " Using dummy APIC emulation.\n");
815 return -1;
816 }
817
818 /*
819 * Should not be necessary because the MP table should list the boot
820 * CPU too, but we do it for the sake of robustness anyway.
821 */
822 if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) {
823 printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n",
824 boot_cpu_id);
825 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
826 }
827
828 /*
829 * If we couldn't find a local APIC, then get out of here now!
830 */
831 if (!cpu_has_apic) {
832 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
833 boot_cpu_id);
834 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
835 nr_ioapics = 0;
836 return -1;
837 }
838
839 /*
840 * If SMP should be disabled, then really disable it!
841 */
842 if (!max_cpus) {
843 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
844 nr_ioapics = 0;
845 return -1;
846 }
847
848 return 0;
849}
850
851/*
852 * Prepare for SMP bootup. The MP table or ACPI has been read
853 * earlier. Just do some sanity checking here and enable APIC mode.
854 */
855void __init smp_prepare_cpus(unsigned int max_cpus)
856{
857 nmi_watchdog_default();
858 current_cpu_data = boot_cpu_data;
859 current_thread_info()->cpu = 0; /* needed? */
860 set_cpu_sibling_map(0);
861
862 if (smp_sanity_check(max_cpus) < 0) {
863 printk(KERN_INFO "SMP disabled\n");
864 disable_smp();
865 return;
866 }
867
868
869 /*
870 * Switch from PIC to APIC mode.
871 */
872 setup_local_APIC();
873
874 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
875 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
876 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
877 /* Or can we switch back to PIC here? */
878 }
879
880 /*
881 * Now start the IO-APICs
882 */
883 if (!skip_ioapic_setup && nr_ioapics)
884 setup_IO_APIC();
885 else
886 nr_ioapics = 0;
887
888 /*
889 * Set up local APIC timer on boot CPU.
890 */
891
892 setup_boot_APIC_clock();
893}
894
895/*
896 * Early setup to make printk work.
897 */
898void __init smp_prepare_boot_cpu(void)
899{
900 int me = smp_processor_id();
901 cpu_set(me, cpu_online_map);
902 cpu_set(me, cpu_callout_map);
903 per_cpu(cpu_state, me) = CPU_ONLINE;
904}
905
906/*
907 * Entry point to boot a CPU.
908 */
909int __cpuinit __cpu_up(unsigned int cpu)
910{
911 int apicid = cpu_present_to_apicid(cpu);
912 unsigned long flags;
913 int err;
914
915 WARN_ON(irqs_disabled());
916
917 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu);
918
919 if (apicid == BAD_APICID || apicid == boot_cpu_id ||
920 !physid_isset(apicid, phys_cpu_present_map)) {
921 printk("__cpu_up: bad cpu %d\n", cpu);
922 return -EINVAL;
923 }
924
925 /*
926 * Already booted CPU?
927 */
928 if (cpu_isset(cpu, cpu_callin_map)) {
929 Dprintk("do_boot_cpu %d Already started\n", cpu);
930 return -ENOSYS;
931 }
932
933 /*
934 * Save current MTRR state in case it was changed since early boot
935 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
936 */
937 mtrr_save_state();
938
939 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
940 /* Boot it! */
941 err = do_boot_cpu(cpu, apicid);
942 if (err < 0) {
943 Dprintk("do_boot_cpu failed %d\n", err);
944 return err;
945 }
946
947 /* Unleash the CPU! */
948 Dprintk("waiting for cpu %d\n", cpu);
949
950 /*
951 * Make sure and check TSC sync:
952 */
953 local_irq_save(flags);
954 check_tsc_sync_source(cpu);
955 local_irq_restore(flags);
956
957 while (!cpu_isset(cpu, cpu_online_map))
958 cpu_relax();
959 err = 0;
960
961 return err;
962}
963
964/*
965 * Finish the SMP boot.
966 */
967void __init smp_cpus_done(unsigned int max_cpus)
968{
969 smp_cleanup_boot();
970 setup_ioapic_dest();
971 check_nmi_watchdog();
972}
973
974#ifdef CONFIG_HOTPLUG_CPU
975
976static void remove_siblinginfo(int cpu)
977{
978 int sibling;
979 struct cpuinfo_x86 *c = cpu_data;
980
981 for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
982 cpu_clear(cpu, cpu_core_map[sibling]);
983 /*
984 * last thread sibling in this cpu core going down
985 */
986 if (cpus_weight(cpu_sibling_map[cpu]) == 1)
987 c[sibling].booted_cores--;
988 }
989
990 for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
991 cpu_clear(cpu, cpu_sibling_map[sibling]);
992 cpus_clear(cpu_sibling_map[cpu]);
993 cpus_clear(cpu_core_map[cpu]);
994 c[cpu].phys_proc_id = 0;
995 c[cpu].cpu_core_id = 0;
996 cpu_clear(cpu, cpu_sibling_setup_map);
997}
998
999void remove_cpu_from_maps(void)
1000{
1001 int cpu = smp_processor_id();
1002
1003 cpu_clear(cpu, cpu_callout_map);
1004 cpu_clear(cpu, cpu_callin_map);
1005 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
1006 clear_node_cpumask(cpu);
1007}
1008
1009int __cpu_disable(void)
1010{
1011 int cpu = smp_processor_id();
1012
1013 /*
1014 * Perhaps use cpufreq to drop frequency, but that could go
1015 * into generic code.
1016 *
1017 * We won't take down the boot processor on i386 due to some
1018 * interrupts only being able to be serviced by the BSP.
1019 * Especially so if we're not using an IOAPIC -zwane
1020 */
1021 if (cpu == 0)
1022 return -EBUSY;
1023
1024 if (nmi_watchdog == NMI_LOCAL_APIC)
1025 stop_apic_nmi_watchdog(NULL);
1026 clear_local_APIC();
1027
1028 /*
1029 * HACK:
1030 * Allow any queued timer interrupts to get serviced
1031 * This is only a temporary solution until we cleanup
1032 * fixup_irqs as we do for IA64.
1033 */
1034 local_irq_enable();
1035 mdelay(1);
1036
1037 local_irq_disable();
1038 remove_siblinginfo(cpu);
1039
1040 spin_lock(&vector_lock);
1041 /* It's now safe to remove this processor from the online map */
1042 cpu_clear(cpu, cpu_online_map);
1043 spin_unlock(&vector_lock);
1044 remove_cpu_from_maps();
1045 fixup_irqs(cpu_online_map);
1046 return 0;
1047}
1048
1049void __cpu_die(unsigned int cpu)
1050{
1051 /* We don't do anything here: idle task is faking death itself. */
1052 unsigned int i;
1053
1054 for (i = 0; i < 10; i++) {
1055 /* They ack this in play_dead by setting CPU_DEAD */
1056 if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1057 printk ("CPU %d is now offline\n", cpu);
1058 if (1 == num_online_cpus())
1059 alternatives_smp_switch(0);
1060 return;
1061 }
1062 msleep(100);
1063 }
1064 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1065}
1066
1067static __init int setup_additional_cpus(char *s)
1068{
1069 return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
1070}
1071early_param("additional_cpus", setup_additional_cpus);
1072
1073#else /* ... !CONFIG_HOTPLUG_CPU */
1074
1075int __cpu_disable(void)
1076{
1077 return -ENOSYS;
1078}
1079
/*
 * Must never be reached without CONFIG_HOTPLUG_CPU, because
 * __cpu_disable() above always refuses.
 */
void __cpu_die(unsigned int cpu)
{
	/* We said "no" in __cpu_disable */
	BUG();
}
1085#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
new file mode 100644
index 000000000000..cb9109113584
--- /dev/null
+++ b/arch/x86/kernel/stacktrace.c
@@ -0,0 +1,54 @@
1/*
2 * arch/x86_64/kernel/stacktrace.c
3 *
4 * Stack trace management functions
5 *
6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 */
8#include <linux/sched.h>
9#include <linux/stacktrace.h>
10#include <linux/module.h>
11#include <asm/stacktrace.h>
12
/* stacktrace_ops .warning callback: intentionally does nothing. */
static void save_stack_warning(void *data, char *msg)
{
}
16
/* stacktrace_ops .warning_symbol callback: intentionally does nothing. */
static void
save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
{
}
21
/* stacktrace_ops .stack callback: always returns -1. */
static int save_stack_stack(void *data, char *name)
{
	return -1;
}
26
27static void save_stack_address(void *data, unsigned long addr)
28{
29 struct stack_trace *trace = (struct stack_trace *)data;
30 if (trace->skip > 0) {
31 trace->skip--;
32 return;
33 }
34 if (trace->nr_entries < trace->max_entries)
35 trace->entries[trace->nr_entries++] = addr;
36}
37
38static struct stacktrace_ops save_stack_ops = {
39 .warning = save_stack_warning,
40 .warning_symbol = save_stack_warning_symbol,
41 .stack = save_stack_stack,
42 .address = save_stack_address,
43};
44
45/*
46 * Save stack-backtrace addresses into a stack_trace buffer.
47 */
48void save_stack_trace(struct stack_trace *trace)
49{
50 dump_trace(current, NULL, NULL, &save_stack_ops, trace);
51 if (trace->nr_entries < trace->max_entries)
52 trace->entries[trace->nr_entries++] = ULONG_MAX;
53}
54EXPORT_SYMBOL(save_stack_trace);
diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/kernel/suspend_64.c
new file mode 100644
index 000000000000..573c0a6e0ac6
--- /dev/null
+++ b/arch/x86/kernel/suspend_64.c
@@ -0,0 +1,239 @@
1/*
2 * Suspend support specific for x86-64.
3 *
4 * Distribute under GPLv2
5 *
6 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
7 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
8 */
9
10#include <linux/smp.h>
11#include <linux/suspend.h>
12#include <asm/proto.h>
13#include <asm/page.h>
14#include <asm/pgtable.h>
15#include <asm/mtrr.h>
16
17/* References to section boundaries */
18extern const void __nosave_begin, __nosave_end;
19
20struct saved_context saved_context;
21
22unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx;
23unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi;
24unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11;
25unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15;
26unsigned long saved_context_eflags;
27
28void __save_processor_state(struct saved_context *ctxt)
29{
30 kernel_fpu_begin();
31
32 /*
33 * descriptor tables
34 */
35 asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
36 asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
37 asm volatile ("str %0" : "=m" (ctxt->tr));
38
39 /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
40 /*
41 * segment registers
42 */
43 asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
44 asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
45 asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
46 asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs));
47 asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss));
48
49 rdmsrl(MSR_FS_BASE, ctxt->fs_base);
50 rdmsrl(MSR_GS_BASE, ctxt->gs_base);
51 rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
52 mtrr_save_fixed_ranges(NULL);
53
54 /*
55 * control registers
56 */
57 rdmsrl(MSR_EFER, ctxt->efer);
58 ctxt->cr0 = read_cr0();
59 ctxt->cr2 = read_cr2();
60 ctxt->cr3 = read_cr3();
61 ctxt->cr4 = read_cr4();
62 ctxt->cr8 = read_cr8();
63}
64
65void save_processor_state(void)
66{
67 __save_processor_state(&saved_context);
68}
69
/* Counterpart of the kernel_fpu_begin() done when the state was saved. */
static void do_fpu_end(void)
{
	/* Restore FPU regs if necessary. */
	kernel_fpu_end();
}
77
78void __restore_processor_state(struct saved_context *ctxt)
79{
80 /*
81 * control registers
82 */
83 wrmsrl(MSR_EFER, ctxt->efer);
84 write_cr8(ctxt->cr8);
85 write_cr4(ctxt->cr4);
86 write_cr3(ctxt->cr3);
87 write_cr2(ctxt->cr2);
88 write_cr0(ctxt->cr0);
89
90 /*
91 * now restore the descriptor tables to their proper values
92 * ltr is done i fix_processor_context().
93 */
94 asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
95 asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
96
97 /*
98 * segment registers
99 */
100 asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
101 asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
102 asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
103 load_gs_index(ctxt->gs);
104 asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss));
105
106 wrmsrl(MSR_FS_BASE, ctxt->fs_base);
107 wrmsrl(MSR_GS_BASE, ctxt->gs_base);
108 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
109
110 fix_processor_context();
111
112 do_fpu_end();
113 mtrr_ap_init();
114}
115
116void restore_processor_state(void)
117{
118 __restore_processor_state(&saved_context);
119}
120
121void fix_processor_context(void)
122{
123 int cpu = smp_processor_id();
124 struct tss_struct *t = &per_cpu(init_tss, cpu);
125
126 set_tss_desc(cpu,t); /* This just modifies memory; should not be neccessary. But... This is neccessary, because 386 hardware has concept of busy TSS or some similar stupidity. */
127
128 cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9;
129
130 syscall_init(); /* This sets MSR_*STAR and related */
131 load_TR_desc(); /* This does ltr */
132 load_LDT(&current->active_mm->context); /* This does lldt */
133
134 /*
135 * Now maybe reload the debug registers
136 */
137 if (current->thread.debugreg7){
138 loaddebug(&current->thread, 0);
139 loaddebug(&current->thread, 1);
140 loaddebug(&current->thread, 2);
141 loaddebug(&current->thread, 3);
142 /* no 4 and 5 */
143 loaddebug(&current->thread, 6);
144 loaddebug(&current->thread, 7);
145 }
146
147}
148
149#ifdef CONFIG_HIBERNATION
150/* Defined in arch/x86_64/kernel/suspend_asm.S */
151extern int restore_image(void);
152
153pgd_t *temp_level4_pgt;
154
155static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
156{
157 long i, j;
158
159 i = pud_index(address);
160 pud = pud + i;
161 for (; i < PTRS_PER_PUD; pud++, i++) {
162 unsigned long paddr;
163 pmd_t *pmd;
164
165 paddr = address + i*PUD_SIZE;
166 if (paddr >= end)
167 break;
168
169 pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
170 if (!pmd)
171 return -ENOMEM;
172 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
173 for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
174 unsigned long pe;
175
176 if (paddr >= end)
177 break;
178 pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr;
179 pe &= __supported_pte_mask;
180 set_pmd(pmd, __pmd(pe));
181 }
182 }
183 return 0;
184}
185
186static int set_up_temporary_mappings(void)
187{
188 unsigned long start, end, next;
189 int error;
190
191 temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
192 if (!temp_level4_pgt)
193 return -ENOMEM;
194
195 /* It is safe to reuse the original kernel mapping */
196 set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
197 init_level4_pgt[pgd_index(__START_KERNEL_map)]);
198
199 /* Set up the direct mapping from scratch */
200 start = (unsigned long)pfn_to_kaddr(0);
201 end = (unsigned long)pfn_to_kaddr(end_pfn);
202
203 for (; start < end; start = next) {
204 pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
205 if (!pud)
206 return -ENOMEM;
207 next = start + PGDIR_SIZE;
208 if (next > end)
209 next = end;
210 if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
211 return error;
212 set_pgd(temp_level4_pgt + pgd_index(start),
213 mk_kernel_pgd(__pa(pud)));
214 }
215 return 0;
216}
217
/*
 * Set up the temporary mappings and hand over to restore_image().
 * Returns a negative errno if the mappings cannot be built, else 0.
 */
int swsusp_arch_resume(void)
{
	int error = set_up_temporary_mappings();

	/* We have got enough memory and from now on we cannot recover */
	if (error)
		return error;

	restore_image();
	return 0;
}
228
229/*
230 * pfn_is_nosave - check if given pfn is in the 'nosave' section
231 */
232
233int pfn_is_nosave(unsigned long pfn)
234{
235 unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
236 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
237 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
238}
239#endif /* CONFIG_HIBERNATION */
diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/kernel/suspend_asm_64.S
new file mode 100644
index 000000000000..16d183f67bc1
--- /dev/null
+++ b/arch/x86/kernel/suspend_asm_64.S
@@ -0,0 +1,110 @@
1/* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl>
2 *
3 * Distribute under GPLv2.
4 *
5 * swsusp_arch_resume may not use any stack, nor any variable that is
6 * not "NoSave" during copying pages:
7 *
8 * Its rewriting one kernel image with another. What is stack in "old"
9 * image could very well be data page in "new" image, and overwriting
10 * your own stack under you is bad idea.
11 */
12
13 .text
14#include <linux/linkage.h>
15#include <asm/segment.h>
16#include <asm/page.h>
17#include <asm/asm-offsets.h>
18
ENTRY(swsusp_arch_suspend)
	/* Stash every general purpose register in its saved_context_* slot. */
	movq	%rsp, saved_context_esp(%rip)
	movq	%rax, saved_context_eax(%rip)
	movq	%rbx, saved_context_ebx(%rip)
	movq	%rcx, saved_context_ecx(%rip)
	movq	%rdx, saved_context_edx(%rip)
	movq	%rbp, saved_context_ebp(%rip)
	movq	%rsi, saved_context_esi(%rip)
	movq	%rdi, saved_context_edi(%rip)
	movq	%r8,  saved_context_r08(%rip)
	movq	%r9,  saved_context_r09(%rip)
	movq	%r10, saved_context_r10(%rip)
	movq	%r11, saved_context_r11(%rip)
	movq	%r12, saved_context_r12(%rip)
	movq	%r13, saved_context_r13(%rip)
	movq	%r14, saved_context_r14(%rip)
	movq	%r15, saved_context_r15(%rip)

	/* Save the flags register by way of the stack. */
	pushfq
	popq	saved_context_eflags(%rip)

	call	swsusp_save
	ret
41
ENTRY(restore_image)
	/* switch to temporary page tables */
	movq	$__PAGE_OFFSET, %rdx
	movq	temp_level4_pgt(%rip), %rax
	subq	%rdx, %rax
	movq	%rax, %cr3

	/* Flush TLB: clear CR4.PGE (bit 7), reload CR3, set PGE again */
	movq	mmu_cr4_features(%rip), %rax
	movq	%rax, %rdx
	andq	$~(1<<7), %rdx		/* PGE */
	movq	%rdx, %cr4		/* turn off PGE */
	movq	%cr3, %rcx		/* flush TLB */
	movq	%rcx, %cr3
	movq	%rax, %cr4		/* turn PGE back on */

	/* walk the restore_pblist chain; %rdx is the current pbe */
	movq	restore_pblist(%rip), %rdx
loop:
	testq	%rdx, %rdx
	jz	done

	/* get addresses from the pbe and copy the page (512 quadwords) */
	movq	pbe_address(%rdx), %rsi
	movq	pbe_orig_address(%rdx), %rdi
	movq	$512, %rcx
	rep
	movsq

	/* progress to the next pbe */
	movq	pbe_next(%rdx), %rdx
	jmp	loop
done:
	/* go back to the original page tables */
	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
	addq	phys_base(%rip), %rax
	movq	%rax, %cr3

	/* Flush TLB, including "global" things (vmalloc) */
	movq	mmu_cr4_features(%rip), %rax
	movq	%rax, %rdx
	andq	$~(1<<7), %rdx		/* PGE */
	movq	%rdx, %cr4		/* turn off PGE */
	movq	%cr3, %rcx		/* flush TLB */
	movq	%rcx, %cr3
	movq	%rax, %cr4		/* turn PGE back on */

	/* NOTE(review): selector 24 is presumably __KERNEL_DS -- confirm */
	movl	$24, %eax
	movl	%eax, %ds

	/* reload the registers saved by swsusp_arch_suspend */
	movq	saved_context_esp(%rip), %rsp
	movq	saved_context_ebp(%rip), %rbp
	/* Don't restore %rax, it must be 0 anyway */
	movq	saved_context_ebx(%rip), %rbx
	movq	saved_context_ecx(%rip), %rcx
	movq	saved_context_edx(%rip), %rdx
	movq	saved_context_esi(%rip), %rsi
	movq	saved_context_edi(%rip), %rdi
	movq	saved_context_r08(%rip), %r8
	movq	saved_context_r09(%rip), %r9
	movq	saved_context_r10(%rip), %r10
	movq	saved_context_r11(%rip), %r11
	movq	saved_context_r12(%rip), %r12
	movq	saved_context_r13(%rip), %r13
	movq	saved_context_r14(%rip), %r14
	movq	saved_context_r15(%rip), %r15
	pushq	saved_context_eflags(%rip)
	popfq

	xorq	%rax, %rax

	ret
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
new file mode 100644
index 000000000000..4770b7a2052c
--- /dev/null
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -0,0 +1,159 @@
1/*
2 * linux/arch/x86_64/kernel/sys_x86_64.c
3 */
4
5#include <linux/errno.h>
6#include <linux/sched.h>
7#include <linux/syscalls.h>
8#include <linux/mm.h>
9#include <linux/fs.h>
10#include <linux/smp.h>
11#include <linux/sem.h>
12#include <linux/msg.h>
13#include <linux/shm.h>
14#include <linux/stat.h>
15#include <linux/mman.h>
16#include <linux/file.h>
17#include <linux/utsname.h>
18#include <linux/personality.h>
19
20#include <asm/uaccess.h>
21#include <asm/ia32.h>
22
23/*
24 * sys_pipe() is the normal C calling standard for creating
25 * a pipe. It's not the way Unix traditionally does this, though.
26 */
27asmlinkage long sys_pipe(int __user *fildes)
28{
29 int fd[2];
30 int error;
31
32 error = do_pipe(fd);
33 if (!error) {
34 if (copy_to_user(fildes, fd, 2*sizeof(int)))
35 error = -EFAULT;
36 }
37 return error;
38}
39
40asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
41 unsigned long fd, unsigned long off)
42{
43 long error;
44 struct file * file;
45
46 error = -EINVAL;
47 if (off & ~PAGE_MASK)
48 goto out;
49
50 error = -EBADF;
51 file = NULL;
52 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
53 if (!(flags & MAP_ANONYMOUS)) {
54 file = fget(fd);
55 if (!file)
56 goto out;
57 }
58 down_write(&current->mm->mmap_sem);
59 error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
60 up_write(&current->mm->mmap_sem);
61
62 if (file)
63 fput(file);
64out:
65 return error;
66}
67
68static void find_start_end(unsigned long flags, unsigned long *begin,
69 unsigned long *end)
70{
71 if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {
72 /* This is usually used needed to map code in small
73 model, so it needs to be in the first 31bit. Limit
74 it to that. This means we need to move the
75 unmapped base down for this case. This can give
76 conflicts with the heap, but we assume that glibc
77 malloc knows how to fall back to mmap. Give it 1GB
78 of playground for now. -AK */
79 *begin = 0x40000000;
80 *end = 0x80000000;
81 } else {
82 *begin = TASK_UNMAPPED_BASE;
83 *end = TASK_SIZE;
84 }
85}
86
87unsigned long
88arch_get_unmapped_area(struct file *filp, unsigned long addr,
89 unsigned long len, unsigned long pgoff, unsigned long flags)
90{
91 struct mm_struct *mm = current->mm;
92 struct vm_area_struct *vma;
93 unsigned long start_addr;
94 unsigned long begin, end;
95
96 if (flags & MAP_FIXED)
97 return addr;
98
99 find_start_end(flags, &begin, &end);
100
101 if (len > end)
102 return -ENOMEM;
103
104 if (addr) {
105 addr = PAGE_ALIGN(addr);
106 vma = find_vma(mm, addr);
107 if (end - len >= addr &&
108 (!vma || addr + len <= vma->vm_start))
109 return addr;
110 }
111 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
112 && len <= mm->cached_hole_size) {
113 mm->cached_hole_size = 0;
114 mm->free_area_cache = begin;
115 }
116 addr = mm->free_area_cache;
117 if (addr < begin)
118 addr = begin;
119 start_addr = addr;
120
121full_search:
122 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
123 /* At this point: (!vma || addr < vma->vm_end). */
124 if (end - len < addr) {
125 /*
126 * Start a new search - just in case we missed
127 * some holes.
128 */
129 if (start_addr != begin) {
130 start_addr = addr = begin;
131 mm->cached_hole_size = 0;
132 goto full_search;
133 }
134 return -ENOMEM;
135 }
136 if (!vma || addr + len <= vma->vm_start) {
137 /*
138 * Remember the place where we stopped the search:
139 */
140 mm->free_area_cache = addr + len;
141 return addr;
142 }
143 if (addr + mm->cached_hole_size < vma->vm_start)
144 mm->cached_hole_size = vma->vm_start - addr;
145
146 addr = vma->vm_end;
147 }
148}
149
150asmlinkage long sys_uname(struct new_utsname __user * name)
151{
152 int err;
153 down_read(&uts_sem);
154 err = copy_to_user(name, utsname(), sizeof (*name));
155 up_read(&uts_sem);
156 if (personality(current->personality) == PER_LINUX32)
157 err |= copy_to_user(&name->machine, "i686", 5);
158 return err ? -EFAULT : 0;
159}
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
new file mode 100644
index 000000000000..63d592c276cc
--- /dev/null
+++ b/arch/x86/kernel/syscall_64.c
@@ -0,0 +1,26 @@
1/* System call table for x86-64. */
2
3#include <linux/linkage.h>
4#include <linux/sys.h>
5#include <linux/cache.h>
6#include <asm/asm-offsets.h>
7
8#define __NO_STUBS
9
10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
11#undef _ASM_X86_64_UNISTD_H_
12#include <asm-x86_64/unistd.h>
13
14#undef __SYSCALL
15#define __SYSCALL(nr, sym) [ nr ] = sym,
16#undef _ASM_X86_64_UNISTD_H_
17
18typedef void (*sys_call_ptr_t)(void);
19
20extern void sys_ni_syscall(void);
21
22const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
23 /* Smells like a compiler bug -- it doesn't work when the & below is removed. */
24 [0 ... __NR_syscall_max] = &sys_ni_syscall,
25#include <asm-x86_64/unistd.h>
26};
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c
new file mode 100644
index 000000000000..e3f2569b2c44
--- /dev/null
+++ b/arch/x86/kernel/tce_64.c
@@ -0,0 +1,189 @@
1/*
2 * This file manages the translation entries for the IBM Calgary IOMMU.
3 *
4 * Derived from arch/powerpc/platforms/pseries/iommu.c
5 *
6 * Copyright (C) IBM Corporation, 2006
7 *
8 * Author: Jon Mason <jdmason@us.ibm.com>
9 * Author: Muli Ben-Yehuda <muli@il.ibm.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/mm.h>
29#include <linux/spinlock.h>
30#include <linux/string.h>
31#include <linux/pci.h>
32#include <linux/dma-mapping.h>
33#include <linux/bootmem.h>
34#include <asm/tce.h>
35#include <asm/calgary.h>
36#include <asm/proto.h>
37
38/* flush a tce at 'tceaddr' to main memory */
39static inline void flush_tce(void* tceaddr)
40{
41 /* a single tce can't cross a cache line */
42 if (cpu_has_clflush)
43 asm volatile("clflush (%0)" :: "r" (tceaddr));
44 else
45 asm volatile("wbinvd":::"memory");
46}
47
48void tce_build(struct iommu_table *tbl, unsigned long index,
49 unsigned int npages, unsigned long uaddr, int direction)
50{
51 u64* tp;
52 u64 t;
53 u64 rpn;
54
55 t = (1 << TCE_READ_SHIFT);
56 if (direction != DMA_TO_DEVICE)
57 t |= (1 << TCE_WRITE_SHIFT);
58
59 tp = ((u64*)tbl->it_base) + index;
60
61 while (npages--) {
62 rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT;
63 t &= ~TCE_RPN_MASK;
64 t |= (rpn << TCE_RPN_SHIFT);
65
66 *tp = cpu_to_be64(t);
67 flush_tce(tp);
68
69 uaddr += PAGE_SIZE;
70 tp++;
71 }
72}
73
74void tce_free(struct iommu_table *tbl, long index, unsigned int npages)
75{
76 u64* tp;
77
78 tp = ((u64*)tbl->it_base) + index;
79
80 while (npages--) {
81 *tp = cpu_to_be64(0);
82 flush_tce(tp);
83 tp++;
84 }
85}
86
/*
 * Convert a table size order (0-7) into a number of entries.  The
 * smallest table holds 8K (1 << 13) entries and each order step
 * doubles that.
 */
static inline unsigned int table_size_to_number_of_entries(unsigned char size)
{
	return 1 << (size + 13);
}
96
97static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
98{
99 unsigned int bitmapsz;
100 unsigned long bmppages;
101 int ret;
102
103 tbl->it_busno = dev->bus->number;
104
105 /* set the tce table size - measured in entries */
106 tbl->it_size = table_size_to_number_of_entries(specified_table_size);
107
108 /*
109 * number of bytes needed for the bitmap size in number of
110 * entries; we need one bit per entry
111 */
112 bitmapsz = tbl->it_size / BITS_PER_BYTE;
113 bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz));
114 if (!bmppages) {
115 printk(KERN_ERR "Calgary: cannot allocate bitmap\n");
116 ret = -ENOMEM;
117 goto done;
118 }
119
120 tbl->it_map = (unsigned long*)bmppages;
121
122 memset(tbl->it_map, 0, bitmapsz);
123
124 tbl->it_hint = 0;
125
126 spin_lock_init(&tbl->it_lock);
127
128 return 0;
129
130done:
131 return ret;
132}
133
134int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar)
135{
136 struct iommu_table *tbl;
137 int ret;
138
139 if (pci_iommu(dev->bus)) {
140 printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n",
141 dev, pci_iommu(dev->bus));
142 BUG();
143 }
144
145 tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL);
146 if (!tbl) {
147 printk(KERN_ERR "Calgary: error allocating iommu_table\n");
148 ret = -ENOMEM;
149 goto done;
150 }
151
152 ret = tce_table_setparms(dev, tbl);
153 if (ret)
154 goto free_tbl;
155
156 tbl->bbar = bbar;
157
158 set_pci_iommu(dev->bus, tbl);
159
160 return 0;
161
162free_tbl:
163 kfree(tbl);
164done:
165 return ret;
166}
167
168void * __init alloc_tce_table(void)
169{
170 unsigned int size;
171
172 size = table_size_to_number_of_entries(specified_table_size);
173 size *= TCE_ENTRY_SIZE;
174
175 return __alloc_bootmem_low(size, size, 0);
176}
177
178void __init free_tce_table(void *tbl)
179{
180 unsigned int size;
181
182 if (!tbl)
183 return;
184
185 size = table_size_to_number_of_entries(specified_table_size);
186 size *= TCE_ENTRY_SIZE;
187
188 free_bootmem(__pa(tbl), size);
189}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
new file mode 100644
index 000000000000..6d48a4e826d9
--- /dev/null
+++ b/arch/x86/kernel/time_64.c
@@ -0,0 +1,447 @@
1/*
2 * linux/arch/x86-64/kernel/time.c
3 *
4 * "High Precision Event Timer" based timekeeping.
5 *
6 * Copyright (c) 1991,1992,1995 Linus Torvalds
7 * Copyright (c) 1994 Alan Modra
8 * Copyright (c) 1995 Markus Kuhn
9 * Copyright (c) 1996 Ingo Molnar
10 * Copyright (c) 1998 Andrea Arcangeli
11 * Copyright (c) 2002,2006 Vojtech Pavlik
12 * Copyright (c) 2003 Andi Kleen
13 * RTC support code taken from arch/i386/kernel/timers/time_hpet.c
14 */
15
16#include <linux/kernel.h>
17#include <linux/sched.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/mc146818rtc.h>
21#include <linux/time.h>
22#include <linux/ioport.h>
23#include <linux/module.h>
24#include <linux/device.h>
25#include <linux/sysdev.h>
26#include <linux/bcd.h>
27#include <linux/notifier.h>
28#include <linux/cpu.h>
29#include <linux/kallsyms.h>
30#include <linux/acpi.h>
31#ifdef CONFIG_ACPI
32#include <acpi/achware.h> /* for PM timer frequency */
33#include <acpi/acpi_bus.h>
34#endif
35#include <asm/8253pit.h>
36#include <asm/i8253.h>
37#include <asm/pgtable.h>
38#include <asm/vsyscall.h>
39#include <asm/timex.h>
40#include <asm/proto.h>
41#include <asm/hpet.h>
42#include <asm/sections.h>
43#include <linux/hpet.h>
44#include <asm/apic.h>
45#include <asm/hpet.h>
46#include <asm/mpspec.h>
47#include <asm/nmi.h>
48#include <asm/vgtod.h>
49
50static char *timename = NULL;
51
52DEFINE_SPINLOCK(rtc_lock);
53EXPORT_SYMBOL(rtc_lock);
54DEFINE_SPINLOCK(i8253_lock);
55EXPORT_SYMBOL(i8253_lock);
56
57volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
58
59unsigned long profile_pc(struct pt_regs *regs)
60{
61 unsigned long pc = instruction_pointer(regs);
62
63 /* Assume the lock function has either no stack frame or a copy
64 of eflags from PUSHF
65 Eflags always has bits 22 and up cleared unlike kernel addresses. */
66 if (!user_mode(regs) && in_lock_functions(pc)) {
67 unsigned long *sp = (unsigned long *)regs->rsp;
68 if (sp[0] >> 22)
69 return sp[0];
70 if (sp[1] >> 22)
71 return sp[1];
72 }
73 return pc;
74}
75EXPORT_SYMBOL(profile_pc);
76
77/*
78 * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
79 * ms after the second nowtime has started, because when nowtime is written
80 * into the registers of the CMOS clock, it will jump to the next second
81 * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data
82 * sheet for details.
83 */
84
85static int set_rtc_mmss(unsigned long nowtime)
86{
87 int retval = 0;
88 int real_seconds, real_minutes, cmos_minutes;
89 unsigned char control, freq_select;
90
91/*
92 * IRQs are disabled when we're called from the timer interrupt,
93 * no need for spin_lock_irqsave()
94 */
95
96 spin_lock(&rtc_lock);
97
98/*
99 * Tell the clock it's being set and stop it.
100 */
101
102 control = CMOS_READ(RTC_CONTROL);
103 CMOS_WRITE(control | RTC_SET, RTC_CONTROL);
104
105 freq_select = CMOS_READ(RTC_FREQ_SELECT);
106 CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT);
107
108 cmos_minutes = CMOS_READ(RTC_MINUTES);
109 BCD_TO_BIN(cmos_minutes);
110
111/*
112 * since we're only adjusting minutes and seconds, don't interfere with hour
113 * overflow. This avoids messing with unknown time zones but requires your RTC
114 * not to be off by more than 15 minutes. Since we're calling it only when
115 * our clock is externally synchronized using NTP, this shouldn't be a problem.
116 */
117
118 real_seconds = nowtime % 60;
119 real_minutes = nowtime / 60;
120 if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
121 real_minutes += 30; /* correct for half hour time zone */
122 real_minutes %= 60;
123
124 if (abs(real_minutes - cmos_minutes) >= 30) {
125 printk(KERN_WARNING "time.c: can't update CMOS clock "
126 "from %d to %d\n", cmos_minutes, real_minutes);
127 retval = -1;
128 } else {
129 BIN_TO_BCD(real_seconds);
130 BIN_TO_BCD(real_minutes);
131 CMOS_WRITE(real_seconds, RTC_SECONDS);
132 CMOS_WRITE(real_minutes, RTC_MINUTES);
133 }
134
135/*
136 * The following flags have to be released exactly in this order, otherwise the
137 * DS12887 (popular MC146818A clone with integrated battery and quartz) will
138 * not reset the oscillator and will not update precisely 500 ms later. You
139 * won't find this mentioned in the Dallas Semiconductor data sheets, but who
140 * believes data sheets anyway ... -- Markus Kuhn
141 */
142
143 CMOS_WRITE(control, RTC_CONTROL);
144 CMOS_WRITE(freq_select, RTC_FREQ_SELECT);
145
146 spin_unlock(&rtc_lock);
147
148 return retval;
149}
150
151int update_persistent_clock(struct timespec now)
152{
153 return set_rtc_mmss(now.tv_sec);
154}
155
156void main_timer_handler(void)
157{
158/*
159 * Here we are in the timer irq handler. We have irqs locally disabled (so we
160 * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
161 * on the other CPU, so we need a lock. We also need to lock the vsyscall
162 * variables, because both do_timer() and us change them -arca+vojtech
163 */
164
165 write_seqlock(&xtime_lock);
166
167/*
168 * Do the timer stuff.
169 */
170
171 do_timer(1);
172#ifndef CONFIG_SMP
173 update_process_times(user_mode(get_irq_regs()));
174#endif
175
176/*
177 * In the SMP case we use the local APIC timer interrupt to do the profiling,
178 * except when we simulate SMP mode on a uniprocessor system, in that case we
179 * have to call the local interrupt handler.
180 */
181
182 if (!using_apic_timer)
183 smp_local_timer_interrupt();
184
185 write_sequnlock(&xtime_lock);
186}
187
188static irqreturn_t timer_interrupt(int irq, void *dev_id)
189{
190 if (apic_runs_main_timer > 1)
191 return IRQ_HANDLED;
192 main_timer_handler();
193 if (using_apic_timer)
194 smp_send_timer_broadcast_ipi();
195 return IRQ_HANDLED;
196}
197
198unsigned long read_persistent_clock(void)
199{
200 unsigned int year, mon, day, hour, min, sec;
201 unsigned long flags;
202 unsigned century = 0;
203
204 spin_lock_irqsave(&rtc_lock, flags);
205
206 do {
207 sec = CMOS_READ(RTC_SECONDS);
208 min = CMOS_READ(RTC_MINUTES);
209 hour = CMOS_READ(RTC_HOURS);
210 day = CMOS_READ(RTC_DAY_OF_MONTH);
211 mon = CMOS_READ(RTC_MONTH);
212 year = CMOS_READ(RTC_YEAR);
213#ifdef CONFIG_ACPI
214 if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
215 acpi_gbl_FADT.century)
216 century = CMOS_READ(acpi_gbl_FADT.century);
217#endif
218 } while (sec != CMOS_READ(RTC_SECONDS));
219
220 spin_unlock_irqrestore(&rtc_lock, flags);
221
222 /*
223 * We know that x86-64 always uses BCD format, no need to check the
224 * config register.
225 */
226
227 BCD_TO_BIN(sec);
228 BCD_TO_BIN(min);
229 BCD_TO_BIN(hour);
230 BCD_TO_BIN(day);
231 BCD_TO_BIN(mon);
232 BCD_TO_BIN(year);
233
234 if (century) {
235 BCD_TO_BIN(century);
236 year += century * 100;
237 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
238 } else {
239 /*
240 * x86-64 systems only exists since 2002.
241 * This will work up to Dec 31, 2100
242 */
243 year += 2000;
244 }
245
246 return mktime(year, mon, day, hour, min, sec);
247}
248
249/* calibrate_cpu is used on systems with fixed rate TSCs to determine
250 * processor frequency */
251#define TICK_COUNT 100000000
252static unsigned int __init tsc_calibrate_cpu_khz(void)
253{
254 int tsc_start, tsc_now;
255 int i, no_ctr_free;
256 unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
257 unsigned long flags;
258
259 for (i = 0; i < 4; i++)
260 if (avail_to_resrv_perfctr_nmi_bit(i))
261 break;
262 no_ctr_free = (i == 4);
263 if (no_ctr_free) {
264 i = 3;
265 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
266 wrmsrl(MSR_K7_EVNTSEL3, 0);
267 rdmsrl(MSR_K7_PERFCTR3, pmc3);
268 } else {
269 reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
270 reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
271 }
272 local_irq_save(flags);
273 /* start meauring cycles, incrementing from 0 */
274 wrmsrl(MSR_K7_PERFCTR0 + i, 0);
275 wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
276 rdtscl(tsc_start);
277 do {
278 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
279 tsc_now = get_cycles_sync();
280 } while ((tsc_now - tsc_start) < TICK_COUNT);
281
282 local_irq_restore(flags);
283 if (no_ctr_free) {
284 wrmsrl(MSR_K7_EVNTSEL3, 0);
285 wrmsrl(MSR_K7_PERFCTR3, pmc3);
286 wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
287 } else {
288 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
289 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
290 }
291
292 return pmc_now * tsc_khz / (tsc_now - tsc_start);
293}
294
295/*
296 * pit_calibrate_tsc() uses the speaker output (channel 2) of
297 * the PIT. This is better than using the timer interrupt output,
298 * because we can read the value of the speaker with just one inb(),
299 * where we need three i/o operations for the interrupt channel.
300 * We count how many ticks the TSC does in 50 ms.
301 */
302
303static unsigned int __init pit_calibrate_tsc(void)
304{
305 unsigned long start, end;
306 unsigned long flags;
307
308 spin_lock_irqsave(&i8253_lock, flags);
309
310 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
311
312 outb(0xb0, 0x43);
313 outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
314 outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42);
315 start = get_cycles_sync();
316 while ((inb(0x61) & 0x20) == 0);
317 end = get_cycles_sync();
318
319 spin_unlock_irqrestore(&i8253_lock, flags);
320
321 return (end - start) / 50;
322}
323
324#define PIT_MODE 0x43
325#define PIT_CH0 0x40
326
327static void __pit_init(int val, u8 mode)
328{
329 unsigned long flags;
330
331 spin_lock_irqsave(&i8253_lock, flags);
332 outb_p(mode, PIT_MODE);
333 outb_p(val & 0xff, PIT_CH0); /* LSB */
334 outb_p(val >> 8, PIT_CH0); /* MSB */
335 spin_unlock_irqrestore(&i8253_lock, flags);
336}
337
338void __init pit_init(void)
339{
340 __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */
341}
342
/* Reprogram PIT channel 0 in mode 0 with a count of zero. */
void pit_stop_interrupt(void)
{
	__pit_init(0, 0x30);
}
347
348void stop_timer_interrupt(void)
349{
350 char *name;
351 if (hpet_address) {
352 name = "HPET";
353 hpet_timer_stop_set_go(0);
354 } else {
355 name = "PIT";
356 pit_stop_interrupt();
357 }
358 printk(KERN_INFO "timer: %s interrupt stopped.\n", name);
359}
360
361static struct irqaction irq0 = {
362 .handler = timer_interrupt,
363 .flags = IRQF_DISABLED | IRQF_IRQPOLL,
364 .mask = CPU_MASK_NONE,
365 .name = "timer"
366};
367
368void __init time_init(void)
369{
370 if (nohpet)
371 hpet_address = 0;
372
373 if (hpet_arch_init())
374 hpet_address = 0;
375
376 if (hpet_use_timer) {
377 /* set tick_nsec to use the proper rate for HPET */
378 tick_nsec = TICK_NSEC_HPET;
379 tsc_khz = hpet_calibrate_tsc();
380 timename = "HPET";
381 } else {
382 pit_init();
383 tsc_khz = pit_calibrate_tsc();
384 timename = "PIT";
385 }
386
387 cpu_khz = tsc_khz;
388 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
389 boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
390 boot_cpu_data.x86 == 16)
391 cpu_khz = tsc_calibrate_cpu_khz();
392
393 if (unsynchronized_tsc())
394 mark_tsc_unstable("TSCs unsynchronized");
395
396 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
397 vgetcpu_mode = VGETCPU_RDTSCP;
398 else
399 vgetcpu_mode = VGETCPU_LSL;
400
401 set_cyc2ns_scale(tsc_khz);
402 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
403 cpu_khz / 1000, cpu_khz % 1000);
404 init_tsc_clocksource();
405
406 setup_irq(0, &irq0);
407}
408
409/*
410 * sysfs support for the timer.
411 */
412
413static int timer_suspend(struct sys_device *dev, pm_message_t state)
414{
415 return 0;
416}
417
418static int timer_resume(struct sys_device *dev)
419{
420 if (hpet_address)
421 hpet_reenable();
422 else
423 i8254_timer_resume();
424 return 0;
425}
426
427static struct sysdev_class timer_sysclass = {
428 .resume = timer_resume,
429 .suspend = timer_suspend,
430 set_kset_name("timer"),
431};
432
433/* XXX this sysfs stuff should probably go elsewhere later -john */
434static struct sys_device device_timer = {
435 .id = 0,
436 .cls = &timer_sysclass,
437};
438
439static int time_init_device(void)
440{
441 int error = sysdev_class_register(&timer_sysclass);
442 if (!error)
443 error = sysdev_register(&device_timer);
444 return error;
445}
446
447device_initcall(time_init_device);
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
new file mode 100644
index 000000000000..607983b0d27b
--- /dev/null
+++ b/arch/x86/kernel/trampoline_64.S
@@ -0,0 +1,166 @@
1/*
2 *
3 * Trampoline.S Derived from Setup.S by Linus Torvalds
4 *
5 * 4 Jan 1997 Michael Chastain: changed to gnu as.
6 * 15 Sept 2005 Eric Biederman: 64bit PIC support
7 *
8 * Entry: CS:IP point to the start of our code, we are
9 * in real mode with no stack, but the rest of the
10 * trampoline page to make our stack and everything else
11 * is a mystery.
12 *
13 * In fact we don't actually need a stack so we don't
14 * set one up.
15 *
16 * On entry to trampoline_data, the processor is in real mode
17 * with 16-bit addressing and 16-bit data. CS has some value
18 * and IP is zero. Thus, data addresses need to be absolute
19 * (no relocation) and are taken with regard to r_base.
20 *
21 * With the addition of trampoline_level4_pgt this code can
22 * now enter a 64bit kernel that lives at arbitrary 64bit
23 * physical addresses.
24 *
25 * If you work on this file, check the object module with objdump
26 * --full-contents --reloc to make sure there are no relocation
27 * entries.
28 */
29
30#include <linux/linkage.h>
31#include <asm/pgtable.h>
32#include <asm/page.h>
33#include <asm/msr.h>
34#include <asm/segment.h>
35
/*
 * 16-bit entry code.  In order it: points %ds/%es/%ss at the code
 * segment, stores the 0xA5A5A5A5 marker over the first bytes of the
 * trampoline, sets up a stack, calls verify_cpu, relocates the two jump
 * vectors and the GDT pointer by the trampoline's linear base
 * (%cs << 4, kept in %esi), loads the IDT/GDT, sets the PE bit and
 * far-jumps to startup_32.
 */
36.data
37
38.code16
39
40ENTRY(trampoline_data)
41r_base = .
42 cli # We should be safe anyway
43 wbinvd
44 mov %cs, %ax # Code and data in the same place
45 mov %ax, %ds
46 mov %ax, %es
47 mov %ax, %ss
48
49
50 movl $0xA5A5A5A5, trampoline_data - r_base
51 # write marker for master knows we're running
52
53 # Setup stack
54 movw $(trampoline_stack_end - r_base), %sp
55
56 call verify_cpu # Verify the cpu supports long mode
57 testl %eax, %eax # Check for return code
58 jnz no_longmode
59
60 mov %cs, %ax
61 movzx %ax, %esi # Find the 32bit trampoline location
62 shll $4, %esi
63
64 # Fixup the vectors
65 addl %esi, startup_32_vector - r_base
66 addl %esi, startup_64_vector - r_base
67 addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer
68
69 /*
70 * GDT tables in non default location kernel can be beyond 16MB and
71 * lgdt will not be able to load the address as in real mode default
72 * operand size is 16bit. Use lgdtl instead to force operand size
73 * to 32 bit.
74 */
75
76 lidtl tidt - r_base # load idt with 0, 0
77 lgdtl tgdt - r_base # load gdt with whatever is appropriate
78
79 xor %ax, %ax
80 inc %ax # protected mode (PE) bit
81 lmsw %ax # into protected mode
82
83 # flush prefetch and jump to startup_32
84 ljmpl *(startup_32_vector - r_base)
85
/*
 * 32-bit protected-mode stage.  %esi still holds the trampoline's linear
 * base computed above and is used to address trampoline_level4_pgt and
 * startup_64_vector.
 */
86 .code32
87 .balign 4
88startup_32:
89 movl $__KERNEL_DS, %eax # Initialize the %ds segment register
90 movl %eax, %ds
91
92 xorl %eax, %eax
93 btsl $5, %eax # Enable PAE mode
94 movl %eax, %cr4
95
96 # Setup trampoline 4 level pagetables
97 leal (trampoline_level4_pgt - r_base)(%esi), %eax
98 movl %eax, %cr3
99
100 movl $MSR_EFER, %ecx
101 movl $(1 << _EFER_LME), %eax # Enable Long Mode
102 xorl %edx, %edx
103 wrmsr
104
105 xorl %eax, %eax
106 btsl $31, %eax # Enable paging and in turn activate Long Mode
107 btsl $0, %eax # Enable protected mode
108 movl %eax, %cr0
109
110 /*
111 * At this point we're in long mode but in 32bit compatibility mode
112 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
113 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
114 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
115 */
116 ljmp *(startup_64_vector - r_base)(%esi)
117
/*
 * 64-bit stage: leave the trampoline through an indirect jump to the
 * address of secondary_startup_64 loaded into %rax.
 */
118 .code64
119 .balign 4
120startup_64:
121 # Now jump into the kernel using virtual addresses
122 movq $secondary_startup_64, %rax
123 jmp *%rax
124
/*
 * Reached when verify_cpu returned non-zero: halt, and halt again if the
 * CPU ever wakes up.
 */
125 .code16
126no_longmode:
127 hlt
128 jmp no_longmode
129#include "verify_cpu_64.S"
130
/* Descriptor-table pointers and jump vectors used by the code above. */
131 # Careful these need to be in the same 64K segment as the above;
132tidt:
133 .word 0 # idt limit = 0
134 .word 0, 0 # idt base = 0L
135
136 # Duplicate the global descriptor table
137 # so the kernel can live anywhere
138 .balign 4
139tgdt:
140 .short tgdt_end - tgdt # gdt limit
141 .long tgdt - r_base
142 .short 0
143 .quad 0x00cf9b000000ffff # __KERNEL32_CS
144 .quad 0x00af9b000000ffff # __KERNEL_CS
145 .quad 0x00cf93000000ffff # __KERNEL_DS
146tgdt_end:
147
/*
 * Far-jump targets: a 32-bit offset (relocated at run time by the addl
 * instructions in the 16-bit code) followed by the code segment selector.
 */
148 .balign 4
149startup_32_vector:
150 .long startup_32 - r_base
151 .word __KERNEL32_CS, 0
152
153 .balign 4
154startup_64_vector:
155 .long startup_64 - r_base
156 .word __KERNEL_CS, 0
157
/*
 * The stack occupies the space up to offset 0x1000; the top-level page
 * table follows.  Its first entry points at level3_ident_pgt and its
 * last (after 510 zero entries) at level3_kernel_pgt.
 */
158trampoline_stack:
159 .org 0x1000
160trampoline_stack_end:
161ENTRY(trampoline_level4_pgt)
162 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
163 .fill 510,8,0
164 .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
165
166ENTRY(trampoline_end)
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
new file mode 100644
index 000000000000..03888420775d
--- /dev/null
+++ b/arch/x86/kernel/traps_64.c
@@ -0,0 +1,1138 @@
1/*
2 * linux/arch/x86-64/traps.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
6 *
7 * Pentium III FXSR, SSE support
8 * Gareth Hughes <gareth@valinux.com>, May 2000
9 */
10
11/*
12 * 'Traps.c' handles hardware traps and faults after we have saved some
13 * state in 'entry.S'.
14 */
15#include <linux/sched.h>
16#include <linux/kernel.h>
17#include <linux/string.h>
18#include <linux/errno.h>
19#include <linux/ptrace.h>
20#include <linux/timer.h>
21#include <linux/mm.h>
22#include <linux/init.h>
23#include <linux/delay.h>
24#include <linux/spinlock.h>
25#include <linux/interrupt.h>
26#include <linux/kallsyms.h>
27#include <linux/module.h>
28#include <linux/moduleparam.h>
29#include <linux/nmi.h>
30#include <linux/kprobes.h>
31#include <linux/kexec.h>
32#include <linux/unwind.h>
33#include <linux/uaccess.h>
34#include <linux/bug.h>
35#include <linux/kdebug.h>
36
37#if defined(CONFIG_EDAC)
38#include <linux/edac.h>
39#endif
40
41#include <asm/system.h>
42#include <asm/io.h>
43#include <asm/atomic.h>
44#include <asm/debugreg.h>
45#include <asm/desc.h>
46#include <asm/i387.h>
47#include <asm/processor.h>
48#include <asm/unwind.h>
49#include <asm/smp.h>
50#include <asm/pgalloc.h>
51#include <asm/pda.h>
52#include <asm/proto.h>
53#include <asm/nmi.h>
54#include <asm/stacktrace.h>
55
56asmlinkage void divide_error(void);
57asmlinkage void debug(void);
58asmlinkage void nmi(void);
59asmlinkage void int3(void);
60asmlinkage void overflow(void);
61asmlinkage void bounds(void);
62asmlinkage void invalid_op(void);
63asmlinkage void device_not_available(void);
64asmlinkage void double_fault(void);
65asmlinkage void coprocessor_segment_overrun(void);
66asmlinkage void invalid_TSS(void);
67asmlinkage void segment_not_present(void);
68asmlinkage void stack_segment(void);
69asmlinkage void general_protection(void);
70asmlinkage void page_fault(void);
71asmlinkage void coprocessor_error(void);
72asmlinkage void simd_coprocessor_error(void);
73asmlinkage void reserved(void);
74asmlinkage void alignment_check(void);
75asmlinkage void machine_check(void);
76asmlinkage void spurious_interrupt_bug(void);
77
78static inline void conditional_sti(struct pt_regs *regs)
79{
80 if (regs->eflags & X86_EFLAGS_IF)
81 local_irq_enable();
82}
83
84static inline void preempt_conditional_sti(struct pt_regs *regs)
85{
86 preempt_disable();
87 if (regs->eflags & X86_EFLAGS_IF)
88 local_irq_enable();
89}
90
91static inline void preempt_conditional_cli(struct pt_regs *regs)
92{
93 if (regs->eflags & X86_EFLAGS_IF)
94 local_irq_disable();
95 /* Make sure to not schedule here because we could be running
96 on an exception stack. */
97 preempt_enable_no_resched();
98}
99
/* Maximum number of raw stack words printed by _show_stack(). */
int kstack_depth_to_print = 12;
101
/*
 * printk_address - print one code address on a line of its own.
 *
 * With CONFIG_KALLSYMS the address is followed by module name (if any),
 * symbol name, offset into the symbol and symbol size whenever
 * kallsyms_lookup() finds a symbol; otherwise only the address is shown.
 */
#ifdef CONFIG_KALLSYMS
void printk_address(unsigned long address)
{
	unsigned long sym_off = 0, sym_len;
	const char *sym;
	char *mod;
	char *sep = ":";
	char symbuf[128];

	sym = kallsyms_lookup(address, &sym_len, &sym_off, &mod, symbuf);
	if (sym == NULL) {
		printk(" [<%016lx>]\n", address);
		return;
	}
	/* No module: print neither a module name nor the ':' separators. */
	if (mod == NULL)
		mod = sep = "";
	printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
	       address, sep, mod, sep, sym, sym_off, sym_len);
}
#else
void printk_address(unsigned long address)
{
	printk(" [<%016lx>]\n", address);
}
#endif
128
129static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
130 unsigned *usedp, char **idp)
131{
132 static char ids[][8] = {
133 [DEBUG_STACK - 1] = "#DB",
134 [NMI_STACK - 1] = "NMI",
135 [DOUBLEFAULT_STACK - 1] = "#DF",
136 [STACKFAULT_STACK - 1] = "#SS",
137 [MCE_STACK - 1] = "#MC",
138#if DEBUG_STKSZ > EXCEPTION_STKSZ
139 [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
140#endif
141 };
142 unsigned k;
143
144 /*
145 * Iterate over all exception stacks, and figure out whether
146 * 'stack' is in one of them:
147 */
148 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
149 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
150 /*
151 * Is 'stack' above this exception frame's end?
152 * If yes then skip to the next frame.
153 */
154 if (stack >= end)
155 continue;
156 /*
157 * Is 'stack' above this exception frame's start address?
158 * If yes then we found the right frame.
159 */
160 if (stack >= end - EXCEPTION_STKSZ) {
161 /*
162 * Make sure we only iterate through an exception
163 * stack once. If it comes up for the second time
164 * then there's something wrong going on - just
165 * break out and return NULL:
166 */
167 if (*usedp & (1U << k))
168 break;
169 *usedp |= 1U << k;
170 *idp = ids[k];
171 return (unsigned long *)end;
172 }
173 /*
174 * If this is a debug stack, and if it has a larger size than
175 * the usual exception stacks, then 'stack' might still
176 * be within the lower portion of the debug stack:
177 */
178#if DEBUG_STKSZ > EXCEPTION_STKSZ
179 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
180 unsigned j = N_EXCEPTION_STACKS - 1;
181
182 /*
183 * Black magic. A large debug stack is composed of
184 * multiple exception stack entries, which we
185 * iterate through now. Dont look:
186 */
187 do {
188 ++j;
189 end -= EXCEPTION_STKSZ;
190 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
191 } while (stack < end - EXCEPTION_STKSZ);
192 if (*usedp & (1U << j))
193 break;
194 *usedp |= 1U << j;
195 *idp = ids[j];
196 return (unsigned long *)end;
197 }
198#endif
199 }
200 return NULL;
201}
202
/* Shorthand for ops->warning(); expects 'ops' and 'data' to be in scope. */
#define MSG(txt) ops->warning(data, txt)
204
205/*
206 * x86-64 can have upto three kernel stacks:
207 * process stack
208 * interrupt stack
209 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
210 */
211
212static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
213{
214 void *t = (void *)tinfo;
215 return p > t && p < t + THREAD_SIZE - 3;
216}
217
218void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
219 unsigned long *stack,
220 struct stacktrace_ops *ops, void *data)
221{
222 const unsigned cpu = get_cpu();
223 unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
224 unsigned used = 0;
225 struct thread_info *tinfo;
226
227 if (!tsk)
228 tsk = current;
229
230 if (!stack) {
231 unsigned long dummy;
232 stack = &dummy;
233 if (tsk && tsk != current)
234 stack = (unsigned long *)tsk->thread.rsp;
235 }
236
237 /*
238 * Print function call entries within a stack. 'cond' is the
239 * "end of stackframe" condition, that the 'stack++'
240 * iteration will eventually trigger.
241 */
242#define HANDLE_STACK(cond) \
243 do while (cond) { \
244 unsigned long addr = *stack++; \
245 /* Use unlocked access here because except for NMIs \
246 we should be already protected against module unloads */ \
247 if (__kernel_text_address(addr)) { \
248 /* \
249 * If the address is either in the text segment of the \
250 * kernel, or in the region which contains vmalloc'ed \
251 * memory, it *may* be the address of a calling \
252 * routine; if so, print it so that someone tracing \
253 * down the cause of the crash will be able to figure \
254 * out the call path that was taken. \
255 */ \
256 ops->address(data, addr); \
257 } \
258 } while (0)
259
260 /*
261 * Print function call entries in all stacks, starting at the
262 * current stack address. If the stacks consist of nested
263 * exceptions
264 */
265 for (;;) {
266 char *id;
267 unsigned long *estack_end;
268 estack_end = in_exception_stack(cpu, (unsigned long)stack,
269 &used, &id);
270
271 if (estack_end) {
272 if (ops->stack(data, id) < 0)
273 break;
274 HANDLE_STACK (stack < estack_end);
275 ops->stack(data, "<EOE>");
276 /*
277 * We link to the next stack via the
278 * second-to-last pointer (index -2 to end) in the
279 * exception stack:
280 */
281 stack = (unsigned long *) estack_end[-2];
282 continue;
283 }
284 if (irqstack_end) {
285 unsigned long *irqstack;
286 irqstack = irqstack_end -
287 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
288
289 if (stack >= irqstack && stack < irqstack_end) {
290 if (ops->stack(data, "IRQ") < 0)
291 break;
292 HANDLE_STACK (stack < irqstack_end);
293 /*
294 * We link to the next stack (which would be
295 * the process stack normally) the last
296 * pointer (index -1 to end) in the IRQ stack:
297 */
298 stack = (unsigned long *) (irqstack_end[-1]);
299 irqstack_end = NULL;
300 ops->stack(data, "EOI");
301 continue;
302 }
303 }
304 break;
305 }
306
307 /*
308 * This handles the process stack:
309 */
310 tinfo = task_thread_info(tsk);
311 HANDLE_STACK (valid_stack_ptr(tinfo, stack));
312#undef HANDLE_STACK
313 put_cpu();
314}
315EXPORT_SYMBOL(dump_trace);
316
/*
 * ->warning_symbol callback of print_trace_ops: print @msg through
 * print_symbol() with @symbol, then end the line.  @data is unused.
 */
static void
print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	print_symbol(msg, symbol);
	printk("\n");
}
323
/* ->warning callback of print_trace_ops: print @msg on its own line. */
static void print_trace_warning(void *data, char *msg)
{
	printk("%s\n", msg);
}
328
/*
 * ->stack callback of print_trace_ops: print the stack name in angle
 * brackets.  Always returns 0, so the walk is never cut short.
 */
static int print_trace_stack(void *data, char *name)
{
	printk(" <%s> ", name);
	return 0;
}
334
/*
 * ->address callback of print_trace_ops: touch the NMI watchdog, then
 * print @addr through printk_address().
 */
static void print_trace_address(void *data, unsigned long addr)
{
	touch_nmi_watchdog();
	printk_address(addr);
}
340
341static struct stacktrace_ops print_trace_ops = {
342 .warning = print_trace_warning,
343 .warning_symbol = print_trace_warning_symbol,
344 .stack = print_trace_stack,
345 .address = print_trace_address,
346};
347
348void
349show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
350{
351 printk("\nCall Trace:\n");
352 dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
353 printk("\n");
354}
355
356static void
357_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
358{
359 unsigned long *stack;
360 int i;
361 const int cpu = smp_processor_id();
362 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
363 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
364
365 // debugging aid: "show_stack(NULL, NULL);" prints the
366 // back trace for this cpu.
367
368 if (rsp == NULL) {
369 if (tsk)
370 rsp = (unsigned long *)tsk->thread.rsp;
371 else
372 rsp = (unsigned long *)&rsp;
373 }
374
375 stack = rsp;
376 for(i=0; i < kstack_depth_to_print; i++) {
377 if (stack >= irqstack && stack <= irqstack_end) {
378 if (stack == irqstack_end) {
379 stack = (unsigned long *) (irqstack_end[-1]);
380 printk(" <EOI> ");
381 }
382 } else {
383 if (((long) stack & (THREAD_SIZE-1)) == 0)
384 break;
385 }
386 if (i && ((i % 4) == 0))
387 printk("\n");
388 printk(" %016lx", *stack++);
389 touch_nmi_watchdog();
390 }
391 show_trace(tsk, regs, rsp);
392}
393
394void show_stack(struct task_struct *tsk, unsigned long * rsp)
395{
396 _show_stack(tsk, NULL, rsp);
397}
398
399/*
400 * The architecture-independent dump_stack generator
401 */
402void dump_stack(void)
403{
404 unsigned long dummy;
405 show_trace(NULL, NULL, &dummy);
406}
407
408EXPORT_SYMBOL(dump_stack);
409
410void show_registers(struct pt_regs *regs)
411{
412 int i;
413 int in_kernel = !user_mode(regs);
414 unsigned long rsp;
415 const int cpu = smp_processor_id();
416 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
417
418 rsp = regs->rsp;
419 printk("CPU %d ", cpu);
420 __show_regs(regs);
421 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
422 cur->comm, cur->pid, task_thread_info(cur), cur);
423
424 /*
425 * When in-kernel, we also print out the stack and code at the
426 * time of the fault..
427 */
428 if (in_kernel) {
429 printk("Stack: ");
430 _show_stack(NULL, regs, (unsigned long*)rsp);
431
432 printk("\nCode: ");
433 if (regs->rip < PAGE_OFFSET)
434 goto bad;
435
436 for (i=0; i<20; i++) {
437 unsigned char c;
438 if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
439bad:
440 printk(" Bad RIP value.");
441 break;
442 }
443 printk("%02x ", c);
444 }
445 }
446 printk("\n");
447}
448
449int is_valid_bugaddr(unsigned long rip)
450{
451 unsigned short ud2;
452
453 if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
454 return 0;
455
456 return ud2 == 0x0b0f;
457}
458
#ifdef CONFIG_BUG
/* Exported, out-of-line wrapper around BUG(). */
void out_of_line_bug(void)
{
	BUG();
}
EXPORT_SYMBOL(out_of_line_bug);
#endif
466
/* State shared by oops_begin() and oops_end(). */
static DEFINE_SPINLOCK(die_lock);	/* held while an oops is reported */
static int die_owner = -1;		/* cpu set by oops_begin(), -1 if none */
static unsigned int die_nest_count;	/* oops_begin() calls not yet ended */
470
471unsigned __kprobes long oops_begin(void)
472{
473 int cpu;
474 unsigned long flags;
475
476 oops_enter();
477
478 /* racy, but better than risking deadlock. */
479 local_irq_save(flags);
480 cpu = smp_processor_id();
481 if (!spin_trylock(&die_lock)) {
482 if (cpu == die_owner)
483 /* nested oops. should stop eventually */;
484 else
485 spin_lock(&die_lock);
486 }
487 die_nest_count++;
488 die_owner = cpu;
489 console_verbose();
490 bust_spinlocks(1);
491 return flags;
492}
493
494void __kprobes oops_end(unsigned long flags)
495{
496 die_owner = -1;
497 bust_spinlocks(0);
498 die_nest_count--;
499 if (die_nest_count)
500 /* We still own the lock */
501 local_irq_restore(flags);
502 else
503 /* Nest count reaches zero, release the lock. */
504 spin_unlock_irqrestore(&die_lock, flags);
505 if (panic_on_oops)
506 panic("Fatal exception");
507 oops_exit();
508}
509
510void __kprobes __die(const char * str, struct pt_regs * regs, long err)
511{
512 static int die_counter;
513 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
514#ifdef CONFIG_PREEMPT
515 printk("PREEMPT ");
516#endif
517#ifdef CONFIG_SMP
518 printk("SMP ");
519#endif
520#ifdef CONFIG_DEBUG_PAGEALLOC
521 printk("DEBUG_PAGEALLOC");
522#endif
523 printk("\n");
524 notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
525 show_registers(regs);
526 add_taint(TAINT_DIE);
527 /* Executive summary in case the oops scrolled away */
528 printk(KERN_ALERT "RIP ");
529 printk_address(regs->rip);
530 printk(" RSP <%016lx>\n", regs->rsp);
531 if (kexec_should_crash(current))
532 crash_kexec(regs);
533}
534
535void die(const char * str, struct pt_regs * regs, long err)
536{
537 unsigned long flags = oops_begin();
538
539 if (!user_mode(regs))
540 report_bug(regs->rip, regs);
541
542 __die(str, regs, err);
543 oops_end(flags);
544 do_exit(SIGSEGV);
545}
546
547void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
548{
549 unsigned long flags = oops_begin();
550
551 /*
552 * We are in trouble anyway, lets at least try
553 * to get a message out.
554 */
555 printk(str, smp_processor_id());
556 show_registers(regs);
557 if (kexec_should_crash(current))
558 crash_kexec(regs);
559 if (do_panic || panic_on_oops)
560 panic("Non maskable interrupt");
561 oops_end(flags);
562 nmi_exit();
563 local_irq_enable();
564 do_exit(SIGSEGV);
565}
566
567static void __kprobes do_trap(int trapnr, int signr, char *str,
568 struct pt_regs * regs, long error_code,
569 siginfo_t *info)
570{
571 struct task_struct *tsk = current;
572
573 if (user_mode(regs)) {
574 /*
575 * We want error_code and trap_no set for userspace
576 * faults and kernelspace faults which result in
577 * die(), but not kernelspace faults which are fixed
578 * up. die() gives the process no chance to handle
579 * the signal and notice the kernel fault information,
580 * so that won't result in polluting the information
581 * about previously queued, but not yet delivered,
582 * faults. See also do_general_protection below.
583 */
584 tsk->thread.error_code = error_code;
585 tsk->thread.trap_no = trapnr;
586
587 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
588 printk_ratelimit())
589 printk(KERN_INFO
590 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
591 tsk->comm, tsk->pid, str,
592 regs->rip, regs->rsp, error_code);
593
594 if (info)
595 force_sig_info(signr, info, tsk);
596 else
597 force_sig(signr, tsk);
598 return;
599 }
600
601
602 /* kernel trap */
603 {
604 const struct exception_table_entry *fixup;
605 fixup = search_exception_tables(regs->rip);
606 if (fixup)
607 regs->rip = fixup->fixup;
608 else {
609 tsk->thread.error_code = error_code;
610 tsk->thread.trap_no = trapnr;
611 die(str, regs, error_code);
612 }
613 return;
614 }
615}
616
/*
 * Template for an exception handler named do_<name> that carries no
 * siginfo: run the die notifier chain and, unless a notifier returned
 * NOTIFY_STOP, hand the trap to do_trap() with a NULL info pointer.
 */
#define DO_ERROR(trapnr, signr, str, name)				\
asmlinkage void do_##name(struct pt_regs * regs, long error_code)	\
{									\
	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)	\
							!= NOTIFY_STOP) { \
		conditional_sti(regs);					\
		do_trap(trapnr, signr, str, regs, error_code, NULL);	\
	}								\
}
626
/*
 * Template for an exception handler named do_<name> that delivers a
 * siginfo.  The siginfo is filled in (signal, si_code = sicode,
 * si_addr = siaddr) before the die notifier chain runs; unless a notifier
 * returned NOTIFY_STOP the trap is then handed to do_trap().
 */
#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr)		\
asmlinkage void do_##name(struct pt_regs * regs, long error_code)	\
{									\
	siginfo_t info;							\
									\
	info.si_signo = signr;						\
	info.si_errno = 0;						\
	info.si_code = sicode;						\
	info.si_addr = (void __user *)siaddr;				\
	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)	\
							!= NOTIFY_STOP) { \
		conditional_sti(regs);					\
		do_trap(trapnr, signr, str, regs, error_code, &info);	\
	}								\
}
641
/*
 * Instantiate the simple exception handlers from the DO_ERROR and
 * DO_ERROR_INFO templates.  Arguments: trap number, signal, description
 * string, handler name suffix (the function becomes do_<name>) and, for
 * DO_ERROR_INFO only, the si_code and si_addr reported to the task.
 */
642DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
643DO_ERROR( 4, SIGSEGV, "overflow", overflow)
644DO_ERROR( 5, SIGSEGV, "bounds", bounds)
645DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
646DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
647DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
648DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
649DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
650DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
651DO_ERROR(18, SIGSEGV, "reserved", reserved)
652
653/* Runs on IST stack */
654asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
655{
656 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
657 12, SIGBUS) == NOTIFY_STOP)
658 return;
659 preempt_conditional_sti(regs);
660 do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
661 preempt_conditional_cli(regs);
662}
663
664asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
665{
666 static const char str[] = "double fault";
667 struct task_struct *tsk = current;
668
669 /* Return not checked because double check cannot be ignored */
670 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
671
672 tsk->thread.error_code = error_code;
673 tsk->thread.trap_no = 8;
674
675 /* This is always a kernel trap and never fixable (and thus must
676 never return). */
677 for (;;)
678 die(str, regs, error_code);
679}
680
681asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
682 long error_code)
683{
684 struct task_struct *tsk = current;
685
686 conditional_sti(regs);
687
688 if (user_mode(regs)) {
689 tsk->thread.error_code = error_code;
690 tsk->thread.trap_no = 13;
691
692 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
693 printk_ratelimit())
694 printk(KERN_INFO
695 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
696 tsk->comm, tsk->pid,
697 regs->rip, regs->rsp, error_code);
698
699 force_sig(SIGSEGV, tsk);
700 return;
701 }
702
703 /* kernel gp */
704 {
705 const struct exception_table_entry *fixup;
706 fixup = search_exception_tables(regs->rip);
707 if (fixup) {
708 regs->rip = fixup->fixup;
709 return;
710 }
711
712 tsk->thread.error_code = error_code;
713 tsk->thread.trap_no = 13;
714 if (notify_die(DIE_GPF, "general protection fault", regs,
715 error_code, 13, SIGSEGV) == NOTIFY_STOP)
716 return;
717 die("general protection fault", regs, error_code);
718 }
719}
720
721static __kprobes void
722mem_parity_error(unsigned char reason, struct pt_regs * regs)
723{
724 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
725 reason);
726 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
727
728#if defined(CONFIG_EDAC)
729 if(edac_handler_set()) {
730 edac_atomic_assert_error();
731 return;
732 }
733#endif
734
735 if (panic_on_unrecovered_nmi)
736 panic("NMI: Not continuing");
737
738 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
739
740 /* Clear and disable the memory parity error line. */
741 reason = (reason & 0xf) | 4;
742 outb(reason, 0x61);
743}
744
745static __kprobes void
746io_check_error(unsigned char reason, struct pt_regs * regs)
747{
748 printk("NMI: IOCK error (debug interrupt?)\n");
749 show_registers(regs);
750
751 /* Re-enable the IOCK line, wait for a few seconds */
752 reason = (reason & 0xf) | 8;
753 outb(reason, 0x61);
754 mdelay(2000);
755 reason &= ~8;
756 outb(reason, 0x61);
757}
758
759static __kprobes void
760unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
761{
762 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
763 reason);
764 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
765
766 if (panic_on_unrecovered_nmi)
767 panic("NMI: Not continuing");
768
769 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
770}
771
772/* Runs on IST stack. This code must keep interrupts off all the time.
773 Nested NMIs are prevented by the CPU. */
774asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
775{
776 unsigned char reason = 0;
777 int cpu;
778
779 cpu = smp_processor_id();
780
781 /* Only the BSP gets external NMIs from the system. */
782 if (!cpu)
783 reason = get_nmi_reason();
784
785 if (!(reason & 0xc0)) {
786 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
787 == NOTIFY_STOP)
788 return;
789 /*
790 * Ok, so this is none of the documented NMI sources,
791 * so it must be the NMI watchdog.
792 */
793 if (nmi_watchdog_tick(regs,reason))
794 return;
795 if (!do_nmi_callback(regs,cpu))
796 unknown_nmi_error(reason, regs);
797
798 return;
799 }
800 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
801 return;
802
803 /* AK: following checks seem to be broken on modern chipsets. FIXME */
804
805 if (reason & 0x80)
806 mem_parity_error(reason, regs);
807 if (reason & 0x40)
808 io_check_error(reason, regs);
809}
810
811/* runs on IST stack. */
812asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
813{
814 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
815 return;
816 }
817 preempt_conditional_sti(regs);
818 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
819 preempt_conditional_cli(regs);
820}
821
822/* Help handler running on IST stack to switch back to user stack
823 for scheduling or signal handling. The actual stack switch is done in
824 entry.S */
825asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
826{
827 struct pt_regs *regs = eregs;
828 /* Did already sync */
829 if (eregs == (struct pt_regs *)eregs->rsp)
830 ;
831 /* Exception from user space */
832 else if (user_mode(eregs))
833 regs = task_pt_regs(current);
834 /* Exception from kernel and interrupts are enabled. Move to
835 kernel process stack. */
836 else if (eregs->eflags & X86_EFLAGS_IF)
837 regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
838 if (eregs != regs)
839 *regs = *eregs;
840 return regs;
841}
842
843/* runs on IST stack. */
844asmlinkage void __kprobes do_debug(struct pt_regs * regs,
845 unsigned long error_code)
846{
847 unsigned long condition;
848 struct task_struct *tsk = current;
849 siginfo_t info;
850
851 get_debugreg(condition, 6);
852
853 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
854 SIGTRAP) == NOTIFY_STOP)
855 return;
856
857 preempt_conditional_sti(regs);
858
859 /* Mask out spurious debug traps due to lazy DR7 setting */
860 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
861 if (!tsk->thread.debugreg7) {
862 goto clear_dr7;
863 }
864 }
865
866 tsk->thread.debugreg6 = condition;
867
868 /* Mask out spurious TF errors due to lazy TF clearing */
869 if (condition & DR_STEP) {
870 /*
871 * The TF error should be masked out only if the current
872 * process is not traced and if the TRAP flag has been set
873 * previously by a tracing process (condition detected by
874 * the PT_DTRACE flag); remember that the i386 TRAP flag
875 * can be modified by the process itself in user mode,
876 * allowing programs to debug themselves without the ptrace()
877 * interface.
878 */
879 if (!user_mode(regs))
880 goto clear_TF_reenable;
881 /*
882 * Was the TF flag set by a debugger? If so, clear it now,
883 * so that register information is correct.
884 */
885 if (tsk->ptrace & PT_DTRACE) {
886 regs->eflags &= ~TF_MASK;
887 tsk->ptrace &= ~PT_DTRACE;
888 }
889 }
890
891 /* Ok, finally something we can handle */
892 tsk->thread.trap_no = 1;
893 tsk->thread.error_code = error_code;
894 info.si_signo = SIGTRAP;
895 info.si_errno = 0;
896 info.si_code = TRAP_BRKPT;
897 info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
898 force_sig_info(SIGTRAP, &info, tsk);
899
900clear_dr7:
901 set_debugreg(0UL, 7);
902 preempt_conditional_cli(regs);
903 return;
904
905clear_TF_reenable:
906 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
907 regs->eflags &= ~TF_MASK;
908 preempt_conditional_cli(regs);
909}
910
911static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
912{
913 const struct exception_table_entry *fixup;
914 fixup = search_exception_tables(regs->rip);
915 if (fixup) {
916 regs->rip = fixup->fixup;
917 return 1;
918 }
919 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
920 /* Illegal floating point operation in the kernel */
921 current->thread.trap_no = trapnr;
922 die(str, regs, 0);
923 return 0;
924}
925
926/*
927 * Note that we play around with the 'TS' bit in an attempt to get
928 * the correct behaviour even in the presence of the asynchronous
929 * IRQ13 behaviour
930 */
931asmlinkage void do_coprocessor_error(struct pt_regs *regs)
932{
933 void __user *rip = (void __user *)(regs->rip);
934 struct task_struct * task;
935 siginfo_t info;
936 unsigned short cwd, swd;
937
938 conditional_sti(regs);
939 if (!user_mode(regs) &&
940 kernel_math_error(regs, "kernel x87 math error", 16))
941 return;
942
943 /*
944 * Save the info for the exception handler and clear the error.
945 */
946 task = current;
947 save_init_fpu(task);
948 task->thread.trap_no = 16;
949 task->thread.error_code = 0;
950 info.si_signo = SIGFPE;
951 info.si_errno = 0;
952 info.si_code = __SI_FAULT;
953 info.si_addr = rip;
954 /*
955 * (~cwd & swd) will mask out exceptions that are not set to unmasked
956 * status. 0x3f is the exception bits in these regs, 0x200 is the
957 * C1 reg you need in case of a stack fault, 0x040 is the stack
958 * fault bit. We should only be taking one exception at a time,
959 * so if this combination doesn't produce any single exception,
960 * then we have a bad program that isn't synchronizing its FPU usage
961 * and it will suffer the consequences since we won't be able to
962 * fully reproduce the context of the exception
963 */
964 cwd = get_fpu_cwd(task);
965 swd = get_fpu_swd(task);
966 switch (swd & ~cwd & 0x3f) {
967 case 0x000:
968 default:
969 break;
970 case 0x001: /* Invalid Op */
971 /*
972 * swd & 0x240 == 0x040: Stack Underflow
973 * swd & 0x240 == 0x240: Stack Overflow
974 * User must clear the SF bit (0x40) if set
975 */
976 info.si_code = FPE_FLTINV;
977 break;
978 case 0x002: /* Denormalize */
979 case 0x010: /* Underflow */
980 info.si_code = FPE_FLTUND;
981 break;
982 case 0x004: /* Zero Divide */
983 info.si_code = FPE_FLTDIV;
984 break;
985 case 0x008: /* Overflow */
986 info.si_code = FPE_FLTOVF;
987 break;
988 case 0x020: /* Precision */
989 info.si_code = FPE_FLTRES;
990 break;
991 }
992 force_sig_info(SIGFPE, &info, task);
993}
994
995asmlinkage void bad_intr(void)
996{
997 printk("bad interrupt");
998}
999
1000asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1001{
1002 void __user *rip = (void __user *)(regs->rip);
1003 struct task_struct * task;
1004 siginfo_t info;
1005 unsigned short mxcsr;
1006
1007 conditional_sti(regs);
1008 if (!user_mode(regs) &&
1009 kernel_math_error(regs, "kernel simd math error", 19))
1010 return;
1011
1012 /*
1013 * Save the info for the exception handler and clear the error.
1014 */
1015 task = current;
1016 save_init_fpu(task);
1017 task->thread.trap_no = 19;
1018 task->thread.error_code = 0;
1019 info.si_signo = SIGFPE;
1020 info.si_errno = 0;
1021 info.si_code = __SI_FAULT;
1022 info.si_addr = rip;
1023 /*
1024 * The SIMD FPU exceptions are handled a little differently, as there
1025 * is only a single status/control register. Thus, to determine which
1026 * unmasked exception was caught we must mask the exception mask bits
1027 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
1028 */
1029 mxcsr = get_fpu_mxcsr(task);
1030 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
1031 case 0x000:
1032 default:
1033 break;
1034 case 0x001: /* Invalid Op */
1035 info.si_code = FPE_FLTINV;
1036 break;
1037 case 0x002: /* Denormalize */
1038 case 0x010: /* Underflow */
1039 info.si_code = FPE_FLTUND;
1040 break;
1041 case 0x004: /* Zero Divide */
1042 info.si_code = FPE_FLTDIV;
1043 break;
1044 case 0x008: /* Overflow */
1045 info.si_code = FPE_FLTOVF;
1046 break;
1047 case 0x020: /* Precision */
1048 info.si_code = FPE_FLTRES;
1049 break;
1050 }
1051 force_sig_info(SIGFPE, &info, task);
1052}
1053
1054asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
1055{
1056}
1057
1058asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
1059{
1060}
1061
1062asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
1063{
1064}
1065
1066/*
1067 * 'math_state_restore()' saves the current math information in the
1068 * old math state array, and gets the new ones from the current task
1069 *
1070 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
1071 * Don't touch unless you *really* know how it works.
1072 */
1073asmlinkage void math_state_restore(void)
1074{
1075 struct task_struct *me = current;
1076 clts(); /* Allow maths ops (or we recurse) */
1077
1078 if (!used_math())
1079 init_fpu(me);
1080 restore_fpu_checking(&me->thread.i387.fxsave);
1081 task_thread_info(me)->status |= TS_USEDFPU;
1082 me->fpu_counter++;
1083}
1084
1085void __init trap_init(void)
1086{
1087 set_intr_gate(0,&divide_error);
1088 set_intr_gate_ist(1,&debug,DEBUG_STACK);
1089 set_intr_gate_ist(2,&nmi,NMI_STACK);
1090 set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */
1091 set_system_gate(4,&overflow); /* int4 can be called from all */
1092 set_intr_gate(5,&bounds);
1093 set_intr_gate(6,&invalid_op);
1094 set_intr_gate(7,&device_not_available);
1095 set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK);
1096 set_intr_gate(9,&coprocessor_segment_overrun);
1097 set_intr_gate(10,&invalid_TSS);
1098 set_intr_gate(11,&segment_not_present);
1099 set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK);
1100 set_intr_gate(13,&general_protection);
1101 set_intr_gate(14,&page_fault);
1102 set_intr_gate(15,&spurious_interrupt_bug);
1103 set_intr_gate(16,&coprocessor_error);
1104 set_intr_gate(17,&alignment_check);
1105#ifdef CONFIG_X86_MCE
1106 set_intr_gate_ist(18,&machine_check, MCE_STACK);
1107#endif
1108 set_intr_gate(19,&simd_coprocessor_error);
1109
1110#ifdef CONFIG_IA32_EMULATION
1111 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
1112#endif
1113
1114 /*
1115 * Should be a barrier for any external CPU state.
1116 */
1117 cpu_init();
1118}
1119
1120
1121static int __init oops_setup(char *s)
1122{
1123 if (!s)
1124 return -EINVAL;
1125 if (!strcmp(s, "panic"))
1126 panic_on_oops = 1;
1127 return 0;
1128}
1129early_param("oops", oops_setup);
1130
1131static int __init kstack_setup(char *s)
1132{
1133 if (!s)
1134 return -EINVAL;
1135 kstack_depth_to_print = simple_strtoul(s,NULL,0);
1136 return 0;
1137}
1138early_param("kstack", kstack_setup);
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
new file mode 100644
index 000000000000..2a59bde663f2
--- /dev/null
+++ b/arch/x86/kernel/tsc_64.c
@@ -0,0 +1,207 @@
1#include <linux/kernel.h>
2#include <linux/sched.h>
3#include <linux/interrupt.h>
4#include <linux/init.h>
5#include <linux/clocksource.h>
6#include <linux/time.h>
7#include <linux/acpi.h>
8#include <linux/cpufreq.h>
9
10#include <asm/timex.h>
11
12static int notsc __initdata = 0;
13
14unsigned int cpu_khz; /* TSC clocks / usec, not used here */
15EXPORT_SYMBOL(cpu_khz);
16unsigned int tsc_khz;
17EXPORT_SYMBOL(tsc_khz);
18
19static unsigned int cyc2ns_scale __read_mostly;
20
21void set_cyc2ns_scale(unsigned long khz)
22{
23 cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz;
24}
25
26static unsigned long long cycles_2_ns(unsigned long long cyc)
27{
28 return (cyc * cyc2ns_scale) >> NS_SCALE;
29}
30
/*
 * Return the current TSC reading converted to nanoseconds.
 */
unsigned long long sched_clock(void)
{
	unsigned long tsc = 0;

	/*
	 * Could do CPU core sync here.  Opteron can execute rdtsc
	 * speculatively, which means it is not completely exact and may
	 * not be monotonic between CPUs.  But the errors should be too
	 * small to matter for scheduling purposes.
	 */
	rdtscll(tsc);

	return cycles_2_ns(tsc);
}
44
45static int tsc_unstable;
46
47inline int check_tsc_unstable(void)
48{
49 return tsc_unstable;
50}
51#ifdef CONFIG_CPU_FREQ
52
53/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
54 * changes.
55 *
56 * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
57 * not that important because current Opteron setups do not support
58 * scaling on SMP anyroads.
59 *
60 * Should fix up last_tsc too. Currently gettimeofday in the
61 * first tick after the change will be slightly wrong.
62 */
63
64static unsigned int ref_freq;
65static unsigned long loops_per_jiffy_ref;
66static unsigned long tsc_khz_ref;
67
68static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
69 void *data)
70{
71 struct cpufreq_freqs *freq = data;
72 unsigned long *lpj, dummy;
73
74 if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
75 return 0;
76
77 lpj = &dummy;
78 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
79#ifdef CONFIG_SMP
80 lpj = &cpu_data[freq->cpu].loops_per_jiffy;
81#else
82 lpj = &boot_cpu_data.loops_per_jiffy;
83#endif
84
85 if (!ref_freq) {
86 ref_freq = freq->old;
87 loops_per_jiffy_ref = *lpj;
88 tsc_khz_ref = tsc_khz;
89 }
90 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
91 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
92 (val == CPUFREQ_RESUMECHANGE)) {
93 *lpj =
94 cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
95
96 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
97 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
98 mark_tsc_unstable("cpufreq changes");
99 }
100
101 set_cyc2ns_scale(tsc_khz_ref);
102
103 return 0;
104}
105
106static struct notifier_block time_cpufreq_notifier_block = {
107 .notifier_call = time_cpufreq_notifier
108};
109
110static int __init cpufreq_tsc(void)
111{
112 cpufreq_register_notifier(&time_cpufreq_notifier_block,
113 CPUFREQ_TRANSITION_NOTIFIER);
114 return 0;
115}
116
117core_initcall(cpufreq_tsc);
118
119#endif
120
121/*
122 * Make an educated guess if the TSC is trustworthy and synchronized
123 * over all CPUs.
124 */
125__cpuinit int unsynchronized_tsc(void)
126{
127 if (tsc_unstable)
128 return 1;
129
130#ifdef CONFIG_SMP
131 if (apic_is_clustered_box())
132 return 1;
133#endif
134 /* Most intel systems have synchronized TSCs except for
135 multi node systems */
136 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
137#ifdef CONFIG_ACPI
138 /* But TSC doesn't tick in C3 so don't use it there */
139 if (acpi_gbl_FADT.header.length > 0 &&
140 acpi_gbl_FADT.C3latency < 1000)
141 return 1;
142#endif
143 return 0;
144 }
145
146 /* Assume multi socket systems are not synchronized */
147 return num_present_cpus() > 1;
148}
149
150int __init notsc_setup(char *s)
151{
152 notsc = 1;
153 return 1;
154}
155
156__setup("notsc", notsc_setup);
157
158
159/* clock source code: */
160static cycle_t read_tsc(void)
161{
162 cycle_t ret = (cycle_t)get_cycles_sync();
163 return ret;
164}
165
166static cycle_t __vsyscall_fn vread_tsc(void)
167{
168 cycle_t ret = (cycle_t)get_cycles_sync();
169 return ret;
170}
171
172static struct clocksource clocksource_tsc = {
173 .name = "tsc",
174 .rating = 300,
175 .read = read_tsc,
176 .mask = CLOCKSOURCE_MASK(64),
177 .shift = 22,
178 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
179 CLOCK_SOURCE_MUST_VERIFY,
180 .vread = vread_tsc,
181};
182
183void mark_tsc_unstable(char *reason)
184{
185 if (!tsc_unstable) {
186 tsc_unstable = 1;
187 printk("Marking TSC unstable due to %s\n", reason);
188 /* Change only the rating, when not registered */
189 if (clocksource_tsc.mult)
190 clocksource_change_rating(&clocksource_tsc, 0);
191 else
192 clocksource_tsc.rating = 0;
193 }
194}
195EXPORT_SYMBOL_GPL(mark_tsc_unstable);
196
197void __init init_tsc_clocksource(void)
198{
199 if (!notsc) {
200 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
201 clocksource_tsc.shift);
202 if (check_tsc_unstable())
203 clocksource_tsc.rating = 0;
204
205 clocksource_register(&clocksource_tsc);
206 }
207}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 12424629af87..355f5f506c81 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -1 +1,187 @@
1#include "../../x86_64/kernel/tsc_sync.c" 1/*
2 * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization.
3 *
4 * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
5 *
6 * We check whether all boot CPUs have their TSC's synchronized,
7 * print a warning if not and turn off the TSC clock-source.
8 *
9 * The warp-check is point-to-point between two CPUs, the CPU
10 * initiating the bootup is the 'source CPU', the freshly booting
11 * CPU is the 'target CPU'.
12 *
13 * Only two CPUs may participate - they can enter in any order.
14 * ( The serial nature of the boot logic and the CPU hotplug lock
15 * protects against more than 2 CPUs entering this code. )
16 */
17#include <linux/spinlock.h>
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/smp.h>
21#include <linux/nmi.h>
22#include <asm/tsc.h>
23
24/*
25 * Entry/exit counters that make sure that both CPUs
26 * run the measurement code at once:
27 */
28static __cpuinitdata atomic_t start_count;
29static __cpuinitdata atomic_t stop_count;
30
31/*
32 * We use a raw spinlock in this exceptional case, because
33 * we want to have the fastest, inlined, non-debug version
34 * of a critical section, to be able to prove TSC time-warps:
35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
37static __cpuinitdata cycles_t last_tsc;
38static __cpuinitdata cycles_t max_warp;
39static __cpuinitdata int nr_warps;
40
41/*
42 * TSC-warp measurement loop running on both CPUs:
43 */
44static __cpuinit void check_tsc_warp(void)
45{
46 cycles_t start, now, prev, end;
47 int i;
48
49 start = get_cycles_sync();
50 /*
51 * The measurement runs for 20 msecs:
52 */
53 end = start + tsc_khz * 20ULL;
54 now = start;
55
56 for (i = 0; ; i++) {
57 /*
58 * We take the global lock, measure TSC, save the
59 * previous TSC that was measured (possibly on
60 * another CPU) and update the previous TSC timestamp.
61 */
62 __raw_spin_lock(&sync_lock);
63 prev = last_tsc;
64 now = get_cycles_sync();
65 last_tsc = now;
66 __raw_spin_unlock(&sync_lock);
67
68 /*
69 * Be nice every now and then (and also check whether
70 * measurement is done [we also insert a 100 million
71 * loops safety exit, so we dont lock up in case the
72 * TSC readout is totally broken]):
73 */
74 if (unlikely(!(i & 7))) {
75 if (now > end || i > 100000000)
76 break;
77 cpu_relax();
78 touch_nmi_watchdog();
79 }
80 /*
81 * Outside the critical section we can now see whether
82 * we saw a time-warp of the TSC going backwards:
83 */
84 if (unlikely(prev > now)) {
85 __raw_spin_lock(&sync_lock);
86 max_warp = max(max_warp, prev - now);
87 nr_warps++;
88 __raw_spin_unlock(&sync_lock);
89 }
90
91 }
92}
93
94/*
95 * Source CPU calls into this - it waits for the freshly booted
96 * target CPU to arrive and then starts the measurement:
97 */
98void __cpuinit check_tsc_sync_source(int cpu)
99{
100 int cpus = 2;
101
102 /*
103 * No need to check if we already know that the TSC is not
104 * synchronized:
105 */
106 if (unsynchronized_tsc())
107 return;
108
109 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
110 smp_processor_id(), cpu);
111
112 /*
113 * Reset it - in case this is a second bootup:
114 */
115 atomic_set(&stop_count, 0);
116
117 /*
118 * Wait for the target to arrive:
119 */
120 while (atomic_read(&start_count) != cpus-1)
121 cpu_relax();
122 /*
123 * Trigger the target to continue into the measurement too:
124 */
125 atomic_inc(&start_count);
126
127 check_tsc_warp();
128
129 while (atomic_read(&stop_count) != cpus-1)
130 cpu_relax();
131
132 /*
133 * Reset it - just in case we boot another CPU later:
134 */
135 atomic_set(&start_count, 0);
136
137 if (nr_warps) {
138 printk("\n");
139 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
140 " turning off TSC clock.\n", max_warp);
141 mark_tsc_unstable("check_tsc_sync_source failed");
142 nr_warps = 0;
143 max_warp = 0;
144 last_tsc = 0;
145 } else {
146 printk(" passed.\n");
147 }
148
149 /*
150 * Let the target continue with the bootup:
151 */
152 atomic_inc(&stop_count);
153}
154
155/*
156 * Freshly booted CPUs call into this:
157 */
158void __cpuinit check_tsc_sync_target(void)
159{
160 int cpus = 2;
161
162 if (unsynchronized_tsc())
163 return;
164
165 /*
166 * Register this CPU's participation and wait for the
167 * source CPU to start the measurement:
168 */
169 atomic_inc(&start_count);
170 while (atomic_read(&start_count) != cpus)
171 cpu_relax();
172
173 check_tsc_warp();
174
175 /*
176 * Ok, we are done:
177 */
178 atomic_inc(&stop_count);
179
180 /*
181 * Wait for the source CPU to print stuff:
182 */
183 while (atomic_read(&stop_count) != cpus)
184 cpu_relax();
185}
186#undef NR_LOOPS
187
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S
new file mode 100644
index 000000000000..45b6f8a975a1
--- /dev/null
+++ b/arch/x86/kernel/verify_cpu_64.S
@@ -0,0 +1,105 @@
1/*
2 *
3 * verify_cpu.S - Code for cpu long mode and SSE verification. This
4 * code has been borrowed from boot/setup.S and was introduced by
5 * Andi Kleen.
6 *
7 * Copyright (c) 2007 Andi Kleen (ak@suse.de)
8 * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
9 * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
10 *
11 * This source code is licensed under the GNU General Public License,
12 * Version 2. See the file COPYING for more details.
13 *
14 * This is a common code for verification whether CPU supports
15 * long mode and SSE or not. It is not called directly instead this
16 * file is included at various places and compiled in that context.
17 * Following are the current usage.
18 *
19 * This file is included by both 16bit and 32bit code.
20 *
21 * arch/x86_64/boot/setup.S : Boot cpu verification (16bit)
22 * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
23 * arch/x86_64/kernel/trampoline.S: secondary processor verification (16bit)
24 * arch/x86_64/kernel/acpi/wakeup.S: Verification at resume (16bit)
25 *
26 * verify_cpu, returns the status of cpu check in register %eax.
27 * 0: Success 1: Failure
28 *
29 * The caller needs to check for the error code and take the action
30 * appropriately. Either display a message or halt.
31 */
32
33#include <asm/cpufeature.h>
34
/*
 * verify_cpu: returns 0 in %eax when every check below passes and 1
 * otherwise.  The caller's flags are saved on entry and restored on both
 * exit paths.  Besides %eax, the routine overwrites %ebx, %ecx, %edx and
 * %di.
 */
35verify_cpu:
36 pushfl # Save caller passed flags
37 pushl $0 # Kill any dangerous flags
38 popfl
39
/*
 * Try to flip bit 21 (0x200000) of the flags register.  If the value
 * read back equals the original, the bit could not be changed and the
 * CPU is treated as having no cpuid.
 */
40 pushfl # standard way to check for cpuid
41 popl %eax
42 movl %eax,%ebx
43 xorl $0x200000,%eax
44 pushl %eax
45 popfl
46 pushfl
47 popl %eax
48 cmpl %eax,%ebx
49 jz verify_cpu_no_longmode # cpu has no cpuid
50
51 movl $0x0,%eax # See if cpuid 1 is implemented
52 cpuid
53 cmpl $0x1,%eax
54 jb verify_cpu_no_longmode # no cpuid 1
55
/*
 * %di is the "vendor is AMD" flag: cleared here and set to 1 only when
 * the cpuid(0) result in %ebx/%edx/%ecx spells "Auth" "enti" "cAMD".
 * It is consulted in verify_cpu_sse_test below.
 */
56 xor %di,%di
57 cmpl $0x68747541,%ebx # AuthenticAMD
58 jnz verify_cpu_noamd
59 cmpl $0x69746e65,%edx
60 jnz verify_cpu_noamd
61 cmpl $0x444d4163,%ecx
62 jnz verify_cpu_noamd
63 mov $1,%di # cpu is from AMD
64
/* Every bit of REQUIRED_MASK0 must be set in cpuid(1) %edx. */
65verify_cpu_noamd:
66 movl $0x1,%eax # Does the cpu have what it takes
67 cpuid
68 andl $REQUIRED_MASK0,%edx
69 xorl $REQUIRED_MASK0,%edx
70 jnz verify_cpu_no_longmode
71
72 movl $0x80000000,%eax # See if extended cpuid is implemented
73 cpuid
74 cmpl $0x80000001,%eax
75 jb verify_cpu_no_longmode # no extended cpuid
76
/* Every bit of REQUIRED_MASK1 must be set in cpuid(0x80000001) %edx. */
77 movl $0x80000001,%eax # Does the cpu have what it takes
78 cpuid
79 andl $REQUIRED_MASK1,%edx
80 xorl $REQUIRED_MASK1,%edx
81 jnz verify_cpu_no_longmode
82
/*
 * Every bit of SSE_MASK must be set in cpuid(1) %edx.  If not, and the
 * vendor flag in %di is set, clear bit 15 of MSR 0xc0010015 and test once
 * more; %di is cleared first so the retry happens at most once.
 */
83verify_cpu_sse_test:
84 movl $1,%eax
85 cpuid
86 andl $SSE_MASK,%edx
87 cmpl $SSE_MASK,%edx
88 je verify_cpu_sse_ok
89 test %di,%di
90 jz verify_cpu_no_longmode # only try to force SSE on AMD
91 movl $0xc0010015,%ecx # HWCR
92 rdmsr
93 btr $15,%eax # enable SSE
94 wrmsr
95 xor %di,%di # don't loop
96 jmp verify_cpu_sse_test # try again
97
/* Failure exit: %eax = 1. */
98verify_cpu_no_longmode:
99 popfl # Restore caller passed flags
100 movl $1,%eax
101 ret
/* Success exit: %eax = 0. */
102verify_cpu_sse_ok:
103 popfl # Restore caller passed flags
104 xorl %eax, %eax
105 ret
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
new file mode 100644
index 000000000000..ba8ea97abd21
--- /dev/null
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -0,0 +1,235 @@
1/* ld script to make x86-64 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 */
4
5#define LOAD_OFFSET __START_KERNEL_map
6
7#include <asm-generic/vmlinux.lds.h>
8#include <asm/page.h>
9
10#undef i386 /* in case the preprocessor is a 32bit one */
11
12OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
13OUTPUT_ARCH(i386:x86-64)
14ENTRY(phys_startup_64)
15jiffies_64 = jiffies;
16_proxy_pda = 1;
17PHDRS {
18 text PT_LOAD FLAGS(5); /* R_E */
19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */
22 note PT_NOTE FLAGS(4); /* R__ */
23}
24SECTIONS
25{
26 . = __START_KERNEL;
27 phys_startup_64 = startup_64 - LOAD_OFFSET;
28 _text = .; /* Text and read-only data */
29 .text : AT(ADDR(.text) - LOAD_OFFSET) {
30 /* First the code that has to be first for bootstrapping */
31 *(.text.head)
32 _stext = .;
33 /* Then the rest */
34 TEXT_TEXT
35 SCHED_TEXT
36 LOCK_TEXT
37 KPROBES_TEXT
38 *(.fixup)
39 *(.gnu.warning)
40 } :text = 0x9090
41 /* out-of-line lock text */
42 .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
43
44 _etext = .; /* End of text section */
45
46 . = ALIGN(16); /* Exception table */
47 __start___ex_table = .;
48 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
49 __stop___ex_table = .;
50
51 NOTES :text :note
52
53 BUG_TABLE :text
54
55 RODATA
56
57 . = ALIGN(4);
58 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
59 __tracedata_start = .;
60 *(.tracedata)
61 __tracedata_end = .;
62 }
63
64 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
65 /* Data */
66 .data : AT(ADDR(.data) - LOAD_OFFSET) {
67 DATA_DATA
68 CONSTRUCTORS
69 } :data
70
71 _edata = .; /* End of data section */
72
73 . = ALIGN(PAGE_SIZE);
74 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
75 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
76 *(.data.cacheline_aligned)
77 }
78 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
79 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
80 *(.data.read_mostly)
81 }
82
83#define VSYSCALL_ADDR (-10*1024*1024) /* link address of the vsyscall area: -10 MiB */
84#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) /* load address: end of .data.read_mostly rounded up to 4096 */
85#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) /* same boundary, taken from the virtual address of .data.read_mostly */
86
87#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) /* distance between link address and load address of the area */
88#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) /* load address of a section linked inside the area */
89
90#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) /* distance between link address and the VSYSCALL_VIRT_ADDR alias */
91#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) /* address of a section relative to VSYSCALL_VIRT_ADDR */
92
93 . = VSYSCALL_ADDR; /* move the location counter to the vsyscall area */
94 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
95 __vsyscall_0 = VSYSCALL_VIRT_ADDR; /* symbol for the start of the area at its VSYSCALL_VIRT_ADDR alias */
96
97 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
98 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
99 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
100 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
101 { *(.vsyscall_gtod_data) }
102 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); /* second name for the same data, at its VVIRT address */
103 .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
104 { *(.vsyscall_clock) }
105 vsyscall_clock = VVIRT(.vsyscall_clock);
106
107
108 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) /* slot 1 is pinned 1024 bytes into the area */
109 { *(.vsyscall_1) }
110 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) /* slot 2 is pinned 2048 bytes into the area */
111 { *(.vsyscall_2) }
112
113 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
114 vgetcpu_mode = VVIRT(.vgetcpu_mode);
115
116 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
117 .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
118 jiffies = VVIRT(.jiffies);
119
120 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) /* slot 3 is pinned 3072 bytes into the area */
121 { *(.vsyscall_3) }
122
123 . = VSYSCALL_VIRT_ADDR + 4096; /* resume 4096 bytes past the VSYSCALL_VIRT_ADDR boundary */
124
125#undef VSYSCALL_ADDR
126#undef VSYSCALL_PHYS_ADDR
127#undef VSYSCALL_VIRT_ADDR
128#undef VLOAD_OFFSET
129#undef VLOAD
130#undef VVIRT_OFFSET
131#undef VVIRT
132
133 . = ALIGN(8192); /* init_task */
134 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
135 *(.data.init_task)
136 }:data.init
137
138 . = ALIGN(4096);
139 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
140 *(.data.page_aligned)
141 }
142
143 /* might get freed after init */
144 . = ALIGN(4096);
145 __smp_alt_begin = .;
146 __smp_locks = .;
147 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
148 *(.smp_locks)
149 }
150 __smp_locks_end = .;
151 . = ALIGN(4096);
152 __smp_alt_end = .;
153
154 . = ALIGN(4096); /* Init code and data */
155 __init_begin = .;
156 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
157 _sinittext = .;
158 *(.init.text)
159 _einittext = .;
160 }
161 __initdata_begin = .;
162 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
163 __initdata_end = .;
164 . = ALIGN(16);
165 __setup_start = .;
166 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
167 __setup_end = .;
168 __initcall_start = .;
169 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
170 INITCALLS
171 }
172 __initcall_end = .;
173 __con_initcall_start = .;
174 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
175 *(.con_initcall.init)
176 }
177 __con_initcall_end = .;
178 SECURITY_INIT
179 . = ALIGN(8);
180 __alt_instructions = .;
181 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
182 *(.altinstructions)
183 }
184 __alt_instructions_end = .;
185 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
186 *(.altinstr_replacement)
187 }
188 /* .exit.text is discarded at runtime, not link time, to deal with references
189 from .altinstructions and .eh_frame */
190 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
191 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
192
193/* vdso blob that is mapped into user space */
194 vdso_start = . ;
195 .vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) }
196 . = ALIGN(4096);
197 vdso_end = .;
198
199#ifdef CONFIG_BLK_DEV_INITRD
200 . = ALIGN(4096);
201 __initramfs_start = .;
202 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
203 __initramfs_end = .;
204#endif
205
206 PERCPU(4096)
207
208 . = ALIGN(4096);
209 __init_end = .;
210
211 . = ALIGN(4096);
212 __nosave_begin = .;
213 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
214 . = ALIGN(4096);
215 __nosave_end = .;
216
217 __bss_start = .; /* BSS */
218 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
219 *(.bss.page_aligned)
220 *(.bss)
221 }
222 __bss_stop = .;
223
224 _end = . ;
225
226 /* Sections to be discarded */
227 /DISCARD/ : {
228 *(.exitcall.exit)
229 *(.eh_frame)
230 }
231
232 STABS_DEBUG
233
234 DWARF_DEBUG
235}
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
new file mode 100644
index 000000000000..414caf0c5f9a
--- /dev/null
+++ b/arch/x86/kernel/vsmp_64.c
@@ -0,0 +1,49 @@
1/*
2 * vSMPowered(tm) systems specific initialization
3 * Copyright (C) 2005 ScaleMP Inc.
4 *
5 * Use of this code is subject to the terms and conditions of the
6 * GNU general public license version 2. See "COPYING" or
7 * http://www.gnu.org/licenses/gpl.html
8 *
9 * Ravikiran Thirumalai <kiran@scalemp.com>,
10 * Shai Fultheim <shai@scalemp.com>
11 */
12
13#include <linux/init.h>
14#include <linux/pci_ids.h>
15#include <linux/pci_regs.h>
16#include <asm/pci-direct.h>
17#include <asm/io.h>
18
19static int __init vsmp_init(void)
20{
21 void *address;
22 unsigned int cap, ctl;
23
24 if (!early_pci_allowed())
25 return 0;
26
27 /* Check if we are running on a ScaleMP vSMP box */
28 if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) ||
29 (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
30 return 0;
31
32 /* set vSMP magic bits to indicate vSMP capable kernel */
33 address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8);
34 cap = readl(address);
35 ctl = readl(address + 4);
36 printk("vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl);
37 if (cap & ctl & (1 << 4)) {
38 /* Turn on vSMP IRQ fastpath handling (see system.h) */
39 ctl &= ~(1 << 4);
40 writel(ctl, address + 4);
41 ctl = readl(address + 4);
42 printk("vSMP CTL: control set to:0x%08x\n", ctl);
43 }
44
45 iounmap(address);
46 return 0;
47}
48
49core_initcall(vsmp_init);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
new file mode 100644
index 000000000000..06c34949bfdc
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -0,0 +1,349 @@
1/*
2 * linux/arch/x86_64/kernel/vsyscall.c
3 *
4 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Copyright 2003 Andi Kleen, SuSE Labs.
6 *
7 * Thanks to hpa@transmeta.com for some useful hint.
8 * Special thanks to Ingo Molnar for his early experience with
9 * a different vsyscall implementation for Linux/IA32 and for the name.
10 *
11 * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
12 * at virtual address -10Mbyte+1024bytes etc... There are at max 4
13 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
14 * jumping out of line if necessary. We cannot add more with this
15 * mechanism because older kernels won't return -ENOSYS.
16 * If we want more than four we need a vDSO.
17 *
18 * Note: the concept clashes with user mode linux. If you use UML and
19 * want per guest time just set the kernel.vsyscall64 sysctl to 0.
20 */
21
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/kernel.h>
25#include <linux/timer.h>
26#include <linux/seqlock.h>
27#include <linux/jiffies.h>
28#include <linux/sysctl.h>
29#include <linux/clocksource.h>
30#include <linux/getcpu.h>
31#include <linux/cpu.h>
32#include <linux/smp.h>
33#include <linux/notifier.h>
34
35#include <asm/vsyscall.h>
36#include <asm/pgtable.h>
37#include <asm/page.h>
38#include <asm/unistd.h>
39#include <asm/fixmap.h>
40#include <asm/errno.h>
41#include <asm/io.h>
42#include <asm/segment.h>
43#include <asm/desc.h>
44#include <asm/topology.h>
45#include <asm/vgtod.h>
46
47#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) /* place a function in section .vsyscall_<nr> */
48#define __syscall_clobber "r11","rcx","memory" /* clobber list used by the inline syscall wrappers in this file */
49#define __pa_vsymbol(x) \
50 ({unsigned long v; \
51 extern char __vsyscall_0; \
52 asm("" : "=r" (v) : "0" (x)); \
53 ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); }) /* offset of x added to the physical address of __vsyscall_0; the empty asm only moves x into v. NOTE(review): VSYSCALL_FIRST_PAGE is used as a fixmap index in map_vsyscall() - confirm it is the intended base to subtract here */
54
55/*
56 * vsyscall_gtod_data contains data that is :
57 * - readonly from vsyscalls
58 * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
59 * Try to keep this structure as small as possible to avoid cache line ping pongs
60 */
61int __vgetcpu_mode __section_vgetcpu_mode;
62
63struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
64{
65 .lock = SEQLOCK_UNLOCKED,
66 .sysctl_enabled = 1,
67};
68
69void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
70{
71 unsigned long flags;
72
73 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
74 /* copy vsyscall data */
75 vsyscall_gtod_data.clock.vread = clock->vread;
76 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
77 vsyscall_gtod_data.clock.mask = clock->mask;
78 vsyscall_gtod_data.clock.mult = clock->mult;
79 vsyscall_gtod_data.clock.shift = clock->shift;
80 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
81 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
82 vsyscall_gtod_data.sys_tz = sys_tz;
83 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
84 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
85 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
86}
87
88/* RED-PEN may want to readd seq locking, but then the variable should be
89 * write-once.
90 */
91static __always_inline void do_get_tz(struct timezone * tz)
92{
93 *tz = __vsyscall_gtod_data.sys_tz;
94}
95
96static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) /* issue the gettimeofday system call directly via the syscall instruction */
97{
98 int ret;
99 asm volatile("vsysc2: syscall" /* the vsysc2 label names the address of this syscall instruction */
100 : "=a" (ret) /* result is returned in the a register */
101 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) /* syscall number shares the a register; tv and tz go in the D and S registers */
102 : __syscall_clobber );
103 return ret;
104}
105
106static __always_inline long time_syscall(long *t) /* issue the time system call directly via the syscall instruction */
107{
108 long secs;
109 asm volatile("vsysc1: syscall" /* the vsysc1 label names the address of this syscall instruction */
110 : "=a" (secs) /* result is returned in the a register */
111 : "0" (__NR_time),"D" (t) : __syscall_clobber); /* syscall number shares the a register; t goes in the D register */
112 return secs;
113}
114
115static __always_inline void do_vgettimeofday(struct timeval * tv)
116{
117 cycle_t now, base, mask, cycle_delta;
118 unsigned seq;
119 unsigned long mult, shift, nsec;
120 cycle_t (*vread)(void);
121 do {
122 seq = read_seqbegin(&__vsyscall_gtod_data.lock);
123
124 vread = __vsyscall_gtod_data.clock.vread;
125 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
126 gettimeofday(tv,NULL);
127 return;
128 }
129 now = vread();
130 base = __vsyscall_gtod_data.clock.cycle_last;
131 mask = __vsyscall_gtod_data.clock.mask;
132 mult = __vsyscall_gtod_data.clock.mult;
133 shift = __vsyscall_gtod_data.clock.shift;
134
135 tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
136 nsec = __vsyscall_gtod_data.wall_time_nsec;
137 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
138
139 /* calculate interval: */
140 cycle_delta = (now - base) & mask;
141 /* convert to nsecs: */
142 nsec += (cycle_delta * mult) >> shift;
143
144 while (nsec >= NSEC_PER_SEC) {
145 tv->tv_sec += 1;
146 nsec -= NSEC_PER_SEC;
147 }
148 tv->tv_usec = nsec / NSEC_PER_USEC;
149}
150
151int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
152{
153 if (tv)
154 do_vgettimeofday(tv);
155 if (tz)
156 do_get_tz(tz);
157 return 0;
158}
159
160/* This will break when the xtime seconds get inaccurate, but that is
161 * unlikely */
162time_t __vsyscall(1) vtime(time_t *t)
163{
164 struct timeval tv;
165 time_t result;
166 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
167 return time_syscall(t);
168
169 vgettimeofday(&tv, 0);
170 result = tv.tv_sec;
171 if (t)
172 *t = result;
173 return result;
174}
175
176/* Fast way to get current CPU and node.
177 This helps to do per node and per CPU caches in user space.
178 The result is not guaranteed without CPU affinity, but usually
179 works out because the scheduler tries to keep a thread on the same
180 CPU.
181
182 tcache must point to a two element sized long array.
183 All arguments can be NULL. */
184long __vsyscall(2)
185vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
186{
187 unsigned int dummy, p;
188 unsigned long j = 0;
189
190 /* Fast cache - only recompute value once per jiffies and avoid
191 relatively costly rdtscp/cpuid otherwise.
192 This works because the scheduler usually keeps the process
193 on the same CPU and this syscall doesn't guarantee its
194 results anyways.
195 We do this here because otherwise user space would do it on
196 its own in a likely inferior way (no access to jiffies).
197 If you don't like it pass NULL. */
198 if (tcache && tcache->blob[0] == (j = __jiffies)) {
199 p = tcache->blob[1];
200 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
201 /* Load per CPU data from RDTSCP */
202 rdtscp(dummy, dummy, p);
203 } else {
204 /* Load per CPU data from GDT */
205 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
206 }
207 if (tcache) {
208 tcache->blob[0] = j;
209 tcache->blob[1] = p;
210 }
211 if (cpu)
212 *cpu = p & 0xfff;
213 if (node)
214 *node = p >> 12;
215 return 0;
216}
217
218long __vsyscall(3) venosys_1(void) /* function placed in section .vsyscall_3 */
219{
220 return -ENOSYS; /* nothing is implemented in this slot */
221}
222
223#ifdef CONFIG_SYSCTL
224
225#define SYSCALL 0x050f
226#define NOP2 0x9090
227
228/*
229 * NOP out syscall in vsyscall page when not needed.
230 */
231static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
232 void __user *buffer, size_t *lenp, loff_t *ppos)
233{
234 extern u16 vsysc1, vsysc2;
235 u16 __iomem *map1;
236 u16 __iomem *map2;
237 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
238 if (!write)
239 return ret;
240 /* gcc has some trouble with __va(__pa()), so just do it this
241 way. */
242 map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
243 if (!map1)
244 return -ENOMEM;
245 map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
246 if (!map2) {
247 ret = -ENOMEM;
248 goto out;
249 }
250 if (!vsyscall_gtod_data.sysctl_enabled) {
251 writew(SYSCALL, map1);
252 writew(SYSCALL, map2);
253 } else {
254 writew(NOP2, map1);
255 writew(NOP2, map2);
256 }
257 iounmap(map2);
258out:
259 iounmap(map1);
260 return ret;
261}
262
263static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, /* installed as .strategy of the vsyscall64 table entry */
264 void __user *oldval, size_t __user *oldlenp,
265 void __user *newval, size_t newlen)
266{
267 return -ENOSYS; /* every argument is ignored; the call is always refused */
268}
269
270static ctl_table kernel_table2[] = { /* child table holding the single vsyscall64 entry */
271 { .ctl_name = 99, .procname = "vsyscall64",
272 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), /* the int this entry exposes */
273 .mode = 0644,
274 .strategy = vsyscall_sysctl_nostrat,
275 .proc_handler = vsyscall_sysctl_change },
276 {} /* empty terminating entry */
277};
278
279static ctl_table kernel_root_table2[] = { /* root table handed to register_sysctl_table() in vsyscall_init() */
280 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
281 .child = kernel_table2 },
282 {} /* empty terminating entry */
283};
284
285#endif
286
287/* Assume __initcall executes before all user space. Hopefully kmod
288 doesn't violate that. We'll find out if it does. */
289static void __cpuinit vsyscall_set_cpu(int cpu)
290{
291 unsigned long *d;
292 unsigned long node = 0;
293#ifdef CONFIG_NUMA
294 node = cpu_to_node[cpu];
295#endif
296 if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
297 write_rdtscp_aux((node << 12) | cpu);
298
299 /* Store cpu number in limit so that it can be loaded quickly
300 in user space in vgetcpu.
301 12 bits for the CPU and 8 bits for the node. */
302 d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
303 *d = 0x0f40000000000ULL;
304 *d |= cpu;
305 *d |= (node & 0xf) << 12;
306 *d |= (node >> 4) << 48;
307}
308
309static void __cpuinit cpu_vsyscall_init(void *arg)
310{
311 /* preemption should be already off */
312 vsyscall_set_cpu(raw_smp_processor_id());
313}
314
315static int __cpuinit
316cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
317{
318 long cpu = (long)arg;
319 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
320 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
321 return NOTIFY_DONE;
322}
323
324static void __init map_vsyscall(void)
325{
326 extern char __vsyscall_0;
327 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
328
329 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
330 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
331}
332
333static int __init vsyscall_init(void)
334{
335 BUG_ON(((unsigned long) &vgettimeofday !=
336 VSYSCALL_ADDR(__NR_vgettimeofday)));
337 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
338 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
339 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
340 map_vsyscall();
341#ifdef CONFIG_SYSCTL
342 register_sysctl_table(kernel_root_table2);
343#endif
344 on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
345 hotcpu_notifier(cpu_vsyscall_notifier, 0);
346 return 0;
347}
348
349__initcall(vsyscall_init);
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
new file mode 100644
index 000000000000..77c25b307635
--- /dev/null
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -0,0 +1,62 @@
1/* Exports for assembly files.
2 All C exports should go in the respective C files. */
3
4#include <linux/module.h>
5#include <linux/smp.h>
6
7#include <asm/semaphore.h>
8#include <asm/processor.h>
9#include <asm/uaccess.h>
10#include <asm/pgtable.h>
11
12EXPORT_SYMBOL(kernel_thread);
13
14EXPORT_SYMBOL(__down_failed);
15EXPORT_SYMBOL(__down_failed_interruptible);
16EXPORT_SYMBOL(__down_failed_trylock);
17EXPORT_SYMBOL(__up_wakeup);
18
19EXPORT_SYMBOL(__get_user_1);
20EXPORT_SYMBOL(__get_user_2);
21EXPORT_SYMBOL(__get_user_4);
22EXPORT_SYMBOL(__get_user_8);
23EXPORT_SYMBOL(__put_user_1);
24EXPORT_SYMBOL(__put_user_2);
25EXPORT_SYMBOL(__put_user_4);
26EXPORT_SYMBOL(__put_user_8);
27
28EXPORT_SYMBOL(copy_user_generic);
29EXPORT_SYMBOL(__copy_user_nocache);
30EXPORT_SYMBOL(copy_from_user);
31EXPORT_SYMBOL(copy_to_user);
32EXPORT_SYMBOL(__copy_from_user_inatomic);
33
34EXPORT_SYMBOL(copy_page);
35EXPORT_SYMBOL(clear_page);
36
37#ifdef CONFIG_SMP
38extern void __write_lock_failed(rwlock_t *rw);
39extern void __read_lock_failed(rwlock_t *rw);
40EXPORT_SYMBOL(__write_lock_failed);
41EXPORT_SYMBOL(__read_lock_failed);
42#endif
43
44/* Export string functions. We normally rely on gcc builtin for most of these,
45 but gcc sometimes decides not to inline them. */
46#undef memcpy
47#undef memset
48#undef memmove
49
50extern void * memset(void *,int,__kernel_size_t);
51extern void * memcpy(void *,const void *,__kernel_size_t);
52extern void * __memcpy(void *,const void *,__kernel_size_t);
53
54EXPORT_SYMBOL(memset);
55EXPORT_SYMBOL(memcpy);
56EXPORT_SYMBOL(__memcpy);
57
58EXPORT_SYMBOL(empty_zero_page);
59EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index);
61
62EXPORT_SYMBOL(_proxy_pda);