Diffstat (limited to 'arch/x86')
68 files changed, 25438 insertions, 7 deletions
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index cff3d1dc5dd4..49467640751f 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -174,7 +174,7 @@ no_longmode:
174 | hlt | 174 | hlt |
175 | jmp 1b | 175 | jmp 1b |
176 | 176 | ||
177 | #include "../../../x86_64/kernel/verify_cpu_64.S" | 177 | #include "../../kernel/verify_cpu_64.S" |
178 | 178 | ||
179 | /* Be careful here startup_64 needs to be at a predictable | 179 | /* Be careful here startup_64 needs to be at a predictable |
180 | * address so I can export it in an ELF header. Bootloaders | 180 | * address so I can export it in an ELF header. Bootloaders |
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 577d08f4b8bb..45855c97923e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -1,5 +1,5 @@
1 | ifeq ($(CONFIG_X86_32),y) | 1 | ifeq ($(CONFIG_X86_32),y) |
2 | include ${srctree}/arch/x86/kernel/Makefile_32 | 2 | include ${srctree}/arch/x86/kernel/Makefile_32 |
3 | else | 3 | else |
4 | include ${srctree}/arch/x86_64/kernel/Makefile_64 | 4 | include ${srctree}/arch/x86/kernel/Makefile_64 |
5 | endif | 5 | endif |
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
index 5096f486d389..cb25523026a6 100644
--- a/arch/x86/kernel/Makefile_32
+++ b/arch/x86/kernel/Makefile_32
@@ -83,6 +83,4 @@ $(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \
83 | $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE | 83 | $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE |
84 | $(call if_changed,syscall) | 84 | $(call if_changed,syscall) |
85 | 85 | ||
86 | k8-y += ../../x86_64/kernel/k8.o | ||
87 | stacktrace-y += ../../x86_64/kernel/stacktrace.o | ||
88 | 86 | ||
diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64
new file mode 100644
index 000000000000..6e6b5909e465
--- /dev/null
+++ b/arch/x86/kernel/Makefile_64
@@ -0,0 +1,54 @@
1 | # | ||
2 | # Makefile for the linux kernel. | ||
3 | # | ||
4 | |||
5 | extra-y := head_64.o head64.o init_task_64.o vmlinux.lds | ||
6 | EXTRA_AFLAGS := -traditional | ||
7 | obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \ | ||
8 | ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \ | ||
9 | x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \ | ||
10 | setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \ | ||
11 | pci-dma_64.o pci-nommu_64.o alternative.o hpet_64.o tsc_64.o bugs_64.o \ | ||
12 | perfctr-watchdog.o | ||
13 | |||
14 | obj-$(CONFIG_STACKTRACE) += stacktrace.o | ||
15 | obj-$(CONFIG_X86_MCE) += mce_64.o therm_throt.o | ||
16 | obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o | ||
17 | obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o | ||
18 | obj-$(CONFIG_MTRR) += ../../x86/kernel/cpu/mtrr/ | ||
19 | obj-$(CONFIG_ACPI) += ../../x86/kernel/acpi/ | ||
20 | obj-$(CONFIG_X86_MSR) += msr.o | ||
21 | obj-$(CONFIG_MICROCODE) += microcode.o | ||
22 | obj-$(CONFIG_X86_CPUID) += cpuid.o | ||
23 | obj-$(CONFIG_SMP) += smp_64.o smpboot_64.o trampoline_64.o tsc_sync.o | ||
24 | obj-y += apic_64.o nmi_64.o | ||
25 | obj-y += io_apic_64.o mpparse_64.o genapic_64.o genapic_flat_64.o | ||
26 | obj-$(CONFIG_KEXEC) += machine_kexec_64.o relocate_kernel_64.o crash_64.o | ||
27 | obj-$(CONFIG_CRASH_DUMP) += crash_dump_64.o | ||
28 | obj-$(CONFIG_PM) += suspend_64.o | ||
29 | obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o | ||
30 | obj-$(CONFIG_CPU_FREQ) += ../../x86/kernel/cpu/cpufreq/ | ||
31 | obj-$(CONFIG_EARLY_PRINTK) += early_printk.o | ||
32 | obj-$(CONFIG_IOMMU) += pci-gart_64.o aperture_64.o | ||
33 | obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o | ||
34 | obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o | ||
35 | obj-$(CONFIG_KPROBES) += kprobes_64.o | ||
36 | obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o | ||
37 | obj-$(CONFIG_X86_VSMP) += vsmp_64.o | ||
38 | obj-$(CONFIG_K8_NB) += k8.o | ||
39 | obj-$(CONFIG_AUDIT) += audit_64.o | ||
40 | |||
41 | obj-$(CONFIG_MODULES) += module_64.o | ||
42 | obj-$(CONFIG_PCI) += early-quirks_64.o | ||
43 | |||
44 | obj-y += topology.o | ||
45 | obj-y += intel_cacheinfo.o | ||
46 | obj-y += addon_cpuid_features.o | ||
47 | obj-y += pcspeaker.o | ||
48 | |||
49 | CFLAGS_vsyscall_64.o := $(PROFILING) -g0 | ||
50 | |||
51 | therm_throt-y += ../../x86/kernel/cpu/mcheck/therm_throt.o | ||
52 | intel_cacheinfo-y += ../../x86/kernel/cpu/intel_cacheinfo.o | ||
53 | addon_cpuid_features-y += ../../x86/kernel/cpu/addon_cpuid_features.o | ||
54 | perfctr-watchdog-y += ../../x86/kernel/cpu/perfctr-watchdog.o | ||
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 5e3b3f5496c5..8b4357e1efe0 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -269,7 +269,7 @@ no_longmode:
269 | movb $0xbc,%al ; outb %al,$0x80 | 269 | movb $0xbc,%al ; outb %al,$0x80 |
270 | jmp no_longmode | 270 | jmp no_longmode |
271 | 271 | ||
272 | #include "../../../x86_64/kernel/verify_cpu_64.S" | 272 | #include "../verify_cpu_64.S" |
273 | 273 | ||
274 | /* This code uses an extended set of video mode numbers. These include: | 274 | /* This code uses an extended set of video mode numbers. These include: |
275 | * Aliases for standard modes | 275 | * Aliases for standard modes |
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
new file mode 100644
index 000000000000..8f681cae7bf7
--- /dev/null
+++ b/arch/x86/kernel/aperture_64.c
@@ -0,0 +1,298 @@
1 | /* | ||
2 | * Firmware replacement code. | ||
3 | * | ||
4 | * Work around broken BIOSes that don't set an aperture or only set the | ||
5 | * aperture in the AGP bridge. | ||
6 | * If all fails map the aperture over some low memory. This is cheaper than | ||
7 | * doing bounce buffering. The memory is lost. This is done at early boot | ||
8 | * because only the bootmem allocator can allocate 32+MB. | ||
9 | * | ||
10 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
11 | */ | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/types.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/bootmem.h> | ||
16 | #include <linux/mmzone.h> | ||
17 | #include <linux/pci_ids.h> | ||
18 | #include <linux/pci.h> | ||
19 | #include <linux/bitops.h> | ||
20 | #include <linux/ioport.h> | ||
21 | #include <asm/e820.h> | ||
22 | #include <asm/io.h> | ||
23 | #include <asm/iommu.h> | ||
24 | #include <asm/pci-direct.h> | ||
25 | #include <asm/dma.h> | ||
26 | #include <asm/k8.h> | ||
27 | |||
28 | int iommu_aperture; | ||
29 | int iommu_aperture_disabled __initdata = 0; | ||
30 | int iommu_aperture_allowed __initdata = 0; | ||
31 | |||
32 | int fallback_aper_order __initdata = 1; /* 64MB */ | ||
33 | int fallback_aper_force __initdata = 0; | ||
34 | |||
35 | int fix_aperture __initdata = 1; | ||
36 | |||
37 | static struct resource gart_resource = { | ||
38 | .name = "GART", | ||
39 | .flags = IORESOURCE_MEM, | ||
40 | }; | ||
41 | |||
42 | static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) | ||
43 | { | ||
44 | gart_resource.start = aper_base; | ||
45 | gart_resource.end = aper_base + aper_size - 1; | ||
46 | insert_resource(&iomem_resource, &gart_resource); | ||
47 | } | ||
48 | |||
49 | /* This code runs before the PCI subsystem is initialized, so just | ||
50 | access the northbridge directly. */ | ||
51 | |||
52 | static u32 __init allocate_aperture(void) | ||
53 | { | ||
54 | u32 aper_size; | ||
55 | void *p; | ||
56 | |||
57 | if (fallback_aper_order > 7) | ||
58 | fallback_aper_order = 7; | ||
59 | aper_size = (32 * 1024 * 1024) << fallback_aper_order; | ||
60 | |||
61 | /* | ||
62 | * Aperture has to be naturally aligned. This means a 2GB aperture won't | ||
63 | * have much chance of finding a place in the lower 4GB of memory. | ||
64 | * Unfortunately we cannot move it up because that would make the | ||
65 | * IOMMU useless. | ||
66 | */ | ||
67 | p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); | ||
68 | if (!p || __pa(p)+aper_size > 0xffffffff) { | ||
69 | printk("Cannot allocate aperture memory hole (%p,%uK)\n", | ||
70 | p, aper_size>>10); | ||
71 | if (p) | ||
72 | free_bootmem(__pa(p), aper_size); | ||
73 | return 0; | ||
74 | } | ||
75 | printk("Mapping aperture over %d KB of RAM @ %lx\n", | ||
76 | aper_size >> 10, __pa(p)); | ||
77 | insert_aperture_resource((u32)__pa(p), aper_size); | ||
78 | return (u32)__pa(p); | ||
79 | } | ||
80 | |||
81 | static int __init aperture_valid(u64 aper_base, u32 aper_size) | ||
82 | { | ||
83 | if (!aper_base) | ||
84 | return 0; | ||
85 | if (aper_size < 64*1024*1024) { | ||
86 | printk("Aperture too small (%d MB)\n", aper_size>>20); | ||
87 | return 0; | ||
88 | } | ||
89 | if (aper_base + aper_size > 0x100000000UL) { | ||
90 | printk("Aperture beyond 4GB. Ignoring.\n"); | ||
91 | return 0; | ||
92 | } | ||
93 | if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { | ||
94 | printk("Aperture pointing to e820 RAM. Ignoring.\n"); | ||
95 | return 0; | ||
96 | } | ||
97 | return 1; | ||
98 | } | ||
99 | |||
100 | /* Find a PCI capability */ | ||
101 | static __u32 __init find_cap(int num, int slot, int func, int cap) | ||
102 | { | ||
103 | u8 pos; | ||
104 | int bytes; | ||
105 | if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) | ||
106 | return 0; | ||
107 | pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); | ||
108 | for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { | ||
109 | u8 id; | ||
110 | pos &= ~3; | ||
111 | id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); | ||
112 | if (id == 0xff) | ||
113 | break; | ||
114 | if (id == cap) | ||
115 | return pos; | ||
116 | pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); | ||
117 | } | ||
118 | return 0; | ||
119 | } | ||
120 | |||
121 | /* Read a standard AGPv3 bridge header */ | ||
122 | static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) | ||
123 | { | ||
124 | u32 apsize; | ||
125 | u32 apsizereg; | ||
126 | int nbits; | ||
127 | u32 aper_low, aper_hi; | ||
128 | u64 aper; | ||
129 | |||
130 | printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); | ||
131 | apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); | ||
132 | if (apsizereg == 0xffffffff) { | ||
133 | printk("APSIZE in AGP bridge unreadable\n"); | ||
134 | return 0; | ||
135 | } | ||
136 | |||
137 | apsize = apsizereg & 0xfff; | ||
138 | /* Some BIOS use weird encodings not in the AGPv3 table. */ | ||
139 | if (apsize & 0xff) | ||
140 | apsize |= 0xf00; | ||
141 | nbits = hweight16(apsize); | ||
142 | *order = 7 - nbits; | ||
143 | if ((int)*order < 0) /* < 32MB */ | ||
144 | *order = 0; | ||
145 | |||
146 | aper_low = read_pci_config(num,slot,func, 0x10); | ||
147 | aper_hi = read_pci_config(num,slot,func,0x14); | ||
148 | aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); | ||
149 | |||
150 | printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", | ||
151 | aper, 32 << *order, apsizereg); | ||
152 | |||
153 | if (!aperture_valid(aper, (32*1024*1024) << *order)) | ||
154 | return 0; | ||
155 | return (u32)aper; | ||
156 | } | ||
157 | |||
158 | /* Look for an AGP bridge. Windows only expects the aperture in the | ||
159 | AGP bridge and some BIOS forget to initialize the Northbridge too. | ||
160 | Work around this here. | ||
161 | |||
162 | Do a PCI bus scan by hand because we're running before the PCI | ||
163 | subsystem. | ||
164 | |||
165 | All K8 AGP bridges are AGPv3 compliant, so we can do this scan | ||
166 | generically. It's probably overkill to always scan all slots because | ||
167 | the AGP bridges should be always an own bus on the HT hierarchy, | ||
168 | but do it here for future safety. */ | ||
169 | static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) | ||
170 | { | ||
171 | int num, slot, func; | ||
172 | |||
173 | /* Poor man's PCI discovery */ | ||
174 | for (num = 0; num < 256; num++) { | ||
175 | for (slot = 0; slot < 32; slot++) { | ||
176 | for (func = 0; func < 8; func++) { | ||
177 | u32 class, cap; | ||
178 | u8 type; | ||
179 | class = read_pci_config(num,slot,func, | ||
180 | PCI_CLASS_REVISION); | ||
181 | if (class == 0xffffffff) | ||
182 | break; | ||
183 | |||
184 | switch (class >> 16) { | ||
185 | case PCI_CLASS_BRIDGE_HOST: | ||
186 | case PCI_CLASS_BRIDGE_OTHER: /* needed? */ | ||
187 | /* AGP bridge? */ | ||
188 | cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); | ||
189 | if (!cap) | ||
190 | break; | ||
191 | *valid_agp = 1; | ||
192 | return read_agp(num,slot,func,cap,order); | ||
193 | } | ||
194 | |||
195 | /* No multi-function device? */ | ||
196 | type = read_pci_config_byte(num,slot,func, | ||
197 | PCI_HEADER_TYPE); | ||
198 | if (!(type & 0x80)) | ||
199 | break; | ||
200 | } | ||
201 | } | ||
202 | } | ||
203 | printk("No AGP bridge found\n"); | ||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | void __init iommu_hole_init(void) | ||
208 | { | ||
209 | int fix, num; | ||
210 | u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; | ||
211 | u64 aper_base, last_aper_base = 0; | ||
212 | int valid_agp = 0; | ||
213 | |||
214 | if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) | ||
215 | return; | ||
216 | |||
217 | printk(KERN_INFO "Checking aperture...\n"); | ||
218 | |||
219 | fix = 0; | ||
220 | for (num = 24; num < 32; num++) { | ||
221 | if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) | ||
222 | continue; | ||
223 | |||
224 | iommu_detected = 1; | ||
225 | iommu_aperture = 1; | ||
226 | |||
227 | aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; | ||
228 | aper_size = (32 * 1024 * 1024) << aper_order; | ||
229 | aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; | ||
230 | aper_base <<= 25; | ||
231 | |||
232 | printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, | ||
233 | aper_base, aper_size>>20); | ||
234 | |||
235 | if (!aperture_valid(aper_base, aper_size)) { | ||
236 | fix = 1; | ||
237 | break; | ||
238 | } | ||
239 | |||
240 | if ((last_aper_order && aper_order != last_aper_order) || | ||
241 | (last_aper_base && aper_base != last_aper_base)) { | ||
242 | fix = 1; | ||
243 | break; | ||
244 | } | ||
245 | last_aper_order = aper_order; | ||
246 | last_aper_base = aper_base; | ||
247 | } | ||
248 | |||
249 | if (!fix && !fallback_aper_force) { | ||
250 | if (last_aper_base) { | ||
251 | unsigned long n = (32 * 1024 * 1024) << last_aper_order; | ||
252 | insert_aperture_resource((u32)last_aper_base, n); | ||
253 | } | ||
254 | return; | ||
255 | } | ||
256 | |||
257 | if (!fallback_aper_force) | ||
258 | aper_alloc = search_agp_bridge(&aper_order, &valid_agp); | ||
259 | |||
260 | if (aper_alloc) { | ||
261 | /* Got the aperture from the AGP bridge */ | ||
262 | } else if (swiotlb && !valid_agp) { | ||
263 | /* Do nothing */ | ||
264 | } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) || | ||
265 | force_iommu || | ||
266 | valid_agp || | ||
267 | fallback_aper_force) { | ||
268 | printk("Your BIOS doesn't leave a aperture memory hole\n"); | ||
269 | printk("Please enable the IOMMU option in the BIOS setup\n"); | ||
270 | printk("This costs you %d MB of RAM\n", | ||
271 | 32 << fallback_aper_order); | ||
272 | |||
273 | aper_order = fallback_aper_order; | ||
274 | aper_alloc = allocate_aperture(); | ||
275 | if (!aper_alloc) { | ||
276 | /* Could disable AGP and IOMMU here, but it's probably | ||
277 | not worth it. But the later users cannot deal with | ||
278 | bad apertures and turning on the aperture over memory | ||
279 | causes very strange problems, so it's better to | ||
280 | panic early. */ | ||
281 | panic("Not enough memory for aperture"); | ||
282 | } | ||
283 | } else { | ||
284 | return; | ||
285 | } | ||
286 | |||
287 | /* Fix up the north bridges */ | ||
288 | for (num = 24; num < 32; num++) { | ||
289 | if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) | ||
290 | continue; | ||
291 | |||
292 | /* Don't enable translation yet. That is done later. | ||
293 | Assume this BIOS didn't initialise the GART so | ||
294 | just overwrite all previous bits */ | ||
295 | write_pci_config(0, num, 3, 0x90, aper_order<<1); | ||
296 | write_pci_config(0, num, 3, 0x94, aper_alloc>>25); | ||
297 | } | ||
298 | } | ||
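
The fix-up loop in iommu_hole_init() above reads the GART aperture configuration straight from the K8 northbridge (PCI bus 0, devices 24-31, function 3): bits [3:1] of register 0x90 give the aperture order in 32 MB granules, and register 0x94 holds the base in 32 MB units, so the base address is the register value shifted left by 25. The standalone sketch below is not part of the patch and uses made-up register values; it only illustrates that decoding and the matching re-encoding used in the write-back at the end of the function.

/*
 * Standalone illustration of the K8 GART aperture register encoding
 * used by iommu_hole_init() above. Register values are hypothetical.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t reg90 = 0x00000002;	/* example: order field = 1 */
	uint32_t reg94 = 0x00000040;	/* example: base in 32 MB units */

	unsigned int order = (reg90 >> 1) & 7;		/* aperture order */
	uint64_t size = (32ULL << 20) << order;		/* 32 MB << order */
	uint64_t base = (uint64_t)(reg94 & 0x7fff) << 25;

	printf("aperture @ 0x%llx, size %llu MB\n",
	       (unsigned long long)base, (unsigned long long)(size >> 20));

	/* Re-encoding, as done when fixing up the northbridges */
	printf("write back: 0x90 <- 0x%x, 0x94 <- 0x%llx\n",
	       order << 1, (unsigned long long)(base >> 25));
	return 0;
}

With these example values the sketch reports an aperture at 0x80000000 (2 GB) of 64 MB, the same quantities the "CPU %d: aperture @ %Lx size %u MB" message above prints for each northbridge.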
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
new file mode 100644
index 000000000000..925758dbca0c
--- /dev/null
+++ b/arch/x86/kernel/apic_64.c
@@ -0,0 +1,1253 @@
1 | /* | ||
2 | * Local APIC handling, local APIC timers | ||
3 | * | ||
4 | * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> | ||
5 | * | ||
6 | * Fixes | ||
7 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs; | ||
8 | * thanks to Eric Gilmore | ||
9 | * and Rolf G. Tews | ||
10 | * for testing these extensively. | ||
11 | * Maciej W. Rozycki : Various updates and fixes. | ||
12 | * Mikael Pettersson : Power Management for UP-APIC. | ||
13 | * Pavel Machek and | ||
14 | * Mikael Pettersson : PM converted to driver model. | ||
15 | */ | ||
16 | |||
17 | #include <linux/init.h> | ||
18 | |||
19 | #include <linux/mm.h> | ||
20 | #include <linux/delay.h> | ||
21 | #include <linux/bootmem.h> | ||
22 | #include <linux/interrupt.h> | ||
23 | #include <linux/mc146818rtc.h> | ||
24 | #include <linux/kernel_stat.h> | ||
25 | #include <linux/sysdev.h> | ||
26 | #include <linux/module.h> | ||
27 | #include <linux/ioport.h> | ||
28 | |||
29 | #include <asm/atomic.h> | ||
30 | #include <asm/smp.h> | ||
31 | #include <asm/mtrr.h> | ||
32 | #include <asm/mpspec.h> | ||
33 | #include <asm/pgalloc.h> | ||
34 | #include <asm/mach_apic.h> | ||
35 | #include <asm/nmi.h> | ||
36 | #include <asm/idle.h> | ||
37 | #include <asm/proto.h> | ||
38 | #include <asm/timex.h> | ||
39 | #include <asm/hpet.h> | ||
40 | #include <asm/apic.h> | ||
41 | |||
42 | int apic_mapped; | ||
43 | int apic_verbosity; | ||
44 | int apic_runs_main_timer; | ||
45 | int apic_calibrate_pmtmr __initdata; | ||
46 | |||
47 | int disable_apic_timer __initdata; | ||
48 | |||
49 | /* Local APIC timer works in C2? */ | ||
50 | int local_apic_timer_c2_ok; | ||
51 | EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); | ||
52 | |||
53 | static struct resource *ioapic_resources; | ||
54 | static struct resource lapic_resource = { | ||
55 | .name = "Local APIC", | ||
56 | .flags = IORESOURCE_MEM | IORESOURCE_BUSY, | ||
57 | }; | ||
58 | |||
59 | /* | ||
60 | * cpu_mask that denotes the CPUs that needs timer interrupt coming in as | ||
61 | * IPIs in place of local APIC timers | ||
62 | */ | ||
63 | static cpumask_t timer_interrupt_broadcast_ipi_mask; | ||
64 | |||
65 | /* Using APIC to generate smp_local_timer_interrupt? */ | ||
66 | int using_apic_timer __read_mostly = 0; | ||
67 | |||
68 | static void apic_pm_activate(void); | ||
69 | |||
70 | void apic_wait_icr_idle(void) | ||
71 | { | ||
72 | while (apic_read(APIC_ICR) & APIC_ICR_BUSY) | ||
73 | cpu_relax(); | ||
74 | } | ||
75 | |||
76 | unsigned int safe_apic_wait_icr_idle(void) | ||
77 | { | ||
78 | unsigned int send_status; | ||
79 | int timeout; | ||
80 | |||
81 | timeout = 0; | ||
82 | do { | ||
83 | send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; | ||
84 | if (!send_status) | ||
85 | break; | ||
86 | udelay(100); | ||
87 | } while (timeout++ < 1000); | ||
88 | |||
89 | return send_status; | ||
90 | } | ||
91 | |||
92 | void enable_NMI_through_LVT0 (void * dummy) | ||
93 | { | ||
94 | unsigned int v; | ||
95 | |||
96 | /* unmask and set to NMI */ | ||
97 | v = APIC_DM_NMI; | ||
98 | apic_write(APIC_LVT0, v); | ||
99 | } | ||
100 | |||
101 | int get_maxlvt(void) | ||
102 | { | ||
103 | unsigned int v, maxlvt; | ||
104 | |||
105 | v = apic_read(APIC_LVR); | ||
106 | maxlvt = GET_APIC_MAXLVT(v); | ||
107 | return maxlvt; | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * 'what should we do if we get a hw irq event on an illegal vector'. | ||
112 | * each architecture has to answer this themselves. | ||
113 | */ | ||
114 | void ack_bad_irq(unsigned int irq) | ||
115 | { | ||
116 | printk("unexpected IRQ trap at vector %02x\n", irq); | ||
117 | /* | ||
118 | * Currently unexpected vectors happen only on SMP and APIC. | ||
119 | * We _must_ ack these because every local APIC has only N | ||
120 | * irq slots per priority level, and a 'hanging, unacked' IRQ | ||
121 | * holds up an irq slot - in excessive cases (when multiple | ||
122 | * unexpected vectors occur) that might lock up the APIC | ||
123 | * completely. | ||
124 | * But don't ack when the APIC is disabled. -AK | ||
125 | */ | ||
126 | if (!disable_apic) | ||
127 | ack_APIC_irq(); | ||
128 | } | ||
129 | |||
130 | void clear_local_APIC(void) | ||
131 | { | ||
132 | int maxlvt; | ||
133 | unsigned int v; | ||
134 | |||
135 | maxlvt = get_maxlvt(); | ||
136 | |||
137 | /* | ||
138 | * Masking an LVT entry can trigger a local APIC error | ||
139 | * if the vector is zero. Mask LVTERR first to prevent this. | ||
140 | */ | ||
141 | if (maxlvt >= 3) { | ||
142 | v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ | ||
143 | apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); | ||
144 | } | ||
145 | /* | ||
146 | * Careful: we have to set masks only first to deassert | ||
147 | * any level-triggered sources. | ||
148 | */ | ||
149 | v = apic_read(APIC_LVTT); | ||
150 | apic_write(APIC_LVTT, v | APIC_LVT_MASKED); | ||
151 | v = apic_read(APIC_LVT0); | ||
152 | apic_write(APIC_LVT0, v | APIC_LVT_MASKED); | ||
153 | v = apic_read(APIC_LVT1); | ||
154 | apic_write(APIC_LVT1, v | APIC_LVT_MASKED); | ||
155 | if (maxlvt >= 4) { | ||
156 | v = apic_read(APIC_LVTPC); | ||
157 | apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); | ||
158 | } | ||
159 | |||
160 | /* | ||
161 | * Clean APIC state for other OSs: | ||
162 | */ | ||
163 | apic_write(APIC_LVTT, APIC_LVT_MASKED); | ||
164 | apic_write(APIC_LVT0, APIC_LVT_MASKED); | ||
165 | apic_write(APIC_LVT1, APIC_LVT_MASKED); | ||
166 | if (maxlvt >= 3) | ||
167 | apic_write(APIC_LVTERR, APIC_LVT_MASKED); | ||
168 | if (maxlvt >= 4) | ||
169 | apic_write(APIC_LVTPC, APIC_LVT_MASKED); | ||
170 | apic_write(APIC_ESR, 0); | ||
171 | apic_read(APIC_ESR); | ||
172 | } | ||
173 | |||
174 | void disconnect_bsp_APIC(int virt_wire_setup) | ||
175 | { | ||
176 | /* Go back to Virtual Wire compatibility mode */ | ||
177 | unsigned long value; | ||
178 | |||
179 | /* For the spurious interrupt use vector F, and enable it */ | ||
180 | value = apic_read(APIC_SPIV); | ||
181 | value &= ~APIC_VECTOR_MASK; | ||
182 | value |= APIC_SPIV_APIC_ENABLED; | ||
183 | value |= 0xf; | ||
184 | apic_write(APIC_SPIV, value); | ||
185 | |||
186 | if (!virt_wire_setup) { | ||
187 | /* For LVT0 make it edge triggered, active high, external and enabled */ | ||
188 | value = apic_read(APIC_LVT0); | ||
189 | value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | | ||
190 | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | | ||
191 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); | ||
192 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; | ||
193 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); | ||
194 | apic_write(APIC_LVT0, value); | ||
195 | } else { | ||
196 | /* Disable LVT0 */ | ||
197 | apic_write(APIC_LVT0, APIC_LVT_MASKED); | ||
198 | } | ||
199 | |||
200 | /* For LVT1 make it edge triggered, active high, nmi and enabled */ | ||
201 | value = apic_read(APIC_LVT1); | ||
202 | value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | | ||
203 | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | | ||
204 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); | ||
205 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; | ||
206 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); | ||
207 | apic_write(APIC_LVT1, value); | ||
208 | } | ||
209 | |||
210 | void disable_local_APIC(void) | ||
211 | { | ||
212 | unsigned int value; | ||
213 | |||
214 | clear_local_APIC(); | ||
215 | |||
216 | /* | ||
217 | * Disable APIC (implies clearing of registers | ||
218 | * for 82489DX!). | ||
219 | */ | ||
220 | value = apic_read(APIC_SPIV); | ||
221 | value &= ~APIC_SPIV_APIC_ENABLED; | ||
222 | apic_write(APIC_SPIV, value); | ||
223 | } | ||
224 | |||
225 | /* | ||
226 | * This is to verify that we're looking at a real local APIC. | ||
227 | * Check these against your board if the CPUs aren't getting | ||
228 | * started for no apparent reason. | ||
229 | */ | ||
230 | int __init verify_local_APIC(void) | ||
231 | { | ||
232 | unsigned int reg0, reg1; | ||
233 | |||
234 | /* | ||
235 | * The version register is read-only in a real APIC. | ||
236 | */ | ||
237 | reg0 = apic_read(APIC_LVR); | ||
238 | apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); | ||
239 | apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); | ||
240 | reg1 = apic_read(APIC_LVR); | ||
241 | apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); | ||
242 | |||
243 | /* | ||
244 | * The two version reads above should print the same | ||
245 | * numbers. If the second one is different, then we | ||
246 | * poke at a non-APIC. | ||
247 | */ | ||
248 | if (reg1 != reg0) | ||
249 | return 0; | ||
250 | |||
251 | /* | ||
252 | * Check if the version looks reasonable. | ||
253 | */ | ||
254 | reg1 = GET_APIC_VERSION(reg0); | ||
255 | if (reg1 == 0x00 || reg1 == 0xff) | ||
256 | return 0; | ||
257 | reg1 = get_maxlvt(); | ||
258 | if (reg1 < 0x02 || reg1 == 0xff) | ||
259 | return 0; | ||
260 | |||
261 | /* | ||
262 | * The ID register is read/write in a real APIC. | ||
263 | */ | ||
264 | reg0 = apic_read(APIC_ID); | ||
265 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); | ||
266 | apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); | ||
267 | reg1 = apic_read(APIC_ID); | ||
268 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); | ||
269 | apic_write(APIC_ID, reg0); | ||
270 | if (reg1 != (reg0 ^ APIC_ID_MASK)) | ||
271 | return 0; | ||
272 | |||
273 | /* | ||
274 | * The next two are just to see if we have sane values. | ||
275 | * They're only really relevant if we're in Virtual Wire | ||
276 | * compatibility mode, but most boxes aren't anymore. | ||
277 | */ | ||
278 | reg0 = apic_read(APIC_LVT0); | ||
279 | apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0); | ||
280 | reg1 = apic_read(APIC_LVT1); | ||
281 | apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); | ||
282 | |||
283 | return 1; | ||
284 | } | ||
285 | |||
286 | void __init sync_Arb_IDs(void) | ||
287 | { | ||
288 | /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ | ||
289 | unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
290 | if (ver >= 0x14) /* P4 or higher */ | ||
291 | return; | ||
292 | |||
293 | /* | ||
294 | * Wait for idle. | ||
295 | */ | ||
296 | apic_wait_icr_idle(); | ||
297 | |||
298 | apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); | ||
299 | apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | ||
300 | | APIC_DM_INIT); | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * An initial setup of the virtual wire mode. | ||
305 | */ | ||
306 | void __init init_bsp_APIC(void) | ||
307 | { | ||
308 | unsigned int value; | ||
309 | |||
310 | /* | ||
311 | * Don't do the setup now if we have a SMP BIOS as the | ||
312 | * through-I/O-APIC virtual wire mode might be active. | ||
313 | */ | ||
314 | if (smp_found_config || !cpu_has_apic) | ||
315 | return; | ||
316 | |||
317 | value = apic_read(APIC_LVR); | ||
318 | |||
319 | /* | ||
320 | * Do not trust the local APIC being empty at bootup. | ||
321 | */ | ||
322 | clear_local_APIC(); | ||
323 | |||
324 | /* | ||
325 | * Enable APIC. | ||
326 | */ | ||
327 | value = apic_read(APIC_SPIV); | ||
328 | value &= ~APIC_VECTOR_MASK; | ||
329 | value |= APIC_SPIV_APIC_ENABLED; | ||
330 | value |= APIC_SPIV_FOCUS_DISABLED; | ||
331 | value |= SPURIOUS_APIC_VECTOR; | ||
332 | apic_write(APIC_SPIV, value); | ||
333 | |||
334 | /* | ||
335 | * Set up the virtual wire mode. | ||
336 | */ | ||
337 | apic_write(APIC_LVT0, APIC_DM_EXTINT); | ||
338 | value = APIC_DM_NMI; | ||
339 | apic_write(APIC_LVT1, value); | ||
340 | } | ||
341 | |||
342 | void __cpuinit setup_local_APIC (void) | ||
343 | { | ||
344 | unsigned int value, maxlvt; | ||
345 | int i, j; | ||
346 | |||
347 | value = apic_read(APIC_LVR); | ||
348 | |||
349 | BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f); | ||
350 | |||
351 | /* | ||
352 | * Double-check whether this APIC is really registered. | ||
353 | * This is meaningless in clustered apic mode, so we skip it. | ||
354 | */ | ||
355 | if (!apic_id_registered()) | ||
356 | BUG(); | ||
357 | |||
358 | /* | ||
359 | * Intel recommends to set DFR, LDR and TPR before enabling | ||
360 | * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel | ||
361 | * document number 292116). So here it goes... | ||
362 | */ | ||
363 | init_apic_ldr(); | ||
364 | |||
365 | /* | ||
366 | * Set Task Priority to 'accept all'. We never change this | ||
367 | * later on. | ||
368 | */ | ||
369 | value = apic_read(APIC_TASKPRI); | ||
370 | value &= ~APIC_TPRI_MASK; | ||
371 | apic_write(APIC_TASKPRI, value); | ||
372 | |||
373 | /* | ||
374 | * After a crash, we no longer service the interrupts and a pending | ||
375 | * interrupt from previous kernel might still have ISR bit set. | ||
376 | * | ||
377 | * Most probably by now CPU has serviced that pending interrupt and | ||
378 | * it might not have done the ack_APIC_irq() because it thought, | ||
379 | * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it | ||
380 | * does not clear the ISR bit and cpu thinks it has already serviced | ||
381 | * the interrupt. Hence a vector might get locked. It was noticed | ||
382 | * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. | ||
383 | */ | ||
384 | for (i = APIC_ISR_NR - 1; i >= 0; i--) { | ||
385 | value = apic_read(APIC_ISR + i*0x10); | ||
386 | for (j = 31; j >= 0; j--) { | ||
387 | if (value & (1<<j)) | ||
388 | ack_APIC_irq(); | ||
389 | } | ||
390 | } | ||
391 | |||
392 | /* | ||
393 | * Now that we are all set up, enable the APIC | ||
394 | */ | ||
395 | value = apic_read(APIC_SPIV); | ||
396 | value &= ~APIC_VECTOR_MASK; | ||
397 | /* | ||
398 | * Enable APIC | ||
399 | */ | ||
400 | value |= APIC_SPIV_APIC_ENABLED; | ||
401 | |||
402 | /* We always use processor focus */ | ||
403 | |||
404 | /* | ||
405 | * Set spurious IRQ vector | ||
406 | */ | ||
407 | value |= SPURIOUS_APIC_VECTOR; | ||
408 | apic_write(APIC_SPIV, value); | ||
409 | |||
410 | /* | ||
411 | * Set up LVT0, LVT1: | ||
412 | * | ||
413 | * set up through-local-APIC on the BP's LINT0. This is not | ||
414 | * strictly necessary in pure symmetric-IO mode, but sometimes | ||
415 | * we delegate interrupts to the 8259A. | ||
416 | */ | ||
417 | /* | ||
418 | * TODO: set up through-local-APIC from through-I/O-APIC? --macro | ||
419 | */ | ||
420 | value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; | ||
421 | if (!smp_processor_id() && !value) { | ||
422 | value = APIC_DM_EXTINT; | ||
423 | apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id()); | ||
424 | } else { | ||
425 | value = APIC_DM_EXTINT | APIC_LVT_MASKED; | ||
426 | apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id()); | ||
427 | } | ||
428 | apic_write(APIC_LVT0, value); | ||
429 | |||
430 | /* | ||
431 | * only the BP should see the LINT1 NMI signal, obviously. | ||
432 | */ | ||
433 | if (!smp_processor_id()) | ||
434 | value = APIC_DM_NMI; | ||
435 | else | ||
436 | value = APIC_DM_NMI | APIC_LVT_MASKED; | ||
437 | apic_write(APIC_LVT1, value); | ||
438 | |||
439 | { | ||
440 | unsigned oldvalue; | ||
441 | maxlvt = get_maxlvt(); | ||
442 | oldvalue = apic_read(APIC_ESR); | ||
443 | value = ERROR_APIC_VECTOR; // enables sending errors | ||
444 | apic_write(APIC_LVTERR, value); | ||
445 | /* | ||
446 | * spec says clear errors after enabling vector. | ||
447 | */ | ||
448 | if (maxlvt > 3) | ||
449 | apic_write(APIC_ESR, 0); | ||
450 | value = apic_read(APIC_ESR); | ||
451 | if (value != oldvalue) | ||
452 | apic_printk(APIC_VERBOSE, | ||
453 | "ESR value after enabling vector: %08x, after %08x\n", | ||
454 | oldvalue, value); | ||
455 | } | ||
456 | |||
457 | nmi_watchdog_default(); | ||
458 | setup_apic_nmi_watchdog(NULL); | ||
459 | apic_pm_activate(); | ||
460 | } | ||
461 | |||
462 | #ifdef CONFIG_PM | ||
463 | |||
464 | static struct { | ||
465 | /* 'active' is true if the local APIC was enabled by us and | ||
466 | not the BIOS; this signifies that we are also responsible | ||
467 | for disabling it before entering apm/acpi suspend */ | ||
468 | int active; | ||
469 | /* r/w apic fields */ | ||
470 | unsigned int apic_id; | ||
471 | unsigned int apic_taskpri; | ||
472 | unsigned int apic_ldr; | ||
473 | unsigned int apic_dfr; | ||
474 | unsigned int apic_spiv; | ||
475 | unsigned int apic_lvtt; | ||
476 | unsigned int apic_lvtpc; | ||
477 | unsigned int apic_lvt0; | ||
478 | unsigned int apic_lvt1; | ||
479 | unsigned int apic_lvterr; | ||
480 | unsigned int apic_tmict; | ||
481 | unsigned int apic_tdcr; | ||
482 | unsigned int apic_thmr; | ||
483 | } apic_pm_state; | ||
484 | |||
485 | static int lapic_suspend(struct sys_device *dev, pm_message_t state) | ||
486 | { | ||
487 | unsigned long flags; | ||
488 | int maxlvt; | ||
489 | |||
490 | if (!apic_pm_state.active) | ||
491 | return 0; | ||
492 | |||
493 | maxlvt = get_maxlvt(); | ||
494 | |||
495 | apic_pm_state.apic_id = apic_read(APIC_ID); | ||
496 | apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); | ||
497 | apic_pm_state.apic_ldr = apic_read(APIC_LDR); | ||
498 | apic_pm_state.apic_dfr = apic_read(APIC_DFR); | ||
499 | apic_pm_state.apic_spiv = apic_read(APIC_SPIV); | ||
500 | apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); | ||
501 | if (maxlvt >= 4) | ||
502 | apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); | ||
503 | apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); | ||
504 | apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); | ||
505 | apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); | ||
506 | apic_pm_state.apic_tmict = apic_read(APIC_TMICT); | ||
507 | apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); | ||
508 | #ifdef CONFIG_X86_MCE_INTEL | ||
509 | if (maxlvt >= 5) | ||
510 | apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); | ||
511 | #endif | ||
512 | local_irq_save(flags); | ||
513 | disable_local_APIC(); | ||
514 | local_irq_restore(flags); | ||
515 | return 0; | ||
516 | } | ||
517 | |||
518 | static int lapic_resume(struct sys_device *dev) | ||
519 | { | ||
520 | unsigned int l, h; | ||
521 | unsigned long flags; | ||
522 | int maxlvt; | ||
523 | |||
524 | if (!apic_pm_state.active) | ||
525 | return 0; | ||
526 | |||
527 | maxlvt = get_maxlvt(); | ||
528 | |||
529 | local_irq_save(flags); | ||
530 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
531 | l &= ~MSR_IA32_APICBASE_BASE; | ||
532 | l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; | ||
533 | wrmsr(MSR_IA32_APICBASE, l, h); | ||
534 | apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); | ||
535 | apic_write(APIC_ID, apic_pm_state.apic_id); | ||
536 | apic_write(APIC_DFR, apic_pm_state.apic_dfr); | ||
537 | apic_write(APIC_LDR, apic_pm_state.apic_ldr); | ||
538 | apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); | ||
539 | apic_write(APIC_SPIV, apic_pm_state.apic_spiv); | ||
540 | apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); | ||
541 | apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); | ||
542 | #ifdef CONFIG_X86_MCE_INTEL | ||
543 | if (maxlvt >= 5) | ||
544 | apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); | ||
545 | #endif | ||
546 | if (maxlvt >= 4) | ||
547 | apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); | ||
548 | apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); | ||
549 | apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); | ||
550 | apic_write(APIC_TMICT, apic_pm_state.apic_tmict); | ||
551 | apic_write(APIC_ESR, 0); | ||
552 | apic_read(APIC_ESR); | ||
553 | apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); | ||
554 | apic_write(APIC_ESR, 0); | ||
555 | apic_read(APIC_ESR); | ||
556 | local_irq_restore(flags); | ||
557 | return 0; | ||
558 | } | ||
559 | |||
560 | static struct sysdev_class lapic_sysclass = { | ||
561 | set_kset_name("lapic"), | ||
562 | .resume = lapic_resume, | ||
563 | .suspend = lapic_suspend, | ||
564 | }; | ||
565 | |||
566 | static struct sys_device device_lapic = { | ||
567 | .id = 0, | ||
568 | .cls = &lapic_sysclass, | ||
569 | }; | ||
570 | |||
571 | static void __cpuinit apic_pm_activate(void) | ||
572 | { | ||
573 | apic_pm_state.active = 1; | ||
574 | } | ||
575 | |||
576 | static int __init init_lapic_sysfs(void) | ||
577 | { | ||
578 | int error; | ||
579 | if (!cpu_has_apic) | ||
580 | return 0; | ||
581 | /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ | ||
582 | error = sysdev_class_register(&lapic_sysclass); | ||
583 | if (!error) | ||
584 | error = sysdev_register(&device_lapic); | ||
585 | return error; | ||
586 | } | ||
587 | device_initcall(init_lapic_sysfs); | ||
588 | |||
589 | #else /* CONFIG_PM */ | ||
590 | |||
591 | static void apic_pm_activate(void) { } | ||
592 | |||
593 | #endif /* CONFIG_PM */ | ||
594 | |||
595 | static int __init apic_set_verbosity(char *str) | ||
596 | { | ||
597 | if (str == NULL) { | ||
598 | skip_ioapic_setup = 0; | ||
599 | ioapic_force = 1; | ||
600 | return 0; | ||
601 | } | ||
602 | if (strcmp("debug", str) == 0) | ||
603 | apic_verbosity = APIC_DEBUG; | ||
604 | else if (strcmp("verbose", str) == 0) | ||
605 | apic_verbosity = APIC_VERBOSE; | ||
606 | else { | ||
607 | printk(KERN_WARNING "APIC Verbosity level %s not recognised" | ||
608 | " use apic=verbose or apic=debug\n", str); | ||
609 | return -EINVAL; | ||
610 | } | ||
611 | |||
612 | return 0; | ||
613 | } | ||
614 | early_param("apic", apic_set_verbosity); | ||
615 | |||
616 | /* | ||
617 | * Detect and enable local APICs on non-SMP boards. | ||
618 | * Original code written by Keir Fraser. | ||
619 | * On AMD64 we trust the BIOS - if it says no APIC it is likely | ||
620 | * not correctly set up (usually the APIC timer won't work etc.) | ||
621 | */ | ||
622 | |||
623 | static int __init detect_init_APIC (void) | ||
624 | { | ||
625 | if (!cpu_has_apic) { | ||
626 | printk(KERN_INFO "No local APIC present\n"); | ||
627 | return -1; | ||
628 | } | ||
629 | |||
630 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | ||
631 | boot_cpu_id = 0; | ||
632 | return 0; | ||
633 | } | ||
634 | |||
635 | #ifdef CONFIG_X86_IO_APIC | ||
636 | static struct resource * __init ioapic_setup_resources(void) | ||
637 | { | ||
638 | #define IOAPIC_RESOURCE_NAME_SIZE 11 | ||
639 | unsigned long n; | ||
640 | struct resource *res; | ||
641 | char *mem; | ||
642 | int i; | ||
643 | |||
644 | if (nr_ioapics <= 0) | ||
645 | return NULL; | ||
646 | |||
647 | n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); | ||
648 | n *= nr_ioapics; | ||
649 | |||
650 | mem = alloc_bootmem(n); | ||
651 | res = (void *)mem; | ||
652 | |||
653 | if (mem != NULL) { | ||
654 | memset(mem, 0, n); | ||
655 | mem += sizeof(struct resource) * nr_ioapics; | ||
656 | |||
657 | for (i = 0; i < nr_ioapics; i++) { | ||
658 | res[i].name = mem; | ||
659 | res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; | ||
660 | sprintf(mem, "IOAPIC %u", i); | ||
661 | mem += IOAPIC_RESOURCE_NAME_SIZE; | ||
662 | } | ||
663 | } | ||
664 | |||
665 | ioapic_resources = res; | ||
666 | |||
667 | return res; | ||
668 | } | ||
669 | |||
670 | static int __init ioapic_insert_resources(void) | ||
671 | { | ||
672 | int i; | ||
673 | struct resource *r = ioapic_resources; | ||
674 | |||
675 | if (!r) { | ||
676 | printk("IO APIC resources could not be allocated.\n"); | ||
677 | return -1; | ||
678 | } | ||
679 | |||
680 | for (i = 0; i < nr_ioapics; i++) { | ||
681 | insert_resource(&iomem_resource, r); | ||
682 | r++; | ||
683 | } | ||
684 | |||
685 | return 0; | ||
686 | } | ||
687 | |||
688 | /* Insert the IO APIC resources after PCI initialization has occurred to handle | ||
689 | * IO APICS that are mapped in on a BAR in PCI space. */ | ||
690 | late_initcall(ioapic_insert_resources); | ||
691 | #endif | ||
692 | |||
693 | void __init init_apic_mappings(void) | ||
694 | { | ||
695 | unsigned long apic_phys; | ||
696 | |||
697 | /* | ||
698 | * If no local APIC can be found then set up a fake all | ||
699 | * zeroes page to simulate the local APIC and another | ||
700 | * one for the IO-APIC. | ||
701 | */ | ||
702 | if (!smp_found_config && detect_init_APIC()) { | ||
703 | apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); | ||
704 | apic_phys = __pa(apic_phys); | ||
705 | } else | ||
706 | apic_phys = mp_lapic_addr; | ||
707 | |||
708 | set_fixmap_nocache(FIX_APIC_BASE, apic_phys); | ||
709 | apic_mapped = 1; | ||
710 | apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); | ||
711 | |||
712 | /* Put local APIC into the resource map. */ | ||
713 | lapic_resource.start = apic_phys; | ||
714 | lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; | ||
715 | insert_resource(&iomem_resource, &lapic_resource); | ||
716 | |||
717 | /* | ||
718 | * Fetch the APIC ID of the BSP in case we have a | ||
719 | * default configuration (or the MP table is broken). | ||
720 | */ | ||
721 | boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); | ||
722 | |||
723 | { | ||
724 | unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; | ||
725 | int i; | ||
726 | struct resource *ioapic_res; | ||
727 | |||
728 | ioapic_res = ioapic_setup_resources(); | ||
729 | for (i = 0; i < nr_ioapics; i++) { | ||
730 | if (smp_found_config) { | ||
731 | ioapic_phys = mp_ioapics[i].mpc_apicaddr; | ||
732 | } else { | ||
733 | ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); | ||
734 | ioapic_phys = __pa(ioapic_phys); | ||
735 | } | ||
736 | set_fixmap_nocache(idx, ioapic_phys); | ||
737 | apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n", | ||
738 | __fix_to_virt(idx), ioapic_phys); | ||
739 | idx++; | ||
740 | |||
741 | if (ioapic_res != NULL) { | ||
742 | ioapic_res->start = ioapic_phys; | ||
743 | ioapic_res->end = ioapic_phys + (4 * 1024) - 1; | ||
744 | ioapic_res++; | ||
745 | } | ||
746 | } | ||
747 | } | ||
748 | } | ||
749 | |||
750 | /* | ||
751 | * This function sets up the local APIC timer, with a timeout of | ||
752 | * 'clocks' APIC bus clock. During calibration we actually call | ||
753 | * this function twice on the boot CPU, once with a bogus timeout | ||
754 | * value, second time for real. The other (noncalibrating) CPUs | ||
755 | * call this function only once, with the real, calibrated value. | ||
756 | * | ||
757 | * We do reads before writes even if unnecessary, to get around the | ||
758 | * P5 APIC double write bug. | ||
759 | */ | ||
760 | |||
761 | #define APIC_DIVISOR 16 | ||
762 | |||
763 | static void __setup_APIC_LVTT(unsigned int clocks) | ||
764 | { | ||
765 | unsigned int lvtt_value, tmp_value; | ||
766 | int cpu = smp_processor_id(); | ||
767 | |||
768 | lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; | ||
769 | |||
770 | if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) | ||
771 | lvtt_value |= APIC_LVT_MASKED; | ||
772 | |||
773 | apic_write(APIC_LVTT, lvtt_value); | ||
774 | |||
775 | /* | ||
776 | * Divide PICLK by 16 | ||
777 | */ | ||
778 | tmp_value = apic_read(APIC_TDCR); | ||
779 | apic_write(APIC_TDCR, (tmp_value | ||
780 | & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | ||
781 | | APIC_TDR_DIV_16); | ||
782 | |||
783 | apic_write(APIC_TMICT, clocks/APIC_DIVISOR); | ||
784 | } | ||
785 | |||
786 | static void setup_APIC_timer(unsigned int clocks) | ||
787 | { | ||
788 | unsigned long flags; | ||
789 | |||
790 | local_irq_save(flags); | ||
791 | |||
792 | /* wait for irq slice */ | ||
793 | if (hpet_address && hpet_use_timer) { | ||
794 | u32 trigger = hpet_readl(HPET_T0_CMP); | ||
795 | while (hpet_readl(HPET_T0_CMP) == trigger) | ||
796 | /* do nothing */ ; | ||
797 | } else { | ||
798 | int c1, c2; | ||
799 | outb_p(0x00, 0x43); | ||
800 | c2 = inb_p(0x40); | ||
801 | c2 |= inb_p(0x40) << 8; | ||
802 | do { | ||
803 | c1 = c2; | ||
804 | outb_p(0x00, 0x43); | ||
805 | c2 = inb_p(0x40); | ||
806 | c2 |= inb_p(0x40) << 8; | ||
807 | } while (c2 - c1 < 300); | ||
808 | } | ||
809 | __setup_APIC_LVTT(clocks); | ||
810 | /* Turn off PIT interrupt if we use APIC timer as main timer. | ||
811 | Only works with the PM timer right now | ||
812 | TBD fix it for HPET too. */ | ||
813 | if ((pmtmr_ioport != 0) && | ||
814 | smp_processor_id() == boot_cpu_id && | ||
815 | apic_runs_main_timer == 1 && | ||
816 | !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) { | ||
817 | stop_timer_interrupt(); | ||
818 | apic_runs_main_timer++; | ||
819 | } | ||
820 | local_irq_restore(flags); | ||
821 | } | ||
822 | |||
823 | /* | ||
824 | * In this function we calibrate APIC bus clocks to the external | ||
825 | * timer. Unfortunately we cannot use jiffies and the timer irq | ||
826 | * to calibrate, since some later bootup code depends on getting | ||
827 | * the first irq? Ugh. | ||
828 | * | ||
829 | * We want to do the calibration only once since we | ||
830 | * want to have local timer irqs in sync. CPUs connected | ||
831 | * by the same APIC bus have the very same bus frequency. | ||
832 | * And we want to have irqs off anyways, no accidental | ||
833 | * APIC irq that way. | ||
834 | */ | ||
835 | |||
836 | #define TICK_COUNT 100000000 | ||
837 | |||
838 | static int __init calibrate_APIC_clock(void) | ||
839 | { | ||
840 | unsigned apic, apic_start; | ||
841 | unsigned long tsc, tsc_start; | ||
842 | int result; | ||
843 | /* | ||
844 | * Put whatever arbitrary (but long enough) timeout | ||
845 | * value into the APIC clock, we just want to get the | ||
846 | * counter running for calibration. | ||
847 | */ | ||
848 | __setup_APIC_LVTT(4000000000); | ||
849 | |||
850 | apic_start = apic_read(APIC_TMCCT); | ||
851 | #ifdef CONFIG_X86_PM_TIMER | ||
852 | if (apic_calibrate_pmtmr && pmtmr_ioport) { | ||
853 | pmtimer_wait(5000); /* 5ms wait */ | ||
854 | apic = apic_read(APIC_TMCCT); | ||
855 | result = (apic_start - apic) * 1000L / 5; | ||
856 | } else | ||
857 | #endif | ||
858 | { | ||
859 | rdtscll(tsc_start); | ||
860 | |||
861 | do { | ||
862 | apic = apic_read(APIC_TMCCT); | ||
863 | rdtscll(tsc); | ||
864 | } while ((tsc - tsc_start) < TICK_COUNT && | ||
865 | (apic_start - apic) < TICK_COUNT); | ||
866 | |||
867 | result = (apic_start - apic) * 1000L * tsc_khz / | ||
868 | (tsc - tsc_start); | ||
869 | } | ||
870 | printk("result %d\n", result); | ||
871 | |||
872 | |||
873 | printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", | ||
874 | result / 1000 / 1000, result / 1000 % 1000); | ||
875 | |||
876 | return result * APIC_DIVISOR / HZ; | ||
877 | } | ||
878 | |||
879 | static unsigned int calibration_result; | ||
880 | |||
881 | void __init setup_boot_APIC_clock (void) | ||
882 | { | ||
883 | if (disable_apic_timer) { | ||
884 | printk(KERN_INFO "Disabling APIC timer\n"); | ||
885 | return; | ||
886 | } | ||
887 | |||
888 | printk(KERN_INFO "Using local APIC timer interrupts.\n"); | ||
889 | using_apic_timer = 1; | ||
890 | |||
891 | local_irq_disable(); | ||
892 | |||
893 | calibration_result = calibrate_APIC_clock(); | ||
894 | /* | ||
895 | * Now set up the timer for real. | ||
896 | */ | ||
897 | setup_APIC_timer(calibration_result); | ||
898 | |||
899 | local_irq_enable(); | ||
900 | } | ||
901 | |||
902 | void __cpuinit setup_secondary_APIC_clock(void) | ||
903 | { | ||
904 | local_irq_disable(); /* FIXME: Do we need this? --RR */ | ||
905 | setup_APIC_timer(calibration_result); | ||
906 | local_irq_enable(); | ||
907 | } | ||
908 | |||
909 | void disable_APIC_timer(void) | ||
910 | { | ||
911 | if (using_apic_timer) { | ||
912 | unsigned long v; | ||
913 | |||
914 | v = apic_read(APIC_LVTT); | ||
915 | /* | ||
916 | * When an illegal vector value (0-15) is written to an LVT | ||
917 | * entry and delivery mode is Fixed, the APIC may signal an | ||
918 | * illegal vector error, without regard to whether the mask | ||
919 | * bit is set or whether an interrupt is actually seen on input. | ||
920 | * | ||
921 | * Boot sequence might call this function when the LVTT has | ||
922 | * '0' vector value. So make sure vector field is set to | ||
923 | * valid value. | ||
924 | */ | ||
925 | v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); | ||
926 | apic_write(APIC_LVTT, v); | ||
927 | } | ||
928 | } | ||
929 | |||
930 | void enable_APIC_timer(void) | ||
931 | { | ||
932 | int cpu = smp_processor_id(); | ||
933 | |||
934 | if (using_apic_timer && | ||
935 | !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { | ||
936 | unsigned long v; | ||
937 | |||
938 | v = apic_read(APIC_LVTT); | ||
939 | apic_write(APIC_LVTT, v & ~APIC_LVT_MASKED); | ||
940 | } | ||
941 | } | ||
942 | |||
943 | void switch_APIC_timer_to_ipi(void *cpumask) | ||
944 | { | ||
945 | cpumask_t mask = *(cpumask_t *)cpumask; | ||
946 | int cpu = smp_processor_id(); | ||
947 | |||
948 | if (cpu_isset(cpu, mask) && | ||
949 | !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { | ||
950 | disable_APIC_timer(); | ||
951 | cpu_set(cpu, timer_interrupt_broadcast_ipi_mask); | ||
952 | } | ||
953 | } | ||
954 | EXPORT_SYMBOL(switch_APIC_timer_to_ipi); | ||
955 | |||
956 | void smp_send_timer_broadcast_ipi(void) | ||
957 | { | ||
958 | int cpu = smp_processor_id(); | ||
959 | cpumask_t mask; | ||
960 | |||
961 | cpus_and(mask, cpu_online_map, timer_interrupt_broadcast_ipi_mask); | ||
962 | |||
963 | if (cpu_isset(cpu, mask)) { | ||
964 | cpu_clear(cpu, mask); | ||
965 | add_pda(apic_timer_irqs, 1); | ||
966 | smp_local_timer_interrupt(); | ||
967 | } | ||
968 | |||
969 | if (!cpus_empty(mask)) { | ||
970 | send_IPI_mask(mask, LOCAL_TIMER_VECTOR); | ||
971 | } | ||
972 | } | ||
973 | |||
974 | void switch_ipi_to_APIC_timer(void *cpumask) | ||
975 | { | ||
976 | cpumask_t mask = *(cpumask_t *)cpumask; | ||
977 | int cpu = smp_processor_id(); | ||
978 | |||
979 | if (cpu_isset(cpu, mask) && | ||
980 | cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { | ||
981 | cpu_clear(cpu, timer_interrupt_broadcast_ipi_mask); | ||
982 | enable_APIC_timer(); | ||
983 | } | ||
984 | } | ||
985 | EXPORT_SYMBOL(switch_ipi_to_APIC_timer); | ||
986 | |||
987 | int setup_profiling_timer(unsigned int multiplier) | ||
988 | { | ||
989 | return -EINVAL; | ||
990 | } | ||
991 | |||
992 | void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector, | ||
993 | unsigned char msg_type, unsigned char mask) | ||
994 | { | ||
995 | unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE; | ||
996 | unsigned int v = (mask << 16) | (msg_type << 8) | vector; | ||
997 | apic_write(reg, v); | ||
998 | } | ||
999 | |||
1000 | #undef APIC_DIVISOR | ||
1001 | |||
1002 | /* | ||
1003 | * Local timer interrupt handler. It does both profiling and | ||
1004 | * process statistics/rescheduling. | ||
1005 | * | ||
1006 | * We do profiling in every local tick, statistics/rescheduling | ||
1007 | * happen only every 'profiling multiplier' ticks. The default | ||
1008 | * multiplier is 1 and it can be changed by writing the new multiplier | ||
1009 | * value into /proc/profile. | ||
1010 | */ | ||
1011 | |||
1012 | void smp_local_timer_interrupt(void) | ||
1013 | { | ||
1014 | profile_tick(CPU_PROFILING); | ||
1015 | #ifdef CONFIG_SMP | ||
1016 | update_process_times(user_mode(get_irq_regs())); | ||
1017 | #endif | ||
1018 | if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id) | ||
1019 | main_timer_handler(); | ||
1020 | /* | ||
1021 | * We take the 'long' return path, and there every subsystem | ||
1022 | * grabs the appropriate locks (kernel lock/ irq lock). | ||
1023 | * | ||
1024 | * We might want to decouple profiling from the 'long path', | ||
1025 | * and do the profiling totally in assembly. | ||
1026 | * | ||
1027 | * Currently this isn't too much of an issue (performance wise), | ||
1028 | * we can take more than 100K local irqs per second on a 100 MHz P5. | ||
1029 | */ | ||
1030 | } | ||
1031 | |||
1032 | /* | ||
1033 | * Local APIC timer interrupt. This is the most natural way for doing | ||
1034 | * local interrupts, but local timer interrupts can be emulated by | ||
1035 | * broadcast interrupts too. [in case the hw doesn't support APIC timers] | ||
1036 | * | ||
1037 | * [ if a single-CPU system runs an SMP kernel then we call the local | ||
1038 | * interrupt as well. Thus we cannot inline the local irq ... ] | ||
1039 | */ | ||
1040 | void smp_apic_timer_interrupt(struct pt_regs *regs) | ||
1041 | { | ||
1042 | struct pt_regs *old_regs = set_irq_regs(regs); | ||
1043 | |||
1044 | /* | ||
1045 | * the NMI deadlock-detector uses this. | ||
1046 | */ | ||
1047 | add_pda(apic_timer_irqs, 1); | ||
1048 | |||
1049 | /* | ||
1050 | * NOTE! We'd better ACK the irq immediately, | ||
1051 | * because timer handling can be slow. | ||
1052 | */ | ||
1053 | ack_APIC_irq(); | ||
1054 | /* | ||
1055 | * update_process_times() expects us to have done irq_enter(). | ||
1056 | * Besides, if we don't timer interrupts ignore the global | ||
1057 | * interrupt lock, which is the WrongThing (tm) to do. | ||
1058 | */ | ||
1059 | exit_idle(); | ||
1060 | irq_enter(); | ||
1061 | smp_local_timer_interrupt(); | ||
1062 | irq_exit(); | ||
1063 | set_irq_regs(old_regs); | ||
1064 | } | ||
1065 | |||
1066 | /* | ||
1067 | * apic_is_clustered_box() -- Check if we can expect good TSC | ||
1068 | * | ||
1069 | * Thus far, the major user of this is IBM's Summit2 series: | ||
1070 | * | ||
1071 | * Clustered boxes may have unsynced TSC problems if they are | ||
1072 | * multi-chassis. Use available data to take a good guess. | ||
1073 | * If in doubt, go HPET. | ||
1074 | */ | ||
1075 | __cpuinit int apic_is_clustered_box(void) | ||
1076 | { | ||
1077 | int i, clusters, zeros; | ||
1078 | unsigned id; | ||
1079 | DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); | ||
1080 | |||
1081 | bitmap_zero(clustermap, NUM_APIC_CLUSTERS); | ||
1082 | |||
1083 | for (i = 0; i < NR_CPUS; i++) { | ||
1084 | id = bios_cpu_apicid[i]; | ||
1085 | if (id != BAD_APICID) | ||
1086 | __set_bit(APIC_CLUSTERID(id), clustermap); | ||
1087 | } | ||
1088 | |||
1089 | /* Problem: Partially populated chassis may not have CPUs in some of | ||
1090 | * the APIC clusters they have been allocated. Only present CPUs have | ||
1091 | * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since | ||
1092 | * clusters are allocated sequentially, count zeros only if they are | ||
1093 | * bounded by ones. | ||
1094 | */ | ||
1095 | clusters = 0; | ||
1096 | zeros = 0; | ||
1097 | for (i = 0; i < NUM_APIC_CLUSTERS; i++) { | ||
1098 | if (test_bit(i, clustermap)) { | ||
1099 | clusters += 1 + zeros; | ||
1100 | zeros = 0; | ||
1101 | } else | ||
1102 | ++zeros; | ||
1103 | } | ||
1104 | |||
1105 | /* | ||
1106 | * If clusters > 2, then should be multi-chassis. | ||
1107 | * May have to revisit this when multi-core + hyperthreaded CPUs come | ||
1108 | * out, but AFAIK this will work even for them. | ||
1109 | */ | ||
1110 | return (clusters > 2); | ||
1111 | } | ||
1112 | |||
1113 | /* | ||
1114 | * This interrupt should _never_ happen with our APIC/SMP architecture | ||
1115 | */ | ||
1116 | asmlinkage void smp_spurious_interrupt(void) | ||
1117 | { | ||
1118 | unsigned int v; | ||
1119 | exit_idle(); | ||
1120 | irq_enter(); | ||
1121 | /* | ||
1122 | * Check if this really is a spurious interrupt and ACK it | ||
1123 | * if it is a vectored one. Just in case... | ||
1124 | * Spurious interrupts should not be ACKed. | ||
1125 | */ | ||
1126 | v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); | ||
1127 | if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) | ||
1128 | ack_APIC_irq(); | ||
1129 | |||
1130 | irq_exit(); | ||
1131 | } | ||
1132 | |||
1133 | /* | ||
1134 | * This interrupt should never happen with our APIC/SMP architecture | ||
1135 | */ | ||
1136 | |||
1137 | asmlinkage void smp_error_interrupt(void) | ||
1138 | { | ||
1139 | unsigned int v, v1; | ||
1140 | |||
1141 | exit_idle(); | ||
1142 | irq_enter(); | ||
1143 | /* First tickle the hardware, only then report what went on. -- REW */ | ||
1144 | v = apic_read(APIC_ESR); | ||
1145 | apic_write(APIC_ESR, 0); | ||
1146 | v1 = apic_read(APIC_ESR); | ||
1147 | ack_APIC_irq(); | ||
1148 | atomic_inc(&irq_err_count); | ||
1149 | |||
1150 | /* Here is what the APIC error bits mean: | ||
1151 | 0: Send CS error | ||
1152 | 1: Receive CS error | ||
1153 | 2: Send accept error | ||
1154 | 3: Receive accept error | ||
1155 | 4: Reserved | ||
1156 | 5: Send illegal vector | ||
1157 | 6: Received illegal vector | ||
1158 | 7: Illegal register address | ||
1159 | */ | ||
1160 | printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", | ||
1161 | smp_processor_id(), v, v1); | ||
1162 | irq_exit(); | ||
1163 | } | ||
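The two values printed above are the raw ESR contents read before and after the register is cleared. A small illustrative decoder for such a value, following the bit assignments listed in the comment (the helper and its name are not part of this patch):

#include <stdio.h>

/* Decode an APIC ESR value using the bit assignments listed above. */
static void decode_apic_esr(unsigned int v)
{
    static const char *const esr_bits[8] = {
        "Send CS error", "Receive CS error",
        "Send accept error", "Receive accept error",
        "Reserved", "Send illegal vector",
        "Received illegal vector", "Illegal register address",
    };
    int i;

    for (i = 0; i < 8; i++)
        if (v & (1u << i))
            printf("  bit %d: %s\n", i, esr_bits[i]);
}

int main(void)
{
    decode_apic_esr(0x40);  /* prints "bit 6: Received illegal vector" */
    return 0;
}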
1164 | |||
1165 | int disable_apic; | ||
1166 | |||
1167 | /* | ||
1168 | * This initializes the IO-APIC and APIC hardware if this is | ||
1169 | * a UP kernel. | ||
1170 | */ | ||
1171 | int __init APIC_init_uniprocessor (void) | ||
1172 | { | ||
1173 | if (disable_apic) { | ||
1174 | printk(KERN_INFO "Apic disabled\n"); | ||
1175 | return -1; | ||
1176 | } | ||
1177 | if (!cpu_has_apic) { | ||
1178 | disable_apic = 1; | ||
1179 | printk(KERN_INFO "Apic disabled by BIOS\n"); | ||
1180 | return -1; | ||
1181 | } | ||
1182 | |||
1183 | verify_local_APIC(); | ||
1184 | |||
1185 | phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); | ||
1186 | apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id)); | ||
1187 | |||
1188 | setup_local_APIC(); | ||
1189 | |||
1190 | if (smp_found_config && !skip_ioapic_setup && nr_ioapics) | ||
1191 | setup_IO_APIC(); | ||
1192 | else | ||
1193 | nr_ioapics = 0; | ||
1194 | setup_boot_APIC_clock(); | ||
1195 | check_nmi_watchdog(); | ||
1196 | return 0; | ||
1197 | } | ||
1198 | |||
1199 | static __init int setup_disableapic(char *str) | ||
1200 | { | ||
1201 | disable_apic = 1; | ||
1202 | clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); | ||
1203 | return 0; | ||
1204 | } | ||
1205 | early_param("disableapic", setup_disableapic); | ||
1206 | |||
1207 | /* same as disableapic, for compatibility */ | ||
1208 | static __init int setup_nolapic(char *str) | ||
1209 | { | ||
1210 | return setup_disableapic(str); | ||
1211 | } | ||
1212 | early_param("nolapic", setup_nolapic); | ||
1213 | |||
1214 | static int __init parse_lapic_timer_c2_ok(char *arg) | ||
1215 | { | ||
1216 | local_apic_timer_c2_ok = 1; | ||
1217 | return 0; | ||
1218 | } | ||
1219 | early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); | ||
1220 | |||
1221 | static __init int setup_noapictimer(char *str) | ||
1222 | { | ||
1223 | if (str[0] != ' ' && str[0] != 0) | ||
1224 | return 0; | ||
1225 | disable_apic_timer = 1; | ||
1226 | return 1; | ||
1227 | } | ||
1228 | |||
1229 | static __init int setup_apicmaintimer(char *str) | ||
1230 | { | ||
1231 | apic_runs_main_timer = 1; | ||
1232 | nohpet = 1; | ||
1233 | return 1; | ||
1234 | } | ||
1235 | __setup("apicmaintimer", setup_apicmaintimer); | ||
1236 | |||
1237 | static __init int setup_noapicmaintimer(char *str) | ||
1238 | { | ||
1239 | apic_runs_main_timer = -1; | ||
1240 | return 1; | ||
1241 | } | ||
1242 | __setup("noapicmaintimer", setup_noapicmaintimer); | ||
1243 | |||
1244 | static __init int setup_apicpmtimer(char *s) | ||
1245 | { | ||
1246 | apic_calibrate_pmtmr = 1; | ||
1247 | notsc_setup(NULL); | ||
1248 | return setup_apicmaintimer(NULL); | ||
1249 | } | ||
1250 | __setup("apicpmtimer", setup_apicpmtimer); | ||
1251 | |||
1252 | __setup("noapictimer", setup_noapictimer); | ||
1253 | |||
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c new file mode 100644 index 000000000000..778953bc636c --- /dev/null +++ b/arch/x86/kernel/asm-offsets_64.c | |||
@@ -0,0 +1,85 @@ | |||
1 | /* | ||
2 | * Generate definitions needed by assembly language modules. | ||
3 | * This code generates raw asm output which is post-processed to extract | ||
4 | * and format the required data. | ||
5 | */ | ||
6 | |||
7 | #include <linux/crypto.h> | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/stddef.h> | ||
10 | #include <linux/errno.h> | ||
11 | #include <linux/hardirq.h> | ||
12 | #include <linux/suspend.h> | ||
13 | #include <asm/pda.h> | ||
14 | #include <asm/processor.h> | ||
15 | #include <asm/segment.h> | ||
16 | #include <asm/thread_info.h> | ||
17 | #include <asm/ia32.h> | ||
18 | |||
19 | #define DEFINE(sym, val) \ | ||
20 | asm volatile("\n->" #sym " %0 " #val : : "i" (val)) | ||
21 | |||
22 | #define BLANK() asm volatile("\n->" : : ) | ||
23 | |||
24 | #define __NO_STUBS 1 | ||
25 | #undef __SYSCALL | ||
26 | #undef _ASM_X86_64_UNISTD_H_ | ||
27 | #define __SYSCALL(nr, sym) [nr] = 1, | ||
28 | static char syscalls[] = { | ||
29 | #include <asm/unistd.h> | ||
30 | }; | ||
31 | |||
32 | int main(void) | ||
33 | { | ||
34 | #define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) | ||
35 | ENTRY(state); | ||
36 | ENTRY(flags); | ||
37 | ENTRY(thread); | ||
38 | ENTRY(pid); | ||
39 | BLANK(); | ||
40 | #undef ENTRY | ||
41 | #define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry)) | ||
42 | ENTRY(flags); | ||
43 | ENTRY(addr_limit); | ||
44 | ENTRY(preempt_count); | ||
45 | ENTRY(status); | ||
46 | BLANK(); | ||
47 | #undef ENTRY | ||
48 | #define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) | ||
49 | ENTRY(kernelstack); | ||
50 | ENTRY(oldrsp); | ||
51 | ENTRY(pcurrent); | ||
52 | ENTRY(irqcount); | ||
53 | ENTRY(cpunumber); | ||
54 | ENTRY(irqstackptr); | ||
55 | ENTRY(data_offset); | ||
56 | BLANK(); | ||
57 | #undef ENTRY | ||
58 | #ifdef CONFIG_IA32_EMULATION | ||
59 | #define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) | ||
60 | ENTRY(eax); | ||
61 | ENTRY(ebx); | ||
62 | ENTRY(ecx); | ||
63 | ENTRY(edx); | ||
64 | ENTRY(esi); | ||
65 | ENTRY(edi); | ||
66 | ENTRY(ebp); | ||
67 | ENTRY(esp); | ||
68 | ENTRY(eip); | ||
69 | BLANK(); | ||
70 | #undef ENTRY | ||
71 | DEFINE(IA32_RT_SIGFRAME_sigcontext, | ||
72 | offsetof (struct rt_sigframe32, uc.uc_mcontext)); | ||
73 | BLANK(); | ||
74 | #endif | ||
75 | DEFINE(pbe_address, offsetof(struct pbe, address)); | ||
76 | DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); | ||
77 | DEFINE(pbe_next, offsetof(struct pbe, next)); | ||
78 | BLANK(); | ||
79 | DEFINE(TSS_ist, offsetof(struct tss_struct, ist)); | ||
80 | BLANK(); | ||
81 | DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); | ||
82 | BLANK(); | ||
83 | DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); | ||
84 | return 0; | ||
85 | } | ||
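Note that asm-offsets_64.c is never linked into the kernel: it is compiled only to assembly during the build, and the "->" marker lines that DEFINE() plants in that output are post-processed into a generated header of #define constants for use by assembly code. A stand-alone sketch of the same trick with a made-up structure (demo_task and demo_pid are placeholders, not kernel names); compiling it with "gcc -S" leaves the marker line in the .s file, which is also why such a file must never be assembled directly:

#include <stddef.h>

#define DEFINE(sym, val) \
    asm volatile("\n->" #sym " %0 " #val : : "i" (val))

/* placeholder structure, not the kernel's task_struct */
struct demo_task {
    long state;
    unsigned long flags;
    int pid;
};

int main(void)
{
    /* leaves "->demo_pid $16 offsetof(struct demo_task, pid)" in the
     * assembly output (offset 16 on a 64-bit build) */
    DEFINE(demo_pid, offsetof(struct demo_task, pid));
    return 0;
}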
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c new file mode 100644 index 000000000000..06d3e5a14d9d --- /dev/null +++ b/arch/x86/kernel/audit_64.c | |||
@@ -0,0 +1,81 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/types.h> | ||
3 | #include <linux/audit.h> | ||
4 | #include <asm/unistd.h> | ||
5 | |||
6 | static unsigned dir_class[] = { | ||
7 | #include <asm-generic/audit_dir_write.h> | ||
8 | ~0U | ||
9 | }; | ||
10 | |||
11 | static unsigned read_class[] = { | ||
12 | #include <asm-generic/audit_read.h> | ||
13 | ~0U | ||
14 | }; | ||
15 | |||
16 | static unsigned write_class[] = { | ||
17 | #include <asm-generic/audit_write.h> | ||
18 | ~0U | ||
19 | }; | ||
20 | |||
21 | static unsigned chattr_class[] = { | ||
22 | #include <asm-generic/audit_change_attr.h> | ||
23 | ~0U | ||
24 | }; | ||
25 | |||
26 | static unsigned signal_class[] = { | ||
27 | #include <asm-generic/audit_signal.h> | ||
28 | ~0U | ||
29 | }; | ||
30 | |||
31 | int audit_classify_arch(int arch) | ||
32 | { | ||
33 | #ifdef CONFIG_IA32_EMULATION | ||
34 | if (arch == AUDIT_ARCH_I386) | ||
35 | return 1; | ||
36 | #endif | ||
37 | return 0; | ||
38 | } | ||
39 | |||
40 | int audit_classify_syscall(int abi, unsigned syscall) | ||
41 | { | ||
42 | #ifdef CONFIG_IA32_EMULATION | ||
43 | extern int ia32_classify_syscall(unsigned); | ||
44 | if (abi == AUDIT_ARCH_I386) | ||
45 | return ia32_classify_syscall(syscall); | ||
46 | #endif | ||
47 | switch(syscall) { | ||
48 | case __NR_open: | ||
49 | return 2; | ||
50 | case __NR_openat: | ||
51 | return 3; | ||
52 | case __NR_execve: | ||
53 | return 5; | ||
54 | default: | ||
55 | return 0; | ||
56 | } | ||
57 | } | ||
58 | |||
59 | static int __init audit_classes_init(void) | ||
60 | { | ||
61 | #ifdef CONFIG_IA32_EMULATION | ||
62 | extern __u32 ia32_dir_class[]; | ||
63 | extern __u32 ia32_write_class[]; | ||
64 | extern __u32 ia32_read_class[]; | ||
65 | extern __u32 ia32_chattr_class[]; | ||
66 | extern __u32 ia32_signal_class[]; | ||
67 | audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class); | ||
68 | audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class); | ||
69 | audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class); | ||
70 | audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class); | ||
71 | audit_register_class(AUDIT_CLASS_SIGNAL_32, ia32_signal_class); | ||
72 | #endif | ||
73 | audit_register_class(AUDIT_CLASS_WRITE, write_class); | ||
74 | audit_register_class(AUDIT_CLASS_READ, read_class); | ||
75 | audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class); | ||
76 | audit_register_class(AUDIT_CLASS_CHATTR, chattr_class); | ||
77 | audit_register_class(AUDIT_CLASS_SIGNAL, signal_class); | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | __initcall(audit_classes_init); | ||
diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c new file mode 100644 index 000000000000..4e5e9d364d63 --- /dev/null +++ b/arch/x86/kernel/bugs_64.c | |||
@@ -0,0 +1,24 @@ | |||
1 | /* | ||
2 | * arch/x86_64/kernel/bugs.c | ||
3 | * | ||
4 | * Copyright (C) 1994 Linus Torvalds | ||
5 | * Copyright (C) 2000 SuSE | ||
6 | */ | ||
7 | |||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/init.h> | ||
10 | #include <asm/alternative.h> | ||
11 | #include <asm/bugs.h> | ||
12 | #include <asm/processor.h> | ||
13 | #include <asm/mtrr.h> | ||
14 | |||
15 | void __init check_bugs(void) | ||
16 | { | ||
17 | identify_cpu(&boot_cpu_data); | ||
18 | mtrr_bp_init(); | ||
19 | #if !defined(CONFIG_SMP) | ||
20 | printk("CPU: "); | ||
21 | print_cpu_info(&boot_cpu_data); | ||
22 | #endif | ||
23 | alternative_instructions(); | ||
24 | } | ||
diff --git a/arch/x86/kernel/crash_64.c b/arch/x86/kernel/crash_64.c new file mode 100644 index 000000000000..13432a1ae904 --- /dev/null +++ b/arch/x86/kernel/crash_64.c | |||
@@ -0,0 +1,135 @@ | |||
1 | /* | ||
2 | * Architecture specific (x86_64) functions for kexec based crash dumps. | ||
3 | * | ||
4 | * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) | ||
5 | * | ||
6 | * Copyright (C) IBM Corporation, 2004. All rights reserved. | ||
7 | * | ||
8 | */ | ||
9 | |||
10 | #include <linux/init.h> | ||
11 | #include <linux/types.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/smp.h> | ||
14 | #include <linux/irq.h> | ||
15 | #include <linux/reboot.h> | ||
16 | #include <linux/kexec.h> | ||
17 | #include <linux/delay.h> | ||
18 | #include <linux/elf.h> | ||
19 | #include <linux/elfcore.h> | ||
20 | #include <linux/kdebug.h> | ||
21 | |||
22 | #include <asm/processor.h> | ||
23 | #include <asm/hardirq.h> | ||
24 | #include <asm/nmi.h> | ||
25 | #include <asm/hw_irq.h> | ||
26 | #include <asm/mach_apic.h> | ||
27 | |||
28 | /* This keeps track of which cpu is crashing. */ | ||
29 | static int crashing_cpu; | ||
30 | |||
31 | #ifdef CONFIG_SMP | ||
32 | static atomic_t waiting_for_crash_ipi; | ||
33 | |||
34 | static int crash_nmi_callback(struct notifier_block *self, | ||
35 | unsigned long val, void *data) | ||
36 | { | ||
37 | struct pt_regs *regs; | ||
38 | int cpu; | ||
39 | |||
40 | if (val != DIE_NMI_IPI) | ||
41 | return NOTIFY_OK; | ||
42 | |||
43 | regs = ((struct die_args *)data)->regs; | ||
44 | cpu = raw_smp_processor_id(); | ||
45 | |||
46 | /* | ||
47 | * Don't do anything if this handler is invoked on the crashing cpu. | ||
48 | * Otherwise, the system will completely hang. The crashing cpu can get | ||
49 | * an NMI if the system was initially booted with the nmi_watchdog parameter. | ||
50 | */ | ||
51 | if (cpu == crashing_cpu) | ||
52 | return NOTIFY_STOP; | ||
53 | local_irq_disable(); | ||
54 | |||
55 | crash_save_cpu(regs, cpu); | ||
56 | disable_local_APIC(); | ||
57 | atomic_dec(&waiting_for_crash_ipi); | ||
58 | /* Assume hlt works */ | ||
59 | for(;;) | ||
60 | halt(); | ||
61 | |||
62 | return 1; | ||
63 | } | ||
64 | |||
65 | static void smp_send_nmi_allbutself(void) | ||
66 | { | ||
67 | send_IPI_allbutself(NMI_VECTOR); | ||
68 | } | ||
69 | |||
70 | /* | ||
71 | * This code is a best effort heuristic to get the | ||
72 | * other cpus to stop executing. So races with | ||
73 | * cpu hotplug shouldn't matter. | ||
74 | */ | ||
75 | |||
76 | static struct notifier_block crash_nmi_nb = { | ||
77 | .notifier_call = crash_nmi_callback, | ||
78 | }; | ||
79 | |||
80 | static void nmi_shootdown_cpus(void) | ||
81 | { | ||
82 | unsigned long msecs; | ||
83 | |||
84 | atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); | ||
85 | if (register_die_notifier(&crash_nmi_nb)) | ||
86 | return; /* return what? */ | ||
87 | |||
88 | /* | ||
89 | * Ensure the new callback function is set before sending | ||
90 | * out the NMI | ||
91 | */ | ||
92 | wmb(); | ||
93 | |||
94 | smp_send_nmi_allbutself(); | ||
95 | |||
96 | msecs = 1000; /* Wait at most a second for the other cpus to stop */ | ||
97 | while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { | ||
98 | mdelay(1); | ||
99 | msecs--; | ||
100 | } | ||
101 | /* Leave the nmi callback set */ | ||
102 | disable_local_APIC(); | ||
103 | } | ||
104 | #else | ||
105 | static void nmi_shootdown_cpus(void) | ||
106 | { | ||
107 | /* There are no cpus to shootdown */ | ||
108 | } | ||
109 | #endif | ||
110 | |||
111 | void machine_crash_shutdown(struct pt_regs *regs) | ||
112 | { | ||
113 | /* | ||
114 | * This function is only called after the system | ||
115 | * has panicked or is otherwise in a critical state. | ||
116 | * The minimum amount of code to allow a kexec'd kernel | ||
117 | * to run successfully needs to happen here. | ||
118 | * | ||
119 | * In practice this means shooting down the other cpus in | ||
120 | * an SMP system. | ||
121 | */ | ||
122 | /* The kernel is broken so disable interrupts */ | ||
123 | local_irq_disable(); | ||
124 | |||
125 | /* Make a note of crashing cpu. Will be used in NMI callback.*/ | ||
126 | crashing_cpu = smp_processor_id(); | ||
127 | nmi_shootdown_cpus(); | ||
128 | |||
129 | if(cpu_has_apic) | ||
130 | disable_local_APIC(); | ||
131 | |||
132 | disable_IO_APIC(); | ||
133 | |||
134 | crash_save_cpu(regs, smp_processor_id()); | ||
135 | } | ||
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c new file mode 100644 index 000000000000..942deac4d43a --- /dev/null +++ b/arch/x86/kernel/crash_dump_64.c | |||
@@ -0,0 +1,47 @@ | |||
1 | /* | ||
2 | * kernel/crash_dump.c - Memory preserving reboot related code. | ||
3 | * | ||
4 | * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) | ||
5 | * Copyright (C) IBM Corporation, 2004. All rights reserved | ||
6 | */ | ||
7 | |||
8 | #include <linux/errno.h> | ||
9 | #include <linux/crash_dump.h> | ||
10 | |||
11 | #include <asm/uaccess.h> | ||
12 | #include <asm/io.h> | ||
13 | |||
14 | /** | ||
15 | * copy_oldmem_page - copy one page from "oldmem" | ||
16 | * @pfn: page frame number to be copied | ||
17 | * @buf: target memory address for the copy; this can be in kernel address | ||
18 | * space or user address space (see @userbuf) | ||
19 | * @csize: number of bytes to copy | ||
20 | * @offset: offset in bytes into the page (based on pfn) to begin the copy | ||
21 | * @userbuf: if set, @buf is in user address space, use copy_to_user(), | ||
22 | * otherwise @buf is in kernel address space, use memcpy(). | ||
23 | * | ||
24 | * Copy a page from "oldmem". For this page, there is no pte mapped | ||
25 | * in the current kernel. We stitch up a pte, similar to kmap_atomic. | ||
26 | */ | ||
27 | ssize_t copy_oldmem_page(unsigned long pfn, char *buf, | ||
28 | size_t csize, unsigned long offset, int userbuf) | ||
29 | { | ||
30 | void *vaddr; | ||
31 | |||
32 | if (!csize) | ||
33 | return 0; | ||
34 | |||
35 | vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); | ||
36 | |||
37 | if (userbuf) { | ||
38 | if (copy_to_user(buf, (vaddr + offset), csize)) { | ||
39 | iounmap(vaddr); | ||
40 | return -EFAULT; | ||
41 | } | ||
42 | } else | ||
43 | memcpy(buf, (vaddr + offset), csize); | ||
44 | |||
45 | iounmap(vaddr); | ||
46 | return csize; | ||
47 | } | ||
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c new file mode 100644 index 000000000000..0f4d5e209e9b --- /dev/null +++ b/arch/x86/kernel/e820_64.c | |||
@@ -0,0 +1,725 @@ | |||
1 | /* | ||
2 | * Handle the memory map. | ||
3 | * The functions here do the job until bootmem takes over. | ||
4 | * | ||
5 | * Getting sanitize_e820_map() in sync with i386 version by applying change: | ||
6 | * - Provisions for empty E820 memory regions (reported by certain BIOSes). | ||
7 | * Alex Achenbach <xela@slit.de>, December 2002. | ||
8 | * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> | ||
9 | * | ||
10 | */ | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/types.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/bootmem.h> | ||
15 | #include <linux/ioport.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/kexec.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/mm.h> | ||
20 | #include <linux/suspend.h> | ||
21 | #include <linux/pfn.h> | ||
22 | |||
23 | #include <asm/pgtable.h> | ||
24 | #include <asm/page.h> | ||
25 | #include <asm/e820.h> | ||
26 | #include <asm/proto.h> | ||
27 | #include <asm/bootsetup.h> | ||
28 | #include <asm/sections.h> | ||
29 | |||
30 | struct e820map e820; | ||
31 | |||
32 | /* | ||
33 | * PFN of last memory page. | ||
34 | */ | ||
35 | unsigned long end_pfn; | ||
36 | EXPORT_SYMBOL(end_pfn); | ||
37 | |||
38 | /* | ||
39 | * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. | ||
40 | * The direct mapping extends to end_pfn_map, so that we can directly access | ||
41 | * apertures, ACPI and other tables without having to play with fixmaps. | ||
42 | */ | ||
43 | unsigned long end_pfn_map; | ||
44 | |||
45 | /* | ||
46 | * Last pfn which the user wants to use. | ||
47 | */ | ||
48 | static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; | ||
49 | |||
50 | extern struct resource code_resource, data_resource; | ||
51 | |||
52 | /* Check for some hardcoded bad areas that early boot is not allowed to touch */ | ||
53 | static inline int bad_addr(unsigned long *addrp, unsigned long size) | ||
54 | { | ||
55 | unsigned long addr = *addrp, last = addr + size; | ||
56 | |||
57 | /* various gunk below that is needed for SMP startup */ | ||
58 | if (addr < 0x8000) { | ||
59 | *addrp = PAGE_ALIGN(0x8000); | ||
60 | return 1; | ||
61 | } | ||
62 | |||
63 | /* direct mapping tables of the kernel */ | ||
64 | if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { | ||
65 | *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT); | ||
66 | return 1; | ||
67 | } | ||
68 | |||
69 | /* initrd */ | ||
70 | #ifdef CONFIG_BLK_DEV_INITRD | ||
71 | if (LOADER_TYPE && INITRD_START && last >= INITRD_START && | ||
72 | addr < INITRD_START+INITRD_SIZE) { | ||
73 | *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE); | ||
74 | return 1; | ||
75 | } | ||
76 | #endif | ||
77 | /* kernel code */ | ||
78 | if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) { | ||
79 | *addrp = PAGE_ALIGN(__pa_symbol(&_end)); | ||
80 | return 1; | ||
81 | } | ||
82 | |||
83 | if (last >= ebda_addr && addr < ebda_addr + ebda_size) { | ||
84 | *addrp = PAGE_ALIGN(ebda_addr + ebda_size); | ||
85 | return 1; | ||
86 | } | ||
87 | |||
88 | #ifdef CONFIG_NUMA | ||
89 | /* NUMA memory to node map */ | ||
90 | if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { | ||
91 | *addrp = nodemap_addr + nodemap_size; | ||
92 | return 1; | ||
93 | } | ||
94 | #endif | ||
95 | /* XXX ramdisk image here? */ | ||
96 | return 0; | ||
97 | } | ||
98 | |||
99 | /* | ||
100 | * This function checks if any part of the range <start,end> is mapped | ||
101 | * with type. | ||
102 | */ | ||
103 | int | ||
104 | e820_any_mapped(unsigned long start, unsigned long end, unsigned type) | ||
105 | { | ||
106 | int i; | ||
107 | for (i = 0; i < e820.nr_map; i++) { | ||
108 | struct e820entry *ei = &e820.map[i]; | ||
109 | if (type && ei->type != type) | ||
110 | continue; | ||
111 | if (ei->addr >= end || ei->addr + ei->size <= start) | ||
112 | continue; | ||
113 | return 1; | ||
114 | } | ||
115 | return 0; | ||
116 | } | ||
117 | EXPORT_SYMBOL_GPL(e820_any_mapped); | ||
118 | |||
119 | /* | ||
120 | * This function checks if the entire range <start,end> is mapped with type. | ||
121 | * | ||
122 | * Note: this function only works correctly if the e820 table is sorted and | ||
123 | * non-overlapping, which is the case | ||
124 | */ | ||
125 | int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) | ||
126 | { | ||
127 | int i; | ||
128 | for (i = 0; i < e820.nr_map; i++) { | ||
129 | struct e820entry *ei = &e820.map[i]; | ||
130 | if (type && ei->type != type) | ||
131 | continue; | ||
132 | /* does this e820 region overlap (at least partly) the requested range? */ | ||
133 | if (ei->addr >= end || ei->addr + ei->size <= start) | ||
134 | continue; | ||
135 | |||
136 | /* if the region covers the beginning of <start,end>, move | ||
137 | * start to the end of the region, since the range is mapped up to there | ||
138 | */ | ||
139 | if (ei->addr <= start) | ||
140 | start = ei->addr + ei->size; | ||
141 | /* if start is now at or beyond end, we're done, full coverage */ | ||
142 | if (start >= end) | ||
143 | return 1; /* we're done */ | ||
144 | } | ||
145 | return 0; | ||
146 | } | ||
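Because the table is sorted and non-overlapping, e820_all_mapped() only needs one pass: it pushes start forward through every region that covers it until start reaches end. A user-space restatement of that walk over an invented two-entry map (struct region and all_covered() are illustrative names only):

#include <stdio.h>

struct region { unsigned long addr, size; };

/* Same walk as e820_all_mapped(): assumes the map is sorted and disjoint. */
static int all_covered(const struct region *map, int n,
                       unsigned long start, unsigned long end)
{
    int i;

    for (i = 0; i < n; i++) {
        if (map[i].addr >= end || map[i].addr + map[i].size <= start)
            continue;                       /* no overlap with <start,end> */
        if (map[i].addr <= start)
            start = map[i].addr + map[i].size;
        if (start >= end)
            return 1;                       /* fully covered */
    }
    return 0;
}

int main(void)
{
    struct region map[] = { { 0x0, 0x1000 }, { 0x1000, 0x3000 } };

    printf("%d\n", all_covered(map, 2, 0x500, 0x2000)); /* 1: covered */
    printf("%d\n", all_covered(map, 2, 0x500, 0x5000)); /* 0: nothing past 0x4000 */
    return 0;
}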
147 | |||
148 | /* | ||
149 | * Find a free area in a specific range. | ||
150 | */ | ||
151 | unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) | ||
152 | { | ||
153 | int i; | ||
154 | for (i = 0; i < e820.nr_map; i++) { | ||
155 | struct e820entry *ei = &e820.map[i]; | ||
156 | unsigned long addr = ei->addr, last; | ||
157 | if (ei->type != E820_RAM) | ||
158 | continue; | ||
159 | if (addr < start) | ||
160 | addr = start; | ||
161 | if (addr > ei->addr + ei->size) | ||
162 | continue; | ||
163 | while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) | ||
164 | ; | ||
165 | last = PAGE_ALIGN(addr) + size; | ||
166 | if (last > ei->addr + ei->size) | ||
167 | continue; | ||
168 | if (last > end) | ||
169 | continue; | ||
170 | return addr; | ||
171 | } | ||
172 | return -1UL; | ||
173 | } | ||
174 | |||
175 | /* | ||
176 | * Find the highest page frame number we have available | ||
177 | */ | ||
178 | unsigned long __init e820_end_of_ram(void) | ||
179 | { | ||
180 | unsigned long end_pfn = 0; | ||
181 | end_pfn = find_max_pfn_with_active_regions(); | ||
182 | |||
183 | if (end_pfn > end_pfn_map) | ||
184 | end_pfn_map = end_pfn; | ||
185 | if (end_pfn_map > MAXMEM>>PAGE_SHIFT) | ||
186 | end_pfn_map = MAXMEM>>PAGE_SHIFT; | ||
187 | if (end_pfn > end_user_pfn) | ||
188 | end_pfn = end_user_pfn; | ||
189 | if (end_pfn > end_pfn_map) | ||
190 | end_pfn = end_pfn_map; | ||
191 | |||
192 | printk("end_pfn_map = %lu\n", end_pfn_map); | ||
193 | return end_pfn; | ||
194 | } | ||
195 | |||
196 | /* | ||
197 | * Mark e820 reserved areas as busy for the resource manager. | ||
198 | */ | ||
199 | void __init e820_reserve_resources(void) | ||
200 | { | ||
201 | int i; | ||
202 | for (i = 0; i < e820.nr_map; i++) { | ||
203 | struct resource *res; | ||
204 | res = alloc_bootmem_low(sizeof(struct resource)); | ||
205 | switch (e820.map[i].type) { | ||
206 | case E820_RAM: res->name = "System RAM"; break; | ||
207 | case E820_ACPI: res->name = "ACPI Tables"; break; | ||
208 | case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; | ||
209 | default: res->name = "reserved"; | ||
210 | } | ||
211 | res->start = e820.map[i].addr; | ||
212 | res->end = res->start + e820.map[i].size - 1; | ||
213 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | ||
214 | request_resource(&iomem_resource, res); | ||
215 | if (e820.map[i].type == E820_RAM) { | ||
216 | /* | ||
217 | * We don't know which RAM region contains kernel data, | ||
218 | * so we try it repeatedly and let the resource manager | ||
219 | * test it. | ||
220 | */ | ||
221 | request_resource(res, &code_resource); | ||
222 | request_resource(res, &data_resource); | ||
223 | #ifdef CONFIG_KEXEC | ||
224 | request_resource(res, &crashk_res); | ||
225 | #endif | ||
226 | } | ||
227 | } | ||
228 | } | ||
229 | |||
230 | /* | ||
231 | * Find the ranges of physical addresses that do not correspond to | ||
232 | * e820 RAM areas and mark the corresponding pages as nosave for software | ||
233 | * suspend and suspend to RAM. | ||
234 | * | ||
235 | * This function requires the e820 map to be sorted and without any | ||
236 | * overlapping entries and assumes the first e820 area to be RAM. | ||
237 | */ | ||
238 | void __init e820_mark_nosave_regions(void) | ||
239 | { | ||
240 | int i; | ||
241 | unsigned long paddr; | ||
242 | |||
243 | paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE); | ||
244 | for (i = 1; i < e820.nr_map; i++) { | ||
245 | struct e820entry *ei = &e820.map[i]; | ||
246 | |||
247 | if (paddr < ei->addr) | ||
248 | register_nosave_region(PFN_DOWN(paddr), | ||
249 | PFN_UP(ei->addr)); | ||
250 | |||
251 | paddr = round_down(ei->addr + ei->size, PAGE_SIZE); | ||
252 | if (ei->type != E820_RAM) | ||
253 | register_nosave_region(PFN_UP(ei->addr), | ||
254 | PFN_DOWN(paddr)); | ||
255 | |||
256 | if (paddr >= (end_pfn << PAGE_SHIFT)) | ||
257 | break; | ||
258 | } | ||
259 | } | ||
260 | |||
261 | /* | ||
262 | * Finds an active region in the address range from start_pfn to end_pfn and | ||
263 | * returns its range in ei_startpfn and ei_endpfn for the e820 entry. | ||
264 | */ | ||
265 | static int __init e820_find_active_region(const struct e820entry *ei, | ||
266 | unsigned long start_pfn, | ||
267 | unsigned long end_pfn, | ||
268 | unsigned long *ei_startpfn, | ||
269 | unsigned long *ei_endpfn) | ||
270 | { | ||
271 | *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT; | ||
272 | *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT; | ||
273 | |||
274 | /* Skip map entries smaller than a page */ | ||
275 | if (*ei_startpfn >= *ei_endpfn) | ||
276 | return 0; | ||
277 | |||
278 | /* Check if end_pfn_map should be updated */ | ||
279 | if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map) | ||
280 | end_pfn_map = *ei_endpfn; | ||
281 | |||
282 | /* Skip if map is outside the node */ | ||
283 | if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || | ||
284 | *ei_startpfn >= end_pfn) | ||
285 | return 0; | ||
286 | |||
287 | /* Check for overlaps */ | ||
288 | if (*ei_startpfn < start_pfn) | ||
289 | *ei_startpfn = start_pfn; | ||
290 | if (*ei_endpfn > end_pfn) | ||
291 | *ei_endpfn = end_pfn; | ||
292 | |||
293 | /* Obey end_user_pfn to save on memmap */ | ||
294 | if (*ei_startpfn >= end_user_pfn) | ||
295 | return 0; | ||
296 | if (*ei_endpfn > end_user_pfn) | ||
297 | *ei_endpfn = end_user_pfn; | ||
298 | |||
299 | return 1; | ||
300 | } | ||
301 | |||
302 | /* Walk the e820 map and register active regions within a node */ | ||
303 | void __init | ||
304 | e820_register_active_regions(int nid, unsigned long start_pfn, | ||
305 | unsigned long end_pfn) | ||
306 | { | ||
307 | unsigned long ei_startpfn; | ||
308 | unsigned long ei_endpfn; | ||
309 | int i; | ||
310 | |||
311 | for (i = 0; i < e820.nr_map; i++) | ||
312 | if (e820_find_active_region(&e820.map[i], | ||
313 | start_pfn, end_pfn, | ||
314 | &ei_startpfn, &ei_endpfn)) | ||
315 | add_active_range(nid, ei_startpfn, ei_endpfn); | ||
316 | } | ||
317 | |||
318 | /* | ||
319 | * Add a memory region to the kernel e820 map. | ||
320 | */ | ||
321 | void __init add_memory_region(unsigned long start, unsigned long size, int type) | ||
322 | { | ||
323 | int x = e820.nr_map; | ||
324 | |||
325 | if (x == E820MAX) { | ||
326 | printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); | ||
327 | return; | ||
328 | } | ||
329 | |||
330 | e820.map[x].addr = start; | ||
331 | e820.map[x].size = size; | ||
332 | e820.map[x].type = type; | ||
333 | e820.nr_map++; | ||
334 | } | ||
335 | |||
336 | /* | ||
337 | * Find the hole size (in bytes) in the memory range. | ||
338 | * @start: starting address of the memory range to scan | ||
339 | * @end: ending address of the memory range to scan | ||
340 | */ | ||
341 | unsigned long __init e820_hole_size(unsigned long start, unsigned long end) | ||
342 | { | ||
343 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
344 | unsigned long end_pfn = end >> PAGE_SHIFT; | ||
345 | unsigned long ei_startpfn; | ||
346 | unsigned long ei_endpfn; | ||
347 | unsigned long ram = 0; | ||
348 | int i; | ||
349 | |||
350 | for (i = 0; i < e820.nr_map; i++) { | ||
351 | if (e820_find_active_region(&e820.map[i], | ||
352 | start_pfn, end_pfn, | ||
353 | &ei_startpfn, &ei_endpfn)) | ||
354 | ram += ei_endpfn - ei_startpfn; | ||
355 | } | ||
356 | return end - start - (ram << PAGE_SHIFT); | ||
357 | } | ||
358 | |||
359 | void __init e820_print_map(char *who) | ||
360 | { | ||
361 | int i; | ||
362 | |||
363 | for (i = 0; i < e820.nr_map; i++) { | ||
364 | printk(KERN_INFO " %s: %016Lx - %016Lx ", who, | ||
365 | (unsigned long long) e820.map[i].addr, | ||
366 | (unsigned long long) (e820.map[i].addr + e820.map[i].size)); | ||
367 | switch (e820.map[i].type) { | ||
368 | case E820_RAM: printk("(usable)\n"); | ||
369 | break; | ||
370 | case E820_RESERVED: | ||
371 | printk("(reserved)\n"); | ||
372 | break; | ||
373 | case E820_ACPI: | ||
374 | printk("(ACPI data)\n"); | ||
375 | break; | ||
376 | case E820_NVS: | ||
377 | printk("(ACPI NVS)\n"); | ||
378 | break; | ||
379 | default: printk("type %u\n", e820.map[i].type); | ||
380 | break; | ||
381 | } | ||
382 | } | ||
383 | } | ||
384 | |||
385 | /* | ||
386 | * Sanitize the BIOS e820 map. | ||
387 | * | ||
388 | * Some e820 responses include overlapping entries. The following | ||
389 | * replaces the original e820 map with a new one, removing overlaps. | ||
390 | * | ||
391 | */ | ||
392 | static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) | ||
393 | { | ||
394 | struct change_member { | ||
395 | struct e820entry *pbios; /* pointer to original bios entry */ | ||
396 | unsigned long long addr; /* address for this change point */ | ||
397 | }; | ||
398 | static struct change_member change_point_list[2*E820MAX] __initdata; | ||
399 | static struct change_member *change_point[2*E820MAX] __initdata; | ||
400 | static struct e820entry *overlap_list[E820MAX] __initdata; | ||
401 | static struct e820entry new_bios[E820MAX] __initdata; | ||
402 | struct change_member *change_tmp; | ||
403 | unsigned long current_type, last_type; | ||
404 | unsigned long long last_addr; | ||
405 | int chgidx, still_changing; | ||
406 | int overlap_entries; | ||
407 | int new_bios_entry; | ||
408 | int old_nr, new_nr, chg_nr; | ||
409 | int i; | ||
410 | |||
411 | /* | ||
412 | Visually we're performing the following (1,2,3,4 = memory types)... | ||
413 | |||
414 | Sample memory map (w/overlaps): | ||
415 | ____22__________________ | ||
416 | ______________________4_ | ||
417 | ____1111________________ | ||
418 | _44_____________________ | ||
419 | 11111111________________ | ||
420 | ____________________33__ | ||
421 | ___________44___________ | ||
422 | __________33333_________ | ||
423 | ______________22________ | ||
424 | ___________________2222_ | ||
425 | _________111111111______ | ||
426 | _____________________11_ | ||
427 | _________________4______ | ||
428 | |||
429 | Sanitized equivalent (no overlap): | ||
430 | 1_______________________ | ||
431 | _44_____________________ | ||
432 | ___1____________________ | ||
433 | ____22__________________ | ||
434 | ______11________________ | ||
435 | _________1______________ | ||
436 | __________3_____________ | ||
437 | ___________44___________ | ||
438 | _____________33_________ | ||
439 | _______________2________ | ||
440 | ________________1_______ | ||
441 | _________________4______ | ||
442 | ___________________2____ | ||
443 | ____________________33__ | ||
444 | ______________________4_ | ||
445 | */ | ||
446 | |||
447 | /* if there's only one memory region, don't bother */ | ||
448 | if (*pnr_map < 2) | ||
449 | return -1; | ||
450 | |||
451 | old_nr = *pnr_map; | ||
452 | |||
453 | /* bail out if we find any unreasonable addresses in bios map */ | ||
454 | for (i=0; i<old_nr; i++) | ||
455 | if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) | ||
456 | return -1; | ||
457 | |||
458 | /* create pointers for initial change-point information (for sorting) */ | ||
459 | for (i=0; i < 2*old_nr; i++) | ||
460 | change_point[i] = &change_point_list[i]; | ||
461 | |||
462 | /* record all known change-points (starting and ending addresses), | ||
463 | omitting those that are for empty memory regions */ | ||
464 | chgidx = 0; | ||
465 | for (i=0; i < old_nr; i++) { | ||
466 | if (biosmap[i].size != 0) { | ||
467 | change_point[chgidx]->addr = biosmap[i].addr; | ||
468 | change_point[chgidx++]->pbios = &biosmap[i]; | ||
469 | change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; | ||
470 | change_point[chgidx++]->pbios = &biosmap[i]; | ||
471 | } | ||
472 | } | ||
473 | chg_nr = chgidx; | ||
474 | |||
475 | /* sort change-point list by memory addresses (low -> high) */ | ||
476 | still_changing = 1; | ||
477 | while (still_changing) { | ||
478 | still_changing = 0; | ||
479 | for (i=1; i < chg_nr; i++) { | ||
480 | /* if <current_addr> > <last_addr>, swap */ | ||
481 | /* or, if current=<start_addr> & last=<end_addr>, swap */ | ||
482 | if ((change_point[i]->addr < change_point[i-1]->addr) || | ||
483 | ((change_point[i]->addr == change_point[i-1]->addr) && | ||
484 | (change_point[i]->addr == change_point[i]->pbios->addr) && | ||
485 | (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) | ||
486 | ) | ||
487 | { | ||
488 | change_tmp = change_point[i]; | ||
489 | change_point[i] = change_point[i-1]; | ||
490 | change_point[i-1] = change_tmp; | ||
491 | still_changing=1; | ||
492 | } | ||
493 | } | ||
494 | } | ||
495 | |||
496 | /* create a new bios memory map, removing overlaps */ | ||
497 | overlap_entries=0; /* number of entries in the overlap table */ | ||
498 | new_bios_entry=0; /* index for creating new bios map entries */ | ||
499 | last_type = 0; /* start with undefined memory type */ | ||
500 | last_addr = 0; /* start with 0 as last starting address */ | ||
501 | /* loop through change-points, determining the effect on the new bios map */ | ||
502 | for (chgidx=0; chgidx < chg_nr; chgidx++) | ||
503 | { | ||
504 | /* keep track of all overlapping bios entries */ | ||
505 | if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) | ||
506 | { | ||
507 | /* add map entry to overlap list (> 1 entry implies an overlap) */ | ||
508 | overlap_list[overlap_entries++]=change_point[chgidx]->pbios; | ||
509 | } | ||
510 | else | ||
511 | { | ||
512 | /* remove entry from list (order independent, so swap with last) */ | ||
513 | for (i=0; i<overlap_entries; i++) | ||
514 | { | ||
515 | if (overlap_list[i] == change_point[chgidx]->pbios) | ||
516 | overlap_list[i] = overlap_list[overlap_entries-1]; | ||
517 | } | ||
518 | overlap_entries--; | ||
519 | } | ||
520 | /* if there are overlapping entries, decide which "type" to use */ | ||
521 | /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ | ||
522 | current_type = 0; | ||
523 | for (i=0; i<overlap_entries; i++) | ||
524 | if (overlap_list[i]->type > current_type) | ||
525 | current_type = overlap_list[i]->type; | ||
526 | /* continue building up new bios map based on this information */ | ||
527 | if (current_type != last_type) { | ||
528 | if (last_type != 0) { | ||
529 | new_bios[new_bios_entry].size = | ||
530 | change_point[chgidx]->addr - last_addr; | ||
531 | /* move forward only if the new size was non-zero */ | ||
532 | if (new_bios[new_bios_entry].size != 0) | ||
533 | if (++new_bios_entry >= E820MAX) | ||
534 | break; /* no more space left for new bios entries */ | ||
535 | } | ||
536 | if (current_type != 0) { | ||
537 | new_bios[new_bios_entry].addr = change_point[chgidx]->addr; | ||
538 | new_bios[new_bios_entry].type = current_type; | ||
539 | last_addr=change_point[chgidx]->addr; | ||
540 | } | ||
541 | last_type = current_type; | ||
542 | } | ||
543 | } | ||
544 | new_nr = new_bios_entry; /* retain count for new bios entries */ | ||
545 | |||
546 | /* copy new bios mapping into original location */ | ||
547 | memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); | ||
548 | *pnr_map = new_nr; | ||
549 | |||
550 | return 0; | ||
551 | } | ||
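All of the change-point machinery above implements one precedence rule: wherever entries overlap, the numerically larger type wins, so reserved areas punch holes in usable RAM. A brute-force illustration of that rule on two invented overlapping entries, checking sample addresses directly instead of sweeping change-points:

#include <stdio.h>

struct ent { unsigned long addr, size; unsigned type; };

/* Highest type covering a given address; 0 means "not mapped". */
static unsigned type_at(const struct ent *map, int n, unsigned long a)
{
    unsigned t = 0;
    int i;

    for (i = 0; i < n; i++)
        if (a >= map[i].addr && a < map[i].addr + map[i].size &&
            map[i].type > t)
            t = map[i].type;
    return t;
}

int main(void)
{
    /* invented input: usable RAM (type 1) overlapped by a reserved hole (type 2) */
    struct ent map[] = {
        { 0x0000, 0x8000, 1 },
        { 0x4000, 0x2000, 2 },
    };
    unsigned long a;

    for (a = 0; a < 0x8000; a += 0x2000)
        printf("0x%05lx: type %u\n", a, type_at(map, 2, a));
    return 0;
}

For this input the sanitizer above would emit three entries: type 1 up to 0x4000, type 2 for the next 0x2000, and type 1 again up to 0x8000.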
552 | |||
553 | /* | ||
554 | * Copy the BIOS e820 map into a safe place. | ||
555 | * | ||
556 | * Sanity-check it while we're at it.. | ||
557 | * | ||
558 | * If we're lucky and live on a modern system, the setup code | ||
559 | * will have given us a memory map that we can use to properly | ||
560 | * set up memory. If we aren't, we'll fake a memory map. | ||
561 | */ | ||
562 | static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) | ||
563 | { | ||
564 | /* Only one memory region (or negative)? Ignore it */ | ||
565 | if (nr_map < 2) | ||
566 | return -1; | ||
567 | |||
568 | do { | ||
569 | unsigned long start = biosmap->addr; | ||
570 | unsigned long size = biosmap->size; | ||
571 | unsigned long end = start + size; | ||
572 | unsigned long type = biosmap->type; | ||
573 | |||
574 | /* Overflow in 64 bits? Ignore the memory map. */ | ||
575 | if (start > end) | ||
576 | return -1; | ||
577 | |||
578 | add_memory_region(start, size, type); | ||
579 | } while (biosmap++,--nr_map); | ||
580 | return 0; | ||
581 | } | ||
582 | |||
583 | void early_panic(char *msg) | ||
584 | { | ||
585 | early_printk(msg); | ||
586 | panic(msg); | ||
587 | } | ||
588 | |||
589 | void __init setup_memory_region(void) | ||
590 | { | ||
591 | /* | ||
592 | * Try to copy the BIOS-supplied E820-map. | ||
593 | * | ||
594 | * Otherwise fake a memory map; one section from 0k->640k, | ||
595 | * the next section from 1mb->appropriate_mem_k | ||
596 | */ | ||
597 | sanitize_e820_map(E820_MAP, &E820_MAP_NR); | ||
598 | if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) | ||
599 | early_panic("Cannot find a valid memory map"); | ||
600 | printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | ||
601 | e820_print_map("BIOS-e820"); | ||
602 | } | ||
603 | |||
604 | static int __init parse_memopt(char *p) | ||
605 | { | ||
606 | if (!p) | ||
607 | return -EINVAL; | ||
608 | end_user_pfn = memparse(p, &p); | ||
609 | end_user_pfn >>= PAGE_SHIFT; | ||
610 | return 0; | ||
611 | } | ||
612 | early_param("mem", parse_memopt); | ||
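memparse(), used above, is the kernel's size parser: a number with an optional K/M/G suffix, so "mem=512M" becomes a byte count before the shift down to a page frame number. A rough user-space equivalent of that suffix handling, simplified and not the kernel's implementation:

#include <stdio.h>
#include <stdlib.h>

/* crude stand-in for memparse(): a number with an optional K/M/G suffix */
static unsigned long long parse_size(const char *s)
{
    char *end;
    unsigned long long v = strtoull(s, &end, 0);

    switch (*end) {
    case 'G': case 'g': v <<= 10; /* fall through */
    case 'M': case 'm': v <<= 10; /* fall through */
    case 'K': case 'k': v <<= 10; break;
    }
    return v;
}

int main(void)
{
    printf("%llu\n", parse_size("512M"));       /* 536870912 bytes */
    printf("%llu\n", parse_size("512M") >> 12); /* 131072 4K page frames */
    return 0;
}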
613 | |||
614 | static int userdef __initdata; | ||
615 | |||
616 | static int __init parse_memmap_opt(char *p) | ||
617 | { | ||
618 | char *oldp; | ||
619 | unsigned long long start_at, mem_size; | ||
620 | |||
621 | if (!strcmp(p, "exactmap")) { | ||
622 | #ifdef CONFIG_CRASH_DUMP | ||
623 | /* If we are doing a crash dump, we | ||
624 | * still need to know the real mem | ||
625 | * size before original memory map is | ||
626 | * reset. | ||
627 | */ | ||
628 | e820_register_active_regions(0, 0, -1UL); | ||
629 | saved_max_pfn = e820_end_of_ram(); | ||
630 | remove_all_active_ranges(); | ||
631 | #endif | ||
632 | end_pfn_map = 0; | ||
633 | e820.nr_map = 0; | ||
634 | userdef = 1; | ||
635 | return 0; | ||
636 | } | ||
637 | |||
638 | oldp = p; | ||
639 | mem_size = memparse(p, &p); | ||
640 | if (p == oldp) | ||
641 | return -EINVAL; | ||
642 | if (*p == '@') { | ||
643 | start_at = memparse(p+1, &p); | ||
644 | add_memory_region(start_at, mem_size, E820_RAM); | ||
645 | } else if (*p == '#') { | ||
646 | start_at = memparse(p+1, &p); | ||
647 | add_memory_region(start_at, mem_size, E820_ACPI); | ||
648 | } else if (*p == '$') { | ||
649 | start_at = memparse(p+1, &p); | ||
650 | add_memory_region(start_at, mem_size, E820_RESERVED); | ||
651 | } else { | ||
652 | end_user_pfn = (mem_size >> PAGE_SHIFT); | ||
653 | } | ||
654 | return *p == '\0' ? 0 : -EINVAL; | ||
655 | } | ||
656 | early_param("memmap", parse_memmap_opt); | ||
657 | |||
658 | void __init finish_e820_parsing(void) | ||
659 | { | ||
660 | if (userdef) { | ||
661 | printk(KERN_INFO "user-defined physical RAM map:\n"); | ||
662 | e820_print_map("user"); | ||
663 | } | ||
664 | } | ||
665 | |||
666 | unsigned long pci_mem_start = 0xaeedbabe; | ||
667 | EXPORT_SYMBOL(pci_mem_start); | ||
668 | |||
669 | /* | ||
670 | * Search for the biggest gap in the low 32 bits of the e820 | ||
671 | * memory space. We pass this space to PCI to assign MMIO resources | ||
672 | * for hotplug or unconfigured devices. | ||
673 | * Hopefully the BIOS left enough space for this. | ||
674 | */ | ||
675 | __init void e820_setup_gap(void) | ||
676 | { | ||
677 | unsigned long gapstart, gapsize, round; | ||
678 | unsigned long last; | ||
679 | int i; | ||
680 | int found = 0; | ||
681 | |||
682 | last = 0x100000000ull; | ||
683 | gapstart = 0x10000000; | ||
684 | gapsize = 0x400000; | ||
685 | i = e820.nr_map; | ||
686 | while (--i >= 0) { | ||
687 | unsigned long long start = e820.map[i].addr; | ||
688 | unsigned long long end = start + e820.map[i].size; | ||
689 | |||
690 | /* | ||
691 | * Since "last" is at most 4GB, we know we'll | ||
692 | * fit in 32 bits if this condition is true | ||
693 | */ | ||
694 | if (last > end) { | ||
695 | unsigned long gap = last - end; | ||
696 | |||
697 | if (gap > gapsize) { | ||
698 | gapsize = gap; | ||
699 | gapstart = end; | ||
700 | found = 1; | ||
701 | } | ||
702 | } | ||
703 | if (start < last) | ||
704 | last = start; | ||
705 | } | ||
706 | |||
707 | if (!found) { | ||
708 | gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; | ||
709 | printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" | ||
710 | KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); | ||
711 | } | ||
712 | |||
713 | /* | ||
714 | * See how much we want to round up: start off with | ||
715 | * rounding to the next 1MB area. | ||
716 | */ | ||
717 | round = 0x100000; | ||
718 | while ((gapsize >> 4) > round) | ||
719 | round += round; | ||
720 | /* Fun with two's complement */ | ||
721 | pci_mem_start = (gapstart + round) & -round; | ||
722 | |||
723 | printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", | ||
724 | pci_mem_start, gapstart, gapsize); | ||
725 | } | ||
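The "fun with two's complement" line relies on round being a power of two: -round then has only the bits at and above log2(round) set, so the AND rounds down to a multiple of round, and adding round first turns that into a round-up of gapstart (overshooting by one full round when gapstart is already aligned, which is harmless here). A tiny demonstration with made-up addresses:

#include <stdio.h>

int main(void)
{
    unsigned long round = 0x100000;         /* 1MB, a power of two */
    unsigned long aligned = 0x10000000;     /* already a multiple of round */
    unsigned long unaligned = 0x1fe57000;   /* arbitrary example address */

    /* -round is ...fff00000, so the AND clears the low 20 bits */
    printf("%lx -> %lx\n", aligned, (aligned + round) & -round);
    printf("%lx -> %lx\n", unaligned, (unaligned + round) & -round);
    return 0;
}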
diff --git a/arch/x86/kernel/early-quirks_64.c b/arch/x86/kernel/early-quirks_64.c new file mode 100644 index 000000000000..13aa4fd728f3 --- /dev/null +++ b/arch/x86/kernel/early-quirks_64.c | |||
@@ -0,0 +1,127 @@ | |||
1 | /* Various workarounds for chipset bugs. | ||
2 | This code runs very early and can't use the regular PCI subsystem. | ||
3 | The entries are keyed to PCI bridges which usually identify chipsets | ||
4 | uniquely. | ||
5 | This is only for whole classes of chipsets with specific problems which | ||
6 | need early invasive action (e.g. before the timers are initialized). | ||
7 | Most PCI device specific workarounds can be done later and should be | ||
8 | in standard PCI quirks. | ||
9 | Mainboard specific bugs should be handled by DMI entries. | ||
10 | CPU specific bugs go in setup.c. */ | ||
11 | |||
12 | #include <linux/pci.h> | ||
13 | #include <linux/acpi.h> | ||
14 | #include <linux/pci_ids.h> | ||
15 | #include <asm/pci-direct.h> | ||
16 | #include <asm/proto.h> | ||
17 | #include <asm/iommu.h> | ||
18 | #include <asm/dma.h> | ||
19 | |||
20 | static void __init via_bugs(void) | ||
21 | { | ||
22 | #ifdef CONFIG_IOMMU | ||
23 | if ((end_pfn > MAX_DMA32_PFN || force_iommu) && | ||
24 | !iommu_aperture_allowed) { | ||
25 | printk(KERN_INFO | ||
26 | "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n"); | ||
27 | iommu_aperture_disabled = 1; | ||
28 | } | ||
29 | #endif | ||
30 | } | ||
31 | |||
32 | #ifdef CONFIG_ACPI | ||
33 | |||
34 | static int __init nvidia_hpet_check(struct acpi_table_header *header) | ||
35 | { | ||
36 | return 0; | ||
37 | } | ||
38 | #endif | ||
39 | |||
40 | static void __init nvidia_bugs(void) | ||
41 | { | ||
42 | #ifdef CONFIG_ACPI | ||
43 | /* | ||
44 | * All timer overrides on Nvidia are | ||
45 | * wrong unless HPET is enabled. | ||
46 | * Unfortunately that's not true on many Asus boards. | ||
47 | * We don't know yet how to detect this automatically, but | ||
48 | * at least allow a command line override. | ||
49 | */ | ||
50 | if (acpi_use_timer_override) | ||
51 | return; | ||
52 | |||
53 | if (acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check)) { | ||
54 | acpi_skip_timer_override = 1; | ||
55 | printk(KERN_INFO "Nvidia board " | ||
56 | "detected. Ignoring ACPI " | ||
57 | "timer override.\n"); | ||
58 | printk(KERN_INFO "If you got timer trouble " | ||
59 | "try acpi_use_timer_override\n"); | ||
60 | } | ||
61 | #endif | ||
62 | /* RED-PEN skip them on mptables too? */ | ||
63 | |||
64 | } | ||
65 | |||
66 | static void __init ati_bugs(void) | ||
67 | { | ||
68 | if (timer_over_8254 == 1) { | ||
69 | timer_over_8254 = 0; | ||
70 | printk(KERN_INFO | ||
71 | "ATI board detected. Disabling timer routing over 8254.\n"); | ||
72 | } | ||
73 | } | ||
74 | |||
75 | struct chipset { | ||
76 | u16 vendor; | ||
77 | void (*f)(void); | ||
78 | }; | ||
79 | |||
80 | static struct chipset early_qrk[] __initdata = { | ||
81 | { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, | ||
82 | { PCI_VENDOR_ID_VIA, via_bugs }, | ||
83 | { PCI_VENDOR_ID_ATI, ati_bugs }, | ||
84 | {} | ||
85 | }; | ||
86 | |||
87 | void __init early_quirks(void) | ||
88 | { | ||
89 | int num, slot, func; | ||
90 | |||
91 | if (!early_pci_allowed()) | ||
92 | return; | ||
93 | |||
94 | /* Poor man's PCI discovery */ | ||
95 | for (num = 0; num < 32; num++) { | ||
96 | for (slot = 0; slot < 32; slot++) { | ||
97 | for (func = 0; func < 8; func++) { | ||
98 | u32 class; | ||
99 | u32 vendor; | ||
100 | u8 type; | ||
101 | int i; | ||
102 | class = read_pci_config(num,slot,func, | ||
103 | PCI_CLASS_REVISION); | ||
104 | if (class == 0xffffffff) | ||
105 | break; | ||
106 | |||
107 | if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) | ||
108 | continue; | ||
109 | |||
110 | vendor = read_pci_config(num, slot, func, | ||
111 | PCI_VENDOR_ID); | ||
112 | vendor &= 0xffff; | ||
113 | |||
114 | for (i = 0; early_qrk[i].f; i++) | ||
115 | if (early_qrk[i].vendor == vendor) { | ||
116 | early_qrk[i].f(); | ||
117 | return; | ||
118 | } | ||
119 | |||
120 | type = read_pci_config_byte(num, slot, func, | ||
121 | PCI_HEADER_TYPE); | ||
122 | if (!(type & 0x80)) | ||
123 | break; | ||
124 | } | ||
125 | } | ||
126 | } | ||
127 | } | ||
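read_pci_config() above is the early, direct config-space accessor. With PCI configuration mechanism #1, the (bus, slot, func, offset) tuple is packed into a 32-bit address written to port 0xCF8 before the data is read from 0xCFC; the packing below shows that standard layout and is an assumption about how the accessor works, not code taken from this patch (pci_conf1_addr is an invented name):

#include <stdio.h>

/* PCI config mechanism #1: enable bit | bus | device | function | register */
static unsigned int pci_conf1_addr(unsigned int bus, unsigned int slot,
                                   unsigned int func, unsigned int offset)
{
    return 0x80000000u | (bus << 16) | (slot << 11) | (func << 8) |
           (offset & 0xfc);
}

int main(void)
{
    /* bus 0, device 3, function 0, vendor ID register (offset 0) */
    printf("0x%08x\n", pci_conf1_addr(0, 3, 0, 0)); /* 0x80001800 */
    return 0;
}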
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 92f812ba275c..fd9aff3f3890 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c | |||
@@ -1,2 +1,259 @@ | |||
1 | #include <linux/console.h> | ||
2 | #include <linux/kernel.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/string.h> | ||
5 | #include <linux/screen_info.h> | ||
6 | #include <asm/io.h> | ||
7 | #include <asm/processor.h> | ||
8 | #include <asm/fcntl.h> | ||
9 | #include <xen/hvc-console.h> | ||
1 | 10 | ||
2 | #include "../../x86_64/kernel/early_printk.c" | 11 | /* Simple VGA output */ |
12 | |||
13 | #ifdef __i386__ | ||
14 | #include <asm/setup.h> | ||
15 | #else | ||
16 | #include <asm/bootsetup.h> | ||
17 | #endif | ||
18 | #define VGABASE (__ISA_IO_base + 0xb8000) | ||
19 | |||
20 | static int max_ypos = 25, max_xpos = 80; | ||
21 | static int current_ypos = 25, current_xpos = 0; | ||
22 | |||
23 | static void early_vga_write(struct console *con, const char *str, unsigned n) | ||
24 | { | ||
25 | char c; | ||
26 | int i, k, j; | ||
27 | |||
28 | while ((c = *str++) != '\0' && n-- > 0) { | ||
29 | if (current_ypos >= max_ypos) { | ||
30 | /* scroll 1 line up */ | ||
31 | for (k = 1, j = 0; k < max_ypos; k++, j++) { | ||
32 | for (i = 0; i < max_xpos; i++) { | ||
33 | writew(readw(VGABASE+2*(max_xpos*k+i)), | ||
34 | VGABASE + 2*(max_xpos*j + i)); | ||
35 | } | ||
36 | } | ||
37 | for (i = 0; i < max_xpos; i++) | ||
38 | writew(0x720, VGABASE + 2*(max_xpos*j + i)); | ||
39 | current_ypos = max_ypos-1; | ||
40 | } | ||
41 | if (c == '\n') { | ||
42 | current_xpos = 0; | ||
43 | current_ypos++; | ||
44 | } else if (c != '\r') { | ||
45 | writew(((0x7 << 8) | (unsigned short) c), | ||
46 | VGABASE + 2*(max_xpos*current_ypos + | ||
47 | current_xpos++)); | ||
48 | if (current_xpos >= max_xpos) { | ||
49 | current_xpos = 0; | ||
50 | current_ypos++; | ||
51 | } | ||
52 | } | ||
53 | } | ||
54 | } | ||
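The scroll path above copies every VGA row one line up and then fills the bottom row with 0x720, i.e. a space (0x20) with the light-grey-on-black attribute (0x07). A plain-memory sketch of the same scroll using an ordinary character array instead of the memory-mapped VGABASE (buffer and dimensions are invented):

#include <stdio.h>
#include <string.h>

#define XPOS 8
#define YPOS 4

/* Scroll a YPOS x XPOS text buffer up one line, blanking the last row. */
static void scroll_up(char screen[YPOS][XPOS])
{
    memmove(screen[0], screen[1], (YPOS - 1) * XPOS);
    memset(screen[YPOS - 1], ' ', XPOS);
}

int main(void)
{
    char screen[YPOS][XPOS];
    int y;

    for (y = 0; y < YPOS; y++)
        memset(screen[y], 'a' + y, XPOS);   /* rows of a, b, c, d */
    scroll_up(screen);
    for (y = 0; y < YPOS; y++)
        printf("%.*s\n", XPOS, screen[y]);  /* prints b, c, d, then blanks */
    return 0;
}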
55 | |||
56 | static struct console early_vga_console = { | ||
57 | .name = "earlyvga", | ||
58 | .write = early_vga_write, | ||
59 | .flags = CON_PRINTBUFFER, | ||
60 | .index = -1, | ||
61 | }; | ||
62 | |||
63 | /* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ | ||
64 | |||
65 | static int early_serial_base = 0x3f8; /* ttyS0 */ | ||
66 | |||
67 | #define XMTRDY 0x20 | ||
68 | |||
69 | #define DLAB 0x80 | ||
70 | |||
71 | #define TXR 0 /* Transmit register (WRITE) */ | ||
72 | #define RXR 0 /* Receive register (READ) */ | ||
73 | #define IER 1 /* Interrupt Enable */ | ||
74 | #define IIR 2 /* Interrupt ID */ | ||
75 | #define FCR 2 /* FIFO control */ | ||
76 | #define LCR 3 /* Line control */ | ||
77 | #define MCR 4 /* Modem control */ | ||
78 | #define LSR 5 /* Line Status */ | ||
79 | #define MSR 6 /* Modem Status */ | ||
80 | #define DLL 0 /* Divisor Latch Low */ | ||
81 | #define DLH 1 /* Divisor latch High */ | ||
82 | |||
83 | static int early_serial_putc(unsigned char ch) | ||
84 | { | ||
85 | unsigned timeout = 0xffff; | ||
86 | while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) | ||
87 | cpu_relax(); | ||
88 | outb(ch, early_serial_base + TXR); | ||
89 | return timeout ? 0 : -1; | ||
90 | } | ||
91 | |||
92 | static void early_serial_write(struct console *con, const char *s, unsigned n) | ||
93 | { | ||
94 | while (*s && n-- > 0) { | ||
95 | if (*s == '\n') | ||
96 | early_serial_putc('\r'); | ||
97 | early_serial_putc(*s); | ||
98 | s++; | ||
99 | } | ||
100 | } | ||
101 | |||
102 | #define DEFAULT_BAUD 9600 | ||
103 | |||
104 | static __init void early_serial_init(char *s) | ||
105 | { | ||
106 | unsigned char c; | ||
107 | unsigned divisor; | ||
108 | unsigned baud = DEFAULT_BAUD; | ||
109 | char *e; | ||
110 | |||
111 | if (*s == ',') | ||
112 | ++s; | ||
113 | |||
114 | if (*s) { | ||
115 | unsigned port; | ||
116 | if (!strncmp(s,"0x",2)) { | ||
117 | early_serial_base = simple_strtoul(s, &e, 16); | ||
118 | } else { | ||
119 | static int bases[] = { 0x3f8, 0x2f8 }; | ||
120 | |||
121 | if (!strncmp(s,"ttyS",4)) | ||
122 | s += 4; | ||
123 | port = simple_strtoul(s, &e, 10); | ||
124 | if (port > 1 || s == e) | ||
125 | port = 0; | ||
126 | early_serial_base = bases[port]; | ||
127 | } | ||
128 | s += strcspn(s, ","); | ||
129 | if (*s == ',') | ||
130 | s++; | ||
131 | } | ||
132 | |||
133 | outb(0x3, early_serial_base + LCR); /* 8n1 */ | ||
134 | outb(0, early_serial_base + IER); /* no interrupt */ | ||
135 | outb(0, early_serial_base + FCR); /* no fifo */ | ||
136 | outb(0x3, early_serial_base + MCR); /* DTR + RTS */ | ||
137 | |||
138 | if (*s) { | ||
139 | baud = simple_strtoul(s, &e, 0); | ||
140 | if (baud == 0 || s == e) | ||
141 | baud = DEFAULT_BAUD; | ||
142 | } | ||
143 | |||
144 | divisor = 115200 / baud; | ||
145 | c = inb(early_serial_base + LCR); | ||
146 | outb(c | DLAB, early_serial_base + LCR); | ||
147 | outb(divisor & 0xff, early_serial_base + DLL); | ||
148 | outb((divisor >> 8) & 0xff, early_serial_base + DLH); | ||
149 | outb(c & ~DLAB, early_serial_base + LCR); | ||
150 | } | ||
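The divisor programming at the end follows the usual 16550 formula: assuming the standard 1.8432 MHz / 16 input clock, the top rate is 115200 baud and the 16-bit divisor latched through DLAB into DLL/DLH is 115200 / baud. A small table of the resulting divisors for common rates (illustrative only):

#include <stdio.h>

int main(void)
{
    static const unsigned bauds[] = { 115200, 57600, 38400, 19200, 9600 };
    unsigned i;

    for (i = 0; i < sizeof(bauds) / sizeof(bauds[0]); i++) {
        unsigned divisor = 115200 / bauds[i];
        printf("%6u baud -> divisor %u (DLL=0x%02x DLH=0x%02x)\n",
               bauds[i], divisor, divisor & 0xff, (divisor >> 8) & 0xff);
    }
    return 0;
}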
151 | |||
152 | static struct console early_serial_console = { | ||
153 | .name = "earlyser", | ||
154 | .write = early_serial_write, | ||
155 | .flags = CON_PRINTBUFFER, | ||
156 | .index = -1, | ||
157 | }; | ||
158 | |||
159 | /* Console interface to a host file on AMD's SimNow! */ | ||
160 | |||
161 | static int simnow_fd; | ||
162 | |||
163 | enum { | ||
164 | MAGIC1 = 0xBACCD00A, | ||
165 | MAGIC2 = 0xCA110000, | ||
166 | XOPEN = 5, | ||
167 | XWRITE = 4, | ||
168 | }; | ||
169 | |||
170 | static noinline long simnow(long cmd, long a, long b, long c) | ||
171 | { | ||
172 | long ret; | ||
173 | asm volatile("cpuid" : | ||
174 | "=a" (ret) : | ||
175 | "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); | ||
176 | return ret; | ||
177 | } | ||
178 | |||
179 | static void __init simnow_init(char *str) | ||
180 | { | ||
181 | char *fn = "klog"; | ||
182 | if (*str == '=') | ||
183 | fn = ++str; | ||
184 | /* error ignored */ | ||
185 | simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644); | ||
186 | } | ||
187 | |||
188 | static void simnow_write(struct console *con, const char *s, unsigned n) | ||
189 | { | ||
190 | simnow(XWRITE, simnow_fd, (unsigned long)s, n); | ||
191 | } | ||
192 | |||
193 | static struct console simnow_console = { | ||
194 | .name = "simnow", | ||
195 | .write = simnow_write, | ||
196 | .flags = CON_PRINTBUFFER, | ||
197 | .index = -1, | ||
198 | }; | ||
199 | |||
200 | /* Direct interface for emergencies */ | ||
201 | struct console *early_console = &early_vga_console; | ||
202 | static int early_console_initialized = 0; | ||
203 | |||
204 | void early_printk(const char *fmt, ...) | ||
205 | { | ||
206 | char buf[512]; | ||
207 | int n; | ||
208 | va_list ap; | ||
209 | |||
210 | va_start(ap,fmt); | ||
211 | n = vscnprintf(buf,512,fmt,ap); | ||
212 | early_console->write(early_console,buf,n); | ||
213 | va_end(ap); | ||
214 | } | ||
215 | |||
216 | static int __initdata keep_early; | ||
217 | |||
218 | static int __init setup_early_printk(char *buf) | ||
219 | { | ||
220 | if (!buf) | ||
221 | return 0; | ||
222 | |||
223 | if (early_console_initialized) | ||
224 | return 0; | ||
225 | early_console_initialized = 1; | ||
226 | |||
227 | if (strstr(buf, "keep")) | ||
228 | keep_early = 1; | ||
229 | |||
230 | if (!strncmp(buf, "serial", 6)) { | ||
231 | early_serial_init(buf + 6); | ||
232 | early_console = &early_serial_console; | ||
233 | } else if (!strncmp(buf, "ttyS", 4)) { | ||
234 | early_serial_init(buf); | ||
235 | early_console = &early_serial_console; | ||
236 | } else if (!strncmp(buf, "vga", 3) | ||
237 | && SCREEN_INFO.orig_video_isVGA == 1) { | ||
238 | max_xpos = SCREEN_INFO.orig_video_cols; | ||
239 | max_ypos = SCREEN_INFO.orig_video_lines; | ||
240 | current_ypos = SCREEN_INFO.orig_y; | ||
241 | early_console = &early_vga_console; | ||
242 | } else if (!strncmp(buf, "simnow", 6)) { | ||
243 | simnow_init(buf + 6); | ||
244 | early_console = &simnow_console; | ||
245 | keep_early = 1; | ||
246 | #ifdef CONFIG_HVC_XEN | ||
247 | } else if (!strncmp(buf, "xen", 3)) { | ||
248 | early_console = &xenboot_console; | ||
249 | #endif | ||
250 | } | ||
251 | |||
252 | if (keep_early) | ||
253 | early_console->flags &= ~CON_BOOT; | ||
254 | else | ||
255 | early_console->flags |= CON_BOOT; | ||
256 | register_console(early_console); | ||
257 | return 0; | ||
258 | } | ||
259 | early_param("earlyprintk", setup_early_printk); | ||
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S new file mode 100644 index 000000000000..1d232e5f5658 --- /dev/null +++ b/arch/x86/kernel/entry_64.S | |||
@@ -0,0 +1,1172 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/entry.S | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs | ||
6 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | ||
7 | */ | ||
8 | |||
9 | /* | ||
10 | * entry.S contains the system-call and fault low-level handling routines. | ||
11 | * | ||
12 | * NOTE: This code handles signal-recognition, which happens every time | ||
13 | * after an interrupt and after each system call. | ||
14 | * | ||
15 | * Normal syscalls and interrupts don't save a full stack frame; this is | ||
16 | * only done for syscall tracing, signals or fork/exec et al. | ||
17 | * | ||
18 | * A note on terminology: | ||
19 | * - top of stack: Architecture defined interrupt frame from SS to RIP | ||
20 | * at the top of the kernel process stack. | ||
21 | * - partial stack frame: partially saved registers up to R11. | ||
22 | * - full stack frame: Like partial stack frame, but all register saved. | ||
23 | * | ||
24 | * Some macro usage: | ||
25 | * - CFI macros are used to generate dwarf2 unwind information for better | ||
26 | * backtraces. They don't change any code. | ||
27 | * - SAVE_ALL/RESTORE_ALL - Save/restore all registers | ||
28 | * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. | ||
29 | * There are unfortunately lots of special cases where some registers | ||
30 | * are not touched. The macro is a big mess that should be cleaned up. | ||
31 | * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. | ||
32 | * Gives a full stack frame. | ||
33 | * - ENTRY/END Define functions in the symbol table. | ||
34 | * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack | ||
35 | * frame that is otherwise undefined after a SYSCALL | ||
36 | * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. | ||
37 | * - errorentry/paranoidentry/zeroentry - Define exception entry points. | ||
38 | */ | ||
39 | |||
40 | #include <linux/linkage.h> | ||
41 | #include <asm/segment.h> | ||
42 | #include <asm/cache.h> | ||
43 | #include <asm/errno.h> | ||
44 | #include <asm/dwarf2.h> | ||
45 | #include <asm/calling.h> | ||
46 | #include <asm/asm-offsets.h> | ||
47 | #include <asm/msr.h> | ||
48 | #include <asm/unistd.h> | ||
49 | #include <asm/thread_info.h> | ||
50 | #include <asm/hw_irq.h> | ||
51 | #include <asm/page.h> | ||
52 | #include <asm/irqflags.h> | ||
53 | |||
54 | .code64 | ||
55 | |||
56 | #ifndef CONFIG_PREEMPT | ||
57 | #define retint_kernel retint_restore_args | ||
58 | #endif | ||
59 | |||
60 | |||
61 | .macro TRACE_IRQS_IRETQ offset=ARGOFFSET | ||
62 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
63 | bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ | ||
64 | jnc 1f | ||
65 | TRACE_IRQS_ON | ||
66 | 1: | ||
67 | #endif | ||
68 | .endm | ||
69 | |||
70 | /* | ||
71 | * C code is not supposed to know about the undefined top of stack. Every time | ||
72 | * a C function with a pt_regs argument is called from the SYSCALL-based | ||
73 | * fast path, FIXUP_TOP_OF_STACK is needed. | ||
74 | * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs | ||
75 | * manipulation. | ||
76 | */ | ||
77 | |||
78 | /* %rsp:at FRAMEEND */ | ||
79 | .macro FIXUP_TOP_OF_STACK tmp | ||
80 | movq %gs:pda_oldrsp,\tmp | ||
81 | movq \tmp,RSP(%rsp) | ||
82 | movq $__USER_DS,SS(%rsp) | ||
83 | movq $__USER_CS,CS(%rsp) | ||
84 | movq $-1,RCX(%rsp) | ||
85 | movq R11(%rsp),\tmp /* get eflags */ | ||
86 | movq \tmp,EFLAGS(%rsp) | ||
87 | .endm | ||
88 | |||
89 | .macro RESTORE_TOP_OF_STACK tmp,offset=0 | ||
90 | movq RSP-\offset(%rsp),\tmp | ||
91 | movq \tmp,%gs:pda_oldrsp | ||
92 | movq EFLAGS-\offset(%rsp),\tmp | ||
93 | movq \tmp,R11-\offset(%rsp) | ||
94 | .endm | ||
95 | |||
96 | .macro FAKE_STACK_FRAME child_rip | ||
97 | /* push in order ss, rsp, eflags, cs, rip */ | ||
98 | xorl %eax, %eax | ||
99 | pushq %rax /* ss */ | ||
100 | CFI_ADJUST_CFA_OFFSET 8 | ||
101 | /*CFI_REL_OFFSET ss,0*/ | ||
102 | pushq %rax /* rsp */ | ||
103 | CFI_ADJUST_CFA_OFFSET 8 | ||
104 | CFI_REL_OFFSET rsp,0 | ||
105 | pushq $(1<<9) /* eflags - interrupts on */ | ||
106 | CFI_ADJUST_CFA_OFFSET 8 | ||
107 | /*CFI_REL_OFFSET rflags,0*/ | ||
108 | pushq $__KERNEL_CS /* cs */ | ||
109 | CFI_ADJUST_CFA_OFFSET 8 | ||
110 | /*CFI_REL_OFFSET cs,0*/ | ||
111 | pushq \child_rip /* rip */ | ||
112 | CFI_ADJUST_CFA_OFFSET 8 | ||
113 | CFI_REL_OFFSET rip,0 | ||
114 | pushq %rax /* orig rax */ | ||
115 | CFI_ADJUST_CFA_OFFSET 8 | ||
116 | .endm | ||
117 | |||
118 | .macro UNFAKE_STACK_FRAME | ||
119 | addq $8*6, %rsp | ||
120 | CFI_ADJUST_CFA_OFFSET -(6*8) | ||
121 | .endm | ||
122 | |||
123 | .macro CFI_DEFAULT_STACK start=1 | ||
124 | .if \start | ||
125 | CFI_STARTPROC simple | ||
126 | CFI_SIGNAL_FRAME | ||
127 | CFI_DEF_CFA rsp,SS+8 | ||
128 | .else | ||
129 | CFI_DEF_CFA_OFFSET SS+8 | ||
130 | .endif | ||
131 | CFI_REL_OFFSET r15,R15 | ||
132 | CFI_REL_OFFSET r14,R14 | ||
133 | CFI_REL_OFFSET r13,R13 | ||
134 | CFI_REL_OFFSET r12,R12 | ||
135 | CFI_REL_OFFSET rbp,RBP | ||
136 | CFI_REL_OFFSET rbx,RBX | ||
137 | CFI_REL_OFFSET r11,R11 | ||
138 | CFI_REL_OFFSET r10,R10 | ||
139 | CFI_REL_OFFSET r9,R9 | ||
140 | CFI_REL_OFFSET r8,R8 | ||
141 | CFI_REL_OFFSET rax,RAX | ||
142 | CFI_REL_OFFSET rcx,RCX | ||
143 | CFI_REL_OFFSET rdx,RDX | ||
144 | CFI_REL_OFFSET rsi,RSI | ||
145 | CFI_REL_OFFSET rdi,RDI | ||
146 | CFI_REL_OFFSET rip,RIP | ||
147 | /*CFI_REL_OFFSET cs,CS*/ | ||
148 | /*CFI_REL_OFFSET rflags,EFLAGS*/ | ||
149 | CFI_REL_OFFSET rsp,RSP | ||
150 | /*CFI_REL_OFFSET ss,SS*/ | ||
151 | .endm | ||
152 | /* | ||
153 | * A newly forked process directly context switches into this. | ||
154 | */ | ||
155 | /* rdi: prev */ | ||
156 | ENTRY(ret_from_fork) | ||
157 | CFI_DEFAULT_STACK | ||
158 | push kernel_eflags(%rip) | ||
159 | CFI_ADJUST_CFA_OFFSET 4 | ||
160 | popf # reset kernel eflags | ||
161 | CFI_ADJUST_CFA_OFFSET -4 | ||
162 | call schedule_tail | ||
163 | GET_THREAD_INFO(%rcx) | ||
164 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) | ||
165 | jnz rff_trace | ||
166 | rff_action: | ||
167 | RESTORE_REST | ||
168 | testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? | ||
169 | je int_ret_from_sys_call | ||
170 | testl $_TIF_IA32,threadinfo_flags(%rcx) | ||
171 | jnz int_ret_from_sys_call | ||
172 | RESTORE_TOP_OF_STACK %rdi,ARGOFFSET | ||
173 | jmp ret_from_sys_call | ||
174 | rff_trace: | ||
175 | movq %rsp,%rdi | ||
176 | call syscall_trace_leave | ||
177 | GET_THREAD_INFO(%rcx) | ||
178 | jmp rff_action | ||
179 | CFI_ENDPROC | ||
180 | END(ret_from_fork) | ||
181 | |||
182 | /* | ||
183 | * System call entry. Up to 6 arguments in registers are supported. | ||
184 | * | ||
185 | * SYSCALL does not save anything on the stack and does not change the | ||
186 | * stack pointer. | ||
187 | */ | ||
188 | |||
189 | /* | ||
190 | * Register setup: | ||
191 | * rax system call number | ||
192 | * rdi arg0 | ||
193 | * rcx return address for syscall/sysret, C arg3 | ||
194 | * rsi arg1 | ||
195 | * rdx arg2 | ||
196 | * r10 arg3 (--> moved to rcx for C) | ||
197 | * r8 arg4 | ||
198 | * r9 arg5 | ||
199 | * r11 eflags for syscall/sysret, temporary for C | ||
200 | * r12-r15,rbp,rbx saved by C code, not touched. | ||
201 | * | ||
202 | * Interrupts are off on entry. | ||
203 | * Only called from user space. | ||
204 | * | ||
205 | * XXX if we had a free scratch register we could save the RSP into the stack frame | ||
206 | * and report it properly in ps. Unfortunately we don't have one. | ||
207 | * | ||
208 | * When the user can change the frames, always force IRET. That is because | ||
209 | * it deals with non-canonical addresses better. SYSRET has trouble | ||
210 | * with them due to bugs in both AMD and Intel CPUs. | ||
211 | */ | ||
212 | |||
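To make the register convention above concrete, here is a hedged userspace illustration (not part of the patch): the libc syscall(2) wrapper arranges its arguments exactly as listed, so a raw write ends up with the syscall number in rax and the file descriptor, buffer and length in rdi, rsi and rdx before SYSCALL enters system_call.

/* Illustration only (userspace, not part of the patch): syscall(2) places its
 * arguments per the convention documented above, so this is equivalent to
 * write(1, "hello\n", 6) going through the SYSCALL fast path. */
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	syscall(SYS_write, 1, "hello\n", 6);
	return 0;
}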
213 | ENTRY(system_call) | ||
214 | CFI_STARTPROC simple | ||
215 | CFI_SIGNAL_FRAME | ||
216 | CFI_DEF_CFA rsp,PDA_STACKOFFSET | ||
217 | CFI_REGISTER rip,rcx | ||
218 | /*CFI_REGISTER rflags,r11*/ | ||
219 | swapgs | ||
220 | movq %rsp,%gs:pda_oldrsp | ||
221 | movq %gs:pda_kernelstack,%rsp | ||
222 | /* | ||
223 | * No need to follow this irqs off/on section - it's straight | ||
224 | * and short: | ||
225 | */ | ||
226 | sti | ||
227 | SAVE_ARGS 8,1 | ||
228 | movq %rax,ORIG_RAX-ARGOFFSET(%rsp) | ||
229 | movq %rcx,RIP-ARGOFFSET(%rsp) | ||
230 | CFI_REL_OFFSET rip,RIP-ARGOFFSET | ||
231 | GET_THREAD_INFO(%rcx) | ||
232 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) | ||
233 | jnz tracesys | ||
234 | cmpq $__NR_syscall_max,%rax | ||
235 | ja badsys | ||
236 | movq %r10,%rcx | ||
237 | call *sys_call_table(,%rax,8) # XXX: rip relative | ||
238 | movq %rax,RAX-ARGOFFSET(%rsp) | ||
239 | /* | ||
240 | * Syscall return path ending with SYSRET (fast path) | ||
241 | * Has incomplete stack frame and undefined top of stack. | ||
242 | */ | ||
243 | ret_from_sys_call: | ||
244 | movl $_TIF_ALLWORK_MASK,%edi | ||
245 | /* edi: flagmask */ | ||
246 | sysret_check: | ||
247 | GET_THREAD_INFO(%rcx) | ||
248 | cli | ||
249 | TRACE_IRQS_OFF | ||
250 | movl threadinfo_flags(%rcx),%edx | ||
251 | andl %edi,%edx | ||
252 | jnz sysret_careful | ||
253 | CFI_REMEMBER_STATE | ||
254 | /* | ||
255 | * sysretq will re-enable interrupts: | ||
256 | */ | ||
257 | TRACE_IRQS_ON | ||
258 | movq RIP-ARGOFFSET(%rsp),%rcx | ||
259 | CFI_REGISTER rip,rcx | ||
260 | RESTORE_ARGS 0,-ARG_SKIP,1 | ||
261 | /*CFI_REGISTER rflags,r11*/ | ||
262 | movq %gs:pda_oldrsp,%rsp | ||
263 | swapgs | ||
264 | sysretq | ||
265 | |||
266 | CFI_RESTORE_STATE | ||
267 | /* Handle reschedules */ | ||
268 | /* edx: work, edi: workmask */ | ||
269 | sysret_careful: | ||
270 | bt $TIF_NEED_RESCHED,%edx | ||
271 | jnc sysret_signal | ||
272 | TRACE_IRQS_ON | ||
273 | sti | ||
274 | pushq %rdi | ||
275 | CFI_ADJUST_CFA_OFFSET 8 | ||
276 | call schedule | ||
277 | popq %rdi | ||
278 | CFI_ADJUST_CFA_OFFSET -8 | ||
279 | jmp sysret_check | ||
280 | |||
281 | /* Handle a signal */ | ||
282 | sysret_signal: | ||
283 | TRACE_IRQS_ON | ||
284 | sti | ||
285 | testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx | ||
286 | jz 1f | ||
287 | |||
288 | /* Really a signal */ | ||
289 | /* edx: work flags (arg3) */ | ||
290 | leaq do_notify_resume(%rip),%rax | ||
291 | leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 | ||
292 | xorl %esi,%esi # oldset -> arg2 | ||
293 | call ptregscall_common | ||
294 | 1: movl $_TIF_NEED_RESCHED,%edi | ||
295 | /* Use IRET because user could have changed frame. This | ||
296 | works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ | ||
297 | cli | ||
298 | TRACE_IRQS_OFF | ||
299 | jmp int_with_check | ||
300 | |||
301 | badsys: | ||
302 | movq $-ENOSYS,RAX-ARGOFFSET(%rsp) | ||
303 | jmp ret_from_sys_call | ||
304 | |||
305 | /* Do syscall tracing */ | ||
306 | tracesys: | ||
307 | SAVE_REST | ||
308 | movq $-ENOSYS,RAX(%rsp) | ||
309 | FIXUP_TOP_OF_STACK %rdi | ||
310 | movq %rsp,%rdi | ||
311 | call syscall_trace_enter | ||
312 | LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ | ||
313 | RESTORE_REST | ||
314 | cmpq $__NR_syscall_max,%rax | ||
315 | movq $-ENOSYS,%rcx | ||
316 | cmova %rcx,%rax | ||
317 | ja 1f | ||
318 | movq %r10,%rcx /* fixup for C */ | ||
319 | call *sys_call_table(,%rax,8) | ||
320 | 1: movq %rax,RAX-ARGOFFSET(%rsp) | ||
321 | /* Use IRET because user could have changed frame */ | ||
322 | |||
323 | /* | ||
324 | * Syscall return path ending with IRET. | ||
325 | * Has correct top of stack, but partial stack frame. | ||
326 | */ | ||
327 | .globl int_ret_from_sys_call | ||
328 | int_ret_from_sys_call: | ||
329 | cli | ||
330 | TRACE_IRQS_OFF | ||
331 | testl $3,CS-ARGOFFSET(%rsp) | ||
332 | je retint_restore_args | ||
333 | movl $_TIF_ALLWORK_MASK,%edi | ||
334 | /* edi: mask to check */ | ||
335 | int_with_check: | ||
336 | GET_THREAD_INFO(%rcx) | ||
337 | movl threadinfo_flags(%rcx),%edx | ||
338 | andl %edi,%edx | ||
339 | jnz int_careful | ||
340 | andl $~TS_COMPAT,threadinfo_status(%rcx) | ||
341 | jmp retint_swapgs | ||
342 | |||
343 | /* Either reschedule or signal or syscall exit tracking needed. */ | ||
344 | /* First do a reschedule test. */ | ||
345 | /* edx: work, edi: workmask */ | ||
346 | int_careful: | ||
347 | bt $TIF_NEED_RESCHED,%edx | ||
348 | jnc int_very_careful | ||
349 | TRACE_IRQS_ON | ||
350 | sti | ||
351 | pushq %rdi | ||
352 | CFI_ADJUST_CFA_OFFSET 8 | ||
353 | call schedule | ||
354 | popq %rdi | ||
355 | CFI_ADJUST_CFA_OFFSET -8 | ||
356 | cli | ||
357 | TRACE_IRQS_OFF | ||
358 | jmp int_with_check | ||
359 | |||
360 | /* handle signals and tracing -- both require a full stack frame */ | ||
361 | int_very_careful: | ||
362 | TRACE_IRQS_ON | ||
363 | sti | ||
364 | SAVE_REST | ||
365 | /* Check for syscall exit trace */ | ||
366 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx | ||
367 | jz int_signal | ||
368 | pushq %rdi | ||
369 | CFI_ADJUST_CFA_OFFSET 8 | ||
370 | leaq 8(%rsp),%rdi # &ptregs -> arg1 | ||
371 | call syscall_trace_leave | ||
372 | popq %rdi | ||
373 | CFI_ADJUST_CFA_OFFSET -8 | ||
374 | andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi | ||
375 | jmp int_restore_rest | ||
376 | |||
377 | int_signal: | ||
378 | testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx | ||
379 | jz 1f | ||
380 | movq %rsp,%rdi # &ptregs -> arg1 | ||
381 | xorl %esi,%esi # oldset -> arg2 | ||
382 | call do_notify_resume | ||
383 | 1: movl $_TIF_NEED_RESCHED,%edi | ||
384 | int_restore_rest: | ||
385 | RESTORE_REST | ||
386 | cli | ||
387 | TRACE_IRQS_OFF | ||
388 | jmp int_with_check | ||
389 | CFI_ENDPROC | ||
390 | END(system_call) | ||
391 | |||
392 | /* | ||
393 | * Certain special system calls need to save a complete stack frame. | ||
394 | */ | ||
395 | |||
396 | .macro PTREGSCALL label,func,arg | ||
397 | .globl \label | ||
398 | \label: | ||
399 | leaq \func(%rip),%rax | ||
400 | leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ | ||
401 | jmp ptregscall_common | ||
402 | END(\label) | ||
403 | .endm | ||
404 | |||
405 | CFI_STARTPROC | ||
406 | |||
407 | PTREGSCALL stub_clone, sys_clone, %r8 | ||
408 | PTREGSCALL stub_fork, sys_fork, %rdi | ||
409 | PTREGSCALL stub_vfork, sys_vfork, %rdi | ||
410 | PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx | ||
411 | PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx | ||
412 | PTREGSCALL stub_iopl, sys_iopl, %rsi | ||
413 | |||
414 | ENTRY(ptregscall_common) | ||
415 | popq %r11 | ||
416 | CFI_ADJUST_CFA_OFFSET -8 | ||
417 | CFI_REGISTER rip, r11 | ||
418 | SAVE_REST | ||
419 | movq %r11, %r15 | ||
420 | CFI_REGISTER rip, r15 | ||
421 | FIXUP_TOP_OF_STACK %r11 | ||
422 | call *%rax | ||
423 | RESTORE_TOP_OF_STACK %r11 | ||
424 | movq %r15, %r11 | ||
425 | CFI_REGISTER rip, r11 | ||
426 | RESTORE_REST | ||
427 | pushq %r11 | ||
428 | CFI_ADJUST_CFA_OFFSET 8 | ||
429 | CFI_REL_OFFSET rip, 0 | ||
430 | ret | ||
431 | CFI_ENDPROC | ||
432 | END(ptregscall_common) | ||
433 | |||
434 | ENTRY(stub_execve) | ||
435 | CFI_STARTPROC | ||
436 | popq %r11 | ||
437 | CFI_ADJUST_CFA_OFFSET -8 | ||
438 | CFI_REGISTER rip, r11 | ||
439 | SAVE_REST | ||
440 | FIXUP_TOP_OF_STACK %r11 | ||
441 | call sys_execve | ||
442 | RESTORE_TOP_OF_STACK %r11 | ||
443 | movq %rax,RAX(%rsp) | ||
444 | RESTORE_REST | ||
445 | jmp int_ret_from_sys_call | ||
446 | CFI_ENDPROC | ||
447 | END(stub_execve) | ||
448 | |||
449 | /* | ||
450 | * sigreturn is special because it needs to restore all registers on return. | ||
451 | * This cannot be done with SYSRET, so use the IRET return path instead. | ||
452 | */ | ||
453 | ENTRY(stub_rt_sigreturn) | ||
454 | CFI_STARTPROC | ||
455 | addq $8, %rsp | ||
456 | CFI_ADJUST_CFA_OFFSET -8 | ||
457 | SAVE_REST | ||
458 | movq %rsp,%rdi | ||
459 | FIXUP_TOP_OF_STACK %r11 | ||
460 | call sys_rt_sigreturn | ||
461 | movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer | ||
462 | RESTORE_REST | ||
463 | jmp int_ret_from_sys_call | ||
464 | CFI_ENDPROC | ||
465 | END(stub_rt_sigreturn) | ||
466 | |||
467 | /* | ||
468 | * initial frame state for interrupts and exceptions | ||
469 | */ | ||
470 | .macro _frame ref | ||
471 | CFI_STARTPROC simple | ||
472 | CFI_SIGNAL_FRAME | ||
473 | CFI_DEF_CFA rsp,SS+8-\ref | ||
474 | /*CFI_REL_OFFSET ss,SS-\ref*/ | ||
475 | CFI_REL_OFFSET rsp,RSP-\ref | ||
476 | /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ | ||
477 | /*CFI_REL_OFFSET cs,CS-\ref*/ | ||
478 | CFI_REL_OFFSET rip,RIP-\ref | ||
479 | .endm | ||
480 | |||
481 | /* initial frame state for interrupts (and exceptions without error code) */ | ||
482 | #define INTR_FRAME _frame RIP | ||
483 | /* initial frame state for exceptions with error code (and interrupts with | ||
484 | vector already pushed) */ | ||
485 | #define XCPT_FRAME _frame ORIG_RAX | ||
486 | |||
487 | /* | ||
488 | * Interrupt entry/exit. | ||
489 | * | ||
490 | * Interrupt entry points save only callee-clobbered registers in the fast path. | ||
491 | * | ||
492 | * Entry runs with interrupts off. | ||
493 | */ | ||
494 | |||
495 | /* 0(%rsp): interrupt number */ | ||
496 | .macro interrupt func | ||
497 | cld | ||
498 | SAVE_ARGS | ||
499 | leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler | ||
500 | pushq %rbp | ||
501 | CFI_ADJUST_CFA_OFFSET 8 | ||
502 | CFI_REL_OFFSET rbp, 0 | ||
503 | movq %rsp,%rbp | ||
504 | CFI_DEF_CFA_REGISTER rbp | ||
505 | testl $3,CS(%rdi) | ||
506 | je 1f | ||
507 | swapgs | ||
508 | /* irqcount is used to check if a CPU is already on an interrupt | ||
509 | stack or not. While this is essentially redundant with preempt_count | ||
510 | it is a little cheaper to use a separate counter in the PDA | ||
511 | (short of moving irq_enter into assembly, which would be too | ||
512 | much work) */ | ||
513 | 1: incl %gs:pda_irqcount | ||
514 | cmoveq %gs:pda_irqstackptr,%rsp | ||
515 | push %rbp # backlink for old unwinder | ||
516 | /* | ||
517 | * We entered an interrupt context - irqs are off: | ||
518 | */ | ||
519 | TRACE_IRQS_OFF | ||
520 | call \func | ||
521 | .endm | ||
522 | |||
523 | ENTRY(common_interrupt) | ||
524 | XCPT_FRAME | ||
525 | interrupt do_IRQ | ||
526 | /* 0(%rsp): oldrsp-ARGOFFSET */ | ||
527 | ret_from_intr: | ||
528 | cli | ||
529 | TRACE_IRQS_OFF | ||
530 | decl %gs:pda_irqcount | ||
531 | leaveq | ||
532 | CFI_DEF_CFA_REGISTER rsp | ||
533 | CFI_ADJUST_CFA_OFFSET -8 | ||
534 | exit_intr: | ||
535 | GET_THREAD_INFO(%rcx) | ||
536 | testl $3,CS-ARGOFFSET(%rsp) | ||
537 | je retint_kernel | ||
538 | |||
539 | /* Interrupt came from user space */ | ||
540 | /* | ||
541 | * Has a correct top of stack, but a partial stack frame | ||
542 | * %rcx: thread info. Interrupts off. | ||
543 | */ | ||
544 | retint_with_reschedule: | ||
545 | movl $_TIF_WORK_MASK,%edi | ||
546 | retint_check: | ||
547 | movl threadinfo_flags(%rcx),%edx | ||
548 | andl %edi,%edx | ||
549 | CFI_REMEMBER_STATE | ||
550 | jnz retint_careful | ||
551 | retint_swapgs: | ||
552 | /* | ||
553 | * The iretq could re-enable interrupts: | ||
554 | */ | ||
555 | cli | ||
556 | TRACE_IRQS_IRETQ | ||
557 | swapgs | ||
558 | jmp restore_args | ||
559 | |||
560 | retint_restore_args: | ||
561 | cli | ||
562 | /* | ||
563 | * The iretq could re-enable interrupts: | ||
564 | */ | ||
565 | TRACE_IRQS_IRETQ | ||
566 | restore_args: | ||
567 | RESTORE_ARGS 0,8,0 | ||
568 | iret_label: | ||
569 | iretq | ||
570 | |||
571 | .section __ex_table,"a" | ||
572 | .quad iret_label,bad_iret | ||
573 | .previous | ||
574 | .section .fixup,"ax" | ||
575 | /* force a signal here? this matches i386 behaviour */ | ||
576 | /* running with kernel gs */ | ||
577 | bad_iret: | ||
578 | movq $11,%rdi /* SIGSEGV */ | ||
579 | TRACE_IRQS_ON | ||
580 | sti | ||
581 | jmp do_exit | ||
582 | .previous | ||
583 | |||
584 | /* edi: workmask, edx: work */ | ||
585 | retint_careful: | ||
586 | CFI_RESTORE_STATE | ||
587 | bt $TIF_NEED_RESCHED,%edx | ||
588 | jnc retint_signal | ||
589 | TRACE_IRQS_ON | ||
590 | sti | ||
591 | pushq %rdi | ||
592 | CFI_ADJUST_CFA_OFFSET 8 | ||
593 | call schedule | ||
594 | popq %rdi | ||
595 | CFI_ADJUST_CFA_OFFSET -8 | ||
596 | GET_THREAD_INFO(%rcx) | ||
597 | cli | ||
598 | TRACE_IRQS_OFF | ||
599 | jmp retint_check | ||
600 | |||
601 | retint_signal: | ||
602 | testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx | ||
603 | jz retint_swapgs | ||
604 | TRACE_IRQS_ON | ||
605 | sti | ||
606 | SAVE_REST | ||
607 | movq $-1,ORIG_RAX(%rsp) | ||
608 | xorl %esi,%esi # oldset | ||
609 | movq %rsp,%rdi # &pt_regs | ||
610 | call do_notify_resume | ||
611 | RESTORE_REST | ||
612 | cli | ||
613 | TRACE_IRQS_OFF | ||
614 | movl $_TIF_NEED_RESCHED,%edi | ||
615 | GET_THREAD_INFO(%rcx) | ||
616 | jmp retint_check | ||
617 | |||
618 | #ifdef CONFIG_PREEMPT | ||
619 | /* Returning to kernel space. Check if we need preemption */ | ||
620 | /* rcx: threadinfo. interrupts off. */ | ||
621 | ENTRY(retint_kernel) | ||
622 | cmpl $0,threadinfo_preempt_count(%rcx) | ||
623 | jnz retint_restore_args | ||
624 | bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) | ||
625 | jnc retint_restore_args | ||
626 | bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ | ||
627 | jnc retint_restore_args | ||
628 | call preempt_schedule_irq | ||
629 | jmp exit_intr | ||
630 | #endif | ||
631 | |||
632 | CFI_ENDPROC | ||
633 | END(common_interrupt) | ||
634 | |||
635 | /* | ||
636 | * APIC interrupts. | ||
637 | */ | ||
638 | .macro apicinterrupt num,func | ||
639 | INTR_FRAME | ||
640 | pushq $~(\num) | ||
641 | CFI_ADJUST_CFA_OFFSET 8 | ||
642 | interrupt \func | ||
643 | jmp ret_from_intr | ||
644 | CFI_ENDPROC | ||
645 | .endm | ||
646 | |||
647 | ENTRY(thermal_interrupt) | ||
648 | apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt | ||
649 | END(thermal_interrupt) | ||
650 | |||
651 | ENTRY(threshold_interrupt) | ||
652 | apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt | ||
653 | END(threshold_interrupt) | ||
654 | |||
655 | #ifdef CONFIG_SMP | ||
656 | ENTRY(reschedule_interrupt) | ||
657 | apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt | ||
658 | END(reschedule_interrupt) | ||
659 | |||
660 | .macro INVALIDATE_ENTRY num | ||
661 | ENTRY(invalidate_interrupt\num) | ||
662 | apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt | ||
663 | END(invalidate_interrupt\num) | ||
664 | .endm | ||
665 | |||
666 | INVALIDATE_ENTRY 0 | ||
667 | INVALIDATE_ENTRY 1 | ||
668 | INVALIDATE_ENTRY 2 | ||
669 | INVALIDATE_ENTRY 3 | ||
670 | INVALIDATE_ENTRY 4 | ||
671 | INVALIDATE_ENTRY 5 | ||
672 | INVALIDATE_ENTRY 6 | ||
673 | INVALIDATE_ENTRY 7 | ||
674 | |||
675 | ENTRY(call_function_interrupt) | ||
676 | apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt | ||
677 | END(call_function_interrupt) | ||
678 | ENTRY(irq_move_cleanup_interrupt) | ||
679 | apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt | ||
680 | END(irq_move_cleanup_interrupt) | ||
681 | #endif | ||
682 | |||
683 | ENTRY(apic_timer_interrupt) | ||
684 | apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt | ||
685 | END(apic_timer_interrupt) | ||
686 | |||
687 | ENTRY(error_interrupt) | ||
688 | apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt | ||
689 | END(error_interrupt) | ||
690 | |||
691 | ENTRY(spurious_interrupt) | ||
692 | apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt | ||
693 | END(spurious_interrupt) | ||
694 | |||
695 | /* | ||
696 | * Exception entry points. | ||
697 | */ | ||
698 | .macro zeroentry sym | ||
699 | INTR_FRAME | ||
700 | pushq $0 /* push error code/oldrax */ | ||
701 | CFI_ADJUST_CFA_OFFSET 8 | ||
702 | pushq %rax /* push real oldrax to the rdi slot */ | ||
703 | CFI_ADJUST_CFA_OFFSET 8 | ||
704 | CFI_REL_OFFSET rax,0 | ||
705 | leaq \sym(%rip),%rax | ||
706 | jmp error_entry | ||
707 | CFI_ENDPROC | ||
708 | .endm | ||
709 | |||
710 | .macro errorentry sym | ||
711 | XCPT_FRAME | ||
712 | pushq %rax | ||
713 | CFI_ADJUST_CFA_OFFSET 8 | ||
714 | CFI_REL_OFFSET rax,0 | ||
715 | leaq \sym(%rip),%rax | ||
716 | jmp error_entry | ||
717 | CFI_ENDPROC | ||
718 | .endm | ||
719 | |||
720 | /* error code is on the stack already */ | ||
721 | /* handle NMI like exceptions that can happen everywhere */ | ||
722 | .macro paranoidentry sym, ist=0, irqtrace=1 | ||
723 | SAVE_ALL | ||
724 | cld | ||
725 | movl $1,%ebx | ||
726 | movl $MSR_GS_BASE,%ecx | ||
727 | rdmsr | ||
728 | testl %edx,%edx | ||
729 | js 1f | ||
730 | swapgs | ||
731 | xorl %ebx,%ebx | ||
732 | 1: | ||
733 | .if \ist | ||
734 | movq %gs:pda_data_offset, %rbp | ||
735 | .endif | ||
736 | movq %rsp,%rdi | ||
737 | movq ORIG_RAX(%rsp),%rsi | ||
738 | movq $-1,ORIG_RAX(%rsp) | ||
739 | .if \ist | ||
740 | subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) | ||
741 | .endif | ||
742 | call \sym | ||
743 | .if \ist | ||
744 | addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) | ||
745 | .endif | ||
746 | cli | ||
747 | .if \irqtrace | ||
748 | TRACE_IRQS_OFF | ||
749 | .endif | ||
750 | .endm | ||
751 | |||
752 | /* | ||
753 | * "Paranoid" exit path from exception stack. | ||
754 | * Paranoid because this is used by NMIs and cannot take | ||
755 | * any kernel state for granted. | ||
756 | * We don't do kernel preemption checks here, because only | ||
757 | * NMI should be common and it does not enable IRQs and | ||
758 | * cannot get reschedule ticks. | ||
759 | * | ||
760 | * "trace" is 0 for the NMI handler only, because irq-tracing | ||
761 | * is fundamentally NMI-unsafe. (we cannot change the soft and | ||
762 | * hard flags at once, atomically) | ||
763 | */ | ||
764 | .macro paranoidexit trace=1 | ||
765 | /* ebx: no swapgs flag */ | ||
766 | paranoid_exit\trace: | ||
767 | testl %ebx,%ebx /* swapgs needed? */ | ||
768 | jnz paranoid_restore\trace | ||
769 | testl $3,CS(%rsp) | ||
770 | jnz paranoid_userspace\trace | ||
771 | paranoid_swapgs\trace: | ||
772 | .if \trace | ||
773 | TRACE_IRQS_IRETQ 0 | ||
774 | .endif | ||
775 | swapgs | ||
776 | paranoid_restore\trace: | ||
777 | RESTORE_ALL 8 | ||
778 | iretq | ||
779 | paranoid_userspace\trace: | ||
780 | GET_THREAD_INFO(%rcx) | ||
781 | movl threadinfo_flags(%rcx),%ebx | ||
782 | andl $_TIF_WORK_MASK,%ebx | ||
783 | jz paranoid_swapgs\trace | ||
784 | movq %rsp,%rdi /* &pt_regs */ | ||
785 | call sync_regs | ||
786 | movq %rax,%rsp /* switch stack for scheduling */ | ||
787 | testl $_TIF_NEED_RESCHED,%ebx | ||
788 | jnz paranoid_schedule\trace | ||
789 | movl %ebx,%edx /* arg3: thread flags */ | ||
790 | .if \trace | ||
791 | TRACE_IRQS_ON | ||
792 | .endif | ||
793 | sti | ||
794 | xorl %esi,%esi /* arg2: oldset */ | ||
795 | movq %rsp,%rdi /* arg1: &pt_regs */ | ||
796 | call do_notify_resume | ||
797 | cli | ||
798 | .if \trace | ||
799 | TRACE_IRQS_OFF | ||
800 | .endif | ||
801 | jmp paranoid_userspace\trace | ||
802 | paranoid_schedule\trace: | ||
803 | .if \trace | ||
804 | TRACE_IRQS_ON | ||
805 | .endif | ||
806 | sti | ||
807 | call schedule | ||
808 | cli | ||
809 | .if \trace | ||
810 | TRACE_IRQS_OFF | ||
811 | .endif | ||
812 | jmp paranoid_userspace\trace | ||
813 | CFI_ENDPROC | ||
814 | .endm | ||
815 | |||
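A hedged C model of the swapgs bookkeeping used by paranoidentry and the paranoidexit macro above (an illustration, not code from the patch): the sign of the GS base read with rdmsr tells whether the kernel GS is already loaded, and %ebx records the decision so the exit path knows whether to swapgs again.

/* Hedged model of the paranoid entry/exit swapgs decision. gs_base stands in
 * for the value read from MSR_GS_BASE; in the asm, ebx is the "no swapgs"
 * flag, i.e. ebx == !swapgs_needed_on_exit(). */
#include <stdint.h>
#include <stdio.h>

static int swapgs_needed_on_exit(int64_t gs_base)
{
	/* A negative (kernel-half) base means GS was already the kernel's at
	 * entry, so neither entry nor exit performs swapgs. */
	return gs_base >= 0;
}

int main(void)
{
	printf("user base:   swapgs on exit = %d\n",
	       swapgs_needed_on_exit(0x00007fff00000000LL));
	printf("kernel base: swapgs on exit = %d\n",
	       swapgs_needed_on_exit((int64_t)0xffff880000000000ULL));
	return 0;
}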
816 | /* | ||
817 | * Exception entry point. This expects an error code/orig_rax on the stack | ||
818 | * and the exception handler in %rax. | ||
819 | */ | ||
820 | KPROBE_ENTRY(error_entry) | ||
821 | _frame RDI | ||
822 | CFI_REL_OFFSET rax,0 | ||
823 | /* rdi slot contains rax, oldrax contains error code */ | ||
824 | cld | ||
825 | subq $14*8,%rsp | ||
826 | CFI_ADJUST_CFA_OFFSET (14*8) | ||
827 | movq %rsi,13*8(%rsp) | ||
828 | CFI_REL_OFFSET rsi,RSI | ||
829 | movq 14*8(%rsp),%rsi /* load rax from rdi slot */ | ||
830 | CFI_REGISTER rax,rsi | ||
831 | movq %rdx,12*8(%rsp) | ||
832 | CFI_REL_OFFSET rdx,RDX | ||
833 | movq %rcx,11*8(%rsp) | ||
834 | CFI_REL_OFFSET rcx,RCX | ||
835 | movq %rsi,10*8(%rsp) /* store rax */ | ||
836 | CFI_REL_OFFSET rax,RAX | ||
837 | movq %r8, 9*8(%rsp) | ||
838 | CFI_REL_OFFSET r8,R8 | ||
839 | movq %r9, 8*8(%rsp) | ||
840 | CFI_REL_OFFSET r9,R9 | ||
841 | movq %r10,7*8(%rsp) | ||
842 | CFI_REL_OFFSET r10,R10 | ||
843 | movq %r11,6*8(%rsp) | ||
844 | CFI_REL_OFFSET r11,R11 | ||
845 | movq %rbx,5*8(%rsp) | ||
846 | CFI_REL_OFFSET rbx,RBX | ||
847 | movq %rbp,4*8(%rsp) | ||
848 | CFI_REL_OFFSET rbp,RBP | ||
849 | movq %r12,3*8(%rsp) | ||
850 | CFI_REL_OFFSET r12,R12 | ||
851 | movq %r13,2*8(%rsp) | ||
852 | CFI_REL_OFFSET r13,R13 | ||
853 | movq %r14,1*8(%rsp) | ||
854 | CFI_REL_OFFSET r14,R14 | ||
855 | movq %r15,(%rsp) | ||
856 | CFI_REL_OFFSET r15,R15 | ||
857 | xorl %ebx,%ebx | ||
858 | testl $3,CS(%rsp) | ||
859 | je error_kernelspace | ||
860 | error_swapgs: | ||
861 | swapgs | ||
862 | error_sti: | ||
863 | movq %rdi,RDI(%rsp) | ||
864 | CFI_REL_OFFSET rdi,RDI | ||
865 | movq %rsp,%rdi | ||
866 | movq ORIG_RAX(%rsp),%rsi /* get error code */ | ||
867 | movq $-1,ORIG_RAX(%rsp) | ||
868 | call *%rax | ||
869 | /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ | ||
870 | error_exit: | ||
871 | movl %ebx,%eax | ||
872 | RESTORE_REST | ||
873 | cli | ||
874 | TRACE_IRQS_OFF | ||
875 | GET_THREAD_INFO(%rcx) | ||
876 | testl %eax,%eax | ||
877 | jne retint_kernel | ||
878 | movl threadinfo_flags(%rcx),%edx | ||
879 | movl $_TIF_WORK_MASK,%edi | ||
880 | andl %edi,%edx | ||
881 | jnz retint_careful | ||
882 | /* | ||
883 | * The iret might restore flags: | ||
884 | */ | ||
885 | TRACE_IRQS_IRETQ | ||
886 | swapgs | ||
887 | RESTORE_ARGS 0,8,0 | ||
888 | jmp iret_label | ||
889 | CFI_ENDPROC | ||
890 | |||
891 | error_kernelspace: | ||
892 | incl %ebx | ||
893 | /* There are two places in the kernel that can potentially fault with | ||
894 | usergs. Handle them here. The exception handlers after | ||
895 | iret run with kernel gs again, so don't set the user space flag. | ||
896 | B-stepping K8s sometimes report a truncated RIP for IRET | ||
897 | exceptions returning to compat mode. Check for these here too. */ | ||
898 | leaq iret_label(%rip),%rbp | ||
899 | cmpq %rbp,RIP(%rsp) | ||
900 | je error_swapgs | ||
901 | movl %ebp,%ebp /* zero extend */ | ||
902 | cmpq %rbp,RIP(%rsp) | ||
903 | je error_swapgs | ||
904 | cmpq $gs_change,RIP(%rsp) | ||
905 | je error_swapgs | ||
906 | jmp error_sti | ||
907 | KPROBE_END(error_entry) | ||
908 | |||
909 | /* Reload gs selector with exception handling */ | ||
910 | /* edi: new selector */ | ||
911 | ENTRY(load_gs_index) | ||
912 | CFI_STARTPROC | ||
913 | pushf | ||
914 | CFI_ADJUST_CFA_OFFSET 8 | ||
915 | cli | ||
916 | swapgs | ||
917 | gs_change: | ||
918 | movl %edi,%gs | ||
919 | 2: mfence /* workaround */ | ||
920 | swapgs | ||
921 | popf | ||
922 | CFI_ADJUST_CFA_OFFSET -8 | ||
923 | ret | ||
924 | CFI_ENDPROC | ||
925 | ENDPROC(load_gs_index) | ||
926 | |||
927 | .section __ex_table,"a" | ||
928 | .align 8 | ||
929 | .quad gs_change,bad_gs | ||
930 | .previous | ||
931 | .section .fixup,"ax" | ||
932 | /* running with kernelgs */ | ||
933 | bad_gs: | ||
934 | swapgs /* switch back to user gs */ | ||
935 | xorl %eax,%eax | ||
936 | movl %eax,%gs | ||
937 | jmp 2b | ||
938 | .previous | ||
939 | |||
940 | /* | ||
941 | * Create a kernel thread. | ||
942 | * | ||
943 | * C extern interface: | ||
944 | * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) | ||
945 | * | ||
946 | * asm input arguments: | ||
947 | * rdi: fn, rsi: arg, rdx: flags | ||
948 | */ | ||
949 | ENTRY(kernel_thread) | ||
950 | CFI_STARTPROC | ||
951 | FAKE_STACK_FRAME $child_rip | ||
952 | SAVE_ALL | ||
953 | |||
954 | # rdi: flags, rsi: usp, rdx: will be &pt_regs | ||
955 | movq %rdx,%rdi | ||
956 | orq kernel_thread_flags(%rip),%rdi | ||
957 | movq $-1, %rsi | ||
958 | movq %rsp, %rdx | ||
959 | |||
960 | xorl %r8d,%r8d | ||
961 | xorl %r9d,%r9d | ||
962 | |||
963 | # clone now | ||
964 | call do_fork | ||
965 | movq %rax,RAX(%rsp) | ||
966 | xorl %edi,%edi | ||
967 | |||
968 | /* | ||
969 | * It isn't worth checking for a reschedule here, | ||
970 | * so internally to the x86_64 port you can rely on kernel_thread() | ||
971 | * not rescheduling the child before returning; this avoids the need | ||
972 | * for hacks, for example to fork off the per-CPU idle tasks. | ||
973 | * [Hopefully no generic code relies on the reschedule -AK] | ||
974 | */ | ||
975 | RESTORE_ALL | ||
976 | UNFAKE_STACK_FRAME | ||
977 | ret | ||
978 | CFI_ENDPROC | ||
979 | ENDPROC(kernel_thread) | ||
980 | |||
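A hedged usage sketch of the C interface quoted in the comment above kernel_thread (kernel-context code, not a standalone program; the thread function, the flag choice and the header locations are assumptions for the example):

/* Illustrative kernel-context use of kernel_thread(); assumes the prototype
 * quoted in the comment above and CLONE_* flags from <linux/sched.h>. */
#include <linux/kernel.h>
#include <linux/sched.h>

static int example_thread_fn(void *arg)
{
	/* ... per-thread work using arg ... */
	return 0;
}

static void example_spawn(void)
{
	long pid = kernel_thread(example_thread_fn, NULL, CLONE_FS | CLONE_FILES);
	if (pid < 0)
		printk(KERN_ERR "kernel_thread failed: %ld\n", pid);
}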
981 | child_rip: | ||
982 | pushq $0 # fake return address | ||
983 | CFI_STARTPROC | ||
984 | /* | ||
985 | * Here we are in the child and the registers are set as they were | ||
986 | * at kernel_thread() invocation in the parent. | ||
987 | */ | ||
988 | movq %rdi, %rax | ||
989 | movq %rsi, %rdi | ||
990 | call *%rax | ||
991 | # exit | ||
992 | xorl %edi, %edi | ||
993 | call do_exit | ||
994 | CFI_ENDPROC | ||
995 | ENDPROC(child_rip) | ||
996 | |||
997 | /* | ||
998 | * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. | ||
999 | * | ||
1000 | * C extern interface: | ||
1001 | * extern long execve(char *name, char **argv, char **envp) | ||
1002 | * | ||
1003 | * asm input arguments: | ||
1004 | * rdi: name, rsi: argv, rdx: envp | ||
1005 | * | ||
1006 | * We want to fall back into: | ||
1007 | * extern long sys_execve(char *name, char **argv, char **envp, struct pt_regs regs) | ||
1008 | * | ||
1009 | * do_sys_execve asm fallback arguments: | ||
1010 | * rdi: name, rsi: argv, rdx: envp, fake frame on the stack | ||
1011 | */ | ||
1012 | ENTRY(kernel_execve) | ||
1013 | CFI_STARTPROC | ||
1014 | FAKE_STACK_FRAME $0 | ||
1015 | SAVE_ALL | ||
1016 | call sys_execve | ||
1017 | movq %rax, RAX(%rsp) | ||
1018 | RESTORE_REST | ||
1019 | testq %rax,%rax | ||
1020 | je int_ret_from_sys_call | ||
1021 | RESTORE_ARGS | ||
1022 | UNFAKE_STACK_FRAME | ||
1023 | ret | ||
1024 | CFI_ENDPROC | ||
1025 | ENDPROC(kernel_execve) | ||
1026 | |||
1027 | KPROBE_ENTRY(page_fault) | ||
1028 | errorentry do_page_fault | ||
1029 | KPROBE_END(page_fault) | ||
1030 | |||
1031 | ENTRY(coprocessor_error) | ||
1032 | zeroentry do_coprocessor_error | ||
1033 | END(coprocessor_error) | ||
1034 | |||
1035 | ENTRY(simd_coprocessor_error) | ||
1036 | zeroentry do_simd_coprocessor_error | ||
1037 | END(simd_coprocessor_error) | ||
1038 | |||
1039 | ENTRY(device_not_available) | ||
1040 | zeroentry math_state_restore | ||
1041 | END(device_not_available) | ||
1042 | |||
1043 | /* runs on exception stack */ | ||
1044 | KPROBE_ENTRY(debug) | ||
1045 | INTR_FRAME | ||
1046 | pushq $0 | ||
1047 | CFI_ADJUST_CFA_OFFSET 8 | ||
1048 | paranoidentry do_debug, DEBUG_STACK | ||
1049 | paranoidexit | ||
1050 | KPROBE_END(debug) | ||
1051 | |||
1052 | /* runs on exception stack */ | ||
1053 | KPROBE_ENTRY(nmi) | ||
1054 | INTR_FRAME | ||
1055 | pushq $-1 | ||
1056 | CFI_ADJUST_CFA_OFFSET 8 | ||
1057 | paranoidentry do_nmi, 0, 0 | ||
1058 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
1059 | paranoidexit 0 | ||
1060 | #else | ||
1061 | jmp paranoid_exit1 | ||
1062 | CFI_ENDPROC | ||
1063 | #endif | ||
1064 | KPROBE_END(nmi) | ||
1065 | |||
1066 | KPROBE_ENTRY(int3) | ||
1067 | INTR_FRAME | ||
1068 | pushq $0 | ||
1069 | CFI_ADJUST_CFA_OFFSET 8 | ||
1070 | paranoidentry do_int3, DEBUG_STACK | ||
1071 | jmp paranoid_exit1 | ||
1072 | CFI_ENDPROC | ||
1073 | KPROBE_END(int3) | ||
1074 | |||
1075 | ENTRY(overflow) | ||
1076 | zeroentry do_overflow | ||
1077 | END(overflow) | ||
1078 | |||
1079 | ENTRY(bounds) | ||
1080 | zeroentry do_bounds | ||
1081 | END(bounds) | ||
1082 | |||
1083 | ENTRY(invalid_op) | ||
1084 | zeroentry do_invalid_op | ||
1085 | END(invalid_op) | ||
1086 | |||
1087 | ENTRY(coprocessor_segment_overrun) | ||
1088 | zeroentry do_coprocessor_segment_overrun | ||
1089 | END(coprocessor_segment_overrun) | ||
1090 | |||
1091 | ENTRY(reserved) | ||
1092 | zeroentry do_reserved | ||
1093 | END(reserved) | ||
1094 | |||
1095 | /* runs on exception stack */ | ||
1096 | ENTRY(double_fault) | ||
1097 | XCPT_FRAME | ||
1098 | paranoidentry do_double_fault | ||
1099 | jmp paranoid_exit1 | ||
1100 | CFI_ENDPROC | ||
1101 | END(double_fault) | ||
1102 | |||
1103 | ENTRY(invalid_TSS) | ||
1104 | errorentry do_invalid_TSS | ||
1105 | END(invalid_TSS) | ||
1106 | |||
1107 | ENTRY(segment_not_present) | ||
1108 | errorentry do_segment_not_present | ||
1109 | END(segment_not_present) | ||
1110 | |||
1111 | /* runs on exception stack */ | ||
1112 | ENTRY(stack_segment) | ||
1113 | XCPT_FRAME | ||
1114 | paranoidentry do_stack_segment | ||
1115 | jmp paranoid_exit1 | ||
1116 | CFI_ENDPROC | ||
1117 | END(stack_segment) | ||
1118 | |||
1119 | KPROBE_ENTRY(general_protection) | ||
1120 | errorentry do_general_protection | ||
1121 | KPROBE_END(general_protection) | ||
1122 | |||
1123 | ENTRY(alignment_check) | ||
1124 | errorentry do_alignment_check | ||
1125 | END(alignment_check) | ||
1126 | |||
1127 | ENTRY(divide_error) | ||
1128 | zeroentry do_divide_error | ||
1129 | END(divide_error) | ||
1130 | |||
1131 | ENTRY(spurious_interrupt_bug) | ||
1132 | zeroentry do_spurious_interrupt_bug | ||
1133 | END(spurious_interrupt_bug) | ||
1134 | |||
1135 | #ifdef CONFIG_X86_MCE | ||
1136 | /* runs on exception stack */ | ||
1137 | ENTRY(machine_check) | ||
1138 | INTR_FRAME | ||
1139 | pushq $0 | ||
1140 | CFI_ADJUST_CFA_OFFSET 8 | ||
1141 | paranoidentry do_machine_check | ||
1142 | jmp paranoid_exit1 | ||
1143 | CFI_ENDPROC | ||
1144 | END(machine_check) | ||
1145 | #endif | ||
1146 | |||
1147 | /* Call softirq on interrupt stack. Interrupts are off. */ | ||
1148 | ENTRY(call_softirq) | ||
1149 | CFI_STARTPROC | ||
1150 | push %rbp | ||
1151 | CFI_ADJUST_CFA_OFFSET 8 | ||
1152 | CFI_REL_OFFSET rbp,0 | ||
1153 | mov %rsp,%rbp | ||
1154 | CFI_DEF_CFA_REGISTER rbp | ||
1155 | incl %gs:pda_irqcount | ||
1156 | cmove %gs:pda_irqstackptr,%rsp | ||
1157 | push %rbp # backlink for old unwinder | ||
1158 | call __do_softirq | ||
1159 | leaveq | ||
1160 | CFI_DEF_CFA_REGISTER rsp | ||
1161 | CFI_ADJUST_CFA_OFFSET -8 | ||
1162 | decl %gs:pda_irqcount | ||
1163 | ret | ||
1164 | CFI_ENDPROC | ||
1165 | ENDPROC(call_softirq) | ||
1166 | |||
1167 | KPROBE_ENTRY(ignore_sysret) | ||
1168 | CFI_STARTPROC | ||
1169 | mov $-ENOSYS,%eax | ||
1170 | sysret | ||
1171 | CFI_ENDPROC | ||
1172 | ENDPROC(ignore_sysret) | ||
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c new file mode 100644 index 000000000000..47496a40e84f --- /dev/null +++ b/arch/x86/kernel/genapic_64.c | |||
@@ -0,0 +1,66 @@ | |||
1 | /* | ||
2 | * Copyright 2004 James Cleverdon, IBM. | ||
3 | * Subject to the GNU Public License, v.2 | ||
4 | * | ||
5 | * Generic APIC sub-arch probe layer. | ||
6 | * | ||
7 | * Hacked for x86-64 by James Cleverdon from i386 architecture code by | ||
8 | * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and | ||
9 | * James Cleverdon. | ||
10 | */ | ||
11 | #include <linux/threads.h> | ||
12 | #include <linux/cpumask.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/ctype.h> | ||
17 | #include <linux/init.h> | ||
18 | |||
19 | #include <asm/smp.h> | ||
20 | #include <asm/ipi.h> | ||
21 | #include <asm/genapic.h> | ||
22 | |||
23 | #ifdef CONFIG_ACPI | ||
24 | #include <acpi/acpi_bus.h> | ||
25 | #endif | ||
26 | |||
27 | /* which logical CPU number maps to which CPU (physical APIC ID) */ | ||
28 | u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly | ||
29 | = { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
30 | EXPORT_SYMBOL(x86_cpu_to_apicid); | ||
31 | |||
32 | u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
33 | |||
34 | struct genapic __read_mostly *genapic = &apic_flat; | ||
35 | |||
36 | /* | ||
37 | * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. | ||
38 | */ | ||
39 | void __init setup_apic_routing(void) | ||
40 | { | ||
41 | #ifdef CONFIG_ACPI | ||
42 | /* | ||
43 | * Quirk: some x86_64 machines can only use physical APIC mode | ||
44 | * regardless of how many processors are present (x86_64 ES7000 | ||
45 | * is an example). | ||
46 | */ | ||
47 | if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && | ||
48 | (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) | ||
49 | genapic = &apic_physflat; | ||
50 | else | ||
51 | #endif | ||
52 | |||
53 | if (cpus_weight(cpu_possible_map) <= 8) | ||
54 | genapic = &apic_flat; | ||
55 | else | ||
56 | genapic = &apic_physflat; | ||
57 | |||
58 | printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); | ||
59 | } | ||
60 | |||
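The routing choice above reduces to a small decision table; a hedged standalone restatement (illustration only, not part of the patch): physical flat is forced by the ACPI quirk or by more than 8 possible CPUs, otherwise logical flat mode is kept.

/* Standalone restatement of the setup_apic_routing() decision, for reading. */
#include <stdio.h>

static const char *apic_mode(int acpi_forces_physical, int possible_cpus)
{
	if (acpi_forces_physical)
		return "physical flat";
	return possible_cpus <= 8 ? "flat" : "physical flat";
}

int main(void)
{
	printf("%s\n", apic_mode(0, 4));   /* flat */
	printf("%s\n", apic_mode(0, 16));  /* physical flat */
	printf("%s\n", apic_mode(1, 2));   /* physical flat (ACPI quirk) */
	return 0;
}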
61 | /* Same for both flat and physical. */ | ||
62 | |||
63 | void send_IPI_self(int vector) | ||
64 | { | ||
65 | __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); | ||
66 | } | ||
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c new file mode 100644 index 000000000000..ecb01eefdd27 --- /dev/null +++ b/arch/x86/kernel/genapic_flat_64.c | |||
@@ -0,0 +1,194 @@ | |||
1 | /* | ||
2 | * Copyright 2004 James Cleverdon, IBM. | ||
3 | * Subject to the GNU Public License, v.2 | ||
4 | * | ||
5 | * Flat APIC subarch code. | ||
6 | * | ||
7 | * Hacked for x86-64 by James Cleverdon from i386 architecture code by | ||
8 | * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and | ||
9 | * James Cleverdon. | ||
10 | */ | ||
11 | #include <linux/errno.h> | ||
12 | #include <linux/threads.h> | ||
13 | #include <linux/cpumask.h> | ||
14 | #include <linux/string.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/ctype.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <asm/smp.h> | ||
19 | #include <asm/ipi.h> | ||
20 | #include <asm/genapic.h> | ||
21 | |||
22 | static cpumask_t flat_target_cpus(void) | ||
23 | { | ||
24 | return cpu_online_map; | ||
25 | } | ||
26 | |||
27 | static cpumask_t flat_vector_allocation_domain(int cpu) | ||
28 | { | ||
29 | /* Careful. Some cpus do not strictly honor the set of cpus | ||
30 | * specified in the interrupt destination when using lowest | ||
31 | * priority interrupt delivery mode. | ||
32 | * | ||
33 | * In particular there was a hyperthreading cpu observed to | ||
34 | * deliver interrupts to the wrong hyperthread when only one | ||
35 | * hyperthread was specified in the interrupt destination. | ||
36 | */ | ||
37 | cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; | ||
38 | return domain; | ||
39 | } | ||
40 | |||
41 | /* | ||
42 | * Set up the logical destination ID. | ||
43 | * | ||
44 | * Intel recommends setting DFR, LDR and TPR before enabling | ||
45 | * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel | ||
46 | * document number 292116). So here it goes... | ||
47 | */ | ||
48 | static void flat_init_apic_ldr(void) | ||
49 | { | ||
50 | unsigned long val; | ||
51 | unsigned long num, id; | ||
52 | |||
53 | num = smp_processor_id(); | ||
54 | id = 1UL << num; | ||
55 | x86_cpu_to_log_apicid[num] = id; | ||
56 | apic_write(APIC_DFR, APIC_DFR_FLAT); | ||
57 | val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; | ||
58 | val |= SET_APIC_LOGICAL_ID(id); | ||
59 | apic_write(APIC_LDR, val); | ||
60 | } | ||
61 | |||
62 | static void flat_send_IPI_mask(cpumask_t cpumask, int vector) | ||
63 | { | ||
64 | unsigned long mask = cpus_addr(cpumask)[0]; | ||
65 | unsigned long flags; | ||
66 | |||
67 | local_irq_save(flags); | ||
68 | __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); | ||
69 | local_irq_restore(flags); | ||
70 | } | ||
71 | |||
72 | static void flat_send_IPI_allbutself(int vector) | ||
73 | { | ||
74 | #ifdef CONFIG_HOTPLUG_CPU | ||
75 | int hotplug = 1; | ||
76 | #else | ||
77 | int hotplug = 0; | ||
78 | #endif | ||
79 | if (hotplug || vector == NMI_VECTOR) { | ||
80 | cpumask_t allbutme = cpu_online_map; | ||
81 | |||
82 | cpu_clear(smp_processor_id(), allbutme); | ||
83 | |||
84 | if (!cpus_empty(allbutme)) | ||
85 | flat_send_IPI_mask(allbutme, vector); | ||
86 | } else if (num_online_cpus() > 1) { | ||
87 | __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); | ||
88 | } | ||
89 | } | ||
90 | |||
91 | static void flat_send_IPI_all(int vector) | ||
92 | { | ||
93 | if (vector == NMI_VECTOR) | ||
94 | flat_send_IPI_mask(cpu_online_map, vector); | ||
95 | else | ||
96 | __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); | ||
97 | } | ||
98 | |||
99 | static int flat_apic_id_registered(void) | ||
100 | { | ||
101 | return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); | ||
102 | } | ||
103 | |||
104 | static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) | ||
105 | { | ||
106 | return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; | ||
107 | } | ||
108 | |||
109 | static unsigned int phys_pkg_id(int index_msb) | ||
110 | { | ||
111 | return hard_smp_processor_id() >> index_msb; | ||
112 | } | ||
113 | |||
114 | struct genapic apic_flat = { | ||
115 | .name = "flat", | ||
116 | .int_delivery_mode = dest_LowestPrio, | ||
117 | .int_dest_mode = (APIC_DEST_LOGICAL != 0), | ||
118 | .target_cpus = flat_target_cpus, | ||
119 | .vector_allocation_domain = flat_vector_allocation_domain, | ||
120 | .apic_id_registered = flat_apic_id_registered, | ||
121 | .init_apic_ldr = flat_init_apic_ldr, | ||
122 | .send_IPI_all = flat_send_IPI_all, | ||
123 | .send_IPI_allbutself = flat_send_IPI_allbutself, | ||
124 | .send_IPI_mask = flat_send_IPI_mask, | ||
125 | .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, | ||
126 | .phys_pkg_id = phys_pkg_id, | ||
127 | }; | ||
128 | |||
129 | /* | ||
130 | * Physflat mode is used when there are more than 8 CPUs on an AMD system. | ||
131 | * We cannot use logical delivery in this case because the mask | ||
132 | * overflows, so use physical mode. | ||
133 | */ | ||
134 | |||
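The overflow the comment refers to is easy to see in isolation (hedged illustration, not part of the patch): flat mode gives each CPU one bit of an 8-bit logical destination, so CPUs 8 and above no longer fit and physical delivery has to be used instead.

/* Illustration of why flat (logical) delivery tops out at 8 CPUs:
 * each CPU gets one bit in an 8-bit logical APIC destination
 * (APIC_ALL_CPUS is taken to be 0xff here). */
#include <stdio.h>

int main(void)
{
	for (int cpu = 0; cpu < 10; cpu++) {
		unsigned long id = 1UL << cpu;      /* as in flat_init_apic_ldr() */
		int fits = (id & 0xffUL) != 0;
		printf("cpu %d -> logical id 0x%lx (%s)\n",
		       cpu, id, fits ? "fits" : "overflows 8-bit mask");
	}
	return 0;
}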
135 | static cpumask_t physflat_target_cpus(void) | ||
136 | { | ||
137 | return cpu_online_map; | ||
138 | } | ||
139 | |||
140 | static cpumask_t physflat_vector_allocation_domain(int cpu) | ||
141 | { | ||
142 | cpumask_t domain = CPU_MASK_NONE; | ||
143 | cpu_set(cpu, domain); | ||
144 | return domain; | ||
145 | } | ||
146 | |||
147 | |||
148 | static void physflat_send_IPI_mask(cpumask_t cpumask, int vector) | ||
149 | { | ||
150 | send_IPI_mask_sequence(cpumask, vector); | ||
151 | } | ||
152 | |||
153 | static void physflat_send_IPI_allbutself(int vector) | ||
154 | { | ||
155 | cpumask_t allbutme = cpu_online_map; | ||
156 | |||
157 | cpu_clear(smp_processor_id(), allbutme); | ||
158 | physflat_send_IPI_mask(allbutme, vector); | ||
159 | } | ||
160 | |||
161 | static void physflat_send_IPI_all(int vector) | ||
162 | { | ||
163 | physflat_send_IPI_mask(cpu_online_map, vector); | ||
164 | } | ||
165 | |||
166 | static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) | ||
167 | { | ||
168 | int cpu; | ||
169 | |||
170 | /* | ||
171 | * We're using fixed IRQ delivery, can only return one phys APIC ID. | ||
172 | * May as well be the first. | ||
173 | */ | ||
174 | cpu = first_cpu(cpumask); | ||
175 | if ((unsigned)cpu < NR_CPUS) | ||
176 | return x86_cpu_to_apicid[cpu]; | ||
177 | else | ||
178 | return BAD_APICID; | ||
179 | } | ||
180 | |||
181 | struct genapic apic_physflat = { | ||
182 | .name = "physical flat", | ||
183 | .int_delivery_mode = dest_Fixed, | ||
184 | .int_dest_mode = (APIC_DEST_PHYSICAL != 0), | ||
185 | .target_cpus = physflat_target_cpus, | ||
186 | .vector_allocation_domain = physflat_vector_allocation_domain, | ||
187 | .apic_id_registered = flat_apic_id_registered, | ||
188 | .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ | ||
189 | .send_IPI_all = physflat_send_IPI_all, | ||
190 | .send_IPI_allbutself = physflat_send_IPI_allbutself, | ||
191 | .send_IPI_mask = physflat_send_IPI_mask, | ||
192 | .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, | ||
193 | .phys_pkg_id = phys_pkg_id, | ||
194 | }; | ||
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c new file mode 100644 index 000000000000..6c34bdd22e26 --- /dev/null +++ b/arch/x86/kernel/head64.c | |||
@@ -0,0 +1,86 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/head64.c -- prepare to run common code | ||
3 | * | ||
4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE | ||
5 | */ | ||
6 | |||
7 | #include <linux/init.h> | ||
8 | #include <linux/linkage.h> | ||
9 | #include <linux/types.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/string.h> | ||
12 | #include <linux/percpu.h> | ||
13 | |||
14 | #include <asm/processor.h> | ||
15 | #include <asm/proto.h> | ||
16 | #include <asm/smp.h> | ||
17 | #include <asm/bootsetup.h> | ||
18 | #include <asm/setup.h> | ||
19 | #include <asm/desc.h> | ||
20 | #include <asm/pgtable.h> | ||
21 | #include <asm/tlbflush.h> | ||
22 | #include <asm/sections.h> | ||
23 | |||
24 | static void __init zap_identity_mappings(void) | ||
25 | { | ||
26 | pgd_t *pgd = pgd_offset_k(0UL); | ||
27 | pgd_clear(pgd); | ||
28 | __flush_tlb(); | ||
29 | } | ||
30 | |||
31 | /* Don't add a printk in there. printk relies on the PDA which is not initialized | ||
32 | yet. */ | ||
33 | static void __init clear_bss(void) | ||
34 | { | ||
35 | memset(__bss_start, 0, | ||
36 | (unsigned long) __bss_stop - (unsigned long) __bss_start); | ||
37 | } | ||
38 | |||
39 | #define NEW_CL_POINTER 0x228 /* Relative to real mode data */ | ||
40 | #define OLD_CL_MAGIC_ADDR 0x20 | ||
41 | #define OLD_CL_MAGIC 0xA33F | ||
42 | #define OLD_CL_OFFSET 0x22 | ||
43 | |||
44 | static void __init copy_bootdata(char *real_mode_data) | ||
45 | { | ||
46 | unsigned long new_data; | ||
47 | char * command_line; | ||
48 | |||
49 | memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE); | ||
50 | new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER); | ||
51 | if (!new_data) { | ||
52 | if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) { | ||
53 | return; | ||
54 | } | ||
55 | new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET); | ||
56 | } | ||
57 | command_line = __va(new_data); | ||
58 | memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); | ||
59 | } | ||
60 | |||
61 | void __init x86_64_start_kernel(char * real_mode_data) | ||
62 | { | ||
63 | int i; | ||
64 | |||
65 | /* clear bss before set_intr_gate with early_idt_handler */ | ||
66 | clear_bss(); | ||
67 | |||
68 | /* Make NULL pointers segfault */ | ||
69 | zap_identity_mappings(); | ||
70 | |||
71 | for (i = 0; i < IDT_ENTRIES; i++) | ||
72 | set_intr_gate(i, early_idt_handler); | ||
73 | asm volatile("lidt %0" :: "m" (idt_descr)); | ||
74 | |||
75 | early_printk("Kernel alive\n"); | ||
76 | |||
77 | for (i = 0; i < NR_CPUS; i++) | ||
78 | cpu_pda(i) = &boot_cpu_pda[i]; | ||
79 | |||
80 | pda_init(0); | ||
81 | copy_bootdata(__va(real_mode_data)); | ||
82 | #ifdef CONFIG_SMP | ||
83 | cpu_set(0, cpu_online_map); | ||
84 | #endif | ||
85 | start_kernel(); | ||
86 | } | ||
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S new file mode 100644 index 000000000000..b6167fe3330e --- /dev/null +++ b/arch/x86/kernel/head_64.S | |||
@@ -0,0 +1,416 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit | ||
3 | * | ||
4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE | ||
5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | ||
6 | * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> | ||
7 | * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> | ||
8 | * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com> | ||
9 | */ | ||
10 | |||
11 | |||
12 | #include <linux/linkage.h> | ||
13 | #include <linux/threads.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <asm/desc.h> | ||
16 | #include <asm/segment.h> | ||
17 | #include <asm/pgtable.h> | ||
18 | #include <asm/page.h> | ||
19 | #include <asm/msr.h> | ||
20 | #include <asm/cache.h> | ||
21 | |||
22 | /* We are not able to switch in one step to the final KERNEL ADDRESS SPACE | ||
23 | * because we need identity-mapped pages. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | .text | ||
28 | .section .text.head | ||
29 | .code64 | ||
30 | .globl startup_64 | ||
31 | startup_64: | ||
32 | |||
33 | /* | ||
34 | * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, | ||
35 | * and someone has loaded an identity mapped page table | ||
36 | * for us. These identity mapped page tables map all of the | ||
37 | * kernel pages and possibly all of memory. | ||
38 | * | ||
39 | * %esi holds a physical pointer to real_mode_data. | ||
40 | * | ||
41 | * We come here either directly from a 64bit bootloader, or from | ||
42 | * arch/x86_64/boot/compressed/head.S. | ||
43 | * | ||
44 | * We only come here initially at boot; nothing else comes here. | ||
45 | * | ||
46 | * Since we may be loaded at an address different from what we were | ||
47 | * compiled to run at, we first fix up the physical addresses in our page | ||
48 | * tables and then reload them. | ||
49 | */ | ||
50 | |||
51 | /* Compute the delta between the address I am compiled to run at and the | ||
52 | * address I am actually running at. | ||
53 | */ | ||
54 | leaq _text(%rip), %rbp | ||
55 | subq $_text - __START_KERNEL_map, %rbp | ||
56 | |||
57 | /* Is the address not 2M aligned? */ | ||
58 | movq %rbp, %rax | ||
59 | andl $~LARGE_PAGE_MASK, %eax | ||
60 | testl %eax, %eax | ||
61 | jnz bad_address | ||
62 | |||
63 | /* Is the address too large? */ | ||
64 | leaq _text(%rip), %rdx | ||
65 | movq $PGDIR_SIZE, %rax | ||
66 | cmpq %rax, %rdx | ||
67 | jae bad_address | ||
68 | |||
69 | /* Fixup the physical addresses in the page table | ||
70 | */ | ||
71 | addq %rbp, init_level4_pgt + 0(%rip) | ||
72 | addq %rbp, init_level4_pgt + (258*8)(%rip) | ||
73 | addq %rbp, init_level4_pgt + (511*8)(%rip) | ||
74 | |||
75 | addq %rbp, level3_ident_pgt + 0(%rip) | ||
76 | |||
77 | addq %rbp, level3_kernel_pgt + (510*8)(%rip) | ||
78 | addq %rbp, level3_kernel_pgt + (511*8)(%rip) | ||
79 | |||
80 | addq %rbp, level2_fixmap_pgt + (506*8)(%rip) | ||
81 | |||
82 | /* Add an Identity mapping if I am above 1G */ | ||
83 | leaq _text(%rip), %rdi | ||
84 | andq $LARGE_PAGE_MASK, %rdi | ||
85 | |||
86 | movq %rdi, %rax | ||
87 | shrq $PUD_SHIFT, %rax | ||
88 | andq $(PTRS_PER_PUD - 1), %rax | ||
89 | jz ident_complete | ||
90 | |||
91 | leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx | ||
92 | leaq level3_ident_pgt(%rip), %rbx | ||
93 | movq %rdx, 0(%rbx, %rax, 8) | ||
94 | |||
95 | movq %rdi, %rax | ||
96 | shrq $PMD_SHIFT, %rax | ||
97 | andq $(PTRS_PER_PMD - 1), %rax | ||
98 | leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx | ||
99 | leaq level2_spare_pgt(%rip), %rbx | ||
100 | movq %rdx, 0(%rbx, %rax, 8) | ||
101 | ident_complete: | ||
102 | |||
103 | /* Fixup the kernel text+data virtual addresses | ||
104 | */ | ||
105 | leaq level2_kernel_pgt(%rip), %rdi | ||
106 | leaq 4096(%rdi), %r8 | ||
107 | /* See if it is a valid page table entry */ | ||
108 | 1: testq $1, 0(%rdi) | ||
109 | jz 2f | ||
110 | addq %rbp, 0(%rdi) | ||
111 | /* Go to the next page */ | ||
112 | 2: addq $8, %rdi | ||
113 | cmp %r8, %rdi | ||
114 | jne 1b | ||
115 | |||
116 | /* Fixup phys_base */ | ||
117 | addq %rbp, phys_base(%rip) | ||
118 | |||
119 | #ifdef CONFIG_SMP | ||
120 | addq %rbp, trampoline_level4_pgt + 0(%rip) | ||
121 | addq %rbp, trampoline_level4_pgt + (511*8)(%rip) | ||
122 | #endif | ||
123 | #ifdef CONFIG_ACPI_SLEEP | ||
124 | addq %rbp, wakeup_level4_pgt + 0(%rip) | ||
125 | addq %rbp, wakeup_level4_pgt + (511*8)(%rip) | ||
126 | #endif | ||
127 | |||
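All of the fixups above add the same load-time delta to slots that hold physical addresses; a hedged standalone restatement of the arithmetic (the addresses are invented for the example, not taken from the patch):

/* Hedged restatement of the early relocation fixup: every page-table slot
 * holding a physical address gets the same delta added, exactly like the
 * "addq %rbp, slot(%rip)" instructions above. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t compiled_at = 0x200000;   /* address the kernel was linked to run at */
	uint64_t loaded_at   = 0x1000000;  /* where the bootloader actually put it */
	uint64_t delta = loaded_at - compiled_at;

	uint64_t pgt_slots[] = { 0x203000 | 0x67, 0x204000 | 0x67 }; /* phys | flags */
	for (int i = 0; i < 2; i++) {
		pgt_slots[i] += delta;     /* 2M-aligned delta leaves the flag bits alone */
		printf("slot %d -> 0x%llx\n", i, (unsigned long long)pgt_slots[i]);
	}
	return 0;
}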
128 | /* Due to ENTRY(), sometimes the empty space gets filled with | ||
129 | * zeros. Better to take a jmp than rely on the empty space being | ||
130 | * filled with 0x90 (nop). | ||
131 | */ | ||
132 | jmp secondary_startup_64 | ||
133 | ENTRY(secondary_startup_64) | ||
134 | /* | ||
135 | * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, | ||
136 | * and someone has loaded a mapped page table. | ||
137 | * | ||
138 | * %esi holds a physical pointer to real_mode_data. | ||
139 | * | ||
140 | * We come here either from startup_64 (using physical addresses) | ||
141 | * or from trampoline.S (using virtual addresses). | ||
142 | * | ||
143 | * Using virtual addresses from trampoline.S removes the need | ||
144 | * to have any identity mapped pages in the kernel page table | ||
145 | * after the boot processor executes this code. | ||
146 | */ | ||
147 | |||
148 | /* Enable PAE mode and PGE */ | ||
149 | xorq %rax, %rax | ||
150 | btsq $5, %rax | ||
151 | btsq $7, %rax | ||
152 | movq %rax, %cr4 | ||
153 | |||
154 | /* Setup early boot stage 4 level pagetables. */ | ||
155 | movq $(init_level4_pgt - __START_KERNEL_map), %rax | ||
156 | addq phys_base(%rip), %rax | ||
157 | movq %rax, %cr3 | ||
158 | |||
159 | /* Ensure I am executing from virtual addresses */ | ||
160 | movq $1f, %rax | ||
161 | jmp *%rax | ||
162 | 1: | ||
163 | |||
164 | /* Check if nx is implemented */ | ||
165 | movl $0x80000001, %eax | ||
166 | cpuid | ||
167 | movl %edx,%edi | ||
168 | |||
169 | /* Setup EFER (Extended Feature Enable Register) */ | ||
170 | movl $MSR_EFER, %ecx | ||
171 | rdmsr | ||
172 | btsl $_EFER_SCE, %eax /* Enable System Call */ | ||
173 | btl $20,%edi /* No Execute supported? */ | ||
174 | jnc 1f | ||
175 | btsl $_EFER_NX, %eax | ||
176 | 1: wrmsr /* Make changes effective */ | ||
177 | |||
178 | /* Setup cr0 */ | ||
179 | #define CR0_PM 1 /* protected mode */ | ||
180 | #define CR0_MP (1<<1) | ||
181 | #define CR0_ET (1<<4) | ||
182 | #define CR0_NE (1<<5) | ||
183 | #define CR0_WP (1<<16) | ||
184 | #define CR0_AM (1<<18) | ||
185 | #define CR0_PAGING (1<<31) | ||
186 | movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax | ||
187 | /* Make changes effective */ | ||
188 | movq %rax, %cr0 | ||
189 | |||
190 | /* Setup a boot time stack */ | ||
191 | movq init_rsp(%rip),%rsp | ||
192 | |||
193 | /* zero EFLAGS after setting rsp */ | ||
194 | pushq $0 | ||
195 | popfq | ||
196 | |||
197 | /* | ||
198 | * We must switch to a new descriptor in kernel space for the GDT | ||
199 | * because soon the kernel won't have access anymore to the userspace | ||
200 | * addresses we're currently running at. We have to do that here | ||
201 | * because in 32bit we couldn't load a 64bit linear address. | ||
202 | */ | ||
203 | lgdt cpu_gdt_descr(%rip) | ||
204 | |||
205 | /* set up data segments. actually 0 would do too */ | ||
206 | movl $__KERNEL_DS,%eax | ||
207 | movl %eax,%ds | ||
208 | movl %eax,%ss | ||
209 | movl %eax,%es | ||
210 | |||
211 | /* | ||
212 | * We don't really need to load %fs or %gs, but load them anyway | ||
213 | * to kill any stale realmode selectors. This allows execution | ||
214 | * under VT hardware. | ||
215 | */ | ||
216 | movl %eax,%fs | ||
217 | movl %eax,%gs | ||
218 | |||
219 | /* | ||
220 | * Set up a dummy PDA. This is just for some early bootup code | ||
221 | * that does in_interrupt() | ||
222 | */ | ||
223 | movl $MSR_GS_BASE,%ecx | ||
224 | movq $empty_zero_page,%rax | ||
225 | movq %rax,%rdx | ||
226 | shrq $32,%rdx | ||
227 | wrmsr | ||
228 | |||
229 | /* esi is pointer to real mode structure with interesting info. | ||
230 | pass it to C */ | ||
231 | movl %esi, %edi | ||
232 | |||
233 | /* Finally jump to run C code and to be on a real kernel address. | ||
234 | * Since we are running on identity-mapped space we have to jump | ||
235 | * to the full 64bit address; this is only possible as an indirect | ||
236 | * jump. In addition we need to ensure %cs is set, so we make this | ||
237 | * a far return. | ||
238 | */ | ||
239 | movq initial_code(%rip),%rax | ||
240 | pushq $0 # fake return address to stop unwinder | ||
241 | pushq $__KERNEL_CS # set correct cs | ||
242 | pushq %rax # target address in negative space | ||
243 | lretq | ||
244 | |||
245 | /* SMP bootup changes these two */ | ||
246 | #ifndef CONFIG_HOTPLUG_CPU | ||
247 | .pushsection .init.data | ||
248 | #endif | ||
249 | .align 8 | ||
250 | .globl initial_code | ||
251 | initial_code: | ||
252 | .quad x86_64_start_kernel | ||
253 | #ifndef CONFIG_HOTPLUG_CPU | ||
254 | .popsection | ||
255 | #endif | ||
256 | .globl init_rsp | ||
257 | init_rsp: | ||
258 | .quad init_thread_union+THREAD_SIZE-8 | ||
259 | |||
260 | bad_address: | ||
261 | jmp bad_address | ||
262 | |||
263 | ENTRY(early_idt_handler) | ||
264 | cmpl $2,early_recursion_flag(%rip) | ||
265 | jz 1f | ||
266 | incl early_recursion_flag(%rip) | ||
267 | xorl %eax,%eax | ||
268 | movq 8(%rsp),%rsi # get rip | ||
269 | movq (%rsp),%rdx | ||
270 | movq %cr2,%rcx | ||
271 | leaq early_idt_msg(%rip),%rdi | ||
272 | call early_printk | ||
273 | cmpl $2,early_recursion_flag(%rip) | ||
274 | jz 1f | ||
275 | call dump_stack | ||
276 | #ifdef CONFIG_KALLSYMS | ||
277 | leaq early_idt_ripmsg(%rip),%rdi | ||
278 | movq 8(%rsp),%rsi # get rip again | ||
279 | call __print_symbol | ||
280 | #endif | ||
281 | 1: hlt | ||
282 | jmp 1b | ||
283 | early_recursion_flag: | ||
284 | .long 0 | ||
285 | |||
286 | early_idt_msg: | ||
287 | .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n" | ||
288 | early_idt_ripmsg: | ||
289 | .asciz "RIP %s\n" | ||
290 | |||
291 | .balign PAGE_SIZE | ||
292 | |||
293 | #define NEXT_PAGE(name) \ | ||
294 | .balign PAGE_SIZE; \ | ||
295 | ENTRY(name) | ||
296 | |||
297 | /* Automate the creation of 1 to 1 mapping pmd entries */ | ||
298 | #define PMDS(START, PERM, COUNT) \ | ||
299 | i = 0 ; \ | ||
300 | .rept (COUNT) ; \ | ||
301 | .quad (START) + (i << 21) + (PERM) ; \ | ||
302 | i = i + 1 ; \ | ||
303 | .endr | ||
304 | |||
305 | /* | ||
306 | * This default setting generates an ident mapping at address 0x100000 | ||
307 | * and a mapping for the kernel that precisely maps virtual address | ||
308 | * 0xffffffff80000000 to physical address 0x000000. (always using | ||
309 | * 2Mbyte large pages provided by PAE mode) | ||
310 | */ | ||
311 | NEXT_PAGE(init_level4_pgt) | ||
312 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE | ||
313 | .fill 257,8,0 | ||
314 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE | ||
315 | .fill 252,8,0 | ||
316 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ | ||
317 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE | ||
318 | |||
319 | NEXT_PAGE(level3_ident_pgt) | ||
320 | .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE | ||
321 | .fill 511,8,0 | ||
322 | |||
323 | NEXT_PAGE(level3_kernel_pgt) | ||
324 | .fill 510,8,0 | ||
325 | /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ | ||
326 | .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE | ||
327 | .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE | ||
328 | |||
329 | NEXT_PAGE(level2_fixmap_pgt) | ||
330 | .fill 506,8,0 | ||
331 | .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE | ||
332 | /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ | ||
333 | .fill 5,8,0 | ||
334 | |||
335 | NEXT_PAGE(level1_fixmap_pgt) | ||
336 | .fill 512,8,0 | ||
337 | |||
338 | NEXT_PAGE(level2_ident_pgt) | ||
339 | /* Since I easily can, map the first 1G. | ||
340 | * Don't set NX because code runs from these pages. | ||
341 | */ | ||
342 | PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) | ||
343 | |||
344 | NEXT_PAGE(level2_kernel_pgt) | ||
345 | /* 40MB kernel mapping. The kernel code cannot be bigger than that. | ||
346 | When you change this, change KERNEL_TEXT_SIZE in page.h too. */ | ||
347 | /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ | ||
348 | PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE) | ||
349 | /* Module mapping starts here */ | ||
350 | .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 | ||
351 | |||
352 | NEXT_PAGE(level2_spare_pgt) | ||
353 | .fill 512,8,0 | ||
354 | |||
355 | #undef PMDS | ||
356 | #undef NEXT_PAGE | ||
357 | |||
358 | .data | ||
359 | .align 16 | ||
360 | .globl cpu_gdt_descr | ||
361 | cpu_gdt_descr: | ||
362 | .word gdt_end-cpu_gdt_table-1 | ||
363 | gdt: | ||
364 | .quad cpu_gdt_table | ||
365 | #ifdef CONFIG_SMP | ||
366 | .rept NR_CPUS-1 | ||
367 | .word 0 | ||
368 | .quad 0 | ||
369 | .endr | ||
370 | #endif | ||
371 | |||
372 | ENTRY(phys_base) | ||
373 | /* This must match the first entry in level2_kernel_pgt */ | ||
374 | .quad 0x0000000000000000 | ||
375 | |||
376 | /* We need valid kernel segments for data and code in long mode too | ||
377 | * IRET will check the segment types kkeil 2000/10/28 | ||
378 | * Also sysret mandates a special GDT layout | ||
379 | */ | ||
380 | |||
381 | .section .data.page_aligned, "aw" | ||
382 | .align PAGE_SIZE | ||
383 | |||
384 | /* The TLS descriptors are currently at a different place compared to i386. | ||
385 | Hopefully nobody expects them at a fixed place (Wine?) */ | ||
386 | |||
387 | ENTRY(cpu_gdt_table) | ||
388 | .quad 0x0000000000000000 /* NULL descriptor */ | ||
389 | .quad 0x00cf9b000000ffff /* __KERNEL32_CS */ | ||
390 | .quad 0x00af9b000000ffff /* __KERNEL_CS */ | ||
391 | .quad 0x00cf93000000ffff /* __KERNEL_DS */ | ||
392 | .quad 0x00cffb000000ffff /* __USER32_CS */ | ||
393 | .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */ | ||
394 | .quad 0x00affb000000ffff /* __USER_CS */ | ||
395 | .quad 0x0 /* unused */ | ||
396 | .quad 0,0 /* TSS */ | ||
397 | .quad 0,0 /* LDT */ | ||
398 | .quad 0,0,0 /* three TLS descriptors */ | ||
399 | .quad 0x0000f40000000000 /* node/CPU stored in limit */ | ||
400 | gdt_end: | ||
401 | /* asm/segment.h:GDT_ENTRIES must match this */ | ||
402 | /* This should be a multiple of the cache line size */ | ||
403 | /* GDTs of other CPUs are now dynamically allocated */ | ||
404 | |||
405 | /* zero the remaining page */ | ||
406 | .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0 | ||
407 | |||
408 | .section .bss, "aw", @nobits | ||
409 | .align L1_CACHE_BYTES | ||
410 | ENTRY(idt_table) | ||
411 | .skip 256 * 16 | ||
412 | |||
413 | .section .bss.page_aligned, "aw", @nobits | ||
414 | .align PAGE_SIZE | ||
415 | ENTRY(empty_zero_page) | ||
416 | .skip PAGE_SIZE | ||
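For reference, a minimal C sketch of what the PMDS() assembler macro above expands to: COUNT consecutive PMD entries, each mapping a 2 MB page, whose physical address advances by 1 << 21 per slot with the given permission bits OR-ed in (the helper name is mine; PERM would be something like the __PAGE_KERNEL_LARGE_EXEC value used in the listing).

	#include <stdint.h>

	/* Illustrative only: fill a PMD page the way PMDS(START, PERM, COUNT) does. */
	static void fill_pmds(uint64_t *pmd, uint64_t start, uint64_t perm, int count)
	{
		int i;

		for (i = 0; i < count; i++)
			pmd[i] = start + ((uint64_t)i << 21) + perm;	/* one entry per 2 MB */
	}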
diff --git a/arch/x86/kernel/hpet_64.c b/arch/x86/kernel/hpet_64.c new file mode 100644 index 000000000000..e2d1b912e154 --- /dev/null +++ b/arch/x86/kernel/hpet_64.c | |||
@@ -0,0 +1,493 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/sched.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/mc146818rtc.h> | ||
5 | #include <linux/time.h> | ||
6 | #include <linux/clocksource.h> | ||
7 | #include <linux/ioport.h> | ||
8 | #include <linux/acpi.h> | ||
9 | #include <linux/hpet.h> | ||
10 | #include <asm/pgtable.h> | ||
11 | #include <asm/vsyscall.h> | ||
12 | #include <asm/timex.h> | ||
13 | #include <asm/hpet.h> | ||
14 | |||
15 | #define HPET_MASK 0xFFFFFFFF | ||
16 | #define HPET_SHIFT 22 | ||
17 | |||
18 | /* FSEC = 10^-15 NSEC = 10^-9 */ | ||
19 | #define FSEC_PER_NSEC 1000000 | ||
20 | |||
21 | int nohpet __initdata; | ||
22 | |||
23 | unsigned long hpet_address; | ||
24 | unsigned long hpet_period; /* fsecs / HPET clock */ | ||
25 | unsigned long hpet_tick; /* HPET clocks / interrupt */ | ||
26 | |||
27 | int hpet_use_timer; /* Use counter of hpet for time keeping, | ||
28 | * otherwise PIT | ||
29 | */ | ||
30 | |||
31 | #ifdef CONFIG_HPET | ||
32 | static __init int late_hpet_init(void) | ||
33 | { | ||
34 | struct hpet_data hd; | ||
35 | unsigned int ntimer; | ||
36 | |||
37 | if (!hpet_address) | ||
38 | return 0; | ||
39 | |||
40 | memset(&hd, 0, sizeof(hd)); | ||
41 | |||
42 | ntimer = hpet_readl(HPET_ID); | ||
43 | ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; | ||
44 | ntimer++; | ||
45 | |||
46 | /* | ||
47 | * Register with driver. | ||
48 | * Timer0 and Timer1 are used by the platform. | ||
49 | */ | ||
50 | hd.hd_phys_address = hpet_address; | ||
51 | hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); | ||
52 | hd.hd_nirqs = ntimer; | ||
53 | hd.hd_flags = HPET_DATA_PLATFORM; | ||
54 | hpet_reserve_timer(&hd, 0); | ||
55 | #ifdef CONFIG_HPET_EMULATE_RTC | ||
56 | hpet_reserve_timer(&hd, 1); | ||
57 | #endif | ||
58 | hd.hd_irq[0] = HPET_LEGACY_8254; | ||
59 | hd.hd_irq[1] = HPET_LEGACY_RTC; | ||
60 | if (ntimer > 2) { | ||
61 | struct hpet *hpet; | ||
62 | struct hpet_timer *timer; | ||
63 | int i; | ||
64 | |||
65 | hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); | ||
66 | timer = &hpet->hpet_timers[2]; | ||
67 | for (i = 2; i < ntimer; timer++, i++) | ||
68 | hd.hd_irq[i] = (timer->hpet_config & | ||
69 | Tn_INT_ROUTE_CNF_MASK) >> | ||
70 | Tn_INT_ROUTE_CNF_SHIFT; | ||
71 | |||
72 | } | ||
73 | |||
74 | hpet_alloc(&hd); | ||
75 | return 0; | ||
76 | } | ||
77 | fs_initcall(late_hpet_init); | ||
78 | #endif | ||
79 | |||
80 | int hpet_timer_stop_set_go(unsigned long tick) | ||
81 | { | ||
82 | unsigned int cfg; | ||
83 | |||
84 | /* | ||
85 | * Stop the timers and reset the main counter. | ||
86 | */ | ||
87 | |||
88 | cfg = hpet_readl(HPET_CFG); | ||
89 | cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); | ||
90 | hpet_writel(cfg, HPET_CFG); | ||
91 | hpet_writel(0, HPET_COUNTER); | ||
92 | hpet_writel(0, HPET_COUNTER + 4); | ||
93 | |||
94 | /* | ||
95 | * Set up timer 0 as periodic, with the first interrupt at hpet_tick | ||
96 | * and the period also hpet_tick. | ||
97 | */ | ||
98 | if (hpet_use_timer) { | ||
99 | hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | | ||
100 | HPET_TN_32BIT, HPET_T0_CFG); | ||
101 | hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ | ||
102 | hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ | ||
103 | cfg |= HPET_CFG_LEGACY; | ||
104 | } | ||
105 | /* | ||
106 | * Go! | ||
107 | */ | ||
108 | |||
109 | cfg |= HPET_CFG_ENABLE; | ||
110 | hpet_writel(cfg, HPET_CFG); | ||
111 | |||
112 | return 0; | ||
113 | } | ||
114 | |||
115 | static cycle_t read_hpet(void) | ||
116 | { | ||
117 | return (cycle_t)hpet_readl(HPET_COUNTER); | ||
118 | } | ||
119 | |||
120 | static cycle_t __vsyscall_fn vread_hpet(void) | ||
121 | { | ||
122 | return readl((void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); | ||
123 | } | ||
124 | |||
125 | struct clocksource clocksource_hpet = { | ||
126 | .name = "hpet", | ||
127 | .rating = 250, | ||
128 | .read = read_hpet, | ||
129 | .mask = (cycle_t)HPET_MASK, | ||
130 | .mult = 0, /* set below */ | ||
131 | .shift = HPET_SHIFT, | ||
132 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | ||
133 | .vread = vread_hpet, | ||
134 | }; | ||
135 | |||
136 | int __init hpet_arch_init(void) | ||
137 | { | ||
138 | unsigned int id; | ||
139 | u64 tmp; | ||
140 | |||
141 | if (!hpet_address) | ||
142 | return -1; | ||
143 | set_fixmap_nocache(FIX_HPET_BASE, hpet_address); | ||
144 | __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); | ||
145 | |||
146 | /* | ||
147 | * Read the period, compute tick and quotient. | ||
148 | */ | ||
149 | |||
150 | id = hpet_readl(HPET_ID); | ||
151 | |||
152 | if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) | ||
153 | return -1; | ||
154 | |||
155 | hpet_period = hpet_readl(HPET_PERIOD); | ||
156 | if (hpet_period < 100000 || hpet_period > 100000000) | ||
157 | return -1; | ||
158 | |||
159 | hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; | ||
160 | |||
161 | hpet_use_timer = (id & HPET_ID_LEGSUP); | ||
162 | |||
163 | /* | ||
164 | * hpet period is in femto seconds per cycle | ||
165 | * so we need to convert this to ns/cyc units | ||
166 | * approximated by mult/2^shift | ||
167 | * | ||
168 | * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift | ||
169 | * fsec/cyc * 1ns/1000000fsec * 2^shift = mult | ||
170 | * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult | ||
171 | * (fsec/cyc << shift)/1000000 = mult | ||
172 | * (hpet_period << shift)/FSEC_PER_NSEC = mult | ||
173 | */ | ||
174 | tmp = (u64)hpet_period << HPET_SHIFT; | ||
175 | do_div(tmp, FSEC_PER_NSEC); | ||
176 | clocksource_hpet.mult = (u32)tmp; | ||
177 | clocksource_register(&clocksource_hpet); | ||
178 | |||
179 | return hpet_timer_stop_set_go(hpet_tick); | ||
180 | } | ||
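A worked example of the mult/shift setup in hpet_arch_init() above (the 14.318 MHz rate is just a typical HPET frequency, not read from this code): hpet_period would be about 69841279 fs/cycle, so mult = (69841279 << 22) / 1000000 ≈ 292935555, and a cycle delta converts to nanoseconds as (delta * mult) >> 22 — roughly 69.84 ns per HPET cycle.

	#include <stdint.h>

	/* Sketch of the clocksource conversion that the mult/shift pair enables. */
	static uint64_t hpet_cycles_to_ns(uint64_t delta, uint32_t mult, unsigned int shift)
	{
		return (delta * mult) >> shift;		/* e.g. 1000 cycles -> ~69841 ns */
	}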
181 | |||
182 | int hpet_reenable(void) | ||
183 | { | ||
184 | return hpet_timer_stop_set_go(hpet_tick); | ||
185 | } | ||
186 | |||
187 | /* | ||
188 | * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing | ||
189 | * it to the HPET timer of known frequency. | ||
190 | */ | ||
191 | |||
192 | #define TICK_COUNT 100000000 | ||
193 | #define SMI_THRESHOLD 50000 | ||
194 | #define MAX_TRIES 5 | ||
195 | |||
196 | /* | ||
197 | * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none | ||
198 | * occurs between the reads of the hpet & TSC. | ||
199 | */ | ||
200 | static void __init read_hpet_tsc(int *hpet, int *tsc) | ||
201 | { | ||
202 | int tsc1, tsc2, hpet1, i; | ||
203 | |||
204 | for (i = 0; i < MAX_TRIES; i++) { | ||
205 | tsc1 = get_cycles_sync(); | ||
206 | hpet1 = hpet_readl(HPET_COUNTER); | ||
207 | tsc2 = get_cycles_sync(); | ||
208 | if ((tsc2 - tsc1) < SMI_THRESHOLD) | ||
209 | break; | ||
210 | } | ||
211 | *hpet = hpet1; | ||
212 | *tsc = tsc2; | ||
213 | } | ||
214 | |||
215 | unsigned int __init hpet_calibrate_tsc(void) | ||
216 | { | ||
217 | int tsc_start, hpet_start; | ||
218 | int tsc_now, hpet_now; | ||
219 | unsigned long flags; | ||
220 | |||
221 | local_irq_save(flags); | ||
222 | |||
223 | read_hpet_tsc(&hpet_start, &tsc_start); | ||
224 | |||
225 | do { | ||
226 | local_irq_disable(); | ||
227 | read_hpet_tsc(&hpet_now, &tsc_now); | ||
228 | local_irq_restore(flags); | ||
229 | } while ((tsc_now - tsc_start) < TICK_COUNT && | ||
230 | (hpet_now - hpet_start) < TICK_COUNT); | ||
231 | |||
232 | return (tsc_now - tsc_start) * 1000000000L | ||
233 | / ((hpet_now - hpet_start) * hpet_period / 1000); | ||
234 | } | ||
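To make the units of the return value above explicit (a back-of-the-envelope check, not a statement from the source): (hpet_now - hpet_start) * hpet_period / 1000 is the elapsed time in picoseconds, so the expression evaluates as

	tsc_delta * 10^9 / picoseconds  =  tsc_delta * 10^12 / (hpet_delta * hpet_period)

which is the TSC rate in kHz; a hypothetical 2 GHz TSC would come back as roughly 2000000.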
235 | |||
236 | #ifdef CONFIG_HPET_EMULATE_RTC | ||
237 | /* HPET in LegacyReplacement Mode eats up the RTC interrupt line. When HPET | ||
238 | * is enabled, we support RTC interrupt functionality in software. | ||
239 | * RTC has 3 kinds of interrupts: | ||
240 | * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock | ||
241 | * is updated | ||
242 | * 2) Alarm Interrupt - generate an interrupt at a specific time of day | ||
243 | * 3) Periodic Interrupt - generate periodic interrupt, with frequencies | ||
244 | * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) | ||
245 | * (1) and (2) above are implemented using polling at a frequency of | ||
246 | * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt | ||
247 | * overhead. (DEFAULT_RTC_INT_FREQ) | ||
248 | * For (3), we use interrupts at 64Hz or user specified periodic | ||
249 | * frequency, whichever is higher. | ||
250 | */ | ||
251 | #include <linux/rtc.h> | ||
252 | |||
253 | #define DEFAULT_RTC_INT_FREQ 64 | ||
254 | #define RTC_NUM_INTS 1 | ||
255 | |||
256 | static unsigned long UIE_on; | ||
257 | static unsigned long prev_update_sec; | ||
258 | |||
259 | static unsigned long AIE_on; | ||
260 | static struct rtc_time alarm_time; | ||
261 | |||
262 | static unsigned long PIE_on; | ||
263 | static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; | ||
264 | static unsigned long PIE_count; | ||
265 | |||
266 | static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ | ||
267 | static unsigned int hpet_t1_cmp; /* cached comparator register */ | ||
268 | |||
269 | int is_hpet_enabled(void) | ||
270 | { | ||
271 | return hpet_address != 0; | ||
272 | } | ||
273 | |||
274 | /* | ||
275 | * Timer 1 for RTC, we do not use periodic interrupt feature, | ||
276 | * even if HPET supports periodic interrupts on Timer 1. | ||
277 | * The reason being, to set up a periodic interrupt in HPET, we need to | ||
278 | * stop the main counter. And if we do that every time someone disables/enables | ||
279 | * the RTC, we will have an adverse effect on the main kernel timer running on Timer 0. | ||
280 | * So, for the time being, simulate the periodic interrupt in software. | ||
281 | * | ||
282 | * hpet_rtc_timer_init() is called for the first time, and during subsequent | ||
283 | * interrupts, reinit happens through hpet_rtc_timer_reinit(). | ||
284 | */ | ||
285 | int hpet_rtc_timer_init(void) | ||
286 | { | ||
287 | unsigned int cfg, cnt; | ||
288 | unsigned long flags; | ||
289 | |||
290 | if (!is_hpet_enabled()) | ||
291 | return 0; | ||
292 | /* | ||
293 | * Set the counter 1 and enable the interrupts. | ||
294 | */ | ||
295 | if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) | ||
296 | hpet_rtc_int_freq = PIE_freq; | ||
297 | else | ||
298 | hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; | ||
299 | |||
300 | local_irq_save(flags); | ||
301 | |||
302 | cnt = hpet_readl(HPET_COUNTER); | ||
303 | cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); | ||
304 | hpet_writel(cnt, HPET_T1_CMP); | ||
305 | hpet_t1_cmp = cnt; | ||
306 | |||
307 | cfg = hpet_readl(HPET_T1_CFG); | ||
308 | cfg &= ~HPET_TN_PERIODIC; | ||
309 | cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; | ||
310 | hpet_writel(cfg, HPET_T1_CFG); | ||
311 | |||
312 | local_irq_restore(flags); | ||
313 | |||
314 | return 1; | ||
315 | } | ||
316 | |||
317 | static void hpet_rtc_timer_reinit(void) | ||
318 | { | ||
319 | unsigned int cfg, cnt, ticks_per_int, lost_ints; | ||
320 | |||
321 | if (unlikely(!(PIE_on | AIE_on | UIE_on))) { | ||
322 | cfg = hpet_readl(HPET_T1_CFG); | ||
323 | cfg &= ~HPET_TN_ENABLE; | ||
324 | hpet_writel(cfg, HPET_T1_CFG); | ||
325 | return; | ||
326 | } | ||
327 | |||
328 | if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) | ||
329 | hpet_rtc_int_freq = PIE_freq; | ||
330 | else | ||
331 | hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; | ||
332 | |||
333 | /* It is more accurate to use the comparator value than the current count. */ | ||
334 | ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq; | ||
335 | hpet_t1_cmp += ticks_per_int; | ||
336 | hpet_writel(hpet_t1_cmp, HPET_T1_CMP); | ||
337 | |||
338 | /* | ||
339 | * If the interrupt handler was delayed too long, the write above tries | ||
340 | * to schedule the next interrupt in the past and the hardware would | ||
341 | * not interrupt until the counter had wrapped around. | ||
342 | * So we have to check that the comparator wasn't set to a past time. | ||
343 | */ | ||
344 | cnt = hpet_readl(HPET_COUNTER); | ||
345 | if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) { | ||
346 | lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1; | ||
347 | /* Make sure that, even with the time needed to execute | ||
348 | * this code, the next scheduled interrupt has been moved | ||
349 | * back to the future: */ | ||
350 | lost_ints++; | ||
351 | |||
352 | hpet_t1_cmp += lost_ints * ticks_per_int; | ||
353 | hpet_writel(hpet_t1_cmp, HPET_T1_CMP); | ||
354 | |||
355 | if (PIE_on) | ||
356 | PIE_count += lost_ints; | ||
357 | |||
358 | if (printk_ratelimit()) | ||
359 | printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", | ||
360 | hpet_rtc_int_freq); | ||
361 | } | ||
362 | } | ||
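A small worked example of the catch-up logic above, with made-up numbers: if ticks_per_int is 1000 HPET cycles and the counter has run 2500 cycles past the stale comparator, then lost_ints = 2500/1000 + 1 = 3, which is bumped to 4 to cover the time spent in this code itself, so the comparator is moved 4 * 1000 = 4000 cycles into the future and, in periodic mode, PIE_count is credited with 4 missed interrupts.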
363 | |||
364 | /* | ||
365 | * The functions below are called from rtc driver. | ||
366 | * Return 0 if HPET is not being used. | ||
367 | * Otherwise do the necessary changes and return 1. | ||
368 | */ | ||
369 | int hpet_mask_rtc_irq_bit(unsigned long bit_mask) | ||
370 | { | ||
371 | if (!is_hpet_enabled()) | ||
372 | return 0; | ||
373 | |||
374 | if (bit_mask & RTC_UIE) | ||
375 | UIE_on = 0; | ||
376 | if (bit_mask & RTC_PIE) | ||
377 | PIE_on = 0; | ||
378 | if (bit_mask & RTC_AIE) | ||
379 | AIE_on = 0; | ||
380 | |||
381 | return 1; | ||
382 | } | ||
383 | |||
384 | int hpet_set_rtc_irq_bit(unsigned long bit_mask) | ||
385 | { | ||
386 | int timer_init_reqd = 0; | ||
387 | |||
388 | if (!is_hpet_enabled()) | ||
389 | return 0; | ||
390 | |||
391 | if (!(PIE_on | AIE_on | UIE_on)) | ||
392 | timer_init_reqd = 1; | ||
393 | |||
394 | if (bit_mask & RTC_UIE) { | ||
395 | UIE_on = 1; | ||
396 | } | ||
397 | if (bit_mask & RTC_PIE) { | ||
398 | PIE_on = 1; | ||
399 | PIE_count = 0; | ||
400 | } | ||
401 | if (bit_mask & RTC_AIE) { | ||
402 | AIE_on = 1; | ||
403 | } | ||
404 | |||
405 | if (timer_init_reqd) | ||
406 | hpet_rtc_timer_init(); | ||
407 | |||
408 | return 1; | ||
409 | } | ||
410 | |||
411 | int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) | ||
412 | { | ||
413 | if (!is_hpet_enabled()) | ||
414 | return 0; | ||
415 | |||
416 | alarm_time.tm_hour = hrs; | ||
417 | alarm_time.tm_min = min; | ||
418 | alarm_time.tm_sec = sec; | ||
419 | |||
420 | return 1; | ||
421 | } | ||
422 | |||
423 | int hpet_set_periodic_freq(unsigned long freq) | ||
424 | { | ||
425 | if (!is_hpet_enabled()) | ||
426 | return 0; | ||
427 | |||
428 | PIE_freq = freq; | ||
429 | PIE_count = 0; | ||
430 | |||
431 | return 1; | ||
432 | } | ||
433 | |||
434 | int hpet_rtc_dropped_irq(void) | ||
435 | { | ||
436 | if (!is_hpet_enabled()) | ||
437 | return 0; | ||
438 | |||
439 | return 1; | ||
440 | } | ||
441 | |||
442 | irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) | ||
443 | { | ||
444 | struct rtc_time curr_time; | ||
445 | unsigned long rtc_int_flag = 0; | ||
446 | int call_rtc_interrupt = 0; | ||
447 | |||
448 | hpet_rtc_timer_reinit(); | ||
449 | |||
450 | if (UIE_on | AIE_on) { | ||
451 | rtc_get_rtc_time(&curr_time); | ||
452 | } | ||
453 | if (UIE_on) { | ||
454 | if (curr_time.tm_sec != prev_update_sec) { | ||
455 | /* Set update int info, call real rtc int routine */ | ||
456 | call_rtc_interrupt = 1; | ||
457 | rtc_int_flag = RTC_UF; | ||
458 | prev_update_sec = curr_time.tm_sec; | ||
459 | } | ||
460 | } | ||
461 | if (PIE_on) { | ||
462 | PIE_count++; | ||
463 | if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { | ||
464 | /* Set periodic int info, call real rtc int routine */ | ||
465 | call_rtc_interrupt = 1; | ||
466 | rtc_int_flag |= RTC_PF; | ||
467 | PIE_count = 0; | ||
468 | } | ||
469 | } | ||
470 | if (AIE_on) { | ||
471 | if ((curr_time.tm_sec == alarm_time.tm_sec) && | ||
472 | (curr_time.tm_min == alarm_time.tm_min) && | ||
473 | (curr_time.tm_hour == alarm_time.tm_hour)) { | ||
474 | /* Set alarm int info, call real rtc int routine */ | ||
475 | call_rtc_interrupt = 1; | ||
476 | rtc_int_flag |= RTC_AF; | ||
477 | } | ||
478 | } | ||
479 | if (call_rtc_interrupt) { | ||
480 | rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); | ||
481 | rtc_interrupt(rtc_int_flag, dev_id); | ||
482 | } | ||
483 | return IRQ_HANDLED; | ||
484 | } | ||
485 | #endif | ||
486 | |||
487 | static int __init nohpet_setup(char *s) | ||
488 | { | ||
489 | nohpet = 1; | ||
490 | return 1; | ||
491 | } | ||
492 | |||
493 | __setup("nohpet", nohpet_setup); | ||
diff --git a/arch/x86/kernel/i387_64.c b/arch/x86/kernel/i387_64.c new file mode 100644 index 000000000000..1d58c13bc6bc --- /dev/null +++ b/arch/x86/kernel/i387_64.c | |||
@@ -0,0 +1,151 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/i387.c | ||
3 | * | ||
4 | * Copyright (C) 1994 Linus Torvalds | ||
5 | * Copyright (C) 2002 Andi Kleen, SuSE Labs | ||
6 | * | ||
7 | * Pentium III FXSR, SSE support | ||
8 | * General FPU state handling cleanups | ||
9 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
10 | * | ||
11 | * x86-64 rework 2002 Andi Kleen. | ||
12 | * Does direct fxsave in and out of user space now for signal handlers. | ||
13 | * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation, | ||
14 | * the 64bit user space sees a FXSAVE frame directly. | ||
15 | */ | ||
16 | |||
17 | #include <linux/sched.h> | ||
18 | #include <linux/init.h> | ||
19 | #include <asm/processor.h> | ||
20 | #include <asm/i387.h> | ||
21 | #include <asm/sigcontext.h> | ||
22 | #include <asm/user.h> | ||
23 | #include <asm/ptrace.h> | ||
24 | #include <asm/uaccess.h> | ||
25 | |||
26 | unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff; | ||
27 | |||
28 | void mxcsr_feature_mask_init(void) | ||
29 | { | ||
30 | unsigned int mask; | ||
31 | clts(); | ||
32 | memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); | ||
33 | asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); | ||
34 | mask = current->thread.i387.fxsave.mxcsr_mask; | ||
35 | if (mask == 0) mask = 0x0000ffbf; | ||
36 | mxcsr_feature_mask &= mask; | ||
37 | stts(); | ||
38 | } | ||
39 | |||
40 | /* | ||
41 | * Called at bootup to set up the initial FPU state that is later cloned | ||
42 | * into all processes. | ||
43 | */ | ||
44 | void __cpuinit fpu_init(void) | ||
45 | { | ||
46 | unsigned long oldcr0 = read_cr0(); | ||
47 | extern void __bad_fxsave_alignment(void); | ||
48 | |||
49 | if (offsetof(struct task_struct, thread.i387.fxsave) & 15) | ||
50 | __bad_fxsave_alignment(); | ||
51 | set_in_cr4(X86_CR4_OSFXSR); | ||
52 | set_in_cr4(X86_CR4_OSXMMEXCPT); | ||
53 | |||
54 | write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */ | ||
55 | |||
56 | mxcsr_feature_mask_init(); | ||
57 | /* clean state in init */ | ||
58 | current_thread_info()->status = 0; | ||
59 | clear_used_math(); | ||
60 | } | ||
61 | |||
62 | void init_fpu(struct task_struct *child) | ||
63 | { | ||
64 | if (tsk_used_math(child)) { | ||
65 | if (child == current) | ||
66 | unlazy_fpu(child); | ||
67 | return; | ||
68 | } | ||
69 | memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); | ||
70 | child->thread.i387.fxsave.cwd = 0x37f; | ||
71 | child->thread.i387.fxsave.mxcsr = 0x1f80; | ||
72 | /* only the device not available exception or ptrace can call init_fpu */ | ||
73 | set_stopped_child_used_math(child); | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | * Signal frame handlers. | ||
78 | */ | ||
79 | |||
80 | int save_i387(struct _fpstate __user *buf) | ||
81 | { | ||
82 | struct task_struct *tsk = current; | ||
83 | int err = 0; | ||
84 | |||
85 | BUILD_BUG_ON(sizeof(struct user_i387_struct) != | ||
86 | sizeof(tsk->thread.i387.fxsave)); | ||
87 | |||
88 | if ((unsigned long)buf % 16) | ||
89 | printk("save_i387: bad fpstate %p\n",buf); | ||
90 | |||
91 | if (!used_math()) | ||
92 | return 0; | ||
93 | clear_used_math(); /* trigger finit */ | ||
94 | if (task_thread_info(tsk)->status & TS_USEDFPU) { | ||
95 | err = save_i387_checking((struct i387_fxsave_struct __user *)buf); | ||
96 | if (err) return err; | ||
97 | stts(); | ||
98 | } else { | ||
99 | if (__copy_to_user(buf, &tsk->thread.i387.fxsave, | ||
100 | sizeof(struct i387_fxsave_struct))) | ||
101 | return -1; | ||
102 | } | ||
103 | return 1; | ||
104 | } | ||
105 | |||
106 | /* | ||
107 | * ptrace request handlers. | ||
108 | */ | ||
109 | |||
110 | int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk) | ||
111 | { | ||
112 | init_fpu(tsk); | ||
113 | return __copy_to_user(buf, &tsk->thread.i387.fxsave, | ||
114 | sizeof(struct user_i387_struct)) ? -EFAULT : 0; | ||
115 | } | ||
116 | |||
117 | int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf) | ||
118 | { | ||
119 | if (__copy_from_user(&tsk->thread.i387.fxsave, buf, | ||
120 | sizeof(struct user_i387_struct))) | ||
121 | return -EFAULT; | ||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | /* | ||
126 | * FPU state for core dumps. | ||
127 | */ | ||
128 | |||
129 | int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) | ||
130 | { | ||
131 | struct task_struct *tsk = current; | ||
132 | |||
133 | if (!used_math()) | ||
134 | return 0; | ||
135 | |||
136 | unlazy_fpu(tsk); | ||
137 | memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); | ||
138 | return 1; | ||
139 | } | ||
140 | |||
141 | int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) | ||
142 | { | ||
143 | int fpvalid = !!tsk_used_math(tsk); | ||
144 | |||
145 | if (fpvalid) { | ||
146 | if (tsk == current) | ||
147 | unlazy_fpu(tsk); | ||
148 | memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); | ||
149 | } | ||
150 | return fpvalid; | ||
151 | } | ||
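The offsetof(...) & 15 check in fpu_init() above exists because fxsave/fxrstor require a 16-byte-aligned memory operand. A minimal user-space sketch of the same kind of check (struct layout and names are illustrative, not the kernel's):

	#include <stddef.h>

	struct thread_like {
		char pad[32];				/* stand-in for earlier fields */
		struct {
			char fxsave[512];		/* fxsave image is 512 bytes */
		} i387 __attribute__((aligned(16)));
	};

	/* Mirrors the offsetof(..., thread.i387.fxsave) & 15 test above, but at
	 * compile time instead of calling __bad_fxsave_alignment(). */
	_Static_assert(offsetof(struct thread_like, i387.fxsave) % 16 == 0,
		       "fxsave area must be 16-byte aligned");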
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c new file mode 100644 index 000000000000..948cae646099 --- /dev/null +++ b/arch/x86/kernel/i8259_64.c | |||
@@ -0,0 +1,544 @@ | |||
1 | #include <linux/linkage.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/signal.h> | ||
4 | #include <linux/sched.h> | ||
5 | #include <linux/ioport.h> | ||
6 | #include <linux/interrupt.h> | ||
7 | #include <linux/timex.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/random.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/kernel_stat.h> | ||
12 | #include <linux/sysdev.h> | ||
13 | #include <linux/bitops.h> | ||
14 | |||
15 | #include <asm/acpi.h> | ||
16 | #include <asm/atomic.h> | ||
17 | #include <asm/system.h> | ||
18 | #include <asm/io.h> | ||
19 | #include <asm/hw_irq.h> | ||
20 | #include <asm/pgtable.h> | ||
21 | #include <asm/delay.h> | ||
22 | #include <asm/desc.h> | ||
23 | #include <asm/apic.h> | ||
24 | |||
25 | /* | ||
26 | * Common place to define all x86 IRQ vectors | ||
27 | * | ||
28 | * This builds up the IRQ handler stubs using some ugly macros in irq.h | ||
29 | * | ||
30 | * These macros create the low-level assembly IRQ routines that save | ||
31 | * register context and call do_IRQ(). do_IRQ() then does all the | ||
32 | * operations that are needed to keep the AT (or SMP IOAPIC) | ||
33 | * interrupt-controller happy. | ||
34 | */ | ||
35 | |||
36 | #define BI(x,y) \ | ||
37 | BUILD_IRQ(x##y) | ||
38 | |||
39 | #define BUILD_16_IRQS(x) \ | ||
40 | BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ | ||
41 | BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ | ||
42 | BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ | ||
43 | BI(x,c) BI(x,d) BI(x,e) BI(x,f) | ||
44 | |||
45 | /* | ||
46 | * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: | ||
47 | * (these are usually mapped to vectors 0x30-0x3f) | ||
48 | */ | ||
49 | |||
50 | /* | ||
51 | * The IO-APIC gives us many more interrupt sources. Most of these | ||
52 | * are unused but an SMP system is supposed to have enough memory ... | ||
53 | * sometimes (mostly wrt. hw bugs) we get corrupted vectors all | ||
54 | * across the spectrum, so we really want to be prepared to get all | ||
55 | * of these. Plus, more powerful systems might have more than 64 | ||
56 | * IO-APIC registers. | ||
57 | * | ||
58 | * (these are usually mapped into the 0x30-0xff vector range) | ||
59 | */ | ||
60 | BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) | ||
61 | BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) | ||
62 | BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) | ||
63 | BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) | ||
64 | |||
65 | #undef BUILD_16_IRQS | ||
66 | #undef BI | ||
67 | |||
68 | |||
69 | #define IRQ(x,y) \ | ||
70 | IRQ##x##y##_interrupt | ||
71 | |||
72 | #define IRQLIST_16(x) \ | ||
73 | IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ | ||
74 | IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ | ||
75 | IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ | ||
76 | IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) | ||
77 | |||
78 | /* for the irq vectors */ | ||
79 | static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { | ||
80 | IRQLIST_16(0x2), IRQLIST_16(0x3), | ||
81 | IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), | ||
82 | IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), | ||
83 | IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) | ||
84 | }; | ||
85 | |||
86 | #undef IRQ | ||
87 | #undef IRQLIST_16 | ||
88 | |||
89 | /* | ||
90 | * This is the 'legacy' 8259A Programmable Interrupt Controller, | ||
91 | * present in the majority of PC/AT boxes. | ||
92 | * plus some generic x86 specific things if generic specifics makes | ||
93 | * any sense at all. | ||
94 | * this file should become arch/i386/kernel/irq.c when the old irq.c | ||
95 | * moves to arch independent land | ||
96 | */ | ||
97 | |||
98 | static int i8259A_auto_eoi; | ||
99 | DEFINE_SPINLOCK(i8259A_lock); | ||
100 | static void mask_and_ack_8259A(unsigned int); | ||
101 | |||
102 | static struct irq_chip i8259A_chip = { | ||
103 | .name = "XT-PIC", | ||
104 | .mask = disable_8259A_irq, | ||
105 | .disable = disable_8259A_irq, | ||
106 | .unmask = enable_8259A_irq, | ||
107 | .mask_ack = mask_and_ack_8259A, | ||
108 | }; | ||
109 | |||
110 | /* | ||
111 | * 8259A PIC functions to handle ISA devices: | ||
112 | */ | ||
113 | |||
114 | /* | ||
115 | * This contains the irq mask for both 8259A irq controllers. | ||
116 | */ | ||
117 | static unsigned int cached_irq_mask = 0xffff; | ||
118 | |||
119 | #define __byte(x,y) (((unsigned char *)&(y))[x]) | ||
120 | #define cached_21 (__byte(0,cached_irq_mask)) | ||
121 | #define cached_A1 (__byte(1,cached_irq_mask)) | ||
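A small demonstration of what cached_21/cached_A1 above resolve to — byte 0 and byte 1 of the 16-bit cached_irq_mask, which works because x86 is little-endian (the IRQ number and printed value below are made up for illustration):

	#include <stdio.h>

	int main(void)
	{
		unsigned int cached_irq_mask = 0xffff;	/* everything masked, as at init */

		cached_irq_mask &= ~(1u << 10);		/* unmask IRQ 10 (lives on the slave) */

		/* Byte 1 is what enable_8259A_irq() writes to the slave data port 0xA1. */
		unsigned char cached_A1 = ((unsigned char *)&cached_irq_mask)[1];
		printf("slave mask byte: 0x%02x\n", cached_A1);	/* prints 0xfb */
		return 0;
	}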
122 | |||
123 | /* | ||
124 | * Not all IRQs can be routed through the IO-APIC, e.g. on certain (older) | ||
125 | * boards the timer interrupt is not really connected to any IO-APIC pin, | ||
126 | * it's fed to the master 8259A's IR0 line only. | ||
127 | * | ||
128 | * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. | ||
129 | * This 'mixed mode' IRQ handling costs nothing because it's only used | ||
130 | * at IRQ setup time. | ||
131 | */ | ||
132 | unsigned long io_apic_irqs; | ||
133 | |||
134 | void disable_8259A_irq(unsigned int irq) | ||
135 | { | ||
136 | unsigned int mask = 1 << irq; | ||
137 | unsigned long flags; | ||
138 | |||
139 | spin_lock_irqsave(&i8259A_lock, flags); | ||
140 | cached_irq_mask |= mask; | ||
141 | if (irq & 8) | ||
142 | outb(cached_A1,0xA1); | ||
143 | else | ||
144 | outb(cached_21,0x21); | ||
145 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
146 | } | ||
147 | |||
148 | void enable_8259A_irq(unsigned int irq) | ||
149 | { | ||
150 | unsigned int mask = ~(1 << irq); | ||
151 | unsigned long flags; | ||
152 | |||
153 | spin_lock_irqsave(&i8259A_lock, flags); | ||
154 | cached_irq_mask &= mask; | ||
155 | if (irq & 8) | ||
156 | outb(cached_A1,0xA1); | ||
157 | else | ||
158 | outb(cached_21,0x21); | ||
159 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
160 | } | ||
161 | |||
162 | int i8259A_irq_pending(unsigned int irq) | ||
163 | { | ||
164 | unsigned int mask = 1<<irq; | ||
165 | unsigned long flags; | ||
166 | int ret; | ||
167 | |||
168 | spin_lock_irqsave(&i8259A_lock, flags); | ||
169 | if (irq < 8) | ||
170 | ret = inb(0x20) & mask; | ||
171 | else | ||
172 | ret = inb(0xA0) & (mask >> 8); | ||
173 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
174 | |||
175 | return ret; | ||
176 | } | ||
177 | |||
178 | void make_8259A_irq(unsigned int irq) | ||
179 | { | ||
180 | disable_irq_nosync(irq); | ||
181 | io_apic_irqs &= ~(1<<irq); | ||
182 | set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, | ||
183 | "XT"); | ||
184 | enable_irq(irq); | ||
185 | } | ||
186 | |||
187 | /* | ||
188 | * This function assumes to be called rarely. Switching between | ||
189 | * 8259A registers is slow. | ||
190 | * This has to be protected by the irq controller spinlock | ||
191 | * before being called. | ||
192 | */ | ||
193 | static inline int i8259A_irq_real(unsigned int irq) | ||
194 | { | ||
195 | int value; | ||
196 | int irqmask = 1<<irq; | ||
197 | |||
198 | if (irq < 8) { | ||
199 | outb(0x0B,0x20); /* ISR register */ | ||
200 | value = inb(0x20) & irqmask; | ||
201 | outb(0x0A,0x20); /* back to the IRR register */ | ||
202 | return value; | ||
203 | } | ||
204 | outb(0x0B,0xA0); /* ISR register */ | ||
205 | value = inb(0xA0) & (irqmask >> 8); | ||
206 | outb(0x0A,0xA0); /* back to the IRR register */ | ||
207 | return value; | ||
208 | } | ||
209 | |||
210 | /* | ||
211 | * Careful! The 8259A is a fragile beast, it pretty | ||
212 | * much _has_ to be done exactly like this (mask it | ||
213 | * first, _then_ send the EOI, and the order of EOI | ||
214 | * to the two 8259s is important! | ||
215 | */ | ||
216 | static void mask_and_ack_8259A(unsigned int irq) | ||
217 | { | ||
218 | unsigned int irqmask = 1 << irq; | ||
219 | unsigned long flags; | ||
220 | |||
221 | spin_lock_irqsave(&i8259A_lock, flags); | ||
222 | /* | ||
223 | * Lightweight spurious IRQ detection. We do not want | ||
224 | * to overdo spurious IRQ handling - it's usually a sign | ||
225 | * of hardware problems, so we only do the checks we can | ||
226 | * do without slowing down good hardware unnecessarily. | ||
227 | * | ||
228 | * Note that IRQ7 and IRQ15 (the two spurious IRQs | ||
229 | * usually resulting from the 8259A-1|2 PICs) occur | ||
230 | * even if the IRQ is masked in the 8259A. Thus we | ||
231 | * can check spurious 8259A IRQs without doing the | ||
232 | * quite slow i8259A_irq_real() call for every IRQ. | ||
233 | * This does not cover 100% of spurious interrupts, | ||
234 | * but should be enough to warn the user that there | ||
235 | * is something bad going on ... | ||
236 | */ | ||
237 | if (cached_irq_mask & irqmask) | ||
238 | goto spurious_8259A_irq; | ||
239 | cached_irq_mask |= irqmask; | ||
240 | |||
241 | handle_real_irq: | ||
242 | if (irq & 8) { | ||
243 | inb(0xA1); /* DUMMY - (do we need this?) */ | ||
244 | outb(cached_A1,0xA1); | ||
245 | outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */ | ||
246 | outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ | ||
247 | } else { | ||
248 | inb(0x21); /* DUMMY - (do we need this?) */ | ||
249 | outb(cached_21,0x21); | ||
250 | outb(0x60+irq,0x20); /* 'Specific EOI' to master */ | ||
251 | } | ||
252 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
253 | return; | ||
254 | |||
255 | spurious_8259A_irq: | ||
256 | /* | ||
257 | * this is the slow path - should happen rarely. | ||
258 | */ | ||
259 | if (i8259A_irq_real(irq)) | ||
260 | /* | ||
261 | * oops, the IRQ _is_ in service according to the | ||
262 | * 8259A - not spurious, go handle it. | ||
263 | */ | ||
264 | goto handle_real_irq; | ||
265 | |||
266 | { | ||
267 | static int spurious_irq_mask; | ||
268 | /* | ||
269 | * At this point we can be sure the IRQ is spurious, | ||
270 | * let's ACK and report it. [once per IRQ] | ||
271 | */ | ||
272 | if (!(spurious_irq_mask & irqmask)) { | ||
273 | printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); | ||
274 | spurious_irq_mask |= irqmask; | ||
275 | } | ||
276 | atomic_inc(&irq_err_count); | ||
277 | /* | ||
278 | * Theoretically we do not have to handle this IRQ, | ||
279 | * but in Linux this does not cause problems and is | ||
280 | * simpler for us. | ||
281 | */ | ||
282 | goto handle_real_irq; | ||
283 | } | ||
284 | } | ||
285 | |||
286 | void init_8259A(int auto_eoi) | ||
287 | { | ||
288 | unsigned long flags; | ||
289 | |||
290 | i8259A_auto_eoi = auto_eoi; | ||
291 | |||
292 | spin_lock_irqsave(&i8259A_lock, flags); | ||
293 | |||
294 | outb(0xff, 0x21); /* mask all of 8259A-1 */ | ||
295 | outb(0xff, 0xA1); /* mask all of 8259A-2 */ | ||
296 | |||
297 | /* | ||
298 | * outb_p - this has to work on a wide range of PC hardware. | ||
299 | */ | ||
300 | outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */ | ||
301 | outb_p(IRQ0_VECTOR, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ | ||
302 | outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */ | ||
303 | if (auto_eoi) | ||
304 | outb_p(0x03, 0x21); /* master does Auto EOI */ | ||
305 | else | ||
306 | outb_p(0x01, 0x21); /* master expects normal EOI */ | ||
307 | |||
308 | outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */ | ||
309 | outb_p(IRQ8_VECTOR, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ | ||
310 | outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */ | ||
311 | outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode | ||
312 | is to be investigated) */ | ||
313 | |||
314 | if (auto_eoi) | ||
315 | /* | ||
316 | * in AEOI mode we just have to mask the interrupt | ||
317 | * when acking. | ||
318 | */ | ||
319 | i8259A_chip.mask_ack = disable_8259A_irq; | ||
320 | else | ||
321 | i8259A_chip.mask_ack = mask_and_ack_8259A; | ||
322 | |||
323 | udelay(100); /* wait for 8259A to initialize */ | ||
324 | |||
325 | outb(cached_21, 0x21); /* restore master IRQ mask */ | ||
326 | outb(cached_A1, 0xA1); /* restore slave IRQ mask */ | ||
327 | |||
328 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
329 | } | ||
330 | |||
331 | static char irq_trigger[2]; | ||
332 | /** | ||
333 | * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ | ||
334 | */ | ||
335 | static void restore_ELCR(char *trigger) | ||
336 | { | ||
337 | outb(trigger[0], 0x4d0); | ||
338 | outb(trigger[1], 0x4d1); | ||
339 | } | ||
340 | |||
341 | static void save_ELCR(char *trigger) | ||
342 | { | ||
343 | /* IRQ 0,1,2,8,13 are marked as reserved */ | ||
344 | trigger[0] = inb(0x4d0) & 0xF8; | ||
345 | trigger[1] = inb(0x4d1) & 0xDE; | ||
346 | } | ||
347 | |||
348 | static int i8259A_resume(struct sys_device *dev) | ||
349 | { | ||
350 | init_8259A(i8259A_auto_eoi); | ||
351 | restore_ELCR(irq_trigger); | ||
352 | return 0; | ||
353 | } | ||
354 | |||
355 | static int i8259A_suspend(struct sys_device *dev, pm_message_t state) | ||
356 | { | ||
357 | save_ELCR(irq_trigger); | ||
358 | return 0; | ||
359 | } | ||
360 | |||
361 | static int i8259A_shutdown(struct sys_device *dev) | ||
362 | { | ||
363 | /* Put the i8259A into a quiescent state that | ||
364 | * the kernel initialization code can get it | ||
365 | * out of. | ||
366 | */ | ||
367 | outb(0xff, 0x21); /* mask all of 8259A-1 */ | ||
368 | outb(0xff, 0xA1); /* mask all of 8259A-2 */ | ||
369 | return 0; | ||
370 | } | ||
371 | |||
372 | static struct sysdev_class i8259_sysdev_class = { | ||
373 | set_kset_name("i8259"), | ||
374 | .suspend = i8259A_suspend, | ||
375 | .resume = i8259A_resume, | ||
376 | .shutdown = i8259A_shutdown, | ||
377 | }; | ||
378 | |||
379 | static struct sys_device device_i8259A = { | ||
380 | .id = 0, | ||
381 | .cls = &i8259_sysdev_class, | ||
382 | }; | ||
383 | |||
384 | static int __init i8259A_init_sysfs(void) | ||
385 | { | ||
386 | int error = sysdev_class_register(&i8259_sysdev_class); | ||
387 | if (!error) | ||
388 | error = sysdev_register(&device_i8259A); | ||
389 | return error; | ||
390 | } | ||
391 | |||
392 | device_initcall(i8259A_init_sysfs); | ||
393 | |||
394 | /* | ||
395 | * IRQ2 is cascade interrupt to second interrupt controller | ||
396 | */ | ||
397 | |||
398 | static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; | ||
399 | DEFINE_PER_CPU(vector_irq_t, vector_irq) = { | ||
400 | [0 ... IRQ0_VECTOR - 1] = -1, | ||
401 | [IRQ0_VECTOR] = 0, | ||
402 | [IRQ1_VECTOR] = 1, | ||
403 | [IRQ2_VECTOR] = 2, | ||
404 | [IRQ3_VECTOR] = 3, | ||
405 | [IRQ4_VECTOR] = 4, | ||
406 | [IRQ5_VECTOR] = 5, | ||
407 | [IRQ6_VECTOR] = 6, | ||
408 | [IRQ7_VECTOR] = 7, | ||
409 | [IRQ8_VECTOR] = 8, | ||
410 | [IRQ9_VECTOR] = 9, | ||
411 | [IRQ10_VECTOR] = 10, | ||
412 | [IRQ11_VECTOR] = 11, | ||
413 | [IRQ12_VECTOR] = 12, | ||
414 | [IRQ13_VECTOR] = 13, | ||
415 | [IRQ14_VECTOR] = 14, | ||
416 | [IRQ15_VECTOR] = 15, | ||
417 | [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 | ||
418 | }; | ||
419 | |||
420 | void __init init_ISA_irqs (void) | ||
421 | { | ||
422 | int i; | ||
423 | |||
424 | init_bsp_APIC(); | ||
425 | init_8259A(0); | ||
426 | |||
427 | for (i = 0; i < NR_IRQS; i++) { | ||
428 | irq_desc[i].status = IRQ_DISABLED; | ||
429 | irq_desc[i].action = NULL; | ||
430 | irq_desc[i].depth = 1; | ||
431 | |||
432 | if (i < 16) { | ||
433 | /* | ||
434 | * 16 old-style INTA-cycle interrupts: | ||
435 | */ | ||
436 | set_irq_chip_and_handler_name(i, &i8259A_chip, | ||
437 | handle_level_irq, "XT"); | ||
438 | } else { | ||
439 | /* | ||
440 | * 'high' PCI IRQs filled in on demand | ||
441 | */ | ||
442 | irq_desc[i].chip = &no_irq_chip; | ||
443 | } | ||
444 | } | ||
445 | } | ||
446 | |||
447 | static void setup_timer_hardware(void) | ||
448 | { | ||
449 | outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ | ||
450 | udelay(10); | ||
451 | outb_p(LATCH & 0xff , 0x40); /* LSB */ | ||
452 | udelay(10); | ||
453 | outb(LATCH >> 8 , 0x40); /* MSB */ | ||
454 | } | ||
455 | |||
456 | static int timer_resume(struct sys_device *dev) | ||
457 | { | ||
458 | setup_timer_hardware(); | ||
459 | return 0; | ||
460 | } | ||
461 | |||
462 | void i8254_timer_resume(void) | ||
463 | { | ||
464 | setup_timer_hardware(); | ||
465 | } | ||
466 | |||
467 | static struct sysdev_class timer_sysclass = { | ||
468 | set_kset_name("timer_pit"), | ||
469 | .resume = timer_resume, | ||
470 | }; | ||
471 | |||
472 | static struct sys_device device_timer = { | ||
473 | .id = 0, | ||
474 | .cls = &timer_sysclass, | ||
475 | }; | ||
476 | |||
477 | static int __init init_timer_sysfs(void) | ||
478 | { | ||
479 | int error = sysdev_class_register(&timer_sysclass); | ||
480 | if (!error) | ||
481 | error = sysdev_register(&device_timer); | ||
482 | return error; | ||
483 | } | ||
484 | |||
485 | device_initcall(init_timer_sysfs); | ||
486 | |||
487 | void __init init_IRQ(void) | ||
488 | { | ||
489 | int i; | ||
490 | |||
491 | init_ISA_irqs(); | ||
492 | /* | ||
493 | * Cover the whole vector space, no vector can escape | ||
494 | * us. (some of these will be overridden and become | ||
495 | * 'special' SMP interrupts) | ||
496 | */ | ||
497 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | ||
498 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
499 | if (vector != IA32_SYSCALL_VECTOR) | ||
500 | set_intr_gate(vector, interrupt[i]); | ||
501 | } | ||
502 | |||
503 | #ifdef CONFIG_SMP | ||
504 | /* | ||
505 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper | ||
506 | * IPI, driven by wakeup. | ||
507 | */ | ||
508 | set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); | ||
509 | |||
510 | /* IPIs for invalidation */ | ||
511 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); | ||
512 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); | ||
513 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); | ||
514 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); | ||
515 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); | ||
516 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); | ||
517 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); | ||
518 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); | ||
519 | |||
520 | /* IPI for generic function call */ | ||
521 | set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); | ||
522 | |||
523 | /* Low priority IPI to cleanup after moving an irq */ | ||
524 | set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); | ||
525 | #endif | ||
526 | set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | ||
527 | set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); | ||
528 | |||
529 | /* self generated IPI for local APIC timer */ | ||
530 | set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); | ||
531 | |||
532 | /* IPI vectors for APIC spurious and error interrupts */ | ||
533 | set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | ||
534 | set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | ||
535 | |||
536 | /* | ||
537 | * Set the clock to HZ Hz, we already have a valid | ||
538 | * vector now: | ||
539 | */ | ||
540 | setup_timer_hardware(); | ||
541 | |||
542 | if (!acpi_ioapic) | ||
543 | setup_irq(2, &irq2); | ||
544 | } | ||
diff --git a/arch/x86/kernel/init_task_64.c b/arch/x86/kernel/init_task_64.c new file mode 100644 index 000000000000..4ff33d4f8551 --- /dev/null +++ b/arch/x86/kernel/init_task_64.c | |||
@@ -0,0 +1,54 @@ | |||
1 | #include <linux/mm.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/sched.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/init_task.h> | ||
6 | #include <linux/fs.h> | ||
7 | #include <linux/mqueue.h> | ||
8 | |||
9 | #include <asm/uaccess.h> | ||
10 | #include <asm/pgtable.h> | ||
11 | #include <asm/desc.h> | ||
12 | |||
13 | static struct fs_struct init_fs = INIT_FS; | ||
14 | static struct files_struct init_files = INIT_FILES; | ||
15 | static struct signal_struct init_signals = INIT_SIGNALS(init_signals); | ||
16 | static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); | ||
17 | struct mm_struct init_mm = INIT_MM(init_mm); | ||
18 | |||
19 | EXPORT_SYMBOL(init_mm); | ||
20 | |||
21 | /* | ||
22 | * Initial task structure. | ||
23 | * | ||
24 | * We need to make sure that this is 8192-byte aligned due to the | ||
25 | * way process stacks are handled. This is done by having a special | ||
26 | * "init_task" linker map entry.. | ||
27 | */ | ||
28 | union thread_union init_thread_union | ||
29 | __attribute__((__section__(".data.init_task"))) = | ||
30 | { INIT_THREAD_INFO(init_task) }; | ||
31 | |||
32 | /* | ||
33 | * Initial task structure. | ||
34 | * | ||
35 | * All other task structs will be allocated on slabs in fork.c | ||
36 | */ | ||
37 | struct task_struct init_task = INIT_TASK(init_task); | ||
38 | |||
39 | EXPORT_SYMBOL(init_task); | ||
40 | /* | ||
41 | * per-CPU TSS segments. Threads are completely 'soft' on Linux, | ||
42 | * no more per-task TSS's. The TSS size is kept cacheline-aligned | ||
43 | * so they are allowed to end up in the .data.cacheline_aligned | ||
44 | * section. Since TSS's are completely CPU-local, we want them | ||
45 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. | ||
46 | */ | ||
47 | DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; | ||
48 | |||
49 | /* Copies of the original ist values from the tss are only accessed during | ||
50 | * debugging, no special alignment required. | ||
51 | */ | ||
52 | DEFINE_PER_CPU(struct orig_ist, orig_ist); | ||
53 | |||
54 | #define ALIGN_TO_4K __attribute__((section(".data.init_task"))) | ||
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c new file mode 100644 index 000000000000..966fa1062491 --- /dev/null +++ b/arch/x86/kernel/io_apic_64.c | |||
@@ -0,0 +1,2202 @@ | |||
1 | /* | ||
2 | * Intel IO-APIC support for multi-Pentium hosts. | ||
3 | * | ||
4 | * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo | ||
5 | * | ||
6 | * Many thanks to Stig Venaas for trying out countless experimental | ||
7 | * patches and reporting/debugging problems patiently! | ||
8 | * | ||
9 | * (c) 1999, Multiple IO-APIC support, developed by | ||
10 | * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and | ||
11 | * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, | ||
12 | * further tested and cleaned up by Zach Brown <zab@redhat.com> | ||
13 | * and Ingo Molnar <mingo@redhat.com> | ||
14 | * | ||
15 | * Fixes | ||
16 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs; | ||
17 | * thanks to Eric Gilmore | ||
18 | * and Rolf G. Tews | ||
19 | * for testing these extensively | ||
20 | * Paul Diefenbaugh : Added full ACPI support | ||
21 | */ | ||
22 | |||
23 | #include <linux/mm.h> | ||
24 | #include <linux/interrupt.h> | ||
25 | #include <linux/init.h> | ||
26 | #include <linux/delay.h> | ||
27 | #include <linux/sched.h> | ||
28 | #include <linux/pci.h> | ||
29 | #include <linux/mc146818rtc.h> | ||
30 | #include <linux/acpi.h> | ||
31 | #include <linux/sysdev.h> | ||
32 | #include <linux/msi.h> | ||
33 | #include <linux/htirq.h> | ||
34 | #ifdef CONFIG_ACPI | ||
35 | #include <acpi/acpi_bus.h> | ||
36 | #endif | ||
37 | |||
38 | #include <asm/idle.h> | ||
39 | #include <asm/io.h> | ||
40 | #include <asm/smp.h> | ||
41 | #include <asm/desc.h> | ||
42 | #include <asm/proto.h> | ||
43 | #include <asm/mach_apic.h> | ||
44 | #include <asm/acpi.h> | ||
45 | #include <asm/dma.h> | ||
46 | #include <asm/nmi.h> | ||
47 | #include <asm/msidef.h> | ||
48 | #include <asm/hypertransport.h> | ||
49 | |||
50 | struct irq_cfg { | ||
51 | cpumask_t domain; | ||
52 | cpumask_t old_domain; | ||
53 | unsigned move_cleanup_count; | ||
54 | u8 vector; | ||
55 | u8 move_in_progress : 1; | ||
56 | }; | ||
57 | |||
58 | /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ | ||
59 | struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { | ||
60 | [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, | ||
61 | [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, | ||
62 | [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, | ||
63 | [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, | ||
64 | [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, | ||
65 | [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, | ||
66 | [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, | ||
67 | [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, | ||
68 | [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, | ||
69 | [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, | ||
70 | [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, | ||
71 | [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, | ||
72 | [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, | ||
73 | [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, | ||
74 | [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, | ||
75 | [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, | ||
76 | }; | ||
77 | |||
78 | static int assign_irq_vector(int irq, cpumask_t mask); | ||
79 | |||
80 | #define __apicdebuginit __init | ||
81 | |||
82 | int sis_apic_bug; /* not actually supported, dummy for compile */ | ||
83 | |||
84 | static int no_timer_check; | ||
85 | |||
86 | static int disable_timer_pin_1 __initdata; | ||
87 | |||
88 | int timer_over_8254 __initdata = 1; | ||
89 | |||
90 | /* Where, if anywhere, is the i8259 connected in external int mode */ | ||
91 | static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; | ||
92 | |||
93 | static DEFINE_SPINLOCK(ioapic_lock); | ||
94 | DEFINE_SPINLOCK(vector_lock); | ||
95 | |||
96 | /* | ||
97 | * # of IRQ routing registers | ||
98 | */ | ||
99 | int nr_ioapic_registers[MAX_IO_APICS]; | ||
100 | |||
101 | /* | ||
102 | * Rough estimation of how many shared IRQs there are, can | ||
103 | * be changed anytime. | ||
104 | */ | ||
105 | #define MAX_PLUS_SHARED_IRQS NR_IRQS | ||
106 | #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) | ||
107 | |||
108 | /* | ||
109 | * This is performance-critical, we want to do it O(1) | ||
110 | * | ||
111 | * the indexing order of this array favors 1:1 mappings | ||
112 | * between pins and IRQs. | ||
113 | */ | ||
114 | |||
115 | static struct irq_pin_list { | ||
116 | short apic, pin, next; | ||
117 | } irq_2_pin[PIN_MAP_SIZE]; | ||
118 | |||
119 | struct io_apic { | ||
120 | unsigned int index; | ||
121 | unsigned int unused[3]; | ||
122 | unsigned int data; | ||
123 | }; | ||
124 | |||
125 | static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) | ||
126 | { | ||
127 | return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) | ||
128 | + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); | ||
129 | } | ||
130 | |||
131 | static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) | ||
132 | { | ||
133 | struct io_apic __iomem *io_apic = io_apic_base(apic); | ||
134 | writel(reg, &io_apic->index); | ||
135 | return readl(&io_apic->data); | ||
136 | } | ||
137 | |||
138 | static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) | ||
139 | { | ||
140 | struct io_apic __iomem *io_apic = io_apic_base(apic); | ||
141 | writel(reg, &io_apic->index); | ||
142 | writel(value, &io_apic->data); | ||
143 | } | ||
144 | |||
145 | /* | ||
146 | * Re-write a value: to be used for read-modify-write | ||
147 | * cycles where the read already set up the index register. | ||
148 | */ | ||
149 | static inline void io_apic_modify(unsigned int apic, unsigned int value) | ||
150 | { | ||
151 | struct io_apic __iomem *io_apic = io_apic_base(apic); | ||
152 | writel(value, &io_apic->data); | ||
153 | } | ||
154 | |||
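/*
 * Illustrative sketch only (user-space, not part of this file): struct
 * io_apic above mirrors the chip's indirect-access window - the index
 * register at offset 0x00 and the data register at offset 0x10, with three
 * padding words in between - which is why io_apic_read()/io_apic_write()
 * always select a register through ->index before touching ->data.
 */
#include <assert.h>
#include <stddef.h>

struct io_apic_layout {			/* mirrors struct io_apic above */
	unsigned int index;
	unsigned int unused[3];
	unsigned int data;
};

int main(void)
{
	assert(offsetof(struct io_apic_layout, index) == 0x00);
	assert(offsetof(struct io_apic_layout, data)  == 0x10);
	return 0;
}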
155 | static int io_apic_level_ack_pending(unsigned int irq) | ||
156 | { | ||
157 | struct irq_pin_list *entry; | ||
158 | unsigned long flags; | ||
159 | int pending = 0; | ||
160 | |||
161 | spin_lock_irqsave(&ioapic_lock, flags); | ||
162 | entry = irq_2_pin + irq; | ||
163 | for (;;) { | ||
164 | unsigned int reg; | ||
165 | int pin; | ||
166 | |||
167 | pin = entry->pin; | ||
168 | if (pin == -1) | ||
169 | break; | ||
170 | reg = io_apic_read(entry->apic, 0x10 + pin*2); | ||
171 | /* Is the remote IRR bit set? */ | ||
172 | pending |= (reg >> 14) & 1; | ||
173 | if (!entry->next) | ||
174 | break; | ||
175 | entry = irq_2_pin + entry->next; | ||
176 | } | ||
177 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
178 | return pending; | ||
179 | } | ||
180 | |||
181 | /* | ||
182 | * Synchronize the IO-APIC and the CPU by doing | ||
183 | * a dummy read from the IO-APIC | ||
184 | */ | ||
185 | static inline void io_apic_sync(unsigned int apic) | ||
186 | { | ||
187 | struct io_apic __iomem *io_apic = io_apic_base(apic); | ||
188 | readl(&io_apic->data); | ||
189 | } | ||
190 | |||
191 | #define __DO_ACTION(R, ACTION, FINAL) \ | ||
192 | \ | ||
193 | { \ | ||
194 | int pin; \ | ||
195 | struct irq_pin_list *entry = irq_2_pin + irq; \ | ||
196 | \ | ||
197 | BUG_ON(irq >= NR_IRQS); \ | ||
198 | for (;;) { \ | ||
199 | unsigned int reg; \ | ||
200 | pin = entry->pin; \ | ||
201 | if (pin == -1) \ | ||
202 | break; \ | ||
203 | reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ | ||
204 | reg ACTION; \ | ||
205 | io_apic_modify(entry->apic, reg); \ | ||
206 | FINAL; \ | ||
207 | if (!entry->next) \ | ||
208 | break; \ | ||
209 | entry = irq_2_pin + entry->next; \ | ||
210 | } \ | ||
211 | } | ||
212 | |||
213 | union entry_union { | ||
214 | struct { u32 w1, w2; }; | ||
215 | struct IO_APIC_route_entry entry; | ||
216 | }; | ||
217 | |||
218 | static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) | ||
219 | { | ||
220 | union entry_union eu; | ||
221 | unsigned long flags; | ||
222 | spin_lock_irqsave(&ioapic_lock, flags); | ||
223 | eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); | ||
224 | eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); | ||
225 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
226 | return eu.entry; | ||
227 | } | ||
228 | |||
229 | /* | ||
230 | * When we write a new IO APIC routing entry, we need to write the high | ||
231 | * word first! If the mask bit in the low word is clear, we will enable | ||
232 | * the interrupt, and we need to make sure the entry is fully populated | ||
233 | * before that happens. | ||
234 | */ | ||
235 | static void | ||
236 | __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) | ||
237 | { | ||
238 | union entry_union eu; | ||
239 | eu.entry = e; | ||
240 | io_apic_write(apic, 0x11 + 2*pin, eu.w2); | ||
241 | io_apic_write(apic, 0x10 + 2*pin, eu.w1); | ||
242 | } | ||
243 | |||
244 | static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) | ||
245 | { | ||
246 | unsigned long flags; | ||
247 | spin_lock_irqsave(&ioapic_lock, flags); | ||
248 | __ioapic_write_entry(apic, pin, e); | ||
249 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
250 | } | ||
251 | |||
252 | /* | ||
253 | * When we mask an IO APIC routing entry, we need to write the low | ||
254 | * word first, in order to set the mask bit before we change the | ||
255 | * high bits! | ||
256 | */ | ||
257 | static void ioapic_mask_entry(int apic, int pin) | ||
258 | { | ||
259 | unsigned long flags; | ||
260 | union entry_union eu = { .entry.mask = 1 }; | ||
261 | |||
262 | spin_lock_irqsave(&ioapic_lock, flags); | ||
263 | io_apic_write(apic, 0x10 + 2*pin, eu.w1); | ||
264 | io_apic_write(apic, 0x11 + 2*pin, eu.w2); | ||
265 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
266 | } | ||
267 | |||
268 | #ifdef CONFIG_SMP | ||
269 | static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) | ||
270 | { | ||
271 | int apic, pin; | ||
272 | struct irq_pin_list *entry = irq_2_pin + irq; | ||
273 | |||
274 | BUG_ON(irq >= NR_IRQS); | ||
275 | for (;;) { | ||
276 | unsigned int reg; | ||
277 | apic = entry->apic; | ||
278 | pin = entry->pin; | ||
279 | if (pin == -1) | ||
280 | break; | ||
281 | io_apic_write(apic, 0x11 + pin*2, dest); | ||
282 | reg = io_apic_read(apic, 0x10 + pin*2); | ||
283 | reg &= ~0x000000ff; | ||
284 | reg |= vector; | ||
285 | io_apic_modify(apic, reg); | ||
286 | if (!entry->next) | ||
287 | break; | ||
288 | entry = irq_2_pin + entry->next; | ||
289 | } | ||
290 | } | ||
291 | |||
292 | static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) | ||
293 | { | ||
294 | struct irq_cfg *cfg = irq_cfg + irq; | ||
295 | unsigned long flags; | ||
296 | unsigned int dest; | ||
297 | cpumask_t tmp; | ||
298 | |||
299 | cpus_and(tmp, mask, cpu_online_map); | ||
300 | if (cpus_empty(tmp)) | ||
301 | return; | ||
302 | |||
303 | if (assign_irq_vector(irq, mask)) | ||
304 | return; | ||
305 | |||
306 | cpus_and(tmp, cfg->domain, mask); | ||
307 | dest = cpu_mask_to_apicid(tmp); | ||
308 | |||
309 | /* | ||
310 | * Only the high 8 bits are valid. | ||
311 | */ | ||
312 | dest = SET_APIC_LOGICAL_ID(dest); | ||
313 | |||
314 | spin_lock_irqsave(&ioapic_lock, flags); | ||
315 | __target_IO_APIC_irq(irq, dest, cfg->vector); | ||
316 | irq_desc[irq].affinity = mask; | ||
317 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
318 | } | ||
319 | #endif | ||
320 | |||
321 | /* | ||
322 | * The common case is 1:1 IRQ<->pin mappings. Sometimes there are | ||
323 | * shared ISA-space IRQs, so we have to support them. We are super | ||
324 | * fast in the common case, and fast for shared ISA-space IRQs. | ||
325 | */ | ||
326 | static void add_pin_to_irq(unsigned int irq, int apic, int pin) | ||
327 | { | ||
328 | static int first_free_entry = NR_IRQS; | ||
329 | struct irq_pin_list *entry = irq_2_pin + irq; | ||
330 | |||
331 | BUG_ON(irq >= NR_IRQS); | ||
332 | while (entry->next) | ||
333 | entry = irq_2_pin + entry->next; | ||
334 | |||
335 | if (entry->pin != -1) { | ||
336 | entry->next = first_free_entry; | ||
337 | entry = irq_2_pin + entry->next; | ||
338 | if (++first_free_entry >= PIN_MAP_SIZE) | ||
339 | panic("io_apic.c: ran out of irq_2_pin entries!"); | ||
340 | } | ||
341 | entry->apic = apic; | ||
342 | entry->pin = pin; | ||
343 | } | ||
344 | |||
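/*
 * Compressed user-space sketch (toy sizes and values, not the kernel's) of
 * the scheme used by irq_2_pin and add_pin_to_irq() above: the first
 * NR_IRQS slots are per-IRQ heads, and additional pins for a shared IRQ
 * are chained through .next into the spill area beyond NR_IRQS.
 */
#include <stdio.h>

#define TOY_NR_IRQS 4				/* toy size, not NR_IRQS */
#define TOY_PIN_MAP_SIZE (2 * TOY_NR_IRQS)

static struct { short apic, pin, next; } toy_pins[TOY_PIN_MAP_SIZE];

int main(void)
{
	int i, first_free = TOY_NR_IRQS;

	for (i = 0; i < TOY_PIN_MAP_SIZE; i++)
		toy_pins[i].pin = -1;

	/* IRQ 2 gets pin 3 of apic 0, then a second (shared) pin 7 of apic 1 */
	toy_pins[2].apic = 0; toy_pins[2].pin = 3;
	toy_pins[2].next = first_free;
	toy_pins[first_free].apic = 1; toy_pins[first_free].pin = 7;

	/* walk the chain the same way the for(;;) loops above do */
	for (i = 2; ; i = toy_pins[i].next) {
		printf("IRQ 2 -> apic %d pin %d\n",
		       toy_pins[i].apic, toy_pins[i].pin);
		if (!toy_pins[i].next)
			break;
	}
	return 0;
}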
345 | |||
346 | #define DO_ACTION(name,R,ACTION, FINAL) \ | ||
347 | \ | ||
348 | static void name##_IO_APIC_irq (unsigned int irq) \ | ||
349 | __DO_ACTION(R, ACTION, FINAL) | ||
350 | |||
351 | DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) | ||
352 | /* mask = 1 */ | ||
353 | DO_ACTION( __unmask, 0, &= 0xfffeffff, ) | ||
354 | /* mask = 0 */ | ||
355 | |||
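/*
 * For readability, this is roughly the function body the preprocessor
 * generates from the DO_ACTION(__mask, ...) line above once __DO_ACTION is
 * substituted (an illustrative expansion spelled out here, not extra code):
 */
static void __mask_IO_APIC_irq (unsigned int irq)
{
	int pin;
	struct irq_pin_list *entry = irq_2_pin + irq;

	BUG_ON(irq >= NR_IRQS);
	for (;;) {
		unsigned int reg;
		pin = entry->pin;
		if (pin == -1)
			break;
		reg = io_apic_read(entry->apic, 0x10 + 0 + pin*2);
		reg |= 0x00010000;		/* ACTION: set the mask bit */
		io_apic_modify(entry->apic, reg);
		io_apic_sync(entry->apic);	/* FINAL: flush the write */
		if (!entry->next)
			break;
		entry = irq_2_pin + entry->next;
	}
}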
356 | static void mask_IO_APIC_irq (unsigned int irq) | ||
357 | { | ||
358 | unsigned long flags; | ||
359 | |||
360 | spin_lock_irqsave(&ioapic_lock, flags); | ||
361 | __mask_IO_APIC_irq(irq); | ||
362 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
363 | } | ||
364 | |||
365 | static void unmask_IO_APIC_irq (unsigned int irq) | ||
366 | { | ||
367 | unsigned long flags; | ||
368 | |||
369 | spin_lock_irqsave(&ioapic_lock, flags); | ||
370 | __unmask_IO_APIC_irq(irq); | ||
371 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
372 | } | ||
373 | |||
374 | static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) | ||
375 | { | ||
376 | struct IO_APIC_route_entry entry; | ||
377 | |||
378 | /* Check delivery_mode to be sure we're not clearing an SMI pin */ | ||
379 | entry = ioapic_read_entry(apic, pin); | ||
380 | if (entry.delivery_mode == dest_SMI) | ||
381 | return; | ||
382 | /* | ||
383 | * Disable it in the IO-APIC irq-routing table: | ||
384 | */ | ||
385 | ioapic_mask_entry(apic, pin); | ||
386 | } | ||
387 | |||
388 | static void clear_IO_APIC (void) | ||
389 | { | ||
390 | int apic, pin; | ||
391 | |||
392 | for (apic = 0; apic < nr_ioapics; apic++) | ||
393 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) | ||
394 | clear_IO_APIC_pin(apic, pin); | ||
395 | } | ||
396 | |||
397 | int skip_ioapic_setup; | ||
398 | int ioapic_force; | ||
399 | |||
400 | static int __init parse_noapic(char *str) | ||
401 | { | ||
402 | disable_ioapic_setup(); | ||
403 | return 0; | ||
404 | } | ||
405 | early_param("noapic", parse_noapic); | ||
406 | |||
407 | /* Actually the next is obsolete, but keep it for paranoid reasons -AK */ | ||
408 | static int __init disable_timer_pin_setup(char *arg) | ||
409 | { | ||
410 | disable_timer_pin_1 = 1; | ||
411 | return 1; | ||
412 | } | ||
413 | __setup("disable_timer_pin_1", disable_timer_pin_setup); | ||
414 | |||
415 | static int __init setup_disable_8254_timer(char *s) | ||
416 | { | ||
417 | timer_over_8254 = -1; | ||
418 | return 1; | ||
419 | } | ||
420 | static int __init setup_enable_8254_timer(char *s) | ||
421 | { | ||
422 | timer_over_8254 = 2; | ||
423 | return 1; | ||
424 | } | ||
425 | |||
426 | __setup("disable_8254_timer", setup_disable_8254_timer); | ||
427 | __setup("enable_8254_timer", setup_enable_8254_timer); | ||
428 | |||
429 | |||
430 | /* | ||
431 | * Find the IRQ entry number of a certain pin. | ||
432 | */ | ||
433 | static int find_irq_entry(int apic, int pin, int type) | ||
434 | { | ||
435 | int i; | ||
436 | |||
437 | for (i = 0; i < mp_irq_entries; i++) | ||
438 | if (mp_irqs[i].mpc_irqtype == type && | ||
439 | (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || | ||
440 | mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && | ||
441 | mp_irqs[i].mpc_dstirq == pin) | ||
442 | return i; | ||
443 | |||
444 | return -1; | ||
445 | } | ||
446 | |||
447 | /* | ||
448 | * Find the pin to which IRQ[irq] (ISA) is connected | ||
449 | */ | ||
450 | static int __init find_isa_irq_pin(int irq, int type) | ||
451 | { | ||
452 | int i; | ||
453 | |||
454 | for (i = 0; i < mp_irq_entries; i++) { | ||
455 | int lbus = mp_irqs[i].mpc_srcbus; | ||
456 | |||
457 | if (test_bit(lbus, mp_bus_not_pci) && | ||
458 | (mp_irqs[i].mpc_irqtype == type) && | ||
459 | (mp_irqs[i].mpc_srcbusirq == irq)) | ||
460 | |||
461 | return mp_irqs[i].mpc_dstirq; | ||
462 | } | ||
463 | return -1; | ||
464 | } | ||
465 | |||
466 | static int __init find_isa_irq_apic(int irq, int type) | ||
467 | { | ||
468 | int i; | ||
469 | |||
470 | for (i = 0; i < mp_irq_entries; i++) { | ||
471 | int lbus = mp_irqs[i].mpc_srcbus; | ||
472 | |||
473 | if (test_bit(lbus, mp_bus_not_pci) && | ||
474 | (mp_irqs[i].mpc_irqtype == type) && | ||
475 | (mp_irqs[i].mpc_srcbusirq == irq)) | ||
476 | break; | ||
477 | } | ||
478 | if (i < mp_irq_entries) { | ||
479 | int apic; | ||
480 | for(apic = 0; apic < nr_ioapics; apic++) { | ||
481 | if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) | ||
482 | return apic; | ||
483 | } | ||
484 | } | ||
485 | |||
486 | return -1; | ||
487 | } | ||
488 | |||
489 | /* | ||
490 | * Find a specific PCI IRQ entry. | ||
491 | * Not an __init, possibly needed by modules | ||
492 | */ | ||
493 | static int pin_2_irq(int idx, int apic, int pin); | ||
494 | |||
495 | int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) | ||
496 | { | ||
497 | int apic, i, best_guess = -1; | ||
498 | |||
499 | apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", | ||
500 | bus, slot, pin); | ||
501 | if (mp_bus_id_to_pci_bus[bus] == -1) { | ||
502 | apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); | ||
503 | return -1; | ||
504 | } | ||
505 | for (i = 0; i < mp_irq_entries; i++) { | ||
506 | int lbus = mp_irqs[i].mpc_srcbus; | ||
507 | |||
508 | for (apic = 0; apic < nr_ioapics; apic++) | ||
509 | if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || | ||
510 | mp_irqs[i].mpc_dstapic == MP_APIC_ALL) | ||
511 | break; | ||
512 | |||
513 | if (!test_bit(lbus, mp_bus_not_pci) && | ||
514 | !mp_irqs[i].mpc_irqtype && | ||
515 | (bus == lbus) && | ||
516 | (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { | ||
517 | int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); | ||
518 | |||
519 | if (!(apic || IO_APIC_IRQ(irq))) | ||
520 | continue; | ||
521 | |||
522 | if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) | ||
523 | return irq; | ||
524 | /* | ||
525 | * Use the first all-but-pin matching entry as a | ||
526 | * best-guess fuzzy result for broken mptables. | ||
527 | */ | ||
528 | if (best_guess < 0) | ||
529 | best_guess = irq; | ||
530 | } | ||
531 | } | ||
532 | BUG_ON(best_guess >= NR_IRQS); | ||
533 | return best_guess; | ||
534 | } | ||
535 | |||
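/*
 * Stand-alone sketch (hypothetical value, user-space only) of the
 * mpc_srcbusirq encoding relied on above for PCI sources: the device/slot
 * number sits in bits 2-6 and the interrupt pin (INTA#..INTD# as 0..3) in
 * bits 0-1.
 */
#include <stdio.h>

int main(void)
{
	unsigned int srcbusirq = (3 << 2) | 2;	/* hypothetical: slot 3, INTC# */
	unsigned int slot = (srcbusirq >> 2) & 0x1f;
	unsigned int pin  = srcbusirq & 3;

	printf("slot=%u pin=%u\n", slot, pin);	/* prints "slot=3 pin=2" */
	return 0;
}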
536 | /* ISA interrupts are always polarity zero edge triggered, | ||
537 | * when listed as conforming in the MP table. */ | ||
538 | |||
539 | #define default_ISA_trigger(idx) (0) | ||
540 | #define default_ISA_polarity(idx) (0) | ||
541 | |||
542 | /* PCI interrupts are always polarity one level triggered, | ||
543 | * when listed as conforming in the MP table. */ | ||
544 | |||
545 | #define default_PCI_trigger(idx) (1) | ||
546 | #define default_PCI_polarity(idx) (1) | ||
547 | |||
548 | static int __init MPBIOS_polarity(int idx) | ||
549 | { | ||
550 | int bus = mp_irqs[idx].mpc_srcbus; | ||
551 | int polarity; | ||
552 | |||
553 | /* | ||
554 | * Determine IRQ line polarity (high active or low active): | ||
555 | */ | ||
556 | switch (mp_irqs[idx].mpc_irqflag & 3) | ||
557 | { | ||
558 | case 0: /* conforms, i.e. bus-type dependent polarity */ | ||
559 | if (test_bit(bus, mp_bus_not_pci)) | ||
560 | polarity = default_ISA_polarity(idx); | ||
561 | else | ||
562 | polarity = default_PCI_polarity(idx); | ||
563 | break; | ||
564 | case 1: /* high active */ | ||
565 | { | ||
566 | polarity = 0; | ||
567 | break; | ||
568 | } | ||
569 | case 2: /* reserved */ | ||
570 | { | ||
571 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
572 | polarity = 1; | ||
573 | break; | ||
574 | } | ||
575 | case 3: /* low active */ | ||
576 | { | ||
577 | polarity = 1; | ||
578 | break; | ||
579 | } | ||
580 | default: /* invalid */ | ||
581 | { | ||
582 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
583 | polarity = 1; | ||
584 | break; | ||
585 | } | ||
586 | } | ||
587 | return polarity; | ||
588 | } | ||
589 | |||
590 | static int MPBIOS_trigger(int idx) | ||
591 | { | ||
592 | int bus = mp_irqs[idx].mpc_srcbus; | ||
593 | int trigger; | ||
594 | |||
595 | /* | ||
596 | * Determine IRQ trigger mode (edge or level sensitive): | ||
597 | */ | ||
598 | switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) | ||
599 | { | ||
600 | case 0: /* conforms, i.e. bus-type dependent */ | ||
601 | if (test_bit(bus, mp_bus_not_pci)) | ||
602 | trigger = default_ISA_trigger(idx); | ||
603 | else | ||
604 | trigger = default_PCI_trigger(idx); | ||
605 | break; | ||
606 | case 1: /* edge */ | ||
607 | { | ||
608 | trigger = 0; | ||
609 | break; | ||
610 | } | ||
611 | case 2: /* reserved */ | ||
612 | { | ||
613 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
614 | trigger = 1; | ||
615 | break; | ||
616 | } | ||
617 | case 3: /* level */ | ||
618 | { | ||
619 | trigger = 1; | ||
620 | break; | ||
621 | } | ||
622 | default: /* invalid */ | ||
623 | { | ||
624 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
625 | trigger = 0; | ||
626 | break; | ||
627 | } | ||
628 | } | ||
629 | return trigger; | ||
630 | } | ||
631 | |||
632 | static inline int irq_polarity(int idx) | ||
633 | { | ||
634 | return MPBIOS_polarity(idx); | ||
635 | } | ||
636 | |||
637 | static inline int irq_trigger(int idx) | ||
638 | { | ||
639 | return MPBIOS_trigger(idx); | ||
640 | } | ||
641 | |||
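/*
 * Tiny stand-alone sketch (hypothetical flag value) of the mpc_irqflag
 * decoding performed by MPBIOS_polarity() and MPBIOS_trigger() above:
 * bits 0-1 select the polarity case, bits 2-3 the trigger case.
 */
#include <stdio.h>

int main(void)
{
	unsigned int mpc_irqflag = 0x0d;	/* hypothetical: binary 1101 */
	unsigned int polarity_case = mpc_irqflag & 3;		/* 1 -> high active */
	unsigned int trigger_case  = (mpc_irqflag >> 2) & 3;	/* 3 -> level */

	printf("polarity case %u, trigger case %u\n",
	       polarity_case, trigger_case);
	return 0;
}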
642 | static int pin_2_irq(int idx, int apic, int pin) | ||
643 | { | ||
644 | int irq, i; | ||
645 | int bus = mp_irqs[idx].mpc_srcbus; | ||
646 | |||
647 | /* | ||
648 | * Debugging check, we are in big trouble if this message pops up! | ||
649 | */ | ||
650 | if (mp_irqs[idx].mpc_dstirq != pin) | ||
651 | printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); | ||
652 | |||
653 | if (test_bit(bus, mp_bus_not_pci)) { | ||
654 | irq = mp_irqs[idx].mpc_srcbusirq; | ||
655 | } else { | ||
656 | /* | ||
657 | * PCI IRQs are mapped in order | ||
658 | */ | ||
659 | i = irq = 0; | ||
660 | while (i < apic) | ||
661 | irq += nr_ioapic_registers[i++]; | ||
662 | irq += pin; | ||
663 | } | ||
664 | BUG_ON(irq >= NR_IRQS); | ||
665 | return irq; | ||
666 | } | ||
667 | |||
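/*
 * Minimal sketch (made-up pin counts, not from a real machine) of the
 * cumulative numbering pin_2_irq() uses for PCI pins: the resulting IRQ is
 * the pin index offset by the total pin count of all lower-numbered
 * IO-APICs.
 */
#include <stdio.h>

int main(void)
{
	int nr_pins[2] = { 24, 24 };	/* hypothetical: two 24-pin IO-APICs */
	int apic = 1, pin = 5;		/* pin 5 on the second IO-APIC */
	int i = 0, irq = 0;

	while (i < apic)
		irq += nr_pins[i++];
	irq += pin;			/* 24 + 5 = 29 */

	printf("irq=%d\n", irq);
	return 0;
}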
668 | static int __assign_irq_vector(int irq, cpumask_t mask) | ||
669 | { | ||
670 | /* | ||
671 | * NOTE! The local APIC isn't very good at handling | ||
672 | * multiple interrupts at the same interrupt level. | ||
673 | * As the interrupt level is determined by taking the | ||
674 | * vector number and shifting that right by 4, we | ||
675 | * want to spread these out a bit so that they don't | ||
676 | * all fall in the same interrupt level. | ||
677 | * | ||
678 | * Also, we've got to be careful not to trash gate | ||
679 | * 0x80, because int 0x80 is hm, kind of importantish. ;) | ||
680 | */ | ||
681 | static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; | ||
682 | unsigned int old_vector; | ||
683 | int cpu; | ||
684 | struct irq_cfg *cfg; | ||
685 | |||
686 | BUG_ON((unsigned)irq >= NR_IRQS); | ||
687 | cfg = &irq_cfg[irq]; | ||
688 | |||
689 | /* Only try and allocate irqs on cpus that are present */ | ||
690 | cpus_and(mask, mask, cpu_online_map); | ||
691 | |||
692 | if ((cfg->move_in_progress) || cfg->move_cleanup_count) | ||
693 | return -EBUSY; | ||
694 | |||
695 | old_vector = cfg->vector; | ||
696 | if (old_vector) { | ||
697 | cpumask_t tmp; | ||
698 | cpus_and(tmp, cfg->domain, mask); | ||
699 | if (!cpus_empty(tmp)) | ||
700 | return 0; | ||
701 | } | ||
702 | |||
703 | for_each_cpu_mask(cpu, mask) { | ||
704 | cpumask_t domain, new_mask; | ||
705 | int new_cpu; | ||
706 | int vector, offset; | ||
707 | |||
708 | domain = vector_allocation_domain(cpu); | ||
709 | cpus_and(new_mask, domain, cpu_online_map); | ||
710 | |||
711 | vector = current_vector; | ||
712 | offset = current_offset; | ||
713 | next: | ||
714 | vector += 8; | ||
715 | if (vector >= FIRST_SYSTEM_VECTOR) { | ||
716 | /* If we run out of vectors on large boxen, we must share them. */ | ||
717 | offset = (offset + 1) % 8; | ||
718 | vector = FIRST_DEVICE_VECTOR + offset; | ||
719 | } | ||
720 | if (unlikely(current_vector == vector)) | ||
721 | continue; | ||
722 | if (vector == IA32_SYSCALL_VECTOR) | ||
723 | goto next; | ||
724 | for_each_cpu_mask(new_cpu, new_mask) | ||
725 | if (per_cpu(vector_irq, new_cpu)[vector] != -1) | ||
726 | goto next; | ||
727 | /* Found one! */ | ||
728 | current_vector = vector; | ||
729 | current_offset = offset; | ||
730 | if (old_vector) { | ||
731 | cfg->move_in_progress = 1; | ||
732 | cfg->old_domain = cfg->domain; | ||
733 | } | ||
734 | for_each_cpu_mask(new_cpu, new_mask) | ||
735 | per_cpu(vector_irq, new_cpu)[vector] = irq; | ||
736 | cfg->vector = vector; | ||
737 | cfg->domain = domain; | ||
738 | return 0; | ||
739 | } | ||
740 | return -ENOSPC; | ||
741 | } | ||
742 | |||
743 | static int assign_irq_vector(int irq, cpumask_t mask) | ||
744 | { | ||
745 | int err; | ||
746 | unsigned long flags; | ||
747 | |||
748 | spin_lock_irqsave(&vector_lock, flags); | ||
749 | err = __assign_irq_vector(irq, mask); | ||
750 | spin_unlock_irqrestore(&vector_lock, flags); | ||
751 | return err; | ||
752 | } | ||
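/*
 * Stand-alone sketch of the vector-stepping idea in __assign_irq_vector():
 * advancing candidates by 8 spreads successive allocations across priority
 * levels (level = vector >> 4) instead of filling one 16-vector level
 * first, and the offset rotates when a candidate runs past the system
 * vector range. The two limit constants below are placeholders, not the
 * kernel's values.
 */
#include <stdio.h>

#define FAKE_FIRST_DEVICE_VECTOR 0x31	/* placeholder, not the kernel constant */
#define FAKE_FIRST_SYSTEM_VECTOR 0xef	/* placeholder, not the kernel constant */

int main(void)
{
	int vector = FAKE_FIRST_DEVICE_VECTOR, offset = 0;
	int n;

	for (n = 0; n < 6; n++) {
		vector += 8;
		if (vector >= FAKE_FIRST_SYSTEM_VECTOR) {
			/* ran out of vectors: rotate the offset, start over */
			offset = (offset + 1) % 8;
			vector = FAKE_FIRST_DEVICE_VECTOR + offset;
		}
		printf("candidate 0x%02x -> priority level %d\n",
		       vector, vector >> 4);
	}
	return 0;
}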
753 | |||
754 | static void __clear_irq_vector(int irq) | ||
755 | { | ||
756 | struct irq_cfg *cfg; | ||
757 | cpumask_t mask; | ||
758 | int cpu, vector; | ||
759 | |||
760 | BUG_ON((unsigned)irq >= NR_IRQS); | ||
761 | cfg = &irq_cfg[irq]; | ||
762 | BUG_ON(!cfg->vector); | ||
763 | |||
764 | vector = cfg->vector; | ||
765 | cpus_and(mask, cfg->domain, cpu_online_map); | ||
766 | for_each_cpu_mask(cpu, mask) | ||
767 | per_cpu(vector_irq, cpu)[vector] = -1; | ||
768 | |||
769 | cfg->vector = 0; | ||
770 | cfg->domain = CPU_MASK_NONE; | ||
771 | } | ||
772 | |||
773 | void __setup_vector_irq(int cpu) | ||
774 | { | ||
775 | /* Initialize vector_irq on a new cpu */ | ||
776 | /* This function must be called with vector_lock held */ | ||
777 | int irq, vector; | ||
778 | |||
779 | /* Mark the inuse vectors */ | ||
780 | for (irq = 0; irq < NR_IRQS; ++irq) { | ||
781 | if (!cpu_isset(cpu, irq_cfg[irq].domain)) | ||
782 | continue; | ||
783 | vector = irq_cfg[irq].vector; | ||
784 | per_cpu(vector_irq, cpu)[vector] = irq; | ||
785 | } | ||
786 | /* Mark the free vectors */ | ||
787 | for (vector = 0; vector < NR_VECTORS; ++vector) { | ||
788 | irq = per_cpu(vector_irq, cpu)[vector]; | ||
789 | if (irq < 0) | ||
790 | continue; | ||
791 | if (!cpu_isset(cpu, irq_cfg[irq].domain)) | ||
792 | per_cpu(vector_irq, cpu)[vector] = -1; | ||
793 | } | ||
794 | } | ||
795 | |||
796 | |||
797 | static struct irq_chip ioapic_chip; | ||
798 | |||
799 | static void ioapic_register_intr(int irq, unsigned long trigger) | ||
800 | { | ||
801 | if (trigger) { | ||
802 | irq_desc[irq].status |= IRQ_LEVEL; | ||
803 | set_irq_chip_and_handler_name(irq, &ioapic_chip, | ||
804 | handle_fasteoi_irq, "fasteoi"); | ||
805 | } else { | ||
806 | irq_desc[irq].status &= ~IRQ_LEVEL; | ||
807 | set_irq_chip_and_handler_name(irq, &ioapic_chip, | ||
808 | handle_edge_irq, "edge"); | ||
809 | } | ||
810 | } | ||
811 | |||
812 | static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, | ||
813 | int trigger, int polarity) | ||
814 | { | ||
815 | struct irq_cfg *cfg = irq_cfg + irq; | ||
816 | struct IO_APIC_route_entry entry; | ||
817 | cpumask_t mask; | ||
818 | |||
819 | if (!IO_APIC_IRQ(irq)) | ||
820 | return; | ||
821 | |||
822 | mask = TARGET_CPUS; | ||
823 | if (assign_irq_vector(irq, mask)) | ||
824 | return; | ||
825 | |||
826 | cpus_and(mask, cfg->domain, mask); | ||
827 | |||
828 | apic_printk(APIC_VERBOSE,KERN_DEBUG | ||
829 | "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " | ||
830 | "IRQ %d Mode:%i Active:%i)\n", | ||
831 | apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector, | ||
832 | irq, trigger, polarity); | ||
833 | |||
834 | /* | ||
835 | * add it to the IO-APIC irq-routing table: | ||
836 | */ | ||
837 | memset(&entry,0,sizeof(entry)); | ||
838 | |||
839 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
840 | entry.dest_mode = INT_DEST_MODE; | ||
841 | entry.dest = cpu_mask_to_apicid(mask); | ||
842 | entry.mask = 0; /* enable IRQ */ | ||
843 | entry.trigger = trigger; | ||
844 | entry.polarity = polarity; | ||
845 | entry.vector = cfg->vector; | ||
846 | |||
847 | /* Mask level triggered irqs. | ||
848 | * Use IRQ_DELAYED_DISABLE for edge triggered irqs. | ||
849 | */ | ||
850 | if (trigger) | ||
851 | entry.mask = 1; | ||
852 | |||
853 | ioapic_register_intr(irq, trigger); | ||
854 | if (irq < 16) | ||
855 | disable_8259A_irq(irq); | ||
856 | |||
857 | ioapic_write_entry(apic, pin, entry); | ||
858 | } | ||
859 | |||
860 | static void __init setup_IO_APIC_irqs(void) | ||
861 | { | ||
862 | int apic, pin, idx, irq, first_notcon = 1; | ||
863 | |||
864 | apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); | ||
865 | |||
866 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
867 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | ||
868 | |||
869 | idx = find_irq_entry(apic,pin,mp_INT); | ||
870 | if (idx == -1) { | ||
871 | if (first_notcon) { | ||
872 | apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); | ||
873 | first_notcon = 0; | ||
874 | } else | ||
875 | apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); | ||
876 | continue; | ||
877 | } | ||
878 | |||
879 | irq = pin_2_irq(idx, apic, pin); | ||
880 | add_pin_to_irq(irq, apic, pin); | ||
881 | |||
882 | setup_IO_APIC_irq(apic, pin, irq, | ||
883 | irq_trigger(idx), irq_polarity(idx)); | ||
884 | } | ||
885 | } | ||
886 | |||
887 | if (!first_notcon) | ||
888 | apic_printk(APIC_VERBOSE," not connected.\n"); | ||
889 | } | ||
890 | |||
891 | /* | ||
892 | * Set up the 8259A-master output pin as broadcast to all | ||
893 | * CPUs. | ||
894 | */ | ||
895 | static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) | ||
896 | { | ||
897 | struct IO_APIC_route_entry entry; | ||
898 | unsigned long flags; | ||
899 | |||
900 | memset(&entry,0,sizeof(entry)); | ||
901 | |||
902 | disable_8259A_irq(0); | ||
903 | |||
904 | /* mask LVT0 */ | ||
905 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | ||
906 | |||
907 | /* | ||
908 | * We use logical delivery to get the timer IRQ | ||
909 | * to the first CPU. | ||
910 | */ | ||
911 | entry.dest_mode = INT_DEST_MODE; | ||
912 | entry.mask = 0; /* unmask IRQ now */ | ||
913 | entry.dest = cpu_mask_to_apicid(TARGET_CPUS); | ||
914 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
915 | entry.polarity = 0; | ||
916 | entry.trigger = 0; | ||
917 | entry.vector = vector; | ||
918 | |||
919 | /* | ||
920 | * The timer IRQ doesn't have to know that behind the | ||
921 | * scenes we have an 8259A master in AEOI mode ... | ||
922 | */ | ||
923 | set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); | ||
924 | |||
925 | /* | ||
926 | * Add it to the IO-APIC irq-routing table: | ||
927 | */ | ||
928 | spin_lock_irqsave(&ioapic_lock, flags); | ||
929 | io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); | ||
930 | io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); | ||
931 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
932 | |||
933 | enable_8259A_irq(0); | ||
934 | } | ||
935 | |||
936 | void __apicdebuginit print_IO_APIC(void) | ||
937 | { | ||
938 | int apic, i; | ||
939 | union IO_APIC_reg_00 reg_00; | ||
940 | union IO_APIC_reg_01 reg_01; | ||
941 | union IO_APIC_reg_02 reg_02; | ||
942 | unsigned long flags; | ||
943 | |||
944 | if (apic_verbosity == APIC_QUIET) | ||
945 | return; | ||
946 | |||
947 | printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); | ||
948 | for (i = 0; i < nr_ioapics; i++) | ||
949 | printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", | ||
950 | mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); | ||
951 | |||
952 | /* | ||
953 | * We are a bit conservative about what we expect. We have to | ||
954 | * know about every hardware change ASAP. | ||
955 | */ | ||
956 | printk(KERN_INFO "testing the IO APIC.......................\n"); | ||
957 | |||
958 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
959 | |||
960 | spin_lock_irqsave(&ioapic_lock, flags); | ||
961 | reg_00.raw = io_apic_read(apic, 0); | ||
962 | reg_01.raw = io_apic_read(apic, 1); | ||
963 | if (reg_01.bits.version >= 0x10) | ||
964 | reg_02.raw = io_apic_read(apic, 2); | ||
965 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
966 | |||
967 | printk("\n"); | ||
968 | printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); | ||
969 | printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); | ||
970 | printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); | ||
971 | |||
972 | printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); | ||
973 | printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); | ||
974 | |||
975 | printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); | ||
976 | printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); | ||
977 | |||
978 | if (reg_01.bits.version >= 0x10) { | ||
979 | printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); | ||
980 | printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); | ||
981 | } | ||
982 | |||
983 | printk(KERN_DEBUG ".... IRQ redirection table:\n"); | ||
984 | |||
985 | printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" | ||
986 | " Stat Dmod Deli Vect: \n"); | ||
987 | |||
988 | for (i = 0; i <= reg_01.bits.entries; i++) { | ||
989 | struct IO_APIC_route_entry entry; | ||
990 | |||
991 | entry = ioapic_read_entry(apic, i); | ||
992 | |||
993 | printk(KERN_DEBUG " %02x %03X ", | ||
994 | i, | ||
995 | entry.dest | ||
996 | ); | ||
997 | |||
998 | printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", | ||
999 | entry.mask, | ||
1000 | entry.trigger, | ||
1001 | entry.irr, | ||
1002 | entry.polarity, | ||
1003 | entry.delivery_status, | ||
1004 | entry.dest_mode, | ||
1005 | entry.delivery_mode, | ||
1006 | entry.vector | ||
1007 | ); | ||
1008 | } | ||
1009 | } | ||
1010 | printk(KERN_DEBUG "IRQ to pin mappings:\n"); | ||
1011 | for (i = 0; i < NR_IRQS; i++) { | ||
1012 | struct irq_pin_list *entry = irq_2_pin + i; | ||
1013 | if (entry->pin < 0) | ||
1014 | continue; | ||
1015 | printk(KERN_DEBUG "IRQ%d ", i); | ||
1016 | for (;;) { | ||
1017 | printk("-> %d:%d", entry->apic, entry->pin); | ||
1018 | if (!entry->next) | ||
1019 | break; | ||
1020 | entry = irq_2_pin + entry->next; | ||
1021 | } | ||
1022 | printk("\n"); | ||
1023 | } | ||
1024 | |||
1025 | printk(KERN_INFO ".................................... done.\n"); | ||
1026 | |||
1027 | return; | ||
1028 | } | ||
1029 | |||
1030 | #if 0 | ||
1031 | |||
1032 | static __apicdebuginit void print_APIC_bitfield (int base) | ||
1033 | { | ||
1034 | unsigned int v; | ||
1035 | int i, j; | ||
1036 | |||
1037 | if (apic_verbosity == APIC_QUIET) | ||
1038 | return; | ||
1039 | |||
1040 | printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); | ||
1041 | for (i = 0; i < 8; i++) { | ||
1042 | v = apic_read(base + i*0x10); | ||
1043 | for (j = 0; j < 32; j++) { | ||
1044 | if (v & (1<<j)) | ||
1045 | printk("1"); | ||
1046 | else | ||
1047 | printk("0"); | ||
1048 | } | ||
1049 | printk("\n"); | ||
1050 | } | ||
1051 | } | ||
1052 | |||
1053 | void __apicdebuginit print_local_APIC(void * dummy) | ||
1054 | { | ||
1055 | unsigned int v, ver, maxlvt; | ||
1056 | |||
1057 | if (apic_verbosity == APIC_QUIET) | ||
1058 | return; | ||
1059 | |||
1060 | printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", | ||
1061 | smp_processor_id(), hard_smp_processor_id()); | ||
1062 | v = apic_read(APIC_ID); | ||
1063 | printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); | ||
1064 | v = apic_read(APIC_LVR); | ||
1065 | printk(KERN_INFO "... APIC VERSION: %08x\n", v); | ||
1066 | ver = GET_APIC_VERSION(v); | ||
1067 | maxlvt = get_maxlvt(); | ||
1068 | |||
1069 | v = apic_read(APIC_TASKPRI); | ||
1070 | printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); | ||
1071 | |||
1072 | v = apic_read(APIC_ARBPRI); | ||
1073 | printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, | ||
1074 | v & APIC_ARBPRI_MASK); | ||
1075 | v = apic_read(APIC_PROCPRI); | ||
1076 | printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); | ||
1077 | |||
1078 | v = apic_read(APIC_EOI); | ||
1079 | printk(KERN_DEBUG "... APIC EOI: %08x\n", v); | ||
1080 | v = apic_read(APIC_RRR); | ||
1081 | printk(KERN_DEBUG "... APIC RRR: %08x\n", v); | ||
1082 | v = apic_read(APIC_LDR); | ||
1083 | printk(KERN_DEBUG "... APIC LDR: %08x\n", v); | ||
1084 | v = apic_read(APIC_DFR); | ||
1085 | printk(KERN_DEBUG "... APIC DFR: %08x\n", v); | ||
1086 | v = apic_read(APIC_SPIV); | ||
1087 | printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); | ||
1088 | |||
1089 | printk(KERN_DEBUG "... APIC ISR field:\n"); | ||
1090 | print_APIC_bitfield(APIC_ISR); | ||
1091 | printk(KERN_DEBUG "... APIC TMR field:\n"); | ||
1092 | print_APIC_bitfield(APIC_TMR); | ||
1093 | printk(KERN_DEBUG "... APIC IRR field:\n"); | ||
1094 | print_APIC_bitfield(APIC_IRR); | ||
1095 | |||
1096 | v = apic_read(APIC_ESR); | ||
1097 | printk(KERN_DEBUG "... APIC ESR: %08x\n", v); | ||
1098 | |||
1099 | v = apic_read(APIC_ICR); | ||
1100 | printk(KERN_DEBUG "... APIC ICR: %08x\n", v); | ||
1101 | v = apic_read(APIC_ICR2); | ||
1102 | printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); | ||
1103 | |||
1104 | v = apic_read(APIC_LVTT); | ||
1105 | printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); | ||
1106 | |||
1107 | if (maxlvt > 3) { /* PC is LVT#4. */ | ||
1108 | v = apic_read(APIC_LVTPC); | ||
1109 | printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); | ||
1110 | } | ||
1111 | v = apic_read(APIC_LVT0); | ||
1112 | printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); | ||
1113 | v = apic_read(APIC_LVT1); | ||
1114 | printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); | ||
1115 | |||
1116 | if (maxlvt > 2) { /* ERR is LVT#3. */ | ||
1117 | v = apic_read(APIC_LVTERR); | ||
1118 | printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); | ||
1119 | } | ||
1120 | |||
1121 | v = apic_read(APIC_TMICT); | ||
1122 | printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); | ||
1123 | v = apic_read(APIC_TMCCT); | ||
1124 | printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); | ||
1125 | v = apic_read(APIC_TDCR); | ||
1126 | printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); | ||
1127 | printk("\n"); | ||
1128 | } | ||
1129 | |||
1130 | void print_all_local_APICs (void) | ||
1131 | { | ||
1132 | on_each_cpu(print_local_APIC, NULL, 1, 1); | ||
1133 | } | ||
1134 | |||
1135 | void __apicdebuginit print_PIC(void) | ||
1136 | { | ||
1137 | unsigned int v; | ||
1138 | unsigned long flags; | ||
1139 | |||
1140 | if (apic_verbosity == APIC_QUIET) | ||
1141 | return; | ||
1142 | |||
1143 | printk(KERN_DEBUG "\nprinting PIC contents\n"); | ||
1144 | |||
1145 | spin_lock_irqsave(&i8259A_lock, flags); | ||
1146 | |||
1147 | v = inb(0xa1) << 8 | inb(0x21); | ||
1148 | printk(KERN_DEBUG "... PIC IMR: %04x\n", v); | ||
1149 | |||
1150 | v = inb(0xa0) << 8 | inb(0x20); | ||
1151 | printk(KERN_DEBUG "... PIC IRR: %04x\n", v); | ||
1152 | |||
1153 | outb(0x0b,0xa0); | ||
1154 | outb(0x0b,0x20); | ||
1155 | v = inb(0xa0) << 8 | inb(0x20); | ||
1156 | outb(0x0a,0xa0); | ||
1157 | outb(0x0a,0x20); | ||
1158 | |||
1159 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
1160 | |||
1161 | printk(KERN_DEBUG "... PIC ISR: %04x\n", v); | ||
1162 | |||
1163 | v = inb(0x4d1) << 8 | inb(0x4d0); | ||
1164 | printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); | ||
1165 | } | ||
1166 | |||
1167 | #endif /* 0 */ | ||
1168 | |||
1169 | static void __init enable_IO_APIC(void) | ||
1170 | { | ||
1171 | union IO_APIC_reg_01 reg_01; | ||
1172 | int i8259_apic, i8259_pin; | ||
1173 | int i, apic; | ||
1174 | unsigned long flags; | ||
1175 | |||
1176 | for (i = 0; i < PIN_MAP_SIZE; i++) { | ||
1177 | irq_2_pin[i].pin = -1; | ||
1178 | irq_2_pin[i].next = 0; | ||
1179 | } | ||
1180 | |||
1181 | /* | ||
1182 | * The number of IO-APIC IRQ registers (== #pins): | ||
1183 | */ | ||
1184 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1185 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1186 | reg_01.raw = io_apic_read(apic, 1); | ||
1187 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1188 | nr_ioapic_registers[apic] = reg_01.bits.entries+1; | ||
1189 | } | ||
1190 | for(apic = 0; apic < nr_ioapics; apic++) { | ||
1191 | int pin; | ||
1192 | /* See if any of the pins is in ExtINT mode */ | ||
1193 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | ||
1194 | struct IO_APIC_route_entry entry; | ||
1195 | entry = ioapic_read_entry(apic, pin); | ||
1196 | |||
1197 | /* If the interrupt line is enabled and in ExtINT mode, | ||
1198 | * we have found the pin where the i8259 is connected. | ||
1199 | */ | ||
1200 | if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { | ||
1201 | ioapic_i8259.apic = apic; | ||
1202 | ioapic_i8259.pin = pin; | ||
1203 | goto found_i8259; | ||
1204 | } | ||
1205 | } | ||
1206 | } | ||
1207 | found_i8259: | ||
1208 | /* Look to see if the MP table has reported the ExtINT */ | ||
1209 | i8259_pin = find_isa_irq_pin(0, mp_ExtINT); | ||
1210 | i8259_apic = find_isa_irq_apic(0, mp_ExtINT); | ||
1211 | /* Trust the MP table if nothing is set up in the hardware */ | ||
1212 | if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { | ||
1213 | printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); | ||
1214 | ioapic_i8259.pin = i8259_pin; | ||
1215 | ioapic_i8259.apic = i8259_apic; | ||
1216 | } | ||
1217 | /* Complain if the MP table and the hardware disagree */ | ||
1218 | if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && | ||
1219 | (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) | ||
1220 | { | ||
1221 | printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); | ||
1222 | } | ||
1223 | |||
1224 | /* | ||
1225 | * Do not trust the IO-APIC to be empty at bootup | ||
1226 | */ | ||
1227 | clear_IO_APIC(); | ||
1228 | } | ||
1229 | |||
1230 | /* | ||
1231 | * Not an __init, needed by the reboot code | ||
1232 | */ | ||
1233 | void disable_IO_APIC(void) | ||
1234 | { | ||
1235 | /* | ||
1236 | * Clear the IO-APIC before rebooting: | ||
1237 | */ | ||
1238 | clear_IO_APIC(); | ||
1239 | |||
1240 | /* | ||
1241 | * If the i8259 is routed through an IOAPIC, | ||
1242 | * put that IOAPIC in virtual wire mode | ||
1243 | * so legacy interrupts can be delivered. | ||
1244 | */ | ||
1245 | if (ioapic_i8259.pin != -1) { | ||
1246 | struct IO_APIC_route_entry entry; | ||
1247 | |||
1248 | memset(&entry, 0, sizeof(entry)); | ||
1249 | entry.mask = 0; /* Enabled */ | ||
1250 | entry.trigger = 0; /* Edge */ | ||
1251 | entry.irr = 0; | ||
1252 | entry.polarity = 0; /* High */ | ||
1253 | entry.delivery_status = 0; | ||
1254 | entry.dest_mode = 0; /* Physical */ | ||
1255 | entry.delivery_mode = dest_ExtINT; /* ExtInt */ | ||
1256 | entry.vector = 0; | ||
1257 | entry.dest = GET_APIC_ID(apic_read(APIC_ID)); | ||
1258 | |||
1259 | /* | ||
1260 | * Add it to the IO-APIC irq-routing table: | ||
1261 | */ | ||
1262 | ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); | ||
1263 | } | ||
1264 | |||
1265 | disconnect_bsp_APIC(ioapic_i8259.pin != -1); | ||
1266 | } | ||
1267 | |||
1268 | /* | ||
1269 | * There is a nasty bug in some older SMP boards: their mptable lies | ||
1270 | * about the timer IRQ. We do the following to work around the situation: | ||
1271 | * | ||
1272 | * - timer IRQ defaults to IO-APIC IRQ | ||
1273 | * - if this function detects that timer IRQs are defunct, then we fall | ||
1274 | * back to ISA timer IRQs | ||
1275 | */ | ||
1276 | static int __init timer_irq_works(void) | ||
1277 | { | ||
1278 | unsigned long t1 = jiffies; | ||
1279 | |||
1280 | local_irq_enable(); | ||
1281 | /* Let ten ticks pass... */ | ||
1282 | mdelay((10 * 1000) / HZ); | ||
1283 | |||
1284 | /* | ||
1285 | * Expect a few ticks at least, to be sure some possible | ||
1286 | * glue logic does not lock up after the first one or two | ||
1287 | * ticks in non-ExtINT mode. Also the local APIC | ||
1288 | * might have cached one ExtINT interrupt. Finally, at | ||
1289 | * least one tick may be lost due to delays. | ||
1290 | */ | ||
1291 | |||
1292 | /* jiffies wrap? */ | ||
1293 | if (jiffies - t1 > 4) | ||
1294 | return 1; | ||
1295 | return 0; | ||
1296 | } | ||
1297 | |||
1298 | /* | ||
1299 | * In the SMP+IOAPIC case it might happen that there are an unspecified | ||
1300 | * number of pending IRQ events unhandled. These cases are very rare, | ||
1301 | * so we 'resend' these IRQs via IPIs, to the same CPU. It's much | ||
1302 | * better to do it this way because then we do not have to be aware of | ||
1303 | * 'pending' interrupts in the IRQ path, except at this point. | ||
1304 | */ | ||
1305 | /* | ||
1306 | * Edge-triggered irqs need to resend any interrupt that | ||
1307 | * was delayed, but this is now handled in the | ||
1308 | * device-independent code. | ||
1309 | */ | ||
1310 | |||
1311 | /* | ||
1312 | * Starting up an edge-triggered IO-APIC interrupt is | ||
1313 | * nasty - we need to make sure that we get the edge. | ||
1314 | * If it is already asserted for some reason, we need to | ||
1315 | * return 1 to indicate that it was pending. | ||
1316 | * | ||
1317 | * This is not complete - we should be able to fake | ||
1318 | * an edge even if it isn't on the 8259A... | ||
1319 | */ | ||
1320 | |||
1321 | static unsigned int startup_ioapic_irq(unsigned int irq) | ||
1322 | { | ||
1323 | int was_pending = 0; | ||
1324 | unsigned long flags; | ||
1325 | |||
1326 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1327 | if (irq < 16) { | ||
1328 | disable_8259A_irq(irq); | ||
1329 | if (i8259A_irq_pending(irq)) | ||
1330 | was_pending = 1; | ||
1331 | } | ||
1332 | __unmask_IO_APIC_irq(irq); | ||
1333 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1334 | |||
1335 | return was_pending; | ||
1336 | } | ||
1337 | |||
1338 | static int ioapic_retrigger_irq(unsigned int irq) | ||
1339 | { | ||
1340 | struct irq_cfg *cfg = &irq_cfg[irq]; | ||
1341 | cpumask_t mask; | ||
1342 | unsigned long flags; | ||
1343 | |||
1344 | spin_lock_irqsave(&vector_lock, flags); | ||
1345 | cpus_clear(mask); | ||
1346 | cpu_set(first_cpu(cfg->domain), mask); | ||
1347 | |||
1348 | send_IPI_mask(mask, cfg->vector); | ||
1349 | spin_unlock_irqrestore(&vector_lock, flags); | ||
1350 | |||
1351 | return 1; | ||
1352 | } | ||
1353 | |||
1354 | /* | ||
1355 | * Level and edge triggered IO-APIC interrupts need different handling, | ||
1356 | * so we use two separate IRQ descriptors. Edge triggered IRQs can be | ||
1357 | * handled with the level-triggered descriptor, but that one has slightly | ||
1358 | * more overhead. Level-triggered interrupts cannot be handled with the | ||
1359 | * edge-triggered handler, without risking IRQ storms and other ugly | ||
1360 | * races. | ||
1361 | */ | ||
1362 | |||
1363 | #ifdef CONFIG_SMP | ||
1364 | asmlinkage void smp_irq_move_cleanup_interrupt(void) | ||
1365 | { | ||
1366 | unsigned vector, me; | ||
1367 | ack_APIC_irq(); | ||
1368 | exit_idle(); | ||
1369 | irq_enter(); | ||
1370 | |||
1371 | me = smp_processor_id(); | ||
1372 | for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { | ||
1373 | unsigned int irq; | ||
1374 | struct irq_desc *desc; | ||
1375 | struct irq_cfg *cfg; | ||
1376 | irq = __get_cpu_var(vector_irq)[vector]; | ||
1377 | if (irq >= NR_IRQS) | ||
1378 | continue; | ||
1379 | |||
1380 | desc = irq_desc + irq; | ||
1381 | cfg = irq_cfg + irq; | ||
1382 | spin_lock(&desc->lock); | ||
1383 | if (!cfg->move_cleanup_count) | ||
1384 | goto unlock; | ||
1385 | |||
1386 | if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) | ||
1387 | goto unlock; | ||
1388 | |||
1389 | __get_cpu_var(vector_irq)[vector] = -1; | ||
1390 | cfg->move_cleanup_count--; | ||
1391 | unlock: | ||
1392 | spin_unlock(&desc->lock); | ||
1393 | } | ||
1394 | |||
1395 | irq_exit(); | ||
1396 | } | ||
1397 | |||
1398 | static void irq_complete_move(unsigned int irq) | ||
1399 | { | ||
1400 | struct irq_cfg *cfg = irq_cfg + irq; | ||
1401 | unsigned vector, me; | ||
1402 | |||
1403 | if (likely(!cfg->move_in_progress)) | ||
1404 | return; | ||
1405 | |||
1406 | vector = ~get_irq_regs()->orig_rax; | ||
1407 | me = smp_processor_id(); | ||
1408 | if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { | ||
1409 | cpumask_t cleanup_mask; | ||
1410 | |||
1411 | cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); | ||
1412 | cfg->move_cleanup_count = cpus_weight(cleanup_mask); | ||
1413 | send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); | ||
1414 | cfg->move_in_progress = 0; | ||
1415 | } | ||
1416 | } | ||
1417 | #else | ||
1418 | static inline void irq_complete_move(unsigned int irq) {} | ||
1419 | #endif | ||
1420 | |||
1421 | static void ack_apic_edge(unsigned int irq) | ||
1422 | { | ||
1423 | irq_complete_move(irq); | ||
1424 | move_native_irq(irq); | ||
1425 | ack_APIC_irq(); | ||
1426 | } | ||
1427 | |||
1428 | static void ack_apic_level(unsigned int irq) | ||
1429 | { | ||
1430 | int do_unmask_irq = 0; | ||
1431 | |||
1432 | irq_complete_move(irq); | ||
1433 | #if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) | ||
1434 | /* If we are moving the irq we need to mask it */ | ||
1435 | if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { | ||
1436 | do_unmask_irq = 1; | ||
1437 | mask_IO_APIC_irq(irq); | ||
1438 | } | ||
1439 | #endif | ||
1440 | |||
1441 | /* | ||
1442 | * We must acknowledge the irq before we move it or the acknowledge will | ||
1443 | * not propagate properly. | ||
1444 | */ | ||
1445 | ack_APIC_irq(); | ||
1446 | |||
1447 | /* Now we can move and re-enable the irq */ | ||
1448 | if (unlikely(do_unmask_irq)) { | ||
1449 | /* Only migrate the irq if the ack has been received. | ||
1450 | * | ||
1451 | * On rare occasions the broadcast level triggered ack gets | ||
1452 | * delayed going to ioapics, and if we reprogram the | ||
1453 | * vector while Remote IRR is still set the irq will never | ||
1454 | * fire again. | ||
1455 | * | ||
1456 | * To prevent this scenario we read the Remote IRR bit | ||
1457 | * of the ioapic. This has two effects. | ||
1458 | * - On any sane system the read of the ioapic will | ||
1459 | * flush writes (and acks) going to the ioapic from | ||
1460 | * this cpu. | ||
1461 | * - We get to see if the ACK has actually been delivered. | ||
1462 | * | ||
1463 | * Based on failed experiments of reprogramming the | ||
1464 | * ioapic entry from outside of irq context (starting | ||
1465 | * with masking the ioapic entry and then polling until | ||
1466 | * Remote IRR was clear before reprogramming the | ||
1467 | * ioapic), I don't trust the Remote IRR bit to be | ||
1468 | * completely accurate. | ||
1469 | * | ||
1470 | * However there appears to be no other way to plug | ||
1471 | * this race, so if the Remote IRR bit is not | ||
1472 | * accurate and is causing problems then it is a hardware bug | ||
1473 | * and you can go talk to the chipset vendor about it. | ||
1474 | */ | ||
1475 | if (!io_apic_level_ack_pending(irq)) | ||
1476 | move_masked_irq(irq); | ||
1477 | unmask_IO_APIC_irq(irq); | ||
1478 | } | ||
1479 | } | ||
1480 | |||
1481 | static struct irq_chip ioapic_chip __read_mostly = { | ||
1482 | .name = "IO-APIC", | ||
1483 | .startup = startup_ioapic_irq, | ||
1484 | .mask = mask_IO_APIC_irq, | ||
1485 | .unmask = unmask_IO_APIC_irq, | ||
1486 | .ack = ack_apic_edge, | ||
1487 | .eoi = ack_apic_level, | ||
1488 | #ifdef CONFIG_SMP | ||
1489 | .set_affinity = set_ioapic_affinity_irq, | ||
1490 | #endif | ||
1491 | .retrigger = ioapic_retrigger_irq, | ||
1492 | }; | ||
1493 | |||
1494 | static inline void init_IO_APIC_traps(void) | ||
1495 | { | ||
1496 | int irq; | ||
1497 | |||
1498 | /* | ||
1499 | * NOTE! The local APIC isn't very good at handling | ||
1500 | * multiple interrupts at the same interrupt level. | ||
1501 | * As the interrupt level is determined by taking the | ||
1502 | * vector number and shifting that right by 4, we | ||
1503 | * want to spread these out a bit so that they don't | ||
1504 | * all fall in the same interrupt level. | ||
1505 | * | ||
1506 | * Also, we've got to be careful not to trash gate | ||
1507 | * 0x80, because int 0x80 is hm, kind of importantish. ;) | ||
1508 | */ | ||
1509 | for (irq = 0; irq < NR_IRQS ; irq++) { | ||
1510 | int tmp = irq; | ||
1511 | if (IO_APIC_IRQ(tmp) && !irq_cfg[tmp].vector) { | ||
1512 | /* | ||
1513 | * Hmm.. We don't have an entry for this, | ||
1514 | * so default to an old-fashioned 8259 | ||
1515 | * interrupt if we can.. | ||
1516 | */ | ||
1517 | if (irq < 16) | ||
1518 | make_8259A_irq(irq); | ||
1519 | else | ||
1520 | /* Strange. Oh, well.. */ | ||
1521 | irq_desc[irq].chip = &no_irq_chip; | ||
1522 | } | ||
1523 | } | ||
1524 | } | ||
1525 | |||
1526 | static void enable_lapic_irq (unsigned int irq) | ||
1527 | { | ||
1528 | unsigned long v; | ||
1529 | |||
1530 | v = apic_read(APIC_LVT0); | ||
1531 | apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); | ||
1532 | } | ||
1533 | |||
1534 | static void disable_lapic_irq (unsigned int irq) | ||
1535 | { | ||
1536 | unsigned long v; | ||
1537 | |||
1538 | v = apic_read(APIC_LVT0); | ||
1539 | apic_write(APIC_LVT0, v | APIC_LVT_MASKED); | ||
1540 | } | ||
1541 | |||
1542 | static void ack_lapic_irq (unsigned int irq) | ||
1543 | { | ||
1544 | ack_APIC_irq(); | ||
1545 | } | ||
1546 | |||
1547 | static void end_lapic_irq (unsigned int i) { /* nothing */ } | ||
1548 | |||
1549 | static struct hw_interrupt_type lapic_irq_type __read_mostly = { | ||
1550 | .name = "local-APIC", | ||
1551 | .typename = "local-APIC-edge", | ||
1552 | .startup = NULL, /* startup_irq() not used for IRQ0 */ | ||
1553 | .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ | ||
1554 | .enable = enable_lapic_irq, | ||
1555 | .disable = disable_lapic_irq, | ||
1556 | .ack = ack_lapic_irq, | ||
1557 | .end = end_lapic_irq, | ||
1558 | }; | ||
1559 | |||
1560 | static void setup_nmi (void) | ||
1561 | { | ||
1562 | /* | ||
1563 | * Dirty trick to enable the NMI watchdog ... | ||
1564 | * We put the 8259A master into AEOI mode and | ||
1565 | * unmask LVT0 as NMI on all local APICs. | ||
1566 | * | ||
1567 | * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') | ||
1568 | * is from Maciej W. Rozycki - so we do not have to EOI from | ||
1569 | * the NMI handler or the timer interrupt. | ||
1570 | */ | ||
1571 | printk(KERN_INFO "activating NMI Watchdog ..."); | ||
1572 | |||
1573 | enable_NMI_through_LVT0(NULL); | ||
1574 | |||
1575 | printk(" done.\n"); | ||
1576 | } | ||
1577 | |||
1578 | /* | ||
1579 | * This looks a bit hackish but it's about the only way of sending | ||
1580 | * a few INTA cycles to 8259As and any associated glue logic. ICR does | ||
1581 | * not support the ExtINT mode, unfortunately. We need to send these | ||
1582 | * cycles as some i82489DX-based boards have glue logic that keeps the | ||
1583 | * 8259A interrupt line asserted until INTA. --macro | ||
1584 | */ | ||
1585 | static inline void unlock_ExtINT_logic(void) | ||
1586 | { | ||
1587 | int apic, pin, i; | ||
1588 | struct IO_APIC_route_entry entry0, entry1; | ||
1589 | unsigned char save_control, save_freq_select; | ||
1590 | unsigned long flags; | ||
1591 | |||
1592 | pin = find_isa_irq_pin(8, mp_INT); | ||
1593 | apic = find_isa_irq_apic(8, mp_INT); | ||
1594 | if (pin == -1) | ||
1595 | return; | ||
1596 | |||
1597 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1598 | *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin); | ||
1599 | *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin); | ||
1600 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1601 | clear_IO_APIC_pin(apic, pin); | ||
1602 | |||
1603 | memset(&entry1, 0, sizeof(entry1)); | ||
1604 | |||
1605 | entry1.dest_mode = 0; /* physical delivery */ | ||
1606 | entry1.mask = 0; /* unmask IRQ now */ | ||
1607 | entry1.dest = hard_smp_processor_id(); | ||
1608 | entry1.delivery_mode = dest_ExtINT; | ||
1609 | entry1.polarity = entry0.polarity; | ||
1610 | entry1.trigger = 0; | ||
1611 | entry1.vector = 0; | ||
1612 | |||
1613 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1614 | io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); | ||
1615 | io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); | ||
1616 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1617 | |||
1618 | save_control = CMOS_READ(RTC_CONTROL); | ||
1619 | save_freq_select = CMOS_READ(RTC_FREQ_SELECT); | ||
1620 | CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, | ||
1621 | RTC_FREQ_SELECT); | ||
1622 | CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); | ||
1623 | |||
1624 | i = 100; | ||
1625 | while (i-- > 0) { | ||
1626 | mdelay(10); | ||
1627 | if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) | ||
1628 | i -= 10; | ||
1629 | } | ||
1630 | |||
1631 | CMOS_WRITE(save_control, RTC_CONTROL); | ||
1632 | CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); | ||
1633 | clear_IO_APIC_pin(apic, pin); | ||
1634 | |||
1635 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1636 | io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); | ||
1637 | io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); | ||
1638 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1639 | } | ||
1640 | |||
1641 | /* | ||
1642 | * This code may look a bit paranoid, but it's supposed to cooperate with | ||
1643 | * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ | ||
1644 | * is so screwy. Thanks to Brian Perkins for testing/hacking this beast | ||
1645 | * fanatically on his truly buggy board. | ||
1646 | * | ||
1647 | * FIXME: really need to revamp this for modern platforms only. | ||
1648 | */ | ||
1649 | static inline void check_timer(void) | ||
1650 | { | ||
1651 | struct irq_cfg *cfg = irq_cfg + 0; | ||
1652 | int apic1, pin1, apic2, pin2; | ||
1653 | |||
1654 | /* | ||
1655 | * get/set the timer IRQ vector: | ||
1656 | */ | ||
1657 | disable_8259A_irq(0); | ||
1658 | assign_irq_vector(0, TARGET_CPUS); | ||
1659 | |||
1660 | /* | ||
1661 | * Subtle: code in do_timer_interrupt() expects an AEOI | ||
1662 | * mode for the 8259A whenever interrupts are routed | ||
1663 | * through I/O APICs. Also IRQ0 has to be enabled in | ||
1664 | * the 8259A which implies the virtual wire has to be | ||
1665 | * disabled in the local APIC. | ||
1666 | */ | ||
1667 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | ||
1668 | init_8259A(1); | ||
1669 | if (timer_over_8254 > 0) | ||
1670 | enable_8259A_irq(0); | ||
1671 | |||
1672 | pin1 = find_isa_irq_pin(0, mp_INT); | ||
1673 | apic1 = find_isa_irq_apic(0, mp_INT); | ||
1674 | pin2 = ioapic_i8259.pin; | ||
1675 | apic2 = ioapic_i8259.apic; | ||
1676 | |||
1677 | apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", | ||
1678 | cfg->vector, apic1, pin1, apic2, pin2); | ||
1679 | |||
1680 | if (pin1 != -1) { | ||
1681 | /* | ||
1682 | * Ok, does IRQ0 through the IOAPIC work? | ||
1683 | */ | ||
1684 | unmask_IO_APIC_irq(0); | ||
1685 | if (!no_timer_check && timer_irq_works()) { | ||
1686 | nmi_watchdog_default(); | ||
1687 | if (nmi_watchdog == NMI_IO_APIC) { | ||
1688 | disable_8259A_irq(0); | ||
1689 | setup_nmi(); | ||
1690 | enable_8259A_irq(0); | ||
1691 | } | ||
1692 | if (disable_timer_pin_1 > 0) | ||
1693 | clear_IO_APIC_pin(0, pin1); | ||
1694 | return; | ||
1695 | } | ||
1696 | clear_IO_APIC_pin(apic1, pin1); | ||
1697 | apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not " | ||
1698 | "connected to IO-APIC\n"); | ||
1699 | } | ||
1700 | |||
1701 | apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) " | ||
1702 | "through the 8259A ... "); | ||
1703 | if (pin2 != -1) { | ||
1704 | apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", | ||
1705 | apic2, pin2); | ||
1706 | /* | ||
1707 | * legacy devices should be connected to IO APIC #0 | ||
1708 | */ | ||
1709 | setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector); | ||
1710 | if (timer_irq_works()) { | ||
1711 | apic_printk(APIC_VERBOSE," works.\n"); | ||
1712 | nmi_watchdog_default(); | ||
1713 | if (nmi_watchdog == NMI_IO_APIC) { | ||
1714 | setup_nmi(); | ||
1715 | } | ||
1716 | return; | ||
1717 | } | ||
1718 | /* | ||
1719 | * Cleanup, just in case ... | ||
1720 | */ | ||
1721 | clear_IO_APIC_pin(apic2, pin2); | ||
1722 | } | ||
1723 | apic_printk(APIC_VERBOSE," failed.\n"); | ||
1724 | |||
1725 | if (nmi_watchdog == NMI_IO_APIC) { | ||
1726 | printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); | ||
1727 | nmi_watchdog = 0; | ||
1728 | } | ||
1729 | |||
1730 | apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); | ||
1731 | |||
1732 | disable_8259A_irq(0); | ||
1733 | irq_desc[0].chip = &lapic_irq_type; | ||
1734 | apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ | ||
1735 | enable_8259A_irq(0); | ||
1736 | |||
1737 | if (timer_irq_works()) { | ||
1738 | apic_printk(APIC_VERBOSE," works.\n"); | ||
1739 | return; | ||
1740 | } | ||
1741 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); | ||
1742 | apic_printk(APIC_VERBOSE," failed.\n"); | ||
1743 | |||
1744 | apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); | ||
1745 | |||
1746 | init_8259A(0); | ||
1747 | make_8259A_irq(0); | ||
1748 | apic_write(APIC_LVT0, APIC_DM_EXTINT); | ||
1749 | |||
1750 | unlock_ExtINT_logic(); | ||
1751 | |||
1752 | if (timer_irq_works()) { | ||
1753 | apic_printk(APIC_VERBOSE," works.\n"); | ||
1754 | return; | ||
1755 | } | ||
1756 | apic_printk(APIC_VERBOSE," failed :(.\n"); | ||
1757 | panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); | ||
1758 | } | ||
1759 | |||
1760 | static int __init notimercheck(char *s) | ||
1761 | { | ||
1762 | no_timer_check = 1; | ||
1763 | return 1; | ||
1764 | } | ||
1765 | __setup("no_timer_check", notimercheck); | ||
1766 | |||
1767 | /* | ||
1768 | * | ||
1769 | * IRQs that are handled by the PIC in the MPS IOAPIC case. | ||
1770 | * - IRQ2 is the cascade IRQ, and cannot be an IO-APIC IRQ. | ||
1771 | * Linux doesn't really care, as it's not actually used | ||
1772 | * for any interrupt handling anyway. | ||
1773 | */ | ||
1774 | #define PIC_IRQS (1<<2) | ||
1775 | |||
1776 | void __init setup_IO_APIC(void) | ||
1777 | { | ||
1778 | enable_IO_APIC(); | ||
1779 | |||
1780 | if (acpi_ioapic) | ||
1781 | io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ | ||
1782 | else | ||
1783 | io_apic_irqs = ~PIC_IRQS; | ||
1784 | |||
1785 | apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); | ||
1786 | |||
1787 | sync_Arb_IDs(); | ||
1788 | setup_IO_APIC_irqs(); | ||
1789 | init_IO_APIC_traps(); | ||
1790 | check_timer(); | ||
1791 | if (!acpi_ioapic) | ||
1792 | print_IO_APIC(); | ||
1793 | } | ||
1794 | |||
1795 | struct sysfs_ioapic_data { | ||
1796 | struct sys_device dev; | ||
1797 | struct IO_APIC_route_entry entry[0]; | ||
1798 | }; | ||
1799 | static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; | ||
1800 | |||
1801 | static int ioapic_suspend(struct sys_device *dev, pm_message_t state) | ||
1802 | { | ||
1803 | struct IO_APIC_route_entry *entry; | ||
1804 | struct sysfs_ioapic_data *data; | ||
1805 | int i; | ||
1806 | |||
1807 | data = container_of(dev, struct sysfs_ioapic_data, dev); | ||
1808 | entry = data->entry; | ||
1809 | for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) | ||
1810 | *entry = ioapic_read_entry(dev->id, i); | ||
1811 | |||
1812 | return 0; | ||
1813 | } | ||
1814 | |||
1815 | static int ioapic_resume(struct sys_device *dev) | ||
1816 | { | ||
1817 | struct IO_APIC_route_entry *entry; | ||
1818 | struct sysfs_ioapic_data *data; | ||
1819 | unsigned long flags; | ||
1820 | union IO_APIC_reg_00 reg_00; | ||
1821 | int i; | ||
1822 | |||
1823 | data = container_of(dev, struct sysfs_ioapic_data, dev); | ||
1824 | entry = data->entry; | ||
1825 | |||
1826 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1827 | reg_00.raw = io_apic_read(dev->id, 0); | ||
1828 | if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { | ||
1829 | reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; | ||
1830 | io_apic_write(dev->id, 0, reg_00.raw); | ||
1831 | } | ||
1832 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1833 | for (i = 0; i < nr_ioapic_registers[dev->id]; i++) | ||
1834 | ioapic_write_entry(dev->id, i, entry[i]); | ||
1835 | |||
1836 | return 0; | ||
1837 | } | ||
1838 | |||
1839 | static struct sysdev_class ioapic_sysdev_class = { | ||
1840 | set_kset_name("ioapic"), | ||
1841 | .suspend = ioapic_suspend, | ||
1842 | .resume = ioapic_resume, | ||
1843 | }; | ||
1844 | |||
1845 | static int __init ioapic_init_sysfs(void) | ||
1846 | { | ||
1847 | struct sys_device * dev; | ||
1848 | int i, size, error = 0; | ||
1849 | |||
1850 | error = sysdev_class_register(&ioapic_sysdev_class); | ||
1851 | if (error) | ||
1852 | return error; | ||
1853 | |||
1854 | for (i = 0; i < nr_ioapics; i++ ) { | ||
1855 | size = sizeof(struct sys_device) + nr_ioapic_registers[i] | ||
1856 | * sizeof(struct IO_APIC_route_entry); | ||
1857 | mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); | ||
1858 | if (!mp_ioapic_data[i]) { | ||
1859 | printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | ||
1860 | continue; | ||
1861 | } | ||
1862 | memset(mp_ioapic_data[i], 0, size); | ||
1863 | dev = &mp_ioapic_data[i]->dev; | ||
1864 | dev->id = i; | ||
1865 | dev->cls = &ioapic_sysdev_class; | ||
1866 | error = sysdev_register(dev); | ||
1867 | if (error) { | ||
1868 | kfree(mp_ioapic_data[i]); | ||
1869 | mp_ioapic_data[i] = NULL; | ||
1870 | printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | ||
1871 | continue; | ||
1872 | } | ||
1873 | } | ||
1874 | |||
1875 | return 0; | ||
1876 | } | ||
1877 | |||
1878 | device_initcall(ioapic_init_sysfs); | ||
1879 | |||
1880 | /* | ||
1881 | * Dynamic IRQ allocation and deallocation | ||
1882 | */ | ||
1883 | int create_irq(void) | ||
1884 | { | ||
1885 | /* Allocate an unused irq */ | ||
1886 | int irq; | ||
1887 | int new; | ||
1888 | unsigned long flags; | ||
1889 | |||
1890 | irq = -ENOSPC; | ||
1891 | spin_lock_irqsave(&vector_lock, flags); | ||
1892 | for (new = (NR_IRQS - 1); new >= 0; new--) { | ||
1893 | if (platform_legacy_irq(new)) | ||
1894 | continue; | ||
1895 | if (irq_cfg[new].vector != 0) | ||
1896 | continue; | ||
1897 | if (__assign_irq_vector(new, TARGET_CPUS) == 0) | ||
1898 | irq = new; | ||
1899 | break; | ||
1900 | } | ||
1901 | spin_unlock_irqrestore(&vector_lock, flags); | ||
1902 | |||
1903 | if (irq >= 0) { | ||
1904 | dynamic_irq_init(irq); | ||
1905 | } | ||
1906 | return irq; | ||
1907 | } | ||
1908 | |||
1909 | void destroy_irq(unsigned int irq) | ||
1910 | { | ||
1911 | unsigned long flags; | ||
1912 | |||
1913 | dynamic_irq_cleanup(irq); | ||
1914 | |||
1915 | spin_lock_irqsave(&vector_lock, flags); | ||
1916 | __clear_irq_vector(irq); | ||
1917 | spin_unlock_irqrestore(&vector_lock, flags); | ||
1918 | } | ||
1919 | |||
1920 | /* | ||
1921 | * MSI message composition | ||
1922 | */ | ||
1923 | #ifdef CONFIG_PCI_MSI | ||
1924 | static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) | ||
1925 | { | ||
1926 | struct irq_cfg *cfg = irq_cfg + irq; | ||
1927 | int err; | ||
1928 | unsigned dest; | ||
1929 | cpumask_t tmp; | ||
1930 | |||
1931 | tmp = TARGET_CPUS; | ||
1932 | err = assign_irq_vector(irq, tmp); | ||
1933 | if (!err) { | ||
1934 | cpus_and(tmp, cfg->domain, tmp); | ||
1935 | dest = cpu_mask_to_apicid(tmp); | ||
1936 | |||
1937 | msg->address_hi = MSI_ADDR_BASE_HI; | ||
1938 | msg->address_lo = | ||
1939 | MSI_ADDR_BASE_LO | | ||
1940 | ((INT_DEST_MODE == 0) ? | ||
1941 | MSI_ADDR_DEST_MODE_PHYSICAL: | ||
1942 | MSI_ADDR_DEST_MODE_LOGICAL) | | ||
1943 | ((INT_DELIVERY_MODE != dest_LowestPrio) ? | ||
1944 | MSI_ADDR_REDIRECTION_CPU: | ||
1945 | MSI_ADDR_REDIRECTION_LOWPRI) | | ||
1946 | MSI_ADDR_DEST_ID(dest); | ||
1947 | |||
1948 | msg->data = | ||
1949 | MSI_DATA_TRIGGER_EDGE | | ||
1950 | MSI_DATA_LEVEL_ASSERT | | ||
1951 | ((INT_DELIVERY_MODE != dest_LowestPrio) ? | ||
1952 | MSI_DATA_DELIVERY_FIXED: | ||
1953 | MSI_DATA_DELIVERY_LOWPRI) | | ||
1954 | MSI_DATA_VECTOR(cfg->vector); | ||
1955 | } | ||
1956 | return err; | ||
1957 | } | ||
1958 | |||
1959 | #ifdef CONFIG_SMP | ||
1960 | static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) | ||
1961 | { | ||
1962 | struct irq_cfg *cfg = irq_cfg + irq; | ||
1963 | struct msi_msg msg; | ||
1964 | unsigned int dest; | ||
1965 | cpumask_t tmp; | ||
1966 | |||
1967 | cpus_and(tmp, mask, cpu_online_map); | ||
1968 | if (cpus_empty(tmp)) | ||
1969 | return; | ||
1970 | |||
1971 | if (assign_irq_vector(irq, mask)) | ||
1972 | return; | ||
1973 | |||
1974 | cpus_and(tmp, cfg->domain, mask); | ||
1975 | dest = cpu_mask_to_apicid(tmp); | ||
1976 | |||
1977 | read_msi_msg(irq, &msg); | ||
1978 | |||
1979 | msg.data &= ~MSI_DATA_VECTOR_MASK; | ||
1980 | msg.data |= MSI_DATA_VECTOR(cfg->vector); | ||
1981 | msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; | ||
1982 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); | ||
1983 | |||
1984 | write_msi_msg(irq, &msg); | ||
1985 | irq_desc[irq].affinity = mask; | ||
1986 | } | ||
1987 | #endif /* CONFIG_SMP */ | ||
1988 | |||
1989 | /* | ||
1990 | * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, | ||
1991 | * which implement the MSI or MSI-X Capability Structure. | ||
1992 | */ | ||
1993 | static struct irq_chip msi_chip = { | ||
1994 | .name = "PCI-MSI", | ||
1995 | .unmask = unmask_msi_irq, | ||
1996 | .mask = mask_msi_irq, | ||
1997 | .ack = ack_apic_edge, | ||
1998 | #ifdef CONFIG_SMP | ||
1999 | .set_affinity = set_msi_irq_affinity, | ||
2000 | #endif | ||
2001 | .retrigger = ioapic_retrigger_irq, | ||
2002 | }; | ||
2003 | |||
2004 | int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) | ||
2005 | { | ||
2006 | struct msi_msg msg; | ||
2007 | int irq, ret; | ||
2008 | irq = create_irq(); | ||
2009 | if (irq < 0) | ||
2010 | return irq; | ||
2011 | |||
2012 | ret = msi_compose_msg(dev, irq, &msg); | ||
2013 | if (ret < 0) { | ||
2014 | destroy_irq(irq); | ||
2015 | return ret; | ||
2016 | } | ||
2017 | |||
2018 | set_irq_msi(irq, desc); | ||
2019 | write_msi_msg(irq, &msg); | ||
2020 | |||
2021 | set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); | ||
2022 | |||
2023 | return 0; | ||
2024 | } | ||
2025 | |||
2026 | void arch_teardown_msi_irq(unsigned int irq) | ||
2027 | { | ||
2028 | destroy_irq(irq); | ||
2029 | } | ||
2030 | |||
2031 | #endif /* CONFIG_PCI_MSI */ | ||
2032 | |||
2033 | /* | ||
2034 | * Hypertransport interrupt support | ||
2035 | */ | ||
2036 | #ifdef CONFIG_HT_IRQ | ||
2037 | |||
2038 | #ifdef CONFIG_SMP | ||
2039 | |||
2040 | static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) | ||
2041 | { | ||
2042 | struct ht_irq_msg msg; | ||
2043 | fetch_ht_irq_msg(irq, &msg); | ||
2044 | |||
2045 | msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); | ||
2046 | msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); | ||
2047 | |||
2048 | msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); | ||
2049 | msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); | ||
2050 | |||
2051 | write_ht_irq_msg(irq, &msg); | ||
2052 | } | ||
2053 | |||
2054 | static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) | ||
2055 | { | ||
2056 | struct irq_cfg *cfg = irq_cfg + irq; | ||
2057 | unsigned int dest; | ||
2058 | cpumask_t tmp; | ||
2059 | |||
2060 | cpus_and(tmp, mask, cpu_online_map); | ||
2061 | if (cpus_empty(tmp)) | ||
2062 | return; | ||
2063 | |||
2064 | if (assign_irq_vector(irq, mask)) | ||
2065 | return; | ||
2066 | |||
2067 | cpus_and(tmp, cfg->domain, mask); | ||
2068 | dest = cpu_mask_to_apicid(tmp); | ||
2069 | |||
2070 | target_ht_irq(irq, dest, cfg->vector); | ||
2071 | irq_desc[irq].affinity = mask; | ||
2072 | } | ||
2073 | #endif | ||
2074 | |||
2075 | static struct irq_chip ht_irq_chip = { | ||
2076 | .name = "PCI-HT", | ||
2077 | .mask = mask_ht_irq, | ||
2078 | .unmask = unmask_ht_irq, | ||
2079 | .ack = ack_apic_edge, | ||
2080 | #ifdef CONFIG_SMP | ||
2081 | .set_affinity = set_ht_irq_affinity, | ||
2082 | #endif | ||
2083 | .retrigger = ioapic_retrigger_irq, | ||
2084 | }; | ||
2085 | |||
2086 | int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) | ||
2087 | { | ||
2088 | struct irq_cfg *cfg = irq_cfg + irq; | ||
2089 | int err; | ||
2090 | cpumask_t tmp; | ||
2091 | |||
2092 | tmp = TARGET_CPUS; | ||
2093 | err = assign_irq_vector(irq, tmp); | ||
2094 | if (!err) { | ||
2095 | struct ht_irq_msg msg; | ||
2096 | unsigned dest; | ||
2097 | |||
2098 | cpus_and(tmp, cfg->domain, tmp); | ||
2099 | dest = cpu_mask_to_apicid(tmp); | ||
2100 | |||
2101 | msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); | ||
2102 | |||
2103 | msg.address_lo = | ||
2104 | HT_IRQ_LOW_BASE | | ||
2105 | HT_IRQ_LOW_DEST_ID(dest) | | ||
2106 | HT_IRQ_LOW_VECTOR(cfg->vector) | | ||
2107 | ((INT_DEST_MODE == 0) ? | ||
2108 | HT_IRQ_LOW_DM_PHYSICAL : | ||
2109 | HT_IRQ_LOW_DM_LOGICAL) | | ||
2110 | HT_IRQ_LOW_RQEOI_EDGE | | ||
2111 | ((INT_DELIVERY_MODE != dest_LowestPrio) ? | ||
2112 | HT_IRQ_LOW_MT_FIXED : | ||
2113 | HT_IRQ_LOW_MT_ARBITRATED) | | ||
2114 | HT_IRQ_LOW_IRQ_MASKED; | ||
2115 | |||
2116 | write_ht_irq_msg(irq, &msg); | ||
2117 | |||
2118 | set_irq_chip_and_handler_name(irq, &ht_irq_chip, | ||
2119 | handle_edge_irq, "edge"); | ||
2120 | } | ||
2121 | return err; | ||
2122 | } | ||
2123 | #endif /* CONFIG_HT_IRQ */ | ||
2124 | |||
2125 | /* -------------------------------------------------------------------------- | ||
2126 | ACPI-based IOAPIC Configuration | ||
2127 | -------------------------------------------------------------------------- */ | ||
2128 | |||
2129 | #ifdef CONFIG_ACPI | ||
2130 | |||
2131 | #define IO_APIC_MAX_ID 0xFE | ||
2132 | |||
2133 | int __init io_apic_get_redir_entries (int ioapic) | ||
2134 | { | ||
2135 | union IO_APIC_reg_01 reg_01; | ||
2136 | unsigned long flags; | ||
2137 | |||
2138 | spin_lock_irqsave(&ioapic_lock, flags); | ||
2139 | reg_01.raw = io_apic_read(ioapic, 1); | ||
2140 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
2141 | |||
2142 | return reg_01.bits.entries; | ||
2143 | } | ||
2144 | |||
2145 | |||
2146 | int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) | ||
2147 | { | ||
2148 | if (!IO_APIC_IRQ(irq)) { | ||
2149 | apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", | ||
2150 | ioapic); | ||
2151 | return -EINVAL; | ||
2152 | } | ||
2153 | |||
2154 | /* | ||
2155 | * IRQs < 16 are already in the irq_2_pin[] map | ||
2156 | */ | ||
2157 | if (irq >= 16) | ||
2158 | add_pin_to_irq(irq, ioapic, pin); | ||
2159 | |||
2160 | setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); | ||
2161 | |||
2162 | return 0; | ||
2163 | } | ||
2164 | |||
2165 | #endif /* CONFIG_ACPI */ | ||
2166 | |||
2167 | |||
2168 | /* | ||
2169 | * This function is currently only a helper for the i386 SMP boot process, where | ||
2170 | * we need to reprogram the ioredtbls to cater for the CPUs which have come online, | ||
2171 | * so the mask in all cases should simply be TARGET_CPUS | ||
2172 | */ | ||
2173 | #ifdef CONFIG_SMP | ||
2174 | void __init setup_ioapic_dest(void) | ||
2175 | { | ||
2176 | int pin, ioapic, irq, irq_entry; | ||
2177 | |||
2178 | if (skip_ioapic_setup == 1) | ||
2179 | return; | ||
2180 | |||
2181 | for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { | ||
2182 | for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { | ||
2183 | irq_entry = find_irq_entry(ioapic, pin, mp_INT); | ||
2184 | if (irq_entry == -1) | ||
2185 | continue; | ||
2186 | irq = pin_2_irq(irq_entry, ioapic, pin); | ||
2187 | |||
2188 | /* setup_IO_APIC_irqs could fail to get a vector for some device | ||
2189 | * when you have too many devices, because at that time only the boot | ||
2190 | * cpu is online. | ||
2191 | */ | ||
2192 | if (!irq_cfg[irq].vector) | ||
2193 | setup_IO_APIC_irq(ioapic, pin, irq, | ||
2194 | irq_trigger(irq_entry), | ||
2195 | irq_polarity(irq_entry)); | ||
2196 | else | ||
2197 | set_ioapic_affinity_irq(irq, TARGET_CPUS); | ||
2198 | } | ||
2199 | |||
2200 | } | ||
2201 | } | ||
2202 | #endif | ||
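
The create_irq()/destroy_irq() pair above is the building block the MSI and HyperTransport paths use: allocate a vector-backed IRQ, attach an irq_chip and flow handler, and hand the number to the device. A minimal sketch of the same pattern from a hypothetical in-kernel caller, not part of this patch (names are illustrative; the chip/flow-handler setup is assumed to be done as in arch_setup_msi_irq() above):

	/* hypothetical caller, kernel context -- illustration only */
	static irqreturn_t demo_handler(int irq, void *dev_id)
	{
		return IRQ_HANDLED;
	}

	static int demo_attach(void)
	{
		int irq = create_irq();		/* find an unused, vector-backed IRQ */
		if (irq < 0)
			return irq;
		/* ... program the device to raise this IRQ, set chip/flow handler ... */
		if (request_irq(irq, demo_handler, 0, "demo", NULL)) {
			destroy_irq(irq);	/* releases the vector again */
			return -EBUSY;
		}
		return 0;
	}
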
diff --git a/arch/x86/kernel/ioport_64.c b/arch/x86/kernel/ioport_64.c new file mode 100644 index 000000000000..653efa30b0f4 --- /dev/null +++ b/arch/x86/kernel/ioport_64.c | |||
@@ -0,0 +1,119 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/ioport.c | ||
3 | * | ||
4 | * This contains the io-permission bitmap code - written by obz, with changes | ||
5 | * by Linus. | ||
6 | */ | ||
7 | |||
8 | #include <linux/sched.h> | ||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/capability.h> | ||
11 | #include <linux/errno.h> | ||
12 | #include <linux/types.h> | ||
13 | #include <linux/ioport.h> | ||
14 | #include <linux/smp.h> | ||
15 | #include <linux/stddef.h> | ||
16 | #include <linux/slab.h> | ||
17 | #include <linux/thread_info.h> | ||
18 | #include <linux/syscalls.h> | ||
19 | |||
20 | /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ | ||
21 | static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) | ||
22 | { | ||
23 | int i; | ||
24 | if (new_value) | ||
25 | for (i = base; i < base + extent; i++) | ||
26 | __set_bit(i, bitmap); | ||
27 | else | ||
28 | for (i = base; i < base + extent; i++) | ||
29 | clear_bit(i, bitmap); | ||
30 | } | ||
31 | |||
32 | /* | ||
33 | * this changes the io permissions bitmap in the current task. | ||
34 | */ | ||
35 | asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) | ||
36 | { | ||
37 | unsigned int i, max_long, bytes, bytes_updated; | ||
38 | struct thread_struct * t = ¤t->thread; | ||
39 | struct tss_struct * tss; | ||
40 | unsigned long *bitmap; | ||
41 | |||
42 | if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) | ||
43 | return -EINVAL; | ||
44 | if (turn_on && !capable(CAP_SYS_RAWIO)) | ||
45 | return -EPERM; | ||
46 | |||
47 | /* | ||
48 | * If it's the first ioperm() call in this thread's lifetime, set the | ||
49 | * IO bitmap up. ioperm() is much less timing critical than clone(); | ||
50 | * this is why we delay this operation until now: | ||
51 | */ | ||
52 | if (!t->io_bitmap_ptr) { | ||
53 | bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | ||
54 | if (!bitmap) | ||
55 | return -ENOMEM; | ||
56 | |||
57 | memset(bitmap, 0xff, IO_BITMAP_BYTES); | ||
58 | t->io_bitmap_ptr = bitmap; | ||
59 | set_thread_flag(TIF_IO_BITMAP); | ||
60 | } | ||
61 | |||
62 | /* | ||
63 | * do it in the per-thread copy and in the TSS ... | ||
64 | * | ||
65 | * Disable preemption via get_cpu() - we must not switch away | ||
66 | * because the ->io_bitmap_max value must match the bitmap | ||
67 | * contents: | ||
68 | */ | ||
69 | tss = &per_cpu(init_tss, get_cpu()); | ||
70 | |||
71 | set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); | ||
72 | |||
73 | /* | ||
74 | * Search for a (possibly new) maximum. This is simple and stupid, | ||
75 | * to keep it obviously correct: | ||
76 | */ | ||
77 | max_long = 0; | ||
78 | for (i = 0; i < IO_BITMAP_LONGS; i++) | ||
79 | if (t->io_bitmap_ptr[i] != ~0UL) | ||
80 | max_long = i; | ||
81 | |||
82 | bytes = (max_long + 1) * sizeof(long); | ||
83 | bytes_updated = max(bytes, t->io_bitmap_max); | ||
84 | |||
85 | t->io_bitmap_max = bytes; | ||
86 | |||
87 | /* Update the TSS: */ | ||
88 | memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); | ||
89 | |||
90 | put_cpu(); | ||
91 | |||
92 | return 0; | ||
93 | } | ||
94 | |||
95 | /* | ||
96 | * sys_iopl has to be used when you want to access the IO ports | ||
97 | * beyond the 0x3ff range: to get the full 65536 ports bitmapped | ||
98 | * you'd need 8kB of bitmaps/process, which is a bit excessive. | ||
99 | * | ||
100 | * Here we just change the eflags value on the stack: we allow | ||
101 | * only the super-user to do it. This depends on the stack-layout | ||
102 | * on system-call entry - see also fork() and the signal handling | ||
103 | * code. | ||
104 | */ | ||
105 | |||
106 | asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) | ||
107 | { | ||
108 | unsigned int old = (regs->eflags >> 12) & 3; | ||
109 | |||
110 | if (level > 3) | ||
111 | return -EINVAL; | ||
112 | /* Trying to gain more privileges? */ | ||
113 | if (level > old) { | ||
114 | if (!capable(CAP_SYS_RAWIO)) | ||
115 | return -EPERM; | ||
116 | } | ||
117 | regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12); | ||
118 | return 0; | ||
119 | } | ||
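
Both entry points above are reachable from user space through the glibc wrappers ioperm(2) and iopl(2). A minimal user-space sketch (requires CAP_SYS_RAWIO; port 0x378 is chosen only as an example):

	#include <stdio.h>
	#include <sys/io.h>

	int main(void)
	{
		/* sets three bits in the per-thread bitmap that sys_ioperm()
		 * above copies into the TSS */
		if (ioperm(0x378, 3, 1)) {
			perror("ioperm");
			return 1;
		}
		outb(0x00, 0x378);

		/* for ports above 0x3ff, raise IOPL (eflags bits 12-13) instead */
		if (iopl(3)) {
			perror("iopl");
			return 1;
		}
		return 0;
	}
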
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c new file mode 100644 index 000000000000..39cb3fa83ebb --- /dev/null +++ b/arch/x86/kernel/irq_64.c | |||
@@ -0,0 +1,213 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/irq.c | ||
3 | * | ||
4 | * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar | ||
5 | * | ||
6 | * This file contains the lowest level x86_64-specific interrupt | ||
7 | * entry and irq statistics code. All the remaining irq logic is | ||
8 | * done by the generic kernel/irq/ code and in the | ||
9 | * x86_64-specific irq controller code. (e.g. i8259.c and | ||
10 | * io_apic.c.) | ||
11 | */ | ||
12 | |||
13 | #include <linux/kernel_stat.h> | ||
14 | #include <linux/interrupt.h> | ||
15 | #include <linux/seq_file.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/delay.h> | ||
18 | #include <asm/uaccess.h> | ||
19 | #include <asm/io_apic.h> | ||
20 | #include <asm/idle.h> | ||
21 | #include <asm/smp.h> | ||
22 | |||
23 | atomic_t irq_err_count; | ||
24 | |||
25 | #ifdef CONFIG_DEBUG_STACKOVERFLOW | ||
26 | /* | ||
27 | * Probabilistic stack overflow check: | ||
28 | * | ||
29 | * Only check the stack in process context, because everything else | ||
30 | * runs on the big interrupt stacks. Checking reliably is too expensive, | ||
31 | * so we just check from interrupts. | ||
32 | */ | ||
33 | static inline void stack_overflow_check(struct pt_regs *regs) | ||
34 | { | ||
35 | u64 curbase = (u64)task_stack_page(current); | ||
36 | static unsigned long warned = -60*HZ; | ||
37 | |||
38 | if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && | ||
39 | regs->rsp < curbase + sizeof(struct thread_info) + 128 && | ||
40 | time_after(jiffies, warned + 60*HZ)) { | ||
41 | printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", | ||
42 | current->comm, curbase, regs->rsp); | ||
43 | show_stack(NULL,NULL); | ||
44 | warned = jiffies; | ||
45 | } | ||
46 | } | ||
47 | #endif | ||
48 | |||
49 | /* | ||
50 | * Generic, controller-independent functions: | ||
51 | */ | ||
52 | |||
53 | int show_interrupts(struct seq_file *p, void *v) | ||
54 | { | ||
55 | int i = *(loff_t *) v, j; | ||
56 | struct irqaction * action; | ||
57 | unsigned long flags; | ||
58 | |||
59 | if (i == 0) { | ||
60 | seq_printf(p, " "); | ||
61 | for_each_online_cpu(j) | ||
62 | seq_printf(p, "CPU%-8d",j); | ||
63 | seq_putc(p, '\n'); | ||
64 | } | ||
65 | |||
66 | if (i < NR_IRQS) { | ||
67 | spin_lock_irqsave(&irq_desc[i].lock, flags); | ||
68 | action = irq_desc[i].action; | ||
69 | if (!action) | ||
70 | goto skip; | ||
71 | seq_printf(p, "%3d: ",i); | ||
72 | #ifndef CONFIG_SMP | ||
73 | seq_printf(p, "%10u ", kstat_irqs(i)); | ||
74 | #else | ||
75 | for_each_online_cpu(j) | ||
76 | seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); | ||
77 | #endif | ||
78 | seq_printf(p, " %8s", irq_desc[i].chip->name); | ||
79 | seq_printf(p, "-%-8s", irq_desc[i].name); | ||
80 | |||
81 | seq_printf(p, " %s", action->name); | ||
82 | for (action=action->next; action; action = action->next) | ||
83 | seq_printf(p, ", %s", action->name); | ||
84 | seq_putc(p, '\n'); | ||
85 | skip: | ||
86 | spin_unlock_irqrestore(&irq_desc[i].lock, flags); | ||
87 | } else if (i == NR_IRQS) { | ||
88 | seq_printf(p, "NMI: "); | ||
89 | for_each_online_cpu(j) | ||
90 | seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); | ||
91 | seq_putc(p, '\n'); | ||
92 | seq_printf(p, "LOC: "); | ||
93 | for_each_online_cpu(j) | ||
94 | seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); | ||
95 | seq_putc(p, '\n'); | ||
96 | seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); | ||
97 | } | ||
98 | return 0; | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * do_IRQ handles all normal device IRQ's (the special | ||
103 | * SMP cross-CPU interrupts have their own specific | ||
104 | * handlers). | ||
105 | */ | ||
106 | asmlinkage unsigned int do_IRQ(struct pt_regs *regs) | ||
107 | { | ||
108 | struct pt_regs *old_regs = set_irq_regs(regs); | ||
109 | |||
110 | /* high bit used in ret_from_ code */ | ||
111 | unsigned vector = ~regs->orig_rax; | ||
112 | unsigned irq; | ||
113 | |||
114 | exit_idle(); | ||
115 | irq_enter(); | ||
116 | irq = __get_cpu_var(vector_irq)[vector]; | ||
117 | |||
118 | #ifdef CONFIG_DEBUG_STACKOVERFLOW | ||
119 | stack_overflow_check(regs); | ||
120 | #endif | ||
121 | |||
122 | if (likely(irq < NR_IRQS)) | ||
123 | generic_handle_irq(irq); | ||
124 | else { | ||
125 | if (!disable_apic) | ||
126 | ack_APIC_irq(); | ||
127 | |||
128 | if (printk_ratelimit()) | ||
129 | printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", | ||
130 | __func__, smp_processor_id(), vector); | ||
131 | } | ||
132 | |||
133 | irq_exit(); | ||
134 | |||
135 | set_irq_regs(old_regs); | ||
136 | return 1; | ||
137 | } | ||
138 | |||
139 | #ifdef CONFIG_HOTPLUG_CPU | ||
140 | void fixup_irqs(cpumask_t map) | ||
141 | { | ||
142 | unsigned int irq; | ||
143 | static int warned; | ||
144 | |||
145 | for (irq = 0; irq < NR_IRQS; irq++) { | ||
146 | cpumask_t mask; | ||
147 | int break_affinity = 0; | ||
148 | int set_affinity = 1; | ||
149 | |||
150 | if (irq == 2) | ||
151 | continue; | ||
152 | |||
153 | /* interrupts are disabled at this point */ | ||
154 | spin_lock(&irq_desc[irq].lock); | ||
155 | |||
156 | if (!irq_has_action(irq) || | ||
157 | cpus_equal(irq_desc[irq].affinity, map)) { | ||
158 | spin_unlock(&irq_desc[irq].lock); | ||
159 | continue; | ||
160 | } | ||
161 | |||
162 | cpus_and(mask, irq_desc[irq].affinity, map); | ||
163 | if (cpus_empty(mask)) { | ||
164 | break_affinity = 1; | ||
165 | mask = map; | ||
166 | } | ||
167 | |||
168 | if (irq_desc[irq].chip->mask) | ||
169 | irq_desc[irq].chip->mask(irq); | ||
170 | |||
171 | if (irq_desc[irq].chip->set_affinity) | ||
172 | irq_desc[irq].chip->set_affinity(irq, mask); | ||
173 | else if (!(warned++)) | ||
174 | set_affinity = 0; | ||
175 | |||
176 | if (irq_desc[irq].chip->unmask) | ||
177 | irq_desc[irq].chip->unmask(irq); | ||
178 | |||
179 | spin_unlock(&irq_desc[irq].lock); | ||
180 | |||
181 | if (break_affinity && set_affinity) | ||
182 | printk("Broke affinity for irq %i\n", irq); | ||
183 | else if (!set_affinity) | ||
184 | printk("Cannot set affinity for irq %i\n", irq); | ||
185 | } | ||
186 | |||
187 | /* That doesn't seem sufficient. Give it 1ms. */ | ||
188 | local_irq_enable(); | ||
189 | mdelay(1); | ||
190 | local_irq_disable(); | ||
191 | } | ||
192 | #endif | ||
193 | |||
194 | extern void call_softirq(void); | ||
195 | |||
196 | asmlinkage void do_softirq(void) | ||
197 | { | ||
198 | __u32 pending; | ||
199 | unsigned long flags; | ||
200 | |||
201 | if (in_interrupt()) | ||
202 | return; | ||
203 | |||
204 | local_irq_save(flags); | ||
205 | pending = local_softirq_pending(); | ||
206 | /* Switch to interrupt stack */ | ||
207 | if (pending) { | ||
208 | call_softirq(); | ||
209 | WARN_ON_ONCE(softirq_count()); | ||
210 | } | ||
211 | local_irq_restore(flags); | ||
212 | } | ||
213 | EXPORT_SYMBOL(do_softirq); | ||
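
show_interrupts() above is the seq_file backend behind /proc/interrupts: one row per IRQ (per-CPU counts, the chip name joined to the flow-handler name, then the action names), followed by the NMI/LOC/ERR summary lines. A trivial user-space sketch that simply dumps that file:

	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/interrupts", "r");

		if (!f) {
			perror("/proc/interrupts");
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);	/* rows formatted by show_interrupts() */
		fclose(f);
		return 0;
	}
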
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c new file mode 100644 index 000000000000..7377ccb21335 --- /dev/null +++ b/arch/x86/kernel/k8.c | |||
@@ -0,0 +1,123 @@ | |||
1 | /* | ||
2 | * Shared support code for AMD K8 northbridges and derivatives. | ||
3 | * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. | ||
4 | */ | ||
5 | #include <linux/gfp.h> | ||
6 | #include <linux/types.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/errno.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/spinlock.h> | ||
11 | #include <asm/k8.h> | ||
12 | |||
13 | int num_k8_northbridges; | ||
14 | EXPORT_SYMBOL(num_k8_northbridges); | ||
15 | |||
16 | static u32 *flush_words; | ||
17 | |||
18 | struct pci_device_id k8_nb_ids[] = { | ||
19 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, | ||
20 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, | ||
21 | {} | ||
22 | }; | ||
23 | EXPORT_SYMBOL(k8_nb_ids); | ||
24 | |||
25 | struct pci_dev **k8_northbridges; | ||
26 | EXPORT_SYMBOL(k8_northbridges); | ||
27 | |||
28 | static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) | ||
29 | { | ||
30 | do { | ||
31 | dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); | ||
32 | if (!dev) | ||
33 | break; | ||
34 | } while (!pci_match_id(&k8_nb_ids[0], dev)); | ||
35 | return dev; | ||
36 | } | ||
37 | |||
38 | int cache_k8_northbridges(void) | ||
39 | { | ||
40 | int i; | ||
41 | struct pci_dev *dev; | ||
42 | |||
43 | if (num_k8_northbridges) | ||
44 | return 0; | ||
45 | |||
46 | dev = NULL; | ||
47 | while ((dev = next_k8_northbridge(dev)) != NULL) | ||
48 | num_k8_northbridges++; | ||
49 | |||
50 | k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *), | ||
51 | GFP_KERNEL); | ||
52 | if (!k8_northbridges) | ||
53 | return -ENOMEM; | ||
54 | |||
55 | if (!num_k8_northbridges) { | ||
56 | k8_northbridges[0] = NULL; | ||
57 | return 0; | ||
58 | } | ||
59 | |||
60 | flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL); | ||
61 | if (!flush_words) { | ||
62 | kfree(k8_northbridges); | ||
63 | return -ENOMEM; | ||
64 | } | ||
65 | |||
66 | dev = NULL; | ||
67 | i = 0; | ||
68 | while ((dev = next_k8_northbridge(dev)) != NULL) { | ||
69 | k8_northbridges[i] = dev; | ||
70 | pci_read_config_dword(dev, 0x9c, &flush_words[i++]); | ||
71 | } | ||
72 | k8_northbridges[i] = NULL; | ||
73 | return 0; | ||
74 | } | ||
75 | EXPORT_SYMBOL_GPL(cache_k8_northbridges); | ||
76 | |||
77 | /* Ignores subdevice/subvendor but as far as I can figure out | ||
78 | they're useless anyways */ | ||
79 | int __init early_is_k8_nb(u32 device) | ||
80 | { | ||
81 | struct pci_device_id *id; | ||
82 | u32 vendor = device & 0xffff; | ||
83 | device >>= 16; | ||
84 | for (id = k8_nb_ids; id->vendor; id++) | ||
85 | if (vendor == id->vendor && device == id->device) | ||
86 | return 1; | ||
87 | return 0; | ||
88 | } | ||
89 | |||
90 | void k8_flush_garts(void) | ||
91 | { | ||
92 | int flushed, i; | ||
93 | unsigned long flags; | ||
94 | static DEFINE_SPINLOCK(gart_lock); | ||
95 | |||
96 | /* Avoid races between AGP and IOMMU. In theory it's not needed, | ||
97 | but I'm not sure the hardware won't lose flush requests | ||
98 | when another is pending. This whole thing is so expensive anyway | ||
99 | that serializing it further doesn't matter. -AK */ | ||
100 | spin_lock_irqsave(&gart_lock, flags); | ||
101 | flushed = 0; | ||
102 | for (i = 0; i < num_k8_northbridges; i++) { | ||
103 | pci_write_config_dword(k8_northbridges[i], 0x9c, | ||
104 | flush_words[i]|1); | ||
105 | flushed++; | ||
106 | } | ||
107 | for (i = 0; i < num_k8_northbridges; i++) { | ||
108 | u32 w; | ||
109 | /* Make sure the hardware actually executed the flush*/ | ||
110 | for (;;) { | ||
111 | pci_read_config_dword(k8_northbridges[i], | ||
112 | 0x9c, &w); | ||
113 | if (!(w & 1)) | ||
114 | break; | ||
115 | cpu_relax(); | ||
116 | } | ||
117 | } | ||
118 | spin_unlock_irqrestore(&gart_lock, flags); | ||
119 | if (!flushed) | ||
120 | printk("nothing to flush?\n"); | ||
121 | } | ||
122 | EXPORT_SYMBOL_GPL(k8_flush_garts); | ||
123 | |||
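
The exported triplet above (cache_k8_northbridges(), num_k8_northbridges, k8_northbridges[]) is meant for callers such as the GART/IOMMU code: populate the cache once, then iterate the NULL-terminated array. A minimal sketch of that calling pattern (illustrative only; it assumes <asm/k8.h> and the PCI config-space helpers already used above):

	/* hypothetical caller -- illustration of the k8.c API above */
	static int __init demo_scan_k8(void)
	{
		int i;
		u32 flush;

		if (cache_k8_northbridges() < 0 || !num_k8_northbridges)
			return -ENODEV;

		for (i = 0; i < num_k8_northbridges; i++) {
			struct pci_dev *nb = k8_northbridges[i];
			/* 0x9c is the GART flush word cached above */
			pci_read_config_dword(nb, 0x9c, &flush);
		}
		k8_flush_garts();	/* force a flush on every node */
		return 0;
	}
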
diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c new file mode 100644 index 000000000000..a30e004682e2 --- /dev/null +++ b/arch/x86/kernel/kprobes_64.c | |||
@@ -0,0 +1,749 @@ | |||
1 | /* | ||
2 | * Kernel Probes (KProbes) | ||
3 | * arch/x86_64/kernel/kprobes.c | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
18 | * | ||
19 | * Copyright (C) IBM Corporation, 2002, 2004 | ||
20 | * | ||
21 | * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel | ||
22 | * Probes initial implementation ( includes contributions from | ||
23 | * Rusty Russell). | ||
24 | * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes | ||
25 | * interface to access function arguments. | ||
26 | * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi | ||
27 | * <prasanna@in.ibm.com> adapted for x86_64 | ||
28 | * 2005-Mar Roland McGrath <roland@redhat.com> | ||
29 | * Fixed to handle %rip-relative addressing mode correctly. | ||
30 | * 2005-May Rusty Lynch <rusty.lynch@intel.com> | ||
31 | * Added function return probes functionality | ||
32 | */ | ||
33 | |||
34 | #include <linux/kprobes.h> | ||
35 | #include <linux/ptrace.h> | ||
36 | #include <linux/string.h> | ||
37 | #include <linux/slab.h> | ||
38 | #include <linux/preempt.h> | ||
39 | #include <linux/module.h> | ||
40 | #include <linux/kdebug.h> | ||
41 | |||
42 | #include <asm/pgtable.h> | ||
43 | #include <asm/uaccess.h> | ||
44 | #include <asm/alternative.h> | ||
45 | |||
46 | void jprobe_return_end(void); | ||
47 | static void __kprobes arch_copy_kprobe(struct kprobe *p); | ||
48 | |||
49 | DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; | ||
50 | DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); | ||
51 | |||
52 | /* | ||
53 | * returns non-zero if opcode modifies the interrupt flag. | ||
54 | */ | ||
55 | static __always_inline int is_IF_modifier(kprobe_opcode_t *insn) | ||
56 | { | ||
57 | switch (*insn) { | ||
58 | case 0xfa: /* cli */ | ||
59 | case 0xfb: /* sti */ | ||
60 | case 0xcf: /* iret/iretd */ | ||
61 | case 0x9d: /* popf/popfd */ | ||
62 | return 1; | ||
63 | } | ||
64 | |||
65 | if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf) | ||
66 | return 1; | ||
67 | return 0; | ||
68 | } | ||
69 | |||
70 | int __kprobes arch_prepare_kprobe(struct kprobe *p) | ||
71 | { | ||
72 | /* insn: must be on special executable page on x86_64. */ | ||
73 | p->ainsn.insn = get_insn_slot(); | ||
74 | if (!p->ainsn.insn) { | ||
75 | return -ENOMEM; | ||
76 | } | ||
77 | arch_copy_kprobe(p); | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | /* | ||
82 | * Determine if the instruction uses the %rip-relative addressing mode. | ||
83 | * If it does, return the address of the 32-bit displacement word. | ||
84 | * If not, return null. | ||
85 | */ | ||
86 | static s32 __kprobes *is_riprel(u8 *insn) | ||
87 | { | ||
88 | #define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ | ||
89 | (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ | ||
90 | (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ | ||
91 | (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ | ||
92 | (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ | ||
93 | << (row % 64)) | ||
94 | static const u64 onebyte_has_modrm[256 / 64] = { | ||
95 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
96 | /* ------------------------------- */ | ||
97 | W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */ | ||
98 | W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */ | ||
99 | W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */ | ||
100 | W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */ | ||
101 | W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ | ||
102 | W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */ | ||
103 | W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */ | ||
104 | W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */ | ||
105 | W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ | ||
106 | W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */ | ||
107 | W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */ | ||
108 | W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */ | ||
109 | W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */ | ||
110 | W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */ | ||
111 | W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */ | ||
112 | W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */ | ||
113 | /* ------------------------------- */ | ||
114 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
115 | }; | ||
116 | static const u64 twobyte_has_modrm[256 / 64] = { | ||
117 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
118 | /* ------------------------------- */ | ||
119 | W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */ | ||
120 | W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */ | ||
121 | W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */ | ||
122 | W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */ | ||
123 | W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */ | ||
124 | W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */ | ||
125 | W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */ | ||
126 | W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */ | ||
127 | W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */ | ||
128 | W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */ | ||
129 | W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */ | ||
130 | W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */ | ||
131 | W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */ | ||
132 | W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */ | ||
133 | W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */ | ||
134 | W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */ | ||
135 | /* ------------------------------- */ | ||
136 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
137 | }; | ||
138 | #undef W | ||
139 | int need_modrm; | ||
140 | |||
141 | /* Skip legacy instruction prefixes. */ | ||
142 | while (1) { | ||
143 | switch (*insn) { | ||
144 | case 0x66: | ||
145 | case 0x67: | ||
146 | case 0x2e: | ||
147 | case 0x3e: | ||
148 | case 0x26: | ||
149 | case 0x64: | ||
150 | case 0x65: | ||
151 | case 0x36: | ||
152 | case 0xf0: | ||
153 | case 0xf3: | ||
154 | case 0xf2: | ||
155 | ++insn; | ||
156 | continue; | ||
157 | } | ||
158 | break; | ||
159 | } | ||
160 | |||
161 | /* Skip REX instruction prefix. */ | ||
162 | if ((*insn & 0xf0) == 0x40) | ||
163 | ++insn; | ||
164 | |||
165 | if (*insn == 0x0f) { /* Two-byte opcode. */ | ||
166 | ++insn; | ||
167 | need_modrm = test_bit(*insn, twobyte_has_modrm); | ||
168 | } else { /* One-byte opcode. */ | ||
169 | need_modrm = test_bit(*insn, onebyte_has_modrm); | ||
170 | } | ||
171 | |||
172 | if (need_modrm) { | ||
173 | u8 modrm = *++insn; | ||
174 | if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */ | ||
175 | /* Displacement follows ModRM byte. */ | ||
176 | return (s32 *) ++insn; | ||
177 | } | ||
178 | } | ||
179 | |||
180 | /* No %rip-relative addressing mode here. */ | ||
181 | return NULL; | ||
182 | } | ||
183 | |||
184 | static void __kprobes arch_copy_kprobe(struct kprobe *p) | ||
185 | { | ||
186 | s32 *ripdisp; | ||
187 | memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE); | ||
188 | ripdisp = is_riprel(p->ainsn.insn); | ||
189 | if (ripdisp) { | ||
190 | /* | ||
191 | * The copied instruction uses the %rip-relative | ||
192 | * addressing mode. Adjust the displacement for the | ||
193 | * difference between the original location of this | ||
194 | * instruction and the location of the copy that will | ||
195 | * actually be run. The tricky bit here is making sure | ||
196 | * that the sign extension happens correctly in this | ||
197 | * calculation, since we need a signed 32-bit result to | ||
198 | * be sign-extended to 64 bits when it's added to the | ||
199 | * %rip value and yield the same 64-bit result that the | ||
200 | * sign-extension of the original signed 32-bit | ||
201 | * displacement would have given. | ||
202 | */ | ||
203 | s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn; | ||
204 | BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ | ||
205 | *ripdisp = disp; | ||
206 | } | ||
207 | p->opcode = *p->addr; | ||
208 | } | ||
209 | |||
210 | void __kprobes arch_arm_kprobe(struct kprobe *p) | ||
211 | { | ||
212 | text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); | ||
213 | } | ||
214 | |||
215 | void __kprobes arch_disarm_kprobe(struct kprobe *p) | ||
216 | { | ||
217 | text_poke(p->addr, &p->opcode, 1); | ||
218 | } | ||
219 | |||
220 | void __kprobes arch_remove_kprobe(struct kprobe *p) | ||
221 | { | ||
222 | mutex_lock(&kprobe_mutex); | ||
223 | free_insn_slot(p->ainsn.insn, 0); | ||
224 | mutex_unlock(&kprobe_mutex); | ||
225 | } | ||
226 | |||
227 | static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) | ||
228 | { | ||
229 | kcb->prev_kprobe.kp = kprobe_running(); | ||
230 | kcb->prev_kprobe.status = kcb->kprobe_status; | ||
231 | kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags; | ||
232 | kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags; | ||
233 | } | ||
234 | |||
235 | static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) | ||
236 | { | ||
237 | __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; | ||
238 | kcb->kprobe_status = kcb->prev_kprobe.status; | ||
239 | kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags; | ||
240 | kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags; | ||
241 | } | ||
242 | |||
243 | static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, | ||
244 | struct kprobe_ctlblk *kcb) | ||
245 | { | ||
246 | __get_cpu_var(current_kprobe) = p; | ||
247 | kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags | ||
248 | = (regs->eflags & (TF_MASK | IF_MASK)); | ||
249 | if (is_IF_modifier(p->ainsn.insn)) | ||
250 | kcb->kprobe_saved_rflags &= ~IF_MASK; | ||
251 | } | ||
252 | |||
253 | static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) | ||
254 | { | ||
255 | regs->eflags |= TF_MASK; | ||
256 | regs->eflags &= ~IF_MASK; | ||
257 | /*single step inline if the instruction is an int3*/ | ||
258 | if (p->opcode == BREAKPOINT_INSTRUCTION) | ||
259 | regs->rip = (unsigned long)p->addr; | ||
260 | else | ||
261 | regs->rip = (unsigned long)p->ainsn.insn; | ||
262 | } | ||
263 | |||
264 | /* Called with kretprobe_lock held */ | ||
265 | void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, | ||
266 | struct pt_regs *regs) | ||
267 | { | ||
268 | unsigned long *sara = (unsigned long *)regs->rsp; | ||
269 | |||
270 | ri->ret_addr = (kprobe_opcode_t *) *sara; | ||
271 | /* Replace the return addr with trampoline addr */ | ||
272 | *sara = (unsigned long) &kretprobe_trampoline; | ||
273 | } | ||
274 | |||
275 | int __kprobes kprobe_handler(struct pt_regs *regs) | ||
276 | { | ||
277 | struct kprobe *p; | ||
278 | int ret = 0; | ||
279 | kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t)); | ||
280 | struct kprobe_ctlblk *kcb; | ||
281 | |||
282 | /* | ||
283 | * We don't want to be preempted for the entire | ||
284 | * duration of kprobe processing | ||
285 | */ | ||
286 | preempt_disable(); | ||
287 | kcb = get_kprobe_ctlblk(); | ||
288 | |||
289 | /* Check we're not actually recursing */ | ||
290 | if (kprobe_running()) { | ||
291 | p = get_kprobe(addr); | ||
292 | if (p) { | ||
293 | if (kcb->kprobe_status == KPROBE_HIT_SS && | ||
294 | *p->ainsn.insn == BREAKPOINT_INSTRUCTION) { | ||
295 | regs->eflags &= ~TF_MASK; | ||
296 | regs->eflags |= kcb->kprobe_saved_rflags; | ||
297 | goto no_kprobe; | ||
298 | } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) { | ||
299 | /* TODO: Provide re-entrancy from | ||
300 | * post_kprobes_handler() and avoid exception | ||
301 | * stack corruption while single-stepping on | ||
302 | * the instruction of the new probe. | ||
303 | */ | ||
304 | arch_disarm_kprobe(p); | ||
305 | regs->rip = (unsigned long)p->addr; | ||
306 | reset_current_kprobe(); | ||
307 | ret = 1; | ||
308 | } else { | ||
309 | /* We have reentered the kprobe_handler(), since | ||
310 | * another probe was hit while within the | ||
311 | * handler. Here we save the original kprobe | ||
312 | * variables and just single-step on the instruction | ||
313 | * of the new probe without calling any user | ||
314 | * handlers. | ||
315 | */ | ||
316 | save_previous_kprobe(kcb); | ||
317 | set_current_kprobe(p, regs, kcb); | ||
318 | kprobes_inc_nmissed_count(p); | ||
319 | prepare_singlestep(p, regs); | ||
320 | kcb->kprobe_status = KPROBE_REENTER; | ||
321 | return 1; | ||
322 | } | ||
323 | } else { | ||
324 | if (*addr != BREAKPOINT_INSTRUCTION) { | ||
325 | /* The breakpoint instruction was removed by | ||
326 | * another cpu right after we hit it; no further | ||
327 | * handling of this interrupt is appropriate | ||
328 | */ | ||
329 | regs->rip = (unsigned long)addr; | ||
330 | ret = 1; | ||
331 | goto no_kprobe; | ||
332 | } | ||
333 | p = __get_cpu_var(current_kprobe); | ||
334 | if (p->break_handler && p->break_handler(p, regs)) { | ||
335 | goto ss_probe; | ||
336 | } | ||
337 | } | ||
338 | goto no_kprobe; | ||
339 | } | ||
340 | |||
341 | p = get_kprobe(addr); | ||
342 | if (!p) { | ||
343 | if (*addr != BREAKPOINT_INSTRUCTION) { | ||
344 | /* | ||
345 | * The breakpoint instruction was removed right | ||
346 | * after we hit it. Another cpu has removed | ||
347 | * either a probepoint or a debugger breakpoint | ||
348 | * at this address. In either case, no further | ||
349 | * handling of this interrupt is appropriate. | ||
350 | * Back up over the (now missing) int3 and run | ||
351 | * the original instruction. | ||
352 | */ | ||
353 | regs->rip = (unsigned long)addr; | ||
354 | ret = 1; | ||
355 | } | ||
356 | /* Not one of ours: let kernel handle it */ | ||
357 | goto no_kprobe; | ||
358 | } | ||
359 | |||
360 | set_current_kprobe(p, regs, kcb); | ||
361 | kcb->kprobe_status = KPROBE_HIT_ACTIVE; | ||
362 | |||
363 | if (p->pre_handler && p->pre_handler(p, regs)) | ||
364 | /* handler has already set things up, so skip ss setup */ | ||
365 | return 1; | ||
366 | |||
367 | ss_probe: | ||
368 | prepare_singlestep(p, regs); | ||
369 | kcb->kprobe_status = KPROBE_HIT_SS; | ||
370 | return 1; | ||
371 | |||
372 | no_kprobe: | ||
373 | preempt_enable_no_resched(); | ||
374 | return ret; | ||
375 | } | ||
376 | |||
377 | /* | ||
378 | * For function-return probes, init_kprobes() establishes a probepoint | ||
379 | * here. When a retprobed function returns, this probe is hit and | ||
380 | * trampoline_probe_handler() runs, calling the kretprobe's handler. | ||
381 | */ | ||
382 | void kretprobe_trampoline_holder(void) | ||
383 | { | ||
384 | asm volatile ( ".global kretprobe_trampoline\n" | ||
385 | "kretprobe_trampoline: \n" | ||
386 | "nop\n"); | ||
387 | } | ||
388 | |||
389 | /* | ||
390 | * Called when we hit the probe point at kretprobe_trampoline | ||
391 | */ | ||
392 | int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) | ||
393 | { | ||
394 | struct kretprobe_instance *ri = NULL; | ||
395 | struct hlist_head *head, empty_rp; | ||
396 | struct hlist_node *node, *tmp; | ||
397 | unsigned long flags, orig_ret_address = 0; | ||
398 | unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; | ||
399 | |||
400 | INIT_HLIST_HEAD(&empty_rp); | ||
401 | spin_lock_irqsave(&kretprobe_lock, flags); | ||
402 | head = kretprobe_inst_table_head(current); | ||
403 | |||
404 | /* | ||
405 | * It is possible to have multiple instances associated with a given | ||
406 | * task either because multiple functions in the call path | ||
407 | * have a return probe installed on them, and/or more than one | ||
408 | * return probe was registered for a target function. | ||
409 | * | ||
410 | * We can handle this because: | ||
411 | * - instances are always inserted at the head of the list | ||
412 | * - when multiple return probes are registered for the same | ||
413 | * function, the first instance's ret_addr will point to the | ||
414 | * real return address, and all the rest will point to | ||
415 | * kretprobe_trampoline | ||
416 | */ | ||
417 | hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { | ||
418 | if (ri->task != current) | ||
419 | /* another task is sharing our hash bucket */ | ||
420 | continue; | ||
421 | |||
422 | if (ri->rp && ri->rp->handler) | ||
423 | ri->rp->handler(ri, regs); | ||
424 | |||
425 | orig_ret_address = (unsigned long)ri->ret_addr; | ||
426 | recycle_rp_inst(ri, &empty_rp); | ||
427 | |||
428 | if (orig_ret_address != trampoline_address) | ||
429 | /* | ||
430 | * This is the real return address. Any other | ||
431 | * instances associated with this task are for | ||
432 | * other calls deeper on the call stack | ||
433 | */ | ||
434 | break; | ||
435 | } | ||
436 | |||
437 | kretprobe_assert(ri, orig_ret_address, trampoline_address); | ||
438 | regs->rip = orig_ret_address; | ||
439 | |||
440 | reset_current_kprobe(); | ||
441 | spin_unlock_irqrestore(&kretprobe_lock, flags); | ||
442 | preempt_enable_no_resched(); | ||
443 | |||
444 | hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { | ||
445 | hlist_del(&ri->hlist); | ||
446 | kfree(ri); | ||
447 | } | ||
448 | /* | ||
449 | * By returning a non-zero value, we are telling | ||
450 | * kprobe_handler() that we don't want the post_handler | ||
451 | * to run (and have re-enabled preemption) | ||
452 | */ | ||
453 | return 1; | ||
454 | } | ||
455 | |||
456 | /* | ||
457 | * Called after single-stepping. p->addr is the address of the | ||
458 | * instruction whose first byte has been replaced by the "int 3" | ||
459 | * instruction. To avoid the SMP problems that can occur when we | ||
460 | * temporarily put back the original opcode to single-step, we | ||
461 | * single-stepped a copy of the instruction. The address of this | ||
462 | * copy is p->ainsn.insn. | ||
463 | * | ||
464 | * This function prepares to return from the post-single-step | ||
465 | * interrupt. We have to fix up the stack as follows: | ||
466 | * | ||
467 | * 0) Except in the case of absolute or indirect jump or call instructions, | ||
468 | * the new rip is relative to the copied instruction. We need to make | ||
469 | * it relative to the original instruction. | ||
470 | * | ||
471 | * 1) If the single-stepped instruction was pushfl, then the TF and IF | ||
472 | * flags are set in the just-pushed eflags, and may need to be cleared. | ||
473 | * | ||
474 | * 2) If the single-stepped instruction was a call, the return address | ||
475 | * that is atop the stack is the address following the copied instruction. | ||
476 | * We need to make it the address following the original instruction. | ||
477 | */ | ||
478 | static void __kprobes resume_execution(struct kprobe *p, | ||
479 | struct pt_regs *regs, struct kprobe_ctlblk *kcb) | ||
480 | { | ||
481 | unsigned long *tos = (unsigned long *)regs->rsp; | ||
482 | unsigned long next_rip = 0; | ||
483 | unsigned long copy_rip = (unsigned long)p->ainsn.insn; | ||
484 | unsigned long orig_rip = (unsigned long)p->addr; | ||
485 | kprobe_opcode_t *insn = p->ainsn.insn; | ||
486 | |||
487 | /*skip the REX prefix*/ | ||
488 | if (*insn >= 0x40 && *insn <= 0x4f) | ||
489 | insn++; | ||
490 | |||
491 | switch (*insn) { | ||
492 | case 0x9c: /* pushfl */ | ||
493 | *tos &= ~(TF_MASK | IF_MASK); | ||
494 | *tos |= kcb->kprobe_old_rflags; | ||
495 | break; | ||
496 | case 0xc3: /* ret/lret */ | ||
497 | case 0xcb: | ||
498 | case 0xc2: | ||
499 | case 0xca: | ||
500 | regs->eflags &= ~TF_MASK; | ||
501 | /* rip is already adjusted, no more changes required*/ | ||
502 | return; | ||
503 | case 0xe8: /* call relative - Fix return addr */ | ||
504 | *tos = orig_rip + (*tos - copy_rip); | ||
505 | break; | ||
506 | case 0xff: | ||
507 | if ((insn[1] & 0x30) == 0x10) { | ||
508 | /* call absolute, indirect */ | ||
509 | /* Fix return addr; rip is correct. */ | ||
510 | next_rip = regs->rip; | ||
511 | *tos = orig_rip + (*tos - copy_rip); | ||
512 | } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ | ||
513 | ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ | ||
514 | /* rip is correct. */ | ||
515 | next_rip = regs->rip; | ||
516 | } | ||
517 | break; | ||
518 | case 0xea: /* jmp absolute -- rip is correct */ | ||
519 | next_rip = regs->rip; | ||
520 | break; | ||
521 | default: | ||
522 | break; | ||
523 | } | ||
524 | |||
525 | regs->eflags &= ~TF_MASK; | ||
526 | if (next_rip) { | ||
527 | regs->rip = next_rip; | ||
528 | } else { | ||
529 | regs->rip = orig_rip + (regs->rip - copy_rip); | ||
530 | } | ||
531 | } | ||
532 | |||
533 | int __kprobes post_kprobe_handler(struct pt_regs *regs) | ||
534 | { | ||
535 | struct kprobe *cur = kprobe_running(); | ||
536 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
537 | |||
538 | if (!cur) | ||
539 | return 0; | ||
540 | |||
541 | if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { | ||
542 | kcb->kprobe_status = KPROBE_HIT_SSDONE; | ||
543 | cur->post_handler(cur, regs, 0); | ||
544 | } | ||
545 | |||
546 | resume_execution(cur, regs, kcb); | ||
547 | regs->eflags |= kcb->kprobe_saved_rflags; | ||
548 | |||
549 | /* Restore the original saved kprobes variables and continue. */ | ||
550 | if (kcb->kprobe_status == KPROBE_REENTER) { | ||
551 | restore_previous_kprobe(kcb); | ||
552 | goto out; | ||
553 | } | ||
554 | reset_current_kprobe(); | ||
555 | out: | ||
556 | preempt_enable_no_resched(); | ||
557 | |||
558 | /* | ||
559 | * if somebody else is singlestepping across a probe point, eflags | ||
560 | * will have TF set, in which case, continue the remaining processing | ||
561 | * of do_debug, as if this is not a probe hit. | ||
562 | */ | ||
563 | if (regs->eflags & TF_MASK) | ||
564 | return 0; | ||
565 | |||
566 | return 1; | ||
567 | } | ||
568 | |||
569 | int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) | ||
570 | { | ||
571 | struct kprobe *cur = kprobe_running(); | ||
572 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
573 | const struct exception_table_entry *fixup; | ||
574 | |||
575 | switch(kcb->kprobe_status) { | ||
576 | case KPROBE_HIT_SS: | ||
577 | case KPROBE_REENTER: | ||
578 | /* | ||
579 | * We are here because the instruction being single | ||
580 | * stepped caused a page fault. We reset the current | ||
581 | * kprobe and the rip points back to the probe address | ||
582 | * and allow the page fault handler to continue as a | ||
583 | * normal page fault. | ||
584 | */ | ||
585 | regs->rip = (unsigned long)cur->addr; | ||
586 | regs->eflags |= kcb->kprobe_old_rflags; | ||
587 | if (kcb->kprobe_status == KPROBE_REENTER) | ||
588 | restore_previous_kprobe(kcb); | ||
589 | else | ||
590 | reset_current_kprobe(); | ||
591 | preempt_enable_no_resched(); | ||
592 | break; | ||
593 | case KPROBE_HIT_ACTIVE: | ||
594 | case KPROBE_HIT_SSDONE: | ||
595 | /* | ||
596 | * We increment the nmissed count for accounting; | ||
597 | * we can also use the npre/npostfault count for accounting | ||
598 | * these specific fault cases. | ||
599 | */ | ||
600 | kprobes_inc_nmissed_count(cur); | ||
601 | |||
602 | /* | ||
603 | * We come here because instructions in the pre/post | ||
604 | * handler caused the page fault; this could happen | ||
605 | * if the handler tries to access user space via | ||
606 | * copy_from_user(), get_user(), etc. Let the | ||
607 | * user-specified handler try to fix it first. | ||
608 | */ | ||
609 | if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) | ||
610 | return 1; | ||
611 | |||
612 | /* | ||
613 | * In case the user-specified fault handler returned | ||
614 | * zero, try to fix up. | ||
615 | */ | ||
616 | fixup = search_exception_tables(regs->rip); | ||
617 | if (fixup) { | ||
618 | regs->rip = fixup->fixup; | ||
619 | return 1; | ||
620 | } | ||
621 | |||
622 | /* | ||
623 | * fixup() could not handle it, | ||
624 | * Let do_page_fault() fix it. | ||
625 | */ | ||
626 | break; | ||
627 | default: | ||
628 | break; | ||
629 | } | ||
630 | return 0; | ||
631 | } | ||
632 | |||
633 | /* | ||
634 | * Wrapper routine for handling exceptions. | ||
635 | */ | ||
636 | int __kprobes kprobe_exceptions_notify(struct notifier_block *self, | ||
637 | unsigned long val, void *data) | ||
638 | { | ||
639 | struct die_args *args = (struct die_args *)data; | ||
640 | int ret = NOTIFY_DONE; | ||
641 | |||
642 | if (args->regs && user_mode(args->regs)) | ||
643 | return ret; | ||
644 | |||
645 | switch (val) { | ||
646 | case DIE_INT3: | ||
647 | if (kprobe_handler(args->regs)) | ||
648 | ret = NOTIFY_STOP; | ||
649 | break; | ||
650 | case DIE_DEBUG: | ||
651 | if (post_kprobe_handler(args->regs)) | ||
652 | ret = NOTIFY_STOP; | ||
653 | break; | ||
654 | case DIE_GPF: | ||
655 | case DIE_PAGE_FAULT: | ||
656 | /* kprobe_running() needs smp_processor_id() */ | ||
657 | preempt_disable(); | ||
658 | if (kprobe_running() && | ||
659 | kprobe_fault_handler(args->regs, args->trapnr)) | ||
660 | ret = NOTIFY_STOP; | ||
661 | preempt_enable(); | ||
662 | break; | ||
663 | default: | ||
664 | break; | ||
665 | } | ||
666 | return ret; | ||
667 | } | ||
668 | |||
669 | int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) | ||
670 | { | ||
671 | struct jprobe *jp = container_of(p, struct jprobe, kp); | ||
672 | unsigned long addr; | ||
673 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
674 | |||
675 | kcb->jprobe_saved_regs = *regs; | ||
676 | kcb->jprobe_saved_rsp = (long *) regs->rsp; | ||
677 | addr = (unsigned long)(kcb->jprobe_saved_rsp); | ||
678 | /* | ||
679 | * As Linus pointed out, gcc assumes that the callee | ||
680 | * owns the argument space and could overwrite it, e.g. | ||
681 | * tailcall optimization. So, to be absolutely safe | ||
682 | * we also save and restore enough stack bytes to cover | ||
683 | * the argument area. | ||
684 | */ | ||
685 | memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, | ||
686 | MIN_STACK_SIZE(addr)); | ||
687 | regs->eflags &= ~IF_MASK; | ||
688 | regs->rip = (unsigned long)(jp->entry); | ||
689 | return 1; | ||
690 | } | ||
691 | |||
692 | void __kprobes jprobe_return(void) | ||
693 | { | ||
694 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
695 | |||
696 | asm volatile (" xchg %%rbx,%%rsp \n" | ||
697 | " int3 \n" | ||
698 | " .globl jprobe_return_end \n" | ||
699 | " jprobe_return_end: \n" | ||
700 | " nop \n"::"b" | ||
701 | (kcb->jprobe_saved_rsp):"memory"); | ||
702 | } | ||
703 | |||
704 | int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) | ||
705 | { | ||
706 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
707 | u8 *addr = (u8 *) (regs->rip - 1); | ||
708 | unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp); | ||
709 | struct jprobe *jp = container_of(p, struct jprobe, kp); | ||
710 | |||
711 | if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { | ||
712 | if ((long *)regs->rsp != kcb->jprobe_saved_rsp) { | ||
713 | struct pt_regs *saved_regs = | ||
714 | container_of(kcb->jprobe_saved_rsp, | ||
715 | struct pt_regs, rsp); | ||
716 | printk("current rsp %p does not match saved rsp %p\n", | ||
717 | (long *)regs->rsp, kcb->jprobe_saved_rsp); | ||
718 | printk("Saved registers for jprobe %p\n", jp); | ||
719 | show_registers(saved_regs); | ||
720 | printk("Current registers\n"); | ||
721 | show_registers(regs); | ||
722 | BUG(); | ||
723 | } | ||
724 | *regs = kcb->jprobe_saved_regs; | ||
725 | memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, | ||
726 | MIN_STACK_SIZE(stack_addr)); | ||
727 | preempt_enable_no_resched(); | ||
728 | return 1; | ||
729 | } | ||
730 | return 0; | ||
731 | } | ||
732 | |||
733 | static struct kprobe trampoline_p = { | ||
734 | .addr = (kprobe_opcode_t *) &kretprobe_trampoline, | ||
735 | .pre_handler = trampoline_probe_handler | ||
736 | }; | ||
737 | |||
738 | int __init arch_init_kprobes(void) | ||
739 | { | ||
740 | return register_kprobe(&trampoline_p); | ||
741 | } | ||
742 | |||
743 | int __kprobes arch_trampoline_kprobe(struct kprobe *p) | ||
744 | { | ||
745 | if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) | ||
746 | return 1; | ||
747 | |||
748 | return 0; | ||
749 | } | ||
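The setjmp_pre_handler()/longjmp_break_handler() pair above is what makes jprobes work on this architecture: the pre-handler redirects execution to the user's mirror function after saving registers and the argument area, and jprobe_return() traps back so both can be restored. A minimal sketch of a consumer module, assuming the classic jprobes API of this era (JPROBE_ENTRY(), register_jprobe(), kp.symbol_name); do_fork is only an example target and the handler must mirror its signature:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/sched.h>

/* Mirror of do_fork()'s signature; runs with the probed function's arguments. */
static long jdo_fork(unsigned long clone_flags, unsigned long stack_start,
		     struct pt_regs *regs, unsigned long stack_size,
		     int __user *parent_tidptr, int __user *child_tidptr)
{
	printk(KERN_INFO "do_fork: clone_flags=0x%lx\n", clone_flags);
	jprobe_return();	/* mandatory: never return normally */
	return 0;		/* not reached */
}

static struct jprobe my_jprobe = {
	.entry		= JPROBE_ENTRY(jdo_fork),
	.kp.symbol_name	= "do_fork",
};

static int __init jprobe_init(void)
{
	return register_jprobe(&my_jprobe);
}

static void __exit jprobe_exit(void)
{
	unregister_jprobe(&my_jprobe);
}

module_init(jprobe_init);
module_exit(jprobe_exit);
MODULE_LICENSE("GPL");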
diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c new file mode 100644 index 000000000000..bc9ffd5c19cc --- /dev/null +++ b/arch/x86/kernel/ldt_64.c | |||
@@ -0,0 +1,252 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/ldt.c | ||
3 | * | ||
4 | * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds | ||
5 | * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> | ||
6 | * Copyright (C) 2002 Andi Kleen | ||
7 | * | ||
8 | * This handles calls from both 32bit and 64bit mode. | ||
9 | */ | ||
10 | |||
11 | #include <linux/errno.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/mm.h> | ||
15 | #include <linux/smp.h> | ||
16 | #include <linux/vmalloc.h> | ||
17 | #include <linux/slab.h> | ||
18 | |||
19 | #include <asm/uaccess.h> | ||
20 | #include <asm/system.h> | ||
21 | #include <asm/ldt.h> | ||
22 | #include <asm/desc.h> | ||
23 | #include <asm/proto.h> | ||
24 | |||
25 | #ifdef CONFIG_SMP /* avoids "defined but not used" warning */ | ||
26 | static void flush_ldt(void *null) | ||
27 | { | ||
28 | if (current->active_mm) | ||
29 | load_LDT(¤t->active_mm->context); | ||
30 | } | ||
31 | #endif | ||
32 | |||
33 | static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) | ||
34 | { | ||
35 | void *oldldt; | ||
36 | void *newldt; | ||
37 | unsigned oldsize; | ||
38 | |||
39 | if (mincount <= (unsigned)pc->size) | ||
40 | return 0; | ||
41 | oldsize = pc->size; | ||
42 | mincount = (mincount+511)&(~511); | ||
43 | if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) | ||
44 | newldt = vmalloc(mincount*LDT_ENTRY_SIZE); | ||
45 | else | ||
46 | newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); | ||
47 | |||
48 | if (!newldt) | ||
49 | return -ENOMEM; | ||
50 | |||
51 | if (oldsize) | ||
52 | memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); | ||
53 | oldldt = pc->ldt; | ||
54 | memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); | ||
55 | wmb(); | ||
56 | pc->ldt = newldt; | ||
57 | wmb(); | ||
58 | pc->size = mincount; | ||
59 | wmb(); | ||
60 | if (reload) { | ||
61 | #ifdef CONFIG_SMP | ||
62 | cpumask_t mask; | ||
63 | |||
64 | preempt_disable(); | ||
65 | mask = cpumask_of_cpu(smp_processor_id()); | ||
66 | load_LDT(pc); | ||
67 | if (!cpus_equal(current->mm->cpu_vm_mask, mask)) | ||
68 | smp_call_function(flush_ldt, NULL, 1, 1); | ||
69 | preempt_enable(); | ||
70 | #else | ||
71 | load_LDT(pc); | ||
72 | #endif | ||
73 | } | ||
74 | if (oldsize) { | ||
75 | if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) | ||
76 | vfree(oldldt); | ||
77 | else | ||
78 | kfree(oldldt); | ||
79 | } | ||
80 | return 0; | ||
81 | } | ||
82 | |||
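A concrete sizing example for alloc_ldt() above: a request for a single entry is rounded up to 512 by (mincount+511)&~511, and 512 entries times LDT_ENTRY_SIZE (8 bytes) is exactly PAGE_SIZE, so the common case stays on the kmalloc() path; only LDTs larger than 512 entries exceed one page and take the vmalloc() path, and the frees here and in destroy_context() apply the same size test to choose between vfree() and kfree().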
83 | static inline int copy_ldt(mm_context_t *new, mm_context_t *old) | ||
84 | { | ||
85 | int err = alloc_ldt(new, old->size, 0); | ||
86 | if (err < 0) | ||
87 | return err; | ||
88 | memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); | ||
89 | return 0; | ||
90 | } | ||
91 | |||
92 | /* | ||
93 | * we do not have to muck with descriptors here, that is | ||
94 | * done in switch_mm() as needed. | ||
95 | */ | ||
96 | int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | ||
97 | { | ||
98 | struct mm_struct * old_mm; | ||
99 | int retval = 0; | ||
100 | |||
101 | init_MUTEX(&mm->context.sem); | ||
102 | mm->context.size = 0; | ||
103 | old_mm = current->mm; | ||
104 | if (old_mm && old_mm->context.size > 0) { | ||
105 | down(&old_mm->context.sem); | ||
106 | retval = copy_ldt(&mm->context, &old_mm->context); | ||
107 | up(&old_mm->context.sem); | ||
108 | } | ||
109 | return retval; | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * | ||
114 | * Don't touch the LDT register - we're already in the next thread. | ||
115 | */ | ||
116 | void destroy_context(struct mm_struct *mm) | ||
117 | { | ||
118 | if (mm->context.size) { | ||
119 | if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) | ||
120 | vfree(mm->context.ldt); | ||
121 | else | ||
122 | kfree(mm->context.ldt); | ||
123 | mm->context.size = 0; | ||
124 | } | ||
125 | } | ||
126 | |||
127 | static int read_ldt(void __user * ptr, unsigned long bytecount) | ||
128 | { | ||
129 | int err; | ||
130 | unsigned long size; | ||
131 | struct mm_struct * mm = current->mm; | ||
132 | |||
133 | if (!mm->context.size) | ||
134 | return 0; | ||
135 | if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) | ||
136 | bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; | ||
137 | |||
138 | down(&mm->context.sem); | ||
139 | size = mm->context.size*LDT_ENTRY_SIZE; | ||
140 | if (size > bytecount) | ||
141 | size = bytecount; | ||
142 | |||
143 | err = 0; | ||
144 | if (copy_to_user(ptr, mm->context.ldt, size)) | ||
145 | err = -EFAULT; | ||
146 | up(&mm->context.sem); | ||
147 | if (err < 0) | ||
148 | goto error_return; | ||
149 | if (size != bytecount) { | ||
150 | /* zero-fill the rest */ | ||
151 | if (clear_user(ptr+size, bytecount-size) != 0) { | ||
152 | err = -EFAULT; | ||
153 | goto error_return; | ||
154 | } | ||
155 | } | ||
156 | return bytecount; | ||
157 | error_return: | ||
158 | return err; | ||
159 | } | ||
160 | |||
161 | static int read_default_ldt(void __user * ptr, unsigned long bytecount) | ||
162 | { | ||
163 | /* Arbitrary number */ | ||
164 | /* x86-64 default LDT is all zeros */ | ||
165 | if (bytecount > 128) | ||
166 | bytecount = 128; | ||
167 | if (clear_user(ptr, bytecount)) | ||
168 | return -EFAULT; | ||
169 | return bytecount; | ||
170 | } | ||
171 | |||
172 | static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) | ||
173 | { | ||
174 | struct task_struct *me = current; | ||
175 | struct mm_struct * mm = me->mm; | ||
176 | __u32 entry_1, entry_2, *lp; | ||
177 | int error; | ||
178 | struct user_desc ldt_info; | ||
179 | |||
180 | error = -EINVAL; | ||
181 | |||
182 | if (bytecount != sizeof(ldt_info)) | ||
183 | goto out; | ||
184 | error = -EFAULT; | ||
185 | if (copy_from_user(&ldt_info, ptr, bytecount)) | ||
186 | goto out; | ||
187 | |||
188 | error = -EINVAL; | ||
189 | if (ldt_info.entry_number >= LDT_ENTRIES) | ||
190 | goto out; | ||
191 | if (ldt_info.contents == 3) { | ||
192 | if (oldmode) | ||
193 | goto out; | ||
194 | if (ldt_info.seg_not_present == 0) | ||
195 | goto out; | ||
196 | } | ||
197 | |||
198 | down(&mm->context.sem); | ||
199 | if (ldt_info.entry_number >= (unsigned)mm->context.size) { | ||
200 | error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); | ||
201 | if (error < 0) | ||
202 | goto out_unlock; | ||
203 | } | ||
204 | |||
205 | lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); | ||
206 | |||
207 | /* Allow LDTs to be cleared by the user. */ | ||
208 | if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { | ||
209 | if (oldmode || LDT_empty(&ldt_info)) { | ||
210 | entry_1 = 0; | ||
211 | entry_2 = 0; | ||
212 | goto install; | ||
213 | } | ||
214 | } | ||
215 | |||
216 | entry_1 = LDT_entry_a(&ldt_info); | ||
217 | entry_2 = LDT_entry_b(&ldt_info); | ||
218 | if (oldmode) | ||
219 | entry_2 &= ~(1 << 20); | ||
220 | |||
221 | /* Install the new entry ... */ | ||
222 | install: | ||
223 | *lp = entry_1; | ||
224 | *(lp+1) = entry_2; | ||
225 | error = 0; | ||
226 | |||
227 | out_unlock: | ||
228 | up(&mm->context.sem); | ||
229 | out: | ||
230 | return error; | ||
231 | } | ||
232 | |||
233 | asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) | ||
234 | { | ||
235 | int ret = -ENOSYS; | ||
236 | |||
237 | switch (func) { | ||
238 | case 0: | ||
239 | ret = read_ldt(ptr, bytecount); | ||
240 | break; | ||
241 | case 1: | ||
242 | ret = write_ldt(ptr, bytecount, 1); | ||
243 | break; | ||
244 | case 2: | ||
245 | ret = read_default_ldt(ptr, bytecount); | ||
246 | break; | ||
247 | case 0x11: | ||
248 | ret = write_ldt(ptr, bytecount, 0); | ||
249 | break; | ||
250 | } | ||
251 | return ret; | ||
252 | } | ||
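sys_modify_ldt() above is reached from userspace through the modify_ldt(2) system call: func 0 reads the table, func 1 and 0x11 write an entry (old and new mode), and func 2 reads the all-zero default LDT. An illustrative userspace sketch (not part of this file) that dumps the current LDT size, assuming <asm/ldt.h> provides LDT_ENTRIES and LDT_ENTRY_SIZE to the userspace build:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/ldt.h>		/* LDT_ENTRIES, LDT_ENTRY_SIZE */

int main(void)
{
	unsigned char buf[LDT_ENTRIES * LDT_ENTRY_SIZE];
	long n;

	/* func 0 == read_ldt(); returns the number of bytes copied out */
	n = syscall(SYS_modify_ldt, 0, buf, sizeof(buf));
	if (n < 0) {
		perror("modify_ldt");
		return 1;
	}
	printf("LDT is %ld bytes (%ld entries)\n", n, n / LDT_ENTRY_SIZE);
	return 0;
}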
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c new file mode 100644 index 000000000000..c3a554703672 --- /dev/null +++ b/arch/x86/kernel/machine_kexec_64.c | |||
@@ -0,0 +1,259 @@ | |||
1 | /* | ||
2 | * machine_kexec.c - handle transition of Linux booting another kernel | ||
3 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> | ||
4 | * | ||
5 | * This source code is licensed under the GNU General Public License, | ||
6 | * Version 2. See the file COPYING for more details. | ||
7 | */ | ||
8 | |||
9 | #include <linux/mm.h> | ||
10 | #include <linux/kexec.h> | ||
11 | #include <linux/string.h> | ||
12 | #include <linux/reboot.h> | ||
13 | #include <asm/pgtable.h> | ||
14 | #include <asm/tlbflush.h> | ||
15 | #include <asm/mmu_context.h> | ||
16 | #include <asm/io.h> | ||
17 | |||
18 | #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) | ||
19 | static u64 kexec_pgd[512] PAGE_ALIGNED; | ||
20 | static u64 kexec_pud0[512] PAGE_ALIGNED; | ||
21 | static u64 kexec_pmd0[512] PAGE_ALIGNED; | ||
22 | static u64 kexec_pte0[512] PAGE_ALIGNED; | ||
23 | static u64 kexec_pud1[512] PAGE_ALIGNED; | ||
24 | static u64 kexec_pmd1[512] PAGE_ALIGNED; | ||
25 | static u64 kexec_pte1[512] PAGE_ALIGNED; | ||
26 | |||
27 | static void init_level2_page(pmd_t *level2p, unsigned long addr) | ||
28 | { | ||
29 | unsigned long end_addr; | ||
30 | |||
31 | addr &= PAGE_MASK; | ||
32 | end_addr = addr + PUD_SIZE; | ||
33 | while (addr < end_addr) { | ||
34 | set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); | ||
35 | addr += PMD_SIZE; | ||
36 | } | ||
37 | } | ||
38 | |||
39 | static int init_level3_page(struct kimage *image, pud_t *level3p, | ||
40 | unsigned long addr, unsigned long last_addr) | ||
41 | { | ||
42 | unsigned long end_addr; | ||
43 | int result; | ||
44 | |||
45 | result = 0; | ||
46 | addr &= PAGE_MASK; | ||
47 | end_addr = addr + PGDIR_SIZE; | ||
48 | while ((addr < last_addr) && (addr < end_addr)) { | ||
49 | struct page *page; | ||
50 | pmd_t *level2p; | ||
51 | |||
52 | page = kimage_alloc_control_pages(image, 0); | ||
53 | if (!page) { | ||
54 | result = -ENOMEM; | ||
55 | goto out; | ||
56 | } | ||
57 | level2p = (pmd_t *)page_address(page); | ||
58 | init_level2_page(level2p, addr); | ||
59 | set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); | ||
60 | addr += PUD_SIZE; | ||
61 | } | ||
62 | /* clear the unused entries */ | ||
63 | while (addr < end_addr) { | ||
64 | pud_clear(level3p++); | ||
65 | addr += PUD_SIZE; | ||
66 | } | ||
67 | out: | ||
68 | return result; | ||
69 | } | ||
70 | |||
71 | |||
72 | static int init_level4_page(struct kimage *image, pgd_t *level4p, | ||
73 | unsigned long addr, unsigned long last_addr) | ||
74 | { | ||
75 | unsigned long end_addr; | ||
76 | int result; | ||
77 | |||
78 | result = 0; | ||
79 | addr &= PAGE_MASK; | ||
80 | end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE); | ||
81 | while ((addr < last_addr) && (addr < end_addr)) { | ||
82 | struct page *page; | ||
83 | pud_t *level3p; | ||
84 | |||
85 | page = kimage_alloc_control_pages(image, 0); | ||
86 | if (!page) { | ||
87 | result = -ENOMEM; | ||
88 | goto out; | ||
89 | } | ||
90 | level3p = (pud_t *)page_address(page); | ||
91 | result = init_level3_page(image, level3p, addr, last_addr); | ||
92 | if (result) { | ||
93 | goto out; | ||
94 | } | ||
95 | set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); | ||
96 | addr += PGDIR_SIZE; | ||
97 | } | ||
98 | /* clear the unused entries */ | ||
99 | while (addr < end_addr) { | ||
100 | pgd_clear(level4p++); | ||
101 | addr += PGDIR_SIZE; | ||
102 | } | ||
103 | out: | ||
104 | return result; | ||
105 | } | ||
106 | |||
107 | |||
108 | static int init_pgtable(struct kimage *image, unsigned long start_pgtable) | ||
109 | { | ||
110 | pgd_t *level4p; | ||
111 | level4p = (pgd_t *)__va(start_pgtable); | ||
112 | return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); | ||
113 | } | ||
114 | |||
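To put numbers on the page-table construction above: the identity map covers physical memory up to end_pfn using 2 MB (PMD_SIZE) large pages, each level-2 table filled by init_level2_page() maps 1 GB (PUD_SIZE), and each level-3 table maps 512 GB (PGDIR_SIZE). For a 4 GB machine, init_pgtable() therefore fills the level-4 table at start_pgtable with a single entry, allocates one level-3 page and four level-2 pages from the kexec control pages, and clears the remaining entries.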
115 | static void set_idt(void *newidt, u16 limit) | ||
116 | { | ||
117 | struct desc_ptr curidt; | ||
118 | |||
119 | /* x86-64 supports unaligned loads & stores */ | ||
120 | curidt.size = limit; | ||
121 | curidt.address = (unsigned long)newidt; | ||
122 | |||
123 | __asm__ __volatile__ ( | ||
124 | "lidtq %0\n" | ||
125 | : : "m" (curidt) | ||
126 | ); | ||
127 | }; | ||
128 | |||
129 | |||
130 | static void set_gdt(void *newgdt, u16 limit) | ||
131 | { | ||
132 | struct desc_ptr curgdt; | ||
133 | |||
134 | /* x86-64 supports unaligned loads & stores */ | ||
135 | curgdt.size = limit; | ||
136 | curgdt.address = (unsigned long)newgdt; | ||
137 | |||
138 | __asm__ __volatile__ ( | ||
139 | "lgdtq %0\n" | ||
140 | : : "m" (curgdt) | ||
141 | ); | ||
142 | }; | ||
143 | |||
144 | static void load_segments(void) | ||
145 | { | ||
146 | __asm__ __volatile__ ( | ||
147 | "\tmovl %0,%%ds\n" | ||
148 | "\tmovl %0,%%es\n" | ||
149 | "\tmovl %0,%%ss\n" | ||
150 | "\tmovl %0,%%fs\n" | ||
151 | "\tmovl %0,%%gs\n" | ||
152 | : : "a" (__KERNEL_DS) : "memory" | ||
153 | ); | ||
154 | } | ||
155 | |||
156 | int machine_kexec_prepare(struct kimage *image) | ||
157 | { | ||
158 | unsigned long start_pgtable; | ||
159 | int result; | ||
160 | |||
161 | /* Calculate the offsets */ | ||
162 | start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; | ||
163 | |||
164 | /* Setup the identity mapped 64bit page table */ | ||
165 | result = init_pgtable(image, start_pgtable); | ||
166 | if (result) | ||
167 | return result; | ||
168 | |||
169 | return 0; | ||
170 | } | ||
171 | |||
172 | void machine_kexec_cleanup(struct kimage *image) | ||
173 | { | ||
174 | return; | ||
175 | } | ||
176 | |||
177 | /* | ||
178 | * Do not allocate memory (or fail in any way) in machine_kexec(). | ||
179 | * We are past the point of no return, committed to rebooting now. | ||
180 | */ | ||
181 | NORET_TYPE void machine_kexec(struct kimage *image) | ||
182 | { | ||
183 | unsigned long page_list[PAGES_NR]; | ||
184 | void *control_page; | ||
185 | |||
186 | /* Interrupts aren't acceptable while we reboot */ | ||
187 | local_irq_disable(); | ||
188 | |||
189 | control_page = page_address(image->control_code_page) + PAGE_SIZE; | ||
190 | memcpy(control_page, relocate_kernel, PAGE_SIZE); | ||
191 | |||
192 | page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); | ||
193 | page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; | ||
194 | page_list[PA_PGD] = virt_to_phys(&kexec_pgd); | ||
195 | page_list[VA_PGD] = (unsigned long)kexec_pgd; | ||
196 | page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0); | ||
197 | page_list[VA_PUD_0] = (unsigned long)kexec_pud0; | ||
198 | page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0); | ||
199 | page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; | ||
200 | page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0); | ||
201 | page_list[VA_PTE_0] = (unsigned long)kexec_pte0; | ||
202 | page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1); | ||
203 | page_list[VA_PUD_1] = (unsigned long)kexec_pud1; | ||
204 | page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1); | ||
205 | page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; | ||
206 | page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1); | ||
207 | page_list[VA_PTE_1] = (unsigned long)kexec_pte1; | ||
208 | |||
209 | page_list[PA_TABLE_PAGE] = | ||
210 | (unsigned long)__pa(page_address(image->control_code_page)); | ||
211 | |||
212 | /* The segment registers are funny things, they have both a | ||
213 | * visible and an invisible part. Whenever the visible part is | ||
214 | * set to a specific selector, the invisible part is loaded | ||
216 | * from a table in memory. At no other time is the | ||
216 | * descriptor table in memory accessed. | ||
217 | * | ||
218 | * I take advantage of this here by force loading the | ||
219 | * segments, before I zap the gdt with an invalid value. | ||
220 | */ | ||
221 | load_segments(); | ||
222 | /* The gdt & idt are now invalid. | ||
223 | * If you want to load them you must set up your own idt & gdt. | ||
224 | */ | ||
225 | set_gdt(phys_to_virt(0),0); | ||
226 | set_idt(phys_to_virt(0),0); | ||
227 | |||
228 | /* now call it */ | ||
229 | relocate_kernel((unsigned long)image->head, (unsigned long)page_list, | ||
230 | image->start); | ||
231 | } | ||
232 | |||
233 | /* crashkernel=size@addr specifies the location to reserve for | ||
234 | * a crash kernel. By reserving this memory we guarantee | ||
235 | * that Linux never sets it up as a DMA target. | ||
236 | * Useful for holding code to do something appropriate | ||
237 | * after a kernel panic. | ||
238 | */ | ||
239 | static int __init setup_crashkernel(char *arg) | ||
240 | { | ||
241 | unsigned long size, base; | ||
242 | char *p; | ||
243 | if (!arg) | ||
244 | return -EINVAL; | ||
245 | size = memparse(arg, &p); | ||
246 | if (arg == p) | ||
247 | return -EINVAL; | ||
248 | if (*p == '@') { | ||
249 | base = memparse(p+1, &p); | ||
250 | /* FIXME: Do I want a sanity check to validate the | ||
251 | * memory range? Yes you do, but it's too early for | ||
252 | * e820 -AK */ | ||
253 | crashk_res.start = base; | ||
254 | crashk_res.end = base + size - 1; | ||
255 | } | ||
256 | return 0; | ||
257 | } | ||
258 | early_param("crashkernel", setup_crashkernel); | ||
259 | |||
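setup_crashkernel() above accepts the usual memparse() syntax, so booting with crashkernel=64M@16M reserves 64 MB of RAM starting at physical address 16 MB for a crash kernel. A standalone sketch of roughly what that parse does (illustration only; the kernel uses memparse(), not this helper):

#include <stdio.h>
#include <stdlib.h>

/* Toy version of memparse(): a number with an optional K/M/G suffix. */
static unsigned long parse_size(const char *s, char **end)
{
	unsigned long v = strtoul(s, end, 0);

	switch (**end) {
	case 'G': v <<= 10;	/* fall through */
	case 'M': v <<= 10;	/* fall through */
	case 'K': v <<= 10;
		  (*end)++;
		  break;
	}
	return v;
}

int main(void)
{
	const char *arg = "64M@16M";
	char *p;
	unsigned long size = parse_size(arg, &p);
	unsigned long base = (*p == '@') ? parse_size(p + 1, &p) : 0;

	/* Prints: reserve 67108864 bytes at 0x1000000 */
	printf("reserve %lu bytes at 0x%lx\n", size, base);
	return 0;
}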
diff --git a/arch/x86/kernel/mce_64.c b/arch/x86/kernel/mce_64.c new file mode 100644 index 000000000000..a66d607f5b92 --- /dev/null +++ b/arch/x86/kernel/mce_64.c | |||
@@ -0,0 +1,875 @@ | |||
1 | /* | ||
2 | * Machine check handler. | ||
3 | * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | * Rest from unknown author(s). | ||
5 | * 2004 Andi Kleen. Rewrote most of it. | ||
6 | */ | ||
7 | |||
8 | #include <linux/init.h> | ||
9 | #include <linux/types.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/string.h> | ||
13 | #include <linux/rcupdate.h> | ||
14 | #include <linux/kallsyms.h> | ||
15 | #include <linux/sysdev.h> | ||
16 | #include <linux/miscdevice.h> | ||
17 | #include <linux/fs.h> | ||
18 | #include <linux/capability.h> | ||
19 | #include <linux/cpu.h> | ||
20 | #include <linux/percpu.h> | ||
21 | #include <linux/poll.h> | ||
22 | #include <linux/thread_info.h> | ||
23 | #include <linux/ctype.h> | ||
24 | #include <linux/kmod.h> | ||
25 | #include <linux/kdebug.h> | ||
26 | #include <asm/processor.h> | ||
27 | #include <asm/msr.h> | ||
28 | #include <asm/mce.h> | ||
29 | #include <asm/uaccess.h> | ||
30 | #include <asm/smp.h> | ||
31 | #include <asm/idle.h> | ||
32 | |||
33 | #define MISC_MCELOG_MINOR 227 | ||
34 | #define NR_BANKS 6 | ||
35 | |||
36 | atomic_t mce_entry; | ||
37 | |||
38 | static int mce_dont_init; | ||
39 | |||
40 | /* | ||
41 | * Tolerant levels: | ||
42 | * 0: always panic on uncorrected errors, log corrected errors | ||
43 | * 1: panic or SIGBUS on uncorrected errors, log corrected errors | ||
44 | * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors | ||
45 | * 3: never panic or SIGBUS, log all errors (for testing only) | ||
46 | */ | ||
47 | static int tolerant = 1; | ||
48 | static int banks; | ||
49 | static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; | ||
50 | static unsigned long notify_user; | ||
51 | static int rip_msr; | ||
52 | static int mce_bootlog = 1; | ||
53 | static atomic_t mce_events; | ||
54 | |||
55 | static char trigger[128]; | ||
56 | static char *trigger_argv[2] = { trigger, NULL }; | ||
57 | |||
58 | static DECLARE_WAIT_QUEUE_HEAD(mce_wait); | ||
59 | |||
60 | /* | ||
61 | * Lockless MCE logging infrastructure. | ||
62 | * This avoids deadlocks on printk locks without having to break locks. Also | ||
63 | * separate MCEs from kernel messages to avoid bogus bug reports. | ||
64 | */ | ||
65 | |||
66 | struct mce_log mcelog = { | ||
67 | MCE_LOG_SIGNATURE, | ||
68 | MCE_LOG_LEN, | ||
69 | }; | ||
70 | |||
71 | void mce_log(struct mce *mce) | ||
72 | { | ||
73 | unsigned next, entry; | ||
74 | atomic_inc(&mce_events); | ||
75 | mce->finished = 0; | ||
76 | wmb(); | ||
77 | for (;;) { | ||
78 | entry = rcu_dereference(mcelog.next); | ||
79 | /* The rmb forces the compiler to reload next in each | ||
80 | iteration */ | ||
81 | rmb(); | ||
82 | for (;;) { | ||
83 | /* When the buffer fills up discard new entries. Assume | ||
84 | that the earlier errors are the more interesting. */ | ||
85 | if (entry >= MCE_LOG_LEN) { | ||
86 | set_bit(MCE_OVERFLOW, &mcelog.flags); | ||
87 | return; | ||
88 | } | ||
89 | /* Old left over entry. Skip. */ | ||
90 | if (mcelog.entry[entry].finished) { | ||
91 | entry++; | ||
92 | continue; | ||
93 | } | ||
94 | break; | ||
95 | } | ||
96 | smp_rmb(); | ||
97 | next = entry + 1; | ||
98 | if (cmpxchg(&mcelog.next, entry, next) == entry) | ||
99 | break; | ||
100 | } | ||
101 | memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); | ||
102 | wmb(); | ||
103 | mcelog.entry[entry].finished = 1; | ||
104 | wmb(); | ||
105 | |||
106 | set_bit(0, ¬ify_user); | ||
107 | } | ||
108 | |||
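The loop in mce_log() above is lock-free slot reservation: each writer snapshots mcelog.next, skips slots still marked finished from earlier use, and claims a slot by advancing next with cmpxchg(); the record is written before finished is set, so readers never see a half-written entry. A simplified userspace sketch of the same reservation pattern (GCC builtins stand in for the kernel's cmpxchg()/wmb(), and the old-entry skipping is omitted):

#define LOG_LEN 32

struct rec {
	int finished;		/* set last, after the payload is complete */
	int data;
};

static struct rec log_buf[LOG_LEN];
static unsigned int next_slot;

static int log_event(int data)
{
	unsigned int entry;

	do {
		entry = next_slot;
		if (entry >= LOG_LEN)
			return -1;	/* buffer full: drop the event */
	} while (__sync_val_compare_and_swap(&next_slot, entry, entry + 1) != entry);

	log_buf[entry].data = data;
	__sync_synchronize();		/* order the payload before 'finished' */
	log_buf[entry].finished = 1;
	return 0;
}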
109 | static void print_mce(struct mce *m) | ||
110 | { | ||
111 | printk(KERN_EMERG "\n" | ||
112 | KERN_EMERG "HARDWARE ERROR\n" | ||
113 | KERN_EMERG | ||
114 | "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", | ||
115 | m->cpu, m->mcgstatus, m->bank, m->status); | ||
116 | if (m->rip) { | ||
117 | printk(KERN_EMERG | ||
118 | "RIP%s %02x:<%016Lx> ", | ||
119 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", | ||
120 | m->cs, m->rip); | ||
121 | if (m->cs == __KERNEL_CS) | ||
122 | print_symbol("{%s}", m->rip); | ||
123 | printk("\n"); | ||
124 | } | ||
125 | printk(KERN_EMERG "TSC %Lx ", m->tsc); | ||
126 | if (m->addr) | ||
127 | printk("ADDR %Lx ", m->addr); | ||
128 | if (m->misc) | ||
129 | printk("MISC %Lx ", m->misc); | ||
130 | printk("\n"); | ||
131 | printk(KERN_EMERG "This is not a software problem!\n"); | ||
132 | printk(KERN_EMERG | ||
133 | "Run through mcelog --ascii to decode and contact your hardware vendor\n"); | ||
134 | } | ||
135 | |||
136 | static void mce_panic(char *msg, struct mce *backup, unsigned long start) | ||
137 | { | ||
138 | int i; | ||
139 | |||
140 | oops_begin(); | ||
141 | for (i = 0; i < MCE_LOG_LEN; i++) { | ||
142 | unsigned long tsc = mcelog.entry[i].tsc; | ||
143 | if (time_before(tsc, start)) | ||
144 | continue; | ||
145 | print_mce(&mcelog.entry[i]); | ||
146 | if (backup && mcelog.entry[i].tsc == backup->tsc) | ||
147 | backup = NULL; | ||
148 | } | ||
149 | if (backup) | ||
150 | print_mce(backup); | ||
151 | panic(msg); | ||
152 | } | ||
153 | |||
154 | static int mce_available(struct cpuinfo_x86 *c) | ||
155 | { | ||
156 | return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); | ||
157 | } | ||
158 | |||
159 | static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) | ||
160 | { | ||
161 | if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { | ||
162 | m->rip = regs->rip; | ||
163 | m->cs = regs->cs; | ||
164 | } else { | ||
165 | m->rip = 0; | ||
166 | m->cs = 0; | ||
167 | } | ||
168 | if (rip_msr) { | ||
169 | /* Assume the RIP in the MSR is exact. Is this true? */ | ||
170 | m->mcgstatus |= MCG_STATUS_EIPV; | ||
171 | rdmsrl(rip_msr, m->rip); | ||
172 | m->cs = 0; | ||
173 | } | ||
174 | } | ||
175 | |||
176 | /* | ||
177 | * The actual machine check handler | ||
178 | */ | ||
179 | |||
180 | void do_machine_check(struct pt_regs * regs, long error_code) | ||
181 | { | ||
182 | struct mce m, panicm; | ||
183 | u64 mcestart = 0; | ||
184 | int i; | ||
185 | int panicm_found = 0; | ||
186 | /* | ||
187 | * If no_way_out gets set, there is no safe way to recover from this | ||
188 | * MCE. If tolerant is cranked up, we'll try anyway. | ||
189 | */ | ||
190 | int no_way_out = 0; | ||
191 | /* | ||
192 | * If kill_it gets set, there might be a way to recover from this | ||
193 | * error. | ||
194 | */ | ||
195 | int kill_it = 0; | ||
196 | |||
197 | atomic_inc(&mce_entry); | ||
198 | |||
199 | if (regs) | ||
200 | notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL); | ||
201 | if (!banks) | ||
202 | goto out2; | ||
203 | |||
204 | memset(&m, 0, sizeof(struct mce)); | ||
205 | m.cpu = smp_processor_id(); | ||
206 | rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); | ||
207 | /* if the restart IP is not valid, we're done for */ | ||
208 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) | ||
209 | no_way_out = 1; | ||
210 | |||
211 | rdtscll(mcestart); | ||
212 | barrier(); | ||
213 | |||
214 | for (i = 0; i < banks; i++) { | ||
215 | if (!bank[i]) | ||
216 | continue; | ||
217 | |||
218 | m.misc = 0; | ||
219 | m.addr = 0; | ||
220 | m.bank = i; | ||
221 | m.tsc = 0; | ||
222 | |||
223 | rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); | ||
224 | if ((m.status & MCI_STATUS_VAL) == 0) | ||
225 | continue; | ||
226 | |||
227 | if (m.status & MCI_STATUS_EN) { | ||
228 | /* if PCC was set, there's no way out */ | ||
229 | no_way_out |= !!(m.status & MCI_STATUS_PCC); | ||
230 | /* | ||
231 | * If this error was uncorrectable and there was | ||
232 | * an overflow, we're in trouble. If no overflow, | ||
233 | * we might get away with just killing a task. | ||
234 | */ | ||
235 | if (m.status & MCI_STATUS_UC) { | ||
236 | if (tolerant < 1 || m.status & MCI_STATUS_OVER) | ||
237 | no_way_out = 1; | ||
238 | kill_it = 1; | ||
239 | } | ||
240 | } | ||
241 | |||
242 | if (m.status & MCI_STATUS_MISCV) | ||
243 | rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); | ||
244 | if (m.status & MCI_STATUS_ADDRV) | ||
245 | rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); | ||
246 | |||
247 | mce_get_rip(&m, regs); | ||
248 | if (error_code >= 0) | ||
249 | rdtscll(m.tsc); | ||
250 | if (error_code != -2) | ||
251 | mce_log(&m); | ||
252 | |||
253 | /* Did this bank cause the exception? */ | ||
254 | /* Assume that the bank with uncorrectable errors did it, | ||
255 | and that there is only a single one. */ | ||
256 | if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { | ||
257 | panicm = m; | ||
258 | panicm_found = 1; | ||
259 | } | ||
260 | |||
261 | add_taint(TAINT_MACHINE_CHECK); | ||
262 | } | ||
263 | |||
264 | /* Never do anything final in the polling timer */ | ||
265 | if (!regs) | ||
266 | goto out; | ||
267 | |||
268 | /* If we didn't find an uncorrectable error, pick | ||
269 | the last one (shouldn't happen, just being safe). */ | ||
270 | if (!panicm_found) | ||
271 | panicm = m; | ||
272 | |||
273 | /* | ||
274 | * If we have decided that we just CAN'T continue, and the user | ||
275 | * has not set tolerant to an insane level, give up and die. | ||
276 | */ | ||
277 | if (no_way_out && tolerant < 3) | ||
278 | mce_panic("Machine check", &panicm, mcestart); | ||
279 | |||
280 | /* | ||
281 | * If the error seems to be unrecoverable, something should be | ||
282 | * done. Try to kill as little as possible. If we can kill just | ||
283 | * one task, do that. If the user has set the tolerance very | ||
284 | * high, don't try to do anything at all. | ||
285 | */ | ||
286 | if (kill_it && tolerant < 3) { | ||
287 | int user_space = 0; | ||
288 | |||
289 | /* | ||
290 | * If the EIPV bit is set, it means the saved IP is the | ||
291 | * instruction which caused the MCE. | ||
292 | */ | ||
293 | if (m.mcgstatus & MCG_STATUS_EIPV) | ||
294 | user_space = panicm.rip && (panicm.cs & 3); | ||
295 | |||
296 | /* | ||
297 | * If we know that the error was in user space, send a | ||
298 | * SIGBUS. Otherwise, panic if tolerance is low. | ||
299 | * | ||
300 | * do_exit() takes an awful lot of locks and has a slight | ||
301 | * risk of deadlocking. | ||
302 | */ | ||
303 | if (user_space) { | ||
304 | do_exit(SIGBUS); | ||
305 | } else if (panic_on_oops || tolerant < 2) { | ||
306 | mce_panic("Uncorrected machine check", | ||
307 | &panicm, mcestart); | ||
308 | } | ||
309 | } | ||
310 | |||
311 | /* notify userspace ASAP */ | ||
312 | set_thread_flag(TIF_MCE_NOTIFY); | ||
313 | |||
314 | out: | ||
315 | /* the last thing we do is clear state */ | ||
316 | for (i = 0; i < banks; i++) | ||
317 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
318 | wrmsrl(MSR_IA32_MCG_STATUS, 0); | ||
319 | out2: | ||
320 | atomic_dec(&mce_entry); | ||
321 | } | ||
322 | |||
323 | #ifdef CONFIG_X86_MCE_INTEL | ||
324 | /*** | ||
325 | * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog | ||
326 | * @cpu: The CPU on which the event occurred. | ||
327 | * @status: Event status information | ||
328 | * | ||
329 | * This function should be called by the thermal interrupt after the | ||
330 | * event has been processed and the decision was made to log the event | ||
331 | * further. | ||
332 | * | ||
333 | * The status parameter will be saved to the 'status' field of 'struct mce' | ||
334 | * and historically has been the register value of the | ||
335 | * MSR_IA32_THERMAL_STATUS (Intel) msr. | ||
336 | */ | ||
337 | void mce_log_therm_throt_event(unsigned int cpu, __u64 status) | ||
338 | { | ||
339 | struct mce m; | ||
340 | |||
341 | memset(&m, 0, sizeof(m)); | ||
342 | m.cpu = cpu; | ||
343 | m.bank = MCE_THERMAL_BANK; | ||
344 | m.status = status; | ||
345 | rdtscll(m.tsc); | ||
346 | mce_log(&m); | ||
347 | } | ||
348 | #endif /* CONFIG_X86_MCE_INTEL */ | ||
349 | |||
350 | /* | ||
351 | * Periodic polling timer for "silent" machine check errors. If the | ||
352 | * poller finds an MCE, poll 2x faster. When the poller finds no more | ||
353 | * errors, poll 2x slower (up to check_interval seconds). | ||
354 | */ | ||
355 | |||
356 | static int check_interval = 5 * 60; /* 5 minutes */ | ||
357 | static int next_interval; /* in jiffies */ | ||
358 | static void mcheck_timer(struct work_struct *work); | ||
359 | static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); | ||
360 | |||
361 | static void mcheck_check_cpu(void *info) | ||
362 | { | ||
363 | if (mce_available(¤t_cpu_data)) | ||
364 | do_machine_check(NULL, 0); | ||
365 | } | ||
366 | |||
367 | static void mcheck_timer(struct work_struct *work) | ||
368 | { | ||
369 | on_each_cpu(mcheck_check_cpu, NULL, 1, 1); | ||
370 | |||
371 | /* | ||
372 | * Alert userspace if needed. If we logged an MCE, reduce the | ||
373 | * polling interval, otherwise increase the polling interval. | ||
374 | */ | ||
375 | if (mce_notify_user()) { | ||
376 | next_interval = max(next_interval/2, HZ/100); | ||
377 | } else { | ||
378 | next_interval = min(next_interval*2, | ||
379 | (int)round_jiffies_relative(check_interval*HZ)); | ||
380 | } | ||
381 | |||
382 | schedule_delayed_work(&mcheck_work, next_interval); | ||
383 | } | ||
384 | |||
385 | /* | ||
386 | * This is only called from process context. This is where we do | ||
387 | * anything we need to alert userspace about new MCEs. This is called | ||
388 | * directly from the poller and also from entry.S and idle, thanks to | ||
389 | * TIF_MCE_NOTIFY. | ||
390 | */ | ||
391 | int mce_notify_user(void) | ||
392 | { | ||
393 | clear_thread_flag(TIF_MCE_NOTIFY); | ||
394 | if (test_and_clear_bit(0, ¬ify_user)) { | ||
395 | static unsigned long last_print; | ||
396 | unsigned long now = jiffies; | ||
397 | |||
398 | wake_up_interruptible(&mce_wait); | ||
399 | if (trigger[0]) | ||
400 | call_usermodehelper(trigger, trigger_argv, NULL, | ||
401 | UMH_NO_WAIT); | ||
402 | |||
403 | if (time_after_eq(now, last_print + (check_interval*HZ))) { | ||
404 | last_print = now; | ||
405 | printk(KERN_INFO "Machine check events logged\n"); | ||
406 | } | ||
407 | |||
408 | return 1; | ||
409 | } | ||
410 | return 0; | ||
411 | } | ||
412 | |||
413 | /* see if the idle task needs to notify userspace */ | ||
414 | static int | ||
415 | mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) | ||
416 | { | ||
417 | /* IDLE_END should be safe - interrupts are back on */ | ||
418 | if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) | ||
419 | mce_notify_user(); | ||
420 | |||
421 | return NOTIFY_OK; | ||
422 | } | ||
423 | |||
424 | static struct notifier_block mce_idle_notifier = { | ||
425 | .notifier_call = mce_idle_callback, | ||
426 | }; | ||
427 | |||
428 | static __init int periodic_mcheck_init(void) | ||
429 | { | ||
430 | next_interval = check_interval * HZ; | ||
431 | if (next_interval) | ||
432 | schedule_delayed_work(&mcheck_work, | ||
433 | round_jiffies_relative(next_interval)); | ||
434 | idle_notifier_register(&mce_idle_notifier); | ||
435 | return 0; | ||
436 | } | ||
437 | __initcall(periodic_mcheck_init); | ||
438 | |||
439 | |||
440 | /* | ||
441 | * Initialize Machine Checks for a CPU. | ||
442 | */ | ||
443 | static void mce_init(void *dummy) | ||
444 | { | ||
445 | u64 cap; | ||
446 | int i; | ||
447 | |||
448 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
449 | banks = cap & 0xff; | ||
450 | if (banks > NR_BANKS) { | ||
451 | printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); | ||
452 | banks = NR_BANKS; | ||
453 | } | ||
454 | /* Use accurate RIP reporting if available. */ | ||
455 | if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) | ||
456 | rip_msr = MSR_IA32_MCG_EIP; | ||
457 | |||
458 | /* Log the machine checks left over from the previous reset. | ||
459 | This also clears all registers */ | ||
460 | do_machine_check(NULL, mce_bootlog ? -1 : -2); | ||
461 | |||
462 | set_in_cr4(X86_CR4_MCE); | ||
463 | |||
464 | if (cap & MCG_CTL_P) | ||
465 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | ||
466 | |||
467 | for (i = 0; i < banks; i++) { | ||
468 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); | ||
469 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
470 | } | ||
471 | } | ||
472 | |||
473 | /* Add per CPU specific workarounds here */ | ||
474 | static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) | ||
475 | { | ||
476 | /* This should be disabled by the BIOS, but isn't always */ | ||
477 | if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { | ||
478 | /* disable GART TBL walk error reporting, which trips off | ||
479 | incorrectly with the IOMMU & 3ware & Cerberus. */ | ||
480 | clear_bit(10, &bank[4]); | ||
481 | /* Lots of broken BIOS around that don't clear them | ||
482 | by default and leave crap in there. Don't log. */ | ||
483 | mce_bootlog = 0; | ||
484 | } | ||
485 | |||
486 | } | ||
487 | |||
488 | static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) | ||
489 | { | ||
490 | switch (c->x86_vendor) { | ||
491 | case X86_VENDOR_INTEL: | ||
492 | mce_intel_feature_init(c); | ||
493 | break; | ||
494 | case X86_VENDOR_AMD: | ||
495 | mce_amd_feature_init(c); | ||
496 | break; | ||
497 | default: | ||
498 | break; | ||
499 | } | ||
500 | } | ||
501 | |||
502 | /* | ||
503 | * Called for each booted CPU to set up machine checks. | ||
504 | * Must be called with preempt off. | ||
505 | */ | ||
506 | void __cpuinit mcheck_init(struct cpuinfo_x86 *c) | ||
507 | { | ||
508 | static cpumask_t mce_cpus = CPU_MASK_NONE; | ||
509 | |||
510 | mce_cpu_quirks(c); | ||
511 | |||
512 | if (mce_dont_init || | ||
513 | cpu_test_and_set(smp_processor_id(), mce_cpus) || | ||
514 | !mce_available(c)) | ||
515 | return; | ||
516 | |||
517 | mce_init(NULL); | ||
518 | mce_cpu_features(c); | ||
519 | } | ||
520 | |||
521 | /* | ||
522 | * Character device to read and clear the MCE log. | ||
523 | */ | ||
524 | |||
525 | static DEFINE_SPINLOCK(mce_state_lock); | ||
526 | static int open_count; /* #times opened */ | ||
527 | static int open_exclu; /* already open exclusive? */ | ||
528 | |||
529 | static int mce_open(struct inode *inode, struct file *file) | ||
530 | { | ||
531 | spin_lock(&mce_state_lock); | ||
532 | |||
533 | if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { | ||
534 | spin_unlock(&mce_state_lock); | ||
535 | return -EBUSY; | ||
536 | } | ||
537 | |||
538 | if (file->f_flags & O_EXCL) | ||
539 | open_exclu = 1; | ||
540 | open_count++; | ||
541 | |||
542 | spin_unlock(&mce_state_lock); | ||
543 | |||
544 | return nonseekable_open(inode, file); | ||
545 | } | ||
546 | |||
547 | static int mce_release(struct inode *inode, struct file *file) | ||
548 | { | ||
549 | spin_lock(&mce_state_lock); | ||
550 | |||
551 | open_count--; | ||
552 | open_exclu = 0; | ||
553 | |||
554 | spin_unlock(&mce_state_lock); | ||
555 | |||
556 | return 0; | ||
557 | } | ||
558 | |||
559 | static void collect_tscs(void *data) | ||
560 | { | ||
561 | unsigned long *cpu_tsc = (unsigned long *)data; | ||
562 | rdtscll(cpu_tsc[smp_processor_id()]); | ||
563 | } | ||
564 | |||
565 | static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) | ||
566 | { | ||
567 | unsigned long *cpu_tsc; | ||
568 | static DECLARE_MUTEX(mce_read_sem); | ||
569 | unsigned next; | ||
570 | char __user *buf = ubuf; | ||
571 | int i, err; | ||
572 | |||
573 | cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); | ||
574 | if (!cpu_tsc) | ||
575 | return -ENOMEM; | ||
576 | |||
577 | down(&mce_read_sem); | ||
578 | next = rcu_dereference(mcelog.next); | ||
579 | |||
580 | /* Only supports full reads right now */ | ||
581 | if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { | ||
582 | up(&mce_read_sem); | ||
583 | kfree(cpu_tsc); | ||
584 | return -EINVAL; | ||
585 | } | ||
586 | |||
587 | err = 0; | ||
588 | for (i = 0; i < next; i++) { | ||
589 | unsigned long start = jiffies; | ||
590 | while (!mcelog.entry[i].finished) { | ||
591 | if (time_after_eq(jiffies, start + 2)) { | ||
592 | memset(mcelog.entry + i,0, sizeof(struct mce)); | ||
593 | goto timeout; | ||
594 | } | ||
595 | cpu_relax(); | ||
596 | } | ||
597 | smp_rmb(); | ||
598 | err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); | ||
599 | buf += sizeof(struct mce); | ||
600 | timeout: | ||
601 | ; | ||
602 | } | ||
603 | |||
604 | memset(mcelog.entry, 0, next * sizeof(struct mce)); | ||
605 | mcelog.next = 0; | ||
606 | |||
607 | synchronize_sched(); | ||
608 | |||
609 | /* Collect entries that were still getting written before the synchronize. */ | ||
610 | |||
611 | on_each_cpu(collect_tscs, cpu_tsc, 1, 1); | ||
612 | for (i = next; i < MCE_LOG_LEN; i++) { | ||
613 | if (mcelog.entry[i].finished && | ||
614 | mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { | ||
615 | err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce)); | ||
616 | smp_rmb(); | ||
617 | buf += sizeof(struct mce); | ||
618 | memset(&mcelog.entry[i], 0, sizeof(struct mce)); | ||
619 | } | ||
620 | } | ||
621 | up(&mce_read_sem); | ||
622 | kfree(cpu_tsc); | ||
623 | return err ? -EFAULT : buf - ubuf; | ||
624 | } | ||
625 | |||
626 | static unsigned int mce_poll(struct file *file, poll_table *wait) | ||
627 | { | ||
628 | poll_wait(file, &mce_wait, wait); | ||
629 | if (rcu_dereference(mcelog.next)) | ||
630 | return POLLIN | POLLRDNORM; | ||
631 | return 0; | ||
632 | } | ||
633 | |||
634 | static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg) | ||
635 | { | ||
636 | int __user *p = (int __user *)arg; | ||
637 | if (!capable(CAP_SYS_ADMIN)) | ||
638 | return -EPERM; | ||
639 | switch (cmd) { | ||
640 | case MCE_GET_RECORD_LEN: | ||
641 | return put_user(sizeof(struct mce), p); | ||
642 | case MCE_GET_LOG_LEN: | ||
643 | return put_user(MCE_LOG_LEN, p); | ||
644 | case MCE_GETCLEAR_FLAGS: { | ||
645 | unsigned flags; | ||
646 | do { | ||
647 | flags = mcelog.flags; | ||
648 | } while (cmpxchg(&mcelog.flags, flags, 0) != flags); | ||
649 | return put_user(flags, p); | ||
650 | } | ||
651 | default: | ||
652 | return -ENOTTY; | ||
653 | } | ||
654 | } | ||
655 | |||
656 | static const struct file_operations mce_chrdev_ops = { | ||
657 | .open = mce_open, | ||
658 | .release = mce_release, | ||
659 | .read = mce_read, | ||
660 | .poll = mce_poll, | ||
661 | .ioctl = mce_ioctl, | ||
662 | }; | ||
663 | |||
664 | static struct miscdevice mce_log_device = { | ||
665 | MISC_MCELOG_MINOR, | ||
666 | "mcelog", | ||
667 | &mce_chrdev_ops, | ||
668 | }; | ||
669 | |||
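Together these file operations back the /dev/mcelog character device that the mcelog(8) tool drains. A userspace sketch of a minimal reader, assuming the MCE_GET_RECORD_LEN/MCE_GET_LOG_LEN ioctls from <asm/mce.h> are visible to the userspace build (mce_read() only accepts reads large enough for the whole log, hence the sizing):

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/mce.h>

int main(void)
{
	int fd = open("/dev/mcelog", O_RDONLY);
	int recl, logl;
	char *buf;
	ssize_t n;

	if (fd < 0) {
		perror("open /dev/mcelog");
		return 1;
	}
	if (ioctl(fd, MCE_GET_RECORD_LEN, &recl) < 0 ||
	    ioctl(fd, MCE_GET_LOG_LEN, &logl) < 0) {
		perror("ioctl");
		return 1;
	}

	buf = malloc((size_t)recl * logl);	/* room for the full log */
	if (!buf)
		return 1;
	n = read(fd, buf, (size_t)recl * logl);
	if (n < 0) {
		perror("read");
		return 1;
	}
	printf("drained %zd bytes (%zd records)\n", n, n / recl);
	free(buf);
	close(fd);
	return 0;
}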
670 | static unsigned long old_cr4 __initdata; | ||
671 | |||
672 | void __init stop_mce(void) | ||
673 | { | ||
674 | old_cr4 = read_cr4(); | ||
675 | clear_in_cr4(X86_CR4_MCE); | ||
676 | } | ||
677 | |||
678 | void __init restart_mce(void) | ||
679 | { | ||
680 | if (old_cr4 & X86_CR4_MCE) | ||
681 | set_in_cr4(X86_CR4_MCE); | ||
682 | } | ||
683 | |||
684 | /* | ||
685 | * Old style boot options parsing. Only for compatibility. | ||
686 | */ | ||
687 | |||
688 | static int __init mcheck_disable(char *str) | ||
689 | { | ||
690 | mce_dont_init = 1; | ||
691 | return 1; | ||
692 | } | ||
693 | |||
694 | /* mce=off disables machine check. Note you can reenable it later | ||
695 | using sysfs. | ||
696 | mce=TOLERANCELEVEL (number, see above) | ||
697 | mce=bootlog Log MCEs from before booting. Disabled by default on AMD. | ||
698 | mce=nobootlog Don't log MCEs from before booting. */ | ||
699 | static int __init mcheck_enable(char *str) | ||
700 | { | ||
701 | if (*str == '=') | ||
702 | str++; | ||
703 | if (!strcmp(str, "off")) | ||
704 | mce_dont_init = 1; | ||
705 | else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) | ||
706 | mce_bootlog = str[0] == 'b'; | ||
707 | else if (isdigit(str[0])) | ||
708 | get_option(&str, &tolerant); | ||
709 | else | ||
710 | printk("mce= argument %s ignored. Please use /sys\n", str); | ||
711 | return 1; | ||
712 | } | ||
713 | |||
714 | __setup("nomce", mcheck_disable); | ||
715 | __setup("mce", mcheck_enable); | ||
716 | |||
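For example, booting with mce=2 raises the tolerance level to 2, mce=nobootlog suppresses logging of machine checks left over from before boot, and nomce (or mce=off) disables the handler entirely; as the comment above notes, the same knobs can also be adjusted later through the per-CPU sysfs attributes registered below.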
717 | /* | ||
718 | * Sysfs support | ||
719 | */ | ||
720 | |||
721 | /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. | ||
722 | Only one CPU is active at this time; the others get re-added later using | ||
723 | CPU hotplug. */ | ||
724 | static int mce_resume(struct sys_device *dev) | ||
725 | { | ||
726 | mce_init(NULL); | ||
727 | return 0; | ||
728 | } | ||
729 | |||
730 | /* Reinit MCEs after user configuration changes */ | ||
731 | static void mce_restart(void) | ||
732 | { | ||
733 | if (next_interval) | ||
734 | cancel_delayed_work(&mcheck_work); | ||
735 | /* Timer race is harmless here */ | ||
736 | on_each_cpu(mce_init, NULL, 1, 1); | ||
737 | next_interval = check_interval * HZ; | ||
738 | if (next_interval) | ||
739 | schedule_delayed_work(&mcheck_work, | ||
740 | round_jiffies_relative(next_interval)); | ||
741 | } | ||
742 | |||
743 | static struct sysdev_class mce_sysclass = { | ||
744 | .resume = mce_resume, | ||
745 | set_kset_name("machinecheck"), | ||
746 | }; | ||
747 | |||
748 | DEFINE_PER_CPU(struct sys_device, device_mce); | ||
749 | |||
750 | /* Why are there no generic functions for this? */ | ||
751 | #define ACCESSOR(name, var, start) \ | ||
752 | static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ | ||
753 | return sprintf(buf, "%lx\n", (unsigned long)var); \ | ||
754 | } \ | ||
755 | static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ | ||
756 | char *end; \ | ||
757 | unsigned long new = simple_strtoul(buf, &end, 0); \ | ||
758 | if (end == buf) return -EINVAL; \ | ||
759 | var = new; \ | ||
760 | start; \ | ||
761 | return end-buf; \ | ||
762 | } \ | ||
763 | static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); | ||
764 | |||
765 | /* TBD should generate these dynamically based on number of available banks */ | ||
766 | ACCESSOR(bank0ctl,bank[0],mce_restart()) | ||
767 | ACCESSOR(bank1ctl,bank[1],mce_restart()) | ||
768 | ACCESSOR(bank2ctl,bank[2],mce_restart()) | ||
769 | ACCESSOR(bank3ctl,bank[3],mce_restart()) | ||
770 | ACCESSOR(bank4ctl,bank[4],mce_restart()) | ||
771 | ACCESSOR(bank5ctl,bank[5],mce_restart()) | ||
772 | |||
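For reference, this is roughly what ACCESSOR(bank0ctl,bank[0],mce_restart()) above expands to once the ## pasting is done (a hand expansion of the macro, shown only to make the generated sysfs attributes explicit):

static ssize_t show_bank0ctl(struct sys_device *s, char *buf)
{
	return sprintf(buf, "%lx\n", (unsigned long)bank[0]);
}
static ssize_t set_bank0ctl(struct sys_device *s, const char *buf, size_t siz)
{
	char *end;
	unsigned long new = simple_strtoul(buf, &end, 0);
	if (end == buf)
		return -EINVAL;
	bank[0] = new;
	mce_restart();
	return end - buf;
}
static SYSDEV_ATTR(bank0ctl, 0644, show_bank0ctl, set_bank0ctl);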
773 | static ssize_t show_trigger(struct sys_device *s, char *buf) | ||
774 | { | ||
775 | strcpy(buf, trigger); | ||
776 | strcat(buf, "\n"); | ||
777 | return strlen(trigger) + 1; | ||
778 | } | ||
779 | |||
780 | static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) | ||
781 | { | ||
782 | char *p; | ||
783 | int len; | ||
784 | strncpy(trigger, buf, sizeof(trigger)); | ||
785 | trigger[sizeof(trigger)-1] = 0; | ||
786 | len = strlen(trigger); | ||
787 | p = strchr(trigger, '\n'); | ||
788 | if (p) *p = 0; /* strchr() may return NULL */ | ||
789 | return len; | ||
790 | } | ||
791 | |||
792 | static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); | ||
793 | ACCESSOR(tolerant,tolerant,) | ||
794 | ACCESSOR(check_interval,check_interval,mce_restart()) | ||
795 | static struct sysdev_attribute *mce_attributes[] = { | ||
796 | &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, | ||
797 | &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, | ||
798 | &attr_tolerant, &attr_check_interval, &attr_trigger, | ||
799 | NULL | ||
800 | }; | ||
801 | |||
802 | /* Per cpu sysdev init. All of the cpus still share the same ctl bank */ | ||
803 | static __cpuinit int mce_create_device(unsigned int cpu) | ||
804 | { | ||
805 | int err; | ||
806 | int i; | ||
807 | if (!mce_available(&cpu_data[cpu])) | ||
808 | return -EIO; | ||
809 | |||
810 | per_cpu(device_mce,cpu).id = cpu; | ||
811 | per_cpu(device_mce,cpu).cls = &mce_sysclass; | ||
812 | |||
813 | err = sysdev_register(&per_cpu(device_mce,cpu)); | ||
814 | |||
815 | if (!err) { | ||
816 | for (i = 0; mce_attributes[i]; i++) | ||
817 | sysdev_create_file(&per_cpu(device_mce,cpu), | ||
818 | mce_attributes[i]); | ||
819 | } | ||
820 | return err; | ||
821 | } | ||
822 | |||
823 | static void mce_remove_device(unsigned int cpu) | ||
824 | { | ||
825 | int i; | ||
826 | |||
827 | for (i = 0; mce_attributes[i]; i++) | ||
828 | sysdev_remove_file(&per_cpu(device_mce,cpu), | ||
829 | mce_attributes[i]); | ||
830 | sysdev_unregister(&per_cpu(device_mce,cpu)); | ||
831 | memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); | ||
832 | } | ||
833 | |||
834 | /* Get notified when a cpu comes on/off. Be hotplug friendly. */ | ||
835 | static int | ||
836 | mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
837 | { | ||
838 | unsigned int cpu = (unsigned long)hcpu; | ||
839 | |||
840 | switch (action) { | ||
841 | case CPU_ONLINE: | ||
842 | case CPU_ONLINE_FROZEN: | ||
843 | mce_create_device(cpu); | ||
844 | break; | ||
845 | case CPU_DEAD: | ||
846 | case CPU_DEAD_FROZEN: | ||
847 | mce_remove_device(cpu); | ||
848 | break; | ||
849 | } | ||
850 | return NOTIFY_OK; | ||
851 | } | ||
852 | |||
853 | static struct notifier_block mce_cpu_notifier = { | ||
854 | .notifier_call = mce_cpu_callback, | ||
855 | }; | ||
856 | |||
857 | static __init int mce_init_device(void) | ||
858 | { | ||
859 | int err; | ||
860 | int i = 0; | ||
861 | |||
862 | if (!mce_available(&boot_cpu_data)) | ||
863 | return -EIO; | ||
864 | err = sysdev_class_register(&mce_sysclass); | ||
865 | |||
866 | for_each_online_cpu(i) { | ||
867 | mce_create_device(i); | ||
868 | } | ||
869 | |||
870 | register_hotcpu_notifier(&mce_cpu_notifier); | ||
871 | misc_register(&mce_log_device); | ||
872 | return err; | ||
873 | } | ||
874 | |||
875 | device_initcall(mce_init_device); | ||
diff --git a/arch/x86/kernel/mce_amd_64.c b/arch/x86/kernel/mce_amd_64.c new file mode 100644 index 000000000000..2f8a7f18b0fe --- /dev/null +++ b/arch/x86/kernel/mce_amd_64.c | |||
@@ -0,0 +1,689 @@ | |||
1 | /* | ||
2 | * (c) 2005, 2006 Advanced Micro Devices, Inc. | ||
3 | * Your use of this code is subject to the terms and conditions of the | ||
4 | * GNU general public license version 2. See "COPYING" or | ||
5 | * http://www.gnu.org/licenses/gpl.html | ||
6 | * | ||
7 | * Written by Jacob Shin - AMD, Inc. | ||
8 | * | ||
9 | * Support : jacob.shin@amd.com | ||
10 | * | ||
11 | * April 2006 | ||
12 | * - added support for AMD Family 0x10 processors | ||
13 | * | ||
14 | * All MC4_MISCi registers are shared between multi-cores | ||
15 | */ | ||
16 | |||
17 | #include <linux/cpu.h> | ||
18 | #include <linux/errno.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/interrupt.h> | ||
21 | #include <linux/kobject.h> | ||
22 | #include <linux/notifier.h> | ||
23 | #include <linux/sched.h> | ||
24 | #include <linux/smp.h> | ||
25 | #include <linux/sysdev.h> | ||
26 | #include <linux/sysfs.h> | ||
27 | #include <asm/apic.h> | ||
28 | #include <asm/mce.h> | ||
29 | #include <asm/msr.h> | ||
30 | #include <asm/percpu.h> | ||
31 | #include <asm/idle.h> | ||
32 | |||
33 | #define PFX "mce_threshold: " | ||
34 | #define VERSION "version 1.1.1" | ||
35 | #define NR_BANKS 6 | ||
36 | #define NR_BLOCKS 9 | ||
37 | #define THRESHOLD_MAX 0xFFF | ||
38 | #define INT_TYPE_APIC 0x00020000 | ||
39 | #define MASK_VALID_HI 0x80000000 | ||
40 | #define MASK_CNTP_HI 0x40000000 | ||
41 | #define MASK_LOCKED_HI 0x20000000 | ||
42 | #define MASK_LVTOFF_HI 0x00F00000 | ||
43 | #define MASK_COUNT_EN_HI 0x00080000 | ||
44 | #define MASK_INT_TYPE_HI 0x00060000 | ||
45 | #define MASK_OVERFLOW_HI 0x00010000 | ||
46 | #define MASK_ERR_COUNT_HI 0x00000FFF | ||
47 | #define MASK_BLKPTR_LO 0xFF000000 | ||
48 | #define MCG_XBLK_ADDR 0xC0000400 | ||
49 | |||
50 | struct threshold_block { | ||
51 | unsigned int block; | ||
52 | unsigned int bank; | ||
53 | unsigned int cpu; | ||
54 | u32 address; | ||
55 | u16 interrupt_enable; | ||
56 | u16 threshold_limit; | ||
57 | struct kobject kobj; | ||
58 | struct list_head miscj; | ||
59 | }; | ||
60 | |||
61 | /* defaults used early on boot */ | ||
62 | static struct threshold_block threshold_defaults = { | ||
63 | .interrupt_enable = 0, | ||
64 | .threshold_limit = THRESHOLD_MAX, | ||
65 | }; | ||
66 | |||
67 | struct threshold_bank { | ||
68 | struct kobject kobj; | ||
69 | struct threshold_block *blocks; | ||
70 | cpumask_t cpus; | ||
71 | }; | ||
72 | static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); | ||
73 | |||
74 | #ifdef CONFIG_SMP | ||
75 | static unsigned char shared_bank[NR_BANKS] = { | ||
76 | 0, 0, 0, 0, 1 | ||
77 | }; | ||
78 | #endif | ||
79 | |||
80 | static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ | ||
81 | |||
82 | /* | ||
83 | * CPU Initialization | ||
84 | */ | ||
85 | |||
86 | /* must be called with correct cpu affinity */ | ||
87 | static void threshold_restart_bank(struct threshold_block *b, | ||
88 | int reset, u16 old_limit) | ||
89 | { | ||
90 | u32 mci_misc_hi, mci_misc_lo; | ||
91 | |||
92 | rdmsr(b->address, mci_misc_lo, mci_misc_hi); | ||
93 | |||
94 | if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) | ||
95 | reset = 1; /* limit cannot be lower than err count */ | ||
96 | |||
97 | if (reset) { /* reset err count and overflow bit */ | ||
98 | mci_misc_hi = | ||
99 | (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | | ||
100 | (THRESHOLD_MAX - b->threshold_limit); | ||
101 | } else if (old_limit) { /* change limit w/o reset */ | ||
102 | int new_count = (mci_misc_hi & THRESHOLD_MAX) + | ||
103 | (old_limit - b->threshold_limit); | ||
104 | mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | | ||
105 | (new_count & THRESHOLD_MAX); | ||
106 | } | ||
107 | |||
108 | b->interrupt_enable ? | ||
109 | (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : | ||
110 | (mci_misc_hi &= ~MASK_INT_TYPE_HI); | ||
111 | |||
112 | mci_misc_hi |= MASK_COUNT_EN_HI; | ||
113 | wrmsr(b->address, mci_misc_lo, mci_misc_hi); | ||
114 | } | ||
115 | |||
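The arithmetic above mirrors how the hardware threshold counter works: the 12-bit error count in the high half of MCi_MISC counts up and the APIC interrupt fires when it overflows past THRESHOLD_MAX, so the count field is preset to THRESHOLD_MAX - threshold_limit. For example, a threshold_limit of 10 writes 0xFFF - 10 = 0xFF5, leaving exactly ten more errors before the interrupt, and show_error_count() below subtracts the same offset when reporting the count.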
116 | /* cpu init entry point, called from mce.c with preempt off */ | ||
117 | void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) | ||
118 | { | ||
119 | unsigned int bank, block; | ||
120 | unsigned int cpu = smp_processor_id(); | ||
121 | u32 low = 0, high = 0, address = 0; | ||
122 | |||
123 | for (bank = 0; bank < NR_BANKS; ++bank) { | ||
124 | for (block = 0; block < NR_BLOCKS; ++block) { | ||
125 | if (block == 0) | ||
126 | address = MSR_IA32_MC0_MISC + bank * 4; | ||
127 | else if (block == 1) { | ||
128 | address = (low & MASK_BLKPTR_LO) >> 21; | ||
129 | if (!address) | ||
130 | break; | ||
131 | address += MCG_XBLK_ADDR; | ||
132 | } | ||
133 | else | ||
134 | ++address; | ||
135 | |||
136 | if (rdmsr_safe(address, &low, &high)) | ||
137 | break; | ||
138 | |||
139 | if (!(high & MASK_VALID_HI)) { | ||
140 | if (block) | ||
141 | continue; | ||
142 | else | ||
143 | break; | ||
144 | } | ||
145 | |||
146 | if (!(high & MASK_CNTP_HI) || | ||
147 | (high & MASK_LOCKED_HI)) | ||
148 | continue; | ||
149 | |||
150 | if (!block) | ||
151 | per_cpu(bank_map, cpu) |= (1 << bank); | ||
152 | #ifdef CONFIG_SMP | ||
153 | if (shared_bank[bank] && c->cpu_core_id) | ||
154 | break; | ||
155 | #endif | ||
156 | high &= ~MASK_LVTOFF_HI; | ||
157 | high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; | ||
158 | wrmsr(address, low, high); | ||
159 | |||
160 | setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, | ||
161 | THRESHOLD_APIC_VECTOR, | ||
162 | K8_APIC_EXT_INT_MSG_FIX, 0); | ||
163 | |||
164 | threshold_defaults.address = address; | ||
165 | threshold_restart_bank(&threshold_defaults, 0, 0); | ||
166 | } | ||
167 | } | ||
168 | } | ||
169 | |||
170 | /* | ||
171 | * APIC Interrupt Handler | ||
172 | */ | ||
173 | |||
174 | /* | ||
175 | * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. | ||
176 | * the interrupt goes off when error_count reaches threshold_limit. | ||
177 | * the handler will simply log mcelog w/ software defined bank number. | ||
178 | */ | ||
179 | asmlinkage void mce_threshold_interrupt(void) | ||
180 | { | ||
181 | unsigned int bank, block; | ||
182 | struct mce m; | ||
183 | u32 low = 0, high = 0, address = 0; | ||
184 | |||
185 | ack_APIC_irq(); | ||
186 | exit_idle(); | ||
187 | irq_enter(); | ||
188 | |||
189 | memset(&m, 0, sizeof(m)); | ||
190 | rdtscll(m.tsc); | ||
191 | m.cpu = smp_processor_id(); | ||
192 | |||
193 | /* assume first bank caused it */ | ||
194 | for (bank = 0; bank < NR_BANKS; ++bank) { | ||
195 | if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) | ||
196 | continue; | ||
197 | for (block = 0; block < NR_BLOCKS; ++block) { | ||
198 | if (block == 0) | ||
199 | address = MSR_IA32_MC0_MISC + bank * 4; | ||
200 | else if (block == 1) { | ||
201 | address = (low & MASK_BLKPTR_LO) >> 21; | ||
202 | if (!address) | ||
203 | break; | ||
204 | address += MCG_XBLK_ADDR; | ||
205 | } | ||
206 | else | ||
207 | ++address; | ||
208 | |||
209 | if (rdmsr_safe(address, &low, &high)) | ||
210 | break; | ||
211 | |||
212 | if (!(high & MASK_VALID_HI)) { | ||
213 | if (block) | ||
214 | continue; | ||
215 | else | ||
216 | break; | ||
217 | } | ||
218 | |||
219 | if (!(high & MASK_CNTP_HI) || | ||
220 | (high & MASK_LOCKED_HI)) | ||
221 | continue; | ||
222 | |||
223 | /* Log the machine check that caused the threshold | ||
224 | event. */ | ||
225 | do_machine_check(NULL, 0); | ||
226 | |||
227 | if (high & MASK_OVERFLOW_HI) { | ||
228 | rdmsrl(address, m.misc); | ||
229 | rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, | ||
230 | m.status); | ||
231 | m.bank = K8_MCE_THRESHOLD_BASE | ||
232 | + bank * NR_BLOCKS | ||
233 | + block; | ||
234 | mce_log(&m); | ||
235 | goto out; | ||
236 | } | ||
237 | } | ||
238 | } | ||
239 | out: | ||
240 | irq_exit(); | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * Sysfs Interface | ||
245 | */ | ||
246 | |||
247 | struct threshold_attr { | ||
248 | struct attribute attr; | ||
249 | ssize_t(*show) (struct threshold_block *, char *); | ||
250 | ssize_t(*store) (struct threshold_block *, const char *, size_t count); | ||
251 | }; | ||
252 | |||
253 | static cpumask_t affinity_set(unsigned int cpu) | ||
254 | { | ||
255 | cpumask_t oldmask = current->cpus_allowed; | ||
256 | cpumask_t newmask = CPU_MASK_NONE; | ||
257 | cpu_set(cpu, newmask); | ||
258 | set_cpus_allowed(current, newmask); | ||
259 | return oldmask; | ||
260 | } | ||
261 | |||
262 | static void affinity_restore(cpumask_t oldmask) | ||
263 | { | ||
264 | set_cpus_allowed(current, oldmask); | ||
265 | } | ||
266 | |||
267 | #define SHOW_FIELDS(name) \ | ||
268 | static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ | ||
269 | { \ | ||
270 | return sprintf(buf, "%lx\n", (unsigned long) b->name); \ | ||
271 | } | ||
272 | SHOW_FIELDS(interrupt_enable) | ||
273 | SHOW_FIELDS(threshold_limit) | ||
274 | |||
275 | static ssize_t store_interrupt_enable(struct threshold_block *b, | ||
276 | const char *buf, size_t count) | ||
277 | { | ||
278 | char *end; | ||
279 | cpumask_t oldmask; | ||
280 | unsigned long new = simple_strtoul(buf, &end, 0); | ||
281 | if (end == buf) | ||
282 | return -EINVAL; | ||
283 | b->interrupt_enable = !!new; | ||
284 | |||
285 | oldmask = affinity_set(b->cpu); | ||
286 | threshold_restart_bank(b, 0, 0); | ||
287 | affinity_restore(oldmask); | ||
288 | |||
289 | return end - buf; | ||
290 | } | ||
291 | |||
292 | static ssize_t store_threshold_limit(struct threshold_block *b, | ||
293 | const char *buf, size_t count) | ||
294 | { | ||
295 | char *end; | ||
296 | cpumask_t oldmask; | ||
297 | u16 old; | ||
298 | unsigned long new = simple_strtoul(buf, &end, 0); | ||
299 | if (end == buf) | ||
300 | return -EINVAL; | ||
301 | if (new > THRESHOLD_MAX) | ||
302 | new = THRESHOLD_MAX; | ||
303 | if (new < 1) | ||
304 | new = 1; | ||
305 | old = b->threshold_limit; | ||
306 | b->threshold_limit = new; | ||
307 | |||
308 | oldmask = affinity_set(b->cpu); | ||
309 | threshold_restart_bank(b, 0, old); | ||
310 | affinity_restore(oldmask); | ||
311 | |||
312 | return end - buf; | ||
313 | } | ||
314 | |||
315 | static ssize_t show_error_count(struct threshold_block *b, char *buf) | ||
316 | { | ||
317 | u32 high, low; | ||
318 | cpumask_t oldmask; | ||
319 | oldmask = affinity_set(b->cpu); | ||
320 | rdmsr(b->address, low, high); | ||
321 | affinity_restore(oldmask); | ||
322 | return sprintf(buf, "%x\n", | ||
323 | (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); | ||
324 | } | ||
325 | |||
326 | static ssize_t store_error_count(struct threshold_block *b, | ||
327 | const char *buf, size_t count) | ||
328 | { | ||
329 | cpumask_t oldmask; | ||
330 | oldmask = affinity_set(b->cpu); | ||
331 | threshold_restart_bank(b, 1, 0); | ||
332 | affinity_restore(oldmask); | ||
333 | return 1; | ||
334 | } | ||
335 | |||
336 | #define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ | ||
337 | .attr = {.name = __stringify(_name), .mode = _mode }, \ | ||
338 | .show = _show, \ | ||
339 | .store = _store, \ | ||
340 | }; | ||
341 | |||
342 | #define RW_ATTR(name) \ | ||
343 | static struct threshold_attr name = \ | ||
344 | THRESHOLD_ATTR(name, 0644, show_## name, store_## name) | ||
345 | |||
346 | RW_ATTR(interrupt_enable); | ||
347 | RW_ATTR(threshold_limit); | ||
348 | RW_ATTR(error_count); | ||
349 | |||
350 | static struct attribute *default_attrs[] = { | ||
351 | &interrupt_enable.attr, | ||
352 | &threshold_limit.attr, | ||
353 | &error_count.attr, | ||
354 | NULL | ||
355 | }; | ||
356 | |||
357 | #define to_block(k) container_of(k, struct threshold_block, kobj) | ||
358 | #define to_attr(a) container_of(a, struct threshold_attr, attr) | ||
359 | |||
360 | static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) | ||
361 | { | ||
362 | struct threshold_block *b = to_block(kobj); | ||
363 | struct threshold_attr *a = to_attr(attr); | ||
364 | ssize_t ret; | ||
365 | ret = a->show ? a->show(b, buf) : -EIO; | ||
366 | return ret; | ||
367 | } | ||
368 | |||
369 | static ssize_t store(struct kobject *kobj, struct attribute *attr, | ||
370 | const char *buf, size_t count) | ||
371 | { | ||
372 | struct threshold_block *b = to_block(kobj); | ||
373 | struct threshold_attr *a = to_attr(attr); | ||
374 | ssize_t ret; | ||
375 | ret = a->store ? a->store(b, buf, count) : -EIO; | ||
376 | return ret; | ||
377 | } | ||
378 | |||
379 | static struct sysfs_ops threshold_ops = { | ||
380 | .show = show, | ||
381 | .store = store, | ||
382 | }; | ||
383 | |||
384 | static struct kobj_type threshold_ktype = { | ||
385 | .sysfs_ops = &threshold_ops, | ||
386 | .default_attrs = default_attrs, | ||
387 | }; | ||
388 | |||
389 | static __cpuinit int allocate_threshold_blocks(unsigned int cpu, | ||
390 | unsigned int bank, | ||
391 | unsigned int block, | ||
392 | u32 address) | ||
393 | { | ||
394 | int err; | ||
395 | u32 low, high; | ||
396 | struct threshold_block *b = NULL; | ||
397 | |||
398 | if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) | ||
399 | return 0; | ||
400 | |||
401 | if (rdmsr_safe(address, &low, &high)) | ||
402 | return 0; | ||
403 | |||
404 | if (!(high & MASK_VALID_HI)) { | ||
405 | if (block) | ||
406 | goto recurse; | ||
407 | else | ||
408 | return 0; | ||
409 | } | ||
410 | |||
411 | if (!(high & MASK_CNTP_HI) || | ||
412 | (high & MASK_LOCKED_HI)) | ||
413 | goto recurse; | ||
414 | |||
415 | b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); | ||
416 | if (!b) | ||
417 | return -ENOMEM; | ||
418 | |||
419 | b->block = block; | ||
420 | b->bank = bank; | ||
421 | b->cpu = cpu; | ||
422 | b->address = address; | ||
423 | b->interrupt_enable = 0; | ||
424 | b->threshold_limit = THRESHOLD_MAX; | ||
425 | |||
426 | INIT_LIST_HEAD(&b->miscj); | ||
427 | |||
428 | if (per_cpu(threshold_banks, cpu)[bank]->blocks) | ||
429 | list_add(&b->miscj, | ||
430 | &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); | ||
431 | else | ||
432 | per_cpu(threshold_banks, cpu)[bank]->blocks = b; | ||
433 | |||
434 | kobject_set_name(&b->kobj, "misc%i", block); | ||
435 | b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj; | ||
436 | b->kobj.ktype = &threshold_ktype; | ||
437 | err = kobject_register(&b->kobj); | ||
438 | if (err) | ||
439 | goto out_free; | ||
440 | recurse: | ||
441 | if (!block) { | ||
442 | address = (low & MASK_BLKPTR_LO) >> 21; | ||
443 | if (!address) | ||
444 | return 0; | ||
445 | address += MCG_XBLK_ADDR; | ||
446 | } else | ||
447 | ++address; | ||
448 | |||
449 | err = allocate_threshold_blocks(cpu, bank, ++block, address); | ||
450 | if (err) | ||
451 | goto out_free; | ||
452 | |||
453 | return err; | ||
454 | |||
455 | out_free: | ||
456 | if (b) { | ||
457 | kobject_unregister(&b->kobj); | ||
458 | kfree(b); | ||
459 | } | ||
460 | return err; | ||
461 | } | ||
462 | |||
463 | /* Symlink shared banks of sibling cores to the first core; the first core owns the sysfs dir/files. */ | ||
464 | static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | ||
465 | { | ||
466 | int i, err = 0; | ||
467 | struct threshold_bank *b = NULL; | ||
468 | cpumask_t oldmask = CPU_MASK_NONE; | ||
469 | char name[32]; | ||
470 | |||
471 | sprintf(name, "threshold_bank%i", bank); | ||
472 | |||
473 | #ifdef CONFIG_SMP | ||
474 | if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) { /* symlink */ | ||
475 | i = first_cpu(cpu_core_map[cpu]); | ||
476 | |||
477 | /* first core not up yet */ | ||
478 | if (cpu_data[i].cpu_core_id) | ||
479 | goto out; | ||
480 | |||
481 | /* already linked */ | ||
482 | if (per_cpu(threshold_banks, cpu)[bank]) | ||
483 | goto out; | ||
484 | |||
485 | b = per_cpu(threshold_banks, i)[bank]; | ||
486 | |||
487 | if (!b) | ||
488 | goto out; | ||
489 | |||
490 | err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, | ||
491 | &b->kobj, name); | ||
492 | if (err) | ||
493 | goto out; | ||
494 | |||
495 | b->cpus = cpu_core_map[cpu]; | ||
496 | per_cpu(threshold_banks, cpu)[bank] = b; | ||
497 | goto out; | ||
498 | } | ||
499 | #endif | ||
500 | |||
501 | b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); | ||
502 | if (!b) { | ||
503 | err = -ENOMEM; | ||
504 | goto out; | ||
505 | } | ||
506 | |||
507 | kobject_set_name(&b->kobj, "threshold_bank%i", bank); | ||
508 | b->kobj.parent = &per_cpu(device_mce, cpu).kobj; | ||
509 | #ifndef CONFIG_SMP | ||
510 | b->cpus = CPU_MASK_ALL; | ||
511 | #else | ||
512 | b->cpus = cpu_core_map[cpu]; | ||
513 | #endif | ||
514 | err = kobject_register(&b->kobj); | ||
515 | if (err) | ||
516 | goto out_free; | ||
517 | |||
518 | per_cpu(threshold_banks, cpu)[bank] = b; | ||
519 | |||
520 | oldmask = affinity_set(cpu); | ||
521 | err = allocate_threshold_blocks(cpu, bank, 0, | ||
522 | MSR_IA32_MC0_MISC + bank * 4); | ||
523 | affinity_restore(oldmask); | ||
524 | |||
525 | if (err) | ||
526 | goto out_free; | ||
527 | |||
528 | for_each_cpu_mask(i, b->cpus) { | ||
529 | if (i == cpu) | ||
530 | continue; | ||
531 | |||
532 | err = sysfs_create_link(&per_cpu(device_mce, i).kobj, | ||
533 | &b->kobj, name); | ||
534 | if (err) | ||
535 | goto out; | ||
536 | |||
537 | per_cpu(threshold_banks, i)[bank] = b; | ||
538 | } | ||
539 | |||
540 | goto out; | ||
541 | |||
542 | out_free: | ||
543 | per_cpu(threshold_banks, cpu)[bank] = NULL; | ||
544 | kfree(b); | ||
545 | out: | ||
546 | return err; | ||
547 | } | ||
548 | |||
549 | /* create dir/files for all valid threshold banks */ | ||
550 | static __cpuinit int threshold_create_device(unsigned int cpu) | ||
551 | { | ||
552 | unsigned int bank; | ||
553 | int err = 0; | ||
554 | |||
555 | for (bank = 0; bank < NR_BANKS; ++bank) { | ||
556 | if (!(per_cpu(bank_map, cpu) & 1 << bank)) | ||
557 | continue; | ||
558 | err = threshold_create_bank(cpu, bank); | ||
559 | if (err) | ||
560 | goto out; | ||
561 | } | ||
562 | out: | ||
563 | return err; | ||
564 | } | ||
565 | |||
566 | /* | ||
567 | * Let's be hotplug friendly. | ||
568 | * On multi-core processors, the first core always takes ownership of the | ||
569 | * shared sysfs dir/files, and the rest of the cores are symlinked to it. | ||
570 | */ | ||
571 | |||
572 | static void deallocate_threshold_block(unsigned int cpu, | ||
573 | unsigned int bank) | ||
574 | { | ||
575 | struct threshold_block *pos = NULL; | ||
576 | struct threshold_block *tmp = NULL; | ||
577 | struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; | ||
578 | |||
579 | if (!head) | ||
580 | return; | ||
581 | |||
582 | list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { | ||
583 | kobject_unregister(&pos->kobj); | ||
584 | list_del(&pos->miscj); | ||
585 | kfree(pos); | ||
586 | } | ||
587 | |||
588 | kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); | ||
589 | per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; | ||
590 | } | ||
591 | |||
592 | static void threshold_remove_bank(unsigned int cpu, int bank) | ||
593 | { | ||
594 | int i = 0; | ||
595 | struct threshold_bank *b; | ||
596 | char name[32]; | ||
597 | |||
598 | b = per_cpu(threshold_banks, cpu)[bank]; | ||
599 | |||
600 | if (!b) | ||
601 | return; | ||
602 | |||
603 | if (!b->blocks) | ||
604 | goto free_out; | ||
605 | |||
606 | sprintf(name, "threshold_bank%i", bank); | ||
607 | |||
608 | #ifdef CONFIG_SMP | ||
609 | /* sibling symlink */ | ||
610 | if (shared_bank[bank] && b->blocks->cpu != cpu) { | ||
611 | sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); | ||
612 | per_cpu(threshold_banks, cpu)[bank] = NULL; | ||
613 | return; | ||
614 | } | ||
615 | #endif | ||
616 | |||
617 | /* remove all sibling symlinks before unregistering */ | ||
618 | for_each_cpu_mask(i, b->cpus) { | ||
619 | if (i == cpu) | ||
620 | continue; | ||
621 | |||
622 | sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); | ||
623 | per_cpu(threshold_banks, i)[bank] = NULL; | ||
624 | } | ||
625 | |||
626 | deallocate_threshold_block(cpu, bank); | ||
627 | |||
628 | free_out: | ||
629 | kobject_unregister(&b->kobj); | ||
630 | kfree(b); | ||
631 | per_cpu(threshold_banks, cpu)[bank] = NULL; | ||
632 | } | ||
633 | |||
634 | static void threshold_remove_device(unsigned int cpu) | ||
635 | { | ||
636 | unsigned int bank; | ||
637 | |||
638 | for (bank = 0; bank < NR_BANKS; ++bank) { | ||
639 | if (!(per_cpu(bank_map, cpu) & 1 << bank)) | ||
640 | continue; | ||
641 | threshold_remove_bank(cpu, bank); | ||
642 | } | ||
643 | } | ||
644 | |||
645 | /* get notified when a cpu comes on/off */ | ||
646 | static int threshold_cpu_callback(struct notifier_block *nfb, | ||
647 | unsigned long action, void *hcpu) | ||
648 | { | ||
649 | /* cpu was unsigned int to begin with */ | ||
650 | unsigned int cpu = (unsigned long)hcpu; | ||
651 | |||
652 | if (cpu >= NR_CPUS) | ||
653 | goto out; | ||
654 | |||
655 | switch (action) { | ||
656 | case CPU_ONLINE: | ||
657 | case CPU_ONLINE_FROZEN: | ||
658 | threshold_create_device(cpu); | ||
659 | break; | ||
660 | case CPU_DEAD: | ||
661 | case CPU_DEAD_FROZEN: | ||
662 | threshold_remove_device(cpu); | ||
663 | break; | ||
664 | default: | ||
665 | break; | ||
666 | } | ||
667 | out: | ||
668 | return NOTIFY_OK; | ||
669 | } | ||
670 | |||
671 | static struct notifier_block threshold_cpu_notifier = { | ||
672 | .notifier_call = threshold_cpu_callback, | ||
673 | }; | ||
674 | |||
675 | static __init int threshold_init_device(void) | ||
676 | { | ||
677 | unsigned lcpu = 0; | ||
678 | |||
679 | /* handle CPUs that came online before the notifier was registered */ | ||
680 | for_each_online_cpu(lcpu) { | ||
681 | int err = threshold_create_device(lcpu); | ||
682 | if (err) | ||
683 | return err; | ||
684 | } | ||
685 | register_hotcpu_notifier(&threshold_cpu_notifier); | ||
686 | return 0; | ||
687 | } | ||
688 | |||
689 | device_initcall(threshold_init_device); | ||
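The sysfs error_count attribute above reports the running error count by subtracting the remaining head-room from the raw 12-bit counter field in the bank's MISC register. A minimal user-space sketch of that arithmetic follows; the MSR value is made up, and THRESHOLD_MAX is assumed to be the driver's 12-bit maximum of 0xFFF.

#include <stdio.h>
#include <stdint.h>

#define THRESHOLD_MAX 0xFFF	/* 12-bit counter maximum, as used by the driver */

/* error count = raw counter minus the head-room left before the interrupt fires */
static unsigned int error_count(uint32_t misc_high, unsigned int threshold_limit)
{
	return (misc_high & 0xFFF) - (THRESHOLD_MAX - threshold_limit);
}

int main(void)
{
	uint32_t sample_high = 0xFFA;	/* hypothetical rdmsr() high word */
	unsigned int limit = 10;	/* hypothetical threshold_limit */

	printf("errors seen so far: %u\n", error_count(sample_high, limit));
	return 0;
}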
diff --git a/arch/x86/kernel/mce_intel_64.c b/arch/x86/kernel/mce_intel_64.c new file mode 100644 index 000000000000..6551505d8a2c --- /dev/null +++ b/arch/x86/kernel/mce_intel_64.c | |||
@@ -0,0 +1,89 @@ | |||
1 | /* | ||
2 | * Intel specific MCE features. | ||
3 | * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> | ||
4 | */ | ||
5 | |||
6 | #include <linux/init.h> | ||
7 | #include <linux/interrupt.h> | ||
8 | #include <linux/percpu.h> | ||
9 | #include <asm/processor.h> | ||
10 | #include <asm/msr.h> | ||
11 | #include <asm/mce.h> | ||
12 | #include <asm/hw_irq.h> | ||
13 | #include <asm/idle.h> | ||
14 | #include <asm/therm_throt.h> | ||
15 | |||
16 | asmlinkage void smp_thermal_interrupt(void) | ||
17 | { | ||
18 | __u64 msr_val; | ||
19 | |||
20 | ack_APIC_irq(); | ||
21 | |||
22 | exit_idle(); | ||
23 | irq_enter(); | ||
24 | |||
25 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); | ||
26 | if (therm_throt_process(msr_val & 1)) | ||
27 | mce_log_therm_throt_event(smp_processor_id(), msr_val); | ||
28 | |||
29 | irq_exit(); | ||
30 | } | ||
31 | |||
32 | static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c) | ||
33 | { | ||
34 | u32 l, h; | ||
35 | int tm2 = 0; | ||
36 | unsigned int cpu = smp_processor_id(); | ||
37 | |||
38 | if (!cpu_has(c, X86_FEATURE_ACPI)) | ||
39 | return; | ||
40 | |||
41 | if (!cpu_has(c, X86_FEATURE_ACC)) | ||
42 | return; | ||
43 | |||
44 | /* First check whether TM1 is already enabled by the BIOS; in that | ||
45 | * case there may be SMM code handling it, so we must not install a | ||
46 | * handler, since the interrupt may already be delivered via SMI. | ||
47 | */ | ||
48 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
49 | h = apic_read(APIC_LVTTHMR); | ||
50 | if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { | ||
51 | printk(KERN_DEBUG | ||
52 | "CPU%d: Thermal monitoring handled by SMI\n", cpu); | ||
53 | return; | ||
54 | } | ||
55 | |||
56 | if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) | ||
57 | tm2 = 1; | ||
58 | |||
59 | if (h & APIC_VECTOR_MASK) { | ||
60 | printk(KERN_DEBUG | ||
61 | "CPU%d: Thermal LVT vector (%#x) already " | ||
62 | "installed\n", cpu, (h & APIC_VECTOR_MASK)); | ||
63 | return; | ||
64 | } | ||
65 | |||
66 | h = THERMAL_APIC_VECTOR; | ||
67 | h |= (APIC_DM_FIXED | APIC_LVT_MASKED); | ||
68 | apic_write(APIC_LVTTHMR, h); | ||
69 | |||
70 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); | ||
71 | wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); | ||
72 | |||
73 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
74 | wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); | ||
75 | |||
76 | l = apic_read(APIC_LVTTHMR); | ||
77 | apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); | ||
78 | printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", | ||
79 | cpu, tm2 ? "TM2" : "TM1"); | ||
80 | |||
81 | /* enable thermal throttle processing */ | ||
82 | atomic_set(&therm_throt_en, 1); | ||
83 | return; | ||
84 | } | ||
85 | |||
86 | void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c) | ||
87 | { | ||
88 | intel_init_thermal(c); | ||
89 | } | ||
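intel_init_thermal() above decides between reporting TM1 and TM2 by testing bits 3 and 13 of MSR_IA32_MISC_ENABLE, and only installs its own LVT handler when the BIOS has not already routed thermal events to SMM. A small user-space sketch of just the bit decoding follows; the sample MSR value is hypothetical and the enable sequence itself is not reproduced.

#include <stdio.h>
#include <stdint.h>

/* bit 3: thermal monitoring (TM1) enable, bit 13: TM2 enable */
static const char *thermal_mode(uint32_t misc_enable_lo, int has_tm2_feature)
{
	if (!(misc_enable_lo & (1u << 3)))
		return "not enabled";
	if (has_tm2_feature && (misc_enable_lo & (1u << 13)))
		return "TM2";
	return "TM1";
}

int main(void)
{
	uint32_t sample = (1u << 3) | (1u << 13);	/* hypothetical rdmsr() low word */

	printf("thermal monitoring: %s\n", thermal_mode(sample, 1));
	return 0;
}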
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c new file mode 100644 index 000000000000..a888e67f5874 --- /dev/null +++ b/arch/x86/kernel/module_64.c | |||
@@ -0,0 +1,185 @@ | |||
1 | /* Kernel module help for x86-64 | ||
2 | Copyright (C) 2001 Rusty Russell. | ||
3 | Copyright (C) 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | |||
5 | This program is free software; you can redistribute it and/or modify | ||
6 | it under the terms of the GNU General Public License as published by | ||
7 | the Free Software Foundation; either version 2 of the License, or | ||
8 | (at your option) any later version. | ||
9 | |||
10 | This program is distributed in the hope that it will be useful, | ||
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | GNU General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU General Public License | ||
16 | along with this program; if not, write to the Free Software | ||
17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | #include <linux/moduleloader.h> | ||
20 | #include <linux/elf.h> | ||
21 | #include <linux/vmalloc.h> | ||
22 | #include <linux/fs.h> | ||
23 | #include <linux/string.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/slab.h> | ||
26 | #include <linux/bug.h> | ||
27 | |||
28 | #include <asm/system.h> | ||
29 | #include <asm/page.h> | ||
30 | #include <asm/pgtable.h> | ||
31 | |||
32 | #define DEBUGP(fmt...) | ||
33 | |||
34 | #ifndef CONFIG_UML | ||
35 | void module_free(struct module *mod, void *module_region) | ||
36 | { | ||
37 | vfree(module_region); | ||
38 | /* FIXME: If module_region == mod->init_region, trim exception | ||
39 | table entries. */ | ||
40 | } | ||
41 | |||
42 | void *module_alloc(unsigned long size) | ||
43 | { | ||
44 | struct vm_struct *area; | ||
45 | |||
46 | if (!size) | ||
47 | return NULL; | ||
48 | size = PAGE_ALIGN(size); | ||
49 | if (size > MODULES_LEN) | ||
50 | return NULL; | ||
51 | |||
52 | area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); | ||
53 | if (!area) | ||
54 | return NULL; | ||
55 | |||
56 | return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); | ||
57 | } | ||
58 | #endif | ||
59 | |||
60 | /* We don't need anything special. */ | ||
61 | int module_frob_arch_sections(Elf_Ehdr *hdr, | ||
62 | Elf_Shdr *sechdrs, | ||
63 | char *secstrings, | ||
64 | struct module *mod) | ||
65 | { | ||
66 | return 0; | ||
67 | } | ||
68 | |||
69 | int apply_relocate_add(Elf64_Shdr *sechdrs, | ||
70 | const char *strtab, | ||
71 | unsigned int symindex, | ||
72 | unsigned int relsec, | ||
73 | struct module *me) | ||
74 | { | ||
75 | unsigned int i; | ||
76 | Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; | ||
77 | Elf64_Sym *sym; | ||
78 | void *loc; | ||
79 | u64 val; | ||
80 | |||
81 | DEBUGP("Applying relocate section %u to %u\n", relsec, | ||
82 | sechdrs[relsec].sh_info); | ||
83 | for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { | ||
84 | /* This is where to make the change */ | ||
85 | loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr | ||
86 | + rel[i].r_offset; | ||
87 | |||
88 | /* This is the symbol it is referring to. Note that all | ||
89 | undefined symbols have been resolved. */ | ||
90 | sym = (Elf64_Sym *)sechdrs[symindex].sh_addr | ||
91 | + ELF64_R_SYM(rel[i].r_info); | ||
92 | |||
93 | DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", | ||
94 | (int)ELF64_R_TYPE(rel[i].r_info), | ||
95 | sym->st_value, rel[i].r_addend, (u64)loc); | ||
96 | |||
97 | val = sym->st_value + rel[i].r_addend; | ||
98 | |||
99 | switch (ELF64_R_TYPE(rel[i].r_info)) { | ||
100 | case R_X86_64_NONE: | ||
101 | break; | ||
102 | case R_X86_64_64: | ||
103 | *(u64 *)loc = val; | ||
104 | break; | ||
105 | case R_X86_64_32: | ||
106 | *(u32 *)loc = val; | ||
107 | if (val != *(u32 *)loc) | ||
108 | goto overflow; | ||
109 | break; | ||
110 | case R_X86_64_32S: | ||
111 | *(s32 *)loc = val; | ||
112 | if ((s64)val != *(s32 *)loc) | ||
113 | goto overflow; | ||
114 | break; | ||
115 | case R_X86_64_PC32: | ||
116 | val -= (u64)loc; | ||
117 | *(u32 *)loc = val; | ||
118 | #if 0 | ||
119 | if ((s64)val != *(s32 *)loc) | ||
120 | goto overflow; | ||
121 | #endif | ||
122 | break; | ||
123 | default: | ||
124 | printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n", | ||
125 | me->name, ELF64_R_TYPE(rel[i].r_info)); | ||
126 | return -ENOEXEC; | ||
127 | } | ||
128 | } | ||
129 | return 0; | ||
130 | |||
131 | overflow: | ||
132 | printk(KERN_ERR "overflow in relocation type %d val %Lx\n", | ||
133 | (int)ELF64_R_TYPE(rel[i].r_info), val); | ||
134 | printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", | ||
135 | me->name); | ||
136 | return -ENOEXEC; | ||
137 | } | ||
138 | |||
139 | int apply_relocate(Elf_Shdr *sechdrs, | ||
140 | const char *strtab, | ||
141 | unsigned int symindex, | ||
142 | unsigned int relsec, | ||
143 | struct module *me) | ||
144 | { | ||
145 | printk("non-add relocation not supported\n"); | ||
146 | return -ENOSYS; | ||
147 | } | ||
148 | |||
149 | int module_finalize(const Elf_Ehdr *hdr, | ||
150 | const Elf_Shdr *sechdrs, | ||
151 | struct module *me) | ||
152 | { | ||
153 | const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; | ||
154 | char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; | ||
155 | |||
156 | for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { | ||
157 | if (!strcmp(".text", secstrings + s->sh_name)) | ||
158 | text = s; | ||
159 | if (!strcmp(".altinstructions", secstrings + s->sh_name)) | ||
160 | alt = s; | ||
161 | if (!strcmp(".smp_locks", secstrings + s->sh_name)) | ||
162 | locks = s; | ||
163 | } | ||
164 | |||
165 | if (alt) { | ||
166 | /* patch .altinstructions */ | ||
167 | void *aseg = (void *)alt->sh_addr; | ||
168 | apply_alternatives(aseg, aseg + alt->sh_size); | ||
169 | } | ||
170 | if (locks && text) { | ||
171 | void *lseg = (void *)locks->sh_addr; | ||
172 | void *tseg = (void *)text->sh_addr; | ||
173 | alternatives_smp_module_add(me, me->name, | ||
174 | lseg, lseg + locks->sh_size, | ||
175 | tseg, tseg + text->sh_size); | ||
176 | } | ||
177 | |||
178 | return module_bug_finalize(hdr, sechdrs, me); | ||
179 | } | ||
180 | |||
181 | void module_arch_cleanup(struct module *mod) | ||
182 | { | ||
183 | alternatives_smp_module_del(mod); | ||
184 | module_bug_cleanup(mod); | ||
185 | } | ||
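apply_relocate_add() above computes each relocation value as S + A (symbol value plus addend) and, for R_X86_64_PC32, subtracts the patch location P; the R_X86_64_32S case additionally checks that the value survives sign-extending truncation to 32 bits, which -mcmodel=kernel guarantees for kernel addresses. A stand-alone sketch of that arithmetic follows; all addresses are made up for illustration.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sym_value = 0xffffffff81000000ull;	/* hypothetical S (st_value) */
	int64_t  addend    = -4;			/* hypothetical A (r_addend) */
	uint64_t loc       = 0xffffffffa0000123ull;	/* hypothetical P (patch site) */

	uint64_t val  = sym_value + addend;		/* R_X86_64_64 stores this */
	uint32_t pc32 = (uint32_t)(val - loc);		/* R_X86_64_PC32 stores this */

	/* R_X86_64_32S is valid only if sign-extending truncation is lossless */
	int fits_32s = ((int64_t)(int32_t)val == (int64_t)val);

	printf("S + A    = %#llx\n", (unsigned long long)val);
	printf("PC32     = %#x\n", pc32);
	printf("32S fits : %s\n", fits_32s ? "yes" : "no");
	return 0;
}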
diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c new file mode 100644 index 000000000000..8bf0ca03ac8e --- /dev/null +++ b/arch/x86/kernel/mpparse_64.c | |||
@@ -0,0 +1,852 @@ | |||
1 | /* | ||
2 | * Intel Multiprocessor Specification 1.1 and 1.4 | ||
3 | * compliant MP-table parsing routines. | ||
4 | * | ||
5 | * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | ||
6 | * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> | ||
7 | * | ||
8 | * Fixes | ||
9 | * Erich Boleyn : MP v1.4 and additional changes. | ||
10 | * Alan Cox : Added EBDA scanning | ||
11 | * Ingo Molnar : various cleanups and rewrites | ||
12 | * Maciej W. Rozycki: Bits for default MP configurations | ||
13 | * Paul Diefenbaugh: Added full ACPI support | ||
14 | */ | ||
15 | |||
16 | #include <linux/mm.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <linux/delay.h> | ||
19 | #include <linux/bootmem.h> | ||
20 | #include <linux/kernel_stat.h> | ||
21 | #include <linux/mc146818rtc.h> | ||
22 | #include <linux/acpi.h> | ||
23 | #include <linux/module.h> | ||
24 | |||
25 | #include <asm/smp.h> | ||
26 | #include <asm/mtrr.h> | ||
27 | #include <asm/mpspec.h> | ||
28 | #include <asm/pgalloc.h> | ||
29 | #include <asm/io_apic.h> | ||
30 | #include <asm/proto.h> | ||
31 | #include <asm/acpi.h> | ||
32 | |||
33 | /* Have we found an MP table */ | ||
34 | int smp_found_config; | ||
35 | |||
36 | /* | ||
37 | * Various Linux-internal data structures created from the | ||
38 | * MP-table. | ||
39 | */ | ||
40 | DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); | ||
41 | int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; | ||
42 | |||
43 | static int mp_current_pci_id = 0; | ||
44 | /* I/O APIC entries */ | ||
45 | struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; | ||
46 | |||
47 | /* # of MP IRQ source entries */ | ||
48 | struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; | ||
49 | |||
50 | /* MP IRQ source entries */ | ||
51 | int mp_irq_entries; | ||
52 | |||
53 | int nr_ioapics; | ||
54 | unsigned long mp_lapic_addr = 0; | ||
55 | |||
56 | |||
57 | |||
58 | /* Processor that is doing the boot up */ | ||
59 | unsigned int boot_cpu_id = -1U; | ||
60 | /* Internal processor count */ | ||
61 | unsigned int num_processors __cpuinitdata = 0; | ||
62 | |||
63 | unsigned disabled_cpus __cpuinitdata; | ||
64 | |||
65 | /* Bitmask of physically existing CPUs */ | ||
66 | physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; | ||
67 | |||
68 | u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
69 | |||
70 | |||
71 | /* | ||
72 | * Intel MP BIOS table parsing routines: | ||
73 | */ | ||
74 | |||
75 | /* | ||
76 | * Checksum an MP configuration block. | ||
77 | */ | ||
78 | |||
79 | static int __init mpf_checksum(unsigned char *mp, int len) | ||
80 | { | ||
81 | int sum = 0; | ||
82 | |||
83 | while (len--) | ||
84 | sum += *mp++; | ||
85 | |||
86 | return sum & 0xFF; | ||
87 | } | ||
88 | |||
89 | static void __cpuinit MP_processor_info (struct mpc_config_processor *m) | ||
90 | { | ||
91 | int cpu; | ||
92 | cpumask_t tmp_map; | ||
93 | char *bootup_cpu = ""; | ||
94 | |||
95 | if (!(m->mpc_cpuflag & CPU_ENABLED)) { | ||
96 | disabled_cpus++; | ||
97 | return; | ||
98 | } | ||
99 | if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { | ||
100 | bootup_cpu = " (Bootup-CPU)"; | ||
101 | boot_cpu_id = m->mpc_apicid; | ||
102 | } | ||
103 | |||
104 | printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu); | ||
105 | |||
106 | if (num_processors >= NR_CPUS) { | ||
107 | printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." | ||
108 | " Processor ignored.\n", NR_CPUS); | ||
109 | return; | ||
110 | } | ||
111 | |||
112 | num_processors++; | ||
113 | cpus_complement(tmp_map, cpu_present_map); | ||
114 | cpu = first_cpu(tmp_map); | ||
115 | |||
116 | physid_set(m->mpc_apicid, phys_cpu_present_map); | ||
117 | if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { | ||
118 | /* | ||
119 | * bios_cpu_apicid is required to have processors listed | ||
120 | * in the same order as logical cpu numbers. Hence the first | ||
121 | * entry is the BSP, and so on. | ||
122 | */ | ||
123 | cpu = 0; | ||
124 | } | ||
125 | bios_cpu_apicid[cpu] = m->mpc_apicid; | ||
126 | x86_cpu_to_apicid[cpu] = m->mpc_apicid; | ||
127 | |||
128 | cpu_set(cpu, cpu_possible_map); | ||
129 | cpu_set(cpu, cpu_present_map); | ||
130 | } | ||
131 | |||
132 | static void __init MP_bus_info (struct mpc_config_bus *m) | ||
133 | { | ||
134 | char str[7]; | ||
135 | |||
136 | memcpy(str, m->mpc_bustype, 6); | ||
137 | str[6] = 0; | ||
138 | Dprintk("Bus #%d is %s\n", m->mpc_busid, str); | ||
139 | |||
140 | if (strncmp(str, "ISA", 3) == 0) { | ||
141 | set_bit(m->mpc_busid, mp_bus_not_pci); | ||
142 | } else if (strncmp(str, "PCI", 3) == 0) { | ||
143 | clear_bit(m->mpc_busid, mp_bus_not_pci); | ||
144 | mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; | ||
145 | mp_current_pci_id++; | ||
146 | } else { | ||
147 | printk(KERN_ERR "Unknown bustype %s\n", str); | ||
148 | } | ||
149 | } | ||
150 | |||
151 | static int bad_ioapic(unsigned long address) | ||
152 | { | ||
153 | if (nr_ioapics >= MAX_IO_APICS) { | ||
154 | printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " | ||
155 | "(found %d)\n", MAX_IO_APICS, nr_ioapics); | ||
156 | panic("Recompile kernel with bigger MAX_IO_APICS!\n"); | ||
157 | } | ||
158 | if (!address) { | ||
159 | printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" | ||
160 | " found in table, skipping!\n"); | ||
161 | return 1; | ||
162 | } | ||
163 | return 0; | ||
164 | } | ||
165 | |||
166 | static void __init MP_ioapic_info (struct mpc_config_ioapic *m) | ||
167 | { | ||
168 | if (!(m->mpc_flags & MPC_APIC_USABLE)) | ||
169 | return; | ||
170 | |||
171 | printk("I/O APIC #%d at 0x%X.\n", | ||
172 | m->mpc_apicid, m->mpc_apicaddr); | ||
173 | |||
174 | if (bad_ioapic(m->mpc_apicaddr)) | ||
175 | return; | ||
176 | |||
177 | mp_ioapics[nr_ioapics] = *m; | ||
178 | nr_ioapics++; | ||
179 | } | ||
180 | |||
181 | static void __init MP_intsrc_info (struct mpc_config_intsrc *m) | ||
182 | { | ||
183 | mp_irqs [mp_irq_entries] = *m; | ||
184 | Dprintk("Int: type %d, pol %d, trig %d, bus %d," | ||
185 | " IRQ %02x, APIC ID %x, APIC INT %02x\n", | ||
186 | m->mpc_irqtype, m->mpc_irqflag & 3, | ||
187 | (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, | ||
188 | m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); | ||
189 | if (++mp_irq_entries >= MAX_IRQ_SOURCES) | ||
190 | panic("Max # of irq sources exceeded!!\n"); | ||
191 | } | ||
192 | |||
193 | static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) | ||
194 | { | ||
195 | Dprintk("Lint: type %d, pol %d, trig %d, bus %d," | ||
196 | " IRQ %02x, APIC ID %x, APIC LINT %02x\n", | ||
197 | m->mpc_irqtype, m->mpc_irqflag & 3, | ||
198 | (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, | ||
199 | m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); | ||
200 | } | ||
201 | |||
202 | /* | ||
203 | * Read/parse the MPC | ||
204 | */ | ||
205 | |||
206 | static int __init smp_read_mpc(struct mp_config_table *mpc) | ||
207 | { | ||
208 | char str[16]; | ||
209 | int count=sizeof(*mpc); | ||
210 | unsigned char *mpt=((unsigned char *)mpc)+count; | ||
211 | |||
212 | if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { | ||
213 | printk("MPTABLE: bad signature [%c%c%c%c]!\n", | ||
214 | mpc->mpc_signature[0], | ||
215 | mpc->mpc_signature[1], | ||
216 | mpc->mpc_signature[2], | ||
217 | mpc->mpc_signature[3]); | ||
218 | return 0; | ||
219 | } | ||
220 | if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { | ||
221 | printk("MPTABLE: checksum error!\n"); | ||
222 | return 0; | ||
223 | } | ||
224 | if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { | ||
225 | printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", | ||
226 | mpc->mpc_spec); | ||
227 | return 0; | ||
228 | } | ||
229 | if (!mpc->mpc_lapic) { | ||
230 | printk(KERN_ERR "MPTABLE: null local APIC address!\n"); | ||
231 | return 0; | ||
232 | } | ||
233 | memcpy(str,mpc->mpc_oem,8); | ||
234 | str[8] = 0; | ||
235 | printk(KERN_INFO "MPTABLE: OEM ID: %s ",str); | ||
236 | |||
237 | memcpy(str,mpc->mpc_productid,12); | ||
238 | str[12] = 0; | ||
239 | printk("MPTABLE: Product ID: %s ",str); | ||
240 | |||
241 | printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic); | ||
242 | |||
243 | /* save the local APIC address, it might be non-default */ | ||
244 | if (!acpi_lapic) | ||
245 | mp_lapic_addr = mpc->mpc_lapic; | ||
246 | |||
247 | /* | ||
248 | * Now process the configuration blocks. | ||
249 | */ | ||
250 | while (count < mpc->mpc_length) { | ||
251 | switch(*mpt) { | ||
252 | case MP_PROCESSOR: | ||
253 | { | ||
254 | struct mpc_config_processor *m= | ||
255 | (struct mpc_config_processor *)mpt; | ||
256 | if (!acpi_lapic) | ||
257 | MP_processor_info(m); | ||
258 | mpt += sizeof(*m); | ||
259 | count += sizeof(*m); | ||
260 | break; | ||
261 | } | ||
262 | case MP_BUS: | ||
263 | { | ||
264 | struct mpc_config_bus *m= | ||
265 | (struct mpc_config_bus *)mpt; | ||
266 | MP_bus_info(m); | ||
267 | mpt += sizeof(*m); | ||
268 | count += sizeof(*m); | ||
269 | break; | ||
270 | } | ||
271 | case MP_IOAPIC: | ||
272 | { | ||
273 | struct mpc_config_ioapic *m= | ||
274 | (struct mpc_config_ioapic *)mpt; | ||
275 | MP_ioapic_info(m); | ||
276 | mpt += sizeof(*m); | ||
277 | count += sizeof(*m); | ||
278 | break; | ||
279 | } | ||
280 | case MP_INTSRC: | ||
281 | { | ||
282 | struct mpc_config_intsrc *m= | ||
283 | (struct mpc_config_intsrc *)mpt; | ||
284 | |||
285 | MP_intsrc_info(m); | ||
286 | mpt += sizeof(*m); | ||
287 | count += sizeof(*m); | ||
288 | break; | ||
289 | } | ||
290 | case MP_LINTSRC: | ||
291 | { | ||
292 | struct mpc_config_lintsrc *m= | ||
293 | (struct mpc_config_lintsrc *)mpt; | ||
294 | MP_lintsrc_info(m); | ||
295 | mpt += sizeof(*m); | ||
296 | count += sizeof(*m); | ||
297 | break; | ||
298 | } | ||
299 | } | ||
300 | } | ||
301 | setup_apic_routing(); | ||
302 | if (!num_processors) | ||
303 | printk(KERN_ERR "MPTABLE: no processors registered!\n"); | ||
304 | return num_processors; | ||
305 | } | ||
306 | |||
307 | static int __init ELCR_trigger(unsigned int irq) | ||
308 | { | ||
309 | unsigned int port; | ||
310 | |||
311 | port = 0x4d0 + (irq >> 3); | ||
312 | return (inb(port) >> (irq & 7)) & 1; | ||
313 | } | ||
314 | |||
315 | static void __init construct_default_ioirq_mptable(int mpc_default_type) | ||
316 | { | ||
317 | struct mpc_config_intsrc intsrc; | ||
318 | int i; | ||
319 | int ELCR_fallback = 0; | ||
320 | |||
321 | intsrc.mpc_type = MP_INTSRC; | ||
322 | intsrc.mpc_irqflag = 0; /* conforming */ | ||
323 | intsrc.mpc_srcbus = 0; | ||
324 | intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; | ||
325 | |||
326 | intsrc.mpc_irqtype = mp_INT; | ||
327 | |||
328 | /* | ||
329 | * If true, we have an ISA/PCI system with no IRQ entries | ||
330 | * in the MP table. To prevent the PCI interrupts from being set up | ||
331 | * incorrectly, we try to use the ELCR. The sanity check to see if | ||
332 | * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can | ||
333 | * never be level sensitive, so we simply see if the ELCR agrees. | ||
334 | * If it does, we assume it's valid. | ||
335 | */ | ||
336 | if (mpc_default_type == 5) { | ||
337 | printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); | ||
338 | |||
339 | if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) | ||
340 | printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n"); | ||
341 | else { | ||
342 | printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); | ||
343 | ELCR_fallback = 1; | ||
344 | } | ||
345 | } | ||
346 | |||
347 | for (i = 0; i < 16; i++) { | ||
348 | switch (mpc_default_type) { | ||
349 | case 2: | ||
350 | if (i == 0 || i == 13) | ||
351 | continue; /* IRQ0 & IRQ13 not connected */ | ||
352 | /* fall through */ | ||
353 | default: | ||
354 | if (i == 2) | ||
355 | continue; /* IRQ2 is never connected */ | ||
356 | } | ||
357 | |||
358 | if (ELCR_fallback) { | ||
359 | /* | ||
360 | * If the ELCR indicates a level-sensitive interrupt, we | ||
361 | * copy that information over to the MP table in the | ||
362 | * irqflag field (level sensitive, active high polarity). | ||
363 | */ | ||
364 | if (ELCR_trigger(i)) | ||
365 | intsrc.mpc_irqflag = 13; | ||
366 | else | ||
367 | intsrc.mpc_irqflag = 0; | ||
368 | } | ||
369 | |||
370 | intsrc.mpc_srcbusirq = i; | ||
371 | intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ | ||
372 | MP_intsrc_info(&intsrc); | ||
373 | } | ||
374 | |||
375 | intsrc.mpc_irqtype = mp_ExtINT; | ||
376 | intsrc.mpc_srcbusirq = 0; | ||
377 | intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ | ||
378 | MP_intsrc_info(&intsrc); | ||
379 | } | ||
380 | |||
381 | static inline void __init construct_default_ISA_mptable(int mpc_default_type) | ||
382 | { | ||
383 | struct mpc_config_processor processor; | ||
384 | struct mpc_config_bus bus; | ||
385 | struct mpc_config_ioapic ioapic; | ||
386 | struct mpc_config_lintsrc lintsrc; | ||
387 | int linttypes[2] = { mp_ExtINT, mp_NMI }; | ||
388 | int i; | ||
389 | |||
390 | /* | ||
391 | * local APIC has default address | ||
392 | */ | ||
393 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | ||
394 | |||
395 | /* | ||
396 | * 2 CPUs, numbered 0 & 1. | ||
397 | */ | ||
398 | processor.mpc_type = MP_PROCESSOR; | ||
399 | processor.mpc_apicver = 0; | ||
400 | processor.mpc_cpuflag = CPU_ENABLED; | ||
401 | processor.mpc_cpufeature = 0; | ||
402 | processor.mpc_featureflag = 0; | ||
403 | processor.mpc_reserved[0] = 0; | ||
404 | processor.mpc_reserved[1] = 0; | ||
405 | for (i = 0; i < 2; i++) { | ||
406 | processor.mpc_apicid = i; | ||
407 | MP_processor_info(&processor); | ||
408 | } | ||
409 | |||
410 | bus.mpc_type = MP_BUS; | ||
411 | bus.mpc_busid = 0; | ||
412 | switch (mpc_default_type) { | ||
413 | default: | ||
414 | printk(KERN_ERR "???\nUnknown standard configuration %d\n", | ||
415 | mpc_default_type); | ||
416 | /* fall through */ | ||
417 | case 1: | ||
418 | case 5: | ||
419 | memcpy(bus.mpc_bustype, "ISA ", 6); | ||
420 | break; | ||
421 | } | ||
422 | MP_bus_info(&bus); | ||
423 | if (mpc_default_type > 4) { | ||
424 | bus.mpc_busid = 1; | ||
425 | memcpy(bus.mpc_bustype, "PCI ", 6); | ||
426 | MP_bus_info(&bus); | ||
427 | } | ||
428 | |||
429 | ioapic.mpc_type = MP_IOAPIC; | ||
430 | ioapic.mpc_apicid = 2; | ||
431 | ioapic.mpc_apicver = 0; | ||
432 | ioapic.mpc_flags = MPC_APIC_USABLE; | ||
433 | ioapic.mpc_apicaddr = 0xFEC00000; | ||
434 | MP_ioapic_info(&ioapic); | ||
435 | |||
436 | /* | ||
437 | * We set up most of the low 16 IO-APIC pins according to MPS rules. | ||
438 | */ | ||
439 | construct_default_ioirq_mptable(mpc_default_type); | ||
440 | |||
441 | lintsrc.mpc_type = MP_LINTSRC; | ||
442 | lintsrc.mpc_irqflag = 0; /* conforming */ | ||
443 | lintsrc.mpc_srcbusid = 0; | ||
444 | lintsrc.mpc_srcbusirq = 0; | ||
445 | lintsrc.mpc_destapic = MP_APIC_ALL; | ||
446 | for (i = 0; i < 2; i++) { | ||
447 | lintsrc.mpc_irqtype = linttypes[i]; | ||
448 | lintsrc.mpc_destapiclint = i; | ||
449 | MP_lintsrc_info(&lintsrc); | ||
450 | } | ||
451 | } | ||
452 | |||
453 | static struct intel_mp_floating *mpf_found; | ||
454 | |||
455 | /* | ||
456 | * Scan the memory blocks for an SMP configuration block. | ||
457 | */ | ||
458 | void __init get_smp_config (void) | ||
459 | { | ||
460 | struct intel_mp_floating *mpf = mpf_found; | ||
461 | |||
462 | /* | ||
463 | * ACPI supports both logical (e.g. Hyper-Threading) and physical | ||
464 | * processors, where MPS only supports physical. | ||
465 | */ | ||
466 | if (acpi_lapic && acpi_ioapic) { | ||
467 | printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); | ||
468 | return; | ||
469 | } | ||
470 | else if (acpi_lapic) | ||
471 | printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); | ||
472 | |||
473 | printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); | ||
474 | |||
475 | /* | ||
476 | * Now see if we need to read further. | ||
477 | */ | ||
478 | if (mpf->mpf_feature1 != 0) { | ||
479 | |||
480 | printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); | ||
481 | construct_default_ISA_mptable(mpf->mpf_feature1); | ||
482 | |||
483 | } else if (mpf->mpf_physptr) { | ||
484 | |||
485 | /* | ||
486 | * Read the physical hardware table. Anything here will | ||
487 | * override the defaults. | ||
488 | */ | ||
489 | if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) { | ||
490 | smp_found_config = 0; | ||
491 | printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); | ||
492 | printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); | ||
493 | return; | ||
494 | } | ||
495 | /* | ||
496 | * If there are no explicit MP IRQ entries, then we are | ||
497 | * broken. We set up most of the low 16 IO-APIC pins to | ||
498 | * ISA defaults and hope it will work. | ||
499 | */ | ||
500 | if (!mp_irq_entries) { | ||
501 | struct mpc_config_bus bus; | ||
502 | |||
503 | printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); | ||
504 | |||
505 | bus.mpc_type = MP_BUS; | ||
506 | bus.mpc_busid = 0; | ||
507 | memcpy(bus.mpc_bustype, "ISA ", 6); | ||
508 | MP_bus_info(&bus); | ||
509 | |||
510 | construct_default_ioirq_mptable(0); | ||
511 | } | ||
512 | |||
513 | } else | ||
514 | BUG(); | ||
515 | |||
516 | printk(KERN_INFO "Processors: %d\n", num_processors); | ||
517 | /* | ||
518 | * Only use the first configuration found. | ||
519 | */ | ||
520 | } | ||
521 | |||
522 | static int __init smp_scan_config (unsigned long base, unsigned long length) | ||
523 | { | ||
524 | extern void __bad_mpf_size(void); | ||
525 | unsigned int *bp = phys_to_virt(base); | ||
526 | struct intel_mp_floating *mpf; | ||
527 | |||
528 | Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); | ||
529 | if (sizeof(*mpf) != 16) | ||
530 | __bad_mpf_size(); | ||
531 | |||
532 | while (length > 0) { | ||
533 | mpf = (struct intel_mp_floating *)bp; | ||
534 | if ((*bp == SMP_MAGIC_IDENT) && | ||
535 | (mpf->mpf_length == 1) && | ||
536 | !mpf_checksum((unsigned char *)bp, 16) && | ||
537 | ((mpf->mpf_specification == 1) | ||
538 | || (mpf->mpf_specification == 4)) ) { | ||
539 | |||
540 | smp_found_config = 1; | ||
541 | reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE); | ||
542 | if (mpf->mpf_physptr) | ||
543 | reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE); | ||
544 | mpf_found = mpf; | ||
545 | return 1; | ||
546 | } | ||
547 | bp += 4; | ||
548 | length -= 16; | ||
549 | } | ||
550 | return 0; | ||
551 | } | ||
552 | |||
553 | void __init find_smp_config(void) | ||
554 | { | ||
555 | unsigned int address; | ||
556 | |||
557 | /* | ||
558 | * FIXME: Linux assumes you have 640K of base RAM; | ||
559 | * this code continues that error. | ||
560 | * | ||
561 | * 1) Scan the bottom 1K for a signature | ||
562 | * 2) Scan the top 1K of base RAM | ||
563 | * 3) Scan the 64K of BIOS | ||
564 | */ | ||
565 | if (smp_scan_config(0x0,0x400) || | ||
566 | smp_scan_config(639*0x400,0x400) || | ||
567 | smp_scan_config(0xF0000,0x10000)) | ||
568 | return; | ||
569 | /* | ||
570 | * If it is an SMP machine we should know now. | ||
571 | * | ||
572 | * There is a real-mode segmented pointer to the 4K EBDA | ||
573 | * area at 0x40E; calculate its address and scan it here. | ||
574 | * | ||
575 | * NOTE! There are Linux loaders that will corrupt the EBDA | ||
576 | * area, and as such this kind of SMP config may be less | ||
577 | * trustworthy, simply because the SMP table may have been | ||
578 | * stomped on during early boot. These loaders are buggy and | ||
579 | * should be fixed. | ||
580 | */ | ||
581 | |||
582 | address = *(unsigned short *)phys_to_virt(0x40E); | ||
583 | address <<= 4; | ||
584 | if (smp_scan_config(address, 0x1000)) | ||
585 | return; | ||
586 | |||
587 | /* If we have come this far, we did not find an MP table */ | ||
588 | printk(KERN_INFO "No mptable found.\n"); | ||
589 | } | ||
590 | |||
591 | /* -------------------------------------------------------------------------- | ||
592 | ACPI-based MP Configuration | ||
593 | -------------------------------------------------------------------------- */ | ||
594 | |||
595 | #ifdef CONFIG_ACPI | ||
596 | |||
597 | void __init mp_register_lapic_address(u64 address) | ||
598 | { | ||
599 | mp_lapic_addr = (unsigned long) address; | ||
600 | set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); | ||
601 | if (boot_cpu_id == -1U) | ||
602 | boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); | ||
603 | } | ||
604 | |||
605 | void __cpuinit mp_register_lapic (u8 id, u8 enabled) | ||
606 | { | ||
607 | struct mpc_config_processor processor; | ||
608 | int boot_cpu = 0; | ||
609 | |||
610 | if (id == boot_cpu_id) | ||
611 | boot_cpu = 1; | ||
612 | |||
613 | processor.mpc_type = MP_PROCESSOR; | ||
614 | processor.mpc_apicid = id; | ||
615 | processor.mpc_apicver = 0; | ||
616 | processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); | ||
617 | processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); | ||
618 | processor.mpc_cpufeature = 0; | ||
619 | processor.mpc_featureflag = 0; | ||
620 | processor.mpc_reserved[0] = 0; | ||
621 | processor.mpc_reserved[1] = 0; | ||
622 | |||
623 | MP_processor_info(&processor); | ||
624 | } | ||
625 | |||
626 | #define MP_ISA_BUS 0 | ||
627 | #define MP_MAX_IOAPIC_PIN 127 | ||
628 | |||
629 | static struct mp_ioapic_routing { | ||
630 | int apic_id; | ||
631 | int gsi_start; | ||
632 | int gsi_end; | ||
633 | u32 pin_programmed[4]; | ||
634 | } mp_ioapic_routing[MAX_IO_APICS]; | ||
635 | |||
636 | static int mp_find_ioapic(int gsi) | ||
637 | { | ||
638 | int i = 0; | ||
639 | |||
640 | /* Find the IOAPIC that manages this GSI. */ | ||
641 | for (i = 0; i < nr_ioapics; i++) { | ||
642 | if ((gsi >= mp_ioapic_routing[i].gsi_start) | ||
643 | && (gsi <= mp_ioapic_routing[i].gsi_end)) | ||
644 | return i; | ||
645 | } | ||
646 | |||
647 | printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); | ||
648 | return -1; | ||
649 | } | ||
650 | |||
651 | static u8 uniq_ioapic_id(u8 id) | ||
652 | { | ||
653 | int i; | ||
654 | DECLARE_BITMAP(used, 256); | ||
655 | bitmap_zero(used, 256); | ||
656 | for (i = 0; i < nr_ioapics; i++) { | ||
657 | struct mpc_config_ioapic *ia = &mp_ioapics[i]; | ||
658 | __set_bit(ia->mpc_apicid, used); | ||
659 | } | ||
660 | if (!test_bit(id, used)) | ||
661 | return id; | ||
662 | return find_first_zero_bit(used, 256); | ||
663 | } | ||
664 | |||
665 | void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) | ||
666 | { | ||
667 | int idx = 0; | ||
668 | |||
669 | if (bad_ioapic(address)) | ||
670 | return; | ||
671 | |||
672 | idx = nr_ioapics; | ||
673 | |||
674 | mp_ioapics[idx].mpc_type = MP_IOAPIC; | ||
675 | mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; | ||
676 | mp_ioapics[idx].mpc_apicaddr = address; | ||
677 | |||
678 | set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); | ||
679 | mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); | ||
680 | mp_ioapics[idx].mpc_apicver = 0; | ||
681 | |||
682 | /* | ||
683 | * Build basic IRQ lookup table to facilitate gsi->io_apic lookups | ||
684 | * and to prevent reprogramming of IOAPIC pins (PCI IRQs). | ||
685 | */ | ||
686 | mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; | ||
687 | mp_ioapic_routing[idx].gsi_start = gsi_base; | ||
688 | mp_ioapic_routing[idx].gsi_end = gsi_base + | ||
689 | io_apic_get_redir_entries(idx); | ||
690 | |||
691 | printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, " | ||
692 | "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, | ||
693 | mp_ioapics[idx].mpc_apicaddr, | ||
694 | mp_ioapic_routing[idx].gsi_start, | ||
695 | mp_ioapic_routing[idx].gsi_end); | ||
696 | |||
697 | nr_ioapics++; | ||
698 | } | ||
699 | |||
700 | void __init | ||
701 | mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) | ||
702 | { | ||
703 | struct mpc_config_intsrc intsrc; | ||
704 | int ioapic = -1; | ||
705 | int pin = -1; | ||
706 | |||
707 | /* | ||
708 | * Convert 'gsi' to 'ioapic.pin'. | ||
709 | */ | ||
710 | ioapic = mp_find_ioapic(gsi); | ||
711 | if (ioapic < 0) | ||
712 | return; | ||
713 | pin = gsi - mp_ioapic_routing[ioapic].gsi_start; | ||
714 | |||
715 | /* | ||
716 | * TBD: This check is for faulty timer entries, where the override | ||
717 | * erroneously sets the trigger to level, resulting in a HUGE | ||
718 | * increase of timer interrupts! | ||
719 | */ | ||
720 | if ((bus_irq == 0) && (trigger == 3)) | ||
721 | trigger = 1; | ||
722 | |||
723 | intsrc.mpc_type = MP_INTSRC; | ||
724 | intsrc.mpc_irqtype = mp_INT; | ||
725 | intsrc.mpc_irqflag = (trigger << 2) | polarity; | ||
726 | intsrc.mpc_srcbus = MP_ISA_BUS; | ||
727 | intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ | ||
728 | intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ | ||
729 | intsrc.mpc_dstirq = pin; /* INTIN# */ | ||
730 | |||
731 | Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", | ||
732 | intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, | ||
733 | (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, | ||
734 | intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); | ||
735 | |||
736 | mp_irqs[mp_irq_entries] = intsrc; | ||
737 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
738 | panic("Max # of irq sources exceeded!\n"); | ||
739 | } | ||
740 | |||
741 | void __init mp_config_acpi_legacy_irqs(void) | ||
742 | { | ||
743 | struct mpc_config_intsrc intsrc; | ||
744 | int i = 0; | ||
745 | int ioapic = -1; | ||
746 | |||
747 | /* | ||
748 | * Fabricate the legacy ISA bus (bus #31). | ||
749 | */ | ||
750 | set_bit(MP_ISA_BUS, mp_bus_not_pci); | ||
751 | |||
752 | /* | ||
753 | * Locate the IOAPIC that manages the ISA IRQs (0-15). | ||
754 | */ | ||
755 | ioapic = mp_find_ioapic(0); | ||
756 | if (ioapic < 0) | ||
757 | return; | ||
758 | |||
759 | intsrc.mpc_type = MP_INTSRC; | ||
760 | intsrc.mpc_irqflag = 0; /* Conforming */ | ||
761 | intsrc.mpc_srcbus = MP_ISA_BUS; | ||
762 | intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; | ||
763 | |||
764 | /* | ||
765 | * Use the default configuration for IRQs 0-15, unless | ||
766 | * overridden by (MADT) interrupt source override entries. | ||
767 | */ | ||
768 | for (i = 0; i < 16; i++) { | ||
769 | int idx; | ||
770 | |||
771 | for (idx = 0; idx < mp_irq_entries; idx++) { | ||
772 | struct mpc_config_intsrc *irq = mp_irqs + idx; | ||
773 | |||
774 | /* Do we already have a mapping for this ISA IRQ? */ | ||
775 | if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) | ||
776 | break; | ||
777 | |||
778 | /* Do we already have a mapping for this IOAPIC pin */ | ||
779 | if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && | ||
780 | (irq->mpc_dstirq == i)) | ||
781 | break; | ||
782 | } | ||
783 | |||
784 | if (idx != mp_irq_entries) { | ||
785 | printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); | ||
786 | continue; /* IRQ already used */ | ||
787 | } | ||
788 | |||
789 | intsrc.mpc_irqtype = mp_INT; | ||
790 | intsrc.mpc_srcbusirq = i; /* Identity mapped */ | ||
791 | intsrc.mpc_dstirq = i; | ||
792 | |||
793 | Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " | ||
794 | "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, | ||
795 | (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, | ||
796 | intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, | ||
797 | intsrc.mpc_dstirq); | ||
798 | |||
799 | mp_irqs[mp_irq_entries] = intsrc; | ||
800 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
801 | panic("Max # of irq sources exceeded!\n"); | ||
802 | } | ||
803 | } | ||
804 | |||
805 | int mp_register_gsi(u32 gsi, int triggering, int polarity) | ||
806 | { | ||
807 | int ioapic = -1; | ||
808 | int ioapic_pin = 0; | ||
809 | int idx, bit = 0; | ||
810 | |||
811 | if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) | ||
812 | return gsi; | ||
813 | |||
814 | /* Don't set up the ACPI SCI because it's already set up */ | ||
815 | if (acpi_gbl_FADT.sci_interrupt == gsi) | ||
816 | return gsi; | ||
817 | |||
818 | ioapic = mp_find_ioapic(gsi); | ||
819 | if (ioapic < 0) { | ||
820 | printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); | ||
821 | return gsi; | ||
822 | } | ||
823 | |||
824 | ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start; | ||
825 | |||
826 | /* | ||
827 | * Avoid pin reprogramming. PRTs typically include entries | ||
828 | * with redundant pin->gsi mappings (but unique PCI devices); | ||
829 | * we only program the IOAPIC on the first. | ||
830 | */ | ||
831 | bit = ioapic_pin % 32; | ||
832 | idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); | ||
833 | if (idx > 3) { | ||
834 | printk(KERN_ERR "Invalid reference to IOAPIC pin " | ||
835 | "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, | ||
836 | ioapic_pin); | ||
837 | return gsi; | ||
838 | } | ||
839 | if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { | ||
840 | Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", | ||
841 | mp_ioapic_routing[ioapic].apic_id, ioapic_pin); | ||
842 | return gsi; | ||
843 | } | ||
844 | |||
845 | mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); | ||
846 | |||
847 | io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, | ||
848 | triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, | ||
849 | polarity == ACPI_ACTIVE_HIGH ? 0 : 1); | ||
850 | return gsi; | ||
851 | } | ||
852 | #endif /*CONFIG_ACPI*/ | ||
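smp_scan_config() above accepts a 16-byte MP floating pointer structure only if it carries the "_MP_" signature and all of its bytes, including the checksum byte, sum to zero modulo 256. A self-contained user-space sketch of that checksum rule follows, run over a fabricated buffer; only the signature and the self-cancelling checksum byte are filled in.

#include <stdio.h>

/* same rule as mpf_checksum() above: all bytes must sum to 0 mod 256 */
static int mpf_checksum(const unsigned char *mp, int len)
{
	int sum = 0;

	while (len--)
		sum += *mp++;
	return sum & 0xFF;
}

int main(void)
{
	unsigned char mpf[16] = { '_', 'M', 'P', '_' };	/* fabricated structure */
	int i, sum = 0;

	for (i = 0; i < 15; i++)
		sum += mpf[i];
	mpf[15] = (unsigned char)(0x100 - (sum & 0xFF));	/* self-cancelling byte */

	printf("floating pointer checksum %s\n",
	       mpf_checksum(mpf, 16) == 0 ? "OK" : "bad");
	return 0;
}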
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c new file mode 100644 index 000000000000..0ec6d2ddb931 --- /dev/null +++ b/arch/x86/kernel/nmi_64.c | |||
@@ -0,0 +1,483 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/nmi.c | ||
3 | * | ||
4 | * NMI watchdog support on APIC systems | ||
5 | * | ||
6 | * Started by Ingo Molnar <mingo@redhat.com> | ||
7 | * | ||
8 | * Fixes: | ||
9 | * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. | ||
10 | * Mikael Pettersson : Power Management for local APIC NMI watchdog. | ||
11 | * Pavel Machek and | ||
12 | * Mikael Pettersson : PM converted to driver model. Disable/enable API. | ||
13 | */ | ||
14 | |||
15 | #include <linux/nmi.h> | ||
16 | #include <linux/mm.h> | ||
17 | #include <linux/delay.h> | ||
18 | #include <linux/interrupt.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/sysdev.h> | ||
21 | #include <linux/sysctl.h> | ||
22 | #include <linux/kprobes.h> | ||
23 | #include <linux/cpumask.h> | ||
24 | #include <linux/kdebug.h> | ||
25 | |||
26 | #include <asm/smp.h> | ||
27 | #include <asm/nmi.h> | ||
28 | #include <asm/proto.h> | ||
29 | #include <asm/mce.h> | ||
30 | |||
31 | int unknown_nmi_panic; | ||
32 | int nmi_watchdog_enabled; | ||
33 | int panic_on_unrecovered_nmi; | ||
34 | |||
35 | static cpumask_t backtrace_mask = CPU_MASK_NONE; | ||
36 | |||
37 | /* nmi_active: | ||
38 | * >0: the lapic NMI watchdog is active, but can be disabled | ||
39 | * <0: the lapic NMI watchdog has not been set up, and cannot | ||
40 | * be enabled | ||
41 | * 0: the lapic NMI watchdog is disabled, but can be enabled | ||
42 | */ | ||
43 | atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ | ||
44 | int panic_on_timeout; | ||
45 | |||
46 | unsigned int nmi_watchdog = NMI_DEFAULT; | ||
47 | static unsigned int nmi_hz = HZ; | ||
48 | |||
49 | static DEFINE_PER_CPU(short, wd_enabled); | ||
50 | |||
51 | /* local prototypes */ | ||
52 | static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); | ||
53 | |||
54 | /* Run after command line and cpu_init init, but before all other checks */ | ||
55 | void nmi_watchdog_default(void) | ||
56 | { | ||
57 | if (nmi_watchdog != NMI_DEFAULT) | ||
58 | return; | ||
59 | nmi_watchdog = NMI_NONE; | ||
60 | } | ||
61 | |||
62 | static int endflag __initdata = 0; | ||
63 | |||
64 | #ifdef CONFIG_SMP | ||
65 | /* The performance counters used by NMI_LOCAL_APIC don't trigger when | ||
66 | * the CPU is idle. To make sure the NMI watchdog really ticks on all | ||
67 | * CPUs during the test, make them busy. | ||
68 | */ | ||
69 | static __init void nmi_cpu_busy(void *data) | ||
70 | { | ||
71 | local_irq_enable_in_hardirq(); | ||
72 | /* Intentionally don't use cpu_relax here. This is | ||
73 | to make sure that the performance counter really ticks, | ||
74 | even if there is a simulator or similar that catches the | ||
75 | pause instruction. On a real HT machine this is fine because | ||
76 | all other CPUs are busy with "useless" delay loops and don't | ||
77 | care if they get somewhat fewer cycles. */ | ||
78 | while (endflag == 0) | ||
79 | mb(); | ||
80 | } | ||
81 | #endif | ||
82 | |||
83 | int __init check_nmi_watchdog (void) | ||
84 | { | ||
85 | int *counts; | ||
86 | int cpu; | ||
87 | |||
88 | if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) | ||
89 | return 0; | ||
90 | |||
91 | if (!atomic_read(&nmi_active)) | ||
92 | return 0; | ||
93 | |||
94 | counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); | ||
95 | if (!counts) | ||
96 | return -1; | ||
97 | |||
98 | printk(KERN_INFO "testing NMI watchdog ... "); | ||
99 | |||
100 | #ifdef CONFIG_SMP | ||
101 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
102 | smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); | ||
103 | #endif | ||
104 | |||
105 | for (cpu = 0; cpu < NR_CPUS; cpu++) | ||
106 | counts[cpu] = cpu_pda(cpu)->__nmi_count; | ||
107 | local_irq_enable(); | ||
108 | mdelay((20*1000)/nmi_hz); /* wait 20 ticks */ | ||
109 | |||
110 | for_each_online_cpu(cpu) { | ||
111 | if (!per_cpu(wd_enabled, cpu)) | ||
112 | continue; | ||
113 | if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { | ||
114 | printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", | ||
115 | cpu, | ||
116 | counts[cpu], | ||
117 | cpu_pda(cpu)->__nmi_count); | ||
118 | per_cpu(wd_enabled, cpu) = 0; | ||
119 | atomic_dec(&nmi_active); | ||
120 | } | ||
121 | } | ||
122 | if (!atomic_read(&nmi_active)) { | ||
123 | kfree(counts); | ||
124 | atomic_set(&nmi_active, -1); | ||
125 | endflag = 1; | ||
126 | return -1; | ||
127 | } | ||
128 | endflag = 1; | ||
129 | printk("OK.\n"); | ||
130 | |||
131 | /* now that we know it works we can reduce NMI frequency to | ||
132 | something more reasonable; makes a difference in some configs */ | ||
133 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
134 | nmi_hz = lapic_adjust_nmi_hz(1); | ||
135 | |||
136 | kfree(counts); | ||
137 | return 0; | ||
138 | } | ||
139 | |||
140 | int __init setup_nmi_watchdog(char *str) | ||
141 | { | ||
142 | int nmi; | ||
143 | |||
144 | if (!strncmp(str,"panic",5)) { | ||
145 | panic_on_timeout = 1; | ||
146 | str = strchr(str, ','); | ||
147 | if (!str) | ||
148 | return 1; | ||
149 | ++str; | ||
150 | } | ||
151 | |||
152 | get_option(&str, &nmi); | ||
153 | |||
154 | if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) | ||
155 | return 0; | ||
156 | |||
157 | nmi_watchdog = nmi; | ||
158 | return 1; | ||
159 | } | ||
160 | |||
161 | __setup("nmi_watchdog=", setup_nmi_watchdog); | ||
162 | |||
163 | |||
164 | static void __acpi_nmi_disable(void *__unused) | ||
165 | { | ||
166 | apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); | ||
167 | } | ||
168 | |||
169 | /* | ||
170 | * Disable timer based NMIs on all CPUs: | ||
171 | */ | ||
172 | void acpi_nmi_disable(void) | ||
173 | { | ||
174 | if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) | ||
175 | on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); | ||
176 | } | ||
177 | |||
178 | static void __acpi_nmi_enable(void *__unused) | ||
179 | { | ||
180 | apic_write(APIC_LVT0, APIC_DM_NMI); | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * Enable timer based NMIs on all CPUs: | ||
185 | */ | ||
186 | void acpi_nmi_enable(void) | ||
187 | { | ||
188 | if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) | ||
189 | on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); | ||
190 | } | ||
191 | #ifdef CONFIG_PM | ||
192 | |||
193 | static int nmi_pm_active; /* nmi_active before suspend */ | ||
194 | |||
195 | static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) | ||
196 | { | ||
197 | /* only CPU0 goes here, other CPUs should be offline */ | ||
198 | nmi_pm_active = atomic_read(&nmi_active); | ||
199 | stop_apic_nmi_watchdog(NULL); | ||
200 | BUG_ON(atomic_read(&nmi_active) != 0); | ||
201 | return 0; | ||
202 | } | ||
203 | |||
204 | static int lapic_nmi_resume(struct sys_device *dev) | ||
205 | { | ||
206 | /* only CPU0 goes here, other CPUs should be offline */ | ||
207 | if (nmi_pm_active > 0) { | ||
208 | setup_apic_nmi_watchdog(NULL); | ||
209 | touch_nmi_watchdog(); | ||
210 | } | ||
211 | return 0; | ||
212 | } | ||
213 | |||
214 | static struct sysdev_class nmi_sysclass = { | ||
215 | set_kset_name("lapic_nmi"), | ||
216 | .resume = lapic_nmi_resume, | ||
217 | .suspend = lapic_nmi_suspend, | ||
218 | }; | ||
219 | |||
220 | static struct sys_device device_lapic_nmi = { | ||
221 | .id = 0, | ||
222 | .cls = &nmi_sysclass, | ||
223 | }; | ||
224 | |||
225 | static int __init init_lapic_nmi_sysfs(void) | ||
226 | { | ||
227 | int error; | ||
228 | |||
229 | /* should really be a BUG_ON but b/c this is an | ||
230 | * init call, it just doesn't work. -dcz | ||
231 | */ | ||
232 | if (nmi_watchdog != NMI_LOCAL_APIC) | ||
233 | return 0; | ||
234 | |||
235 | if ( atomic_read(&nmi_active) < 0 ) | ||
236 | return 0; | ||
237 | |||
238 | error = sysdev_class_register(&nmi_sysclass); | ||
239 | if (!error) | ||
240 | error = sysdev_register(&device_lapic_nmi); | ||
241 | return error; | ||
242 | } | ||
243 | /* must come after the local APIC's device_initcall() */ | ||
244 | late_initcall(init_lapic_nmi_sysfs); | ||
245 | |||
246 | #endif /* CONFIG_PM */ | ||
247 | |||
248 | void setup_apic_nmi_watchdog(void *unused) | ||
249 | { | ||
250 | if (__get_cpu_var(wd_enabled) == 1) | ||
251 | return; | ||
252 | |||
253 | /* cheap hack to support suspend/resume */ | ||
254 | /* if cpu0 is not active, neither should the other cpus be */ | ||
255 | if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) | ||
256 | return; | ||
257 | |||
258 | switch (nmi_watchdog) { | ||
259 | case NMI_LOCAL_APIC: | ||
260 | __get_cpu_var(wd_enabled) = 1; | ||
261 | if (lapic_watchdog_init(nmi_hz) < 0) { | ||
262 | __get_cpu_var(wd_enabled) = 0; | ||
263 | return; | ||
264 | } | ||
265 | /* FALL THROUGH */ | ||
266 | case NMI_IO_APIC: | ||
267 | __get_cpu_var(wd_enabled) = 1; | ||
268 | atomic_inc(&nmi_active); | ||
269 | } | ||
270 | } | ||
271 | |||
272 | void stop_apic_nmi_watchdog(void *unused) | ||
273 | { | ||
274 | /* only support LOCAL and IO APICs for now */ | ||
275 | if ((nmi_watchdog != NMI_LOCAL_APIC) && | ||
276 | (nmi_watchdog != NMI_IO_APIC)) | ||
277 | return; | ||
278 | if (__get_cpu_var(wd_enabled) == 0) | ||
279 | return; | ||
280 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
281 | lapic_watchdog_stop(); | ||
282 | __get_cpu_var(wd_enabled) = 0; | ||
283 | atomic_dec(&nmi_active); | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * the best way to detect whether a CPU has a 'hard lockup' problem | ||
288 | * is to check its local APIC timer IRQ counts. If they are not | ||
289 | * changing then that CPU has some problem. | ||
290 | * | ||
291 | * as these watchdog NMI IRQs are generated on every CPU, we only | ||
292 | * have to check the current processor. | ||
293 | */ | ||
294 | |||
295 | static DEFINE_PER_CPU(unsigned, last_irq_sum); | ||
296 | static DEFINE_PER_CPU(local_t, alert_counter); | ||
297 | static DEFINE_PER_CPU(int, nmi_touch); | ||
298 | |||
299 | void touch_nmi_watchdog(void) | ||
300 | { | ||
301 | if (nmi_watchdog > 0) { | ||
302 | unsigned cpu; | ||
303 | |||
304 | /* | ||
305 | * Tell other CPUs to reset their alert counters. We cannot | ||
306 | * do it ourselves because the alert count increase is not | ||
307 | * atomic. | ||
308 | */ | ||
309 | for_each_present_cpu(cpu) { | ||
310 | if (per_cpu(nmi_touch, cpu) != 1) | ||
311 | per_cpu(nmi_touch, cpu) = 1; | ||
312 | } | ||
313 | } | ||
314 | |||
315 | touch_softlockup_watchdog(); | ||
316 | } | ||
317 | |||
318 | int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) | ||
319 | { | ||
320 | int sum; | ||
321 | int touched = 0; | ||
322 | int cpu = smp_processor_id(); | ||
323 | int rc = 0; | ||
324 | |||
325 | /* check for other users first */ | ||
326 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) | ||
327 | == NOTIFY_STOP) { | ||
328 | rc = 1; | ||
329 | touched = 1; | ||
330 | } | ||
331 | |||
332 | sum = read_pda(apic_timer_irqs); | ||
333 | if (__get_cpu_var(nmi_touch)) { | ||
334 | __get_cpu_var(nmi_touch) = 0; | ||
335 | touched = 1; | ||
336 | } | ||
337 | |||
338 | if (cpu_isset(cpu, backtrace_mask)) { | ||
339 | static DEFINE_SPINLOCK(lock); /* Serialise the printks */ | ||
340 | |||
341 | spin_lock(&lock); | ||
342 | printk("NMI backtrace for cpu %d\n", cpu); | ||
343 | dump_stack(); | ||
344 | spin_unlock(&lock); | ||
345 | cpu_clear(cpu, backtrace_mask); | ||
346 | } | ||
347 | |||
348 | #ifdef CONFIG_X86_MCE | ||
349 | /* Could check oops_in_progress here too, but it's safer | ||
350 | not to */ | ||
351 | if (atomic_read(&mce_entry) > 0) | ||
352 | touched = 1; | ||
353 | #endif | ||
354 | /* if the apic timer isn't firing, this cpu isn't doing much */ | ||
355 | if (!touched && __get_cpu_var(last_irq_sum) == sum) { | ||
356 | /* | ||
357 | * Ayiee, looks like this CPU is stuck ... | ||
358 | * wait a few IRQs (5 seconds) before doing the oops ... | ||
359 | */ | ||
360 | local_inc(&__get_cpu_var(alert_counter)); | ||
361 | if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) | ||
362 | die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, | ||
363 | panic_on_timeout); | ||
364 | } else { | ||
365 | __get_cpu_var(last_irq_sum) = sum; | ||
366 | local_set(&__get_cpu_var(alert_counter), 0); | ||
367 | } | ||
368 | |||
369 | /* see if the nmi watchdog went off */ | ||
370 | if (!__get_cpu_var(wd_enabled)) | ||
371 | return rc; | ||
372 | switch (nmi_watchdog) { | ||
373 | case NMI_LOCAL_APIC: | ||
374 | rc |= lapic_wd_event(nmi_hz); | ||
375 | break; | ||
376 | case NMI_IO_APIC: | ||
377 | /* don't know how to accurately check for this. | ||
378 | * just assume it was a watchdog timer interrupt | ||
379 | * This matches the old behaviour. | ||
380 | */ | ||
381 | rc = 1; | ||
382 | break; | ||
383 | } | ||
384 | return rc; | ||
385 | } | ||
386 | |||
387 | static unsigned ignore_nmis; | ||
388 | |||
389 | asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) | ||
390 | { | ||
391 | nmi_enter(); | ||
392 | add_pda(__nmi_count,1); | ||
393 | if (!ignore_nmis) | ||
394 | default_do_nmi(regs); | ||
395 | nmi_exit(); | ||
396 | } | ||
397 | |||
398 | int do_nmi_callback(struct pt_regs * regs, int cpu) | ||
399 | { | ||
400 | #ifdef CONFIG_SYSCTL | ||
401 | if (unknown_nmi_panic) | ||
402 | return unknown_nmi_panic_callback(regs, cpu); | ||
403 | #endif | ||
404 | return 0; | ||
405 | } | ||
406 | |||
407 | void stop_nmi(void) | ||
408 | { | ||
409 | acpi_nmi_disable(); | ||
410 | ignore_nmis++; | ||
411 | } | ||
412 | |||
413 | void restart_nmi(void) | ||
414 | { | ||
415 | ignore_nmis--; | ||
416 | acpi_nmi_enable(); | ||
417 | } | ||
418 | |||
419 | #ifdef CONFIG_SYSCTL | ||
420 | |||
421 | static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) | ||
422 | { | ||
423 | unsigned char reason = get_nmi_reason(); | ||
424 | char buf[64]; | ||
425 | |||
426 | sprintf(buf, "NMI received for unknown reason %02x\n", reason); | ||
427 | die_nmi(buf, regs, 1); /* Always panic here */ | ||
428 | return 0; | ||
429 | } | ||
430 | |||
431 | /* | ||
432 | * proc handler for /proc/sys/kernel/nmi | ||
433 | */ | ||
434 | int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, | ||
435 | void __user *buffer, size_t *length, loff_t *ppos) | ||
436 | { | ||
437 | int old_state; | ||
438 | |||
439 | nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; | ||
440 | old_state = nmi_watchdog_enabled; | ||
441 | proc_dointvec(table, write, file, buffer, length, ppos); | ||
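| /* the requested state matches the current one - nothing to do */ | ||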
442 | if (!!old_state == !!nmi_watchdog_enabled) | ||
443 | return 0; | ||
444 | |||
445 | if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { | ||
446 | printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); | ||
447 | return -EIO; | ||
448 | } | ||
449 | |||
450 | /* if nmi_watchdog is not set yet, then set it */ | ||
451 | nmi_watchdog_default(); | ||
452 | |||
453 | if (nmi_watchdog == NMI_LOCAL_APIC) { | ||
454 | if (nmi_watchdog_enabled) | ||
455 | enable_lapic_nmi_watchdog(); | ||
456 | else | ||
457 | disable_lapic_nmi_watchdog(); | ||
458 | } else { | ||
459 | printk( KERN_WARNING | ||
460 | "NMI watchdog doesn't know what hardware to touch\n"); | ||
461 | return -EIO; | ||
462 | } | ||
463 | return 0; | ||
464 | } | ||
465 | |||
466 | #endif | ||
467 | |||
468 | void __trigger_all_cpu_backtrace(void) | ||
469 | { | ||
470 | int i; | ||
471 | |||
472 | backtrace_mask = cpu_online_map; | ||
473 | /* Wait for up to 10 seconds for all CPUs to do the backtrace */ | ||
474 | for (i = 0; i < 10 * 1000; i++) { | ||
475 | if (cpus_empty(backtrace_mask)) | ||
476 | break; | ||
477 | mdelay(1); | ||
478 | } | ||
479 | } | ||
480 | |||
481 | EXPORT_SYMBOL(nmi_active); | ||
482 | EXPORT_SYMBOL(nmi_watchdog); | ||
483 | EXPORT_SYMBOL(touch_nmi_watchdog); | ||
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c new file mode 100644 index 000000000000..71da01e73f03 --- /dev/null +++ b/arch/x86/kernel/pci-calgary_64.c | |||
@@ -0,0 +1,1578 @@ | |||
1 | /* | ||
2 | * Derived from arch/powerpc/kernel/iommu.c | ||
3 | * | ||
4 | * Copyright IBM Corporation, 2006-2007 | ||
5 | * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us> | ||
6 | * | ||
7 | * Author: Jon Mason <jdmason@kudzu.us> | ||
8 | * Author: Muli Ben-Yehuda <muli@il.ibm.com> | ||
9 | |||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License as published by | ||
12 | * the Free Software Foundation; either version 2 of the License, or | ||
13 | * (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | * GNU General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * along with this program; if not, write to the Free Software | ||
22 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
23 | */ | ||
24 | |||
25 | #include <linux/kernel.h> | ||
26 | #include <linux/init.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/mm.h> | ||
30 | #include <linux/spinlock.h> | ||
31 | #include <linux/string.h> | ||
32 | #include <linux/dma-mapping.h> | ||
34 | #include <linux/bitops.h> | ||
35 | #include <linux/pci_ids.h> | ||
36 | #include <linux/pci.h> | ||
37 | #include <linux/delay.h> | ||
38 | #include <asm/iommu.h> | ||
39 | #include <asm/calgary.h> | ||
40 | #include <asm/tce.h> | ||
41 | #include <asm/pci-direct.h> | ||
42 | #include <asm/system.h> | ||
43 | #include <asm/dma.h> | ||
44 | #include <asm/rio.h> | ||
45 | |||
46 | #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT | ||
47 | int use_calgary __read_mostly = 1; | ||
48 | #else | ||
49 | int use_calgary __read_mostly = 0; | ||
50 | #endif /* CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT */ | ||
51 | |||
52 | #define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 | ||
53 | #define PCI_DEVICE_ID_IBM_CALIOC2 0x0308 | ||
54 | |||
55 | /* register offsets inside the host bridge space */ | ||
56 | #define CALGARY_CONFIG_REG 0x0108 | ||
57 | #define PHB_CSR_OFFSET 0x0110 /* Channel Status */ | ||
58 | #define PHB_PLSSR_OFFSET 0x0120 | ||
59 | #define PHB_CONFIG_RW_OFFSET 0x0160 | ||
60 | #define PHB_IOBASE_BAR_LOW 0x0170 | ||
61 | #define PHB_IOBASE_BAR_HIGH 0x0180 | ||
62 | #define PHB_MEM_1_LOW 0x0190 | ||
63 | #define PHB_MEM_1_HIGH 0x01A0 | ||
64 | #define PHB_IO_ADDR_SIZE 0x01B0 | ||
65 | #define PHB_MEM_1_SIZE 0x01C0 | ||
66 | #define PHB_MEM_ST_OFFSET 0x01D0 | ||
67 | #define PHB_AER_OFFSET 0x0200 | ||
68 | #define PHB_CONFIG_0_HIGH 0x0220 | ||
69 | #define PHB_CONFIG_0_LOW 0x0230 | ||
70 | #define PHB_CONFIG_0_END 0x0240 | ||
71 | #define PHB_MEM_2_LOW 0x02B0 | ||
72 | #define PHB_MEM_2_HIGH 0x02C0 | ||
73 | #define PHB_MEM_2_SIZE_HIGH 0x02D0 | ||
74 | #define PHB_MEM_2_SIZE_LOW 0x02E0 | ||
75 | #define PHB_DOSHOLE_OFFSET 0x08E0 | ||
76 | |||
77 | /* CalIOC2 specific */ | ||
78 | #define PHB_SAVIOR_L2 0x0DB0 | ||
79 | #define PHB_PAGE_MIG_CTRL 0x0DA8 | ||
80 | #define PHB_PAGE_MIG_DEBUG 0x0DA0 | ||
81 | #define PHB_ROOT_COMPLEX_STATUS 0x0CB0 | ||
82 | |||
83 | /* PHB_CONFIG_RW */ | ||
84 | #define PHB_TCE_ENABLE 0x20000000 | ||
85 | #define PHB_SLOT_DISABLE 0x1C000000 | ||
86 | #define PHB_DAC_DISABLE 0x01000000 | ||
87 | #define PHB_MEM2_ENABLE 0x00400000 | ||
88 | #define PHB_MCSR_ENABLE 0x00100000 | ||
89 | /* TAR (Table Address Register) */ | ||
90 | #define TAR_SW_BITS 0x0000ffffffff800fUL | ||
91 | #define TAR_VALID 0x0000000000000008UL | ||
92 | /* CSR (Channel/DMA Status Register) */ | ||
93 | #define CSR_AGENT_MASK 0xffe0ffff | ||
94 | /* CCR (Calgary Configuration Register) */ | ||
95 | #define CCR_2SEC_TIMEOUT 0x000000000000000EUL | ||
96 | /* PMCR/PMDR (Page Migration Control/Debug Registers) */ | ||
97 | #define PMR_SOFTSTOP 0x80000000 | ||
98 | #define PMR_SOFTSTOPFAULT 0x40000000 | ||
99 | #define PMR_HARDSTOP 0x20000000 | ||
100 | |||
101 | #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ | ||
102 | #define MAX_NUM_CHASSIS 8 /* max number of chassis */ | ||
103 | /* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */ | ||
104 | #define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) | ||
105 | #define PHBS_PER_CALGARY 4 | ||
106 | |||
107 | /* register offsets in Calgary's internal register space */ | ||
108 | static const unsigned long tar_offsets[] = { | ||
109 | 0x0580 /* TAR0 */, | ||
110 | 0x0588 /* TAR1 */, | ||
111 | 0x0590 /* TAR2 */, | ||
112 | 0x0598 /* TAR3 */ | ||
113 | }; | ||
114 | |||
115 | static const unsigned long split_queue_offsets[] = { | ||
116 | 0x4870 /* SPLIT QUEUE 0 */, | ||
117 | 0x5870 /* SPLIT QUEUE 1 */, | ||
118 | 0x6870 /* SPLIT QUEUE 2 */, | ||
119 | 0x7870 /* SPLIT QUEUE 3 */ | ||
120 | }; | ||
121 | |||
122 | static const unsigned long phb_offsets[] = { | ||
123 | 0x8000 /* PHB0 */, | ||
124 | 0x9000 /* PHB1 */, | ||
125 | 0xA000 /* PHB2 */, | ||
126 | 0xB000 /* PHB3 */ | ||
127 | }; | ||
128 | |||
129 | /* PHB debug registers */ | ||
130 | |||
131 | static const unsigned long phb_debug_offsets[] = { | ||
132 | 0x4000 /* PHB 0 DEBUG */, | ||
133 | 0x5000 /* PHB 1 DEBUG */, | ||
134 | 0x6000 /* PHB 2 DEBUG */, | ||
135 | 0x7000 /* PHB 3 DEBUG */ | ||
136 | }; | ||
137 | |||
138 | /* | ||
139 | * STUFF register for each debug PHB, | ||
140 | * byte 1 = start bus number, byte 2 = end bus number | ||
141 | */ | ||
142 | |||
143 | #define PHB_DEBUG_STUFF_OFFSET 0x0020 | ||
144 | |||
145 | #define EMERGENCY_PAGES 32 /* = 128KB */ | ||
146 | |||
147 | unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; | ||
148 | static int translate_empty_slots __read_mostly = 0; | ||
149 | static int calgary_detected __read_mostly = 0; | ||
150 | |||
151 | static struct rio_table_hdr *rio_table_hdr __initdata; | ||
152 | static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; | ||
153 | static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata; | ||
154 | |||
155 | struct calgary_bus_info { | ||
156 | void *tce_space; | ||
157 | unsigned char translation_disabled; | ||
158 | signed char phbid; | ||
159 | void __iomem *bbar; | ||
160 | }; | ||
161 | |||
162 | static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); | ||
163 | static void calgary_tce_cache_blast(struct iommu_table *tbl); | ||
164 | static void calgary_dump_error_regs(struct iommu_table *tbl); | ||
165 | static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); | ||
166 | static void calioc2_tce_cache_blast(struct iommu_table *tbl); | ||
167 | static void calioc2_dump_error_regs(struct iommu_table *tbl); | ||
168 | |||
169 | static struct cal_chipset_ops calgary_chip_ops = { | ||
170 | .handle_quirks = calgary_handle_quirks, | ||
171 | .tce_cache_blast = calgary_tce_cache_blast, | ||
172 | .dump_error_regs = calgary_dump_error_regs | ||
173 | }; | ||
174 | |||
175 | static struct cal_chipset_ops calioc2_chip_ops = { | ||
176 | .handle_quirks = calioc2_handle_quirks, | ||
177 | .tce_cache_blast = calioc2_tce_cache_blast, | ||
178 | .dump_error_regs = calioc2_dump_error_regs | ||
179 | }; | ||
180 | |||
181 | static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; | ||
182 | |||
183 | /* enable this to stress test the chip's TCE cache */ | ||
184 | #ifdef CONFIG_IOMMU_DEBUG | ||
185 | int debugging __read_mostly = 1; | ||
186 | |||
187 | static inline unsigned long verify_bit_range(unsigned long* bitmap, | ||
188 | int expected, unsigned long start, unsigned long end) | ||
189 | { | ||
190 | unsigned long idx = start; | ||
191 | |||
192 | BUG_ON(start >= end); | ||
193 | |||
194 | while (idx < end) { | ||
195 | if (!!test_bit(idx, bitmap) != expected) | ||
196 | return idx; | ||
197 | ++idx; | ||
198 | } | ||
199 | |||
200 | /* all bits have the expected value */ | ||
201 | return ~0UL; | ||
202 | } | ||
203 | #else /* debugging is disabled */ | ||
204 | int debugging __read_mostly = 0; | ||
205 | |||
206 | static inline unsigned long verify_bit_range(unsigned long* bitmap, | ||
207 | int expected, unsigned long start, unsigned long end) | ||
208 | { | ||
209 | return ~0UL; | ||
210 | } | ||
211 | |||
212 | #endif /* CONFIG_IOMMU_DEBUG */ | ||
213 | |||
214 | static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) | ||
215 | { | ||
216 | unsigned int npages; | ||
217 | |||
218 | npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK); | ||
219 | npages >>= PAGE_SHIFT; | ||
220 | |||
221 | return npages; | ||
222 | } | ||
223 | |||
224 | static inline int translate_phb(struct pci_dev* dev) | ||
225 | { | ||
226 | int disabled = bus_info[dev->bus->number].translation_disabled; | ||
227 | return !disabled; | ||
228 | } | ||
229 | |||
230 | static void iommu_range_reserve(struct iommu_table *tbl, | ||
231 | unsigned long start_addr, unsigned int npages) | ||
232 | { | ||
233 | unsigned long index; | ||
234 | unsigned long end; | ||
235 | unsigned long badbit; | ||
236 | unsigned long flags; | ||
237 | |||
238 | index = start_addr >> PAGE_SHIFT; | ||
239 | |||
240 | /* bail out if we're asked to reserve a region we don't cover */ | ||
241 | if (index >= tbl->it_size) | ||
242 | return; | ||
243 | |||
244 | end = index + npages; | ||
245 | if (end > tbl->it_size) /* don't go off the table */ | ||
246 | end = tbl->it_size; | ||
247 | |||
248 | spin_lock_irqsave(&tbl->it_lock, flags); | ||
249 | |||
250 | badbit = verify_bit_range(tbl->it_map, 0, index, end); | ||
251 | if (badbit != ~0UL) { | ||
252 | if (printk_ratelimit()) | ||
253 | printk(KERN_ERR "Calgary: entry already allocated at " | ||
254 | "0x%lx tbl %p dma 0x%lx npages %u\n", | ||
255 | badbit, tbl, start_addr, npages); | ||
256 | } | ||
257 | |||
258 | set_bit_string(tbl->it_map, index, npages); | ||
259 | |||
260 | spin_unlock_irqrestore(&tbl->it_lock, flags); | ||
261 | } | ||
262 | |||
263 | static unsigned long iommu_range_alloc(struct iommu_table *tbl, | ||
264 | unsigned int npages) | ||
265 | { | ||
266 | unsigned long flags; | ||
267 | unsigned long offset; | ||
268 | |||
269 | BUG_ON(npages == 0); | ||
270 | |||
271 | spin_lock_irqsave(&tbl->it_lock, flags); | ||
272 | |||
273 | offset = find_next_zero_string(tbl->it_map, tbl->it_hint, | ||
274 | tbl->it_size, npages); | ||
275 | if (offset == ~0UL) { | ||
276 | tbl->chip_ops->tce_cache_blast(tbl); | ||
277 | offset = find_next_zero_string(tbl->it_map, 0, | ||
278 | tbl->it_size, npages); | ||
279 | if (offset == ~0UL) { | ||
280 | printk(KERN_WARNING "Calgary: IOMMU full.\n"); | ||
281 | spin_unlock_irqrestore(&tbl->it_lock, flags); | ||
282 | if (panic_on_overflow) | ||
283 | panic("Calgary: fix the allocator.\n"); | ||
284 | else | ||
285 | return bad_dma_address; | ||
286 | } | ||
287 | } | ||
288 | |||
289 | set_bit_string(tbl->it_map, offset, npages); | ||
290 | tbl->it_hint = offset + npages; | ||
291 | BUG_ON(tbl->it_hint > tbl->it_size); | ||
292 | |||
293 | spin_unlock_irqrestore(&tbl->it_lock, flags); | ||
294 | |||
295 | return offset; | ||
296 | } | ||
297 | |||
298 | static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, | ||
299 | unsigned int npages, int direction) | ||
300 | { | ||
301 | unsigned long entry; | ||
302 | dma_addr_t ret = bad_dma_address; | ||
303 | |||
304 | entry = iommu_range_alloc(tbl, npages); | ||
305 | |||
306 | if (unlikely(entry == bad_dma_address)) | ||
307 | goto error; | ||
308 | |||
309 | /* set the return dma address */ | ||
310 | ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); | ||
311 | |||
312 | /* put the TCEs in the HW table */ | ||
313 | tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, | ||
314 | direction); | ||
315 | |||
316 | return ret; | ||
317 | |||
318 | error: | ||
319 | printk(KERN_WARNING "Calgary: failed to allocate %u pages in " | ||
320 | "iommu %p\n", npages, tbl); | ||
321 | return bad_dma_address; | ||
322 | } | ||
323 | |||
324 | static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, | ||
325 | unsigned int npages) | ||
326 | { | ||
327 | unsigned long entry; | ||
328 | unsigned long badbit; | ||
329 | unsigned long badend; | ||
330 | unsigned long flags; | ||
331 | |||
332 | /* were we called with bad_dma_address? */ | ||
333 | badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); | ||
334 | if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { | ||
335 | printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " | ||
336 | "address 0x%Lx\n", dma_addr); | ||
337 | WARN_ON(1); | ||
338 | return; | ||
339 | } | ||
340 | |||
341 | entry = dma_addr >> PAGE_SHIFT; | ||
342 | |||
343 | BUG_ON(entry + npages > tbl->it_size); | ||
344 | |||
345 | tce_free(tbl, entry, npages); | ||
346 | |||
347 | spin_lock_irqsave(&tbl->it_lock, flags); | ||
348 | |||
349 | badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); | ||
350 | if (badbit != ~0UL) { | ||
351 | if (printk_ratelimit()) | ||
352 | printk(KERN_ERR "Calgary: bit is off at 0x%lx " | ||
353 | "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", | ||
354 | badbit, tbl, dma_addr, entry, npages); | ||
355 | } | ||
356 | |||
357 | __clear_bit_string(tbl->it_map, entry, npages); | ||
358 | |||
359 | spin_unlock_irqrestore(&tbl->it_lock, flags); | ||
360 | } | ||
361 | |||
362 | static inline struct iommu_table *find_iommu_table(struct device *dev) | ||
363 | { | ||
364 | struct pci_dev *pdev; | ||
365 | struct pci_bus *pbus; | ||
366 | struct iommu_table *tbl; | ||
367 | |||
368 | pdev = to_pci_dev(dev); | ||
369 | |||
370 | pbus = pdev->bus; | ||
371 | |||
372 | /* is the device behind a bridge? Look for the root bus */ | ||
373 | while (pbus->parent) | ||
374 | pbus = pbus->parent; | ||
375 | |||
376 | tbl = pci_iommu(pbus); | ||
377 | |||
378 | BUG_ON(tbl && (tbl->it_busno != pbus->number)); | ||
379 | |||
380 | return tbl; | ||
381 | } | ||
382 | |||
383 | static void calgary_unmap_sg(struct device *dev, | ||
384 | struct scatterlist *sglist, int nelems, int direction) | ||
385 | { | ||
386 | struct iommu_table *tbl = find_iommu_table(dev); | ||
387 | |||
388 | if (!translate_phb(to_pci_dev(dev))) | ||
389 | return; | ||
390 | |||
391 | while (nelems--) { | ||
392 | unsigned int npages; | ||
393 | dma_addr_t dma = sglist->dma_address; | ||
394 | unsigned int dmalen = sglist->dma_length; | ||
395 | |||
396 | if (dmalen == 0) | ||
397 | break; | ||
398 | |||
399 | npages = num_dma_pages(dma, dmalen); | ||
400 | iommu_free(tbl, dma, npages); | ||
401 | sglist++; | ||
402 | } | ||
403 | } | ||
404 | |||
405 | static int calgary_nontranslate_map_sg(struct device* dev, | ||
406 | struct scatterlist *sg, int nelems, int direction) | ||
407 | { | ||
408 | int i; | ||
409 | |||
410 | for (i = 0; i < nelems; i++ ) { | ||
411 | struct scatterlist *s = &sg[i]; | ||
412 | BUG_ON(!s->page); | ||
413 | s->dma_address = virt_to_bus(page_address(s->page) + s->offset); | ||
414 | s->dma_length = s->length; | ||
415 | } | ||
416 | return nelems; | ||
417 | } | ||
418 | |||
419 | static int calgary_map_sg(struct device *dev, struct scatterlist *sg, | ||
420 | int nelems, int direction) | ||
421 | { | ||
422 | struct iommu_table *tbl = find_iommu_table(dev); | ||
423 | unsigned long vaddr; | ||
424 | unsigned int npages; | ||
425 | unsigned long entry; | ||
426 | int i; | ||
427 | |||
428 | if (!translate_phb(to_pci_dev(dev))) | ||
429 | return calgary_nontranslate_map_sg(dev, sg, nelems, direction); | ||
430 | |||
431 | for (i = 0; i < nelems; i++ ) { | ||
432 | struct scatterlist *s = &sg[i]; | ||
433 | BUG_ON(!s->page); | ||
434 | |||
435 | vaddr = (unsigned long)page_address(s->page) + s->offset; | ||
436 | npages = num_dma_pages(vaddr, s->length); | ||
437 | |||
438 | entry = iommu_range_alloc(tbl, npages); | ||
439 | if (entry == bad_dma_address) { | ||
440 | /* makes sure unmap knows to stop */ | ||
441 | s->dma_length = 0; | ||
442 | goto error; | ||
443 | } | ||
444 | |||
445 | s->dma_address = (entry << PAGE_SHIFT) | s->offset; | ||
446 | |||
447 | /* insert into HW table */ | ||
448 | tce_build(tbl, entry, npages, vaddr & PAGE_MASK, | ||
449 | direction); | ||
450 | |||
451 | s->dma_length = s->length; | ||
452 | } | ||
453 | |||
454 | return nelems; | ||
455 | error: | ||
456 | calgary_unmap_sg(dev, sg, nelems, direction); | ||
457 | for (i = 0; i < nelems; i++) { | ||
458 | sg[i].dma_address = bad_dma_address; | ||
459 | sg[i].dma_length = 0; | ||
460 | } | ||
461 | return 0; | ||
462 | } | ||
463 | |||
464 | static dma_addr_t calgary_map_single(struct device *dev, void *vaddr, | ||
465 | size_t size, int direction) | ||
466 | { | ||
467 | dma_addr_t dma_handle = bad_dma_address; | ||
468 | unsigned long uaddr; | ||
469 | unsigned int npages; | ||
470 | struct iommu_table *tbl = find_iommu_table(dev); | ||
471 | |||
472 | uaddr = (unsigned long)vaddr; | ||
473 | npages = num_dma_pages(uaddr, size); | ||
474 | |||
475 | if (translate_phb(to_pci_dev(dev))) | ||
476 | dma_handle = iommu_alloc(tbl, vaddr, npages, direction); | ||
477 | else | ||
478 | dma_handle = virt_to_bus(vaddr); | ||
479 | |||
480 | return dma_handle; | ||
481 | } | ||
482 | |||
483 | static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, | ||
484 | size_t size, int direction) | ||
485 | { | ||
486 | struct iommu_table *tbl = find_iommu_table(dev); | ||
487 | unsigned int npages; | ||
488 | |||
489 | if (!translate_phb(to_pci_dev(dev))) | ||
490 | return; | ||
491 | |||
492 | npages = num_dma_pages(dma_handle, size); | ||
493 | iommu_free(tbl, dma_handle, npages); | ||
494 | } | ||
495 | |||
496 | static void* calgary_alloc_coherent(struct device *dev, size_t size, | ||
497 | dma_addr_t *dma_handle, gfp_t flag) | ||
498 | { | ||
499 | void *ret = NULL; | ||
500 | dma_addr_t mapping; | ||
501 | unsigned int npages, order; | ||
502 | struct iommu_table *tbl = find_iommu_table(dev); | ||
503 | |||
504 | size = PAGE_ALIGN(size); /* size rounded up to full pages */ | ||
505 | npages = size >> PAGE_SHIFT; | ||
506 | order = get_order(size); | ||
507 | |||
508 | /* alloc enough pages (and possibly more) */ | ||
509 | ret = (void *)__get_free_pages(flag, order); | ||
510 | if (!ret) | ||
511 | goto error; | ||
512 | memset(ret, 0, size); | ||
513 | |||
514 | if (translate_phb(to_pci_dev(dev))) { | ||
515 | /* set up tces to cover the allocated range */ | ||
516 | mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); | ||
517 | if (mapping == bad_dma_address) | ||
518 | goto free; | ||
519 | |||
520 | *dma_handle = mapping; | ||
521 | } else /* non translated slot */ | ||
522 | *dma_handle = virt_to_bus(ret); | ||
523 | |||
524 | return ret; | ||
525 | |||
526 | free: | ||
527 | free_pages((unsigned long)ret, get_order(size)); | ||
528 | ret = NULL; | ||
529 | error: | ||
530 | return ret; | ||
531 | } | ||
532 | |||
533 | static const struct dma_mapping_ops calgary_dma_ops = { | ||
534 | .alloc_coherent = calgary_alloc_coherent, | ||
535 | .map_single = calgary_map_single, | ||
536 | .unmap_single = calgary_unmap_single, | ||
537 | .map_sg = calgary_map_sg, | ||
538 | .unmap_sg = calgary_unmap_sg, | ||
539 | }; | ||
540 | |||
541 | static inline void __iomem * busno_to_bbar(unsigned char num) | ||
542 | { | ||
543 | return bus_info[num].bbar; | ||
544 | } | ||
545 | |||
546 | static inline int busno_to_phbid(unsigned char num) | ||
547 | { | ||
548 | return bus_info[num].phbid; | ||
549 | } | ||
550 | |||
551 | static inline unsigned long split_queue_offset(unsigned char num) | ||
552 | { | ||
553 | size_t idx = busno_to_phbid(num); | ||
554 | |||
555 | return split_queue_offsets[idx]; | ||
556 | } | ||
557 | |||
558 | static inline unsigned long tar_offset(unsigned char num) | ||
559 | { | ||
560 | size_t idx = busno_to_phbid(num); | ||
561 | |||
562 | return tar_offsets[idx]; | ||
563 | } | ||
564 | |||
565 | static inline unsigned long phb_offset(unsigned char num) | ||
566 | { | ||
567 | size_t idx = busno_to_phbid(num); | ||
568 | |||
569 | return phb_offsets[idx]; | ||
570 | } | ||
571 | |||
572 | static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset) | ||
573 | { | ||
574 | unsigned long target = ((unsigned long)bar) | offset; | ||
575 | return (void __iomem*)target; | ||
576 | } | ||
577 | |||
578 | static inline int is_calioc2(unsigned short device) | ||
579 | { | ||
580 | return (device == PCI_DEVICE_ID_IBM_CALIOC2); | ||
581 | } | ||
582 | |||
583 | static inline int is_calgary(unsigned short device) | ||
584 | { | ||
585 | return (device == PCI_DEVICE_ID_IBM_CALGARY); | ||
586 | } | ||
587 | |||
588 | static inline int is_cal_pci_dev(unsigned short device) | ||
589 | { | ||
590 | return (is_calgary(device) || is_calioc2(device)); | ||
591 | } | ||
592 | |||
593 | static void calgary_tce_cache_blast(struct iommu_table *tbl) | ||
594 | { | ||
595 | u64 val; | ||
596 | u32 aer; | ||
597 | int i = 0; | ||
598 | void __iomem *bbar = tbl->bbar; | ||
599 | void __iomem *target; | ||
600 | |||
601 | /* disable arbitration on the bus */ | ||
602 | target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); | ||
603 | aer = readl(target); | ||
604 | writel(0, target); | ||
605 | |||
606 | /* read plssr to ensure it got there */ | ||
607 | target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); | ||
608 | val = readl(target); | ||
609 | |||
610 | /* poll split queues until all DMA activity is done */ | ||
611 | target = calgary_reg(bbar, split_queue_offset(tbl->it_busno)); | ||
612 | do { | ||
613 | val = readq(target); | ||
614 | i++; | ||
615 | } while ((val & 0xff) != 0xff && i < 100); | ||
616 | if (i == 100) | ||
617 | printk(KERN_WARNING "Calgary: PCI bus not quiesced, " | ||
618 | "continuing anyway\n"); | ||
619 | |||
620 | /* invalidate TCE cache */ | ||
621 | target = calgary_reg(bbar, tar_offset(tbl->it_busno)); | ||
622 | writeq(tbl->tar_val, target); | ||
623 | |||
624 | /* enable arbitration */ | ||
625 | target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); | ||
626 | writel(aer, target); | ||
627 | (void)readl(target); /* flush */ | ||
628 | } | ||
629 | |||
630 | static void calioc2_tce_cache_blast(struct iommu_table *tbl) | ||
631 | { | ||
632 | void __iomem *bbar = tbl->bbar; | ||
633 | void __iomem *target; | ||
634 | u64 val64; | ||
635 | u32 val; | ||
636 | int i = 0; | ||
637 | int count = 1; | ||
638 | unsigned char bus = tbl->it_busno; | ||
639 | |||
640 | begin: | ||
641 | printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast " | ||
642 | "sequence - count %d\n", bus, count); | ||
643 | |||
644 | /* 1. using the Page Migration Control reg set SoftStop */ | ||
645 | target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); | ||
646 | val = be32_to_cpu(readl(target)); | ||
647 | printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target); | ||
648 | val |= PMR_SOFTSTOP; | ||
649 | printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target); | ||
650 | writel(cpu_to_be32(val), target); | ||
651 | |||
652 | /* 2. poll split queues until all DMA activity is done */ | ||
653 | printk(KERN_DEBUG "2a. starting to poll split queues\n"); | ||
654 | target = calgary_reg(bbar, split_queue_offset(bus)); | ||
655 | do { | ||
656 | val64 = readq(target); | ||
657 | i++; | ||
658 | } while ((val64 & 0xff) != 0xff && i < 100); | ||
659 | if (i == 100) | ||
660 | printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, " | ||
661 | "continuing anyway\n"); | ||
662 | |||
663 | /* 3. poll Page Migration DEBUG for SoftStopFault */ | ||
664 | target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); | ||
665 | val = be32_to_cpu(readl(target)); | ||
666 | printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target); | ||
667 | |||
668 | /* 4. if SoftStopFault - goto (1) */ | ||
669 | if (val & PMR_SOFTSTOPFAULT) { | ||
670 | if (++count < 100) | ||
671 | goto begin; | ||
672 | else { | ||
673 | printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, " | ||
674 | "aborting TCE cache flush sequence!\n"); | ||
675 | return; /* pray for the best */ | ||
676 | } | ||
677 | } | ||
678 | |||
679 | /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */ | ||
680 | target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); | ||
681 | printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target); | ||
682 | val = be32_to_cpu(readl(target)); | ||
683 | printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target); | ||
684 | target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); | ||
685 | val = be32_to_cpu(readl(target)); | ||
686 | printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target); | ||
687 | |||
688 | /* 6. invalidate TCE cache */ | ||
689 | printk(KERN_DEBUG "6. invalidating TCE cache\n"); | ||
690 | target = calgary_reg(bbar, tar_offset(bus)); | ||
691 | writeq(tbl->tar_val, target); | ||
692 | |||
693 | /* 7. Re-read PMCR */ | ||
694 | printk(KERN_DEBUG "7a. Re-reading PMCR\n"); | ||
695 | target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); | ||
696 | val = be32_to_cpu(readl(target)); | ||
697 | printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target); | ||
698 | |||
699 | /* 8. Remove HardStop */ | ||
700 | printk(KERN_DEBUG "8a. removing HardStop from PMCR\n"); | ||
701 | target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); | ||
702 | val = 0; | ||
703 | printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target); | ||
704 | writel(cpu_to_be32(val), target); | ||
705 | val = be32_to_cpu(readl(target)); | ||
706 | printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target); | ||
707 | } | ||
708 | |||
709 | static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, | ||
710 | u64 limit) | ||
711 | { | ||
712 | unsigned int numpages; | ||
713 | |||
714 | limit = limit | 0xfffff; | ||
715 | limit++; | ||
716 | |||
717 | numpages = ((limit - start) >> PAGE_SHIFT); | ||
718 | iommu_range_reserve(pci_iommu(dev->bus), start, numpages); | ||
719 | } | ||
720 | |||
721 | static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) | ||
722 | { | ||
723 | void __iomem *target; | ||
724 | u64 low, high, sizelow; | ||
725 | u64 start, limit; | ||
726 | struct iommu_table *tbl = pci_iommu(dev->bus); | ||
727 | unsigned char busnum = dev->bus->number; | ||
728 | void __iomem *bbar = tbl->bbar; | ||
729 | |||
730 | /* peripheral MEM_1 region */ | ||
731 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW); | ||
732 | low = be32_to_cpu(readl(target)); | ||
733 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH); | ||
734 | high = be32_to_cpu(readl(target)); | ||
735 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE); | ||
736 | sizelow = be32_to_cpu(readl(target)); | ||
737 | |||
738 | start = (high << 32) | low; | ||
739 | limit = sizelow; | ||
740 | |||
741 | calgary_reserve_mem_region(dev, start, limit); | ||
742 | } | ||
743 | |||
744 | static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) | ||
745 | { | ||
746 | void __iomem *target; | ||
747 | u32 val32; | ||
748 | u64 low, high, sizelow, sizehigh; | ||
749 | u64 start, limit; | ||
750 | struct iommu_table *tbl = pci_iommu(dev->bus); | ||
751 | unsigned char busnum = dev->bus->number; | ||
752 | void __iomem *bbar = tbl->bbar; | ||
753 | |||
754 | /* is it enabled? */ | ||
755 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); | ||
756 | val32 = be32_to_cpu(readl(target)); | ||
757 | if (!(val32 & PHB_MEM2_ENABLE)) | ||
758 | return; | ||
759 | |||
760 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW); | ||
761 | low = be32_to_cpu(readl(target)); | ||
762 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH); | ||
763 | high = be32_to_cpu(readl(target)); | ||
764 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW); | ||
765 | sizelow = be32_to_cpu(readl(target)); | ||
766 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH); | ||
767 | sizehigh = be32_to_cpu(readl(target)); | ||
768 | |||
769 | start = (high << 32) | low; | ||
770 | limit = (sizehigh << 32) | sizelow; | ||
771 | |||
772 | calgary_reserve_mem_region(dev, start, limit); | ||
773 | } | ||
774 | |||
775 | /* | ||
776 | * some regions of the IO address space do not get translated, so we | ||
777 | * must not give devices IO addresses in those regions. The regions | ||
778 | * are the 640KB-1MB region and the two PCI peripheral memory holes. | ||
779 | * Reserve all of them in the IOMMU bitmap to avoid giving them out | ||
780 | * later. | ||
781 | */ | ||
782 | static void __init calgary_reserve_regions(struct pci_dev *dev) | ||
783 | { | ||
784 | unsigned int npages; | ||
785 | u64 start; | ||
786 | struct iommu_table *tbl = pci_iommu(dev->bus); | ||
787 | |||
788 | /* reserve EMERGENCY_PAGES from bad_dma_address and up */ | ||
789 | iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); | ||
790 | |||
791 | /* avoid the BIOS/VGA first 640KB-1MB region */ | ||
792 | /* for CalIOC2 - avoid the entire first MB */ | ||
793 | if (is_calgary(dev->device)) { | ||
794 | start = (640 * 1024); | ||
795 | npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; | ||
796 | } else { /* calioc2 */ | ||
797 | start = 0; | ||
798 | npages = (1 * 1024 * 1024) >> PAGE_SHIFT; | ||
799 | } | ||
800 | iommu_range_reserve(tbl, start, npages); | ||
801 | |||
802 | /* reserve the two PCI peripheral memory regions in IO space */ | ||
803 | calgary_reserve_peripheral_mem_1(dev); | ||
804 | calgary_reserve_peripheral_mem_2(dev); | ||
805 | } | ||
806 | |||
807 | static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) | ||
808 | { | ||
809 | u64 val64; | ||
810 | u64 table_phys; | ||
811 | void __iomem *target; | ||
812 | int ret; | ||
813 | struct iommu_table *tbl; | ||
814 | |||
815 | /* build TCE tables for each PHB */ | ||
816 | ret = build_tce_table(dev, bbar); | ||
817 | if (ret) | ||
818 | return ret; | ||
819 | |||
820 | tbl = pci_iommu(dev->bus); | ||
821 | tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; | ||
822 | tce_free(tbl, 0, tbl->it_size); | ||
823 | |||
824 | if (is_calgary(dev->device)) | ||
825 | tbl->chip_ops = &calgary_chip_ops; | ||
826 | else if (is_calioc2(dev->device)) | ||
827 | tbl->chip_ops = &calioc2_chip_ops; | ||
828 | else | ||
829 | BUG(); | ||
830 | |||
831 | calgary_reserve_regions(dev); | ||
832 | |||
833 | /* set TARs for each PHB */ | ||
834 | target = calgary_reg(bbar, tar_offset(dev->bus->number)); | ||
835 | val64 = be64_to_cpu(readq(target)); | ||
836 | |||
837 | /* zero out all TAR bits under sw control */ | ||
838 | val64 &= ~TAR_SW_BITS; | ||
839 | table_phys = (u64)__pa(tbl->it_base); | ||
840 | |||
841 | val64 |= table_phys; | ||
842 | |||
843 | BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M); | ||
844 | val64 |= (u64) specified_table_size; | ||
845 | |||
846 | tbl->tar_val = cpu_to_be64(val64); | ||
847 | |||
848 | writeq(tbl->tar_val, target); | ||
849 | readq(target); /* flush */ | ||
850 | |||
851 | return 0; | ||
852 | } | ||
853 | |||
854 | static void __init calgary_free_bus(struct pci_dev *dev) | ||
855 | { | ||
856 | u64 val64; | ||
857 | struct iommu_table *tbl = pci_iommu(dev->bus); | ||
858 | void __iomem *target; | ||
859 | unsigned int bitmapsz; | ||
860 | |||
861 | target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); | ||
862 | val64 = be64_to_cpu(readq(target)); | ||
863 | val64 &= ~TAR_SW_BITS; | ||
864 | writeq(cpu_to_be64(val64), target); | ||
865 | readq(target); /* flush */ | ||
866 | |||
867 | bitmapsz = tbl->it_size / BITS_PER_BYTE; | ||
868 | free_pages((unsigned long)tbl->it_map, get_order(bitmapsz)); | ||
869 | tbl->it_map = NULL; | ||
870 | |||
871 | kfree(tbl); | ||
872 | |||
873 | set_pci_iommu(dev->bus, NULL); | ||
874 | |||
875 | /* Can't free bootmem allocated memory after system is up :-( */ | ||
876 | bus_info[dev->bus->number].tce_space = NULL; | ||
877 | } | ||
878 | |||
879 | static void calgary_dump_error_regs(struct iommu_table *tbl) | ||
880 | { | ||
881 | void __iomem *bbar = tbl->bbar; | ||
882 | void __iomem *target; | ||
883 | u32 csr, plssr; | ||
884 | |||
885 | target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); | ||
886 | csr = be32_to_cpu(readl(target)); | ||
887 | |||
888 | target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); | ||
889 | plssr = be32_to_cpu(readl(target)); | ||
890 | |||
891 | /* If no error, the agent ID in the CSR is not valid */ | ||
892 | printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, " | ||
893 | "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr); | ||
894 | } | ||
895 | |||
896 | static void calioc2_dump_error_regs(struct iommu_table *tbl) | ||
897 | { | ||
898 | void __iomem *bbar = tbl->bbar; | ||
899 | u32 csr, csmr, plssr, mck, rcstat; | ||
900 | void __iomem *target; | ||
901 | unsigned long phboff = phb_offset(tbl->it_busno); | ||
902 | unsigned long erroff; | ||
903 | u32 errregs[7]; | ||
904 | int i; | ||
905 | |||
906 | /* dump CSR */ | ||
907 | target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET); | ||
908 | csr = be32_to_cpu(readl(target)); | ||
909 | /* dump PLSSR */ | ||
910 | target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET); | ||
911 | plssr = be32_to_cpu(readl(target)); | ||
912 | /* dump CSMR */ | ||
913 | target = calgary_reg(bbar, phboff | 0x290); | ||
914 | csmr = be32_to_cpu(readl(target)); | ||
915 | /* dump mck */ | ||
916 | target = calgary_reg(bbar, phboff | 0x800); | ||
917 | mck = be32_to_cpu(readl(target)); | ||
918 | |||
919 | printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n", | ||
920 | tbl->it_busno); | ||
921 | |||
922 | printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", | ||
923 | csr, plssr, csmr, mck); | ||
924 | |||
925 | /* dump rest of error regs */ | ||
926 | printk(KERN_EMERG "Calgary: "); | ||
927 | for (i = 0; i < ARRAY_SIZE(errregs); i++) { | ||
928 | /* err regs are at 0x810 - 0x870 */ | ||
929 | erroff = (0x810 + (i * 0x10)); | ||
930 | target = calgary_reg(bbar, phboff | erroff); | ||
931 | errregs[i] = be32_to_cpu(readl(target)); | ||
932 | printk("0x%08x@0x%lx ", errregs[i], erroff); | ||
933 | } | ||
934 | printk("\n"); | ||
935 | |||
936 | /* root complex status */ | ||
937 | target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); | ||
938 | rcstat = be32_to_cpu(readl(target)); | ||
939 | printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat, | ||
940 | PHB_ROOT_COMPLEX_STATUS); | ||
941 | } | ||
942 | |||
943 | static void calgary_watchdog(unsigned long data) | ||
944 | { | ||
945 | struct pci_dev *dev = (struct pci_dev *)data; | ||
946 | struct iommu_table *tbl = pci_iommu(dev->bus); | ||
947 | void __iomem *bbar = tbl->bbar; | ||
948 | u32 val32; | ||
949 | void __iomem *target; | ||
950 | |||
951 | target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); | ||
952 | val32 = be32_to_cpu(readl(target)); | ||
953 | |||
954 | /* If no error, the agent ID in the CSR is not valid */ | ||
955 | if (val32 & CSR_AGENT_MASK) { | ||
956 | tbl->chip_ops->dump_error_regs(tbl); | ||
957 | |||
958 | /* reset error */ | ||
959 | writel(0, target); | ||
960 | |||
961 | /* Disable bus that caused the error */ | ||
962 | target = calgary_reg(bbar, phb_offset(tbl->it_busno) | | ||
963 | PHB_CONFIG_RW_OFFSET); | ||
964 | val32 = be32_to_cpu(readl(target)); | ||
965 | val32 |= PHB_SLOT_DISABLE; | ||
966 | writel(cpu_to_be32(val32), target); | ||
967 | readl(target); /* flush */ | ||
968 | } else { | ||
969 | /* Reset the timer */ | ||
970 | mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ); | ||
971 | } | ||
972 | } | ||
973 | |||
974 | static void __init calgary_set_split_completion_timeout(void __iomem *bbar, | ||
975 | unsigned char busnum, unsigned long timeout) | ||
976 | { | ||
977 | u64 val64; | ||
978 | void __iomem *target; | ||
979 | unsigned int phb_shift = ~0; /* silence gcc */ | ||
980 | u64 mask; | ||
981 | |||
982 | switch (busno_to_phbid(busnum)) { | ||
983 | case 0: phb_shift = (63 - 19); | ||
984 | break; | ||
985 | case 1: phb_shift = (63 - 23); | ||
986 | break; | ||
987 | case 2: phb_shift = (63 - 27); | ||
988 | break; | ||
989 | case 3: phb_shift = (63 - 35); | ||
990 | break; | ||
991 | default: | ||
992 | BUG_ON(busno_to_phbid(busnum)); | ||
993 | } | ||
994 | |||
995 | target = calgary_reg(bbar, CALGARY_CONFIG_REG); | ||
996 | val64 = be64_to_cpu(readq(target)); | ||
997 | |||
998 | /* zero out this PHB's timer bits */ | ||
999 | mask = ~(0xFUL << phb_shift); | ||
1000 | val64 &= mask; | ||
1001 | val64 |= (timeout << phb_shift); | ||
1002 | writeq(cpu_to_be64(val64), target); | ||
1003 | readq(target); /* flush */ | ||
1004 | } | ||
1005 | |||
1006 | static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) | ||
1007 | { | ||
1008 | unsigned char busnum = dev->bus->number; | ||
1009 | void __iomem *bbar = tbl->bbar; | ||
1010 | void __iomem *target; | ||
1011 | u32 val; | ||
1012 | |||
1013 | /* | ||
1014 | * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1 | ||
1015 | */ | ||
1016 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2); | ||
1017 | val = cpu_to_be32(readl(target)); | ||
1018 | val |= 0x00800000; | ||
1019 | writel(cpu_to_be32(val), target); | ||
1020 | } | ||
1021 | |||
1022 | static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) | ||
1023 | { | ||
1024 | unsigned char busnum = dev->bus->number; | ||
1025 | |||
1026 | /* | ||
1027 | * Give split completion a longer timeout on bus 1 for aic94xx | ||
1028 | * http://bugzilla.kernel.org/show_bug.cgi?id=7180 | ||
1029 | */ | ||
1030 | if (is_calgary(dev->device) && (busnum == 1)) | ||
1031 | calgary_set_split_completion_timeout(tbl->bbar, busnum, | ||
1032 | CCR_2SEC_TIMEOUT); | ||
1033 | } | ||
1034 | |||
1035 | static void __init calgary_enable_translation(struct pci_dev *dev) | ||
1036 | { | ||
1037 | u32 val32; | ||
1038 | unsigned char busnum; | ||
1039 | void __iomem *target; | ||
1040 | void __iomem *bbar; | ||
1041 | struct iommu_table *tbl; | ||
1042 | |||
1043 | busnum = dev->bus->number; | ||
1044 | tbl = pci_iommu(dev->bus); | ||
1045 | bbar = tbl->bbar; | ||
1046 | |||
1047 | /* enable TCE in PHB Config Register */ | ||
1048 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); | ||
1049 | val32 = be32_to_cpu(readl(target)); | ||
1050 | val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE; | ||
1051 | |||
1052 | printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n", | ||
1053 | (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ? | ||
1054 | "Calgary" : "CalIOC2", busnum); | ||
1055 | printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this " | ||
1056 | "bus.\n"); | ||
1057 | |||
1058 | writel(cpu_to_be32(val32), target); | ||
1059 | readl(target); /* flush */ | ||
1060 | |||
1061 | init_timer(&tbl->watchdog_timer); | ||
1062 | tbl->watchdog_timer.function = &calgary_watchdog; | ||
1063 | tbl->watchdog_timer.data = (unsigned long)dev; | ||
1064 | mod_timer(&tbl->watchdog_timer, jiffies); | ||
1065 | } | ||
1066 | |||
1067 | static void __init calgary_disable_translation(struct pci_dev *dev) | ||
1068 | { | ||
1069 | u32 val32; | ||
1070 | unsigned char busnum; | ||
1071 | void __iomem *target; | ||
1072 | void __iomem *bbar; | ||
1073 | struct iommu_table *tbl; | ||
1074 | |||
1075 | busnum = dev->bus->number; | ||
1076 | tbl = pci_iommu(dev->bus); | ||
1077 | bbar = tbl->bbar; | ||
1078 | |||
1079 | /* disable TCE in PHB Config Register */ | ||
1080 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); | ||
1081 | val32 = be32_to_cpu(readl(target)); | ||
1082 | val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE); | ||
1083 | |||
1084 | printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum); | ||
1085 | writel(cpu_to_be32(val32), target); | ||
1086 | readl(target); /* flush */ | ||
1087 | |||
1088 | del_timer_sync(&tbl->watchdog_timer); | ||
1089 | } | ||
1090 | |||
1091 | static void __init calgary_init_one_nontranslated(struct pci_dev *dev) | ||
1092 | { | ||
1093 | pci_dev_get(dev); | ||
1094 | set_pci_iommu(dev->bus, NULL); | ||
1095 | |||
1096 | /* is the device behind a bridge? */ | ||
1097 | if (dev->bus->parent) | ||
1098 | dev->bus->parent->self = dev; | ||
1099 | else | ||
1100 | dev->bus->self = dev; | ||
1101 | } | ||
1102 | |||
1103 | static int __init calgary_init_one(struct pci_dev *dev) | ||
1104 | { | ||
1105 | void __iomem *bbar; | ||
1106 | struct iommu_table *tbl; | ||
1107 | int ret; | ||
1108 | |||
1109 | BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM); | ||
1110 | |||
1111 | bbar = busno_to_bbar(dev->bus->number); | ||
1112 | ret = calgary_setup_tar(dev, bbar); | ||
1113 | if (ret) | ||
1114 | goto done; | ||
1115 | |||
1116 | pci_dev_get(dev); | ||
1117 | |||
1118 | if (dev->bus->parent) { | ||
1119 | if (dev->bus->parent->self) | ||
1120 | printk(KERN_WARNING "Calgary: IEEEE, dev %p has " | ||
1121 | "bus->parent->self!\n", dev); | ||
1122 | dev->bus->parent->self = dev; | ||
1123 | } else | ||
1124 | dev->bus->self = dev; | ||
1125 | |||
1126 | tbl = pci_iommu(dev->bus); | ||
1127 | tbl->chip_ops->handle_quirks(tbl, dev); | ||
1128 | |||
1129 | calgary_enable_translation(dev); | ||
1130 | |||
1131 | return 0; | ||
1132 | |||
1133 | done: | ||
1134 | return ret; | ||
1135 | } | ||
1136 | |||
1137 | static int __init calgary_locate_bbars(void) | ||
1138 | { | ||
1139 | int ret; | ||
1140 | int rioidx, phb, bus; | ||
1141 | void __iomem *bbar; | ||
1142 | void __iomem *target; | ||
1143 | unsigned long offset; | ||
1144 | u8 start_bus, end_bus; | ||
1145 | u32 val; | ||
1146 | |||
1147 | ret = -ENODATA; | ||
1148 | for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) { | ||
1149 | struct rio_detail *rio = rio_devs[rioidx]; | ||
1150 | |||
1151 | if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY)) | ||
1152 | continue; | ||
1153 | |||
1154 | /* map entire 1MB of Calgary config space */ | ||
1155 | bbar = ioremap_nocache(rio->BBAR, 1024 * 1024); | ||
1156 | if (!bbar) | ||
1157 | goto error; | ||
1158 | |||
1159 | for (phb = 0; phb < PHBS_PER_CALGARY; phb++) { | ||
1160 | offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET; | ||
1161 | target = calgary_reg(bbar, offset); | ||
1162 | |||
1163 | val = be32_to_cpu(readl(target)); | ||
1164 | |||
1165 | start_bus = (u8)((val & 0x00FF0000) >> 16); | ||
1166 | end_bus = (u8)((val & 0x0000FF00) >> 8); | ||
1167 | |||
1168 | if (end_bus) { | ||
1169 | for (bus = start_bus; bus <= end_bus; bus++) { | ||
1170 | bus_info[bus].bbar = bbar; | ||
1171 | bus_info[bus].phbid = phb; | ||
1172 | } | ||
1173 | } else { | ||
1174 | bus_info[start_bus].bbar = bbar; | ||
1175 | bus_info[start_bus].phbid = phb; | ||
1176 | } | ||
1177 | } | ||
1178 | } | ||
1179 | |||
1180 | return 0; | ||
1181 | |||
1182 | error: | ||
1183 | /* scan bus_info and iounmap any bbars we previously ioremap'd */ | ||
1184 | for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++) | ||
1185 | if (bus_info[bus].bbar) | ||
1186 | iounmap(bus_info[bus].bbar); | ||
1187 | |||
1188 | return ret; | ||
1189 | } | ||
1190 | |||
1191 | static int __init calgary_init(void) | ||
1192 | { | ||
1193 | int ret; | ||
1194 | struct pci_dev *dev = NULL; | ||
1195 | void *tce_space; | ||
1196 | |||
1197 | ret = calgary_locate_bbars(); | ||
1198 | if (ret) | ||
1199 | return ret; | ||
1200 | |||
1201 | do { | ||
1202 | dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); | ||
1203 | if (!dev) | ||
1204 | break; | ||
1205 | if (!is_cal_pci_dev(dev->device)) | ||
1206 | continue; | ||
1207 | if (!translate_phb(dev)) { | ||
1208 | calgary_init_one_nontranslated(dev); | ||
1209 | continue; | ||
1210 | } | ||
1211 | tce_space = bus_info[dev->bus->number].tce_space; | ||
1212 | if (!tce_space && !translate_empty_slots) | ||
1213 | continue; | ||
1214 | |||
1215 | ret = calgary_init_one(dev); | ||
1216 | if (ret) | ||
1217 | goto error; | ||
1218 | } while (1); | ||
1219 | |||
1220 | return ret; | ||
1221 | |||
1222 | error: | ||
1223 | do { | ||
1224 | dev = pci_get_device_reverse(PCI_VENDOR_ID_IBM, | ||
1225 | PCI_ANY_ID, dev); | ||
1226 | if (!dev) | ||
1227 | break; | ||
1228 | if (!is_cal_pci_dev(dev->device)) | ||
1229 | continue; | ||
1230 | if (!translate_phb(dev)) { | ||
1231 | pci_dev_put(dev); | ||
1232 | continue; | ||
1233 | } | ||
1234 | if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) | ||
1235 | continue; | ||
1236 | |||
1237 | calgary_disable_translation(dev); | ||
1238 | calgary_free_bus(dev); | ||
1239 | pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ | ||
1240 | } while (1); | ||
1241 | |||
1242 | return ret; | ||
1243 | } | ||
1244 | |||
1245 | static inline int __init determine_tce_table_size(u64 ram) | ||
1246 | { | ||
1247 | int ret; | ||
1248 | |||
1249 | if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED) | ||
1250 | return specified_table_size; | ||
1251 | |||
1252 | /* | ||
1253 | * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to | ||
1254 | * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each | ||
1255 | * larger table size has twice as many entries, so shift the | ||
1256 | * max ram address by 13 to divide by 8K and then look at the | ||
1257 | * order of the result to choose between 0-7. | ||
1258 | */ | ||
1259 | ret = get_order(ram >> 13); | ||
1260 | if (ret > TCE_TABLE_SIZE_8M) | ||
1261 | ret = TCE_TABLE_SIZE_8M; | ||
1262 | |||
1263 | return ret; | ||
1264 | } | ||
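To make the size selection above concrete, here is a minimal userspace sketch (not part of the kernel file; get_order() and the TCE_TABLE_SIZE_* index are re-created locally for illustration, assuming each TCE maps one 4K page):

	#include <stdio.h>

	#define PAGE_SHIFT 12
	#define TCE_TABLE_SIZE_8M 7

	/* Simplified stand-in for the kernel's get_order(): smallest page
	 * order whose size (in 4K pages) holds the given number of bytes. */
	static int get_order(unsigned long long size)
	{
		int order = 0;

		size = (size - 1) >> PAGE_SHIFT;
		while (size) {
			order++;
			size >>= 1;
		}
		return order;
	}

	int main(void)
	{
		unsigned long long ram = 2ULL << 30;	/* 2 GB of memory */
		int idx = get_order(ram >> 13);

		if (idx > TCE_TABLE_SIZE_8M)
			idx = TCE_TABLE_SIZE_8M;
		/* 2 GB >> 13 = 256 KB, get_order(256 KB) = 6, i.e. the 4M table:
		 * 8K << 6 = 512K TCEs, each covering a 4K page = 2 GB mapped. */
		printf("TCE table size index = %d\n", idx);
		return 0;
	}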
1265 | |||
1266 | static int __init build_detail_arrays(void) | ||
1267 | { | ||
1268 | unsigned long ptr; | ||
1269 | int i, scal_detail_size, rio_detail_size; | ||
1270 | |||
1271 | if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){ | ||
1272 | printk(KERN_WARNING | ||
1273 | "Calgary: MAX_NUMNODES too low! Defined as %d, " | ||
1274 | "but system has %d nodes.\n", | ||
1275 | MAX_NUMNODES, rio_table_hdr->num_scal_dev); | ||
1276 | return -ENODEV; | ||
1277 | } | ||
1278 | |||
1279 | switch (rio_table_hdr->version){ | ||
1280 | case 2: | ||
1281 | scal_detail_size = 11; | ||
1282 | rio_detail_size = 13; | ||
1283 | break; | ||
1284 | case 3: | ||
1285 | scal_detail_size = 12; | ||
1286 | rio_detail_size = 15; | ||
1287 | break; | ||
1288 | default: | ||
1289 | printk(KERN_WARNING | ||
1290 | "Calgary: Invalid Rio Grande Table Version: %d\n", | ||
1291 | rio_table_hdr->version); | ||
1292 | return -EPROTO; | ||
1293 | } | ||
1294 | |||
1295 | ptr = ((unsigned long)rio_table_hdr) + 3; | ||
1296 | for (i = 0; i < rio_table_hdr->num_scal_dev; | ||
1297 | i++, ptr += scal_detail_size) | ||
1298 | scal_devs[i] = (struct scal_detail *)ptr; | ||
1299 | |||
1300 | for (i = 0; i < rio_table_hdr->num_rio_dev; | ||
1301 | i++, ptr += rio_detail_size) | ||
1302 | rio_devs[i] = (struct rio_detail *)ptr; | ||
1303 | |||
1304 | return 0; | ||
1305 | } | ||
1306 | |||
1307 | static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) | ||
1308 | { | ||
1309 | int dev; | ||
1310 | u32 val; | ||
1311 | |||
1312 | if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { | ||
1313 | /* | ||
1314 | * FIXME: properly scan for devices across the | ||
1315 | * PCI-to-PCI bridge on every CalIOC2 port. | ||
1316 | */ | ||
1317 | return 1; | ||
1318 | } | ||
1319 | |||
1320 | for (dev = 1; dev < 8; dev++) { | ||
1321 | val = read_pci_config(bus, dev, 0, 0); | ||
1322 | if (val != 0xffffffff) | ||
1323 | break; | ||
1324 | } | ||
1325 | return (val != 0xffffffff); | ||
1326 | } | ||
1327 | |||
1328 | void __init detect_calgary(void) | ||
1329 | { | ||
1330 | int bus; | ||
1331 | void *tbl; | ||
1332 | int calgary_found = 0; | ||
1333 | unsigned long ptr; | ||
1334 | unsigned int offset, prev_offset; | ||
1335 | int ret; | ||
1336 | |||
1337 | /* | ||
1338 | * if the user specified iommu=off or iommu=soft or we found | ||
1339 | * another HW IOMMU already, bail out. | ||
1340 | */ | ||
1341 | if (swiotlb || no_iommu || iommu_detected) | ||
1342 | return; | ||
1343 | |||
1344 | if (!use_calgary) | ||
1345 | return; | ||
1346 | |||
1347 | if (!early_pci_allowed()) | ||
1348 | return; | ||
1349 | |||
1350 | printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); | ||
1351 | |||
1352 | ptr = (unsigned long)phys_to_virt(get_bios_ebda()); | ||
1353 | |||
1354 | rio_table_hdr = NULL; | ||
1355 | prev_offset = 0; | ||
1356 | offset = 0x180; | ||
1357 | /* | ||
1358 | * The next offset is stored in the 1st word. | ||
1359 | * Only parse while the offset keeps increasing: | ||
1360 | */ | ||
1361 | while (offset > prev_offset) { | ||
1362 | /* The block id is stored in the 2nd word */ | ||
1363 | if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){ | ||
1364 | /* set the pointer past the offset & block id */ | ||
1365 | rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); | ||
1366 | break; | ||
1367 | } | ||
1368 | prev_offset = offset; | ||
1369 | offset = *((unsigned short *)(ptr + offset)); | ||
1370 | } | ||
1371 | if (!rio_table_hdr) { | ||
1372 | printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " | ||
1373 | "in EBDA - bailing!\n"); | ||
1374 | return; | ||
1375 | } | ||
1376 | |||
1377 | ret = build_detail_arrays(); | ||
1378 | if (ret) { | ||
1379 | printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); | ||
1380 | return; | ||
1381 | } | ||
1382 | |||
1383 | specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); | ||
1384 | |||
1385 | for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { | ||
1386 | struct calgary_bus_info *info = &bus_info[bus]; | ||
1387 | unsigned short pci_device; | ||
1388 | u32 val; | ||
1389 | |||
1390 | val = read_pci_config(bus, 0, 0, 0); | ||
1391 | pci_device = (val & 0xFFFF0000) >> 16; | ||
1392 | |||
1393 | if (!is_cal_pci_dev(pci_device)) | ||
1394 | continue; | ||
1395 | |||
1396 | if (info->translation_disabled) | ||
1397 | continue; | ||
1398 | |||
1399 | if (calgary_bus_has_devices(bus, pci_device) || | ||
1400 | translate_empty_slots) { | ||
1401 | tbl = alloc_tce_table(); | ||
1402 | if (!tbl) | ||
1403 | goto cleanup; | ||
1404 | info->tce_space = tbl; | ||
1405 | calgary_found = 1; | ||
1406 | } | ||
1407 | } | ||
1408 | |||
1409 | printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n", | ||
1410 | calgary_found ? "found" : "not found"); | ||
1411 | |||
1412 | if (calgary_found) { | ||
1413 | iommu_detected = 1; | ||
1414 | calgary_detected = 1; | ||
1415 | printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); | ||
1416 | printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " | ||
1417 | "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, | ||
1418 | debugging ? "enabled" : "disabled"); | ||
1419 | } | ||
1420 | return; | ||
1421 | |||
1422 | cleanup: | ||
1423 | for (--bus; bus >= 0; --bus) { | ||
1424 | struct calgary_bus_info *info = &bus_info[bus]; | ||
1425 | |||
1426 | if (info->tce_space) | ||
1427 | free_tce_table(info->tce_space); | ||
1428 | } | ||
1429 | } | ||
1430 | |||
1431 | int __init calgary_iommu_init(void) | ||
1432 | { | ||
1433 | int ret; | ||
1434 | |||
1435 | if (no_iommu || swiotlb) | ||
1436 | return -ENODEV; | ||
1437 | |||
1438 | if (!calgary_detected) | ||
1439 | return -ENODEV; | ||
1440 | |||
1441 | /* ok, we're trying to use Calgary - let's roll */ | ||
1442 | printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); | ||
1443 | |||
1444 | ret = calgary_init(); | ||
1445 | if (ret) { | ||
1446 | printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " | ||
1447 | "falling back to no_iommu\n", ret); | ||
1448 | if (end_pfn > MAX_DMA32_PFN) | ||
1449 | printk(KERN_ERR "WARNING more than 4GB of memory, " | ||
1450 | "32bit PCI may malfunction.\n"); | ||
1451 | return ret; | ||
1452 | } | ||
1453 | |||
1454 | force_iommu = 1; | ||
1455 | bad_dma_address = 0x0; | ||
1456 | dma_ops = &calgary_dma_ops; | ||
1457 | |||
1458 | return 0; | ||
1459 | } | ||
1460 | |||
1461 | static int __init calgary_parse_options(char *p) | ||
1462 | { | ||
1463 | unsigned int bridge; | ||
1464 | size_t len; | ||
1465 | char* endp; | ||
1466 | |||
1467 | while (*p) { | ||
1468 | if (!strncmp(p, "64k", 3)) | ||
1469 | specified_table_size = TCE_TABLE_SIZE_64K; | ||
1470 | else if (!strncmp(p, "128k", 4)) | ||
1471 | specified_table_size = TCE_TABLE_SIZE_128K; | ||
1472 | else if (!strncmp(p, "256k", 4)) | ||
1473 | specified_table_size = TCE_TABLE_SIZE_256K; | ||
1474 | else if (!strncmp(p, "512k", 4)) | ||
1475 | specified_table_size = TCE_TABLE_SIZE_512K; | ||
1476 | else if (!strncmp(p, "1M", 2)) | ||
1477 | specified_table_size = TCE_TABLE_SIZE_1M; | ||
1478 | else if (!strncmp(p, "2M", 2)) | ||
1479 | specified_table_size = TCE_TABLE_SIZE_2M; | ||
1480 | else if (!strncmp(p, "4M", 2)) | ||
1481 | specified_table_size = TCE_TABLE_SIZE_4M; | ||
1482 | else if (!strncmp(p, "8M", 2)) | ||
1483 | specified_table_size = TCE_TABLE_SIZE_8M; | ||
1484 | |||
1485 | len = strlen("translate_empty_slots"); | ||
1486 | if (!strncmp(p, "translate_empty_slots", len)) | ||
1487 | translate_empty_slots = 1; | ||
1488 | |||
1489 | len = strlen("disable"); | ||
1490 | if (!strncmp(p, "disable", len)) { | ||
1491 | p += len; | ||
1492 | if (*p == '=') | ||
1493 | ++p; | ||
1494 | if (*p == '\0') | ||
1495 | break; | ||
1496 | bridge = simple_strtol(p, &endp, 0); | ||
1497 | if (p == endp) | ||
1498 | break; | ||
1499 | |||
1500 | if (bridge < MAX_PHB_BUS_NUM) { | ||
1501 | printk(KERN_INFO "Calgary: disabling " | ||
1502 | "translation for PHB %#x\n", bridge); | ||
1503 | bus_info[bridge].translation_disabled = 1; | ||
1504 | } | ||
1505 | } | ||
1506 | |||
1507 | p = strpbrk(p, ","); | ||
1508 | if (!p) | ||
1509 | break; | ||
1510 | |||
1511 | p++; /* skip ',' */ | ||
1512 | } | ||
1513 | return 1; | ||
1514 | } | ||
1515 | __setup("calgary=", calgary_parse_options); | ||
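As a worked example of the parsing above, booting with calgary=256k,translate_empty_slots,disable=3 is processed left to right: 256K TCE tables are selected, translation is enabled for empty slots, and translation is disabled for PHB 3 (bridge numbers below MAX_PHB_BUS_NUM are accepted; simple_strtol() is called with base 0, so hex such as 0x1f also works).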
1516 | |||
1517 | static void __init calgary_fixup_one_tce_space(struct pci_dev *dev) | ||
1518 | { | ||
1519 | struct iommu_table *tbl; | ||
1520 | unsigned int npages; | ||
1521 | int i; | ||
1522 | |||
1523 | tbl = pci_iommu(dev->bus); | ||
1524 | |||
1525 | for (i = 0; i < 4; i++) { | ||
1526 | struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i]; | ||
1527 | |||
1528 | /* Don't give out TCEs that map MEM resources */ | ||
1529 | if (!(r->flags & IORESOURCE_MEM)) | ||
1530 | continue; | ||
1531 | |||
1532 | /* 0-based? we reserve the whole 1st MB anyway */ | ||
1533 | if (!r->start) | ||
1534 | continue; | ||
1535 | |||
1536 | /* cover the whole region */ | ||
1537 | npages = (r->end - r->start) >> PAGE_SHIFT; | ||
1538 | npages++; | ||
1539 | |||
1540 | iommu_range_reserve(tbl, r->start, npages); | ||
1541 | } | ||
1542 | } | ||
1543 | |||
1544 | static int __init calgary_fixup_tce_spaces(void) | ||
1545 | { | ||
1546 | struct pci_dev *dev = NULL; | ||
1547 | void *tce_space; | ||
1548 | |||
1549 | if (no_iommu || swiotlb || !calgary_detected) | ||
1550 | return -ENODEV; | ||
1551 | |||
1552 | printk(KERN_DEBUG "Calgary: fixing up tce spaces\n"); | ||
1553 | |||
1554 | do { | ||
1555 | dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); | ||
1556 | if (!dev) | ||
1557 | break; | ||
1558 | if (!is_cal_pci_dev(dev->device)) | ||
1559 | continue; | ||
1560 | if (!translate_phb(dev)) | ||
1561 | continue; | ||
1562 | |||
1563 | tce_space = bus_info[dev->bus->number].tce_space; | ||
1564 | if (!tce_space) | ||
1565 | continue; | ||
1566 | |||
1567 | calgary_fixup_one_tce_space(dev); | ||
1568 | |||
1569 | } while (1); | ||
1570 | |||
1571 | return 0; | ||
1572 | } | ||
1573 | |||
1574 | /* | ||
1575 | * We need to be call after pcibios_assign_resources (fs_initcall level) | ||
1576 | * and before device_initcall. | ||
1577 | */ | ||
1578 | rootfs_initcall(calgary_fixup_tce_spaces); | ||
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c new file mode 100644 index 000000000000..29711445c818 --- /dev/null +++ b/arch/x86/kernel/pci-dma_64.c | |||
@@ -0,0 +1,346 @@ | |||
1 | /* | ||
2 | * Dynamic DMA mapping support. | ||
3 | */ | ||
4 | |||
5 | #include <linux/types.h> | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/string.h> | ||
8 | #include <linux/pci.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <asm/io.h> | ||
11 | #include <asm/iommu.h> | ||
12 | #include <asm/calgary.h> | ||
13 | |||
14 | int iommu_merge __read_mostly = 0; | ||
15 | EXPORT_SYMBOL(iommu_merge); | ||
16 | |||
17 | dma_addr_t bad_dma_address __read_mostly; | ||
18 | EXPORT_SYMBOL(bad_dma_address); | ||
19 | |||
20 | /* This tells the BIO block layer to assume merging. Default to off | ||
21 | because we cannot guarantee merging later. */ | ||
22 | int iommu_bio_merge __read_mostly = 0; | ||
23 | EXPORT_SYMBOL(iommu_bio_merge); | ||
24 | |||
25 | static int iommu_sac_force __read_mostly = 0; | ||
26 | |||
27 | int no_iommu __read_mostly; | ||
28 | #ifdef CONFIG_IOMMU_DEBUG | ||
29 | int panic_on_overflow __read_mostly = 1; | ||
30 | int force_iommu __read_mostly = 1; | ||
31 | #else | ||
32 | int panic_on_overflow __read_mostly = 0; | ||
33 | int force_iommu __read_mostly= 0; | ||
34 | #endif | ||
35 | |||
36 | /* Set this to 1 if there is a HW IOMMU in the system */ | ||
37 | int iommu_detected __read_mostly = 0; | ||
38 | |||
39 | /* Dummy device used for NULL arguments (normally ISA). A smaller DMA | ||
40 | mask would probably be better, but this is bug-to-bug compatible | ||
41 | with i386. */ | ||
42 | struct device fallback_dev = { | ||
43 | .bus_id = "fallback device", | ||
44 | .coherent_dma_mask = DMA_32BIT_MASK, | ||
45 | .dma_mask = &fallback_dev.coherent_dma_mask, | ||
46 | }; | ||
47 | |||
48 | /* Allocate DMA memory on node near device */ | ||
49 | noinline static void * | ||
50 | dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) | ||
51 | { | ||
52 | struct page *page; | ||
53 | int node; | ||
54 | #ifdef CONFIG_PCI | ||
55 | if (dev->bus == &pci_bus_type) | ||
56 | node = pcibus_to_node(to_pci_dev(dev)->bus); | ||
57 | else | ||
58 | #endif | ||
59 | node = numa_node_id(); | ||
60 | |||
61 | if (node < first_node(node_online_map)) | ||
62 | node = first_node(node_online_map); | ||
63 | |||
64 | page = alloc_pages_node(node, gfp, order); | ||
65 | return page ? page_address(page) : NULL; | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * Allocate memory for a coherent mapping. | ||
70 | */ | ||
71 | void * | ||
72 | dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, | ||
73 | gfp_t gfp) | ||
74 | { | ||
75 | void *memory; | ||
76 | unsigned long dma_mask = 0; | ||
77 | u64 bus; | ||
78 | |||
79 | if (!dev) | ||
80 | dev = &fallback_dev; | ||
81 | dma_mask = dev->coherent_dma_mask; | ||
82 | if (dma_mask == 0) | ||
83 | dma_mask = DMA_32BIT_MASK; | ||
84 | |||
85 | /* Device not DMA able */ | ||
86 | if (dev->dma_mask == NULL) | ||
87 | return NULL; | ||
88 | |||
89 | /* Don't invoke OOM killer */ | ||
90 | gfp |= __GFP_NORETRY; | ||
91 | |||
92 | /* Kludge to make it bug-to-bug compatible with i386. i386 | ||
93 | uses the normal dma_mask for alloc_coherent. */ | ||
94 | dma_mask &= *dev->dma_mask; | ||
95 | |||
96 | /* Why <=? Even when the mask is smaller than 4GB it is often | ||
97 | larger than 16MB and in this case we have a chance of | ||
98 | finding fitting memory in the next higher zone first. If | ||
99 | not retry with true GFP_DMA. -AK */ | ||
100 | if (dma_mask <= DMA_32BIT_MASK) | ||
101 | gfp |= GFP_DMA32; | ||
102 | |||
103 | again: | ||
104 | memory = dma_alloc_pages(dev, gfp, get_order(size)); | ||
105 | if (memory == NULL) | ||
106 | return NULL; | ||
107 | |||
108 | { | ||
109 | int high, mmu; | ||
110 | bus = virt_to_bus(memory); | ||
111 | high = (bus + size) >= dma_mask; | ||
112 | mmu = high; | ||
113 | if (force_iommu && !(gfp & GFP_DMA)) | ||
114 | mmu = 1; | ||
115 | else if (high) { | ||
116 | free_pages((unsigned long)memory, | ||
117 | get_order(size)); | ||
118 | |||
119 | /* Don't use the 16MB ZONE_DMA unless absolutely | ||
120 | needed. It's better to use remapping first. */ | ||
121 | if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) { | ||
122 | gfp = (gfp & ~GFP_DMA32) | GFP_DMA; | ||
123 | goto again; | ||
124 | } | ||
125 | |||
126 | /* Let low level make its own zone decisions */ | ||
127 | gfp &= ~(GFP_DMA32|GFP_DMA); | ||
128 | |||
129 | if (dma_ops->alloc_coherent) | ||
130 | return dma_ops->alloc_coherent(dev, size, | ||
131 | dma_handle, gfp); | ||
132 | return NULL; | ||
133 | } | ||
134 | |||
135 | memset(memory, 0, size); | ||
136 | if (!mmu) { | ||
137 | *dma_handle = virt_to_bus(memory); | ||
138 | return memory; | ||
139 | } | ||
140 | } | ||
141 | |||
142 | if (dma_ops->alloc_coherent) { | ||
143 | free_pages((unsigned long)memory, get_order(size)); | ||
144 | gfp &= ~(GFP_DMA|GFP_DMA32); | ||
145 | return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); | ||
146 | } | ||
147 | |||
148 | if (dma_ops->map_simple) { | ||
149 | *dma_handle = dma_ops->map_simple(dev, memory, | ||
150 | size, | ||
151 | PCI_DMA_BIDIRECTIONAL); | ||
152 | if (*dma_handle != bad_dma_address) | ||
153 | return memory; | ||
154 | } | ||
155 | |||
156 | if (panic_on_overflow) | ||
157 | panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size); | ||
158 | free_pages((unsigned long)memory, get_order(size)); | ||
159 | return NULL; | ||
160 | } | ||
161 | EXPORT_SYMBOL(dma_alloc_coherent); | ||
162 | |||
163 | /* | ||
164 | * Unmap coherent memory. | ||
165 | * The caller must ensure that the device has finished accessing the mapping. | ||
166 | */ | ||
167 | void dma_free_coherent(struct device *dev, size_t size, | ||
168 | void *vaddr, dma_addr_t bus) | ||
169 | { | ||
170 | if (dma_ops->unmap_single) | ||
171 | dma_ops->unmap_single(dev, bus, size, 0); | ||
172 | free_pages((unsigned long)vaddr, get_order(size)); | ||
173 | } | ||
174 | EXPORT_SYMBOL(dma_free_coherent); | ||
175 | |||
176 | static int forbid_dac __read_mostly; | ||
177 | |||
178 | int dma_supported(struct device *dev, u64 mask) | ||
179 | { | ||
180 | #ifdef CONFIG_PCI | ||
181 | if (mask > 0xffffffff && forbid_dac > 0) { | ||
182 | |||
183 | |||
184 | |||
185 | printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", dev->bus_id); | ||
186 | return 0; | ||
187 | } | ||
188 | #endif | ||
189 | |||
190 | if (dma_ops->dma_supported) | ||
191 | return dma_ops->dma_supported(dev, mask); | ||
192 | |||
193 | /* Copied from i386. Doesn't make much sense, because it will | ||
194 | only work for pci_alloc_coherent. | ||
195 | The caller just has to use GFP_DMA in this case. */ | ||
196 | if (mask < DMA_24BIT_MASK) | ||
197 | return 0; | ||
198 | |||
199 | /* Tell the device to use SAC when IOMMU force is on. This | ||
200 | allows the driver to use cheaper accesses in some cases. | ||
201 | |||
202 | Problem with this is that if we overflow the IOMMU area and | ||
203 | return DAC as fallback address the device may not handle it | ||
204 | correctly. | ||
205 | |||
206 | As a special case some controllers have a 39bit address | ||
207 | mode that is as efficient as 32bit (aic79xx). Don't force | ||
208 | SAC for these. Assume all masks <= 40 bits are of this | ||
209 | type. Normally this doesn't make any difference, but gives | ||
210 | more gentle handling of IOMMU overflow. */ | ||
211 | if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { | ||
212 | printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask); | ||
213 | return 0; | ||
214 | } | ||
215 | |||
216 | return 1; | ||
217 | } | ||
218 | EXPORT_SYMBOL(dma_supported); | ||
219 | |||
220 | int dma_set_mask(struct device *dev, u64 mask) | ||
221 | { | ||
222 | if (!dev->dma_mask || !dma_supported(dev, mask)) | ||
223 | return -EIO; | ||
224 | *dev->dma_mask = mask; | ||
225 | return 0; | ||
226 | } | ||
227 | EXPORT_SYMBOL(dma_set_mask); | ||
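For context, a hedged sketch of how a driver typically consumes this interface (example_dma_setup is a hypothetical helper, not part of this file):

	#include <linux/pci.h>
	#include <linux/dma-mapping.h>

	/* Hypothetical probe-time helper: negotiate a 32-bit DMA mask before
	 * doing any mapping; dma_set_mask() above rejects masks the platform
	 * (or the active IOMMU) cannot satisfy. */
	static int example_dma_setup(struct pci_dev *pdev)
	{
		if (dma_set_mask(&pdev->dev, DMA_32BIT_MASK)) {
			dev_err(&pdev->dev, "no usable DMA configuration\n");
			return -EIO;
		}
		return 0;
	}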
228 | |||
229 | /* | ||
230 | * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter | ||
231 | * documentation. | ||
232 | */ | ||
233 | __init int iommu_setup(char *p) | ||
234 | { | ||
235 | iommu_merge = 1; | ||
236 | |||
237 | if (!p) | ||
238 | return -EINVAL; | ||
239 | |||
240 | while (*p) { | ||
241 | if (!strncmp(p,"off",3)) | ||
242 | no_iommu = 1; | ||
243 | /* gart_parse_options has more force support */ | ||
244 | if (!strncmp(p,"force",5)) | ||
245 | force_iommu = 1; | ||
246 | if (!strncmp(p,"noforce",7)) { | ||
247 | iommu_merge = 0; | ||
248 | force_iommu = 0; | ||
249 | } | ||
250 | |||
251 | if (!strncmp(p, "biomerge",8)) { | ||
252 | iommu_bio_merge = 4096; | ||
253 | iommu_merge = 1; | ||
254 | force_iommu = 1; | ||
255 | } | ||
256 | if (!strncmp(p, "panic",5)) | ||
257 | panic_on_overflow = 1; | ||
258 | if (!strncmp(p, "nopanic",7)) | ||
259 | panic_on_overflow = 0; | ||
260 | if (!strncmp(p, "merge",5)) { | ||
261 | iommu_merge = 1; | ||
262 | force_iommu = 1; | ||
263 | } | ||
264 | if (!strncmp(p, "nomerge",7)) | ||
265 | iommu_merge = 0; | ||
266 | if (!strncmp(p, "forcesac",8)) | ||
267 | iommu_sac_force = 1; | ||
268 | if (!strncmp(p, "allowdac", 8)) | ||
269 | forbid_dac = 0; | ||
270 | if (!strncmp(p, "nodac", 5)) | ||
271 | forbid_dac = -1; | ||
272 | |||
273 | #ifdef CONFIG_SWIOTLB | ||
274 | if (!strncmp(p, "soft",4)) | ||
275 | swiotlb = 1; | ||
276 | #endif | ||
277 | |||
278 | #ifdef CONFIG_IOMMU | ||
279 | gart_parse_options(p); | ||
280 | #endif | ||
281 | |||
282 | #ifdef CONFIG_CALGARY_IOMMU | ||
283 | if (!strncmp(p, "calgary", 7)) | ||
284 | use_calgary = 1; | ||
285 | #endif /* CONFIG_CALGARY_IOMMU */ | ||
286 | |||
287 | p += strcspn(p, ","); | ||
288 | if (*p == ',') | ||
289 | ++p; | ||
290 | } | ||
291 | return 0; | ||
292 | } | ||
293 | early_param("iommu", iommu_setup); | ||
294 | |||
295 | void __init pci_iommu_alloc(void) | ||
296 | { | ||
297 | /* | ||
298 | * The order of these functions is important for | ||
299 | * fall-back/fail-over reasons | ||
300 | */ | ||
301 | #ifdef CONFIG_IOMMU | ||
302 | iommu_hole_init(); | ||
303 | #endif | ||
304 | |||
305 | #ifdef CONFIG_CALGARY_IOMMU | ||
306 | detect_calgary(); | ||
307 | #endif | ||
308 | |||
309 | #ifdef CONFIG_SWIOTLB | ||
310 | pci_swiotlb_init(); | ||
311 | #endif | ||
312 | } | ||
313 | |||
314 | static int __init pci_iommu_init(void) | ||
315 | { | ||
316 | #ifdef CONFIG_CALGARY_IOMMU | ||
317 | calgary_iommu_init(); | ||
318 | #endif | ||
319 | |||
320 | #ifdef CONFIG_IOMMU | ||
321 | gart_iommu_init(); | ||
322 | #endif | ||
323 | |||
324 | no_iommu_init(); | ||
325 | return 0; | ||
326 | } | ||
327 | |||
328 | void pci_iommu_shutdown(void) | ||
329 | { | ||
330 | gart_iommu_shutdown(); | ||
331 | } | ||
332 | |||
333 | #ifdef CONFIG_PCI | ||
334 | /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ | ||
335 | |||
336 | static __devinit void via_no_dac(struct pci_dev *dev) | ||
337 | { | ||
338 | if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { | ||
339 | printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n"); | ||
340 | forbid_dac = 1; | ||
341 | } | ||
342 | } | ||
343 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); | ||
344 | #endif | ||
345 | /* Must execute after PCI subsystem */ | ||
346 | fs_initcall(pci_iommu_init); | ||
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c new file mode 100644 index 000000000000..4918c575d582 --- /dev/null +++ b/arch/x86/kernel/pci-gart_64.c | |||
@@ -0,0 +1,740 @@ | |||
1 | /* | ||
2 | * Dynamic DMA mapping support for AMD Hammer. | ||
3 | * | ||
4 | * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. | ||
5 | * This allows to use PCI devices that only support 32bit addresses on systems | ||
6 | * with more than 4GB. | ||
7 | * | ||
8 | * See Documentation/DMA-mapping.txt for the interface specification. | ||
9 | * | ||
10 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
11 | */ | ||
12 | |||
13 | #include <linux/types.h> | ||
14 | #include <linux/ctype.h> | ||
15 | #include <linux/agp_backend.h> | ||
16 | #include <linux/init.h> | ||
17 | #include <linux/mm.h> | ||
18 | #include <linux/string.h> | ||
19 | #include <linux/spinlock.h> | ||
20 | #include <linux/pci.h> | ||
21 | #include <linux/module.h> | ||
22 | #include <linux/topology.h> | ||
23 | #include <linux/interrupt.h> | ||
24 | #include <linux/bitops.h> | ||
25 | #include <linux/kdebug.h> | ||
26 | #include <asm/atomic.h> | ||
27 | #include <asm/io.h> | ||
28 | #include <asm/mtrr.h> | ||
29 | #include <asm/pgtable.h> | ||
30 | #include <asm/proto.h> | ||
31 | #include <asm/iommu.h> | ||
32 | #include <asm/cacheflush.h> | ||
33 | #include <asm/swiotlb.h> | ||
34 | #include <asm/dma.h> | ||
35 | #include <asm/k8.h> | ||
36 | |||
37 | unsigned long iommu_bus_base; /* GART remapping area (physical) */ | ||
38 | static unsigned long iommu_size; /* size of remapping area bytes */ | ||
39 | static unsigned long iommu_pages; /* .. and in pages */ | ||
40 | |||
41 | u32 *iommu_gatt_base; /* Remapping table */ | ||
42 | |||
43 | /* If this is disabled the IOMMU will use an optimized flushing strategy | ||
44 | of only flushing when a mapping is reused. With it true the GART is flushed | ||
45 | for every mapping. Problem is that doing the lazy flush seems to trigger | ||
46 | bugs with some popular PCI cards, in particular 3ware (but it has also | ||
47 | been seen with Qlogic at least). */ | ||
48 | int iommu_fullflush = 1; | ||
49 | |||
50 | /* Allocation bitmap for the remapping area */ | ||
51 | static DEFINE_SPINLOCK(iommu_bitmap_lock); | ||
52 | static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ | ||
53 | |||
54 | static u32 gart_unmapped_entry; | ||
55 | |||
56 | #define GPTE_VALID 1 | ||
57 | #define GPTE_COHERENT 2 | ||
58 | #define GPTE_ENCODE(x) \ | ||
59 | (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) | ||
60 | #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) | ||
61 | |||
62 | #define to_pages(addr,size) \ | ||
63 | (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) | ||
64 | |||
65 | #define EMERGENCY_PAGES 32 /* = 128KB */ | ||
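A quick standalone check of the PTE packing defined above (a userspace sketch; u64 is spelled uint64_t here): bits 12-31 of the physical address stay in place, bits 32-39 are folded into bits 4-11 of the 32-bit entry, and GPTE_DECODE() reverses it.

	#include <stdio.h>
	#include <stdint.h>

	#define GPTE_VALID	1
	#define GPTE_COHERENT	2
	#define GPTE_ENCODE(x) \
		(((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
	#define GPTE_DECODE(x)	(((x) & 0xfffff000) | (((uint64_t)(x) & 0xff0) << 28))

	int main(void)
	{
		uint64_t phys = 0x123456000ULL;		/* page-aligned, above 4 GB */
		uint32_t gpte = GPTE_ENCODE(phys);

		printf("phys=%#llx gpte=%#x decoded=%#llx\n",
		       (unsigned long long)phys, gpte,
		       (unsigned long long)GPTE_DECODE(gpte));
		/* prints: phys=0x123456000 gpte=0x23456013 decoded=0x123456000 */
		return 0;
	}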
66 | |||
67 | #ifdef CONFIG_AGP | ||
68 | #define AGPEXTERN extern | ||
69 | #else | ||
70 | #define AGPEXTERN | ||
71 | #endif | ||
72 | |||
73 | /* backdoor interface to AGP driver */ | ||
74 | AGPEXTERN int agp_memory_reserved; | ||
75 | AGPEXTERN __u32 *agp_gatt_table; | ||
76 | |||
77 | static unsigned long next_bit; /* protected by iommu_bitmap_lock */ | ||
78 | static int need_flush; /* global flush state. set for each gart wrap */ | ||
79 | |||
80 | static unsigned long alloc_iommu(int size) | ||
81 | { | ||
82 | unsigned long offset, flags; | ||
83 | |||
84 | spin_lock_irqsave(&iommu_bitmap_lock, flags); | ||
85 | offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); | ||
86 | if (offset == -1) { | ||
87 | need_flush = 1; | ||
88 | offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size); | ||
89 | } | ||
90 | if (offset != -1) { | ||
91 | set_bit_string(iommu_gart_bitmap, offset, size); | ||
92 | next_bit = offset+size; | ||
93 | if (next_bit >= iommu_pages) { | ||
94 | next_bit = 0; | ||
95 | need_flush = 1; | ||
96 | } | ||
97 | } | ||
98 | if (iommu_fullflush) | ||
99 | need_flush = 1; | ||
100 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); | ||
101 | return offset; | ||
102 | } | ||
103 | |||
104 | static void free_iommu(unsigned long offset, int size) | ||
105 | { | ||
106 | unsigned long flags; | ||
107 | spin_lock_irqsave(&iommu_bitmap_lock, flags); | ||
108 | __clear_bit_string(iommu_gart_bitmap, offset, size); | ||
109 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * Use global flush state to avoid races with multiple flushers. | ||
114 | */ | ||
115 | static void flush_gart(void) | ||
116 | { | ||
117 | unsigned long flags; | ||
118 | spin_lock_irqsave(&iommu_bitmap_lock, flags); | ||
119 | if (need_flush) { | ||
120 | k8_flush_garts(); | ||
121 | need_flush = 0; | ||
122 | } | ||
123 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); | ||
124 | } | ||
125 | |||
126 | #ifdef CONFIG_IOMMU_LEAK | ||
127 | |||
128 | #define SET_LEAK(x) if (iommu_leak_tab) \ | ||
129 | iommu_leak_tab[x] = __builtin_return_address(0); | ||
130 | #define CLEAR_LEAK(x) if (iommu_leak_tab) \ | ||
131 | iommu_leak_tab[x] = NULL; | ||
132 | |||
133 | /* Debugging aid for drivers that don't free their IOMMU tables */ | ||
134 | static void **iommu_leak_tab; | ||
135 | static int leak_trace; | ||
136 | int iommu_leak_pages = 20; | ||
137 | void dump_leak(void) | ||
138 | { | ||
139 | int i; | ||
140 | static int dump; | ||
141 | if (dump || !iommu_leak_tab) return; | ||
142 | dump = 1; | ||
143 | show_stack(NULL,NULL); | ||
144 | /* Very crude. dump some from the end of the table too */ | ||
145 | printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); | ||
146 | for (i = 0; i < iommu_leak_pages; i+=2) { | ||
147 | printk("%lu: ", iommu_pages-i); | ||
148 | printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); | ||
149 | printk("%c", (i+1)%2 == 0 ? '\n' : ' '); | ||
150 | } | ||
151 | printk("\n"); | ||
152 | } | ||
153 | #else | ||
154 | #define SET_LEAK(x) | ||
155 | #define CLEAR_LEAK(x) | ||
156 | #endif | ||
157 | |||
158 | static void iommu_full(struct device *dev, size_t size, int dir) | ||
159 | { | ||
160 | /* | ||
161 | * Ran out of IOMMU space for this operation. This is very bad. | ||
162 | * Unfortunately the drivers cannot handle this operation properly. | ||
163 | * Return some non mapped prereserved space in the aperture and | ||
164 | * let the Northbridge deal with it. This will result in garbage | ||
165 | * in the IO operation. When the size exceeds the prereserved space | ||
166 | * memory corruption will occur or random memory will be DMAed | ||
167 | * out. Hopefully no network devices use single mappings that big. | ||
168 | */ | ||
169 | |||
170 | printk(KERN_ERR | ||
171 | "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", | ||
172 | size, dev->bus_id); | ||
173 | |||
174 | if (size > PAGE_SIZE*EMERGENCY_PAGES) { | ||
175 | if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) | ||
176 | panic("PCI-DMA: Memory would be corrupted\n"); | ||
177 | if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) | ||
178 | panic("PCI-DMA: Random memory would be DMAed\n"); | ||
179 | } | ||
180 | |||
181 | #ifdef CONFIG_IOMMU_LEAK | ||
182 | dump_leak(); | ||
183 | #endif | ||
184 | } | ||
185 | |||
186 | static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) | ||
187 | { | ||
188 | u64 mask = *dev->dma_mask; | ||
189 | int high = addr + size > mask; | ||
190 | int mmu = high; | ||
191 | if (force_iommu) | ||
192 | mmu = 1; | ||
193 | return mmu; | ||
194 | } | ||
195 | |||
196 | static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) | ||
197 | { | ||
198 | u64 mask = *dev->dma_mask; | ||
199 | int high = addr + size > mask; | ||
200 | int mmu = high; | ||
201 | return mmu; | ||
202 | } | ||
203 | |||
204 | /* Map a single contiguous physical area into the IOMMU. | ||
205 | * Caller needs to check if the iommu is needed and flush. | ||
206 | */ | ||
207 | static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, | ||
208 | size_t size, int dir) | ||
209 | { | ||
210 | unsigned long npages = to_pages(phys_mem, size); | ||
211 | unsigned long iommu_page = alloc_iommu(npages); | ||
212 | int i; | ||
213 | if (iommu_page == -1) { | ||
214 | if (!nonforced_iommu(dev, phys_mem, size)) | ||
215 | return phys_mem; | ||
216 | if (panic_on_overflow) | ||
217 | panic("dma_map_area overflow %lu bytes\n", size); | ||
218 | iommu_full(dev, size, dir); | ||
219 | return bad_dma_address; | ||
220 | } | ||
221 | |||
222 | for (i = 0; i < npages; i++) { | ||
223 | iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); | ||
224 | SET_LEAK(iommu_page + i); | ||
225 | phys_mem += PAGE_SIZE; | ||
226 | } | ||
227 | return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); | ||
228 | } | ||
229 | |||
230 | static dma_addr_t gart_map_simple(struct device *dev, char *buf, | ||
231 | size_t size, int dir) | ||
232 | { | ||
233 | dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); | ||
234 | flush_gart(); | ||
235 | return map; | ||
236 | } | ||
237 | |||
238 | /* Map a single area into the IOMMU */ | ||
239 | static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) | ||
240 | { | ||
241 | unsigned long phys_mem, bus; | ||
242 | |||
243 | if (!dev) | ||
244 | dev = &fallback_dev; | ||
245 | |||
246 | phys_mem = virt_to_phys(addr); | ||
247 | if (!need_iommu(dev, phys_mem, size)) | ||
248 | return phys_mem; | ||
249 | |||
250 | bus = gart_map_simple(dev, addr, size, dir); | ||
251 | return bus; | ||
252 | } | ||
253 | |||
254 | /* | ||
255 | * Free a DMA mapping. | ||
256 | */ | ||
257 | static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, | ||
258 | size_t size, int direction) | ||
259 | { | ||
260 | unsigned long iommu_page; | ||
261 | int npages; | ||
262 | int i; | ||
263 | |||
264 | if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || | ||
265 | dma_addr >= iommu_bus_base + iommu_size) | ||
266 | return; | ||
267 | iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; | ||
268 | npages = to_pages(dma_addr, size); | ||
269 | for (i = 0; i < npages; i++) { | ||
270 | iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; | ||
271 | CLEAR_LEAK(iommu_page + i); | ||
272 | } | ||
273 | free_iommu(iommu_page, npages); | ||
274 | } | ||
275 | |||
276 | /* | ||
277 | * Wrapper for pci_unmap_single working with scatterlists. | ||
278 | */ | ||
279 | static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) | ||
280 | { | ||
281 | int i; | ||
282 | |||
283 | for (i = 0; i < nents; i++) { | ||
284 | struct scatterlist *s = &sg[i]; | ||
285 | if (!s->dma_length || !s->length) | ||
286 | break; | ||
287 | gart_unmap_single(dev, s->dma_address, s->dma_length, dir); | ||
288 | } | ||
289 | } | ||
290 | |||
291 | /* Fallback for dma_map_sg in case of overflow */ | ||
292 | static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, | ||
293 | int nents, int dir) | ||
294 | { | ||
295 | int i; | ||
296 | |||
297 | #ifdef CONFIG_IOMMU_DEBUG | ||
298 | printk(KERN_DEBUG "dma_map_sg overflow\n"); | ||
299 | #endif | ||
300 | |||
301 | for (i = 0; i < nents; i++ ) { | ||
302 | struct scatterlist *s = &sg[i]; | ||
303 | unsigned long addr = page_to_phys(s->page) + s->offset; | ||
304 | if (nonforced_iommu(dev, addr, s->length)) { | ||
305 | addr = dma_map_area(dev, addr, s->length, dir); | ||
306 | if (addr == bad_dma_address) { | ||
307 | if (i > 0) | ||
308 | gart_unmap_sg(dev, sg, i, dir); | ||
309 | nents = 0; | ||
310 | sg[0].dma_length = 0; | ||
311 | break; | ||
312 | } | ||
313 | } | ||
314 | s->dma_address = addr; | ||
315 | s->dma_length = s->length; | ||
316 | } | ||
317 | flush_gart(); | ||
318 | return nents; | ||
319 | } | ||
320 | |||
321 | /* Map multiple scatterlist entries contiguously into the first. */ | ||
322 | static int __dma_map_cont(struct scatterlist *sg, int start, int stopat, | ||
323 | struct scatterlist *sout, unsigned long pages) | ||
324 | { | ||
325 | unsigned long iommu_start = alloc_iommu(pages); | ||
326 | unsigned long iommu_page = iommu_start; | ||
327 | int i; | ||
328 | |||
329 | if (iommu_start == -1) | ||
330 | return -1; | ||
331 | |||
332 | for (i = start; i < stopat; i++) { | ||
333 | struct scatterlist *s = &sg[i]; | ||
334 | unsigned long pages, addr; | ||
335 | unsigned long phys_addr = s->dma_address; | ||
336 | |||
337 | BUG_ON(i > start && s->offset); | ||
338 | if (i == start) { | ||
339 | *sout = *s; | ||
340 | sout->dma_address = iommu_bus_base; | ||
341 | sout->dma_address += iommu_page*PAGE_SIZE + s->offset; | ||
342 | sout->dma_length = s->length; | ||
343 | } else { | ||
344 | sout->dma_length += s->length; | ||
345 | } | ||
346 | |||
347 | addr = phys_addr; | ||
348 | pages = to_pages(s->offset, s->length); | ||
349 | while (pages--) { | ||
350 | iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); | ||
351 | SET_LEAK(iommu_page); | ||
352 | addr += PAGE_SIZE; | ||
353 | iommu_page++; | ||
354 | } | ||
355 | } | ||
356 | BUG_ON(iommu_page - iommu_start != pages); | ||
357 | return 0; | ||
358 | } | ||
359 | |||
360 | static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat, | ||
361 | struct scatterlist *sout, | ||
362 | unsigned long pages, int need) | ||
363 | { | ||
364 | if (!need) { | ||
365 | BUG_ON(stopat - start != 1); | ||
366 | *sout = sg[start]; | ||
367 | sout->dma_length = sg[start].length; | ||
368 | return 0; | ||
369 | } | ||
370 | return __dma_map_cont(sg, start, stopat, sout, pages); | ||
371 | } | ||
372 | |||
373 | /* | ||
374 | * DMA map all entries in a scatterlist. | ||
375 | * Merge chunks that have page-aligned sizes into a contiguous mapping. | ||
376 | */ | ||
377 | int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) | ||
378 | { | ||
379 | int i; | ||
380 | int out; | ||
381 | int start; | ||
382 | unsigned long pages = 0; | ||
383 | int need = 0, nextneed; | ||
384 | |||
385 | if (nents == 0) | ||
386 | return 0; | ||
387 | |||
388 | if (!dev) | ||
389 | dev = &fallback_dev; | ||
390 | |||
391 | out = 0; | ||
392 | start = 0; | ||
393 | for (i = 0; i < nents; i++) { | ||
394 | struct scatterlist *s = &sg[i]; | ||
395 | dma_addr_t addr = page_to_phys(s->page) + s->offset; | ||
396 | s->dma_address = addr; | ||
397 | BUG_ON(s->length == 0); | ||
398 | |||
399 | nextneed = need_iommu(dev, addr, s->length); | ||
400 | |||
401 | /* Handle the previous not yet processed entries */ | ||
402 | if (i > start) { | ||
403 | struct scatterlist *ps = &sg[i-1]; | ||
404 | /* Can only merge when the last chunk ends on a page | ||
405 | boundary and the new one doesn't have an offset. */ | ||
406 | if (!iommu_merge || !nextneed || !need || s->offset || | ||
407 | (ps->offset + ps->length) % PAGE_SIZE) { | ||
408 | if (dma_map_cont(sg, start, i, sg+out, pages, | ||
409 | need) < 0) | ||
410 | goto error; | ||
411 | out++; | ||
412 | pages = 0; | ||
413 | start = i; | ||
414 | } | ||
415 | } | ||
416 | |||
417 | need = nextneed; | ||
418 | pages += to_pages(s->offset, s->length); | ||
419 | } | ||
420 | if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0) | ||
421 | goto error; | ||
422 | out++; | ||
423 | flush_gart(); | ||
424 | if (out < nents) | ||
425 | sg[out].dma_length = 0; | ||
426 | return out; | ||
427 | |||
428 | error: | ||
429 | flush_gart(); | ||
430 | gart_unmap_sg(dev, sg, nents, dir); | ||
431 | /* When it was forced or merged try again in a dumb way */ | ||
432 | if (force_iommu || iommu_merge) { | ||
433 | out = dma_map_sg_nonforce(dev, sg, nents, dir); | ||
434 | if (out > 0) | ||
435 | return out; | ||
436 | } | ||
437 | if (panic_on_overflow) | ||
438 | panic("dma_map_sg: overflow on %lu pages\n", pages); | ||
439 | iommu_full(dev, pages << PAGE_SHIFT, dir); | ||
440 | for (i = 0; i < nents; i++) | ||
441 | sg[i].dma_address = bad_dma_address; | ||
442 | return 0; | ||
443 | } | ||
444 | |||
445 | static int no_agp; | ||
446 | |||
447 | static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) | ||
448 | { | ||
449 | unsigned long a; | ||
450 | if (!iommu_size) { | ||
451 | iommu_size = aper_size; | ||
452 | if (!no_agp) | ||
453 | iommu_size /= 2; | ||
454 | } | ||
455 | |||
456 | a = aper + iommu_size; | ||
457 | iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; | ||
458 | |||
459 | if (iommu_size < 64*1024*1024) | ||
460 | printk(KERN_WARNING | ||
461 | "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); | ||
462 | |||
463 | return iommu_size; | ||
464 | } | ||
465 | |||
466 | static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) | ||
467 | { | ||
468 | unsigned aper_size = 0, aper_base_32; | ||
469 | u64 aper_base; | ||
470 | unsigned aper_order; | ||
471 | |||
472 | pci_read_config_dword(dev, 0x94, &aper_base_32); | ||
473 | pci_read_config_dword(dev, 0x90, &aper_order); | ||
474 | aper_order = (aper_order >> 1) & 7; | ||
475 | |||
476 | aper_base = aper_base_32 & 0x7fff; | ||
477 | aper_base <<= 25; | ||
478 | |||
479 | aper_size = (32 * 1024 * 1024) << aper_order; | ||
480 | if (aper_base + aper_size > 0x100000000UL || !aper_size) | ||
481 | aper_base = 0; | ||
482 | |||
483 | *size = aper_size; | ||
484 | return aper_base; | ||
485 | } | ||
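To illustrate the register decoding above, a small userspace sketch with example values (the register contents here are made up; real ones come from config-space offsets 0x90 and 0x94 of the K8 northbridge):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t reg_0x90 = 0x00000006;	/* order field in bits 3:1 -> 3 */
		uint32_t reg_0x94 = 0x00000020;	/* base field in bits 14:0 */

		unsigned order = (reg_0x90 >> 1) & 7;
		uint64_t base  = (uint64_t)(reg_0x94 & 0x7fff) << 25;
		uint64_t size  = (32ULL << 20) << order;	/* 32 MB << order */

		printf("aperture base %#llx, size %llu MB\n",
		       (unsigned long long)base,
		       (unsigned long long)(size >> 20));
		/* prints: aperture base 0x40000000, size 256 MB */
		return 0;
	}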
486 | |||
487 | /* | ||
488 | * Private Northbridge GATT initialization in case we cannot use the | ||
489 | * AGP driver for some reason. | ||
490 | */ | ||
491 | static __init int init_k8_gatt(struct agp_kern_info *info) | ||
492 | { | ||
493 | struct pci_dev *dev; | ||
494 | void *gatt; | ||
495 | unsigned aper_base, new_aper_base; | ||
496 | unsigned aper_size, gatt_size, new_aper_size; | ||
497 | int i; | ||
498 | |||
499 | printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); | ||
500 | aper_size = aper_base = info->aper_size = 0; | ||
501 | dev = NULL; | ||
502 | for (i = 0; i < num_k8_northbridges; i++) { | ||
503 | dev = k8_northbridges[i]; | ||
504 | new_aper_base = read_aperture(dev, &new_aper_size); | ||
505 | if (!new_aper_base) | ||
506 | goto nommu; | ||
507 | |||
508 | if (!aper_base) { | ||
509 | aper_size = new_aper_size; | ||
510 | aper_base = new_aper_base; | ||
511 | } | ||
512 | if (aper_size != new_aper_size || aper_base != new_aper_base) | ||
513 | goto nommu; | ||
514 | } | ||
515 | if (!aper_base) | ||
516 | goto nommu; | ||
517 | info->aper_base = aper_base; | ||
518 | info->aper_size = aper_size>>20; | ||
519 | |||
520 | gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); | ||
521 | gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); | ||
522 | if (!gatt) | ||
523 | panic("Cannot allocate GATT table"); | ||
524 | if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE)) | ||
525 | panic("Could not set GART PTEs to uncacheable pages"); | ||
526 | global_flush_tlb(); | ||
527 | |||
528 | memset(gatt, 0, gatt_size); | ||
529 | agp_gatt_table = gatt; | ||
530 | |||
531 | for (i = 0; i < num_k8_northbridges; i++) { | ||
532 | u32 ctl; | ||
533 | u32 gatt_reg; | ||
534 | |||
535 | dev = k8_northbridges[i]; | ||
536 | gatt_reg = __pa(gatt) >> 12; | ||
537 | gatt_reg <<= 4; | ||
538 | pci_write_config_dword(dev, 0x98, gatt_reg); | ||
539 | pci_read_config_dword(dev, 0x90, &ctl); | ||
540 | |||
541 | ctl |= 1; | ||
542 | ctl &= ~((1<<4) | (1<<5)); | ||
543 | |||
544 | pci_write_config_dword(dev, 0x90, ctl); | ||
545 | } | ||
546 | flush_gart(); | ||
547 | |||
548 | printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); | ||
549 | return 0; | ||
550 | |||
551 | nommu: | ||
552 | /* Should not happen anymore */ | ||
553 | printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" | ||
554 | KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n"); | ||
555 | return -1; | ||
556 | } | ||
557 | |||
558 | extern int agp_amd64_init(void); | ||
559 | |||
560 | static const struct dma_mapping_ops gart_dma_ops = { | ||
561 | .mapping_error = NULL, | ||
562 | .map_single = gart_map_single, | ||
563 | .map_simple = gart_map_simple, | ||
564 | .unmap_single = gart_unmap_single, | ||
565 | .sync_single_for_cpu = NULL, | ||
566 | .sync_single_for_device = NULL, | ||
567 | .sync_single_range_for_cpu = NULL, | ||
568 | .sync_single_range_for_device = NULL, | ||
569 | .sync_sg_for_cpu = NULL, | ||
570 | .sync_sg_for_device = NULL, | ||
571 | .map_sg = gart_map_sg, | ||
572 | .unmap_sg = gart_unmap_sg, | ||
573 | }; | ||
574 | |||
575 | void gart_iommu_shutdown(void) | ||
576 | { | ||
577 | struct pci_dev *dev; | ||
578 | int i; | ||
579 | |||
580 | if (no_agp && (dma_ops != &gart_dma_ops)) | ||
581 | return; | ||
582 | |||
583 | for (i = 0; i < num_k8_northbridges; i++) { | ||
584 | u32 ctl; | ||
585 | |||
586 | dev = k8_northbridges[i]; | ||
587 | pci_read_config_dword(dev, 0x90, &ctl); | ||
588 | |||
589 | ctl &= ~1; | ||
590 | |||
591 | pci_write_config_dword(dev, 0x90, ctl); | ||
592 | } | ||
593 | } | ||
594 | |||
595 | void __init gart_iommu_init(void) | ||
596 | { | ||
597 | struct agp_kern_info info; | ||
598 | unsigned long aper_size; | ||
599 | unsigned long iommu_start; | ||
600 | unsigned long scratch; | ||
601 | long i; | ||
602 | |||
603 | if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { | ||
604 | printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n"); | ||
605 | return; | ||
606 | } | ||
607 | |||
608 | #ifndef CONFIG_AGP_AMD64 | ||
609 | no_agp = 1; | ||
610 | #else | ||
611 | /* Makefile puts PCI initialization via subsys_initcall first. */ | ||
612 | /* Add other K8 AGP bridge drivers here */ | ||
613 | no_agp = no_agp || | ||
614 | (agp_amd64_init() < 0) || | ||
615 | (agp_copy_info(agp_bridge, &info) < 0); | ||
616 | #endif | ||
617 | |||
618 | if (swiotlb) | ||
619 | return; | ||
620 | |||
621 | /* Did we detect a different HW IOMMU? */ | ||
622 | if (iommu_detected && !iommu_aperture) | ||
623 | return; | ||
624 | |||
625 | if (no_iommu || | ||
626 | (!force_iommu && end_pfn <= MAX_DMA32_PFN) || | ||
627 | !iommu_aperture || | ||
628 | (no_agp && init_k8_gatt(&info) < 0)) { | ||
629 | if (end_pfn > MAX_DMA32_PFN) { | ||
630 | printk(KERN_ERR "WARNING more than 4GB of memory " | ||
631 | "but GART IOMMU not available.\n" | ||
632 | KERN_ERR "WARNING 32bit PCI may malfunction.\n"); | ||
633 | } | ||
634 | return; | ||
635 | } | ||
636 | |||
637 | printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); | ||
638 | aper_size = info.aper_size * 1024 * 1024; | ||
639 | iommu_size = check_iommu_size(info.aper_base, aper_size); | ||
640 | iommu_pages = iommu_size >> PAGE_SHIFT; | ||
641 | |||
642 | iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, | ||
643 | get_order(iommu_pages/8)); | ||
644 | if (!iommu_gart_bitmap) | ||
645 | panic("Cannot allocate iommu bitmap\n"); | ||
646 | memset(iommu_gart_bitmap, 0, iommu_pages/8); | ||
647 | |||
648 | #ifdef CONFIG_IOMMU_LEAK | ||
649 | if (leak_trace) { | ||
650 | iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, | ||
651 | get_order(iommu_pages*sizeof(void *))); | ||
652 | if (iommu_leak_tab) | ||
653 | memset(iommu_leak_tab, 0, iommu_pages * 8); | ||
654 | else | ||
655 | printk("PCI-DMA: Cannot allocate leak trace area\n"); | ||
656 | } | ||
657 | #endif | ||
658 | |||
659 | /* | ||
660 | * Out of IOMMU space handling. | ||
661 | * Reserve some invalid pages at the beginning of the GART. | ||
662 | */ | ||
663 | set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); | ||
664 | |||
665 | agp_memory_reserved = iommu_size; | ||
666 | printk(KERN_INFO | ||
667 | "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", | ||
668 | iommu_size>>20); | ||
669 | |||
670 | iommu_start = aper_size - iommu_size; | ||
671 | iommu_bus_base = info.aper_base + iommu_start; | ||
672 | bad_dma_address = iommu_bus_base; | ||
673 | iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); | ||
674 | |||
675 | /* | ||
676 | * Unmap the IOMMU part of the GART. The alias of the page is | ||
677 | * always mapped with cache enabled and there is no full cache | ||
678 | * coherency across the GART remapping. The unmapping avoids | ||
679 | * automatic prefetches from the CPU allocating cache lines in | ||
680 | * there. All CPU accesses are done via the direct mapping to | ||
681 | * the backing memory. The GART address is only used by PCI | ||
682 | * devices. | ||
683 | */ | ||
684 | clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); | ||
685 | |||
686 | /* | ||
687 | * Try to work around a bug (thanks to BenH): | ||
688 | * Set unmapped entries to a scratch page instead of 0. | ||
689 | * Any prefetches that hit unmapped entries won't get a bus abort | ||
690 | * then. | ||
691 | */ | ||
692 | scratch = get_zeroed_page(GFP_KERNEL); | ||
693 | if (!scratch) | ||
694 | panic("Cannot allocate iommu scratch page"); | ||
695 | gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); | ||
696 | for (i = EMERGENCY_PAGES; i < iommu_pages; i++) | ||
697 | iommu_gatt_base[i] = gart_unmapped_entry; | ||
698 | |||
699 | flush_gart(); | ||
700 | dma_ops = &gart_dma_ops; | ||
701 | } | ||
702 | |||
703 | void __init gart_parse_options(char *p) | ||
704 | { | ||
705 | int arg; | ||
706 | |||
707 | #ifdef CONFIG_IOMMU_LEAK | ||
708 | if (!strncmp(p,"leak",4)) { | ||
709 | leak_trace = 1; | ||
710 | p += 4; | ||
711 | if (*p == '=') ++p; | ||
712 | if (isdigit(*p) && get_option(&p, &arg)) | ||
713 | iommu_leak_pages = arg; | ||
714 | } | ||
715 | #endif | ||
716 | if (isdigit(*p) && get_option(&p, &arg)) | ||
717 | iommu_size = arg; | ||
718 | if (!strncmp(p, "fullflush",8)) | ||
719 | iommu_fullflush = 1; | ||
720 | if (!strncmp(p, "nofullflush",11)) | ||
721 | iommu_fullflush = 0; | ||
722 | if (!strncmp(p,"noagp",5)) | ||
723 | no_agp = 1; | ||
724 | if (!strncmp(p, "noaperture",10)) | ||
725 | fix_aperture = 0; | ||
726 | /* duplicated from pci-dma.c */ | ||
727 | if (!strncmp(p,"force",5)) | ||
728 | iommu_aperture_allowed = 1; | ||
729 | if (!strncmp(p,"allowed",7)) | ||
730 | iommu_aperture_allowed = 1; | ||
731 | if (!strncmp(p, "memaper", 7)) { | ||
732 | fallback_aper_force = 1; | ||
733 | p += 7; | ||
734 | if (*p == '=') { | ||
735 | ++p; | ||
736 | if (get_option(&p, &arg)) | ||
737 | fallback_aper_order = arg; | ||
738 | } | ||
739 | } | ||
740 | } | ||
diff --git a/arch/x86/kernel/pci-nommu_64.c b/arch/x86/kernel/pci-nommu_64.c new file mode 100644 index 000000000000..2a34c6c025a9 --- /dev/null +++ b/arch/x86/kernel/pci-nommu_64.c | |||
@@ -0,0 +1,97 @@ | |||
1 | /* Fallback functions when the main IOMMU code is not compiled in. This | ||
2 | code is roughly equivalent to i386. */ | ||
3 | #include <linux/mm.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/pci.h> | ||
6 | #include <linux/string.h> | ||
7 | #include <linux/dma-mapping.h> | ||
8 | |||
9 | #include <asm/iommu.h> | ||
10 | #include <asm/processor.h> | ||
11 | #include <asm/dma.h> | ||
12 | |||
13 | static int | ||
14 | check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) | ||
15 | { | ||
16 | if (hwdev && bus + size > *hwdev->dma_mask) { | ||
17 | if (*hwdev->dma_mask >= DMA_32BIT_MASK) | ||
18 | printk(KERN_ERR | ||
19 | "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", | ||
20 | name, (long long)bus, size, | ||
21 | (long long)*hwdev->dma_mask); | ||
22 | return 0; | ||
23 | } | ||
24 | return 1; | ||
25 | } | ||
26 | |||
27 | static dma_addr_t | ||
28 | nommu_map_single(struct device *hwdev, void *ptr, size_t size, | ||
29 | int direction) | ||
30 | { | ||
31 | dma_addr_t bus = virt_to_bus(ptr); | ||
32 | if (!check_addr("map_single", hwdev, bus, size)) | ||
33 | return bad_dma_address; | ||
34 | return bus; | ||
35 | } | ||
36 | |||
37 | static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size, | ||
38 | int direction) | ||
39 | { | ||
40 | } | ||
41 | |||
42 | /* Map a set of buffers described by scatterlist in streaming | ||
43 | * mode for DMA. This is the scatter-gather version of the | ||
44 | * above pci_map_single interface. Here the scatter gather list | ||
45 | * elements are each tagged with the appropriate dma address | ||
46 | * and length. They are obtained via sg_dma_{address,length}(SG). | ||
47 | * | ||
48 | * NOTE: An implementation may be able to use a smaller number of | ||
49 | * DMA address/length pairs than there are SG table elements. | ||
50 | * (for example via virtual mapping capabilities) | ||
51 | * The routine returns the number of addr/length pairs actually | ||
52 | * used, at most nents. | ||
53 | * | ||
54 | * Device ownership issues as mentioned above for pci_map_single are | ||
55 | * the same here. | ||
56 | */ | ||
57 | static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, | ||
58 | int nents, int direction) | ||
59 | { | ||
60 | int i; | ||
61 | |||
62 | for (i = 0; i < nents; i++ ) { | ||
63 | struct scatterlist *s = &sg[i]; | ||
64 | BUG_ON(!s->page); | ||
65 | s->dma_address = virt_to_bus(page_address(s->page) +s->offset); | ||
66 | if (!check_addr("map_sg", hwdev, s->dma_address, s->length)) | ||
67 | return 0; | ||
68 | s->dma_length = s->length; | ||
69 | } | ||
70 | return nents; | ||
71 | } | ||
72 | |||
73 | /* Unmap a set of streaming mode DMA translations. | ||
74 | * Again, cpu read rules concerning calls here are the same as for | ||
75 | * pci_unmap_single() above. | ||
76 | */ | ||
77 | static void nommu_unmap_sg(struct device *dev, struct scatterlist *sg, | ||
78 | int nents, int dir) | ||
79 | { | ||
80 | } | ||
81 | |||
82 | const struct dma_mapping_ops nommu_dma_ops = { | ||
83 | .map_single = nommu_map_single, | ||
84 | .unmap_single = nommu_unmap_single, | ||
85 | .map_sg = nommu_map_sg, | ||
86 | .unmap_sg = nommu_unmap_sg, | ||
87 | .is_phys = 1, | ||
88 | }; | ||
89 | |||
90 | void __init no_iommu_init(void) | ||
91 | { | ||
92 | if (dma_ops) | ||
93 | return; | ||
94 | |||
95 | force_iommu = 0; /* no HW IOMMU */ | ||
96 | dma_ops = &nommu_dma_ops; | ||
97 | } | ||
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c new file mode 100644 index 000000000000..b2f405ea7c85 --- /dev/null +++ b/arch/x86/kernel/pci-swiotlb_64.c | |||
@@ -0,0 +1,44 @@ | |||
1 | /* Glue code to lib/swiotlb.c */ | ||
2 | |||
3 | #include <linux/pci.h> | ||
4 | #include <linux/cache.h> | ||
5 | #include <linux/module.h> | ||
6 | #include <linux/dma-mapping.h> | ||
7 | |||
8 | #include <asm/iommu.h> | ||
9 | #include <asm/swiotlb.h> | ||
10 | #include <asm/dma.h> | ||
11 | |||
12 | int swiotlb __read_mostly; | ||
13 | EXPORT_SYMBOL(swiotlb); | ||
14 | |||
15 | const struct dma_mapping_ops swiotlb_dma_ops = { | ||
16 | .mapping_error = swiotlb_dma_mapping_error, | ||
17 | .alloc_coherent = swiotlb_alloc_coherent, | ||
18 | .free_coherent = swiotlb_free_coherent, | ||
19 | .map_single = swiotlb_map_single, | ||
20 | .unmap_single = swiotlb_unmap_single, | ||
21 | .sync_single_for_cpu = swiotlb_sync_single_for_cpu, | ||
22 | .sync_single_for_device = swiotlb_sync_single_for_device, | ||
23 | .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, | ||
24 | .sync_single_range_for_device = swiotlb_sync_single_range_for_device, | ||
25 | .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, | ||
26 | .sync_sg_for_device = swiotlb_sync_sg_for_device, | ||
27 | .map_sg = swiotlb_map_sg, | ||
28 | .unmap_sg = swiotlb_unmap_sg, | ||
29 | .dma_supported = NULL, | ||
30 | }; | ||
31 | |||
32 | void __init pci_swiotlb_init(void) | ||
33 | { | ||
34 | /* don't initialize swiotlb if iommu=off (no_iommu=1) */ | ||
35 | if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN) | ||
36 | swiotlb = 1; | ||
37 | if (swiotlb_force) | ||
38 | swiotlb = 1; | ||
39 | if (swiotlb) { | ||
40 | printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); | ||
41 | swiotlb_init(); | ||
42 | dma_ops = &swiotlb_dma_ops; | ||
43 | } | ||
44 | } | ||
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c new file mode 100644 index 000000000000..ae8f91214f15 --- /dev/null +++ b/arch/x86/kernel/pmtimer_64.c | |||
@@ -0,0 +1,69 @@ | |||
1 | /* Ported over from i386 by AK, original copyright was: | ||
2 | * | ||
3 | * (C) Dominik Brodowski <linux@brodo.de> 2003 | ||
4 | * | ||
5 | * Driver to use the Power Management Timer (PMTMR) available in some | ||
6 | * southbridges as primary timing source for the Linux kernel. | ||
7 | * | ||
8 | * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, | ||
9 | * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. | ||
10 | * | ||
11 | * This file is licensed under the GPL v2. | ||
12 | * | ||
13 | * Dropped all the hardware bug workarounds for now. Hopefully they | ||
14 | * are not needed on 64bit chipsets. | ||
15 | */ | ||
16 | |||
17 | #include <linux/jiffies.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/time.h> | ||
20 | #include <linux/init.h> | ||
21 | #include <linux/cpumask.h> | ||
22 | #include <asm/io.h> | ||
23 | #include <asm/proto.h> | ||
24 | #include <asm/msr.h> | ||
25 | #include <asm/vsyscall.h> | ||
26 | |||
27 | #define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ | ||
28 | |||
29 | static inline u32 cyc2us(u32 cycles) | ||
30 | { | ||
31 | /* The Power Management Timer ticks at 3.579545 ticks per microsecond. | ||
32 | * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] | ||
33 | * | ||
34 | * Even with HZ = 100, delta is at maximum 35796 ticks, so it can | ||
35 | * easily be multiplied with 286 (=0x11E) without having to fear | ||
36 | * u32 overflows. | ||
37 | */ | ||
38 | cycles *= 286; | ||
39 | return (cycles >> 10); | ||
40 | } | ||
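A quick numeric check of the 286/1024 approximation described in the comment above (standalone userspace sketch):

	#include <stdio.h>

	int main(void)
	{
		double exact  = 1.0 / 3.579545;	/* microseconds per PM timer tick */
		double approx = 286.0 / 1024.0;	/* the multiply-and-shift used above */

		printf("exact=%.7f approx=%.7f error=%.4f%%\n",
		       exact, approx, (exact - approx) / exact * 100.0);
		/* error comes out to about 0.024%, matching the comment; the worst
		 * case delta of 35796 ticks * 286 = 10,237,656 still fits in a u32 */
		return 0;
	}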
41 | |||
42 | static unsigned pmtimer_wait_tick(void) | ||
43 | { | ||
44 | u32 a, b; | ||
45 | for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK; | ||
46 | a == b; | ||
47 | b = inl(pmtmr_ioport) & ACPI_PM_MASK) | ||
48 | cpu_relax(); | ||
49 | return b; | ||
50 | } | ||
51 | |||
52 | /* note: wait time is rounded up to one tick */ | ||
53 | void pmtimer_wait(unsigned us) | ||
54 | { | ||
55 | u32 a, b; | ||
56 | a = pmtimer_wait_tick(); | ||
57 | do { | ||
58 | b = inl(pmtmr_ioport); | ||
59 | cpu_relax(); | ||
60 | } while (cyc2us(b - a) < us); | ||
61 | } | ||
62 | |||
63 | static int __init nopmtimer_setup(char *s) | ||
64 | { | ||
65 | pmtmr_ioport = 0; | ||
66 | return 1; | ||
67 | } | ||
68 | |||
69 | __setup("nopmtimer", nopmtimer_setup); | ||
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c new file mode 100644 index 000000000000..98956555450b --- /dev/null +++ b/arch/x86/kernel/process_64.c | |||
@@ -0,0 +1,903 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/kernel/process.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * | ||
6 | * Pentium III FXSR, SSE support | ||
7 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
8 | * | ||
9 | * X86-64 port | ||
10 | * Andi Kleen. | ||
11 | * | ||
12 | * CPU hotplug support - ashok.raj@intel.com | ||
13 | */ | ||
14 | |||
15 | /* | ||
16 | * This file handles the architecture-dependent parts of process handling.. | ||
17 | */ | ||
18 | |||
19 | #include <stdarg.h> | ||
20 | |||
21 | #include <linux/cpu.h> | ||
22 | #include <linux/errno.h> | ||
23 | #include <linux/sched.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/mm.h> | ||
26 | #include <linux/fs.h> | ||
27 | #include <linux/elfcore.h> | ||
28 | #include <linux/smp.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/user.h> | ||
31 | #include <linux/module.h> | ||
32 | #include <linux/a.out.h> | ||
33 | #include <linux/interrupt.h> | ||
34 | #include <linux/delay.h> | ||
35 | #include <linux/ptrace.h> | ||
36 | #include <linux/utsname.h> | ||
37 | #include <linux/random.h> | ||
38 | #include <linux/notifier.h> | ||
39 | #include <linux/kprobes.h> | ||
40 | #include <linux/kdebug.h> | ||
41 | |||
42 | #include <asm/uaccess.h> | ||
43 | #include <asm/pgtable.h> | ||
44 | #include <asm/system.h> | ||
45 | #include <asm/io.h> | ||
46 | #include <asm/processor.h> | ||
47 | #include <asm/i387.h> | ||
48 | #include <asm/mmu_context.h> | ||
49 | #include <asm/pda.h> | ||
50 | #include <asm/prctl.h> | ||
51 | #include <asm/desc.h> | ||
52 | #include <asm/proto.h> | ||
53 | #include <asm/ia32.h> | ||
54 | #include <asm/idle.h> | ||
55 | |||
56 | asmlinkage extern void ret_from_fork(void); | ||
57 | |||
58 | unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; | ||
59 | |||
60 | unsigned long boot_option_idle_override = 0; | ||
61 | EXPORT_SYMBOL(boot_option_idle_override); | ||
62 | |||
63 | /* | ||
64 | * Power management idle function, if any.. | ||
65 | */ | ||
66 | void (*pm_idle)(void); | ||
67 | EXPORT_SYMBOL(pm_idle); | ||
68 | static DEFINE_PER_CPU(unsigned int, cpu_idle_state); | ||
69 | |||
70 | static ATOMIC_NOTIFIER_HEAD(idle_notifier); | ||
71 | |||
72 | void idle_notifier_register(struct notifier_block *n) | ||
73 | { | ||
74 | atomic_notifier_chain_register(&idle_notifier, n); | ||
75 | } | ||
76 | EXPORT_SYMBOL_GPL(idle_notifier_register); | ||
77 | |||
78 | void idle_notifier_unregister(struct notifier_block *n) | ||
79 | { | ||
80 | atomic_notifier_chain_unregister(&idle_notifier, n); | ||
81 | } | ||
82 | EXPORT_SYMBOL(idle_notifier_unregister); | ||
83 | |||
84 | void enter_idle(void) | ||
85 | { | ||
86 | write_pda(isidle, 1); | ||
87 | atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); | ||
88 | } | ||
89 | |||
90 | static void __exit_idle(void) | ||
91 | { | ||
92 | if (test_and_clear_bit_pda(0, isidle) == 0) | ||
93 | return; | ||
94 | atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); | ||
95 | } | ||
96 | |||
97 | /* Called from interrupts to signify idle end */ | ||
98 | void exit_idle(void) | ||
99 | { | ||
100 | /* idle loop has pid 0 */ | ||
101 | if (current->pid) | ||
102 | return; | ||
103 | __exit_idle(); | ||
104 | } | ||
105 | |||
106 | /* | ||
107 | * We use this if we don't have any better | ||
108 | * idle routine.. | ||
109 | */ | ||
110 | static void default_idle(void) | ||
111 | { | ||
112 | current_thread_info()->status &= ~TS_POLLING; | ||
113 | /* | ||
114 | * TS_POLLING-cleared state must be visible before we | ||
115 | * test NEED_RESCHED: | ||
116 | */ | ||
117 | smp_mb(); | ||
118 | local_irq_disable(); | ||
119 | if (!need_resched()) { | ||
120 | /* Enables interrupts one instruction before HLT. | ||
121 | x86 special cases this so there is no race. */ | ||
122 | safe_halt(); | ||
123 | } else | ||
124 | local_irq_enable(); | ||
125 | current_thread_info()->status |= TS_POLLING; | ||
126 | } | ||
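/*
 * Illustrative note: safe_halt() is essentially "sti; hlt".  STI keeps
 * interrupts masked for one more instruction (the STI shadow), so an
 * interrupt that sets NEED_RESCHED after the need_resched() test above can
 * only be delivered once HLT is executing, where it wakes the CPU
 * immediately; that is the race the comment above refers to.
 */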
127 | |||
128 | /* | ||
129 | * On SMP it's slightly faster (but much more power-consuming!) | ||
130 | * to poll the ->need_resched flag instead of waiting for the | ||
131 | * cross-CPU IPI to arrive. Use this option with caution. | ||
132 | */ | ||
133 | static void poll_idle (void) | ||
134 | { | ||
135 | local_irq_enable(); | ||
136 | cpu_relax(); | ||
137 | } | ||
138 | |||
139 | void cpu_idle_wait(void) | ||
140 | { | ||
141 | unsigned int cpu, this_cpu = get_cpu(); | ||
142 | cpumask_t map, tmp = current->cpus_allowed; | ||
143 | |||
144 | set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); | ||
145 | put_cpu(); | ||
146 | |||
147 | cpus_clear(map); | ||
148 | for_each_online_cpu(cpu) { | ||
149 | per_cpu(cpu_idle_state, cpu) = 1; | ||
150 | cpu_set(cpu, map); | ||
151 | } | ||
152 | |||
153 | __get_cpu_var(cpu_idle_state) = 0; | ||
154 | |||
155 | wmb(); | ||
156 | do { | ||
157 | ssleep(1); | ||
158 | for_each_online_cpu(cpu) { | ||
159 | if (cpu_isset(cpu, map) && | ||
160 | !per_cpu(cpu_idle_state, cpu)) | ||
161 | cpu_clear(cpu, map); | ||
162 | } | ||
163 | cpus_and(map, map, cpu_online_map); | ||
164 | } while (!cpus_empty(map)); | ||
165 | |||
166 | set_cpus_allowed(current, tmp); | ||
167 | } | ||
168 | EXPORT_SYMBOL_GPL(cpu_idle_wait); | ||
169 | |||
170 | #ifdef CONFIG_HOTPLUG_CPU | ||
171 | DECLARE_PER_CPU(int, cpu_state); | ||
172 | |||
173 | #include <asm/nmi.h> | ||
174 | /* We halt the CPU with physical CPU hotplug */ | ||
175 | static inline void play_dead(void) | ||
176 | { | ||
177 | idle_task_exit(); | ||
178 | wbinvd(); | ||
179 | mb(); | ||
180 | /* Ack it */ | ||
181 | __get_cpu_var(cpu_state) = CPU_DEAD; | ||
182 | |||
183 | local_irq_disable(); | ||
184 | while (1) | ||
185 | halt(); | ||
186 | } | ||
187 | #else | ||
188 | static inline void play_dead(void) | ||
189 | { | ||
190 | BUG(); | ||
191 | } | ||
192 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
193 | |||
194 | /* | ||
195 | * The idle thread. There's no useful work to be | ||
196 | * done, so just try to conserve power and have a | ||
197 | * low exit latency (ie sit in a loop waiting for | ||
198 | * somebody to say that they'd like to reschedule) | ||
199 | */ | ||
200 | void cpu_idle (void) | ||
201 | { | ||
202 | current_thread_info()->status |= TS_POLLING; | ||
203 | /* endless idle loop with no priority at all */ | ||
204 | while (1) { | ||
205 | while (!need_resched()) { | ||
206 | void (*idle)(void); | ||
207 | |||
208 | if (__get_cpu_var(cpu_idle_state)) | ||
209 | __get_cpu_var(cpu_idle_state) = 0; | ||
210 | |||
211 | rmb(); | ||
212 | idle = pm_idle; | ||
213 | if (!idle) | ||
214 | idle = default_idle; | ||
215 | if (cpu_is_offline(smp_processor_id())) | ||
216 | play_dead(); | ||
217 | /* | ||
218 | * Idle routines should keep interrupts disabled | ||
219 | * from here on, until they go to idle. | ||
220 | * Otherwise, idle callbacks can misfire. | ||
221 | */ | ||
222 | local_irq_disable(); | ||
223 | enter_idle(); | ||
224 | idle(); | ||
225 | /* In many cases the interrupt that ended idle | ||
226 | has already called exit_idle. But some idle | ||
227 | loops can be woken up without interrupt. */ | ||
228 | __exit_idle(); | ||
229 | } | ||
230 | |||
231 | preempt_enable_no_resched(); | ||
232 | schedule(); | ||
233 | preempt_disable(); | ||
234 | } | ||
235 | } | ||
236 | |||
237 | /* | ||
238 | * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, | ||
239 | * which can obviate IPI to trigger checking of need_resched. | ||
240 | * We execute MONITOR against need_resched and enter optimized wait state | ||
241 | * through MWAIT. Whenever someone changes need_resched, we would be woken | ||
242 | * up from MWAIT (without an IPI). | ||
243 | * | ||
244 | * New with Core Duo processors, MWAIT can take some hints based on CPU | ||
245 | * capability. | ||
246 | */ | ||
247 | void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) | ||
248 | { | ||
249 | if (!need_resched()) { | ||
250 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | ||
251 | smp_mb(); | ||
252 | if (!need_resched()) | ||
253 | __mwait(eax, ecx); | ||
254 | } | ||
255 | } | ||
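/*
 * Illustrative note: MONITOR arms a write monitor on the cache line holding
 * current_thread_info()->flags, and MWAIT then idles until that line is
 * written (for example a remote CPU setting TIF_NEED_RESCHED) or an
 * interrupt arrives.  The second need_resched() test between MONITOR and
 * MWAIT catches a wakeup that happened before the monitor was armed.
 */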
256 | |||
257 | /* Default MONITOR/MWAIT with no hints, used for default C1 state */ | ||
258 | static void mwait_idle(void) | ||
259 | { | ||
260 | if (!need_resched()) { | ||
261 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | ||
262 | smp_mb(); | ||
263 | if (!need_resched()) | ||
264 | __sti_mwait(0, 0); | ||
265 | else | ||
266 | local_irq_enable(); | ||
267 | } else { | ||
268 | local_irq_enable(); | ||
269 | } | ||
270 | } | ||
271 | |||
272 | void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) | ||
273 | { | ||
274 | static int printed; | ||
275 | if (cpu_has(c, X86_FEATURE_MWAIT)) { | ||
276 | /* | ||
277 | * Skip, if setup has overridden idle. | ||
278 | * One CPU supports mwait => all CPUs support mwait | ||
279 | */ | ||
280 | if (!pm_idle) { | ||
281 | if (!printed) { | ||
282 | printk(KERN_INFO "using mwait in idle threads.\n"); | ||
283 | printed = 1; | ||
284 | } | ||
285 | pm_idle = mwait_idle; | ||
286 | } | ||
287 | } | ||
288 | } | ||
289 | |||
290 | static int __init idle_setup (char *str) | ||
291 | { | ||
292 | if (!strcmp(str, "poll")) { | ||
293 | printk("using polling idle threads.\n"); | ||
294 | pm_idle = poll_idle; | ||
295 | } else if (!strcmp(str, "mwait")) | ||
296 | force_mwait = 1; | ||
297 | else | ||
298 | return -1; | ||
299 | |||
300 | boot_option_idle_override = 1; | ||
301 | return 0; | ||
302 | } | ||
303 | early_param("idle", idle_setup); | ||
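/*
 * Usage note (illustrative): booting with "idle=poll" installs poll_idle()
 * as pm_idle, "idle=mwait" sets force_mwait, and anything else is rejected
 * (idle_setup() returns -1); either accepted value also sets
 * boot_option_idle_override.
 */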
304 | |||
305 | /* Prints also some state that isn't saved in the pt_regs */ | ||
306 | void __show_regs(struct pt_regs * regs) | ||
307 | { | ||
308 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; | ||
309 | unsigned long d0, d1, d2, d3, d6, d7; | ||
310 | unsigned int fsindex,gsindex; | ||
311 | unsigned int ds,cs,es; | ||
312 | |||
313 | printk("\n"); | ||
314 | print_modules(); | ||
315 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", | ||
316 | current->pid, current->comm, print_tainted(), | ||
317 | init_utsname()->release, | ||
318 | (int)strcspn(init_utsname()->version, " "), | ||
319 | init_utsname()->version); | ||
320 | printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); | ||
321 | printk_address(regs->rip); | ||
322 | printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, | ||
323 | regs->eflags); | ||
324 | printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", | ||
325 | regs->rax, regs->rbx, regs->rcx); | ||
326 | printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", | ||
327 | regs->rdx, regs->rsi, regs->rdi); | ||
328 | printk("RBP: %016lx R08: %016lx R09: %016lx\n", | ||
329 | regs->rbp, regs->r8, regs->r9); | ||
330 | printk("R10: %016lx R11: %016lx R12: %016lx\n", | ||
331 | regs->r10, regs->r11, regs->r12); | ||
332 | printk("R13: %016lx R14: %016lx R15: %016lx\n", | ||
333 | regs->r13, regs->r14, regs->r15); | ||
334 | |||
335 | asm("movl %%ds,%0" : "=r" (ds)); | ||
336 | asm("movl %%cs,%0" : "=r" (cs)); | ||
337 | asm("movl %%es,%0" : "=r" (es)); | ||
338 | asm("movl %%fs,%0" : "=r" (fsindex)); | ||
339 | asm("movl %%gs,%0" : "=r" (gsindex)); | ||
340 | |||
341 | rdmsrl(MSR_FS_BASE, fs); | ||
342 | rdmsrl(MSR_GS_BASE, gs); | ||
343 | rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); | ||
344 | |||
345 | cr0 = read_cr0(); | ||
346 | cr2 = read_cr2(); | ||
347 | cr3 = read_cr3(); | ||
348 | cr4 = read_cr4(); | ||
349 | |||
350 | printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", | ||
351 | fs,fsindex,gs,gsindex,shadowgs); | ||
352 | printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); | ||
353 | printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); | ||
354 | |||
355 | get_debugreg(d0, 0); | ||
356 | get_debugreg(d1, 1); | ||
357 | get_debugreg(d2, 2); | ||
358 | printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); | ||
359 | get_debugreg(d3, 3); | ||
360 | get_debugreg(d6, 6); | ||
361 | get_debugreg(d7, 7); | ||
362 | printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); | ||
363 | } | ||
364 | |||
365 | void show_regs(struct pt_regs *regs) | ||
366 | { | ||
367 | printk("CPU %d:", smp_processor_id()); | ||
368 | __show_regs(regs); | ||
369 | show_trace(NULL, regs, (void *)(regs + 1)); | ||
370 | } | ||
371 | |||
372 | /* | ||
373 | * Free current thread data structures etc.. | ||
374 | */ | ||
375 | void exit_thread(void) | ||
376 | { | ||
377 | struct task_struct *me = current; | ||
378 | struct thread_struct *t = &me->thread; | ||
379 | |||
380 | if (me->thread.io_bitmap_ptr) { | ||
381 | struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); | ||
382 | |||
383 | kfree(t->io_bitmap_ptr); | ||
384 | t->io_bitmap_ptr = NULL; | ||
385 | clear_thread_flag(TIF_IO_BITMAP); | ||
386 | /* | ||
387 | * Careful, clear this in the TSS too: | ||
388 | */ | ||
389 | memset(tss->io_bitmap, 0xff, t->io_bitmap_max); | ||
390 | t->io_bitmap_max = 0; | ||
391 | put_cpu(); | ||
392 | } | ||
393 | } | ||
394 | |||
395 | void flush_thread(void) | ||
396 | { | ||
397 | struct task_struct *tsk = current; | ||
398 | |||
399 | if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { | ||
400 | clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); | ||
401 | if (test_tsk_thread_flag(tsk, TIF_IA32)) { | ||
402 | clear_tsk_thread_flag(tsk, TIF_IA32); | ||
403 | } else { | ||
404 | set_tsk_thread_flag(tsk, TIF_IA32); | ||
405 | current_thread_info()->status |= TS_COMPAT; | ||
406 | } | ||
407 | } | ||
408 | clear_tsk_thread_flag(tsk, TIF_DEBUG); | ||
409 | |||
410 | tsk->thread.debugreg0 = 0; | ||
411 | tsk->thread.debugreg1 = 0; | ||
412 | tsk->thread.debugreg2 = 0; | ||
413 | tsk->thread.debugreg3 = 0; | ||
414 | tsk->thread.debugreg6 = 0; | ||
415 | tsk->thread.debugreg7 = 0; | ||
416 | memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); | ||
417 | /* | ||
418 | * Forget coprocessor state.. | ||
419 | */ | ||
420 | clear_fpu(tsk); | ||
421 | clear_used_math(); | ||
422 | } | ||
423 | |||
424 | void release_thread(struct task_struct *dead_task) | ||
425 | { | ||
426 | if (dead_task->mm) { | ||
427 | if (dead_task->mm->context.size) { | ||
428 | printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", | ||
429 | dead_task->comm, | ||
430 | dead_task->mm->context.ldt, | ||
431 | dead_task->mm->context.size); | ||
432 | BUG(); | ||
433 | } | ||
434 | } | ||
435 | } | ||
436 | |||
437 | static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) | ||
438 | { | ||
439 | struct user_desc ud = { | ||
440 | .base_addr = addr, | ||
441 | .limit = 0xfffff, | ||
442 | .seg_32bit = 1, | ||
443 | .limit_in_pages = 1, | ||
444 | .useable = 1, | ||
445 | }; | ||
446 | struct n_desc_struct *desc = (void *)t->thread.tls_array; | ||
447 | desc += tls; | ||
448 | desc->a = LDT_entry_a(&ud); | ||
449 | desc->b = LDT_entry_b(&ud); | ||
450 | } | ||
451 | |||
452 | static inline u32 read_32bit_tls(struct task_struct *t, int tls) | ||
453 | { | ||
454 | struct desc_struct *desc = (void *)t->thread.tls_array; | ||
455 | desc += tls; | ||
456 | return desc->base0 | | ||
457 | (((u32)desc->base1) << 16) | | ||
458 | (((u32)desc->base2) << 24); | ||
459 | } | ||
460 | |||
461 | /* | ||
462 | * This gets called before we allocate a new thread and copy | ||
463 | * the current task into it. | ||
464 | */ | ||
465 | void prepare_to_copy(struct task_struct *tsk) | ||
466 | { | ||
467 | unlazy_fpu(tsk); | ||
468 | } | ||
469 | |||
470 | int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, | ||
471 | unsigned long unused, | ||
472 | struct task_struct * p, struct pt_regs * regs) | ||
473 | { | ||
474 | int err; | ||
475 | struct pt_regs * childregs; | ||
476 | struct task_struct *me = current; | ||
477 | |||
478 | childregs = ((struct pt_regs *) | ||
479 | (THREAD_SIZE + task_stack_page(p))) - 1; | ||
480 | *childregs = *regs; | ||
481 | |||
482 | childregs->rax = 0; | ||
483 | childregs->rsp = rsp; | ||
484 | if (rsp == ~0UL) | ||
485 | childregs->rsp = (unsigned long)childregs; | ||
486 | |||
487 | p->thread.rsp = (unsigned long) childregs; | ||
488 | p->thread.rsp0 = (unsigned long) (childregs+1); | ||
489 | p->thread.userrsp = me->thread.userrsp; | ||
490 | |||
491 | set_tsk_thread_flag(p, TIF_FORK); | ||
492 | |||
493 | p->thread.fs = me->thread.fs; | ||
494 | p->thread.gs = me->thread.gs; | ||
495 | |||
496 | asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); | ||
497 | asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); | ||
498 | asm("mov %%es,%0" : "=m" (p->thread.es)); | ||
499 | asm("mov %%ds,%0" : "=m" (p->thread.ds)); | ||
500 | |||
501 | if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { | ||
502 | p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | ||
503 | if (!p->thread.io_bitmap_ptr) { | ||
504 | p->thread.io_bitmap_max = 0; | ||
505 | return -ENOMEM; | ||
506 | } | ||
507 | memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, | ||
508 | IO_BITMAP_BYTES); | ||
509 | set_tsk_thread_flag(p, TIF_IO_BITMAP); | ||
510 | } | ||
511 | |||
512 | /* | ||
513 | * Set a new TLS for the child thread? | ||
514 | */ | ||
515 | if (clone_flags & CLONE_SETTLS) { | ||
516 | #ifdef CONFIG_IA32_EMULATION | ||
517 | if (test_thread_flag(TIF_IA32)) | ||
518 | err = ia32_child_tls(p, childregs); | ||
519 | else | ||
520 | #endif | ||
521 | err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); | ||
522 | if (err) | ||
523 | goto out; | ||
524 | } | ||
525 | err = 0; | ||
526 | out: | ||
527 | if (err && p->thread.io_bitmap_ptr) { | ||
528 | kfree(p->thread.io_bitmap_ptr); | ||
529 | p->thread.io_bitmap_max = 0; | ||
530 | } | ||
531 | return err; | ||
532 | } | ||
533 | |||
534 | /* | ||
535 | * This special macro can be used to load a debugging register | ||
536 | */ | ||
537 | #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) | ||
538 | |||
539 | static inline void __switch_to_xtra(struct task_struct *prev_p, | ||
540 | struct task_struct *next_p, | ||
541 | struct tss_struct *tss) | ||
542 | { | ||
543 | struct thread_struct *prev, *next; | ||
544 | |||
545 | prev = &prev_p->thread, | ||
546 | next = &next_p->thread; | ||
547 | |||
548 | if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { | ||
549 | loaddebug(next, 0); | ||
550 | loaddebug(next, 1); | ||
551 | loaddebug(next, 2); | ||
552 | loaddebug(next, 3); | ||
553 | /* no 4 and 5 */ | ||
554 | loaddebug(next, 6); | ||
555 | loaddebug(next, 7); | ||
556 | } | ||
557 | |||
558 | if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { | ||
559 | /* | ||
560 | * Copy the relevant range of the IO bitmap. | ||
561 | * Normally this is 128 bytes or less: | ||
562 | */ | ||
563 | memcpy(tss->io_bitmap, next->io_bitmap_ptr, | ||
564 | max(prev->io_bitmap_max, next->io_bitmap_max)); | ||
565 | } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { | ||
566 | /* | ||
567 | * Clear any possible leftover bits: | ||
568 | */ | ||
569 | memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); | ||
570 | } | ||
571 | } | ||
572 | |||
573 | /* | ||
574 | * switch_to(x,y) should switch tasks from x to y. | ||
575 | * | ||
576 | * This could still be optimized: | ||
577 | * - fold all the options into a flag word and test it with a single test. | ||
578 | * - could test fs/gs bitsliced | ||
579 | * | ||
580 | * Kprobes not supported here. Set the probe on schedule instead. | ||
581 | */ | ||
582 | __kprobes struct task_struct * | ||
583 | __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | ||
584 | { | ||
585 | struct thread_struct *prev = &prev_p->thread, | ||
586 | *next = &next_p->thread; | ||
587 | int cpu = smp_processor_id(); | ||
588 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | ||
589 | |||
590 | /* we're going to use this soon, after a few expensive things */ | ||
591 | if (next_p->fpu_counter>5) | ||
592 | prefetch(&next->i387.fxsave); | ||
593 | |||
594 | /* | ||
595 | * Reload esp0, LDT and the page table pointer: | ||
596 | */ | ||
597 | tss->rsp0 = next->rsp0; | ||
598 | |||
599 | /* | ||
600 | * Switch DS and ES. | ||
601 | * This won't pick up thread selector changes, but I guess that is ok. | ||
602 | */ | ||
603 | asm volatile("mov %%es,%0" : "=m" (prev->es)); | ||
604 | if (unlikely(next->es | prev->es)) | ||
605 | loadsegment(es, next->es); | ||
606 | |||
607 | asm volatile ("mov %%ds,%0" : "=m" (prev->ds)); | ||
608 | if (unlikely(next->ds | prev->ds)) | ||
609 | loadsegment(ds, next->ds); | ||
610 | |||
611 | load_TLS(next, cpu); | ||
612 | |||
613 | /* | ||
614 | * Switch FS and GS. | ||
615 | */ | ||
616 | { | ||
617 | unsigned fsindex; | ||
618 | asm volatile("movl %%fs,%0" : "=r" (fsindex)); | ||
619 | /* segment register != 0 always requires a reload. | ||
620 | also reload when it has changed. | ||
621 | when prev process used 64bit base always reload | ||
622 | to avoid an information leak. */ | ||
623 | if (unlikely(fsindex | next->fsindex | prev->fs)) { | ||
624 | loadsegment(fs, next->fsindex); | ||
625 | /* check if the user used a selector != 0 | ||
626 | * if yes clear 64bit base, since overloaded base | ||
627 | * is always mapped to the Null selector | ||
628 | */ | ||
629 | if (fsindex) | ||
630 | prev->fs = 0; | ||
631 | } | ||
632 | /* when next process has a 64bit base use it */ | ||
633 | if (next->fs) | ||
634 | wrmsrl(MSR_FS_BASE, next->fs); | ||
635 | prev->fsindex = fsindex; | ||
636 | } | ||
637 | { | ||
638 | unsigned gsindex; | ||
639 | asm volatile("movl %%gs,%0" : "=r" (gsindex)); | ||
640 | if (unlikely(gsindex | next->gsindex | prev->gs)) { | ||
641 | load_gs_index(next->gsindex); | ||
642 | if (gsindex) | ||
643 | prev->gs = 0; | ||
644 | } | ||
645 | if (next->gs) | ||
646 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); | ||
647 | prev->gsindex = gsindex; | ||
648 | } | ||
649 | |||
650 | /* Must be after DS reload */ | ||
651 | unlazy_fpu(prev_p); | ||
652 | |||
653 | /* | ||
654 | * Switch the PDA and FPU contexts. | ||
655 | */ | ||
656 | prev->userrsp = read_pda(oldrsp); | ||
657 | write_pda(oldrsp, next->userrsp); | ||
658 | write_pda(pcurrent, next_p); | ||
659 | |||
660 | write_pda(kernelstack, | ||
661 | (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); | ||
662 | #ifdef CONFIG_CC_STACKPROTECTOR | ||
663 | write_pda(stack_canary, next_p->stack_canary); | ||
664 | /* | ||
665 | * Build time only check to make sure the stack_canary is at | ||
666 | * offset 40 in the pda; this is a gcc ABI requirement | ||
667 | */ | ||
668 | BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40); | ||
669 | #endif | ||
670 | |||
671 | /* | ||
672 | * Now maybe reload the debug registers and handle I/O bitmaps | ||
673 | */ | ||
674 | if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) | ||
675 | || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) | ||
676 | __switch_to_xtra(prev_p, next_p, tss); | ||
677 | |||
678 | /* If the task has used fpu the last 5 timeslices, just do a full | ||
679 | * restore of the math state immediately to avoid the trap; the | ||
680 | * chances of needing FPU soon are obviously high now | ||
681 | */ | ||
682 | if (next_p->fpu_counter>5) | ||
683 | math_state_restore(); | ||
684 | return prev_p; | ||
685 | } | ||
686 | |||
687 | /* | ||
688 | * sys_execve() executes a new program. | ||
689 | */ | ||
690 | asmlinkage | ||
691 | long sys_execve(char __user *name, char __user * __user *argv, | ||
692 | char __user * __user *envp, struct pt_regs regs) | ||
693 | { | ||
694 | long error; | ||
695 | char * filename; | ||
696 | |||
697 | filename = getname(name); | ||
698 | error = PTR_ERR(filename); | ||
699 | if (IS_ERR(filename)) | ||
700 | return error; | ||
701 | error = do_execve(filename, argv, envp, ®s); | ||
702 | if (error == 0) { | ||
703 | task_lock(current); | ||
704 | current->ptrace &= ~PT_DTRACE; | ||
705 | task_unlock(current); | ||
706 | } | ||
707 | putname(filename); | ||
708 | return error; | ||
709 | } | ||
710 | |||
711 | void set_personality_64bit(void) | ||
712 | { | ||
713 | /* inherit personality from parent */ | ||
714 | |||
715 | /* Make sure to be in 64bit mode */ | ||
716 | clear_thread_flag(TIF_IA32); | ||
717 | |||
718 | /* TBD: overwrites user setup. Should have two bits. | ||
719 | But 64bit processes have always behaved this way, | ||
720 | so it's not too bad. The main problem is just that | ||
721 | 32bit children are affected again. */ | ||
722 | current->personality &= ~READ_IMPLIES_EXEC; | ||
723 | } | ||
724 | |||
725 | asmlinkage long sys_fork(struct pt_regs *regs) | ||
726 | { | ||
727 | return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); | ||
728 | } | ||
729 | |||
730 | asmlinkage long | ||
731 | sys_clone(unsigned long clone_flags, unsigned long newsp, | ||
732 | void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) | ||
733 | { | ||
734 | if (!newsp) | ||
735 | newsp = regs->rsp; | ||
736 | return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); | ||
737 | } | ||
738 | |||
739 | /* | ||
740 | * This is trivial, and on the face of it looks like it | ||
741 | * could equally well be done in user mode. | ||
742 | * | ||
743 | * Not so, for quite unobvious reasons - register pressure. | ||
744 | * In user mode vfork() cannot have a stack frame, and if | ||
745 | * done by calling the "clone()" system call directly, you | ||
746 | * do not have enough call-clobbered registers to hold all | ||
747 | * the information you need. | ||
748 | */ | ||
749 | asmlinkage long sys_vfork(struct pt_regs *regs) | ||
750 | { | ||
751 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, | ||
752 | NULL, NULL); | ||
753 | } | ||
754 | |||
755 | unsigned long get_wchan(struct task_struct *p) | ||
756 | { | ||
757 | unsigned long stack; | ||
758 | u64 fp,rip; | ||
759 | int count = 0; | ||
760 | |||
761 | if (!p || p == current || p->state==TASK_RUNNING) | ||
762 | return 0; | ||
763 | stack = (unsigned long)task_stack_page(p); | ||
764 | if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) | ||
765 | return 0; | ||
766 | fp = *(u64 *)(p->thread.rsp); | ||
767 | do { | ||
768 | if (fp < (unsigned long)stack || | ||
769 | fp > (unsigned long)stack+THREAD_SIZE) | ||
770 | return 0; | ||
771 | rip = *(u64 *)(fp+8); | ||
772 | if (!in_sched_functions(rip)) | ||
773 | return rip; | ||
774 | fp = *(u64 *)fp; | ||
775 | } while (count++ < 16); | ||
776 | return 0; | ||
777 | } | ||
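/*
 * Illustrative note: the walk above assumes frame pointers, i.e. each frame
 * begins with "push %rbp; mov %rsp,%rbp", so *fp is the caller's frame
 * pointer and *(fp + 8) is the return address; the loop gives up after 16
 * frames or as soon as a frame leaves the task's stack area.
 */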
778 | |||
779 | long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) | ||
780 | { | ||
781 | int ret = 0; | ||
782 | int doit = task == current; | ||
783 | int cpu; | ||
784 | |||
785 | switch (code) { | ||
786 | case ARCH_SET_GS: | ||
787 | if (addr >= TASK_SIZE_OF(task)) | ||
788 | return -EPERM; | ||
789 | cpu = get_cpu(); | ||
790 | /* handle small bases via the GDT because that's faster to | ||
791 | switch. */ | ||
792 | if (addr <= 0xffffffff) { | ||
793 | set_32bit_tls(task, GS_TLS, addr); | ||
794 | if (doit) { | ||
795 | load_TLS(&task->thread, cpu); | ||
796 | load_gs_index(GS_TLS_SEL); | ||
797 | } | ||
798 | task->thread.gsindex = GS_TLS_SEL; | ||
799 | task->thread.gs = 0; | ||
800 | } else { | ||
801 | task->thread.gsindex = 0; | ||
802 | task->thread.gs = addr; | ||
803 | if (doit) { | ||
804 | load_gs_index(0); | ||
805 | ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); | ||
806 | } | ||
807 | } | ||
808 | put_cpu(); | ||
809 | break; | ||
810 | case ARCH_SET_FS: | ||
811 | /* Not strictly needed for fs, but do it for symmetry | ||
812 | with gs */ | ||
813 | if (addr >= TASK_SIZE_OF(task)) | ||
814 | return -EPERM; | ||
815 | cpu = get_cpu(); | ||
816 | /* handle small bases via the GDT because that's faster to | ||
817 | switch. */ | ||
818 | if (addr <= 0xffffffff) { | ||
819 | set_32bit_tls(task, FS_TLS, addr); | ||
820 | if (doit) { | ||
821 | load_TLS(&task->thread, cpu); | ||
822 | asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); | ||
823 | } | ||
824 | task->thread.fsindex = FS_TLS_SEL; | ||
825 | task->thread.fs = 0; | ||
826 | } else { | ||
827 | task->thread.fsindex = 0; | ||
828 | task->thread.fs = addr; | ||
829 | if (doit) { | ||
830 | /* set the selector to 0 to not confuse | ||
831 | __switch_to */ | ||
832 | asm volatile("movl %0,%%fs" :: "r" (0)); | ||
833 | ret = checking_wrmsrl(MSR_FS_BASE, addr); | ||
834 | } | ||
835 | } | ||
836 | put_cpu(); | ||
837 | break; | ||
838 | case ARCH_GET_FS: { | ||
839 | unsigned long base; | ||
840 | if (task->thread.fsindex == FS_TLS_SEL) | ||
841 | base = read_32bit_tls(task, FS_TLS); | ||
842 | else if (doit) | ||
843 | rdmsrl(MSR_FS_BASE, base); | ||
844 | else | ||
845 | base = task->thread.fs; | ||
846 | ret = put_user(base, (unsigned long __user *)addr); | ||
847 | break; | ||
848 | } | ||
849 | case ARCH_GET_GS: { | ||
850 | unsigned long base; | ||
851 | unsigned gsindex; | ||
852 | if (task->thread.gsindex == GS_TLS_SEL) | ||
853 | base = read_32bit_tls(task, GS_TLS); | ||
854 | else if (doit) { | ||
855 | asm("movl %%gs,%0" : "=r" (gsindex)); | ||
856 | if (gsindex) | ||
857 | rdmsrl(MSR_KERNEL_GS_BASE, base); | ||
858 | else | ||
859 | base = task->thread.gs; | ||
860 | } | ||
861 | else | ||
862 | base = task->thread.gs; | ||
863 | ret = put_user(base, (unsigned long __user *)addr); | ||
864 | break; | ||
865 | } | ||
866 | |||
867 | default: | ||
868 | ret = -EINVAL; | ||
869 | break; | ||
870 | } | ||
871 | |||
872 | return ret; | ||
873 | } | ||
874 | |||
875 | long sys_arch_prctl(int code, unsigned long addr) | ||
876 | { | ||
877 | return do_arch_prctl(current, code, addr); | ||
878 | } | ||
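/*
 * Illustrative user-space sketch (not part of this file): bases that fit in
 * 32 bits take the fast GDT/TLS path above, larger ones go through
 * MSR_FS_BASE / MSR_KERNEL_GS_BASE.  There is no libc wrapper for this
 * syscall in this era, so callers typically go through syscall(2) roughly
 * like this:
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int set_fs_base(unsigned long base)
 *	{
 *		return syscall(SYS_arch_prctl, ARCH_SET_FS, base);
 *	}
 *
 *	static unsigned long get_fs_base(void)
 *	{
 *		unsigned long base = 0;
 *		syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
 *		return base;
 *	}
 */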
879 | |||
880 | /* | ||
881 | * Capture the user space registers if the task is not running (in user space) | ||
882 | */ | ||
883 | int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) | ||
884 | { | ||
885 | struct pt_regs *pp, ptregs; | ||
886 | |||
887 | pp = task_pt_regs(tsk); | ||
888 | |||
889 | ptregs = *pp; | ||
890 | ptregs.cs &= 0xffff; | ||
891 | ptregs.ss &= 0xffff; | ||
892 | |||
893 | elf_core_copy_regs(regs, &ptregs); | ||
894 | |||
895 | return 1; | ||
896 | } | ||
897 | |||
898 | unsigned long arch_align_stack(unsigned long sp) | ||
899 | { | ||
900 | if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) | ||
901 | sp -= get_random_int() % 8192; | ||
902 | return sp & ~0xf; | ||
903 | } | ||
diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c new file mode 100644 index 000000000000..eea3702427b4 --- /dev/null +++ b/arch/x86/kernel/ptrace_64.c | |||
@@ -0,0 +1,627 @@ | |||
1 | /* ptrace.c */ | ||
2 | /* By Ross Biro 1/23/92 */ | ||
3 | /* | ||
4 | * Pentium III FXSR, SSE support | ||
5 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
6 | * | ||
7 | * x86-64 port 2000-2002 Andi Kleen | ||
8 | */ | ||
9 | |||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/smp.h> | ||
14 | #include <linux/errno.h> | ||
15 | #include <linux/ptrace.h> | ||
16 | #include <linux/user.h> | ||
17 | #include <linux/security.h> | ||
18 | #include <linux/audit.h> | ||
19 | #include <linux/seccomp.h> | ||
20 | #include <linux/signal.h> | ||
21 | |||
22 | #include <asm/uaccess.h> | ||
23 | #include <asm/pgtable.h> | ||
24 | #include <asm/system.h> | ||
25 | #include <asm/processor.h> | ||
26 | #include <asm/i387.h> | ||
27 | #include <asm/debugreg.h> | ||
28 | #include <asm/ldt.h> | ||
29 | #include <asm/desc.h> | ||
30 | #include <asm/proto.h> | ||
31 | #include <asm/ia32.h> | ||
32 | |||
33 | /* | ||
34 | * Does not yet catch signals sent when the child dies | ||
35 | * (in exit.c or in signal.c). | ||
36 | */ | ||
37 | |||
38 | /* | ||
39 | * Determines which flags the user has access to [1 = access, 0 = no access]. | ||
40 | * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9). | ||
41 | * Also masks reserved bits (63-22, 15, 5, 3, 1). | ||
42 | */ | ||
43 | #define FLAG_MASK 0x54dd5UL | ||
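/*
 * Illustrative breakdown: 0x54dd5 leaves exactly CF(0), PF(2), AF(4),
 * ZF(6), SF(7), TF(8), DF(10), OF(11), NT(14), RF(16) and AC(18) writable,
 * which is the complement of the prohibited and reserved bits listed above.
 */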
44 | |||
45 | /* sets the trap flag. */ | ||
46 | #define TRAP_FLAG 0x100UL | ||
47 | |||
48 | /* | ||
49 | * eflags and offset of eflags on child stack.. | ||
50 | */ | ||
51 | #define EFLAGS offsetof(struct pt_regs, eflags) | ||
52 | #define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs))) | ||
53 | |||
54 | /* | ||
55 | * this routine will get a word off of the process's privileged stack. | ||
56 | * the offset is how far from the base addr as stored in the TSS. | ||
57 | * this routine assumes that all the privileged stacks are in our | ||
58 | * data space. | ||
59 | */ | ||
60 | static inline unsigned long get_stack_long(struct task_struct *task, int offset) | ||
61 | { | ||
62 | unsigned char *stack; | ||
63 | |||
64 | stack = (unsigned char *)task->thread.rsp0; | ||
65 | stack += offset; | ||
66 | return (*((unsigned long *)stack)); | ||
67 | } | ||
68 | |||
69 | /* | ||
70 | * this routine will put a word on the process's privileged stack. | ||
71 | * the offset is how far from the base addr as stored in the TSS. | ||
72 | * this routine assumes that all the privileged stacks are in our | ||
73 | * data space. | ||
74 | */ | ||
75 | static inline long put_stack_long(struct task_struct *task, int offset, | ||
76 | unsigned long data) | ||
77 | { | ||
78 | unsigned char * stack; | ||
79 | |||
80 | stack = (unsigned char *) task->thread.rsp0; | ||
81 | stack += offset; | ||
82 | *(unsigned long *) stack = data; | ||
83 | return 0; | ||
84 | } | ||
85 | |||
86 | #define LDT_SEGMENT 4 | ||
87 | |||
88 | unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs) | ||
89 | { | ||
90 | unsigned long addr, seg; | ||
91 | |||
92 | addr = regs->rip; | ||
93 | seg = regs->cs & 0xffff; | ||
94 | |||
95 | /* | ||
96 | * We'll assume that the code segments in the GDT | ||
97 | * are all zero-based. That is largely true: the | ||
98 | * TLS segments are used for data, and the PNPBIOS | ||
99 | * and APM bios ones we just ignore here. | ||
100 | */ | ||
101 | if (seg & LDT_SEGMENT) { | ||
102 | u32 *desc; | ||
103 | unsigned long base; | ||
104 | |||
105 | seg &= ~7UL; | ||
106 | |||
107 | down(&child->mm->context.sem); | ||
108 | if (unlikely((seg >> 3) >= child->mm->context.size)) | ||
109 | addr = -1L; /* bogus selector, access would fault */ | ||
110 | else { | ||
111 | desc = child->mm->context.ldt + seg; | ||
112 | base = ((desc[0] >> 16) | | ||
113 | ((desc[1] & 0xff) << 16) | | ||
114 | (desc[1] & 0xff000000)); | ||
115 | |||
116 | /* 16-bit code segment? */ | ||
117 | if (!((desc[1] >> 22) & 1)) | ||
118 | addr &= 0xffff; | ||
119 | addr += base; | ||
120 | } | ||
121 | up(&child->mm->context.sem); | ||
122 | } | ||
123 | |||
124 | return addr; | ||
125 | } | ||
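/*
 * Illustrative note: the extraction above follows the x86 descriptor
 * layout: base[15:0] sits in bits 31:16 of the low word (desc[0] >> 16),
 * base[23:16] in bits 7:0 of the high word, and base[31:24] in bits 31:24
 * of the high word.  Bit 22 of the high word is the D bit, so a clear D
 * bit marks a 16-bit code segment and only the low 16 bits of RIP count.
 */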
126 | |||
127 | static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) | ||
128 | { | ||
129 | int i, copied; | ||
130 | unsigned char opcode[15]; | ||
131 | unsigned long addr = convert_rip_to_linear(child, regs); | ||
132 | |||
133 | copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); | ||
134 | for (i = 0; i < copied; i++) { | ||
135 | switch (opcode[i]) { | ||
136 | /* popf and iret */ | ||
137 | case 0x9d: case 0xcf: | ||
138 | return 1; | ||
139 | |||
140 | /* CHECKME: 64 65 */ | ||
141 | |||
142 | /* opcode and address size prefixes */ | ||
143 | case 0x66: case 0x67: | ||
144 | continue; | ||
145 | /* irrelevant prefixes (segment overrides and repeats) */ | ||
146 | case 0x26: case 0x2e: | ||
147 | case 0x36: case 0x3e: | ||
148 | case 0x64: case 0x65: | ||
149 | case 0xf2: case 0xf3: | ||
150 | continue; | ||
151 | |||
152 | case 0x40 ... 0x4f: | ||
153 | if (regs->cs != __USER_CS) | ||
154 | /* 32-bit mode: register increment */ | ||
155 | return 0; | ||
156 | /* 64-bit mode: REX prefix */ | ||
157 | continue; | ||
158 | |||
159 | /* CHECKME: f2, f3 */ | ||
160 | |||
161 | /* | ||
162 | * pushf: NOTE! We should probably not let | ||
163 | * the user see the TF bit being set. But | ||
164 | * it's more pain than it's worth to avoid | ||
165 | * it, and a debugger could emulate this | ||
166 | * all in user space if it _really_ cares. | ||
167 | */ | ||
168 | case 0x9c: | ||
169 | default: | ||
170 | return 0; | ||
171 | } | ||
172 | } | ||
173 | return 0; | ||
174 | } | ||
175 | |||
176 | static void set_singlestep(struct task_struct *child) | ||
177 | { | ||
178 | struct pt_regs *regs = task_pt_regs(child); | ||
179 | |||
180 | /* | ||
181 | * Always set TIF_SINGLESTEP - this guarantees that | ||
182 | * we single-step system calls etc.. This will also | ||
183 | * cause us to set TF when returning to user mode. | ||
184 | */ | ||
185 | set_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
186 | |||
187 | /* | ||
188 | * If TF was already set, don't do anything else | ||
189 | */ | ||
190 | if (regs->eflags & TRAP_FLAG) | ||
191 | return; | ||
192 | |||
193 | /* Set TF on the kernel stack.. */ | ||
194 | regs->eflags |= TRAP_FLAG; | ||
195 | |||
196 | /* | ||
197 | * ..but if TF is changed by the instruction we will trace, | ||
198 | * don't mark it as being "us" that set it, so that we | ||
199 | * won't clear it by hand later. | ||
200 | */ | ||
201 | if (is_setting_trap_flag(child, regs)) | ||
202 | return; | ||
203 | |||
204 | child->ptrace |= PT_DTRACE; | ||
205 | } | ||
206 | |||
207 | static void clear_singlestep(struct task_struct *child) | ||
208 | { | ||
209 | /* Always clear TIF_SINGLESTEP... */ | ||
210 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
211 | |||
212 | /* But touch TF only if it was set by us.. */ | ||
213 | if (child->ptrace & PT_DTRACE) { | ||
214 | struct pt_regs *regs = task_pt_regs(child); | ||
215 | regs->eflags &= ~TRAP_FLAG; | ||
216 | child->ptrace &= ~PT_DTRACE; | ||
217 | } | ||
218 | } | ||
219 | |||
220 | /* | ||
221 | * Called by kernel/ptrace.c when detaching.. | ||
222 | * | ||
223 | * Make sure the single step bit is not set. | ||
224 | */ | ||
225 | void ptrace_disable(struct task_struct *child) | ||
226 | { | ||
227 | clear_singlestep(child); | ||
228 | } | ||
229 | |||
230 | static int putreg(struct task_struct *child, | ||
231 | unsigned long regno, unsigned long value) | ||
232 | { | ||
233 | unsigned long tmp; | ||
234 | |||
235 | switch (regno) { | ||
236 | case offsetof(struct user_regs_struct,fs): | ||
237 | if (value && (value & 3) != 3) | ||
238 | return -EIO; | ||
239 | child->thread.fsindex = value & 0xffff; | ||
240 | return 0; | ||
241 | case offsetof(struct user_regs_struct,gs): | ||
242 | if (value && (value & 3) != 3) | ||
243 | return -EIO; | ||
244 | child->thread.gsindex = value & 0xffff; | ||
245 | return 0; | ||
246 | case offsetof(struct user_regs_struct,ds): | ||
247 | if (value && (value & 3) != 3) | ||
248 | return -EIO; | ||
249 | child->thread.ds = value & 0xffff; | ||
250 | return 0; | ||
251 | case offsetof(struct user_regs_struct,es): | ||
252 | if (value && (value & 3) != 3) | ||
253 | return -EIO; | ||
254 | child->thread.es = value & 0xffff; | ||
255 | return 0; | ||
256 | case offsetof(struct user_regs_struct,ss): | ||
257 | if ((value & 3) != 3) | ||
258 | return -EIO; | ||
259 | value &= 0xffff; | ||
260 | return 0; | ||
261 | case offsetof(struct user_regs_struct,fs_base): | ||
262 | if (value >= TASK_SIZE_OF(child)) | ||
263 | return -EIO; | ||
264 | child->thread.fs = value; | ||
265 | return 0; | ||
266 | case offsetof(struct user_regs_struct,gs_base): | ||
267 | if (value >= TASK_SIZE_OF(child)) | ||
268 | return -EIO; | ||
269 | child->thread.gs = value; | ||
270 | return 0; | ||
271 | case offsetof(struct user_regs_struct, eflags): | ||
272 | value &= FLAG_MASK; | ||
273 | tmp = get_stack_long(child, EFL_OFFSET); | ||
274 | tmp &= ~FLAG_MASK; | ||
275 | value |= tmp; | ||
276 | break; | ||
277 | case offsetof(struct user_regs_struct,cs): | ||
278 | if ((value & 3) != 3) | ||
279 | return -EIO; | ||
280 | value &= 0xffff; | ||
281 | break; | ||
282 | } | ||
283 | put_stack_long(child, regno - sizeof(struct pt_regs), value); | ||
284 | return 0; | ||
285 | } | ||
286 | |||
287 | static unsigned long getreg(struct task_struct *child, unsigned long regno) | ||
288 | { | ||
289 | unsigned long val; | ||
290 | switch (regno) { | ||
291 | case offsetof(struct user_regs_struct, fs): | ||
292 | return child->thread.fsindex; | ||
293 | case offsetof(struct user_regs_struct, gs): | ||
294 | return child->thread.gsindex; | ||
295 | case offsetof(struct user_regs_struct, ds): | ||
296 | return child->thread.ds; | ||
297 | case offsetof(struct user_regs_struct, es): | ||
298 | return child->thread.es; | ||
299 | case offsetof(struct user_regs_struct, fs_base): | ||
300 | return child->thread.fs; | ||
301 | case offsetof(struct user_regs_struct, gs_base): | ||
302 | return child->thread.gs; | ||
303 | default: | ||
304 | regno = regno - sizeof(struct pt_regs); | ||
305 | val = get_stack_long(child, regno); | ||
306 | if (test_tsk_thread_flag(child, TIF_IA32)) | ||
307 | val &= 0xffffffff; | ||
308 | return val; | ||
309 | } | ||
310 | |||
311 | } | ||
312 | |||
313 | long arch_ptrace(struct task_struct *child, long request, long addr, long data) | ||
314 | { | ||
315 | long i, ret; | ||
316 | unsigned ui; | ||
317 | |||
318 | switch (request) { | ||
319 | /* when I and D space are separate, these will need to be fixed. */ | ||
320 | case PTRACE_PEEKTEXT: /* read word at location addr. */ | ||
321 | case PTRACE_PEEKDATA: | ||
322 | ret = generic_ptrace_peekdata(child, addr, data); | ||
323 | break; | ||
324 | |||
325 | /* read the word at location addr in the USER area. */ | ||
326 | case PTRACE_PEEKUSR: { | ||
327 | unsigned long tmp; | ||
328 | |||
329 | ret = -EIO; | ||
330 | if ((addr & 7) || | ||
331 | addr > sizeof(struct user) - 7) | ||
332 | break; | ||
333 | |||
334 | switch (addr) { | ||
335 | case 0 ... sizeof(struct user_regs_struct) - sizeof(long): | ||
336 | tmp = getreg(child, addr); | ||
337 | break; | ||
338 | case offsetof(struct user, u_debugreg[0]): | ||
339 | tmp = child->thread.debugreg0; | ||
340 | break; | ||
341 | case offsetof(struct user, u_debugreg[1]): | ||
342 | tmp = child->thread.debugreg1; | ||
343 | break; | ||
344 | case offsetof(struct user, u_debugreg[2]): | ||
345 | tmp = child->thread.debugreg2; | ||
346 | break; | ||
347 | case offsetof(struct user, u_debugreg[3]): | ||
348 | tmp = child->thread.debugreg3; | ||
349 | break; | ||
350 | case offsetof(struct user, u_debugreg[6]): | ||
351 | tmp = child->thread.debugreg6; | ||
352 | break; | ||
353 | case offsetof(struct user, u_debugreg[7]): | ||
354 | tmp = child->thread.debugreg7; | ||
355 | break; | ||
356 | default: | ||
357 | tmp = 0; | ||
358 | break; | ||
359 | } | ||
360 | ret = put_user(tmp,(unsigned long __user *) data); | ||
361 | break; | ||
362 | } | ||
363 | |||
364 | /* when I and D space are separate, this will have to be fixed. */ | ||
365 | case PTRACE_POKETEXT: /* write the word at location addr. */ | ||
366 | case PTRACE_POKEDATA: | ||
367 | ret = generic_ptrace_pokedata(child, addr, data); | ||
368 | break; | ||
369 | |||
370 | case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ | ||
371 | { | ||
372 | int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7; | ||
373 | ret = -EIO; | ||
374 | if ((addr & 7) || | ||
375 | addr > sizeof(struct user) - 7) | ||
376 | break; | ||
377 | |||
378 | switch (addr) { | ||
379 | case 0 ... sizeof(struct user_regs_struct) - sizeof(long): | ||
380 | ret = putreg(child, addr, data); | ||
381 | break; | ||
382 | /* Disallow setting a breakpoint in the vsyscall area */ | ||
383 | case offsetof(struct user, u_debugreg[0]): | ||
384 | if (data >= TASK_SIZE_OF(child) - dsize) break; | ||
385 | child->thread.debugreg0 = data; | ||
386 | ret = 0; | ||
387 | break; | ||
388 | case offsetof(struct user, u_debugreg[1]): | ||
389 | if (data >= TASK_SIZE_OF(child) - dsize) break; | ||
390 | child->thread.debugreg1 = data; | ||
391 | ret = 0; | ||
392 | break; | ||
393 | case offsetof(struct user, u_debugreg[2]): | ||
394 | if (data >= TASK_SIZE_OF(child) - dsize) break; | ||
395 | child->thread.debugreg2 = data; | ||
396 | ret = 0; | ||
397 | break; | ||
398 | case offsetof(struct user, u_debugreg[3]): | ||
399 | if (data >= TASK_SIZE_OF(child) - dsize) break; | ||
400 | child->thread.debugreg3 = data; | ||
401 | ret = 0; | ||
402 | break; | ||
403 | case offsetof(struct user, u_debugreg[6]): | ||
404 | if (data >> 32) | ||
405 | break; | ||
406 | child->thread.debugreg6 = data; | ||
407 | ret = 0; | ||
408 | break; | ||
409 | case offsetof(struct user, u_debugreg[7]): | ||
410 | /* See arch/i386/kernel/ptrace.c for an explanation of | ||
411 | * this awkward check.*/ | ||
412 | data &= ~DR_CONTROL_RESERVED; | ||
413 | for(i=0; i<4; i++) | ||
414 | if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1) | ||
415 | break; | ||
416 | if (i == 4) { | ||
417 | child->thread.debugreg7 = data; | ||
418 | if (data) | ||
419 | set_tsk_thread_flag(child, TIF_DEBUG); | ||
420 | else | ||
421 | clear_tsk_thread_flag(child, TIF_DEBUG); | ||
422 | ret = 0; | ||
423 | } | ||
424 | break; | ||
425 | } | ||
426 | break; | ||
427 | } | ||
428 | case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ | ||
429 | case PTRACE_CONT: /* restart after signal. */ | ||
430 | |||
431 | ret = -EIO; | ||
432 | if (!valid_signal(data)) | ||
433 | break; | ||
434 | if (request == PTRACE_SYSCALL) | ||
435 | set_tsk_thread_flag(child,TIF_SYSCALL_TRACE); | ||
436 | else | ||
437 | clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); | ||
438 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
439 | child->exit_code = data; | ||
440 | /* make sure the single step bit is not set. */ | ||
441 | clear_singlestep(child); | ||
442 | wake_up_process(child); | ||
443 | ret = 0; | ||
444 | break; | ||
445 | |||
446 | #ifdef CONFIG_IA32_EMULATION | ||
447 | /* This only makes sense with 32bit programs. Allow a | ||
448 | 64bit debugger to fully examine them too. Better | ||
449 | not to use it against 64bit processes; use | ||
450 | PTRACE_ARCH_PRCTL instead. */ | ||
451 | case PTRACE_SET_THREAD_AREA: { | ||
452 | struct user_desc __user *p; | ||
453 | int old; | ||
454 | p = (struct user_desc __user *)data; | ||
455 | get_user(old, &p->entry_number); | ||
456 | put_user(addr, &p->entry_number); | ||
457 | ret = do_set_thread_area(&child->thread, p); | ||
458 | put_user(old, &p->entry_number); | ||
459 | break; | ||
460 | case PTRACE_GET_THREAD_AREA: | ||
461 | p = (struct user_desc __user *)data; | ||
462 | get_user(old, &p->entry_number); | ||
463 | put_user(addr, &p->entry_number); | ||
464 | ret = do_get_thread_area(&child->thread, p); | ||
465 | put_user(old, &p->entry_number); | ||
466 | break; | ||
467 | } | ||
468 | #endif | ||
469 | /* normal 64bit interface to access TLS data. | ||
470 | Works just like arch_prctl, except that the arguments | ||
471 | are reversed. */ | ||
472 | case PTRACE_ARCH_PRCTL: | ||
473 | ret = do_arch_prctl(child, data, addr); | ||
474 | break; | ||
475 | |||
476 | /* | ||
477 | * make the child exit. Best I can do is send it a sigkill. | ||
478 | * perhaps it should be put in the status that it wants to | ||
479 | * exit. | ||
480 | */ | ||
481 | case PTRACE_KILL: | ||
482 | ret = 0; | ||
483 | if (child->exit_state == EXIT_ZOMBIE) /* already dead */ | ||
484 | break; | ||
485 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
486 | child->exit_code = SIGKILL; | ||
487 | /* make sure the single step bit is not set. */ | ||
488 | clear_singlestep(child); | ||
489 | wake_up_process(child); | ||
490 | break; | ||
491 | |||
492 | case PTRACE_SINGLESTEP: /* set the trap flag. */ | ||
493 | ret = -EIO; | ||
494 | if (!valid_signal(data)) | ||
495 | break; | ||
496 | clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); | ||
497 | set_singlestep(child); | ||
498 | child->exit_code = data; | ||
499 | /* give it a chance to run. */ | ||
500 | wake_up_process(child); | ||
501 | ret = 0; | ||
502 | break; | ||
503 | |||
504 | case PTRACE_DETACH: | ||
505 | /* detach a process that was attached. */ | ||
506 | ret = ptrace_detach(child, data); | ||
507 | break; | ||
508 | |||
509 | case PTRACE_GETREGS: { /* Get all gp regs from the child. */ | ||
510 | if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, | ||
511 | sizeof(struct user_regs_struct))) { | ||
512 | ret = -EIO; | ||
513 | break; | ||
514 | } | ||
515 | ret = 0; | ||
516 | for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { | ||
517 | ret |= __put_user(getreg(child, ui),(unsigned long __user *) data); | ||
518 | data += sizeof(long); | ||
519 | } | ||
520 | break; | ||
521 | } | ||
522 | |||
523 | case PTRACE_SETREGS: { /* Set all gp regs in the child. */ | ||
524 | unsigned long tmp; | ||
525 | if (!access_ok(VERIFY_READ, (unsigned __user *)data, | ||
526 | sizeof(struct user_regs_struct))) { | ||
527 | ret = -EIO; | ||
528 | break; | ||
529 | } | ||
530 | ret = 0; | ||
531 | for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { | ||
532 | ret = __get_user(tmp, (unsigned long __user *) data); | ||
533 | if (ret) | ||
534 | break; | ||
535 | ret = putreg(child, ui, tmp); | ||
536 | if (ret) | ||
537 | break; | ||
538 | data += sizeof(long); | ||
539 | } | ||
540 | break; | ||
541 | } | ||
542 | |||
543 | case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */ | ||
544 | if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, | ||
545 | sizeof(struct user_i387_struct))) { | ||
546 | ret = -EIO; | ||
547 | break; | ||
548 | } | ||
549 | ret = get_fpregs((struct user_i387_struct __user *)data, child); | ||
550 | break; | ||
551 | } | ||
552 | |||
553 | case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */ | ||
554 | if (!access_ok(VERIFY_READ, (unsigned __user *)data, | ||
555 | sizeof(struct user_i387_struct))) { | ||
556 | ret = -EIO; | ||
557 | break; | ||
558 | } | ||
559 | set_stopped_child_used_math(child); | ||
560 | ret = set_fpregs(child, (struct user_i387_struct __user *)data); | ||
561 | break; | ||
562 | } | ||
563 | |||
564 | default: | ||
565 | ret = ptrace_request(child, request, addr, data); | ||
566 | break; | ||
567 | } | ||
568 | return ret; | ||
569 | } | ||
570 | |||
571 | static void syscall_trace(struct pt_regs *regs) | ||
572 | { | ||
573 | |||
574 | #if 0 | ||
575 | printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n", | ||
576 | current->comm, | ||
577 | regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0), | ||
578 | current_thread_info()->flags, current->ptrace); | ||
579 | #endif | ||
580 | |||
581 | ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) | ||
582 | ? 0x80 : 0)); | ||
583 | /* | ||
584 | * this isn't the same as continuing with a signal, but it will do | ||
585 | * for normal use. strace only continues with a signal if the | ||
586 | * stopping signal is not SIGTRAP. -brl | ||
587 | */ | ||
588 | if (current->exit_code) { | ||
589 | send_sig(current->exit_code, current, 1); | ||
590 | current->exit_code = 0; | ||
591 | } | ||
592 | } | ||
593 | |||
594 | asmlinkage void syscall_trace_enter(struct pt_regs *regs) | ||
595 | { | ||
596 | /* do the secure computing check first */ | ||
597 | secure_computing(regs->orig_rax); | ||
598 | |||
599 | if (test_thread_flag(TIF_SYSCALL_TRACE) | ||
600 | && (current->ptrace & PT_PTRACED)) | ||
601 | syscall_trace(regs); | ||
602 | |||
603 | if (unlikely(current->audit_context)) { | ||
604 | if (test_thread_flag(TIF_IA32)) { | ||
605 | audit_syscall_entry(AUDIT_ARCH_I386, | ||
606 | regs->orig_rax, | ||
607 | regs->rbx, regs->rcx, | ||
608 | regs->rdx, regs->rsi); | ||
609 | } else { | ||
610 | audit_syscall_entry(AUDIT_ARCH_X86_64, | ||
611 | regs->orig_rax, | ||
612 | regs->rdi, regs->rsi, | ||
613 | regs->rdx, regs->r10); | ||
614 | } | ||
615 | } | ||
616 | } | ||
617 | |||
618 | asmlinkage void syscall_trace_leave(struct pt_regs *regs) | ||
619 | { | ||
620 | if (unlikely(current->audit_context)) | ||
621 | audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax); | ||
622 | |||
623 | if ((test_thread_flag(TIF_SYSCALL_TRACE) | ||
624 | || test_thread_flag(TIF_SINGLESTEP)) | ||
625 | && (current->ptrace & PT_PTRACED)) | ||
626 | syscall_trace(regs); | ||
627 | } | ||
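/*
 * Illustrative user-space sketch (not part of this patch): syscall_trace()
 * above only runs once the tracer has requested syscall tracing with
 * PTRACE_SYSCALL, and with PTRACE_O_TRACESYSGOOD the resulting stops are
 * reported as SIGTRAP | 0x80.  A minimal tracer loop might look like:
 */
#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>

static void trace_syscalls(pid_t pid)
{
	int status;

	ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_TRACESYSGOOD);
	for (;;) {
		/* resume the child until the next syscall entry or exit stop */
		ptrace(PTRACE_SYSCALL, pid, 0, 0);
		if (waitpid(pid, &status, 0) < 0 || WIFEXITED(status))
			break;
		if (WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80)) {
			struct user_regs_struct regs;

			ptrace(PTRACE_GETREGS, pid, 0, &regs);
			printf("syscall %lld\n", (long long)regs.orig_rax);
		}
	}
}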
diff --git a/arch/x86/kernel/reboot_64.c b/arch/x86/kernel/reboot_64.c new file mode 100644 index 000000000000..368db2b9c5ac --- /dev/null +++ b/arch/x86/kernel/reboot_64.c | |||
@@ -0,0 +1,171 @@ | |||
1 | /* Various gunk just to reboot the machine. */ | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/reboot.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/smp.h> | ||
6 | #include <linux/kernel.h> | ||
7 | #include <linux/ctype.h> | ||
8 | #include <linux/string.h> | ||
9 | #include <linux/pm.h> | ||
10 | #include <linux/kdebug.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <asm/io.h> | ||
13 | #include <asm/delay.h> | ||
14 | #include <asm/hw_irq.h> | ||
15 | #include <asm/system.h> | ||
16 | #include <asm/pgtable.h> | ||
17 | #include <asm/tlbflush.h> | ||
18 | #include <asm/apic.h> | ||
19 | #include <asm/iommu.h> | ||
20 | |||
21 | /* | ||
22 | * Power off function, if any | ||
23 | */ | ||
24 | void (*pm_power_off)(void); | ||
25 | EXPORT_SYMBOL(pm_power_off); | ||
26 | |||
27 | static long no_idt[3]; | ||
28 | static enum { | ||
29 | BOOT_TRIPLE = 't', | ||
30 | BOOT_KBD = 'k' | ||
31 | } reboot_type = BOOT_KBD; | ||
32 | static int reboot_mode = 0; | ||
33 | int reboot_force; | ||
34 | |||
35 | /* reboot=t[riple] | k[bd] [, [w]arm | [c]old] | ||
36 | warm Don't set the cold reboot flag | ||
37 | cold Set the cold reboot flag | ||
38 | triple Force a triple fault (init) | ||
39 | kbd Use the keyboard controller. cold reset (default) | ||
40 | force Avoid anything that could hang. | ||
41 | */ | ||
42 | static int __init reboot_setup(char *str) | ||
43 | { | ||
44 | for (;;) { | ||
45 | switch (*str) { | ||
46 | case 'w': | ||
47 | reboot_mode = 0x1234; | ||
48 | break; | ||
49 | |||
50 | case 'c': | ||
51 | reboot_mode = 0; | ||
52 | break; | ||
53 | |||
54 | case 't': | ||
55 | case 'b': | ||
56 | case 'k': | ||
57 | reboot_type = *str; | ||
58 | break; | ||
59 | case 'f': | ||
60 | reboot_force = 1; | ||
61 | break; | ||
62 | } | ||
63 | if((str = strchr(str,',')) != NULL) | ||
64 | str++; | ||
65 | else | ||
66 | break; | ||
67 | } | ||
68 | return 1; | ||
69 | } | ||
70 | |||
71 | __setup("reboot=", reboot_setup); | ||
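/*
 * Usage note (illustrative): "reboot=warm" only skips setting the cold
 * reboot flag, "reboot=triple" forces a triple fault instead of the
 * keyboard controller reset, and "reboot=force" (or a combination such as
 * "reboot=kbd,warm,force") additionally skips machine_shutdown() in
 * machine_restart().
 */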
72 | |||
73 | static inline void kb_wait(void) | ||
74 | { | ||
75 | int i; | ||
76 | |||
77 | for (i=0; i<0x10000; i++) | ||
78 | if ((inb_p(0x64) & 0x02) == 0) | ||
79 | break; | ||
80 | } | ||
81 | |||
82 | void machine_shutdown(void) | ||
83 | { | ||
84 | unsigned long flags; | ||
85 | |||
86 | /* Stop the cpus and apics */ | ||
87 | #ifdef CONFIG_SMP | ||
88 | int reboot_cpu_id; | ||
89 | |||
90 | /* The boot cpu is always logical cpu 0 */ | ||
91 | reboot_cpu_id = 0; | ||
92 | |||
93 | /* Make certain the cpu I'm about to reboot on is online */ | ||
94 | if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { | ||
95 | reboot_cpu_id = smp_processor_id(); | ||
96 | } | ||
97 | |||
98 | /* Make certain I only run on the appropriate processor */ | ||
99 | set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); | ||
100 | |||
101 | /* O.K Now that I'm on the appropriate processor, | ||
102 | * stop all of the others. | ||
103 | */ | ||
104 | smp_send_stop(); | ||
105 | #endif | ||
106 | |||
107 | local_irq_save(flags); | ||
108 | |||
109 | #ifndef CONFIG_SMP | ||
110 | disable_local_APIC(); | ||
111 | #endif | ||
112 | |||
113 | disable_IO_APIC(); | ||
114 | |||
115 | local_irq_restore(flags); | ||
116 | |||
117 | pci_iommu_shutdown(); | ||
118 | } | ||
119 | |||
120 | void machine_emergency_restart(void) | ||
121 | { | ||
122 | int i; | ||
123 | |||
124 | /* Tell the BIOS if we want cold or warm reboot */ | ||
125 | *((unsigned short *)__va(0x472)) = reboot_mode; | ||
126 | |||
127 | for (;;) { | ||
128 | /* Could also try the reset bit in the Hammer NB */ | ||
129 | switch (reboot_type) { | ||
130 | case BOOT_KBD: | ||
131 | for (i=0; i<10; i++) { | ||
132 | kb_wait(); | ||
133 | udelay(50); | ||
134 | outb(0xfe,0x64); /* pulse reset low */ | ||
135 | udelay(50); | ||
136 | } | ||
137 | |||
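		/* fall through: if the keyboard reset did not take, force a triple fault */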
138 | case BOOT_TRIPLE: | ||
139 | __asm__ __volatile__("lidt (%0)": :"r" (&no_idt)); | ||
140 | __asm__ __volatile__("int3"); | ||
141 | |||
142 | reboot_type = BOOT_KBD; | ||
143 | break; | ||
144 | } | ||
145 | } | ||
146 | } | ||
147 | |||
148 | void machine_restart(char * __unused) | ||
149 | { | ||
150 | printk("machine restart\n"); | ||
151 | |||
152 | if (!reboot_force) { | ||
153 | machine_shutdown(); | ||
154 | } | ||
155 | machine_emergency_restart(); | ||
156 | } | ||
157 | |||
158 | void machine_halt(void) | ||
159 | { | ||
160 | } | ||
161 | |||
162 | void machine_power_off(void) | ||
163 | { | ||
164 | if (pm_power_off) { | ||
165 | if (!reboot_force) { | ||
166 | machine_shutdown(); | ||
167 | } | ||
168 | pm_power_off(); | ||
169 | } | ||
170 | } | ||
171 | |||
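For context on the hunk above: the BOOT_KBD path reboots by asking the 8042 keyboard controller to pulse the CPU reset line — kb_wait() spins until the controller's input buffer drains, then command 0xfe is written to port 0x64, up to ten times, before the code falls through to the triple-fault method as a last resort. The option string is comma-separated, so for example reboot=t,w forces a triple fault with the warm-boot flag at 0x472 set to 0x1234, while reboot=k,c,f uses the keyboard controller, a cold boot, and skips the orderly machine_shutdown(). The sketch below shows the same controller pulse issued from user space; it is illustrative only — ioperm() and <sys/io.h> are user-space glibc assumptions, it needs root, and on hardware that honours the command it really will reset the machine.

	/* Illustrative sketch: the BOOT_KBD reset pulse, done from user space. */
	#include <stdio.h>
	#include <sys/io.h>		/* ioperm(), inb(), outb() on x86 glibc */

	static void kb_wait(void)
	{
		int i;

		/* wait for the 8042 input buffer to drain, as the kernel does */
		for (i = 0; i < 0x10000; i++)
			if ((inb(0x64) & 0x02) == 0)
				break;
	}

	int main(void)
	{
		if (ioperm(0x60, 0x10, 1) < 0) {	/* ports 0x60-0x6f */
			perror("ioperm");
			return 1;
		}
		kb_wait();
		outb(0xfe, 0x64);			/* pulse reset low */
		return 0;
	}
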
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S new file mode 100644 index 000000000000..14e95872c6a3 --- /dev/null +++ b/arch/x86/kernel/relocate_kernel_64.S | |||
@@ -0,0 +1,276 @@ | |||
1 | /* | ||
2 | * relocate_kernel.S - put the kernel image in place to boot | ||
3 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> | ||
4 | * | ||
5 | * This source code is licensed under the GNU General Public License, | ||
6 | * Version 2. See the file COPYING for more details. | ||
7 | */ | ||
8 | |||
9 | #include <linux/linkage.h> | ||
10 | #include <asm/page.h> | ||
11 | #include <asm/kexec.h> | ||
12 | |||
13 | /* | ||
14 | * Must be relocatable PIC code callable as a C function | ||
15 | */ | ||
16 | |||
17 | #define PTR(x) (x << 3) | ||
18 | #define PAGE_ALIGNED (1 << PAGE_SHIFT) | ||
19 | #define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ | ||
20 | |||
21 | .text | ||
22 | .align PAGE_ALIGNED | ||
23 | .code64 | ||
24 | .globl relocate_kernel | ||
25 | relocate_kernel: | ||
26 | /* %rdi indirection_page | ||
27 | * %rsi page_list | ||
28 | * %rdx start address | ||
29 | */ | ||
30 | |||
31 | /* map the control page at its virtual address */ | ||
32 | |||
33 | movq $0x0000ff8000000000, %r10 /* mask */ | ||
34 | mov $(39 - 3), %cl /* bits to shift */ | ||
35 | movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ | ||
36 | |||
37 | movq %r11, %r9 | ||
38 | andq %r10, %r9 | ||
39 | shrq %cl, %r9 | ||
40 | |||
41 | movq PTR(VA_PGD)(%rsi), %r8 | ||
42 | addq %r8, %r9 | ||
43 | movq PTR(PA_PUD_0)(%rsi), %r8 | ||
44 | orq $PAGE_ATTR, %r8 | ||
45 | movq %r8, (%r9) | ||
46 | |||
47 | shrq $9, %r10 | ||
48 | sub $9, %cl | ||
49 | |||
50 | movq %r11, %r9 | ||
51 | andq %r10, %r9 | ||
52 | shrq %cl, %r9 | ||
53 | |||
54 | movq PTR(VA_PUD_0)(%rsi), %r8 | ||
55 | addq %r8, %r9 | ||
56 | movq PTR(PA_PMD_0)(%rsi), %r8 | ||
57 | orq $PAGE_ATTR, %r8 | ||
58 | movq %r8, (%r9) | ||
59 | |||
60 | shrq $9, %r10 | ||
61 | sub $9, %cl | ||
62 | |||
63 | movq %r11, %r9 | ||
64 | andq %r10, %r9 | ||
65 | shrq %cl, %r9 | ||
66 | |||
67 | movq PTR(VA_PMD_0)(%rsi), %r8 | ||
68 | addq %r8, %r9 | ||
69 | movq PTR(PA_PTE_0)(%rsi), %r8 | ||
70 | orq $PAGE_ATTR, %r8 | ||
71 | movq %r8, (%r9) | ||
72 | |||
73 | shrq $9, %r10 | ||
74 | sub $9, %cl | ||
75 | |||
76 | movq %r11, %r9 | ||
77 | andq %r10, %r9 | ||
78 | shrq %cl, %r9 | ||
79 | |||
80 | movq PTR(VA_PTE_0)(%rsi), %r8 | ||
81 | addq %r8, %r9 | ||
82 | movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 | ||
83 | orq $PAGE_ATTR, %r8 | ||
84 | movq %r8, (%r9) | ||
85 | |||
86 | /* identity map the control page at its physical address */ | ||
87 | |||
88 | movq $0x0000ff8000000000, %r10 /* mask */ | ||
89 | mov $(39 - 3), %cl /* bits to shift */ | ||
90 | movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ | ||
91 | |||
92 | movq %r11, %r9 | ||
93 | andq %r10, %r9 | ||
94 | shrq %cl, %r9 | ||
95 | |||
96 | movq PTR(VA_PGD)(%rsi), %r8 | ||
97 | addq %r8, %r9 | ||
98 | movq PTR(PA_PUD_1)(%rsi), %r8 | ||
99 | orq $PAGE_ATTR, %r8 | ||
100 | movq %r8, (%r9) | ||
101 | |||
102 | shrq $9, %r10 | ||
103 | sub $9, %cl | ||
104 | |||
105 | movq %r11, %r9 | ||
106 | andq %r10, %r9 | ||
107 | shrq %cl, %r9 | ||
108 | |||
109 | movq PTR(VA_PUD_1)(%rsi), %r8 | ||
110 | addq %r8, %r9 | ||
111 | movq PTR(PA_PMD_1)(%rsi), %r8 | ||
112 | orq $PAGE_ATTR, %r8 | ||
113 | movq %r8, (%r9) | ||
114 | |||
115 | shrq $9, %r10 | ||
116 | sub $9, %cl | ||
117 | |||
118 | movq %r11, %r9 | ||
119 | andq %r10, %r9 | ||
120 | shrq %cl, %r9 | ||
121 | |||
122 | movq PTR(VA_PMD_1)(%rsi), %r8 | ||
123 | addq %r8, %r9 | ||
124 | movq PTR(PA_PTE_1)(%rsi), %r8 | ||
125 | orq $PAGE_ATTR, %r8 | ||
126 | movq %r8, (%r9) | ||
127 | |||
128 | shrq $9, %r10 | ||
129 | sub $9, %cl | ||
130 | |||
131 | movq %r11, %r9 | ||
132 | andq %r10, %r9 | ||
133 | shrq %cl, %r9 | ||
134 | |||
135 | movq PTR(VA_PTE_1)(%rsi), %r8 | ||
136 | addq %r8, %r9 | ||
137 | movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 | ||
138 | orq $PAGE_ATTR, %r8 | ||
139 | movq %r8, (%r9) | ||
140 | |||
141 | relocate_new_kernel: | ||
142 | /* %rdi indirection_page | ||
143 | * %rsi page_list | ||
144 | * %rdx start address | ||
145 | */ | ||
146 | |||
147 | /* zero out flags, and disable interrupts */ | ||
148 | pushq $0 | ||
149 | popfq | ||
150 | |||
151 | /* get physical address of control page now */ | ||
152 | /* this is impossible after page table switch */ | ||
153 | movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 | ||
154 | |||
155 | /* get physical address of page table now too */ | ||
156 | movq PTR(PA_TABLE_PAGE)(%rsi), %rcx | ||
157 | |||
158 | /* switch to new set of page tables */ | ||
159 | movq PTR(PA_PGD)(%rsi), %r9 | ||
160 | movq %r9, %cr3 | ||
161 | |||
162 | /* setup a new stack at the end of the physical control page */ | ||
163 | lea 4096(%r8), %rsp | ||
164 | |||
165 | /* jump to identity mapped page */ | ||
166 | addq $(identity_mapped - relocate_kernel), %r8 | ||
167 | pushq %r8 | ||
168 | ret | ||
169 | |||
170 | identity_mapped: | ||
171 | /* store the start address on the stack */ | ||
172 | pushq %rdx | ||
173 | |||
174 | /* Set cr0 to a known state: | ||
175 | * 31 1 == Paging enabled | ||
176 | * 18 0 == Alignment check disabled | ||
177 | * 16 0 == Write protect disabled | ||
178 | * 3 0 == No task switch | ||
179 | * 2 0 == Don't do FP software emulation. | ||
180 | * 0 1 == Protected mode enabled | ||
181 | */ | ||
182 | movq %cr0, %rax | ||
183 | andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax | ||
184 | orl $((1<<31)|(1<<0)), %eax | ||
185 | movq %rax, %cr0 | ||
186 | |||
187 | /* Set cr4 to a known state: | ||
188 | * 10 0 == xmm exceptions disabled | ||
189 | * 9 0 == xmm registers instructions disabled | ||
190 | * 8 0 == performance monitoring counter disabled | ||
191 | * 7 0 == page global disabled | ||
192 | * 6 0 == machine check exceptions disabled | ||
193 | * 5 1 == physical address extension enabled | ||
194 | * 4 0 == page size extensions disabled | ||
195 | * 3 0 == Debug extensions disabled | ||
196 | * 2 0 == Time stamp disable (disabled) | ||
197 | * 1 0 == Protected mode virtual interrupts disabled | ||
198 | * 0 0 == VME disabled | ||
199 | */ | ||
200 | |||
201 | movq $((1<<5)), %rax | ||
202 | movq %rax, %cr4 | ||
203 | |||
204 | jmp 1f | ||
205 | 1: | ||
206 | |||
207 | /* Switch to the identity mapped page tables, | ||
208 | * and flush the TLB. | ||
209 | */ | ||
210 | movq %rcx, %cr3 | ||
211 | |||
212 | /* Do the copies */ | ||
213 | movq %rdi, %rcx /* Put the page_list in %rcx */ | ||
214 | xorq %rdi, %rdi | ||
215 | xorq %rsi, %rsi | ||
216 | jmp 1f | ||
217 | |||
218 | 0: /* top, read another word from the indirection page */ | ||
219 | |||
220 | movq (%rbx), %rcx | ||
221 | addq $8, %rbx | ||
222 | 1: | ||
223 | testq $0x1, %rcx /* is it a destination page? */ | ||
224 | jz 2f | ||
225 | movq %rcx, %rdi | ||
226 | andq $0xfffffffffffff000, %rdi | ||
227 | jmp 0b | ||
228 | 2: | ||
229 | testq $0x2, %rcx /* is it an indirection page? */ | ||
230 | jz 2f | ||
231 | movq %rcx, %rbx | ||
232 | andq $0xfffffffffffff000, %rbx | ||
233 | jmp 0b | ||
234 | 2: | ||
235 | testq $0x4, %rcx /* is it the done indicator? */ | ||
236 | jz 2f | ||
237 | jmp 3f | ||
238 | 2: | ||
239 | testq $0x8, %rcx /* is it the source indicator? */ | ||
240 | jz 0b /* Ignore it otherwise */ | ||
241 | movq %rcx, %rsi /* For every source page do a copy */ | ||
242 | andq $0xfffffffffffff000, %rsi | ||
243 | |||
244 | movq $512, %rcx | ||
245 | rep ; movsq | ||
246 | jmp 0b | ||
247 | 3: | ||
248 | |||
249 | /* To be certain of avoiding problems with self-modifying code | ||
250 | * I need to execute a serializing instruction here. | ||
251 | * So I flush the TLB by reloading %cr3 here, it's handy, | ||
252 | * and not processor dependent. | ||
253 | */ | ||
254 | movq %cr3, %rax | ||
255 | movq %rax, %cr3 | ||
256 | |||
257 | /* set all of the registers to known values */ | ||
258 | /* leave %rsp alone */ | ||
259 | |||
260 | xorq %rax, %rax | ||
261 | xorq %rbx, %rbx | ||
262 | xorq %rcx, %rcx | ||
263 | xorq %rdx, %rdx | ||
264 | xorq %rsi, %rsi | ||
265 | xorq %rdi, %rdi | ||
266 | xorq %rbp, %rbp | ||
267 | xorq %r8, %r8 | ||
268 | xorq %r9, %r9 | ||
269 | xorq %r10, %r10 | ||
270 | xorq %r11, %r11 | ||
271 | xorq %r12, %r12 | ||
272 | xorq %r13, %r13 | ||
273 | xorq %r14, %r14 | ||
274 | xorq %r15, %r15 | ||
275 | |||
276 | ret | ||
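A note on the mapping blocks at the top of the hunk above: each one is the same index computation at a different paging level. The mask 0x0000ff8000000000 isolates the nine PGD index bits (bits 39-47) of the virtual address, and shifting right by (39 - 3) turns that index directly into a byte offset into the 512-entry table, since every entry is 8 bytes wide (the same reason PTR(x) is x << 3). Each subsequent level shifts the mask right by 9 and lowers the shift count by 9. Further down, the indirection-page walk treats the low bits of each entry as flags: 0x1 marks a destination page, 0x2 an indirection page, 0x4 the done marker and 0x8 a source page to copy (512 quadwords per page via rep movsq). A small hosted C sketch of the offset arithmetic, with an arbitrary example address:

	/* Sketch of the per-level offset arithmetic used above. */
	#include <stdio.h>
	#include <stdint.h>

	static uint64_t table_offset(uint64_t vaddr, unsigned level_shift)
	{
		uint64_t mask = 0x1ffULL << level_shift;	/* nine index bits */

		/* byte offset = index * 8, hence the "- 3" in the shift */
		return (vaddr & mask) >> (level_shift - 3);
	}

	int main(void)
	{
		uint64_t va = 0xffffffff80200000ULL;	/* arbitrary example */

		printf("pgd offset %#llx\n", (unsigned long long)table_offset(va, 39));
		printf("pud offset %#llx\n", (unsigned long long)table_offset(va, 30));
		printf("pmd offset %#llx\n", (unsigned long long)table_offset(va, 21));
		printf("pte offset %#llx\n", (unsigned long long)table_offset(va, 12));
		return 0;
	}
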
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c new file mode 100644 index 000000000000..1200aaac403e --- /dev/null +++ b/arch/x86/kernel/setup64.c | |||
@@ -0,0 +1,289 @@ | |||
1 | /* | ||
2 | * X86-64 specific CPU setup. | ||
3 | * Copyright (C) 1995 Linus Torvalds | ||
4 | * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. | ||
5 | * See setup.c for older changelog. | ||
6 | */ | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/sched.h> | ||
10 | #include <linux/string.h> | ||
11 | #include <linux/bootmem.h> | ||
12 | #include <linux/bitops.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <asm/bootsetup.h> | ||
15 | #include <asm/pda.h> | ||
16 | #include <asm/pgtable.h> | ||
17 | #include <asm/processor.h> | ||
18 | #include <asm/desc.h> | ||
19 | #include <asm/atomic.h> | ||
20 | #include <asm/mmu_context.h> | ||
21 | #include <asm/smp.h> | ||
22 | #include <asm/i387.h> | ||
23 | #include <asm/percpu.h> | ||
24 | #include <asm/proto.h> | ||
25 | #include <asm/sections.h> | ||
26 | |||
27 | char x86_boot_params[BOOT_PARAM_SIZE] __initdata; | ||
28 | |||
29 | cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; | ||
30 | |||
31 | struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; | ||
32 | EXPORT_SYMBOL(_cpu_pda); | ||
33 | struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; | ||
34 | |||
35 | struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; | ||
36 | |||
37 | char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); | ||
38 | |||
39 | unsigned long __supported_pte_mask __read_mostly = ~0UL; | ||
40 | static int do_not_nx __cpuinitdata = 0; | ||
41 | |||
42 | /* noexec=on|off | ||
43 | Control non executable mappings for 64bit processes. | ||
44 | |||
45 | on Enable(default) | ||
46 | off Disable | ||
47 | */ | ||
48 | static int __init nonx_setup(char *str) | ||
49 | { | ||
50 | if (!str) | ||
51 | return -EINVAL; | ||
52 | if (!strncmp(str, "on", 2)) { | ||
53 | __supported_pte_mask |= _PAGE_NX; | ||
54 | do_not_nx = 0; | ||
55 | } else if (!strncmp(str, "off", 3)) { | ||
56 | do_not_nx = 1; | ||
57 | __supported_pte_mask &= ~_PAGE_NX; | ||
58 | } | ||
59 | return 0; | ||
60 | } | ||
61 | early_param("noexec", nonx_setup); | ||
62 | |||
63 | int force_personality32 = 0; | ||
64 | |||
65 | /* noexec32=on|off | ||
66 | Control non executable heap for 32bit processes. | ||
67 | To control the stack too use noexec=off | ||
68 | |||
69 | on PROT_READ does not imply PROT_EXEC for 32bit processes | ||
70 | off PROT_READ implies PROT_EXEC (default) | ||
71 | */ | ||
72 | static int __init nonx32_setup(char *str) | ||
73 | { | ||
74 | if (!strcmp(str, "on")) | ||
75 | force_personality32 &= ~READ_IMPLIES_EXEC; | ||
76 | else if (!strcmp(str, "off")) | ||
77 | force_personality32 |= READ_IMPLIES_EXEC; | ||
78 | return 1; | ||
79 | } | ||
80 | __setup("noexec32=", nonx32_setup); | ||
81 | |||
82 | /* | ||
83 | * Great future plan: | ||
84 | * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. | ||
85 | * Always point %gs to its beginning | ||
86 | */ | ||
87 | void __init setup_per_cpu_areas(void) | ||
88 | { | ||
89 | int i; | ||
90 | unsigned long size; | ||
91 | |||
92 | #ifdef CONFIG_HOTPLUG_CPU | ||
93 | prefill_possible_map(); | ||
94 | #endif | ||
95 | |||
96 | /* Copy section for each CPU (we discard the original) */ | ||
97 | size = PERCPU_ENOUGH_ROOM; | ||
98 | |||
99 | printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); | ||
100 | for_each_cpu_mask (i, cpu_possible_map) { | ||
101 | char *ptr; | ||
102 | |||
103 | if (!NODE_DATA(cpu_to_node(i))) { | ||
104 | printk("cpu with no node %d, num_online_nodes %d\n", | ||
105 | i, num_online_nodes()); | ||
106 | ptr = alloc_bootmem_pages(size); | ||
107 | } else { | ||
108 | ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); | ||
109 | } | ||
110 | if (!ptr) | ||
111 | panic("Cannot allocate cpu data for CPU %d\n", i); | ||
112 | cpu_pda(i)->data_offset = ptr - __per_cpu_start; | ||
113 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); | ||
114 | } | ||
115 | } | ||
116 | |||
117 | void pda_init(int cpu) | ||
118 | { | ||
119 | struct x8664_pda *pda = cpu_pda(cpu); | ||
120 | |||
121 | /* Set up data that may be needed in __get_free_pages early */ | ||
122 | asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); | ||
123 | /* Memory clobbers used to order PDA accesses */ | ||
124 | mb(); | ||
125 | wrmsrl(MSR_GS_BASE, pda); | ||
126 | mb(); | ||
127 | |||
128 | pda->cpunumber = cpu; | ||
129 | pda->irqcount = -1; | ||
130 | pda->kernelstack = | ||
131 | (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; | ||
132 | pda->active_mm = &init_mm; | ||
133 | pda->mmu_state = 0; | ||
134 | |||
135 | if (cpu == 0) { | ||
136 | /* others are initialized in smpboot.c */ | ||
137 | pda->pcurrent = &init_task; | ||
138 | pda->irqstackptr = boot_cpu_stack; | ||
139 | } else { | ||
140 | pda->irqstackptr = (char *) | ||
141 | __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); | ||
142 | if (!pda->irqstackptr) | ||
143 | panic("cannot allocate irqstack for cpu %d", cpu); | ||
144 | } | ||
145 | |||
146 | |||
147 | pda->irqstackptr += IRQSTACKSIZE-64; | ||
148 | } | ||
149 | |||
150 | char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ] | ||
151 | __attribute__((section(".bss.page_aligned"))); | ||
152 | |||
153 | extern asmlinkage void ignore_sysret(void); | ||
154 | |||
155 | /* May not be marked __init: used by software suspend */ | ||
156 | void syscall_init(void) | ||
157 | { | ||
158 | /* | ||
159 | * LSTAR and STAR live in a somewhat strange symbiosis. | ||
160 | * They both write to the same internal register. STAR allows setting CS/DS, | ||
161 | * but only a 32-bit target. LSTAR sets the 64-bit rip. | ||
162 | */ | ||
163 | wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); | ||
164 | wrmsrl(MSR_LSTAR, system_call); | ||
165 | wrmsrl(MSR_CSTAR, ignore_sysret); | ||
166 | |||
167 | #ifdef CONFIG_IA32_EMULATION | ||
168 | syscall32_cpu_init (); | ||
169 | #endif | ||
170 | |||
171 | /* Flags to clear on syscall */ | ||
172 | wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); | ||
173 | } | ||
174 | |||
175 | void __cpuinit check_efer(void) | ||
176 | { | ||
177 | unsigned long efer; | ||
178 | |||
179 | rdmsrl(MSR_EFER, efer); | ||
180 | if (!(efer & EFER_NX) || do_not_nx) { | ||
181 | __supported_pte_mask &= ~_PAGE_NX; | ||
182 | } | ||
183 | } | ||
184 | |||
185 | unsigned long kernel_eflags; | ||
186 | |||
187 | /* | ||
188 | * cpu_init() initializes state that is per-CPU. Some data is already | ||
189 | * initialized (naturally) in the bootstrap process, such as the GDT | ||
190 | * and IDT. We reload them nevertheless, this function acts as a | ||
191 | * 'CPU state barrier', nothing should get across. | ||
192 | * A lot of state is already set up in PDA init. | ||
193 | */ | ||
194 | void __cpuinit cpu_init (void) | ||
195 | { | ||
196 | int cpu = stack_smp_processor_id(); | ||
197 | struct tss_struct *t = &per_cpu(init_tss, cpu); | ||
198 | struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); | ||
199 | unsigned long v; | ||
200 | char *estacks = NULL; | ||
201 | struct task_struct *me; | ||
202 | int i; | ||
203 | |||
204 | /* CPU 0 is initialised in head64.c */ | ||
205 | if (cpu != 0) { | ||
206 | pda_init(cpu); | ||
207 | } else | ||
208 | estacks = boot_exception_stacks; | ||
209 | |||
210 | me = current; | ||
211 | |||
212 | if (cpu_test_and_set(cpu, cpu_initialized)) | ||
213 | panic("CPU#%d already initialized!\n", cpu); | ||
214 | |||
215 | printk("Initializing CPU#%d\n", cpu); | ||
216 | |||
217 | clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); | ||
218 | |||
219 | /* | ||
220 | * Initialize the per-CPU GDT with the boot GDT, | ||
221 | * and set up the GDT descriptor: | ||
222 | */ | ||
223 | if (cpu) | ||
224 | memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); | ||
225 | |||
226 | cpu_gdt_descr[cpu].size = GDT_SIZE; | ||
227 | asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); | ||
228 | asm volatile("lidt %0" :: "m" (idt_descr)); | ||
229 | |||
230 | memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); | ||
231 | syscall_init(); | ||
232 | |||
233 | wrmsrl(MSR_FS_BASE, 0); | ||
234 | wrmsrl(MSR_KERNEL_GS_BASE, 0); | ||
235 | barrier(); | ||
236 | |||
237 | check_efer(); | ||
238 | |||
239 | /* | ||
240 | * set up and load the per-CPU TSS | ||
241 | */ | ||
242 | for (v = 0; v < N_EXCEPTION_STACKS; v++) { | ||
243 | static const unsigned int order[N_EXCEPTION_STACKS] = { | ||
244 | [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, | ||
245 | [DEBUG_STACK - 1] = DEBUG_STACK_ORDER | ||
246 | }; | ||
247 | if (cpu) { | ||
248 | estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); | ||
249 | if (!estacks) | ||
250 | panic("Cannot allocate exception stack %ld %d\n", | ||
251 | v, cpu); | ||
252 | } | ||
253 | estacks += PAGE_SIZE << order[v]; | ||
254 | orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; | ||
255 | } | ||
256 | |||
257 | t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); | ||
258 | /* | ||
259 | * <= is required because the CPU will access up to | ||
260 | * 8 bits beyond the end of the IO permission bitmap. | ||
261 | */ | ||
262 | for (i = 0; i <= IO_BITMAP_LONGS; i++) | ||
263 | t->io_bitmap[i] = ~0UL; | ||
264 | |||
265 | atomic_inc(&init_mm.mm_count); | ||
266 | me->active_mm = &init_mm; | ||
267 | if (me->mm) | ||
268 | BUG(); | ||
269 | enter_lazy_tlb(&init_mm, me); | ||
270 | |||
271 | set_tss_desc(cpu, t); | ||
272 | load_TR_desc(); | ||
273 | load_LDT(&init_mm.context); | ||
274 | |||
275 | /* | ||
276 | * Clear all 6 debug registers: | ||
277 | */ | ||
278 | |||
279 | set_debugreg(0UL, 0); | ||
280 | set_debugreg(0UL, 1); | ||
281 | set_debugreg(0UL, 2); | ||
282 | set_debugreg(0UL, 3); | ||
283 | set_debugreg(0UL, 6); | ||
284 | set_debugreg(0UL, 7); | ||
285 | |||
286 | fpu_init(); | ||
287 | |||
288 | raw_local_save_flags(kernel_eflags); | ||
289 | } | ||
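One point worth spelling out from syscall_init() above: MSR_STAR packs two selector bases into one register — bits 63:48 hold the base used by SYSRET (the 32-bit user CS/SS pair) and bits 47:32 the base used by SYSCALL (the kernel CS/SS pair) — while MSR_LSTAR holds the 64-bit entry rip and MSR_SYSCALL_MASK lists the rflags bits cleared on every entry (TF, DF, IF and the IOPL field, 0x3000). A sketch of the packing, using hypothetical selector values rather than the kernel's __USER32_CS/__KERNEL_CS constants:

	/* Sketch of the MSR_STAR field layout programmed in syscall_init(). */
	#include <stdio.h>
	#include <stdint.h>

	static uint64_t pack_star(uint16_t user32_cs, uint16_t kernel_cs)
	{
		return ((uint64_t)user32_cs << 48) | ((uint64_t)kernel_cs << 32);
	}

	int main(void)
	{
		uint64_t star = pack_star(0x23, 0x10);	/* hypothetical selectors */

		printf("STAR            = %#018llx\n", (unsigned long long)star);
		printf("SYSRET  CS base = %#llx\n", (unsigned long long)((star >> 48) & 0xffff));
		printf("SYSCALL CS base = %#llx\n", (unsigned long long)((star >> 32) & 0xffff));
		return 0;
	}
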
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c new file mode 100644 index 000000000000..af838f6b0b7f --- /dev/null +++ b/arch/x86/kernel/setup_64.c | |||
@@ -0,0 +1,1117 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/kernel/setup.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * | ||
6 | * Nov 2001 Dave Jones <davej@suse.de> | ||
7 | * Forked from i386 setup code. | ||
8 | */ | ||
9 | |||
10 | /* | ||
11 | * This file handles the architecture-dependent parts of initialization | ||
12 | */ | ||
13 | |||
14 | #include <linux/errno.h> | ||
15 | #include <linux/sched.h> | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/mm.h> | ||
18 | #include <linux/stddef.h> | ||
19 | #include <linux/unistd.h> | ||
20 | #include <linux/ptrace.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/user.h> | ||
23 | #include <linux/a.out.h> | ||
24 | #include <linux/screen_info.h> | ||
25 | #include <linux/ioport.h> | ||
26 | #include <linux/delay.h> | ||
27 | #include <linux/init.h> | ||
28 | #include <linux/initrd.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/bootmem.h> | ||
31 | #include <linux/module.h> | ||
32 | #include <asm/processor.h> | ||
33 | #include <linux/console.h> | ||
34 | #include <linux/seq_file.h> | ||
35 | #include <linux/crash_dump.h> | ||
36 | #include <linux/root_dev.h> | ||
37 | #include <linux/pci.h> | ||
38 | #include <linux/acpi.h> | ||
39 | #include <linux/kallsyms.h> | ||
40 | #include <linux/edd.h> | ||
41 | #include <linux/mmzone.h> | ||
42 | #include <linux/kexec.h> | ||
43 | #include <linux/cpufreq.h> | ||
44 | #include <linux/dmi.h> | ||
45 | #include <linux/dma-mapping.h> | ||
46 | #include <linux/ctype.h> | ||
47 | |||
48 | #include <asm/mtrr.h> | ||
49 | #include <asm/uaccess.h> | ||
50 | #include <asm/system.h> | ||
51 | #include <asm/io.h> | ||
52 | #include <asm/smp.h> | ||
53 | #include <asm/msr.h> | ||
54 | #include <asm/desc.h> | ||
55 | #include <video/edid.h> | ||
56 | #include <asm/e820.h> | ||
57 | #include <asm/dma.h> | ||
58 | #include <asm/mpspec.h> | ||
59 | #include <asm/mmu_context.h> | ||
60 | #include <asm/bootsetup.h> | ||
61 | #include <asm/proto.h> | ||
62 | #include <asm/setup.h> | ||
63 | #include <asm/mach_apic.h> | ||
64 | #include <asm/numa.h> | ||
65 | #include <asm/sections.h> | ||
66 | #include <asm/dmi.h> | ||
67 | |||
68 | /* | ||
69 | * Machine setup.. | ||
70 | */ | ||
71 | |||
72 | struct cpuinfo_x86 boot_cpu_data __read_mostly; | ||
73 | EXPORT_SYMBOL(boot_cpu_data); | ||
74 | |||
75 | unsigned long mmu_cr4_features; | ||
76 | |||
77 | /* Boot loader ID as an integer, for the benefit of proc_dointvec */ | ||
78 | int bootloader_type; | ||
79 | |||
80 | unsigned long saved_video_mode; | ||
81 | |||
82 | int force_mwait __cpuinitdata; | ||
83 | |||
84 | /* | ||
85 | * Early DMI memory | ||
86 | */ | ||
87 | int dmi_alloc_index; | ||
88 | char dmi_alloc_data[DMI_MAX_DATA]; | ||
89 | |||
90 | /* | ||
91 | * Setup options | ||
92 | */ | ||
93 | struct screen_info screen_info; | ||
94 | EXPORT_SYMBOL(screen_info); | ||
95 | struct sys_desc_table_struct { | ||
96 | unsigned short length; | ||
97 | unsigned char table[0]; | ||
98 | }; | ||
99 | |||
100 | struct edid_info edid_info; | ||
101 | EXPORT_SYMBOL_GPL(edid_info); | ||
102 | |||
103 | extern int root_mountflags; | ||
104 | |||
105 | char __initdata command_line[COMMAND_LINE_SIZE]; | ||
106 | |||
107 | struct resource standard_io_resources[] = { | ||
108 | { .name = "dma1", .start = 0x00, .end = 0x1f, | ||
109 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
110 | { .name = "pic1", .start = 0x20, .end = 0x21, | ||
111 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
112 | { .name = "timer0", .start = 0x40, .end = 0x43, | ||
113 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
114 | { .name = "timer1", .start = 0x50, .end = 0x53, | ||
115 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
116 | { .name = "keyboard", .start = 0x60, .end = 0x6f, | ||
117 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
118 | { .name = "dma page reg", .start = 0x80, .end = 0x8f, | ||
119 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
120 | { .name = "pic2", .start = 0xa0, .end = 0xa1, | ||
121 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
122 | { .name = "dma2", .start = 0xc0, .end = 0xdf, | ||
123 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
124 | { .name = "fpu", .start = 0xf0, .end = 0xff, | ||
125 | .flags = IORESOURCE_BUSY | IORESOURCE_IO } | ||
126 | }; | ||
127 | |||
128 | #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) | ||
129 | |||
130 | struct resource data_resource = { | ||
131 | .name = "Kernel data", | ||
132 | .start = 0, | ||
133 | .end = 0, | ||
134 | .flags = IORESOURCE_RAM, | ||
135 | }; | ||
136 | struct resource code_resource = { | ||
137 | .name = "Kernel code", | ||
138 | .start = 0, | ||
139 | .end = 0, | ||
140 | .flags = IORESOURCE_RAM, | ||
141 | }; | ||
142 | |||
143 | #ifdef CONFIG_PROC_VMCORE | ||
144 | /* elfcorehdr= specifies the location of elf core header | ||
145 | * stored by the crashed kernel. This option will be passed | ||
146 | * by kexec loader to the capture kernel. | ||
147 | */ | ||
148 | static int __init setup_elfcorehdr(char *arg) | ||
149 | { | ||
150 | char *end; | ||
151 | if (!arg) | ||
152 | return -EINVAL; | ||
153 | elfcorehdr_addr = memparse(arg, &end); | ||
154 | return end > arg ? 0 : -EINVAL; | ||
155 | } | ||
156 | early_param("elfcorehdr", setup_elfcorehdr); | ||
157 | #endif | ||
158 | |||
159 | #ifndef CONFIG_NUMA | ||
160 | static void __init | ||
161 | contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | ||
162 | { | ||
163 | unsigned long bootmap_size, bootmap; | ||
164 | |||
165 | bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; | ||
166 | bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); | ||
167 | if (bootmap == -1L) | ||
168 | panic("Cannot find bootmem map of size %ld\n",bootmap_size); | ||
169 | bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); | ||
170 | e820_register_active_regions(0, start_pfn, end_pfn); | ||
171 | free_bootmem_with_active_regions(0, end_pfn); | ||
172 | reserve_bootmem(bootmap, bootmap_size); | ||
173 | } | ||
174 | #endif | ||
175 | |||
176 | #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) | ||
177 | struct edd edd; | ||
178 | #ifdef CONFIG_EDD_MODULE | ||
179 | EXPORT_SYMBOL(edd); | ||
180 | #endif | ||
181 | /** | ||
182 | * copy_edd() - Copy the BIOS EDD information | ||
183 | * from boot_params into a safe place. | ||
184 | * | ||
185 | */ | ||
186 | static inline void copy_edd(void) | ||
187 | { | ||
188 | memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature)); | ||
189 | memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info)); | ||
190 | edd.mbr_signature_nr = EDD_MBR_SIG_NR; | ||
191 | edd.edd_info_nr = EDD_NR; | ||
192 | } | ||
193 | #else | ||
194 | static inline void copy_edd(void) | ||
195 | { | ||
196 | } | ||
197 | #endif | ||
198 | |||
199 | #define EBDA_ADDR_POINTER 0x40E | ||
200 | |||
201 | unsigned __initdata ebda_addr; | ||
202 | unsigned __initdata ebda_size; | ||
203 | |||
204 | static void discover_ebda(void) | ||
205 | { | ||
206 | /* | ||
207 | * there is a real-mode segmented pointer pointing to the | ||
208 | * 4K EBDA area at 0x40E | ||
209 | */ | ||
210 | ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); | ||
211 | ebda_addr <<= 4; | ||
212 | |||
213 | ebda_size = *(unsigned short *)__va(ebda_addr); | ||
214 | |||
215 | /* Round EBDA up to pages */ | ||
216 | if (ebda_size == 0) | ||
217 | ebda_size = 1; | ||
218 | ebda_size <<= 10; | ||
219 | ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); | ||
220 | if (ebda_size > 64*1024) | ||
221 | ebda_size = 64*1024; | ||
222 | } | ||
223 | |||
224 | void __init setup_arch(char **cmdline_p) | ||
225 | { | ||
226 | printk(KERN_INFO "Command line: %s\n", boot_command_line); | ||
227 | |||
228 | ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); | ||
229 | screen_info = SCREEN_INFO; | ||
230 | edid_info = EDID_INFO; | ||
231 | saved_video_mode = SAVED_VIDEO_MODE; | ||
232 | bootloader_type = LOADER_TYPE; | ||
233 | |||
234 | #ifdef CONFIG_BLK_DEV_RAM | ||
235 | rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; | ||
236 | rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); | ||
237 | rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); | ||
238 | #endif | ||
239 | setup_memory_region(); | ||
240 | copy_edd(); | ||
241 | |||
242 | if (!MOUNT_ROOT_RDONLY) | ||
243 | root_mountflags &= ~MS_RDONLY; | ||
244 | init_mm.start_code = (unsigned long) &_text; | ||
245 | init_mm.end_code = (unsigned long) &_etext; | ||
246 | init_mm.end_data = (unsigned long) &_edata; | ||
247 | init_mm.brk = (unsigned long) &_end; | ||
248 | |||
249 | code_resource.start = virt_to_phys(&_text); | ||
250 | code_resource.end = virt_to_phys(&_etext)-1; | ||
251 | data_resource.start = virt_to_phys(&_etext); | ||
252 | data_resource.end = virt_to_phys(&_edata)-1; | ||
253 | |||
254 | early_identify_cpu(&boot_cpu_data); | ||
255 | |||
256 | strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); | ||
257 | *cmdline_p = command_line; | ||
258 | |||
259 | parse_early_param(); | ||
260 | |||
261 | finish_e820_parsing(); | ||
262 | |||
263 | e820_register_active_regions(0, 0, -1UL); | ||
264 | /* | ||
265 | * partially used pages are not usable - thus | ||
266 | * we are rounding upwards: | ||
267 | */ | ||
268 | end_pfn = e820_end_of_ram(); | ||
269 | num_physpages = end_pfn; | ||
270 | |||
271 | check_efer(); | ||
272 | |||
273 | discover_ebda(); | ||
274 | |||
275 | init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); | ||
276 | |||
277 | dmi_scan_machine(); | ||
278 | |||
279 | #ifdef CONFIG_ACPI | ||
280 | /* | ||
281 | * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). | ||
282 | * Call this early for SRAT node setup. | ||
283 | */ | ||
284 | acpi_boot_table_init(); | ||
285 | #endif | ||
286 | |||
287 | /* How many end-of-memory variables you have, grandma! */ | ||
288 | max_low_pfn = end_pfn; | ||
289 | max_pfn = end_pfn; | ||
290 | high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1; | ||
291 | |||
292 | /* Remove active ranges so rediscovery with NUMA-awareness happens */ | ||
293 | remove_all_active_ranges(); | ||
294 | |||
295 | #ifdef CONFIG_ACPI_NUMA | ||
296 | /* | ||
297 | * Parse SRAT to discover nodes. | ||
298 | */ | ||
299 | acpi_numa_init(); | ||
300 | #endif | ||
301 | |||
302 | #ifdef CONFIG_NUMA | ||
303 | numa_initmem_init(0, end_pfn); | ||
304 | #else | ||
305 | contig_initmem_init(0, end_pfn); | ||
306 | #endif | ||
307 | |||
308 | /* Reserve direct mapping */ | ||
309 | reserve_bootmem_generic(table_start << PAGE_SHIFT, | ||
310 | (table_end - table_start) << PAGE_SHIFT); | ||
311 | |||
312 | /* reserve kernel */ | ||
313 | reserve_bootmem_generic(__pa_symbol(&_text), | ||
314 | __pa_symbol(&_end) - __pa_symbol(&_text)); | ||
315 | |||
316 | /* | ||
317 | * reserve physical page 0 - it's a special BIOS page on many boxes, | ||
318 | * enabling clean reboots, SMP operation, laptop functions. | ||
319 | */ | ||
320 | reserve_bootmem_generic(0, PAGE_SIZE); | ||
321 | |||
322 | /* reserve ebda region */ | ||
323 | if (ebda_addr) | ||
324 | reserve_bootmem_generic(ebda_addr, ebda_size); | ||
325 | #ifdef CONFIG_NUMA | ||
326 | /* reserve nodemap region */ | ||
327 | if (nodemap_addr) | ||
328 | reserve_bootmem_generic(nodemap_addr, nodemap_size); | ||
329 | #endif | ||
330 | |||
331 | #ifdef CONFIG_SMP | ||
332 | /* Reserve SMP trampoline */ | ||
333 | reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE); | ||
334 | #endif | ||
335 | |||
336 | #ifdef CONFIG_ACPI_SLEEP | ||
337 | /* | ||
338 | * Reserve low memory region for sleep support. | ||
339 | */ | ||
340 | acpi_reserve_bootmem(); | ||
341 | #endif | ||
342 | /* | ||
343 | * Find and reserve possible boot-time SMP configuration: | ||
344 | */ | ||
345 | find_smp_config(); | ||
346 | #ifdef CONFIG_BLK_DEV_INITRD | ||
347 | if (LOADER_TYPE && INITRD_START) { | ||
348 | if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { | ||
349 | reserve_bootmem_generic(INITRD_START, INITRD_SIZE); | ||
350 | initrd_start = INITRD_START + PAGE_OFFSET; | ||
351 | initrd_end = initrd_start+INITRD_SIZE; | ||
352 | } | ||
353 | else { | ||
354 | printk(KERN_ERR "initrd extends beyond end of memory " | ||
355 | "(0x%08lx > 0x%08lx)\ndisabling initrd\n", | ||
356 | (unsigned long)(INITRD_START + INITRD_SIZE), | ||
357 | (unsigned long)(end_pfn << PAGE_SHIFT)); | ||
358 | initrd_start = 0; | ||
359 | } | ||
360 | } | ||
361 | #endif | ||
362 | #ifdef CONFIG_KEXEC | ||
363 | if (crashk_res.start != crashk_res.end) { | ||
364 | reserve_bootmem_generic(crashk_res.start, | ||
365 | crashk_res.end - crashk_res.start + 1); | ||
366 | } | ||
367 | #endif | ||
368 | |||
369 | paging_init(); | ||
370 | |||
371 | #ifdef CONFIG_PCI | ||
372 | early_quirks(); | ||
373 | #endif | ||
374 | |||
375 | /* | ||
376 | * set this early, so we don't allocate cpu0 | ||
377 | * if the MADT list doesn't list the BSP first. | ||
378 | * mpparse.c/MP_processor_info() allocates logical cpu numbers. | ||
379 | */ | ||
380 | cpu_set(0, cpu_present_map); | ||
381 | #ifdef CONFIG_ACPI | ||
382 | /* | ||
383 | * Read APIC and some other early information from ACPI tables. | ||
384 | */ | ||
385 | acpi_boot_init(); | ||
386 | #endif | ||
387 | |||
388 | init_cpu_to_node(); | ||
389 | |||
390 | /* | ||
391 | * get boot-time SMP configuration: | ||
392 | */ | ||
393 | if (smp_found_config) | ||
394 | get_smp_config(); | ||
395 | init_apic_mappings(); | ||
396 | |||
397 | /* | ||
398 | * We trust e820 completely. No explicit ROM probing in memory. | ||
399 | */ | ||
400 | e820_reserve_resources(); | ||
401 | e820_mark_nosave_regions(); | ||
402 | |||
403 | { | ||
404 | unsigned i; | ||
405 | /* request I/O space for devices used on all i[345]86 PCs */ | ||
406 | for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) | ||
407 | request_resource(&ioport_resource, &standard_io_resources[i]); | ||
408 | } | ||
409 | |||
410 | e820_setup_gap(); | ||
411 | |||
412 | #ifdef CONFIG_VT | ||
413 | #if defined(CONFIG_VGA_CONSOLE) | ||
414 | conswitchp = &vga_con; | ||
415 | #elif defined(CONFIG_DUMMY_CONSOLE) | ||
416 | conswitchp = &dummy_con; | ||
417 | #endif | ||
418 | #endif | ||
419 | } | ||
420 | |||
421 | static int __cpuinit get_model_name(struct cpuinfo_x86 *c) | ||
422 | { | ||
423 | unsigned int *v; | ||
424 | |||
425 | if (c->extended_cpuid_level < 0x80000004) | ||
426 | return 0; | ||
427 | |||
428 | v = (unsigned int *) c->x86_model_id; | ||
429 | cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); | ||
430 | cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); | ||
431 | cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); | ||
432 | c->x86_model_id[48] = 0; | ||
433 | return 1; | ||
434 | } | ||
435 | |||
436 | |||
437 | static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) | ||
438 | { | ||
439 | unsigned int n, dummy, eax, ebx, ecx, edx; | ||
440 | |||
441 | n = c->extended_cpuid_level; | ||
442 | |||
443 | if (n >= 0x80000005) { | ||
444 | cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); | ||
445 | printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", | ||
446 | edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); | ||
447 | c->x86_cache_size=(ecx>>24)+(edx>>24); | ||
448 | /* On K8 L1 TLB is inclusive, so don't count it */ | ||
449 | c->x86_tlbsize = 0; | ||
450 | } | ||
451 | |||
452 | if (n >= 0x80000006) { | ||
453 | cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); | ||
454 | ecx = cpuid_ecx(0x80000006); | ||
455 | c->x86_cache_size = ecx >> 16; | ||
456 | c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); | ||
457 | |||
458 | printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", | ||
459 | c->x86_cache_size, ecx & 0xFF); | ||
460 | } | ||
461 | |||
462 | if (n >= 0x80000007) | ||
463 | cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); | ||
464 | if (n >= 0x80000008) { | ||
465 | cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); | ||
466 | c->x86_virt_bits = (eax >> 8) & 0xff; | ||
467 | c->x86_phys_bits = eax & 0xff; | ||
468 | } | ||
469 | } | ||
470 | |||
471 | #ifdef CONFIG_NUMA | ||
472 | static int nearby_node(int apicid) | ||
473 | { | ||
474 | int i; | ||
475 | for (i = apicid - 1; i >= 0; i--) { | ||
476 | int node = apicid_to_node[i]; | ||
477 | if (node != NUMA_NO_NODE && node_online(node)) | ||
478 | return node; | ||
479 | } | ||
480 | for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { | ||
481 | int node = apicid_to_node[i]; | ||
482 | if (node != NUMA_NO_NODE && node_online(node)) | ||
483 | return node; | ||
484 | } | ||
485 | return first_node(node_online_map); /* Shouldn't happen */ | ||
486 | } | ||
487 | #endif | ||
488 | |||
489 | /* | ||
490 | * On an AMD dual-core setup the lower bits of the APIC id distinguish the cores. | ||
491 | * Assumes the number of cores is a power of two. | ||
492 | */ | ||
493 | static void __init amd_detect_cmp(struct cpuinfo_x86 *c) | ||
494 | { | ||
495 | #ifdef CONFIG_SMP | ||
496 | unsigned bits; | ||
497 | #ifdef CONFIG_NUMA | ||
498 | int cpu = smp_processor_id(); | ||
499 | int node = 0; | ||
500 | unsigned apicid = hard_smp_processor_id(); | ||
501 | #endif | ||
502 | unsigned ecx = cpuid_ecx(0x80000008); | ||
503 | |||
504 | c->x86_max_cores = (ecx & 0xff) + 1; | ||
505 | |||
506 | /* Is the CPU telling us the core id bit shift? */ | ||
507 | bits = (ecx >> 12) & 0xF; | ||
508 | |||
509 | /* Otherwise recompute */ | ||
510 | if (bits == 0) { | ||
511 | while ((1 << bits) < c->x86_max_cores) | ||
512 | bits++; | ||
513 | } | ||
514 | |||
515 | /* Low order bits define the core id (index of core in socket) */ | ||
516 | c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); | ||
517 | /* Convert the APIC ID into the socket ID */ | ||
518 | c->phys_proc_id = phys_pkg_id(bits); | ||
519 | |||
520 | #ifdef CONFIG_NUMA | ||
521 | node = c->phys_proc_id; | ||
522 | if (apicid_to_node[apicid] != NUMA_NO_NODE) | ||
523 | node = apicid_to_node[apicid]; | ||
524 | if (!node_online(node)) { | ||
525 | /* Two possibilities here: | ||
526 | - The CPU is missing memory and no node was created. | ||
527 | In that case try picking one from a nearby CPU | ||
528 | - The APIC IDs differ from the HyperTransport node IDs | ||
529 | which the K8 northbridge parsing fills in. | ||
530 | Assume they are all increased by a constant offset, | ||
531 | but in the same order as the HT nodeids. | ||
532 | If that doesn't result in a usable node fall back to the | ||
533 | path for the previous case. */ | ||
534 | int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits); | ||
535 | if (ht_nodeid >= 0 && | ||
536 | apicid_to_node[ht_nodeid] != NUMA_NO_NODE) | ||
537 | node = apicid_to_node[ht_nodeid]; | ||
538 | /* Pick a nearby node */ | ||
539 | if (!node_online(node)) | ||
540 | node = nearby_node(apicid); | ||
541 | } | ||
542 | numa_set_node(cpu, node); | ||
543 | |||
544 | printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); | ||
545 | #endif | ||
546 | #endif | ||
547 | } | ||
548 | |||
549 | static void __cpuinit init_amd(struct cpuinfo_x86 *c) | ||
550 | { | ||
551 | unsigned level; | ||
552 | |||
553 | #ifdef CONFIG_SMP | ||
554 | unsigned long value; | ||
555 | |||
556 | /* | ||
557 | * Disable TLB flush filter by setting HWCR.FFDIS on K8 | ||
558 | * bit 6 of msr C001_0015 | ||
559 | * | ||
560 | * Errata 63 for SH-B3 steppings | ||
561 | * Errata 122 for all steppings (F+ have it disabled by default) | ||
562 | */ | ||
563 | if (c->x86 == 15) { | ||
564 | rdmsrl(MSR_K8_HWCR, value); | ||
565 | value |= 1 << 6; | ||
566 | wrmsrl(MSR_K8_HWCR, value); | ||
567 | } | ||
568 | #endif | ||
569 | |||
570 | /* Bit 31 in normal CPUID is used for a nonstandard 3DNow ID; | ||
571 | 3DNow is identified by bit 31 in extended CPUID (1*32+31) anyway */ | ||
572 | clear_bit(0*32+31, &c->x86_capability); | ||
573 | |||
574 | /* On C+ stepping K8 rep microcode works well for copy/memset */ | ||
575 | level = cpuid_eax(1); | ||
576 | if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) | ||
577 | set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); | ||
578 | if (c->x86 == 0x10) | ||
579 | set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); | ||
580 | |||
581 | /* Enable workaround for FXSAVE leak */ | ||
582 | if (c->x86 >= 6) | ||
583 | set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); | ||
584 | |||
585 | level = get_model_name(c); | ||
586 | if (!level) { | ||
587 | switch (c->x86) { | ||
588 | case 15: | ||
589 | /* Should distinguish models here, but this is only | ||
590 | a fallback anyway. */ | ||
591 | strcpy(c->x86_model_id, "Hammer"); | ||
592 | break; | ||
593 | } | ||
594 | } | ||
595 | display_cacheinfo(c); | ||
596 | |||
597 | /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ | ||
598 | if (c->x86_power & (1<<8)) | ||
599 | set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); | ||
600 | |||
601 | /* Multi core CPU? */ | ||
602 | if (c->extended_cpuid_level >= 0x80000008) | ||
603 | amd_detect_cmp(c); | ||
604 | |||
605 | if (c->extended_cpuid_level >= 0x80000006 && | ||
606 | (cpuid_edx(0x80000006) & 0xf000)) | ||
607 | num_cache_leaves = 4; | ||
608 | else | ||
609 | num_cache_leaves = 3; | ||
610 | |||
611 | if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) | ||
612 | set_bit(X86_FEATURE_K8, &c->x86_capability); | ||
613 | |||
614 | /* RDTSC can be speculated around */ | ||
615 | clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); | ||
616 | |||
617 | /* Family 10 doesn't support C states in MWAIT so don't use it */ | ||
618 | if (c->x86 == 0x10 && !force_mwait) | ||
619 | clear_bit(X86_FEATURE_MWAIT, &c->x86_capability); | ||
620 | } | ||
621 | |||
622 | static void __cpuinit detect_ht(struct cpuinfo_x86 *c) | ||
623 | { | ||
624 | #ifdef CONFIG_SMP | ||
625 | u32 eax, ebx, ecx, edx; | ||
626 | int index_msb, core_bits; | ||
627 | |||
628 | cpuid(1, &eax, &ebx, &ecx, &edx); | ||
629 | |||
630 | |||
631 | if (!cpu_has(c, X86_FEATURE_HT)) | ||
632 | return; | ||
633 | if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) | ||
634 | goto out; | ||
635 | |||
636 | smp_num_siblings = (ebx & 0xff0000) >> 16; | ||
637 | |||
638 | if (smp_num_siblings == 1) { | ||
639 | printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); | ||
640 | } else if (smp_num_siblings > 1 ) { | ||
641 | |||
642 | if (smp_num_siblings > NR_CPUS) { | ||
643 | printk(KERN_WARNING "CPU: Unsupported number of siblings %d", smp_num_siblings); | ||
644 | smp_num_siblings = 1; | ||
645 | return; | ||
646 | } | ||
647 | |||
648 | index_msb = get_count_order(smp_num_siblings); | ||
649 | c->phys_proc_id = phys_pkg_id(index_msb); | ||
650 | |||
651 | smp_num_siblings = smp_num_siblings / c->x86_max_cores; | ||
652 | |||
653 | index_msb = get_count_order(smp_num_siblings) ; | ||
654 | |||
655 | core_bits = get_count_order(c->x86_max_cores); | ||
656 | |||
657 | c->cpu_core_id = phys_pkg_id(index_msb) & | ||
658 | ((1 << core_bits) - 1); | ||
659 | } | ||
660 | out: | ||
661 | if ((c->x86_max_cores * smp_num_siblings) > 1) { | ||
662 | printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); | ||
663 | printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); | ||
664 | } | ||
665 | |||
666 | #endif | ||
667 | } | ||
668 | |||
669 | /* | ||
670 | * find out the number of processor cores on the die | ||
671 | */ | ||
672 | static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) | ||
673 | { | ||
674 | unsigned int eax, t; | ||
675 | |||
676 | if (c->cpuid_level < 4) | ||
677 | return 1; | ||
678 | |||
679 | cpuid_count(4, 0, &eax, &t, &t, &t); | ||
680 | |||
681 | if (eax & 0x1f) | ||
682 | return ((eax >> 26) + 1); | ||
683 | else | ||
684 | return 1; | ||
685 | } | ||
686 | |||
687 | static void srat_detect_node(void) | ||
688 | { | ||
689 | #ifdef CONFIG_NUMA | ||
690 | unsigned node; | ||
691 | int cpu = smp_processor_id(); | ||
692 | int apicid = hard_smp_processor_id(); | ||
693 | |||
694 | /* For now, don't do the funky fallback heuristics | ||
695 | the AMD version employs. */ | ||
696 | node = apicid_to_node[apicid]; | ||
697 | if (node == NUMA_NO_NODE) | ||
698 | node = first_node(node_online_map); | ||
699 | numa_set_node(cpu, node); | ||
700 | |||
701 | printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); | ||
702 | #endif | ||
703 | } | ||
704 | |||
705 | static void __cpuinit init_intel(struct cpuinfo_x86 *c) | ||
706 | { | ||
707 | /* Cache sizes */ | ||
708 | unsigned n; | ||
709 | |||
710 | init_intel_cacheinfo(c); | ||
711 | if (c->cpuid_level > 9 ) { | ||
712 | unsigned eax = cpuid_eax(10); | ||
713 | /* Check for version and the number of counters */ | ||
714 | if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) | ||
715 | set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); | ||
716 | } | ||
717 | |||
718 | if (cpu_has_ds) { | ||
719 | unsigned int l1, l2; | ||
720 | rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); | ||
721 | if (!(l1 & (1<<11))) | ||
722 | set_bit(X86_FEATURE_BTS, c->x86_capability); | ||
723 | if (!(l1 & (1<<12))) | ||
724 | set_bit(X86_FEATURE_PEBS, c->x86_capability); | ||
725 | } | ||
726 | |||
727 | n = c->extended_cpuid_level; | ||
728 | if (n >= 0x80000008) { | ||
729 | unsigned eax = cpuid_eax(0x80000008); | ||
730 | c->x86_virt_bits = (eax >> 8) & 0xff; | ||
731 | c->x86_phys_bits = eax & 0xff; | ||
732 | /* CPUID workaround for Intel 0F34 CPU */ | ||
733 | if (c->x86_vendor == X86_VENDOR_INTEL && | ||
734 | c->x86 == 0xF && c->x86_model == 0x3 && | ||
735 | c->x86_mask == 0x4) | ||
736 | c->x86_phys_bits = 36; | ||
737 | } | ||
738 | |||
739 | if (c->x86 == 15) | ||
740 | c->x86_cache_alignment = c->x86_clflush_size * 2; | ||
741 | if ((c->x86 == 0xf && c->x86_model >= 0x03) || | ||
742 | (c->x86 == 0x6 && c->x86_model >= 0x0e)) | ||
743 | set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); | ||
744 | if (c->x86 == 6) | ||
745 | set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); | ||
746 | if (c->x86 == 15) | ||
747 | set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); | ||
748 | else | ||
749 | clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); | ||
750 | c->x86_max_cores = intel_num_cpu_cores(c); | ||
751 | |||
752 | srat_detect_node(); | ||
753 | } | ||
754 | |||
755 | static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) | ||
756 | { | ||
757 | char *v = c->x86_vendor_id; | ||
758 | |||
759 | if (!strcmp(v, "AuthenticAMD")) | ||
760 | c->x86_vendor = X86_VENDOR_AMD; | ||
761 | else if (!strcmp(v, "GenuineIntel")) | ||
762 | c->x86_vendor = X86_VENDOR_INTEL; | ||
763 | else | ||
764 | c->x86_vendor = X86_VENDOR_UNKNOWN; | ||
765 | } | ||
766 | |||
767 | struct cpu_model_info { | ||
768 | int vendor; | ||
769 | int family; | ||
770 | char *model_names[16]; | ||
771 | }; | ||
772 | |||
773 | /* Do some early cpuid on the boot CPU to get the parameters that are | ||
774 | needed before check_bugs. Everything advanced is in identify_cpu | ||
775 | below. */ | ||
776 | void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) | ||
777 | { | ||
778 | u32 tfms; | ||
779 | |||
780 | c->loops_per_jiffy = loops_per_jiffy; | ||
781 | c->x86_cache_size = -1; | ||
782 | c->x86_vendor = X86_VENDOR_UNKNOWN; | ||
783 | c->x86_model = c->x86_mask = 0; /* So far unknown... */ | ||
784 | c->x86_vendor_id[0] = '\0'; /* Unset */ | ||
785 | c->x86_model_id[0] = '\0'; /* Unset */ | ||
786 | c->x86_clflush_size = 64; | ||
787 | c->x86_cache_alignment = c->x86_clflush_size; | ||
788 | c->x86_max_cores = 1; | ||
789 | c->extended_cpuid_level = 0; | ||
790 | memset(&c->x86_capability, 0, sizeof c->x86_capability); | ||
791 | |||
792 | /* Get vendor name */ | ||
793 | cpuid(0x00000000, (unsigned int *)&c->cpuid_level, | ||
794 | (unsigned int *)&c->x86_vendor_id[0], | ||
795 | (unsigned int *)&c->x86_vendor_id[8], | ||
796 | (unsigned int *)&c->x86_vendor_id[4]); | ||
797 | |||
798 | get_cpu_vendor(c); | ||
799 | |||
800 | /* Initialize the standard set of capabilities */ | ||
801 | /* Note that the vendor-specific code below might override */ | ||
802 | |||
803 | /* Intel-defined flags: level 0x00000001 */ | ||
804 | if (c->cpuid_level >= 0x00000001) { | ||
805 | __u32 misc; | ||
806 | cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], | ||
807 | &c->x86_capability[0]); | ||
808 | c->x86 = (tfms >> 8) & 0xf; | ||
809 | c->x86_model = (tfms >> 4) & 0xf; | ||
810 | c->x86_mask = tfms & 0xf; | ||
811 | if (c->x86 == 0xf) | ||
812 | c->x86 += (tfms >> 20) & 0xff; | ||
813 | if (c->x86 >= 0x6) | ||
814 | c->x86_model += ((tfms >> 16) & 0xF) << 4; | ||
815 | if (c->x86_capability[0] & (1<<19)) | ||
816 | c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; | ||
817 | } else { | ||
818 | /* Have CPUID level 0 only - unheard of */ | ||
819 | c->x86 = 4; | ||
820 | } | ||
821 | |||
822 | #ifdef CONFIG_SMP | ||
823 | c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; | ||
824 | #endif | ||
825 | } | ||
826 | |||
827 | /* | ||
828 | * This does the hard work of actually picking apart the CPU stuff... | ||
829 | */ | ||
830 | void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | ||
831 | { | ||
832 | int i; | ||
833 | u32 xlvl; | ||
834 | |||
835 | early_identify_cpu(c); | ||
836 | |||
837 | /* AMD-defined flags: level 0x80000001 */ | ||
838 | xlvl = cpuid_eax(0x80000000); | ||
839 | c->extended_cpuid_level = xlvl; | ||
840 | if ((xlvl & 0xffff0000) == 0x80000000) { | ||
841 | if (xlvl >= 0x80000001) { | ||
842 | c->x86_capability[1] = cpuid_edx(0x80000001); | ||
843 | c->x86_capability[6] = cpuid_ecx(0x80000001); | ||
844 | } | ||
845 | if (xlvl >= 0x80000004) | ||
846 | get_model_name(c); /* Default name */ | ||
847 | } | ||
848 | |||
849 | /* Transmeta-defined flags: level 0x80860001 */ | ||
850 | xlvl = cpuid_eax(0x80860000); | ||
851 | if ((xlvl & 0xffff0000) == 0x80860000) { | ||
852 | /* Don't set x86_cpuid_level here for now, to avoid confusion. */ | ||
853 | if (xlvl >= 0x80860001) | ||
854 | c->x86_capability[2] = cpuid_edx(0x80860001); | ||
855 | } | ||
856 | |||
857 | init_scattered_cpuid_features(c); | ||
858 | |||
859 | c->apicid = phys_pkg_id(0); | ||
860 | |||
861 | /* | ||
862 | * Vendor-specific initialization. In this section we | ||
863 | * canonicalize the feature flags, meaning if there are | ||
864 | * features a certain CPU supports which CPUID doesn't | ||
865 | * tell us, CPUID claiming incorrect flags, or other bugs, | ||
866 | * we handle them here. | ||
867 | * | ||
868 | * At the end of this section, c->x86_capability better | ||
869 | * indicate the features this CPU genuinely supports! | ||
870 | */ | ||
871 | switch (c->x86_vendor) { | ||
872 | case X86_VENDOR_AMD: | ||
873 | init_amd(c); | ||
874 | break; | ||
875 | |||
876 | case X86_VENDOR_INTEL: | ||
877 | init_intel(c); | ||
878 | break; | ||
879 | |||
880 | case X86_VENDOR_UNKNOWN: | ||
881 | default: | ||
882 | display_cacheinfo(c); | ||
883 | break; | ||
884 | } | ||
885 | |||
886 | select_idle_routine(c); | ||
887 | detect_ht(c); | ||
888 | |||
889 | /* | ||
890 | * On SMP, boot_cpu_data holds the common feature set between | ||
891 | * all CPUs; so make sure that we indicate which features are | ||
892 | * common between the CPUs. The first time this routine gets | ||
893 | * executed, c == &boot_cpu_data. | ||
894 | */ | ||
895 | if (c != &boot_cpu_data) { | ||
896 | /* AND the already accumulated flags with these */ | ||
897 | for (i = 0 ; i < NCAPINTS ; i++) | ||
898 | boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; | ||
899 | } | ||
900 | |||
901 | #ifdef CONFIG_X86_MCE | ||
902 | mcheck_init(c); | ||
903 | #endif | ||
904 | if (c != &boot_cpu_data) | ||
905 | mtrr_ap_init(); | ||
906 | #ifdef CONFIG_NUMA | ||
907 | numa_add_cpu(smp_processor_id()); | ||
908 | #endif | ||
909 | } | ||
910 | |||
911 | |||
912 | void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) | ||
913 | { | ||
914 | if (c->x86_model_id[0]) | ||
915 | printk("%s", c->x86_model_id); | ||
916 | |||
917 | if (c->x86_mask || c->cpuid_level >= 0) | ||
918 | printk(" stepping %02x\n", c->x86_mask); | ||
919 | else | ||
920 | printk("\n"); | ||
921 | } | ||
922 | |||
923 | /* | ||
924 | * Get CPU information for use by the procfs. | ||
925 | */ | ||
926 | |||
927 | static int show_cpuinfo(struct seq_file *m, void *v) | ||
928 | { | ||
929 | struct cpuinfo_x86 *c = v; | ||
930 | |||
931 | /* | ||
932 | * These flag bits must match the definitions in <asm/cpufeature.h>. | ||
933 | * NULL means this bit is undefined or reserved; either way it doesn't | ||
934 | * have meaning as far as Linux is concerned. Note that it's important | ||
935 | * to realize there is a difference between this table and CPUID -- if | ||
936 | * applications want to get the raw CPUID data, they should access | ||
937 | * /dev/cpu/<cpu_nr>/cpuid instead. | ||
938 | */ | ||
939 | static char *x86_cap_flags[] = { | ||
940 | /* Intel-defined */ | ||
941 | "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", | ||
942 | "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", | ||
943 | "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", | ||
944 | "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", | ||
945 | |||
946 | /* AMD-defined */ | ||
947 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
948 | NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, | ||
949 | NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, | ||
950 | NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", | ||
951 | "3dnowext", "3dnow", | ||
952 | |||
953 | /* Transmeta-defined */ | ||
954 | "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, | ||
955 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
956 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
957 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
958 | |||
959 | /* Other (Linux-defined) */ | ||
960 | "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", | ||
961 | NULL, NULL, NULL, NULL, | ||
962 | "constant_tsc", "up", NULL, "arch_perfmon", | ||
963 | "pebs", "bts", NULL, "sync_rdtsc", | ||
964 | "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
965 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
966 | |||
967 | /* Intel-defined (#2) */ | ||
968 | "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", | ||
969 | "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, | ||
970 | NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt", | ||
971 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
972 | |||
973 | /* VIA/Cyrix/Centaur-defined */ | ||
974 | NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", | ||
975 | "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, | ||
976 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
977 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
978 | |||
979 | /* AMD-defined (#2) */ | ||
980 | "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy", | ||
981 | "altmovcr8", "abm", "sse4a", | ||
982 | "misalignsse", "3dnowprefetch", | ||
983 | "osvw", "ibs", NULL, NULL, NULL, NULL, | ||
984 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
985 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
986 | |||
987 | /* Auxiliary (Linux-defined) */ | ||
988 | "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
989 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
990 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
991 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
992 | }; | ||
993 | static char *x86_power_flags[] = { | ||
994 | "ts", /* temperature sensor */ | ||
995 | "fid", /* frequency id control */ | ||
996 | "vid", /* voltage id control */ | ||
997 | "ttp", /* thermal trip */ | ||
998 | "tm", | ||
999 | "stc", | ||
1000 | "100mhzsteps", | ||
1001 | "hwpstate", | ||
1002 | "", /* tsc invariant mapped to constant_tsc */ | ||
1003 | /* nothing */ | ||
1004 | }; | ||
1005 | |||
1006 | |||
1007 | #ifdef CONFIG_SMP | ||
1008 | if (!cpu_online(c-cpu_data)) | ||
1009 | return 0; | ||
1010 | #endif | ||
1011 | |||
1012 | seq_printf(m,"processor\t: %u\n" | ||
1013 | "vendor_id\t: %s\n" | ||
1014 | "cpu family\t: %d\n" | ||
1015 | "model\t\t: %d\n" | ||
1016 | "model name\t: %s\n", | ||
1017 | (unsigned)(c-cpu_data), | ||
1018 | c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", | ||
1019 | c->x86, | ||
1020 | (int)c->x86_model, | ||
1021 | c->x86_model_id[0] ? c->x86_model_id : "unknown"); | ||
1022 | |||
1023 | if (c->x86_mask || c->cpuid_level >= 0) | ||
1024 | seq_printf(m, "stepping\t: %d\n", c->x86_mask); | ||
1025 | else | ||
1026 | seq_printf(m, "stepping\t: unknown\n"); | ||
1027 | |||
1028 | if (cpu_has(c,X86_FEATURE_TSC)) { | ||
1029 | unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data)); | ||
1030 | if (!freq) | ||
1031 | freq = cpu_khz; | ||
1032 | seq_printf(m, "cpu MHz\t\t: %u.%03u\n", | ||
1033 | freq / 1000, (freq % 1000)); | ||
1034 | } | ||
1035 | |||
1036 | /* Cache size */ | ||
1037 | if (c->x86_cache_size >= 0) | ||
1038 | seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); | ||
1039 | |||
1040 | #ifdef CONFIG_SMP | ||
1041 | if (smp_num_siblings * c->x86_max_cores > 1) { | ||
1042 | int cpu = c - cpu_data; | ||
1043 | seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); | ||
1044 | seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu])); | ||
1045 | seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); | ||
1046 | seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); | ||
1047 | } | ||
1048 | #endif | ||
1049 | |||
1050 | seq_printf(m, | ||
1051 | "fpu\t\t: yes\n" | ||
1052 | "fpu_exception\t: yes\n" | ||
1053 | "cpuid level\t: %d\n" | ||
1054 | "wp\t\t: yes\n" | ||
1055 | "flags\t\t:", | ||
1056 | c->cpuid_level); | ||
1057 | |||
1058 | { | ||
1059 | int i; | ||
1060 | for ( i = 0 ; i < 32*NCAPINTS ; i++ ) | ||
1061 | if (cpu_has(c, i) && x86_cap_flags[i] != NULL) | ||
1062 | seq_printf(m, " %s", x86_cap_flags[i]); | ||
1063 | } | ||
1064 | |||
1065 | seq_printf(m, "\nbogomips\t: %lu.%02lu\n", | ||
1066 | c->loops_per_jiffy/(500000/HZ), | ||
1067 | (c->loops_per_jiffy/(5000/HZ)) % 100); | ||
1068 | |||
1069 | if (c->x86_tlbsize > 0) | ||
1070 | seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); | ||
1071 | seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); | ||
1072 | seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); | ||
1073 | |||
1074 | seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", | ||
1075 | c->x86_phys_bits, c->x86_virt_bits); | ||
1076 | |||
1077 | seq_printf(m, "power management:"); | ||
1078 | { | ||
1079 | unsigned i; | ||
1080 | for (i = 0; i < 32; i++) | ||
1081 | if (c->x86_power & (1 << i)) { | ||
1082 | if (i < ARRAY_SIZE(x86_power_flags) && | ||
1083 | x86_power_flags[i]) | ||
1084 | seq_printf(m, "%s%s", | ||
1085 | x86_power_flags[i][0]?" ":"", | ||
1086 | x86_power_flags[i]); | ||
1087 | else | ||
1088 | seq_printf(m, " [%d]", i); | ||
1089 | } | ||
1090 | } | ||
1091 | |||
1092 | seq_printf(m, "\n\n"); | ||
1093 | |||
1094 | return 0; | ||
1095 | } | ||
1096 | |||
1097 | static void *c_start(struct seq_file *m, loff_t *pos) | ||
1098 | { | ||
1099 | return *pos < NR_CPUS ? cpu_data + *pos : NULL; | ||
1100 | } | ||
1101 | |||
1102 | static void *c_next(struct seq_file *m, void *v, loff_t *pos) | ||
1103 | { | ||
1104 | ++*pos; | ||
1105 | return c_start(m, pos); | ||
1106 | } | ||
1107 | |||
1108 | static void c_stop(struct seq_file *m, void *v) | ||
1109 | { | ||
1110 | } | ||
1111 | |||
1112 | struct seq_operations cpuinfo_op = { | ||
1113 | .start =c_start, | ||
1114 | .next = c_next, | ||
1115 | .stop = c_stop, | ||
1116 | .show = show_cpuinfo, | ||
1117 | }; | ||
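The flags loop in show_cpuinfo() walks a flat index of 32*NCAPINTS feature bits. The following standalone sketch is illustrative only (not part of the patch; has_cap(), cap_flags[] and the populated bit values are stand-ins) and shows the word/bit arithmetic that cpu_has() performs on x86_capability[]:

    /* Illustrative only: word/bit lookup behind the flags loop above. */
    #include <stdio.h>

    #define NCAPINTS 8                      /* number of 32-bit capability words */

    static const char *cap_flags[32 * NCAPINTS] = {
            [0] = "fpu", [4] = "tsc", [25] = "sse",   /* sparse, like x86_cap_flags[] */
    };

    static int has_cap(const unsigned int caps[NCAPINTS], int bit)
    {
            return (caps[bit / 32] >> (bit % 32)) & 1; /* word index, then bit index */
    }

    int main(void)
    {
            unsigned int caps[NCAPINTS] = { (1u << 0) | (1u << 4) | (1u << 25) };
            int i;

            printf("flags\t\t:");
            for (i = 0; i < 32 * NCAPINTS; i++)
                    if (has_cap(caps, i) && cap_flags[i])
                            printf(" %s", cap_flags[i]);
            printf("\n");
            return 0;
    }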
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c new file mode 100644 index 000000000000..739175b01e06 --- /dev/null +++ b/arch/x86/kernel/signal_64.c | |||
@@ -0,0 +1,495 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/signal.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs | ||
6 | * | ||
7 | * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson | ||
8 | * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes | ||
9 | * 2000-2002 x86-64 support by Andi Kleen | ||
10 | */ | ||
11 | |||
12 | #include <linux/sched.h> | ||
13 | #include <linux/mm.h> | ||
14 | #include <linux/smp.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/signal.h> | ||
17 | #include <linux/errno.h> | ||
18 | #include <linux/wait.h> | ||
19 | #include <linux/ptrace.h> | ||
20 | #include <linux/unistd.h> | ||
21 | #include <linux/stddef.h> | ||
22 | #include <linux/personality.h> | ||
23 | #include <linux/compiler.h> | ||
24 | #include <asm/ucontext.h> | ||
25 | #include <asm/uaccess.h> | ||
26 | #include <asm/i387.h> | ||
27 | #include <asm/proto.h> | ||
28 | #include <asm/ia32_unistd.h> | ||
29 | #include <asm/mce.h> | ||
30 | |||
31 | /* #define DEBUG_SIG 1 */ | ||
32 | |||
33 | #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) | ||
34 | |||
35 | int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | ||
36 | sigset_t *set, struct pt_regs * regs); | ||
37 | int ia32_setup_frame(int sig, struct k_sigaction *ka, | ||
38 | sigset_t *set, struct pt_regs * regs); | ||
39 | |||
40 | asmlinkage long | ||
41 | sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, | ||
42 | struct pt_regs *regs) | ||
43 | { | ||
44 | return do_sigaltstack(uss, uoss, regs->rsp); | ||
45 | } | ||
46 | |||
47 | |||
48 | /* | ||
49 | * Do a signal return; undo the signal stack. | ||
50 | */ | ||
51 | |||
52 | struct rt_sigframe | ||
53 | { | ||
54 | char __user *pretcode; | ||
55 | struct ucontext uc; | ||
56 | struct siginfo info; | ||
57 | }; | ||
58 | |||
59 | static int | ||
60 | restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax) | ||
61 | { | ||
62 | unsigned int err = 0; | ||
63 | |||
64 | /* Always make any pending restarted system calls return -EINTR */ | ||
65 | current_thread_info()->restart_block.fn = do_no_restart_syscall; | ||
66 | |||
67 | #define COPY(x) err |= __get_user(regs->x, &sc->x) | ||
68 | |||
69 | COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx); | ||
70 | COPY(rdx); COPY(rcx); COPY(rip); | ||
71 | COPY(r8); | ||
72 | COPY(r9); | ||
73 | COPY(r10); | ||
74 | COPY(r11); | ||
75 | COPY(r12); | ||
76 | COPY(r13); | ||
77 | COPY(r14); | ||
78 | COPY(r15); | ||
79 | |||
80 | /* Kernel saves and restores only the CS segment register on signals, | ||
81 | * which is the bare minimum needed to allow mixed 32/64-bit code. | ||
82 | * App's signal handler can save/restore other segments if needed. */ | ||
83 | { | ||
84 | unsigned cs; | ||
85 | err |= __get_user(cs, &sc->cs); | ||
86 | regs->cs = cs | 3; /* Force into user mode */ | ||
87 | } | ||
88 | |||
89 | { | ||
90 | unsigned int tmpflags; | ||
91 | err |= __get_user(tmpflags, &sc->eflags); | ||
92 | regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); | ||
93 | regs->orig_rax = -1; /* disable syscall checks */ | ||
94 | } | ||
95 | |||
96 | { | ||
97 | struct _fpstate __user * buf; | ||
98 | err |= __get_user(buf, &sc->fpstate); | ||
99 | |||
100 | if (buf) { | ||
101 | if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) | ||
102 | goto badframe; | ||
103 | err |= restore_i387(buf); | ||
104 | } else { | ||
105 | struct task_struct *me = current; | ||
106 | if (used_math()) { | ||
107 | clear_fpu(me); | ||
108 | clear_used_math(); | ||
109 | } | ||
110 | } | ||
111 | } | ||
112 | |||
113 | err |= __get_user(*prax, &sc->rax); | ||
114 | return err; | ||
115 | |||
116 | badframe: | ||
117 | return 1; | ||
118 | } | ||
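The 0x40DD5 mask in restore_sigcontext() limits which EFLAGS bits userspace may restore; bits such as IF and IOPL stay under kernel control. A standalone, illustrative-only sketch of the merge (the sample eflags values are made up):

    /* Illustrative only: how the masked EFLAGS merge above behaves. */
    #include <stdio.h>

    int main(void)
    {
            const unsigned long mask = 0x40DD5;       /* CF,PF,AF,ZF,SF,TF,DF,OF,AC */
            unsigned long kernel_eflags = 0x246;      /* e.g. IF, ZF, PF set */
            unsigned long user_eflags = 0xFFFFFFFF;   /* hostile: tries to set everything */
            unsigned long merged;

            /* Same expression as restore_sigcontext(): keep kernel bits outside the mask. */
            merged = (kernel_eflags & ~mask) | (user_eflags & mask);

            /* IF (bit 9), IOPL (bits 12-13) etc. remain whatever the kernel had. */
            printf("merged eflags: %#lx\n", merged);
            return 0;
    }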
119 | |||
120 | asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) | ||
121 | { | ||
122 | struct rt_sigframe __user *frame; | ||
123 | sigset_t set; | ||
124 | unsigned long eax; | ||
125 | |||
126 | frame = (struct rt_sigframe __user *)(regs->rsp - 8); | ||
127 | if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) { | ||
128 | goto badframe; | ||
129 | } | ||
130 | if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) { | ||
131 | goto badframe; | ||
132 | } | ||
133 | |||
134 | sigdelsetmask(&set, ~_BLOCKABLE); | ||
135 | spin_lock_irq(¤t->sighand->siglock); | ||
136 | current->blocked = set; | ||
137 | recalc_sigpending(); | ||
138 | spin_unlock_irq(¤t->sighand->siglock); | ||
139 | |||
140 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) | ||
141 | goto badframe; | ||
142 | |||
143 | #ifdef DEBUG_SIG | ||
144 | printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax); | ||
145 | #endif | ||
146 | |||
147 | if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) | ||
148 | goto badframe; | ||
149 | |||
150 | return eax; | ||
151 | |||
152 | badframe: | ||
153 | signal_fault(regs,frame,"sigreturn"); | ||
154 | return 0; | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * Set up a signal frame. | ||
159 | */ | ||
160 | |||
161 | static inline int | ||
162 | setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) | ||
163 | { | ||
164 | int err = 0; | ||
165 | |||
166 | err |= __put_user(regs->cs, &sc->cs); | ||
167 | err |= __put_user(0, &sc->gs); | ||
168 | err |= __put_user(0, &sc->fs); | ||
169 | |||
170 | err |= __put_user(regs->rdi, &sc->rdi); | ||
171 | err |= __put_user(regs->rsi, &sc->rsi); | ||
172 | err |= __put_user(regs->rbp, &sc->rbp); | ||
173 | err |= __put_user(regs->rsp, &sc->rsp); | ||
174 | err |= __put_user(regs->rbx, &sc->rbx); | ||
175 | err |= __put_user(regs->rdx, &sc->rdx); | ||
176 | err |= __put_user(regs->rcx, &sc->rcx); | ||
177 | err |= __put_user(regs->rax, &sc->rax); | ||
178 | err |= __put_user(regs->r8, &sc->r8); | ||
179 | err |= __put_user(regs->r9, &sc->r9); | ||
180 | err |= __put_user(regs->r10, &sc->r10); | ||
181 | err |= __put_user(regs->r11, &sc->r11); | ||
182 | err |= __put_user(regs->r12, &sc->r12); | ||
183 | err |= __put_user(regs->r13, &sc->r13); | ||
184 | err |= __put_user(regs->r14, &sc->r14); | ||
185 | err |= __put_user(regs->r15, &sc->r15); | ||
186 | err |= __put_user(me->thread.trap_no, &sc->trapno); | ||
187 | err |= __put_user(me->thread.error_code, &sc->err); | ||
188 | err |= __put_user(regs->rip, &sc->rip); | ||
189 | err |= __put_user(regs->eflags, &sc->eflags); | ||
190 | err |= __put_user(mask, &sc->oldmask); | ||
191 | err |= __put_user(me->thread.cr2, &sc->cr2); | ||
192 | |||
193 | return err; | ||
194 | } | ||
195 | |||
196 | /* | ||
197 | * Determine which stack to use.. | ||
198 | */ | ||
199 | |||
200 | static void __user * | ||
201 | get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) | ||
202 | { | ||
203 | unsigned long rsp; | ||
204 | |||
205 | /* Default to using normal stack - redzone*/ | ||
206 | rsp = regs->rsp - 128; | ||
207 | |||
208 | /* This is the X/Open sanctioned signal stack switching. */ | ||
209 | if (ka->sa.sa_flags & SA_ONSTACK) { | ||
210 | if (sas_ss_flags(rsp) == 0) | ||
211 | rsp = current->sas_ss_sp + current->sas_ss_size; | ||
212 | } | ||
213 | |||
214 | return (void __user *)round_down(rsp - size, 16); | ||
215 | } | ||
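get_stack() first skips the 128-byte red zone that the x86-64 ABI reserves below the user stack pointer, then rounds the frame start down to 16 bytes so saved SSE state is aligned. A standalone sketch of that arithmetic (addresses and sizes are hypothetical):

    /* Illustrative only: red-zone skip plus 16-byte alignment as in get_stack(). */
    #include <stdio.h>

    #define ROUND_DOWN(x, a)  ((x) & ~((unsigned long)(a) - 1))

    int main(void)
    {
            unsigned long rsp = 0x7fffffffe123;   /* hypothetical user stack pointer */
            unsigned long size = 512;             /* hypothetical frame size */
            unsigned long frame;

            rsp -= 128;                           /* step over the ABI red zone */
            frame = ROUND_DOWN(rsp - size, 16);   /* 16-byte aligned frame start */

            printf("frame at %#lx (aligned: %s)\n", frame,
                   (frame % 16) == 0 ? "yes" : "no");
            return 0;
    }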
216 | |||
217 | static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | ||
218 | sigset_t *set, struct pt_regs * regs) | ||
219 | { | ||
220 | struct rt_sigframe __user *frame; | ||
221 | struct _fpstate __user *fp = NULL; | ||
222 | int err = 0; | ||
223 | struct task_struct *me = current; | ||
224 | |||
225 | if (used_math()) { | ||
226 | fp = get_stack(ka, regs, sizeof(struct _fpstate)); | ||
227 | frame = (void __user *)round_down( | ||
228 | (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; | ||
229 | |||
230 | if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) | ||
231 | goto give_sigsegv; | ||
232 | |||
233 | if (save_i387(fp) < 0) | ||
234 | err |= -1; | ||
235 | } else | ||
236 | frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; | ||
237 | |||
238 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) | ||
239 | goto give_sigsegv; | ||
240 | |||
241 | if (ka->sa.sa_flags & SA_SIGINFO) { | ||
242 | err |= copy_siginfo_to_user(&frame->info, info); | ||
243 | if (err) | ||
244 | goto give_sigsegv; | ||
245 | } | ||
246 | |||
247 | /* Create the ucontext. */ | ||
248 | err |= __put_user(0, &frame->uc.uc_flags); | ||
249 | err |= __put_user(0, &frame->uc.uc_link); | ||
250 | err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); | ||
251 | err |= __put_user(sas_ss_flags(regs->rsp), | ||
252 | &frame->uc.uc_stack.ss_flags); | ||
253 | err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); | ||
254 | err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); | ||
255 | err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); | ||
256 | if (sizeof(*set) == 16) { | ||
257 | __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); | ||
258 | __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); | ||
259 | } else | ||
260 | err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); | ||
261 | |||
262 | /* Set up to return from userspace. If provided, use a stub | ||
263 | already in userspace. */ | ||
264 | /* x86-64 should always use SA_RESTORER. */ | ||
265 | if (ka->sa.sa_flags & SA_RESTORER) { | ||
266 | err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); | ||
267 | } else { | ||
268 | /* could use a vstub here */ | ||
269 | goto give_sigsegv; | ||
270 | } | ||
271 | |||
272 | if (err) | ||
273 | goto give_sigsegv; | ||
274 | |||
275 | #ifdef DEBUG_SIG | ||
276 | printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); | ||
277 | #endif | ||
278 | |||
279 | /* Set up registers for signal handler */ | ||
280 | regs->rdi = sig; | ||
281 | /* In case the signal handler was declared without prototypes */ | ||
282 | regs->rax = 0; | ||
283 | |||
284 | /* This also works for non SA_SIGINFO handlers because they expect the | ||
285 | next argument after the signal number on the stack. */ | ||
286 | regs->rsi = (unsigned long)&frame->info; | ||
287 | regs->rdx = (unsigned long)&frame->uc; | ||
288 | regs->rip = (unsigned long) ka->sa.sa_handler; | ||
289 | |||
290 | regs->rsp = (unsigned long)frame; | ||
291 | |||
292 | /* Set up the CS register to run signal handlers in 64-bit mode, | ||
293 | even if the handler happens to be interrupting 32-bit code. */ | ||
294 | regs->cs = __USER_CS; | ||
295 | |||
296 | /* This, by contrast, has nothing to do with segment registers - | ||
297 | see include/asm-x86_64/uaccess.h for details. */ | ||
298 | set_fs(USER_DS); | ||
299 | |||
300 | regs->eflags &= ~TF_MASK; | ||
301 | if (test_thread_flag(TIF_SINGLESTEP)) | ||
302 | ptrace_notify(SIGTRAP); | ||
303 | #ifdef DEBUG_SIG | ||
304 | printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n", | ||
305 | current->comm, current->pid, frame, regs->rip, frame->pretcode); | ||
306 | #endif | ||
307 | |||
308 | return 0; | ||
309 | |||
310 | give_sigsegv: | ||
311 | force_sigsegv(sig, current); | ||
312 | return -EFAULT; | ||
313 | } | ||
314 | |||
315 | /* | ||
316 | * OK, we're invoking a handler | ||
317 | */ | ||
318 | |||
319 | static int | ||
320 | handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, | ||
321 | sigset_t *oldset, struct pt_regs *regs) | ||
322 | { | ||
323 | int ret; | ||
324 | |||
325 | #ifdef DEBUG_SIG | ||
326 | printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", | ||
327 | current->pid, sig, | ||
328 | regs->rip, regs->rsp, regs); | ||
329 | #endif | ||
330 | |||
331 | /* Are we from a system call? */ | ||
332 | if ((long)regs->orig_rax >= 0) { | ||
333 | /* If so, check system call restarting.. */ | ||
334 | switch (regs->rax) { | ||
335 | case -ERESTART_RESTARTBLOCK: | ||
336 | case -ERESTARTNOHAND: | ||
337 | regs->rax = -EINTR; | ||
338 | break; | ||
339 | |||
340 | case -ERESTARTSYS: | ||
341 | if (!(ka->sa.sa_flags & SA_RESTART)) { | ||
342 | regs->rax = -EINTR; | ||
343 | break; | ||
344 | } | ||
345 | /* fallthrough */ | ||
346 | case -ERESTARTNOINTR: | ||
347 | regs->rax = regs->orig_rax; | ||
348 | regs->rip -= 2; | ||
349 | break; | ||
350 | } | ||
351 | } | ||
352 | |||
353 | /* | ||
354 | * If TF is set due to a debugger (PT_DTRACE), clear the TF | ||
355 | * flag so that register information in the sigcontext is | ||
356 | * correct. | ||
357 | */ | ||
358 | if (unlikely(regs->eflags & TF_MASK)) { | ||
359 | if (likely(current->ptrace & PT_DTRACE)) { | ||
360 | current->ptrace &= ~PT_DTRACE; | ||
361 | regs->eflags &= ~TF_MASK; | ||
362 | } | ||
363 | } | ||
364 | |||
365 | #ifdef CONFIG_IA32_EMULATION | ||
366 | if (test_thread_flag(TIF_IA32)) { | ||
367 | if (ka->sa.sa_flags & SA_SIGINFO) | ||
368 | ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); | ||
369 | else | ||
370 | ret = ia32_setup_frame(sig, ka, oldset, regs); | ||
371 | } else | ||
372 | #endif | ||
373 | ret = setup_rt_frame(sig, ka, info, oldset, regs); | ||
374 | |||
375 | if (ret == 0) { | ||
376 | spin_lock_irq(¤t->sighand->siglock); | ||
377 | sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); | ||
378 | if (!(ka->sa.sa_flags & SA_NODEFER)) | ||
379 | sigaddset(¤t->blocked,sig); | ||
380 | recalc_sigpending(); | ||
381 | spin_unlock_irq(¤t->sighand->siglock); | ||
382 | } | ||
383 | |||
384 | return ret; | ||
385 | } | ||
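The restart handling in handle_signal() above reloads rax from orig_rax and backs rip up by two bytes (the length of the syscall instruction) so an interrupted call is re-issued; with SA_RESTART clear, -ERESTARTSYS becomes -EINTR instead. Illustrative-only standalone sketch (ERESTARTSYS is copied as its kernel-internal value; everything else is made up):

    /* Illustrative only: restart decision as in handle_signal() above. */
    #include <stdio.h>
    #include <errno.h>

    #define ERESTARTSYS 512   /* kernel-internal value, never seen by userspace */

    static long decide_restart(long rax, int sa_restart, long orig_rax,
                               unsigned long *rip)
    {
            if (rax == -ERESTARTSYS) {
                    if (!sa_restart)
                            return -EINTR;        /* handler opted out of restart */
                    *rip -= 2;                    /* back over the syscall insn */
                    return orig_rax;              /* re-issue the same syscall */
            }
            return rax;
    }

    int main(void)
    {
            unsigned long rip = 0x400b02;         /* hypothetical user rip */
            long rax = decide_restart(-ERESTARTSYS, 1, 0 /* e.g. __NR_read */, &rip);

            printf("rax=%ld rip=%#lx\n", rax, rip);
            return 0;
    }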
386 | |||
387 | /* | ||
388 | * Note that 'init' is a special process: it doesn't get signals it doesn't | ||
389 | * want to handle. Thus you cannot kill init even with a SIGKILL, even by | ||
390 | * mistake. | ||
391 | */ | ||
392 | static void do_signal(struct pt_regs *regs) | ||
393 | { | ||
394 | struct k_sigaction ka; | ||
395 | siginfo_t info; | ||
396 | int signr; | ||
397 | sigset_t *oldset; | ||
398 | |||
399 | /* | ||
400 | * We want the common case to go fast, which | ||
401 | * is why we may in certain cases get here from | ||
402 | * kernel mode. Just return without doing anything | ||
403 | * if so. | ||
404 | */ | ||
405 | if (!user_mode(regs)) | ||
406 | return; | ||
407 | |||
408 | if (test_thread_flag(TIF_RESTORE_SIGMASK)) | ||
409 | oldset = ¤t->saved_sigmask; | ||
410 | else | ||
411 | oldset = ¤t->blocked; | ||
412 | |||
413 | signr = get_signal_to_deliver(&info, &ka, regs, NULL); | ||
414 | if (signr > 0) { | ||
415 | /* Reenable any watchpoints before delivering the | ||
416 | * signal to user space. The processor register will | ||
417 | * have been cleared if the watchpoint triggered | ||
418 | * inside the kernel. | ||
419 | */ | ||
420 | if (current->thread.debugreg7) | ||
421 | set_debugreg(current->thread.debugreg7, 7); | ||
422 | |||
423 | /* Whee! Actually deliver the signal. */ | ||
424 | if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { | ||
425 | /* a signal was successfully delivered; the saved | ||
426 | * sigmask will have been stored in the signal frame, | ||
427 | * and will be restored by sigreturn, so we can simply | ||
428 | * clear the TIF_RESTORE_SIGMASK flag */ | ||
429 | clear_thread_flag(TIF_RESTORE_SIGMASK); | ||
430 | } | ||
431 | return; | ||
432 | } | ||
433 | |||
434 | /* Did we come from a system call? */ | ||
435 | if ((long)regs->orig_rax >= 0) { | ||
436 | /* Restart the system call - no handlers present */ | ||
437 | long res = regs->rax; | ||
438 | switch (res) { | ||
439 | case -ERESTARTNOHAND: | ||
440 | case -ERESTARTSYS: | ||
441 | case -ERESTARTNOINTR: | ||
442 | regs->rax = regs->orig_rax; | ||
443 | regs->rip -= 2; | ||
444 | break; | ||
445 | case -ERESTART_RESTARTBLOCK: | ||
446 | regs->rax = test_thread_flag(TIF_IA32) ? | ||
447 | __NR_ia32_restart_syscall : | ||
448 | __NR_restart_syscall; | ||
449 | regs->rip -= 2; | ||
450 | break; | ||
451 | } | ||
452 | } | ||
453 | |||
454 | /* if there's no signal to deliver, we just put the saved sigmask | ||
455 | back. */ | ||
456 | if (test_thread_flag(TIF_RESTORE_SIGMASK)) { | ||
457 | clear_thread_flag(TIF_RESTORE_SIGMASK); | ||
458 | sigprocmask(SIG_SETMASK, ¤t->saved_sigmask, NULL); | ||
459 | } | ||
460 | } | ||
461 | |||
462 | void | ||
463 | do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) | ||
464 | { | ||
465 | #ifdef DEBUG_SIG | ||
466 | printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n", | ||
467 | thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); | ||
468 | #endif | ||
469 | |||
470 | /* Pending single-step? */ | ||
471 | if (thread_info_flags & _TIF_SINGLESTEP) { | ||
472 | regs->eflags |= TF_MASK; | ||
473 | clear_thread_flag(TIF_SINGLESTEP); | ||
474 | } | ||
475 | |||
476 | #ifdef CONFIG_X86_MCE | ||
477 | /* notify userspace of pending MCEs */ | ||
478 | if (thread_info_flags & _TIF_MCE_NOTIFY) | ||
479 | mce_notify_user(); | ||
480 | #endif /* CONFIG_X86_MCE */ | ||
481 | |||
482 | /* deal with pending signal delivery */ | ||
483 | if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) | ||
484 | do_signal(regs); | ||
485 | } | ||
486 | |||
487 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where) | ||
488 | { | ||
489 | struct task_struct *me = current; | ||
490 | if (show_unhandled_signals && printk_ratelimit()) | ||
491 | printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", | ||
492 | me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); | ||
493 | |||
494 | force_sig(SIGSEGV, me); | ||
495 | } | ||
diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/smp_64.c new file mode 100644 index 000000000000..df4a82812adb --- /dev/null +++ b/arch/x86/kernel/smp_64.c | |||
@@ -0,0 +1,523 @@ | |||
1 | /* | ||
2 | * Intel SMP support routines. | ||
3 | * | ||
4 | * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | ||
5 | * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> | ||
6 | * (c) 2002,2003 Andi Kleen, SuSE Labs. | ||
7 | * | ||
8 | * This code is released under the GNU General Public License version 2 or | ||
9 | * later. | ||
10 | */ | ||
11 | |||
12 | #include <linux/init.h> | ||
13 | |||
14 | #include <linux/mm.h> | ||
15 | #include <linux/delay.h> | ||
16 | #include <linux/spinlock.h> | ||
17 | #include <linux/smp.h> | ||
18 | #include <linux/kernel_stat.h> | ||
19 | #include <linux/mc146818rtc.h> | ||
20 | #include <linux/interrupt.h> | ||
21 | |||
22 | #include <asm/mtrr.h> | ||
23 | #include <asm/pgalloc.h> | ||
24 | #include <asm/tlbflush.h> | ||
25 | #include <asm/mach_apic.h> | ||
26 | #include <asm/mmu_context.h> | ||
27 | #include <asm/proto.h> | ||
28 | #include <asm/apicdef.h> | ||
29 | #include <asm/idle.h> | ||
30 | |||
31 | /* | ||
32 | * Smarter SMP flushing macros. | ||
33 | * c/o Linus Torvalds. | ||
34 | * | ||
35 | * These mean you can really definitely utterly forget about | ||
36 | * writing to user space from interrupts. (It's not allowed anyway.) | ||
37 | * | ||
38 | * Optimizations Manfred Spraul <manfred@colorfullife.com> | ||
39 | * | ||
40 | * More scalable flush, from Andi Kleen | ||
41 | * | ||
42 | * To avoid global state use 8 different call vectors. | ||
43 | * Each CPU uses a specific vector to trigger flushes on other | ||
44 | * CPUs. Depending on the received vector the target CPUs look into | ||
45 | * the right per cpu variable for the flush data. | ||
46 | * | ||
47 | * With more than 8 CPUs they are hashed to the 8 available | ||
48 | * vectors. The limited global vector space forces us to this right now. | ||
49 | * In future when interrupts are split into per CPU domains this could be | ||
50 | * fixed, at the cost of triggering multiple IPIs in some cases. | ||
51 | */ | ||
52 | |||
53 | union smp_flush_state { | ||
54 | struct { | ||
55 | cpumask_t flush_cpumask; | ||
56 | struct mm_struct *flush_mm; | ||
57 | unsigned long flush_va; | ||
58 | #define FLUSH_ALL -1ULL | ||
59 | spinlock_t tlbstate_lock; | ||
60 | }; | ||
61 | char pad[SMP_CACHE_BYTES]; | ||
62 | } ____cacheline_aligned; | ||
63 | |||
64 | /* State is put into the per CPU data section, but padded | ||
65 | to a full cache line because other CPUs can access it and we don't | ||
66 | want false sharing in the per cpu data segment. */ | ||
67 | static DEFINE_PER_CPU(union smp_flush_state, flush_state); | ||
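With more than NUM_INVALIDATE_TLB_VECTORS CPUs, senders are hashed onto the available flush vectors, and the receiver recovers the sender slot from the negated vector stored in orig_rax (see smp_invalidate_interrupt() below). Illustrative-only standalone sketch; the vector base and count are stand-ins:

    /* Illustrative only: sender-to-vector hashing and recovery on the receiver. */
    #include <stdio.h>

    #define VECTOR_START  0xf0   /* stand-in for INVALIDATE_TLB_VECTOR_START */
    #define NUM_VECTORS   8      /* stand-in for NUM_INVALIDATE_TLB_VECTORS */

    int main(void)
    {
            int cpu;

            for (cpu = 0; cpu < 12; cpu++) {
                    int sender = cpu % NUM_VECTORS;           /* >8 CPUs share vectors */
                    int vector = VECTOR_START + sender;       /* IPI vector sent */
                    long orig_rax = ~(long)vector;            /* as stored in pt_regs */
                    int recovered = ~orig_rax - VECTOR_START; /* receiver's calculation */

                    printf("cpu %2d -> vector %#x -> sender slot %d\n",
                           cpu, vector, recovered);
            }
            return 0;
    }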
68 | |||
69 | /* | ||
70 | * We cannot call mmdrop() because we are in interrupt context, | ||
71 | * instead update mm->cpu_vm_mask. | ||
72 | */ | ||
73 | static inline void leave_mm(int cpu) | ||
74 | { | ||
75 | if (read_pda(mmu_state) == TLBSTATE_OK) | ||
76 | BUG(); | ||
77 | cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); | ||
78 | load_cr3(swapper_pg_dir); | ||
79 | } | ||
80 | |||
81 | /* | ||
82 | * | ||
83 | * The flush IPI assumes that a thread switch happens in this order: | ||
84 | * [cpu0: the cpu that switches] | ||
85 | * 1) switch_mm() either 1a) or 1b) | ||
86 | * 1a) thread switch to a different mm | ||
87 | * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); | ||
88 | * Stop ipi delivery for the old mm. This is not synchronized with | ||
89 | * the other cpus, but smp_invalidate_interrupt ignore flush ipis | ||
90 | * for the wrong mm, and in the worst case we perform a superfluous | ||
91 | * tlb flush. | ||
92 | * 1a2) set cpu mmu_state to TLBSTATE_OK | ||
93 | * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 | ||
94 | * was in lazy tlb mode. | ||
95 | * 1a3) update cpu active_mm | ||
96 | * Now cpu0 accepts tlb flushes for the new mm. | ||
97 | * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); | ||
98 | * Now the other cpus will send tlb flush ipis. | ||
99 | * 1a4) change cr3. | ||
100 | * 1b) thread switch without mm change | ||
101 | * cpu active_mm is correct, cpu0 already handles | ||
102 | * flush ipis. | ||
103 | * 1b1) set cpu mmu_state to TLBSTATE_OK | ||
104 | * 1b2) test_and_set the cpu bit in cpu_vm_mask. | ||
105 | * Atomically set the bit [other cpus will start sending flush ipis], | ||
106 | * and test the bit. | ||
107 | * 1b3) if the bit was 0: leave_mm was called, flush the tlb. | ||
108 | * 2) switch %%esp, ie current | ||
109 | * | ||
110 | * The interrupt must handle 2 special cases: | ||
111 | * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. | ||
112 | * - the cpu performs speculative tlb reads, i.e. even if the cpu only | ||
113 | * runs in kernel space, the cpu could load tlb entries for user space | ||
114 | * pages. | ||
115 | * | ||
116 | * The good news is that cpu mmu_state is local to each cpu, no | ||
117 | * write/read ordering problems. | ||
118 | */ | ||
119 | |||
120 | /* | ||
121 | * TLB flush IPI: | ||
122 | * | ||
123 | * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. | ||
124 | * 2) Leave the mm if we are in the lazy tlb mode. | ||
125 | * | ||
126 | * Interrupts are disabled. | ||
127 | */ | ||
128 | |||
129 | asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) | ||
130 | { | ||
131 | int cpu; | ||
132 | int sender; | ||
133 | union smp_flush_state *f; | ||
134 | |||
135 | cpu = smp_processor_id(); | ||
136 | /* | ||
137 | * orig_rax contains the negated interrupt vector. | ||
138 | * Use that to determine where the sender put the data. | ||
139 | */ | ||
140 | sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START; | ||
141 | f = &per_cpu(flush_state, sender); | ||
142 | |||
143 | if (!cpu_isset(cpu, f->flush_cpumask)) | ||
144 | goto out; | ||
145 | /* | ||
146 | * This was a BUG() but until someone can quote me the | ||
147 | * line from the Intel manual that guarantees an IPI to | ||
148 | * multiple CPUs is retried _only_ on the erroring CPUs, | ||
149 | * it's staying as a return. | ||
150 | * | ||
151 | * BUG(); | ||
152 | */ | ||
153 | |||
154 | if (f->flush_mm == read_pda(active_mm)) { | ||
155 | if (read_pda(mmu_state) == TLBSTATE_OK) { | ||
156 | if (f->flush_va == FLUSH_ALL) | ||
157 | local_flush_tlb(); | ||
158 | else | ||
159 | __flush_tlb_one(f->flush_va); | ||
160 | } else | ||
161 | leave_mm(cpu); | ||
162 | } | ||
163 | out: | ||
164 | ack_APIC_irq(); | ||
165 | cpu_clear(cpu, f->flush_cpumask); | ||
166 | } | ||
167 | |||
168 | static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, | ||
169 | unsigned long va) | ||
170 | { | ||
171 | int sender; | ||
172 | union smp_flush_state *f; | ||
173 | |||
174 | /* Caller has disabled preemption */ | ||
175 | sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; | ||
176 | f = &per_cpu(flush_state, sender); | ||
177 | |||
178 | /* Could avoid this lock when | ||
179 | num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is | ||
180 | probably not worth checking this for a cache-hot lock. */ | ||
181 | spin_lock(&f->tlbstate_lock); | ||
182 | |||
183 | f->flush_mm = mm; | ||
184 | f->flush_va = va; | ||
185 | cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask); | ||
186 | |||
187 | /* | ||
188 | * We have to send the IPI only to | ||
189 | * CPUs affected. | ||
190 | */ | ||
191 | send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender); | ||
192 | |||
193 | while (!cpus_empty(f->flush_cpumask)) | ||
194 | cpu_relax(); | ||
195 | |||
196 | f->flush_mm = NULL; | ||
197 | f->flush_va = 0; | ||
198 | spin_unlock(&f->tlbstate_lock); | ||
199 | } | ||
200 | |||
201 | int __cpuinit init_smp_flush(void) | ||
202 | { | ||
203 | int i; | ||
204 | for_each_cpu_mask(i, cpu_possible_map) { | ||
205 | spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); | ||
206 | } | ||
207 | return 0; | ||
208 | } | ||
209 | |||
210 | core_initcall(init_smp_flush); | ||
211 | |||
212 | void flush_tlb_current_task(void) | ||
213 | { | ||
214 | struct mm_struct *mm = current->mm; | ||
215 | cpumask_t cpu_mask; | ||
216 | |||
217 | preempt_disable(); | ||
218 | cpu_mask = mm->cpu_vm_mask; | ||
219 | cpu_clear(smp_processor_id(), cpu_mask); | ||
220 | |||
221 | local_flush_tlb(); | ||
222 | if (!cpus_empty(cpu_mask)) | ||
223 | flush_tlb_others(cpu_mask, mm, FLUSH_ALL); | ||
224 | preempt_enable(); | ||
225 | } | ||
226 | EXPORT_SYMBOL(flush_tlb_current_task); | ||
227 | |||
228 | void flush_tlb_mm (struct mm_struct * mm) | ||
229 | { | ||
230 | cpumask_t cpu_mask; | ||
231 | |||
232 | preempt_disable(); | ||
233 | cpu_mask = mm->cpu_vm_mask; | ||
234 | cpu_clear(smp_processor_id(), cpu_mask); | ||
235 | |||
236 | if (current->active_mm == mm) { | ||
237 | if (current->mm) | ||
238 | local_flush_tlb(); | ||
239 | else | ||
240 | leave_mm(smp_processor_id()); | ||
241 | } | ||
242 | if (!cpus_empty(cpu_mask)) | ||
243 | flush_tlb_others(cpu_mask, mm, FLUSH_ALL); | ||
244 | |||
245 | preempt_enable(); | ||
246 | } | ||
247 | EXPORT_SYMBOL(flush_tlb_mm); | ||
248 | |||
249 | void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) | ||
250 | { | ||
251 | struct mm_struct *mm = vma->vm_mm; | ||
252 | cpumask_t cpu_mask; | ||
253 | |||
254 | preempt_disable(); | ||
255 | cpu_mask = mm->cpu_vm_mask; | ||
256 | cpu_clear(smp_processor_id(), cpu_mask); | ||
257 | |||
258 | if (current->active_mm == mm) { | ||
259 | if(current->mm) | ||
260 | __flush_tlb_one(va); | ||
261 | else | ||
262 | leave_mm(smp_processor_id()); | ||
263 | } | ||
264 | |||
265 | if (!cpus_empty(cpu_mask)) | ||
266 | flush_tlb_others(cpu_mask, mm, va); | ||
267 | |||
268 | preempt_enable(); | ||
269 | } | ||
270 | EXPORT_SYMBOL(flush_tlb_page); | ||
271 | |||
272 | static void do_flush_tlb_all(void* info) | ||
273 | { | ||
274 | unsigned long cpu = smp_processor_id(); | ||
275 | |||
276 | __flush_tlb_all(); | ||
277 | if (read_pda(mmu_state) == TLBSTATE_LAZY) | ||
278 | leave_mm(cpu); | ||
279 | } | ||
280 | |||
281 | void flush_tlb_all(void) | ||
282 | { | ||
283 | on_each_cpu(do_flush_tlb_all, NULL, 1, 1); | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * this function sends a 'reschedule' IPI to another CPU. | ||
288 | * it goes straight through and wastes no time serializing | ||
289 | * anything. Worst case is that we lose a reschedule ... | ||
290 | */ | ||
291 | |||
292 | void smp_send_reschedule(int cpu) | ||
293 | { | ||
294 | send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); | ||
295 | } | ||
296 | |||
297 | /* | ||
298 | * Structure and data for smp_call_function(). This is designed to minimise | ||
299 | * static memory requirements. It also looks cleaner. | ||
300 | */ | ||
301 | static DEFINE_SPINLOCK(call_lock); | ||
302 | |||
303 | struct call_data_struct { | ||
304 | void (*func) (void *info); | ||
305 | void *info; | ||
306 | atomic_t started; | ||
307 | atomic_t finished; | ||
308 | int wait; | ||
309 | }; | ||
310 | |||
311 | static struct call_data_struct * call_data; | ||
312 | |||
313 | void lock_ipi_call_lock(void) | ||
314 | { | ||
315 | spin_lock_irq(&call_lock); | ||
316 | } | ||
317 | |||
318 | void unlock_ipi_call_lock(void) | ||
319 | { | ||
320 | spin_unlock_irq(&call_lock); | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * this function sends a 'generic call function' IPI to one other CPU | ||
325 | * in the system. | ||
326 | * | ||
327 | * cpu is a standard Linux logical CPU number. | ||
328 | */ | ||
329 | static void | ||
330 | __smp_call_function_single(int cpu, void (*func) (void *info), void *info, | ||
331 | int nonatomic, int wait) | ||
332 | { | ||
333 | struct call_data_struct data; | ||
334 | int cpus = 1; | ||
335 | |||
336 | data.func = func; | ||
337 | data.info = info; | ||
338 | atomic_set(&data.started, 0); | ||
339 | data.wait = wait; | ||
340 | if (wait) | ||
341 | atomic_set(&data.finished, 0); | ||
342 | |||
343 | call_data = &data; | ||
344 | wmb(); | ||
345 | /* Send a message to all other CPUs and wait for them to respond */ | ||
346 | send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); | ||
347 | |||
348 | /* Wait for response */ | ||
349 | while (atomic_read(&data.started) != cpus) | ||
350 | cpu_relax(); | ||
351 | |||
352 | if (!wait) | ||
353 | return; | ||
354 | |||
355 | while (atomic_read(&data.finished) != cpus) | ||
356 | cpu_relax(); | ||
357 | } | ||
358 | |||
359 | /* | ||
360 | * smp_call_function_single - Run a function on a specific CPU | ||
361 | * @func: The function to run. This must be fast and non-blocking. | ||
362 | * @info: An arbitrary pointer to pass to the function. | ||
363 | * @nonatomic: Currently unused. | ||
364 | * @wait: If true, wait until function has completed on other CPUs. | ||
365 | * | ||
366 | * Returns 0 on success, else a negative status code. | ||
367 | * | ||
368 | * Does not return until the remote CPU is nearly ready to execute <func>, | ||
369 | * or has already executed it. | ||
370 | */ | ||
371 | |||
372 | int smp_call_function_single (int cpu, void (*func) (void *info), void *info, | ||
373 | int nonatomic, int wait) | ||
374 | { | ||
375 | /* prevent preemption and reschedule on another processor */ | ||
376 | int me = get_cpu(); | ||
377 | |||
378 | /* Can deadlock when called with interrupts disabled */ | ||
379 | WARN_ON(irqs_disabled()); | ||
380 | |||
381 | if (cpu == me) { | ||
382 | local_irq_disable(); | ||
383 | func(info); | ||
384 | local_irq_enable(); | ||
385 | put_cpu(); | ||
386 | return 0; | ||
387 | } | ||
388 | |||
389 | spin_lock(&call_lock); | ||
390 | __smp_call_function_single(cpu, func, info, nonatomic, wait); | ||
391 | spin_unlock(&call_lock); | ||
392 | put_cpu(); | ||
393 | return 0; | ||
394 | } | ||
395 | EXPORT_SYMBOL(smp_call_function_single); | ||
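A minimal sketch of how process-context code might call smp_call_function_single() as defined above; the module and its callback are hypothetical and not part of this patch:

    /* Illustrative only: hypothetical module using smp_call_function_single(). */
    #include <linux/module.h>
    #include <linux/init.h>
    #include <linux/smp.h>

    static void report_cpu(void *info)
    {
            int *where = info;

            *where = smp_processor_id();    /* runs on the target CPU */
    }

    static int __init example_init(void)
    {
            int where = -1;

            /* Run on CPU 1 and wait for completion (the nonatomic arg is unused). */
            if (cpu_online(1))
                    smp_call_function_single(1, report_cpu, &where, 0, 1);
            printk(KERN_INFO "callback ran on CPU %d\n", where);
            return 0;
    }
    module_init(example_init);
    MODULE_LICENSE("GPL");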
396 | |||
397 | /* | ||
398 | * this function sends a 'generic call function' IPI to all other CPUs | ||
399 | * in the system. | ||
400 | */ | ||
401 | static void __smp_call_function (void (*func) (void *info), void *info, | ||
402 | int nonatomic, int wait) | ||
403 | { | ||
404 | struct call_data_struct data; | ||
405 | int cpus = num_online_cpus()-1; | ||
406 | |||
407 | if (!cpus) | ||
408 | return; | ||
409 | |||
410 | data.func = func; | ||
411 | data.info = info; | ||
412 | atomic_set(&data.started, 0); | ||
413 | data.wait = wait; | ||
414 | if (wait) | ||
415 | atomic_set(&data.finished, 0); | ||
416 | |||
417 | call_data = &data; | ||
418 | wmb(); | ||
419 | /* Send a message to all other CPUs and wait for them to respond */ | ||
420 | send_IPI_allbutself(CALL_FUNCTION_VECTOR); | ||
421 | |||
422 | /* Wait for response */ | ||
423 | while (atomic_read(&data.started) != cpus) | ||
424 | cpu_relax(); | ||
425 | |||
426 | if (!wait) | ||
427 | return; | ||
428 | |||
429 | while (atomic_read(&data.finished) != cpus) | ||
430 | cpu_relax(); | ||
431 | } | ||
432 | |||
433 | /* | ||
434 | * smp_call_function - run a function on all other CPUs. | ||
435 | * @func: The function to run. This must be fast and non-blocking. | ||
436 | * @info: An arbitrary pointer to pass to the function. | ||
437 | * @nonatomic: currently unused. | ||
438 | * @wait: If true, wait (atomically) until function has completed on other | ||
439 | * CPUs. | ||
440 | * | ||
441 | * Returns 0 on success, else a negative status code. Does not return until | ||
442 | * remote CPUs are nearly ready to execute func, or have already executed it. | ||
443 | * | ||
444 | * You must not call this function with disabled interrupts or from a | ||
445 | * hardware interrupt handler or from a bottom half handler. | ||
446 | * Actually there are a few legal cases, like panic. | ||
447 | */ | ||
448 | int smp_call_function (void (*func) (void *info), void *info, int nonatomic, | ||
449 | int wait) | ||
450 | { | ||
451 | spin_lock(&call_lock); | ||
452 | __smp_call_function(func,info,nonatomic,wait); | ||
453 | spin_unlock(&call_lock); | ||
454 | return 0; | ||
455 | } | ||
456 | EXPORT_SYMBOL(smp_call_function); | ||
457 | |||
458 | static void stop_this_cpu(void *dummy) | ||
459 | { | ||
460 | local_irq_disable(); | ||
461 | /* | ||
462 | * Remove this CPU: | ||
463 | */ | ||
464 | cpu_clear(smp_processor_id(), cpu_online_map); | ||
465 | disable_local_APIC(); | ||
466 | for (;;) | ||
467 | halt(); | ||
468 | } | ||
469 | |||
470 | void smp_send_stop(void) | ||
471 | { | ||
472 | int nolock; | ||
473 | unsigned long flags; | ||
474 | |||
475 | if (reboot_force) | ||
476 | return; | ||
477 | |||
478 | /* Don't deadlock on the call lock in panic */ | ||
479 | nolock = !spin_trylock(&call_lock); | ||
480 | local_irq_save(flags); | ||
481 | __smp_call_function(stop_this_cpu, NULL, 0, 0); | ||
482 | if (!nolock) | ||
483 | spin_unlock(&call_lock); | ||
484 | disable_local_APIC(); | ||
485 | local_irq_restore(flags); | ||
486 | } | ||
487 | |||
488 | /* | ||
489 | * Reschedule call back. Nothing to do, | ||
490 | * all the work is done automatically when | ||
491 | * we return from the interrupt. | ||
492 | */ | ||
493 | asmlinkage void smp_reschedule_interrupt(void) | ||
494 | { | ||
495 | ack_APIC_irq(); | ||
496 | } | ||
497 | |||
498 | asmlinkage void smp_call_function_interrupt(void) | ||
499 | { | ||
500 | void (*func) (void *info) = call_data->func; | ||
501 | void *info = call_data->info; | ||
502 | int wait = call_data->wait; | ||
503 | |||
504 | ack_APIC_irq(); | ||
505 | /* | ||
506 | * Notify initiating CPU that I've grabbed the data and am | ||
507 | * about to execute the function | ||
508 | */ | ||
509 | mb(); | ||
510 | atomic_inc(&call_data->started); | ||
511 | /* | ||
512 | * At this point the info structure may be out of scope unless wait==1 | ||
513 | */ | ||
514 | exit_idle(); | ||
515 | irq_enter(); | ||
516 | (*func)(info); | ||
517 | irq_exit(); | ||
518 | if (wait) { | ||
519 | mb(); | ||
520 | atomic_inc(&call_data->finished); | ||
521 | } | ||
522 | } | ||
523 | |||
diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c new file mode 100644 index 000000000000..32f50783edc8 --- /dev/null +++ b/arch/x86/kernel/smpboot_64.c | |||
@@ -0,0 +1,1085 @@ | |||
1 | /* | ||
2 | * x86 SMP booting functions | ||
3 | * | ||
4 | * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | ||
5 | * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> | ||
6 | * Copyright 2001 Andi Kleen, SuSE Labs. | ||
7 | * | ||
8 | * Much of the core SMP work is based on previous work by Thomas Radke, to | ||
9 | * whom a great many thanks are extended. | ||
10 | * | ||
11 | * Thanks to Intel for making available several different Pentium, | ||
12 | * Pentium Pro and Pentium-II/Xeon MP machines. | ||
13 | * Original development of Linux SMP code supported by Caldera. | ||
14 | * | ||
15 | * This code is released under the GNU General Public License version 2 | ||
16 | * | ||
17 | * Fixes | ||
18 | * Felix Koop : NR_CPUS used properly | ||
19 | * Jose Renau : Handle single CPU case. | ||
20 | * Alan Cox : By repeated request 8) - Total BogoMIP report. | ||
21 | * Greg Wright : Fix for kernel stacks panic. | ||
22 | * Erich Boleyn : MP v1.4 and additional changes. | ||
23 | * Matthias Sattler : Changes for 2.1 kernel map. | ||
24 | * Michel Lespinasse : Changes for 2.1 kernel map. | ||
25 | * Michael Chastain : Change trampoline.S to gnu as. | ||
26 | * Alan Cox : Dumb bug: 'B' step PPro's are fine | ||
27 | * Ingo Molnar : Added APIC timers, based on code | ||
28 | * from Jose Renau | ||
29 | * Ingo Molnar : various cleanups and rewrites | ||
30 | * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. | ||
31 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs | ||
32 | * Andi Kleen : Changed for SMP boot into long mode. | ||
33 | * Rusty Russell : Hacked into shape for new "hotplug" boot process. | ||
34 | * Andi Kleen : Converted to new state machine. | ||
35 | * Various cleanups. | ||
36 | * Probably mostly hotplug CPU ready now. | ||
37 | * Ashok Raj : CPU hotplug support | ||
38 | */ | ||
39 | |||
40 | |||
41 | #include <linux/init.h> | ||
42 | |||
43 | #include <linux/mm.h> | ||
44 | #include <linux/kernel_stat.h> | ||
45 | #include <linux/bootmem.h> | ||
46 | #include <linux/thread_info.h> | ||
47 | #include <linux/module.h> | ||
48 | #include <linux/delay.h> | ||
49 | #include <linux/mc146818rtc.h> | ||
50 | #include <linux/smp.h> | ||
51 | #include <linux/kdebug.h> | ||
52 | |||
53 | #include <asm/mtrr.h> | ||
54 | #include <asm/pgalloc.h> | ||
55 | #include <asm/desc.h> | ||
56 | #include <asm/tlbflush.h> | ||
57 | #include <asm/proto.h> | ||
58 | #include <asm/nmi.h> | ||
59 | #include <asm/irq.h> | ||
60 | #include <asm/hw_irq.h> | ||
61 | #include <asm/numa.h> | ||
62 | |||
63 | /* Number of siblings per CPU package */ | ||
64 | int smp_num_siblings = 1; | ||
65 | EXPORT_SYMBOL(smp_num_siblings); | ||
66 | |||
67 | /* Last level cache ID of each logical CPU */ | ||
68 | u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; | ||
69 | |||
70 | /* Bitmask of currently online CPUs */ | ||
71 | cpumask_t cpu_online_map __read_mostly; | ||
72 | |||
73 | EXPORT_SYMBOL(cpu_online_map); | ||
74 | |||
75 | /* | ||
76 | * Private maps to synchronize booting between AP and BP. | ||
77 | * Probably not needed anymore, but it makes for easier debugging. -AK | ||
78 | */ | ||
79 | cpumask_t cpu_callin_map; | ||
80 | cpumask_t cpu_callout_map; | ||
81 | EXPORT_SYMBOL(cpu_callout_map); | ||
82 | |||
83 | cpumask_t cpu_possible_map; | ||
84 | EXPORT_SYMBOL(cpu_possible_map); | ||
85 | |||
86 | /* Per CPU bogomips and other parameters */ | ||
87 | struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; | ||
88 | EXPORT_SYMBOL(cpu_data); | ||
89 | |||
90 | /* Set when the idlers are all forked */ | ||
91 | int smp_threads_ready; | ||
92 | |||
93 | /* representing HT siblings of each logical CPU */ | ||
94 | cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; | ||
95 | EXPORT_SYMBOL(cpu_sibling_map); | ||
96 | |||
97 | /* representing HT and core siblings of each logical CPU */ | ||
98 | cpumask_t cpu_core_map[NR_CPUS] __read_mostly; | ||
99 | EXPORT_SYMBOL(cpu_core_map); | ||
100 | |||
101 | /* | ||
102 | * Trampoline 80x86 program as an array. | ||
103 | */ | ||
104 | |||
105 | extern unsigned char trampoline_data[]; | ||
106 | extern unsigned char trampoline_end[]; | ||
107 | |||
108 | /* State of each CPU */ | ||
109 | DEFINE_PER_CPU(int, cpu_state) = { 0 }; | ||
110 | |||
111 | /* | ||
112 | * Store all idle threads, this can be reused instead of creating | ||
113 | * a new thread. Also avoids complicated thread destroy functionality | ||
114 | * for idle threads. | ||
115 | */ | ||
116 | struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; | ||
117 | |||
118 | #define get_idle_for_cpu(x) (idle_thread_array[(x)]) | ||
119 | #define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) | ||
120 | |||
121 | /* | ||
122 | * Currently trivial. Write the real->protected mode | ||
123 | * bootstrap into the page concerned. The caller | ||
124 | * has made sure it's suitably aligned. | ||
125 | */ | ||
126 | |||
127 | static unsigned long __cpuinit setup_trampoline(void) | ||
128 | { | ||
129 | void *tramp = __va(SMP_TRAMPOLINE_BASE); | ||
130 | memcpy(tramp, trampoline_data, trampoline_end - trampoline_data); | ||
131 | return virt_to_phys(tramp); | ||
132 | } | ||
133 | |||
134 | /* | ||
135 | * The bootstrap kernel entry code has set these up. Save them for | ||
136 | * a given CPU | ||
137 | */ | ||
138 | |||
139 | static void __cpuinit smp_store_cpu_info(int id) | ||
140 | { | ||
141 | struct cpuinfo_x86 *c = cpu_data + id; | ||
142 | |||
143 | *c = boot_cpu_data; | ||
144 | identify_cpu(c); | ||
145 | print_cpu_info(c); | ||
146 | } | ||
147 | |||
148 | static atomic_t init_deasserted __cpuinitdata; | ||
149 | |||
150 | /* | ||
151 | * Report back to the Boot Processor. | ||
152 | * Running on AP. | ||
153 | */ | ||
154 | void __cpuinit smp_callin(void) | ||
155 | { | ||
156 | int cpuid, phys_id; | ||
157 | unsigned long timeout; | ||
158 | |||
159 | /* | ||
160 | * If waken up by an INIT in an 82489DX configuration | ||
161 | * we may get here before an INIT-deassert IPI reaches | ||
162 | * our local APIC. We have to wait for the IPI or we'll | ||
163 | * lock up on an APIC access. | ||
164 | */ | ||
165 | while (!atomic_read(&init_deasserted)) | ||
166 | cpu_relax(); | ||
167 | |||
168 | /* | ||
169 | * (This works even if the APIC is not enabled.) | ||
170 | */ | ||
171 | phys_id = GET_APIC_ID(apic_read(APIC_ID)); | ||
172 | cpuid = smp_processor_id(); | ||
173 | if (cpu_isset(cpuid, cpu_callin_map)) { | ||
174 | panic("smp_callin: phys CPU#%d, CPU#%d already present??\n", | ||
175 | phys_id, cpuid); | ||
176 | } | ||
177 | Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); | ||
178 | |||
179 | /* | ||
180 | * STARTUP IPIs are fragile beasts as they might sometimes | ||
181 | * trigger some glue motherboard logic. Complete APIC bus | ||
182 | * silence for 1 second; this overestimates by a factor of two | ||
183 | * the time the boot CPU spends sending the up to 2 STARTUP IPIs. | ||
184 | * This should be enough. | ||
185 | */ | ||
186 | |||
187 | /* | ||
188 | * Waiting 2s total for startup (udelay is not yet working) | ||
189 | */ | ||
190 | timeout = jiffies + 2*HZ; | ||
191 | while (time_before(jiffies, timeout)) { | ||
192 | /* | ||
193 | * Has the boot CPU finished its STARTUP sequence? | ||
194 | */ | ||
195 | if (cpu_isset(cpuid, cpu_callout_map)) | ||
196 | break; | ||
197 | cpu_relax(); | ||
198 | } | ||
199 | |||
200 | if (!time_before(jiffies, timeout)) { | ||
201 | panic("smp_callin: CPU%d started up but did not get a callout!\n", | ||
202 | cpuid); | ||
203 | } | ||
204 | |||
205 | /* | ||
206 | * the boot CPU has finished the init stage and is spinning | ||
207 | * on callin_map until we finish. We are free to set up this | ||
208 | * CPU, first the APIC. (this is probably redundant on most | ||
209 | * boards) | ||
210 | */ | ||
211 | |||
212 | Dprintk("CALLIN, before setup_local_APIC().\n"); | ||
213 | setup_local_APIC(); | ||
214 | |||
215 | /* | ||
216 | * Get our bogomips. | ||
217 | * | ||
218 | * Need to enable IRQs because it can take longer and then | ||
219 | * the NMI watchdog might kill us. | ||
220 | */ | ||
221 | local_irq_enable(); | ||
222 | calibrate_delay(); | ||
223 | local_irq_disable(); | ||
224 | Dprintk("Stack at about %p\n",&cpuid); | ||
225 | |||
226 | disable_APIC_timer(); | ||
227 | |||
228 | /* | ||
229 | * Save our processor parameters | ||
230 | */ | ||
231 | smp_store_cpu_info(cpuid); | ||
232 | |||
233 | /* | ||
234 | * Allow the master to continue. | ||
235 | */ | ||
236 | cpu_set(cpuid, cpu_callin_map); | ||
237 | } | ||
238 | |||
239 | /* maps the cpu to the sched domain representing multi-core */ | ||
240 | cpumask_t cpu_coregroup_map(int cpu) | ||
241 | { | ||
242 | struct cpuinfo_x86 *c = cpu_data + cpu; | ||
243 | /* | ||
244 | * For performance, we return the last-level-cache shared map. | ||
245 | * For power savings, we return cpu_core_map. | ||
246 | */ | ||
247 | if (sched_mc_power_savings || sched_smt_power_savings) | ||
248 | return cpu_core_map[cpu]; | ||
249 | else | ||
250 | return c->llc_shared_map; | ||
251 | } | ||
252 | |||
253 | /* representing cpus for which sibling maps can be computed */ | ||
254 | static cpumask_t cpu_sibling_setup_map; | ||
255 | |||
256 | static inline void set_cpu_sibling_map(int cpu) | ||
257 | { | ||
258 | int i; | ||
259 | struct cpuinfo_x86 *c = cpu_data; | ||
260 | |||
261 | cpu_set(cpu, cpu_sibling_setup_map); | ||
262 | |||
263 | if (smp_num_siblings > 1) { | ||
264 | for_each_cpu_mask(i, cpu_sibling_setup_map) { | ||
265 | if (c[cpu].phys_proc_id == c[i].phys_proc_id && | ||
266 | c[cpu].cpu_core_id == c[i].cpu_core_id) { | ||
267 | cpu_set(i, cpu_sibling_map[cpu]); | ||
268 | cpu_set(cpu, cpu_sibling_map[i]); | ||
269 | cpu_set(i, cpu_core_map[cpu]); | ||
270 | cpu_set(cpu, cpu_core_map[i]); | ||
271 | cpu_set(i, c[cpu].llc_shared_map); | ||
272 | cpu_set(cpu, c[i].llc_shared_map); | ||
273 | } | ||
274 | } | ||
275 | } else { | ||
276 | cpu_set(cpu, cpu_sibling_map[cpu]); | ||
277 | } | ||
278 | |||
279 | cpu_set(cpu, c[cpu].llc_shared_map); | ||
280 | |||
281 | if (current_cpu_data.x86_max_cores == 1) { | ||
282 | cpu_core_map[cpu] = cpu_sibling_map[cpu]; | ||
283 | c[cpu].booted_cores = 1; | ||
284 | return; | ||
285 | } | ||
286 | |||
287 | for_each_cpu_mask(i, cpu_sibling_setup_map) { | ||
288 | if (cpu_llc_id[cpu] != BAD_APICID && | ||
289 | cpu_llc_id[cpu] == cpu_llc_id[i]) { | ||
290 | cpu_set(i, c[cpu].llc_shared_map); | ||
291 | cpu_set(cpu, c[i].llc_shared_map); | ||
292 | } | ||
293 | if (c[cpu].phys_proc_id == c[i].phys_proc_id) { | ||
294 | cpu_set(i, cpu_core_map[cpu]); | ||
295 | cpu_set(cpu, cpu_core_map[i]); | ||
296 | /* | ||
297 | * Does this new cpu bring up a new core? | ||
298 | */ | ||
299 | if (cpus_weight(cpu_sibling_map[cpu]) == 1) { | ||
300 | /* | ||
301 | * for each core in package, increment | ||
302 | * the booted_cores for this new cpu | ||
303 | */ | ||
304 | if (first_cpu(cpu_sibling_map[i]) == i) | ||
305 | c[cpu].booted_cores++; | ||
306 | /* | ||
307 | * increment the core count for all | ||
308 | * the other cpus in this package | ||
309 | */ | ||
310 | if (i != cpu) | ||
311 | c[i].booted_cores++; | ||
312 | } else if (i != cpu && !c[cpu].booted_cores) | ||
313 | c[cpu].booted_cores = c[i].booted_cores; | ||
314 | } | ||
315 | } | ||
316 | } | ||
317 | |||
318 | /* | ||
319 | * Setup code on secondary processor (after coming out of the trampoline) | ||
320 | */ | ||
321 | void __cpuinit start_secondary(void) | ||
322 | { | ||
323 | /* | ||
324 | * Don't put anything before smp_callin(); SMP | ||
325 | * booting is so fragile that we want to limit the | ||
326 | * things done here to the bare minimum. | ||
327 | */ | ||
328 | cpu_init(); | ||
329 | preempt_disable(); | ||
330 | smp_callin(); | ||
331 | |||
332 | /* otherwise gcc will move up the smp_processor_id before the cpu_init */ | ||
333 | barrier(); | ||
334 | |||
335 | /* | ||
336 | * Check TSC sync first: | ||
337 | */ | ||
338 | check_tsc_sync_target(); | ||
339 | |||
340 | Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); | ||
341 | setup_secondary_APIC_clock(); | ||
342 | |||
343 | Dprintk("cpu %d: enabling apic timer\n", smp_processor_id()); | ||
344 | |||
345 | if (nmi_watchdog == NMI_IO_APIC) { | ||
346 | disable_8259A_irq(0); | ||
347 | enable_NMI_through_LVT0(NULL); | ||
348 | enable_8259A_irq(0); | ||
349 | } | ||
350 | |||
351 | enable_APIC_timer(); | ||
352 | |||
353 | /* | ||
354 | * The sibling maps must be set before turning the online map on for | ||
355 | * this cpu | ||
356 | */ | ||
357 | set_cpu_sibling_map(smp_processor_id()); | ||
358 | |||
359 | /* | ||
360 | * We need to hold call_lock, so there is no inconsistency | ||
361 | * between the time smp_call_function() determines the number of | ||
362 | * IPI recipients, and the time when the determination is made | ||
363 | * for which cpus receive the IPI in genapic_flat.c. Holding this | ||
364 | * lock helps us to not include this cpu in a currently in progress | ||
365 | * smp_call_function(). | ||
366 | */ | ||
367 | lock_ipi_call_lock(); | ||
368 | spin_lock(&vector_lock); | ||
369 | |||
370 | /* Setup the per cpu irq handling data structures */ | ||
371 | __setup_vector_irq(smp_processor_id()); | ||
372 | /* | ||
373 | * Allow the master to continue. | ||
374 | */ | ||
375 | cpu_set(smp_processor_id(), cpu_online_map); | ||
376 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; | ||
377 | spin_unlock(&vector_lock); | ||
378 | |||
379 | unlock_ipi_call_lock(); | ||
380 | |||
381 | cpu_idle(); | ||
382 | } | ||
383 | |||
384 | extern volatile unsigned long init_rsp; | ||
385 | extern void (*initial_code)(void); | ||
386 | |||
387 | #ifdef APIC_DEBUG | ||
388 | static void inquire_remote_apic(int apicid) | ||
389 | { | ||
390 | unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; | ||
391 | char *names[] = { "ID", "VERSION", "SPIV" }; | ||
392 | int timeout; | ||
393 | unsigned int status; | ||
394 | |||
395 | printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); | ||
396 | |||
397 | for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) { | ||
398 | printk("... APIC #%d %s: ", apicid, names[i]); | ||
399 | |||
400 | /* | ||
401 | * Wait for idle. | ||
402 | */ | ||
403 | status = safe_apic_wait_icr_idle(); | ||
404 | if (status) | ||
405 | printk("a previous APIC delivery may have failed\n"); | ||
406 | |||
407 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); | ||
408 | apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); | ||
409 | |||
410 | timeout = 0; | ||
411 | do { | ||
412 | udelay(100); | ||
413 | status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; | ||
414 | } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); | ||
415 | |||
416 | switch (status) { | ||
417 | case APIC_ICR_RR_VALID: | ||
418 | status = apic_read(APIC_RRR); | ||
419 | printk("%08x\n", status); | ||
420 | break; | ||
421 | default: | ||
422 | printk("failed\n"); | ||
423 | } | ||
424 | } | ||
425 | } | ||
426 | #endif | ||
427 | |||
428 | /* | ||
429 | * Kick the secondary to wake up. | ||
430 | */ | ||
431 | static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) | ||
432 | { | ||
433 | unsigned long send_status, accept_status = 0; | ||
434 | int maxlvt, num_starts, j; | ||
435 | |||
436 | Dprintk("Asserting INIT.\n"); | ||
437 | |||
438 | /* | ||
439 | * Turn INIT on target chip | ||
440 | */ | ||
441 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | ||
442 | |||
443 | /* | ||
444 | * Send IPI | ||
445 | */ | ||
446 | apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT | ||
447 | | APIC_DM_INIT); | ||
448 | |||
449 | Dprintk("Waiting for send to finish...\n"); | ||
450 | send_status = safe_apic_wait_icr_idle(); | ||
451 | |||
452 | mdelay(10); | ||
453 | |||
454 | Dprintk("Deasserting INIT.\n"); | ||
455 | |||
456 | /* Target chip */ | ||
457 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | ||
458 | |||
459 | /* Send IPI */ | ||
460 | apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); | ||
461 | |||
462 | Dprintk("Waiting for send to finish...\n"); | ||
463 | send_status = safe_apic_wait_icr_idle(); | ||
464 | |||
465 | mb(); | ||
466 | atomic_set(&init_deasserted, 1); | ||
467 | |||
468 | num_starts = 2; | ||
469 | |||
470 | /* | ||
471 | * Run STARTUP IPI loop. | ||
472 | */ | ||
473 | Dprintk("#startup loops: %d.\n", num_starts); | ||
474 | |||
475 | maxlvt = get_maxlvt(); | ||
476 | |||
477 | for (j = 1; j <= num_starts; j++) { | ||
478 | Dprintk("Sending STARTUP #%d.\n",j); | ||
479 | apic_write(APIC_ESR, 0); | ||
480 | apic_read(APIC_ESR); | ||
481 | Dprintk("After apic_write.\n"); | ||
482 | |||
483 | /* | ||
484 | * STARTUP IPI | ||
485 | */ | ||
486 | |||
487 | /* Target chip */ | ||
488 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | ||
489 | |||
490 | /* Boot on the stack */ | ||
491 | /* Kick the second */ | ||
492 | apic_write(APIC_ICR, APIC_DM_STARTUP | (start_rip >> 12)); | ||
493 | |||
494 | /* | ||
495 | * Give the other CPU some time to accept the IPI. | ||
496 | */ | ||
497 | udelay(300); | ||
498 | |||
499 | Dprintk("Startup point 1.\n"); | ||
500 | |||
501 | Dprintk("Waiting for send to finish...\n"); | ||
502 | send_status = safe_apic_wait_icr_idle(); | ||
503 | |||
504 | /* | ||
505 | * Give the other CPU some time to accept the IPI. | ||
506 | */ | ||
507 | udelay(200); | ||
508 | /* | ||
509 | * Due to the Pentium erratum 3AP. | ||
510 | */ | ||
511 | if (maxlvt > 3) { | ||
512 | apic_write(APIC_ESR, 0); | ||
513 | } | ||
514 | accept_status = (apic_read(APIC_ESR) & 0xEF); | ||
515 | if (send_status || accept_status) | ||
516 | break; | ||
517 | } | ||
518 | Dprintk("After Startup.\n"); | ||
519 | |||
520 | if (send_status) | ||
521 | printk(KERN_ERR "APIC never delivered???\n"); | ||
522 | if (accept_status) | ||
523 | printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); | ||
524 | |||
525 | return (send_status | accept_status); | ||
526 | } | ||
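wakeup_secondary_via_INIT() encodes the trampoline's physical page number (start_rip >> 12) into the STARTUP IPI, while do_boot_cpu() below stores the same address as a real-mode segment/offset pair in the BIOS warm-reset vector at 0x469/0x467. Illustrative-only standalone sketch of the two encodings (the trampoline address is hypothetical):

    /* Illustrative only: STARTUP vector and warm-reset encodings of the trampoline. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long start_rip = 0x6000;    /* hypothetical trampoline base */

            unsigned startup_vector = start_rip >> 12;      /* 4 KiB page number */
            unsigned short warm_segment = start_rip >> 4;   /* written to 0x469 */
            unsigned short warm_offset  = start_rip & 0xf;  /* written to 0x467 */

            printf("STARTUP vector field: %#x\n", startup_vector);
            printf("warm reset %04x:%04x -> physical %#lx\n",
                   warm_segment, warm_offset,
                   ((unsigned long)warm_segment << 4) + warm_offset);
            return 0;
    }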
527 | |||
528 | struct create_idle { | ||
529 | struct work_struct work; | ||
530 | struct task_struct *idle; | ||
531 | struct completion done; | ||
532 | int cpu; | ||
533 | }; | ||
534 | |||
535 | void do_fork_idle(struct work_struct *work) | ||
536 | { | ||
537 | struct create_idle *c_idle = | ||
538 | container_of(work, struct create_idle, work); | ||
539 | |||
540 | c_idle->idle = fork_idle(c_idle->cpu); | ||
541 | complete(&c_idle->done); | ||
542 | } | ||
543 | |||
544 | /* | ||
545 | * Boot one CPU. | ||
546 | */ | ||
547 | static int __cpuinit do_boot_cpu(int cpu, int apicid) | ||
548 | { | ||
549 | unsigned long boot_error; | ||
550 | int timeout; | ||
551 | unsigned long start_rip; | ||
552 | struct create_idle c_idle = { | ||
553 | .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle), | ||
554 | .cpu = cpu, | ||
555 | .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), | ||
556 | }; | ||
557 | |||
558 | /* allocate memory for gdts of secondary cpus. Hotplug is considered */ | ||
559 | if (!cpu_gdt_descr[cpu].address && | ||
560 | !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) { | ||
561 | printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu); | ||
562 | return -1; | ||
563 | } | ||
564 | |||
565 | /* Allocate node local memory for AP pdas */ | ||
566 | if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) { | ||
567 | struct x8664_pda *newpda, *pda; | ||
568 | int node = cpu_to_node(cpu); | ||
569 | pda = cpu_pda(cpu); | ||
570 | newpda = kmalloc_node(sizeof (struct x8664_pda), GFP_ATOMIC, | ||
571 | node); | ||
572 | if (newpda) { | ||
573 | memcpy(newpda, pda, sizeof (struct x8664_pda)); | ||
574 | cpu_pda(cpu) = newpda; | ||
575 | } else | ||
576 | printk(KERN_ERR | ||
577 | "Could not allocate node local PDA for CPU %d on node %d\n", | ||
578 | cpu, node); | ||
579 | } | ||
580 | |||
581 | alternatives_smp_switch(1); | ||
582 | |||
583 | c_idle.idle = get_idle_for_cpu(cpu); | ||
584 | |||
585 | if (c_idle.idle) { | ||
586 | c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) | ||
587 | (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); | ||
588 | init_idle(c_idle.idle, cpu); | ||
589 | goto do_rest; | ||
590 | } | ||
591 | |||
592 | /* | ||
593 | * During the cold boot process, the keventd thread is not spun up yet. | ||
594 | * When we do CPU hot-add, we create idle threads on the fly; they should | ||
595 | * not acquire any attributes from the calling context. Hence the clean | ||
596 | * way to create kernel threads is to do that from keventd(). | ||
597 | * We check current_is_keventd() because the ACPI notifier also | ||
598 | * queues work to keventd(), and if the caller is already running in | ||
599 | * the context of keventd(), we would end up deadlocking the keventd | ||
600 | * thread. | ||
601 | */ | ||
602 | if (!keventd_up() || current_is_keventd()) | ||
603 | c_idle.work.func(&c_idle.work); | ||
604 | else { | ||
605 | schedule_work(&c_idle.work); | ||
606 | wait_for_completion(&c_idle.done); | ||
607 | } | ||
608 | |||
609 | if (IS_ERR(c_idle.idle)) { | ||
610 | printk("failed fork for CPU %d\n", cpu); | ||
611 | return PTR_ERR(c_idle.idle); | ||
612 | } | ||
613 | |||
614 | set_idle_for_cpu(cpu, c_idle.idle); | ||
615 | |||
616 | do_rest: | ||
617 | |||
618 | cpu_pda(cpu)->pcurrent = c_idle.idle; | ||
619 | |||
620 | start_rip = setup_trampoline(); | ||
621 | |||
622 | init_rsp = c_idle.idle->thread.rsp; | ||
623 | per_cpu(init_tss,cpu).rsp0 = init_rsp; | ||
624 | initial_code = start_secondary; | ||
625 | clear_tsk_thread_flag(c_idle.idle, TIF_FORK); | ||
626 | |||
627 | printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu, | ||
628 | cpus_weight(cpu_present_map), | ||
629 | apicid); | ||
630 | |||
631 | /* | ||
632 | * This grunge runs the startup process for | ||
633 | * the targeted processor. | ||
634 | */ | ||
635 | |||
636 | atomic_set(&init_deasserted, 0); | ||
637 | |||
638 | Dprintk("Setting warm reset code and vector.\n"); | ||
639 | |||
640 | CMOS_WRITE(0xa, 0xf); | ||
641 | local_flush_tlb(); | ||
642 | Dprintk("1.\n"); | ||
643 | *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4; | ||
644 | Dprintk("2.\n"); | ||
645 | *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf; | ||
646 | Dprintk("3.\n"); | ||
647 | |||
648 | /* | ||
649 | * Be paranoid about clearing APIC errors. | ||
650 | */ | ||
651 | apic_write(APIC_ESR, 0); | ||
652 | apic_read(APIC_ESR); | ||
653 | |||
654 | /* | ||
655 | * Status is now clean | ||
656 | */ | ||
657 | boot_error = 0; | ||
658 | |||
659 | /* | ||
660 | * Starting actual IPI sequence... | ||
661 | */ | ||
662 | boot_error = wakeup_secondary_via_INIT(apicid, start_rip); | ||
663 | |||
664 | if (!boot_error) { | ||
665 | /* | ||
666 | * allow APs to start initializing. | ||
667 | */ | ||
668 | Dprintk("Before Callout %d.\n", cpu); | ||
669 | cpu_set(cpu, cpu_callout_map); | ||
670 | Dprintk("After Callout %d.\n", cpu); | ||
671 | |||
672 | /* | ||
673 | * Wait 5s total for a response | ||
674 | */ | ||
675 | for (timeout = 0; timeout < 50000; timeout++) { | ||
676 | if (cpu_isset(cpu, cpu_callin_map)) | ||
677 | break; /* It has booted */ | ||
678 | udelay(100); | ||
679 | } | ||
680 | |||
681 | if (cpu_isset(cpu, cpu_callin_map)) { | ||
682 | /* number CPUs logically, starting from 1 (BSP is 0) */ | ||
683 | Dprintk("CPU has booted.\n"); | ||
684 | } else { | ||
685 | boot_error = 1; | ||
686 | if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE)) | ||
687 | == 0xA5) | ||
688 | /* trampoline started but...? */ | ||
689 | printk("Stuck ??\n"); | ||
690 | else | ||
691 | /* trampoline code not run */ | ||
692 | printk("Not responding.\n"); | ||
693 | #ifdef APIC_DEBUG | ||
694 | inquire_remote_apic(apicid); | ||
695 | #endif | ||
696 | } | ||
697 | } | ||
698 | if (boot_error) { | ||
699 | cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ | ||
700 | clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ | ||
701 | clear_node_cpumask(cpu); /* was set by numa_add_cpu */ | ||
702 | cpu_clear(cpu, cpu_present_map); | ||
703 | cpu_clear(cpu, cpu_possible_map); | ||
704 | x86_cpu_to_apicid[cpu] = BAD_APICID; | ||
705 | x86_cpu_to_log_apicid[cpu] = BAD_APICID; | ||
706 | return -EIO; | ||
707 | } | ||
708 | |||
709 | return 0; | ||
710 | } | ||
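/*
 * Illustrative sketch, not part of this patch: how the warm-reset
 * vector written above at 0x467/0x469 encodes the real-mode entry
 * point.  The trampoline sits below 1MB on a paragraph boundary, so
 * the BIOS resumes at segment start_rip >> 4 with offset start_rip & 0xf.
 * The helper name is hypothetical.
 */
static inline void example_warm_reset_vector(unsigned long start_rip,
					     unsigned short *segment,
					     unsigned short *offset)
{
	*segment = start_rip >> 4;	/* stored at phys 0x469 */
	*offset  = start_rip & 0xf;	/* stored at phys 0x467 */
}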
711 | |||
712 | cycles_t cacheflush_time; | ||
713 | unsigned long cache_decay_ticks; | ||
714 | |||
715 | /* | ||
716 | * Cleanup possible dangling ends... | ||
717 | */ | ||
718 | static __cpuinit void smp_cleanup_boot(void) | ||
719 | { | ||
720 | /* | ||
721 | * Paranoid: Set warm reset code and vector here back | ||
722 | * to default values. | ||
723 | */ | ||
724 | CMOS_WRITE(0, 0xf); | ||
725 | |||
726 | /* | ||
727 | * Reset trampoline flag | ||
728 | */ | ||
729 | *((volatile int *) phys_to_virt(0x467)) = 0; | ||
730 | } | ||
731 | |||
732 | /* | ||
733 | * Fall back to non SMP mode after errors. | ||
734 | * | ||
735 | * RED-PEN audit/test this more. I bet there is more state messed up here. | ||
736 | */ | ||
737 | static __init void disable_smp(void) | ||
738 | { | ||
739 | cpu_present_map = cpumask_of_cpu(0); | ||
740 | cpu_possible_map = cpumask_of_cpu(0); | ||
741 | if (smp_found_config) | ||
742 | phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); | ||
743 | else | ||
744 | phys_cpu_present_map = physid_mask_of_physid(0); | ||
745 | cpu_set(0, cpu_sibling_map[0]); | ||
746 | cpu_set(0, cpu_core_map[0]); | ||
747 | } | ||
748 | |||
749 | #ifdef CONFIG_HOTPLUG_CPU | ||
750 | |||
751 | int additional_cpus __initdata = -1; | ||
752 | |||
753 | /* | ||
754 | * cpu_possible_map should be static: it cannot change as CPUs | ||
755 | * are onlined or offlined. The reason is that per-cpu data structures | ||
756 | * are allocated by some modules at init time, and they don't expect to | ||
757 | * do this dynamically on cpu arrival/departure. | ||
758 | * cpu_present_map on the other hand can change dynamically. | ||
759 | * When cpu hotplug is not compiled in, we fall back to the current | ||
760 | * behaviour, which is cpu_possible == cpu_present. | ||
761 | * - Ashok Raj | ||
762 | * | ||
763 | * Three ways to find out the number of additional hotplug CPUs: | ||
764 | * - If the BIOS specified disabled CPUs in ACPI/mptables, use that. | ||
765 | * - The user can override it with additional_cpus=NUM. | ||
766 | * - Otherwise don't reserve additional CPUs. | ||
767 | * We do this because additional CPUs waste a lot of memory. | ||
768 | * -AK | ||
769 | */ | ||
770 | __init void prefill_possible_map(void) | ||
771 | { | ||
772 | int i; | ||
773 | int possible; | ||
774 | |||
775 | if (additional_cpus == -1) { | ||
776 | if (disabled_cpus > 0) | ||
777 | additional_cpus = disabled_cpus; | ||
778 | else | ||
779 | additional_cpus = 0; | ||
780 | } | ||
781 | possible = num_processors + additional_cpus; | ||
782 | if (possible > NR_CPUS) | ||
783 | possible = NR_CPUS; | ||
784 | |||
785 | printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", | ||
786 | possible, | ||
787 | max_t(int, possible - num_processors, 0)); | ||
788 | |||
789 | for (i = 0; i < possible; i++) | ||
790 | cpu_set(i, cpu_possible_map); | ||
791 | } | ||
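/*
 * Illustrative sketch, not part of this patch: the sizing rule used by
 * prefill_possible_map() above, written as a stand-alone helper.  The
 * parameters mirror the kernel globals of the same names; the helper
 * itself is hypothetical.
 */
static int example_count_possible_cpus(int num_processors, int disabled_cpus,
				       int additional_cpus, int nr_cpus_limit)
{
	if (additional_cpus == -1)
		additional_cpus = (disabled_cpus > 0) ? disabled_cpus : 0;

	if (num_processors + additional_cpus > nr_cpus_limit)
		return nr_cpus_limit;
	return num_processors + additional_cpus;
}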
792 | #endif | ||
793 | |||
794 | /* | ||
795 | * Various sanity checks. | ||
796 | */ | ||
797 | static int __init smp_sanity_check(unsigned max_cpus) | ||
798 | { | ||
799 | if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { | ||
800 | printk("weird, boot CPU (#%d) not listed by the BIOS.\n", | ||
801 | hard_smp_processor_id()); | ||
802 | physid_set(hard_smp_processor_id(), phys_cpu_present_map); | ||
803 | } | ||
804 | |||
805 | /* | ||
806 | * If we couldn't find an SMP configuration at boot time, | ||
807 | * get out of here now! | ||
808 | */ | ||
809 | if (!smp_found_config) { | ||
810 | printk(KERN_NOTICE "SMP motherboard not detected.\n"); | ||
811 | disable_smp(); | ||
812 | if (APIC_init_uniprocessor()) | ||
813 | printk(KERN_NOTICE "Local APIC not detected." | ||
814 | " Using dummy APIC emulation.\n"); | ||
815 | return -1; | ||
816 | } | ||
817 | |||
818 | /* | ||
819 | * Should not be necessary because the MP table should list the boot | ||
820 | * CPU too, but we do it for the sake of robustness anyway. | ||
821 | */ | ||
822 | if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) { | ||
823 | printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n", | ||
824 | boot_cpu_id); | ||
825 | physid_set(hard_smp_processor_id(), phys_cpu_present_map); | ||
826 | } | ||
827 | |||
828 | /* | ||
829 | * If we couldn't find a local APIC, then get out of here now! | ||
830 | */ | ||
831 | if (!cpu_has_apic) { | ||
832 | printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", | ||
833 | boot_cpu_id); | ||
834 | printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); | ||
835 | nr_ioapics = 0; | ||
836 | return -1; | ||
837 | } | ||
838 | |||
839 | /* | ||
840 | * If SMP should be disabled, then really disable it! | ||
841 | */ | ||
842 | if (!max_cpus) { | ||
843 | printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); | ||
844 | nr_ioapics = 0; | ||
845 | return -1; | ||
846 | } | ||
847 | |||
848 | return 0; | ||
849 | } | ||
850 | |||
851 | /* | ||
852 | * Prepare for SMP bootup. The MP table or ACPI has been read | ||
853 | * earlier. Just do some sanity checking here and enable APIC mode. | ||
854 | */ | ||
855 | void __init smp_prepare_cpus(unsigned int max_cpus) | ||
856 | { | ||
857 | nmi_watchdog_default(); | ||
858 | current_cpu_data = boot_cpu_data; | ||
859 | current_thread_info()->cpu = 0; /* needed? */ | ||
860 | set_cpu_sibling_map(0); | ||
861 | |||
862 | if (smp_sanity_check(max_cpus) < 0) { | ||
863 | printk(KERN_INFO "SMP disabled\n"); | ||
864 | disable_smp(); | ||
865 | return; | ||
866 | } | ||
867 | |||
868 | |||
869 | /* | ||
870 | * Switch from PIC to APIC mode. | ||
871 | */ | ||
872 | setup_local_APIC(); | ||
873 | |||
874 | if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { | ||
875 | panic("Boot APIC ID in local APIC unexpected (%d vs %d)", | ||
876 | GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); | ||
877 | /* Or can we switch back to PIC here? */ | ||
878 | } | ||
879 | |||
880 | /* | ||
881 | * Now start the IO-APICs | ||
882 | */ | ||
883 | if (!skip_ioapic_setup && nr_ioapics) | ||
884 | setup_IO_APIC(); | ||
885 | else | ||
886 | nr_ioapics = 0; | ||
887 | |||
888 | /* | ||
889 | * Set up local APIC timer on boot CPU. | ||
890 | */ | ||
891 | |||
892 | setup_boot_APIC_clock(); | ||
893 | } | ||
894 | |||
895 | /* | ||
896 | * Early setup to make printk work. | ||
897 | */ | ||
898 | void __init smp_prepare_boot_cpu(void) | ||
899 | { | ||
900 | int me = smp_processor_id(); | ||
901 | cpu_set(me, cpu_online_map); | ||
902 | cpu_set(me, cpu_callout_map); | ||
903 | per_cpu(cpu_state, me) = CPU_ONLINE; | ||
904 | } | ||
905 | |||
906 | /* | ||
907 | * Entry point to boot a CPU. | ||
908 | */ | ||
909 | int __cpuinit __cpu_up(unsigned int cpu) | ||
910 | { | ||
911 | int apicid = cpu_present_to_apicid(cpu); | ||
912 | unsigned long flags; | ||
913 | int err; | ||
914 | |||
915 | WARN_ON(irqs_disabled()); | ||
916 | |||
917 | Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); | ||
918 | |||
919 | if (apicid == BAD_APICID || apicid == boot_cpu_id || | ||
920 | !physid_isset(apicid, phys_cpu_present_map)) { | ||
921 | printk("__cpu_up: bad cpu %d\n", cpu); | ||
922 | return -EINVAL; | ||
923 | } | ||
924 | |||
925 | /* | ||
926 | * Already booted CPU? | ||
927 | */ | ||
928 | if (cpu_isset(cpu, cpu_callin_map)) { | ||
929 | Dprintk("do_boot_cpu %d Already started\n", cpu); | ||
930 | return -ENOSYS; | ||
931 | } | ||
932 | |||
933 | /* | ||
934 | * Save current MTRR state in case it was changed since early boot | ||
935 | * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: | ||
936 | */ | ||
937 | mtrr_save_state(); | ||
938 | |||
939 | per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; | ||
940 | /* Boot it! */ | ||
941 | err = do_boot_cpu(cpu, apicid); | ||
942 | if (err < 0) { | ||
943 | Dprintk("do_boot_cpu failed %d\n", err); | ||
944 | return err; | ||
945 | } | ||
946 | |||
947 | /* Unleash the CPU! */ | ||
948 | Dprintk("waiting for cpu %d\n", cpu); | ||
949 | |||
950 | /* | ||
951 | * Make sure and check TSC sync: | ||
952 | */ | ||
953 | local_irq_save(flags); | ||
954 | check_tsc_sync_source(cpu); | ||
955 | local_irq_restore(flags); | ||
956 | |||
957 | while (!cpu_isset(cpu, cpu_online_map)) | ||
958 | cpu_relax(); | ||
959 | err = 0; | ||
960 | |||
961 | return err; | ||
962 | } | ||
963 | |||
964 | /* | ||
965 | * Finish the SMP boot. | ||
966 | */ | ||
967 | void __init smp_cpus_done(unsigned int max_cpus) | ||
968 | { | ||
969 | smp_cleanup_boot(); | ||
970 | setup_ioapic_dest(); | ||
971 | check_nmi_watchdog(); | ||
972 | } | ||
973 | |||
974 | #ifdef CONFIG_HOTPLUG_CPU | ||
975 | |||
976 | static void remove_siblinginfo(int cpu) | ||
977 | { | ||
978 | int sibling; | ||
979 | struct cpuinfo_x86 *c = cpu_data; | ||
980 | |||
981 | for_each_cpu_mask(sibling, cpu_core_map[cpu]) { | ||
982 | cpu_clear(cpu, cpu_core_map[sibling]); | ||
983 | /* | ||
984 | * the last thread sibling in this CPU core is going down | ||
985 | */ | ||
986 | if (cpus_weight(cpu_sibling_map[cpu]) == 1) | ||
987 | c[sibling].booted_cores--; | ||
988 | } | ||
989 | |||
990 | for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) | ||
991 | cpu_clear(cpu, cpu_sibling_map[sibling]); | ||
992 | cpus_clear(cpu_sibling_map[cpu]); | ||
993 | cpus_clear(cpu_core_map[cpu]); | ||
994 | c[cpu].phys_proc_id = 0; | ||
995 | c[cpu].cpu_core_id = 0; | ||
996 | cpu_clear(cpu, cpu_sibling_setup_map); | ||
997 | } | ||
998 | |||
999 | void remove_cpu_from_maps(void) | ||
1000 | { | ||
1001 | int cpu = smp_processor_id(); | ||
1002 | |||
1003 | cpu_clear(cpu, cpu_callout_map); | ||
1004 | cpu_clear(cpu, cpu_callin_map); | ||
1005 | clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ | ||
1006 | clear_node_cpumask(cpu); | ||
1007 | } | ||
1008 | |||
1009 | int __cpu_disable(void) | ||
1010 | { | ||
1011 | int cpu = smp_processor_id(); | ||
1012 | |||
1013 | /* | ||
1014 | * Perhaps use cpufreq to drop frequency, but that could go | ||
1015 | * into generic code. | ||
1016 | * | ||
1017 | * We won't take down the boot processor on i386 due to some | ||
1018 | * interrupts only being able to be serviced by the BSP. | ||
1019 | * Especially so if we're not using an IOAPIC -zwane | ||
1020 | */ | ||
1021 | if (cpu == 0) | ||
1022 | return -EBUSY; | ||
1023 | |||
1024 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
1025 | stop_apic_nmi_watchdog(NULL); | ||
1026 | clear_local_APIC(); | ||
1027 | |||
1028 | /* | ||
1029 | * HACK: | ||
1030 | * Allow any queued timer interrupts to get serviced | ||
1031 | * This is only a temporary solution until we cleanup | ||
1032 | * fixup_irqs as we do for IA64. | ||
1033 | */ | ||
1034 | local_irq_enable(); | ||
1035 | mdelay(1); | ||
1036 | |||
1037 | local_irq_disable(); | ||
1038 | remove_siblinginfo(cpu); | ||
1039 | |||
1040 | spin_lock(&vector_lock); | ||
1041 | /* It's now safe to remove this processor from the online map */ | ||
1042 | cpu_clear(cpu, cpu_online_map); | ||
1043 | spin_unlock(&vector_lock); | ||
1044 | remove_cpu_from_maps(); | ||
1045 | fixup_irqs(cpu_online_map); | ||
1046 | return 0; | ||
1047 | } | ||
1048 | |||
1049 | void __cpu_die(unsigned int cpu) | ||
1050 | { | ||
1051 | /* We don't do anything here: idle task is faking death itself. */ | ||
1052 | unsigned int i; | ||
1053 | |||
1054 | for (i = 0; i < 10; i++) { | ||
1055 | /* They ack this in play_dead by setting CPU_DEAD */ | ||
1056 | if (per_cpu(cpu_state, cpu) == CPU_DEAD) { | ||
1057 | printk ("CPU %d is now offline\n", cpu); | ||
1058 | if (1 == num_online_cpus()) | ||
1059 | alternatives_smp_switch(0); | ||
1060 | return; | ||
1061 | } | ||
1062 | msleep(100); | ||
1063 | } | ||
1064 | printk(KERN_ERR "CPU %u didn't die...\n", cpu); | ||
1065 | } | ||
1066 | |||
1067 | static __init int setup_additional_cpus(char *s) | ||
1068 | { | ||
1069 | return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL; | ||
1070 | } | ||
1071 | early_param("additional_cpus", setup_additional_cpus); | ||
1072 | |||
1073 | #else /* ... !CONFIG_HOTPLUG_CPU */ | ||
1074 | |||
1075 | int __cpu_disable(void) | ||
1076 | { | ||
1077 | return -ENOSYS; | ||
1078 | } | ||
1079 | |||
1080 | void __cpu_die(unsigned int cpu) | ||
1081 | { | ||
1082 | /* We said "no" in __cpu_disable */ | ||
1083 | BUG(); | ||
1084 | } | ||
1085 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c new file mode 100644 index 000000000000..cb9109113584 --- /dev/null +++ b/arch/x86/kernel/stacktrace.c | |||
@@ -0,0 +1,54 @@ | |||
1 | /* | ||
2 | * arch/x86_64/kernel/stacktrace.c | ||
3 | * | ||
4 | * Stack trace management functions | ||
5 | * | ||
6 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | */ | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/stacktrace.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <asm/stacktrace.h> | ||
12 | |||
13 | static void save_stack_warning(void *data, char *msg) | ||
14 | { | ||
15 | } | ||
16 | |||
17 | static void | ||
18 | save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) | ||
19 | { | ||
20 | } | ||
21 | |||
22 | static int save_stack_stack(void *data, char *name) | ||
23 | { | ||
24 | return -1; | ||
25 | } | ||
26 | |||
27 | static void save_stack_address(void *data, unsigned long addr) | ||
28 | { | ||
29 | struct stack_trace *trace = (struct stack_trace *)data; | ||
30 | if (trace->skip > 0) { | ||
31 | trace->skip--; | ||
32 | return; | ||
33 | } | ||
34 | if (trace->nr_entries < trace->max_entries) | ||
35 | trace->entries[trace->nr_entries++] = addr; | ||
36 | } | ||
37 | |||
38 | static struct stacktrace_ops save_stack_ops = { | ||
39 | .warning = save_stack_warning, | ||
40 | .warning_symbol = save_stack_warning_symbol, | ||
41 | .stack = save_stack_stack, | ||
42 | .address = save_stack_address, | ||
43 | }; | ||
44 | |||
45 | /* | ||
46 | * Save stack-backtrace addresses into a stack_trace buffer. | ||
47 | */ | ||
48 | void save_stack_trace(struct stack_trace *trace) | ||
49 | { | ||
50 | dump_trace(current, NULL, NULL, &save_stack_ops, trace); | ||
51 | if (trace->nr_entries < trace->max_entries) | ||
52 | trace->entries[trace->nr_entries++] = ULONG_MAX; | ||
53 | } | ||
54 | EXPORT_SYMBOL(save_stack_trace); | ||
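/*
 * Illustrative usage sketch, not part of this patch: how a caller might
 * capture and print a backtrace with the interface above.  The function
 * and EXAMPLE_TRACE_DEPTH are hypothetical names.
 */
#define EXAMPLE_TRACE_DEPTH 16

static void example_dump_current_stack(void)
{
	unsigned long entries[EXAMPLE_TRACE_DEPTH];
	struct stack_trace trace = {
		.nr_entries	= 0,
		.max_entries	= EXAMPLE_TRACE_DEPTH,
		.entries	= entries,
		.skip		= 0,
	};
	unsigned int i;

	save_stack_trace(&trace);
	for (i = 0; i < trace.nr_entries; i++)
		printk(KERN_DEBUG " [<%016lx>]\n", trace.entries[i]);
}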
diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/kernel/suspend_64.c new file mode 100644 index 000000000000..573c0a6e0ac6 --- /dev/null +++ b/arch/x86/kernel/suspend_64.c | |||
@@ -0,0 +1,239 @@ | |||
1 | /* | ||
2 | * Suspend support specific for i386. | ||
3 | * | ||
4 | * Distribute under GPLv2 | ||
5 | * | ||
6 | * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> | ||
7 | * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> | ||
8 | */ | ||
9 | |||
10 | #include <linux/smp.h> | ||
11 | #include <linux/suspend.h> | ||
12 | #include <asm/proto.h> | ||
13 | #include <asm/page.h> | ||
14 | #include <asm/pgtable.h> | ||
15 | #include <asm/mtrr.h> | ||
16 | |||
17 | /* References to section boundaries */ | ||
18 | extern const void __nosave_begin, __nosave_end; | ||
19 | |||
20 | struct saved_context saved_context; | ||
21 | |||
22 | unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx; | ||
23 | unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi; | ||
24 | unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11; | ||
25 | unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15; | ||
26 | unsigned long saved_context_eflags; | ||
27 | |||
28 | void __save_processor_state(struct saved_context *ctxt) | ||
29 | { | ||
30 | kernel_fpu_begin(); | ||
31 | |||
32 | /* | ||
33 | * descriptor tables | ||
34 | */ | ||
35 | asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); | ||
36 | asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); | ||
37 | asm volatile ("str %0" : "=m" (ctxt->tr)); | ||
38 | |||
39 | /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ | ||
40 | /* | ||
41 | * segment registers | ||
42 | */ | ||
43 | asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); | ||
44 | asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); | ||
45 | asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); | ||
46 | asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); | ||
47 | asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); | ||
48 | |||
49 | rdmsrl(MSR_FS_BASE, ctxt->fs_base); | ||
50 | rdmsrl(MSR_GS_BASE, ctxt->gs_base); | ||
51 | rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); | ||
52 | mtrr_save_fixed_ranges(NULL); | ||
53 | |||
54 | /* | ||
55 | * control registers | ||
56 | */ | ||
57 | rdmsrl(MSR_EFER, ctxt->efer); | ||
58 | ctxt->cr0 = read_cr0(); | ||
59 | ctxt->cr2 = read_cr2(); | ||
60 | ctxt->cr3 = read_cr3(); | ||
61 | ctxt->cr4 = read_cr4(); | ||
62 | ctxt->cr8 = read_cr8(); | ||
63 | } | ||
64 | |||
65 | void save_processor_state(void) | ||
66 | { | ||
67 | __save_processor_state(&saved_context); | ||
68 | } | ||
69 | |||
70 | static void do_fpu_end(void) | ||
71 | { | ||
72 | /* | ||
73 | * Restore FPU regs if necessary | ||
74 | */ | ||
75 | kernel_fpu_end(); | ||
76 | } | ||
77 | |||
78 | void __restore_processor_state(struct saved_context *ctxt) | ||
79 | { | ||
80 | /* | ||
81 | * control registers | ||
82 | */ | ||
83 | wrmsrl(MSR_EFER, ctxt->efer); | ||
84 | write_cr8(ctxt->cr8); | ||
85 | write_cr4(ctxt->cr4); | ||
86 | write_cr3(ctxt->cr3); | ||
87 | write_cr2(ctxt->cr2); | ||
88 | write_cr0(ctxt->cr0); | ||
89 | |||
90 | /* | ||
91 | * now restore the descriptor tables to their proper values | ||
92 | * ltr is done in fix_processor_context(). | ||
93 | */ | ||
94 | asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); | ||
95 | asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); | ||
96 | |||
97 | /* | ||
98 | * segment registers | ||
99 | */ | ||
100 | asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); | ||
101 | asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); | ||
102 | asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); | ||
103 | load_gs_index(ctxt->gs); | ||
104 | asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); | ||
105 | |||
106 | wrmsrl(MSR_FS_BASE, ctxt->fs_base); | ||
107 | wrmsrl(MSR_GS_BASE, ctxt->gs_base); | ||
108 | wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); | ||
109 | |||
110 | fix_processor_context(); | ||
111 | |||
112 | do_fpu_end(); | ||
113 | mtrr_ap_init(); | ||
114 | } | ||
115 | |||
116 | void restore_processor_state(void) | ||
117 | { | ||
118 | __restore_processor_state(&saved_context); | ||
119 | } | ||
120 | |||
121 | void fix_processor_context(void) | ||
122 | { | ||
123 | int cpu = smp_processor_id(); | ||
124 | struct tss_struct *t = &per_cpu(init_tss, cpu); | ||
125 | |||
126 | set_tss_desc(cpu,t); /* This just modifies memory; it should not be necessary. But... it is necessary, because 386 hardware has the concept of a busy TSS or some similar stupidity. */ | ||
127 | |||
128 | cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9; | ||
129 | |||
130 | syscall_init(); /* This sets MSR_*STAR and related */ | ||
131 | load_TR_desc(); /* This does ltr */ | ||
132 | load_LDT(¤t->active_mm->context); /* This does lldt */ | ||
133 | |||
134 | /* | ||
135 | * Now maybe reload the debug registers | ||
136 | */ | ||
137 | if (current->thread.debugreg7){ | ||
138 | loaddebug(¤t->thread, 0); | ||
139 | loaddebug(¤t->thread, 1); | ||
140 | loaddebug(¤t->thread, 2); | ||
141 | loaddebug(¤t->thread, 3); | ||
142 | /* no 4 and 5 */ | ||
143 | loaddebug(¤t->thread, 6); | ||
144 | loaddebug(¤t->thread, 7); | ||
145 | } | ||
146 | |||
147 | } | ||
148 | |||
149 | #ifdef CONFIG_HIBERNATION | ||
150 | /* Defined in arch/x86_64/kernel/suspend_asm.S */ | ||
151 | extern int restore_image(void); | ||
152 | |||
153 | pgd_t *temp_level4_pgt; | ||
154 | |||
155 | static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) | ||
156 | { | ||
157 | long i, j; | ||
158 | |||
159 | i = pud_index(address); | ||
160 | pud = pud + i; | ||
161 | for (; i < PTRS_PER_PUD; pud++, i++) { | ||
162 | unsigned long paddr; | ||
163 | pmd_t *pmd; | ||
164 | |||
165 | paddr = address + i*PUD_SIZE; | ||
166 | if (paddr >= end) | ||
167 | break; | ||
168 | |||
169 | pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); | ||
170 | if (!pmd) | ||
171 | return -ENOMEM; | ||
172 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); | ||
173 | for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { | ||
174 | unsigned long pe; | ||
175 | |||
176 | if (paddr >= end) | ||
177 | break; | ||
178 | pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr; | ||
179 | pe &= __supported_pte_mask; | ||
180 | set_pmd(pmd, __pmd(pe)); | ||
181 | } | ||
182 | } | ||
183 | return 0; | ||
184 | } | ||
185 | |||
186 | static int set_up_temporary_mappings(void) | ||
187 | { | ||
188 | unsigned long start, end, next; | ||
189 | int error; | ||
190 | |||
191 | temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); | ||
192 | if (!temp_level4_pgt) | ||
193 | return -ENOMEM; | ||
194 | |||
195 | /* It is safe to reuse the original kernel mapping */ | ||
196 | set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), | ||
197 | init_level4_pgt[pgd_index(__START_KERNEL_map)]); | ||
198 | |||
199 | /* Set up the direct mapping from scratch */ | ||
200 | start = (unsigned long)pfn_to_kaddr(0); | ||
201 | end = (unsigned long)pfn_to_kaddr(end_pfn); | ||
202 | |||
203 | for (; start < end; start = next) { | ||
204 | pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); | ||
205 | if (!pud) | ||
206 | return -ENOMEM; | ||
207 | next = start + PGDIR_SIZE; | ||
208 | if (next > end) | ||
209 | next = end; | ||
210 | if ((error = res_phys_pud_init(pud, __pa(start), __pa(next)))) | ||
211 | return error; | ||
212 | set_pgd(temp_level4_pgt + pgd_index(start), | ||
213 | mk_kernel_pgd(__pa(pud))); | ||
214 | } | ||
215 | return 0; | ||
216 | } | ||
217 | |||
218 | int swsusp_arch_resume(void) | ||
219 | { | ||
220 | int error; | ||
221 | |||
222 | /* We have enough memory, and from now on we cannot recover */ | ||
223 | if ((error = set_up_temporary_mappings())) | ||
224 | return error; | ||
225 | restore_image(); | ||
226 | return 0; | ||
227 | } | ||
228 | |||
229 | /* | ||
230 | * pfn_is_nosave - check if given pfn is in the 'nosave' section | ||
231 | */ | ||
232 | |||
233 | int pfn_is_nosave(unsigned long pfn) | ||
234 | { | ||
235 | unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; | ||
236 | unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; | ||
237 | return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); | ||
238 | } | ||
239 | #endif /* CONFIG_HIBERNATION */ | ||
diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/kernel/suspend_asm_64.S new file mode 100644 index 000000000000..16d183f67bc1 --- /dev/null +++ b/arch/x86/kernel/suspend_asm_64.S | |||
@@ -0,0 +1,110 @@ | |||
1 | /* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl> | ||
2 | * | ||
3 | * Distribute under GPLv2. | ||
4 | * | ||
5 | * swsusp_arch_resume may not use any stack, nor any variable that is | ||
6 | * not "NoSave" during copying pages: | ||
7 | * | ||
8 | * It is rewriting one kernel image with another. What is a stack page in | ||
9 | * the "old" image could very well be a data page in the "new" image, and | ||
10 | * overwriting your own stack while you are using it is a bad idea. | ||
11 | */ | ||
12 | |||
13 | .text | ||
14 | #include <linux/linkage.h> | ||
15 | #include <asm/segment.h> | ||
16 | #include <asm/page.h> | ||
17 | #include <asm/asm-offsets.h> | ||
18 | |||
19 | ENTRY(swsusp_arch_suspend) | ||
20 | |||
21 | movq %rsp, saved_context_esp(%rip) | ||
22 | movq %rax, saved_context_eax(%rip) | ||
23 | movq %rbx, saved_context_ebx(%rip) | ||
24 | movq %rcx, saved_context_ecx(%rip) | ||
25 | movq %rdx, saved_context_edx(%rip) | ||
26 | movq %rbp, saved_context_ebp(%rip) | ||
27 | movq %rsi, saved_context_esi(%rip) | ||
28 | movq %rdi, saved_context_edi(%rip) | ||
29 | movq %r8, saved_context_r08(%rip) | ||
30 | movq %r9, saved_context_r09(%rip) | ||
31 | movq %r10, saved_context_r10(%rip) | ||
32 | movq %r11, saved_context_r11(%rip) | ||
33 | movq %r12, saved_context_r12(%rip) | ||
34 | movq %r13, saved_context_r13(%rip) | ||
35 | movq %r14, saved_context_r14(%rip) | ||
36 | movq %r15, saved_context_r15(%rip) | ||
37 | pushfq ; popq saved_context_eflags(%rip) | ||
38 | |||
39 | call swsusp_save | ||
40 | ret | ||
41 | |||
42 | ENTRY(restore_image) | ||
43 | /* switch to temporary page tables */ | ||
44 | movq $__PAGE_OFFSET, %rdx | ||
45 | movq temp_level4_pgt(%rip), %rax | ||
46 | subq %rdx, %rax | ||
47 | movq %rax, %cr3 | ||
48 | /* Flush TLB */ | ||
49 | movq mmu_cr4_features(%rip), %rax | ||
50 | movq %rax, %rdx | ||
51 | andq $~(1<<7), %rdx # PGE | ||
52 | movq %rdx, %cr4; # turn off PGE | ||
53 | movq %cr3, %rcx; # flush TLB | ||
54 | movq %rcx, %cr3; | ||
55 | movq %rax, %cr4; # turn PGE back on | ||
56 | |||
57 | movq restore_pblist(%rip), %rdx | ||
58 | loop: | ||
59 | testq %rdx, %rdx | ||
60 | jz done | ||
61 | |||
62 | /* get addresses from the pbe and copy the page */ | ||
63 | movq pbe_address(%rdx), %rsi | ||
64 | movq pbe_orig_address(%rdx), %rdi | ||
65 | movq $512, %rcx | ||
66 | rep | ||
67 | movsq | ||
68 | |||
69 | /* progress to the next pbe */ | ||
70 | movq pbe_next(%rdx), %rdx | ||
71 | jmp loop | ||
72 | done: | ||
73 | /* go back to the original page tables */ | ||
74 | movq $(init_level4_pgt - __START_KERNEL_map), %rax | ||
75 | addq phys_base(%rip), %rax | ||
76 | movq %rax, %cr3 | ||
77 | |||
78 | /* Flush TLB, including "global" things (vmalloc) */ | ||
79 | movq mmu_cr4_features(%rip), %rax | ||
80 | movq %rax, %rdx | ||
81 | andq $~(1<<7), %rdx; # PGE | ||
82 | movq %rdx, %cr4; # turn off PGE | ||
83 | movq %cr3, %rcx; # flush TLB | ||
84 | movq %rcx, %cr3 | ||
85 | movq %rax, %cr4; # turn PGE back on | ||
86 | |||
87 | movl $24, %eax | ||
88 | movl %eax, %ds | ||
89 | |||
90 | movq saved_context_esp(%rip), %rsp | ||
91 | movq saved_context_ebp(%rip), %rbp | ||
92 | /* Don't restore %rax, it must be 0 anyway */ | ||
93 | movq saved_context_ebx(%rip), %rbx | ||
94 | movq saved_context_ecx(%rip), %rcx | ||
95 | movq saved_context_edx(%rip), %rdx | ||
96 | movq saved_context_esi(%rip), %rsi | ||
97 | movq saved_context_edi(%rip), %rdi | ||
98 | movq saved_context_r08(%rip), %r8 | ||
99 | movq saved_context_r09(%rip), %r9 | ||
100 | movq saved_context_r10(%rip), %r10 | ||
101 | movq saved_context_r11(%rip), %r11 | ||
102 | movq saved_context_r12(%rip), %r12 | ||
103 | movq saved_context_r13(%rip), %r13 | ||
104 | movq saved_context_r14(%rip), %r14 | ||
105 | movq saved_context_r15(%rip), %r15 | ||
106 | pushq saved_context_eflags(%rip) ; popfq | ||
107 | |||
108 | xorq %rax, %rax | ||
109 | |||
110 | ret | ||
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c new file mode 100644 index 000000000000..4770b7a2052c --- /dev/null +++ b/arch/x86/kernel/sys_x86_64.c | |||
@@ -0,0 +1,159 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/sys_x86_64.c | ||
3 | */ | ||
4 | |||
5 | #include <linux/errno.h> | ||
6 | #include <linux/sched.h> | ||
7 | #include <linux/syscalls.h> | ||
8 | #include <linux/mm.h> | ||
9 | #include <linux/fs.h> | ||
10 | #include <linux/smp.h> | ||
11 | #include <linux/sem.h> | ||
12 | #include <linux/msg.h> | ||
13 | #include <linux/shm.h> | ||
14 | #include <linux/stat.h> | ||
15 | #include <linux/mman.h> | ||
16 | #include <linux/file.h> | ||
17 | #include <linux/utsname.h> | ||
18 | #include <linux/personality.h> | ||
19 | |||
20 | #include <asm/uaccess.h> | ||
21 | #include <asm/ia32.h> | ||
22 | |||
23 | /* | ||
24 | * sys_pipe() is the normal C calling standard for creating | ||
25 | * a pipe. It's not the way Unix traditionally does this, though. | ||
26 | */ | ||
27 | asmlinkage long sys_pipe(int __user *fildes) | ||
28 | { | ||
29 | int fd[2]; | ||
30 | int error; | ||
31 | |||
32 | error = do_pipe(fd); | ||
33 | if (!error) { | ||
34 | if (copy_to_user(fildes, fd, 2*sizeof(int))) | ||
35 | error = -EFAULT; | ||
36 | } | ||
37 | return error; | ||
38 | } | ||
39 | |||
40 | asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, | ||
41 | unsigned long fd, unsigned long off) | ||
42 | { | ||
43 | long error; | ||
44 | struct file * file; | ||
45 | |||
46 | error = -EINVAL; | ||
47 | if (off & ~PAGE_MASK) | ||
48 | goto out; | ||
49 | |||
50 | error = -EBADF; | ||
51 | file = NULL; | ||
52 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
53 | if (!(flags & MAP_ANONYMOUS)) { | ||
54 | file = fget(fd); | ||
55 | if (!file) | ||
56 | goto out; | ||
57 | } | ||
58 | down_write(¤t->mm->mmap_sem); | ||
59 | error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT); | ||
60 | up_write(¤t->mm->mmap_sem); | ||
61 | |||
62 | if (file) | ||
63 | fput(file); | ||
64 | out: | ||
65 | return error; | ||
66 | } | ||
67 | |||
68 | static void find_start_end(unsigned long flags, unsigned long *begin, | ||
69 | unsigned long *end) | ||
70 | { | ||
71 | if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { | ||
72 | /* This is usually needed to map code in the small | ||
73 | model, so it needs to be in the first 31 bits. Limit | ||
74 | it to that. This means we need to move the | ||
75 | unmapped base down for this case. This can give | ||
76 | conflicts with the heap, but we assume that glibc | ||
77 | malloc knows how to fall back to mmap. Give it 1GB | ||
78 | of playground for now. -AK */ | ||
79 | *begin = 0x40000000; | ||
80 | *end = 0x80000000; | ||
81 | } else { | ||
82 | *begin = TASK_UNMAPPED_BASE; | ||
83 | *end = TASK_SIZE; | ||
84 | } | ||
85 | } | ||
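/*
 * Illustrative sketch, not part of this patch: the two search windows
 * selected above.  A 64-bit task that passes MAP_32BIT gets the 1GB
 * playground at 0x40000000-0x80000000; everything else searches from
 * TASK_UNMAPPED_BASE up to TASK_SIZE.  The helper name is hypothetical.
 */
static void example_print_mmap_window(unsigned long flags)
{
	unsigned long begin, end;

	find_start_end(flags, &begin, &end);
	printk(KERN_DEBUG "mmap search window: %#lx - %#lx\n", begin, end);
}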
86 | |||
87 | unsigned long | ||
88 | arch_get_unmapped_area(struct file *filp, unsigned long addr, | ||
89 | unsigned long len, unsigned long pgoff, unsigned long flags) | ||
90 | { | ||
91 | struct mm_struct *mm = current->mm; | ||
92 | struct vm_area_struct *vma; | ||
93 | unsigned long start_addr; | ||
94 | unsigned long begin, end; | ||
95 | |||
96 | if (flags & MAP_FIXED) | ||
97 | return addr; | ||
98 | |||
99 | find_start_end(flags, &begin, &end); | ||
100 | |||
101 | if (len > end) | ||
102 | return -ENOMEM; | ||
103 | |||
104 | if (addr) { | ||
105 | addr = PAGE_ALIGN(addr); | ||
106 | vma = find_vma(mm, addr); | ||
107 | if (end - len >= addr && | ||
108 | (!vma || addr + len <= vma->vm_start)) | ||
109 | return addr; | ||
110 | } | ||
111 | if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) | ||
112 | && len <= mm->cached_hole_size) { | ||
113 | mm->cached_hole_size = 0; | ||
114 | mm->free_area_cache = begin; | ||
115 | } | ||
116 | addr = mm->free_area_cache; | ||
117 | if (addr < begin) | ||
118 | addr = begin; | ||
119 | start_addr = addr; | ||
120 | |||
121 | full_search: | ||
122 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
123 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
124 | if (end - len < addr) { | ||
125 | /* | ||
126 | * Start a new search - just in case we missed | ||
127 | * some holes. | ||
128 | */ | ||
129 | if (start_addr != begin) { | ||
130 | start_addr = addr = begin; | ||
131 | mm->cached_hole_size = 0; | ||
132 | goto full_search; | ||
133 | } | ||
134 | return -ENOMEM; | ||
135 | } | ||
136 | if (!vma || addr + len <= vma->vm_start) { | ||
137 | /* | ||
138 | * Remember the place where we stopped the search: | ||
139 | */ | ||
140 | mm->free_area_cache = addr + len; | ||
141 | return addr; | ||
142 | } | ||
143 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
144 | mm->cached_hole_size = vma->vm_start - addr; | ||
145 | |||
146 | addr = vma->vm_end; | ||
147 | } | ||
148 | } | ||
149 | |||
150 | asmlinkage long sys_uname(struct new_utsname __user * name) | ||
151 | { | ||
152 | int err; | ||
153 | down_read(&uts_sem); | ||
154 | err = copy_to_user(name, utsname(), sizeof (*name)); | ||
155 | up_read(&uts_sem); | ||
156 | if (personality(current->personality) == PER_LINUX32) | ||
157 | err |= copy_to_user(&name->machine, "i686", 5); | ||
158 | return err ? -EFAULT : 0; | ||
159 | } | ||
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c new file mode 100644 index 000000000000..63d592c276cc --- /dev/null +++ b/arch/x86/kernel/syscall_64.c | |||
@@ -0,0 +1,26 @@ | |||
1 | /* System call table for x86-64. */ | ||
2 | |||
3 | #include <linux/linkage.h> | ||
4 | #include <linux/sys.h> | ||
5 | #include <linux/cache.h> | ||
6 | #include <asm/asm-offsets.h> | ||
7 | |||
8 | #define __NO_STUBS | ||
9 | |||
10 | #define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; | ||
11 | #undef _ASM_X86_64_UNISTD_H_ | ||
12 | #include <asm-x86_64/unistd.h> | ||
13 | |||
14 | #undef __SYSCALL | ||
15 | #define __SYSCALL(nr, sym) [ nr ] = sym, | ||
16 | #undef _ASM_X86_64_UNISTD_H_ | ||
17 | |||
18 | typedef void (*sys_call_ptr_t)(void); | ||
19 | |||
20 | extern void sys_ni_syscall(void); | ||
21 | |||
22 | const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { | ||
23 | /* Smells like a compiler bug -- it doesn't work when the & below is removed. */ | ||
24 | [0 ... __NR_syscall_max] = &sys_ni_syscall, | ||
25 | #include <asm-x86_64/unistd.h> | ||
26 | }; | ||
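/*
 * Illustrative sketch, not part of this patch: the x-macro trick used
 * above, reduced to a self-contained two-entry toy table.  All names
 * here are hypothetical; the real table is generated by expanding
 * __SYSCALL over <asm-x86_64/unistd.h> twice.
 */
static void example_read(void)  { }
static void example_write(void) { }

#define EXAMPLE_CALLS \
	EXAMPLE_CALL(0, example_read) \
	EXAMPLE_CALL(1, example_write)

/* one pass builds the table with designated initializers, just like the
 * second __SYSCALL expansion above */
#define EXAMPLE_CALL(nr, sym) [nr] = &sym,
static void (*const example_call_table[])(void) = { EXAMPLE_CALLS };
#undef EXAMPLE_CALL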
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c new file mode 100644 index 000000000000..e3f2569b2c44 --- /dev/null +++ b/arch/x86/kernel/tce_64.c | |||
@@ -0,0 +1,189 @@ | |||
1 | /* | ||
2 | * This file manages the translation entries for the IBM Calgary IOMMU. | ||
3 | * | ||
4 | * Derived from arch/powerpc/platforms/pseries/iommu.c | ||
5 | * | ||
6 | * Copyright (C) IBM Corporation, 2006 | ||
7 | * | ||
8 | * Author: Jon Mason <jdmason@us.ibm.com> | ||
9 | * Author: Muli Ben-Yehuda <muli@il.ibm.com> | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License as published by | ||
13 | * the Free Software Foundation; either version 2 of the License, or | ||
14 | * (at your option) any later version. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program; if not, write to the Free Software | ||
23 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
24 | */ | ||
25 | |||
26 | #include <linux/types.h> | ||
27 | #include <linux/slab.h> | ||
28 | #include <linux/mm.h> | ||
29 | #include <linux/spinlock.h> | ||
30 | #include <linux/string.h> | ||
31 | #include <linux/pci.h> | ||
32 | #include <linux/dma-mapping.h> | ||
33 | #include <linux/bootmem.h> | ||
34 | #include <asm/tce.h> | ||
35 | #include <asm/calgary.h> | ||
36 | #include <asm/proto.h> | ||
37 | |||
38 | /* flush a tce at 'tceaddr' to main memory */ | ||
39 | static inline void flush_tce(void* tceaddr) | ||
40 | { | ||
41 | /* a single tce can't cross a cache line */ | ||
42 | if (cpu_has_clflush) | ||
43 | asm volatile("clflush (%0)" :: "r" (tceaddr)); | ||
44 | else | ||
45 | asm volatile("wbinvd":::"memory"); | ||
46 | } | ||
47 | |||
48 | void tce_build(struct iommu_table *tbl, unsigned long index, | ||
49 | unsigned int npages, unsigned long uaddr, int direction) | ||
50 | { | ||
51 | u64* tp; | ||
52 | u64 t; | ||
53 | u64 rpn; | ||
54 | |||
55 | t = (1 << TCE_READ_SHIFT); | ||
56 | if (direction != DMA_TO_DEVICE) | ||
57 | t |= (1 << TCE_WRITE_SHIFT); | ||
58 | |||
59 | tp = ((u64*)tbl->it_base) + index; | ||
60 | |||
61 | while (npages--) { | ||
62 | rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT; | ||
63 | t &= ~TCE_RPN_MASK; | ||
64 | t |= (rpn << TCE_RPN_SHIFT); | ||
65 | |||
66 | *tp = cpu_to_be64(t); | ||
67 | flush_tce(tp); | ||
68 | |||
69 | uaddr += PAGE_SIZE; | ||
70 | tp++; | ||
71 | } | ||
72 | } | ||
73 | |||
74 | void tce_free(struct iommu_table *tbl, long index, unsigned int npages) | ||
75 | { | ||
76 | u64* tp; | ||
77 | |||
78 | tp = ((u64*)tbl->it_base) + index; | ||
79 | |||
80 | while (npages--) { | ||
81 | *tp = cpu_to_be64(0); | ||
82 | flush_tce(tp); | ||
83 | tp++; | ||
84 | } | ||
85 | } | ||
86 | |||
87 | static inline unsigned int table_size_to_number_of_entries(unsigned char size) | ||
88 | { | ||
89 | /* | ||
90 | * size is the order of the table, 0-7 | ||
91 | * smallest table is 8K entries, so shift result by 13 to | ||
92 | * multiply by 8K | ||
93 | */ | ||
94 | return (1 << size) << 13; | ||
95 | } | ||
96 | |||
97 | static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) | ||
98 | { | ||
99 | unsigned int bitmapsz; | ||
100 | unsigned long bmppages; | ||
101 | int ret; | ||
102 | |||
103 | tbl->it_busno = dev->bus->number; | ||
104 | |||
105 | /* set the tce table size - measured in entries */ | ||
106 | tbl->it_size = table_size_to_number_of_entries(specified_table_size); | ||
107 | |||
108 | /* | ||
109 | * number of bytes needed for the bitmap, given the table size in | ||
110 | * number of entries; we need one bit per entry | ||
111 | */ | ||
112 | bitmapsz = tbl->it_size / BITS_PER_BYTE; | ||
113 | bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz)); | ||
114 | if (!bmppages) { | ||
115 | printk(KERN_ERR "Calgary: cannot allocate bitmap\n"); | ||
116 | ret = -ENOMEM; | ||
117 | goto done; | ||
118 | } | ||
119 | |||
120 | tbl->it_map = (unsigned long*)bmppages; | ||
121 | |||
122 | memset(tbl->it_map, 0, bitmapsz); | ||
123 | |||
124 | tbl->it_hint = 0; | ||
125 | |||
126 | spin_lock_init(&tbl->it_lock); | ||
127 | |||
128 | return 0; | ||
129 | |||
130 | done: | ||
131 | return ret; | ||
132 | } | ||
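/*
 * Illustrative sketch, not part of this patch: the bitmap sizing rule
 * used in tce_table_setparms() above, worked as a stand-alone helper.
 * An order-0 table has (1 << 0) << 13 = 8192 entries, so at one bit per
 * entry the allocation bitmap needs 8192 / 8 = 1024 bytes.  The helper
 * name is hypothetical.
 */
static unsigned int example_tce_bitmap_bytes(unsigned char order)
{
	unsigned int entries = table_size_to_number_of_entries(order);

	return entries / BITS_PER_BYTE;		/* one bit per TCE entry */
}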
133 | |||
134 | int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar) | ||
135 | { | ||
136 | struct iommu_table *tbl; | ||
137 | int ret; | ||
138 | |||
139 | if (pci_iommu(dev->bus)) { | ||
140 | printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n", | ||
141 | dev, pci_iommu(dev->bus)); | ||
142 | BUG(); | ||
143 | } | ||
144 | |||
145 | tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL); | ||
146 | if (!tbl) { | ||
147 | printk(KERN_ERR "Calgary: error allocating iommu_table\n"); | ||
148 | ret = -ENOMEM; | ||
149 | goto done; | ||
150 | } | ||
151 | |||
152 | ret = tce_table_setparms(dev, tbl); | ||
153 | if (ret) | ||
154 | goto free_tbl; | ||
155 | |||
156 | tbl->bbar = bbar; | ||
157 | |||
158 | set_pci_iommu(dev->bus, tbl); | ||
159 | |||
160 | return 0; | ||
161 | |||
162 | free_tbl: | ||
163 | kfree(tbl); | ||
164 | done: | ||
165 | return ret; | ||
166 | } | ||
167 | |||
168 | void * __init alloc_tce_table(void) | ||
169 | { | ||
170 | unsigned int size; | ||
171 | |||
172 | size = table_size_to_number_of_entries(specified_table_size); | ||
173 | size *= TCE_ENTRY_SIZE; | ||
174 | |||
175 | return __alloc_bootmem_low(size, size, 0); | ||
176 | } | ||
177 | |||
178 | void __init free_tce_table(void *tbl) | ||
179 | { | ||
180 | unsigned int size; | ||
181 | |||
182 | if (!tbl) | ||
183 | return; | ||
184 | |||
185 | size = table_size_to_number_of_entries(specified_table_size); | ||
186 | size *= TCE_ENTRY_SIZE; | ||
187 | |||
188 | free_bootmem(__pa(tbl), size); | ||
189 | } | ||
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c new file mode 100644 index 000000000000..6d48a4e826d9 --- /dev/null +++ b/arch/x86/kernel/time_64.c | |||
@@ -0,0 +1,447 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/kernel/time.c | ||
3 | * | ||
4 | * "High Precision Event Timer" based timekeeping. | ||
5 | * | ||
6 | * Copyright (c) 1991,1992,1995 Linus Torvalds | ||
7 | * Copyright (c) 1994 Alan Modra | ||
8 | * Copyright (c) 1995 Markus Kuhn | ||
9 | * Copyright (c) 1996 Ingo Molnar | ||
10 | * Copyright (c) 1998 Andrea Arcangeli | ||
11 | * Copyright (c) 2002,2006 Vojtech Pavlik | ||
12 | * Copyright (c) 2003 Andi Kleen | ||
13 | * RTC support code taken from arch/i386/kernel/timers/time_hpet.c | ||
14 | */ | ||
15 | |||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/sched.h> | ||
18 | #include <linux/interrupt.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/mc146818rtc.h> | ||
21 | #include <linux/time.h> | ||
22 | #include <linux/ioport.h> | ||
23 | #include <linux/module.h> | ||
24 | #include <linux/device.h> | ||
25 | #include <linux/sysdev.h> | ||
26 | #include <linux/bcd.h> | ||
27 | #include <linux/notifier.h> | ||
28 | #include <linux/cpu.h> | ||
29 | #include <linux/kallsyms.h> | ||
30 | #include <linux/acpi.h> | ||
31 | #ifdef CONFIG_ACPI | ||
32 | #include <acpi/achware.h> /* for PM timer frequency */ | ||
33 | #include <acpi/acpi_bus.h> | ||
34 | #endif | ||
35 | #include <asm/8253pit.h> | ||
36 | #include <asm/i8253.h> | ||
37 | #include <asm/pgtable.h> | ||
38 | #include <asm/vsyscall.h> | ||
39 | #include <asm/timex.h> | ||
40 | #include <asm/proto.h> | ||
41 | #include <asm/hpet.h> | ||
42 | #include <asm/sections.h> | ||
43 | #include <linux/hpet.h> | ||
44 | #include <asm/apic.h> | ||
45 | #include <asm/hpet.h> | ||
46 | #include <asm/mpspec.h> | ||
47 | #include <asm/nmi.h> | ||
48 | #include <asm/vgtod.h> | ||
49 | |||
50 | static char *timename = NULL; | ||
51 | |||
52 | DEFINE_SPINLOCK(rtc_lock); | ||
53 | EXPORT_SYMBOL(rtc_lock); | ||
54 | DEFINE_SPINLOCK(i8253_lock); | ||
55 | EXPORT_SYMBOL(i8253_lock); | ||
56 | |||
57 | volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; | ||
58 | |||
59 | unsigned long profile_pc(struct pt_regs *regs) | ||
60 | { | ||
61 | unsigned long pc = instruction_pointer(regs); | ||
62 | |||
63 | /* Assume the lock function has either no stack frame or a copy | ||
64 | of eflags from PUSHF. | ||
65 | Eflags always has bits 22 and up cleared, unlike kernel addresses. */ | ||
66 | if (!user_mode(regs) && in_lock_functions(pc)) { | ||
67 | unsigned long *sp = (unsigned long *)regs->rsp; | ||
68 | if (sp[0] >> 22) | ||
69 | return sp[0]; | ||
70 | if (sp[1] >> 22) | ||
71 | return sp[1]; | ||
72 | } | ||
73 | return pc; | ||
74 | } | ||
75 | EXPORT_SYMBOL(profile_pc); | ||
76 | |||
77 | /* | ||
78 | * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500 | ||
79 | * ms after the second nowtime has started, because when nowtime is written | ||
80 | * into the registers of the CMOS clock, it will jump to the next second | ||
81 | * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data | ||
82 | * sheet for details. | ||
83 | */ | ||
84 | |||
85 | static int set_rtc_mmss(unsigned long nowtime) | ||
86 | { | ||
87 | int retval = 0; | ||
88 | int real_seconds, real_minutes, cmos_minutes; | ||
89 | unsigned char control, freq_select; | ||
90 | |||
91 | /* | ||
92 | * IRQs are disabled when we're called from the timer interrupt, | ||
93 | * no need for spin_lock_irqsave() | ||
94 | */ | ||
95 | |||
96 | spin_lock(&rtc_lock); | ||
97 | |||
98 | /* | ||
99 | * Tell the clock it's being set and stop it. | ||
100 | */ | ||
101 | |||
102 | control = CMOS_READ(RTC_CONTROL); | ||
103 | CMOS_WRITE(control | RTC_SET, RTC_CONTROL); | ||
104 | |||
105 | freq_select = CMOS_READ(RTC_FREQ_SELECT); | ||
106 | CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT); | ||
107 | |||
108 | cmos_minutes = CMOS_READ(RTC_MINUTES); | ||
109 | BCD_TO_BIN(cmos_minutes); | ||
110 | |||
111 | /* | ||
112 | * since we're only adjusting minutes and seconds, don't interfere with hour | ||
113 | * overflow. This avoids messing with unknown time zones but requires your RTC | ||
114 | * not to be off by more than 15 minutes. Since we're calling it only when | ||
115 | * our clock is externally synchronized using NTP, this shouldn't be a problem. | ||
116 | */ | ||
117 | |||
118 | real_seconds = nowtime % 60; | ||
119 | real_minutes = nowtime / 60; | ||
120 | if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1) | ||
121 | real_minutes += 30; /* correct for half hour time zone */ | ||
122 | real_minutes %= 60; | ||
123 | |||
124 | if (abs(real_minutes - cmos_minutes) >= 30) { | ||
125 | printk(KERN_WARNING "time.c: can't update CMOS clock " | ||
126 | "from %d to %d\n", cmos_minutes, real_minutes); | ||
127 | retval = -1; | ||
128 | } else { | ||
129 | BIN_TO_BCD(real_seconds); | ||
130 | BIN_TO_BCD(real_minutes); | ||
131 | CMOS_WRITE(real_seconds, RTC_SECONDS); | ||
132 | CMOS_WRITE(real_minutes, RTC_MINUTES); | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * The following flags have to be released exactly in this order, otherwise the | ||
137 | * DS12887 (popular MC146818A clone with integrated battery and quartz) will | ||
138 | * not reset the oscillator and will not update precisely 500 ms later. You | ||
139 | * won't find this mentioned in the Dallas Semiconductor data sheets, but who | ||
140 | * believes data sheets anyway ... -- Markus Kuhn | ||
141 | */ | ||
142 | |||
143 | CMOS_WRITE(control, RTC_CONTROL); | ||
144 | CMOS_WRITE(freq_select, RTC_FREQ_SELECT); | ||
145 | |||
146 | spin_unlock(&rtc_lock); | ||
147 | |||
148 | return retval; | ||
149 | } | ||
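/*
 * Illustrative sketch, not part of this patch: the BCD <-> binary
 * conversion that set_rtc_mmss() relies on via BCD_TO_BIN/BIN_TO_BCD.
 * For example, the RTC byte 0x59 decodes to 5 * 10 + 9 = 59.  The
 * helper names are hypothetical.
 */
static inline unsigned int example_bcd_to_bin(unsigned int bcd)
{
	return (bcd & 0x0f) + (bcd >> 4) * 10;
}

static inline unsigned int example_bin_to_bcd(unsigned int bin)
{
	return ((bin / 10) << 4) | (bin % 10);
}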
150 | |||
151 | int update_persistent_clock(struct timespec now) | ||
152 | { | ||
153 | return set_rtc_mmss(now.tv_sec); | ||
154 | } | ||
155 | |||
156 | void main_timer_handler(void) | ||
157 | { | ||
158 | /* | ||
159 | * Here we are in the timer irq handler. We have irqs locally disabled (so we | ||
160 | * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running | ||
161 | * on the other CPU, so we need a lock. We also need to lock the vsyscall | ||
162 | * variables, because both do_timer() and we change them. -arca+vojtech | ||
163 | */ | ||
164 | |||
165 | write_seqlock(&xtime_lock); | ||
166 | |||
167 | /* | ||
168 | * Do the timer stuff. | ||
169 | */ | ||
170 | |||
171 | do_timer(1); | ||
172 | #ifndef CONFIG_SMP | ||
173 | update_process_times(user_mode(get_irq_regs())); | ||
174 | #endif | ||
175 | |||
176 | /* | ||
177 | * In the SMP case we use the local APIC timer interrupt to do the profiling, | ||
178 | * except when we simulate SMP mode on a uniprocessor system; in that case we | ||
179 | * have to call the local interrupt handler. | ||
180 | */ | ||
181 | |||
182 | if (!using_apic_timer) | ||
183 | smp_local_timer_interrupt(); | ||
184 | |||
185 | write_sequnlock(&xtime_lock); | ||
186 | } | ||
187 | |||
188 | static irqreturn_t timer_interrupt(int irq, void *dev_id) | ||
189 | { | ||
190 | if (apic_runs_main_timer > 1) | ||
191 | return IRQ_HANDLED; | ||
192 | main_timer_handler(); | ||
193 | if (using_apic_timer) | ||
194 | smp_send_timer_broadcast_ipi(); | ||
195 | return IRQ_HANDLED; | ||
196 | } | ||
197 | |||
198 | unsigned long read_persistent_clock(void) | ||
199 | { | ||
200 | unsigned int year, mon, day, hour, min, sec; | ||
201 | unsigned long flags; | ||
202 | unsigned century = 0; | ||
203 | |||
204 | spin_lock_irqsave(&rtc_lock, flags); | ||
205 | |||
206 | do { | ||
207 | sec = CMOS_READ(RTC_SECONDS); | ||
208 | min = CMOS_READ(RTC_MINUTES); | ||
209 | hour = CMOS_READ(RTC_HOURS); | ||
210 | day = CMOS_READ(RTC_DAY_OF_MONTH); | ||
211 | mon = CMOS_READ(RTC_MONTH); | ||
212 | year = CMOS_READ(RTC_YEAR); | ||
213 | #ifdef CONFIG_ACPI | ||
214 | if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && | ||
215 | acpi_gbl_FADT.century) | ||
216 | century = CMOS_READ(acpi_gbl_FADT.century); | ||
217 | #endif | ||
218 | } while (sec != CMOS_READ(RTC_SECONDS)); | ||
219 | |||
220 | spin_unlock_irqrestore(&rtc_lock, flags); | ||
221 | |||
222 | /* | ||
223 | * We know that x86-64 always uses BCD format, no need to check the | ||
224 | * config register. | ||
225 | */ | ||
226 | |||
227 | BCD_TO_BIN(sec); | ||
228 | BCD_TO_BIN(min); | ||
229 | BCD_TO_BIN(hour); | ||
230 | BCD_TO_BIN(day); | ||
231 | BCD_TO_BIN(mon); | ||
232 | BCD_TO_BIN(year); | ||
233 | |||
234 | if (century) { | ||
235 | BCD_TO_BIN(century); | ||
236 | year += century * 100; | ||
237 | printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); | ||
238 | } else { | ||
239 | /* | ||
240 | * x86-64 systems have only existed since 2002. | ||
241 | * This will work up to Dec 31, 2100. | ||
242 | */ | ||
243 | year += 2000; | ||
244 | } | ||
245 | |||
246 | return mktime(year, mon, day, hour, min, sec); | ||
247 | } | ||
248 | |||
249 | /* tsc_calibrate_cpu_khz() is used on systems with fixed-rate TSCs to determine | ||
250 | * the processor frequency */ | ||
251 | #define TICK_COUNT 100000000 | ||
252 | static unsigned int __init tsc_calibrate_cpu_khz(void) | ||
253 | { | ||
254 | int tsc_start, tsc_now; | ||
255 | int i, no_ctr_free; | ||
256 | unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; | ||
257 | unsigned long flags; | ||
258 | |||
259 | for (i = 0; i < 4; i++) | ||
260 | if (avail_to_resrv_perfctr_nmi_bit(i)) | ||
261 | break; | ||
262 | no_ctr_free = (i == 4); | ||
263 | if (no_ctr_free) { | ||
264 | i = 3; | ||
265 | rdmsrl(MSR_K7_EVNTSEL3, evntsel3); | ||
266 | wrmsrl(MSR_K7_EVNTSEL3, 0); | ||
267 | rdmsrl(MSR_K7_PERFCTR3, pmc3); | ||
268 | } else { | ||
269 | reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); | ||
270 | reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); | ||
271 | } | ||
272 | local_irq_save(flags); | ||
273 | /* start measuring cycles, incrementing from 0 */ | ||
274 | wrmsrl(MSR_K7_PERFCTR0 + i, 0); | ||
275 | wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); | ||
276 | rdtscl(tsc_start); | ||
277 | do { | ||
278 | rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); | ||
279 | tsc_now = get_cycles_sync(); | ||
280 | } while ((tsc_now - tsc_start) < TICK_COUNT); | ||
281 | |||
282 | local_irq_restore(flags); | ||
283 | if (no_ctr_free) { | ||
284 | wrmsrl(MSR_K7_EVNTSEL3, 0); | ||
285 | wrmsrl(MSR_K7_PERFCTR3, pmc3); | ||
286 | wrmsrl(MSR_K7_EVNTSEL3, evntsel3); | ||
287 | } else { | ||
288 | release_perfctr_nmi(MSR_K7_PERFCTR0 + i); | ||
289 | release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); | ||
290 | } | ||
291 | |||
292 | return pmc_now * tsc_khz / (tsc_now - tsc_start); | ||
293 | } | ||
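/*
 * Illustrative sketch, not part of this patch: the final scaling step
 * of tsc_calibrate_cpu_khz() above.  The perfctr counts unhalted core
 * cycles while the TSC runs at its fixed rate, so
 * core_khz = core_cycles * tsc_khz / tsc_cycles.  All names here are
 * hypothetical sample values.
 */
static unsigned long example_core_khz(unsigned long core_cycles,
				      unsigned long tsc_cycles,
				      unsigned long tsc_rate_khz)
{
	return core_cycles * tsc_rate_khz / tsc_cycles;
}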
294 | |||
295 | /* | ||
296 | * pit_calibrate_tsc() uses the speaker output (channel 2) of | ||
297 | * the PIT. This is better than using the timer interrupt output, | ||
298 | * because we can read the value of the speaker with just one inb(), | ||
299 | * where we need three i/o operations for the interrupt channel. | ||
300 | * We count how many ticks the TSC does in 50 ms. | ||
301 | */ | ||
302 | |||
303 | static unsigned int __init pit_calibrate_tsc(void) | ||
304 | { | ||
305 | unsigned long start, end; | ||
306 | unsigned long flags; | ||
307 | |||
308 | spin_lock_irqsave(&i8253_lock, flags); | ||
309 | |||
310 | outb((inb(0x61) & ~0x02) | 0x01, 0x61); | ||
311 | |||
312 | outb(0xb0, 0x43); | ||
313 | outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42); | ||
314 | outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42); | ||
315 | start = get_cycles_sync(); | ||
316 | while ((inb(0x61) & 0x20) == 0); | ||
317 | end = get_cycles_sync(); | ||
318 | |||
319 | spin_unlock_irqrestore(&i8253_lock, flags); | ||
320 | |||
321 | return (end - start) / 50; | ||
322 | } | ||
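/*
 * Illustrative sketch, not part of this patch: why pit_calibrate_tsc()
 * divides the measured cycle count by 50.  Cycles counted over 50 ms,
 * divided by 50, give cycles per millisecond, which is the TSC rate in
 * kHz.  The helper name is hypothetical.
 */
static inline unsigned long example_khz_from_50ms_sample(unsigned long cycles)
{
	return cycles / 50;	/* cycles per ms == kHz */
}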
323 | |||
324 | #define PIT_MODE 0x43 | ||
325 | #define PIT_CH0 0x40 | ||
326 | |||
327 | static void __pit_init(int val, u8 mode) | ||
328 | { | ||
329 | unsigned long flags; | ||
330 | |||
331 | spin_lock_irqsave(&i8253_lock, flags); | ||
332 | outb_p(mode, PIT_MODE); | ||
333 | outb_p(val & 0xff, PIT_CH0); /* LSB */ | ||
334 | outb_p(val >> 8, PIT_CH0); /* MSB */ | ||
335 | spin_unlock_irqrestore(&i8253_lock, flags); | ||
336 | } | ||
337 | |||
338 | void __init pit_init(void) | ||
339 | { | ||
340 | __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */ | ||
341 | } | ||
342 | |||
343 | void pit_stop_interrupt(void) | ||
344 | { | ||
345 | __pit_init(0, 0x30); /* mode 0 */ | ||
346 | } | ||
347 | |||
348 | void stop_timer_interrupt(void) | ||
349 | { | ||
350 | char *name; | ||
351 | if (hpet_address) { | ||
352 | name = "HPET"; | ||
353 | hpet_timer_stop_set_go(0); | ||
354 | } else { | ||
355 | name = "PIT"; | ||
356 | pit_stop_interrupt(); | ||
357 | } | ||
358 | printk(KERN_INFO "timer: %s interrupt stopped.\n", name); | ||
359 | } | ||
360 | |||
361 | static struct irqaction irq0 = { | ||
362 | .handler = timer_interrupt, | ||
363 | .flags = IRQF_DISABLED | IRQF_IRQPOLL, | ||
364 | .mask = CPU_MASK_NONE, | ||
365 | .name = "timer" | ||
366 | }; | ||
367 | |||
368 | void __init time_init(void) | ||
369 | { | ||
370 | if (nohpet) | ||
371 | hpet_address = 0; | ||
372 | |||
373 | if (hpet_arch_init()) | ||
374 | hpet_address = 0; | ||
375 | |||
376 | if (hpet_use_timer) { | ||
377 | /* set tick_nsec to use the proper rate for HPET */ | ||
378 | tick_nsec = TICK_NSEC_HPET; | ||
379 | tsc_khz = hpet_calibrate_tsc(); | ||
380 | timename = "HPET"; | ||
381 | } else { | ||
382 | pit_init(); | ||
383 | tsc_khz = pit_calibrate_tsc(); | ||
384 | timename = "PIT"; | ||
385 | } | ||
386 | |||
387 | cpu_khz = tsc_khz; | ||
388 | if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && | ||
389 | boot_cpu_data.x86_vendor == X86_VENDOR_AMD && | ||
390 | boot_cpu_data.x86 == 16) | ||
391 | cpu_khz = tsc_calibrate_cpu_khz(); | ||
392 | |||
393 | if (unsynchronized_tsc()) | ||
394 | mark_tsc_unstable("TSCs unsynchronized"); | ||
395 | |||
396 | if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) | ||
397 | vgetcpu_mode = VGETCPU_RDTSCP; | ||
398 | else | ||
399 | vgetcpu_mode = VGETCPU_LSL; | ||
400 | |||
401 | set_cyc2ns_scale(tsc_khz); | ||
402 | printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", | ||
403 | cpu_khz / 1000, cpu_khz % 1000); | ||
404 | init_tsc_clocksource(); | ||
405 | |||
406 | setup_irq(0, &irq0); | ||
407 | } | ||
408 | |||
409 | /* | ||
410 | * sysfs support for the timer. | ||
411 | */ | ||
412 | |||
413 | static int timer_suspend(struct sys_device *dev, pm_message_t state) | ||
414 | { | ||
415 | return 0; | ||
416 | } | ||
417 | |||
418 | static int timer_resume(struct sys_device *dev) | ||
419 | { | ||
420 | if (hpet_address) | ||
421 | hpet_reenable(); | ||
422 | else | ||
423 | i8254_timer_resume(); | ||
424 | return 0; | ||
425 | } | ||
426 | |||
427 | static struct sysdev_class timer_sysclass = { | ||
428 | .resume = timer_resume, | ||
429 | .suspend = timer_suspend, | ||
430 | set_kset_name("timer"), | ||
431 | }; | ||
432 | |||
433 | /* XXX this sysfs stuff should probably go elsewhere later -john */ | ||
434 | static struct sys_device device_timer = { | ||
435 | .id = 0, | ||
436 | .cls = &timer_sysclass, | ||
437 | }; | ||
438 | |||
439 | static int time_init_device(void) | ||
440 | { | ||
441 | int error = sysdev_class_register(&timer_sysclass); | ||
442 | if (!error) | ||
443 | error = sysdev_register(&device_timer); | ||
444 | return error; | ||
445 | } | ||
446 | |||
447 | device_initcall(time_init_device); | ||
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S new file mode 100644 index 000000000000..607983b0d27b --- /dev/null +++ b/arch/x86/kernel/trampoline_64.S | |||
@@ -0,0 +1,166 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Trampoline.S Derived from Setup.S by Linus Torvalds | ||
4 | * | ||
5 | * 4 Jan 1997 Michael Chastain: changed to gnu as. | ||
6 | * 15 Sept 2005 Eric Biederman: 64bit PIC support | ||
7 | * | ||
8 | * Entry: CS:IP point to the start of our code, we are | ||
9 | * in real mode with no stack, but the rest of the | ||
10 | * trampoline page to make our stack and everything else | ||
11 | * is a mystery. | ||
12 | * | ||
13 | * In fact we don't actually need a stack so we don't | ||
14 | * set one up. | ||
15 | * | ||
16 | * On entry to trampoline_data, the processor is in real mode | ||
17 | * with 16-bit addressing and 16-bit data. CS has some value | ||
18 | * and IP is zero. Thus, data addresses need to be absolute | ||
19 | * (no relocation) and are taken with regard to r_base. | ||
20 | * | ||
21 | * With the addition of trampoline_level4_pgt this code can | ||
22 | * now enter a 64bit kernel that lives at arbitrary 64bit | ||
23 | * physical addresses. | ||
24 | * | ||
25 | * If you work on this file, check the object module with objdump | ||
26 | * --full-contents --reloc to make sure there are no relocation | ||
27 | * entries. | ||
28 | */ | ||
29 | |||
30 | #include <linux/linkage.h> | ||
31 | #include <asm/pgtable.h> | ||
32 | #include <asm/page.h> | ||
33 | #include <asm/msr.h> | ||
34 | #include <asm/segment.h> | ||
35 | |||
36 | .data | ||
37 | |||
38 | .code16 | ||
39 | |||
40 | ENTRY(trampoline_data) | ||
41 | r_base = . | ||
42 | cli # We should be safe anyway | ||
43 | wbinvd | ||
44 | mov %cs, %ax # Code and data in the same place | ||
45 | mov %ax, %ds | ||
46 | mov %ax, %es | ||
47 | mov %ax, %ss | ||
48 | |||
49 | |||
50 | movl $0xA5A5A5A5, trampoline_data - r_base | ||
51 | # write marker so the master knows we're running | ||
52 | |||
53 | # Setup stack | ||
54 | movw $(trampoline_stack_end - r_base), %sp | ||
55 | |||
56 | call verify_cpu # Verify the cpu supports long mode | ||
57 | testl %eax, %eax # Check for return code | ||
58 | jnz no_longmode | ||
59 | |||
60 | mov %cs, %ax | ||
61 | movzx %ax, %esi # Find the 32bit trampoline location | ||
62 | shll $4, %esi | ||
63 | |||
64 | # Fixup the vectors | ||
65 | addl %esi, startup_32_vector - r_base | ||
66 | addl %esi, startup_64_vector - r_base | ||
67 | addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer | ||
68 | |||
69 | /* | ||
70 | * When the kernel is loaded at a non-default location the GDT can | ||
71 | * sit beyond 16MB, and lgdt cannot load such an address because the | ||
72 | * default operand size in real mode is 16 bit. Use lgdtl instead to | ||
73 | * force a 32-bit operand size. | ||
74 | */ | ||
75 | |||
76 | lidtl tidt - r_base # load idt with 0, 0 | ||
77 | lgdtl tgdt - r_base # load gdt with whatever is appropriate | ||
78 | |||
79 | xor %ax, %ax | ||
80 | inc %ax # protected mode (PE) bit | ||
81 | lmsw %ax # into protected mode | ||
82 | |||
83 | # flush prefetch and jump to startup_32 | ||
84 | ljmpl *(startup_32_vector - r_base) | ||
85 | |||
86 | .code32 | ||
87 | .balign 4 | ||
88 | startup_32: | ||
89 | movl $__KERNEL_DS, %eax # Initialize the %ds segment register | ||
90 | movl %eax, %ds | ||
91 | |||
92 | xorl %eax, %eax | ||
93 | btsl $5, %eax # Enable PAE mode | ||
94 | movl %eax, %cr4 | ||
95 | |||
96 | # Setup trampoline 4 level pagetables | ||
97 | leal (trampoline_level4_pgt - r_base)(%esi), %eax | ||
98 | movl %eax, %cr3 | ||
99 | |||
100 | movl $MSR_EFER, %ecx | ||
101 | movl $(1 << _EFER_LME), %eax # Enable Long Mode | ||
102 | xorl %edx, %edx | ||
103 | wrmsr | ||
104 | |||
105 | xorl %eax, %eax | ||
106 | btsl $31, %eax # Enable paging and in turn activate Long Mode | ||
107 | btsl $0, %eax # Enable protected mode | ||
108 | movl %eax, %cr0 | ||
109 | |||
110 | /* | ||
111 | * At this point we're in long mode but in 32bit compatibility mode | ||
112 | * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn | ||
113 | * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use | ||
114 | * the new gdt/idt that has __KERNEL_CS with CS.L = 1. | ||
115 | */ | ||
116 | ljmp *(startup_64_vector - r_base)(%esi) | ||
117 | |||
118 | .code64 | ||
119 | .balign 4 | ||
120 | startup_64: | ||
121 | # Now jump into the kernel using virtual addresses | ||
122 | movq $secondary_startup_64, %rax | ||
123 | jmp *%rax | ||
124 | |||
125 | .code16 | ||
126 | no_longmode: | ||
127 | hlt | ||
128 | jmp no_longmode | ||
129 | #include "verify_cpu_64.S" | ||
130 | |||
131 | # Careful: these need to be in the same 64K segment as the above | ||
132 | tidt: | ||
133 | .word 0 # idt limit = 0 | ||
134 | .word 0, 0 # idt base = 0L | ||
135 | |||
136 | # Duplicate the global descriptor table | ||
137 | # so the kernel can live anywhere | ||
138 | .balign 4 | ||
139 | tgdt: | ||
140 | .short tgdt_end - tgdt # gdt limit | ||
141 | .long tgdt - r_base | ||
142 | .short 0 | ||
143 | .quad 0x00cf9b000000ffff # __KERNEL32_CS | ||
144 | .quad 0x00af9b000000ffff # __KERNEL_CS | ||
145 | .quad 0x00cf93000000ffff # __KERNEL_DS | ||
146 | tgdt_end: | ||
147 | |||
148 | .balign 4 | ||
149 | startup_32_vector: | ||
150 | .long startup_32 - r_base | ||
151 | .word __KERNEL32_CS, 0 | ||
152 | |||
153 | .balign 4 | ||
154 | startup_64_vector: | ||
155 | .long startup_64 - r_base | ||
156 | .word __KERNEL_CS, 0 | ||
157 | |||
158 | trampoline_stack: | ||
159 | .org 0x1000 | ||
160 | trampoline_stack_end: | ||
161 | ENTRY(trampoline_level4_pgt) | ||
162 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE | ||
163 | .fill 510,8,0 | ||
164 | .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE | ||
165 | |||
166 | ENTRY(trampoline_end) | ||
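The three tgdt quads above pack base, limit, access rights and the G/D/L flag bits into 8 bytes each; only the L bit distinguishes __KERNEL_CS (64-bit code) from __KERNEL32_CS. A small user-space decoder, offered here purely as an illustration of the descriptor layout and not part of the patch, pulls those fields apart:

#include <stdio.h>
#include <stdint.h>

/* Pull base, limit, access byte and flag nibble out of one 8-byte descriptor. */
static void decode_descriptor(uint64_t d)
{
	uint32_t limit  = (uint32_t)(d & 0xffff) | (uint32_t)((d >> 32) & 0xf0000);
	uint32_t base   = (uint32_t)((d >> 16) & 0xffffff) |
			  (uint32_t)(((d >> 56) & 0xff) << 24);
	unsigned access = (unsigned)((d >> 40) & 0xff);
	unsigned flags  = (unsigned)((d >> 52) & 0xf);

	printf("%016llx: base=%08x limit=%05x access=%02x G=%u D=%u L=%u\n",
	       (unsigned long long)d, base, limit, access,
	       (flags >> 3) & 1, (flags >> 2) & 1, (flags >> 1) & 1);
}

int main(void)
{
	decode_descriptor(0x00cf9b000000ffffULL);	/* __KERNEL32_CS: 32-bit code    */
	decode_descriptor(0x00af9b000000ffffULL);	/* __KERNEL_CS: 64-bit code, L=1 */
	decode_descriptor(0x00cf93000000ffffULL);	/* __KERNEL_DS: writable data    */
	return 0;
}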
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c new file mode 100644 index 000000000000..03888420775d --- /dev/null +++ b/arch/x86/kernel/traps_64.c | |||
@@ -0,0 +1,1138 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/traps.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | ||
6 | * | ||
7 | * Pentium III FXSR, SSE support | ||
8 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
9 | */ | ||
10 | |||
11 | /* | ||
12 | * 'Traps.c' handles hardware traps and faults after we have saved some | ||
13 | * state in 'entry.S'. | ||
14 | */ | ||
15 | #include <linux/sched.h> | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/string.h> | ||
18 | #include <linux/errno.h> | ||
19 | #include <linux/ptrace.h> | ||
20 | #include <linux/timer.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/init.h> | ||
23 | #include <linux/delay.h> | ||
24 | #include <linux/spinlock.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/kallsyms.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <linux/moduleparam.h> | ||
29 | #include <linux/nmi.h> | ||
30 | #include <linux/kprobes.h> | ||
31 | #include <linux/kexec.h> | ||
32 | #include <linux/unwind.h> | ||
33 | #include <linux/uaccess.h> | ||
34 | #include <linux/bug.h> | ||
35 | #include <linux/kdebug.h> | ||
36 | |||
37 | #if defined(CONFIG_EDAC) | ||
38 | #include <linux/edac.h> | ||
39 | #endif | ||
40 | |||
41 | #include <asm/system.h> | ||
42 | #include <asm/io.h> | ||
43 | #include <asm/atomic.h> | ||
44 | #include <asm/debugreg.h> | ||
45 | #include <asm/desc.h> | ||
46 | #include <asm/i387.h> | ||
47 | #include <asm/processor.h> | ||
48 | #include <asm/unwind.h> | ||
49 | #include <asm/smp.h> | ||
50 | #include <asm/pgalloc.h> | ||
51 | #include <asm/pda.h> | ||
52 | #include <asm/proto.h> | ||
53 | #include <asm/nmi.h> | ||
54 | #include <asm/stacktrace.h> | ||
55 | |||
56 | asmlinkage void divide_error(void); | ||
57 | asmlinkage void debug(void); | ||
58 | asmlinkage void nmi(void); | ||
59 | asmlinkage void int3(void); | ||
60 | asmlinkage void overflow(void); | ||
61 | asmlinkage void bounds(void); | ||
62 | asmlinkage void invalid_op(void); | ||
63 | asmlinkage void device_not_available(void); | ||
64 | asmlinkage void double_fault(void); | ||
65 | asmlinkage void coprocessor_segment_overrun(void); | ||
66 | asmlinkage void invalid_TSS(void); | ||
67 | asmlinkage void segment_not_present(void); | ||
68 | asmlinkage void stack_segment(void); | ||
69 | asmlinkage void general_protection(void); | ||
70 | asmlinkage void page_fault(void); | ||
71 | asmlinkage void coprocessor_error(void); | ||
72 | asmlinkage void simd_coprocessor_error(void); | ||
73 | asmlinkage void reserved(void); | ||
74 | asmlinkage void alignment_check(void); | ||
75 | asmlinkage void machine_check(void); | ||
76 | asmlinkage void spurious_interrupt_bug(void); | ||
77 | |||
78 | static inline void conditional_sti(struct pt_regs *regs) | ||
79 | { | ||
80 | if (regs->eflags & X86_EFLAGS_IF) | ||
81 | local_irq_enable(); | ||
82 | } | ||
83 | |||
84 | static inline void preempt_conditional_sti(struct pt_regs *regs) | ||
85 | { | ||
86 | preempt_disable(); | ||
87 | if (regs->eflags & X86_EFLAGS_IF) | ||
88 | local_irq_enable(); | ||
89 | } | ||
90 | |||
91 | static inline void preempt_conditional_cli(struct pt_regs *regs) | ||
92 | { | ||
93 | if (regs->eflags & X86_EFLAGS_IF) | ||
94 | local_irq_disable(); | ||
95 | /* Make sure to not schedule here because we could be running | ||
96 | on an exception stack. */ | ||
97 | preempt_enable_no_resched(); | ||
98 | } | ||
99 | |||
100 | int kstack_depth_to_print = 12; | ||
101 | |||
102 | #ifdef CONFIG_KALLSYMS | ||
103 | void printk_address(unsigned long address) | ||
104 | { | ||
105 | unsigned long offset = 0, symsize; | ||
106 | const char *symname; | ||
107 | char *modname; | ||
108 | char *delim = ":"; | ||
109 | char namebuf[128]; | ||
110 | |||
111 | symname = kallsyms_lookup(address, &symsize, &offset, | ||
112 | &modname, namebuf); | ||
113 | if (!symname) { | ||
114 | printk(" [<%016lx>]\n", address); | ||
115 | return; | ||
116 | } | ||
117 | if (!modname) | ||
118 | modname = delim = ""; | ||
119 | printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", | ||
120 | address, delim, modname, delim, symname, offset, symsize); | ||
121 | } | ||
122 | #else | ||
123 | void printk_address(unsigned long address) | ||
124 | { | ||
125 | printk(" [<%016lx>]\n", address); | ||
126 | } | ||
127 | #endif | ||
128 | |||
129 | static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, | ||
130 | unsigned *usedp, char **idp) | ||
131 | { | ||
132 | static char ids[][8] = { | ||
133 | [DEBUG_STACK - 1] = "#DB", | ||
134 | [NMI_STACK - 1] = "NMI", | ||
135 | [DOUBLEFAULT_STACK - 1] = "#DF", | ||
136 | [STACKFAULT_STACK - 1] = "#SS", | ||
137 | [MCE_STACK - 1] = "#MC", | ||
138 | #if DEBUG_STKSZ > EXCEPTION_STKSZ | ||
139 | [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" | ||
140 | #endif | ||
141 | }; | ||
142 | unsigned k; | ||
143 | |||
144 | /* | ||
145 | * Iterate over all exception stacks, and figure out whether | ||
146 | * 'stack' is in one of them: | ||
147 | */ | ||
148 | for (k = 0; k < N_EXCEPTION_STACKS; k++) { | ||
149 | unsigned long end = per_cpu(orig_ist, cpu).ist[k]; | ||
150 | /* | ||
151 | * Is 'stack' above this exception frame's end? | ||
152 | * If yes then skip to the next frame. | ||
153 | */ | ||
154 | if (stack >= end) | ||
155 | continue; | ||
156 | /* | ||
157 | * Is 'stack' above this exception frame's start address? | ||
158 | * If yes then we found the right frame. | ||
159 | */ | ||
160 | if (stack >= end - EXCEPTION_STKSZ) { | ||
161 | /* | ||
162 | * Make sure we only iterate through an exception | ||
163 | * stack once. If it comes up for the second time | ||
164 | * then there's something wrong going on - just | ||
165 | * break out and return NULL: | ||
166 | */ | ||
167 | if (*usedp & (1U << k)) | ||
168 | break; | ||
169 | *usedp |= 1U << k; | ||
170 | *idp = ids[k]; | ||
171 | return (unsigned long *)end; | ||
172 | } | ||
173 | /* | ||
174 | * If this is a debug stack, and if it has a larger size than | ||
175 | * the usual exception stacks, then 'stack' might still | ||
176 | * be within the lower portion of the debug stack: | ||
177 | */ | ||
178 | #if DEBUG_STKSZ > EXCEPTION_STKSZ | ||
179 | if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { | ||
180 | unsigned j = N_EXCEPTION_STACKS - 1; | ||
181 | |||
182 | /* | ||
183 | * Black magic. A large debug stack is composed of | ||
184 | * multiple exception stack entries, which we | ||
185 | * iterate through now. Don't look: | ||
186 | */ | ||
187 | do { | ||
188 | ++j; | ||
189 | end -= EXCEPTION_STKSZ; | ||
190 | ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); | ||
191 | } while (stack < end - EXCEPTION_STKSZ); | ||
192 | if (*usedp & (1U << j)) | ||
193 | break; | ||
194 | *usedp |= 1U << j; | ||
195 | *idp = ids[j]; | ||
196 | return (unsigned long *)end; | ||
197 | } | ||
198 | #endif | ||
199 | } | ||
200 | return NULL; | ||
201 | } | ||
202 | |||
203 | #define MSG(txt) ops->warning(data, txt) | ||
204 | |||
205 | /* | ||
206 | * x86-64 can have up to three kernel stacks: | ||
207 | * process stack | ||
208 | * interrupt stack | ||
209 | * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack | ||
210 | */ | ||
211 | |||
212 | static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) | ||
213 | { | ||
214 | void *t = (void *)tinfo; | ||
215 | return p > t && p < t + THREAD_SIZE - 3; | ||
216 | } | ||
217 | |||
218 | void dump_trace(struct task_struct *tsk, struct pt_regs *regs, | ||
219 | unsigned long *stack, | ||
220 | struct stacktrace_ops *ops, void *data) | ||
221 | { | ||
222 | const unsigned cpu = get_cpu(); | ||
223 | unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; | ||
224 | unsigned used = 0; | ||
225 | struct thread_info *tinfo; | ||
226 | |||
227 | if (!tsk) | ||
228 | tsk = current; | ||
229 | |||
230 | if (!stack) { | ||
231 | unsigned long dummy; | ||
232 | stack = &dummy; | ||
233 | if (tsk && tsk != current) | ||
234 | stack = (unsigned long *)tsk->thread.rsp; | ||
235 | } | ||
236 | |||
237 | /* | ||
238 | * Print function call entries within a stack. 'cond' is the | ||
239 | * "end of stackframe" condition, that the 'stack++' | ||
240 | * iteration will eventually trigger. | ||
241 | */ | ||
242 | #define HANDLE_STACK(cond) \ | ||
243 | do while (cond) { \ | ||
244 | unsigned long addr = *stack++; \ | ||
245 | /* Use unlocked access here because except for NMIs \ | ||
246 | we should be already protected against module unloads */ \ | ||
247 | if (__kernel_text_address(addr)) { \ | ||
248 | /* \ | ||
249 | * If the address is either in the text segment of the \ | ||
250 | * kernel, or in the region which contains vmalloc'ed \ | ||
251 | * memory, it *may* be the address of a calling \ | ||
252 | * routine; if so, print it so that someone tracing \ | ||
253 | * down the cause of the crash will be able to figure \ | ||
254 | * out the call path that was taken. \ | ||
255 | */ \ | ||
256 | ops->address(data, addr); \ | ||
257 | } \ | ||
258 | } while (0) | ||
259 | |||
260 | /* | ||
261 | * Print function call entries in all stacks, starting at the | ||
262 | * current stack address. If the stacks consist of nested | ||
263 | * exceptions | ||
264 | */ | ||
265 | for (;;) { | ||
266 | char *id; | ||
267 | unsigned long *estack_end; | ||
268 | estack_end = in_exception_stack(cpu, (unsigned long)stack, | ||
269 | &used, &id); | ||
270 | |||
271 | if (estack_end) { | ||
272 | if (ops->stack(data, id) < 0) | ||
273 | break; | ||
274 | HANDLE_STACK (stack < estack_end); | ||
275 | ops->stack(data, "<EOE>"); | ||
276 | /* | ||
277 | * We link to the next stack via the | ||
278 | * second-to-last pointer (index -2 to end) in the | ||
279 | * exception stack: | ||
280 | */ | ||
281 | stack = (unsigned long *) estack_end[-2]; | ||
282 | continue; | ||
283 | } | ||
284 | if (irqstack_end) { | ||
285 | unsigned long *irqstack; | ||
286 | irqstack = irqstack_end - | ||
287 | (IRQSTACKSIZE - 64) / sizeof(*irqstack); | ||
288 | |||
289 | if (stack >= irqstack && stack < irqstack_end) { | ||
290 | if (ops->stack(data, "IRQ") < 0) | ||
291 | break; | ||
292 | HANDLE_STACK (stack < irqstack_end); | ||
293 | /* | ||
294 | * We link to the next stack (which would be | ||
295 | * the process stack normally) the last | ||
296 | * pointer (index -1 to end) in the IRQ stack: | ||
297 | */ | ||
298 | stack = (unsigned long *) (irqstack_end[-1]); | ||
299 | irqstack_end = NULL; | ||
300 | ops->stack(data, "EOI"); | ||
301 | continue; | ||
302 | } | ||
303 | } | ||
304 | break; | ||
305 | } | ||
306 | |||
307 | /* | ||
308 | * This handles the process stack: | ||
309 | */ | ||
310 | tinfo = task_thread_info(tsk); | ||
311 | HANDLE_STACK (valid_stack_ptr(tinfo, stack)); | ||
312 | #undef HANDLE_STACK | ||
313 | put_cpu(); | ||
314 | } | ||
315 | EXPORT_SYMBOL(dump_trace); | ||
316 | |||
317 | static void | ||
318 | print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) | ||
319 | { | ||
320 | print_symbol(msg, symbol); | ||
321 | printk("\n"); | ||
322 | } | ||
323 | |||
324 | static void print_trace_warning(void *data, char *msg) | ||
325 | { | ||
326 | printk("%s\n", msg); | ||
327 | } | ||
328 | |||
329 | static int print_trace_stack(void *data, char *name) | ||
330 | { | ||
331 | printk(" <%s> ", name); | ||
332 | return 0; | ||
333 | } | ||
334 | |||
335 | static void print_trace_address(void *data, unsigned long addr) | ||
336 | { | ||
337 | touch_nmi_watchdog(); | ||
338 | printk_address(addr); | ||
339 | } | ||
340 | |||
341 | static struct stacktrace_ops print_trace_ops = { | ||
342 | .warning = print_trace_warning, | ||
343 | .warning_symbol = print_trace_warning_symbol, | ||
344 | .stack = print_trace_stack, | ||
345 | .address = print_trace_address, | ||
346 | }; | ||
347 | |||
348 | void | ||
349 | show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) | ||
350 | { | ||
351 | printk("\nCall Trace:\n"); | ||
352 | dump_trace(tsk, regs, stack, &print_trace_ops, NULL); | ||
353 | printk("\n"); | ||
354 | } | ||
355 | |||
356 | static void | ||
357 | _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) | ||
358 | { | ||
359 | unsigned long *stack; | ||
360 | int i; | ||
361 | const int cpu = smp_processor_id(); | ||
362 | unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); | ||
363 | unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); | ||
364 | |||
365 | // debugging aid: "show_stack(NULL, NULL);" prints the | ||
366 | // back trace for this cpu. | ||
367 | |||
368 | if (rsp == NULL) { | ||
369 | if (tsk) | ||
370 | rsp = (unsigned long *)tsk->thread.rsp; | ||
371 | else | ||
372 | rsp = (unsigned long *)&rsp; | ||
373 | } | ||
374 | |||
375 | stack = rsp; | ||
376 | for(i=0; i < kstack_depth_to_print; i++) { | ||
377 | if (stack >= irqstack && stack <= irqstack_end) { | ||
378 | if (stack == irqstack_end) { | ||
379 | stack = (unsigned long *) (irqstack_end[-1]); | ||
380 | printk(" <EOI> "); | ||
381 | } | ||
382 | } else { | ||
383 | if (((long) stack & (THREAD_SIZE-1)) == 0) | ||
384 | break; | ||
385 | } | ||
386 | if (i && ((i % 4) == 0)) | ||
387 | printk("\n"); | ||
388 | printk(" %016lx", *stack++); | ||
389 | touch_nmi_watchdog(); | ||
390 | } | ||
391 | show_trace(tsk, regs, rsp); | ||
392 | } | ||
393 | |||
394 | void show_stack(struct task_struct *tsk, unsigned long * rsp) | ||
395 | { | ||
396 | _show_stack(tsk, NULL, rsp); | ||
397 | } | ||
398 | |||
399 | /* | ||
400 | * The architecture-independent dump_stack generator | ||
401 | */ | ||
402 | void dump_stack(void) | ||
403 | { | ||
404 | unsigned long dummy; | ||
405 | show_trace(NULL, NULL, &dummy); | ||
406 | } | ||
407 | |||
408 | EXPORT_SYMBOL(dump_stack); | ||
409 | |||
410 | void show_registers(struct pt_regs *regs) | ||
411 | { | ||
412 | int i; | ||
413 | int in_kernel = !user_mode(regs); | ||
414 | unsigned long rsp; | ||
415 | const int cpu = smp_processor_id(); | ||
416 | struct task_struct *cur = cpu_pda(cpu)->pcurrent; | ||
417 | |||
418 | rsp = regs->rsp; | ||
419 | printk("CPU %d ", cpu); | ||
420 | __show_regs(regs); | ||
421 | printk("Process %s (pid: %d, threadinfo %p, task %p)\n", | ||
422 | cur->comm, cur->pid, task_thread_info(cur), cur); | ||
423 | |||
424 | /* | ||
425 | * When in-kernel, we also print out the stack and code at the | ||
426 | * time of the fault.. | ||
427 | */ | ||
428 | if (in_kernel) { | ||
429 | printk("Stack: "); | ||
430 | _show_stack(NULL, regs, (unsigned long*)rsp); | ||
431 | |||
432 | printk("\nCode: "); | ||
433 | if (regs->rip < PAGE_OFFSET) | ||
434 | goto bad; | ||
435 | |||
436 | for (i=0; i<20; i++) { | ||
437 | unsigned char c; | ||
438 | if (__get_user(c, &((unsigned char*)regs->rip)[i])) { | ||
439 | bad: | ||
440 | printk(" Bad RIP value."); | ||
441 | break; | ||
442 | } | ||
443 | printk("%02x ", c); | ||
444 | } | ||
445 | } | ||
446 | printk("\n"); | ||
447 | } | ||
448 | |||
449 | int is_valid_bugaddr(unsigned long rip) | ||
450 | { | ||
451 | unsigned short ud2; | ||
452 | |||
453 | if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2))) | ||
454 | return 0; | ||
455 | |||
456 | return ud2 == 0x0b0f; | ||
457 | } | ||
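The magic constant works because UD2 is encoded as the byte pair 0x0f 0x0b, which a little-endian 16-bit read from the faulting RIP returns as 0x0b0f. A minimal user-space sketch of that check, not part of the patch:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	unsigned char insn[2] = { 0x0f, 0x0b };	/* the UD2 instruction bytes */
	uint16_t ud2;

	memcpy(&ud2, insn, sizeof(ud2));	/* little-endian load, as on x86 */
	printf("read %#06x -> %s\n", ud2, ud2 == 0x0b0f ? "BUG()" : "not a BUG()");
	return 0;
}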
458 | |||
459 | #ifdef CONFIG_BUG | ||
460 | void out_of_line_bug(void) | ||
461 | { | ||
462 | BUG(); | ||
463 | } | ||
464 | EXPORT_SYMBOL(out_of_line_bug); | ||
465 | #endif | ||
466 | |||
467 | static DEFINE_SPINLOCK(die_lock); | ||
468 | static int die_owner = -1; | ||
469 | static unsigned int die_nest_count; | ||
470 | |||
471 | unsigned __kprobes long oops_begin(void) | ||
472 | { | ||
473 | int cpu; | ||
474 | unsigned long flags; | ||
475 | |||
476 | oops_enter(); | ||
477 | |||
478 | /* racy, but better than risking deadlock. */ | ||
479 | local_irq_save(flags); | ||
480 | cpu = smp_processor_id(); | ||
481 | if (!spin_trylock(&die_lock)) { | ||
482 | if (cpu == die_owner) | ||
483 | /* nested oops. should stop eventually */; | ||
484 | else | ||
485 | spin_lock(&die_lock); | ||
486 | } | ||
487 | die_nest_count++; | ||
488 | die_owner = cpu; | ||
489 | console_verbose(); | ||
490 | bust_spinlocks(1); | ||
491 | return flags; | ||
492 | } | ||
493 | |||
494 | void __kprobes oops_end(unsigned long flags) | ||
495 | { | ||
496 | die_owner = -1; | ||
497 | bust_spinlocks(0); | ||
498 | die_nest_count--; | ||
499 | if (die_nest_count) | ||
500 | /* We still own the lock */ | ||
501 | local_irq_restore(flags); | ||
502 | else | ||
503 | /* Nest count reaches zero, release the lock. */ | ||
504 | spin_unlock_irqrestore(&die_lock, flags); | ||
505 | if (panic_on_oops) | ||
506 | panic("Fatal exception"); | ||
507 | oops_exit(); | ||
508 | } | ||
509 | |||
510 | void __kprobes __die(const char * str, struct pt_regs * regs, long err) | ||
511 | { | ||
512 | static int die_counter; | ||
513 | printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); | ||
514 | #ifdef CONFIG_PREEMPT | ||
515 | printk("PREEMPT "); | ||
516 | #endif | ||
517 | #ifdef CONFIG_SMP | ||
518 | printk("SMP "); | ||
519 | #endif | ||
520 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
521 | printk("DEBUG_PAGEALLOC"); | ||
522 | #endif | ||
523 | printk("\n"); | ||
524 | notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV); | ||
525 | show_registers(regs); | ||
526 | add_taint(TAINT_DIE); | ||
527 | /* Executive summary in case the oops scrolled away */ | ||
528 | printk(KERN_ALERT "RIP "); | ||
529 | printk_address(regs->rip); | ||
530 | printk(" RSP <%016lx>\n", regs->rsp); | ||
531 | if (kexec_should_crash(current)) | ||
532 | crash_kexec(regs); | ||
533 | } | ||
534 | |||
535 | void die(const char * str, struct pt_regs * regs, long err) | ||
536 | { | ||
537 | unsigned long flags = oops_begin(); | ||
538 | |||
539 | if (!user_mode(regs)) | ||
540 | report_bug(regs->rip, regs); | ||
541 | |||
542 | __die(str, regs, err); | ||
543 | oops_end(flags); | ||
544 | do_exit(SIGSEGV); | ||
545 | } | ||
546 | |||
547 | void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) | ||
548 | { | ||
549 | unsigned long flags = oops_begin(); | ||
550 | |||
551 | /* | ||
552 | * We are in trouble anyway, lets at least try | ||
553 | * to get a message out. | ||
554 | */ | ||
555 | printk(str, smp_processor_id()); | ||
556 | show_registers(regs); | ||
557 | if (kexec_should_crash(current)) | ||
558 | crash_kexec(regs); | ||
559 | if (do_panic || panic_on_oops) | ||
560 | panic("Non maskable interrupt"); | ||
561 | oops_end(flags); | ||
562 | nmi_exit(); | ||
563 | local_irq_enable(); | ||
564 | do_exit(SIGSEGV); | ||
565 | } | ||
566 | |||
567 | static void __kprobes do_trap(int trapnr, int signr, char *str, | ||
568 | struct pt_regs * regs, long error_code, | ||
569 | siginfo_t *info) | ||
570 | { | ||
571 | struct task_struct *tsk = current; | ||
572 | |||
573 | if (user_mode(regs)) { | ||
574 | /* | ||
575 | * We want error_code and trap_no set for userspace | ||
576 | * faults and kernelspace faults which result in | ||
577 | * die(), but not kernelspace faults which are fixed | ||
578 | * up. die() gives the process no chance to handle | ||
579 | * the signal and notice the kernel fault information, | ||
580 | * so that won't result in polluting the information | ||
581 | * about previously queued, but not yet delivered, | ||
582 | * faults. See also do_general_protection below. | ||
583 | */ | ||
584 | tsk->thread.error_code = error_code; | ||
585 | tsk->thread.trap_no = trapnr; | ||
586 | |||
587 | if (show_unhandled_signals && unhandled_signal(tsk, signr) && | ||
588 | printk_ratelimit()) | ||
589 | printk(KERN_INFO | ||
590 | "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", | ||
591 | tsk->comm, tsk->pid, str, | ||
592 | regs->rip, regs->rsp, error_code); | ||
593 | |||
594 | if (info) | ||
595 | force_sig_info(signr, info, tsk); | ||
596 | else | ||
597 | force_sig(signr, tsk); | ||
598 | return; | ||
599 | } | ||
600 | |||
601 | |||
602 | /* kernel trap */ | ||
603 | { | ||
604 | const struct exception_table_entry *fixup; | ||
605 | fixup = search_exception_tables(regs->rip); | ||
606 | if (fixup) | ||
607 | regs->rip = fixup->fixup; | ||
608 | else { | ||
609 | tsk->thread.error_code = error_code; | ||
610 | tsk->thread.trap_no = trapnr; | ||
611 | die(str, regs, error_code); | ||
612 | } | ||
613 | return; | ||
614 | } | ||
615 | } | ||
616 | |||
617 | #define DO_ERROR(trapnr, signr, str, name) \ | ||
618 | asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ | ||
619 | { \ | ||
620 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | ||
621 | == NOTIFY_STOP) \ | ||
622 | return; \ | ||
623 | conditional_sti(regs); \ | ||
624 | do_trap(trapnr, signr, str, regs, error_code, NULL); \ | ||
625 | } | ||
626 | |||
627 | #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ | ||
628 | asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ | ||
629 | { \ | ||
630 | siginfo_t info; \ | ||
631 | info.si_signo = signr; \ | ||
632 | info.si_errno = 0; \ | ||
633 | info.si_code = sicode; \ | ||
634 | info.si_addr = (void __user *)siaddr; \ | ||
635 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | ||
636 | == NOTIFY_STOP) \ | ||
637 | return; \ | ||
638 | conditional_sti(regs); \ | ||
639 | do_trap(trapnr, signr, str, regs, error_code, &info); \ | ||
640 | } | ||
641 | |||
642 | DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) | ||
643 | DO_ERROR( 4, SIGSEGV, "overflow", overflow) | ||
644 | DO_ERROR( 5, SIGSEGV, "bounds", bounds) | ||
645 | DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip) | ||
646 | DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) | ||
647 | DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) | ||
648 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) | ||
649 | DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) | ||
650 | DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) | ||
651 | DO_ERROR(18, SIGSEGV, "reserved", reserved) | ||
652 | |||
653 | /* Runs on IST stack */ | ||
654 | asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) | ||
655 | { | ||
656 | if (notify_die(DIE_TRAP, "stack segment", regs, error_code, | ||
657 | 12, SIGBUS) == NOTIFY_STOP) | ||
658 | return; | ||
659 | preempt_conditional_sti(regs); | ||
660 | do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); | ||
661 | preempt_conditional_cli(regs); | ||
662 | } | ||
663 | |||
664 | asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) | ||
665 | { | ||
666 | static const char str[] = "double fault"; | ||
667 | struct task_struct *tsk = current; | ||
668 | |||
669 | /* Return not checked because double fault cannot be ignored */ | ||
670 | notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); | ||
671 | |||
672 | tsk->thread.error_code = error_code; | ||
673 | tsk->thread.trap_no = 8; | ||
674 | |||
675 | /* This is always a kernel trap and never fixable (and thus must | ||
676 | never return). */ | ||
677 | for (;;) | ||
678 | die(str, regs, error_code); | ||
679 | } | ||
680 | |||
681 | asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, | ||
682 | long error_code) | ||
683 | { | ||
684 | struct task_struct *tsk = current; | ||
685 | |||
686 | conditional_sti(regs); | ||
687 | |||
688 | if (user_mode(regs)) { | ||
689 | tsk->thread.error_code = error_code; | ||
690 | tsk->thread.trap_no = 13; | ||
691 | |||
692 | if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && | ||
693 | printk_ratelimit()) | ||
694 | printk(KERN_INFO | ||
695 | "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", | ||
696 | tsk->comm, tsk->pid, | ||
697 | regs->rip, regs->rsp, error_code); | ||
698 | |||
699 | force_sig(SIGSEGV, tsk); | ||
700 | return; | ||
701 | } | ||
702 | |||
703 | /* kernel gp */ | ||
704 | { | ||
705 | const struct exception_table_entry *fixup; | ||
706 | fixup = search_exception_tables(regs->rip); | ||
707 | if (fixup) { | ||
708 | regs->rip = fixup->fixup; | ||
709 | return; | ||
710 | } | ||
711 | |||
712 | tsk->thread.error_code = error_code; | ||
713 | tsk->thread.trap_no = 13; | ||
714 | if (notify_die(DIE_GPF, "general protection fault", regs, | ||
715 | error_code, 13, SIGSEGV) == NOTIFY_STOP) | ||
716 | return; | ||
717 | die("general protection fault", regs, error_code); | ||
718 | } | ||
719 | } | ||
720 | |||
721 | static __kprobes void | ||
722 | mem_parity_error(unsigned char reason, struct pt_regs * regs) | ||
723 | { | ||
724 | printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", | ||
725 | reason); | ||
726 | printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); | ||
727 | |||
728 | #if defined(CONFIG_EDAC) | ||
729 | if(edac_handler_set()) { | ||
730 | edac_atomic_assert_error(); | ||
731 | return; | ||
732 | } | ||
733 | #endif | ||
734 | |||
735 | if (panic_on_unrecovered_nmi) | ||
736 | panic("NMI: Not continuing"); | ||
737 | |||
738 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); | ||
739 | |||
740 | /* Clear and disable the memory parity error line. */ | ||
741 | reason = (reason & 0xf) | 4; | ||
742 | outb(reason, 0x61); | ||
743 | } | ||
744 | |||
745 | static __kprobes void | ||
746 | io_check_error(unsigned char reason, struct pt_regs * regs) | ||
747 | { | ||
748 | printk("NMI: IOCK error (debug interrupt?)\n"); | ||
749 | show_registers(regs); | ||
750 | |||
751 | /* Re-enable the IOCK line, wait for a few seconds */ | ||
752 | reason = (reason & 0xf) | 8; | ||
753 | outb(reason, 0x61); | ||
754 | mdelay(2000); | ||
755 | reason &= ~8; | ||
756 | outb(reason, 0x61); | ||
757 | } | ||
758 | |||
759 | static __kprobes void | ||
760 | unknown_nmi_error(unsigned char reason, struct pt_regs * regs) | ||
761 | { | ||
762 | printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", | ||
763 | reason); | ||
764 | printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); | ||
765 | |||
766 | if (panic_on_unrecovered_nmi) | ||
767 | panic("NMI: Not continuing"); | ||
768 | |||
769 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); | ||
770 | } | ||
771 | |||
772 | /* Runs on IST stack. This code must keep interrupts off all the time. | ||
773 | Nested NMIs are prevented by the CPU. */ | ||
774 | asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) | ||
775 | { | ||
776 | unsigned char reason = 0; | ||
777 | int cpu; | ||
778 | |||
779 | cpu = smp_processor_id(); | ||
780 | |||
781 | /* Only the BSP gets external NMIs from the system. */ | ||
782 | if (!cpu) | ||
783 | reason = get_nmi_reason(); | ||
784 | |||
785 | if (!(reason & 0xc0)) { | ||
786 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) | ||
787 | == NOTIFY_STOP) | ||
788 | return; | ||
789 | /* | ||
790 | * Ok, so this is none of the documented NMI sources, | ||
791 | * so it must be the NMI watchdog. | ||
792 | */ | ||
793 | if (nmi_watchdog_tick(regs,reason)) | ||
794 | return; | ||
795 | if (!do_nmi_callback(regs,cpu)) | ||
796 | unknown_nmi_error(reason, regs); | ||
797 | |||
798 | return; | ||
799 | } | ||
800 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) | ||
801 | return; | ||
802 | |||
803 | /* AK: following checks seem to be broken on modern chipsets. FIXME */ | ||
804 | |||
805 | if (reason & 0x80) | ||
806 | mem_parity_error(reason, regs); | ||
807 | if (reason & 0x40) | ||
808 | io_check_error(reason, regs); | ||
809 | } | ||
810 | |||
811 | /* runs on IST stack. */ | ||
812 | asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) | ||
813 | { | ||
814 | if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { | ||
815 | return; | ||
816 | } | ||
817 | preempt_conditional_sti(regs); | ||
818 | do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); | ||
819 | preempt_conditional_cli(regs); | ||
820 | } | ||
821 | |||
822 | /* Help handler running on IST stack to switch back to user stack | ||
823 | for scheduling or signal handling. The actual stack switch is done in | ||
824 | entry.S */ | ||
825 | asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) | ||
826 | { | ||
827 | struct pt_regs *regs = eregs; | ||
828 | /* Did already sync */ | ||
829 | if (eregs == (struct pt_regs *)eregs->rsp) | ||
830 | ; | ||
831 | /* Exception from user space */ | ||
832 | else if (user_mode(eregs)) | ||
833 | regs = task_pt_regs(current); | ||
834 | /* Exception from kernel and interrupts are enabled. Move to | ||
835 | kernel process stack. */ | ||
836 | else if (eregs->eflags & X86_EFLAGS_IF) | ||
837 | regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); | ||
838 | if (eregs != regs) | ||
839 | *regs = *eregs; | ||
840 | return regs; | ||
841 | } | ||
842 | |||
843 | /* runs on IST stack. */ | ||
844 | asmlinkage void __kprobes do_debug(struct pt_regs * regs, | ||
845 | unsigned long error_code) | ||
846 | { | ||
847 | unsigned long condition; | ||
848 | struct task_struct *tsk = current; | ||
849 | siginfo_t info; | ||
850 | |||
851 | get_debugreg(condition, 6); | ||
852 | |||
853 | if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, | ||
854 | SIGTRAP) == NOTIFY_STOP) | ||
855 | return; | ||
856 | |||
857 | preempt_conditional_sti(regs); | ||
858 | |||
859 | /* Mask out spurious debug traps due to lazy DR7 setting */ | ||
860 | if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { | ||
861 | if (!tsk->thread.debugreg7) { | ||
862 | goto clear_dr7; | ||
863 | } | ||
864 | } | ||
865 | |||
866 | tsk->thread.debugreg6 = condition; | ||
867 | |||
868 | /* Mask out spurious TF errors due to lazy TF clearing */ | ||
869 | if (condition & DR_STEP) { | ||
870 | /* | ||
871 | * The TF error should be masked out only if the current | ||
872 | * process is not traced and if the TRAP flag has been set | ||
873 | * previously by a tracing process (condition detected by | ||
874 | * the PT_DTRACE flag); remember that the i386 TRAP flag | ||
875 | * can be modified by the process itself in user mode, | ||
876 | * allowing programs to debug themselves without the ptrace() | ||
877 | * interface. | ||
878 | */ | ||
879 | if (!user_mode(regs)) | ||
880 | goto clear_TF_reenable; | ||
881 | /* | ||
882 | * Was the TF flag set by a debugger? If so, clear it now, | ||
883 | * so that register information is correct. | ||
884 | */ | ||
885 | if (tsk->ptrace & PT_DTRACE) { | ||
886 | regs->eflags &= ~TF_MASK; | ||
887 | tsk->ptrace &= ~PT_DTRACE; | ||
888 | } | ||
889 | } | ||
890 | |||
891 | /* Ok, finally something we can handle */ | ||
892 | tsk->thread.trap_no = 1; | ||
893 | tsk->thread.error_code = error_code; | ||
894 | info.si_signo = SIGTRAP; | ||
895 | info.si_errno = 0; | ||
896 | info.si_code = TRAP_BRKPT; | ||
897 | info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL; | ||
898 | force_sig_info(SIGTRAP, &info, tsk); | ||
899 | |||
900 | clear_dr7: | ||
901 | set_debugreg(0UL, 7); | ||
902 | preempt_conditional_cli(regs); | ||
903 | return; | ||
904 | |||
905 | clear_TF_reenable: | ||
906 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); | ||
907 | regs->eflags &= ~TF_MASK; | ||
908 | preempt_conditional_cli(regs); | ||
909 | } | ||
910 | |||
911 | static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) | ||
912 | { | ||
913 | const struct exception_table_entry *fixup; | ||
914 | fixup = search_exception_tables(regs->rip); | ||
915 | if (fixup) { | ||
916 | regs->rip = fixup->fixup; | ||
917 | return 1; | ||
918 | } | ||
919 | notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); | ||
920 | /* Illegal floating point operation in the kernel */ | ||
921 | current->thread.trap_no = trapnr; | ||
922 | die(str, regs, 0); | ||
923 | return 0; | ||
924 | } | ||
925 | |||
926 | /* | ||
927 | * Note that we play around with the 'TS' bit in an attempt to get | ||
928 | * the correct behaviour even in the presence of the asynchronous | ||
929 | * IRQ13 behaviour | ||
930 | */ | ||
931 | asmlinkage void do_coprocessor_error(struct pt_regs *regs) | ||
932 | { | ||
933 | void __user *rip = (void __user *)(regs->rip); | ||
934 | struct task_struct * task; | ||
935 | siginfo_t info; | ||
936 | unsigned short cwd, swd; | ||
937 | |||
938 | conditional_sti(regs); | ||
939 | if (!user_mode(regs) && | ||
940 | kernel_math_error(regs, "kernel x87 math error", 16)) | ||
941 | return; | ||
942 | |||
943 | /* | ||
944 | * Save the info for the exception handler and clear the error. | ||
945 | */ | ||
946 | task = current; | ||
947 | save_init_fpu(task); | ||
948 | task->thread.trap_no = 16; | ||
949 | task->thread.error_code = 0; | ||
950 | info.si_signo = SIGFPE; | ||
951 | info.si_errno = 0; | ||
952 | info.si_code = __SI_FAULT; | ||
953 | info.si_addr = rip; | ||
954 | /* | ||
955 | * (~cwd & swd) will mask out exceptions that are not set to unmasked | ||
956 | * status. 0x3f is the exception bits in these regs, 0x200 is the | ||
957 | * C1 reg you need in case of a stack fault, 0x040 is the stack | ||
958 | * fault bit. We should only be taking one exception at a time, | ||
959 | * so if this combination doesn't produce any single exception, | ||
960 | * then we have a bad program that isn't synchronizing its FPU usage | ||
961 | * and it will suffer the consequences since we won't be able to | ||
962 | * fully reproduce the context of the exception | ||
963 | */ | ||
964 | cwd = get_fpu_cwd(task); | ||
965 | swd = get_fpu_swd(task); | ||
966 | switch (swd & ~cwd & 0x3f) { | ||
967 | case 0x000: | ||
968 | default: | ||
969 | break; | ||
970 | case 0x001: /* Invalid Op */ | ||
971 | /* | ||
972 | * swd & 0x240 == 0x040: Stack Underflow | ||
973 | * swd & 0x240 == 0x240: Stack Overflow | ||
974 | * User must clear the SF bit (0x40) if set | ||
975 | */ | ||
976 | info.si_code = FPE_FLTINV; | ||
977 | break; | ||
978 | case 0x002: /* Denormalize */ | ||
979 | case 0x010: /* Underflow */ | ||
980 | info.si_code = FPE_FLTUND; | ||
981 | break; | ||
982 | case 0x004: /* Zero Divide */ | ||
983 | info.si_code = FPE_FLTDIV; | ||
984 | break; | ||
985 | case 0x008: /* Overflow */ | ||
986 | info.si_code = FPE_FLTOVF; | ||
987 | break; | ||
988 | case 0x020: /* Precision */ | ||
989 | info.si_code = FPE_FLTRES; | ||
990 | break; | ||
991 | } | ||
992 | force_sig_info(SIGFPE, &info, task); | ||
993 | } | ||
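The switch above looks only at exception bits that are both raised in the status word and unmasked in the control word (swd & ~cwd & 0x3f). A stand-alone sketch of that masking with assumed register values, not part of the patch:

#include <stdio.h>

int main(void)
{
	/* Assumed values: CWD 0x037a masks everything except invalid-op and
	 * zero-divide; SWD 0x0004 reports a zero-divide. */
	unsigned short cwd = 0x037a;
	unsigned short swd = 0x0004;
	unsigned short pending = swd & ~cwd & 0x3f;	/* same mask as above */

	switch (pending) {
	case 0x001: printf("FPE_FLTINV\n"); break;
	case 0x004: printf("FPE_FLTDIV\n"); break;	/* taken for these values */
	case 0x008: printf("FPE_FLTOVF\n"); break;
	default:    printf("other/none (0x%03x)\n", pending); break;
	}
	return 0;
}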
994 | |||
995 | asmlinkage void bad_intr(void) | ||
996 | { | ||
997 | printk("bad interrupt"); | ||
998 | } | ||
999 | |||
1000 | asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) | ||
1001 | { | ||
1002 | void __user *rip = (void __user *)(regs->rip); | ||
1003 | struct task_struct * task; | ||
1004 | siginfo_t info; | ||
1005 | unsigned short mxcsr; | ||
1006 | |||
1007 | conditional_sti(regs); | ||
1008 | if (!user_mode(regs) && | ||
1009 | kernel_math_error(regs, "kernel simd math error", 19)) | ||
1010 | return; | ||
1011 | |||
1012 | /* | ||
1013 | * Save the info for the exception handler and clear the error. | ||
1014 | */ | ||
1015 | task = current; | ||
1016 | save_init_fpu(task); | ||
1017 | task->thread.trap_no = 19; | ||
1018 | task->thread.error_code = 0; | ||
1019 | info.si_signo = SIGFPE; | ||
1020 | info.si_errno = 0; | ||
1021 | info.si_code = __SI_FAULT; | ||
1022 | info.si_addr = rip; | ||
1023 | /* | ||
1024 | * The SIMD FPU exceptions are handled a little differently, as there | ||
1025 | * is only a single status/control register. Thus, to determine which | ||
1026 | * unmasked exception was caught we must mask the exception mask bits | ||
1027 | * at 0x1f80, and then use these to mask the exception bits at 0x3f. | ||
1028 | */ | ||
1029 | mxcsr = get_fpu_mxcsr(task); | ||
1030 | switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { | ||
1031 | case 0x000: | ||
1032 | default: | ||
1033 | break; | ||
1034 | case 0x001: /* Invalid Op */ | ||
1035 | info.si_code = FPE_FLTINV; | ||
1036 | break; | ||
1037 | case 0x002: /* Denormalize */ | ||
1038 | case 0x010: /* Underflow */ | ||
1039 | info.si_code = FPE_FLTUND; | ||
1040 | break; | ||
1041 | case 0x004: /* Zero Divide */ | ||
1042 | info.si_code = FPE_FLTDIV; | ||
1043 | break; | ||
1044 | case 0x008: /* Overflow */ | ||
1045 | info.si_code = FPE_FLTOVF; | ||
1046 | break; | ||
1047 | case 0x020: /* Precision */ | ||
1048 | info.si_code = FPE_FLTRES; | ||
1049 | break; | ||
1050 | } | ||
1051 | force_sig_info(SIGFPE, &info, task); | ||
1052 | } | ||
1053 | |||
1054 | asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) | ||
1055 | { | ||
1056 | } | ||
1057 | |||
1058 | asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) | ||
1059 | { | ||
1060 | } | ||
1061 | |||
1062 | asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) | ||
1063 | { | ||
1064 | } | ||
1065 | |||
1066 | /* | ||
1067 | * 'math_state_restore()' saves the current math information in the | ||
1068 | * old math state array, and gets the new ones from the current task | ||
1069 | * | ||
1070 | * Careful.. There are problems with IBM-designed IRQ13 behaviour. | ||
1071 | * Don't touch unless you *really* know how it works. | ||
1072 | */ | ||
1073 | asmlinkage void math_state_restore(void) | ||
1074 | { | ||
1075 | struct task_struct *me = current; | ||
1076 | clts(); /* Allow maths ops (or we recurse) */ | ||
1077 | |||
1078 | if (!used_math()) | ||
1079 | init_fpu(me); | ||
1080 | restore_fpu_checking(&me->thread.i387.fxsave); | ||
1081 | task_thread_info(me)->status |= TS_USEDFPU; | ||
1082 | me->fpu_counter++; | ||
1083 | } | ||
1084 | |||
1085 | void __init trap_init(void) | ||
1086 | { | ||
1087 | set_intr_gate(0,&divide_error); | ||
1088 | set_intr_gate_ist(1,&debug,DEBUG_STACK); | ||
1089 | set_intr_gate_ist(2,&nmi,NMI_STACK); | ||
1090 | set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */ | ||
1091 | set_system_gate(4,&overflow); /* int4 can be called from all */ | ||
1092 | set_intr_gate(5,&bounds); | ||
1093 | set_intr_gate(6,&invalid_op); | ||
1094 | set_intr_gate(7,&device_not_available); | ||
1095 | set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK); | ||
1096 | set_intr_gate(9,&coprocessor_segment_overrun); | ||
1097 | set_intr_gate(10,&invalid_TSS); | ||
1098 | set_intr_gate(11,&segment_not_present); | ||
1099 | set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK); | ||
1100 | set_intr_gate(13,&general_protection); | ||
1101 | set_intr_gate(14,&page_fault); | ||
1102 | set_intr_gate(15,&spurious_interrupt_bug); | ||
1103 | set_intr_gate(16,&coprocessor_error); | ||
1104 | set_intr_gate(17,&alignment_check); | ||
1105 | #ifdef CONFIG_X86_MCE | ||
1106 | set_intr_gate_ist(18,&machine_check, MCE_STACK); | ||
1107 | #endif | ||
1108 | set_intr_gate(19,&simd_coprocessor_error); | ||
1109 | |||
1110 | #ifdef CONFIG_IA32_EMULATION | ||
1111 | set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); | ||
1112 | #endif | ||
1113 | |||
1114 | /* | ||
1115 | * Should be a barrier for any external CPU state. | ||
1116 | */ | ||
1117 | cpu_init(); | ||
1118 | } | ||
1119 | |||
1120 | |||
1121 | static int __init oops_setup(char *s) | ||
1122 | { | ||
1123 | if (!s) | ||
1124 | return -EINVAL; | ||
1125 | if (!strcmp(s, "panic")) | ||
1126 | panic_on_oops = 1; | ||
1127 | return 0; | ||
1128 | } | ||
1129 | early_param("oops", oops_setup); | ||
1130 | |||
1131 | static int __init kstack_setup(char *s) | ||
1132 | { | ||
1133 | if (!s) | ||
1134 | return -EINVAL; | ||
1135 | kstack_depth_to_print = simple_strtoul(s,NULL,0); | ||
1136 | return 0; | ||
1137 | } | ||
1138 | early_param("kstack", kstack_setup); | ||
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c new file mode 100644 index 000000000000..2a59bde663f2 --- /dev/null +++ b/arch/x86/kernel/tsc_64.c | |||
@@ -0,0 +1,207 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/sched.h> | ||
3 | #include <linux/interrupt.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/clocksource.h> | ||
6 | #include <linux/time.h> | ||
7 | #include <linux/acpi.h> | ||
8 | #include <linux/cpufreq.h> | ||
9 | |||
10 | #include <asm/timex.h> | ||
11 | |||
12 | static int notsc __initdata = 0; | ||
13 | |||
14 | unsigned int cpu_khz; /* TSC clocks / usec, not used here */ | ||
15 | EXPORT_SYMBOL(cpu_khz); | ||
16 | unsigned int tsc_khz; | ||
17 | EXPORT_SYMBOL(tsc_khz); | ||
18 | |||
19 | static unsigned int cyc2ns_scale __read_mostly; | ||
20 | |||
21 | void set_cyc2ns_scale(unsigned long khz) | ||
22 | { | ||
23 | cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; | ||
24 | } | ||
25 | |||
26 | static unsigned long long cycles_2_ns(unsigned long long cyc) | ||
27 | { | ||
28 | return (cyc * cyc2ns_scale) >> NS_SCALE; | ||
29 | } | ||
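cyc2ns_scale is a fixed-point factor: nanoseconds per cycle multiplied by 2^NS_SCALE, so cycles_2_ns() needs only a multiply and a shift on the hot path. A stand-alone sketch of the arithmetic, assuming NS_SCALE is 10 (the header defining it is not part of this hunk):

#include <stdio.h>

#define NS_SCALE	10		/* assumed for illustration */
#define NSEC_PER_MSEC	1000000ULL

int main(void)
{
	unsigned long long khz   = 2200000ULL;			/* assumed TSC rate */
	unsigned long long scale = (NSEC_PER_MSEC << NS_SCALE) / khz;
	unsigned long long cyc   = 2200000000ULL;		/* one second of cycles */

	/* Prints roughly 1e9 ns for one second's worth of cycles. */
	printf("scale = %llu, ns = %llu\n", scale, (cyc * scale) >> NS_SCALE);
	return 0;
}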
30 | |||
31 | unsigned long long sched_clock(void) | ||
32 | { | ||
33 | unsigned long a = 0; | ||
34 | |||
35 | /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, | ||
36 | * which means it is not completely exact and may not be monotonic | ||
37 | * between CPUs. But the errors should be too small to matter for | ||
38 | * scheduling purposes. | ||
39 | */ | ||
40 | |||
41 | rdtscll(a); | ||
42 | return cycles_2_ns(a); | ||
43 | } | ||
44 | |||
45 | static int tsc_unstable; | ||
46 | |||
47 | inline int check_tsc_unstable(void) | ||
48 | { | ||
49 | return tsc_unstable; | ||
50 | } | ||
51 | #ifdef CONFIG_CPU_FREQ | ||
52 | |||
53 | /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency | ||
54 | * changes. | ||
55 | * | ||
56 | * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's | ||
57 | * not that important because current Opteron setups do not support | ||
58 | * scaling on SMP anyway. | ||
59 | * | ||
60 | * Should fix up last_tsc too. Currently gettimeofday in the | ||
61 | * first tick after the change will be slightly wrong. | ||
62 | */ | ||
63 | |||
64 | static unsigned int ref_freq; | ||
65 | static unsigned long loops_per_jiffy_ref; | ||
66 | static unsigned long tsc_khz_ref; | ||
67 | |||
68 | static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, | ||
69 | void *data) | ||
70 | { | ||
71 | struct cpufreq_freqs *freq = data; | ||
72 | unsigned long *lpj, dummy; | ||
73 | |||
74 | if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) | ||
75 | return 0; | ||
76 | |||
77 | lpj = &dummy; | ||
78 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | ||
79 | #ifdef CONFIG_SMP | ||
80 | lpj = &cpu_data[freq->cpu].loops_per_jiffy; | ||
81 | #else | ||
82 | lpj = &boot_cpu_data.loops_per_jiffy; | ||
83 | #endif | ||
84 | |||
85 | if (!ref_freq) { | ||
86 | ref_freq = freq->old; | ||
87 | loops_per_jiffy_ref = *lpj; | ||
88 | tsc_khz_ref = tsc_khz; | ||
89 | } | ||
90 | if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || | ||
91 | (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || | ||
92 | (val == CPUFREQ_RESUMECHANGE)) { | ||
93 | *lpj = | ||
94 | cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); | ||
95 | |||
96 | tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); | ||
97 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | ||
98 | mark_tsc_unstable("cpufreq changes"); | ||
99 | } | ||
100 | |||
101 | set_cyc2ns_scale(tsc_khz_ref); | ||
102 | |||
103 | return 0; | ||
104 | } | ||
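cpufreq_scale() (defined in the cpufreq core, not in this hunk) is a simple proportion, new = old * new_freq / ref_freq, applied here to both loops_per_jiffy and tsc_khz. A hedged stand-alone sketch with made-up frequencies:

#include <stdio.h>

/* Proportional rescale, same shape as the cpufreq helper (assumed). */
static unsigned long scale(unsigned long old, unsigned int ref_freq,
			   unsigned int new_freq)
{
	return (unsigned long)((unsigned long long)old * new_freq / ref_freq);
}

int main(void)
{
	/* A TSC calibrated at 2,200,000 kHz with the core at 2,200,000 kHz
	 * rescales to 1,000,000 kHz when cpufreq drops the core to 1 GHz. */
	printf("tsc_khz -> %lu\n", scale(2200000UL, 2200000U, 1000000U));
	return 0;
}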
105 | |||
106 | static struct notifier_block time_cpufreq_notifier_block = { | ||
107 | .notifier_call = time_cpufreq_notifier | ||
108 | }; | ||
109 | |||
110 | static int __init cpufreq_tsc(void) | ||
111 | { | ||
112 | cpufreq_register_notifier(&time_cpufreq_notifier_block, | ||
113 | CPUFREQ_TRANSITION_NOTIFIER); | ||
114 | return 0; | ||
115 | } | ||
116 | |||
117 | core_initcall(cpufreq_tsc); | ||
118 | |||
119 | #endif | ||
120 | |||
121 | /* | ||
122 | * Make an educated guess if the TSC is trustworthy and synchronized | ||
123 | * over all CPUs. | ||
124 | */ | ||
125 | __cpuinit int unsynchronized_tsc(void) | ||
126 | { | ||
127 | if (tsc_unstable) | ||
128 | return 1; | ||
129 | |||
130 | #ifdef CONFIG_SMP | ||
131 | if (apic_is_clustered_box()) | ||
132 | return 1; | ||
133 | #endif | ||
134 | /* Most Intel systems have synchronized TSCs except for | ||
135 | multi-node systems */ | ||
136 | if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { | ||
137 | #ifdef CONFIG_ACPI | ||
138 | /* But TSC doesn't tick in C3 so don't use it there */ | ||
139 | if (acpi_gbl_FADT.header.length > 0 && | ||
140 | acpi_gbl_FADT.C3latency < 1000) | ||
141 | return 1; | ||
142 | #endif | ||
143 | return 0; | ||
144 | } | ||
145 | |||
146 | /* Assume multi socket systems are not synchronized */ | ||
147 | return num_present_cpus() > 1; | ||
148 | } | ||
149 | |||
150 | int __init notsc_setup(char *s) | ||
151 | { | ||
152 | notsc = 1; | ||
153 | return 1; | ||
154 | } | ||
155 | |||
156 | __setup("notsc", notsc_setup); | ||
157 | |||
158 | |||
159 | /* clock source code: */ | ||
160 | static cycle_t read_tsc(void) | ||
161 | { | ||
162 | cycle_t ret = (cycle_t)get_cycles_sync(); | ||
163 | return ret; | ||
164 | } | ||
165 | |||
166 | static cycle_t __vsyscall_fn vread_tsc(void) | ||
167 | { | ||
168 | cycle_t ret = (cycle_t)get_cycles_sync(); | ||
169 | return ret; | ||
170 | } | ||
171 | |||
172 | static struct clocksource clocksource_tsc = { | ||
173 | .name = "tsc", | ||
174 | .rating = 300, | ||
175 | .read = read_tsc, | ||
176 | .mask = CLOCKSOURCE_MASK(64), | ||
177 | .shift = 22, | ||
178 | .flags = CLOCK_SOURCE_IS_CONTINUOUS | | ||
179 | CLOCK_SOURCE_MUST_VERIFY, | ||
180 | .vread = vread_tsc, | ||
181 | }; | ||
182 | |||
183 | void mark_tsc_unstable(char *reason) | ||
184 | { | ||
185 | if (!tsc_unstable) { | ||
186 | tsc_unstable = 1; | ||
187 | printk("Marking TSC unstable due to %s\n", reason); | ||
188 | /* Change only the rating, when not registered */ | ||
189 | if (clocksource_tsc.mult) | ||
190 | clocksource_change_rating(&clocksource_tsc, 0); | ||
191 | else | ||
192 | clocksource_tsc.rating = 0; | ||
193 | } | ||
194 | } | ||
195 | EXPORT_SYMBOL_GPL(mark_tsc_unstable); | ||
196 | |||
197 | void __init init_tsc_clocksource(void) | ||
198 | { | ||
199 | if (!notsc) { | ||
200 | clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, | ||
201 | clocksource_tsc.shift); | ||
202 | if (check_tsc_unstable()) | ||
203 | clocksource_tsc.rating = 0; | ||
204 | |||
205 | clocksource_register(&clocksource_tsc); | ||
206 | } | ||
207 | } | ||
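clocksource_khz2mult() is not shown in this hunk; assuming the usual definition, roughly (10^6 << shift) / khz with rounding, the mult/shift pair lets the timekeeping core convert cycles to nanoseconds with one multiply and one shift. A stand-alone sketch for an assumed 2.2 GHz TSC and the shift of 22 used above:

#include <stdio.h>

int main(void)
{
	unsigned long long khz   = 2200000ULL;				/* assumed tsc_khz */
	unsigned long long shift = 22;					/* .shift above    */
	unsigned long long mult  = ((1000000ULL << shift) + khz / 2) / khz;
	unsigned long long cyc   = 2200000000ULL;			/* ~1 s of cycles  */

	/* The timekeeping core computes ns = (cycles * mult) >> shift. */
	printf("mult = %llu, ns = %llu\n", mult, (cyc * mult) >> shift);
	return 0;
}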
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 12424629af87..355f5f506c81 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c | |||
@@ -1 +1,187 @@ | |||
1 | #include "../../x86_64/kernel/tsc_sync.c" | 1 | /* |
2 | * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization. | ||
3 | * | ||
4 | * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar | ||
5 | * | ||
6 | * We check whether all boot CPUs have their TSC's synchronized, | ||
7 | * print a warning if not and turn off the TSC clock-source. | ||
8 | * | ||
9 | * The warp-check is point-to-point between two CPUs, the CPU | ||
10 | * initiating the bootup is the 'source CPU', the freshly booting | ||
11 | * CPU is the 'target CPU'. | ||
12 | * | ||
13 | * Only two CPUs may participate - they can enter in any order. | ||
14 | * ( The serial nature of the boot logic and the CPU hotplug lock | ||
15 | * protects against more than 2 CPUs entering this code. ) | ||
16 | */ | ||
17 | #include <linux/spinlock.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/smp.h> | ||
21 | #include <linux/nmi.h> | ||
22 | #include <asm/tsc.h> | ||
23 | |||
24 | /* | ||
25 | * Entry/exit counters that make sure that both CPUs | ||
26 | * run the measurement code at once: | ||
27 | */ | ||
28 | static __cpuinitdata atomic_t start_count; | ||
29 | static __cpuinitdata atomic_t stop_count; | ||
30 | |||
31 | /* | ||
32 | * We use a raw spinlock in this exceptional case, because | ||
33 | * we want to have the fastest, inlined, non-debug version | ||
34 | * of a critical section, to be able to prove TSC time-warps: | ||
35 | */ | ||
36 | static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; | ||
37 | static __cpuinitdata cycles_t last_tsc; | ||
38 | static __cpuinitdata cycles_t max_warp; | ||
39 | static __cpuinitdata int nr_warps; | ||
40 | |||
41 | /* | ||
42 | * TSC-warp measurement loop running on both CPUs: | ||
43 | */ | ||
44 | static __cpuinit void check_tsc_warp(void) | ||
45 | { | ||
46 | cycles_t start, now, prev, end; | ||
47 | int i; | ||
48 | |||
49 | start = get_cycles_sync(); | ||
50 | /* | ||
51 | * The measurement runs for 20 msecs: | ||
52 | */ | ||
53 | end = start + tsc_khz * 20ULL; | ||
54 | now = start; | ||
55 | |||
56 | for (i = 0; ; i++) { | ||
57 | /* | ||
58 | * We take the global lock, measure TSC, save the | ||
59 | * previous TSC that was measured (possibly on | ||
60 | * another CPU) and update the previous TSC timestamp. | ||
61 | */ | ||
62 | __raw_spin_lock(&sync_lock); | ||
63 | prev = last_tsc; | ||
64 | now = get_cycles_sync(); | ||
65 | last_tsc = now; | ||
66 | __raw_spin_unlock(&sync_lock); | ||
67 | |||
68 | /* | ||
69 | * Be nice every now and then (and also check whether | ||
70 | * measurement is done [we also insert a 100 million | ||
72 | * loops safety exit, so we don't lock up in case the | ||
72 | * TSC readout is totally broken]): | ||
73 | */ | ||
74 | if (unlikely(!(i & 7))) { | ||
75 | if (now > end || i > 100000000) | ||
76 | break; | ||
77 | cpu_relax(); | ||
78 | touch_nmi_watchdog(); | ||
79 | } | ||
80 | /* | ||
81 | * Outside the critical section we can now see whether | ||
82 | * we saw a time-warp of the TSC going backwards: | ||
83 | */ | ||
84 | if (unlikely(prev > now)) { | ||
85 | __raw_spin_lock(&sync_lock); | ||
86 | max_warp = max(max_warp, prev - now); | ||
87 | nr_warps++; | ||
88 | __raw_spin_unlock(&sync_lock); | ||
89 | } | ||
90 | |||
91 | } | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * Source CPU calls into this - it waits for the freshly booted | ||
96 | * target CPU to arrive and then starts the measurement: | ||
97 | */ | ||
98 | void __cpuinit check_tsc_sync_source(int cpu) | ||
99 | { | ||
100 | int cpus = 2; | ||
101 | |||
102 | /* | ||
103 | * No need to check if we already know that the TSC is not | ||
104 | * synchronized: | ||
105 | */ | ||
106 | if (unsynchronized_tsc()) | ||
107 | return; | ||
108 | |||
109 | printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", | ||
110 | smp_processor_id(), cpu); | ||
111 | |||
112 | /* | ||
113 | * Reset it - in case this is a second bootup: | ||
114 | */ | ||
115 | atomic_set(&stop_count, 0); | ||
116 | |||
117 | /* | ||
118 | * Wait for the target to arrive: | ||
119 | */ | ||
120 | while (atomic_read(&start_count) != cpus-1) | ||
121 | cpu_relax(); | ||
122 | /* | ||
123 | * Trigger the target to continue into the measurement too: | ||
124 | */ | ||
125 | atomic_inc(&start_count); | ||
126 | |||
127 | check_tsc_warp(); | ||
128 | |||
129 | while (atomic_read(&stop_count) != cpus-1) | ||
130 | cpu_relax(); | ||
131 | |||
132 | /* | ||
133 | * Reset it - just in case we boot another CPU later: | ||
134 | */ | ||
135 | atomic_set(&start_count, 0); | ||
136 | |||
137 | if (nr_warps) { | ||
138 | printk("\n"); | ||
139 | printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," | ||
140 | " turning off TSC clock.\n", max_warp); | ||
141 | mark_tsc_unstable("check_tsc_sync_source failed"); | ||
142 | nr_warps = 0; | ||
143 | max_warp = 0; | ||
144 | last_tsc = 0; | ||
145 | } else { | ||
146 | printk(" passed.\n"); | ||
147 | } | ||
148 | |||
149 | /* | ||
150 | * Let the target continue with the bootup: | ||
151 | */ | ||
152 | atomic_inc(&stop_count); | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * Freshly booted CPUs call into this: | ||
157 | */ | ||
158 | void __cpuinit check_tsc_sync_target(void) | ||
159 | { | ||
160 | int cpus = 2; | ||
161 | |||
162 | if (unsynchronized_tsc()) | ||
163 | return; | ||
164 | |||
165 | /* | ||
166 | * Register this CPU's participation and wait for the | ||
167 | * source CPU to start the measurement: | ||
168 | */ | ||
169 | atomic_inc(&start_count); | ||
170 | while (atomic_read(&start_count) != cpus) | ||
171 | cpu_relax(); | ||
172 | |||
173 | check_tsc_warp(); | ||
174 | |||
175 | /* | ||
176 | * Ok, we are done: | ||
177 | */ | ||
178 | atomic_inc(&stop_count); | ||
179 | |||
180 | /* | ||
181 | * Wait for the source CPU to print stuff: | ||
182 | */ | ||
183 | while (atomic_read(&stop_count) != cpus) | ||
184 | cpu_relax(); | ||
185 | } | ||
186 | #undef NR_LOOPS | ||
187 | |||
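For reference, the warp check above is a point-to-point protocol: both CPUs repeatedly publish a timestamp into last_tsc under a raw spinlock and record a warp whenever the previously published value turns out to be newer than what they just read. The user-space sketch below reproduces the idea with pthreads and CLOCK_MONOTONIC in place of two CPUs and the TSC; it illustrates the protocol and is not part of this patch.

    /* Sketch only: two threads cross-check a shared timestamp for warps. */
    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>

    static pthread_mutex_t sync_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long long last_ns, max_warp;
    static int nr_warps;

    static unsigned long long now_ns(void)
    {
            struct timespec ts;
            clock_gettime(CLOCK_MONOTONIC, &ts);
            return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
    }

    static void *warp_check(void *arg)
    {
            unsigned long long start = now_ns(), end = start + 20000000ULL; /* 20 ms */
            unsigned long long prev, now;

            do {
                    /* publish our timestamp, grab the previously published one */
                    pthread_mutex_lock(&sync_lock);
                    prev = last_ns;
                    now = now_ns();
                    last_ns = now;
                    pthread_mutex_unlock(&sync_lock);

                    if (prev > now) {       /* time appeared to go backwards */
                            pthread_mutex_lock(&sync_lock);
                            if (prev - now > max_warp)
                                    max_warp = prev - now;
                            nr_warps++;
                            pthread_mutex_unlock(&sync_lock);
                    }
            } while (now < end);
            return NULL;
    }

    int main(void)
    {
            pthread_t a, b;

            pthread_create(&a, NULL, warp_check, NULL);
            pthread_create(&b, NULL, warp_check, NULL);
            pthread_join(a, NULL);
            pthread_join(b, NULL);

            if (nr_warps)
                    printf("measured %llu ns warp\n", max_warp);
            else
                    printf("passed\n");
            return 0;
    }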
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S new file mode 100644 index 000000000000..45b6f8a975a1 --- /dev/null +++ b/arch/x86/kernel/verify_cpu_64.S | |||
@@ -0,0 +1,105 @@ | |||
1 | /* | ||
2 | * | ||
3 | * verify_cpu.S - Code for cpu long mode and SSE verification. This | ||
4 | * code has been borrowed from boot/setup.S and was introduced by | ||
5 | * Andi Kleen. | ||
6 | * | ||
7 | * Copyright (c) 2007 Andi Kleen (ak@suse.de) | ||
8 | * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) | ||
9 | * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) | ||
10 | * | ||
11 | * This source code is licensed under the GNU General Public License, | ||
12 | * Version 2. See the file COPYING for more details. | ||
13 | * | ||
14 | * This is common code for verifying whether the CPU supports | ||
15 | * long mode and SSE. It is not called directly; instead, this | ||
16 | * file is included in various places and compiled in that context. | ||
17 | * The current usages are: | ||
18 | * | ||
19 | * This file is included by both 16bit and 32bit code. | ||
20 | * | ||
21 | * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) | ||
22 | * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit) | ||
23 | * arch/x86_64/kernel/trampoline.S: secondary processor verification (16bit) | ||
24 | * arch/x86_64/kernel/acpi/wakeup.S: Verification at resume (16bit) | ||
25 | * | ||
26 | * verify_cpu, returns the status of cpu check in register %eax. | ||
27 | * 0: Success 1: Failure | ||
28 | * | ||
29 | * The caller needs to check for the error code and take the action | ||
30 | * appropriately. Either display a message or halt. | ||
31 | */ | ||
32 | |||
33 | #include <asm/cpufeature.h> | ||
34 | |||
35 | verify_cpu: | ||
36 | pushfl # Save caller passed flags | ||
37 | pushl $0 # Kill any dangerous flags | ||
38 | popfl | ||
39 | |||
40 | pushfl # standard way to check for cpuid | ||
41 | popl %eax | ||
42 | movl %eax,%ebx | ||
43 | xorl $0x200000,%eax | ||
44 | pushl %eax | ||
45 | popfl | ||
46 | pushfl | ||
47 | popl %eax | ||
48 | cmpl %eax,%ebx | ||
49 | jz verify_cpu_no_longmode # cpu has no cpuid | ||
50 | |||
51 | movl $0x0,%eax # See if cpuid 1 is implemented | ||
52 | cpuid | ||
53 | cmpl $0x1,%eax | ||
54 | jb verify_cpu_no_longmode # no cpuid 1 | ||
55 | |||
56 | xor %di,%di | ||
57 | cmpl $0x68747541,%ebx # AuthenticAMD | ||
58 | jnz verify_cpu_noamd | ||
59 | cmpl $0x69746e65,%edx | ||
60 | jnz verify_cpu_noamd | ||
61 | cmpl $0x444d4163,%ecx | ||
62 | jnz verify_cpu_noamd | ||
63 | mov $1,%di # cpu is from AMD | ||
64 | |||
65 | verify_cpu_noamd: | ||
66 | movl $0x1,%eax # Does the cpu have what it takes | ||
67 | cpuid | ||
68 | andl $REQUIRED_MASK0,%edx | ||
69 | xorl $REQUIRED_MASK0,%edx | ||
70 | jnz verify_cpu_no_longmode | ||
71 | |||
72 | movl $0x80000000,%eax # See if extended cpuid is implemented | ||
73 | cpuid | ||
74 | cmpl $0x80000001,%eax | ||
75 | jb verify_cpu_no_longmode # no extended cpuid | ||
76 | |||
77 | movl $0x80000001,%eax # Does the cpu have what it takes | ||
78 | cpuid | ||
79 | andl $REQUIRED_MASK1,%edx | ||
80 | xorl $REQUIRED_MASK1,%edx | ||
81 | jnz verify_cpu_no_longmode | ||
82 | |||
83 | verify_cpu_sse_test: | ||
84 | movl $1,%eax | ||
85 | cpuid | ||
86 | andl $SSE_MASK,%edx | ||
87 | cmpl $SSE_MASK,%edx | ||
88 | je verify_cpu_sse_ok | ||
89 | test %di,%di | ||
90 | jz verify_cpu_no_longmode # only try to force SSE on AMD | ||
91 | movl $0xc0010015,%ecx # HWCR | ||
92 | rdmsr | ||
93 | btr $15,%eax # enable SSE | ||
94 | wrmsr | ||
95 | xor %di,%di # don't loop | ||
96 | jmp verify_cpu_sse_test # try again | ||
97 | |||
98 | verify_cpu_no_longmode: | ||
99 | popfl # Restore caller passed flags | ||
100 | movl $1,%eax | ||
101 | ret | ||
102 | verify_cpu_sse_ok: | ||
103 | popfl # Restore caller passed flags | ||
104 | xorl %eax, %eax | ||
105 | ret | ||
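For reference, the assembly above reduces to: confirm that CPUID exists, that extended leaf 0x80000001 is implemented, and that the required feature bits are present (the long-mode bit is EDX bit 29 of leaf 0x80000001; the SSE bits tested via SSE_MASK are EDX bits 25/26 of leaf 1). A C sketch of the long-mode part, using GCC's <cpuid.h> instead of hand-written assembly, follows; it is illustrative only.

    /* Sketch only: the same long-mode capability test in C. */
    #include <cpuid.h>
    #include <stdio.h>

    static int cpu_has_long_mode(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (__get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx) == 0)
                    return 0;               /* no CPUID at all */
            if (eax < 0x80000001)
                    return 0;               /* no extended leaf 1 */
            if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx) == 0)
                    return 0;
            return (edx >> 29) & 1;         /* long-mode (LM) bit */
    }

    int main(void)
    {
            printf("long mode: %s\n", cpu_has_long_mode() ? "yes" : "no");
            return 0;
    }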
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S new file mode 100644 index 000000000000..ba8ea97abd21 --- /dev/null +++ b/arch/x86/kernel/vmlinux_64.lds.S | |||
@@ -0,0 +1,235 @@ | |||
1 | /* ld script to make x86-64 Linux kernel | ||
2 | * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; | ||
3 | */ | ||
4 | |||
5 | #define LOAD_OFFSET __START_KERNEL_map | ||
6 | |||
7 | #include <asm-generic/vmlinux.lds.h> | ||
8 | #include <asm/page.h> | ||
9 | |||
10 | #undef i386 /* in case the preprocessor is a 32bit one */ | ||
11 | |||
12 | OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") | ||
13 | OUTPUT_ARCH(i386:x86-64) | ||
14 | ENTRY(phys_startup_64) | ||
15 | jiffies_64 = jiffies; | ||
16 | _proxy_pda = 1; | ||
17 | PHDRS { | ||
18 | text PT_LOAD FLAGS(5); /* R_E */ | ||
19 | data PT_LOAD FLAGS(7); /* RWE */ | ||
20 | user PT_LOAD FLAGS(7); /* RWE */ | ||
21 | data.init PT_LOAD FLAGS(7); /* RWE */ | ||
22 | note PT_NOTE FLAGS(4); /* R__ */ | ||
23 | } | ||
24 | SECTIONS | ||
25 | { | ||
26 | . = __START_KERNEL; | ||
27 | phys_startup_64 = startup_64 - LOAD_OFFSET; | ||
28 | _text = .; /* Text and read-only data */ | ||
29 | .text : AT(ADDR(.text) - LOAD_OFFSET) { | ||
30 | /* First the code that has to be first for bootstrapping */ | ||
31 | *(.text.head) | ||
32 | _stext = .; | ||
33 | /* Then the rest */ | ||
34 | TEXT_TEXT | ||
35 | SCHED_TEXT | ||
36 | LOCK_TEXT | ||
37 | KPROBES_TEXT | ||
38 | *(.fixup) | ||
39 | *(.gnu.warning) | ||
40 | } :text = 0x9090 | ||
41 | /* out-of-line lock text */ | ||
42 | .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } | ||
43 | |||
44 | _etext = .; /* End of text section */ | ||
45 | |||
46 | . = ALIGN(16); /* Exception table */ | ||
47 | __start___ex_table = .; | ||
48 | __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } | ||
49 | __stop___ex_table = .; | ||
50 | |||
51 | NOTES :text :note | ||
52 | |||
53 | BUG_TABLE :text | ||
54 | |||
55 | RODATA | ||
56 | |||
57 | . = ALIGN(4); | ||
58 | .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { | ||
59 | __tracedata_start = .; | ||
60 | *(.tracedata) | ||
61 | __tracedata_end = .; | ||
62 | } | ||
63 | |||
64 | . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ | ||
65 | /* Data */ | ||
66 | .data : AT(ADDR(.data) - LOAD_OFFSET) { | ||
67 | DATA_DATA | ||
68 | CONSTRUCTORS | ||
69 | } :data | ||
70 | |||
71 | _edata = .; /* End of data section */ | ||
72 | |||
73 | . = ALIGN(PAGE_SIZE); | ||
74 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
75 | .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { | ||
76 | *(.data.cacheline_aligned) | ||
77 | } | ||
78 | . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); | ||
79 | .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { | ||
80 | *(.data.read_mostly) | ||
81 | } | ||
82 | |||
83 | #define VSYSCALL_ADDR (-10*1024*1024) | ||
84 | #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) | ||
85 | #define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) | ||
86 | |||
87 | #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) | ||
88 | #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) | ||
89 | |||
90 | #define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) | ||
91 | #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) | ||
92 | |||
93 | . = VSYSCALL_ADDR; | ||
94 | .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user | ||
95 | __vsyscall_0 = VSYSCALL_VIRT_ADDR; | ||
96 | |||
97 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
98 | .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } | ||
99 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
100 | .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) | ||
101 | { *(.vsyscall_gtod_data) } | ||
102 | vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); | ||
103 | .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) | ||
104 | { *(.vsyscall_clock) } | ||
105 | vsyscall_clock = VVIRT(.vsyscall_clock); | ||
106 | |||
107 | |||
108 | .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) | ||
109 | { *(.vsyscall_1) } | ||
110 | .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) | ||
111 | { *(.vsyscall_2) } | ||
112 | |||
113 | .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } | ||
114 | vgetcpu_mode = VVIRT(.vgetcpu_mode); | ||
115 | |||
116 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
117 | .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } | ||
118 | jiffies = VVIRT(.jiffies); | ||
119 | |||
120 | .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) | ||
121 | { *(.vsyscall_3) } | ||
122 | |||
123 | . = VSYSCALL_VIRT_ADDR + 4096; | ||
124 | |||
125 | #undef VSYSCALL_ADDR | ||
126 | #undef VSYSCALL_PHYS_ADDR | ||
127 | #undef VSYSCALL_VIRT_ADDR | ||
128 | #undef VLOAD_OFFSET | ||
129 | #undef VLOAD | ||
130 | #undef VVIRT_OFFSET | ||
131 | #undef VVIRT | ||
132 | |||
133 | . = ALIGN(8192); /* init_task */ | ||
134 | .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { | ||
135 | *(.data.init_task) | ||
136 | }:data.init | ||
137 | |||
138 | . = ALIGN(4096); | ||
139 | .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { | ||
140 | *(.data.page_aligned) | ||
141 | } | ||
142 | |||
143 | /* might get freed after init */ | ||
144 | . = ALIGN(4096); | ||
145 | __smp_alt_begin = .; | ||
146 | __smp_locks = .; | ||
147 | .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { | ||
148 | *(.smp_locks) | ||
149 | } | ||
150 | __smp_locks_end = .; | ||
151 | . = ALIGN(4096); | ||
152 | __smp_alt_end = .; | ||
153 | |||
154 | . = ALIGN(4096); /* Init code and data */ | ||
155 | __init_begin = .; | ||
156 | .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { | ||
157 | _sinittext = .; | ||
158 | *(.init.text) | ||
159 | _einittext = .; | ||
160 | } | ||
161 | __initdata_begin = .; | ||
162 | .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } | ||
163 | __initdata_end = .; | ||
164 | . = ALIGN(16); | ||
165 | __setup_start = .; | ||
166 | .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } | ||
167 | __setup_end = .; | ||
168 | __initcall_start = .; | ||
169 | .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { | ||
170 | INITCALLS | ||
171 | } | ||
172 | __initcall_end = .; | ||
173 | __con_initcall_start = .; | ||
174 | .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { | ||
175 | *(.con_initcall.init) | ||
176 | } | ||
177 | __con_initcall_end = .; | ||
178 | SECURITY_INIT | ||
179 | . = ALIGN(8); | ||
180 | __alt_instructions = .; | ||
181 | .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { | ||
182 | *(.altinstructions) | ||
183 | } | ||
184 | __alt_instructions_end = .; | ||
185 | .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { | ||
186 | *(.altinstr_replacement) | ||
187 | } | ||
188 | /* .exit.text is discarded at runtime, not link time, to deal with references | ||
189 | from .altinstructions and .eh_frame */ | ||
190 | .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } | ||
191 | .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } | ||
192 | |||
193 | /* vdso blob that is mapped into user space */ | ||
194 | vdso_start = . ; | ||
195 | .vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) } | ||
196 | . = ALIGN(4096); | ||
197 | vdso_end = .; | ||
198 | |||
199 | #ifdef CONFIG_BLK_DEV_INITRD | ||
200 | . = ALIGN(4096); | ||
201 | __initramfs_start = .; | ||
202 | .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } | ||
203 | __initramfs_end = .; | ||
204 | #endif | ||
205 | |||
206 | PERCPU(4096) | ||
207 | |||
208 | . = ALIGN(4096); | ||
209 | __init_end = .; | ||
210 | |||
211 | . = ALIGN(4096); | ||
212 | __nosave_begin = .; | ||
213 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } | ||
214 | . = ALIGN(4096); | ||
215 | __nosave_end = .; | ||
216 | |||
217 | __bss_start = .; /* BSS */ | ||
218 | .bss : AT(ADDR(.bss) - LOAD_OFFSET) { | ||
219 | *(.bss.page_aligned) | ||
220 | *(.bss) | ||
221 | } | ||
222 | __bss_stop = .; | ||
223 | |||
224 | _end = . ; | ||
225 | |||
226 | /* Sections to be discarded */ | ||
227 | /DISCARD/ : { | ||
228 | *(.exitcall.exit) | ||
229 | *(.eh_frame) | ||
230 | } | ||
231 | |||
232 | STABS_DEBUG | ||
233 | |||
234 | DWARF_DEBUG | ||
235 | } | ||
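For reference, the VSYSCALL_* arithmetic above fixes the vsyscall page at -10 MB in the 64-bit address space, i.e. 0xffffffffff600000, with one 1024-byte slot per vsyscall (.vsyscall_1 at +1024, .vsyscall_2 at +2048, .vsyscall_3 at +3072). A small sketch that only prints the resulting slot addresses, assuming a 64-bit unsigned long:

    /* Sketch only: the fixed vsyscall slot addresses implied by the script. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long vsyscall_addr = (unsigned long)(-10L * 1024 * 1024);
            int nr;

            for (nr = 0; nr < 4; nr++)
                    printf("vsyscall_%d at %#lx\n", nr, vsyscall_addr + nr * 1024UL);
            /* prints 0xffffffffff600000, ...600400, ...600800, ...600c00 */
            return 0;
    }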
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c new file mode 100644 index 000000000000..414caf0c5f9a --- /dev/null +++ b/arch/x86/kernel/vsmp_64.c | |||
@@ -0,0 +1,49 @@ | |||
1 | /* | ||
2 | * vSMPowered(tm) systems specific initialization | ||
3 | * Copyright (C) 2005 ScaleMP Inc. | ||
4 | * | ||
5 | * Use of this code is subject to the terms and conditions of the | ||
6 | * GNU general public license version 2. See "COPYING" or | ||
7 | * http://www.gnu.org/licenses/gpl.html | ||
8 | * | ||
9 | * Ravikiran Thirumalai <kiran@scalemp.com>, | ||
10 | * Shai Fultheim <shai@scalemp.com> | ||
11 | */ | ||
12 | |||
13 | #include <linux/init.h> | ||
14 | #include <linux/pci_ids.h> | ||
15 | #include <linux/pci_regs.h> | ||
16 | #include <asm/pci-direct.h> | ||
17 | #include <asm/io.h> | ||
18 | |||
19 | static int __init vsmp_init(void) | ||
20 | { | ||
21 | void *address; | ||
22 | unsigned int cap, ctl; | ||
23 | |||
24 | if (!early_pci_allowed()) | ||
25 | return 0; | ||
26 | |||
27 | /* Check if we are running on a ScaleMP vSMP box */ | ||
28 | if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) || | ||
29 | (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) | ||
30 | return 0; | ||
31 | |||
32 | /* set vSMP magic bits to indicate vSMP capable kernel */ | ||
33 | address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8); | ||
34 | cap = readl(address); | ||
35 | ctl = readl(address + 4); | ||
36 | printk("vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl); | ||
37 | if (cap & ctl & (1 << 4)) { | ||
38 | /* Turn on vSMP IRQ fastpath handling (see system.h) */ | ||
39 | ctl &= ~(1 << 4); | ||
40 | writel(ctl, address + 4); | ||
41 | ctl = readl(address + 4); | ||
42 | printk("vSMP CTL: control set to:0x%08x\n", ctl); | ||
43 | } | ||
44 | |||
45 | iounmap(address); | ||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | core_initcall(vsmp_init); | ||
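For reference, read_pci_config_16()/read_pci_config() used above are the early type-1 PCI configuration-space accessors (CONFIG_ADDRESS on port 0xCF8, CONFIG_DATA on port 0xCFC). The sketch below shows how the CONFIG_ADDRESS word is formed for the device probed here (bus 0, device 0x1f, function 0); the helper name is made up for illustration, only the bit layout is the standard mechanism.

    /* Sketch only: type-1 PCI CONFIG_ADDRESS encoding. */
    #include <stdio.h>

    static unsigned int pci_conf1_addr(unsigned int bus, unsigned int dev,
                                       unsigned int fn, unsigned int reg)
    {
            return 0x80000000u | (bus << 16) | (dev << 11) | (fn << 8) | (reg & ~3u);
    }

    int main(void)
    {
            /* PCI_VENDOR_ID lives at config offset 0 */
            printf("CONFIG_ADDRESS = %#x\n", pci_conf1_addr(0, 0x1f, 0, 0));
            return 0;
    }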
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c new file mode 100644 index 000000000000..06c34949bfdc --- /dev/null +++ b/arch/x86/kernel/vsyscall_64.c | |||
@@ -0,0 +1,349 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/vsyscall.c | ||
3 | * | ||
4 | * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE | ||
5 | * Copyright 2003 Andi Kleen, SuSE Labs. | ||
6 | * | ||
7 | * Thanks to hpa@transmeta.com for some useful hint. | ||
8 | * Special thanks to Ingo Molnar for his early experience with | ||
9 | * a different vsyscall implementation for Linux/IA32 and for the name. | ||
10 | * | ||
11 | * vsyscall 0 is located at -10Mbyte, vsyscall 1 is located | ||
12 | * at virtual address -10Mbyte+1024bytes etc... There are at most 4 | ||
13 | * vsyscalls. One vsyscall can reserve more than 1 slot to avoid | ||
14 | * jumping out of line if necessary. We cannot add more with this | ||
15 | * mechanism because older kernels won't return -ENOSYS. | ||
16 | * If we want more than four we need a vDSO. | ||
17 | * | ||
18 | * Note: the concept clashes with user mode linux. If you use UML and | ||
19 | * want per guest time just set the kernel.vsyscall64 sysctl to 0. | ||
20 | */ | ||
21 | |||
22 | #include <linux/time.h> | ||
23 | #include <linux/init.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/timer.h> | ||
26 | #include <linux/seqlock.h> | ||
27 | #include <linux/jiffies.h> | ||
28 | #include <linux/sysctl.h> | ||
29 | #include <linux/clocksource.h> | ||
30 | #include <linux/getcpu.h> | ||
31 | #include <linux/cpu.h> | ||
32 | #include <linux/smp.h> | ||
33 | #include <linux/notifier.h> | ||
34 | |||
35 | #include <asm/vsyscall.h> | ||
36 | #include <asm/pgtable.h> | ||
37 | #include <asm/page.h> | ||
38 | #include <asm/unistd.h> | ||
39 | #include <asm/fixmap.h> | ||
40 | #include <asm/errno.h> | ||
41 | #include <asm/io.h> | ||
42 | #include <asm/segment.h> | ||
43 | #include <asm/desc.h> | ||
44 | #include <asm/topology.h> | ||
45 | #include <asm/vgtod.h> | ||
46 | |||
47 | #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) | ||
48 | #define __syscall_clobber "r11","rcx","memory" | ||
49 | #define __pa_vsymbol(x) \ | ||
50 | ({unsigned long v; \ | ||
51 | extern char __vsyscall_0; \ | ||
52 | asm("" : "=r" (v) : "0" (x)); \ | ||
53 | ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); }) | ||
54 | |||
55 | /* | ||
56 | * vsyscall_gtod_data contains data that is : | ||
57 | * - readonly from vsyscalls | ||
58 | * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64) | ||
59 | * Try to keep this structure as small as possible to avoid cache line ping pongs | ||
60 | */ | ||
61 | int __vgetcpu_mode __section_vgetcpu_mode; | ||
62 | |||
63 | struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = | ||
64 | { | ||
65 | .lock = SEQLOCK_UNLOCKED, | ||
66 | .sysctl_enabled = 1, | ||
67 | }; | ||
68 | |||
69 | void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) | ||
70 | { | ||
71 | unsigned long flags; | ||
72 | |||
73 | write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); | ||
74 | /* copy vsyscall data */ | ||
75 | vsyscall_gtod_data.clock.vread = clock->vread; | ||
76 | vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; | ||
77 | vsyscall_gtod_data.clock.mask = clock->mask; | ||
78 | vsyscall_gtod_data.clock.mult = clock->mult; | ||
79 | vsyscall_gtod_data.clock.shift = clock->shift; | ||
80 | vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; | ||
81 | vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; | ||
82 | vsyscall_gtod_data.sys_tz = sys_tz; | ||
83 | vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; | ||
84 | vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; | ||
85 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); | ||
86 | } | ||
87 | |||
88 | /* RED-PEN may want to re-add seq locking, but then the variable should be | ||
89 | * write-once. | ||
90 | */ | ||
91 | static __always_inline void do_get_tz(struct timezone * tz) | ||
92 | { | ||
93 | *tz = __vsyscall_gtod_data.sys_tz; | ||
94 | } | ||
95 | |||
96 | static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) | ||
97 | { | ||
98 | int ret; | ||
99 | asm volatile("vsysc2: syscall" | ||
100 | : "=a" (ret) | ||
101 | : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) | ||
102 | : __syscall_clobber ); | ||
103 | return ret; | ||
104 | } | ||
105 | |||
106 | static __always_inline long time_syscall(long *t) | ||
107 | { | ||
108 | long secs; | ||
109 | asm volatile("vsysc1: syscall" | ||
110 | : "=a" (secs) | ||
111 | : "0" (__NR_time),"D" (t) : __syscall_clobber); | ||
112 | return secs; | ||
113 | } | ||
114 | |||
115 | static __always_inline void do_vgettimeofday(struct timeval * tv) | ||
116 | { | ||
117 | cycle_t now, base, mask, cycle_delta; | ||
118 | unsigned seq; | ||
119 | unsigned long mult, shift, nsec; | ||
120 | cycle_t (*vread)(void); | ||
121 | do { | ||
122 | seq = read_seqbegin(&__vsyscall_gtod_data.lock); | ||
123 | |||
124 | vread = __vsyscall_gtod_data.clock.vread; | ||
125 | if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { | ||
126 | gettimeofday(tv,NULL); | ||
127 | return; | ||
128 | } | ||
129 | now = vread(); | ||
130 | base = __vsyscall_gtod_data.clock.cycle_last; | ||
131 | mask = __vsyscall_gtod_data.clock.mask; | ||
132 | mult = __vsyscall_gtod_data.clock.mult; | ||
133 | shift = __vsyscall_gtod_data.clock.shift; | ||
134 | |||
135 | tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; | ||
136 | nsec = __vsyscall_gtod_data.wall_time_nsec; | ||
137 | } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); | ||
138 | |||
139 | /* calculate interval: */ | ||
140 | cycle_delta = (now - base) & mask; | ||
141 | /* convert to nsecs: */ | ||
142 | nsec += (cycle_delta * mult) >> shift; | ||
143 | |||
144 | while (nsec >= NSEC_PER_SEC) { | ||
145 | tv->tv_sec += 1; | ||
146 | nsec -= NSEC_PER_SEC; | ||
147 | } | ||
148 | tv->tv_usec = nsec / NSEC_PER_USEC; | ||
149 | } | ||
150 | |||
151 | int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) | ||
152 | { | ||
153 | if (tv) | ||
154 | do_vgettimeofday(tv); | ||
155 | if (tz) | ||
156 | do_get_tz(tz); | ||
157 | return 0; | ||
158 | } | ||
159 | |||
160 | /* This will break when the xtime seconds get inaccurate, but that is | ||
161 | * unlikely */ | ||
162 | time_t __vsyscall(1) vtime(time_t *t) | ||
163 | { | ||
164 | struct timeval tv; | ||
165 | time_t result; | ||
166 | if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) | ||
167 | return time_syscall(t); | ||
168 | |||
169 | vgettimeofday(&tv, 0); | ||
170 | result = tv.tv_sec; | ||
171 | if (t) | ||
172 | *t = result; | ||
173 | return result; | ||
174 | } | ||
175 | |||
176 | /* Fast way to get current CPU and node. | ||
177 | This helps implement per-node and per-CPU caches in user space. | ||
178 | The result is not guaranteed without CPU affinity, but usually | ||
179 | works out because the scheduler tries to keep a thread on the same | ||
180 | CPU. | ||
181 | |||
182 | tcache must point to a two-element long array. | ||
183 | All arguments can be NULL. */ | ||
184 | long __vsyscall(2) | ||
185 | vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) | ||
186 | { | ||
187 | unsigned int dummy, p; | ||
188 | unsigned long j = 0; | ||
189 | |||
190 | /* Fast cache - only recompute value once per jiffies and avoid | ||
191 | relatively costly rdtscp/cpuid otherwise. | ||
192 | This works because the scheduler usually keeps the process | ||
193 | on the same CPU and this syscall doesn't guarantee its | ||
194 | results anyway. | ||
195 | We do this here because otherwise user space would do it on | ||
196 | its own in a likely inferior way (no access to jiffies). | ||
197 | If you don't like it pass NULL. */ | ||
198 | if (tcache && tcache->blob[0] == (j = __jiffies)) { | ||
199 | p = tcache->blob[1]; | ||
200 | } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { | ||
201 | /* Load per CPU data from RDTSCP */ | ||
202 | rdtscp(dummy, dummy, p); | ||
203 | } else { | ||
204 | /* Load per CPU data from GDT */ | ||
205 | asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); | ||
206 | } | ||
207 | if (tcache) { | ||
208 | tcache->blob[0] = j; | ||
209 | tcache->blob[1] = p; | ||
210 | } | ||
211 | if (cpu) | ||
212 | *cpu = p & 0xfff; | ||
213 | if (node) | ||
214 | *node = p >> 12; | ||
215 | return 0; | ||
216 | } | ||
217 | |||
218 | long __vsyscall(3) venosys_1(void) | ||
219 | { | ||
220 | return -ENOSYS; | ||
221 | } | ||
222 | |||
223 | #ifdef CONFIG_SYSCTL | ||
224 | |||
225 | #define SYSCALL 0x050f | ||
226 | #define NOP2 0x9090 | ||
227 | |||
228 | /* | ||
229 | * NOP out syscall in vsyscall page when not needed. | ||
230 | */ | ||
231 | static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, | ||
232 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
233 | { | ||
234 | extern u16 vsysc1, vsysc2; | ||
235 | u16 __iomem *map1; | ||
236 | u16 __iomem *map2; | ||
237 | int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); | ||
238 | if (!write) | ||
239 | return ret; | ||
240 | /* gcc has some trouble with __va(__pa()), so just do it this | ||
241 | way. */ | ||
242 | map1 = ioremap(__pa_vsymbol(&vsysc1), 2); | ||
243 | if (!map1) | ||
244 | return -ENOMEM; | ||
245 | map2 = ioremap(__pa_vsymbol(&vsysc2), 2); | ||
246 | if (!map2) { | ||
247 | ret = -ENOMEM; | ||
248 | goto out; | ||
249 | } | ||
250 | if (!vsyscall_gtod_data.sysctl_enabled) { | ||
251 | writew(SYSCALL, map1); | ||
252 | writew(SYSCALL, map2); | ||
253 | } else { | ||
254 | writew(NOP2, map1); | ||
255 | writew(NOP2, map2); | ||
256 | } | ||
257 | iounmap(map2); | ||
258 | out: | ||
259 | iounmap(map1); | ||
260 | return ret; | ||
261 | } | ||
262 | |||
263 | static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, | ||
264 | void __user *oldval, size_t __user *oldlenp, | ||
265 | void __user *newval, size_t newlen) | ||
266 | { | ||
267 | return -ENOSYS; | ||
268 | } | ||
269 | |||
270 | static ctl_table kernel_table2[] = { | ||
271 | { .ctl_name = 99, .procname = "vsyscall64", | ||
272 | .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), | ||
273 | .mode = 0644, | ||
274 | .strategy = vsyscall_sysctl_nostrat, | ||
275 | .proc_handler = vsyscall_sysctl_change }, | ||
276 | {} | ||
277 | }; | ||
278 | |||
279 | static ctl_table kernel_root_table2[] = { | ||
280 | { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, | ||
281 | .child = kernel_table2 }, | ||
282 | {} | ||
283 | }; | ||
284 | |||
285 | #endif | ||
286 | |||
287 | /* Assume __initcall executes before all user space. Hopefully kmod | ||
288 | doesn't violate that. We'll find out if it does. */ | ||
289 | static void __cpuinit vsyscall_set_cpu(int cpu) | ||
290 | { | ||
291 | unsigned long *d; | ||
292 | unsigned long node = 0; | ||
293 | #ifdef CONFIG_NUMA | ||
294 | node = cpu_to_node[cpu]; | ||
295 | #endif | ||
296 | if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) | ||
297 | write_rdtscp_aux((node << 12) | cpu); | ||
298 | |||
299 | /* Store cpu number in limit so that it can be loaded quickly | ||
300 | in user space in vgetcpu. | ||
301 | 12 bits for the CPU and 8 bits for the node. */ | ||
302 | d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU); | ||
303 | *d = 0x0f40000000000ULL; | ||
304 | *d |= cpu; | ||
305 | *d |= (node & 0xf) << 12; | ||
306 | *d |= (node >> 4) << 48; | ||
307 | } | ||
308 | |||
309 | static void __cpuinit cpu_vsyscall_init(void *arg) | ||
310 | { | ||
311 | /* preemption should be already off */ | ||
312 | vsyscall_set_cpu(raw_smp_processor_id()); | ||
313 | } | ||
314 | |||
315 | static int __cpuinit | ||
316 | cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) | ||
317 | { | ||
318 | long cpu = (long)arg; | ||
319 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) | ||
320 | smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); | ||
321 | return NOTIFY_DONE; | ||
322 | } | ||
323 | |||
324 | static void __init map_vsyscall(void) | ||
325 | { | ||
326 | extern char __vsyscall_0; | ||
327 | unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); | ||
328 | |||
329 | /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ | ||
330 | __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); | ||
331 | } | ||
332 | |||
333 | static int __init vsyscall_init(void) | ||
334 | { | ||
335 | BUG_ON(((unsigned long) &vgettimeofday != | ||
336 | VSYSCALL_ADDR(__NR_vgettimeofday))); | ||
337 | BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); | ||
338 | BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); | ||
339 | BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); | ||
340 | map_vsyscall(); | ||
341 | #ifdef CONFIG_SYSCTL | ||
342 | register_sysctl_table(kernel_root_table2); | ||
343 | #endif | ||
344 | on_each_cpu(cpu_vsyscall_init, NULL, 0, 1); | ||
345 | hotcpu_notifier(cpu_vsyscall_notifier, 0); | ||
346 | return 0; | ||
347 | } | ||
348 | |||
349 | __initcall(vsyscall_init); | ||
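For reference, user space of this era reached these vsyscalls by calling their fixed addresses directly: slot n sits at 0xffffffffff600000 + n*1024, so vgettimeofday (slot 0) is at 0xffffffffff600000, vtime (slot 1) at 0xffffffffff600400 and vgetcpu (slot 2) at 0xffffffffff600800. A minimal usage sketch follows; it reflects how the page was used at the time and is illustrative only.

    /* Sketch only: calling the fixed-address vsyscalls defined above. */
    #include <stdio.h>
    #include <sys/time.h>

    typedef int (*vgettimeofday_t)(struct timeval *, struct timezone *);
    typedef long (*vgetcpu_t)(unsigned int *, unsigned int *, void *);

    int main(void)
    {
            vgettimeofday_t vgettimeofday = (vgettimeofday_t)0xffffffffff600000UL;
            vgetcpu_t vgetcpu = (vgetcpu_t)0xffffffffff600800UL;
            struct timeval tv;
            unsigned int cpu, node;

            vgettimeofday(&tv, NULL);
            vgetcpu(&cpu, &node, NULL);
            printf("sec=%ld cpu=%u node=%u\n", (long)tv.tv_sec, cpu, node);
            return 0;
    }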
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c new file mode 100644 index 000000000000..77c25b307635 --- /dev/null +++ b/arch/x86/kernel/x8664_ksyms_64.c | |||
@@ -0,0 +1,62 @@ | |||
1 | /* Exports for assembly files. | ||
2 | All C exports should go in the respective C files. */ | ||
3 | |||
4 | #include <linux/module.h> | ||
5 | #include <linux/smp.h> | ||
6 | |||
7 | #include <asm/semaphore.h> | ||
8 | #include <asm/processor.h> | ||
9 | #include <asm/uaccess.h> | ||
10 | #include <asm/pgtable.h> | ||
11 | |||
12 | EXPORT_SYMBOL(kernel_thread); | ||
13 | |||
14 | EXPORT_SYMBOL(__down_failed); | ||
15 | EXPORT_SYMBOL(__down_failed_interruptible); | ||
16 | EXPORT_SYMBOL(__down_failed_trylock); | ||
17 | EXPORT_SYMBOL(__up_wakeup); | ||
18 | |||
19 | EXPORT_SYMBOL(__get_user_1); | ||
20 | EXPORT_SYMBOL(__get_user_2); | ||
21 | EXPORT_SYMBOL(__get_user_4); | ||
22 | EXPORT_SYMBOL(__get_user_8); | ||
23 | EXPORT_SYMBOL(__put_user_1); | ||
24 | EXPORT_SYMBOL(__put_user_2); | ||
25 | EXPORT_SYMBOL(__put_user_4); | ||
26 | EXPORT_SYMBOL(__put_user_8); | ||
27 | |||
28 | EXPORT_SYMBOL(copy_user_generic); | ||
29 | EXPORT_SYMBOL(__copy_user_nocache); | ||
30 | EXPORT_SYMBOL(copy_from_user); | ||
31 | EXPORT_SYMBOL(copy_to_user); | ||
32 | EXPORT_SYMBOL(__copy_from_user_inatomic); | ||
33 | |||
34 | EXPORT_SYMBOL(copy_page); | ||
35 | EXPORT_SYMBOL(clear_page); | ||
36 | |||
37 | #ifdef CONFIG_SMP | ||
38 | extern void __write_lock_failed(rwlock_t *rw); | ||
39 | extern void __read_lock_failed(rwlock_t *rw); | ||
40 | EXPORT_SYMBOL(__write_lock_failed); | ||
41 | EXPORT_SYMBOL(__read_lock_failed); | ||
42 | #endif | ||
43 | |||
44 | /* Export string functions. We normally rely on gcc builtin for most of these, | ||
45 | but gcc sometimes decides not to inline them. */ | ||
46 | #undef memcpy | ||
47 | #undef memset | ||
48 | #undef memmove | ||
49 | |||
50 | extern void * memset(void *,int,__kernel_size_t); | ||
51 | extern void * memcpy(void *,const void *,__kernel_size_t); | ||
52 | extern void * __memcpy(void *,const void *,__kernel_size_t); | ||
53 | |||
54 | EXPORT_SYMBOL(memset); | ||
55 | EXPORT_SYMBOL(memcpy); | ||
56 | EXPORT_SYMBOL(__memcpy); | ||
57 | |||
58 | EXPORT_SYMBOL(empty_zero_page); | ||
59 | EXPORT_SYMBOL(init_level4_pgt); | ||
60 | EXPORT_SYMBOL(load_gs_index); | ||
61 | |||
62 | EXPORT_SYMBOL(_proxy_pda); | ||