diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/x86_64 |
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'arch/x86_64')
119 files changed, 35709 insertions, 0 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig new file mode 100644 index 000000000000..80c38c5d71fe --- /dev/null +++ b/arch/x86_64/Kconfig | |||
@@ -0,0 +1,477 @@ | |||
1 | # | ||
2 | # For a description of the syntax of this configuration file, | ||
3 | # see Documentation/kbuild/kconfig-language.txt. | ||
4 | # | ||
5 | # Note: ISA is disabled and will hopefully never be enabled. | ||
6 | # If you managed to buy an ISA x86-64 box you'll have to fix all the | ||
7 | # ISA drivers you need yourself. | ||
8 | # | ||
9 | |||
10 | mainmenu "Linux Kernel Configuration" | ||
11 | |||
12 | config X86_64 | ||
13 | bool | ||
14 | default y | ||
15 | help | ||
16 | Port to the x86-64 architecture. x86-64 is a 64-bit extension to the | ||
17 | classical 32-bit x86 architecture. For details see | ||
18 | <http://www.x86-64.org/>. | ||
19 | |||
20 | config 64BIT | ||
21 | def_bool y | ||
22 | |||
23 | config X86 | ||
24 | bool | ||
25 | default y | ||
26 | |||
27 | config MMU | ||
28 | bool | ||
29 | default y | ||
30 | |||
31 | config ISA | ||
32 | bool | ||
33 | |||
34 | config SBUS | ||
35 | bool | ||
36 | |||
37 | config RWSEM_GENERIC_SPINLOCK | ||
38 | bool | ||
39 | default y | ||
40 | |||
41 | config RWSEM_XCHGADD_ALGORITHM | ||
42 | bool | ||
43 | |||
44 | config GENERIC_CALIBRATE_DELAY | ||
45 | bool | ||
46 | default y | ||
47 | |||
48 | config X86_CMPXCHG | ||
49 | bool | ||
50 | default y | ||
51 | |||
52 | config EARLY_PRINTK | ||
53 | bool | ||
54 | default y | ||
55 | |||
56 | config GENERIC_ISA_DMA | ||
57 | bool | ||
58 | default y | ||
59 | |||
60 | config GENERIC_IOMAP | ||
61 | bool | ||
62 | default y | ||
63 | |||
64 | source "init/Kconfig" | ||
65 | |||
66 | |||
67 | menu "Processor type and features" | ||
68 | |||
69 | choice | ||
70 | prompt "Processor family" | ||
71 | default MK8 | ||
72 | |||
73 | config MK8 | ||
74 | bool "AMD-Opteron/Athlon64" | ||
75 | help | ||
76 | Optimize for AMD Opteron/Athlon64/Hammer/K8 CPUs. | ||
77 | |||
78 | config MPSC | ||
79 | bool "Intel EM64T" | ||
80 | help | ||
81 | Optimize for Intel Pentium 4 and Xeon CPUs with Intel | ||
82 | Extended Memory 64 Technology(EM64T). For details see | ||
83 | <http://www.intel.com/technology/64bitextensions/>. | ||
84 | |||
85 | config GENERIC_CPU | ||
86 | bool "Generic-x86-64" | ||
87 | help | ||
88 | Generic x86-64 CPU. | ||
89 | |||
90 | endchoice | ||
91 | |||
92 | # | ||
93 | # Define implied options from the CPU selection here | ||
94 | # | ||
95 | config X86_L1_CACHE_BYTES | ||
96 | int | ||
97 | default "128" if GENERIC_CPU || MPSC | ||
98 | default "64" if MK8 | ||
99 | |||
100 | config X86_L1_CACHE_SHIFT | ||
101 | int | ||
102 | default "7" if GENERIC_CPU || MPSC | ||
103 | default "6" if MK8 | ||
104 | |||
105 | config X86_TSC | ||
106 | bool | ||
107 | default y | ||
108 | |||
109 | config X86_GOOD_APIC | ||
110 | bool | ||
111 | default y | ||
112 | |||
113 | config MICROCODE | ||
114 | tristate "/dev/cpu/microcode - Intel CPU microcode support" | ||
115 | ---help--- | ||
116 | If you say Y here and also to "/dev file system support" in the | ||
117 | 'File systems' section, you will be able to update the microcode on Intel processors. You will | ||
118 | obviously need the actual microcode binary data itself which is | ||
119 | not shipped with the Linux kernel. | ||
120 | |||
121 | For latest news and information on obtaining all the required | ||
122 | ingredients for this driver, check: | ||
123 | <http://www.urbanmyth.org/microcode/>. | ||
124 | |||
125 | To compile this driver as a module, choose M here: the | ||
126 | module will be called microcode. | ||
127 | If you use modprobe or kmod you may also want to add the line | ||
128 | 'alias char-major-10-184 microcode' to your /etc/modules.conf file. | ||
129 | |||
130 | config X86_MSR | ||
131 | tristate "/dev/cpu/*/msr - Model-specific register support" | ||
132 | help | ||
133 | This device gives privileged processes access to the x86 | ||
134 | Model-Specific Registers (MSRs). It is a character device with | ||
135 | major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr. | ||
136 | MSR accesses are directed to a specific CPU on multi-processor | ||
137 | systems. | ||
138 | |||
139 | config X86_CPUID | ||
140 | tristate "/dev/cpu/*/cpuid - CPU information support" | ||
141 | help | ||
142 | This device gives processes access to the x86 CPUID instruction to | ||
143 | be executed on a specific processor. It is a character device | ||
144 | with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to | ||
145 | /dev/cpu/31/cpuid. | ||
146 | |||
147 | # disable it for opteron optimized builds because it pulls in ACPI_BOOT | ||
148 | config X86_HT | ||
149 | bool | ||
150 | depends on SMP && !MK8 | ||
151 | default y | ||
152 | |||
153 | config MATH_EMULATION | ||
154 | bool | ||
155 | |||
156 | config MCA | ||
157 | bool | ||
158 | |||
159 | config EISA | ||
160 | bool | ||
161 | |||
162 | config X86_IO_APIC | ||
163 | bool | ||
164 | default y | ||
165 | |||
166 | config X86_LOCAL_APIC | ||
167 | bool | ||
168 | default y | ||
169 | |||
170 | config MTRR | ||
171 | bool "MTRR (Memory Type Range Register) support" | ||
172 | ---help--- | ||
173 | On Intel P6 family processors (Pentium Pro, Pentium II and later) | ||
174 | the Memory Type Range Registers (MTRRs) may be used to control | ||
175 | processor access to memory ranges. This is most useful if you have | ||
176 | a video (VGA) card on a PCI or AGP bus. Enabling write-combining | ||
177 | allows bus write transfers to be combined into a larger transfer | ||
178 | before bursting over the PCI/AGP bus. This can increase performance | ||
179 | of image write operations 2.5 times or more. Saying Y here creates a | ||
180 | /proc/mtrr file which may be used to manipulate your processor's | ||
181 | MTRRs. Typically the X server should use this. | ||
182 | |||
183 | This code has a reasonably generic interface so that similar | ||
184 | control registers on other processors can be easily supported | ||
185 | as well. | ||
186 | |||
187 | Saying Y here also fixes a problem with buggy SMP BIOSes which only | ||
188 | set the MTRRs for the boot CPU and not for the secondary CPUs. This | ||
189 | can lead to all sorts of problems, so it's good to say Y here. | ||
190 | |||
191 | Just say Y here, all x86-64 machines support MTRRs. | ||
192 | |||
193 | See <file:Documentation/mtrr.txt> for more information. | ||
194 | |||
195 | config SMP | ||
196 | bool "Symmetric multi-processing support" | ||
197 | ---help--- | ||
198 | This enables support for systems with more than one CPU. If you have | ||
199 | a system with only one CPU, like most personal computers, say N. If | ||
200 | you have a system with more than one CPU, say Y. | ||
201 | |||
202 | If you say N here, the kernel will run on single and multiprocessor | ||
203 | machines, but will use only one CPU of a multiprocessor machine. If | ||
204 | you say Y here, the kernel will run on many, but not all, | ||
205 | single-processor machines. On a single-processor machine, the kernel | ||
206 | will run faster if you say N here. | ||
207 | |||
208 | If you don't know what to do here, say N. | ||
209 | |||
210 | config PREEMPT | ||
211 | bool "Preemptible Kernel" | ||
212 | ---help--- | ||
213 | This option reduces the latency of the kernel when reacting to | ||
214 | real-time or interactive events by allowing a low priority process to | ||
215 | be preempted even if it is in kernel mode executing a system call. | ||
216 | This allows applications to run more reliably even when the system is | ||
217 | under load. On the other hand, it may also break your drivers and add | ||
218 | priority inheritance problems to your system. Don't select it if | ||
219 | you rely on a stable system or have slightly obscure hardware. | ||
220 | It's also not very well tested on x86-64 currently. | ||
221 | You have been warned. | ||
222 | |||
223 | Say Y here if you are feeling brave and building a kernel for a | ||
224 | desktop, embedded or real-time system. Say N if you are unsure. | ||
225 | |||
226 | config PREEMPT_BKL | ||
227 | bool "Preempt The Big Kernel Lock" | ||
228 | depends on PREEMPT | ||
229 | default y | ||
230 | help | ||
231 | This option reduces the latency of the kernel by making the | ||
232 | big kernel lock preemptible. | ||
233 | |||
234 | Say Y here if you are building a kernel for a desktop system. | ||
235 | Say N if you are unsure. | ||
236 | |||
237 | config SCHED_SMT | ||
238 | bool "SMT (Hyperthreading) scheduler support" | ||
239 | depends on SMP | ||
240 | default n | ||
241 | help | ||
242 | SMT scheduler support improves the CPU scheduler's decision making | ||
243 | when dealing with Intel Pentium 4 chips with HyperThreading at a | ||
244 | cost of slightly increased overhead in some places. If unsure say | ||
245 | N here. | ||
246 | |||
247 | config K8_NUMA | ||
248 | bool "K8 NUMA support" | ||
249 | select NUMA | ||
250 | depends on SMP | ||
251 | help | ||
252 | Enable NUMA (Non-Uniform Memory Access) support for | ||
253 | AMD Opteron Multiprocessor systems. The kernel will try to allocate | ||
254 | memory used by a CPU on the local memory controller of the CPU | ||
255 | and add some more NUMA awareness to the kernel. | ||
256 | This code is recommended on all multiprocessor Opteron systems | ||
257 | and normally doesn't hurt on others. | ||
258 | |||
259 | config NUMA_EMU | ||
260 | bool "NUMA emulation support" | ||
261 | select NUMA | ||
262 | depends on SMP | ||
263 | help | ||
264 | Enable NUMA emulation. A flat machine will be split | ||
265 | into virtual nodes when booted with "numa=fake=N", where N is the | ||
266 | number of nodes. This is only useful for debugging. | ||
267 | |||
268 | config DISCONTIGMEM | ||
269 | bool | ||
270 | depends on NUMA | ||
271 | default y | ||
272 | |||
273 | config NUMA | ||
274 | bool | ||
275 | default n | ||
276 | |||
277 | config HAVE_DEC_LOCK | ||
278 | bool | ||
279 | depends on SMP | ||
280 | default y | ||
281 | |||
282 | config NR_CPUS | ||
283 | int "Maximum number of CPUs (2-256)" | ||
284 | range 2 256 | ||
285 | depends on SMP | ||
286 | default "8" | ||
287 | help | ||
288 | This allows you to specify the maximum number of CPUs which this | ||
289 | kernel will support. Current maximum is 256 CPUs due to | ||
290 | APIC addressing limits. The usable limit may be lower depending on the hardware. | ||
291 | |||
292 | This is purely to save memory - each supported CPU requires | ||
293 | memory in the static kernel configuration. | ||
294 | |||
295 | config HPET_TIMER | ||
296 | bool | ||
297 | default y | ||
298 | help | ||
299 | Use the IA-PC HPET (High Precision Event Timer) to manage | ||
300 | time in preference to the PIT and RTC, if a HPET is | ||
301 | present. The HPET provides a stable time base on SMP | ||
302 | systems, unlike the TSC, but it is more expensive to access, | ||
303 | as it is off-chip. You can find the HPET spec at | ||
304 | <http://www.intel.com/labs/platcomp/hpet/hpetspec.htm>. | ||
305 | |||
306 | config HPET_EMULATE_RTC | ||
307 | bool "Provide RTC interrupt" | ||
308 | depends on HPET_TIMER && RTC=y | ||
309 | |||
310 | config GART_IOMMU | ||
311 | bool "IOMMU support" | ||
312 | depends on PCI | ||
313 | help | ||
314 | Support the K8 IOMMU. Needed to run systems with more than 4GB of memory | ||
315 | properly with 32-bit PCI devices that do not support DAC (Double Address | ||
316 | Cycle). The IOMMU can be turned off at runtime with the iommu=off parameter. | ||
317 | Normally the kernel will make the right choice by itself. | ||
318 | If unsure, say Y. | ||
319 | |||
320 | # need this always enabled with GART_IOMMU for the VIA workaround | ||
321 | config SWIOTLB | ||
322 | bool | ||
323 | depends on GART_IOMMU | ||
324 | default y | ||
325 | |||
326 | config DUMMY_IOMMU | ||
327 | bool | ||
328 | depends on !GART_IOMMU && !SWIOTLB | ||
329 | default y | ||
330 | help | ||
331 | Don't use IOMMU code. This will cause problems when you have more than 4GB | ||
332 | of memory and any 32-bit devices. Don't turn on unless you know what you | ||
333 | are doing. | ||
334 | |||
335 | config X86_MCE | ||
336 | bool "Machine check support" if EMBEDDED | ||
337 | default y | ||
338 | help | ||
339 | Include a machine check error handler to report hardware errors. | ||
340 | This version will require the mcelog utility to decode some | ||
341 | machine check error logs. See | ||
342 | ftp://ftp.x86-64.org/pub/linux/tools/mcelog | ||
343 | |||
344 | config X86_MCE_INTEL | ||
345 | bool "Intel MCE features" | ||
346 | depends on X86_MCE && X86_LOCAL_APIC | ||
347 | default y | ||
348 | help | ||
349 | Additional support for Intel-specific MCE features such as | ||
350 | the thermal monitor. | ||
351 | |||
352 | config SECCOMP | ||
353 | bool "Enable seccomp to safely compute untrusted bytecode" | ||
354 | depends on PROC_FS | ||
355 | default y | ||
356 | help | ||
357 | This kernel feature is useful for number crunching applications | ||
358 | that may need to compute untrusted bytecode during their | ||
359 | execution. By using pipes or other transports made available to | ||
360 | the process as file descriptors supporting the read/write | ||
361 | syscalls, it's possible to isolate those applications in | ||
362 | their own address space using seccomp. Once seccomp is | ||
363 | enabled via /proc/<pid>/seccomp, it cannot be disabled | ||
364 | and the task is only allowed to execute a few safe syscalls | ||
365 | defined by each seccomp mode. | ||
366 | |||
367 | If unsure, say Y. Only embedded systems should say N here. | ||
368 | |||
369 | endmenu | ||
370 | |||
371 | # | ||
372 | # Use the generic interrupt handling code in kernel/irq/: | ||
373 | # | ||
374 | config GENERIC_HARDIRQS | ||
375 | bool | ||
376 | default y | ||
377 | |||
378 | config GENERIC_IRQ_PROBE | ||
379 | bool | ||
380 | default y | ||
381 | |||
382 | menu "Power management options" | ||
383 | |||
384 | source kernel/power/Kconfig | ||
385 | |||
386 | source "drivers/acpi/Kconfig" | ||
387 | |||
388 | source "arch/x86_64/kernel/cpufreq/Kconfig" | ||
389 | |||
390 | endmenu | ||
391 | |||
392 | menu "Bus options (PCI etc.)" | ||
393 | |||
394 | config PCI | ||
395 | bool "PCI support" | ||
396 | |||
397 | # x86-64 doesn't support PCI BIOS access from long mode so always go direct. | ||
398 | config PCI_DIRECT | ||
399 | bool | ||
400 | depends on PCI | ||
401 | default y | ||
402 | |||
403 | config PCI_MMCONFIG | ||
404 | bool "Support mmconfig PCI config space access" | ||
405 | depends on PCI | ||
406 | select ACPI_BOOT | ||
407 | |||
408 | config UNORDERED_IO | ||
409 | bool "Unordered IO mapping access" | ||
410 | depends on EXPERIMENTAL | ||
411 | help | ||
412 | Use unordered stores to access IO memory mappings in device drivers. | ||
413 | Still very experimental. When a driver works on IA64/ppc64/pa-risc it should | ||
414 | work with this option, but it makes the drivers behave differently | ||
415 | from i386. Requires that the driver writer used memory barriers | ||
416 | properly. | ||
417 | |||
418 | source "drivers/pci/pcie/Kconfig" | ||
419 | |||
420 | source "drivers/pci/Kconfig" | ||
421 | |||
422 | source "drivers/pcmcia/Kconfig" | ||
423 | |||
424 | source "drivers/pci/hotplug/Kconfig" | ||
425 | |||
426 | endmenu | ||
427 | |||
428 | |||
429 | menu "Executable file formats / Emulations" | ||
430 | |||
431 | source "fs/Kconfig.binfmt" | ||
432 | |||
433 | config IA32_EMULATION | ||
434 | bool "IA32 Emulation" | ||
435 | help | ||
436 | Include code to run 32-bit programs under a 64-bit kernel. You should likely | ||
437 | turn this on, unless you're 100% sure that you don't have any 32-bit programs | ||
438 | left. | ||
439 | |||
440 | config IA32_AOUT | ||
441 | bool "IA32 a.out support" | ||
442 | depends on IA32_EMULATION | ||
443 | help | ||
444 | Support old a.out binaries in the 32bit emulation. | ||
445 | |||
446 | config COMPAT | ||
447 | bool | ||
448 | depends on IA32_EMULATION | ||
449 | default y | ||
450 | |||
451 | config SYSVIPC_COMPAT | ||
452 | bool | ||
453 | depends on COMPAT && SYSVIPC | ||
454 | default y | ||
455 | |||
456 | config UID16 | ||
457 | bool | ||
458 | depends on IA32_EMULATION | ||
459 | default y | ||
460 | |||
461 | endmenu | ||
462 | |||
463 | source drivers/Kconfig | ||
464 | |||
465 | source "drivers/firmware/Kconfig" | ||
466 | |||
467 | source fs/Kconfig | ||
468 | |||
469 | source "arch/x86_64/oprofile/Kconfig" | ||
470 | |||
471 | source "arch/x86_64/Kconfig.debug" | ||
472 | |||
473 | source "security/Kconfig" | ||
474 | |||
475 | source "crypto/Kconfig" | ||
476 | |||
477 | source "lib/Kconfig" | ||
diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug new file mode 100644 index 000000000000..9cf1410d2f5a --- /dev/null +++ b/arch/x86_64/Kconfig.debug | |||
@@ -0,0 +1,57 @@ | |||
1 | menu "Kernel hacking" | ||
2 | |||
3 | source "lib/Kconfig.debug" | ||
4 | |||
5 | # !SMP for now because the context switch early causes GPF in segment reloading | ||
6 | # and the GS base checking does the wrong thing then, causing a hang. | ||
7 | config CHECKING | ||
8 | bool "Additional run-time checks" | ||
9 | depends on DEBUG_KERNEL && !SMP | ||
10 | help | ||
11 | Enables some internal consistency checks for kernel debugging. | ||
12 | You should normally say N. | ||
13 | |||
14 | config INIT_DEBUG | ||
15 | bool "Debug __init statements" | ||
16 | depends on DEBUG_KERNEL | ||
17 | help | ||
18 | Fill __init and __initdata at the end of boot. This helps debugging | ||
19 | illegal uses of __init and __initdata after initialization. | ||
20 | |||
21 | config IOMMU_DEBUG | ||
22 | depends on GART_IOMMU && DEBUG_KERNEL | ||
23 | bool "Enable IOMMU debugging" | ||
24 | help | ||
25 | Force the IOMMU on even when you have less than 4GB of | ||
26 | memory and add debugging code. Always panic on overflow, and | ||
27 | allow IOMMU leak tracing to be enabled. Can be disabled at boot | ||
28 | time with iommu=noforce. This will also enable scatter gather | ||
29 | list merging. Currently not recommended for production | ||
30 | code. When you use it make sure you have a big enough | ||
31 | IOMMU/AGP aperture. Most of the options enabled by this can | ||
32 | be set more finegrained using the iommu= command line | ||
33 | options. See Documentation/x86_64/boot-options.txt for more | ||
34 | details. | ||
35 | |||
36 | config KPROBES | ||
37 | bool "Kprobes" | ||
38 | depends on DEBUG_KERNEL | ||
39 | help | ||
40 | Kprobes allows you to trap at almost any kernel address and | ||
41 | execute a callback function. register_kprobe() establishes | ||
42 | a probepoint and specifies the callback. Kprobes is useful | ||
43 | for kernel debugging, non-intrusive instrumentation and testing. | ||
44 | If in doubt, say "N". | ||
45 | |||
46 | config IOMMU_LEAK | ||
47 | bool "IOMMU leak tracing" | ||
48 | depends on DEBUG_KERNEL | ||
49 | depends on IOMMU_DEBUG | ||
50 | help | ||
51 | Add a simple leak tracer to the IOMMU code. This is useful when you | ||
52 | are debugging a buggy device driver that leaks IOMMU mappings. | ||
53 | |||
54 | #config X86_REMOTE_DEBUG | ||
55 | # bool "kgdb debugging stub" | ||
56 | |||
57 | endmenu | ||
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile new file mode 100644 index 000000000000..6f90c246c418 --- /dev/null +++ b/arch/x86_64/Makefile | |||
@@ -0,0 +1,119 @@ | |||
1 | # | ||
2 | # x86_64/Makefile | ||
3 | # | ||
4 | # This file is included by the global makefile so that you can add your own | ||
5 | # architecture-specific flags and dependencies. Remember to have actions | ||
6 | # for "archclean" and "archdep" for cleaning up and making dependencies for | ||
7 | # this architecture | ||
8 | # | ||
9 | # This file is subject to the terms and conditions of the GNU General Public | ||
10 | # License. See the file "COPYING" in the main directory of this archive | ||
11 | # for more details. | ||
12 | # | ||
13 | # Copyright (C) 1994 by Linus Torvalds | ||
14 | # | ||
15 | # 19990713 Artur Skawina <skawina@geocities.com> | ||
16 | # Added '-march' and '-mpreferred-stack-boundary' support | ||
17 | # 20000913 Pavel Machek <pavel@suse.cz> | ||
18 | # Converted for x86_64 architecture | ||
19 | # 20010105 Andi Kleen, add IA32 compiler. | ||
20 | # ....and later removed it again.... | ||
21 | # | ||
22 | # $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $ | ||
23 | |||
24 | # | ||
25 | # early bootup linking needs 32bit. You can either use real 32bit tools | ||
26 | # here or 64bit tools in 32bit mode. | ||
27 | # | ||
28 | IA32_CC := $(CC) $(CPPFLAGS) -m32 -O2 -fomit-frame-pointer | ||
29 | IA32_LD := $(LD) -m elf_i386 | ||
30 | IA32_AS := $(CC) $(AFLAGS) -m32 -Wa,--32 -traditional -c | ||
31 | IA32_OBJCOPY := $(CROSS_COMPILE)objcopy | ||
32 | IA32_CPP := $(CROSS_COMPILE)gcc -m32 -E | ||
33 | export IA32_CC IA32_LD IA32_AS IA32_OBJCOPY IA32_CPP | ||
34 | |||
35 | |||
36 | LDFLAGS := -m elf_x86_64 | ||
37 | OBJCOPYFLAGS := -O binary -R .note -R .comment -S | ||
38 | LDFLAGS_vmlinux := -e stext | ||
39 | |||
40 | CHECKFLAGS += -D__x86_64__ -m64 | ||
41 | |||
42 | cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) | ||
43 | cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) | ||
44 | CFLAGS += $(cflags-y) | ||
45 | |||
46 | CFLAGS += -mno-red-zone | ||
47 | CFLAGS += -mcmodel=kernel | ||
48 | CFLAGS += -pipe | ||
49 | # this makes reading assembly source easier, but produces worse code | ||
50 | # actually it makes the kernel smaller too. | ||
51 | CFLAGS += -fno-reorder-blocks | ||
52 | CFLAGS += -Wno-sign-compare | ||
53 | ifneq ($(CONFIG_DEBUG_INFO),y) | ||
54 | CFLAGS += -fno-asynchronous-unwind-tables | ||
55 | # -fweb shrinks the kernel a bit, but the difference is very small | ||
56 | # it also messes up debugging, so don't use it for now. | ||
57 | #CFLAGS += $(call cc-option,-fweb) | ||
58 | endif | ||
59 | # -funit-at-a-time shrinks the kernel .text considerably | ||
60 | # unfortunately it makes reading oopses harder. | ||
61 | CFLAGS += $(call cc-option,-funit-at-a-time) | ||
62 | # prevent gcc from generating any FP code by mistake | ||
63 | CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) | ||
64 | |||
65 | head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o | ||
66 | |||
67 | libs-y += arch/x86_64/lib/ | ||
68 | core-y += arch/x86_64/kernel/ arch/x86_64/mm/ | ||
69 | core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/ | ||
70 | drivers-$(CONFIG_PCI) += arch/x86_64/pci/ | ||
71 | drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/ | ||
72 | |||
73 | boot := arch/x86_64/boot | ||
74 | |||
75 | .PHONY: bzImage bzlilo install archmrproper \ | ||
76 | fdimage fdimage144 fdimage288 archclean | ||
77 | |||
78 | #Default target when executing "make" | ||
79 | all: bzImage | ||
80 | |||
81 | BOOTIMAGE := arch/x86_64/boot/bzImage | ||
82 | KBUILD_IMAGE := $(BOOTIMAGE) | ||
83 | |||
84 | bzImage: vmlinux | ||
85 | $(Q)$(MAKE) $(build)=$(boot) $(BOOTIMAGE) | ||
86 | |||
87 | bzlilo: vmlinux | ||
88 | $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zlilo | ||
89 | |||
90 | bzdisk: vmlinux | ||
91 | $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zdisk | ||
92 | |||
93 | install fdimage fdimage144 fdimage288: vmlinux | ||
94 | $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@ | ||
95 | |||
96 | archclean: | ||
97 | $(Q)$(MAKE) $(clean)=$(boot) | ||
98 | |||
99 | prepare: include/asm-$(ARCH)/offset.h | ||
100 | |||
101 | arch/$(ARCH)/kernel/asm-offsets.s: include/asm include/linux/version.h \ | ||
102 | include/config/MARKER | ||
103 | |||
104 | include/asm-$(ARCH)/offset.h: arch/$(ARCH)/kernel/asm-offsets.s | ||
105 | $(call filechk,gen-asm-offsets) | ||
106 | |||
107 | CLEAN_FILES += include/asm-$(ARCH)/offset.h | ||
108 | |||
109 | define archhelp | ||
110 | echo '* bzImage - Compressed kernel image (arch/$(ARCH)/boot/bzImage)' | ||
111 | echo ' install - Install kernel using' | ||
112 | echo ' (your) ~/bin/installkernel or' | ||
113 | echo ' (distribution) /sbin/installkernel or' | ||
114 | echo ' install to $$(INSTALL_PATH) and run lilo' | ||
115 | endef | ||
116 | |||
117 | CLEAN_FILES += arch/$(ARCH)/boot/fdimage arch/$(ARCH)/boot/mtools.conf | ||
118 | |||
119 | |||
diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile new file mode 100644 index 000000000000..f4399c701b77 --- /dev/null +++ b/arch/x86_64/boot/Makefile | |||
@@ -0,0 +1,102 @@ | |||
1 | # | ||
2 | # arch/x86_64/boot/Makefile | ||
3 | # | ||
4 | # This file is subject to the terms and conditions of the GNU General Public | ||
5 | # License. See the file "COPYING" in the main directory of this archive | ||
6 | # for more details. | ||
7 | # | ||
8 | # Copyright (C) 1994 by Linus Torvalds | ||
9 | # | ||
10 | |||
11 | # ROOT_DEV specifies the default root-device when making the image. | ||
12 | # This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case | ||
13 | # the default of FLOPPY is used by 'build'. | ||
14 | |||
15 | ROOT_DEV := CURRENT | ||
16 | |||
17 | # If you want to preset the SVGA mode, uncomment the next line and | ||
18 | # set SVGA_MODE to whatever number you want. | ||
19 | # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. | ||
20 | # The number is the same as you would ordinarily press at bootup. | ||
21 | |||
22 | SVGA_MODE := -DSVGA_MODE=NORMAL_VGA | ||
23 | |||
24 | # If you want the RAM disk device, define this to be the size in blocks. | ||
25 | |||
26 | #RAMDISK := -DRAMDISK=512 | ||
27 | |||
28 | targets := vmlinux.bin bootsect bootsect.o \ | ||
29 | setup setup.o bzImage mtools.conf | ||
30 | |||
31 | EXTRA_CFLAGS := -m32 | ||
32 | |||
33 | hostprogs-y := tools/build | ||
34 | HOST_EXTRACFLAGS += $(LINUXINCLUDE) | ||
35 | subdir- := compressed/ #Let make clean descend in compressed/ | ||
36 | # --------------------------------------------------------------------------- | ||
37 | |||
38 | $(obj)/bzImage: IMAGE_OFFSET := 0x100000 | ||
39 | $(obj)/bzImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ | ||
40 | $(obj)/bzImage: BUILDFLAGS := -b | ||
41 | |||
42 | quiet_cmd_image = BUILD $@ | ||
43 | cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/bootsect $(obj)/setup \ | ||
44 | $(obj)/vmlinux.bin $(ROOT_DEV) > $@ | ||
45 | |||
46 | $(obj)/bzImage: $(obj)/bootsect $(obj)/setup \ | ||
47 | $(obj)/vmlinux.bin $(obj)/tools/build FORCE | ||
48 | $(call if_changed,image) | ||
49 | @echo 'Kernel: $@ is ready' | ||
50 | |||
51 | $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE | ||
52 | $(call if_changed,objcopy) | ||
53 | |||
54 | LDFLAGS_bootsect := -Ttext 0x0 -s --oformat binary | ||
55 | LDFLAGS_setup := -Ttext 0x0 -s --oformat binary -e begtext | ||
56 | |||
57 | $(obj)/setup $(obj)/bootsect: %: %.o FORCE | ||
58 | $(call if_changed,ld) | ||
59 | |||
60 | $(obj)/compressed/vmlinux: FORCE | ||
61 | $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ | ||
62 | |||
63 | # Set this if you want to pass append arguments to the zdisk/fdimage kernel | ||
64 | FDARGS = | ||
65 | |||
66 | $(obj)/mtools.conf: $(src)/mtools.conf.in | ||
67 | sed -e 's|@OBJ@|$(obj)|g' < $< > $@ | ||
68 | |||
69 | # This requires write access to /dev/fd0 | ||
70 | zdisk: $(BOOTIMAGE) $(obj)/mtools.conf | ||
71 | MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync | ||
72 | syslinux /dev/fd0 ; sync | ||
73 | echo 'default linux $(FDARGS)' | \ | ||
74 | MTOOLSRC=$(obj)/mtools.conf mcopy - a:syslinux.cfg | ||
75 | MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux ; sync | ||
76 | |||
77 | # These require being root or having syslinux 2.02 or higher installed | ||
78 | fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf | ||
79 | dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440 | ||
80 | MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync | ||
81 | syslinux $(obj)/fdimage ; sync | ||
82 | echo 'default linux $(FDARGS)' | \ | ||
83 | MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg | ||
84 | MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux ; sync | ||
85 | |||
86 | fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf | ||
87 | dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880 | ||
88 | MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync | ||
89 | syslinux $(obj)/fdimage ; sync | ||
90 | echo 'default linux $(FDARGS)' | \ | ||
91 | MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg | ||
92 | MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux ; sync | ||
93 | |||
94 | zlilo: $(BOOTIMAGE) | ||
95 | if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi | ||
96 | if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi | ||
97 | cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz | ||
98 | cp System.map $(INSTALL_PATH)/ | ||
99 | if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi | ||
100 | |||
101 | install: $(BOOTIMAGE) | ||
102 | sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)" | ||
diff --git a/arch/x86_64/boot/bootsect.S b/arch/x86_64/boot/bootsect.S new file mode 100644 index 000000000000..bb15d406ee95 --- /dev/null +++ b/arch/x86_64/boot/bootsect.S | |||
@@ -0,0 +1,98 @@ | |||
1 | /* | ||
2 | * bootsect.S Copyright (C) 1991, 1992 Linus Torvalds | ||
3 | * | ||
4 | * modified by Drew Eckhardt | ||
5 | * modified by Bruce Evans (bde) | ||
6 | * modified by Chris Noe (May 1999) (as86 -> gas) | ||
7 | * gutted by H. Peter Anvin (Jan 2003) | ||
8 | * | ||
9 | * BIG FAT NOTE: We're in real mode using 64k segments. Therefore segment | ||
10 | * addresses must be multiplied by 16 to obtain their respective linear | ||
11 | * addresses. To avoid confusion, linear addresses are written using leading | ||
12 | * hex while segment addresses are written as segment:offset. | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <asm/boot.h> | ||
17 | |||
18 | SETUPSECTS = 4 /* default nr of setup-sectors */ | ||
19 | BOOTSEG = 0x07C0 /* original address of boot-sector */ | ||
20 | INITSEG = DEF_INITSEG /* we move boot here - out of the way */ | ||
21 | SETUPSEG = DEF_SETUPSEG /* setup starts here */ | ||
22 | SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */ | ||
23 | SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ | ||
24 | /* to be loaded */ | ||
25 | ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ | ||
26 | SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */ | ||
27 | |||
28 | #ifndef SVGA_MODE | ||
29 | #define SVGA_MODE ASK_VGA | ||
30 | #endif | ||
31 | |||
32 | #ifndef RAMDISK | ||
33 | #define RAMDISK 0 | ||
34 | #endif | ||
35 | |||
36 | #ifndef ROOT_RDONLY | ||
37 | #define ROOT_RDONLY 1 | ||
38 | #endif | ||
39 | |||
40 | .code16 | ||
41 | .text | ||
42 | |||
43 | .global _start | ||
44 | _start: | ||
45 | |||
46 | # Normalize the start address | ||
47 | jmpl $BOOTSEG, $start2 | ||
48 | |||
49 | start2: | ||
50 | movw %cs, %ax | ||
51 | movw %ax, %ds | ||
52 | movw %ax, %es | ||
53 | movw %ax, %ss | ||
54 | movw $0x7c00, %sp | ||
55 | sti | ||
56 | cld | ||
57 | |||
58 | movw $bugger_off_msg, %si | ||
59 | |||
60 | msg_loop: | ||
61 | lodsb | ||
62 | andb %al, %al | ||
63 | jz die | ||
64 | movb $0xe, %ah | ||
65 | movw $7, %bx | ||
66 | int $0x10 | ||
67 | jmp msg_loop | ||
68 | |||
69 | die: | ||
70 | # Allow the user to press a key, then reboot | ||
71 | xorw %ax, %ax | ||
72 | int $0x16 | ||
73 | int $0x19 | ||
74 | |||
75 | # int 0x19 should never return. In case it does anyway, | ||
76 | # invoke the BIOS reset code... | ||
77 | ljmp $0xf000,$0xfff0 | ||
78 | |||
79 | |||
80 | bugger_off_msg: | ||
81 | .ascii "Direct booting from floppy is no longer supported.\r\n" | ||
82 | .ascii "Please use a boot loader program instead.\r\n" | ||
83 | .ascii "\n" | ||
84 | .ascii "Remove disk and press any key to reboot . . .\r\n" | ||
85 | .byte 0 | ||
86 | |||
87 | |||
88 | # Kernel attributes; used by setup | ||
89 | |||
90 | .org 497 | ||
91 | setup_sects: .byte SETUPSECTS | ||
92 | root_flags: .word ROOT_RDONLY | ||
93 | syssize: .word SYSSIZE | ||
94 | swap_dev: .word SWAP_DEV | ||
95 | ram_size: .word RAMDISK | ||
96 | vid_mode: .word SVGA_MODE | ||
97 | root_dev: .word ROOT_DEV | ||
98 | boot_flag: .word 0xAA55 | ||
diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile new file mode 100644 index 000000000000..f89d96f11a9f --- /dev/null +++ b/arch/x86_64/boot/compressed/Makefile | |||
@@ -0,0 +1,32 @@ | |||
1 | # | ||
2 | # linux/arch/x86_64/boot/compressed/Makefile | ||
3 | # | ||
4 | # create a compressed vmlinux image from the original vmlinux | ||
5 | # | ||
6 | # Note all the files here are compiled/linked as 32bit executables. | ||
7 | # | ||
8 | |||
9 | targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o | ||
10 | EXTRA_AFLAGS := -traditional -m32 | ||
11 | |||
12 | # cannot use EXTRA_CFLAGS because base CFLAGS contains -mcmodel=kernel which conflicts with | ||
13 | # -m32 | ||
14 | CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing | ||
15 | LDFLAGS := -m elf_i386 | ||
16 | |||
17 | LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 -m elf_i386 | ||
18 | |||
19 | $(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE | ||
20 | $(call if_changed,ld) | ||
21 | @: | ||
22 | |||
23 | $(obj)/vmlinux.bin: vmlinux FORCE | ||
24 | $(call if_changed,objcopy) | ||
25 | |||
26 | $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE | ||
27 | $(call if_changed,gzip) | ||
28 | |||
29 | LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T | ||
30 | |||
31 | $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE | ||
32 | $(call if_changed,ld) | ||
diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S new file mode 100644 index 000000000000..27264dbd575c --- /dev/null +++ b/arch/x86_64/boot/compressed/head.S | |||
@@ -0,0 +1,142 @@ | |||
1 | /* | ||
2 | * linux/boot/head.S | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992, 1993 Linus Torvalds | ||
5 | * | ||
6 | * $Id: head.S,v 1.3 2001/04/20 00:59:28 ak Exp $ | ||
7 | */ | ||
8 | |||
9 | /* | ||
10 | * head.S contains the 32-bit startup code. | ||
11 | * | ||
12 | * NOTE!!! Startup happens at absolute address 0x00001000, which is also where | ||
13 | * the page directory will exist. The startup code will be overwritten by | ||
14 | * the page directory. [According to comments etc elsewhere on a compressed | ||
15 | * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC] | ||
16 | * | ||
17 | * Page 0 is deliberately kept safe, since System Management Mode code in | ||
18 | * laptops may need to access the BIOS data stored there. This is also | ||
19 | * useful for future device drivers that access the BIOS via VM86 | ||
20 | * mode. | ||
21 | */ | ||
22 | |||
23 | /* | ||
24 | * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 | ||
25 | */ | ||
26 | .code32 | ||
27 | .text | ||
28 | |||
29 | #include <linux/linkage.h> | ||
30 | #include <asm/segment.h> | ||
31 | |||
32 | .code32 | ||
33 | .globl startup_32 | ||
34 | |||
35 | startup_32: # 32-bit entry: set up segments and stack, clear BSS, decompress, jump to the kernel | ||
36 | cld | ||
37 | cli | ||
38 | movl $(__KERNEL_DS),%eax # same selector goes into every data segment register | ||
39 | movl %eax,%ds | ||
40 | movl %eax,%es | ||
41 | movl %eax,%fs | ||
42 | movl %eax,%gs | ||
43 | |||
44 | lss stack_start,%esp # load %ss:%esp from the stack_start pointer defined at the end of this file | ||
45 | xorl %eax,%eax | ||
46 | 1: incl %eax # check that A20 really IS enabled | ||
47 | movl %eax,0x000000 # loop forever if it isn't | ||
48 | cmpl %eax,0x100000 # equal means address 0x100000 reads back what was written to address 0 | ||
49 | je 1b | ||
50 | |||
51 | /* | ||
52 | * Initialize eflags. Some BIOS's leave bits like NT set. This would | ||
53 | * confuse the debugger if this code is traced. | ||
54 | * XXX - best to initialize before switching to protected mode. | ||
55 | */ | ||
56 | pushl $0 | ||
57 | popfl | ||
58 | /* | ||
59 | * Clear BSS | ||
60 | */ | ||
61 | xorl %eax,%eax # fill value 0 | ||
62 | movl $_edata,%edi # start of the region to clear | ||
63 | movl $_end,%ecx | ||
64 | subl %edi,%ecx # byte count = _end - _edata | ||
65 | cld | ||
66 | rep | ||
67 | stosb | ||
68 | /* | ||
69 | * Do the decompression, and jump to the new kernel.. | ||
70 | */ | ||
71 | subl $16,%esp # place for structure on the stack | ||
72 | movl %esp,%eax | ||
73 | pushl %esi # real mode pointer as second arg | ||
74 | pushl %eax # address of structure as first arg | ||
75 | call decompress_kernel | ||
76 | orl %eax,%eax # nonzero return value: take the loaded-high path at 3: | ||
77 | jnz 3f | ||
78 | addl $8,%esp # drop the two pushed arguments | ||
79 | xorl %ebx,%ebx | ||
80 | ljmp $(__KERNEL_CS), $0x100000 # enter the decompressed kernel at 0x100000 | ||
81 | |||
82 | /* | ||
83 | * We come here, if we were loaded high. | ||
84 | * We need to move the move-in-place routine down to 0x1000 | ||
85 | * and then start it with the buffer addresses in registers, | ||
86 | * which we got from the stack. | ||
87 | */ | ||
88 | 3: | ||
89 | movl %esi,%ebx # keep the real mode pointer; the move routine copies it back to %esi | ||
90 | movl $move_routine_start,%esi # source: the move routine template below | ||
91 | movl $0x1000,%edi # destination: address 0x1000 | ||
92 | movl $move_routine_end,%ecx | ||
93 | subl %esi,%ecx # length of the template in bytes | ||
94 | addl $3,%ecx # round up to whole 4-byte units | ||
95 | shrl $2,%ecx | ||
96 | cld | ||
97 | rep | ||
98 | movsl | ||
99 | |||
100 | popl %esi # discard the address | ||
101 | addl $4,%esp # real mode pointer | ||
102 | popl %esi # low_buffer_start | ||
103 | popl %ecx # lcount | ||
104 | popl %edx # high_buffer_start | ||
105 | popl %eax # hcount | ||
106 | movl $0x100000,%edi # destination for the kernel image | ||
107 | cli # make sure we don't get interrupted | ||
108 | ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine | ||
109 | |||
110 | /* | ||
111 | * Routine (template) for moving the decompressed kernel in place, | ||
112 | * if we were high loaded. This _must_ be PIC code! | ||
113 | */ | ||
114 | move_routine_start: # in: %esi=low buffer, %ecx=lcount, %edx=high buffer, %eax=hcount, %edi=destination, %ebx=real mode pointer | ||
115 | movl %ecx,%ebp # remember lcount for the byte remainder | ||
116 | shrl $2,%ecx # low buffer: whole 4-byte units first | ||
117 | rep | ||
118 | movsl | ||
119 | movl %ebp,%ecx | ||
120 | andl $3,%ecx # then the remaining 0..3 bytes | ||
121 | rep | ||
122 | movsb | ||
123 | movl %edx,%esi # continue with the high buffer as source | ||
124 | movl %eax,%ecx # NOTE: rep movsb won't move if %ecx == 0 | ||
125 | addl $3,%ecx # hcount rounded up to whole 4-byte units | ||
126 | shrl $2,%ecx | ||
127 | rep | ||
128 | movsl | ||
129 | movl %ebx,%esi # Restore setup pointer | ||
130 | xorl %ebx,%ebx | ||
131 | ljmp $(__KERNEL_CS), $0x100000 # enter the kernel that was just moved to 0x100000 | ||
132 | move_routine_end: | ||
133 | |||
134 | |||
135 | /* Stack for uncompression */ | ||
136 | .align 32 | ||
137 | user_stack: | ||
138 | .fill 4096,4,0 # 4096 zero-filled 4-byte units | ||
139 | stack_start: # %ss:%esp pair loaded by the lss in startup_32 | ||
140 | .long user_stack+4096 # NOTE(review): 4096 bytes above user_stack, not the end of the 4096*4-byte area; confirm this is intended | ||
141 | .word __KERNEL_DS | ||
142 | |||
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c new file mode 100644 index 000000000000..c8b9216f9e63 --- /dev/null +++ b/arch/x86_64/boot/compressed/misc.c | |||
@@ -0,0 +1,354 @@ | |||
1 | /* | ||
2 | * misc.c | ||
3 | * | ||
4 | * This is a collection of several routines from gzip-1.0.3 | ||
5 | * adapted for Linux. | ||
6 | * | ||
7 | * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 | ||
8 | * puts by Nick Holloway 1993, better puts by Martin Mares 1995 | ||
9 | * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 | ||
10 | */ | ||
11 | |||
12 | #include "miscsetup.h" | ||
13 | #include <asm/io.h> | ||
14 | |||
15 | /* | ||
16 | * gzip declarations | ||
17 | */ | ||
18 | |||
19 | #define OF(args) args | ||
20 | #define STATIC static | ||
21 | |||
22 | #undef memset | ||
23 | #undef memcpy | ||
24 | #define memzero(s, n) memset ((s), 0, (n)) | ||
25 | |||
26 | typedef unsigned char uch; | ||
27 | typedef unsigned short ush; | ||
28 | typedef unsigned long ulg; | ||
29 | |||
30 | #define WSIZE 0x8000 /* Window size must be at least 32k, */ | ||
31 | /* and a power of two */ | ||
32 | |||
33 | static uch *inbuf; /* input buffer */ | ||
34 | static uch window[WSIZE]; /* Sliding window buffer */ | ||
35 | |||
36 | static unsigned insize = 0; /* valid bytes in inbuf */ | ||
37 | static unsigned inptr = 0; /* index of next byte to be processed in inbuf */ | ||
38 | static unsigned outcnt = 0; /* bytes in output buffer */ | ||
39 | |||
40 | /* gzip flag byte */ | ||
41 | #define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ | ||
42 | #define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */ | ||
43 | #define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ | ||
44 | #define ORIG_NAME 0x08 /* bit 3 set: original file name present */ | ||
45 | #define COMMENT 0x10 /* bit 4 set: file comment present */ | ||
46 | #define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */ | ||
47 | #define RESERVED 0xC0 /* bit 6,7: reserved */ | ||
48 | |||
49 | #define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf()) | ||
50 | |||
51 | /* Diagnostic functions */ | ||
52 | #ifdef DEBUG | ||
53 | # define Assert(cond,msg) {if(!(cond)) error(msg);} | ||
54 | # define Trace(x) fprintf x | ||
55 | # define Tracev(x) {if (verbose) fprintf x ;} | ||
56 | # define Tracevv(x) {if (verbose>1) fprintf x ;} | ||
57 | # define Tracec(c,x) {if (verbose && (c)) fprintf x ;} | ||
58 | # define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;} | ||
59 | #else | ||
60 | # define Assert(cond,msg) | ||
61 | # define Trace(x) | ||
62 | # define Tracev(x) | ||
63 | # define Tracevv(x) | ||
64 | # define Tracec(c,x) | ||
65 | # define Tracecv(c,x) | ||
66 | #endif | ||
67 | |||
68 | static int fill_inbuf(void); | ||
69 | static void flush_window(void); | ||
70 | static void error(char *m); | ||
71 | static void gzip_mark(void **); | ||
72 | static void gzip_release(void **); | ||
73 | |||
74 | /* | ||
75 | * This is set up by the setup-routine at boot-time | ||
76 | */ | ||
77 | static unsigned char *real_mode; /* Pointer to real-mode data */ | ||
78 | |||
79 | #define EXT_MEM_K (*(unsigned short *)(real_mode + 0x2)) | ||
80 | #ifndef STANDARD_MEMORY_BIOS_CALL | ||
81 | #define ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0)) | ||
82 | #endif | ||
83 | #define SCREEN_INFO (*(struct screen_info *)(real_mode+0)) | ||
84 | |||
85 | extern char input_data[]; | ||
86 | extern int input_len; | ||
87 | |||
88 | static long bytes_out = 0; | ||
89 | static uch *output_data; | ||
90 | static unsigned long output_ptr = 0; | ||
91 | |||
92 | static void *malloc(int size); | ||
93 | static void free(void *where); | ||
94 | |||
95 | static void putstr(const char *); | ||
96 | |||
97 | extern int end; | ||
98 | static long free_mem_ptr = (long)&end; | ||
99 | static long free_mem_end_ptr; | ||
100 | |||
101 | #define INPLACE_MOVE_ROUTINE 0x1000 | ||
102 | #define LOW_BUFFER_START 0x2000 | ||
103 | #define LOW_BUFFER_MAX 0x90000 | ||
104 | #define HEAP_SIZE 0x3000 | ||
105 | static unsigned int low_buffer_end, low_buffer_size; | ||
106 | static int high_loaded =0; | ||
107 | static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; | ||
108 | |||
109 | static char *vidmem = (char *)0xb8000; | ||
110 | static int vidport; | ||
111 | static int lines, cols; | ||
112 | |||
113 | #include "../../../../lib/inflate.c" | ||
114 | |||
115 | static void *malloc(int size) | ||
116 | { /* Bump allocator: returns the next size bytes at free_mem_ptr; every failure ends in error(), which does not return */ | ||
117 | void *p; | ||
118 | |||
119 | if (size <0) error("Malloc error"); | ||
120 | if (free_mem_ptr <= 0) error("Memory error"); | ||
121 | |||
122 | free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align up to a multiple of 4 */ | ||
123 | |||
124 | p = (void *)free_mem_ptr; | ||
125 | free_mem_ptr += size; /* the next allocation starts after this one */ | ||
126 | |||
127 | if (free_mem_ptr >= free_mem_end_ptr) /* limit is set by the setup_*_output_buffer routines */ | ||
128 | error("Out of memory"); | ||
129 | |||
130 | return p; | ||
131 | } | ||
132 | |||
133 | static void free(void *where) | ||
134 | { /* Don't care: nothing is released here; gzip_release() rewinds free_mem_ptr instead */ | ||
135 | } | ||
136 | |||
137 | static void gzip_mark(void **ptr) | ||
138 | { /* Record the current allocation point in *ptr */ | ||
139 | *ptr = (void *) free_mem_ptr; | ||
140 | } | ||
141 | |||
142 | static void gzip_release(void **ptr) | ||
143 | { /* Reset the allocation point to *ptr, discarding everything allocated after it */ | ||
144 | free_mem_ptr = (long) *ptr; | ||
145 | } | ||
146 | |||
147 | static void scroll(void) | ||
148 | { /* Move the screen contents up by one row and blank the last row */ | ||
149 | int i; | ||
150 | |||
151 | memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 ); /* a row is cols * 2 bytes; rows 1..lines-1 move to 0..lines-2 */ | ||
152 | for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 ) /* step 2: only every other byte of the last row is overwritten */ | ||
153 | vidmem[i] = ' '; | ||
154 | } | ||
155 | |||
156 | static void putstr(const char *s) | ||
157 | { /* Write s into video memory starting at the position saved in SCREEN_INFO, then update the cursor */ | ||
158 | int x,y,pos; | ||
159 | char c; | ||
160 | |||
161 | x = SCREEN_INFO.orig_x; /* continue from the stored column and row */ | ||
162 | y = SCREEN_INFO.orig_y; | ||
163 | |||
164 | while ( ( c = *s++ ) != '\0' ) { | ||
165 | if ( c == '\n' ) { /* newline: column 0 of the next row */ | ||
166 | x = 0; | ||
167 | if ( ++y >= lines ) { /* past the last row: scroll and stay on it */ | ||
168 | scroll(); | ||
169 | y--; | ||
170 | } | ||
171 | } else { | ||
172 | vidmem [ ( x + cols * y ) * 2 ] = c; /* each cell is 2 bytes; only the first one is written */ | ||
173 | if ( ++x >= cols ) { /* wrap at the right edge */ | ||
174 | x = 0; | ||
175 | if ( ++y >= lines ) { | ||
176 | scroll(); | ||
177 | y--; | ||
178 | } | ||
179 | } | ||
180 | } | ||
181 | } | ||
182 | |||
183 | SCREEN_INFO.orig_x = x; /* store the position back for the next call */ | ||
184 | SCREEN_INFO.orig_y = y; | ||
185 | |||
186 | pos = (x + cols * y) * 2; /* Update cursor position */ | ||
187 | outb_p(14, vidport); /* select register 14 through the index port */ | ||
188 | outb_p(0xff & (pos >> 9), vidport+1); /* bits 8-15 of pos/2 */ | ||
189 | outb_p(15, vidport); /* select register 15 */ | ||
190 | outb_p(0xff & (pos >> 1), vidport+1); /* bits 0-7 of pos/2 */ | ||
191 | } | ||
192 | |||
193 | void* memset(void* s, int c, unsigned n) | ||
194 | { /* Store the value c (converted to char) into each of the first n bytes at s; returns s */ | ||
195 | unsigned i; /* same type as the count n, so the comparison below involves no signed index */ | ||
196 | char *ss = (char*)s; | ||
197 | |||
198 | for (i=0;i<n;i++) ss[i] = c; | ||
199 | return s; | ||
200 | } | ||
201 | |||
202 | void* memcpy(void* dest, const void* src, unsigned n) | ||
203 | { /* Copy n bytes from src to dest one byte at a time, lowest address first; returns dest */ | ||
204 | unsigned i; /* same type as the count n, so the comparison below involves no signed index */ | ||
205 | char *d = (char *)dest, *s = (char *)src; | ||
206 | |||
207 | for (i=0;i<n;i++) d[i] = s[i]; | ||
208 | return dest; | ||
209 | } | ||
210 | |||
211 | /* =========================================================================== | ||
212 | * Fill the input buffer. This is called only when the buffer is empty | ||
213 | * and at least one byte is really needed. | ||
214 | */ | ||
215 | static int fill_inbuf(void) | ||
216 | { | ||
217 | if (insize != 0) { /* input was already handed over once, so it is exhausted */ | ||
218 | error("ran out of input data"); | ||
219 | } | ||
220 | |||
221 | inbuf = input_data; /* NOTE(review): input_data is char[], inbuf is uch *; no cast here */ | ||
222 | insize = input_len; /* the whole input becomes valid at once */ | ||
223 | inptr = 1; /* byte 0 is consumed by the return below */ | ||
224 | return inbuf[0]; | ||
225 | } | ||
226 | |||
227 | /* =========================================================================== | ||
228 | * Write the output window window[0..outcnt-1] and update crc and bytes_out. | ||
229 | * (Used for the decompressed data only.) | ||
230 | */ | ||
231 | static void flush_window_low(void) | ||
232 | { /* Used when high_loaded is 0: copy the window to output_data[output_ptr...] */ | ||
233 | ulg c = crc; /* temporary variable */ | ||
234 | unsigned n; | ||
235 | uch *in, *out, ch; | ||
236 | |||
237 | in = window; | ||
238 | out = &output_data[output_ptr]; /* continue where the previous flush stopped */ | ||
239 | for (n = 0; n < outcnt; n++) { | ||
240 | ch = *out++ = *in++; | ||
241 | c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); /* table lookup folds one byte into the running crc */ | ||
242 | } | ||
243 | crc = c; | ||
244 | bytes_out += (ulg)outcnt; | ||
245 | output_ptr += (ulg)outcnt; | ||
246 | outcnt = 0; /* window is empty again */ | ||
247 | } | ||
248 | |||
249 | static void flush_window_high(void) | ||
250 | { /* Used when high_loaded is set: output_data itself advances and switches from the low to the high buffer */ | ||
251 | ulg c = crc; /* temporary variable */ | ||
252 | unsigned n; | ||
253 | uch *in, ch; | ||
254 | in = window; | ||
255 | for (n = 0; n < outcnt; n++) { | ||
256 | ch = *output_data++ = *in++; | ||
257 | if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start; /* low buffer is full: continue in the high buffer */ | ||
258 | c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); /* table lookup folds one byte into the running crc */ | ||
259 | } | ||
260 | crc = c; | ||
261 | bytes_out += (ulg)outcnt; | ||
262 | outcnt = 0; /* window is empty again */ | ||
263 | } | ||
264 | |||
265 | static void flush_window(void) | ||
266 | { /* Dispatch to the flush variant matching how we were loaded */ | ||
267 | if (high_loaded) flush_window_high(); | ||
268 | else flush_window_low(); | ||
269 | } | ||
270 | |||
271 | static void error(char *x) | ||
272 | { /* Print the message x followed by a halt notice, then spin forever; never returns */ | ||
273 | putstr("\n\n"); | ||
274 | putstr(x); | ||
275 | putstr("\n\n -- System halted"); | ||
276 | |||
277 | while(1); /* deliberate endless loop */ | ||
278 | } | ||
279 | |||
280 | void setup_normal_output_buffer(void) | ||
281 | { /* Chosen by decompress_kernel() when free_mem_ptr is below 0x100000: output goes straight to 0x100000 */ | ||
282 | #ifdef STANDARD_MEMORY_BIOS_CALL | ||
283 | if (EXT_MEM_K < 1024) error("Less than 2MB of memory"); | ||
284 | #else | ||
285 | if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory"); /* the larger of the two reported values is checked */ | ||
286 | #endif | ||
287 | output_data = (char *)0x100000; /* Points to 1M; NOTE(review): cast is char * but output_data is uch * */ | ||
288 | free_mem_end_ptr = (long)real_mode; /* malloc() may hand out memory up to the real-mode data */ | ||
289 | } | ||
290 | |||
291 | struct moveparams { /* Two source buffers and their byte counts, filled in when running high; the caller supplies the storage */ | ||
292 | uch *low_buffer_start; int lcount; | ||
293 | uch *high_buffer_start; int hcount; | ||
294 | }; | ||
295 | |||
296 | void setup_output_buffer_if_we_run_high(struct moveparams *mv) | ||
297 | { /* Loaded-high case: split the output between a low buffer and a high buffer and describe both in *mv */ | ||
298 | high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE); /* first candidate: right after the heap that follows end */ | ||
299 | #ifdef STANDARD_MEMORY_BIOS_CALL | ||
300 | if (EXT_MEM_K < (3*1024)) error("Less than 4MB of memory"); | ||
301 | #else | ||
302 | if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory"); | ||
303 | #endif | ||
304 | mv->low_buffer_start = output_data = (char *)LOW_BUFFER_START; | ||
305 | low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX | ||
306 | ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff; /* smaller of real_mode and LOW_BUFFER_MAX, rounded down to a multiple of 0x1000 */ | ||
307 | low_buffer_size = low_buffer_end - LOW_BUFFER_START; | ||
308 | high_loaded = 1; | ||
309 | free_mem_end_ptr = (long)high_buffer_start; /* heap ends where the first high-buffer candidate begins */ | ||
310 | if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) { | ||
311 | high_buffer_start = (uch *)(0x100000 + low_buffer_size); /* place the high part where it belongs once the low part sits at 0x100000 */ | ||
312 | mv->hcount = 0; /* say: we need not to move high_buffer */ | ||
313 | } | ||
314 | else mv->hcount = -1; /* nonzero marker; close_output_buffer_if_we_run_high() replaces it with the real count */ | ||
315 | mv->high_buffer_start = high_buffer_start; | ||
316 | } | ||
317 | |||
318 | void close_output_buffer_if_we_run_high(struct moveparams *mv) | ||
319 | { /* After decompression: store the final byte counts of the low and high buffers in *mv */ | ||
320 | if (bytes_out > low_buffer_size) { /* output spilled into the high buffer */ | ||
321 | mv->lcount = low_buffer_size; | ||
322 | if (mv->hcount) /* hcount == 0 means the high part needs no move and stays 0 */ | ||
323 | mv->hcount = bytes_out - low_buffer_size; | ||
324 | } else { /* everything fitted in the low buffer */ | ||
325 | mv->lcount = bytes_out; | ||
326 | mv->hcount = 0; | ||
327 | } | ||
328 | } | ||
329 | |||
330 | int decompress_kernel(struct moveparams *mv, void *rmode) | ||
331 | { /* Set up screen output and the output buffer, run gunzip(), and return high_loaded (nonzero when *mv was filled in) */ | ||
332 | real_mode = rmode; /* must be set first: SCREEN_INFO and the memory-size macros read through it */ | ||
333 | |||
334 | if (SCREEN_INFO.orig_video_mode == 7) { /* mode 7 uses a different video buffer and port base */ | ||
335 | vidmem = (char *) 0xb0000; | ||
336 | vidport = 0x3b4; | ||
337 | } else { | ||
338 | vidmem = (char *) 0xb8000; | ||
339 | vidport = 0x3d4; | ||
340 | } | ||
341 | |||
342 | lines = SCREEN_INFO.orig_video_lines; | ||
343 | cols = SCREEN_INFO.orig_video_cols; | ||
344 | |||
345 | if (free_mem_ptr < 0x100000) setup_normal_output_buffer(); /* free_mem_ptr starts at &end, so this tests where we were loaded */ | ||
346 | else setup_output_buffer_if_we_run_high(mv); | ||
347 | |||
348 | makecrc(); | ||
349 | putstr(".\nDecompressing Linux..."); | ||
350 | gunzip(); | ||
351 | putstr("done.\nBooting the kernel.\n"); | ||
352 | if (high_loaded) close_output_buffer_if_we_run_high(mv); | ||
353 | return high_loaded; | ||
354 | } | ||
diff --git a/arch/x86_64/boot/compressed/miscsetup.h b/arch/x86_64/boot/compressed/miscsetup.h new file mode 100644 index 000000000000..bb1620531703 --- /dev/null +++ b/arch/x86_64/boot/compressed/miscsetup.h | |||
@@ -0,0 +1,39 @@ | |||
1 | #define NULL 0 | ||
2 | //typedef unsigned int size_t; | ||
3 | |||
4 | |||
5 | struct screen_info { /* Video parameters; misc.c overlays this on the start of the real-mode data (SCREEN_INFO) */ | ||
6 | unsigned char orig_x; /* 0x00 */ | ||
7 | unsigned char orig_y; /* 0x01 */ | ||
8 | unsigned short dontuse1; /* 0x02 -- EXT_MEM_K sits here */ | ||
9 | unsigned short orig_video_page; /* 0x04 */ | ||
10 | unsigned char orig_video_mode; /* 0x06 */ | ||
11 | unsigned char orig_video_cols; /* 0x07 */ | ||
12 | unsigned short unused2; /* 0x08 */ | ||
13 | unsigned short orig_video_ega_bx; /* 0x0a */ | ||
14 | unsigned short unused3; /* 0x0c */ | ||
15 | unsigned char orig_video_lines; /* 0x0e */ | ||
16 | unsigned char orig_video_isVGA; /* 0x0f */ | ||
17 | unsigned short orig_video_points; /* 0x10 */ | ||
18 | |||
19 | /* VESA graphic mode -- linear frame buffer */ | ||
20 | unsigned short lfb_width; /* 0x12 */ | ||
21 | unsigned short lfb_height; /* 0x14 */ | ||
22 | unsigned short lfb_depth; /* 0x16 */ | ||
23 | unsigned long lfb_base; /* 0x18 */ | ||
24 | unsigned long lfb_size; /* 0x1c */ | ||
25 | unsigned short dontuse2, dontuse3; /* 0x20 -- CL_MAGIC and CL_OFFSET here */ | ||
26 | unsigned short lfb_linelength; /* 0x24 */ | ||
27 | unsigned char red_size; /* 0x26 */ | ||
28 | unsigned char red_pos; /* 0x27 */ | ||
29 | unsigned char green_size; /* 0x28 */ | ||
30 | unsigned char green_pos; /* 0x29 */ | ||
31 | unsigned char blue_size; /* 0x2a */ | ||
32 | unsigned char blue_pos; /* 0x2b */ | ||
33 | unsigned char rsvd_size; /* 0x2c */ | ||
34 | unsigned char rsvd_pos; /* 0x2d */ | ||
35 | unsigned short vesapm_seg; /* 0x2e */ | ||
36 | unsigned short vesapm_off; /* 0x30 */ | ||
37 | unsigned short pages; /* 0x32 */ | ||
38 | /* 0x34 -- 0x3f reserved for future expansion */ | ||
39 | }; | ||
diff --git a/arch/x86_64/boot/compressed/vmlinux.scr b/arch/x86_64/boot/compressed/vmlinux.scr new file mode 100644 index 000000000000..1ed9d791f863 --- /dev/null +++ b/arch/x86_64/boot/compressed/vmlinux.scr | |||
@@ -0,0 +1,9 @@ | |||
1 | SECTIONS | ||
2 | { | ||
3 | .data : { | ||
4 | input_len = .; /* address of the length word emitted on the next line */ | ||
5 | LONG(input_data_end - input_data) input_data = .; /* 4-byte payload length, then the payload begins */ | ||
6 | *(.data) | ||
7 | input_data_end = .; /* first address after the payload */ | ||
8 | } | ||
9 | } | ||
diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh new file mode 100644 index 000000000000..90f2452b3b9e --- /dev/null +++ b/arch/x86_64/boot/install.sh | |||
@@ -0,0 +1,40 @@ | |||
1 | #!/bin/sh | ||
2 | # | ||
3 | # arch/x86_64/boot/install.sh | ||
4 | # | ||
5 | # This file is subject to the terms and conditions of the GNU General Public | ||
6 | # License. See the file "COPYING" in the main directory of this archive | ||
7 | # for more details. | ||
8 | # | ||
9 | # Copyright (C) 1995 by Linus Torvalds | ||
10 | # | ||
11 | # Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin | ||
12 | # | ||
13 | # "make install" script for i386 architecture | ||
14 | # | ||
15 | # Arguments: | ||
16 | # $1 - kernel version | ||
17 | # $2 - kernel image file | ||
18 | # $3 - kernel map file | ||
19 | # $4 - default install path (blank if root directory) | ||
20 | # | ||
21 | |||
22 | # User may have a custom install script | ||
23 | |||
24 | if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi | ||
25 | if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi | ||
26 | |||
27 | # Default install - same as make zlilo | ||
28 | |||
29 | if [ -f $4/vmlinuz ]; then | ||
30 | mv $4/vmlinuz $4/vmlinuz.old | ||
31 | fi | ||
32 | |||
33 | if [ -f $4/System.map ]; then | ||
34 | mv $4/System.map $4/System.old | ||
35 | fi | ||
36 | |||
37 | cat $2 > $4/vmlinuz | ||
38 | cp $3 $4/System.map | ||
39 | |||
40 | if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi | ||
diff --git a/arch/x86_64/boot/mtools.conf.in b/arch/x86_64/boot/mtools.conf.in new file mode 100644 index 000000000000..efd6d2490c1d --- /dev/null +++ b/arch/x86_64/boot/mtools.conf.in | |||
@@ -0,0 +1,17 @@ | |||
1 | # | ||
2 | # mtools configuration file for "make (b)zdisk" | ||
3 | # | ||
4 | |||
5 | # Actual floppy drive | ||
6 | drive a: | ||
7 | file="/dev/fd0" | ||
8 | |||
9 | # 1.44 MB floppy disk image | ||
10 | drive v: | ||
11 | file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=18 filter | ||
12 | |||
13 | # 2.88 MB floppy disk image (mostly for virtual uses) | ||
14 | drive w: | ||
15 | file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter | ||
16 | |||
17 | |||
diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S new file mode 100644 index 000000000000..3e838be9dbe7 --- /dev/null +++ b/arch/x86_64/boot/setup.S | |||
@@ -0,0 +1,867 @@ | |||
1 | /* | ||
2 | * setup.S Copyright (C) 1991, 1992 Linus Torvalds | ||
3 | * | ||
4 | * setup.s is responsible for getting the system data from the BIOS, | ||
5 | * and putting them into the appropriate places in system memory. | ||
6 | * Both setup.s and system have been loaded by the bootblock. | ||
7 | * | ||
8 | * This code asks the bios for memory/disk/other parameters, and | ||
9 | * puts them in a "safe" place: 0x90000-0x901FF, ie where the | ||
10 | * boot-block used to be. It is then up to the protected mode | ||
11 | * system to read them from there before the area is overwritten | ||
12 | * for buffer-blocks. | ||
13 | * | ||
14 | * Move PS/2 aux init code to psaux.c | ||
15 | * (troyer@saifr00.cfsat.Honeywell.COM) 03Oct92 | ||
16 | * | ||
17 | * some changes and additional features by Christoph Niemann, | ||
18 | * March 1993/June 1994 (Christoph.Niemann@linux.org) | ||
19 | * | ||
20 | * add APM BIOS checking by Stephen Rothwell, May 1994 | ||
21 | * (sfr@canb.auug.org.au) | ||
22 | * | ||
23 | * High load stuff, initrd support and position independency | ||
24 | * by Hans Lermen & Werner Almesberger, February 1996 | ||
25 | * <lermen@elserv.ffm.fgan.de>, <almesber@lrc.epfl.ch> | ||
26 | * | ||
27 | * Video handling moved to video.S by Martin Mares, March 1996 | ||
28 | * <mj@k332.feld.cvut.cz> | ||
29 | * | ||
30 | * Extended memory detection scheme retwiddled by orc@pell.chi.il.us (david | ||
31 | * parsons) to avoid loadlin confusion, July 1997 | ||
32 | * | ||
33 | * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999. | ||
34 | * <stiker@northlink.com> | ||
35 | * | ||
36 | * Fix to work around buggy BIOSes which don't use carry bit correctly | ||
37 | * and/or report extended memory in CX/DX for e801h memory size detection | ||
38 | * call. As a result the kernel got wrong figures. The int15/e801h docs | ||
39 | * from Ralf Brown interrupt list seem to indicate AX/BX should be used | ||
40 | * anyway. So to avoid breaking many machines (presumably there was a reason | ||
41 | * to originally use CX/DX instead of AX/BX), we do a kludge to see | ||
42 | * if CX/DX have been changed in the e801 call and if so use AX/BX . | ||
43 | * Michael Miller, April 2001 <michaelm@mjmm.org> | ||
44 | * | ||
45 | * Added long mode checking and SSE force. March 2003, Andi Kleen. | ||
46 | */ | ||
47 | |||
48 | #include <linux/config.h> | ||
49 | #include <asm/segment.h> | ||
50 | #include <linux/version.h> | ||
51 | #include <linux/compile.h> | ||
52 | #include <asm/boot.h> | ||
53 | #include <asm/e820.h> | ||
54 | #include <asm/page.h> | ||
55 | |||
56 | /* Signature words to ensure LILO loaded us right */ | ||
57 | #define SIG1 0xAA55 | ||
58 | #define SIG2 0x5A5A | ||
59 | |||
60 | INITSEG = DEF_INITSEG # 0x9000, we move boot here, out of the way | ||
61 | SYSSEG = DEF_SYSSEG # 0x1000, system loaded at 0x10000 (65536). | ||
62 | SETUPSEG = DEF_SETUPSEG # 0x9020, this is the current segment | ||
63 | # ... and the former contents of CS | ||
64 | |||
65 | DELTA_INITSEG = SETUPSEG - INITSEG # 0x0020 | ||
66 | |||
67 | .code16 | ||
68 | .globl begtext, begdata, begbss, endtext, enddata, endbss | ||
69 | |||
70 | .text | ||
71 | begtext: | ||
72 | .data | ||
73 | begdata: | ||
74 | .bss | ||
75 | begbss: | ||
76 | .text | ||
77 | |||
78 | start: | ||
79 | jmp trampoline | ||
80 | |||
81 | # This is the setup header, and it must start at %cs:2 (old 0x9020:2) | ||
82 | |||
83 | .ascii "HdrS" # header signature | ||
84 | .word 0x0203 # header version number (>= 0x0105) | ||
85 | # or else old loadlin-1.5 will fail) | ||
86 | realmode_swtch: .word 0, 0 # default_switch, SETUPSEG | ||
87 | start_sys_seg: .word SYSSEG | ||
88 | .word kernel_version # pointing to kernel version string | ||
89 | # above section of header is compatible | ||
90 | # with loadlin-1.5 (header v1.5). Don't | ||
91 | # change it. | ||
92 | |||
93 | type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin, | ||
94 | # Bootlin, SYSLX, bootsect...) | ||
95 | # See Documentation/i386/boot.txt for | ||
96 | # assigned ids | ||
97 | |||
98 | # flags, unused bits must be zero (RFU) bit within loadflags | ||
99 | loadflags: | ||
100 | LOADED_HIGH = 1 # If set, the kernel is loaded high | ||
101 | CAN_USE_HEAP = 0x80 # If set, the loader also has set | ||
102 | # heap_end_ptr to tell how much | ||
103 | # space behind setup.S can be used for | ||
104 | # heap purposes. | ||
105 | # Only the loader knows what is free | ||
106 | #ifndef __BIG_KERNEL__ | ||
107 | .byte 0 | ||
108 | #else | ||
109 | .byte LOADED_HIGH | ||
110 | #endif | ||
111 | |||
112 | setup_move_size: .word 0x8000 # size to move, when setup is not | ||
113 | # loaded at 0x90000. We will move setup | ||
114 | # to 0x90000 then just before jumping | ||
115 | # into the kernel. However, only the | ||
116 | # loader knows how much data behind | ||
117 | # us also needs to be loaded. | ||
118 | |||
119 | code32_start: # here loaders can put a different | ||
120 | # start address for 32-bit code. | ||
121 | #ifndef __BIG_KERNEL__ | ||
122 | .long 0x1000 # 0x1000 = default for zImage | ||
123 | #else | ||
124 | .long 0x100000 # 0x100000 = default for big kernel | ||
125 | #endif | ||
126 | |||
127 | ramdisk_image: .long 0 # address of loaded ramdisk image | ||
128 | # Here the loader puts the 32-bit | ||
129 | # address where it loaded the image. | ||
130 | # This only will be read by the kernel. | ||
131 | |||
132 | ramdisk_size: .long 0 # its size in bytes | ||
133 | |||
134 | bootsect_kludge: | ||
135 | .long 0 # obsolete | ||
136 | |||
137 | heap_end_ptr: .word modelist+1024 # (Header version 0x0201 or later) | ||
138 | # space from here (exclusive) down to | ||
139 | # end of setup code can be used by setup | ||
140 | # for local heap purposes. | ||
141 | |||
142 | pad1: .word 0 | ||
143 | cmd_line_ptr: .long 0 # (Header version 0x0202 or later) | ||
144 | # If nonzero, a 32-bit pointer | ||
145 | # to the kernel command line. | ||
146 | # The command line should be | ||
147 | # located between the start of | ||
148 | # setup and the end of low | ||
149 | # memory (0xa0000), or it may | ||
150 | # get overwritten before it | ||
151 | # gets read. If this field is | ||
152 | # used, there is no longer | ||
153 | # anything magical about the | ||
154 | # 0x90000 segment; the setup | ||
155 | # can be located anywhere in | ||
156 | # low memory 0x10000 or higher. | ||
157 | |||
158 | ramdisk_max: .long 0xffffffff | ||
159 | |||
160 | trampoline: call start_of_setup | ||
161 | .align 16 | ||
162 | # The offset at this point is 0x240 | ||
163 | .space (0x7ff-0x240+1) # E820 & EDD space (ending at 0x7ff) | ||
164 | # End of setup header ##################################################### | ||
165 | |||
166 | start_of_setup: | ||
167 | # Bootlin depends on this being done early | ||
168 | movw $0x01500, %ax # %ah = 0x15 selects the int 0x13 function | ||
169 | movb $0x81, %dl # drive number 0x81 | ||
170 | int $0x13 | ||
171 | |||
172 | #ifdef SAFE_RESET_DISK_CONTROLLER | ||
173 | # Reset the disk controller. | ||
174 | movw $0x0000, %ax | ||
175 | movb $0x80, %dl | ||
176 | int $0x13 | ||
177 | #endif | ||
178 | |||
179 | # Set %ds = %cs, we know that SETUPSEG = %cs at this point | ||
180 | movw %cs, %ax # aka SETUPSEG | ||
181 | movw %ax, %ds | ||
182 | # Check signature at end of setup | ||
183 | cmpw $SIG1, setup_sig1 | ||
184 | jne bad_sig # mismatch: the rest of setup still has to be located | ||
185 | |||
186 | cmpw $SIG2, setup_sig2 | ||
187 | jne bad_sig | ||
188 | |||
189 | jmp good_sig1 # both signature words match | ||
190 | |||
191 | # Routine to print asciiz string at ds:si | ||
192 | prtstr: | ||
193 | lodsb | ||
194 | andb %al, %al | ||
195 | jz fin | ||
196 | |||
197 | call prtchr | ||
198 | jmp prtstr | ||
199 | |||
200 | fin: ret | ||
201 | |||
202 | # Space printing | ||
203 | prtsp2: call prtspc # Print double space | ||
204 | prtspc: movb $0x20, %al # Print single space (note: fall-thru) | ||
205 | |||
206 | prtchr: | ||
207 | pushw %ax | ||
208 | pushw %cx | ||
209 | movw $0007,%bx | ||
210 | movw $0x01, %cx | ||
211 | movb $0x0e, %ah | ||
212 | int $0x10 | ||
213 | popw %cx | ||
214 | popw %ax | ||
215 | ret | ||
216 | |||
217 | beep: movb $0x07, %al | ||
218 | jmp prtchr | ||
219 | |||
220 | no_sig_mess: .string "No setup signature found ..." | ||
221 | |||
222 | good_sig1: | ||
223 | jmp good_sig | ||
224 | |||
225 | # We now have to find the rest of the setup code/data | ||
226 | bad_sig: | ||
227 | movw %cs, %ax # SETUPSEG | ||
228 | subw $DELTA_INITSEG, %ax # INITSEG | ||
229 | movw %ax, %ds | ||
230 | xorb %bh, %bh | ||
231 | movb (497), %bl # get setup sect from bootsect | ||
232 | subw $4, %bx # LILO loads 4 sectors of setup | ||
233 | shlw $8, %bx # convert to words (1sect=2^8 words) | ||
234 | movw %bx, %cx # word count for the rep movsw below | ||
235 | shrw $3, %bx # convert to segment | ||
236 | addw $SYSSEG, %bx | ||
237 | movw %bx, %cs:start_sys_seg # start_sys_seg = SYSSEG + size of the part moved below | ||
238 | # Move rest of setup code/data to here | ||
239 | movw $2048, %di # four sectors loaded by LILO | ||
240 | subw %si, %si # source offset 0 | ||
241 | movw %cs, %ax # aka SETUPSEG | ||
242 | movw %ax, %es | ||
243 | movw $SYSSEG, %ax | ||
244 | movw %ax, %ds | ||
245 | rep | ||
246 | movsw # copy %cx words from SYSSEG:0 to %cs:2048 | ||
247 | movw %cs, %ax # aka SETUPSEG | ||
248 | movw %ax, %ds | ||
249 | cmpw $SIG1, setup_sig1 # check the signature again after the move | ||
250 | jne no_sig | ||
251 | |||
252 | cmpw $SIG2, setup_sig2 | ||
253 | jne no_sig | ||
254 | |||
255 | jmp good_sig | ||
256 | |||
257 | no_sig: | ||
258 | lea no_sig_mess, %si | ||
259 | call prtstr | ||
260 | |||
261 | no_sig_loop: | ||
262 | jmp no_sig_loop # hang here forever | ||
263 | |||
264 | good_sig: | ||
265 | movw %cs, %ax # aka SETUPSEG | ||
266 | subw $DELTA_INITSEG, %ax # aka INITSEG | ||
267 | movw %ax, %ds | ||
268 | # Check if an old loader tries to load a big-kernel | ||
269 | testb $LOADED_HIGH, %cs:loadflags # Do we have a big kernel? | ||
270 | jz loader_ok # No, no danger for old loaders. | ||
271 | |||
272 | cmpb $0, %cs:type_of_loader # Do we have a loader that | ||
273 | # can deal with us? | ||
274 | jnz loader_ok # Yes, continue. | ||
275 | |||
276 | pushw %cs # No, we have an old loader, | ||
277 | popw %ds # die. | ||
278 | lea loader_panic_mess, %si | ||
279 | call prtstr | ||
280 | |||
281 | jmp no_sig_loop | ||
282 | |||
283 | loader_panic_mess: .string "Wrong loader, giving up..." | ||
284 | |||
285 | loader_ok: | ||
286 | /* check for long mode. */ | ||
287 | /* we have to do this before the VESA setup, otherwise the user | ||
288 | can't see the error message. */ | ||
289 | |||
290 | pushw %ds | ||
291 | movw %cs,%ax | ||
292 | movw %ax,%ds | ||
293 | |||
294 | /* minimum CPUID flags for x86-64 */ | ||
295 | /* see http://www.x86-64.org/lists/discuss/msg02971.html */ | ||
296 | #define SSE_MASK ((1<<25)|(1<<26)) | ||
297 | #define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\ | ||
298 | (1<<13)|(1<<15)|(1<<24)) | ||
299 | #define REQUIRED_MASK2 (1<<29) | ||
300 | |||
301 | pushfl /* standard way to check for cpuid */ | ||
302 | popl %eax | ||
303 | movl %eax,%ebx | ||
304 | xorl $0x200000,%eax | ||
305 | pushl %eax | ||
306 | popfl | ||
307 | pushfl | ||
308 | popl %eax | ||
309 | cmpl %eax,%ebx | ||
310 | jz no_longmode /* cpu has no cpuid */ | ||
311 | movl $0x0,%eax | ||
312 | cpuid | ||
313 | cmpl $0x1,%eax | ||
314 | jb no_longmode /* no cpuid 1 */ | ||
315 | xor %di,%di | ||
316 | cmpl $0x68747541,%ebx /* AuthenticAMD */ | ||
317 | jnz noamd | ||
318 | cmpl $0x69746e65,%edx | ||
319 | jnz noamd | ||
320 | cmpl $0x444d4163,%ecx | ||
321 | jnz noamd | ||
322 | mov $1,%di /* cpu is from AMD */ | ||
323 | noamd: | ||
324 | movl $0x1,%eax | ||
325 | cpuid | ||
326 | andl $REQUIRED_MASK1,%edx | ||
327 | xorl $REQUIRED_MASK1,%edx | ||
328 | jnz no_longmode | ||
329 | movl $0x80000000,%eax | ||
330 | cpuid | ||
331 | cmpl $0x80000001,%eax | ||
332 | jb no_longmode /* no extended cpuid */ | ||
333 | movl $0x80000001,%eax | ||
334 | cpuid | ||
335 | andl $REQUIRED_MASK2,%edx | ||
336 | xorl $REQUIRED_MASK2,%edx | ||
337 | jnz no_longmode | ||
338 | sse_test: | ||
339 | movl $1,%eax | ||
340 | cpuid | ||
341 | andl $SSE_MASK,%edx | ||
342 | cmpl $SSE_MASK,%edx | ||
343 | je sse_ok | ||
344 | test %di,%di | ||
345 | jz no_longmode /* only try to force SSE on AMD */ | ||
346 | movl $0xc0010015,%ecx /* HWCR */ | ||
347 | rdmsr | ||
348 | btr $15,%eax /* enable SSE */ | ||
349 | wrmsr | ||
350 | xor %di,%di /* don't loop */ | ||
351 | jmp sse_test /* try again */ | ||
352 | no_longmode: | ||
353 | call beep | ||
354 | lea long_mode_panic,%si | ||
355 | call prtstr | ||
356 | no_longmode_loop: | ||
357 | jmp no_longmode_loop | ||
358 | long_mode_panic: | ||
359 | .string "Your CPU does not support long mode. Use a 32bit distribution." | ||
360 | .byte 0 | ||
361 | |||
362 | sse_ok: | ||
363 | popw %ds | ||
364 | |||
365 | # tell BIOS we want to go to long mode | ||
366 | movl $0xec00,%eax # declare target operating mode | ||
367 | movl $2,%ebx # long mode | ||
368 | int $0x15 | ||
369 | |||
370 | # Get memory size (extended mem, kB) | ||
371 | |||
372 | xorl %eax, %eax | ||
373 | movl %eax, (0x1e0) | ||
374 | #ifndef STANDARD_MEMORY_BIOS_CALL | ||
375 | movb %al, (E820NR) | ||
376 | # Try three different memory detection schemes. First, try | ||
377 | # e820h, which lets us assemble a memory map, then try e801h, | ||
378 | # which returns a 32-bit memory size, and finally 88h, which | ||
379 | # returns 0-64m | ||
380 | |||
381 | # method E820H: | ||
382 | # the memory map from hell. e820h returns memory classified into | ||
383 | # a whole bunch of different types, and allows memory holes and | ||
384 | # everything. We scan through this memory map and build a list | ||
385 | # of the first 32 memory areas, which we return at [E820MAP]. | ||
386 | # This is documented at http://www.teleport.com/~acpi/acpihtml/topic245.htm | ||
387 | |||
388 | #define SMAP 0x534d4150 | ||
389 | |||
390 | meme820: | ||
391 | xorl %ebx, %ebx # continuation counter | ||
392 | movw $E820MAP, %di # point into the whitelist | ||
393 | # so we can have the bios | ||
394 | # directly write into it. | ||
395 | |||
396 | jmpe820: | ||
397 | movl $0x0000e820, %eax # e820, upper word zeroed | ||
398 | movl $SMAP, %edx # ascii 'SMAP' | ||
399 | movl $20, %ecx # size of the e820rec | ||
400 | pushw %ds # data record. | ||
401 | popw %es | ||
402 | int $0x15 # make the call | ||
403 | jc bail820 # fall to e801 if it fails | ||
404 | |||
405 | cmpl $SMAP, %eax # check the return is `SMAP' | ||
406 | jne bail820 # fall to e801 if it fails | ||
407 | |||
408 | # cmpl $1, 16(%di) # is this usable memory? | ||
409 | # jne again820 | ||
410 | |||
411 | # If this is usable memory, we save it by simply advancing %di by | ||
412 | # sizeof(e820rec). | ||
413 | # | ||
414 | good820: | ||
415 | movb (E820NR), %al # up to 32 entries | ||
416 | cmpb $E820MAX, %al | ||
417 | jnl bail820 | ||
418 | |||
419 | incb (E820NR) | ||
420 | movw %di, %ax | ||
421 | addw $20, %ax | ||
422 | movw %ax, %di | ||
423 | again820: | ||
424 | cmpl $0, %ebx # check to see if | ||
425 | jne jmpe820 # %ebx is set to EOF | ||
426 | bail820: | ||
427 | |||
428 | |||
429 | # method E801H: | ||
430 | # memory size is in 1k chunksizes, to avoid confusing loadlin. | ||
431 | # we store the 0xe801 memory size in a completely different place, | ||
432 | # because it will most likely be longer than 16 bits. | ||
433 | # (use 1e0 because that's what Larry Augustine uses in his | ||
434 | # alternative new memory detection scheme, and it's sensible | ||
435 | # to write everything into the same place.) | ||
436 | |||
437 | meme801: | ||
438 | stc # fix to work around buggy | ||
439 | xorw %cx,%cx # BIOSes which dont clear/set | ||
440 | xorw %dx,%dx # carry on pass/error of | ||
441 | # e801h memory size call | ||
442 | # or merely pass cx,dx through | ||
443 | # without changing them. | ||
444 | movw $0xe801, %ax | ||
445 | int $0x15 | ||
446 | jc mem88 | ||
447 | |||
448 | cmpw $0x0, %cx # Kludge to handle BIOSes | ||
449 | jne e801usecxdx # which report their extended | ||
450 | cmpw $0x0, %dx # memory in AX/BX rather than | ||
451 | jne e801usecxdx # CX/DX. The spec I have read | ||
452 | movw %ax, %cx # seems to indicate AX/BX | ||
453 | movw %bx, %dx # are more reasonable anyway... | ||
454 | |||
455 | e801usecxdx: | ||
456 | andl $0xffff, %edx # clear sign extend | ||
457 | shll $6, %edx # and go from 64k to 1k chunks | ||
458 | movl %edx, (0x1e0) # store extended memory size | ||
459 | andl $0xffff, %ecx # clear sign extend | ||
460 | addl %ecx, (0x1e0) # and add lower memory into | ||
461 | # total size. | ||
462 | |||
463 | # Ye Olde Traditional Methode. Returns the memory size (up to 16mb or | ||
464 | # 64mb, depending on the bios) in ax. | ||
465 | mem88: | ||
466 | |||
467 | #endif | ||
468 | movb $0x88, %ah | ||
469 | int $0x15 | ||
470 | movw %ax, (2) | ||
471 | |||
472 | # Set the keyboard repeat rate to the max | ||
473 | movw $0x0305, %ax | ||
474 | xorw %bx, %bx | ||
475 | int $0x16 | ||
476 | |||
477 | # Check for video adapter and its parameters and allow the | ||
478 | # user to browse video modes. | ||
479 | call video # NOTE: we need %ds pointing | ||
480 | # to bootsector | ||
481 | |||
482 | # Get hd0 data... | ||
483 | xorw %ax, %ax | ||
484 | movw %ax, %ds | ||
485 | ldsw (4 * 0x41), %si | ||
486 | movw %cs, %ax # aka SETUPSEG | ||
487 | subw $DELTA_INITSEG, %ax # aka INITSEG | ||
488 | pushw %ax | ||
489 | movw %ax, %es | ||
490 | movw $0x0080, %di | ||
491 | movw $0x10, %cx | ||
492 | pushw %cx | ||
493 | cld | ||
494 | rep | ||
495 | movsb | ||
496 | # Get hd1 data... | ||
497 | xorw %ax, %ax | ||
498 | movw %ax, %ds | ||
499 | ldsw (4 * 0x46), %si | ||
500 | popw %cx | ||
501 | popw %es | ||
502 | movw $0x0090, %di | ||
503 | rep | ||
504 | movsb | ||
505 | # Check that there IS a hd1 :-) | ||
506 | movw $0x01500, %ax | ||
507 | movb $0x81, %dl | ||
508 | int $0x13 | ||
509 | jc no_disk1 | ||
510 | |||
511 | cmpb $3, %ah | ||
512 | je is_disk1 | ||
513 | |||
514 | no_disk1: | ||
515 | movw %cs, %ax # aka SETUPSEG | ||
516 | subw $DELTA_INITSEG, %ax # aka INITSEG | ||
517 | movw %ax, %es | ||
518 | movw $0x0090, %di | ||
519 | movw $0x10, %cx | ||
520 | xorw %ax, %ax | ||
521 | cld | ||
522 | rep | ||
523 | stosb | ||
524 | is_disk1: | ||
525 | |||
526 | # Check for PS/2 pointing device | ||
527 | movw %cs, %ax # aka SETUPSEG | ||
528 | subw $DELTA_INITSEG, %ax # aka INITSEG | ||
529 | movw %ax, %ds | ||
530 | movw $0, (0x1ff) # default is no pointing device | ||
531 | int $0x11 # int 0x11: equipment list | ||
532 | testb $0x04, %al # check if mouse installed | ||
533 | jz no_psmouse | ||
534 | |||
535 | movw $0xAA, (0x1ff) # device present | ||
536 | no_psmouse: | ||
537 | |||
538 | #include "../../i386/boot/edd.S" | ||
539 | |||
540 | # Now we want to move to protected mode ... | ||
541 | cmpw $0, %cs:realmode_swtch | ||
542 | jz rmodeswtch_normal | ||
543 | |||
544 | lcall *%cs:realmode_swtch | ||
545 | |||
546 | jmp rmodeswtch_end | ||
547 | |||
548 | rmodeswtch_normal: | ||
549 | pushw %cs | ||
550 | call default_switch | ||
551 | |||
552 | rmodeswtch_end: | ||
553 | # we get the code32 start address and modify the below 'jmpi' | ||
554 | # (loader may have changed it) | ||
555 | movl %cs:code32_start, %eax | ||
556 | movl %eax, %cs:code32 | ||
557 | |||
558 | # Now we move the system to its rightful place ... but we check if we have a | ||
559 | # big-kernel. In that case we *must* not move it ... | ||
560 | testb $LOADED_HIGH, %cs:loadflags | ||
561 | jz do_move0 # .. then we have a normal low | ||
562 | # loaded zImage | ||
563 | # .. or else we have a high | ||
564 | # loaded bzImage | ||
565 | jmp end_move # ... and we skip moving | ||
566 | |||
567 | do_move0: | ||
568 | movw $0x100, %ax # start of destination segment | ||
569 | movw %cs, %bp # aka SETUPSEG | ||
570 | subw $DELTA_INITSEG, %bp # aka INITSEG | ||
571 | movw %cs:start_sys_seg, %bx # start of source segment | ||
572 | cld | ||
573 | do_move: | ||
574 | movw %ax, %es # destination segment | ||
575 | incb %ah # instead of add ax,#0x100 | ||
576 | movw %bx, %ds # source segment | ||
577 | addw $0x100, %bx | ||
578 | subw %di, %di | ||
579 | subw %si, %si | ||
580 | movw $0x800, %cx | ||
581 | rep | ||
582 | movsw | ||
583 | cmpw %bp, %bx # assume start_sys_seg > 0x200, | ||
584 | # so we will perhaps read one | ||
585 | # page more than needed, but | ||
586 | # never overwrite INITSEG | ||
587 | # because destination is a | ||
588 | # minimum one page below source | ||
589 | jb do_move | ||
590 | |||
591 | end_move: | ||
592 | # then we load the segment descriptors | ||
593 | movw %cs, %ax # aka SETUPSEG | ||
594 | movw %ax, %ds | ||
595 | |||
596 | # Check whether we need to be downward compatible with version <=201 | ||
597 | cmpl $0, cmd_line_ptr | ||
598 | jne end_move_self # loader uses version >=202 features | ||
599 | cmpb $0x20, type_of_loader | ||
600 | je end_move_self # bootsect loader, we know of it | ||
601 | |||
602 | # Boot loader doesn't support boot protocol version 2.02. | ||
603 | # If we have our code not at 0x90000, we need to move it there now. | ||
604 | # We also then need to move the params behind it (commandline) | ||
605 | # Because we would overwrite the code on the current IP, we move | ||
606 | # it in two steps, jumping high after the first one. | ||
607 | movw %cs, %ax | ||
608 | cmpw $SETUPSEG, %ax | ||
609 | je end_move_self | ||
610 | |||
611 | cli # make sure we really have | ||
612 | # interrupts disabled ! | ||
613 | # because after this the stack | ||
614 | # should not be used | ||
615 | subw $DELTA_INITSEG, %ax # aka INITSEG | ||
616 | movw %ss, %dx | ||
617 | cmpw %ax, %dx | ||
618 | jb move_self_1 | ||
619 | |||
620 | addw $INITSEG, %dx | ||
621 | subw %ax, %dx # this will go into %ss after | ||
622 | # the move | ||
623 | move_self_1: | ||
624 | movw %ax, %ds | ||
625 | movw $INITSEG, %ax # real INITSEG | ||
626 | movw %ax, %es | ||
627 | movw %cs:setup_move_size, %cx | ||
628 | std # we have to move up, so we use | ||
629 | # direction down because the | ||
630 | # areas may overlap | ||
631 | movw %cx, %di | ||
632 | decw %di | ||
633 | movw %di, %si | ||
634 | subw $move_self_here+0x200, %cx | ||
635 | rep | ||
636 | movsb | ||
637 | ljmp $SETUPSEG, $move_self_here | ||
638 | |||
639 | move_self_here: | ||
640 | movw $move_self_here+0x200, %cx | ||
641 | rep | ||
642 | movsb | ||
643 | movw $SETUPSEG, %ax | ||
644 | movw %ax, %ds | ||
645 | movw %dx, %ss | ||
646 | end_move_self: # now we are at the right place | ||
647 | lidt idt_48 # load idt with 0,0 | ||
648 | xorl %eax, %eax # Compute gdt_base | ||
649 | movw %ds, %ax # (Convert %ds:gdt to a linear ptr) | ||
650 | shll $4, %eax | ||
651 | addl $gdt, %eax | ||
652 | movl %eax, (gdt_48+2) | ||
653 | lgdt gdt_48 # load gdt with whatever is | ||
654 | # appropriate | ||
655 | |||
656 | # that was painless, now we enable a20 | ||
657 | call empty_8042 | ||
658 | |||
659 | movb $0xD1, %al # command write | ||
660 | outb %al, $0x64 | ||
661 | call empty_8042 | ||
662 | |||
663 | movb $0xDF, %al # A20 on | ||
664 | outb %al, $0x60 | ||
665 | call empty_8042 | ||
666 | |||
667 | # | ||
668 | # You must preserve the other bits here. Otherwise embarrassing things | ||
669 | # like laptops powering off on boot happen. Corrected version by Kira | ||
670 | # Brown from Linux 2.2 | ||
671 | # | ||
672 | inb $0x92, %al # | ||
673 | orb $02, %al # "fast A20" version | ||
674 | outb %al, $0x92 # some chips have only this | ||
675 | |||
676 | # wait until a20 really *is* enabled; it can take a fair amount of | ||
677 | # time on certain systems; Toshiba Tecras are known to have this | ||
678 | # problem. The memory location used here (0x200) is the int 0x80 | ||
679 | # vector, which should be safe to use. | ||
680 | |||
681 | xorw %ax, %ax # segment 0x0000 | ||
682 | movw %ax, %fs | ||
683 | decw %ax # segment 0xffff (HMA) | ||
684 | movw %ax, %gs | ||
685 | a20_wait: | ||
686 | incw %ax # unused memory location <0xfff0 | ||
687 | movw %ax, %fs:(0x200) # we use the "int 0x80" vector | ||
688 | cmpw %gs:(0x210), %ax # and its corresponding HMA addr | ||
689 | je a20_wait # loop until no longer aliased | ||
690 | |||
691 | # make sure any possible coprocessor is properly reset.. | ||
692 | xorw %ax, %ax | ||
693 | outb %al, $0xf0 | ||
694 | call delay | ||
695 | |||
696 | outb %al, $0xf1 | ||
697 | call delay | ||
698 | |||
699 | # well, that went ok, I hope. Now we mask all interrupts - the rest | ||
700 | # is done in init_IRQ(). | ||
701 | movb $0xFF, %al # mask all interrupts for now | ||
702 | outb %al, $0xA1 | ||
703 | call delay | ||
704 | |||
705 | movb $0xFB, %al # mask all irq's but irq2 which | ||
706 | outb %al, $0x21 # is cascaded | ||
707 | |||
708 | # Well, that certainly wasn't fun :-(. Hopefully it works, and we don't | ||
709 | # need no steenking BIOS anyway (except for the initial loading :-). | ||
710 | # The BIOS-routine wants lots of unnecessary data, and it's less | ||
711 | # "interesting" anyway. This is how REAL programmers do it. | ||
712 | # | ||
713 | # Well, now's the time to actually move into protected mode. To make | ||
714 | # things as simple as possible, we do no register set-up or anything, | ||
715 | # we let the gnu-compiled 32-bit programs do that. We just jump to | ||
716 | # absolute address 0x1000 (or the loader supplied one), | ||
717 | # in 32-bit protected mode. | ||
718 | # | ||
719 | # Note that the short jump isn't strictly needed, although there are | ||
720 | # reasons why it might be a good idea. It won't hurt in any case. | ||
721 | movw $1, %ax # protected mode (PE) bit | ||
722 | lmsw %ax # This is it! | ||
723 | jmp flush_instr | ||
724 | |||
725 | flush_instr: | ||
726 | xorw %bx, %bx # Flag to indicate a boot | ||
727 | xorl %esi, %esi # Pointer to real-mode code | ||
728 | movw %cs, %si | ||
729 | subw $DELTA_INITSEG, %si | ||
730 | shll $4, %esi # Convert to 32-bit pointer | ||
731 | # NOTE: For high loaded big kernels we need a | ||
732 | # jmpi 0x100000,__KERNEL_CS | ||
733 | # | ||
734 | # but we yet haven't reloaded the CS register, so the default size | ||
735 | # of the target offset still is 16 bit. | ||
736 | # However, using an operand-size prefix (0x66), the CPU will properly | ||
737 | # take our 48 bit far pointer. (INTeL 80386 Programmer's Reference | ||
738 | # Manual, Mixing 16-bit and 32-bit code, page 16-6) | ||
739 | |||
740 | .byte 0x66, 0xea # prefix + jmpi-opcode | ||
741 | code32: .long 0x1000 # will be set to 0x100000 | ||
742 | # for big kernels | ||
743 | .word __KERNEL_CS | ||
744 | |||
# Here's a bunch of information about your current kernel..
#
# The pieces are emitted back to back with .ascii, which appends no
# terminator, and the explicit zero byte closes them, so the result is
# a single NUL-terminated string built from the build-time macros:
#   UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") " UTS_VERSION
kernel_version:	.ascii	UTS_RELEASE
		.ascii	" ("
		.ascii	LINUX_COMPILE_BY
		.ascii	"@"
		.ascii	LINUX_COMPILE_HOST
		.ascii	") "
		.ascii	UTS_VERSION
		.byte	0
754 | |||
# This is the default real mode switch routine.
# to be called just before protected mode transition
#
# Used when the realmode_swtch hook is zero.  The caller at
# rmodeswtch_normal pushes %cs before its near call, so the far return
# (lret) below pops a complete offset:segment pair -- the same way a
# hook entered through "lcall *%cs:realmode_swtch" returns.
# %al is overwritten.
default_switch:
	cli					# no interrupts allowed !
	movb	$0x80, %al			# disable NMI for bootup
						# sequence
	outb	%al, $0x70
	lret					# far return, see above
763 | |||
764 | |||
# This routine checks that the keyboard command queue is empty
# (after emptying the output buffers)
#
# Some machines have delusions that the keyboard buffer is always full
# with no keyboard attached...
#
# If there is no keyboard controller, we will usually get 0xff
# to all the reads.  With each IO taking a microsecond and
# a timeout of 100,000 iterations, this can take about half a
# second ("delay" == outb to port 0x80).  That should be ok,
# and should also be plenty of time for a real keyboard controller
# to empty.
#
# %ecx is preserved; %al is overwritten with the last byte read.
# A timeout is not reported: the routine returns the same way whether
# the controller became idle or the iteration counter ran out.
#

empty_8042:
	pushl	%ecx				# save caller's %ecx
	movl	$100000, %ecx			# iteration budget

empty_8042_loop:
	decl	%ecx
	jz	empty_8042_end_loop		# budget used up: give up

	call	delay

	inb	$0x64, %al			# 8042 status port
	testb	$1, %al				# output buffer?
	jz	no_output

	call	delay
	inb	$0x60, %al			# read it
	jmp	empty_8042_loop			# (byte is discarded)

no_output:
	testb	$2, %al				# is input buffer full?
	jnz	empty_8042_loop			# yes - loop
empty_8042_end_loop:
	popl	%ecx
	ret
803 | |||
# Read the cmos clock. Return the seconds in al
#
# Calls BIOS int 0x1a with %ah = 0x02 and converts the seconds value in
# %dh from packed BCD to binary: the low nibble goes to %al, the high
# nibble to %ah, and aad folds them into %al = %ah * 10 + %al.
# %cx is saved and restored.  %dx is left as the BIOS call returned it
# and %ah is overwritten.  The status of the BIOS call is not checked.
gettime:
	pushw	%cx
	movb	$0x02, %ah
	int	$0x1a
	movb	%dh, %al			# %dh contains the seconds
	andb	$0x0f, %al			# %al = units digit
	movb	%dh, %ah
	movb	$0x04, %cl
	shrb	%cl, %ah			# %ah = tens digit
	aad					# %al = tens * 10 + units
	popw	%cx
	ret
817 | |||
# Delay is needed after doing I/O
#
# The delay is a single write of whatever %al currently holds to
# port 0x80.  No register is modified.
delay:
	outb	%al,$0x80
	ret
822 | |||
# Descriptor tables
#
# Temporary GDT used for the switch into 32-bit protected mode.  It
# holds four 8-byte descriptors: the null descriptor, one unused slot,
# a flat 4 GB code segment and a flat 4 GB data segment.  The code and
# data descriptors therefore sit at byte offsets 0x10 and 0x18 in the
# table (the far jump at code32 uses __KERNEL_CS -- presumably 0x10;
# confirm against the segment definitions).
gdt:
	.word	0, 0, 0, 0			# dummy

	.word	0, 0, 0, 0			# unused

	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
	.word	0				# base address = 0
	.word	0x9A00				# code read/exec
	.word	0x00CF				# granularity = 4096, 386
						#  (+5th nibble of limit)

	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
	.word	0				# base address = 0
	.word	0x9200				# data read/write
	.word	0x00CF				# granularity = 4096, 386
						#  (+5th nibble of limit)
gdt_end:					# first byte past the table

# Pseudo-descriptor for lidt: limit 0, base 0.
idt_48:
	.word	0				# idt limit = 0
	.word	0, 0				# idt base = 0L

# Pseudo-descriptor for lgdt.  The base is patched at run time (the
# code at end_move_self stores the linear address of gdt at gdt_48+2).
gdt_48:
	.word	gdt_end - gdt - 1		# gdt limit: size of the table
						#  above minus one.  This used
						#  to be the constant 0x8000,
						#  which made every selector up
						#  to 32 KB past "gdt" look
						#  valid although only four
						#  descriptors exist.

	.word	0, 0				# gdt base (filled in later)
848 | |||
849 | # Include video setup & detection code | ||
850 | |||
851 | #include "video.S" | ||
852 | |||
# Setup signature -- must be last
#
# The start-up code compares these two words with SIG1/SIG2.  If they
# do not match, only part of setup was loaded; the bad_sig path then
# copies the rest of setup into place and repeats the comparison.
setup_sig1:	.word	SIG1
setup_sig2:	.word	SIG2

# After this point, there is some free space which is used by the video mode
# handling code to store the temporary mode table (not used by the kernel).

modelist:

# One label at the current end of each section as laid out by this
# file.  NOTE(review): presumably consumed by the build to size the
# image -- no user is visible in this part of the file; confirm.
.text
endtext:
.data
enddata:
.bss
endbss:
diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c new file mode 100644 index 000000000000..c2fa66313170 --- /dev/null +++ b/arch/x86_64/boot/tools/build.c | |||
@@ -0,0 +1,186 @@ | |||
1 | /* | ||
2 | * $Id: build.c,v 1.3 2001/06/26 15:14:50 pavel Exp $ | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * Copyright (C) 1997 Martin Mares | ||
6 | */ | ||
7 | |||
8 | /* | ||
9 | * This file builds a disk-image from three different files: | ||
10 | * | ||
11 | * - bootsect: exactly 512 bytes of 8086 machine code, loads the rest | ||
12 | * - setup: 8086 machine code, sets up system parm | ||
13 | * - system: 80386 code for actual system | ||
14 | * | ||
15 | * It does some checking that all files are of the correct type, and | ||
16 | * just writes the result to stdout, removing headers and padding to | ||
17 | * the right amount. It also writes some system data to stderr. | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * Changes by tytso to allow root device specification | ||
22 | * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 | ||
23 | * Cross compiling fixes by Gertjan van Wingerde, July 1996 | ||
24 | * Rewritten by Martin Mares, April 1997 | ||
25 | */ | ||
26 | |||
27 | #include <stdio.h> | ||
28 | #include <string.h> | ||
29 | #include <stdlib.h> | ||
30 | #include <stdarg.h> | ||
31 | #include <sys/types.h> | ||
32 | #include <sys/stat.h> | ||
33 | #include <sys/sysmacros.h> | ||
34 | #include <unistd.h> | ||
35 | #include <fcntl.h> | ||
36 | #include <asm/boot.h> | ||
37 | |||
38 | typedef unsigned char byte; | ||
39 | typedef unsigned short word; | ||
40 | typedef unsigned long u32; | ||
41 | |||
42 | #define DEFAULT_MAJOR_ROOT 0 | ||
43 | #define DEFAULT_MINOR_ROOT 0 | ||
44 | |||
45 | /* Minimal number of setup sectors (see also bootsect.S) */ | ||
46 | #define SETUP_SECTS 4 | ||
47 | |||
48 | byte buf[1024]; | ||
49 | int fd; | ||
50 | int is_big_kernel; | ||
51 | |||
/*
 * Print a printf-style message on stderr, followed by a newline, and
 * terminate the program with exit status 1.  Never returns.
 *
 * str: printf-style format string; the remaining arguments are the
 *      values it consumes.
 */
void die(const char * str, ...)
{
	va_list args;

	va_start(args, str);
	vfprintf(stderr, str, args);
	va_end(args);		/* every va_start needs a matching va_end */
	fputc('\n', stderr);
	exit(1);
}
60 | |||
61 | void file_open(const char *name) | ||
62 | { | ||
63 | if ((fd = open(name, O_RDONLY, 0)) < 0) | ||
64 | die("Unable to open `%s': %m", name); | ||
65 | } | ||
66 | |||
/* Print the command-line synopsis and exit with status 1 through die(). */
void usage(void)
{
	die("%s", "Usage: build [-b] bootsect setup system [rootdev] [> image]");
}
71 | |||
72 | int main(int argc, char ** argv) | ||
73 | { | ||
74 | unsigned int i, c, sz, setup_sectors; | ||
75 | u32 sys_size; | ||
76 | byte major_root, minor_root; | ||
77 | struct stat sb; | ||
78 | |||
79 | if (argc > 2 && !strcmp(argv[1], "-b")) | ||
80 | { | ||
81 | is_big_kernel = 1; | ||
82 | argc--, argv++; | ||
83 | } | ||
84 | if ((argc < 4) || (argc > 5)) | ||
85 | usage(); | ||
86 | if (argc > 4) { | ||
87 | if (!strcmp(argv[4], "CURRENT")) { | ||
88 | if (stat("/", &sb)) { | ||
89 | perror("/"); | ||
90 | die("Couldn't stat /"); | ||
91 | } | ||
92 | major_root = major(sb.st_dev); | ||
93 | minor_root = minor(sb.st_dev); | ||
94 | } else if (strcmp(argv[4], "FLOPPY")) { | ||
95 | if (stat(argv[4], &sb)) { | ||
96 | perror(argv[4]); | ||
97 | die("Couldn't stat root device."); | ||
98 | } | ||
99 | major_root = major(sb.st_rdev); | ||
100 | minor_root = minor(sb.st_rdev); | ||
101 | } else { | ||
102 | major_root = 0; | ||
103 | minor_root = 0; | ||
104 | } | ||
105 | } else { | ||
106 | major_root = DEFAULT_MAJOR_ROOT; | ||
107 | minor_root = DEFAULT_MINOR_ROOT; | ||
108 | } | ||
109 | fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root); | ||
110 | |||
111 | file_open(argv[1]); | ||
112 | i = read(fd, buf, sizeof(buf)); | ||
113 | fprintf(stderr,"Boot sector %d bytes.\n",i); | ||
114 | if (i != 512) | ||
115 | die("Boot block must be exactly 512 bytes"); | ||
116 | if (buf[510] != 0x55 || buf[511] != 0xaa) | ||
117 | die("Boot block hasn't got boot flag (0xAA55)"); | ||
118 | buf[508] = minor_root; | ||
119 | buf[509] = major_root; | ||
120 | if (write(1, buf, 512) != 512) | ||
121 | die("Write call failed"); | ||
122 | close (fd); | ||
123 | |||
124 | file_open(argv[2]); /* Copy the setup code */ | ||
125 | for (i=0 ; (c=read(fd, buf, sizeof(buf)))>0 ; i+=c ) | ||
126 | if (write(1, buf, c) != c) | ||
127 | die("Write call failed"); | ||
128 | if (c != 0) | ||
129 | die("read-error on `setup'"); | ||
130 | close (fd); | ||
131 | |||
132 | setup_sectors = (i + 511) / 512; /* Pad unused space with zeros */ | ||
133 | /* for compatibility with ancient versions of LILO. */ | ||
134 | if (setup_sectors < SETUP_SECTS) | ||
135 | setup_sectors = SETUP_SECTS; | ||
136 | fprintf(stderr, "Setup is %d bytes.\n", i); | ||
137 | memset(buf, 0, sizeof(buf)); | ||
138 | while (i < setup_sectors * 512) { | ||
139 | c = setup_sectors * 512 - i; | ||
140 | if (c > sizeof(buf)) | ||
141 | c = sizeof(buf); | ||
142 | if (write(1, buf, c) != c) | ||
143 | die("Write call failed"); | ||
144 | i += c; | ||
145 | } | ||
146 | |||
147 | file_open(argv[3]); | ||
148 | if (fstat (fd, &sb)) | ||
149 | die("Unable to stat `%s': %m", argv[3]); | ||
150 | sz = sb.st_size; | ||
151 | fprintf (stderr, "System is %d kB\n", sz/1024); | ||
152 | sys_size = (sz + 15) / 16; | ||
153 | /* 0x40000*16 = 4.0 MB, reasonable estimate for the current maximum */ | ||
154 | if (sys_size > (is_big_kernel ? 0x40000 : DEF_SYSSIZE)) | ||
155 | die("System is too big. Try using %smodules.", | ||
156 | is_big_kernel ? "" : "bzImage or "); | ||
157 | while (sz > 0) { | ||
158 | int l, n; | ||
159 | |||
160 | l = (sz > sizeof(buf)) ? sizeof(buf) : sz; | ||
161 | if ((n=read(fd, buf, l)) != l) { | ||
162 | if (n < 0) | ||
163 | die("Error reading %s: %m", argv[3]); | ||
164 | else | ||
165 | die("%s: Unexpected EOF", argv[3]); | ||
166 | } | ||
167 | if (write(1, buf, l) != l) | ||
168 | die("Write failed"); | ||
169 | sz -= l; | ||
170 | } | ||
171 | close(fd); | ||
172 | |||
173 | if (lseek(1, 497, SEEK_SET) != 497) /* Write sizes to the bootsector */ | ||
174 | die("Output: seek failed"); | ||
175 | buf[0] = setup_sectors; | ||
176 | if (write(1, buf, 1) != 1) | ||
177 | die("Write of setup sector count failed"); | ||
178 | if (lseek(1, 500, SEEK_SET) != 500) | ||
179 | die("Output: seek failed"); | ||
180 | buf[0] = (sys_size & 0xff); | ||
181 | buf[1] = ((sys_size >> 8) & 0xff); | ||
182 | if (write(1, buf, 2) != 2) | ||
183 | die("Write of image length failed"); | ||
184 | |||
185 | return 0; /* Everything is OK */ | ||
186 | } | ||
diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S new file mode 100644 index 000000000000..0587477c99f2 --- /dev/null +++ b/arch/x86_64/boot/video.S | |||
@@ -0,0 +1,2007 @@ | |||
1 | /* video.S | ||
2 | * | ||
3 | * Display adapter & video mode setup, version 2.13 (14-May-99) | ||
4 | * | ||
5 | * Copyright (C) 1995 -- 1998 Martin Mares <mj@ucw.cz> | ||
6 | * Based on the original setup.S code (C) Linus Torvalds and Mats Anderson | ||
7 | * | ||
8 | * Rewritten to use GNU 'as' by Chris Noe <stiker@northlink.com> May 1999 | ||
9 | * | ||
10 | * For further information, look at Documentation/svga.txt. | ||
11 | * | ||
12 | */ | ||
13 | |||
14 | #include <linux/config.h> /* for CONFIG_VIDEO_* */ | ||
15 | |||
16 | /* Enable autodetection of SVGA adapters and modes. */ | ||
17 | #undef CONFIG_VIDEO_SVGA | ||
18 | |||
19 | /* Enable autodetection of VESA modes */ | ||
20 | #define CONFIG_VIDEO_VESA | ||
21 | |||
22 | /* Enable compacting of mode table */ | ||
23 | #define CONFIG_VIDEO_COMPACT | ||
24 | |||
25 | /* Retain screen contents when switching modes */ | ||
26 | #define CONFIG_VIDEO_RETAIN | ||
27 | |||
28 | /* Enable local mode list */ | ||
29 | #undef CONFIG_VIDEO_LOCAL | ||
30 | |||
31 | /* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour) */ | ||
32 | #undef CONFIG_VIDEO_400_HACK | ||
33 | |||
34 | /* Hack that lets you force specific BIOS mode ID and specific dimensions */ | ||
35 | #undef CONFIG_VIDEO_GFX_HACK | ||
36 | #define VIDEO_GFX_BIOS_AX 0x4f02 /* 800x600 on ThinkPad */ | ||
37 | #define VIDEO_GFX_BIOS_BX 0x0102 | ||
38 | #define VIDEO_GFX_DUMMY_RESOLUTION 0x6425 /* 100x37 */ | ||
39 | |||
40 | /* This code uses an extended set of video mode numbers. These include: | ||
41 | * Aliases for standard modes | ||
42 | * NORMAL_VGA (-1) | ||
43 | * EXTENDED_VGA (-2) | ||
44 | * ASK_VGA (-3) | ||
45 | * Video modes numbered by menu position -- NOT RECOMMENDED because of lack | ||
46 | * of compatibility when extending the table. These are between 0x00 and 0xff. | ||
47 | */ | ||
48 | #define VIDEO_FIRST_MENU 0x0000 | ||
49 | |||
50 | /* Standard BIOS video modes (BIOS number + 0x0100) */ | ||
51 | #define VIDEO_FIRST_BIOS 0x0100 | ||
52 | |||
53 | /* VESA BIOS video modes (VESA number + 0x0200) */ | ||
54 | #define VIDEO_FIRST_VESA 0x0200 | ||
55 | |||
56 | /* Video7 special modes (BIOS number + 0x0900) */ | ||
57 | #define VIDEO_FIRST_V7 0x0900 | ||
58 | |||
59 | /* Special video modes */ | ||
60 | #define VIDEO_FIRST_SPECIAL 0x0f00 | ||
61 | #define VIDEO_80x25 0x0f00 | ||
62 | #define VIDEO_8POINT 0x0f01 | ||
63 | #define VIDEO_80x43 0x0f02 | ||
64 | #define VIDEO_80x28 0x0f03 | ||
65 | #define VIDEO_CURRENT_MODE 0x0f04 | ||
66 | #define VIDEO_80x30 0x0f05 | ||
67 | #define VIDEO_80x34 0x0f06 | ||
68 | #define VIDEO_80x60 0x0f07 | ||
69 | #define VIDEO_GFX_HACK 0x0f08 | ||
70 | #define VIDEO_LAST_SPECIAL 0x0f09 | ||
71 | |||
72 | /* Video modes given by resolution */ | ||
73 | #define VIDEO_FIRST_RESOLUTION 0x1000 | ||
74 | |||
75 | /* The "recalculate timings" flag */ | ||
76 | #define VIDEO_RECALC 0x8000 | ||
77 | |||
78 | /* Positions of various video parameters passed to the kernel */ | ||
79 | /* (see also include/linux/tty.h) */ | ||
80 | #define PARAM_CURSOR_POS 0x00 | ||
81 | #define PARAM_VIDEO_PAGE 0x04 | ||
82 | #define PARAM_VIDEO_MODE 0x06 | ||
83 | #define PARAM_VIDEO_COLS 0x07 | ||
84 | #define PARAM_VIDEO_EGA_BX 0x0a | ||
85 | #define PARAM_VIDEO_LINES 0x0e | ||
86 | #define PARAM_HAVE_VGA 0x0f | ||
87 | #define PARAM_FONT_POINTS 0x10 | ||
88 | |||
89 | #define PARAM_LFB_WIDTH 0x12 | ||
90 | #define PARAM_LFB_HEIGHT 0x14 | ||
91 | #define PARAM_LFB_DEPTH 0x16 | ||
92 | #define PARAM_LFB_BASE 0x18 | ||
93 | #define PARAM_LFB_SIZE 0x1c | ||
94 | #define PARAM_LFB_LINELENGTH 0x24 | ||
95 | #define PARAM_LFB_COLORS 0x26 | ||
96 | #define PARAM_VESAPM_SEG 0x2e | ||
97 | #define PARAM_VESAPM_OFF 0x30 | ||
98 | #define PARAM_LFB_PAGES 0x32 | ||
99 | #define PARAM_VESA_ATTRIB 0x34 | ||
100 | |||
101 | /* Define DO_STORE according to CONFIG_VIDEO_RETAIN */ | ||
102 | #ifdef CONFIG_VIDEO_RETAIN | ||
103 | #define DO_STORE call store_screen | ||
104 | #else | ||
105 | #define DO_STORE | ||
106 | #endif /* CONFIG_VIDEO_RETAIN */ | ||
107 | |||
108 | # This is the main entry point called by setup.S: it detects the adapter, sets the requested mode (or shows the menu) and stores the mode parameters. | ||
109 | # %ds *must* be pointing to the bootsector | ||
110 | video: pushw %ds # We use different segments (this copy is popped before ret) | ||
111 | pushw %ds # FS contains original DS | ||
112 | popw %fs | ||
113 | pushw %cs # DS is equal to CS | ||
114 | popw %ds | ||
115 | pushw %cs # ES is equal to CS | ||
116 | popw %es | ||
117 | xorw %ax, %ax | ||
118 | movw %ax, %gs # GS is zero | ||
119 | cld # String instructions go upwards | ||
120 | call basic_detect # Basic adapter type testing (EGA/VGA/MDA/CGA) | ||
121 | #ifdef CONFIG_VIDEO_SELECT | ||
122 | movw %fs:(0x01fa), %ax # User selected video mode | ||
123 | cmpw $ASK_VGA, %ax # Bring up the menu | ||
124 | jz vid2 | ||
125 | |||
126 | call mode_set # Set the mode | ||
127 | jc vid1 # CF set => mode was set | ||
128 | |||
129 | leaw badmdt, %si # Invalid mode ID | ||
130 | call prtstr | ||
131 | vid2: call mode_menu # Let the user pick a mode | ||
132 | vid1: | ||
133 | #ifdef CONFIG_VIDEO_RETAIN | ||
134 | call restore_screen # Restore screen contents | ||
135 | #endif /* CONFIG_VIDEO_RETAIN */ | ||
136 | call store_edid | ||
137 | #endif /* CONFIG_VIDEO_SELECT */ | ||
138 | call mode_params # Store mode parameters | ||
139 | popw %ds # Restore original DS | ||
140 | ret | ||
141 | |||
142 | # Detect if we have CGA, MDA, EGA or VGA and pass it to the kernel. The adapter variable is bumped once for EGA and once more for VGA. | ||
143 | basic_detect: | ||
144 | movb $0, %fs:(PARAM_HAVE_VGA) # Assume no VGA until proven otherwise | ||
145 | movb $0x12, %ah # Check EGA/VGA | ||
146 | movb $0x10, %bl | ||
147 | int $0x10 | ||
148 | movw %bx, %fs:(PARAM_VIDEO_EGA_BX) # Identifies EGA to the kernel | ||
149 | cmpb $0x10, %bl # BL left unchanged => it is a CGA/MDA/HGA card. | ||
150 | je basret | ||
151 | |||
152 | incb adapter # At least EGA (adapter=1, assuming it started at 0) | ||
153 | movw $0x1a00, %ax # Check EGA or VGA? | ||
154 | int $0x10 | ||
155 | cmpb $0x1a, %al # 1a means VGA... | ||
156 | jne basret # anything else is EGA. | ||
157 | |||
158 | incb %fs:(PARAM_HAVE_VGA) # We've detected a VGA | ||
159 | incb adapter # VGA (adapter=2, assuming it started at 0) | ||
160 | basret: ret | ||
161 | |||
162 | # Store the video mode parameters for later usage by the kernel. | ||
163 | # This is done by asking the BIOS except for the rows/columns | ||
164 | # parameters in the default 80x25 mode -- these are set directly, | ||
165 | # because some very obscure BIOSes supply insane values. | ||
166 | mode_params: | ||
167 | #ifdef CONFIG_VIDEO_SELECT | ||
168 | cmpb $0, graphic_mode # Flag set by check_vesa after a graphics mode set | ||
169 | jnz mopar_gr # Graphics => store frame buffer parameters instead | ||
170 | #endif | ||
171 | movb $0x03, %ah # Read cursor position | ||
172 | xorb %bh, %bh # BH=0 | ||
173 | int $0x10 | ||
174 | movw %dx, %fs:(PARAM_CURSOR_POS) | ||
175 | movb $0x0f, %ah # Read page/mode/width | ||
176 | int $0x10 | ||
177 | movw %bx, %fs:(PARAM_VIDEO_PAGE) | ||
178 | movw %ax, %fs:(PARAM_VIDEO_MODE) # Video mode and screen width | ||
179 | cmpb $0x7, %al # MDA/HGA => segment differs | ||
180 | jnz mopar0 | ||
181 | |||
182 | movw $0xb000, video_segment # Mode 7 text buffer segment | ||
183 | mopar0: movw %gs:(0x485), %ax # Font size | ||
184 | movw %ax, %fs:(PARAM_FONT_POINTS) # (valid only on EGA/VGA) | ||
185 | movw force_size, %ax # Forced size? | ||
186 | orw %ax, %ax # Zero => nothing forced | ||
187 | jz mopar1 | ||
188 | |||
189 | movb %ah, %fs:(PARAM_VIDEO_COLS) # High byte of force_size = columns | ||
190 | movb %al, %fs:(PARAM_VIDEO_LINES) # Low byte of force_size = lines | ||
191 | ret | ||
192 | |||
193 | mopar1: movb $25, %al | ||
194 | cmpb $0, adapter # If we are on CGA/MDA/HGA, the | ||
195 | jz mopar2 # screen must have 25 lines. | ||
196 | |||
197 | movb %gs:(0x484), %al # On EGA/VGA, use the EGA+ BIOS | ||
198 | incb %al # location of max lines. | ||
199 | mopar2: movb %al, %fs:(PARAM_VIDEO_LINES) | ||
200 | ret | ||
201 | |||
202 | #ifdef CONFIG_VIDEO_SELECT | ||
203 | # Fetching of VESA frame buffer parameters from the mode information block that check_vesa read into modelist+1024 | ||
204 | mopar_gr: | ||
205 | leaw modelist+1024, %di # DI=mode information block | ||
206 | movb $0x23, %fs:(PARAM_HAVE_VGA) | ||
207 | movw 16(%di), %ax # Bytes per scan line | ||
208 | movw %ax, %fs:(PARAM_LFB_LINELENGTH) | ||
209 | movw 18(%di), %ax # Horizontal resolution | ||
210 | movw %ax, %fs:(PARAM_LFB_WIDTH) | ||
211 | movw 20(%di), %ax # Vertical resolution | ||
212 | movw %ax, %fs:(PARAM_LFB_HEIGHT) | ||
213 | movb 25(%di), %al # Bits per pixel (byte, zero-extended below) | ||
214 | movb $0, %ah | ||
215 | movw %ax, %fs:(PARAM_LFB_DEPTH) | ||
216 | movb 29(%di), %al # Number of image pages (byte, zero-extended below) | ||
217 | movb $0, %ah | ||
218 | movw %ax, %fs:(PARAM_LFB_PAGES) | ||
219 | movl 40(%di), %eax # Physical address of the frame buffer | ||
220 | movl %eax, %fs:(PARAM_LFB_BASE) | ||
221 | movl 31(%di), %eax # Colour mask sizes/positions, first 4 bytes | ||
222 | movl %eax, %fs:(PARAM_LFB_COLORS) | ||
223 | movl 35(%di), %eax # Colour mask sizes/positions, last 4 bytes | ||
224 | movl %eax, %fs:(PARAM_LFB_COLORS+4) | ||
225 | movw 0(%di), %ax # Mode attributes | ||
226 | movw %ax, %fs:(PARAM_VESA_ATTRIB) | ||
227 | |||
228 | # get video mem size (this call overwrites the mode information block with the controller information block) | ||
229 | leaw modelist+1024, %di | ||
230 | movw $0x4f00, %ax | ||
231 | int $0x10 | ||
232 | xorl %eax, %eax | ||
233 | movw 18(%di), %ax # Total memory as reported by the BIOS (64 KB units per the VBE spec) | ||
234 | movl %eax, %fs:(PARAM_LFB_SIZE) | ||
235 | |||
236 | # switching the DAC to 8-bit is for <= 8 bpp only | ||
237 | movw %fs:(PARAM_LFB_DEPTH), %ax | ||
238 | cmpw $8, %ax | ||
239 | jg dac_done # More than 8 bpp: keep the colour fields read above | ||
240 | |||
241 | # get DAC switching capability | ||
242 | xorl %eax, %eax | ||
243 | movb 10(%di), %al # Low byte of the capabilities field | ||
244 | testb $1, %al # Bit 0 = DAC width is switchable | ||
245 | jz dac_set # Not switchable: use dac_size as it is | ||
246 | |||
247 | # attempt to switch DAC to 8-bit | ||
248 | movw $0x4f08, %ax | ||
249 | movw $0x0800, %bx # BH=8 bits requested, BL=0 | ||
250 | int $0x10 | ||
251 | cmpw $0x004f, %ax | ||
252 | jne dac_set # Call failed: use dac_size as it is | ||
253 | movb %bh, dac_size # store actual DAC size | ||
254 | |||
255 | dac_set: | ||
256 | # set color size to DAC size | ||
257 | movb dac_size, %al | ||
258 | movb %al, %fs:(PARAM_LFB_COLORS+0) | ||
259 | movb %al, %fs:(PARAM_LFB_COLORS+2) | ||
260 | movb %al, %fs:(PARAM_LFB_COLORS+4) | ||
261 | movb %al, %fs:(PARAM_LFB_COLORS+6) | ||
262 | |||
263 | # set color offsets to 0 | ||
264 | movb $0, %fs:(PARAM_LFB_COLORS+1) | ||
265 | movb $0, %fs:(PARAM_LFB_COLORS+3) | ||
266 | movb $0, %fs:(PARAM_LFB_COLORS+5) | ||
267 | movb $0, %fs:(PARAM_LFB_COLORS+7) | ||
268 | |||
269 | dac_done: | ||
270 | # get protected mode interface information | ||
271 | movw $0x4f0a, %ax | ||
272 | xorw %bx, %bx | ||
273 | xorw %di, %di | ||
274 | int $0x10 | ||
275 | cmp $0x004f, %ax | ||
276 | jnz no_pm # Not supported: leave the VESAPM fields untouched | ||
277 | |||
278 | movw %es, %fs:(PARAM_VESAPM_SEG) | ||
279 | movw %di, %fs:(PARAM_VESAPM_OFF) | ||
280 | no_pm: ret # NOTE(review): ES is not reloaded here after the BIOS call -- confirm callers do not rely on ES=CS | ||
281 | |||
282 | # The video mode menu: ENTER lists the modes and prompts for one, SPACE returns without choosing | ||
283 | mode_menu: | ||
284 | leaw keymsg, %si # "Return/Space/Timeout" message | ||
285 | call prtstr | ||
286 | call flush | ||
287 | nokey: call getkt | ||
288 | |||
289 | cmpb $0x0d, %al # ENTER ? | ||
290 | je listm # yes - manual mode selection | ||
291 | |||
292 | cmpb $0x20, %al # SPACE ? | ||
293 | je defmd1 # yes - return without choosing a mode | ||
294 | |||
295 | call beep | ||
296 | jmp nokey # any other key - repeat | ||
297 | |||
298 | defmd1: ret # No mode chosen? Default 80x25 | ||
299 | |||
300 | listm: call mode_table # List mode table | ||
301 | listm0: leaw name_bann, %si # Print adapter name | ||
302 | call prtstr | ||
303 | movw card_name, %si | ||
304 | orw %si, %si # Zero => no specific card name known | ||
305 | jnz an2 | ||
306 | |||
307 | movb adapter, %al | ||
308 | leaw old_name, %si # adapter=0 (CGA/MDA/HGA) | ||
309 | orb %al, %al | ||
310 | jz an1 | ||
311 | |||
312 | leaw ega_name, %si # adapter=1 (EGA) | ||
313 | decb %al | ||
314 | jz an1 | ||
315 | |||
316 | leaw vga_name, %si # anything else (VGA) | ||
317 | jmp an1 | ||
318 | |||
319 | an2: call prtstr # Card name first, then svga_name | ||
320 | leaw svga_name, %si | ||
321 | an1: call prtstr | ||
322 | leaw listhdr, %si # Table header | ||
323 | call prtstr | ||
324 | movb $0x30, %dl # DL holds mode number | ||
325 | leaw modelist, %si | ||
326 | lm1: cmpw $ASK_VGA, (%si) # End? | ||
327 | jz lm2 | ||
328 | |||
329 | movb %dl, %al # Menu selection number | ||
330 | call prtchr | ||
331 | call prtsp2 | ||
332 | lodsw | ||
333 | call prthw # Mode ID | ||
334 | call prtsp2 | ||
335 | movb 0x1(%si), %al | ||
336 | call prtdec # Columns (high byte of the size word) | ||
337 | movb $0x78, %al # the letter 'x' | ||
338 | call prtchr | ||
339 | lodsw | ||
340 | call prtdec # Rows (low byte of the size word) | ||
341 | movb $0x0d, %al # New line | ||
342 | call prtchr | ||
343 | movb $0x0a, %al | ||
344 | call prtchr | ||
345 | incb %dl # Next character | ||
346 | cmpb $0x3a, %dl # Past the digit 9? | ||
347 | jnz lm1 | ||
348 | |||
349 | movb $0x61, %dl # Continue with the letter a | ||
350 | jmp lm1 | ||
351 | |||
352 | lm2: leaw prompt, %si # Mode prompt | ||
353 | call prtstr | ||
354 | leaw edit_buf, %di # Editor buffer | ||
355 | lm3: call getkey | ||
356 | cmpb $0x0d, %al # Enter? | ||
357 | jz lment | ||
358 | |||
359 | cmpb $0x08, %al # Backspace? | ||
360 | jz lmbs | ||
361 | |||
362 | cmpb $0x20, %al # Printable? | ||
363 | jc lm3 # Below 0x20: ignore the key | ||
364 | |||
365 | cmpw $edit_buf+4, %di # Enough space? | ||
366 | jz lm3 # Four characters already: ignore the key | ||
367 | |||
368 | stosb # Store the character and echo it | ||
369 | call prtchr | ||
370 | jmp lm3 | ||
371 | |||
372 | lmbs: cmpw $edit_buf, %di # Backspace | ||
373 | jz lm3 # Buffer empty: nothing to delete | ||
374 | |||
375 | decw %di | ||
376 | movb $0x08, %al # Print backspace, space, backspace | ||
377 | call prtchr | ||
378 | call prtspc | ||
379 | movb $0x08, %al | ||
380 | call prtchr | ||
381 | jmp lm3 | ||
382 | |||
383 | lment: movb $0, (%di) # Terminate the input with a zero byte | ||
384 | leaw crlft, %si | ||
385 | call prtstr | ||
386 | leaw edit_buf, %si | ||
387 | cmpb $0, (%si) # Empty string = default mode | ||
388 | jz lmdef | ||
389 | |||
390 | cmpb $0, 1(%si) # One character = menu selection | ||
391 | jz mnusel | ||
392 | |||
393 | cmpw $0x6373, (%si) # "scan" => mode scanning | ||
394 | jnz lmhx | ||
395 | |||
396 | cmpw $0x6e61, 2(%si) | ||
397 | jz lmscan | ||
398 | |||
399 | lmhx: xorw %bx, %bx # Else => mode ID in hex | ||
400 | lmhex: lodsb | ||
401 | orb %al, %al # End of string? | ||
402 | jz lmuse1 | ||
403 | |||
404 | subb $0x30, %al # Below the digit 0 => bad | ||
405 | jc lmbad | ||
406 | |||
407 | cmpb $10, %al # 0-9 => decimal digit | ||
408 | jc lmhx1 | ||
409 | |||
410 | subb $7, %al # Map A-F / a-f to 10-15 | ||
411 | andb $0xdf, %al # (clearing bit 5 folds lower case to upper case) | ||
412 | cmpb $10, %al | ||
413 | jc lmbad | ||
414 | |||
415 | cmpb $16, %al | ||
416 | jnc lmbad | ||
417 | |||
418 | lmhx1: shlw $4, %bx # Append the nibble to BX | ||
419 | orb %al, %bl | ||
420 | jmp lmhex | ||
421 | |||
422 | lmuse1: movw %bx, %ax # AX=parsed mode ID | ||
423 | jmp lmuse | ||
424 | |||
425 | mnusel: lodsb # Menu selection | ||
426 | xorb %ah, %ah | ||
427 | subb $0x30, %al | ||
428 | jc lmbad | ||
429 | |||
430 | cmpb $10, %al # 0-9 => menu items 0-9 | ||
431 | jc lmuse | ||
432 | |||
433 | cmpb $0x61-0x30, %al # Below the letter a => bad | ||
434 | jc lmbad | ||
435 | |||
436 | subb $0x61-0x30-10, %al # a-z => menu items 10-35 | ||
437 | cmpb $36, %al | ||
438 | jnc lmbad | ||
439 | |||
440 | lmuse: call mode_set | ||
441 | jc lmdef # CF set => mode was set, leave the menu | ||
442 | |||
443 | lmbad: leaw unknt, %si | ||
444 | call prtstr | ||
445 | jmp lm2 # Prompt again | ||
446 | lmscan: cmpb $0, adapter # Scanning only on EGA/VGA | ||
447 | jz lmbad | ||
448 | |||
449 | movw $0, mt_end # Scanning of modes is | ||
450 | movb $1, scanning # done as new autodetection. | ||
451 | call mode_table | ||
452 | jmp listm0 | ||
453 | lmdef: ret | ||
454 | |||
455 | # Additional parts of mode_set... (relative jumps, you know) | ||
456 | setv7: # Video7 extended modes | ||
457 | DO_STORE | ||
458 | subb $VIDEO_FIRST_V7>>8, %bh # Mode ID minus 0x0900 = BIOS mode number in BX | ||
459 | movw $0x6f05, %ax | ||
460 | int $0x10 | ||
461 | stc # Always reports success (result of the call is not checked) | ||
462 | ret | ||
463 | |||
464 | _setrec: jmp setrec # Ugly... (intermediate jump used by the conditional branch in mode_set) | ||
465 | _set_80x25: jmp set_80x25 # Same kind of intermediate jump, used by setmenu | ||
466 | |||
467 | # Aliases for backward compatibility: mode IDs 0xffff and 0xfffe map to the 80x25 and 8-point special modes. | ||
468 | setalias: | ||
469 | movw $VIDEO_80x25, %ax | ||
470 | incw %bx # BX was 0xffff => 80x25 | ||
471 | jz mode_set | ||
472 | |||
473 | movb $VIDEO_8POINT-VIDEO_FIRST_SPECIAL, %al # AX is now the 8-point special mode ID | ||
474 | incw %bx # BX was 0xfffe => 8-point mode, anything else is bad | ||
475 | jnz setbad # Fall-through! | ||
476 | |||
477 | # Setting of user mode (AX=mode ID) => CF=success | ||
478 | mode_set: | ||
479 | movw %ax, %fs:(0x01fa) # Store mode for use in acpi_wakeup.S | ||
480 | movw %ax, %bx # BX keeps the mode ID for the handlers | ||
481 | cmpb $0xff, %ah # 0xffxx => alias | ||
482 | jz setalias | ||
483 | |||
484 | testb $VIDEO_RECALC>>8, %ah # Recalculate-timings flag set? | ||
485 | jnz _setrec | ||
486 | |||
487 | cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah # 0x1000 and above => mode given by resolution | ||
488 | jnc setres | ||
489 | |||
490 | cmpb $VIDEO_FIRST_SPECIAL>>8, %ah # 0x0fxx => special mode | ||
491 | jz setspc | ||
492 | |||
493 | cmpb $VIDEO_FIRST_V7>>8, %ah # 0x09xx => Video7 mode | ||
494 | jz setv7 | ||
495 | |||
496 | cmpb $VIDEO_FIRST_VESA>>8, %ah # Remaining IDs of 0x0200 and above => VESA mode | ||
497 | jnc check_vesa | ||
498 | |||
499 | orb %ah, %ah # 0x00xx => menu position | ||
500 | jz setmenu | ||
501 | |||
502 | decb %ah # 0x01xx => standard BIOS mode (AH becomes 0) | ||
503 | jz setbios | ||
504 | |||
505 | setbad: clc # CF clear => failure | ||
506 | movb $0, do_restore # The screen needn't be restored | ||
507 | ret | ||
508 | |||
509 | setvesa: # Set a VESA text mode (BX=mode ID, reached from check_vesa) | ||
510 | DO_STORE | ||
511 | subb $VIDEO_FIRST_VESA>>8, %bh # Mode ID minus 0x0200 = VESA mode number | ||
512 | movw $0x4f02, %ax # VESA BIOS mode set call | ||
513 | int $0x10 | ||
514 | cmpw $0x004f, %ax # AL=4f if implemented | ||
515 | jnz setbad # AH=0 if OK | ||
516 | |||
517 | stc # CF set => success | ||
518 | ret | ||
519 | |||
520 | setbios: # Set a standard BIOS mode (AH=0 and AL=BIOS mode number on entry from mode_set) | ||
521 | DO_STORE | ||
522 | int $0x10 # Standard BIOS mode set call | ||
523 | pushw %bx | ||
524 | movb $0x0f, %ah # Check if really set | ||
525 | int $0x10 | ||
526 | popw %bx | ||
527 | cmpb %bl, %al # Current mode must equal the requested one | ||
528 | jnz setbad | ||
529 | |||
530 | stc # CF set => success | ||
531 | ret | ||
532 | |||
533 | setspc: xorb %bh, %bh # Set special mode | ||
534 | cmpb $VIDEO_LAST_SPECIAL-VIDEO_FIRST_SPECIAL, %bl # Index beyond the spec_inits table? | ||
535 | jnc setbad | ||
536 | |||
537 | addw %bx, %bx # Two bytes per table entry (also leaves CF clear) | ||
538 | jmp *spec_inits(%bx) # Jump to the handler, which returns to the caller of mode_set | ||
539 | |||
540 | setmenu: # Set a mode given by its menu position (AL=position, AH=0) | ||
541 | orb %al, %al # 80x25 is an exception | ||
542 | jz _set_80x25 | ||
543 | |||
544 | pushw %bx # Set mode chosen from menu | ||
545 | call mode_table # Build the mode table | ||
546 | popw %ax | ||
547 | shlw $2, %ax # Four bytes per table record | ||
548 | addw %ax, %si | ||
549 | cmpw %di, %si # At or beyond the end of the table? | ||
550 | jnc setbad | ||
551 | |||
552 | movw (%si), %ax # Fetch mode ID | ||
553 | _m_s: jmp mode_set # Set it as if it had been given directly | ||
554 | |||
555 | setres: pushw %bx # Set mode chosen by resolution | ||
556 | call mode_table | ||
557 | popw %bx | ||
558 | xchgb %bl, %bh # Swap bytes to match the table layout (rows low, columns high) | ||
559 | setr1: lodsw # Mode ID of the next record | ||
560 | cmpw $ASK_VGA, %ax # End of the list? | ||
561 | jz setbad | ||
562 | |||
563 | lodsw # Rows/columns word of the record | ||
564 | cmpw %bx, %ax | ||
565 | jnz setr1 | ||
566 | |||
567 | movw -4(%si), %ax # Fetch mode ID | ||
568 | jmp _m_s | ||
569 | |||
570 | check_vesa: # Query a VESA mode (BX=mode ID): text modes go to setvesa, graphics modes are set here with a linear frame buffer | ||
571 | leaw modelist+1024, %di # Buffer for the mode information block | ||
572 | subb $VIDEO_FIRST_VESA>>8, %bh # Mode ID minus 0x0200 = VESA mode number | ||
573 | movw %bx, %cx # Get mode information structure | ||
574 | movw $0x4f01, %ax | ||
575 | int $0x10 | ||
576 | addb $VIDEO_FIRST_VESA>>8, %bh # Back to our mode ID | ||
577 | cmpw $0x004f, %ax | ||
578 | jnz setbad | ||
579 | |||
580 | movb (%di), %al # Check capabilities. | ||
581 | andb $0x19, %al # Bits 0, 3 and 4 of the mode attributes | ||
582 | cmpb $0x09, %al | ||
583 | jz setvesa # This is a text mode | ||
584 | |||
585 | movb (%di), %al # Check capabilities. | ||
586 | andb $0x99, %al # Bits 0, 3, 4 and 7 must all be set | ||
587 | cmpb $0x99, %al | ||
588 | jnz _setbad # Doh! No linear frame buffer. | ||
589 | |||
590 | subb $VIDEO_FIRST_VESA>>8, %bh # VESA mode number again | ||
591 | orw $0x4000, %bx # Use linear frame buffer | ||
592 | movw $0x4f02, %ax # VESA BIOS mode set call | ||
593 | int $0x10 | ||
594 | cmpw $0x004f, %ax # AL=4f if implemented | ||
595 | jnz _setbad # AH=0 if OK | ||
596 | |||
597 | movb $1, graphic_mode # flag graphic mode | ||
598 | movb $0, do_restore # no screen restore | ||
599 | stc # CF set => success | ||
600 | ret | ||
601 | |||
602 | _setbad: jmp setbad # Ugly... | ||
603 | |||
604 | # Recalculate vertical display end registers -- this fixes various | ||
605 | # inconsistencies of extended modes on many adapters. Called when | ||
606 | # the VIDEO_RECALC flag is set in the mode ID. | ||
607 | |||
608 | setrec: subb $VIDEO_RECALC>>8, %ah # Set the base mode | ||
609 | call mode_set | ||
610 | jnc rct3 # Base mode failed: return with CF clear | ||
611 | |||
612 | movw %gs:(0x485), %ax # Font size in pixels | ||
613 | movb %gs:(0x484), %bl # Number of rows | ||
614 | incb %bl | ||
615 | mulb %bl # Number of visible | ||
616 | decw %ax # scan lines - 1 | ||
617 | movw $0x3d4, %dx # Index register port | ||
618 | movw %ax, %bx # BX=scan lines - 1 | ||
619 | movb $0x12, %al # Lower 8 bits | ||
620 | movb %bl, %ah | ||
621 | outw %ax, %dx # Index in AL, data in AH | ||
622 | movb $0x07, %al # Bits 8 and 9 in the overflow register | ||
623 | call inidx # presumably reads indexed register AL into AL -- helper not visible here | ||
624 | xchgb %al, %ah | ||
625 | andb $0xbd, %ah # Clear bits 1 and 6 | ||
626 | shrb %bh # Bit 8 of the value set? | ||
627 | jnc rct1 | ||
628 | orb $0x02, %ah | ||
629 | rct1: shrb %bh # Bit 9 of the value set? | ||
630 | jnc rct2 | ||
631 | orb $0x40, %ah | ||
632 | rct2: movb $0x07, %al | ||
633 | outw %ax, %dx | ||
634 | stc # CF set => success | ||
635 | rct3: ret | ||
636 | |||
637 | # Table of routines for setting of the special modes, indexed by (mode ID - 0x0f00) in setspc. | ||
638 | spec_inits: | ||
639 | .word set_80x25 # 0x0f00 | ||
640 | .word set_8pixel # 0x0f01 | ||
641 | .word set_80x43 # 0x0f02 | ||
642 | .word set_80x28 # 0x0f03 | ||
643 | .word set_current # 0x0f04 | ||
644 | .word set_80x30 # 0x0f05 | ||
645 | .word set_80x34 # 0x0f06 | ||
646 | .word set_80x60 # 0x0f07 | ||
647 | .word set_gfx # 0x0f08 | ||
648 | |||
649 | # Set the 80x25 mode. If already set, do nothing. Always returns with CF set. | ||
650 | set_80x25: | ||
651 | movw $0x5019, force_size # Override possibly broken BIOS | ||
652 | use_80x25: | ||
653 | #ifdef CONFIG_VIDEO_400_HACK | ||
654 | movw $0x1202, %ax # Force 400 scan lines | ||
655 | movb $0x30, %bl | ||
656 | int $0x10 | ||
657 | #else | ||
658 | movb $0x0f, %ah # Get current mode ID | ||
659 | int $0x10 | ||
660 | cmpw $0x5007, %ax # Mode 7 (80x25 mono) is the only one available | ||
661 | jz st80 # on CGA/MDA/HGA and is also available on EGAM | ||
662 | |||
663 | cmpw $0x5003, %ax # Unknown mode, force 80x25 color | ||
664 | jnz force3 | ||
665 | |||
666 | st80: cmpb $0, adapter # CGA/MDA/HGA => mode 3/7 is always 80x25 | ||
667 | jz set80 | ||
668 | |||
669 | movb %gs:(0x0484), %al # This is EGA+ -- beware of 80x50 etc. | ||
670 | orb %al, %al # Some buggy BIOS'es set 0 rows | ||
671 | jz set80 | ||
672 | |||
673 | cmpb $24, %al # It's hopefully correct | ||
674 | jz set80 | ||
675 | #endif /* CONFIG_VIDEO_400_HACK */ | ||
676 | force3: DO_STORE | ||
677 | movw $0x0003, %ax # Forced set | ||
678 | int $0x10 | ||
679 | set80: stc # CF set => success | ||
680 | ret | ||
681 | |||
682 | # Set the 80x50/80x43 8-pixel mode. Simple BIOS calls. set_8pt is also used by set_80x43 and set_80x60. | ||
683 | set_8pixel: | ||
684 | DO_STORE | ||
685 | call use_80x25 # The base is 80x25 | ||
686 | set_8pt: | ||
687 | movw $0x1112, %ax # Use 8x8 font | ||
688 | xorb %bl, %bl | ||
689 | int $0x10 | ||
690 | movw $0x1200, %ax # Use alternate print screen | ||
691 | movb $0x20, %bl | ||
692 | int $0x10 | ||
693 | movw $0x1201, %ax # Turn off cursor emulation | ||
694 | movb $0x34, %bl | ||
695 | int $0x10 | ||
696 | movb $0x01, %ah # Define cursor scan lines 6-7 | ||
697 | movw $0x0607, %cx | ||
698 | int $0x10 | ||
699 | set_current: # Keep the current mode: just report success | ||
700 | stc | ||
701 | ret | ||
702 | |||
703 | # Set the 80x28 mode. This mode works on all VGA's, because it's a standard | ||
704 | # 80x25 mode with 14-point fonts instead of 16-point. set14 is also used by set_80x34. | ||
705 | set_80x28: | ||
706 | DO_STORE | ||
707 | call use_80x25 # The base is 80x25 | ||
708 | set14: movw $0x1111, %ax # Use 9x14 font | ||
709 | xorb %bl, %bl | ||
710 | int $0x10 | ||
711 | movb $0x01, %ah # Define cursor scan lines 11-12 | ||
712 | movw $0x0b0c, %cx | ||
713 | int $0x10 | ||
714 | stc # CF set => success | ||
715 | ret | ||
716 | |||
717 | # Set the 80x43 mode. This mode works on all VGA's. | ||
718 | # It's a 350-scanline mode with 8-pixel font. | ||
719 | set_80x43: | ||
720 | DO_STORE | ||
721 | movw $0x1201, %ax # Set 350 scans | ||
722 | movb $0x30, %bl | ||
723 | int $0x10 | ||
724 | movw $0x0003, %ax # Reset video mode | ||
725 | int $0x10 | ||
726 | jmp set_8pt # Use 8-pixel font | ||
727 | |||
728 | # Set the 80x30 mode (all VGA's). 480 scanlines, 16-pixel font. Returns with DX=CRTC index port and CF set. | ||
729 | set_80x30: | ||
730 | call use_80x25 # Start with real 80x25 | ||
731 | DO_STORE | ||
732 | movw $0x3cc, %dx # Get CRTC port | ||
733 | inb %dx, %al | ||
734 | movb $0xd4, %dl # Assume the color port 0x3d4 | ||
735 | rorb %al # Mono or color? | ||
736 | jc set48a | ||
737 | |||
738 | movb $0xb4, %dl # Bit 0 clear => port 0x3b4 | ||
739 | set48a: movw $0x0c11, %ax # Vertical sync end (also unlocks CR0-7) | ||
740 | call outidx | ||
741 | movw $0x0b06, %ax # Vertical total | ||
742 | call outidx | ||
743 | movw $0x3e07, %ax # (Vertical) overflow | ||
744 | call outidx | ||
745 | movw $0xea10, %ax # Vertical sync start | ||
746 | call outidx | ||
747 | movw $0xdf12, %ax # Vertical display end | ||
748 | call outidx | ||
749 | movw $0xe715, %ax # Vertical blank start | ||
750 | call outidx | ||
751 | movw $0x0416, %ax # Vertical blank end | ||
752 | call outidx | ||
753 | pushw %dx # Keep the CRTC port | ||
754 | movb $0xcc, %dl # Misc output register (read) | ||
755 | inb %dx, %al | ||
756 | movb $0xc2, %dl # (write) | ||
757 | andb $0x0d, %al # Preserve clock select bits and color bit | ||
758 | orb $0xe2, %al # Set correct sync polarity | ||
759 | outb %al, %dx | ||
760 | popw %dx | ||
761 | movw $0x501e, force_size # Report 80 columns x 30 lines | ||
762 | stc # That's all. | ||
763 | ret | ||
764 | |||
765 | # Set the 80x34 mode (all VGA's). 480 scans, 14-pixel font. setvde is shared with set_80x60. | ||
766 | set_80x34: | ||
767 | call set_80x30 # Set 480 scans | ||
768 | call set14 # And 14-pt font | ||
769 | movw $0xdb12, %ax # VGA vertical display end | ||
770 | movw $0x5022, force_size # Report 80 columns x 34 lines | ||
771 | setvde: call outidx # NOTE(review): relies on DX still holding the CRTC port left by set_80x30 | ||
772 | stc # CF set => success | ||
773 | ret | ||
774 | |||
775 | # Set the 80x60 mode (all VGA's). 480 scans, 8-pixel font. | ||
776 | set_80x60: | ||
777 | call set_80x30 # Set 480 scans | ||
778 | call set_8pt # And 8-pt font | ||
779 | movw $0xdf12, %ax # VGA vertical display end | ||
780 | movw $0x503c, force_size # Report 80 columns x 60 lines | ||
781 | jmp setvde # Write the register and return success | ||
782 | |||
783 | # Special hack for ThinkPad graphics. Without CONFIG_VIDEO_GFX_HACK this is a bare ret that leaves CF as it was on entry (clear when reached through setspc, i.e. failure). | ||
784 | set_gfx: | ||
785 | #ifdef CONFIG_VIDEO_GFX_HACK | ||
786 | movw $VIDEO_GFX_BIOS_AX, %ax | ||
787 | movw $VIDEO_GFX_BIOS_BX, %bx | ||
788 | int $0x10 | ||
789 | movw $VIDEO_GFX_DUMMY_RESOLUTION, force_size | ||
790 | stc # Success is assumed; the result of the BIOS call is not checked | ||
791 | #endif | ||
792 | ret | ||
793 | |||
794 | #ifdef CONFIG_VIDEO_RETAIN | ||
795 | |||
796 | # Store screen contents to temporary buffer at modelist+1024 (cursor position, dimensions, then the character cells). AX and BX are preserved. | ||
797 | store_screen: | ||
798 | cmpb $0, do_restore # Already stored? | ||
799 | jnz stsr | ||
800 | |||
801 | testb $CAN_USE_HEAP, loadflags # Have we space for storing? | ||
802 | jz stsr | ||
803 | |||
804 | pushw %ax | ||
805 | pushw %bx | ||
806 | pushw force_size # Don't force specific size | ||
807 | movw $0, force_size | ||
808 | call mode_params # Obtain params of current mode | ||
809 | popw force_size | ||
810 | movb %fs:(PARAM_VIDEO_LINES), %ah | ||
811 | movb %fs:(PARAM_VIDEO_COLS), %al | ||
812 | movw %ax, %bx # BX=dimensions | ||
813 | mulb %ah # AX=columns*lines | ||
814 | movw %ax, %cx # CX=number of characters | ||
815 | addw %ax, %ax # Calculate image size | ||
816 | addw $modelist+1024+4, %ax # Plus buffer start and the 4-byte header | ||
817 | cmpw heap_end_ptr, %ax | ||
818 | jnc sts1 # Unfortunately, out of memory | ||
819 | |||
820 | movw %fs:(PARAM_CURSOR_POS), %ax # Store mode params | ||
821 | leaw modelist+1024, %di | ||
822 | stosw # Header word 1: cursor position | ||
823 | movw %bx, %ax | ||
824 | stosw # Header word 2: dimensions (lines high, columns low) | ||
825 | pushw %ds # Store the screen | ||
826 | movw video_segment, %ds | ||
827 | xorw %si, %si # Copy from offset 0 of the video segment | ||
828 | rep | ||
829 | movsw | ||
830 | popw %ds | ||
831 | incb do_restore # Screen will be restored later | ||
832 | sts1: popw %bx | ||
833 | popw %ax | ||
834 | stsr: ret | ||
835 | |||
836 | # Restore screen contents from temporary buffer, clipping the saved image and cursor position to the current screen size. | ||
837 | restore_screen: | ||
838 | cmpb $0, do_restore # Has the screen been stored? | ||
839 | jz res1 | ||
840 | |||
841 | call mode_params # Get parameters of current mode | ||
842 | movb %fs:(PARAM_VIDEO_LINES), %cl # CL=current lines | ||
843 | movb %fs:(PARAM_VIDEO_COLS), %ch # CH=current columns | ||
844 | leaw modelist+1024, %si # Screen buffer | ||
845 | lodsw # Set cursor position | ||
846 | movw %ax, %dx | ||
847 | cmpb %cl, %dh # Saved row inside the screen? | ||
848 | jc res2 | ||
849 | |||
850 | movb %cl, %dh # No: use the last line | ||
851 | decb %dh | ||
852 | res2: cmpb %ch, %dl # Saved column inside the screen? | ||
853 | jc res3 | ||
854 | |||
855 | movb %ch, %dl # No: use the last column | ||
856 | decb %dl | ||
857 | res3: movb $0x02, %ah | ||
858 | movb $0x00, %bh | ||
859 | int $0x10 | ||
860 | lodsw # Display size | ||
861 | movb %ah, %dl # DL=number of lines | ||
862 | movb $0, %ah # BX=phys. length of orig. line | ||
863 | movw %ax, %bx | ||
864 | cmpb %cl, %dl # Too many? | ||
865 | jc res4 | ||
866 | |||
867 | pushw %ax | ||
868 | movb %dl, %al | ||
869 | subb %cl, %al # Number of saved lines that do not fit | ||
870 | mulb %bl # Times the saved line width in characters | ||
871 | addw %ax, %si # Skip them (added twice: 2 bytes per character) | ||
872 | addw %ax, %si | ||
873 | popw %ax | ||
874 | movb %cl, %dl # Copy only as many lines as the screen has | ||
875 | res4: cmpb %ch, %al # Too wide? | ||
876 | jc res5 | ||
877 | |||
878 | movb %ch, %al # AX=width of src. line | ||
879 | res5: movb $0, %cl | ||
880 | xchgb %ch, %cl # CX=current columns | ||
881 | movw %cx, %bp # BP=width of dest. line | ||
882 | pushw %es | ||
883 | movw video_segment, %es | ||
884 | xorw %di, %di # Move the data | ||
885 | addw %bx, %bx # Convert BX and BP to _bytes_ | ||
886 | addw %bp, %bp | ||
887 | res6: pushw %si # Copy one line of AX characters | ||
888 | pushw %di | ||
889 | movw %ax, %cx | ||
890 | rep | ||
891 | movsw | ||
892 | popw %di | ||
893 | popw %si | ||
894 | addw %bp, %di # Next destination line | ||
895 | addw %bx, %si # Next source line | ||
896 | decb %dl | ||
897 | jnz res6 | ||
898 | |||
899 | popw %es # Done | ||
900 | res1: ret | ||
901 | #endif /* CONFIG_VIDEO_RETAIN */ | ||
902 | |||
903 | # Write to indexed VGA register (AL=index, AH=data, DX=index reg. port). AX and DX are unchanged on return. | ||
904 | outidx: outb %al, %dx # Select the register | ||
905 | pushw %ax | ||
906 | movb %ah, %al | ||
907 | incw %dx # Data port = index port + 1 | ||
908 | outb %al, %dx # Write the data | ||
909 | decw %dx | ||
910 | popw %ax | ||
911 | ret | ||
912 | |||
913 | # Build the table of video modes (stored after the setup.S code at the | ||
914 | # `modelist' label). Each video mode record looks like: | ||
915 | # .word MODE-ID (our special mode ID (see above)) | ||
916 | # .byte rows (number of rows) | ||
917 | # .byte columns (number of columns) | ||
918 | # Returns SI=start of the table and DI=address of the end of the table; the end is marked | ||
919 | # with an ASK_VGA ID. The table is built only once (mt_end nonzero means it is already filled). | ||
920 | mode_table: | ||
921 | movw mt_end, %di # Already filled? | ||
922 | orw %di, %di | ||
923 | jnz mtab1x | ||
924 | |||
925 | leaw modelist, %di # Store standard modes: | ||
926 | movl $VIDEO_80x25 + 0x50190000, %eax # The 80x25 mode (ALL) | ||
927 | stosl | ||
928 | movb adapter, %al # CGA/MDA/HGA -- no more modes | ||
929 | orb %al, %al | ||
930 | jz mtabe | ||
931 | |||
932 | decb %al # adapter=1 is EGA, anything else goes to the VGA list | ||
933 | jnz mtabv | ||
934 | |||
935 | movl $VIDEO_8POINT + 0x502b0000, %eax # The 80x43 EGA mode | ||
936 | stosl | ||
937 | jmp mtabe | ||
938 | |||
939 | mtab1x: jmp mtab1 | ||
940 | |||
941 | mtabv: leaw vga_modes, %si # All modes for std VGA | ||
942 | movw $vga_modes_end-vga_modes, %cx | ||
943 | rep # I'm unable to use movsw as I don't know how to store a half | ||
944 | movsb # of the expression above to cx without using explicit shr. | ||
945 | |||
946 | cmpb $0, scanning # Mode scan requested? | ||
947 | jz mscan1 | ||
948 | |||
949 | call mode_scan | ||
950 | mscan1: | ||
951 | |||
952 | #ifdef CONFIG_VIDEO_LOCAL | ||
953 | call local_modes | ||
954 | #endif /* CONFIG_VIDEO_LOCAL */ | ||
955 | |||
956 | #ifdef CONFIG_VIDEO_VESA | ||
957 | call vesa_modes # Detect VESA VGA modes | ||
958 | #endif /* CONFIG_VIDEO_VESA */ | ||
959 | |||
960 | #ifdef CONFIG_VIDEO_SVGA | ||
961 | cmpb $0, scanning # Bypass when scanning | ||
962 | jnz mscan2 | ||
963 | |||
964 | call svga_modes # Detect SVGA cards & modes | ||
965 | mscan2: | ||
966 | #endif /* CONFIG_VIDEO_SVGA */ | ||
967 | |||
968 | mtabe: | ||
969 | |||
970 | #ifdef CONFIG_VIDEO_COMPACT | ||
971 | leaw modelist, %si # SI=read position | ||
972 | movw %di, %dx # DX=end of the uncompacted table | ||
973 | movw %si, %di # DI=write position | ||
974 | cmt1: cmpw %dx, %si # Scan all modes | ||
975 | jz cmt2 | ||
976 | |||
977 | leaw modelist, %bx # Find in previous entries | ||
978 | movw 2(%si), %cx # CX=rows/columns word of this entry | ||
979 | cmt3: cmpw %bx, %si # Reached this entry without a match? | ||
980 | jz cmt4 | ||
981 | |||
982 | cmpw 2(%bx), %cx # Found => don't copy this entry | ||
983 | jz cmt5 | ||
984 | |||
985 | addw $4, %bx | ||
986 | jmp cmt3 | ||
987 | |||
988 | cmt4: movsl # Copy entry | ||
989 | jmp cmt1 | ||
990 | |||
991 | cmt5: addw $4, %si # Skip entry | ||
992 | jmp cmt1 | ||
993 | |||
994 | cmt2: | ||
995 | #endif /* CONFIG_VIDEO_COMPACT */ | ||
996 | |||
997 | movw $ASK_VGA, (%di) # End marker | ||
998 | movw %di, mt_end # Remember the end so the table is not rebuilt | ||
999 | mtab1: leaw modelist, %si # SI=mode list, DI=list end | ||
1000 | ret0: ret | ||
1001 | |||
1002 | # Modes usable on all standard VGAs, as mode table records: mode ID word, then a word with rows in the low byte and columns in the high byte. Copied into the mode table by mode_table. | ||
1003 | vga_modes: | ||
1004 | .word VIDEO_8POINT | ||
1005 | .word 0x5032 # 80x50 | ||
1006 | .word VIDEO_80x43 | ||
1007 | .word 0x502b # 80x43 | ||
1008 | .word VIDEO_80x28 | ||
1009 | .word 0x501c # 80x28 | ||
1010 | .word VIDEO_80x30 | ||
1011 | .word 0x501e # 80x30 | ||
1012 | .word VIDEO_80x34 | ||
1013 | .word 0x5022 # 80x34 | ||
1014 | .word VIDEO_80x60 | ||
1015 | .word 0x503c # 80x60 | ||
1016 | #ifdef CONFIG_VIDEO_GFX_HACK | ||
1017 | .word VIDEO_GFX_HACK | ||
1018 | .word VIDEO_GFX_DUMMY_RESOLUTION | ||
1019 | #endif | ||
1020 | |||
1021 | vga_modes_end: | ||
1022 | # Detect VESA modes. | ||
1023 | |||
1024 | #ifdef CONFIG_VIDEO_VESA | ||
# vesa_modes: detect VESA (VBE) text modes and append them to the mode table.
# In:  DI = current end of the mode table.  The VBE info block is fetched
#      into scratch space at DI+0x200; each mode info block is fetched at
#      DI itself and then overwritten in place by the table entry.
# Out: DI advanced by 4 for every accepted mode; card_name points to
#      vesa_name once the VBE signature has been verified.
# Each stored entry is 4 bytes: mode ID word, rows byte, columns byte.
# Leaves through ret0 when the adapter is not VGA or the VBE call fails.
1025 | vesa_modes: | ||
1026 | cmpb $2, adapter # VGA only | ||
1027 | jnz ret0 | ||
1028 | |||
1029 | movw %di, %bp # BP=original mode table end | ||
1030 | addw $0x200, %di # Buffer space | ||
1031 | movw $0x4f00, %ax # VESA Get card info call | ||
1032 | int $0x10 | ||
1033 | movw %bp, %di | ||
1034 | cmpw $0x004f, %ax # Successful? | ||
1035 | jnz ret0 | ||
1036 | |||
# The info block must begin with the ASCII signature "VESA"
# (little-endian words 0x4556 = "VE" and 0x4153 = "SA").
1037 | cmpw $0x4556, 0x200(%di) | ||
1038 | jnz ret0 | ||
1039 | |||
1040 | cmpw $0x4153, 0x202(%di) | ||
1041 | jnz ret0 | ||
1042 | |||
1043 | movw $vesa_name, card_name # Set name to "VESA" | ||
1044 | pushw %gs | ||
1045 | lgsw 0x20e(%di), %si # GS:SI=mode list | ||
1046 | movw $128, %cx # Iteration limit | ||
1047 | vesa1: | ||
1048 | # gas version 2.9.1, using BFD version 2.9.1.0.23 buggers the next inst. | ||
1049 | # XXX: lodsw %gs:(%si), %ax # Get next mode in the list | ||
1050 | gs; lodsw | ||
1051 | cmpw $0xffff, %ax # End of the table? | ||
1052 | jz vesar | ||
1053 | |||
1054 | cmpw $0x0080, %ax # Check validity of mode ID | ||
1055 | jc vesa2 | ||
1056 | |||
1057 | orb %ah, %ah # Valid IDs: 0x0000-0x007f/0x0100-0x07ff | ||
1058 | jz vesan # Certain BIOSes report 0x80-0xff! | ||
1059 | |||
1060 | cmpw $0x0800, %ax | ||
1061 | jnc vesae | ||
1062 | |||
# Fetch the mode info block for this mode into the buffer at DI.  CX is
# the loop counter, so it is saved around the call that needs CX=mode.
1063 | vesa2: pushw %cx | ||
1064 | movw %ax, %cx # Get mode information structure | ||
1065 | movw $0x4f01, %ax | ||
1066 | int $0x10 | ||
1067 | movw %cx, %bx # BX=mode number | ||
1068 | addb $VIDEO_FIRST_VESA>>8, %bh | ||
1069 | popw %cx | ||
1070 | cmpw $0x004f, %ax | ||
1071 | jnz vesan # Don't report errors (buggy BIOSES) | ||
1072 | |||
# Attribute byte: bits 0 and 3 must be set and bit 4 clear.
1073 | movb (%di), %al # Check capabilities. We require | ||
1074 | andb $0x19, %al # a color text mode. | ||
1075 | cmpb $0x09, %al | ||
1076 | jnz vesan | ||
1077 | |||
1078 | cmpw $0xb800, 8(%di) # Standard video memory address required | ||
1079 | jnz vesan | ||
1080 | |||
# The movw below does not alter flags, so the jz still acts on the testb.
1081 | testb $2, (%di) # Mode characteristics supplied? | ||
1082 | movw %bx, (%di) # Store mode number | ||
1083 | jz vesa3 | ||
1084 | |||
# Width and height must each fit in one byte, and width*height must not
# exceed 8192 character cells.
1085 | xorw %dx, %dx | ||
1086 | movw 0x12(%di), %bx # Width | ||
1087 | orb %bh, %bh | ||
1088 | jnz vesan | ||
1089 | |||
1090 | movb %bl, 0x3(%di) | ||
1091 | movw 0x14(%di), %ax # Height | ||
1092 | orb %ah, %ah | ||
1093 | jnz vesan | ||
1094 | |||
1095 | movb %al, 2(%di) | ||
1096 | mulb %bl | ||
1097 | cmpw $8193, %ax # Small enough for Linux console driver? | ||
1098 | jnc vesan | ||
1099 | |||
1100 | jmp vesaok | ||
1101 | |||
# NOTE(review): BX holds the VBE mode number plus the VIDEO_FIRST_VESA
# prefix, so the 0x8108 bias maps modes 0x108-0x10c to 0..4 only if that
# prefix is 0x8000 -- confirm against the VIDEO_FIRST_VESA definition.
1102 | vesa3: subw $0x8108, %bx # This mode has no detailed info specified, | ||
1103 | jc vesan # so it must be a standard VESA mode. | ||
1104 | |||
1105 | cmpw $5, %bx | ||
1106 | jnc vesan | ||
1107 | addw %bx, %bx # Scale index: table entries are 2 bytes each | ||
1108 | movw vesa_text_mode_table(%bx), %ax | ||
1109 | movw %ax, 2(%di) | ||
1110 | vesaok: addw $4, %di # The mode is valid. Store it. | ||
# When CX runs out the loop falls through into the error path below.
1111 | vesan: loop vesa1 # Next mode. Limit exceeded => error | ||
1112 | vesae: leaw vesaer, %si | ||
1113 | call prtstr | ||
1114 | movw %bp, %di # Discard already found modes. | ||
1115 | vesar: popw %gs | ||
1116 | ret | ||
1117 | |||
# One 2-byte entry (rows, columns) per standard mode 0x108..0x10c.
1118 | # Dimensions of standard VESA text modes | ||
1119 | vesa_text_mode_table: | ||
1120 | .byte 60, 80 # 0108 | ||
1121 | .byte 25, 132 # 0109 | ||
1122 | .byte 43, 132 # 010A | ||
1123 | .byte 50, 132 # 010B | ||
1124 | .byte 60, 132 # 010C | ||
1125 | #endif /* CONFIG_VIDEO_VESA */ | ||
1126 | |||
1127 | # Scan for video modes. A bit dirty, but should work. | ||
# mode_scan: try BIOS modes 0x00-0x7f in turn and append every one that
# looks like a usable text mode to the mode table at ES:DI, then switch
# back to mode 3.  CL is the mode being tried; CH stays 0x01 and becomes
# the high byte of the stored mode ID.
# Each appended entry is 4 bytes: mode ID word, rows byte, columns byte.
# NOTE(review): the %gs:(0x484) and %gs:(0x44a) reads presumably target the
# BIOS data area, which needs GS=0 on entry -- confirm against the caller.
1128 | mode_scan: | ||
1129 | movw $0x0100, %cx # Start with mode 0 | ||
# Set the mode (AH=0), then ask the BIOS for the current mode (AH=0x0f)
# and require that it reports the one just requested.
1130 | scm1: movb $0, %ah # Test the mode | ||
1131 | movb %cl, %al | ||
1132 | int $0x10 | ||
1133 | movb $0x0f, %ah | ||
1134 | int $0x10 | ||
1135 | cmpb %cl, %al | ||
1136 | jnz scm2 # Mode not set | ||
1137 | |||
# Index 0x10 behind port 0x3c0: the low two bits must be clear.
1138 | movw $0x3c0, %dx # Test if it's a text mode | ||
1139 | movb $0x10, %al # Mode bits | ||
1140 | call inidx | ||
1141 | andb $0x03, %al | ||
1142 | jnz scm2 | ||
1143 | |||
# Only DL is changed, so DX becomes 0x3ce; bit 0 of index 6 must be clear.
1144 | movb $0xce, %dl # Another set of mode bits | ||
1145 | movb $0x06, %al | ||
1146 | call inidx | ||
1147 | shrb %al | ||
1148 | jc scm2 | ||
1149 | |||
# DX becomes 0x3d4; index 0x0f must read back as zero.
1150 | movb $0xd4, %dl # Cursor location | ||
1151 | movb $0x0f, %al | ||
1152 | call inidx | ||
1153 | orb %al, %al | ||
1154 | jnz scm2 | ||
1155 | |||
1156 | movw %cx, %ax # Ok, store the mode | ||
1157 | stosw | ||
1158 | movb %gs:(0x484), %al # Number of rows | ||
1159 | incb %al | ||
1160 | stosb | ||
1161 | movw %gs:(0x44a), %ax # Number of columns | ||
1162 | stosb | ||
# jns keeps looping until CL reaches 0x80 (sign bit set).
1163 | scm2: incb %cl | ||
1164 | jns scm1 | ||
1165 | |||
1166 | movw $0x0003, %ax # Return back to mode 3 | ||
1167 | int $0x10 | ||
1168 | ret | ||
1169 | |||
# tstidx: write index AL and data AH with one word OUT to port DX, then
#         fall through into inidx to read the register back.
# inidx:  select index AL on port DX and read the data port DX+1 into AL.
# Both return with DX restored to the index port; only AL is changed.
1170 | tstidx: outw %ax, %dx # OUT DX,AX and inidx | ||
1171 | inidx: outb %al, %dx # Read from indexed VGA register | ||
1172 | incw %dx # AL=index, DX=index reg port -> AL=data | ||
1173 | inb %dx, %al | ||
1174 | decw %dx | ||
1175 | ret | ||
1176 | |||
1177 | # Try to detect type of SVGA card and supply (usually approximate) video | ||
1178 | # mode table for it. | ||
1179 | |||
1180 | #ifdef CONFIG_VIDEO_SVGA | ||
# svga_modes: walk svga_table and run each card's test routine; for the
# first card that is detected, append its modes to the mode table at ES:DI
# and store a pointer to its name in card_name.
# Protocol with the test routines: BP holds the card's mode table pointer
# on entry and the routine clears BP to report "not this card".  SI, DI
# and ES are saved around the call, and ES is 0xc000 while the test runs.
1181 | svga_modes: | ||
1182 | leaw svga_table, %si # Test all known SVGA adapters | ||
1183 | dosvga: lodsw | ||
1184 | movw %ax, %bp # Default mode table | ||
# A zero word terminates svga_table: no card matched.
1185 | orw %ax, %ax | ||
1186 | jz didsv1 | ||
1187 | |||
1188 | lodsw # Pointer to test routine | ||
1189 | pushw %si | ||
1190 | pushw %di | ||
1191 | pushw %es | ||
1192 | movw $0xc000, %bx | ||
1193 | movw %bx, %es | ||
1194 | call *%ax # Call test routine | ||
1195 | popw %es | ||
1196 | popw %di | ||
1197 | popw %si | ||
1198 | orw %bp, %bp | ||
1199 | jz dosvga | ||
1200 | |||
# Each source entry is 3 bytes: mode number, then two dimension bytes.
# The stored ID is AH=svga_prefix, AL=mode number; movsw copies the two
# dimension bytes unchanged.  A zero mode number ends the list.
1201 | movw %bp, %si # Found, copy the modes | ||
1202 | movb svga_prefix, %ah | ||
1203 | cpsvga: lodsb | ||
1204 | orb %al, %al | ||
1205 | jz didsv | ||
1206 | |||
1207 | stosw | ||
1208 | movsw | ||
1209 | jmp cpsvga | ||
1210 | |||
# SI now points just past the terminating zero, i.e. at the card name.
1211 | didsv: movw %si, card_name # Store pointer to card name | ||
1212 | didsv1: ret | ||
1213 | |||
1214 | # Table of all known SVGA cards. For each card, we store a pointer to | ||
1215 | # a table of video modes supported by the card and a pointer to a routine | ||
1216 | # used for testing of presence of the card. The video mode table is always | ||
1217 | # followed by the name of the card or the chipset. | ||
1218 | svga_table: | ||
1219 | .word ati_md, ati_test | ||
1220 | .word oak_md, oak_test | ||
1221 | .word paradise_md, paradise_test | ||
1222 | .word realtek_md, realtek_test | ||
1223 | .word s3_md, s3_test | ||
1224 | .word chips_md, chips_test | ||
1225 | .word video7_md, video7_test | ||
1226 | .word cirrus5_md, cirrus5_test | ||
1227 | .word cirrus6_md, cirrus6_test | ||
1228 | .word cirrus1_md, cirrus1_test | ||
1229 | .word ahead_md, ahead_test | ||
1230 | .word everex_md, everex_test | ||
1231 | .word genoa_md, genoa_test | ||
1232 | .word trident_md, trident_test | ||
1233 | .word tseng_md, tseng_test | ||
1234 | .word 0 | ||
1235 | |||
1236 | # Test routines and mode tables: | ||
1237 | |||
1238 | # S3 - The test algorithm was taken from the SuperProbe package | ||
1239 | # for XFree86 1.2.1. Report bugs to Christoph.Niemann@linux.org | ||
# s3_test: probe for an S3 chip through the indexed registers at port 0x3d4.
# Called from svga_modes with BP = s3_md; clears BP when no S3 is detected.
# Constants kept in CX for the whole routine: CL = 0x35 (register index),
# CH = 0x0f (low-nibble mask).  BH saves register 0x38 and BL saves
# register 0x35; both registers are written back on every exit path.
# Outline:
#   1. Write 0x00 to register 0x38 and check whether the low nibble of
#      register 0x35 can still be changed; if it can be set to 0xf the
#      card is rejected (s3_2).
#   2. Write 0x48 to register 0x38 and require that the low nibble of
#      register 0x35 can now be both cleared and set (s3_1).
#   3. Read register 0x30 and look the value up in idS3.
1240 | s3_test: | ||
1241 | movw $0x0f35, %cx # we store some constants in cl/ch | ||
1242 | movw $0x03d4, %dx | ||
1243 | movb $0x38, %al | ||
1244 | call inidx | ||
1245 | movb %al, %bh # store current CRT-register 0x38 | ||
1246 | movw $0x0038, %ax | ||
1247 | call outidx # disable writing to special regs | ||
1248 | movb %cl, %al # check whether we can write special reg 0x35 | ||
1249 | call inidx | ||
1250 | movb %al, %bl # save the current value of CRT reg 0x35 | ||
1251 | andb $0xf0, %al # clear bits 0-3 | ||
1252 | movb %al, %ah | ||
1253 | movb %cl, %al # and write it to CRT reg 0x35 | ||
1254 | call outidx | ||
1255 | call inidx # now read it back | ||
1256 | andb %ch, %al # clear the upper 4 bits | ||
1257 | jz s3_2 # the first test failed. But we have a | ||
1258 | |||
1259 | movb %bl, %ah # second chance | ||
1260 | movb %cl, %al | ||
1261 | call outidx | ||
1262 | jmp s3_1 # do the other tests | ||
1263 | |||
# The read-back must be masked with CH (0x0f) so that the comparison with
# CH below can succeed; masking with CL (0x35) could never yield 0x0f.
1264 | s3_2: movw %cx, %ax # load ah with 0xf and al with 0x35 | ||
1265 | orb %bl, %ah # set the upper 4 bits of ah with the orig value | ||
1266 | call outidx # write ... | ||
1267 | call inidx # ... and reread | ||
1268 | andb %ch, %al # turn off the upper 4 bits | ||
1269 | pushw %ax | ||
1270 | movb %bl, %ah # restore old value in register 0x35 | ||
1271 | movb %cl, %al | ||
1272 | call outidx | ||
1273 | popw %ax | ||
1274 | cmpb %ch, %al # setting lower 4 bits was successful => bad | ||
1275 | je no_s3 # writing is allowed => this is not an S3 | ||
1276 | |||
1277 | s3_1: movw $0x4838, %ax # allow writing to special regs by putting | ||
1278 | call outidx # magic number into CRT-register 0x38 | ||
1279 | movb %cl, %al # check whether we can write special reg 0x35 | ||
1280 | call inidx | ||
1281 | movb %al, %bl | ||
1282 | andb $0xf0, %al | ||
1283 | movb %al, %ah | ||
1284 | movb %cl, %al | ||
1285 | call outidx | ||
1286 | call inidx | ||
1287 | andb %ch, %al | ||
1288 | jnz no_s3 # no, we can't write => no S3 | ||
1289 | |||
1290 | movw %cx, %ax | ||
1291 | orb %bl, %ah | ||
1292 | call outidx | ||
1293 | call inidx | ||
1294 | andb %ch, %al | ||
1295 | pushw %ax | ||
1296 | movb %bl, %ah # restore old value in register 0x35 | ||
1297 | movb %cl, %al | ||
1298 | call outidx | ||
1299 | popw %ax | ||
1300 | cmpb %ch, %al | ||
1301 | jne no_s31 # writing not possible => no S3 | ||
# NOTE(review): scasb compares AL with ES:DI, and svga_modes calls this
# routine with ES=0xc000, so unless DS is also 0xc000 the scan reads that
# segment rather than idS3.  The je below also takes the failure path when
# the ID IS found.  Confirm the intended behaviour before relying on this
# ID check; it is left unchanged here.
1302 | movb $0x30, %al | ||
1303 | call inidx # now get the S3 id ... | ||
1304 | leaw idS3, %di | ||
1305 | movw $0x10, %cx | ||
1306 | repne | ||
1307 | scasb | ||
1308 | je no_s31 | ||
1309 | |||
1310 | movb %bh, %ah | ||
1311 | movb $0x38, %al | ||
1312 | jmp s3rest | ||
1313 | |||
1314 | no_s3: movb $0x35, %al # restore CRT register 0x35 | ||
1315 | movb %bl, %ah | ||
1316 | call outidx | ||
1317 | no_s31: xorw %bp, %bp # Detection failed | ||
# Tail jump: outidx returns directly to the caller of s3_test.
1318 | s3rest: movb %bh, %ah | ||
1319 | movb $0x38, %al # restore old value of CRT register 0x38 | ||
1320 | jmp outidx | ||
1321 | |||
1322 | idS3: .byte 0x81, 0x82, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95 | ||
1323 | .byte 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa8, 0xb0 | ||
1324 | |||
1325 | s3_md: .byte 0x54, 0x2b, 0x84 | ||
1326 | .byte 0x55, 0x19, 0x84 | ||
1327 | .byte 0 | ||
1328 | .ascii "S3" | ||
1329 | .byte 0 | ||
1330 | |||
1331 | # ATI cards. | ||
# ati_test: compare the 9-byte signature idati (at DS:SI) with the bytes at
# ES:0x31; svga_modes calls the test routines with ES=0xc000.
# BP is left untouched on a match and cleared on a mismatch.
1332 | ati_test: | ||
1333 | leaw idati, %si | ||
1334 | movw $0x31, %di | ||
1335 | movw $0x09, %cx | ||
1336 | repe | ||
1337 | cmpsb | ||
1338 | je atiok | ||
1339 | |||
1340 | xorw %bp, %bp | ||
1341 | atiok: ret | ||
1342 | |||
1343 | idati: .ascii "761295520" | ||
1344 | |||
1345 | ati_md: .byte 0x23, 0x19, 0x84 | ||
1346 | .byte 0x33, 0x2c, 0x84 | ||
1347 | .byte 0x22, 0x1e, 0x64 | ||
1348 | .byte 0x21, 0x19, 0x64 | ||
1349 | .byte 0x58, 0x21, 0x50 | ||
1350 | .byte 0x5b, 0x1e, 0x50 | ||
1351 | .byte 0 | ||
1352 | .ascii "ATI" | ||
1353 | .byte 0 | ||
1354 | |||
1355 | # AHEAD | ||
# ahead_test: write 0x20 to index 0x0f behind port 0x3ce with one word OUT,
# then read the data port 0x3cf.  A value of 0x20 or 0x21 counts as a
# match; any other value clears BP.
# The previous register contents are not saved or restored.
1356 | ahead_test: | ||
1357 | movw $0x200f, %ax | ||
1358 | movw $0x3ce, %dx | ||
1359 | outw %ax, %dx | ||
1360 | incw %dx | ||
1361 | inb %dx, %al | ||
1362 | cmpb $0x20, %al | ||
1363 | je isahed | ||
1364 | |||
1365 | cmpb $0x21, %al | ||
1366 | je isahed | ||
1367 | |||
1368 | xorw %bp, %bp | ||
1369 | isahed: ret | ||
1370 | |||
1371 | ahead_md: | ||
1372 | .byte 0x22, 0x2c, 0x84 | ||
1373 | .byte 0x23, 0x19, 0x84 | ||
1374 | .byte 0x24, 0x1c, 0x84 | ||
1375 | .byte 0x2f, 0x32, 0xa0 | ||
1376 | .byte 0x32, 0x22, 0x50 | ||
1377 | .byte 0x34, 0x42, 0x50 | ||
1378 | .byte 0 | ||
1379 | .ascii "Ahead" | ||
1380 | .byte 0 | ||
1381 | |||
1382 | # Chips & Tech. | ||
# chips_test: set bit 4 of port 0x3c3, read port 0x104 into BL, clear
# bit 4 of port 0x3c3 again, then require BL == 0xa5.
# BP is cleared when the value differs.
1383 | chips_test: | ||
1384 | movw $0x3c3, %dx | ||
1385 | inb %dx, %al | ||
1386 | orb $0x10, %al | ||
1387 | outb %al, %dx | ||
1388 | movw $0x104, %dx | ||
1389 | inb %dx, %al | ||
1390 | movb %al, %bl | ||
1391 | movw $0x3c3, %dx | ||
1392 | inb %dx, %al | ||
1393 | andb $0xef, %al | ||
1394 | outb %al, %dx | ||
1395 | cmpb $0xa5, %bl | ||
1396 | je cantok | ||
1397 | |||
1398 | xorw %bp, %bp | ||
1399 | cantok: ret | ||
1400 | |||
1401 | chips_md: | ||
1402 | .byte 0x60, 0x19, 0x84 | ||
1403 | .byte 0x61, 0x32, 0x84 | ||
1404 | .byte 0 | ||
1405 | .ascii "Chips & Technologies" | ||
1406 | .byte 0 | ||
1407 | |||
1408 | # Cirrus Logic 5X0 | ||
# cirrus1_test: probe sequence for the "Cirrus Logic 5X0" entry.
#   - Save index 0x0c behind port 0x3d4 in BL and write 0 to it.
#   - Read index 0x1f behind port 0x3d4 into BH.
#   - Write BH with its two nibbles swapped to index 6 behind port 0x3c4;
#     the data port must then read 0.
#   - Write BH unchanged to the same data port; it must then read 1.
# BP is cleared if either read-back differs.  Index 0x0c behind port 0x3d4
# is restored from BL on every path.
1409 | cirrus1_test: | ||
1410 | movw $0x3d4, %dx | ||
1411 | movb $0x0c, %al | ||
1412 | outb %al, %dx | ||
1413 | incw %dx | ||
1414 | inb %dx, %al | ||
1415 | movb %al, %bl | ||
1416 | xorb %al, %al | ||
1417 | outb %al, %dx | ||
1418 | decw %dx | ||
1419 | movb $0x1f, %al | ||
1420 | outb %al, %dx | ||
1421 | incw %dx | ||
1422 | inb %dx, %al | ||
1423 | movb %al, %bh | ||
# Build AX = (nibble-swapped BH << 8) | 6 in CX: the low nibble is shifted
# up, the high nibble shifted down, and the two halves are added.
1424 | xorb %ah, %ah | ||
1425 | shlb $4, %al | ||
1426 | movw %ax, %cx | ||
1427 | movb %bh, %al | ||
1428 | shrb $4, %al | ||
1429 | addw %ax, %cx | ||
1430 | shlw $8, %cx | ||
1431 | addw $6, %cx | ||
1432 | movw %cx, %ax | ||
1433 | movw $0x3c4, %dx | ||
1434 | outw %ax, %dx | ||
1435 | incw %dx | ||
1436 | inb %dx, %al | ||
1437 | andb %al, %al | ||
1438 | jnz nocirr | ||
1439 | |||
1440 | movb %bh, %al | ||
1441 | outb %al, %dx | ||
1442 | inb %dx, %al | ||
1443 | cmpb $0x01, %al | ||
1444 | je iscirr | ||
1445 | |||
1446 | nocirr: xorw %bp, %bp | ||
# Restore index 0x0c from BL with a single word OUT (AH=data, AL=index).
1447 | iscirr: movw $0x3d4, %dx | ||
1448 | movb %bl, %al | ||
1449 | xorb %ah, %ah | ||
1450 | shlw $8, %ax | ||
1451 | addw $0x0c, %ax | ||
1452 | outw %ax, %dx | ||
1453 | ret | ||
1454 | |||
1455 | cirrus1_md: | ||
1456 | .byte 0x1f, 0x19, 0x84 | ||
1457 | .byte 0x20, 0x2c, 0x84 | ||
1458 | .byte 0x22, 0x1e, 0x84 | ||
1459 | .byte 0x31, 0x25, 0x64 | ||
1460 | .byte 0 | ||
1461 | .ascii "Cirrus Logic 5X0" | ||
1462 | .byte 0 | ||
1463 | |||
1464 | # Cirrus Logic 54XX | ||
# cirrus5_test: probe sequence for the "Cirrus Logic 54XX" entry, using the
# indexed registers behind port 0x3c4 (tstidx = write AH, read back AL).
#   - Index 6: writing 0x00 must read back 0x0f, writing 0x12 must read
#     back 0x12.
#   - Index 0x1e: its low six bits must be clearable and settable.
# BL saves index 6 and BH saves index 0x1e; both are written back before
# returning.  BP is cleared when any check fails.
1465 | cirrus5_test: | ||
1466 | movw $0x3c4, %dx | ||
1467 | movb $6, %al | ||
1468 | call inidx | ||
1469 | movb %al, %bl # BL=backup | ||
1470 | movw $6, %ax | ||
1471 | call tstidx | ||
1472 | cmpb $0x0f, %al | ||
1473 | jne c5fail | ||
1474 | |||
1475 | movw $0x1206, %ax | ||
1476 | call tstidx | ||
1477 | cmpb $0x12, %al | ||
1478 | jne c5fail | ||
1479 | |||
# Write index 0x1e with the low six bits cleared; they must read back 0.
1480 | movb $0x1e, %al | ||
1481 | call inidx | ||
1482 | movb %al, %bh | ||
1483 | movb %bh, %ah | ||
1484 | andb $0xc0, %ah | ||
1485 | movb $0x1e, %al | ||
1486 | call tstidx | ||
1487 | andb $0x3f, %al | ||
1488 | jne c5xx | ||
1489 | |||
# Write it again with the low six bits set; ZF ends up set only if all
# six read back as 1.
1490 | movb $0x1e, %al | ||
1491 | movb %bh, %ah | ||
1492 | orb $0x3f, %ah | ||
1493 | call tstidx | ||
1494 | xorb $0x3f, %al | ||
1495 | andb $0x3f, %al | ||
# The flags holding the verdict are saved across the restore of index 0x1e.
1496 | c5xx: pushf | ||
1497 | movb $0x1e, %al | ||
1498 | movb %bh, %ah | ||
1499 | outw %ax, %dx | ||
1500 | popf | ||
1501 | je c5done | ||
1502 | |||
1503 | c5fail: xorw %bp, %bp | ||
1504 | c5done: movb $6, %al | ||
1505 | movb %bl, %ah | ||
1506 | outw %ax, %dx | ||
1507 | ret | ||
1508 | |||
1509 | cirrus5_md: | ||
1510 | .byte 0x14, 0x19, 0x84 | ||
1511 | .byte 0x54, 0x2b, 0x84 | ||
1512 | .byte 0 | ||
1513 | .ascii "Cirrus Logic 54XX" | ||
1514 | .byte 0 | ||
1515 | |||
1516 | # Cirrus Logic 64XX -- no known extra modes, but must be identified, because | ||
1517 | # it's misidentified by the Ahead test. | ||
# cirrus6_test: probe sequence for the "Cirrus Logic 64XX" entry, using the
# indexed registers behind port 0x3ce (tstidx = write AH, read back AL).
#   - Index 0x0a: writing 0xce must read back 0, writing 0xec must read
#     back 1.
#   - Index 0xaa: the high nibble must be 4, 5, 7 or 8.
# BL saves index 0x0a, which is written back before returning.
# BP is cleared when any check fails.
1518 | cirrus6_test: | ||
1519 | movw $0x3ce, %dx | ||
1520 | movb $0x0a, %al | ||
1521 | call inidx | ||
1522 | movb %al, %bl # BL=backup | ||
1523 | movw $0xce0a, %ax | ||
1524 | call tstidx | ||
1525 | orb %al, %al | ||
1526 | jne c2fail | ||
1527 | |||
1528 | movw $0xec0a, %ax | ||
1529 | call tstidx | ||
1530 | cmpb $0x01, %al | ||
1531 | jne c2fail | ||
1532 | |||
# The chain below subtracts 4, 1, 2 and 1 in turn, so AL reaches zero for
# high-nibble values 4, 5, 7 and 8 respectively.
1533 | movb $0xaa, %al | ||
1534 | call inidx # 4X, 5X, 7X and 8X are valid 64XX chip ID's. | ||
1535 | shrb $4, %al | ||
1536 | subb $4, %al | ||
1537 | jz c6done | ||
1538 | |||
1539 | decb %al | ||
1540 | jz c6done | ||
1541 | |||
1542 | subb $2, %al | ||
1543 | jz c6done | ||
1544 | |||
1545 | decb %al | ||
1546 | jz c6done | ||
1547 | |||
1548 | c2fail: xorw %bp, %bp | ||
1549 | c6done: movb $0x0a, %al | ||
1550 | movb %bl, %ah | ||
1551 | outw %ax, %dx | ||
1552 | ret | ||
1553 | |||
1554 | cirrus6_md: | ||
1555 | .byte 0 | ||
1556 | .ascii "Cirrus Logic 64XX" | ||
1557 | .byte 0 | ||
1558 | |||
1559 | # Everex / Trident | ||
# everex_test: issue int 0x10 with AX=0x7000, BX=0 and require AL == 0x70
# on return; otherwise BP is cleared.
# On success DX >> 4 is examined: 0x678 and 0x236 switch the mode table to
# trident_md, any other value keeps the table passed in BP.
1560 | everex_test: | ||
1561 | movw $0x7000, %ax | ||
1562 | xorw %bx, %bx | ||
1563 | int $0x10 | ||
1564 | cmpb $0x70, %al | ||
1565 | jne noevrx | ||
1566 | |||
1567 | shrw $4, %dx | ||
1568 | cmpw $0x678, %dx | ||
1569 | je evtrid | ||
1570 | |||
1571 | cmpw $0x236, %dx | ||
1572 | jne evrxok | ||
1573 | |||
1574 | evtrid: leaw trident_md, %bp | ||
1575 | evrxok: ret | ||
1576 | |||
1577 | noevrx: xorw %bp, %bp | ||
1578 | ret | ||
1579 | |||
1580 | everex_md: | ||
1581 | .byte 0x03, 0x22, 0x50 | ||
1582 | .byte 0x04, 0x3c, 0x50 | ||
1583 | .byte 0x07, 0x2b, 0x64 | ||
1584 | .byte 0x08, 0x4b, 0x64 | ||
1585 | .byte 0x0a, 0x19, 0x84 | ||
1586 | .byte 0x0b, 0x2c, 0x84 | ||
1587 | .byte 0x16, 0x1e, 0x50 | ||
1588 | .byte 0x18, 0x1b, 0x64 | ||
1589 | .byte 0x21, 0x40, 0xa0 | ||
1590 | .byte 0x40, 0x1e, 0x84 | ||
1591 | .byte 0 | ||
1592 | .ascii "Everex/Trident" | ||
1593 | .byte 0 | ||
1594 | |||
1595 | # Genoa. | ||
# genoa_test: read the byte at ES:0x37, use it as an offset into ES, and
# compare the four bytes found there with idgenoa.  A zero byte in idgenoa
# acts as a wildcard: the compare is skipped with ZF set.
# svga_modes calls the test routines with ES=0xc000.
# BP is cleared when the loop stops early on a mismatch.
# NOTE(review): after the loop only CX is tested, and CX also reaches 0
# when the fourth comparison fails, so a mismatch in the last signature
# byte does not appear to be detected -- confirm.
1596 | genoa_test: | ||
1597 | leaw idgenoa, %si # Check Genoa 'clues' | ||
1598 | xorw %ax, %ax | ||
1599 | movb %es:(0x37), %al | ||
1600 | movw %ax, %di | ||
1601 | movw $0x04, %cx | ||
# Pre-decrement so the increments at the top of the loop land on byte 0.
1602 | decw %si | ||
1603 | decw %di | ||
1604 | l1: incw %si | ||
1605 | incw %di | ||
1606 | movb (%si), %al | ||
1607 | testb %al, %al | ||
1608 | jz l2 | ||
1609 | |||
1610 | cmpb %es:(%di), %al | ||
1611 | l2: loope l1 | ||
1612 | orw %cx, %cx | ||
1613 | je isgen | ||
1614 | |||
1615 | xorw %bp, %bp | ||
1616 | isgen: ret | ||
1617 | |||
1618 | idgenoa: .byte 0x77, 0x00, 0x99, 0x66 | ||
1619 | |||
1620 | genoa_md: | ||
1621 | .byte 0x58, 0x20, 0x50 | ||
1622 | .byte 0x5a, 0x2a, 0x64 | ||
1623 | .byte 0x60, 0x19, 0x84 | ||
1624 | .byte 0x61, 0x1d, 0x84 | ||
1625 | .byte 0x62, 0x20, 0x84 | ||
1626 | .byte 0x63, 0x2c, 0x84 | ||
1627 | .byte 0x64, 0x3c, 0x84 | ||
1628 | .byte 0x6b, 0x4f, 0x64 | ||
1629 | .byte 0x72, 0x3c, 0x50 | ||
1630 | .byte 0x74, 0x42, 0x50 | ||
1631 | .byte 0x78, 0x4b, 0x64 | ||
1632 | .byte 0 | ||
1633 | .ascii "Genoa" | ||
1634 | .byte 0 | ||
1635 | |||
1636 | # OAK | ||
# oak_test: compare the 8-byte signature idoakvga (at DS:SI) with the bytes
# at ES:0x08; svga_modes calls the test routines with ES=0xc000.
# BP is left untouched on a match and cleared on a mismatch.
1637 | oak_test: | ||
1638 | leaw idoakvga, %si | ||
1639 | movw $0x08, %di | ||
1640 | movw $0x08, %cx | ||
1641 | repe | ||
1642 | cmpsb | ||
1643 | je isoak | ||
1644 | |||
1645 | xorw %bp, %bp | ||
1646 | isoak: ret | ||
1647 | |||
1648 | idoakvga: .ascii "OAK VGA " | ||
1649 | |||
1650 | oak_md: .byte 0x4e, 0x3c, 0x50 | ||
1651 | .byte 0x4f, 0x3c, 0x84 | ||
1652 | .byte 0x50, 0x19, 0x84 | ||
1653 | .byte 0x51, 0x2b, 0x84 | ||
1654 | .byte 0 | ||
1655 | .ascii "OAK" | ||
1656 | .byte 0 | ||
1657 | |||
1658 | # WD Paradise. | ||
# paradise_test: compare the 4-byte signature idparadise (at DS:SI) with
# the bytes at ES:0x7d; svga_modes calls the test routines with ES=0xc000.
# BP is left untouched on a match and cleared on a mismatch.
1659 | paradise_test: | ||
1660 | leaw idparadise, %si | ||
1661 | movw $0x7d, %di | ||
1662 | movw $0x04, %cx | ||
1663 | repe | ||
1664 | cmpsb | ||
1665 | je ispara | ||
1666 | |||
1667 | xorw %bp, %bp | ||
1668 | ispara: ret | ||
1669 | |||
1670 | idparadise: .ascii "VGA=" | ||
1671 | |||
1672 | paradise_md: | ||
1673 | .byte 0x41, 0x22, 0x50 | ||
1674 | .byte 0x47, 0x1c, 0x84 | ||
1675 | .byte 0x55, 0x19, 0x84 | ||
1676 | .byte 0x54, 0x2c, 0x84 | ||
1677 | .byte 0 | ||
1678 | .ascii "Paradise" | ||
1679 | .byte 0 | ||
1680 | |||
1681 | # Trident. | ||
# trident_test: select index 0x0e behind port 0x3c4, read the old value,
# write 0 and read the register back.  The card matches when the low
# nibble of the value read back is 2; otherwise BP is cleared.
# Before returning, the old value is written back with bit 1 inverted
# (see the author's remark beside the code).
1682 | trident_test: | ||
1683 | movw $0x3c4, %dx | ||
1684 | movb $0x0e, %al | ||
1685 | outb %al, %dx | ||
1686 | incw %dx | ||
1687 | inb %dx, %al | ||
# After the two xchgb instructions AL holds the old value and AH holds the
# value read back after writing 0.
1688 | xchgb %al, %ah | ||
1689 | xorb %al, %al | ||
1690 | outb %al, %dx | ||
1691 | inb %dx, %al | ||
1692 | xchgb %ah, %al | ||
1693 | movb %al, %bl # Strange thing ... in the book this wasn't | ||
1694 | andb $0x02, %bl # necessary but it worked on my card which | ||
1695 | jz setb2 # is a trident. Without it the screen goes | ||
1696 | # blurred ... | ||
1697 | andb $0xfd, %al | ||
1698 | jmp clrb2 | ||
1699 | |||
1700 | setb2: orb $0x02, %al | ||
1701 | clrb2: outb %al, %dx | ||
1702 | andb $0x0f, %ah | ||
1703 | cmpb $0x02, %ah | ||
1704 | je istrid | ||
1705 | |||
1706 | xorw %bp, %bp | ||
1707 | istrid: ret | ||
1708 | |||
1709 | trident_md: | ||
1710 | .byte 0x50, 0x1e, 0x50 | ||
1711 | .byte 0x51, 0x2b, 0x50 | ||
1712 | .byte 0x52, 0x3c, 0x50 | ||
1713 | .byte 0x57, 0x19, 0x84 | ||
1714 | .byte 0x58, 0x1e, 0x84 | ||
1715 | .byte 0x59, 0x2b, 0x84 | ||
1716 | .byte 0x5a, 0x3c, 0x84 | ||
1717 | .byte 0 | ||
1718 | .ascii "Trident" | ||
1719 | .byte 0 | ||
1720 | |||
1721 | # Tseng. | ||
# tseng_test: save port 0x3cd in BL, write 0x55, read it back into AH and
# restore the saved value.  The card matches when 0x55 was read back;
# otherwise BP is cleared.
# The isnot label is also the failure exit used by video7_test.
1722 | tseng_test: | ||
1723 | movw $0x3cd, %dx | ||
1724 | inb %dx, %al # Could things be this simple ! :-) | ||
1725 | movb %al, %bl | ||
1726 | movb $0x55, %al | ||
1727 | outb %al, %dx | ||
1728 | inb %dx, %al | ||
1729 | movb %al, %ah | ||
1730 | movb %bl, %al | ||
1731 | outb %al, %dx | ||
1732 | cmpb $0x55, %ah | ||
1733 | je istsen | ||
1734 | |||
1735 | isnot: xorw %bp, %bp | ||
1736 | istsen: ret | ||
1737 | |||
1738 | tseng_md: | ||
1739 | .byte 0x26, 0x3c, 0x50 | ||
1740 | .byte 0x2a, 0x28, 0x64 | ||
1741 | .byte 0x23, 0x19, 0x84 | ||
1742 | .byte 0x24, 0x1c, 0x84 | ||
1743 | .byte 0x22, 0x2c, 0x84 | ||
1744 | .byte 0x21, 0x3c, 0x84 | ||
1745 | .byte 0 | ||
1746 | .ascii "Tseng" | ||
1747 | .byte 0 | ||
1748 | |||
1749 | # Video7. | ||
# video7_test: probe sequence for the "Video 7" entry.
#   - Bit 0 of port 0x3cc selects the index port: 0x3d4 when set, 0x3b4
#     when clear.
#   - Save index 0x0c in BL and write 0x55 to it.
#   - Read index 0x1f into BH, then restore index 0x0c from BL.
#   - The card matches when BH == 0x55 ^ 0xea.
# On a match svga_prefix is set to VIDEO_FIRST_V7>>8 and BP is left
# untouched; otherwise control goes to isnot (in tseng_test), which
# clears BP and returns.
1750 | video7_test: | ||
1751 | movw $0x3cc, %dx | ||
1752 | inb %dx, %al | ||
1753 | movw $0x3b4, %dx | ||
1754 | andb $0x01, %al | ||
1755 | jz even7 | ||
1756 | |||
1757 | movw $0x3d4, %dx | ||
1758 | even7: movb $0x0c, %al | ||
1759 | outb %al, %dx | ||
1760 | incw %dx | ||
1761 | inb %dx, %al | ||
1762 | movb %al, %bl | ||
1763 | movb $0x55, %al | ||
1764 | outb %al, %dx | ||
# The value read here is discarded; AL is overwritten two lines below.
1765 | inb %dx, %al | ||
1766 | decw %dx | ||
1767 | movb $0x1f, %al | ||
1768 | outb %al, %dx | ||
1769 | incw %dx | ||
1770 | inb %dx, %al | ||
1771 | movb %al, %bh | ||
1772 | decw %dx | ||
1773 | movb $0x0c, %al | ||
1774 | outb %al, %dx | ||
1775 | incw %dx | ||
1776 | movb %bl, %al | ||
1777 | outb %al, %dx | ||
1778 | movb $0x55, %al | ||
1779 | xorb $0xea, %al | ||
1780 | cmpb %bh, %al | ||
1781 | jne isnot | ||
1782 | |||
1783 | movb $VIDEO_FIRST_V7>>8, svga_prefix # Use special mode switching | ||
1784 | ret | ||
1785 | |||
1786 | video7_md: | ||
1787 | .byte 0x40, 0x2b, 0x50 | ||
1788 | .byte 0x43, 0x3c, 0x50 | ||
1789 | .byte 0x44, 0x3c, 0x64 | ||
1790 | .byte 0x41, 0x19, 0x84 | ||
1791 | .byte 0x42, 0x2c, 0x84 | ||
1792 | .byte 0x45, 0x1c, 0x84 | ||
1793 | .byte 0 | ||
1794 | .ascii "Video 7" | ||
1795 | .byte 0 | ||
1796 | |||
1797 | # Realtek VGA | ||
# realtek_test: compare the 11-byte signature idrtvga (at DS:SI) with the
# bytes at ES:0x45; svga_modes calls the test routines with ES=0xc000.
# BP is left untouched on a match and cleared on a mismatch.
1798 | realtek_test: | ||
1799 | leaw idrtvga, %si | ||
1800 | movw $0x45, %di | ||
1801 | movw $0x0b, %cx | ||
1802 | repe | ||
1803 | cmpsb | ||
1804 | je isrt | ||
1805 | |||
1806 | xorw %bp, %bp | ||
1807 | isrt: ret | ||
1808 | |||
1809 | idrtvga: .ascii "REALTEK VGA" | ||
1810 | |||
1811 | realtek_md: | ||
1812 | .byte 0x1a, 0x3c, 0x50 | ||
1813 | .byte 0x1b, 0x19, 0x84 | ||
1814 | .byte 0x1c, 0x1e, 0x84 | ||
1815 | .byte 0x1d, 0x2b, 0x84 | ||
1816 | .byte 0x1e, 0x3c, 0x84 | ||
1817 | .byte 0 | ||
1818 | .ascii "REALTEK" | ||
1819 | .byte 0 | ||
1820 | |||
1821 | #endif /* CONFIG_VIDEO_SVGA */ | ||
1822 | |||
1823 | # User-defined local mode table (VGA only) | ||
1824 | #ifdef CONFIG_VIDEO_LOCAL | ||
# local_modes: append the entries of local_mode_table to the mode table at
# ES:DI.  Each entry is a mode ID word followed by two dimension bytes;
# a zero ID word ends the copy.
# Clobbers AX and SI; DI is advanced by 4 per copied entry.
1825 | local_modes: | ||
1826 | leaw local_mode_table, %si | ||
1827 | locm1: lodsw | ||
1828 | orw %ax, %ax | ||
1829 | jz locm2 | ||
1830 | |||
1831 | stosw | ||
1832 | movsw | ||
1833 | jmp locm1 | ||
1834 | |||
1835 | locm2: ret | ||
1836 | |||
1837 | # This is the table of local video modes which can be supplied manually | ||
1838 | # by the user. Each entry consists of mode ID (word) and dimensions | ||
1839 | # (byte for column count and another byte for row count). These modes | ||
1840 | # are placed before all SVGA and VESA modes and override them if table | ||
1841 | # compacting is enabled. The table must end with a zero word followed | ||
1842 | # by NUL-terminated video adapter name. | ||
1843 | local_mode_table: | ||
1844 | .word 0x0100 # Example: 40x25 | ||
1845 | .byte 25,40 | ||
1846 | .word 0 | ||
1847 | .ascii "Local" | ||
1848 | .byte 0 | ||
1849 | #endif /* CONFIG_VIDEO_LOCAL */ | ||
1850 | |||
1851 | # Read a key and return the ASCII code in al, scan code in ah | ||
# getkey: call int 0x16 with AH=0 and return whatever it leaves in AX.
# getkt and flush use AH=1 first to check whether a key is waiting.
1852 | getkey: xorb %ah, %ah | ||
1853 | int $0x16 | ||
1854 | ret | ||
1855 | |||
1856 | # Read a key with a timeout of 30 seconds. | ||
1857 | # The hardware clock is used to get the time. | ||
# getkt: wait for a key press, giving up when the clock reaches the target
# second.  Returns the key through getkey, or AL=0x20 (space) on timeout.
# CL holds the target value: (current value + 30) modulo 60.
# NOTE(review): gettime is defined elsewhere; this code presumably expects
# it to return the current seconds (0-59) in AL -- confirm.
1858 | getkt: call gettime | ||
1859 | addb $30, %al # Wait 30 seconds | ||
1860 | cmpb $60, %al | ||
1861 | jl lminute | ||
1862 | |||
1863 | subb $60, %al | ||
1864 | lminute: | ||
1865 | movb %al, %cl | ||
# int 0x16 with AH=1 reports through ZF whether a key is waiting.
1866 | again: movb $0x01, %ah | ||
1867 | int $0x16 | ||
1868 | jnz getkey # key pressed, so get it | ||
1869 | |||
1870 | call gettime | ||
1871 | cmpb %cl, %al | ||
1872 | jne again | ||
1873 | |||
1874 | movb $0x20, %al # timeout, return `space' | ||
1875 | ret | ||
1876 | |||
1877 | # Flush the keyboard buffer | ||
# flush: keep reading keys (int 0x16, AH=0) for as long as the AH=1 check
# reports one waiting, discarding them.  Clobbers AX.
1878 | flush: movb $0x01, %ah | ||
1879 | int $0x16 | ||
1880 | jz empty | ||
1881 | |||
1882 | xorb %ah, %ah | ||
1883 | int $0x16 | ||
1884 | jmp flush | ||
1885 | |||
1886 | empty: ret | ||
1887 | |||
1888 | # Print hexadecimal number. | ||
# prthw: print AX as four hexadecimal digits.
# prthb: print AL as two hexadecimal digits.
# prthn: print the value in AL (expected 0..15) as one hexadecimal digit.
# Each entry point falls through into the next for its low-order part, and
# prthn ends with a tail jump to prtchr (defined elsewhere), which does the
# actual output.  prthw and prthb reload AX from the stack before falling
# through.
1889 | prthw: pushw %ax | ||
1890 | movb %ah, %al | ||
1891 | call prthb | ||
1892 | popw %ax | ||
1893 | prthb: pushw %ax | ||
1894 | shrb $4, %al | ||
1895 | call prthn | ||
1896 | popw %ax | ||
1897 | andb $0x0f, %al | ||
# Values 10..15 get an extra 7 so that they map to 'A'..'F' after the
# '0' (0x30) offset is added.
1898 | prthn: cmpb $0x0a, %al | ||
1899 | jc prth1 | ||
1900 | |||
1901 | addb $0x07, %al | ||
1902 | prth1: addb $0x30, %al | ||
1903 | jmp prtchr | ||
1904 | |||
1905 | # Print decimal number in al | ||
# prtdec: print the unsigned value in AL in decimal through prtchr.
# AX and CX are preserved.  The value is divided by 10; a quotient above 9
# is printed by a recursive call, otherwise as a single digit, and the
# remainder is printed last.  Values below 10 are therefore printed with a
# leading '0'.
1906 | prtdec: pushw %ax | ||
1907 | pushw %cx | ||
1908 | xorb %ah, %ah | ||
1909 | movb $0x0a, %cl | ||
# After the divide AL holds the quotient and AH the remainder.
1910 | idivb %cl | ||
1911 | cmpb $0x09, %al | ||
1912 | jbe lt100 | ||
1913 | |||
# The recursive call restores AX on return, so AH still holds the
# remainder at skip10.
1914 | call prtdec | ||
1915 | jmp skip10 | ||
1916 | |||
1917 | lt100: addb $0x30, %al | ||
1918 | call prtchr | ||
1919 | skip10: movb %ah, %al | ||
1920 | addb $0x30, %al | ||
1921 | call prtchr | ||
1922 | popw %cx | ||
1923 | popw %ax | ||
1924 | ret | ||
1925 | |||
# store_edid: fill the 128 bytes at FS:0x140 with 0x13, then issue the
# VBE/DDC call (int 0x10, AX=0x4f15, BX=1, CX=0, DX=1) with ES:DI pointing
# at that buffer.  The status returned by the BIOS is not checked.
# ES, AX, BX, CX, DX and DI are preserved.
# NOTE(review): the 0x13 fill presumably lets a later reader recognise a
# buffer the BIOS never wrote -- confirm against the consumer of 0x140.
1926 | store_edid: | ||
1927 | pushw %es # just save all registers | ||
1928 | pushw %ax | ||
1929 | pushw %bx | ||
1930 | pushw %cx | ||
1931 | pushw %dx | ||
1932 | pushw %di | ||
1933 | |||
# ES = FS, so the string stores and the BIOS call use the FS segment.
1934 | pushw %fs | ||
1935 | popw %es | ||
1936 | |||
# 32 dword stores = 128 bytes.
1937 | movl $0x13131313, %eax # memset block with 0x13 | ||
1938 | movw $32, %cx | ||
1939 | movw $0x140, %di | ||
1940 | cld | ||
1941 | rep | ||
1942 | stosl | ||
1943 | |||
1944 | movw $0x4f15, %ax # do VBE/DDC | ||
1945 | movw $0x01, %bx | ||
1946 | movw $0x00, %cx | ||
1947 | movw $0x01, %dx | ||
1948 | movw $0x140, %di | ||
1949 | int $0x10 | ||
1950 | |||
1951 | popw %di # restore all registers | ||
1952 | popw %dx | ||
1953 | popw %cx | ||
1954 | popw %bx | ||
1955 | popw %ax | ||
1956 | popw %es | ||
1957 | ret | ||
1958 | |||
1959 | # VIDEO_SELECT-only variables | ||
1960 | mt_end: .word 0 # End of video mode table if built | ||
1961 | edit_buf: .space 6 # Line editor buffer | ||
1962 | card_name: .word 0 # Pointer to adapter name | ||
1963 | scanning: .byte 0 # Performing mode scan | ||
1964 | do_restore: .byte 0 # Screen contents altered during mode change | ||
1965 | svga_prefix: .byte VIDEO_FIRST_BIOS>>8 # Default prefix for BIOS modes | ||
1966 | graphic_mode: .byte 0 # Graphic mode with a linear frame buffer | ||
1967 | dac_size: .byte 6 # DAC bit depth | ||
1968 | |||
1969 | # Status messages | ||
1970 | keymsg: .ascii "Press <RETURN> to see video modes available, " | ||
1971 | .ascii "<SPACE> to continue or wait 30 secs" | ||
1972 | .byte 0x0d, 0x0a, 0 | ||
1973 | |||
1974 | listhdr: .byte 0x0d, 0x0a | ||
1975 | .ascii "Mode: COLSxROWS:" | ||
1976 | |||
1977 | crlft: .byte 0x0d, 0x0a, 0 | ||
1978 | |||
1979 | prompt: .byte 0x0d, 0x0a | ||
1980 | .asciz "Enter mode number or `scan': " | ||
1981 | |||
1982 | unknt: .asciz "Unknown mode ID. Try again." | ||
1983 | |||
1984 | badmdt: .ascii "You passed an undefined mode number." | ||
1985 | .byte 0x0d, 0x0a, 0 | ||
1986 | |||
1987 | vesaer: .ascii "Error: Scanning of VESA modes failed. Please " | ||
1988 | .ascii "report to <mj@ucw.cz>." | ||
1989 | .byte 0x0d, 0x0a, 0 | ||
1990 | |||
1991 | old_name: .asciz "CGA/MDA/HGA" | ||
1992 | |||
1993 | ega_name: .asciz "EGA" | ||
1994 | |||
1995 | svga_name: .ascii " " | ||
1996 | |||
1997 | vga_name: .asciz "VGA" | ||
1998 | |||
1999 | vesa_name: .asciz "VESA" | ||
2000 | |||
2001 | name_bann: .asciz "Video adapter: " | ||
2002 | #endif /* CONFIG_VIDEO_SELECT */ | ||
2003 | |||
2004 | # Other variables: | ||
2005 | adapter: .byte 0 # Video adapter: 0=CGA/MDA/HGA,1=EGA,2=VGA | ||
2006 | video_segment: .word 0xb800 # Video memory segment | ||
2007 | force_size: .word 0 # Use this size instead of the one in BIOS vars | ||
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig new file mode 100644 index 000000000000..9ce51dee30b3 --- /dev/null +++ b/arch/x86_64/defconfig | |||
@@ -0,0 +1,1129 @@ | |||
1 | # | ||
2 | # Automatically generated make config: don't edit | ||
3 | # Linux kernel version: 2.6.11-bk7 | ||
4 | # Sat Mar 12 23:43:44 2005 | ||
5 | # | ||
6 | CONFIG_X86_64=y | ||
7 | CONFIG_64BIT=y | ||
8 | CONFIG_X86=y | ||
9 | CONFIG_MMU=y | ||
10 | CONFIG_RWSEM_GENERIC_SPINLOCK=y | ||
11 | CONFIG_GENERIC_CALIBRATE_DELAY=y | ||
12 | CONFIG_X86_CMPXCHG=y | ||
13 | CONFIG_EARLY_PRINTK=y | ||
14 | CONFIG_HPET_TIMER=y | ||
15 | CONFIG_HPET_EMULATE_RTC=y | ||
16 | CONFIG_GENERIC_ISA_DMA=y | ||
17 | CONFIG_GENERIC_IOMAP=y | ||
18 | |||
19 | # | ||
20 | # Code maturity level options | ||
21 | # | ||
22 | CONFIG_EXPERIMENTAL=y | ||
23 | CONFIG_CLEAN_COMPILE=y | ||
24 | CONFIG_LOCK_KERNEL=y | ||
25 | |||
26 | # | ||
27 | # General setup | ||
28 | # | ||
29 | CONFIG_LOCALVERSION="" | ||
30 | CONFIG_SWAP=y | ||
31 | CONFIG_SYSVIPC=y | ||
32 | CONFIG_POSIX_MQUEUE=y | ||
33 | # CONFIG_BSD_PROCESS_ACCT is not set | ||
34 | CONFIG_SYSCTL=y | ||
35 | # CONFIG_AUDIT is not set | ||
36 | CONFIG_LOG_BUF_SHIFT=18 | ||
37 | # CONFIG_HOTPLUG is not set | ||
38 | CONFIG_KOBJECT_UEVENT=y | ||
39 | CONFIG_IKCONFIG=y | ||
40 | CONFIG_IKCONFIG_PROC=y | ||
41 | # CONFIG_CPUSETS is not set | ||
42 | # CONFIG_EMBEDDED is not set | ||
43 | CONFIG_KALLSYMS=y | ||
44 | CONFIG_KALLSYMS_ALL=y | ||
45 | # CONFIG_KALLSYMS_EXTRA_PASS is not set | ||
46 | CONFIG_BASE_FULL=y | ||
47 | CONFIG_FUTEX=y | ||
48 | CONFIG_EPOLL=y | ||
49 | # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set | ||
50 | CONFIG_SHMEM=y | ||
51 | CONFIG_CC_ALIGN_FUNCTIONS=0 | ||
52 | CONFIG_CC_ALIGN_LABELS=0 | ||
53 | CONFIG_CC_ALIGN_LOOPS=0 | ||
54 | CONFIG_CC_ALIGN_JUMPS=0 | ||
55 | # CONFIG_TINY_SHMEM is not set | ||
56 | CONFIG_BASE_SMALL=0 | ||
57 | |||
58 | # | ||
59 | # Loadable module support | ||
60 | # | ||
61 | CONFIG_MODULES=y | ||
62 | CONFIG_MODULE_UNLOAD=y | ||
63 | CONFIG_MODULE_FORCE_UNLOAD=y | ||
64 | CONFIG_OBSOLETE_MODPARM=y | ||
65 | # CONFIG_MODVERSIONS is not set | ||
66 | # CONFIG_MODULE_SRCVERSION_ALL is not set | ||
67 | # CONFIG_KMOD is not set | ||
68 | CONFIG_STOP_MACHINE=y | ||
69 | |||
70 | # | ||
71 | # Processor type and features | ||
72 | # | ||
73 | # CONFIG_MK8 is not set | ||
74 | # CONFIG_MPSC is not set | ||
75 | CONFIG_GENERIC_CPU=y | ||
76 | CONFIG_X86_L1_CACHE_BYTES=128 | ||
77 | CONFIG_X86_L1_CACHE_SHIFT=7 | ||
78 | CONFIG_X86_TSC=y | ||
79 | CONFIG_X86_GOOD_APIC=y | ||
80 | # CONFIG_MICROCODE is not set | ||
81 | CONFIG_X86_MSR=y | ||
82 | CONFIG_X86_CPUID=y | ||
83 | CONFIG_X86_HT=y | ||
84 | CONFIG_X86_IO_APIC=y | ||
85 | CONFIG_X86_LOCAL_APIC=y | ||
86 | CONFIG_MTRR=y | ||
87 | CONFIG_SMP=y | ||
88 | # CONFIG_PREEMPT is not set | ||
89 | CONFIG_SCHED_SMT=y | ||
90 | CONFIG_K8_NUMA=y | ||
91 | # CONFIG_NUMA_EMU is not set | ||
92 | CONFIG_DISCONTIGMEM=y | ||
93 | CONFIG_NUMA=y | ||
94 | CONFIG_HAVE_DEC_LOCK=y | ||
95 | CONFIG_NR_CPUS=8 | ||
96 | CONFIG_GART_IOMMU=y | ||
97 | CONFIG_SWIOTLB=y | ||
98 | CONFIG_X86_MCE=y | ||
99 | CONFIG_X86_MCE_INTEL=y | ||
100 | CONFIG_SECCOMP=y | ||
101 | CONFIG_GENERIC_HARDIRQS=y | ||
102 | CONFIG_GENERIC_IRQ_PROBE=y | ||
103 | |||
104 | # | ||
105 | # Power management options | ||
106 | # | ||
107 | CONFIG_PM=y | ||
108 | # CONFIG_PM_DEBUG is not set | ||
109 | CONFIG_SOFTWARE_SUSPEND=y | ||
110 | CONFIG_PM_STD_PARTITION="" | ||
111 | |||
112 | # | ||
113 | # ACPI (Advanced Configuration and Power Interface) Support | ||
114 | # | ||
115 | CONFIG_ACPI=y | ||
116 | CONFIG_ACPI_BOOT=y | ||
117 | CONFIG_ACPI_INTERPRETER=y | ||
118 | CONFIG_ACPI_SLEEP=y | ||
119 | CONFIG_ACPI_SLEEP_PROC_FS=y | ||
120 | CONFIG_ACPI_AC=y | ||
121 | CONFIG_ACPI_BATTERY=y | ||
122 | CONFIG_ACPI_BUTTON=y | ||
123 | # CONFIG_ACPI_VIDEO is not set | ||
124 | CONFIG_ACPI_FAN=y | ||
125 | CONFIG_ACPI_PROCESSOR=y | ||
126 | CONFIG_ACPI_THERMAL=y | ||
127 | CONFIG_ACPI_NUMA=y | ||
128 | # CONFIG_ACPI_ASUS is not set | ||
129 | # CONFIG_ACPI_IBM is not set | ||
130 | CONFIG_ACPI_TOSHIBA=y | ||
131 | CONFIG_ACPI_BLACKLIST_YEAR=2001 | ||
132 | CONFIG_ACPI_DEBUG=y | ||
133 | CONFIG_ACPI_BUS=y | ||
134 | CONFIG_ACPI_EC=y | ||
135 | CONFIG_ACPI_POWER=y | ||
136 | CONFIG_ACPI_PCI=y | ||
137 | CONFIG_ACPI_SYSTEM=y | ||
138 | # CONFIG_ACPI_CONTAINER is not set | ||
139 | |||
140 | # | ||
141 | # CPU Frequency scaling | ||
142 | # | ||
143 | CONFIG_CPU_FREQ=y | ||
144 | # CONFIG_CPU_FREQ_DEBUG is not set | ||
145 | CONFIG_CPU_FREQ_STAT=y | ||
146 | # CONFIG_CPU_FREQ_STAT_DETAILS is not set | ||
147 | CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y | ||
148 | # CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set | ||
149 | CONFIG_CPU_FREQ_GOV_PERFORMANCE=y | ||
150 | # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set | ||
151 | CONFIG_CPU_FREQ_GOV_USERSPACE=y | ||
152 | CONFIG_CPU_FREQ_GOV_ONDEMAND=y | ||
153 | CONFIG_CPU_FREQ_TABLE=y | ||
154 | |||
155 | # | ||
156 | # CPUFreq processor drivers | ||
157 | # | ||
158 | CONFIG_X86_POWERNOW_K8=y | ||
159 | CONFIG_X86_POWERNOW_K8_ACPI=y | ||
160 | # CONFIG_X86_SPEEDSTEP_CENTRINO is not set | ||
161 | CONFIG_X86_ACPI_CPUFREQ=y | ||
162 | |||
163 | # | ||
164 | # shared options | ||
165 | # | ||
166 | CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y | ||
167 | |||
168 | # | ||
169 | # Bus options (PCI etc.) | ||
170 | # | ||
171 | CONFIG_PCI=y | ||
172 | CONFIG_PCI_DIRECT=y | ||
173 | CONFIG_PCI_MMCONFIG=y | ||
174 | CONFIG_UNORDERED_IO=y | ||
175 | CONFIG_PCI_MSI=y | ||
176 | # CONFIG_PCI_LEGACY_PROC is not set | ||
177 | # CONFIG_PCI_NAMES is not set | ||
178 | |||
179 | # | ||
180 | # PCCARD (PCMCIA/CardBus) support | ||
181 | # | ||
182 | # CONFIG_PCCARD is not set | ||
183 | |||
184 | # | ||
185 | # PC-card bridges | ||
186 | # | ||
187 | |||
188 | # | ||
189 | # PCI Hotplug Support | ||
190 | # | ||
191 | # CONFIG_HOTPLUG_PCI is not set | ||
192 | |||
193 | # | ||
194 | # Executable file formats / Emulations | ||
195 | # | ||
196 | CONFIG_BINFMT_ELF=y | ||
197 | # CONFIG_BINFMT_MISC is not set | ||
198 | CONFIG_IA32_EMULATION=y | ||
199 | CONFIG_IA32_AOUT=y | ||
200 | CONFIG_COMPAT=y | ||
201 | CONFIG_SYSVIPC_COMPAT=y | ||
202 | CONFIG_UID16=y | ||
203 | |||
204 | # | ||
205 | # Device Drivers | ||
206 | # | ||
207 | |||
208 | # | ||
209 | # Generic Driver Options | ||
210 | # | ||
211 | CONFIG_STANDALONE=y | ||
212 | CONFIG_PREVENT_FIRMWARE_BUILD=y | ||
213 | # CONFIG_FW_LOADER is not set | ||
214 | # CONFIG_DEBUG_DRIVER is not set | ||
215 | |||
216 | # | ||
217 | # Memory Technology Devices (MTD) | ||
218 | # | ||
219 | # CONFIG_MTD is not set | ||
220 | |||
221 | # | ||
222 | # Parallel port support | ||
223 | # | ||
224 | # CONFIG_PARPORT is not set | ||
225 | |||
226 | # | ||
227 | # Plug and Play support | ||
228 | # | ||
229 | # CONFIG_PNP is not set | ||
230 | |||
231 | # | ||
232 | # Block devices | ||
233 | # | ||
234 | CONFIG_BLK_DEV_FD=y | ||
235 | # CONFIG_BLK_CPQ_DA is not set | ||
236 | # CONFIG_BLK_CPQ_CISS_DA is not set | ||
237 | # CONFIG_BLK_DEV_DAC960 is not set | ||
238 | # CONFIG_BLK_DEV_UMEM is not set | ||
239 | # CONFIG_BLK_DEV_COW_COMMON is not set | ||
240 | CONFIG_BLK_DEV_LOOP=y | ||
241 | # CONFIG_BLK_DEV_CRYPTOLOOP is not set | ||
242 | # CONFIG_BLK_DEV_NBD is not set | ||
243 | # CONFIG_BLK_DEV_SX8 is not set | ||
244 | # CONFIG_BLK_DEV_UB is not set | ||
245 | CONFIG_BLK_DEV_RAM=y | ||
246 | CONFIG_BLK_DEV_RAM_COUNT=16 | ||
247 | CONFIG_BLK_DEV_RAM_SIZE=4096 | ||
248 | CONFIG_BLK_DEV_INITRD=y | ||
249 | CONFIG_INITRAMFS_SOURCE="" | ||
250 | CONFIG_LBD=y | ||
251 | # CONFIG_CDROM_PKTCDVD is not set | ||
252 | |||
253 | # | ||
254 | # IO Schedulers | ||
255 | # | ||
256 | CONFIG_IOSCHED_NOOP=y | ||
257 | CONFIG_IOSCHED_AS=y | ||
258 | CONFIG_IOSCHED_DEADLINE=y | ||
259 | CONFIG_IOSCHED_CFQ=y | ||
260 | # CONFIG_ATA_OVER_ETH is not set | ||
261 | |||
262 | # | ||
263 | # ATA/ATAPI/MFM/RLL support | ||
264 | # | ||
265 | CONFIG_IDE=y | ||
266 | CONFIG_BLK_DEV_IDE=y | ||
267 | |||
268 | # | ||
269 | # Please see Documentation/ide.txt for help/info on IDE drives | ||
270 | # | ||
271 | # CONFIG_BLK_DEV_IDE_SATA is not set | ||
272 | # CONFIG_BLK_DEV_HD_IDE is not set | ||
273 | CONFIG_BLK_DEV_IDEDISK=y | ||
274 | CONFIG_IDEDISK_MULTI_MODE=y | ||
275 | CONFIG_BLK_DEV_IDECD=y | ||
276 | # CONFIG_BLK_DEV_IDETAPE is not set | ||
277 | # CONFIG_BLK_DEV_IDEFLOPPY is not set | ||
278 | # CONFIG_BLK_DEV_IDESCSI is not set | ||
279 | # CONFIG_IDE_TASK_IOCTL is not set | ||
280 | |||
281 | # | ||
282 | # IDE chipset support/bugfixes | ||
283 | # | ||
284 | CONFIG_IDE_GENERIC=y | ||
285 | # CONFIG_BLK_DEV_CMD640 is not set | ||
286 | CONFIG_BLK_DEV_IDEPCI=y | ||
287 | # CONFIG_IDEPCI_SHARE_IRQ is not set | ||
288 | # CONFIG_BLK_DEV_OFFBOARD is not set | ||
289 | # CONFIG_BLK_DEV_GENERIC is not set | ||
290 | # CONFIG_BLK_DEV_OPTI621 is not set | ||
291 | # CONFIG_BLK_DEV_RZ1000 is not set | ||
292 | CONFIG_BLK_DEV_IDEDMA_PCI=y | ||
293 | # CONFIG_BLK_DEV_IDEDMA_FORCED is not set | ||
294 | CONFIG_IDEDMA_PCI_AUTO=y | ||
295 | # CONFIG_IDEDMA_ONLYDISK is not set | ||
296 | # CONFIG_BLK_DEV_AEC62XX is not set | ||
297 | # CONFIG_BLK_DEV_ALI15X3 is not set | ||
298 | CONFIG_BLK_DEV_AMD74XX=y | ||
299 | # CONFIG_BLK_DEV_ATIIXP is not set | ||
300 | # CONFIG_BLK_DEV_CMD64X is not set | ||
301 | # CONFIG_BLK_DEV_TRIFLEX is not set | ||
302 | # CONFIG_BLK_DEV_CY82C693 is not set | ||
303 | # CONFIG_BLK_DEV_CS5520 is not set | ||
304 | # CONFIG_BLK_DEV_CS5530 is not set | ||
305 | # CONFIG_BLK_DEV_HPT34X is not set | ||
306 | # CONFIG_BLK_DEV_HPT366 is not set | ||
307 | # CONFIG_BLK_DEV_SC1200 is not set | ||
308 | CONFIG_BLK_DEV_PIIX=y | ||
309 | # CONFIG_BLK_DEV_NS87415 is not set | ||
310 | # CONFIG_BLK_DEV_PDC202XX_OLD is not set | ||
311 | # CONFIG_BLK_DEV_PDC202XX_NEW is not set | ||
312 | # CONFIG_BLK_DEV_SVWKS is not set | ||
313 | # CONFIG_BLK_DEV_SIIMAGE is not set | ||
314 | # CONFIG_BLK_DEV_SIS5513 is not set | ||
315 | # CONFIG_BLK_DEV_SLC90E66 is not set | ||
316 | # CONFIG_BLK_DEV_TRM290 is not set | ||
317 | # CONFIG_BLK_DEV_VIA82CXXX is not set | ||
318 | # CONFIG_IDE_ARM is not set | ||
319 | CONFIG_BLK_DEV_IDEDMA=y | ||
320 | # CONFIG_IDEDMA_IVB is not set | ||
321 | CONFIG_IDEDMA_AUTO=y | ||
322 | # CONFIG_BLK_DEV_HD is not set | ||
323 | |||
324 | # | ||
325 | # SCSI device support | ||
326 | # | ||
327 | CONFIG_SCSI=y | ||
328 | # CONFIG_SCSI_PROC_FS is not set | ||
329 | |||
330 | # | ||
331 | # SCSI support type (disk, tape, CD-ROM) | ||
332 | # | ||
333 | CONFIG_BLK_DEV_SD=y | ||
334 | # CONFIG_CHR_DEV_ST is not set | ||
335 | # CONFIG_CHR_DEV_OSST is not set | ||
336 | # CONFIG_BLK_DEV_SR is not set | ||
337 | # CONFIG_CHR_DEV_SG is not set | ||
338 | |||
339 | # | ||
340 | # Some SCSI devices (e.g. CD jukebox) support multiple LUNs | ||
341 | # | ||
342 | # CONFIG_SCSI_MULTI_LUN is not set | ||
343 | # CONFIG_SCSI_CONSTANTS is not set | ||
344 | # CONFIG_SCSI_LOGGING is not set | ||
345 | |||
346 | # | ||
347 | # SCSI Transport Attributes | ||
348 | # | ||
349 | # CONFIG_SCSI_SPI_ATTRS is not set | ||
350 | # CONFIG_SCSI_FC_ATTRS is not set | ||
351 | # CONFIG_SCSI_ISCSI_ATTRS is not set | ||
352 | |||
353 | # | ||
354 | # SCSI low-level drivers | ||
355 | # | ||
356 | CONFIG_BLK_DEV_3W_XXXX_RAID=y | ||
357 | # CONFIG_SCSI_3W_9XXX is not set | ||
358 | # CONFIG_SCSI_ACARD is not set | ||
359 | # CONFIG_SCSI_AACRAID is not set | ||
360 | # CONFIG_SCSI_AIC7XXX is not set | ||
361 | # CONFIG_SCSI_AIC7XXX_OLD is not set | ||
362 | CONFIG_SCSI_AIC79XX=y | ||
363 | CONFIG_AIC79XX_CMDS_PER_DEVICE=32 | ||
364 | CONFIG_AIC79XX_RESET_DELAY_MS=4000 | ||
365 | # CONFIG_AIC79XX_ENABLE_RD_STRM is not set | ||
366 | # CONFIG_AIC79XX_DEBUG_ENABLE is not set | ||
367 | CONFIG_AIC79XX_DEBUG_MASK=0 | ||
368 | # CONFIG_AIC79XX_REG_PRETTY_PRINT is not set | ||
369 | # CONFIG_MEGARAID_NEWGEN is not set | ||
370 | # CONFIG_MEGARAID_LEGACY is not set | ||
371 | CONFIG_SCSI_SATA=y | ||
372 | # CONFIG_SCSI_SATA_AHCI is not set | ||
373 | # CONFIG_SCSI_SATA_SVW is not set | ||
374 | CONFIG_SCSI_ATA_PIIX=y | ||
375 | # CONFIG_SCSI_SATA_NV is not set | ||
376 | # CONFIG_SCSI_SATA_PROMISE is not set | ||
377 | # CONFIG_SCSI_SATA_QSTOR is not set | ||
378 | # CONFIG_SCSI_SATA_SX4 is not set | ||
379 | # CONFIG_SCSI_SATA_SIL is not set | ||
380 | # CONFIG_SCSI_SATA_SIS is not set | ||
381 | # CONFIG_SCSI_SATA_ULI is not set | ||
382 | CONFIG_SCSI_SATA_VIA=y | ||
383 | # CONFIG_SCSI_SATA_VITESSE is not set | ||
384 | # CONFIG_SCSI_BUSLOGIC is not set | ||
385 | # CONFIG_SCSI_DMX3191D is not set | ||
386 | # CONFIG_SCSI_EATA is not set | ||
387 | # CONFIG_SCSI_EATA_PIO is not set | ||
388 | # CONFIG_SCSI_FUTURE_DOMAIN is not set | ||
389 | # CONFIG_SCSI_GDTH is not set | ||
390 | # CONFIG_SCSI_IPS is not set | ||
391 | # CONFIG_SCSI_INITIO is not set | ||
392 | # CONFIG_SCSI_INIA100 is not set | ||
393 | # CONFIG_SCSI_SYM53C8XX_2 is not set | ||
394 | # CONFIG_SCSI_IPR is not set | ||
395 | # CONFIG_SCSI_QLOGIC_ISP is not set | ||
396 | # CONFIG_SCSI_QLOGIC_FC is not set | ||
397 | # CONFIG_SCSI_QLOGIC_1280 is not set | ||
398 | CONFIG_SCSI_QLA2XXX=y | ||
399 | # CONFIG_SCSI_QLA21XX is not set | ||
400 | # CONFIG_SCSI_QLA22XX is not set | ||
401 | # CONFIG_SCSI_QLA2300 is not set | ||
402 | # CONFIG_SCSI_QLA2322 is not set | ||
403 | # CONFIG_SCSI_QLA6312 is not set | ||
404 | # CONFIG_SCSI_DC395x is not set | ||
405 | # CONFIG_SCSI_DC390T is not set | ||
406 | # CONFIG_SCSI_DEBUG is not set | ||
407 | |||
408 | # | ||
409 | # Multi-device support (RAID and LVM) | ||
410 | # | ||
411 | # CONFIG_MD is not set | ||
412 | |||
413 | # | ||
414 | # Fusion MPT device support | ||
415 | # | ||
416 | CONFIG_FUSION=y | ||
417 | CONFIG_FUSION_MAX_SGE=40 | ||
418 | # CONFIG_FUSION_CTL is not set | ||
419 | |||
420 | # | ||
421 | # IEEE 1394 (FireWire) support | ||
422 | # | ||
423 | # CONFIG_IEEE1394 is not set | ||
424 | |||
425 | # | ||
426 | # I2O device support | ||
427 | # | ||
428 | # CONFIG_I2O is not set | ||
429 | |||
430 | # | ||
431 | # Networking support | ||
432 | # | ||
433 | CONFIG_NET=y | ||
434 | |||
435 | # | ||
436 | # Networking options | ||
437 | # | ||
438 | CONFIG_PACKET=y | ||
439 | # CONFIG_PACKET_MMAP is not set | ||
440 | # CONFIG_NETLINK_DEV is not set | ||
441 | CONFIG_UNIX=y | ||
442 | # CONFIG_NET_KEY is not set | ||
443 | CONFIG_INET=y | ||
444 | CONFIG_IP_MULTICAST=y | ||
445 | # CONFIG_IP_ADVANCED_ROUTER is not set | ||
446 | # CONFIG_IP_PNP is not set | ||
447 | # CONFIG_NET_IPIP is not set | ||
448 | # CONFIG_NET_IPGRE is not set | ||
449 | # CONFIG_IP_MROUTE is not set | ||
450 | # CONFIG_ARPD is not set | ||
451 | # CONFIG_SYN_COOKIES is not set | ||
452 | # CONFIG_INET_AH is not set | ||
453 | # CONFIG_INET_ESP is not set | ||
454 | # CONFIG_INET_IPCOMP is not set | ||
455 | # CONFIG_INET_TUNNEL is not set | ||
456 | CONFIG_IP_TCPDIAG=y | ||
457 | CONFIG_IP_TCPDIAG_IPV6=y | ||
458 | CONFIG_IPV6=y | ||
459 | # CONFIG_IPV6_PRIVACY is not set | ||
460 | # CONFIG_INET6_AH is not set | ||
461 | # CONFIG_INET6_ESP is not set | ||
462 | # CONFIG_INET6_IPCOMP is not set | ||
463 | # CONFIG_INET6_TUNNEL is not set | ||
464 | # CONFIG_IPV6_TUNNEL is not set | ||
465 | # CONFIG_NETFILTER is not set | ||
466 | |||
467 | # | ||
468 | # SCTP Configuration (EXPERIMENTAL) | ||
469 | # | ||
470 | # CONFIG_IP_SCTP is not set | ||
471 | # CONFIG_ATM is not set | ||
472 | # CONFIG_BRIDGE is not set | ||
473 | # CONFIG_VLAN_8021Q is not set | ||
474 | # CONFIG_DECNET is not set | ||
475 | # CONFIG_LLC2 is not set | ||
476 | # CONFIG_IPX is not set | ||
477 | # CONFIG_ATALK is not set | ||
478 | # CONFIG_X25 is not set | ||
479 | # CONFIG_LAPB is not set | ||
480 | # CONFIG_NET_DIVERT is not set | ||
481 | # CONFIG_ECONET is not set | ||
482 | # CONFIG_WAN_ROUTER is not set | ||
483 | |||
484 | # | ||
485 | # QoS and/or fair queueing | ||
486 | # | ||
487 | # CONFIG_NET_SCHED is not set | ||
488 | # CONFIG_NET_CLS_ROUTE is not set | ||
489 | |||
490 | # | ||
491 | # Network testing | ||
492 | # | ||
493 | # CONFIG_NET_PKTGEN is not set | ||
494 | CONFIG_NETPOLL=y | ||
495 | # CONFIG_NETPOLL_RX is not set | ||
496 | # CONFIG_NETPOLL_TRAP is not set | ||
497 | CONFIG_NET_POLL_CONTROLLER=y | ||
498 | # CONFIG_HAMRADIO is not set | ||
499 | # CONFIG_IRDA is not set | ||
500 | # CONFIG_BT is not set | ||
501 | CONFIG_NETDEVICES=y | ||
502 | # CONFIG_DUMMY is not set | ||
503 | # CONFIG_BONDING is not set | ||
504 | # CONFIG_EQUALIZER is not set | ||
505 | # CONFIG_TUN is not set | ||
506 | |||
507 | # | ||
508 | # ARCnet devices | ||
509 | # | ||
510 | # CONFIG_ARCNET is not set | ||
511 | |||
512 | # | ||
513 | # Ethernet (10 or 100Mbit) | ||
514 | # | ||
515 | CONFIG_NET_ETHERNET=y | ||
516 | CONFIG_MII=y | ||
517 | # CONFIG_HAPPYMEAL is not set | ||
518 | # CONFIG_SUNGEM is not set | ||
519 | # CONFIG_NET_VENDOR_3COM is not set | ||
520 | |||
521 | # | ||
522 | # Tulip family network device support | ||
523 | # | ||
524 | # CONFIG_NET_TULIP is not set | ||
525 | # CONFIG_HP100 is not set | ||
526 | CONFIG_NET_PCI=y | ||
527 | # CONFIG_PCNET32 is not set | ||
528 | CONFIG_AMD8111_ETH=y | ||
529 | # CONFIG_AMD8111E_NAPI is not set | ||
530 | # CONFIG_ADAPTEC_STARFIRE is not set | ||
531 | # CONFIG_B44 is not set | ||
532 | CONFIG_FORCEDETH=y | ||
533 | # CONFIG_DGRS is not set | ||
534 | # CONFIG_EEPRO100 is not set | ||
535 | # CONFIG_E100 is not set | ||
536 | # CONFIG_FEALNX is not set | ||
537 | # CONFIG_NATSEMI is not set | ||
538 | # CONFIG_NE2K_PCI is not set | ||
539 | CONFIG_8139CP=m | ||
540 | CONFIG_8139TOO=y | ||
541 | # CONFIG_8139TOO_PIO is not set | ||
542 | # CONFIG_8139TOO_TUNE_TWISTER is not set | ||
543 | # CONFIG_8139TOO_8129 is not set | ||
544 | # CONFIG_8139_OLD_RX_RESET is not set | ||
545 | # CONFIG_SIS900 is not set | ||
546 | # CONFIG_EPIC100 is not set | ||
547 | # CONFIG_SUNDANCE is not set | ||
548 | # CONFIG_VIA_RHINE is not set | ||
549 | |||
550 | # | ||
551 | # Ethernet (1000 Mbit) | ||
552 | # | ||
553 | # CONFIG_ACENIC is not set | ||
554 | # CONFIG_DL2K is not set | ||
555 | CONFIG_E1000=y | ||
556 | # CONFIG_E1000_NAPI is not set | ||
557 | # CONFIG_NS83820 is not set | ||
558 | # CONFIG_HAMACHI is not set | ||
559 | # CONFIG_YELLOWFIN is not set | ||
560 | # CONFIG_R8169 is not set | ||
561 | # CONFIG_SK98LIN is not set | ||
562 | # CONFIG_VIA_VELOCITY is not set | ||
563 | CONFIG_TIGON3=y | ||
564 | |||
565 | # | ||
566 | # Ethernet (10000 Mbit) | ||
567 | # | ||
568 | # CONFIG_IXGB is not set | ||
569 | CONFIG_S2IO=m | ||
570 | # CONFIG_S2IO_NAPI is not set | ||
571 | # CONFIG_2BUFF_MODE is not set | ||
572 | |||
573 | # | ||
574 | # Token Ring devices | ||
575 | # | ||
576 | # CONFIG_TR is not set | ||
577 | |||
578 | # | ||
579 | # Wireless LAN (non-hamradio) | ||
580 | # | ||
581 | # CONFIG_NET_RADIO is not set | ||
582 | |||
583 | # | ||
584 | # Wan interfaces | ||
585 | # | ||
586 | # CONFIG_WAN is not set | ||
587 | # CONFIG_FDDI is not set | ||
588 | # CONFIG_HIPPI is not set | ||
589 | # CONFIG_PPP is not set | ||
590 | # CONFIG_SLIP is not set | ||
591 | # CONFIG_NET_FC is not set | ||
592 | # CONFIG_SHAPER is not set | ||
593 | CONFIG_NETCONSOLE=y | ||
594 | |||
595 | # | ||
596 | # ISDN subsystem | ||
597 | # | ||
598 | # CONFIG_ISDN is not set | ||
599 | |||
600 | # | ||
601 | # Telephony Support | ||
602 | # | ||
603 | # CONFIG_PHONE is not set | ||
604 | |||
605 | # | ||
606 | # Input device support | ||
607 | # | ||
608 | CONFIG_INPUT=y | ||
609 | |||
610 | # | ||
611 | # Userland interfaces | ||
612 | # | ||
613 | CONFIG_INPUT_MOUSEDEV=y | ||
614 | CONFIG_INPUT_MOUSEDEV_PSAUX=y | ||
615 | CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 | ||
616 | CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 | ||
617 | # CONFIG_INPUT_JOYDEV is not set | ||
618 | # CONFIG_INPUT_TSDEV is not set | ||
619 | CONFIG_INPUT_EVDEV=y | ||
620 | # CONFIG_INPUT_EVBUG is not set | ||
621 | |||
622 | # | ||
623 | # Input Device Drivers | ||
624 | # | ||
625 | CONFIG_INPUT_KEYBOARD=y | ||
626 | CONFIG_KEYBOARD_ATKBD=y | ||
627 | # CONFIG_KEYBOARD_SUNKBD is not set | ||
628 | # CONFIG_KEYBOARD_LKKBD is not set | ||
629 | # CONFIG_KEYBOARD_XTKBD is not set | ||
630 | # CONFIG_KEYBOARD_NEWTON is not set | ||
631 | CONFIG_INPUT_MOUSE=y | ||
632 | CONFIG_MOUSE_PS2=y | ||
633 | # CONFIG_MOUSE_SERIAL is not set | ||
634 | # CONFIG_MOUSE_VSXXXAA is not set | ||
635 | # CONFIG_INPUT_JOYSTICK is not set | ||
636 | # CONFIG_INPUT_TOUCHSCREEN is not set | ||
637 | # CONFIG_INPUT_MISC is not set | ||
638 | |||
639 | # | ||
640 | # Hardware I/O ports | ||
641 | # | ||
642 | CONFIG_SERIO=y | ||
643 | CONFIG_SERIO_I8042=y | ||
644 | # CONFIG_SERIO_SERPORT is not set | ||
645 | # CONFIG_SERIO_CT82C710 is not set | ||
646 | # CONFIG_SERIO_PCIPS2 is not set | ||
647 | CONFIG_SERIO_LIBPS2=y | ||
648 | # CONFIG_SERIO_RAW is not set | ||
649 | # CONFIG_GAMEPORT is not set | ||
650 | CONFIG_SOUND_GAMEPORT=y | ||
651 | |||
652 | # | ||
653 | # Character devices | ||
654 | # | ||
655 | CONFIG_VT=y | ||
656 | CONFIG_VT_CONSOLE=y | ||
657 | CONFIG_HW_CONSOLE=y | ||
658 | # CONFIG_SERIAL_NONSTANDARD is not set | ||
659 | |||
660 | # | ||
661 | # Serial drivers | ||
662 | # | ||
663 | CONFIG_SERIAL_8250=y | ||
664 | CONFIG_SERIAL_8250_CONSOLE=y | ||
665 | # CONFIG_SERIAL_8250_ACPI is not set | ||
666 | CONFIG_SERIAL_8250_NR_UARTS=4 | ||
667 | # CONFIG_SERIAL_8250_EXTENDED is not set | ||
668 | |||
669 | # | ||
670 | # Non-8250 serial port support | ||
671 | # | ||
672 | CONFIG_SERIAL_CORE=y | ||
673 | CONFIG_SERIAL_CORE_CONSOLE=y | ||
674 | CONFIG_UNIX98_PTYS=y | ||
675 | CONFIG_LEGACY_PTYS=y | ||
676 | CONFIG_LEGACY_PTY_COUNT=256 | ||
677 | |||
678 | # | ||
679 | # IPMI | ||
680 | # | ||
681 | # CONFIG_IPMI_HANDLER is not set | ||
682 | |||
683 | # | ||
684 | # Watchdog Cards | ||
685 | # | ||
686 | # CONFIG_WATCHDOG is not set | ||
687 | CONFIG_HW_RANDOM=y | ||
688 | # CONFIG_NVRAM is not set | ||
689 | CONFIG_RTC=y | ||
690 | # CONFIG_DTLK is not set | ||
691 | # CONFIG_R3964 is not set | ||
692 | # CONFIG_APPLICOM is not set | ||
693 | |||
694 | # | ||
695 | # Ftape, the floppy tape device driver | ||
696 | # | ||
697 | CONFIG_AGP=y | ||
698 | CONFIG_AGP_AMD64=y | ||
699 | # CONFIG_DRM is not set | ||
700 | # CONFIG_MWAVE is not set | ||
701 | CONFIG_RAW_DRIVER=y | ||
702 | CONFIG_HPET=y | ||
703 | # CONFIG_HPET_RTC_IRQ is not set | ||
704 | CONFIG_HPET_MMAP=y | ||
705 | CONFIG_MAX_RAW_DEVS=256 | ||
706 | CONFIG_HANGCHECK_TIMER=y | ||
707 | |||
708 | # | ||
709 | # TPM devices | ||
710 | # | ||
711 | # CONFIG_TCG_TPM is not set | ||
712 | |||
713 | # | ||
714 | # I2C support | ||
715 | # | ||
716 | # CONFIG_I2C is not set | ||
717 | |||
718 | # | ||
719 | # Dallas's 1-wire bus | ||
720 | # | ||
721 | # CONFIG_W1 is not set | ||
722 | |||
723 | # | ||
724 | # Misc devices | ||
725 | # | ||
726 | # CONFIG_IBM_ASM is not set | ||
727 | |||
728 | # | ||
729 | # Multimedia devices | ||
730 | # | ||
731 | # CONFIG_VIDEO_DEV is not set | ||
732 | |||
733 | # | ||
734 | # Digital Video Broadcasting Devices | ||
735 | # | ||
736 | # CONFIG_DVB is not set | ||
737 | |||
738 | # | ||
739 | # Graphics support | ||
740 | # | ||
741 | # CONFIG_FB is not set | ||
742 | CONFIG_VIDEO_SELECT=y | ||
743 | |||
744 | # | ||
745 | # Console display driver support | ||
746 | # | ||
747 | CONFIG_VGA_CONSOLE=y | ||
748 | CONFIG_DUMMY_CONSOLE=y | ||
749 | |||
750 | # | ||
751 | # Sound | ||
752 | # | ||
753 | CONFIG_SOUND=y | ||
754 | |||
755 | # | ||
756 | # Advanced Linux Sound Architecture | ||
757 | # | ||
758 | # CONFIG_SND is not set | ||
759 | |||
760 | # | ||
761 | # Open Sound System | ||
762 | # | ||
763 | CONFIG_SOUND_PRIME=y | ||
764 | # CONFIG_SOUND_BT878 is not set | ||
765 | # CONFIG_SOUND_CMPCI is not set | ||
766 | # CONFIG_SOUND_EMU10K1 is not set | ||
767 | # CONFIG_SOUND_FUSION is not set | ||
768 | # CONFIG_SOUND_CS4281 is not set | ||
769 | # CONFIG_SOUND_ES1370 is not set | ||
770 | # CONFIG_SOUND_ES1371 is not set | ||
771 | # CONFIG_SOUND_ESSSOLO1 is not set | ||
772 | # CONFIG_SOUND_MAESTRO is not set | ||
773 | # CONFIG_SOUND_MAESTRO3 is not set | ||
774 | CONFIG_SOUND_ICH=y | ||
775 | # CONFIG_SOUND_SONICVIBES is not set | ||
776 | # CONFIG_SOUND_TRIDENT is not set | ||
777 | # CONFIG_SOUND_MSNDCLAS is not set | ||
778 | # CONFIG_SOUND_MSNDPIN is not set | ||
779 | # CONFIG_SOUND_VIA82CXXX is not set | ||
780 | # CONFIG_SOUND_OSS is not set | ||
781 | # CONFIG_SOUND_ALI5455 is not set | ||
782 | # CONFIG_SOUND_FORTE is not set | ||
783 | # CONFIG_SOUND_RME96XX is not set | ||
784 | # CONFIG_SOUND_AD1980 is not set | ||
785 | |||
786 | # | ||
787 | # USB support | ||
788 | # | ||
789 | CONFIG_USB=y | ||
790 | # CONFIG_USB_DEBUG is not set | ||
791 | |||
792 | # | ||
793 | # Miscellaneous USB options | ||
794 | # | ||
795 | CONFIG_USB_DEVICEFS=y | ||
796 | # CONFIG_USB_BANDWIDTH is not set | ||
797 | # CONFIG_USB_DYNAMIC_MINORS is not set | ||
798 | # CONFIG_USB_SUSPEND is not set | ||
799 | # CONFIG_USB_OTG is not set | ||
800 | CONFIG_USB_ARCH_HAS_HCD=y | ||
801 | CONFIG_USB_ARCH_HAS_OHCI=y | ||
802 | |||
803 | # | ||
804 | # USB Host Controller Drivers | ||
805 | # | ||
806 | CONFIG_USB_EHCI_HCD=y | ||
807 | # CONFIG_USB_EHCI_SPLIT_ISO is not set | ||
808 | # CONFIG_USB_EHCI_ROOT_HUB_TT is not set | ||
809 | CONFIG_USB_OHCI_HCD=y | ||
810 | # CONFIG_USB_OHCI_BIG_ENDIAN is not set | ||
811 | CONFIG_USB_OHCI_LITTLE_ENDIAN=y | ||
812 | CONFIG_USB_UHCI_HCD=y | ||
813 | # CONFIG_USB_SL811_HCD is not set | ||
814 | |||
815 | # | ||
816 | # USB Device Class drivers | ||
817 | # | ||
818 | # CONFIG_USB_AUDIO is not set | ||
819 | # CONFIG_USB_BLUETOOTH_TTY is not set | ||
820 | # CONFIG_USB_MIDI is not set | ||
821 | # CONFIG_USB_ACM is not set | ||
822 | CONFIG_USB_PRINTER=y | ||
823 | |||
824 | # | ||
825 | # NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information | ||
826 | # | ||
827 | CONFIG_USB_STORAGE=y | ||
828 | # CONFIG_USB_STORAGE_DEBUG is not set | ||
829 | # CONFIG_USB_STORAGE_RW_DETECT is not set | ||
830 | # CONFIG_USB_STORAGE_DATAFAB is not set | ||
831 | # CONFIG_USB_STORAGE_FREECOM is not set | ||
832 | # CONFIG_USB_STORAGE_ISD200 is not set | ||
833 | # CONFIG_USB_STORAGE_DPCM is not set | ||
834 | # CONFIG_USB_STORAGE_USBAT is not set | ||
835 | # CONFIG_USB_STORAGE_SDDR09 is not set | ||
836 | # CONFIG_USB_STORAGE_SDDR55 is not set | ||
837 | # CONFIG_USB_STORAGE_JUMPSHOT is not set | ||
838 | |||
839 | # | ||
840 | # USB Input Devices | ||
841 | # | ||
842 | CONFIG_USB_HID=y | ||
843 | CONFIG_USB_HIDINPUT=y | ||
844 | # CONFIG_HID_FF is not set | ||
845 | # CONFIG_USB_HIDDEV is not set | ||
846 | # CONFIG_USB_AIPTEK is not set | ||
847 | # CONFIG_USB_WACOM is not set | ||
848 | # CONFIG_USB_KBTAB is not set | ||
849 | # CONFIG_USB_POWERMATE is not set | ||
850 | # CONFIG_USB_MTOUCH is not set | ||
851 | # CONFIG_USB_EGALAX is not set | ||
852 | # CONFIG_USB_XPAD is not set | ||
853 | # CONFIG_USB_ATI_REMOTE is not set | ||
854 | |||
855 | # | ||
856 | # USB Imaging devices | ||
857 | # | ||
858 | # CONFIG_USB_MDC800 is not set | ||
859 | # CONFIG_USB_MICROTEK is not set | ||
860 | |||
861 | # | ||
862 | # USB Multimedia devices | ||
863 | # | ||
864 | # CONFIG_USB_DABUSB is not set | ||
865 | |||
866 | # | ||
867 | # Video4Linux support is needed for USB Multimedia device support | ||
868 | # | ||
869 | |||
870 | # | ||
871 | # USB Network Adapters | ||
872 | # | ||
873 | # CONFIG_USB_CATC is not set | ||
874 | # CONFIG_USB_KAWETH is not set | ||
875 | # CONFIG_USB_PEGASUS is not set | ||
876 | # CONFIG_USB_RTL8150 is not set | ||
877 | # CONFIG_USB_USBNET is not set | ||
878 | CONFIG_USB_MON=y | ||
879 | |||
880 | # | ||
881 | # USB port drivers | ||
882 | # | ||
883 | |||
884 | # | ||
885 | # USB Serial Converter support | ||
886 | # | ||
887 | # CONFIG_USB_SERIAL is not set | ||
888 | |||
889 | # | ||
890 | # USB Miscellaneous drivers | ||
891 | # | ||
892 | # CONFIG_USB_EMI62 is not set | ||
893 | # CONFIG_USB_EMI26 is not set | ||
894 | # CONFIG_USB_AUERSWALD is not set | ||
895 | # CONFIG_USB_RIO500 is not set | ||
896 | # CONFIG_USB_LEGOTOWER is not set | ||
897 | # CONFIG_USB_LCD is not set | ||
898 | # CONFIG_USB_LED is not set | ||
899 | # CONFIG_USB_CYTHERM is not set | ||
900 | # CONFIG_USB_PHIDGETKIT is not set | ||
901 | # CONFIG_USB_PHIDGETSERVO is not set | ||
902 | # CONFIG_USB_IDMOUSE is not set | ||
903 | # CONFIG_USB_SISUSBVGA is not set | ||
904 | # CONFIG_USB_TEST is not set | ||
905 | |||
906 | # | ||
907 | # USB ATM/DSL drivers | ||
908 | # | ||
909 | |||
910 | # | ||
911 | # USB Gadget Support | ||
912 | # | ||
913 | # CONFIG_USB_GADGET is not set | ||
914 | |||
915 | # | ||
916 | # MMC/SD Card support | ||
917 | # | ||
918 | # CONFIG_MMC is not set | ||
919 | |||
920 | # | ||
921 | # InfiniBand support | ||
922 | # | ||
923 | # CONFIG_INFINIBAND is not set | ||
924 | |||
925 | # | ||
926 | # Firmware Drivers | ||
927 | # | ||
928 | # CONFIG_EDD is not set | ||
929 | |||
930 | # | ||
931 | # File systems | ||
932 | # | ||
933 | CONFIG_EXT2_FS=y | ||
934 | CONFIG_EXT2_FS_XATTR=y | ||
935 | CONFIG_EXT2_FS_POSIX_ACL=y | ||
936 | # CONFIG_EXT2_FS_SECURITY is not set | ||
937 | CONFIG_EXT3_FS=y | ||
938 | CONFIG_EXT3_FS_XATTR=y | ||
939 | CONFIG_EXT3_FS_POSIX_ACL=y | ||
940 | # CONFIG_EXT3_FS_SECURITY is not set | ||
941 | CONFIG_JBD=y | ||
942 | # CONFIG_JBD_DEBUG is not set | ||
943 | CONFIG_FS_MBCACHE=y | ||
944 | CONFIG_REISERFS_FS=y | ||
945 | # CONFIG_REISERFS_CHECK is not set | ||
946 | # CONFIG_REISERFS_PROC_INFO is not set | ||
947 | CONFIG_REISERFS_FS_XATTR=y | ||
948 | CONFIG_REISERFS_FS_POSIX_ACL=y | ||
949 | # CONFIG_REISERFS_FS_SECURITY is not set | ||
950 | # CONFIG_JFS_FS is not set | ||
951 | CONFIG_FS_POSIX_ACL=y | ||
952 | |||
953 | # | ||
954 | # XFS support | ||
955 | # | ||
956 | # CONFIG_XFS_FS is not set | ||
957 | # CONFIG_MINIX_FS is not set | ||
958 | # CONFIG_ROMFS_FS is not set | ||
959 | # CONFIG_QUOTA is not set | ||
960 | CONFIG_DNOTIFY=y | ||
961 | CONFIG_AUTOFS_FS=y | ||
962 | # CONFIG_AUTOFS4_FS is not set | ||
963 | |||
964 | # | ||
965 | # CD-ROM/DVD Filesystems | ||
966 | # | ||
967 | CONFIG_ISO9660_FS=y | ||
968 | # CONFIG_JOLIET is not set | ||
969 | # CONFIG_ZISOFS is not set | ||
970 | # CONFIG_UDF_FS is not set | ||
971 | |||
972 | # | ||
973 | # DOS/FAT/NT Filesystems | ||
974 | # | ||
975 | CONFIG_FAT_FS=y | ||
976 | CONFIG_MSDOS_FS=y | ||
977 | CONFIG_VFAT_FS=y | ||
978 | CONFIG_FAT_DEFAULT_CODEPAGE=437 | ||
979 | CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" | ||
980 | # CONFIG_NTFS_FS is not set | ||
981 | |||
982 | # | ||
983 | # Pseudo filesystems | ||
984 | # | ||
985 | CONFIG_PROC_FS=y | ||
986 | CONFIG_PROC_KCORE=y | ||
987 | CONFIG_SYSFS=y | ||
988 | # CONFIG_DEVFS_FS is not set | ||
989 | # CONFIG_DEVPTS_FS_XATTR is not set | ||
990 | CONFIG_TMPFS=y | ||
991 | # CONFIG_TMPFS_XATTR is not set | ||
992 | CONFIG_HUGETLBFS=y | ||
993 | CONFIG_HUGETLB_PAGE=y | ||
994 | CONFIG_RAMFS=y | ||
995 | |||
996 | # | ||
997 | # Miscellaneous filesystems | ||
998 | # | ||
999 | # CONFIG_ADFS_FS is not set | ||
1000 | # CONFIG_AFFS_FS is not set | ||
1001 | # CONFIG_HFS_FS is not set | ||
1002 | # CONFIG_HFSPLUS_FS is not set | ||
1003 | # CONFIG_BEFS_FS is not set | ||
1004 | # CONFIG_BFS_FS is not set | ||
1005 | # CONFIG_EFS_FS is not set | ||
1006 | # CONFIG_CRAMFS is not set | ||
1007 | # CONFIG_VXFS_FS is not set | ||
1008 | # CONFIG_HPFS_FS is not set | ||
1009 | # CONFIG_QNX4FS_FS is not set | ||
1010 | # CONFIG_SYSV_FS is not set | ||
1011 | # CONFIG_UFS_FS is not set | ||
1012 | |||
1013 | # | ||
1014 | # Network File Systems | ||
1015 | # | ||
1016 | CONFIG_NFS_FS=y | ||
1017 | CONFIG_NFS_V3=y | ||
1018 | # CONFIG_NFS_V4 is not set | ||
1019 | # CONFIG_NFS_DIRECTIO is not set | ||
1020 | CONFIG_NFSD=y | ||
1021 | CONFIG_NFSD_V3=y | ||
1022 | # CONFIG_NFSD_V4 is not set | ||
1023 | CONFIG_NFSD_TCP=y | ||
1024 | CONFIG_LOCKD=y | ||
1025 | CONFIG_LOCKD_V4=y | ||
1026 | CONFIG_EXPORTFS=y | ||
1027 | CONFIG_SUNRPC=y | ||
1028 | # CONFIG_RPCSEC_GSS_KRB5 is not set | ||
1029 | # CONFIG_RPCSEC_GSS_SPKM3 is not set | ||
1030 | # CONFIG_SMB_FS is not set | ||
1031 | # CONFIG_CIFS is not set | ||
1032 | # CONFIG_NCP_FS is not set | ||
1033 | # CONFIG_CODA_FS is not set | ||
1034 | # CONFIG_AFS_FS is not set | ||
1035 | |||
1036 | # | ||
1037 | # Partition Types | ||
1038 | # | ||
1039 | # CONFIG_PARTITION_ADVANCED is not set | ||
1040 | CONFIG_MSDOS_PARTITION=y | ||
1041 | |||
1042 | # | ||
1043 | # Native Language Support | ||
1044 | # | ||
1045 | CONFIG_NLS=y | ||
1046 | CONFIG_NLS_DEFAULT="iso8859-1" | ||
1047 | CONFIG_NLS_CODEPAGE_437=y | ||
1048 | # CONFIG_NLS_CODEPAGE_737 is not set | ||
1049 | # CONFIG_NLS_CODEPAGE_775 is not set | ||
1050 | # CONFIG_NLS_CODEPAGE_850 is not set | ||
1051 | # CONFIG_NLS_CODEPAGE_852 is not set | ||
1052 | # CONFIG_NLS_CODEPAGE_855 is not set | ||
1053 | # CONFIG_NLS_CODEPAGE_857 is not set | ||
1054 | # CONFIG_NLS_CODEPAGE_860 is not set | ||
1055 | # CONFIG_NLS_CODEPAGE_861 is not set | ||
1056 | # CONFIG_NLS_CODEPAGE_862 is not set | ||
1057 | # CONFIG_NLS_CODEPAGE_863 is not set | ||
1058 | # CONFIG_NLS_CODEPAGE_864 is not set | ||
1059 | # CONFIG_NLS_CODEPAGE_865 is not set | ||
1060 | # CONFIG_NLS_CODEPAGE_866 is not set | ||
1061 | # CONFIG_NLS_CODEPAGE_869 is not set | ||
1062 | # CONFIG_NLS_CODEPAGE_936 is not set | ||
1063 | # CONFIG_NLS_CODEPAGE_950 is not set | ||
1064 | # CONFIG_NLS_CODEPAGE_932 is not set | ||
1065 | # CONFIG_NLS_CODEPAGE_949 is not set | ||
1066 | # CONFIG_NLS_CODEPAGE_874 is not set | ||
1067 | # CONFIG_NLS_ISO8859_8 is not set | ||
1068 | # CONFIG_NLS_CODEPAGE_1250 is not set | ||
1069 | # CONFIG_NLS_CODEPAGE_1251 is not set | ||
1070 | CONFIG_NLS_ASCII=y | ||
1071 | CONFIG_NLS_ISO8859_1=y | ||
1072 | # CONFIG_NLS_ISO8859_2 is not set | ||
1073 | # CONFIG_NLS_ISO8859_3 is not set | ||
1074 | # CONFIG_NLS_ISO8859_4 is not set | ||
1075 | # CONFIG_NLS_ISO8859_5 is not set | ||
1076 | # CONFIG_NLS_ISO8859_6 is not set | ||
1077 | # CONFIG_NLS_ISO8859_7 is not set | ||
1078 | # CONFIG_NLS_ISO8859_9 is not set | ||
1079 | # CONFIG_NLS_ISO8859_13 is not set | ||
1080 | # CONFIG_NLS_ISO8859_14 is not set | ||
1081 | CONFIG_NLS_ISO8859_15=y | ||
1082 | # CONFIG_NLS_KOI8_R is not set | ||
1083 | # CONFIG_NLS_KOI8_U is not set | ||
1084 | CONFIG_NLS_UTF8=y | ||
1085 | |||
1086 | # | ||
1087 | # Profiling support | ||
1088 | # | ||
1089 | CONFIG_PROFILING=y | ||
1090 | CONFIG_OPROFILE=y | ||
1091 | |||
1092 | # | ||
1093 | # Kernel hacking | ||
1094 | # | ||
1095 | CONFIG_DEBUG_KERNEL=y | ||
1096 | CONFIG_MAGIC_SYSRQ=y | ||
1097 | # CONFIG_PRINTK_TIME is not set | ||
1098 | # CONFIG_SCHEDSTATS is not set | ||
1099 | # CONFIG_DEBUG_SLAB is not set | ||
1100 | # CONFIG_DEBUG_SPINLOCK is not set | ||
1101 | # CONFIG_DEBUG_SPINLOCK_SLEEP is not set | ||
1102 | # CONFIG_DEBUG_KOBJECT is not set | ||
1103 | # CONFIG_DEBUG_INFO is not set | ||
1104 | CONFIG_DEBUG_FS=y | ||
1105 | CONFIG_INIT_DEBUG=y | ||
1106 | # CONFIG_IOMMU_DEBUG is not set | ||
1107 | CONFIG_KPROBES=y | ||
1108 | |||
1109 | # | ||
1110 | # Security options | ||
1111 | # | ||
1112 | # CONFIG_KEYS is not set | ||
1113 | # CONFIG_SECURITY is not set | ||
1114 | |||
1115 | # | ||
1116 | # Cryptographic options | ||
1117 | # | ||
1118 | # CONFIG_CRYPTO is not set | ||
1119 | |||
1120 | # | ||
1121 | # Hardware crypto devices | ||
1122 | # | ||
1123 | |||
1124 | # | ||
1125 | # Library routines | ||
1126 | # | ||
1127 | # CONFIG_CRC_CCITT is not set | ||
1128 | CONFIG_CRC32=y | ||
1129 | # CONFIG_LIBCRC32C is not set | ||
diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile new file mode 100644 index 000000000000..a12b19da4b59 --- /dev/null +++ b/arch/x86_64/ia32/Makefile | |||
@@ -0,0 +1,32 @@ | |||
#
# Makefile for the ia32 kernel emulation subsystem.
#

# Core objects, built only when CONFIG_IA32_EMULATION is set.
obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_ioctl.o \
	ia32_signal.o tls32.o \
	ia32_binfmt.o fpu32.o ptrace32.o syscall32.o

# ipc32.o is added to the emulation objects only when CONFIG_SYSVIPC=y
# (sysv-y is empty otherwise).
sysv-$(CONFIG_SYSVIPC) := ipc32.o
obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)

obj-$(CONFIG_IA32_AOUT) += ia32_aout.o

# syscall32.o depends on both vsyscall DSO images, so they are built first.
$(obj)/syscall32.o: $(src)/syscall32.c \
	$(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so)

# Teach kbuild about targets
targets := $(foreach F,sysenter syscall,vsyscall-$F.o vsyscall-$F.so)

# The DSO images are built using a special linker script
# ($^ minus FORCE supplies the linker script and the object to -Wl,-T,).
quiet_cmd_syscall = SYSCALL $@
      cmd_syscall = $(CC) -m32 -nostdlib -shared -s \
			   -Wl,-soname=linux-gate.so.1 -o $@ \
			   -Wl,-T,$(filter-out FORCE,$^)

# Static pattern rule: each vsyscall-%.so is linked from vsyscall.lds and
# the matching vsyscall-%.o.
$(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \
$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
	$(call if_changed,syscall)

# The vsyscall objects are assembled as 32-bit code.
AFLAGS_vsyscall-sysenter.o = -m32
AFLAGS_vsyscall-syscall.o = -m32
CFLAGS_ia32_ioctl.o += -Ifs/
diff --git a/arch/x86_64/ia32/fpu32.c b/arch/x86_64/ia32/fpu32.c new file mode 100644 index 000000000000..1c23095f1813 --- /dev/null +++ b/arch/x86_64/ia32/fpu32.c | |||
@@ -0,0 +1,184 @@ | |||
1 | /* | ||
2 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
3 | * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes. | ||
4 | * This is used for ptrace, signals and coredumps in 32bit emulation. | ||
5 | * $Id: fpu32.c,v 1.1 2002/03/21 14:16:32 ak Exp $ | ||
6 | */ | ||
7 | |||
8 | #include <linux/sched.h> | ||
9 | #include <asm/sigcontext32.h> | ||
10 | #include <asm/processor.h> | ||
11 | #include <asm/uaccess.h> | ||
12 | #include <asm/i387.h> | ||
13 | |||
/*
 * Compress a 16-bit i387 tag word (two bits per register) into the
 * 8-bit FXSR form (one bit per register).
 *
 * A register whose two tag bits are both set (11, empty) yields a 0 bit;
 * every other tag value yields a 1 bit (valid).  Bit i of the result
 * corresponds to bits 2i+1:2i of @twd.
 */
static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
{
	unsigned int packed = 0;	/* wider than 16 bits, as in the original */
	int reg;

	for (reg = 0; reg < 8; reg++) {
		unsigned int tag = (twd >> (2 * reg)) & 0x3;

		/* Only the "empty" encoding maps to a cleared bit. */
		if (tag != 0x3)
			packed |= 1u << reg;
	}
	return packed;
}
27 | |||
28 | static inline unsigned long twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) | ||
29 | { | ||
30 | struct _fpxreg *st = NULL; | ||
31 | unsigned long tos = (fxsave->swd >> 11) & 7; | ||
32 | unsigned long twd = (unsigned long) fxsave->twd; | ||
33 | unsigned long tag; | ||
34 | unsigned long ret = 0xffff0000; | ||
35 | int i; | ||
36 | |||
37 | #define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); | ||
38 | |||
39 | for (i = 0 ; i < 8 ; i++) { | ||
40 | if (twd & 0x1) { | ||
41 | st = FPREG_ADDR( fxsave, (i - tos) & 7 ); | ||
42 | |||
43 | switch (st->exponent & 0x7fff) { | ||
44 | case 0x7fff: | ||
45 | tag = 2; /* Special */ | ||
46 | break; | ||
47 | case 0x0000: | ||
48 | if ( !st->significand[0] && | ||
49 | !st->significand[1] && | ||
50 | !st->significand[2] && | ||
51 | !st->significand[3] ) { | ||
52 | tag = 1; /* Zero */ | ||
53 | } else { | ||
54 | tag = 2; /* Special */ | ||
55 | } | ||
56 | break; | ||
57 | default: | ||
58 | if (st->significand[3] & 0x8000) { | ||
59 | tag = 0; /* Valid */ | ||
60 | } else { | ||
61 | tag = 2; /* Special */ | ||
62 | } | ||
63 | break; | ||
64 | } | ||
65 | } else { | ||
66 | tag = 3; /* Empty */ | ||
67 | } | ||
68 | ret |= (tag << (2 * i)); | ||
69 | twd = twd >> 1; | ||
70 | } | ||
71 | return ret; | ||
72 | } | ||
73 | |||
74 | |||
75 | static inline int convert_fxsr_from_user(struct i387_fxsave_struct *fxsave, | ||
76 | struct _fpstate_ia32 __user *buf) | ||
77 | { | ||
78 | struct _fpxreg *to; | ||
79 | struct _fpreg __user *from; | ||
80 | int i; | ||
81 | u32 v; | ||
82 | int err = 0; | ||
83 | |||
84 | #define G(num,val) err |= __get_user(val, num + (u32 __user *)buf) | ||
85 | G(0, fxsave->cwd); | ||
86 | G(1, fxsave->swd); | ||
87 | G(2, fxsave->twd); | ||
88 | fxsave->twd = twd_i387_to_fxsr(fxsave->twd); | ||
89 | G(3, fxsave->rip); | ||
90 | G(4, v); | ||
91 | fxsave->fop = v>>16; /* cs ignored */ | ||
92 | G(5, fxsave->rdp); | ||
93 | /* 6: ds ignored */ | ||
94 | #undef G | ||
95 | if (err) | ||
96 | return -1; | ||
97 | |||
98 | to = (struct _fpxreg *)&fxsave->st_space[0]; | ||
99 | from = &buf->_st[0]; | ||
100 | for (i = 0 ; i < 8 ; i++, to++, from++) { | ||
101 | if (__copy_from_user(to, from, sizeof(*from))) | ||
102 | return -1; | ||
103 | } | ||
104 | return 0; | ||
105 | } | ||
106 | |||
107 | |||
108 | static inline int convert_fxsr_to_user(struct _fpstate_ia32 __user *buf, | ||
109 | struct i387_fxsave_struct *fxsave, | ||
110 | struct pt_regs *regs, | ||
111 | struct task_struct *tsk) | ||
112 | { | ||
113 | struct _fpreg __user *to; | ||
114 | struct _fpxreg *from; | ||
115 | int i; | ||
116 | u16 cs,ds; | ||
117 | int err = 0; | ||
118 | |||
119 | if (tsk == current) { | ||
120 | /* should be actually ds/cs at fpu exception time, | ||
121 | but that information is not available in 64bit mode. */ | ||
122 | asm("movw %%ds,%0 " : "=r" (ds)); | ||
123 | asm("movw %%cs,%0 " : "=r" (cs)); | ||
124 | } else { /* ptrace. task has stopped. */ | ||
125 | ds = tsk->thread.ds; | ||
126 | cs = regs->cs; | ||
127 | } | ||
128 | |||
129 | #define P(num,val) err |= __put_user(val, num + (u32 __user *)buf) | ||
130 | P(0, (u32)fxsave->cwd | 0xffff0000); | ||
131 | P(1, (u32)fxsave->swd | 0xffff0000); | ||
132 | P(2, twd_fxsr_to_i387(fxsave)); | ||
133 | P(3, (u32)fxsave->rip); | ||
134 | P(4, cs | ((u32)fxsave->fop) << 16); | ||
135 | P(5, fxsave->rdp); | ||
136 | P(6, 0xffff0000 | ds); | ||
137 | #undef P | ||
138 | |||
139 | if (err) | ||
140 | return -1; | ||
141 | |||
142 | to = &buf->_st[0]; | ||
143 | from = (struct _fpxreg *) &fxsave->st_space[0]; | ||
144 | for ( i = 0 ; i < 8 ; i++, to++, from++ ) { | ||
145 | if (__copy_to_user(to, from, sizeof(*to))) | ||
146 | return -1; | ||
147 | } | ||
148 | return 0; | ||
149 | } | ||
150 | |||
151 | int restore_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, int fsave) | ||
152 | { | ||
153 | clear_fpu(tsk); | ||
154 | if (!fsave) { | ||
155 | if (__copy_from_user(&tsk->thread.i387.fxsave, | ||
156 | &buf->_fxsr_env[0], | ||
157 | sizeof(struct i387_fxsave_struct))) | ||
158 | return -1; | ||
159 | tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; | ||
160 | set_stopped_child_used_math(tsk); | ||
161 | } | ||
162 | return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf); | ||
163 | } | ||
164 | |||
165 | int save_i387_ia32(struct task_struct *tsk, | ||
166 | struct _fpstate_ia32 __user *buf, | ||
167 | struct pt_regs *regs, | ||
168 | int fsave) | ||
169 | { | ||
170 | int err = 0; | ||
171 | |||
172 | init_fpu(tsk); | ||
173 | if (convert_fxsr_to_user(buf, &tsk->thread.i387.fxsave, regs, tsk)) | ||
174 | return -1; | ||
175 | if (fsave) | ||
176 | return 0; | ||
177 | err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status); | ||
178 | if (fsave) | ||
179 | return err ? -1 : 1; | ||
180 | err |= __put_user(X86_FXSR_MAGIC, &buf->magic); | ||
181 | err |= __copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave, | ||
182 | sizeof(struct i387_fxsave_struct)); | ||
183 | return err ? -1 : 1; | ||
184 | } | ||
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c new file mode 100644 index 000000000000..1965efc974dc --- /dev/null +++ b/arch/x86_64/ia32/ia32_aout.c | |||
@@ -0,0 +1,529 @@ | |||
1 | /* | ||
2 | * a.out loader for x86-64 | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992, 1996 Linus Torvalds | ||
5 | * Hacked together by Andi Kleen | ||
6 | */ | ||
7 | |||
8 | #include <linux/module.h> | ||
9 | |||
10 | #include <linux/time.h> | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/mman.h> | ||
14 | #include <linux/a.out.h> | ||
15 | #include <linux/errno.h> | ||
16 | #include <linux/signal.h> | ||
17 | #include <linux/string.h> | ||
18 | #include <linux/fs.h> | ||
19 | #include <linux/file.h> | ||
20 | #include <linux/stat.h> | ||
21 | #include <linux/fcntl.h> | ||
22 | #include <linux/ptrace.h> | ||
23 | #include <linux/user.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/binfmts.h> | ||
26 | #include <linux/personality.h> | ||
27 | #include <linux/init.h> | ||
28 | |||
29 | #include <asm/system.h> | ||
30 | #include <asm/uaccess.h> | ||
31 | #include <asm/pgalloc.h> | ||
32 | #include <asm/cacheflush.h> | ||
33 | #include <asm/user32.h> | ||
34 | #include <asm/ia32.h> | ||
35 | |||
36 | #undef WARN_OLD | ||
37 | #undef CORE_DUMP /* probably broken */ | ||
38 | |||
39 | extern int ia32_setup_arg_pages(struct linux_binprm *bprm, | ||
40 | unsigned long stack_top, int exec_stack); | ||
41 | |||
42 | static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); | ||
43 | static int load_aout_library(struct file*); | ||
44 | |||
45 | #if CORE_DUMP | ||
46 | static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file); | ||
47 | |||
48 | /* | ||
49 | * fill in the user structure for a core dump.. | ||
50 | */ | ||
51 | static void dump_thread32(struct pt_regs * regs, struct user32 * dump) | ||
52 | { | ||
53 | u32 fs,gs; | ||
54 | |||
55 | /* changed the size calculations - should hopefully work better. lbt */ | ||
56 | dump->magic = CMAGIC; | ||
57 | dump->start_code = 0; | ||
58 | dump->start_stack = regs->rsp & ~(PAGE_SIZE - 1); | ||
59 | dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; | ||
60 | dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; | ||
61 | dump->u_dsize -= dump->u_tsize; | ||
62 | dump->u_ssize = 0; | ||
63 | dump->u_debugreg[0] = current->thread.debugreg0; | ||
64 | dump->u_debugreg[1] = current->thread.debugreg1; | ||
65 | dump->u_debugreg[2] = current->thread.debugreg2; | ||
66 | dump->u_debugreg[3] = current->thread.debugreg3; | ||
67 | dump->u_debugreg[4] = 0; | ||
68 | dump->u_debugreg[5] = 0; | ||
69 | dump->u_debugreg[6] = current->thread.debugreg6; | ||
70 | dump->u_debugreg[7] = current->thread.debugreg7; | ||
71 | |||
72 | if (dump->start_stack < 0xc0000000) | ||
73 | dump->u_ssize = ((unsigned long) (0xc0000000 - dump->start_stack)) >> PAGE_SHIFT; | ||
74 | |||
75 | dump->regs.ebx = regs->rbx; | ||
76 | dump->regs.ecx = regs->rcx; | ||
77 | dump->regs.edx = regs->rdx; | ||
78 | dump->regs.esi = regs->rsi; | ||
79 | dump->regs.edi = regs->rdi; | ||
80 | dump->regs.ebp = regs->rbp; | ||
81 | dump->regs.eax = regs->rax; | ||
82 | dump->regs.ds = current->thread.ds; | ||
83 | dump->regs.es = current->thread.es; | ||
84 | asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; | ||
85 | asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; | ||
86 | dump->regs.orig_eax = regs->orig_rax; | ||
87 | dump->regs.eip = regs->rip; | ||
88 | dump->regs.cs = regs->cs; | ||
89 | dump->regs.eflags = regs->eflags; | ||
90 | dump->regs.esp = regs->rsp; | ||
91 | dump->regs.ss = regs->ss; | ||
92 | |||
93 | #if 1 /* FIXME */ | ||
94 | dump->u_fpvalid = 0; | ||
95 | #else | ||
96 | dump->u_fpvalid = dump_fpu (regs, &dump->i387); | ||
97 | #endif | ||
98 | } | ||
99 | |||
100 | #endif | ||
101 | |||
/*
 * Binary format descriptor registered by init_aout_binfmt().  The
 * core_dump hook is only present when CORE_DUMP is enabled.
 */
static struct linux_binfmt aout_format = {
	.module		= THIS_MODULE,
	.load_binary	= load_aout_binary,
	.load_shlib	= load_aout_library,
#if CORE_DUMP
	.core_dump	= aout_core_dump,
#endif
	.min_coredump	= PAGE_SIZE
};
111 | |||
112 | static void set_brk(unsigned long start, unsigned long end) | ||
113 | { | ||
114 | start = PAGE_ALIGN(start); | ||
115 | end = PAGE_ALIGN(end); | ||
116 | if (end <= start) | ||
117 | return; | ||
118 | down_write(¤t->mm->mmap_sem); | ||
119 | do_brk(start, end - start); | ||
120 | up_write(¤t->mm->mmap_sem); | ||
121 | } | ||
122 | |||
123 | #if CORE_DUMP | ||
124 | /* | ||
125 | * These are the only things you should do on a core-file: use only these | ||
126 | * macros to write out all the necessary info. | ||
127 | */ | ||
128 | |||
129 | static int dump_write(struct file *file, const void *addr, int nr) | ||
130 | { | ||
131 | return file->f_op->write(file, addr, nr, &file->f_pos) == nr; | ||
132 | } | ||
133 | |||
/*
 * Both helpers expect a local "file" and an "end_coredump" label in the
 * calling function.  They are wrapped in do { } while (0) so that each
 * expands to exactly one statement and is safe inside unbraced if/else.
 */

/* Write nr bytes from addr to the core file; bail out on a short write. */
#define DUMP_WRITE(addr, nr) \
	do { \
		if (!dump_write(file, (void *)(addr), (nr))) \
			goto end_coredump; \
	} while (0)

/*
 * Move to an absolute offset: via ->llseek when the file provides it
 * (bailing out if the resulting position differs), otherwise by setting
 * f_pos directly.
 */
#define DUMP_SEEK(offset) \
	do { \
		if (file->f_op->llseek) { \
			if (file->f_op->llseek(file,(offset),0) != (offset)) \
				goto end_coredump; \
		} else \
			file->f_pos = (offset); \
	} while (0)
143 | |||
144 | /* | ||
145 | * Routine writes a core dump image in the current directory. | ||
146 | * Currently only a stub-function. | ||
147 | * | ||
148 | * Note that setuid/setgid files won't make a core-dump if the uid/gid | ||
149 | * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable" | ||
150 | * field, which also makes sure the core-dumps won't be recursive if the | ||
151 | * dumping of the process results in another error.. | ||
152 | */ | ||
153 | |||
/*
 * Write an a.out style core image for the current task to @file: the
 * user32 header, then (size permitting) the data area, the stack area and
 * finally the task struct.
 *
 * Returns has_dumped, which is set to 1 before anything is written, so the
 * return value is 1 even when a write or seek bails out to end_coredump.
 */
static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file)
{
	mm_segment_t fs;
	int has_dumped = 0;
	unsigned long dump_start, dump_size;
	struct user32 dump;
	/* byte address of the data area: directly after u_tsize pages of text */
#       define START_DATA(u)	(u.u_tsize << PAGE_SHIFT)
#       define START_STACK(u)   (u.start_stack)

	/* remember the caller's address limit; restored at end_coredump */
	fs = get_fs();
	set_fs(KERNEL_DS);
	has_dumped = 1;
	current->flags |= PF_DUMPCORE;
	/*
	 * NOTE(review): the bound is the size of the source (current->comm),
	 * not of dump.u_comm - confirm the two arrays have the same size.
	 * NOTE(review): "dump" is not cleared before being filled in; any
	 * field not assigned here or in dump_thread32() is written out as-is.
	 */
       	strncpy(dump.u_comm, current->comm, sizeof(current->comm));
	/* offset of the register block within the user32 structure */
	dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - ((unsigned long)(&dump)));
	dump.signal = signr;
	dump_thread32(regs, &dump);

/* If the size of the dump file exceeds the rlimit, then see what would happen
   if we wrote the stack, but not the data area.  */
	if ((dump.u_dsize+dump.u_ssize+1) * PAGE_SIZE >
	    current->signal->rlim[RLIMIT_CORE].rlim_cur)
		dump.u_dsize = 0;

/* Make sure we have enough room to write the stack and data areas. */
	if ((dump.u_ssize+1) * PAGE_SIZE >
	    current->signal->rlim[RLIMIT_CORE].rlim_cur)
		dump.u_ssize = 0;

/* make sure we actually have a data and stack area to dump */
	/* access_ok() is evaluated against the user address limit */
	set_fs(USER_DS);
	if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
		dump.u_dsize = 0;
	if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
		dump.u_ssize = 0;

	/* the header lives in kernel memory, so write it under KERNEL_DS */
	set_fs(KERNEL_DS);
/* struct user */
	DUMP_WRITE(&dump,sizeof(dump));
/* Now dump all of the user data.  Include malloced stuff as well */
	DUMP_SEEK(PAGE_SIZE);
/* now we start writing out the user space info */
	set_fs(USER_DS);
/* Dump the data area */
	if (dump.u_dsize != 0) {
		dump_start = START_DATA(dump);
		dump_size = dump.u_dsize << PAGE_SHIFT;
		DUMP_WRITE(dump_start,dump_size);
	}
/* Now prepare to dump the stack area */
	if (dump.u_ssize != 0) {
		dump_start = START_STACK(dump);
		dump_size = dump.u_ssize << PAGE_SHIFT;
		DUMP_WRITE(dump_start,dump_size);
	}
/* Finally dump the task struct.  Not used by gdb, but could be useful */
	set_fs(KERNEL_DS);
	DUMP_WRITE(current,sizeof(*current));
end_coredump:
	set_fs(fs);
	return has_dumped;
}
216 | #endif | ||
217 | |||
218 | /* | ||
219 | * create_aout_tables() parses the env- and arg-strings in new user | ||
220 | * memory and creates the pointer tables from them, and puts their | ||
221 | * addresses on the "stack", returning the new stack pointer value. | ||
222 | */ | ||
223 | static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm) | ||
224 | { | ||
225 | u32 __user *argv; | ||
226 | u32 __user *envp; | ||
227 | u32 __user *sp; | ||
228 | int argc = bprm->argc; | ||
229 | int envc = bprm->envc; | ||
230 | |||
231 | sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p); | ||
232 | sp -= envc+1; | ||
233 | envp = sp; | ||
234 | sp -= argc+1; | ||
235 | argv = sp; | ||
236 | put_user((unsigned long) envp,--sp); | ||
237 | put_user((unsigned long) argv,--sp); | ||
238 | put_user(argc,--sp); | ||
239 | current->mm->arg_start = (unsigned long) p; | ||
240 | while (argc-->0) { | ||
241 | char c; | ||
242 | put_user((u32)(unsigned long)p,argv++); | ||
243 | do { | ||
244 | get_user(c,p++); | ||
245 | } while (c); | ||
246 | } | ||
247 | put_user(NULL,argv); | ||
248 | current->mm->arg_end = current->mm->env_start = (unsigned long) p; | ||
249 | while (envc-->0) { | ||
250 | char c; | ||
251 | put_user((u32)(unsigned long)p,envp++); | ||
252 | do { | ||
253 | get_user(c,p++); | ||
254 | } while (c); | ||
255 | } | ||
256 | put_user(NULL,envp); | ||
257 | current->mm->env_end = (unsigned long) p; | ||
258 | return sp; | ||
259 | } | ||
260 | |||
261 | /* | ||
262 | * These are the functions used to load a.out style executables and shared | ||
263 | * libraries. There is no binary dependent code anywhere else. | ||
264 | */ | ||
265 | |||
266 | static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) | ||
267 | { | ||
268 | struct exec ex; | ||
269 | unsigned long error; | ||
270 | unsigned long fd_offset; | ||
271 | unsigned long rlim; | ||
272 | int retval; | ||
273 | |||
274 | ex = *((struct exec *) bprm->buf); /* exec-header */ | ||
275 | if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && | ||
276 | N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || | ||
277 | N_TRSIZE(ex) || N_DRSIZE(ex) || | ||
278 | i_size_read(bprm->file->f_dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { | ||
279 | return -ENOEXEC; | ||
280 | } | ||
281 | |||
282 | fd_offset = N_TXTOFF(ex); | ||
283 | |||
284 | /* Check initial limits. This avoids letting people circumvent | ||
285 | * size limits imposed on them by creating programs with large | ||
286 | * arrays in the data or bss. | ||
287 | */ | ||
288 | rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; | ||
289 | if (rlim >= RLIM_INFINITY) | ||
290 | rlim = ~0; | ||
291 | if (ex.a_data + ex.a_bss > rlim) | ||
292 | return -ENOMEM; | ||
293 | |||
294 | /* Flush all traces of the currently running executable */ | ||
295 | retval = flush_old_exec(bprm); | ||
296 | if (retval) | ||
297 | return retval; | ||
298 | |||
299 | regs->cs = __USER32_CS; | ||
300 | regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = | ||
301 | regs->r13 = regs->r14 = regs->r15 = 0; | ||
302 | |||
303 | /* OK, This is the point of no return */ | ||
304 | set_personality(PER_LINUX); | ||
305 | set_thread_flag(TIF_IA32); | ||
306 | clear_thread_flag(TIF_ABI_PENDING); | ||
307 | |||
308 | current->mm->end_code = ex.a_text + | ||
309 | (current->mm->start_code = N_TXTADDR(ex)); | ||
310 | current->mm->end_data = ex.a_data + | ||
311 | (current->mm->start_data = N_DATADDR(ex)); | ||
312 | current->mm->brk = ex.a_bss + | ||
313 | (current->mm->start_brk = N_BSSADDR(ex)); | ||
314 | current->mm->free_area_cache = TASK_UNMAPPED_BASE; | ||
315 | |||
316 | set_mm_counter(current->mm, rss, 0); | ||
317 | current->mm->mmap = NULL; | ||
318 | compute_creds(bprm); | ||
319 | current->flags &= ~PF_FORKNOEXEC; | ||
320 | |||
321 | if (N_MAGIC(ex) == OMAGIC) { | ||
322 | unsigned long text_addr, map_size; | ||
323 | loff_t pos; | ||
324 | |||
325 | text_addr = N_TXTADDR(ex); | ||
326 | |||
327 | pos = 32; | ||
328 | map_size = ex.a_text+ex.a_data; | ||
329 | |||
330 | down_write(¤t->mm->mmap_sem); | ||
331 | error = do_brk(text_addr & PAGE_MASK, map_size); | ||
332 | up_write(¤t->mm->mmap_sem); | ||
333 | |||
334 | if (error != (text_addr & PAGE_MASK)) { | ||
335 | send_sig(SIGKILL, current, 0); | ||
336 | return error; | ||
337 | } | ||
338 | |||
339 | error = bprm->file->f_op->read(bprm->file, (char *)text_addr, | ||
340 | ex.a_text+ex.a_data, &pos); | ||
341 | if ((signed long)error < 0) { | ||
342 | send_sig(SIGKILL, current, 0); | ||
343 | return error; | ||
344 | } | ||
345 | |||
346 | flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data); | ||
347 | } else { | ||
348 | #ifdef WARN_OLD | ||
349 | static unsigned long error_time, error_time2; | ||
350 | if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && | ||
351 | (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ) | ||
352 | { | ||
353 | printk(KERN_NOTICE "executable not page aligned\n"); | ||
354 | error_time2 = jiffies; | ||
355 | } | ||
356 | |||
357 | if ((fd_offset & ~PAGE_MASK) != 0 && | ||
358 | (jiffies-error_time) > 5*HZ) | ||
359 | { | ||
360 | printk(KERN_WARNING | ||
361 | "fd_offset is not page aligned. Please convert program: %s\n", | ||
362 | bprm->file->f_dentry->d_name.name); | ||
363 | error_time = jiffies; | ||
364 | } | ||
365 | #endif | ||
366 | |||
367 | if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { | ||
368 | loff_t pos = fd_offset; | ||
369 | down_write(¤t->mm->mmap_sem); | ||
370 | do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); | ||
371 | up_write(¤t->mm->mmap_sem); | ||
372 | bprm->file->f_op->read(bprm->file,(char *)N_TXTADDR(ex), | ||
373 | ex.a_text+ex.a_data, &pos); | ||
374 | flush_icache_range((unsigned long) N_TXTADDR(ex), | ||
375 | (unsigned long) N_TXTADDR(ex) + | ||
376 | ex.a_text+ex.a_data); | ||
377 | goto beyond_if; | ||
378 | } | ||
379 | |||
380 | down_write(¤t->mm->mmap_sem); | ||
381 | error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, | ||
382 | PROT_READ | PROT_EXEC, | ||
383 | MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, | ||
384 | fd_offset); | ||
385 | up_write(¤t->mm->mmap_sem); | ||
386 | |||
387 | if (error != N_TXTADDR(ex)) { | ||
388 | send_sig(SIGKILL, current, 0); | ||
389 | return error; | ||
390 | } | ||
391 | |||
392 | down_write(¤t->mm->mmap_sem); | ||
393 | error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, | ||
394 | PROT_READ | PROT_WRITE | PROT_EXEC, | ||
395 | MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, | ||
396 | fd_offset + ex.a_text); | ||
397 | up_write(¤t->mm->mmap_sem); | ||
398 | if (error != N_DATADDR(ex)) { | ||
399 | send_sig(SIGKILL, current, 0); | ||
400 | return error; | ||
401 | } | ||
402 | } | ||
403 | beyond_if: | ||
404 | set_binfmt(&aout_format); | ||
405 | |||
406 | set_brk(current->mm->start_brk, current->mm->brk); | ||
407 | |||
408 | retval = ia32_setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); | ||
409 | if (retval < 0) { | ||
410 | /* Someone check-me: is this error path enough? */ | ||
411 | send_sig(SIGKILL, current, 0); | ||
412 | return retval; | ||
413 | } | ||
414 | |||
415 | current->mm->start_stack = | ||
416 | (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); | ||
417 | /* start thread */ | ||
418 | asm volatile("movl %0,%%fs" :: "r" (0)); \ | ||
419 | asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); | ||
420 | load_gs_index(0); | ||
421 | (regs)->rip = ex.a_entry; | ||
422 | (regs)->rsp = current->mm->start_stack; | ||
423 | (regs)->eflags = 0x200; | ||
424 | (regs)->cs = __USER32_CS; | ||
425 | (regs)->ss = __USER32_DS; | ||
426 | set_fs(USER_DS); | ||
427 | if (unlikely(current->ptrace & PT_PTRACED)) { | ||
428 | if (current->ptrace & PT_TRACE_EXEC) | ||
429 | ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); | ||
430 | else | ||
431 | send_sig(SIGTRAP, current, 0); | ||
432 | } | ||
433 | return 0; | ||
434 | } | ||
435 | |||
436 | static int load_aout_library(struct file *file) | ||
437 | { | ||
438 | struct inode * inode; | ||
439 | unsigned long bss, start_addr, len; | ||
440 | unsigned long error; | ||
441 | int retval; | ||
442 | struct exec ex; | ||
443 | |||
444 | inode = file->f_dentry->d_inode; | ||
445 | |||
446 | retval = -ENOEXEC; | ||
447 | error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); | ||
448 | if (error != sizeof(ex)) | ||
449 | goto out; | ||
450 | |||
451 | /* We come in here for the regular a.out style of shared libraries */ | ||
452 | if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || | ||
453 | N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || | ||
454 | i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { | ||
455 | goto out; | ||
456 | } | ||
457 | |||
458 | if (N_FLAGS(ex)) | ||
459 | goto out; | ||
460 | |||
461 | /* For QMAGIC, the starting address is 0x20 into the page. We mask | ||
462 | this off to get the starting address for the page */ | ||
463 | |||
464 | start_addr = ex.a_entry & 0xfffff000; | ||
465 | |||
466 | if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { | ||
467 | loff_t pos = N_TXTOFF(ex); | ||
468 | |||
469 | #ifdef WARN_OLD | ||
470 | static unsigned long error_time; | ||
471 | if ((jiffies-error_time) > 5*HZ) | ||
472 | { | ||
473 | printk(KERN_WARNING | ||
474 | "N_TXTOFF is not page aligned. Please convert library: %s\n", | ||
475 | file->f_dentry->d_name.name); | ||
476 | error_time = jiffies; | ||
477 | } | ||
478 | #endif | ||
479 | down_write(¤t->mm->mmap_sem); | ||
480 | do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); | ||
481 | up_write(¤t->mm->mmap_sem); | ||
482 | |||
483 | file->f_op->read(file, (char *)start_addr, | ||
484 | ex.a_text + ex.a_data, &pos); | ||
485 | flush_icache_range((unsigned long) start_addr, | ||
486 | (unsigned long) start_addr + ex.a_text + ex.a_data); | ||
487 | |||
488 | retval = 0; | ||
489 | goto out; | ||
490 | } | ||
491 | /* Now use mmap to map the library into memory. */ | ||
492 | down_write(¤t->mm->mmap_sem); | ||
493 | error = do_mmap(file, start_addr, ex.a_text + ex.a_data, | ||
494 | PROT_READ | PROT_WRITE | PROT_EXEC, | ||
495 | MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT, | ||
496 | N_TXTOFF(ex)); | ||
497 | up_write(¤t->mm->mmap_sem); | ||
498 | retval = error; | ||
499 | if (error != start_addr) | ||
500 | goto out; | ||
501 | |||
502 | len = PAGE_ALIGN(ex.a_text + ex.a_data); | ||
503 | bss = ex.a_text + ex.a_data + ex.a_bss; | ||
504 | if (bss > len) { | ||
505 | down_write(¤t->mm->mmap_sem); | ||
506 | error = do_brk(start_addr + len, bss - len); | ||
507 | up_write(¤t->mm->mmap_sem); | ||
508 | retval = error; | ||
509 | if (error != start_addr + len) | ||
510 | goto out; | ||
511 | } | ||
512 | retval = 0; | ||
513 | out: | ||
514 | return retval; | ||
515 | } | ||
516 | |||
/* Module init: register the a.out format; returns register_binfmt()'s result. */
static int __init init_aout_binfmt(void)
{
	return register_binfmt(&aout_format);
}
521 | |||
/* Module exit: unregister the a.out format registered at init time. */
static void __exit exit_aout_binfmt(void)
{
	unregister_binfmt(&aout_format);
}
526 | |||
527 | module_init(init_aout_binfmt); | ||
528 | module_exit(exit_aout_binfmt); | ||
529 | MODULE_LICENSE("GPL"); | ||
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c new file mode 100644 index 000000000000..93d568dfa762 --- /dev/null +++ b/arch/x86_64/ia32/ia32_binfmt.c | |||
@@ -0,0 +1,434 @@ | |||
1 | /* | ||
2 | * Written 2000,2002 by Andi Kleen. | ||
3 | * | ||
4 | * Loosely based on the sparc64 and IA64 32bit emulation loaders. | ||
5 | * This tricks binfmt_elf.c into loading 32bit binaries using lots | ||
6 | * of ugly preprocessor tricks. Talk about very very poor man's inheritance. | ||
7 | */ | ||
8 | #include <linux/types.h> | ||
9 | #include <linux/config.h> | ||
10 | #include <linux/stddef.h> | ||
11 | #include <linux/rwsem.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/compat.h> | ||
14 | #include <linux/string.h> | ||
15 | #include <linux/binfmts.h> | ||
16 | #include <linux/mm.h> | ||
17 | #include <linux/security.h> | ||
18 | |||
19 | #include <asm/segment.h> | ||
20 | #include <asm/ptrace.h> | ||
21 | #include <asm/processor.h> | ||
22 | #include <asm/user32.h> | ||
23 | #include <asm/sigcontext32.h> | ||
24 | #include <asm/fpu32.h> | ||
25 | #include <asm/i387.h> | ||
26 | #include <asm/uaccess.h> | ||
27 | #include <asm/ia32.h> | ||
28 | #include <asm/vsyscall32.h> | ||
29 | |||
/* Name string for this loader variant (consumer is outside this file). */
30 | #define ELF_NAME "elf/i386" | ||
31 | |||
/* Auxiliary-vector entry types emitted by ARCH_DLINFO below. */
32 | #define AT_SYSINFO 32 | ||
33 | #define AT_SYSINFO_EHDR 33 | ||
34 | |||
/*
 * Non-zero enables the two vsyscall aux-vector entries in ARCH_DLINFO.
 * Exposed for run-time tuning as the "vsyscall32" sysctl registered at the
 * end of this file.
 */
35 | int sysctl_vsyscall32 = 1; | ||
36 | |||
/*
 * Extra aux-vector entries for new 32-bit processes: when sysctl_vsyscall32
 * is set, publish VSYSCALL32_VSYSCALL (truncated to 32 bits) as AT_SYSINFO
 * and VSYSCALL32_BASE as AT_SYSINFO_EHDR. NEW_AUX_ENT is supplied by the
 * code that expands this macro.
 */
37 | #define ARCH_DLINFO do { \ | ||
38 | if (sysctl_vsyscall32) { \ | ||
39 | NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \ | ||
40 | NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL32_BASE); \ | ||
41 | } \ | ||
42 | } while(0) | ||
43 | |||
/* Forward declarations; only pointers to these types are needed below. */
44 | struct file; | ||
45 | struct elf_phdr; | ||
46 | |||
47 | #define IA32_EMULATOR 1 | ||
48 | |||
/* Base address for ET_DYN objects: 16MB above TASK_UNMAPPED_32. */
49 | #define ELF_ET_DYN_BASE (TASK_UNMAPPED_32 + 0x1000000) | ||
50 | |||
/*
 * Replace the native ELF identification constants so that the generic
 * loader included at the end of this file is built for 32-bit,
 * little-endian EM_386 binaries.
 */
51 | #undef ELF_ARCH | ||
52 | #define ELF_ARCH EM_386 | ||
53 | |||
54 | #undef ELF_CLASS | ||
55 | #define ELF_CLASS ELFCLASS32 | ||
56 | |||
57 | #define ELF_DATA ELFDATA2LSB | ||
58 | |||
59 | #define USE_ELF_CORE_DUMP 1 | ||
60 | |||
/*
 * Pre-defining the include guard keeps <linux/elfcore.h> from contributing
 * its own definitions; the 32-bit replacements follow in this file.
 */
61 | /* Overwrite elfcore.h */ | ||
62 | #define _LINUX_ELFCORE_H 1 | ||
/* One general-purpose register slot in a 32-bit core dump. */
63 | typedef unsigned int elf_greg_t; | ||
64 | |||
/* Number of elf_greg_t slots needed to hold a struct user_regs_struct32. */
65 | #define ELF_NGREG (sizeof (struct user_regs_struct32) / sizeof(elf_greg_t)) | ||
66 | typedef elf_greg_t elf_gregset_t[ELF_NGREG]; | ||
67 | |||
68 | /* | ||
69 | * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out | ||
70 | * extra segments containing the vsyscall DSO contents. Dumping its | ||
71 | * contents makes post-mortem fully interpretable later without matching up | ||
72 | * the same kernel and hardware config to see what PC values meant. | ||
73 | * Dumping its extra ELF program headers includes all the other information | ||
74 | * a debugger needs to easily find how the vsyscall DSO was being used. | ||
75 | */ | ||
/* Number of extra program headers: every phdr of the vsyscall ELF header. */
76 | #define ELF_CORE_EXTRA_PHDRS (VSYSCALL32_EHDR->e_phnum) | ||
/*
 * Emit one copy of each vsyscall program header through DUMP_WRITE.
 * For a PT_LOAD header the copy is placed at the expanding site's running
 * `offset`: p_memsz is rounded up to a page, p_filesz is set to match, and
 * `offset` is advanced by that size. The BUG_ON fires if a PT_LOAD header is
 * met after `ofs` has already been set to a non-zero value. Non-PT_LOAD
 * headers have `ofs` added to their p_offset. p_paddr is cleared in every
 * copy. `offset` and DUMP_WRITE are not defined here; they must exist where
 * the macro is expanded.
 */
77 | #define ELF_CORE_WRITE_EXTRA_PHDRS \ | ||
78 | do { \ | ||
79 | const struct elf32_phdr *const vsyscall_phdrs = \ | ||
80 | (const struct elf32_phdr *) (VSYSCALL32_BASE \ | ||
81 | + VSYSCALL32_EHDR->e_phoff); \ | ||
82 | int i; \ | ||
83 | Elf32_Off ofs = 0; \ | ||
84 | for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) { \ | ||
85 | struct elf32_phdr phdr = vsyscall_phdrs[i]; \ | ||
86 | if (phdr.p_type == PT_LOAD) { \ | ||
87 | BUG_ON(ofs != 0); \ | ||
88 | ofs = phdr.p_offset = offset; \ | ||
89 | phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); \ | ||
90 | phdr.p_filesz = phdr.p_memsz; \ | ||
91 | offset += phdr.p_filesz; \ | ||
92 | } \ | ||
93 | else \ | ||
94 | phdr.p_offset += ofs; \ | ||
95 | phdr.p_paddr = 0; /* match other core phdrs */ \ | ||
96 | DUMP_WRITE(&phdr, sizeof(phdr)); \ | ||
97 | } \ | ||
98 | } while (0) | ||
/*
 * Emit the contents of every PT_LOAD vsyscall segment through DUMP_WRITE,
 * reading from the segment's p_vaddr and writing the page-aligned p_memsz
 * bytes (the same size the header macro above records).
 */
99 | #define ELF_CORE_WRITE_EXTRA_DATA \ | ||
100 | do { \ | ||
101 | const struct elf32_phdr *const vsyscall_phdrs = \ | ||
102 | (const struct elf32_phdr *) (VSYSCALL32_BASE \ | ||
103 | + VSYSCALL32_EHDR->e_phoff); \ | ||
104 | int i; \ | ||
105 | for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) { \ | ||
106 | if (vsyscall_phdrs[i].p_type == PT_LOAD) \ | ||
107 | DUMP_WRITE((void *) (u64) vsyscall_phdrs[i].p_vaddr, \ | ||
108 | PAGE_ALIGN(vsyscall_phdrs[i].p_memsz)); \ | ||
109 | } \ | ||
110 | } while (0) | ||
111 | |||
/* Signal summary embedded at the start of struct elf_prstatus. */
112 | struct elf_siginfo | ||
113 | { | ||
114 | int si_signo; /* signal number */ | ||
115 | int si_code; /* extra code */ | ||
116 | int si_errno; /* errno */ | ||
117 | }; | ||
118 | |||
/*
 * Local replacement for jiffies_to_timeval(): stores whole seconds only
 * (a / HZ) and always sets tv_usec to 0.
 */
119 | #define jiffies_to_timeval(a,b) do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; }while(0) | ||
120 | |||
/*
 * Process status record for the core dump. The time fields use
 * struct compat_timeval and the register block uses the 32-bit
 * elf_gregset_t defined above.
 */
121 | struct elf_prstatus | ||
122 | { | ||
123 | struct elf_siginfo pr_info; /* Info associated with signal */ | ||
124 | short pr_cursig; /* Current signal */ | ||
125 | unsigned int pr_sigpend; /* Set of pending signals */ | ||
126 | unsigned int pr_sighold; /* Set of held signals */ | ||
127 | pid_t pr_pid; | ||
128 | pid_t pr_ppid; | ||
129 | pid_t pr_pgrp; | ||
130 | pid_t pr_sid; | ||
131 | struct compat_timeval pr_utime; /* User time */ | ||
132 | struct compat_timeval pr_stime; /* System time */ | ||
133 | struct compat_timeval pr_cutime; /* Cumulative user time */ | ||
134 | struct compat_timeval pr_cstime; /* Cumulative system time */ | ||
135 | elf_gregset_t pr_reg; /* GP registers */ | ||
136 | int pr_fpvalid; /* True if math co-processor being used. */ | ||
137 | }; | ||
138 | |||
139 | #define ELF_PRARGSZ (80) /* Number of chars for args */ | ||
140 | |||
/*
 * Process information record for the core dump. Note the 16-bit uid/gid
 * fields and the fixed-size name (16) and argument (ELF_PRARGSZ) buffers.
 */
141 | struct elf_prpsinfo | ||
142 | { | ||
143 | char pr_state; /* numeric process state */ | ||
144 | char pr_sname; /* char for pr_state */ | ||
145 | char pr_zomb; /* zombie */ | ||
146 | char pr_nice; /* nice val */ | ||
147 | unsigned int pr_flag; /* flags */ | ||
148 | __u16 pr_uid; | ||
149 | __u16 pr_gid; | ||
150 | pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid; | ||
151 | /* Lots missing */ | ||
152 | char pr_fname[16]; /* filename of executable */ | ||
153 | char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ | ||
154 | }; | ||
155 | |||
/* Two-level stringification so that STR(x) expands x before quoting it. */
156 | #define __STR(x) #x | ||
157 | #define STR(x) __STR(x) | ||
158 | |||
/*
 * Read segment register x of the executing CPU into a __u32 and yield it
 * as the value of the statement expression.
 */
159 | #define _GET_SEG(x) \ | ||
160 | ({ __u32 seg; asm("movl %%" STR(x) ",%0" : "=r"(seg)); seg; }) | ||
161 | |||
/*
 * Fill the 17 slots of pr_reg from regs. Slots 7-10 (ds, es, fs, gs) are
 * read live from the CPU with _GET_SEG rather than from regs, hence the
 * assumption stated below. This is a bare statement list, not a
 * do { } while (0) block: it already ends in ';' and must only be expanded
 * where a sequence of statements is valid.
 */
162 | /* Assumes current==process to be dumped */ | ||
163 | #define ELF_CORE_COPY_REGS(pr_reg, regs) \ | ||
164 | pr_reg[0] = regs->rbx; \ | ||
165 | pr_reg[1] = regs->rcx; \ | ||
166 | pr_reg[2] = regs->rdx; \ | ||
167 | pr_reg[3] = regs->rsi; \ | ||
168 | pr_reg[4] = regs->rdi; \ | ||
169 | pr_reg[5] = regs->rbp; \ | ||
170 | pr_reg[6] = regs->rax; \ | ||
171 | pr_reg[7] = _GET_SEG(ds); \ | ||
172 | pr_reg[8] = _GET_SEG(es); \ | ||
173 | pr_reg[9] = _GET_SEG(fs); \ | ||
174 | pr_reg[10] = _GET_SEG(gs); \ | ||
175 | pr_reg[11] = regs->orig_rax; \ | ||
176 | pr_reg[12] = regs->rip; \ | ||
177 | pr_reg[13] = regs->cs; \ | ||
178 | pr_reg[14] = regs->eflags; \ | ||
179 | pr_reg[15] = regs->rsp; \ | ||
180 | pr_reg[16] = regs->ss; | ||
181 | |||
/* From here on every use of the identifier `user` is rewritten to user32. */
182 | #define user user32 | ||
183 | |||
/*
 * Pre-define the include guard of the native asm elf header so that it
 * contributes nothing when pulled in, and supply the one macro from it
 * that is needed here: READ implies EXEC unless a PT_GNU_STACK header was
 * present.
 */
184 | #define __ASM_X86_64_ELF_H 1 | ||
185 | #define elf_read_implies_exec(ex, have_pt_gnu_stack) (!(have_pt_gnu_stack)) | ||
186 | //#include <asm/ia32.h> | ||
187 | #include <linux/elf.h> | ||
188 | |||
/* Floating-point register set types used by the core dump helpers below. */
189 | typedef struct user_i387_ia32_struct elf_fpregset_t; | ||
190 | typedef struct user32_fxsr_struct elf_fpxregset_t; | ||
191 | |||
192 | |||
/*
 * Fill *elfregs from regs by expanding ELF_CORE_COPY_REGS. The macro
 * expansion already ends in ';', which is why the call line has none.
 */
193 | static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *regs) | ||
194 | { | ||
195 | ELF_CORE_COPY_REGS((*elfregs), regs) | ||
196 | } | ||
197 | |||
198 | static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs) | ||
199 | { | ||
200 | struct pt_regs *pp = (struct pt_regs *)(t->thread.rsp0); | ||
201 | --pp; | ||
202 | ELF_CORE_COPY_REGS((*elfregs), pp); | ||
203 | /* fix wrong segments */ | ||
204 | (*elfregs)[7] = t->thread.ds; | ||
205 | (*elfregs)[9] = t->thread.fsindex; | ||
206 | (*elfregs)[10] = t->thread.gsindex; | ||
207 | (*elfregs)[8] = t->thread.es; | ||
208 | return 1; | ||
209 | } | ||
210 | |||
211 | static inline int | ||
212 | elf_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, elf_fpregset_t *fpu) | ||
213 | { | ||
214 | struct _fpstate_ia32 *fpstate = (void*)fpu; | ||
215 | mm_segment_t oldfs = get_fs(); | ||
216 | |||
217 | if (!tsk_used_math(tsk)) | ||
218 | return 0; | ||
219 | if (!regs) | ||
220 | regs = (struct pt_regs *)tsk->thread.rsp0; | ||
221 | --regs; | ||
222 | if (tsk == current) | ||
223 | unlazy_fpu(tsk); | ||
224 | set_fs(KERNEL_DS); | ||
225 | save_i387_ia32(tsk, fpstate, regs, 1); | ||
226 | /* Correct for i386 bug. It puts the fop into the upper 16bits of | ||
227 | the tag word (like FXSAVE), not into the fcs*/ | ||
228 | fpstate->cssel |= fpstate->tag & 0xffff0000; | ||
229 | set_fs(oldfs); | ||
230 | return 1; | ||
231 | } | ||
232 | |||
233 | #define ELF_CORE_COPY_XFPREGS 1 | ||
234 | static inline int | ||
235 | elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu) | ||
236 | { | ||
237 | struct pt_regs *regs = ((struct pt_regs *)(t->thread.rsp0))-1; | ||
238 | if (!tsk_used_math(t)) | ||
239 | return 0; | ||
240 | if (t == current) | ||
241 | unlazy_fpu(t); | ||
242 | memcpy(xfpu, &t->thread.i387.fxsave, sizeof(elf_fpxregset_t)); | ||
243 | xfpu->fcs = regs->cs; | ||
244 | xfpu->fos = t->thread.ds; /* right? */ | ||
245 | return 1; | ||
246 | } | ||
247 | |||
/* Accept only ELF headers whose e_machine is EM_386. */
248 | #undef elf_check_arch | ||
249 | #define elf_check_arch(x) \ | ||
250 | ((x)->e_machine == EM_386) | ||
251 | |||
/* Defined elsewhere; OR-ed into current->personality by SET_PERSONALITY. */
252 | extern int force_personality32; | ||
253 | |||
254 | #define ELF_EXEC_PAGESIZE PAGE_SIZE | ||
/* Hardware capability word: first capability word of the boot CPU. */
255 | #define ELF_HWCAP (boot_cpu_data.x86_capability[0]) | ||
256 | #define ELF_PLATFORM ("i686") | ||
/*
 * Decide whether the task's ABI has to change for the new image.
 * new_flags is _TIF_IA32 for an ELFCLASS32 header and 0 otherwise; if that
 * differs from the task's current _TIF_IA32 bit, TIF_ABI_PENDING is set,
 * else it is cleared. force_personality32 is then OR-ed into the
 * personality. The ibcs2 argument is not used.
 */
257 | #define SET_PERSONALITY(ex, ibcs2) \ | ||
258 | do { \ | ||
259 | unsigned long new_flags = 0; \ | ||
260 | if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \ | ||
261 | new_flags = _TIF_IA32; \ | ||
262 | if ((current_thread_info()->flags & _TIF_IA32) \ | ||
263 | != new_flags) \ | ||
264 | set_thread_flag(TIF_ABI_PENDING); \ | ||
265 | else \ | ||
266 | clear_thread_flag(TIF_ABI_PENDING); \ | ||
267 | /* XXX This overwrites the user set personality */ \ | ||
268 | current->personality |= force_personality32; \ | ||
269 | } while (0) | ||
270 | |||
/*
 * Rename the symbols that the generic loader (included further down) would
 * otherwise define or call, so that this translation unit provides
 * 32-bit-specific variants under distinct names.
 */
271 | /* Override some function names */ | ||
272 | #define elf_format elf32_format | ||
273 | |||
274 | #define init_elf_binfmt init_elf32_binfmt | ||
275 | #define exit_elf_binfmt exit_elf32_binfmt | ||
276 | |||
277 | #define load_elf_binary load_elf32_binary | ||
278 | |||
/* Per-exec register setup hook; load_addr is ignored. */
279 | #define ELF_PLAT_INIT(r, load_addr) elf32_init(r) | ||
/*
 * Function-like macro: every later `setup_arg_pages(...)`, including the
 * definition near the end of this file, is rewritten to
 * ia32_setup_arg_pages(...).
 */
280 | #define setup_arg_pages(bprm, stack_top, exec_stack) \ | ||
281 | ia32_setup_arg_pages(bprm, stack_top, exec_stack) | ||
282 | int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack); | ||
283 | |||
/*
 * Start a 32-bit user thread: fs is loaded with 0, ds and es with
 * __USER32_DS, the gs index is reset to 0, rip/rsp come from the arguments,
 * eflags is set to 0x200, cs/ss are set to the 32-bit user selectors, and
 * the address limit is set to USER_DS.
 */
284 | #undef start_thread | ||
285 | #define start_thread(regs,new_rip,new_rsp) do { \ | ||
286 | asm volatile("movl %0,%%fs" :: "r" (0)); \ | ||
287 | asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); \ | ||
288 | load_gs_index(0); \ | ||
289 | (regs)->rip = (new_rip); \ | ||
290 | (regs)->rsp = (new_rsp); \ | ||
291 | (regs)->eflags = 0x200; \ | ||
292 | (regs)->cs = __USER32_CS; \ | ||
293 | (regs)->ss = __USER32_DS; \ | ||
294 | set_fs(USER_DS); \ | ||
295 | } while(0) | ||
296 | |||
297 | |||
/* Route the generic loader's elf_map to elf32_map defined below. */
298 | #define elf_map elf32_map | ||
299 | |||
300 | #include <linux/module.h> | ||
301 | |||
302 | MODULE_DESCRIPTION("Binary format loader for compatibility with IA32 ELF binaries."); | ||
303 | MODULE_AUTHOR("Eric Youngdale, Andi Kleen"); | ||
304 | |||
/*
 * The two macros are removed right after use, so any later use of them
 * (for instance inside the file included below) is no longer expanded as
 * the module macro.
 */
305 | #undef MODULE_DESCRIPTION | ||
306 | #undef MODULE_AUTHOR | ||
307 | |||
/* Address-sized values of the loaded binary are 32 bits wide. */
308 | #define elf_addr_t __u32 | ||
309 | |||
/* Within the included loader the task size is the full 32-bit range. */
310 | #undef TASK_SIZE | ||
311 | #define TASK_SIZE 0xffffffff | ||
312 | |||
/* Forward declaration: ELF_PLAT_INIT expands to a call of this function. */
313 | static void elf32_init(struct pt_regs *); | ||
314 | |||
/*
 * Textually include the generic ELF loader; it is compiled here under all
 * of the macro overrides set up above.
 */
315 | #include "../../../fs/binfmt_elf.c" | ||
316 | |||
317 | static void elf32_init(struct pt_regs *regs) | ||
318 | { | ||
319 | struct task_struct *me = current; | ||
320 | regs->rdi = 0; | ||
321 | regs->rsi = 0; | ||
322 | regs->rdx = 0; | ||
323 | regs->rcx = 0; | ||
324 | regs->rax = 0; | ||
325 | regs->rbx = 0; | ||
326 | regs->rbp = 0; | ||
327 | regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = | ||
328 | regs->r13 = regs->r14 = regs->r15 = 0; | ||
329 | me->thread.fs = 0; | ||
330 | me->thread.gs = 0; | ||
331 | me->thread.fsindex = 0; | ||
332 | me->thread.gsindex = 0; | ||
333 | me->thread.ds = __USER_DS; | ||
334 | me->thread.es = __USER_DS; | ||
335 | } | ||
336 | |||
337 | int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) | ||
338 | { | ||
339 | unsigned long stack_base; | ||
340 | struct vm_area_struct *mpnt; | ||
341 | struct mm_struct *mm = current->mm; | ||
342 | int i, ret; | ||
343 | |||
344 | stack_base = IA32_STACK_TOP - MAX_ARG_PAGES * PAGE_SIZE; | ||
345 | mm->arg_start = bprm->p + stack_base; | ||
346 | |||
347 | bprm->p += stack_base; | ||
348 | if (bprm->loader) | ||
349 | bprm->loader += stack_base; | ||
350 | bprm->exec += stack_base; | ||
351 | |||
352 | mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | ||
353 | if (!mpnt) | ||
354 | return -ENOMEM; | ||
355 | |||
356 | if (security_vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { | ||
357 | kmem_cache_free(vm_area_cachep, mpnt); | ||
358 | return -ENOMEM; | ||
359 | } | ||
360 | |||
361 | memset(mpnt, 0, sizeof(*mpnt)); | ||
362 | |||
363 | down_write(&mm->mmap_sem); | ||
364 | { | ||
365 | mpnt->vm_mm = mm; | ||
366 | mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; | ||
367 | mpnt->vm_end = IA32_STACK_TOP; | ||
368 | if (executable_stack == EXSTACK_ENABLE_X) | ||
369 | mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC; | ||
370 | else if (executable_stack == EXSTACK_DISABLE_X) | ||
371 | mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC; | ||
372 | else | ||
373 | mpnt->vm_flags = VM_STACK_FLAGS; | ||
374 | mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ? | ||
375 | PAGE_COPY_EXEC : PAGE_COPY; | ||
376 | if ((ret = insert_vm_struct(mm, mpnt))) { | ||
377 | up_write(&mm->mmap_sem); | ||
378 | kmem_cache_free(vm_area_cachep, mpnt); | ||
379 | return ret; | ||
380 | } | ||
381 | mm->stack_vm = mm->total_vm = vma_pages(mpnt); | ||
382 | } | ||
383 | |||
384 | for (i = 0 ; i < MAX_ARG_PAGES ; i++) { | ||
385 | struct page *page = bprm->page[i]; | ||
386 | if (page) { | ||
387 | bprm->page[i] = NULL; | ||
388 | install_arg_page(mpnt, page, stack_base); | ||
389 | } | ||
390 | stack_base += PAGE_SIZE; | ||
391 | } | ||
392 | up_write(&mm->mmap_sem); | ||
393 | |||
394 | return 0; | ||
395 | } | ||
396 | |||
397 | static unsigned long | ||
398 | elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type) | ||
399 | { | ||
400 | unsigned long map_addr; | ||
401 | struct task_struct *me = current; | ||
402 | |||
403 | down_write(&me->mm->mmap_sem); | ||
404 | map_addr = do_mmap(filep, ELF_PAGESTART(addr), | ||
405 | eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr), prot, | ||
406 | type, | ||
407 | eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr)); | ||
408 | up_write(&me->mm->mmap_sem); | ||
409 | return(map_addr); | ||
410 | } | ||
411 | |||
412 | #ifdef CONFIG_SYSCTL | ||
413 | /* Register vsyscall32 into the ABI table */ | ||
414 | #include <linux/sysctl.h> | ||
415 | |||
/*
 * Leaf table: one integer entry named "vsyscall32" (id 99, mode 0644)
 * backed by sysctl_vsyscall32 and handled by proc_dointvec. The zero entry
 * terminates the table.
 */
416 | static ctl_table abi_table2[] = { | ||
417 | { 99, "vsyscall32", &sysctl_vsyscall32, sizeof(int), 0644, NULL, | ||
418 | proc_dointvec }, | ||
419 | { 0, } | ||
420 | }; | ||
421 | |||
/* Root table: the "abi" directory (CTL_ABI, mode 0555) holding the leaf. */
422 | static ctl_table abi_root_table2[] = { | ||
423 | { .ctl_name = CTL_ABI, .procname = "abi", .mode = 0555, | ||
424 | .child = abi_table2 }, | ||
425 | { 0 }, | ||
426 | }; | ||
427 | |||
/*
 * Init-time hook that registers the tables above. The result of
 * register_sysctl_table() is ignored and 0 is always returned.
 */
428 | static __init int ia32_binfmt_init(void) | ||
429 | { | ||
430 | register_sysctl_table(abi_root_table2, 1); | ||
431 | return 0; | ||
432 | } | ||
433 | __initcall(ia32_binfmt_init); | ||
434 | #endif | ||
diff --git a/arch/x86_64/ia32/ia32_ioctl.c b/arch/x86_64/ia32/ia32_ioctl.c new file mode 100644 index 000000000000..d259f8a6f811 --- /dev/null +++ b/arch/x86_64/ia32/ia32_ioctl.c | |||
@@ -0,0 +1,201 @@ | |||
1 | /* $Id: ia32_ioctl.c,v 1.25 2002/10/11 07:17:06 ak Exp $ | ||
2 | * ioctl32.c: Conversion between 32bit and 64bit native ioctls. | ||
3 | * | ||
4 | * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) | ||
5 | * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) | ||
6 | * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs | ||
7 | * | ||
8 | * These routines maintain argument size conversion between 32bit and 64bit | ||
9 | * ioctls. | ||
10 | */ | ||
11 | |||
12 | #define INCLUDES | ||
13 | #include <linux/syscalls.h> | ||
14 | #include "compat_ioctl.c" | ||
15 | #include <asm/mtrr.h> | ||
16 | #include <asm/ia32.h> | ||
17 | |||
18 | #define CODE | ||
19 | #include "compat_ioctl.c" | ||
20 | |||
21 | #ifndef TIOCGDEV | ||
22 | #define TIOCGDEV _IOR('T',0x32, unsigned int) | ||
23 | #endif | ||
24 | static int tiocgdev(unsigned fd, unsigned cmd, unsigned int __user *ptr) | ||
25 | { | ||
26 | |||
27 | struct file *file = fget(fd); | ||
28 | struct tty_struct *real_tty; | ||
29 | |||
30 | if (!file) | ||
31 | return -EBADF; | ||
32 | if (file->f_op->ioctl != tty_ioctl) | ||
33 | return -EINVAL; | ||
34 | real_tty = (struct tty_struct *)file->private_data; | ||
35 | if (!real_tty) | ||
36 | return -EINVAL; | ||
37 | return put_user(new_encode_dev(tty_devnum(real_tty)), ptr); | ||
38 | } | ||
39 | |||
40 | #define RTC_IRQP_READ32 _IOR('p', 0x0b, unsigned int) /* Read IRQ rate */ | ||
41 | #define RTC_IRQP_SET32 _IOW('p', 0x0c, unsigned int) /* Set IRQ rate */ | ||
42 | #define RTC_EPOCH_READ32 _IOR('p', 0x0d, unsigned) /* Read epoch */ | ||
43 | #define RTC_EPOCH_SET32 _IOW('p', 0x0e, unsigned) /* Set epoch */ | ||
44 | |||
45 | static int rtc32_ioctl(unsigned fd, unsigned cmd, unsigned long arg) | ||
46 | { | ||
47 | unsigned long val; | ||
48 | mm_segment_t oldfs = get_fs(); | ||
49 | int ret; | ||
50 | |||
51 | switch (cmd) { | ||
52 | case RTC_IRQP_READ32: | ||
53 | set_fs(KERNEL_DS); | ||
54 | ret = sys_ioctl(fd, RTC_IRQP_READ, (unsigned long)&val); | ||
55 | set_fs(oldfs); | ||
56 | if (!ret) | ||
57 | ret = put_user(val, (unsigned int __user *) arg); | ||
58 | return ret; | ||
59 | |||
60 | case RTC_IRQP_SET32: | ||
61 | cmd = RTC_IRQP_SET; | ||
62 | break; | ||
63 | |||
64 | case RTC_EPOCH_READ32: | ||
65 | set_fs(KERNEL_DS); | ||
66 | ret = sys_ioctl(fd, RTC_EPOCH_READ, (unsigned long) &val); | ||
67 | set_fs(oldfs); | ||
68 | if (!ret) | ||
69 | ret = put_user(val, (unsigned int __user *) arg); | ||
70 | return ret; | ||
71 | |||
72 | case RTC_EPOCH_SET32: | ||
73 | cmd = RTC_EPOCH_SET; | ||
74 | break; | ||
75 | } | ||
76 | return sys_ioctl(fd,cmd,arg); | ||
77 | } | ||
78 | |||
79 | /* /proc/mtrr ioctls */ | ||
80 | |||
81 | |||
/* Layout of the "set" style request as passed by a 32-bit caller. */
82 | struct mtrr_sentry32 | ||
83 | { | ||
84 | compat_ulong_t base; /* Base address */ | ||
85 | compat_uint_t size; /* Size of region */ | ||
86 | compat_uint_t type; /* Type of region */ | ||
87 | }; | ||
88 | |||
/* Layout of the "get" style request as passed by a 32-bit caller. */
89 | struct mtrr_gentry32 | ||
90 | { | ||
91 | compat_ulong_t regnum; /* Register number */ | ||
92 | compat_uint_t base; /* Base address */ | ||
93 | compat_uint_t size; /* Size of region */ | ||
94 | compat_uint_t type; /* Type of region */ | ||
95 | }; | ||
96 | |||
97 | #define MTRR_IOCTL_BASE 'M' | ||
98 | |||
/*
 * ioctl numbers as computed with the 32-bit structure sizes above; each
 * one is mapped to its MTRRIOC_* counterpart by mtrr_ioctl32() below.
 */
99 | #define MTRRIOC32_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry32) | ||
100 | #define MTRRIOC32_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry32) | ||
101 | #define MTRRIOC32_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry32) | ||
102 | #define MTRRIOC32_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry32) | ||
103 | #define MTRRIOC32_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry32) | ||
104 | #define MTRRIOC32_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry32) | ||
105 | #define MTRRIOC32_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry32) | ||
106 | #define MTRRIOC32_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry32) | ||
107 | #define MTRRIOC32_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry32) | ||
108 | #define MTRRIOC32_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32) | ||
110 | |||
111 | static int mtrr_ioctl32(unsigned int fd, unsigned int cmd, unsigned long arg) | ||
112 | { | ||
113 | struct mtrr_gentry g; | ||
114 | struct mtrr_sentry s; | ||
115 | int get = 0, err = 0; | ||
116 | struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)arg; | ||
117 | mm_segment_t oldfs = get_fs(); | ||
118 | |||
119 | switch (cmd) { | ||
120 | #define SET(x) case MTRRIOC32_ ## x ## _ENTRY: cmd = MTRRIOC_ ## x ## _ENTRY; break | ||
121 | #define GET(x) case MTRRIOC32_ ## x ## _ENTRY: cmd = MTRRIOC_ ## x ## _ENTRY; get=1; break | ||
122 | SET(ADD); | ||
123 | SET(SET); | ||
124 | SET(DEL); | ||
125 | GET(GET); | ||
126 | SET(KILL); | ||
127 | SET(ADD_PAGE); | ||
128 | SET(SET_PAGE); | ||
129 | SET(DEL_PAGE); | ||
130 | GET(GET_PAGE); | ||
131 | SET(KILL_PAGE); | ||
132 | } | ||
133 | |||
134 | if (get) { | ||
135 | err = get_user(g.regnum, &g32->regnum); | ||
136 | err |= get_user(g.base, &g32->base); | ||
137 | err |= get_user(g.size, &g32->size); | ||
138 | err |= get_user(g.type, &g32->type); | ||
139 | |||
140 | arg = (unsigned long)&g; | ||
141 | } else { | ||
142 | struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)arg; | ||
143 | err = get_user(s.base, &s32->base); | ||
144 | err |= get_user(s.size, &s32->size); | ||
145 | err |= get_user(s.type, &s32->type); | ||
146 | |||
147 | arg = (unsigned long)&s; | ||
148 | } | ||
149 | if (err) return err; | ||
150 | |||
151 | set_fs(KERNEL_DS); | ||
152 | err = sys_ioctl(fd, cmd, arg); | ||
153 | set_fs(oldfs); | ||
154 | |||
155 | if (!err && get) { | ||
156 | err = put_user(g.base, &g32->base); | ||
157 | err |= put_user(g.size, &g32->size); | ||
158 | err |= put_user(g.regnum, &g32->regnum); | ||
159 | err |= put_user(g.type, &g32->type); | ||
160 | } | ||
161 | return err; | ||
162 | } | ||
163 | |||
/*
 * Table-entry helpers: HANDLE_IOCTL emits one { cmd, handler } initializer
 * (with trailing comma); COMPATIBLE_IOCTL is the same with sys_ioctl as
 * the handler, i.e. the command is passed through unchanged.
 */
164 | #define HANDLE_IOCTL(cmd,handler) { (cmd), (ioctl_trans_handler_t)(handler) }, | ||
165 | #define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd,sys_ioctl) | ||
166 | |||
/*
 * The ioctl translation table. Its first entries are generated by the two
 * includes below (expanded under the macros above and with DECLARES set);
 * the x86-64 specific entries follow.
 */
167 | struct ioctl_trans ioctl_start[] = { | ||
168 | #include <linux/compat_ioctl.h> | ||
169 | #define DECLARES | ||
170 | #include "compat_ioctl.c" | ||
171 | COMPATIBLE_IOCTL(HDIO_SET_KEEPSETTINGS) | ||
172 | COMPATIBLE_IOCTL(HDIO_SCAN_HWIF) | ||
173 | COMPATIBLE_IOCTL(BLKRASET) | ||
174 | COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */ | ||
175 | COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */ | ||
176 | COMPATIBLE_IOCTL(FIOQSIZE) | ||
177 | |||
178 | /* And these ioctls need translation */ | ||
179 | HANDLE_IOCTL(TIOCGDEV, tiocgdev) | ||
180 | /* realtime device */ | ||
/*
 * NOTE(review): RTC_IRQP_READ (the native number) is routed to rtc32_ioctl
 * too; that function has no case for it and forwards it unchanged.
 */
181 | HANDLE_IOCTL(RTC_IRQP_READ, rtc32_ioctl) | ||
182 | HANDLE_IOCTL(RTC_IRQP_READ32,rtc32_ioctl) | ||
183 | HANDLE_IOCTL(RTC_IRQP_SET32, rtc32_ioctl) | ||
184 | HANDLE_IOCTL(RTC_EPOCH_READ32, rtc32_ioctl) | ||
185 | HANDLE_IOCTL(RTC_EPOCH_SET32, rtc32_ioctl) | ||
186 | /* take care of sizeof(sizeof()) breakage */ | ||
187 | /* mtrr */ | ||
188 | HANDLE_IOCTL(MTRRIOC32_ADD_ENTRY, mtrr_ioctl32) | ||
189 | HANDLE_IOCTL(MTRRIOC32_SET_ENTRY, mtrr_ioctl32) | ||
190 | HANDLE_IOCTL(MTRRIOC32_DEL_ENTRY, mtrr_ioctl32) | ||
191 | HANDLE_IOCTL(MTRRIOC32_GET_ENTRY, mtrr_ioctl32) | ||
192 | HANDLE_IOCTL(MTRRIOC32_KILL_ENTRY, mtrr_ioctl32) | ||
193 | HANDLE_IOCTL(MTRRIOC32_ADD_PAGE_ENTRY, mtrr_ioctl32) | ||
194 | HANDLE_IOCTL(MTRRIOC32_SET_PAGE_ENTRY, mtrr_ioctl32) | ||
195 | HANDLE_IOCTL(MTRRIOC32_DEL_PAGE_ENTRY, mtrr_ioctl32) | ||
196 | HANDLE_IOCTL(MTRRIOC32_GET_PAGE_ENTRY, mtrr_ioctl32) | ||
197 | HANDLE_IOCTL(MTRRIOC32_KILL_PAGE_ENTRY, mtrr_ioctl32) | ||
198 | }; | ||
199 | |||
/* Number of entries in ioctl_start, exported for the table's consumer. */
200 | int ioctl_table_size = ARRAY_SIZE(ioctl_start); | ||
201 | |||
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c new file mode 100644 index 000000000000..fbd09b5126ce --- /dev/null +++ b/arch/x86_64/ia32/ia32_signal.c | |||
@@ -0,0 +1,621 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/ia32/ia32_signal.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * | ||
6 | * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson | ||
7 | * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes | ||
8 | * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen | ||
9 | * | ||
10 | * $Id: ia32_signal.c,v 1.22 2002/07/29 10:34:03 ak Exp $ | ||
11 | */ | ||
12 | |||
13 | #include <linux/sched.h> | ||
14 | #include <linux/mm.h> | ||
15 | #include <linux/smp.h> | ||
16 | #include <linux/smp_lock.h> | ||
17 | #include <linux/kernel.h> | ||
18 | #include <linux/signal.h> | ||
19 | #include <linux/errno.h> | ||
20 | #include <linux/wait.h> | ||
21 | #include <linux/ptrace.h> | ||
22 | #include <linux/unistd.h> | ||
23 | #include <linux/stddef.h> | ||
24 | #include <linux/personality.h> | ||
25 | #include <linux/compat.h> | ||
26 | #include <asm/ucontext.h> | ||
27 | #include <asm/uaccess.h> | ||
28 | #include <asm/i387.h> | ||
29 | #include <asm/ia32.h> | ||
30 | #include <asm/ptrace.h> | ||
31 | #include <asm/ia32_unistd.h> | ||
32 | #include <asm/user32.h> | ||
33 | #include <asm/sigcontext32.h> | ||
34 | #include <asm/fpu32.h> | ||
35 | #include <asm/proto.h> | ||
36 | #include <asm/vsyscall32.h> | ||
37 | |||
/* Set to non-zero to compile in the printk tracing guarded by DEBUG_SIG. */
38 | #define DEBUG_SIG 0 | ||
39 | |||
/* Mask of every signal except SIGKILL and SIGSTOP. */
40 | #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) | ||
41 | |||
/* Implemented outside this file. */
42 | asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); | ||
43 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where); | ||
44 | |||
45 | int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) | ||
46 | { | ||
47 | int err; | ||
48 | if (!access_ok (VERIFY_WRITE, to, sizeof(compat_siginfo_t))) | ||
49 | return -EFAULT; | ||
50 | |||
51 | /* If you change siginfo_t structure, please make sure that | ||
52 | this code is fixed accordingly. | ||
53 | It should never copy any pad contained in the structure | ||
54 | to avoid security leaks, but must copy the generic | ||
55 | 3 ints plus the relevant union member. */ | ||
56 | err = __put_user(from->si_signo, &to->si_signo); | ||
57 | err |= __put_user(from->si_errno, &to->si_errno); | ||
58 | err |= __put_user((short)from->si_code, &to->si_code); | ||
59 | |||
60 | if (from->si_code < 0) { | ||
61 | err |= __put_user(from->si_pid, &to->si_pid); | ||
62 | err |= __put_user(from->si_uid, &to->si_uid); | ||
63 | err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr); | ||
64 | } else { | ||
65 | /* First 32bits of unions are always present: | ||
66 | * si_pid === si_band === si_tid === si_addr(LS half) */ | ||
67 | err |= __put_user(from->_sifields._pad[0], &to->_sifields._pad[0]); | ||
68 | switch (from->si_code >> 16) { | ||
69 | case __SI_FAULT >> 16: | ||
70 | break; | ||
71 | case __SI_CHLD >> 16: | ||
72 | err |= __put_user(from->si_utime, &to->si_utime); | ||
73 | err |= __put_user(from->si_stime, &to->si_stime); | ||
74 | err |= __put_user(from->si_status, &to->si_status); | ||
75 | /* FALL THROUGH */ | ||
76 | default: | ||
77 | case __SI_KILL >> 16: | ||
78 | err |= __put_user(from->si_uid, &to->si_uid); | ||
79 | break; | ||
80 | case __SI_POLL >> 16: | ||
81 | err |= __put_user(from->si_fd, &to->si_fd); | ||
82 | break; | ||
83 | case __SI_TIMER >> 16: | ||
84 | err |= __put_user(from->si_overrun, &to->si_overrun); | ||
85 | err |= __put_user(ptr_to_compat(from->si_ptr), | ||
86 | &to->si_ptr); | ||
87 | break; | ||
88 | case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ | ||
89 | case __SI_MESGQ >> 16: | ||
90 | err |= __put_user(from->si_uid, &to->si_uid); | ||
91 | err |= __put_user(from->si_int, &to->si_int); | ||
92 | break; | ||
93 | } | ||
94 | } | ||
95 | return err; | ||
96 | } | ||
97 | |||
98 | int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) | ||
99 | { | ||
100 | int err; | ||
101 | u32 ptr32; | ||
102 | if (!access_ok (VERIFY_READ, from, sizeof(compat_siginfo_t))) | ||
103 | return -EFAULT; | ||
104 | |||
105 | err = __get_user(to->si_signo, &from->si_signo); | ||
106 | err |= __get_user(to->si_errno, &from->si_errno); | ||
107 | err |= __get_user(to->si_code, &from->si_code); | ||
108 | |||
109 | err |= __get_user(to->si_pid, &from->si_pid); | ||
110 | err |= __get_user(to->si_uid, &from->si_uid); | ||
111 | err |= __get_user(ptr32, &from->si_ptr); | ||
112 | to->si_ptr = compat_ptr(ptr32); | ||
113 | |||
114 | return err; | ||
115 | } | ||
116 | |||
117 | asmlinkage long | ||
118 | sys32_sigsuspend(int history0, int history1, old_sigset_t mask, | ||
119 | struct pt_regs *regs) | ||
120 | { | ||
121 | sigset_t saveset; | ||
122 | |||
123 | mask &= _BLOCKABLE; | ||
124 | spin_lock_irq(¤t->sighand->siglock); | ||
125 | saveset = current->blocked; | ||
126 | siginitset(¤t->blocked, mask); | ||
127 | recalc_sigpending(); | ||
128 | spin_unlock_irq(¤t->sighand->siglock); | ||
129 | |||
130 | regs->rax = -EINTR; | ||
131 | while (1) { | ||
132 | current->state = TASK_INTERRUPTIBLE; | ||
133 | schedule(); | ||
134 | if (do_signal(regs, &saveset)) | ||
135 | return -EINTR; | ||
136 | } | ||
137 | } | ||
138 | |||
139 | asmlinkage long | ||
140 | sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, | ||
141 | stack_ia32_t __user *uoss_ptr, | ||
142 | struct pt_regs *regs) | ||
143 | { | ||
144 | stack_t uss,uoss; | ||
145 | int ret; | ||
146 | mm_segment_t seg; | ||
147 | if (uss_ptr) { | ||
148 | u32 ptr; | ||
149 | memset(&uss,0,sizeof(stack_t)); | ||
150 | if (!access_ok(VERIFY_READ,uss_ptr,sizeof(stack_ia32_t)) || | ||
151 | __get_user(ptr, &uss_ptr->ss_sp) || | ||
152 | __get_user(uss.ss_flags, &uss_ptr->ss_flags) || | ||
153 | __get_user(uss.ss_size, &uss_ptr->ss_size)) | ||
154 | return -EFAULT; | ||
155 | uss.ss_sp = compat_ptr(ptr); | ||
156 | } | ||
157 | seg = get_fs(); | ||
158 | set_fs(KERNEL_DS); | ||
159 | ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->rsp); | ||
160 | set_fs(seg); | ||
161 | if (ret >= 0 && uoss_ptr) { | ||
162 | if (!access_ok(VERIFY_WRITE,uoss_ptr,sizeof(stack_ia32_t)) || | ||
163 | __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || | ||
164 | __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || | ||
165 | __put_user(uoss.ss_size, &uoss_ptr->ss_size)) | ||
166 | ret = -EFAULT; | ||
167 | } | ||
168 | return ret; | ||
169 | } | ||
170 | |||
171 | /* | ||
172 | * Do a signal return; undo the signal stack. | ||
173 | */ | ||
174 | |||
/*
 * Signal frame layout for the non-RT case. The first mask word is assumed
 * to live inside sc, with extramask holding the remaining
 * _COMPAT_NSIG_WORDS-1 words -- TODO confirm against the frame setup code,
 * which is outside this view.
 */
175 | struct sigframe | ||
176 | { | ||
177 | u32 pretcode; | ||
178 | int sig; | ||
179 | struct sigcontext_ia32 sc; | ||
180 | struct _fpstate_ia32 fpstate; | ||
181 | unsigned int extramask[_COMPAT_NSIG_WORDS-1]; | ||
182 | char retcode[8]; | ||
183 | }; | ||
184 | |||
/*
 * Signal frame layout for the RT case: in addition to the fields above it
 * carries 32-bit pointers (pinfo, puc), a compat_siginfo_t and a
 * ucontext_ia32 in place of the bare sigcontext.
 */
185 | struct rt_sigframe | ||
186 | { | ||
187 | u32 pretcode; | ||
188 | int sig; | ||
189 | u32 pinfo; | ||
190 | u32 puc; | ||
191 | compat_siginfo_t info; | ||
192 | struct ucontext_ia32 uc; | ||
193 | struct _fpstate_ia32 fpstate; | ||
194 | char retcode[8]; | ||
195 | }; | ||
196 | |||
197 | static int /* Reload user register state into regs from the 32-bit sigcontext at sc; returns nonzero if anything faulted */ | ||
198 | ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, unsigned int *peax) | ||
199 | { | ||
200 | unsigned int err = 0; /* OR of every __get_user()/FPU-restore result; nonzero means failure */ | ||
201 | |||
202 | /* Always make any pending restarted system calls return -EINTR */ | ||
203 | current_thread_info()->restart_block.fn = do_no_restart_syscall; | ||
204 | |||
205 | #if DEBUG_SIG | ||
206 | printk("SIG restore_sigcontext: sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n", | ||
207 | sc, sc->err, sc->eip, sc->cs, sc->eflags); /* NOTE(review): reads through the __user pointer directly instead of __get_user() */ | ||
208 | #endif | ||
209 | #define COPY(x) { \ | ||
210 | unsigned int reg; \ | ||
211 | err |= __get_user(reg, &sc->e ##x); \ | ||
212 | regs->r ## x = reg; \ | ||
213 | } /* COPY(x): fetch the 32-bit sc->e<x> and store it in regs->r<x> */ | ||
214 | |||
215 | #define RELOAD_SEG(seg,mask) \ | ||
216 | { unsigned int cur; \ | ||
217 | unsigned short pre; \ | ||
218 | err |= __get_user(pre, &sc->seg); \ | ||
219 | asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ | ||
220 | pre |= mask; \ | ||
221 | if (pre != cur) loadsegment(seg,pre); } /* RELOAD_SEG: reload seg only when the saved selector (ORed with mask) differs from the live one */ | ||
222 | |||
223 | /* Reload fs and gs if they have changed in the signal handler. | ||
224 | This does not handle long fs/gs base changes in the handler, but | ||
225 | does not clobber them at least in the normal case. */ | ||
226 | |||
227 | { | ||
228 | unsigned gs, oldgs; | ||
229 | err |= __get_user(gs, &sc->gs); | ||
230 | gs |= 3; /* force the low two selector bits (privilege level 3) */ | ||
231 | asm("movl %%gs,%0" : "=r" (oldgs)); | ||
232 | if (gs != oldgs) | ||
233 | load_gs_index(gs); | ||
234 | } | ||
235 | RELOAD_SEG(fs,3); | ||
236 | RELOAD_SEG(ds,3); | ||
237 | RELOAD_SEG(es,3); | ||
238 | |||
239 | COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); | ||
240 | COPY(dx); COPY(cx); COPY(ip); | ||
241 | /* Don't touch extended registers */ | ||
242 | |||
243 | err |= __get_user(regs->cs, &sc->cs); | ||
244 | regs->cs |= 3; /* user-supplied selector: force the low two bits so it cannot name a more privileged level */ | ||
245 | err |= __get_user(regs->ss, &sc->ss); | ||
246 | regs->ss |= 3; | ||
247 | |||
248 | { | ||
249 | unsigned int tmpflags; | ||
250 | err |= __get_user(tmpflags, &sc->eflags); | ||
251 | regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); /* only the bits in mask 0x40DD5 come from the frame; all other flags keep their current value */ | ||
252 | regs->orig_rax = -1; /* disable syscall checks */ | ||
253 | } | ||
254 | |||
255 | { | ||
256 | u32 tmp; | ||
257 | struct _fpstate_ia32 __user * buf; | ||
258 | err |= __get_user(tmp, &sc->fpstate); | ||
259 | buf = compat_ptr(tmp); | ||
260 | if (buf) { | ||
261 | if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) | ||
262 | goto badframe; | ||
263 | err |= restore_i387_ia32(current, buf, 0); | ||
264 | } else { /* the frame carries no FPU state: discard whatever the task currently has */ | ||
265 | struct task_struct *me = current; | ||
266 | if (used_math()) { | ||
267 | clear_fpu(me); | ||
268 | clear_used_math(); | ||
269 | } | ||
270 | } | ||
271 | } | ||
272 | |||
273 | { | ||
274 | u32 tmp; | ||
275 | err |= __get_user(tmp, &sc->eax); | ||
276 | *peax = tmp; /* handed back to the caller, which returns it as the syscall result */ | ||
277 | } | ||
278 | return err; | ||
279 | |||
280 | badframe: | ||
281 | return 1; | ||
282 | } | ||
283 | /* sigreturn for 32-bit tasks: restore the blocked-signal mask and the registers saved in the struct sigframe on the user stack. Returns the saved eax; on a bad frame calls signal_fault() and returns 0. */ | ||
284 | asmlinkage long sys32_sigreturn(struct pt_regs *regs) | ||
285 | { | ||
286 | struct sigframe __user *frame = (struct sigframe __user *)(regs->rsp-8); /* NOTE(review): the 8 presumably covers pretcode and sig, already popped by the handler's ret and the stub's popl - confirm */ | ||
287 | sigset_t set; | ||
288 | unsigned int eax; | ||
289 | |||
290 | if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) | ||
291 | goto badframe; | ||
292 | if (__get_user(set.sig[0], &frame->sc.oldmask) | ||
293 | || (_COMPAT_NSIG_WORDS > 1 | ||
294 | && __copy_from_user((((char *) &set.sig) + 4), &frame->extramask, | ||
295 | sizeof(frame->extramask)))) | ||
296 | goto badframe; | ||
297 | |||
298 | sigdelsetmask(&set, ~_BLOCKABLE); /* drop mask bits outside _BLOCKABLE before installing the set */ | ||
299 | spin_lock_irq(&current->sighand->siglock); | ||
300 | current->blocked = set; | ||
301 | recalc_sigpending(); | ||
302 | spin_unlock_irq(&current->sighand->siglock); | ||
303 | |||
304 | if (ia32_restore_sigcontext(regs, &frame->sc, &eax)) | ||
305 | goto badframe; | ||
306 | return eax; | ||
307 | |||
308 | badframe: | ||
309 | signal_fault(regs, frame, "32bit sigreturn"); | ||
310 | return 0; | ||
311 | } | ||
312 | /* rt_sigreturn for 32-bit tasks: restore the signal mask, registers and alternate-stack settings saved in the struct rt_sigframe on the user stack. Returns the saved eax; on a bad frame calls signal_fault() and returns 0. */ | ||
313 | asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) | ||
314 | { | ||
315 | struct rt_sigframe __user *frame; | ||
316 | sigset_t set; | ||
317 | unsigned int eax; | ||
318 | struct pt_regs tregs; | ||
319 | |||
320 | frame = (struct rt_sigframe __user *)(regs->rsp - 4); /* NOTE(review): the 4 presumably covers pretcode, already popped by the handler's ret - confirm */ | ||
321 | |||
322 | if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) | ||
323 | goto badframe; | ||
324 | if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) | ||
325 | goto badframe; | ||
326 | |||
327 | sigdelsetmask(&set, ~_BLOCKABLE); /* drop mask bits outside _BLOCKABLE before installing the set */ | ||
328 | spin_lock_irq(&current->sighand->siglock); | ||
329 | current->blocked = set; | ||
330 | recalc_sigpending(); | ||
331 | spin_unlock_irq(&current->sighand->siglock); | ||
332 | |||
333 | if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) | ||
334 | goto badframe; | ||
335 | |||
336 | tregs = *regs; /* sys32_sigaltstack gets a private copy of the register frame */ | ||
337 | if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT) | ||
338 | goto badframe; | ||
339 | |||
340 | return eax; | ||
341 | |||
342 | badframe: | ||
343 | signal_fault(regs,frame,"32bit rt sigreturn"); | ||
344 | return 0; | ||
345 | } | ||
346 | |||
347 | /* | ||
348 | * Set up a signal frame. | ||
349 | */ | ||
350 | |||
351 | static int /* Fill the user sigcontext sc (and the FPU area fpstate) from regs and current; returns nonzero if any user store failed */ | ||
352 | ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __user *fpstate, | ||
353 | struct pt_regs *regs, unsigned int mask) | ||
354 | { | ||
355 | int tmp, err = 0; | ||
356 | u32 eflags; | ||
357 | |||
358 | tmp = 0; | ||
359 | __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); /* gs/fs/ds/es are read live from the CPU, not from regs */ | ||
360 | err |= __put_user(tmp, (unsigned int __user *)&sc->gs); | ||
361 | __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); | ||
362 | err |= __put_user(tmp, (unsigned int __user *)&sc->fs); | ||
363 | __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp)); | ||
364 | err |= __put_user(tmp, (unsigned int __user *)&sc->ds); | ||
365 | __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); | ||
366 | err |= __put_user(tmp, (unsigned int __user *)&sc->es); | ||
367 | |||
368 | err |= __put_user((u32)regs->rdi, &sc->edi); /* general registers are truncated to their low 32 bits */ | ||
369 | err |= __put_user((u32)regs->rsi, &sc->esi); | ||
370 | err |= __put_user((u32)regs->rbp, &sc->ebp); | ||
371 | err |= __put_user((u32)regs->rsp, &sc->esp); | ||
372 | err |= __put_user((u32)regs->rbx, &sc->ebx); | ||
373 | err |= __put_user((u32)regs->rdx, &sc->edx); | ||
374 | err |= __put_user((u32)regs->rcx, &sc->ecx); | ||
375 | err |= __put_user((u32)regs->rax, &sc->eax); | ||
376 | err |= __put_user((u32)regs->cs, &sc->cs); | ||
377 | err |= __put_user((u32)regs->ss, &sc->ss); | ||
378 | err |= __put_user(current->thread.trap_no, &sc->trapno); | ||
379 | err |= __put_user(current->thread.error_code, &sc->err); | ||
380 | err |= __put_user((u32)regs->rip, &sc->eip); | ||
381 | eflags = regs->eflags; | ||
382 | if (current->ptrace & PT_PTRACED) | ||
383 | eflags &= ~TF_MASK; /* the saved flags never show TF while the task is ptraced */ | ||
384 | err |= __put_user((u32)eflags, &sc->eflags); | ||
385 | err |= __put_user((u32)regs->rsp, &sc->esp_at_signal); | ||
386 | |||
387 | tmp = save_i387_ia32(current, fpstate, regs, 0); | ||
388 | if (tmp < 0) | ||
389 | err = -EFAULT; | ||
390 | else { | ||
391 | clear_used_math(); | ||
392 | stts(); | ||
393 | err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), /* NULL is stored when save_i387_ia32() returned 0 */ | ||
394 | &sc->fpstate); | ||
395 | } | ||
396 | |||
397 | /* non-iBCS2 extensions.. */ | ||
398 | err |= __put_user(mask, &sc->oldmask); | ||
399 | err |= __put_user(current->thread.cr2, &sc->cr2); | ||
400 | |||
401 | return err; | ||
402 | } | ||
403 | |||
404 | /* | ||
405 | * Pick the user-stack address for a new signal frame of frame_size bytes: | ||
406 | * the interrupted stack, the sigaltstack, or the legacy sa_restorer stack. | ||
407 | * The result is rounded down to a multiple of 8. | ||
408 | */ | ||
409 | static void __user * | ||
410 | get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) | ||
411 | { | ||
412 | unsigned long top = regs->rsp; /* default: stay on the interrupted stack */ | ||
413 | |||
414 | if (!(ka->sa.sa_flags & SA_ONSTACK)) { | ||
415 | /* Legacy switching: foreign %ss plus a restorer given without SA_RESTORER. */ | ||
416 | if ((regs->ss & 0xffff) != __USER_DS && | ||
417 | !(ka->sa.sa_flags & SA_RESTORER) && | ||
418 | ka->sa.sa_restorer) | ||
419 | top = (unsigned long) ka->sa.sa_restorer; | ||
420 | } else if (sas_ss_flags(top) == 0) { | ||
421 | /* X/Open sanctioned switching to the registered alternate stack. */ | ||
422 | top = current->sas_ss_sp + current->sas_ss_size; | ||
423 | } | ||
424 | |||
425 | /* Reserve room for the frame, then align downwards. */ | ||
426 | top -= frame_size; | ||
427 | top &= -8UL; | ||
428 | return (void __user *)top; | ||
429 | } | ||
430 | /* Deliver signal sig to a 32-bit task through a non-RT frame: build a struct sigframe on the user stack chosen by get_sigframe() and point regs at ka's handler. Any failed user access ends in force_sigsegv(). */ | ||
431 | void ia32_setup_frame(int sig, struct k_sigaction *ka, | ||
432 | compat_sigset_t *set, struct pt_regs * regs) | ||
433 | { | ||
434 | struct sigframe __user *frame; | ||
435 | int err = 0; | ||
436 | |||
437 | frame = get_sigframe(ka, regs, sizeof(*frame)); | ||
438 | |||
439 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) | ||
440 | goto give_sigsegv; | ||
441 | |||
442 | { | ||
443 | struct exec_domain *ed = current_thread_info()->exec_domain; | ||
444 | err |= __put_user((ed | ||
445 | && ed->signal_invmap | ||
446 | && sig < 32 | ||
447 | ? ed->signal_invmap[sig] | ||
448 | : sig), | ||
449 | &frame->sig); | ||
450 | } | ||
451 | if (err) | ||
452 | goto give_sigsegv; | ||
453 | |||
454 | err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs, | ||
455 | set->sig[0]); | ||
456 | if (err) | ||
457 | goto give_sigsegv; | ||
458 | |||
459 | if (_COMPAT_NSIG_WORDS > 1) { | ||
460 | err |= __copy_to_user(frame->extramask, &set->sig[1], | ||
461 | sizeof(frame->extramask)); | ||
462 | } | ||
463 | if (err) | ||
464 | goto give_sigsegv; | ||
465 | |||
466 | /* Return stub is in 32bit vsyscall page */ | ||
467 | { | ||
468 | void __user *restorer = VSYSCALL32_SIGRETURN; | ||
469 | if (ka->sa.sa_flags & SA_RESTORER) | ||
470 | restorer = ka->sa.sa_restorer; | ||
471 | err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); | ||
472 | } | ||
473 | /* These are actually not used anymore, but left because some | ||
474 | gdb versions depend on them as a marker. */ | ||
475 | { | ||
476 | /* copy_to_user optimizes that into a single 8 byte store */ | ||
477 | static const struct { | ||
478 | u16 poplmovl; | ||
479 | u32 val; | ||
480 | u16 int80; | ||
481 | u16 pad; | ||
482 | } __attribute__((packed)) code = { | ||
483 | 0xb858, /* popl %eax ; movl $...,%eax */ | ||
484 | __NR_ia32_sigreturn, | ||
485 | 0x80cd, /* int $0x80 */ | ||
486 | 0, | ||
487 | }; | ||
488 | err |= __copy_to_user(frame->retcode, &code, 8); | ||
489 | } | ||
490 | if (err) | ||
491 | goto give_sigsegv; | ||
492 | |||
493 | /* Set up registers for signal handler */ | ||
494 | regs->rsp = (unsigned long) frame; | ||
495 | regs->rip = (unsigned long) ka->sa.sa_handler; | ||
496 | regs->eflags &= ~0x400UL; /* clear EFLAGS.DF: the ABI requires functions to be entered with DF clear; the interrupted value is already saved in sc.eflags and comes back at sigreturn */ | ||
497 | asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); | ||
498 | asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); | ||
499 | |||
500 | regs->cs = __USER32_CS; | ||
501 | regs->ss = __USER32_DS; | ||
502 | |||
503 | set_fs(USER_DS); | ||
504 | if (regs->eflags & TF_MASK) { | ||
505 | if (current->ptrace & PT_PTRACED) { | ||
506 | ptrace_notify(SIGTRAP); | ||
507 | } else { | ||
508 | regs->eflags &= ~TF_MASK; /* nobody is tracing: do not single-step the handler */ | ||
509 | } | ||
510 | } | ||
511 | |||
512 | #if DEBUG_SIG | ||
513 | printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", | ||
514 | current->comm, current->pid, frame, regs->rip, frame->pretcode); | ||
515 | #endif | ||
516 | |||
517 | return; | ||
518 | |||
519 | give_sigsegv: | ||
520 | force_sigsegv(sig, current); | ||
521 | } | ||
522 | /* Deliver signal sig to a 32-bit task through an RT frame: build a struct rt_sigframe (siginfo plus ucontext) on the user stack chosen by get_sigframe() and point regs at ka's handler. Any failed user access ends in force_sigsegv(). */ | ||
523 | void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | ||
524 | compat_sigset_t *set, struct pt_regs * regs) | ||
525 | { | ||
526 | struct rt_sigframe __user *frame; | ||
527 | int err = 0; | ||
528 | |||
529 | frame = get_sigframe(ka, regs, sizeof(*frame)); | ||
530 | |||
531 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) | ||
532 | goto give_sigsegv; | ||
533 | |||
534 | { | ||
535 | struct exec_domain *ed = current_thread_info()->exec_domain; | ||
536 | err |= __put_user((ed | ||
537 | && ed->signal_invmap | ||
538 | && sig < 32 | ||
539 | ? ed->signal_invmap[sig] | ||
540 | : sig), | ||
541 | &frame->sig); | ||
542 | } | ||
543 | err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); | ||
544 | err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); | ||
545 | err |= copy_siginfo_to_user32(&frame->info, info); | ||
546 | if (err) | ||
547 | goto give_sigsegv; | ||
548 | |||
549 | /* Create the ucontext. */ | ||
550 | err |= __put_user(0, &frame->uc.uc_flags); | ||
551 | err |= __put_user(0, &frame->uc.uc_link); | ||
552 | err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); | ||
553 | err |= __put_user(sas_ss_flags(regs->rsp), | ||
554 | &frame->uc.uc_stack.ss_flags); | ||
555 | err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); | ||
556 | err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, | ||
557 | regs, set->sig[0]); | ||
558 | err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); | ||
559 | if (err) | ||
560 | goto give_sigsegv; | ||
561 | |||
562 | |||
563 | { | ||
564 | void __user *restorer = VSYSCALL32_RTSIGRETURN; | ||
565 | if (ka->sa.sa_flags & SA_RESTORER) | ||
566 | restorer = ka->sa.sa_restorer; | ||
567 | err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); | ||
568 | } | ||
569 | |||
570 | /* This is movl $,%eax ; int $0x80 */ | ||
571 | /* Not actually used anymore, but left because some gdb versions | ||
572 | need it. */ | ||
573 | { | ||
574 | /* __copy_to_user optimizes that into a single 8 byte store */ | ||
575 | static const struct { | ||
576 | u8 movl; | ||
577 | u32 val; | ||
578 | u16 int80; | ||
579 | u16 pad; | ||
580 | u8 pad2; | ||
581 | } __attribute__((packed)) code = { | ||
582 | 0xb8, | ||
583 | __NR_ia32_rt_sigreturn, | ||
584 | 0x80cd, | ||
585 | 0, | ||
586 | }; | ||
587 | err |= __copy_to_user(frame->retcode, &code, 8); /* only the first 8 bytes of the 10-byte template are copied */ | ||
588 | } | ||
589 | if (err) | ||
590 | goto give_sigsegv; | ||
591 | |||
592 | /* Set up registers for signal handler */ | ||
593 | regs->rsp = (unsigned long) frame; | ||
594 | regs->rip = (unsigned long) ka->sa.sa_handler; | ||
595 | regs->eflags &= ~0x400UL; /* clear EFLAGS.DF: the ABI requires functions to be entered with DF clear; the interrupted value is already saved in uc_mcontext.eflags and comes back at sigreturn */ | ||
596 | asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); | ||
597 | asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); | ||
598 | |||
599 | regs->cs = __USER32_CS; | ||
600 | regs->ss = __USER32_DS; | ||
601 | |||
602 | set_fs(USER_DS); | ||
603 | if (regs->eflags & TF_MASK) { | ||
604 | if (current->ptrace & PT_PTRACED) { | ||
605 | ptrace_notify(SIGTRAP); | ||
606 | } else { | ||
607 | regs->eflags &= ~TF_MASK; /* nobody is tracing: do not single-step the handler */ | ||
608 | } | ||
609 | } | ||
610 | |||
611 | #if DEBUG_SIG | ||
612 | printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", | ||
613 | current->comm, current->pid, frame, regs->rip, frame->pretcode); | ||
614 | #endif | ||
615 | |||
616 | return; | ||
617 | |||
618 | give_sigsegv: | ||
619 | force_sigsegv(sig, current); | ||
620 | } | ||
621 | |||
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S new file mode 100644 index 000000000000..f3ca0db85b5b --- /dev/null +++ b/arch/x86_64/ia32/ia32entry.S | |||
@@ -0,0 +1,602 @@ | |||
1 | /* | ||
2 | * Compatibility mode system call entry point for x86-64. | ||
3 | * | ||
4 | * Copyright 2000-2002 Andi Kleen, SuSE Labs. | ||
5 | */ | ||
6 | |||
7 | #include <asm/dwarf2.h> | ||
8 | #include <asm/calling.h> | ||
9 | #include <asm/offset.h> | ||
10 | #include <asm/current.h> | ||
11 | #include <asm/errno.h> | ||
12 | #include <asm/ia32_unistd.h> | ||
13 | #include <asm/thread_info.h> | ||
14 | #include <asm/segment.h> | ||
15 | #include <asm/vsyscall32.h> | ||
16 | #include <linux/linkage.h> | ||
17 | |||
18 | .macro IA32_ARG_FIXUP noebp=0 /* shuffle i386 syscall argument registers into the x86-64 C argument registers */ | ||
19 | movl %edi,%r8d /* Arg5: %edi -> %r8 */ | ||
20 | .if \noebp | ||
21 | .else | ||
22 | movl %ebp,%r9d /* Arg6: %ebp -> %r9, skipped when noebp is set (caller has already loaded %r9d) */ | ||
23 | .endif | ||
24 | xchg %ecx,%esi /* Arg2: %ecx -> %esi and Arg4: %esi -> %ecx */ | ||
25 | movl %ebx,%edi /* Arg1: %ebx -> %edi */ | ||
26 | movl %edx,%edx /* zero extension */ | ||
27 | .endm | ||
28 | |||
29 | /* clobbers %eax; stores zero into the saved R8-R11 slots of the register frame at %rsp */ | ||
30 | .macro CLEAR_RREGS | ||
31 | xorl %eax,%eax | ||
32 | movq %rax,R11(%rsp) | ||
33 | movq %rax,R10(%rsp) | ||
34 | movq %rax,R9(%rsp) | ||
35 | movq %rax,R8(%rsp) | ||
36 | .endm | ||
37 | |||
38 | /* | ||
39 | * 32bit SYSENTER instruction entry. | ||
40 | * | ||
41 | * Arguments: | ||
42 | * %eax System call number. | ||
43 | * %ebx Arg1 | ||
44 | * %ecx Arg2 | ||
45 | * %edx Arg3 | ||
46 | * %esi Arg4 | ||
47 | * %edi Arg5 | ||
48 | * %ebp user stack | ||
49 | * 0(%ebp) Arg6 | ||
50 | * | ||
51 | * Interrupts off. | ||
52 | * | ||
53 | * This is purely a fast path. For anything complicated we use the int 0x80 | ||
54 | * path below. Set up a complete hardware stack frame to share code | ||
55 | * with the int 0x80 path. | ||
56 | */ | ||
57 | ENTRY(ia32_sysenter_target) | ||
58 | CFI_STARTPROC | ||
59 | swapgs | ||
60 | movq %gs:pda_kernelstack, %rsp | ||
61 | addq $(PDA_STACKOFFSET),%rsp | ||
62 | sti | ||
63 | movl %ebp,%ebp /* zero extension */ | ||
64 | pushq $__USER32_DS | ||
65 | pushq %rbp | ||
66 | pushfq | ||
67 | movl $VSYSCALL32_SYSEXIT, %r10d | ||
68 | pushq $__USER32_CS | ||
69 | movl %eax, %eax /* zero-extend the syscall number */ | ||
70 | pushq %r10 /* return address: the sysexit landing pad in the vsyscall page */ | ||
71 | pushq %rax | ||
72 | cld | ||
73 | SAVE_ARGS 0,0,1 | ||
74 | /* no need to do an access_ok check here because rbp has been | ||
75 | 32bit zero extended */ | ||
76 | 1: movl (%rbp),%r9d /* Arg6 from the user stack; a fault goes to ia32_badarg via __ex_table */ | ||
77 | .section __ex_table,"a" | ||
78 | .quad 1b,ia32_badarg | ||
79 | .previous | ||
80 | GET_THREAD_INFO(%r10) | ||
81 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) | ||
82 | jnz sysenter_tracesys | ||
83 | sysenter_do_call: | ||
84 | cmpq $(IA32_NR_syscalls),%rax /* check all 64 bits: the table below is indexed with %rax, and the traced path reloads it via LOAD_ARGS (defined elsewhere; presumably a full 64-bit load of a ptrace-writable slot) */ | ||
85 | jae ia32_badsys | ||
86 | IA32_ARG_FIXUP 1 | ||
87 | call *ia32_sys_call_table(,%rax,8) | ||
88 | movq %rax,RAX-ARGOFFSET(%rsp) | ||
89 | GET_THREAD_INFO(%r10) | ||
90 | cli | ||
91 | testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) | ||
92 | jnz int_ret_from_sys_call | ||
93 | /* clear IF so that popfq doesn't enable interrupts early */ | ||
94 | andl $~0x200,EFLAGS-R11(%rsp) | ||
95 | RESTORE_ARGS 1,24,1,1,1,1 | ||
96 | popfq | ||
97 | popq %rcx /* User %esp */ | ||
98 | movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */ | ||
99 | swapgs | ||
100 | sti /* sti only takes effect after the next instruction */ | ||
101 | /* sysexit */ | ||
102 | .byte 0xf, 0x35 | ||
103 | |||
104 | sysenter_tracesys: | ||
105 | SAVE_REST | ||
106 | CLEAR_RREGS | ||
107 | movq $-ENOSYS,RAX(%rsp) /* really needed? */ | ||
108 | movq %rsp,%rdi /* &pt_regs -> arg1 */ | ||
109 | call syscall_trace_enter | ||
110 | LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ | ||
111 | RESTORE_REST | ||
112 | movl %ebp, %ebp | ||
113 | /* no need to do an access_ok check here because rbp has been | ||
114 | 32bit zero extended */ | ||
115 | 1: movl (%rbp),%r9d | ||
116 | .section __ex_table,"a" | ||
117 | .quad 1b,ia32_badarg | ||
118 | .previous | ||
119 | jmp sysenter_do_call | ||
120 | CFI_ENDPROC | ||
121 | |||
122 | /* | ||
123 | * 32bit SYSCALL instruction entry. | ||
124 | * | ||
125 | * Arguments: | ||
126 | * %eax System call number. | ||
127 | * %ebx Arg1 | ||
128 | * %ecx return EIP | ||
129 | * %edx Arg3 | ||
130 | * %esi Arg4 | ||
131 | * %edi Arg5 | ||
132 | * %ebp Arg2 [note: not saved in the stack frame, should not be touched] | ||
133 | * %esp user stack | ||
134 | * 0(%esp) Arg6 | ||
135 | * | ||
136 | * Interrupts off. | ||
137 | * | ||
138 | * This is purely a fast path. For anything complicated we use the int 0x80 | ||
139 | * path below. Set up a complete hardware stack frame to share code | ||
140 | * with the int 0x80 path. | ||
141 | */ | ||
142 | ENTRY(ia32_cstar_target) | ||
143 | CFI_STARTPROC | ||
144 | swapgs | ||
145 | movl %esp,%r8d /* keep the zero-extended user stack pointer in %r8 */ | ||
146 | movq %gs:pda_kernelstack,%rsp | ||
147 | sti | ||
148 | SAVE_ARGS 8,1,1 | ||
149 | movl %eax,%eax /* zero extension */ | ||
150 | movq %rax,ORIG_RAX-ARGOFFSET(%rsp) | ||
151 | movq %rcx,RIP-ARGOFFSET(%rsp) | ||
152 | movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ | ||
153 | movl %ebp,%ecx /* Arg2 arrives in %ebp; move it to where IA32_ARG_FIXUP expects it */ | ||
154 | movq $__USER32_CS,CS-ARGOFFSET(%rsp) | ||
155 | movq $__USER32_DS,SS-ARGOFFSET(%rsp) | ||
156 | movq %r11,EFLAGS-ARGOFFSET(%rsp) | ||
157 | movq %r8,RSP-ARGOFFSET(%rsp) | ||
158 | /* no need to do an access_ok check here because r8 has been | ||
159 | 32bit zero extended */ | ||
160 | /* hardware stack frame is complete now */ | ||
161 | 1: movl (%r8),%r9d /* Arg6 from the user stack; a fault goes to ia32_badarg via __ex_table */ | ||
162 | .section __ex_table,"a" | ||
163 | .quad 1b,ia32_badarg | ||
164 | .previous | ||
165 | GET_THREAD_INFO(%r10) | ||
166 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) | ||
167 | jnz cstar_tracesys | ||
168 | cstar_do_call: | ||
169 | cmpq $IA32_NR_syscalls,%rax /* check all 64 bits: the table below is indexed with %rax, and the traced path reloads it via LOAD_ARGS (defined elsewhere; presumably a full 64-bit load of a ptrace-writable slot) */ | ||
170 | jae ia32_badsys | ||
171 | IA32_ARG_FIXUP 1 | ||
172 | call *ia32_sys_call_table(,%rax,8) | ||
173 | movq %rax,RAX-ARGOFFSET(%rsp) | ||
174 | GET_THREAD_INFO(%r10) | ||
175 | cli | ||
176 | testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) | ||
177 | jnz int_ret_from_sys_call | ||
178 | RESTORE_ARGS 1,-ARG_SKIP,1,1,1 | ||
179 | movl RIP-ARGOFFSET(%rsp),%ecx | ||
180 | movl EFLAGS-ARGOFFSET(%rsp),%r11d | ||
181 | movl RSP-ARGOFFSET(%rsp),%esp | ||
182 | swapgs | ||
183 | sysretl | ||
184 | |||
185 | cstar_tracesys: | ||
186 | SAVE_REST | ||
187 | CLEAR_RREGS | ||
188 | movq $-ENOSYS,RAX(%rsp) /* really needed? */ | ||
189 | movq %rsp,%rdi /* &pt_regs -> arg1 */ | ||
190 | call syscall_trace_enter | ||
191 | LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ | ||
192 | RESTORE_REST | ||
193 | movl RSP-ARGOFFSET(%rsp), %r8d | ||
194 | /* no need to do an access_ok check here because r8 has been | ||
195 | 32bit zero extended */ | ||
196 | 1: movl (%r8),%r9d | ||
197 | .section __ex_table,"a" | ||
198 | .quad 1b,ia32_badarg | ||
199 | .previous | ||
200 | jmp cstar_do_call | ||
201 | |||
202 | ia32_badarg: | ||
203 | movq $-EFAULT,%rax /* fixup target for a faulting Arg6 load: the syscall returns -EFAULT */ | ||
204 | jmp ia32_sysret | ||
205 | CFI_ENDPROC | ||
206 | |||
207 | /* | ||
208 | * Emulated IA32 system calls via int 0x80. | ||
209 | * | ||
210 | * Arguments: | ||
211 | * %eax System call number. | ||
212 | * %ebx Arg1 | ||
213 | * %ecx Arg2 | ||
214 | * %edx Arg3 | ||
215 | * %esi Arg4 | ||
216 | * %edi Arg5 | ||
217 | * %ebp Arg6 [note: not saved in the stack frame, should not be touched] | ||
218 | * | ||
219 | * Notes: | ||
220 | * Uses the same stack frame as the x86-64 version. | ||
221 | * All registers except %eax must be saved (but ptrace may violate that) | ||
222 | * Arguments are zero extended. For system calls that want sign extension and | ||
223 | * take long arguments a wrapper is needed. Most calls can just be called | ||
224 | * directly. | ||
225 | * Assumes it is only called from user space and entered with interrupts off. | ||
226 | */ | ||
227 | |||
228 | ENTRY(ia32_syscall) | ||
229 | CFI_STARTPROC | ||
230 | swapgs | ||
231 | sti | ||
232 | movl %eax,%eax /* zero-extend the syscall number */ | ||
233 | pushq %rax | ||
234 | cld | ||
235 | /* note the registers are not zero extended to the sf. | ||
236 | this could be a problem. */ | ||
237 | SAVE_ARGS 0,0,1 | ||
238 | GET_THREAD_INFO(%r10) | ||
239 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) | ||
240 | jnz ia32_tracesys | ||
241 | ia32_do_syscall: | ||
242 | cmpq $(IA32_NR_syscalls),%rax /* check all 64 bits: the table below is indexed with %rax, and the traced path reloads it via LOAD_ARGS (defined elsewhere; presumably a full 64-bit load of a ptrace-writable slot) */ | ||
243 | jae ia32_badsys | ||
244 | IA32_ARG_FIXUP | ||
245 | call *ia32_sys_call_table(,%rax,8) # xxx: rip relative | ||
246 | ia32_sysret: | ||
247 | movq %rax,RAX-ARGOFFSET(%rsp) /* store the return value in the saved frame */ | ||
248 | jmp int_ret_from_sys_call | ||
249 | |||
250 | ia32_tracesys: | ||
251 | SAVE_REST | ||
252 | movq $-ENOSYS,RAX(%rsp) /* really needed? */ | ||
253 | movq %rsp,%rdi /* &pt_regs -> arg1 */ | ||
254 | call syscall_trace_enter | ||
255 | LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ | ||
256 | RESTORE_REST | ||
257 | jmp ia32_do_syscall | ||
258 | |||
259 | ia32_badsys: | ||
260 | movq $0,ORIG_RAX-ARGOFFSET(%rsp) | ||
261 | movq $-ENOSYS,RAX-ARGOFFSET(%rsp) /* out-of-range syscall number: return -ENOSYS */ | ||
262 | jmp int_ret_from_sys_call | ||
263 | |||
264 | ni_syscall: | ||
265 | movq %rax,%rdi /* pass the syscall number as the first argument */ | ||
266 | jmp sys32_ni_syscall | ||
267 | |||
268 | quiet_ni_syscall: | ||
269 | movq $-ENOSYS,%rax /* return -ENOSYS without calling sys32_ni_syscall */ | ||
270 | ret | ||
271 | CFI_ENDPROC | ||
272 | |||
273 | .macro PTREGSCALL label, func, arg /* emit global stub "label": puts the address of "func" in %rax and a pointer derived from %rsp in register "arg", then jumps to ia32_ptregs_common */ | ||
274 | .globl \label | ||
275 | \label: | ||
276 | leaq \func(%rip),%rax | ||
277 | leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ | ||
278 | jmp ia32_ptregs_common | ||
279 | .endm | ||
280 | |||
281 | PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi /* pt_regs is the 1st C argument */ | ||
282 | PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi /* pt_regs is the 1st C argument */ | ||
283 | PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx /* pt_regs is the 3rd C argument */ | ||
284 | PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx | ||
285 | PTREGSCALL stub32_execve, sys32_execve, %rcx | ||
286 | PTREGSCALL stub32_fork, sys_fork, %rdi | ||
287 | PTREGSCALL stub32_clone, sys32_clone, %rdx | ||
288 | PTREGSCALL stub32_vfork, sys_vfork, %rdi | ||
289 | PTREGSCALL stub32_iopl, sys_iopl, %rsi | ||
290 | PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx | ||
291 | |||
292 | ENTRY(ia32_ptregs_common) /* shared tail of the PTREGSCALL stubs; expects the target function address in %rax */ | ||
293 | CFI_STARTPROC | ||
294 | popq %r11 /* presumably discards the return address pushed by the table call, since we leave via jmp below - confirm */ | ||
295 | SAVE_REST | ||
296 | call *%rax | ||
297 | RESTORE_REST | ||
298 | jmp ia32_sysret /* misbalances the return cache */ | ||
299 | CFI_ENDPROC | ||
300 | |||
301 | .data | ||
302 | .align 8 | ||
303 | .globl ia32_sys_call_table | ||
304 | ia32_sys_call_table: /* one 8-byte handler address per 32-bit syscall number; the entry paths index it with %rax scaled by 8 */ | ||
305 | .quad sys_restart_syscall | ||
306 | .quad sys_exit | ||
307 | .quad stub32_fork | ||
308 | .quad sys_read | ||
309 | .quad sys_write | ||
310 | .quad sys32_open /* 5 */ | ||
311 | .quad sys_close | ||
312 | .quad sys32_waitpid | ||
313 | .quad sys_creat | ||
314 | .quad sys_link | ||
315 | .quad sys_unlink /* 10 */ | ||
316 | .quad stub32_execve | ||
317 | .quad sys_chdir | ||
318 | .quad compat_sys_time | ||
319 | .quad sys_mknod | ||
320 | .quad sys_chmod /* 15 */ | ||
321 | .quad sys_lchown16 | ||
322 | .quad quiet_ni_syscall /* old break syscall holder */ | ||
323 | .quad sys_stat | ||
324 | .quad sys32_lseek | ||
325 | .quad sys_getpid /* 20 */ | ||
326 | .quad compat_sys_mount /* mount */ | ||
327 | .quad sys_oldumount /* old_umount */ | ||
328 | .quad sys_setuid16 | ||
329 | .quad sys_getuid16 | ||
330 | .quad compat_sys_stime /* stime */ /* 25 */ | ||
331 | .quad sys32_ptrace /* ptrace */ | ||
332 | .quad sys_alarm | ||
333 | .quad sys_fstat /* (old)fstat */ | ||
334 | .quad sys_pause | ||
335 | .quad compat_sys_utime /* 30 */ | ||
336 | .quad quiet_ni_syscall /* old stty syscall holder */ | ||
337 | .quad quiet_ni_syscall /* old gtty syscall holder */ | ||
338 | .quad sys_access | ||
339 | .quad sys_nice | ||
340 | .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */ | ||
341 | .quad sys_sync | ||
342 | .quad sys32_kill | ||
343 | .quad sys_rename | ||
344 | .quad sys_mkdir | ||
345 | .quad sys_rmdir /* 40 */ | ||
346 | .quad sys_dup | ||
347 | .quad sys32_pipe | ||
348 | .quad compat_sys_times | ||
349 | .quad quiet_ni_syscall /* old prof syscall holder */ | ||
350 | .quad sys_brk /* 45 */ | ||
351 | .quad sys_setgid16 | ||
352 | .quad sys_getgid16 | ||
353 | .quad sys_signal | ||
354 | .quad sys_geteuid16 | ||
355 | .quad sys_getegid16 /* 50 */ | ||
356 | .quad sys_acct | ||
357 | .quad sys_umount /* new_umount */ | ||
358 | .quad quiet_ni_syscall /* old lock syscall holder */ | ||
359 | .quad compat_sys_ioctl | ||
360 | .quad compat_sys_fcntl64 /* 55 */ | ||
361 | .quad quiet_ni_syscall /* old mpx syscall holder */ | ||
362 | .quad sys_setpgid | ||
363 | .quad quiet_ni_syscall /* old ulimit syscall holder */ | ||
364 | .quad sys32_olduname | ||
365 | .quad sys_umask /* 60 */ | ||
366 | .quad sys_chroot | ||
367 | .quad sys32_ustat | ||
368 | .quad sys_dup2 | ||
369 | .quad sys_getppid | ||
370 | .quad sys_getpgrp /* 65 */ | ||
371 | .quad sys_setsid | ||
372 | .quad sys32_sigaction | ||
373 | .quad sys_sgetmask | ||
374 | .quad sys_ssetmask | ||
375 | .quad sys_setreuid16 /* 70 */ | ||
376 | .quad sys_setregid16 | ||
377 | .quad stub32_sigsuspend | ||
378 | .quad compat_sys_sigpending | ||
379 | .quad sys_sethostname | ||
380 | .quad compat_sys_setrlimit /* 75 */ | ||
381 | .quad compat_sys_old_getrlimit /* old_getrlimit */ | ||
382 | .quad compat_sys_getrusage | ||
383 | .quad sys32_gettimeofday | ||
384 | .quad sys32_settimeofday | ||
385 | .quad sys_getgroups16 /* 80 */ | ||
386 | .quad sys_setgroups16 | ||
387 | .quad sys32_old_select | ||
388 | .quad sys_symlink | ||
389 | .quad sys_lstat | ||
390 | .quad sys_readlink /* 85 */ | ||
391 | #ifdef CONFIG_IA32_AOUT | ||
392 | .quad sys_uselib | ||
393 | #else | ||
394 | .quad quiet_ni_syscall | ||
395 | #endif | ||
396 | .quad sys_swapon | ||
397 | .quad sys_reboot | ||
398 | .quad compat_sys_old_readdir | ||
399 | .quad sys32_mmap /* 90 */ | ||
400 | .quad sys_munmap | ||
401 | .quad sys_truncate | ||
402 | .quad sys_ftruncate | ||
403 | .quad sys_fchmod | ||
404 | .quad sys_fchown16 /* 95 */ | ||
405 | .quad sys_getpriority | ||
406 | .quad sys_setpriority | ||
407 | .quad quiet_ni_syscall /* old profil syscall holder */ | ||
408 | .quad compat_sys_statfs | ||
409 | .quad compat_sys_fstatfs /* 100 */ | ||
410 | .quad sys_ioperm | ||
411 | .quad compat_sys_socketcall | ||
412 | .quad sys_syslog | ||
413 | .quad compat_sys_setitimer | ||
414 | .quad compat_sys_getitimer /* 105 */ | ||
415 | .quad compat_sys_newstat | ||
416 | .quad compat_sys_newlstat | ||
417 | .quad compat_sys_newfstat | ||
418 | .quad sys32_uname | ||
419 | .quad stub32_iopl /* 110 */ | ||
420 | .quad sys_vhangup | ||
421 | .quad quiet_ni_syscall /* old "idle" system call */ | ||
422 | .quad sys32_vm86_warning /* vm86old */ | ||
423 | .quad compat_sys_wait4 | ||
424 | .quad sys_swapoff /* 115 */ | ||
425 | .quad sys32_sysinfo | ||
426 | .quad sys32_ipc | ||
427 | .quad sys_fsync | ||
428 | .quad stub32_sigreturn | ||
429 | .quad stub32_clone /* 120 */ | ||
430 | .quad sys_setdomainname | ||
431 | .quad sys_uname | ||
432 | .quad sys_modify_ldt | ||
433 | .quad sys32_adjtimex | ||
434 | .quad sys32_mprotect /* 125 */ | ||
435 | .quad compat_sys_sigprocmask | ||
436 | .quad quiet_ni_syscall /* create_module */ | ||
437 | .quad sys_init_module | ||
438 | .quad sys_delete_module | ||
439 | .quad quiet_ni_syscall /* 130 get_kernel_syms */ | ||
440 | .quad sys_quotactl | ||
441 | .quad sys_getpgid | ||
442 | .quad sys_fchdir | ||
443 | .quad quiet_ni_syscall /* bdflush */ | ||
444 | .quad sys_sysfs /* 135 */ | ||
445 | .quad sys_personality | ||
446 | .quad quiet_ni_syscall /* for afs_syscall */ | ||
447 | .quad sys_setfsuid16 | ||
448 | .quad sys_setfsgid16 | ||
449 | .quad sys_llseek /* 140 */ | ||
450 | .quad compat_sys_getdents | ||
451 | .quad compat_sys_select | ||
452 | .quad sys_flock | ||
453 | .quad sys_msync | ||
454 | .quad compat_sys_readv /* 145 */ | ||
455 | .quad compat_sys_writev | ||
456 | .quad sys_getsid | ||
457 | .quad sys_fdatasync | ||
458 | .quad sys32_sysctl /* sysctl */ | ||
459 | .quad sys_mlock /* 150 */ | ||
460 | .quad sys_munlock | ||
461 | .quad sys_mlockall | ||
462 | .quad sys_munlockall | ||
463 | .quad sys_sched_setparam | ||
464 | .quad sys_sched_getparam /* 155 */ | ||
465 | .quad sys_sched_setscheduler | ||
466 | .quad sys_sched_getscheduler | ||
467 | .quad sys_sched_yield | ||
468 | .quad sys_sched_get_priority_max | ||
469 | .quad sys_sched_get_priority_min /* 160 */ | ||
470 | .quad sys_sched_rr_get_interval | ||
471 | .quad compat_sys_nanosleep | ||
472 | .quad sys_mremap | ||
473 | .quad sys_setresuid16 | ||
474 | .quad sys_getresuid16 /* 165 */ | ||
475 | .quad sys32_vm86_warning /* vm86 */ | ||
476 | .quad quiet_ni_syscall /* query_module */ | ||
477 | .quad sys_poll | ||
478 | .quad compat_sys_nfsservctl | ||
479 | .quad sys_setresgid16 /* 170 */ | ||
480 | .quad sys_getresgid16 | ||
481 | .quad sys_prctl | ||
482 | .quad stub32_rt_sigreturn | ||
483 | .quad sys32_rt_sigaction | ||
484 | .quad sys32_rt_sigprocmask /* 175 */ | ||
485 | .quad sys32_rt_sigpending | ||
486 | .quad compat_sys_rt_sigtimedwait | ||
487 | .quad sys32_rt_sigqueueinfo | ||
488 | .quad stub32_rt_sigsuspend | ||
489 | .quad sys32_pread /* 180 */ | ||
490 | .quad sys32_pwrite | ||
491 | .quad sys_chown16 | ||
492 | .quad sys_getcwd | ||
493 | .quad sys_capget | ||
494 | .quad sys_capset /* 185 */ | ||
495 | .quad stub32_sigaltstack | ||
496 | .quad sys32_sendfile | ||
497 | .quad quiet_ni_syscall /* streams1 */ | ||
498 | .quad quiet_ni_syscall /* streams2 */ | ||
499 | .quad stub32_vfork /* 190 */ | ||
500 | .quad compat_sys_getrlimit | ||
501 | .quad sys32_mmap2 | ||
502 | .quad sys32_truncate64 | ||
503 | .quad sys32_ftruncate64 | ||
504 | .quad sys32_stat64 /* 195 */ | ||
505 | .quad sys32_lstat64 | ||
506 | .quad sys32_fstat64 | ||
507 | .quad sys_lchown | ||
508 | .quad sys_getuid | ||
509 | .quad sys_getgid /* 200 */ | ||
510 | .quad sys_geteuid | ||
511 | .quad sys_getegid | ||
512 | .quad sys_setreuid | ||
513 | .quad sys_setregid | ||
514 | .quad sys_getgroups /* 205 */ | ||
515 | .quad sys_setgroups | ||
516 | .quad sys_fchown | ||
517 | .quad sys_setresuid | ||
518 | .quad sys_getresuid | ||
519 | .quad sys_setresgid /* 210 */ | ||
520 | .quad sys_getresgid | ||
521 | .quad sys_chown | ||
522 | .quad sys_setuid | ||
523 | .quad sys_setgid | ||
524 | .quad sys_setfsuid /* 215 */ | ||
525 | .quad sys_setfsgid | ||
526 | .quad sys_pivot_root | ||
527 | .quad sys_mincore | ||
528 | .quad sys_madvise | ||
529 | .quad compat_sys_getdents64 /* 220 getdents64 */ | ||
530 | .quad compat_sys_fcntl64 | ||
531 | .quad quiet_ni_syscall /* tux */ | ||
532 | .quad quiet_ni_syscall /* security */ | ||
533 | .quad sys_gettid | ||
534 | .quad sys_readahead /* 225 */ | ||
535 | .quad sys_setxattr | ||
536 | .quad sys_lsetxattr | ||
537 | .quad sys_fsetxattr | ||
538 | .quad sys_getxattr | ||
539 | .quad sys_lgetxattr /* 230 */ | ||
540 | .quad sys_fgetxattr | ||
541 | .quad sys_listxattr | ||
542 | .quad sys_llistxattr | ||
543 | .quad sys_flistxattr | ||
544 | .quad sys_removexattr /* 235 */ | ||
545 | .quad sys_lremovexattr | ||
546 | .quad sys_fremovexattr | ||
547 | .quad sys_tkill | ||
548 | .quad sys_sendfile64 | ||
549 | .quad compat_sys_futex /* 240 */ | ||
550 | .quad compat_sys_sched_setaffinity | ||
551 | .quad compat_sys_sched_getaffinity | ||
552 | .quad sys32_set_thread_area | ||
553 | .quad sys32_get_thread_area | ||
554 | .quad compat_sys_io_setup /* 245 */ | ||
555 | .quad sys_io_destroy | ||
556 | .quad compat_sys_io_getevents | ||
557 | .quad compat_sys_io_submit | ||
558 | .quad sys_io_cancel | ||
559 | .quad sys_fadvise64 /* 250 */ | ||
560 | .quad quiet_ni_syscall /* free_huge_pages */ | ||
561 | .quad sys_exit_group | ||
562 | .quad sys32_lookup_dcookie | ||
563 | .quad sys_epoll_create | ||
564 | .quad sys_epoll_ctl /* 255 */ | ||
565 | .quad sys_epoll_wait | ||
566 | .quad sys_remap_file_pages | ||
567 | .quad sys_set_tid_address | ||
568 | .quad sys32_timer_create | ||
569 | .quad compat_sys_timer_settime /* 260 */ | ||
570 | .quad compat_sys_timer_gettime | ||
571 | .quad sys_timer_getoverrun | ||
572 | .quad sys_timer_delete | ||
573 | .quad compat_sys_clock_settime | ||
574 | .quad compat_sys_clock_gettime /* 265 */ | ||
575 | .quad compat_sys_clock_getres | ||
576 | .quad compat_sys_clock_nanosleep | ||
577 | .quad compat_sys_statfs64 | ||
578 | .quad compat_sys_fstatfs64 | ||
579 | .quad sys_tgkill /* 270 */ | ||
580 | .quad compat_sys_utimes | ||
581 | .quad sys32_fadvise64_64 | ||
582 | .quad quiet_ni_syscall /* sys_vserver */ | ||
583 | .quad sys_mbind | ||
584 | .quad compat_sys_get_mempolicy /* 275 */ | ||
585 | .quad sys_set_mempolicy | ||
586 | .quad compat_sys_mq_open | ||
587 | .quad sys_mq_unlink | ||
588 | .quad compat_sys_mq_timedsend | ||
589 | .quad compat_sys_mq_timedreceive /* 280 */ | ||
590 | .quad compat_sys_mq_notify | ||
591 | .quad compat_sys_mq_getsetattr | ||
592 | .quad quiet_ni_syscall /* reserved for kexec */ | ||
593 | .quad compat_sys_waitid | ||
594 | .quad quiet_ni_syscall /* 285 */ /* sys_altroot */ | ||
595 | .quad sys_add_key | ||
596 | .quad sys_request_key | ||
597 | .quad sys_keyctl | ||
598 | /* don't forget to change IA32_NR_syscalls */ | ||
599 | ia32_syscall_end: | ||
600 | .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8 /* pad the remaining slots up to IA32_NR_syscalls */ | ||
601 | .quad ni_syscall | ||
602 | .endr | ||
diff --git a/arch/x86_64/ia32/ipc32.c b/arch/x86_64/ia32/ipc32.c new file mode 100644 index 000000000000..369151dc3213 --- /dev/null +++ b/arch/x86_64/ia32/ipc32.c | |||
@@ -0,0 +1,57 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/spinlock.h> | ||
3 | #include <linux/list.h> | ||
4 | #include <linux/syscalls.h> | ||
5 | #include <linux/time.h> | ||
6 | #include <linux/sem.h> | ||
7 | #include <linux/msg.h> | ||
8 | #include <linux/shm.h> | ||
9 | #include <linux/ipc.h> | ||
10 | #include <linux/compat.h> | ||
11 | |||
12 | #include <asm-i386/ipc.h> | ||
13 | |||
14 | asmlinkage long | ||
15 | sys32_ipc(u32 call, int first, int second, int third, | ||
16 | compat_uptr_t ptr, u32 fifth) | ||
17 | { | ||
18 | int version; | ||
19 | |||
20 | version = call >> 16; /* hack for backward compatibility */ | ||
21 | call &= 0xffff; | ||
22 | |||
23 | switch (call) { | ||
24 | case SEMOP: | ||
25 | /* struct sembuf is the same on 32 and 64bit :)) */ | ||
26 | return sys_semtimedop(first, compat_ptr(ptr), second, NULL); | ||
27 | case SEMTIMEDOP: | ||
28 | return compat_sys_semtimedop(first, compat_ptr(ptr), second, | ||
29 | compat_ptr(fifth)); | ||
30 | case SEMGET: | ||
31 | return sys_semget(first, second, third); | ||
32 | case SEMCTL: | ||
33 | return compat_sys_semctl(first, second, third, compat_ptr(ptr)); | ||
34 | |||
35 | case MSGSND: | ||
36 | return compat_sys_msgsnd(first, second, third, compat_ptr(ptr)); | ||
37 | case MSGRCV: | ||
38 | return compat_sys_msgrcv(first, second, fifth, third, | ||
39 | version, compat_ptr(ptr)); | ||
40 | case MSGGET: | ||
41 | return sys_msgget((key_t) first, second); | ||
42 | case MSGCTL: | ||
43 | return compat_sys_msgctl(first, second, compat_ptr(ptr)); | ||
44 | |||
45 | case SHMAT: | ||
46 | return compat_sys_shmat(first, second, third, version, | ||
47 | compat_ptr(ptr)); | ||
48 | break; | ||
49 | case SHMDT: | ||
50 | return sys_shmdt(compat_ptr(ptr)); | ||
51 | case SHMGET: | ||
52 | return sys_shmget(first, (unsigned)second, third); | ||
53 | case SHMCTL: | ||
54 | return compat_sys_shmctl(first, second, compat_ptr(ptr)); | ||
55 | } | ||
56 | return -ENOSYS; | ||
57 | } | ||
diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c new file mode 100644 index 000000000000..b98b6d2462f6 --- /dev/null +++ b/arch/x86_64/ia32/ptrace32.c | |||
@@ -0,0 +1,379 @@ | |||
1 | /* | ||
2 | * 32bit ptrace for x86-64. | ||
3 | * | ||
4 | * Copyright 2001,2002 Andi Kleen, SuSE Labs. | ||
5 | * Some parts copied from arch/i386/kernel/ptrace.c. See that file for earlier | ||
6 | * copyright. | ||
7 | * | ||
8 | * This allows to access 64bit processes too; but there is no way to see the extended | ||
9 | * register contents. | ||
10 | * | ||
11 | * $Id: ptrace32.c,v 1.16 2003/03/14 16:06:35 ak Exp $ | ||
12 | */ | ||
13 | |||
14 | #include <linux/kernel.h> | ||
15 | #include <linux/stddef.h> | ||
16 | #include <linux/sched.h> | ||
17 | #include <linux/syscalls.h> | ||
18 | #include <linux/unistd.h> | ||
19 | #include <linux/mm.h> | ||
20 | #include <linux/ptrace.h> | ||
21 | #include <asm/ptrace.h> | ||
22 | #include <asm/compat.h> | ||
23 | #include <asm/uaccess.h> | ||
24 | #include <asm/user32.h> | ||
25 | #include <asm/user.h> | ||
26 | #include <asm/errno.h> | ||
27 | #include <asm/debugreg.h> | ||
28 | #include <asm/i387.h> | ||
29 | #include <asm/fpu32.h> | ||
30 | |||
31 | /* determines which flags the user has access to. */ | ||
32 | /* 1 = access 0 = no access */ | ||
33 | #define FLAG_MASK 0x44dd5UL | ||
34 | |||
35 | #define R32(l,q) \ | ||
36 | case offsetof(struct user32, regs.l): stack[offsetof(struct pt_regs, q)/8] = val; break | ||
37 | |||
38 | static int putreg32(struct task_struct *child, unsigned regno, u32 val) | ||
39 | { | ||
40 | int i; | ||
41 | __u64 *stack = (__u64 *)(child->thread.rsp0 - sizeof(struct pt_regs)); | ||
42 | |||
43 | switch (regno) { | ||
44 | case offsetof(struct user32, regs.fs): | ||
45 | if (val && (val & 3) != 3) return -EIO; | ||
46 | child->thread.fs = val & 0xffff; | ||
47 | break; | ||
48 | case offsetof(struct user32, regs.gs): | ||
49 | if (val && (val & 3) != 3) return -EIO; | ||
50 | child->thread.gs = val & 0xffff; | ||
51 | break; | ||
52 | case offsetof(struct user32, regs.ds): | ||
53 | if (val && (val & 3) != 3) return -EIO; | ||
54 | child->thread.ds = val & 0xffff; | ||
55 | break; | ||
56 | case offsetof(struct user32, regs.es): | ||
57 | child->thread.es = val & 0xffff; | ||
58 | break; | ||
59 | case offsetof(struct user32, regs.ss): | ||
60 | if ((val & 3) != 3) return -EIO; | ||
61 | stack[offsetof(struct pt_regs, ss)/8] = val & 0xffff; | ||
62 | break; | ||
63 | case offsetof(struct user32, regs.cs): | ||
64 | if ((val & 3) != 3) return -EIO; | ||
65 | stack[offsetof(struct pt_regs, cs)/8] = val & 0xffff; | ||
66 | break; | ||
67 | |||
68 | R32(ebx, rbx); | ||
69 | R32(ecx, rcx); | ||
70 | R32(edx, rdx); | ||
71 | R32(edi, rdi); | ||
72 | R32(esi, rsi); | ||
73 | R32(ebp, rbp); | ||
74 | R32(eax, rax); | ||
75 | R32(orig_eax, orig_rax); | ||
76 | R32(eip, rip); | ||
77 | R32(esp, rsp); | ||
78 | |||
79 | case offsetof(struct user32, regs.eflags): { | ||
80 | __u64 *flags = &stack[offsetof(struct pt_regs, eflags)/8]; | ||
81 | val &= FLAG_MASK; | ||
82 | *flags = val | (*flags & ~FLAG_MASK); | ||
83 | break; | ||
84 | } | ||
85 | |||
86 | case offsetof(struct user32, u_debugreg[4]): | ||
87 | case offsetof(struct user32, u_debugreg[5]): | ||
88 | return -EIO; | ||
89 | |||
90 | case offsetof(struct user32, u_debugreg[0]): | ||
91 | child->thread.debugreg0 = val; | ||
92 | break; | ||
93 | |||
94 | case offsetof(struct user32, u_debugreg[1]): | ||
95 | child->thread.debugreg1 = val; | ||
96 | break; | ||
97 | |||
98 | case offsetof(struct user32, u_debugreg[2]): | ||
99 | child->thread.debugreg2 = val; | ||
100 | break; | ||
101 | |||
102 | case offsetof(struct user32, u_debugreg[3]): | ||
103 | child->thread.debugreg3 = val; | ||
104 | break; | ||
105 | |||
106 | case offsetof(struct user32, u_debugreg[6]): | ||
107 | child->thread.debugreg6 = val; | ||
108 | break; | ||
109 | |||
110 | case offsetof(struct user32, u_debugreg[7]): | ||
111 | val &= ~DR_CONTROL_RESERVED; | ||
112 | /* See arch/i386/kernel/ptrace.c for an explanation of | ||
113 | * this awkward check.*/ | ||
114 | for(i=0; i<4; i++) | ||
115 | if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1) | ||
116 | return -EIO; | ||
117 | child->thread.debugreg7 = val; | ||
118 | break; | ||
119 | |||
120 | default: | ||
121 | if (regno > sizeof(struct user32) || (regno & 3)) | ||
122 | return -EIO; | ||
123 | |||
124 | /* Other dummy fields in the virtual user structure are ignored */ | ||
125 | break; | ||
126 | } | ||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | #undef R32 | ||
131 | |||
132 | #define R32(l,q) \ | ||
133 | case offsetof(struct user32, regs.l): *val = stack[offsetof(struct pt_regs, q)/8]; break | ||
134 | |||
135 | static int getreg32(struct task_struct *child, unsigned regno, u32 *val) | ||
136 | { | ||
137 | __u64 *stack = (__u64 *)(child->thread.rsp0 - sizeof(struct pt_regs)); | ||
138 | |||
139 | switch (regno) { | ||
140 | case offsetof(struct user32, regs.fs): | ||
141 | *val = child->thread.fs; | ||
142 | break; | ||
143 | case offsetof(struct user32, regs.gs): | ||
144 | *val = child->thread.gs; | ||
145 | break; | ||
146 | case offsetof(struct user32, regs.ds): | ||
147 | *val = child->thread.ds; | ||
148 | break; | ||
149 | case offsetof(struct user32, regs.es): | ||
150 | *val = child->thread.es; | ||
151 | break; | ||
152 | |||
153 | R32(cs, cs); | ||
154 | R32(ss, ss); | ||
155 | R32(ebx, rbx); | ||
156 | R32(ecx, rcx); | ||
157 | R32(edx, rdx); | ||
158 | R32(edi, rdi); | ||
159 | R32(esi, rsi); | ||
160 | R32(ebp, rbp); | ||
161 | R32(eax, rax); | ||
162 | R32(orig_eax, orig_rax); | ||
163 | R32(eip, rip); | ||
164 | R32(eflags, eflags); | ||
165 | R32(esp, rsp); | ||
166 | |||
167 | case offsetof(struct user32, u_debugreg[0]): | ||
168 | *val = child->thread.debugreg0; | ||
169 | break; | ||
170 | case offsetof(struct user32, u_debugreg[1]): | ||
171 | *val = child->thread.debugreg1; | ||
172 | break; | ||
173 | case offsetof(struct user32, u_debugreg[2]): | ||
174 | *val = child->thread.debugreg2; | ||
175 | break; | ||
176 | case offsetof(struct user32, u_debugreg[3]): | ||
177 | *val = child->thread.debugreg3; | ||
178 | break; | ||
179 | case offsetof(struct user32, u_debugreg[6]): | ||
180 | *val = child->thread.debugreg6; | ||
181 | break; | ||
182 | case offsetof(struct user32, u_debugreg[7]): | ||
183 | *val = child->thread.debugreg7; | ||
184 | break; | ||
185 | |||
186 | default: | ||
187 | if (regno > sizeof(struct user32) || (regno & 3)) | ||
188 | return -EIO; | ||
189 | |||
190 | /* Other dummy fields in the virtual user structure are ignored */ | ||
191 | *val = 0; | ||
192 | break; | ||
193 | } | ||
194 | return 0; | ||
195 | } | ||
196 | |||
197 | #undef R32 | ||
198 | |||
199 | static struct task_struct *find_target(int request, int pid, int *err) | ||
200 | { | ||
201 | struct task_struct *child; | ||
202 | |||
203 | *err = -EPERM; | ||
204 | if (pid == 1) | ||
205 | return NULL; | ||
206 | |||
207 | *err = -ESRCH; | ||
208 | read_lock(&tasklist_lock); | ||
209 | child = find_task_by_pid(pid); | ||
210 | if (child) | ||
211 | get_task_struct(child); | ||
212 | read_unlock(&tasklist_lock); | ||
213 | if (child) { | ||
214 | *err = -EPERM; | ||
215 | if (child->pid == 1) | ||
216 | goto out; | ||
217 | *err = ptrace_check_attach(child, request == PTRACE_KILL); | ||
218 | if (*err < 0) | ||
219 | goto out; | ||
220 | return child; | ||
221 | } | ||
222 | out: | ||
223 | if (child) | ||
224 | put_task_struct(child); | ||
225 | return NULL; | ||
226 | |||
227 | } | ||
228 | |||
229 | asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) | ||
230 | { | ||
231 | struct task_struct *child; | ||
232 | struct pt_regs *childregs; | ||
233 | void __user *datap = compat_ptr(data); | ||
234 | int ret; | ||
235 | __u32 val; | ||
236 | |||
237 | switch (request) { | ||
238 | default: | ||
239 | return sys_ptrace(request, pid, addr, data); | ||
240 | |||
241 | case PTRACE_PEEKTEXT: | ||
242 | case PTRACE_PEEKDATA: | ||
243 | case PTRACE_POKEDATA: | ||
244 | case PTRACE_POKETEXT: | ||
245 | case PTRACE_POKEUSR: | ||
246 | case PTRACE_PEEKUSR: | ||
247 | case PTRACE_GETREGS: | ||
248 | case PTRACE_SETREGS: | ||
249 | case PTRACE_SETFPREGS: | ||
250 | case PTRACE_GETFPREGS: | ||
251 | case PTRACE_SETFPXREGS: | ||
252 | case PTRACE_GETFPXREGS: | ||
253 | case PTRACE_GETEVENTMSG: | ||
254 | break; | ||
255 | } | ||
256 | |||
257 | child = find_target(request, pid, &ret); | ||
258 | if (!child) | ||
259 | return ret; | ||
260 | |||
261 | childregs = (struct pt_regs *)(child->thread.rsp0 - sizeof(struct pt_regs)); | ||
262 | |||
263 | switch (request) { | ||
264 | case PTRACE_PEEKDATA: | ||
265 | case PTRACE_PEEKTEXT: | ||
266 | ret = 0; | ||
267 | if (access_process_vm(child, addr, &val, sizeof(u32), 0)!=sizeof(u32)) | ||
268 | ret = -EIO; | ||
269 | else | ||
270 | ret = put_user(val, (unsigned int __user *)datap); | ||
271 | break; | ||
272 | |||
273 | case PTRACE_POKEDATA: | ||
274 | case PTRACE_POKETEXT: | ||
275 | ret = 0; | ||
276 | if (access_process_vm(child, addr, &data, sizeof(u32), 1)!=sizeof(u32)) | ||
277 | ret = -EIO; | ||
278 | break; | ||
279 | |||
280 | case PTRACE_PEEKUSR: | ||
281 | ret = getreg32(child, addr, &val); | ||
282 | if (ret == 0) | ||
283 | ret = put_user(val, (__u32 __user *)datap); | ||
284 | break; | ||
285 | |||
286 | case PTRACE_POKEUSR: | ||
287 | ret = putreg32(child, addr, data); | ||
288 | break; | ||
289 | |||
290 | case PTRACE_GETREGS: { /* Get all gp regs from the child. */ | ||
291 | int i; | ||
292 | if (!access_ok(VERIFY_WRITE, datap, 16*4)) { | ||
293 | ret = -EIO; | ||
294 | break; | ||
295 | } | ||
296 | ret = 0; | ||
297 | for ( i = 0; i <= 16*4 ; i += sizeof(__u32) ) { | ||
298 | getreg32(child, i, &val); | ||
299 | ret |= __put_user(val,(u32 __user *)datap); | ||
300 | datap += sizeof(u32); | ||
301 | } | ||
302 | break; | ||
303 | } | ||
304 | |||
305 | case PTRACE_SETREGS: { /* Set all gp regs in the child. */ | ||
306 | unsigned long tmp; | ||
307 | int i; | ||
308 | if (!access_ok(VERIFY_READ, datap, 16*4)) { | ||
309 | ret = -EIO; | ||
310 | break; | ||
311 | } | ||
312 | ret = 0; | ||
313 | for ( i = 0; i <= 16*4; i += sizeof(u32) ) { | ||
314 | ret |= __get_user(tmp, (u32 __user *)datap); | ||
315 | putreg32(child, i, tmp); | ||
316 | datap += sizeof(u32); | ||
317 | } | ||
318 | break; | ||
319 | } | ||
320 | |||
321 | case PTRACE_GETFPREGS: | ||
322 | ret = -EIO; | ||
323 | if (!access_ok(VERIFY_READ, compat_ptr(data), | ||
324 | sizeof(struct user_i387_struct))) | ||
325 | break; | ||
326 | save_i387_ia32(child, datap, childregs, 1); | ||
327 | ret = 0; | ||
328 | break; | ||
329 | |||
330 | case PTRACE_SETFPREGS: | ||
331 | ret = -EIO; | ||
332 | if (!access_ok(VERIFY_WRITE, datap, | ||
333 | sizeof(struct user_i387_struct))) | ||
334 | break; | ||
335 | ret = 0; | ||
336 | /* don't check EFAULT to be bug-to-bug compatible to i386 */ | ||
337 | restore_i387_ia32(child, datap, 1); | ||
338 | break; | ||
339 | |||
340 | case PTRACE_GETFPXREGS: { | ||
341 | struct user32_fxsr_struct __user *u = datap; | ||
342 | init_fpu(child); | ||
343 | ret = -EIO; | ||
344 | if (!access_ok(VERIFY_WRITE, u, sizeof(*u))) | ||
345 | break; | ||
346 | ret = -EFAULT; | ||
347 | if (__copy_to_user(u, &child->thread.i387.fxsave, sizeof(*u))) | ||
348 | break; | ||
349 | ret = __put_user(childregs->cs, &u->fcs); | ||
350 | ret |= __put_user(child->thread.ds, &u->fos); | ||
351 | break; | ||
352 | } | ||
353 | case PTRACE_SETFPXREGS: { | ||
354 | struct user32_fxsr_struct __user *u = datap; | ||
355 | unlazy_fpu(child); | ||
356 | ret = -EIO; | ||
357 | if (!access_ok(VERIFY_READ, u, sizeof(*u))) | ||
358 | break; | ||
359 | /* no checking to be bug-to-bug compatible with i386 */ | ||
360 | __copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u)); | ||
361 | set_stopped_child_used_math(child); | ||
362 | child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; | ||
363 | ret = 0; | ||
364 | break; | ||
365 | } | ||
366 | |||
367 | case PTRACE_GETEVENTMSG: | ||
368 | ret = put_user(child->ptrace_message,(unsigned int __user *)compat_ptr(data)); | ||
369 | break; | ||
370 | |||
371 | default: | ||
372 | ret = -EINVAL; | ||
373 | break; | ||
374 | } | ||
375 | |||
376 | put_task_struct(child); | ||
377 | return ret; | ||
378 | } | ||
379 | |||
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c new file mode 100644 index 000000000000..68a9ab06ee7c --- /dev/null +++ b/arch/x86_64/ia32/sys_ia32.c | |||
@@ -0,0 +1,1050 @@ | |||
1 | /* | ||
2 | * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on | ||
3 | * sys_sparc32 | ||
4 | * | ||
5 | * Copyright (C) 2000 VA Linux Co | ||
6 | * Copyright (C) 2000 Don Dugger <n0ano@valinux.com> | ||
7 | * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com> | ||
8 | * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) | ||
9 | * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) | ||
10 | * Copyright (C) 2000 Hewlett-Packard Co. | ||
11 | * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> | ||
12 | * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port) | ||
13 | * | ||
14 | * These routines maintain argument size conversion between 32bit and 64bit | ||
15 | * environment. In 2.5 most of this should be moved to a generic directory. | ||
16 | * | ||
17 | * This file assumes that there is a hole at the end of user address space. | ||
18 | * | ||
19 | * Some of the functions are LE specific currently. These are hopefully all marked. | ||
20 | * This should be fixed. | ||
21 | */ | ||
22 | |||
23 | #include <linux/config.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/sched.h> | ||
26 | #include <linux/fs.h> | ||
27 | #include <linux/file.h> | ||
28 | #include <linux/signal.h> | ||
29 | #include <linux/syscalls.h> | ||
30 | #include <linux/resource.h> | ||
31 | #include <linux/times.h> | ||
32 | #include <linux/utsname.h> | ||
33 | #include <linux/timex.h> | ||
34 | #include <linux/smp.h> | ||
35 | #include <linux/smp_lock.h> | ||
36 | #include <linux/sem.h> | ||
37 | #include <linux/msg.h> | ||
38 | #include <linux/mm.h> | ||
39 | #include <linux/shm.h> | ||
40 | #include <linux/slab.h> | ||
41 | #include <linux/uio.h> | ||
42 | #include <linux/nfs_fs.h> | ||
43 | #include <linux/quota.h> | ||
44 | #include <linux/module.h> | ||
45 | #include <linux/sunrpc/svc.h> | ||
46 | #include <linux/nfsd/nfsd.h> | ||
47 | #include <linux/nfsd/cache.h> | ||
48 | #include <linux/nfsd/xdr.h> | ||
49 | #include <linux/nfsd/syscall.h> | ||
50 | #include <linux/poll.h> | ||
51 | #include <linux/personality.h> | ||
52 | #include <linux/stat.h> | ||
53 | #include <linux/ipc.h> | ||
54 | #include <linux/rwsem.h> | ||
55 | #include <linux/binfmts.h> | ||
56 | #include <linux/init.h> | ||
57 | #include <linux/aio_abi.h> | ||
58 | #include <linux/aio.h> | ||
59 | #include <linux/compat.h> | ||
60 | #include <linux/vfs.h> | ||
61 | #include <linux/ptrace.h> | ||
62 | #include <linux/highuid.h> | ||
63 | #include <linux/vmalloc.h> | ||
64 | #include <asm/mman.h> | ||
65 | #include <asm/types.h> | ||
66 | #include <asm/uaccess.h> | ||
67 | #include <asm/semaphore.h> | ||
68 | #include <asm/atomic.h> | ||
69 | #include <asm/ldt.h> | ||
70 | |||
71 | #include <net/scm.h> | ||
72 | #include <net/sock.h> | ||
73 | #include <asm/ia32.h> | ||
74 | |||
75 | #define AA(__x) ((unsigned long)(__x)) | ||
76 | |||
77 | int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf) | ||
78 | { | ||
79 | typeof(ubuf->st_uid) uid = 0; | ||
80 | typeof(ubuf->st_gid) gid = 0; | ||
81 | SET_UID(uid, kbuf->uid); | ||
82 | SET_GID(gid, kbuf->gid); | ||
83 | if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev)) | ||
84 | return -EOVERFLOW; | ||
85 | if (kbuf->size >= 0x7fffffff) | ||
86 | return -EOVERFLOW; | ||
87 | if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) || | ||
88 | __put_user (old_encode_dev(kbuf->dev), &ubuf->st_dev) || | ||
89 | __put_user (kbuf->ino, &ubuf->st_ino) || | ||
90 | __put_user (kbuf->mode, &ubuf->st_mode) || | ||
91 | __put_user (kbuf->nlink, &ubuf->st_nlink) || | ||
92 | __put_user (uid, &ubuf->st_uid) || | ||
93 | __put_user (gid, &ubuf->st_gid) || | ||
94 | __put_user (old_encode_dev(kbuf->rdev), &ubuf->st_rdev) || | ||
95 | __put_user (kbuf->size, &ubuf->st_size) || | ||
96 | __put_user (kbuf->atime.tv_sec, &ubuf->st_atime) || | ||
97 | __put_user (kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) || | ||
98 | __put_user (kbuf->mtime.tv_sec, &ubuf->st_mtime) || | ||
99 | __put_user (kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) || | ||
100 | __put_user (kbuf->ctime.tv_sec, &ubuf->st_ctime) || | ||
101 | __put_user (kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) || | ||
102 | __put_user (kbuf->blksize, &ubuf->st_blksize) || | ||
103 | __put_user (kbuf->blocks, &ubuf->st_blocks)) | ||
104 | return -EFAULT; | ||
105 | return 0; | ||
106 | } | ||
107 | |||
108 | asmlinkage long | ||
109 | sys32_truncate64(char __user * filename, unsigned long offset_low, unsigned long offset_high) | ||
110 | { | ||
111 | return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low); | ||
112 | } | ||
113 | |||
114 | asmlinkage long | ||
115 | sys32_ftruncate64(unsigned int fd, unsigned long offset_low, unsigned long offset_high) | ||
116 | { | ||
117 | return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); | ||
118 | } | ||
119 | |||
120 | /* Another set for IA32/LFS -- x86_64 struct stat is different due to | ||
121 | support for 64bit inode numbers. */ | ||
122 | |||
123 | static int | ||
124 | cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) | ||
125 | { | ||
126 | typeof(ubuf->st_uid) uid = 0; | ||
127 | typeof(ubuf->st_gid) gid = 0; | ||
128 | SET_UID(uid, stat->uid); | ||
129 | SET_GID(gid, stat->gid); | ||
130 | if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) || | ||
131 | __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) || | ||
132 | __put_user (stat->ino, &ubuf->__st_ino) || | ||
133 | __put_user (stat->ino, &ubuf->st_ino) || | ||
134 | __put_user (stat->mode, &ubuf->st_mode) || | ||
135 | __put_user (stat->nlink, &ubuf->st_nlink) || | ||
136 | __put_user (uid, &ubuf->st_uid) || | ||
137 | __put_user (gid, &ubuf->st_gid) || | ||
138 | __put_user (huge_encode_dev(stat->rdev), &ubuf->st_rdev) || | ||
139 | __put_user (stat->size, &ubuf->st_size) || | ||
140 | __put_user (stat->atime.tv_sec, &ubuf->st_atime) || | ||
141 | __put_user (stat->atime.tv_nsec, &ubuf->st_atime_nsec) || | ||
142 | __put_user (stat->mtime.tv_sec, &ubuf->st_mtime) || | ||
143 | __put_user (stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) || | ||
144 | __put_user (stat->ctime.tv_sec, &ubuf->st_ctime) || | ||
145 | __put_user (stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) || | ||
146 | __put_user (stat->blksize, &ubuf->st_blksize) || | ||
147 | __put_user (stat->blocks, &ubuf->st_blocks)) | ||
148 | return -EFAULT; | ||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | asmlinkage long | ||
153 | sys32_stat64(char __user * filename, struct stat64 __user *statbuf) | ||
154 | { | ||
155 | struct kstat stat; | ||
156 | int ret = vfs_stat(filename, &stat); | ||
157 | if (!ret) | ||
158 | ret = cp_stat64(statbuf, &stat); | ||
159 | return ret; | ||
160 | } | ||
161 | |||
162 | asmlinkage long | ||
163 | sys32_lstat64(char __user * filename, struct stat64 __user *statbuf) | ||
164 | { | ||
165 | struct kstat stat; | ||
166 | int ret = vfs_lstat(filename, &stat); | ||
167 | if (!ret) | ||
168 | ret = cp_stat64(statbuf, &stat); | ||
169 | return ret; | ||
170 | } | ||
171 | |||
172 | asmlinkage long | ||
173 | sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) | ||
174 | { | ||
175 | struct kstat stat; | ||
176 | int ret = vfs_fstat(fd, &stat); | ||
177 | if (!ret) | ||
178 | ret = cp_stat64(statbuf, &stat); | ||
179 | return ret; | ||
180 | } | ||
181 | |||
182 | /* | ||
183 | * Linux/i386 didn't use to be able to handle more than | ||
184 | * 4 system call parameters, so these system calls used a memory | ||
185 | * block for parameter passing.. | ||
186 | */ | ||
187 | |||
188 | struct mmap_arg_struct { | ||
189 | unsigned int addr; | ||
190 | unsigned int len; | ||
191 | unsigned int prot; | ||
192 | unsigned int flags; | ||
193 | unsigned int fd; | ||
194 | unsigned int offset; | ||
195 | }; | ||
196 | |||
197 | asmlinkage long | ||
198 | sys32_mmap(struct mmap_arg_struct __user *arg) | ||
199 | { | ||
200 | struct mmap_arg_struct a; | ||
201 | struct file *file = NULL; | ||
202 | unsigned long retval; | ||
203 | struct mm_struct *mm ; | ||
204 | |||
205 | if (copy_from_user(&a, arg, sizeof(a))) | ||
206 | return -EFAULT; | ||
207 | |||
208 | if (a.offset & ~PAGE_MASK) | ||
209 | return -EINVAL; | ||
210 | |||
211 | if (!(a.flags & MAP_ANONYMOUS)) { | ||
212 | file = fget(a.fd); | ||
213 | if (!file) | ||
214 | return -EBADF; | ||
215 | } | ||
216 | |||
217 | mm = current->mm; | ||
218 | down_write(&mm->mmap_sem); | ||
219 | retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, a.offset>>PAGE_SHIFT); | ||
220 | if (file) | ||
221 | fput(file); | ||
222 | |||
223 | up_write(&mm->mmap_sem); | ||
224 | |||
225 | return retval; | ||
226 | } | ||
227 | |||
228 | asmlinkage long | ||
229 | sys32_mprotect(unsigned long start, size_t len, unsigned long prot) | ||
230 | { | ||
231 | return sys_mprotect(start,len,prot); | ||
232 | } | ||
233 | |||
234 | asmlinkage long | ||
235 | sys32_pipe(int __user *fd) | ||
236 | { | ||
237 | int retval; | ||
238 | int fds[2]; | ||
239 | |||
240 | retval = do_pipe(fds); | ||
241 | if (retval) | ||
242 | goto out; | ||
243 | if (copy_to_user(fd, fds, sizeof(fds))) | ||
244 | retval = -EFAULT; | ||
245 | out: | ||
246 | return retval; | ||
247 | } | ||
248 | |||
249 | asmlinkage long | ||
250 | sys32_rt_sigaction(int sig, struct sigaction32 __user *act, | ||
251 | struct sigaction32 __user *oact, unsigned int sigsetsize) | ||
252 | { | ||
253 | struct k_sigaction new_ka, old_ka; | ||
254 | int ret; | ||
255 | compat_sigset_t set32; | ||
256 | |||
257 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
258 | if (sigsetsize != sizeof(compat_sigset_t)) | ||
259 | return -EINVAL; | ||
260 | |||
261 | if (act) { | ||
262 | compat_uptr_t handler, restorer; | ||
263 | |||
264 | if (!access_ok(VERIFY_READ, act, sizeof(*act)) || | ||
265 | __get_user(handler, &act->sa_handler) || | ||
266 | __get_user(new_ka.sa.sa_flags, &act->sa_flags) || | ||
267 | __get_user(restorer, &act->sa_restorer)|| | ||
268 | __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t))) | ||
269 | return -EFAULT; | ||
270 | new_ka.sa.sa_handler = compat_ptr(handler); | ||
271 | new_ka.sa.sa_restorer = compat_ptr(restorer); | ||
272 | /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */ | ||
273 | switch (_NSIG_WORDS) { | ||
274 | case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6] | ||
275 | | (((long)set32.sig[7]) << 32); | ||
276 | case 3: new_ka.sa.sa_mask.sig[2] = set32.sig[4] | ||
277 | | (((long)set32.sig[5]) << 32); | ||
278 | case 2: new_ka.sa.sa_mask.sig[1] = set32.sig[2] | ||
279 | | (((long)set32.sig[3]) << 32); | ||
280 | case 1: new_ka.sa.sa_mask.sig[0] = set32.sig[0] | ||
281 | | (((long)set32.sig[1]) << 32); | ||
282 | } | ||
283 | } | ||
284 | |||
285 | ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); | ||
286 | |||
287 | if (!ret && oact) { | ||
288 | /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */ | ||
289 | switch (_NSIG_WORDS) { | ||
290 | case 4: | ||
291 | set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32); | ||
292 | set32.sig[6] = old_ka.sa.sa_mask.sig[3]; | ||
293 | case 3: | ||
294 | set32.sig[5] = (old_ka.sa.sa_mask.sig[2] >> 32); | ||
295 | set32.sig[4] = old_ka.sa.sa_mask.sig[2]; | ||
296 | case 2: | ||
297 | set32.sig[3] = (old_ka.sa.sa_mask.sig[1] >> 32); | ||
298 | set32.sig[2] = old_ka.sa.sa_mask.sig[1]; | ||
299 | case 1: | ||
300 | set32.sig[1] = (old_ka.sa.sa_mask.sig[0] >> 32); | ||
301 | set32.sig[0] = old_ka.sa.sa_mask.sig[0]; | ||
302 | } | ||
303 | if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || | ||
304 | __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || | ||
305 | __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || | ||
306 | __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || | ||
307 | __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t))) | ||
308 | return -EFAULT; | ||
309 | } | ||
310 | |||
311 | return ret; | ||
312 | } | ||
313 | |||
314 | asmlinkage long | ||
315 | sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigaction32 __user *oact) | ||
316 | { | ||
317 | struct k_sigaction new_ka, old_ka; | ||
318 | int ret; | ||
319 | |||
320 | if (act) { | ||
321 | compat_old_sigset_t mask; | ||
322 | compat_uptr_t handler, restorer; | ||
323 | |||
324 | if (!access_ok(VERIFY_READ, act, sizeof(*act)) || | ||
325 | __get_user(handler, &act->sa_handler) || | ||
326 | __get_user(new_ka.sa.sa_flags, &act->sa_flags) || | ||
327 | __get_user(restorer, &act->sa_restorer) || | ||
328 | __get_user(mask, &act->sa_mask)) | ||
329 | return -EFAULT; | ||
330 | |||
331 | new_ka.sa.sa_handler = compat_ptr(handler); | ||
332 | new_ka.sa.sa_restorer = compat_ptr(restorer); | ||
333 | |||
334 | siginitset(&new_ka.sa.sa_mask, mask); | ||
335 | } | ||
336 | |||
337 | ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); | ||
338 | |||
339 | if (!ret && oact) { | ||
340 | if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || | ||
341 | __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || | ||
342 | __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || | ||
343 | __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || | ||
344 | __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) | ||
345 | return -EFAULT; | ||
346 | } | ||
347 | |||
348 | return ret; | ||
349 | } | ||
350 | |||
351 | asmlinkage long | ||
352 | sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, | ||
353 | compat_sigset_t __user *oset, unsigned int sigsetsize) | ||
354 | { | ||
355 | sigset_t s; | ||
356 | compat_sigset_t s32; | ||
357 | int ret; | ||
358 | mm_segment_t old_fs = get_fs(); | ||
359 | |||
360 | if (set) { | ||
361 | if (copy_from_user (&s32, set, sizeof(compat_sigset_t))) | ||
362 | return -EFAULT; | ||
363 | switch (_NSIG_WORDS) { | ||
364 | case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32); | ||
365 | case 3: s.sig[2] = s32.sig[4] | (((long)s32.sig[5]) << 32); | ||
366 | case 2: s.sig[1] = s32.sig[2] | (((long)s32.sig[3]) << 32); | ||
367 | case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32); | ||
368 | } | ||
369 | } | ||
370 | set_fs (KERNEL_DS); | ||
371 | ret = sys_rt_sigprocmask(how, set ? &s : NULL, oset ? &s : NULL, | ||
372 | sigsetsize); | ||
373 | set_fs (old_fs); | ||
374 | if (ret) return ret; | ||
375 | if (oset) { | ||
376 | switch (_NSIG_WORDS) { | ||
377 | case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; | ||
378 | case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2]; | ||
379 | case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; | ||
380 | case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; | ||
381 | } | ||
382 | if (copy_to_user (oset, &s32, sizeof(compat_sigset_t))) | ||
383 | return -EFAULT; | ||
384 | } | ||
385 | return 0; | ||
386 | } | ||
387 | |||
388 | static inline long | ||
389 | get_tv32(struct timeval *o, struct compat_timeval __user *i) | ||
390 | { | ||
391 | int err = -EFAULT; | ||
392 | if (access_ok(VERIFY_READ, i, sizeof(*i))) { | ||
393 | err = __get_user(o->tv_sec, &i->tv_sec); | ||
394 | err |= __get_user(o->tv_usec, &i->tv_usec); | ||
395 | } | ||
396 | return err; | ||
397 | } | ||
398 | |||
399 | static inline long | ||
400 | put_tv32(struct compat_timeval __user *o, struct timeval *i) | ||
401 | { | ||
402 | int err = -EFAULT; | ||
403 | if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { | ||
404 | err = __put_user(i->tv_sec, &o->tv_sec); | ||
405 | err |= __put_user(i->tv_usec, &o->tv_usec); | ||
406 | } | ||
407 | return err; | ||
408 | } | ||
409 | |||
410 | extern int do_setitimer(int which, struct itimerval *, struct itimerval *); | ||
411 | |||
412 | asmlinkage long | ||
413 | sys32_alarm(unsigned int seconds) | ||
414 | { | ||
415 | struct itimerval it_new, it_old; | ||
416 | unsigned int oldalarm; | ||
417 | |||
418 | it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; | ||
419 | it_new.it_value.tv_sec = seconds; | ||
420 | it_new.it_value.tv_usec = 0; | ||
421 | do_setitimer(ITIMER_REAL, &it_new, &it_old); | ||
422 | oldalarm = it_old.it_value.tv_sec; | ||
423 | /* ehhh.. We can't return 0 if we have an alarm pending.. */ | ||
424 | /* And we'd better return too much than too little anyway */ | ||
425 | if (it_old.it_value.tv_usec) | ||
426 | oldalarm++; | ||
427 | return oldalarm; | ||
428 | } | ||
429 | |||
430 | /* Translations due to time_t size differences. Which affects all | ||
431 | sorts of things, like timeval and itimerval. */ | ||
432 | |||
433 | extern struct timezone sys_tz; | ||
434 | |||
435 | asmlinkage long | ||
436 | sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) | ||
437 | { | ||
438 | if (tv) { | ||
439 | struct timeval ktv; | ||
440 | do_gettimeofday(&ktv); | ||
441 | if (put_tv32(tv, &ktv)) | ||
442 | return -EFAULT; | ||
443 | } | ||
444 | if (tz) { | ||
445 | if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) | ||
446 | return -EFAULT; | ||
447 | } | ||
448 | return 0; | ||
449 | } | ||
450 | |||
451 | asmlinkage long | ||
452 | sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) | ||
453 | { | ||
454 | struct timeval ktv; | ||
455 | struct timespec kts; | ||
456 | struct timezone ktz; | ||
457 | |||
458 | if (tv) { | ||
459 | if (get_tv32(&ktv, tv)) | ||
460 | return -EFAULT; | ||
461 | kts.tv_sec = ktv.tv_sec; | ||
462 | kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC; | ||
463 | } | ||
464 | if (tz) { | ||
465 | if (copy_from_user(&ktz, tz, sizeof(ktz))) | ||
466 | return -EFAULT; | ||
467 | } | ||
468 | |||
469 | return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); | ||
470 | } | ||
471 | |||
472 | struct sel_arg_struct { | ||
473 | unsigned int n; | ||
474 | unsigned int inp; | ||
475 | unsigned int outp; | ||
476 | unsigned int exp; | ||
477 | unsigned int tvp; | ||
478 | }; | ||
479 | |||
480 | asmlinkage long | ||
481 | sys32_old_select(struct sel_arg_struct __user *arg) | ||
482 | { | ||
483 | struct sel_arg_struct a; | ||
484 | |||
485 | if (copy_from_user(&a, arg, sizeof(a))) | ||
486 | return -EFAULT; | ||
487 | return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), | ||
488 | compat_ptr(a.exp), compat_ptr(a.tvp)); | ||
489 | } | ||
490 | |||
491 | extern asmlinkage long | ||
492 | compat_sys_wait4(compat_pid_t pid, compat_uint_t * stat_addr, int options, | ||
493 | struct compat_rusage *ru); | ||
494 | |||
/*
 * waitpid(2) for 32-bit tasks: compat_sys_wait4() with no rusage buffer.
 */
asmlinkage long
sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options)
{
	return compat_sys_wait4(pid, stat_addr, options, NULL);
}
500 | |||
501 | int sys32_ni_syscall(int call) | ||
502 | { | ||
503 | struct task_struct *me = current; | ||
504 | static char lastcomm[sizeof(me->comm)]; | ||
505 | |||
506 | if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { | ||
507 | printk(KERN_INFO "IA32 syscall %d from %s not implemented\n", | ||
508 | call, me->comm); | ||
509 | strncpy(lastcomm, me->comm, sizeof(lastcomm)); | ||
510 | } | ||
511 | return -ENOSYS; | ||
512 | } | ||
513 | |||
514 | /* 32-bit timeval and related flotsam. */ | ||
515 | |||
/*
 * sysfs(2) for 32-bit tasks: forwards the option and the two 32-bit
 * arguments unchanged to sys_sysfs().
 */
asmlinkage long
sys32_sysfs(int option, u32 arg1, u32 arg2)
{
	return sys_sysfs(option, arg1, arg2);
}
521 | |||
522 | struct sysinfo32 { | ||
523 | s32 uptime; | ||
524 | u32 loads[3]; | ||
525 | u32 totalram; | ||
526 | u32 freeram; | ||
527 | u32 sharedram; | ||
528 | u32 bufferram; | ||
529 | u32 totalswap; | ||
530 | u32 freeswap; | ||
531 | unsigned short procs; | ||
532 | unsigned short pad; | ||
533 | u32 totalhigh; | ||
534 | u32 freehigh; | ||
535 | u32 mem_unit; | ||
536 | char _f[20-2*sizeof(u32)-sizeof(int)]; | ||
537 | }; | ||
538 | |||
539 | asmlinkage long | ||
540 | sys32_sysinfo(struct sysinfo32 __user *info) | ||
541 | { | ||
542 | struct sysinfo s; | ||
543 | int ret; | ||
544 | mm_segment_t old_fs = get_fs (); | ||
545 | int bitcount = 0; | ||
546 | |||
547 | set_fs (KERNEL_DS); | ||
548 | ret = sys_sysinfo(&s); | ||
549 | set_fs (old_fs); | ||
550 | |||
551 | /* Check to see if any memory value is too large for 32-bit and scale | ||
552 | * down if needed | ||
553 | */ | ||
554 | if ((s.totalram >> 32) || (s.totalswap >> 32)) { | ||
555 | while (s.mem_unit < PAGE_SIZE) { | ||
556 | s.mem_unit <<= 1; | ||
557 | bitcount++; | ||
558 | } | ||
559 | s.totalram >>= bitcount; | ||
560 | s.freeram >>= bitcount; | ||
561 | s.sharedram >>= bitcount; | ||
562 | s.bufferram >>= bitcount; | ||
563 | s.totalswap >>= bitcount; | ||
564 | s.freeswap >>= bitcount; | ||
565 | s.totalhigh >>= bitcount; | ||
566 | s.freehigh >>= bitcount; | ||
567 | } | ||
568 | |||
569 | if (!access_ok(VERIFY_WRITE, info, sizeof(struct sysinfo32)) || | ||
570 | __put_user (s.uptime, &info->uptime) || | ||
571 | __put_user (s.loads[0], &info->loads[0]) || | ||
572 | __put_user (s.loads[1], &info->loads[1]) || | ||
573 | __put_user (s.loads[2], &info->loads[2]) || | ||
574 | __put_user (s.totalram, &info->totalram) || | ||
575 | __put_user (s.freeram, &info->freeram) || | ||
576 | __put_user (s.sharedram, &info->sharedram) || | ||
577 | __put_user (s.bufferram, &info->bufferram) || | ||
578 | __put_user (s.totalswap, &info->totalswap) || | ||
579 | __put_user (s.freeswap, &info->freeswap) || | ||
580 | __put_user (s.procs, &info->procs) || | ||
581 | __put_user (s.totalhigh, &info->totalhigh) || | ||
582 | __put_user (s.freehigh, &info->freehigh) || | ||
583 | __put_user (s.mem_unit, &info->mem_unit)) | ||
584 | return -EFAULT; | ||
585 | return 0; | ||
586 | } | ||
587 | |||
588 | asmlinkage long | ||
589 | sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval) | ||
590 | { | ||
591 | struct timespec t; | ||
592 | int ret; | ||
593 | mm_segment_t old_fs = get_fs (); | ||
594 | |||
595 | set_fs (KERNEL_DS); | ||
596 | ret = sys_sched_rr_get_interval(pid, &t); | ||
597 | set_fs (old_fs); | ||
598 | if (put_compat_timespec(&t, interval)) | ||
599 | return -EFAULT; | ||
600 | return ret; | ||
601 | } | ||
602 | |||
603 | asmlinkage long | ||
604 | sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize) | ||
605 | { | ||
606 | sigset_t s; | ||
607 | compat_sigset_t s32; | ||
608 | int ret; | ||
609 | mm_segment_t old_fs = get_fs(); | ||
610 | |||
611 | set_fs (KERNEL_DS); | ||
612 | ret = sys_rt_sigpending(&s, sigsetsize); | ||
613 | set_fs (old_fs); | ||
614 | if (!ret) { | ||
615 | switch (_NSIG_WORDS) { | ||
616 | case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; | ||
617 | case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2]; | ||
618 | case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; | ||
619 | case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; | ||
620 | } | ||
621 | if (copy_to_user (set, &s32, sizeof(compat_sigset_t))) | ||
622 | return -EFAULT; | ||
623 | } | ||
624 | return ret; | ||
625 | } | ||
626 | |||
627 | asmlinkage long | ||
628 | sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo) | ||
629 | { | ||
630 | siginfo_t info; | ||
631 | int ret; | ||
632 | mm_segment_t old_fs = get_fs(); | ||
633 | |||
634 | if (copy_siginfo_from_user32(&info, uinfo)) | ||
635 | return -EFAULT; | ||
636 | set_fs (KERNEL_DS); | ||
637 | ret = sys_rt_sigqueueinfo(pid, sig, &info); | ||
638 | set_fs (old_fs); | ||
639 | return ret; | ||
640 | } | ||
641 | |||
642 | /* These are here just in case some old ia32 binary calls it. */ | ||
/*
 * pause(2) for 32-bit tasks: mark the task interruptible, give up the CPU,
 * and report -ERESTARTNOHAND once schedule() returns.
 */
asmlinkage long
sys32_pause(void)
{
	current->state = TASK_INTERRUPTIBLE;
	schedule();
	return -ERESTARTNOHAND;
}
650 | |||
651 | |||
#ifdef CONFIG_SYSCTL
/*
 * 32-bit layout of the sysctl(2) argument block.  name, oldval, oldlenp and
 * newval hold 32-bit user addresses.
 */
struct sysctl_ia32 {
	unsigned int	name;
	int		nlen;
	unsigned int	oldval;
	unsigned int	oldlenp;
	unsigned int	newval;
	unsigned int	newlen;
	unsigned int	__unused[4];
};


/*
 * sysctl(2) for 32-bit tasks.
 *
 * Copies in the 32-bit argument block, widens its pointers and calls
 * do_sysctl() under the big kernel lock.  The old-value length lives in a
 * kernel size_t during the call: it is read from user space as a 32-bit
 * value beforehand and written back as one afterwards, both only when an
 * old-value buffer was supplied.
 *
 * Returns -EFAULT on any user-memory access failure, otherwise the result
 * of do_sysctl().
 */
asmlinkage long
sys32_sysctl(struct sysctl_ia32 __user *args32)
{
	struct sysctl_ia32 a32;
	mm_segment_t saved_fs = get_fs ();
	void __user *oldvalp, *newvalp;
	int __user *namep;
	int __user *oldlenp32;
	size_t oldlen;
	long ret;
	extern int do_sysctl(int *name, int nlen, void *oldval, size_t *oldlenp,
			     void *newval, size_t newlen);

	if (copy_from_user(&a32, args32, sizeof (a32)))
		return -EFAULT;

	namep = compat_ptr(a32.name);
	oldvalp = compat_ptr(a32.oldval);
	newvalp = compat_ptr(a32.newval);
	oldlenp32 = (int __user *)compat_ptr(a32.oldlenp);

	if (oldvalp && get_user(oldlen, oldlenp32))
		return -EFAULT;

	/*
	 * We need to pre-validate these because we have to disable address
	 * checking before calling do_sysctl() because of OLDLEN, but we can't
	 * run the risk of the user specifying bad addresses here.  Since we
	 * are dealing with 32 bit addresses, we KNOW that access_ok() will
	 * always succeed, so this is an expensive NOP, but so what...
	 */
	if (!access_ok(VERIFY_WRITE, namep, 0) ||
	    !access_ok(VERIFY_WRITE, oldvalp, 0) ||
	    !access_ok(VERIFY_WRITE, newvalp, 0))
		return -EFAULT;

	set_fs(KERNEL_DS);
	lock_kernel();
	ret = do_sysctl(namep, a32.nlen, oldvalp, &oldlen, newvalp,
			(size_t) a32.newlen);
	unlock_kernel();
	set_fs(saved_fs);

	if (oldvalp && put_user (oldlen, oldlenp32))
		return -EFAULT;

	return ret;
}
#endif
709 | |||
710 | /* warning: next two assume little endian */ | ||
711 | asmlinkage long | ||
712 | sys32_pread(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi) | ||
713 | { | ||
714 | return sys_pread64(fd, ubuf, count, | ||
715 | ((loff_t)AA(poshi) << 32) | AA(poslo)); | ||
716 | } | ||
717 | |||
718 | asmlinkage long | ||
719 | sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi) | ||
720 | { | ||
721 | return sys_pwrite64(fd, ubuf, count, | ||
722 | ((loff_t)AA(poshi) << 32) | AA(poslo)); | ||
723 | } | ||
724 | |||
725 | |||
726 | asmlinkage long | ||
727 | sys32_personality(unsigned long personality) | ||
728 | { | ||
729 | int ret; | ||
730 | if (personality(current->personality) == PER_LINUX32 && | ||
731 | personality == PER_LINUX) | ||
732 | personality = PER_LINUX32; | ||
733 | ret = sys_personality(personality); | ||
734 | if (ret == PER_LINUX32) | ||
735 | ret = PER_LINUX; | ||
736 | return ret; | ||
737 | } | ||
738 | |||
739 | asmlinkage long | ||
740 | sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count) | ||
741 | { | ||
742 | mm_segment_t old_fs = get_fs(); | ||
743 | int ret; | ||
744 | off_t of; | ||
745 | |||
746 | if (offset && get_user(of, offset)) | ||
747 | return -EFAULT; | ||
748 | |||
749 | set_fs(KERNEL_DS); | ||
750 | ret = sys_sendfile(out_fd, in_fd, offset ? &of : NULL, count); | ||
751 | set_fs(old_fs); | ||
752 | |||
753 | if (!ret && offset && put_user(of, offset)) | ||
754 | return -EFAULT; | ||
755 | |||
756 | return ret; | ||
757 | } | ||
758 | |||
759 | /* Handle adjtimex compatibility. */ | ||
760 | |||
761 | struct timex32 { | ||
762 | u32 modes; | ||
763 | s32 offset, freq, maxerror, esterror; | ||
764 | s32 status, constant, precision, tolerance; | ||
765 | struct compat_timeval time; | ||
766 | s32 tick; | ||
767 | s32 ppsfreq, jitter, shift, stabil; | ||
768 | s32 jitcnt, calcnt, errcnt, stbcnt; | ||
769 | s32 :32; s32 :32; s32 :32; s32 :32; | ||
770 | s32 :32; s32 :32; s32 :32; s32 :32; | ||
771 | s32 :32; s32 :32; s32 :32; s32 :32; | ||
772 | }; | ||
773 | |||
774 | extern int do_adjtimex(struct timex *); | ||
775 | |||
776 | asmlinkage long | ||
777 | sys32_adjtimex(struct timex32 __user *utp) | ||
778 | { | ||
779 | struct timex txc; | ||
780 | int ret; | ||
781 | |||
782 | memset(&txc, 0, sizeof(struct timex)); | ||
783 | |||
784 | if (!access_ok(VERIFY_READ, utp, sizeof(struct timex32)) || | ||
785 | __get_user(txc.modes, &utp->modes) || | ||
786 | __get_user(txc.offset, &utp->offset) || | ||
787 | __get_user(txc.freq, &utp->freq) || | ||
788 | __get_user(txc.maxerror, &utp->maxerror) || | ||
789 | __get_user(txc.esterror, &utp->esterror) || | ||
790 | __get_user(txc.status, &utp->status) || | ||
791 | __get_user(txc.constant, &utp->constant) || | ||
792 | __get_user(txc.precision, &utp->precision) || | ||
793 | __get_user(txc.tolerance, &utp->tolerance) || | ||
794 | __get_user(txc.time.tv_sec, &utp->time.tv_sec) || | ||
795 | __get_user(txc.time.tv_usec, &utp->time.tv_usec) || | ||
796 | __get_user(txc.tick, &utp->tick) || | ||
797 | __get_user(txc.ppsfreq, &utp->ppsfreq) || | ||
798 | __get_user(txc.jitter, &utp->jitter) || | ||
799 | __get_user(txc.shift, &utp->shift) || | ||
800 | __get_user(txc.stabil, &utp->stabil) || | ||
801 | __get_user(txc.jitcnt, &utp->jitcnt) || | ||
802 | __get_user(txc.calcnt, &utp->calcnt) || | ||
803 | __get_user(txc.errcnt, &utp->errcnt) || | ||
804 | __get_user(txc.stbcnt, &utp->stbcnt)) | ||
805 | return -EFAULT; | ||
806 | |||
807 | ret = do_adjtimex(&txc); | ||
808 | |||
809 | if (!access_ok(VERIFY_WRITE, utp, sizeof(struct timex32)) || | ||
810 | __put_user(txc.modes, &utp->modes) || | ||
811 | __put_user(txc.offset, &utp->offset) || | ||
812 | __put_user(txc.freq, &utp->freq) || | ||
813 | __put_user(txc.maxerror, &utp->maxerror) || | ||
814 | __put_user(txc.esterror, &utp->esterror) || | ||
815 | __put_user(txc.status, &utp->status) || | ||
816 | __put_user(txc.constant, &utp->constant) || | ||
817 | __put_user(txc.precision, &utp->precision) || | ||
818 | __put_user(txc.tolerance, &utp->tolerance) || | ||
819 | __put_user(txc.time.tv_sec, &utp->time.tv_sec) || | ||
820 | __put_user(txc.time.tv_usec, &utp->time.tv_usec) || | ||
821 | __put_user(txc.tick, &utp->tick) || | ||
822 | __put_user(txc.ppsfreq, &utp->ppsfreq) || | ||
823 | __put_user(txc.jitter, &utp->jitter) || | ||
824 | __put_user(txc.shift, &utp->shift) || | ||
825 | __put_user(txc.stabil, &utp->stabil) || | ||
826 | __put_user(txc.jitcnt, &utp->jitcnt) || | ||
827 | __put_user(txc.calcnt, &utp->calcnt) || | ||
828 | __put_user(txc.errcnt, &utp->errcnt) || | ||
829 | __put_user(txc.stbcnt, &utp->stbcnt)) | ||
830 | ret = -EFAULT; | ||
831 | |||
832 | return ret; | ||
833 | } | ||
834 | |||
835 | asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len, | ||
836 | unsigned long prot, unsigned long flags, | ||
837 | unsigned long fd, unsigned long pgoff) | ||
838 | { | ||
839 | struct mm_struct *mm = current->mm; | ||
840 | unsigned long error; | ||
841 | struct file * file = NULL; | ||
842 | |||
843 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
844 | if (!(flags & MAP_ANONYMOUS)) { | ||
845 | file = fget(fd); | ||
846 | if (!file) | ||
847 | return -EBADF; | ||
848 | } | ||
849 | |||
850 | down_write(&mm->mmap_sem); | ||
851 | error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
852 | up_write(&mm->mmap_sem); | ||
853 | |||
854 | if (file) | ||
855 | fput(file); | ||
856 | return error; | ||
857 | } | ||
858 | |||
859 | asmlinkage long sys32_olduname(struct oldold_utsname __user * name) | ||
860 | { | ||
861 | int error; | ||
862 | |||
863 | if (!name) | ||
864 | return -EFAULT; | ||
865 | if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname))) | ||
866 | return -EFAULT; | ||
867 | |||
868 | down_read(&uts_sem); | ||
869 | |||
870 | error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); | ||
871 | __put_user(0,name->sysname+__OLD_UTS_LEN); | ||
872 | __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); | ||
873 | __put_user(0,name->nodename+__OLD_UTS_LEN); | ||
874 | __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); | ||
875 | __put_user(0,name->release+__OLD_UTS_LEN); | ||
876 | __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); | ||
877 | __put_user(0,name->version+__OLD_UTS_LEN); | ||
878 | { | ||
879 | char *arch = "x86_64"; | ||
880 | if (personality(current->personality) == PER_LINUX32) | ||
881 | arch = "i686"; | ||
882 | |||
883 | __copy_to_user(&name->machine,arch,strlen(arch)+1); | ||
884 | } | ||
885 | |||
886 | up_read(&uts_sem); | ||
887 | |||
888 | error = error ? -EFAULT : 0; | ||
889 | |||
890 | return error; | ||
891 | } | ||
892 | |||
893 | long sys32_uname(struct old_utsname __user * name) | ||
894 | { | ||
895 | int err; | ||
896 | if (!name) | ||
897 | return -EFAULT; | ||
898 | down_read(&uts_sem); | ||
899 | err=copy_to_user(name, &system_utsname, sizeof (*name)); | ||
900 | up_read(&uts_sem); | ||
901 | if (personality(current->personality) == PER_LINUX32) | ||
902 | err |= copy_to_user(&name->machine, "i686", 5); | ||
903 | return err?-EFAULT:0; | ||
904 | } | ||
905 | |||
906 | long sys32_ustat(unsigned dev, struct ustat32 __user *u32p) | ||
907 | { | ||
908 | struct ustat u; | ||
909 | mm_segment_t seg; | ||
910 | int ret; | ||
911 | |||
912 | seg = get_fs(); | ||
913 | set_fs(KERNEL_DS); | ||
914 | ret = sys_ustat(dev,&u); | ||
915 | set_fs(seg); | ||
916 | if (ret >= 0) { | ||
917 | if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) || | ||
918 | __put_user((__u32) u.f_tfree, &u32p->f_tfree) || | ||
919 | __put_user((__u32) u.f_tinode, &u32p->f_tfree) || | ||
920 | __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) || | ||
921 | __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack))) | ||
922 | ret = -EFAULT; | ||
923 | } | ||
924 | return ret; | ||
925 | } | ||
926 | |||
927 | asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, | ||
928 | compat_uptr_t __user *envp, struct pt_regs *regs) | ||
929 | { | ||
930 | long error; | ||
931 | char * filename; | ||
932 | |||
933 | filename = getname(name); | ||
934 | error = PTR_ERR(filename); | ||
935 | if (IS_ERR(filename)) | ||
936 | return error; | ||
937 | error = compat_do_execve(filename, argv, envp, regs); | ||
938 | if (error == 0) { | ||
939 | task_lock(current); | ||
940 | current->ptrace &= ~PT_DTRACE; | ||
941 | task_unlock(current); | ||
942 | } | ||
943 | putname(filename); | ||
944 | return error; | ||
945 | } | ||
946 | |||
947 | asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp, | ||
948 | struct pt_regs *regs) | ||
949 | { | ||
950 | void __user *parent_tid = (void __user *)regs->rdx; | ||
951 | void __user *child_tid = (void __user *)regs->rdi; | ||
952 | if (!newsp) | ||
953 | newsp = regs->rsp; | ||
954 | return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); | ||
955 | } | ||
956 | |||
957 | /* | ||
958 | * Some system calls that need sign extended arguments. This could be done by a generic wrapper. | ||
959 | */ | ||
960 | |||
/*
 * lseek for 32-bit tasks.  @offset is declared as a signed int, so it is
 * sign-extended on its way to sys_lseek().
 */
long sys32_lseek (unsigned int fd, int offset, unsigned int whence)
{
	return sys_lseek(fd, offset, whence);
}
965 | |||
/*
 * kill for 32-bit tasks.  @pid and @sig are declared as signed ints, so
 * they are sign-extended on their way to sys_kill().
 */
long sys32_kill(int pid, int sig)
{
	return sys_kill(pid, sig);
}
970 | |||
971 | asmlinkage long sys32_open(const char __user * filename, int flags, int mode) | ||
972 | { | ||
973 | char * tmp; | ||
974 | int fd, error; | ||
975 | |||
976 | /* don't force O_LARGEFILE */ | ||
977 | tmp = getname(filename); | ||
978 | fd = PTR_ERR(tmp); | ||
979 | if (!IS_ERR(tmp)) { | ||
980 | fd = get_unused_fd(); | ||
981 | if (fd >= 0) { | ||
982 | struct file *f = filp_open(tmp, flags, mode); | ||
983 | error = PTR_ERR(f); | ||
984 | if (IS_ERR(f)) { | ||
985 | put_unused_fd(fd); | ||
986 | fd = error; | ||
987 | } else | ||
988 | fd_install(fd, f); | ||
989 | } | ||
990 | putname(tmp); | ||
991 | } | ||
992 | return fd; | ||
993 | } | ||
994 | |||
995 | extern asmlinkage long | ||
996 | sys_timer_create(clockid_t which_clock, | ||
997 | struct sigevent __user *timer_event_spec, | ||
998 | timer_t __user * created_timer_id); | ||
999 | |||
1000 | long | ||
1001 | sys32_timer_create(u32 clock, struct compat_sigevent __user *se32, timer_t __user *timer_id) | ||
1002 | { | ||
1003 | struct sigevent __user *p = NULL; | ||
1004 | if (se32) { | ||
1005 | struct sigevent se; | ||
1006 | p = compat_alloc_user_space(sizeof(struct sigevent)); | ||
1007 | if (get_compat_sigevent(&se, se32) || | ||
1008 | copy_to_user(p, &se, sizeof(se))) | ||
1009 | return -EFAULT; | ||
1010 | } | ||
1011 | return sys_timer_create(clock, p, timer_id); | ||
1012 | } | ||
1013 | |||
1014 | long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, | ||
1015 | __u32 len_low, __u32 len_high, int advice) | ||
1016 | { | ||
1017 | return sys_fadvise64_64(fd, | ||
1018 | (((u64)offset_high)<<32) | offset_low, | ||
1019 | (((u64)len_high)<<32) | len_low, | ||
1020 | advice); | ||
1021 | } | ||
1022 | |||
1023 | long sys32_vm86_warning(void) | ||
1024 | { | ||
1025 | struct task_struct *me = current; | ||
1026 | static char lastcomm[sizeof(me->comm)]; | ||
1027 | if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { | ||
1028 | printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", | ||
1029 | me->comm); | ||
1030 | strncpy(lastcomm, me->comm, sizeof(lastcomm)); | ||
1031 | } | ||
1032 | return -ENOSYS; | ||
1033 | } | ||
1034 | |||
1035 | long sys32_lookup_dcookie(u32 addr_low, u32 addr_high, | ||
1036 | char __user * buf, size_t len) | ||
1037 | { | ||
1038 | return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len); | ||
1039 | } | ||
1040 | |||
/* Boot-time initcall: prints the IA32 emulation version banner. */
static int __init ia32_init (void)
{
	printk("IA32 emulation $Id: sys_ia32.c,v 1.32 2002/03/24 13:02:28 ak Exp $\n");
	return 0;
}
1046 | |||
1047 | __initcall(ia32_init); | ||
1048 | |||
1049 | extern unsigned long ia32_sys_call_table[]; | ||
1050 | EXPORT_SYMBOL(ia32_sys_call_table); | ||
diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c new file mode 100644 index 000000000000..399ff4985099 --- /dev/null +++ b/arch/x86_64/ia32/syscall32.c | |||
@@ -0,0 +1,111 @@ | |||
1 | /* Copyright 2002,2003 Andi Kleen, SuSE Labs */ | ||
2 | |||
3 | /* vsyscall handling for 32bit processes. Map a stub page into it | ||
4 | on demand because 32bit cannot reach the kernel's fixmaps */ | ||
5 | |||
6 | #include <linux/mm.h> | ||
7 | #include <linux/string.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/gfp.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/stringify.h> | ||
12 | #include <asm/proto.h> | ||
13 | #include <asm/tlbflush.h> | ||
14 | #include <asm/ia32_unistd.h> | ||
15 | |||
16 | /* 32bit VDSOs mapped into user space. */ | ||
17 | asm(".section \".init.data\",\"aw\"\n" | ||
18 | "syscall32_syscall:\n" | ||
19 | ".incbin \"arch/x86_64/ia32/vsyscall-syscall.so\"\n" | ||
20 | "syscall32_syscall_end:\n" | ||
21 | "syscall32_sysenter:\n" | ||
22 | ".incbin \"arch/x86_64/ia32/vsyscall-sysenter.so\"\n" | ||
23 | "syscall32_sysenter_end:\n" | ||
24 | ".previous"); | ||
25 | |||
26 | extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; | ||
27 | extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; | ||
28 | extern int sysctl_vsyscall32; | ||
29 | |||
30 | char *syscall32_page; | ||
31 | static int use_sysenter = -1; | ||
32 | |||
33 | /* | ||
34 | * Map the 32bit vsyscall page on demand. | ||
35 | * | ||
36 | * RED-PEN: This knows too much about high level VM. | ||
37 | * | ||
38 | * Alternative would be to generate a vma with appropriate backing options | ||
39 | * and let it be handled by generic VM. | ||
40 | */ | ||
41 | int __map_syscall32(struct mm_struct *mm, unsigned long address) | ||
42 | { | ||
43 | pgd_t *pgd; | ||
44 | pud_t *pud; | ||
45 | pte_t *pte; | ||
46 | pmd_t *pmd; | ||
47 | int err = -ENOMEM; | ||
48 | |||
49 | spin_lock(&mm->page_table_lock); | ||
50 | pgd = pgd_offset(mm, address); | ||
51 | pud = pud_alloc(mm, pgd, address); | ||
52 | if (pud) { | ||
53 | pmd = pmd_alloc(mm, pud, address); | ||
54 | if (pmd && (pte = pte_alloc_map(mm, pmd, address)) != NULL) { | ||
55 | if (pte_none(*pte)) { | ||
56 | set_pte(pte, | ||
57 | mk_pte(virt_to_page(syscall32_page), | ||
58 | PAGE_KERNEL_VSYSCALL32)); | ||
59 | } | ||
60 | /* Flush only the local CPU. Other CPUs taking a fault | ||
61 | will just end up here again | ||
62 | This probably not needed and just paranoia. */ | ||
63 | __flush_tlb_one(address); | ||
64 | err = 0; | ||
65 | } | ||
66 | } | ||
67 | spin_unlock(&mm->page_table_lock); | ||
68 | return err; | ||
69 | } | ||
70 | |||
/*
 * Locked wrapper around __map_syscall32(): holds mm->mmap_sem for reading
 * while the vsyscall page is installed and returns the helper's result.
 */
int map_syscall32(struct mm_struct *mm, unsigned long address)
{
	int err;
	down_read(&mm->mmap_sem);
	err = __map_syscall32(mm, address);
	up_read(&mm->mmap_sem);
	return err;
}
79 | |||
80 | static int __init init_syscall32(void) | ||
81 | { | ||
82 | syscall32_page = (void *)get_zeroed_page(GFP_KERNEL); | ||
83 | if (!syscall32_page) | ||
84 | panic("Cannot allocate syscall32 page"); | ||
85 | SetPageReserved(virt_to_page(syscall32_page)); | ||
86 | if (use_sysenter > 0) { | ||
87 | memcpy(syscall32_page, syscall32_sysenter, | ||
88 | syscall32_sysenter_end - syscall32_sysenter); | ||
89 | } else { | ||
90 | memcpy(syscall32_page, syscall32_syscall, | ||
91 | syscall32_syscall_end - syscall32_syscall); | ||
92 | } | ||
93 | return 0; | ||
94 | } | ||
95 | |||
96 | __initcall(init_syscall32); | ||
97 | |||
98 | /* May not be __init: called during resume */ | ||
/*
 * Per-CPU setup of the 32-bit fast system call entry points.
 *
 * On the first call use_sysenter is still negative and is set to whether
 * the boot CPU's vendor is Intel.  The SYSENTER MSRs and the compat-mode
 * SYSCALL target (CSTAR) are then programmed.
 */
void syscall32_cpu_init(void)
{
	if (use_sysenter < 0)
		use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);

	/* Load these always in case some future AMD CPU supports
	   SYSENTER from compat mode too. */
	checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
	checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
	checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);

	wrmsrl(MSR_CSTAR, ia32_cstar_target);
}
diff --git a/arch/x86_64/ia32/tls32.c b/arch/x86_64/ia32/tls32.c new file mode 100644 index 000000000000..1cc4340de3ca --- /dev/null +++ b/arch/x86_64/ia32/tls32.c | |||
@@ -0,0 +1,163 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/sched.h> | ||
4 | #include <linux/user.h> | ||
5 | |||
6 | #include <asm/uaccess.h> | ||
7 | #include <asm/desc.h> | ||
8 | #include <asm/system.h> | ||
9 | #include <asm/ldt.h> | ||
10 | #include <asm/processor.h> | ||
11 | #include <asm/proto.h> | ||
12 | |||
13 | /* | ||
14 | * sys_alloc_thread_area: get a yet unused TLS descriptor index. | ||
15 | */ | ||
16 | static int get_free_idx(void) | ||
17 | { | ||
18 | struct thread_struct *t = ¤t->thread; | ||
19 | int idx; | ||
20 | |||
21 | for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) | ||
22 | if (desc_empty((struct n_desc_struct *)(t->tls_array) + idx)) | ||
23 | return idx + GDT_ENTRY_TLS_MIN; | ||
24 | return -ESRCH; | ||
25 | } | ||
26 | |||
27 | /* | ||
28 | * Set a given TLS descriptor: | ||
29 | * When you want addresses > 32bit use arch_prctl() | ||
30 | */ | ||
31 | int do_set_thread_area(struct thread_struct *t, struct user_desc __user *u_info) | ||
32 | { | ||
33 | struct user_desc info; | ||
34 | struct n_desc_struct *desc; | ||
35 | int cpu, idx; | ||
36 | |||
37 | if (copy_from_user(&info, u_info, sizeof(info))) | ||
38 | return -EFAULT; | ||
39 | |||
40 | idx = info.entry_number; | ||
41 | |||
42 | /* | ||
43 | * index -1 means the kernel should try to find and | ||
44 | * allocate an empty descriptor: | ||
45 | */ | ||
46 | if (idx == -1) { | ||
47 | idx = get_free_idx(); | ||
48 | if (idx < 0) | ||
49 | return idx; | ||
50 | if (put_user(idx, &u_info->entry_number)) | ||
51 | return -EFAULT; | ||
52 | } | ||
53 | |||
54 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | ||
55 | return -EINVAL; | ||
56 | |||
57 | desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN; | ||
58 | |||
59 | /* | ||
60 | * We must not get preempted while modifying the TLS. | ||
61 | */ | ||
62 | cpu = get_cpu(); | ||
63 | |||
64 | if (LDT_empty(&info)) { | ||
65 | desc->a = 0; | ||
66 | desc->b = 0; | ||
67 | } else { | ||
68 | desc->a = LDT_entry_a(&info); | ||
69 | desc->b = LDT_entry_b(&info); | ||
70 | } | ||
71 | if (t == ¤t->thread) | ||
72 | load_TLS(t, cpu); | ||
73 | |||
74 | put_cpu(); | ||
75 | return 0; | ||
76 | } | ||
77 | |||
78 | asmlinkage long sys32_set_thread_area(struct user_desc __user *u_info) | ||
79 | { | ||
80 | return do_set_thread_area(¤t->thread, u_info); | ||
81 | } | ||
82 | |||
83 | |||
84 | /* | ||
85 | * Get the current Thread-Local Storage area: | ||
86 | */ | ||
87 | |||
/*
 * Field extractors for a two-word segment descriptor, where (desc)->a is
 * the low word and (desc)->b the high word.
 */

/* Single flag bit @n of the high descriptor word. */
#define DESC_B_BIT(desc, n)	(((desc)->b >> (n)) & 1)

/* Base address: a[31:16] -> base[15:0], b[7:0] -> base[23:16], b[31:24] kept. */
#define GET_BASE(desc) ( \
	( (desc)->b & 0xff000000) | \
	(((desc)->b << 16) & 0x00ff0000) | \
	(((desc)->a >> 16) & 0x0000ffff) )

/* Limit: a[15:0] -> limit[15:0], b[19:16] kept in place. */
#define GET_LIMIT(desc) ( \
	((desc)->b & 0xf0000) | \
	((desc)->a & 0x0ffff) )

#define GET_32BIT(desc)		DESC_B_BIT(desc, 22)
#define GET_CONTENTS(desc)	(((desc)->b >> 10) & 3)
#define GET_WRITABLE(desc)	DESC_B_BIT(desc, 9)
#define GET_LIMIT_PAGES(desc)	DESC_B_BIT(desc, 23)
#define GET_PRESENT(desc)	DESC_B_BIT(desc, 15)
#define GET_USEABLE(desc)	DESC_B_BIT(desc, 20)
#define GET_LONGMODE(desc)	DESC_B_BIT(desc, 21)
104 | |||
105 | int do_get_thread_area(struct thread_struct *t, struct user_desc __user *u_info) | ||
106 | { | ||
107 | struct user_desc info; | ||
108 | struct n_desc_struct *desc; | ||
109 | int idx; | ||
110 | |||
111 | if (get_user(idx, &u_info->entry_number)) | ||
112 | return -EFAULT; | ||
113 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | ||
114 | return -EINVAL; | ||
115 | |||
116 | desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN; | ||
117 | |||
118 | memset(&info, 0, sizeof(struct user_desc)); | ||
119 | info.entry_number = idx; | ||
120 | info.base_addr = GET_BASE(desc); | ||
121 | info.limit = GET_LIMIT(desc); | ||
122 | info.seg_32bit = GET_32BIT(desc); | ||
123 | info.contents = GET_CONTENTS(desc); | ||
124 | info.read_exec_only = !GET_WRITABLE(desc); | ||
125 | info.limit_in_pages = GET_LIMIT_PAGES(desc); | ||
126 | info.seg_not_present = !GET_PRESENT(desc); | ||
127 | info.useable = GET_USEABLE(desc); | ||
128 | info.lm = GET_LONGMODE(desc); | ||
129 | |||
130 | if (copy_to_user(u_info, &info, sizeof(info))) | ||
131 | return -EFAULT; | ||
132 | return 0; | ||
133 | } | ||
134 | |||
135 | asmlinkage long sys32_get_thread_area(struct user_desc __user *u_info) | ||
136 | { | ||
137 | return do_get_thread_area(¤t->thread, u_info); | ||
138 | } | ||
139 | |||
140 | |||
141 | int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs) | ||
142 | { | ||
143 | struct n_desc_struct *desc; | ||
144 | struct user_desc info; | ||
145 | struct user_desc __user *cp; | ||
146 | int idx; | ||
147 | |||
148 | cp = (void __user *)childregs->rsi; | ||
149 | if (copy_from_user(&info, cp, sizeof(info))) | ||
150 | return -EFAULT; | ||
151 | if (LDT_empty(&info)) | ||
152 | return -EINVAL; | ||
153 | |||
154 | idx = info.entry_number; | ||
155 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | ||
156 | return -EINVAL; | ||
157 | |||
158 | desc = (struct n_desc_struct *)(p->thread.tls_array) + idx - GDT_ENTRY_TLS_MIN; | ||
159 | desc->a = LDT_entry_a(&info); | ||
160 | desc->b = LDT_entry_b(&info); | ||
161 | |||
162 | return 0; | ||
163 | } | ||
diff --git a/arch/x86_64/ia32/vsyscall-sigreturn.S b/arch/x86_64/ia32/vsyscall-sigreturn.S new file mode 100644 index 000000000000..ba4067d350e4 --- /dev/null +++ b/arch/x86_64/ia32/vsyscall-sigreturn.S | |||
@@ -0,0 +1,120 @@ | |||
1 | /* | ||
2 | * Common code for the sigreturn entry points on the vsyscall page. | ||
3 | * This code uses SYSCALL_ENTER_KERNEL (either syscall or int $0x80) | ||
4 | * to enter the kernel. | ||
5 | * This file is #include'd by vsyscall-*.S to define them after the | ||
6 | * vsyscall entry point. The addresses we get for these entry points | ||
7 | * by doing ".balign 32" must match in both versions of the page. | ||
8 | */ | ||
9 | |||
/* __kernel_sigreturn: signal-return stub, placed in its own section and
   aligned to 32 bytes.  It pops one word off the stack into %eax (the value
   is discarded, since %eax is overwritten by the next instruction), loads
   __NR_ia32_sigreturn and enters the kernel through SYSCALL_ENTER_KERNEL,
   which the file that #includes this one has to define. */
10 | .section .text.sigreturn,"ax" | ||
11 | .balign 32 | ||
12 | .globl __kernel_sigreturn | ||
13 | .type __kernel_sigreturn,@function | ||
14 | __kernel_sigreturn: | ||
15 | .LSTART_sigreturn: | ||
16 | popl %eax | ||
17 | movl $__NR_ia32_sigreturn, %eax | ||
18 | SYSCALL_ENTER_KERNEL | ||
19 | .LEND_sigreturn: | ||
20 | .size __kernel_sigreturn,.-.LSTART_sigreturn | ||
21 | |||
/* __kernel_rt_sigreturn: rt signal-return stub, placed in its own section
   and aligned to 32 bytes.  Unlike __kernel_sigreturn it does not touch the
   stack: it only loads __NR_ia32_rt_sigreturn into %eax and enters the
   kernel through SYSCALL_ENTER_KERNEL. */
22 | .section .text.rtsigreturn,"ax" | ||
23 | .balign 32 | ||
24 | .globl __kernel_rt_sigreturn | ||
25 | .type __kernel_rt_sigreturn,@function | ||
26 | __kernel_rt_sigreturn: | ||
27 | .LSTART_rt_sigreturn: | ||
28 | movl $__NR_ia32_rt_sigreturn, %eax | ||
29 | SYSCALL_ENTER_KERNEL | ||
30 | .LEND_rt_sigreturn: | ||
31 | .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn | ||
32 | |||
/* Unwind description (FDE) for __kernel_sigreturn, plus the two helper
   macros used to emit it.  The CIE it points at (.LSTARTFRAME) is not
   defined in this file; it has to come from the file that #includes this
   one.  The register rules are emitted twice: once with every offset biased
   by +4 for the code before the popl, and once unbiased for the code after
   it.  do_cfa_expr(off) defines the CFA as the word loaded from
   (register 4 + off); do_expr(reg, off) says register `reg' is saved at
   (register 4 + off). */
33 | .section .eh_frame,"a",@progbits | ||
34 | .long .LENDFDE2-.LSTARTFDE2 /* Length FDE */ | ||
35 | .LSTARTFDE2: | ||
36 | .long .LSTARTFDE2-.LSTARTFRAME /* CIE pointer */ | ||
37 | /* HACK: The dwarf2 unwind routines will subtract 1 from the | ||
38 | return address to get an address in the middle of the | ||
39 | presumed call instruction. Since we didn't get here via | ||
40 | a call, we need to include the nop before the real start | ||
41 | to make up for it. */ | ||
42 | .long .LSTART_sigreturn-1-. /* PC-relative start address */ | ||
43 | .long .LEND_sigreturn-.LSTART_sigreturn+1 | ||
44 | .uleb128 0 /* Augmentation length */ | ||
45 | /* What follows are the instructions for the table generation. | ||
46 | We record the locations of each register saved. This is | ||
47 | complicated by the fact that the "CFA" is always assumed to | ||
48 | be the value of the stack pointer in the caller. This means | ||
49 | that we must define the CFA of this body of code to be the | ||
50 | saved value of the stack pointer in the sigcontext. Which | ||
51 | also means that there is no fixed relation to the other | ||
52 | saved registers, which means that we must use DW_CFA_expression | ||
53 | to compute their addresses. It also means that when we | ||
54 | adjust the stack with the popl, we have to do it all over again. */ | ||
55 | |||
56 | #define do_cfa_expr(offset) \ | ||
57 | .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ | ||
58 | .uleb128 1f-0f; /* length */ \ | ||
59 | 0: .byte 0x74; /* DW_OP_breg4 */ \ | ||
60 | .sleb128 offset; /* offset */ \ | ||
61 | .byte 0x06; /* DW_OP_deref */ \ | ||
62 | 1: | ||
63 | |||
64 | #define do_expr(regno, offset) \ | ||
65 | .byte 0x10; /* DW_CFA_expression */ \ | ||
66 | .uleb128 regno; /* regno */ \ | ||
67 | .uleb128 1f-0f; /* length */ \ | ||
68 | 0: .byte 0x74; /* DW_OP_breg4 */ \ | ||
69 | .sleb128 offset; /* offset */ \ | ||
70 | 1: | ||
71 | |||
72 | do_cfa_expr(IA32_SIGCONTEXT_esp+4) | ||
73 | do_expr(0, IA32_SIGCONTEXT_eax+4) | ||
74 | do_expr(1, IA32_SIGCONTEXT_ecx+4) | ||
75 | do_expr(2, IA32_SIGCONTEXT_edx+4) | ||
76 | do_expr(3, IA32_SIGCONTEXT_ebx+4) | ||
77 | do_expr(5, IA32_SIGCONTEXT_ebp+4) | ||
78 | do_expr(6, IA32_SIGCONTEXT_esi+4) | ||
79 | do_expr(7, IA32_SIGCONTEXT_edi+4) | ||
80 | do_expr(8, IA32_SIGCONTEXT_eip+4) | ||
81 | |||
/* The FDE starts one byte before .LSTART_sigreturn (see the HACK note), so
   advancing by 2 steps over that extra byte and the popl. */
82 | .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ | ||
83 | |||
84 | do_cfa_expr(IA32_SIGCONTEXT_esp) | ||
85 | do_expr(0, IA32_SIGCONTEXT_eax) | ||
86 | do_expr(1, IA32_SIGCONTEXT_ecx) | ||
87 | do_expr(2, IA32_SIGCONTEXT_edx) | ||
88 | do_expr(3, IA32_SIGCONTEXT_ebx) | ||
89 | do_expr(5, IA32_SIGCONTEXT_ebp) | ||
90 | do_expr(6, IA32_SIGCONTEXT_esi) | ||
91 | do_expr(7, IA32_SIGCONTEXT_edi) | ||
92 | do_expr(8, IA32_SIGCONTEXT_eip) | ||
93 | |||
94 | .align 4 | ||
95 | .LENDFDE2: | ||
96 | |||
/* Unwind description (FDE) for __kernel_rt_sigreturn.  It uses the same
   CIE (.LSTARTFRAME) and the same one-byte-early start address as the FDE
   for __kernel_sigreturn.  Every register rule is an offset from register 4
   built as (IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_<reg>); a single
   set of rules covers the whole stub. */
97 | .long .LENDFDE3-.LSTARTFDE3 /* Length FDE */ | ||
98 | .LSTARTFDE3: | ||
99 | .long .LSTARTFDE3-.LSTARTFRAME /* CIE pointer */ | ||
100 | /* HACK: See above wrt unwind library assumptions. */ | ||
101 | .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */ | ||
102 | .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 | ||
103 | .uleb128 0 /* Augmentation */ | ||
104 | /* What follows are the instructions for the table generation. | ||
105 | We record the locations of each register saved. This is | ||
106 | slightly less complicated than the above, since we don't | ||
107 | modify the stack pointer in the process. */ | ||
108 | |||
109 | do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esp) | ||
110 | do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eax) | ||
111 | do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ecx) | ||
112 | do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edx) | ||
113 | do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebx) | ||
114 | do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebp) | ||
115 | do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esi) | ||
116 | do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edi) | ||
117 | do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eip) | ||
118 | |||
119 | .align 4 | ||
120 | .LENDFDE3: | ||
diff --git a/arch/x86_64/ia32/vsyscall-syscall.S b/arch/x86_64/ia32/vsyscall-syscall.S new file mode 100644 index 000000000000..e2aaf3de8a42 --- /dev/null +++ b/arch/x86_64/ia32/vsyscall-syscall.S | |||
@@ -0,0 +1,68 @@ | |||
1 | /* | ||
2 | * Code for the vsyscall page. This version uses the syscall instruction. | ||
3 | */ | ||
4 | |||
5 | #include <asm/ia32_unistd.h> | ||
6 | #include <asm/offset.h> | ||
7 | #include <asm/segment.h> | ||
8 | |||
/* __kernel_vsyscall, syscall-instruction flavour.  %ebp is pushed and then
   used to carry the caller's %ecx across the syscall instruction.  On
   return %ss is reloaded with __USER32_DS, %ecx is copied back from %ebp
   and %ebp is popped before returning to the caller.
   NOTE(review): presumably this dance exists because syscall itself
   overwrites %ecx and %ss needs refreshing afterwards -- confirm against
   the CPU manual.
   The .Lpush_ebp/.Lpop_ebp labels mark the points where the stack pointer
   has changed; the unwind data below refers to them. */
9 | .text | ||
10 | .section .text.vsyscall,"ax" | ||
11 | .globl __kernel_vsyscall | ||
12 | .type __kernel_vsyscall,@function | ||
13 | __kernel_vsyscall: | ||
14 | .LSTART_vsyscall: | ||
15 | push %ebp | ||
16 | .Lpush_ebp: | ||
17 | movl %ecx, %ebp | ||
18 | syscall | ||
19 | movl $__USER32_DS, %ecx | ||
20 | movl %ecx, %ss | ||
21 | movl %ebp, %ecx | ||
22 | popl %ebp | ||
23 | .Lpop_ebp: | ||
24 | ret | ||
25 | .LEND_vsyscall: | ||
26 | .size __kernel_vsyscall,.-.LSTART_vsyscall | ||
27 | |||
/* Common Information Entry shared by every FDE on this page (the FDEs
   locate it through .LSTARTFRAME).  It declares: code alignment 1, data
   alignment -4, return-address column 8, pc-relative 4-byte FDE pointers,
   an initial CFA of register 4 + 4, and the return address saved at
   CFA + 1 * (data alignment), i.e. CFA-4. */
28 | .section .eh_frame,"a",@progbits | ||
29 | .LSTARTFRAME: | ||
30 | .long .LENDCIE-.LSTARTCIE | ||
31 | .LSTARTCIE: | ||
32 | .long 0 /* CIE ID */ | ||
33 | .byte 1 /* Version number */ | ||
34 | .string "zR" /* NUL-terminated augmentation string */ | ||
35 | .uleb128 1 /* Code alignment factor */ | ||
36 | .sleb128 -4 /* Data alignment factor */ | ||
37 | .byte 8 /* Return address register column */ | ||
38 | .uleb128 1 /* Augmentation value length */ | ||
39 | .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ | ||
40 | .byte 0x0c /* DW_CFA_def_cfa */ | ||
41 | .uleb128 4 | ||
42 | .uleb128 4 | ||
43 | .byte 0x88 /* DW_CFA_offset, column 0x8 */ | ||
44 | .uleb128 1 | ||
45 | .align 4 | ||
46 | .LENDCIE: | ||
47 | |||
/* Unwind description (FDE) for the syscall flavour of __kernel_vsyscall.
   It tracks the single push/pop pair: after `push %ebp' (.Lpush_ebp) the
   CFA offset becomes 8 and %ebp is recorded as saved at CFA-8
   (factored offset 2 * data alignment -4); after `popl %ebp' (.Lpop_ebp)
   %ebp is marked restored and the CFA offset returns to 4.
   The advance_loc deltas are folded into the low bits of the 0x40 opcode,
   so each label distance must stay small enough to fit there. */
48 | .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ | ||
49 | .LSTARTFDE1: | ||
50 | .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ | ||
51 | .long .LSTART_vsyscall-. /* PC-relative start address */ | ||
52 | .long .LEND_vsyscall-.LSTART_vsyscall | ||
53 | .uleb128 0 /* Augmentation length */ | ||
54 | /* What follows are the instructions for the table generation. | ||
55 | We have to record all changes of the stack pointer. */ | ||
56 | .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */ | ||
57 | .byte 0x0e /* DW_CFA_def_cfa_offset */ | ||
58 | .uleb128 8 | ||
59 | .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */ | ||
60 | .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */ | ||
61 | .byte 0xc5 /* DW_CFA_restore %ebp */ | ||
62 | .byte 0x0e /* DW_CFA_def_cfa_offset */ | ||
63 | .uleb128 4 | ||
64 | .align 4 | ||
65 | .LENDFDE1: | ||
66 | |||
67 | #define SYSCALL_ENTER_KERNEL syscall | ||
68 | #include "vsyscall-sigreturn.S" | ||
diff --git a/arch/x86_64/ia32/vsyscall-sysenter.S b/arch/x86_64/ia32/vsyscall-sysenter.S new file mode 100644 index 000000000000..8fb8e0ff3afa --- /dev/null +++ b/arch/x86_64/ia32/vsyscall-sysenter.S | |||
@@ -0,0 +1,94 @@ | |||
1 | /* | ||
2 | * Code for the vsyscall page. This version uses the sysenter instruction. | ||
3 | */ | ||
4 | |||
5 | #include <asm/ia32_unistd.h> | ||
6 | #include <asm/offset.h> | ||
7 | |||
/* __kernel_vsyscall, sysenter flavour.  %ecx, %edx and %ebp are pushed,
   %esp is copied into %ebp and sysenter is executed.  sysenter is followed
   by seven 0x90 filler bytes and a jump back to .Lenter_kernel; the normal
   return point is the `pop %ebp' after that jump, from where the three
   registers are popped in reverse order before returning.
   NOTE(review): the filler and jump are presumably there so the kernel can
   resume at a fixed offset or restart the call -- confirm against the
   kernel's sysenter return path.
   The .Lpush_* / .Lpop_* labels mark every stack-pointer change; the unwind
   data below refers to them. */
8 | .text | ||
9 | .section .text.vsyscall,"ax" | ||
10 | .globl __kernel_vsyscall | ||
11 | .type __kernel_vsyscall,@function | ||
12 | __kernel_vsyscall: | ||
13 | .LSTART_vsyscall: | ||
14 | push %ecx | ||
15 | .Lpush_ecx: | ||
16 | push %edx | ||
17 | .Lpush_edx: | ||
18 | push %ebp | ||
19 | .Lenter_kernel: | ||
20 | movl %esp,%ebp | ||
21 | sysenter | ||
22 | .space 7,0x90 | ||
23 | jmp .Lenter_kernel | ||
24 | /* 16: System call normal return point is here! */ | ||
25 | pop %ebp | ||
26 | .Lpop_ebp: | ||
27 | pop %edx | ||
28 | .Lpop_edx: | ||
29 | pop %ecx | ||
30 | .Lpop_ecx: | ||
31 | ret | ||
32 | .LEND_vsyscall: | ||
33 | .size __kernel_vsyscall,.-.LSTART_vsyscall | ||
34 | |||
/* Common Information Entry shared by every FDE on this page (the FDEs
   locate it through .LSTARTFRAME).  It declares: code alignment 1, data
   alignment -4, return-address column 8, pc-relative 4-byte FDE pointers,
   an initial CFA of register 4 + 4, and the return address saved at
   CFA + 1 * (data alignment), i.e. CFA-4. */
35 | .section .eh_frame,"a",@progbits | ||
36 | .LSTARTFRAME: | ||
37 | .long .LENDCIE-.LSTARTCIE | ||
38 | .LSTARTCIE: | ||
39 | .long 0 /* CIE ID */ | ||
40 | .byte 1 /* Version number */ | ||
41 | .string "zR" /* NUL-terminated augmentation string */ | ||
42 | .uleb128 1 /* Code alignment factor */ | ||
43 | .sleb128 -4 /* Data alignment factor */ | ||
44 | .byte 8 /* Return address register column */ | ||
45 | .uleb128 1 /* Augmentation value length */ | ||
46 | .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ | ||
47 | .byte 0x0c /* DW_CFA_def_cfa */ | ||
48 | .uleb128 4 | ||
49 | .uleb128 4 | ||
50 | .byte 0x88 /* DW_CFA_offset, column 0x8 */ | ||
51 | .uleb128 1 | ||
52 | .align 4 | ||
53 | .LENDCIE: | ||
54 | |||
/*
 * Unwind description (FDE) for the sysenter flavour of __kernel_vsyscall.
 *
 * The CIE starts with CFA = %esp + 4.  Each of the three pushes grows the
 * CFA offset by 4 (8, 12, 16) and each of the three pops shrinks it again
 * (12, 8, 4).  %ebp is recorded as saved at CFA-16 while it is on the
 * stack and marked restored once it has been popped.
 */
	.long .LENDFDE1-.LSTARTFDE1	/* Length FDE */
.LSTARTFDE1:
	.long .LSTARTFDE1-.LSTARTFRAME	/* CIE pointer */
	.long .LSTART_vsyscall-.	/* PC-relative start address */
	.long .LEND_vsyscall-.LSTART_vsyscall
	.uleb128 0			/* Augmentation length */
	/* What follows are the instructions for the table generation.
	   We have to record all changes of the stack pointer.  */
	.byte 0x04		/* DW_CFA_advance_loc4 */
	.long .Lpush_ecx-.LSTART_vsyscall
	.byte 0x0e		/* DW_CFA_def_cfa_offset */
	.byte 0x08		/* RA at offset 8 now */
	.byte 0x04		/* DW_CFA_advance_loc4 */
	.long .Lpush_edx-.Lpush_ecx
	.byte 0x0e		/* DW_CFA_def_cfa_offset */
	.byte 0x0c		/* RA at offset 12 now */
	.byte 0x04		/* DW_CFA_advance_loc4 */
	.long .Lenter_kernel-.Lpush_edx
	.byte 0x0e		/* DW_CFA_def_cfa_offset */
	.byte 0x10		/* RA at offset 16 now */
	.byte 0x85, 0x04	/* DW_CFA_offset %ebp -16 */
	/* Finally the epilogue.  */
	.byte 0x04		/* DW_CFA_advance_loc4 */
	.long .Lpop_ebp-.Lenter_kernel
	.byte 0x0e		/* DW_CFA_def_cfa_offset */
	/* One pop after offset 16 gives 12 == 0x0c; 0x12 would be 18. */
	.byte 0x0c		/* RA at offset 12 now */
	.byte 0xc5		/* DW_CFA_restore %ebp */
	.byte 0x04		/* DW_CFA_advance_loc4 */
	.long .Lpop_edx-.Lpop_ebp
	.byte 0x0e		/* DW_CFA_def_cfa_offset */
	.byte 0x08		/* RA at offset 8 now */
	.byte 0x04		/* DW_CFA_advance_loc4 */
	.long .Lpop_ecx-.Lpop_edx
	.byte 0x0e		/* DW_CFA_def_cfa_offset */
	.byte 0x04		/* RA at offset 4 now */
	.align 4
.LENDFDE1:
92 | |||
93 | #define SYSCALL_ENTER_KERNEL int $0x80 | ||
94 | #include "vsyscall-sigreturn.S" | ||
diff --git a/arch/x86_64/ia32/vsyscall.lds b/arch/x86_64/ia32/vsyscall.lds new file mode 100644 index 000000000000..fa4b4dd4a9ff --- /dev/null +++ b/arch/x86_64/ia32/vsyscall.lds | |||
@@ -0,0 +1,77 @@ | |||
1 | /* | ||
2 | * Linker script for vsyscall DSO. The vsyscall page is an ELF shared | ||
3 | * object prelinked to its virtual address. This script controls its layout. | ||
4 | */ | ||
5 | |||
6 | /* This must match <asm/fixmap.h>. */ | ||
7 | VSYSCALL_BASE = 0xffffe000; | ||
8 | |||
9 | SECTIONS | ||
10 | { | ||
11 | . = VSYSCALL_BASE + SIZEOF_HEADERS; | ||
12 | |||
13 | .hash : { *(.hash) } :text | ||
14 | .dynsym : { *(.dynsym) } | ||
15 | .dynstr : { *(.dynstr) } | ||
16 | .gnu.version : { *(.gnu.version) } | ||
17 | .gnu.version_d : { *(.gnu.version_d) } | ||
18 | .gnu.version_r : { *(.gnu.version_r) } | ||
19 | |||
20 | /* This linker script is used both with -r and with -shared. | ||
21 | For the layouts to match, we need to skip more than enough | ||
22 | space for the dynamic symbol table et al. If this amount | ||
23 | is insufficient, ld -shared will barf. Just increase it here. */ | ||
24 | . = VSYSCALL_BASE + 0x400; | ||
25 | |||
26 | .text.vsyscall : { *(.text.vsyscall) } :text =0x90909090 | ||
27 | |||
28 | /* This is an 32bit object and we cannot easily get the offsets | ||
29 | into the 64bit kernel. Just hardcode them here. This assumes | ||
30 | that all the stubs don't need more than 0x100 bytes. */ | ||
31 | . = VSYSCALL_BASE + 0x500; | ||
32 | |||
33 | .text.sigreturn : { *(.text.sigreturn) } :text =0x90909090 | ||
34 | |||
35 | . = VSYSCALL_BASE + 0x600; | ||
36 | |||
37 | .text.rtsigreturn : { *(.text.rtsigreturn) } :text =0x90909090 | ||
38 | |||
39 | .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr | ||
40 | .eh_frame : { KEEP (*(.eh_frame)) } :text | ||
41 | .dynamic : { *(.dynamic) } :text :dynamic | ||
42 | .useless : { | ||
43 | *(.got.plt) *(.got) | ||
44 | *(.data .data.* .gnu.linkonce.d.*) | ||
45 | *(.dynbss) | ||
46 | *(.bss .bss.* .gnu.linkonce.b.*) | ||
47 | } :text | ||
48 | } | ||
49 | |||
50 | /* | ||
51 | * We must supply the ELF program headers explicitly to get just one | ||
52 | * PT_LOAD segment, and set the flags explicitly to make segments read-only. | ||
53 | */ | ||
54 | PHDRS | ||
55 | { | ||
56 | text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ | ||
57 | dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ | ||
58 | eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ | ||
59 | } | ||
60 | |||
61 | /* | ||
62 | * This controls what symbols we export from the DSO. | ||
63 | */ | ||
64 | VERSION | ||
65 | { | ||
66 | LINUX_2.5 { | ||
67 | global: | ||
68 | __kernel_vsyscall; | ||
69 | __kernel_sigreturn; | ||
70 | __kernel_rt_sigreturn; | ||
71 | |||
72 | local: *; | ||
73 | }; | ||
74 | } | ||
75 | |||
76 | /* The ELF entry point can be used to set the AT_SYSINFO value. */ | ||
77 | ENTRY(__kernel_vsyscall); | ||
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile new file mode 100644 index 000000000000..0a3318e08ab6 --- /dev/null +++ b/arch/x86_64/kernel/Makefile | |||
@@ -0,0 +1,45 @@ | |||
1 | # | ||
2 | # Makefile for the linux kernel. | ||
3 | # | ||
4 | |||
5 | extra-y := head.o head64.o init_task.o vmlinux.lds | ||
6 | EXTRA_AFLAGS := -traditional | ||
7 | obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ | ||
8 | ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ | ||
9 | x8664_ksyms.o i387.o syscall.o vsyscall.o \ | ||
10 | setup64.o bootflag.o e820.o reboot.o quirks.o | ||
11 | |||
12 | obj-$(CONFIG_X86_MCE) += mce.o | ||
13 | obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o | ||
14 | obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ | ||
15 | obj-$(CONFIG_ACPI_BOOT) += acpi/ | ||
16 | obj-$(CONFIG_X86_MSR) += msr.o | ||
17 | obj-$(CONFIG_MICROCODE) += microcode.o | ||
18 | obj-$(CONFIG_X86_CPUID) += cpuid.o | ||
19 | obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o | ||
20 | obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o | ||
21 | obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ | ||
22 | genapic.o genapic_cluster.o genapic_flat.o | ||
23 | obj-$(CONFIG_PM) += suspend.o | ||
24 | obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o | ||
25 | obj-$(CONFIG_CPU_FREQ) += cpufreq/ | ||
26 | obj-$(CONFIG_EARLY_PRINTK) += early_printk.o | ||
27 | obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o | ||
28 | obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o | ||
29 | obj-$(CONFIG_SWIOTLB) += swiotlb.o | ||
30 | obj-$(CONFIG_KPROBES) += kprobes.o | ||
31 | |||
32 | obj-$(CONFIG_MODULES) += module.o | ||
33 | |||
34 | obj-y += topology.o | ||
35 | obj-y += intel_cacheinfo.o | ||
36 | |||
37 | CFLAGS_vsyscall.o := $(PROFILING) -g0 | ||
38 | |||
39 | bootflag-y += ../../i386/kernel/bootflag.o | ||
40 | cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o | ||
41 | topology-y += ../../i386/mach-default/topology.o | ||
42 | swiotlb-$(CONFIG_SWIOTLB) += ../../ia64/lib/swiotlb.o | ||
43 | microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o | ||
44 | intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o | ||
45 | quirks-y += ../../i386/kernel/quirks.o | ||
diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile new file mode 100644 index 000000000000..d2c2ee5f9a88 --- /dev/null +++ b/arch/x86_64/kernel/acpi/Makefile | |||
@@ -0,0 +1,3 @@ | |||
1 | obj-$(CONFIG_ACPI_BOOT) := boot.o | ||
2 | boot-$(CONFIG_ACPI_BOOT) := ../../../i386/kernel/acpi/boot.o | ||
3 | obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o | ||
diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c new file mode 100644 index 000000000000..7a275de6df22 --- /dev/null +++ b/arch/x86_64/kernel/acpi/sleep.c | |||
@@ -0,0 +1,132 @@ | |||
1 | /* | ||
2 | * sleep.c - Architecture-Specific Low-Level ACPI Support | ||
3 | * | ||
4 | * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> | ||
5 | * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com> | ||
6 | * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org> | ||
7 | * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port) | ||
8 | * Copyright (C) 2003 Pavel Machek, SuSE Labs | ||
9 | * | ||
10 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License as published by | ||
14 | * the Free Software Foundation; either version 2 of the License, or | ||
15 | * (at your option) any later version. | ||
16 | * | ||
17 | * This program is distributed in the hope that it will be useful, | ||
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
20 | * GNU General Public License for more details. | ||
21 | * | ||
22 | * You should have received a copy of the GNU General Public License | ||
23 | * along with this program; if not, write to the Free Software | ||
24 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
25 | * | ||
26 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
27 | */ | ||
28 | |||
29 | #include <linux/config.h> | ||
30 | #include <linux/kernel.h> | ||
31 | #include <linux/init.h> | ||
32 | #include <linux/types.h> | ||
33 | #include <linux/stddef.h> | ||
34 | #include <linux/slab.h> | ||
35 | #include <linux/pci.h> | ||
36 | #include <linux/bootmem.h> | ||
37 | #include <linux/irq.h> | ||
38 | #include <linux/acpi.h> | ||
39 | #include <asm/mpspec.h> | ||
40 | #include <asm/io.h> | ||
41 | #include <asm/apic.h> | ||
42 | #include <asm/apicdef.h> | ||
43 | #include <asm/page.h> | ||
44 | #include <asm/pgtable.h> | ||
45 | #include <asm/pgalloc.h> | ||
46 | #include <asm/io_apic.h> | ||
47 | #include <asm/proto.h> | ||
48 | #include <asm/tlbflush.h> | ||
49 | |||
50 | |||
51 | /* -------------------------------------------------------------------------- | ||
52 | Low-Level Sleep Support | ||
53 | -------------------------------------------------------------------------- */ | ||
54 | |||
55 | #ifdef CONFIG_ACPI_SLEEP | ||
56 | |||
57 | /* address in low memory of the wakeup routine. */ | ||
58 | unsigned long acpi_wakeup_address = 0; | ||
59 | unsigned long acpi_video_flags; | ||
60 | extern char wakeup_start, wakeup_end; | ||
61 | |||
62 | extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); | ||
63 | |||
64 | static pgd_t low_ptr; | ||
65 | |||
66 | static void init_low_mapping(void) | ||
67 | { | ||
68 | pgd_t *slot0 = pgd_offset(current->mm, 0UL); | ||
69 | low_ptr = *slot0; | ||
70 | set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET)); | ||
71 | flush_tlb_all(); | ||
72 | } | ||
73 | |||
74 | /** | ||
75 | * acpi_save_state_mem - save kernel state | ||
76 | * | ||
77 | * Create an identity mapped page table and copy the wakeup routine to | ||
78 | * low memory. | ||
79 | */ | ||
80 | int acpi_save_state_mem (void) | ||
81 | { | ||
82 | init_low_mapping(); | ||
83 | |||
84 | memcpy((void *) acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start); | ||
85 | acpi_copy_wakeup_routine(acpi_wakeup_address); | ||
86 | |||
87 | return 0; | ||
88 | } | ||
89 | |||
90 | /* | ||
91 | * acpi_restore_state | ||
92 | */ | ||
93 | void acpi_restore_state_mem (void) | ||
94 | { | ||
95 | set_pgd(pgd_offset(current->mm, 0UL), low_ptr); | ||
96 | flush_tlb_all(); | ||
97 | } | ||
98 | |||
99 | /** | ||
100 | * acpi_reserve_bootmem - do _very_ early ACPI initialisation | ||
101 | * | ||
102 | * We allocate a page in low memory for the wakeup | ||
103 | * routine for when we come back from a sleep state. The | ||
104 | * runtime allocator allows specification of <16M pages, but not | ||
105 | * <1M pages. | ||
106 | */ | ||
107 | void __init acpi_reserve_bootmem(void) | ||
108 | { | ||
109 | acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); | ||
110 | if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) | ||
111 | printk(KERN_CRIT "ACPI: Wakeup code way too big, will crash on attempt to suspend\n"); | ||
112 | } | ||
113 | |||
114 | static int __init acpi_sleep_setup(char *str) | ||
115 | { | ||
116 | while ((str != NULL) && (*str != '\0')) { | ||
117 | if (strncmp(str, "s3_bios", 7) == 0) | ||
118 | acpi_video_flags = 1; | ||
119 | if (strncmp(str, "s3_mode", 7) == 0) | ||
120 | acpi_video_flags |= 2; | ||
121 | str = strchr(str, ','); | ||
122 | if (str != NULL) | ||
123 | str += strspn(str, ", \t"); | ||
124 | } | ||
125 | return 1; | ||
126 | } | ||
127 | |||
128 | __setup("acpi_sleep=", acpi_sleep_setup); | ||
129 | |||
130 | #endif /*CONFIG_ACPI_SLEEP*/ | ||
131 | |||
/* Empty stub: this definition of acpi_pci_link_exit() does nothing. */
void acpi_pci_link_exit(void)
{
}
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S new file mode 100644 index 000000000000..a4c630034cd4 --- /dev/null +++ b/arch/x86_64/kernel/acpi/wakeup.S | |||
@@ -0,0 +1,527 @@ | |||
1 | .text | ||
2 | #include <linux/linkage.h> | ||
3 | #include <asm/segment.h> | ||
4 | #include <asm/page.h> | ||
5 | #include <asm/msr.h> | ||
6 | |||
7 | # Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2 | ||
8 | # | ||
9 | # wakeup_code runs in real mode, and at unknown address (determined at run-time). | ||
10 | # Therefore it must only use relative jumps/calls. | ||
11 | # | ||
12 | # Do we need to deal with A20? It is okay: ACPI specs says A20 must be enabled | ||
13 | # | ||
14 | # If physical address of wakeup_code is 0x12345, BIOS should call us with | ||
15 | # cs = 0x1234, eip = 0x05 | ||
16 | # | ||
17 | |||
18 | |||
/* wakeup_start / wakeup_code: 16-bit entry point of the wakeup routine.
   All data references are written as (symbol - wakeup_code) because the
   code runs from a copy whose segment base is this label.  In order it:
     - writes 0xa1 to port 0x80, disables interrupts, clears DF;
     - points %ds and %ss at %cs and switches to the private wakeup_stack;
     - clears the flags register;
     - hangs in bogus_real_magic unless real_magic holds 0x12345678;
     - if bit 0 of video_flags is set, far-calls 0xc000:3 and reloads
       %ds/%ss afterwards;
     - if bit 1 of video_flags is set, calls mode_seta with video_mode;
     - writes 'L' into text-mode video memory and 0xa2 to port 0x80;
     - loads the IDT from idt_48a, patches the linear address of gdta into
       gdt_48a and loads it as the GDT;
     - sets the PE bit with lmsw and far-jumps, through a hand-encoded
       0x66 0xea instruction, to wakeup_32 at its address minus
       __START_KERNEL_map with selector __KERNEL_CS. */
19 | ALIGN | ||
20 | .align 16 | ||
21 | ENTRY(wakeup_start) | ||
22 | wakeup_code: | ||
23 | wakeup_code_start = . | ||
24 | .code16 | ||
25 | |||
26 | # Running in *copy* of this code, somewhere in low 1MB. | ||
27 | |||
28 | movb $0xa1, %al ; outb %al, $0x80 | ||
29 | cli | ||
30 | cld | ||
31 | # setup data segment | ||
32 | movw %cs, %ax | ||
33 | movw %ax, %ds # Make ds:0 point to wakeup_start | ||
34 | movw %ax, %ss | ||
35 | mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board | ||
36 | |||
37 | pushl $0 # Kill any dangerous flags | ||
38 | popfl | ||
39 | |||
40 | movl real_magic - wakeup_code, %eax | ||
41 | cmpl $0x12345678, %eax | ||
42 | jne bogus_real_magic | ||
43 | |||
44 | testl $1, video_flags - wakeup_code | ||
45 | jz 1f | ||
46 | lcall $0xc000,$3 | ||
47 | movw %cs, %ax | ||
48 | movw %ax, %ds # Bios might have played with that | ||
49 | movw %ax, %ss | ||
50 | 1: | ||
51 | |||
52 | testl $2, video_flags - wakeup_code | ||
53 | jz 1f | ||
54 | mov video_mode - wakeup_code, %ax | ||
55 | call mode_seta | ||
56 | 1: | ||
57 | |||
58 | movw $0xb800, %ax | ||
59 | movw %ax,%fs | ||
60 | movw $0x0e00 + 'L', %fs:(0x10) | ||
61 | |||
62 | movb $0xa2, %al ; outb %al, $0x80 | ||
63 | |||
64 | lidt %ds:idt_48a - wakeup_code | ||
65 | xorl %eax, %eax | ||
66 | movw %ds, %ax # (Convert %ds:gdt to a linear ptr) | ||
67 | shll $4, %eax | ||
68 | addl $(gdta - wakeup_code), %eax | ||
69 | movl %eax, gdt_48a +2 - wakeup_code | ||
70 | lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is | ||
71 | # appropriate | ||
72 | |||
73 | movl $1, %eax # protected mode (PE) bit | ||
74 | lmsw %ax # This is it! | ||
75 | jmp 1f | ||
76 | 1: | ||
77 | |||
78 | .byte 0x66, 0xea # prefix + jmpi-opcode | ||
79 | .long wakeup_32 - __START_KERNEL_map | ||
80 | .word __KERNEL_CS | ||
81 | |||
/* wakeup_32: 32-bit protected-mode stage, reached from the real-mode far
   jump; symbols are addressed as (symbol - __START_KERNEL_map).  In order
   it:
     - requires CPUID leaf 0x80000001 to exist and bit 29 of its %edx to be
       set, otherwise hangs in bogus_cpu; %edx is kept in %edi so that
       bit 20 can be tested later for the NX decision;
     - loads __KERNEL_DS into the data/stack segments and switches to
       wakeup_stack;
     - hangs in bogus_32_magic unless saved_magic holds 0x9abcdef0;
     - writes %cr4 with only bits 5 and 7 set, and points %cr3 at
       wakeup_level4_pgt;
     - writes MSR_EFER with LME and SCE set (plus NX when bit 20 was set).
       %eax is rebuilt from zero, while %edx still holds what rdmsr
       returned, so the upper half of the MSR is written back unchanged;
     - writes %cr0 with paging and protection enabled and branches
       immediately, as the in-line note requires;
     - loads the GDT described by pGDT32 and far-jumps through
       wakeup_jumpvector to wakeup_long64. */
82 | .code32 | ||
83 | wakeup_32: | ||
84 | # Running in this code, but at low address; paging is not yet turned on. | ||
85 | movb $0xa5, %al ; outb %al, $0x80 | ||
86 | |||
87 | /* Check if extended functions are implemented */ | ||
88 | movl $0x80000000, %eax | ||
89 | cpuid | ||
90 | cmpl $0x80000000, %eax | ||
91 | jbe bogus_cpu | ||
92 | wbinvd | ||
93 | mov $0x80000001, %eax | ||
94 | cpuid | ||
95 | btl $29, %edx | ||
96 | jnc bogus_cpu | ||
97 | movl %edx,%edi | ||
98 | |||
99 | movw $__KERNEL_DS, %ax | ||
100 | movw %ax, %ds | ||
101 | movw %ax, %es | ||
102 | movw %ax, %fs | ||
103 | movw %ax, %gs | ||
104 | |||
105 | movw $__KERNEL_DS, %ax | ||
106 | movw %ax, %ss | ||
107 | |||
108 | mov $(wakeup_stack - __START_KERNEL_map), %esp | ||
109 | movl saved_magic - __START_KERNEL_map, %eax | ||
110 | cmpl $0x9abcdef0, %eax | ||
111 | jne bogus_32_magic | ||
112 | |||
113 | /* | ||
114 | * Prepare for entering 64bits mode | ||
115 | */ | ||
116 | |||
117 | /* Enable PAE mode and PGE */ | ||
118 | xorl %eax, %eax | ||
119 | btsl $5, %eax | ||
120 | btsl $7, %eax | ||
121 | movl %eax, %cr4 | ||
122 | |||
123 | /* Setup early boot stage 4 level pagetables */ | ||
124 | movl $(wakeup_level4_pgt - __START_KERNEL_map), %eax | ||
125 | movl %eax, %cr3 | ||
126 | |||
127 | /* Setup EFER (Extended Feature Enable Register) */ | ||
128 | movl $MSR_EFER, %ecx | ||
129 | rdmsr | ||
130 | /* Fool rdmsr and reset %eax to avoid dependences */ | ||
131 | xorl %eax, %eax | ||
132 | /* Enable Long Mode */ | ||
133 | btsl $_EFER_LME, %eax | ||
134 | /* Enable System Call */ | ||
135 | btsl $_EFER_SCE, %eax | ||
136 | |||
137 | /* No Execute supported? */ | ||
138 | btl $20,%edi | ||
139 | jnc 1f | ||
140 | btsl $_EFER_NX, %eax | ||
141 | 1: | ||
142 | |||
143 | /* Make changes effective */ | ||
144 | wrmsr | ||
145 | wbinvd | ||
146 | |||
147 | xorl %eax, %eax | ||
148 | btsl $31, %eax /* Enable paging and in turn activate Long Mode */ | ||
149 | btsl $0, %eax /* Enable protected mode */ | ||
150 | btsl $1, %eax /* Enable MP */ | ||
151 | btsl $4, %eax /* Enable ET */ | ||
152 | btsl $5, %eax /* Enable NE */ | ||
153 | btsl $16, %eax /* Enable WP */ | ||
154 | btsl $18, %eax /* Enable AM */ | ||
155 | |||
156 | /* Make changes effective */ | ||
157 | movl %eax, %cr0 | ||
158 | /* At this point: | ||
159 | CR4.PAE must be 1 | ||
160 | CS.L must be 0 | ||
161 | CR3 must point to PML4 | ||
162 | Next instruction must be a branch | ||
163 | This must be on identity-mapped page | ||
164 | */ | ||
165 | jmp reach_compatibility_mode | ||
166 | reach_compatibility_mode: | ||
167 | movw $0x0e00 + 'i', %ds:(0xb8012) | ||
168 | movb $0xa8, %al ; outb %al, $0x80; | ||
169 | |||
170 | /* | ||
171 | * At this point we're in long mode but in 32bit compatibility mode | ||
172 | * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn | ||
173 | * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we load | ||
174 | * the new gdt/idt that has __KERNEL_CS with CS.L = 1. | ||
175 | */ | ||
176 | |||
177 | movw $0x0e00 + 'n', %ds:(0xb8014) | ||
178 | movb $0xa9, %al ; outb %al, $0x80 | ||
179 | |||
180 | /* Load new GDT with the 64bit segment using 32bit descriptor */ | ||
181 | movl $(pGDT32 - __START_KERNEL_map), %eax | ||
182 | lgdt (%eax) | ||
183 | |||
184 | movl $(wakeup_jumpvector - __START_KERNEL_map), %eax | ||
185 | /* Finally jump in 64bit mode */ | ||
186 | ljmp *(%eax) | ||
187 | |||
188 | wakeup_jumpvector: | ||
189 | .long wakeup_long64 - __START_KERNEL_map | ||
190 | .word __KERNEL_CS | ||
191 | |||
/* wakeup_long64: first code executed in 64-bit mode.  It reloads the GDT
   from cpu_gdt_descr, loads __KERNEL_DS into all data/stack segment
   registers, restores %rsp, %rbx, %rdi, %rsi and %rbp from the saved_*
   variables and finally jumps to the address stored in saved_eip.  The
   writes to 0xb8016, 0xb8018 and 0xb801a put the progress characters 'u',
   'x' and '!' on the text screen. */
192 | .code64 | ||
193 | |||
194 | /* Hooray, we are in Long 64-bit mode (but still running in low memory) */ | ||
195 | wakeup_long64: | ||
196 | /* | ||
197 | * We must switch to a new descriptor in kernel space for the GDT | ||
198 | * because soon the kernel won't have access anymore to the userspace | ||
199 | * addresses where we're currently running on. We have to do that here | ||
200 | * because in 32bit we couldn't load a 64bit linear address. | ||
201 | */ | ||
202 | lgdt cpu_gdt_descr - __START_KERNEL_map | ||
203 | |||
204 | movw $0x0e00 + 'u', %ds:(0xb8016) | ||
205 | |||
206 | nop | ||
207 | nop | ||
208 | movw $__KERNEL_DS, %ax | ||
209 | movw %ax, %ss | ||
210 | movw %ax, %ds | ||
211 | movw %ax, %es | ||
212 | movw %ax, %fs | ||
213 | movw %ax, %gs | ||
214 | movq saved_esp, %rsp | ||
215 | |||
216 | movw $0x0e00 + 'x', %ds:(0xb8018) | ||
217 | movq saved_ebx, %rbx | ||
218 | movq saved_edi, %rdi | ||
219 | movq saved_esi, %rsi | ||
220 | movq saved_ebp, %rbp | ||
221 | |||
222 | movw $0x0e00 + '!', %ds:(0xb801a) | ||
223 | movq saved_eip, %rax | ||
224 | jmp *%rax | ||
225 | |||
/* Data used by the real-mode entry code.
   gdta is the temporary GDT, five 8-byte descriptors: a dummy, an unused
   slot, a 32-bit code segment, a data segment, and a 64-bit code segment.
   idt_48a is an IDT pseudo-descriptor with limit 0 and base 0.
   gdt_48a is the GDT pseudo-descriptor; its base is filled in at run time
   by the real-mode code before lgdt.
   NOTE(review): the gdt_48a limit word is 0x8000, but its comment speaks of
   2048 bytes / 256 entries (which would be 0x800), and gdta itself only
   holds five descriptors -- confirm which value is intended.
   real_magic, video_mode and video_flags are read by the real-mode code;
   nothing in the visible part of this file reads real_save_gdt. */
226 | .code32 | ||
227 | |||
228 | .align 64 | ||
229 | gdta: | ||
230 | .word 0, 0, 0, 0 # dummy | ||
231 | |||
232 | .word 0, 0, 0, 0 # unused | ||
233 | |||
234 | .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) | ||
235 | .word 0 # base address = 0 | ||
236 | .word 0x9B00 # code read/exec. ??? Why I need 0x9B00 (as opposed to 0x9A00 in order for this to work?) | ||
237 | .word 0x00CF # granularity = 4096, 386 | ||
238 | # (+5th nibble of limit) | ||
239 | |||
240 | .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) | ||
241 | .word 0 # base address = 0 | ||
242 | .word 0x9200 # data read/write | ||
243 | .word 0x00CF # granularity = 4096, 386 | ||
244 | # (+5th nibble of limit) | ||
245 | # this is 64bit descriptor for code | ||
246 | .word 0xFFFF | ||
247 | .word 0 | ||
248 | .word 0x9A00 # code read/exec | ||
249 | .word 0x00AF # as above, but it is long mode and with D=0 | ||
250 | |||
251 | idt_48a: | ||
252 | .word 0 # idt limit = 0 | ||
253 | .word 0, 0 # idt base = 0L | ||
254 | |||
255 | gdt_48a: | ||
256 | .word 0x8000 # gdt limit=2048, | ||
257 | # 256 GDT entries | ||
258 | .word 0, 0 # gdt base (filled in later) | ||
259 | |||
260 | |||
261 | real_save_gdt: .word 0 | ||
262 | .quad 0 | ||
263 | real_magic: .quad 0 | ||
264 | video_mode: .quad 0 | ||
265 | video_flags: .quad 0 | ||
266 | |||
/* Dead-end error loops.  Each one writes a distinct code to I/O port 0x80
   forever and never returns:
     bogus_real_magic - 0xba, real_magic check failed (real-mode code)
     bogus_32_magic   - 0xb3, saved_magic check failed (wakeup_32)
     bogus_31_magic   - 0xb1, not referenced in the visible part of this file
     bogus_cpu        - 0xbc, CPUID checks in wakeup_32 failed */
267 | bogus_real_magic: | ||
268 | movb $0xba,%al ; outb %al,$0x80 | ||
269 | jmp bogus_real_magic | ||
270 | |||
271 | bogus_32_magic: | ||
272 | movb $0xb3,%al ; outb %al,$0x80 | ||
273 | jmp bogus_32_magic | ||
274 | |||
275 | bogus_31_magic: | ||
276 | movb $0xb1,%al ; outb %al,$0x80 | ||
277 | jmp bogus_31_magic | ||
278 | |||
279 | bogus_cpu: | ||
280 | movb $0xbc,%al ; outb %al,$0x80 | ||
281 | jmp bogus_cpu | ||
282 | |||
283 | |||
284 | /* This code uses an extended set of video mode numbers. These include: | ||
285 | * Aliases for standard modes | ||
286 | * NORMAL_VGA (-1) | ||
287 | * EXTENDED_VGA (-2) | ||
288 | * ASK_VGA (-3) | ||
289 | * Video modes numbered by menu position -- NOT RECOMMENDED because of lack | ||
290 | * of compatibility when extending the table. These are between 0x00 and 0xff. | ||
291 | */ | ||
292 | #define VIDEO_FIRST_MENU 0x0000 | ||
293 | |||
294 | /* Standard BIOS video modes (BIOS number + 0x0100) */ | ||
295 | #define VIDEO_FIRST_BIOS 0x0100 | ||
296 | |||
297 | /* VESA BIOS video modes (VESA number + 0x0200) */ | ||
298 | #define VIDEO_FIRST_VESA 0x0200 | ||
299 | |||
300 | /* Video7 special modes (BIOS number + 0x0900) */ | ||
301 | #define VIDEO_FIRST_V7 0x0900 | ||
302 | |||
303 | # Setting of user mode (AX=mode ID) => CF=success | ||
304 | mode_seta: /* Try to set the video mode whose ID is in AX. Returns with CF set on success and CF clear on failure. With the #if 0 sections disabled, only IDs whose high byte is >= VIDEO_FIRST_VESA>>8 are acted on; every other ID falls through to the failure exit. */ | ||
305 | movw %ax, %bx /* keep a copy of the mode ID in BX */ | ||
306 | #if 0 | ||
307 | cmpb $0xff, %ah | ||
308 | jz setalias | ||
309 | |||
310 | testb $VIDEO_RECALC>>8, %ah | ||
311 | jnz _setrec | ||
312 | |||
313 | cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah | ||
314 | jnc setres | ||
315 | |||
316 | cmpb $VIDEO_FIRST_SPECIAL>>8, %ah | ||
317 | jz setspc | ||
318 | |||
319 | cmpb $VIDEO_FIRST_V7>>8, %ah | ||
320 | jz setv7 | ||
321 | #endif | ||
322 | |||
323 | cmpb $VIDEO_FIRST_VESA>>8, %ah | ||
324 | jnc check_vesaa /* taken when AH >= VIDEO_FIRST_VESA>>8 (the compare produced no borrow) */ | ||
325 | #if 0 | ||
326 | orb %ah, %ah | ||
327 | jz setmenu | ||
328 | #endif | ||
329 | |||
330 | decb %ah /* the result is unused while the jz on the next line stays commented out */ | ||
331 | # jz setbios Add bios modes later | ||
332 | |||
333 | setbada: clc /* failure exit: CF = 0 */ | ||
334 | ret | ||
335 | |||
336 | check_vesaa: /* VESA path: BX still holds the mode ID copied at entry */ | ||
337 | subb $VIDEO_FIRST_VESA>>8, %bh /* remove the 0x0200 offset from the ID to get the VESA number */ | ||
338 | orw $0x4000, %bx # Use linear frame buffer | ||
339 | movw $0x4f02, %ax # VESA BIOS mode set call | ||
340 | int $0x10 | ||
341 | cmpw $0x004f, %ax # AL=4f if implemented | ||
342 | jnz _setbada # AH=0 if OK | ||
343 | |||
344 | stc /* success exit: CF = 1 */ | ||
345 | ret | ||
346 | |||
347 | _setbada: jmp setbada /* trampoline to the failure exit */ | ||
348 | |||
349 | .code64 /* everything from here on is assembled as 64-bit code */ | ||
350 | bogus_magic: /* Endless loop that keeps storing the word 0x0e00 + 'B' at address 0xb8018 (presumably a text-mode screen cell -- confirm). */ | ||
351 | movw $0x0e00 + 'B', %ds:(0xb8018) | ||
352 | jmp bogus_magic | ||
353 | |||
354 | bogus_magic2: /* Same as bogus_magic, but stores 0x0e00 + '2'. */ | ||
355 | movw $0x0e00 + '2', %ds:(0xb8018) | ||
356 | jmp bogus_magic2 | ||
357 | |||
358 | |||
359 | wakeup_stack_begin: /* lowest address of the region between the code above and wakeup_stack */ # Stack grows down | ||
360 | |||
361 | .org 0xff0 /* advance the location counter to offset 0xff0 of the current section */ | ||
362 | wakeup_stack: /* label placed at offset 0xff0 */ # Just below end of page | ||
363 | |||
364 | ENTRY(wakeup_end) /* exported label that directly follows wakeup_stack */ | ||
365 | |||
366 | ## | ||
367 | # acpi_copy_wakeup_routine | ||
368 | # | ||
369 | # Copy the above routine to low memory. | ||
370 | # | ||
371 | # Parameters: | ||
372 | # %rdi: place to copy wakeup routine to | ||
373 | # | ||
374 | # Returned address is location of code in low memory (past data and stack) | ||
375 | # | ||
376 | ENTRY(acpi_copy_wakeup_routine) /* In: %rdi = base address that the wakeup_start-relative stores below are applied to. Saves descriptor-table, control-register and EFER state into the saved_* variables and fills in the real_save_gdt, video_mode, video_flags and real_magic slots relative to %rdi. NOTE(review): no copy loop is visible in this routine, and %rax is restored before the ret, so the "copy" and "returned address" promised by the header comment are not implemented here -- confirm against the caller. */ | ||
377 | pushq %rax /* %rax, %rcx and %rdx are used as scratch and restored before returning */ | ||
378 | pushq %rcx | ||
379 | pushq %rdx | ||
380 | |||
381 | sgdt saved_gdt /* record the current GDTR, IDTR, LDTR and task register */ | ||
382 | sidt saved_idt | ||
383 | sldt saved_ldt | ||
384 | str saved_tss | ||
385 | |||
386 | movq %cr3, %rdx /* record CR3, CR4 and CR0 */ | ||
387 | movq %rdx, saved_cr3 | ||
388 | movq %cr4, %rdx | ||
389 | movq %rdx, saved_cr4 | ||
390 | movq %cr0, %rdx | ||
391 | movq %rdx, saved_cr0 | ||
392 | sgdt real_save_gdt - wakeup_start (,%rdi) /* second GDTR copy, stored relative to %rdi */ | ||
393 | movl $MSR_EFER, %ecx | ||
394 | rdmsr /* EFER comes back in %edx:%eax */ | ||
395 | movl %eax, saved_efer /* low half */ | ||
396 | movl %edx, saved_efer2 /* high half */ | ||
397 | |||
398 | movl saved_video_mode, %edx | ||
399 | movl %edx, video_mode - wakeup_start (,%rdi) | ||
400 | movl acpi_video_flags, %edx | ||
401 | movl %edx, video_flags - wakeup_start (,%rdi) | ||
402 | movq $0x12345678, real_magic - wakeup_start (,%rdi) | ||
403 | movq $0x123456789abcdef0, %rdx | ||
404 | movq %rdx, saved_magic | ||
405 | |||
406 | movl saved_magic - __START_KERNEL_map, %eax /* read the low 32 bits of saved_magic back through the address minus __START_KERNEL_map */ | ||
407 | cmpl $0x9abcdef0, %eax | ||
408 | jne bogus_32_magic /* mismatch: never returns */ | ||
409 | |||
410 | # make sure %cr4 is set correctly (features, etc) | ||
411 | movl saved_cr4 - __START_KERNEL_map, %eax /* reload CR4 from the low 32 bits saved above */ | ||
412 | movq %rax, %cr4 | ||
413 | |||
414 | movl saved_cr0 - __START_KERNEL_map, %eax /* reload CR0 from the low 32 bits saved above */ | ||
415 | movq %rax, %cr0 | ||
416 | jmp 1f # Flush pipelines | ||
417 | 1: | ||
418 | # restore the regs we used | ||
419 | popq %rdx | ||
420 | popq %rcx | ||
421 | popq %rax | ||
422 | ENTRY(do_suspend_lowlevel_s4bios) /* this entry point shares the ret below and does nothing else */ | ||
423 | ret | ||
424 | |||
425 | .align 2 | ||
426 | .p2align 4,,15 | ||
427 | .globl do_suspend_lowlevel | ||
428 | .type do_suspend_lowlevel,@function | ||
429 | do_suspend_lowlevel: /* Calls save_processor_state, stores the general registers and flags in the saved_context_* slots, records .L97 in saved_eip, then jumps to acpi_enter_sleep_state with 3 in %edi. The code at .L97 reloads that state and finishes in restore_processor_state. */ | ||
430 | .LFB5: | ||
431 | subq $8, %rsp /* 8-byte pad, removed again before both tail jumps (presumably for stack alignment -- confirm) */ | ||
432 | xorl %eax, %eax | ||
433 | call save_processor_state | ||
434 | |||
435 | movq %rsp, saved_context_esp(%rip) /* snapshot the general-purpose registers */ | ||
436 | movq %rax, saved_context_eax(%rip) | ||
437 | movq %rbx, saved_context_ebx(%rip) | ||
438 | movq %rcx, saved_context_ecx(%rip) | ||
439 | movq %rdx, saved_context_edx(%rip) | ||
440 | movq %rbp, saved_context_ebp(%rip) | ||
441 | movq %rsi, saved_context_esi(%rip) | ||
442 | movq %rdi, saved_context_edi(%rip) | ||
443 | movq %r8, saved_context_r08(%rip) | ||
444 | movq %r9, saved_context_r09(%rip) | ||
445 | movq %r10, saved_context_r10(%rip) | ||
446 | movq %r11, saved_context_r11(%rip) | ||
447 | movq %r12, saved_context_r12(%rip) | ||
448 | movq %r13, saved_context_r13(%rip) | ||
449 | movq %r14, saved_context_r14(%rip) | ||
450 | movq %r15, saved_context_r15(%rip) | ||
451 | pushfq ; popq saved_context_eflags(%rip) /* snapshot RFLAGS via the stack */ | ||
452 | |||
453 | movq $.L97, saved_eip(%rip) /* record the address of the restore code below */ | ||
454 | |||
455 | movq %rsp,saved_esp /* second copy of rsp/rbp/rbx/rdi/rsi in the saved_* variables */ | ||
456 | movq %rbp,saved_ebp | ||
457 | movq %rbx,saved_ebx | ||
458 | movq %rdi,saved_edi | ||
459 | movq %rsi,saved_esi | ||
460 | |||
461 | addq $8, %rsp /* drop the pad added at entry */ | ||
462 | movl $3, %edi /* first argument: 3 */ | ||
463 | xorl %eax, %eax | ||
464 | jmp acpi_enter_sleep_state /* tail jump: acpi_enter_sleep_state returns to our caller */ | ||
465 | .L97: /* restore path; its address was stored in saved_eip above */ | ||
466 | .p2align 4,,7 | ||
467 | .L99: | ||
468 | .align 4 | ||
469 | movl $24, %eax /* load selector value 24 into %ds */ | ||
470 | movw %ax, %ds | ||
471 | movq saved_context+58(%rip), %rax /* NOTE(review): CR4/CR3/CR2/CR0 are read from hard-coded byte offsets 58/50/42/34 in saved_context; the layout is not visible here -- confirm the offsets match it */ | ||
472 | movq %rax, %cr4 | ||
473 | movq saved_context+50(%rip), %rax | ||
474 | movq %rax, %cr3 | ||
475 | movq saved_context+42(%rip), %rax | ||
476 | movq %rax, %cr2 | ||
477 | movq saved_context+34(%rip), %rax | ||
478 | movq %rax, %cr0 | ||
479 | pushq saved_context_eflags(%rip) ; popfq /* reload RFLAGS via the stack */ | ||
480 | movq saved_context_esp(%rip), %rsp /* reload the general-purpose registers */ | ||
481 | movq saved_context_ebp(%rip), %rbp | ||
482 | movq saved_context_eax(%rip), %rax | ||
483 | movq saved_context_ebx(%rip), %rbx | ||
484 | movq saved_context_ecx(%rip), %rcx | ||
485 | movq saved_context_edx(%rip), %rdx | ||
486 | movq saved_context_esi(%rip), %rsi | ||
487 | movq saved_context_edi(%rip), %rdi | ||
488 | movq saved_context_r08(%rip), %r8 | ||
489 | movq saved_context_r09(%rip), %r9 | ||
490 | movq saved_context_r10(%rip), %r10 | ||
491 | movq saved_context_r11(%rip), %r11 | ||
492 | movq saved_context_r12(%rip), %r12 | ||
493 | movq saved_context_r13(%rip), %r13 | ||
494 | movq saved_context_r14(%rip), %r14 | ||
495 | movq saved_context_r15(%rip), %r15 | ||
496 | |||
497 | xorl %eax, %eax /* overwrites the %rax value reloaded above */ | ||
498 | addq $8, %rsp /* the saved %rsp included the entry pad; drop it */ | ||
499 | jmp restore_processor_state /* tail jump: restore_processor_state returns to our caller */ | ||
500 | .LFE5: | ||
501 | .Lfe5: | ||
502 | .size do_suspend_lowlevel,.Lfe5-do_suspend_lowlevel | ||
503 | |||
504 | .data /* storage written by do_suspend_lowlevel and acpi_copy_wakeup_routine */ | ||
505 | ALIGN | ||
506 | ENTRY(saved_ebp) .quad 0 /* %rbp, stored by do_suspend_lowlevel */ | ||
507 | ENTRY(saved_esi) .quad 0 /* %rsi, stored by do_suspend_lowlevel */ | ||
508 | ENTRY(saved_edi) .quad 0 /* %rdi, stored by do_suspend_lowlevel */ | ||
509 | ENTRY(saved_ebx) .quad 0 /* %rbx, stored by do_suspend_lowlevel */ | ||
510 | |||
511 | ENTRY(saved_eip) .quad 0 /* address of the restore code (.L97), stored by do_suspend_lowlevel */ | ||
512 | ENTRY(saved_esp) .quad 0 /* %rsp, stored by do_suspend_lowlevel */ | ||
513 | |||
514 | ENTRY(saved_magic) .quad 0 /* set to 0x123456789abcdef0 by acpi_copy_wakeup_routine */ | ||
515 | |||
516 | ALIGN | ||
517 | # saved registers | ||
518 | saved_gdt: .quad 0,0 /* 16-byte slot, target of sgdt */ | ||
519 | saved_idt: .quad 0,0 /* 16-byte slot, target of sidt */ | ||
520 | saved_ldt: .quad 0 /* target of sldt */ | ||
521 | saved_tss: .quad 0 /* target of str */ | ||
522 | |||
523 | saved_cr0: .quad 0 /* CR0 */ | ||
524 | saved_cr3: .quad 0 /* CR3 */ | ||
525 | saved_cr4: .quad 0 /* CR4 */ | ||
526 | saved_efer: .quad 0 /* low 32 bits of EFER (only 4 bytes are written) */ | ||
527 | saved_efer2: .quad 0 /* high 32 bits of EFER (only 4 bytes are written) */ | ||
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c new file mode 100644 index 000000000000..4baa99fe1e5c --- /dev/null +++ b/arch/x86_64/kernel/aperture.c | |||
@@ -0,0 +1,286 @@ | |||
1 | /* | ||
2 | * Firmware replacement code. | ||
3 | * | ||
4 | * Work around broken BIOSes that don't set an aperture or only set the | ||
5 | * aperture in the AGP bridge. | ||
6 | * If all else fails, map the aperture over some low memory. This is cheaper than | ||
7 | * doing bounce buffering. The memory is lost. This is done at early boot | ||
8 | * because only the bootmem allocator can allocate 32+MB. | ||
9 | * | ||
10 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
11 | * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $ | ||
12 | */ | ||
13 | #include <linux/config.h> | ||
14 | #include <linux/kernel.h> | ||
15 | #include <linux/types.h> | ||
16 | #include <linux/init.h> | ||
17 | #include <linux/bootmem.h> | ||
18 | #include <linux/mmzone.h> | ||
19 | #include <linux/pci_ids.h> | ||
20 | #include <linux/pci.h> | ||
21 | #include <linux/bitops.h> | ||
22 | #include <asm/e820.h> | ||
23 | #include <asm/io.h> | ||
24 | #include <asm/proto.h> | ||
25 | #include <asm/pci-direct.h> | ||
26 | |||
27 | int iommu_aperture; | ||
28 | int iommu_aperture_disabled __initdata = 0; | ||
29 | int iommu_aperture_allowed __initdata = 0; | ||
30 | |||
31 | int fallback_aper_order __initdata = 1; /* 64MB */ | ||
32 | int fallback_aper_force __initdata = 0; | ||
33 | |||
34 | int fix_aperture __initdata = 1; | ||
35 | |||
36 | /* This code runs before the PCI subsystem is initialized, so just | ||
37 | access the northbridge directly. */ | ||
38 | |||
39 | #define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16)) | ||
40 | |||
41 | static u32 __init allocate_aperture(void) | ||
42 | { | ||
43 | #ifdef CONFIG_DISCONTIGMEM | ||
44 | pg_data_t *nd0 = NODE_DATA(0); | ||
45 | #else | ||
46 | pg_data_t *nd0 = &contig_page_data; | ||
47 | #endif | ||
48 | u32 aper_size; | ||
49 | void *p; | ||
50 | |||
51 | if (fallback_aper_order > 7) | ||
52 | fallback_aper_order = 7; | ||
53 | aper_size = (32 * 1024 * 1024) << fallback_aper_order; | ||
54 | |||
55 | /* | ||
56 | * Aperture has to be naturally aligned. This means an 2GB aperture won't | ||
57 | * have much chances to find a place in the lower 4GB of memory. | ||
58 | * Unfortunately we cannot move it up because that would make the | ||
59 | * IOMMU useless. | ||
60 | */ | ||
61 | p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0); | ||
62 | if (!p || __pa(p)+aper_size > 0xffffffff) { | ||
63 | printk("Cannot allocate aperture memory hole (%p,%uK)\n", | ||
64 | p, aper_size>>10); | ||
65 | if (p) | ||
66 | free_bootmem_node(nd0, (unsigned long)p, aper_size); | ||
67 | return 0; | ||
68 | } | ||
69 | printk("Mapping aperture over %d KB of RAM @ %lx\n", | ||
70 | aper_size >> 10, __pa(p)); | ||
71 | return (u32)__pa(p); | ||
72 | } | ||
73 | |||
74 | static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size) | ||
75 | { | ||
76 | if (!aper_base) | ||
77 | return 0; | ||
78 | if (aper_size < 64*1024*1024) { | ||
79 | printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20); | ||
80 | return 0; | ||
81 | } | ||
82 | if (aper_base + aper_size >= 0xffffffff) { | ||
83 | printk("Aperture from %s beyond 4GB. Ignoring.\n",name); | ||
84 | return 0; | ||
85 | } | ||
86 | if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) { | ||
87 | printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name); | ||
88 | return 0; | ||
89 | } | ||
90 | return 1; | ||
91 | } | ||
92 | |||
93 | /* Find a PCI capability */ | ||
94 | static __u32 __init find_cap(int num, int slot, int func, int cap) | ||
95 | { | ||
96 | u8 pos; | ||
97 | int bytes; | ||
98 | if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) | ||
99 | return 0; | ||
100 | pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); | ||
101 | for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { | ||
102 | u8 id; | ||
103 | pos &= ~3; | ||
104 | id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); | ||
105 | if (id == 0xff) | ||
106 | break; | ||
107 | if (id == cap) | ||
108 | return pos; | ||
109 | pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); | ||
110 | } | ||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | /* Read a standard AGPv3 bridge header */ | ||
115 | static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) | ||
116 | { | ||
117 | u32 apsize; | ||
118 | u32 apsizereg; | ||
119 | int nbits; | ||
120 | u32 aper_low, aper_hi; | ||
121 | u64 aper; | ||
122 | |||
123 | printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); | ||
124 | apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); | ||
125 | if (apsizereg == 0xffffffff) { | ||
126 | printk("APSIZE in AGP bridge unreadable\n"); | ||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | apsize = apsizereg & 0xfff; | ||
131 | /* Some BIOS use weird encodings not in the AGPv3 table. */ | ||
132 | if (apsize & 0xff) | ||
133 | apsize |= 0xf00; | ||
134 | nbits = hweight16(apsize); | ||
135 | *order = 7 - nbits; | ||
136 | if ((int)*order < 0) /* < 32MB */ | ||
137 | *order = 0; | ||
138 | |||
139 | aper_low = read_pci_config(num,slot,func, 0x10); | ||
140 | aper_hi = read_pci_config(num,slot,func,0x14); | ||
141 | aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); | ||
142 | |||
143 | printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", | ||
144 | aper, 32 << *order, apsizereg); | ||
145 | |||
146 | if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order)) | ||
147 | return 0; | ||
148 | return (u32)aper; | ||
149 | } | ||
150 | |||
151 | /* Look for an AGP bridge. Windows only expects the aperture in the | ||
152 | AGP bridge and some BIOS forget to initialize the Northbridge too. | ||
153 | Work around this here. | ||
154 | |||
155 | Do an PCI bus scan by hand because we're running before the PCI | ||
156 | subsystem. | ||
157 | |||
158 | All K8 AGP bridges are AGPv3 compliant, so we can do this scan | ||
159 | generically. It's probably overkill to always scan all slots because | ||
160 | the AGP bridges should be always an own bus on the HT hierarchy, | ||
161 | but do it here for future safety. */ | ||
162 | static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) | ||
163 | { | ||
164 | int num, slot, func; | ||
165 | |||
166 | /* Poor man's PCI discovery */ | ||
167 | for (num = 0; num < 32; num++) { | ||
168 | for (slot = 0; slot < 32; slot++) { | ||
169 | for (func = 0; func < 8; func++) { | ||
170 | u32 class, cap; | ||
171 | u8 type; | ||
172 | class = read_pci_config(num,slot,func, | ||
173 | PCI_CLASS_REVISION); | ||
174 | if (class == 0xffffffff) | ||
175 | break; | ||
176 | |||
177 | switch (class >> 16) { | ||
178 | case PCI_CLASS_BRIDGE_HOST: | ||
179 | case PCI_CLASS_BRIDGE_OTHER: /* needed? */ | ||
180 | /* AGP bridge? */ | ||
181 | cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); | ||
182 | if (!cap) | ||
183 | break; | ||
184 | *valid_agp = 1; | ||
185 | return read_agp(num,slot,func,cap,order); | ||
186 | } | ||
187 | |||
188 | /* No multi-function device? */ | ||
189 | type = read_pci_config_byte(num,slot,func, | ||
190 | PCI_HEADER_TYPE); | ||
191 | if (!(type & 0x80)) | ||
192 | break; | ||
193 | } | ||
194 | } | ||
195 | } | ||
196 | printk("No AGP bridge found\n"); | ||
197 | return 0; | ||
198 | } | ||
199 | |||
200 | void __init iommu_hole_init(void) | ||
201 | { | ||
202 | int fix, num; | ||
203 | u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0; | ||
204 | u64 aper_base, last_aper_base = 0; | ||
205 | int valid_agp = 0; | ||
206 | |||
207 | if (iommu_aperture_disabled || !fix_aperture) | ||
208 | return; | ||
209 | |||
210 | printk("Checking aperture...\n"); | ||
211 | |||
212 | fix = 0; | ||
213 | for (num = 24; num < 32; num++) { | ||
214 | char name[30]; | ||
215 | if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) | ||
216 | continue; | ||
217 | |||
218 | iommu_aperture = 1; | ||
219 | |||
220 | aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; | ||
221 | aper_size = (32 * 1024 * 1024) << aper_order; | ||
222 | aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; | ||
223 | aper_base <<= 25; | ||
224 | |||
225 | printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, | ||
226 | aper_base, aper_size>>20); | ||
227 | |||
228 | sprintf(name, "northbridge cpu %d", num-24); | ||
229 | |||
230 | if (!aperture_valid(name, aper_base, aper_size)) { | ||
231 | fix = 1; | ||
232 | break; | ||
233 | } | ||
234 | |||
235 | if ((last_aper_order && aper_order != last_aper_order) || | ||
236 | (last_aper_base && aper_base != last_aper_base)) { | ||
237 | fix = 1; | ||
238 | break; | ||
239 | } | ||
240 | last_aper_order = aper_order; | ||
241 | last_aper_base = aper_base; | ||
242 | } | ||
243 | |||
244 | if (!fix && !fallback_aper_force) | ||
245 | return; | ||
246 | |||
247 | if (!fallback_aper_force) | ||
248 | aper_alloc = search_agp_bridge(&aper_order, &valid_agp); | ||
249 | |||
250 | if (aper_alloc) { | ||
251 | /* Got the aperture from the AGP bridge */ | ||
252 | } else if ((!no_iommu && end_pfn >= 0xffffffff>>PAGE_SHIFT) || | ||
253 | force_iommu || | ||
254 | valid_agp || | ||
255 | fallback_aper_force) { | ||
256 | printk("Your BIOS doesn't leave a aperture memory hole\n"); | ||
257 | printk("Please enable the IOMMU option in the BIOS setup\n"); | ||
258 | printk("This costs you %d MB of RAM\n", | ||
259 | 32 << fallback_aper_order); | ||
260 | |||
261 | aper_order = fallback_aper_order; | ||
262 | aper_alloc = allocate_aperture(); | ||
263 | if (!aper_alloc) { | ||
264 | /* Could disable AGP and IOMMU here, but it's probably | ||
265 | not worth it. But the later users cannot deal with | ||
266 | bad apertures and turning on the aperture over memory | ||
267 | causes very strange problems, so it's better to | ||
268 | panic early. */ | ||
269 | panic("Not enough memory for aperture"); | ||
270 | } | ||
271 | } else { | ||
272 | return; | ||
273 | } | ||
274 | |||
275 | /* Fix up the north bridges */ | ||
276 | for (num = 24; num < 32; num++) { | ||
277 | if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) | ||
278 | continue; | ||
279 | |||
280 | /* Don't enable translation yet. That is done later. | ||
281 | Assume this BIOS didn't initialise the GART so | ||
282 | just overwrite all previous bits */ | ||
283 | write_pci_config(0, num, 3, 0x90, aper_order<<1); | ||
284 | write_pci_config(0, num, 3, 0x94, aper_alloc>>25); | ||
285 | } | ||
286 | } | ||
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c new file mode 100644 index 000000000000..9826f03dc9a3 --- /dev/null +++ b/arch/x86_64/kernel/apic.c | |||
@@ -0,0 +1,1088 @@ | |||
1 | /* | ||
2 | * Local APIC handling, local APIC timers | ||
3 | * | ||
4 | * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> | ||
5 | * | ||
6 | * Fixes | ||
7 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs; | ||
8 | * thanks to Eric Gilmore | ||
9 | * and Rolf G. Tews | ||
10 | * for testing these extensively. | ||
11 | * Maciej W. Rozycki : Various updates and fixes. | ||
12 | * Mikael Pettersson : Power Management for UP-APIC. | ||
13 | * Pavel Machek and | ||
14 | * Mikael Pettersson : PM converted to driver model. | ||
15 | */ | ||
16 | |||
17 | #include <linux/config.h> | ||
18 | #include <linux/init.h> | ||
19 | |||
20 | #include <linux/mm.h> | ||
21 | #include <linux/irq.h> | ||
22 | #include <linux/delay.h> | ||
23 | #include <linux/bootmem.h> | ||
24 | #include <linux/smp_lock.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/mc146818rtc.h> | ||
27 | #include <linux/kernel_stat.h> | ||
28 | #include <linux/sysdev.h> | ||
29 | |||
30 | #include <asm/atomic.h> | ||
31 | #include <asm/smp.h> | ||
32 | #include <asm/mtrr.h> | ||
33 | #include <asm/mpspec.h> | ||
34 | #include <asm/pgalloc.h> | ||
35 | #include <asm/mach_apic.h> | ||
36 | |||
37 | int apic_verbosity; | ||
38 | |||
39 | int disable_apic_timer __initdata; | ||
40 | |||
41 | /* Using APIC to generate smp_local_timer_interrupt? */ | ||
42 | int using_apic_timer = 0; | ||
43 | |||
44 | static DEFINE_PER_CPU(int, prof_multiplier) = 1; | ||
45 | static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; | ||
46 | static DEFINE_PER_CPU(int, prof_counter) = 1; | ||
47 | |||
48 | static void apic_pm_activate(void); | ||
49 | |||
50 | void enable_NMI_through_LVT0 (void * dummy) | ||
51 | { | ||
52 | unsigned int v, ver; | ||
53 | |||
54 | ver = apic_read(APIC_LVR); | ||
55 | ver = GET_APIC_VERSION(ver); | ||
56 | v = APIC_DM_NMI; /* unmask and set to NMI */ | ||
57 | apic_write_around(APIC_LVT0, v); | ||
58 | } | ||
59 | |||
60 | int get_maxlvt(void) | ||
61 | { | ||
62 | unsigned int v, ver, maxlvt; | ||
63 | |||
64 | v = apic_read(APIC_LVR); | ||
65 | ver = GET_APIC_VERSION(v); | ||
66 | maxlvt = GET_APIC_MAXLVT(v); | ||
67 | return maxlvt; | ||
68 | } | ||
69 | |||
70 | void clear_local_APIC(void) | ||
71 | { | ||
72 | int maxlvt; | ||
73 | unsigned int v; | ||
74 | |||
75 | maxlvt = get_maxlvt(); | ||
76 | |||
77 | /* | ||
78 | * Masking an LVT entry on a P6 can trigger a local APIC error | ||
79 | * if the vector is zero. Mask LVTERR first to prevent this. | ||
80 | */ | ||
81 | if (maxlvt >= 3) { | ||
82 | v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ | ||
83 | apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); | ||
84 | } | ||
85 | /* | ||
86 | * Careful: we have to set masks only first to deassert | ||
87 | * any level-triggered sources. | ||
88 | */ | ||
89 | v = apic_read(APIC_LVTT); | ||
90 | apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); | ||
91 | v = apic_read(APIC_LVT0); | ||
92 | apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); | ||
93 | v = apic_read(APIC_LVT1); | ||
94 | apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); | ||
95 | if (maxlvt >= 4) { | ||
96 | v = apic_read(APIC_LVTPC); | ||
97 | apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * Clean APIC state for other OSs: | ||
102 | */ | ||
103 | apic_write_around(APIC_LVTT, APIC_LVT_MASKED); | ||
104 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED); | ||
105 | apic_write_around(APIC_LVT1, APIC_LVT_MASKED); | ||
106 | if (maxlvt >= 3) | ||
107 | apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); | ||
108 | if (maxlvt >= 4) | ||
109 | apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); | ||
110 | v = GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
111 | if (APIC_INTEGRATED(v)) { /* !82489DX */ | ||
112 | if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */ | ||
113 | apic_write(APIC_ESR, 0); | ||
114 | apic_read(APIC_ESR); | ||
115 | } | ||
116 | } | ||
117 | |||
118 | void __init connect_bsp_APIC(void) | ||
119 | { | ||
120 | if (pic_mode) { | ||
121 | /* | ||
122 | * Do not trust the local APIC being empty at bootup. | ||
123 | */ | ||
124 | clear_local_APIC(); | ||
125 | /* | ||
126 | * PIC mode, enable APIC mode in the IMCR, i.e. | ||
127 | * connect BSP's local APIC to INT and NMI lines. | ||
128 | */ | ||
129 | apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n"); | ||
130 | outb(0x70, 0x22); | ||
131 | outb(0x01, 0x23); | ||
132 | } | ||
133 | } | ||
134 | |||
135 | void disconnect_bsp_APIC(void) | ||
136 | { | ||
137 | if (pic_mode) { | ||
138 | /* | ||
139 | * Put the board back into PIC mode (has an effect | ||
140 | * only on certain older boards). Note that APIC | ||
141 | * interrupts, including IPIs, won't work beyond | ||
142 | * this point! The only exception are INIT IPIs. | ||
143 | */ | ||
144 | apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n"); | ||
145 | outb(0x70, 0x22); | ||
146 | outb(0x00, 0x23); | ||
147 | } | ||
148 | } | ||
149 | |||
150 | void disable_local_APIC(void) | ||
151 | { | ||
152 | unsigned int value; | ||
153 | |||
154 | clear_local_APIC(); | ||
155 | |||
156 | /* | ||
157 | * Disable APIC (implies clearing of registers | ||
158 | * for 82489DX!). | ||
159 | */ | ||
160 | value = apic_read(APIC_SPIV); | ||
161 | value &= ~APIC_SPIV_APIC_ENABLED; | ||
162 | apic_write_around(APIC_SPIV, value); | ||
163 | } | ||
164 | |||
165 | /* | ||
166 | * This is to verify that we're looking at a real local APIC. | ||
167 | * Check these against your board if the CPUs aren't getting | ||
168 | * started for no apparent reason. | ||
169 | */ | ||
170 | int __init verify_local_APIC(void) | ||
171 | { | ||
172 | unsigned int reg0, reg1; | ||
173 | |||
174 | /* | ||
175 | * The version register is read-only in a real APIC. | ||
176 | */ | ||
177 | reg0 = apic_read(APIC_LVR); | ||
178 | apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); | ||
179 | apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); | ||
180 | reg1 = apic_read(APIC_LVR); | ||
181 | apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); | ||
182 | |||
183 | /* | ||
184 | * The two version reads above should print the same | ||
185 | * numbers. If the second one is different, then we | ||
186 | * poke at a non-APIC. | ||
187 | */ | ||
188 | if (reg1 != reg0) | ||
189 | return 0; | ||
190 | |||
191 | /* | ||
192 | * Check if the version looks reasonably. | ||
193 | */ | ||
194 | reg1 = GET_APIC_VERSION(reg0); | ||
195 | if (reg1 == 0x00 || reg1 == 0xff) | ||
196 | return 0; | ||
197 | reg1 = get_maxlvt(); | ||
198 | if (reg1 < 0x02 || reg1 == 0xff) | ||
199 | return 0; | ||
200 | |||
201 | /* | ||
202 | * The ID register is read/write in a real APIC. | ||
203 | */ | ||
204 | reg0 = apic_read(APIC_ID); | ||
205 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); | ||
206 | apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); | ||
207 | reg1 = apic_read(APIC_ID); | ||
208 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); | ||
209 | apic_write(APIC_ID, reg0); | ||
210 | if (reg1 != (reg0 ^ APIC_ID_MASK)) | ||
211 | return 0; | ||
212 | |||
213 | /* | ||
214 | * The next two are just to see if we have sane values. | ||
215 | * They're only really relevant if we're in Virtual Wire | ||
216 | * compatibility mode, but most boxes are anymore. | ||
217 | */ | ||
218 | reg0 = apic_read(APIC_LVT0); | ||
219 | apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0); | ||
220 | reg1 = apic_read(APIC_LVT1); | ||
221 | apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); | ||
222 | |||
223 | return 1; | ||
224 | } | ||
225 | |||
226 | void __init sync_Arb_IDs(void) | ||
227 | { | ||
228 | /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ | ||
229 | unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
230 | if (ver >= 0x14) /* P4 or higher */ | ||
231 | return; | ||
232 | |||
233 | /* | ||
234 | * Wait for idle. | ||
235 | */ | ||
236 | apic_wait_icr_idle(); | ||
237 | |||
238 | apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); | ||
239 | apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | ||
240 | | APIC_DM_INIT); | ||
241 | } | ||
242 | |||
243 | extern void __error_in_apic_c (void); | ||
244 | |||
245 | /* | ||
246 | * An initial setup of the virtual wire mode. | ||
247 | */ | ||
248 | void __init init_bsp_APIC(void) | ||
249 | { | ||
250 | unsigned int value, ver; | ||
251 | |||
252 | /* | ||
253 | * Don't do the setup now if we have a SMP BIOS as the | ||
254 | * through-I/O-APIC virtual wire mode might be active. | ||
255 | */ | ||
256 | if (smp_found_config || !cpu_has_apic) | ||
257 | return; | ||
258 | |||
259 | value = apic_read(APIC_LVR); | ||
260 | ver = GET_APIC_VERSION(value); | ||
261 | |||
262 | /* | ||
263 | * Do not trust the local APIC being empty at bootup. | ||
264 | */ | ||
265 | clear_local_APIC(); | ||
266 | |||
267 | /* | ||
268 | * Enable APIC. | ||
269 | */ | ||
270 | value = apic_read(APIC_SPIV); | ||
271 | value &= ~APIC_VECTOR_MASK; | ||
272 | value |= APIC_SPIV_APIC_ENABLED; | ||
273 | value |= APIC_SPIV_FOCUS_DISABLED; | ||
274 | value |= SPURIOUS_APIC_VECTOR; | ||
275 | apic_write_around(APIC_SPIV, value); | ||
276 | |||
277 | /* | ||
278 | * Set up the virtual wire mode. | ||
279 | */ | ||
280 | apic_write_around(APIC_LVT0, APIC_DM_EXTINT); | ||
281 | value = APIC_DM_NMI; | ||
282 | if (!APIC_INTEGRATED(ver)) /* 82489DX */ | ||
283 | value |= APIC_LVT_LEVEL_TRIGGER; | ||
284 | apic_write_around(APIC_LVT1, value); | ||
285 | } | ||
286 | |||
287 | void __init setup_local_APIC (void) | ||
288 | { | ||
289 | unsigned int value, ver, maxlvt; | ||
290 | |||
291 | /* Pound the ESR really hard over the head with a big hammer - mbligh */ | ||
292 | if (esr_disable) { | ||
293 | apic_write(APIC_ESR, 0); | ||
294 | apic_write(APIC_ESR, 0); | ||
295 | apic_write(APIC_ESR, 0); | ||
296 | apic_write(APIC_ESR, 0); | ||
297 | } | ||
298 | |||
299 | value = apic_read(APIC_LVR); | ||
300 | ver = GET_APIC_VERSION(value); | ||
301 | |||
302 | if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) | ||
303 | __error_in_apic_c(); | ||
304 | |||
305 | /* | ||
306 | * Double-check whether this APIC is really registered. | ||
307 | * This is meaningless in clustered apic mode, so we skip it. | ||
308 | */ | ||
309 | if (!apic_id_registered()) | ||
310 | BUG(); | ||
311 | |||
312 | /* | ||
313 | * Intel recommends to set DFR, LDR and TPR before enabling | ||
314 | * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel | ||
315 | * document number 292116). So here it goes... | ||
316 | */ | ||
317 | init_apic_ldr(); | ||
318 | |||
319 | /* | ||
320 | * Set Task Priority to 'accept all'. We never change this | ||
321 | * later on. | ||
322 | */ | ||
323 | value = apic_read(APIC_TASKPRI); | ||
324 | value &= ~APIC_TPRI_MASK; | ||
325 | apic_write_around(APIC_TASKPRI, value); | ||
326 | |||
327 | /* | ||
328 | * Now that we are all set up, enable the APIC | ||
329 | */ | ||
330 | value = apic_read(APIC_SPIV); | ||
331 | value &= ~APIC_VECTOR_MASK; | ||
332 | /* | ||
333 | * Enable APIC | ||
334 | */ | ||
335 | value |= APIC_SPIV_APIC_ENABLED; | ||
336 | |||
337 | /* | ||
338 | * Some unknown Intel IO/APIC (or APIC) errata is biting us with | ||
339 | * certain networking cards. If high frequency interrupts are | ||
340 | * happening on a particular IOAPIC pin, plus the IOAPIC routing | ||
341 | * entry is masked/unmasked at a high rate as well then sooner or | ||
342 | * later IOAPIC line gets 'stuck', no more interrupts are received | ||
343 | * from the device. If focus CPU is disabled then the hang goes | ||
344 | * away, oh well :-( | ||
345 | * | ||
346 | * [ This bug can be reproduced easily with a level-triggered | ||
347 | * PCI Ne2000 networking cards and PII/PIII processors, dual | ||
348 | * BX chipset. ] | ||
349 | */ | ||
350 | /* | ||
351 | * Actually disabling the focus CPU check just makes the hang less | ||
352 | * frequent as it makes the interrupt distributon model be more | ||
353 | * like LRU than MRU (the short-term load is more even across CPUs). | ||
354 | * See also the comment in end_level_ioapic_irq(). --macro | ||
355 | */ | ||
356 | #if 1 | ||
357 | /* Enable focus processor (bit==0) */ | ||
358 | value &= ~APIC_SPIV_FOCUS_DISABLED; | ||
359 | #else | ||
360 | /* Disable focus processor (bit==1) */ | ||
361 | value |= APIC_SPIV_FOCUS_DISABLED; | ||
362 | #endif | ||
363 | /* | ||
364 | * Set spurious IRQ vector | ||
365 | */ | ||
366 | value |= SPURIOUS_APIC_VECTOR; | ||
367 | apic_write_around(APIC_SPIV, value); | ||
368 | |||
369 | /* | ||
370 | * Set up LVT0, LVT1: | ||
371 | * | ||
372 | * set up through-local-APIC on the BP's LINT0. This is not | ||
373 | * strictly necessary in pure symmetric-IO mode, but sometimes | ||
374 | * we delegate interrupts to the 8259A. | ||
375 | */ | ||
376 | /* | ||
377 | * TODO: set up through-local-APIC from through-I/O-APIC? --macro | ||
378 | */ | ||
379 | value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; | ||
380 | if (!smp_processor_id() && (pic_mode || !value)) { | ||
381 | value = APIC_DM_EXTINT; | ||
382 | apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id()); | ||
383 | } else { | ||
384 | value = APIC_DM_EXTINT | APIC_LVT_MASKED; | ||
385 | apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id()); | ||
386 | } | ||
387 | apic_write_around(APIC_LVT0, value); | ||
388 | |||
389 | /* | ||
390 | * only the BP should see the LINT1 NMI signal, obviously. | ||
391 | */ | ||
392 | if (!smp_processor_id()) | ||
393 | value = APIC_DM_NMI; | ||
394 | else | ||
395 | value = APIC_DM_NMI | APIC_LVT_MASKED; | ||
396 | if (!APIC_INTEGRATED(ver)) /* 82489DX */ | ||
397 | value |= APIC_LVT_LEVEL_TRIGGER; | ||
398 | apic_write_around(APIC_LVT1, value); | ||
399 | |||
400 | if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */ | ||
401 | unsigned oldvalue; | ||
402 | maxlvt = get_maxlvt(); | ||
403 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ | ||
404 | apic_write(APIC_ESR, 0); | ||
405 | oldvalue = apic_read(APIC_ESR); | ||
406 | value = ERROR_APIC_VECTOR; // enables sending errors | ||
407 | apic_write_around(APIC_LVTERR, value); | ||
408 | /* | ||
409 | * spec says clear errors after enabling vector. | ||
410 | */ | ||
411 | if (maxlvt > 3) | ||
412 | apic_write(APIC_ESR, 0); | ||
413 | value = apic_read(APIC_ESR); | ||
414 | if (value != oldvalue) | ||
415 | apic_printk(APIC_VERBOSE, | ||
416 | "ESR value after enabling vector: %08x, after %08x\n", | ||
417 | oldvalue, value); | ||
418 | } else { | ||
419 | if (esr_disable) | ||
420 | /* | ||
421 | * Something untraceable is creating bad interrupts on | ||
422 | * secondary quads ... for the moment, just leave the | ||
423 | * ESR disabled - we can't do anything useful with the | ||
424 | * errors anyway - mbligh | ||
425 | */ | ||
426 | apic_printk(APIC_DEBUG, "Leaving ESR disabled.\n"); | ||
427 | else | ||
428 | apic_printk(APIC_DEBUG, "No ESR for 82489DX.\n"); | ||
429 | } | ||
430 | |||
431 | nmi_watchdog_default(); | ||
432 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
433 | setup_apic_nmi_watchdog(); | ||
434 | apic_pm_activate(); | ||
435 | } | ||
436 | |||
437 | #ifdef CONFIG_PM | ||
438 | |||
/*
 * Copy of the writable local APIC registers, captured by lapic_suspend()
 * and written back by lapic_resume().
 */
static struct {
	/*
	 * 'active' is true if the local APIC was enabled by us and not the
	 * BIOS; this signifies that we are also responsible for disabling
	 * it before entering apm/acpi suspend.
	 */
	int active;

	/* r/w apic fields, one per saved register */
	unsigned int apic_id;		/* APIC_ID */
	unsigned int apic_taskpri;	/* APIC_TASKPRI */
	unsigned int apic_ldr;		/* APIC_LDR */
	unsigned int apic_dfr;		/* APIC_DFR */
	unsigned int apic_spiv;		/* APIC_SPIV */
	unsigned int apic_lvtt;		/* APIC_LVTT */
	unsigned int apic_lvtpc;	/* APIC_LVTPC */
	unsigned int apic_lvt0;		/* APIC_LVT0 */
	unsigned int apic_lvt1;		/* APIC_LVT1 */
	unsigned int apic_lvterr;	/* APIC_LVTERR */
	unsigned int apic_tmict;	/* APIC_TMICT */
	unsigned int apic_tdcr;		/* APIC_TDCR */
	unsigned int apic_thmr;		/* APIC_LVTTHMR */
} apic_pm_state;
459 | |||
460 | static int lapic_suspend(struct sys_device *dev, u32 state) | ||
461 | { | ||
462 | unsigned long flags; | ||
463 | |||
464 | if (!apic_pm_state.active) | ||
465 | return 0; | ||
466 | |||
467 | apic_pm_state.apic_id = apic_read(APIC_ID); | ||
468 | apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); | ||
469 | apic_pm_state.apic_ldr = apic_read(APIC_LDR); | ||
470 | apic_pm_state.apic_dfr = apic_read(APIC_DFR); | ||
471 | apic_pm_state.apic_spiv = apic_read(APIC_SPIV); | ||
472 | apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); | ||
473 | apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); | ||
474 | apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); | ||
475 | apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); | ||
476 | apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); | ||
477 | apic_pm_state.apic_tmict = apic_read(APIC_TMICT); | ||
478 | apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); | ||
479 | apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); | ||
480 | local_save_flags(flags); | ||
481 | local_irq_disable(); | ||
482 | disable_local_APIC(); | ||
483 | local_irq_restore(flags); | ||
484 | return 0; | ||
485 | } | ||
486 | |||
487 | static int lapic_resume(struct sys_device *dev) | ||
488 | { | ||
489 | unsigned int l, h; | ||
490 | unsigned long flags; | ||
491 | |||
492 | if (!apic_pm_state.active) | ||
493 | return 0; | ||
494 | |||
495 | /* XXX: Pavel needs this for S3 resume, but can't explain why */ | ||
496 | set_fixmap_nocache(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); | ||
497 | |||
498 | local_irq_save(flags); | ||
499 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
500 | l &= ~MSR_IA32_APICBASE_BASE; | ||
501 | l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; | ||
502 | wrmsr(MSR_IA32_APICBASE, l, h); | ||
503 | apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); | ||
504 | apic_write(APIC_ID, apic_pm_state.apic_id); | ||
505 | apic_write(APIC_DFR, apic_pm_state.apic_dfr); | ||
506 | apic_write(APIC_LDR, apic_pm_state.apic_ldr); | ||
507 | apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); | ||
508 | apic_write(APIC_SPIV, apic_pm_state.apic_spiv); | ||
509 | apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); | ||
510 | apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); | ||
511 | apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); | ||
512 | apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); | ||
513 | apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); | ||
514 | apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); | ||
515 | apic_write(APIC_TMICT, apic_pm_state.apic_tmict); | ||
516 | apic_write(APIC_ESR, 0); | ||
517 | apic_read(APIC_ESR); | ||
518 | apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); | ||
519 | apic_write(APIC_ESR, 0); | ||
520 | apic_read(APIC_ESR); | ||
521 | local_irq_restore(flags); | ||
522 | return 0; | ||
523 | } | ||
524 | |||
525 | static struct sysdev_class lapic_sysclass = { | ||
526 | set_kset_name("lapic"), | ||
527 | .resume = lapic_resume, | ||
528 | .suspend = lapic_suspend, | ||
529 | }; | ||
530 | |||
531 | static struct sys_device device_lapic = { | ||
532 | .id = 0, | ||
533 | .cls = &lapic_sysclass, | ||
534 | }; | ||
535 | |||
536 | static void __init apic_pm_activate(void) | ||
537 | { | ||
538 | apic_pm_state.active = 1; | ||
539 | } | ||
540 | |||
541 | static int __init init_lapic_sysfs(void) | ||
542 | { | ||
543 | int error; | ||
544 | if (!cpu_has_apic) | ||
545 | return 0; | ||
546 | /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ | ||
547 | error = sysdev_class_register(&lapic_sysclass); | ||
548 | if (!error) | ||
549 | error = sysdev_register(&device_lapic); | ||
550 | return error; | ||
551 | } | ||
552 | device_initcall(init_lapic_sysfs); | ||
553 | |||
554 | #else /* CONFIG_PM */ | ||
555 | |||
/* Without CONFIG_PM there is no suspend/resume state to arm. */
static void apic_pm_activate(void)
{
}
557 | |||
558 | #endif /* CONFIG_PM */ | ||
559 | |||
560 | static int __init apic_set_verbosity(char *str) | ||
561 | { | ||
562 | if (strcmp("debug", str) == 0) | ||
563 | apic_verbosity = APIC_DEBUG; | ||
564 | else if (strcmp("verbose", str) == 0) | ||
565 | apic_verbosity = APIC_VERBOSE; | ||
566 | else | ||
567 | printk(KERN_WARNING "APIC Verbosity level %s not recognised" | ||
568 | " use apic=verbose or apic=debug", str); | ||
569 | |||
570 | return 0; | ||
571 | } | ||
572 | |||
573 | __setup("apic=", apic_set_verbosity); | ||
574 | |||
575 | /* | ||
576 | * Detect and enable local APICs on non-SMP boards. | ||
577 | * Original code written by Keir Fraser. | ||
578 | * On AMD64 we trust the BIOS - if it says no APIC it is likely | ||
579 | * not correctly set up (usually the APIC timer won't work etc.) | ||
580 | */ | ||
581 | |||
582 | static int __init detect_init_APIC (void) | ||
583 | { | ||
584 | if (!cpu_has_apic) { | ||
585 | printk(KERN_INFO "No local APIC present\n"); | ||
586 | return -1; | ||
587 | } | ||
588 | |||
589 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | ||
590 | boot_cpu_id = 0; | ||
591 | return 0; | ||
592 | } | ||
593 | |||
594 | void __init init_apic_mappings(void) | ||
595 | { | ||
596 | unsigned long apic_phys; | ||
597 | |||
598 | /* | ||
599 | * If no local APIC can be found then set up a fake all | ||
600 | * zeroes page to simulate the local APIC and another | ||
601 | * one for the IO-APIC. | ||
602 | */ | ||
603 | if (!smp_found_config && detect_init_APIC()) { | ||
604 | apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); | ||
605 | apic_phys = __pa(apic_phys); | ||
606 | } else | ||
607 | apic_phys = mp_lapic_addr; | ||
608 | |||
609 | set_fixmap_nocache(FIX_APIC_BASE, apic_phys); | ||
610 | apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); | ||
611 | |||
612 | /* | ||
613 | * Fetch the APIC ID of the BSP in case we have a | ||
614 | * default configuration (or the MP table is broken). | ||
615 | */ | ||
616 | if (boot_cpu_id == -1U) | ||
617 | boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); | ||
618 | |||
619 | #ifdef CONFIG_X86_IO_APIC | ||
620 | { | ||
621 | unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; | ||
622 | int i; | ||
623 | |||
624 | for (i = 0; i < nr_ioapics; i++) { | ||
625 | if (smp_found_config) { | ||
626 | ioapic_phys = mp_ioapics[i].mpc_apicaddr; | ||
627 | } else { | ||
628 | ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); | ||
629 | ioapic_phys = __pa(ioapic_phys); | ||
630 | } | ||
631 | set_fixmap_nocache(idx, ioapic_phys); | ||
632 | apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n", | ||
633 | __fix_to_virt(idx), ioapic_phys); | ||
634 | idx++; | ||
635 | } | ||
636 | } | ||
637 | #endif | ||
638 | } | ||
639 | |||
640 | /* | ||
641 | * This function sets up the local APIC timer, with a timeout of | ||
642 | * 'clocks' APIC bus clock. During calibration we actually call | ||
643 | * this function twice on the boot CPU, once with a bogus timeout | ||
644 | * value, second time for real. The other (noncalibrating) CPUs | ||
645 | * call this function only once, with the real, calibrated value. | ||
646 | * | ||
647 | * We do reads before writes even if unnecessary, to get around the | ||
648 | * P5 APIC double write bug. | ||
649 | */ | ||
650 | |||
651 | #define APIC_DIVISOR 16 | ||
652 | |||
653 | static void __setup_APIC_LVTT(unsigned int clocks) | ||
654 | { | ||
655 | unsigned int lvtt_value, tmp_value, ver; | ||
656 | |||
657 | ver = GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
658 | lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; | ||
659 | if (!APIC_INTEGRATED(ver)) | ||
660 | lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); | ||
661 | apic_write_around(APIC_LVTT, lvtt_value); | ||
662 | |||
663 | /* | ||
664 | * Divide PICLK by 16 | ||
665 | */ | ||
666 | tmp_value = apic_read(APIC_TDCR); | ||
667 | apic_write_around(APIC_TDCR, (tmp_value | ||
668 | & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | ||
669 | | APIC_TDR_DIV_16); | ||
670 | |||
671 | apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); | ||
672 | } | ||
673 | |||
674 | static void setup_APIC_timer(unsigned int clocks) | ||
675 | { | ||
676 | unsigned long flags; | ||
677 | |||
678 | local_irq_save(flags); | ||
679 | |||
680 | /* For some reasons this doesn't work on Simics, so fake it for now */ | ||
681 | if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) { | ||
682 | __setup_APIC_LVTT(clocks); | ||
683 | return; | ||
684 | } | ||
685 | |||
686 | /* wait for irq slice */ | ||
687 | if (vxtime.hpet_address) { | ||
688 | int trigger = hpet_readl(HPET_T0_CMP); | ||
689 | while (hpet_readl(HPET_COUNTER) >= trigger) | ||
690 | /* do nothing */ ; | ||
691 | while (hpet_readl(HPET_COUNTER) < trigger) | ||
692 | /* do nothing */ ; | ||
693 | } else { | ||
694 | int c1, c2; | ||
695 | outb_p(0x00, 0x43); | ||
696 | c2 = inb_p(0x40); | ||
697 | c2 |= inb_p(0x40) << 8; | ||
698 | do { | ||
699 | c1 = c2; | ||
700 | outb_p(0x00, 0x43); | ||
701 | c2 = inb_p(0x40); | ||
702 | c2 |= inb_p(0x40) << 8; | ||
703 | } while (c2 - c1 < 300); | ||
704 | } | ||
705 | |||
706 | __setup_APIC_LVTT(clocks); | ||
707 | |||
708 | local_irq_restore(flags); | ||
709 | } | ||
710 | |||
711 | /* | ||
712 | * In this function we calibrate APIC bus clocks to the external | ||
713 | * timer. Unfortunately we cannot use jiffies and the timer irq | ||
714 | * to calibrate, since some later bootup code depends on getting | ||
715 | * the first irq? Ugh. | ||
716 | * | ||
717 | * We want to do the calibration only once since we | ||
718 | * want to have local timer irqs synchronized. CPUs connected | ||
719 | * by the same APIC bus have the very same bus frequency. | ||
720 | * And we want to have irqs off anyways, no accidental | ||
721 | * APIC irq that way. | ||
722 | */ | ||
723 | |||
724 | #define TICK_COUNT 100000000 | ||
725 | |||
726 | static int __init calibrate_APIC_clock(void) | ||
727 | { | ||
728 | int apic, apic_start, tsc, tsc_start; | ||
729 | int result; | ||
730 | /* | ||
731 | * Put whatever arbitrary (but long enough) timeout | ||
732 | * value into the APIC clock, we just want to get the | ||
733 | * counter running for calibration. | ||
734 | */ | ||
735 | __setup_APIC_LVTT(1000000000); | ||
736 | |||
737 | apic_start = apic_read(APIC_TMCCT); | ||
738 | rdtscl(tsc_start); | ||
739 | |||
740 | do { | ||
741 | apic = apic_read(APIC_TMCCT); | ||
742 | rdtscl(tsc); | ||
743 | } while ((tsc - tsc_start) < TICK_COUNT && (apic - apic_start) < TICK_COUNT); | ||
744 | |||
745 | result = (apic_start - apic) * 1000L * cpu_khz / (tsc - tsc_start); | ||
746 | |||
747 | printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", | ||
748 | result / 1000 / 1000, result / 1000 % 1000); | ||
749 | |||
750 | return result * APIC_DIVISOR / HZ; | ||
751 | } | ||
752 | |||
753 | static unsigned int calibration_result; | ||
754 | |||
755 | void __init setup_boot_APIC_clock (void) | ||
756 | { | ||
757 | if (disable_apic_timer) { | ||
758 | printk(KERN_INFO "Disabling APIC timer\n"); | ||
759 | return; | ||
760 | } | ||
761 | |||
762 | printk(KERN_INFO "Using local APIC timer interrupts.\n"); | ||
763 | using_apic_timer = 1; | ||
764 | |||
765 | local_irq_disable(); | ||
766 | |||
767 | calibration_result = calibrate_APIC_clock(); | ||
768 | /* | ||
769 | * Now set up the timer for real. | ||
770 | */ | ||
771 | setup_APIC_timer(calibration_result); | ||
772 | |||
773 | local_irq_enable(); | ||
774 | } | ||
775 | |||
776 | void __init setup_secondary_APIC_clock(void) | ||
777 | { | ||
778 | local_irq_disable(); /* FIXME: Do we need this? --RR */ | ||
779 | setup_APIC_timer(calibration_result); | ||
780 | local_irq_enable(); | ||
781 | } | ||
782 | |||
783 | void __init disable_APIC_timer(void) | ||
784 | { | ||
785 | if (using_apic_timer) { | ||
786 | unsigned long v; | ||
787 | |||
788 | v = apic_read(APIC_LVTT); | ||
789 | apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); | ||
790 | } | ||
791 | } | ||
792 | |||
793 | void enable_APIC_timer(void) | ||
794 | { | ||
795 | if (using_apic_timer) { | ||
796 | unsigned long v; | ||
797 | |||
798 | v = apic_read(APIC_LVTT); | ||
799 | apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); | ||
800 | } | ||
801 | } | ||
802 | |||
803 | /* | ||
804 | * the frequency of the profiling timer can be changed | ||
805 | * by writing a multiplier value into /proc/profile. | ||
806 | */ | ||
807 | int setup_profiling_timer(unsigned int multiplier) | ||
808 | { | ||
809 | int i; | ||
810 | |||
811 | /* | ||
812 | * Sanity check. [at least 500 APIC cycles should be | ||
813 | * between APIC interrupts as a rule of thumb, to avoid | ||
814 | * irqs flooding us] | ||
815 | */ | ||
816 | if ( (!multiplier) || (calibration_result/multiplier < 500)) | ||
817 | return -EINVAL; | ||
818 | |||
819 | /* | ||
820 | * Set the new multiplier for each CPU. CPUs don't start using the | ||
821 | * new values until the next timer interrupt in which they do process | ||
822 | * accounting. At that time they also adjust their APIC timers | ||
823 | * accordingly. | ||
824 | */ | ||
825 | for (i = 0; i < NR_CPUS; ++i) | ||
826 | per_cpu(prof_multiplier, i) = multiplier; | ||
827 | |||
828 | return 0; | ||
829 | } | ||
830 | |||
831 | #undef APIC_DIVISOR | ||
832 | |||
833 | /* | ||
834 | * Local timer interrupt handler. It does both profiling and | ||
835 | * process statistics/rescheduling. | ||
836 | * | ||
837 | * We do profiling in every local tick, statistics/rescheduling | ||
838 | * happen only every 'profiling multiplier' ticks. The default | ||
839 | * multiplier is 1 and it can be changed by writing the new multiplier | ||
840 | * value into /proc/profile. | ||
841 | */ | ||
842 | |||
843 | void smp_local_timer_interrupt(struct pt_regs *regs) | ||
844 | { | ||
845 | int cpu = smp_processor_id(); | ||
846 | |||
847 | profile_tick(CPU_PROFILING, regs); | ||
848 | if (--per_cpu(prof_counter, cpu) <= 0) { | ||
849 | /* | ||
850 | * The multiplier may have changed since the last time we got | ||
851 | * to this point as a result of the user writing to | ||
852 | * /proc/profile. In this case we need to adjust the APIC | ||
853 | * timer accordingly. | ||
854 | * | ||
855 | * Interrupts are already masked off at this point. | ||
856 | */ | ||
857 | per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu); | ||
858 | if (per_cpu(prof_counter, cpu) != | ||
859 | per_cpu(prof_old_multiplier, cpu)) { | ||
860 | __setup_APIC_LVTT(calibration_result/ | ||
861 | per_cpu(prof_counter, cpu)); | ||
862 | per_cpu(prof_old_multiplier, cpu) = | ||
863 | per_cpu(prof_counter, cpu); | ||
864 | } | ||
865 | |||
866 | #ifdef CONFIG_SMP | ||
867 | update_process_times(user_mode(regs)); | ||
868 | #endif | ||
869 | } | ||
870 | |||
871 | /* | ||
872 | * We take the 'long' return path, and there every subsystem | ||
873 | * grabs the appropriate locks (kernel lock/ irq lock). | ||
874 | * | ||
875 | * we might want to decouple profiling from the 'long path', | ||
876 | * and do the profiling totally in assembly. | ||
877 | * | ||
878 | * Currently this isn't too much of an issue (performance wise), | ||
879 | * we can take more than 100K local irqs per second on a 100 MHz P5. | ||
880 | */ | ||
881 | } | ||
882 | |||
883 | /* | ||
884 | * Local APIC timer interrupt. This is the most natural way for doing | ||
885 | * local interrupts, but local timer interrupts can be emulated by | ||
886 | * broadcast interrupts too. [in case the hw doesn't support APIC timers] | ||
887 | * | ||
888 | * [ if a single-CPU system runs an SMP kernel then we call the local | ||
889 | * interrupt as well. Thus we cannot inline the local irq ... ] | ||
890 | */ | ||
891 | void smp_apic_timer_interrupt(struct pt_regs *regs) | ||
892 | { | ||
893 | /* | ||
894 | * the NMI deadlock-detector uses this. | ||
895 | */ | ||
896 | add_pda(apic_timer_irqs, 1); | ||
897 | |||
898 | /* | ||
899 | * NOTE! We'd better ACK the irq immediately, | ||
900 | * because timer handling can be slow. | ||
901 | */ | ||
902 | ack_APIC_irq(); | ||
903 | /* | ||
904 | * update_process_times() expects us to have done irq_enter(). | ||
905 | * Besides, if we don't timer interrupts ignore the global | ||
906 | * interrupt lock, which is the WrongThing (tm) to do. | ||
907 | */ | ||
908 | irq_enter(); | ||
909 | smp_local_timer_interrupt(regs); | ||
910 | irq_exit(); | ||
911 | } | ||
912 | |||
913 | /* | ||
914 | * oem_force_hpet_timer -- force HPET mode for some boxes. | ||
915 | * | ||
916 | * Thus far, the major user of this is IBM's Summit2 series: | ||
917 | * | ||
918 | * Clustered boxes may have unsynced TSC problems if they are | ||
919 | * multi-chassis. Use available data to take a good guess. | ||
920 | * If in doubt, go HPET. | ||
921 | */ | ||
922 | __init int oem_force_hpet_timer(void) | ||
923 | { | ||
924 | int i, clusters, zeros; | ||
925 | unsigned id; | ||
926 | DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); | ||
927 | |||
928 | bitmap_empty(clustermap, NUM_APIC_CLUSTERS); | ||
929 | |||
930 | for (i = 0; i < NR_CPUS; i++) { | ||
931 | id = bios_cpu_apicid[i]; | ||
932 | if (id != BAD_APICID) | ||
933 | __set_bit(APIC_CLUSTERID(id), clustermap); | ||
934 | } | ||
935 | |||
936 | /* Problem: Partially populated chassis may not have CPUs in some of | ||
937 | * the APIC clusters they have been allocated. Only present CPUs have | ||
938 | * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since | ||
939 | * clusters are allocated sequentially, count zeros only if they are | ||
940 | * bounded by ones. | ||
941 | */ | ||
942 | clusters = 0; | ||
943 | zeros = 0; | ||
944 | for (i = 0; i < NUM_APIC_CLUSTERS; i++) { | ||
945 | if (test_bit(i, clustermap)) { | ||
946 | clusters += 1 + zeros; | ||
947 | zeros = 0; | ||
948 | } else | ||
949 | ++zeros; | ||
950 | } | ||
951 | |||
952 | /* | ||
953 | * If clusters > 2, then should be multi-chassis. Return 1 for HPET. | ||
954 | * Else return 0 to use TSC. | ||
955 | * May have to revisit this when multi-core + hyperthreaded CPUs come | ||
956 | * out, but AFAIK this will work even for them. | ||
957 | */ | ||
958 | return (clusters > 2); | ||
959 | } | ||
960 | |||
961 | /* | ||
962 | * This interrupt should _never_ happen with our APIC/SMP architecture | ||
963 | */ | ||
964 | asmlinkage void smp_spurious_interrupt(void) | ||
965 | { | ||
966 | unsigned int v; | ||
967 | irq_enter(); | ||
968 | /* | ||
969 | * Check if this really is a spurious interrupt and ACK it | ||
970 | * if it is a vectored one. Just in case... | ||
971 | * Spurious interrupts should not be ACKed. | ||
972 | */ | ||
973 | v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); | ||
974 | if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) | ||
975 | ack_APIC_irq(); | ||
976 | |||
977 | #if 0 | ||
978 | static unsigned long last_warning; | ||
979 | static unsigned long skipped; | ||
980 | |||
981 | /* see sw-dev-man vol 3, chapter 7.4.13.5 */ | ||
982 | if (time_before(last_warning+30*HZ,jiffies)) { | ||
983 | printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n", | ||
984 | smp_processor_id(), skipped); | ||
985 | last_warning = jiffies; | ||
986 | skipped = 0; | ||
987 | } else { | ||
988 | skipped++; | ||
989 | } | ||
990 | #endif | ||
991 | irq_exit(); | ||
992 | } | ||
993 | |||
994 | /* | ||
995 | * This interrupt should never happen with our APIC/SMP architecture | ||
996 | */ | ||
997 | |||
998 | asmlinkage void smp_error_interrupt(void) | ||
999 | { | ||
1000 | unsigned int v, v1; | ||
1001 | |||
1002 | irq_enter(); | ||
1003 | /* First tickle the hardware, only then report what went on. -- REW */ | ||
1004 | v = apic_read(APIC_ESR); | ||
1005 | apic_write(APIC_ESR, 0); | ||
1006 | v1 = apic_read(APIC_ESR); | ||
1007 | ack_APIC_irq(); | ||
1008 | atomic_inc(&irq_err_count); | ||
1009 | |||
1010 | /* Here is what the APIC error bits mean: | ||
1011 | 0: Send CS error | ||
1012 | 1: Receive CS error | ||
1013 | 2: Send accept error | ||
1014 | 3: Receive accept error | ||
1015 | 4: Reserved | ||
1016 | 5: Send illegal vector | ||
1017 | 6: Received illegal vector | ||
1018 | 7: Illegal register address | ||
1019 | */ | ||
1020 | printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", | ||
1021 | smp_processor_id(), v , v1); | ||
1022 | irq_exit(); | ||
1023 | } | ||
1024 | |||
1025 | int disable_apic; | ||
1026 | |||
1027 | /* | ||
1028 | * This initializes the IO-APIC and APIC hardware if this is | ||
1029 | * a UP kernel. | ||
1030 | */ | ||
1031 | int __init APIC_init_uniprocessor (void) | ||
1032 | { | ||
1033 | if (disable_apic) { | ||
1034 | printk(KERN_INFO "Apic disabled\n"); | ||
1035 | return -1; | ||
1036 | } | ||
1037 | if (!cpu_has_apic) { | ||
1038 | disable_apic = 1; | ||
1039 | printk(KERN_INFO "Apic disabled by BIOS\n"); | ||
1040 | return -1; | ||
1041 | } | ||
1042 | |||
1043 | verify_local_APIC(); | ||
1044 | |||
1045 | connect_bsp_APIC(); | ||
1046 | |||
1047 | phys_cpu_present_map = physid_mask_of_physid(0); | ||
1048 | apic_write_around(APIC_ID, boot_cpu_id); | ||
1049 | |||
1050 | setup_local_APIC(); | ||
1051 | |||
1052 | #ifdef CONFIG_X86_IO_APIC | ||
1053 | if (smp_found_config && !skip_ioapic_setup && nr_ioapics) | ||
1054 | setup_IO_APIC(); | ||
1055 | else | ||
1056 | nr_ioapics = 0; | ||
1057 | #endif | ||
1058 | setup_boot_APIC_clock(); | ||
1059 | |||
1060 | return 0; | ||
1061 | } | ||
1062 | |||
1063 | static __init int setup_disableapic(char *str) | ||
1064 | { | ||
1065 | disable_apic = 1; | ||
1066 | return 0; | ||
1067 | } | ||
1068 | |||
1069 | static __init int setup_nolapic(char *str) | ||
1070 | { | ||
1071 | disable_apic = 1; | ||
1072 | return 0; | ||
1073 | } | ||
1074 | |||
1075 | static __init int setup_noapictimer(char *str) | ||
1076 | { | ||
1077 | disable_apic_timer = 1; | ||
1078 | return 0; | ||
1079 | } | ||
1080 | |||
1081 | /* dummy parsing: see setup.c */ | ||
1082 | |||
1083 | __setup("disableapic", setup_disableapic); | ||
1084 | __setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */ | ||
1085 | |||
1086 | __setup("noapictimer", setup_noapictimer); | ||
1087 | |||
1088 | /* no "lapic" flag - we only use the lapic when the BIOS tells us so. */ | ||
diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c new file mode 100644 index 000000000000..35b4c3fcbb37 --- /dev/null +++ b/arch/x86_64/kernel/asm-offsets.c | |||
@@ -0,0 +1,69 @@ | |||
1 | /* | ||
2 | * Generate definitions needed by assembly language modules. | ||
3 | * This code generates raw asm output which is post-processed to extract | ||
4 | * and format the required data. | ||
5 | */ | ||
6 | |||
7 | #include <linux/sched.h> | ||
8 | #include <linux/stddef.h> | ||
9 | #include <linux/errno.h> | ||
10 | #include <linux/hardirq.h> | ||
11 | #include <linux/suspend.h> | ||
12 | #include <asm/pda.h> | ||
13 | #include <asm/processor.h> | ||
14 | #include <asm/segment.h> | ||
15 | #include <asm/thread_info.h> | ||
16 | #include <asm/ia32.h> | ||
17 | |||
/*
 * DEFINE(sym, val): emit a "->sym <value> <val text>" marker line into
 * the generated assembly; 'val' must be a compile-time constant ("i"
 * constraint).
 */
#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

/* BLANK(): emit a bare "->" marker line. */
#define BLANK() asm volatile("\n->" : : )
22 | |||
23 | int main(void) | ||
24 | { | ||
25 | #define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) | ||
26 | ENTRY(state); | ||
27 | ENTRY(flags); | ||
28 | ENTRY(thread); | ||
29 | ENTRY(pid); | ||
30 | BLANK(); | ||
31 | #undef ENTRY | ||
32 | #define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry)) | ||
33 | ENTRY(flags); | ||
34 | ENTRY(addr_limit); | ||
35 | ENTRY(preempt_count); | ||
36 | BLANK(); | ||
37 | #undef ENTRY | ||
38 | #define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) | ||
39 | ENTRY(kernelstack); | ||
40 | ENTRY(oldrsp); | ||
41 | ENTRY(pcurrent); | ||
42 | ENTRY(irqrsp); | ||
43 | ENTRY(irqcount); | ||
44 | ENTRY(cpunumber); | ||
45 | ENTRY(irqstackptr); | ||
46 | BLANK(); | ||
47 | #undef ENTRY | ||
48 | #ifdef CONFIG_IA32_EMULATION | ||
49 | #define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) | ||
50 | ENTRY(eax); | ||
51 | ENTRY(ebx); | ||
52 | ENTRY(ecx); | ||
53 | ENTRY(edx); | ||
54 | ENTRY(esi); | ||
55 | ENTRY(edi); | ||
56 | ENTRY(ebp); | ||
57 | ENTRY(esp); | ||
58 | ENTRY(eip); | ||
59 | BLANK(); | ||
60 | #undef ENTRY | ||
61 | DEFINE(IA32_RT_SIGFRAME_sigcontext, | ||
62 | offsetof (struct rt_sigframe32, uc.uc_mcontext)); | ||
63 | BLANK(); | ||
64 | #endif | ||
65 | DEFINE(pbe_address, offsetof(struct pbe, address)); | ||
66 | DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); | ||
67 | DEFINE(pbe_next, offsetof(struct pbe, next)); | ||
68 | return 0; | ||
69 | } | ||
diff --git a/arch/x86_64/kernel/cpufreq/Kconfig b/arch/x86_64/kernel/cpufreq/Kconfig new file mode 100644 index 000000000000..81f1562e5393 --- /dev/null +++ b/arch/x86_64/kernel/cpufreq/Kconfig | |||
@@ -0,0 +1,96 @@ | |||
1 | # | ||
2 | # CPU Frequency scaling | ||
3 | # | ||
4 | |||
5 | menu "CPU Frequency scaling" | ||
6 | |||
7 | source "drivers/cpufreq/Kconfig" | ||
8 | |||
9 | if CPU_FREQ | ||
10 | |||
11 | comment "CPUFreq processor drivers" | ||
12 | |||
13 | config X86_POWERNOW_K8 | ||
14 | tristate "AMD Opteron/Athlon64 PowerNow!" | ||
15 | select CPU_FREQ_TABLE | ||
16 | help | ||
17 | This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors. | ||
18 | |||
19 | For details, take a look at <file:Documentation/cpu-freq/>. | ||
20 | |||
21 | If in doubt, say N. | ||
22 | |||
23 | config X86_POWERNOW_K8_ACPI | ||
24 | bool | ||
25 | depends on X86_POWERNOW_K8 && ACPI_PROCESSOR | ||
26 | depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m) | ||
27 | default y | ||
28 | |||
29 | config X86_SPEEDSTEP_CENTRINO | ||
30 | tristate "Intel Enhanced SpeedStep" | ||
31 | select CPU_FREQ_TABLE | ||
32 | depends on ACPI_PROCESSOR | ||
33 | help | ||
34 | This adds the CPUFreq driver for Enhanced SpeedStep enabled | ||
35 | mobile CPUs. This means Intel Pentium M (Centrino) CPUs | ||
36 | or 64bit enabled Intel Xeons. | ||
37 | |||
38 | For details, take a look at <file:Documentation/cpu-freq/>. | ||
39 | |||
40 | If in doubt, say N. | ||
41 | |||
42 | config X86_SPEEDSTEP_CENTRINO_ACPI | ||
43 | bool | ||
44 | depends on X86_SPEEDSTEP_CENTRINO | ||
45 | default y | ||
46 | |||
47 | config X86_ACPI_CPUFREQ | ||
48 | tristate "ACPI Processor P-States driver" | ||
49 | depends on ACPI_PROCESSOR | ||
50 | help | ||
51 | This driver adds a CPUFreq driver which utilizes the ACPI | ||
52 | Processor Performance States. | ||
53 | |||
54 | For details, take a look at <file:Documentation/cpu-freq/>. | ||
55 | |||
56 | If in doubt, say N. | ||
57 | |||
58 | comment "shared options" | ||
59 | |||
60 | config X86_ACPI_CPUFREQ_PROC_INTF | ||
61 | bool "/proc/acpi/processor/../performance interface (deprecated)" | ||
62 | depends on PROC_FS | ||
63 | depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K8_ACPI | ||
64 | help | ||
65 | This enables the deprecated /proc/acpi/processor/../performance | ||
66 | interface. While it is helpful for debugging, the generic, | ||
67 | cross-architecture cpufreq interfaces should be used. | ||
68 | |||
69 | If in doubt, say N. | ||
70 | |||
71 | config X86_P4_CLOCKMOD | ||
72 | tristate "Intel Pentium 4 clock modulation" | ||
73 | depends on EMBEDDED | ||
74 | help | ||
75 | This adds the clock modulation driver for Intel Pentium 4 / XEON | ||
76 | processors. When enabled it will lower CPU temperature by skipping | ||
77 | clocks. | ||
78 | |||
79 | This driver should be only used in exceptional | ||
80 | circumstances when very low power is needed because it causes severe | ||
81 | slowdowns and noticeable latencies. Normally Speedstep should be used | ||
82 | instead. | ||
83 | |||
84 | For details, take a look at <file:Documentation/cpu-freq/>. | ||
85 | |||
86 | Unless you are absolutely sure say N. | ||
87 | |||
88 | |||
89 | config X86_SPEEDSTEP_LIB | ||
90 | tristate | ||
91 | default X86_P4_CLOCKMOD | ||
92 | |||
93 | endif | ||
94 | |||
95 | endmenu | ||
96 | |||
diff --git a/arch/x86_64/kernel/cpufreq/Makefile b/arch/x86_64/kernel/cpufreq/Makefile new file mode 100644 index 000000000000..d8b593879224 --- /dev/null +++ b/arch/x86_64/kernel/cpufreq/Makefile | |||
@@ -0,0 +1,17 @@ | |||
1 | # | ||
2 | # Reuse the i386 cpufreq drivers | ||
3 | # | ||
4 | |||
5 | SRCDIR := ../../../i386/kernel/cpu/cpufreq | ||
6 | |||
7 | obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o | ||
8 | obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o | ||
9 | obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o | ||
10 | obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o | ||
11 | obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o | ||
12 | |||
13 | powernow-k8-objs := ${SRCDIR}/powernow-k8.o | ||
14 | speedstep-centrino-objs := ${SRCDIR}/speedstep-centrino.o | ||
15 | acpi-cpufreq-objs := ${SRCDIR}/acpi-cpufreq.o | ||
16 | p4-clockmod-objs := ${SRCDIR}/p4-clockmod.o | ||
17 | speedstep-lib-objs := ${SRCDIR}/speedstep-lib.o | ||
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c new file mode 100644 index 000000000000..657003e461e6 --- /dev/null +++ b/arch/x86_64/kernel/e820.c | |||
@@ -0,0 +1,513 @@ | |||
1 | /* | ||
2 | * Handle the memory map. | ||
3 | * The functions here do the job until bootmem takes over. | ||
4 | * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $ | ||
5 | */ | ||
6 | #include <linux/config.h> | ||
7 | #include <linux/kernel.h> | ||
8 | #include <linux/types.h> | ||
9 | #include <linux/init.h> | ||
10 | #include <linux/bootmem.h> | ||
11 | #include <linux/ioport.h> | ||
12 | #include <linux/string.h> | ||
13 | #include <asm/page.h> | ||
14 | #include <asm/e820.h> | ||
15 | #include <asm/proto.h> | ||
16 | #include <asm/bootsetup.h> | ||
17 | |||
18 | extern char _end[]; | ||
19 | |||
20 | /* | ||
21 | * PFN of last memory page. | ||
22 | */ | ||
23 | unsigned long end_pfn; | ||
24 | |||
25 | /* | ||
26 | * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. | ||
27 | * The direct mapping extends to end_pfn_map, so that we can directly access | ||
28 | * apertures, ACPI and other tables without having to play with fixmaps. | ||
29 | */ | ||
30 | unsigned long end_pfn_map; | ||
31 | |||
32 | /* | ||
33 | * Last pfn which the user wants to use. | ||
34 | */ | ||
35 | unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT; | ||
36 | |||
37 | extern struct resource code_resource, data_resource; | ||
38 | |||
39 | /* Check for some hardcoded bad areas that early boot is not allowed to touch */ | ||
40 | static inline int bad_addr(unsigned long *addrp, unsigned long size) | ||
41 | { | ||
42 | unsigned long addr = *addrp, last = addr + size; | ||
43 | |||
44 | /* various gunk below that needed for SMP startup */ | ||
45 | if (addr < 0x8000) { | ||
46 | *addrp = 0x8000; | ||
47 | return 1; | ||
48 | } | ||
49 | |||
50 | /* direct mapping tables of the kernel */ | ||
51 | if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { | ||
52 | *addrp = table_end << PAGE_SHIFT; | ||
53 | return 1; | ||
54 | } | ||
55 | |||
56 | /* initrd */ | ||
57 | #ifdef CONFIG_BLK_DEV_INITRD | ||
58 | if (LOADER_TYPE && INITRD_START && last >= INITRD_START && | ||
59 | addr < INITRD_START+INITRD_SIZE) { | ||
60 | *addrp = INITRD_START + INITRD_SIZE; | ||
61 | return 1; | ||
62 | } | ||
63 | #endif | ||
64 | /* kernel code + 640k memory hole (later should not be needed, but | ||
65 | be paranoid for now) */ | ||
66 | if (last >= 640*1024 && addr < __pa_symbol(&_end)) { | ||
67 | *addrp = __pa_symbol(&_end); | ||
68 | return 1; | ||
69 | } | ||
70 | /* XXX ramdisk image here? */ | ||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) | ||
75 | { | ||
76 | int i; | ||
77 | for (i = 0; i < e820.nr_map; i++) { | ||
78 | struct e820entry *ei = &e820.map[i]; | ||
79 | if (type && ei->type != type) | ||
80 | continue; | ||
81 | if (ei->addr >= end || ei->addr + ei->size < start) | ||
82 | continue; | ||
83 | return 1; | ||
84 | } | ||
85 | return 0; | ||
86 | } | ||
87 | |||
88 | /* | ||
89 | * Find a free area in a specific range. | ||
90 | */ | ||
91 | unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) | ||
92 | { | ||
93 | int i; | ||
94 | for (i = 0; i < e820.nr_map; i++) { | ||
95 | struct e820entry *ei = &e820.map[i]; | ||
96 | unsigned long addr = ei->addr, last; | ||
97 | if (ei->type != E820_RAM) | ||
98 | continue; | ||
99 | if (addr < start) | ||
100 | addr = start; | ||
101 | if (addr > ei->addr + ei->size) | ||
102 | continue; | ||
103 | while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size) | ||
104 | ; | ||
105 | last = addr + size; | ||
106 | if (last > ei->addr + ei->size) | ||
107 | continue; | ||
108 | if (last > end) | ||
109 | continue; | ||
110 | return addr; | ||
111 | } | ||
112 | return -1UL; | ||
113 | } | ||
114 | |||
115 | /* | ||
116 | * Free bootmem based on the e820 table for a node. | ||
117 | */ | ||
118 | void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end) | ||
119 | { | ||
120 | int i; | ||
121 | for (i = 0; i < e820.nr_map; i++) { | ||
122 | struct e820entry *ei = &e820.map[i]; | ||
123 | unsigned long last, addr; | ||
124 | |||
125 | if (ei->type != E820_RAM || | ||
126 | ei->addr+ei->size <= start || | ||
127 | ei->addr > end) | ||
128 | continue; | ||
129 | |||
130 | addr = round_up(ei->addr, PAGE_SIZE); | ||
131 | if (addr < start) | ||
132 | addr = start; | ||
133 | |||
134 | last = round_down(ei->addr + ei->size, PAGE_SIZE); | ||
135 | if (last >= end) | ||
136 | last = end; | ||
137 | |||
138 | if (last > addr && last-addr >= PAGE_SIZE) | ||
139 | free_bootmem_node(pgdat, addr, last-addr); | ||
140 | } | ||
141 | } | ||
142 | |||
143 | /* | ||
144 | * Find the highest page frame number we have available | ||
145 | */ | ||
146 | unsigned long __init e820_end_of_ram(void) | ||
147 | { | ||
148 | int i; | ||
149 | unsigned long end_pfn = 0; | ||
150 | |||
151 | for (i = 0; i < e820.nr_map; i++) { | ||
152 | struct e820entry *ei = &e820.map[i]; | ||
153 | unsigned long start, end; | ||
154 | |||
155 | start = round_up(ei->addr, PAGE_SIZE); | ||
156 | end = round_down(ei->addr + ei->size, PAGE_SIZE); | ||
157 | if (start >= end) | ||
158 | continue; | ||
159 | if (ei->type == E820_RAM) { | ||
160 | if (end > end_pfn<<PAGE_SHIFT) | ||
161 | end_pfn = end>>PAGE_SHIFT; | ||
162 | } else { | ||
163 | if (end > end_pfn_map<<PAGE_SHIFT) | ||
164 | end_pfn_map = end>>PAGE_SHIFT; | ||
165 | } | ||
166 | } | ||
167 | |||
168 | if (end_pfn > end_pfn_map) | ||
169 | end_pfn_map = end_pfn; | ||
170 | if (end_pfn_map > MAXMEM>>PAGE_SHIFT) | ||
171 | end_pfn_map = MAXMEM>>PAGE_SHIFT; | ||
172 | if (end_pfn > end_user_pfn) | ||
173 | end_pfn = end_user_pfn; | ||
174 | if (end_pfn > end_pfn_map) | ||
175 | end_pfn = end_pfn_map; | ||
176 | |||
177 | return end_pfn; | ||
178 | } | ||
179 | |||
180 | /* | ||
181 | * Mark e820 reserved areas as busy for the resource manager. | ||
182 | */ | ||
183 | void __init e820_reserve_resources(void) | ||
184 | { | ||
185 | int i; | ||
186 | for (i = 0; i < e820.nr_map; i++) { | ||
187 | struct resource *res; | ||
188 | if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) | ||
189 | continue; | ||
190 | res = alloc_bootmem_low(sizeof(struct resource)); | ||
191 | switch (e820.map[i].type) { | ||
192 | case E820_RAM: res->name = "System RAM"; break; | ||
193 | case E820_ACPI: res->name = "ACPI Tables"; break; | ||
194 | case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; | ||
195 | default: res->name = "reserved"; | ||
196 | } | ||
197 | res->start = e820.map[i].addr; | ||
198 | res->end = res->start + e820.map[i].size - 1; | ||
199 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | ||
200 | request_resource(&iomem_resource, res); | ||
201 | if (e820.map[i].type == E820_RAM) { | ||
202 | /* | ||
203 | * We don't know which RAM region contains kernel data, | ||
204 | * so we try it repeatedly and let the resource manager | ||
205 | * test it. | ||
206 | */ | ||
207 | request_resource(res, &code_resource); | ||
208 | request_resource(res, &data_resource); | ||
209 | } | ||
210 | } | ||
211 | } | ||
212 | |||
213 | /* | ||
214 | * Add a memory region to the kernel e820 map. | ||
215 | */ | ||
216 | void __init add_memory_region(unsigned long start, unsigned long size, int type) | ||
217 | { | ||
218 | int x = e820.nr_map; | ||
219 | |||
220 | if (x == E820MAX) { | ||
221 | printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); | ||
222 | return; | ||
223 | } | ||
224 | |||
225 | e820.map[x].addr = start; | ||
226 | e820.map[x].size = size; | ||
227 | e820.map[x].type = type; | ||
228 | e820.nr_map++; | ||
229 | } | ||
230 | |||
231 | void __init e820_print_map(char *who) | ||
232 | { | ||
233 | int i; | ||
234 | |||
235 | for (i = 0; i < e820.nr_map; i++) { | ||
236 | printk(" %s: %016Lx - %016Lx ", who, | ||
237 | (unsigned long long) e820.map[i].addr, | ||
238 | (unsigned long long) (e820.map[i].addr + e820.map[i].size)); | ||
239 | switch (e820.map[i].type) { | ||
240 | case E820_RAM: printk("(usable)\n"); | ||
241 | break; | ||
242 | case E820_RESERVED: | ||
243 | printk("(reserved)\n"); | ||
244 | break; | ||
245 | case E820_ACPI: | ||
246 | printk("(ACPI data)\n"); | ||
247 | break; | ||
248 | case E820_NVS: | ||
249 | printk("(ACPI NVS)\n"); | ||
250 | break; | ||
251 | default: printk("type %u\n", e820.map[i].type); | ||
252 | break; | ||
253 | } | ||
254 | } | ||
255 | } | ||
256 | |||
257 | /* | ||
258 | * Sanitize the BIOS e820 map. | ||
259 | * | ||
260 | * Some e820 responses include overlapping entries. The following | ||
261 | * replaces the original e820 map with a new one, removing overlaps. | ||
262 | * | ||
263 | */ | ||
264 | static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) | ||
265 | { | ||
266 | struct change_member { | ||
267 | struct e820entry *pbios; /* pointer to original bios entry */ | ||
268 | unsigned long long addr; /* address for this change point */ | ||
269 | }; | ||
270 | static struct change_member change_point_list[2*E820MAX] __initdata; | ||
271 | static struct change_member *change_point[2*E820MAX] __initdata; | ||
272 | static struct e820entry *overlap_list[E820MAX] __initdata; | ||
273 | static struct e820entry new_bios[E820MAX] __initdata; | ||
274 | struct change_member *change_tmp; | ||
275 | unsigned long current_type, last_type; | ||
276 | unsigned long long last_addr; | ||
277 | int chgidx, still_changing; | ||
278 | int overlap_entries; | ||
279 | int new_bios_entry; | ||
280 | int old_nr, new_nr; | ||
281 | int i; | ||
282 | |||
283 | /* | ||
284 | Visually we're performing the following (1,2,3,4 = memory types)... | ||
285 | |||
286 | Sample memory map (w/overlaps): | ||
287 | ____22__________________ | ||
288 | ______________________4_ | ||
289 | ____1111________________ | ||
290 | _44_____________________ | ||
291 | 11111111________________ | ||
292 | ____________________33__ | ||
293 | ___________44___________ | ||
294 | __________33333_________ | ||
295 | ______________22________ | ||
296 | ___________________2222_ | ||
297 | _________111111111______ | ||
298 | _____________________11_ | ||
299 | _________________4______ | ||
300 | |||
301 | Sanitized equivalent (no overlap): | ||
302 | 1_______________________ | ||
303 | _44_____________________ | ||
304 | ___1____________________ | ||
305 | ____22__________________ | ||
306 | ______11________________ | ||
307 | _________1______________ | ||
308 | __________3_____________ | ||
309 | ___________44___________ | ||
310 | _____________33_________ | ||
311 | _______________2________ | ||
312 | ________________1_______ | ||
313 | _________________4______ | ||
314 | ___________________2____ | ||
315 | ____________________33__ | ||
316 | ______________________4_ | ||
317 | */ | ||
318 | |||
319 | /* if there's only one memory region, don't bother */ | ||
320 | if (*pnr_map < 2) | ||
321 | return -1; | ||
322 | |||
323 | old_nr = *pnr_map; | ||
324 | |||
325 | /* bail out if we find any unreasonable addresses in bios map */ | ||
326 | for (i=0; i<old_nr; i++) | ||
327 | if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) | ||
328 | return -1; | ||
329 | |||
330 | /* create pointers for initial change-point information (for sorting) */ | ||
331 | for (i=0; i < 2*old_nr; i++) | ||
332 | change_point[i] = &change_point_list[i]; | ||
333 | |||
334 | /* record all known change-points (starting and ending addresses) */ | ||
335 | chgidx = 0; | ||
336 | for (i=0; i < old_nr; i++) { | ||
337 | change_point[chgidx]->addr = biosmap[i].addr; | ||
338 | change_point[chgidx++]->pbios = &biosmap[i]; | ||
339 | change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; | ||
340 | change_point[chgidx++]->pbios = &biosmap[i]; | ||
341 | } | ||
342 | |||
343 | /* sort change-point list by memory addresses (low -> high) */ | ||
344 | still_changing = 1; | ||
345 | while (still_changing) { | ||
346 | still_changing = 0; | ||
347 | for (i=1; i < 2*old_nr; i++) { | ||
348 | /* if <current_addr> > <last_addr>, swap */ | ||
349 | /* or, if current=<start_addr> & last=<end_addr>, swap */ | ||
350 | if ((change_point[i]->addr < change_point[i-1]->addr) || | ||
351 | ((change_point[i]->addr == change_point[i-1]->addr) && | ||
352 | (change_point[i]->addr == change_point[i]->pbios->addr) && | ||
353 | (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) | ||
354 | ) | ||
355 | { | ||
356 | change_tmp = change_point[i]; | ||
357 | change_point[i] = change_point[i-1]; | ||
358 | change_point[i-1] = change_tmp; | ||
359 | still_changing=1; | ||
360 | } | ||
361 | } | ||
362 | } | ||
363 | |||
364 | /* create a new bios memory map, removing overlaps */ | ||
365 | overlap_entries=0; /* number of entries in the overlap table */ | ||
366 | new_bios_entry=0; /* index for creating new bios map entries */ | ||
367 | last_type = 0; /* start with undefined memory type */ | ||
368 | last_addr = 0; /* start with 0 as last starting address */ | ||
369 | /* loop through change-points, determining affect on the new bios map */ | ||
370 | for (chgidx=0; chgidx < 2*old_nr; chgidx++) | ||
371 | { | ||
372 | /* keep track of all overlapping bios entries */ | ||
373 | if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) | ||
374 | { | ||
375 | /* add map entry to overlap list (> 1 entry implies an overlap) */ | ||
376 | overlap_list[overlap_entries++]=change_point[chgidx]->pbios; | ||
377 | } | ||
378 | else | ||
379 | { | ||
380 | /* remove entry from list (order independent, so swap with last) */ | ||
381 | for (i=0; i<overlap_entries; i++) | ||
382 | { | ||
383 | if (overlap_list[i] == change_point[chgidx]->pbios) | ||
384 | overlap_list[i] = overlap_list[overlap_entries-1]; | ||
385 | } | ||
386 | overlap_entries--; | ||
387 | } | ||
388 | /* if there are overlapping entries, decide which "type" to use */ | ||
389 | /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ | ||
390 | current_type = 0; | ||
391 | for (i=0; i<overlap_entries; i++) | ||
392 | if (overlap_list[i]->type > current_type) | ||
393 | current_type = overlap_list[i]->type; | ||
394 | /* continue building up new bios map based on this information */ | ||
395 | if (current_type != last_type) { | ||
396 | if (last_type != 0) { | ||
397 | new_bios[new_bios_entry].size = | ||
398 | change_point[chgidx]->addr - last_addr; | ||
399 | /* move forward only if the new size was non-zero */ | ||
400 | if (new_bios[new_bios_entry].size != 0) | ||
401 | if (++new_bios_entry >= E820MAX) | ||
402 | break; /* no more space left for new bios entries */ | ||
403 | } | ||
404 | if (current_type != 0) { | ||
405 | new_bios[new_bios_entry].addr = change_point[chgidx]->addr; | ||
406 | new_bios[new_bios_entry].type = current_type; | ||
407 | last_addr=change_point[chgidx]->addr; | ||
408 | } | ||
409 | last_type = current_type; | ||
410 | } | ||
411 | } | ||
412 | new_nr = new_bios_entry; /* retain count for new bios entries */ | ||
413 | |||
414 | /* copy new bios mapping into original location */ | ||
415 | memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); | ||
416 | *pnr_map = new_nr; | ||
417 | |||
418 | return 0; | ||
419 | } | ||
420 | |||
421 | /* | ||
422 | * Copy the BIOS e820 map into a safe place. | ||
423 | * | ||
424 | * Sanity-check it while we're at it.. | ||
425 | * | ||
426 | * If we're lucky and live on a modern system, the setup code | ||
427 | * will have given us a memory map that we can use to properly | ||
428 | * set up memory. If we aren't, we'll fake a memory map. | ||
429 | * | ||
430 | * We check to see that the memory map contains at least 2 elements | ||
431 | * before we'll use it, because the detection code in setup.S may | ||
432 | * not be perfect and most every PC known to man has two memory | ||
433 | * regions: one from 0 to 640k, and one from 1mb up. (The IBM | ||
434 | * thinkpad 560x, for example, does not cooperate with the memory | ||
435 | * detection code.) | ||
436 | */ | ||
437 | static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) | ||
438 | { | ||
439 | /* Only one memory region (or negative)? Ignore it */ | ||
440 | if (nr_map < 2) | ||
441 | return -1; | ||
442 | |||
443 | do { | ||
444 | unsigned long start = biosmap->addr; | ||
445 | unsigned long size = biosmap->size; | ||
446 | unsigned long end = start + size; | ||
447 | unsigned long type = biosmap->type; | ||
448 | |||
449 | /* Overflow in 64 bits? Ignore the memory map. */ | ||
450 | if (start > end) | ||
451 | return -1; | ||
452 | |||
453 | /* | ||
454 | * Some BIOSes claim RAM in the 640k - 1M region. | ||
455 | * Not right. Fix it up. | ||
456 | * | ||
457 | * This should be removed on Hammer which is supposed to not | ||
458 | * have non e820 covered ISA mappings there, but I had some strange | ||
459 | * problems so it stays for now. -AK | ||
460 | */ | ||
461 | if (type == E820_RAM) { | ||
462 | if (start < 0x100000ULL && end > 0xA0000ULL) { | ||
463 | if (start < 0xA0000ULL) | ||
464 | add_memory_region(start, 0xA0000ULL-start, type); | ||
465 | if (end <= 0x100000ULL) | ||
466 | continue; | ||
467 | start = 0x100000ULL; | ||
468 | size = end - start; | ||
469 | } | ||
470 | } | ||
471 | |||
472 | add_memory_region(start, size, type); | ||
473 | } while (biosmap++,--nr_map); | ||
474 | return 0; | ||
475 | } | ||
476 | |||
477 | void __init setup_memory_region(void) | ||
478 | { | ||
479 | char *who = "BIOS-e820"; | ||
480 | |||
481 | /* | ||
482 | * Try to copy the BIOS-supplied E820-map. | ||
483 | * | ||
484 | * Otherwise fake a memory map; one section from 0k->640k, | ||
485 | * the next section from 1mb->appropriate_mem_k | ||
486 | */ | ||
487 | sanitize_e820_map(E820_MAP, &E820_MAP_NR); | ||
488 | if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { | ||
489 | unsigned long mem_size; | ||
490 | |||
491 | /* compare results from other methods and take the greater */ | ||
492 | if (ALT_MEM_K < EXT_MEM_K) { | ||
493 | mem_size = EXT_MEM_K; | ||
494 | who = "BIOS-88"; | ||
495 | } else { | ||
496 | mem_size = ALT_MEM_K; | ||
497 | who = "BIOS-e801"; | ||
498 | } | ||
499 | |||
500 | e820.nr_map = 0; | ||
501 | add_memory_region(0, LOWMEMSIZE(), E820_RAM); | ||
502 | add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); | ||
503 | } | ||
504 | printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | ||
505 | e820_print_map(who); | ||
506 | } | ||
507 | |||
508 | void __init parse_memopt(char *p, char **from) | ||
509 | { | ||
510 | end_user_pfn = memparse(p, from); | ||
511 | end_user_pfn >>= PAGE_SHIFT; | ||
512 | } | ||
513 | |||
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c new file mode 100644 index 000000000000..750bcd0655dc --- /dev/null +++ b/arch/x86_64/kernel/early_printk.c | |||
@@ -0,0 +1,220 @@ | |||
1 | #include <linux/console.h> | ||
2 | #include <linux/kernel.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/string.h> | ||
5 | #include <asm/io.h> | ||
6 | #include <asm/processor.h> | ||
7 | |||
8 | /* Simple VGA output */ | ||
9 | |||
10 | #ifdef __i386__ | ||
11 | #define VGABASE (__ISA_IO_base + 0xb8000) | ||
12 | #else | ||
13 | #define VGABASE ((void __iomem *)0xffffffff800b8000UL) | ||
14 | #endif | ||
15 | |||
16 | #define MAX_YPOS 25 | ||
17 | #define MAX_XPOS 80 | ||
18 | |||
19 | static int current_ypos = 1, current_xpos = 0; | ||
20 | |||
21 | static void early_vga_write(struct console *con, const char *str, unsigned n) | ||
22 | { | ||
23 | char c; | ||
24 | int i, k, j; | ||
25 | |||
26 | while ((c = *str++) != '\0' && n-- > 0) { | ||
27 | if (current_ypos >= MAX_YPOS) { | ||
28 | /* scroll 1 line up */ | ||
29 | for (k = 1, j = 0; k < MAX_YPOS; k++, j++) { | ||
30 | for (i = 0; i < MAX_XPOS; i++) { | ||
31 | writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), | ||
32 | VGABASE + 2*(MAX_XPOS*j + i)); | ||
33 | } | ||
34 | } | ||
35 | for (i = 0; i < MAX_XPOS; i++) | ||
36 | writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); | ||
37 | current_ypos = MAX_YPOS-1; | ||
38 | } | ||
39 | if (c == '\n') { | ||
40 | current_xpos = 0; | ||
41 | current_ypos++; | ||
42 | } else if (c != '\r') { | ||
43 | writew(((0x7 << 8) | (unsigned short) c), | ||
44 | VGABASE + 2*(MAX_XPOS*current_ypos + | ||
45 | current_xpos++)); | ||
46 | if (current_xpos >= MAX_XPOS) { | ||
47 | current_xpos = 0; | ||
48 | current_ypos++; | ||
49 | } | ||
50 | } | ||
51 | } | ||
52 | } | ||
53 | |||
/* Console descriptor whose write hook is early_vga_write(). */
54 | static struct console early_vga_console = { | ||
55 | .name = "earlyvga", | ||
56 | .write = early_vga_write, | ||
57 | .flags = CON_PRINTBUFFER, | ||
58 | .index = -1, | ||
59 | }; | ||
60 | |||
61 | /* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ | ||
62 | |||
63 | int early_serial_base = 0x3f8; /* ttyS0 */ | ||
64 | |||
65 | #define XMTRDY 0x20 | ||
66 | |||
67 | #define DLAB 0x80 | ||
68 | |||
69 | #define TXR 0 /* Transmit register (WRITE) */ | ||
70 | #define RXR 0 /* Receive register (READ) */ | ||
71 | #define IER 1 /* Interrupt Enable */ | ||
72 | #define IIR 2 /* Interrupt ID */ | ||
73 | #define FCR 2 /* FIFO control */ | ||
74 | #define LCR 3 /* Line control */ | ||
75 | #define MCR 4 /* Modem control */ | ||
76 | #define LSR 5 /* Line Status */ | ||
77 | #define MSR 6 /* Modem Status */ | ||
78 | #define DLL 0 /* Divisor Latch Low */ | ||
79 | #define DLH 1 /* Divisor latch High */ | ||
80 | |||
81 | static int early_serial_putc(unsigned char ch) | ||
82 | { | ||
83 | unsigned timeout = 0xffff; | ||
84 | while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) | ||
85 | cpu_relax(); | ||
86 | outb(ch, early_serial_base + TXR); | ||
87 | return timeout ? 0 : -1; | ||
88 | } | ||
89 | |||
/*
 * Send up to @n characters of @s through early_serial_putc(), stopping
 * early at a NUL byte.  Each '\n' is followed by a '\r'.  @con is unused.
 */
static void early_serial_write(struct console *con, const char *s, unsigned n)
{
	for (; *s && n-- > 0; s++) {
		early_serial_putc(*s);
		if (*s == '\n')
			early_serial_putc('\r');
	}
}
99 | |||
100 | #define DEFAULT_BAUD 9600 | ||
101 | |||
102 | static __init void early_serial_init(char *s) | ||
103 | { | ||
104 | unsigned char c; | ||
105 | unsigned divisor; | ||
106 | unsigned baud = DEFAULT_BAUD; | ||
107 | char *e; | ||
108 | |||
109 | if (*s == ',') | ||
110 | ++s; | ||
111 | |||
112 | if (*s) { | ||
113 | unsigned port; | ||
114 | if (!strncmp(s,"0x",2)) { | ||
115 | early_serial_base = simple_strtoul(s, &e, 16); | ||
116 | } else { | ||
117 | static int bases[] = { 0x3f8, 0x2f8 }; | ||
118 | |||
119 | if (!strncmp(s,"ttyS",4)) | ||
120 | s += 4; | ||
121 | port = simple_strtoul(s, &e, 10); | ||
122 | if (port > 1 || s == e) | ||
123 | port = 0; | ||
124 | early_serial_base = bases[port]; | ||
125 | } | ||
126 | s += strcspn(s, ","); | ||
127 | if (*s == ',') | ||
128 | s++; | ||
129 | } | ||
130 | |||
131 | outb(0x3, early_serial_base + LCR); /* 8n1 */ | ||
132 | outb(0, early_serial_base + IER); /* no interrupt */ | ||
133 | outb(0, early_serial_base + FCR); /* no fifo */ | ||
134 | outb(0x3, early_serial_base + MCR); /* DTR + RTS */ | ||
135 | |||
136 | if (*s) { | ||
137 | baud = simple_strtoul(s, &e, 0); | ||
138 | if (baud == 0 || s == e) | ||
139 | baud = DEFAULT_BAUD; | ||
140 | } | ||
141 | |||
142 | divisor = 115200 / baud; | ||
143 | c = inb(early_serial_base + LCR); | ||
144 | outb(c | DLAB, early_serial_base + LCR); | ||
145 | outb(divisor & 0xff, early_serial_base + DLL); | ||
146 | outb((divisor >> 8) & 0xff, early_serial_base + DLH); | ||
147 | outb(c & ~DLAB, early_serial_base + LCR); | ||
148 | } | ||
149 | |||
/* Console descriptor whose write hook is early_serial_write(). */
150 | static struct console early_serial_console = { | ||
151 | .name = "earlyser", | ||
152 | .write = early_serial_write, | ||
153 | .flags = CON_PRINTBUFFER, | ||
154 | .index = -1, | ||
155 | }; | ||
156 | |||
157 | /* Direct interface for emergencies */ | ||
158 | struct console *early_console = &early_vga_console; | ||
159 | static int early_console_initialized = 0; | ||
160 | |||
161 | void early_printk(const char *fmt, ...) | ||
162 | { | ||
163 | char buf[512]; | ||
164 | int n; | ||
165 | va_list ap; | ||
166 | |||
167 | va_start(ap,fmt); | ||
168 | n = vscnprintf(buf,512,fmt,ap); | ||
169 | early_console->write(early_console,buf,n); | ||
170 | va_end(ap); | ||
171 | } | ||
172 | |||
173 | static int keep_early; | ||
174 | |||
175 | int __init setup_early_printk(char *opt) | ||
176 | { | ||
177 | char *space; | ||
178 | char buf[256]; | ||
179 | |||
180 | if (early_console_initialized) | ||
181 | return -1; | ||
182 | |||
183 | opt = strchr(opt, '=') + 1; | ||
184 | |||
185 | strlcpy(buf,opt,sizeof(buf)); | ||
186 | space = strchr(buf, ' '); | ||
187 | if (space) | ||
188 | *space = 0; | ||
189 | |||
190 | if (strstr(buf,"keep")) | ||
191 | keep_early = 1; | ||
192 | |||
193 | if (!strncmp(buf, "serial", 6)) { | ||
194 | early_serial_init(buf + 6); | ||
195 | early_console = &early_serial_console; | ||
196 | } else if (!strncmp(buf, "ttyS", 4)) { | ||
197 | early_serial_init(buf); | ||
198 | early_console = &early_serial_console; | ||
199 | } else if (!strncmp(buf, "vga", 3)) { | ||
200 | early_console = &early_vga_console; | ||
201 | } | ||
202 | early_console_initialized = 1; | ||
203 | register_console(early_console); | ||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | void __init disable_early_printk(void) | ||
208 | { | ||
209 | if (!early_console_initialized || !early_console) | ||
210 | return; | ||
211 | if (!keep_early) { | ||
212 | printk("disabling early console\n"); | ||
213 | unregister_console(early_console); | ||
214 | early_console_initialized = 0; | ||
215 | } else { | ||
216 | printk("keeping early console\n"); | ||
217 | } | ||
218 | } | ||
219 | |||
220 | __setup("earlyprintk=", setup_early_printk); | ||
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S new file mode 100644 index 000000000000..e126284db7a8 --- /dev/null +++ b/arch/x86_64/kernel/entry.S | |||
@@ -0,0 +1,920 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/entry.S | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs | ||
6 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | ||
7 | * | ||
8 | * $Id$ | ||
9 | */ | ||
10 | |||
11 | /* | ||
12 | * entry.S contains the system-call and fault low-level handling routines. | ||
13 | * | ||
14 | * NOTE: This code handles signal-recognition, which happens every time | ||
15 | * after an interrupt and after each system call. | ||
16 | * | ||
17 | * Normal syscalls and interrupts don't save a full stack frame, this is | ||
18 | * only done for syscall tracing, signals or fork/exec et.al. | ||
19 | * | ||
20 | * A note on terminology: | ||
21 | * - top of stack: Architecture defined interrupt frame from SS to RIP | ||
22 | * at the top of the kernel process stack. | ||
23 | * - partial stack frame: partially saved registers up to R11. | ||
24 | * - full stack frame: Like partial stack frame, but all registers saved. | ||
25 | * | ||
26 | * TODO: | ||
27 | * - schedule it carefully for the final hardware. | ||
28 | */ | ||
29 | |||
30 | #define ASSEMBLY 1 | ||
31 | #include <linux/config.h> | ||
32 | #include <linux/linkage.h> | ||
33 | #include <asm/segment.h> | ||
34 | #include <asm/smp.h> | ||
35 | #include <asm/cache.h> | ||
36 | #include <asm/errno.h> | ||
37 | #include <asm/dwarf2.h> | ||
38 | #include <asm/calling.h> | ||
39 | #include <asm/offset.h> | ||
40 | #include <asm/msr.h> | ||
41 | #include <asm/unistd.h> | ||
42 | #include <asm/thread_info.h> | ||
43 | #include <asm/hw_irq.h> | ||
44 | |||
45 | .code64 | ||
46 | |||
47 | #ifdef CONFIG_PREEMPT | ||
48 | #define preempt_stop cli | ||
49 | #else | ||
50 | #define preempt_stop | ||
51 | #define retint_kernel retint_restore_args | ||
52 | #endif | ||
53 | |||
54 | /* | ||
55 | * C code is not supposed to know about undefined top of stack. Every time | ||
56 | * a C function with a pt_regs argument is called from the SYSCALL based | ||
57 | * fast path FIXUP_TOP_OF_STACK is needed. | ||
58 | * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs | ||
59 | * manipulation. | ||
60 | */ | ||
61 | |||
62 | /* %rsp:at FRAMEEND */ | ||
63 | .macro FIXUP_TOP_OF_STACK tmp | ||
64 | movq %gs:pda_oldrsp,\tmp /* stack pointer that system_call stored in the PDA; tmp is clobbered */ | ||
65 | movq \tmp,RSP(%rsp) /* ... becomes the frame's RSP slot */ | ||
66 | movq $__USER_DS,SS(%rsp) /* SS and CS slots get the user selectors */ | ||
67 | movq $__USER_CS,CS(%rsp) | ||
68 | movq $-1,RCX(%rsp) /* RCX slot is overwritten with -1 */ | ||
69 | movq R11(%rsp),\tmp /* get eflags */ | ||
70 | movq \tmp,EFLAGS(%rsp) /* EFLAGS slot = value saved in the R11 slot */ | ||
71 | .endm | ||
72 | |||
73 | .macro RESTORE_TOP_OF_STACK tmp,offset=0 | ||
74 | movq RSP-\offset(%rsp),\tmp /* inverse of FIXUP_TOP_OF_STACK: RSP slot goes back to pda_oldrsp */ | ||
75 | movq \tmp,%gs:pda_oldrsp | ||
76 | movq EFLAGS-\offset(%rsp),\tmp /* EFLAGS slot goes back to the R11 slot; offset is subtracted from every slot offset */ | ||
77 | movq \tmp,R11-\offset(%rsp) | ||
78 | .endm | ||
79 | |||
80 | .macro FAKE_STACK_FRAME child_rip | ||
81 | /* push in order ss, rsp, eflags, cs, rip */ | ||
82 | xorq %rax, %rax /* rax = 0, the value pushed for ss, rsp and orig rax; clobbers rax */ | ||
83 | pushq %rax /* ss */ | ||
84 | CFI_ADJUST_CFA_OFFSET 8 | ||
85 | pushq %rax /* rsp */ | ||
86 | CFI_ADJUST_CFA_OFFSET 8 | ||
87 | CFI_OFFSET rip,0 | ||
88 | pushq $(1<<9) /* eflags - interrupts on */ | ||
89 | CFI_ADJUST_CFA_OFFSET 8 | ||
90 | pushq $__KERNEL_CS /* cs */ | ||
91 | CFI_ADJUST_CFA_OFFSET 8 | ||
92 | pushq \child_rip /* rip */ | ||
93 | CFI_ADJUST_CFA_OFFSET 8 | ||
94 | CFI_OFFSET rip,0 | ||
95 | pushq %rax /* orig rax */ | ||
96 | CFI_ADJUST_CFA_OFFSET 8 | ||
97 | .endm | ||
98 | |||
99 | .macro UNFAKE_STACK_FRAME | ||
100 | addq $8*6, %rsp /* drop the six quadwords pushed by FAKE_STACK_FRAME */ | ||
101 | CFI_ADJUST_CFA_OFFSET -(6*8) | ||
102 | .endm | ||
103 | |||
104 | .macro CFI_DEFAULT_STACK | ||
105 | CFI_ADJUST_CFA_OFFSET (SS) /* unwind annotations only: one CFI_OFFSET per register slot, each relative to the SS slot */ | ||
106 | CFI_OFFSET r15,R15-SS | ||
107 | CFI_OFFSET r14,R14-SS | ||
108 | CFI_OFFSET r13,R13-SS | ||
109 | CFI_OFFSET r12,R12-SS | ||
110 | CFI_OFFSET rbp,RBP-SS | ||
111 | CFI_OFFSET rbx,RBX-SS | ||
112 | CFI_OFFSET r11,R11-SS | ||
113 | CFI_OFFSET r10,R10-SS | ||
114 | CFI_OFFSET r9,R9-SS | ||
115 | CFI_OFFSET r8,R8-SS | ||
116 | CFI_OFFSET rax,RAX-SS | ||
117 | CFI_OFFSET rcx,RCX-SS | ||
118 | CFI_OFFSET rdx,RDX-SS | ||
119 | CFI_OFFSET rsi,RSI-SS | ||
120 | CFI_OFFSET rdi,RDI-SS | ||
121 | CFI_OFFSET rsp,RSP-SS | ||
122 | CFI_OFFSET rip,RIP-SS | ||
123 | .endm | ||
124 | /* | ||
125 | * A newly forked process directly context switches into this. | ||
126 | */ | ||
127 | /* rdi: prev */ | ||
128 | ENTRY(ret_from_fork) | ||
129 | CFI_STARTPROC | ||
130 | CFI_DEFAULT_STACK | ||
131 | call schedule_tail /* rdi (prev) is passed through as the argument */ | ||
132 | GET_THREAD_INFO(%rcx) | ||
133 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) | ||
134 | jnz rff_trace /* trace or audit flag set: call syscall_trace_leave first */ | ||
135 | rff_action: | ||
136 | RESTORE_REST | ||
137 | testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? | ||
138 | je int_ret_from_sys_call /* saved CS is ring 0: leave through the IRET path */ | ||
139 | testl $_TIF_IA32,threadinfo_flags(%rcx) | ||
140 | jnz int_ret_from_sys_call /* IA32 task: IRET path as well */ | ||
141 | RESTORE_TOP_OF_STACK %rdi,ARGOFFSET | ||
142 | jmp ret_from_sys_call /* otherwise use the SYSRET return path */ | ||
143 | rff_trace: | ||
144 | movq %rsp,%rdi /* arg1 = current frame */ | ||
145 | call syscall_trace_leave | ||
146 | GET_THREAD_INFO(%rcx) /* reload rcx, rff_action tests thread flags through it */ | ||
147 | jmp rff_action | ||
148 | CFI_ENDPROC | ||
149 | |||
150 | /* | ||
151 | * System call entry. Up to 6 arguments in registers are supported. | ||
152 | * | ||
153 | * SYSCALL does not save anything on the stack and does not change the | ||
154 | * stack pointer. | ||
155 | */ | ||
156 | |||
157 | /* | ||
158 | * Register setup: | ||
159 | * rax system call number | ||
160 | * rdi arg0 | ||
161 | * rcx return address for syscall/sysret, C arg3 | ||
162 | * rsi arg1 | ||
163 | * rdx arg2 | ||
164 | * r10 arg3 (--> moved to rcx for C) | ||
165 | * r8 arg4 | ||
166 | * r9 arg5 | ||
167 | * r11 eflags for syscall/sysret, temporary for C | ||
168 | * r12-r15,rbp,rbx saved by C code, not touched. | ||
169 | * | ||
170 | * Interrupts are off on entry. | ||
171 | * Only called from user space. | ||
172 | * | ||
173 | * XXX if we had a free scratch register we could save the RSP into the stack frame | ||
174 | * and report it properly in ps. Unfortunately we haven't. | ||
175 | */ | ||
176 | |||
177 | ENTRY(system_call) | ||
178 | CFI_STARTPROC | ||
179 | swapgs | ||
180 | movq %rsp,%gs:pda_oldrsp /* stash the user stack pointer in the PDA */ | ||
181 | movq %gs:pda_kernelstack,%rsp /* switch to the kernel stack */ | ||
182 | sti | ||
183 | SAVE_ARGS 8,1 | ||
184 | movq %rax,ORIG_RAX-ARGOFFSET(%rsp) /* record the syscall number */ | ||
185 | movq %rcx,RIP-ARGOFFSET(%rsp) /* rcx holds the user return address */ | ||
186 | GET_THREAD_INFO(%rcx) | ||
187 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) | ||
188 | jnz tracesys /* any of these flags set: take the traced slow path */ | ||
189 | cmpq $__NR_syscall_max,%rax | ||
190 | ja badsys /* unsigned compare: number above the table maximum */ | ||
191 | movq %r10,%rcx /* arg3 arrives in r10, C expects it in rcx */ | ||
192 | call *sys_call_table(,%rax,8) # XXX: rip relative | ||
193 | movq %rax,RAX-ARGOFFSET(%rsp) /* return value goes into the RAX slot */ | ||
194 | /* | ||
195 | * Syscall return path ending with SYSRET (fast path) | ||
196 | * Has incomplete stack frame and undefined top of stack. | ||
197 | */ | ||
198 | .globl ret_from_sys_call | ||
199 | ret_from_sys_call: | ||
200 | movl $_TIF_WORK_MASK,%edi | ||
201 | /* edi: flagmask */ | ||
202 | sysret_check: | ||
203 | GET_THREAD_INFO(%rcx) | ||
204 | cli | ||
205 | movl threadinfo_flags(%rcx),%edx | ||
206 | andl %edi,%edx /* edx = pending work selected by the mask */ | ||
207 | jnz sysret_careful | ||
208 | movq RIP-ARGOFFSET(%rsp),%rcx /* reload the user return address for sysretq */ | ||
209 | RESTORE_ARGS 0,-ARG_SKIP,1 | ||
210 | movq %gs:pda_oldrsp,%rsp /* back to the user stack pointer */ | ||
211 | swapgs | ||
212 | sysretq | ||
213 | |||
214 | /* Handle reschedules */ | ||
215 | /* edx: work, edi: workmask */ | ||
216 | sysret_careful: | ||
217 | bt $TIF_NEED_RESCHED,%edx | ||
218 | jnc sysret_signal | ||
219 | sti | ||
220 | pushq %rdi /* keep the work mask across schedule() */ | ||
221 | call schedule | ||
222 | popq %rdi | ||
223 | jmp sysret_check | ||
224 | |||
225 | /* Handle a signal */ | ||
226 | sysret_signal: | ||
227 | sti | ||
228 | testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx | ||
229 | jz 1f | ||
230 | |||
231 | /* Really a signal */ | ||
232 | /* edx: work flags (arg3) */ | ||
233 | leaq do_notify_resume(%rip),%rax /* rax = function that ptregscall_common will call */ | ||
234 | leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 | ||
235 | xorl %esi,%esi # oldset -> arg2 | ||
236 | call ptregscall_common | ||
237 | 1: movl $_TIF_NEED_RESCHED,%edi /* from here on only recheck for reschedule */ | ||
238 | jmp sysret_check | ||
239 | |||
240 | /* Do syscall tracing */ | ||
241 | tracesys: | ||
242 | SAVE_REST | ||
243 | movq $-ENOSYS,RAX(%rsp) /* default result, stays if the number turns out invalid below */ | ||
244 | FIXUP_TOP_OF_STACK %rdi | ||
245 | movq %rsp,%rdi /* arg1 = frame */ | ||
246 | call syscall_trace_enter | ||
247 | LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ | ||
248 | RESTORE_REST | ||
249 | cmpq $__NR_syscall_max,%rax | ||
250 | ja 1f | ||
251 | movq %r10,%rcx /* fixup for C */ | ||
252 | call *sys_call_table(,%rax,8) | ||
253 | movq %rax,RAX-ARGOFFSET(%rsp) | ||
254 | 1: SAVE_REST | ||
255 | movq %rsp,%rdi | ||
256 | call syscall_trace_leave | ||
257 | RESTORE_TOP_OF_STACK %rbx | ||
258 | RESTORE_REST | ||
259 | jmp ret_from_sys_call | ||
260 | |||
261 | badsys: | ||
262 | movq $-ENOSYS,RAX-ARGOFFSET(%rsp) /* invalid syscall number: return -ENOSYS */ | ||
263 | jmp ret_from_sys_call | ||
264 | |||
265 | /* | ||
266 | * Syscall return path ending with IRET. | ||
267 | * Has correct top of stack, but partial stack frame. | ||
268 | */ | ||
269 | ENTRY(int_ret_from_sys_call) | ||
270 | cli | ||
271 | testl $3,CS-ARGOFFSET(%rsp) | ||
272 | je retint_restore_args /* saved CS is ring 0: skip the work checks and the swapgs */ | ||
273 | movl $_TIF_ALLWORK_MASK,%edi | ||
274 | /* edi: mask to check */ | ||
275 | int_with_check: | ||
276 | GET_THREAD_INFO(%rcx) | ||
277 | movl threadinfo_flags(%rcx),%edx | ||
278 | andl %edi,%edx | ||
279 | jnz int_careful | ||
280 | jmp retint_swapgs /* nothing pending: swapgs and iretq */ | ||
281 | |||
282 | /* Either reschedule or signal or syscall exit tracking needed. */ | ||
283 | /* First do a reschedule test. */ | ||
284 | /* edx: work, edi: workmask */ | ||
285 | int_careful: | ||
286 | bt $TIF_NEED_RESCHED,%edx | ||
287 | jnc int_very_careful | ||
288 | sti | ||
289 | pushq %rdi /* keep the work mask across schedule() */ | ||
290 | call schedule | ||
291 | popq %rdi | ||
292 | jmp int_with_check | ||
293 | |||
294 | /* handle signals and tracing -- both require a full stack frame */ | ||
295 | int_very_careful: | ||
296 | sti | ||
297 | SAVE_REST | ||
298 | /* Check for syscall exit trace */ | ||
299 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx | ||
300 | jz int_signal | ||
301 | pushq %rdi | ||
302 | leaq 8(%rsp),%rdi # &ptregs -> arg1 | ||
303 | call syscall_trace_leave | ||
304 | popq %rdi | ||
305 | btr $TIF_SYSCALL_TRACE,%edi /* clear the three bits just handled from the mask */ | ||
306 | btr $TIF_SYSCALL_AUDIT,%edi | ||
307 | btr $TIF_SINGLESTEP,%edi | ||
308 | jmp int_restore_rest | ||
309 | |||
310 | int_signal: | ||
311 | testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx | ||
312 | jz 1f | ||
313 | movq %rsp,%rdi # &ptregs -> arg1 | ||
314 | xorl %esi,%esi # oldset -> arg2 | ||
315 | call do_notify_resume | ||
316 | 1: movl $_TIF_NEED_RESCHED,%edi /* from here on only recheck for reschedule */ | ||
317 | int_restore_rest: | ||
318 | RESTORE_REST | ||
319 | jmp int_with_check | ||
320 | CFI_ENDPROC | ||
321 | |||
322 | /* | ||
323 | * Certain special system calls that need to save a complete full stack frame. | ||
324 | */ | ||
325 | |||
326 | .macro PTREGSCALL label,func,arg | ||
327 | .globl \label | ||
328 | \label: | ||
329 | leaq \func(%rip),%rax /* rax = C handler, called indirectly by ptregscall_common */ | ||
330 | leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ | ||
331 | jmp ptregscall_common | ||
332 | .endm | ||
333 | |||
334 | PTREGSCALL stub_clone, sys_clone, %r8 /* third operand: register that receives the frame pointer */ | ||
335 | PTREGSCALL stub_fork, sys_fork, %rdi | ||
336 | PTREGSCALL stub_vfork, sys_vfork, %rdi | ||
337 | PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx | ||
338 | PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx | ||
339 | PTREGSCALL stub_iopl, sys_iopl, %rsi | ||
340 | |||
341 | ENTRY(ptregscall_common) /* in: rax = function to call; builds a full frame around the call */ | ||
342 | CFI_STARTPROC | ||
343 | popq %r11 /* take the return address off the stack */ | ||
344 | CFI_ADJUST_CFA_OFFSET -8 | ||
345 | SAVE_REST | ||
346 | movq %r11, %r15 /* park the return address in r15 so r11 can serve as scratch */ | ||
347 | FIXUP_TOP_OF_STACK %r11 | ||
348 | call *%rax | ||
349 | RESTORE_TOP_OF_STACK %r11 | ||
350 | movq %r15, %r11 /* recover the return address before RESTORE_REST */ | ||
351 | RESTORE_REST | ||
352 | pushq %r11 /* put the return address back for ret */ | ||
353 | CFI_ADJUST_CFA_OFFSET 8 | ||
354 | ret | ||
355 | CFI_ENDPROC | ||
356 | |||
357 | ENTRY(stub_execve) /* same frame handling as ptregscall_common, plus a separate exit for IA32 tasks */ | ||
358 | CFI_STARTPROC | ||
359 | popq %r11 /* take the return address off the stack */ | ||
360 | CFI_ADJUST_CFA_OFFSET -8 | ||
361 | SAVE_REST | ||
362 | movq %r11, %r15 /* park the return address in r15 */ | ||
363 | FIXUP_TOP_OF_STACK %r11 | ||
364 | call sys_execve | ||
365 | GET_THREAD_INFO(%rcx) | ||
366 | bt $TIF_IA32,threadinfo_flags(%rcx) | ||
367 | jc exec_32bit /* task is flagged IA32 after the call: return via IRET */ | ||
368 | RESTORE_TOP_OF_STACK %r11 | ||
369 | movq %r15, %r11 | ||
370 | RESTORE_REST | ||
371 | push %r11 /* put the return address back for ret */ | ||
372 | ret | ||
373 | |||
374 | exec_32bit: | ||
375 | CFI_ADJUST_CFA_OFFSET REST_SKIP | ||
376 | movq %rax,RAX(%rsp) /* store the sys_execve result in the RAX slot */ | ||
377 | RESTORE_REST | ||
378 | jmp int_ret_from_sys_call | ||
379 | CFI_ENDPROC | ||
380 | |||
381 | /* | ||
382 | * sigreturn is special because it needs to restore all registers on return. | ||
383 | * This cannot be done with SYSRET, so use the IRET return path instead. | ||
384 | */ | ||
385 | ENTRY(stub_rt_sigreturn) | ||
386 | CFI_STARTPROC | ||
387 | addq $8, %rsp /* skip one stack slot, presumably the return address: this path never executes ret */ | ||
388 | SAVE_REST | ||
389 | movq %rsp,%rdi /* arg1 = frame */ | ||
390 | FIXUP_TOP_OF_STACK %r11 | ||
391 | call sys_rt_sigreturn | ||
392 | movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer | ||
393 | RESTORE_REST | ||
394 | jmp int_ret_from_sys_call | ||
395 | CFI_ENDPROC | ||
396 | |||
397 | /* | ||
398 | * Interrupt entry/exit. | ||
399 | * | ||
400 | * Interrupt entry points save only callee clobbered registers in fast path. | ||
401 | * | ||
402 | * Entry runs with interrupts off. | ||
403 | */ | ||
404 | |||
405 | /* 0(%rsp): interrupt number */ | ||
406 | .macro interrupt func | ||
407 | CFI_STARTPROC simple | ||
408 | CFI_DEF_CFA rsp,(SS-RDI) | ||
409 | CFI_REL_OFFSET rsp,(RSP-ORIG_RAX) | ||
410 | CFI_REL_OFFSET rip,(RIP-ORIG_RAX) | ||
411 | cld | ||
412 | #ifdef CONFIG_DEBUG_INFO | ||
413 | SAVE_ALL | ||
414 | movq %rsp,%rdi | ||
415 | /* | ||
416 | * Setup a stack frame pointer. This allows gdb to trace | ||
417 | * back to the original stack. | ||
418 | */ | ||
419 | movq %rsp,%rbp | ||
420 | CFI_DEF_CFA_REGISTER rbp | ||
421 | #else | ||
422 | SAVE_ARGS | ||
423 | leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler | ||
424 | #endif | ||
425 | testl $3,CS(%rdi) | ||
426 | je 1f /* interrupted code ran in ring 0: no swapgs */ | ||
427 | swapgs | ||
428 | 1: addl $1,%gs:pda_irqcount # RED-PEN should check preempt count | ||
429 | movq %gs:pda_irqstackptr,%rax /* movq leaves the flags from the addl intact */ | ||
430 | cmoveq %rax,%rsp /* switch to the irq stack only when the addl result was zero */ | ||
431 | pushq %rdi # save old stack | ||
432 | call \func | ||
433 | .endm | ||
434 | |||
435 | ENTRY(common_interrupt) | ||
436 | interrupt do_IRQ | ||
437 | /* 0(%rsp): oldrsp-ARGOFFSET */ | ||
438 | ret_from_intr: | ||
439 | popq %rdi /* value pushed by the interrupt macro before the handler call */ | ||
440 | cli | ||
441 | subl $1,%gs:pda_irqcount /* undo the increment done on entry */ | ||
442 | #ifdef CONFIG_DEBUG_INFO | ||
443 | movq RBP(%rdi),%rbp | ||
444 | #endif | ||
445 | leaq ARGOFFSET(%rdi),%rsp /* back on the interrupted context's stack */ | ||
446 | exit_intr: | ||
447 | GET_THREAD_INFO(%rcx) | ||
448 | testl $3,CS-ARGOFFSET(%rsp) | ||
449 | je retint_kernel /* saved CS is ring 0 */ | ||
450 | |||
451 | /* Interrupt came from user space */ | ||
452 | /* | ||
453 | * Has a correct top of stack, but a partial stack frame | ||
454 | * %rcx: thread info. Interrupts off. | ||
455 | */ | ||
456 | retint_with_reschedule: | ||
457 | movl $_TIF_WORK_MASK,%edi | ||
458 | retint_check: | ||
459 | movl threadinfo_flags(%rcx),%edx | ||
460 | andl %edi,%edx /* edx = pending work selected by the mask */ | ||
461 | jnz retint_careful | ||
462 | retint_swapgs: | ||
463 | cli | ||
464 | swapgs | ||
465 | retint_restore_args: | ||
466 | cli | ||
467 | RESTORE_ARGS 0,8,0 | ||
468 | iret_label: | ||
469 | iretq /* a fault on this instruction is redirected to bad_iret by the table entry below */ | ||
470 | |||
471 | .section __ex_table,"a" | ||
472 | .quad iret_label,bad_iret | ||
473 | .previous | ||
474 | .section .fixup,"ax" | ||
475 | /* force a signal here? this matches i386 behaviour */ | ||
476 | /* running with kernel gs */ | ||
477 | bad_iret: | ||
478 | movq $-9999,%rdi /* better code? */ | ||
479 | jmp do_exit /* rdi = -9999 is the argument to do_exit */ | ||
480 | .previous | ||
481 | |||
482 | /* edi: workmask, edx: work */ | ||
483 | retint_careful: | ||
484 | bt $TIF_NEED_RESCHED,%edx | ||
485 | jnc retint_signal | ||
486 | sti | ||
487 | pushq %rdi /* keep the work mask across schedule() */ | ||
488 | call schedule | ||
489 | popq %rdi | ||
490 | GET_THREAD_INFO(%rcx) | ||
491 | cli | ||
492 | jmp retint_check | ||
493 | |||
494 | retint_signal: | ||
495 | testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx | ||
496 | jz retint_swapgs | ||
497 | sti | ||
498 | SAVE_REST | ||
499 | movq $-1,ORIG_RAX(%rsp) | ||
500 | xorq %rsi,%rsi # oldset | ||
501 | movq %rsp,%rdi # &pt_regs | ||
502 | call do_notify_resume | ||
503 | RESTORE_REST | ||
504 | cli | ||
505 | movl $_TIF_NEED_RESCHED,%edi /* from here on only recheck for reschedule */ | ||
506 | GET_THREAD_INFO(%rcx) | ||
507 | jmp retint_check | ||
508 | |||
509 | #ifdef CONFIG_PREEMPT | ||
510 | /* Returning to kernel space. Check if we need preemption */ | ||
511 | /* rcx: threadinfo. interrupts off. */ | ||
512 | .p2align | ||
513 | retint_kernel: | ||
514 | cmpl $0,threadinfo_preempt_count(%rcx) | ||
515 | jnz retint_restore_args /* non-zero preempt count: no preemption */ | ||
516 | bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) | ||
517 | jnc retint_restore_args /* no reschedule requested */ | ||
518 | bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ | ||
519 | jnc retint_restore_args | ||
520 | call preempt_schedule_irq | ||
521 | jmp exit_intr | ||
522 | #endif | ||
523 | CFI_ENDPROC | ||
524 | |||
525 | /* | ||
526 | * APIC interrupts. | ||
527 | */ | ||
528 | .macro apicinterrupt num,func | ||
529 | pushq $\num-256 /* fills the slot the interrupt macro documents as the interrupt number */ | ||
530 | interrupt \func | ||
531 | jmp ret_from_intr /* share the exit path of common_interrupt */ | ||
532 | CFI_ENDPROC | ||
533 | .endm | ||
534 | |||
535 | ENTRY(thermal_interrupt) | ||
536 | apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt | ||
537 | |||
538 | #ifdef CONFIG_SMP | ||
539 | ENTRY(reschedule_interrupt) | ||
540 | apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt | ||
541 | |||
542 | ENTRY(invalidate_interrupt) | ||
543 | apicinterrupt INVALIDATE_TLB_VECTOR,smp_invalidate_interrupt | ||
544 | |||
545 | ENTRY(call_function_interrupt) | ||
546 | apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt | ||
547 | #endif | ||
548 | |||
549 | #ifdef CONFIG_X86_LOCAL_APIC | ||
550 | ENTRY(apic_timer_interrupt) | ||
551 | apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt | ||
552 | |||
553 | ENTRY(error_interrupt) | ||
554 | apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt | ||
555 | |||
556 | ENTRY(spurious_interrupt) | ||
557 | apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt | ||
558 | #endif | ||
559 | |||
560 | /* | ||
561 | * Exception entry points. | ||
562 | */ | ||
563 | .macro zeroentry sym | ||
564 | pushq $0 /* push error code/oldrax */ | ||
565 | pushq %rax /* push real oldrax to the rdi slot */ | ||
566 | leaq \sym(%rip),%rax /* rax = handler, as error_entry expects */ | ||
567 | jmp error_entry | ||
568 | .endm | ||
569 | |||
570 | .macro errorentry sym | ||
571 | pushq %rax /* same as zeroentry but without pushing a zero error code first */ | ||
572 | leaq \sym(%rip),%rax | ||
573 | jmp error_entry | ||
574 | .endm | ||
575 | |||
576 | /* error code is on the stack already */ | ||
577 | /* handle NMI like exceptions that can happen everywhere */ | ||
578 | .macro paranoidentry sym | ||
579 | SAVE_ALL | ||
580 | cld | ||
581 | movl $1,%ebx /* ebx = 1: exit path must not swapgs (see paranoid_exit) */ | ||
582 | movl $MSR_GS_BASE,%ecx | ||
583 | rdmsr /* edx = upper half of the current GS base */ | ||
584 | testl %edx,%edx | ||
585 | js 1f /* sign bit set: treated as kernel GS base already active */ | ||
586 | swapgs | ||
587 | xorl %ebx,%ebx /* ebx = 0: exit path has to swapgs back */ | ||
588 | 1: movq %rsp,%rdi /* arg1 = frame */ | ||
589 | movq ORIG_RAX(%rsp),%rsi /* arg2 = error code taken from the ORIG_RAX slot */ | ||
590 | movq $-1,ORIG_RAX(%rsp) | ||
591 | call \sym | ||
592 | .endm | ||
593 | |||
/*
 * Exception entry point. This expects an error code/orig_rax on the stack
 * and the exception handler in %rax.
 *
 * On entry the rdi slot holds the interrupted %rax (pushed by the
 * zeroentry/errorentry stubs).  The routine builds a full register frame,
 * switches to the kernel GS base when needed, calls the handler with
 * rdi = frame and rsi = error code, and returns through error_exit.
 */
ENTRY(error_entry)
	CFI_STARTPROC simple
	CFI_DEF_CFA rsp,(SS-RDI)
	CFI_REL_OFFSET rsp,(RSP-RDI)
	CFI_REL_OFFSET rip,(RIP-RDI)
	/* rdi slot contains rax, oldrax contains error code */
	cld
	subq $14*8,%rsp
	CFI_ADJUST_CFA_OFFSET (14*8)
	movq %rsi,13*8(%rsp)
	CFI_REL_OFFSET rsi,RSI
	movq 14*8(%rsp),%rsi	/* load rax from rdi slot */
	movq %rdx,12*8(%rsp)
	CFI_REL_OFFSET rdx,RDX
	movq %rcx,11*8(%rsp)
	CFI_REL_OFFSET rcx,RCX
	movq %rsi,10*8(%rsp)	/* store rax */
	CFI_REL_OFFSET rax,RAX
	movq %r8, 9*8(%rsp)
	CFI_REL_OFFSET r8,R8
	movq %r9, 8*8(%rsp)
	CFI_REL_OFFSET r9,R9
	movq %r10,7*8(%rsp)
	CFI_REL_OFFSET r10,R10
	movq %r11,6*8(%rsp)
	CFI_REL_OFFSET r11,R11
	movq %rbx,5*8(%rsp)
	CFI_REL_OFFSET rbx,RBX
	movq %rbp,4*8(%rsp)
	CFI_REL_OFFSET rbp,RBP
	movq %r12,3*8(%rsp)
	CFI_REL_OFFSET r12,R12
	movq %r13,2*8(%rsp)
	CFI_REL_OFFSET r13,R13
	movq %r14,1*8(%rsp)
	CFI_REL_OFFSET r14,R14
	movq %r15,(%rsp)
	CFI_REL_OFFSET r15,R15
	xorl %ebx,%ebx		/* ebx = 0: swapgs needed on exit */
	testl $3,CS(%rsp)
	je error_kernelspace
error_swapgs:
	swapgs
error_sti:
	movq %rdi,RDI(%rsp)	/* rdi slot held rax until now; store the real rdi */
	movq %rsp,%rdi
	movq ORIG_RAX(%rsp),%rsi	/* get error code */
	movq $-1,ORIG_RAX(%rsp)
	call *%rax
	/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
error_exit:
	movl %ebx,%eax
	RESTORE_REST
	cli
	GET_THREAD_INFO(%rcx)
	testl %eax,%eax
	jne retint_kernel
	movl threadinfo_flags(%rcx),%edx
	movl $_TIF_WORK_MASK,%edi
	andl %edi,%edx
	jnz retint_careful
	swapgs
	RESTORE_ARGS 0,8,0
	/*
	 * Return through the shared iretq at iret_label rather than a private
	 * copy.  Only that address has an __ex_table entry and is recognised
	 * by error_kernelspace below, so a faulting IRET here is handled with
	 * the kernel GS base and gets the bad_iret fixup.
	 */
	jmp iret_label
	CFI_ENDPROC

error_kernelspace:
	incl %ebx
	/* There are two places in the kernel that can potentially fault with
	   usergs. Handle them here. The exception handlers after
	   iret run with kernel gs again, so don't set the user space flag.
	   B stepping K8s sometimes report a truncated RIP for IRET
	   exceptions returning to compat mode. Check for these here too. */
	leaq iret_label(%rip),%rbp
	cmpq %rbp,RIP(%rsp)
	je error_swapgs
	movl %ebp,%ebp	/* zero extend */
	cmpq %rbp,RIP(%rsp)
	je error_swapgs
	cmpq $gs_change,RIP(%rsp)
	je error_swapgs
	jmp error_sti
680 | |||
681 | /* Reload gs selector with exception handling */ | ||
682 | /* edi: new selector */ | ||
683 | ENTRY(load_gs_index) | ||
684 | pushf /* flags are restored by the popf before ret */ | ||
685 | cli | ||
686 | swapgs | ||
687 | gs_change: | ||
688 | movl %edi,%gs /* a fault on this load is redirected to bad_gs by the table entry below */ | ||
689 | 2: mfence /* workaround */ | ||
690 | swapgs | ||
691 | popf | ||
692 | ret | ||
693 | |||
694 | .section __ex_table,"a" | ||
695 | .align 8 | ||
696 | .quad gs_change,bad_gs | ||
697 | .previous | ||
698 | .section .fixup,"ax" | ||
699 | /* running with kernelgs */ | ||
700 | bad_gs: | ||
701 | swapgs /* switch back to user gs */ | ||
702 | xorl %eax,%eax | ||
703 | movl %eax,%gs /* load selector 0 instead of the rejected one */ | ||
704 | jmp 2b /* resume after the faulting load */ | ||
705 | .previous | ||
706 | |||
707 | /* | ||
708 | * Create a kernel thread. | ||
709 | * | ||
710 | * C extern interface: | ||
711 | * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) | ||
712 | * | ||
713 | * asm input arguments: | ||
714 | * rdi: fn, rsi: arg, rdx: flags | ||
715 | */ | ||
716 | ENTRY(kernel_thread) | ||
717 | CFI_STARTPROC | ||
718 | FAKE_STACK_FRAME $child_rip /* saved rip = child_rip, so the child starts there */ | ||
719 | SAVE_ALL | ||
720 | |||
721 | # rdi: flags, rsi: usp, rdx: will be &pt_regs | ||
722 | movq %rdx,%rdi | ||
723 | orq kernel_thread_flags(%rip),%rdi /* OR in the flags stored at kernel_thread_flags */ | ||
724 | movq $-1, %rsi | ||
725 | movq %rsp, %rdx | ||
726 | |||
727 | xorl %r8d,%r8d /* remaining two do_fork arguments are zero */ | ||
728 | xorl %r9d,%r9d | ||
729 | |||
730 | # clone now | ||
731 | call do_fork | ||
732 | movq %rax,RAX(%rsp) /* do_fork result goes into the RAX slot, reloaded by RESTORE_ALL */ | ||
733 | xorl %edi,%edi | ||
734 | |||
735 | /* | ||
736 | * It isn't worth to check for reschedule here, | ||
737 | * so internally to the x86_64 port you can rely on kernel_thread() | ||
738 | * not to reschedule the child before returning, this avoids the need | ||
739 | * of hacks for example to fork off the per-CPU idle tasks. | ||
740 | * [Hopefully no generic code relies on the reschedule -AK] | ||
741 | */ | ||
742 | RESTORE_ALL | ||
743 | UNFAKE_STACK_FRAME | ||
744 | ret | ||
745 | CFI_ENDPROC | ||
746 | |||
747 | |||
748 | child_rip: | ||
749 | /* | ||
750 | * Here we are in the child and the registers are set as they were | ||
751 | * at kernel_thread() invocation in the parent. | ||
752 | */ | ||
753 | movq %rdi, %rax /* rax = fn */ | ||
754 | movq %rsi, %rdi /* arg becomes the first argument of fn */ | ||
755 | call *%rax | ||
756 | # exit | ||
757 | xorq %rdi, %rdi | ||
758 | call do_exit /* do_exit(0) once fn returns */ | ||
759 | |||
760 | /* | ||
761 | * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. | ||
762 | * | ||
763 | * C extern interface: | ||
764 | * extern long execve(char *name, char **argv, char **envp) | ||
765 | * | ||
766 | * asm input arguments: | ||
767 | * rdi: name, rsi: argv, rdx: envp | ||
768 | * | ||
769 | * We want to fallback into: | ||
770 | * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) | ||
771 | * | ||
772 | * do_sys_execve asm fallback arguments: | ||
773 | * rdi: name, rsi: argv, rdx: envp, fake frame on the stack | ||
774 | */ | ||
775 | ENTRY(execve) | ||
776 | CFI_STARTPROC | ||
777 | FAKE_STACK_FRAME $0 | ||
778 | SAVE_ALL | ||
779 | call sys_execve | ||
780 | movq %rax, RAX(%rsp) /* result goes into the RAX slot */ | ||
781 | RESTORE_REST | ||
782 | testq %rax,%rax | ||
783 | je int_ret_from_sys_call /* result 0: leave through the IRET path using the frame */ | ||
784 | RESTORE_ARGS | ||
785 | UNFAKE_STACK_FRAME | ||
786 | ret /* non-zero result: plain return to the caller */ | ||
787 | CFI_ENDPROC | ||
788 | |||
789 | ENTRY(page_fault) /* errorentry: no zero error code is pushed before entering error_entry */ | ||
790 | errorentry do_page_fault | ||
791 | |||
792 | ENTRY(coprocessor_error) /* zeroentry: pushes 0 as the error code first */ | ||
793 | zeroentry do_coprocessor_error | ||
794 | |||
795 | ENTRY(simd_coprocessor_error) | ||
796 | zeroentry do_simd_coprocessor_error | ||
797 | |||
798 | ENTRY(device_not_available) | ||
799 | zeroentry math_state_restore | ||
800 | |||
801 | /* runs on exception stack */ | ||
802 | ENTRY(debug) | ||
803 | CFI_STARTPROC | ||
804 | pushq $0 /* occupies the error code slot */ | ||
805 | CFI_ADJUST_CFA_OFFSET 8 | ||
806 | paranoidentry do_debug | ||
807 | /* switch back to process stack to restore the state ptrace touched */ | ||
808 | movq %rax,%rsp /* the handler's return value is used as the new stack pointer */ | ||
809 | testl $3,CS(%rsp) | ||
810 | jnz paranoid_userspace | ||
811 | jmp paranoid_exit | ||
812 | CFI_ENDPROC | ||
813 | |||
814 | /* runs on exception stack */ | ||
815 | ENTRY(nmi) | ||
816 | CFI_STARTPROC | ||
817 | pushq $-1 /* occupies the error code slot */ | ||
818 | CFI_ADJUST_CFA_OFFSET 8 | ||
819 | paranoidentry do_nmi | ||
820 | /* ebx: no swapgs flag */ | ||
821 | paranoid_exit: | ||
822 | testl %ebx,%ebx /* swapgs needed? */ | ||
823 | jnz paranoid_restore | ||
824 | paranoid_swapgs: | ||
825 | cli | ||
826 | swapgs | ||
827 | paranoid_restore: | ||
828 | RESTORE_ALL 8 | ||
829 | iretq | ||
830 | paranoid_userspace: | ||
831 | cli | ||
832 | GET_THREAD_INFO(%rcx) | ||
833 | movl threadinfo_flags(%rcx),%edx | ||
834 | testl $_TIF_NEED_RESCHED,%edx | ||
835 | jnz paranoid_resched | ||
836 | testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx | ||
837 | jnz paranoid_signal | ||
838 | jmp paranoid_swapgs /* no work pending: swapgs and return */ | ||
839 | paranoid_resched: | ||
840 | sti | ||
841 | call schedule | ||
842 | jmp paranoid_exit | ||
843 | paranoid_signal: | ||
844 | sti | ||
845 | xorl %esi,%esi /* oldset */ | ||
846 | movq %rsp,%rdi /* &pt_regs */ | ||
847 | call do_notify_resume | ||
848 | jmp paranoid_exit | ||
849 | CFI_ENDPROC | ||
850 | |||
851 | ENTRY(int3) /* each entry below forwards to its C handler through one of the entry macros */ | ||
852 | zeroentry do_int3 | ||
853 | |||
854 | ENTRY(overflow) | ||
855 | zeroentry do_overflow | ||
856 | |||
857 | ENTRY(bounds) | ||
858 | zeroentry do_bounds | ||
859 | |||
860 | ENTRY(invalid_op) | ||
861 | zeroentry do_invalid_op | ||
862 | |||
863 | ENTRY(coprocessor_segment_overrun) | ||
864 | zeroentry do_coprocessor_segment_overrun | ||
865 | |||
866 | ENTRY(reserved) | ||
867 | zeroentry do_reserved | ||
868 | |||
869 | /* runs on exception stack */ | ||
870 | ENTRY(double_fault) | ||
871 | CFI_STARTPROC | ||
872 | paranoidentry do_double_fault | ||
873 | movq %rax,%rsp /* the handler's return value is used as the new stack pointer */ | ||
874 | testl $3,CS(%rsp) | ||
875 | jnz paranoid_userspace | ||
876 | jmp paranoid_exit | ||
877 | CFI_ENDPROC | ||
878 | |||
879 | ENTRY(invalid_TSS) | ||
880 | errorentry do_invalid_TSS | ||
881 | |||
882 | ENTRY(segment_not_present) | ||
883 | errorentry do_segment_not_present | ||
884 | |||
885 | /* runs on exception stack */ | ||
886 | ENTRY(stack_segment) | ||
887 | CFI_STARTPROC | ||
888 | paranoidentry do_stack_segment | ||
889 | movq %rax,%rsp /* the handler's return value is used as the new stack pointer */ | ||
890 | testl $3,CS(%rsp) | ||
891 | jnz paranoid_userspace | ||
892 | jmp paranoid_exit | ||
893 | CFI_ENDPROC | ||
894 | |||
895 | ENTRY(general_protection) | ||
896 | errorentry do_general_protection | ||
897 | |||
898 | ENTRY(alignment_check) | ||
899 | errorentry do_alignment_check | ||
900 | |||
901 | ENTRY(divide_error) | ||
902 | zeroentry do_divide_error | ||
903 | |||
904 | ENTRY(spurious_interrupt_bug) | ||
905 | zeroentry do_spurious_interrupt_bug | ||
906 | |||
907 | #ifdef CONFIG_X86_MCE | ||
908 | /* runs on exception stack */ | ||
909 | ENTRY(machine_check) | ||
910 | CFI_STARTPROC | ||
911 | pushq $0 /* occupies the error code slot */ | ||
912 | CFI_ADJUST_CFA_OFFSET 8 | ||
913 | paranoidentry do_machine_check | ||
914 | jmp paranoid_exit | ||
915 | CFI_ENDPROC | ||
916 | #endif | ||
917 | |||
918 | ENTRY(call_debug) | ||
919 | zeroentry do_call_debug | ||
920 | |||
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c new file mode 100644 index 000000000000..d2c42fb99cb3 --- /dev/null +++ b/arch/x86_64/kernel/genapic.c | |||
@@ -0,0 +1,89 @@ | |||
1 | /* | ||
2 | * Copyright 2004 James Cleverdon, IBM. | ||
3 | * Subject to the GNU Public License, v.2 | ||
4 | * | ||
5 | * Generic APIC sub-arch probe layer. | ||
6 | * | ||
7 | * Hacked for x86-64 by James Cleverdon from i386 architecture code by | ||
8 | * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and | ||
9 | * James Cleverdon. | ||
10 | */ | ||
11 | #include <linux/config.h> | ||
12 | #include <linux/threads.h> | ||
13 | #include <linux/cpumask.h> | ||
14 | #include <linux/string.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/ctype.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <linux/module.h> | ||
19 | |||
20 | #include <asm/smp.h> | ||
21 | #include <asm/ipi.h> | ||
22 | |||
23 | /* which logical CPU number maps to which CPU (physical APIC ID) */ | ||
24 | u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; /* every slot starts out as BAD_APICID */ | ||
25 | EXPORT_SYMBOL(x86_cpu_to_apicid); | ||
26 | u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; /* presumably the logical APIC ID per CPU; not exported */ | ||
27 | |||
28 | extern struct genapic apic_cluster; | ||
29 | extern struct genapic apic_flat; | ||
30 | |||
31 | struct genapic *genapic = &apic_flat; /* active routing driver; flat until clustered_apic_check() decides otherwise */ | ||
32 | |||
33 | |||
34 | /* | ||
35 | * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. | ||
36 | */ | ||
37 | void __init clustered_apic_check(void) | ||
38 | { | ||
39 | long i; | ||
40 | u8 clusters, max_cluster; | ||
41 | u8 id; | ||
42 | u8 cluster_cnt[NUM_APIC_CLUSTERS]; | ||
43 | |||
44 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { | ||
45 | /* AMD always uses flat mode right now */ | ||
46 | genapic = &apic_flat; | ||
47 | goto print; | ||
48 | } | ||
49 | |||
50 | memset(cluster_cnt, 0, sizeof(cluster_cnt)); | ||
51 | |||
52 | for (i = 0; i < NR_CPUS; i++) { | ||
53 | id = bios_cpu_apicid[i]; | ||
54 | if (id != BAD_APICID) | ||
55 | cluster_cnt[APIC_CLUSTERID(id)]++; | ||
56 | } | ||
57 | |||
58 | clusters = 0; | ||
59 | max_cluster = 0; | ||
60 | for (i = 0; i < NUM_APIC_CLUSTERS; i++) { | ||
61 | if (cluster_cnt[i] > 0) { | ||
62 | ++clusters; | ||
63 | if (cluster_cnt[i] > max_cluster) | ||
64 | max_cluster = cluster_cnt[i]; | ||
65 | } | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode, | ||
70 | * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical | ||
71 | * else physical mode. | ||
72 | * (We don't use lowest priority delivery + HW APIC IRQ steering, so | ||
73 | * can ignore the clustered logical case and go straight to physical.) | ||
74 | */ | ||
75 | if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) | ||
76 | genapic = &apic_flat; | ||
77 | else | ||
78 | genapic = &apic_cluster; | ||
79 | |||
80 | print: | ||
81 | printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); | ||
82 | } | ||
83 | |||
84 | /* Same for both flat and clustered: send @vector using the APIC_DEST_SELF shortcut. */ | ||
85 | |||
86 | void send_IPI_self(int vector) | ||
87 | { | ||
88 | __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); | ||
89 | } | ||
diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c new file mode 100644 index 000000000000..9703da7202e3 --- /dev/null +++ b/arch/x86_64/kernel/genapic_cluster.c | |||
@@ -0,0 +1,130 @@ | |||
1 | /* | ||
2 | * Copyright 2004 James Cleverdon, IBM. | ||
3 | * Subject to the GNU Public License, v.2 | ||
4 | * | ||
5 | * Clustered APIC subarch code. Up to 255 CPUs, physical delivery. | ||
6 | * (A more realistic maximum is around 230 CPUs.) | ||
7 | * | ||
8 | * Hacked for x86-64 by James Cleverdon from i386 architecture code by | ||
9 | * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and | ||
10 | * James Cleverdon. | ||
11 | */ | ||
12 | #include <linux/config.h> | ||
13 | #include <linux/threads.h> | ||
14 | #include <linux/cpumask.h> | ||
15 | #include <linux/string.h> | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/ctype.h> | ||
18 | #include <linux/init.h> | ||
19 | #include <asm/smp.h> | ||
20 | #include <asm/ipi.h> | ||
21 | |||
22 | |||
23 | /* | ||
24 | * Set up the logical destination ID. | ||
25 | * | ||
26 | * Intel recommends to set DFR, LDR and TPR before enabling | ||
27 | * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel | ||
28 | * document number 292116). So here it goes... | ||
29 | */ | ||
30 | static void cluster_init_apic_ldr(void) | ||
31 | { | ||
32 | unsigned long val, id; | ||
33 | long i, count; | ||
34 | u8 lid; | ||
35 | u8 my_id = hard_smp_processor_id(); | ||
36 | u8 my_cluster = APIC_CLUSTER(my_id); | ||
37 | |||
38 | /* Create logical APIC IDs by counting CPUs already in cluster. */ | ||
39 | for (count = 0, i = NR_CPUS; --i >= 0; ) { | ||
40 | lid = x86_cpu_to_log_apicid[i]; | ||
41 | if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) | ||
42 | ++count; | ||
43 | } | ||
44 | /* | ||
45 | * We only have a 4 wide bitmap in cluster mode. There's no way | ||
46 | * to get above 60 CPUs and still give each one it's own bit. | ||
47 | * But, we're using physical IRQ delivery, so we don't care. | ||
48 | * Use bit 3 for the 4th through Nth CPU in each cluster. | ||
49 | */ | ||
50 | if (count >= XAPIC_DEST_CPUS_SHIFT) | ||
51 | count = 3; | ||
52 | id = my_cluster | (1UL << count); | ||
53 | x86_cpu_to_log_apicid[smp_processor_id()] = id; | ||
54 | apic_write_around(APIC_DFR, APIC_DFR_CLUSTER); | ||
55 | val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; | ||
56 | val |= SET_APIC_LOGICAL_ID(id); | ||
57 | apic_write_around(APIC_LDR, val); | ||
58 | } | ||
59 | |||
60 | /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ | ||
61 | |||
62 | static cpumask_t cluster_target_cpus(void) | ||
63 | { | ||
64 | return cpumask_of_cpu(0); | ||
65 | } | ||
66 | |||
67 | static void cluster_send_IPI_mask(cpumask_t mask, int vector) | ||
68 | { | ||
69 | send_IPI_mask_sequence(mask, vector); | ||
70 | } | ||
71 | |||
72 | static void cluster_send_IPI_allbutself(int vector) | ||
73 | { | ||
74 | cpumask_t mask = cpu_online_map; | ||
75 | cpu_clear(smp_processor_id(), mask); | ||
76 | |||
77 | if (!cpus_empty(mask)) | ||
78 | cluster_send_IPI_mask(mask, vector); | ||
79 | } | ||
80 | |||
81 | static void cluster_send_IPI_all(int vector) | ||
82 | { | ||
83 | cluster_send_IPI_mask(cpu_online_map, vector); | ||
84 | } | ||
85 | |||
/* Clustered mode performs no registration check: always reports success. */
static int cluster_apic_id_registered(void)
{
	return 1;
}
90 | |||
91 | static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask) | ||
92 | { | ||
93 | int cpu; | ||
94 | |||
95 | /* | ||
96 | * We're using fixed IRQ delivery, can only return one phys APIC ID. | ||
97 | * May as well be the first. | ||
98 | */ | ||
99 | cpu = first_cpu(cpumask); | ||
100 | if ((unsigned)cpu < NR_CPUS) | ||
101 | return x86_cpu_to_apicid[cpu]; | ||
102 | else | ||
103 | return BAD_APICID; | ||
104 | } | ||
105 | |||
106 | /* cpuid returns the value latched in the HW at reset, not the APIC ID | ||
107 | * register's value. For any box whose BIOS changes APIC IDs, like | ||
108 | * clustered APIC systems, we must use hard_smp_processor_id. | ||
109 | * | ||
110 | * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID. | ||
111 | */ | ||
/* Package ID: hard_smp_processor_id() shifted right by @index_msb bits. */
static unsigned int phys_pkg_id(int index_msb)
{
	return hard_smp_processor_id() >> index_msb;
}
116 | |||
117 | struct genapic apic_cluster = { | ||
118 | .name = "clustered", | ||
119 | .int_delivery_mode = dest_Fixed, | ||
120 | .int_dest_mode = (APIC_DEST_PHYSICAL != 0), | ||
121 | .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED, | ||
122 | .target_cpus = cluster_target_cpus, | ||
123 | .apic_id_registered = cluster_apic_id_registered, | ||
124 | .init_apic_ldr = cluster_init_apic_ldr, | ||
125 | .send_IPI_all = cluster_send_IPI_all, | ||
126 | .send_IPI_allbutself = cluster_send_IPI_allbutself, | ||
127 | .send_IPI_mask = cluster_send_IPI_mask, | ||
128 | .cpu_mask_to_apicid = cluster_cpu_mask_to_apicid, | ||
129 | .phys_pkg_id = phys_pkg_id, | ||
130 | }; | ||
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c new file mode 100644 index 000000000000..b4cbbad04226 --- /dev/null +++ b/arch/x86_64/kernel/genapic_flat.c | |||
@@ -0,0 +1,127 @@ | |||
1 | /* | ||
2 | * Copyright 2004 James Cleverdon, IBM. | ||
3 | * Subject to the GNU Public License, v.2 | ||
4 | * | ||
5 | * Flat APIC subarch code. Maximum 8 CPUs, logical delivery. | ||
6 | * | ||
7 | * Hacked for x86-64 by James Cleverdon from i386 architecture code by | ||
8 | * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and | ||
9 | * James Cleverdon. | ||
10 | */ | ||
11 | #include <linux/config.h> | ||
12 | #include <linux/threads.h> | ||
13 | #include <linux/cpumask.h> | ||
14 | #include <linux/string.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/ctype.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <asm/smp.h> | ||
19 | #include <asm/ipi.h> | ||
20 | |||
21 | |||
22 | static cpumask_t flat_target_cpus(void) | ||
23 | { | ||
24 | return cpu_online_map; | ||
25 | } | ||
26 | |||
27 | /* | ||
28 | * Set up the logical destination ID. | ||
29 | * | ||
30 | * Intel recommends to set DFR, LDR and TPR before enabling | ||
31 | * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel | ||
32 | * document number 292116). So here it goes... | ||
33 | */ | ||
34 | static void flat_init_apic_ldr(void) | ||
35 | { | ||
36 | unsigned long val; | ||
37 | unsigned long num, id; | ||
38 | |||
39 | num = smp_processor_id(); | ||
40 | id = 1UL << num; | ||
41 | x86_cpu_to_log_apicid[num] = id; | ||
42 | apic_write_around(APIC_DFR, APIC_DFR_FLAT); | ||
43 | val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; | ||
44 | val |= SET_APIC_LOGICAL_ID(id); | ||
45 | apic_write_around(APIC_LDR, val); | ||
46 | } | ||
47 | |||
48 | static void flat_send_IPI_allbutself(int vector) | ||
49 | { | ||
50 | /* | ||
51 | * if there are no other CPUs in the system then | ||
52 | * we get an APIC send error if we try to broadcast. | ||
53 | * thus we have to avoid sending IPIs in this case. | ||
54 | */ | ||
55 | if (num_online_cpus() > 1) | ||
56 | __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); | ||
57 | } | ||
58 | |||
59 | static void flat_send_IPI_all(int vector) | ||
60 | { | ||
61 | __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); | ||
62 | } | ||
63 | |||
64 | static void flat_send_IPI_mask(cpumask_t cpumask, int vector) | ||
65 | { | ||
66 | unsigned long mask = cpus_addr(cpumask)[0]; | ||
67 | unsigned long cfg; | ||
68 | unsigned long flags; | ||
69 | |||
70 | local_save_flags(flags); | ||
71 | local_irq_disable(); | ||
72 | |||
73 | /* | ||
74 | * Wait for idle. | ||
75 | */ | ||
76 | apic_wait_icr_idle(); | ||
77 | |||
78 | /* | ||
79 | * prepare target chip field | ||
80 | */ | ||
81 | cfg = __prepare_ICR2(mask); | ||
82 | apic_write_around(APIC_ICR2, cfg); | ||
83 | |||
84 | /* | ||
85 | * program the ICR | ||
86 | */ | ||
87 | cfg = __prepare_ICR(0, vector, APIC_DEST_LOGICAL); | ||
88 | |||
89 | /* | ||
90 | * Send the IPI. The write to APIC_ICR fires this off. | ||
91 | */ | ||
92 | apic_write_around(APIC_ICR, cfg); | ||
93 | local_irq_restore(flags); | ||
94 | } | ||
95 | |||
96 | static int flat_apic_id_registered(void) | ||
97 | { | ||
98 | return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); | ||
99 | } | ||
100 | |||
101 | static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) | ||
102 | { | ||
103 | return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; | ||
104 | } | ||
105 | |||
106 | static unsigned int phys_pkg_id(int index_msb) | ||
107 | { | ||
108 | u32 ebx; | ||
109 | |||
110 | ebx = cpuid_ebx(1); | ||
111 | return ((ebx >> 24) & 0xFF) >> index_msb; | ||
112 | } | ||
113 | |||
114 | struct genapic apic_flat = { | ||
115 | .name = "flat", | ||
116 | .int_delivery_mode = dest_LowestPrio, | ||
117 | .int_dest_mode = (APIC_DEST_LOGICAL != 0), | ||
118 | .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST, | ||
119 | .target_cpus = flat_target_cpus, | ||
120 | .apic_id_registered = flat_apic_id_registered, | ||
121 | .init_apic_ldr = flat_init_apic_ldr, | ||
122 | .send_IPI_all = flat_send_IPI_all, | ||
123 | .send_IPI_allbutself = flat_send_IPI_allbutself, | ||
124 | .send_IPI_mask = flat_send_IPI_mask, | ||
125 | .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, | ||
126 | .phys_pkg_id = phys_pkg_id, | ||
127 | }; | ||
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S new file mode 100644 index 000000000000..b6d8725c1f61 --- /dev/null +++ b/arch/x86_64/kernel/head.S | |||
@@ -0,0 +1,396 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit | ||
3 | * | ||
4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE | ||
5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | ||
6 | * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> | ||
7 | * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> | ||
8 | * | ||
9 | * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $ | ||
10 | */ | ||
11 | |||
12 | |||
13 | #include <linux/linkage.h> | ||
14 | #include <linux/threads.h> | ||
15 | #include <asm/desc.h> | ||
16 | #include <asm/segment.h> | ||
17 | #include <asm/page.h> | ||
18 | #include <asm/msr.h> | ||
19 | #include <asm/cache.h> | ||
20 | |||
21 | /* we are not able to switch in one step to the final KERNEL ADDRESS SPACE | ||
22 | * because we need identity-mapped pages on setup so define __START_KERNEL to | ||
23 | * 0x100000 for this stage | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | .text | ||
28 | .code32 | ||
29 | .globl startup_32 | ||
30 | /* %bx: 1 if coming from smp trampoline on secondary cpu */ | ||
31 | startup_32: | ||
32 | |||
33 | /* | ||
34 | * At this point the CPU runs in 32bit protected mode (CS.D = 1) with | ||
35 | * paging disabled and the point of this file is to switch to 64bit | ||
36 | * long mode with a kernel mapping for kernel and to jump into the | ||
37 | * kernel virtual addresses. | ||
38 | * There is no stack until we set one up. | ||
39 | */ | ||
40 | |||
41 | /* Initialize the %ds segment register */ | ||
42 | movl $__KERNEL_DS,%eax | ||
43 | movl %eax,%ds | ||
44 | |||
45 | /* Load new GDT with the 64bit segments using 32bit descriptor */ | ||
46 | lgdt pGDT32 - __START_KERNEL_map | ||
47 | |||
48 | /* If the CPU doesn't support CPUID this will double fault. | ||
49 | * Unfortunately it is hard to check for CPUID without a stack. | ||
50 | */ | ||
51 | |||
52 | /* Check if extended functions are implemented */ | ||
53 | movl $0x80000000, %eax | ||
54 | cpuid | ||
55 | cmpl $0x80000000, %eax | ||
56 | jbe no_long_mode | ||
57 | /* Check if long mode is implemented */ | ||
58 | mov $0x80000001, %eax | ||
59 | cpuid | ||
60 | btl $29, %edx | ||
61 | jnc no_long_mode | ||
62 | |||
63 | /* | ||
64 | * Prepare for entering 64bits mode | ||
65 | */ | ||
66 | |||
67 | /* Enable PAE mode */ | ||
68 | xorl %eax, %eax | ||
69 | btsl $5, %eax | ||
70 | movl %eax, %cr4 | ||
71 | |||
72 | /* Setup early boot stage 4 level pagetables */ | ||
73 | movl $(init_level4_pgt - __START_KERNEL_map), %eax | ||
74 | movl %eax, %cr3 | ||
75 | |||
76 | /* Setup EFER (Extended Feature Enable Register) */ | ||
77 | movl $MSR_EFER, %ecx | ||
78 | rdmsr | ||
79 | |||
80 | /* Enable Long Mode */ | ||
81 | btsl $_EFER_LME, %eax | ||
82 | |||
83 | /* Make changes effective */ | ||
84 | wrmsr | ||
85 | |||
86 | xorl %eax, %eax | ||
87 | btsl $31, %eax /* Enable paging and in turn activate Long Mode */ | ||
88 | btsl $0, %eax /* Enable protected mode */ | ||
89 | /* Make changes effective */ | ||
90 | movl %eax, %cr0 | ||
91 | /* | ||
92 | * At this point we're in long mode but in 32bit compatibility mode | ||
93 | * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn | ||
94 | * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use | ||
95 | * the new gdt/idt that has __KERNEL_CS with CS.L = 1. | ||
96 | */ | ||
97 | ljmp $__KERNEL_CS, $(startup_64 - __START_KERNEL_map) | ||
98 | |||
99 | .code64 | ||
100 | .org 0x100 | ||
101 | .globl startup_64 | ||
102 | startup_64: | ||
103 | /* We come here either from startup_32 | ||
104 | * or directly from a 64bit bootloader. | ||
105 | * Since we may have come directly from a bootloader we | ||
106 | * reload the page tables here. | ||
107 | */ | ||
108 | |||
109 | /* Enable PAE mode and PGE */ | ||
110 | xorq %rax, %rax | ||
111 | btsq $5, %rax | ||
112 | btsq $7, %rax | ||
113 | movq %rax, %cr4 | ||
114 | |||
115 | /* Setup early boot stage 4 level pagetables. */ | ||
116 | movq $(init_level4_pgt - __START_KERNEL_map), %rax | ||
117 | movq %rax, %cr3 | ||
118 | |||
119 | /* Check if nx is implemented */ | ||
120 | movl $0x80000001, %eax | ||
121 | cpuid | ||
122 | movl %edx,%edi | ||
123 | |||
124 | /* Setup EFER (Extended Feature Enable Register) */ | ||
125 | movl $MSR_EFER, %ecx | ||
126 | rdmsr | ||
127 | |||
128 | /* Enable System Call */ | ||
129 | btsl $_EFER_SCE, %eax | ||
130 | |||
131 | /* No Execute supported? */ | ||
132 | btl $20,%edi | ||
133 | jnc 1f | ||
134 | btsl $_EFER_NX, %eax | ||
135 | 1: | ||
136 | /* Make changes effective */ | ||
137 | wrmsr | ||
138 | |||
139 | /* Setup cr0 */ | ||
140 | xorq %rax, %rax | ||
141 | btsq $31, %rax /* Enable paging */ | ||
142 | btsq $0, %rax /* Enable protected mode */ | ||
143 | btsq $1, %rax /* Enable MP */ | ||
144 | btsq $4, %rax /* Enable ET */ | ||
145 | btsq $5, %rax /* Enable NE */ | ||
146 | btsq $16, %rax /* Enable WP */ | ||
147 | btsq $18, %rax /* Enable AM */ | ||
148 | /* Make changes effective */ | ||
149 | movq %rax, %cr0 | ||
150 | |||
151 | /* Setup a boot time stack */ | ||
152 | movq init_rsp(%rip),%rsp | ||
153 | |||
154 | /* zero EFLAGS after setting rsp */ | ||
155 | pushq $0 | ||
156 | popfq | ||
157 | |||
158 | /* | ||
159 | * We must switch to a new descriptor in kernel space for the GDT | ||
160 | * because soon the kernel won't have access anymore to the userspace | ||
161 | * addresses where we're currently running on. We have to do that here | ||
162 | * because in 32bit we couldn't load a 64bit linear address. | ||
163 | */ | ||
164 | lgdt cpu_gdt_descr | ||
165 | |||
166 | /* | ||
167 | * Set up a dummy PDA. This is just for some early bootup code | ||
168 | * that does in_interrupt() | ||
169 | */ | ||
170 | movl $MSR_GS_BASE,%ecx | ||
171 | movq $empty_zero_page,%rax | ||
172 | movq %rax,%rdx | ||
173 | shrq $32,%rdx | ||
174 | wrmsr | ||
175 | |||
176 | /* set up data segments. actually 0 would do too */ | ||
177 | movl $__KERNEL_DS,%eax | ||
178 | movl %eax,%ds | ||
179 | movl %eax,%ss | ||
180 | movl %eax,%es | ||
181 | |||
182 | /* esi is pointer to real mode structure with interesting info. | ||
183 | pass it to C */ | ||
184 | movl %esi, %edi | ||
185 | |||
186 | /* Finally jump to run C code and to be on real kernel address | ||
187 | * Since we are running on identity-mapped space we have to jump | ||
188 | * to the full 64bit address; this is only possible as an indirect | ||
189 | * jump. | ||
190 | */ | ||
191 | movq initial_code(%rip),%rax | ||
192 | jmp *%rax | ||
193 | |||
194 | /* SMP bootup changes these two */ | ||
195 | .globl initial_code | ||
196 | initial_code: | ||
197 | .quad x86_64_start_kernel | ||
198 | .globl init_rsp | ||
199 | init_rsp: | ||
200 | .quad init_thread_union+THREAD_SIZE-8 | ||
201 | |||
202 | ENTRY(early_idt_handler) | ||
203 | xorl %eax,%eax | ||
204 | movq 8(%rsp),%rsi # get rip | ||
205 | movq (%rsp),%rdx | ||
206 | movq %cr2,%rcx | ||
207 | leaq early_idt_msg(%rip),%rdi | ||
208 | call early_printk | ||
209 | 1: hlt | ||
210 | jmp 1b | ||
211 | |||
212 | early_idt_msg: | ||
213 | .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n" | ||
214 | |||
215 | .code32 | ||
216 | ENTRY(no_long_mode) | ||
217 | /* This isn't an x86-64 CPU so hang */ | ||
218 | 1: | ||
219 | jmp 1b | ||
220 | |||
221 | .org 0xf00 | ||
222 | .globl pGDT32 | ||
223 | pGDT32: | ||
224 | .word gdt_end-cpu_gdt_table | ||
225 | .long cpu_gdt_table-__START_KERNEL_map | ||
226 | |||
227 | .org 0xf10 | ||
228 | ljumpvector: | ||
229 | .long startup_64-__START_KERNEL_map | ||
230 | .word __KERNEL_CS | ||
231 | |||
232 | ENTRY(stext) | ||
233 | ENTRY(_stext) | ||
234 | |||
235 | /* | ||
236 | * This default setting generates an ident mapping at address 0x100000 | ||
237 | * and a mapping for the kernel that precisely maps virtual address | ||
238 | * 0xffffffff80000000 to physical address 0x000000. (always using | ||
239 | * 2Mbyte large pages provided by PAE mode) | ||
240 | */ | ||
241 | .org 0x1000 | ||
242 | ENTRY(init_level4_pgt) | ||
243 | .quad 0x0000000000102007 /* -> level3_ident_pgt */ | ||
244 | .fill 255,8,0 | ||
245 | .quad 0x000000000010a007 | ||
246 | .fill 254,8,0 | ||
247 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ | ||
248 | .quad 0x0000000000103007 /* -> level3_kernel_pgt */ | ||
249 | |||
250 | .org 0x2000 | ||
251 | ENTRY(level3_ident_pgt) | ||
252 | .quad 0x0000000000104007 | ||
253 | .fill 511,8,0 | ||
254 | |||
255 | .org 0x3000 | ||
256 | ENTRY(level3_kernel_pgt) | ||
257 | .fill 510,8,0 | ||
258 | /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ | ||
259 | .quad 0x0000000000105007 /* -> level2_kernel_pgt */ | ||
260 | .fill 1,8,0 | ||
261 | |||
262 | .org 0x4000 | ||
263 | ENTRY(level2_ident_pgt) | ||
264 | /* 40MB for bootup. */ | ||
265 | .quad 0x0000000000000283 | ||
266 | .quad 0x0000000000200183 | ||
267 | .quad 0x0000000000400183 | ||
268 | .quad 0x0000000000600183 | ||
269 | .quad 0x0000000000800183 | ||
270 | .quad 0x0000000000A00183 | ||
271 | .quad 0x0000000000C00183 | ||
272 | .quad 0x0000000000E00183 | ||
273 | .quad 0x0000000001000183 | ||
274 | .quad 0x0000000001200183 | ||
275 | .quad 0x0000000001400183 | ||
276 | .quad 0x0000000001600183 | ||
277 | .quad 0x0000000001800183 | ||
278 | .quad 0x0000000001A00183 | ||
279 | .quad 0x0000000001C00183 | ||
280 | .quad 0x0000000001E00183 | ||
281 | .quad 0x0000000002000183 | ||
282 | .quad 0x0000000002200183 | ||
283 | .quad 0x0000000002400183 | ||
284 | .quad 0x0000000002600183 | ||
285 | /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */ | ||
286 | .globl temp_boot_pmds | ||
287 | temp_boot_pmds: | ||
288 | .fill 492,8,0 | ||
289 | |||
290 | .org 0x5000 | ||
291 | ENTRY(level2_kernel_pgt) | ||
292 | /* 40MB kernel mapping. The kernel code cannot be bigger than that. | ||
293 | When you change this change KERNEL_TEXT_SIZE in page.h too. */ | ||
294 | /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ | ||
295 | .quad 0x0000000000000183 | ||
296 | .quad 0x0000000000200183 | ||
297 | .quad 0x0000000000400183 | ||
298 | .quad 0x0000000000600183 | ||
299 | .quad 0x0000000000800183 | ||
300 | .quad 0x0000000000A00183 | ||
301 | .quad 0x0000000000C00183 | ||
302 | .quad 0x0000000000E00183 | ||
303 | .quad 0x0000000001000183 | ||
304 | .quad 0x0000000001200183 | ||
305 | .quad 0x0000000001400183 | ||
306 | .quad 0x0000000001600183 | ||
307 | .quad 0x0000000001800183 | ||
308 | .quad 0x0000000001A00183 | ||
309 | .quad 0x0000000001C00183 | ||
310 | .quad 0x0000000001E00183 | ||
311 | .quad 0x0000000002000183 | ||
312 | .quad 0x0000000002200183 | ||
313 | .quad 0x0000000002400183 | ||
314 | .quad 0x0000000002600183 | ||
315 | /* Module mapping starts here */ | ||
316 | .fill 492,8,0 | ||
317 | |||
318 | .org 0x6000 | ||
319 | ENTRY(empty_zero_page) | ||
320 | |||
321 | .org 0x7000 | ||
322 | ENTRY(empty_bad_page) | ||
323 | |||
324 | .org 0x8000 | ||
325 | ENTRY(empty_bad_pte_table) | ||
326 | |||
327 | .org 0x9000 | ||
328 | ENTRY(empty_bad_pmd_table) | ||
329 | |||
330 | .org 0xa000 | ||
331 | ENTRY(level3_physmem_pgt) | ||
332 | .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ | ||
333 | |||
334 | .org 0xb000 | ||
335 | #ifdef CONFIG_ACPI_SLEEP | ||
336 | ENTRY(wakeup_level4_pgt) | ||
337 | .quad 0x0000000000102007 /* -> level3_ident_pgt */ | ||
338 | .fill 255,8,0 | ||
339 | .quad 0x000000000010a007 | ||
340 | .fill 254,8,0 | ||
341 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ | ||
342 | .quad 0x0000000000103007 /* -> level3_kernel_pgt */ | ||
343 | #endif | ||
344 | |||
345 | .data | ||
346 | |||
347 | .align 16 | ||
348 | .globl cpu_gdt_descr | ||
349 | cpu_gdt_descr: | ||
350 | .word gdt_end-cpu_gdt_table | ||
351 | gdt: | ||
352 | .quad cpu_gdt_table | ||
353 | #ifdef CONFIG_SMP | ||
354 | .rept NR_CPUS-1 | ||
355 | .word 0 | ||
356 | .quad 0 | ||
357 | .endr | ||
358 | #endif | ||
359 | |||
360 | /* We need valid kernel segments for data and code in long mode too | ||
361 | * IRET will check the segment types kkeil 2000/10/28 | ||
362 | * Also sysret mandates a special GDT layout | ||
363 | */ | ||
364 | |||
365 | .align L1_CACHE_BYTES | ||
366 | |||
367 | /* The TLS descriptors are currently at a different place compared to i386. | ||
368 | Hopefully nobody expects them at a fixed place (Wine?) */ | ||
369 | |||
370 | ENTRY(cpu_gdt_table) | ||
371 | .quad 0x0000000000000000 /* NULL descriptor */ | ||
372 | .quad 0x008f9a000000ffff /* __KERNEL_COMPAT32_CS */ | ||
373 | .quad 0x00af9a000000ffff /* __KERNEL_CS */ | ||
374 | .quad 0x00cf92000000ffff /* __KERNEL_DS */ | ||
375 | .quad 0x00cffa000000ffff /* __USER32_CS */ | ||
376 | .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ | ||
377 | .quad 0x00affa000000ffff /* __USER_CS */ | ||
378 | .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ | ||
379 | .quad 0,0 /* TSS */ | ||
380 | .quad 0,0 /* LDT */ | ||
381 | .quad 0,0,0 /* three TLS descriptors */ | ||
382 | .quad 0x00009a000000ffff /* __KERNEL16_CS - 16bit PM for S3 wakeup. */ | ||
383 | /* base must be patched for real base address. */ | ||
384 | gdt_end: | ||
385 | /* asm/segment.h:GDT_ENTRIES must match this */ | ||
386 | /* This should be a multiple of the cache line size */ | ||
387 | /* GDTs of other CPUs: */ | ||
388 | .fill (GDT_SIZE * NR_CPUS) - (gdt_end - cpu_gdt_table) | ||
389 | |||
390 | .align L1_CACHE_BYTES | ||
391 | ENTRY(idt_table) | ||
392 | .rept 256 | ||
393 | .quad 0 | ||
394 | .quad 0 | ||
395 | .endr | ||
396 | |||
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c new file mode 100644 index 000000000000..6cad46c98a23 --- /dev/null +++ b/arch/x86_64/kernel/head64.c | |||
@@ -0,0 +1,117 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/head64.c -- prepare to run common code | ||
3 | * | ||
4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE | ||
5 | * | ||
6 | * $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $ | ||
7 | */ | ||
8 | |||
9 | #include <linux/init.h> | ||
10 | #include <linux/linkage.h> | ||
11 | #include <linux/types.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/percpu.h> | ||
15 | |||
16 | #include <asm/processor.h> | ||
17 | #include <asm/proto.h> | ||
18 | #include <asm/smp.h> | ||
19 | #include <asm/bootsetup.h> | ||
20 | #include <asm/setup.h> | ||
21 | #include <asm/desc.h> | ||
22 | |||
23 | /* Don't add a printk in there. printk relies on the PDA which is not initialized | ||
24 | yet. */ | ||
25 | static void __init clear_bss(void) | ||
26 | { | ||
27 | extern char __bss_start[], __bss_end[]; | ||
28 | memset(__bss_start, 0, | ||
29 | (unsigned long) __bss_end - (unsigned long) __bss_start); | ||
30 | } | ||
31 | |||
32 | extern char x86_boot_params[2048]; | ||
33 | |||
34 | #define NEW_CL_POINTER 0x228 /* Relative to real mode data */ | ||
35 | #define OLD_CL_MAGIC_ADDR 0x90020 | ||
36 | #define OLD_CL_MAGIC 0xA33F | ||
37 | #define OLD_CL_BASE_ADDR 0x90000 | ||
38 | #define OLD_CL_OFFSET 0x90022 | ||
39 | |||
40 | extern char saved_command_line[]; | ||
41 | |||
42 | static void __init copy_bootdata(char *real_mode_data) | ||
43 | { | ||
44 | int new_data; | ||
45 | char * command_line; | ||
46 | |||
47 | memcpy(x86_boot_params, real_mode_data, 2048); | ||
48 | new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); | ||
49 | if (!new_data) { | ||
50 | if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { | ||
51 | printk("so old bootloader that it does not support commandline?!\n"); | ||
52 | return; | ||
53 | } | ||
54 | new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; | ||
55 | printk("old bootloader convention, maybe loadlin?\n"); | ||
56 | } | ||
57 | command_line = (char *) ((u64)(new_data)); | ||
58 | memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE); | ||
59 | printk("Bootdata ok (command line is %s)\n", saved_command_line); | ||
60 | } | ||
61 | |||
62 | static void __init setup_boot_cpu_data(void) | ||
63 | { | ||
64 | unsigned int dummy, eax; | ||
65 | |||
66 | /* get vendor info */ | ||
67 | cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level, | ||
68 | (unsigned int *)&boot_cpu_data.x86_vendor_id[0], | ||
69 | (unsigned int *)&boot_cpu_data.x86_vendor_id[8], | ||
70 | (unsigned int *)&boot_cpu_data.x86_vendor_id[4]); | ||
71 | |||
72 | /* get cpu type */ | ||
73 | cpuid(1, &eax, &dummy, &dummy, | ||
74 | (unsigned int *) &boot_cpu_data.x86_capability); | ||
75 | boot_cpu_data.x86 = (eax >> 8) & 0xf; | ||
76 | boot_cpu_data.x86_model = (eax >> 4) & 0xf; | ||
77 | boot_cpu_data.x86_mask = eax & 0xf; | ||
78 | } | ||
79 | |||
80 | extern char _end[]; | ||
81 | |||
82 | void __init x86_64_start_kernel(char * real_mode_data) | ||
83 | { | ||
84 | char *s; | ||
85 | int i; | ||
86 | |||
87 | for (i = 0; i < 256; i++) | ||
88 | set_intr_gate(i, early_idt_handler); | ||
89 | asm volatile("lidt %0" :: "m" (idt_descr)); | ||
90 | clear_bss(); | ||
91 | pda_init(0); | ||
92 | copy_bootdata(real_mode_data); | ||
93 | #ifdef CONFIG_SMP | ||
94 | cpu_set(0, cpu_online_map); | ||
95 | #endif | ||
96 | /* default console: */ | ||
97 | if (!strstr(saved_command_line, "console=")) | ||
98 | strcat(saved_command_line, " console=tty0"); | ||
99 | s = strstr(saved_command_line, "earlyprintk="); | ||
100 | if (s != NULL) | ||
101 | setup_early_printk(s); | ||
102 | #ifdef CONFIG_DISCONTIGMEM | ||
103 | s = strstr(saved_command_line, "numa="); | ||
104 | if (s != NULL) | ||
105 | numa_setup(s+5); | ||
106 | #endif | ||
107 | #ifdef CONFIG_X86_IO_APIC | ||
108 | if (strstr(saved_command_line, "disableapic")) | ||
109 | disable_apic = 1; | ||
110 | #endif | ||
111 | /* You need early console to see that */ | ||
112 | if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) | ||
113 | panic("Kernel too big for kernel mapping\n"); | ||
114 | |||
115 | setup_boot_cpu_data(); | ||
116 | start_kernel(); | ||
117 | } | ||
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c new file mode 100644 index 000000000000..ba139cac57ce --- /dev/null +++ b/arch/x86_64/kernel/i387.c | |||
@@ -0,0 +1,155 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/i387.c | ||
3 | * | ||
4 | * Copyright (C) 1994 Linus Torvalds | ||
5 | * Copyright (C) 2002 Andi Kleen, SuSE Labs | ||
6 | * | ||
7 | * Pentium III FXSR, SSE support | ||
8 | * General FPU state handling cleanups | ||
9 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
10 | * | ||
11 | * x86-64 rework 2002 Andi Kleen. | ||
12 | * Does direct fxsave in and out of user space now for signal handlers. | ||
13 | * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation, | ||
14 | * the 64bit user space sees a FXSAVE frame directly. | ||
15 | */ | ||
16 | |||
17 | #include <linux/config.h> | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <asm/processor.h> | ||
21 | #include <asm/i387.h> | ||
22 | #include <asm/sigcontext.h> | ||
23 | #include <asm/user.h> | ||
24 | #include <asm/ptrace.h> | ||
25 | #include <asm/uaccess.h> | ||
26 | |||
27 | unsigned int mxcsr_feature_mask = 0xffffffff; | ||
28 | |||
29 | void mxcsr_feature_mask_init(void) | ||
30 | { | ||
31 | unsigned int mask; | ||
32 | clts(); | ||
33 | memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); | ||
34 | asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); | ||
35 | mask = current->thread.i387.fxsave.mxcsr_mask; | ||
36 | if (mask == 0) mask = 0x0000ffbf; | ||
37 | mxcsr_feature_mask &= mask; | ||
38 | stts(); | ||
39 | } | ||
40 | |||
41 | /* | ||
42 | * Called at bootup to set up the initial FPU state that is later cloned | ||
43 | * into all processes. | ||
44 | */ | ||
45 | void __init fpu_init(void) | ||
46 | { | ||
47 | unsigned long oldcr0 = read_cr0(); | ||
48 | extern void __bad_fxsave_alignment(void); | ||
49 | |||
50 | if (offsetof(struct task_struct, thread.i387.fxsave) & 15) | ||
51 | __bad_fxsave_alignment(); | ||
52 | set_in_cr4(X86_CR4_OSFXSR); | ||
53 | set_in_cr4(X86_CR4_OSXMMEXCPT); | ||
54 | |||
55 | write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */ | ||
56 | |||
57 | mxcsr_feature_mask_init(); | ||
58 | /* clean state in init */ | ||
59 | current_thread_info()->status = 0; | ||
60 | clear_used_math(); | ||
61 | } | ||
62 | |||
63 | void init_fpu(struct task_struct *child) | ||
64 | { | ||
65 | if (tsk_used_math(child)) { | ||
66 | if (child == current) | ||
67 | unlazy_fpu(child); | ||
68 | return; | ||
69 | } | ||
70 | memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); | ||
71 | child->thread.i387.fxsave.cwd = 0x37f; | ||
72 | child->thread.i387.fxsave.mxcsr = 0x1f80; | ||
73 | /* only the device not available exception or ptrace can call init_fpu */ | ||
74 | set_stopped_child_used_math(child); | ||
75 | } | ||
76 | |||
77 | /* | ||
78 | * Signal frame handlers. | ||
79 | */ | ||
80 | |||
81 | int save_i387(struct _fpstate __user *buf) | ||
82 | { | ||
83 | struct task_struct *tsk = current; | ||
84 | int err = 0; | ||
85 | |||
86 | { | ||
87 | extern void bad_user_i387_struct(void); | ||
88 | if (sizeof(struct user_i387_struct) != sizeof(tsk->thread.i387.fxsave)) | ||
89 | bad_user_i387_struct(); | ||
90 | } | ||
91 | |||
92 | if ((unsigned long)buf % 16) | ||
93 | printk("save_i387: bad fpstate %p\n",buf); | ||
94 | |||
95 | if (!used_math()) | ||
96 | return 0; | ||
97 | clear_used_math(); /* trigger finit */ | ||
98 | if (tsk->thread_info->status & TS_USEDFPU) { | ||
99 | err = save_i387_checking((struct i387_fxsave_struct __user *)buf); | ||
100 | if (err) return err; | ||
101 | stts(); | ||
102 | } else { | ||
103 | if (__copy_to_user(buf, &tsk->thread.i387.fxsave, | ||
104 | sizeof(struct i387_fxsave_struct))) | ||
105 | return -1; | ||
106 | } | ||
107 | return 1; | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * ptrace request handlers. | ||
112 | */ | ||
113 | |||
114 | int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk) | ||
115 | { | ||
116 | init_fpu(tsk); | ||
117 | return __copy_to_user(buf, &tsk->thread.i387.fxsave, | ||
118 | sizeof(struct user_i387_struct)) ? -EFAULT : 0; | ||
119 | } | ||
120 | |||
121 | int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf) | ||
122 | { | ||
123 | if (__copy_from_user(&tsk->thread.i387.fxsave, buf, | ||
124 | sizeof(struct user_i387_struct))) | ||
125 | return -EFAULT; | ||
126 | return 0; | ||
127 | } | ||
128 | |||
129 | /* | ||
130 | * FPU state for core dumps. | ||
131 | */ | ||
132 | |||
133 | int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) | ||
134 | { | ||
135 | struct task_struct *tsk = current; | ||
136 | |||
137 | if (!used_math()) | ||
138 | return 0; | ||
139 | |||
140 | unlazy_fpu(tsk); | ||
141 | memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); | ||
142 | return 1; | ||
143 | } | ||
144 | |||
145 | int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) | ||
146 | { | ||
147 | int fpvalid = !!tsk_used_math(tsk); | ||
148 | |||
149 | if (fpvalid) { | ||
150 | if (tsk == current) | ||
151 | unlazy_fpu(tsk); | ||
152 | memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); | ||
153 | } | ||
154 | return fpvalid; | ||
155 | } | ||
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c new file mode 100644 index 000000000000..d52701246010 --- /dev/null +++ b/arch/x86_64/kernel/i8259.c | |||
@@ -0,0 +1,579 @@ | |||
1 | #include <linux/linkage.h> | ||
2 | #include <linux/config.h> | ||
3 | #include <linux/errno.h> | ||
4 | #include <linux/signal.h> | ||
5 | #include <linux/sched.h> | ||
6 | #include <linux/ioport.h> | ||
7 | #include <linux/interrupt.h> | ||
8 | #include <linux/timex.h> | ||
9 | #include <linux/slab.h> | ||
10 | #include <linux/random.h> | ||
11 | #include <linux/smp_lock.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/kernel_stat.h> | ||
14 | #include <linux/sysdev.h> | ||
15 | #include <linux/bitops.h> | ||
16 | |||
17 | #include <asm/acpi.h> | ||
18 | #include <asm/atomic.h> | ||
19 | #include <asm/system.h> | ||
20 | #include <asm/io.h> | ||
21 | #include <asm/irq.h> | ||
22 | #include <asm/hw_irq.h> | ||
23 | #include <asm/pgtable.h> | ||
24 | #include <asm/delay.h> | ||
25 | #include <asm/desc.h> | ||
26 | #include <asm/apic.h> | ||
27 | |||
28 | #include <linux/irq.h> | ||
29 | |||
30 | /* | ||
31 | * Common place to define all x86 IRQ vectors | ||
32 | * | ||
33 | * This builds up the IRQ handler stubs using some ugly macros in irq.h | ||
34 | * | ||
35 | * These macros create the low-level assembly IRQ routines that save | ||
36 | * register context and call do_IRQ(). do_IRQ() then does all the | ||
37 | * operations that are needed to keep the AT (or SMP IOAPIC) | ||
38 | * interrupt-controller happy. | ||
39 | */ | ||
40 | |||
41 | #define BI(x,y) \ | ||
42 | BUILD_IRQ(x##y) | ||
43 | |||
44 | #define BUILD_16_IRQS(x) \ | ||
45 | BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ | ||
46 | BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ | ||
47 | BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ | ||
48 | BI(x,c) BI(x,d) BI(x,e) BI(x,f) | ||
49 | |||
50 | #define BUILD_14_IRQS(x) \ | ||
51 | BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ | ||
52 | BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ | ||
53 | BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ | ||
54 | BI(x,c) BI(x,d) | ||
55 | |||
56 | /* | ||
57 | * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: | ||
58 | * (these are usually mapped to vectors 0x20-0x2f) | ||
59 | */ | ||
60 | BUILD_16_IRQS(0x0) | ||
61 | |||
62 | #ifdef CONFIG_X86_LOCAL_APIC | ||
63 | /* | ||
64 | * The IO-APIC gives us many more interrupt sources. Most of these | ||
65 | * are unused but an SMP system is supposed to have enough memory ... | ||
66 | * sometimes (mostly wrt. hw bugs) we get corrupted vectors all | ||
67 | * across the spectrum, so we really want to be prepared to get all | ||
68 | * of these. Plus, more powerful systems might have more than 64 | ||
69 | * IO-APIC registers. | ||
70 | * | ||
71 | * (these are usually mapped into the 0x30-0xff vector range) | ||
72 | */ | ||
73 | BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) | ||
74 | BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) | ||
75 | BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) | ||
76 | BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) | ||
77 | |||
78 | #ifdef CONFIG_PCI_MSI | ||
79 | BUILD_14_IRQS(0xe) | ||
80 | #endif | ||
81 | |||
82 | #endif | ||
83 | |||
84 | #undef BUILD_16_IRQS | ||
85 | #undef BUILD_14_IRQS | ||
86 | #undef BI | ||
87 | |||
88 | |||
89 | #define IRQ(x,y) \ | ||
90 | IRQ##x##y##_interrupt | ||
91 | |||
92 | #define IRQLIST_16(x) \ | ||
93 | IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ | ||
94 | IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ | ||
95 | IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ | ||
96 | IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) | ||
97 | |||
98 | #define IRQLIST_14(x) \ | ||
99 | IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ | ||
100 | IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ | ||
101 | IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ | ||
102 | IRQ(x,c), IRQ(x,d) | ||
103 | |||
104 | void (*interrupt[NR_IRQS])(void) = { | ||
105 | IRQLIST_16(0x0), | ||
106 | |||
107 | #ifdef CONFIG_X86_IO_APIC | ||
108 | IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), | ||
109 | IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), | ||
110 | IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), | ||
111 | IRQLIST_16(0xc), IRQLIST_16(0xd) | ||
112 | |||
113 | #ifdef CONFIG_PCI_MSI | ||
114 | , IRQLIST_14(0xe) | ||
115 | #endif | ||
116 | |||
117 | #endif | ||
118 | }; | ||
119 | |||
120 | #undef IRQ | ||
121 | #undef IRQLIST_16 | ||
122 | #undef IRQLIST_14 | ||
123 | |||
124 | /* | ||
125 | * This is the 'legacy' 8259A Programmable Interrupt Controller, | ||
126 | * present in the majority of PC/AT boxes. | ||
127 | * plus some generic x86 specific things if generic specifics makes | ||
128 | * any sense at all. | ||
129 | * this file should become arch/i386/kernel/irq.c when the old irq.c | ||
130 | * moves to arch independent land | ||
131 | */ | ||
132 | |||
133 | DEFINE_SPINLOCK(i8259A_lock); | ||
134 | |||
135 | static void end_8259A_irq (unsigned int irq) | ||
136 | { | ||
137 | if (irq > 256) { | ||
138 | char var; | ||
139 | printk("return %p stack %p ti %p\n", __builtin_return_address(0), &var, current->thread_info); | ||
140 | |||
141 | BUG(); | ||
142 | } | ||
143 | |||
144 | if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) && | ||
145 | irq_desc[irq].action) | ||
146 | enable_8259A_irq(irq); | ||
147 | } | ||
148 | |||
149 | #define shutdown_8259A_irq disable_8259A_irq | ||
150 | |||
151 | static void mask_and_ack_8259A(unsigned int); | ||
152 | |||
/*
 * ->startup() handler: unmask @irq at the 8259A.
 * Always returns 0, i.e. it reports that nothing was pending.
 */
static unsigned int startup_8259A_irq(unsigned int irq)
{
	enable_8259A_irq(irq);

	/* never anything pending */
	return 0;
}
158 | |||
159 | static struct hw_interrupt_type i8259A_irq_type = { | ||
160 | "XT-PIC", | ||
161 | startup_8259A_irq, | ||
162 | shutdown_8259A_irq, | ||
163 | enable_8259A_irq, | ||
164 | disable_8259A_irq, | ||
165 | mask_and_ack_8259A, | ||
166 | end_8259A_irq, | ||
167 | NULL | ||
168 | }; | ||
169 | |||
170 | /* | ||
171 | * 8259A PIC functions to handle ISA devices: | ||
172 | */ | ||
173 | |||
174 | /* | ||
175 | * This contains the irq mask for both 8259A irq controllers, | ||
176 | */ | ||
177 | static unsigned int cached_irq_mask = 0xffff; | ||
178 | |||
179 | #define __byte(x,y) (((unsigned char *)&(y))[x]) | ||
180 | #define cached_21 (__byte(0,cached_irq_mask)) | ||
181 | #define cached_A1 (__byte(1,cached_irq_mask)) | ||
182 | |||
183 | /* | ||
184 | * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) | ||
185 | * boards the timer interrupt is not really connected to any IO-APIC pin, | ||
186 | * it's fed to the master 8259A's IR0 line only. | ||
187 | * | ||
188 | * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. | ||
189 | * this 'mixed mode' IRQ handling costs nothing because it's only used | ||
190 | * at IRQ setup time. | ||
191 | */ | ||
192 | unsigned long io_apic_irqs; | ||
193 | |||
194 | void disable_8259A_irq(unsigned int irq) | ||
195 | { | ||
196 | unsigned int mask = 1 << irq; | ||
197 | unsigned long flags; | ||
198 | |||
199 | spin_lock_irqsave(&i8259A_lock, flags); | ||
200 | cached_irq_mask |= mask; | ||
201 | if (irq & 8) | ||
202 | outb(cached_A1,0xA1); | ||
203 | else | ||
204 | outb(cached_21,0x21); | ||
205 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
206 | } | ||
207 | |||
208 | void enable_8259A_irq(unsigned int irq) | ||
209 | { | ||
210 | unsigned int mask = ~(1 << irq); | ||
211 | unsigned long flags; | ||
212 | |||
213 | spin_lock_irqsave(&i8259A_lock, flags); | ||
214 | cached_irq_mask &= mask; | ||
215 | if (irq & 8) | ||
216 | outb(cached_A1,0xA1); | ||
217 | else | ||
218 | outb(cached_21,0x21); | ||
219 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
220 | } | ||
221 | |||
222 | int i8259A_irq_pending(unsigned int irq) | ||
223 | { | ||
224 | unsigned int mask = 1<<irq; | ||
225 | unsigned long flags; | ||
226 | int ret; | ||
227 | |||
228 | spin_lock_irqsave(&i8259A_lock, flags); | ||
229 | if (irq < 8) | ||
230 | ret = inb(0x20) & mask; | ||
231 | else | ||
232 | ret = inb(0xA0) & (mask >> 8); | ||
233 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
234 | |||
235 | return ret; | ||
236 | } | ||
237 | |||
238 | void make_8259A_irq(unsigned int irq) | ||
239 | { | ||
240 | disable_irq_nosync(irq); | ||
241 | io_apic_irqs &= ~(1<<irq); | ||
242 | irq_desc[irq].handler = &i8259A_irq_type; | ||
243 | enable_irq(irq); | ||
244 | } | ||
245 | |||
/*
 * Ask the 8259A whether @irq is really in service by reading the ISR of
 * the PIC that owns the line, then switch the PIC back to IRR reads.
 * Returns non-zero if the ISR bit for @irq is set.
 *
 * This is assumed to be called rarely: switching between 8259A registers
 * is slow.  The caller must hold the irq controller spinlock.
 */
static inline int i8259A_irq_real(unsigned int irq)
{
	int irqmask = 1 << irq;
	int in_service;

	if (irq >= 8) {
		outb(0x0B, 0xA0);		/* ISR register */
		in_service = inb(0xA0) & (irqmask >> 8);
		outb(0x0A, 0xA0);		/* back to the IRR register */
	} else {
		outb(0x0B, 0x20);		/* ISR register */
		in_service = inb(0x20) & irqmask;
		outb(0x0A, 0x20);		/* back to the IRR register */
	}

	return in_service;
}
268 | |||
269 | /* | ||
270 | * Careful! The 8259A is a fragile beast, it pretty | ||
271 | * much _has_ to be done exactly like this (mask it | ||
272 | * first, _then_ send the EOI, and the order of EOI | ||
273 | * to the two 8259s is important! | ||
274 | */ | ||
275 | static void mask_and_ack_8259A(unsigned int irq) | ||
276 | { | ||
277 | unsigned int irqmask = 1 << irq; | ||
278 | unsigned long flags; | ||
279 | |||
280 | spin_lock_irqsave(&i8259A_lock, flags); | ||
281 | /* | ||
282 | * Lightweight spurious IRQ detection. We do not want | ||
283 | * to overdo spurious IRQ handling - it's usually a sign | ||
284 | * of hardware problems, so we only do the checks we can | ||
285 | * do without slowing down good hardware unnecesserily. | ||
286 | * | ||
287 | * Note that IRQ7 and IRQ15 (the two spurious IRQs | ||
288 | * usually resulting from the 8259A-1|2 PICs) occur | ||
289 | * even if the IRQ is masked in the 8259A. Thus we | ||
290 | * can check spurious 8259A IRQs without doing the | ||
291 | * quite slow i8259A_irq_real() call for every IRQ. | ||
292 | * This does not cover 100% of spurious interrupts, | ||
293 | * but should be enough to warn the user that there | ||
294 | * is something bad going on ... | ||
295 | */ | ||
296 | if (cached_irq_mask & irqmask) | ||
297 | goto spurious_8259A_irq; | ||
298 | cached_irq_mask |= irqmask; | ||
299 | |||
300 | handle_real_irq: | ||
301 | if (irq & 8) { | ||
302 | inb(0xA1); /* DUMMY - (do we need this?) */ | ||
303 | outb(cached_A1,0xA1); | ||
304 | outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */ | ||
305 | outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ | ||
306 | } else { | ||
307 | inb(0x21); /* DUMMY - (do we need this?) */ | ||
308 | outb(cached_21,0x21); | ||
309 | outb(0x60+irq,0x20); /* 'Specific EOI' to master */ | ||
310 | } | ||
311 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
312 | return; | ||
313 | |||
314 | spurious_8259A_irq: | ||
315 | /* | ||
316 | * this is the slow path - should happen rarely. | ||
317 | */ | ||
318 | if (i8259A_irq_real(irq)) | ||
319 | /* | ||
320 | * oops, the IRQ _is_ in service according to the | ||
321 | * 8259A - not spurious, go handle it. | ||
322 | */ | ||
323 | goto handle_real_irq; | ||
324 | |||
325 | { | ||
326 | static int spurious_irq_mask; | ||
327 | /* | ||
328 | * At this point we can be sure the IRQ is spurious, | ||
329 | * lets ACK and report it. [once per IRQ] | ||
330 | */ | ||
331 | if (!(spurious_irq_mask & irqmask)) { | ||
332 | printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); | ||
333 | spurious_irq_mask |= irqmask; | ||
334 | } | ||
335 | atomic_inc(&irq_err_count); | ||
336 | /* | ||
337 | * Theoretically we do not have to handle this IRQ, | ||
338 | * but in Linux this does not cause problems and is | ||
339 | * simpler for us. | ||
340 | */ | ||
341 | goto handle_real_irq; | ||
342 | } | ||
343 | } | ||
344 | |||
345 | void init_8259A(int auto_eoi) | ||
346 | { | ||
347 | unsigned long flags; | ||
348 | |||
349 | spin_lock_irqsave(&i8259A_lock, flags); | ||
350 | |||
351 | outb(0xff, 0x21); /* mask all of 8259A-1 */ | ||
352 | outb(0xff, 0xA1); /* mask all of 8259A-2 */ | ||
353 | |||
354 | /* | ||
355 | * outb_p - this has to work on a wide range of PC hardware. | ||
356 | */ | ||
357 | outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */ | ||
358 | outb_p(0x20 + 0, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ | ||
359 | outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */ | ||
360 | if (auto_eoi) | ||
361 | outb_p(0x03, 0x21); /* master does Auto EOI */ | ||
362 | else | ||
363 | outb_p(0x01, 0x21); /* master expects normal EOI */ | ||
364 | |||
365 | outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */ | ||
366 | outb_p(0x20 + 8, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ | ||
367 | outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */ | ||
368 | outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode | ||
369 | is to be investigated) */ | ||
370 | |||
371 | if (auto_eoi) | ||
372 | /* | ||
373 | * in AEOI mode we just have to mask the interrupt | ||
374 | * when acking. | ||
375 | */ | ||
376 | i8259A_irq_type.ack = disable_8259A_irq; | ||
377 | else | ||
378 | i8259A_irq_type.ack = mask_and_ack_8259A; | ||
379 | |||
380 | udelay(100); /* wait for 8259A to initialize */ | ||
381 | |||
382 | outb(cached_21, 0x21); /* restore master IRQ mask */ | ||
383 | outb(cached_A1, 0xA1); /* restore slave IRQ mask */ | ||
384 | |||
385 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
386 | } | ||
387 | |||
388 | static char irq_trigger[2]; | ||
/*
 * The ELCR registers (ports 0x4d0 and 0x4d1) control edge/level
 * triggering of the IRQs.  Write the two bytes saved in @trigger back
 * to them.
 */
static void restore_ELCR(char *trigger)
{
	char low = trigger[0];
	char high = trigger[1];

	outb(low, 0x4d0);
	outb(high, 0x4d1);
}
397 | |||
/*
 * Read both ELCR registers into @trigger[0..1].  The bits belonging to
 * IRQ 0, 1, 2, 8 and 13 are marked as reserved and are masked out
 * (0xF8 drops bits 0-2 of the low byte, 0xDE drops bits 0 and 5 of the
 * high byte).
 */
static void save_ELCR(char *trigger)
{
	char *low = &trigger[0];
	char *high = &trigger[1];

	*low = inb(0x4d0) & 0xF8;
	*high = inb(0x4d1) & 0xDE;
}
404 | |||
405 | static int i8259A_resume(struct sys_device *dev) | ||
406 | { | ||
407 | init_8259A(0); | ||
408 | restore_ELCR(irq_trigger); | ||
409 | return 0; | ||
410 | } | ||
411 | |||
412 | static int i8259A_suspend(struct sys_device *dev, u32 state) | ||
413 | { | ||
414 | save_ELCR(irq_trigger); | ||
415 | return 0; | ||
416 | } | ||
417 | |||
418 | static struct sysdev_class i8259_sysdev_class = { | ||
419 | set_kset_name("i8259"), | ||
420 | .suspend = i8259A_suspend, | ||
421 | .resume = i8259A_resume, | ||
422 | }; | ||
423 | |||
424 | static struct sys_device device_i8259A = { | ||
425 | .id = 0, | ||
426 | .cls = &i8259_sysdev_class, | ||
427 | }; | ||
428 | |||
429 | static int __init i8259A_init_sysfs(void) | ||
430 | { | ||
431 | int error = sysdev_class_register(&i8259_sysdev_class); | ||
432 | if (!error) | ||
433 | error = sysdev_register(&device_i8259A); | ||
434 | return error; | ||
435 | } | ||
436 | |||
437 | device_initcall(i8259A_init_sysfs); | ||
438 | |||
439 | /* | ||
440 | * IRQ2 is cascade interrupt to second interrupt controller | ||
441 | */ | ||
442 | |||
443 | static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; | ||
444 | |||
445 | void __init init_ISA_irqs (void) | ||
446 | { | ||
447 | int i; | ||
448 | |||
449 | #ifdef CONFIG_X86_LOCAL_APIC | ||
450 | init_bsp_APIC(); | ||
451 | #endif | ||
452 | init_8259A(0); | ||
453 | |||
454 | for (i = 0; i < NR_IRQS; i++) { | ||
455 | irq_desc[i].status = IRQ_DISABLED; | ||
456 | irq_desc[i].action = NULL; | ||
457 | irq_desc[i].depth = 1; | ||
458 | |||
459 | if (i < 16) { | ||
460 | /* | ||
461 | * 16 old-style INTA-cycle interrupts: | ||
462 | */ | ||
463 | irq_desc[i].handler = &i8259A_irq_type; | ||
464 | } else { | ||
465 | /* | ||
466 | * 'high' PCI IRQs filled in on demand | ||
467 | */ | ||
468 | irq_desc[i].handler = &no_irq_type; | ||
469 | } | ||
470 | } | ||
471 | } | ||
472 | |||
473 | void apic_timer_interrupt(void); | ||
474 | void spurious_interrupt(void); | ||
475 | void error_interrupt(void); | ||
476 | void reschedule_interrupt(void); | ||
477 | void call_function_interrupt(void); | ||
478 | void invalidate_interrupt(void); | ||
479 | void thermal_interrupt(void); | ||
480 | void i8254_timer_resume(void); | ||
481 | |||
482 | static void setup_timer(void) | ||
483 | { | ||
484 | outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ | ||
485 | udelay(10); | ||
486 | outb_p(LATCH & 0xff , 0x40); /* LSB */ | ||
487 | udelay(10); | ||
488 | outb(LATCH >> 8 , 0x40); /* MSB */ | ||
489 | } | ||
490 | |||
/*
 * sysdev resume hook for the timer: reprogram it via setup_timer().
 * Always returns 0.
 */
static int timer_resume(struct sys_device *dev)
{
	setup_timer();

	return 0;
}
496 | |||
/*
 * Non-static entry point that reprograms the timer through
 * setup_timer().
 */
void i8254_timer_resume(void)
{
	setup_timer();
}
501 | |||
502 | static struct sysdev_class timer_sysclass = { | ||
503 | set_kset_name("timer"), | ||
504 | .resume = timer_resume, | ||
505 | }; | ||
506 | |||
507 | static struct sys_device device_timer = { | ||
508 | .id = 0, | ||
509 | .cls = &timer_sysclass, | ||
510 | }; | ||
511 | |||
512 | static int __init init_timer_sysfs(void) | ||
513 | { | ||
514 | int error = sysdev_class_register(&timer_sysclass); | ||
515 | if (!error) | ||
516 | error = sysdev_register(&device_timer); | ||
517 | return error; | ||
518 | } | ||
519 | |||
520 | device_initcall(init_timer_sysfs); | ||
521 | |||
522 | void __init init_IRQ(void) | ||
523 | { | ||
524 | int i; | ||
525 | |||
526 | init_ISA_irqs(); | ||
527 | /* | ||
528 | * Cover the whole vector space, no vector can escape | ||
529 | * us. (some of these will be overridden and become | ||
530 | * 'special' SMP interrupts) | ||
531 | */ | ||
532 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | ||
533 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
534 | if (i >= NR_IRQS) | ||
535 | break; | ||
536 | if (vector != IA32_SYSCALL_VECTOR && vector != KDB_VECTOR) { | ||
537 | set_intr_gate(vector, interrupt[i]); | ||
538 | } | ||
539 | } | ||
540 | |||
541 | #ifdef CONFIG_SMP | ||
542 | /* | ||
543 | * IRQ0 must be given a fixed assignment and initialized, | ||
544 | * because it's used before the IO-APIC is set up. | ||
545 | */ | ||
546 | set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); | ||
547 | |||
548 | /* | ||
549 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper | ||
550 | * IPI, driven by wakeup. | ||
551 | */ | ||
552 | set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); | ||
553 | |||
554 | /* IPI for invalidation */ | ||
555 | set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); | ||
556 | |||
557 | /* IPI for generic function call */ | ||
558 | set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); | ||
559 | #endif | ||
560 | set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | ||
561 | |||
562 | #ifdef CONFIG_X86_LOCAL_APIC | ||
563 | /* self generated IPI for local APIC timer */ | ||
564 | set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); | ||
565 | |||
566 | /* IPI vectors for APIC spurious and error interrupts */ | ||
567 | set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | ||
568 | set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | ||
569 | #endif | ||
570 | |||
571 | /* | ||
572 | * Set the clock to HZ Hz, we already have a valid | ||
573 | * vector now: | ||
574 | */ | ||
575 | setup_timer(); | ||
576 | |||
577 | if (!acpi_ioapic) | ||
578 | setup_irq(2, &irq2); | ||
579 | } | ||
diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c new file mode 100644 index 000000000000..c4dc91491157 --- /dev/null +++ b/arch/x86_64/kernel/init_task.c | |||
@@ -0,0 +1,49 @@ | |||
1 | #include <linux/mm.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/sched.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/init_task.h> | ||
6 | #include <linux/fs.h> | ||
7 | #include <linux/mqueue.h> | ||
8 | |||
9 | #include <asm/uaccess.h> | ||
10 | #include <asm/pgtable.h> | ||
11 | #include <asm/desc.h> | ||
12 | |||
13 | static struct fs_struct init_fs = INIT_FS; | ||
14 | static struct files_struct init_files = INIT_FILES; | ||
15 | static struct signal_struct init_signals = INIT_SIGNALS(init_signals); | ||
16 | static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); | ||
17 | struct mm_struct init_mm = INIT_MM(init_mm); | ||
18 | |||
19 | EXPORT_SYMBOL(init_mm); | ||
20 | |||
21 | /* | ||
22 | * Initial thread structure. | ||
23 | * | ||
24 | * We need to make sure that this is 8192-byte aligned due to the | ||
25 | * way process stacks are handled. This is done by having a special | ||
26 | * "init_task" linker map entry.. | ||
27 | */ | ||
28 | union thread_union init_thread_union | ||
29 | __attribute__((__section__(".data.init_task"))) = | ||
30 | { INIT_THREAD_INFO(init_task) }; | ||
31 | |||
32 | /* | ||
33 | * Initial task structure. | ||
34 | * | ||
35 | * All other task structs will be allocated on slabs in fork.c | ||
36 | */ | ||
37 | struct task_struct init_task = INIT_TASK(init_task); | ||
38 | |||
39 | EXPORT_SYMBOL(init_task); | ||
40 | /* | ||
41 | * per-CPU TSS segments. Threads are completely 'soft' on Linux, | ||
42 | * no more per-task TSS's. The TSS size is kept cacheline-aligned | ||
43 | * so they are allowed to end up in the .data.cacheline_aligned | ||
44 | * section. Since TSS's are completely CPU-local, we want them | ||
45 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. | ||
46 | */ | ||
47 | DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp; | ||
48 | |||
49 | #define ALIGN_TO_4K __attribute__((section(".data.init_task"))) | ||
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c new file mode 100644 index 000000000000..adebe70c180e --- /dev/null +++ b/arch/x86_64/kernel/io_apic.c | |||
@@ -0,0 +1,1982 @@ | |||
1 | /* | ||
2 | * Intel IO-APIC support for multi-Pentium hosts. | ||
3 | * | ||
4 | * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo | ||
5 | * | ||
6 | * Many thanks to Stig Venaas for trying out countless experimental | ||
7 | * patches and reporting/debugging problems patiently! | ||
8 | * | ||
9 | * (c) 1999, Multiple IO-APIC support, developed by | ||
10 | * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and | ||
11 | * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, | ||
12 | * further tested and cleaned up by Zach Brown <zab@redhat.com> | ||
13 | * and Ingo Molnar <mingo@redhat.com> | ||
14 | * | ||
15 | * Fixes | ||
16 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs; | ||
17 | * thanks to Eric Gilmore | ||
18 | * and Rolf G. Tews | ||
19 | * for testing these extensively | ||
20 | * Paul Diefenbaugh : Added full ACPI support | ||
21 | */ | ||
22 | |||
23 | #include <linux/mm.h> | ||
24 | #include <linux/irq.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/init.h> | ||
27 | #include <linux/delay.h> | ||
28 | #include <linux/sched.h> | ||
29 | #include <linux/config.h> | ||
30 | #include <linux/smp_lock.h> | ||
31 | #include <linux/mc146818rtc.h> | ||
32 | #include <linux/acpi.h> | ||
33 | #include <linux/sysdev.h> | ||
34 | |||
35 | #include <asm/io.h> | ||
36 | #include <asm/smp.h> | ||
37 | #include <asm/desc.h> | ||
38 | #include <asm/proto.h> | ||
39 | #include <asm/mach_apic.h> | ||
40 | |||
41 | #define __apicdebuginit __init | ||
42 | |||
43 | int sis_apic_bug; /* not actually supported, dummy for compile */ | ||
44 | |||
45 | static DEFINE_SPINLOCK(ioapic_lock); | ||
46 | |||
47 | /* | ||
48 | * # of IRQ routing registers | ||
49 | */ | ||
50 | int nr_ioapic_registers[MAX_IO_APICS]; | ||
51 | |||
52 | /* | ||
53 | * Rough estimation of how many shared IRQs there are, can | ||
54 | * be changed anytime. | ||
55 | */ | ||
56 | #define MAX_PLUS_SHARED_IRQS NR_IRQS | ||
57 | #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) | ||
58 | |||
59 | /* | ||
60 | * This is performance-critical, we want to do it O(1) | ||
61 | * | ||
62 | * the indexing order of this array favors 1:1 mappings | ||
63 | * between pins and IRQs. | ||
64 | */ | ||
65 | |||
66 | static struct irq_pin_list { | ||
67 | short apic, pin, next; | ||
68 | } irq_2_pin[PIN_MAP_SIZE]; | ||
69 | |||
70 | int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1}; | ||
71 | #ifdef CONFIG_PCI_MSI | ||
72 | #define vector_to_irq(vector) \ | ||
73 | (platform_legacy_irq(vector) ? vector : vector_irq[vector]) | ||
74 | #else | ||
75 | #define vector_to_irq(vector) (vector) | ||
76 | #endif | ||
77 | |||
78 | /* | ||
79 | * The common case is 1:1 IRQ<->pin mappings. Sometimes there are | ||
80 | * shared ISA-space IRQs, so we have to support them. We are super | ||
81 | * fast in the common case, and fast for shared ISA-space IRQs. | ||
82 | */ | ||
83 | static void add_pin_to_irq(unsigned int irq, int apic, int pin) | ||
84 | { | ||
85 | static int first_free_entry = NR_IRQS; | ||
86 | struct irq_pin_list *entry = irq_2_pin + irq; | ||
87 | |||
88 | while (entry->next) | ||
89 | entry = irq_2_pin + entry->next; | ||
90 | |||
91 | if (entry->pin != -1) { | ||
92 | entry->next = first_free_entry; | ||
93 | entry = irq_2_pin + entry->next; | ||
94 | if (++first_free_entry >= PIN_MAP_SIZE) | ||
95 | panic("io_apic.c: whoops"); | ||
96 | } | ||
97 | entry->apic = apic; | ||
98 | entry->pin = pin; | ||
99 | } | ||
100 | |||
101 | #define __DO_ACTION(R, ACTION, FINAL) \ | ||
102 | \ | ||
103 | { \ | ||
104 | int pin; \ | ||
105 | struct irq_pin_list *entry = irq_2_pin + irq; \ | ||
106 | \ | ||
107 | for (;;) { \ | ||
108 | unsigned int reg; \ | ||
109 | pin = entry->pin; \ | ||
110 | if (pin == -1) \ | ||
111 | break; \ | ||
112 | reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ | ||
113 | reg ACTION; \ | ||
114 | io_apic_modify(entry->apic, reg); \ | ||
115 | if (!entry->next) \ | ||
116 | break; \ | ||
117 | entry = irq_2_pin + entry->next; \ | ||
118 | } \ | ||
119 | FINAL; \ | ||
120 | } | ||
121 | |||
122 | #define DO_ACTION(name,R,ACTION, FINAL) \ | ||
123 | \ | ||
124 | static void name##_IO_APIC_irq (unsigned int irq) \ | ||
125 | __DO_ACTION(R, ACTION, FINAL) | ||
126 | |||
127 | DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) | ||
128 | /* mask = 1 */ | ||
129 | DO_ACTION( __unmask, 0, &= 0xfffeffff, ) | ||
130 | /* mask = 0 */ | ||
131 | |||
132 | static void mask_IO_APIC_irq (unsigned int irq) | ||
133 | { | ||
134 | unsigned long flags; | ||
135 | |||
136 | spin_lock_irqsave(&ioapic_lock, flags); | ||
137 | __mask_IO_APIC_irq(irq); | ||
138 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
139 | } | ||
140 | |||
141 | static void unmask_IO_APIC_irq (unsigned int irq) | ||
142 | { | ||
143 | unsigned long flags; | ||
144 | |||
145 | spin_lock_irqsave(&ioapic_lock, flags); | ||
146 | __unmask_IO_APIC_irq(irq); | ||
147 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
148 | } | ||
149 | |||
150 | static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) | ||
151 | { | ||
152 | struct IO_APIC_route_entry entry; | ||
153 | unsigned long flags; | ||
154 | |||
155 | /* Check delivery_mode to be sure we're not clearing an SMI pin */ | ||
156 | spin_lock_irqsave(&ioapic_lock, flags); | ||
157 | *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); | ||
158 | *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); | ||
159 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
160 | if (entry.delivery_mode == dest_SMI) | ||
161 | return; | ||
162 | /* | ||
163 | * Disable it in the IO-APIC irq-routing table: | ||
164 | */ | ||
165 | memset(&entry, 0, sizeof(entry)); | ||
166 | entry.mask = 1; | ||
167 | spin_lock_irqsave(&ioapic_lock, flags); | ||
168 | io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); | ||
169 | io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); | ||
170 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
171 | } | ||
172 | |||
173 | static void clear_IO_APIC (void) | ||
174 | { | ||
175 | int apic, pin; | ||
176 | |||
177 | for (apic = 0; apic < nr_ioapics; apic++) | ||
178 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) | ||
179 | clear_IO_APIC_pin(apic, pin); | ||
180 | } | ||
181 | |||
182 | /* | ||
183 | * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to | ||
184 | * specific CPU-side IRQs. | ||
185 | */ | ||
186 | |||
187 | #define MAX_PIRQS 8 | ||
188 | static int pirq_entries [MAX_PIRQS]; | ||
189 | static int pirqs_enabled; | ||
190 | int skip_ioapic_setup; | ||
191 | int ioapic_force; | ||
192 | |||
193 | /* dummy parsing: see setup.c */ | ||
194 | |||
195 | static int __init disable_ioapic_setup(char *str) | ||
196 | { | ||
197 | skip_ioapic_setup = 1; | ||
198 | return 1; | ||
199 | } | ||
200 | |||
201 | static int __init enable_ioapic_setup(char *str) | ||
202 | { | ||
203 | ioapic_force = 1; | ||
204 | skip_ioapic_setup = 0; | ||
205 | return 1; | ||
206 | } | ||
207 | |||
208 | __setup("noapic", disable_ioapic_setup); | ||
209 | __setup("apic", enable_ioapic_setup); | ||
210 | |||
211 | #include <asm/pci-direct.h> | ||
212 | #include <linux/pci_ids.h> | ||
213 | #include <linux/pci.h> | ||
214 | |||
215 | /* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC | ||
216 | off. Check for an Nvidia or VIA PCI bridge and turn it off. | ||
217 | Use pci direct infrastructure because this runs before the PCI subsystem. | ||
218 | |||
219 | Can be overwritten with "apic" | ||
220 | |||
221 | And another hack to disable the IOMMU on VIA chipsets. | ||
222 | |||
223 | Kludge-O-Rama. */ | ||
224 | void __init check_ioapic(void) | ||
225 | { | ||
226 | int num,slot,func; | ||
227 | if (ioapic_force) | ||
228 | return; | ||
229 | |||
230 | /* Poor man's PCI discovery */ | ||
231 | for (num = 0; num < 32; num++) { | ||
232 | for (slot = 0; slot < 32; slot++) { | ||
233 | for (func = 0; func < 8; func++) { | ||
234 | u32 class; | ||
235 | u32 vendor; | ||
236 | u8 type; | ||
237 | class = read_pci_config(num,slot,func, | ||
238 | PCI_CLASS_REVISION); | ||
239 | if (class == 0xffffffff) | ||
240 | break; | ||
241 | |||
242 | if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) | ||
243 | continue; | ||
244 | |||
245 | vendor = read_pci_config(num, slot, func, | ||
246 | PCI_VENDOR_ID); | ||
247 | vendor &= 0xffff; | ||
248 | switch (vendor) { | ||
249 | case PCI_VENDOR_ID_VIA: | ||
250 | #ifdef CONFIG_GART_IOMMU | ||
251 | if ((end_pfn >= (0xffffffff>>PAGE_SHIFT) || | ||
252 | force_iommu) && | ||
253 | !iommu_aperture_allowed) { | ||
254 | printk(KERN_INFO | ||
255 | "Looks like a VIA chipset. Disabling IOMMU. Overwrite with \"iommu=allowed\"\n"); | ||
256 | iommu_aperture_disabled = 1; | ||
257 | } | ||
258 | #endif | ||
259 | return; | ||
260 | case PCI_VENDOR_ID_NVIDIA: | ||
261 | #ifdef CONFIG_ACPI | ||
262 | /* All timer overrides on Nvidia | ||
263 | seem to be wrong. Skip them. */ | ||
264 | acpi_skip_timer_override = 1; | ||
265 | printk(KERN_INFO | ||
266 | "Nvidia board detected. Ignoring ACPI timer override.\n"); | ||
267 | #endif | ||
268 | /* RED-PEN skip them on mptables too? */ | ||
269 | return; | ||
270 | } | ||
271 | |||
272 | /* No multi-function device? */ | ||
273 | type = read_pci_config_byte(num,slot,func, | ||
274 | PCI_HEADER_TYPE); | ||
275 | if (!(type & 0x80)) | ||
276 | break; | ||
277 | } | ||
278 | } | ||
279 | } | ||
280 | } | ||
281 | |||
282 | static int __init ioapic_pirq_setup(char *str) | ||
283 | { | ||
284 | int i, max; | ||
285 | int ints[MAX_PIRQS+1]; | ||
286 | |||
287 | get_options(str, ARRAY_SIZE(ints), ints); | ||
288 | |||
289 | for (i = 0; i < MAX_PIRQS; i++) | ||
290 | pirq_entries[i] = -1; | ||
291 | |||
292 | pirqs_enabled = 1; | ||
293 | apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n"); | ||
294 | max = MAX_PIRQS; | ||
295 | if (ints[0] < MAX_PIRQS) | ||
296 | max = ints[0]; | ||
297 | |||
298 | for (i = 0; i < max; i++) { | ||
299 | apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); | ||
300 | /* | ||
301 | * PIRQs are mapped upside down, usually. | ||
302 | */ | ||
303 | pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; | ||
304 | } | ||
305 | return 1; | ||
306 | } | ||
307 | |||
308 | __setup("pirq=", ioapic_pirq_setup); | ||
309 | |||
310 | /* | ||
311 | * Find the IRQ entry number of a certain pin. | ||
312 | */ | ||
313 | static int find_irq_entry(int apic, int pin, int type) | ||
314 | { | ||
315 | int i; | ||
316 | |||
317 | for (i = 0; i < mp_irq_entries; i++) | ||
318 | if (mp_irqs[i].mpc_irqtype == type && | ||
319 | (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || | ||
320 | mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && | ||
321 | mp_irqs[i].mpc_dstirq == pin) | ||
322 | return i; | ||
323 | |||
324 | return -1; | ||
325 | } | ||
326 | |||
327 | /* | ||
328 | * Find the pin to which IRQ[irq] (ISA) is connected | ||
329 | */ | ||
330 | static int __init find_isa_irq_pin(int irq, int type) | ||
331 | { | ||
332 | int i; | ||
333 | |||
334 | for (i = 0; i < mp_irq_entries; i++) { | ||
335 | int lbus = mp_irqs[i].mpc_srcbus; | ||
336 | |||
337 | if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || | ||
338 | mp_bus_id_to_type[lbus] == MP_BUS_EISA || | ||
339 | mp_bus_id_to_type[lbus] == MP_BUS_MCA) && | ||
340 | (mp_irqs[i].mpc_irqtype == type) && | ||
341 | (mp_irqs[i].mpc_srcbusirq == irq)) | ||
342 | |||
343 | return mp_irqs[i].mpc_dstirq; | ||
344 | } | ||
345 | return -1; | ||
346 | } | ||
347 | |||
348 | /* | ||
349 | * Find a specific PCI IRQ entry. | ||
350 | * Not an __init, possibly needed by modules | ||
351 | */ | ||
352 | static int pin_2_irq(int idx, int apic, int pin); | ||
353 | |||
354 | int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) | ||
355 | { | ||
356 | int apic, i, best_guess = -1; | ||
357 | |||
358 | apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", | ||
359 | bus, slot, pin); | ||
360 | if (mp_bus_id_to_pci_bus[bus] == -1) { | ||
361 | apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); | ||
362 | return -1; | ||
363 | } | ||
364 | for (i = 0; i < mp_irq_entries; i++) { | ||
365 | int lbus = mp_irqs[i].mpc_srcbus; | ||
366 | |||
367 | for (apic = 0; apic < nr_ioapics; apic++) | ||
368 | if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || | ||
369 | mp_irqs[i].mpc_dstapic == MP_APIC_ALL) | ||
370 | break; | ||
371 | |||
372 | if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && | ||
373 | !mp_irqs[i].mpc_irqtype && | ||
374 | (bus == lbus) && | ||
375 | (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { | ||
376 | int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); | ||
377 | |||
378 | if (!(apic || IO_APIC_IRQ(irq))) | ||
379 | continue; | ||
380 | |||
381 | if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) | ||
382 | return irq; | ||
383 | /* | ||
384 | * Use the first all-but-pin matching entry as a | ||
385 | * best-guess fuzzy result for broken mptables. | ||
386 | */ | ||
387 | if (best_guess < 0) | ||
388 | best_guess = irq; | ||
389 | } | ||
390 | } | ||
391 | return best_guess; | ||
392 | } | ||
393 | |||
394 | /* | ||
395 | * EISA Edge/Level control register, ELCR | ||
396 | */ | ||
397 | static int EISA_ELCR(unsigned int irq) | ||
398 | { | ||
399 | if (irq < 16) { | ||
400 | unsigned int port = 0x4d0 + (irq >> 3); | ||
401 | return (inb(port) >> (irq & 7)) & 1; | ||
402 | } | ||
403 | apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq); | ||
404 | return 0; | ||
405 | } | ||
406 | |||
/*
 * Defaults applied when an MP-table entry is flagged as "conforming"
 * to its bus.  Trigger: 0 = edge, 1 = level.  Polarity: 0 = active
 * high, 1 = active low.
 */

/*
 * EISA: polarity is always zero; the trigger mode can be edge or level
 * and must be read from the ELCR for the entry's source IRQ.
 */
#define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
#define default_EISA_polarity(idx)	(0)

/* ISA: always polarity zero, edge triggered. */
#define default_ISA_trigger(idx)	(0)
#define default_ISA_polarity(idx)	(0)

/* PCI: always polarity one, level triggered. */
#define default_PCI_trigger(idx)	(1)
#define default_PCI_polarity(idx)	(1)

/* MCA: always polarity zero, level triggered. */
#define default_MCA_trigger(idx)	(1)
#define default_MCA_polarity(idx)	(0)
432 | |||
433 | static int __init MPBIOS_polarity(int idx) | ||
434 | { | ||
435 | int bus = mp_irqs[idx].mpc_srcbus; | ||
436 | int polarity; | ||
437 | |||
438 | /* | ||
439 | * Determine IRQ line polarity (high active or low active): | ||
440 | */ | ||
441 | switch (mp_irqs[idx].mpc_irqflag & 3) | ||
442 | { | ||
443 | case 0: /* conforms, ie. bus-type dependent polarity */ | ||
444 | { | ||
445 | switch (mp_bus_id_to_type[bus]) | ||
446 | { | ||
447 | case MP_BUS_ISA: /* ISA pin */ | ||
448 | { | ||
449 | polarity = default_ISA_polarity(idx); | ||
450 | break; | ||
451 | } | ||
452 | case MP_BUS_EISA: /* EISA pin */ | ||
453 | { | ||
454 | polarity = default_EISA_polarity(idx); | ||
455 | break; | ||
456 | } | ||
457 | case MP_BUS_PCI: /* PCI pin */ | ||
458 | { | ||
459 | polarity = default_PCI_polarity(idx); | ||
460 | break; | ||
461 | } | ||
462 | case MP_BUS_MCA: /* MCA pin */ | ||
463 | { | ||
464 | polarity = default_MCA_polarity(idx); | ||
465 | break; | ||
466 | } | ||
467 | default: | ||
468 | { | ||
469 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
470 | polarity = 1; | ||
471 | break; | ||
472 | } | ||
473 | } | ||
474 | break; | ||
475 | } | ||
476 | case 1: /* high active */ | ||
477 | { | ||
478 | polarity = 0; | ||
479 | break; | ||
480 | } | ||
481 | case 2: /* reserved */ | ||
482 | { | ||
483 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
484 | polarity = 1; | ||
485 | break; | ||
486 | } | ||
487 | case 3: /* low active */ | ||
488 | { | ||
489 | polarity = 1; | ||
490 | break; | ||
491 | } | ||
492 | default: /* invalid */ | ||
493 | { | ||
494 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
495 | polarity = 1; | ||
496 | break; | ||
497 | } | ||
498 | } | ||
499 | return polarity; | ||
500 | } | ||
501 | |||
502 | static int MPBIOS_trigger(int idx) | ||
503 | { | ||
504 | int bus = mp_irqs[idx].mpc_srcbus; | ||
505 | int trigger; | ||
506 | |||
507 | /* | ||
508 | * Determine IRQ trigger mode (edge or level sensitive): | ||
509 | */ | ||
510 | switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) | ||
511 | { | ||
512 | case 0: /* conforms, ie. bus-type dependent */ | ||
513 | { | ||
514 | switch (mp_bus_id_to_type[bus]) | ||
515 | { | ||
516 | case MP_BUS_ISA: /* ISA pin */ | ||
517 | { | ||
518 | trigger = default_ISA_trigger(idx); | ||
519 | break; | ||
520 | } | ||
521 | case MP_BUS_EISA: /* EISA pin */ | ||
522 | { | ||
523 | trigger = default_EISA_trigger(idx); | ||
524 | break; | ||
525 | } | ||
526 | case MP_BUS_PCI: /* PCI pin */ | ||
527 | { | ||
528 | trigger = default_PCI_trigger(idx); | ||
529 | break; | ||
530 | } | ||
531 | case MP_BUS_MCA: /* MCA pin */ | ||
532 | { | ||
533 | trigger = default_MCA_trigger(idx); | ||
534 | break; | ||
535 | } | ||
536 | default: | ||
537 | { | ||
538 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
539 | trigger = 1; | ||
540 | break; | ||
541 | } | ||
542 | } | ||
543 | break; | ||
544 | } | ||
545 | case 1: /* edge */ | ||
546 | { | ||
547 | trigger = 0; | ||
548 | break; | ||
549 | } | ||
550 | case 2: /* reserved */ | ||
551 | { | ||
552 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
553 | trigger = 1; | ||
554 | break; | ||
555 | } | ||
556 | case 3: /* level */ | ||
557 | { | ||
558 | trigger = 1; | ||
559 | break; | ||
560 | } | ||
561 | default: /* invalid */ | ||
562 | { | ||
563 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
564 | trigger = 0; | ||
565 | break; | ||
566 | } | ||
567 | } | ||
568 | return trigger; | ||
569 | } | ||
570 | |||
/* Polarity of MP-table entry @idx, as reported by MPBIOS_polarity(). */
static inline int irq_polarity(int idx)
{
	int polarity = MPBIOS_polarity(idx);

	return polarity;
}
575 | |||
/* Trigger mode of MP-table entry @idx, as reported by MPBIOS_trigger(). */
static inline int irq_trigger(int idx)
{
	int trigger = MPBIOS_trigger(idx);

	return trigger;
}
580 | |||
581 | static int pin_2_irq(int idx, int apic, int pin) | ||
582 | { | ||
583 | int irq, i; | ||
584 | int bus = mp_irqs[idx].mpc_srcbus; | ||
585 | |||
586 | /* | ||
587 | * Debugging check, we are in big trouble if this message pops up! | ||
588 | */ | ||
589 | if (mp_irqs[idx].mpc_dstirq != pin) | ||
590 | printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); | ||
591 | |||
592 | switch (mp_bus_id_to_type[bus]) | ||
593 | { | ||
594 | case MP_BUS_ISA: /* ISA pin */ | ||
595 | case MP_BUS_EISA: | ||
596 | case MP_BUS_MCA: | ||
597 | { | ||
598 | irq = mp_irqs[idx].mpc_srcbusirq; | ||
599 | break; | ||
600 | } | ||
601 | case MP_BUS_PCI: /* PCI pin */ | ||
602 | { | ||
603 | /* | ||
604 | * PCI IRQs are mapped in order | ||
605 | */ | ||
606 | i = irq = 0; | ||
607 | while (i < apic) | ||
608 | irq += nr_ioapic_registers[i++]; | ||
609 | irq += pin; | ||
610 | break; | ||
611 | } | ||
612 | default: | ||
613 | { | ||
614 | printk(KERN_ERR "unknown bus type %d.\n",bus); | ||
615 | irq = 0; | ||
616 | break; | ||
617 | } | ||
618 | } | ||
619 | |||
620 | /* | ||
621 | * PCI IRQ command line redirection. Yes, limits are hardcoded. | ||
622 | */ | ||
623 | if ((pin >= 16) && (pin <= 23)) { | ||
624 | if (pirq_entries[pin-16] != -1) { | ||
625 | if (!pirq_entries[pin-16]) { | ||
626 | apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16); | ||
627 | } else { | ||
628 | irq = pirq_entries[pin-16]; | ||
629 | apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n", | ||
630 | pin-16, irq); | ||
631 | } | ||
632 | } | ||
633 | } | ||
634 | return irq; | ||
635 | } | ||
636 | |||
637 | static inline int IO_APIC_irq_trigger(int irq) | ||
638 | { | ||
639 | int apic, idx, pin; | ||
640 | |||
641 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
642 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | ||
643 | idx = find_irq_entry(apic,pin,mp_INT); | ||
644 | if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) | ||
645 | return irq_trigger(idx); | ||
646 | } | ||
647 | } | ||
648 | /* | ||
649 | * nonexistent IRQs are edge default | ||
650 | */ | ||
651 | return 0; | ||
652 | } | ||
653 | |||
654 | /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ | ||
655 | u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 }; | ||
656 | |||
657 | int assign_irq_vector(int irq) | ||
658 | { | ||
659 | static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; | ||
660 | |||
661 | BUG_ON(irq >= NR_IRQ_VECTORS); | ||
662 | if (IO_APIC_VECTOR(irq) > 0) | ||
663 | return IO_APIC_VECTOR(irq); | ||
664 | next: | ||
665 | current_vector += 8; | ||
666 | if (current_vector == IA32_SYSCALL_VECTOR) | ||
667 | goto next; | ||
668 | |||
669 | if (current_vector >= FIRST_SYSTEM_VECTOR) { | ||
670 | offset++; | ||
671 | if (!(offset%8)) | ||
672 | return -ENOSPC; | ||
673 | current_vector = FIRST_DEVICE_VECTOR + offset; | ||
674 | } | ||
675 | |||
676 | vector_irq[current_vector] = irq; | ||
677 | if (irq != AUTO_ASSIGN) | ||
678 | IO_APIC_VECTOR(irq) = current_vector; | ||
679 | |||
680 | return current_vector; | ||
681 | } | ||
682 | |||
683 | extern void (*interrupt[NR_IRQS])(void); | ||
684 | static struct hw_interrupt_type ioapic_level_type; | ||
685 | static struct hw_interrupt_type ioapic_edge_type; | ||
686 | |||
687 | #define IOAPIC_AUTO -1 | ||
688 | #define IOAPIC_EDGE 0 | ||
689 | #define IOAPIC_LEVEL 1 | ||
690 | |||
691 | static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger) | ||
692 | { | ||
693 | if (use_pci_vector() && !platform_legacy_irq(irq)) { | ||
694 | if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || | ||
695 | trigger == IOAPIC_LEVEL) | ||
696 | irq_desc[vector].handler = &ioapic_level_type; | ||
697 | else | ||
698 | irq_desc[vector].handler = &ioapic_edge_type; | ||
699 | set_intr_gate(vector, interrupt[vector]); | ||
700 | } else { | ||
701 | if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || | ||
702 | trigger == IOAPIC_LEVEL) | ||
703 | irq_desc[irq].handler = &ioapic_level_type; | ||
704 | else | ||
705 | irq_desc[irq].handler = &ioapic_edge_type; | ||
706 | set_intr_gate(vector, interrupt[irq]); | ||
707 | } | ||
708 | } | ||
709 | |||
710 | static void __init setup_IO_APIC_irqs(void) | ||
711 | { | ||
712 | struct IO_APIC_route_entry entry; | ||
713 | int apic, pin, idx, irq, first_notcon = 1, vector; | ||
714 | unsigned long flags; | ||
715 | |||
716 | apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); | ||
717 | |||
718 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
719 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | ||
720 | |||
721 | /* | ||
722 | * add it to the IO-APIC irq-routing table: | ||
723 | */ | ||
724 | memset(&entry,0,sizeof(entry)); | ||
725 | |||
726 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
727 | entry.dest_mode = INT_DEST_MODE; | ||
728 | entry.mask = 0; /* enable IRQ */ | ||
729 | entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | ||
730 | |||
731 | idx = find_irq_entry(apic,pin,mp_INT); | ||
732 | if (idx == -1) { | ||
733 | if (first_notcon) { | ||
734 | apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); | ||
735 | first_notcon = 0; | ||
736 | } else | ||
737 | apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); | ||
738 | continue; | ||
739 | } | ||
740 | |||
741 | entry.trigger = irq_trigger(idx); | ||
742 | entry.polarity = irq_polarity(idx); | ||
743 | |||
744 | if (irq_trigger(idx)) { | ||
745 | entry.trigger = 1; | ||
746 | entry.mask = 1; | ||
747 | entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | ||
748 | } | ||
749 | |||
750 | irq = pin_2_irq(idx, apic, pin); | ||
751 | add_pin_to_irq(irq, apic, pin); | ||
752 | |||
753 | if (!apic && !IO_APIC_IRQ(irq)) | ||
754 | continue; | ||
755 | |||
756 | if (IO_APIC_IRQ(irq)) { | ||
757 | vector = assign_irq_vector(irq); | ||
758 | entry.vector = vector; | ||
759 | |||
760 | ioapic_register_intr(irq, vector, IOAPIC_AUTO); | ||
761 | if (!apic && (irq < 16)) | ||
762 | disable_8259A_irq(irq); | ||
763 | } | ||
764 | spin_lock_irqsave(&ioapic_lock, flags); | ||
765 | io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); | ||
766 | io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); | ||
767 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
768 | } | ||
769 | } | ||
770 | |||
771 | if (!first_notcon) | ||
772 | apic_printk(APIC_VERBOSE," not connected.\n"); | ||
773 | } | ||
774 | |||
775 | /* | ||
776 | * Set up the 8259A-master output pin as broadcast to all | ||
777 | * CPUs. | ||
778 | */ | ||
779 | static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector) | ||
780 | { | ||
781 | struct IO_APIC_route_entry entry; | ||
782 | unsigned long flags; | ||
783 | |||
784 | memset(&entry,0,sizeof(entry)); | ||
785 | |||
786 | disable_8259A_irq(0); | ||
787 | |||
788 | /* mask LVT0 */ | ||
789 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | ||
790 | |||
791 | /* | ||
792 | * We use logical delivery to get the timer IRQ | ||
793 | * to the first CPU. | ||
794 | */ | ||
795 | entry.dest_mode = INT_DEST_MODE; | ||
796 | entry.mask = 0; /* unmask IRQ now */ | ||
797 | entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | ||
798 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
799 | entry.polarity = 0; | ||
800 | entry.trigger = 0; | ||
801 | entry.vector = vector; | ||
802 | |||
803 | /* | ||
804 | * The timer IRQ doesn't have to know that behind the | ||
805 | * scene we have a 8259A-master in AEOI mode ... | ||
806 | */ | ||
807 | irq_desc[0].handler = &ioapic_edge_type; | ||
808 | |||
809 | /* | ||
810 | * Add it to the IO-APIC irq-routing table: | ||
811 | */ | ||
812 | spin_lock_irqsave(&ioapic_lock, flags); | ||
813 | io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); | ||
814 | io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); | ||
815 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
816 | |||
817 | enable_8259A_irq(0); | ||
818 | } | ||
819 | |||
820 | void __init UNEXPECTED_IO_APIC(void) | ||
821 | { | ||
822 | } | ||
823 | |||
824 | void __apicdebuginit print_IO_APIC(void) | ||
825 | { | ||
826 | int apic, i; | ||
827 | union IO_APIC_reg_00 reg_00; | ||
828 | union IO_APIC_reg_01 reg_01; | ||
829 | union IO_APIC_reg_02 reg_02; | ||
830 | unsigned long flags; | ||
831 | |||
832 | if (apic_verbosity == APIC_QUIET) | ||
833 | return; | ||
834 | |||
835 | printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); | ||
836 | for (i = 0; i < nr_ioapics; i++) | ||
837 | printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", | ||
838 | mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); | ||
839 | |||
840 | /* | ||
841 | * We are a bit conservative about what we expect. We have to | ||
842 | * know about every hardware change ASAP. | ||
843 | */ | ||
844 | printk(KERN_INFO "testing the IO APIC.......................\n"); | ||
845 | |||
846 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
847 | |||
848 | spin_lock_irqsave(&ioapic_lock, flags); | ||
849 | reg_00.raw = io_apic_read(apic, 0); | ||
850 | reg_01.raw = io_apic_read(apic, 1); | ||
851 | if (reg_01.bits.version >= 0x10) | ||
852 | reg_02.raw = io_apic_read(apic, 2); | ||
853 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
854 | |||
855 | printk("\n"); | ||
856 | printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); | ||
857 | printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); | ||
858 | printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); | ||
859 | if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) | ||
860 | UNEXPECTED_IO_APIC(); | ||
861 | |||
862 | printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); | ||
863 | printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); | ||
864 | if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ | ||
865 | (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ | ||
866 | (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ | ||
867 | (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ | ||
868 | (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ | ||
869 | (reg_01.bits.entries != 0x2E) && | ||
870 | (reg_01.bits.entries != 0x3F) && | ||
871 | (reg_01.bits.entries != 0x03) | ||
872 | ) | ||
873 | UNEXPECTED_IO_APIC(); | ||
874 | |||
875 | printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); | ||
876 | printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); | ||
877 | if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ | ||
878 | (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */ | ||
879 | (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ | ||
880 | (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ | ||
881 | (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ | ||
882 | (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ | ||
883 | ) | ||
884 | UNEXPECTED_IO_APIC(); | ||
885 | if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) | ||
886 | UNEXPECTED_IO_APIC(); | ||
887 | |||
888 | if (reg_01.bits.version >= 0x10) { | ||
889 | printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); | ||
890 | printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); | ||
891 | if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) | ||
892 | UNEXPECTED_IO_APIC(); | ||
893 | } | ||
894 | |||
895 | printk(KERN_DEBUG ".... IRQ redirection table:\n"); | ||
896 | |||
897 | printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" | ||
898 | " Stat Dest Deli Vect: \n"); | ||
899 | |||
900 | for (i = 0; i <= reg_01.bits.entries; i++) { | ||
901 | struct IO_APIC_route_entry entry; | ||
902 | |||
903 | spin_lock_irqsave(&ioapic_lock, flags); | ||
904 | *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); | ||
905 | *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); | ||
906 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
907 | |||
908 | printk(KERN_DEBUG " %02x %03X %02X ", | ||
909 | i, | ||
910 | entry.dest.logical.logical_dest, | ||
911 | entry.dest.physical.physical_dest | ||
912 | ); | ||
913 | |||
914 | printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", | ||
915 | entry.mask, | ||
916 | entry.trigger, | ||
917 | entry.irr, | ||
918 | entry.polarity, | ||
919 | entry.delivery_status, | ||
920 | entry.dest_mode, | ||
921 | entry.delivery_mode, | ||
922 | entry.vector | ||
923 | ); | ||
924 | } | ||
925 | } | ||
926 | if (use_pci_vector()) | ||
927 | printk(KERN_INFO "Using vector-based indexing\n"); | ||
928 | printk(KERN_DEBUG "IRQ to pin mappings:\n"); | ||
929 | for (i = 0; i < NR_IRQS; i++) { | ||
930 | struct irq_pin_list *entry = irq_2_pin + i; | ||
931 | if (entry->pin < 0) | ||
932 | continue; | ||
933 | if (use_pci_vector() && !platform_legacy_irq(i)) | ||
934 | printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); | ||
935 | else | ||
936 | printk(KERN_DEBUG "IRQ%d ", i); | ||
937 | for (;;) { | ||
938 | printk("-> %d:%d", entry->apic, entry->pin); | ||
939 | if (!entry->next) | ||
940 | break; | ||
941 | entry = irq_2_pin + entry->next; | ||
942 | } | ||
943 | printk("\n"); | ||
944 | } | ||
945 | |||
946 | printk(KERN_INFO ".................................... done.\n"); | ||
947 | |||
948 | return; | ||
949 | } | ||
950 | |||
951 | #if 0 | ||
952 | |||
953 | static __apicdebuginit void print_APIC_bitfield (int base) | ||
954 | { | ||
955 | unsigned int v; | ||
956 | int i, j; | ||
957 | |||
958 | if (apic_verbosity == APIC_QUIET) | ||
959 | return; | ||
960 | |||
961 | printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); | ||
962 | for (i = 0; i < 8; i++) { | ||
963 | v = apic_read(base + i*0x10); | ||
964 | for (j = 0; j < 32; j++) { | ||
965 | if (v & (1<<j)) | ||
966 | printk("1"); | ||
967 | else | ||
968 | printk("0"); | ||
969 | } | ||
970 | printk("\n"); | ||
971 | } | ||
972 | } | ||
973 | |||
974 | void __apicdebuginit print_local_APIC(void * dummy) | ||
975 | { | ||
976 | unsigned int v, ver, maxlvt; | ||
977 | |||
978 | if (apic_verbosity == APIC_QUIET) | ||
979 | return; | ||
980 | |||
981 | printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", | ||
982 | smp_processor_id(), hard_smp_processor_id()); | ||
983 | v = apic_read(APIC_ID); | ||
984 | printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); | ||
985 | v = apic_read(APIC_LVR); | ||
986 | printk(KERN_INFO "... APIC VERSION: %08x\n", v); | ||
987 | ver = GET_APIC_VERSION(v); | ||
988 | maxlvt = get_maxlvt(); | ||
989 | |||
990 | v = apic_read(APIC_TASKPRI); | ||
991 | printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); | ||
992 | |||
993 | if (APIC_INTEGRATED(ver)) { /* !82489DX */ | ||
994 | v = apic_read(APIC_ARBPRI); | ||
995 | printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, | ||
996 | v & APIC_ARBPRI_MASK); | ||
997 | v = apic_read(APIC_PROCPRI); | ||
998 | printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); | ||
999 | } | ||
1000 | |||
1001 | v = apic_read(APIC_EOI); | ||
1002 | printk(KERN_DEBUG "... APIC EOI: %08x\n", v); | ||
1003 | v = apic_read(APIC_RRR); | ||
1004 | printk(KERN_DEBUG "... APIC RRR: %08x\n", v); | ||
1005 | v = apic_read(APIC_LDR); | ||
1006 | printk(KERN_DEBUG "... APIC LDR: %08x\n", v); | ||
1007 | v = apic_read(APIC_DFR); | ||
1008 | printk(KERN_DEBUG "... APIC DFR: %08x\n", v); | ||
1009 | v = apic_read(APIC_SPIV); | ||
1010 | printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); | ||
1011 | |||
1012 | printk(KERN_DEBUG "... APIC ISR field:\n"); | ||
1013 | print_APIC_bitfield(APIC_ISR); | ||
1014 | printk(KERN_DEBUG "... APIC TMR field:\n"); | ||
1015 | print_APIC_bitfield(APIC_TMR); | ||
1016 | printk(KERN_DEBUG "... APIC IRR field:\n"); | ||
1017 | print_APIC_bitfield(APIC_IRR); | ||
1018 | |||
1019 | if (APIC_INTEGRATED(ver)) { /* !82489DX */ | ||
1020 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ | ||
1021 | apic_write(APIC_ESR, 0); | ||
1022 | v = apic_read(APIC_ESR); | ||
1023 | printk(KERN_DEBUG "... APIC ESR: %08x\n", v); | ||
1024 | } | ||
1025 | |||
1026 | v = apic_read(APIC_ICR); | ||
1027 | printk(KERN_DEBUG "... APIC ICR: %08x\n", v); | ||
1028 | v = apic_read(APIC_ICR2); | ||
1029 | printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); | ||
1030 | |||
1031 | v = apic_read(APIC_LVTT); | ||
1032 | printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); | ||
1033 | |||
1034 | if (maxlvt > 3) { /* PC is LVT#4. */ | ||
1035 | v = apic_read(APIC_LVTPC); | ||
1036 | printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); | ||
1037 | } | ||
1038 | v = apic_read(APIC_LVT0); | ||
1039 | printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); | ||
1040 | v = apic_read(APIC_LVT1); | ||
1041 | printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); | ||
1042 | |||
1043 | if (maxlvt > 2) { /* ERR is LVT#3. */ | ||
1044 | v = apic_read(APIC_LVTERR); | ||
1045 | printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); | ||
1046 | } | ||
1047 | |||
1048 | v = apic_read(APIC_TMICT); | ||
1049 | printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); | ||
1050 | v = apic_read(APIC_TMCCT); | ||
1051 | printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); | ||
1052 | v = apic_read(APIC_TDCR); | ||
1053 | printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); | ||
1054 | printk("\n"); | ||
1055 | } | ||
1056 | |||
1057 | void print_all_local_APICs (void) | ||
1058 | { | ||
1059 | on_each_cpu(print_local_APIC, NULL, 1, 1); | ||
1060 | } | ||
1061 | |||
1062 | void __apicdebuginit print_PIC(void) | ||
1063 | { | ||
1064 | extern spinlock_t i8259A_lock; | ||
1065 | unsigned int v; | ||
1066 | unsigned long flags; | ||
1067 | |||
1068 | if (apic_verbosity == APIC_QUIET) | ||
1069 | return; | ||
1070 | |||
1071 | printk(KERN_DEBUG "\nprinting PIC contents\n"); | ||
1072 | |||
1073 | spin_lock_irqsave(&i8259A_lock, flags); | ||
1074 | |||
1075 | v = inb(0xa1) << 8 | inb(0x21); | ||
1076 | printk(KERN_DEBUG "... PIC IMR: %04x\n", v); | ||
1077 | |||
1078 | v = inb(0xa0) << 8 | inb(0x20); | ||
1079 | printk(KERN_DEBUG "... PIC IRR: %04x\n", v); | ||
1080 | |||
1081 | outb(0x0b,0xa0); | ||
1082 | outb(0x0b,0x20); | ||
1083 | v = inb(0xa0) << 8 | inb(0x20); | ||
1084 | outb(0x0a,0xa0); | ||
1085 | outb(0x0a,0x20); | ||
1086 | |||
1087 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
1088 | |||
1089 | printk(KERN_DEBUG "... PIC ISR: %04x\n", v); | ||
1090 | |||
1091 | v = inb(0x4d1) << 8 | inb(0x4d0); | ||
1092 | printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); | ||
1093 | } | ||
1094 | |||
1095 | #endif /* 0 */ | ||
1096 | |||
1097 | static void __init enable_IO_APIC(void) | ||
1098 | { | ||
1099 | union IO_APIC_reg_01 reg_01; | ||
1100 | int i; | ||
1101 | unsigned long flags; | ||
1102 | |||
1103 | for (i = 0; i < PIN_MAP_SIZE; i++) { | ||
1104 | irq_2_pin[i].pin = -1; | ||
1105 | irq_2_pin[i].next = 0; | ||
1106 | } | ||
1107 | if (!pirqs_enabled) | ||
1108 | for (i = 0; i < MAX_PIRQS; i++) | ||
1109 | pirq_entries[i] = -1; | ||
1110 | |||
1111 | /* | ||
1112 | * The number of IO-APIC IRQ registers (== #pins): | ||
1113 | */ | ||
1114 | for (i = 0; i < nr_ioapics; i++) { | ||
1115 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1116 | reg_01.raw = io_apic_read(i, 1); | ||
1117 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1118 | nr_ioapic_registers[i] = reg_01.bits.entries+1; | ||
1119 | } | ||
1120 | |||
1121 | /* | ||
1122 | * Do not trust the IO-APIC being empty at bootup | ||
1123 | */ | ||
1124 | clear_IO_APIC(); | ||
1125 | } | ||
1126 | |||
/*
 * Shut the I/O APIC down: clear all of its entries, then disconnect
 * the boot processor's APIC.
 *
 * Not an __init, needed by the reboot code.
 */
void disable_IO_APIC(void)
{
	/* Clear the IO-APIC before rebooting: */
	clear_IO_APIC();

	disconnect_bsp_APIC();
}
1139 | |||
1140 | /* | ||
1141 | * function to set the IO-APIC physical IDs based on the | ||
1142 | * values stored in the MPC table. | ||
1143 | * | ||
1144 | * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 | ||
1145 | */ | ||
1146 | |||
1147 | static void __init setup_ioapic_ids_from_mpc (void) | ||
1148 | { | ||
1149 | union IO_APIC_reg_00 reg_00; | ||
1150 | int apic; | ||
1151 | int i; | ||
1152 | unsigned char old_id; | ||
1153 | unsigned long flags; | ||
1154 | |||
1155 | /* | ||
1156 | * Set the IOAPIC ID to the value stored in the MPC table. | ||
1157 | */ | ||
1158 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1159 | |||
1160 | /* Read the register 0 value */ | ||
1161 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1162 | reg_00.raw = io_apic_read(apic, 0); | ||
1163 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1164 | |||
1165 | old_id = mp_ioapics[apic].mpc_apicid; | ||
1166 | |||
1167 | |||
1168 | printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid); | ||
1169 | |||
1170 | |||
1171 | /* | ||
1172 | * We need to adjust the IRQ routing table | ||
1173 | * if the ID changed. | ||
1174 | */ | ||
1175 | if (old_id != mp_ioapics[apic].mpc_apicid) | ||
1176 | for (i = 0; i < mp_irq_entries; i++) | ||
1177 | if (mp_irqs[i].mpc_dstapic == old_id) | ||
1178 | mp_irqs[i].mpc_dstapic | ||
1179 | = mp_ioapics[apic].mpc_apicid; | ||
1180 | |||
1181 | /* | ||
1182 | * Read the right value from the MPC table and | ||
1183 | * write it into the ID register. | ||
1184 | */ | ||
1185 | apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", | ||
1186 | mp_ioapics[apic].mpc_apicid); | ||
1187 | |||
1188 | reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; | ||
1189 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1190 | io_apic_write(apic, 0, reg_00.raw); | ||
1191 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1192 | |||
1193 | /* | ||
1194 | * Sanity check | ||
1195 | */ | ||
1196 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1197 | reg_00.raw = io_apic_read(apic, 0); | ||
1198 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1199 | if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) | ||
1200 | printk("could not set ID!\n"); | ||
1201 | else | ||
1202 | apic_printk(APIC_VERBOSE," ok.\n"); | ||
1203 | } | ||
1204 | } | ||
1205 | |||
1206 | /* | ||
1207 | * There is a nasty bug in some older SMP boards, their mptable lies | ||
1208 | * about the timer IRQ. We do the following to work around the situation: | ||
1209 | * | ||
1210 | * - timer IRQ defaults to IO-APIC IRQ | ||
1211 | * - if this function detects that timer IRQs are defunct, then we fall | ||
1212 | * back to ISA timer IRQs | ||
1213 | */ | ||
1214 | static int __init timer_irq_works(void) | ||
1215 | { | ||
1216 | unsigned long t1 = jiffies; | ||
1217 | |||
1218 | local_irq_enable(); | ||
1219 | /* Let ten ticks pass... */ | ||
1220 | mdelay((10 * 1000) / HZ); | ||
1221 | |||
1222 | /* | ||
1223 | * Expect a few ticks at least, to be sure some possible | ||
1224 | * glue logic does not lock up after one or two first | ||
1225 | * ticks in a non-ExtINT mode. Also the local APIC | ||
1226 | * might have cached one ExtINT interrupt. Finally, at | ||
1227 | * least one tick may be lost due to delays. | ||
1228 | */ | ||
1229 | |||
1230 | /* jiffies wrap? */ | ||
1231 | if (jiffies - t1 > 4) | ||
1232 | return 1; | ||
1233 | return 0; | ||
1234 | } | ||
1235 | |||
1236 | /* | ||
1237 | * In the SMP+IOAPIC case it might happen that there are an unspecified | ||
1238 | * number of pending IRQ events unhandled. These cases are very rare, | ||
1239 | * so we 'resend' these IRQs via IPIs, to the same CPU. It's much | ||
1240 | * better to do it this way as thus we do not have to be aware of | ||
1241 | * 'pending' interrupts in the IRQ path, except at this point. | ||
1242 | */ | ||
1243 | /* | ||
1244 | * Edge triggered needs to resend any interrupt | ||
1245 | * that was delayed but this is now handled in the device | ||
1246 | * independent code. | ||
1247 | */ | ||
1248 | |||
1249 | /* | ||
1250 | * Starting up a edge-triggered IO-APIC interrupt is | ||
1251 | * nasty - we need to make sure that we get the edge. | ||
1252 | * If it is already asserted for some reason, we need | ||
1253 | * return 1 to indicate that is was pending. | ||
1254 | * | ||
1255 | * This is not complete - we should be able to fake | ||
1256 | * an edge even if it isn't on the 8259A... | ||
1257 | */ | ||
1258 | |||
1259 | static unsigned int startup_edge_ioapic_irq(unsigned int irq) | ||
1260 | { | ||
1261 | int was_pending = 0; | ||
1262 | unsigned long flags; | ||
1263 | |||
1264 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1265 | if (irq < 16) { | ||
1266 | disable_8259A_irq(irq); | ||
1267 | if (i8259A_irq_pending(irq)) | ||
1268 | was_pending = 1; | ||
1269 | } | ||
1270 | __unmask_IO_APIC_irq(irq); | ||
1271 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1272 | |||
1273 | return was_pending; | ||
1274 | } | ||
1275 | |||
1276 | /* | ||
1277 | * Once we have recorded IRQ_PENDING already, we can mask the | ||
1278 | * interrupt for real. This prevents IRQ storms from unhandled | ||
1279 | * devices. | ||
1280 | */ | ||
1281 | static void ack_edge_ioapic_irq(unsigned int irq) | ||
1282 | { | ||
1283 | if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) | ||
1284 | == (IRQ_PENDING | IRQ_DISABLED)) | ||
1285 | mask_IO_APIC_irq(irq); | ||
1286 | ack_APIC_irq(); | ||
1287 | } | ||
1288 | |||
/*
 * Level triggered interrupts can just be masked, and shutting down and
 * starting up the interrupt is the same as enabling and disabling
 * them -- except that a startup needs to return a "was pending" value.
 *
 * Level triggered interrupts are special because we do not touch any
 * IO-APIC register while handling them. We ack the APIC in the end-IRQ
 * handler, not in the start-IRQ-handler. Protection against reentrance
 * from the same interrupt is still provided, both by the generic IRQ
 * layer and by the fact that an unacked local APIC does not accept
 * IRQs.
 *
 * Unmasks the IRQ and always reports 0 (pending state is not checked).
 */
static unsigned int startup_level_ioapic_irq(unsigned int irq)
{
	unmask_IO_APIC_irq(irq);

	/* don't check for pending */
	return 0;
}
1309 | |||
/* End-of-IRQ hook for level-triggered IO-APIC interrupts: acks the APIC. */
static void end_level_ioapic_irq(unsigned int irq)
{
	ack_APIC_irq();
}
1314 | |||
1315 | static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) | ||
1316 | { | ||
1317 | unsigned long flags; | ||
1318 | unsigned int dest; | ||
1319 | |||
1320 | dest = cpu_mask_to_apicid(mask); | ||
1321 | |||
1322 | /* | ||
1323 | * Only the high 8 bits are valid. | ||
1324 | */ | ||
1325 | dest = SET_APIC_LOGICAL_ID(dest); | ||
1326 | |||
1327 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1328 | __DO_ACTION(1, = dest, ) | ||
1329 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1330 | } | ||
1331 | |||
#ifdef CONFIG_PCI_MSI
/*
 * Vector-indexed variants of the IO-APIC handlers, built only with
 * CONFIG_PCI_MSI. Each one translates its argument with vector_to_irq()
 * and forwards to the matching IRQ-indexed routine.
 */
static unsigned int startup_edge_ioapic_vector(unsigned int vector)
{
	return startup_edge_ioapic_irq(vector_to_irq(vector));
}

static void ack_edge_ioapic_vector(unsigned int vector)
{
	ack_edge_ioapic_irq(vector_to_irq(vector));
}

static unsigned int startup_level_ioapic_vector(unsigned int vector)
{
	return startup_level_ioapic_irq(vector_to_irq(vector));
}

static void end_level_ioapic_vector(unsigned int vector)
{
	end_level_ioapic_irq(vector_to_irq(vector));
}

static void mask_IO_APIC_vector(unsigned int vector)
{
	mask_IO_APIC_irq(vector_to_irq(vector));
}

static void unmask_IO_APIC_vector(unsigned int vector)
{
	unmask_IO_APIC_irq(vector_to_irq(vector));
}

static void set_ioapic_affinity_vector(unsigned int vector,
				       cpumask_t cpu_mask)
{
	set_ioapic_affinity_irq(vector_to_irq(vector), cpu_mask);
}
#endif
1383 | |||
1384 | /* | ||
1385 | * Level and edge triggered IO-APIC interrupts need different handling, | ||
1386 | * so we use two separate IRQ descriptors. Edge triggered IRQs can be | ||
1387 | * handled with the level-triggered descriptor, but that one has slightly | ||
1388 | * more overhead. Level-triggered interrupts cannot be handled with the | ||
1389 | * edge-triggered handler, without risking IRQ storms and other ugly | ||
1390 | * races. | ||
1391 | */ | ||
1392 | |||
1393 | static struct hw_interrupt_type ioapic_edge_type = { | ||
1394 | .typename = "IO-APIC-edge", | ||
1395 | .startup = startup_edge_ioapic, | ||
1396 | .shutdown = shutdown_edge_ioapic, | ||
1397 | .enable = enable_edge_ioapic, | ||
1398 | .disable = disable_edge_ioapic, | ||
1399 | .ack = ack_edge_ioapic, | ||
1400 | .end = end_edge_ioapic, | ||
1401 | .set_affinity = set_ioapic_affinity, | ||
1402 | }; | ||
1403 | |||
1404 | static struct hw_interrupt_type ioapic_level_type = { | ||
1405 | .typename = "IO-APIC-level", | ||
1406 | .startup = startup_level_ioapic, | ||
1407 | .shutdown = shutdown_level_ioapic, | ||
1408 | .enable = enable_level_ioapic, | ||
1409 | .disable = disable_level_ioapic, | ||
1410 | .ack = mask_and_ack_level_ioapic, | ||
1411 | .end = end_level_ioapic, | ||
1412 | .set_affinity = set_ioapic_affinity, | ||
1413 | }; | ||
1414 | |||
1415 | static inline void init_IO_APIC_traps(void) | ||
1416 | { | ||
1417 | int irq; | ||
1418 | |||
1419 | /* | ||
1420 | * NOTE! The local APIC isn't very good at handling | ||
1421 | * multiple interrupts at the same interrupt level. | ||
1422 | * As the interrupt level is determined by taking the | ||
1423 | * vector number and shifting that right by 4, we | ||
1424 | * want to spread these out a bit so that they don't | ||
1425 | * all fall in the same interrupt level. | ||
1426 | * | ||
1427 | * Also, we've got to be careful not to trash gate | ||
1428 | * 0x80, because int 0x80 is hm, kind of importantish. ;) | ||
1429 | */ | ||
1430 | for (irq = 0; irq < NR_IRQS ; irq++) { | ||
1431 | int tmp = irq; | ||
1432 | if (use_pci_vector()) { | ||
1433 | if (!platform_legacy_irq(tmp)) | ||
1434 | if ((tmp = vector_to_irq(tmp)) == -1) | ||
1435 | continue; | ||
1436 | } | ||
1437 | if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { | ||
1438 | /* | ||
1439 | * Hmm.. We don't have an entry for this, | ||
1440 | * so default to an old-fashioned 8259 | ||
1441 | * interrupt if we can.. | ||
1442 | */ | ||
1443 | if (irq < 16) | ||
1444 | make_8259A_irq(irq); | ||
1445 | else | ||
1446 | /* Strange. Oh, well.. */ | ||
1447 | irq_desc[irq].handler = &no_irq_type; | ||
1448 | } | ||
1449 | } | ||
1450 | } | ||
1451 | |||
1452 | static void enable_lapic_irq (unsigned int irq) | ||
1453 | { | ||
1454 | unsigned long v; | ||
1455 | |||
1456 | v = apic_read(APIC_LVT0); | ||
1457 | apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); | ||
1458 | } | ||
1459 | |||
1460 | static void disable_lapic_irq (unsigned int irq) | ||
1461 | { | ||
1462 | unsigned long v; | ||
1463 | |||
1464 | v = apic_read(APIC_LVT0); | ||
1465 | apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); | ||
1466 | } | ||
1467 | |||
/* Ack hook of lapic_irq_type: simply acks the APIC; @irq is not used. */
static void ack_lapic_irq(unsigned int irq)
{
	ack_APIC_irq();
}
1472 | |||
/* End hook of lapic_irq_type: intentionally does nothing. */
static void end_lapic_irq(unsigned int i)
{
	/* nothing */
}
1474 | |||
1475 | static struct hw_interrupt_type lapic_irq_type = { | ||
1476 | .typename = "local-APIC-edge", | ||
1477 | .startup = NULL, /* startup_irq() not used for IRQ0 */ | ||
1478 | .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ | ||
1479 | .enable = enable_lapic_irq, | ||
1480 | .disable = disable_lapic_irq, | ||
1481 | .ack = ack_lapic_irq, | ||
1482 | .end = end_lapic_irq, | ||
1483 | }; | ||
1484 | |||
1485 | static void setup_nmi (void) | ||
1486 | { | ||
1487 | /* | ||
1488 | * Dirty trick to enable the NMI watchdog ... | ||
1489 | * We put the 8259A master into AEOI mode and | ||
1490 | * unmask on all local APICs LVT0 as NMI. | ||
1491 | * | ||
1492 | * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') | ||
1493 | * is from Maciej W. Rozycki - so we do not have to EOI from | ||
1494 | * the NMI handler or the timer interrupt. | ||
1495 | */ | ||
1496 | printk(KERN_INFO "activating NMI Watchdog ..."); | ||
1497 | |||
1498 | enable_NMI_through_LVT0(NULL); | ||
1499 | |||
1500 | printk(" done.\n"); | ||
1501 | } | ||
1502 | |||
1503 | /* | ||
1504 | * This looks a bit hackish but it's about the only one way of sending | ||
1505 | * a few INTA cycles to 8259As and any associated glue logic. ICR does | ||
1506 | * not support the ExtINT mode, unfortunately. We need to send these | ||
1507 | * cycles as some i82489DX-based boards have glue logic that keeps the | ||
1508 | * 8259A interrupt line asserted until INTA. --macro | ||
1509 | */ | ||
1510 | static inline void unlock_ExtINT_logic(void) | ||
1511 | { | ||
1512 | int pin, i; | ||
1513 | struct IO_APIC_route_entry entry0, entry1; | ||
1514 | unsigned char save_control, save_freq_select; | ||
1515 | unsigned long flags; | ||
1516 | |||
1517 | pin = find_isa_irq_pin(8, mp_INT); | ||
1518 | if (pin == -1) | ||
1519 | return; | ||
1520 | |||
1521 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1522 | *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin); | ||
1523 | *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin); | ||
1524 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1525 | clear_IO_APIC_pin(0, pin); | ||
1526 | |||
1527 | memset(&entry1, 0, sizeof(entry1)); | ||
1528 | |||
1529 | entry1.dest_mode = 0; /* physical delivery */ | ||
1530 | entry1.mask = 0; /* unmask IRQ now */ | ||
1531 | entry1.dest.physical.physical_dest = hard_smp_processor_id(); | ||
1532 | entry1.delivery_mode = dest_ExtINT; | ||
1533 | entry1.polarity = entry0.polarity; | ||
1534 | entry1.trigger = 0; | ||
1535 | entry1.vector = 0; | ||
1536 | |||
1537 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1538 | io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); | ||
1539 | io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); | ||
1540 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1541 | |||
1542 | save_control = CMOS_READ(RTC_CONTROL); | ||
1543 | save_freq_select = CMOS_READ(RTC_FREQ_SELECT); | ||
1544 | CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, | ||
1545 | RTC_FREQ_SELECT); | ||
1546 | CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); | ||
1547 | |||
1548 | i = 100; | ||
1549 | while (i-- > 0) { | ||
1550 | mdelay(10); | ||
1551 | if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) | ||
1552 | i -= 10; | ||
1553 | } | ||
1554 | |||
1555 | CMOS_WRITE(save_control, RTC_CONTROL); | ||
1556 | CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); | ||
1557 | clear_IO_APIC_pin(0, pin); | ||
1558 | |||
1559 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1560 | io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); | ||
1561 | io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); | ||
1562 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1563 | } | ||
1564 | |||
1565 | /* | ||
1566 | * This code may look a bit paranoid, but it's supposed to cooperate with | ||
1567 | * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ | ||
1568 | * is so screwy. Thanks to Brian Perkins for testing/hacking this beast | ||
1569 | * fanatically on his truly buggy board. | ||
1570 | */ | ||
1571 | static inline void check_timer(void) | ||
1572 | { | ||
1573 | int pin1, pin2; | ||
1574 | int vector; | ||
1575 | |||
1576 | /* | ||
1577 | * get/set the timer IRQ vector: | ||
1578 | */ | ||
1579 | disable_8259A_irq(0); | ||
1580 | vector = assign_irq_vector(0); | ||
1581 | set_intr_gate(vector, interrupt[0]); | ||
1582 | |||
1583 | /* | ||
1584 | * Subtle, code in do_timer_interrupt() expects an AEOI | ||
1585 | * mode for the 8259A whenever interrupts are routed | ||
1586 | * through I/O APICs. Also IRQ0 has to be enabled in | ||
1587 | * the 8259A which implies the virtual wire has to be | ||
1588 | * disabled in the local APIC. | ||
1589 | */ | ||
1590 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | ||
1591 | init_8259A(1); | ||
1592 | enable_8259A_irq(0); | ||
1593 | |||
1594 | pin1 = find_isa_irq_pin(0, mp_INT); | ||
1595 | pin2 = find_isa_irq_pin(0, mp_ExtINT); | ||
1596 | |||
1597 | apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2); | ||
1598 | |||
1599 | if (pin1 != -1) { | ||
1600 | /* | ||
1601 | * Ok, does IRQ0 through the IOAPIC work? | ||
1602 | */ | ||
1603 | unmask_IO_APIC_irq(0); | ||
1604 | if (timer_irq_works()) { | ||
1605 | nmi_watchdog_default(); | ||
1606 | if (nmi_watchdog == NMI_IO_APIC) { | ||
1607 | disable_8259A_irq(0); | ||
1608 | setup_nmi(); | ||
1609 | enable_8259A_irq(0); | ||
1610 | check_nmi_watchdog(); | ||
1611 | } | ||
1612 | return; | ||
1613 | } | ||
1614 | clear_IO_APIC_pin(0, pin1); | ||
1615 | apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"); | ||
1616 | } | ||
1617 | |||
1618 | apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); | ||
1619 | if (pin2 != -1) { | ||
1620 | apic_printk(APIC_VERBOSE,"\n..... (found pin %d) ...", pin2); | ||
1621 | /* | ||
1622 | * legacy devices should be connected to IO APIC #0 | ||
1623 | */ | ||
1624 | setup_ExtINT_IRQ0_pin(pin2, vector); | ||
1625 | if (timer_irq_works()) { | ||
1626 | printk("works.\n"); | ||
1627 | nmi_watchdog_default(); | ||
1628 | if (nmi_watchdog == NMI_IO_APIC) { | ||
1629 | setup_nmi(); | ||
1630 | check_nmi_watchdog(); | ||
1631 | } | ||
1632 | return; | ||
1633 | } | ||
1634 | /* | ||
1635 | * Cleanup, just in case ... | ||
1636 | */ | ||
1637 | clear_IO_APIC_pin(0, pin2); | ||
1638 | } | ||
1639 | printk(" failed.\n"); | ||
1640 | |||
1641 | if (nmi_watchdog) { | ||
1642 | printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); | ||
1643 | nmi_watchdog = 0; | ||
1644 | } | ||
1645 | |||
1646 | apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); | ||
1647 | |||
1648 | disable_8259A_irq(0); | ||
1649 | irq_desc[0].handler = &lapic_irq_type; | ||
1650 | apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ | ||
1651 | enable_8259A_irq(0); | ||
1652 | |||
1653 | if (timer_irq_works()) { | ||
1654 | apic_printk(APIC_QUIET, " works.\n"); | ||
1655 | return; | ||
1656 | } | ||
1657 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); | ||
1658 | apic_printk(APIC_VERBOSE," failed.\n"); | ||
1659 | |||
1660 | apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); | ||
1661 | |||
1662 | init_8259A(0); | ||
1663 | make_8259A_irq(0); | ||
1664 | apic_write_around(APIC_LVT0, APIC_DM_EXTINT); | ||
1665 | |||
1666 | unlock_ExtINT_logic(); | ||
1667 | |||
1668 | if (timer_irq_works()) { | ||
1669 | apic_printk(APIC_VERBOSE," works.\n"); | ||
1670 | return; | ||
1671 | } | ||
1672 | apic_printk(APIC_VERBOSE," failed :(.\n"); | ||
1673 | panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); | ||
1674 | } | ||
1675 | |||
/*
 * IRQs that are handled by the PIC in the MPS IOAPIC case.
 * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
 *   Linux doesn't really care, as it's not actually used
 *   for any interrupt handling anyway.
 *
 * Bitmask with only bit 2 (IRQ2) set.
 */
#define PIC_IRQS	(1 << 2)
1684 | |||
1685 | void __init setup_IO_APIC(void) | ||
1686 | { | ||
1687 | enable_IO_APIC(); | ||
1688 | |||
1689 | if (acpi_ioapic) | ||
1690 | io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ | ||
1691 | else | ||
1692 | io_apic_irqs = ~PIC_IRQS; | ||
1693 | |||
1694 | apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); | ||
1695 | |||
1696 | /* | ||
1697 | * Set up the IO-APIC IRQ routing table. | ||
1698 | */ | ||
1699 | if (!acpi_ioapic) | ||
1700 | setup_ioapic_ids_from_mpc(); | ||
1701 | sync_Arb_IDs(); | ||
1702 | setup_IO_APIC_irqs(); | ||
1703 | init_IO_APIC_traps(); | ||
1704 | check_timer(); | ||
1705 | if (!acpi_ioapic) | ||
1706 | print_IO_APIC(); | ||
1707 | } | ||
1708 | |||
1709 | struct sysfs_ioapic_data { | ||
1710 | struct sys_device dev; | ||
1711 | struct IO_APIC_route_entry entry[0]; | ||
1712 | }; | ||
1713 | static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; | ||
1714 | |||
1715 | static int ioapic_suspend(struct sys_device *dev, u32 state) | ||
1716 | { | ||
1717 | struct IO_APIC_route_entry *entry; | ||
1718 | struct sysfs_ioapic_data *data; | ||
1719 | unsigned long flags; | ||
1720 | int i; | ||
1721 | |||
1722 | data = container_of(dev, struct sysfs_ioapic_data, dev); | ||
1723 | entry = data->entry; | ||
1724 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1725 | for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { | ||
1726 | *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); | ||
1727 | *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); | ||
1728 | } | ||
1729 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1730 | |||
1731 | return 0; | ||
1732 | } | ||
1733 | |||
1734 | static int ioapic_resume(struct sys_device *dev) | ||
1735 | { | ||
1736 | struct IO_APIC_route_entry *entry; | ||
1737 | struct sysfs_ioapic_data *data; | ||
1738 | unsigned long flags; | ||
1739 | union IO_APIC_reg_00 reg_00; | ||
1740 | int i; | ||
1741 | |||
1742 | data = container_of(dev, struct sysfs_ioapic_data, dev); | ||
1743 | entry = data->entry; | ||
1744 | |||
1745 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1746 | reg_00.raw = io_apic_read(dev->id, 0); | ||
1747 | if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { | ||
1748 | reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; | ||
1749 | io_apic_write(dev->id, 0, reg_00.raw); | ||
1750 | } | ||
1751 | for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { | ||
1752 | io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); | ||
1753 | io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); | ||
1754 | } | ||
1755 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1756 | |||
1757 | return 0; | ||
1758 | } | ||
1759 | |||
1760 | static struct sysdev_class ioapic_sysdev_class = { | ||
1761 | set_kset_name("ioapic"), | ||
1762 | .suspend = ioapic_suspend, | ||
1763 | .resume = ioapic_resume, | ||
1764 | }; | ||
1765 | |||
1766 | static int __init ioapic_init_sysfs(void) | ||
1767 | { | ||
1768 | struct sys_device * dev; | ||
1769 | int i, size, error = 0; | ||
1770 | |||
1771 | error = sysdev_class_register(&ioapic_sysdev_class); | ||
1772 | if (error) | ||
1773 | return error; | ||
1774 | |||
1775 | for (i = 0; i < nr_ioapics; i++ ) { | ||
1776 | size = sizeof(struct sys_device) + nr_ioapic_registers[i] | ||
1777 | * sizeof(struct IO_APIC_route_entry); | ||
1778 | mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); | ||
1779 | if (!mp_ioapic_data[i]) { | ||
1780 | printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | ||
1781 | continue; | ||
1782 | } | ||
1783 | memset(mp_ioapic_data[i], 0, size); | ||
1784 | dev = &mp_ioapic_data[i]->dev; | ||
1785 | dev->id = i; | ||
1786 | dev->cls = &ioapic_sysdev_class; | ||
1787 | error = sysdev_register(dev); | ||
1788 | if (error) { | ||
1789 | kfree(mp_ioapic_data[i]); | ||
1790 | mp_ioapic_data[i] = NULL; | ||
1791 | printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | ||
1792 | continue; | ||
1793 | } | ||
1794 | } | ||
1795 | |||
1796 | return 0; | ||
1797 | } | ||
1798 | |||
1799 | device_initcall(ioapic_init_sysfs); | ||
1800 | |||
1801 | /* -------------------------------------------------------------------------- | ||
1802 | ACPI-based IOAPIC Configuration | ||
1803 | -------------------------------------------------------------------------- */ | ||
1804 | |||
1805 | #ifdef CONFIG_ACPI_BOOT | ||
1806 | |||
1807 | #define IO_APIC_MAX_ID 0xFE | ||
1808 | |||
1809 | int __init io_apic_get_unique_id (int ioapic, int apic_id) | ||
1810 | { | ||
1811 | union IO_APIC_reg_00 reg_00; | ||
1812 | static physid_mask_t apic_id_map; | ||
1813 | unsigned long flags; | ||
1814 | int i = 0; | ||
1815 | |||
1816 | /* | ||
1817 | * The P4 platform supports up to 256 APIC IDs on two separate APIC | ||
1818 | * buses (one for LAPICs, one for IOAPICs), where predecessors only | ||
1819 | * supports up to 16 on one shared APIC bus. | ||
1820 | * | ||
1821 | * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full | ||
1822 | * advantage of new APIC bus architecture. | ||
1823 | */ | ||
1824 | |||
1825 | if (physids_empty(apic_id_map)) | ||
1826 | apic_id_map = phys_cpu_present_map; | ||
1827 | |||
1828 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1829 | reg_00.raw = io_apic_read(ioapic, 0); | ||
1830 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1831 | |||
1832 | if (apic_id >= IO_APIC_MAX_ID) { | ||
1833 | apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " | ||
1834 | "%d\n", ioapic, apic_id, reg_00.bits.ID); | ||
1835 | apic_id = reg_00.bits.ID; | ||
1836 | } | ||
1837 | |||
1838 | /* | ||
1839 | * Every APIC in a system must have a unique ID or we get lots of nice | ||
1840 | * 'stuck on smp_invalidate_needed IPI wait' messages. | ||
1841 | */ | ||
1842 | if (physid_isset(apic_id, apic_id_map)) { | ||
1843 | |||
1844 | for (i = 0; i < IO_APIC_MAX_ID; i++) { | ||
1845 | if (!physid_isset(i, apic_id_map)) | ||
1846 | break; | ||
1847 | } | ||
1848 | |||
1849 | if (i == IO_APIC_MAX_ID) | ||
1850 | panic("Max apic_id exceeded!\n"); | ||
1851 | |||
1852 | apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " | ||
1853 | "trying %d\n", ioapic, apic_id, i); | ||
1854 | |||
1855 | apic_id = i; | ||
1856 | } | ||
1857 | |||
1858 | physid_set(apic_id, apic_id_map); | ||
1859 | |||
1860 | if (reg_00.bits.ID != apic_id) { | ||
1861 | reg_00.bits.ID = apic_id; | ||
1862 | |||
1863 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1864 | io_apic_write(ioapic, 0, reg_00.raw); | ||
1865 | reg_00.raw = io_apic_read(ioapic, 0); | ||
1866 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1867 | |||
1868 | /* Sanity check */ | ||
1869 | if (reg_00.bits.ID != apic_id) | ||
1870 | panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic); | ||
1871 | } | ||
1872 | |||
1873 | apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); | ||
1874 | |||
1875 | return apic_id; | ||
1876 | } | ||
1877 | |||
1878 | |||
1879 | int __init io_apic_get_version (int ioapic) | ||
1880 | { | ||
1881 | union IO_APIC_reg_01 reg_01; | ||
1882 | unsigned long flags; | ||
1883 | |||
1884 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1885 | reg_01.raw = io_apic_read(ioapic, 1); | ||
1886 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1887 | |||
1888 | return reg_01.bits.version; | ||
1889 | } | ||
1890 | |||
1891 | |||
1892 | int __init io_apic_get_redir_entries (int ioapic) | ||
1893 | { | ||
1894 | union IO_APIC_reg_01 reg_01; | ||
1895 | unsigned long flags; | ||
1896 | |||
1897 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1898 | reg_01.raw = io_apic_read(ioapic, 1); | ||
1899 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1900 | |||
1901 | return reg_01.bits.entries; | ||
1902 | } | ||
1903 | |||
1904 | |||
1905 | int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) | ||
1906 | { | ||
1907 | struct IO_APIC_route_entry entry; | ||
1908 | unsigned long flags; | ||
1909 | |||
1910 | if (!IO_APIC_IRQ(irq)) { | ||
1911 | apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", | ||
1912 | ioapic); | ||
1913 | return -EINVAL; | ||
1914 | } | ||
1915 | |||
1916 | /* | ||
1917 | * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. | ||
1918 | * Note that we mask (disable) IRQs now -- these get enabled when the | ||
1919 | * corresponding device driver registers for this IRQ. | ||
1920 | */ | ||
1921 | |||
1922 | memset(&entry,0,sizeof(entry)); | ||
1923 | |||
1924 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
1925 | entry.dest_mode = INT_DEST_MODE; | ||
1926 | entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | ||
1927 | entry.trigger = edge_level; | ||
1928 | entry.polarity = active_high_low; | ||
1929 | entry.mask = 1; /* Disabled (masked) */ | ||
1930 | |||
1931 | /* | ||
1932 | * IRQs < 16 are already in the irq_2_pin[] map | ||
1933 | */ | ||
1934 | if (irq >= 16) | ||
1935 | add_pin_to_irq(irq, ioapic, pin); | ||
1936 | |||
1937 | entry.vector = assign_irq_vector(irq); | ||
1938 | |||
1939 | apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " | ||
1940 | "IRQ %d Mode:%i Active:%i)\n", ioapic, | ||
1941 | mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, | ||
1942 | edge_level, active_high_low); | ||
1943 | |||
1944 | ioapic_register_intr(irq, entry.vector, edge_level); | ||
1945 | |||
1946 | if (!ioapic && (irq < 16)) | ||
1947 | disable_8259A_irq(irq); | ||
1948 | |||
1949 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1950 | io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); | ||
1951 | io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); | ||
1952 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1953 | |||
1954 | return 0; | ||
1955 | } | ||
1956 | |||
1957 | #endif /*CONFIG_ACPI_BOOT*/ | ||
1958 | |||
1959 | |||
1960 | /* | ||
1961 | * This function currently is only a helper for the i386 smp boot process where | ||
1962 | * we need to reprogram the ioredtbls to cater for the cpus which have come online | ||
1963 | * so mask in all cases should simply be TARGET_CPUS | ||
1964 | */ | ||
1965 | void __init setup_ioapic_dest(void) | ||
1966 | { | ||
1967 | int pin, ioapic, irq, irq_entry; | ||
1968 | |||
1969 | if (skip_ioapic_setup == 1) | ||
1970 | return; | ||
1971 | |||
1972 | for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { | ||
1973 | for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { | ||
1974 | irq_entry = find_irq_entry(ioapic, pin, mp_INT); | ||
1975 | if (irq_entry == -1) | ||
1976 | continue; | ||
1977 | irq = pin_2_irq(irq_entry, ioapic, pin); | ||
1978 | set_ioapic_affinity_irq(irq, TARGET_CPUS); | ||
1979 | } | ||
1980 | |||
1981 | } | ||
1982 | } | ||
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c new file mode 100644 index 000000000000..cde0e868efe3 --- /dev/null +++ b/arch/x86_64/kernel/ioport.c | |||
@@ -0,0 +1,117 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/ioport.c | ||
3 | * | ||
4 | * This contains the io-permission bitmap code - written by obz, with changes | ||
5 | * by Linus. | ||
6 | */ | ||
7 | |||
8 | #include <linux/sched.h> | ||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/errno.h> | ||
11 | #include <linux/types.h> | ||
12 | #include <linux/ioport.h> | ||
13 | #include <linux/smp.h> | ||
14 | #include <linux/smp_lock.h> | ||
15 | #include <linux/stddef.h> | ||
16 | #include <linux/slab.h> | ||
17 | #include <linux/thread_info.h> | ||
18 | |||
/*
 * Set EXTENT bits starting at BASE in BITMAP to value NEW_VALUE:
 * a non-zero NEW_VALUE sets the bits, zero clears them.
 */
static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
{
	int bit;

	for (bit = base; bit < base + extent; bit++) {
		if (new_value)
			__set_bit(bit, bitmap);
		else
			clear_bit(bit, bitmap);
	}
}
30 | |||
31 | /* | ||
32 | * this changes the io permissions bitmap in the current task. | ||
33 | */ | ||
34 | asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) | ||
35 | { | ||
36 | unsigned int i, max_long, bytes, bytes_updated; | ||
37 | struct thread_struct * t = ¤t->thread; | ||
38 | struct tss_struct * tss; | ||
39 | unsigned long *bitmap; | ||
40 | |||
41 | if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) | ||
42 | return -EINVAL; | ||
43 | if (turn_on && !capable(CAP_SYS_RAWIO)) | ||
44 | return -EPERM; | ||
45 | |||
46 | /* | ||
47 | * If it's the first ioperm() call in this thread's lifetime, set the | ||
48 | * IO bitmap up. ioperm() is much less timing critical than clone(), | ||
49 | * this is why we delay this operation until now: | ||
50 | */ | ||
51 | if (!t->io_bitmap_ptr) { | ||
52 | bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | ||
53 | if (!bitmap) | ||
54 | return -ENOMEM; | ||
55 | |||
56 | memset(bitmap, 0xff, IO_BITMAP_BYTES); | ||
57 | t->io_bitmap_ptr = bitmap; | ||
58 | } | ||
59 | |||
60 | /* | ||
61 | * do it in the per-thread copy and in the TSS ... | ||
62 | * | ||
63 | * Disable preemption via get_cpu() - we must not switch away | ||
64 | * because the ->io_bitmap_max value must match the bitmap | ||
65 | * contents: | ||
66 | */ | ||
67 | tss = &per_cpu(init_tss, get_cpu()); | ||
68 | |||
69 | set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); | ||
70 | |||
71 | /* | ||
72 | * Search for a (possibly new) maximum. This is simple and stupid, | ||
73 | * to keep it obviously correct: | ||
74 | */ | ||
75 | max_long = 0; | ||
76 | for (i = 0; i < IO_BITMAP_LONGS; i++) | ||
77 | if (t->io_bitmap_ptr[i] != ~0UL) | ||
78 | max_long = i; | ||
79 | |||
80 | bytes = (max_long + 1) * sizeof(long); | ||
81 | bytes_updated = max(bytes, t->io_bitmap_max); | ||
82 | |||
83 | t->io_bitmap_max = bytes; | ||
84 | |||
85 | /* Update the TSS: */ | ||
86 | memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); | ||
87 | |||
88 | put_cpu(); | ||
89 | |||
90 | return 0; | ||
91 | } | ||
92 | |||
93 | /* | ||
94 | * sys_iopl has to be used when you want to access the IO ports | ||
95 | * beyond the 0x3ff range: to get the full 65536 ports bitmapped | ||
96 | * you'd need 8kB of bitmaps/process, which is a bit excessive. | ||
97 | * | ||
98 | * Here we just change the eflags value on the stack: we allow | ||
99 | * only the super-user to do it. This depends on the stack-layout | ||
100 | * on system-call entry - see also fork() and the signal handling | ||
101 | * code. | ||
102 | */ | ||
103 | |||
104 | asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) | ||
105 | { | ||
106 | unsigned int old = (regs->eflags >> 12) & 3; | ||
107 | |||
108 | if (level > 3) | ||
109 | return -EINVAL; | ||
110 | /* Trying to gain more privileges? */ | ||
111 | if (level > old) { | ||
112 | if (!capable(CAP_SYS_RAWIO)) | ||
113 | return -EPERM; | ||
114 | } | ||
115 | regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12); | ||
116 | return 0; | ||
117 | } | ||
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c new file mode 100644 index 000000000000..62b112e4deb4 --- /dev/null +++ b/arch/x86_64/kernel/irq.c | |||
@@ -0,0 +1,108 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/irq.c | ||
3 | * | ||
4 | * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar | ||
5 | * | ||
6 | * This file contains the lowest level x86_64-specific interrupt | ||
7 | * entry and irq statistics code. All the remaining irq logic is | ||
8 | * done by the generic kernel/irq/ code and in the | ||
9 | * x86_64-specific irq controller code. (e.g. i8259.c and | ||
10 | * io_apic.c.) | ||
11 | */ | ||
12 | |||
13 | #include <linux/kernel_stat.h> | ||
14 | #include <linux/interrupt.h> | ||
15 | #include <linux/seq_file.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <asm/uaccess.h> | ||
18 | #include <asm/io_apic.h> | ||
19 | |||
20 | atomic_t irq_err_count; | ||
21 | #ifdef CONFIG_X86_IO_APIC | ||
22 | #ifdef APIC_MISMATCH_DEBUG | ||
23 | atomic_t irq_mis_count; | ||
24 | #endif | ||
25 | #endif | ||
26 | |||
27 | /* | ||
28 | * Generic, controller-independent functions: | ||
29 | */ | ||
30 | |||
31 | int show_interrupts(struct seq_file *p, void *v) | ||
32 | { | ||
33 | int i = *(loff_t *) v, j; | ||
34 | struct irqaction * action; | ||
35 | unsigned long flags; | ||
36 | |||
37 | if (i == 0) { | ||
38 | seq_printf(p, " "); | ||
39 | for (j=0; j<NR_CPUS; j++) | ||
40 | if (cpu_online(j)) | ||
41 | seq_printf(p, "CPU%d ",j); | ||
42 | seq_putc(p, '\n'); | ||
43 | } | ||
44 | |||
45 | if (i < NR_IRQS) { | ||
46 | spin_lock_irqsave(&irq_desc[i].lock, flags); | ||
47 | action = irq_desc[i].action; | ||
48 | if (!action) | ||
49 | goto skip; | ||
50 | seq_printf(p, "%3d: ",i); | ||
51 | #ifndef CONFIG_SMP | ||
52 | seq_printf(p, "%10u ", kstat_irqs(i)); | ||
53 | #else | ||
54 | for (j=0; j<NR_CPUS; j++) | ||
55 | if (cpu_online(j)) | ||
56 | seq_printf(p, "%10u ", | ||
57 | kstat_cpu(j).irqs[i]); | ||
58 | #endif | ||
59 | seq_printf(p, " %14s", irq_desc[i].handler->typename); | ||
60 | |||
61 | seq_printf(p, " %s", action->name); | ||
62 | for (action=action->next; action; action = action->next) | ||
63 | seq_printf(p, ", %s", action->name); | ||
64 | seq_putc(p, '\n'); | ||
65 | skip: | ||
66 | spin_unlock_irqrestore(&irq_desc[i].lock, flags); | ||
67 | } else if (i == NR_IRQS) { | ||
68 | seq_printf(p, "NMI: "); | ||
69 | for (j = 0; j < NR_CPUS; j++) | ||
70 | if (cpu_online(j)) | ||
71 | seq_printf(p, "%10u ", cpu_pda[j].__nmi_count); | ||
72 | seq_putc(p, '\n'); | ||
73 | #ifdef CONFIG_X86_LOCAL_APIC | ||
74 | seq_printf(p, "LOC: "); | ||
75 | for (j = 0; j < NR_CPUS; j++) | ||
76 | if (cpu_online(j)) | ||
77 | seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs); | ||
78 | seq_putc(p, '\n'); | ||
79 | #endif | ||
80 | seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); | ||
81 | #ifdef CONFIG_X86_IO_APIC | ||
82 | #ifdef APIC_MISMATCH_DEBUG | ||
83 | seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); | ||
84 | #endif | ||
85 | #endif | ||
86 | } | ||
87 | return 0; | ||
88 | } | ||
89 | |||
90 | /* | ||
91 | * do_IRQ handles all normal device IRQ's (the special | ||
92 | * SMP cross-CPU interrupts have their own specific | ||
93 | * handlers). | ||
94 | */ | ||
95 | asmlinkage unsigned int do_IRQ(struct pt_regs *regs) | ||
96 | { | ||
97 | /* high bits used in ret_from_ code */ | ||
98 | unsigned irq = regs->orig_rax & 0xff; | ||
99 | |||
100 | irq_enter(); | ||
101 | BUG_ON(irq > 256); | ||
102 | |||
103 | __do_IRQ(irq, regs); | ||
104 | irq_exit(); | ||
105 | |||
106 | return 1; | ||
107 | } | ||
108 | |||
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c new file mode 100644 index 000000000000..4f2a852299b6 --- /dev/null +++ b/arch/x86_64/kernel/kprobes.c | |||
@@ -0,0 +1,631 @@ | |||
1 | /* | ||
2 | * Kernel Probes (KProbes) | ||
3 | * arch/x86_64/kernel/kprobes.c | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
18 | * | ||
19 | * Copyright (C) IBM Corporation, 2002, 2004 | ||
20 | * | ||
21 | * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel | ||
22 | * Probes initial implementation ( includes contributions from | ||
23 | * Rusty Russell). | ||
24 | * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes | ||
25 | * interface to access function arguments. | ||
26 | * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi | ||
27 | * <prasanna@in.ibm.com> adapted for x86_64 | ||
28 | * 2005-Mar Roland McGrath <roland@redhat.com> | ||
29 | * Fixed to handle %rip-relative addressing mode correctly. | ||
30 | */ | ||
31 | |||
32 | #include <linux/config.h> | ||
33 | #include <linux/kprobes.h> | ||
34 | #include <linux/ptrace.h> | ||
35 | #include <linux/spinlock.h> | ||
36 | #include <linux/string.h> | ||
37 | #include <linux/slab.h> | ||
38 | #include <linux/preempt.h> | ||
39 | #include <linux/moduleloader.h> | ||
40 | |||
41 | #include <asm/pgtable.h> | ||
42 | #include <asm/kdebug.h> | ||
43 | |||
44 | static DECLARE_MUTEX(kprobe_mutex); | ||
45 | |||
46 | /* kprobe_status settings */ | ||
47 | #define KPROBE_HIT_ACTIVE 0x00000001 | ||
48 | #define KPROBE_HIT_SS 0x00000002 | ||
49 | |||
50 | static struct kprobe *current_kprobe; | ||
51 | static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags; | ||
52 | static struct pt_regs jprobe_saved_regs; | ||
53 | static long *jprobe_saved_rsp; | ||
54 | static kprobe_opcode_t *get_insn_slot(void); | ||
55 | static void free_insn_slot(kprobe_opcode_t *slot); | ||
56 | void jprobe_return_end(void); | ||
57 | |||
58 | /* copy of the kernel stack at the probe fire time */ | ||
59 | static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE]; | ||
60 | |||
61 | /* | ||
62 | * returns non-zero if opcode modifies the interrupt flag. | ||
63 | */ | ||
64 | static inline int is_IF_modifier(kprobe_opcode_t *insn) | ||
65 | { | ||
66 | switch (*insn) { | ||
67 | case 0xfa: /* cli */ | ||
68 | case 0xfb: /* sti */ | ||
69 | case 0xcf: /* iret/iretd */ | ||
70 | case 0x9d: /* popf/popfd */ | ||
71 | return 1; | ||
72 | } | ||
73 | |||
74 | if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf) | ||
75 | return 1; | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | int arch_prepare_kprobe(struct kprobe *p) | ||
80 | { | ||
81 | /* insn: must be on special executable page on x86_64. */ | ||
82 | up(&kprobe_mutex); | ||
83 | p->ainsn.insn = get_insn_slot(); | ||
84 | down(&kprobe_mutex); | ||
85 | if (!p->ainsn.insn) { | ||
86 | return -ENOMEM; | ||
87 | } | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | /* | ||
92 | * Determine if the instruction uses the %rip-relative addressing mode. | ||
93 | * If it does, return the address of the 32-bit displacement word. | ||
94 | * If not, return null. | ||
95 | */ | ||
96 | static inline s32 *is_riprel(u8 *insn) | ||
97 | { | ||
98 | #define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ | ||
99 | (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ | ||
100 | (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ | ||
101 | (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ | ||
102 | (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ | ||
103 | << (row % 64)) | ||
104 | static const u64 onebyte_has_modrm[256 / 64] = { | ||
105 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
106 | /* ------------------------------- */ | ||
107 | W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */ | ||
108 | W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */ | ||
109 | W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */ | ||
110 | W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */ | ||
111 | W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ | ||
112 | W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */ | ||
113 | W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */ | ||
114 | W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */ | ||
115 | W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ | ||
116 | W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */ | ||
117 | W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */ | ||
118 | W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */ | ||
119 | W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */ | ||
120 | W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */ | ||
121 | W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */ | ||
122 | W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */ | ||
123 | /* ------------------------------- */ | ||
124 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
125 | }; | ||
126 | static const u64 twobyte_has_modrm[256 / 64] = { | ||
127 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
128 | /* ------------------------------- */ | ||
129 | W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */ | ||
130 | W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */ | ||
131 | W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */ | ||
132 | W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */ | ||
133 | W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */ | ||
134 | W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */ | ||
135 | W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */ | ||
136 | W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */ | ||
137 | W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */ | ||
138 | W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */ | ||
139 | W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */ | ||
140 | W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */ | ||
141 | W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */ | ||
142 | W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */ | ||
143 | W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */ | ||
144 | W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */ | ||
145 | /* ------------------------------- */ | ||
146 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
147 | }; | ||
148 | #undef W | ||
149 | int need_modrm; | ||
150 | |||
151 | /* Skip legacy instruction prefixes. */ | ||
152 | while (1) { | ||
153 | switch (*insn) { | ||
154 | case 0x66: | ||
155 | case 0x67: | ||
156 | case 0x2e: | ||
157 | case 0x3e: | ||
158 | case 0x26: | ||
159 | case 0x64: | ||
160 | case 0x65: | ||
161 | case 0x36: | ||
162 | case 0xf0: | ||
163 | case 0xf3: | ||
164 | case 0xf2: | ||
165 | ++insn; | ||
166 | continue; | ||
167 | } | ||
168 | break; | ||
169 | } | ||
170 | |||
171 | /* Skip REX instruction prefix. */ | ||
172 | if ((*insn & 0xf0) == 0x40) | ||
173 | ++insn; | ||
174 | |||
175 | if (*insn == 0x0f) { /* Two-byte opcode. */ | ||
176 | ++insn; | ||
177 | need_modrm = test_bit(*insn, twobyte_has_modrm); | ||
178 | } else { /* One-byte opcode. */ | ||
179 | need_modrm = test_bit(*insn, onebyte_has_modrm); | ||
180 | } | ||
181 | |||
182 | if (need_modrm) { | ||
183 | u8 modrm = *++insn; | ||
184 | if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */ | ||
185 | /* Displacement follows ModRM byte. */ | ||
186 | return (s32 *) ++insn; | ||
187 | } | ||
188 | } | ||
189 | |||
190 | /* No %rip-relative addressing mode here. */ | ||
191 | return NULL; | ||
192 | } | ||
193 | |||
194 | void arch_copy_kprobe(struct kprobe *p) | ||
195 | { | ||
196 | s32 *ripdisp; | ||
197 | memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE); | ||
198 | ripdisp = is_riprel(p->ainsn.insn); | ||
199 | if (ripdisp) { | ||
200 | /* | ||
201 | * The copied instruction uses the %rip-relative | ||
202 | * addressing mode. Adjust the displacement for the | ||
203 | * difference between the original location of this | ||
204 | * instruction and the location of the copy that will | ||
205 | * actually be run. The tricky bit here is making sure | ||
206 | * that the sign extension happens correctly in this | ||
207 | * calculation, since we need a signed 32-bit result to | ||
208 | * be sign-extended to 64 bits when it's added to the | ||
209 | * %rip value and yield the same 64-bit result that the | ||
210 | * sign-extension of the original signed 32-bit | ||
211 | * displacement would have given. | ||
212 | */ | ||
213 | s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn; | ||
214 | BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ | ||
215 | *ripdisp = disp; | ||
216 | } | ||
217 | } | ||
218 | |||
219 | void arch_remove_kprobe(struct kprobe *p) | ||
220 | { | ||
221 | up(&kprobe_mutex); | ||
222 | free_insn_slot(p->ainsn.insn); | ||
223 | down(&kprobe_mutex); | ||
224 | } | ||
225 | |||
226 | static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs) | ||
227 | { | ||
228 | *p->addr = p->opcode; | ||
229 | regs->rip = (unsigned long)p->addr; | ||
230 | } | ||
231 | |||
232 | static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) | ||
233 | { | ||
234 | regs->eflags |= TF_MASK; | ||
235 | regs->eflags &= ~IF_MASK; | ||
236 | /*single step inline if the instruction is an int3*/ | ||
237 | if (p->opcode == BREAKPOINT_INSTRUCTION) | ||
238 | regs->rip = (unsigned long)p->addr; | ||
239 | else | ||
240 | regs->rip = (unsigned long)p->ainsn.insn; | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * Interrupts are disabled on entry as trap3 is an interrupt gate and they | ||
245 | * remain disabled thorough out this function. | ||
246 | */ | ||
247 | int kprobe_handler(struct pt_regs *regs) | ||
248 | { | ||
249 | struct kprobe *p; | ||
250 | int ret = 0; | ||
251 | kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t)); | ||
252 | |||
253 | /* We're in an interrupt, but this is clear and BUG()-safe. */ | ||
254 | preempt_disable(); | ||
255 | |||
256 | /* Check we're not actually recursing */ | ||
257 | if (kprobe_running()) { | ||
258 | /* We *are* holding lock here, so this is safe. | ||
259 | Disarm the probe we just hit, and ignore it. */ | ||
260 | p = get_kprobe(addr); | ||
261 | if (p) { | ||
262 | if (kprobe_status == KPROBE_HIT_SS) { | ||
263 | regs->eflags &= ~TF_MASK; | ||
264 | regs->eflags |= kprobe_saved_rflags; | ||
265 | unlock_kprobes(); | ||
266 | goto no_kprobe; | ||
267 | } | ||
268 | disarm_kprobe(p, regs); | ||
269 | ret = 1; | ||
270 | } else { | ||
271 | p = current_kprobe; | ||
272 | if (p->break_handler && p->break_handler(p, regs)) { | ||
273 | goto ss_probe; | ||
274 | } | ||
275 | } | ||
276 | /* If it's not ours, can't be delete race, (we hold lock). */ | ||
277 | goto no_kprobe; | ||
278 | } | ||
279 | |||
280 | lock_kprobes(); | ||
281 | p = get_kprobe(addr); | ||
282 | if (!p) { | ||
283 | unlock_kprobes(); | ||
284 | if (*addr != BREAKPOINT_INSTRUCTION) { | ||
285 | /* | ||
286 | * The breakpoint instruction was removed right | ||
287 | * after we hit it. Another cpu has removed | ||
288 | * either a probepoint or a debugger breakpoint | ||
289 | * at this address. In either case, no further | ||
290 | * handling of this interrupt is appropriate. | ||
291 | */ | ||
292 | ret = 1; | ||
293 | } | ||
294 | /* Not one of ours: let kernel handle it */ | ||
295 | goto no_kprobe; | ||
296 | } | ||
297 | |||
298 | kprobe_status = KPROBE_HIT_ACTIVE; | ||
299 | current_kprobe = p; | ||
300 | kprobe_saved_rflags = kprobe_old_rflags | ||
301 | = (regs->eflags & (TF_MASK | IF_MASK)); | ||
302 | if (is_IF_modifier(p->ainsn.insn)) | ||
303 | kprobe_saved_rflags &= ~IF_MASK; | ||
304 | |||
305 | if (p->pre_handler && p->pre_handler(p, regs)) | ||
306 | /* handler has already set things up, so skip ss setup */ | ||
307 | return 1; | ||
308 | |||
309 | ss_probe: | ||
310 | prepare_singlestep(p, regs); | ||
311 | kprobe_status = KPROBE_HIT_SS; | ||
312 | return 1; | ||
313 | |||
314 | no_kprobe: | ||
315 | preempt_enable_no_resched(); | ||
316 | return ret; | ||
317 | } | ||
318 | |||
319 | /* | ||
320 | * Called after single-stepping. p->addr is the address of the | ||
321 | * instruction whose first byte has been replaced by the "int 3" | ||
322 | * instruction. To avoid the SMP problems that can occur when we | ||
323 | * temporarily put back the original opcode to single-step, we | ||
324 | * single-stepped a copy of the instruction. The address of this | ||
325 | * copy is p->ainsn.insn. | ||
326 | * | ||
327 | * This function prepares to return from the post-single-step | ||
328 | * interrupt. We have to fix up the stack as follows: | ||
329 | * | ||
330 | * 0) Except in the case of absolute or indirect jump or call instructions, | ||
331 | * the new rip is relative to the copied instruction. We need to make | ||
332 | * it relative to the original instruction. | ||
333 | * | ||
334 | * 1) If the single-stepped instruction was pushfl, then the TF and IF | ||
335 | * flags are set in the just-pushed eflags, and may need to be cleared. | ||
336 | * | ||
337 | * 2) If the single-stepped instruction was a call, the return address | ||
338 | * that is atop the stack is the address following the copied instruction. | ||
339 | * We need to make it the address following the original instruction. | ||
340 | */ | ||
341 | static void resume_execution(struct kprobe *p, struct pt_regs *regs) | ||
342 | { | ||
343 | unsigned long *tos = (unsigned long *)regs->rsp; | ||
344 | unsigned long next_rip = 0; | ||
345 | unsigned long copy_rip = (unsigned long)p->ainsn.insn; | ||
346 | unsigned long orig_rip = (unsigned long)p->addr; | ||
347 | kprobe_opcode_t *insn = p->ainsn.insn; | ||
348 | |||
349 | /*skip the REX prefix*/ | ||
350 | if (*insn >= 0x40 && *insn <= 0x4f) | ||
351 | insn++; | ||
352 | |||
353 | switch (*insn) { | ||
354 | case 0x9c: /* pushfl */ | ||
355 | *tos &= ~(TF_MASK | IF_MASK); | ||
356 | *tos |= kprobe_old_rflags; | ||
357 | break; | ||
358 | case 0xe8: /* call relative - Fix return addr */ | ||
359 | *tos = orig_rip + (*tos - copy_rip); | ||
360 | break; | ||
361 | case 0xff: | ||
362 | if ((*insn & 0x30) == 0x10) { | ||
363 | /* call absolute, indirect */ | ||
364 | /* Fix return addr; rip is correct. */ | ||
365 | next_rip = regs->rip; | ||
366 | *tos = orig_rip + (*tos - copy_rip); | ||
367 | } else if (((*insn & 0x31) == 0x20) || /* jmp near, absolute indirect */ | ||
368 | ((*insn & 0x31) == 0x21)) { /* jmp far, absolute indirect */ | ||
369 | /* rip is correct. */ | ||
370 | next_rip = regs->rip; | ||
371 | } | ||
372 | break; | ||
373 | case 0xea: /* jmp absolute -- rip is correct */ | ||
374 | next_rip = regs->rip; | ||
375 | break; | ||
376 | default: | ||
377 | break; | ||
378 | } | ||
379 | |||
380 | regs->eflags &= ~TF_MASK; | ||
381 | if (next_rip) { | ||
382 | regs->rip = next_rip; | ||
383 | } else { | ||
384 | regs->rip = orig_rip + (regs->rip - copy_rip); | ||
385 | } | ||
386 | } | ||
387 | |||
388 | /* | ||
389 | * Interrupts are disabled on entry as trap1 is an interrupt gate and they | ||
390 | * remain disabled thoroughout this function. And we hold kprobe lock. | ||
391 | */ | ||
392 | int post_kprobe_handler(struct pt_regs *regs) | ||
393 | { | ||
394 | if (!kprobe_running()) | ||
395 | return 0; | ||
396 | |||
397 | if (current_kprobe->post_handler) | ||
398 | current_kprobe->post_handler(current_kprobe, regs, 0); | ||
399 | |||
400 | resume_execution(current_kprobe, regs); | ||
401 | regs->eflags |= kprobe_saved_rflags; | ||
402 | |||
403 | unlock_kprobes(); | ||
404 | preempt_enable_no_resched(); | ||
405 | |||
406 | /* | ||
407 | * if somebody else is singlestepping across a probe point, eflags | ||
408 | * will have TF set, in which case, continue the remaining processing | ||
409 | * of do_debug, as if this is not a probe hit. | ||
410 | */ | ||
411 | if (regs->eflags & TF_MASK) | ||
412 | return 0; | ||
413 | |||
414 | return 1; | ||
415 | } | ||
416 | |||
417 | /* Interrupts disabled, kprobe_lock held. */ | ||
418 | int kprobe_fault_handler(struct pt_regs *regs, int trapnr) | ||
419 | { | ||
420 | if (current_kprobe->fault_handler | ||
421 | && current_kprobe->fault_handler(current_kprobe, regs, trapnr)) | ||
422 | return 1; | ||
423 | |||
424 | if (kprobe_status & KPROBE_HIT_SS) { | ||
425 | resume_execution(current_kprobe, regs); | ||
426 | regs->eflags |= kprobe_old_rflags; | ||
427 | |||
428 | unlock_kprobes(); | ||
429 | preempt_enable_no_resched(); | ||
430 | } | ||
431 | return 0; | ||
432 | } | ||
433 | |||
434 | /* | ||
435 | * Wrapper routine for handling exceptions. | ||
436 | */ | ||
437 | int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, | ||
438 | void *data) | ||
439 | { | ||
440 | struct die_args *args = (struct die_args *)data; | ||
441 | switch (val) { | ||
442 | case DIE_INT3: | ||
443 | if (kprobe_handler(args->regs)) | ||
444 | return NOTIFY_STOP; | ||
445 | break; | ||
446 | case DIE_DEBUG: | ||
447 | if (post_kprobe_handler(args->regs)) | ||
448 | return NOTIFY_STOP; | ||
449 | break; | ||
450 | case DIE_GPF: | ||
451 | if (kprobe_running() && | ||
452 | kprobe_fault_handler(args->regs, args->trapnr)) | ||
453 | return NOTIFY_STOP; | ||
454 | break; | ||
455 | case DIE_PAGE_FAULT: | ||
456 | if (kprobe_running() && | ||
457 | kprobe_fault_handler(args->regs, args->trapnr)) | ||
458 | return NOTIFY_STOP; | ||
459 | break; | ||
460 | default: | ||
461 | break; | ||
462 | } | ||
463 | return NOTIFY_DONE; | ||
464 | } | ||
465 | |||
466 | int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) | ||
467 | { | ||
468 | struct jprobe *jp = container_of(p, struct jprobe, kp); | ||
469 | unsigned long addr; | ||
470 | |||
471 | jprobe_saved_regs = *regs; | ||
472 | jprobe_saved_rsp = (long *) regs->rsp; | ||
473 | addr = (unsigned long)jprobe_saved_rsp; | ||
474 | /* | ||
475 | * As Linus pointed out, gcc assumes that the callee | ||
476 | * owns the argument space and could overwrite it, e.g. | ||
477 | * tailcall optimization. So, to be absolutely safe | ||
478 | * we also save and restore enough stack bytes to cover | ||
479 | * the argument area. | ||
480 | */ | ||
481 | memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr)); | ||
482 | regs->eflags &= ~IF_MASK; | ||
483 | regs->rip = (unsigned long)(jp->entry); | ||
484 | return 1; | ||
485 | } | ||
486 | |||
487 | void jprobe_return(void) | ||
488 | { | ||
489 | preempt_enable_no_resched(); | ||
490 | asm volatile (" xchg %%rbx,%%rsp \n" | ||
491 | " int3 \n" | ||
492 | " .globl jprobe_return_end \n" | ||
493 | " jprobe_return_end: \n" | ||
494 | " nop \n"::"b" | ||
495 | (jprobe_saved_rsp):"memory"); | ||
496 | } | ||
497 | |||
498 | int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) | ||
499 | { | ||
500 | u8 *addr = (u8 *) (regs->rip - 1); | ||
501 | unsigned long stack_addr = (unsigned long)jprobe_saved_rsp; | ||
502 | struct jprobe *jp = container_of(p, struct jprobe, kp); | ||
503 | |||
504 | if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { | ||
505 | if ((long *)regs->rsp != jprobe_saved_rsp) { | ||
506 | struct pt_regs *saved_regs = | ||
507 | container_of(jprobe_saved_rsp, struct pt_regs, rsp); | ||
508 | printk("current rsp %p does not match saved rsp %p\n", | ||
509 | (long *)regs->rsp, jprobe_saved_rsp); | ||
510 | printk("Saved registers for jprobe %p\n", jp); | ||
511 | show_registers(saved_regs); | ||
512 | printk("Current registers\n"); | ||
513 | show_registers(regs); | ||
514 | BUG(); | ||
515 | } | ||
516 | *regs = jprobe_saved_regs; | ||
517 | memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack, | ||
518 | MIN_STACK_SIZE(stack_addr)); | ||
519 | return 1; | ||
520 | } | ||
521 | return 0; | ||
522 | } | ||
523 | |||
524 | /* | ||
525 | * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped. | ||
526 | * By default on x86_64, pages we get from kmalloc or vmalloc are not | ||
527 | * executable. Single-stepping an instruction on such a page yields an | ||
528 | * oops. So instead of storing the instruction copies in their respective | ||
529 | * kprobe objects, we allocate a page, map it executable, and store all the | ||
530 | * instruction copies there. (We can allocate additional pages if somebody | ||
531 | * inserts a huge number of probes.) Each page can hold up to INSNS_PER_PAGE | ||
532 | * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t) | ||
533 | * bytes. | ||
534 | */ | ||
535 | #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t))) | ||
536 | struct kprobe_insn_page { | ||
537 | struct hlist_node hlist; | ||
538 | kprobe_opcode_t *insns; /* page of instruction slots */ | ||
539 | char slot_used[INSNS_PER_PAGE]; | ||
540 | int nused; | ||
541 | }; | ||
542 | |||
543 | static struct hlist_head kprobe_insn_pages; | ||
544 | |||
545 | /** | ||
546 | * get_insn_slot() - Find a slot on an executable page for an instruction. | ||
547 | * We allocate an executable page if there's no room on existing ones. | ||
548 | */ | ||
549 | static kprobe_opcode_t *get_insn_slot(void) | ||
550 | { | ||
551 | struct kprobe_insn_page *kip; | ||
552 | struct hlist_node *pos; | ||
553 | |||
554 | hlist_for_each(pos, &kprobe_insn_pages) { | ||
555 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | ||
556 | if (kip->nused < INSNS_PER_PAGE) { | ||
557 | int i; | ||
558 | for (i = 0; i < INSNS_PER_PAGE; i++) { | ||
559 | if (!kip->slot_used[i]) { | ||
560 | kip->slot_used[i] = 1; | ||
561 | kip->nused++; | ||
562 | return kip->insns + (i*MAX_INSN_SIZE); | ||
563 | } | ||
564 | } | ||
565 | /* Surprise! No unused slots. Fix kip->nused. */ | ||
566 | kip->nused = INSNS_PER_PAGE; | ||
567 | } | ||
568 | } | ||
569 | |||
570 | /* All out of space. Need to allocate a new page. Use slot 0.*/ | ||
571 | kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); | ||
572 | if (!kip) { | ||
573 | return NULL; | ||
574 | } | ||
575 | |||
576 | /* | ||
577 | * For the %rip-relative displacement fixups to be doable, we | ||
578 | * need our instruction copy to be within +/- 2GB of any data it | ||
579 | * might access via %rip. That is, within 2GB of where the | ||
580 | * kernel image and loaded module images reside. So we allocate | ||
581 | * a page in the module loading area. | ||
582 | */ | ||
583 | kip->insns = module_alloc(PAGE_SIZE); | ||
584 | if (!kip->insns) { | ||
585 | kfree(kip); | ||
586 | return NULL; | ||
587 | } | ||
588 | INIT_HLIST_NODE(&kip->hlist); | ||
589 | hlist_add_head(&kip->hlist, &kprobe_insn_pages); | ||
590 | memset(kip->slot_used, 0, INSNS_PER_PAGE); | ||
591 | kip->slot_used[0] = 1; | ||
592 | kip->nused = 1; | ||
593 | return kip->insns; | ||
594 | } | ||
595 | |||
596 | /** | ||
597 | * free_insn_slot() - Free instruction slot obtained from get_insn_slot(). | ||
598 | */ | ||
599 | static void free_insn_slot(kprobe_opcode_t *slot) | ||
600 | { | ||
601 | struct kprobe_insn_page *kip; | ||
602 | struct hlist_node *pos; | ||
603 | |||
604 | hlist_for_each(pos, &kprobe_insn_pages) { | ||
605 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | ||
606 | if (kip->insns <= slot | ||
607 | && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) { | ||
608 | int i = (slot - kip->insns) / MAX_INSN_SIZE; | ||
609 | kip->slot_used[i] = 0; | ||
610 | kip->nused--; | ||
611 | if (kip->nused == 0) { | ||
612 | /* | ||
613 | * Page is no longer in use. Free it unless | ||
614 | * it's the last one. We keep the last one | ||
615 | * so as not to have to set it up again the | ||
616 | * next time somebody inserts a probe. | ||
617 | */ | ||
618 | hlist_del(&kip->hlist); | ||
619 | if (hlist_empty(&kprobe_insn_pages)) { | ||
620 | INIT_HLIST_NODE(&kip->hlist); | ||
621 | hlist_add_head(&kip->hlist, | ||
622 | &kprobe_insn_pages); | ||
623 | } else { | ||
624 | module_free(NULL, kip->insns); | ||
625 | kfree(kip); | ||
626 | } | ||
627 | } | ||
628 | return; | ||
629 | } | ||
630 | } | ||
631 | } | ||
diff --git a/arch/x86_64/kernel/ldt.c b/arch/x86_64/kernel/ldt.c new file mode 100644 index 000000000000..d7e5d0cf4285 --- /dev/null +++ b/arch/x86_64/kernel/ldt.c | |||
@@ -0,0 +1,253 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/ldt.c | ||
3 | * | ||
4 | * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds | ||
5 | * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> | ||
6 | * Copyright (C) 2002 Andi Kleen | ||
7 | * | ||
8 | * This handles calls from both 32bit and 64bit mode. | ||
9 | */ | ||
10 | |||
11 | #include <linux/errno.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/mm.h> | ||
15 | #include <linux/smp.h> | ||
16 | #include <linux/smp_lock.h> | ||
17 | #include <linux/vmalloc.h> | ||
18 | #include <linux/slab.h> | ||
19 | |||
20 | #include <asm/uaccess.h> | ||
21 | #include <asm/system.h> | ||
22 | #include <asm/ldt.h> | ||
23 | #include <asm/desc.h> | ||
24 | #include <asm/proto.h> | ||
25 | |||
#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
/*
 * Cross-CPU callback used by alloc_ldt(): reload the LDT of the CPU it
 * runs on from that CPU's active mm, if it has one.  The argument is
 * unused.
 */
static void flush_ldt(void *null)
{
	if (current->active_mm)
		load_LDT(&current->active_mm->context);
}
#endif
33 | |||
34 | static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) | ||
35 | { | ||
36 | void *oldldt; | ||
37 | void *newldt; | ||
38 | unsigned oldsize; | ||
39 | |||
40 | if (mincount <= (unsigned)pc->size) | ||
41 | return 0; | ||
42 | oldsize = pc->size; | ||
43 | mincount = (mincount+511)&(~511); | ||
44 | if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) | ||
45 | newldt = vmalloc(mincount*LDT_ENTRY_SIZE); | ||
46 | else | ||
47 | newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); | ||
48 | |||
49 | if (!newldt) | ||
50 | return -ENOMEM; | ||
51 | |||
52 | if (oldsize) | ||
53 | memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); | ||
54 | oldldt = pc->ldt; | ||
55 | memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); | ||
56 | wmb(); | ||
57 | pc->ldt = newldt; | ||
58 | wmb(); | ||
59 | pc->size = mincount; | ||
60 | wmb(); | ||
61 | if (reload) { | ||
62 | #ifdef CONFIG_SMP | ||
63 | cpumask_t mask; | ||
64 | |||
65 | preempt_disable(); | ||
66 | mask = cpumask_of_cpu(smp_processor_id()); | ||
67 | load_LDT(pc); | ||
68 | if (!cpus_equal(current->mm->cpu_vm_mask, mask)) | ||
69 | smp_call_function(flush_ldt, NULL, 1, 1); | ||
70 | preempt_enable(); | ||
71 | #else | ||
72 | load_LDT(pc); | ||
73 | #endif | ||
74 | } | ||
75 | if (oldsize) { | ||
76 | if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) | ||
77 | vfree(oldldt); | ||
78 | else | ||
79 | kfree(oldldt); | ||
80 | } | ||
81 | return 0; | ||
82 | } | ||
83 | |||
84 | static inline int copy_ldt(mm_context_t *new, mm_context_t *old) | ||
85 | { | ||
86 | int err = alloc_ldt(new, old->size, 0); | ||
87 | if (err < 0) | ||
88 | return err; | ||
89 | memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); | ||
90 | return 0; | ||
91 | } | ||
92 | |||
93 | /* | ||
94 | * we do not have to muck with descriptors here, that is | ||
95 | * done in switch_mm() as needed. | ||
96 | */ | ||
97 | int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | ||
98 | { | ||
99 | struct mm_struct * old_mm; | ||
100 | int retval = 0; | ||
101 | |||
102 | init_MUTEX(&mm->context.sem); | ||
103 | mm->context.size = 0; | ||
104 | old_mm = current->mm; | ||
105 | if (old_mm && old_mm->context.size > 0) { | ||
106 | down(&old_mm->context.sem); | ||
107 | retval = copy_ldt(&mm->context, &old_mm->context); | ||
108 | up(&old_mm->context.sem); | ||
109 | } | ||
110 | return retval; | ||
111 | } | ||
112 | |||
113 | /* | ||
114 | * | ||
115 | * Don't touch the LDT register - we're already in the next thread. | ||
116 | */ | ||
117 | void destroy_context(struct mm_struct *mm) | ||
118 | { | ||
119 | if (mm->context.size) { | ||
120 | if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) | ||
121 | vfree(mm->context.ldt); | ||
122 | else | ||
123 | kfree(mm->context.ldt); | ||
124 | mm->context.size = 0; | ||
125 | } | ||
126 | } | ||
127 | |||
128 | static int read_ldt(void __user * ptr, unsigned long bytecount) | ||
129 | { | ||
130 | int err; | ||
131 | unsigned long size; | ||
132 | struct mm_struct * mm = current->mm; | ||
133 | |||
134 | if (!mm->context.size) | ||
135 | return 0; | ||
136 | if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) | ||
137 | bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; | ||
138 | |||
139 | down(&mm->context.sem); | ||
140 | size = mm->context.size*LDT_ENTRY_SIZE; | ||
141 | if (size > bytecount) | ||
142 | size = bytecount; | ||
143 | |||
144 | err = 0; | ||
145 | if (copy_to_user(ptr, mm->context.ldt, size)) | ||
146 | err = -EFAULT; | ||
147 | up(&mm->context.sem); | ||
148 | if (err < 0) | ||
149 | goto error_return; | ||
150 | if (size != bytecount) { | ||
151 | /* zero-fill the rest */ | ||
152 | if (clear_user(ptr+size, bytecount-size) != 0) { | ||
153 | err = -EFAULT; | ||
154 | goto error_return; | ||
155 | } | ||
156 | } | ||
157 | return bytecount; | ||
158 | error_return: | ||
159 | return err; | ||
160 | } | ||
161 | |||
162 | static int read_default_ldt(void __user * ptr, unsigned long bytecount) | ||
163 | { | ||
164 | /* Arbitrary number */ | ||
165 | /* x86-64 default LDT is all zeros */ | ||
166 | if (bytecount > 128) | ||
167 | bytecount = 128; | ||
168 | if (clear_user(ptr, bytecount)) | ||
169 | return -EFAULT; | ||
170 | return bytecount; | ||
171 | } | ||
172 | |||
173 | static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) | ||
174 | { | ||
175 | struct task_struct *me = current; | ||
176 | struct mm_struct * mm = me->mm; | ||
177 | __u32 entry_1, entry_2, *lp; | ||
178 | int error; | ||
179 | struct user_desc ldt_info; | ||
180 | |||
181 | error = -EINVAL; | ||
182 | |||
183 | if (bytecount != sizeof(ldt_info)) | ||
184 | goto out; | ||
185 | error = -EFAULT; | ||
186 | if (copy_from_user(&ldt_info, ptr, bytecount)) | ||
187 | goto out; | ||
188 | |||
189 | error = -EINVAL; | ||
190 | if (ldt_info.entry_number >= LDT_ENTRIES) | ||
191 | goto out; | ||
192 | if (ldt_info.contents == 3) { | ||
193 | if (oldmode) | ||
194 | goto out; | ||
195 | if (ldt_info.seg_not_present == 0) | ||
196 | goto out; | ||
197 | } | ||
198 | |||
199 | down(&mm->context.sem); | ||
200 | if (ldt_info.entry_number >= (unsigned)mm->context.size) { | ||
201 | error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); | ||
202 | if (error < 0) | ||
203 | goto out_unlock; | ||
204 | } | ||
205 | |||
206 | lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); | ||
207 | |||
208 | /* Allow LDTs to be cleared by the user. */ | ||
209 | if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { | ||
210 | if (oldmode || LDT_empty(&ldt_info)) { | ||
211 | entry_1 = 0; | ||
212 | entry_2 = 0; | ||
213 | goto install; | ||
214 | } | ||
215 | } | ||
216 | |||
217 | entry_1 = LDT_entry_a(&ldt_info); | ||
218 | entry_2 = LDT_entry_b(&ldt_info); | ||
219 | if (oldmode) | ||
220 | entry_2 &= ~(1 << 20); | ||
221 | |||
222 | /* Install the new entry ... */ | ||
223 | install: | ||
224 | *lp = entry_1; | ||
225 | *(lp+1) = entry_2; | ||
226 | error = 0; | ||
227 | |||
228 | out_unlock: | ||
229 | up(&mm->context.sem); | ||
230 | out: | ||
231 | return error; | ||
232 | } | ||
233 | |||
234 | asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) | ||
235 | { | ||
236 | int ret = -ENOSYS; | ||
237 | |||
238 | switch (func) { | ||
239 | case 0: | ||
240 | ret = read_ldt(ptr, bytecount); | ||
241 | break; | ||
242 | case 1: | ||
243 | ret = write_ldt(ptr, bytecount, 1); | ||
244 | break; | ||
245 | case 2: | ||
246 | ret = read_default_ldt(ptr, bytecount); | ||
247 | break; | ||
248 | case 0x11: | ||
249 | ret = write_ldt(ptr, bytecount, 0); | ||
250 | break; | ||
251 | } | ||
252 | return ret; | ||
253 | } | ||
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c new file mode 100644 index 000000000000..86f9fd85016a --- /dev/null +++ b/arch/x86_64/kernel/mce.c | |||
@@ -0,0 +1,548 @@ | |||
1 | /* | ||
2 | * Machine check handler. | ||
3 | * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | * Rest from unknown author(s). | ||
5 | * 2004 Andi Kleen. Rewrote most of it. | ||
6 | */ | ||
7 | |||
8 | #include <linux/init.h> | ||
9 | #include <linux/types.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/string.h> | ||
13 | #include <linux/rcupdate.h> | ||
14 | #include <linux/kallsyms.h> | ||
15 | #include <linux/sysdev.h> | ||
16 | #include <linux/miscdevice.h> | ||
17 | #include <linux/fs.h> | ||
18 | #include <asm/processor.h> | ||
19 | #include <asm/msr.h> | ||
20 | #include <asm/mce.h> | ||
21 | #include <asm/kdebug.h> | ||
22 | #include <asm/uaccess.h> | ||
23 | |||
/* Minor number given to the "mcelog" misc device defined in this file. */
#define MISC_MCELOG_MINOR 227
/* Size of bank[]; mce_init() clamps the detected bank count to this. */
#define NR_BANKS 5

/* Set by the "nomce" / "mce=off" boot options; mcheck_init() then does nothing. */
static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
/* Number of banks in use; set by mce_init(), 0 until then. */
static int banks;
/* Per-bank control values written to the MCi_CTL MSRs by mce_init(). */
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
/* Bit 0 is set by mce_log() and cleared by mcheck_timer(). */
static unsigned long console_logged;
/* Set by mce_log(); tells mcheck_timer() to print its "events logged" line. */
static int notify_user;
36 | |||
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

/*
 * The log buffer itself.  Only the first two members are initialised
 * explicitly; the rest start out zero.  mce_log() appends entries and
 * mce_read() drains them.
 */
struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};
47 | |||
48 | void mce_log(struct mce *mce) | ||
49 | { | ||
50 | unsigned next, entry; | ||
51 | mce->finished = 0; | ||
52 | smp_wmb(); | ||
53 | for (;;) { | ||
54 | entry = rcu_dereference(mcelog.next); | ||
55 | /* When the buffer fills up discard new entries. Assume | ||
56 | that the earlier errors are the more interesting. */ | ||
57 | if (entry >= MCE_LOG_LEN) { | ||
58 | set_bit(MCE_OVERFLOW, &mcelog.flags); | ||
59 | return; | ||
60 | } | ||
61 | /* Old left over entry. Skip. */ | ||
62 | if (mcelog.entry[entry].finished) | ||
63 | continue; | ||
64 | smp_rmb(); | ||
65 | next = entry + 1; | ||
66 | if (cmpxchg(&mcelog.next, entry, next) == entry) | ||
67 | break; | ||
68 | } | ||
69 | memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); | ||
70 | smp_wmb(); | ||
71 | mcelog.entry[entry].finished = 1; | ||
72 | smp_wmb(); | ||
73 | |||
74 | if (!test_and_set_bit(0, &console_logged)) | ||
75 | notify_user = 1; | ||
76 | } | ||
77 | |||
78 | static void print_mce(struct mce *m) | ||
79 | { | ||
80 | printk(KERN_EMERG "\n" | ||
81 | KERN_EMERG | ||
82 | "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", | ||
83 | m->cpu, m->mcgstatus, m->bank, m->status); | ||
84 | if (m->rip) { | ||
85 | printk(KERN_EMERG | ||
86 | "RIP%s %02x:<%016Lx> ", | ||
87 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", | ||
88 | m->cs, m->rip); | ||
89 | if (m->cs == __KERNEL_CS) | ||
90 | print_symbol("{%s}", m->rip); | ||
91 | printk("\n"); | ||
92 | } | ||
93 | printk(KERN_EMERG "TSC %Lx ", m->tsc); | ||
94 | if (m->addr) | ||
95 | printk("ADDR %Lx ", m->addr); | ||
96 | if (m->misc) | ||
97 | printk("MISC %Lx ", m->misc); | ||
98 | printk("\n"); | ||
99 | } | ||
100 | |||
101 | static void mce_panic(char *msg, struct mce *backup, unsigned long start) | ||
102 | { | ||
103 | int i; | ||
104 | oops_begin(); | ||
105 | for (i = 0; i < MCE_LOG_LEN; i++) { | ||
106 | unsigned long tsc = mcelog.entry[i].tsc; | ||
107 | if (time_before(tsc, start)) | ||
108 | continue; | ||
109 | print_mce(&mcelog.entry[i]); | ||
110 | if (backup && mcelog.entry[i].tsc == backup->tsc) | ||
111 | backup = NULL; | ||
112 | } | ||
113 | if (backup) | ||
114 | print_mce(backup); | ||
115 | if (tolerant >= 3) | ||
116 | printk("Fake panic: %s\n", msg); | ||
117 | else | ||
118 | panic(msg); | ||
119 | } | ||
120 | |||
121 | static int mce_available(struct cpuinfo_x86 *c) | ||
122 | { | ||
123 | return test_bit(X86_FEATURE_MCE, &c->x86_capability) && | ||
124 | test_bit(X86_FEATURE_MCA, &c->x86_capability); | ||
125 | } | ||
126 | |||
127 | /* | ||
128 | * The actual machine check handler | ||
129 | */ | ||
130 | |||
131 | void do_machine_check(struct pt_regs * regs, long error_code) | ||
132 | { | ||
133 | struct mce m, panicm; | ||
134 | int nowayout = (tolerant < 1); | ||
135 | int kill_it = 0; | ||
136 | u64 mcestart = 0; | ||
137 | int i; | ||
138 | int panicm_found = 0; | ||
139 | |||
140 | if (regs) | ||
141 | notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL); | ||
142 | if (!banks) | ||
143 | return; | ||
144 | |||
145 | memset(&m, 0, sizeof(struct mce)); | ||
146 | m.cpu = hard_smp_processor_id(); | ||
147 | rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); | ||
148 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) | ||
149 | kill_it = 1; | ||
150 | |||
151 | rdtscll(mcestart); | ||
152 | barrier(); | ||
153 | |||
154 | for (i = 0; i < banks; i++) { | ||
155 | if (!bank[i]) | ||
156 | continue; | ||
157 | |||
158 | m.misc = 0; | ||
159 | m.addr = 0; | ||
160 | m.bank = i; | ||
161 | m.tsc = 0; | ||
162 | |||
163 | rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); | ||
164 | if ((m.status & MCI_STATUS_VAL) == 0) | ||
165 | continue; | ||
166 | |||
167 | if (m.status & MCI_STATUS_EN) { | ||
168 | /* In theory _OVER could be a nowayout too, but | ||
169 | assume any overflowed errors were no fatal. */ | ||
170 | nowayout |= !!(m.status & MCI_STATUS_PCC); | ||
171 | kill_it |= !!(m.status & MCI_STATUS_UC); | ||
172 | } | ||
173 | |||
174 | if (m.status & MCI_STATUS_MISCV) | ||
175 | rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); | ||
176 | if (m.status & MCI_STATUS_ADDRV) | ||
177 | rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); | ||
178 | |||
179 | if (regs && (m.mcgstatus & MCG_STATUS_RIPV)) { | ||
180 | m.rip = regs->rip; | ||
181 | m.cs = regs->cs; | ||
182 | } else { | ||
183 | m.rip = 0; | ||
184 | m.cs = 0; | ||
185 | } | ||
186 | |||
187 | if (error_code != -1) | ||
188 | rdtscll(m.tsc); | ||
189 | wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0); | ||
190 | mce_log(&m); | ||
191 | |||
192 | /* Did this bank cause the exception? */ | ||
193 | /* Assume that the bank with uncorrectable errors did it, | ||
194 | and that there is only a single one. */ | ||
195 | if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { | ||
196 | panicm = m; | ||
197 | panicm_found = 1; | ||
198 | } | ||
199 | |||
200 | tainted |= TAINT_MACHINE_CHECK; | ||
201 | } | ||
202 | |||
203 | /* Never do anything final in the polling timer */ | ||
204 | if (!regs) | ||
205 | goto out; | ||
206 | |||
207 | /* If we didn't find an uncorrectable error, pick | ||
208 | the last one (shouldn't happen, just being safe). */ | ||
209 | if (!panicm_found) | ||
210 | panicm = m; | ||
211 | if (nowayout) | ||
212 | mce_panic("Machine check", &panicm, mcestart); | ||
213 | if (kill_it) { | ||
214 | int user_space = 0; | ||
215 | |||
216 | if (m.mcgstatus & MCG_STATUS_RIPV) | ||
217 | user_space = panicm.rip && (panicm.cs & 3); | ||
218 | |||
219 | /* When the machine was in user space and the CPU didn't get | ||
220 | confused it's normally not necessary to panic, unless you | ||
221 | are paranoid (tolerant == 0) | ||
222 | |||
223 | RED-PEN could be more tolerant for MCEs in idle, | ||
224 | but most likely they occur at boot anyways, where | ||
225 | it is best to just halt the machine. */ | ||
226 | if ((!user_space && (panic_on_oops || tolerant < 2)) || | ||
227 | (unsigned)current->pid <= 1) | ||
228 | mce_panic("Uncorrected machine check", &panicm, mcestart); | ||
229 | |||
230 | /* do_exit takes an awful lot of locks and has as | ||
231 | slight risk of deadlocking. If you don't want that | ||
232 | don't set tolerant >= 2 */ | ||
233 | if (tolerant < 3) | ||
234 | do_exit(SIGBUS); | ||
235 | } | ||
236 | |||
237 | out: | ||
238 | /* Last thing done in the machine check exception to clear state. */ | ||
239 | wrmsrl(MSR_IA32_MCG_STATUS, 0); | ||
240 | } | ||
241 | |||
/*
 * Periodic polling timer for "silent" machine check errors.
 */

/* Polling period in seconds; 0 means the work is not scheduled at init. */
static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
/* Delayed work item that runs mcheck_timer(). */
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
249 | |||
250 | static void mcheck_check_cpu(void *info) | ||
251 | { | ||
252 | if (mce_available(¤t_cpu_data)) | ||
253 | do_machine_check(NULL, 0); | ||
254 | } | ||
255 | |||
256 | static void mcheck_timer(void *data) | ||
257 | { | ||
258 | on_each_cpu(mcheck_check_cpu, NULL, 1, 1); | ||
259 | schedule_delayed_work(&mcheck_work, check_interval * HZ); | ||
260 | |||
261 | /* | ||
262 | * It's ok to read stale data here for notify_user and | ||
263 | * console_logged as we'll simply get the updated versions | ||
264 | * on the next mcheck_timer execution and atomic operations | ||
265 | * on console_logged act as synchronization for notify_user | ||
266 | * writes. | ||
267 | */ | ||
268 | if (notify_user && console_logged) { | ||
269 | notify_user = 0; | ||
270 | clear_bit(0, &console_logged); | ||
271 | printk(KERN_INFO "Machine check events logged\n"); | ||
272 | } | ||
273 | } | ||
274 | |||
275 | |||
276 | static __init int periodic_mcheck_init(void) | ||
277 | { | ||
278 | if (check_interval) | ||
279 | schedule_delayed_work(&mcheck_work, check_interval*HZ); | ||
280 | return 0; | ||
281 | } | ||
282 | __initcall(periodic_mcheck_init); | ||
283 | |||
284 | |||
285 | /* | ||
286 | * Initialize Machine Checks for a CPU. | ||
287 | */ | ||
288 | static void mce_init(void *dummy) | ||
289 | { | ||
290 | u64 cap; | ||
291 | int i; | ||
292 | |||
293 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
294 | banks = cap & 0xff; | ||
295 | if (banks > NR_BANKS) { | ||
296 | printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); | ||
297 | banks = NR_BANKS; | ||
298 | } | ||
299 | |||
300 | /* Log the machine checks left over from the previous reset. | ||
301 | This also clears all registers */ | ||
302 | do_machine_check(NULL, -1); | ||
303 | |||
304 | set_in_cr4(X86_CR4_MCE); | ||
305 | |||
306 | if (cap & MCG_CTL_P) | ||
307 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | ||
308 | |||
309 | for (i = 0; i < banks; i++) { | ||
310 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); | ||
311 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
312 | } | ||
313 | } | ||
314 | |||
315 | /* Add per CPU specific workarounds here */ | ||
316 | static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) | ||
317 | { | ||
318 | /* This should be disabled by the BIOS, but isn't always */ | ||
319 | if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { | ||
320 | /* disable GART TBL walk error reporting, which trips off | ||
321 | incorrectly with the IOMMU & 3ware & Cerberus. */ | ||
322 | clear_bit(10, &bank[4]); | ||
323 | } | ||
324 | } | ||
325 | |||
326 | static void __init mce_cpu_features(struct cpuinfo_x86 *c) | ||
327 | { | ||
328 | switch (c->x86_vendor) { | ||
329 | case X86_VENDOR_INTEL: | ||
330 | mce_intel_feature_init(c); | ||
331 | break; | ||
332 | default: | ||
333 | break; | ||
334 | } | ||
335 | } | ||
336 | |||
337 | /* | ||
338 | * Called for each booted CPU to set up machine checks. | ||
339 | * Must be called with preempt off. | ||
340 | */ | ||
341 | void __init mcheck_init(struct cpuinfo_x86 *c) | ||
342 | { | ||
343 | static cpumask_t mce_cpus __initdata = CPU_MASK_NONE; | ||
344 | |||
345 | mce_cpu_quirks(c); | ||
346 | |||
347 | if (mce_dont_init || | ||
348 | cpu_test_and_set(smp_processor_id(), mce_cpus) || | ||
349 | !mce_available(c)) | ||
350 | return; | ||
351 | |||
352 | mce_init(NULL); | ||
353 | mce_cpu_features(c); | ||
354 | } | ||
355 | |||
/*
 * Character device to read and clear the MCE log.
 */

/*
 * Per-CPU callback: store this CPU's current TSC into the slot for this
 * CPU in the array passed through @data.
 */
static void collect_tscs(void *data)
{
	unsigned long *tsc_by_cpu = data;

	rdtscll(tsc_by_cpu[smp_processor_id()]);
}
365 | |||
366 | static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) | ||
367 | { | ||
368 | unsigned long cpu_tsc[NR_CPUS]; | ||
369 | static DECLARE_MUTEX(mce_read_sem); | ||
370 | unsigned next; | ||
371 | char __user *buf = ubuf; | ||
372 | int i, err; | ||
373 | |||
374 | down(&mce_read_sem); | ||
375 | next = rcu_dereference(mcelog.next); | ||
376 | |||
377 | /* Only supports full reads right now */ | ||
378 | if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { | ||
379 | up(&mce_read_sem); | ||
380 | return -EINVAL; | ||
381 | } | ||
382 | |||
383 | err = 0; | ||
384 | for (i = 0; i < next; i++) { | ||
385 | if (!mcelog.entry[i].finished) | ||
386 | continue; | ||
387 | smp_rmb(); | ||
388 | err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); | ||
389 | buf += sizeof(struct mce); | ||
390 | } | ||
391 | |||
392 | memset(mcelog.entry, 0, next * sizeof(struct mce)); | ||
393 | mcelog.next = 0; | ||
394 | |||
395 | synchronize_kernel(); | ||
396 | |||
397 | /* Collect entries that were still getting written before the synchronize. */ | ||
398 | |||
399 | on_each_cpu(collect_tscs, cpu_tsc, 1, 1); | ||
400 | for (i = next; i < MCE_LOG_LEN; i++) { | ||
401 | if (mcelog.entry[i].finished && | ||
402 | mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { | ||
403 | err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce)); | ||
404 | smp_rmb(); | ||
405 | buf += sizeof(struct mce); | ||
406 | memset(&mcelog.entry[i], 0, sizeof(struct mce)); | ||
407 | } | ||
408 | } | ||
409 | up(&mce_read_sem); | ||
410 | return err ? -EFAULT : buf - ubuf; | ||
411 | } | ||
412 | |||
413 | static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg) | ||
414 | { | ||
415 | int __user *p = (int __user *)arg; | ||
416 | if (!capable(CAP_SYS_ADMIN)) | ||
417 | return -EPERM; | ||
418 | switch (cmd) { | ||
419 | case MCE_GET_RECORD_LEN: | ||
420 | return put_user(sizeof(struct mce), p); | ||
421 | case MCE_GET_LOG_LEN: | ||
422 | return put_user(MCE_LOG_LEN, p); | ||
423 | case MCE_GETCLEAR_FLAGS: { | ||
424 | unsigned flags; | ||
425 | do { | ||
426 | flags = mcelog.flags; | ||
427 | } while (cmpxchg(&mcelog.flags, flags, 0) != flags); | ||
428 | return put_user(flags, p); | ||
429 | } | ||
430 | default: | ||
431 | return -ENOTTY; | ||
432 | } | ||
433 | } | ||
434 | |||
/* File operations of the mcelog device: only read and ioctl are provided. */
static struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
};
439 | |||
/*
 * The "mcelog" misc device, registered by mce_init_device().
 * Positional initialisers: minor number, name, file operations.
 */
static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
445 | |||
446 | /* | ||
447 | * Old style boot options parsing. Only for compatibility. | ||
448 | */ | ||
449 | |||
450 | static int __init mcheck_disable(char *str) | ||
451 | { | ||
452 | mce_dont_init = 1; | ||
453 | return 0; | ||
454 | } | ||
455 | |||
456 | /* mce=off disables machine check. Note you can reenable it later | ||
457 | using sysfs */ | ||
458 | static int __init mcheck_enable(char *str) | ||
459 | { | ||
460 | if (!strcmp(str, "off")) | ||
461 | mce_dont_init = 1; | ||
462 | else | ||
463 | printk("mce= argument %s ignored. Please use /sys", str); | ||
464 | return 0; | ||
465 | } | ||
466 | |||
467 | __setup("nomce", mcheck_disable); | ||
468 | __setup("mce", mcheck_enable); | ||
469 | |||
470 | /* | ||
471 | * Sysfs support | ||
472 | */ | ||
473 | |||
474 | /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */ | ||
475 | static int mce_resume(struct sys_device *dev) | ||
476 | { | ||
477 | on_each_cpu(mce_init, NULL, 1, 1); | ||
478 | return 0; | ||
479 | } | ||
480 | |||
481 | /* Reinit MCEs after user configuration changes */ | ||
482 | static void mce_restart(void) | ||
483 | { | ||
484 | if (check_interval) | ||
485 | cancel_delayed_work(&mcheck_work); | ||
486 | /* Timer race is harmless here */ | ||
487 | on_each_cpu(mce_init, NULL, 1, 1); | ||
488 | if (check_interval) | ||
489 | schedule_delayed_work(&mcheck_work, check_interval*HZ); | ||
490 | } | ||
491 | |||
/* sysdev class "machinecheck"; its resume hook reinitialises MCE state. */
static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};
496 | |||
/* The single sysdev (id 0) that carries the attribute files created below. */
static struct sys_device device_mce = {
	.id = 0,
	.cls = &mce_sysclass,
};
501 | |||
/* Why are there no generic functions for this? */
/*
 * ACCESSOR(name, var, start) defines a sysdev attribute attr_<name>
 * (mode 0644) backed by the variable @var:
 *   show_<name>() prints @var as hex followed by a newline;
 *   set_<name>() parses a number with simple_strtoul() (base 0),
 *   returns -EINVAL if nothing was parsed, otherwise stores it in
 *   @var, runs the statement @start and returns the number of
 *   characters consumed.
 */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
		return sprintf(buf, "%lx\n", (unsigned long)var); \
	} \
	static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
		char *end; \
		unsigned long new = simple_strtoul(buf, &end, 0); \
		if (end == buf) return -EINVAL; \
		var = new; \
		start; \
		return end-buf; \
	} \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

/* Per-bank control values; changing one reinitialises machine checks. */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
/* tolerant takes effect without a restart. */
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
524 | |||
525 | static __init int mce_init_device(void) | ||
526 | { | ||
527 | int err; | ||
528 | if (!mce_available(&boot_cpu_data)) | ||
529 | return -EIO; | ||
530 | err = sysdev_class_register(&mce_sysclass); | ||
531 | if (!err) | ||
532 | err = sysdev_register(&device_mce); | ||
533 | if (!err) { | ||
534 | /* could create per CPU objects, but it is not worth it. */ | ||
535 | sysdev_create_file(&device_mce, &attr_bank0ctl); | ||
536 | sysdev_create_file(&device_mce, &attr_bank1ctl); | ||
537 | sysdev_create_file(&device_mce, &attr_bank2ctl); | ||
538 | sysdev_create_file(&device_mce, &attr_bank3ctl); | ||
539 | sysdev_create_file(&device_mce, &attr_bank4ctl); | ||
540 | sysdev_create_file(&device_mce, &attr_tolerant); | ||
541 | sysdev_create_file(&device_mce, &attr_check_interval); | ||
542 | } | ||
543 | |||
544 | misc_register(&mce_log_device); | ||
545 | return err; | ||
546 | |||
547 | } | ||
548 | device_initcall(mce_init_device); | ||
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c new file mode 100644 index 000000000000..4db9a640069f --- /dev/null +++ b/arch/x86_64/kernel/mce_intel.c | |||
@@ -0,0 +1,99 @@ | |||
1 | /* | ||
2 | * Intel specific MCE features. | ||
3 | * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> | ||
4 | */ | ||
5 | |||
6 | #include <linux/init.h> | ||
7 | #include <linux/interrupt.h> | ||
8 | #include <linux/percpu.h> | ||
9 | #include <asm/processor.h> | ||
10 | #include <asm/msr.h> | ||
11 | #include <asm/mce.h> | ||
12 | #include <asm/hw_irq.h> | ||
13 | |||
14 | static DEFINE_PER_CPU(unsigned long, next_check); | ||
15 | |||
16 | asmlinkage void smp_thermal_interrupt(void) | ||
17 | { | ||
18 | struct mce m; | ||
19 | |||
20 | ack_APIC_irq(); | ||
21 | |||
22 | irq_enter(); | ||
23 | if (time_before(jiffies, __get_cpu_var(next_check))) | ||
24 | goto done; | ||
25 | |||
26 | __get_cpu_var(next_check) = jiffies + HZ*300; | ||
27 | memset(&m, 0, sizeof(m)); | ||
28 | m.cpu = smp_processor_id(); | ||
29 | m.bank = MCE_THERMAL_BANK; | ||
30 | rdtscll(m.tsc); | ||
31 | rdmsrl(MSR_IA32_THERM_STATUS, m.status); | ||
32 | if (m.status & 0x1) { | ||
33 | printk(KERN_EMERG | ||
34 | "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu); | ||
35 | add_taint(TAINT_MACHINE_CHECK); | ||
36 | } else { | ||
37 | printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu); | ||
38 | } | ||
39 | |||
40 | mce_log(&m); | ||
41 | done: | ||
42 | irq_exit(); | ||
43 | } | ||
44 | |||
45 | static void __init intel_init_thermal(struct cpuinfo_x86 *c) | ||
46 | { | ||
47 | u32 l, h; | ||
48 | int tm2 = 0; | ||
49 | unsigned int cpu = smp_processor_id(); | ||
50 | |||
51 | if (!cpu_has(c, X86_FEATURE_ACPI)) | ||
52 | return; | ||
53 | |||
54 | if (!cpu_has(c, X86_FEATURE_ACC)) | ||
55 | return; | ||
56 | |||
57 | /* first check if TM1 is already enabled by the BIOS, in which | ||
58 | * case there might be some SMM goo which handles it, so we can't even | ||
59 | * put a handler since it might be delivered via SMI already. | ||
60 | */ | ||
61 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
62 | h = apic_read(APIC_LVTTHMR); | ||
63 | if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { | ||
64 | printk(KERN_DEBUG | ||
65 | "CPU%d: Thermal monitoring handled by SMI\n", cpu); | ||
66 | return; | ||
67 | } | ||
68 | |||
69 | if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) | ||
70 | tm2 = 1; | ||
71 | |||
72 | if (h & APIC_VECTOR_MASK) { | ||
73 | printk(KERN_DEBUG | ||
74 | "CPU%d: Thermal LVT vector (%#x) already " | ||
75 | "installed\n", cpu, (h & APIC_VECTOR_MASK)); | ||
76 | return; | ||
77 | } | ||
78 | |||
79 | h = THERMAL_APIC_VECTOR; | ||
80 | h |= (APIC_DM_FIXED | APIC_LVT_MASKED); | ||
81 | apic_write_around(APIC_LVTTHMR, h); | ||
82 | |||
83 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); | ||
84 | wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); | ||
85 | |||
86 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
87 | wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); | ||
88 | |||
89 | l = apic_read(APIC_LVTTHMR); | ||
90 | apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); | ||
91 | printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", | ||
92 | cpu, tm2 ? "TM2" : "TM1"); | ||
93 | return; | ||
94 | } | ||
95 | |||
96 | void __init mce_intel_feature_init(struct cpuinfo_x86 *c) | ||
97 | { | ||
98 | intel_init_thermal(c); | ||
99 | } | ||
diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c new file mode 100644 index 000000000000..c2ffea8845ed --- /dev/null +++ b/arch/x86_64/kernel/module.c | |||
@@ -0,0 +1,166 @@ | |||
1 | /* Kernel module help for x86-64 | ||
2 | Copyright (C) 2001 Rusty Russell. | ||
3 | Copyright (C) 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | |||
5 | This program is free software; you can redistribute it and/or modify | ||
6 | it under the terms of the GNU General Public License as published by | ||
7 | the Free Software Foundation; either version 2 of the License, or | ||
8 | (at your option) any later version. | ||
9 | |||
10 | This program is distributed in the hope that it will be useful, | ||
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | GNU General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU General Public License | ||
16 | along with this program; if not, write to the Free Software | ||
17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | #include <linux/moduleloader.h> | ||
20 | #include <linux/elf.h> | ||
21 | #include <linux/vmalloc.h> | ||
22 | #include <linux/fs.h> | ||
23 | #include <linux/string.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/slab.h> | ||
26 | |||
27 | #include <asm/system.h> | ||
28 | #include <asm/page.h> | ||
29 | #include <asm/pgtable.h> | ||
30 | |||
31 | #define DEBUGP(fmt...) | ||
32 | |||
/*
 * Release a region handed out by module_alloc().
 * @mod is unused here; @module_region is passed straight to vfree().
 */
void module_free(struct module *mod, void *module_region)
{
	vfree(module_region);
}
37 | |||
38 | void *module_alloc(unsigned long size) | ||
39 | { | ||
40 | struct vm_struct *area; | ||
41 | |||
42 | if (!size) | ||
43 | return NULL; | ||
44 | size = PAGE_ALIGN(size); | ||
45 | if (size > MODULES_LEN) | ||
46 | return NULL; | ||
47 | |||
48 | area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); | ||
49 | if (!area) | ||
50 | return NULL; | ||
51 | |||
52 | return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); | ||
53 | } | ||
54 | |||
55 | /* We don't need anything special. */ | ||
56 | int module_frob_arch_sections(Elf_Ehdr *hdr, | ||
57 | Elf_Shdr *sechdrs, | ||
58 | char *secstrings, | ||
59 | struct module *mod) | ||
60 | { | ||
61 | return 0; | ||
62 | } | ||
63 | |||
64 | int apply_relocate_add(Elf64_Shdr *sechdrs, | ||
65 | const char *strtab, | ||
66 | unsigned int symindex, | ||
67 | unsigned int relsec, | ||
68 | struct module *me) | ||
69 | { | ||
70 | unsigned int i; | ||
71 | Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; | ||
72 | Elf64_Sym *sym; | ||
73 | void *loc; | ||
74 | u64 val; | ||
75 | |||
76 | DEBUGP("Applying relocate section %u to %u\n", relsec, | ||
77 | sechdrs[relsec].sh_info); | ||
78 | for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { | ||
79 | /* This is where to make the change */ | ||
80 | loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr | ||
81 | + rel[i].r_offset; | ||
82 | |||
83 | /* This is the symbol it is referring to. Note that all | ||
84 | undefined symbols have been resolved. */ | ||
85 | sym = (Elf64_Sym *)sechdrs[symindex].sh_addr | ||
86 | + ELF64_R_SYM(rel[i].r_info); | ||
87 | |||
88 | DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", | ||
89 | (int)ELF64_R_TYPE(rel[i].r_info), | ||
90 | sym->st_value, rel[i].r_addend, (u64)loc); | ||
91 | |||
92 | val = sym->st_value + rel[i].r_addend; | ||
93 | |||
94 | switch (ELF64_R_TYPE(rel[i].r_info)) { | ||
95 | case R_X86_64_NONE: | ||
96 | break; | ||
97 | case R_X86_64_64: | ||
98 | *(u64 *)loc = val; | ||
99 | break; | ||
100 | case R_X86_64_32: | ||
101 | *(u32 *)loc = val; | ||
102 | if (val != *(u32 *)loc) | ||
103 | goto overflow; | ||
104 | break; | ||
105 | case R_X86_64_32S: | ||
106 | *(s32 *)loc = val; | ||
107 | if ((s64)val != *(s32 *)loc) | ||
108 | goto overflow; | ||
109 | break; | ||
110 | case R_X86_64_PC32: | ||
111 | val -= (u64)loc; | ||
112 | *(u32 *)loc = val; | ||
113 | #if 0 | ||
114 | if ((s64)val != *(s32 *)loc) | ||
115 | goto overflow; | ||
116 | #endif | ||
117 | break; | ||
118 | default: | ||
119 | printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n", | ||
120 | me->name, ELF64_R_TYPE(rel[i].r_info)); | ||
121 | return -ENOEXEC; | ||
122 | } | ||
123 | } | ||
124 | return 0; | ||
125 | |||
126 | overflow: | ||
127 | printk(KERN_ERR "overflow in relocation type %d val %Lx\n", | ||
128 | (int)ELF64_R_TYPE(rel[i].r_info), val); | ||
129 | printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", | ||
130 | me->name); | ||
131 | return -ENOEXEC; | ||
132 | } | ||
133 | |||
134 | int apply_relocate(Elf_Shdr *sechdrs, | ||
135 | const char *strtab, | ||
136 | unsigned int symindex, | ||
137 | unsigned int relsec, | ||
138 | struct module *me) | ||
139 | { | ||
140 | printk("non add relocation not supported\n"); | ||
141 | return -ENOSYS; | ||
142 | } | ||
143 | |||
144 | extern void apply_alternatives(void *start, void *end); | ||
145 | |||
146 | int module_finalize(const Elf_Ehdr *hdr, | ||
147 | const Elf_Shdr *sechdrs, | ||
148 | struct module *me) | ||
149 | { | ||
150 | const Elf_Shdr *s; | ||
151 | char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; | ||
152 | |||
153 | /* look for .altinstructions to patch */ | ||
154 | for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { | ||
155 | void *seg; | ||
156 | if (strcmp(".altinstructions", secstrings + s->sh_name)) | ||
157 | continue; | ||
158 | seg = (void *)s->sh_addr; | ||
159 | apply_alternatives(seg, seg + s->sh_size); | ||
160 | } | ||
161 | return 0; | ||
162 | } | ||
163 | |||
/* Architecture hook for module teardown; there is nothing to undo here. */
void module_arch_cleanup(struct module *mod)
{
	/* intentionally empty */
}
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c new file mode 100644 index 000000000000..7ec031c6ca10 --- /dev/null +++ b/arch/x86_64/kernel/mpparse.c | |||
@@ -0,0 +1,949 @@ | |||
1 | /* | ||
2 | * Intel Multiprocessor Specification 1.1 and 1.4 | ||
3 | * compliant MP-table parsing routines. | ||
4 | * | ||
5 | * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | ||
6 | * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> | ||
7 | * | ||
8 | * Fixes | ||
9 | * Erich Boleyn : MP v1.4 and additional changes. | ||
10 | * Alan Cox : Added EBDA scanning | ||
11 | * Ingo Molnar : various cleanups and rewrites | ||
12 | * Maciej W. Rozycki: Bits for default MP configurations | ||
13 | * Paul Diefenbaugh: Added full ACPI support | ||
14 | */ | ||
15 | |||
16 | #include <linux/mm.h> | ||
17 | #include <linux/irq.h> | ||
18 | #include <linux/init.h> | ||
19 | #include <linux/delay.h> | ||
20 | #include <linux/config.h> | ||
21 | #include <linux/bootmem.h> | ||
22 | #include <linux/smp_lock.h> | ||
23 | #include <linux/kernel_stat.h> | ||
24 | #include <linux/mc146818rtc.h> | ||
25 | #include <linux/acpi.h> | ||
26 | |||
27 | #include <asm/smp.h> | ||
28 | #include <asm/mtrr.h> | ||
29 | #include <asm/mpspec.h> | ||
30 | #include <asm/pgalloc.h> | ||
31 | #include <asm/io_apic.h> | ||
32 | #include <asm/proto.h> | ||
33 | |||
34 | /* Have we found an MP table */ | ||
35 | int smp_found_config; | ||
36 | unsigned int __initdata maxcpus = NR_CPUS; | ||
37 | |||
38 | int acpi_found_madt; | ||
39 | |||
40 | /* | ||
41 | * Various Linux-internal data structures created from the | ||
42 | * MP-table. | ||
43 | */ | ||
44 | int apic_version [MAX_APICS]; | ||
45 | unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; | ||
46 | int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; | ||
47 | cpumask_t pci_bus_to_cpumask [256] = { [0 ... 255] = CPU_MASK_ALL }; | ||
48 | |||
49 | static int mp_current_pci_id = 0; | ||
50 | /* I/O APIC entries */ | ||
51 | struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; | ||
52 | |||
53 | /* # of MP IRQ source entries */ | ||
54 | struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; | ||
55 | |||
56 | /* MP IRQ source entries */ | ||
57 | int mp_irq_entries; | ||
58 | |||
59 | int nr_ioapics; | ||
60 | int pic_mode; | ||
61 | unsigned long mp_lapic_addr = 0; | ||
62 | |||
63 | |||
64 | |||
65 | /* Processor that is doing the boot up */ | ||
66 | unsigned int boot_cpu_id = -1U; | ||
67 | /* Internal processor count */ | ||
68 | static unsigned int num_processors = 0; | ||
69 | |||
70 | /* Bitmask of physically existing CPUs */ | ||
71 | physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; | ||
72 | |||
73 | /* ACPI MADT entry parsing functions */ | ||
74 | #ifdef CONFIG_ACPI_BOOT | ||
75 | extern struct acpi_boot_flags acpi_boot; | ||
76 | #ifdef CONFIG_X86_LOCAL_APIC | ||
77 | extern int acpi_parse_lapic (acpi_table_entry_header *header); | ||
78 | extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header); | ||
79 | extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header); | ||
80 | #endif /*CONFIG_X86_LOCAL_APIC*/ | ||
81 | #ifdef CONFIG_X86_IO_APIC | ||
82 | extern int acpi_parse_ioapic (acpi_table_entry_header *header); | ||
83 | #endif /*CONFIG_X86_IO_APIC*/ | ||
84 | #endif /*CONFIG_ACPI_BOOT*/ | ||
85 | |||
86 | u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
87 | |||
88 | |||
89 | /* | ||
90 | * Intel MP BIOS table parsing routines: | ||
91 | */ | ||
92 | |||
/*
 * Checksum an MP configuration block.
 *
 * @mp:  first byte of the block
 * @len: number of bytes to sum; zero or a negative value sums nothing
 *
 * Returns the low 8 bits of the byte sum.  The callers in this file treat
 * a result of 0 as a valid checksum.
 */
static int __init mpf_checksum(unsigned char *mp, int len)
{
	int sum = 0;

	/*
	 * Test "> 0" rather than "!= 0": with a negative length the old
	 * loop would keep reading far beyond the block until len wrapped
	 * around to zero.
	 */
	while (len-- > 0)
		sum += *mp++;

	return sum & 0xFF;
}
106 | |||
107 | static void __init MP_processor_info (struct mpc_config_processor *m) | ||
108 | { | ||
109 | int ver; | ||
110 | |||
111 | if (!(m->mpc_cpuflag & CPU_ENABLED)) | ||
112 | return; | ||
113 | |||
114 | printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", | ||
115 | m->mpc_apicid, | ||
116 | (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8, | ||
117 | (m->mpc_cpufeature & CPU_MODEL_MASK)>>4, | ||
118 | m->mpc_apicver); | ||
119 | |||
120 | if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { | ||
121 | Dprintk(" Bootup CPU\n"); | ||
122 | boot_cpu_id = m->mpc_apicid; | ||
123 | } | ||
124 | if (num_processors >= NR_CPUS) { | ||
125 | printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." | ||
126 | " Processor ignored.\n", NR_CPUS); | ||
127 | return; | ||
128 | } | ||
129 | if (num_processors >= maxcpus) { | ||
130 | printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." | ||
131 | " Processor ignored.\n", maxcpus); | ||
132 | return; | ||
133 | } | ||
134 | |||
135 | num_processors++; | ||
136 | |||
137 | if (m->mpc_apicid > MAX_APICS) { | ||
138 | printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", | ||
139 | m->mpc_apicid, MAX_APICS); | ||
140 | return; | ||
141 | } | ||
142 | ver = m->mpc_apicver; | ||
143 | |||
144 | physid_set(m->mpc_apicid, phys_cpu_present_map); | ||
145 | /* | ||
146 | * Validate version | ||
147 | */ | ||
148 | if (ver == 0x0) { | ||
149 | printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid); | ||
150 | ver = 0x10; | ||
151 | } | ||
152 | apic_version[m->mpc_apicid] = ver; | ||
153 | bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; | ||
154 | } | ||
155 | |||
156 | static void __init MP_bus_info (struct mpc_config_bus *m) | ||
157 | { | ||
158 | char str[7]; | ||
159 | |||
160 | memcpy(str, m->mpc_bustype, 6); | ||
161 | str[6] = 0; | ||
162 | Dprintk("Bus #%d is %s\n", m->mpc_busid, str); | ||
163 | |||
164 | if (strncmp(str, "ISA", 3) == 0) { | ||
165 | mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; | ||
166 | } else if (strncmp(str, "EISA", 4) == 0) { | ||
167 | mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; | ||
168 | } else if (strncmp(str, "PCI", 3) == 0) { | ||
169 | mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; | ||
170 | mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; | ||
171 | mp_current_pci_id++; | ||
172 | } else if (strncmp(str, "MCA", 3) == 0) { | ||
173 | mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; | ||
174 | } else { | ||
175 | printk(KERN_ERR "Unknown bustype %s\n", str); | ||
176 | } | ||
177 | } | ||
178 | |||
179 | static void __init MP_ioapic_info (struct mpc_config_ioapic *m) | ||
180 | { | ||
181 | if (!(m->mpc_flags & MPC_APIC_USABLE)) | ||
182 | return; | ||
183 | |||
184 | printk("I/O APIC #%d Version %d at 0x%X.\n", | ||
185 | m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); | ||
186 | if (nr_ioapics >= MAX_IO_APICS) { | ||
187 | printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n", | ||
188 | MAX_IO_APICS, nr_ioapics); | ||
189 | panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); | ||
190 | } | ||
191 | if (!m->mpc_apicaddr) { | ||
192 | printk(KERN_ERR "WARNING: bogus zero I/O APIC address" | ||
193 | " found in MP table, skipping!\n"); | ||
194 | return; | ||
195 | } | ||
196 | mp_ioapics[nr_ioapics] = *m; | ||
197 | nr_ioapics++; | ||
198 | } | ||
199 | |||
200 | static void __init MP_intsrc_info (struct mpc_config_intsrc *m) | ||
201 | { | ||
202 | mp_irqs [mp_irq_entries] = *m; | ||
203 | Dprintk("Int: type %d, pol %d, trig %d, bus %d," | ||
204 | " IRQ %02x, APIC ID %x, APIC INT %02x\n", | ||
205 | m->mpc_irqtype, m->mpc_irqflag & 3, | ||
206 | (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, | ||
207 | m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); | ||
208 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
209 | panic("Max # of irq sources exceeded!!\n"); | ||
210 | } | ||
211 | |||
212 | static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) | ||
213 | { | ||
214 | Dprintk("Lint: type %d, pol %d, trig %d, bus %d," | ||
215 | " IRQ %02x, APIC ID %x, APIC LINT %02x\n", | ||
216 | m->mpc_irqtype, m->mpc_irqflag & 3, | ||
217 | (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, | ||
218 | m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); | ||
219 | /* | ||
220 | * Well it seems all SMP boards in existence | ||
221 | * use ExtINT/LVT1 == LINT0 and | ||
222 | * NMI/LVT2 == LINT1 - the following check | ||
223 | * will show us if this assumptions is false. | ||
224 | * Until then we do not have to add baggage. | ||
225 | */ | ||
226 | if ((m->mpc_irqtype == mp_ExtINT) && | ||
227 | (m->mpc_destapiclint != 0)) | ||
228 | BUG(); | ||
229 | if ((m->mpc_irqtype == mp_NMI) && | ||
230 | (m->mpc_destapiclint != 1)) | ||
231 | BUG(); | ||
232 | } | ||
233 | |||
234 | /* | ||
235 | * Read/parse the MPC | ||
236 | */ | ||
237 | |||
238 | static int __init smp_read_mpc(struct mp_config_table *mpc) | ||
239 | { | ||
240 | char str[16]; | ||
241 | int count=sizeof(*mpc); | ||
242 | unsigned char *mpt=((unsigned char *)mpc)+count; | ||
243 | |||
244 | if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { | ||
245 | printk("SMP mptable: bad signature [%c%c%c%c]!\n", | ||
246 | mpc->mpc_signature[0], | ||
247 | mpc->mpc_signature[1], | ||
248 | mpc->mpc_signature[2], | ||
249 | mpc->mpc_signature[3]); | ||
250 | return 0; | ||
251 | } | ||
252 | if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { | ||
253 | printk("SMP mptable: checksum error!\n"); | ||
254 | return 0; | ||
255 | } | ||
256 | if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { | ||
257 | printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", | ||
258 | mpc->mpc_spec); | ||
259 | return 0; | ||
260 | } | ||
261 | if (!mpc->mpc_lapic) { | ||
262 | printk(KERN_ERR "SMP mptable: null local APIC address!\n"); | ||
263 | return 0; | ||
264 | } | ||
265 | memcpy(str,mpc->mpc_oem,8); | ||
266 | str[8]=0; | ||
267 | printk(KERN_INFO "OEM ID: %s ",str); | ||
268 | |||
269 | memcpy(str,mpc->mpc_productid,12); | ||
270 | str[12]=0; | ||
271 | printk(KERN_INFO "Product ID: %s ",str); | ||
272 | |||
273 | printk(KERN_INFO "APIC at: 0x%X\n",mpc->mpc_lapic); | ||
274 | |||
275 | /* save the local APIC address, it might be non-default */ | ||
276 | if (!acpi_lapic) | ||
277 | mp_lapic_addr = mpc->mpc_lapic; | ||
278 | |||
279 | /* | ||
280 | * Now process the configuration blocks. | ||
281 | */ | ||
282 | while (count < mpc->mpc_length) { | ||
283 | switch(*mpt) { | ||
284 | case MP_PROCESSOR: | ||
285 | { | ||
286 | struct mpc_config_processor *m= | ||
287 | (struct mpc_config_processor *)mpt; | ||
288 | if (!acpi_lapic) | ||
289 | MP_processor_info(m); | ||
290 | mpt += sizeof(*m); | ||
291 | count += sizeof(*m); | ||
292 | break; | ||
293 | } | ||
294 | case MP_BUS: | ||
295 | { | ||
296 | struct mpc_config_bus *m= | ||
297 | (struct mpc_config_bus *)mpt; | ||
298 | MP_bus_info(m); | ||
299 | mpt += sizeof(*m); | ||
300 | count += sizeof(*m); | ||
301 | break; | ||
302 | } | ||
303 | case MP_IOAPIC: | ||
304 | { | ||
305 | struct mpc_config_ioapic *m= | ||
306 | (struct mpc_config_ioapic *)mpt; | ||
307 | MP_ioapic_info(m); | ||
308 | mpt+=sizeof(*m); | ||
309 | count+=sizeof(*m); | ||
310 | break; | ||
311 | } | ||
312 | case MP_INTSRC: | ||
313 | { | ||
314 | struct mpc_config_intsrc *m= | ||
315 | (struct mpc_config_intsrc *)mpt; | ||
316 | |||
317 | MP_intsrc_info(m); | ||
318 | mpt+=sizeof(*m); | ||
319 | count+=sizeof(*m); | ||
320 | break; | ||
321 | } | ||
322 | case MP_LINTSRC: | ||
323 | { | ||
324 | struct mpc_config_lintsrc *m= | ||
325 | (struct mpc_config_lintsrc *)mpt; | ||
326 | MP_lintsrc_info(m); | ||
327 | mpt+=sizeof(*m); | ||
328 | count+=sizeof(*m); | ||
329 | break; | ||
330 | } | ||
331 | } | ||
332 | } | ||
333 | clustered_apic_check(); | ||
334 | if (!num_processors) | ||
335 | printk(KERN_ERR "SMP mptable: no processors registered!\n"); | ||
336 | return num_processors; | ||
337 | } | ||
338 | |||
339 | static int __init ELCR_trigger(unsigned int irq) | ||
340 | { | ||
341 | unsigned int port; | ||
342 | |||
343 | port = 0x4d0 + (irq >> 3); | ||
344 | return (inb(port) >> (irq & 7)) & 1; | ||
345 | } | ||
346 | |||
347 | static void __init construct_default_ioirq_mptable(int mpc_default_type) | ||
348 | { | ||
349 | struct mpc_config_intsrc intsrc; | ||
350 | int i; | ||
351 | int ELCR_fallback = 0; | ||
352 | |||
353 | intsrc.mpc_type = MP_INTSRC; | ||
354 | intsrc.mpc_irqflag = 0; /* conforming */ | ||
355 | intsrc.mpc_srcbus = 0; | ||
356 | intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; | ||
357 | |||
358 | intsrc.mpc_irqtype = mp_INT; | ||
359 | |||
360 | /* | ||
361 | * If true, we have an ISA/PCI system with no IRQ entries | ||
362 | * in the MP table. To prevent the PCI interrupts from being set up | ||
363 | * incorrectly, we try to use the ELCR. The sanity check to see if | ||
364 | * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can | ||
365 | * never be level sensitive, so we simply see if the ELCR agrees. | ||
366 | * If it does, we assume it's valid. | ||
367 | */ | ||
368 | if (mpc_default_type == 5) { | ||
369 | printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); | ||
370 | |||
371 | if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) | ||
372 | printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n"); | ||
373 | else { | ||
374 | printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); | ||
375 | ELCR_fallback = 1; | ||
376 | } | ||
377 | } | ||
378 | |||
379 | for (i = 0; i < 16; i++) { | ||
380 | switch (mpc_default_type) { | ||
381 | case 2: | ||
382 | if (i == 0 || i == 13) | ||
383 | continue; /* IRQ0 & IRQ13 not connected */ | ||
384 | /* fall through */ | ||
385 | default: | ||
386 | if (i == 2) | ||
387 | continue; /* IRQ2 is never connected */ | ||
388 | } | ||
389 | |||
390 | if (ELCR_fallback) { | ||
391 | /* | ||
392 | * If the ELCR indicates a level-sensitive interrupt, we | ||
393 | * copy that information over to the MP table in the | ||
394 | * irqflag field (level sensitive, active high polarity). | ||
395 | */ | ||
396 | if (ELCR_trigger(i)) | ||
397 | intsrc.mpc_irqflag = 13; | ||
398 | else | ||
399 | intsrc.mpc_irqflag = 0; | ||
400 | } | ||
401 | |||
402 | intsrc.mpc_srcbusirq = i; | ||
403 | intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ | ||
404 | MP_intsrc_info(&intsrc); | ||
405 | } | ||
406 | |||
407 | intsrc.mpc_irqtype = mp_ExtINT; | ||
408 | intsrc.mpc_srcbusirq = 0; | ||
409 | intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ | ||
410 | MP_intsrc_info(&intsrc); | ||
411 | } | ||
412 | |||
413 | static inline void __init construct_default_ISA_mptable(int mpc_default_type) | ||
414 | { | ||
415 | struct mpc_config_processor processor; | ||
416 | struct mpc_config_bus bus; | ||
417 | struct mpc_config_ioapic ioapic; | ||
418 | struct mpc_config_lintsrc lintsrc; | ||
419 | int linttypes[2] = { mp_ExtINT, mp_NMI }; | ||
420 | int i; | ||
421 | |||
422 | /* | ||
423 | * local APIC has default address | ||
424 | */ | ||
425 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | ||
426 | |||
427 | /* | ||
428 | * 2 CPUs, numbered 0 & 1. | ||
429 | */ | ||
430 | processor.mpc_type = MP_PROCESSOR; | ||
431 | /* Either an integrated APIC or a discrete 82489DX. */ | ||
432 | processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; | ||
433 | processor.mpc_cpuflag = CPU_ENABLED; | ||
434 | processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | | ||
435 | (boot_cpu_data.x86_model << 4) | | ||
436 | boot_cpu_data.x86_mask; | ||
437 | processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; | ||
438 | processor.mpc_reserved[0] = 0; | ||
439 | processor.mpc_reserved[1] = 0; | ||
440 | for (i = 0; i < 2; i++) { | ||
441 | processor.mpc_apicid = i; | ||
442 | MP_processor_info(&processor); | ||
443 | } | ||
444 | |||
445 | bus.mpc_type = MP_BUS; | ||
446 | bus.mpc_busid = 0; | ||
447 | switch (mpc_default_type) { | ||
448 | default: | ||
449 | printk(KERN_ERR "???\nUnknown standard configuration %d\n", | ||
450 | mpc_default_type); | ||
451 | /* fall through */ | ||
452 | case 1: | ||
453 | case 5: | ||
454 | memcpy(bus.mpc_bustype, "ISA ", 6); | ||
455 | break; | ||
456 | case 2: | ||
457 | case 6: | ||
458 | case 3: | ||
459 | memcpy(bus.mpc_bustype, "EISA ", 6); | ||
460 | break; | ||
461 | case 4: | ||
462 | case 7: | ||
463 | memcpy(bus.mpc_bustype, "MCA ", 6); | ||
464 | } | ||
465 | MP_bus_info(&bus); | ||
466 | if (mpc_default_type > 4) { | ||
467 | bus.mpc_busid = 1; | ||
468 | memcpy(bus.mpc_bustype, "PCI ", 6); | ||
469 | MP_bus_info(&bus); | ||
470 | } | ||
471 | |||
472 | ioapic.mpc_type = MP_IOAPIC; | ||
473 | ioapic.mpc_apicid = 2; | ||
474 | ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; | ||
475 | ioapic.mpc_flags = MPC_APIC_USABLE; | ||
476 | ioapic.mpc_apicaddr = 0xFEC00000; | ||
477 | MP_ioapic_info(&ioapic); | ||
478 | |||
479 | /* | ||
480 | * We set up most of the low 16 IO-APIC pins according to MPS rules. | ||
481 | */ | ||
482 | construct_default_ioirq_mptable(mpc_default_type); | ||
483 | |||
484 | lintsrc.mpc_type = MP_LINTSRC; | ||
485 | lintsrc.mpc_irqflag = 0; /* conforming */ | ||
486 | lintsrc.mpc_srcbusid = 0; | ||
487 | lintsrc.mpc_srcbusirq = 0; | ||
488 | lintsrc.mpc_destapic = MP_APIC_ALL; | ||
489 | for (i = 0; i < 2; i++) { | ||
490 | lintsrc.mpc_irqtype = linttypes[i]; | ||
491 | lintsrc.mpc_destapiclint = i; | ||
492 | MP_lintsrc_info(&lintsrc); | ||
493 | } | ||
494 | } | ||
495 | |||
496 | static struct intel_mp_floating *mpf_found; | ||
497 | |||
498 | /* | ||
499 | * Scan the memory blocks for an SMP configuration block. | ||
500 | */ | ||
501 | void __init get_smp_config (void) | ||
502 | { | ||
503 | struct intel_mp_floating *mpf = mpf_found; | ||
504 | |||
505 | /* | ||
506 | * ACPI may be used to obtain the entire SMP configuration or just to | ||
507 | * enumerate/configure processors (CONFIG_ACPI_BOOT). Note that | ||
508 | * ACPI supports both logical (e.g. Hyper-Threading) and physical | ||
509 | * processors, where MPS only supports physical. | ||
510 | */ | ||
511 | if (acpi_lapic && acpi_ioapic) { | ||
512 | printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); | ||
513 | return; | ||
514 | } | ||
515 | else if (acpi_lapic) | ||
516 | printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); | ||
517 | |||
518 | printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); | ||
519 | if (mpf->mpf_feature2 & (1<<7)) { | ||
520 | printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); | ||
521 | pic_mode = 1; | ||
522 | } else { | ||
523 | printk(KERN_INFO " Virtual Wire compatibility mode.\n"); | ||
524 | pic_mode = 0; | ||
525 | } | ||
526 | |||
527 | /* | ||
528 | * Now see if we need to read further. | ||
529 | */ | ||
530 | if (mpf->mpf_feature1 != 0) { | ||
531 | |||
532 | printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); | ||
533 | construct_default_ISA_mptable(mpf->mpf_feature1); | ||
534 | |||
535 | } else if (mpf->mpf_physptr) { | ||
536 | |||
537 | /* | ||
538 | * Read the physical hardware table. Anything here will | ||
539 | * override the defaults. | ||
540 | */ | ||
541 | if (!smp_read_mpc((void *)(unsigned long)mpf->mpf_physptr)) { | ||
542 | smp_found_config = 0; | ||
543 | printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); | ||
544 | printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); | ||
545 | return; | ||
546 | } | ||
547 | /* | ||
548 | * If there are no explicit MP IRQ entries, then we are | ||
549 | * broken. We set up most of the low 16 IO-APIC pins to | ||
550 | * ISA defaults and hope it will work. | ||
551 | */ | ||
552 | if (!mp_irq_entries) { | ||
553 | struct mpc_config_bus bus; | ||
554 | |||
555 | printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); | ||
556 | |||
557 | bus.mpc_type = MP_BUS; | ||
558 | bus.mpc_busid = 0; | ||
559 | memcpy(bus.mpc_bustype, "ISA ", 6); | ||
560 | MP_bus_info(&bus); | ||
561 | |||
562 | construct_default_ioirq_mptable(0); | ||
563 | } | ||
564 | |||
565 | } else | ||
566 | BUG(); | ||
567 | |||
568 | printk(KERN_INFO "Processors: %d\n", num_processors); | ||
569 | /* | ||
570 | * Only use the first configuration found. | ||
571 | */ | ||
572 | } | ||
573 | |||
574 | static int __init smp_scan_config (unsigned long base, unsigned long length) | ||
575 | { | ||
576 | extern void __bad_mpf_size(void); | ||
577 | unsigned int *bp = phys_to_virt(base); | ||
578 | struct intel_mp_floating *mpf; | ||
579 | |||
580 | Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); | ||
581 | if (sizeof(*mpf) != 16) | ||
582 | __bad_mpf_size(); | ||
583 | |||
584 | while (length > 0) { | ||
585 | mpf = (struct intel_mp_floating *)bp; | ||
586 | if ((*bp == SMP_MAGIC_IDENT) && | ||
587 | (mpf->mpf_length == 1) && | ||
588 | !mpf_checksum((unsigned char *)bp, 16) && | ||
589 | ((mpf->mpf_specification == 1) | ||
590 | || (mpf->mpf_specification == 4)) ) { | ||
591 | |||
592 | smp_found_config = 1; | ||
593 | reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE); | ||
594 | if (mpf->mpf_physptr) | ||
595 | reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE); | ||
596 | mpf_found = mpf; | ||
597 | return 1; | ||
598 | } | ||
599 | bp += 4; | ||
600 | length -= 16; | ||
601 | } | ||
602 | return 0; | ||
603 | } | ||
604 | |||
605 | void __init find_intel_smp (void) | ||
606 | { | ||
607 | unsigned int address; | ||
608 | |||
609 | /* | ||
610 | * FIXME: Linux assumes you have 640K of base ram.. | ||
611 | * this continues the error... | ||
612 | * | ||
613 | * 1) Scan the bottom 1K for a signature | ||
614 | * 2) Scan the top 1K of base RAM | ||
615 | * 3) Scan the 64K of bios | ||
616 | */ | ||
617 | if (smp_scan_config(0x0,0x400) || | ||
618 | smp_scan_config(639*0x400,0x400) || | ||
619 | smp_scan_config(0xF0000,0x10000)) | ||
620 | return; | ||
621 | /* | ||
622 | * If it is an SMP machine we should know now, unless the | ||
623 | * configuration is in an EISA/MCA bus machine with an | ||
624 | * extended bios data area. | ||
625 | * | ||
626 | * there is a real-mode segmented pointer pointing to the | ||
627 | * 4K EBDA area at 0x40E, calculate and scan it here. | ||
628 | * | ||
629 | * NOTE! There are Linux loaders that will corrupt the EBDA | ||
630 | * area, and as such this kind of SMP config may be less | ||
631 | * trustworthy, simply because the SMP table may have been | ||
632 | * stomped on during early boot. These loaders are buggy and | ||
633 | * should be fixed. | ||
634 | */ | ||
635 | |||
636 | address = *(unsigned short *)phys_to_virt(0x40E); | ||
637 | address <<= 4; | ||
638 | if (smp_scan_config(address, 0x1000)) | ||
639 | return; | ||
640 | |||
641 | /* If we have come this far, we did not find an MP table */ | ||
642 | printk(KERN_INFO "No mptable found.\n"); | ||
643 | } | ||
644 | |||
645 | /* | ||
646 | * - Intel MP Configuration Table | ||
647 | */ | ||
648 | void __init find_smp_config (void) | ||
649 | { | ||
650 | #ifdef CONFIG_X86_LOCAL_APIC | ||
651 | find_intel_smp(); | ||
652 | #endif | ||
653 | } | ||
654 | |||
655 | |||
656 | /* -------------------------------------------------------------------------- | ||
657 | ACPI-based MP Configuration | ||
658 | -------------------------------------------------------------------------- */ | ||
659 | |||
660 | #ifdef CONFIG_ACPI_BOOT | ||
661 | |||
662 | void __init mp_register_lapic_address ( | ||
663 | u64 address) | ||
664 | { | ||
665 | mp_lapic_addr = (unsigned long) address; | ||
666 | |||
667 | set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); | ||
668 | |||
669 | if (boot_cpu_id == -1U) | ||
670 | boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); | ||
671 | |||
672 | Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); | ||
673 | } | ||
674 | |||
675 | |||
676 | void __init mp_register_lapic ( | ||
677 | u8 id, | ||
678 | u8 enabled) | ||
679 | { | ||
680 | struct mpc_config_processor processor; | ||
681 | int boot_cpu = 0; | ||
682 | |||
683 | if (id >= MAX_APICS) { | ||
684 | printk(KERN_WARNING "Processor #%d invalid (max %d)\n", | ||
685 | id, MAX_APICS); | ||
686 | return; | ||
687 | } | ||
688 | |||
689 | if (id == boot_cpu_physical_apicid) | ||
690 | boot_cpu = 1; | ||
691 | |||
692 | processor.mpc_type = MP_PROCESSOR; | ||
693 | processor.mpc_apicid = id; | ||
694 | processor.mpc_apicver = 0x10; /* TBD: lapic version */ | ||
695 | processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); | ||
696 | processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); | ||
697 | processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | | ||
698 | (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; | ||
699 | processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; | ||
700 | processor.mpc_reserved[0] = 0; | ||
701 | processor.mpc_reserved[1] = 0; | ||
702 | |||
703 | MP_processor_info(&processor); | ||
704 | } | ||
705 | |||
706 | #ifdef CONFIG_X86_IO_APIC | ||
707 | |||
708 | #define MP_ISA_BUS 0 | ||
709 | #define MP_MAX_IOAPIC_PIN 127 | ||
710 | |||
711 | static struct mp_ioapic_routing { | ||
712 | int apic_id; | ||
713 | int gsi_start; | ||
714 | int gsi_end; | ||
715 | u32 pin_programmed[4]; | ||
716 | } mp_ioapic_routing[MAX_IO_APICS]; | ||
717 | |||
718 | |||
719 | static int mp_find_ioapic ( | ||
720 | int gsi) | ||
721 | { | ||
722 | int i = 0; | ||
723 | |||
724 | /* Find the IOAPIC that manages this GSI. */ | ||
725 | for (i = 0; i < nr_ioapics; i++) { | ||
726 | if ((gsi >= mp_ioapic_routing[i].gsi_start) | ||
727 | && (gsi <= mp_ioapic_routing[i].gsi_end)) | ||
728 | return i; | ||
729 | } | ||
730 | |||
731 | printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); | ||
732 | |||
733 | return -1; | ||
734 | } | ||
735 | |||
736 | |||
737 | void __init mp_register_ioapic ( | ||
738 | u8 id, | ||
739 | u32 address, | ||
740 | u32 gsi_base) | ||
741 | { | ||
742 | int idx = 0; | ||
743 | |||
744 | if (nr_ioapics >= MAX_IO_APICS) { | ||
745 | printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " | ||
746 | "(found %d)\n", MAX_IO_APICS, nr_ioapics); | ||
747 | panic("Recompile kernel with bigger MAX_IO_APICS!\n"); | ||
748 | } | ||
749 | if (!address) { | ||
750 | printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" | ||
751 | " found in MADT table, skipping!\n"); | ||
752 | return; | ||
753 | } | ||
754 | |||
755 | idx = nr_ioapics++; | ||
756 | |||
757 | mp_ioapics[idx].mpc_type = MP_IOAPIC; | ||
758 | mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; | ||
759 | mp_ioapics[idx].mpc_apicaddr = address; | ||
760 | |||
761 | set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); | ||
762 | mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); | ||
763 | mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); | ||
764 | |||
765 | /* | ||
766 | * Build basic IRQ lookup table to facilitate gsi->io_apic lookups | ||
767 | * and to prevent reprogramming of IOAPIC pins (PCI IRQs). | ||
768 | */ | ||
769 | mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; | ||
770 | mp_ioapic_routing[idx].gsi_start = gsi_base; | ||
771 | mp_ioapic_routing[idx].gsi_end = gsi_base + | ||
772 | io_apic_get_redir_entries(idx); | ||
773 | |||
774 | printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " | ||
775 | "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, | ||
776 | mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, | ||
777 | mp_ioapic_routing[idx].gsi_start, | ||
778 | mp_ioapic_routing[idx].gsi_end); | ||
779 | |||
780 | return; | ||
781 | } | ||
782 | |||
783 | |||
784 | void __init mp_override_legacy_irq ( | ||
785 | u8 bus_irq, | ||
786 | u8 polarity, | ||
787 | u8 trigger, | ||
788 | u32 gsi) | ||
789 | { | ||
790 | struct mpc_config_intsrc intsrc; | ||
791 | int ioapic = -1; | ||
792 | int pin = -1; | ||
793 | |||
794 | /* | ||
795 | * Convert 'gsi' to 'ioapic.pin'. | ||
796 | */ | ||
797 | ioapic = mp_find_ioapic(gsi); | ||
798 | if (ioapic < 0) | ||
799 | return; | ||
800 | pin = gsi - mp_ioapic_routing[ioapic].gsi_start; | ||
801 | |||
802 | /* | ||
803 | * TBD: This check is for faulty timer entries, where the override | ||
804 | * erroneously sets the trigger to level, resulting in a HUGE | ||
805 | * increase of timer interrupts! | ||
806 | */ | ||
807 | if ((bus_irq == 0) && (trigger == 3)) | ||
808 | trigger = 1; | ||
809 | |||
810 | intsrc.mpc_type = MP_INTSRC; | ||
811 | intsrc.mpc_irqtype = mp_INT; | ||
812 | intsrc.mpc_irqflag = (trigger << 2) | polarity; | ||
813 | intsrc.mpc_srcbus = MP_ISA_BUS; | ||
814 | intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ | ||
815 | intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ | ||
816 | intsrc.mpc_dstirq = pin; /* INTIN# */ | ||
817 | |||
818 | Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", | ||
819 | intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, | ||
820 | (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, | ||
821 | intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); | ||
822 | |||
823 | mp_irqs[mp_irq_entries] = intsrc; | ||
824 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
825 | panic("Max # of irq sources exceeded!\n"); | ||
826 | |||
827 | return; | ||
828 | } | ||
829 | |||
830 | |||
831 | void __init mp_config_acpi_legacy_irqs (void) | ||
832 | { | ||
833 | struct mpc_config_intsrc intsrc; | ||
834 | int i = 0; | ||
835 | int ioapic = -1; | ||
836 | |||
837 | /* | ||
838 | * Fabricate the legacy ISA bus (bus #31). | ||
839 | */ | ||
840 | mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; | ||
841 | Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); | ||
842 | |||
843 | /* | ||
844 | * Locate the IOAPIC that manages the ISA IRQs (0-15). | ||
845 | */ | ||
846 | ioapic = mp_find_ioapic(0); | ||
847 | if (ioapic < 0) | ||
848 | return; | ||
849 | |||
850 | intsrc.mpc_type = MP_INTSRC; | ||
851 | intsrc.mpc_irqflag = 0; /* Conforming */ | ||
852 | intsrc.mpc_srcbus = MP_ISA_BUS; | ||
853 | intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; | ||
854 | |||
855 | /* | ||
856 | * Use the default configuration for the IRQs 0-15. Unless | ||
857 | * overridden by (MADT) interrupt source override entries. | ||
858 | */ | ||
859 | for (i = 0; i < 16; i++) { | ||
860 | int idx; | ||
861 | |||
862 | for (idx = 0; idx < mp_irq_entries; idx++) { | ||
863 | struct mpc_config_intsrc *irq = mp_irqs + idx; | ||
864 | |||
865 | /* Do we already have a mapping for this ISA IRQ? */ | ||
866 | if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) | ||
867 | break; | ||
868 | |||
869 | /* Do we already have a mapping for this IOAPIC pin */ | ||
870 | if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && | ||
871 | (irq->mpc_dstirq == i)) | ||
872 | break; | ||
873 | } | ||
874 | |||
875 | if (idx != mp_irq_entries) { | ||
876 | printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); | ||
877 | continue; /* IRQ already used */ | ||
878 | } | ||
879 | |||
880 | intsrc.mpc_irqtype = mp_INT; | ||
881 | intsrc.mpc_srcbusirq = i; /* Identity mapped */ | ||
882 | intsrc.mpc_dstirq = i; | ||
883 | |||
884 | Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " | ||
885 | "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, | ||
886 | (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, | ||
887 | intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, | ||
888 | intsrc.mpc_dstirq); | ||
889 | |||
890 | mp_irqs[mp_irq_entries] = intsrc; | ||
891 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
892 | panic("Max # of irq sources exceeded!\n"); | ||
893 | } | ||
894 | |||
895 | return; | ||
896 | } | ||
897 | |||
898 | int mp_register_gsi(u32 gsi, int edge_level, int active_high_low) | ||
899 | { | ||
900 | int ioapic = -1; | ||
901 | int ioapic_pin = 0; | ||
902 | int idx, bit = 0; | ||
903 | |||
904 | if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) | ||
905 | return gsi; | ||
906 | |||
907 | #ifdef CONFIG_ACPI_BUS | ||
908 | /* Don't set up the ACPI SCI because it's already set up */ | ||
909 | if (acpi_fadt.sci_int == gsi) | ||
910 | return gsi; | ||
911 | #endif | ||
912 | |||
913 | ioapic = mp_find_ioapic(gsi); | ||
914 | if (ioapic < 0) { | ||
915 | printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); | ||
916 | return gsi; | ||
917 | } | ||
918 | |||
919 | ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start; | ||
920 | |||
921 | /* | ||
922 | * Avoid pin reprogramming. PRTs typically include entries | ||
923 | * with redundant pin->gsi mappings (but unique PCI devices); | ||
924 | * we only program the IOAPIC on the first. | ||
925 | */ | ||
926 | bit = ioapic_pin % 32; | ||
927 | idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); | ||
928 | if (idx > 3) { | ||
929 | printk(KERN_ERR "Invalid reference to IOAPIC pin " | ||
930 | "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, | ||
931 | ioapic_pin); | ||
932 | return gsi; | ||
933 | } | ||
934 | if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { | ||
935 | Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", | ||
936 | mp_ioapic_routing[ioapic].apic_id, ioapic_pin); | ||
937 | return gsi; | ||
938 | } | ||
939 | |||
940 | mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); | ||
941 | |||
942 | io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, | ||
943 | edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1, | ||
944 | active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1); | ||
945 | return gsi; | ||
946 | } | ||
947 | |||
948 | #endif /*CONFIG_X86_IO_APIC*/ | ||
949 | #endif /*CONFIG_ACPI_BOOT*/ | ||
diff --git a/arch/x86_64/kernel/msr.c b/arch/x86_64/kernel/msr.c new file mode 100644 index 000000000000..598953ab0154 --- /dev/null +++ b/arch/x86_64/kernel/msr.c | |||
@@ -0,0 +1,279 @@ | |||
1 | /* ----------------------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2000 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, | ||
8 | * USA; either version 2 of the License, or (at your option) any later | ||
9 | * version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * msr.c | ||
15 | * | ||
16 | * x86 MSR access device | ||
17 | * | ||
18 | * This device is accessed by lseek() to the appropriate register number | ||
19 | * and then read/write in chunks of 8 bytes. A larger size means multiple | ||
20 | * reads or writes of the same register. | ||
21 | * | ||
22 | * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on | ||
23 | * an SMP box will direct the access to CPU %d. | ||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/config.h> | ||
28 | |||
29 | #include <linux/types.h> | ||
30 | #include <linux/errno.h> | ||
31 | #include <linux/fcntl.h> | ||
32 | #include <linux/init.h> | ||
33 | #include <linux/poll.h> | ||
34 | #include <linux/smp.h> | ||
35 | #include <linux/smp_lock.h> | ||
36 | #include <linux/major.h> | ||
37 | #include <linux/fs.h> | ||
38 | |||
39 | #include <asm/processor.h> | ||
40 | #include <asm/msr.h> | ||
41 | #include <asm/uaccess.h> | ||
42 | #include <asm/system.h> | ||
43 | |||
44 | /* Note: "err" is handled in a funny way below. Otherwise one version | ||
45 | of gcc or another breaks. */ | ||
46 | |||
47 | static inline int wrmsr_eio(u32 reg, u32 eax, u32 edx) | ||
48 | { | ||
49 | int err; | ||
50 | |||
51 | asm volatile ("1: wrmsr\n" | ||
52 | "2:\n" | ||
53 | ".section .fixup,\"ax\"\n" | ||
54 | "3: movl %4,%0\n" | ||
55 | " jmp 2b\n" | ||
56 | ".previous\n" | ||
57 | ".section __ex_table,\"a\"\n" | ||
58 | " .align 8\n" " .quad 1b,3b\n" ".previous":"=&bDS" (err) | ||
59 | :"a"(eax), "d"(edx), "c"(reg), "i"(-EIO), "0"(0)); | ||
60 | |||
61 | return err; | ||
62 | } | ||
63 | |||
64 | static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx) | ||
65 | { | ||
66 | int err; | ||
67 | |||
68 | asm volatile ("1: rdmsr\n" | ||
69 | "2:\n" | ||
70 | ".section .fixup,\"ax\"\n" | ||
71 | "3: movl %4,%0\n" | ||
72 | " jmp 2b\n" | ||
73 | ".previous\n" | ||
74 | ".section __ex_table,\"a\"\n" | ||
75 | " .align 8\n" | ||
76 | " .quad 1b,3b\n" | ||
77 | ".previous":"=&bDS" (err), "=a"(*eax), "=d"(*edx) | ||
78 | :"c"(reg), "i"(-EIO), "0"(0)); | ||
79 | |||
80 | return err; | ||
81 | } | ||
82 | |||
83 | #ifdef CONFIG_SMP | ||
84 | |||
85 | struct msr_command { | ||
86 | int cpu; | ||
87 | int err; | ||
88 | u32 reg; | ||
89 | u32 data[2]; | ||
90 | }; | ||
91 | |||
92 | static void msr_smp_wrmsr(void *cmd_block) | ||
93 | { | ||
94 | struct msr_command *cmd = (struct msr_command *)cmd_block; | ||
95 | |||
96 | if (cmd->cpu == smp_processor_id()) | ||
97 | cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]); | ||
98 | } | ||
99 | |||
100 | static void msr_smp_rdmsr(void *cmd_block) | ||
101 | { | ||
102 | struct msr_command *cmd = (struct msr_command *)cmd_block; | ||
103 | |||
104 | if (cmd->cpu == smp_processor_id()) | ||
105 | cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]); | ||
106 | } | ||
107 | |||
108 | static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) | ||
109 | { | ||
110 | struct msr_command cmd; | ||
111 | int ret; | ||
112 | |||
113 | preempt_disable(); | ||
114 | if (cpu == smp_processor_id()) { | ||
115 | ret = wrmsr_eio(reg, eax, edx); | ||
116 | } else { | ||
117 | cmd.cpu = cpu; | ||
118 | cmd.reg = reg; | ||
119 | cmd.data[0] = eax; | ||
120 | cmd.data[1] = edx; | ||
121 | |||
122 | smp_call_function(msr_smp_wrmsr, &cmd, 1, 1); | ||
123 | ret = cmd.err; | ||
124 | } | ||
125 | preempt_enable(); | ||
126 | return ret; | ||
127 | } | ||
128 | |||
129 | static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx) | ||
130 | { | ||
131 | struct msr_command cmd; | ||
132 | int ret; | ||
133 | |||
134 | preempt_disable(); | ||
135 | if (cpu == smp_processor_id()) { | ||
136 | ret = rdmsr_eio(reg, eax, edx); | ||
137 | } else { | ||
138 | cmd.cpu = cpu; | ||
139 | cmd.reg = reg; | ||
140 | |||
141 | smp_call_function(msr_smp_rdmsr, &cmd, 1, 1); | ||
142 | |||
143 | *eax = cmd.data[0]; | ||
144 | *edx = cmd.data[1]; | ||
145 | |||
146 | ret = cmd.err; | ||
147 | } | ||
148 | preempt_enable(); | ||
149 | return ret; | ||
150 | } | ||
151 | |||
152 | #else /* ! CONFIG_SMP */ | ||
153 | |||
154 | static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) | ||
155 | { | ||
156 | return wrmsr_eio(reg, eax, edx); | ||
157 | } | ||
158 | |||
159 | static inline int do_rdmsr(int cpu, u32 reg, u32 *eax, u32 *edx) | ||
160 | { | ||
161 | return rdmsr_eio(reg, eax, edx); | ||
162 | } | ||
163 | |||
164 | #endif /* ! CONFIG_SMP */ | ||
165 | |||
166 | static loff_t msr_seek(struct file *file, loff_t offset, int orig) | ||
167 | { | ||
168 | loff_t ret = -EINVAL; | ||
169 | |||
170 | lock_kernel(); | ||
171 | switch (orig) { | ||
172 | case 0: | ||
173 | file->f_pos = offset; | ||
174 | ret = file->f_pos; | ||
175 | break; | ||
176 | case 1: | ||
177 | file->f_pos += offset; | ||
178 | ret = file->f_pos; | ||
179 | } | ||
180 | unlock_kernel(); | ||
181 | return ret; | ||
182 | } | ||
183 | |||
184 | static ssize_t msr_read(struct file *file, char __user * buf, | ||
185 | size_t count, loff_t * ppos) | ||
186 | { | ||
187 | u32 __user *tmp = (u32 __user *) buf; | ||
188 | u32 data[2]; | ||
189 | size_t rv; | ||
190 | u32 reg = *ppos; | ||
191 | int cpu = iminor(file->f_dentry->d_inode); | ||
192 | int err; | ||
193 | |||
194 | if (count % 8) | ||
195 | return -EINVAL; /* Invalid chunk size */ | ||
196 | |||
197 | for (rv = 0; count; count -= 8) { | ||
198 | err = do_rdmsr(cpu, reg, &data[0], &data[1]); | ||
199 | if (err) | ||
200 | return err; | ||
201 | if (copy_to_user(tmp, &data, 8)) | ||
202 | return -EFAULT; | ||
203 | tmp += 2; | ||
204 | } | ||
205 | |||
206 | return ((char __user *)tmp) - buf; | ||
207 | } | ||
208 | |||
209 | static ssize_t msr_write(struct file *file, const char __user *buf, | ||
210 | size_t count, loff_t *ppos) | ||
211 | { | ||
212 | const u32 __user *tmp = (const u32 __user *)buf; | ||
213 | u32 data[2]; | ||
214 | size_t rv; | ||
215 | u32 reg = *ppos; | ||
216 | int cpu = iminor(file->f_dentry->d_inode); | ||
217 | int err; | ||
218 | |||
219 | if (count % 8) | ||
220 | return -EINVAL; /* Invalid chunk size */ | ||
221 | |||
222 | for (rv = 0; count; count -= 8) { | ||
223 | if (copy_from_user(&data, tmp, 8)) | ||
224 | return -EFAULT; | ||
225 | err = do_wrmsr(cpu, reg, data[0], data[1]); | ||
226 | if (err) | ||
227 | return err; | ||
228 | tmp += 2; | ||
229 | } | ||
230 | |||
231 | return ((char __user *)tmp) - buf; | ||
232 | } | ||
233 | |||
234 | static int msr_open(struct inode *inode, struct file *file) | ||
235 | { | ||
236 | unsigned int cpu = iminor(file->f_dentry->d_inode); | ||
237 | struct cpuinfo_x86 *c = &(cpu_data)[cpu]; | ||
238 | |||
239 | if (cpu >= NR_CPUS || !cpu_online(cpu)) | ||
240 | return -ENXIO; /* No such CPU */ | ||
241 | if (!cpu_has(c, X86_FEATURE_MSR)) | ||
242 | return -EIO; /* MSR not supported */ | ||
243 | |||
244 | return 0; | ||
245 | } | ||
246 | |||
247 | /* | ||
248 | * File operations we support | ||
249 | */ | ||
250 | static struct file_operations msr_fops = { | ||
251 | .owner = THIS_MODULE, | ||
252 | .llseek = msr_seek, | ||
253 | .read = msr_read, | ||
254 | .write = msr_write, | ||
255 | .open = msr_open, | ||
256 | }; | ||
257 | |||
258 | static int __init msr_init(void) | ||
259 | { | ||
260 | if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { | ||
261 | printk(KERN_ERR "msr: unable to get major %d for msr\n", | ||
262 | MSR_MAJOR); | ||
263 | return -EBUSY; | ||
264 | } | ||
265 | |||
266 | return 0; | ||
267 | } | ||
268 | |||
269 | static void __exit msr_exit(void) | ||
270 | { | ||
271 | unregister_chrdev(MSR_MAJOR, "cpu/msr"); | ||
272 | } | ||
273 | |||
274 | module_init(msr_init); | ||
275 | module_exit(msr_exit) | ||
276 | |||
277 | MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>"); | ||
278 | MODULE_DESCRIPTION("x86 generic MSR driver"); | ||
279 | MODULE_LICENSE("GPL"); | ||
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c new file mode 100644 index 000000000000..d9867de6a626 --- /dev/null +++ b/arch/x86_64/kernel/nmi.c | |||
@@ -0,0 +1,488 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/nmi.c | ||
3 | * | ||
4 | * NMI watchdog support on APIC systems | ||
5 | * | ||
6 | * Started by Ingo Molnar <mingo@redhat.com> | ||
7 | * | ||
8 | * Fixes: | ||
9 | * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. | ||
10 | * Mikael Pettersson : Power Management for local APIC NMI watchdog. | ||
11 | * Pavel Machek and | ||
12 | * Mikael Pettersson : PM converted to driver model. Disable/enable API. | ||
13 | */ | ||
14 | |||
15 | #include <linux/config.h> | ||
16 | #include <linux/mm.h> | ||
17 | #include <linux/irq.h> | ||
18 | #include <linux/delay.h> | ||
19 | #include <linux/bootmem.h> | ||
20 | #include <linux/smp_lock.h> | ||
21 | #include <linux/interrupt.h> | ||
22 | #include <linux/mc146818rtc.h> | ||
23 | #include <linux/kernel_stat.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/sysdev.h> | ||
26 | #include <linux/nmi.h> | ||
27 | #include <linux/sysctl.h> | ||
28 | |||
29 | #include <asm/smp.h> | ||
30 | #include <asm/mtrr.h> | ||
31 | #include <asm/mpspec.h> | ||
32 | #include <asm/nmi.h> | ||
33 | #include <asm/msr.h> | ||
34 | #include <asm/proto.h> | ||
35 | #include <asm/kdebug.h> | ||
36 | |||
37 | /* | ||
38 | * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: | ||
39 | * - it may be reserved by some other driver, or not | ||
40 | * - when not reserved by some other driver, it may be used for | ||
41 | * the NMI watchdog, or not | ||
42 | * | ||
43 | * This is maintained separately from nmi_active because the NMI | ||
44 | * watchdog may also be driven from the I/O APIC timer. | ||
45 | */ | ||
46 | static DEFINE_SPINLOCK(lapic_nmi_owner_lock); | ||
47 | static unsigned int lapic_nmi_owner; | ||
48 | #define LAPIC_NMI_WATCHDOG (1<<0) | ||
49 | #define LAPIC_NMI_RESERVED (1<<1) | ||
50 | |||
51 | /* nmi_active: | ||
52 | * +1: the lapic NMI watchdog is active, but can be disabled | ||
53 | * 0: the lapic NMI watchdog has not been set up, and cannot | ||
54 | * be enabled | ||
55 | * -1: the lapic NMI watchdog is disabled, but can be enabled | ||
56 | */ | ||
57 | int nmi_active; /* oprofile uses this */ | ||
58 | int panic_on_timeout; | ||
59 | |||
60 | unsigned int nmi_watchdog = NMI_DEFAULT; | ||
61 | static unsigned int nmi_hz = HZ; | ||
62 | unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ | ||
63 | |||
64 | /* Note that these events don't tick when the CPU idles. This means | ||
65 | the frequency varies with CPU load. */ | ||
66 | |||
67 | #define K7_EVNTSEL_ENABLE (1 << 22) | ||
68 | #define K7_EVNTSEL_INT (1 << 20) | ||
69 | #define K7_EVNTSEL_OS (1 << 17) | ||
70 | #define K7_EVNTSEL_USR (1 << 16) | ||
71 | #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 | ||
72 | #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING | ||
73 | |||
74 | #define P6_EVNTSEL0_ENABLE (1 << 22) | ||
75 | #define P6_EVNTSEL_INT (1 << 20) | ||
76 | #define P6_EVNTSEL_OS (1 << 17) | ||
77 | #define P6_EVNTSEL_USR (1 << 16) | ||
78 | #define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 | ||
79 | #define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED | ||
80 | |||
81 | /* Run after command line and cpu_init init, but before all other checks */ | ||
82 | void __init nmi_watchdog_default(void) | ||
83 | { | ||
84 | if (nmi_watchdog != NMI_DEFAULT) | ||
85 | return; | ||
86 | |||
87 | /* For some reason the IO APIC watchdog doesn't work on the AMD | ||
88 | 8111 chipset. For now switch to local APIC mode using | ||
89 | perfctr0 there. On Intel CPUs we don't have code to handle | ||
90 | the perfctr and the IO-APIC seems to work, so use that. */ | ||
91 | |||
92 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { | ||
93 | nmi_watchdog = NMI_LOCAL_APIC; | ||
94 | printk(KERN_INFO | ||
95 | "Using local APIC NMI watchdog using perfctr0\n"); | ||
96 | } else { | ||
97 | printk(KERN_INFO "Using IO APIC NMI watchdog\n"); | ||
98 | nmi_watchdog = NMI_IO_APIC; | ||
99 | } | ||
100 | } | ||
101 | |||
102 | /* Why is there no CPUID flag for this? */ | ||
103 | static __init int cpu_has_lapic(void) | ||
104 | { | ||
105 | switch (boot_cpu_data.x86_vendor) { | ||
106 | case X86_VENDOR_INTEL: | ||
107 | case X86_VENDOR_AMD: | ||
108 | return boot_cpu_data.x86 >= 6; | ||
109 | /* .... add more cpus here or find a different way to figure this out. */ | ||
110 | default: | ||
111 | return 0; | ||
112 | } | ||
113 | } | ||
114 | |||
115 | int __init check_nmi_watchdog (void) | ||
116 | { | ||
117 | int counts[NR_CPUS]; | ||
118 | int cpu; | ||
119 | |||
120 | if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) { | ||
121 | nmi_watchdog = NMI_NONE; | ||
122 | return -1; | ||
123 | } | ||
124 | |||
125 | printk(KERN_INFO "testing NMI watchdog ... "); | ||
126 | |||
127 | for (cpu = 0; cpu < NR_CPUS; cpu++) | ||
128 | counts[cpu] = cpu_pda[cpu].__nmi_count; | ||
129 | local_irq_enable(); | ||
130 | mdelay((10*1000)/nmi_hz); // wait 10 ticks | ||
131 | |||
132 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
133 | #ifdef CONFIG_SMP | ||
134 | /* Check cpu_callin_map here because that is set | ||
135 | after the timer is started. */ | ||
136 | if (!cpu_isset(cpu, cpu_callin_map)) | ||
137 | continue; | ||
138 | #endif | ||
139 | if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { | ||
140 | printk("CPU#%d: NMI appears to be stuck (%d)!\n", | ||
141 | cpu, | ||
142 | cpu_pda[cpu].__nmi_count); | ||
143 | nmi_active = 0; | ||
144 | lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; | ||
145 | return -1; | ||
146 | } | ||
147 | } | ||
148 | printk("OK.\n"); | ||
149 | |||
150 | /* now that we know it works we can reduce NMI frequency to | ||
151 | something more reasonable; makes a difference in some configs */ | ||
152 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
153 | nmi_hz = 1; | ||
154 | |||
155 | return 0; | ||
156 | } | ||
157 | |||
158 | int __init setup_nmi_watchdog(char *str) | ||
159 | { | ||
160 | int nmi; | ||
161 | |||
162 | if (!strncmp(str,"panic",5)) { | ||
163 | panic_on_timeout = 1; | ||
164 | str = strchr(str, ','); | ||
165 | if (!str) | ||
166 | return 1; | ||
167 | ++str; | ||
168 | } | ||
169 | |||
170 | get_option(&str, &nmi); | ||
171 | |||
172 | if (nmi >= NMI_INVALID) | ||
173 | return 0; | ||
174 | nmi_watchdog = nmi; | ||
175 | return 1; | ||
176 | } | ||
177 | |||
178 | __setup("nmi_watchdog=", setup_nmi_watchdog); | ||
179 | |||
180 | static void disable_lapic_nmi_watchdog(void) | ||
181 | { | ||
182 | if (nmi_active <= 0) | ||
183 | return; | ||
184 | switch (boot_cpu_data.x86_vendor) { | ||
185 | case X86_VENDOR_AMD: | ||
186 | wrmsr(MSR_K7_EVNTSEL0, 0, 0); | ||
187 | break; | ||
188 | case X86_VENDOR_INTEL: | ||
189 | wrmsr(MSR_IA32_EVNTSEL0, 0, 0); | ||
190 | break; | ||
191 | } | ||
192 | nmi_active = -1; | ||
193 | /* tell do_nmi() and others that we're not active any more */ | ||
194 | nmi_watchdog = 0; | ||
195 | } | ||
196 | |||
197 | static void enable_lapic_nmi_watchdog(void) | ||
198 | { | ||
199 | if (nmi_active < 0) { | ||
200 | nmi_watchdog = NMI_LOCAL_APIC; | ||
201 | setup_apic_nmi_watchdog(); | ||
202 | } | ||
203 | } | ||
204 | |||
205 | int reserve_lapic_nmi(void) | ||
206 | { | ||
207 | unsigned int old_owner; | ||
208 | |||
209 | spin_lock(&lapic_nmi_owner_lock); | ||
210 | old_owner = lapic_nmi_owner; | ||
211 | lapic_nmi_owner |= LAPIC_NMI_RESERVED; | ||
212 | spin_unlock(&lapic_nmi_owner_lock); | ||
213 | if (old_owner & LAPIC_NMI_RESERVED) | ||
214 | return -EBUSY; | ||
215 | if (old_owner & LAPIC_NMI_WATCHDOG) | ||
216 | disable_lapic_nmi_watchdog(); | ||
217 | return 0; | ||
218 | } | ||
219 | |||
220 | void release_lapic_nmi(void) | ||
221 | { | ||
222 | unsigned int new_owner; | ||
223 | |||
224 | spin_lock(&lapic_nmi_owner_lock); | ||
225 | new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; | ||
226 | lapic_nmi_owner = new_owner; | ||
227 | spin_unlock(&lapic_nmi_owner_lock); | ||
228 | if (new_owner & LAPIC_NMI_WATCHDOG) | ||
229 | enable_lapic_nmi_watchdog(); | ||
230 | } | ||
231 | |||
232 | void disable_timer_nmi_watchdog(void) | ||
233 | { | ||
234 | if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0)) | ||
235 | return; | ||
236 | |||
237 | disable_irq(0); | ||
238 | unset_nmi_callback(); | ||
239 | nmi_active = -1; | ||
240 | nmi_watchdog = NMI_NONE; | ||
241 | } | ||
242 | |||
243 | void enable_timer_nmi_watchdog(void) | ||
244 | { | ||
245 | if (nmi_active < 0) { | ||
246 | nmi_watchdog = NMI_IO_APIC; | ||
247 | touch_nmi_watchdog(); | ||
248 | nmi_active = 1; | ||
249 | enable_irq(0); | ||
250 | } | ||
251 | } | ||
252 | |||
253 | #ifdef CONFIG_PM | ||
254 | |||
255 | static int nmi_pm_active; /* nmi_active before suspend */ | ||
256 | |||
257 | static int lapic_nmi_suspend(struct sys_device *dev, u32 state) | ||
258 | { | ||
259 | nmi_pm_active = nmi_active; | ||
260 | disable_lapic_nmi_watchdog(); | ||
261 | return 0; | ||
262 | } | ||
263 | |||
264 | static int lapic_nmi_resume(struct sys_device *dev) | ||
265 | { | ||
266 | if (nmi_pm_active > 0) | ||
267 | enable_lapic_nmi_watchdog(); | ||
268 | return 0; | ||
269 | } | ||
270 | |||
271 | static struct sysdev_class nmi_sysclass = { | ||
272 | set_kset_name("lapic_nmi"), | ||
273 | .resume = lapic_nmi_resume, | ||
274 | .suspend = lapic_nmi_suspend, | ||
275 | }; | ||
276 | |||
277 | static struct sys_device device_lapic_nmi = { | ||
278 | .id = 0, | ||
279 | .cls = &nmi_sysclass, | ||
280 | }; | ||
281 | |||
282 | static int __init init_lapic_nmi_sysfs(void) | ||
283 | { | ||
284 | int error; | ||
285 | |||
286 | if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC) | ||
287 | return 0; | ||
288 | |||
289 | error = sysdev_class_register(&nmi_sysclass); | ||
290 | if (!error) | ||
291 | error = sysdev_register(&device_lapic_nmi); | ||
292 | return error; | ||
293 | } | ||
294 | /* must come after the local APIC's device_initcall() */ | ||
295 | late_initcall(init_lapic_nmi_sysfs); | ||
296 | |||
297 | #endif /* CONFIG_PM */ | ||
298 | |||
299 | /* | ||
300 | * Activate the NMI watchdog via the local APIC. | ||
301 | * Original code written by Keith Owens. | ||
302 | */ | ||
303 | |||
304 | static void setup_k7_watchdog(void) | ||
305 | { | ||
306 | int i; | ||
307 | unsigned int evntsel; | ||
308 | |||
309 | /* No check, so can start with slow frequency */ | ||
310 | nmi_hz = 1; | ||
311 | |||
312 | /* XXX should check these in EFER */ | ||
313 | |||
314 | nmi_perfctr_msr = MSR_K7_PERFCTR0; | ||
315 | |||
316 | for(i = 0; i < 4; ++i) { | ||
317 | /* Simulator may not support it */ | ||
318 | if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) | ||
319 | return; | ||
320 | wrmsrl(MSR_K7_PERFCTR0+i, 0UL); | ||
321 | } | ||
322 | |||
323 | evntsel = K7_EVNTSEL_INT | ||
324 | | K7_EVNTSEL_OS | ||
325 | | K7_EVNTSEL_USR | ||
326 | | K7_NMI_EVENT; | ||
327 | |||
328 | wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); | ||
329 | wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz); | ||
330 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
331 | evntsel |= K7_EVNTSEL_ENABLE; | ||
332 | wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); | ||
333 | } | ||
334 | |||
335 | void setup_apic_nmi_watchdog(void) | ||
336 | { | ||
337 | switch (boot_cpu_data.x86_vendor) { | ||
338 | case X86_VENDOR_AMD: | ||
339 | if (boot_cpu_data.x86 < 6) | ||
340 | return; | ||
341 | if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) | ||
342 | return; | ||
343 | setup_k7_watchdog(); | ||
344 | break; | ||
345 | default: | ||
346 | return; | ||
347 | } | ||
348 | lapic_nmi_owner = LAPIC_NMI_WATCHDOG; | ||
349 | nmi_active = 1; | ||
350 | } | ||
351 | |||
352 | /* | ||
353 | * the best way to detect whether a CPU has a 'hard lockup' problem | ||
354 | * is to check its local APIC timer IRQ counts. If they are not | ||
355 | * changing then that CPU has some problem. | ||
356 | * | ||
357 | * as these watchdog NMI IRQs are generated on every CPU, we only | ||
358 | * have to check the current processor. | ||
359 | * | ||
360 | * since NMIs don't listen to _any_ locks, we have to be extremely | ||
361 | * careful not to rely on unsafe variables. The printk might lock | ||
362 | * up though, so we have to break up any console locks first ... | ||
363 | * [when there will be more tty-related locks, break them up | ||
364 | * here too!] | ||
365 | */ | ||
366 | |||
367 | static unsigned int | ||
368 | last_irq_sums [NR_CPUS], | ||
369 | alert_counter [NR_CPUS]; | ||
370 | |||
371 | void touch_nmi_watchdog (void) | ||
372 | { | ||
373 | int i; | ||
374 | |||
375 | /* | ||
376 | * Just reset the alert counters, (other CPUs might be | ||
377 | * spinning on locks we hold): | ||
378 | */ | ||
379 | for (i = 0; i < NR_CPUS; i++) | ||
380 | alert_counter[i] = 0; | ||
381 | } | ||
382 | |||
383 | void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) | ||
384 | { | ||
385 | int sum, cpu; | ||
386 | |||
387 | cpu = safe_smp_processor_id(); | ||
388 | sum = read_pda(apic_timer_irqs); | ||
389 | if (last_irq_sums[cpu] == sum) { | ||
390 | /* | ||
391 | * Ayiee, looks like this CPU is stuck ... | ||
392 | * wait a few IRQs (5 seconds) before doing the oops ... | ||
393 | */ | ||
394 | alert_counter[cpu]++; | ||
395 | if (alert_counter[cpu] == 5*nmi_hz) { | ||
396 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) | ||
397 | == NOTIFY_STOP) { | ||
398 | alert_counter[cpu] = 0; | ||
399 | return; | ||
400 | } | ||
401 | die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs); | ||
402 | } | ||
403 | } else { | ||
404 | last_irq_sums[cpu] = sum; | ||
405 | alert_counter[cpu] = 0; | ||
406 | } | ||
407 | if (nmi_perfctr_msr) | ||
408 | wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); | ||
409 | } | ||
410 | |||
/*
 * Default NMI callback: claims nothing.  Returning 0 makes do_nmi() fall
 * through to default_do_nmi().
 */
static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
{
	const int not_handled = 0;

	return not_handled;
}
415 | |||
416 | static nmi_callback_t nmi_callback = dummy_nmi_callback; | ||
417 | |||
418 | asmlinkage void do_nmi(struct pt_regs * regs, long error_code) | ||
419 | { | ||
420 | int cpu = safe_smp_processor_id(); | ||
421 | |||
422 | nmi_enter(); | ||
423 | add_pda(__nmi_count,1); | ||
424 | if (!nmi_callback(regs, cpu)) | ||
425 | default_do_nmi(regs); | ||
426 | nmi_exit(); | ||
427 | } | ||
428 | |||
429 | void set_nmi_callback(nmi_callback_t callback) | ||
430 | { | ||
431 | nmi_callback = callback; | ||
432 | } | ||
433 | |||
434 | void unset_nmi_callback(void) | ||
435 | { | ||
436 | nmi_callback = dummy_nmi_callback; | ||
437 | } | ||
438 | |||
439 | #ifdef CONFIG_SYSCTL | ||
440 | |||
/*
 * NMI callback used while unknown_nmi_panic is set: when neither of the
 * two top bits (0xc0) of the NMI reason byte is set, report the NMI
 * through die_nmi().  Always returns 0.
 */
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
{
	unsigned char reason = get_nmi_reason();
	char buf[64];

	if (reason & 0xc0)
		return 0;

	sprintf(buf, "NMI received for unknown reason %02x\n", reason);
	die_nmi(buf,regs);
	return 0;
}
452 | |||
453 | /* | ||
454 | * proc handler for /proc/sys/kernel/unknown_nmi_panic | ||
455 | */ | ||
456 | int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file, | ||
457 | void __user *buffer, size_t *length, loff_t *ppos) | ||
458 | { | ||
459 | int old_state; | ||
460 | |||
461 | old_state = unknown_nmi_panic; | ||
462 | proc_dointvec(table, write, file, buffer, length, ppos); | ||
463 | if (!!old_state == !!unknown_nmi_panic) | ||
464 | return 0; | ||
465 | |||
466 | if (unknown_nmi_panic) { | ||
467 | if (reserve_lapic_nmi() < 0) { | ||
468 | unknown_nmi_panic = 0; | ||
469 | return -EBUSY; | ||
470 | } else { | ||
471 | set_nmi_callback(unknown_nmi_panic_callback); | ||
472 | } | ||
473 | } else { | ||
474 | release_lapic_nmi(); | ||
475 | unset_nmi_callback(); | ||
476 | } | ||
477 | return 0; | ||
478 | } | ||
479 | |||
480 | #endif | ||
481 | |||
482 | EXPORT_SYMBOL(nmi_active); | ||
483 | EXPORT_SYMBOL(nmi_watchdog); | ||
484 | EXPORT_SYMBOL(reserve_lapic_nmi); | ||
485 | EXPORT_SYMBOL(release_lapic_nmi); | ||
486 | EXPORT_SYMBOL(disable_timer_nmi_watchdog); | ||
487 | EXPORT_SYMBOL(enable_timer_nmi_watchdog); | ||
488 | EXPORT_SYMBOL(touch_nmi_watchdog); | ||
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c new file mode 100644 index 000000000000..cab471cf3edb --- /dev/null +++ b/arch/x86_64/kernel/pci-dma.c | |||
@@ -0,0 +1,60 @@ | |||
1 | /* | ||
2 | * Dynamic DMA mapping support. | ||
3 | */ | ||
4 | |||
5 | #include <linux/types.h> | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/string.h> | ||
8 | #include <linux/pci.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <asm/io.h> | ||
11 | |||
12 | /* Map a set of buffers described by scatterlist in streaming | ||
13 | * mode for DMA. This is the scatter-gather version of the | ||
14 | * above pci_map_single interface. Here the scatter gather list | ||
15 | * elements are each tagged with the appropriate dma address | ||
16 | * and length. They are obtained via sg_dma_{address,length}(SG). | ||
17 | * | ||
18 | * NOTE: An implementation may be able to use a smaller number of | ||
19 | * DMA address/length pairs than there are SG table elements. | ||
20 | * (for example via virtual mapping capabilities) | ||
21 | * The routine returns the number of addr/length pairs actually | ||
22 | * used, at most nents. | ||
23 | * | ||
24 | * Device ownership issues as mentioned above for pci_map_single are | ||
25 | * the same here. | ||
26 | */ | ||
27 | int dma_map_sg(struct device *hwdev, struct scatterlist *sg, | ||
28 | int nents, int direction) | ||
29 | { | ||
30 | int i; | ||
31 | |||
32 | BUG_ON(direction == DMA_NONE); | ||
33 | for (i = 0; i < nents; i++ ) { | ||
34 | struct scatterlist *s = &sg[i]; | ||
35 | BUG_ON(!s->page); | ||
36 | s->dma_address = virt_to_bus(page_address(s->page) +s->offset); | ||
37 | s->dma_length = s->length; | ||
38 | } | ||
39 | return nents; | ||
40 | } | ||
41 | |||
42 | EXPORT_SYMBOL(dma_map_sg); | ||
43 | |||
44 | /* Unmap a set of streaming mode DMA translations. | ||
45 | * Again, cpu read rules concerning calls here are the same as for | ||
46 | * pci_unmap_single() above. | ||
47 | */ | ||
48 | void dma_unmap_sg(struct device *dev, struct scatterlist *sg, | ||
49 | int nents, int dir) | ||
50 | { | ||
51 | int i; | ||
52 | for (i = 0; i < nents; i++) { | ||
53 | struct scatterlist *s = &sg[i]; | ||
54 | BUG_ON(s->page == NULL); | ||
55 | BUG_ON(s->dma_address == 0); | ||
56 | dma_unmap_single(dev, s->dma_address, s->dma_length, dir); | ||
57 | } | ||
58 | } | ||
59 | |||
60 | EXPORT_SYMBOL(dma_unmap_sg); | ||
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c new file mode 100644 index 000000000000..57f35c68aa34 --- /dev/null +++ b/arch/x86_64/kernel/pci-gart.c | |||
@@ -0,0 +1,980 @@ | |||
1 | /* | ||
2 | * Dynamic DMA mapping support for AMD Hammer. | ||
3 | * | ||
4 | * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. | ||
5 | * This allows PCI devices that only support 32bit addresses to be used on | ||
6 | * systems with more than 4GB. | ||
7 | * | ||
8 | * See Documentation/DMA-mapping.txt for the interface specification. | ||
9 | * | ||
10 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
11 | */ | ||
12 | |||
13 | #include <linux/config.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/ctype.h> | ||
16 | #include <linux/agp_backend.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <linux/mm.h> | ||
19 | #include <linux/string.h> | ||
20 | #include <linux/spinlock.h> | ||
21 | #include <linux/pci.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/topology.h> | ||
24 | #include <linux/interrupt.h> | ||
25 | #include <linux/bitops.h> | ||
26 | #include <asm/atomic.h> | ||
27 | #include <asm/io.h> | ||
28 | #include <asm/mtrr.h> | ||
29 | #include <asm/pgtable.h> | ||
30 | #include <asm/proto.h> | ||
31 | #include <asm/cacheflush.h> | ||
32 | #include <asm/kdebug.h> | ||
33 | |||
34 | dma_addr_t bad_dma_address; | ||
35 | |||
36 | unsigned long iommu_bus_base; /* GART remapping area (physical) */ | ||
37 | static unsigned long iommu_size; /* size of remapping area bytes */ | ||
38 | static unsigned long iommu_pages; /* .. and in pages */ | ||
39 | |||
40 | u32 *iommu_gatt_base; /* Remapping table */ | ||
41 | |||
42 | int no_iommu; | ||
43 | static int no_agp; | ||
44 | #ifdef CONFIG_IOMMU_DEBUG | ||
45 | int panic_on_overflow = 1; | ||
46 | int force_iommu = 1; | ||
47 | #else | ||
48 | int panic_on_overflow = 0; | ||
49 | int force_iommu = 0; | ||
50 | #endif | ||
51 | int iommu_merge = 1; | ||
52 | int iommu_sac_force = 0; | ||
53 | |||
54 | /* If this is disabled the IOMMU will use an optimized flushing strategy | ||
55 | of only flushing when a mapping is reused. With it true the GART is flushed | ||
56 | for every mapping. Problem is that doing the lazy flush seems to trigger | ||
57 | bugs with some popular PCI cards, in particular 3ware (but it has also | ||
58 | been seen with Qlogic at least). */ | ||
59 | int iommu_fullflush = 1; | ||
60 | |||
61 | /* This tells the BIO block layer to assume merging. Default to off | ||
62 | because we cannot guarantee merging later. */ | ||
63 | int iommu_bio_merge = 0; | ||
64 | |||
65 | #define MAX_NB 8 | ||
66 | |||
67 | /* Allocation bitmap for the remapping area */ | ||
68 | static DEFINE_SPINLOCK(iommu_bitmap_lock); | ||
69 | static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ | ||
70 | |||
71 | static u32 gart_unmapped_entry; | ||
72 | |||
73 | #define GPTE_VALID 1 | ||
74 | #define GPTE_COHERENT 2 | ||
75 | #define GPTE_ENCODE(x) \ | ||
76 | (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) | ||
77 | #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) | ||
78 | |||
79 | #define to_pages(addr,size) \ | ||
80 | (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) | ||
81 | |||
82 | #define for_all_nb(dev) \ | ||
83 | dev = NULL; \ | ||
84 | while ((dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL)\ | ||
85 | if (dev->bus->number == 0 && \ | ||
86 | (PCI_SLOT(dev->devfn) >= 24) && (PCI_SLOT(dev->devfn) <= 31)) | ||
87 | |||
88 | static struct pci_dev *northbridges[MAX_NB]; | ||
89 | static u32 northbridge_flush_word[MAX_NB]; | ||
90 | |||
91 | #define EMERGENCY_PAGES 32 /* = 128KB */ | ||
92 | |||
93 | #ifdef CONFIG_AGP | ||
94 | #define AGPEXTERN extern | ||
95 | #else | ||
96 | #define AGPEXTERN | ||
97 | #endif | ||
98 | |||
99 | /* backdoor interface to AGP driver */ | ||
100 | AGPEXTERN int agp_memory_reserved; | ||
101 | AGPEXTERN __u32 *agp_gatt_table; | ||
102 | |||
103 | static unsigned long next_bit; /* protected by iommu_bitmap_lock */ | ||
104 | static int need_flush; /* global flush state. set for each gart wrap */ | ||
105 | static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem, | ||
106 | size_t size, int dir, int do_panic); | ||
107 | |||
108 | /* Dummy device used for NULL arguments (normally ISA). Better would | ||
109 | be probably a smaller DMA mask, but this is bug-to-bug compatible to i386. */ | ||
110 | static struct device fallback_dev = { | ||
111 | .bus_id = "fallback device", | ||
112 | .coherent_dma_mask = 0xffffffff, | ||
113 | .dma_mask = &fallback_dev.coherent_dma_mask, | ||
114 | }; | ||
115 | |||
116 | static unsigned long alloc_iommu(int size) | ||
117 | { | ||
118 | unsigned long offset, flags; | ||
119 | |||
120 | spin_lock_irqsave(&iommu_bitmap_lock, flags); | ||
121 | offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); | ||
122 | if (offset == -1) { | ||
123 | need_flush = 1; | ||
124 | offset = find_next_zero_string(iommu_gart_bitmap,0,next_bit,size); | ||
125 | } | ||
126 | if (offset != -1) { | ||
127 | set_bit_string(iommu_gart_bitmap, offset, size); | ||
128 | next_bit = offset+size; | ||
129 | if (next_bit >= iommu_pages) { | ||
130 | next_bit = 0; | ||
131 | need_flush = 1; | ||
132 | } | ||
133 | } | ||
134 | if (iommu_fullflush) | ||
135 | need_flush = 1; | ||
136 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); | ||
137 | return offset; | ||
138 | } | ||
139 | |||
140 | static void free_iommu(unsigned long offset, int size) | ||
141 | { | ||
142 | unsigned long flags; | ||
143 | if (size == 1) { | ||
144 | clear_bit(offset, iommu_gart_bitmap); | ||
145 | return; | ||
146 | } | ||
147 | spin_lock_irqsave(&iommu_bitmap_lock, flags); | ||
148 | __clear_bit_string(iommu_gart_bitmap, offset, size); | ||
149 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); | ||
150 | } | ||
151 | |||
152 | /* | ||
153 | * Use global flush state to avoid races with multiple flushers. | ||
154 | */ | ||
155 | static void flush_gart(struct device *dev) | ||
156 | { | ||
157 | unsigned long flags; | ||
158 | int flushed = 0; | ||
159 | int i, max; | ||
160 | |||
161 | spin_lock_irqsave(&iommu_bitmap_lock, flags); | ||
162 | if (need_flush) { | ||
163 | max = 0; | ||
164 | for (i = 0; i < MAX_NB; i++) { | ||
165 | if (!northbridges[i]) | ||
166 | continue; | ||
167 | pci_write_config_dword(northbridges[i], 0x9c, | ||
168 | northbridge_flush_word[i] | 1); | ||
169 | flushed++; | ||
170 | max = i; | ||
171 | } | ||
172 | for (i = 0; i <= max; i++) { | ||
173 | u32 w; | ||
174 | if (!northbridges[i]) | ||
175 | continue; | ||
176 | /* Make sure the hardware actually executed the flush. */ | ||
177 | do { | ||
178 | pci_read_config_dword(northbridges[i], 0x9c, &w); | ||
179 | } while (w & 1); | ||
180 | } | ||
181 | if (!flushed) | ||
182 | printk("nothing to flush?\n"); | ||
183 | need_flush = 0; | ||
184 | } | ||
185 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); | ||
186 | } | ||
187 | |||
188 | /* Allocate DMA memory on node near device */ | ||
189 | noinline | ||
190 | static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order) | ||
191 | { | ||
192 | struct page *page; | ||
193 | int node; | ||
194 | if (dev->bus == &pci_bus_type) { | ||
195 | cpumask_t mask; | ||
196 | mask = pcibus_to_cpumask(to_pci_dev(dev)->bus); | ||
197 | node = cpu_to_node(first_cpu(mask)); | ||
198 | } else | ||
199 | node = numa_node_id(); | ||
200 | page = alloc_pages_node(node, gfp, order); | ||
201 | return page ? page_address(page) : NULL; | ||
202 | } | ||
203 | |||
/*
 * Allocate memory for a coherent mapping.
 *
 * @dev:        device the memory is for; NULL selects fallback_dev
 * @size:       number of bytes requested
 * @dma_handle: out parameter receiving the bus address for the device
 * @gfp:        allocation flags; GFP_DMA may be added internally on retry
 *
 * Returns the kernel virtual address of the zeroed buffer, or NULL when
 * no suitable memory could be obtained.  With swiotlb active and the
 * first allocation out of the device's reach, the request is handed over
 * to swiotlb_alloc_coherent() and its result is returned unchanged.
 */
void *
dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
		   unsigned gfp)
{
	void *memory;
	unsigned long dma_mask = 0;
	u64 bus;

	if (!dev)
		dev = &fallback_dev;
	dma_mask = dev->coherent_dma_mask;
	/* An unset coherent mask is treated as a 32bit mask. */
	if (dma_mask == 0)
		dma_mask = 0xffffffff;

	/* Kludge to make it bug-to-bug compatible with i386. i386
	   uses the normal dma_mask for alloc_coherent. */
	dma_mask &= *dev->dma_mask;

 again:
	memory = dma_alloc_pages(dev, gfp, get_order(size));
	if (memory == NULL)
		return NULL;

	{
		int high, mmu;
		bus = virt_to_bus(memory);
		/* high: the buffer ends at or beyond the effective mask. */
		high = (bus + size) >= dma_mask;
		mmu = high;
		if (force_iommu && !(gfp & GFP_DMA))
			mmu = 1;
		/*
		 * Remapping is not attempted when the IOMMU is off or the
		 * mask is below 32 bits: the memory itself must be reachable.
		 */
		if (no_iommu || dma_mask < 0xffffffffUL) {
			if (high) {
				free_pages((unsigned long)memory,
					   get_order(size));

				if (swiotlb) {
					return
					swiotlb_alloc_coherent(dev, size,
							       dma_handle,
							       gfp);
				}

				/* Retry once from the GFP_DMA zone. */
				if (!(gfp & GFP_DMA)) {
					gfp |= GFP_DMA;
					goto again;
				}
				return NULL;
			}
			mmu = 0;
		}
		memset(memory, 0, size);
		if (!mmu) {
			/* Directly reachable: bus address is used as-is. */
			*dma_handle = virt_to_bus(memory);
			return memory;
		}
	}

	/* Remap through the GART; do_panic == 0, failure is handled below. */
	*dma_handle = dma_map_area(dev, bus, size, PCI_DMA_BIDIRECTIONAL, 0);
	if (*dma_handle == bad_dma_address)
		goto error;
	flush_gart(dev);
	return memory;

error:
	if (panic_on_overflow)
		panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", size);
	free_pages((unsigned long)memory, get_order(size));
	return NULL;
}
276 | |||
277 | /* | ||
278 | * Unmap coherent memory. | ||
279 | * The caller must ensure that the device has finished accessing the mapping. | ||
280 | */ | ||
281 | void dma_free_coherent(struct device *dev, size_t size, | ||
282 | void *vaddr, dma_addr_t bus) | ||
283 | { | ||
284 | if (swiotlb) { | ||
285 | swiotlb_free_coherent(dev, size, vaddr, bus); | ||
286 | return; | ||
287 | } | ||
288 | |||
289 | dma_unmap_single(dev, bus, size, 0); | ||
290 | free_pages((unsigned long)vaddr, get_order(size)); | ||
291 | } | ||
292 | |||
293 | #ifdef CONFIG_IOMMU_LEAK | ||
294 | |||
295 | #define SET_LEAK(x) if (iommu_leak_tab) \ | ||
296 | iommu_leak_tab[x] = __builtin_return_address(0); | ||
297 | #define CLEAR_LEAK(x) if (iommu_leak_tab) \ | ||
298 | iommu_leak_tab[x] = NULL; | ||
299 | |||
300 | /* Debugging aid for drivers that don't free their IOMMU tables */ | ||
301 | static void **iommu_leak_tab; | ||
302 | static int leak_trace; | ||
303 | int iommu_leak_pages = 20; | ||
304 | void dump_leak(void) | ||
305 | { | ||
306 | int i; | ||
307 | static int dump; | ||
308 | if (dump || !iommu_leak_tab) return; | ||
309 | dump = 1; | ||
310 | show_stack(NULL,NULL); | ||
311 | /* Very crude. dump some from the end of the table too */ | ||
312 | printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); | ||
313 | for (i = 0; i < iommu_leak_pages; i+=2) { | ||
314 | printk("%lu: ", iommu_pages-i); | ||
315 | printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); | ||
316 | printk("%c", (i+1)%2 == 0 ? '\n' : ' '); | ||
317 | } | ||
318 | printk("\n"); | ||
319 | } | ||
320 | #else | ||
321 | #define SET_LEAK(x) | ||
322 | #define CLEAR_LEAK(x) | ||
323 | #endif | ||
324 | |||
325 | static void iommu_full(struct device *dev, size_t size, int dir, int do_panic) | ||
326 | { | ||
327 | /* | ||
328 | * Ran out of IOMMU space for this operation. This is very bad. | ||
329 | * Unfortunately the drivers cannot handle this operation properly. | ||
330 | * Return some non mapped prereserved space in the aperture and | ||
331 | * let the Northbridge deal with it. This will result in garbage | ||
332 | * in the IO operation. When the size exceeds the prereserved space | ||
333 | * memory corruption will occur or random memory will be DMAed | ||
334 | * out. Hopefully no network devices use single mappings that big. | ||
335 | */ | ||
336 | |||
337 | printk(KERN_ERR | ||
338 | "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", | ||
339 | size, dev->bus_id); | ||
340 | |||
341 | if (size > PAGE_SIZE*EMERGENCY_PAGES && do_panic) { | ||
342 | if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) | ||
343 | panic("PCI-DMA: Memory would be corrupted\n"); | ||
344 | if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) | ||
345 | panic("PCI-DMA: Random memory would be DMAed\n"); | ||
346 | } | ||
347 | |||
348 | #ifdef CONFIG_IOMMU_LEAK | ||
349 | dump_leak(); | ||
350 | #endif | ||
351 | } | ||
352 | |||
353 | static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) | ||
354 | { | ||
355 | u64 mask = *dev->dma_mask; | ||
356 | int high = addr + size >= mask; | ||
357 | int mmu = high; | ||
358 | if (force_iommu) | ||
359 | mmu = 1; | ||
360 | if (no_iommu) { | ||
361 | if (high) | ||
362 | panic("PCI-DMA: high address but no IOMMU.\n"); | ||
363 | mmu = 0; | ||
364 | } | ||
365 | return mmu; | ||
366 | } | ||
367 | |||
368 | static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) | ||
369 | { | ||
370 | u64 mask = *dev->dma_mask; | ||
371 | int high = addr + size >= mask; | ||
372 | int mmu = high; | ||
373 | if (no_iommu) { | ||
374 | if (high) | ||
375 | panic("PCI-DMA: high address but no IOMMU.\n"); | ||
376 | mmu = 0; | ||
377 | } | ||
378 | return mmu; | ||
379 | } | ||
380 | |||
381 | /* Map a single continuous physical area into the IOMMU. | ||
382 | * Caller needs to check if the iommu is needed and flush. | ||
383 | */ | ||
384 | static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem, | ||
385 | size_t size, int dir, int do_panic) | ||
386 | { | ||
387 | unsigned long npages = to_pages(phys_mem, size); | ||
388 | unsigned long iommu_page = alloc_iommu(npages); | ||
389 | int i; | ||
390 | if (iommu_page == -1) { | ||
391 | if (!nonforced_iommu(dev, phys_mem, size)) | ||
392 | return phys_mem; | ||
393 | if (panic_on_overflow) | ||
394 | panic("dma_map_area overflow %lu bytes\n", size); | ||
395 | iommu_full(dev, size, dir, do_panic); | ||
396 | return bad_dma_address; | ||
397 | } | ||
398 | |||
399 | for (i = 0; i < npages; i++) { | ||
400 | iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); | ||
401 | SET_LEAK(iommu_page + i); | ||
402 | phys_mem += PAGE_SIZE; | ||
403 | } | ||
404 | return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); | ||
405 | } | ||
406 | |||
407 | /* Map a single area into the IOMMU */ | ||
408 | dma_addr_t dma_map_single(struct device *dev, void *addr, size_t size, int dir) | ||
409 | { | ||
410 | unsigned long phys_mem, bus; | ||
411 | |||
412 | BUG_ON(dir == DMA_NONE); | ||
413 | |||
414 | if (swiotlb) | ||
415 | return swiotlb_map_single(dev,addr,size,dir); | ||
416 | if (!dev) | ||
417 | dev = &fallback_dev; | ||
418 | |||
419 | phys_mem = virt_to_phys(addr); | ||
420 | if (!need_iommu(dev, phys_mem, size)) | ||
421 | return phys_mem; | ||
422 | |||
423 | bus = dma_map_area(dev, phys_mem, size, dir, 1); | ||
424 | flush_gart(dev); | ||
425 | return bus; | ||
426 | } | ||
427 | |||
428 | /* Fallback for dma_map_sg in case of overflow */ | ||
429 | static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, | ||
430 | int nents, int dir) | ||
431 | { | ||
432 | int i; | ||
433 | |||
434 | #ifdef CONFIG_IOMMU_DEBUG | ||
435 | printk(KERN_DEBUG "dma_map_sg overflow\n"); | ||
436 | #endif | ||
437 | |||
438 | for (i = 0; i < nents; i++ ) { | ||
439 | struct scatterlist *s = &sg[i]; | ||
440 | unsigned long addr = page_to_phys(s->page) + s->offset; | ||
441 | if (nonforced_iommu(dev, addr, s->length)) { | ||
442 | addr = dma_map_area(dev, addr, s->length, dir, 0); | ||
443 | if (addr == bad_dma_address) { | ||
444 | if (i > 0) | ||
445 | dma_unmap_sg(dev, sg, i, dir); | ||
446 | nents = 0; | ||
447 | sg[0].dma_length = 0; | ||
448 | break; | ||
449 | } | ||
450 | } | ||
451 | s->dma_address = addr; | ||
452 | s->dma_length = s->length; | ||
453 | } | ||
454 | flush_gart(dev); | ||
455 | return nents; | ||
456 | } | ||
457 | |||
458 | /* Map multiple scatterlist entries continuous into the first. */ | ||
459 | static int __dma_map_cont(struct scatterlist *sg, int start, int stopat, | ||
460 | struct scatterlist *sout, unsigned long pages) | ||
461 | { | ||
462 | unsigned long iommu_start = alloc_iommu(pages); | ||
463 | unsigned long iommu_page = iommu_start; | ||
464 | int i; | ||
465 | |||
466 | if (iommu_start == -1) | ||
467 | return -1; | ||
468 | |||
469 | for (i = start; i < stopat; i++) { | ||
470 | struct scatterlist *s = &sg[i]; | ||
471 | unsigned long pages, addr; | ||
472 | unsigned long phys_addr = s->dma_address; | ||
473 | |||
474 | BUG_ON(i > start && s->offset); | ||
475 | if (i == start) { | ||
476 | *sout = *s; | ||
477 | sout->dma_address = iommu_bus_base; | ||
478 | sout->dma_address += iommu_page*PAGE_SIZE + s->offset; | ||
479 | sout->dma_length = s->length; | ||
480 | } else { | ||
481 | sout->dma_length += s->length; | ||
482 | } | ||
483 | |||
484 | addr = phys_addr; | ||
485 | pages = to_pages(s->offset, s->length); | ||
486 | while (pages--) { | ||
487 | iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); | ||
488 | SET_LEAK(iommu_page); | ||
489 | addr += PAGE_SIZE; | ||
490 | iommu_page++; | ||
491 | } | ||
492 | } | ||
493 | BUG_ON(iommu_page - iommu_start != pages); | ||
494 | return 0; | ||
495 | } | ||
496 | |||
497 | static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat, | ||
498 | struct scatterlist *sout, | ||
499 | unsigned long pages, int need) | ||
500 | { | ||
501 | if (!need) { | ||
502 | BUG_ON(stopat - start != 1); | ||
503 | *sout = sg[start]; | ||
504 | sout->dma_length = sg[start].length; | ||
505 | return 0; | ||
506 | } | ||
507 | return __dma_map_cont(sg, start, stopat, sout, pages); | ||
508 | } | ||
509 | |||
/*
 * DMA map all entries in a scatterlist.
 * Merge chunks that have page aligned sizes into a continuous mapping.
 *
 * @dev:   device doing the DMA; NULL selects fallback_dev
 * @sg:    scatterlist, rewritten in place (output entries are packed at
 *         the front of the array)
 * @nents: number of entries in @sg
 * @dir:   DMA direction, must not be DMA_NONE
 *
 * Returns the number of output entries, or 0 on failure.  If fewer
 * entries than @nents are produced, the first unused entry gets
 * dma_length = 0 as a terminator.  With swiotlb active the whole request
 * is delegated to swiotlb_map_sg().
 */
int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
{
	int i;
	int out;	/* number of output entries written so far */
	int start;	/* first input entry of the run not yet mapped */
	unsigned long pages = 0;	/* pages accumulated for that run */
	int need = 0, nextneed;

	BUG_ON(dir == DMA_NONE);
	if (nents == 0)
		return 0;

	if (swiotlb)
		return swiotlb_map_sg(dev,sg,nents,dir);
	if (!dev)
		dev = &fallback_dev;

	out = 0;
	start = 0;
	for (i = 0; i < nents; i++) {
		struct scatterlist *s = &sg[i];
		dma_addr_t addr = page_to_phys(s->page) + s->offset;
		/* Stash the physical address; the mapping helpers read it. */
		s->dma_address = addr;
		BUG_ON(s->length == 0);

		nextneed = need_iommu(dev, addr, s->length);

		/* Handle the previous not yet processed entries */
		if (i > start) {
			struct scatterlist *ps = &sg[i-1];
			/* Can only merge when the last chunk ends on a page
			   boundary and the new one doesn't have an offset. */
			if (!iommu_merge || !nextneed || !need || s->offset ||
			    (ps->offset + ps->length) % PAGE_SIZE) {
				/* Cannot merge: map the pending run now. */
				if (dma_map_cont(sg, start, i, sg+out, pages,
						 need) < 0)
					goto error;
				out++;
				pages = 0;
				start = i;
			}
		}

		need = nextneed;
		pages += to_pages(s->offset, s->length);
	}
	/* Map whatever run is still pending after the last entry. */
	if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
		goto error;
	out++;
	flush_gart(dev);
	if (out < nents)
		sg[out].dma_length = 0;
	return out;

error:
	flush_gart(NULL);
	dma_unmap_sg(dev, sg, nents, dir);
	/* When it was forced try again unforced */
	if (force_iommu)
		return dma_map_sg_nonforce(dev, sg, nents, dir);
	if (panic_on_overflow)
		panic("dma_map_sg: overflow on %lu pages\n", pages);
	iommu_full(dev, pages << PAGE_SHIFT, dir, 0);
	/* Mark every entry as unmapped for the caller. */
	for (i = 0; i < nents; i++)
		sg[i].dma_address = bad_dma_address;
	return 0;
}
581 | |||
582 | /* | ||
583 | * Free a DMA mapping. | ||
584 | */ | ||
585 | void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, | ||
586 | size_t size, int direction) | ||
587 | { | ||
588 | unsigned long iommu_page; | ||
589 | int npages; | ||
590 | int i; | ||
591 | |||
592 | if (swiotlb) { | ||
593 | swiotlb_unmap_single(dev,dma_addr,size,direction); | ||
594 | return; | ||
595 | } | ||
596 | |||
597 | if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || | ||
598 | dma_addr >= iommu_bus_base + iommu_size) | ||
599 | return; | ||
600 | iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; | ||
601 | npages = to_pages(dma_addr, size); | ||
602 | for (i = 0; i < npages; i++) { | ||
603 | iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; | ||
604 | CLEAR_LEAK(iommu_page + i); | ||
605 | } | ||
606 | free_iommu(iommu_page, npages); | ||
607 | } | ||
608 | |||
609 | /* | ||
610 | * Wrapper for pci_unmap_single working with scatterlists. | ||
611 | */ | ||
612 | void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) | ||
613 | { | ||
614 | int i; | ||
615 | if (swiotlb) { | ||
616 | swiotlb_unmap_sg(dev,sg,nents,dir); | ||
617 | return; | ||
618 | } | ||
619 | for (i = 0; i < nents; i++) { | ||
620 | struct scatterlist *s = &sg[i]; | ||
621 | if (!s->dma_length || !s->length) | ||
622 | break; | ||
623 | dma_unmap_single(dev, s->dma_address, s->dma_length, dir); | ||
624 | } | ||
625 | } | ||
626 | |||
627 | int dma_supported(struct device *dev, u64 mask) | ||
628 | { | ||
629 | /* Copied from i386. Doesn't make much sense, because it will | ||
630 | only work for pci_alloc_coherent. | ||
631 | The caller just has to use GFP_DMA in this case. */ | ||
632 | if (mask < 0x00ffffff) | ||
633 | return 0; | ||
634 | |||
635 | /* Tell the device to use SAC when IOMMU force is on. | ||
636 | This allows the driver to use cheaper accesses in some cases. | ||
637 | |||
638 | Problem with this is that if we overflow the IOMMU area | ||
639 | and return DAC as fallback address the device may not handle it correctly. | ||
640 | |||
641 | As a special case some controllers have a 39bit address mode | ||
642 | that is as efficient as 32bit (aic79xx). Don't force SAC for these. | ||
643 | Assume all masks <= 40 bits are of this type. Normally this doesn't | ||
644 | make any difference, but gives more gentle handling of IOMMU overflow. */ | ||
645 | if (iommu_sac_force && (mask >= 0xffffffffffULL)) { | ||
646 | printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask); | ||
647 | return 0; | ||
648 | } | ||
649 | |||
650 | return 1; | ||
651 | } | ||
652 | |||
/* Report the boot CPU's x86_clflush_size as the DMA cache alignment. */
int dma_get_cache_alignment(void)
{
	return boot_cpu_data.x86_clflush_size;
}
657 | |||
658 | EXPORT_SYMBOL(dma_unmap_sg); | ||
659 | EXPORT_SYMBOL(dma_map_sg); | ||
660 | EXPORT_SYMBOL(dma_map_single); | ||
661 | EXPORT_SYMBOL(dma_unmap_single); | ||
662 | EXPORT_SYMBOL(dma_supported); | ||
663 | EXPORT_SYMBOL(no_iommu); | ||
664 | EXPORT_SYMBOL(force_iommu); | ||
665 | EXPORT_SYMBOL(bad_dma_address); | ||
666 | EXPORT_SYMBOL(iommu_bio_merge); | ||
667 | EXPORT_SYMBOL(iommu_sac_force); | ||
668 | EXPORT_SYMBOL(dma_get_cache_alignment); | ||
669 | EXPORT_SYMBOL(dma_alloc_coherent); | ||
670 | EXPORT_SYMBOL(dma_free_coherent); | ||
671 | |||
672 | static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) | ||
673 | { | ||
674 | unsigned long a; | ||
675 | if (!iommu_size) { | ||
676 | iommu_size = aper_size; | ||
677 | if (!no_agp) | ||
678 | iommu_size /= 2; | ||
679 | } | ||
680 | |||
681 | a = aper + iommu_size; | ||
682 | iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; | ||
683 | |||
684 | if (iommu_size < 64*1024*1024) | ||
685 | printk(KERN_WARNING | ||
686 | "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); | ||
687 | |||
688 | return iommu_size; | ||
689 | } | ||
690 | |||
691 | static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) | ||
692 | { | ||
693 | unsigned aper_size = 0, aper_base_32; | ||
694 | u64 aper_base; | ||
695 | unsigned aper_order; | ||
696 | |||
697 | pci_read_config_dword(dev, 0x94, &aper_base_32); | ||
698 | pci_read_config_dword(dev, 0x90, &aper_order); | ||
699 | aper_order = (aper_order >> 1) & 7; | ||
700 | |||
701 | aper_base = aper_base_32 & 0x7fff; | ||
702 | aper_base <<= 25; | ||
703 | |||
704 | aper_size = (32 * 1024 * 1024) << aper_order; | ||
705 | if (aper_base + aper_size >= 0xffffffff || !aper_size) | ||
706 | aper_base = 0; | ||
707 | |||
708 | *size = aper_size; | ||
709 | return aper_base; | ||
710 | } | ||
711 | |||
712 | /* | ||
713 | * Private Northbridge GATT initialization in case we cannot use the | ||
714 | * AGP driver for some reason. | ||
715 | */ | ||
716 | static __init int init_k8_gatt(struct agp_kern_info *info) | ||
717 | { | ||
718 | struct pci_dev *dev; | ||
719 | void *gatt; | ||
720 | unsigned aper_base, new_aper_base; | ||
721 | unsigned aper_size, gatt_size, new_aper_size; | ||
722 | |||
723 | printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); | ||
724 | aper_size = aper_base = info->aper_size = 0; | ||
725 | for_all_nb(dev) { | ||
726 | new_aper_base = read_aperture(dev, &new_aper_size); | ||
727 | if (!new_aper_base) | ||
728 | goto nommu; | ||
729 | |||
730 | if (!aper_base) { | ||
731 | aper_size = new_aper_size; | ||
732 | aper_base = new_aper_base; | ||
733 | } | ||
734 | if (aper_size != new_aper_size || aper_base != new_aper_base) | ||
735 | goto nommu; | ||
736 | } | ||
737 | if (!aper_base) | ||
738 | goto nommu; | ||
739 | info->aper_base = aper_base; | ||
740 | info->aper_size = aper_size>>20; | ||
741 | |||
742 | gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); | ||
743 | gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); | ||
744 | if (!gatt) | ||
745 | panic("Cannot allocate GATT table"); | ||
746 | memset(gatt, 0, gatt_size); | ||
747 | agp_gatt_table = gatt; | ||
748 | |||
749 | for_all_nb(dev) { | ||
750 | u32 ctl; | ||
751 | u32 gatt_reg; | ||
752 | |||
753 | gatt_reg = __pa(gatt) >> 12; | ||
754 | gatt_reg <<= 4; | ||
755 | pci_write_config_dword(dev, 0x98, gatt_reg); | ||
756 | pci_read_config_dword(dev, 0x90, &ctl); | ||
757 | |||
758 | ctl |= 1; | ||
759 | ctl &= ~((1<<4) | (1<<5)); | ||
760 | |||
761 | pci_write_config_dword(dev, 0x90, ctl); | ||
762 | } | ||
763 | flush_gart(NULL); | ||
764 | |||
765 | printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); | ||
766 | return 0; | ||
767 | |||
768 | nommu: | ||
769 | /* Should not happen anymore */ | ||
770 | printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" | ||
771 | KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction."); | ||
772 | return -1; | ||
773 | } | ||
774 | |||
775 | extern int agp_amd64_init(void); | ||
776 | |||
/*
 * Set up the GART as an IOMMU.
 *
 * Obtains the aperture description (from the AGP driver or, with AGP
 * disabled, from init_k8_gatt()), sizes the remapping area, allocates the
 * allocation bitmap (and optionally the leak table), reserves the
 * emergency pages, points all remaining GART entries at a scratch page
 * and records the northbridges used for flushing.
 *
 * Returns 0 when the IOMMU is in use, -1 when swiotlb is used instead or
 * the IOMMU was disabled.
 */
static int __init pci_iommu_init(void)
{
	struct agp_kern_info info;
	unsigned long aper_size;
	unsigned long iommu_start;
	struct pci_dev *dev;
	unsigned long scratch;
	long i;

#ifndef CONFIG_AGP_AMD64
	no_agp = 1;
#else
	/* Makefile puts PCI initialization via subsys_initcall first. */
	/* Add other K8 AGP bridge drivers here */
	no_agp = no_agp ||
		(agp_amd64_init() < 0) ||
		(agp_copy_info(agp_bridge, &info) < 0);
#endif

	if (swiotlb) {
		no_iommu = 1;
		printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
		return -1;
	}

	/*
	 * Give up on the IOMMU when it was disabled, when it is not forced
	 * and end_pfn lies below the 4GB page frame, when iommu_aperture is
	 * not set, or when AGP is off and the private GATT setup fails.
	 */
	if (no_iommu ||
	    (!force_iommu && end_pfn < 0xffffffff>>PAGE_SHIFT) ||
	    !iommu_aperture ||
	    (no_agp && init_k8_gatt(&info) < 0)) {
		printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n");
		no_iommu = 1;
		return -1;
	}

	/* info.aper_size is in MB. */
	aper_size = info.aper_size * 1024 * 1024;
	iommu_size = check_iommu_size(info.aper_base, aper_size);
	iommu_pages = iommu_size >> PAGE_SHIFT;

	/* One bit per IOMMU page. */
	iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL,
						    get_order(iommu_pages/8));
	if (!iommu_gart_bitmap)
		panic("Cannot allocate iommu bitmap\n");
	memset(iommu_gart_bitmap, 0, iommu_pages/8);

#ifdef CONFIG_IOMMU_LEAK
	if (leak_trace) {
		iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL,
				  get_order(iommu_pages*sizeof(void *)));
		if (iommu_leak_tab)
			memset(iommu_leak_tab, 0, iommu_pages * 8);
		else
			printk("PCI-DMA: Cannot allocate leak trace area\n");
	}
#endif

	/*
	 * Out of IOMMU space handling.
	 * Reserve some invalid pages at the beginning of the GART.
	 */
	set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES);

	agp_memory_reserved = iommu_size;
	printk(KERN_INFO
	       "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
	       iommu_size>>20);

	/* The IOMMU area occupies the top iommu_size bytes of the aperture. */
	iommu_start = aper_size - iommu_size;
	iommu_bus_base = info.aper_base + iommu_start;
	/* The start of the area doubles as the "bad address" marker. */
	bad_dma_address = iommu_bus_base;
	iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);

	/*
	 * Unmap the IOMMU part of the GART. The alias of the page is
	 * always mapped with cache enabled and there is no full cache
	 * coherency across the GART remapping. The unmapping avoids
	 * automatic prefetches from the CPU allocating cache lines in
	 * there. All CPU accesses are done via the direct mapping to
	 * the backing memory. The GART address is only used by PCI
	 * devices.
	 */
	clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size);

	/*
	 * Try to work around a bug (thanks to BenH)
	 * Set unmapped entries to a scratch page instead of 0.
	 * Any prefetches that hit unmapped entries won't get a bus abort
	 * then.
	 */
	scratch = get_zeroed_page(GFP_KERNEL);
	if (!scratch)
		panic("Cannot allocate iommu scratch page");
	gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
	/* The emergency pages are skipped and keep their current entries. */
	for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
		iommu_gatt_base[i] = gart_unmapped_entry;

	/* Record each northbridge and its flush word for flush_gart(). */
	for_all_nb(dev) {
		u32 flag;
		int cpu = PCI_SLOT(dev->devfn) - 24;
		if (cpu >= MAX_NB)
			continue;
		northbridges[cpu] = dev;
		pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */
		northbridge_flush_word[cpu] = flag;
	}

	flush_gart(NULL);

	return 0;
}
886 | |||
887 | /* Must execute after PCI subsystem */ | ||
888 | fs_initcall(pci_iommu_init); | ||
889 | |||
890 | /* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge] | ||
891 | [,forcesac][,fullflush][,nomerge][,biomerge] | ||
892 | size set size of iommu (in bytes) | ||
893 | noagp don't initialize the AGP driver and use full aperture. | ||
894 | off don't use the IOMMU | ||
895 | leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on) | ||
896 | memaper[=order] allocate an own aperture over RAM with size 32MB^order. | ||
897 | noforce don't force IOMMU usage. Default. | ||
898 | force Force IOMMU. | ||
899 | merge Do lazy merging. This may improve performance on some block devices. | ||
900 | Implies force (experimental) | ||
901 | biomerge Do merging at the BIO layer. This is more efficient than merge, | ||
902 | but should be only done with very big IOMMUs. Implies merge,force. | ||
903 | nomerge Don't do SG merging. | ||
904 | forcesac Force SAC mode for masks <40bits (experimental) | ||
905 | fullflush Flush IOMMU on each allocation (default) | ||
906 | nofullflush Don't use IOMMU fullflush | ||
907 | allowed overwrite iommu off workarounds for specific chipsets. | ||
908 | soft Use software bounce buffering (default for Intel machines) | ||
909 | noaperture Don't touch the aperture for AGP. | ||
910 | */ | ||
911 | __init int iommu_setup(char *p) | ||
912 | { | ||
913 | int arg; | ||
914 | |||
915 | while (*p) { | ||
916 | if (!strncmp(p,"noagp",5)) | ||
917 | no_agp = 1; | ||
918 | if (!strncmp(p,"off",3)) | ||
919 | no_iommu = 1; | ||
920 | if (!strncmp(p,"force",5)) { | ||
921 | force_iommu = 1; | ||
922 | iommu_aperture_allowed = 1; | ||
923 | } | ||
924 | if (!strncmp(p,"allowed",7)) | ||
925 | iommu_aperture_allowed = 1; | ||
926 | if (!strncmp(p,"noforce",7)) { | ||
927 | iommu_merge = 0; | ||
928 | force_iommu = 0; | ||
929 | } | ||
930 | if (!strncmp(p, "memaper", 7)) { | ||
931 | fallback_aper_force = 1; | ||
932 | p += 7; | ||
933 | if (*p == '=') { | ||
934 | ++p; | ||
935 | if (get_option(&p, &arg)) | ||
936 | fallback_aper_order = arg; | ||
937 | } | ||
938 | } | ||
939 | if (!strncmp(p, "biomerge",8)) { | ||
940 | iommu_bio_merge = 4096; | ||
941 | iommu_merge = 1; | ||
942 | force_iommu = 1; | ||
943 | } | ||
944 | if (!strncmp(p, "panic",5)) | ||
945 | panic_on_overflow = 1; | ||
946 | if (!strncmp(p, "nopanic",7)) | ||
947 | panic_on_overflow = 0; | ||
948 | if (!strncmp(p, "merge",5)) { | ||
949 | iommu_merge = 1; | ||
950 | force_iommu = 1; | ||
951 | } | ||
952 | if (!strncmp(p, "nomerge",7)) | ||
953 | iommu_merge = 0; | ||
954 | if (!strncmp(p, "forcesac",8)) | ||
955 | iommu_sac_force = 1; | ||
956 | if (!strncmp(p, "fullflush",8)) | ||
957 | iommu_fullflush = 1; | ||
958 | if (!strncmp(p, "nofullflush",11)) | ||
959 | iommu_fullflush = 0; | ||
960 | if (!strncmp(p, "soft",4)) | ||
961 | swiotlb = 1; | ||
962 | if (!strncmp(p, "noaperture",10)) | ||
963 | fix_aperture = 0; | ||
964 | #ifdef CONFIG_IOMMU_LEAK | ||
965 | if (!strncmp(p,"leak",4)) { | ||
966 | leak_trace = 1; | ||
967 | p += 4; | ||
968 | if (*p == '=') ++p; | ||
969 | if (isdigit(*p) && get_option(&p, &arg)) | ||
970 | iommu_leak_pages = arg; | ||
971 | } else | ||
972 | #endif | ||
973 | if (isdigit(*p) && get_option(&p, &arg)) | ||
974 | iommu_size = arg; | ||
975 | p += strcspn(p, ","); | ||
976 | if (*p == ',') | ||
977 | ++p; | ||
978 | } | ||
979 | return 1; | ||
980 | } | ||
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c new file mode 100644 index 000000000000..67d90b89af0b --- /dev/null +++ b/arch/x86_64/kernel/pci-nommu.c | |||
@@ -0,0 +1,94 @@ | |||
1 | /* Fallback functions when the main IOMMU code is not compiled in. This | ||
2 | code is roughly equivalent to i386. */ | ||
3 | #include <linux/mm.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/pci.h> | ||
6 | #include <linux/string.h> | ||
7 | #include <asm/proto.h> | ||
8 | #include <asm/processor.h> | ||
9 | |||
10 | int iommu_merge = 0; | ||
11 | EXPORT_SYMBOL(iommu_merge); | ||
12 | |||
13 | dma_addr_t bad_dma_address; | ||
14 | EXPORT_SYMBOL(bad_dma_address); | ||
15 | |||
16 | int iommu_bio_merge = 0; | ||
17 | EXPORT_SYMBOL(iommu_bio_merge); | ||
18 | |||
19 | int iommu_sac_force = 0; | ||
20 | EXPORT_SYMBOL(iommu_sac_force); | ||
21 | |||
22 | /* | ||
23 | * Dummy IO MMU functions | ||
24 | */ | ||
25 | |||
26 | void *dma_alloc_coherent(struct device *hwdev, size_t size, | ||
27 | dma_addr_t *dma_handle, unsigned gfp) | ||
28 | { | ||
29 | void *ret; | ||
30 | u64 mask; | ||
31 | int order = get_order(size); | ||
32 | |||
33 | if (hwdev) | ||
34 | mask = hwdev->coherent_dma_mask & *hwdev->dma_mask; | ||
35 | else | ||
36 | mask = 0xffffffff; | ||
37 | for (;;) { | ||
38 | ret = (void *)__get_free_pages(gfp, order); | ||
39 | if (ret == NULL) | ||
40 | return NULL; | ||
41 | *dma_handle = virt_to_bus(ret); | ||
42 | if ((*dma_handle & ~mask) == 0) | ||
43 | break; | ||
44 | free_pages((unsigned long)ret, order); | ||
45 | if (gfp & GFP_DMA) | ||
46 | return NULL; | ||
47 | gfp |= GFP_DMA; | ||
48 | } | ||
49 | |||
50 | memset(ret, 0, size); | ||
51 | return ret; | ||
52 | } | ||
53 | EXPORT_SYMBOL(dma_alloc_coherent); | ||
54 | |||
55 | void dma_free_coherent(struct device *hwdev, size_t size, | ||
56 | void *vaddr, dma_addr_t dma_handle) | ||
57 | { | ||
58 | free_pages((unsigned long)vaddr, get_order(size)); | ||
59 | } | ||
60 | EXPORT_SYMBOL(dma_free_coherent); | ||
61 | |||
62 | int dma_supported(struct device *hwdev, u64 mask) | ||
63 | { | ||
64 | /* | ||
65 | * we fall back to GFP_DMA when the mask isn't all 1s, | ||
66 | * so we can't guarantee allocations that must be | ||
67 | * within a tighter range than GFP_DMA.. | ||
68 | * RED-PEN this won't work for pci_map_single. Caller has to | ||
69 | * use GFP_DMA in the first place. | ||
70 | */ | ||
71 | if (mask < 0x00ffffff) | ||
72 | return 0; | ||
73 | |||
74 | return 1; | ||
75 | } | ||
76 | EXPORT_SYMBOL(dma_supported); | ||
77 | |||
78 | int dma_get_cache_alignment(void) | ||
79 | { | ||
80 | return boot_cpu_data.x86_clflush_size; | ||
81 | } | ||
82 | EXPORT_SYMBOL(dma_get_cache_alignment); | ||
83 | |||
84 | static int __init check_ram(void) | ||
85 | { | ||
86 | if (end_pfn >= 0xffffffff>>PAGE_SHIFT) { | ||
87 | printk( | ||
88 | KERN_ERR "WARNING more than 4GB of memory but IOMMU not compiled in.\n" | ||
89 | KERN_ERR "WARNING 32bit PCI may malfunction.\n"); | ||
90 | } | ||
91 | return 0; | ||
92 | } | ||
93 | __initcall(check_ram); | ||
94 | |||
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c new file mode 100644 index 000000000000..9922d2ba24a3 --- /dev/null +++ b/arch/x86_64/kernel/process.c | |||
@@ -0,0 +1,770 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/kernel/process.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * | ||
6 | * Pentium III FXSR, SSE support | ||
7 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
8 | * | ||
9 | * X86-64 port | ||
10 | * Andi Kleen. | ||
11 | * | ||
12 | * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ | ||
13 | */ | ||
14 | |||
15 | /* | ||
16 | * This file handles the architecture-dependent parts of process handling.. | ||
17 | */ | ||
18 | |||
19 | #include <stdarg.h> | ||
20 | |||
21 | #include <linux/errno.h> | ||
22 | #include <linux/sched.h> | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/mm.h> | ||
25 | #include <linux/elfcore.h> | ||
26 | #include <linux/smp.h> | ||
27 | #include <linux/slab.h> | ||
28 | #include <linux/user.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/a.out.h> | ||
31 | #include <linux/interrupt.h> | ||
32 | #include <linux/delay.h> | ||
33 | #include <linux/irq.h> | ||
34 | #include <linux/ptrace.h> | ||
35 | #include <linux/utsname.h> | ||
36 | #include <linux/random.h> | ||
37 | |||
38 | #include <asm/uaccess.h> | ||
39 | #include <asm/pgtable.h> | ||
40 | #include <asm/system.h> | ||
41 | #include <asm/io.h> | ||
42 | #include <asm/processor.h> | ||
43 | #include <asm/i387.h> | ||
44 | #include <asm/mmu_context.h> | ||
45 | #include <asm/pda.h> | ||
46 | #include <asm/prctl.h> | ||
47 | #include <asm/kdebug.h> | ||
48 | #include <asm/desc.h> | ||
49 | #include <asm/proto.h> | ||
50 | #include <asm/ia32.h> | ||
51 | |||
52 | asmlinkage extern void ret_from_fork(void); | ||
53 | |||
54 | unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; | ||
55 | |||
56 | static atomic_t hlt_counter = ATOMIC_INIT(0); | ||
57 | |||
58 | unsigned long boot_option_idle_override = 0; | ||
59 | EXPORT_SYMBOL(boot_option_idle_override); | ||
60 | |||
61 | /* | ||
62 | * Powermanagement idle function, if any.. | ||
63 | */ | ||
64 | void (*pm_idle)(void); | ||
65 | static DEFINE_PER_CPU(unsigned int, cpu_idle_state); | ||
66 | |||
67 | void disable_hlt(void) | ||
68 | { | ||
69 | atomic_inc(&hlt_counter); | ||
70 | } | ||
71 | |||
72 | EXPORT_SYMBOL(disable_hlt); | ||
73 | |||
74 | void enable_hlt(void) | ||
75 | { | ||
76 | atomic_dec(&hlt_counter); | ||
77 | } | ||
78 | |||
79 | EXPORT_SYMBOL(enable_hlt); | ||
80 | |||
81 | /* | ||
82 | * We use this if we don't have any better | ||
83 | * idle routine.. | ||
84 | */ | ||
85 | void default_idle(void) | ||
86 | { | ||
87 | if (!atomic_read(&hlt_counter)) { | ||
88 | local_irq_disable(); | ||
89 | if (!need_resched()) | ||
90 | safe_halt(); | ||
91 | else | ||
92 | local_irq_enable(); | ||
93 | } | ||
94 | } | ||
95 | |||
96 | /* | ||
97 | * On SMP it's slightly faster (but much more power-consuming!) | ||
98 | * to poll the ->need_resched flag instead of waiting for the | ||
99 | * cross-CPU IPI to arrive. Use this option with caution. | ||
100 | */ | ||
101 | static void poll_idle (void) | ||
102 | { | ||
103 | int oldval; | ||
104 | |||
105 | local_irq_enable(); | ||
106 | |||
107 | /* | ||
108 | * Deal with another CPU just having chosen a thread to | ||
109 | * run here: | ||
110 | */ | ||
111 | oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); | ||
112 | |||
113 | if (!oldval) { | ||
114 | set_thread_flag(TIF_POLLING_NRFLAG); | ||
115 | asm volatile( | ||
116 | "2:" | ||
117 | "testl %0,%1;" | ||
118 | "rep; nop;" | ||
119 | "je 2b;" | ||
120 | : : | ||
121 | "i" (_TIF_NEED_RESCHED), | ||
122 | "m" (current_thread_info()->flags)); | ||
123 | } else { | ||
124 | set_need_resched(); | ||
125 | } | ||
126 | } | ||
127 | |||
128 | void cpu_idle_wait(void) | ||
129 | { | ||
130 | unsigned int cpu, this_cpu = get_cpu(); | ||
131 | cpumask_t map; | ||
132 | |||
133 | set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); | ||
134 | put_cpu(); | ||
135 | |||
136 | cpus_clear(map); | ||
137 | for_each_online_cpu(cpu) { | ||
138 | per_cpu(cpu_idle_state, cpu) = 1; | ||
139 | cpu_set(cpu, map); | ||
140 | } | ||
141 | |||
142 | __get_cpu_var(cpu_idle_state) = 0; | ||
143 | |||
144 | wmb(); | ||
145 | do { | ||
146 | ssleep(1); | ||
147 | for_each_online_cpu(cpu) { | ||
148 | if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) | ||
149 | cpu_clear(cpu, map); | ||
150 | } | ||
151 | cpus_and(map, map, cpu_online_map); | ||
152 | } while (!cpus_empty(map)); | ||
153 | } | ||
154 | EXPORT_SYMBOL_GPL(cpu_idle_wait); | ||
155 | |||
156 | /* | ||
157 | * The idle thread. There's no useful work to be | ||
158 | * done, so just try to conserve power and have a | ||
159 | * low exit latency (ie sit in a loop waiting for | ||
160 | * somebody to say that they'd like to reschedule) | ||
161 | */ | ||
162 | void cpu_idle (void) | ||
163 | { | ||
164 | /* endless idle loop with no priority at all */ | ||
165 | while (1) { | ||
166 | while (!need_resched()) { | ||
167 | void (*idle)(void); | ||
168 | |||
169 | if (__get_cpu_var(cpu_idle_state)) | ||
170 | __get_cpu_var(cpu_idle_state) = 0; | ||
171 | |||
172 | rmb(); | ||
173 | idle = pm_idle; | ||
174 | if (!idle) | ||
175 | idle = default_idle; | ||
176 | idle(); | ||
177 | } | ||
178 | |||
179 | schedule(); | ||
180 | } | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, | ||
185 | * which can obviate IPI to trigger checking of need_resched. | ||
186 | * We execute MONITOR against need_resched and enter optimized wait state | ||
187 | * through MWAIT. Whenever someone changes need_resched, we would be woken | ||
188 | * up from MWAIT (without an IPI). | ||
189 | */ | ||
190 | static void mwait_idle(void) | ||
191 | { | ||
192 | local_irq_enable(); | ||
193 | |||
194 | if (!need_resched()) { | ||
195 | set_thread_flag(TIF_POLLING_NRFLAG); | ||
196 | do { | ||
197 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | ||
198 | if (need_resched()) | ||
199 | break; | ||
200 | __mwait(0, 0); | ||
201 | } while (!need_resched()); | ||
202 | clear_thread_flag(TIF_POLLING_NRFLAG); | ||
203 | } | ||
204 | } | ||
205 | |||
206 | void __init select_idle_routine(const struct cpuinfo_x86 *c) | ||
207 | { | ||
208 | static int printed; | ||
209 | if (cpu_has(c, X86_FEATURE_MWAIT)) { | ||
210 | /* | ||
211 | * Skip, if setup has overridden idle. | ||
212 | * One CPU supports mwait => All CPUs supports mwait | ||
213 | */ | ||
214 | if (!pm_idle) { | ||
215 | if (!printed) { | ||
216 | printk("using mwait in idle threads.\n"); | ||
217 | printed = 1; | ||
218 | } | ||
219 | pm_idle = mwait_idle; | ||
220 | } | ||
221 | } | ||
222 | } | ||
223 | |||
224 | static int __init idle_setup (char *str) | ||
225 | { | ||
226 | if (!strncmp(str, "poll", 4)) { | ||
227 | printk("using polling idle threads.\n"); | ||
228 | pm_idle = poll_idle; | ||
229 | } | ||
230 | |||
231 | boot_option_idle_override = 1; | ||
232 | return 1; | ||
233 | } | ||
234 | |||
235 | __setup("idle=", idle_setup); | ||
236 | |||
237 | /* Prints also some state that isn't saved in the pt_regs */ | ||
238 | void __show_regs(struct pt_regs * regs) | ||
239 | { | ||
240 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; | ||
241 | unsigned int fsindex,gsindex; | ||
242 | unsigned int ds,cs,es; | ||
243 | |||
244 | printk("\n"); | ||
245 | print_modules(); | ||
246 | printk("Pid: %d, comm: %.20s %s %s\n", | ||
247 | current->pid, current->comm, print_tainted(), system_utsname.release); | ||
248 | printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); | ||
249 | printk_address(regs->rip); | ||
250 | printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); | ||
251 | printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", | ||
252 | regs->rax, regs->rbx, regs->rcx); | ||
253 | printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", | ||
254 | regs->rdx, regs->rsi, regs->rdi); | ||
255 | printk("RBP: %016lx R08: %016lx R09: %016lx\n", | ||
256 | regs->rbp, regs->r8, regs->r9); | ||
257 | printk("R10: %016lx R11: %016lx R12: %016lx\n", | ||
258 | regs->r10, regs->r11, regs->r12); | ||
259 | printk("R13: %016lx R14: %016lx R15: %016lx\n", | ||
260 | regs->r13, regs->r14, regs->r15); | ||
261 | |||
262 | asm("movl %%ds,%0" : "=r" (ds)); | ||
263 | asm("movl %%cs,%0" : "=r" (cs)); | ||
264 | asm("movl %%es,%0" : "=r" (es)); | ||
265 | asm("movl %%fs,%0" : "=r" (fsindex)); | ||
266 | asm("movl %%gs,%0" : "=r" (gsindex)); | ||
267 | |||
268 | rdmsrl(MSR_FS_BASE, fs); | ||
269 | rdmsrl(MSR_GS_BASE, gs); | ||
270 | rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); | ||
271 | |||
272 | asm("movq %%cr0, %0": "=r" (cr0)); | ||
273 | asm("movq %%cr2, %0": "=r" (cr2)); | ||
274 | asm("movq %%cr3, %0": "=r" (cr3)); | ||
275 | asm("movq %%cr4, %0": "=r" (cr4)); | ||
276 | |||
277 | printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", | ||
278 | fs,fsindex,gs,gsindex,shadowgs); | ||
279 | printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); | ||
280 | printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); | ||
281 | } | ||
282 | |||
283 | void show_regs(struct pt_regs *regs) | ||
284 | { | ||
285 | __show_regs(regs); | ||
286 | show_trace(®s->rsp); | ||
287 | } | ||
288 | |||
289 | /* | ||
290 | * Free current thread data structures etc.. | ||
291 | */ | ||
292 | void exit_thread(void) | ||
293 | { | ||
294 | struct task_struct *me = current; | ||
295 | struct thread_struct *t = &me->thread; | ||
296 | if (me->thread.io_bitmap_ptr) { | ||
297 | struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); | ||
298 | |||
299 | kfree(t->io_bitmap_ptr); | ||
300 | t->io_bitmap_ptr = NULL; | ||
301 | /* | ||
302 | * Careful, clear this in the TSS too: | ||
303 | */ | ||
304 | memset(tss->io_bitmap, 0xff, t->io_bitmap_max); | ||
305 | t->io_bitmap_max = 0; | ||
306 | put_cpu(); | ||
307 | } | ||
308 | } | ||
309 | |||
310 | void flush_thread(void) | ||
311 | { | ||
312 | struct task_struct *tsk = current; | ||
313 | struct thread_info *t = current_thread_info(); | ||
314 | |||
315 | if (t->flags & _TIF_ABI_PENDING) | ||
316 | t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); | ||
317 | |||
318 | tsk->thread.debugreg0 = 0; | ||
319 | tsk->thread.debugreg1 = 0; | ||
320 | tsk->thread.debugreg2 = 0; | ||
321 | tsk->thread.debugreg3 = 0; | ||
322 | tsk->thread.debugreg6 = 0; | ||
323 | tsk->thread.debugreg7 = 0; | ||
324 | memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); | ||
325 | /* | ||
326 | * Forget coprocessor state.. | ||
327 | */ | ||
328 | clear_fpu(tsk); | ||
329 | clear_used_math(); | ||
330 | } | ||
331 | |||
332 | void release_thread(struct task_struct *dead_task) | ||
333 | { | ||
334 | if (dead_task->mm) { | ||
335 | if (dead_task->mm->context.size) { | ||
336 | printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", | ||
337 | dead_task->comm, | ||
338 | dead_task->mm->context.ldt, | ||
339 | dead_task->mm->context.size); | ||
340 | BUG(); | ||
341 | } | ||
342 | } | ||
343 | } | ||
344 | |||
345 | static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) | ||
346 | { | ||
347 | struct user_desc ud = { | ||
348 | .base_addr = addr, | ||
349 | .limit = 0xfffff, | ||
350 | .seg_32bit = 1, | ||
351 | .limit_in_pages = 1, | ||
352 | .useable = 1, | ||
353 | }; | ||
354 | struct n_desc_struct *desc = (void *)t->thread.tls_array; | ||
355 | desc += tls; | ||
356 | desc->a = LDT_entry_a(&ud); | ||
357 | desc->b = LDT_entry_b(&ud); | ||
358 | } | ||
359 | |||
360 | static inline u32 read_32bit_tls(struct task_struct *t, int tls) | ||
361 | { | ||
362 | struct desc_struct *desc = (void *)t->thread.tls_array; | ||
363 | desc += tls; | ||
364 | return desc->base0 | | ||
365 | (((u32)desc->base1) << 16) | | ||
366 | (((u32)desc->base2) << 24); | ||
367 | } | ||
368 | |||
/*
 * Called before a new thread is allocated and the current task is
 * copied into it; runs unlazy_fpu() on the task.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
377 | |||
378 | int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, | ||
379 | unsigned long unused, | ||
380 | struct task_struct * p, struct pt_regs * regs) | ||
381 | { | ||
382 | int err; | ||
383 | struct pt_regs * childregs; | ||
384 | struct task_struct *me = current; | ||
385 | |||
386 | childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; | ||
387 | |||
388 | *childregs = *regs; | ||
389 | |||
390 | childregs->rax = 0; | ||
391 | childregs->rsp = rsp; | ||
392 | if (rsp == ~0UL) { | ||
393 | childregs->rsp = (unsigned long)childregs; | ||
394 | } | ||
395 | |||
396 | p->thread.rsp = (unsigned long) childregs; | ||
397 | p->thread.rsp0 = (unsigned long) (childregs+1); | ||
398 | p->thread.userrsp = me->thread.userrsp; | ||
399 | |||
400 | set_ti_thread_flag(p->thread_info, TIF_FORK); | ||
401 | |||
402 | p->thread.fs = me->thread.fs; | ||
403 | p->thread.gs = me->thread.gs; | ||
404 | |||
405 | asm("movl %%gs,%0" : "=m" (p->thread.gsindex)); | ||
406 | asm("movl %%fs,%0" : "=m" (p->thread.fsindex)); | ||
407 | asm("movl %%es,%0" : "=m" (p->thread.es)); | ||
408 | asm("movl %%ds,%0" : "=m" (p->thread.ds)); | ||
409 | |||
410 | if (unlikely(me->thread.io_bitmap_ptr != NULL)) { | ||
411 | p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | ||
412 | if (!p->thread.io_bitmap_ptr) { | ||
413 | p->thread.io_bitmap_max = 0; | ||
414 | return -ENOMEM; | ||
415 | } | ||
416 | memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); | ||
417 | } | ||
418 | |||
419 | /* | ||
420 | * Set a new TLS for the child thread? | ||
421 | */ | ||
422 | if (clone_flags & CLONE_SETTLS) { | ||
423 | #ifdef CONFIG_IA32_EMULATION | ||
424 | if (test_thread_flag(TIF_IA32)) | ||
425 | err = ia32_child_tls(p, childregs); | ||
426 | else | ||
427 | #endif | ||
428 | err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); | ||
429 | if (err) | ||
430 | goto out; | ||
431 | } | ||
432 | err = 0; | ||
433 | out: | ||
434 | if (err && p->thread.io_bitmap_ptr) { | ||
435 | kfree(p->thread.io_bitmap_ptr); | ||
436 | p->thread.io_bitmap_max = 0; | ||
437 | } | ||
438 | return err; | ||
439 | } | ||
440 | |||
/*
 * This special macro can be used to load a debugging register:
 * loaddebug(t, N) passes t->debugregN and N to set_debug().
 * The thread argument is parenthesized so that any pointer expression
 * can be passed safely.
 */
#define loaddebug(thread,r) set_debug((thread)->debugreg ## r, r)
445 | |||
446 | /* | ||
447 | * switch_to(x,y) should switch tasks from x to y. | ||
448 | * | ||
449 | * This could still be optimized: | ||
450 | * - fold all the options into a flag word and test it with a single test. | ||
451 | * - could test fs/gs bitsliced | ||
452 | */ | ||
453 | struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p) | ||
454 | { | ||
455 | struct thread_struct *prev = &prev_p->thread, | ||
456 | *next = &next_p->thread; | ||
457 | int cpu = smp_processor_id(); | ||
458 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | ||
459 | |||
460 | unlazy_fpu(prev_p); | ||
461 | |||
462 | /* | ||
463 | * Reload esp0, LDT and the page table pointer: | ||
464 | */ | ||
465 | tss->rsp0 = next->rsp0; | ||
466 | |||
467 | /* | ||
468 | * Switch DS and ES. | ||
469 | * This won't pick up thread selector changes, but I guess that is ok. | ||
470 | */ | ||
471 | asm volatile("movl %%es,%0" : "=m" (prev->es)); | ||
472 | if (unlikely(next->es | prev->es)) | ||
473 | loadsegment(es, next->es); | ||
474 | |||
475 | asm volatile ("movl %%ds,%0" : "=m" (prev->ds)); | ||
476 | if (unlikely(next->ds | prev->ds)) | ||
477 | loadsegment(ds, next->ds); | ||
478 | |||
479 | load_TLS(next, cpu); | ||
480 | |||
481 | /* | ||
482 | * Switch FS and GS. | ||
483 | */ | ||
484 | { | ||
485 | unsigned fsindex; | ||
486 | asm volatile("movl %%fs,%0" : "=r" (fsindex)); | ||
487 | /* segment register != 0 always requires a reload. | ||
488 | also reload when it has changed. | ||
489 | when prev process used 64bit base always reload | ||
490 | to avoid an information leak. */ | ||
491 | if (unlikely(fsindex | next->fsindex | prev->fs)) { | ||
492 | loadsegment(fs, next->fsindex); | ||
493 | /* check if the user used a selector != 0 | ||
494 | * if yes clear 64bit base, since overloaded base | ||
495 | * is always mapped to the Null selector | ||
496 | */ | ||
497 | if (fsindex) | ||
498 | prev->fs = 0; | ||
499 | } | ||
500 | /* when next process has a 64bit base use it */ | ||
501 | if (next->fs) | ||
502 | wrmsrl(MSR_FS_BASE, next->fs); | ||
503 | prev->fsindex = fsindex; | ||
504 | } | ||
505 | { | ||
506 | unsigned gsindex; | ||
507 | asm volatile("movl %%gs,%0" : "=r" (gsindex)); | ||
508 | if (unlikely(gsindex | next->gsindex | prev->gs)) { | ||
509 | load_gs_index(next->gsindex); | ||
510 | if (gsindex) | ||
511 | prev->gs = 0; | ||
512 | } | ||
513 | if (next->gs) | ||
514 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); | ||
515 | prev->gsindex = gsindex; | ||
516 | } | ||
517 | |||
518 | /* | ||
519 | * Switch the PDA context. | ||
520 | */ | ||
521 | prev->userrsp = read_pda(oldrsp); | ||
522 | write_pda(oldrsp, next->userrsp); | ||
523 | write_pda(pcurrent, next_p); | ||
524 | write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET); | ||
525 | |||
526 | /* | ||
527 | * Now maybe reload the debug registers | ||
528 | */ | ||
529 | if (unlikely(next->debugreg7)) { | ||
530 | loaddebug(next, 0); | ||
531 | loaddebug(next, 1); | ||
532 | loaddebug(next, 2); | ||
533 | loaddebug(next, 3); | ||
534 | /* no 4 and 5 */ | ||
535 | loaddebug(next, 6); | ||
536 | loaddebug(next, 7); | ||
537 | } | ||
538 | |||
539 | |||
540 | /* | ||
541 | * Handle the IO bitmap | ||
542 | */ | ||
543 | if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { | ||
544 | if (next->io_bitmap_ptr) | ||
545 | /* | ||
546 | * Copy the relevant range of the IO bitmap. | ||
547 | * Normally this is 128 bytes or less: | ||
548 | */ | ||
549 | memcpy(tss->io_bitmap, next->io_bitmap_ptr, | ||
550 | max(prev->io_bitmap_max, next->io_bitmap_max)); | ||
551 | else { | ||
552 | /* | ||
553 | * Clear any possible leftover bits: | ||
554 | */ | ||
555 | memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); | ||
556 | } | ||
557 | } | ||
558 | |||
559 | return prev_p; | ||
560 | } | ||
561 | |||
562 | /* | ||
563 | * sys_execve() executes a new program. | ||
564 | */ | ||
565 | asmlinkage | ||
566 | long sys_execve(char __user *name, char __user * __user *argv, | ||
567 | char __user * __user *envp, struct pt_regs regs) | ||
568 | { | ||
569 | long error; | ||
570 | char * filename; | ||
571 | |||
572 | filename = getname(name); | ||
573 | error = PTR_ERR(filename); | ||
574 | if (IS_ERR(filename)) | ||
575 | return error; | ||
576 | error = do_execve(filename, argv, envp, ®s); | ||
577 | if (error == 0) { | ||
578 | task_lock(current); | ||
579 | current->ptrace &= ~PT_DTRACE; | ||
580 | task_unlock(current); | ||
581 | } | ||
582 | putname(filename); | ||
583 | return error; | ||
584 | } | ||
585 | |||
586 | void set_personality_64bit(void) | ||
587 | { | ||
588 | /* inherit personality from parent */ | ||
589 | |||
590 | /* Make sure to be in 64bit mode */ | ||
591 | clear_thread_flag(TIF_IA32); | ||
592 | |||
593 | /* TBD: overwrites user setup. Should have two bits. | ||
594 | But 64bit processes have always behaved this way, | ||
595 | so it's not too bad. The main problem is just that | ||
596 | 32bit childs are affected again. */ | ||
597 | current->personality &= ~READ_IMPLIES_EXEC; | ||
598 | } | ||
599 | |||
600 | asmlinkage long sys_fork(struct pt_regs *regs) | ||
601 | { | ||
602 | return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); | ||
603 | } | ||
604 | |||
605 | asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) | ||
606 | { | ||
607 | if (!newsp) | ||
608 | newsp = regs->rsp; | ||
609 | return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); | ||
610 | } | ||
611 | |||
612 | /* | ||
613 | * This is trivial, and on the face of it looks like it | ||
614 | * could equally well be done in user mode. | ||
615 | * | ||
616 | * Not so, for quite unobvious reasons - register pressure. | ||
617 | * In user mode vfork() cannot have a stack frame, and if | ||
618 | * done by calling the "clone()" system call directly, you | ||
619 | * do not have enough call-clobbered registers to hold all | ||
620 | * the information you need. | ||
621 | */ | ||
622 | asmlinkage long sys_vfork(struct pt_regs *regs) | ||
623 | { | ||
624 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, | ||
625 | NULL, NULL); | ||
626 | } | ||
627 | |||
628 | unsigned long get_wchan(struct task_struct *p) | ||
629 | { | ||
630 | unsigned long stack; | ||
631 | u64 fp,rip; | ||
632 | int count = 0; | ||
633 | |||
634 | if (!p || p == current || p->state==TASK_RUNNING) | ||
635 | return 0; | ||
636 | stack = (unsigned long)p->thread_info; | ||
637 | if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) | ||
638 | return 0; | ||
639 | fp = *(u64 *)(p->thread.rsp); | ||
640 | do { | ||
641 | if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE) | ||
642 | return 0; | ||
643 | rip = *(u64 *)(fp+8); | ||
644 | if (!in_sched_functions(rip)) | ||
645 | return rip; | ||
646 | fp = *(u64 *)fp; | ||
647 | } while (count++ < 16); | ||
648 | return 0; | ||
649 | } | ||
650 | |||
651 | long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) | ||
652 | { | ||
653 | int ret = 0; | ||
654 | int doit = task == current; | ||
655 | int cpu; | ||
656 | |||
657 | switch (code) { | ||
658 | case ARCH_SET_GS: | ||
659 | if (addr >= TASK_SIZE) | ||
660 | return -EPERM; | ||
661 | cpu = get_cpu(); | ||
662 | /* handle small bases via the GDT because that's faster to | ||
663 | switch. */ | ||
664 | if (addr <= 0xffffffff) { | ||
665 | set_32bit_tls(task, GS_TLS, addr); | ||
666 | if (doit) { | ||
667 | load_TLS(&task->thread, cpu); | ||
668 | load_gs_index(GS_TLS_SEL); | ||
669 | } | ||
670 | task->thread.gsindex = GS_TLS_SEL; | ||
671 | task->thread.gs = 0; | ||
672 | } else { | ||
673 | task->thread.gsindex = 0; | ||
674 | task->thread.gs = addr; | ||
675 | if (doit) { | ||
676 | load_gs_index(0); | ||
677 | ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); | ||
678 | } | ||
679 | } | ||
680 | put_cpu(); | ||
681 | break; | ||
682 | case ARCH_SET_FS: | ||
683 | /* Not strictly needed for fs, but do it for symmetry | ||
684 | with gs */ | ||
685 | if (addr >= TASK_SIZE) | ||
686 | return -EPERM; | ||
687 | cpu = get_cpu(); | ||
688 | /* handle small bases via the GDT because that's faster to | ||
689 | switch. */ | ||
690 | if (addr <= 0xffffffff) { | ||
691 | set_32bit_tls(task, FS_TLS, addr); | ||
692 | if (doit) { | ||
693 | load_TLS(&task->thread, cpu); | ||
694 | asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL)); | ||
695 | } | ||
696 | task->thread.fsindex = FS_TLS_SEL; | ||
697 | task->thread.fs = 0; | ||
698 | } else { | ||
699 | task->thread.fsindex = 0; | ||
700 | task->thread.fs = addr; | ||
701 | if (doit) { | ||
702 | /* set the selector to 0 to not confuse | ||
703 | __switch_to */ | ||
704 | asm volatile("movl %0,%%fs" :: "r" (0)); | ||
705 | ret = checking_wrmsrl(MSR_FS_BASE, addr); | ||
706 | } | ||
707 | } | ||
708 | put_cpu(); | ||
709 | break; | ||
710 | case ARCH_GET_FS: { | ||
711 | unsigned long base; | ||
712 | if (task->thread.fsindex == FS_TLS_SEL) | ||
713 | base = read_32bit_tls(task, FS_TLS); | ||
714 | else if (doit) { | ||
715 | rdmsrl(MSR_FS_BASE, base); | ||
716 | } else | ||
717 | base = task->thread.fs; | ||
718 | ret = put_user(base, (unsigned long __user *)addr); | ||
719 | break; | ||
720 | } | ||
721 | case ARCH_GET_GS: { | ||
722 | unsigned long base; | ||
723 | if (task->thread.gsindex == GS_TLS_SEL) | ||
724 | base = read_32bit_tls(task, GS_TLS); | ||
725 | else if (doit) { | ||
726 | rdmsrl(MSR_KERNEL_GS_BASE, base); | ||
727 | } else | ||
728 | base = task->thread.gs; | ||
729 | ret = put_user(base, (unsigned long __user *)addr); | ||
730 | break; | ||
731 | } | ||
732 | |||
733 | default: | ||
734 | ret = -EINVAL; | ||
735 | break; | ||
736 | } | ||
737 | |||
738 | return ret; | ||
739 | } | ||
740 | |||
741 | long sys_arch_prctl(int code, unsigned long addr) | ||
742 | { | ||
743 | return do_arch_prctl(current, code, addr); | ||
744 | } | ||
745 | |||
746 | /* | ||
747 | * Capture the user space registers if the task is not running (in user space) | ||
748 | */ | ||
749 | int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) | ||
750 | { | ||
751 | struct pt_regs *pp, ptregs; | ||
752 | |||
753 | pp = (struct pt_regs *)(tsk->thread.rsp0); | ||
754 | --pp; | ||
755 | |||
756 | ptregs = *pp; | ||
757 | ptregs.cs &= 0xffff; | ||
758 | ptregs.ss &= 0xffff; | ||
759 | |||
760 | elf_core_copy_regs(regs, &ptregs); | ||
761 | |||
762 | return 1; | ||
763 | } | ||
764 | |||
765 | unsigned long arch_align_stack(unsigned long sp) | ||
766 | { | ||
767 | if (randomize_va_space) | ||
768 | sp -= get_random_int() % 8192; | ||
769 | return sp & ~0xf; | ||
770 | } | ||
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c new file mode 100644 index 000000000000..0b7b101debdf --- /dev/null +++ b/arch/x86_64/kernel/ptrace.c | |||
@@ -0,0 +1,547 @@ | |||
1 | /* ptrace.c */ | ||
2 | /* By Ross Biro 1/23/92 */ | ||
3 | /* | ||
4 | * Pentium III FXSR, SSE support | ||
5 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
6 | * | ||
7 | * x86-64 port 2000-2002 Andi Kleen | ||
8 | */ | ||
9 | |||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/smp.h> | ||
14 | #include <linux/smp_lock.h> | ||
15 | #include <linux/errno.h> | ||
16 | #include <linux/ptrace.h> | ||
17 | #include <linux/user.h> | ||
18 | #include <linux/security.h> | ||
19 | #include <linux/audit.h> | ||
20 | #include <linux/seccomp.h> | ||
21 | |||
22 | #include <asm/uaccess.h> | ||
23 | #include <asm/pgtable.h> | ||
24 | #include <asm/system.h> | ||
25 | #include <asm/processor.h> | ||
26 | #include <asm/i387.h> | ||
27 | #include <asm/debugreg.h> | ||
28 | #include <asm/ldt.h> | ||
29 | #include <asm/desc.h> | ||
30 | #include <asm/proto.h> | ||
31 | #include <asm/ia32.h> | ||
32 | |||
33 | /* | ||
34 | * does not yet catch signals sent when the child dies. | ||
35 | * in exit.c or in signal.c. | ||
36 | */ | ||
37 | |||
38 | /* determines which flags the user has access to. */ | ||
39 | /* 1 = access 0 = no access */ | ||
40 | #define FLAG_MASK 0x44dd5UL | ||
41 | |||
42 | /* sets the trap flag. */ | ||
43 | #define TRAP_FLAG 0x100UL | ||
44 | |||
45 | /* | ||
46 | * eflags and offset of eflags on child stack.. | ||
47 | */ | ||
48 | #define EFLAGS offsetof(struct pt_regs, eflags) | ||
49 | #define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs))) | ||
50 | |||
51 | /* | ||
52 | * this routine will get a word off of the processes privileged stack. | ||
53 | * the offset is how far from the base addr as stored in the TSS. | ||
54 | * this routine assumes that all the privileged stacks are in our | ||
55 | * data space. | ||
56 | */ | ||
57 | static inline unsigned long get_stack_long(struct task_struct *task, int offset) | ||
58 | { | ||
59 | unsigned char *stack; | ||
60 | |||
61 | stack = (unsigned char *)task->thread.rsp0; | ||
62 | stack += offset; | ||
63 | return (*((unsigned long *)stack)); | ||
64 | } | ||
65 | |||
66 | /* | ||
67 | * this routine will put a word on the processes privileged stack. | ||
68 | * the offset is how far from the base addr as stored in the TSS. | ||
69 | * this routine assumes that all the privileged stacks are in our | ||
70 | * data space. | ||
71 | */ | ||
72 | static inline long put_stack_long(struct task_struct *task, int offset, | ||
73 | unsigned long data) | ||
74 | { | ||
75 | unsigned char * stack; | ||
76 | |||
77 | stack = (unsigned char *) task->thread.rsp0; | ||
78 | stack += offset; | ||
79 | *(unsigned long *) stack = data; | ||
80 | return 0; | ||
81 | } | ||
82 | |||
83 | /* | ||
84 | * Called by kernel/ptrace.c when detaching.. | ||
85 | * | ||
86 | * Make sure the single step bit is not set. | ||
87 | */ | ||
88 | void ptrace_disable(struct task_struct *child) | ||
89 | { | ||
90 | long tmp; | ||
91 | |||
92 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
93 | tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG; | ||
94 | put_stack_long(child, EFL_OFFSET, tmp); | ||
95 | } | ||
96 | |||
97 | static int putreg(struct task_struct *child, | ||
98 | unsigned long regno, unsigned long value) | ||
99 | { | ||
100 | unsigned long tmp; | ||
101 | |||
102 | /* Some code in the 64bit emulation may not be 64bit clean. | ||
103 | Don't take any chances. */ | ||
104 | if (test_tsk_thread_flag(child, TIF_IA32)) | ||
105 | value &= 0xffffffff; | ||
106 | switch (regno) { | ||
107 | case offsetof(struct user_regs_struct,fs): | ||
108 | if (value && (value & 3) != 3) | ||
109 | return -EIO; | ||
110 | child->thread.fsindex = value & 0xffff; | ||
111 | return 0; | ||
112 | case offsetof(struct user_regs_struct,gs): | ||
113 | if (value && (value & 3) != 3) | ||
114 | return -EIO; | ||
115 | child->thread.gsindex = value & 0xffff; | ||
116 | return 0; | ||
117 | case offsetof(struct user_regs_struct,ds): | ||
118 | if (value && (value & 3) != 3) | ||
119 | return -EIO; | ||
120 | child->thread.ds = value & 0xffff; | ||
121 | return 0; | ||
122 | case offsetof(struct user_regs_struct,es): | ||
123 | if (value && (value & 3) != 3) | ||
124 | return -EIO; | ||
125 | child->thread.es = value & 0xffff; | ||
126 | return 0; | ||
127 | case offsetof(struct user_regs_struct,ss): | ||
128 | if ((value & 3) != 3) | ||
129 | return -EIO; | ||
130 | value &= 0xffff; | ||
131 | return 0; | ||
132 | case offsetof(struct user_regs_struct,fs_base): | ||
133 | if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) | ||
134 | return -EIO; | ||
135 | child->thread.fs = value; | ||
136 | return 0; | ||
137 | case offsetof(struct user_regs_struct,gs_base): | ||
138 | if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) | ||
139 | return -EIO; | ||
140 | child->thread.gs = value; | ||
141 | return 0; | ||
142 | case offsetof(struct user_regs_struct, eflags): | ||
143 | value &= FLAG_MASK; | ||
144 | tmp = get_stack_long(child, EFL_OFFSET); | ||
145 | tmp &= ~FLAG_MASK; | ||
146 | value |= tmp; | ||
147 | break; | ||
148 | case offsetof(struct user_regs_struct,cs): | ||
149 | if ((value & 3) != 3) | ||
150 | return -EIO; | ||
151 | value &= 0xffff; | ||
152 | break; | ||
153 | } | ||
154 | put_stack_long(child, regno - sizeof(struct pt_regs), value); | ||
155 | return 0; | ||
156 | } | ||
157 | |||
158 | static unsigned long getreg(struct task_struct *child, unsigned long regno) | ||
159 | { | ||
160 | unsigned long val; | ||
161 | switch (regno) { | ||
162 | case offsetof(struct user_regs_struct, fs): | ||
163 | return child->thread.fsindex; | ||
164 | case offsetof(struct user_regs_struct, gs): | ||
165 | return child->thread.gsindex; | ||
166 | case offsetof(struct user_regs_struct, ds): | ||
167 | return child->thread.ds; | ||
168 | case offsetof(struct user_regs_struct, es): | ||
169 | return child->thread.es; | ||
170 | case offsetof(struct user_regs_struct, fs_base): | ||
171 | return child->thread.fs; | ||
172 | case offsetof(struct user_regs_struct, gs_base): | ||
173 | return child->thread.gs; | ||
174 | default: | ||
175 | regno = regno - sizeof(struct pt_regs); | ||
176 | val = get_stack_long(child, regno); | ||
177 | if (test_tsk_thread_flag(child, TIF_IA32)) | ||
178 | val &= 0xffffffff; | ||
179 | return val; | ||
180 | } | ||
181 | |||
182 | } | ||
183 | |||
184 | asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data) | ||
185 | { | ||
186 | struct task_struct *child; | ||
187 | long i, ret; | ||
188 | unsigned ui; | ||
189 | |||
190 | /* This lock_kernel fixes a subtle race with suid exec */ | ||
191 | lock_kernel(); | ||
192 | ret = -EPERM; | ||
193 | if (request == PTRACE_TRACEME) { | ||
194 | /* are we already being traced? */ | ||
195 | if (current->ptrace & PT_PTRACED) | ||
196 | goto out; | ||
197 | ret = security_ptrace(current->parent, current); | ||
198 | if (ret) | ||
199 | goto out; | ||
200 | /* set the ptrace bit in the process flags. */ | ||
201 | current->ptrace |= PT_PTRACED; | ||
202 | ret = 0; | ||
203 | goto out; | ||
204 | } | ||
205 | ret = -ESRCH; | ||
206 | read_lock(&tasklist_lock); | ||
207 | child = find_task_by_pid(pid); | ||
208 | if (child) | ||
209 | get_task_struct(child); | ||
210 | read_unlock(&tasklist_lock); | ||
211 | if (!child) | ||
212 | goto out; | ||
213 | |||
214 | ret = -EPERM; | ||
215 | if (pid == 1) /* you may not mess with init */ | ||
216 | goto out_tsk; | ||
217 | |||
218 | if (request == PTRACE_ATTACH) { | ||
219 | ret = ptrace_attach(child); | ||
220 | goto out_tsk; | ||
221 | } | ||
222 | ret = ptrace_check_attach(child, request == PTRACE_KILL); | ||
223 | if (ret < 0) | ||
224 | goto out_tsk; | ||
225 | |||
226 | switch (request) { | ||
227 | /* when I and D space are separate, these will need to be fixed. */ | ||
228 | case PTRACE_PEEKTEXT: /* read word at location addr. */ | ||
229 | case PTRACE_PEEKDATA: { | ||
230 | unsigned long tmp; | ||
231 | int copied; | ||
232 | |||
233 | copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); | ||
234 | ret = -EIO; | ||
235 | if (copied != sizeof(tmp)) | ||
236 | break; | ||
237 | ret = put_user(tmp,(unsigned long __user *) data); | ||
238 | break; | ||
239 | } | ||
240 | |||
241 | /* read the word at location addr in the USER area. */ | ||
242 | case PTRACE_PEEKUSR: { | ||
243 | unsigned long tmp; | ||
244 | |||
245 | ret = -EIO; | ||
246 | if ((addr & 7) || | ||
247 | addr > sizeof(struct user) - 7) | ||
248 | break; | ||
249 | |||
250 | switch (addr) { | ||
251 | case 0 ... sizeof(struct user_regs_struct): | ||
252 | tmp = getreg(child, addr); | ||
253 | break; | ||
254 | case offsetof(struct user, u_debugreg[0]): | ||
255 | tmp = child->thread.debugreg0; | ||
256 | break; | ||
257 | case offsetof(struct user, u_debugreg[1]): | ||
258 | tmp = child->thread.debugreg1; | ||
259 | break; | ||
260 | case offsetof(struct user, u_debugreg[2]): | ||
261 | tmp = child->thread.debugreg2; | ||
262 | break; | ||
263 | case offsetof(struct user, u_debugreg[3]): | ||
264 | tmp = child->thread.debugreg3; | ||
265 | break; | ||
266 | case offsetof(struct user, u_debugreg[6]): | ||
267 | tmp = child->thread.debugreg6; | ||
268 | break; | ||
269 | case offsetof(struct user, u_debugreg[7]): | ||
270 | tmp = child->thread.debugreg7; | ||
271 | break; | ||
272 | default: | ||
273 | tmp = 0; | ||
274 | break; | ||
275 | } | ||
276 | ret = put_user(tmp,(unsigned long __user *) data); | ||
277 | break; | ||
278 | } | ||
279 | |||
280 | /* when I and D space are separate, this will have to be fixed. */ | ||
281 | case PTRACE_POKETEXT: /* write the word at location addr. */ | ||
282 | case PTRACE_POKEDATA: | ||
283 | ret = 0; | ||
284 | if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data)) | ||
285 | break; | ||
286 | ret = -EIO; | ||
287 | break; | ||
288 | |||
289 | case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ | ||
290 | ret = -EIO; | ||
291 | if ((addr & 7) || | ||
292 | addr > sizeof(struct user) - 7) | ||
293 | break; | ||
294 | |||
295 | switch (addr) { | ||
296 | case 0 ... sizeof(struct user_regs_struct): | ||
297 | ret = putreg(child, addr, data); | ||
298 | break; | ||
299 | /* Disallows to set a breakpoint into the vsyscall */ | ||
300 | case offsetof(struct user, u_debugreg[0]): | ||
301 | if (data >= TASK_SIZE-7) break; | ||
302 | child->thread.debugreg0 = data; | ||
303 | ret = 0; | ||
304 | break; | ||
305 | case offsetof(struct user, u_debugreg[1]): | ||
306 | if (data >= TASK_SIZE-7) break; | ||
307 | child->thread.debugreg1 = data; | ||
308 | ret = 0; | ||
309 | break; | ||
310 | case offsetof(struct user, u_debugreg[2]): | ||
311 | if (data >= TASK_SIZE-7) break; | ||
312 | child->thread.debugreg2 = data; | ||
313 | ret = 0; | ||
314 | break; | ||
315 | case offsetof(struct user, u_debugreg[3]): | ||
316 | if (data >= TASK_SIZE-7) break; | ||
317 | child->thread.debugreg3 = data; | ||
318 | ret = 0; | ||
319 | break; | ||
320 | case offsetof(struct user, u_debugreg[6]): | ||
321 | if (data >> 32) | ||
322 | break; | ||
323 | child->thread.debugreg6 = data; | ||
324 | ret = 0; | ||
325 | break; | ||
326 | case offsetof(struct user, u_debugreg[7]): | ||
327 | /* See arch/i386/kernel/ptrace.c for an explanation of | ||
328 | * this awkward check.*/ | ||
329 | data &= ~DR_CONTROL_RESERVED; | ||
330 | for(i=0; i<4; i++) | ||
331 | if ((0x5454 >> ((data >> (16 + 4*i)) & 0xf)) & 1) | ||
332 | break; | ||
333 | if (i == 4) { | ||
334 | child->thread.debugreg7 = data; | ||
335 | ret = 0; | ||
336 | } | ||
337 | break; | ||
338 | } | ||
339 | break; | ||
340 | case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ | ||
341 | case PTRACE_CONT: { /* restart after signal. */ | ||
342 | long tmp; | ||
343 | |||
344 | ret = -EIO; | ||
345 | if ((unsigned long) data > _NSIG) | ||
346 | break; | ||
347 | if (request == PTRACE_SYSCALL) | ||
348 | set_tsk_thread_flag(child,TIF_SYSCALL_TRACE); | ||
349 | else | ||
350 | clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); | ||
351 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
352 | child->exit_code = data; | ||
353 | /* make sure the single step bit is not set. */ | ||
354 | tmp = get_stack_long(child, EFL_OFFSET); | ||
355 | tmp &= ~TRAP_FLAG; | ||
356 | put_stack_long(child, EFL_OFFSET,tmp); | ||
357 | wake_up_process(child); | ||
358 | ret = 0; | ||
359 | break; | ||
360 | } | ||
361 | |||
362 | #ifdef CONFIG_IA32_EMULATION | ||
363 | /* This makes only sense with 32bit programs. Allow a | ||
364 | 64bit debugger to fully examine them too. Better | ||
365 | don't use it against 64bit processes, use | ||
366 | PTRACE_ARCH_PRCTL instead. */ | ||
367 | case PTRACE_SET_THREAD_AREA: { | ||
368 | struct user_desc __user *p; | ||
369 | int old; | ||
370 | p = (struct user_desc __user *)data; | ||
371 | get_user(old, &p->entry_number); | ||
372 | put_user(addr, &p->entry_number); | ||
373 | ret = do_set_thread_area(&child->thread, p); | ||
374 | put_user(old, &p->entry_number); | ||
375 | break; | ||
376 | case PTRACE_GET_THREAD_AREA: | ||
377 | p = (struct user_desc __user *)data; | ||
378 | get_user(old, &p->entry_number); | ||
379 | put_user(addr, &p->entry_number); | ||
380 | ret = do_get_thread_area(&child->thread, p); | ||
381 | put_user(old, &p->entry_number); | ||
382 | break; | ||
383 | } | ||
384 | #endif | ||
385 | /* normal 64bit interface to access TLS data. | ||
386 | Works just like arch_prctl, except that the arguments | ||
387 | are reversed. */ | ||
388 | case PTRACE_ARCH_PRCTL: | ||
389 | ret = do_arch_prctl(child, data, addr); | ||
390 | break; | ||
391 | |||
392 | /* | ||
393 | * make the child exit. Best I can do is send it a sigkill. | ||
394 | * perhaps it should be put in the status that it wants to | ||
395 | * exit. | ||
396 | */ | ||
397 | case PTRACE_KILL: { | ||
398 | long tmp; | ||
399 | |||
400 | ret = 0; | ||
401 | if (child->exit_state == EXIT_ZOMBIE) /* already dead */ | ||
402 | break; | ||
403 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
404 | child->exit_code = SIGKILL; | ||
405 | /* make sure the single step bit is not set. */ | ||
406 | tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG; | ||
407 | put_stack_long(child, EFL_OFFSET, tmp); | ||
408 | wake_up_process(child); | ||
409 | break; | ||
410 | } | ||
411 | |||
412 | case PTRACE_SINGLESTEP: { /* set the trap flag. */ | ||
413 | long tmp; | ||
414 | |||
415 | ret = -EIO; | ||
416 | if ((unsigned long) data > _NSIG) | ||
417 | break; | ||
418 | clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); | ||
419 | if ((child->ptrace & PT_DTRACE) == 0) { | ||
420 | /* Spurious delayed TF traps may occur */ | ||
421 | child->ptrace |= PT_DTRACE; | ||
422 | } | ||
423 | tmp = get_stack_long(child, EFL_OFFSET) | TRAP_FLAG; | ||
424 | put_stack_long(child, EFL_OFFSET, tmp); | ||
425 | set_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
426 | child->exit_code = data; | ||
427 | /* give it a chance to run. */ | ||
428 | wake_up_process(child); | ||
429 | ret = 0; | ||
430 | break; | ||
431 | } | ||
432 | |||
433 | case PTRACE_DETACH: | ||
434 | /* detach a process that was attached. */ | ||
435 | ret = ptrace_detach(child, data); | ||
436 | break; | ||
437 | |||
438 | case PTRACE_GETREGS: { /* Get all gp regs from the child. */ | ||
439 | if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, | ||
440 | sizeof(struct user_regs_struct))) { | ||
441 | ret = -EIO; | ||
442 | break; | ||
443 | } | ||
444 | ret = 0; | ||
445 | for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { | ||
446 | ret |= __put_user(getreg(child, ui),(unsigned long __user *) data); | ||
447 | data += sizeof(long); | ||
448 | } | ||
449 | break; | ||
450 | } | ||
451 | |||
452 | case PTRACE_SETREGS: { /* Set all gp regs in the child. */ | ||
453 | unsigned long tmp; | ||
454 | if (!access_ok(VERIFY_READ, (unsigned __user *)data, | ||
455 | sizeof(struct user_regs_struct))) { | ||
456 | ret = -EIO; | ||
457 | break; | ||
458 | } | ||
459 | ret = 0; | ||
460 | for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { | ||
461 | ret |= __get_user(tmp, (unsigned long __user *) data); | ||
462 | putreg(child, ui, tmp); | ||
463 | data += sizeof(long); | ||
464 | } | ||
465 | break; | ||
466 | } | ||
467 | |||
468 | case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */ | ||
469 | if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, | ||
470 | sizeof(struct user_i387_struct))) { | ||
471 | ret = -EIO; | ||
472 | break; | ||
473 | } | ||
474 | ret = get_fpregs((struct user_i387_struct __user *)data, child); | ||
475 | break; | ||
476 | } | ||
477 | |||
478 | case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */ | ||
479 | if (!access_ok(VERIFY_READ, (unsigned __user *)data, | ||
480 | sizeof(struct user_i387_struct))) { | ||
481 | ret = -EIO; | ||
482 | break; | ||
483 | } | ||
484 | set_stopped_child_used_math(child); | ||
485 | ret = set_fpregs(child, (struct user_i387_struct __user *)data); | ||
486 | break; | ||
487 | } | ||
488 | |||
489 | default: | ||
490 | ret = ptrace_request(child, request, addr, data); | ||
491 | break; | ||
492 | } | ||
493 | out_tsk: | ||
494 | put_task_struct(child); | ||
495 | out: | ||
496 | unlock_kernel(); | ||
497 | return ret; | ||
498 | } | ||
499 | |||
500 | static void syscall_trace(struct pt_regs *regs) | ||
501 | { | ||
502 | |||
503 | #if 0 | ||
504 | printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n", | ||
505 | current->comm, | ||
506 | regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0), | ||
507 | current_thread_info()->flags, current->ptrace); | ||
508 | #endif | ||
509 | |||
510 | ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) | ||
511 | ? 0x80 : 0)); | ||
512 | /* | ||
513 | * this isn't the same as continuing with a signal, but it will do | ||
514 | * for normal use. strace only continues with a signal if the | ||
515 | * stopping signal is not SIGTRAP. -brl | ||
516 | */ | ||
517 | if (current->exit_code) { | ||
518 | send_sig(current->exit_code, current, 1); | ||
519 | current->exit_code = 0; | ||
520 | } | ||
521 | } | ||
522 | |||
523 | asmlinkage void syscall_trace_enter(struct pt_regs *regs) | ||
524 | { | ||
525 | /* do the secure computing check first */ | ||
526 | secure_computing(regs->orig_rax); | ||
527 | |||
528 | if (unlikely(current->audit_context)) | ||
529 | audit_syscall_entry(current, regs->orig_rax, | ||
530 | regs->rdi, regs->rsi, | ||
531 | regs->rdx, regs->r10); | ||
532 | |||
533 | if (test_thread_flag(TIF_SYSCALL_TRACE) | ||
534 | && (current->ptrace & PT_PTRACED)) | ||
535 | syscall_trace(regs); | ||
536 | } | ||
537 | |||
538 | asmlinkage void syscall_trace_leave(struct pt_regs *regs) | ||
539 | { | ||
540 | if (unlikely(current->audit_context)) | ||
541 | audit_syscall_exit(current, regs->rax); | ||
542 | |||
543 | if ((test_thread_flag(TIF_SYSCALL_TRACE) | ||
544 | || test_thread_flag(TIF_SINGLESTEP)) | ||
545 | && (current->ptrace & PT_PTRACED)) | ||
546 | syscall_trace(regs); | ||
547 | } | ||
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c new file mode 100644 index 000000000000..be4b36f762cf --- /dev/null +++ b/arch/x86_64/kernel/reboot.c | |||
@@ -0,0 +1,163 @@ | |||
1 | /* Various gunk just to reboot the machine. */ | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/reboot.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/smp.h> | ||
6 | #include <linux/kernel.h> | ||
7 | #include <linux/ctype.h> | ||
8 | #include <linux/string.h> | ||
9 | #include <asm/io.h> | ||
10 | #include <asm/kdebug.h> | ||
11 | #include <asm/delay.h> | ||
12 | #include <asm/hw_irq.h> | ||
13 | #include <asm/system.h> | ||
14 | #include <asm/pgtable.h> | ||
15 | #include <asm/tlbflush.h> | ||
16 | #include <asm/apic.h> | ||
17 | |||
18 | /* | ||
19 | * Power off function, if any | ||
20 | */ | ||
21 | void (*pm_power_off)(void); | ||
22 | |||
23 | static long no_idt[3]; | ||
24 | static enum { | ||
25 | BOOT_TRIPLE = 't', | ||
26 | BOOT_KBD = 'k' | ||
27 | } reboot_type = BOOT_KBD; | ||
28 | static int reboot_mode = 0; | ||
29 | int reboot_force; | ||
30 | |||
31 | /* reboot=t[riple] | k[bd] [, [w]arm | [c]old] | ||
32 | warm Don't set the cold reboot flag | ||
33 | cold Set the cold reboot flag | ||
34 | triple Force a triple fault (init) | ||
35 | kbd Use the keyboard controller. cold reset (default) | ||
36 | force Avoid anything that could hang. | ||
37 | */ | ||
38 | static int __init reboot_setup(char *str) | ||
39 | { | ||
40 | for (;;) { | ||
41 | switch (*str) { | ||
42 | case 'w': | ||
43 | reboot_mode = 0x1234; | ||
44 | break; | ||
45 | |||
46 | case 'c': | ||
47 | reboot_mode = 0; | ||
48 | break; | ||
49 | |||
50 | case 't': | ||
51 | case 'b': | ||
52 | case 'k': | ||
53 | reboot_type = *str; | ||
54 | break; | ||
55 | case 'f': | ||
56 | reboot_force = 1; | ||
57 | break; | ||
58 | } | ||
59 | if((str = strchr(str,',')) != NULL) | ||
60 | str++; | ||
61 | else | ||
62 | break; | ||
63 | } | ||
64 | return 1; | ||
65 | } | ||
66 | |||
67 | __setup("reboot=", reboot_setup); | ||
68 | |||
69 | #ifdef CONFIG_SMP | ||
70 | static void smp_halt(void) | ||
71 | { | ||
72 | int cpuid = safe_smp_processor_id(); | ||
73 | static int first_entry = 1; | ||
74 | |||
75 | if (reboot_force) | ||
76 | return; | ||
77 | |||
78 | if (first_entry) { | ||
79 | first_entry = 0; | ||
80 | smp_call_function((void *)machine_restart, NULL, 1, 0); | ||
81 | } | ||
82 | |||
83 | smp_stop_cpu(); | ||
84 | |||
85 | /* AP calling this. Just halt */ | ||
86 | if (cpuid != boot_cpu_id) { | ||
87 | for (;;) | ||
88 | asm("hlt"); | ||
89 | } | ||
90 | |||
91 | /* Wait for all other CPUs to have run smp_stop_cpu */ | ||
92 | while (!cpus_empty(cpu_online_map)) | ||
93 | rep_nop(); | ||
94 | } | ||
95 | #endif | ||
96 | |||
/*
 * Poll I/O port 0x64 until bit 1 (mask 0x02) reads as clear, giving up
 * after 0x10000 reads.
 */
static inline void kb_wait(void)
{
	int tries = 0;

	while (tries < 0x10000 && (inb_p(0x64) & 0x02) != 0)
		tries++;
}
105 | |||
106 | void machine_restart(char * __unused) | ||
107 | { | ||
108 | int i; | ||
109 | |||
110 | printk("machine restart\n"); | ||
111 | |||
112 | #ifdef CONFIG_SMP | ||
113 | smp_halt(); | ||
114 | #endif | ||
115 | |||
116 | if (!reboot_force) { | ||
117 | local_irq_disable(); | ||
118 | #ifndef CONFIG_SMP | ||
119 | disable_local_APIC(); | ||
120 | #endif | ||
121 | disable_IO_APIC(); | ||
122 | local_irq_enable(); | ||
123 | } | ||
124 | |||
125 | /* Tell the BIOS if we want cold or warm reboot */ | ||
126 | *((unsigned short *)__va(0x472)) = reboot_mode; | ||
127 | |||
128 | for (;;) { | ||
129 | /* Could also try the reset bit in the Hammer NB */ | ||
130 | switch (reboot_type) { | ||
131 | case BOOT_KBD: | ||
132 | for (i=0; i<100; i++) { | ||
133 | kb_wait(); | ||
134 | udelay(50); | ||
135 | outb(0xfe,0x64); /* pulse reset low */ | ||
136 | udelay(50); | ||
137 | } | ||
138 | |||
139 | case BOOT_TRIPLE: | ||
140 | __asm__ __volatile__("lidt (%0)": :"r" (&no_idt)); | ||
141 | __asm__ __volatile__("int3"); | ||
142 | |||
143 | reboot_type = BOOT_KBD; | ||
144 | break; | ||
145 | } | ||
146 | } | ||
147 | } | ||
148 | |||
149 | EXPORT_SYMBOL(machine_restart); | ||
150 | |||
/* Halt hook: intentionally does nothing in this file. */
void machine_halt(void)
{
	return;
}
154 | |||
155 | EXPORT_SYMBOL(machine_halt); | ||
156 | |||
157 | void machine_power_off(void) | ||
158 | { | ||
159 | if (pm_power_off) | ||
160 | pm_power_off(); | ||
161 | } | ||
162 | |||
163 | EXPORT_SYMBOL(machine_power_off); | ||
diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c new file mode 100644 index 000000000000..48f7c18172b9 --- /dev/null +++ b/arch/x86_64/kernel/semaphore.c | |||
@@ -0,0 +1,180 @@ | |||
1 | /* | ||
2 | * x86_64 semaphore implementation. | ||
3 | * | ||
4 | * (C) Copyright 1999 Linus Torvalds | ||
5 | * | ||
6 | * Portions Copyright 1999 Red Hat, Inc. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; either version | ||
11 | * 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org> | ||
14 | */ | ||
15 | #include <linux/config.h> | ||
16 | #include <linux/sched.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <asm/errno.h> | ||
19 | |||
20 | #include <asm/semaphore.h> | ||
21 | |||
22 | /* | ||
23 | * Semaphores are implemented using a two-way counter: | ||
24 | * The "count" variable is decremented for each process | ||
25 | * that tries to acquire the semaphore, while the "sleeping" | ||
26 | * variable is a count of such acquires. | ||
27 | * | ||
28 | * Notably, the inline "up()" and "down()" functions can | ||
29 | * efficiently test if they need to do any extra work (up | ||
30 | * needs to do something only if count was negative before | ||
31 | * the increment operation). | ||
32 | * | ||
33 | * "sleeping" and the contention routine ordering is protected | ||
34 | * by the spinlock in the semaphore's waitqueue head. | ||
35 | * | ||
36 | * Note that these functions are only called when there is | ||
37 | * contention on the lock, and as such all this is the | ||
38 | * "non-critical" part of the whole semaphore business. The | ||
39 | * critical part is the inline stuff in <asm/semaphore.h> | ||
40 | * where we want to avoid any extra jumps and calls. | ||
41 | */ | ||
42 | |||
43 | /* | ||
44 | * Logic: | ||
45 | * - only on a boundary condition do we need to care. When we go | ||
46 | * from a negative count to a non-negative, we wake people up. | ||
47 | * - when we go from a non-negative count to a negative do we | ||
48 | * (a) synchronize with the "sleeper" count and (b) make sure | ||
49 | * that we're on the wakeup list before we synchronize so that | ||
50 | * we cannot lose wakeup events. | ||
51 | */ | ||
52 | |||
53 | void __up(struct semaphore *sem) | ||
54 | { | ||
55 | wake_up(&sem->wait); | ||
56 | } | ||
57 | |||
58 | void __sched __down(struct semaphore * sem) | ||
59 | { | ||
60 | struct task_struct *tsk = current; | ||
61 | DECLARE_WAITQUEUE(wait, tsk); | ||
62 | unsigned long flags; | ||
63 | |||
64 | tsk->state = TASK_UNINTERRUPTIBLE; | ||
65 | spin_lock_irqsave(&sem->wait.lock, flags); | ||
66 | add_wait_queue_exclusive_locked(&sem->wait, &wait); | ||
67 | |||
68 | sem->sleepers++; | ||
69 | for (;;) { | ||
70 | int sleepers = sem->sleepers; | ||
71 | |||
72 | /* | ||
73 | * Add "everybody else" into it. They aren't | ||
74 | * playing, because we own the spinlock in | ||
75 | * the wait_queue_head. | ||
76 | */ | ||
77 | if (!atomic_add_negative(sleepers - 1, &sem->count)) { | ||
78 | sem->sleepers = 0; | ||
79 | break; | ||
80 | } | ||
81 | sem->sleepers = 1; /* us - see -1 above */ | ||
82 | spin_unlock_irqrestore(&sem->wait.lock, flags); | ||
83 | |||
84 | schedule(); | ||
85 | |||
86 | spin_lock_irqsave(&sem->wait.lock, flags); | ||
87 | tsk->state = TASK_UNINTERRUPTIBLE; | ||
88 | } | ||
89 | remove_wait_queue_locked(&sem->wait, &wait); | ||
90 | wake_up_locked(&sem->wait); | ||
91 | spin_unlock_irqrestore(&sem->wait.lock, flags); | ||
92 | tsk->state = TASK_RUNNING; | ||
93 | } | ||
94 | |||
95 | int __sched __down_interruptible(struct semaphore * sem) | ||
96 | { | ||
97 | int retval = 0; | ||
98 | struct task_struct *tsk = current; | ||
99 | DECLARE_WAITQUEUE(wait, tsk); | ||
100 | unsigned long flags; | ||
101 | |||
102 | tsk->state = TASK_INTERRUPTIBLE; | ||
103 | spin_lock_irqsave(&sem->wait.lock, flags); | ||
104 | add_wait_queue_exclusive_locked(&sem->wait, &wait); | ||
105 | |||
106 | sem->sleepers++; | ||
107 | for (;;) { | ||
108 | int sleepers = sem->sleepers; | ||
109 | |||
110 | /* | ||
111 | * With signals pending, this turns into | ||
112 | * the trylock failure case - we won't be | ||
113 | * sleeping, and we* can't get the lock as | ||
114 | * it has contention. Just correct the count | ||
115 | * and exit. | ||
116 | */ | ||
117 | if (signal_pending(current)) { | ||
118 | retval = -EINTR; | ||
119 | sem->sleepers = 0; | ||
120 | atomic_add(sleepers, &sem->count); | ||
121 | break; | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * Add "everybody else" into it. They aren't | ||
126 | * playing, because we own the spinlock in | ||
127 | * wait_queue_head. The "-1" is because we're | ||
128 | * still hoping to get the semaphore. | ||
129 | */ | ||
130 | if (!atomic_add_negative(sleepers - 1, &sem->count)) { | ||
131 | sem->sleepers = 0; | ||
132 | break; | ||
133 | } | ||
134 | sem->sleepers = 1; /* us - see -1 above */ | ||
135 | spin_unlock_irqrestore(&sem->wait.lock, flags); | ||
136 | |||
137 | schedule(); | ||
138 | |||
139 | spin_lock_irqsave(&sem->wait.lock, flags); | ||
140 | tsk->state = TASK_INTERRUPTIBLE; | ||
141 | } | ||
142 | remove_wait_queue_locked(&sem->wait, &wait); | ||
143 | wake_up_locked(&sem->wait); | ||
144 | spin_unlock_irqrestore(&sem->wait.lock, flags); | ||
145 | |||
146 | tsk->state = TASK_RUNNING; | ||
147 | return retval; | ||
148 | } | ||
149 | |||
150 | /* | ||
151 | * Trylock failed - make sure we correct for | ||
152 | * having decremented the count. | ||
153 | * | ||
154 | * We could have done the trylock with a | ||
155 | * single "cmpxchg" without failure cases, | ||
156 | * but then it wouldn't work on a 386. | ||
157 | */ | ||
158 | int __down_trylock(struct semaphore * sem) | ||
159 | { | ||
160 | int sleepers; | ||
161 | unsigned long flags; | ||
162 | |||
163 | spin_lock_irqsave(&sem->wait.lock, flags); | ||
164 | sleepers = sem->sleepers + 1; | ||
165 | sem->sleepers = 0; | ||
166 | |||
167 | /* | ||
168 | * Add "everybody else" and us into it. They aren't | ||
169 | * playing, because we own the spinlock in the | ||
170 | * wait_queue_head. | ||
171 | */ | ||
172 | if (!atomic_add_negative(sleepers, &sem->count)) { | ||
173 | wake_up_locked(&sem->wait); | ||
174 | } | ||
175 | |||
176 | spin_unlock_irqrestore(&sem->wait.lock, flags); | ||
177 | return 1; | ||
178 | } | ||
179 | |||
180 | |||
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c new file mode 100644 index 000000000000..e50fc41de6b9 --- /dev/null +++ b/arch/x86_64/kernel/setup.c | |||
@@ -0,0 +1,1189 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/kernel/setup.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * | ||
6 | * Nov 2001 Dave Jones <davej@suse.de> | ||
7 | * Forked from i386 setup code. | ||
8 | * | ||
9 | * $Id$ | ||
10 | */ | ||
11 | |||
12 | /* | ||
13 | * This file handles the architecture-dependent parts of initialization | ||
14 | */ | ||
15 | |||
16 | #include <linux/errno.h> | ||
17 | #include <linux/sched.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/mm.h> | ||
20 | #include <linux/stddef.h> | ||
21 | #include <linux/unistd.h> | ||
22 | #include <linux/ptrace.h> | ||
23 | #include <linux/slab.h> | ||
24 | #include <linux/user.h> | ||
25 | #include <linux/a.out.h> | ||
26 | #include <linux/tty.h> | ||
27 | #include <linux/ioport.h> | ||
28 | #include <linux/delay.h> | ||
29 | #include <linux/config.h> | ||
30 | #include <linux/init.h> | ||
31 | #include <linux/initrd.h> | ||
32 | #include <linux/highmem.h> | ||
33 | #include <linux/bootmem.h> | ||
34 | #include <linux/module.h> | ||
35 | #include <asm/processor.h> | ||
36 | #include <linux/console.h> | ||
37 | #include <linux/seq_file.h> | ||
38 | #include <linux/root_dev.h> | ||
39 | #include <linux/pci.h> | ||
40 | #include <linux/acpi.h> | ||
41 | #include <linux/kallsyms.h> | ||
42 | #include <linux/edd.h> | ||
43 | #include <asm/mtrr.h> | ||
44 | #include <asm/uaccess.h> | ||
45 | #include <asm/system.h> | ||
46 | #include <asm/io.h> | ||
47 | #include <asm/smp.h> | ||
48 | #include <asm/msr.h> | ||
49 | #include <asm/desc.h> | ||
50 | #include <video/edid.h> | ||
51 | #include <asm/e820.h> | ||
52 | #include <asm/dma.h> | ||
53 | #include <asm/mpspec.h> | ||
54 | #include <asm/mmu_context.h> | ||
55 | #include <asm/bootsetup.h> | ||
56 | #include <asm/proto.h> | ||
57 | #include <asm/setup.h> | ||
58 | #include <asm/mach_apic.h> | ||
59 | #include <asm/numa.h> | ||
60 | |||
61 | /* | ||
62 | * Machine setup.. | ||
63 | */ | ||
64 | |||
/* CPU description of the boot processor; setup_arch() passes it to early_identify_cpu(). */
65 | struct cpuinfo_x86 boot_cpu_data; | ||
66 | |||
67 | unsigned long mmu_cr4_features; | ||
68 | |||
/* Cleared again by "acpi=force" in parse_cmdline_early(). */
69 | int acpi_disabled; | ||
70 | EXPORT_SYMBOL(acpi_disabled); | ||
71 | #ifdef CONFIG_ACPI_BOOT | ||
/* acpi_ht is set by "acpi=ht"; acpi_sci_flags by the "acpi_sci=" options. */
72 | extern int __initdata acpi_ht; | ||
73 | extern acpi_interrupt_flags acpi_sci_flags; | ||
/* Set by "acpi=force"; when set, "acpi=ht" no longer calls disable_acpi(). */
74 | int __initdata acpi_force = 0; | ||
75 | #endif | ||
76 | |||
/* init_amd() derives the CPU<->node mapping from the APIC id only while this is <= 0. */
77 | int acpi_numa __initdata; | ||
78 | |||
79 | /* For PCI or other memory-mapped resources */ | ||
/* setup_arch() raises this to the end of RAM rounded up to 1MB if that is higher. */
80 | unsigned long pci_mem_start = 0x10000000; | ||
81 | |||
82 | /* Boot loader ID as an integer, for the benefit of proc_dointvec */ | ||
83 | int bootloader_type; | ||
84 | |||
/* Copied from SAVED_VIDEO_MODE in setup_arch(). */
85 | unsigned long saved_video_mode; | ||
86 | |||
87 | #ifdef CONFIG_SWIOTLB | ||
88 | int swiotlb; | ||
89 | EXPORT_SYMBOL(swiotlb); | ||
90 | #endif | ||
91 | |||
92 | /* | ||
93 | * Setup options | ||
94 | */ | ||
/* drive_info, screen_info and edid_info are copied from the boot parameters in setup_arch(). */
95 | struct drive_info_struct { char dummy[32]; } drive_info; | ||
96 | struct screen_info screen_info; | ||
/* NOTE(review): no use of sys_desc_table_struct is visible in this part of the file - confirm. */
97 | struct sys_desc_table_struct { | ||
98 | unsigned short length; | ||
99 | unsigned char table[0]; | ||
100 | }; | ||
101 | |||
102 | struct edid_info edid_info; | ||
103 | struct e820map e820; | ||
104 | |||
105 | extern int root_mountflags; | ||
/* Section boundary symbols; setup_arch() uses them for init_mm and the kernel code/data resources. */
106 | extern char _text, _etext, _edata, _end; | ||
107 | |||
/* Working copy of the kernel command line, filled in by parse_cmdline_early(). */
108 | char command_line[COMMAND_LINE_SIZE]; | ||
109 | |||
/*
 * I/O port ranges of the standard PC devices.  setup_arch() requests
 * every entry from ioport_resource; all of them are flagged busy.
 */
110 | struct resource standard_io_resources[] = { | ||
111 | { .name = "dma1", .start = 0x00, .end = 0x1f, | ||
112 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
113 | { .name = "pic1", .start = 0x20, .end = 0x21, | ||
114 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
115 | { .name = "timer0", .start = 0x40, .end = 0x43, | ||
116 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
117 | { .name = "timer1", .start = 0x50, .end = 0x53, | ||
118 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
119 | { .name = "keyboard", .start = 0x60, .end = 0x6f, | ||
120 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
121 | { .name = "dma page reg", .start = 0x80, .end = 0x8f, | ||
122 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
123 | { .name = "pic2", .start = 0xa0, .end = 0xa1, | ||
124 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
125 | { .name = "dma2", .start = 0xc0, .end = 0xdf, | ||
126 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
127 | { .name = "fpu", .start = 0xf0, .end = 0xff, | ||
128 | .flags = IORESOURCE_BUSY | IORESOURCE_IO } | ||
129 | }; | ||
130 | |||
/* Number of entries in standard_io_resources[]. */
131 | #define STANDARD_IO_RESOURCES \ | ||
132 | (sizeof standard_io_resources / sizeof standard_io_resources[0]) | ||
133 | |||
/* Flag combination used here for memory ranges that are in use as RAM. */
134 | #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) | ||
135 | |||
/*
 * Physical ranges of the kernel image.  .start/.end are placeholders
 * here; setup_arch() fills them in from _text, _etext and _edata.
 */
136 | struct resource data_resource = { | ||
137 | .name = "Kernel data", | ||
138 | .start = 0, | ||
139 | .end = 0, | ||
140 | .flags = IORESOURCE_RAM, | ||
141 | }; | ||
142 | struct resource code_resource = { | ||
143 | .name = "Kernel code", | ||
144 | .start = 0, | ||
145 | .end = 0, | ||
146 | .flags = IORESOURCE_RAM, | ||
147 | }; | ||
148 | |||
/* Flag combination used for all ROM regions below: busy, read-only memory. */
149 | #define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM) | ||
150 | |||
/* Requested unconditionally by probe_roms(). */
151 | static struct resource system_rom_resource = { | ||
152 | .name = "System ROM", | ||
153 | .start = 0xf0000, | ||
154 | .end = 0xfffff, | ||
155 | .flags = IORESOURCE_ROM, | ||
156 | }; | ||
157 | |||
/* Requested by probe_roms() only if the window has a ROM signature and a valid checksum. */
158 | static struct resource extension_rom_resource = { | ||
159 | .name = "Extension ROM", | ||
160 | .start = 0xe0000, | ||
161 | .end = 0xeffff, | ||
162 | .flags = IORESOURCE_ROM, | ||
163 | }; | ||
164 | |||
/*
 * Slots filled in by probe_roms() for each adapter ROM it finds.  The
 * initial .start of slot 0 is also the upper bound of the video ROM scan.
 */
165 | static struct resource adapter_rom_resources[] = { | ||
166 | { .name = "Adapter ROM", .start = 0xc8000, .end = 0, | ||
167 | .flags = IORESOURCE_ROM }, | ||
168 | { .name = "Adapter ROM", .start = 0, .end = 0, | ||
169 | .flags = IORESOURCE_ROM }, | ||
170 | { .name = "Adapter ROM", .start = 0, .end = 0, | ||
171 | .flags = IORESOURCE_ROM }, | ||
172 | { .name = "Adapter ROM", .start = 0, .end = 0, | ||
173 | .flags = IORESOURCE_ROM }, | ||
174 | { .name = "Adapter ROM", .start = 0, .end = 0, | ||
175 | .flags = IORESOURCE_ROM }, | ||
176 | { .name = "Adapter ROM", .start = 0, .end = 0, | ||
177 | .flags = IORESOURCE_ROM } | ||
178 | }; | ||
179 | |||
/* Number of entries in adapter_rom_resources[]. */
180 | #define ADAPTER_ROM_RESOURCES \ | ||
181 | (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) | ||
182 | |||
/* Default video ROM window; probe_roms() adjusts .start (and .end if the checksum matches). */
183 | static struct resource video_rom_resource = { | ||
184 | .name = "Video ROM", | ||
185 | .start = 0xc0000, | ||
186 | .end = 0xc7fff, | ||
187 | .flags = IORESOURCE_ROM, | ||
188 | }; | ||
189 | |||
/* Requested from iomem_resource by setup_arch(). */
190 | static struct resource video_ram_resource = { | ||
191 | .name = "Video RAM area", | ||
192 | .start = 0xa0000, | ||
193 | .end = 0xbffff, | ||
194 | .flags = IORESOURCE_RAM, | ||
195 | }; | ||
196 | |||
197 | #define romsignature(x) (*(unsigned short *)(x) == 0xaa55) | ||
198 | |||
199 | static int __init romchecksum(unsigned char *rom, unsigned long length) | ||
200 | { | ||
201 | unsigned char *p, sum = 0; | ||
202 | |||
203 | for (p = rom; p < rom + length; p++) | ||
204 | sum += *p; | ||
205 | return sum == 0; | ||
206 | } | ||
207 | |||
208 | static void __init probe_roms(void) | ||
209 | { | ||
210 | unsigned long start, length, upper; | ||
211 | unsigned char *rom; | ||
212 | int i; | ||
213 | |||
214 | /* video rom */ | ||
215 | upper = adapter_rom_resources[0].start; | ||
216 | for (start = video_rom_resource.start; start < upper; start += 2048) { | ||
217 | rom = isa_bus_to_virt(start); | ||
218 | if (!romsignature(rom)) | ||
219 | continue; | ||
220 | |||
221 | video_rom_resource.start = start; | ||
222 | |||
223 | /* 0 < length <= 0x7f * 512, historically */ | ||
224 | length = rom[2] * 512; | ||
225 | |||
226 | /* if checksum okay, trust length byte */ | ||
227 | if (length && romchecksum(rom, length)) | ||
228 | video_rom_resource.end = start + length - 1; | ||
229 | |||
230 | request_resource(&iomem_resource, &video_rom_resource); | ||
231 | break; | ||
232 | } | ||
233 | |||
234 | start = (video_rom_resource.end + 1 + 2047) & ~2047UL; | ||
235 | if (start < upper) | ||
236 | start = upper; | ||
237 | |||
238 | /* system rom */ | ||
239 | request_resource(&iomem_resource, &system_rom_resource); | ||
240 | upper = system_rom_resource.start; | ||
241 | |||
242 | /* check for extension rom (ignore length byte!) */ | ||
243 | rom = isa_bus_to_virt(extension_rom_resource.start); | ||
244 | if (romsignature(rom)) { | ||
245 | length = extension_rom_resource.end - extension_rom_resource.start + 1; | ||
246 | if (romchecksum(rom, length)) { | ||
247 | request_resource(&iomem_resource, &extension_rom_resource); | ||
248 | upper = extension_rom_resource.start; | ||
249 | } | ||
250 | } | ||
251 | |||
252 | /* check for adapter roms on 2k boundaries */ | ||
253 | for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { | ||
254 | rom = isa_bus_to_virt(start); | ||
255 | if (!romsignature(rom)) | ||
256 | continue; | ||
257 | |||
258 | /* 0 < length <= 0x7f * 512, historically */ | ||
259 | length = rom[2] * 512; | ||
260 | |||
261 | /* but accept any length that fits if checksum okay */ | ||
262 | if (!length || start + length > upper || !romchecksum(rom, length)) | ||
263 | continue; | ||
264 | |||
265 | adapter_rom_resources[i].start = start; | ||
266 | adapter_rom_resources[i].end = start + length - 1; | ||
267 | request_resource(&iomem_resource, &adapter_rom_resources[i]); | ||
268 | |||
269 | start = adapter_rom_resources[i++].end & ~2047UL; | ||
270 | } | ||
271 | } | ||
272 | |||
273 | static __init void parse_cmdline_early (char ** cmdline_p) | ||
274 | { | ||
275 | char c = ' ', *to = command_line, *from = COMMAND_LINE; | ||
276 | int len = 0; | ||
277 | |||
278 | /* Save unparsed command line copy for /proc/cmdline */ | ||
279 | memcpy(saved_command_line, COMMAND_LINE, COMMAND_LINE_SIZE); | ||
280 | saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; | ||
281 | |||
282 | for (;;) { | ||
283 | if (c != ' ') | ||
284 | goto next_char; | ||
285 | |||
286 | #ifdef CONFIG_SMP | ||
287 | /* | ||
288 | * If the BIOS enumerates physical processors before logical, | ||
289 | * maxcpus=N at enumeration-time can be used to disable HT. | ||
290 | */ | ||
291 | else if (!memcmp(from, "maxcpus=", 8)) { | ||
292 | extern unsigned int maxcpus; | ||
293 | |||
294 | maxcpus = simple_strtoul(from + 8, NULL, 0); | ||
295 | } | ||
296 | #endif | ||
297 | #ifdef CONFIG_ACPI_BOOT | ||
298 | /* "acpi=off" disables both ACPI table parsing and interpreter init */ | ||
299 | if (!memcmp(from, "acpi=off", 8)) | ||
300 | disable_acpi(); | ||
301 | |||
302 | if (!memcmp(from, "acpi=force", 10)) { | ||
303 | /* add later when we do DMI horrors: */ | ||
304 | acpi_force = 1; | ||
305 | acpi_disabled = 0; | ||
306 | } | ||
307 | |||
308 | /* acpi=ht just means: do ACPI MADT parsing | ||
309 | at bootup, but don't enable the full ACPI interpreter */ | ||
310 | if (!memcmp(from, "acpi=ht", 7)) { | ||
311 | if (!acpi_force) | ||
312 | disable_acpi(); | ||
313 | acpi_ht = 1; | ||
314 | } | ||
315 | else if (!memcmp(from, "pci=noacpi", 10)) | ||
316 | acpi_disable_pci(); | ||
317 | else if (!memcmp(from, "acpi=noirq", 10)) | ||
318 | acpi_noirq_set(); | ||
319 | |||
320 | else if (!memcmp(from, "acpi_sci=edge", 13)) | ||
321 | acpi_sci_flags.trigger = 1; | ||
322 | else if (!memcmp(from, "acpi_sci=level", 14)) | ||
323 | acpi_sci_flags.trigger = 3; | ||
324 | else if (!memcmp(from, "acpi_sci=high", 13)) | ||
325 | acpi_sci_flags.polarity = 1; | ||
326 | else if (!memcmp(from, "acpi_sci=low", 12)) | ||
327 | acpi_sci_flags.polarity = 3; | ||
328 | |||
329 | /* acpi=strict disables out-of-spec workarounds */ | ||
330 | else if (!memcmp(from, "acpi=strict", 11)) { | ||
331 | acpi_strict = 1; | ||
332 | } | ||
333 | #endif | ||
334 | |||
335 | if (!memcmp(from, "nolapic", 7) || | ||
336 | !memcmp(from, "disableapic", 11)) | ||
337 | disable_apic = 1; | ||
338 | |||
339 | if (!memcmp(from, "noapic", 6)) | ||
340 | skip_ioapic_setup = 1; | ||
341 | |||
342 | if (!memcmp(from, "apic", 4)) { | ||
343 | skip_ioapic_setup = 0; | ||
344 | ioapic_force = 1; | ||
345 | } | ||
346 | |||
347 | if (!memcmp(from, "mem=", 4)) | ||
348 | parse_memopt(from+4, &from); | ||
349 | |||
350 | #ifdef CONFIG_DISCONTIGMEM | ||
351 | if (!memcmp(from, "numa=", 5)) | ||
352 | numa_setup(from+5); | ||
353 | #endif | ||
354 | |||
355 | #ifdef CONFIG_GART_IOMMU | ||
356 | if (!memcmp(from,"iommu=",6)) { | ||
357 | iommu_setup(from+6); | ||
358 | } | ||
359 | #endif | ||
360 | |||
361 | if (!memcmp(from,"oops=panic", 10)) | ||
362 | panic_on_oops = 1; | ||
363 | |||
364 | if (!memcmp(from, "noexec=", 7)) | ||
365 | nonx_setup(from + 7); | ||
366 | |||
367 | next_char: | ||
368 | c = *(from++); | ||
369 | if (!c) | ||
370 | break; | ||
371 | if (COMMAND_LINE_SIZE <= ++len) | ||
372 | break; | ||
373 | *(to++) = c; | ||
374 | } | ||
375 | *to = '\0'; | ||
376 | *cmdline_p = command_line; | ||
377 | } | ||
378 | |||
379 | #ifndef CONFIG_DISCONTIGMEM | ||
380 | static void __init contig_initmem_init(void) | ||
381 | { | ||
382 | unsigned long bootmap_size, bootmap; | ||
383 | bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; | ||
384 | bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); | ||
385 | if (bootmap == -1L) | ||
386 | panic("Cannot find bootmem map of size %ld\n",bootmap_size); | ||
387 | bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); | ||
388 | e820_bootmem_free(&contig_page_data, 0, end_pfn << PAGE_SHIFT); | ||
389 | reserve_bootmem(bootmap, bootmap_size); | ||
390 | } | ||
391 | #endif | ||
392 | |||
393 | /* Use inline assembly to define this because the nops are defined | ||
394 | as inline assembly strings in the include files and we cannot | ||
395 | get them easily into strings. */ | ||
/* k8nops is the concatenation of K8_NOP1 .. K8_NOP8, emitted into .data. */
396 | asm("\t.data\nk8nops: " | ||
397 | K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 | ||
398 | K8_NOP7 K8_NOP8); | ||
399 | |||
400 | extern unsigned char k8nops[]; | ||
/*
 * Lookup table used by apply_alternatives(): k8_nops[n] points at the
 * n-th nop sequence inside k8nops.  Each offset is the sum of the
 * preceding lengths 1..n-1, so this presumably relies on K8_NOPn being
 * exactly n bytes long (definition not visible here).  Index 0 is unused.
 */
401 | static unsigned char *k8_nops[ASM_NOP_MAX+1] = { | ||
402 | NULL, | ||
403 | k8nops, | ||
404 | k8nops + 1, | ||
405 | k8nops + 1 + 2, | ||
406 | k8nops + 1 + 2 + 3, | ||
407 | k8nops + 1 + 2 + 3 + 4, | ||
408 | k8nops + 1 + 2 + 3 + 4 + 5, | ||
409 | k8nops + 1 + 2 + 3 + 4 + 5 + 6, | ||
410 | k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, | ||
411 | }; | ||
412 | |||
413 | /* Replace instructions with better alternatives for this CPU type. | ||
414 | |||
415 | This runs before SMP is initialized to avoid SMP problems with | ||
416 | self modifying code. This implies that assymetric systems where | ||
417 | APs have less capabilities than the boot processor are not handled. | ||
418 | In this case boot with "noreplacement". */ | ||
419 | void apply_alternatives(void *start, void *end) | ||
420 | { | ||
421 | struct alt_instr *a; | ||
422 | int diff, i, k; | ||
423 | for (a = start; (void *)a < end; a++) { | ||
424 | if (!boot_cpu_has(a->cpuid)) | ||
425 | continue; | ||
426 | |||
427 | BUG_ON(a->replacementlen > a->instrlen); | ||
428 | __inline_memcpy(a->instr, a->replacement, a->replacementlen); | ||
429 | diff = a->instrlen - a->replacementlen; | ||
430 | |||
431 | /* Pad the rest with nops */ | ||
432 | for (i = a->replacementlen; diff > 0; diff -= k, i += k) { | ||
433 | k = diff; | ||
434 | if (k > ASM_NOP_MAX) | ||
435 | k = ASM_NOP_MAX; | ||
436 | __inline_memcpy(a->instr + i, k8_nops[k], k); | ||
437 | } | ||
438 | } | ||
439 | } | ||
440 | |||
441 | static int no_replacement __initdata = 0; | ||
442 | |||
443 | void __init alternative_instructions(void) | ||
444 | { | ||
445 | extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; | ||
446 | if (no_replacement) | ||
447 | return; | ||
448 | apply_alternatives(__alt_instructions, __alt_instructions_end); | ||
449 | } | ||
450 | |||
451 | static int __init noreplacement_setup(char *s) | ||
452 | { | ||
453 | no_replacement = 1; | ||
454 | return 0; | ||
455 | } | ||
456 | |||
457 | __setup("noreplacement", noreplacement_setup); | ||
458 | |||
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
#ifdef CONFIG_EDD_MODULE
EXPORT_SYMBOL(edd);
#endif
/**
 * copy_edd() - Copy the BIOS EDD information
 *              from boot_params into a safe place.
 *
 * Fills in the MBR signatures, the EDD info records and both counters
 * of the global edd structure.
 */
static inline void copy_edd(void)
{
	edd.mbr_signature_nr = EDD_MBR_SIG_NR;
	edd.edd_info_nr = EDD_NR;
	memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
	memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
}
#else
/* EDD support is not configured: nothing to copy. */
static inline void copy_edd(void)
{
}
#endif
481 | |||
482 | #define EBDA_ADDR_POINTER 0x40E | ||
483 | static void __init reserve_ebda_region(void) | ||
484 | { | ||
485 | unsigned int addr; | ||
486 | /** | ||
487 | * there is a real-mode segmented pointer pointing to the | ||
488 | * 4K EBDA area at 0x40E | ||
489 | */ | ||
490 | addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER); | ||
491 | addr <<= 4; | ||
492 | if (addr) | ||
493 | reserve_bootmem_generic(addr, PAGE_SIZE); | ||
494 | } | ||
495 | |||
496 | void __init setup_arch(char **cmdline_p) | ||
497 | { | ||
498 | unsigned long low_mem_size; | ||
499 | unsigned long kernel_end; | ||
500 | |||
501 | ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); | ||
502 | drive_info = DRIVE_INFO; | ||
503 | screen_info = SCREEN_INFO; | ||
504 | edid_info = EDID_INFO; | ||
505 | saved_video_mode = SAVED_VIDEO_MODE; | ||
506 | bootloader_type = LOADER_TYPE; | ||
507 | |||
508 | #ifdef CONFIG_BLK_DEV_RAM | ||
509 | rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; | ||
510 | rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); | ||
511 | rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); | ||
512 | #endif | ||
513 | setup_memory_region(); | ||
514 | copy_edd(); | ||
515 | |||
516 | if (!MOUNT_ROOT_RDONLY) | ||
517 | root_mountflags &= ~MS_RDONLY; | ||
518 | init_mm.start_code = (unsigned long) &_text; | ||
519 | init_mm.end_code = (unsigned long) &_etext; | ||
520 | init_mm.end_data = (unsigned long) &_edata; | ||
521 | init_mm.brk = (unsigned long) &_end; | ||
522 | |||
523 | code_resource.start = virt_to_phys(&_text); | ||
524 | code_resource.end = virt_to_phys(&_etext)-1; | ||
525 | data_resource.start = virt_to_phys(&_etext); | ||
526 | data_resource.end = virt_to_phys(&_edata)-1; | ||
527 | |||
528 | parse_cmdline_early(cmdline_p); | ||
529 | |||
530 | early_identify_cpu(&boot_cpu_data); | ||
531 | |||
532 | /* | ||
533 | * partially used pages are not usable - thus | ||
534 | * we are rounding upwards: | ||
535 | */ | ||
536 | end_pfn = e820_end_of_ram(); | ||
537 | |||
538 | check_efer(); | ||
539 | |||
540 | init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); | ||
541 | |||
542 | #ifdef CONFIG_ACPI_BOOT | ||
543 | /* | ||
544 | * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). | ||
545 | * Call this early for SRAT node setup. | ||
546 | */ | ||
547 | acpi_boot_table_init(); | ||
548 | #endif | ||
549 | |||
550 | #ifdef CONFIG_ACPI_NUMA | ||
551 | /* | ||
552 | * Parse SRAT to discover nodes. | ||
553 | */ | ||
554 | acpi_numa_init(); | ||
555 | #endif | ||
556 | |||
557 | #ifdef CONFIG_DISCONTIGMEM | ||
558 | numa_initmem_init(0, end_pfn); | ||
559 | #else | ||
560 | contig_initmem_init(); | ||
561 | #endif | ||
562 | |||
563 | /* Reserve direct mapping */ | ||
564 | reserve_bootmem_generic(table_start << PAGE_SHIFT, | ||
565 | (table_end - table_start) << PAGE_SHIFT); | ||
566 | |||
567 | /* reserve kernel */ | ||
568 | kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE); | ||
569 | reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY); | ||
570 | |||
571 | /* | ||
572 | * reserve physical page 0 - it's a special BIOS page on many boxes, | ||
573 | * enabling clean reboots, SMP operation, laptop functions. | ||
574 | */ | ||
575 | reserve_bootmem_generic(0, PAGE_SIZE); | ||
576 | |||
577 | /* reserve ebda region */ | ||
578 | reserve_ebda_region(); | ||
579 | |||
580 | #ifdef CONFIG_SMP | ||
581 | /* | ||
582 | * But first pinch a few for the stack/trampoline stuff | ||
583 | * FIXME: Don't need the extra page at 4K, but need to fix | ||
584 | * trampoline before removing it. (see the GDT stuff) | ||
585 | */ | ||
586 | reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE); | ||
587 | |||
588 | /* Reserve SMP trampoline */ | ||
589 | reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE); | ||
590 | #endif | ||
591 | |||
592 | #ifdef CONFIG_ACPI_SLEEP | ||
593 | /* | ||
594 | * Reserve low memory region for sleep support. | ||
595 | */ | ||
596 | acpi_reserve_bootmem(); | ||
597 | #endif | ||
598 | #ifdef CONFIG_X86_LOCAL_APIC | ||
599 | /* | ||
600 | * Find and reserve possible boot-time SMP configuration: | ||
601 | */ | ||
602 | find_smp_config(); | ||
603 | #endif | ||
604 | #ifdef CONFIG_BLK_DEV_INITRD | ||
605 | if (LOADER_TYPE && INITRD_START) { | ||
606 | if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { | ||
607 | reserve_bootmem_generic(INITRD_START, INITRD_SIZE); | ||
608 | initrd_start = | ||
609 | INITRD_START ? INITRD_START + PAGE_OFFSET : 0; | ||
610 | initrd_end = initrd_start+INITRD_SIZE; | ||
611 | } | ||
612 | else { | ||
613 | printk(KERN_ERR "initrd extends beyond end of memory " | ||
614 | "(0x%08lx > 0x%08lx)\ndisabling initrd\n", | ||
615 | (unsigned long)(INITRD_START + INITRD_SIZE), | ||
616 | (unsigned long)(end_pfn << PAGE_SHIFT)); | ||
617 | initrd_start = 0; | ||
618 | } | ||
619 | } | ||
620 | #endif | ||
621 | paging_init(); | ||
622 | |||
623 | check_ioapic(); | ||
624 | |||
625 | #ifdef CONFIG_ACPI_BOOT | ||
626 | /* | ||
627 | * Read APIC and some other early information from ACPI tables. | ||
628 | */ | ||
629 | acpi_boot_init(); | ||
630 | #endif | ||
631 | |||
632 | #ifdef CONFIG_X86_LOCAL_APIC | ||
633 | /* | ||
634 | * get boot-time SMP configuration: | ||
635 | */ | ||
636 | if (smp_found_config) | ||
637 | get_smp_config(); | ||
638 | init_apic_mappings(); | ||
639 | #endif | ||
640 | |||
641 | /* | ||
642 | * Request address space for all standard RAM and ROM resources | ||
643 | * and also for regions reported as reserved by the e820. | ||
644 | */ | ||
645 | probe_roms(); | ||
646 | e820_reserve_resources(); | ||
647 | |||
648 | request_resource(&iomem_resource, &video_ram_resource); | ||
649 | |||
650 | { | ||
651 | unsigned i; | ||
652 | /* request I/O space for devices used on all i[345]86 PCs */ | ||
653 | for (i = 0; i < STANDARD_IO_RESOURCES; i++) | ||
654 | request_resource(&ioport_resource, &standard_io_resources[i]); | ||
655 | } | ||
656 | |||
657 | /* Will likely break when you have unassigned resources with more | ||
658 | than 4GB memory and bridges that don't support more than 4GB. | ||
659 | Doing it properly would require to use pci_alloc_consistent | ||
660 | in this case. */ | ||
661 | low_mem_size = ((end_pfn << PAGE_SHIFT) + 0xfffff) & ~0xfffff; | ||
662 | if (low_mem_size > pci_mem_start) | ||
663 | pci_mem_start = low_mem_size; | ||
664 | |||
665 | #ifdef CONFIG_GART_IOMMU | ||
666 | iommu_hole_init(); | ||
667 | #endif | ||
668 | |||
669 | #ifdef CONFIG_VT | ||
670 | #if defined(CONFIG_VGA_CONSOLE) | ||
671 | conswitchp = &vga_con; | ||
672 | #elif defined(CONFIG_DUMMY_CONSOLE) | ||
673 | conswitchp = &dummy_con; | ||
674 | #endif | ||
675 | #endif | ||
676 | } | ||
677 | |||
678 | static int __init get_model_name(struct cpuinfo_x86 *c) | ||
679 | { | ||
680 | unsigned int *v; | ||
681 | |||
682 | if (c->x86_cpuid_level < 0x80000004) | ||
683 | return 0; | ||
684 | |||
685 | v = (unsigned int *) c->x86_model_id; | ||
686 | cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); | ||
687 | cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); | ||
688 | cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); | ||
689 | c->x86_model_id[48] = 0; | ||
690 | return 1; | ||
691 | } | ||
692 | |||
693 | |||
694 | static void __init display_cacheinfo(struct cpuinfo_x86 *c) | ||
695 | { | ||
696 | unsigned int n, dummy, eax, ebx, ecx, edx; | ||
697 | |||
698 | n = c->x86_cpuid_level; | ||
699 | |||
700 | if (n >= 0x80000005) { | ||
701 | cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); | ||
702 | printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", | ||
703 | edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); | ||
704 | c->x86_cache_size=(ecx>>24)+(edx>>24); | ||
705 | /* On K8 L1 TLB is inclusive, so don't count it */ | ||
706 | c->x86_tlbsize = 0; | ||
707 | } | ||
708 | |||
709 | if (n >= 0x80000006) { | ||
710 | cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); | ||
711 | ecx = cpuid_ecx(0x80000006); | ||
712 | c->x86_cache_size = ecx >> 16; | ||
713 | c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); | ||
714 | |||
715 | printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", | ||
716 | c->x86_cache_size, ecx & 0xFF); | ||
717 | } | ||
718 | |||
719 | if (n >= 0x80000007) | ||
720 | cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); | ||
721 | if (n >= 0x80000008) { | ||
722 | cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); | ||
723 | c->x86_virt_bits = (eax >> 8) & 0xff; | ||
724 | c->x86_phys_bits = eax & 0xff; | ||
725 | } | ||
726 | } | ||
727 | |||
728 | |||
729 | static int __init init_amd(struct cpuinfo_x86 *c) | ||
730 | { | ||
731 | int r; | ||
732 | int level; | ||
733 | #ifdef CONFIG_NUMA | ||
734 | int cpu; | ||
735 | #endif | ||
736 | |||
737 | /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; | ||
738 | 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ | ||
739 | clear_bit(0*32+31, &c->x86_capability); | ||
740 | |||
741 | /* C-stepping K8? */ | ||
742 | level = cpuid_eax(1); | ||
743 | if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) | ||
744 | set_bit(X86_FEATURE_K8_C, &c->x86_capability); | ||
745 | |||
746 | r = get_model_name(c); | ||
747 | if (!r) { | ||
748 | switch (c->x86) { | ||
749 | case 15: | ||
750 | /* Should distinguish Models here, but this is only | ||
751 | a fallback anyways. */ | ||
752 | strcpy(c->x86_model_id, "Hammer"); | ||
753 | break; | ||
754 | } | ||
755 | } | ||
756 | display_cacheinfo(c); | ||
757 | |||
758 | if (c->x86_cpuid_level >= 0x80000008) { | ||
759 | c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; | ||
760 | if (c->x86_num_cores & (c->x86_num_cores - 1)) | ||
761 | c->x86_num_cores = 1; | ||
762 | |||
763 | #ifdef CONFIG_NUMA | ||
764 | /* On a dual core setup the lower bits of apic id | ||
765 | distingush the cores. Fix up the CPU<->node mappings | ||
766 | here based on that. | ||
767 | Assumes number of cores is a power of two. | ||
768 | When using SRAT use mapping from SRAT. */ | ||
769 | cpu = c->x86_apicid; | ||
770 | if (acpi_numa <= 0 && c->x86_num_cores > 1) { | ||
771 | cpu_to_node[cpu] = cpu >> hweight32(c->x86_num_cores - 1); | ||
772 | if (!node_online(cpu_to_node[cpu])) | ||
773 | cpu_to_node[cpu] = first_node(node_online_map); | ||
774 | } | ||
775 | printk(KERN_INFO "CPU %d(%d) -> Node %d\n", | ||
776 | cpu, c->x86_num_cores, cpu_to_node[cpu]); | ||
777 | #endif | ||
778 | } | ||
779 | |||
780 | return r; | ||
781 | } | ||
782 | |||
783 | static void __init detect_ht(struct cpuinfo_x86 *c) | ||
784 | { | ||
785 | #ifdef CONFIG_SMP | ||
786 | u32 eax, ebx, ecx, edx; | ||
787 | int index_lsb, index_msb, tmp; | ||
788 | int cpu = smp_processor_id(); | ||
789 | |||
790 | if (!cpu_has(c, X86_FEATURE_HT)) | ||
791 | return; | ||
792 | |||
793 | cpuid(1, &eax, &ebx, &ecx, &edx); | ||
794 | smp_num_siblings = (ebx & 0xff0000) >> 16; | ||
795 | |||
796 | if (smp_num_siblings == 1) { | ||
797 | printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); | ||
798 | } else if (smp_num_siblings > 1) { | ||
799 | index_lsb = 0; | ||
800 | index_msb = 31; | ||
801 | /* | ||
802 | * At this point we only support two siblings per | ||
803 | * processor package. | ||
804 | */ | ||
805 | if (smp_num_siblings > NR_CPUS) { | ||
806 | printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); | ||
807 | smp_num_siblings = 1; | ||
808 | return; | ||
809 | } | ||
810 | tmp = smp_num_siblings; | ||
811 | while ((tmp & 1) == 0) { | ||
812 | tmp >>=1 ; | ||
813 | index_lsb++; | ||
814 | } | ||
815 | tmp = smp_num_siblings; | ||
816 | while ((tmp & 0x80000000 ) == 0) { | ||
817 | tmp <<=1 ; | ||
818 | index_msb--; | ||
819 | } | ||
820 | if (index_lsb != index_msb ) | ||
821 | index_msb++; | ||
822 | phys_proc_id[cpu] = phys_pkg_id(index_msb); | ||
823 | |||
824 | printk(KERN_INFO "CPU: Physical Processor ID: %d\n", | ||
825 | phys_proc_id[cpu]); | ||
826 | } | ||
827 | #endif | ||
828 | } | ||
829 | |||
830 | static void __init sched_cmp_hack(struct cpuinfo_x86 *c) | ||
831 | { | ||
832 | #ifdef CONFIG_SMP | ||
833 | /* AMD dual core looks like HT but isn't really. Hide it from the | ||
834 | scheduler. This works around problems with the domain scheduler. | ||
835 | Also probably gives slightly better scheduling and disables | ||
836 | SMT nice which is harmful on dual core. | ||
837 | TBD tune the domain scheduler for dual core. */ | ||
838 | if (c->x86_vendor == X86_VENDOR_AMD && cpu_has(c, X86_FEATURE_CMP_LEGACY)) | ||
839 | smp_num_siblings = 1; | ||
840 | #endif | ||
841 | } | ||
842 | |||
843 | static void __init init_intel(struct cpuinfo_x86 *c) | ||
844 | { | ||
845 | /* Cache sizes */ | ||
846 | unsigned n; | ||
847 | |||
848 | init_intel_cacheinfo(c); | ||
849 | n = c->x86_cpuid_level; | ||
850 | if (n >= 0x80000008) { | ||
851 | unsigned eax = cpuid_eax(0x80000008); | ||
852 | c->x86_virt_bits = (eax >> 8) & 0xff; | ||
853 | c->x86_phys_bits = eax & 0xff; | ||
854 | } | ||
855 | |||
856 | if (c->x86 == 15) | ||
857 | c->x86_cache_alignment = c->x86_clflush_size * 2; | ||
858 | } | ||
859 | |||
860 | void __init get_cpu_vendor(struct cpuinfo_x86 *c) | ||
861 | { | ||
862 | char *v = c->x86_vendor_id; | ||
863 | |||
864 | if (!strcmp(v, "AuthenticAMD")) | ||
865 | c->x86_vendor = X86_VENDOR_AMD; | ||
866 | else if (!strcmp(v, "GenuineIntel")) | ||
867 | c->x86_vendor = X86_VENDOR_INTEL; | ||
868 | else | ||
869 | c->x86_vendor = X86_VENDOR_UNKNOWN; | ||
870 | } | ||
871 | |||
/*
 * Model name table for one vendor/family pair, with room for 16 names.
 * NOTE(review): presumably model_names[] is indexed by model number, and
 * no user of this struct is visible in this part of the file - confirm.
 */
872 | struct cpu_model_info { | ||
873 | int vendor; | ||
874 | int family; | ||
875 | char *model_names[16]; | ||
876 | }; | ||
877 | |||
878 | /* Do some early cpuid on the boot CPU to get some parameter that are | ||
879 | needed before check_bugs. Everything advanced is in identify_cpu | ||
880 | below. */ | ||
881 | void __init early_identify_cpu(struct cpuinfo_x86 *c) | ||
882 | { | ||
883 | u32 tfms; | ||
884 | |||
885 | c->loops_per_jiffy = loops_per_jiffy; | ||
886 | c->x86_cache_size = -1; | ||
887 | c->x86_vendor = X86_VENDOR_UNKNOWN; | ||
888 | c->x86_model = c->x86_mask = 0; /* So far unknown... */ | ||
889 | c->x86_vendor_id[0] = '\0'; /* Unset */ | ||
890 | c->x86_model_id[0] = '\0'; /* Unset */ | ||
891 | c->x86_clflush_size = 64; | ||
892 | c->x86_cache_alignment = c->x86_clflush_size; | ||
893 | c->x86_num_cores = 1; | ||
894 | c->x86_apicid = c == &boot_cpu_data ? 0 : c - cpu_data; | ||
895 | c->x86_cpuid_level = 0; | ||
896 | memset(&c->x86_capability, 0, sizeof c->x86_capability); | ||
897 | |||
898 | /* Get vendor name */ | ||
899 | cpuid(0x00000000, (unsigned int *)&c->cpuid_level, | ||
900 | (unsigned int *)&c->x86_vendor_id[0], | ||
901 | (unsigned int *)&c->x86_vendor_id[8], | ||
902 | (unsigned int *)&c->x86_vendor_id[4]); | ||
903 | |||
904 | get_cpu_vendor(c); | ||
905 | |||
906 | /* Initialize the standard set of capabilities */ | ||
907 | /* Note that the vendor-specific code below might override */ | ||
908 | |||
909 | /* Intel-defined flags: level 0x00000001 */ | ||
910 | if (c->cpuid_level >= 0x00000001) { | ||
911 | __u32 misc; | ||
912 | cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], | ||
913 | &c->x86_capability[0]); | ||
914 | c->x86 = (tfms >> 8) & 0xf; | ||
915 | c->x86_model = (tfms >> 4) & 0xf; | ||
916 | c->x86_mask = tfms & 0xf; | ||
917 | if (c->x86 == 0xf) { | ||
918 | c->x86 += (tfms >> 20) & 0xff; | ||
919 | c->x86_model += ((tfms >> 16) & 0xF) << 4; | ||
920 | } | ||
921 | if (c->x86_capability[0] & (1<<19)) | ||
922 | c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; | ||
923 | c->x86_apicid = misc >> 24; | ||
924 | } else { | ||
925 | /* Have CPUID level 0 only - unheard of */ | ||
926 | c->x86 = 4; | ||
927 | } | ||
928 | } | ||
929 | |||
930 | /* | ||
931 | * This does the hard work of actually picking apart the CPU stuff... | ||
932 | */ | ||
933 | void __init identify_cpu(struct cpuinfo_x86 *c) | ||
934 | { | ||
935 | int i; | ||
936 | u32 xlvl; | ||
937 | |||
938 | early_identify_cpu(c); | ||
939 | |||
940 | /* AMD-defined flags: level 0x80000001 */ | ||
941 | xlvl = cpuid_eax(0x80000000); | ||
942 | c->x86_cpuid_level = xlvl; | ||
943 | if ((xlvl & 0xffff0000) == 0x80000000) { | ||
944 | if (xlvl >= 0x80000001) { | ||
945 | c->x86_capability[1] = cpuid_edx(0x80000001); | ||
946 | c->x86_capability[5] = cpuid_ecx(0x80000001); | ||
947 | } | ||
948 | if (xlvl >= 0x80000004) | ||
949 | get_model_name(c); /* Default name */ | ||
950 | } | ||
951 | |||
952 | /* Transmeta-defined flags: level 0x80860001 */ | ||
953 | xlvl = cpuid_eax(0x80860000); | ||
954 | if ((xlvl & 0xffff0000) == 0x80860000) { | ||
955 | /* Don't set x86_cpuid_level here for now to not confuse. */ | ||
956 | if (xlvl >= 0x80860001) | ||
957 | c->x86_capability[2] = cpuid_edx(0x80860001); | ||
958 | } | ||
959 | |||
960 | /* | ||
961 | * Vendor-specific initialization. In this section we | ||
962 | * canonicalize the feature flags, meaning if there are | ||
963 | * features a certain CPU supports which CPUID doesn't | ||
964 | * tell us, CPUID claiming incorrect flags, or other bugs, | ||
965 | * we handle them here. | ||
966 | * | ||
967 | * At the end of this section, c->x86_capability better | ||
968 | * indicate the features this CPU genuinely supports! | ||
969 | */ | ||
970 | switch (c->x86_vendor) { | ||
971 | case X86_VENDOR_AMD: | ||
972 | init_amd(c); | ||
973 | break; | ||
974 | |||
975 | case X86_VENDOR_INTEL: | ||
976 | init_intel(c); | ||
977 | break; | ||
978 | |||
979 | case X86_VENDOR_UNKNOWN: | ||
980 | default: | ||
981 | display_cacheinfo(c); | ||
982 | break; | ||
983 | } | ||
984 | |||
985 | select_idle_routine(c); | ||
986 | detect_ht(c); | ||
987 | sched_cmp_hack(c); | ||
988 | |||
989 | /* | ||
990 | * On SMP, boot_cpu_data holds the common feature set between | ||
991 | * all CPUs; so make sure that we indicate which features are | ||
992 | * common between the CPUs. The first time this routine gets | ||
993 | * executed, c == &boot_cpu_data. | ||
994 | */ | ||
995 | if (c != &boot_cpu_data) { | ||
996 | /* AND the already accumulated flags with these */ | ||
997 | for (i = 0 ; i < NCAPINTS ; i++) | ||
998 | boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; | ||
999 | } | ||
1000 | |||
1001 | #ifdef CONFIG_X86_MCE | ||
1002 | mcheck_init(c); | ||
1003 | #endif | ||
1004 | #ifdef CONFIG_NUMA | ||
1005 | if (c != &boot_cpu_data) | ||
1006 | numa_add_cpu(c - cpu_data); | ||
1007 | #endif | ||
1008 | } | ||
1009 | |||
1010 | |||
1011 | void __init print_cpu_info(struct cpuinfo_x86 *c) | ||
1012 | { | ||
1013 | if (c->x86_model_id[0]) | ||
1014 | printk("%s", c->x86_model_id); | ||
1015 | |||
1016 | if (c->x86_mask || c->cpuid_level >= 0) | ||
1017 | printk(" stepping %02x\n", c->x86_mask); | ||
1018 | else | ||
1019 | printk("\n"); | ||
1020 | } | ||
1021 | |||
1022 | /* | ||
1023 | * Get CPU information for use by the procfs. | ||
1024 | */ | ||
1025 | |||
1026 | static int show_cpuinfo(struct seq_file *m, void *v) | ||
1027 | { | ||
1028 | struct cpuinfo_x86 *c = v; | ||
1029 | |||
1030 | /* | ||
1031 | * These flag bits must match the definitions in <asm/cpufeature.h>. | ||
1032 | * NULL means this bit is undefined or reserved; either way it doesn't | ||
1033 | * have meaning as far as Linux is concerned. Note that it's important | ||
1034 | * to realize there is a difference between this table and CPUID -- if | ||
1035 | * applications want to get the raw CPUID data, they should access | ||
1036 | * /dev/cpu/<cpu_nr>/cpuid instead. | ||
1037 | */ | ||
1038 | static char *x86_cap_flags[] = { | ||
1039 | /* Intel-defined */ | ||
1040 | "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", | ||
1041 | "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", | ||
1042 | "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", | ||
1043 | "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL, | ||
1044 | |||
1045 | /* AMD-defined */ | ||
1046 | "pni", NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1047 | NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, | ||
1048 | NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, | ||
1049 | NULL, "fxsr_opt", NULL, NULL, NULL, "lm", "3dnowext", "3dnow", | ||
1050 | |||
1051 | /* Transmeta-defined */ | ||
1052 | "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, | ||
1053 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1054 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1055 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1056 | |||
1057 | /* Other (Linux-defined) */ | ||
1058 | "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", NULL, NULL, NULL, NULL, | ||
1059 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1060 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1061 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1062 | |||
1063 | /* Intel-defined (#2) */ | ||
1064 | "pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "est", | ||
1065 | "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, | ||
1066 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1067 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1068 | |||
1069 | /* AMD-defined (#2) */ | ||
1070 | "lahf_lm", "cmp_legacy", NULL, NULL, NULL, NULL, NULL, NULL, | ||
1071 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1072 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1073 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL | ||
1074 | }; | ||
1075 | static char *x86_power_flags[] = { | ||
1076 | "ts", /* temperature sensor */ | ||
1077 | "fid", /* frequency id control */ | ||
1078 | "vid", /* voltage id control */ | ||
1079 | "ttp", /* thermal trip */ | ||
1080 | "tm", | ||
1081 | "stc" | ||
1082 | }; | ||
1083 | |||
1084 | |||
1085 | #ifdef CONFIG_SMP | ||
1086 | if (!cpu_online(c-cpu_data)) | ||
1087 | return 0; | ||
1088 | #endif | ||
1089 | |||
1090 | seq_printf(m,"processor\t: %u\n" | ||
1091 | "vendor_id\t: %s\n" | ||
1092 | "cpu family\t: %d\n" | ||
1093 | "model\t\t: %d\n" | ||
1094 | "model name\t: %s\n", | ||
1095 | (unsigned)(c-cpu_data), | ||
1096 | c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", | ||
1097 | c->x86, | ||
1098 | (int)c->x86_model, | ||
1099 | c->x86_model_id[0] ? c->x86_model_id : "unknown"); | ||
1100 | |||
1101 | if (c->x86_mask || c->cpuid_level >= 0) | ||
1102 | seq_printf(m, "stepping\t: %d\n", c->x86_mask); | ||
1103 | else | ||
1104 | seq_printf(m, "stepping\t: unknown\n"); | ||
1105 | |||
1106 | if (cpu_has(c,X86_FEATURE_TSC)) { | ||
1107 | seq_printf(m, "cpu MHz\t\t: %u.%03u\n", | ||
1108 | cpu_khz / 1000, (cpu_khz % 1000)); | ||
1109 | } | ||
1110 | |||
1111 | /* Cache size */ | ||
1112 | if (c->x86_cache_size >= 0) | ||
1113 | seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); | ||
1114 | |||
1115 | #ifdef CONFIG_SMP | ||
1116 | seq_printf(m, "physical id\t: %d\n", phys_proc_id[c - cpu_data]); | ||
1117 | seq_printf(m, "siblings\t: %d\n", c->x86_num_cores * smp_num_siblings); | ||
1118 | #endif | ||
1119 | |||
1120 | seq_printf(m, | ||
1121 | "fpu\t\t: yes\n" | ||
1122 | "fpu_exception\t: yes\n" | ||
1123 | "cpuid level\t: %d\n" | ||
1124 | "wp\t\t: yes\n" | ||
1125 | "flags\t\t:", | ||
1126 | c->cpuid_level); | ||
1127 | |||
1128 | { | ||
1129 | int i; | ||
1130 | for ( i = 0 ; i < 32*NCAPINTS ; i++ ) | ||
1131 | if ( test_bit(i, &c->x86_capability) && | ||
1132 | x86_cap_flags[i] != NULL ) | ||
1133 | seq_printf(m, " %s", x86_cap_flags[i]); | ||
1134 | } | ||
1135 | |||
1136 | seq_printf(m, "\nbogomips\t: %lu.%02lu\n", | ||
1137 | c->loops_per_jiffy/(500000/HZ), | ||
1138 | (c->loops_per_jiffy/(5000/HZ)) % 100); | ||
1139 | |||
1140 | if (c->x86_tlbsize > 0) | ||
1141 | seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); | ||
1142 | seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); | ||
1143 | seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); | ||
1144 | |||
1145 | seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", | ||
1146 | c->x86_phys_bits, c->x86_virt_bits); | ||
1147 | |||
1148 | seq_printf(m, "power management:"); | ||
1149 | { | ||
1150 | unsigned i; | ||
1151 | for (i = 0; i < 32; i++) | ||
1152 | if (c->x86_power & (1 << i)) { | ||
1153 | if (i < ARRAY_SIZE(x86_power_flags)) | ||
1154 | seq_printf(m, " %s", x86_power_flags[i]); | ||
1155 | else | ||
1156 | seq_printf(m, " [%d]", i); | ||
1157 | } | ||
1158 | } | ||
1159 | seq_printf(m, "\n"); | ||
1160 | |||
1161 | if (c->x86_num_cores > 1) | ||
1162 | seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores); | ||
1163 | |||
1164 | seq_printf(m, "\n\n"); | ||
1165 | |||
1166 | return 0; | ||
1167 | } | ||
1168 | |||
1169 | static void *c_start(struct seq_file *m, loff_t *pos) | ||
1170 | { | ||
1171 | return *pos < NR_CPUS ? cpu_data + *pos : NULL; | ||
1172 | } | ||
1173 | |||
1174 | static void *c_next(struct seq_file *m, void *v, loff_t *pos) | ||
1175 | { | ||
1176 | ++*pos; | ||
1177 | return c_start(m, pos); | ||
1178 | } | ||
1179 | |||
/* seq_file ->stop: nothing to do, c_start() acquires nothing. */
static void c_stop(struct seq_file *m, void *v)
{
	/* intentionally empty */
}
1183 | |||
1184 | struct seq_operations cpuinfo_op = { | ||
1185 | .start =c_start, | ||
1186 | .next = c_next, | ||
1187 | .stop = c_stop, | ||
1188 | .show = show_cpuinfo, | ||
1189 | }; | ||
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c new file mode 100644 index 000000000000..b5305b04bc40 --- /dev/null +++ b/arch/x86_64/kernel/setup64.c | |||
@@ -0,0 +1,292 @@ | |||
1 | /* | ||
2 | * X86-64 specific CPU setup. | ||
3 | * Copyright (C) 1995 Linus Torvalds | ||
4 | * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. | ||
5 | * See setup.c for older changelog. | ||
6 | * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $ | ||
7 | */ | ||
8 | #include <linux/config.h> | ||
9 | #include <linux/init.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/string.h> | ||
13 | #include <linux/bootmem.h> | ||
14 | #include <linux/bitops.h> | ||
15 | #include <asm/pda.h> | ||
16 | #include <asm/pgtable.h> | ||
17 | #include <asm/processor.h> | ||
18 | #include <asm/desc.h> | ||
19 | #include <asm/atomic.h> | ||
20 | #include <asm/mmu_context.h> | ||
21 | #include <asm/smp.h> | ||
22 | #include <asm/i387.h> | ||
23 | #include <asm/percpu.h> | ||
24 | #include <asm/mtrr.h> | ||
25 | #include <asm/proto.h> | ||
26 | #include <asm/mman.h> | ||
27 | #include <asm/numa.h> | ||
28 | |||
29 | char x86_boot_params[2048] __initdata = {0,}; | ||
30 | |||
31 | cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; | ||
32 | |||
33 | struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; | ||
34 | |||
35 | extern struct task_struct init_task; | ||
36 | |||
37 | extern unsigned char __per_cpu_start[], __per_cpu_end[]; | ||
38 | |||
39 | extern struct desc_ptr cpu_gdt_descr[]; | ||
40 | struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; | ||
41 | |||
42 | char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); | ||
43 | |||
44 | unsigned long __supported_pte_mask = ~0UL; | ||
45 | static int do_not_nx __initdata = 0; | ||
46 | |||
47 | /* noexec=on|off | ||
48 | Control non executable mappings for 64bit processes. | ||
49 | |||
50 | on Enable(default) | ||
51 | off Disable | ||
52 | */ | ||
53 | int __init nonx_setup(char *str) | ||
54 | { | ||
55 | if (!strncmp(str, "on", 2)) { | ||
56 | __supported_pte_mask |= _PAGE_NX; | ||
57 | do_not_nx = 0; | ||
58 | } else if (!strncmp(str, "off", 3)) { | ||
59 | do_not_nx = 1; | ||
60 | __supported_pte_mask &= ~_PAGE_NX; | ||
61 | } | ||
62 | return 0; | ||
63 | } | ||
64 | __setup("noexec=", nonx_setup); /* parsed early actually */ | ||
65 | |||
66 | int force_personality32 = READ_IMPLIES_EXEC; | ||
67 | |||
68 | /* noexec32=on|off | ||
69 | Control non executable heap for 32bit processes. | ||
70 | To control the stack too use noexec=off | ||
71 | |||
72 | on PROT_READ does not imply PROT_EXEC for 32bit processes | ||
73 | off PROT_READ implies PROT_EXEC (default) | ||
74 | */ | ||
75 | static int __init nonx32_setup(char *str) | ||
76 | { | ||
77 | if (!strcmp(str, "on")) | ||
78 | force_personality32 &= ~READ_IMPLIES_EXEC; | ||
79 | else if (!strcmp(str, "off")) | ||
80 | force_personality32 |= READ_IMPLIES_EXEC; | ||
81 | return 0; | ||
82 | } | ||
83 | __setup("noexec32=", nonx32_setup); | ||
84 | |||
85 | /* | ||
86 | * Great future plan: | ||
87 | * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. | ||
88 | * Always point %gs to its beginning | ||
89 | */ | ||
90 | void __init setup_per_cpu_areas(void) | ||
91 | { | ||
92 | int i; | ||
93 | unsigned long size; | ||
94 | |||
95 | /* Copy section for each CPU (we discard the original) */ | ||
96 | size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); | ||
97 | #ifdef CONFIG_MODULES | ||
98 | if (size < PERCPU_ENOUGH_ROOM) | ||
99 | size = PERCPU_ENOUGH_ROOM; | ||
100 | #endif | ||
101 | |||
102 | for (i = 0; i < NR_CPUS; i++) { | ||
103 | unsigned char *ptr; | ||
104 | |||
105 | if (!NODE_DATA(cpu_to_node(i))) { | ||
106 | printk("cpu with no node %d, num_online_nodes %d\n", | ||
107 | i, num_online_nodes()); | ||
108 | ptr = alloc_bootmem(size); | ||
109 | } else { | ||
110 | ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); | ||
111 | } | ||
112 | if (!ptr) | ||
113 | panic("Cannot allocate cpu data for CPU %d\n", i); | ||
114 | cpu_pda[i].data_offset = ptr - __per_cpu_start; | ||
115 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); | ||
116 | } | ||
117 | } | ||
118 | |||
119 | void pda_init(int cpu) | ||
120 | { | ||
121 | struct x8664_pda *pda = &cpu_pda[cpu]; | ||
122 | |||
123 | /* Setup up data that may be needed in __get_free_pages early */ | ||
124 | asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); | ||
125 | wrmsrl(MSR_GS_BASE, cpu_pda + cpu); | ||
126 | |||
127 | pda->me = pda; | ||
128 | pda->cpunumber = cpu; | ||
129 | pda->irqcount = -1; | ||
130 | pda->kernelstack = | ||
131 | (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; | ||
132 | pda->active_mm = &init_mm; | ||
133 | pda->mmu_state = 0; | ||
134 | |||
135 | if (cpu == 0) { | ||
136 | /* others are initialized in smpboot.c */ | ||
137 | pda->pcurrent = &init_task; | ||
138 | pda->irqstackptr = boot_cpu_stack; | ||
139 | } else { | ||
140 | pda->irqstackptr = (char *) | ||
141 | __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); | ||
142 | if (!pda->irqstackptr) | ||
143 | panic("cannot allocate irqstack for cpu %d", cpu); | ||
144 | } | ||
145 | |||
146 | asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); | ||
147 | |||
148 | pda->irqstackptr += IRQSTACKSIZE-64; | ||
149 | } | ||
150 | |||
151 | char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ] | ||
152 | __attribute__((section(".bss.page_aligned"))); | ||
153 | |||
154 | /* May not be marked __init: used by software suspend */ | ||
155 | void syscall_init(void) | ||
156 | { | ||
157 | /* | ||
158 | * LSTAR and STAR live in a bit strange symbiosis. | ||
159 | * They both write to the same internal register. STAR allows to set CS/DS | ||
160 | * but only a 32bit target. LSTAR sets the 64bit rip. | ||
161 | */ | ||
162 | wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); | ||
163 | wrmsrl(MSR_LSTAR, system_call); | ||
164 | |||
165 | #ifdef CONFIG_IA32_EMULATION | ||
166 | syscall32_cpu_init (); | ||
167 | #endif | ||
168 | |||
169 | /* Flags to clear on syscall */ | ||
170 | wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); | ||
171 | } | ||
172 | |||
173 | void __init check_efer(void) | ||
174 | { | ||
175 | unsigned long efer; | ||
176 | |||
177 | rdmsrl(MSR_EFER, efer); | ||
178 | if (!(efer & EFER_NX) || do_not_nx) { | ||
179 | __supported_pte_mask &= ~_PAGE_NX; | ||
180 | } | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * cpu_init() initializes state that is per-CPU. Some data is already | ||
185 | * initialized (naturally) in the bootstrap process, such as the GDT | ||
186 | * and IDT. We reload them nevertheless, this function acts as a | ||
187 | * 'CPU state barrier', nothing should get across. | ||
188 | * A lot of state is already set up in PDA init. | ||
189 | */ | ||
190 | void __init cpu_init (void) | ||
191 | { | ||
192 | #ifdef CONFIG_SMP | ||
193 | int cpu = stack_smp_processor_id(); | ||
194 | #else | ||
195 | int cpu = smp_processor_id(); | ||
196 | #endif | ||
197 | struct tss_struct *t = &per_cpu(init_tss, cpu); | ||
198 | unsigned long v; | ||
199 | char *estacks = NULL; | ||
200 | struct task_struct *me; | ||
201 | int i; | ||
202 | |||
203 | /* CPU 0 is initialised in head64.c */ | ||
204 | if (cpu != 0) { | ||
205 | pda_init(cpu); | ||
206 | } else | ||
207 | estacks = boot_exception_stacks; | ||
208 | |||
209 | me = current; | ||
210 | |||
211 | if (cpu_test_and_set(cpu, cpu_initialized)) | ||
212 | panic("CPU#%d already initialized!\n", cpu); | ||
213 | |||
214 | printk("Initializing CPU#%d\n", cpu); | ||
215 | |||
216 | clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); | ||
217 | |||
218 | /* | ||
219 | * Initialize the per-CPU GDT with the boot GDT, | ||
220 | * and set up the GDT descriptor: | ||
221 | */ | ||
222 | if (cpu) { | ||
223 | memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE); | ||
224 | } | ||
225 | |||
226 | cpu_gdt_descr[cpu].size = GDT_SIZE; | ||
227 | cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu]; | ||
228 | asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); | ||
229 | asm volatile("lidt %0" :: "m" (idt_descr)); | ||
230 | |||
231 | memcpy(me->thread.tls_array, cpu_gdt_table[cpu], GDT_ENTRY_TLS_ENTRIES * 8); | ||
232 | |||
233 | /* | ||
234 | * Delete NT | ||
235 | */ | ||
236 | |||
237 | asm volatile("pushfq ; popq %%rax ; btr $14,%%rax ; pushq %%rax ; popfq" ::: "eax"); | ||
238 | |||
239 | syscall_init(); | ||
240 | |||
241 | wrmsrl(MSR_FS_BASE, 0); | ||
242 | wrmsrl(MSR_KERNEL_GS_BASE, 0); | ||
243 | barrier(); | ||
244 | |||
245 | check_efer(); | ||
246 | |||
247 | /* | ||
248 | * set up and load the per-CPU TSS | ||
249 | */ | ||
250 | for (v = 0; v < N_EXCEPTION_STACKS; v++) { | ||
251 | if (cpu) { | ||
252 | estacks = (char *)__get_free_pages(GFP_ATOMIC, | ||
253 | EXCEPTION_STACK_ORDER); | ||
254 | if (!estacks) | ||
255 | panic("Cannot allocate exception stack %ld %d\n", | ||
256 | v, cpu); | ||
257 | } | ||
258 | estacks += EXCEPTION_STKSZ; | ||
259 | t->ist[v] = (unsigned long)estacks; | ||
260 | } | ||
261 | |||
262 | t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); | ||
263 | /* | ||
264 | * <= is required because the CPU will access up to | ||
265 | * 8 bits beyond the end of the IO permission bitmap. | ||
266 | */ | ||
267 | for (i = 0; i <= IO_BITMAP_LONGS; i++) | ||
268 | t->io_bitmap[i] = ~0UL; | ||
269 | |||
270 | atomic_inc(&init_mm.mm_count); | ||
271 | me->active_mm = &init_mm; | ||
272 | if (me->mm) | ||
273 | BUG(); | ||
274 | enter_lazy_tlb(&init_mm, me); | ||
275 | |||
276 | set_tss_desc(cpu, t); | ||
277 | load_TR_desc(); | ||
278 | load_LDT(&init_mm.context); | ||
279 | |||
280 | /* | ||
281 | * Clear all 6 debug registers: | ||
282 | */ | ||
283 | |||
284 | set_debug(0UL, 0); | ||
285 | set_debug(0UL, 1); | ||
286 | set_debug(0UL, 2); | ||
287 | set_debug(0UL, 3); | ||
288 | set_debug(0UL, 6); | ||
289 | set_debug(0UL, 7); | ||
290 | |||
291 | fpu_init(); | ||
292 | } | ||
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c new file mode 100644 index 000000000000..5ace32c6ac5c --- /dev/null +++ b/arch/x86_64/kernel/signal.c | |||
@@ -0,0 +1,486 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/signal.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs | ||
6 | * | ||
7 | * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson | ||
8 | * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes | ||
9 | * 2000-2002 x86-64 support by Andi Kleen | ||
10 | * | ||
11 | * $Id: signal.c,v 1.18 2001/10/17 22:30:37 ak Exp $ | ||
12 | */ | ||
13 | |||
14 | #include <linux/sched.h> | ||
15 | #include <linux/mm.h> | ||
16 | #include <linux/smp.h> | ||
17 | #include <linux/smp_lock.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/signal.h> | ||
20 | #include <linux/errno.h> | ||
21 | #include <linux/wait.h> | ||
22 | #include <linux/ptrace.h> | ||
23 | #include <linux/unistd.h> | ||
24 | #include <linux/stddef.h> | ||
25 | #include <linux/personality.h> | ||
26 | #include <linux/compiler.h> | ||
27 | #include <asm/ucontext.h> | ||
28 | #include <asm/uaccess.h> | ||
29 | #include <asm/i387.h> | ||
30 | #include <asm/proto.h> | ||
31 | |||
32 | /* #define DEBUG_SIG 1 */ | ||
33 | |||
34 | #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) | ||
35 | |||
36 | void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | ||
37 | sigset_t *set, struct pt_regs * regs); | ||
38 | void ia32_setup_frame(int sig, struct k_sigaction *ka, | ||
39 | sigset_t *set, struct pt_regs * regs); | ||
40 | |||
41 | asmlinkage long | ||
42 | sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs) | ||
43 | { | ||
44 | sigset_t saveset, newset; | ||
45 | |||
46 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
47 | if (sigsetsize != sizeof(sigset_t)) | ||
48 | return -EINVAL; | ||
49 | |||
50 | if (copy_from_user(&newset, unewset, sizeof(newset))) | ||
51 | return -EFAULT; | ||
52 | sigdelsetmask(&newset, ~_BLOCKABLE); | ||
53 | |||
54 | spin_lock_irq(¤t->sighand->siglock); | ||
55 | saveset = current->blocked; | ||
56 | current->blocked = newset; | ||
57 | recalc_sigpending(); | ||
58 | spin_unlock_irq(¤t->sighand->siglock); | ||
59 | #ifdef DEBUG_SIG | ||
60 | printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n", | ||
61 | saveset, newset, regs, regs->rip); | ||
62 | #endif | ||
63 | regs->rax = -EINTR; | ||
64 | while (1) { | ||
65 | current->state = TASK_INTERRUPTIBLE; | ||
66 | schedule(); | ||
67 | if (do_signal(regs, &saveset)) | ||
68 | return -EINTR; | ||
69 | } | ||
70 | } | ||
71 | |||
72 | asmlinkage long | ||
73 | sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, | ||
74 | struct pt_regs *regs) | ||
75 | { | ||
76 | return do_sigaltstack(uss, uoss, regs->rsp); | ||
77 | } | ||
78 | |||
79 | |||
80 | /* | ||
81 | * Do a signal return; undo the signal stack. | ||
82 | */ | ||
83 | |||
84 | struct rt_sigframe | ||
85 | { | ||
86 | char *pretcode; | ||
87 | struct ucontext uc; | ||
88 | struct siginfo info; | ||
89 | }; | ||
90 | |||
91 | static int | ||
92 | restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax) | ||
93 | { | ||
94 | unsigned int err = 0; | ||
95 | |||
96 | /* Always make any pending restarted system calls return -EINTR */ | ||
97 | current_thread_info()->restart_block.fn = do_no_restart_syscall; | ||
98 | |||
99 | #define COPY(x) err |= __get_user(regs->x, &sc->x) | ||
100 | |||
101 | COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx); | ||
102 | COPY(rdx); COPY(rcx); COPY(rip); | ||
103 | COPY(r8); | ||
104 | COPY(r9); | ||
105 | COPY(r10); | ||
106 | COPY(r11); | ||
107 | COPY(r12); | ||
108 | COPY(r13); | ||
109 | COPY(r14); | ||
110 | COPY(r15); | ||
111 | |||
112 | { | ||
113 | unsigned int tmpflags; | ||
114 | err |= __get_user(tmpflags, &sc->eflags); | ||
115 | regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); | ||
116 | regs->orig_rax = -1; /* disable syscall checks */ | ||
117 | } | ||
118 | |||
119 | { | ||
120 | struct _fpstate __user * buf; | ||
121 | err |= __get_user(buf, &sc->fpstate); | ||
122 | |||
123 | if (buf) { | ||
124 | if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) | ||
125 | goto badframe; | ||
126 | err |= restore_i387(buf); | ||
127 | } else { | ||
128 | struct task_struct *me = current; | ||
129 | if (used_math()) { | ||
130 | clear_fpu(me); | ||
131 | clear_used_math(); | ||
132 | } | ||
133 | } | ||
134 | } | ||
135 | |||
136 | err |= __get_user(*prax, &sc->rax); | ||
137 | return err; | ||
138 | |||
139 | badframe: | ||
140 | return 1; | ||
141 | } | ||
142 | |||
143 | asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) | ||
144 | { | ||
145 | struct rt_sigframe __user *frame; | ||
146 | sigset_t set; | ||
147 | unsigned long eax; | ||
148 | |||
149 | frame = (struct rt_sigframe __user *)(regs->rsp - 8); | ||
150 | if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) { | ||
151 | goto badframe; | ||
152 | } | ||
153 | if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) { | ||
154 | goto badframe; | ||
155 | } | ||
156 | |||
157 | sigdelsetmask(&set, ~_BLOCKABLE); | ||
158 | spin_lock_irq(¤t->sighand->siglock); | ||
159 | current->blocked = set; | ||
160 | recalc_sigpending(); | ||
161 | spin_unlock_irq(¤t->sighand->siglock); | ||
162 | |||
163 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) | ||
164 | goto badframe; | ||
165 | |||
166 | #ifdef DEBUG_SIG | ||
167 | printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs.rip,regs.rsp,frame,eax); | ||
168 | #endif | ||
169 | |||
170 | if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) | ||
171 | goto badframe; | ||
172 | |||
173 | return eax; | ||
174 | |||
175 | badframe: | ||
176 | signal_fault(regs,frame,"sigreturn"); | ||
177 | return 0; | ||
178 | } | ||
179 | |||
180 | /* | ||
181 | * Set up a signal frame. | ||
182 | */ | ||
183 | |||
184 | static inline int | ||
185 | setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) | ||
186 | { | ||
187 | int err = 0; | ||
188 | unsigned long eflags; | ||
189 | |||
190 | err |= __put_user(0, &sc->gs); | ||
191 | err |= __put_user(0, &sc->fs); | ||
192 | |||
193 | err |= __put_user(regs->rdi, &sc->rdi); | ||
194 | err |= __put_user(regs->rsi, &sc->rsi); | ||
195 | err |= __put_user(regs->rbp, &sc->rbp); | ||
196 | err |= __put_user(regs->rsp, &sc->rsp); | ||
197 | err |= __put_user(regs->rbx, &sc->rbx); | ||
198 | err |= __put_user(regs->rdx, &sc->rdx); | ||
199 | err |= __put_user(regs->rcx, &sc->rcx); | ||
200 | err |= __put_user(regs->rax, &sc->rax); | ||
201 | err |= __put_user(regs->r8, &sc->r8); | ||
202 | err |= __put_user(regs->r9, &sc->r9); | ||
203 | err |= __put_user(regs->r10, &sc->r10); | ||
204 | err |= __put_user(regs->r11, &sc->r11); | ||
205 | err |= __put_user(regs->r12, &sc->r12); | ||
206 | err |= __put_user(regs->r13, &sc->r13); | ||
207 | err |= __put_user(regs->r14, &sc->r14); | ||
208 | err |= __put_user(regs->r15, &sc->r15); | ||
209 | err |= __put_user(me->thread.trap_no, &sc->trapno); | ||
210 | err |= __put_user(me->thread.error_code, &sc->err); | ||
211 | err |= __put_user(regs->rip, &sc->rip); | ||
212 | eflags = regs->eflags; | ||
213 | if (current->ptrace & PT_PTRACED) { | ||
214 | eflags &= ~TF_MASK; | ||
215 | } | ||
216 | err |= __put_user(eflags, &sc->eflags); | ||
217 | err |= __put_user(mask, &sc->oldmask); | ||
218 | err |= __put_user(me->thread.cr2, &sc->cr2); | ||
219 | |||
220 | return err; | ||
221 | } | ||
222 | |||
223 | /* | ||
224 | * Determine which stack to use.. | ||
225 | */ | ||
226 | |||
227 | static void __user * | ||
228 | get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) | ||
229 | { | ||
230 | unsigned long rsp; | ||
231 | |||
232 | /* Default to using normal stack - redzone*/ | ||
233 | rsp = regs->rsp - 128; | ||
234 | |||
235 | /* This is the X/Open sanctioned signal stack switching. */ | ||
236 | /* RED-PEN: redzone on that stack? */ | ||
237 | if (ka->sa.sa_flags & SA_ONSTACK) { | ||
238 | if (sas_ss_flags(rsp) == 0) | ||
239 | rsp = current->sas_ss_sp + current->sas_ss_size; | ||
240 | } | ||
241 | |||
242 | return (void __user *)round_down(rsp - size, 16); | ||
243 | } | ||
244 | |||
245 | static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | ||
246 | sigset_t *set, struct pt_regs * regs) | ||
247 | { | ||
248 | struct rt_sigframe __user *frame; | ||
249 | struct _fpstate __user *fp = NULL; | ||
250 | int err = 0; | ||
251 | struct task_struct *me = current; | ||
252 | |||
253 | if (used_math()) { | ||
254 | fp = get_stack(ka, regs, sizeof(struct _fpstate)); | ||
255 | frame = (void __user *)round_down( | ||
256 | (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; | ||
257 | |||
258 | if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) | ||
259 | goto give_sigsegv; | ||
260 | |||
261 | if (save_i387(fp) < 0) | ||
262 | err |= -1; | ||
263 | } else | ||
264 | frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; | ||
265 | |||
266 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) | ||
267 | goto give_sigsegv; | ||
268 | |||
269 | if (ka->sa.sa_flags & SA_SIGINFO) { | ||
270 | err |= copy_siginfo_to_user(&frame->info, info); | ||
271 | if (err) | ||
272 | goto give_sigsegv; | ||
273 | } | ||
274 | |||
275 | /* Create the ucontext. */ | ||
276 | err |= __put_user(0, &frame->uc.uc_flags); | ||
277 | err |= __put_user(0, &frame->uc.uc_link); | ||
278 | err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); | ||
279 | err |= __put_user(sas_ss_flags(regs->rsp), | ||
280 | &frame->uc.uc_stack.ss_flags); | ||
281 | err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); | ||
282 | err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); | ||
283 | err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); | ||
284 | if (sizeof(*set) == 16) { | ||
285 | __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); | ||
286 | __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); | ||
287 | } else | ||
288 | err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); | ||
289 | |||
290 | /* Set up to return from userspace. If provided, use a stub | ||
291 | already in userspace. */ | ||
292 | /* x86-64 should always use SA_RESTORER. */ | ||
293 | if (ka->sa.sa_flags & SA_RESTORER) { | ||
294 | err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); | ||
295 | } else { | ||
296 | /* could use a vstub here */ | ||
297 | goto give_sigsegv; | ||
298 | } | ||
299 | |||
300 | if (err) | ||
301 | goto give_sigsegv; | ||
302 | |||
303 | #ifdef DEBUG_SIG | ||
304 | printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); | ||
305 | #endif | ||
306 | |||
307 | /* Set up registers for signal handler */ | ||
308 | { | ||
309 | struct exec_domain *ed = current_thread_info()->exec_domain; | ||
310 | if (unlikely(ed && ed->signal_invmap && sig < 32)) | ||
311 | sig = ed->signal_invmap[sig]; | ||
312 | } | ||
313 | regs->rdi = sig; | ||
314 | /* In case the signal handler was declared without prototypes */ | ||
315 | regs->rax = 0; | ||
316 | |||
317 | /* This also works for non SA_SIGINFO handlers because they expect the | ||
318 | next argument after the signal number on the stack. */ | ||
319 | regs->rsi = (unsigned long)&frame->info; | ||
320 | regs->rdx = (unsigned long)&frame->uc; | ||
321 | regs->rip = (unsigned long) ka->sa.sa_handler; | ||
322 | |||
323 | regs->rsp = (unsigned long)frame; | ||
324 | |||
325 | set_fs(USER_DS); | ||
326 | if (regs->eflags & TF_MASK) { | ||
327 | if ((current->ptrace & (PT_PTRACED | PT_DTRACE)) == (PT_PTRACED | PT_DTRACE)) { | ||
328 | ptrace_notify(SIGTRAP); | ||
329 | } else { | ||
330 | regs->eflags &= ~TF_MASK; | ||
331 | } | ||
332 | } | ||
333 | |||
334 | #ifdef DEBUG_SIG | ||
335 | printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", | ||
336 | current->comm, current->pid, frame, regs->rip, frame->pretcode); | ||
337 | #endif | ||
338 | |||
339 | return; | ||
340 | |||
341 | give_sigsegv: | ||
342 | force_sigsegv(sig, current); | ||
343 | } | ||
344 | |||
345 | /* | ||
346 | * OK, we're invoking a handler | ||
347 | */ | ||
348 | |||
349 | static void | ||
350 | handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, | ||
351 | sigset_t *oldset, struct pt_regs *regs) | ||
352 | { | ||
353 | #ifdef DEBUG_SIG | ||
354 | printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", | ||
355 | current->pid, sig, | ||
356 | regs->rip, regs->rsp, regs); | ||
357 | #endif | ||
358 | |||
359 | /* Are we from a system call? */ | ||
360 | if ((long)regs->orig_rax >= 0) { | ||
361 | /* If so, check system call restarting.. */ | ||
362 | switch (regs->rax) { | ||
363 | case -ERESTART_RESTARTBLOCK: | ||
364 | case -ERESTARTNOHAND: | ||
365 | regs->rax = -EINTR; | ||
366 | break; | ||
367 | |||
368 | case -ERESTARTSYS: | ||
369 | if (!(ka->sa.sa_flags & SA_RESTART)) { | ||
370 | regs->rax = -EINTR; | ||
371 | break; | ||
372 | } | ||
373 | /* fallthrough */ | ||
374 | case -ERESTARTNOINTR: | ||
375 | regs->rax = regs->orig_rax; | ||
376 | regs->rip -= 2; | ||
377 | break; | ||
378 | } | ||
379 | } | ||
380 | |||
381 | #ifdef CONFIG_IA32_EMULATION | ||
382 | if (test_thread_flag(TIF_IA32)) { | ||
383 | if (ka->sa.sa_flags & SA_SIGINFO) | ||
384 | ia32_setup_rt_frame(sig, ka, info, oldset, regs); | ||
385 | else | ||
386 | ia32_setup_frame(sig, ka, oldset, regs); | ||
387 | } else | ||
388 | #endif | ||
389 | setup_rt_frame(sig, ka, info, oldset, regs); | ||
390 | |||
391 | if (!(ka->sa.sa_flags & SA_NODEFER)) { | ||
392 | spin_lock_irq(¤t->sighand->siglock); | ||
393 | sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); | ||
394 | sigaddset(¤t->blocked,sig); | ||
395 | recalc_sigpending(); | ||
396 | spin_unlock_irq(¤t->sighand->siglock); | ||
397 | } | ||
398 | } | ||
399 | |||
400 | /* | ||
401 | * Note that 'init' is a special process: it doesn't get signals it doesn't | ||
402 | * want to handle. Thus you cannot kill init even with a SIGKILL even by | ||
403 | * mistake. | ||
404 | */ | ||
405 | int do_signal(struct pt_regs *regs, sigset_t *oldset) | ||
406 | { | ||
407 | struct k_sigaction ka; | ||
408 | siginfo_t info; | ||
409 | int signr; | ||
410 | |||
411 | /* | ||
412 | * We want the common case to go fast, which | ||
413 | * is why we may in certain cases get here from | ||
414 | * kernel mode. Just return without doing anything | ||
415 | * if so. | ||
416 | */ | ||
417 | if ((regs->cs & 3) != 3) | ||
418 | return 1; | ||
419 | |||
420 | if (try_to_freeze(0)) | ||
421 | goto no_signal; | ||
422 | |||
423 | if (!oldset) | ||
424 | oldset = ¤t->blocked; | ||
425 | |||
426 | signr = get_signal_to_deliver(&info, &ka, regs, NULL); | ||
427 | if (signr > 0) { | ||
428 | /* Reenable any watchpoints before delivering the | ||
429 | * signal to user space. The processor register will | ||
430 | * have been cleared if the watchpoint triggered | ||
431 | * inside the kernel. | ||
432 | */ | ||
433 | if (current->thread.debugreg7) | ||
434 | asm volatile("movq %0,%%db7" : : "r" (current->thread.debugreg7)); | ||
435 | |||
436 | /* Whee! Actually deliver the signal. */ | ||
437 | handle_signal(signr, &info, &ka, oldset, regs); | ||
438 | return 1; | ||
439 | } | ||
440 | |||
441 | no_signal: | ||
442 | /* Did we come from a system call? */ | ||
443 | if ((long)regs->orig_rax >= 0) { | ||
444 | /* Restart the system call - no handlers present */ | ||
445 | long res = regs->rax; | ||
446 | if (res == -ERESTARTNOHAND || | ||
447 | res == -ERESTARTSYS || | ||
448 | res == -ERESTARTNOINTR) { | ||
449 | regs->rax = regs->orig_rax; | ||
450 | regs->rip -= 2; | ||
451 | } | ||
452 | if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) { | ||
453 | regs->rax = __NR_restart_syscall; | ||
454 | regs->rip -= 2; | ||
455 | } | ||
456 | } | ||
457 | return 0; | ||
458 | } | ||
459 | |||
460 | void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags) | ||
461 | { | ||
462 | #ifdef DEBUG_SIG | ||
463 | printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", | ||
464 | thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); | ||
465 | #endif | ||
466 | |||
467 | /* Pending single-step? */ | ||
468 | if (thread_info_flags & _TIF_SINGLESTEP) { | ||
469 | regs->eflags |= TF_MASK; | ||
470 | clear_thread_flag(TIF_SINGLESTEP); | ||
471 | } | ||
472 | |||
473 | /* deal with pending signal delivery */ | ||
474 | if (thread_info_flags & _TIF_SIGPENDING) | ||
475 | do_signal(regs,oldset); | ||
476 | } | ||
477 | |||
478 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where) | ||
479 | { | ||
480 | struct task_struct *me = current; | ||
481 | if (exception_trace) | ||
482 | printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", | ||
483 | me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); | ||
484 | |||
485 | force_sig(SIGSEGV, me); | ||
486 | } | ||
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c new file mode 100644 index 000000000000..e3cdbf9a88bd --- /dev/null +++ b/arch/x86_64/kernel/smp.c | |||
@@ -0,0 +1,415 @@ | |||
1 | /* | ||
2 | * Intel SMP support routines. | ||
3 | * | ||
4 | * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | ||
5 | * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> | ||
6 | * (c) 2002,2003 Andi Kleen, SuSE Labs. | ||
7 | * | ||
8 | * This code is released under the GNU General Public License version 2 or | ||
9 | * later. | ||
10 | */ | ||
11 | |||
12 | #include <linux/init.h> | ||
13 | |||
14 | #include <linux/mm.h> | ||
15 | #include <linux/irq.h> | ||
16 | #include <linux/delay.h> | ||
17 | #include <linux/spinlock.h> | ||
18 | #include <linux/smp_lock.h> | ||
19 | #include <linux/smp.h> | ||
20 | #include <linux/kernel_stat.h> | ||
21 | #include <linux/mc146818rtc.h> | ||
22 | #include <linux/interrupt.h> | ||
23 | |||
24 | #include <asm/mtrr.h> | ||
25 | #include <asm/pgalloc.h> | ||
26 | #include <asm/tlbflush.h> | ||
27 | #include <asm/mach_apic.h> | ||
28 | #include <asm/mmu_context.h> | ||
29 | #include <asm/proto.h> | ||
30 | |||
31 | /* | ||
32 | * Smarter SMP flushing macros. | ||
33 | * c/o Linus Torvalds. | ||
34 | * | ||
35 | * These mean you can really definitely utterly forget about | ||
36 | * writing to user space from interrupts. (It's not allowed anyway). | ||
37 | * | ||
38 | * Optimizations Manfred Spraul <manfred@colorfullife.com> | ||
39 | */ | ||
40 | |||
41 | static cpumask_t flush_cpumask; | ||
42 | static struct mm_struct * flush_mm; | ||
43 | static unsigned long flush_va; | ||
44 | static DEFINE_SPINLOCK(tlbstate_lock); | ||
45 | #define FLUSH_ALL -1ULL | ||
46 | |||
47 | /* | ||
48 | * We cannot call mmdrop() because we are in interrupt context, | ||
49 | * instead update mm->cpu_vm_mask. | ||
50 | */ | ||
51 | static inline void leave_mm (unsigned long cpu) | ||
52 | { | ||
53 | if (read_pda(mmu_state) == TLBSTATE_OK) | ||
54 | BUG(); | ||
55 | clear_bit(cpu, &read_pda(active_mm)->cpu_vm_mask); | ||
56 | load_cr3(swapper_pg_dir); | ||
57 | } | ||
58 | |||
59 | /* | ||
60 | * | ||
61 | * The flush IPI assumes that a thread switch happens in this order: | ||
62 | * [cpu0: the cpu that switches] | ||
63 | * 1) switch_mm() either 1a) or 1b) | ||
64 | * 1a) thread switch to a different mm | ||
65 | * 1a1) clear_bit(cpu, &old_mm->cpu_vm_mask); | ||
66 | * Stop ipi delivery for the old mm. This is not synchronized with | ||
67 | * the other cpus, but smp_invalidate_interrupt ignores flush ipis | ||
68 | * for the wrong mm, and in the worst case we perform a superfluous | ||
69 | * tlb flush. | ||
70 | * 1a2) set cpu mmu_state to TLBSTATE_OK | ||
71 | * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 | ||
72 | * was in lazy tlb mode. | ||
73 | * 1a3) update cpu active_mm | ||
74 | * Now cpu0 accepts tlb flushes for the new mm. | ||
75 | * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask); | ||
76 | * Now the other cpus will send tlb flush ipis. | ||
77 | * 1a5) change cr3. | ||
78 | * 1b) thread switch without mm change | ||
79 | * cpu active_mm is correct, cpu0 already handles | ||
80 | * flush ipis. | ||
81 | * 1b1) set cpu mmu_state to TLBSTATE_OK | ||
82 | * 1b2) test_and_set the cpu bit in cpu_vm_mask. | ||
83 | * Atomically set the bit [other cpus will start sending flush ipis], | ||
84 | * and test the bit. | ||
85 | * 1b3) if the bit was 0: leave_mm was called, flush the tlb. | ||
86 | * 2) switch %%esp, ie current | ||
87 | * | ||
88 | * The interrupt must handle 2 special cases: | ||
89 | * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. | ||
90 | * - the cpu performs speculative tlb reads, i.e. even if the cpu only | ||
91 | * runs in kernel space, the cpu could load tlb entries for user space | ||
92 | * pages. | ||
93 | * | ||
94 | * The good news is that cpu mmu_state is local to each cpu, no | ||
95 | * write/read ordering problems. | ||
96 | */ | ||
97 | |||
98 | /* | ||
99 | * TLB flush IPI: | ||
100 | * | ||
101 | * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. | ||
102 | * 2) Leave the mm if we are in the lazy tlb mode. | ||
103 | */ | ||
104 | |||
105 | asmlinkage void smp_invalidate_interrupt (void) | ||
106 | { | ||
107 | unsigned long cpu; | ||
108 | |||
109 | cpu = get_cpu(); | ||
110 | |||
111 | if (!cpu_isset(cpu, flush_cpumask)) | ||
112 | goto out; | ||
113 | /* | ||
114 | * This was a BUG() but until someone can quote me the | ||
115 | * line from the intel manual that guarantees an IPI to | ||
116 | * multiple CPUs is retried _only_ on the erroring CPUs | ||
117 | * its staying as a return | ||
118 | * | ||
119 | * BUG(); | ||
120 | */ | ||
121 | |||
122 | if (flush_mm == read_pda(active_mm)) { | ||
123 | if (read_pda(mmu_state) == TLBSTATE_OK) { | ||
124 | if (flush_va == FLUSH_ALL) | ||
125 | local_flush_tlb(); | ||
126 | else | ||
127 | __flush_tlb_one(flush_va); | ||
128 | } else | ||
129 | leave_mm(cpu); | ||
130 | } | ||
131 | ack_APIC_irq(); | ||
132 | cpu_clear(cpu, flush_cpumask); | ||
133 | |||
134 | out: | ||
135 | put_cpu_no_resched(); | ||
136 | } | ||
137 | |||
138 | static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, | ||
139 | unsigned long va) | ||
140 | { | ||
141 | cpumask_t tmp; | ||
142 | /* | ||
143 | * A couple of (to be removed) sanity checks: | ||
144 | * | ||
145 | * - we do not send IPIs to not-yet booted CPUs. | ||
146 | * - current CPU must not be in mask | ||
147 | * - mask must exist :) | ||
148 | */ | ||
149 | BUG_ON(cpus_empty(cpumask)); | ||
150 | cpus_and(tmp, cpumask, cpu_online_map); | ||
151 | BUG_ON(!cpus_equal(tmp, cpumask)); | ||
152 | BUG_ON(cpu_isset(smp_processor_id(), cpumask)); | ||
153 | if (!mm) | ||
154 | BUG(); | ||
155 | |||
156 | /* | ||
157 | * I'm not happy about this global shared spinlock in the | ||
158 | * MM hot path, but we'll see how contended it is. | ||
159 | * Temporarily this turns IRQs off, so that lockups are | ||
160 | * detected by the NMI watchdog. | ||
161 | */ | ||
162 | spin_lock(&tlbstate_lock); | ||
163 | |||
164 | flush_mm = mm; | ||
165 | flush_va = va; | ||
166 | cpus_or(flush_cpumask, cpumask, flush_cpumask); | ||
167 | |||
168 | /* | ||
169 | * We have to send the IPI only to | ||
170 | * CPUs affected. | ||
171 | */ | ||
172 | send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); | ||
173 | |||
174 | while (!cpus_empty(flush_cpumask)) | ||
175 | mb(); /* nothing. lockup detection does not belong here */; | ||
176 | |||
177 | flush_mm = NULL; | ||
178 | flush_va = 0; | ||
179 | spin_unlock(&tlbstate_lock); | ||
180 | } | ||
181 | |||
182 | void flush_tlb_current_task(void) | ||
183 | { | ||
184 | struct mm_struct *mm = current->mm; | ||
185 | cpumask_t cpu_mask; | ||
186 | |||
187 | preempt_disable(); | ||
188 | cpu_mask = mm->cpu_vm_mask; | ||
189 | cpu_clear(smp_processor_id(), cpu_mask); | ||
190 | |||
191 | local_flush_tlb(); | ||
192 | if (!cpus_empty(cpu_mask)) | ||
193 | flush_tlb_others(cpu_mask, mm, FLUSH_ALL); | ||
194 | preempt_enable(); | ||
195 | } | ||
196 | |||
197 | void flush_tlb_mm (struct mm_struct * mm) | ||
198 | { | ||
199 | cpumask_t cpu_mask; | ||
200 | |||
201 | preempt_disable(); | ||
202 | cpu_mask = mm->cpu_vm_mask; | ||
203 | cpu_clear(smp_processor_id(), cpu_mask); | ||
204 | |||
205 | if (current->active_mm == mm) { | ||
206 | if (current->mm) | ||
207 | local_flush_tlb(); | ||
208 | else | ||
209 | leave_mm(smp_processor_id()); | ||
210 | } | ||
211 | if (!cpus_empty(cpu_mask)) | ||
212 | flush_tlb_others(cpu_mask, mm, FLUSH_ALL); | ||
213 | |||
214 | preempt_enable(); | ||
215 | } | ||
216 | |||
217 | void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) | ||
218 | { | ||
219 | struct mm_struct *mm = vma->vm_mm; | ||
220 | cpumask_t cpu_mask; | ||
221 | |||
222 | preempt_disable(); | ||
223 | cpu_mask = mm->cpu_vm_mask; | ||
224 | cpu_clear(smp_processor_id(), cpu_mask); | ||
225 | |||
226 | if (current->active_mm == mm) { | ||
227 | if(current->mm) | ||
228 | __flush_tlb_one(va); | ||
229 | else | ||
230 | leave_mm(smp_processor_id()); | ||
231 | } | ||
232 | |||
233 | if (!cpus_empty(cpu_mask)) | ||
234 | flush_tlb_others(cpu_mask, mm, va); | ||
235 | |||
236 | preempt_enable(); | ||
237 | } | ||
238 | |||
239 | static void do_flush_tlb_all(void* info) | ||
240 | { | ||
241 | unsigned long cpu = smp_processor_id(); | ||
242 | |||
243 | __flush_tlb_all(); | ||
244 | if (read_pda(mmu_state) == TLBSTATE_LAZY) | ||
245 | leave_mm(cpu); | ||
246 | } | ||
247 | |||
248 | void flush_tlb_all(void) | ||
249 | { | ||
250 | on_each_cpu(do_flush_tlb_all, NULL, 1, 1); | ||
251 | } | ||
252 | |||
253 | void smp_kdb_stop(void) | ||
254 | { | ||
255 | send_IPI_allbutself(KDB_VECTOR); | ||
256 | } | ||
257 | |||
258 | /* | ||
259 | * this function sends a 'reschedule' IPI to another CPU. | ||
260 | * it goes straight through and wastes no time serializing | ||
261 | * anything. Worst case is that we lose a reschedule ... | ||
262 | */ | ||
263 | |||
264 | void smp_send_reschedule(int cpu) | ||
265 | { | ||
266 | send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * Structure and data for smp_call_function(). This is designed to minimise | ||
271 | * static memory requirements. It also looks cleaner. | ||
272 | */ | ||
273 | static DEFINE_SPINLOCK(call_lock); | ||
274 | |||
275 | struct call_data_struct { | ||
276 | void (*func) (void *info); | ||
277 | void *info; | ||
278 | atomic_t started; | ||
279 | atomic_t finished; | ||
280 | int wait; | ||
281 | }; | ||
282 | |||
283 | static struct call_data_struct * call_data; | ||
284 | |||
285 | /* | ||
286 | * this function sends a 'generic call function' IPI to all other CPUs | ||
287 | * in the system. | ||
288 | */ | ||
289 | static void __smp_call_function (void (*func) (void *info), void *info, | ||
290 | int nonatomic, int wait) | ||
291 | { | ||
292 | struct call_data_struct data; | ||
293 | int cpus = num_online_cpus()-1; | ||
294 | |||
295 | if (!cpus) | ||
296 | return; | ||
297 | |||
298 | data.func = func; | ||
299 | data.info = info; | ||
300 | atomic_set(&data.started, 0); | ||
301 | data.wait = wait; | ||
302 | if (wait) | ||
303 | atomic_set(&data.finished, 0); | ||
304 | |||
305 | call_data = &data; | ||
306 | wmb(); | ||
307 | /* Send a message to all other CPUs and wait for them to respond */ | ||
308 | send_IPI_allbutself(CALL_FUNCTION_VECTOR); | ||
309 | |||
310 | /* Wait for response */ | ||
311 | while (atomic_read(&data.started) != cpus) | ||
312 | cpu_relax(); | ||
313 | |||
314 | if (!wait) | ||
315 | return; | ||
316 | |||
317 | while (atomic_read(&data.finished) != cpus) | ||
318 | cpu_relax(); | ||
319 | } | ||
320 | |||
321 | /* | ||
322 | * smp_call_function - run a function on all other CPUs. | ||
323 | * @func: The function to run. This must be fast and non-blocking. | ||
324 | * @info: An arbitrary pointer to pass to the function. | ||
325 | * @nonatomic: currently unused. | ||
326 | * @wait: If true, wait (atomically) until function has completed on other | ||
327 | * CPUs. | ||
328 | * | ||
329 | * Returns 0 on success, else a negative status code. Does not return until | ||
330 | * remote CPUs are nearly ready to execute func, are executing it, or have executed it. | ||
331 | * | ||
332 | * You must not call this function with disabled interrupts or from a | ||
333 | * hardware interrupt handler or from a bottom half handler. | ||
334 | * Actually there are a few legal cases, like panic. | ||
335 | */ | ||
336 | int smp_call_function (void (*func) (void *info), void *info, int nonatomic, | ||
337 | int wait) | ||
338 | { | ||
339 | spin_lock(&call_lock); | ||
340 | __smp_call_function(func,info,nonatomic,wait); | ||
341 | spin_unlock(&call_lock); | ||
342 | return 0; | ||
343 | } | ||
344 | |||
345 | void smp_stop_cpu(void) | ||
346 | { | ||
347 | /* | ||
348 | * Remove this CPU: | ||
349 | */ | ||
350 | cpu_clear(smp_processor_id(), cpu_online_map); | ||
351 | local_irq_disable(); | ||
352 | disable_local_APIC(); | ||
353 | local_irq_enable(); | ||
354 | } | ||
355 | |||
/* IPI callback used by smp_send_stop(): stop this CPU, then halt forever. */
static void smp_really_stop_cpu(void *dummy)
{
	smp_stop_cpu();

	while (1) {
		asm("hlt");
	}
}
362 | |||
363 | void smp_send_stop(void) | ||
364 | { | ||
365 | int nolock = 0; | ||
366 | if (reboot_force) | ||
367 | return; | ||
368 | /* Don't deadlock on the call lock in panic */ | ||
369 | if (!spin_trylock(&call_lock)) { | ||
370 | /* ignore locking because we have paniced anyways */ | ||
371 | nolock = 1; | ||
372 | } | ||
373 | __smp_call_function(smp_really_stop_cpu, NULL, 0, 0); | ||
374 | if (!nolock) | ||
375 | spin_unlock(&call_lock); | ||
376 | |||
377 | local_irq_disable(); | ||
378 | disable_local_APIC(); | ||
379 | local_irq_enable(); | ||
380 | } | ||
381 | |||
382 | /* | ||
383 | * Reschedule call back. Nothing to do, | ||
384 | * all the work is done automatically when | ||
385 | * we return from the interrupt. | ||
386 | */ | ||
387 | asmlinkage void smp_reschedule_interrupt(void) | ||
388 | { | ||
389 | ack_APIC_irq(); | ||
390 | } | ||
391 | |||
392 | asmlinkage void smp_call_function_interrupt(void) | ||
393 | { | ||
394 | void (*func) (void *info) = call_data->func; | ||
395 | void *info = call_data->info; | ||
396 | int wait = call_data->wait; | ||
397 | |||
398 | ack_APIC_irq(); | ||
399 | /* | ||
400 | * Notify initiating CPU that I've grabbed the data and am | ||
401 | * about to execute the function | ||
402 | */ | ||
403 | mb(); | ||
404 | atomic_inc(&call_data->started); | ||
405 | /* | ||
406 | * At this point the info structure may be out of scope unless wait==1 | ||
407 | */ | ||
408 | irq_enter(); | ||
409 | (*func)(info); | ||
410 | irq_exit(); | ||
411 | if (wait) { | ||
412 | mb(); | ||
413 | atomic_inc(&call_data->finished); | ||
414 | } | ||
415 | } | ||
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c new file mode 100644 index 000000000000..afd3600b919e --- /dev/null +++ b/arch/x86_64/kernel/smpboot.c | |||
@@ -0,0 +1,938 @@ | |||
1 | /* | ||
2 | * x86 SMP booting functions | ||
3 | * | ||
4 | * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | ||
5 | * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> | ||
6 | * Copyright 2001 Andi Kleen, SuSE Labs. | ||
7 | * | ||
8 | * Much of the core SMP work is based on previous work by Thomas Radke, to | ||
9 | * whom a great many thanks are extended. | ||
10 | * | ||
11 | * Thanks to Intel for making available several different Pentium, | ||
12 | * Pentium Pro and Pentium-II/Xeon MP machines. | ||
13 | * Original development of Linux SMP code supported by Caldera. | ||
14 | * | ||
15 | * This code is released under the GNU General Public License version 2 or | ||
16 | * later. | ||
17 | * | ||
18 | * Fixes | ||
19 | * Felix Koop : NR_CPUS used properly | ||
20 | * Jose Renau : Handle single CPU case. | ||
21 | * Alan Cox : By repeated request 8) - Total BogoMIP report. | ||
22 | * Greg Wright : Fix for kernel stacks panic. | ||
23 | * Erich Boleyn : MP v1.4 and additional changes. | ||
24 | * Matthias Sattler : Changes for 2.1 kernel map. | ||
25 | * Michel Lespinasse : Changes for 2.1 kernel map. | ||
26 | * Michael Chastain : Change trampoline.S to gnu as. | ||
27 | * Alan Cox : Dumb bug: 'B' step PPro's are fine | ||
28 | * Ingo Molnar : Added APIC timers, based on code | ||
29 | * from Jose Renau | ||
30 | * Ingo Molnar : various cleanups and rewrites | ||
31 | * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. | ||
32 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs | ||
33 | * Andi Kleen : Changed for SMP boot into long mode. | ||
34 | * Rusty Russell : Hacked into shape for new "hotplug" boot process. | ||
35 | */ | ||
36 | |||
37 | #include <linux/config.h> | ||
38 | #include <linux/init.h> | ||
39 | |||
40 | #include <linux/mm.h> | ||
41 | #include <linux/kernel_stat.h> | ||
42 | #include <linux/smp_lock.h> | ||
43 | #include <linux/irq.h> | ||
44 | #include <linux/bootmem.h> | ||
45 | #include <linux/thread_info.h> | ||
46 | #include <linux/module.h> | ||
47 | |||
48 | #include <linux/delay.h> | ||
49 | #include <linux/mc146818rtc.h> | ||
50 | #include <asm/mtrr.h> | ||
51 | #include <asm/pgalloc.h> | ||
52 | #include <asm/desc.h> | ||
53 | #include <asm/kdebug.h> | ||
54 | #include <asm/tlbflush.h> | ||
55 | #include <asm/proto.h> | ||
56 | |||
57 | /* Number of siblings per CPU package */ | ||
58 | int smp_num_siblings = 1; | ||
59 | /* Package ID of each logical CPU */ | ||
60 | u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
61 | EXPORT_SYMBOL(phys_proc_id); | ||
62 | |||
63 | /* Bitmask of currently online CPUs */ | ||
64 | cpumask_t cpu_online_map; | ||
65 | |||
66 | cpumask_t cpu_callin_map; | ||
67 | cpumask_t cpu_callout_map; | ||
68 | static cpumask_t smp_commenced_mask; | ||
69 | |||
70 | /* Per CPU bogomips and other parameters */ | ||
71 | struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; | ||
72 | |||
73 | cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; | ||
74 | |||
75 | /* | ||
76 | * Trampoline 80x86 program as an array. | ||
77 | */ | ||
78 | |||
/* Start and end of the trampoline image copied by setup_trampoline(). */
extern unsigned char trampoline_data [];
extern unsigned char trampoline_end  [];
81 | |||
82 | /* | ||
83 | * Currently trivial. Write the real->protected mode | ||
84 | * bootstrap into the page concerned. The caller | ||
85 | * has made sure it's suitably aligned. | ||
86 | */ | ||
87 | |||
88 | static unsigned long __init setup_trampoline(void) | ||
89 | { | ||
90 | void *tramp = __va(SMP_TRAMPOLINE_BASE); | ||
91 | memcpy(tramp, trampoline_data, trampoline_end - trampoline_data); | ||
92 | return virt_to_phys(tramp); | ||
93 | } | ||
94 | |||
95 | /* | ||
96 | * The bootstrap kernel entry code has set these up. Save them for | ||
97 | * a given CPU | ||
98 | */ | ||
99 | |||
100 | static void __init smp_store_cpu_info(int id) | ||
101 | { | ||
102 | struct cpuinfo_x86 *c = cpu_data + id; | ||
103 | |||
104 | *c = boot_cpu_data; | ||
105 | identify_cpu(c); | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * TSC synchronization. | ||
110 | * | ||
111 | * We first check whether all CPUs have their TSC's synchronized, | ||
112 | * then we print a warning if not, and always resync. | ||
113 | */ | ||
114 | |||
115 | static atomic_t tsc_start_flag = ATOMIC_INIT(0); | ||
116 | static atomic_t tsc_count_start = ATOMIC_INIT(0); | ||
117 | static atomic_t tsc_count_stop = ATOMIC_INIT(0); | ||
118 | static unsigned long long tsc_values[NR_CPUS]; | ||
119 | |||
120 | #define NR_LOOPS 5 | ||
121 | |||
122 | extern unsigned int fast_gettimeoffset_quotient; | ||
123 | |||
124 | static void __init synchronize_tsc_bp (void) | ||
125 | { | ||
126 | int i; | ||
127 | unsigned long long t0; | ||
128 | unsigned long long sum, avg; | ||
129 | long long delta; | ||
130 | long one_usec; | ||
131 | int buggy = 0; | ||
132 | |||
133 | printk(KERN_INFO "checking TSC synchronization across %u CPUs: ",num_booting_cpus()); | ||
134 | |||
135 | one_usec = cpu_khz; | ||
136 | |||
137 | atomic_set(&tsc_start_flag, 1); | ||
138 | wmb(); | ||
139 | |||
140 | /* | ||
141 | * We loop a few times to get a primed instruction cache, | ||
142 | * then the last pass is more or less synchronized and | ||
143 | * the BP and APs set their cycle counters to zero all at | ||
144 | * once. This reduces the chance of having random offsets | ||
145 | * between the processors, and guarantees that the maximum | ||
146 | * delay between the cycle counters is never bigger than | ||
147 | * the latency of information-passing (cachelines) between | ||
148 | * two CPUs. | ||
149 | */ | ||
150 | for (i = 0; i < NR_LOOPS; i++) { | ||
151 | /* | ||
152 | * all APs synchronize but they loop on '== num_cpus' | ||
153 | */ | ||
154 | while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) mb(); | ||
155 | atomic_set(&tsc_count_stop, 0); | ||
156 | wmb(); | ||
157 | /* | ||
158 | * this lets the APs save their current TSC: | ||
159 | */ | ||
160 | atomic_inc(&tsc_count_start); | ||
161 | |||
162 | sync_core(); | ||
163 | rdtscll(tsc_values[smp_processor_id()]); | ||
164 | /* | ||
165 | * We clear the TSC in the last loop: | ||
166 | */ | ||
167 | if (i == NR_LOOPS-1) | ||
168 | write_tsc(0, 0); | ||
169 | |||
170 | /* | ||
171 | * Wait for all APs to leave the synchronization point: | ||
172 | */ | ||
173 | while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) mb(); | ||
174 | atomic_set(&tsc_count_start, 0); | ||
175 | wmb(); | ||
176 | atomic_inc(&tsc_count_stop); | ||
177 | } | ||
178 | |||
179 | sum = 0; | ||
180 | for (i = 0; i < NR_CPUS; i++) { | ||
181 | if (cpu_isset(i, cpu_callout_map)) { | ||
182 | t0 = tsc_values[i]; | ||
183 | sum += t0; | ||
184 | } | ||
185 | } | ||
186 | avg = sum / num_booting_cpus(); | ||
187 | |||
188 | sum = 0; | ||
189 | for (i = 0; i < NR_CPUS; i++) { | ||
190 | if (!cpu_isset(i, cpu_callout_map)) | ||
191 | continue; | ||
192 | |||
193 | delta = tsc_values[i] - avg; | ||
194 | if (delta < 0) | ||
195 | delta = -delta; | ||
196 | /* | ||
197 | * We report bigger than 2 microseconds clock differences. | ||
198 | */ | ||
199 | if (delta > 2*one_usec) { | ||
200 | long realdelta; | ||
201 | if (!buggy) { | ||
202 | buggy = 1; | ||
203 | printk("\n"); | ||
204 | } | ||
205 | realdelta = delta / one_usec; | ||
206 | if (tsc_values[i] < avg) | ||
207 | realdelta = -realdelta; | ||
208 | |||
209 | printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", | ||
210 | i, realdelta); | ||
211 | } | ||
212 | |||
213 | sum += delta; | ||
214 | } | ||
215 | if (!buggy) | ||
216 | printk("passed.\n"); | ||
217 | } | ||
218 | |||
219 | static void __init synchronize_tsc_ap (void) | ||
220 | { | ||
221 | int i; | ||
222 | |||
223 | /* | ||
224 | * Not every cpu is online at the time | ||
225 | * this gets called, so we first wait for the BP to | ||
226 | * finish SMP initialization: | ||
227 | */ | ||
228 | while (!atomic_read(&tsc_start_flag)) mb(); | ||
229 | |||
230 | for (i = 0; i < NR_LOOPS; i++) { | ||
231 | atomic_inc(&tsc_count_start); | ||
232 | while (atomic_read(&tsc_count_start) != num_booting_cpus()) mb(); | ||
233 | |||
234 | sync_core(); | ||
235 | rdtscll(tsc_values[smp_processor_id()]); | ||
236 | if (i == NR_LOOPS-1) | ||
237 | write_tsc(0, 0); | ||
238 | |||
239 | atomic_inc(&tsc_count_stop); | ||
240 | while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb(); | ||
241 | } | ||
242 | } | ||
243 | #undef NR_LOOPS | ||
244 | |||
245 | static atomic_t init_deasserted; | ||
246 | |||
247 | static void __init smp_callin(void) | ||
248 | { | ||
249 | int cpuid, phys_id; | ||
250 | unsigned long timeout; | ||
251 | |||
252 | /* | ||
253 | * If waken up by an INIT in an 82489DX configuration | ||
254 | * we may get here before an INIT-deassert IPI reaches | ||
255 | * our local APIC. We have to wait for the IPI or we'll | ||
256 | * lock up on an APIC access. | ||
257 | */ | ||
258 | while (!atomic_read(&init_deasserted)); | ||
259 | |||
260 | /* | ||
261 | * (This works even if the APIC is not enabled.) | ||
262 | */ | ||
263 | phys_id = GET_APIC_ID(apic_read(APIC_ID)); | ||
264 | cpuid = smp_processor_id(); | ||
265 | if (cpu_isset(cpuid, cpu_callin_map)) { | ||
266 | panic("smp_callin: phys CPU#%d, CPU#%d already present??\n", | ||
267 | phys_id, cpuid); | ||
268 | } | ||
269 | Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); | ||
270 | |||
271 | /* | ||
272 | * STARTUP IPIs are fragile beasts as they might sometimes | ||
273 | * trigger some glue motherboard logic. Complete APIC bus | ||
274 | * silence for 1 second, this overestimates the time the | ||
275 | * boot CPU is spending to send the up to 2 STARTUP IPIs | ||
276 | * by a factor of two. This should be enough. | ||
277 | */ | ||
278 | |||
279 | /* | ||
280 | * Waiting 2s total for startup (udelay is not yet working) | ||
281 | */ | ||
282 | timeout = jiffies + 2*HZ; | ||
283 | while (time_before(jiffies, timeout)) { | ||
284 | /* | ||
285 | * Has the boot CPU finished it's STARTUP sequence? | ||
286 | */ | ||
287 | if (cpu_isset(cpuid, cpu_callout_map)) | ||
288 | break; | ||
289 | rep_nop(); | ||
290 | } | ||
291 | |||
292 | if (!time_before(jiffies, timeout)) { | ||
293 | panic("smp_callin: CPU%d started up but did not get a callout!\n", | ||
294 | cpuid); | ||
295 | } | ||
296 | |||
297 | /* | ||
298 | * the boot CPU has finished the init stage and is spinning | ||
299 | * on callin_map until we finish. We are free to set up this | ||
300 | * CPU, first the APIC. (this is probably redundant on most | ||
301 | * boards) | ||
302 | */ | ||
303 | |||
304 | Dprintk("CALLIN, before setup_local_APIC().\n"); | ||
305 | setup_local_APIC(); | ||
306 | |||
307 | local_irq_enable(); | ||
308 | |||
309 | /* | ||
310 | * Get our bogomips. | ||
311 | */ | ||
312 | calibrate_delay(); | ||
313 | Dprintk("Stack at about %p\n",&cpuid); | ||
314 | |||
315 | disable_APIC_timer(); | ||
316 | |||
317 | /* | ||
318 | * Save our processor parameters | ||
319 | */ | ||
320 | smp_store_cpu_info(cpuid); | ||
321 | |||
322 | local_irq_disable(); | ||
323 | |||
324 | /* | ||
325 | * Allow the master to continue. | ||
326 | */ | ||
327 | cpu_set(cpuid, cpu_callin_map); | ||
328 | |||
329 | /* | ||
330 | * Synchronize the TSC with the BP | ||
331 | */ | ||
332 | if (cpu_has_tsc) | ||
333 | synchronize_tsc_ap(); | ||
334 | } | ||
335 | |||
/* NOTE(review): presumably counts secondary CPUs brought up; no user is visible in this part of the file. */
static int cpucount;
337 | |||
338 | /* | ||
339 | * Activate a secondary processor. | ||
340 | */ | ||
341 | void __init start_secondary(void) | ||
342 | { | ||
343 | /* | ||
344 | * Dont put anything before smp_callin(), SMP | ||
345 | * booting is too fragile that we want to limit the | ||
346 | * things done here to the most necessary things. | ||
347 | */ | ||
348 | cpu_init(); | ||
349 | smp_callin(); | ||
350 | |||
351 | /* otherwise gcc will move up the smp_processor_id before the cpu_init */ | ||
352 | barrier(); | ||
353 | |||
354 | Dprintk("cpu %d: waiting for commence\n", smp_processor_id()); | ||
355 | while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) | ||
356 | rep_nop(); | ||
357 | |||
358 | Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); | ||
359 | setup_secondary_APIC_clock(); | ||
360 | |||
361 | Dprintk("cpu %d: enabling apic timer\n", smp_processor_id()); | ||
362 | |||
363 | if (nmi_watchdog == NMI_IO_APIC) { | ||
364 | disable_8259A_irq(0); | ||
365 | enable_NMI_through_LVT0(NULL); | ||
366 | enable_8259A_irq(0); | ||
367 | } | ||
368 | |||
369 | |||
370 | enable_APIC_timer(); | ||
371 | |||
372 | /* | ||
373 | * low-memory mappings have been cleared, flush them from | ||
374 | * the local TLBs too. | ||
375 | */ | ||
376 | local_flush_tlb(); | ||
377 | |||
378 | Dprintk("cpu %d eSetting cpu_online_map\n", smp_processor_id()); | ||
379 | cpu_set(smp_processor_id(), cpu_online_map); | ||
380 | wmb(); | ||
381 | |||
382 | cpu_idle(); | ||
383 | } | ||
384 | |||
385 | extern volatile unsigned long init_rsp; | ||
386 | extern void (*initial_code)(void); | ||
387 | |||
#if APIC_DEBUG
/*
 * Debug aid: query the ID, VERSION and SPIV registers of the APIC with
 * id @apicid through remote-read commands and print each value, or
 * "failed" when the remote read does not end in the VALID state.
 */
static inline void inquire_remote_apic(int apicid)
{
	unsigned regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
	char *names[] = { "ID", "VERSION", "SPIV" };
	unsigned idx;
	int polls, rr;

	printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);

	for (idx = 0; idx < sizeof(regs) / sizeof(*regs); idx++) {
		printk("... APIC #%d %s: ", apicid, names[idx]);

		/* Wait for idle before issuing the next command. */
		apic_wait_icr_idle();

		apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
		apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[idx]);

		/* Poll every 100us while the remote read is still in progress. */
		polls = 0;
		for (;;) {
			udelay(100);
			rr = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
			if (rr != APIC_ICR_RR_INPROG || polls++ >= 1000)
				break;
		}

		if (rr == APIC_ICR_RR_VALID) {
			rr = apic_read(APIC_RRR);
			printk("%08x\n", rr);
		} else {
			printk("failed\n");
		}
	}
}
#endif
425 | |||
426 | static int __init wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) | ||
427 | { | ||
428 | unsigned long send_status = 0, accept_status = 0; | ||
429 | int maxlvt, timeout, num_starts, j; | ||
430 | |||
431 | Dprintk("Asserting INIT.\n"); | ||
432 | |||
433 | /* | ||
434 | * Turn INIT on target chip | ||
435 | */ | ||
436 | apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | ||
437 | |||
438 | /* | ||
439 | * Send IPI | ||
440 | */ | ||
441 | apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT | ||
442 | | APIC_DM_INIT); | ||
443 | |||
444 | Dprintk("Waiting for send to finish...\n"); | ||
445 | timeout = 0; | ||
446 | do { | ||
447 | Dprintk("+"); | ||
448 | udelay(100); | ||
449 | send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; | ||
450 | } while (send_status && (timeout++ < 1000)); | ||
451 | |||
452 | mdelay(10); | ||
453 | |||
454 | Dprintk("Deasserting INIT.\n"); | ||
455 | |||
456 | /* Target chip */ | ||
457 | apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | ||
458 | |||
459 | /* Send IPI */ | ||
460 | apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); | ||
461 | |||
462 | Dprintk("Waiting for send to finish...\n"); | ||
463 | timeout = 0; | ||
464 | do { | ||
465 | Dprintk("+"); | ||
466 | udelay(100); | ||
467 | send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; | ||
468 | } while (send_status && (timeout++ < 1000)); | ||
469 | |||
470 | atomic_set(&init_deasserted, 1); | ||
471 | |||
472 | /* | ||
473 | * Should we send STARTUP IPIs ? | ||
474 | * | ||
475 | * Determine this based on the APIC version. | ||
476 | * If we don't have an integrated APIC, don't send the STARTUP IPIs. | ||
477 | */ | ||
478 | if (APIC_INTEGRATED(apic_version[phys_apicid])) | ||
479 | num_starts = 2; | ||
480 | else | ||
481 | num_starts = 0; | ||
482 | |||
483 | /* | ||
484 | * Run STARTUP IPI loop. | ||
485 | */ | ||
486 | Dprintk("#startup loops: %d.\n", num_starts); | ||
487 | |||
488 | maxlvt = get_maxlvt(); | ||
489 | |||
490 | for (j = 1; j <= num_starts; j++) { | ||
491 | Dprintk("Sending STARTUP #%d.\n",j); | ||
492 | apic_read_around(APIC_SPIV); | ||
493 | apic_write(APIC_ESR, 0); | ||
494 | apic_read(APIC_ESR); | ||
495 | Dprintk("After apic_write.\n"); | ||
496 | |||
497 | /* | ||
498 | * STARTUP IPI | ||
499 | */ | ||
500 | |||
501 | /* Target chip */ | ||
502 | apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | ||
503 | |||
504 | /* Boot on the stack */ | ||
505 | /* Kick the second */ | ||
506 | apic_write_around(APIC_ICR, APIC_DM_STARTUP | ||
507 | | (start_rip >> 12)); | ||
508 | |||
509 | /* | ||
510 | * Give the other CPU some time to accept the IPI. | ||
511 | */ | ||
512 | udelay(300); | ||
513 | |||
514 | Dprintk("Startup point 1.\n"); | ||
515 | |||
516 | Dprintk("Waiting for send to finish...\n"); | ||
517 | timeout = 0; | ||
518 | do { | ||
519 | Dprintk("+"); | ||
520 | udelay(100); | ||
521 | send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; | ||
522 | } while (send_status && (timeout++ < 1000)); | ||
523 | |||
524 | /* | ||
525 | * Give the other CPU some time to accept the IPI. | ||
526 | */ | ||
527 | udelay(200); | ||
528 | /* | ||
529 | * Due to the Pentium erratum 3AP. | ||
530 | */ | ||
531 | if (maxlvt > 3) { | ||
532 | apic_read_around(APIC_SPIV); | ||
533 | apic_write(APIC_ESR, 0); | ||
534 | } | ||
535 | accept_status = (apic_read(APIC_ESR) & 0xEF); | ||
536 | if (send_status || accept_status) | ||
537 | break; | ||
538 | } | ||
539 | Dprintk("After Startup.\n"); | ||
540 | |||
541 | if (send_status) | ||
542 | printk(KERN_ERR "APIC never delivered???\n"); | ||
543 | if (accept_status) | ||
544 | printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); | ||
545 | |||
546 | return (send_status | accept_status); | ||
547 | } | ||
548 | |||
549 | static void __init do_boot_cpu (int apicid) | ||
550 | { | ||
551 | struct task_struct *idle; | ||
552 | unsigned long boot_error; | ||
553 | int timeout, cpu; | ||
554 | unsigned long start_rip; | ||
555 | |||
556 | cpu = ++cpucount; | ||
557 | /* | ||
558 | * We can't use kernel_thread since we must avoid to | ||
559 | * reschedule the child. | ||
560 | */ | ||
561 | idle = fork_idle(cpu); | ||
562 | if (IS_ERR(idle)) | ||
563 | panic("failed fork for CPU %d", cpu); | ||
564 | x86_cpu_to_apicid[cpu] = apicid; | ||
565 | |||
566 | cpu_pda[cpu].pcurrent = idle; | ||
567 | |||
568 | start_rip = setup_trampoline(); | ||
569 | |||
570 | init_rsp = idle->thread.rsp; | ||
571 | per_cpu(init_tss,cpu).rsp0 = init_rsp; | ||
572 | initial_code = start_secondary; | ||
573 | clear_ti_thread_flag(idle->thread_info, TIF_FORK); | ||
574 | |||
575 | printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, | ||
576 | start_rip, init_rsp); | ||
577 | |||
578 | /* | ||
579 | * This grunge runs the startup process for | ||
580 | * the targeted processor. | ||
581 | */ | ||
582 | |||
583 | atomic_set(&init_deasserted, 0); | ||
584 | |||
585 | Dprintk("Setting warm reset code and vector.\n"); | ||
586 | |||
587 | CMOS_WRITE(0xa, 0xf); | ||
588 | local_flush_tlb(); | ||
589 | Dprintk("1.\n"); | ||
590 | *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4; | ||
591 | Dprintk("2.\n"); | ||
592 | *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf; | ||
593 | Dprintk("3.\n"); | ||
594 | |||
595 | /* | ||
596 | * Be paranoid about clearing APIC errors. | ||
597 | */ | ||
598 | if (APIC_INTEGRATED(apic_version[apicid])) { | ||
599 | apic_read_around(APIC_SPIV); | ||
600 | apic_write(APIC_ESR, 0); | ||
601 | apic_read(APIC_ESR); | ||
602 | } | ||
603 | |||
604 | /* | ||
605 | * Status is now clean | ||
606 | */ | ||
607 | boot_error = 0; | ||
608 | |||
609 | /* | ||
610 | * Starting actual IPI sequence... | ||
611 | */ | ||
612 | boot_error = wakeup_secondary_via_INIT(apicid, start_rip); | ||
613 | |||
614 | if (!boot_error) { | ||
615 | /* | ||
616 | * allow APs to start initializing. | ||
617 | */ | ||
618 | Dprintk("Before Callout %d.\n", cpu); | ||
619 | cpu_set(cpu, cpu_callout_map); | ||
620 | Dprintk("After Callout %d.\n", cpu); | ||
621 | |||
622 | /* | ||
623 | * Wait 5s total for a response | ||
624 | */ | ||
625 | for (timeout = 0; timeout < 50000; timeout++) { | ||
626 | if (cpu_isset(cpu, cpu_callin_map)) | ||
627 | break; /* It has booted */ | ||
628 | udelay(100); | ||
629 | } | ||
630 | |||
631 | if (cpu_isset(cpu, cpu_callin_map)) { | ||
632 | /* number CPUs logically, starting from 1 (BSP is 0) */ | ||
633 | Dprintk("OK.\n"); | ||
634 | print_cpu_info(&cpu_data[cpu]); | ||
635 | Dprintk("CPU has booted.\n"); | ||
636 | } else { | ||
637 | boot_error = 1; | ||
638 | if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE)) | ||
639 | == 0xA5) | ||
640 | /* trampoline started but...? */ | ||
641 | printk("Stuck ??\n"); | ||
642 | else | ||
643 | /* trampoline code not run */ | ||
644 | printk("Not responding.\n"); | ||
645 | #if APIC_DEBUG | ||
646 | inquire_remote_apic(apicid); | ||
647 | #endif | ||
648 | } | ||
649 | } | ||
650 | if (boot_error) { | ||
651 | cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ | ||
652 | clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ | ||
653 | cpucount--; | ||
654 | x86_cpu_to_apicid[cpu] = BAD_APICID; | ||
655 | x86_cpu_to_log_apicid[cpu] = BAD_APICID; | ||
656 | } | ||
657 | } | ||
658 | |||
/*
 * Hook for SMP scheduler tuning.
 *
 * Intentionally empty: nothing computed here is stored or handed to any
 * other code, so no cache-size or bandwidth estimate is derived. The
 * function is kept so that its caller does not need to change.
 */
static void smp_tune_scheduling (void)
{
}
684 | |||
685 | /* | ||
686 | * Cycle through the processors sending APIC IPIs to boot each. | ||
687 | */ | ||
688 | |||
689 | static void __init smp_boot_cpus(unsigned int max_cpus) | ||
690 | { | ||
691 | unsigned apicid, cpu, bit, kicked; | ||
692 | |||
693 | nmi_watchdog_default(); | ||
694 | |||
695 | /* | ||
696 | * Setup boot CPU information | ||
697 | */ | ||
698 | smp_store_cpu_info(0); /* Final full version of the data */ | ||
699 | printk(KERN_INFO "CPU%d: ", 0); | ||
700 | print_cpu_info(&cpu_data[0]); | ||
701 | |||
702 | current_thread_info()->cpu = 0; | ||
703 | smp_tune_scheduling(); | ||
704 | |||
705 | if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { | ||
706 | printk("weird, boot CPU (#%d) not listed by the BIOS.\n", | ||
707 | hard_smp_processor_id()); | ||
708 | physid_set(hard_smp_processor_id(), phys_cpu_present_map); | ||
709 | } | ||
710 | |||
711 | /* | ||
712 | * If we couldn't find an SMP configuration at boot time, | ||
713 | * get out of here now! | ||
714 | */ | ||
715 | if (!smp_found_config) { | ||
716 | printk(KERN_NOTICE "SMP motherboard not detected.\n"); | ||
717 | io_apic_irqs = 0; | ||
718 | cpu_online_map = cpumask_of_cpu(0); | ||
719 | cpu_set(0, cpu_sibling_map[0]); | ||
720 | phys_cpu_present_map = physid_mask_of_physid(0); | ||
721 | if (APIC_init_uniprocessor()) | ||
722 | printk(KERN_NOTICE "Local APIC not detected." | ||
723 | " Using dummy APIC emulation.\n"); | ||
724 | goto smp_done; | ||
725 | } | ||
726 | |||
727 | /* | ||
728 | * Should not be necessary because the MP table should list the boot | ||
729 | * CPU too, but we do it for the sake of robustness anyway. | ||
730 | */ | ||
731 | if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) { | ||
732 | printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n", | ||
733 | boot_cpu_id); | ||
734 | physid_set(hard_smp_processor_id(), phys_cpu_present_map); | ||
735 | } | ||
736 | |||
737 | /* | ||
738 | * If we couldn't find a local APIC, then get out of here now! | ||
739 | */ | ||
740 | if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) { | ||
741 | printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", | ||
742 | boot_cpu_id); | ||
743 | printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); | ||
744 | io_apic_irqs = 0; | ||
745 | cpu_online_map = cpumask_of_cpu(0); | ||
746 | cpu_set(0, cpu_sibling_map[0]); | ||
747 | phys_cpu_present_map = physid_mask_of_physid(0); | ||
748 | disable_apic = 1; | ||
749 | goto smp_done; | ||
750 | } | ||
751 | |||
752 | verify_local_APIC(); | ||
753 | |||
754 | /* | ||
755 | * If SMP should be disabled, then really disable it! | ||
756 | */ | ||
757 | if (!max_cpus) { | ||
758 | smp_found_config = 0; | ||
759 | printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); | ||
760 | io_apic_irqs = 0; | ||
761 | cpu_online_map = cpumask_of_cpu(0); | ||
762 | cpu_set(0, cpu_sibling_map[0]); | ||
763 | phys_cpu_present_map = physid_mask_of_physid(0); | ||
764 | disable_apic = 1; | ||
765 | goto smp_done; | ||
766 | } | ||
767 | |||
768 | connect_bsp_APIC(); | ||
769 | setup_local_APIC(); | ||
770 | |||
771 | if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) | ||
772 | BUG(); | ||
773 | |||
774 | x86_cpu_to_apicid[0] = boot_cpu_id; | ||
775 | |||
776 | /* | ||
777 | * Now scan the CPU present map and fire up the other CPUs. | ||
778 | */ | ||
779 | Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map)); | ||
780 | |||
781 | kicked = 1; | ||
782 | for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) { | ||
783 | apicid = cpu_present_to_apicid(bit); | ||
784 | /* | ||
785 | * Don't even attempt to start the boot CPU! | ||
786 | */ | ||
787 | if (apicid == boot_cpu_id || (apicid == BAD_APICID)) | ||
788 | continue; | ||
789 | |||
790 | if (!physid_isset(apicid, phys_cpu_present_map)) | ||
791 | continue; | ||
792 | if ((max_cpus >= 0) && (max_cpus <= cpucount+1)) | ||
793 | continue; | ||
794 | |||
795 | do_boot_cpu(apicid); | ||
796 | ++kicked; | ||
797 | } | ||
798 | |||
799 | /* | ||
800 | * Cleanup possible dangling ends... | ||
801 | */ | ||
802 | { | ||
803 | /* | ||
804 | * Install writable page 0 entry to set BIOS data area. | ||
805 | */ | ||
806 | local_flush_tlb(); | ||
807 | |||
808 | /* | ||
809 | * Paranoid: Set warm reset code and vector here back | ||
810 | * to default values. | ||
811 | */ | ||
812 | CMOS_WRITE(0, 0xf); | ||
813 | |||
814 | *((volatile int *) phys_to_virt(0x467)) = 0; | ||
815 | } | ||
816 | |||
817 | /* | ||
818 | * Allow the user to impress friends. | ||
819 | */ | ||
820 | |||
821 | Dprintk("Before bogomips.\n"); | ||
822 | if (!cpucount) { | ||
823 | printk(KERN_INFO "Only one processor found.\n"); | ||
824 | } else { | ||
825 | unsigned long bogosum = 0; | ||
826 | for (cpu = 0; cpu < NR_CPUS; cpu++) | ||
827 | if (cpu_isset(cpu, cpu_callout_map)) | ||
828 | bogosum += cpu_data[cpu].loops_per_jiffy; | ||
829 | printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", | ||
830 | cpucount+1, | ||
831 | bogosum/(500000/HZ), | ||
832 | (bogosum/(5000/HZ))%100); | ||
833 | Dprintk("Before bogocount - setting activated=1.\n"); | ||
834 | } | ||
835 | |||
836 | /* | ||
837 | * Construct cpu_sibling_map[], so that we can tell the | ||
838 | * sibling CPU efficiently. | ||
839 | */ | ||
840 | for (cpu = 0; cpu < NR_CPUS; cpu++) | ||
841 | cpus_clear(cpu_sibling_map[cpu]); | ||
842 | |||
843 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
844 | int siblings = 0; | ||
845 | int i; | ||
846 | if (!cpu_isset(cpu, cpu_callout_map)) | ||
847 | continue; | ||
848 | |||
849 | if (smp_num_siblings > 1) { | ||
850 | for (i = 0; i < NR_CPUS; i++) { | ||
851 | if (!cpu_isset(i, cpu_callout_map)) | ||
852 | continue; | ||
853 | if (phys_proc_id[cpu] == phys_proc_id[i]) { | ||
854 | siblings++; | ||
855 | cpu_set(i, cpu_sibling_map[cpu]); | ||
856 | } | ||
857 | } | ||
858 | } else { | ||
859 | siblings++; | ||
860 | cpu_set(cpu, cpu_sibling_map[cpu]); | ||
861 | } | ||
862 | |||
863 | if (siblings != smp_num_siblings) { | ||
864 | printk(KERN_WARNING | ||
865 | "WARNING: %d siblings found for CPU%d, should be %d\n", | ||
866 | siblings, cpu, smp_num_siblings); | ||
867 | smp_num_siblings = siblings; | ||
868 | } | ||
869 | } | ||
870 | |||
871 | Dprintk("Boot done.\n"); | ||
872 | |||
873 | /* | ||
874 | * Here we can be sure that there is an IO-APIC in the system. Let's | ||
875 | * go and set it up: | ||
876 | */ | ||
877 | if (!skip_ioapic_setup && nr_ioapics) | ||
878 | setup_IO_APIC(); | ||
879 | else | ||
880 | nr_ioapics = 0; | ||
881 | |||
882 | setup_boot_APIC_clock(); | ||
883 | |||
884 | /* | ||
885 | * Synchronize the TSC with the AP | ||
886 | */ | ||
887 | if (cpu_has_tsc && cpucount) | ||
888 | synchronize_tsc_bp(); | ||
889 | |||
890 | smp_done: | ||
891 | time_init_smp(); | ||
892 | } | ||
893 | |||
894 | /* These are wrappers to interface to the new boot process. Someone | ||
895 | who understands all this stuff should rewrite it properly. --RR 15/Jul/02. smp_prepare_cpus() simply forwards max_cpus to smp_boot_cpus(). */ | ||
896 | void __init smp_prepare_cpus(unsigned int max_cpus) | ||
897 | { | ||
898 | smp_boot_cpus(max_cpus); | ||
899 | } | ||
900 | |||
901 | void __devinit smp_prepare_boot_cpu(void) | ||
902 | { | ||
903 | cpu_set(smp_processor_id(), cpu_online_map); | ||
904 | cpu_set(smp_processor_id(), cpu_callout_map); | ||
905 | } | ||
906 | |||
907 | int __devinit __cpu_up(unsigned int cpu) | ||
908 | { | ||
909 | /* This only works at boot for x86. See "rewrite" above. */ | ||
910 | if (cpu_isset(cpu, smp_commenced_mask)) { | ||
911 | local_irq_enable(); | ||
912 | return -ENOSYS; | ||
913 | } | ||
914 | |||
915 | /* In case one didn't come up */ | ||
916 | if (!cpu_isset(cpu, cpu_callin_map)) { | ||
917 | local_irq_enable(); | ||
918 | return -EIO; | ||
919 | } | ||
920 | local_irq_enable(); | ||
921 | |||
922 | /* Unleash the CPU! */ | ||
923 | Dprintk("waiting for cpu %d\n", cpu); | ||
924 | |||
925 | cpu_set(cpu, smp_commenced_mask); | ||
926 | while (!cpu_isset(cpu, cpu_online_map)) | ||
927 | mb(); | ||
928 | return 0; | ||
929 | } | ||
930 | |||
931 | void __init smp_cpus_done(unsigned int max_cpus) | ||
932 | { /* max_cpus is not used here; the function only performs the two calls below. */ | ||
933 | #ifdef CONFIG_X86_IO_APIC | ||
934 | setup_ioapic_dest(); /* only compiled in when CONFIG_X86_IO_APIC is set */ | ||
935 | #endif | ||
936 | zap_low_mappings(); /* NOTE(review): presumably removes the low-memory mappings that start_secondary() flushes from its TLB -- confirm */ | ||
937 | } | ||
938 | |||
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c new file mode 100644 index 000000000000..ebaa1e37d657 --- /dev/null +++ b/arch/x86_64/kernel/suspend.c | |||
@@ -0,0 +1,157 @@ | |||
1 | /* | ||
2 | * Suspend support specific for x86-64. | ||
3 | * | ||
4 | * Distribute under GPLv2 | ||
5 | * | ||
6 | * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> | ||
7 | * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> | ||
8 | */ | ||
9 | |||
10 | #include <linux/config.h> | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/spinlock.h> | ||
16 | #include <linux/poll.h> | ||
17 | #include <linux/delay.h> | ||
18 | #include <linux/sysrq.h> | ||
19 | #include <linux/proc_fs.h> | ||
20 | #include <linux/irq.h> | ||
21 | #include <linux/pm.h> | ||
22 | #include <linux/device.h> | ||
23 | #include <linux/suspend.h> | ||
24 | #include <asm/uaccess.h> | ||
25 | #include <asm/acpi.h> | ||
26 | #include <asm/tlbflush.h> | ||
27 | #include <asm/io.h> | ||
28 | #include <asm/proto.h> | ||
29 | |||
30 | struct saved_context saved_context; | ||
31 | |||
32 | unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx; | ||
33 | unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi; | ||
34 | unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11; | ||
35 | unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15; | ||
36 | unsigned long saved_context_eflags; | ||
37 | |||
38 | void __save_processor_state(struct saved_context *ctxt) | ||
39 | { /* Save this CPU's descriptor-table registers, segment selectors, FS/GS base MSRs and CR0/CR2/CR3/CR4 into *ctxt. */ | ||
40 | kernel_fpu_begin(); | ||
41 | |||
42 | /* | ||
43 | * descriptor tables: GDT and IDT registers, LDT selector, task register | ||
44 | */ | ||
45 | asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); /* NOTE(review): sgdt writes limit and base; assumes the base field directly follows gdt_limit -- confirm */ | ||
46 | asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); /* NOTE(review): same layout assumption for idt_limit -- confirm */ | ||
47 | asm volatile ("sldt %0" : "=m" (ctxt->ldt)); | ||
48 | asm volatile ("str %0" : "=m" (ctxt->tr)); | ||
49 | |||
50 | /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ | ||
51 | /* EFER should be constant for kernel version, no need to handle it. */ | ||
52 | /* | ||
53 | * segment registers (selectors only; the FS/GS bases are read from MSRs below) | ||
54 | */ | ||
55 | asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); | ||
56 | asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); | ||
57 | asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); | ||
58 | asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); | ||
59 | asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); | ||
60 | |||
61 | rdmsrl(MSR_FS_BASE, ctxt->fs_base); | ||
62 | rdmsrl(MSR_GS_BASE, ctxt->gs_base); | ||
63 | rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); | ||
64 | |||
65 | /* | ||
66 | * control registers (written back in the opposite order by __restore_processor_state()) | ||
67 | */ | ||
68 | asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0)); | ||
69 | asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); | ||
70 | asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); | ||
71 | asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4)); | ||
72 | } | ||
73 | |||
74 | void save_processor_state(void) | ||
75 | { /* Save the current CPU state into the file-global saved_context. */ | ||
76 | __save_processor_state(&saved_context); | ||
77 | } | ||
78 | |||
79 | static void | ||
80 | do_fpu_end(void) | ||
81 | { | ||
82 | /* Counterpart of the kernel_fpu_begin() in __save_processor_state(): restore FPU regs if necessary. */ | ||
83 | /* Kept as a separate, out-of-line function so that gcc does not move the cr0 load to some stupid place. */ | ||
84 | kernel_fpu_end(); | ||
85 | mxcsr_feature_mask_init(); | ||
86 | } | ||
87 | |||
88 | void __restore_processor_state(struct saved_context *ctxt) | ||
89 | { /* Reload the CPU state captured by __save_processor_state() from *ctxt, then fix up the TSS/LDT/debug registers and end the FPU section. */ | ||
90 | /* | ||
91 | * control registers (CR4 first, CR0 last) | ||
92 | */ | ||
93 | asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); | ||
94 | asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); | ||
95 | asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2)); | ||
96 | asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0)); | ||
97 | |||
98 | /* | ||
99 | * segment registers (%gs goes through load_gs_index() instead of a plain move) | ||
100 | */ | ||
101 | asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); | ||
102 | asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); | ||
103 | asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); | ||
104 | load_gs_index(ctxt->gs); | ||
105 | asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); | ||
106 | |||
107 | wrmsrl(MSR_FS_BASE, ctxt->fs_base); | ||
108 | wrmsrl(MSR_GS_BASE, ctxt->gs_base); | ||
109 | wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); | ||
110 | |||
111 | /* | ||
112 | * now restore the descriptor tables to their proper values | ||
113 | * ltr is done in fix_processor_context(). | ||
114 | */ | ||
115 | asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); | ||
116 | asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); | ||
117 | asm volatile ("lldt %0" :: "m" (ctxt->ldt)); | ||
118 | |||
119 | fix_processor_context(); | ||
120 | |||
121 | do_fpu_end(); | ||
122 | } | ||
123 | |||
124 | void restore_processor_state(void) | ||
125 | { /* Restore the CPU state from the file-global saved_context. */ | ||
126 | __restore_processor_state(&saved_context); | ||
127 | } | ||
128 | |||
129 | void fix_processor_context(void) | ||
130 | { | ||
131 | int cpu = smp_processor_id(); | ||
132 | struct tss_struct *t = &per_cpu(init_tss, cpu); | ||
133 | |||
134 | set_tss_desc(cpu,t); /* This just modifies memory; should not be neccessary. But... This is neccessary, because 386 hardware has concept of busy TSS or some similar stupidity. */ | ||
135 | |||
136 | cpu_gdt_table[cpu][GDT_ENTRY_TSS].type = 9; | ||
137 | |||
138 | syscall_init(); /* This sets MSR_*STAR and related */ | ||
139 | load_TR_desc(); /* This does ltr */ | ||
140 | load_LDT(¤t->active_mm->context); /* This does lldt */ | ||
141 | |||
142 | /* | ||
143 | * Now maybe reload the debug registers | ||
144 | */ | ||
145 | if (current->thread.debugreg7){ | ||
146 | loaddebug(¤t->thread, 0); | ||
147 | loaddebug(¤t->thread, 1); | ||
148 | loaddebug(¤t->thread, 2); | ||
149 | loaddebug(¤t->thread, 3); | ||
150 | /* no 4 and 5 */ | ||
151 | loaddebug(¤t->thread, 6); | ||
152 | loaddebug(¤t->thread, 7); | ||
153 | } | ||
154 | |||
155 | } | ||
156 | |||
157 | |||
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S new file mode 100644 index 000000000000..53f8e1659511 --- /dev/null +++ b/arch/x86_64/kernel/suspend_asm.S | |||
@@ -0,0 +1,104 @@ | |||
1 | /* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl> | ||
2 | * | ||
3 | * Distribute under GPLv2. | ||
4 | * | ||
5 | * swsusp_arch_resume may not use any stack, nor any variable that is | ||
6 | * not "NoSave" during copying pages: | ||
7 | * | ||
8 | * It's rewriting one kernel image with another. What is stack in "old" | ||
9 | * image could very well be data page in "new" image, and overwriting | ||
10 | * your own stack under you is bad idea. | ||
11 | */ | ||
12 | |||
13 | .text | ||
14 | #include <linux/linkage.h> | ||
15 | #include <asm/segment.h> | ||
16 | #include <asm/page.h> | ||
17 | #include <asm/offset.h> | ||
18 | |||
19 | ENTRY(swsusp_arch_suspend) /* Store the general-purpose registers and flags in the saved_context_* variables, then call swsusp_save. */ | ||
20 | |||
21 | movq %rsp, saved_context_esp(%rip) | ||
22 | movq %rax, saved_context_eax(%rip) | ||
23 | movq %rbx, saved_context_ebx(%rip) | ||
24 | movq %rcx, saved_context_ecx(%rip) | ||
25 | movq %rdx, saved_context_edx(%rip) | ||
26 | movq %rbp, saved_context_ebp(%rip) | ||
27 | movq %rsi, saved_context_esi(%rip) | ||
28 | movq %rdi, saved_context_edi(%rip) | ||
29 | movq %r8, saved_context_r08(%rip) | ||
30 | movq %r9, saved_context_r09(%rip) | ||
31 | movq %r10, saved_context_r10(%rip) | ||
32 | movq %r11, saved_context_r11(%rip) | ||
33 | movq %r12, saved_context_r12(%rip) | ||
34 | movq %r13, saved_context_r13(%rip) | ||
35 | movq %r14, saved_context_r14(%rip) | ||
36 | movq %r15, saved_context_r15(%rip) | ||
37 | pushfq ; popq saved_context_eflags(%rip) /* RFLAGS goes via the stack into saved_context_eflags */ | ||
38 | |||
39 | call swsusp_save /* %rax is left as swsusp_save set it and becomes this routine's return value */ | ||
40 | ret | ||
41 | |||
42 | ENTRY(swsusp_arch_resume) /* Copy the saved image pages back, flush the TLB, reload the registers stored by swsusp_arch_suspend and return 0 in %rax. */ | ||
43 | /* set up cr3: physical address of init_level4_pgt (virtual address minus __START_KERNEL_map) */ | ||
44 | leaq init_level4_pgt(%rip),%rax | ||
45 | subq $__START_KERNEL_map,%rax | ||
46 | movq %rax,%cr3 | ||
47 | |||
48 | movq mmu_cr4_features(%rip), %rax | ||
49 | movq %rax, %rdx | ||
50 | andq $~(1<<7), %rdx # PGE | ||
51 | movq %rdx, %cr4; # turn off PGE | ||
52 | movq %cr3, %rcx; # flush TLB | ||
53 | movq %rcx, %cr3; | ||
54 | movq %rax, %cr4; # turn PGE back on | ||
55 | |||
56 | movq pagedir_nosave(%rip), %rdx /* %rdx walks the pbe list; a zero pointer ends the loop */ | ||
57 | loop: | ||
58 | testq %rdx, %rdx | ||
59 | jz done | ||
60 | |||
61 | /* get addresses from the pbe and copy the page (512 quadwords = 4096 bytes) */ | ||
62 | movq pbe_address(%rdx), %rsi | ||
63 | movq pbe_orig_address(%rdx), %rdi | ||
64 | movq $512, %rcx | ||
65 | rep | ||
66 | movsq | ||
67 | |||
68 | /* progress to the next pbe */ | ||
69 | movq pbe_next(%rdx), %rdx | ||
70 | jmp loop | ||
71 | done: | ||
72 | /* Flush TLB, including "global" things (vmalloc) */ | ||
73 | movq mmu_cr4_features(%rip), %rax | ||
74 | movq %rax, %rdx | ||
75 | andq $~(1<<7), %rdx; # PGE | ||
76 | movq %rdx, %cr4; # turn off PGE | ||
77 | movq %cr3, %rcx; # flush TLB | ||
78 | movq %rcx, %cr3 | ||
79 | movq %rax, %cr4; # turn PGE back on | ||
80 | |||
81 | movl $24, %eax /* NOTE(review): 24 is presumably the kernel data segment selector -- confirm against asm/segment.h */ | ||
82 | movl %eax, %ds | ||
83 | |||
84 | movq saved_context_esp(%rip), %rsp | ||
85 | movq saved_context_ebp(%rip), %rbp | ||
86 | /* Don't restore %rax, it must be 0 anyway */ | ||
87 | movq saved_context_ebx(%rip), %rbx | ||
88 | movq saved_context_ecx(%rip), %rcx | ||
89 | movq saved_context_edx(%rip), %rdx | ||
90 | movq saved_context_esi(%rip), %rsi | ||
91 | movq saved_context_edi(%rip), %rdi | ||
92 | movq saved_context_r08(%rip), %r8 | ||
93 | movq saved_context_r09(%rip), %r9 | ||
94 | movq saved_context_r10(%rip), %r10 | ||
95 | movq saved_context_r11(%rip), %r11 | ||
96 | movq saved_context_r12(%rip), %r12 | ||
97 | movq saved_context_r13(%rip), %r13 | ||
98 | movq saved_context_r14(%rip), %r14 | ||
99 | movq saved_context_r15(%rip), %r15 | ||
100 | pushq saved_context_eflags(%rip) ; popfq | ||
101 | |||
102 | xorq %rax, %rax /* return value 0 */ | ||
103 | |||
104 | ret | ||
diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c new file mode 100644 index 000000000000..477d8be57d64 --- /dev/null +++ b/arch/x86_64/kernel/sys_x86_64.c | |||
@@ -0,0 +1,173 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/sys_x86_64.c | ||
3 | */ | ||
4 | |||
5 | #include <linux/errno.h> | ||
6 | #include <linux/sched.h> | ||
7 | #include <linux/syscalls.h> | ||
8 | #include <linux/mm.h> | ||
9 | #include <linux/smp.h> | ||
10 | #include <linux/smp_lock.h> | ||
11 | #include <linux/sem.h> | ||
12 | #include <linux/msg.h> | ||
13 | #include <linux/shm.h> | ||
14 | #include <linux/stat.h> | ||
15 | #include <linux/mman.h> | ||
16 | #include <linux/file.h> | ||
17 | #include <linux/utsname.h> | ||
18 | #include <linux/personality.h> | ||
19 | |||
20 | #include <asm/uaccess.h> | ||
21 | #include <asm/ia32.h> | ||
22 | |||
23 | /* | ||
24 | * sys_pipe() is the normal C calling standard for creating | ||
25 | * a pipe. It's not the way Unix traditionally does this, though. | ||
26 | */ | ||
27 | asmlinkage long sys_pipe(int __user *fildes) | ||
28 | { | ||
29 | int fd[2]; | ||
30 | int error; | ||
31 | |||
32 | error = do_pipe(fd); | ||
33 | if (!error) { | ||
34 | if (copy_to_user(fildes, fd, 2*sizeof(int))) | ||
35 | error = -EFAULT; | ||
36 | } | ||
37 | return error; | ||
38 | } | ||
39 | |||
40 | asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, | ||
41 | unsigned long fd, unsigned long off) | ||
42 | { | ||
43 | long error; | ||
44 | struct file * file; | ||
45 | |||
46 | error = -EINVAL; | ||
47 | if (off & ~PAGE_MASK) | ||
48 | goto out; | ||
49 | |||
50 | error = -EBADF; | ||
51 | file = NULL; | ||
52 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
53 | if (!(flags & MAP_ANONYMOUS)) { | ||
54 | file = fget(fd); | ||
55 | if (!file) | ||
56 | goto out; | ||
57 | } | ||
58 | down_write(¤t->mm->mmap_sem); | ||
59 | error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT); | ||
60 | up_write(¤t->mm->mmap_sem); | ||
61 | |||
62 | if (file) | ||
63 | fput(file); | ||
64 | out: | ||
65 | return error; | ||
66 | } | ||
67 | |||
68 | static void find_start_end(unsigned long flags, unsigned long *begin, | ||
69 | unsigned long *end) | ||
70 | { | ||
71 | #ifdef CONFIG_IA32_EMULATION | ||
72 | if (test_thread_flag(TIF_IA32)) { | ||
73 | *begin = TASK_UNMAPPED_32; | ||
74 | *end = IA32_PAGE_OFFSET; | ||
75 | } else | ||
76 | #endif | ||
77 | if (flags & MAP_32BIT) { | ||
78 | /* This is usually used needed to map code in small | ||
79 | model, so it needs to be in the first 31bit. Limit | ||
80 | it to that. This means we need to move the | ||
81 | unmapped base down for this case. This can give | ||
82 | conflicts with the heap, but we assume that glibc | ||
83 | malloc knows how to fall back to mmap. Give it 1GB | ||
84 | of playground for now. -AK */ | ||
85 | *begin = 0x40000000; | ||
86 | *end = 0x80000000; | ||
87 | } else { | ||
88 | *begin = TASK_UNMAPPED_64; | ||
89 | *end = TASK_SIZE; | ||
90 | } | ||
91 | } | ||
92 | |||
93 | unsigned long | ||
94 | arch_get_unmapped_area(struct file *filp, unsigned long addr, | ||
95 | unsigned long len, unsigned long pgoff, unsigned long flags) | ||
96 | { | ||
97 | struct mm_struct *mm = current->mm; | ||
98 | struct vm_area_struct *vma; | ||
99 | unsigned long start_addr; | ||
100 | unsigned long begin, end; | ||
101 | |||
102 | find_start_end(flags, &begin, &end); | ||
103 | |||
104 | if (len > end) | ||
105 | return -ENOMEM; | ||
106 | |||
107 | if (addr) { | ||
108 | addr = PAGE_ALIGN(addr); | ||
109 | vma = find_vma(mm, addr); | ||
110 | if (end - len >= addr && | ||
111 | (!vma || addr + len <= vma->vm_start)) | ||
112 | return addr; | ||
113 | } | ||
114 | addr = mm->free_area_cache; | ||
115 | if (addr < begin) | ||
116 | addr = begin; | ||
117 | start_addr = addr; | ||
118 | |||
119 | full_search: | ||
120 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
121 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
122 | if (end - len < addr) { | ||
123 | /* | ||
124 | * Start a new search - just in case we missed | ||
125 | * some holes. | ||
126 | */ | ||
127 | if (start_addr != begin) { | ||
128 | start_addr = addr = begin; | ||
129 | goto full_search; | ||
130 | } | ||
131 | return -ENOMEM; | ||
132 | } | ||
133 | if (!vma || addr + len <= vma->vm_start) { | ||
134 | /* | ||
135 | * Remember the place where we stopped the search: | ||
136 | */ | ||
137 | mm->free_area_cache = addr + len; | ||
138 | return addr; | ||
139 | } | ||
140 | addr = vma->vm_end; | ||
141 | } | ||
142 | } | ||
143 | |||
144 | asmlinkage long sys_uname(struct new_utsname __user * name) | ||
145 | { | ||
146 | int err; | ||
147 | down_read(&uts_sem); | ||
148 | err = copy_to_user(name, &system_utsname, sizeof (*name)); | ||
149 | up_read(&uts_sem); | ||
150 | if (personality(current->personality) == PER_LINUX32) | ||
151 | err |= copy_to_user(&name->machine, "i686", 5); | ||
152 | return err ? -EFAULT : 0; | ||
153 | } | ||
154 | |||
155 | asmlinkage long wrap_sys_shmat(int shmid, char __user *shmaddr, int shmflg) | ||
156 | { | ||
157 | unsigned long raddr; | ||
158 | return do_shmat(shmid,shmaddr,shmflg,&raddr) ?: (long)raddr; | ||
159 | } | ||
160 | |||
161 | asmlinkage long sys_time64(long __user * tloc) | ||
162 | { | ||
163 | struct timeval now; | ||
164 | int i; | ||
165 | |||
166 | do_gettimeofday(&now); | ||
167 | i = now.tv_sec; | ||
168 | if (tloc) { | ||
169 | if (put_user(i,tloc)) | ||
170 | i = -EFAULT; | ||
171 | } | ||
172 | return i; | ||
173 | } | ||
diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c new file mode 100644 index 000000000000..e263685f864c --- /dev/null +++ b/arch/x86_64/kernel/syscall.c | |||
@@ -0,0 +1,26 @@ | |||
1 | /* System call table for x86-64. */ | ||
2 | |||
3 | #include <linux/linkage.h> | ||
4 | #include <linux/sys.h> | ||
5 | #include <linux/cache.h> | ||
6 | #include <linux/config.h> | ||
7 | |||
#define __NO_STUBS

/*
 * <asm-x86_64/unistd.h> is included twice with different definitions of
 * __SYSCALL(); its include guard is undefined before each inclusion so
 * that the header is processed again.
 *
 * First pass: every __SYSCALL(nr, sym) expands to an extern declaration
 * of sym.  (Presumably the header contains one __SYSCALL() line per
 * system call -- the header itself is not visible here.)
 */
#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
#undef _ASM_X86_64_UNISTD_H_
#include <asm-x86_64/unistd.h>

/*
 * Second pass (the #include inside the table initializer below): every
 * __SYSCALL(nr, sym) expands to the designated initializer "[nr] = sym,".
 */
#undef __SYSCALL
#define __SYSCALL(nr, sym) [ nr ] = sym,
#undef _ASM_X86_64_UNISTD_H_

typedef void (*sys_call_ptr_t)(void);

extern void sys_ni_syscall(void);

/*
 * The system call table.  Every slot is first set to sys_ni_syscall by
 * the range initializer; the entries generated from unistd.h then
 * override the slots they name.
 */
sys_call_ptr_t sys_call_table[__NR_syscall_max+1] __cacheline_aligned = {
	/* Smells like a compiler bug -- it doesn't work when the & below is removed. */
	[0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm-x86_64/unistd.h>
};
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c new file mode 100644 index 000000000000..3bafe438fa75 --- /dev/null +++ b/arch/x86_64/kernel/time.c | |||
@@ -0,0 +1,1262 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/kernel/time.c | ||
3 | * | ||
4 | * "High Precision Event Timer" based timekeeping. | ||
5 | * | ||
6 | * Copyright (c) 1991,1992,1995 Linus Torvalds | ||
7 | * Copyright (c) 1994 Alan Modra | ||
8 | * Copyright (c) 1995 Markus Kuhn | ||
9 | * Copyright (c) 1996 Ingo Molnar | ||
10 | * Copyright (c) 1998 Andrea Arcangeli | ||
11 | * Copyright (c) 2002 Vojtech Pavlik | ||
12 | * Copyright (c) 2003 Andi Kleen | ||
13 | * RTC support code taken from arch/i386/kernel/timers/time_hpet.c | ||
14 | */ | ||
15 | |||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/sched.h> | ||
18 | #include <linux/interrupt.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/mc146818rtc.h> | ||
21 | #include <linux/irq.h> | ||
22 | #include <linux/time.h> | ||
23 | #include <linux/ioport.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/device.h> | ||
26 | #include <linux/sysdev.h> | ||
27 | #include <linux/bcd.h> | ||
28 | #include <linux/kallsyms.h> | ||
29 | #include <asm/8253pit.h> | ||
30 | #include <asm/pgtable.h> | ||
31 | #include <asm/vsyscall.h> | ||
32 | #include <asm/timex.h> | ||
33 | #include <asm/proto.h> | ||
34 | #include <asm/hpet.h> | ||
35 | #include <asm/sections.h> | ||
36 | #include <linux/cpufreq.h> | ||
37 | #include <linux/hpet.h> | ||
38 | #ifdef CONFIG_X86_LOCAL_APIC | ||
39 | #include <asm/apic.h> | ||
40 | #endif | ||
41 | |||
u64 jiffies_64 = INITIAL_JIFFIES;

EXPORT_SYMBOL(jiffies_64);

#ifdef CONFIG_CPU_FREQ
static void cpufreq_delayed_get(void);
#endif
extern void i8254_timer_resume(void);
extern int using_apic_timer;

DEFINE_SPINLOCK(rtc_lock);	/* taken around CMOS_READ/CMOS_WRITE accesses in this file */
DEFINE_SPINLOCK(i8253_lock);	/* taken around PIT port accesses in this file */

static int nohpet __initdata = 0;	/* when set, time_init() clears vxtime.hpet_address */
static int notsc __initdata = 0;

#undef HPET_HACK_ENABLE_DANGEROUS

unsigned int cpu_khz;					/* TSC frequency in kHz, set by time_init() */
static unsigned long hpet_period;			/* fsecs / HPET clock */
unsigned long hpet_tick;				/* HPET clocks / interrupt */
unsigned long vxtime_hz = PIT_TICK_RATE;	/* frequency of the timer in use; PIT rate unless HPET is found */
int report_lost_ticks;				/* command line option */
unsigned long long monotonic_base;		/* accumulated by timer_interrupt(), read by monotonic_clock() */

struct vxtime_data __vxtime __section_vxtime;	/* for vsyscalls */

volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
struct timespec __xtime __section_xtime;
struct timezone __sys_tz __section_sys_tz;
73 | |||
/*
 * Read the TSC into *tsc.  On CONFIG_SMP builds sync_core() is issued
 * immediately before the read.
 */
static inline void rdtscll_sync(unsigned long *tsc)
{
	unsigned long now;

#ifdef CONFIG_SMP
	sync_core();
#endif
	rdtscll(now);
	*tsc = now;
}
81 | |||
82 | /* | ||
83 | * do_gettimeoffset() returns microseconds since last timer interrupt was | ||
84 | * triggered by hardware. A memory read of HPET is slower than a register read | ||
85 | * of TSC, but much more reliable. It's also synchronized to the timer | ||
86 | * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a | ||
87 | * timer interrupt has happened already, but vxtime.trigger wasn't updated yet. | ||
88 | * This is not a problem, because jiffies hasn't updated either. They are bound | ||
89 | * together by xtime_lock. | ||
90 | */ | ||
91 | |||
92 | static inline unsigned int do_gettimeoffset_tsc(void) | ||
93 | { | ||
94 | unsigned long t; | ||
95 | unsigned long x; | ||
96 | rdtscll_sync(&t); | ||
97 | if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */ | ||
98 | x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; | ||
99 | return x; | ||
100 | } | ||
101 | |||
/*
 * HPET flavour of do_gettimeoffset(): scale the number of HPET counts
 * since vxtime.last by the 32.32 fixed-point factor vxtime.quot.
 */
static inline unsigned int do_gettimeoffset_hpet(void)
{
	return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32;
}

/*
 * Offset routine currently in use.  Starts out as the TSC version;
 * handle_lost_ticks() switches it to the HPET version when it falls back
 * to HPET mode.
 */
unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;
108 | |||
/*
 * This version of gettimeofday() has microsecond resolution and better than
 * microsecond precision, as we're using at least a 10 MHz (usually 14.31818
 * MHz) HPET timer.
 *
 * The result is xtime plus the jiffies not yet folded into xtime
 * (jiffies - wall_jiffies) plus the offset reported by do_gettimeoffset().
 * All inputs are sampled inside an xtime_lock read section that is retried
 * if a writer was active.
 */

void do_gettimeofday(struct timeval *tv)
{
	unsigned long seq, t;
	unsigned int sec, usec;

	do {
		seq = read_seqbegin(&xtime_lock);

		sec = xtime.tv_sec;
		usec = xtime.tv_nsec / 1000;

		/* i386 does some correction here to keep the clock
		   monotonous even when ntpd is fixing drift.
		   But they didn't work for me, there is a non monotonic
		   clock anyways with ntp.
		   I dropped all corrections now until a real solution can
		   be found. Note when you fix it here you need to do the same
		   in arch/x86_64/kernel/vsyscall.c and export all needed
		   variables in vmlinux.lds. -AK */

		/* microseconds elapsed since xtime was last updated */
		t = (jiffies - wall_jiffies) * (1000000L / HZ) +
			do_gettimeoffset();
		usec += t;

	} while (read_seqretry(&xtime_lock, seq));

	/* usec may now exceed one second; carry the excess into tv_sec */
	tv->tv_sec = sec + usec / 1000000;
	tv->tv_usec = usec % 1000000;
}
144 | |||
145 | EXPORT_SYMBOL(do_gettimeofday); | ||
146 | |||
/*
 * settimeofday() first undoes the correction that gettimeofday would do
 * on the time, and then saves it. This is ugly, but has been like this for
 * ages already.
 *
 * Returns -EINVAL if tv->tv_nsec is not within [0, NSEC_PER_SEC), else 0.
 */

int do_settimeofday(struct timespec *tv)
{
	time_t wtm_sec, sec = tv->tv_sec;
	long wtm_nsec, nsec = tv->tv_nsec;

	/* the unsigned cast also rejects negative tv_nsec values */
	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
		return -EINVAL;

	write_seqlock_irq(&xtime_lock);

	/* subtract what do_gettimeofday() would add on top of xtime */
	nsec -= do_gettimeoffset() * 1000 +
		(jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ);

	/* shift wall_to_monotonic by the amount xtime is about to change */
	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);

	set_normalized_timespec(&xtime, sec, nsec);
	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);

	time_adjust = 0;		/* stop active adjtime() */
	time_status |= STA_UNSYNC;
	time_maxerror = NTP_PHASE_LIMIT;
	time_esterror = NTP_PHASE_LIMIT;

	write_sequnlock_irq(&xtime_lock);
	clock_was_set();
	return 0;
}
181 | |||
182 | EXPORT_SYMBOL(do_settimeofday); | ||
183 | |||
184 | unsigned long profile_pc(struct pt_regs *regs) | ||
185 | { | ||
186 | unsigned long pc = instruction_pointer(regs); | ||
187 | |||
188 | /* Assume the lock function has either no stack frame or only a single word. | ||
189 | This checks if the address on the stack looks like a kernel text address. | ||
190 | There is a small window for false hits, but in that case the tick | ||
191 | is just accounted to the spinlock function. | ||
192 | Better would be to write these functions in assembler again | ||
193 | and check exactly. */ | ||
194 | if (in_lock_functions(pc)) { | ||
195 | char *v = *(char **)regs->rsp; | ||
196 | if ((v >= _stext && v <= _etext) || | ||
197 | (v >= _sinittext && v <= _einittext) || | ||
198 | (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END)) | ||
199 | return (unsigned long)v; | ||
200 | return ((unsigned long *)regs->rsp)[1]; | ||
201 | } | ||
202 | return pc; | ||
203 | } | ||
204 | EXPORT_SYMBOL(profile_pc); | ||
205 | |||
/*
 * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
 * ms after the second nowtime has started, because when nowtime is written
 * into the registers of the CMOS clock, it will jump to the next second
 * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data
 * sheet for details.
 *
 * Only the minutes and seconds registers are written; nowtime is a time in
 * seconds from which those two fields are derived.
 */

static void set_rtc_mmss(unsigned long nowtime)
{
	int real_seconds, real_minutes, cmos_minutes;
	unsigned char control, freq_select;

/*
 * IRQs are disabled when we're called from the timer interrupt,
 * no need for spin_lock_irqsave()
 */

	spin_lock(&rtc_lock);

/*
 * Tell the clock it's being set and stop it.
 */

	control = CMOS_READ(RTC_CONTROL);
	CMOS_WRITE(control | RTC_SET, RTC_CONTROL);

	freq_select = CMOS_READ(RTC_FREQ_SELECT);
	CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT);

	cmos_minutes = CMOS_READ(RTC_MINUTES);
	BCD_TO_BIN(cmos_minutes);

/*
 * since we're only adjusting minutes and seconds, don't interfere with hour
 * overflow. This avoids messing with unknown time zones but requires your RTC
 * not to be off by more than 15 minutes. Since we're calling it only when
 * our clock is externally synchronized using NTP, this shouldn't be a problem.
 */

	real_seconds = nowtime % 60;
	real_minutes = nowtime / 60;
	if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
		real_minutes += 30;		/* correct for half hour time zone */
	real_minutes %= 60;

#if 0
	/* AMD 8111 is a really bad time keeper and hits this regularly.
	   It probably was an attempt to avoid screwing up DST, but ignore
	   that for now. */
	if (abs(real_minutes - cmos_minutes) >= 30) {
		printk(KERN_WARNING "time.c: can't update CMOS clock "
		       "from %d to %d\n", cmos_minutes, real_minutes);
	} else
#endif

	{
		/* the registers are written in BCD */
		BIN_TO_BCD(real_seconds);
		BIN_TO_BCD(real_minutes);
		CMOS_WRITE(real_seconds, RTC_SECONDS);
		CMOS_WRITE(real_minutes, RTC_MINUTES);
	}

/*
 * The following flags have to be released exactly in this order, otherwise the
 * DS12887 (popular MC146818A clone with integrated battery and quartz) will
 * not reset the oscillator and will not update precisely 500 ms later. You
 * won't find this mentioned in the Dallas Semiconductor data sheets, but who
 * believes data sheets anyway ... -- Markus Kuhn
 */

	CMOS_WRITE(control, RTC_CONTROL);
	CMOS_WRITE(freq_select, RTC_FREQ_SELECT);

	spin_unlock(&rtc_lock);
}
282 | |||
283 | |||
/* monotonic_clock(): returns # of nanoseconds passed since time_init()
 *		Note: This function is required to return accurate
 *		time even in the absence of multiple timer ticks.
 *
 * monotonic_base and the counter value recorded at the last timer
 * interrupt are sampled inside an xtime_lock read section; an offset
 * derived from the HPET (VXTIME_HPET mode) or from the TSC (otherwise)
 * is then added to the base.
 */
unsigned long long monotonic_clock(void)
{
	unsigned long seq;
	u32 last_offset, this_offset, offset;
	unsigned long long base;

	if (vxtime.mode == VXTIME_HPET) {
		do {
			seq = read_seqbegin(&xtime_lock);

			last_offset = vxtime.last;
			base = monotonic_base;
			this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick;

		} while (read_seqretry(&xtime_lock, seq));
		offset = (this_offset - last_offset);
		/*
		 * NOTE(review): the division is done before the multiply, so
		 * the ns-per-HPET-count factor is truncated to an integer.
		 */
		offset *=(NSEC_PER_SEC/HZ)/hpet_tick;
		return base + offset;
	}else{
		do {
			seq = read_seqbegin(&xtime_lock);

			last_offset = vxtime.last_tsc;
			base = monotonic_base;
		} while (read_seqretry(&xtime_lock, seq));
		sync_core();
		/*
		 * NOTE(review): both TSC values are held in u32 variables, so
		 * only their low 32 bits take part in the subtraction below.
		 */
		rdtscll(this_offset);
		/*
		 * NOTE(review): timer_interrupt() advances monotonic_base by
		 * cycles*1000000/cpu_khz; the factor here is 1000.  Confirm
		 * which unit is intended before relying on this path.
		 */
		offset = (this_offset - last_offset)*1000/cpu_khz;
		return base + offset;
	}


}
321 | EXPORT_SYMBOL(monotonic_clock); | ||
322 | |||
323 | static noinline void handle_lost_ticks(int lost, struct pt_regs *regs) | ||
324 | { | ||
325 | static long lost_count; | ||
326 | static int warned; | ||
327 | |||
328 | if (report_lost_ticks) { | ||
329 | printk(KERN_WARNING "time.c: Lost %d timer " | ||
330 | "tick(s)! ", lost); | ||
331 | print_symbol("rip %s)\n", regs->rip); | ||
332 | } | ||
333 | |||
334 | if (lost_count == 1000 && !warned) { | ||
335 | printk(KERN_WARNING | ||
336 | "warning: many lost ticks.\n" | ||
337 | KERN_WARNING "Your time source seems to be instable or " | ||
338 | "some driver is hogging interupts\n"); | ||
339 | print_symbol("rip %s\n", regs->rip); | ||
340 | if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) { | ||
341 | printk(KERN_WARNING "Falling back to HPET\n"); | ||
342 | vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; | ||
343 | vxtime.mode = VXTIME_HPET; | ||
344 | do_gettimeoffset = do_gettimeoffset_hpet; | ||
345 | } | ||
346 | /* else should fall back to PIT, but code missing. */ | ||
347 | warned = 1; | ||
348 | } else | ||
349 | lost_count++; | ||
350 | |||
351 | #ifdef CONFIG_CPU_FREQ | ||
352 | /* In some cases the CPU can change frequency without us noticing | ||
353 | (like going into thermal throttle) | ||
354 | Give cpufreq a change to catch up. */ | ||
355 | if ((lost_count+1) % 25 == 0) { | ||
356 | cpufreq_delayed_get(); | ||
357 | } | ||
358 | #endif | ||
359 | } | ||
360 | |||
/*
 * Timer interrupt handler (installed on IRQ 0 by time_init()).
 *
 * Under the xtime_lock write lock it works out how far the time source has
 * advanced since the previous tick, advances monotonic_base, accounts for
 * lost ticks, runs do_timer() and, when the clock is externally
 * synchronized, periodically writes the time back to the CMOS clock.
 *
 * Always returns IRQ_HANDLED.
 */
static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	static unsigned long rtc_update = 0;	/* xtime.tv_sec after which the next CMOS update is allowed */
	unsigned long tsc;
	int delay, offset = 0, lost = 0;

/*
 * Here we are in the timer irq handler. We have irqs locally disabled (so we
 * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
 * on the other CPU, so we need a lock. We also need to lock the vsyscall
 * variables, because both do_timer() and us change them -arca+vojtech
 */

	write_seqlock(&xtime_lock);

	/*
	 * delay: how far the timer has already counted past the point where
	 * this interrupt was raised, in HPET counts or PIT counts.
	 */
	if (vxtime.hpet_address) {
		offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
		delay = hpet_readl(HPET_COUNTER) - offset;
	} else {
		spin_lock(&i8253_lock);
		/* presumably the 8254 counter-latch command for channel 0 -- confirm against the data sheet */
		outb_p(0x00, 0x43);
		delay = inb_p(0x40);
		delay |= inb(0x40) << 8;
		spin_unlock(&i8253_lock);
		delay = LATCH - 1 - delay;
	}

	rdtscll_sync(&tsc);

	if (vxtime.mode == VXTIME_HPET) {
		/* more than one tick's worth of HPET counts since last time: ticks were lost */
		if (offset - vxtime.last > hpet_tick) {
			lost = (offset - vxtime.last) / hpet_tick - 1;
		}

		monotonic_base +=
			(offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick;

		vxtime.last = offset;
	} else {
		/* scaled TSC time since the last tick, beyond the one tick that is expected */
		offset = (((tsc - vxtime.last_tsc) *
			   vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ);

		if (offset < 0)
			offset = 0;

		/* a whole extra tick or more has elapsed: count lost ticks, keep the remainder */
		if (offset > (USEC_PER_SEC / HZ)) {
			lost = offset / (USEC_PER_SEC / HZ);
			offset %= (USEC_PER_SEC / HZ);
		}

		monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ;

		/* back-date last_tsc by the delay, converted from timer counts to TSC cycles */
		vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot;

		/* make sure the remainder computed above is still covered by last_tsc */
		if ((((tsc - vxtime.last_tsc) *
		      vxtime.tsc_quot) >> 32) < offset)
			vxtime.last_tsc = tsc -
				(((long) offset << 32) / vxtime.tsc_quot) - 1;
	}

	if (lost > 0) {
		handle_lost_ticks(lost, regs);
		jiffies += lost;
	}

/*
 * Do the timer stuff.
 */

	do_timer(regs);
#ifndef CONFIG_SMP
	update_process_times(user_mode(regs));
#endif

/*
 * In the SMP case we use the local APIC timer interrupt to do the profiling,
 * except when we simulate SMP mode on a uniprocessor system, in that case we
 * have to call the local interrupt handler.
 */

#ifndef CONFIG_X86_LOCAL_APIC
	profile_tick(CPU_PROFILING, regs);
#else
	if (!using_apic_timer)
		smp_local_timer_interrupt(regs);
#endif

/*
 * If we have an externally synchronized Linux clock, then update CMOS clock
 * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy
 * closest to exactly 500 ms before the next second. If the update fails, we
 * don't care, as it'll be updated on the next turn, and the problem (time way
 * off) isn't likely to go away much sooner anyway.
 */

	if ((~time_status & STA_UNSYNC) && xtime.tv_sec > rtc_update &&
		abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) {
		set_rtc_mmss(xtime.tv_sec);
		rtc_update = xtime.tv_sec + 660;
	}

	write_sequnlock(&xtime_lock);

	return IRQ_HANDLED;
}
466 | |||
/*
 * Cycle -> nanosecond conversion in fixed point.
 *
 * cyc2ns_scale holds (1000 / cpu_mhz) scaled up by 2^CYC2NS_SCALE_FACTOR,
 * so converting a cycle count is one multiply and one shift.
 */
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
static unsigned int cyc2ns_scale;

/* Recompute the scale factor for a CPU clock of cpu_mhz MHz. */
static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
{
	unsigned long scaled = (1000 << CYC2NS_SCALE_FACTOR) / cpu_mhz;

	cyc2ns_scale = scaled;
}

/* Convert cyc cycles to nanoseconds using the current scale factor. */
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
	unsigned long long scaled = cyc * cyc2ns_scale;

	return scaled >> CYC2NS_SCALE_FACTOR;
}
479 | |||
/*
 * Scheduler clock: the current TSC value converted to nanoseconds with
 * cycles_2_ns().
 */
unsigned long long sched_clock(void)
{
	unsigned long cycles = 0;

	/* The HPET is deliberately not read here. Using TSC always is much
	   faster and HPET may not be mapped yet when the scheduler first
	   runs. Disadvantage is a small drift between CPUs in some
	   configurations, but that should be tolerable. */

	/* Could do CPU core sync here. Opteron can execute rdtsc
	   speculatively, which means it is not completely exact and may not
	   be monotonous between CPUs. But the errors should be too small to
	   matter for scheduling purposes. */

	rdtscll(cycles);
	return cycles_2_ns(cycles);
}
501 | |||
502 | unsigned long get_cmos_time(void) | ||
503 | { | ||
504 | unsigned int timeout, year, mon, day, hour, min, sec; | ||
505 | unsigned char last, this; | ||
506 | unsigned long flags; | ||
507 | |||
508 | /* | ||
509 | * The Linux interpretation of the CMOS clock register contents: When the | ||
510 | * Update-In-Progress (UIP) flag goes from 1 to 0, the RTC registers show the | ||
511 | * second which has precisely just started. Waiting for this can take up to 1 | ||
512 | * second, we timeout approximately after 2.4 seconds on a machine with | ||
513 | * standard 8.3 MHz ISA bus. | ||
514 | */ | ||
515 | |||
516 | spin_lock_irqsave(&rtc_lock, flags); | ||
517 | |||
518 | timeout = 1000000; | ||
519 | last = this = 0; | ||
520 | |||
521 | while (timeout && last && !this) { | ||
522 | last = this; | ||
523 | this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP; | ||
524 | timeout--; | ||
525 | } | ||
526 | |||
527 | /* | ||
528 | * Here we are safe to assume the registers won't change for a whole second, so | ||
529 | * we just go ahead and read them. | ||
530 | */ | ||
531 | |||
532 | sec = CMOS_READ(RTC_SECONDS); | ||
533 | min = CMOS_READ(RTC_MINUTES); | ||
534 | hour = CMOS_READ(RTC_HOURS); | ||
535 | day = CMOS_READ(RTC_DAY_OF_MONTH); | ||
536 | mon = CMOS_READ(RTC_MONTH); | ||
537 | year = CMOS_READ(RTC_YEAR); | ||
538 | |||
539 | spin_unlock_irqrestore(&rtc_lock, flags); | ||
540 | |||
541 | /* | ||
542 | * We know that x86-64 always uses BCD format, no need to check the config | ||
543 | * register. | ||
544 | */ | ||
545 | |||
546 | BCD_TO_BIN(sec); | ||
547 | BCD_TO_BIN(min); | ||
548 | BCD_TO_BIN(hour); | ||
549 | BCD_TO_BIN(day); | ||
550 | BCD_TO_BIN(mon); | ||
551 | BCD_TO_BIN(year); | ||
552 | |||
553 | /* | ||
554 | * x86-64 systems only exists since 2002. | ||
555 | * This will work up to Dec 31, 2100 | ||
556 | */ | ||
557 | year += 2000; | ||
558 | |||
559 | return mktime(year, mon, day, hour, min, sec); | ||
560 | } | ||
561 | |||
562 | #ifdef CONFIG_CPU_FREQ | ||
563 | |||
564 | /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency | ||
565 | changes. | ||
566 | |||
567 | RED-PEN: On SMP we assume all CPUs run with the same frequency. It's | ||
568 | not that important because current Opteron setups do not support | ||
569 | scaling on SMP anyroads. | ||
570 | |||
571 | Should fix up last_tsc too. Currently gettimeofday in the | ||
572 | first tick after the change will be slightly wrong. */ | ||
573 | |||
574 | #include <linux/workqueue.h> | ||
575 | |||
576 | static unsigned int cpufreq_delayed_issched = 0; | ||
577 | static unsigned int cpufreq_init = 0; | ||
578 | static struct work_struct cpufreq_delayed_get_work; | ||
579 | |||
580 | static void handle_cpufreq_delayed_get(void *v) | ||
581 | { | ||
582 | unsigned int cpu; | ||
583 | for_each_online_cpu(cpu) { | ||
584 | cpufreq_get(cpu); | ||
585 | } | ||
586 | cpufreq_delayed_issched = 0; | ||
587 | } | ||
588 | |||
589 | /* if we notice lost ticks, schedule a call to cpufreq_get() as it tries | ||
590 | * to verify the CPU frequency the timing core thinks the CPU is running | ||
591 | * at is still correct. | ||
592 | */ | ||
593 | static void cpufreq_delayed_get(void) | ||
594 | { | ||
595 | static int warned; | ||
596 | if (cpufreq_init && !cpufreq_delayed_issched) { | ||
597 | cpufreq_delayed_issched = 1; | ||
598 | if (!warned) { | ||
599 | warned = 1; | ||
600 | printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n"); | ||
601 | } | ||
602 | schedule_work(&cpufreq_delayed_get_work); | ||
603 | } | ||
604 | } | ||
605 | |||
606 | static unsigned int ref_freq = 0; | ||
607 | static unsigned long loops_per_jiffy_ref = 0; | ||
608 | |||
609 | static unsigned long cpu_khz_ref = 0; | ||
610 | |||
611 | static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, | ||
612 | void *data) | ||
613 | { | ||
614 | struct cpufreq_freqs *freq = data; | ||
615 | unsigned long *lpj, dummy; | ||
616 | |||
617 | lpj = &dummy; | ||
618 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | ||
619 | #ifdef CONFIG_SMP | ||
620 | lpj = &cpu_data[freq->cpu].loops_per_jiffy; | ||
621 | #else | ||
622 | lpj = &boot_cpu_data.loops_per_jiffy; | ||
623 | #endif | ||
624 | |||
625 | |||
626 | |||
627 | if (!ref_freq) { | ||
628 | ref_freq = freq->old; | ||
629 | loops_per_jiffy_ref = *lpj; | ||
630 | cpu_khz_ref = cpu_khz; | ||
631 | } | ||
632 | if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || | ||
633 | (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || | ||
634 | (val == CPUFREQ_RESUMECHANGE)) { | ||
635 | *lpj = | ||
636 | cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); | ||
637 | |||
638 | cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); | ||
639 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | ||
640 | vxtime.tsc_quot = (1000L << 32) / cpu_khz; | ||
641 | } | ||
642 | |||
643 | set_cyc2ns_scale(cpu_khz_ref / 1000); | ||
644 | |||
645 | return 0; | ||
646 | } | ||
647 | |||
648 | static struct notifier_block time_cpufreq_notifier_block = { | ||
649 | .notifier_call = time_cpufreq_notifier | ||
650 | }; | ||
651 | |||
652 | static int __init cpufreq_tsc(void) | ||
653 | { | ||
654 | INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); | ||
655 | if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, | ||
656 | CPUFREQ_TRANSITION_NOTIFIER)) | ||
657 | cpufreq_init = 1; | ||
658 | return 0; | ||
659 | } | ||
660 | |||
661 | core_initcall(cpufreq_tsc); | ||
662 | |||
663 | #endif | ||
664 | |||
/*
 * hpet_calibrate_tsc() calibrates the processor TSC in a very simple way,
 * comparing it to the HPET timer of known frequency.
 *
 * Both counters are sampled repeatedly until either of them has advanced
 * by TICK_COUNT.  The result is the TSC frequency in kHz: TSC cycles
 * divided by the elapsed HPET time, where hpet_period is the length of one
 * HPET count in femtoseconds.
 */

#define TICK_COUNT 100000000

static unsigned int __init hpet_calibrate_tsc(void)
{
	/* only the low 32 bits of each counter are kept (rdtscl, hpet_readl) */
	int tsc_start, hpet_start;
	int tsc_now, hpet_now;
	unsigned long flags;

	local_irq_save(flags);
	local_irq_disable();

	hpet_start = hpet_readl(HPET_COUNTER);
	rdtscl(tsc_start);

	do {
		/*
		 * Interrupts are disabled only around each pair of reads;
		 * the saved state is restored at the end of every iteration.
		 */
		local_irq_disable();
		hpet_now = hpet_readl(HPET_COUNTER);
		sync_core();
		rdtscl(tsc_now);
		local_irq_restore(flags);
	} while ((tsc_now - tsc_start) < TICK_COUNT &&
		 (hpet_now - hpet_start) < TICK_COUNT);

	/* cycles * 10^9 / elapsed picoseconds == cycles per millisecond */
	return (tsc_now - tsc_start) * 1000000000L
		/ ((hpet_now - hpet_start) * hpet_period / 1000);
}
696 | |||
697 | |||
/*
 * pit_calibrate_tsc() uses the speaker output (channel 2) of
 * the PIT. This is better than using the timer interrupt output,
 * because we can read the value of the speaker with just one inb(),
 * where we need three i/o operations for the interrupt channel.
 * We count how many ticks the TSC does in 50 ms.
 *
 * Returns the TSC frequency in kHz (cycles counted divided by 50 ms).
 */

static unsigned int __init pit_calibrate_tsc(void)
{
	unsigned long start, end;
	unsigned long flags;

	spin_lock_irqsave(&i8253_lock, flags);

	/* port 0x61: clear bit 1, set bit 0 */
	outb((inb(0x61) & ~0x02) | 0x01, 0x61);

	/* program channel 2 with the number of PIT ticks in 50 ms, LSB then MSB */
	outb(0xb0, 0x43);
	outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
	outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42);
	rdtscll(start);
	sync_core();
	/* busy-wait until bit 5 of port 0x61 becomes set */
	while ((inb(0x61) & 0x20) == 0);
	sync_core();
	rdtscll(end);

	spin_unlock_irqrestore(&i8253_lock, flags);

	return (end - start) / 50;
}
728 | |||
729 | #ifdef CONFIG_HPET | ||
730 | static __init int late_hpet_init(void) | ||
731 | { | ||
732 | struct hpet_data hd; | ||
733 | unsigned int ntimer; | ||
734 | |||
735 | if (!vxtime.hpet_address) | ||
736 | return -1; | ||
737 | |||
738 | memset(&hd, 0, sizeof (hd)); | ||
739 | |||
740 | ntimer = hpet_readl(HPET_ID); | ||
741 | ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; | ||
742 | ntimer++; | ||
743 | |||
744 | /* | ||
745 | * Register with driver. | ||
746 | * Timer0 and Timer1 is used by platform. | ||
747 | */ | ||
748 | hd.hd_phys_address = vxtime.hpet_address; | ||
749 | hd.hd_address = (void *)fix_to_virt(FIX_HPET_BASE); | ||
750 | hd.hd_nirqs = ntimer; | ||
751 | hd.hd_flags = HPET_DATA_PLATFORM; | ||
752 | hpet_reserve_timer(&hd, 0); | ||
753 | #ifdef CONFIG_HPET_EMULATE_RTC | ||
754 | hpet_reserve_timer(&hd, 1); | ||
755 | #endif | ||
756 | hd.hd_irq[0] = HPET_LEGACY_8254; | ||
757 | hd.hd_irq[1] = HPET_LEGACY_RTC; | ||
758 | if (ntimer > 2) { | ||
759 | struct hpet *hpet; | ||
760 | struct hpet_timer *timer; | ||
761 | int i; | ||
762 | |||
763 | hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); | ||
764 | |||
765 | for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer; | ||
766 | timer++, i++) | ||
767 | hd.hd_irq[i] = (timer->hpet_config & | ||
768 | Tn_INT_ROUTE_CNF_MASK) >> | ||
769 | Tn_INT_ROUTE_CNF_SHIFT; | ||
770 | |||
771 | } | ||
772 | |||
773 | hpet_alloc(&hd); | ||
774 | return 0; | ||
775 | } | ||
776 | fs_initcall(late_hpet_init); | ||
777 | #endif | ||
778 | |||
/*
 * Stop the HPET, reset its main counter, program timer 0 as a periodic
 * timer and start the HPET again in legacy mode.
 *
 * NOTE(review): the tick argument is not used; the comparator is loaded
 * from the global hpet_tick.  Both callers in this file pass hpet_tick.
 *
 * Always returns 0.
 */
static int hpet_timer_stop_set_go(unsigned long tick)
{
	unsigned int cfg;

/*
 * Stop the timers and reset the main counter.
 */

	cfg = hpet_readl(HPET_CFG);
	cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
	hpet_writel(cfg, HPET_CFG);
	/* the counter is cleared with two 32-bit writes, at HPET_COUNTER and HPET_COUNTER + 4 */
	hpet_writel(0, HPET_COUNTER);
	hpet_writel(0, HPET_COUNTER + 4);

/*
 * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
 * and period also hpet_tick.
 */

	hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
		    HPET_TN_32BIT, HPET_T0_CFG);
	hpet_writel(hpet_tick, HPET_T0_CMP);
	hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */

/*
 * Go!
 */

	cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY;
	hpet_writel(cfg, HPET_CFG);

	return 0;
}
812 | |||
813 | static int hpet_init(void) | ||
814 | { | ||
815 | unsigned int id; | ||
816 | |||
817 | if (!vxtime.hpet_address) | ||
818 | return -1; | ||
819 | set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address); | ||
820 | __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); | ||
821 | |||
822 | /* | ||
823 | * Read the period, compute tick and quotient. | ||
824 | */ | ||
825 | |||
826 | id = hpet_readl(HPET_ID); | ||
827 | |||
828 | if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER) || | ||
829 | !(id & HPET_ID_LEGSUP)) | ||
830 | return -1; | ||
831 | |||
832 | hpet_period = hpet_readl(HPET_PERIOD); | ||
833 | if (hpet_period < 100000 || hpet_period > 100000000) | ||
834 | return -1; | ||
835 | |||
836 | hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) / | ||
837 | hpet_period; | ||
838 | |||
839 | return hpet_timer_stop_set_go(hpet_tick); | ||
840 | } | ||
841 | |||
842 | static int hpet_reenable(void) | ||
843 | { | ||
844 | return hpet_timer_stop_set_go(hpet_tick); | ||
845 | } | ||
846 | |||
/*
 * Program PIT channel 0 with LATCH as its count, under i8253_lock with
 * interrupts disabled.
 */
void __init pit_init(void)
{
	unsigned long flags;

	spin_lock_irqsave(&i8253_lock, flags);
	outb_p(0x34, 0x43);		/* binary, mode 2, LSB/MSB, ch 0 */
	outb_p(LATCH & 0xff, 0x40);	/* LSB */
	outb_p(LATCH >> 8, 0x40);	/* MSB */
	spin_unlock_irqrestore(&i8253_lock, flags);
}
857 | |||
/*
 * Boot option handler: turn on reporting of lost timer ticks.  The option
 * string itself is ignored.  Always returns 1.
 */
int __init time_setup(char *str)
{
	report_lost_ticks = 1;
	return 1;
}

/* irqaction for the timer interrupt; time_init() installs it on IRQ 0. */
static struct irqaction irq0 = {
	timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL
};
867 | |||
868 | extern void __init config_acpi_tables(void); | ||
869 | |||
/*
 * Boot-time timekeeping setup.
 *
 * Seeds xtime from the CMOS clock, selects the HPET if hpet_init()
 * succeeds and the PIT otherwise, calibrates the TSC against the chosen
 * timer, fills in the vxtime parameters and installs the timer interrupt
 * handler.  Timekeeping starts out in VXTIME_TSC mode in both cases.
 */
void __init time_init(void)
{
	char *timename;

#ifdef HPET_HACK_ENABLE_DANGEROUS
	if (!vxtime.hpet_address) {
		printk(KERN_WARNING "time.c: WARNING: Enabling HPET base "
		       "manually!\n");
		outl(0x800038a0, 0xcf8);
		outl(0xff000001, 0xcfc);
		outl(0x800038a0, 0xcf8);
		vxtime.hpet_address = inl(0xcfc) & 0xfffffffe;
		printk(KERN_WARNING "time.c: WARNING: Enabled HPET "
		       "at %#lx.\n", vxtime.hpet_address);
	}
#endif
	/* forgetting the address makes hpet_init() fail, so the PIT path is used */
	if (nohpet)
		vxtime.hpet_address = 0;

	xtime.tv_sec = get_cmos_time();
	xtime.tv_nsec = 0;

	/* wall_to_monotonic starts as the negation of xtime */
	set_normalized_timespec(&wall_to_monotonic,
	                        -xtime.tv_sec, -xtime.tv_nsec);

	if (!hpet_init()) {
		/* HPET frequency in Hz: 10^15 fs per second / period in fs, rounded */
		vxtime_hz = (1000000000000000L + hpet_period / 2) /
			hpet_period;
		cpu_khz = hpet_calibrate_tsc();
		timename = "HPET";
	} else {
		pit_init();
		cpu_khz = pit_calibrate_tsc();
		timename = "PIT";
	}

	printk(KERN_INFO "time.c: Using %ld.%06ld MHz %s timer.\n",
	       vxtime_hz / 1000000, vxtime_hz % 1000000, timename);
	printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
		cpu_khz / 1000, cpu_khz % 1000);
	vxtime.mode = VXTIME_TSC;
	/* 32.32 fixed-point factors: usec per timer count and usec per TSC cycle */
	vxtime.quot = (1000000L << 32) / vxtime_hz;
	vxtime.tsc_quot = (1000L << 32) / cpu_khz;
	vxtime.hz = vxtime_hz;
	rdtscll_sync(&vxtime.last_tsc);
	setup_irq(0, &irq0);

	set_cyc2ns_scale(cpu_khz / 1000);
}
919 | |||
920 | void __init time_init_smp(void) /* Pick HPET or TSC based timekeeping (vxtime.mode) and log the choice. */ | ||
921 | { | ||
922 | char *timetype; | ||
923 | |||
924 | /* | ||
925 | * AMD systems with more than one CPU don't have fully synchronized | ||
926 | * TSCs. Always use HPET gettimeofday for these, although it is slower. | ||
927 | * Intel SMP systems usually have synchronized TSCs, so always use | ||
928 | * the TSC. | ||
929 | * | ||
930 | * Exceptions: | ||
931 | * IBM Summit2 checked by oem_force_hpet_timer(). | ||
932 | * AMD dual core may also not need HPET. Check me. | ||
933 | * | ||
934 | * The TSC can be turned off with "notsc". | ||
935 | */ | ||
936 | if (num_online_cpus() > 1 && | ||
937 | boot_cpu_data.x86_vendor == X86_VENDOR_AMD) | ||
938 | notsc = 1; | ||
939 | /* Some systems will want to disable TSC and use HPET. */ | ||
940 | if (oem_force_hpet_timer()) | ||
941 | notsc = 1; | ||
942 | if (vxtime.hpet_address && notsc) { /* HPET mode needs both an HPET and the TSC being ruled out */ | ||
943 | timetype = "HPET"; | ||
944 | vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; /* timer 0 comparator minus one tick; presumably the counter value at the start of the current tick -- TODO confirm */ | ||
945 | vxtime.mode = VXTIME_HPET; | ||
946 | do_gettimeoffset = do_gettimeoffset_hpet; | ||
947 | } else { | ||
948 | timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC"; /* TSC timekeeping; the prefix names the timer found at boot */ | ||
949 | vxtime.mode = VXTIME_TSC; | ||
950 | } | ||
951 | |||
952 | printk(KERN_INFO "time.c: Using %s based timekeeping.\n", timetype); | ||
953 | } | ||
954 | |||
955 | __setup("report_lost_ticks", time_setup); | ||
956 | |||
957 | static long clock_cmos_diff; | ||
958 | static unsigned long sleep_start; | ||
959 | |||
960 | static int timer_suspend(struct sys_device *dev, u32 state) /* sysdev suspend hook: remember the CMOS time and its offset from system time for timer_resume(). Always returns 0. */ | ||
961 | { | ||
962 | /* | ||
963 | * Estimate time zone so that set_time can update the clock | ||
964 | */ | ||
965 | long cmos_time = get_cmos_time(); | ||
966 | |||
967 | clock_cmos_diff = -cmos_time; | ||
968 | clock_cmos_diff += get_seconds(); /* clock_cmos_diff = system seconds - CMOS seconds */ | ||
969 | sleep_start = cmos_time; /* CMOS time at suspend, used on resume to measure the sleep */ | ||
970 | return 0; | ||
971 | } | ||
972 | |||
973 | static int timer_resume(struct sys_device *dev) /* sysdev resume hook: restart the timer hardware, then rebuild xtime and advance jiffies from the CMOS clock. Always returns 0. */ | ||
974 | { | ||
975 | unsigned long flags; | ||
976 | unsigned long sec; | ||
977 | unsigned long ctime = get_cmos_time(); | ||
978 | unsigned long sleep_length = (ctime - sleep_start) * HZ; /* CMOS seconds elapsed since timer_suspend(), in jiffies */ | ||
979 | |||
980 | if (vxtime.hpet_address) | ||
981 | hpet_reenable(); | ||
982 | else | ||
983 | i8254_timer_resume(); | ||
984 | |||
985 | sec = ctime + clock_cmos_diff; /* reapply the system-minus-CMOS offset saved at suspend */ | ||
986 | write_seqlock_irqsave(&xtime_lock,flags); | ||
987 | xtime.tv_sec = sec; | ||
988 | xtime.tv_nsec = 0; | ||
989 | write_sequnlock_irqrestore(&xtime_lock,flags); | ||
990 | jiffies += sleep_length; /* NOTE(review): jiffies and wall_jiffies are advanced after xtime_lock has been released -- confirm this is safe here */ | ||
991 | wall_jiffies += sleep_length; | ||
992 | return 0; | ||
993 | } | ||
994 | |||
995 | static struct sysdev_class timer_sysclass = { | ||
996 | .resume = timer_resume, | ||
997 | .suspend = timer_suspend, | ||
998 | set_kset_name("timer"), | ||
999 | }; | ||
1000 | |||
1001 | |||
1002 | /* XXX this driverfs stuff should probably go elsewhere later -john */ | ||
1003 | static struct sys_device device_timer = { | ||
1004 | .id = 0, | ||
1005 | .cls = &timer_sysclass, | ||
1006 | }; | ||
1007 | |||
1008 | static int time_init_device(void) /* Register the timer sysdev class and, if that succeeds, the timer device. Returns the first error, or 0. */ | ||
1009 | { | ||
1010 | int error = sysdev_class_register(&timer_sysclass); | ||
1011 | if (!error) | ||
1012 | error = sysdev_register(&device_timer); | ||
1013 | return error; | ||
1014 | } | ||
1015 | |||
1016 | device_initcall(time_init_device); | ||
1017 | |||
1018 | #ifdef CONFIG_HPET_EMULATE_RTC | ||
1019 | /* HPET in LegacyReplacement Mode eats up the RTC interrupt line. When HPET | ||
1020 | * is enabled, we support RTC interrupt functionality in software. | ||
1021 | * RTC has 3 kinds of interrupts: | ||
1022 | * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock | ||
1023 | * is updated | ||
1024 | * 2) Alarm Interrupt - generate an interrupt at a specific time of day | ||
1025 | * 3) Periodic Interrupt - generate periodic interrupt, with frequencies | ||
1026 | * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) | ||
1027 | * (1) and (2) above are implemented using polling at a frequency of | ||
1028 | * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt | ||
1029 | * overhead. (DEFAULT_RTC_INT_FREQ) | ||
1030 | * For (3), we use interrupts at 64Hz or user specified periodic | ||
1031 | * frequency, whichever is higher. | ||
1032 | */ | ||
1033 | #include <linux/rtc.h> | ||
1034 | |||
1035 | extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs); | ||
1036 | |||
1037 | #define DEFAULT_RTC_INT_FREQ 64 | ||
1038 | #define RTC_NUM_INTS 1 | ||
1039 | |||
1040 | static unsigned long UIE_on; | ||
1041 | static unsigned long prev_update_sec; | ||
1042 | |||
1043 | static unsigned long AIE_on; | ||
1044 | static struct rtc_time alarm_time; | ||
1045 | |||
1046 | static unsigned long PIE_on; | ||
1047 | static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; | ||
1048 | static unsigned long PIE_count; | ||
1049 | |||
1050 | static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ | ||
1051 | |||
1052 | int is_hpet_enabled(void) /* Returns nonzero when vxtime holds an HPET address, 0 otherwise. */ | ||
1053 | { | ||
1054 | return vxtime.hpet_address != 0; | ||
1055 | } | ||
1056 | |||
1057 | /* | ||
1058 | * Timer 1 for RTC, we do not use periodic interrupt feature, | ||
1059 | * even if HPET supports periodic interrupts on Timer 1. | ||
1060 | * The reason being, to set up a periodic interrupt in HPET, we need to | ||
1061 | * stop the main counter. And if we do that every time someone disables/enables | ||
1062 | * RTC, we will have adverse effect on main kernel timer running on Timer 0. | ||
1063 | * So, for the time being, simulate the periodic interrupt in software. | ||
1064 | * | ||
1065 | * hpet_rtc_timer_init() is called for the first time and during subsequent | ||
1066 | * interrupts reinit happens through hpet_rtc_timer_reinit(). | ||
1067 | */ | ||
1068 | int hpet_rtc_timer_init(void) /* Arm HPET timer 1 for the first emulated RTC interrupt. Returns 0 when no HPET is in use, 1 otherwise. */ | ||
1069 | { | ||
1070 | unsigned int cfg, cnt; | ||
1071 | unsigned long flags; | ||
1072 | |||
1073 | if (!is_hpet_enabled()) | ||
1074 | return 0; | ||
1075 | /* | ||
1076 | * Set the counter 1 and enable the interrupts. | ||
1077 | */ | ||
1078 | if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) | ||
1079 | hpet_rtc_int_freq = PIE_freq; /* periodic interrupts faster than the default polling rate set the pace */ | ||
1080 | else | ||
1081 | hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; | ||
1082 | |||
1083 | local_irq_save(flags); /* interrupts stay off between reading the counter and programming the comparator */ | ||
1084 | cnt = hpet_readl(HPET_COUNTER); | ||
1085 | cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); /* hpet_tick*HZ is HPET counts per second, so this adds one emulated-interrupt interval */ | ||
1086 | hpet_writel(cnt, HPET_T1_CMP); | ||
1087 | local_irq_restore(flags); | ||
1088 | |||
1089 | cfg = hpet_readl(HPET_T1_CFG); | ||
1090 | cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT; | ||
1091 | hpet_writel(cfg, HPET_T1_CFG); | ||
1092 | |||
1093 | return 1; | ||
1094 | } | ||
1095 | |||
1096 | static void hpet_rtc_timer_reinit(void) /* Re-arm timer 1 for the next emulated RTC interrupt; called at the start of hpet_rtc_interrupt(). */ | ||
1097 | { | ||
1098 | unsigned int cfg, cnt; | ||
1099 | |||
1100 | if (!(PIE_on | AIE_on | UIE_on)) | ||
1101 | return; /* every emulated interrupt type is off: leave the timer unarmed */ | ||
1102 | |||
1103 | if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) | ||
1104 | hpet_rtc_int_freq = PIE_freq; | ||
1105 | else | ||
1106 | hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; | ||
1107 | |||
1108 | /* It is more accurate to use the comparator value than current count.*/ | ||
1109 | cnt = hpet_readl(HPET_T1_CMP); | ||
1110 | cnt += hpet_tick*HZ/hpet_rtc_int_freq; /* advance the comparator by one emulated-interrupt interval */ | ||
1111 | hpet_writel(cnt, HPET_T1_CMP); | ||
1112 | |||
1113 | cfg = hpet_readl(HPET_T1_CFG); | ||
1114 | cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT; | ||
1115 | hpet_writel(cfg, HPET_T1_CFG); | ||
1116 | |||
1117 | return; | ||
1118 | } | ||
1119 | |||
1120 | /* | ||
1121 | * The functions below are called from rtc driver. | ||
1122 | * Return 0 if HPET is not being used. | ||
1123 | * Otherwise do the necessary changes and return 1. | ||
1124 | */ | ||
1125 | int hpet_mask_rtc_irq_bit(unsigned long bit_mask) /* Switch off the emulated RTC interrupt types (RTC_UIE/RTC_PIE/RTC_AIE) selected in bit_mask. */ | ||
1126 | { | ||
1127 | if (!is_hpet_enabled()) | ||
1128 | return 0; | ||
1129 | |||
1130 | if (bit_mask & RTC_UIE) | ||
1131 | UIE_on = 0; | ||
1132 | if (bit_mask & RTC_PIE) | ||
1133 | PIE_on = 0; | ||
1134 | if (bit_mask & RTC_AIE) | ||
1135 | AIE_on = 0; | ||
1136 | |||
1137 | return 1; | ||
1138 | } | ||
1139 | |||
1140 | int hpet_set_rtc_irq_bit(unsigned long bit_mask) /* Switch on the emulated RTC interrupt types (RTC_UIE/RTC_PIE/RTC_AIE) selected in bit_mask, arming timer 1 if none was on before. */ | ||
1141 | { | ||
1142 | int timer_init_reqd = 0; | ||
1143 | |||
1144 | if (!is_hpet_enabled()) | ||
1145 | return 0; | ||
1146 | |||
1147 | if (!(PIE_on | AIE_on | UIE_on)) | ||
1148 | timer_init_reqd = 1; /* nothing was enabled so far, so timer 1 is not armed yet */ | ||
1149 | |||
1150 | if (bit_mask & RTC_UIE) { | ||
1151 | UIE_on = 1; | ||
1152 | } | ||
1153 | if (bit_mask & RTC_PIE) { | ||
1154 | PIE_on = 1; | ||
1155 | PIE_count = 0; /* start counting periodic intervals afresh */ | ||
1156 | } | ||
1157 | if (bit_mask & RTC_AIE) { | ||
1158 | AIE_on = 1; | ||
1159 | } | ||
1160 | |||
1161 | if (timer_init_reqd) | ||
1162 | hpet_rtc_timer_init(); | ||
1163 | |||
1164 | return 1; | ||
1165 | } | ||
1166 | |||
1167 | int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) /* Record the time of day that hpet_rtc_interrupt() compares against for the emulated alarm. */ | ||
1168 | { | ||
1169 | if (!is_hpet_enabled()) | ||
1170 | return 0; | ||
1171 | |||
1172 | alarm_time.tm_hour = hrs; | ||
1173 | alarm_time.tm_min = min; | ||
1174 | alarm_time.tm_sec = sec; | ||
1175 | |||
1176 | return 1; | ||
1177 | } | ||
1178 | |||
1179 | int hpet_set_periodic_freq(unsigned long freq) /* Set the emulated periodic interrupt frequency and restart the interval count. */ | ||
1180 | { | ||
1181 | if (!is_hpet_enabled()) | ||
1182 | return 0; | ||
1183 | |||
1184 | PIE_freq = freq; /* NOTE(review): freq is not validated here, and hpet_rtc_interrupt() divides by PIE_freq -- confirm callers never pass 0 */ | ||
1185 | PIE_count = 0; | ||
1186 | |||
1187 | return 1; | ||
1188 | } | ||
1189 | |||
1190 | int hpet_rtc_dropped_irq(void) /* Only reports whether the HPET is in use (1) or not (0); no other action is taken here. */ | ||
1191 | { | ||
1192 | if (!is_hpet_enabled()) | ||
1193 | return 0; | ||
1194 | |||
1195 | return 1; | ||
1196 | } | ||
1197 | |||
1198 | irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) /* Re-arms timer 1, works out in software which RTC events (update, periodic, alarm) are due, and forwards them to rtc_interrupt(). Always returns IRQ_HANDLED. */ | ||
1199 | { | ||
1200 | struct rtc_time curr_time; | ||
1201 | unsigned long rtc_int_flag = 0; | ||
1202 | int call_rtc_interrupt = 0; | ||
1203 | |||
1204 | hpet_rtc_timer_reinit(); | ||
1205 | |||
1206 | if (UIE_on | AIE_on) { | ||
1207 | rtc_get_rtc_time(&curr_time); /* curr_time is only filled in, and only read, for the update and alarm checks */ | ||
1208 | } | ||
1209 | if (UIE_on) { | ||
1210 | if (curr_time.tm_sec != prev_update_sec) { | ||
1211 | /* Set update int info, call real rtc int routine */ | ||
1212 | call_rtc_interrupt = 1; | ||
1213 | rtc_int_flag = RTC_UF; | ||
1214 | prev_update_sec = curr_time.tm_sec; | ||
1215 | } | ||
1216 | } | ||
1217 | if (PIE_on) { | ||
1218 | PIE_count++; | ||
1219 | if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { /* this many timer interrupts make up one periodic RTC interrupt */ | ||
1220 | /* Set periodic int info, call real rtc int routine */ | ||
1221 | call_rtc_interrupt = 1; | ||
1222 | rtc_int_flag |= RTC_PF; | ||
1223 | PIE_count = 0; | ||
1224 | } | ||
1225 | } | ||
1226 | if (AIE_on) { | ||
1227 | if ((curr_time.tm_sec == alarm_time.tm_sec) && /* NOTE(review): this matches on every interrupt taken during the alarm second, so RTC_AF may be raised repeatedly -- confirm intended */ | ||
1228 | (curr_time.tm_min == alarm_time.tm_min) && | ||
1229 | (curr_time.tm_hour == alarm_time.tm_hour)) { | ||
1230 | /* Set alarm int info, call real rtc int routine */ | ||
1231 | call_rtc_interrupt = 1; | ||
1232 | rtc_int_flag |= RTC_AF; | ||
1233 | } | ||
1234 | } | ||
1235 | if (call_rtc_interrupt) { | ||
1236 | rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); | ||
1237 | rtc_interrupt(rtc_int_flag, dev_id, regs); /* the flag word travels in rtc_interrupt()'s first (irq) argument */ | ||
1238 | } | ||
1239 | return IRQ_HANDLED; | ||
1240 | } | ||
1241 | #endif | ||
1242 | |||
1243 | |||
1244 | |||
1245 | static int __init nohpet_setup(char *s) /* Handler for the "nohpet" boot option: sets nohpet, which makes time_init() clear vxtime.hpet_address. s is unused. */ | ||
1246 | { | ||
1247 | nohpet = 1; | ||
1248 | return 1; /* nonzero tells the __setup machinery the option was consumed (as time_setup() does); 0 would leave it to be treated as unhandled */ | ||
1249 | } | ||
1250 | |||
1251 | __setup("nohpet", nohpet_setup); | ||
1252 | |||
1253 | |||
1254 | static int __init notsc_setup(char *s) /* Handler for the "notsc" boot option: sets notsc, which time_init_smp() checks when choosing HPET based timekeeping. s is unused. */ | ||
1255 | { | ||
1256 | notsc = 1; | ||
1257 | return 1; /* nonzero tells the __setup machinery the option was consumed (as time_setup() does); 0 would leave it to be treated as unhandled */ | ||
1258 | } | ||
1259 | |||
1260 | __setup("notsc", notsc_setup); | ||
1261 | |||
1262 | |||
diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S new file mode 100644 index 000000000000..6d9c9a8e7d0e --- /dev/null +++ b/arch/x86_64/kernel/trampoline.S | |||
@@ -0,0 +1,64 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Trampoline.S Derived from Setup.S by Linus Torvalds | ||
4 | * | ||
5 | * 4 Jan 1997 Michael Chastain: changed to gnu as. | ||
6 | * | ||
7 | * Entry: CS:IP point to the start of our code, we are | ||
8 | * in real mode with no stack, but the rest of the | ||
9 | * trampoline page to make our stack and everything else | ||
10 | * is a mystery. | ||
11 | * | ||
12 | * In fact we don't actually need a stack so we don't | ||
13 | * set one up. | ||
14 | * | ||
15 | * On entry to trampoline_data, the processor is in real mode | ||
16 | * with 16-bit addressing and 16-bit data. CS has some value | ||
17 | * and IP is zero. Thus, data addresses need to be absolute | ||
18 | * (no relocation) and are taken with regard to r_base. | ||
19 | * | ||
20 | * If you work on this file, check the object module with objdump | ||
21 | * --full-contents --reloc to make sure there are no relocation | ||
22 | * entries. For the GDT entry we do hand relocation in smpboot.c | ||
23 | * because of 64bit linker limitations. | ||
24 | */ | ||
25 | |||
26 | #include <linux/linkage.h> | ||
27 | #include <asm/segment.h> | ||
28 | #include <asm/page.h> | ||
29 | |||
30 | .data | ||
31 | |||
32 | .code16 | ||
33 | |||
34 | ENTRY(trampoline_data) | ||
35 | r_base = . | ||
36 | wbinvd | ||
37 | mov %cs, %ax # Code and data in the same place | ||
38 | mov %ax, %ds | ||
39 | |||
40 | cli # We should be safe anyway | ||
41 | |||
42 | movl $0xA5A5A5A5, trampoline_data - r_base | ||
43 | # write marker so the master knows we're running | ||
44 | |||
45 | lidt idt_48 - r_base # load idt with 0, 0 | ||
46 | lgdt gdt_48 - r_base # load gdt with whatever is appropriate | ||
47 | |||
48 | xor %ax, %ax | ||
49 | inc %ax # protected mode (PE) bit | ||
50 | lmsw %ax # into protected mode | ||
51 | # flush prefetch and jump to startup_32 in arch/x86_64/kernel/head.S | ||
52 | ljmpl $__KERNEL32_CS, $(startup_32-__START_KERNEL_map) | ||
53 | |||
54 | # Careful these need to be in the same 64K segment as the above; | ||
55 | idt_48: | ||
56 | .word 0 # idt limit = 0 | ||
57 | .word 0, 0 # idt base = 0L | ||
58 | |||
59 | gdt_48: | ||
60 | .short __KERNEL32_CS + 7 # gdt limit | ||
61 | .long cpu_gdt_table-__START_KERNEL_map | ||
62 | |||
63 | .globl trampoline_end | ||
64 | trampoline_end: | ||
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c new file mode 100644 index 000000000000..d87ac6496cc2 --- /dev/null +++ b/arch/x86_64/kernel/traps.c | |||
@@ -0,0 +1,948 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/traps.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | ||
6 | * | ||
7 | * Pentium III FXSR, SSE support | ||
8 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
9 | * | ||
10 | * $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $ | ||
11 | */ | ||
12 | |||
13 | /* | ||
14 | * 'Traps.c' handles hardware traps and faults after we have saved some | ||
15 | * state in 'entry.S'. | ||
16 | */ | ||
17 | #include <linux/config.h> | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/kernel.h> | ||
20 | #include <linux/string.h> | ||
21 | #include <linux/errno.h> | ||
22 | #include <linux/ptrace.h> | ||
23 | #include <linux/timer.h> | ||
24 | #include <linux/mm.h> | ||
25 | #include <linux/init.h> | ||
26 | #include <linux/delay.h> | ||
27 | #include <linux/spinlock.h> | ||
28 | #include <linux/interrupt.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/moduleparam.h> | ||
31 | |||
32 | #include <asm/system.h> | ||
33 | #include <asm/uaccess.h> | ||
34 | #include <asm/io.h> | ||
35 | #include <asm/atomic.h> | ||
36 | #include <asm/debugreg.h> | ||
37 | #include <asm/desc.h> | ||
38 | #include <asm/i387.h> | ||
39 | #include <asm/kdebug.h> | ||
40 | #include <asm/processor.h> | ||
41 | |||
42 | #include <asm/smp.h> | ||
43 | #include <asm/pgalloc.h> | ||
44 | #include <asm/pda.h> | ||
45 | #include <asm/proto.h> | ||
46 | #include <asm/nmi.h> | ||
47 | |||
48 | #include <linux/irq.h> | ||
49 | |||
50 | |||
51 | extern struct gate_struct idt_table[256]; | ||
52 | |||
53 | asmlinkage void divide_error(void); | ||
54 | asmlinkage void debug(void); | ||
55 | asmlinkage void nmi(void); | ||
56 | asmlinkage void int3(void); | ||
57 | asmlinkage void overflow(void); | ||
58 | asmlinkage void bounds(void); | ||
59 | asmlinkage void invalid_op(void); | ||
60 | asmlinkage void device_not_available(void); | ||
61 | asmlinkage void double_fault(void); | ||
62 | asmlinkage void coprocessor_segment_overrun(void); | ||
63 | asmlinkage void invalid_TSS(void); | ||
64 | asmlinkage void segment_not_present(void); | ||
65 | asmlinkage void stack_segment(void); | ||
66 | asmlinkage void general_protection(void); | ||
67 | asmlinkage void page_fault(void); | ||
68 | asmlinkage void coprocessor_error(void); | ||
69 | asmlinkage void simd_coprocessor_error(void); | ||
70 | asmlinkage void reserved(void); | ||
71 | asmlinkage void alignment_check(void); | ||
72 | asmlinkage void machine_check(void); | ||
73 | asmlinkage void spurious_interrupt_bug(void); | ||
74 | asmlinkage void call_debug(void); | ||
75 | |||
76 | struct notifier_block *die_chain; | ||
77 | static DEFINE_SPINLOCK(die_notifier_lock); | ||
78 | |||
79 | int register_die_notifier(struct notifier_block *nb) /* Add nb to die_chain under die_notifier_lock; returns the result of notifier_chain_register(). */ | ||
80 | { | ||
81 | int err = 0; | ||
82 | unsigned long flags; | ||
83 | spin_lock_irqsave(&die_notifier_lock, flags); | ||
84 | err = notifier_chain_register(&die_chain, nb); | ||
85 | spin_unlock_irqrestore(&die_notifier_lock, flags); | ||
86 | return err; | ||
87 | } | ||
88 | |||
89 | static inline void conditional_sti(struct pt_regs *regs) /* Enable interrupts only if the IF flag is set in the saved eflags of regs. */ | ||
90 | { | ||
91 | if (regs->eflags & X86_EFLAGS_IF) | ||
92 | local_irq_enable(); | ||
93 | } | ||
94 | |||
95 | static int kstack_depth_to_print = 10; | ||
96 | |||
97 | #ifdef CONFIG_KALLSYMS | ||
98 | #include <linux/kallsyms.h> | ||
99 | int printk_address(unsigned long address) | ||
100 | { | ||
101 | unsigned long offset = 0, symsize; | ||
102 | const char *symname; | ||
103 | char *modname; | ||
104 | char *delim = ":"; | ||
105 | char namebuf[128]; | ||
106 | |||
107 | symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); | ||
108 | if (!symname) | ||
109 | return printk("[<%016lx>]", address); | ||
110 | if (!modname) | ||
111 | modname = delim = ""; | ||
112 | return printk("<%016lx>{%s%s%s%s%+ld}", | ||
113 | address,delim,modname,delim,symname,offset); | ||
114 | } | ||
115 | #else | ||
116 | int printk_address(unsigned long address) | ||
117 | { | ||
118 | return printk("[<%016lx>]", address); | ||
119 | } | ||
120 | #endif | ||
121 | |||
122 | unsigned long *in_exception_stack(int cpu, unsigned long stack) /* If address stack lies within one of cpu's exception stacks, return that stack's ist[] value (its upper bound) as a pointer; otherwise NULL. */ | ||
123 | { | ||
124 | int k; | ||
125 | for (k = 0; k < N_EXCEPTION_STACKS; k++) { | ||
126 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | ||
127 | unsigned long start = tss->ist[k] - EXCEPTION_STKSZ; /* stack k spans the EXCEPTION_STKSZ bytes below ist[k] */ | ||
128 | |||
129 | if (stack >= start && stack < tss->ist[k]) | ||
130 | return (unsigned long *)tss->ist[k]; | ||
131 | } | ||
132 | return NULL; | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * x86-64 can have up to three kernel stacks: | ||
137 | * process stack | ||
138 | * interrupt stack | ||
139 | * severe exception (double fault, nmi, stack fault) hardware stack | ||
140 | * Check and process them in order. | ||
141 | */ | ||
142 | |||
143 | void show_trace(unsigned long *stack) | ||
144 | { | ||
145 | unsigned long addr; | ||
146 | unsigned long *irqstack, *irqstack_end, *estack_end; | ||
147 | const int cpu = safe_smp_processor_id(); | ||
148 | int i; | ||
149 | |||
150 | printk("\nCall Trace:"); | ||
151 | i = 0; | ||
152 | |||
153 | estack_end = in_exception_stack(cpu, (unsigned long)stack); | ||
154 | if (estack_end) { | ||
155 | while (stack < estack_end) { | ||
156 | addr = *stack++; | ||
157 | if (__kernel_text_address(addr)) { | ||
158 | i += printk_address(addr); | ||
159 | i += printk(" "); | ||
160 | if (i > 50) { | ||
161 | printk("\n"); | ||
162 | i = 0; | ||
163 | } | ||
164 | } | ||
165 | } | ||
166 | i += printk(" <EOE> "); | ||
167 | i += 7; | ||
168 | stack = (unsigned long *) estack_end[-2]; | ||
169 | } | ||
170 | |||
171 | irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr); | ||
172 | irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE + 64); | ||
173 | |||
174 | if (stack >= irqstack && stack < irqstack_end) { | ||
175 | printk("<IRQ> "); | ||
176 | while (stack < irqstack_end) { | ||
177 | addr = *stack++; | ||
178 | /* | ||
179 | * If the address is either in the text segment of the | ||
180 | * kernel, or in the region which contains vmalloc'ed | ||
181 | * memory, it *may* be the address of a calling | ||
182 | * routine; if so, print it so that someone tracing | ||
183 | * down the cause of the crash will be able to figure | ||
184 | * out the call path that was taken. | ||
185 | */ | ||
186 | if (__kernel_text_address(addr)) { | ||
187 | i += printk_address(addr); | ||
188 | i += printk(" "); | ||
189 | if (i > 50) { | ||
190 | printk("\n "); | ||
191 | i = 0; | ||
192 | } | ||
193 | } | ||
194 | } | ||
195 | stack = (unsigned long *) (irqstack_end[-1]); | ||
196 | printk(" <EOI> "); | ||
197 | i += 7; | ||
198 | } | ||
199 | |||
200 | while (((long) stack & (THREAD_SIZE-1)) != 0) { | ||
201 | addr = *stack++; | ||
202 | if (__kernel_text_address(addr)) { | ||
203 | i += printk_address(addr); | ||
204 | i += printk(" "); | ||
205 | if (i > 50) { | ||
206 | printk("\n "); | ||
207 | i = 0; | ||
208 | } | ||
209 | } | ||
210 | } | ||
211 | printk("\n"); | ||
212 | } | ||
213 | |||
214 | void show_stack(struct task_struct *tsk, unsigned long * rsp) | ||
215 | { | ||
216 | unsigned long *stack; | ||
217 | int i; | ||
218 | const int cpu = safe_smp_processor_id(); | ||
219 | unsigned long *irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr); | ||
220 | unsigned long *irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE); | ||
221 | |||
222 | // debugging aid: "show_stack(NULL, NULL);" prints the | ||
223 | // back trace for this cpu. | ||
224 | |||
225 | if (rsp == NULL) { | ||
226 | if (tsk) | ||
227 | rsp = (unsigned long *)tsk->thread.rsp; | ||
228 | else | ||
229 | rsp = (unsigned long *)&rsp; | ||
230 | } | ||
231 | |||
232 | stack = rsp; | ||
233 | for(i=0; i < kstack_depth_to_print; i++) { | ||
234 | if (stack >= irqstack && stack <= irqstack_end) { | ||
235 | if (stack == irqstack_end) { | ||
236 | stack = (unsigned long *) (irqstack_end[-1]); | ||
237 | printk(" <EOI> "); | ||
238 | } | ||
239 | } else { | ||
240 | if (((long) stack & (THREAD_SIZE-1)) == 0) | ||
241 | break; | ||
242 | } | ||
243 | if (i && ((i % 4) == 0)) | ||
244 | printk("\n "); | ||
245 | printk("%016lx ", *stack++); | ||
246 | } | ||
247 | show_trace((unsigned long *)rsp); | ||
248 | } | ||
249 | |||
250 | /* | ||
251 | * The architecture-independent dump_stack generator | ||
252 | */ | ||
253 | void dump_stack(void) | ||
254 | { | ||
255 | unsigned long dummy; /* never read; only its address is used, as the stack position to start tracing from */ | ||
256 | show_trace(&dummy); | ||
257 | } | ||
258 | |||
259 | EXPORT_SYMBOL(dump_stack); | ||
260 | |||
261 | void show_registers(struct pt_regs *regs) | ||
262 | { | ||
263 | int i; | ||
264 | int in_kernel = (regs->cs & 3) == 0; | ||
265 | unsigned long rsp; | ||
266 | const int cpu = safe_smp_processor_id(); | ||
267 | struct task_struct *cur = cpu_pda[cpu].pcurrent; | ||
268 | |||
269 | rsp = regs->rsp; | ||
270 | |||
271 | printk("CPU %d ", cpu); | ||
272 | __show_regs(regs); | ||
273 | printk("Process %s (pid: %d, threadinfo %p, task %p)\n", | ||
274 | cur->comm, cur->pid, cur->thread_info, cur); | ||
275 | |||
276 | /* | ||
277 | * When in-kernel, we also print out the stack and code at the | ||
278 | * time of the fault.. | ||
279 | */ | ||
280 | if (in_kernel) { | ||
281 | |||
282 | printk("Stack: "); | ||
283 | show_stack(NULL, (unsigned long*)rsp); | ||
284 | |||
285 | printk("\nCode: "); | ||
286 | if(regs->rip < PAGE_OFFSET) | ||
287 | goto bad; | ||
288 | |||
289 | for(i=0;i<20;i++) | ||
290 | { | ||
291 | unsigned char c; | ||
292 | if(__get_user(c, &((unsigned char*)regs->rip)[i])) { | ||
293 | bad: | ||
294 | printk(" Bad RIP value."); | ||
295 | break; | ||
296 | } | ||
297 | printk("%02x ", c); | ||
298 | } | ||
299 | } | ||
300 | printk("\n"); | ||
301 | } | ||
302 | |||
303 | void handle_BUG(struct pt_regs *regs) /* If the kernel-mode trap described by regs sits on a bug_frame, print the file name and line stored in it; otherwise print nothing. */ | ||
304 | { | ||
305 | struct bug_frame f; | ||
306 | char tmp; | ||
307 | |||
308 | if (regs->cs & 3) | ||
309 | return; /* trap did not come from ring 0 */ | ||
310 | if (__copy_from_user(&f, (struct bug_frame *) regs->rip, | ||
311 | sizeof(struct bug_frame))) | ||
312 | return; /* the bytes at rip could not be read */ | ||
313 | if ((unsigned long)f.filename < __PAGE_OFFSET || | ||
314 | f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) | ||
315 | return; /* not a bug_frame: filename pointer below __PAGE_OFFSET, or the first two bytes are not 0f 0b */ | ||
316 | if (__get_user(tmp, f.filename)) | ||
317 | f.filename = "unmapped filename"; /* first byte of the name is unreadable, so substitute a fixed string */ | ||
318 | printk("----------- [cut here ] --------- [please bite here ] ---------\n"); | ||
319 | printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", f.filename, f.line); | ||
320 | } | ||
321 | |||
322 | void out_of_line_bug(void) | ||
323 | { | ||
324 | BUG(); | ||
325 | } | ||
326 | |||
327 | static DEFINE_SPINLOCK(die_lock); | ||
328 | static int die_owner = -1; | ||
329 | |||
330 | void oops_begin(void) /* Start of oops reporting: disable interrupts, take die_lock (unless this CPU already owns it), record the owner and make the console verbose. */ | ||
331 | { | ||
332 | int cpu = safe_smp_processor_id(); | ||
333 | /* racy, but better than risking deadlock. */ | ||
334 | local_irq_disable(); | ||
335 | if (!spin_trylock(&die_lock)) { | ||
336 | if (cpu == die_owner) | ||
337 | /* nested oops. should stop eventually */; | ||
338 | else | ||
339 | spin_lock(&die_lock); /* another CPU holds the lock: wait for it */ | ||
340 | } | ||
341 | die_owner = cpu; | ||
342 | console_verbose(); | ||
343 | bust_spinlocks(1); | ||
344 | } | ||
345 | |||
346 | void oops_end(void) /* End of oops reporting: clear the owner, undo bust_spinlocks(1), release die_lock, and panic if panic_on_oops is set. */ | ||
347 | { | ||
348 | die_owner = -1; | ||
349 | bust_spinlocks(0); | ||
350 | spin_unlock(&die_lock); | ||
351 | if (panic_on_oops) | ||
352 | panic("Oops"); | ||
353 | } | ||
354 | |||
355 | void __die(const char * str, struct pt_regs * regs, long err) | ||
356 | { | ||
357 | static int die_counter; | ||
358 | printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); | ||
359 | #ifdef CONFIG_PREEMPT | ||
360 | printk("PREEMPT "); | ||
361 | #endif | ||
362 | #ifdef CONFIG_SMP | ||
363 | printk("SMP "); | ||
364 | #endif | ||
365 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
366 | printk("DEBUG_PAGEALLOC"); | ||
367 | #endif | ||
368 | printk("\n"); | ||
369 | notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); | ||
370 | show_registers(regs); | ||
371 | /* Executive summary in case the oops scrolled away */ | ||
372 | printk(KERN_ALERT "RIP "); | ||
373 | printk_address(regs->rip); | ||
374 | printk(" RSP <%016lx>\n", regs->rsp); | ||
375 | } | ||
376 | |||
377 | void die(const char * str, struct pt_regs * regs, long err) /* Full oops sequence: oops_begin(), BUG location if any, register/stack dump via __die(), oops_end(), then do_exit(SIGSEGV). */ | ||
378 | { | ||
379 | oops_begin(); | ||
380 | handle_BUG(regs); | ||
381 | __die(str, regs, err); | ||
382 | oops_end(); | ||
383 | do_exit(SIGSEGV); | ||
384 | } | ||
385 | static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) | ||
386 | { | ||
387 | if (!(regs->eflags & VM_MASK) && (regs->cs == __KERNEL_CS)) | ||
388 | die(str, regs, err); | ||
389 | } | ||
390 | |||
391 | void die_nmi(char *str, struct pt_regs *regs) /* Oops path taken from NMI context: print the message and registers, then panic (if panic_on_timeout or panic_on_oops) or exit with SIGSEGV. */ | ||
392 | { | ||
393 | oops_begin(); | ||
394 | /* | ||
395 | * We are in trouble anyway, lets at least try | ||
396 | * to get a message out. | ||
397 | */ | ||
398 | printk(str, safe_smp_processor_id()); /* str is used as the printk format itself and is given the CPU number as its single argument, so callers must pass a matching format */ | ||
399 | show_registers(regs); | ||
400 | if (panic_on_timeout || panic_on_oops) | ||
401 | panic("nmi watchdog"); | ||
402 | printk("console shuts up ...\n"); | ||
403 | oops_end(); | ||
404 | do_exit(SIGSEGV); | ||
405 | } | ||
406 | |||
407 | static void do_trap(int trapnr, int signr, char *str, | ||
408 | struct pt_regs * regs, long error_code, siginfo_t *info) | ||
409 | { | ||
410 | conditional_sti(regs); | ||
411 | |||
412 | #ifdef CONFIG_CHECKING | ||
413 | { | ||
414 | unsigned long gs; | ||
415 | struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); | ||
416 | rdmsrl(MSR_GS_BASE, gs); | ||
417 | if (gs != (unsigned long)pda) { | ||
418 | wrmsrl(MSR_GS_BASE, pda); | ||
419 | printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda, | ||
420 | regs->rip); | ||
421 | } | ||
422 | } | ||
423 | #endif | ||
424 | |||
425 | if ((regs->cs & 3) != 0) { | ||
426 | struct task_struct *tsk = current; | ||
427 | |||
428 | if (exception_trace && unhandled_signal(tsk, signr)) | ||
429 | printk(KERN_INFO | ||
430 | "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", | ||
431 | tsk->comm, tsk->pid, str, | ||
432 | regs->rip,regs->rsp,error_code); | ||
433 | |||
434 | tsk->thread.error_code = error_code; | ||
435 | tsk->thread.trap_no = trapnr; | ||
436 | if (info) | ||
437 | force_sig_info(signr, info, tsk); | ||
438 | else | ||
439 | force_sig(signr, tsk); | ||
440 | return; | ||
441 | } | ||
442 | |||
443 | |||
444 | /* kernel trap */ | ||
445 | { | ||
446 | const struct exception_table_entry *fixup; | ||
447 | fixup = search_exception_tables(regs->rip); | ||
448 | if (fixup) { | ||
449 | regs->rip = fixup->fixup; | ||
450 | } else | ||
451 | die(str, regs, error_code); | ||
452 | return; | ||
453 | } | ||
454 | } | ||
455 | |||
456 | #define DO_ERROR(trapnr, signr, str, name) \ | ||
457 | asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ | ||
458 | { \ | ||
459 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | ||
460 | == NOTIFY_STOP) \ | ||
461 | return; \ | ||
462 | do_trap(trapnr, signr, str, regs, error_code, NULL); \ | ||
463 | } | ||
464 | |||
465 | #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ | ||
466 | asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ | ||
467 | { \ | ||
468 | siginfo_t info; \ | ||
469 | info.si_signo = signr; \ | ||
470 | info.si_errno = 0; \ | ||
471 | info.si_code = sicode; \ | ||
472 | info.si_addr = (void __user *)siaddr; \ | ||
473 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | ||
474 | == NOTIFY_STOP) \ | ||
475 | return; \ | ||
476 | do_trap(trapnr, signr, str, regs, error_code, &info); \ | ||
477 | } | ||
478 | |||
479 | DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) /* each line here defines one do_<name>() handler; this one is do_divide_error() */ | ||
480 | DO_ERROR( 4, SIGSEGV, "overflow", overflow) | ||
481 | DO_ERROR( 5, SIGSEGV, "bounds", bounds) | ||
482 | DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->rip) /* si_addr is the faulting rip */ | ||
483 | DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) | ||
484 | DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) | ||
485 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) | ||
486 | DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) | ||
487 | DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) /* si_addr is reported as 0 */ | ||
488 | DO_ERROR(18, SIGSEGV, "reserved", reserved) | ||
489 | |||
490 | #define DO_ERROR_STACK(trapnr, signr, str, name) /* Emits do_<name>(), which returns the pt_regs pointer it ended up using. */ \ | ||
491 | asmlinkage void *do_##name(struct pt_regs * regs, long error_code) \ | ||
492 | { \ | ||
493 | struct pt_regs *pr = ((struct pt_regs *)(current->thread.rsp0))-1; /* one pt_regs below thread.rsp0 */ \ | ||
494 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | ||
495 | == NOTIFY_STOP) \ | ||
496 | return regs; /* NOTIFY_STOP: the incoming frame pointer is returned unchanged */ \ | ||
497 | if (regs->cs & 3) { /* low two bits of cs nonzero: the trap was not raised in kernel mode */ \ | ||
498 | memcpy(pr, regs, sizeof(struct pt_regs)); /* copy the frame to pr ... */ \ | ||
499 | regs = pr; /* ... and carry on with the copy */ \ | ||
500 | } \ | ||
501 | do_trap(trapnr, signr, str, regs, error_code, NULL); \ | ||
502 | return regs; \ | ||
503 | } | ||
504 | |||
505 | DO_ERROR_STACK(12, SIGBUS, "stack segment", stack_segment) /* defines do_stack_segment() */ | ||
506 | DO_ERROR_STACK( 8, SIGSEGV, "double fault", double_fault) /* defines do_double_fault() */ | ||
507 | |||
508 | asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) | ||
509 | { /* Trap 13 handler: SIGSEGV for faults outside kernel mode; exception-table fixup or die() for kernel faults. */ | ||
510 | conditional_sti(regs); | ||
511 | |||
512 | #ifdef CONFIG_CHECKING | ||
513 | { | ||
514 | unsigned long gs; | ||
515 | struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); /* cpu_pda entry indexed by safe_smp_processor_id() */ | ||
516 | rdmsrl(MSR_GS_BASE, gs); | ||
517 | if (gs != (unsigned long)pda) { /* MSR_GS_BASE does not hold that entry's address */ | ||
518 | wrmsrl(MSR_GS_BASE, pda); /* rewrite it before printing the warning */ | ||
519 | oops_in_progress++; | ||
520 | printk("general protection handler: wrong gs %lx expected %p\n", gs, pda); | ||
521 | oops_in_progress--; | ||
522 | } | ||
523 | } | ||
524 | #endif | ||
525 | |||
526 | if ((regs->cs & 3)!=0) { /* low two bits of cs nonzero: the fault was not raised in kernel mode */ | ||
527 | struct task_struct *tsk = current; | ||
528 | |||
529 | if (exception_trace && unhandled_signal(tsk, SIGSEGV)) | ||
530 | printk(KERN_INFO | ||
531 | "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", | ||
532 | tsk->comm, tsk->pid, | ||
533 | regs->rip,regs->rsp,error_code); | ||
534 | |||
535 | tsk->thread.error_code = error_code; | ||
536 | tsk->thread.trap_no = 13; | ||
537 | force_sig(SIGSEGV, tsk); | ||
538 | return; | ||
539 | } | ||
540 | |||
541 | /* kernel gp */ | ||
542 | { | ||
543 | const struct exception_table_entry *fixup; | ||
544 | fixup = search_exception_tables(regs->rip); | ||
545 | if (fixup) { | ||
546 | regs->rip = fixup->fixup; /* resume at the registered fixup address */ | ||
547 | return; | ||
548 | } | ||
549 | if (notify_die(DIE_GPF, "general protection fault", regs, | ||
550 | error_code, 13, SIGSEGV) == NOTIFY_STOP) | ||
551 | return; | ||
552 | die("general protection fault", regs, error_code); /* no fixup and no notifier stopped it */ | ||
553 | } | ||
554 | } | ||
555 | |||
556 | static void mem_parity_error(unsigned char reason, struct pt_regs * regs) | ||
557 | { /* Logs a memory-parity NMI and rewrites port 0x61; regs is not used. */ | ||
558 | printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); | ||
559 | printk("You probably have a hardware problem with your RAM chips\n"); | ||
560 | |||
561 | /* Clear and disable the memory parity error line. */ | ||
562 | reason = (reason & 0xf) | 4; /* keep the low four bits, set bit 2 */ | ||
563 | outb(reason, 0x61); | ||
564 | } | ||
565 | |||
566 | static void io_check_error(unsigned char reason, struct pt_regs * regs) | ||
567 | { /* Logs an IOCK NMI, dumps registers, then sets and clears bit 3 of port 0x61 around a delay. */ | ||
568 | printk("NMI: IOCK error (debug interrupt?)\n"); | ||
569 | show_registers(regs); | ||
570 | |||
571 | /* Re-enable the IOCK line, wait for a few seconds */ | ||
572 | reason = (reason & 0xf) | 8; /* keep the low four bits, set bit 3 */ | ||
573 | outb(reason, 0x61); | ||
574 | mdelay(2000); /* 2000 ms */ | ||
575 | reason &= ~8; /* clear bit 3 again */ | ||
576 | outb(reason, 0x61); | ||
577 | } | ||
578 | |||
579 | static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) | ||
580 | { printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); /* log-only handler; regs is not used */ | ||
581 | printk("Dazed and confused, but trying to continue\n"); | ||
582 | printk("Do you have a strange power saving mode enabled?\n"); | ||
583 | } | ||
584 | |||
585 | asmlinkage void default_do_nmi(struct pt_regs *regs) | ||
586 | { /* Classifies an NMI by its reason byte and dispatches to the matching handler. */ | ||
587 | unsigned char reason = 0; | ||
588 | |||
589 | /* Only the BSP gets external NMIs from the system. */ | ||
590 | if (!smp_processor_id()) /* processor id 0 only; every other CPU keeps reason == 0 */ | ||
591 | reason = get_nmi_reason(); | ||
592 | |||
593 | if (!(reason & 0xc0)) { /* neither bit 0x80 nor bit 0x40 is set */ | ||
594 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) | ||
595 | == NOTIFY_STOP) | ||
596 | return; | ||
597 | #ifdef CONFIG_X86_LOCAL_APIC | ||
598 | /* | ||
599 | * Ok, so this is none of the documented NMI sources, | ||
600 | * so it must be the NMI watchdog. | ||
601 | */ | ||
602 | if (nmi_watchdog > 0) { | ||
603 | nmi_watchdog_tick(regs,reason); | ||
604 | return; | ||
605 | } | ||
606 | #endif | ||
607 | unknown_nmi_error(reason, regs); /* reached when the watchdog branch above did not return */ | ||
608 | return; | ||
609 | } | ||
610 | if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP) | ||
611 | return; | ||
612 | |||
613 | /* AK: following checks seem to be broken on modern chipsets. FIXME */ | ||
614 | |||
615 | if (reason & 0x80) /* both bits may be set, in which case both handlers run */ | ||
616 | mem_parity_error(reason, regs); | ||
617 | if (reason & 0x40) | ||
618 | io_check_error(reason, regs); | ||
619 | } | ||
620 | |||
621 | asmlinkage void do_int3(struct pt_regs * regs, long error_code) | ||
622 | { /* Trap 3 handler: offers the event to notify_die(), otherwise forwards it to do_trap() as SIGTRAP. */ | ||
623 | if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { | ||
624 | return; | ||
625 | } | ||
626 | do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); /* NULL: no siginfo_t is supplied */ | ||
627 | return; | ||
628 | } | ||
629 | |||
630 | /* runs on IST stack. */ | ||
631 | asmlinkage void *do_debug(struct pt_regs * regs, unsigned long error_code) | ||
632 | { /* Debug-exception handler; every exit path returns the pt_regs pointer in use at that point. */ | ||
633 | struct pt_regs *pr; | ||
634 | unsigned long condition; | ||
635 | struct task_struct *tsk = current; | ||
636 | siginfo_t info; | ||
637 | |||
638 | pr = (struct pt_regs *)(current->thread.rsp0)-1; /* one pt_regs below thread.rsp0 */ | ||
639 | if (regs->cs & 3) { /* low two bits of cs nonzero: not raised in kernel mode */ | ||
640 | memcpy(pr, regs, sizeof(struct pt_regs)); /* copy the frame to pr and use the copy from here on */ | ||
641 | regs = pr; | ||
642 | } | ||
643 | |||
644 | #ifdef CONFIG_CHECKING | ||
645 | { | ||
646 | /* RED-PEN interaction with debugger - could destroy gs */ | ||
647 | unsigned long gs; | ||
648 | struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); | ||
649 | rdmsrl(MSR_GS_BASE, gs); | ||
650 | if (gs != (unsigned long)pda) { | ||
651 | wrmsrl(MSR_GS_BASE, pda); | ||
652 | printk("debug handler: wrong gs %lx expected %p\n", gs, pda); | ||
653 | } | ||
654 | } | ||
655 | #endif | ||
656 | |||
657 | asm("movq %%db6,%0" : "=r" (condition)); /* condition = contents of debug register 6 */ | ||
658 | |||
659 | if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, | ||
660 | SIGTRAP) == NOTIFY_STOP) { | ||
661 | return regs; | ||
662 | } | ||
663 | conditional_sti(regs); | ||
664 | |||
665 | /* Mask out spurious debug traps due to lazy DR7 setting */ | ||
666 | if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { | ||
667 | if (!tsk->thread.debugreg7) { /* the task has no debugreg7 value recorded */ | ||
668 | goto clear_dr7; | ||
669 | } | ||
670 | } | ||
671 | |||
672 | tsk->thread.debugreg6 = condition; | ||
673 | |||
674 | /* Mask out spurious TF errors due to lazy TF clearing */ | ||
675 | if ((condition & DR_STEP) && | ||
676 | (notify_die(DIE_DEBUGSTEP, "debugstep", regs, condition, | ||
677 | 1, SIGTRAP) != NOTIFY_STOP)) { | ||
678 | /* | ||
679 | * The TF error should be masked out only if the current | ||
680 | * process is not traced and if the TRAP flag has been set | ||
681 | * previously by a tracing process (condition detected by | ||
682 | * the PT_DTRACE flag); remember that the i386 TRAP flag | ||
683 | * can be modified by the process itself in user mode, | ||
684 | * allowing programs to debug themselves without the ptrace() | ||
685 | * interface. | ||
686 | */ | ||
687 | if ((regs->cs & 3) == 0) /* single-step trap taken in kernel mode */ | ||
688 | goto clear_TF_reenable; | ||
689 | if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) /* PT_DTRACE set, PT_PTRACED clear */ | ||
690 | goto clear_TF; | ||
691 | } | ||
692 | |||
693 | /* Ok, finally something we can handle */ | ||
694 | tsk->thread.trap_no = 1; | ||
695 | tsk->thread.error_code = error_code; | ||
696 | info.si_signo = SIGTRAP; | ||
697 | info.si_errno = 0; | ||
698 | info.si_code = TRAP_BRKPT; | ||
699 | if ((regs->cs & 3) == 0) /* kernel mode: no signal is sent */ | ||
700 | goto clear_dr7; | ||
701 | |||
702 | info.si_addr = (void __user *)regs->rip; | ||
703 | force_sig_info(SIGTRAP, &info, tsk); | ||
704 | clear_dr7: | ||
705 | asm volatile("movq %0,%%db7"::"r"(0UL)); /* write 0 to debug register 7 */ | ||
706 | notify_die(DIE_DEBUG, "debug", regs, condition, 1, SIGTRAP); /* return value is ignored here */ | ||
707 | return regs; | ||
708 | |||
709 | clear_TF_reenable: | ||
710 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); /* then falls through into clear_TF */ | ||
711 | |||
712 | clear_TF: | ||
713 | /* RED-PEN could cause spurious errors */ | ||
714 | if (notify_die(DIE_DEBUG, "debug2", regs, condition, 1, SIGTRAP) | ||
715 | != NOTIFY_STOP) | ||
716 | regs->eflags &= ~TF_MASK; /* clear TF_MASK in the saved eflags */ | ||
717 | return regs; | ||
718 | } | ||
719 | |||
720 | static int kernel_math_error(struct pt_regs *regs, char *str) | ||
721 | { /* Returns 1 after redirecting rip to an exception-table fixup; otherwise reports the error and returns 0. */ | ||
722 | const struct exception_table_entry *fixup; | ||
723 | fixup = search_exception_tables(regs->rip); | ||
724 | if (fixup) { | ||
725 | regs->rip = fixup->fixup; | ||
726 | return 1; | ||
727 | } | ||
728 | notify_die(DIE_GPF, str, regs, 0, 16, SIGFPE); /* return value is ignored */ | ||
729 | #if 0 | ||
730 | /* This should be a die, but warn only for now */ | ||
731 | die(str, regs, 0); | ||
732 | #else | ||
733 | printk(KERN_DEBUG "%s: %s at ", current->comm, str); | ||
734 | printk_address(regs->rip); | ||
735 | printk("\n"); | ||
736 | #endif | ||
737 | return 0; /* callers in this file then carry on with their signal delivery path */ | ||
738 | } | ||
739 | |||
740 | /* | ||
741 | * Note that we play around with the 'TS' bit in an attempt to get | ||
742 | * the correct behaviour even in the presence of the asynchronous | ||
743 | * IRQ13 behaviour | ||
744 | */ | ||
745 | asmlinkage void do_coprocessor_error(struct pt_regs *regs) | ||
746 | { /* x87 error handler (trap_no 16): derives an FPE_* si_code from the FPU control/status words and sends SIGFPE. */ | ||
747 | void __user *rip = (void __user *)(regs->rip); | ||
748 | struct task_struct * task; | ||
749 | siginfo_t info; | ||
750 | unsigned short cwd, swd; | ||
751 | |||
752 | conditional_sti(regs); | ||
753 | if ((regs->cs & 3) == 0 && /* kernel mode and a fixup was applied: nothing more to do */ | ||
754 | kernel_math_error(regs, "kernel x87 math error")) | ||
755 | return; | ||
756 | |||
757 | /* | ||
758 | * Save the info for the exception handler and clear the error. | ||
759 | */ | ||
760 | task = current; | ||
761 | save_init_fpu(task); | ||
762 | task->thread.trap_no = 16; | ||
763 | task->thread.error_code = 0; | ||
764 | info.si_signo = SIGFPE; | ||
765 | info.si_errno = 0; | ||
766 | info.si_code = __SI_FAULT; /* default; kept when the switch below matches no specific case */ | ||
767 | info.si_addr = rip; | ||
768 | /* | ||
769 | * (~cwd & swd) will mask out exceptions that are not set to unmasked | ||
770 | * status. 0x3f is the exception bits in these regs, 0x200 is the | ||
771 | * C1 reg you need in case of a stack fault, 0x040 is the stack | ||
772 | * fault bit. We should only be taking one exception at a time, | ||
773 | * so if this combination doesn't produce any single exception, | ||
774 | * then we have a bad program that isn't synchronizing its FPU usage | ||
775 | * and it will suffer the consequences since we won't be able to | ||
776 | * fully reproduce the context of the exception | ||
777 | */ | ||
778 | cwd = get_fpu_cwd(task); | ||
779 | swd = get_fpu_swd(task); | ||
780 | switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) { | ||
781 | case 0x000: | ||
782 | default: /* zero or any unlisted combination keeps si_code == __SI_FAULT */ | ||
783 | break; | ||
784 | case 0x001: /* Invalid Op */ | ||
785 | case 0x041: /* Stack Fault */ | ||
786 | case 0x241: /* Stack Fault | Direction */ | ||
787 | info.si_code = FPE_FLTINV; | ||
788 | break; | ||
789 | case 0x002: /* Denormalize */ | ||
790 | case 0x010: /* Underflow */ | ||
791 | info.si_code = FPE_FLTUND; | ||
792 | break; | ||
793 | case 0x004: /* Zero Divide */ | ||
794 | info.si_code = FPE_FLTDIV; | ||
795 | break; | ||
796 | case 0x008: /* Overflow */ | ||
797 | info.si_code = FPE_FLTOVF; | ||
798 | break; | ||
799 | case 0x020: /* Precision */ | ||
800 | info.si_code = FPE_FLTRES; | ||
801 | break; | ||
802 | } | ||
803 | force_sig_info(SIGFPE, &info, task); | ||
804 | } | ||
805 | |||
806 | asmlinkage void bad_intr(void) | ||
807 | { /* Only logs a message. NOTE(review): the string has no trailing newline or log level - confirm this is intended. */ | ||
808 | printk("bad interrupt"); | ||
809 | } | ||
810 | |||
811 | asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) | ||
812 | { /* SIMD FP error handler (trap_no 19): derives an FPE_* si_code from MXCSR and sends SIGFPE. */ | ||
813 | void __user *rip = (void __user *)(regs->rip); | ||
814 | struct task_struct * task; | ||
815 | siginfo_t info; | ||
816 | unsigned short mxcsr; /* 16 bits wide; both masks used below (0x1f80, 0x3f) fit in 16 bits */ | ||
817 | |||
818 | conditional_sti(regs); | ||
819 | if ((regs->cs & 3) == 0 && /* kernel mode and a fixup was applied: nothing more to do */ | ||
820 | kernel_math_error(regs, "simd math error")) | ||
821 | return; | ||
822 | |||
823 | /* | ||
824 | * Save the info for the exception handler and clear the error. | ||
825 | */ | ||
826 | task = current; | ||
827 | save_init_fpu(task); | ||
828 | task->thread.trap_no = 19; | ||
829 | task->thread.error_code = 0; | ||
830 | info.si_signo = SIGFPE; | ||
831 | info.si_errno = 0; | ||
832 | info.si_code = __SI_FAULT; /* default; kept when the switch below matches no specific case */ | ||
833 | info.si_addr = rip; | ||
834 | /* | ||
835 | * The SIMD FPU exceptions are handled a little differently, as there | ||
836 | * is only a single status/control register. Thus, to determine which | ||
837 | * unmasked exception was caught we must mask the exception mask bits | ||
838 | * at 0x1f80, and then use these to mask the exception bits at 0x3f. | ||
839 | */ | ||
840 | mxcsr = get_fpu_mxcsr(task); | ||
841 | switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { /* mask bits shifted down by 7 line up with the flag bits */ | ||
842 | case 0x000: | ||
843 | default: /* zero or more than one flag keeps si_code == __SI_FAULT */ | ||
844 | break; | ||
845 | case 0x001: /* Invalid Op */ | ||
846 | info.si_code = FPE_FLTINV; | ||
847 | break; | ||
848 | case 0x002: /* Denormalize */ | ||
849 | case 0x010: /* Underflow */ | ||
850 | info.si_code = FPE_FLTUND; | ||
851 | break; | ||
852 | case 0x004: /* Zero Divide */ | ||
853 | info.si_code = FPE_FLTDIV; | ||
854 | break; | ||
855 | case 0x008: /* Overflow */ | ||
856 | info.si_code = FPE_FLTOVF; | ||
857 | break; | ||
858 | case 0x020: /* Precision */ | ||
859 | info.si_code = FPE_FLTRES; | ||
860 | break; | ||
861 | } | ||
862 | force_sig_info(SIGFPE, &info, task); | ||
863 | } | ||
864 | |||
865 | asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) | ||
866 | { /* Empty body: the event is ignored and regs is not used. */ | ||
867 | } | ||
868 | |||
869 | asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) | ||
870 | { /* Empty weak definition: a non-weak smp_thermal_interrupt() elsewhere takes precedence at link time. */ | ||
871 | } | ||
872 | |||
873 | /* | ||
874 | * 'math_state_restore()' saves the current math information in the | ||
875 | * old math state array, and gets the new ones from the current task | ||
876 | * | ||
877 | * Careful.. There are problems with IBM-designed IRQ13 behaviour. | ||
878 | * Don't touch unless you *really* know how it works. | ||
879 | */ | ||
880 | asmlinkage void math_state_restore(void) | ||
881 | { /* NOTE(review): the code visible here only restores state for current; no save step appears in this body. */ | ||
882 | struct task_struct *me = current; | ||
883 | clts(); /* Allow maths ops (or we recurse) */ | ||
884 | |||
885 | if (!used_math()) /* no math state yet: initialise one before restoring */ | ||
886 | init_fpu(me); | ||
887 | restore_fpu_checking(&me->thread.i387.fxsave); /* return value is not checked */ | ||
888 | me->thread_info->status |= TS_USEDFPU; | ||
889 | } | ||
890 | |||
891 | void do_call_debug(struct pt_regs *regs) | ||
892 | { /* Forwards the event to notify_die() as DIE_CALL and does nothing else. */ | ||
893 | notify_die(DIE_CALL, "debug call", regs, 0, 255, SIGINT); /* return value is ignored */ | ||
894 | } | ||
895 | |||
896 | void __init trap_init(void) | ||
897 | { /* Installs the gates for exception vectors 0-19, the IA32 syscall vector and KDB_VECTOR, then calls cpu_init(). */ | ||
898 | set_intr_gate(0,&divide_error); | ||
899 | set_intr_gate_ist(1,&debug,DEBUG_STACK); /* the _ist variant also takes a stack identifier */ | ||
900 | set_intr_gate_ist(2,&nmi,NMI_STACK); | ||
901 | set_system_gate(3,&int3); | ||
902 | set_system_gate(4,&overflow); /* int4-5 can be called from all */ | ||
903 | set_system_gate(5,&bounds); | ||
904 | set_intr_gate(6,&invalid_op); | ||
905 | set_intr_gate(7,&device_not_available); | ||
906 | set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK); | ||
907 | set_intr_gate(9,&coprocessor_segment_overrun); | ||
908 | set_intr_gate(10,&invalid_TSS); | ||
909 | set_intr_gate(11,&segment_not_present); | ||
910 | set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK); | ||
911 | set_intr_gate(13,&general_protection); | ||
912 | set_intr_gate(14,&page_fault); | ||
913 | set_intr_gate(15,&spurious_interrupt_bug); | ||
914 | set_intr_gate(16,&coprocessor_error); | ||
915 | set_intr_gate(17,&alignment_check); | ||
916 | #ifdef CONFIG_X86_MCE | ||
917 | set_intr_gate_ist(18,&machine_check, MCE_STACK); /* vector 18 is installed only with CONFIG_X86_MCE */ | ||
918 | #endif | ||
919 | set_intr_gate(19,&simd_coprocessor_error); | ||
920 | |||
921 | #ifdef CONFIG_IA32_EMULATION | ||
922 | set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); | ||
923 | #endif | ||
924 | |||
925 | set_intr_gate(KDB_VECTOR, call_debug); | ||
926 | |||
927 | /* | ||
928 | * Should be a barrier for any external CPU state. | ||
929 | */ | ||
930 | cpu_init(); | ||
931 | } | ||
932 | |||
933 | |||
934 | /* Actual parsing is done early in setup.c. */ | ||
935 | static int __init oops_dummy(char *s) | ||
936 | { /* Handler registered for "oops=": ignores s and always sets panic_on_oops. */ | ||
937 | panic_on_oops = 1; | ||
938 | return -1; | ||
939 | } | ||
940 | __setup("oops=", oops_dummy); | ||
941 | |||
942 | static int __init kstack_setup(char *s) | ||
943 | { /* Handler registered for "kstack=": stores the parsed number in kstack_depth_to_print. */ | ||
944 | kstack_depth_to_print = simple_strtoul(s,NULL,0); /* base 0: radix is taken from the string's prefix; no end-pointer check */ | ||
945 | return 0; | ||
946 | } | ||
947 | __setup("kstack=", kstack_setup); | ||
948 | |||
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S new file mode 100644 index 000000000000..59ebd5beda87 --- /dev/null +++ b/arch/x86_64/kernel/vmlinux.lds.S | |||
@@ -0,0 +1,164 @@ | |||
1 | /* ld script to make x86-64 Linux kernel | ||
2 | * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; | ||
3 | */ | ||
4 | |||
5 | #include <asm-generic/vmlinux.lds.h> | ||
6 | #include <linux/config.h> | ||
7 | |||
8 | OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") | ||
9 | OUTPUT_ARCH(i386:x86-64) | ||
10 | ENTRY(phys_startup_64) | ||
11 | jiffies_64 = jiffies; /* jiffies_64 gets the same value as the jiffies symbol assigned further down */ | ||
12 | SECTIONS | ||
13 | { | ||
14 | . = 0xffffffff80100000; /* address at which the first output section is placed */ | ||
15 | phys_startup_64 = startup_64 - LOAD_OFFSET; /* entry symbol: startup_64 with LOAD_OFFSET subtracted */ | ||
16 | _text = .; /* Text and read-only data */ | ||
17 | .text : { | ||
18 | *(.text) | ||
19 | SCHED_TEXT | ||
20 | LOCK_TEXT | ||
21 | *(.fixup) | ||
22 | *(.gnu.warning) | ||
23 | } = 0x9090 /* fill pattern for unused space inside .text */ | ||
24 | .text.lock : { *(.text.lock) } /* out-of-line lock text */ | ||
25 | |||
26 | _etext = .; /* End of text section */ | ||
27 | |||
28 | . = ALIGN(16); /* Exception table */ | ||
29 | __start___ex_table = .; | ||
30 | __ex_table : { *(__ex_table) } | ||
31 | __stop___ex_table = .; | ||
32 | |||
33 | RODATA | ||
34 | |||
35 | .data : { /* Data */ | ||
36 | *(.data) | ||
37 | CONSTRUCTORS | ||
38 | } | ||
39 | |||
40 | _edata = .; /* End of data section */ | ||
41 | |||
42 | __bss_start = .; /* BSS */ | ||
43 | .bss : { | ||
44 | *(.bss.page_aligned) | ||
45 | *(.bss) | ||
46 | } | ||
47 | __bss_end = .; | ||
48 | |||
49 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
50 | .data.cacheline_aligned : { *(.data.cacheline_aligned) } | ||
51 | |||
52 | #define AFTER(x) BINALIGN(LOADADDR(x) + SIZEOF(x), 16) /* load address just past section x, rounded up to 16 */ | ||
53 | #define BINALIGN(x,y) (((x) + (y) - 1) & ~((y) - 1)) /* round x up to a multiple of y (y a power of two) */ | ||
54 | #define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES) | ||
55 | |||
56 | .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) } /* linked at -10MB; loaded at the next 4096-byte boundary after .data.cacheline_aligned */ | ||
57 | __vsyscall_0 = LOADADDR(.vsyscall_0); | ||
58 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
59 | .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) } /* from here on each section is loaded right after the previous one */ | ||
60 | xtime_lock = LOADADDR(.xtime_lock); /* each plain symbol below is set to the load address of its section */ | ||
61 | .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) } | ||
62 | vxtime = LOADADDR(.vxtime); | ||
63 | .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) } | ||
64 | wall_jiffies = LOADADDR(.wall_jiffies); | ||
65 | .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) } | ||
66 | sys_tz = LOADADDR(.sys_tz); | ||
67 | .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) } | ||
68 | sysctl_vsyscall = LOADADDR(.sysctl_vsyscall); | ||
69 | .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) } | ||
70 | xtime = LOADADDR(.xtime); | ||
71 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
72 | .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) } | ||
73 | jiffies = LOADADDR(.jiffies); | ||
74 | .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) } /* 1024 bytes after .vsyscall_0 in both link and load address */ | ||
75 | . = LOADADDR(.vsyscall_0) + 4096; /* continue 4096 bytes past the load address of .vsyscall_0 */ | ||
76 | |||
77 | . = ALIGN(8192); /* init_task */ | ||
78 | .data.init_task : { *(.data.init_task) } | ||
79 | |||
80 | . = ALIGN(4096); | ||
81 | .data.page_aligned : { *(.data.page_aligned) } | ||
82 | |||
83 | . = ALIGN(4096); /* Init code and data */ | ||
84 | __init_begin = .; | ||
85 | .init.text : { | ||
86 | _sinittext = .; | ||
87 | *(.init.text) | ||
88 | _einittext = .; | ||
89 | } | ||
90 | __initdata_begin = .; | ||
91 | .init.data : { *(.init.data) } | ||
92 | __initdata_end = .; | ||
93 | . = ALIGN(16); | ||
94 | __setup_start = .; | ||
95 | .init.setup : { *(.init.setup) } | ||
96 | __setup_end = .; | ||
97 | __initcall_start = .; | ||
98 | .initcall.init : { | ||
99 | *(.initcall1.init) | ||
100 | *(.initcall2.init) | ||
101 | *(.initcall3.init) | ||
102 | *(.initcall4.init) | ||
103 | *(.initcall5.init) | ||
104 | *(.initcall6.init) | ||
105 | *(.initcall7.init) | ||
106 | } | ||
107 | __initcall_end = .; | ||
108 | __con_initcall_start = .; | ||
109 | .con_initcall.init : { *(.con_initcall.init) } | ||
110 | __con_initcall_end = .; | ||
111 | SECURITY_INIT | ||
112 | . = ALIGN(8); | ||
113 | __alt_instructions = .; | ||
114 | .altinstructions : { *(.altinstructions) } | ||
115 | __alt_instructions_end = .; | ||
116 | .altinstr_replacement : { *(.altinstr_replacement) } | ||
117 | /* .exit.text is discarded at runtime, not link time, to deal with references | ||
118 | from .altinstructions and .eh_frame */ | ||
119 | .exit.text : { *(.exit.text) } | ||
120 | .exit.data : { *(.exit.data) } | ||
121 | . = ALIGN(4096); | ||
122 | __initramfs_start = .; | ||
123 | .init.ramfs : { *(.init.ramfs) } | ||
124 | __initramfs_end = .; | ||
125 | . = ALIGN(32); | ||
126 | __per_cpu_start = .; | ||
127 | .data.percpu : { *(.data.percpu) } | ||
128 | __per_cpu_end = .; | ||
129 | . = ALIGN(4096); | ||
130 | __init_end = .; | ||
131 | |||
132 | . = ALIGN(4096); | ||
133 | __nosave_begin = .; | ||
134 | .data_nosave : { *(.data.nosave) } | ||
135 | . = ALIGN(4096); | ||
136 | __nosave_end = .; | ||
137 | |||
138 | _end = . ; | ||
139 | |||
140 | /* Sections to be discarded */ | ||
141 | /DISCARD/ : { | ||
142 | *(.exitcall.exit) | ||
143 | #ifndef CONFIG_DEBUG_INFO | ||
144 | *(.eh_frame) | ||
145 | #endif | ||
146 | } | ||
147 | |||
148 | /* DWARF 2 */ | ||
149 | .debug_info 0 : { *(.debug_info) } | ||
150 | .debug_abbrev 0 : { *(.debug_abbrev) } | ||
151 | .debug_line 0 : { *(.debug_line) } | ||
152 | .debug_frame 0 : { *(.debug_frame) } | ||
153 | .debug_str 0 : { *(.debug_str) } | ||
154 | .debug_loc 0 : { *(.debug_loc) } | ||
155 | .debug_macinfo 0 : { *(.debug_macinfo) } | ||
156 | /* SGI/MIPS DWARF 2 extensions */ | ||
157 | .debug_weaknames 0 : { *(.debug_weaknames) } | ||
158 | .debug_funcnames 0 : { *(.debug_funcnames) } | ||
159 | .debug_typenames 0 : { *(.debug_typenames) } | ||
160 | .debug_varnames 0 : { *(.debug_varnames) } | ||
161 | |||
162 | |||
163 | .comment 0 : { *(.comment) } | ||
164 | } | ||
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c new file mode 100644 index 000000000000..b4b8dc59663a --- /dev/null +++ b/arch/x86_64/kernel/vsyscall.c | |||
@@ -0,0 +1,225 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/vsyscall.c | ||
3 | * | ||
4 | * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE | ||
5 | * Copyright 2003 Andi Kleen, SuSE Labs. | ||
6 | * | ||
7 | * Thanks to hpa@transmeta.com for some useful hint. | ||
8 | * Special thanks to Ingo Molnar for his early experience with | ||
9 | * a different vsyscall implementation for Linux/IA32 and for the name. | ||
10 | * | ||
11 | * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located | ||
12 | * at virtual address -10Mbyte+1024bytes etc... There are at most 4 | ||
13 | * vsyscalls. One vsyscall can reserve more than 1 slot to avoid | ||
14 | * jumping out of line if necessary. We cannot add more with this | ||
15 | * mechanism because older kernels won't return -ENOSYS. | ||
16 | * If we want more than four we need a vDSO. | ||
17 | * | ||
18 | * Note: the concept clashes with user mode linux. If you use UML and | ||
19 | * want per guest time just set the kernel.vsyscall64 sysctl to 0. | ||
20 | */ | ||
21 | |||
22 | #include <linux/time.h> | ||
23 | #include <linux/init.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/timer.h> | ||
26 | #include <linux/seqlock.h> | ||
27 | #include <linux/jiffies.h> | ||
28 | #include <linux/sysctl.h> | ||
29 | |||
30 | #include <asm/vsyscall.h> | ||
31 | #include <asm/pgtable.h> | ||
32 | #include <asm/page.h> | ||
33 | #include <asm/fixmap.h> | ||
34 | #include <asm/errno.h> | ||
35 | #include <asm/io.h> | ||
36 | |||
37 | #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) /* places the function in section .vsyscall_<nr> */ | ||
38 | #define force_inline __attribute__((always_inline)) inline | ||
39 | |||
40 | int __sysctl_vsyscall __section_sysctl_vsyscall = 1; /* nonzero: the vsyscalls below take their fast path */ | ||
41 | seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; /* sampled by do_vgettimeofday() around its time reads */ | ||
42 | |||
43 | #include <asm/unistd.h> | ||
44 | |||
45 | static force_inline void timeval_normalize(struct timeval * tv) | ||
46 | { /* Carries whole seconds out of tv->tv_usec into tv->tv_sec. */ | ||
47 | time_t __sec; | ||
48 | |||
49 | __sec = tv->tv_usec / 1000000; /* whole seconds contained in the microsecond field */ | ||
50 | if (__sec) { | ||
51 | tv->tv_usec %= 1000000; | ||
52 | tv->tv_sec += __sec; | ||
53 | } | ||
54 | } | ||
55 | |||
56 | static force_inline void do_vgettimeofday(struct timeval * tv) | ||
57 | { /* Fills *tv from __xtime, the jiffies delta and a TSC or HPET delta, retrying while __xtime_lock reports a change. */ | ||
58 | long sequence, t; | ||
59 | unsigned long sec, usec; | ||
60 | |||
61 | do { | ||
62 | sequence = read_seqbegin(&__xtime_lock); | ||
63 | |||
64 | sec = __xtime.tv_sec; | ||
65 | usec = (__xtime.tv_nsec / 1000) + /* nanoseconds to microseconds */ | ||
66 | (__jiffies - __wall_jiffies) * (1000000 / HZ); /* plus microseconds for the jiffies difference */ | ||
67 | |||
68 | if (__vxtime.mode == VXTIME_TSC) { | ||
69 | sync_core(); | ||
70 | rdtscll(t); | ||
71 | if (t < __vxtime.last_tsc) /* clamp so the delta below is never negative */ | ||
72 | t = __vxtime.last_tsc; | ||
73 | usec += ((t - __vxtime.last_tsc) * | ||
74 | __vxtime.tsc_quot) >> 32; /* scale the tick delta by tsc_quot, then drop the low 32 bits */ | ||
75 | /* See comment in x86_64 do_gettimeofday. */ | ||
76 | } else { | ||
77 | usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - /* 32-bit read at offset 0xf0 of the VSYSCALL_HPET fixmap */ | ||
78 | __vxtime.last) * __vxtime.quot) >> 32; | ||
79 | } | ||
80 | } while (read_seqretry(&__xtime_lock, sequence)); | ||
81 | |||
82 | tv->tv_sec = sec + usec / 1000000; /* usec may exceed one second; carry it into tv_sec */ | ||
83 | tv->tv_usec = usec % 1000000; | ||
84 | } | ||
85 | |||
86 | /* RED-PEN may want to re-add seq locking, but then the variable should be write-once. */ | ||
87 | static force_inline void do_get_tz(struct timezone * tz) | ||
88 | { /* Copies __sys_tz into *tz by plain struct assignment, without taking __xtime_lock. */ | ||
89 | *tz = __sys_tz; | ||
90 | } | ||
91 | |||
92 | static force_inline int gettimeofday(struct timeval *tv, struct timezone *tz) | ||
93 | { /* Issues the __NR_gettimeofday system call directly; the instruction carries the label vsysc2. */ | ||
94 | int ret; | ||
95 | asm volatile("vsysc2: syscall" | ||
96 | : "=a" (ret) | ||
97 | : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); /* number in rax, tv in rdi, tz in rsi */ | ||
98 | return ret; | ||
99 | } | ||
100 | |||
101 | static force_inline long time_syscall(long *t) | ||
102 | { /* Issues the __NR_time system call directly; the instruction carries the label vsysc1. */ | ||
103 | long secs; | ||
104 | asm volatile("vsysc1: syscall" | ||
105 | : "=a" (secs) | ||
106 | : "0" (__NR_time),"D" (t) : __syscall_clobber); /* number in rax, t in rdi */ | ||
107 | return secs; | ||
108 | } | ||
109 | |||
110 | static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) | ||
111 | { /* Slot-0 vsyscall: fills whichever of tv and tz is non-NULL and returns 0, or falls back to the real system call. */ | ||
112 | if (unlikely(!__sysctl_vsyscall)) /* fast path switched off: use the system call */ | ||
113 | return gettimeofday(tv,tz); | ||
114 | if (tv) | ||
115 | do_vgettimeofday(tv); | ||
116 | if (tz) | ||
117 | do_get_tz(tz); | ||
118 | return 0; | ||
119 | } | ||
120 | |||
121 | /* This will break when the xtime seconds get inaccurate, but that is | ||
122 | * unlikely */ | ||
123 | static time_t __vsyscall(1) vtime(time_t *t) | ||
124 | { /* Slot-1 vsyscall: returns __xtime.tv_sec and also stores it through t when t is non-NULL. */ | ||
125 | if (unlikely(!__sysctl_vsyscall)) /* fast path switched off: use the system call */ | ||
126 | return time_syscall(t); | ||
127 | else if (t) | ||
128 | *t = __xtime.tv_sec; | ||
129 | return __xtime.tv_sec; /* NOTE(review): read separately from the store above, with no __xtime_lock sampling */ | ||
130 | } | ||
131 | |||
132 | static long __vsyscall(2) venosys_0(void) | ||
133 | { /* Placeholder for vsyscall slot 2: always fails with -ENOSYS. */ | ||
134 | return -ENOSYS; | ||
135 | } | ||
136 | |||
137 | static long __vsyscall(3) venosys_1(void) | ||
138 | { /* Placeholder for vsyscall slot 3: always fails with -ENOSYS. */ | ||
139 | return -ENOSYS; | ||
140 | } | ||
141 | |||
142 | #ifdef CONFIG_SYSCTL | ||
143 | |||
144 | #define SYSCALL 0x050f /* 16-bit value stored over vsysc1/vsysc2 when sysctl_vsyscall is 0 */ | ||
145 | #define NOP2 0x9090 /* 16-bit value stored over vsysc1/vsysc2 when sysctl_vsyscall is nonzero */ | ||
146 | |||
147 | /* | ||
148 | * NOP out syscall in vsyscall page when not needed. | ||
149 | */ | ||
150 | static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, | ||
151 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
152 | { /* proc handler for vsyscall64: after a write it rewrites the two bytes at each of vsysc1 and vsysc2. */ | ||
153 | extern u16 vsysc1, vsysc2; /* asm labels defined in time_syscall() and gettimeofday() */ | ||
154 | u16 *map1, *map2; | ||
155 | int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); | ||
156 | if (!write) /* reads need no patching */ | ||
157 | return ret; | ||
158 | /* gcc has some trouble with __va(__pa()), so just do it this | ||
159 | way. */ | ||
160 | map1 = ioremap(__pa_symbol(&vsysc1), 2); /* NOTE(review): patching goes ahead even when ret reports an error - confirm intended */ | ||
161 | if (!map1) | ||
162 | return -ENOMEM; | ||
163 | map2 = ioremap(__pa_symbol(&vsysc2), 2); | ||
164 | if (!map2) { | ||
165 | ret = -ENOMEM; | ||
166 | goto out; /* map1 still has to be unmapped */ | ||
167 | } | ||
168 | if (!sysctl_vsyscall) { /* fast path off: the syscall instructions must be present */ | ||
169 | *map1 = SYSCALL; | ||
170 | *map2 = SYSCALL; | ||
171 | } else { | ||
172 | *map1 = NOP2; | ||
173 | *map2 = NOP2; | ||
174 | } | ||
175 | iounmap(map2); | ||
176 | out: | ||
177 | iounmap(map1); | ||
178 | return ret; | ||
179 | } | ||
180 | |||
181 | static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, | ||
182 | void __user *oldval, size_t __user *oldlenp, | ||
183 | void __user *newval, size_t newlen, | ||
184 | void **context) | ||
185 | { /* Strategy routine for vsyscall64: ignores every argument and always fails with -ENOSYS. */ | ||
186 | return -ENOSYS; | ||
187 | } | ||
188 | |||
189 | static ctl_table kernel_table2[] = { /* single entry "vsyscall64" backed by the int sysctl_vsyscall */ | ||
190 | { .ctl_name = 99, .procname = "vsyscall64", | ||
191 | .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644, | ||
192 | .strategy = vsyscall_sysctl_nostrat, | ||
193 | .proc_handler = vsyscall_sysctl_change }, | ||
194 | { 0, } /* zeroed terminator entry */ | ||
195 | }; | ||
196 | |||
197 | static ctl_table kernel_root_table2[] = { /* "kernel" directory entry whose child table is kernel_table2 */ | ||
198 | { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, | ||
199 | .child = kernel_table2 }, | ||
200 | { 0 }, /* zeroed terminator entry */ | ||
201 | }; | ||
202 | |||
203 | #endif | ||
204 | |||
205 | static void __init map_vsyscall(void) | ||
206 | { /* Maps the physical page at symbol __vsyscall_0 into fixmap slot VSYSCALL_FIRST_PAGE. */ | ||
207 | extern char __vsyscall_0; | ||
208 | unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); | ||
209 | |||
210 | __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); | ||
211 | } | ||
212 | |||
213 | static int __init vsyscall_init(void) | ||
214 | { /* Initcall: checks the vsyscall addresses, maps the page, enables the fast path and registers the sysctl table. */ | ||
215 | BUG_ON(((unsigned long) &vgettimeofday != | ||
216 | VSYSCALL_ADDR(__NR_vgettimeofday))); /* each function must sit at the address of its slot */ | ||
217 | BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); | ||
218 | BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); /* slot 0 must coincide with the fixmap address */ | ||
219 | map_vsyscall(); | ||
220 | sysctl_vsyscall = 1; | ||
221 | register_sysctl_table(kernel_root_table2, 0); /* return value is not checked */ | ||
222 | return 0; | ||
223 | } | ||
224 | |||
225 | __initcall(vsyscall_init); | ||
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c new file mode 100644 index 000000000000..88626e626886 --- /dev/null +++ b/arch/x86_64/kernel/x8664_ksyms.c | |||
@@ -0,0 +1,221 @@ | |||
1 | #include <linux/config.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/smp.h> | ||
4 | #include <linux/user.h> | ||
5 | #include <linux/sched.h> | ||
6 | #include <linux/in6.h> | ||
7 | #include <linux/interrupt.h> | ||
8 | #include <linux/smp_lock.h> | ||
9 | #include <linux/pm.h> | ||
10 | #include <linux/pci.h> | ||
11 | #include <linux/apm_bios.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/syscalls.h> | ||
15 | #include <linux/tty.h> | ||
16 | #include <linux/ioctl32.h> | ||
17 | |||
18 | #include <asm/semaphore.h> | ||
19 | #include <asm/processor.h> | ||
20 | #include <asm/i387.h> | ||
21 | #include <asm/uaccess.h> | ||
22 | #include <asm/checksum.h> | ||
23 | #include <asm/io.h> | ||
24 | #include <asm/delay.h> | ||
25 | #include <asm/irq.h> | ||
26 | #include <asm/mmx.h> | ||
27 | #include <asm/desc.h> | ||
28 | #include <asm/pgtable.h> | ||
29 | #include <asm/pgalloc.h> | ||
30 | #include <asm/nmi.h> | ||
31 | #include <asm/kdebug.h> | ||
32 | #include <asm/unistd.h> | ||
33 | #include <asm/tlbflush.h> | ||
34 | #include <asm/kdebug.h> | ||
35 | |||
36 | extern spinlock_t rtc_lock; | ||
37 | |||
38 | #ifdef CONFIG_SMP | ||
39 | extern void __write_lock_failed(rwlock_t *rw); | ||
40 | extern void __read_lock_failed(rwlock_t *rw); | ||
41 | #endif | ||
42 | |||
43 | #if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) | ||
44 | extern struct drive_info_struct drive_info; | ||
45 | EXPORT_SYMBOL(drive_info); | ||
46 | #endif | ||
47 | |||
48 | extern unsigned long get_cmos_time(void); | ||
49 | |||
50 | /* platform dependent support */ | ||
51 | EXPORT_SYMBOL(boot_cpu_data); | ||
52 | //EXPORT_SYMBOL(dump_fpu); | ||
53 | EXPORT_SYMBOL(__ioremap); | ||
54 | EXPORT_SYMBOL(ioremap_nocache); | ||
55 | EXPORT_SYMBOL(iounmap); | ||
56 | EXPORT_SYMBOL(enable_irq); | ||
57 | EXPORT_SYMBOL(disable_irq); | ||
58 | EXPORT_SYMBOL(disable_irq_nosync); | ||
59 | EXPORT_SYMBOL(probe_irq_mask); | ||
60 | EXPORT_SYMBOL(kernel_thread); | ||
61 | EXPORT_SYMBOL(pm_idle); | ||
62 | EXPORT_SYMBOL(pm_power_off); | ||
63 | EXPORT_SYMBOL(get_cmos_time); | ||
64 | |||
65 | EXPORT_SYMBOL(__down_failed); | ||
66 | EXPORT_SYMBOL(__down_failed_interruptible); | ||
67 | EXPORT_SYMBOL(__down_failed_trylock); | ||
68 | EXPORT_SYMBOL(__up_wakeup); | ||
69 | /* Networking helper routines. */ | ||
70 | EXPORT_SYMBOL(csum_partial_copy_nocheck); | ||
71 | EXPORT_SYMBOL(ip_compute_csum); | ||
72 | /* Delay loops */ | ||
73 | EXPORT_SYMBOL(__udelay); | ||
74 | EXPORT_SYMBOL(__ndelay); | ||
75 | EXPORT_SYMBOL(__delay); | ||
76 | EXPORT_SYMBOL(__const_udelay); | ||
77 | |||
78 | EXPORT_SYMBOL(__get_user_1); | ||
79 | EXPORT_SYMBOL(__get_user_2); | ||
80 | EXPORT_SYMBOL(__get_user_4); | ||
81 | EXPORT_SYMBOL(__get_user_8); | ||
82 | EXPORT_SYMBOL(__put_user_1); | ||
83 | EXPORT_SYMBOL(__put_user_2); | ||
84 | EXPORT_SYMBOL(__put_user_4); | ||
85 | EXPORT_SYMBOL(__put_user_8); | ||
86 | |||
87 | EXPORT_SYMBOL(strpbrk); | ||
88 | EXPORT_SYMBOL(strstr); | ||
89 | |||
90 | EXPORT_SYMBOL(strncpy_from_user); | ||
91 | EXPORT_SYMBOL(__strncpy_from_user); | ||
92 | EXPORT_SYMBOL(clear_user); | ||
93 | EXPORT_SYMBOL(__clear_user); | ||
94 | EXPORT_SYMBOL(copy_user_generic); | ||
95 | EXPORT_SYMBOL(copy_from_user); | ||
96 | EXPORT_SYMBOL(copy_to_user); | ||
97 | EXPORT_SYMBOL(copy_in_user); | ||
98 | EXPORT_SYMBOL(strnlen_user); | ||
99 | |||
100 | #ifdef CONFIG_PCI | ||
101 | EXPORT_SYMBOL(pci_alloc_consistent); | ||
102 | EXPORT_SYMBOL(pci_free_consistent); | ||
103 | #endif | ||
104 | |||
105 | #ifdef CONFIG_PCI | ||
106 | EXPORT_SYMBOL(pci_mem_start); | ||
107 | #endif | ||
108 | |||
109 | EXPORT_SYMBOL(copy_page); | ||
110 | EXPORT_SYMBOL(clear_page); | ||
111 | |||
112 | EXPORT_SYMBOL(cpu_pda); | ||
113 | #ifdef CONFIG_SMP | ||
114 | EXPORT_SYMBOL(cpu_data); | ||
115 | EXPORT_SYMBOL(cpu_online_map); | ||
116 | EXPORT_SYMBOL(__write_lock_failed); | ||
117 | EXPORT_SYMBOL(__read_lock_failed); | ||
118 | |||
119 | EXPORT_SYMBOL(synchronize_irq); | ||
120 | EXPORT_SYMBOL(smp_call_function); | ||
121 | EXPORT_SYMBOL(cpu_callout_map); | ||
122 | #endif | ||
123 | |||
124 | #ifdef CONFIG_VT | ||
125 | EXPORT_SYMBOL(screen_info); | ||
126 | #endif | ||
127 | |||
128 | EXPORT_SYMBOL(get_wchan); | ||
129 | |||
130 | EXPORT_SYMBOL(rtc_lock); | ||
131 | |||
132 | EXPORT_SYMBOL_GPL(set_nmi_callback); | ||
133 | EXPORT_SYMBOL_GPL(unset_nmi_callback); | ||
134 | |||
135 | /* Export string functions. We normally rely on gcc builtin for most of these, | ||
136 | but gcc sometimes decides not to inline them. */ | ||
137 | #undef memcpy | ||
138 | #undef memset | ||
139 | #undef memmove | ||
140 | #undef memchr | ||
141 | #undef strlen | ||
142 | #undef strcpy | ||
143 | #undef strncmp | ||
144 | #undef strncpy | ||
145 | #undef strchr | ||
146 | #undef strcmp | ||
147 | #undef strcpy | ||
148 | #undef strcat | ||
149 | #undef memcmp | ||
150 | |||
151 | extern void * memset(void *,int,__kernel_size_t); | ||
152 | extern size_t strlen(const char *); | ||
153 | extern void * memmove(void * dest,const void *src,size_t count); | ||
154 | extern char * strcpy(char * dest,const char *src); | ||
155 | extern int strcmp(const char * cs,const char * ct); | ||
156 | extern void *memchr(const void *s, int c, size_t n); | ||
157 | extern void * memcpy(void *,const void *,__kernel_size_t); | ||
158 | extern void * __memcpy(void *,const void *,__kernel_size_t); | ||
159 | extern char * strcat(char *, const char *); | ||
160 | extern int memcmp(const void * cs,const void * ct,size_t count); | ||
161 | |||
162 | EXPORT_SYMBOL(memset); | ||
163 | EXPORT_SYMBOL(strlen); | ||
164 | EXPORT_SYMBOL(memmove); | ||
165 | EXPORT_SYMBOL(strcpy); | ||
166 | EXPORT_SYMBOL(strncmp); | ||
167 | EXPORT_SYMBOL(strncpy); | ||
168 | EXPORT_SYMBOL(strchr); | ||
169 | EXPORT_SYMBOL(strcmp); | ||
170 | EXPORT_SYMBOL(strcat); | ||
171 | EXPORT_SYMBOL(strncat); | ||
172 | EXPORT_SYMBOL(memchr); | ||
173 | EXPORT_SYMBOL(strrchr); | ||
174 | EXPORT_SYMBOL(strnlen); | ||
175 | EXPORT_SYMBOL(memscan); | ||
176 | EXPORT_SYMBOL(memcpy); | ||
177 | EXPORT_SYMBOL(__memcpy); | ||
178 | EXPORT_SYMBOL(memcmp); | ||
179 | |||
180 | #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM | ||
181 | /* prototypes are wrong, these are assembly with custom calling conventions */ | ||
182 | extern void rwsem_down_read_failed_thunk(void); | ||
183 | extern void rwsem_wake_thunk(void); | ||
184 | extern void rwsem_downgrade_thunk(void); | ||
185 | extern void rwsem_down_write_failed_thunk(void); | ||
186 | EXPORT_SYMBOL(rwsem_down_read_failed_thunk); | ||
187 | EXPORT_SYMBOL(rwsem_wake_thunk); | ||
188 | EXPORT_SYMBOL(rwsem_downgrade_thunk); | ||
189 | EXPORT_SYMBOL(rwsem_down_write_failed_thunk); | ||
190 | #endif | ||
191 | |||
192 | EXPORT_SYMBOL(empty_zero_page); | ||
193 | |||
194 | #ifdef CONFIG_HAVE_DEC_LOCK | ||
195 | EXPORT_SYMBOL(_atomic_dec_and_lock); | ||
196 | #endif | ||
197 | |||
198 | EXPORT_SYMBOL(die_chain); | ||
199 | EXPORT_SYMBOL(register_die_notifier); | ||
200 | |||
201 | #ifdef CONFIG_SMP | ||
202 | EXPORT_SYMBOL(cpu_sibling_map); | ||
203 | EXPORT_SYMBOL(smp_num_siblings); | ||
204 | #endif | ||
205 | |||
206 | extern void do_softirq_thunk(void); | ||
207 | EXPORT_SYMBOL(do_softirq_thunk); | ||
208 | |||
209 | void out_of_line_bug(void); | ||
210 | EXPORT_SYMBOL(out_of_line_bug); | ||
211 | |||
212 | EXPORT_SYMBOL(init_level4_pgt); | ||
213 | |||
214 | extern unsigned long __supported_pte_mask; | ||
215 | EXPORT_SYMBOL(__supported_pte_mask); | ||
216 | |||
217 | #ifdef CONFIG_SMP | ||
218 | EXPORT_SYMBOL(flush_tlb_page); | ||
219 | #endif | ||
220 | |||
221 | EXPORT_SYMBOL(cpu_khz); | ||
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile new file mode 100644 index 000000000000..6b26a1c1e9ff --- /dev/null +++ b/arch/x86_64/lib/Makefile | |||
@@ -0,0 +1,14 @@ | |||
1 | # | ||
2 | # Makefile for x86_64-specific library files. | ||
3 | # | ||
4 | |||
5 | CFLAGS_csum-partial.o := -funroll-loops | ||
6 | |||
7 | obj-y := io.o | ||
8 | |||
9 | lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ | ||
10 | usercopy.o getuser.o putuser.o \ | ||
11 | thunk.o clear_page.o copy_page.o bitstr.o bitops.o | ||
12 | lib-y += memcpy.o memmove.o memset.o copy_user.o | ||
13 | |||
14 | lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o | ||
diff --git a/arch/x86_64/lib/bitops.c b/arch/x86_64/lib/bitops.c new file mode 100644 index 000000000000..a29fb75b33ac --- /dev/null +++ b/arch/x86_64/lib/bitops.c | |||
@@ -0,0 +1,141 @@ | |||
1 | #include <linux/bitops.h> | ||
2 | |||
3 | #undef find_first_zero_bit | ||
4 | #undef find_next_zero_bit | ||
5 | #undef find_first_bit | ||
6 | #undef find_next_bit | ||
7 | |||
8 | /** | ||
9 | * find_first_zero_bit - find the first zero bit in a memory region | ||
10 | * @addr: The address to start the search at | ||
11 | * @size: The maximum size to search, in bits; whole 64-bit words are scanned | ||
12 | * | ||
13 | * Returns the bit-number of the first zero bit, not the number of the byte | ||
14 | * containing a bit. If no zero bit is found, @size rounded up to a multiple of 64 is returned. | ||
15 | */ | ||
16 | inline long find_first_zero_bit(const unsigned long * addr, unsigned long size) | ||
17 | { | ||
18 | long d0, d1, d2; | ||
19 | long res; | ||
20 | size = (size + 63) >> 6; /* test the size in words, not bits: a size in -63..-1 also rounds to 0 words */ | ||
21 | if (!size) /* with 0 words no scasq runs and the je below would read stale flags */ | ||
22 | return 0; | ||
23 | asm volatile( | ||
24 | " repe; scasq\n" /* skip words equal to rax, i.e. all ones */ | ||
25 | " je 1f\n" /* every word was all ones: rdx stays 0 */ | ||
26 | " xorq -8(%%rdi),%%rax\n" /* rax = ~word, so its zero bits become set bits */ | ||
27 | " subq $8,%%rdi\n" /* back up to the word that stopped the scan */ | ||
28 | " bsfq %%rax,%%rdx\n" /* index of the lowest zero bit in that word */ | ||
29 | "1: subq %[addr],%%rdi\n" /* byte offset of that word */ | ||
30 | " shlq $3,%%rdi\n" /* bytes to bits */ | ||
31 | " addq %%rdi,%%rdx" | ||
32 | :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2) | ||
33 | :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL), | ||
34 | [addr] "r" (addr) : "memory"); | ||
35 | return res; | ||
36 | } | ||
37 | |||
38 | /** | ||
39 | * find_next_zero_bit - find the next zero bit at or after @offset in a memory region | ||
40 | * @addr: The address to base the search on | ||
41 | * @size: The maximum size to search | ||
42 | * @offset: The bitnumber to start searching at | ||
43 | */ | ||
44 | long find_next_zero_bit (const unsigned long * addr, long size, long offset) | ||
45 | { | ||
46 | unsigned long * p = ((unsigned long *) addr) + (offset >> 6); /* 64-bit word that holds bit @offset */ | ||
47 | unsigned long set = 0; | ||
48 | unsigned long res, bit = offset&63; /* bit: position of @offset inside *p */ | ||
49 | |||
50 | if (bit) { | ||
51 | /* | ||
52 | * Look for zero in first word: shift bit @offset down to bit 0 and invert, so that bsfq (which finds set bits) finds zeros | ||
53 | */ | ||
54 | asm("bsfq %1,%0\n\t" | ||
55 | "cmoveq %2,%0" | ||
56 | : "=r" (set) | ||
57 | : "r" (~(*p >> bit)), "r"(64L)); | ||
58 | if (set < (64 - bit)) /* hits at 64 - bit or above come from the shifted-in top bits, not from the bitmap */ | ||
59 | return set + offset; | ||
60 | set = 64 - bit; /* bits of the first word that were checked */ | ||
61 | p++; | ||
62 | } | ||
63 | /* | ||
64 | * No zero yet, search remaining full words for a zero | ||
65 | */ | ||
66 | res = find_first_zero_bit ((const unsigned long *)p, | ||
67 | size - 64 * (p - (unsigned long *) addr)); /* bits left from p on; <= 0 when @offset was in the last word */ | ||
68 | return (offset + set + res); | ||
69 | } | ||
70 | |||
71 | static inline long | ||
72 | __find_first_bit(const unsigned long * addr, unsigned long size) | ||
73 | { /* First set bit in the size bits at addr; size rounded up to a multiple of 64 if no bit is set. */ | ||
74 | long d0, d1, res; | ||
75 | |||
76 | size = (size + 63) >> 6; /* test the size in words, not bits: a size in -63..0 rounds to 0 words */ | ||
77 | if (!size) /* with 0 words no scasq runs and the jz below would read stale flags */ | ||
78 | return 0; | ||
79 | asm volatile( | ||
80 | " repe; scasq\n" /* skip words equal to rax, i.e. zero */ | ||
81 | " jz 1f\n" /* every word was zero: rax stays 0 */ | ||
82 | " subq $8,%%rdi\n" /* back up to the word that stopped the scan */ | ||
83 | " bsfq (%%rdi),%%rax\n" /* index of its lowest set bit */ | ||
84 | "1: subq %[addr],%%rdi\n" /* byte offset of that word */ | ||
85 | " shlq $3,%%rdi\n" /* bytes to bits */ | ||
86 | " addq %%rdi,%%rax" | ||
87 | :"=a" (res), "=&c" (d0), "=&D" (d1) | ||
88 | :"0" (0ULL), "1" (size), "2" (addr), [addr] "r" (addr) : "memory"); | ||
89 | return res; | ||
90 | } | ||
91 | |||
92 | /** | ||
93 | * find_first_bit - find the first set bit in a memory region | ||
94 | * @addr: The address to start the search at | ||
95 | * @size: The maximum size to search, in bits; whole 64-bit words are scanned | ||
96 | * | ||
97 | * Returns the bit-number of the first set bit, not the number of the byte | ||
98 | * containing a bit. If no bit is set, @size rounded up to a multiple of 64 is returned. | ||
99 | */ | ||
100 | long find_first_bit(const unsigned long * addr, unsigned long size) | ||
101 | { | ||
102 | return __find_first_bit(addr,size); /* out-of-line entry point for the static inline helper */ | ||
103 | } | ||
104 | |||
105 | /** | ||
106 | * find_next_bit - find the next set bit at or after @offset in a memory region | ||
107 | * @addr: The address to base the search on | ||
108 | * @size: The maximum size to search | ||
109 | * @offset: The bitnumber to start searching at | ||
110 | */ | ||
111 | long find_next_bit(const unsigned long * addr, long size, long offset) | ||
112 | { | ||
113 | const unsigned long * p = addr + (offset >> 6); /* 64-bit word that holds bit @offset */ | ||
114 | unsigned long set = 0, bit = offset & 63, res; /* bit: position of @offset inside *p */ | ||
115 | |||
116 | if (bit) { | ||
117 | /* | ||
118 | * Look for nonzero in the first 64 bits: bit @offset is shifted down to bit 0; cmoveq yields 64 when nothing is set | ||
119 | */ | ||
120 | asm("bsfq %1,%0\n\t" | ||
121 | "cmoveq %2,%0\n\t" | ||
122 | : "=r" (set) | ||
123 | : "r" (*p >> bit), "r" (64L)); | ||
124 | if (set < (64 - bit)) /* found inside the first word */ | ||
125 | return set + offset; | ||
126 | set = 64 - bit; /* bits of the first word that were checked */ | ||
127 | p++; | ||
128 | } | ||
129 | /* | ||
130 | * No set bit yet, search remaining full words for a bit | ||
131 | */ | ||
132 | res = __find_first_bit (p, size - 64 * (p - addr)); /* bits left from p on; <= 0 when @offset was in the last word */ | ||
133 | return (offset + set + res); | ||
134 | } | ||
135 | |||
136 | #include <linux/module.h> | ||
137 | |||
138 | EXPORT_SYMBOL(find_next_bit); | ||
139 | EXPORT_SYMBOL(find_first_bit); | ||
140 | EXPORT_SYMBOL(find_first_zero_bit); | ||
141 | EXPORT_SYMBOL(find_next_zero_bit); | ||
diff --git a/arch/x86_64/lib/bitstr.c b/arch/x86_64/lib/bitstr.c new file mode 100644 index 000000000000..24676609a6ac --- /dev/null +++ b/arch/x86_64/lib/bitstr.c | |||
@@ -0,0 +1,28 @@ | |||
1 | #include <linux/module.h> | ||
2 | #include <linux/bitops.h> | ||
3 | |||
4 | /* Find a string of len zero bits in bitmap, searching bits start..nbits-1; returns its first bit or -1 if there is none */ | ||
5 | unsigned long | ||
6 | find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len) | ||
7 | { | ||
8 | unsigned long n, end, i; | ||
9 | |||
10 | again: | ||
11 | n = find_next_zero_bit(bitmap, nbits, start); /* candidate first bit of the string */ | ||
12 | if (n == -1) | ||
13 | return -1; | ||
14 | |||
15 | /* could test bitsliced, but it's hardly worth it */ | ||
16 | end = n+len; /* one past the last bit of the candidate string */ | ||
17 | if (end > nbits) /* end == nbits still fits: the string then ends on the last bit */ | ||
18 | return -1; | ||
19 | for (i = n+1; i < end; i++) { | ||
20 | if (test_bit(i, bitmap)) { | ||
21 | start = i+1; /* bit i is set: restart the search just after it */ | ||
22 | goto again; | ||
23 | } | ||
24 | } | ||
25 | return n; | ||
26 | } | ||
27 | |||
28 | EXPORT_SYMBOL(find_next_zero_string); | ||
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S new file mode 100644 index 000000000000..30a9da458c15 --- /dev/null +++ b/arch/x86_64/lib/clear_page.S | |||
@@ -0,0 +1,50 @@ | |||
1 | /* | ||
2 | * Zero a page (4096 bytes). | ||
3 | * rdi page; rax, rcx and rdi are modified | ||
4 | */ | ||
5 | .globl clear_page | ||
6 | .p2align 4 | ||
7 | clear_page: | ||
8 | xorl %eax,%eax /* rax = 0, the value that is stored */ | ||
9 | movl $4096/64,%ecx /* 64 iterations of 64 bytes */ | ||
10 | .p2align 4 | ||
11 | .Lloop: | ||
12 | decl %ecx /* sets ZF for the jnz below; movq and leaq leave the flags alone */ | ||
13 | #define PUT(x) movq %rax,x*8(%rdi) | ||
14 | movq %rax,(%rdi) | ||
15 | PUT(1) | ||
16 | PUT(2) | ||
17 | PUT(3) | ||
18 | PUT(4) | ||
19 | PUT(5) | ||
20 | PUT(6) | ||
21 | PUT(7) | ||
22 | leaq 64(%rdi),%rdi /* advance without touching the flags */ | ||
23 | jnz .Lloop | ||
24 | nop | ||
25 | ret | ||
26 | clear_page_end: | ||
27 | |||
28 | /* C stepping K8 run faster using the string instructions. | ||
29 | It is also a lot simpler. Use this when possible */ | ||
30 | |||
31 | #include <asm/cpufeature.h> | ||
32 | |||
33 | .section .altinstructions,"a" | ||
34 | .align 8 | ||
35 | .quad clear_page /* code to be replaced */ | ||
36 | .quad clear_page_c /* replacement code */ | ||
37 | .byte X86_FEATURE_K8_C /* presumably the CPU feature that enables the replacement - confirm */ | ||
38 | .byte clear_page_end-clear_page /* length of clear_page */ | ||
39 | .byte clear_page_c_end-clear_page_c /* length of clear_page_c */ | ||
40 | .previous | ||
41 | |||
42 | .section .altinstr_replacement,"ax" | ||
43 | clear_page_c: | ||
44 | movl $4096/8,%ecx /* 512 quadwords */ | ||
45 | xorl %eax,%eax | ||
46 | rep | ||
47 | stosq | ||
48 | ret | ||
49 | clear_page_c_end: | ||
50 | .previous | ||
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S new file mode 100644 index 000000000000..dd3aa47b6bf5 --- /dev/null +++ b/arch/x86_64/lib/copy_page.S | |||
@@ -0,0 +1,101 @@ | |||
1 | /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ | ||
2 | |||
3 | /* Don't use streaming store because it's better when the target | ||
4 | ends up in cache. */ | ||
5 | |||
6 | /* Could vary the prefetch distance based on SMP/UP */ | ||
7 | |||
8 | .globl copy_page | ||
9 | .p2align 4 | ||
10 | copy_page: /* copies 4096 bytes; rdi: destination page, rsi: source page */ | ||
11 | subq $3*8,%rsp | ||
12 | movq %rbx,(%rsp) | ||
13 | movq %r12,1*8(%rsp) | ||
14 | movq %r13,2*8(%rsp) /* NOTE(review): r13 is saved and restored but not otherwise used in this routine */ | ||
15 | |||
16 | movl $(4096/64)-5,%ecx /* all but the last 5 blocks of 64 bytes: those are copied with a prefetch */ | ||
17 | .p2align 4 | ||
18 | .Loop64: | ||
19 | dec %rcx /* sets ZF for the jnz below; movq, prefetcht0 and leaq leave the flags alone */ | ||
20 | |||
21 | movq (%rsi), %rax | ||
22 | movq 8 (%rsi), %rbx | ||
23 | movq 16 (%rsi), %rdx | ||
24 | movq 24 (%rsi), %r8 | ||
25 | movq 32 (%rsi), %r9 | ||
26 | movq 40 (%rsi), %r10 | ||
27 | movq 48 (%rsi), %r11 | ||
28 | movq 56 (%rsi), %r12 | ||
29 | |||
30 | prefetcht0 5*64(%rsi) /* prefetch the source 5 blocks ahead */ | ||
31 | |||
32 | movq %rax, (%rdi) | ||
33 | movq %rbx, 8 (%rdi) | ||
34 | movq %rdx, 16 (%rdi) | ||
35 | movq %r8, 24 (%rdi) | ||
36 | movq %r9, 32 (%rdi) | ||
37 | movq %r10, 40 (%rdi) | ||
38 | movq %r11, 48 (%rdi) | ||
39 | movq %r12, 56 (%rdi) | ||
40 | |||
41 | leaq 64 (%rsi), %rsi | ||
42 | leaq 64 (%rdi), %rdi | ||
43 | |||
44 | jnz .Loop64 | ||
45 | |||
46 | movl $5,%ecx /* last 5 blocks: same copy without the prefetch */ | ||
47 | .p2align 4 | ||
48 | .Loop2: | ||
49 | decl %ecx | ||
50 | |||
51 | movq (%rsi), %rax | ||
52 | movq 8 (%rsi), %rbx | ||
53 | movq 16 (%rsi), %rdx | ||
54 | movq 24 (%rsi), %r8 | ||
55 | movq 32 (%rsi), %r9 | ||
56 | movq 40 (%rsi), %r10 | ||
57 | movq 48 (%rsi), %r11 | ||
58 | movq 56 (%rsi), %r12 | ||
59 | |||
60 | movq %rax, (%rdi) | ||
61 | movq %rbx, 8 (%rdi) | ||
62 | movq %rdx, 16 (%rdi) | ||
63 | movq %r8, 24 (%rdi) | ||
64 | movq %r9, 32 (%rdi) | ||
65 | movq %r10, 40 (%rdi) | ||
66 | movq %r11, 48 (%rdi) | ||
67 | movq %r12, 56 (%rdi) | ||
68 | |||
69 | leaq 64(%rdi),%rdi | ||
70 | leaq 64(%rsi),%rsi | ||
71 | |||
72 | jnz .Loop2 | ||
73 | |||
74 | movq (%rsp),%rbx /* restore the registers saved on entry */ | ||
75 | movq 1*8(%rsp),%r12 | ||
76 | movq 2*8(%rsp),%r13 | ||
77 | addq $3*8,%rsp | ||
78 | ret | ||
79 | |||
80 | /* C stepping K8 run faster using the string copy instructions. | ||
81 | It is also a lot simpler. Use this when possible */ | ||
82 | |||
83 | #include <asm/cpufeature.h> | ||
84 | |||
85 | .section .altinstructions,"a" | ||
86 | .align 8 | ||
87 | .quad copy_page /* code to be replaced */ | ||
88 | .quad copy_page_c /* replacement code */ | ||
89 | .byte X86_FEATURE_K8_C /* presumably the CPU feature that enables the replacement - confirm */ | ||
90 | .byte copy_page_c_end-copy_page_c /* NOTE(review): this is the replacement's length again, not copy_page's own length - confirm intended */ | ||
91 | .byte copy_page_c_end-copy_page_c /* length of copy_page_c */ | ||
92 | .previous | ||
93 | |||
94 | .section .altinstr_replacement,"ax" | ||
95 | copy_page_c: | ||
96 | movl $4096/8,%ecx /* 512 quadwords */ | ||
97 | rep | ||
98 | movsq | ||
99 | ret | ||
100 | copy_page_c_end: | ||
101 | .previous | ||
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S new file mode 100644 index 000000000000..bd556c804248 --- /dev/null +++ b/arch/x86_64/lib/copy_user.S | |||
@@ -0,0 +1,294 @@ | |||
1 | /* Copyright 2002 Andi Kleen, SuSE Labs. | ||
2 | * Subject to the GNU Public License v2. | ||
3 | * | ||
4 | * Functions to copy from and to user space. | ||
5 | */ | ||
6 | |||
7 | #define FIX_ALIGNMENT 1 | ||
8 | |||
9 | #include <asm/current.h> | ||
10 | #include <asm/offset.h> | ||
11 | #include <asm/thread_info.h> | ||
12 | #include <asm/cpufeature.h> | ||
13 | |||
14 | /* Standard copy_to_user with segment limit checking */ | ||
15 | .globl copy_to_user | ||
16 | .p2align 4 | ||
17 | copy_to_user: /* rdi: destination, rsi: source, rdx: count */ | ||
18 | GET_THREAD_INFO(%rax) | ||
19 | movq %rdi,%rcx | ||
20 | addq %rdx,%rcx /* rcx = destination + count */ | ||
21 | jc bad_to_user /* the addition wrapped around */ | ||
22 | cmpq threadinfo_addr_limit(%rax),%rcx | ||
23 | jae bad_to_user /* end is at or above the address limit */ | ||
24 | 2: | ||
25 | .byte 0xe9 /* 32bit jump */ | ||
26 | .long .Lcug-1f /* offset to .Lcug, relative to the end of this jump */ | ||
27 | 1: | ||
28 | |||
29 | .section .altinstr_replacement,"ax" | ||
30 | 3: .byte 0xe9 /* replacement jmp with 32 bit immediate */ | ||
31 | .long copy_user_generic_c-1b /* offset */ | ||
32 | .previous | ||
33 | .section .altinstructions,"a" | ||
34 | .align 8 | ||
35 | .quad 2b /* the 5 byte jump above */ | ||
36 | .quad 3b /* its replacement */ | ||
37 | .byte X86_FEATURE_K8_C | ||
38 | .byte 5 | ||
39 | .byte 5 | ||
40 | .previous | ||
41 | |||
42 | /* Standard copy_from_user with segment limit checking */ | ||
43 | .globl copy_from_user | ||
44 | .p2align 4 | ||
45 | copy_from_user: /* rdi: destination, rsi: source, rdx: count */ | ||
46 | GET_THREAD_INFO(%rax) | ||
47 | movq %rsi,%rcx | ||
48 | addq %rdx,%rcx /* rcx = source + count */ | ||
49 | jc bad_from_user /* the addition wrapped around */ | ||
50 | cmpq threadinfo_addr_limit(%rax),%rcx | ||
51 | jae bad_from_user /* end is at or above the address limit */ | ||
52 | /* FALL THROUGH to copy_user_generic */ | ||
53 | |||
54 | .section .fixup,"ax" | ||
55 | /* must zero dest */ | ||
56 | bad_from_user: | ||
57 | movl %edx,%ecx /* byte count for the stosb */ | ||
58 | xorl %eax,%eax | ||
59 | rep | ||
60 | stosb | ||
61 | bad_to_user: /* also reached by falling through from bad_from_user */ | ||
62 | movl %edx,%eax /* return the whole count as uncopied */ | ||
63 | ret | ||
64 | .previous | ||
65 | |||
66 | |||
67 | /* | ||
68 | * copy_user_generic - memory copy with exception handling. | ||
69 | * | ||
70 | * Input: | ||
71 | * rdi destination | ||
72 | * rsi source | ||
73 | * rdx count | ||
74 | * | ||
75 | * Output: | ||
76 | * eax uncopied bytes or 0 if successful. | ||
77 | */ | ||
78 | .globl copy_user_generic | ||
79 | .p2align 4 | ||
80 | copy_user_generic: | ||
81 | .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */ | ||
82 | .byte 0x66,0x90 | ||
83 | 1: | ||
84 | .section .altinstr_replacement,"ax" | ||
85 | 2: .byte 0xe9 /* near jump with 32bit immediate */ | ||
86 | .long copy_user_generic_c-1b /* offset */ | ||
87 | .previous | ||
88 | .section .altinstructions,"a" | ||
89 | .align 8 | ||
90 | .quad copy_user_generic | ||
91 | .quad 2b | ||
92 | .byte X86_FEATURE_K8_C | ||
93 | .byte 5 | ||
94 | .byte 5 | ||
95 | .previous | ||
96 | .Lcug: | ||
97 | pushq %rbx /* rbx is used as scratch below; restored at .Lende */ | ||
98 | xorl %eax,%eax /*zero for the exception handler */ | ||
99 | |||
100 | #ifdef FIX_ALIGNMENT | ||
101 | /* check for bad alignment of destination */ | ||
102 | movl %edi,%ecx | ||
103 | andl $7,%ecx | ||
104 | jnz .Lbad_alignment | ||
105 | .Lafter_bad_alignment: | ||
106 | #endif | ||
107 | |||
108 | movq %rdx,%rcx /* rcx keeps the byte count for the tail code */ | ||
109 | |||
110 | movl $64,%ebx /* block size, used by the fault handlers below */ | ||
111 | shrq $6,%rdx /* number of whole 64 byte blocks */ | ||
112 | decq %rdx | ||
113 | js .Lhandle_tail /* no whole block to copy */ | ||
114 | |||
115 | .p2align 4 | ||
116 | .Lloop: | ||
117 | .Ls1: movq (%rsi),%r11 | ||
118 | .Ls2: movq 1*8(%rsi),%r8 | ||
119 | .Ls3: movq 2*8(%rsi),%r9 | ||
120 | .Ls4: movq 3*8(%rsi),%r10 | ||
121 | .Ld1: movq %r11,(%rdi) | ||
122 | .Ld2: movq %r8,1*8(%rdi) | ||
123 | .Ld3: movq %r9,2*8(%rdi) | ||
124 | .Ld4: movq %r10,3*8(%rdi) | ||
125 | |||
126 | .Ls5: movq 4*8(%rsi),%r11 | ||
127 | .Ls6: movq 5*8(%rsi),%r8 | ||
128 | .Ls7: movq 6*8(%rsi),%r9 | ||
129 | .Ls8: movq 7*8(%rsi),%r10 | ||
130 | .Ld5: movq %r11,4*8(%rdi) | ||
131 | .Ld6: movq %r8,5*8(%rdi) | ||
132 | .Ld7: movq %r9,6*8(%rdi) | ||
133 | .Ld8: movq %r10,7*8(%rdi) | ||
134 | |||
135 | decq %rdx /* sets SF for the jns below; leaq leaves the flags alone */ | ||
136 | |||
137 | leaq 64(%rsi),%rsi | ||
138 | leaq 64(%rdi),%rdi | ||
139 | |||
140 | jns .Lloop | ||
141 | |||
142 | .p2align 4 | ||
143 | .Lhandle_tail: | ||
144 | movl %ecx,%edx /* edx = byte count, kept for .Lhandle_7 */ | ||
145 | andl $63,%ecx | ||
146 | shrl $3,%ecx /* quadwords left after the 64 byte blocks */ | ||
147 | jz .Lhandle_7 | ||
148 | movl $8,%ebx | ||
149 | .p2align 4 | ||
150 | .Lloop_8: | ||
151 | .Ls9: movq (%rsi),%r8 | ||
152 | .Ld9: movq %r8,(%rdi) | ||
153 | decl %ecx | ||
154 | leaq 8(%rdi),%rdi | ||
155 | leaq 8(%rsi),%rsi | ||
156 | jnz .Lloop_8 | ||
157 | |||
158 | .Lhandle_7: | ||
159 | movl %edx,%ecx | ||
160 | andl $7,%ecx /* bytes left after the quadwords */ | ||
161 | jz .Lende | ||
162 | .p2align 4 | ||
163 | .Lloop_1: | ||
164 | .Ls10: movb (%rsi),%bl | ||
165 | .Ld10: movb %bl,(%rdi) | ||
166 | incq %rdi | ||
167 | incq %rsi | ||
168 | decl %ecx | ||
169 | jnz .Lloop_1 | ||
170 | |||
171 | .Lende: | ||
172 | popq %rbx | ||
173 | ret | ||
174 | |||
175 | #ifdef FIX_ALIGNMENT | ||
176 | /* align destination */ | ||
177 | .p2align 4 | ||
178 | .Lbad_alignment: | ||
179 | movl $8,%r9d | ||
180 | subl %ecx,%r9d /* r9 = bytes needed to reach 8 byte alignment */ | ||
181 | movl %r9d,%ecx | ||
182 | cmpq %r9,%rdx | ||
183 | jz .Lhandle_7 /* count does not exceed the alignment bytes: plain byte loop */ | ||
184 | js .Lhandle_7 | ||
185 | .Lalign_1: | ||
186 | .Ls11: movb (%rsi),%bl | ||
187 | .Ld11: movb %bl,(%rdi) | ||
188 | incq %rsi | ||
189 | incq %rdi | ||
190 | decl %ecx | ||
191 | jnz .Lalign_1 | ||
192 | subq %r9,%rdx /* count minus the bytes just copied */ | ||
193 | jmp .Lafter_bad_alignment | ||
194 | #endif | ||
195 | |||
196 | /* table sorted by exception address */ | ||
197 | .section __ex_table,"a" | ||
198 | .align 8 | ||
199 | .quad .Ls1,.Ls1e | ||
200 | .quad .Ls2,.Ls2e | ||
201 | .quad .Ls3,.Ls3e | ||
202 | .quad .Ls4,.Ls4e | ||
203 | .quad .Ld1,.Ls1e | ||
204 | .quad .Ld2,.Ls2e | ||
205 | .quad .Ld3,.Ls3e | ||
206 | .quad .Ld4,.Ls4e | ||
207 | .quad .Ls5,.Ls5e | ||
208 | .quad .Ls6,.Ls6e | ||
209 | .quad .Ls7,.Ls7e | ||
210 | .quad .Ls8,.Ls8e | ||
211 | .quad .Ld5,.Ls5e | ||
212 | .quad .Ld6,.Ls6e | ||
213 | .quad .Ld7,.Ls7e | ||
214 | .quad .Ld8,.Ls8e | ||
215 | .quad .Ls9,.Le_quad | ||
216 | .quad .Ld9,.Le_quad | ||
217 | .quad .Ls10,.Le_byte | ||
218 | .quad .Ld10,.Le_byte | ||
219 | #ifdef FIX_ALIGNMENT | ||
220 | .quad .Ls11,.Lzero_rest | ||
221 | .quad .Ld11,.Lzero_rest | ||
222 | #endif | ||
223 | .quad .Le5,.Le_zero | ||
224 | .previous | ||
225 | |||
226 | /* compute 64-offset for main loop. 8 bytes accuracy with error on the | ||
227 | pessimistic side. this is gross. it would be better to fix the | ||
228 | interface. */ | ||
229 | /* eax: zero, ebx: 64 */ | ||
230 | .Ls1e: addl $8,%eax /* entries fall through: the earlier the fault, the more 8s are added */ | ||
231 | .Ls2e: addl $8,%eax | ||
232 | .Ls3e: addl $8,%eax | ||
233 | .Ls4e: addl $8,%eax | ||
234 | .Ls5e: addl $8,%eax | ||
235 | .Ls6e: addl $8,%eax | ||
236 | .Ls7e: addl $8,%eax | ||
237 | .Ls8e: addl $8,%eax | ||
238 | addq %rbx,%rdi /* +64 */ | ||
239 | subq %rax,%rdi /* correct destination with computed offset */ | ||
240 | |||
241 | shlq $6,%rdx /* loop counter * 64 (stride length) */ | ||
242 | addq %rax,%rdx /* add offset to loopcnt */ | ||
243 | andl $63,%ecx /* remaining bytes */ | ||
244 | addq %rcx,%rdx /* add them */ | ||
245 | jmp .Lzero_rest | ||
246 | |||
247 | /* exception on quad word loop in tail handling */ | ||
248 | /* ecx: loopcnt/8, %edx: length, rdi: correct */ | ||
249 | .Le_quad: | ||
250 | shll $3,%ecx | ||
251 | andl $7,%edx | ||
252 | addl %ecx,%edx | ||
253 | /* edx: bytes to zero, rdi: dest, eax:zero */ | ||
254 | .Lzero_rest: | ||
255 | movq %rdx,%rcx | ||
256 | .Le_byte: | ||
257 | xorl %eax,%eax | ||
258 | .Le5: rep | ||
259 | stosb | ||
260 | /* when there is another exception while zeroing the rest just return */ | ||
261 | .Le_zero: | ||
262 | movq %rdx,%rax /* return value: uncopied bytes */ | ||
263 | jmp .Lende | ||
264 | |||
265 | /* C stepping K8 run faster using the string copy instructions. | ||
266 | This is also a lot simpler. Use them when possible. | ||
267 | Patch in jmps to this code instead of copying it fully | ||
268 | to avoid unwanted aliasing in the exception tables. */ | ||
269 | |||
270 | /* rdi destination | ||
271 | * rsi source | ||
272 | * rdx count | ||
273 | * | ||
274 | * Output: | ||
275 | * eax uncopied bytes or 0 if successful. | ||
276 | */ | ||
277 | copy_user_generic_c: | ||
278 | movl %edx,%ecx | ||
279 | shrl $3,%ecx /* ecx = number of quadwords */ | ||
280 | andl $7,%edx /* edx = trailing bytes */ | ||
281 | 1: rep | ||
282 | movsq | ||
283 | movl %edx,%ecx | ||
284 | 2: rep | ||
285 | movsb | ||
286 | 4: movl %ecx,%eax /* bytes not copied by the movsb; 0 when it completed */ | ||
287 | ret | ||
288 | 3: lea (%rdx,%rcx,8),%rax /* fault in the movsq: remaining quadwords * 8 + trailing bytes */ | ||
289 | ret | ||
290 | |||
291 | .section __ex_table,"a" | ||
292 | .quad 1b,3b | ||
293 | .quad 2b,4b | ||
294 | .previous | ||
diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S new file mode 100644 index 000000000000..01808ec37836 --- /dev/null +++ b/arch/x86_64/lib/csum-copy.S | |||
@@ -0,0 +1,233 @@ | |||
1 | /* | ||
2 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
3 | * | ||
4 | * This file is subject to the terms and conditions of the GNU General Public | ||
5 | * License. See the file COPYING in the main directory of this archive | ||
6 | * for more details. No warranty for anything given at all. | ||
7 | */ | ||
8 | #include <linux/linkage.h> | ||
9 | #include <asm/errno.h> | ||
10 | |||
11 | /* | ||
12 | * Checksum copy with exception handling. | ||
13 | * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the | ||
14 | * destination is zeroed. | ||
15 | * | ||
16 | * Input | ||
17 | * rdi source | ||
18 | * rsi destination | ||
19 | * edx len (32bit) | ||
20 | * ecx sum (32bit) | ||
21 | * r8 src_err_ptr (int) | ||
22 | * r9 dst_err_ptr (int) | ||
23 | * | ||
24 | * Output | ||
25 | * eax 32bit sum (folded from the 64bit accumulator). undefined in case of exception. | ||
26 | * | ||
27 | * Wrappers need to take care of valid exception sum and zeroing. | ||
28 | * They also should align source or destination to 8 bytes. | ||
29 | */ | ||
30 | |||
31 | .macro source /* exception table entry: a fault at the next instruction continues at .Lbad_source */ | ||
32 | 10: | ||
33 | .section __ex_table,"a" | ||
34 | .align 8 | ||
35 | .quad 10b,.Lbad_source | ||
36 | .previous | ||
37 | .endm | ||
38 | |||
39 | .macro dest /* exception table entry: a fault at the next instruction continues at .Lbad_dest */ | ||
40 | 20: | ||
41 | .section __ex_table,"a" | ||
42 | .align 8 | ||
43 | .quad 20b,.Lbad_dest | ||
44 | .previous | ||
45 | .endm | ||
46 | |||
47 | .macro ignore L=.Lignore /* exception table entry: a fault at the next instruction continues at label L */ | ||
48 | 30: | ||
49 | .section __ex_table,"a" | ||
50 | .align 8 | ||
51 | .quad 30b,\L | ||
52 | .previous | ||
53 | .endm | ||
54 | |||
55 | |||
56 | .globl csum_partial_copy_generic | ||
57 | .p2align 4 | ||
58 | csum_partial_copy_generic: /* rdi: source, rsi: destination, edx: len, ecx: sum, r8/r9: src/dst error pointers */ | ||
59 | cmpl $3*64,%edx | ||
60 | jle .Lignore /* NOTE(review): both outcomes continue at .Lignore, so this compare has no effect */ | ||
61 | |||
62 | .Lignore: | ||
63 | subq $7*8,%rsp | ||
64 | movq %rbx,2*8(%rsp) | ||
65 | movq %r12,3*8(%rsp) | ||
66 | movq %r14,4*8(%rsp) | ||
67 | movq %r13,5*8(%rsp) | ||
68 | movq %rbp,6*8(%rsp) | ||
69 | |||
70 | movq %r8,(%rsp) /* src_err_ptr, read back by .Lbad_source */ | ||
71 | movq %r9,1*8(%rsp) /* dst_err_ptr, read back by .Lbad_dest */ | ||
72 | |||
73 | movl %ecx,%eax /* eax = initial sum */ | ||
74 | movl %edx,%ecx /* ecx = length */ | ||
75 | |||
76 | xorl %r9d,%r9d /* r9 = 0, used to add in the carry */ | ||
77 | movq %rcx,%r12 | ||
78 | |||
79 | shrq $6,%r12 /* number of 64 byte blocks */ | ||
80 | jz .Lhandle_tail /* < 64 */ | ||
81 | |||
82 | clc | ||
83 | |||
84 | /* main loop. copy and sum in 64 byte blocks */ | ||
85 | /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ | ||
86 | /* r11: temp3, rdx: temp4, r12 loopcnt */ | ||
87 | /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */ | ||
88 | .p2align 4 | ||
89 | .Lloop: | ||
90 | source | ||
91 | movq (%rdi),%rbx | ||
92 | source | ||
93 | movq 8(%rdi),%r8 | ||
94 | source | ||
95 | movq 16(%rdi),%r11 | ||
96 | source | ||
97 | movq 24(%rdi),%rdx | ||
98 | |||
99 | source | ||
100 | movq 32(%rdi),%r10 | ||
101 | source | ||
102 | movq 40(%rdi),%rbp | ||
103 | source | ||
104 | movq 48(%rdi),%r14 | ||
105 | source | ||
106 | movq 56(%rdi),%r13 | ||
107 | |||
108 | ignore 2f | ||
109 | prefetcht0 5*64(%rdi) | ||
110 | 2: | ||
111 | adcq %rbx,%rax | ||
112 | adcq %r8,%rax | ||
113 | adcq %r11,%rax | ||
114 | adcq %rdx,%rax | ||
115 | adcq %r10,%rax | ||
116 | adcq %rbp,%rax | ||
117 | adcq %r14,%rax | ||
118 | adcq %r13,%rax | ||
119 | |||
120 | decl %r12d /* sets ZF for the jnz below and leaves the carry alone */ | ||
121 | |||
122 | dest | ||
123 | movq %rbx,(%rsi) | ||
124 | dest | ||
125 | movq %r8,8(%rsi) | ||
126 | dest | ||
127 | movq %r11,16(%rsi) | ||
128 | dest | ||
129 | movq %rdx,24(%rsi) | ||
130 | |||
131 | dest | ||
132 | movq %r10,32(%rsi) | ||
133 | dest | ||
134 | movq %rbp,40(%rsi) | ||
135 | dest | ||
136 | movq %r14,48(%rsi) | ||
137 | dest | ||
138 | movq %r13,56(%rsi) | ||
139 | |||
140 | 3: | ||
141 | |||
142 | leaq 64(%rdi),%rdi | ||
143 | leaq 64(%rsi),%rsi | ||
144 | |||
145 | jnz .Lloop | ||
146 | |||
147 | adcq %r9,%rax | ||
148 | |||
149 | /* do last up to 56 bytes */ | ||
150 | .Lhandle_tail: | ||
151 | /* ecx: count */ | ||
152 | movl %ecx,%r10d | ||
153 | andl $63,%ecx | ||
154 | shrl $3,%ecx | ||
155 | jz .Lfold | ||
156 | clc | ||
157 | .p2align 4 | ||
158 | .Lloop_8: | ||
159 | source | ||
160 | movq (%rdi),%rbx | ||
161 | adcq %rbx,%rax | ||
162 | decl %ecx | ||
163 | dest | ||
164 | movq %rbx,(%rsi) | ||
165 | leaq 8(%rsi),%rsi /* preserve carry */ | ||
166 | leaq 8(%rdi),%rdi | ||
167 | jnz .Lloop_8 | ||
168 | adcq %r9,%rax /* add in carry */ | ||
169 | |||
170 | .Lfold: | ||
171 | /* reduce checksum to 32bits */ | ||
172 | movl %eax,%ebx | ||
173 | shrq $32,%rax | ||
174 | addl %ebx,%eax | ||
175 | adcl %r9d,%eax | ||
176 | |||
177 | /* do last up to 6 bytes */ | ||
178 | .Lhandle_7: | ||
179 | movl %r10d,%ecx | ||
180 | andl $7,%ecx | ||
181 | shrl $1,%ecx | ||
182 | jz .Lhandle_1 | ||
183 | movl $2,%edx | ||
184 | xorl %ebx,%ebx | ||
185 | clc | ||
186 | .p2align 4 | ||
187 | .Lloop_1: | ||
188 | source | ||
189 | movw (%rdi),%bx | ||
190 | adcl %ebx,%eax | ||
191 | decl %ecx | ||
192 | dest /* must sit directly before the store so that the table entry covers the instruction that can fault */ | ||
193 | movw %bx,(%rsi) | ||
194 | leaq 2(%rdi),%rdi | ||
195 | leaq 2(%rsi),%rsi | ||
196 | jnz .Lloop_1 | ||
197 | adcl %r9d,%eax /* add in carry */ | ||
198 | |||
199 | /* handle last odd byte */ | ||
200 | .Lhandle_1: | ||
201 | testl $1,%r10d | ||
202 | jz .Lende | ||
203 | xorl %ebx,%ebx | ||
204 | source | ||
205 | movb (%rdi),%bl | ||
206 | dest | ||
207 | movb %bl,(%rsi) | ||
208 | addl %ebx,%eax | ||
209 | adcl %r9d,%eax /* carry */ | ||
210 | |||
211 | .Lende: | ||
212 | movq 2*8(%rsp),%rbx | ||
213 | movq 3*8(%rsp),%r12 | ||
214 | movq 4*8(%rsp),%r14 | ||
215 | movq 5*8(%rsp),%r13 | ||
216 | movq 6*8(%rsp),%rbp | ||
217 | addq $7*8,%rsp | ||
218 | ret | ||
219 | |||
220 | /* Exception handlers. Very simple, zeroing is done in the wrappers */ | ||
221 | .Lbad_source: | ||
222 | movq (%rsp),%rax /* src_err_ptr saved on entry */ | ||
223 | testq %rax,%rax | ||
224 | jz .Lende /* no pointer given: nothing to report */ | ||
225 | movl $-EFAULT,(%rax) | ||
226 | jmp .Lende | ||
227 | |||
228 | .Lbad_dest: | ||
229 | movq 8(%rsp),%rax /* dst_err_ptr saved on entry */ | ||
230 | testq %rax,%rax | ||
231 | jz .Lende /* no pointer given: nothing to report */ | ||
232 | movl $-EFAULT,(%rax) | ||
233 | jmp .Lende | ||
diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c new file mode 100644 index 000000000000..5384e227cdf6 --- /dev/null +++ b/arch/x86_64/lib/csum-partial.c | |||
@@ -0,0 +1,150 @@ | |||
1 | /* | ||
2 | * arch/x86_64/lib/csum-partial.c | ||
3 | * | ||
4 | * This file contains network checksum routines that are better done | ||
5 | * in an architecture-specific manner due to speed. | ||
6 | */ | ||
7 | |||
8 | #include <linux/compiler.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <asm/checksum.h> | ||
11 | |||
12 | #define __force_inline inline __attribute__((always_inline)) | ||
13 | |||
/*
 * Fold a 32-bit value down to 16 bits: the low half-word is added to the
 * high half-word and a carry out of bit 15 is wrapped back into the sum.
 */
static inline unsigned short from32to16(unsigned a)
{
	unsigned short folded = a >> 16;

	__asm__("addw %w[low],%w[acc]\n\t"
		"adcw $0,%w[acc]\n"
		: [acc] "+r" (folded)
		: [low] "r" (a));
	return folded;
}
23 | |||
/*
 * Do a 64-bit checksum on an arbitrary memory area.
 * Returns a 32bit checksum.
 *
 * buff: first byte of the area; any alignment is accepted, the head is
 *       consumed in 1/2/4 byte steps until the pointer is 8-byte aligned.
 * len:  number of bytes to sum; 0 yields 0.
 *
 * The sum is accumulated in a 64-bit variable (the bulk loops use
 * add-with-carry) and folded to 32 bits with add32_with_carry() before
 * it is returned.
 *
 * This isn't as time critical as it used to be because many NICs
 * do hardware checksumming these days.
 *
 * Things tried and found to not make it faster:
 * Manual Prefetching
 * Unrolling to a 128-byte inner loop.
 * Using interleaving with more registers to break the carry chains.
 */
static __force_inline unsigned do_csum(const unsigned char *buff, unsigned len)
{
	unsigned odd, count;
	unsigned long result = 0;

	if (unlikely(len == 0))
		return result;
	odd = 1 & (unsigned long) buff;
	if (unlikely(odd)) {
		/*
		 * Buffer starts on an odd address: take the first byte
		 * shifted up by 8; the byte swap at the end of the function
		 * compensates for summing everything else one byte off.
		 */
		result = *buff << 8;
		len--;
		buff++;
	}
	count = len >> 1;		/* nr of 16-bit words.. */
	if (count) {
		/* consume one 16-bit word to reach 4-byte alignment */
		if (2 & (unsigned long) buff) {
			result += *(unsigned short *)buff;
			count--;
			len -= 2;
			buff += 2;
		}
		count >>= 1;		/* nr of 32-bit words.. */
		if (count) {
			unsigned long zero;
			unsigned count64;
			/* consume one 32-bit word to reach 8-byte alignment */
			if (4 & (unsigned long) buff) {
				result += *(unsigned int *) buff;
				count--;
				len -= 4;
				buff += 4;
			}
			count >>= 1;	/* nr of 64-bit words.. */

			/* main loop using 64byte blocks */
			zero = 0;
			count64 = count >> 3;
			while (count64) {
				/*
				 * Eight chained add-with-carry steps; the
				 * final adcq of zero folds the last carry in.
				 * NOTE(review): the loads go through [src]
				 * as a plain register operand, so the
				 * compiler is not told this asm reads memory
				 * -- confirm that is acceptable here.
				 */
				asm("addq 0*8(%[src]),%[res]\n\t"
				    "adcq 1*8(%[src]),%[res]\n\t"
				    "adcq 2*8(%[src]),%[res]\n\t"
				    "adcq 3*8(%[src]),%[res]\n\t"
				    "adcq 4*8(%[src]),%[res]\n\t"
				    "adcq 5*8(%[src]),%[res]\n\t"
				    "adcq 6*8(%[src]),%[res]\n\t"
				    "adcq 7*8(%[src]),%[res]\n\t"
				    "adcq %[zero],%[res]"
				    : [res] "=r" (result)
				    : [src] "r" (buff), [zero] "r" (zero),
				    "[res]" (result));
				buff += 64;
				count64--;
			}

			/* last up to 7 8-byte blocks */
			count %= 8;
			while (count) {
				/* add one 64-bit word, then the carry it produced */
				asm("addq %1,%0\n\t"
				    "adcq %2,%0\n"
				    : "=r" (result)
				    : "m" (*(unsigned long *)buff),
				    "r" (zero),  "0" (result));
				--count;
				buff += 8;
			}
			/* fold the 64-bit sum to 32 bits before adding the tail */
			result = add32_with_carry(result>>32,
						  result&0xffffffff);

			/* trailing 32-bit word, if any */
			if (len & 4) {
				result += *(unsigned int *) buff;
				buff += 4;
			}
		}
		/* trailing 16-bit word, if any */
		if (len & 2) {
			result += *(unsigned short *) buff;
			buff += 2;
		}
	}
	/* trailing single byte, if any */
	if (len & 1)
		result += *buff;
	/* the tail additions may have spilled past bit 31 again */
	result = add32_with_carry(result>>32, result & 0xffffffff);
	if (unlikely(odd)) {
		/* undo the one-byte skew: fold to 16 bits and swap the bytes */
		result = from32to16(result);
		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
	}
	return result;
}
122 | |||
/*
 * csum_partial - checksum a memory block and merge it with a running sum.
 *
 * buff: start of the block
 * len:  length of the block in bytes
 * sum:  32-bit checksum accumulated so far
 *
 * The return value is a 32-bit number that can be fed back in as "sum"
 * for the next fragment, or handed to csum_tcpudp_magic.
 *
 * Every fragment except the last one must have an even length; only the
 * last fragment may be odd.
 *
 * It's best to have buff aligned on a 64-bit boundary.
 */
unsigned csum_partial(const unsigned char *buff, unsigned len, unsigned sum)
{
	unsigned block_sum = do_csum(buff, len);

	return add32_with_carry(block_sum, sum);
}
139 | |||
140 | EXPORT_SYMBOL(csum_partial); | ||
141 | |||
/*
 * Checksum a buffer starting from a zero sum and fold the result with
 * csum_fold(). Used for miscellaneous IP-like checksums, mainly in icmp.c.
 */
unsigned short ip_compute_csum(unsigned char * buff, int len)
{
	unsigned unfolded = csum_partial(buff, len, 0);

	return csum_fold(unfolded);
}
150 | |||
diff --git a/arch/x86_64/lib/csum-wrappers.c b/arch/x86_64/lib/csum-wrappers.c new file mode 100644 index 000000000000..94323f20816e --- /dev/null +++ b/arch/x86_64/lib/csum-wrappers.c | |||
@@ -0,0 +1,129 @@ | |||
1 | /* Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
2 | * Subject to the GNU Public License v.2 | ||
3 | * | ||
4 | * Wrappers of assembly checksum functions for x86-64. | ||
5 | */ | ||
6 | |||
7 | #include <asm/checksum.h> | ||
8 | #include <linux/module.h> | ||
9 | |||
10 | /** | ||
11 | * csum_partial_copy_from_user - Copy and checksum from user space. | ||
12 | * @src: source address (user space) | ||
13 | * @dst: destination address | ||
14 | * @len: number of bytes to be copied. | ||
15 | * @isum: initial sum that is added into the result (32bit unfolded) | ||
16 | * @errp: set to -EFAULT for an bad source address. | ||
17 | * | ||
18 | * Returns an 32bit unfolded checksum of the buffer. | ||
19 | * src and dst are best aligned to 64bits. | ||
20 | */ | ||
21 | unsigned int | ||
22 | csum_partial_copy_from_user(const unsigned char __user *src, unsigned char *dst, | ||
23 | int len, unsigned int isum, int *errp) | ||
24 | { | ||
25 | might_sleep(); | ||
26 | *errp = 0; | ||
27 | if (likely(access_ok(VERIFY_READ,src, len))) { | ||
28 | /* Why 6, not 7? To handle odd addresses aligned we | ||
29 | would need to do considerable complications to fix the | ||
30 | checksum which is defined as an 16bit accumulator. The | ||
31 | fix alignment code is primarily for performance | ||
32 | compatibility with 32bit and that will handle odd | ||
33 | addresses slowly too. */ | ||
34 | if (unlikely((unsigned long)src & 6)) { | ||
35 | while (((unsigned long)src & 6) && len >= 2) { | ||
36 | __u16 val16; | ||
37 | *errp = __get_user(val16, (__u16 __user *)src); | ||
38 | if (*errp) | ||
39 | return isum; | ||
40 | *(__u16 *)dst = val16; | ||
41 | isum = add32_with_carry(isum, val16); | ||
42 | src += 2; | ||
43 | dst += 2; | ||
44 | len -= 2; | ||
45 | } | ||
46 | } | ||
47 | isum = csum_partial_copy_generic((__force void *)src,dst,len,isum,errp,NULL); | ||
48 | if (likely(*errp == 0)) | ||
49 | return isum; | ||
50 | } | ||
51 | *errp = -EFAULT; | ||
52 | memset(dst,0,len); | ||
53 | return isum; | ||
54 | } | ||
55 | |||
56 | EXPORT_SYMBOL(csum_partial_copy_from_user); | ||
57 | |||
58 | /** | ||
59 | * csum_partial_copy_to_user - Copy and checksum to user space. | ||
60 | * @src: source address | ||
61 | * @dst: destination address (user space) | ||
62 | * @len: number of bytes to be copied. | ||
63 | * @isum: initial sum that is added into the result (32bit unfolded) | ||
64 | * @errp: set to -EFAULT for an bad destination address. | ||
65 | * | ||
66 | * Returns an 32bit unfolded checksum of the buffer. | ||
67 | * src and dst are best aligned to 64bits. | ||
68 | */ | ||
69 | unsigned int | ||
70 | csum_partial_copy_to_user(unsigned const char *src, unsigned char __user *dst, | ||
71 | int len, unsigned int isum, int *errp) | ||
72 | { | ||
73 | might_sleep(); | ||
74 | if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { | ||
75 | *errp = -EFAULT; | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | if (unlikely((unsigned long)dst & 6)) { | ||
80 | while (((unsigned long)dst & 6) && len >= 2) { | ||
81 | __u16 val16 = *(__u16 *)src; | ||
82 | isum = add32_with_carry(isum, val16); | ||
83 | *errp = __put_user(val16, (__u16 __user *)dst); | ||
84 | if (*errp) | ||
85 | return isum; | ||
86 | src += 2; | ||
87 | dst += 2; | ||
88 | len -= 2; | ||
89 | } | ||
90 | } | ||
91 | |||
92 | *errp = 0; | ||
93 | return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp); | ||
94 | } | ||
95 | |||
96 | EXPORT_SYMBOL(csum_partial_copy_to_user); | ||
97 | |||
98 | /** | ||
99 | * csum_partial_copy_nocheck - Copy and checksum. | ||
100 | * @src: source address | ||
101 | * @dst: destination address | ||
102 | * @len: number of bytes to be copied. | ||
103 | * @isum: initial sum that is added into the result (32bit unfolded) | ||
104 | * | ||
105 | * Returns an 32bit unfolded checksum of the buffer. | ||
106 | */ | ||
107 | unsigned int | ||
108 | csum_partial_copy_nocheck(const unsigned char *src, unsigned char *dst, int len, unsigned int sum) | ||
109 | { | ||
110 | return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL); | ||
111 | } | ||
112 | |||
/*
 * csum_ipv6_magic - checksum over two IPv6 addresses plus length,
 * protocol and a running sum.
 *
 * saddr/daddr: the two addresses; 16 bytes are read from each.
 * len, proto:  converted with htonl()/htons() before being summed.
 * sum:         32-bit partial checksum to include.
 *
 * Returns the result of csum_fold() on the combined 32-bit sum.
 */
unsigned short csum_ipv6_magic(struct in6_addr *saddr, struct in6_addr *daddr,
			       __u32 len, unsigned short proto, unsigned int sum)
{
	__u64 rest, sum64;

	/* three values of at most 32 bits each: cannot overflow 64 bits */
	rest = (__u64)htonl(len) + (__u64)htons(proto) + (__u64)sum;
	/*
	 * Add both 8-byte halves of each address with carry; the closing
	 * adcq $0 folds the last carry back in.
	 * NOTE(review): the addresses are passed as register operands only,
	 * so the compiler is not told that the asm reads *saddr / *daddr.
	 */
	asm(" addq (%[saddr]),%[sum]\n"
	    " adcq 8(%[saddr]),%[sum]\n"
	    " adcq (%[daddr]),%[sum]\n"
	    " adcq 8(%[daddr]),%[sum]\n"
	    " adcq $0,%[sum]\n"
	    : [sum] "=r" (sum64)
	    : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr));
	/* 64 -> 32 bits here, the rest is left to csum_fold() */
	return csum_fold(add32_with_carry(sum64 & 0xffffffff, sum64>>32));
}
128 | |||
129 | EXPORT_SYMBOL(csum_ipv6_magic); | ||
diff --git a/arch/x86_64/lib/dec_and_lock.c b/arch/x86_64/lib/dec_and_lock.c new file mode 100644 index 000000000000..ab43394dc775 --- /dev/null +++ b/arch/x86_64/lib/dec_and_lock.c | |||
@@ -0,0 +1,40 @@ | |||
1 | /* | ||
2 | * x86 version of "atomic_dec_and_lock()" using | ||
3 | * the atomic "cmpxchg" instruction. | ||
4 | * | ||
5 | * (For CPU's lacking cmpxchg, we use the slow | ||
6 | * generic version, and this one never even gets | ||
7 | * compiled). | ||
8 | */ | ||
9 | |||
10 | #include <linux/spinlock.h> | ||
11 | #include <asm/atomic.h> | ||
12 | |||
13 | int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) | ||
14 | { | ||
15 | int counter; | ||
16 | int newcount; | ||
17 | |||
18 | repeat: | ||
19 | counter = atomic_read(atomic); | ||
20 | newcount = counter-1; | ||
21 | |||
22 | if (!newcount) | ||
23 | goto slow_path; | ||
24 | |||
25 | asm volatile("lock; cmpxchgl %1,%2" | ||
26 | :"=a" (newcount) | ||
27 | :"r" (newcount), "m" (atomic->counter), "0" (counter)); | ||
28 | |||
29 | /* If the above failed, "eax" will have changed */ | ||
30 | if (newcount != counter) | ||
31 | goto repeat; | ||
32 | return 0; | ||
33 | |||
34 | slow_path: | ||
35 | spin_lock(lock); | ||
36 | if (atomic_dec_and_test(atomic)) | ||
37 | return 1; | ||
38 | spin_unlock(lock); | ||
39 | return 0; | ||
40 | } | ||
diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c new file mode 100644 index 000000000000..6e2d66472eb1 --- /dev/null +++ b/arch/x86_64/lib/delay.c | |||
@@ -0,0 +1,48 @@ | |||
1 | /* | ||
2 | * Precise Delay Loops for x86-64 | ||
3 | * | ||
4 | * Copyright (C) 1993 Linus Torvalds | ||
5 | * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> | ||
6 | * | ||
7 | * The __delay function must _NOT_ be inlined as its execution time | ||
8 | * depends wildly on alignment on many x86 processors. | ||
9 | */ | ||
10 | |||
11 | #include <linux/config.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/delay.h> | ||
14 | #include <asm/delay.h> | ||
15 | |||
16 | #ifdef CONFIG_SMP | ||
17 | #include <asm/smp.h> | ||
18 | #endif | ||
19 | |||
20 | int x86_udelay_tsc = 0; /* Delay via TSC */ | ||
21 | |||
/*
 * Busy-wait until at least "loops" TSC ticks (as read by rdtscl) have
 * elapsed since entry. The elapsed count is tracked in 32 bits.
 */
void __delay(unsigned long loops)
{
	unsigned start, cur;

	rdtscl(start);
	for (;;) {
		rep_nop();
		rdtscl(cur);
		if ((cur - start) >= loops)
			break;
	}
}
34 | |||
35 | inline void __const_udelay(unsigned long xloops) | ||
36 | { | ||
37 | __delay(((xloops * cpu_data[_smp_processor_id()].loops_per_jiffy) >> 32) * HZ); | ||
38 | } | ||
39 | |||
/* Delay for the given number of microseconds. */
void __udelay(unsigned long usecs)
{
	/* 2**32 / 1000000 */
	const unsigned long xloops_per_usec = 0x000010c6;

	__const_udelay(usecs * xloops_per_usec);
}
44 | |||
/* Delay for the given number of nanoseconds. */
void __ndelay(unsigned long nsecs)
{
	/* 2**32 / 1000000000 (rounded up) */
	const unsigned long xloops_per_nsec = 0x00005;

	__const_udelay(nsecs * xloops_per_nsec);
}
diff --git a/arch/x86_64/lib/getuser.S b/arch/x86_64/lib/getuser.S new file mode 100644 index 000000000000..f94ea8a44051 --- /dev/null +++ b/arch/x86_64/lib/getuser.S | |||
@@ -0,0 +1,101 @@ | |||
1 | /* | ||
2 | * __get_user functions. | ||
3 | * | ||
4 | * (C) Copyright 1998 Linus Torvalds | ||
5 | * (C) Copyright 2005 Andi Kleen | ||
6 | * | ||
7 | * These functions have a non-standard call interface | ||
8 | * to make them more efficient, especially as they | ||
9 | * return an error value in addition to the "real" | ||
10 | * return value. | ||
11 | */ | ||
12 | |||
13 | /* | ||
14 | * __get_user_X | ||
15 | * | ||
16 | * Inputs: %rcx contains the address. | ||
17 | * The register is modified, but all changes are undone | ||
18 | * before returning because the C code doesn't know about it. | ||
19 | * | ||
20 | * Outputs: %rax is error code (0 or -EFAULT) | ||
21 | * %rdx contains zero-extended value | ||
22 | * | ||
23 | * %r8 is destroyed. | ||
24 | * | ||
25 | * These functions should not modify any other registers, | ||
26 | * as they get called from within inline assembly. | ||
27 | */ | ||
28 | |||
29 | #include <linux/linkage.h> | ||
30 | #include <asm/page.h> | ||
31 | #include <asm/errno.h> | ||
32 | #include <asm/offset.h> | ||
33 | #include <asm/thread_info.h> | ||
34 | |||
	.text
	.p2align 4
/*
 * __get_user_1: fetch one byte from the address in %rcx.
 * Fails with -EFAULT if the address is at or above the thread's
 * addr_limit; a fault on the load at 1: is routed to bad_get_user by
 * the __ex_table entry at the end of this file.
 */
.globl __get_user_1
__get_user_1:
	GET_THREAD_INFO(%r8)
	cmpq threadinfo_addr_limit(%r8),%rcx
	jae bad_get_user
1:	movzb (%rcx),%edx		/* zero-extend the byte into %edx */
	xorl %eax,%eax			/* success */
	ret
45 | |||
	.p2align 4
/*
 * __get_user_2: fetch a 16-bit value from the address in %rcx.
 * %rcx is temporarily advanced to the last byte of the access so one
 * compare covers the whole range; it is put back on every path.
 */
.globl __get_user_2
__get_user_2:
	GET_THREAD_INFO(%r8)
	addq $1,%rcx			/* address of the last byte */
	jc 20f				/* address wrapped around */
	cmpq threadinfo_addr_limit(%r8),%rcx
	jae 20f
	decq %rcx			/* back to the first byte */
2:	movzwl (%rcx),%edx
	xorl %eax,%eax
	ret
20:	decq %rcx			/* undo the adjustment before failing */
	jmp bad_get_user
60 | |||
	.p2align 4
/*
 * __get_user_4: fetch a 32-bit value from the address in %rcx.
 * Same scheme as __get_user_2, with the range check done on the last
 * byte (address + 3) and %rcx restored on every path.
 */
.globl __get_user_4
__get_user_4:
	GET_THREAD_INFO(%r8)
	addq $3,%rcx			/* address of the last byte */
	jc 30f				/* address wrapped around */
	cmpq threadinfo_addr_limit(%r8),%rcx
	jae 30f
	subq $3,%rcx			/* back to the first byte */
3:	movl (%rcx),%edx
	xorl %eax,%eax
	ret
30:	subq $3,%rcx			/* undo the adjustment before failing */
	jmp bad_get_user
75 | |||
76 | .p2align 4 | ||
77 | .globl __get_user_8 | ||
78 | __get_user_8: | ||
79 | GET_THREAD_INFO(%r8) | ||
80 | addq $7,%rcx | ||
81 | jc bad_get_user | ||
82 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
83 | jae bad_get_user | ||
84 | subq $7,%rcx | ||
85 | 4: movq (%rcx),%rdx | ||
86 | xorl %eax,%eax | ||
87 | ret | ||
88 | 40: subq $7,%rcx | ||
89 | jmp bad_get_user | ||
90 | |||
/*
 * Common failure exit for all __get_user_X variants: the value register
 * is cleared and -EFAULT is returned in %rax.
 */
bad_get_user:
	xorl %edx,%edx
	movq $(-EFAULT),%rax
	ret

/*
 * Exception table: each pair names one of the user-space loads above
 * (labels 1-4) and the address to continue at if that load faults.
 */
.section __ex_table,"a"
	.quad 1b,bad_get_user
	.quad 2b,bad_get_user
	.quad 3b,bad_get_user
	.quad 4b,bad_get_user
.previous
diff --git a/arch/x86_64/lib/io.c b/arch/x86_64/lib/io.c new file mode 100644 index 000000000000..87b4a4e18039 --- /dev/null +++ b/arch/x86_64/lib/io.c | |||
@@ -0,0 +1,23 @@ | |||
1 | #include <linux/string.h> | ||
2 | #include <asm/io.h> | ||
3 | #include <linux/module.h> | ||
4 | |||
/* Copy len bytes from normal memory at src to the address given by dst. */
void __memcpy_toio(unsigned long dst,const void*src,unsigned len)
{
	void *target = (void *) dst;

	__inline_memcpy(target,src,len);
}
9 | EXPORT_SYMBOL(__memcpy_toio); | ||
10 | |||
/* Copy len bytes from the address given by src to normal memory at dst. */
void __memcpy_fromio(void *dst,unsigned long src,unsigned len)
{
	const void *origin = (const void *) src;

	__inline_memcpy(dst,origin,len);
}
15 | EXPORT_SYMBOL(__memcpy_fromio); | ||
16 | |||
17 | void memset_io(volatile void __iomem *a, int b, size_t c) | ||
18 | { | ||
19 | /* XXX: memset can mangle the IO patterns quite a bit. | ||
20 | perhaps it would be better to use a dumb one */ | ||
21 | memset((void *)a,b,c); | ||
22 | } | ||
23 | EXPORT_SYMBOL(memset_io); | ||
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S new file mode 100644 index 000000000000..c6c46494fef5 --- /dev/null +++ b/arch/x86_64/lib/memcpy.S | |||
@@ -0,0 +1,121 @@ | |||
1 | /* Copyright 2002 Andi Kleen */ | ||
2 | |||
3 | #include <asm/cpufeature.h> | ||
4 | /* | ||
5 | * memcpy - Copy a memory block. | ||
6 | * | ||
7 | * Input: | ||
8 | * rdi destination | ||
9 | * rsi source | ||
10 | * rdx count | ||
11 | * | ||
12 | * Output: | ||
13 | * rax original destination | ||
14 | */ | ||
15 | |||
/*
 * Generic copy: 64-byte unrolled loop, then 8-byte words, then single
 * bytes. Only the low 32 bits of the count (%edx) are used.
 * In the loops the counter is decremented first; the mov/lea
 * instructions that follow leave the flags alone, so the closing jnz
 * still tests the result of that decl.
 * NOTE(review): %rbx is pushed and popped but never used in between.
 */
	.globl __memcpy
	.globl memcpy
	.p2align 4
__memcpy:
memcpy:
	pushq %rbx
	movq %rdi,%rax			/* return value: original destination */

	movl %edx,%ecx
	shrl $6,%ecx			/* number of 64-byte blocks */
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	decl %ecx

	movq (%rsi),%r11
	movq 8(%rsi),%r8

	movq %r11,(%rdi)
	movq %r8,1*8(%rdi)

	movq 2*8(%rsi),%r9
	movq 3*8(%rsi),%r10

	movq %r9,2*8(%rdi)
	movq %r10,3*8(%rdi)

	movq 4*8(%rsi),%r11
	movq 5*8(%rsi),%r8

	movq %r11,4*8(%rdi)
	movq %r8,5*8(%rdi)

	movq 6*8(%rsi),%r9
	movq 7*8(%rsi),%r10

	movq %r9,6*8(%rdi)
	movq %r10,7*8(%rdi)

	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

.Lhandle_tail:
	movl %edx,%ecx
	andl $63,%ecx
	shrl $3,%ecx			/* remaining whole 8-byte words */
	jz .Lhandle_7
	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi),%r8
	movq %r8,(%rdi)
	leaq 8(%rdi),%rdi
	leaq 8(%rsi),%rsi
	jnz .Lloop_8

.Lhandle_7:
	movl %edx,%ecx
	andl $7,%ecx			/* remaining single bytes */
	jz .Lende
	.p2align 4
.Lloop_1:
	movb (%rsi),%r8b
	movb %r8b,(%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lende:
	popq %rbx
	ret
.Lfinal:				/* end marker used for the size in .altinstructions */
91 | |||
	/* C stepping K8 run faster using the string copy instructions.
	   It is also a lot simpler. Use this when possible */

	/*
	 * One alternatives record: address of the generic code, address of
	 * the replacement, the CPU feature that selects it, the size of the
	 * generic code (memcpy up to .Lfinal) and the size of the replacement.
	 * (Field meaning presumably follows struct alt_instr -- confirm.)
	 */
	.section .altinstructions,"a"
	.align 8
	.quad memcpy
	.quad memcpy_c
	.byte X86_FEATURE_K8_C
	.byte .Lfinal-memcpy
	.byte memcpy_c_end-memcpy_c
	.previous
103 | |||
	.section .altinstr_replacement,"ax"
/*
 * String-instruction variant of memcpy.
 * rdi destination
 * rsi source
 * rdx count
 *
 * Copies count/8 quadwords with rep movsq, then the count%8 leftover
 * bytes with rep movsb. Returns the original destination in rax.
 * Only the low 32 bits of the count are used.
 */
memcpy_c:
	movq %rdi,%rax
	movl %edx,%ecx
	shrl $3,%ecx			/* quadword count */
	andl $7,%edx			/* leftover byte count */
	rep
	movsq
	movl %edx,%ecx
	rep
	movsb
	ret
memcpy_c_end:
	.previous
diff --git a/arch/x86_64/lib/memmove.c b/arch/x86_64/lib/memmove.c new file mode 100644 index 000000000000..e93d5255fdc9 --- /dev/null +++ b/arch/x86_64/lib/memmove.c | |||
@@ -0,0 +1,19 @@ | |||
/* Out-of-line memmove for the cases where the compiler does not use its
   builtin. Based on asm-i386/string.h.
 */
#define _STRING_C
#include <linux/string.h>

#undef memmove
/*
 * Copy count bytes from src to dest; the regions may overlap.
 * When dest lies below src a forward copy is used, otherwise the bytes
 * are copied one at a time from the end towards the start.
 * Returns dest.
 */
void *memmove(void * dest,const void *src,size_t count)
{
	char *to;
	char *from;

	if (dest < src) {
		__inline_memcpy(dest,src,count);
		return dest;
	}

	to = (char *) dest + count;
	from = (char *) src + count;
	for (; count != 0; count--)
		*--to = *--from;
	return dest;
}
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S new file mode 100644 index 000000000000..4b4c40638640 --- /dev/null +++ b/arch/x86_64/lib/memset.S | |||
@@ -0,0 +1,125 @@ | |||
1 | /* Copyright 2002 Andi Kleen, SuSE Labs */ | ||
2 | /* | ||
3 | * ISO C memset - set a memory block to a byte value. | ||
4 | * | ||
5 | * rdi destination | ||
6 | * rsi value (char) | ||
7 | * rdx count (bytes) | ||
8 | * | ||
9 | * rax original destination | ||
10 | */ | ||
/*
 * Generic memset: the fill byte is replicated into all eight bytes of
 * %rax, the destination is brought to 8-byte alignment, and the area
 * is then filled in 64-byte blocks, 8-byte words and single bytes.
 * %r10 keeps the original destination (return value) and %r11 the count,
 * because the mul below overwrites %rdx.
 * The block/word loops use only the low 32 bits of the count (%r11d).
 */
	.globl __memset
	.globl memset
	.p2align 4
memset:
__memset:
	movq %rdi,%r10
	movq %rdx,%r11

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	mul %rcx		/* with rax, clobbers rdx */

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d		/* misalignment of dst, 0..7 */
	jnz .Lbad_alignment
.Lafter_bad_alignment:

	movl %r11d,%ecx
	shrl $6,%ecx		/* number of 64-byte blocks */
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	decl %ecx		/* flags survive the stores and lea below */
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/* Handle tail in loops. The loops should be faster than hard
	   to predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl %r11d,%ecx
	andl $63&(~7),%ecx	/* bytes in whole 8-byte words below 64 */
	jz .Lhandle_7
	shrl $3,%ecx
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	movl %r11d,%ecx
	andl $7,%ecx		/* remaining single bytes */
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %ecx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax		/* return the original destination */
	ret

.Lbad_alignment:
	cmpq $7,%r11
	jbe .Lhandle_7		/* 7 bytes or fewer: byte loop only */
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8		/* bytes needed to reach alignment */
	addq %r8,%rdi
	subq %r8,%r11
	jmp .Lafter_bad_alignment
87 | |||
	/* C stepping K8 run faster using the string instructions.
	   It is also a lot simpler. Use this when possible */

#include <asm/cpufeature.h>

	/*
	 * One alternatives record: address of the generic code, address of
	 * the replacement, the selecting CPU feature, then two size bytes.
	 * NOTE(review): both size bytes are the size of memset_c here,
	 * whereas memcpy.S gives the size of the generic routine first --
	 * confirm this is intended.
	 */
	.section .altinstructions,"a"
	.align 8
	.quad memset
	.quad memset_c
	.byte X86_FEATURE_K8_C
	.byte memset_c_end-memset_c
	.byte memset_c_end-memset_c
	.previous
101 | |||
	.section .altinstr_replacement,"ax"
/*
 * String-instruction variant of memset.
 * rdi destination
 * rsi value
 * rdx count
 *
 * Stores count/8 quadwords with rep stosq, then the count%8 leftover
 * bytes with rep stosb. Returns the original destination in rax.
 * Only the low 32 bits of the count are used.
 */
memset_c:
	movq %rdi,%r9		/* keep the destination for the return value */
	movl %edx,%r8d
	andl $7,%r8d		/* leftover byte count */
	movl %edx,%ecx
	shrl $3,%ecx		/* quadword count */
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	mulq %rsi		/* with rax, clobbers rdx */
	rep
	stosq
	movl %r8d,%ecx
	rep
	stosb
	movq %r9,%rax
	ret
memset_c_end:
	.previous
diff --git a/arch/x86_64/lib/putuser.S b/arch/x86_64/lib/putuser.S new file mode 100644 index 000000000000..0dee1fdcb162 --- /dev/null +++ b/arch/x86_64/lib/putuser.S | |||
@@ -0,0 +1,89 @@ | |||
1 | /* | ||
2 | * __put_user functions. | ||
3 | * | ||
4 | * (C) Copyright 1998 Linus Torvalds | ||
5 | * (C) Copyright 2005 Andi Kleen | ||
6 | * | ||
7 | * These functions have a non-standard call interface | ||
8 | * to make them more efficient, especially as they | ||
9 | * return an error value in addition to the "real" | ||
10 | * return value. | ||
11 | */ | ||
12 | |||
13 | /* | ||
14 | * __put_user_X | ||
15 | * | ||
16 | * Inputs: %rcx contains the address | ||
17 | * %rdx contains new value | ||
18 | * | ||
19 | * Outputs: %rax is error code (0 or -EFAULT) | ||
20 | * | ||
21 | * %r8 is destroyed. | ||
22 | * | ||
23 | * These functions should not modify any other registers, | ||
24 | * as they get called from within inline assembly. | ||
25 | */ | ||
26 | |||
27 | #include <linux/linkage.h> | ||
28 | #include <asm/page.h> | ||
29 | #include <asm/errno.h> | ||
30 | #include <asm/offset.h> | ||
31 | #include <asm/thread_info.h> | ||
32 | |||
	.text
	.p2align 4
/*
 * __put_user_1: store the low byte of %rdx at the address in %rcx.
 * Fails with -EFAULT if the address is at or above the thread's
 * addr_limit; a fault on the store at 1: is routed to bad_put_user by
 * the __ex_table entry at the end of this file.
 */
.globl __put_user_1
__put_user_1:
	GET_THREAD_INFO(%r8)
	cmpq threadinfo_addr_limit(%r8),%rcx
	jae bad_put_user
1:	movb %dl,(%rcx)
	xorl %eax,%eax			/* success */
	ret
43 | |||
44 | .p2align 4 | ||
45 | .globl __put_user_2 | ||
46 | __put_user_2: | ||
47 | GET_THREAD_INFO(%r8) | ||
48 | addq $1,%rcx | ||
49 | jc bad_put_user | ||
50 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
51 | jae bad_put_user | ||
52 | 2: movw %dx,-1(%rcx) | ||
53 | xorl %eax,%eax | ||
54 | ret | ||
55 | |||
56 | .p2align 4 | ||
57 | .globl __put_user_4 | ||
58 | __put_user_4: | ||
59 | GET_THREAD_INFO(%r8) | ||
60 | addq $3,%rcx | ||
61 | jc bad_put_user | ||
62 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
63 | jae bad_put_user | ||
64 | 3: movl %edx,-3(%rcx) | ||
65 | xorl %eax,%eax | ||
66 | ret | ||
67 | |||
68 | .p2align 4 | ||
69 | .globl __put_user_8 | ||
70 | __put_user_8: | ||
71 | GET_THREAD_INFO(%r8) | ||
72 | addq $7,%rcx | ||
73 | jc bad_put_user | ||
74 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
75 | jae bad_put_user | ||
76 | 4: movq %rdx,-7(%rcx) | ||
77 | xorl %eax,%eax | ||
78 | ret | ||
79 | |||
/* Common failure exit for all __put_user_X variants: return -EFAULT. */
bad_put_user:
	movq $(-EFAULT),%rax
	ret

/*
 * Exception table: each pair names one of the user-space stores above
 * (labels 1-4) and the address to continue at if that store faults.
 */
.section __ex_table,"a"
	.quad 1b,bad_put_user
	.quad 2b,bad_put_user
	.quad 3b,bad_put_user
	.quad 4b,bad_put_user
.previous
diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S new file mode 100644 index 000000000000..acc1e2ca7ed7 --- /dev/null +++ b/arch/x86_64/lib/thunk.S | |||
@@ -0,0 +1,95 @@ | |||
1 | /* | ||
2 | * Save registers before calling assembly functions. This avoids | ||
3 | * disturbance of register allocation in some inline assembly constructs. | ||
4 | * Copyright 2001,2002 by Andi Kleen, SuSE Labs. | ||
5 | * Subject to the GNU public license, v.2. No warranty of any kind. | ||
6 | * $Id: thunk.S,v 1.2 2002/03/13 20:06:58 ak Exp $ | ||
7 | */ | ||
8 | |||
9 | #include <linux/config.h> | ||
10 | #include <linux/linkage.h> | ||
11 | #include <asm/dwarf2.h> | ||
12 | #include <asm/calling.h> | ||
13 | #include <asm/rwlock.h> | ||
14 | |||
	/*
	 * thunk name,func: emit a global entry point "name" that saves the
	 * argument registers (SAVE_ARGS), calls "func" and leaves through
	 * the shared "restore" tail further down in this file.
	 * rdi: arg1 ... normal C conventions. rax is saved/restored.
	 */
	.macro thunk name,func
	.globl \name
\name:
	CFI_STARTPROC
	SAVE_ARGS
	call \func
	jmp  restore
	CFI_ENDPROC
	.endm
25 | |||
	/*
	 * thunk_retrax name,func: same as thunk, but leaves through
	 * "restore_norax" so that the value func left in rax reaches the
	 * caller.
	 * rdi: arg1 ... normal C conventions. rax is passed from C.
	 */
	.macro thunk_retrax name,func
	.globl \name
\name:
	CFI_STARTPROC
	SAVE_ARGS
	call \func
	jmp  restore_norax
	CFI_ENDPROC
	.endm
36 | |||
37 | |||
	/* All thunks below are emitted into .sched.text. */
	.section .sched.text
#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
	/* rw-semaphore slow paths */
	thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
	thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
	thunk rwsem_wake_thunk,rwsem_wake
	thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
#endif
	thunk do_softirq_thunk,do_softirq

	/* semaphore slow paths; the _retrax ones hand back func's rax */
	thunk __down_failed,__down
	thunk_retrax __down_failed_interruptible,__down_interruptible
	thunk_retrax __down_failed_trylock,__down_trylock
	thunk __up_wakeup,__up
51 | |||
	/*
	 * Shared exit for the "thunk" macro: restore what SAVE_ARGS saved
	 * and return to the thunk's caller.
	 * SAVE_ARGS below is used only for the .cfi directives it contains.
	 */
	CFI_STARTPROC
	SAVE_ARGS
restore:
	RESTORE_ARGS
	ret
	CFI_ENDPROC
59 | |||
	/*
	 * Shared exit for the "thunk_retrax" macro.
	 * RESTORE_ARGS is invoked with argument 1 here -- presumably that
	 * skips reloading rax so the callee's return value survives; confirm
	 * against asm/calling.h.
	 */
	CFI_STARTPROC
	SAVE_ARGS
restore_norax:
	RESTORE_ARGS 1
	ret
	CFI_ENDPROC
66 | |||
67 | #ifdef CONFIG_SMP | ||
68 | /* Support for read/write spinlocks. */ | ||
69 | .text | ||
/*
 * Slow path of a write lock attempt.
 * rax: pointer to rwlock_t
 *
 * Adds RW_LOCK_BIAS back to the lock word (presumably undoing the
 * caller's failed subtraction -- confirm against the inline fast path),
 * spins until the word reads exactly RW_LOCK_BIAS, then subtracts the
 * bias again and starts over if the result is not zero.
 */
ENTRY(__write_lock_failed)
	lock
	addl $RW_LOCK_BIAS,(%rax)
1:	rep
	nop
	cmpl $RW_LOCK_BIAS,(%rax)
	jne 1b
	lock
	subl $RW_LOCK_BIAS,(%rax)
	jnz  __write_lock_failed
	ret
82 | |||
/*
 * Slow path of a read lock attempt.
 * rax: pointer to rwlock_t
 *
 * Increments the lock word (presumably undoing the caller's failed
 * decrement -- confirm against the inline fast path), spins while
 * (word - 1) is negative, then decrements again and starts over if the
 * result is negative.
 */
ENTRY(__read_lock_failed)
	lock
	incl (%rax)
1:	rep
	nop
	cmpl $1,(%rax)
	js 1b
	lock
	decl (%rax)
	js __read_lock_failed
	ret
95 | #endif | ||
diff --git a/arch/x86_64/lib/usercopy.c b/arch/x86_64/lib/usercopy.c new file mode 100644 index 000000000000..db8abba1ad81 --- /dev/null +++ b/arch/x86_64/lib/usercopy.c | |||
@@ -0,0 +1,153 @@ | |||
1 | /* | ||
2 | * User address space access functions. | ||
3 | * | ||
4 | * Copyright 1997 Andi Kleen <ak@muc.de> | ||
5 | * Copyright 1997 Linus Torvalds | ||
6 | * Copyright 2002 Andi Kleen <ak@suse.de> | ||
7 | */ | ||
8 | #include <asm/uaccess.h> | ||
9 | |||
/*
 * Copy a null terminated string from userspace.
 *
 * __do_strncpy_from_user(dst, src, count, res):
 *   Copies bytes from src to dst with lodsb/stosb until a NUL byte has
 *   been copied or count bytes have been copied.
 *   res is set to the number of bytes copied, not counting the NUL;
 *   it equals the original count when no NUL was found in time, and 0
 *   when count is 0.
 *   If the load from src faults, the exception table entry redirects to
 *   the fixup code, which sets res to -EFAULT.
 *   Note that count is an output operand as well: the caller's variable
 *   is overwritten with the remaining count.
 */

#define __do_strncpy_from_user(dst,src,count,res)			   \
do {									   \
	long __d0, __d1, __d2;						   \
	might_sleep();							   \
	__asm__ __volatile__(						   \
		" testq %1,%1\n"					   \
		" jz 2f\n"						   \
		"0: lodsb\n"						   \
		" stosb\n"						   \
		" testb %%al,%%al\n"					   \
		" jz 1f\n"						   \
		" decq %1\n"						   \
		" jnz 0b\n"						   \
		"1: subq %1,%0\n"					   \
		"2:\n"							   \
		".section .fixup,\"ax\"\n"				   \
		"3: movq %5,%0\n"					   \
		" jmp 2b\n"						   \
		".previous\n"						   \
		".section __ex_table,\"a\"\n"				   \
		" .align 8\n"						   \
		" .quad 0b,3b\n"					   \
		".previous"						   \
		: "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1),	   \
		  "=&D" (__d2)						   \
		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
		: "memory");						   \
} while (0)
42 | |||
43 | long | ||
44 | __strncpy_from_user(char *dst, const char __user *src, long count) | ||
45 | { | ||
46 | long res; | ||
47 | __do_strncpy_from_user(dst, src, count, res); | ||
48 | return res; | ||
49 | } | ||
50 | |||
51 | long | ||
52 | strncpy_from_user(char *dst, const char __user *src, long count) | ||
53 | { | ||
54 | long res = -EFAULT; | ||
55 | if (access_ok(VERIFY_READ, src, 1)) | ||
56 | __do_strncpy_from_user(dst, src, count, res); | ||
57 | return res; | ||
58 | } | ||
59 | |||
60 | /* | ||
61 | * Zero Userspace | ||
62 | */ | ||
63 | |||
/*
 * Zero "size" bytes of user memory at addr without checking the range.
 *
 * The area is cleared in 8-byte stores followed by size & 7 single-byte
 * stores. Returns the number of bytes that could NOT be cleared: 0 on
 * success, or the remaining byte count if a store faulted.
 *
 * NOTE(review): both loops count down with "decl %ecx", i.e. a 32-bit
 * counter -- sizes of 32 GB and more would not be handled; confirm that
 * callers never pass such sizes.
 */
unsigned long __clear_user(void __user *addr, unsigned long size)
{
	long __d0;
	might_sleep();
	/* no memory constraint because it doesn't change any memory gcc knows
	   about */
	asm volatile(
		/* rcx = size / 8 on entry; skip the quadword loop if zero */
		" testq %[size8],%[size8]\n"
		" jz 4f\n"
		"0: movq %[zero],(%[dst])\n"
		" addq %[eight],%[dst]\n"
		" decl %%ecx ; jnz 0b\n"
		/* now the size & 7 trailing bytes */
		"4: movq %[size1],%%rcx\n"
		" testl %%ecx,%%ecx\n"
		" jz 2f\n"
		"1: movb %b[zero],(%[dst])\n"
		" incq %[dst]\n"
		" decl %%ecx ; jnz 1b\n"
		"2:\n"
		".section .fixup,\"ax\"\n"
		/* fault in the quadword loop: remaining = size1 + 8 * rcx */
		"3: lea 0(%[size1],%[size8],8),%[size8]\n"
		" jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		" .align 8\n"
		" .quad 0b,3b\n"
		/* fault in the byte loop: rcx already holds the remainder */
		" .quad 1b,2b\n"
		".previous"
		: [size8] "=c"(size), [dst] "=&D" (__d0)
		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
		  [zero] "r" (0UL), [eight] "r" (8UL));
	return size;
}
97 | |||
98 | |||
99 | unsigned long clear_user(void __user *to, unsigned long n) | ||
100 | { | ||
101 | if (access_ok(VERIFY_WRITE, to, n)) | ||
102 | return __clear_user(to, n); | ||
103 | return n; | ||
104 | } | ||
105 | |||
106 | /* | ||
107 | * Return the size of a string (including the ending 0) | ||
108 | * | ||
109 | * Return 0 on exception, a value greater than N if too long | ||
110 | */ | ||
111 | |||
112 | long strnlen_user(const char __user *s, long n) | ||
113 | { | ||
114 | long res = 0; | ||
115 | char c; | ||
116 | |||
117 | if (!access_ok(VERIFY_READ, s, n)) | ||
118 | return 0; | ||
119 | |||
120 | while (1) { | ||
121 | if (res>n) | ||
122 | return n+1; | ||
123 | if (__get_user(c, s)) | ||
124 | return 0; | ||
125 | if (!c) | ||
126 | return res+1; | ||
127 | res++; | ||
128 | s++; | ||
129 | } | ||
130 | } | ||
131 | |||
132 | long strlen_user(const char __user *s) | ||
133 | { | ||
134 | long res = 0; | ||
135 | char c; | ||
136 | |||
137 | for (;;) { | ||
138 | if (get_user(c, s)) | ||
139 | return 0; | ||
140 | if (!c) | ||
141 | return res+1; | ||
142 | res++; | ||
143 | s++; | ||
144 | } | ||
145 | } | ||
146 | |||
147 | unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len) | ||
148 | { | ||
149 | if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { | ||
150 | return copy_user_generic((__force void *)to, (__force void *)from, len); | ||
151 | } | ||
152 | return len; | ||
153 | } | ||
diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile new file mode 100644 index 000000000000..66c354ad80ca --- /dev/null +++ b/arch/x86_64/mm/Makefile | |||
@@ -0,0 +1,11 @@ | |||
#
# Makefile for the linux x86_64-specific parts of the memory manager.
#

# Objects listed unconditionally.
obj-y := init.o fault.o ioremap.o extable.o pageattr.o
# Objects added only when the named CONFIG_ symbol expands to y (or m).
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_DISCONTIGMEM) += numa.o
obj-$(CONFIG_K8_NUMA) += k8topology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o

# hugetlbpage.o is put together from the i386 tree's object at this relative
# path (presumably to share the i386 implementation -- confirm).
hugetlbpage-y = ../../i386/mm/hugetlbpage.o
diff --git a/arch/x86_64/mm/extable.c b/arch/x86_64/mm/extable.c new file mode 100644 index 000000000000..2d78f9fb4035 --- /dev/null +++ b/arch/x86_64/mm/extable.c | |||
@@ -0,0 +1,35 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/mm/extable.c | ||
3 | */ | ||
4 | |||
5 | #include <linux/config.h> | ||
6 | #include <linux/module.h> | ||
7 | #include <linux/spinlock.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <asm/uaccess.h> | ||
10 | |||
11 | /* Simple binary search */ | ||
12 | const struct exception_table_entry * | ||
13 | search_extable(const struct exception_table_entry *first, | ||
14 | const struct exception_table_entry *last, | ||
15 | unsigned long value) | ||
16 | { | ||
17 | /* Work around a B stepping K8 bug */ | ||
18 | if ((value >> 32) == 0) | ||
19 | value |= 0xffffffffUL << 32; | ||
20 | |||
21 | while (first <= last) { | ||
22 | const struct exception_table_entry *mid; | ||
23 | long diff; | ||
24 | |||
25 | mid = (last - first) / 2 + first; | ||
26 | diff = mid->insn - value; | ||
27 | if (diff == 0) | ||
28 | return mid; | ||
29 | else if (diff < 0) | ||
30 | first = mid+1; | ||
31 | else | ||
32 | last = mid-1; | ||
33 | } | ||
34 | return NULL; | ||
35 | } | ||
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c new file mode 100644 index 000000000000..5724370475cc --- /dev/null +++ b/arch/x86_64/mm/fault.c | |||
@@ -0,0 +1,579 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/mm/fault.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. | ||
6 | */ | ||
7 | |||
8 | #include <linux/config.h> | ||
9 | #include <linux/signal.h> | ||
10 | #include <linux/sched.h> | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/errno.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/ptrace.h> | ||
16 | #include <linux/mman.h> | ||
17 | #include <linux/mm.h> | ||
18 | #include <linux/smp.h> | ||
19 | #include <linux/smp_lock.h> | ||
20 | #include <linux/interrupt.h> | ||
21 | #include <linux/init.h> | ||
22 | #include <linux/tty.h> | ||
23 | #include <linux/vt_kern.h> /* For unblank_screen() */ | ||
24 | #include <linux/compiler.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <linux/kprobes.h> | ||
27 | |||
28 | #include <asm/system.h> | ||
29 | #include <asm/uaccess.h> | ||
30 | #include <asm/pgalloc.h> | ||
31 | #include <asm/smp.h> | ||
32 | #include <asm/tlbflush.h> | ||
33 | #include <asm/proto.h> | ||
34 | #include <asm/kdebug.h> | ||
35 | #include <asm-generic/sections.h> | ||
36 | #include <asm/kdebug.h> | ||
37 | |||
38 | void bust_spinlocks(int yes) | ||
39 | { | ||
40 | int loglevel_save = console_loglevel; | ||
41 | if (yes) { | ||
42 | oops_in_progress = 1; | ||
43 | } else { | ||
44 | #ifdef CONFIG_VT | ||
45 | unblank_screen(); | ||
46 | #endif | ||
47 | oops_in_progress = 0; | ||
48 | /* | ||
49 | * OK, the message is on the console. Now we call printk() | ||
50 | * without oops_in_progress set so that printk will give klogd | ||
51 | * a poke. Hold onto your hats... | ||
52 | */ | ||
53 | console_loglevel = 15; /* NMI oopser may have shut the console up */ | ||
54 | printk(" "); | ||
55 | console_loglevel = loglevel_save; | ||
56 | } | ||
57 | } | ||
58 | |||
59 | /* Sometimes the CPU reports invalid exceptions on prefetch. | ||
60 | Check that here and ignore. | ||
61 | Opcode checker based on code by Richard Brunner */ | ||
62 | static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, | ||
63 | unsigned long error_code) | ||
64 | { | ||
65 | unsigned char *instr = (unsigned char *)(regs->rip); | ||
66 | int scan_more = 1; | ||
67 | int prefetch = 0; | ||
68 | unsigned char *max_instr = instr + 15; | ||
69 | |||
70 | /* If it was a exec fault ignore */ | ||
71 | if (error_code & (1<<4)) | ||
72 | return 0; | ||
73 | |||
74 | /* Code segments in LDT could have a non zero base. Don't check | ||
75 | when that's possible */ | ||
76 | if (regs->cs & (1<<2)) | ||
77 | return 0; | ||
78 | |||
79 | if ((regs->cs & 3) != 0 && regs->rip >= TASK_SIZE) | ||
80 | return 0; | ||
81 | |||
82 | while (scan_more && instr < max_instr) { | ||
83 | unsigned char opcode; | ||
84 | unsigned char instr_hi; | ||
85 | unsigned char instr_lo; | ||
86 | |||
87 | if (__get_user(opcode, instr)) | ||
88 | break; | ||
89 | |||
90 | instr_hi = opcode & 0xf0; | ||
91 | instr_lo = opcode & 0x0f; | ||
92 | instr++; | ||
93 | |||
94 | switch (instr_hi) { | ||
95 | case 0x20: | ||
96 | case 0x30: | ||
97 | /* Values 0x26,0x2E,0x36,0x3E are valid x86 | ||
98 | prefixes. In long mode, the CPU will signal | ||
99 | invalid opcode if some of these prefixes are | ||
100 | present so we will never get here anyway */ | ||
101 | scan_more = ((instr_lo & 7) == 0x6); | ||
102 | break; | ||
103 | |||
104 | case 0x40: | ||
105 | /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes | ||
106 | Need to figure out under what instruction mode the | ||
107 | instruction was issued ... */ | ||
108 | /* Could check the LDT for lm, but for now it's good | ||
109 | enough to assume that long mode only uses well known | ||
110 | segments or kernel. */ | ||
111 | scan_more = ((regs->cs & 3) == 0) || (regs->cs == __USER_CS); | ||
112 | break; | ||
113 | |||
114 | case 0x60: | ||
115 | /* 0x64 thru 0x67 are valid prefixes in all modes. */ | ||
116 | scan_more = (instr_lo & 0xC) == 0x4; | ||
117 | break; | ||
118 | case 0xF0: | ||
119 | /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */ | ||
120 | scan_more = !instr_lo || (instr_lo>>1) == 1; | ||
121 | break; | ||
122 | case 0x00: | ||
123 | /* Prefetch instruction is 0x0F0D or 0x0F18 */ | ||
124 | scan_more = 0; | ||
125 | if (__get_user(opcode, instr)) | ||
126 | break; | ||
127 | prefetch = (instr_lo == 0xF) && | ||
128 | (opcode == 0x0D || opcode == 0x18); | ||
129 | break; | ||
130 | default: | ||
131 | scan_more = 0; | ||
132 | break; | ||
133 | } | ||
134 | } | ||
135 | return prefetch; | ||
136 | } | ||
137 | |||
/*
 * Try to read one unsigned long at p with __get_user() and return its
 * status, so a non-zero result means the word could not be read.
 */
static int bad_address(void *p)
{
	unsigned long *word = (unsigned long *)p;
	unsigned long probe;

	return __get_user(probe, word);
}
143 | |||
144 | void dump_pagetable(unsigned long address) | ||
145 | { | ||
146 | pgd_t *pgd; | ||
147 | pud_t *pud; | ||
148 | pmd_t *pmd; | ||
149 | pte_t *pte; | ||
150 | |||
151 | asm("movq %%cr3,%0" : "=r" (pgd)); | ||
152 | |||
153 | pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); | ||
154 | pgd += pgd_index(address); | ||
155 | printk("PGD %lx ", pgd_val(*pgd)); | ||
156 | if (bad_address(pgd)) goto bad; | ||
157 | if (!pgd_present(*pgd)) goto ret; | ||
158 | |||
159 | pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address); | ||
160 | if (bad_address(pud)) goto bad; | ||
161 | printk("PUD %lx ", pud_val(*pud)); | ||
162 | if (!pud_present(*pud)) goto ret; | ||
163 | |||
164 | pmd = pmd_offset(pud, address); | ||
165 | if (bad_address(pmd)) goto bad; | ||
166 | printk("PMD %lx ", pmd_val(*pmd)); | ||
167 | if (!pmd_present(*pmd)) goto ret; | ||
168 | |||
169 | pte = pte_offset_kernel(pmd, address); | ||
170 | if (bad_address(pte)) goto bad; | ||
171 | printk("PTE %lx", pte_val(*pte)); | ||
172 | ret: | ||
173 | printk("\n"); | ||
174 | return; | ||
175 | bad: | ||
176 | printk("BAD\n"); | ||
177 | } | ||
178 | |||
179 | static const char errata93_warning[] = | ||
180 | KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" | ||
181 | KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" | ||
182 | KERN_ERR "******* Please consider a BIOS update.\n" | ||
183 | KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; | ||
184 | |||
185 | /* Workaround for K8 erratum #93 & buggy BIOS. | ||
186 | BIOS SMM functions are required to use a specific workaround | ||
187 | to avoid corruption of the 64bit RIP register on C stepping K8. | ||
188 | A lot of BIOS that didn't get tested properly miss this. | ||
189 | The OS sees this as a page fault with the upper 32bits of RIP cleared. | ||
190 | Try to work around it here. | ||
191 | Note we only handle faults in kernel here. */ | ||
192 | |||
193 | static int is_errata93(struct pt_regs *regs, unsigned long address) | ||
194 | { | ||
195 | static int warned; | ||
196 | if (address != regs->rip) | ||
197 | return 0; | ||
198 | if ((address >> 32) != 0) | ||
199 | return 0; | ||
200 | address |= 0xffffffffUL << 32; | ||
201 | if ((address >= (u64)_stext && address <= (u64)_etext) || | ||
202 | (address >= MODULES_VADDR && address <= MODULES_END)) { | ||
203 | if (!warned) { | ||
204 | printk(errata93_warning); | ||
205 | warned = 1; | ||
206 | } | ||
207 | regs->rip = address; | ||
208 | return 1; | ||
209 | } | ||
210 | return 0; | ||
211 | } | ||
212 | |||
213 | int unhandled_signal(struct task_struct *tsk, int sig) | ||
214 | { | ||
215 | if (tsk->pid == 1) | ||
216 | return 1; | ||
217 | /* Warn for strace, but not for gdb */ | ||
218 | if (!test_ti_thread_flag(tsk->thread_info, TIF_SYSCALL_TRACE) && | ||
219 | (tsk->ptrace & PT_PTRACED)) | ||
220 | return 0; | ||
221 | return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || | ||
222 | (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); | ||
223 | } | ||
224 | |||
225 | static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, | ||
226 | unsigned long error_code) | ||
227 | { | ||
228 | oops_begin(); | ||
229 | printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", | ||
230 | current->comm, address); | ||
231 | dump_pagetable(address); | ||
232 | __die("Bad pagetable", regs, error_code); | ||
233 | oops_end(); | ||
234 | do_exit(SIGKILL); | ||
235 | } | ||
236 | |||
237 | /* | ||
238 | * Handle a fault on the vmalloc or module mapping area | ||
239 | */ | ||
240 | static int vmalloc_fault(unsigned long address) | ||
241 | { | ||
242 | pgd_t *pgd, *pgd_ref; | ||
243 | pud_t *pud, *pud_ref; | ||
244 | pmd_t *pmd, *pmd_ref; | ||
245 | pte_t *pte, *pte_ref; | ||
246 | |||
247 | /* Copy kernel mappings over when needed. This can also | ||
248 | happen within a race in page table update. In the later | ||
249 | case just flush. */ | ||
250 | |||
251 | pgd = pgd_offset(current->mm ?: &init_mm, address); | ||
252 | pgd_ref = pgd_offset_k(address); | ||
253 | if (pgd_none(*pgd_ref)) | ||
254 | return -1; | ||
255 | if (pgd_none(*pgd)) | ||
256 | set_pgd(pgd, *pgd_ref); | ||
257 | |||
258 | /* Below here mismatches are bugs because these lower tables | ||
259 | are shared */ | ||
260 | |||
261 | pud = pud_offset(pgd, address); | ||
262 | pud_ref = pud_offset(pgd_ref, address); | ||
263 | if (pud_none(*pud_ref)) | ||
264 | return -1; | ||
265 | if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref)) | ||
266 | BUG(); | ||
267 | pmd = pmd_offset(pud, address); | ||
268 | pmd_ref = pmd_offset(pud_ref, address); | ||
269 | if (pmd_none(*pmd_ref)) | ||
270 | return -1; | ||
271 | if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) | ||
272 | BUG(); | ||
273 | pte_ref = pte_offset_kernel(pmd_ref, address); | ||
274 | if (!pte_present(*pte_ref)) | ||
275 | return -1; | ||
276 | pte = pte_offset_kernel(pmd, address); | ||
277 | if (!pte_present(*pte) || pte_page(*pte) != pte_page(*pte_ref)) | ||
278 | BUG(); | ||
279 | __flush_tlb_all(); | ||
280 | return 0; | ||
281 | } | ||
282 | |||
/* When non-zero, do_page_fault() printk()s one trace line for every fault. */
int page_fault_trace = 0;
/* When non-zero, do_page_fault() logs "segfault at ..." for user faults
   that unhandled_signal() reports as unhandled. */
int exception_trace = 1;
285 | |||
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * error_code:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means a corrupted page table was detected
 *		   (handled below by pgtable_bad())
 *	bit 4 == 1 means fault was an instruction fetch
 *		   (tested by is_prefetch())
 */
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct * vma;
	unsigned long address;
	const struct exception_table_entry *fixup;
	int write;
	siginfo_t info;

#ifdef CONFIG_CHECKING
	{
		/* Sanity check: GS base must point at this CPU's PDA;
		   repair it and complain if it does not. */
		unsigned long gs;
		struct x8664_pda *pda = cpu_pda + stack_smp_processor_id();
		rdmsrl(MSR_GS_BASE, gs);
		if (gs != (unsigned long)pda) {
			wrmsrl(MSR_GS_BASE, pda);
			printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
		}
	}
#endif

	/* get the address */
	__asm__("movq %%cr2,%0":"=r" (address));
	/* Give registered die notifiers the chance to claim the fault. */
	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
					SIGSEGV) == NOTIFY_STOP)
		return;

	/* Re-enable interrupts only if the faulting context had them on. */
	if (likely(regs->eflags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(page_fault_trace))
		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
		       regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);

	tsk = current;
	mm = tsk->mm;
	/* Default si_code; switched to SEGV_ACCERR once a vma is found. */
	info.si_code = SEGV_MAPERR;


	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 1) == 0.
	 */
	if (unlikely(address >= TASK_SIZE)) {
		if (!(error_code & 5)) {
			if (vmalloc_fault(address) < 0)
				goto bad_area_nosemaphore;
			return;
		}
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	/* Bit 3 set: treated as a corrupted page table. */
	if (unlikely(error_code & (1 << 3)))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

 again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & 4) == 0 &&
		    !search_exception_tables(regs->rip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	/* Address lies below the vma: only acceptable for a growing stack. */
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & 4) {
		// XXX: align red zone size with ABI
		if (address + 128 < regs->rsp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & 3) {
		default:	/* 3: write, present */
			/* fall through */
		case 2:		/* write, not present */
			if (!(vma->vm_flags & VM_WRITE))
				goto bad_area;
			write++;
			break;
		case 1:		/* read, present */
			goto bad_area;
		case 0:		/* read, not present */
			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
				goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case 1:
		tsk->min_flt++;
		break;
	case 2:
		tsk->maj_flt++;
		break;
	case 0:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:

#ifdef CONFIG_IA32_EMULATION
	/* 32bit vsyscall. map on demand. */
	if (test_thread_flag(TIF_IA32) &&
	    address >= VSYSCALL32_BASE && address < VSYSCALL32_END) {
		if (map_syscall32(mm, address) < 0)
			goto out_of_memory2;
		return;
	}
#endif

	/* User mode accesses just cause a SIGSEGV */
	if (error_code & 4) {
		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100 K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable. Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
			printk(
		       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
					tsk->comm, tsk->pid, address, regs->rip,
					regs->rsp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}

no_context:

	/* Are we prepared to handle this kernel fault?  */
	fixup = search_exception_tables(regs->rip);
	if (fixup) {
		/* Resume at the fixup address recorded for this instruction. */
		regs->rip = fixup->fixup;
		return;
	}

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */

	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

	oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT,address);
	printk_address(regs->rip);
	printk("\n");
	dump_pagetable(address);
	__die("Oops", regs, error_code);
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end();
	do_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
out_of_memory2:
	/* pid 1 is never killed here: yield and retry the fault instead. */
	if (current->pid == 1) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & 4)
		do_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & 4))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
	return;
}
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c new file mode 100644 index 000000000000..b0d604551d86 --- /dev/null +++ b/arch/x86_64/mm/init.c | |||
@@ -0,0 +1,630 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/mm/init.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | ||
6 | * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> | ||
7 | */ | ||
8 | |||
9 | #include <linux/config.h> | ||
10 | #include <linux/signal.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/errno.h> | ||
14 | #include <linux/string.h> | ||
15 | #include <linux/types.h> | ||
16 | #include <linux/ptrace.h> | ||
17 | #include <linux/mman.h> | ||
18 | #include <linux/mm.h> | ||
19 | #include <linux/swap.h> | ||
20 | #include <linux/smp.h> | ||
21 | #include <linux/init.h> | ||
22 | #include <linux/pagemap.h> | ||
23 | #include <linux/bootmem.h> | ||
24 | #include <linux/proc_fs.h> | ||
25 | |||
26 | #include <asm/processor.h> | ||
27 | #include <asm/system.h> | ||
28 | #include <asm/uaccess.h> | ||
29 | #include <asm/pgtable.h> | ||
30 | #include <asm/pgalloc.h> | ||
31 | #include <asm/dma.h> | ||
32 | #include <asm/fixmap.h> | ||
33 | #include <asm/e820.h> | ||
34 | #include <asm/apic.h> | ||
35 | #include <asm/tlb.h> | ||
36 | #include <asm/mmu_context.h> | ||
37 | #include <asm/proto.h> | ||
38 | #include <asm/smp.h> | ||
39 | |||
40 | #ifndef Dprintk | ||
41 | #define Dprintk(x...) | ||
42 | #endif | ||
43 | |||
44 | #ifdef CONFIG_GART_IOMMU | ||
45 | extern int swiotlb; | ||
46 | #endif | ||
47 | |||
48 | extern char _stext[]; | ||
49 | |||
50 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | ||
51 | |||
52 | /* | ||
53 | * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the | ||
54 | * physical space so we can cache the place of the first one and move | ||
55 | * around without checking the pgd every time. | ||
56 | */ | ||
57 | |||
58 | void show_mem(void) | ||
59 | { | ||
60 | int i, total = 0, reserved = 0; | ||
61 | int shared = 0, cached = 0; | ||
62 | pg_data_t *pgdat; | ||
63 | struct page *page; | ||
64 | |||
65 | printk("Mem-info:\n"); | ||
66 | show_free_areas(); | ||
67 | printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); | ||
68 | |||
69 | for_each_pgdat(pgdat) { | ||
70 | for (i = 0; i < pgdat->node_spanned_pages; ++i) { | ||
71 | page = pfn_to_page(pgdat->node_start_pfn + i); | ||
72 | total++; | ||
73 | if (PageReserved(page)) | ||
74 | reserved++; | ||
75 | else if (PageSwapCache(page)) | ||
76 | cached++; | ||
77 | else if (page_count(page)) | ||
78 | shared += page_count(page) - 1; | ||
79 | } | ||
80 | } | ||
81 | printk("%d pages of RAM\n", total); | ||
82 | printk("%d reserved pages\n",reserved); | ||
83 | printk("%d pages shared\n",shared); | ||
84 | printk("%d pages swap cached\n",cached); | ||
85 | } | ||
86 | |||
87 | /* References to section boundaries */ | ||
88 | |||
89 | extern char _text, _etext, _edata, __bss_start, _end[]; | ||
90 | extern char __init_begin, __init_end; | ||
91 | |||
92 | int after_bootmem; | ||
93 | |||
94 | static void *spp_getpage(void) | ||
95 | { | ||
96 | void *ptr; | ||
97 | if (after_bootmem) | ||
98 | ptr = (void *) get_zeroed_page(GFP_ATOMIC); | ||
99 | else | ||
100 | ptr = alloc_bootmem_pages(PAGE_SIZE); | ||
101 | if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) | ||
102 | panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":""); | ||
103 | |||
104 | Dprintk("spp_getpage %p\n", ptr); | ||
105 | return ptr; | ||
106 | } | ||
107 | |||
108 | static void set_pte_phys(unsigned long vaddr, | ||
109 | unsigned long phys, pgprot_t prot) | ||
110 | { | ||
111 | pgd_t *pgd; | ||
112 | pud_t *pud; | ||
113 | pmd_t *pmd; | ||
114 | pte_t *pte, new_pte; | ||
115 | |||
116 | Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); | ||
117 | |||
118 | pgd = pgd_offset_k(vaddr); | ||
119 | if (pgd_none(*pgd)) { | ||
120 | printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); | ||
121 | return; | ||
122 | } | ||
123 | pud = pud_offset(pgd, vaddr); | ||
124 | if (pud_none(*pud)) { | ||
125 | pmd = (pmd_t *) spp_getpage(); | ||
126 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); | ||
127 | if (pmd != pmd_offset(pud, 0)) { | ||
128 | printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); | ||
129 | return; | ||
130 | } | ||
131 | } | ||
132 | pmd = pmd_offset(pud, vaddr); | ||
133 | if (pmd_none(*pmd)) { | ||
134 | pte = (pte_t *) spp_getpage(); | ||
135 | set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); | ||
136 | if (pte != pte_offset_kernel(pmd, 0)) { | ||
137 | printk("PAGETABLE BUG #02!\n"); | ||
138 | return; | ||
139 | } | ||
140 | } | ||
141 | new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); | ||
142 | |||
143 | pte = pte_offset_kernel(pmd, vaddr); | ||
144 | if (!pte_none(*pte) && | ||
145 | pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) | ||
146 | pte_ERROR(*pte); | ||
147 | set_pte(pte, new_pte); | ||
148 | |||
149 | /* | ||
150 | * It's enough to flush this one mapping. | ||
151 | * (PGE mappings get flushed as well) | ||
152 | */ | ||
153 | __flush_tlb_one(vaddr); | ||
154 | } | ||
155 | |||
156 | /* NOTE: this is meant to be run only at boot */ | ||
157 | void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) | ||
158 | { | ||
159 | unsigned long address = __fix_to_virt(idx); | ||
160 | |||
161 | if (idx >= __end_of_fixed_addresses) { | ||
162 | printk("Invalid __set_fixmap\n"); | ||
163 | return; | ||
164 | } | ||
165 | set_pte_phys(address, phys, prot); | ||
166 | } | ||
167 | |||
168 | unsigned long __initdata table_start, table_end; | ||
169 | |||
170 | extern pmd_t temp_boot_pmds[]; | ||
171 | |||
172 | static struct temp_map { | ||
173 | pmd_t *pmd; | ||
174 | void *address; | ||
175 | int allocated; | ||
176 | } temp_mappings[] __initdata = { | ||
177 | { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) }, | ||
178 | { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) }, | ||
179 | {} | ||
180 | }; | ||
181 | |||
182 | static __init void *alloc_low_page(int *index, unsigned long *phys) | ||
183 | { | ||
184 | struct temp_map *ti; | ||
185 | int i; | ||
186 | unsigned long pfn = table_end++, paddr; | ||
187 | void *adr; | ||
188 | |||
189 | if (pfn >= end_pfn) | ||
190 | panic("alloc_low_page: ran out of memory"); | ||
191 | for (i = 0; temp_mappings[i].allocated; i++) { | ||
192 | if (!temp_mappings[i].pmd) | ||
193 | panic("alloc_low_page: ran out of temp mappings"); | ||
194 | } | ||
195 | ti = &temp_mappings[i]; | ||
196 | paddr = (pfn << PAGE_SHIFT) & PMD_MASK; | ||
197 | set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE)); | ||
198 | ti->allocated = 1; | ||
199 | __flush_tlb(); | ||
200 | adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); | ||
201 | *index = i; | ||
202 | *phys = pfn * PAGE_SIZE; | ||
203 | return adr; | ||
204 | } | ||
205 | |||
206 | static __init void unmap_low_page(int i) | ||
207 | { | ||
208 | struct temp_map *ti = &temp_mappings[i]; | ||
209 | set_pmd(ti->pmd, __pmd(0)); | ||
210 | ti->allocated = 0; | ||
211 | } | ||
212 | |||
213 | static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) | ||
214 | { | ||
215 | long i, j; | ||
216 | |||
217 | i = pud_index(address); | ||
218 | pud = pud + i; | ||
219 | for (; i < PTRS_PER_PUD; pud++, i++) { | ||
220 | int map; | ||
221 | unsigned long paddr, pmd_phys; | ||
222 | pmd_t *pmd; | ||
223 | |||
224 | paddr = address + i*PUD_SIZE; | ||
225 | if (paddr >= end) { | ||
226 | for (; i < PTRS_PER_PUD; i++, pud++) | ||
227 | set_pud(pud, __pud(0)); | ||
228 | break; | ||
229 | } | ||
230 | |||
231 | if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) { | ||
232 | set_pud(pud, __pud(0)); | ||
233 | continue; | ||
234 | } | ||
235 | |||
236 | pmd = alloc_low_page(&map, &pmd_phys); | ||
237 | set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); | ||
238 | for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { | ||
239 | unsigned long pe; | ||
240 | |||
241 | if (paddr >= end) { | ||
242 | for (; j < PTRS_PER_PMD; j++, pmd++) | ||
243 | set_pmd(pmd, __pmd(0)); | ||
244 | break; | ||
245 | } | ||
246 | pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr; | ||
247 | pe &= __supported_pte_mask; | ||
248 | set_pmd(pmd, __pmd(pe)); | ||
249 | } | ||
250 | unmap_low_page(map); | ||
251 | } | ||
252 | __flush_tlb(); | ||
253 | } | ||
254 | |||
255 | static void __init find_early_table_space(unsigned long end) | ||
256 | { | ||
257 | unsigned long puds, pmds, tables; | ||
258 | |||
259 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | ||
260 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; | ||
261 | tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + | ||
262 | round_up(pmds * sizeof(pmd_t), PAGE_SIZE); | ||
263 | |||
264 | table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables); | ||
265 | if (table_start == -1UL) | ||
266 | panic("Cannot find space for the kernel page tables"); | ||
267 | |||
268 | table_start >>= PAGE_SHIFT; | ||
269 | table_end = table_start; | ||
270 | } | ||
271 | |||
272 | /* Setup the direct mapping of the physical memory at PAGE_OFFSET. | ||
273 | This runs before bootmem is initialized and gets pages directly from the | ||
274 | physical memory. To access them they are temporarily mapped. */ | ||
275 | void __init init_memory_mapping(unsigned long start, unsigned long end) | ||
276 | { | ||
277 | unsigned long next; | ||
278 | |||
279 | Dprintk("init_memory_mapping\n"); | ||
280 | |||
281 | /* | ||
282 | * Find space for the kernel direct mapping tables. | ||
283 | * Later we should allocate these tables in the local node of the memory | ||
284 | * mapped. Unfortunately this is done currently before the nodes are | ||
285 | * discovered. | ||
286 | */ | ||
287 | find_early_table_space(end); | ||
288 | |||
289 | start = (unsigned long)__va(start); | ||
290 | end = (unsigned long)__va(end); | ||
291 | |||
292 | for (; start < end; start = next) { | ||
293 | int map; | ||
294 | unsigned long pud_phys; | ||
295 | pud_t *pud = alloc_low_page(&map, &pud_phys); | ||
296 | next = start + PGDIR_SIZE; | ||
297 | if (next > end) | ||
298 | next = end; | ||
299 | phys_pud_init(pud, __pa(start), __pa(next)); | ||
300 | set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); | ||
301 | unmap_low_page(map); | ||
302 | } | ||
303 | |||
304 | asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); | ||
305 | __flush_tlb_all(); | ||
306 | early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, | ||
307 | table_start<<PAGE_SHIFT, | ||
308 | table_end<<PAGE_SHIFT); | ||
309 | } | ||
310 | |||
311 | extern struct x8664_pda cpu_pda[NR_CPUS]; | ||
312 | |||
/*
 * Clear the kernel pgd entry that covers virtual address 0 and flush the
 * TLBs.  Assumes all CPUs still execute in init_mm.
 */
void zap_low_mappings(void)
{
	pgd_clear(pgd_offset_k(0UL));
	flush_tlb_all();
}
320 | |||
321 | #ifndef CONFIG_DISCONTIGMEM | ||
322 | void __init paging_init(void) | ||
323 | { | ||
324 | { | ||
325 | unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; | ||
326 | unsigned int max_dma; | ||
327 | |||
328 | max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; | ||
329 | |||
330 | if (end_pfn < max_dma) | ||
331 | zones_size[ZONE_DMA] = end_pfn; | ||
332 | else { | ||
333 | zones_size[ZONE_DMA] = max_dma; | ||
334 | zones_size[ZONE_NORMAL] = end_pfn - max_dma; | ||
335 | } | ||
336 | free_area_init(zones_size); | ||
337 | } | ||
338 | return; | ||
339 | } | ||
340 | #endif | ||
341 | |||
342 | /* Unmap a kernel mapping if it exists. This is useful to avoid prefetches | ||
343 | from the CPU leading to inconsistent cache lines. address and size | ||
344 | must be aligned to 2MB boundaries. | ||
345 | Does nothing when the mapping doesn't exist. */ | ||
346 | void __init clear_kernel_mapping(unsigned long address, unsigned long size) | ||
347 | { | ||
348 | unsigned long end = address + size; | ||
349 | |||
350 | BUG_ON(address & ~LARGE_PAGE_MASK); | ||
351 | BUG_ON(size & ~LARGE_PAGE_MASK); | ||
352 | |||
353 | for (; address < end; address += LARGE_PAGE_SIZE) { | ||
354 | pgd_t *pgd = pgd_offset_k(address); | ||
355 | pud_t *pud; | ||
356 | pmd_t *pmd; | ||
357 | if (pgd_none(*pgd)) | ||
358 | continue; | ||
359 | pud = pud_offset(pgd, address); | ||
360 | if (pud_none(*pud)) | ||
361 | continue; | ||
362 | pmd = pmd_offset(pud, address); | ||
363 | if (!pmd || pmd_none(*pmd)) | ||
364 | continue; | ||
365 | if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { | ||
366 | /* Could handle this, but it should not happen currently. */ | ||
367 | printk(KERN_ERR | ||
368 | "clear_kernel_mapping: mapping has been split. will leak memory\n"); | ||
369 | pmd_ERROR(*pmd); | ||
370 | } | ||
371 | set_pmd(pmd, __pmd(0)); | ||
372 | } | ||
373 | __flush_tlb_all(); | ||
374 | } | ||
375 | |||
376 | static inline int page_is_ram (unsigned long pagenr) | ||
377 | { | ||
378 | int i; | ||
379 | |||
380 | for (i = 0; i < e820.nr_map; i++) { | ||
381 | unsigned long addr, end; | ||
382 | |||
383 | if (e820.map[i].type != E820_RAM) /* not usable memory */ | ||
384 | continue; | ||
385 | /* | ||
386 | * !!!FIXME!!! Some BIOSen report areas as RAM that | ||
387 | * are not. Notably the 640->1Mb area. We need a sanity | ||
388 | * check here. | ||
389 | */ | ||
390 | addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; | ||
391 | end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; | ||
392 | if ((pagenr >= addr) && (pagenr < end)) | ||
393 | return 1; | ||
394 | } | ||
395 | return 0; | ||
396 | } | ||
397 | |||
398 | extern int swiotlb_force; | ||
399 | |||
400 | static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, | ||
401 | kcore_vsyscall; | ||
402 | |||
403 | void __init mem_init(void) | ||
404 | { | ||
405 | int codesize, reservedpages, datasize, initsize; | ||
406 | int tmp; | ||
407 | |||
408 | #ifdef CONFIG_SWIOTLB | ||
409 | if (swiotlb_force) | ||
410 | swiotlb = 1; | ||
411 | if (!iommu_aperture && | ||
412 | (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu)) | ||
413 | swiotlb = 1; | ||
414 | if (swiotlb) | ||
415 | swiotlb_init(); | ||
416 | #endif | ||
417 | |||
418 | /* How many end-of-memory variables you have, grandma! */ | ||
419 | max_low_pfn = end_pfn; | ||
420 | max_pfn = end_pfn; | ||
421 | num_physpages = end_pfn; | ||
422 | high_memory = (void *) __va(end_pfn * PAGE_SIZE); | ||
423 | |||
424 | /* clear the zero-page */ | ||
425 | memset(empty_zero_page, 0, PAGE_SIZE); | ||
426 | |||
427 | reservedpages = 0; | ||
428 | |||
429 | /* this will put all low memory onto the freelists */ | ||
430 | #ifdef CONFIG_DISCONTIGMEM | ||
431 | totalram_pages += numa_free_all_bootmem(); | ||
432 | tmp = 0; | ||
433 | /* should count reserved pages here for all nodes */ | ||
434 | #else | ||
435 | max_mapnr = end_pfn; | ||
436 | if (!mem_map) BUG(); | ||
437 | |||
438 | totalram_pages += free_all_bootmem(); | ||
439 | |||
440 | for (tmp = 0; tmp < end_pfn; tmp++) | ||
441 | /* | ||
442 | * Only count reserved RAM pages | ||
443 | */ | ||
444 | if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) | ||
445 | reservedpages++; | ||
446 | #endif | ||
447 | |||
448 | after_bootmem = 1; | ||
449 | |||
450 | codesize = (unsigned long) &_etext - (unsigned long) &_text; | ||
451 | datasize = (unsigned long) &_edata - (unsigned long) &_etext; | ||
452 | initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; | ||
453 | |||
454 | /* Register memory areas for /proc/kcore */ | ||
455 | kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); | ||
456 | kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, | ||
457 | VMALLOC_END-VMALLOC_START); | ||
458 | kclist_add(&kcore_kernel, &_stext, _end - _stext); | ||
459 | kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); | ||
460 | kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, | ||
461 | VSYSCALL_END - VSYSCALL_START); | ||
462 | |||
463 | printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n", | ||
464 | (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), | ||
465 | end_pfn << (PAGE_SHIFT-10), | ||
466 | codesize >> 10, | ||
467 | reservedpages << (PAGE_SHIFT-10), | ||
468 | datasize >> 10, | ||
469 | initsize >> 10); | ||
470 | |||
471 | /* | ||
472 | * Subtle. SMP is doing its boot stuff late (because it has to | ||
473 | * fork idle threads) - but it also needs low mappings for the | ||
474 | * protected-mode entry to work. We zap these entries only after | ||
475 | * the WP-bit has been tested. | ||
476 | */ | ||
477 | #ifndef CONFIG_SMP | ||
478 | zap_low_mappings(); | ||
479 | #endif | ||
480 | } | ||
481 | |||
482 | extern char __initdata_begin[], __initdata_end[]; | ||
483 | |||
484 | void free_initmem(void) | ||
485 | { | ||
486 | unsigned long addr; | ||
487 | |||
488 | addr = (unsigned long)(&__init_begin); | ||
489 | for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { | ||
490 | ClearPageReserved(virt_to_page(addr)); | ||
491 | set_page_count(virt_to_page(addr), 1); | ||
492 | memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); | ||
493 | free_page(addr); | ||
494 | totalram_pages++; | ||
495 | } | ||
496 | memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin); | ||
497 | printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10); | ||
498 | } | ||
499 | |||
500 | #ifdef CONFIG_BLK_DEV_INITRD | ||
501 | void free_initrd_mem(unsigned long start, unsigned long end) | ||
502 | { | ||
503 | if (start < (unsigned long)&_end) | ||
504 | return; | ||
505 | printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); | ||
506 | for (; start < end; start += PAGE_SIZE) { | ||
507 | ClearPageReserved(virt_to_page(start)); | ||
508 | set_page_count(virt_to_page(start), 1); | ||
509 | free_page(start); | ||
510 | totalram_pages++; | ||
511 | } | ||
512 | } | ||
513 | #endif | ||
514 | |||
515 | void __init reserve_bootmem_generic(unsigned long phys, unsigned len) | ||
516 | { | ||
517 | /* Should check here against the e820 map to avoid double free */ | ||
518 | #ifdef CONFIG_DISCONTIGMEM | ||
519 | int nid = phys_to_nid(phys); | ||
520 | reserve_bootmem_node(NODE_DATA(nid), phys, len); | ||
521 | #else | ||
522 | reserve_bootmem(phys, len); | ||
523 | #endif | ||
524 | } | ||
525 | |||
526 | int kern_addr_valid(unsigned long addr) | ||
527 | { | ||
528 | unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; | ||
529 | pgd_t *pgd; | ||
530 | pud_t *pud; | ||
531 | pmd_t *pmd; | ||
532 | pte_t *pte; | ||
533 | |||
534 | if (above != 0 && above != -1UL) | ||
535 | return 0; | ||
536 | |||
537 | pgd = pgd_offset_k(addr); | ||
538 | if (pgd_none(*pgd)) | ||
539 | return 0; | ||
540 | |||
541 | pud = pud_offset(pgd, addr); | ||
542 | if (pud_none(*pud)) | ||
543 | return 0; | ||
544 | |||
545 | pmd = pmd_offset(pud, addr); | ||
546 | if (pmd_none(*pmd)) | ||
547 | return 0; | ||
548 | if (pmd_large(*pmd)) | ||
549 | return pfn_valid(pmd_pfn(*pmd)); | ||
550 | |||
551 | pte = pte_offset_kernel(pmd, addr); | ||
552 | if (pte_none(*pte)) | ||
553 | return 0; | ||
554 | return pfn_valid(pte_pfn(*pte)); | ||
555 | } | ||
556 | |||
557 | #ifdef CONFIG_SYSCTL | ||
558 | #include <linux/sysctl.h> | ||
559 | |||
560 | extern int exception_trace, page_fault_trace; | ||
561 | |||
562 | static ctl_table debug_table2[] = { | ||
563 | { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL, | ||
564 | proc_dointvec }, | ||
565 | #ifdef CONFIG_CHECKING | ||
566 | { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL, | ||
567 | proc_dointvec }, | ||
568 | #endif | ||
569 | { 0, } | ||
570 | }; | ||
571 | |||
572 | static ctl_table debug_root_table2[] = { | ||
573 | { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, | ||
574 | .child = debug_table2 }, | ||
575 | { 0 }, | ||
576 | }; | ||
577 | |||
578 | static __init int x8664_sysctl_init(void) | ||
579 | { | ||
580 | register_sysctl_table(debug_root_table2, 1); | ||
581 | return 0; | ||
582 | } | ||
583 | __initcall(x8664_sysctl_init); | ||
584 | #endif | ||
585 | |||
/* Pseudo VMAs to allow ptrace access for the vsyscall pages. x86-64 has two
   different ones: one for 32bit and one for 64bit. Use the appropriate one
   for the target task. */

/* Covers VSYSCALL_START..VSYSCALL_END, read-only; this is what
   get_gate_vma() returns unless the task has TIF_IA32 set. */
static struct vm_area_struct gate_vma = {
	.vm_start = VSYSCALL_START,
	.vm_end = VSYSCALL_END,
	.vm_page_prot = PAGE_READONLY
};

/* Covers VSYSCALL32_BASE..VSYSCALL32_END, read-only; returned by
   get_gate_vma() for TIF_IA32 tasks when CONFIG_IA32_EMULATION is set. */
static struct vm_area_struct gate32_vma = {
	.vm_start = VSYSCALL32_BASE,
	.vm_end = VSYSCALL32_END,
	.vm_page_prot = PAGE_READONLY
};
601 | |||
602 | struct vm_area_struct *get_gate_vma(struct task_struct *tsk) | ||
603 | { | ||
604 | #ifdef CONFIG_IA32_EMULATION | ||
605 | if (test_tsk_thread_flag(tsk, TIF_IA32)) { | ||
606 | /* lookup code assumes the pages are present. set them up | ||
607 | now */ | ||
608 | if (__map_syscall32(tsk->mm, VSYSCALL32_BASE) < 0) | ||
609 | return NULL; | ||
610 | return &gate32_vma; | ||
611 | } | ||
612 | #endif | ||
613 | return &gate_vma; | ||
614 | } | ||
615 | |||
616 | int in_gate_area(struct task_struct *task, unsigned long addr) | ||
617 | { | ||
618 | struct vm_area_struct *vma = get_gate_vma(task); | ||
619 | return (addr >= vma->vm_start) && (addr < vma->vm_end); | ||
620 | } | ||
621 | |||
622 | /* Use this when you have no reliable task/vma, typically from interrupt | ||
623 | * context. It is less reliable than using the task's vma and may give | ||
624 | * false positives. | ||
625 | */ | ||
626 | int in_gate_area_no_task(unsigned long addr) | ||
627 | { | ||
628 | return (((addr >= VSYSCALL_START) && (addr < VSYSCALL_END)) || | ||
629 | ((addr >= VSYSCALL32_BASE) && (addr < VSYSCALL32_END))); | ||
630 | } | ||
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c new file mode 100644 index 000000000000..74ec8554b195 --- /dev/null +++ b/arch/x86_64/mm/ioremap.c | |||
@@ -0,0 +1,283 @@ | |||
1 | /* | ||
2 | * arch/x86_64/mm/ioremap.c | ||
3 | * | ||
4 | * Re-map IO memory to kernel address space so that we can access it. | ||
5 | * This is needed for high PCI addresses that aren't mapped in the | ||
6 | * 640k-1MB IO memory area on PC's | ||
7 | * | ||
8 | * (C) Copyright 1995 1996 Linus Torvalds | ||
9 | */ | ||
10 | |||
11 | #include <linux/vmalloc.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <asm/io.h> | ||
15 | #include <asm/pgalloc.h> | ||
16 | #include <asm/fixmap.h> | ||
17 | #include <asm/cacheflush.h> | ||
18 | #include <asm/tlbflush.h> | ||
19 | #include <asm/proto.h> | ||
20 | |||
/* Physical bounds of the low PCI/ISA window (640k-1MB).  __ioremap()
   does not remap requests that fall entirely inside it, and iounmap()
   ignores addresses from it. */
#define ISA_START_ADDRESS      0xa0000
#define ISA_END_ADDRESS                0x100000
23 | |||
24 | static inline void remap_area_pte(pte_t * pte, unsigned long address, unsigned long size, | ||
25 | unsigned long phys_addr, unsigned long flags) | ||
26 | { | ||
27 | unsigned long end; | ||
28 | unsigned long pfn; | ||
29 | |||
30 | address &= ~PMD_MASK; | ||
31 | end = address + size; | ||
32 | if (end > PMD_SIZE) | ||
33 | end = PMD_SIZE; | ||
34 | if (address >= end) | ||
35 | BUG(); | ||
36 | pfn = phys_addr >> PAGE_SHIFT; | ||
37 | do { | ||
38 | if (!pte_none(*pte)) { | ||
39 | printk("remap_area_pte: page already exists\n"); | ||
40 | BUG(); | ||
41 | } | ||
42 | set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW | | ||
43 | _PAGE_GLOBAL | _PAGE_DIRTY | _PAGE_ACCESSED | flags))); | ||
44 | address += PAGE_SIZE; | ||
45 | pfn++; | ||
46 | pte++; | ||
47 | } while (address && (address < end)); | ||
48 | } | ||
49 | |||
50 | static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, | ||
51 | unsigned long phys_addr, unsigned long flags) | ||
52 | { | ||
53 | unsigned long end; | ||
54 | |||
55 | address &= ~PUD_MASK; | ||
56 | end = address + size; | ||
57 | if (end > PUD_SIZE) | ||
58 | end = PUD_SIZE; | ||
59 | phys_addr -= address; | ||
60 | if (address >= end) | ||
61 | BUG(); | ||
62 | do { | ||
63 | pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); | ||
64 | if (!pte) | ||
65 | return -ENOMEM; | ||
66 | remap_area_pte(pte, address, end - address, address + phys_addr, flags); | ||
67 | address = (address + PMD_SIZE) & PMD_MASK; | ||
68 | pmd++; | ||
69 | } while (address && (address < end)); | ||
70 | return 0; | ||
71 | } | ||
72 | |||
73 | static inline int remap_area_pud(pud_t * pud, unsigned long address, unsigned long size, | ||
74 | unsigned long phys_addr, unsigned long flags) | ||
75 | { | ||
76 | unsigned long end; | ||
77 | |||
78 | address &= ~PGDIR_MASK; | ||
79 | end = address + size; | ||
80 | if (end > PGDIR_SIZE) | ||
81 | end = PGDIR_SIZE; | ||
82 | phys_addr -= address; | ||
83 | if (address >= end) | ||
84 | BUG(); | ||
85 | do { | ||
86 | pmd_t * pmd = pmd_alloc(&init_mm, pud, address); | ||
87 | if (!pmd) | ||
88 | return -ENOMEM; | ||
89 | remap_area_pmd(pmd, address, end - address, address + phys_addr, flags); | ||
90 | address = (address + PUD_SIZE) & PUD_MASK; | ||
91 | pud++; | ||
92 | } while (address && (address < end)); | ||
93 | return 0; | ||
94 | } | ||
95 | |||
96 | static int remap_area_pages(unsigned long address, unsigned long phys_addr, | ||
97 | unsigned long size, unsigned long flags) | ||
98 | { | ||
99 | int error; | ||
100 | pgd_t *pgd; | ||
101 | unsigned long end = address + size; | ||
102 | |||
103 | phys_addr -= address; | ||
104 | pgd = pgd_offset_k(address); | ||
105 | flush_cache_all(); | ||
106 | if (address >= end) | ||
107 | BUG(); | ||
108 | spin_lock(&init_mm.page_table_lock); | ||
109 | do { | ||
110 | pud_t *pud; | ||
111 | pud = pud_alloc(&init_mm, pgd, address); | ||
112 | error = -ENOMEM; | ||
113 | if (!pud) | ||
114 | break; | ||
115 | if (remap_area_pud(pud, address, end - address, | ||
116 | phys_addr + address, flags)) | ||
117 | break; | ||
118 | error = 0; | ||
119 | address = (address + PGDIR_SIZE) & PGDIR_MASK; | ||
120 | pgd++; | ||
121 | } while (address && (address < end)); | ||
122 | spin_unlock(&init_mm.page_table_lock); | ||
123 | flush_tlb_all(); | ||
124 | return error; | ||
125 | } | ||
126 | |||
127 | /* | ||
128 | * Fix up the linear direct mapping of the kernel to avoid cache attribute | ||
129 | * conflicts. | ||
130 | */ | ||
131 | static int | ||
132 | ioremap_change_attr(unsigned long phys_addr, unsigned long size, | ||
133 | unsigned long flags) | ||
134 | { | ||
135 | int err = 0; | ||
136 | if (flags && phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) { | ||
137 | unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
138 | unsigned long vaddr = (unsigned long) __va(phys_addr); | ||
139 | |||
140 | /* | ||
141 | * Must use a address here and not struct page because the phys addr | ||
142 | * can be a in hole between nodes and not have an memmap entry. | ||
143 | */ | ||
144 | err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags)); | ||
145 | if (!err) | ||
146 | global_flush_tlb(); | ||
147 | } | ||
148 | return err; | ||
149 | } | ||
150 | |||
151 | /* | ||
152 | * Generic mapping function | ||
153 | */ | ||
154 | |||
155 | /* | ||
156 | * Remap an arbitrary physical address space into the kernel virtual | ||
157 | * address space. Needed when the kernel wants to access high addresses | ||
158 | * directly. | ||
159 | * | ||
160 | * NOTE! We need to allow non-page-aligned mappings too: we will obviously | ||
161 | * have to convert them into an offset in a page-aligned mapping, but the | ||
162 | * caller shouldn't need to know that small detail. | ||
163 | */ | ||
164 | void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) | ||
165 | { | ||
166 | void * addr; | ||
167 | struct vm_struct * area; | ||
168 | unsigned long offset, last_addr; | ||
169 | |||
170 | /* Don't allow wraparound or zero size */ | ||
171 | last_addr = phys_addr + size - 1; | ||
172 | if (!size || last_addr < phys_addr) | ||
173 | return NULL; | ||
174 | |||
175 | /* | ||
176 | * Don't remap the low PCI/ISA area, it's always mapped.. | ||
177 | */ | ||
178 | if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) | ||
179 | return (__force void __iomem *)phys_to_virt(phys_addr); | ||
180 | |||
181 | #ifndef CONFIG_DISCONTIGMEM | ||
182 | /* | ||
183 | * Don't allow anybody to remap normal RAM that we're using.. | ||
184 | */ | ||
185 | if (last_addr < virt_to_phys(high_memory)) { | ||
186 | char *t_addr, *t_end; | ||
187 | struct page *page; | ||
188 | |||
189 | t_addr = __va(phys_addr); | ||
190 | t_end = t_addr + (size - 1); | ||
191 | |||
192 | for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) | ||
193 | if(!PageReserved(page)) | ||
194 | return NULL; | ||
195 | } | ||
196 | #endif | ||
197 | |||
198 | /* | ||
199 | * Mappings have to be page-aligned | ||
200 | */ | ||
201 | offset = phys_addr & ~PAGE_MASK; | ||
202 | phys_addr &= PAGE_MASK; | ||
203 | size = PAGE_ALIGN(last_addr+1) - phys_addr; | ||
204 | |||
205 | /* | ||
206 | * Ok, go for it.. | ||
207 | */ | ||
208 | area = get_vm_area(size, VM_IOREMAP | (flags << 20)); | ||
209 | if (!area) | ||
210 | return NULL; | ||
211 | area->phys_addr = phys_addr; | ||
212 | addr = area->addr; | ||
213 | if (remap_area_pages((unsigned long) addr, phys_addr, size, flags)) { | ||
214 | remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); | ||
215 | return NULL; | ||
216 | } | ||
217 | if (ioremap_change_attr(phys_addr, size, flags) < 0) { | ||
218 | area->flags &= 0xffffff; | ||
219 | vunmap(addr); | ||
220 | return NULL; | ||
221 | } | ||
222 | return (__force void __iomem *) (offset + (char *)addr); | ||
223 | } | ||
224 | |||
225 | /** | ||
226 | * ioremap_nocache - map bus memory into CPU space | ||
227 | * @offset: bus address of the memory | ||
228 | * @size: size of the resource to map | ||
229 | * | ||
230 | * ioremap_nocache performs a platform specific sequence of operations to | ||
231 | * make bus memory CPU accessible via the readb/readw/readl/writeb/ | ||
232 | * writew/writel functions and the other mmio helpers. The returned | ||
233 | * address is not guaranteed to be usable directly as a virtual | ||
234 | * address. | ||
235 | * | ||
236 | * This version of ioremap ensures that the memory is marked uncachable | ||
237 | * on the CPU as well as honouring existing caching rules from things like | ||
238 | * the PCI bus. Note that there are other caches and buffers on many | ||
239 | * busses. In particular driver authors should read up on PCI writes | ||
240 | * | ||
241 | * It's useful if some control registers are in such an area and | ||
242 | * write combining or read caching is not desirable: | ||
243 | * | ||
244 | * Must be freed with iounmap. | ||
245 | */ | ||
246 | |||
247 | void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) | ||
248 | { | ||
249 | return __ioremap(phys_addr, size, _PAGE_PCD); | ||
250 | } | ||
251 | |||
252 | void iounmap(volatile void __iomem *addr) | ||
253 | { | ||
254 | struct vm_struct *p, **pprev; | ||
255 | |||
256 | if (addr <= high_memory) | ||
257 | return; | ||
258 | if (addr >= phys_to_virt(ISA_START_ADDRESS) && | ||
259 | addr < phys_to_virt(ISA_END_ADDRESS)) | ||
260 | return; | ||
261 | |||
262 | write_lock(&vmlist_lock); | ||
263 | for (p = vmlist, pprev = &vmlist; p != NULL; pprev = &p->next, p = *pprev) | ||
264 | if (p->addr == (void *)(PAGE_MASK & (unsigned long)addr)) | ||
265 | break; | ||
266 | if (!p) { | ||
267 | printk("__iounmap: bad address %p\n", addr); | ||
268 | goto out_unlock; | ||
269 | } | ||
270 | *pprev = p->next; | ||
271 | unmap_vm_area(p); | ||
272 | if ((p->flags >> 20) && | ||
273 | p->phys_addr + p->size - 1 < virt_to_phys(high_memory)) { | ||
274 | /* p->size includes the guard page, but cpa doesn't like that */ | ||
275 | change_page_attr(virt_to_page(__va(p->phys_addr)), | ||
276 | p->size >> PAGE_SHIFT, | ||
277 | PAGE_KERNEL); | ||
278 | global_flush_tlb(); | ||
279 | } | ||
280 | out_unlock: | ||
281 | write_unlock(&vmlist_lock); | ||
282 | kfree(p); | ||
283 | } | ||
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c new file mode 100644 index 000000000000..ec35747aacd7 --- /dev/null +++ b/arch/x86_64/mm/k8topology.c | |||
@@ -0,0 +1,168 @@ | |||
1 | /* | ||
2 | * AMD K8 NUMA support. | ||
3 | * Discover the memory map and associated nodes. | ||
4 | * | ||
5 | * This version reads it directly from the K8 northbridge. | ||
6 | * | ||
7 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
8 | */ | ||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/string.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/nodemask.h> | ||
14 | #include <asm/io.h> | ||
15 | #include <linux/pci_ids.h> | ||
16 | #include <asm/types.h> | ||
17 | #include <asm/mmzone.h> | ||
18 | #include <asm/proto.h> | ||
19 | #include <asm/e820.h> | ||
20 | #include <asm/pci-direct.h> | ||
21 | #include <asm/numa.h> | ||
22 | |||
23 | static __init int find_northbridge(void) | ||
24 | { | ||
25 | int num; | ||
26 | |||
27 | for (num = 0; num < 32; num++) { | ||
28 | u32 header; | ||
29 | |||
30 | header = read_pci_config(0, num, 0, 0x00); | ||
31 | if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16))) | ||
32 | continue; | ||
33 | |||
34 | header = read_pci_config(0, num, 1, 0x00); | ||
35 | if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16))) | ||
36 | continue; | ||
37 | return num; | ||
38 | } | ||
39 | |||
40 | return -1; | ||
41 | } | ||
42 | |||
43 | int __init k8_scan_nodes(unsigned long start, unsigned long end) | ||
44 | { | ||
45 | unsigned long prevbase; | ||
46 | struct node nodes[8]; | ||
47 | int nodeid, i, nb; | ||
48 | int found = 0; | ||
49 | u32 reg; | ||
50 | unsigned numnodes; | ||
51 | nodemask_t nodes_parsed; | ||
52 | |||
53 | nodes_clear(nodes_parsed); | ||
54 | |||
55 | nb = find_northbridge(); | ||
56 | if (nb < 0) | ||
57 | return nb; | ||
58 | |||
59 | printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); | ||
60 | |||
61 | reg = read_pci_config(0, nb, 0, 0x60); | ||
62 | numnodes = ((reg >> 4) & 0xF) + 1; | ||
63 | |||
64 | printk(KERN_INFO "Number of nodes %d\n", numnodes); | ||
65 | |||
66 | memset(&nodes,0,sizeof(nodes)); | ||
67 | prevbase = 0; | ||
68 | for (i = 0; i < 8; i++) { | ||
69 | unsigned long base,limit; | ||
70 | |||
71 | base = read_pci_config(0, nb, 1, 0x40 + i*8); | ||
72 | limit = read_pci_config(0, nb, 1, 0x44 + i*8); | ||
73 | |||
74 | nodeid = limit & 7; | ||
75 | if ((base & 3) == 0) { | ||
76 | if (i < numnodes) | ||
77 | printk("Skipping disabled node %d\n", i); | ||
78 | continue; | ||
79 | } | ||
80 | if (nodeid >= numnodes) { | ||
81 | printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, | ||
82 | base, limit); | ||
83 | continue; | ||
84 | } | ||
85 | |||
86 | if (!limit) { | ||
87 | printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i, | ||
88 | base); | ||
89 | continue; | ||
90 | } | ||
91 | if ((base >> 8) & 3 || (limit >> 8) & 3) { | ||
92 | printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", | ||
93 | nodeid, (base>>8)&3, (limit>>8) & 3); | ||
94 | return -1; | ||
95 | } | ||
96 | if (node_isset(nodeid, nodes_parsed)) { | ||
97 | printk(KERN_INFO "Node %d already present. Skipping\n", | ||
98 | nodeid); | ||
99 | continue; | ||
100 | } | ||
101 | |||
102 | limit >>= 16; | ||
103 | limit <<= 24; | ||
104 | limit |= (1<<24)-1; | ||
105 | |||
106 | if (limit > end_pfn << PAGE_SHIFT) | ||
107 | limit = end_pfn << PAGE_SHIFT; | ||
108 | if (limit <= base) | ||
109 | continue; | ||
110 | |||
111 | base >>= 16; | ||
112 | base <<= 24; | ||
113 | |||
114 | if (base < start) | ||
115 | base = start; | ||
116 | if (limit > end) | ||
117 | limit = end; | ||
118 | if (limit == base) { | ||
119 | printk(KERN_ERR "Empty node %d\n", nodeid); | ||
120 | continue; | ||
121 | } | ||
122 | if (limit < base) { | ||
123 | printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", | ||
124 | nodeid, base, limit); | ||
125 | continue; | ||
126 | } | ||
127 | |||
128 | /* Could sort here, but pun for now. Should not happen anyroads. */ | ||
129 | if (prevbase > base) { | ||
130 | printk(KERN_ERR "Node map not sorted %lx,%lx\n", | ||
131 | prevbase,base); | ||
132 | return -1; | ||
133 | } | ||
134 | |||
135 | printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", | ||
136 | nodeid, base, limit); | ||
137 | |||
138 | found++; | ||
139 | |||
140 | nodes[nodeid].start = base; | ||
141 | nodes[nodeid].end = limit; | ||
142 | |||
143 | prevbase = base; | ||
144 | |||
145 | node_set(nodeid, nodes_parsed); | ||
146 | } | ||
147 | |||
148 | if (!found) | ||
149 | return -1; | ||
150 | |||
151 | memnode_shift = compute_hash_shift(nodes, numnodes); | ||
152 | if (memnode_shift < 0) { | ||
153 | printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); | ||
154 | return -1; | ||
155 | } | ||
156 | printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); | ||
157 | |||
158 | for (i = 0; i < 8; i++) { | ||
159 | if (nodes[i].start != nodes[i].end) { | ||
160 | /* assume 1:1 NODE:CPU */ | ||
161 | cpu_to_node[i] = i; | ||
162 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
163 | } | ||
164 | } | ||
165 | |||
166 | numa_init_array(); | ||
167 | return 0; | ||
168 | } | ||
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c new file mode 100644 index 000000000000..fd9f25d7a6c4 --- /dev/null +++ b/arch/x86_64/mm/numa.c | |||
@@ -0,0 +1,294 @@ | |||
1 | /* | ||
2 | * Generic VM initialization for x86-64 NUMA setups. | ||
3 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | */ | ||
5 | #include <linux/kernel.h> | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/string.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/bootmem.h> | ||
10 | #include <linux/mmzone.h> | ||
11 | #include <linux/ctype.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/nodemask.h> | ||
14 | |||
15 | #include <asm/e820.h> | ||
16 | #include <asm/proto.h> | ||
17 | #include <asm/dma.h> | ||
18 | #include <asm/numa.h> | ||
19 | #include <asm/acpi.h> | ||
20 | |||
21 | #ifndef Dprintk | ||
22 | #define Dprintk(x...) | ||
23 | #endif | ||
24 | |||
25 | struct pglist_data *node_data[MAX_NUMNODES]; | ||
26 | bootmem_data_t plat_node_bdata[MAX_NUMNODES]; | ||
27 | |||
28 | int memnode_shift; | ||
29 | u8 memnodemap[NODEMAPSIZE]; | ||
30 | |||
31 | unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; | ||
32 | cpumask_t node_to_cpumask[MAX_NUMNODES]; | ||
33 | |||
34 | int numa_off __initdata; | ||
35 | |||
36 | int __init compute_hash_shift(struct node *nodes, int numnodes) | ||
37 | { | ||
38 | int i; | ||
39 | int shift = 24; | ||
40 | u64 addr; | ||
41 | |||
42 | /* When in doubt use brute force. */ | ||
43 | while (shift < 48) { | ||
44 | memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE); | ||
45 | for (i = 0; i < numnodes; i++) { | ||
46 | if (nodes[i].start == nodes[i].end) | ||
47 | continue; | ||
48 | for (addr = nodes[i].start; | ||
49 | addr < nodes[i].end; | ||
50 | addr += (1UL << shift)) { | ||
51 | if (memnodemap[addr >> shift] != 0xff && | ||
52 | memnodemap[addr >> shift] != i) { | ||
53 | printk(KERN_INFO | ||
54 | "node %d shift %d addr %Lx conflict %d\n", | ||
55 | i, shift, addr, memnodemap[addr>>shift]); | ||
56 | goto next; | ||
57 | } | ||
58 | memnodemap[addr >> shift] = i; | ||
59 | } | ||
60 | } | ||
61 | return shift; | ||
62 | next: | ||
63 | shift++; | ||
64 | } | ||
65 | memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE); | ||
66 | return -1; | ||
67 | } | ||
68 | |||
69 | /* Initialize bootmem allocator for a node */ | ||
70 | void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) | ||
71 | { | ||
72 | unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; | ||
73 | unsigned long nodedata_phys; | ||
74 | const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); | ||
75 | |||
76 | start = round_up(start, ZONE_ALIGN); | ||
77 | |||
78 | printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); | ||
79 | |||
80 | start_pfn = start >> PAGE_SHIFT; | ||
81 | end_pfn = end >> PAGE_SHIFT; | ||
82 | |||
83 | nodedata_phys = find_e820_area(start, end, pgdat_size); | ||
84 | if (nodedata_phys == -1L) | ||
85 | panic("Cannot find memory pgdat in node %d\n", nodeid); | ||
86 | |||
87 | Dprintk("nodedata_phys %lx\n", nodedata_phys); | ||
88 | |||
89 | node_data[nodeid] = phys_to_virt(nodedata_phys); | ||
90 | memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); | ||
91 | NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; | ||
92 | NODE_DATA(nodeid)->node_start_pfn = start_pfn; | ||
93 | NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; | ||
94 | |||
95 | /* Find a place for the bootmem map */ | ||
96 | bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); | ||
97 | bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); | ||
98 | bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT); | ||
99 | if (bootmap_start == -1L) | ||
100 | panic("Not enough continuous space for bootmap on node %d", nodeid); | ||
101 | Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); | ||
102 | |||
103 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), | ||
104 | bootmap_start >> PAGE_SHIFT, | ||
105 | start_pfn, end_pfn); | ||
106 | |||
107 | e820_bootmem_free(NODE_DATA(nodeid), start, end); | ||
108 | |||
109 | reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); | ||
110 | reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); | ||
111 | node_set_online(nodeid); | ||
112 | } | ||
113 | |||
114 | /* Initialize final allocator for a zone */ | ||
115 | void __init setup_node_zones(int nodeid) | ||
116 | { | ||
117 | unsigned long start_pfn, end_pfn; | ||
118 | unsigned long zones[MAX_NR_ZONES]; | ||
119 | unsigned long dma_end_pfn; | ||
120 | |||
121 | memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); | ||
122 | |||
123 | start_pfn = node_start_pfn(nodeid); | ||
124 | end_pfn = node_end_pfn(nodeid); | ||
125 | |||
126 | Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn); | ||
127 | |||
128 | /* All nodes > 0 have a zero length zone DMA */ | ||
129 | dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; | ||
130 | if (start_pfn < dma_end_pfn) { | ||
131 | zones[ZONE_DMA] = dma_end_pfn - start_pfn; | ||
132 | zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; | ||
133 | } else { | ||
134 | zones[ZONE_NORMAL] = end_pfn - start_pfn; | ||
135 | } | ||
136 | |||
137 | free_area_init_node(nodeid, NODE_DATA(nodeid), zones, | ||
138 | start_pfn, NULL); | ||
139 | } | ||
140 | |||
141 | void __init numa_init_array(void) | ||
142 | { | ||
143 | int rr, i; | ||
144 | /* There are unfortunately some poorly designed mainboards around | ||
145 | that only connect memory to a single CPU. This breaks the 1:1 cpu->node | ||
146 | mapping. To avoid this fill in the mapping for all possible | ||
147 | CPUs, as the number of CPUs is not known yet. | ||
148 | We round robin the existing nodes. */ | ||
149 | rr = 0; | ||
150 | for (i = 0; i < NR_CPUS; i++) { | ||
151 | if (cpu_to_node[i] != NUMA_NO_NODE) | ||
152 | continue; | ||
153 | rr = next_node(rr, node_online_map); | ||
154 | if (rr == MAX_NUMNODES) | ||
155 | rr = first_node(node_online_map); | ||
156 | cpu_to_node[i] = rr; | ||
157 | rr++; | ||
158 | } | ||
159 | |||
160 | set_bit(0, &node_to_cpumask[cpu_to_node(0)]); | ||
161 | } | ||
162 | |||
163 | #ifdef CONFIG_NUMA_EMU | ||
164 | int numa_fake __initdata = 0; | ||
165 | |||
166 | /* Numa emulation */ | ||
167 | static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | ||
168 | { | ||
169 | int i; | ||
170 | struct node nodes[MAX_NUMNODES]; | ||
171 | unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake; | ||
172 | |||
173 | /* Kludge needed for the hash function */ | ||
174 | if (hweight64(sz) > 1) { | ||
175 | unsigned long x = 1; | ||
176 | while ((x << 1) < sz) | ||
177 | x <<= 1; | ||
178 | if (x < sz/2) | ||
179 | printk("Numa emulation unbalanced. Complain to maintainer\n"); | ||
180 | sz = x; | ||
181 | } | ||
182 | |||
183 | memset(&nodes,0,sizeof(nodes)); | ||
184 | for (i = 0; i < numa_fake; i++) { | ||
185 | nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz; | ||
186 | if (i == numa_fake-1) | ||
187 | sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; | ||
188 | nodes[i].end = nodes[i].start + sz; | ||
189 | if (i != numa_fake-1) | ||
190 | nodes[i].end--; | ||
191 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", | ||
192 | i, | ||
193 | nodes[i].start, nodes[i].end, | ||
194 | (nodes[i].end - nodes[i].start) >> 20); | ||
195 | node_set_online(i); | ||
196 | } | ||
197 | memnode_shift = compute_hash_shift(nodes, numa_fake); | ||
198 | if (memnode_shift < 0) { | ||
199 | memnode_shift = 0; | ||
200 | printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); | ||
201 | return -1; | ||
202 | } | ||
203 | for_each_online_node(i) | ||
204 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
205 | numa_init_array(); | ||
206 | return 0; | ||
207 | } | ||
208 | #endif | ||
209 | |||
210 | void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | ||
211 | { | ||
212 | int i; | ||
213 | |||
214 | #ifdef CONFIG_NUMA_EMU | ||
215 | if (numa_fake && !numa_emulation(start_pfn, end_pfn)) | ||
216 | return; | ||
217 | #endif | ||
218 | |||
219 | #ifdef CONFIG_ACPI_NUMA | ||
220 | if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, | ||
221 | end_pfn << PAGE_SHIFT)) | ||
222 | return; | ||
223 | #endif | ||
224 | |||
225 | #ifdef CONFIG_K8_NUMA | ||
226 | if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) | ||
227 | return; | ||
228 | #endif | ||
229 | printk(KERN_INFO "%s\n", | ||
230 | numa_off ? "NUMA turned off" : "No NUMA configuration found"); | ||
231 | |||
232 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", | ||
233 | start_pfn << PAGE_SHIFT, | ||
234 | end_pfn << PAGE_SHIFT); | ||
235 | /* setup dummy node covering all memory */ | ||
236 | memnode_shift = 63; | ||
237 | memnodemap[0] = 0; | ||
238 | nodes_clear(node_online_map); | ||
239 | node_set_online(0); | ||
240 | for (i = 0; i < NR_CPUS; i++) | ||
241 | cpu_to_node[i] = 0; | ||
242 | node_to_cpumask[0] = cpumask_of_cpu(0); | ||
243 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); | ||
244 | } | ||
245 | |||
246 | __init void numa_add_cpu(int cpu) | ||
247 | { | ||
248 | /* BP is initialized elsewhere */ | ||
249 | if (cpu) | ||
250 | set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); | ||
251 | } | ||
252 | |||
253 | unsigned long __init numa_free_all_bootmem(void) | ||
254 | { | ||
255 | int i; | ||
256 | unsigned long pages = 0; | ||
257 | for_each_online_node(i) { | ||
258 | pages += free_all_bootmem_node(NODE_DATA(i)); | ||
259 | } | ||
260 | return pages; | ||
261 | } | ||
262 | |||
263 | void __init paging_init(void) | ||
264 | { | ||
265 | int i; | ||
266 | for_each_online_node(i) { | ||
267 | setup_node_zones(i); | ||
268 | } | ||
269 | } | ||
270 | |||
271 | /* [numa=off] */ | ||
272 | __init int numa_setup(char *opt) | ||
273 | { | ||
274 | if (!strncmp(opt,"off",3)) | ||
275 | numa_off = 1; | ||
276 | #ifdef CONFIG_NUMA_EMU | ||
277 | if(!strncmp(opt, "fake=", 5)) { | ||
278 | numa_fake = simple_strtoul(opt+5,NULL,0); ; | ||
279 | if (numa_fake >= MAX_NUMNODES) | ||
280 | numa_fake = MAX_NUMNODES; | ||
281 | } | ||
282 | #endif | ||
283 | #ifdef CONFIG_ACPI_NUMA | ||
284 | if (!strncmp(opt,"noacpi",6)) | ||
285 | acpi_numa = -1; | ||
286 | #endif | ||
287 | return 1; | ||
288 | } | ||
289 | |||
290 | EXPORT_SYMBOL(cpu_to_node); | ||
291 | EXPORT_SYMBOL(node_to_cpumask); | ||
292 | EXPORT_SYMBOL(memnode_shift); | ||
293 | EXPORT_SYMBOL(memnodemap); | ||
294 | EXPORT_SYMBOL(node_data); | ||
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c new file mode 100644 index 000000000000..94862e1ec032 --- /dev/null +++ b/arch/x86_64/mm/pageattr.c | |||
@@ -0,0 +1,235 @@ | |||
1 | /* | ||
2 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
3 | * Thanks to Ben LaHaise for precious feedback. | ||
4 | */ | ||
5 | |||
6 | #include <linux/config.h> | ||
7 | #include <linux/mm.h> | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/highmem.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <asm/uaccess.h> | ||
13 | #include <asm/processor.h> | ||
14 | #include <asm/tlbflush.h> | ||
15 | #include <asm/io.h> | ||
16 | |||
17 | static inline pte_t *lookup_address(unsigned long address) | ||
18 | { | ||
19 | pgd_t *pgd = pgd_offset_k(address); | ||
20 | pud_t *pud; | ||
21 | pmd_t *pmd; | ||
22 | pte_t *pte; | ||
23 | if (pgd_none(*pgd)) | ||
24 | return NULL; | ||
25 | pud = pud_offset(pgd, address); | ||
26 | if (!pud_present(*pud)) | ||
27 | return NULL; | ||
28 | pmd = pmd_offset(pud, address); | ||
29 | if (!pmd_present(*pmd)) | ||
30 | return NULL; | ||
31 | if (pmd_large(*pmd)) | ||
32 | return (pte_t *)pmd; | ||
33 | pte = pte_offset_kernel(pmd, address); | ||
34 | if (pte && !pte_present(*pte)) | ||
35 | pte = NULL; | ||
36 | return pte; | ||
37 | } | ||
38 | |||
39 | static struct page *split_large_page(unsigned long address, pgprot_t prot, | ||
40 | pgprot_t ref_prot) | ||
41 | { | ||
42 | int i; | ||
43 | unsigned long addr; | ||
44 | struct page *base = alloc_pages(GFP_KERNEL, 0); | ||
45 | pte_t *pbase; | ||
46 | if (!base) | ||
47 | return NULL; | ||
48 | address = __pa(address); | ||
49 | addr = address & LARGE_PAGE_MASK; | ||
50 | pbase = (pte_t *)page_address(base); | ||
51 | for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { | ||
52 | pbase[i] = pfn_pte(addr >> PAGE_SHIFT, | ||
53 | addr == address ? prot : ref_prot); | ||
54 | } | ||
55 | return base; | ||
56 | } | ||
57 | |||
58 | |||
59 | static void flush_kernel_map(void *address) | ||
60 | { | ||
61 | if (0 && address && cpu_has_clflush) { | ||
62 | /* is this worth it? */ | ||
63 | int i; | ||
64 | for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) | ||
65 | asm volatile("clflush (%0)" :: "r" (address + i)); | ||
66 | } else | ||
67 | asm volatile("wbinvd":::"memory"); | ||
68 | if (address) | ||
69 | __flush_tlb_one(address); | ||
70 | else | ||
71 | __flush_tlb_all(); | ||
72 | } | ||
73 | |||
74 | |||
75 | static inline void flush_map(unsigned long address) | ||
76 | { | ||
77 | on_each_cpu(flush_kernel_map, (void *)address, 1, 1); | ||
78 | } | ||
79 | |||
80 | struct deferred_page { | ||
81 | struct deferred_page *next; | ||
82 | struct page *fpage; | ||
83 | unsigned long address; | ||
84 | }; | ||
85 | static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */ | ||
86 | |||
87 | static inline void save_page(unsigned long address, struct page *fpage) | ||
88 | { | ||
89 | struct deferred_page *df; | ||
90 | df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); | ||
91 | if (!df) { | ||
92 | flush_map(address); | ||
93 | __free_page(fpage); | ||
94 | } else { | ||
95 | df->next = df_list; | ||
96 | df->fpage = fpage; | ||
97 | df->address = address; | ||
98 | df_list = df; | ||
99 | } | ||
100 | } | ||
101 | |||
102 | /* | ||
103 | * No more special protections in this 2/4MB area - revert to a | ||
104 | * large page again. | ||
105 | */ | ||
106 | static void revert_page(unsigned long address, pgprot_t ref_prot) | ||
107 | { | ||
108 | pgd_t *pgd; | ||
109 | pud_t *pud; | ||
110 | pmd_t *pmd; | ||
111 | pte_t large_pte; | ||
112 | |||
113 | pgd = pgd_offset_k(address); | ||
114 | BUG_ON(pgd_none(*pgd)); | ||
115 | pud = pud_offset(pgd,address); | ||
116 | BUG_ON(pud_none(*pud)); | ||
117 | pmd = pmd_offset(pud, address); | ||
118 | BUG_ON(pmd_val(*pmd) & _PAGE_PSE); | ||
119 | pgprot_val(ref_prot) |= _PAGE_PSE; | ||
120 | large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); | ||
121 | set_pte((pte_t *)pmd, large_pte); | ||
122 | } | ||
123 | |||
124 | static int | ||
125 | __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, | ||
126 | pgprot_t ref_prot) | ||
127 | { | ||
128 | pte_t *kpte; | ||
129 | struct page *kpte_page; | ||
130 | unsigned kpte_flags; | ||
131 | kpte = lookup_address(address); | ||
132 | if (!kpte) return 0; | ||
133 | kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); | ||
134 | kpte_flags = pte_val(*kpte); | ||
135 | if (pgprot_val(prot) != pgprot_val(ref_prot)) { | ||
136 | if ((kpte_flags & _PAGE_PSE) == 0) { | ||
137 | set_pte(kpte, pfn_pte(pfn, prot)); | ||
138 | } else { | ||
139 | /* | ||
140 | * split_large_page will take the reference for this change_page_attr | ||
141 | * on the split page. | ||
142 | */ | ||
143 | struct page *split = split_large_page(address, prot, ref_prot); | ||
144 | if (!split) | ||
145 | return -ENOMEM; | ||
146 | set_pte(kpte,mk_pte(split, ref_prot)); | ||
147 | kpte_page = split; | ||
148 | } | ||
149 | get_page(kpte_page); | ||
150 | } else if ((kpte_flags & _PAGE_PSE) == 0) { | ||
151 | set_pte(kpte, pfn_pte(pfn, ref_prot)); | ||
152 | __put_page(kpte_page); | ||
153 | } else | ||
154 | BUG(); | ||
155 | |||
156 | /* on x86-64 the direct mapping set at boot is not using 4k pages */ | ||
157 | BUG_ON(PageReserved(kpte_page)); | ||
158 | |||
159 | switch (page_count(kpte_page)) { | ||
160 | case 1: | ||
161 | save_page(address, kpte_page); | ||
162 | revert_page(address, ref_prot); | ||
163 | break; | ||
164 | case 0: | ||
165 | BUG(); /* memleak and failed 2M page regeneration */ | ||
166 | } | ||
167 | return 0; | ||
168 | } | ||
169 | |||
170 | /* | ||
171 | * Change the page attributes of an page in the linear mapping. | ||
172 | * | ||
173 | * This should be used when a page is mapped with a different caching policy | ||
174 | * than write-back somewhere - some CPUs do not like it when mappings with | ||
175 | * different caching policies exist. This changes the page attributes of the | ||
176 | * in kernel linear mapping too. | ||
177 | * | ||
178 | * The caller needs to ensure that there are no conflicting mappings elsewhere. | ||
179 | * This function only deals with the kernel linear map. | ||
180 | * | ||
181 | * Caller must call global_flush_tlb() after this. | ||
182 | */ | ||
183 | int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) | ||
184 | { | ||
185 | int err = 0; | ||
186 | int i; | ||
187 | |||
188 | down_write(&init_mm.mmap_sem); | ||
189 | for (i = 0; i < numpages; i++, address += PAGE_SIZE) { | ||
190 | unsigned long pfn = __pa(address) >> PAGE_SHIFT; | ||
191 | |||
192 | err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); | ||
193 | if (err) | ||
194 | break; | ||
195 | /* Handle kernel mapping too which aliases part of the | ||
196 | * lowmem */ | ||
197 | if (__pa(address) < KERNEL_TEXT_SIZE) { | ||
198 | unsigned long addr2; | ||
199 | pgprot_t prot2 = prot; | ||
200 | addr2 = __START_KERNEL_map + __pa(address); | ||
201 | pgprot_val(prot2) &= ~_PAGE_NX; | ||
202 | err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC); | ||
203 | } | ||
204 | } | ||
205 | up_write(&init_mm.mmap_sem); | ||
206 | return err; | ||
207 | } | ||
208 | |||
209 | /* Don't call this for MMIO areas that may not have a mem_map entry */ | ||
210 | int change_page_attr(struct page *page, int numpages, pgprot_t prot) | ||
211 | { | ||
212 | unsigned long addr = (unsigned long)page_address(page); | ||
213 | return change_page_attr_addr(addr, numpages, prot); | ||
214 | } | ||
215 | |||
216 | void global_flush_tlb(void) | ||
217 | { | ||
218 | struct deferred_page *df, *next_df; | ||
219 | |||
220 | down_read(&init_mm.mmap_sem); | ||
221 | df = xchg(&df_list, NULL); | ||
222 | up_read(&init_mm.mmap_sem); | ||
223 | if (!df) | ||
224 | return; | ||
225 | flush_map((df && !df->next) ? df->address : 0); | ||
226 | for (; df; df = next_df) { | ||
227 | next_df = df->next; | ||
228 | if (df->fpage) | ||
229 | __free_page(df->fpage); | ||
230 | kfree(df); | ||
231 | } | ||
232 | } | ||
233 | |||
234 | EXPORT_SYMBOL(change_page_attr); | ||
235 | EXPORT_SYMBOL(global_flush_tlb); | ||
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c new file mode 100644 index 000000000000..5d01b31472e1 --- /dev/null +++ b/arch/x86_64/mm/srat.c | |||
@@ -0,0 +1,217 @@ | |||
1 | /* | ||
2 | * ACPI 3.0 based NUMA setup | ||
3 | * Copyright 2004 Andi Kleen, SuSE Labs. | ||
4 | * | ||
5 | * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. | ||
6 | * | ||
7 | * Called from acpi_numa_init while reading the SRAT and SLIT tables. | ||
8 | * Assumes all memory regions belonging to a single proximity domain | ||
9 | * are in one chunk. Holes between them will be included in the node. | ||
10 | */ | ||
11 | |||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/acpi.h> | ||
14 | #include <linux/mmzone.h> | ||
15 | #include <linux/bitmap.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/topology.h> | ||
18 | #include <asm/proto.h> | ||
19 | #include <asm/numa.h> | ||
20 | |||
21 | static struct acpi_table_slit *acpi_slit; | ||
22 | |||
23 | static nodemask_t nodes_parsed __initdata; | ||
24 | static nodemask_t nodes_found __initdata; | ||
25 | static struct node nodes[MAX_NUMNODES] __initdata; | ||
26 | static __u8 pxm2node[256] = { [0 ... 255] = 0xff }; | ||
27 | |||
28 | static __init int setup_node(int pxm) | ||
29 | { | ||
30 | unsigned node = pxm2node[pxm]; | ||
31 | if (node == 0xff) { | ||
32 | if (nodes_weight(nodes_found) >= MAX_NUMNODES) | ||
33 | return -1; | ||
34 | node = first_unset_node(nodes_found); | ||
35 | node_set(node, nodes_found); | ||
36 | pxm2node[pxm] = node; | ||
37 | } | ||
38 | return pxm2node[pxm]; | ||
39 | } | ||
40 | |||
41 | static __init int conflicting_nodes(unsigned long start, unsigned long end) | ||
42 | { | ||
43 | int i; | ||
44 | for_each_online_node(i) { | ||
45 | struct node *nd = &nodes[i]; | ||
46 | if (nd->start == nd->end) | ||
47 | continue; | ||
48 | if (nd->end > start && nd->start < end) | ||
49 | return 1; | ||
50 | if (nd->end == end && nd->start == start) | ||
51 | return 1; | ||
52 | } | ||
53 | return -1; | ||
54 | } | ||
55 | |||
56 | static __init void cutoff_node(int i, unsigned long start, unsigned long end) | ||
57 | { | ||
58 | struct node *nd = &nodes[i]; | ||
59 | if (nd->start < start) { | ||
60 | nd->start = start; | ||
61 | if (nd->end < nd->start) | ||
62 | nd->start = nd->end; | ||
63 | } | ||
64 | if (nd->end > end) { | ||
65 | if (!(end & 0xfff)) | ||
66 | end--; | ||
67 | nd->end = end; | ||
68 | if (nd->start > nd->end) | ||
69 | nd->start = nd->end; | ||
70 | } | ||
71 | } | ||
72 | |||
73 | static __init void bad_srat(void) | ||
74 | { | ||
75 | printk(KERN_ERR "SRAT: SRAT not used.\n"); | ||
76 | acpi_numa = -1; | ||
77 | } | ||
78 | |||
79 | static __init inline int srat_disabled(void) | ||
80 | { | ||
81 | return numa_off || acpi_numa < 0; | ||
82 | } | ||
83 | |||
84 | /* Callback for SLIT parsing */ | ||
85 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) | ||
86 | { | ||
87 | acpi_slit = slit; | ||
88 | } | ||
89 | |||
90 | /* Callback for Proximity Domain -> LAPIC mapping */ | ||
91 | void __init | ||
92 | acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) | ||
93 | { | ||
94 | int pxm, node; | ||
95 | if (srat_disabled() || pa->flags.enabled == 0) | ||
96 | return; | ||
97 | pxm = pa->proximity_domain; | ||
98 | node = setup_node(pxm); | ||
99 | if (node < 0) { | ||
100 | printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); | ||
101 | bad_srat(); | ||
102 | return; | ||
103 | } | ||
104 | if (pa->apic_id >= NR_CPUS) { | ||
105 | printk(KERN_ERR "SRAT: lapic %u too large.\n", | ||
106 | pa->apic_id); | ||
107 | bad_srat(); | ||
108 | return; | ||
109 | } | ||
110 | cpu_to_node[pa->apic_id] = node; | ||
111 | acpi_numa = 1; | ||
112 | printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", | ||
113 | pxm, pa->apic_id, node); | ||
114 | } | ||
115 | |||
116 | /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ | ||
117 | void __init | ||
118 | acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) | ||
119 | { | ||
120 | struct node *nd; | ||
121 | unsigned long start, end; | ||
122 | int node, pxm; | ||
123 | int i; | ||
124 | |||
125 | if (srat_disabled() || ma->flags.enabled == 0) | ||
126 | return; | ||
127 | /* hotplug bit is ignored for now */ | ||
128 | pxm = ma->proximity_domain; | ||
129 | node = setup_node(pxm); | ||
130 | if (node < 0) { | ||
131 | printk(KERN_ERR "SRAT: Too many proximity domains.\n"); | ||
132 | bad_srat(); | ||
133 | return; | ||
134 | } | ||
135 | start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); | ||
136 | end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); | ||
137 | i = conflicting_nodes(start, end); | ||
138 | if (i >= 0) { | ||
139 | printk(KERN_ERR | ||
140 | "SRAT: pxm %d overlap %lx-%lx with node %d(%Lx-%Lx)\n", | ||
141 | pxm, start, end, i, nodes[i].start, nodes[i].end); | ||
142 | bad_srat(); | ||
143 | return; | ||
144 | } | ||
145 | nd = &nodes[node]; | ||
146 | if (!node_test_and_set(node, nodes_parsed)) { | ||
147 | nd->start = start; | ||
148 | nd->end = end; | ||
149 | } else { | ||
150 | if (start < nd->start) | ||
151 | nd->start = start; | ||
152 | if (nd->end < end) | ||
153 | nd->end = end; | ||
154 | } | ||
155 | if (!(nd->end & 0xfff)) | ||
156 | nd->end--; | ||
157 | printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, | ||
158 | nd->start, nd->end); | ||
159 | } | ||
160 | |||
161 | void __init acpi_numa_arch_fixup(void) {} | ||
162 | |||
163 | /* Use the information discovered above to actually set up the nodes. */ | ||
164 | int __init acpi_scan_nodes(unsigned long start, unsigned long end) | ||
165 | { | ||
166 | int i; | ||
167 | if (acpi_numa <= 0) | ||
168 | return -1; | ||
169 | memnode_shift = compute_hash_shift(nodes, nodes_weight(nodes_parsed)); | ||
170 | if (memnode_shift < 0) { | ||
171 | printk(KERN_ERR | ||
172 | "SRAT: No NUMA node hash function found. Contact maintainer\n"); | ||
173 | bad_srat(); | ||
174 | return -1; | ||
175 | } | ||
176 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
177 | if (!node_isset(i, nodes_parsed)) | ||
178 | continue; | ||
179 | cutoff_node(i, start, end); | ||
180 | if (nodes[i].start == nodes[i].end) { | ||
181 | node_clear(i, nodes_parsed); | ||
182 | continue; | ||
183 | } | ||
184 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
185 | } | ||
186 | for (i = 0; i < NR_CPUS; i++) { | ||
187 | if (cpu_to_node[i] == NUMA_NO_NODE) | ||
188 | continue; | ||
189 | if (!node_isset(cpu_to_node[i], nodes_parsed)) | ||
190 | cpu_to_node[i] = NUMA_NO_NODE; | ||
191 | } | ||
192 | numa_init_array(); | ||
193 | return 0; | ||
194 | } | ||
195 | |||
196 | int node_to_pxm(int n) | ||
197 | { | ||
198 | int i; | ||
199 | if (pxm2node[n] == n) | ||
200 | return n; | ||
201 | for (i = 0; i < 256; i++) | ||
202 | if (pxm2node[i] == n) | ||
203 | return i; | ||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | int __node_distance(int a, int b) | ||
208 | { | ||
209 | int index; | ||
210 | |||
211 | if (!acpi_slit) | ||
212 | return a == b ? 10 : 20; | ||
213 | index = acpi_slit->localities * node_to_pxm(a); | ||
214 | return acpi_slit->entry[index + node_to_pxm(b)]; | ||
215 | } | ||
216 | |||
217 | EXPORT_SYMBOL(__node_distance); | ||
diff --git a/arch/x86_64/oprofile/Kconfig b/arch/x86_64/oprofile/Kconfig new file mode 100644 index 000000000000..5ade19801b97 --- /dev/null +++ b/arch/x86_64/oprofile/Kconfig | |||
@@ -0,0 +1,23 @@ | |||
1 | |||
2 | menu "Profiling support" | ||
3 | depends on EXPERIMENTAL | ||
4 | |||
5 | config PROFILING | ||
6 | bool "Profiling support (EXPERIMENTAL)" | ||
7 | help | ||
8 | Say Y here to enable the extended profiling support mechanisms used | ||
9 | by profilers such as OProfile. | ||
10 | |||
11 | |||
12 | config OPROFILE | ||
13 | tristate "OProfile system profiling (EXPERIMENTAL)" | ||
14 | depends on PROFILING | ||
15 | help | ||
16 | OProfile is a profiling system capable of profiling the | ||
17 | whole system, including the kernel, kernel modules, libraries, | ||
18 | and applications. | ||
19 | |||
20 | If unsure, say N. | ||
21 | |||
22 | endmenu | ||
23 | |||
diff --git a/arch/x86_64/oprofile/Makefile b/arch/x86_64/oprofile/Makefile new file mode 100644 index 000000000000..6be32683e1bc --- /dev/null +++ b/arch/x86_64/oprofile/Makefile | |||
@@ -0,0 +1,19 @@ | |||
1 | # | ||
2 | # oprofile for x86-64. | ||
3 | # Just reuse the one from i386. | ||
4 | # | ||
5 | |||
6 | obj-$(CONFIG_OPROFILE) += oprofile.o | ||
7 | |||
8 | DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ | ||
9 | oprof.o cpu_buffer.o buffer_sync.o \ | ||
10 | event_buffer.o oprofile_files.o \ | ||
11 | oprofilefs.o oprofile_stats.o \ | ||
12 | timer_int.o ) | ||
13 | |||
14 | OPROFILE-y := init.o backtrace.o | ||
15 | OPROFILE-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o op_model_p4.o \ | ||
16 | op_model_ppro.o | ||
17 | OPROFILE-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o | ||
18 | |||
19 | oprofile-y = $(DRIVER_OBJS) $(addprefix ../../i386/oprofile/, $(OPROFILE-y)) | ||
diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile new file mode 100644 index 000000000000..37c92e841dec --- /dev/null +++ b/arch/x86_64/pci/Makefile | |||
@@ -0,0 +1,24 @@ | |||
1 | # | ||
2 | # Makefile for X86_64 specific PCI routines | ||
3 | # | ||
4 | # Reuse the i386 PCI subsystem | ||
5 | # | ||
6 | CFLAGS += -Iarch/i386/pci | ||
7 | |||
8 | obj-y := i386.o | ||
9 | obj-$(CONFIG_PCI_DIRECT)+= direct.o | ||
10 | obj-y += fixup.o | ||
11 | obj-$(CONFIG_ACPI_PCI) += acpi.o | ||
12 | obj-y += legacy.o irq.o common.o | ||
13 | # mmconfig has a 64bit special | ||
14 | obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o | ||
15 | |||
16 | obj-$(CONFIG_NUMA) += k8-bus.o | ||
17 | |||
18 | direct-y += ../../i386/pci/direct.o | ||
19 | acpi-y += ../../i386/pci/acpi.o | ||
20 | legacy-y += ../../i386/pci/legacy.o | ||
21 | irq-y += ../../i386/pci/irq.o | ||
22 | common-y += ../../i386/pci/common.o | ||
23 | fixup-y += ../../i386/pci/fixup.o | ||
24 | i386-y += ../../i386/pci/i386.o | ||
diff --git a/arch/x86_64/pci/Makefile-BUS b/arch/x86_64/pci/Makefile-BUS new file mode 100644 index 000000000000..291985f0d2e4 --- /dev/null +++ b/arch/x86_64/pci/Makefile-BUS | |||
@@ -0,0 +1,22 @@ | |||
1 | # | ||
2 | # Makefile for X86_64 specific PCI routines | ||
3 | # | ||
4 | # Reuse the i386 PCI subsystem | ||
5 | # | ||
6 | CFLAGS += -I arch/i386/pci | ||
7 | |||
8 | obj-y := i386.o | ||
9 | obj-$(CONFIG_PCI_DIRECT)+= direct.o | ||
10 | obj-y += fixup.o | ||
11 | obj-$(CONFIG_ACPI_PCI) += acpi.o | ||
12 | obj-y += legacy.o irq.o common.o | ||
13 | # mmconfig has a 64bit special | ||
14 | obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o | ||
15 | |||
16 | direct-y += ../../i386/pci/direct.o | ||
17 | acpi-y += ../../i386/pci/acpi.o | ||
18 | legacy-y += ../../i386/pci/legacy.o | ||
19 | irq-y += ../../i386/pci/irq.o | ||
20 | common-y += ../../i386/pci/common.o | ||
21 | fixup-y += ../../i386/pci/fixup.o | ||
22 | i386-y += ../../i386/pci/i386.o | ||
diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c new file mode 100644 index 000000000000..62349c78db57 --- /dev/null +++ b/arch/x86_64/pci/k8-bus.c | |||
@@ -0,0 +1,78 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/pci.h> | ||
3 | #include <asm/mpspec.h> | ||
4 | #include <linux/cpumask.h> | ||
5 | |||
6 | /* | ||
7 | * This discovers the pcibus <-> node mapping on AMD K8. | ||
8 | * | ||
9 | * RED-PEN need to call this again on PCI hotplug | ||
10 | * RED-PEN empty cpus get reported wrong | ||
11 | */ | ||
12 | |||
13 | #define NODE_ID_REGISTER 0x60 | ||
14 | #define NODE_ID(dword) (dword & 0x07) | ||
15 | #define LDT_BUS_NUMBER_REGISTER_0 0x94 | ||
16 | #define LDT_BUS_NUMBER_REGISTER_1 0xB4 | ||
17 | #define LDT_BUS_NUMBER_REGISTER_2 0xD4 | ||
18 | #define NR_LDT_BUS_NUMBER_REGISTERS 3 | ||
19 | #define SECONDARY_LDT_BUS_NUMBER(dword) ((dword >> 8) & 0xFF) | ||
20 | #define SUBORDINATE_LDT_BUS_NUMBER(dword) ((dword >> 16) & 0xFF) | ||
21 | #define PCI_DEVICE_ID_K8HTCONFIG 0x1100 | ||
22 | |||
23 | /** | ||
24 | * fill_mp_bus_to_cpumask() | ||
25 | * fills the mp_bus_to_cpumask array based according to the LDT Bus Number | ||
26 | * Registers found in the K8 northbridge | ||
27 | */ | ||
28 | __init static int | ||
29 | fill_mp_bus_to_cpumask(void) | ||
30 | { | ||
31 | struct pci_dev *nb_dev = NULL; | ||
32 | int i, j, printed; | ||
33 | u32 ldtbus, nid; | ||
34 | static int lbnr[3] = { | ||
35 | LDT_BUS_NUMBER_REGISTER_0, | ||
36 | LDT_BUS_NUMBER_REGISTER_1, | ||
37 | LDT_BUS_NUMBER_REGISTER_2 | ||
38 | }; | ||
39 | |||
40 | while ((nb_dev = pci_get_device(PCI_VENDOR_ID_AMD, | ||
41 | PCI_DEVICE_ID_K8HTCONFIG, nb_dev))) { | ||
42 | pci_read_config_dword(nb_dev, NODE_ID_REGISTER, &nid); | ||
43 | |||
44 | for (i = 0; i < NR_LDT_BUS_NUMBER_REGISTERS; i++) { | ||
45 | pci_read_config_dword(nb_dev, lbnr[i], &ldtbus); | ||
46 | /* | ||
47 | * if there are no busses hanging off of the current | ||
48 | * ldt link then both the secondary and subordinate | ||
49 | * bus number fields are set to 0. | ||
50 | */ | ||
51 | if (!(SECONDARY_LDT_BUS_NUMBER(ldtbus) == 0 | ||
52 | && SUBORDINATE_LDT_BUS_NUMBER(ldtbus) == 0)) { | ||
53 | for (j = SECONDARY_LDT_BUS_NUMBER(ldtbus); | ||
54 | j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus); | ||
55 | j++) | ||
56 | pci_bus_to_cpumask[j] = | ||
57 | node_to_cpumask(NODE_ID(nid)); | ||
58 | } | ||
59 | } | ||
60 | } | ||
61 | |||
62 | /* quick sanity check */ | ||
63 | printed = 0; | ||
64 | for (i = 0; i < 256; i++) { | ||
65 | if (cpus_empty(pci_bus_to_cpumask[i])) { | ||
66 | pci_bus_to_cpumask[i] = CPU_MASK_ALL; | ||
67 | if (printed) | ||
68 | continue; | ||
69 | printk(KERN_ERR | ||
70 | "k8-bus.c: some busses have empty cpu mask\n"); | ||
71 | printed = 1; | ||
72 | } | ||
73 | } | ||
74 | |||
75 | return 0; | ||
76 | } | ||
77 | |||
78 | fs_initcall(fill_mp_bus_to_cpumask); | ||
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c new file mode 100644 index 000000000000..b693c232fd07 --- /dev/null +++ b/arch/x86_64/pci/mmconfig.c | |||
@@ -0,0 +1,104 @@ | |||
1 | /* | ||
2 | * mmconfig.c - Low-level direct PCI config space access via MMCONFIG | ||
3 | * | ||
4 | * This is a 64-bit optimized version that always keeps the full mmconfig | ||
5 | * space mapped. This allows lockless config space operation. | ||
6 | */ | ||
7 | |||
8 | #include <linux/pci.h> | ||
9 | #include <linux/init.h> | ||
10 | #include "pci.h" | ||
11 | |||
/*
 * Size of the region mapped by pci_mmcfg_init(): 256 MiB, which matches
 * the addressing in pci_dev_base() (256 busses, 1 MiB of offset per bus).
 */
#define MMCONFIG_APER_SIZE (256*1024*1024)

/*
 * The physical address of the MMCONFIG aperture. Set from ACPI tables.
 * A value of 0 makes pci_mmcfg_init() leave MMCONFIG access disabled.
 */
u32 pci_mmcfg_base_addr;

/*
 * Static virtual mapping of the MMCONFIG aperture.
 * Assigned by pci_mmcfg_init() from ioremap_nocache().
 */
char *pci_mmcfg_virt;
19 | |||
20 | static inline char *pci_dev_base(unsigned int bus, unsigned int devfn) | ||
21 | { | ||
22 | return pci_mmcfg_virt + ((bus << 20) | (devfn << 12)); | ||
23 | } | ||
24 | |||
25 | static int pci_mmcfg_read(unsigned int seg, unsigned int bus, | ||
26 | unsigned int devfn, int reg, int len, u32 *value) | ||
27 | { | ||
28 | char *addr = pci_dev_base(bus, devfn); | ||
29 | |||
30 | if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095))) | ||
31 | return -EINVAL; | ||
32 | |||
33 | switch (len) { | ||
34 | case 1: | ||
35 | *value = readb(addr + reg); | ||
36 | break; | ||
37 | case 2: | ||
38 | *value = readw(addr + reg); | ||
39 | break; | ||
40 | case 4: | ||
41 | *value = readl(addr + reg); | ||
42 | break; | ||
43 | } | ||
44 | |||
45 | return 0; | ||
46 | } | ||
47 | |||
48 | static int pci_mmcfg_write(unsigned int seg, unsigned int bus, | ||
49 | unsigned int devfn, int reg, int len, u32 value) | ||
50 | { | ||
51 | char *addr = pci_dev_base(bus,devfn); | ||
52 | |||
53 | if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) | ||
54 | return -EINVAL; | ||
55 | |||
56 | switch (len) { | ||
57 | case 1: | ||
58 | writeb(value, addr + reg); | ||
59 | break; | ||
60 | case 2: | ||
61 | writew(value, addr + reg); | ||
62 | break; | ||
63 | case 4: | ||
64 | writel(value, addr + reg); | ||
65 | break; | ||
66 | } | ||
67 | |||
68 | return 0; | ||
69 | } | ||
70 | |||
/*
 * Raw config-space accessors backed by the MMCONFIG mapping.
 * pci_mmcfg_init() installs this table as raw_pci_ops once the
 * aperture has been mapped.
 */
static struct pci_raw_ops pci_mmcfg = {
	.read = pci_mmcfg_read,
	.write = pci_mmcfg_write,
};
75 | |||
76 | static int __init pci_mmcfg_init(void) | ||
77 | { | ||
78 | if ((pci_probe & PCI_PROBE_MMCONF) == 0) | ||
79 | return 0; | ||
80 | if (!pci_mmcfg_base_addr) | ||
81 | return 0; | ||
82 | |||
83 | /* Kludge for now. Don't use mmconfig on AMD systems because | ||
84 | those have some busses where mmconfig doesn't work, | ||
85 | and we don't parse ACPI MCFG well enough to handle that. | ||
86 | Remove when proper handling is added. */ | ||
87 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) | ||
88 | return 0; | ||
89 | |||
90 | /* RED-PEN i386 doesn't do _nocache right now */ | ||
91 | pci_mmcfg_virt = ioremap_nocache(pci_mmcfg_base_addr, MMCONFIG_APER_SIZE); | ||
92 | if (!pci_mmcfg_virt) { | ||
93 | printk("PCI: Cannot map mmconfig aperture\n"); | ||
94 | return 0; | ||
95 | } | ||
96 | |||
97 | printk(KERN_INFO "PCI: Using MMCONFIG at %x\n", pci_mmcfg_base_addr); | ||
98 | raw_pci_ops = &pci_mmcfg; | ||
99 | pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; | ||
100 | |||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | arch_initcall(pci_mmcfg_init); | ||