Diffstat (limited to 'arch/x86_64')
-rw-r--r--  arch/x86_64/Kconfig                       |  477
-rw-r--r--  arch/x86_64/Kconfig.debug                 |   57
-rw-r--r--  arch/x86_64/Makefile                      |  119
-rw-r--r--  arch/x86_64/boot/Makefile                 |  102
-rw-r--r--  arch/x86_64/boot/bootsect.S               |   98
-rw-r--r--  arch/x86_64/boot/compressed/Makefile      |   32
-rw-r--r--  arch/x86_64/boot/compressed/head.S        |  142
-rw-r--r--  arch/x86_64/boot/compressed/misc.c        |  354
-rw-r--r--  arch/x86_64/boot/compressed/miscsetup.h   |   39
-rw-r--r--  arch/x86_64/boot/compressed/vmlinux.scr   |    9
-rw-r--r--  arch/x86_64/boot/install.sh               |   40
-rw-r--r--  arch/x86_64/boot/mtools.conf.in           |   17
-rw-r--r--  arch/x86_64/boot/setup.S                  |  867
-rw-r--r--  arch/x86_64/boot/tools/build.c            |  186
-rw-r--r--  arch/x86_64/boot/video.S                  | 2007
-rw-r--r--  arch/x86_64/defconfig                     | 1129
-rw-r--r--  arch/x86_64/ia32/Makefile                 |   32
-rw-r--r--  arch/x86_64/ia32/fpu32.c                  |  184
-rw-r--r--  arch/x86_64/ia32/ia32_aout.c              |  529
-rw-r--r--  arch/x86_64/ia32/ia32_binfmt.c            |  434
-rw-r--r--  arch/x86_64/ia32/ia32_ioctl.c             |  201
-rw-r--r--  arch/x86_64/ia32/ia32_signal.c            |  621
-rw-r--r--  arch/x86_64/ia32/ia32entry.S              |  602
-rw-r--r--  arch/x86_64/ia32/ipc32.c                  |   57
-rw-r--r--  arch/x86_64/ia32/ptrace32.c               |  379
-rw-r--r--  arch/x86_64/ia32/sys_ia32.c               | 1050
-rw-r--r--  arch/x86_64/ia32/syscall32.c              |  111
-rw-r--r--  arch/x86_64/ia32/tls32.c                  |  163
-rw-r--r--  arch/x86_64/ia32/vsyscall-sigreturn.S     |  120
-rw-r--r--  arch/x86_64/ia32/vsyscall-syscall.S       |   68
-rw-r--r--  arch/x86_64/ia32/vsyscall-sysenter.S      |   94
-rw-r--r--  arch/x86_64/ia32/vsyscall.lds             |   77
-rw-r--r--  arch/x86_64/kernel/Makefile               |   45
-rw-r--r--  arch/x86_64/kernel/acpi/Makefile          |    3
-rw-r--r--  arch/x86_64/kernel/acpi/sleep.c           |  132
-rw-r--r--  arch/x86_64/kernel/acpi/wakeup.S          |  527
-rw-r--r--  arch/x86_64/kernel/aperture.c             |  286
-rw-r--r--  arch/x86_64/kernel/apic.c                 | 1088
-rw-r--r--  arch/x86_64/kernel/asm-offsets.c          |   69
-rw-r--r--  arch/x86_64/kernel/cpufreq/Kconfig        |   96
-rw-r--r--  arch/x86_64/kernel/cpufreq/Makefile       |   17
-rw-r--r--  arch/x86_64/kernel/e820.c                 |  513
-rw-r--r--  arch/x86_64/kernel/early_printk.c         |  220
-rw-r--r--  arch/x86_64/kernel/entry.S                |  920
-rw-r--r--  arch/x86_64/kernel/genapic.c              |   89
-rw-r--r--  arch/x86_64/kernel/genapic_cluster.c      |  130
-rw-r--r--  arch/x86_64/kernel/genapic_flat.c         |  127
-rw-r--r--  arch/x86_64/kernel/head.S                 |  396
-rw-r--r--  arch/x86_64/kernel/head64.c               |  117
-rw-r--r--  arch/x86_64/kernel/i387.c                 |  155
-rw-r--r--  arch/x86_64/kernel/i8259.c                |  579
-rw-r--r--  arch/x86_64/kernel/init_task.c            |   49
-rw-r--r--  arch/x86_64/kernel/io_apic.c              | 1982
-rw-r--r--  arch/x86_64/kernel/ioport.c               |  117
-rw-r--r--  arch/x86_64/kernel/irq.c                  |  108
-rw-r--r--  arch/x86_64/kernel/kprobes.c              |  631
-rw-r--r--  arch/x86_64/kernel/ldt.c                  |  253
-rw-r--r--  arch/x86_64/kernel/mce.c                  |  548
-rw-r--r--  arch/x86_64/kernel/mce_intel.c            |   99
-rw-r--r--  arch/x86_64/kernel/module.c               |  166
-rw-r--r--  arch/x86_64/kernel/mpparse.c              |  949
-rw-r--r--  arch/x86_64/kernel/msr.c                  |  279
-rw-r--r--  arch/x86_64/kernel/nmi.c                  |  488
-rw-r--r--  arch/x86_64/kernel/pci-dma.c              |   60
-rw-r--r--  arch/x86_64/kernel/pci-gart.c             |  980
-rw-r--r--  arch/x86_64/kernel/pci-nommu.c            |   94
-rw-r--r--  arch/x86_64/kernel/process.c              |  770
-rw-r--r--  arch/x86_64/kernel/ptrace.c               |  547
-rw-r--r--  arch/x86_64/kernel/reboot.c               |  163
-rw-r--r--  arch/x86_64/kernel/semaphore.c            |  180
-rw-r--r--  arch/x86_64/kernel/setup.c                | 1189
-rw-r--r--  arch/x86_64/kernel/setup64.c              |  292
-rw-r--r--  arch/x86_64/kernel/signal.c               |  486
-rw-r--r--  arch/x86_64/kernel/smp.c                  |  415
-rw-r--r--  arch/x86_64/kernel/smpboot.c              |  938
-rw-r--r--  arch/x86_64/kernel/suspend.c              |  157
-rw-r--r--  arch/x86_64/kernel/suspend_asm.S          |  104
-rw-r--r--  arch/x86_64/kernel/sys_x86_64.c           |  173
-rw-r--r--  arch/x86_64/kernel/syscall.c              |   26
-rw-r--r--  arch/x86_64/kernel/time.c                 | 1262
-rw-r--r--  arch/x86_64/kernel/trampoline.S           |   64
-rw-r--r--  arch/x86_64/kernel/traps.c                |  948
-rw-r--r--  arch/x86_64/kernel/vmlinux.lds.S          |  164
-rw-r--r--  arch/x86_64/kernel/vsyscall.c             |  225
-rw-r--r--  arch/x86_64/kernel/x8664_ksyms.c          |  221
-rw-r--r--  arch/x86_64/lib/Makefile                  |   14
-rw-r--r--  arch/x86_64/lib/bitops.c                  |  141
-rw-r--r--  arch/x86_64/lib/bitstr.c                  |   28
-rw-r--r--  arch/x86_64/lib/clear_page.S              |   50
-rw-r--r--  arch/x86_64/lib/copy_page.S               |  101
-rw-r--r--  arch/x86_64/lib/copy_user.S               |  294
-rw-r--r--  arch/x86_64/lib/csum-copy.S               |  233
-rw-r--r--  arch/x86_64/lib/csum-partial.c            |  150
-rw-r--r--  arch/x86_64/lib/csum-wrappers.c           |  129
-rw-r--r--  arch/x86_64/lib/dec_and_lock.c            |   40
-rw-r--r--  arch/x86_64/lib/delay.c                   |   48
-rw-r--r--  arch/x86_64/lib/getuser.S                 |  101
-rw-r--r--  arch/x86_64/lib/io.c                      |   23
-rw-r--r--  arch/x86_64/lib/memcpy.S                  |  121
-rw-r--r--  arch/x86_64/lib/memmove.c                 |   19
-rw-r--r--  arch/x86_64/lib/memset.S                  |  125
-rw-r--r--  arch/x86_64/lib/putuser.S                 |   89
-rw-r--r--  arch/x86_64/lib/thunk.S                   |   95
-rw-r--r--  arch/x86_64/lib/usercopy.c                |  153
-rw-r--r--  arch/x86_64/mm/Makefile                   |   11
-rw-r--r--  arch/x86_64/mm/extable.c                  |   35
-rw-r--r--  arch/x86_64/mm/fault.c                    |  579
-rw-r--r--  arch/x86_64/mm/init.c                     |  630
-rw-r--r--  arch/x86_64/mm/ioremap.c                  |  283
-rw-r--r--  arch/x86_64/mm/k8topology.c               |  168
-rw-r--r--  arch/x86_64/mm/numa.c                     |  294
-rw-r--r--  arch/x86_64/mm/pageattr.c                 |  235
-rw-r--r--  arch/x86_64/mm/srat.c                     |  217
-rw-r--r--  arch/x86_64/oprofile/Kconfig              |   23
-rw-r--r--  arch/x86_64/oprofile/Makefile             |   19
-rw-r--r--  arch/x86_64/pci/Makefile                  |   24
-rw-r--r--  arch/x86_64/pci/Makefile-BUS              |   22
-rw-r--r--  arch/x86_64/pci/k8-bus.c                  |   78
-rw-r--r--  arch/x86_64/pci/mmconfig.c                |  104
119 files changed, 35709 insertions, 0 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
new file mode 100644
index 000000000000..80c38c5d71fe
--- /dev/null
+++ b/arch/x86_64/Kconfig
@@ -0,0 +1,477 @@
1#
2# For a description of the syntax of this configuration file,
3# see Documentation/kbuild/kconfig-language.txt.
4#
5# Note: ISA is disabled and will hopefully never be enabled.
6# If you managed to buy an ISA x86-64 box you'll have to fix all the
7# ISA drivers you need yourself.
8#
9
10mainmenu "Linux Kernel Configuration"
11
12config X86_64
13 bool
14 default y
15 help
16 Port to the x86-64 architecture. x86-64 is a 64-bit extension to the
17 classical 32-bit x86 architecture. For details see
18 <http://www.x86-64.org/>.
19
20config 64BIT
21 def_bool y
22
23config X86
24 bool
25 default y
26
27config MMU
28 bool
29 default y
30
31config ISA
32 bool
33
34config SBUS
35 bool
36
37config RWSEM_GENERIC_SPINLOCK
38 bool
39 default y
40
41config RWSEM_XCHGADD_ALGORITHM
42 bool
43
44config GENERIC_CALIBRATE_DELAY
45 bool
46 default y
47
48config X86_CMPXCHG
49 bool
50 default y
51
52config EARLY_PRINTK
53 bool
54 default y
55
56config GENERIC_ISA_DMA
57 bool
58 default y
59
60config GENERIC_IOMAP
61 bool
62 default y
63
64source "init/Kconfig"
65
66
67menu "Processor type and features"
68
69choice
70 prompt "Processor family"
71 default MK8
72
73config MK8
74 bool "AMD-Opteron/Athlon64"
75 help
76 Optimize for AMD Opteron/Athlon64/Hammer/K8 CPUs.
77
78config MPSC
79 bool "Intel EM64T"
80 help
81 Optimize for Intel Pentium 4 and Xeon CPUs with Intel
82 Extended Memory 64 Technology(EM64T). For details see
83 <http://www.intel.com/technology/64bitextensions/>.
84
85config GENERIC_CPU
86 bool "Generic-x86-64"
87 help
88 Generic x86-64 CPU.
89
90endchoice
91
92#
93# Define implied options from the CPU selection here
94#
95config X86_L1_CACHE_BYTES
96 int
97 default "128" if GENERIC_CPU || MPSC
98 default "64" if MK8
99
100config X86_L1_CACHE_SHIFT
101 int
102 default "7" if GENERIC_CPU || MPSC
103 default "6" if MK8
104
105config X86_TSC
106 bool
107 default y
108
109config X86_GOOD_APIC
110 bool
111 default y
112
113config MICROCODE
114 tristate "/dev/cpu/microcode - Intel CPU microcode support"
115 ---help---
116	  If you say Y here, you will be
117 able to update the microcode on Intel processors. You will
118 obviously need the actual microcode binary data itself which is
119 not shipped with the Linux kernel.
120
121 For latest news and information on obtaining all the required
122 ingredients for this driver, check:
123 <http://www.urbanmyth.org/microcode/>.
124
125 To compile this driver as a module, choose M here: the
126 module will be called microcode.
127 If you use modprobe or kmod you may also want to add the line
128 'alias char-major-10-184 microcode' to your /etc/modules.conf file.
129
130config X86_MSR
131 tristate "/dev/cpu/*/msr - Model-specific register support"
132 help
133 This device gives privileged processes access to the x86
134 Model-Specific Registers (MSRs). It is a character device with
135 major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr.
136 MSR accesses are directed to a specific CPU on multi-processor
137 systems.
138
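	A minimal user-space sketch of the interface described above (assuming, as the msr driver implements, that the read offset selects the MSR index and each read returns the 64-bit register; needs the privileges mentioned in the help text):

		#include <fcntl.h>
		#include <stdint.h>
		#include <stdio.h>
		#include <unistd.h>

		int main(void)
		{
			uint64_t val;
			int fd = open("/dev/cpu/0/msr", O_RDONLY);	/* major 202, minor 0 */
			if (fd < 0)
				return 1;
			/* file offset = MSR index; 0x10 is the TSC */
			if (pread(fd, &val, sizeof(val), 0x10) != sizeof(val))
				return 1;
			printf("MSR 0x10 = %#llx\n", (unsigned long long)val);
			return 0;
		}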
139config X86_CPUID
140 tristate "/dev/cpu/*/cpuid - CPU information support"
141 help
142 This device gives processes access to the x86 CPUID instruction to
143 be executed on a specific processor. It is a character device
144 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
145 /dev/cpu/31/cpuid.
146
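	Similarly, a sketch for the cpuid device (assuming each 16-byte read at offset N returns EAX, EBX, ECX, EDX for CPUID level N, which is how the driver behaves):

		#include <fcntl.h>
		#include <stdint.h>
		#include <stdio.h>
		#include <unistd.h>

		int main(void)
		{
			uint32_t regs[4];	/* eax, ebx, ecx, edx */
			int fd = open("/dev/cpu/0/cpuid", O_RDONLY);	/* major 203, minor 0 */
			if (fd < 0)
				return 1;
			if (pread(fd, regs, sizeof(regs), 0) != sizeof(regs))	/* level 0 */
				return 1;
			/* at level 0, EBX/EDX/ECX hold the 12-byte vendor string */
			printf("max level %u, vendor %.4s%.4s%.4s\n",
			       regs[0], (char *)&regs[1], (char *)&regs[3], (char *)&regs[2]);
			return 0;
		}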
147# disable it for opteron optimized builds because it pulls in ACPI_BOOT
148config X86_HT
149 bool
150 depends on SMP && !MK8
151 default y
152
153config MATH_EMULATION
154 bool
155
156config MCA
157 bool
158
159config EISA
160 bool
161
162config X86_IO_APIC
163 bool
164 default y
165
166config X86_LOCAL_APIC
167 bool
168 default y
169
170config MTRR
171 bool "MTRR (Memory Type Range Register) support"
172 ---help---
173 On Intel P6 family processors (Pentium Pro, Pentium II and later)
174 the Memory Type Range Registers (MTRRs) may be used to control
175 processor access to memory ranges. This is most useful if you have
176 a video (VGA) card on a PCI or AGP bus. Enabling write-combining
177 allows bus write transfers to be combined into a larger transfer
178 before bursting over the PCI/AGP bus. This can increase performance
179 of image write operations 2.5 times or more. Saying Y here creates a
180 /proc/mtrr file which may be used to manipulate your processor's
181 MTRRs. Typically the X server should use this.
182
183 This code has a reasonably generic interface so that similar
184 control registers on other processors can be easily supported
185 as well.
186
187 Saying Y here also fixes a problem with buggy SMP BIOSes which only
188 set the MTRRs for the boot CPU and not for the secondary CPUs. This
189 can lead to all sorts of problems, so it's good to say Y here.
190
191 Just say Y here, all x86-64 machines support MTRRs.
192
193 See <file:Documentation/mtrr.txt> for more information.
194
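	The /proc/mtrr file mentioned above accepts textual requests; a minimal sketch (the base/size values are placeholders for a real video aperture):

		#include <fcntl.h>
		#include <string.h>
		#include <unistd.h>

		int main(void)
		{
			/* placeholder range; substitute your card's real aperture */
			const char *req = "base=0xf8000000 size=0x400000 type=write-combining\n";
			int fd = open("/proc/mtrr", O_WRONLY);
			if (fd < 0)
				return 1;
			if (write(fd, req, strlen(req)) < 0)
				return 1;
			close(fd);
			return 0;
		}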
195config SMP
196 bool "Symmetric multi-processing support"
197 ---help---
198 This enables support for systems with more than one CPU. If you have
199 a system with only one CPU, like most personal computers, say N. If
200 you have a system with more than one CPU, say Y.
201
202 If you say N here, the kernel will run on single and multiprocessor
203 machines, but will use only one CPU of a multiprocessor machine. If
204 you say Y here, the kernel will run on many, but not all,
205 singleprocessor machines. On a singleprocessor machine, the kernel
206 will run faster if you say N here.
207
208 If you don't know what to do here, say N.
209
210config PREEMPT
211 bool "Preemptible Kernel"
212 ---help---
213 This option reduces the latency of the kernel when reacting to
214 real-time or interactive events by allowing a low priority process to
215 be preempted even if it is in kernel mode executing a system call.
216 This allows applications to run more reliably even when the system is
217	  under load. On the contrary, it may also break your drivers and add
218 priority inheritance problems to your system. Don't select it if
219 you rely on a stable system or have slightly obscure hardware.
220 It's also not very well tested on x86-64 currently.
221 You have been warned.
222
223 Say Y here if you are feeling brave and building a kernel for a
224 desktop, embedded or real-time system. Say N if you are unsure.
225
226config PREEMPT_BKL
227 bool "Preempt The Big Kernel Lock"
228 depends on PREEMPT
229 default y
230 help
231 This option reduces the latency of the kernel by making the
232 big kernel lock preemptible.
233
234 Say Y here if you are building a kernel for a desktop system.
235 Say N if you are unsure.
236
237config SCHED_SMT
238 bool "SMT (Hyperthreading) scheduler support"
239 depends on SMP
240 default n
241 help
242 SMT scheduler support improves the CPU scheduler's decision making
243 when dealing with Intel Pentium 4 chips with HyperThreading at a
244 cost of slightly increased overhead in some places. If unsure say
245 N here.
246
247config K8_NUMA
248 bool "K8 NUMA support"
249 select NUMA
250 depends on SMP
251 help
252	  Enable NUMA (Non-Uniform Memory Access) support for
253 AMD Opteron Multiprocessor systems. The kernel will try to allocate
254 memory used by a CPU on the local memory controller of the CPU
255 and add some more NUMA awareness to the kernel.
256 This code is recommended on all multiprocessor Opteron systems
257 and normally doesn't hurt on others.
258
259config NUMA_EMU
260 bool "NUMA emulation support"
261 select NUMA
262 depends on SMP
263 help
264 Enable NUMA emulation. A flat machine will be split
265 into virtual nodes when booted with "numa=fake=N", where N is the
266 number of nodes. This is only useful for debugging.
267
268config DISCONTIGMEM
269 bool
270 depends on NUMA
271 default y
272
273config NUMA
274 bool
275 default n
276
277config HAVE_DEC_LOCK
278 bool
279 depends on SMP
280 default y
281
282config NR_CPUS
283 int "Maximum number of CPUs (2-256)"
284 range 2 256
285 depends on SMP
286 default "8"
287 help
288 This allows you to specify the maximum number of CPUs which this
289 kernel will support. Current maximum is 256 CPUs due to
290	  APIC addressing limits. It may be less, depending on the hardware.
291
292 This is purely to save memory - each supported CPU requires
293 memory in the static kernel configuration.
294
295config HPET_TIMER
296 bool
297 default y
298 help
299 Use the IA-PC HPET (High Precision Event Timer) to manage
300 time in preference to the PIT and RTC, if a HPET is
301 present. The HPET provides a stable time base on SMP
302 systems, unlike the TSC, but it is more expensive to access,
303 as it is off-chip. You can find the HPET spec at
304 <http://www.intel.com/labs/platcomp/hpet/hpetspec.htm>.
305
306config HPET_EMULATE_RTC
307 bool "Provide RTC interrupt"
308 depends on HPET_TIMER && RTC=y
309
310config GART_IOMMU
311 bool "IOMMU support"
312 depends on PCI
313 help
314 Support the K8 IOMMU. Needed to run systems with more than 4GB of memory
315 properly with 32-bit PCI devices that do not support DAC (Double Address
316 Cycle). The IOMMU can be turned off at runtime with the iommu=off parameter.
317	  Normally the kernel will make the right choice by itself.
318 If unsure, say Y.
319
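	To illustrate what the IOMMU buys a driver, a hedged sketch using the 2.6-era PCI DMA API (my_start_dma, my_buf and the device-programming step are placeholders, not part of this patch):

		#include <linux/pci.h>
		#include <linux/errno.h>

		static int my_start_dma(struct pci_dev *pdev, void *my_buf, size_t len)
		{
			dma_addr_t bus;

			/* the device cannot do DAC: restrict it to 32-bit addressing */
			if (pci_set_dma_mask(pdev, 0xffffffffULL))
				return -EIO;

			/* with GART_IOMMU this may return a remapped address below
			   4GB even when my_buf itself sits above 4GB */
			bus = pci_map_single(pdev, my_buf, len, PCI_DMA_TODEVICE);

			/* ... program 'bus' into the device and start the transfer ... */

			pci_unmap_single(pdev, bus, len, PCI_DMA_TODEVICE);
			return 0;
		}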
320# need this always enabled with GART_IOMMU for the VIA workaround
321config SWIOTLB
322 bool
323 depends on GART_IOMMU
324 default y
325
326config DUMMY_IOMMU
327 bool
328 depends on !GART_IOMMU && !SWIOTLB
329 default y
330 help
331 Don't use IOMMU code. This will cause problems when you have more than 4GB
332 of memory and any 32-bit devices. Don't turn on unless you know what you
333 are doing.
334
335config X86_MCE
336 bool "Machine check support" if EMBEDDED
337 default y
338 help
339 Include a machine check error handler to report hardware errors.
340 This version will require the mcelog utility to decode some
341 machine check error logs. See
342 ftp://ftp.x86-64.org/pub/linux/tools/mcelog
343
344config X86_MCE_INTEL
345 bool "Intel MCE features"
346 depends on X86_MCE && X86_LOCAL_APIC
347 default y
348 help
349	   Additional support for Intel-specific MCE features such as
350 the thermal monitor.
351
352config SECCOMP
353 bool "Enable seccomp to safely compute untrusted bytecode"
354 depends on PROC_FS
355 default y
356 help
357 This kernel feature is useful for number crunching applications
358 that may need to compute untrusted bytecode during their
359 execution. By using pipes or other transports made available to
360 the process as file descriptors supporting the read/write
361 syscalls, it's possible to isolate those applications in
362 their own address space using seccomp. Once seccomp is
363 enabled via /proc/<pid>/seccomp, it cannot be disabled
364 and the task is only allowed to execute a few safe syscalls
365 defined by each seccomp mode.
366
367	  If unsure, say Y. Only embedded systems should say N here.
368
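	A minimal sketch of the /proc interface named above (assuming writing "1" switches the calling task into the strict mode, after which even close() is fatal):

		#include <fcntl.h>
		#include <unistd.h>

		int main(void)
		{
			int fd = open("/proc/self/seccomp", O_WRONLY);
			if (fd < 0)
				return 1;
			write(fd, "1", 1);	/* the task is sandboxed from here on */
			/* only read/write on already-open fds (and exit) are safe now */
			write(1, "hello from the sandbox\n", 23);
			_exit(0);
		}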
369endmenu
370
371#
372# Use the generic interrupt handling code in kernel/irq/:
373#
374config GENERIC_HARDIRQS
375 bool
376 default y
377
378config GENERIC_IRQ_PROBE
379 bool
380 default y
381
382menu "Power management options"
383
384source kernel/power/Kconfig
385
386source "drivers/acpi/Kconfig"
387
388source "arch/x86_64/kernel/cpufreq/Kconfig"
389
390endmenu
391
392menu "Bus options (PCI etc.)"
393
394config PCI
395 bool "PCI support"
396
397# x86-64 doesn't support PCI BIOS access from long mode so always go direct.
398config PCI_DIRECT
399 bool
400 depends on PCI
401 default y
402
403config PCI_MMCONFIG
404 bool "Support mmconfig PCI config space access"
405 depends on PCI
406 select ACPI_BOOT
407
408config UNORDERED_IO
409 bool "Unordered IO mapping access"
410 depends on EXPERIMENTAL
411 help
412 Use unordered stores to access IO memory mappings in device drivers.
413 Still very experimental. When a driver works on IA64/ppc64/pa-risc it should
414 work with this option, but it makes the drivers behave differently
415	 from i386. Requires that the driver writer uses memory barriers
416 properly.
417
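	A sketch of what "uses memory barriers properly" means for a driver under this option; the helper name and register offsets are invented for illustration:

		#include <linux/types.h>
		#include <asm/io.h>

		static void ring_doorbell(void __iomem *regs, u32 desc_addr)
		{
			writel(desc_addr, regs + 0x10);	/* publish descriptor address */
			wmb();				/* order it before the doorbell */
			writel(1, regs + 0x14);		/* doorbell store */
		}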
418source "drivers/pci/pcie/Kconfig"
419
420source "drivers/pci/Kconfig"
421
422source "drivers/pcmcia/Kconfig"
423
424source "drivers/pci/hotplug/Kconfig"
425
426endmenu
427
428
429menu "Executable file formats / Emulations"
430
431source "fs/Kconfig.binfmt"
432
433config IA32_EMULATION
434 bool "IA32 Emulation"
435 help
436 Include code to run 32-bit programs under a 64-bit kernel. You should likely
437 turn this on, unless you're 100% sure that you don't have any 32-bit programs
438 left.
439
440config IA32_AOUT
441 bool "IA32 a.out support"
442 depends on IA32_EMULATION
443 help
444	  Support old a.out binaries in the 32-bit emulation.
445
446config COMPAT
447 bool
448 depends on IA32_EMULATION
449 default y
450
451config SYSVIPC_COMPAT
452 bool
453 depends on COMPAT && SYSVIPC
454 default y
455
456config UID16
457 bool
458 depends on IA32_EMULATION
459 default y
460
461endmenu
462
463source drivers/Kconfig
464
465source "drivers/firmware/Kconfig"
466
467source fs/Kconfig
468
469source "arch/x86_64/oprofile/Kconfig"
470
471source "arch/x86_64/Kconfig.debug"
472
473source "security/Kconfig"
474
475source "crypto/Kconfig"
476
477source "lib/Kconfig"
diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug
new file mode 100644
index 000000000000..9cf1410d2f5a
--- /dev/null
+++ b/arch/x86_64/Kconfig.debug
@@ -0,0 +1,57 @@
1menu "Kernel hacking"
2
3source "lib/Kconfig.debug"
4
5# !SMP for now because the context switch early causes GPF in segment reloading
6# and the GS base checking does the wrong thing then, causing a hang.
7config CHECKING
8 bool "Additional run-time checks"
9 depends on DEBUG_KERNEL && !SMP
10 help
11 Enables some internal consistency checks for kernel debugging.
12 You should normally say N.
13
14config INIT_DEBUG
15 bool "Debug __init statements"
16 depends on DEBUG_KERNEL
17 help
18 Fill __init and __initdata at the end of boot. This helps debugging
19 illegal uses of __init and __initdata after initialization.
20
21config IOMMU_DEBUG
22 depends on GART_IOMMU && DEBUG_KERNEL
23 bool "Enable IOMMU debugging"
24 help
25	 Force the IOMMU on even when you have less than 4GB of
26	 memory, and add debugging code. On overflow, always panic, and
27	 allow IOMMU leak tracing to be enabled. Can be disabled at boot
28	 time with iommu=noforce. This will also enable scatter-gather
29	 list merging. Currently not recommended for production
30	 code. When you use it, make sure you have a big enough
31	 IOMMU/AGP aperture. Most of the options enabled by this can
32	 be set in a more fine-grained way using the iommu= command line
33	 options. See Documentation/x86_64/boot-options.txt for more
34	 details.
35
36config KPROBES
37 bool "Kprobes"
38 depends on DEBUG_KERNEL
39 help
40 Kprobes allows you to trap at almost any kernel address and
41 execute a callback function. register_kprobe() establishes
42 a probepoint and specifies the callback. Kprobes is useful
43 for kernel debugging, non-intrusive instrumentation and testing.
44 If in doubt, say "N".
45
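	A hedged sketch of the register_kprobe() flow described above, packaged as a module; in this era the probe address is resolved by hand (e.g. from System.map), so the address below is a placeholder:

		#include <linux/module.h>
		#include <linux/kprobes.h>

		static int my_pre(struct kprobe *p, struct pt_regs *regs)
		{
			printk(KERN_INFO "hit probe at %p\n", p->addr);
			return 0;	/* let execution continue normally */
		}

		static struct kprobe kp = {
			.pre_handler = my_pre,
			/* .addr is filled in at init time, below */
		};

		static int __init probe_init(void)
		{
			kp.addr = (kprobe_opcode_t *)0xffffffff80123456UL; /* placeholder */
			return register_kprobe(&kp);
		}

		static void __exit probe_exit(void)
		{
			unregister_kprobe(&kp);
		}

		module_init(probe_init);
		module_exit(probe_exit);
		MODULE_LICENSE("GPL");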
46config IOMMU_LEAK
47 bool "IOMMU leak tracing"
48 depends on DEBUG_KERNEL
49 depends on IOMMU_DEBUG
50 help
51 Add a simple leak tracer to the IOMMU code. This is useful when you
52 are debugging a buggy device driver that leaks IOMMU mappings.
53
54#config X86_REMOTE_DEBUG
55# bool "kgdb debugging stub"
56
57endmenu
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
new file mode 100644
index 000000000000..6f90c246c418
--- /dev/null
+++ b/arch/x86_64/Makefile
@@ -0,0 +1,119 @@
1#
2# x86_64/Makefile
3#
4# This file is included by the global makefile so that you can add your own
5# architecture-specific flags and dependencies. Remember to have actions
6# for "archclean" and "archdep" for cleaning up and making dependencies for
7# this architecture
8#
9# This file is subject to the terms and conditions of the GNU General Public
10# License. See the file "COPYING" in the main directory of this archive
11# for more details.
12#
13# Copyright (C) 1994 by Linus Torvalds
14#
15# 19990713 Artur Skawina <skawina@geocities.com>
16# Added '-march' and '-mpreferred-stack-boundary' support
17# 20000913 Pavel Machek <pavel@suse.cz>
18# Converted for x86_64 architecture
19# 20010105 Andi Kleen, add IA32 compiler.
20# ....and later removed it again....
21#
22# $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $
23
24#
25# early bootup linking needs 32bit. You can either use real 32bit tools
26# here or 64bit tools in 32bit mode.
27#
28IA32_CC := $(CC) $(CPPFLAGS) -m32 -O2 -fomit-frame-pointer
29IA32_LD := $(LD) -m elf_i386
30IA32_AS := $(CC) $(AFLAGS) -m32 -Wa,--32 -traditional -c
31IA32_OBJCOPY := $(CROSS_COMPILE)objcopy
32IA32_CPP := $(CROSS_COMPILE)gcc -m32 -E
33export IA32_CC IA32_LD IA32_AS IA32_OBJCOPY IA32_CPP
34
35
36LDFLAGS := -m elf_x86_64
37OBJCOPYFLAGS := -O binary -R .note -R .comment -S
38LDFLAGS_vmlinux := -e stext
39
40CHECKFLAGS += -D__x86_64__ -m64
41
42cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
43cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
44CFLAGS += $(cflags-y)
45
46CFLAGS += -mno-red-zone
47CFLAGS += -mcmodel=kernel
48CFLAGS += -pipe
49# this makes reading assembly source easier, but produces worse code
50# (though it actually makes the kernel smaller, too).
51CFLAGS += -fno-reorder-blocks
52CFLAGS += -Wno-sign-compare
53ifneq ($(CONFIG_DEBUG_INFO),y)
54CFLAGS += -fno-asynchronous-unwind-tables
55# -fweb shrinks the kernel a bit, but the difference is very small
56# it also messes up debugging, so don't use it for now.
57#CFLAGS += $(call cc-option,-fweb)
58endif
59# -funit-at-a-time shrinks the kernel .text considerably
60# unfortunately it makes reading oopses harder.
61CFLAGS += $(call cc-option,-funit-at-a-time)
62# prevent gcc from generating any FP code by mistake
63CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
64
65head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o
66
67libs-y += arch/x86_64/lib/
68core-y += arch/x86_64/kernel/ arch/x86_64/mm/
69core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/
70drivers-$(CONFIG_PCI) += arch/x86_64/pci/
71drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/
72
73boot := arch/x86_64/boot
74
75.PHONY: bzImage bzlilo install archmrproper \
76 fdimage fdimage144 fdimage288 archclean
77
78#Default target when executing "make"
79all: bzImage
80
81BOOTIMAGE := arch/x86_64/boot/bzImage
82KBUILD_IMAGE := $(BOOTIMAGE)
83
84bzImage: vmlinux
85 $(Q)$(MAKE) $(build)=$(boot) $(BOOTIMAGE)
86
87bzlilo: vmlinux
88 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zlilo
89
90bzdisk: vmlinux
91 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zdisk
92
93install fdimage fdimage144 fdimage288: vmlinux
94 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@
95
96archclean:
97 $(Q)$(MAKE) $(clean)=$(boot)
98
99prepare: include/asm-$(ARCH)/offset.h
100
101arch/$(ARCH)/kernel/asm-offsets.s: include/asm include/linux/version.h \
102 include/config/MARKER
103
104include/asm-$(ARCH)/offset.h: arch/$(ARCH)/kernel/asm-offsets.s
105 $(call filechk,gen-asm-offsets)
106
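	For context, the offset.h rule above works because asm-offsets.c emits "->NAME value" markers into asm-offsets.s, which gen-asm-offsets turns into #defines; a sketch of the pattern (my_struct and MY_B_OFFSET are placeholders):

		#include <stddef.h>

		#define DEFINE(sym, val) \
			asm volatile("\n->" #sym " %0 " #val : : "i" (val))

		struct my_struct { long a; long b; };

		int main(void)
		{
			/* becomes: #define MY_B_OFFSET 8 in include/asm-x86_64/offset.h */
			DEFINE(MY_B_OFFSET, offsetof(struct my_struct, b));
			return 0;
		}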
107CLEAN_FILES += include/asm-$(ARCH)/offset.h
108
109define archhelp
110 echo '* bzImage - Compressed kernel image (arch/$(ARCH)/boot/bzImage)'
111 echo ' install - Install kernel using'
112 echo ' (your) ~/bin/installkernel or'
113 echo ' (distribution) /sbin/installkernel or'
114 echo ' install to $$(INSTALL_PATH) and run lilo'
115endef
116
117CLEAN_FILES += arch/$(ARCH)/boot/fdimage arch/$(ARCH)/boot/mtools.conf
118
119
diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile
new file mode 100644
index 000000000000..f4399c701b77
--- /dev/null
+++ b/arch/x86_64/boot/Makefile
@@ -0,0 +1,102 @@
1#
2# arch/x86_64/boot/Makefile
3#
4# This file is subject to the terms and conditions of the GNU General Public
5# License. See the file "COPYING" in the main directory of this archive
6# for more details.
7#
8# Copyright (C) 1994 by Linus Torvalds
9#
10
11# ROOT_DEV specifies the default root-device when making the image.
12# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
13# the default of FLOPPY is used by 'build'.
14
15ROOT_DEV := CURRENT
16
17# If you want to preset the SVGA mode, uncomment the next line and
18# set SVGA_MODE to whatever number you want.
19# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
20# The number is the same as you would ordinarily press at bootup.
21
22SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
23
24# If you want the RAM disk device, define this to be the size in blocks.
25
26#RAMDISK := -DRAMDISK=512
27
28targets := vmlinux.bin bootsect bootsect.o \
29 setup setup.o bzImage mtools.conf
30
31EXTRA_CFLAGS := -m32
32
33hostprogs-y := tools/build
34HOST_EXTRACFLAGS += $(LINUXINCLUDE)
35subdir- := compressed/ #Let make clean descend in compressed/
36# ---------------------------------------------------------------------------
37
38$(obj)/bzImage: IMAGE_OFFSET := 0x100000
39$(obj)/bzImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
40$(obj)/bzImage: BUILDFLAGS := -b
41
42quiet_cmd_image = BUILD $@
43cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/bootsect $(obj)/setup \
44 $(obj)/vmlinux.bin $(ROOT_DEV) > $@
45
46$(obj)/bzImage: $(obj)/bootsect $(obj)/setup \
47 $(obj)/vmlinux.bin $(obj)/tools/build FORCE
48 $(call if_changed,image)
49 @echo 'Kernel: $@ is ready'
50
51$(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
52 $(call if_changed,objcopy)
53
54LDFLAGS_bootsect := -Ttext 0x0 -s --oformat binary
55LDFLAGS_setup := -Ttext 0x0 -s --oformat binary -e begtext
56
57$(obj)/setup $(obj)/bootsect: %: %.o FORCE
58 $(call if_changed,ld)
59
60$(obj)/compressed/vmlinux: FORCE
61 $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@
62
63# Set this if you want to pass append arguments to the zdisk/fdimage kernel
64FDARGS =
65
66$(obj)/mtools.conf: $(src)/mtools.conf.in
67 sed -e 's|@OBJ@|$(obj)|g' < $< > $@
68
69# This requires write access to /dev/fd0
70zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
71 MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync
72 syslinux /dev/fd0 ; sync
73 echo 'default linux $(FDARGS)' | \
74 MTOOLSRC=$(obj)/mtools.conf mcopy - a:syslinux.cfg
75 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux ; sync
76
77# These require being root or having syslinux 2.02 or higher installed
78fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
79 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
80 MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync
81 syslinux $(obj)/fdimage ; sync
82 echo 'default linux $(FDARGS)' | \
83 MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg
84 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux ; sync
85
86fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
87 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
88 MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync
89 syslinux $(obj)/fdimage ; sync
90 echo 'default linux $(FDARGS)' | \
91 MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg
92 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux ; sync
93
94zlilo: $(BOOTIMAGE)
95 if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
96 if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
97 cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz
98 cp System.map $(INSTALL_PATH)/
99 if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
100
101install: $(BOOTIMAGE)
102 sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)"
diff --git a/arch/x86_64/boot/bootsect.S b/arch/x86_64/boot/bootsect.S
new file mode 100644
index 000000000000..bb15d406ee95
--- /dev/null
+++ b/arch/x86_64/boot/bootsect.S
@@ -0,0 +1,98 @@
1/*
2 * bootsect.S Copyright (C) 1991, 1992 Linus Torvalds
3 *
4 * modified by Drew Eckhardt
5 * modified by Bruce Evans (bde)
6 * modified by Chris Noe (May 1999) (as86 -> gas)
7 * gutted by H. Peter Anvin (Jan 2003)
8 *
9 * BIG FAT NOTE: We're in real mode using 64k segments. Therefore segment
10 * addresses must be multiplied by 16 to obtain their respective linear
11 * addresses. To avoid confusion, linear addresses are written using leading
12 * hex while segment addresses are written as segment:offset.
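 * (For example, the BIOS loads this sector at 0x07C0:0x0000, whose
 * linear address is (0x07C0 << 4) + 0x0000 = 0x7C00.)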
13 *
14 */
15
16#include <asm/boot.h>
17
18SETUPSECTS = 4 /* default nr of setup-sectors */
19BOOTSEG = 0x07C0 /* original address of boot-sector */
20INITSEG = DEF_INITSEG /* we move boot here - out of the way */
21SETUPSEG = DEF_SETUPSEG /* setup starts here */
22SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
23SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
24 /* to be loaded */
25ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
26SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */
27
28#ifndef SVGA_MODE
29#define SVGA_MODE ASK_VGA
30#endif
31
32#ifndef RAMDISK
33#define RAMDISK 0
34#endif
35
36#ifndef ROOT_RDONLY
37#define ROOT_RDONLY 1
38#endif
39
40.code16
41.text
42
43.global _start
44_start:
45
46 # Normalize the start address
47 jmpl $BOOTSEG, $start2
48
49start2:
50 movw %cs, %ax
51 movw %ax, %ds
52 movw %ax, %es
53 movw %ax, %ss
54 movw $0x7c00, %sp
55 sti
56 cld
57
58 movw $bugger_off_msg, %si
59
60msg_loop:
61 lodsb
62 andb %al, %al
63 jz die
64 movb $0xe, %ah
65 movw $7, %bx
66 int $0x10
67 jmp msg_loop
68
69die:
70 # Allow the user to press a key, then reboot
71 xorw %ax, %ax
72 int $0x16
73 int $0x19
74
75 # int 0x19 should never return. In case it does anyway,
76 # invoke the BIOS reset code...
77 ljmp $0xf000,$0xfff0
78
79
80bugger_off_msg:
81 .ascii "Direct booting from floppy is no longer supported.\r\n"
82 .ascii "Please use a boot loader program instead.\r\n"
83 .ascii "\n"
84 .ascii "Remove disk and press any key to reboot . . .\r\n"
85 .byte 0
86
87
88 # Kernel attributes; used by setup
89
90 .org 497
91setup_sects: .byte SETUPSECTS
92root_flags: .word ROOT_RDONLY
93syssize: .word SYSSIZE
94swap_dev: .word SWAP_DEV
95ram_size: .word RAMDISK
96vid_mode: .word SVGA_MODE
97root_dev: .word ROOT_DEV
98boot_flag: .word 0xAA55
diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile
new file mode 100644
index 000000000000..f89d96f11a9f
--- /dev/null
+++ b/arch/x86_64/boot/compressed/Makefile
@@ -0,0 +1,32 @@
1#
2# linux/arch/x86_64/boot/compressed/Makefile
3#
4# create a compressed vmlinux image from the original vmlinux
5#
6# Note all the files here are compiled/linked as 32bit executables.
7#
8
9targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o
10EXTRA_AFLAGS := -traditional -m32
11
12# cannot use EXTRA_CFLAGS because the base CFLAGS contain -mcmodel=kernel, which conflicts with
13# -m32
14CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing
15LDFLAGS := -m elf_i386
16
17LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 -m elf_i386
18
19$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
20 $(call if_changed,ld)
21 @:
22
23$(obj)/vmlinux.bin: vmlinux FORCE
24 $(call if_changed,objcopy)
25
26$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
27 $(call if_changed,gzip)
28
29LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
30
31$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
32 $(call if_changed,ld)
diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S
new file mode 100644
index 000000000000..27264dbd575c
--- /dev/null
+++ b/arch/x86_64/boot/compressed/head.S
@@ -0,0 +1,142 @@
1/*
2 * linux/boot/head.S
3 *
4 * Copyright (C) 1991, 1992, 1993 Linus Torvalds
5 *
6 * $Id: head.S,v 1.3 2001/04/20 00:59:28 ak Exp $
7 */
8
9/*
10 * head.S contains the 32-bit startup code.
11 *
12 * NOTE!!! Startup happens at absolute address 0x00001000, which is also where
13 * the page directory will exist. The startup code will be overwritten by
14 * the page directory. [According to comments etc elsewhere on a compressed
15 * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
16 *
17 * Page 0 is deliberately kept safe, since System Management Mode code in
18 * laptops may need to access the BIOS data stored there. This is also
19 * useful for future device drivers that access the BIOS via VM86
20 * mode.
21 */
22
23/*
24 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
25 */
26.code32
27.text
28
29#include <linux/linkage.h>
30#include <asm/segment.h>
31
32 .code32
33 .globl startup_32
34
35startup_32:
36 cld
37 cli
38 movl $(__KERNEL_DS),%eax
39 movl %eax,%ds
40 movl %eax,%es
41 movl %eax,%fs
42 movl %eax,%gs
43
44 lss stack_start,%esp
45 xorl %eax,%eax
461: incl %eax # check that A20 really IS enabled
47 movl %eax,0x000000 # loop forever if it isn't
48 cmpl %eax,0x100000
49 je 1b
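	# (If the A20 gate were disabled, physical address 0x100000 would
	# wrap to 0x000000, so the value just stored would read back and
	# we would loop here forever.)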
50
51/*
52 * Initialize eflags. Some BIOS's leave bits like NT set. This would
53 * confuse the debugger if this code is traced.
54 * XXX - best to initialize before switching to protected mode.
55 */
56 pushl $0
57 popfl
58/*
59 * Clear BSS
60 */
61 xorl %eax,%eax
62 movl $_edata,%edi
63 movl $_end,%ecx
64 subl %edi,%ecx
65 cld
66 rep
67 stosb
68/*
69 * Do the decompression, and jump to the new kernel..
70 */
71 subl $16,%esp # place for structure on the stack
72 movl %esp,%eax
73 pushl %esi # real mode pointer as second arg
74 pushl %eax # address of structure as first arg
75 call decompress_kernel
76 orl %eax,%eax
77 jnz 3f
78 addl $8,%esp
79 xorl %ebx,%ebx
80 ljmp $(__KERNEL_CS), $0x100000
81
82/*
83 * We come here, if we were loaded high.
84 * We need to move the move-in-place routine down to 0x1000
85 * and then start it with the buffer addresses in registers,
86 * which we got from the stack.
87 */
883:
89 movl %esi,%ebx
90 movl $move_routine_start,%esi
91 movl $0x1000,%edi
92 movl $move_routine_end,%ecx
93 subl %esi,%ecx
94 addl $3,%ecx
95 shrl $2,%ecx
96 cld
97 rep
98 movsl
99
100 popl %esi # discard the address
101 addl $4,%esp # real mode pointer
102 popl %esi # low_buffer_start
103 popl %ecx # lcount
104 popl %edx # high_buffer_start
105 popl %eax # hcount
106 movl $0x100000,%edi
107 cli # make sure we don't get interrupted
108 ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine
109
110/*
111 * Routine (template) for moving the decompressed kernel in place,
112 * if we were loaded high. This _must_ be PIC code!
113 */
114move_routine_start:
115 movl %ecx,%ebp
116 shrl $2,%ecx
117 rep
118 movsl
119 movl %ebp,%ecx
120 andl $3,%ecx
121 rep
122 movsb
123 movl %edx,%esi
124 movl %eax,%ecx # NOTE: rep movsb won't move if %ecx == 0
125 addl $3,%ecx
126 shrl $2,%ecx
127 rep
128 movsl
129 movl %ebx,%esi # Restore setup pointer
130 xorl %ebx,%ebx
131 ljmp $(__KERNEL_CS), $0x100000
132move_routine_end:
133
134
135/* Stack for decompression */
136 .align 32
137user_stack:
138 .fill 4096,4,0
139stack_start:
140 .long user_stack+4096
141 .word __KERNEL_DS
142
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c
new file mode 100644
index 000000000000..c8b9216f9e63
--- /dev/null
+++ b/arch/x86_64/boot/compressed/misc.c
@@ -0,0 +1,354 @@
1/*
2 * misc.c
3 *
4 * This is a collection of several routines from gzip-1.0.3
5 * adapted for Linux.
6 *
7 * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
8 * puts by Nick Holloway 1993, better puts by Martin Mares 1995
9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
10 */
11
12#include "miscsetup.h"
13#include <asm/io.h>
14
15/*
16 * gzip declarations
17 */
18
19#define OF(args) args
20#define STATIC static
21
22#undef memset
23#undef memcpy
24#define memzero(s, n) memset ((s), 0, (n))
25
26typedef unsigned char uch;
27typedef unsigned short ush;
28typedef unsigned long ulg;
29
30#define WSIZE 0x8000 /* Window size must be at least 32k, */
31 /* and a power of two */
32
33static uch *inbuf; /* input buffer */
34static uch window[WSIZE]; /* Sliding window buffer */
35
36static unsigned insize = 0; /* valid bytes in inbuf */
37static unsigned inptr = 0; /* index of next byte to be processed in inbuf */
38static unsigned outcnt = 0; /* bytes in output buffer */
39
40/* gzip flag byte */
41#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
42#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */
43#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
44#define ORIG_NAME 0x08 /* bit 3 set: original file name present */
45#define COMMENT 0x10 /* bit 4 set: file comment present */
46#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */
47#define RESERVED 0xC0 /* bit 6,7: reserved */
48
49#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf())
50
51/* Diagnostic functions */
52#ifdef DEBUG
53# define Assert(cond,msg) {if(!(cond)) error(msg);}
54# define Trace(x) fprintf x
55# define Tracev(x) {if (verbose) fprintf x ;}
56# define Tracevv(x) {if (verbose>1) fprintf x ;}
57# define Tracec(c,x) {if (verbose && (c)) fprintf x ;}
58# define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;}
59#else
60# define Assert(cond,msg)
61# define Trace(x)
62# define Tracev(x)
63# define Tracevv(x)
64# define Tracec(c,x)
65# define Tracecv(c,x)
66#endif
67
68static int fill_inbuf(void);
69static void flush_window(void);
70static void error(char *m);
71static void gzip_mark(void **);
72static void gzip_release(void **);
73
74/*
75 * This is set up by the setup-routine at boot-time
76 */
77static unsigned char *real_mode; /* Pointer to real-mode data */
78
79#define EXT_MEM_K (*(unsigned short *)(real_mode + 0x2))
80#ifndef STANDARD_MEMORY_BIOS_CALL
81#define ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0))
82#endif
83#define SCREEN_INFO (*(struct screen_info *)(real_mode+0))
84
85extern char input_data[];
86extern int input_len;
87
88static long bytes_out = 0;
89static uch *output_data;
90static unsigned long output_ptr = 0;
91
92static void *malloc(int size);
93static void free(void *where);
94
95static void putstr(const char *);
96
97extern int end;
98static long free_mem_ptr = (long)&end;
99static long free_mem_end_ptr;
100
101#define INPLACE_MOVE_ROUTINE 0x1000
102#define LOW_BUFFER_START 0x2000
103#define LOW_BUFFER_MAX 0x90000
104#define HEAP_SIZE 0x3000
105static unsigned int low_buffer_end, low_buffer_size;
106static int high_loaded =0;
107static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/;
108
109static char *vidmem = (char *)0xb8000;
110static int vidport;
111static int lines, cols;
112
113#include "../../../../lib/inflate.c"
114
115static void *malloc(int size)
116{
117 void *p;
118
119 if (size <0) error("Malloc error");
120 if (free_mem_ptr <= 0) error("Memory error");
121
122 free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */
123
124 p = (void *)free_mem_ptr;
125 free_mem_ptr += size;
126
127 if (free_mem_ptr >= free_mem_end_ptr)
128 error("Out of memory");
129
130 return p;
131}
132
133static void free(void *where)
134{ /* Don't care */
135}
136
137static void gzip_mark(void **ptr)
138{
139 *ptr = (void *) free_mem_ptr;
140}
141
142static void gzip_release(void **ptr)
143{
144 free_mem_ptr = (long) *ptr;
145}
146
147static void scroll(void)
148{
149 int i;
150
151 memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 );
152 for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 )
153 vidmem[i] = ' ';
154}
155
156static void putstr(const char *s)
157{
158 int x,y,pos;
159 char c;
160
161 x = SCREEN_INFO.orig_x;
162 y = SCREEN_INFO.orig_y;
163
164 while ( ( c = *s++ ) != '\0' ) {
165 if ( c == '\n' ) {
166 x = 0;
167 if ( ++y >= lines ) {
168 scroll();
169 y--;
170 }
171 } else {
172 vidmem [ ( x + cols * y ) * 2 ] = c;
173 if ( ++x >= cols ) {
174 x = 0;
175 if ( ++y >= lines ) {
176 scroll();
177 y--;
178 }
179 }
180 }
181 }
182
183 SCREEN_INFO.orig_x = x;
184 SCREEN_INFO.orig_y = y;
185
186 pos = (x + cols * y) * 2; /* Update cursor position */
187 outb_p(14, vidport);
188 outb_p(0xff & (pos >> 9), vidport+1);
189 outb_p(15, vidport);
190 outb_p(0xff & (pos >> 1), vidport+1);
191}
192
193void* memset(void* s, int c, unsigned n)
194{
195 int i;
196 char *ss = (char*)s;
197
198 for (i=0;i<n;i++) ss[i] = c;
199 return s;
200}
201
202void* memcpy(void* dest, const void* src, unsigned n)
203{
204 int i;
205 char *d = (char *)dest, *s = (char *)src;
206
207 for (i=0;i<n;i++) d[i] = s[i];
208 return dest;
209}
210
211/* ===========================================================================
212 * Fill the input buffer. This is called only when the buffer is empty
213 * and at least one byte is really needed.
214 */
215static int fill_inbuf(void)
216{
217 if (insize != 0) {
218 error("ran out of input data");
219 }
220
221 inbuf = input_data;
222 insize = input_len;
223 inptr = 1;
224 return inbuf[0];
225}
226
227/* ===========================================================================
228 * Write the output window window[0..outcnt-1] and update crc and bytes_out.
229 * (Used for the decompressed data only.)
230 */
231static void flush_window_low(void)
232{
233 ulg c = crc; /* temporary variable */
234 unsigned n;
235 uch *in, *out, ch;
236
237 in = window;
238 out = &output_data[output_ptr];
239 for (n = 0; n < outcnt; n++) {
240 ch = *out++ = *in++;
241 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
242 }
243 crc = c;
244 bytes_out += (ulg)outcnt;
245 output_ptr += (ulg)outcnt;
246 outcnt = 0;
247}
248
249static void flush_window_high(void)
250{
251 ulg c = crc; /* temporary variable */
252 unsigned n;
253 uch *in, ch;
254 in = window;
255 for (n = 0; n < outcnt; n++) {
256 ch = *output_data++ = *in++;
257 if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start;
258 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
259 }
260 crc = c;
261 bytes_out += (ulg)outcnt;
262 outcnt = 0;
263}
264
265static void flush_window(void)
266{
267 if (high_loaded) flush_window_high();
268 else flush_window_low();
269}
270
271static void error(char *x)
272{
273 putstr("\n\n");
274 putstr(x);
275 putstr("\n\n -- System halted");
276
277 while(1);
278}
279
280void setup_normal_output_buffer(void)
281{
282#ifdef STANDARD_MEMORY_BIOS_CALL
283 if (EXT_MEM_K < 1024) error("Less than 2MB of memory");
284#else
285 if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory");
286#endif
287 output_data = (char *)0x100000; /* Points to 1M */
288 free_mem_end_ptr = (long)real_mode;
289}
290
291struct moveparams {
292 uch *low_buffer_start; int lcount;
293 uch *high_buffer_start; int hcount;
294};
295
296void setup_output_buffer_if_we_run_high(struct moveparams *mv)
297{
298 high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE);
299#ifdef STANDARD_MEMORY_BIOS_CALL
300 if (EXT_MEM_K < (3*1024)) error("Less than 4MB of memory");
301#else
302 if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory");
303#endif
304 mv->low_buffer_start = output_data = (char *)LOW_BUFFER_START;
305 low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX
306 ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff;
307 low_buffer_size = low_buffer_end - LOW_BUFFER_START;
308 high_loaded = 1;
309 free_mem_end_ptr = (long)high_buffer_start;
310 if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) {
311 high_buffer_start = (uch *)(0x100000 + low_buffer_size);
312	 mv->hcount = 0; /* no need to move high_buffer */
313 }
314 else mv->hcount = -1;
315 mv->high_buffer_start = high_buffer_start;
316}
317
318void close_output_buffer_if_we_run_high(struct moveparams *mv)
319{
320 if (bytes_out > low_buffer_size) {
321 mv->lcount = low_buffer_size;
322 if (mv->hcount)
323 mv->hcount = bytes_out - low_buffer_size;
324 } else {
325 mv->lcount = bytes_out;
326 mv->hcount = 0;
327 }
328}
329
330int decompress_kernel(struct moveparams *mv, void *rmode)
331{
332 real_mode = rmode;
333
334 if (SCREEN_INFO.orig_video_mode == 7) {
335 vidmem = (char *) 0xb0000;
336 vidport = 0x3b4;
337 } else {
338 vidmem = (char *) 0xb8000;
339 vidport = 0x3d4;
340 }
341
342 lines = SCREEN_INFO.orig_video_lines;
343 cols = SCREEN_INFO.orig_video_cols;
344
345 if (free_mem_ptr < 0x100000) setup_normal_output_buffer();
346 else setup_output_buffer_if_we_run_high(mv);
347
348 makecrc();
349 putstr(".\nDecompressing Linux...");
350 gunzip();
351 putstr("done.\nBooting the kernel.\n");
352 if (high_loaded) close_output_buffer_if_we_run_high(mv);
353 return high_loaded;
354}
diff --git a/arch/x86_64/boot/compressed/miscsetup.h b/arch/x86_64/boot/compressed/miscsetup.h
new file mode 100644
index 000000000000..bb1620531703
--- /dev/null
+++ b/arch/x86_64/boot/compressed/miscsetup.h
@@ -0,0 +1,39 @@
1#define NULL 0
2//typedef unsigned int size_t;
3
4
5struct screen_info {
6 unsigned char orig_x; /* 0x00 */
7 unsigned char orig_y; /* 0x01 */
8 unsigned short dontuse1; /* 0x02 -- EXT_MEM_K sits here */
9 unsigned short orig_video_page; /* 0x04 */
10 unsigned char orig_video_mode; /* 0x06 */
11 unsigned char orig_video_cols; /* 0x07 */
12 unsigned short unused2; /* 0x08 */
13 unsigned short orig_video_ega_bx; /* 0x0a */
14 unsigned short unused3; /* 0x0c */
15 unsigned char orig_video_lines; /* 0x0e */
16 unsigned char orig_video_isVGA; /* 0x0f */
17 unsigned short orig_video_points; /* 0x10 */
18
19 /* VESA graphic mode -- linear frame buffer */
20 unsigned short lfb_width; /* 0x12 */
21 unsigned short lfb_height; /* 0x14 */
22 unsigned short lfb_depth; /* 0x16 */
23 unsigned long lfb_base; /* 0x18 */
24 unsigned long lfb_size; /* 0x1c */
25 unsigned short dontuse2, dontuse3; /* 0x20 -- CL_MAGIC and CL_OFFSET here */
26 unsigned short lfb_linelength; /* 0x24 */
27 unsigned char red_size; /* 0x26 */
28 unsigned char red_pos; /* 0x27 */
29 unsigned char green_size; /* 0x28 */
30 unsigned char green_pos; /* 0x29 */
31 unsigned char blue_size; /* 0x2a */
32 unsigned char blue_pos; /* 0x2b */
33 unsigned char rsvd_size; /* 0x2c */
34 unsigned char rsvd_pos; /* 0x2d */
35 unsigned short vesapm_seg; /* 0x2e */
36 unsigned short vesapm_off; /* 0x30 */
37 unsigned short pages; /* 0x32 */
38 /* 0x34 -- 0x3f reserved for future expansion */
39};
diff --git a/arch/x86_64/boot/compressed/vmlinux.scr b/arch/x86_64/boot/compressed/vmlinux.scr
new file mode 100644
index 000000000000..1ed9d791f863
--- /dev/null
+++ b/arch/x86_64/boot/compressed/vmlinux.scr
@@ -0,0 +1,9 @@
1SECTIONS
2{
3 .data : {
4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .;
6 *(.data)
7 input_data_end = .;
8 }
9}
diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh
new file mode 100644
index 000000000000..90f2452b3b9e
--- /dev/null
+++ b/arch/x86_64/boot/install.sh
@@ -0,0 +1,40 @@
1#!/bin/sh
2#
3# arch/i386/boot/install.sh
4#
5# This file is subject to the terms and conditions of the GNU General Public
6# License. See the file "COPYING" in the main directory of this archive
7# for more details.
8#
9# Copyright (C) 1995 by Linus Torvalds
10#
11# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin
12#
13# "make install" script for i386 architecture
14#
15# Arguments:
16# $1 - kernel version
17# $2 - kernel image file
18# $3 - kernel map file
19# $4 - default install path (blank if root directory)
20#
21
22# User may have a custom install script
23
24if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi
25if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi
26
27# Default install - same as make zlilo
28
29if [ -f $4/vmlinuz ]; then
30 mv $4/vmlinuz $4/vmlinuz.old
31fi
32
33if [ -f $4/System.map ]; then
34 mv $4/System.map $4/System.old
35fi
36
37cat $2 > $4/vmlinuz
38cp $3 $4/System.map
39
40if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
diff --git a/arch/x86_64/boot/mtools.conf.in b/arch/x86_64/boot/mtools.conf.in
new file mode 100644
index 000000000000..efd6d2490c1d
--- /dev/null
+++ b/arch/x86_64/boot/mtools.conf.in
@@ -0,0 +1,17 @@
1#
2# mtools configuration file for "make (b)zdisk"
3#
4
5# Actual floppy drive
6drive a:
7 file="/dev/fd0"
8
9# 1.44 MB floppy disk image
10drive v:
11 file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=18 filter
12
13# 2.88 MB floppy disk image (mostly for virtual uses)
14drive w:
15 file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter
16
17
diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S
new file mode 100644
index 000000000000..3e838be9dbe7
--- /dev/null
+++ b/arch/x86_64/boot/setup.S
@@ -0,0 +1,867 @@
1/*
2 * setup.S Copyright (C) 1991, 1992 Linus Torvalds
3 *
4 * setup.s is responsible for getting the system data from the BIOS,
5 * and putting them into the appropriate places in system memory.
6 * both setup.s and system has been loaded by the bootblock.
7 *
8 * This code asks the bios for memory/disk/other parameters, and
9 * puts them in a "safe" place: 0x90000-0x901FF, ie where the
10 * boot-block used to be. It is then up to the protected mode
11 * system to read them from there before the area is overwritten
12 * for buffer-blocks.
13 *
14 * Move PS/2 aux init code to psaux.c
15 * (troyer@saifr00.cfsat.Honeywell.COM) 03Oct92
16 *
17 * some changes and additional features by Christoph Niemann,
18 * March 1993/June 1994 (Christoph.Niemann@linux.org)
19 *
20 * add APM BIOS checking by Stephen Rothwell, May 1994
21 * (sfr@canb.auug.org.au)
22 *
23 * High load stuff, initrd support and position independency
24 * by Hans Lermen & Werner Almesberger, February 1996
25 * <lermen@elserv.ffm.fgan.de>, <almesber@lrc.epfl.ch>
26 *
27 * Video handling moved to video.S by Martin Mares, March 1996
28 * <mj@k332.feld.cvut.cz>
29 *
30 * Extended memory detection scheme retwiddled by orc@pell.chi.il.us (david
31 * parsons) to avoid loadlin confusion, July 1997
32 *
33 * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999.
34 * <stiker@northlink.com>
35 *
36 * Fix to work around buggy BIOSes which don't use the carry bit correctly
37 * and/or report extended memory in CX/DX for e801h memory size detection
38 * call. As a result the kernel got wrong figures. The int15/e801h docs
39 * from Ralf Brown interrupt list seem to indicate AX/BX should be used
40 * anyway. So to avoid breaking many machines (presumably there was a reason
41 * to originally use CX/DX instead of AX/BX), we do a kludge to see
42 * if CX/DX have been changed in the e801 call and if so use AX/BX .
43 * Michael Miller, April 2001 <michaelm@mjmm.org>
44 *
45 * Added long mode checking and SSE force. March 2003, Andi Kleen.
46 */
47
48#include <linux/config.h>
49#include <asm/segment.h>
50#include <linux/version.h>
51#include <linux/compile.h>
52#include <asm/boot.h>
53#include <asm/e820.h>
54#include <asm/page.h>
55
56/* Signature words to ensure LILO loaded us right */
57#define SIG1 0xAA55
58#define SIG2 0x5A5A
59
60INITSEG = DEF_INITSEG # 0x9000, we move boot here, out of the way
61SYSSEG = DEF_SYSSEG # 0x1000, system loaded at 0x10000 (65536).
62SETUPSEG = DEF_SETUPSEG # 0x9020, this is the current segment
63 # ... and the former contents of CS
64
65DELTA_INITSEG = SETUPSEG - INITSEG # 0x0020
66
67.code16
68.globl begtext, begdata, begbss, endtext, enddata, endbss
69
70.text
71begtext:
72.data
73begdata:
74.bss
75begbss:
76.text
77
78start:
79 jmp trampoline
80
81# This is the setup header, and it must start at %cs:2 (old 0x9020:2)
82
83 .ascii "HdrS" # header signature
84	.word 0x0203 # header version number (>= 0x0105,
85 # or else old loadlin-1.5 will fail)
86realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
87start_sys_seg: .word SYSSEG
88 .word kernel_version # pointing to kernel version string
89 # above section of header is compatible
90 # with loadlin-1.5 (header v1.5). Don't
91 # change it.
92
93type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin,
94 # Bootlin, SYSLX, bootsect...)
95 # See Documentation/i386/boot.txt for
96 # assigned ids
97
98# flags, unused bits must be zero (RFU) bit within loadflags
99loadflags:
100LOADED_HIGH = 1 # If set, the kernel is loaded high
101CAN_USE_HEAP = 0x80 # If set, the loader also has set
102 # heap_end_ptr to tell how much
103 # space behind setup.S can be used for
104 # heap purposes.
105 # Only the loader knows what is free
106#ifndef __BIG_KERNEL__
107 .byte 0
108#else
109 .byte LOADED_HIGH
110#endif
111
112setup_move_size: .word 0x8000 # size to move, when setup is not
113 # loaded at 0x90000. We will move setup
114 # to 0x90000 then just before jumping
115 # into the kernel. However, only the
116 # loader knows how much data behind
117 # us also needs to be loaded.
118
119code32_start: # here loaders can put a different
120 # start address for 32-bit code.
121#ifndef __BIG_KERNEL__
122 .long 0x1000 # 0x1000 = default for zImage
123#else
124 .long 0x100000 # 0x100000 = default for big kernel
125#endif
126
127ramdisk_image: .long 0 # address of loaded ramdisk image
128 # Here the loader puts the 32-bit
129 # address where it loaded the image.
130 # This only will be read by the kernel.
131
132ramdisk_size: .long 0 # its size in bytes
133
134bootsect_kludge:
135 .long 0 # obsolete
136
137heap_end_ptr: .word modelist+1024 # (Header version 0x0201 or later)
138 # space from here (exclusive) down to
139 # end of setup code can be used by setup
140 # for local heap purposes.
141
142pad1: .word 0
143cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
144 # If nonzero, a 32-bit pointer
145 # to the kernel command line.
146 # The command line should be
147 # located between the start of
148 # setup and the end of low
149 # memory (0xa0000), or it may
150 # get overwritten before it
151 # gets read. If this field is
152 # used, there is no longer
153 # anything magical about the
154 # 0x90000 segment; the setup
155 # can be located anywhere in
156 # low memory 0x10000 or higher.
157
158ramdisk_max: .long 0xffffffff
159
160trampoline: call start_of_setup
161 .align 16
162 # The offset at this point is 0x240
163 .space (0x7ff-0x240+1) # E820 & EDD space (ending at 0x7ff)
164# End of setup header #####################################################
165
166start_of_setup:
167# Bootlin depends on this being done early
168 movw $0x01500, %ax
169 movb $0x81, %dl
170 int $0x13
171
172#ifdef SAFE_RESET_DISK_CONTROLLER
173# Reset the disk controller.
174 movw $0x0000, %ax
175 movb $0x80, %dl
176 int $0x13
177#endif
178
179# Set %ds = %cs, we know that SETUPSEG = %cs at this point
180 movw %cs, %ax # aka SETUPSEG
181 movw %ax, %ds
182# Check signature at end of setup
183 cmpw $SIG1, setup_sig1
184 jne bad_sig
185
186 cmpw $SIG2, setup_sig2
187 jne bad_sig
188
189 jmp good_sig1
190
191# Routine to print asciiz string at ds:si
192prtstr:
193 lodsb
194 andb %al, %al
195 jz fin
196
197 call prtchr
198 jmp prtstr
199
200fin: ret
201
202# Space printing
203prtsp2: call prtspc # Print double space
204prtspc: movb $0x20, %al # Print single space (note: fall-thru)
205
206prtchr:
207 pushw %ax
208 pushw %cx
209 movw $0007,%bx
210 movw $0x01, %cx
211 movb $0x0e, %ah
212 int $0x10
213 popw %cx
214 popw %ax
215 ret
216
217beep: movb $0x07, %al
218 jmp prtchr
219
220no_sig_mess: .string "No setup signature found ..."
221
222good_sig1:
223 jmp good_sig
224
225# We now have to find the rest of the setup code/data
226bad_sig:
227 movw %cs, %ax # SETUPSEG
228 subw $DELTA_INITSEG, %ax # INITSEG
229 movw %ax, %ds
230 xorb %bh, %bh
231 movb (497), %bl # get setup sect from bootsect
232 subw $4, %bx # LILO loads 4 sectors of setup
233 shlw $8, %bx # convert to words (1sect=2^8 words)
234 movw %bx, %cx
235 shrw $3, %bx # convert to segment
236 addw $SYSSEG, %bx
237 movw %bx, %cs:start_sys_seg
238# Move rest of setup code/data to here
239 movw $2048, %di # four sectors loaded by LILO
240 subw %si, %si
241 movw %cs, %ax # aka SETUPSEG
242 movw %ax, %es
243 movw $SYSSEG, %ax
244 movw %ax, %ds
245 rep
246 movsw
247 movw %cs, %ax # aka SETUPSEG
248 movw %ax, %ds
249 cmpw $SIG1, setup_sig1
250 jne no_sig
251
252 cmpw $SIG2, setup_sig2
253 jne no_sig
254
255 jmp good_sig
256
257no_sig:
258 lea no_sig_mess, %si
259 call prtstr
260
261no_sig_loop:
262 jmp no_sig_loop
263
264good_sig:
265 movw %cs, %ax # aka SETUPSEG
266 subw $DELTA_INITSEG, %ax # aka INITSEG
267 movw %ax, %ds
268# Check if an old loader tries to load a big-kernel
269 testb $LOADED_HIGH, %cs:loadflags # Do we have a big kernel?
270 jz loader_ok # No, no danger for old loaders.
271
272 cmpb $0, %cs:type_of_loader # Do we have a loader that
273 # can deal with us?
274 jnz loader_ok # Yes, continue.
275
276 pushw %cs # No, we have an old loader,
277 popw %ds # die.
278 lea loader_panic_mess, %si
279 call prtstr
280
281 jmp no_sig_loop
282
283loader_panic_mess: .string "Wrong loader, giving up..."
284
285loader_ok:
286 /* check for long mode. */
287 /* we have to do this before the VESA setup, otherwise the user
288 can't see the error message. */
289
290 pushw %ds
291 movw %cs,%ax
292 movw %ax,%ds
293
294 /* minimum CPUID flags for x86-64 */
295 /* see http://www.x86-64.org/lists/discuss/msg02971.html */
296#define SSE_MASK ((1<<25)|(1<<26))
297#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\
298 (1<<13)|(1<<15)|(1<<24))
299#define REQUIRED_MASK2 (1<<29)
300
301 pushfl /* standard way to check for cpuid */
302 popl %eax
303 movl %eax,%ebx
304 xorl $0x200000,%eax
305 pushl %eax
306 popfl
307 pushfl
308 popl %eax
309 cmpl %eax,%ebx
310 jz no_longmode /* cpu has no cpuid */
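	/* (0x200000 is the EFLAGS ID bit, bit 21: if it can be
	   toggled, the CPUID instruction is available.) */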
311 movl $0x0,%eax
312 cpuid
313 cmpl $0x1,%eax
314 jb no_longmode /* no cpuid 1 */
315 xor %di,%di
316 cmpl $0x68747541,%ebx /* AuthenticAMD */
317 jnz noamd
318 cmpl $0x69746e65,%edx
319 jnz noamd
320 cmpl $0x444d4163,%ecx
321 jnz noamd
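	/* (the three compares above match "Auth", "enti" and "cAMD":
	   the "AuthenticAMD" vendor string as CPUID leaf 0 returns
	   it, split across %ebx, %edx and %ecx.) */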
322 mov $1,%di /* cpu is from AMD */
323noamd:
324 movl $0x1,%eax
325 cpuid
326 andl $REQUIRED_MASK1,%edx
327 xorl $REQUIRED_MASK1,%edx
328 jnz no_longmode
329 movl $0x80000000,%eax
330 cpuid
331 cmpl $0x80000001,%eax
332 jb no_longmode /* no extended cpuid */
333 movl $0x80000001,%eax
334 cpuid
335 andl $REQUIRED_MASK2,%edx
336 xorl $REQUIRED_MASK2,%edx
337 jnz no_longmode
338sse_test:
339 movl $1,%eax
340 cpuid
341 andl $SSE_MASK,%edx
342 cmpl $SSE_MASK,%edx
343 je sse_ok
344 test %di,%di
345 jz no_longmode /* only try to force SSE on AMD */
346 movl $0xc0010015,%ecx /* HWCR */
347 rdmsr
348 btr $15,%eax /* enable SSE */
349 wrmsr
350 xor %di,%di /* don't loop */
351 jmp sse_test /* try again */
352no_longmode:
353 call beep
354 lea long_mode_panic,%si
355 call prtstr
356no_longmode_loop:
357 jmp no_longmode_loop
358long_mode_panic:
 359	.string	"Your CPU does not support long mode. Use a 32-bit distribution."
360 .byte 0
361
362sse_ok:
363 popw %ds
364
365# tell BIOS we want to go to long mode
366 movl $0xec00,%eax # declare target operating mode
367 movl $2,%ebx # long mode
368 int $0x15
369
370# Get memory size (extended mem, kB)
371
372 xorl %eax, %eax
373 movl %eax, (0x1e0)
374#ifndef STANDARD_MEMORY_BIOS_CALL
375 movb %al, (E820NR)
376# Try three different memory detection schemes. First, try
377# e820h, which lets us assemble a memory map, then try e801h,
378# which returns a 32-bit memory size, and finally 88h, which
379# returns 0-64m
380
381# method E820H:
382# the memory map from hell. e820h returns memory classified into
383# a whole bunch of different types, and allows memory holes and
384# everything. We scan through this memory map and build a list
385# of the first 32 memory areas, which we return at [E820MAP].
386# This is documented at http://www.teleport.com/~acpi/acpihtml/topic245.htm
387
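# Each 20-byte record the BIOS fills in has this layout (a C-style
# sketch, matching the $20 record size used below):
#
#	struct e820rec {
#		u64 addr;	/* start of the memory range */
#		u64 size;	/* its length in bytes */
#		u32 type;	/* 1 = usable RAM, anything else is
#				   reserved or ACPI */
#	};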
388#define SMAP 0x534d4150
389
390meme820:
391 xorl %ebx, %ebx # continuation counter
392 movw $E820MAP, %di # point into the whitelist
393 # so we can have the bios
394 # directly write into it.
395
396jmpe820:
397 movl $0x0000e820, %eax # e820, upper word zeroed
398 movl $SMAP, %edx # ascii 'SMAP'
399 movl $20, %ecx # size of the e820rec
400 pushw %ds # data record.
401 popw %es
402 int $0x15 # make the call
403 jc bail820 # fall to e801 if it fails
404
405 cmpl $SMAP, %eax # check the return is `SMAP'
406 jne bail820 # fall to e801 if it fails
407
408# cmpl $1, 16(%di) # is this usable memory?
409# jne again820
410
411 # If this is usable memory, we save it by simply advancing %di by
412 # sizeof(e820rec).
413 #
414good820:
415 movb (E820NR), %al # up to 32 entries
416 cmpb $E820MAX, %al
417 jnl bail820
418
419 incb (E820NR)
420 movw %di, %ax
421 addw $20, %ax
422 movw %ax, %di
423again820:
424 cmpl $0, %ebx # check to see if
425 jne jmpe820 # %ebx is set to EOF
426bail820:
427
428
429# method E801H:
430# memory size is in 1K chunks, to avoid confusing loadlin.
431# we store the 0xe801 memory size in a completely different place,
432# because it will most likely be longer than 16 bits.
433# (use 1e0 because that's what Larry Augustine uses in his
434# alternative new memory detection scheme, and it's sensible
435# to write everything into the same place.)
436
437meme801:
438 stc # fix to work around buggy
439	xorw	%cx,%cx				# BIOSes which don't clear/set
440	xorw	%dx,%dx				# carry on pass/error of
441						# e801h memory size call
442						# or merely pass cx,dx through
443 # without changing them.
444 movw $0xe801, %ax
445 int $0x15
446 jc mem88
447
448 cmpw $0x0, %cx # Kludge to handle BIOSes
449 jne e801usecxdx # which report their extended
450 cmpw $0x0, %dx # memory in AX/BX rather than
451 jne e801usecxdx # CX/DX. The spec I have read
452 movw %ax, %cx # seems to indicate AX/BX
453 movw %bx, %dx # are more reasonable anyway...
454
455e801usecxdx:
456 andl $0xffff, %edx # clear sign extend
457 shll $6, %edx # and go from 64k to 1k chunks
458 movl %edx, (0x1e0) # store extended memory size
459 andl $0xffff, %ecx # clear sign extend
460 addl %ecx, (0x1e0) # and add lower memory into
461 # total size.
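# (int 0x15, %ax = 0xe801 reports %cx = extended memory between 1M
# and 16M in 1K blocks and %dx = memory above 16M in 64K blocks;
# shifting %dx left by 6 converts 64K blocks to 1K blocks, since
# 64K/1K = 2^6.)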
462
463# Ye Olde Traditional Methode. Returns the memory size (up to 16mb or
464# 64mb, depending on the bios) in ax.
465mem88:
466
467#endif
468 movb $0x88, %ah
469 int $0x15
470 movw %ax, (2)
471
472# Set the keyboard repeat rate to the max
473 movw $0x0305, %ax
474 xorw %bx, %bx
475 int $0x16
476
477# Check for video adapter and its parameters and allow the
478# user to browse video modes.
479 call video # NOTE: we need %ds pointing
480 # to bootsector
481
482# Get hd0 data...
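# (address 4 * 0x41 is "interrupt vector" 0x41, which the BIOS uses
# as a far pointer to the fixed disk parameter table of hd0; vector
# 0x46, used below, is the same thing for hd1.)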
483 xorw %ax, %ax
484 movw %ax, %ds
485 ldsw (4 * 0x41), %si
486 movw %cs, %ax # aka SETUPSEG
487 subw $DELTA_INITSEG, %ax # aka INITSEG
488 pushw %ax
489 movw %ax, %es
490 movw $0x0080, %di
491 movw $0x10, %cx
492 pushw %cx
493 cld
494 rep
495 movsb
496# Get hd1 data...
497 xorw %ax, %ax
498 movw %ax, %ds
499 ldsw (4 * 0x46), %si
500 popw %cx
501 popw %es
502 movw $0x0090, %di
503 rep
504 movsb
505# Check that there IS a hd1 :-)
506 movw $0x01500, %ax
507 movb $0x81, %dl
508 int $0x13
509 jc no_disk1
510
511 cmpb $3, %ah
512 je is_disk1
513
514no_disk1:
515 movw %cs, %ax # aka SETUPSEG
516 subw $DELTA_INITSEG, %ax # aka INITSEG
517 movw %ax, %es
518 movw $0x0090, %di
519 movw $0x10, %cx
520 xorw %ax, %ax
521 cld
522 rep
523 stosb
524is_disk1:
525
526# Check for PS/2 pointing device
527 movw %cs, %ax # aka SETUPSEG
528 subw $DELTA_INITSEG, %ax # aka INITSEG
529 movw %ax, %ds
530 movw $0, (0x1ff) # default is no pointing device
531 int $0x11 # int 0x11: equipment list
532 testb $0x04, %al # check if mouse installed
533 jz no_psmouse
534
535 movw $0xAA, (0x1ff) # device present
536no_psmouse:
537
538#include "../../i386/boot/edd.S"
539
540# Now we want to move to protected mode ...
541 cmpw $0, %cs:realmode_swtch
542 jz rmodeswtch_normal
543
544 lcall *%cs:realmode_swtch
545
546 jmp rmodeswtch_end
547
548rmodeswtch_normal:
549 pushw %cs
550 call default_switch
551
552rmodeswtch_end:
553# we get the code32 start address and modify the below 'jmpi'
554# (loader may have changed it)
555 movl %cs:code32_start, %eax
556 movl %eax, %cs:code32
557
558# Now we move the system to its rightful place ... but we check if we have a
559# big-kernel. In that case we *must* not move it ...
560 testb $LOADED_HIGH, %cs:loadflags
561 jz do_move0 # .. then we have a normal low
562 # loaded zImage
563 # .. or else we have a high
564 # loaded bzImage
565 jmp end_move # ... and we skip moving
566
567do_move0:
568 movw $0x100, %ax # start of destination segment
569 movw %cs, %bp # aka SETUPSEG
570 subw $DELTA_INITSEG, %bp # aka INITSEG
571 movw %cs:start_sys_seg, %bx # start of source segment
572 cld
573do_move:
574 movw %ax, %es # destination segment
575 incb %ah # instead of add ax,#0x100
576 movw %bx, %ds # source segment
577 addw $0x100, %bx
578 subw %di, %di
579 subw %si, %si
580 movw $0x800, %cx
581 rep
582 movsw
583 cmpw %bp, %bx # assume start_sys_seg > 0x200,
584 # so we will perhaps read one
585 # page more than needed, but
586 # never overwrite INITSEG
587 # because destination is a
588 # minimum one page below source
589 jb do_move
590
591end_move:
592# then we load the segment descriptors
593 movw %cs, %ax # aka SETUPSEG
594 movw %ax, %ds
595
596# Check whether we need to be downward compatible with version <= 0x0201
597	cmpl	$0, cmd_line_ptr
598	jne	end_move_self			# loader uses version >= 0x0202 features
599 cmpb $0x20, type_of_loader
600 je end_move_self # bootsect loader, we know of it
601
602# Boot loader doesn't support boot protocol version 2.02.
603# If our code is not at 0x90000, we need to move it there now.
604# We then also need to move the params behind it (the command line).
605# Because we would overwrite the code on the current IP, we move
606# it in two steps, jumping high after the first one.
607 movw %cs, %ax
608 cmpw $SETUPSEG, %ax
609 je end_move_self
610
611 cli # make sure we really have
612 # interrupts disabled !
613 # because after this the stack
614 # should not be used
615 subw $DELTA_INITSEG, %ax # aka INITSEG
616 movw %ss, %dx
617 cmpw %ax, %dx
618 jb move_self_1
619
620 addw $INITSEG, %dx
621 subw %ax, %dx # this will go into %ss after
622 # the move
623move_self_1:
624 movw %ax, %ds
625 movw $INITSEG, %ax # real INITSEG
626 movw %ax, %es
627 movw %cs:setup_move_size, %cx
628	std					# we move toward higher addresses,
629						# so copy backwards (DF set)
630						# because the areas may overlap
631 movw %cx, %di
632 decw %di
633 movw %di, %si
634 subw $move_self_here+0x200, %cx
635 rep
636 movsb
637 ljmp $SETUPSEG, $move_self_here
638
639move_self_here:
640 movw $move_self_here+0x200, %cx
641 rep
642 movsb
643 movw $SETUPSEG, %ax
644 movw %ax, %ds
645 movw %dx, %ss
646end_move_self: # now we are at the right place
647 lidt idt_48 # load idt with 0,0
648 xorl %eax, %eax # Compute gdt_base
649 movw %ds, %ax # (Convert %ds:gdt to a linear ptr)
650 shll $4, %eax
651 addl $gdt, %eax
652 movl %eax, (gdt_48+2)
653 lgdt gdt_48 # load gdt with whatever is
654 # appropriate
655
656# that was painless, now we enable a20
657 call empty_8042
658
659 movb $0xD1, %al # command write
660 outb %al, $0x64
661 call empty_8042
662
663 movb $0xDF, %al # A20 on
664 outb %al, $0x60
665 call empty_8042
666
667#
668# You must preserve the other bits here. Otherwise embarrassing things
669# like laptops powering off on boot happen. Corrected version by Kira
670# Brown from Linux 2.2
671#
672 inb $0x92, %al #
673 orb $02, %al # "fast A20" version
674 outb %al, $0x92 # some chips have only this
675
676# wait until a20 really *is* enabled; it can take a fair amount of
677# time on certain systems; Toshiba Tecras are known to have this
678# problem. The memory location used here (0x200) is the int 0x80
679# vector, which should be safe to use.
680
681 xorw %ax, %ax # segment 0x0000
682 movw %ax, %fs
683 decw %ax # segment 0xffff (HMA)
684 movw %ax, %gs
685a20_wait:
686 incw %ax # unused memory location <0xfff0
687 movw %ax, %fs:(0x200) # we use the "int 0x80" vector
688 cmpw %gs:(0x210), %ax # and its corresponding HMA addr
689 je a20_wait # loop until no longer aliased
690
691# make sure any possible coprocessor is properly reset..
692 xorw %ax, %ax
693 outb %al, $0xf0
694 call delay
695
696 outb %al, $0xf1
697 call delay
698
699# well, that went ok, I hope. Now we mask all interrupts - the rest
700# is done in init_IRQ().
701 movb $0xFF, %al # mask all interrupts for now
702 outb %al, $0xA1
703 call delay
704
705 movb $0xFB, %al # mask all irq's but irq2 which
706 outb %al, $0x21 # is cascaded
707
708# Well, that certainly wasn't fun :-(. Hopefully it works, and we don't
709# need no steenking BIOS anyway (except for the initial loading :-).
710# The BIOS-routine wants lots of unnecessary data, and it's less
711# "interesting" anyway. This is how REAL programmers do it.
712#
713# Well, now's the time to actually move into protected mode. To make
714# things as simple as possible, we do no register set-up or anything,
715# we let the gnu-compiled 32-bit programs do that. We just jump to
716# absolute address 0x1000 (or the loader supplied one),
717# in 32-bit protected mode.
718#
719# Note that the short jump isn't strictly needed, although there are
720# reasons why it might be a good idea. It won't hurt in any case.
721 movw $1, %ax # protected mode (PE) bit
722 lmsw %ax # This is it!
723 jmp flush_instr
724
725flush_instr:
726 xorw %bx, %bx # Flag to indicate a boot
727 xorl %esi, %esi # Pointer to real-mode code
728 movw %cs, %si
729 subw $DELTA_INITSEG, %si
730 shll $4, %esi # Convert to 32-bit pointer
731# NOTE: For high loaded big kernels we need a
732# jmpi 0x100000,__KERNEL_CS
733#
734# but we haven't yet reloaded the CS register, so the default size
735# of the target offset is still 16 bit.
736# However, using an operand prefix (0x66), the CPU will properly
737# take our 48-bit far pointer. (Intel 80386 Programmer's Reference
738# Manual, Mixing 16-bit and 32-bit code, page 16-6)
739
740 .byte 0x66, 0xea # prefix + jmpi-opcode
741code32: .long 0x1000 # will be set to 0x100000
742 # for big kernels
743 .word __KERNEL_CS
744
745# Here's a bunch of information about your current kernel..
746kernel_version: .ascii UTS_RELEASE
747 .ascii " ("
748 .ascii LINUX_COMPILE_BY
749 .ascii "@"
750 .ascii LINUX_COMPILE_HOST
751 .ascii ") "
752 .ascii UTS_VERSION
753 .byte 0
754
755# This is the default real mode switch routine.
756# It is called just before the protected mode transition.
757default_switch:
758 cli # no interrupts allowed !
759 movb $0x80, %al # disable NMI for bootup
760 # sequence
761 outb %al, $0x70
762 lret
763
764
765# This routine checks that the keyboard command queue is empty
766# (after emptying the output buffers)
767#
768# Some machines have delusions that the keyboard buffer is always full
769# with no keyboard attached...
770#
771# If there is no keyboard controller, we will usually get 0xff
772# back from all the reads. With each IO taking a microsecond and
773# a timeout of 100,000 iterations, this can take about half a
774# second ("delay" == outb to port 0x80). That should be ok,
775# and should also be plenty of time for a real keyboard controller
776# to empty.
777#
778
779empty_8042:
780 pushl %ecx
781 movl $100000, %ecx
782
783empty_8042_loop:
784 decl %ecx
785 jz empty_8042_end_loop
786
787 call delay
788
789 inb $0x64, %al # 8042 status port
790 testb $1, %al # output buffer?
791 jz no_output
792
793 call delay
794 inb $0x60, %al # read it
795 jmp empty_8042_loop
796
797no_output:
798 testb $2, %al # is input buffer full?
799 jnz empty_8042_loop # yes - loop
800empty_8042_end_loop:
801 popl %ecx
802 ret
803
804# Read the cmos clock. Return the seconds in al
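# (int 0x1a, %ah = 2 returns the RTC time in BCD with the seconds in
# %dh; the nibble split plus `aad' below computes tens*10 + units,
# converting the BCD seconds to binary.)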
805gettime:
806 pushw %cx
807 movb $0x02, %ah
808 int $0x1a
809 movb %dh, %al # %dh contains the seconds
810 andb $0x0f, %al
811 movb %dh, %ah
812 movb $0x04, %cl
813 shrb %cl, %ah
814 aad
815 popw %cx
816 ret
817
818# Delay is needed after doing I/O
819delay:
820 outb %al,$0x80
821 ret
822
823# Descriptor tables
824gdt:
825 .word 0, 0, 0, 0 # dummy
826
827 .word 0, 0, 0, 0 # unused
828
829 .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
830 .word 0 # base address = 0
831 .word 0x9A00 # code read/exec
832 .word 0x00CF # granularity = 4096, 386
833 # (+5th nibble of limit)
834
835 .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
836 .word 0 # base address = 0
837 .word 0x9200 # data read/write
838 .word 0x00CF # granularity = 4096, 386
839 # (+5th nibble of limit)
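	# (Decoding the descriptors above: limit = 0xfffff units of
	# 4K = 4Gb; base = 0; access byte 0x9A = present, DPL 0, code,
	# read/exec (0x92 = data, read/write); flag nibble 0xC = 4K
	# granularity, 32-bit.)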
840idt_48:
841 .word 0 # idt limit = 0
842 .word 0, 0 # idt base = 0L
843gdt_48:
844	.word	0x8000				# gdt limit = 0x8000 bytes,
845						# i.e. 4096 GDT entries
846
847 .word 0, 0 # gdt base (filled in later)
848
849# Include video setup & detection code
850
851#include "video.S"
852
853# Setup signature -- must be last
854setup_sig1: .word SIG1
855setup_sig2: .word SIG2
856
857# After this point, there is some free space which is used by the video mode
858# handling code to store the temporary mode table (not used by the kernel).
859
860modelist:
861
862.text
863endtext:
864.data
865enddata:
866.bss
867endbss:
diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c
new file mode 100644
index 000000000000..c2fa66313170
--- /dev/null
+++ b/arch/x86_64/boot/tools/build.c
@@ -0,0 +1,186 @@
1/*
2 * $Id: build.c,v 1.3 2001/06/26 15:14:50 pavel Exp $
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 1997 Martin Mares
6 */
7
8/*
9 * This file builds a disk-image from three different files:
10 *
11 * - bootsect: exactly 512 bytes of 8086 machine code, loads the rest
 12 *	- setup: 8086 machine code, sets up system parameters
13 * - system: 80386 code for actual system
14 *
15 * It does some checking that all files are of the correct type, and
16 * just writes the result to stdout, removing headers and padding to
17 * the right amount. It also writes some system data to stderr.
18 */
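/*
 * The resulting image layout, a sketch derived from the code below:
 *
 *   offset 0                      bootsect (512 bytes; byte 497 and
 *                                 bytes 500-501 are patched at the end
 *                                 with the setup sector count and the
 *                                 system size)
 *   offset 512                    setup, zero-padded to a multiple
 *                                 of 512 (at least SETUP_SECTS sectors)
 *   offset (1+setup_sectors)*512  system (sys_size 16-byte paragraphs)
 */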
19
20/*
21 * Changes by tytso to allow root device specification
22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
23 * Cross compiling fixes by Gertjan van Wingerde, July 1996
24 * Rewritten by Martin Mares, April 1997
25 */
26
27#include <stdio.h>
28#include <string.h>
29#include <stdlib.h>
30#include <stdarg.h>
31#include <sys/types.h>
32#include <sys/stat.h>
33#include <sys/sysmacros.h>
34#include <unistd.h>
35#include <fcntl.h>
36#include <asm/boot.h>
37
38typedef unsigned char byte;
39typedef unsigned short word;
40typedef unsigned long u32;
41
42#define DEFAULT_MAJOR_ROOT 0
43#define DEFAULT_MINOR_ROOT 0
44
45/* Minimal number of setup sectors (see also bootsect.S) */
46#define SETUP_SECTS 4
47
48byte buf[1024];
49int fd;
50int is_big_kernel;
51
52void die(const char * str, ...)
53{
54 va_list args;
55 va_start(args, str);
56 vfprintf(stderr, str, args);
57 fputc('\n', stderr);
58 exit(1);
59}
60
61void file_open(const char *name)
62{
63 if ((fd = open(name, O_RDONLY, 0)) < 0)
64 die("Unable to open `%s': %m", name);
65}
66
67void usage(void)
68{
69 die("Usage: build [-b] bootsect setup system [rootdev] [> image]");
70}
71
72int main(int argc, char ** argv)
73{
74 unsigned int i, c, sz, setup_sectors;
75 u32 sys_size;
76 byte major_root, minor_root;
77 struct stat sb;
78
79 if (argc > 2 && !strcmp(argv[1], "-b"))
80 {
81 is_big_kernel = 1;
82 argc--, argv++;
83 }
84 if ((argc < 4) || (argc > 5))
85 usage();
86 if (argc > 4) {
87 if (!strcmp(argv[4], "CURRENT")) {
88 if (stat("/", &sb)) {
89 perror("/");
90 die("Couldn't stat /");
91 }
92 major_root = major(sb.st_dev);
93 minor_root = minor(sb.st_dev);
94 } else if (strcmp(argv[4], "FLOPPY")) {
95 if (stat(argv[4], &sb)) {
96 perror(argv[4]);
97 die("Couldn't stat root device.");
98 }
99 major_root = major(sb.st_rdev);
100 minor_root = minor(sb.st_rdev);
101 } else {
102 major_root = 0;
103 minor_root = 0;
104 }
105 } else {
106 major_root = DEFAULT_MAJOR_ROOT;
107 minor_root = DEFAULT_MINOR_ROOT;
108 }
109 fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root);
110
111 file_open(argv[1]);
112 i = read(fd, buf, sizeof(buf));
113 fprintf(stderr,"Boot sector %d bytes.\n",i);
114 if (i != 512)
115 die("Boot block must be exactly 512 bytes");
116 if (buf[510] != 0x55 || buf[511] != 0xaa)
117 die("Boot block hasn't got boot flag (0xAA55)");
118 buf[508] = minor_root;
119 buf[509] = major_root;
120 if (write(1, buf, 512) != 512)
121 die("Write call failed");
122 close (fd);
123
124 file_open(argv[2]); /* Copy the setup code */
125 for (i=0 ; (c=read(fd, buf, sizeof(buf)))>0 ; i+=c )
126 if (write(1, buf, c) != c)
127 die("Write call failed");
128 if (c != 0)
129 die("read-error on `setup'");
130 close (fd);
131
132 setup_sectors = (i + 511) / 512; /* Pad unused space with zeros */
133 /* for compatibility with ancient versions of LILO. */
134 if (setup_sectors < SETUP_SECTS)
135 setup_sectors = SETUP_SECTS;
136 fprintf(stderr, "Setup is %d bytes.\n", i);
137 memset(buf, 0, sizeof(buf));
138 while (i < setup_sectors * 512) {
139 c = setup_sectors * 512 - i;
140 if (c > sizeof(buf))
141 c = sizeof(buf);
142 if (write(1, buf, c) != c)
143 die("Write call failed");
144 i += c;
145 }
146
147 file_open(argv[3]);
148 if (fstat (fd, &sb))
149 die("Unable to stat `%s': %m", argv[3]);
150 sz = sb.st_size;
151 fprintf (stderr, "System is %d kB\n", sz/1024);
152 sys_size = (sz + 15) / 16;
153 /* 0x40000*16 = 4.0 MB, reasonable estimate for the current maximum */
154 if (sys_size > (is_big_kernel ? 0x40000 : DEF_SYSSIZE))
155 die("System is too big. Try using %smodules.",
156 is_big_kernel ? "" : "bzImage or ");
157 while (sz > 0) {
158 int l, n;
159
160 l = (sz > sizeof(buf)) ? sizeof(buf) : sz;
161 if ((n=read(fd, buf, l)) != l) {
162 if (n < 0)
163 die("Error reading %s: %m", argv[3]);
164 else
165 die("%s: Unexpected EOF", argv[3]);
166 }
167 if (write(1, buf, l) != l)
168 die("Write failed");
169 sz -= l;
170 }
171 close(fd);
172
173 if (lseek(1, 497, SEEK_SET) != 497) /* Write sizes to the bootsector */
174 die("Output: seek failed");
175 buf[0] = setup_sectors;
176 if (write(1, buf, 1) != 1)
177 die("Write of setup sector count failed");
178 if (lseek(1, 500, SEEK_SET) != 500)
179 die("Output: seek failed");
180 buf[0] = (sys_size & 0xff);
181 buf[1] = ((sys_size >> 8) & 0xff);
182 if (write(1, buf, 2) != 2)
183 die("Write of image length failed");
184
185 return 0; /* Everything is OK */
186}
diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S
new file mode 100644
index 000000000000..0587477c99f2
--- /dev/null
+++ b/arch/x86_64/boot/video.S
@@ -0,0 +1,2007 @@
1/* video.S
2 *
3 * Display adapter & video mode setup, version 2.13 (14-May-99)
4 *
5 * Copyright (C) 1995 -- 1998 Martin Mares <mj@ucw.cz>
6 * Based on the original setup.S code (C) Linus Torvalds and Mats Anderson
7 *
8 * Rewritten to use GNU 'as' by Chris Noe <stiker@northlink.com> May 1999
9 *
10 * For further information, look at Documentation/svga.txt.
11 *
12 */
13
14#include <linux/config.h> /* for CONFIG_VIDEO_* */
15
16/* Enable autodetection of SVGA adapters and modes. */
17#undef CONFIG_VIDEO_SVGA
18
19/* Enable autodetection of VESA modes */
20#define CONFIG_VIDEO_VESA
21
22/* Enable compacting of mode table */
23#define CONFIG_VIDEO_COMPACT
24
25/* Retain screen contents when switching modes */
26#define CONFIG_VIDEO_RETAIN
27
28/* Enable local mode list */
29#undef CONFIG_VIDEO_LOCAL
30
 31/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour) */
32#undef CONFIG_VIDEO_400_HACK
33
34/* Hack that lets you force specific BIOS mode ID and specific dimensions */
35#undef CONFIG_VIDEO_GFX_HACK
36#define VIDEO_GFX_BIOS_AX 0x4f02 /* 800x600 on ThinkPad */
37#define VIDEO_GFX_BIOS_BX 0x0102
38#define VIDEO_GFX_DUMMY_RESOLUTION 0x6425 /* 100x37 */
39
40/* This code uses an extended set of video mode numbers. These include:
41 * Aliases for standard modes
42 * NORMAL_VGA (-1)
43 * EXTENDED_VGA (-2)
44 * ASK_VGA (-3)
45 * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
46 * of compatibility when extending the table. These are between 0x00 and 0xff.
47 */
48#define VIDEO_FIRST_MENU 0x0000
49
50/* Standard BIOS video modes (BIOS number + 0x0100) */
51#define VIDEO_FIRST_BIOS 0x0100
52
53/* VESA BIOS video modes (VESA number + 0x0200) */
54#define VIDEO_FIRST_VESA 0x0200
55
56/* Video7 special modes (BIOS number + 0x0900) */
57#define VIDEO_FIRST_V7 0x0900
58
59/* Special video modes */
60#define VIDEO_FIRST_SPECIAL 0x0f00
61#define VIDEO_80x25 0x0f00
62#define VIDEO_8POINT 0x0f01
63#define VIDEO_80x43 0x0f02
64#define VIDEO_80x28 0x0f03
65#define VIDEO_CURRENT_MODE 0x0f04
66#define VIDEO_80x30 0x0f05
67#define VIDEO_80x34 0x0f06
68#define VIDEO_80x60 0x0f07
69#define VIDEO_GFX_HACK 0x0f08
70#define VIDEO_LAST_SPECIAL 0x0f09
71
72/* Video modes given by resolution */
73#define VIDEO_FIRST_RESOLUTION 0x1000
74
75/* The "recalculate timings" flag */
76#define VIDEO_RECALC 0x8000
77
78/* Positions of various video parameters passed to the kernel */
79/* (see also include/linux/tty.h) */
80#define PARAM_CURSOR_POS 0x00
81#define PARAM_VIDEO_PAGE 0x04
82#define PARAM_VIDEO_MODE 0x06
83#define PARAM_VIDEO_COLS 0x07
84#define PARAM_VIDEO_EGA_BX 0x0a
85#define PARAM_VIDEO_LINES 0x0e
86#define PARAM_HAVE_VGA 0x0f
87#define PARAM_FONT_POINTS 0x10
88
89#define PARAM_LFB_WIDTH 0x12
90#define PARAM_LFB_HEIGHT 0x14
91#define PARAM_LFB_DEPTH 0x16
92#define PARAM_LFB_BASE 0x18
93#define PARAM_LFB_SIZE 0x1c
94#define PARAM_LFB_LINELENGTH 0x24
95#define PARAM_LFB_COLORS 0x26
96#define PARAM_VESAPM_SEG 0x2e
97#define PARAM_VESAPM_OFF 0x30
98#define PARAM_LFB_PAGES 0x32
99#define PARAM_VESA_ATTRIB 0x34
100
101/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */
102#ifdef CONFIG_VIDEO_RETAIN
103#define DO_STORE call store_screen
104#else
105#define DO_STORE
106#endif /* CONFIG_VIDEO_RETAIN */
107
108# This is the main entry point called by setup.S
109# %ds *must* be pointing to the bootsector
110video: pushw %ds # We use different segments
111 pushw %ds # FS contains original DS
112 popw %fs
113 pushw %cs # DS is equal to CS
114 popw %ds
115 pushw %cs # ES is equal to CS
116 popw %es
117 xorw %ax, %ax
118 movw %ax, %gs # GS is zero
119 cld
120 call basic_detect # Basic adapter type testing (EGA/VGA/MDA/CGA)
121#ifdef CONFIG_VIDEO_SELECT
122 movw %fs:(0x01fa), %ax # User selected video mode
123 cmpw $ASK_VGA, %ax # Bring up the menu
124 jz vid2
125
126 call mode_set # Set the mode
127 jc vid1
128
129 leaw badmdt, %si # Invalid mode ID
130 call prtstr
131vid2: call mode_menu
132vid1:
133#ifdef CONFIG_VIDEO_RETAIN
134 call restore_screen # Restore screen contents
135#endif /* CONFIG_VIDEO_RETAIN */
136 call store_edid
137#endif /* CONFIG_VIDEO_SELECT */
138 call mode_params # Store mode parameters
139 popw %ds # Restore original DS
140 ret
141
142# Detect if we have CGA, MDA, EGA or VGA and pass it to the kernel.
143basic_detect:
144 movb $0, %fs:(PARAM_HAVE_VGA)
145 movb $0x12, %ah # Check EGA/VGA
146 movb $0x10, %bl
147 int $0x10
148 movw %bx, %fs:(PARAM_VIDEO_EGA_BX) # Identifies EGA to the kernel
149 cmpb $0x10, %bl # No, it's a CGA/MDA/HGA card.
150 je basret
151
152 incb adapter
153 movw $0x1a00, %ax # Check EGA or VGA?
154 int $0x10
155 cmpb $0x1a, %al # 1a means VGA...
156 jne basret # anything else is EGA.
157
158 incb %fs:(PARAM_HAVE_VGA) # We've detected a VGA
159 incb adapter
160basret: ret
161
162# Store the video mode parameters for later usage by the kernel.
163# This is done by asking the BIOS except for the rows/columns
164# parameters in the default 80x25 mode -- these are set directly,
165# because some very obscure BIOSes supply insane values.
166mode_params:
167#ifdef CONFIG_VIDEO_SELECT
168 cmpb $0, graphic_mode
169 jnz mopar_gr
170#endif
171 movb $0x03, %ah # Read cursor position
172 xorb %bh, %bh
173 int $0x10
174 movw %dx, %fs:(PARAM_CURSOR_POS)
175 movb $0x0f, %ah # Read page/mode/width
176 int $0x10
177 movw %bx, %fs:(PARAM_VIDEO_PAGE)
178 movw %ax, %fs:(PARAM_VIDEO_MODE) # Video mode and screen width
179 cmpb $0x7, %al # MDA/HGA => segment differs
180 jnz mopar0
181
182 movw $0xb000, video_segment
183mopar0: movw %gs:(0x485), %ax # Font size
184 movw %ax, %fs:(PARAM_FONT_POINTS) # (valid only on EGA/VGA)
185 movw force_size, %ax # Forced size?
186 orw %ax, %ax
187 jz mopar1
188
189 movb %ah, %fs:(PARAM_VIDEO_COLS)
190 movb %al, %fs:(PARAM_VIDEO_LINES)
191 ret
192
193mopar1: movb $25, %al
194 cmpb $0, adapter # If we are on CGA/MDA/HGA, the
195 jz mopar2 # screen must have 25 lines.
196
197 movb %gs:(0x484), %al # On EGA/VGA, use the EGA+ BIOS
198 incb %al # location of max lines.
199mopar2: movb %al, %fs:(PARAM_VIDEO_LINES)
200 ret
201
202#ifdef CONFIG_VIDEO_SELECT
203# Fetching of VESA frame buffer parameters
204mopar_gr:
205 leaw modelist+1024, %di
206 movb $0x23, %fs:(PARAM_HAVE_VGA)
207 movw 16(%di), %ax
208 movw %ax, %fs:(PARAM_LFB_LINELENGTH)
209 movw 18(%di), %ax
210 movw %ax, %fs:(PARAM_LFB_WIDTH)
211 movw 20(%di), %ax
212 movw %ax, %fs:(PARAM_LFB_HEIGHT)
213 movb 25(%di), %al
214 movb $0, %ah
215 movw %ax, %fs:(PARAM_LFB_DEPTH)
216 movb 29(%di), %al
217 movb $0, %ah
218 movw %ax, %fs:(PARAM_LFB_PAGES)
219 movl 40(%di), %eax
220 movl %eax, %fs:(PARAM_LFB_BASE)
221 movl 31(%di), %eax
222 movl %eax, %fs:(PARAM_LFB_COLORS)
223 movl 35(%di), %eax
224 movl %eax, %fs:(PARAM_LFB_COLORS+4)
225 movw 0(%di), %ax
226 movw %ax, %fs:(PARAM_VESA_ATTRIB)
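# (the offsets above index the VESA mode information block filled in
# by the earlier 0x4f01 call: 16 = BytesPerScanLine, 18 = XResolution,
# 20 = YResolution, 25 = BitsPerPixel, 29 = NumberOfImagePages,
# 31 = the RGB mask sizes/positions, 40 = PhysBasePtr.)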
227
228# get video mem size
229 leaw modelist+1024, %di
230 movw $0x4f00, %ax
231 int $0x10
232 xorl %eax, %eax
233 movw 18(%di), %ax
234 movl %eax, %fs:(PARAM_LFB_SIZE)
235
236# switching the DAC to 8-bit is for <= 8 bpp only
237 movw %fs:(PARAM_LFB_DEPTH), %ax
238 cmpw $8, %ax
239 jg dac_done
240
241# get DAC switching capability
242 xorl %eax, %eax
243 movb 10(%di), %al
244 testb $1, %al
245 jz dac_set
246
247# attempt to switch DAC to 8-bit
248 movw $0x4f08, %ax
249 movw $0x0800, %bx
250 int $0x10
251 cmpw $0x004f, %ax
252 jne dac_set
253 movb %bh, dac_size # store actual DAC size
254
255dac_set:
256# set color size to DAC size
257 movb dac_size, %al
258 movb %al, %fs:(PARAM_LFB_COLORS+0)
259 movb %al, %fs:(PARAM_LFB_COLORS+2)
260 movb %al, %fs:(PARAM_LFB_COLORS+4)
261 movb %al, %fs:(PARAM_LFB_COLORS+6)
262
263# set color offsets to 0
264 movb $0, %fs:(PARAM_LFB_COLORS+1)
265 movb $0, %fs:(PARAM_LFB_COLORS+3)
266 movb $0, %fs:(PARAM_LFB_COLORS+5)
267 movb $0, %fs:(PARAM_LFB_COLORS+7)
268
269dac_done:
 270# get protected mode interface information
271 movw $0x4f0a, %ax
272 xorw %bx, %bx
273 xorw %di, %di
274 int $0x10
275 cmp $0x004f, %ax
276 jnz no_pm
277
278 movw %es, %fs:(PARAM_VESAPM_SEG)
279 movw %di, %fs:(PARAM_VESAPM_OFF)
280no_pm: ret
281
282# The video mode menu
283mode_menu:
284 leaw keymsg, %si # "Return/Space/Timeout" message
285 call prtstr
286 call flush
287nokey: call getkt
288
289 cmpb $0x0d, %al # ENTER ?
290 je listm # yes - manual mode selection
291
 292	cmpb	$0x20, %al			# SPACE ?
 293	je	defmd1				# yes - default mode
294
295 call beep
296 jmp nokey
297
298defmd1: ret # No mode chosen? Default 80x25
299
300listm: call mode_table # List mode table
301listm0: leaw name_bann, %si # Print adapter name
302 call prtstr
303 movw card_name, %si
304 orw %si, %si
305 jnz an2
306
307 movb adapter, %al
308 leaw old_name, %si
309 orb %al, %al
310 jz an1
311
312 leaw ega_name, %si
313 decb %al
314 jz an1
315
316 leaw vga_name, %si
317 jmp an1
318
319an2: call prtstr
320 leaw svga_name, %si
321an1: call prtstr
322 leaw listhdr, %si # Table header
323 call prtstr
324 movb $0x30, %dl # DL holds mode number
325 leaw modelist, %si
326lm1: cmpw $ASK_VGA, (%si) # End?
327 jz lm2
328
329 movb %dl, %al # Menu selection number
330 call prtchr
331 call prtsp2
332 lodsw
333 call prthw # Mode ID
334 call prtsp2
335 movb 0x1(%si), %al
336 call prtdec # Rows
337 movb $0x78, %al # the letter 'x'
338 call prtchr
339 lodsw
340 call prtdec # Columns
341 movb $0x0d, %al # New line
342 call prtchr
343 movb $0x0a, %al
344 call prtchr
345 incb %dl # Next character
346 cmpb $0x3a, %dl
347 jnz lm1
348
349 movb $0x61, %dl
350 jmp lm1
351
352lm2: leaw prompt, %si # Mode prompt
353 call prtstr
354 leaw edit_buf, %di # Editor buffer
355lm3: call getkey
356 cmpb $0x0d, %al # Enter?
357 jz lment
358
359 cmpb $0x08, %al # Backspace?
360 jz lmbs
361
362 cmpb $0x20, %al # Printable?
363 jc lm3
364
365 cmpw $edit_buf+4, %di # Enough space?
366 jz lm3
367
368 stosb
369 call prtchr
370 jmp lm3
371
372lmbs: cmpw $edit_buf, %di # Backspace
373 jz lm3
374
375 decw %di
376 movb $0x08, %al
377 call prtchr
378 call prtspc
379 movb $0x08, %al
380 call prtchr
381 jmp lm3
382
383lment: movb $0, (%di)
384 leaw crlft, %si
385 call prtstr
386 leaw edit_buf, %si
387 cmpb $0, (%si) # Empty string = default mode
388 jz lmdef
389
390 cmpb $0, 1(%si) # One character = menu selection
391 jz mnusel
392
393 cmpw $0x6373, (%si) # "scan" => mode scanning
394 jnz lmhx
395
396 cmpw $0x6e61, 2(%si)
397 jz lmscan
398
399lmhx: xorw %bx, %bx # Else => mode ID in hex
400lmhex: lodsb
401 orb %al, %al
402 jz lmuse1
403
404 subb $0x30, %al
405 jc lmbad
406
407 cmpb $10, %al
408 jc lmhx1
409
410 subb $7, %al
411 andb $0xdf, %al
412 cmpb $10, %al
413 jc lmbad
414
415 cmpb $16, %al
416 jnc lmbad
417
418lmhx1: shlw $4, %bx
419 orb %al, %bl
420 jmp lmhex
421
422lmuse1: movw %bx, %ax
423 jmp lmuse
424
425mnusel: lodsb # Menu selection
426 xorb %ah, %ah
427 subb $0x30, %al
428 jc lmbad
429
430 cmpb $10, %al
431 jc lmuse
432
433 cmpb $0x61-0x30, %al
434 jc lmbad
435
436 subb $0x61-0x30-10, %al
437 cmpb $36, %al
438 jnc lmbad
439
440lmuse: call mode_set
441 jc lmdef
442
443lmbad: leaw unknt, %si
444 call prtstr
445 jmp lm2
446lmscan: cmpb $0, adapter # Scanning only on EGA/VGA
447 jz lmbad
448
449 movw $0, mt_end # Scanning of modes is
450 movb $1, scanning # done as new autodetection.
451 call mode_table
452 jmp listm0
453lmdef: ret
454
455# Additional parts of mode_set... (relative jumps, you know)
456setv7: # Video7 extended modes
457 DO_STORE
458 subb $VIDEO_FIRST_V7>>8, %bh
459 movw $0x6f05, %ax
460 int $0x10
461 stc
462 ret
463
464_setrec: jmp setrec # Ugly...
465_set_80x25: jmp set_80x25
466
467# Aliases for backward compatibility.
468setalias:
469 movw $VIDEO_80x25, %ax
470 incw %bx
471 jz mode_set
472
473 movb $VIDEO_8POINT-VIDEO_FIRST_SPECIAL, %al
474 incw %bx
475 jnz setbad # Fall-through!
476
477# Setting of user mode (AX=mode ID) => CF=success
478mode_set:
479 movw %ax, %fs:(0x01fa) # Store mode for use in acpi_wakeup.S
480 movw %ax, %bx
481 cmpb $0xff, %ah
482 jz setalias
483
484 testb $VIDEO_RECALC>>8, %ah
485 jnz _setrec
486
487 cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah
488 jnc setres
489
490 cmpb $VIDEO_FIRST_SPECIAL>>8, %ah
491 jz setspc
492
493 cmpb $VIDEO_FIRST_V7>>8, %ah
494 jz setv7
495
496 cmpb $VIDEO_FIRST_VESA>>8, %ah
497 jnc check_vesa
498
499 orb %ah, %ah
500 jz setmenu
501
502 decb %ah
503 jz setbios
504
505setbad: clc
506 movb $0, do_restore # The screen needn't be restored
507 ret
508
509setvesa:
510 DO_STORE
511 subb $VIDEO_FIRST_VESA>>8, %bh
512 movw $0x4f02, %ax # VESA BIOS mode set call
513 int $0x10
514 cmpw $0x004f, %ax # AL=4f if implemented
515 jnz setbad # AH=0 if OK
516
517 stc
518 ret
519
520setbios:
521 DO_STORE
522 int $0x10 # Standard BIOS mode set call
523 pushw %bx
524 movb $0x0f, %ah # Check if really set
525 int $0x10
526 popw %bx
527 cmpb %bl, %al
528 jnz setbad
529
530 stc
531 ret
532
533setspc: xorb %bh, %bh # Set special mode
534 cmpb $VIDEO_LAST_SPECIAL-VIDEO_FIRST_SPECIAL, %bl
535 jnc setbad
536
537 addw %bx, %bx
538 jmp *spec_inits(%bx)
539
540setmenu:
541 orb %al, %al # 80x25 is an exception
542 jz _set_80x25
543
544 pushw %bx # Set mode chosen from menu
545 call mode_table # Build the mode table
546 popw %ax
547 shlw $2, %ax
548 addw %ax, %si
549 cmpw %di, %si
550 jnc setbad
551
552 movw (%si), %ax # Fetch mode ID
553_m_s: jmp mode_set
554
555setres: pushw %bx # Set mode chosen by resolution
556 call mode_table
557 popw %bx
558 xchgb %bl, %bh
559setr1: lodsw
560 cmpw $ASK_VGA, %ax # End of the list?
561 jz setbad
562
563 lodsw
564 cmpw %bx, %ax
565 jnz setr1
566
567 movw -4(%si), %ax # Fetch mode ID
568 jmp _m_s
569
570check_vesa:
571 leaw modelist+1024, %di
572 subb $VIDEO_FIRST_VESA>>8, %bh
573 movw %bx, %cx # Get mode information structure
574 movw $0x4f01, %ax
575 int $0x10
576 addb $VIDEO_FIRST_VESA>>8, %bh
577 cmpw $0x004f, %ax
578 jnz setbad
579
580 movb (%di), %al # Check capabilities.
581 andb $0x19, %al
582 cmpb $0x09, %al
583 jz setvesa # This is a text mode
584
585 movb (%di), %al # Check capabilities.
586 andb $0x99, %al
587 cmpb $0x99, %al
588 jnz _setbad # Doh! No linear frame buffer.
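# (bits of the mode attribute word tested above: 0 = mode supported,
# 3 = color, 4 = graphics, 7 = linear frame buffer; 0x09 under mask
# 0x19 is thus a supported color text mode, and 0x99 under mask 0x99
# additionally requires the linear frame buffer.)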
589
590 subb $VIDEO_FIRST_VESA>>8, %bh
591 orw $0x4000, %bx # Use linear frame buffer
592 movw $0x4f02, %ax # VESA BIOS mode set call
593 int $0x10
594 cmpw $0x004f, %ax # AL=4f if implemented
595 jnz _setbad # AH=0 if OK
596
597 movb $1, graphic_mode # flag graphic mode
598 movb $0, do_restore # no screen restore
599 stc
600 ret
601
602_setbad: jmp setbad # Ugly...
603
604# Recalculate vertical display end registers -- this fixes various
605# inconsistencies of extended modes on many adapters. Called when
606# the VIDEO_RECALC flag is set in the mode ID.
607
608setrec: subb $VIDEO_RECALC>>8, %ah # Set the base mode
609 call mode_set
610 jnc rct3
611
612 movw %gs:(0x485), %ax # Font size in pixels
613 movb %gs:(0x484), %bl # Number of rows
614 incb %bl
615 mulb %bl # Number of visible
616 decw %ax # scan lines - 1
617 movw $0x3d4, %dx
618 movw %ax, %bx
619 movb $0x12, %al # Lower 8 bits
620 movb %bl, %ah
621 outw %ax, %dx
622 movb $0x07, %al # Bits 8 and 9 in the overflow register
623 call inidx
624 xchgb %al, %ah
625 andb $0xbd, %ah
626 shrb %bh
627 jnc rct1
628 orb $0x02, %ah
629rct1: shrb %bh
630 jnc rct2
631 orb $0x40, %ah
632rct2: movb $0x07, %al
633 outw %ax, %dx
634 stc
635rct3: ret
636
637# Table of routines for setting of the special modes.
638spec_inits:
639 .word set_80x25
640 .word set_8pixel
641 .word set_80x43
642 .word set_80x28
643 .word set_current
644 .word set_80x30
645 .word set_80x34
646 .word set_80x60
647 .word set_gfx
648
649# Set the 80x25 mode. If already set, do nothing.
650set_80x25:
651 movw $0x5019, force_size # Override possibly broken BIOS
652use_80x25:
653#ifdef CONFIG_VIDEO_400_HACK
654 movw $0x1202, %ax # Force 400 scan lines
655 movb $0x30, %bl
656 int $0x10
657#else
658 movb $0x0f, %ah # Get current mode ID
659 int $0x10
660 cmpw $0x5007, %ax # Mode 7 (80x25 mono) is the only one available
661 jz st80 # on CGA/MDA/HGA and is also available on EGAM
662
663 cmpw $0x5003, %ax # Unknown mode, force 80x25 color
664 jnz force3
665
666st80: cmpb $0, adapter # CGA/MDA/HGA => mode 3/7 is always 80x25
667 jz set80
668
669 movb %gs:(0x0484), %al # This is EGA+ -- beware of 80x50 etc.
670 orb %al, %al # Some buggy BIOS'es set 0 rows
671 jz set80
672
673 cmpb $24, %al # It's hopefully correct
674 jz set80
675#endif /* CONFIG_VIDEO_400_HACK */
676force3: DO_STORE
677 movw $0x0003, %ax # Forced set
678 int $0x10
679set80: stc
680 ret
681
682# Set the 80x50/80x43 8-pixel mode. Simple BIOS calls.
683set_8pixel:
684 DO_STORE
685 call use_80x25 # The base is 80x25
686set_8pt:
687 movw $0x1112, %ax # Use 8x8 font
688 xorb %bl, %bl
689 int $0x10
690 movw $0x1200, %ax # Use alternate print screen
691 movb $0x20, %bl
692 int $0x10
693 movw $0x1201, %ax # Turn off cursor emulation
694 movb $0x34, %bl
695 int $0x10
696 movb $0x01, %ah # Define cursor scan lines 6-7
697 movw $0x0607, %cx
698 int $0x10
699set_current:
700 stc
701 ret
702
703# Set the 80x28 mode. This mode works on all VGA's, because it's a standard
704# 80x25 mode with 14-point fonts instead of 16-point.
705set_80x28:
706 DO_STORE
707 call use_80x25 # The base is 80x25
708set14: movw $0x1111, %ax # Use 9x14 font
709 xorb %bl, %bl
710 int $0x10
711 movb $0x01, %ah # Define cursor scan lines 11-12
712 movw $0x0b0c, %cx
713 int $0x10
714 stc
715 ret
716
 717# Set the 80x43 mode. This mode works on all VGA's.
718# It's a 350-scanline mode with 8-pixel font.
719set_80x43:
720 DO_STORE
721 movw $0x1201, %ax # Set 350 scans
722 movb $0x30, %bl
723 int $0x10
724 movw $0x0003, %ax # Reset video mode
725 int $0x10
726 jmp set_8pt # Use 8-pixel font
727
728# Set the 80x30 mode (all VGA's). 480 scanlines, 16-pixel font.
729set_80x30:
730 call use_80x25 # Start with real 80x25
731 DO_STORE
732 movw $0x3cc, %dx # Get CRTC port
733 inb %dx, %al
734 movb $0xd4, %dl
735 rorb %al # Mono or color?
736 jc set48a
737
738 movb $0xb4, %dl
739set48a: movw $0x0c11, %ax # Vertical sync end (also unlocks CR0-7)
740 call outidx
741 movw $0x0b06, %ax # Vertical total
742 call outidx
743 movw $0x3e07, %ax # (Vertical) overflow
744 call outidx
745 movw $0xea10, %ax # Vertical sync start
746 call outidx
747 movw $0xdf12, %ax # Vertical display end
748 call outidx
749 movw $0xe715, %ax # Vertical blank start
750 call outidx
751 movw $0x0416, %ax # Vertical blank end
752 call outidx
753 pushw %dx
754 movb $0xcc, %dl # Misc output register (read)
755 inb %dx, %al
756 movb $0xc2, %dl # (write)
757 andb $0x0d, %al # Preserve clock select bits and color bit
758 orb $0xe2, %al # Set correct sync polarity
759 outb %al, %dx
760 popw %dx
761 movw $0x501e, force_size
762 stc # That's all.
763 ret
764
765# Set the 80x34 mode (all VGA's). 480 scans, 14-pixel font.
766set_80x34:
767 call set_80x30 # Set 480 scans
768 call set14 # And 14-pt font
769 movw $0xdb12, %ax # VGA vertical display end
770 movw $0x5022, force_size
771setvde: call outidx
772 stc
773 ret
774
775# Set the 80x60 mode (all VGA's). 480 scans, 8-pixel font.
776set_80x60:
777 call set_80x30 # Set 480 scans
778 call set_8pt # And 8-pt font
779 movw $0xdf12, %ax # VGA vertical display end
780 movw $0x503c, force_size
781 jmp setvde
782
783# Special hack for ThinkPad graphics
784set_gfx:
785#ifdef CONFIG_VIDEO_GFX_HACK
786 movw $VIDEO_GFX_BIOS_AX, %ax
787 movw $VIDEO_GFX_BIOS_BX, %bx
788 int $0x10
789 movw $VIDEO_GFX_DUMMY_RESOLUTION, force_size
790 stc
791#endif
792 ret
793
794#ifdef CONFIG_VIDEO_RETAIN
795
796# Store screen contents to temporary buffer.
797store_screen:
798 cmpb $0, do_restore # Already stored?
799 jnz stsr
800
801 testb $CAN_USE_HEAP, loadflags # Have we space for storing?
802 jz stsr
803
804 pushw %ax
805 pushw %bx
806 pushw force_size # Don't force specific size
807 movw $0, force_size
808 call mode_params # Obtain params of current mode
809 popw force_size
810 movb %fs:(PARAM_VIDEO_LINES), %ah
811 movb %fs:(PARAM_VIDEO_COLS), %al
812 movw %ax, %bx # BX=dimensions
813 mulb %ah
814 movw %ax, %cx # CX=number of characters
815 addw %ax, %ax # Calculate image size
816 addw $modelist+1024+4, %ax
817 cmpw heap_end_ptr, %ax
818 jnc sts1 # Unfortunately, out of memory
819
820 movw %fs:(PARAM_CURSOR_POS), %ax # Store mode params
821 leaw modelist+1024, %di
822 stosw
823 movw %bx, %ax
824 stosw
825 pushw %ds # Store the screen
826 movw video_segment, %ds
827 xorw %si, %si
828 rep
829 movsw
830 popw %ds
831 incb do_restore # Screen will be restored later
832sts1: popw %bx
833 popw %ax
834stsr: ret
835
836# Restore screen contents from temporary buffer.
837restore_screen:
838 cmpb $0, do_restore # Has the screen been stored?
839 jz res1
840
841 call mode_params # Get parameters of current mode
842 movb %fs:(PARAM_VIDEO_LINES), %cl
843 movb %fs:(PARAM_VIDEO_COLS), %ch
844 leaw modelist+1024, %si # Screen buffer
845 lodsw # Set cursor position
846 movw %ax, %dx
847 cmpb %cl, %dh
848 jc res2
849
850 movb %cl, %dh
851 decb %dh
852res2: cmpb %ch, %dl
853 jc res3
854
855 movb %ch, %dl
856 decb %dl
857res3: movb $0x02, %ah
858 movb $0x00, %bh
859 int $0x10
860 lodsw # Display size
861 movb %ah, %dl # DL=number of lines
862 movb $0, %ah # BX=phys. length of orig. line
863 movw %ax, %bx
864 cmpb %cl, %dl # Too many?
865 jc res4
866
867 pushw %ax
868 movb %dl, %al
869 subb %cl, %al
870 mulb %bl
871 addw %ax, %si
872 addw %ax, %si
873 popw %ax
874 movb %cl, %dl
875res4: cmpb %ch, %al # Too wide?
876 jc res5
877
878 movb %ch, %al # AX=width of src. line
879res5: movb $0, %cl
880 xchgb %ch, %cl
881 movw %cx, %bp # BP=width of dest. line
882 pushw %es
883 movw video_segment, %es
884 xorw %di, %di # Move the data
885 addw %bx, %bx # Convert BX and BP to _bytes_
886 addw %bp, %bp
887res6: pushw %si
888 pushw %di
889 movw %ax, %cx
890 rep
891 movsw
892 popw %di
893 popw %si
894 addw %bp, %di
895 addw %bx, %si
896 decb %dl
897 jnz res6
898
899 popw %es # Done
900res1: ret
901#endif /* CONFIG_VIDEO_RETAIN */
902
903# Write to indexed VGA register (AL=index, AH=data, DX=index reg. port)
904outidx: outb %al, %dx
905 pushw %ax
906 movb %ah, %al
907 incw %dx
908 outb %al, %dx
909 decw %dx
910 popw %ax
911 ret
912
 913# Build the table of video modes (stored after the setup.S code at the
 914# `modelist' label). Each video mode record looks like:
915# .word MODE-ID (our special mode ID (see above))
916# .byte rows (number of rows)
917# .byte columns (number of columns)
 918# Returns the address of the end of the table in DI; the end is marked
 919# with an ASK_VGA ID.
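# (Example: the standard entry stored as $VIDEO_80x25 + 0x50190000
# below packs rows = 0x19 = 25 and columns = 0x50 = 80 into the two
# bytes after the mode ID.)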
920mode_table:
921 movw mt_end, %di # Already filled?
922 orw %di, %di
923 jnz mtab1x
924
925 leaw modelist, %di # Store standard modes:
926 movl $VIDEO_80x25 + 0x50190000, %eax # The 80x25 mode (ALL)
927 stosl
928 movb adapter, %al # CGA/MDA/HGA -- no more modes
929 orb %al, %al
930 jz mtabe
931
932 decb %al
933 jnz mtabv
934
935 movl $VIDEO_8POINT + 0x502b0000, %eax # The 80x43 EGA mode
936 stosl
937 jmp mtabe
938
939mtab1x: jmp mtab1
940
941mtabv: leaw vga_modes, %si # All modes for std VGA
942 movw $vga_modes_end-vga_modes, %cx
943 rep # I'm unable to use movsw as I don't know how to store a half
944 movsb # of the expression above to cx without using explicit shr.
945
946 cmpb $0, scanning # Mode scan requested?
947 jz mscan1
948
949 call mode_scan
950mscan1:
951
952#ifdef CONFIG_VIDEO_LOCAL
953 call local_modes
954#endif /* CONFIG_VIDEO_LOCAL */
955
956#ifdef CONFIG_VIDEO_VESA
957 call vesa_modes # Detect VESA VGA modes
958#endif /* CONFIG_VIDEO_VESA */
959
960#ifdef CONFIG_VIDEO_SVGA
961 cmpb $0, scanning # Bypass when scanning
962 jnz mscan2
963
964 call svga_modes # Detect SVGA cards & modes
965mscan2:
966#endif /* CONFIG_VIDEO_SVGA */
967
968mtabe:
969
970#ifdef CONFIG_VIDEO_COMPACT
971 leaw modelist, %si
972 movw %di, %dx
973 movw %si, %di
974cmt1: cmpw %dx, %si # Scan all modes
975 jz cmt2
976
977 leaw modelist, %bx # Find in previous entries
978 movw 2(%si), %cx
979cmt3: cmpw %bx, %si
980 jz cmt4
981
982 cmpw 2(%bx), %cx # Found => don't copy this entry
983 jz cmt5
984
985 addw $4, %bx
986 jmp cmt3
987
988cmt4: movsl # Copy entry
989 jmp cmt1
990
991cmt5: addw $4, %si # Skip entry
992 jmp cmt1
993
994cmt2:
995#endif /* CONFIG_VIDEO_COMPACT */
996
997 movw $ASK_VGA, (%di) # End marker
998 movw %di, mt_end
999mtab1: leaw modelist, %si # SI=mode list, DI=list end
1000ret0: ret
1001
1002# Modes usable on all standard VGAs
1003vga_modes:
1004 .word VIDEO_8POINT
1005 .word 0x5032 # 80x50
1006 .word VIDEO_80x43
1007 .word 0x502b # 80x43
1008 .word VIDEO_80x28
1009 .word 0x501c # 80x28
1010 .word VIDEO_80x30
1011 .word 0x501e # 80x30
1012 .word VIDEO_80x34
1013 .word 0x5022 # 80x34
1014 .word VIDEO_80x60
1015 .word 0x503c # 80x60
1016#ifdef CONFIG_VIDEO_GFX_HACK
1017 .word VIDEO_GFX_HACK
1018 .word VIDEO_GFX_DUMMY_RESOLUTION
1019#endif
1020
1021vga_modes_end:
1022# Detect VESA modes.
1023
1024#ifdef CONFIG_VIDEO_VESA
1025vesa_modes:
1026 cmpb $2, adapter # VGA only
1027 jnz ret0
1028
1029 movw %di, %bp # BP=original mode table end
1030 addw $0x200, %di # Buffer space
1031 movw $0x4f00, %ax # VESA Get card info call
1032 int $0x10
1033 movw %bp, %di
1034 cmpw $0x004f, %ax # Successful?
1035 jnz ret0
1036
1037 cmpw $0x4556, 0x200(%di)
1038 jnz ret0
1039
1040 cmpw $0x4153, 0x202(%di)
1041 jnz ret0
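# (0x4556 and 0x4153 are the little-endian words "VE" and "SA": the
# "VESA" signature at the start of the info block returned above.)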
1042
1043 movw $vesa_name, card_name # Set name to "VESA VGA"
1044 pushw %gs
1045 lgsw 0x20e(%di), %si # GS:SI=mode list
1046 movw $128, %cx # Iteration limit
1047vesa1:
1048# gas version 2.9.1, using BFD version 2.9.1.0.23 buggers the next inst.
1049# XXX: lodsw %gs:(%si), %ax # Get next mode in the list
1050 gs; lodsw
1051 cmpw $0xffff, %ax # End of the table?
1052 jz vesar
1053
1054 cmpw $0x0080, %ax # Check validity of mode ID
1055 jc vesa2
1056
1057 orb %ah, %ah # Valid IDs: 0x0000-0x007f/0x0100-0x07ff
1058 jz vesan # Certain BIOSes report 0x80-0xff!
1059
1060 cmpw $0x0800, %ax
1061 jnc vesae
1062
1063vesa2: pushw %cx
1064 movw %ax, %cx # Get mode information structure
1065 movw $0x4f01, %ax
1066 int $0x10
1067 movw %cx, %bx # BX=mode number
1068 addb $VIDEO_FIRST_VESA>>8, %bh
1069 popw %cx
1070 cmpw $0x004f, %ax
1071 jnz vesan # Don't report errors (buggy BIOSES)
1072
1073 movb (%di), %al # Check capabilities. We require
1074 andb $0x19, %al # a color text mode.
1075 cmpb $0x09, %al
1076 jnz vesan
1077
1078 cmpw $0xb800, 8(%di) # Standard video memory address required
1079 jnz vesan
1080
1081 testb $2, (%di) # Mode characteristics supplied?
1082 movw %bx, (%di) # Store mode number
1083 jz vesa3
1084
1085 xorw %dx, %dx
1086 movw 0x12(%di), %bx # Width
1087 orb %bh, %bh
1088 jnz vesan
1089
1090 movb %bl, 0x3(%di)
1091 movw 0x14(%di), %ax # Height
1092 orb %ah, %ah
1093 jnz vesan
1094
1095 movb %al, 2(%di)
1096 mulb %bl
1097 cmpw $8193, %ax # Small enough for Linux console driver?
1098 jnc vesan
1099
1100 jmp vesaok
1101
1102vesa3: subw $0x8108, %bx # This mode has no detailed info specified,
1103 jc vesan # so it must be a standard VESA mode.
1104
1105 cmpw $5, %bx
1106 jnc vesan
1107
1108 movw vesa_text_mode_table(%bx), %ax
1109 movw %ax, 2(%di)
1110vesaok: addw $4, %di # The mode is valid. Store it.
1111vesan: loop vesa1 # Next mode. Limit exceeded => error
1112vesae: leaw vesaer, %si
1113 call prtstr
1114 movw %bp, %di # Discard already found modes.
1115vesar: popw %gs
1116 ret
1117
1118# Dimensions of standard VESA text modes
1119vesa_text_mode_table:
1120 .byte 60, 80 # 0108
1121 .byte 25, 132 # 0109
1122 .byte 43, 132 # 010A
1123 .byte 50, 132 # 010B
1124 .byte 60, 132 # 010C
1125#endif /* CONFIG_VIDEO_VESA */
1126
1127# Scan for video modes. A bit dirty, but should work.
1128mode_scan:
1129 movw $0x0100, %cx # Start with mode 0
1130scm1: movb $0, %ah # Test the mode
1131 movb %cl, %al
1132 int $0x10
1133 movb $0x0f, %ah
1134 int $0x10
1135 cmpb %cl, %al
1136 jnz scm2 # Mode not set
1137
1138 movw $0x3c0, %dx # Test if it's a text mode
1139 movb $0x10, %al # Mode bits
1140 call inidx
1141 andb $0x03, %al
1142 jnz scm2
1143
1144 movb $0xce, %dl # Another set of mode bits
1145 movb $0x06, %al
1146 call inidx
1147 shrb %al
1148 jc scm2
1149
1150 movb $0xd4, %dl # Cursor location
1151 movb $0x0f, %al
1152 call inidx
1153 orb %al, %al
1154 jnz scm2
1155
1156 movw %cx, %ax # Ok, store the mode
1157 stosw
1158 movb %gs:(0x484), %al # Number of rows
1159 incb %al
1160 stosb
1161 movw %gs:(0x44a), %ax # Number of columns
1162 stosb
1163scm2: incb %cl
1164 jns scm1
1165
1166 movw $0x0003, %ax # Return back to mode 3
1167 int $0x10
1168 ret
1169
1170tstidx: outw %ax, %dx # OUT DX,AX and inidx
1171inidx: outb %al, %dx # Read from indexed VGA register
1172 incw %dx # AL=index, DX=index reg port -> AL=data
1173 inb %dx, %al
1174 decw %dx
1175 ret
1176
1177# Try to detect type of SVGA card and supply (usually approximate) video
1178# mode table for it.
1179
1180#ifdef CONFIG_VIDEO_SVGA
1181svga_modes:
1182 leaw svga_table, %si # Test all known SVGA adapters
1183dosvga: lodsw
1184 movw %ax, %bp # Default mode table
1185 orw %ax, %ax
1186 jz didsv1
1187
1188 lodsw # Pointer to test routine
1189 pushw %si
1190 pushw %di
1191 pushw %es
1192 movw $0xc000, %bx
1193 movw %bx, %es
1194 call *%ax # Call test routine
1195 popw %es
1196 popw %di
1197 popw %si
1198 orw %bp, %bp
1199 jz dosvga
1200
1201 movw %bp, %si # Found, copy the modes
1202 movb svga_prefix, %ah
1203cpsvga: lodsb
1204 orb %al, %al
1205 jz didsv
1206
1207 stosw
1208 movsw
1209 jmp cpsvga
1210
1211didsv: movw %si, card_name # Store pointer to card name
1212didsv1: ret
1213
1214# Table of all known SVGA cards. For each card, we store a pointer to
1215# a table of video modes supported by the card and a pointer to a routine
1216# used for testing of presence of the card. The video mode table is always
1217# followed by the name of the card or the chipset.
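# Each mode table entry is three bytes -- BIOS mode number, rows,
# columns (e.g. 0x54, 0x2b, 0x84 in s3_md below is mode 0x54,
# 43 rows, 132 columns) -- and a zero byte ends the list, with the
# card name string following it.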
1218svga_table:
1219 .word ati_md, ati_test
1220 .word oak_md, oak_test
1221 .word paradise_md, paradise_test
1222 .word realtek_md, realtek_test
1223 .word s3_md, s3_test
1224 .word chips_md, chips_test
1225 .word video7_md, video7_test
1226 .word cirrus5_md, cirrus5_test
1227 .word cirrus6_md, cirrus6_test
1228 .word cirrus1_md, cirrus1_test
1229 .word ahead_md, ahead_test
1230 .word everex_md, everex_test
1231 .word genoa_md, genoa_test
1232 .word trident_md, trident_test
1233 .word tseng_md, tseng_test
1234 .word 0
1235
1236# Test routines and mode tables:
1237
1238# S3 - The test algorithm was taken from the SuperProbe package
1239# for XFree86 1.2.1. Report bugs to Christoph.Niemann@linux.org
1240s3_test:
1241 movw $0x0f35, %cx # we store some constants in cl/ch
1242 movw $0x03d4, %dx
1243 movb $0x38, %al
1244 call inidx
1245 movb %al, %bh # store current CRT-register 0x38
1246 movw $0x0038, %ax
1247 call outidx # disable writing to special regs
1248 movb %cl, %al # check whether we can write special reg 0x35
1249 call inidx
1250 movb %al, %bl # save the current value of CRT reg 0x35
1251 andb $0xf0, %al # clear bits 0-3
1252 movb %al, %ah
1253 movb %cl, %al # and write it to CRT reg 0x35
1254 call outidx
1255 call inidx # now read it back
1256 andb %ch, %al # clear the upper 4 bits
1257 jz s3_2 # the first test failed. But we have a
1258
1259 movb %bl, %ah # second chance
1260 movb %cl, %al
1261 call outidx
1262 jmp s3_1 # do the other tests
1263
1264s3_2: movw %cx, %ax # load ah with 0xf and al with 0x35
1265 orb %bl, %ah # set the upper 4 bits of ah with the orig value
1266 call outidx # write ...
1267 call inidx # ... and reread
1268 andb %cl, %al # turn off the upper 4 bits
1269 pushw %ax
1270 movb %bl, %ah # restore old value in register 0x35
1271 movb %cl, %al
1272 call outidx
1273 popw %ax
1274 cmpb %ch, %al # setting lower 4 bits was successful => bad
1275 je no_s3 # writing is allowed => this is not an S3
1276
1277s3_1: movw $0x4838, %ax # allow writing to special regs by putting
1278 call outidx # magic number into CRT-register 0x38
1279 movb %cl, %al # check whether we can write special reg 0x35
1280 call inidx
1281 movb %al, %bl
1282 andb $0xf0, %al
1283 movb %al, %ah
1284 movb %cl, %al
1285 call outidx
1286 call inidx
1287 andb %ch, %al
1288 jnz no_s3 # no, we can't write => no S3
1289
1290 movw %cx, %ax
1291 orb %bl, %ah
1292 call outidx
1293 call inidx
1294 andb %ch, %al
1295 pushw %ax
1296 movb %bl, %ah # restore old value in register 0x35
1297 movb %cl, %al
1298 call outidx
1299 popw %ax
1300 cmpb %ch, %al
1301 jne no_s31 # writing not possible => no S3
1302 movb $0x30, %al
1303 call inidx # now get the S3 id ...
1304 leaw idS3, %di
1305 movw $0x10, %cx
1306 repne
1307 scasb
1308 je no_s31
1309
1310 movb %bh, %ah
1311 movb $0x38, %al
1312 jmp s3rest
1313
1314no_s3: movb $0x35, %al # restore CRT register 0x35
1315 movb %bl, %ah
1316 call outidx
1317no_s31: xorw %bp, %bp # Detection failed
1318s3rest: movb %bh, %ah
1319 movb $0x38, %al # restore old value of CRT register 0x38
1320 jmp outidx
1321
1322idS3: .byte 0x81, 0x82, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95
1323 .byte 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa8, 0xb0
1324
1325s3_md: .byte 0x54, 0x2b, 0x84
1326 .byte 0x55, 0x19, 0x84
1327 .byte 0
1328 .ascii "S3"
1329 .byte 0
1330
1331# ATI cards.
1332ati_test:
1333 leaw idati, %si
1334 movw $0x31, %di
1335 movw $0x09, %cx
1336 repe
1337 cmpsb
1338 je atiok
1339
1340 xorw %bp, %bp
1341atiok: ret
1342
1343idati: .ascii "761295520"
1344
1345ati_md: .byte 0x23, 0x19, 0x84
1346 .byte 0x33, 0x2c, 0x84
1347 .byte 0x22, 0x1e, 0x64
1348 .byte 0x21, 0x19, 0x64
1349 .byte 0x58, 0x21, 0x50
1350 .byte 0x5b, 0x1e, 0x50
1351 .byte 0
1352 .ascii "ATI"
1353 .byte 0
1354
1355# AHEAD
1356ahead_test:
1357 movw $0x200f, %ax
1358 movw $0x3ce, %dx
1359 outw %ax, %dx
1360 incw %dx
1361 inb %dx, %al
1362 cmpb $0x20, %al
1363 je isahed
1364
1365 cmpb $0x21, %al
1366 je isahed
1367
1368 xorw %bp, %bp
1369isahed: ret
1370
1371ahead_md:
1372 .byte 0x22, 0x2c, 0x84
1373 .byte 0x23, 0x19, 0x84
1374 .byte 0x24, 0x1c, 0x84
1375 .byte 0x2f, 0x32, 0xa0
1376 .byte 0x32, 0x22, 0x50
1377 .byte 0x34, 0x42, 0x50
1378 .byte 0
1379 .ascii "Ahead"
1380 .byte 0
1381
1382# Chips & Tech.
1383chips_test:
1384 movw $0x3c3, %dx
1385 inb %dx, %al
1386 orb $0x10, %al
1387 outb %al, %dx
1388 movw $0x104, %dx
1389 inb %dx, %al
1390 movb %al, %bl
1391 movw $0x3c3, %dx
1392 inb %dx, %al
1393 andb $0xef, %al
1394 outb %al, %dx
1395 cmpb $0xa5, %bl
1396 je cantok
1397
1398 xorw %bp, %bp
1399cantok: ret
1400
1401chips_md:
1402 .byte 0x60, 0x19, 0x84
1403 .byte 0x61, 0x32, 0x84
1404 .byte 0
1405 .ascii "Chips & Technologies"
1406 .byte 0
1407
1408# Cirrus Logic 5X0
1409cirrus1_test:
1410 movw $0x3d4, %dx
1411 movb $0x0c, %al
1412 outb %al, %dx
1413 incw %dx
1414 inb %dx, %al
1415 movb %al, %bl
1416 xorb %al, %al
1417 outb %al, %dx
1418 decw %dx
1419 movb $0x1f, %al
1420 outb %al, %dx
1421 incw %dx
1422 inb %dx, %al
1423 movb %al, %bh
1424 xorb %ah, %ah
1425 shlb $4, %al
1426 movw %ax, %cx
1427 movb %bh, %al
1428 shrb $4, %al
1429 addw %ax, %cx
1430 shlw $8, %cx
1431 addw $6, %cx
1432 movw %cx, %ax
1433 movw $0x3c4, %dx
1434 outw %ax, %dx
1435 incw %dx
1436 inb %dx, %al
1437 andb %al, %al
1438 jnz nocirr
1439
1440 movb %bh, %al
1441 outb %al, %dx
1442 inb %dx, %al
1443 cmpb $0x01, %al
1444 je iscirr
1445
1446nocirr: xorw %bp, %bp
1447iscirr: movw $0x3d4, %dx
1448 movb %bl, %al
1449 xorb %ah, %ah
1450 shlw $8, %ax
1451 addw $0x0c, %ax
1452 outw %ax, %dx
1453 ret
1454
1455cirrus1_md:
1456 .byte 0x1f, 0x19, 0x84
1457 .byte 0x20, 0x2c, 0x84
1458 .byte 0x22, 0x1e, 0x84
1459 .byte 0x31, 0x25, 0x64
1460 .byte 0
1461 .ascii "Cirrus Logic 5X0"
1462 .byte 0
1463
1464# Cirrus Logic 54XX
1465cirrus5_test:
1466 movw $0x3c4, %dx
1467 movb $6, %al
1468 call inidx
1469 movb %al, %bl # BL=backup
1470 movw $6, %ax
1471 call tstidx
1472 cmpb $0x0f, %al
1473 jne c5fail
1474
1475 movw $0x1206, %ax
1476 call tstidx
1477 cmpb $0x12, %al
1478 jne c5fail
1479
1480 movb $0x1e, %al
1481 call inidx
1482 movb %al, %bh
1483 movb %bh, %ah
1484 andb $0xc0, %ah
1485 movb $0x1e, %al
1486 call tstidx
1487 andb $0x3f, %al
1488 jne c5xx
1489
1490 movb $0x1e, %al
1491 movb %bh, %ah
1492 orb $0x3f, %ah
1493 call tstidx
1494 xorb $0x3f, %al
1495 andb $0x3f, %al
1496c5xx: pushf
1497 movb $0x1e, %al
1498 movb %bh, %ah
1499 outw %ax, %dx
1500 popf
1501 je c5done
1502
1503c5fail: xorw %bp, %bp
1504c5done: movb $6, %al
1505 movb %bl, %ah
1506 outw %ax, %dx
1507 ret
1508
1509cirrus5_md:
1510 .byte 0x14, 0x19, 0x84
1511 .byte 0x54, 0x2b, 0x84
1512 .byte 0
1513 .ascii "Cirrus Logic 54XX"
1514 .byte 0
1515
1516# Cirrus Logic 64XX -- no known extra modes, but must be identified, because
1517# it's misidentified by the Ahead test.
1518cirrus6_test:
1519 movw $0x3ce, %dx
1520 movb $0x0a, %al
1521 call inidx
1522 movb %al, %bl # BL=backup
1523 movw $0xce0a, %ax
1524 call tstidx
1525 orb %al, %al
1526 jne c2fail
1527
1528 movw $0xec0a, %ax
1529 call tstidx
1530 cmpb $0x01, %al
1531 jne c2fail
1532
1533 movb $0xaa, %al
1534	call	inidx			# 4X, 5X, 7X and 8X are valid 64XX chip IDs.
1535 shrb $4, %al
1536 subb $4, %al
1537 jz c6done
1538
1539 decb %al
1540 jz c6done
1541
1542 subb $2, %al
1543 jz c6done
1544
1545 decb %al
1546 jz c6done
1547
1548c2fail: xorw %bp, %bp
1549c6done: movb $0x0a, %al
1550 movb %bl, %ah
1551 outw %ax, %dx
1552 ret
1553
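# The decb/subb chain above is simply a range check on the high nibble of
# register 0xaa; an equivalent C sketch (illustrative only):
#
#	switch (id >> 4) {
#	case 4: case 5: case 7: case 8:
#		return 1;	/* a 64XX chip */
#	default:
#		return 0;
#	}
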
1554cirrus6_md:
1555 .byte 0
1556 .ascii "Cirrus Logic 64XX"
1557 .byte 0
1558
1559# Everex / Trident
1560everex_test:
1561 movw $0x7000, %ax
1562 xorw %bx, %bx
1563 int $0x10
1564 cmpb $0x70, %al
1565 jne noevrx
1566
1567 shrw $4, %dx
1568 cmpw $0x678, %dx
1569 je evtrid
1570
1571 cmpw $0x236, %dx
1572 jne evrxok
1573
1574evtrid: leaw trident_md, %bp
1575evrxok: ret
1576
1577noevrx: xorw %bp, %bp
1578 ret
1579
1580everex_md:
1581 .byte 0x03, 0x22, 0x50
1582 .byte 0x04, 0x3c, 0x50
1583 .byte 0x07, 0x2b, 0x64
1584 .byte 0x08, 0x4b, 0x64
1585 .byte 0x0a, 0x19, 0x84
1586 .byte 0x0b, 0x2c, 0x84
1587 .byte 0x16, 0x1e, 0x50
1588 .byte 0x18, 0x1b, 0x64
1589 .byte 0x21, 0x40, 0xa0
1590 .byte 0x40, 0x1e, 0x84
1591 .byte 0
1592 .ascii "Everex/Trident"
1593 .byte 0
1594
1595# Genoa.
1596genoa_test:
1597 leaw idgenoa, %si # Check Genoa 'clues'
1598 xorw %ax, %ax
1599 movb %es:(0x37), %al
1600 movw %ax, %di
1601 movw $0x04, %cx
1602 decw %si
1603 decw %di
1604l1: incw %si
1605 incw %di
1606	movb	(%si), %al
1607	testb	%al, %al	# a zero byte in the pattern matches anything
1608	jz	l2

1610	cmpb	%es:(%di), %al
1611l2:	loope	l1		# continue while the bytes match and %cx > 0
1612 orw %cx, %cx
1613 je isgen
1614
1615 xorw %bp, %bp
1616isgen: ret
1617
1618idgenoa: .byte 0x77, 0x00, 0x99, 0x66
1619
1620genoa_md:
1621 .byte 0x58, 0x20, 0x50
1622 .byte 0x5a, 0x2a, 0x64
1623 .byte 0x60, 0x19, 0x84
1624 .byte 0x61, 0x1d, 0x84
1625 .byte 0x62, 0x20, 0x84
1626 .byte 0x63, 0x2c, 0x84
1627 .byte 0x64, 0x3c, 0x84
1628 .byte 0x6b, 0x4f, 0x64
1629 .byte 0x72, 0x3c, 0x50
1630 .byte 0x74, 0x42, 0x50
1631 .byte 0x78, 0x4b, 0x64
1632 .byte 0
1633 .ascii "Genoa"
1634 .byte 0
1635
1636# OAK
1637oak_test:
1638 leaw idoakvga, %si
1639 movw $0x08, %di
1640 movw $0x08, %cx
1641 repe
1642 cmpsb
1643 je isoak
1644
1645 xorw %bp, %bp
1646isoak: ret
1647
1648idoakvga: .ascii "OAK VGA "
1649
1650oak_md: .byte 0x4e, 0x3c, 0x50
1651 .byte 0x4f, 0x3c, 0x84
1652 .byte 0x50, 0x19, 0x84
1653 .byte 0x51, 0x2b, 0x84
1654 .byte 0
1655 .ascii "OAK"
1656 .byte 0
1657
1658# WD Paradise.
1659paradise_test:
1660 leaw idparadise, %si
1661 movw $0x7d, %di
1662 movw $0x04, %cx
1663 repe
1664 cmpsb
1665 je ispara
1666
1667 xorw %bp, %bp
1668ispara: ret
1669
1670idparadise: .ascii "VGA="
1671
1672paradise_md:
1673 .byte 0x41, 0x22, 0x50
1674 .byte 0x47, 0x1c, 0x84
1675 .byte 0x55, 0x19, 0x84
1676 .byte 0x54, 0x2c, 0x84
1677 .byte 0
1678 .ascii "Paradise"
1679 .byte 0
1680
1681# Trident.
1682trident_test:
1683 movw $0x3c4, %dx
1684 movb $0x0e, %al
1685 outb %al, %dx
1686 incw %dx
1687 inb %dx, %al
1688 xchgb %al, %ah
1689 xorb %al, %al
1690 outb %al, %dx
1691 inb %dx, %al
1692 xchgb %ah, %al
1693	movb	%al, %bl	# Strange thing ... in the book this wasn't
1694	andb	$0x02, %bl	# necessary, but it worked on my card, which
1695	jz	setb2		# is a Trident. Without it the screen goes
1696				# blurred ...
1697 andb $0xfd, %al
1698 jmp clrb2
1699
1700setb2: orb $0x02, %al
1701clrb2: outb %al, %dx
1702 andb $0x0f, %ah
1703 cmpb $0x02, %ah
1704 je istrid
1705
1706 xorw %bp, %bp
1707istrid: ret
1708
1709trident_md:
1710 .byte 0x50, 0x1e, 0x50
1711 .byte 0x51, 0x2b, 0x50
1712 .byte 0x52, 0x3c, 0x50
1713 .byte 0x57, 0x19, 0x84
1714 .byte 0x58, 0x1e, 0x84
1715 .byte 0x59, 0x2b, 0x84
1716 .byte 0x5a, 0x3c, 0x84
1717 .byte 0
1718 .ascii "Trident"
1719 .byte 0
1720
1721# Tseng.
1722tseng_test:
1723 movw $0x3cd, %dx
1724 inb %dx, %al # Could things be this simple ! :-)
1725 movb %al, %bl
1726 movb $0x55, %al
1727 outb %al, %dx
1728 inb %dx, %al
1729 movb %al, %ah
1730 movb %bl, %al
1731 outb %al, %dx
1732 cmpb $0x55, %ah
1733 je istsen
1734
1735isnot: xorw %bp, %bp
1736istsen: ret
1737
1738tseng_md:
1739 .byte 0x26, 0x3c, 0x50
1740 .byte 0x2a, 0x28, 0x64
1741 .byte 0x23, 0x19, 0x84
1742 .byte 0x24, 0x1c, 0x84
1743 .byte 0x22, 0x2c, 0x84
1744 .byte 0x21, 0x3c, 0x84
1745 .byte 0
1746 .ascii "Tseng"
1747 .byte 0
1748
1749# Video7.
1750video7_test:
1751	movw	$0x3cc, %dx	# bit 0 of the miscellaneous output register
1752	inb	%dx, %al	# selects the CRTC address: 0x3b4 for mono,
1753	movw	$0x3b4, %dx	# 0x3d4 for colour
1754	andb	$0x01, %al
1755	jz	even7
1756
1757	movw	$0x3d4, %dx
1758even7: movb $0x0c, %al
1759 outb %al, %dx
1760 incw %dx
1761 inb %dx, %al
1762 movb %al, %bl
1763 movb $0x55, %al
1764 outb %al, %dx
1765 inb %dx, %al
1766 decw %dx
1767 movb $0x1f, %al
1768 outb %al, %dx
1769 incw %dx
1770 inb %dx, %al
1771 movb %al, %bh
1772 decw %dx
1773 movb $0x0c, %al
1774 outb %al, %dx
1775 incw %dx
1776 movb %bl, %al
1777 outb %al, %dx
1778	movb	$0x55, %al	# Video 7 mirrors CRTC register 0x0c XORed
1779	xorb	$0xea, %al	# with 0xea into register 0x1f, so the 0x55
1780	cmpb	%bh, %al	# written above must read back as 0xbf
1781	jne	isnot
1782
1783 movb $VIDEO_FIRST_V7>>8, svga_prefix # Use special mode switching
1784 ret
1785
1786video7_md:
1787 .byte 0x40, 0x2b, 0x50
1788 .byte 0x43, 0x3c, 0x50
1789 .byte 0x44, 0x3c, 0x64
1790 .byte 0x41, 0x19, 0x84
1791 .byte 0x42, 0x2c, 0x84
1792 .byte 0x45, 0x1c, 0x84
1793 .byte 0
1794 .ascii "Video 7"
1795 .byte 0
1796
1797# Realtek VGA
1798realtek_test:
1799 leaw idrtvga, %si
1800 movw $0x45, %di
1801 movw $0x0b, %cx
1802 repe
1803 cmpsb
1804 je isrt
1805
1806 xorw %bp, %bp
1807isrt: ret
1808
1809idrtvga: .ascii "REALTEK VGA"
1810
1811realtek_md:
1812 .byte 0x1a, 0x3c, 0x50
1813 .byte 0x1b, 0x19, 0x84
1814 .byte 0x1c, 0x1e, 0x84
1815 .byte 0x1d, 0x2b, 0x84
1816 .byte 0x1e, 0x3c, 0x84
1817 .byte 0
1818 .ascii "REALTEK"
1819 .byte 0
1820
1821#endif /* CONFIG_VIDEO_SVGA */
1822
1823# User-defined local mode table (VGA only)
1824#ifdef CONFIG_VIDEO_LOCAL
1825local_modes:
1826 leaw local_mode_table, %si
1827locm1: lodsw
1828 orw %ax, %ax
1829 jz locm2
1830
1831 stosw
1832 movsw
1833 jmp locm1
1834
1835locm2: ret
1836
1837# This is the table of local video modes which can be supplied manually
1838# by the user. Each entry consists of a mode ID (word) and the dimensions
1839# (a byte for the row count, then a byte for the column count). These modes
1840# are placed before all SVGA and VESA modes and override them if table
1841# compacting is enabled. The table must end with a zero word followed
1842# by a NUL-terminated video adapter name.
1843local_mode_table:
1844 .word 0x0100 # Example: 40x25
1845 .byte 25,40
1846 .word 0
1847 .ascii "Local"
1848 .byte 0
1849#endif /* CONFIG_VIDEO_LOCAL */
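
# An illustrative C view of one local_mode_table entry as described above
# (a hypothetical struct, shown only to make the byte layout explicit):
#
#	struct local_mode {
#		unsigned short id;	/* mode ID; 0 terminates the table */
#		unsigned char rows;	/* row count, e.g. 25 */
#		unsigned char cols;	/* column count, e.g. 40 */
#	} __attribute__((packed));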
1850
1851# Read a key and return the ASCII code in al, scan code in ah
1852getkey: xorb %ah, %ah
1853 int $0x16
1854 ret
1855
1856# Read a key with a timeout of 30 seconds.
1857# The hardware clock is used to get the time.
1858getkt: call gettime
1859 addb $30, %al # Wait 30 seconds
1860 cmpb $60, %al
1861 jl lminute
1862
1863 subb $60, %al
1864lminute:
1865 movb %al, %cl
1866again: movb $0x01, %ah
1867 int $0x16
1868 jnz getkey # key pressed, so get it
1869
1870 call gettime
1871 cmpb %cl, %al
1872 jne again
1873
1874 movb $0x20, %al # timeout, return `space'
1875 ret
1876
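# The deadline arithmetic in getkt works modulo 60 on the RTC seconds
# counter.  The same computation in C (an illustrative sketch):
#
#	#include <assert.h>
#
#	static unsigned char deadline(unsigned char now)
#	{
#		unsigned char d = now + 30;	/* addb $30, %al */
#		if (d >= 60)			/* cmpb $60, %al */
#			d -= 60;		/* subb $60, %al */
#		return d;
#	}
#
#	int main(void)
#	{
#		assert(deadline(10) == 40);
#		assert(deadline(45) == 15);	/* wraps past the minute */
#		return 0;
#	}
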
1877# Flush the keyboard buffer
1878flush: movb $0x01, %ah
1879 int $0x16
1880 jz empty
1881
1882 xorb %ah, %ah
1883 int $0x16
1884 jmp flush
1885
1886empty: ret
1887
1888# Print hexadecimal number.
1889prthw: pushw %ax
1890 movb %ah, %al
1891 call prthb
1892 popw %ax
1893prthb: pushw %ax
1894 shrb $4, %al
1895 call prthn
1896 popw %ax
1897 andb $0x0f, %al
1898prthn: cmpb $0x0a, %al
1899 jc prth1
1900
1901 addb $0x07, %al
1902prth1: addb $0x30, %al
1903 jmp prtchr
1904
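# prthn's extra +7 bridges the ASCII gap between '9' (0x39) and 'A' (0x41).
# The same nibble-to-digit conversion in C (an illustrative sketch):
#
#	static char hex_digit(unsigned char n)
#	{
#		n &= 0x0f;
#		if (n >= 10)		/* cmpb $0x0a / addb $0x07 */
#			n += 7;
#		return n + '0';		/* addb $0x30 */
#	}
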
1905# Print decimal number in al
1906prtdec: pushw %ax
1907 pushw %cx
1908 xorb %ah, %ah
1909 movb $0x0a, %cl
1910 idivb %cl
1911 cmpb $0x09, %al
1912 jbe lt100
1913
1914 call prtdec
1915 jmp skip10
1916
1917lt100: addb $0x30, %al
1918 call prtchr
1919skip10: movb %ah, %al
1920 addb $0x30, %al
1921 call prtchr
1922 popw %cx
1923 popw %ax
1924 ret
1925
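# prtdec recurses on the quotient before printing the remainder, so the
# digits come out most-significant first.  An equivalent C sketch (note
# that, like the assembly, single-digit values print with a leading 0):
#
#	#include <stdio.h>
#
#	static void prtdec_like(unsigned char v)
#	{
#		unsigned char q = v / 10, r = v % 10;
#		if (q > 9)
#			prtdec_like(q);
#		else
#			putchar('0' + q);
#		putchar('0' + r);
#	}
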
1926store_edid:
1927 pushw %es # just save all registers
1928 pushw %ax
1929 pushw %bx
1930 pushw %cx
1931 pushw %dx
1932 pushw %di
1933
1934 pushw %fs
1935 popw %es
1936
1937 movl $0x13131313, %eax # memset block with 0x13
1938 movw $32, %cx
1939 movw $0x140, %di
1940 cld
1941 rep
1942 stosl
1943
1944 movw $0x4f15, %ax # do VBE/DDC
1945 movw $0x01, %bx
1946 movw $0x00, %cx
1947 movw $0x01, %dx
1948 movw $0x140, %di
1949 int $0x10
1950
1951 popw %di # restore all registers
1952 popw %dx
1953 popw %cx
1954 popw %bx
1955 popw %ax
1956 popw %es
1957 ret
1958
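# The 32 dwords cleared and refilled above hold a 128-byte EDID record at
# offset 0x140.  A consumer could sanity-check it against the fixed EDID
# header signature; an illustrative C check (not kernel code):
#
#	#include <string.h>
#
#	static const unsigned char edid_sig[8] =
#		{ 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 };
#
#	static int edid_looks_valid(const unsigned char edid[128])
#	{
#		return memcmp(edid, edid_sig, sizeof(edid_sig)) == 0;
#	}
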
1959# VIDEO_SELECT-only variables
1960mt_end: .word 0 # End of video mode table if built
1961edit_buf: .space 6 # Line editor buffer
1962card_name: .word 0 # Pointer to adapter name
1963scanning: .byte 0 # Performing mode scan
1964do_restore: .byte 0 # Screen contents altered during mode change
1965svga_prefix: .byte VIDEO_FIRST_BIOS>>8 # Default prefix for BIOS modes
1966graphic_mode: .byte 0 # Graphic mode with a linear frame buffer
1967dac_size: .byte 6 # DAC bit depth
1968
1969# Status messages
1970keymsg: .ascii "Press <RETURN> to see video modes available, "
1971 .ascii "<SPACE> to continue or wait 30 secs"
1972 .byte 0x0d, 0x0a, 0
1973
1974listhdr: .byte 0x0d, 0x0a
1975 .ascii "Mode: COLSxROWS:"
1976
1977crlft: .byte 0x0d, 0x0a, 0
1978
1979prompt: .byte 0x0d, 0x0a
1980 .asciz "Enter mode number or `scan': "
1981
1982unknt: .asciz "Unknown mode ID. Try again."
1983
1984badmdt: .ascii "You passed an undefined mode number."
1985 .byte 0x0d, 0x0a, 0
1986
1987vesaer: .ascii "Error: Scanning of VESA modes failed. Please "
1988 .ascii "report to <mj@ucw.cz>."
1989 .byte 0x0d, 0x0a, 0
1990
1991old_name: .asciz "CGA/MDA/HGA"
1992
1993ega_name: .asciz "EGA"
1994
1995svga_name: .ascii " "
1996
1997vga_name: .asciz "VGA"
1998
1999vesa_name: .asciz "VESA"
2000
2001name_bann: .asciz "Video adapter: "
2002#endif /* CONFIG_VIDEO_SELECT */
2003
2004# Other variables:
2005adapter: .byte 0 # Video adapter: 0=CGA/MDA/HGA,1=EGA,2=VGA
2006video_segment: .word 0xb800 # Video memory segment
2007force_size: .word 0 # Use this size instead of the one in BIOS vars
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
new file mode 100644
index 000000000000..9ce51dee30b3
--- /dev/null
+++ b/arch/x86_64/defconfig
@@ -0,0 +1,1129 @@
1#
2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.11-bk7
4# Sat Mar 12 23:43:44 2005
5#
6CONFIG_X86_64=y
7CONFIG_64BIT=y
8CONFIG_X86=y
9CONFIG_MMU=y
10CONFIG_RWSEM_GENERIC_SPINLOCK=y
11CONFIG_GENERIC_CALIBRATE_DELAY=y
12CONFIG_X86_CMPXCHG=y
13CONFIG_EARLY_PRINTK=y
14CONFIG_HPET_TIMER=y
15CONFIG_HPET_EMULATE_RTC=y
16CONFIG_GENERIC_ISA_DMA=y
17CONFIG_GENERIC_IOMAP=y
18
19#
20# Code maturity level options
21#
22CONFIG_EXPERIMENTAL=y
23CONFIG_CLEAN_COMPILE=y
24CONFIG_LOCK_KERNEL=y
25
26#
27# General setup
28#
29CONFIG_LOCALVERSION=""
30CONFIG_SWAP=y
31CONFIG_SYSVIPC=y
32CONFIG_POSIX_MQUEUE=y
33# CONFIG_BSD_PROCESS_ACCT is not set
34CONFIG_SYSCTL=y
35# CONFIG_AUDIT is not set
36CONFIG_LOG_BUF_SHIFT=18
37# CONFIG_HOTPLUG is not set
38CONFIG_KOBJECT_UEVENT=y
39CONFIG_IKCONFIG=y
40CONFIG_IKCONFIG_PROC=y
41# CONFIG_CPUSETS is not set
42# CONFIG_EMBEDDED is not set
43CONFIG_KALLSYMS=y
44CONFIG_KALLSYMS_ALL=y
45# CONFIG_KALLSYMS_EXTRA_PASS is not set
46CONFIG_BASE_FULL=y
47CONFIG_FUTEX=y
48CONFIG_EPOLL=y
49# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
50CONFIG_SHMEM=y
51CONFIG_CC_ALIGN_FUNCTIONS=0
52CONFIG_CC_ALIGN_LABELS=0
53CONFIG_CC_ALIGN_LOOPS=0
54CONFIG_CC_ALIGN_JUMPS=0
55# CONFIG_TINY_SHMEM is not set
56CONFIG_BASE_SMALL=0
57
58#
59# Loadable module support
60#
61CONFIG_MODULES=y
62CONFIG_MODULE_UNLOAD=y
63CONFIG_MODULE_FORCE_UNLOAD=y
64CONFIG_OBSOLETE_MODPARM=y
65# CONFIG_MODVERSIONS is not set
66# CONFIG_MODULE_SRCVERSION_ALL is not set
67# CONFIG_KMOD is not set
68CONFIG_STOP_MACHINE=y
69
70#
71# Processor type and features
72#
73# CONFIG_MK8 is not set
74# CONFIG_MPSC is not set
75CONFIG_GENERIC_CPU=y
76CONFIG_X86_L1_CACHE_BYTES=128
77CONFIG_X86_L1_CACHE_SHIFT=7
78CONFIG_X86_TSC=y
79CONFIG_X86_GOOD_APIC=y
80# CONFIG_MICROCODE is not set
81CONFIG_X86_MSR=y
82CONFIG_X86_CPUID=y
83CONFIG_X86_HT=y
84CONFIG_X86_IO_APIC=y
85CONFIG_X86_LOCAL_APIC=y
86CONFIG_MTRR=y
87CONFIG_SMP=y
88# CONFIG_PREEMPT is not set
89CONFIG_SCHED_SMT=y
90CONFIG_K8_NUMA=y
91# CONFIG_NUMA_EMU is not set
92CONFIG_DISCONTIGMEM=y
93CONFIG_NUMA=y
94CONFIG_HAVE_DEC_LOCK=y
95CONFIG_NR_CPUS=8
96CONFIG_GART_IOMMU=y
97CONFIG_SWIOTLB=y
98CONFIG_X86_MCE=y
99CONFIG_X86_MCE_INTEL=y
100CONFIG_SECCOMP=y
101CONFIG_GENERIC_HARDIRQS=y
102CONFIG_GENERIC_IRQ_PROBE=y
103
104#
105# Power management options
106#
107CONFIG_PM=y
108# CONFIG_PM_DEBUG is not set
109CONFIG_SOFTWARE_SUSPEND=y
110CONFIG_PM_STD_PARTITION=""
111
112#
113# ACPI (Advanced Configuration and Power Interface) Support
114#
115CONFIG_ACPI=y
116CONFIG_ACPI_BOOT=y
117CONFIG_ACPI_INTERPRETER=y
118CONFIG_ACPI_SLEEP=y
119CONFIG_ACPI_SLEEP_PROC_FS=y
120CONFIG_ACPI_AC=y
121CONFIG_ACPI_BATTERY=y
122CONFIG_ACPI_BUTTON=y
123# CONFIG_ACPI_VIDEO is not set
124CONFIG_ACPI_FAN=y
125CONFIG_ACPI_PROCESSOR=y
126CONFIG_ACPI_THERMAL=y
127CONFIG_ACPI_NUMA=y
128# CONFIG_ACPI_ASUS is not set
129# CONFIG_ACPI_IBM is not set
130CONFIG_ACPI_TOSHIBA=y
131CONFIG_ACPI_BLACKLIST_YEAR=2001
132CONFIG_ACPI_DEBUG=y
133CONFIG_ACPI_BUS=y
134CONFIG_ACPI_EC=y
135CONFIG_ACPI_POWER=y
136CONFIG_ACPI_PCI=y
137CONFIG_ACPI_SYSTEM=y
138# CONFIG_ACPI_CONTAINER is not set
139
140#
141# CPU Frequency scaling
142#
143CONFIG_CPU_FREQ=y
144# CONFIG_CPU_FREQ_DEBUG is not set
145CONFIG_CPU_FREQ_STAT=y
146# CONFIG_CPU_FREQ_STAT_DETAILS is not set
147CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y
148# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
149CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
150# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
151CONFIG_CPU_FREQ_GOV_USERSPACE=y
152CONFIG_CPU_FREQ_GOV_ONDEMAND=y
153CONFIG_CPU_FREQ_TABLE=y
154
155#
156# CPUFreq processor drivers
157#
158CONFIG_X86_POWERNOW_K8=y
159CONFIG_X86_POWERNOW_K8_ACPI=y
160# CONFIG_X86_SPEEDSTEP_CENTRINO is not set
161CONFIG_X86_ACPI_CPUFREQ=y
162
163#
164# shared options
165#
166CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y
167
168#
169# Bus options (PCI etc.)
170#
171CONFIG_PCI=y
172CONFIG_PCI_DIRECT=y
173CONFIG_PCI_MMCONFIG=y
174CONFIG_UNORDERED_IO=y
175CONFIG_PCI_MSI=y
176# CONFIG_PCI_LEGACY_PROC is not set
177# CONFIG_PCI_NAMES is not set
178
179#
180# PCCARD (PCMCIA/CardBus) support
181#
182# CONFIG_PCCARD is not set
183
184#
185# PC-card bridges
186#
187
188#
189# PCI Hotplug Support
190#
191# CONFIG_HOTPLUG_PCI is not set
192
193#
194# Executable file formats / Emulations
195#
196CONFIG_BINFMT_ELF=y
197# CONFIG_BINFMT_MISC is not set
198CONFIG_IA32_EMULATION=y
199CONFIG_IA32_AOUT=y
200CONFIG_COMPAT=y
201CONFIG_SYSVIPC_COMPAT=y
202CONFIG_UID16=y
203
204#
205# Device Drivers
206#
207
208#
209# Generic Driver Options
210#
211CONFIG_STANDALONE=y
212CONFIG_PREVENT_FIRMWARE_BUILD=y
213# CONFIG_FW_LOADER is not set
214# CONFIG_DEBUG_DRIVER is not set
215
216#
217# Memory Technology Devices (MTD)
218#
219# CONFIG_MTD is not set
220
221#
222# Parallel port support
223#
224# CONFIG_PARPORT is not set
225
226#
227# Plug and Play support
228#
229# CONFIG_PNP is not set
230
231#
232# Block devices
233#
234CONFIG_BLK_DEV_FD=y
235# CONFIG_BLK_CPQ_DA is not set
236# CONFIG_BLK_CPQ_CISS_DA is not set
237# CONFIG_BLK_DEV_DAC960 is not set
238# CONFIG_BLK_DEV_UMEM is not set
239# CONFIG_BLK_DEV_COW_COMMON is not set
240CONFIG_BLK_DEV_LOOP=y
241# CONFIG_BLK_DEV_CRYPTOLOOP is not set
242# CONFIG_BLK_DEV_NBD is not set
243# CONFIG_BLK_DEV_SX8 is not set
244# CONFIG_BLK_DEV_UB is not set
245CONFIG_BLK_DEV_RAM=y
246CONFIG_BLK_DEV_RAM_COUNT=16
247CONFIG_BLK_DEV_RAM_SIZE=4096
248CONFIG_BLK_DEV_INITRD=y
249CONFIG_INITRAMFS_SOURCE=""
250CONFIG_LBD=y
251# CONFIG_CDROM_PKTCDVD is not set
252
253#
254# IO Schedulers
255#
256CONFIG_IOSCHED_NOOP=y
257CONFIG_IOSCHED_AS=y
258CONFIG_IOSCHED_DEADLINE=y
259CONFIG_IOSCHED_CFQ=y
260# CONFIG_ATA_OVER_ETH is not set
261
262#
263# ATA/ATAPI/MFM/RLL support
264#
265CONFIG_IDE=y
266CONFIG_BLK_DEV_IDE=y
267
268#
269# Please see Documentation/ide.txt for help/info on IDE drives
270#
271# CONFIG_BLK_DEV_IDE_SATA is not set
272# CONFIG_BLK_DEV_HD_IDE is not set
273CONFIG_BLK_DEV_IDEDISK=y
274CONFIG_IDEDISK_MULTI_MODE=y
275CONFIG_BLK_DEV_IDECD=y
276# CONFIG_BLK_DEV_IDETAPE is not set
277# CONFIG_BLK_DEV_IDEFLOPPY is not set
278# CONFIG_BLK_DEV_IDESCSI is not set
279# CONFIG_IDE_TASK_IOCTL is not set
280
281#
282# IDE chipset support/bugfixes
283#
284CONFIG_IDE_GENERIC=y
285# CONFIG_BLK_DEV_CMD640 is not set
286CONFIG_BLK_DEV_IDEPCI=y
287# CONFIG_IDEPCI_SHARE_IRQ is not set
288# CONFIG_BLK_DEV_OFFBOARD is not set
289# CONFIG_BLK_DEV_GENERIC is not set
290# CONFIG_BLK_DEV_OPTI621 is not set
291# CONFIG_BLK_DEV_RZ1000 is not set
292CONFIG_BLK_DEV_IDEDMA_PCI=y
293# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
294CONFIG_IDEDMA_PCI_AUTO=y
295# CONFIG_IDEDMA_ONLYDISK is not set
296# CONFIG_BLK_DEV_AEC62XX is not set
297# CONFIG_BLK_DEV_ALI15X3 is not set
298CONFIG_BLK_DEV_AMD74XX=y
299# CONFIG_BLK_DEV_ATIIXP is not set
300# CONFIG_BLK_DEV_CMD64X is not set
301# CONFIG_BLK_DEV_TRIFLEX is not set
302# CONFIG_BLK_DEV_CY82C693 is not set
303# CONFIG_BLK_DEV_CS5520 is not set
304# CONFIG_BLK_DEV_CS5530 is not set
305# CONFIG_BLK_DEV_HPT34X is not set
306# CONFIG_BLK_DEV_HPT366 is not set
307# CONFIG_BLK_DEV_SC1200 is not set
308CONFIG_BLK_DEV_PIIX=y
309# CONFIG_BLK_DEV_NS87415 is not set
310# CONFIG_BLK_DEV_PDC202XX_OLD is not set
311# CONFIG_BLK_DEV_PDC202XX_NEW is not set
312# CONFIG_BLK_DEV_SVWKS is not set
313# CONFIG_BLK_DEV_SIIMAGE is not set
314# CONFIG_BLK_DEV_SIS5513 is not set
315# CONFIG_BLK_DEV_SLC90E66 is not set
316# CONFIG_BLK_DEV_TRM290 is not set
317# CONFIG_BLK_DEV_VIA82CXXX is not set
318# CONFIG_IDE_ARM is not set
319CONFIG_BLK_DEV_IDEDMA=y
320# CONFIG_IDEDMA_IVB is not set
321CONFIG_IDEDMA_AUTO=y
322# CONFIG_BLK_DEV_HD is not set
323
324#
325# SCSI device support
326#
327CONFIG_SCSI=y
328# CONFIG_SCSI_PROC_FS is not set
329
330#
331# SCSI support type (disk, tape, CD-ROM)
332#
333CONFIG_BLK_DEV_SD=y
334# CONFIG_CHR_DEV_ST is not set
335# CONFIG_CHR_DEV_OSST is not set
336# CONFIG_BLK_DEV_SR is not set
337# CONFIG_CHR_DEV_SG is not set
338
339#
340# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
341#
342# CONFIG_SCSI_MULTI_LUN is not set
343# CONFIG_SCSI_CONSTANTS is not set
344# CONFIG_SCSI_LOGGING is not set
345
346#
347# SCSI Transport Attributes
348#
349# CONFIG_SCSI_SPI_ATTRS is not set
350# CONFIG_SCSI_FC_ATTRS is not set
351# CONFIG_SCSI_ISCSI_ATTRS is not set
352
353#
354# SCSI low-level drivers
355#
356CONFIG_BLK_DEV_3W_XXXX_RAID=y
357# CONFIG_SCSI_3W_9XXX is not set
358# CONFIG_SCSI_ACARD is not set
359# CONFIG_SCSI_AACRAID is not set
360# CONFIG_SCSI_AIC7XXX is not set
361# CONFIG_SCSI_AIC7XXX_OLD is not set
362CONFIG_SCSI_AIC79XX=y
363CONFIG_AIC79XX_CMDS_PER_DEVICE=32
364CONFIG_AIC79XX_RESET_DELAY_MS=4000
365# CONFIG_AIC79XX_ENABLE_RD_STRM is not set
366# CONFIG_AIC79XX_DEBUG_ENABLE is not set
367CONFIG_AIC79XX_DEBUG_MASK=0
368# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
369# CONFIG_MEGARAID_NEWGEN is not set
370# CONFIG_MEGARAID_LEGACY is not set
371CONFIG_SCSI_SATA=y
372# CONFIG_SCSI_SATA_AHCI is not set
373# CONFIG_SCSI_SATA_SVW is not set
374CONFIG_SCSI_ATA_PIIX=y
375# CONFIG_SCSI_SATA_NV is not set
376# CONFIG_SCSI_SATA_PROMISE is not set
377# CONFIG_SCSI_SATA_QSTOR is not set
378# CONFIG_SCSI_SATA_SX4 is not set
379# CONFIG_SCSI_SATA_SIL is not set
380# CONFIG_SCSI_SATA_SIS is not set
381# CONFIG_SCSI_SATA_ULI is not set
382CONFIG_SCSI_SATA_VIA=y
383# CONFIG_SCSI_SATA_VITESSE is not set
384# CONFIG_SCSI_BUSLOGIC is not set
385# CONFIG_SCSI_DMX3191D is not set
386# CONFIG_SCSI_EATA is not set
387# CONFIG_SCSI_EATA_PIO is not set
388# CONFIG_SCSI_FUTURE_DOMAIN is not set
389# CONFIG_SCSI_GDTH is not set
390# CONFIG_SCSI_IPS is not set
391# CONFIG_SCSI_INITIO is not set
392# CONFIG_SCSI_INIA100 is not set
393# CONFIG_SCSI_SYM53C8XX_2 is not set
394# CONFIG_SCSI_IPR is not set
395# CONFIG_SCSI_QLOGIC_ISP is not set
396# CONFIG_SCSI_QLOGIC_FC is not set
397# CONFIG_SCSI_QLOGIC_1280 is not set
398CONFIG_SCSI_QLA2XXX=y
399# CONFIG_SCSI_QLA21XX is not set
400# CONFIG_SCSI_QLA22XX is not set
401# CONFIG_SCSI_QLA2300 is not set
402# CONFIG_SCSI_QLA2322 is not set
403# CONFIG_SCSI_QLA6312 is not set
404# CONFIG_SCSI_DC395x is not set
405# CONFIG_SCSI_DC390T is not set
406# CONFIG_SCSI_DEBUG is not set
407
408#
409# Multi-device support (RAID and LVM)
410#
411# CONFIG_MD is not set
412
413#
414# Fusion MPT device support
415#
416CONFIG_FUSION=y
417CONFIG_FUSION_MAX_SGE=40
418# CONFIG_FUSION_CTL is not set
419
420#
421# IEEE 1394 (FireWire) support
422#
423# CONFIG_IEEE1394 is not set
424
425#
426# I2O device support
427#
428# CONFIG_I2O is not set
429
430#
431# Networking support
432#
433CONFIG_NET=y
434
435#
436# Networking options
437#
438CONFIG_PACKET=y
439# CONFIG_PACKET_MMAP is not set
440# CONFIG_NETLINK_DEV is not set
441CONFIG_UNIX=y
442# CONFIG_NET_KEY is not set
443CONFIG_INET=y
444CONFIG_IP_MULTICAST=y
445# CONFIG_IP_ADVANCED_ROUTER is not set
446# CONFIG_IP_PNP is not set
447# CONFIG_NET_IPIP is not set
448# CONFIG_NET_IPGRE is not set
449# CONFIG_IP_MROUTE is not set
450# CONFIG_ARPD is not set
451# CONFIG_SYN_COOKIES is not set
452# CONFIG_INET_AH is not set
453# CONFIG_INET_ESP is not set
454# CONFIG_INET_IPCOMP is not set
455# CONFIG_INET_TUNNEL is not set
456CONFIG_IP_TCPDIAG=y
457CONFIG_IP_TCPDIAG_IPV6=y
458CONFIG_IPV6=y
459# CONFIG_IPV6_PRIVACY is not set
460# CONFIG_INET6_AH is not set
461# CONFIG_INET6_ESP is not set
462# CONFIG_INET6_IPCOMP is not set
463# CONFIG_INET6_TUNNEL is not set
464# CONFIG_IPV6_TUNNEL is not set
465# CONFIG_NETFILTER is not set
466
467#
468# SCTP Configuration (EXPERIMENTAL)
469#
470# CONFIG_IP_SCTP is not set
471# CONFIG_ATM is not set
472# CONFIG_BRIDGE is not set
473# CONFIG_VLAN_8021Q is not set
474# CONFIG_DECNET is not set
475# CONFIG_LLC2 is not set
476# CONFIG_IPX is not set
477# CONFIG_ATALK is not set
478# CONFIG_X25 is not set
479# CONFIG_LAPB is not set
480# CONFIG_NET_DIVERT is not set
481# CONFIG_ECONET is not set
482# CONFIG_WAN_ROUTER is not set
483
484#
485# QoS and/or fair queueing
486#
487# CONFIG_NET_SCHED is not set
488# CONFIG_NET_CLS_ROUTE is not set
489
490#
491# Network testing
492#
493# CONFIG_NET_PKTGEN is not set
494CONFIG_NETPOLL=y
495# CONFIG_NETPOLL_RX is not set
496# CONFIG_NETPOLL_TRAP is not set
497CONFIG_NET_POLL_CONTROLLER=y
498# CONFIG_HAMRADIO is not set
499# CONFIG_IRDA is not set
500# CONFIG_BT is not set
501CONFIG_NETDEVICES=y
502# CONFIG_DUMMY is not set
503# CONFIG_BONDING is not set
504# CONFIG_EQUALIZER is not set
505# CONFIG_TUN is not set
506
507#
508# ARCnet devices
509#
510# CONFIG_ARCNET is not set
511
512#
513# Ethernet (10 or 100Mbit)
514#
515CONFIG_NET_ETHERNET=y
516CONFIG_MII=y
517# CONFIG_HAPPYMEAL is not set
518# CONFIG_SUNGEM is not set
519# CONFIG_NET_VENDOR_3COM is not set
520
521#
522# Tulip family network device support
523#
524# CONFIG_NET_TULIP is not set
525# CONFIG_HP100 is not set
526CONFIG_NET_PCI=y
527# CONFIG_PCNET32 is not set
528CONFIG_AMD8111_ETH=y
529# CONFIG_AMD8111E_NAPI is not set
530# CONFIG_ADAPTEC_STARFIRE is not set
531# CONFIG_B44 is not set
532CONFIG_FORCEDETH=y
533# CONFIG_DGRS is not set
534# CONFIG_EEPRO100 is not set
535# CONFIG_E100 is not set
536# CONFIG_FEALNX is not set
537# CONFIG_NATSEMI is not set
538# CONFIG_NE2K_PCI is not set
539CONFIG_8139CP=m
540CONFIG_8139TOO=y
541# CONFIG_8139TOO_PIO is not set
542# CONFIG_8139TOO_TUNE_TWISTER is not set
543# CONFIG_8139TOO_8129 is not set
544# CONFIG_8139_OLD_RX_RESET is not set
545# CONFIG_SIS900 is not set
546# CONFIG_EPIC100 is not set
547# CONFIG_SUNDANCE is not set
548# CONFIG_VIA_RHINE is not set
549
550#
551# Ethernet (1000 Mbit)
552#
553# CONFIG_ACENIC is not set
554# CONFIG_DL2K is not set
555CONFIG_E1000=y
556# CONFIG_E1000_NAPI is not set
557# CONFIG_NS83820 is not set
558# CONFIG_HAMACHI is not set
559# CONFIG_YELLOWFIN is not set
560# CONFIG_R8169 is not set
561# CONFIG_SK98LIN is not set
562# CONFIG_VIA_VELOCITY is not set
563CONFIG_TIGON3=y
564
565#
566# Ethernet (10000 Mbit)
567#
568# CONFIG_IXGB is not set
569CONFIG_S2IO=m
570# CONFIG_S2IO_NAPI is not set
571# CONFIG_2BUFF_MODE is not set
572
573#
574# Token Ring devices
575#
576# CONFIG_TR is not set
577
578#
579# Wireless LAN (non-hamradio)
580#
581# CONFIG_NET_RADIO is not set
582
583#
584# Wan interfaces
585#
586# CONFIG_WAN is not set
587# CONFIG_FDDI is not set
588# CONFIG_HIPPI is not set
589# CONFIG_PPP is not set
590# CONFIG_SLIP is not set
591# CONFIG_NET_FC is not set
592# CONFIG_SHAPER is not set
593CONFIG_NETCONSOLE=y
594
595#
596# ISDN subsystem
597#
598# CONFIG_ISDN is not set
599
600#
601# Telephony Support
602#
603# CONFIG_PHONE is not set
604
605#
606# Input device support
607#
608CONFIG_INPUT=y
609
610#
611# Userland interfaces
612#
613CONFIG_INPUT_MOUSEDEV=y
614CONFIG_INPUT_MOUSEDEV_PSAUX=y
615CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
616CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
617# CONFIG_INPUT_JOYDEV is not set
618# CONFIG_INPUT_TSDEV is not set
619CONFIG_INPUT_EVDEV=y
620# CONFIG_INPUT_EVBUG is not set
621
622#
623# Input Device Drivers
624#
625CONFIG_INPUT_KEYBOARD=y
626CONFIG_KEYBOARD_ATKBD=y
627# CONFIG_KEYBOARD_SUNKBD is not set
628# CONFIG_KEYBOARD_LKKBD is not set
629# CONFIG_KEYBOARD_XTKBD is not set
630# CONFIG_KEYBOARD_NEWTON is not set
631CONFIG_INPUT_MOUSE=y
632CONFIG_MOUSE_PS2=y
633# CONFIG_MOUSE_SERIAL is not set
634# CONFIG_MOUSE_VSXXXAA is not set
635# CONFIG_INPUT_JOYSTICK is not set
636# CONFIG_INPUT_TOUCHSCREEN is not set
637# CONFIG_INPUT_MISC is not set
638
639#
640# Hardware I/O ports
641#
642CONFIG_SERIO=y
643CONFIG_SERIO_I8042=y
644# CONFIG_SERIO_SERPORT is not set
645# CONFIG_SERIO_CT82C710 is not set
646# CONFIG_SERIO_PCIPS2 is not set
647CONFIG_SERIO_LIBPS2=y
648# CONFIG_SERIO_RAW is not set
649# CONFIG_GAMEPORT is not set
650CONFIG_SOUND_GAMEPORT=y
651
652#
653# Character devices
654#
655CONFIG_VT=y
656CONFIG_VT_CONSOLE=y
657CONFIG_HW_CONSOLE=y
658# CONFIG_SERIAL_NONSTANDARD is not set
659
660#
661# Serial drivers
662#
663CONFIG_SERIAL_8250=y
664CONFIG_SERIAL_8250_CONSOLE=y
665# CONFIG_SERIAL_8250_ACPI is not set
666CONFIG_SERIAL_8250_NR_UARTS=4
667# CONFIG_SERIAL_8250_EXTENDED is not set
668
669#
670# Non-8250 serial port support
671#
672CONFIG_SERIAL_CORE=y
673CONFIG_SERIAL_CORE_CONSOLE=y
674CONFIG_UNIX98_PTYS=y
675CONFIG_LEGACY_PTYS=y
676CONFIG_LEGACY_PTY_COUNT=256
677
678#
679# IPMI
680#
681# CONFIG_IPMI_HANDLER is not set
682
683#
684# Watchdog Cards
685#
686# CONFIG_WATCHDOG is not set
687CONFIG_HW_RANDOM=y
688# CONFIG_NVRAM is not set
689CONFIG_RTC=y
690# CONFIG_DTLK is not set
691# CONFIG_R3964 is not set
692# CONFIG_APPLICOM is not set
693
694#
695# Ftape, the floppy tape device driver
696#
697CONFIG_AGP=y
698CONFIG_AGP_AMD64=y
699# CONFIG_DRM is not set
700# CONFIG_MWAVE is not set
701CONFIG_RAW_DRIVER=y
702CONFIG_HPET=y
703# CONFIG_HPET_RTC_IRQ is not set
704CONFIG_HPET_MMAP=y
705CONFIG_MAX_RAW_DEVS=256
706CONFIG_HANGCHECK_TIMER=y
707
708#
709# TPM devices
710#
711# CONFIG_TCG_TPM is not set
712
713#
714# I2C support
715#
716# CONFIG_I2C is not set
717
718#
719# Dallas's 1-wire bus
720#
721# CONFIG_W1 is not set
722
723#
724# Misc devices
725#
726# CONFIG_IBM_ASM is not set
727
728#
729# Multimedia devices
730#
731# CONFIG_VIDEO_DEV is not set
732
733#
734# Digital Video Broadcasting Devices
735#
736# CONFIG_DVB is not set
737
738#
739# Graphics support
740#
741# CONFIG_FB is not set
742CONFIG_VIDEO_SELECT=y
743
744#
745# Console display driver support
746#
747CONFIG_VGA_CONSOLE=y
748CONFIG_DUMMY_CONSOLE=y
749
750#
751# Sound
752#
753CONFIG_SOUND=y
754
755#
756# Advanced Linux Sound Architecture
757#
758# CONFIG_SND is not set
759
760#
761# Open Sound System
762#
763CONFIG_SOUND_PRIME=y
764# CONFIG_SOUND_BT878 is not set
765# CONFIG_SOUND_CMPCI is not set
766# CONFIG_SOUND_EMU10K1 is not set
767# CONFIG_SOUND_FUSION is not set
768# CONFIG_SOUND_CS4281 is not set
769# CONFIG_SOUND_ES1370 is not set
770# CONFIG_SOUND_ES1371 is not set
771# CONFIG_SOUND_ESSSOLO1 is not set
772# CONFIG_SOUND_MAESTRO is not set
773# CONFIG_SOUND_MAESTRO3 is not set
774CONFIG_SOUND_ICH=y
775# CONFIG_SOUND_SONICVIBES is not set
776# CONFIG_SOUND_TRIDENT is not set
777# CONFIG_SOUND_MSNDCLAS is not set
778# CONFIG_SOUND_MSNDPIN is not set
779# CONFIG_SOUND_VIA82CXXX is not set
780# CONFIG_SOUND_OSS is not set
781# CONFIG_SOUND_ALI5455 is not set
782# CONFIG_SOUND_FORTE is not set
783# CONFIG_SOUND_RME96XX is not set
784# CONFIG_SOUND_AD1980 is not set
785
786#
787# USB support
788#
789CONFIG_USB=y
790# CONFIG_USB_DEBUG is not set
791
792#
793# Miscellaneous USB options
794#
795CONFIG_USB_DEVICEFS=y
796# CONFIG_USB_BANDWIDTH is not set
797# CONFIG_USB_DYNAMIC_MINORS is not set
798# CONFIG_USB_SUSPEND is not set
799# CONFIG_USB_OTG is not set
800CONFIG_USB_ARCH_HAS_HCD=y
801CONFIG_USB_ARCH_HAS_OHCI=y
802
803#
804# USB Host Controller Drivers
805#
806CONFIG_USB_EHCI_HCD=y
807# CONFIG_USB_EHCI_SPLIT_ISO is not set
808# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
809CONFIG_USB_OHCI_HCD=y
810# CONFIG_USB_OHCI_BIG_ENDIAN is not set
811CONFIG_USB_OHCI_LITTLE_ENDIAN=y
812CONFIG_USB_UHCI_HCD=y
813# CONFIG_USB_SL811_HCD is not set
814
815#
816# USB Device Class drivers
817#
818# CONFIG_USB_AUDIO is not set
819# CONFIG_USB_BLUETOOTH_TTY is not set
820# CONFIG_USB_MIDI is not set
821# CONFIG_USB_ACM is not set
822CONFIG_USB_PRINTER=y
823
824#
825# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information
826#
827CONFIG_USB_STORAGE=y
828# CONFIG_USB_STORAGE_DEBUG is not set
829# CONFIG_USB_STORAGE_RW_DETECT is not set
830# CONFIG_USB_STORAGE_DATAFAB is not set
831# CONFIG_USB_STORAGE_FREECOM is not set
832# CONFIG_USB_STORAGE_ISD200 is not set
833# CONFIG_USB_STORAGE_DPCM is not set
834# CONFIG_USB_STORAGE_USBAT is not set
835# CONFIG_USB_STORAGE_SDDR09 is not set
836# CONFIG_USB_STORAGE_SDDR55 is not set
837# CONFIG_USB_STORAGE_JUMPSHOT is not set
838
839#
840# USB Input Devices
841#
842CONFIG_USB_HID=y
843CONFIG_USB_HIDINPUT=y
844# CONFIG_HID_FF is not set
845# CONFIG_USB_HIDDEV is not set
846# CONFIG_USB_AIPTEK is not set
847# CONFIG_USB_WACOM is not set
848# CONFIG_USB_KBTAB is not set
849# CONFIG_USB_POWERMATE is not set
850# CONFIG_USB_MTOUCH is not set
851# CONFIG_USB_EGALAX is not set
852# CONFIG_USB_XPAD is not set
853# CONFIG_USB_ATI_REMOTE is not set
854
855#
856# USB Imaging devices
857#
858# CONFIG_USB_MDC800 is not set
859# CONFIG_USB_MICROTEK is not set
860
861#
862# USB Multimedia devices
863#
864# CONFIG_USB_DABUSB is not set
865
866#
867# Video4Linux support is needed for USB Multimedia device support
868#
869
870#
871# USB Network Adapters
872#
873# CONFIG_USB_CATC is not set
874# CONFIG_USB_KAWETH is not set
875# CONFIG_USB_PEGASUS is not set
876# CONFIG_USB_RTL8150 is not set
877# CONFIG_USB_USBNET is not set
878CONFIG_USB_MON=y
879
880#
881# USB port drivers
882#
883
884#
885# USB Serial Converter support
886#
887# CONFIG_USB_SERIAL is not set
888
889#
890# USB Miscellaneous drivers
891#
892# CONFIG_USB_EMI62 is not set
893# CONFIG_USB_EMI26 is not set
894# CONFIG_USB_AUERSWALD is not set
895# CONFIG_USB_RIO500 is not set
896# CONFIG_USB_LEGOTOWER is not set
897# CONFIG_USB_LCD is not set
898# CONFIG_USB_LED is not set
899# CONFIG_USB_CYTHERM is not set
900# CONFIG_USB_PHIDGETKIT is not set
901# CONFIG_USB_PHIDGETSERVO is not set
902# CONFIG_USB_IDMOUSE is not set
903# CONFIG_USB_SISUSBVGA is not set
904# CONFIG_USB_TEST is not set
905
906#
907# USB ATM/DSL drivers
908#
909
910#
911# USB Gadget Support
912#
913# CONFIG_USB_GADGET is not set
914
915#
916# MMC/SD Card support
917#
918# CONFIG_MMC is not set
919
920#
921# InfiniBand support
922#
923# CONFIG_INFINIBAND is not set
924
925#
926# Firmware Drivers
927#
928# CONFIG_EDD is not set
929
930#
931# File systems
932#
933CONFIG_EXT2_FS=y
934CONFIG_EXT2_FS_XATTR=y
935CONFIG_EXT2_FS_POSIX_ACL=y
936# CONFIG_EXT2_FS_SECURITY is not set
937CONFIG_EXT3_FS=y
938CONFIG_EXT3_FS_XATTR=y
939CONFIG_EXT3_FS_POSIX_ACL=y
940# CONFIG_EXT3_FS_SECURITY is not set
941CONFIG_JBD=y
942# CONFIG_JBD_DEBUG is not set
943CONFIG_FS_MBCACHE=y
944CONFIG_REISERFS_FS=y
945# CONFIG_REISERFS_CHECK is not set
946# CONFIG_REISERFS_PROC_INFO is not set
947CONFIG_REISERFS_FS_XATTR=y
948CONFIG_REISERFS_FS_POSIX_ACL=y
949# CONFIG_REISERFS_FS_SECURITY is not set
950# CONFIG_JFS_FS is not set
951CONFIG_FS_POSIX_ACL=y
952
953#
954# XFS support
955#
956# CONFIG_XFS_FS is not set
957# CONFIG_MINIX_FS is not set
958# CONFIG_ROMFS_FS is not set
959# CONFIG_QUOTA is not set
960CONFIG_DNOTIFY=y
961CONFIG_AUTOFS_FS=y
962# CONFIG_AUTOFS4_FS is not set
963
964#
965# CD-ROM/DVD Filesystems
966#
967CONFIG_ISO9660_FS=y
968# CONFIG_JOLIET is not set
969# CONFIG_ZISOFS is not set
970# CONFIG_UDF_FS is not set
971
972#
973# DOS/FAT/NT Filesystems
974#
975CONFIG_FAT_FS=y
976CONFIG_MSDOS_FS=y
977CONFIG_VFAT_FS=y
978CONFIG_FAT_DEFAULT_CODEPAGE=437
979CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
980# CONFIG_NTFS_FS is not set
981
982#
983# Pseudo filesystems
984#
985CONFIG_PROC_FS=y
986CONFIG_PROC_KCORE=y
987CONFIG_SYSFS=y
988# CONFIG_DEVFS_FS is not set
989# CONFIG_DEVPTS_FS_XATTR is not set
990CONFIG_TMPFS=y
991# CONFIG_TMPFS_XATTR is not set
992CONFIG_HUGETLBFS=y
993CONFIG_HUGETLB_PAGE=y
994CONFIG_RAMFS=y
995
996#
997# Miscellaneous filesystems
998#
999# CONFIG_ADFS_FS is not set
1000# CONFIG_AFFS_FS is not set
1001# CONFIG_HFS_FS is not set
1002# CONFIG_HFSPLUS_FS is not set
1003# CONFIG_BEFS_FS is not set
1004# CONFIG_BFS_FS is not set
1005# CONFIG_EFS_FS is not set
1006# CONFIG_CRAMFS is not set
1007# CONFIG_VXFS_FS is not set
1008# CONFIG_HPFS_FS is not set
1009# CONFIG_QNX4FS_FS is not set
1010# CONFIG_SYSV_FS is not set
1011# CONFIG_UFS_FS is not set
1012
1013#
1014# Network File Systems
1015#
1016CONFIG_NFS_FS=y
1017CONFIG_NFS_V3=y
1018# CONFIG_NFS_V4 is not set
1019# CONFIG_NFS_DIRECTIO is not set
1020CONFIG_NFSD=y
1021CONFIG_NFSD_V3=y
1022# CONFIG_NFSD_V4 is not set
1023CONFIG_NFSD_TCP=y
1024CONFIG_LOCKD=y
1025CONFIG_LOCKD_V4=y
1026CONFIG_EXPORTFS=y
1027CONFIG_SUNRPC=y
1028# CONFIG_RPCSEC_GSS_KRB5 is not set
1029# CONFIG_RPCSEC_GSS_SPKM3 is not set
1030# CONFIG_SMB_FS is not set
1031# CONFIG_CIFS is not set
1032# CONFIG_NCP_FS is not set
1033# CONFIG_CODA_FS is not set
1034# CONFIG_AFS_FS is not set
1035
1036#
1037# Partition Types
1038#
1039# CONFIG_PARTITION_ADVANCED is not set
1040CONFIG_MSDOS_PARTITION=y
1041
1042#
1043# Native Language Support
1044#
1045CONFIG_NLS=y
1046CONFIG_NLS_DEFAULT="iso8859-1"
1047CONFIG_NLS_CODEPAGE_437=y
1048# CONFIG_NLS_CODEPAGE_737 is not set
1049# CONFIG_NLS_CODEPAGE_775 is not set
1050# CONFIG_NLS_CODEPAGE_850 is not set
1051# CONFIG_NLS_CODEPAGE_852 is not set
1052# CONFIG_NLS_CODEPAGE_855 is not set
1053# CONFIG_NLS_CODEPAGE_857 is not set
1054# CONFIG_NLS_CODEPAGE_860 is not set
1055# CONFIG_NLS_CODEPAGE_861 is not set
1056# CONFIG_NLS_CODEPAGE_862 is not set
1057# CONFIG_NLS_CODEPAGE_863 is not set
1058# CONFIG_NLS_CODEPAGE_864 is not set
1059# CONFIG_NLS_CODEPAGE_865 is not set
1060# CONFIG_NLS_CODEPAGE_866 is not set
1061# CONFIG_NLS_CODEPAGE_869 is not set
1062# CONFIG_NLS_CODEPAGE_936 is not set
1063# CONFIG_NLS_CODEPAGE_950 is not set
1064# CONFIG_NLS_CODEPAGE_932 is not set
1065# CONFIG_NLS_CODEPAGE_949 is not set
1066# CONFIG_NLS_CODEPAGE_874 is not set
1067# CONFIG_NLS_ISO8859_8 is not set
1068# CONFIG_NLS_CODEPAGE_1250 is not set
1069# CONFIG_NLS_CODEPAGE_1251 is not set
1070CONFIG_NLS_ASCII=y
1071CONFIG_NLS_ISO8859_1=y
1072# CONFIG_NLS_ISO8859_2 is not set
1073# CONFIG_NLS_ISO8859_3 is not set
1074# CONFIG_NLS_ISO8859_4 is not set
1075# CONFIG_NLS_ISO8859_5 is not set
1076# CONFIG_NLS_ISO8859_6 is not set
1077# CONFIG_NLS_ISO8859_7 is not set
1078# CONFIG_NLS_ISO8859_9 is not set
1079# CONFIG_NLS_ISO8859_13 is not set
1080# CONFIG_NLS_ISO8859_14 is not set
1081CONFIG_NLS_ISO8859_15=y
1082# CONFIG_NLS_KOI8_R is not set
1083# CONFIG_NLS_KOI8_U is not set
1084CONFIG_NLS_UTF8=y
1085
1086#
1087# Profiling support
1088#
1089CONFIG_PROFILING=y
1090CONFIG_OPROFILE=y
1091
1092#
1093# Kernel hacking
1094#
1095CONFIG_DEBUG_KERNEL=y
1096CONFIG_MAGIC_SYSRQ=y
1097# CONFIG_PRINTK_TIME is not set
1098# CONFIG_SCHEDSTATS is not set
1099# CONFIG_DEBUG_SLAB is not set
1100# CONFIG_DEBUG_SPINLOCK is not set
1101# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
1102# CONFIG_DEBUG_KOBJECT is not set
1103# CONFIG_DEBUG_INFO is not set
1104CONFIG_DEBUG_FS=y
1105CONFIG_INIT_DEBUG=y
1106# CONFIG_IOMMU_DEBUG is not set
1107CONFIG_KPROBES=y
1108
1109#
1110# Security options
1111#
1112# CONFIG_KEYS is not set
1113# CONFIG_SECURITY is not set
1114
1115#
1116# Cryptographic options
1117#
1118# CONFIG_CRYPTO is not set
1119
1120#
1121# Hardware crypto devices
1122#
1123
1124#
1125# Library routines
1126#
1127# CONFIG_CRC_CCITT is not set
1128CONFIG_CRC32=y
1129# CONFIG_LIBCRC32C is not set
diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile
new file mode 100644
index 000000000000..a12b19da4b59
--- /dev/null
+++ b/arch/x86_64/ia32/Makefile
@@ -0,0 +1,32 @@
1#
2# Makefile for the ia32 kernel emulation subsystem.
3#
4
5obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_ioctl.o \
6 ia32_signal.o tls32.o \
7 ia32_binfmt.o fpu32.o ptrace32.o syscall32.o
8
9sysv-$(CONFIG_SYSVIPC) := ipc32.o
10obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
11
12obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
13
14$(obj)/syscall32.o: $(src)/syscall32.c \
15 $(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so)
16
17# Teach kbuild about targets
18targets := $(foreach F,sysenter syscall,vsyscall-$F.o vsyscall-$F.so)
19
20# The DSO images are built using a special linker script
21quiet_cmd_syscall = SYSCALL $@
22 cmd_syscall = $(CC) -m32 -nostdlib -shared -s \
23 -Wl,-soname=linux-gate.so.1 -o $@ \
24 -Wl,-T,$(filter-out FORCE,$^)
25
26$(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \
27$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
28 $(call if_changed,syscall)
29
30AFLAGS_vsyscall-sysenter.o = -m32
31AFLAGS_vsyscall-syscall.o = -m32
32CFLAGS_ia32_ioctl.o += -Ifs/
diff --git a/arch/x86_64/ia32/fpu32.c b/arch/x86_64/ia32/fpu32.c
new file mode 100644
index 000000000000..1c23095f1813
--- /dev/null
+++ b/arch/x86_64/ia32/fpu32.c
@@ -0,0 +1,184 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes.
4 * This is used for ptrace, signals and coredumps in 32bit emulation.
5 * $Id: fpu32.c,v 1.1 2002/03/21 14:16:32 ak Exp $
6 */
7
8#include <linux/sched.h>
9#include <asm/sigcontext32.h>
10#include <asm/processor.h>
11#include <asm/uaccess.h>
12#include <asm/i387.h>
13
14static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
15{
16 unsigned int tmp; /* to avoid 16 bit prefixes in the code */
17
18 /* Transform each pair of bits into 01 (valid) or 00 (empty) */
19 tmp = ~twd;
20 tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
21 /* and move the valid bits to the lower byte. */
22 tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
23 tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
24 tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
25 return tmp;
26}
27
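/*
 * Worked examples for the transform above: each 2-bit i387 tag (00 valid,
 * 01 zero, 10 special, 11 empty) collapses into one FXSR "occupied" bit:
 *
 *	twd_i387_to_fxsr(0xffff) == 0x00	(all registers empty)
 *	twd_i387_to_fxsr(0x0000) == 0xff	(all registers valid)
 */
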
28static inline unsigned long twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
29{
30 struct _fpxreg *st = NULL;
31 unsigned long tos = (fxsave->swd >> 11) & 7;
32 unsigned long twd = (unsigned long) fxsave->twd;
33 unsigned long tag;
34 unsigned long ret = 0xffff0000;
35 int i;
36
37#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16)
38
39 for (i = 0 ; i < 8 ; i++) {
40 if (twd & 0x1) {
41 st = FPREG_ADDR( fxsave, (i - tos) & 7 );
42
43 switch (st->exponent & 0x7fff) {
44 case 0x7fff:
45 tag = 2; /* Special */
46 break;
47 case 0x0000:
48 if ( !st->significand[0] &&
49 !st->significand[1] &&
50 !st->significand[2] &&
51 !st->significand[3] ) {
52 tag = 1; /* Zero */
53 } else {
54 tag = 2; /* Special */
55 }
56 break;
57 default:
58 if (st->significand[3] & 0x8000) {
59 tag = 0; /* Valid */
60 } else {
61 tag = 2; /* Special */
62 }
63 break;
64 }
65 } else {
66 tag = 3; /* Empty */
67 }
68 ret |= (tag << (2 * i));
69 twd = twd >> 1;
70 }
71 return ret;
72}
73
74
75static inline int convert_fxsr_from_user(struct i387_fxsave_struct *fxsave,
76 struct _fpstate_ia32 __user *buf)
77{
78 struct _fpxreg *to;
79 struct _fpreg __user *from;
80 int i;
81 u32 v;
82 int err = 0;
83
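	/*
	 * The first seven u32 slots of the ia32 _fpstate buffer carry the
	 * classic 32-bit FP environment: 0 control word, 1 status word,
	 * 2 tag word, 3 instruction pointer, 4 cs selector (low 16 bits,
	 * opcode above), 5 operand offset, 6 operand selector.
	 */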
84#define G(num,val) err |= __get_user(val, num + (u32 __user *)buf)
85 G(0, fxsave->cwd);
86 G(1, fxsave->swd);
87 G(2, fxsave->twd);
88 fxsave->twd = twd_i387_to_fxsr(fxsave->twd);
89 G(3, fxsave->rip);
90 G(4, v);
91 fxsave->fop = v>>16; /* cs ignored */
92 G(5, fxsave->rdp);
93 /* 6: ds ignored */
94#undef G
95 if (err)
96 return -1;
97
98 to = (struct _fpxreg *)&fxsave->st_space[0];
99 from = &buf->_st[0];
100 for (i = 0 ; i < 8 ; i++, to++, from++) {
101 if (__copy_from_user(to, from, sizeof(*from)))
102 return -1;
103 }
104 return 0;
105}
106
107
108static inline int convert_fxsr_to_user(struct _fpstate_ia32 __user *buf,
109 struct i387_fxsave_struct *fxsave,
110 struct pt_regs *regs,
111 struct task_struct *tsk)
112{
113 struct _fpreg __user *to;
114 struct _fpxreg *from;
115 int i;
116 u16 cs,ds;
117 int err = 0;
118
119 if (tsk == current) {
120		/* should actually be ds/cs at fpu exception time,
121		   but that information is not available in 64bit mode. */
122 asm("movw %%ds,%0 " : "=r" (ds));
123 asm("movw %%cs,%0 " : "=r" (cs));
124 } else { /* ptrace. task has stopped. */
125 ds = tsk->thread.ds;
126 cs = regs->cs;
127 }
128
129#define P(num,val) err |= __put_user(val, num + (u32 __user *)buf)
130 P(0, (u32)fxsave->cwd | 0xffff0000);
131 P(1, (u32)fxsave->swd | 0xffff0000);
132 P(2, twd_fxsr_to_i387(fxsave));
133 P(3, (u32)fxsave->rip);
134 P(4, cs | ((u32)fxsave->fop) << 16);
135 P(5, fxsave->rdp);
136 P(6, 0xffff0000 | ds);
137#undef P
138
139 if (err)
140 return -1;
141
142 to = &buf->_st[0];
143 from = (struct _fpxreg *) &fxsave->st_space[0];
144 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
145 if (__copy_to_user(to, from, sizeof(*to)))
146 return -1;
147 }
148 return 0;
149}
150
151int restore_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, int fsave)
152{
153 clear_fpu(tsk);
154 if (!fsave) {
155 if (__copy_from_user(&tsk->thread.i387.fxsave,
156 &buf->_fxsr_env[0],
157 sizeof(struct i387_fxsave_struct)))
158 return -1;
159 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
160 set_stopped_child_used_math(tsk);
161 }
162 return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf);
163}
164
165int save_i387_ia32(struct task_struct *tsk,
166 struct _fpstate_ia32 __user *buf,
167 struct pt_regs *regs,
168 int fsave)
169{
170 int err = 0;
171
172 init_fpu(tsk);
173 if (convert_fxsr_to_user(buf, &tsk->thread.i387.fxsave, regs, tsk))
174 return -1;
175 if (fsave)
176 return 0;
177 err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status);
178 if (fsave)
179 return err ? -1 : 1;
180 err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
181 err |= __copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
182 sizeof(struct i387_fxsave_struct));
183 return err ? -1 : 1;
184}
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
new file mode 100644
index 000000000000..1965efc974dc
--- /dev/null
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -0,0 +1,529 @@
1/*
2 * a.out loader for x86-64
3 *
4 * Copyright (C) 1991, 1992, 1996 Linus Torvalds
5 * Hacked together by Andi Kleen
6 */
7
8#include <linux/module.h>
9
10#include <linux/time.h>
11#include <linux/kernel.h>
12#include <linux/mm.h>
13#include <linux/mman.h>
14#include <linux/a.out.h>
15#include <linux/errno.h>
16#include <linux/signal.h>
17#include <linux/string.h>
18#include <linux/fs.h>
19#include <linux/file.h>
20#include <linux/stat.h>
21#include <linux/fcntl.h>
22#include <linux/ptrace.h>
23#include <linux/user.h>
24#include <linux/slab.h>
25#include <linux/binfmts.h>
26#include <linux/personality.h>
27#include <linux/init.h>
28
29#include <asm/system.h>
30#include <asm/uaccess.h>
31#include <asm/pgalloc.h>
32#include <asm/cacheflush.h>
33#include <asm/user32.h>
34#include <asm/ia32.h>
35
36#undef WARN_OLD
37#undef CORE_DUMP /* probably broken */
38
39extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
40 unsigned long stack_top, int exec_stack);
41
42static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
43static int load_aout_library(struct file*);
44
45#if CORE_DUMP
46static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file);
47
48/*
49 * fill in the user structure for a core dump..
50 */
51static void dump_thread32(struct pt_regs * regs, struct user32 * dump)
52{
53 u32 fs,gs;
54
55/* changed the size calculations - should hopefully work better. lbt */
56 dump->magic = CMAGIC;
57 dump->start_code = 0;
58 dump->start_stack = regs->rsp & ~(PAGE_SIZE - 1);
59 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
60 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
61 dump->u_dsize -= dump->u_tsize;
62 dump->u_ssize = 0;
63 dump->u_debugreg[0] = current->thread.debugreg0;
64 dump->u_debugreg[1] = current->thread.debugreg1;
65 dump->u_debugreg[2] = current->thread.debugreg2;
66 dump->u_debugreg[3] = current->thread.debugreg3;
67 dump->u_debugreg[4] = 0;
68 dump->u_debugreg[5] = 0;
69 dump->u_debugreg[6] = current->thread.debugreg6;
70 dump->u_debugreg[7] = current->thread.debugreg7;
71
72 if (dump->start_stack < 0xc0000000)
73 dump->u_ssize = ((unsigned long) (0xc0000000 - dump->start_stack)) >> PAGE_SHIFT;
74
75 dump->regs.ebx = regs->rbx;
76 dump->regs.ecx = regs->rcx;
77 dump->regs.edx = regs->rdx;
78 dump->regs.esi = regs->rsi;
79 dump->regs.edi = regs->rdi;
80 dump->regs.ebp = regs->rbp;
81 dump->regs.eax = regs->rax;
82 dump->regs.ds = current->thread.ds;
83 dump->regs.es = current->thread.es;
84 asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs;
85 asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs;
86 dump->regs.orig_eax = regs->orig_rax;
87 dump->regs.eip = regs->rip;
88 dump->regs.cs = regs->cs;
89 dump->regs.eflags = regs->eflags;
90 dump->regs.esp = regs->rsp;
91 dump->regs.ss = regs->ss;
92
93#if 1 /* FIXME */
94 dump->u_fpvalid = 0;
95#else
96 dump->u_fpvalid = dump_fpu (regs, &dump->i387);
97#endif
98}
99
100#endif
101
102static struct linux_binfmt aout_format = {
103 .module = THIS_MODULE,
104 .load_binary = load_aout_binary,
105 .load_shlib = load_aout_library,
106#if CORE_DUMP
107 .core_dump = aout_core_dump,
108#endif
109 .min_coredump = PAGE_SIZE
110};
111
112static void set_brk(unsigned long start, unsigned long end)
113{
114 start = PAGE_ALIGN(start);
115 end = PAGE_ALIGN(end);
116 if (end <= start)
117 return;
118 down_write(&current->mm->mmap_sem);
119 do_brk(start, end - start);
120 up_write(&current->mm->mmap_sem);
121}
122
123#if CORE_DUMP
124/*
125 * These are the only things you should do on a core-file: use only these
126 * macros to write out all the necessary info.
127 */
128
129static int dump_write(struct file *file, const void *addr, int nr)
130{
131 return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
132}
133
134#define DUMP_WRITE(addr, nr) \
135 if (!dump_write(file, (void *)(addr), (nr))) \
136 goto end_coredump;
137
138#define DUMP_SEEK(offset) \
139if (file->f_op->llseek) { \
140 if (file->f_op->llseek(file,(offset),0) != (offset)) \
141 goto end_coredump; \
142} else file->f_pos = (offset)
143
144/*
145 * Routine writes a core dump image in the current directory.
146 * Currently only a stub-function.
147 *
148 * Note that setuid/setgid files won't make a core-dump if the uid/gid
149 * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
150 * field, which also makes sure the core-dumps won't be recursive if the
151 * dumping of the process results in another error..
152 */
153
154static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file)
155{
156 mm_segment_t fs;
157 int has_dumped = 0;
158 unsigned long dump_start, dump_size;
159 struct user32 dump;
160# define START_DATA(u) (u.u_tsize << PAGE_SHIFT)
161# define START_STACK(u) (u.start_stack)
162
163 fs = get_fs();
164 set_fs(KERNEL_DS);
165 has_dumped = 1;
166 current->flags |= PF_DUMPCORE;
167 strncpy(dump.u_comm, current->comm, sizeof(current->comm));
168 dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - ((unsigned long)(&dump)));
169 dump.signal = signr;
170 dump_thread32(regs, &dump);
171
172/* If the size of the dump file exceeds the rlimit, then see what would happen
173 if we wrote the stack, but not the data area. */
174 if ((dump.u_dsize+dump.u_ssize+1) * PAGE_SIZE >
175 current->signal->rlim[RLIMIT_CORE].rlim_cur)
176 dump.u_dsize = 0;
177
178/* Make sure we have enough room to write the stack and data areas. */
179 if ((dump.u_ssize+1) * PAGE_SIZE >
180 current->signal->rlim[RLIMIT_CORE].rlim_cur)
181 dump.u_ssize = 0;
182
183/* make sure we actually have a data and stack area to dump */
184 set_fs(USER_DS);
185 if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
186 dump.u_dsize = 0;
187 if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
188 dump.u_ssize = 0;
189
190 set_fs(KERNEL_DS);
191/* struct user */
192 DUMP_WRITE(&dump,sizeof(dump));
193/* Now dump all of the user data. Include malloced stuff as well */
194 DUMP_SEEK(PAGE_SIZE);
195/* now we start writing out the user space info */
196 set_fs(USER_DS);
197/* Dump the data area */
198 if (dump.u_dsize != 0) {
199 dump_start = START_DATA(dump);
200 dump_size = dump.u_dsize << PAGE_SHIFT;
201 DUMP_WRITE(dump_start,dump_size);
202 }
203/* Now prepare to dump the stack area */
204 if (dump.u_ssize != 0) {
205 dump_start = START_STACK(dump);
206 dump_size = dump.u_ssize << PAGE_SHIFT;
207 DUMP_WRITE(dump_start,dump_size);
208 }
209/* Finally dump the task struct. Not used by gdb, but could be useful */
210 set_fs(KERNEL_DS);
211 DUMP_WRITE(current,sizeof(*current));
212end_coredump:
213 set_fs(fs);
214 return has_dumped;
215}
216#endif
217
218/*
219 * create_aout_tables() parses the env- and arg-strings in new user
220 * memory and creates the pointer tables from them, and puts their
221 * addresses on the "stack", returning the new stack pointer value.
222 */
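/*
 * A sketch of the resulting layout, low to high addresses, as inferred
 * from the code below (each slot is a 32-bit word):
 *
 *	sp ->	argc
 *		pointer to argv[0]
 *		pointer to envp[0]
 *		argv[0] .. argv[argc-1], NULL
 *		envp[0] .. envp[envc-1], NULL
 *	p  ->	the argument and environment strings themselves
 */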
223static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
224{
225 u32 __user *argv;
226 u32 __user *envp;
227 u32 __user *sp;
228 int argc = bprm->argc;
229 int envc = bprm->envc;
230
231 sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p);
232 sp -= envc+1;
233 envp = sp;
234 sp -= argc+1;
235 argv = sp;
236 put_user((unsigned long) envp,--sp);
237 put_user((unsigned long) argv,--sp);
238 put_user(argc,--sp);
239 current->mm->arg_start = (unsigned long) p;
240 while (argc-->0) {
241 char c;
242 put_user((u32)(unsigned long)p,argv++);
243 do {
244 get_user(c,p++);
245 } while (c);
246 }
247 put_user(NULL,argv);
248 current->mm->arg_end = current->mm->env_start = (unsigned long) p;
249 while (envc-->0) {
250 char c;
251 put_user((u32)(unsigned long)p,envp++);
252 do {
253 get_user(c,p++);
254 } while (c);
255 }
256 put_user(NULL,envp);
257 current->mm->env_end = (unsigned long) p;
258 return sp;
259}
260
261/*
262 * These are the functions used to load a.out style executables and shared
263 * libraries. There is no binary dependent code anywhere else.
264 */
265
266static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
267{
268 struct exec ex;
269 unsigned long error;
270 unsigned long fd_offset;
271 unsigned long rlim;
272 int retval;
273
274 ex = *((struct exec *) bprm->buf); /* exec-header */
275 if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
276 N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
277 N_TRSIZE(ex) || N_DRSIZE(ex) ||
278 i_size_read(bprm->file->f_dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
279 return -ENOEXEC;
280 }
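	/* (OMAGIC 0407, NMAGIC 0410, ZMAGIC 0413 and QMAGIC 0314 are the
	   classic a.out flavours; anything else is rejected above.) */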
281
282 fd_offset = N_TXTOFF(ex);
283
284 /* Check initial limits. This avoids letting people circumvent
285 * size limits imposed on them by creating programs with large
286 * arrays in the data or bss.
287 */
288 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
289 if (rlim >= RLIM_INFINITY)
290 rlim = ~0;
291 if (ex.a_data + ex.a_bss > rlim)
292 return -ENOMEM;
293
294 /* Flush all traces of the currently running executable */
295 retval = flush_old_exec(bprm);
296 if (retval)
297 return retval;
298
299 regs->cs = __USER32_CS;
300 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
301 regs->r13 = regs->r14 = regs->r15 = 0;
302
303 /* OK, This is the point of no return */
304 set_personality(PER_LINUX);
305 set_thread_flag(TIF_IA32);
306 clear_thread_flag(TIF_ABI_PENDING);
307
308 current->mm->end_code = ex.a_text +
309 (current->mm->start_code = N_TXTADDR(ex));
310 current->mm->end_data = ex.a_data +
311 (current->mm->start_data = N_DATADDR(ex));
312 current->mm->brk = ex.a_bss +
313 (current->mm->start_brk = N_BSSADDR(ex));
314 current->mm->free_area_cache = TASK_UNMAPPED_BASE;
315
316 set_mm_counter(current->mm, rss, 0);
317 current->mm->mmap = NULL;
318 compute_creds(bprm);
319 current->flags &= ~PF_FORKNOEXEC;
320
321 if (N_MAGIC(ex) == OMAGIC) {
322 unsigned long text_addr, map_size;
323 loff_t pos;
324
325 text_addr = N_TXTADDR(ex);
326
327 pos = 32;
328 map_size = ex.a_text+ex.a_data;
329
330 down_write(&current->mm->mmap_sem);
331 error = do_brk(text_addr & PAGE_MASK, map_size);
332 up_write(&current->mm->mmap_sem);
333
334 if (error != (text_addr & PAGE_MASK)) {
335 send_sig(SIGKILL, current, 0);
336 return error;
337 }
338
339 error = bprm->file->f_op->read(bprm->file, (char *)text_addr,
340 ex.a_text+ex.a_data, &pos);
341 if ((signed long)error < 0) {
342 send_sig(SIGKILL, current, 0);
343 return error;
344 }
345
346 flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data);
347 } else {
348#ifdef WARN_OLD
349 static unsigned long error_time, error_time2;
350 if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
351 (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ)
352 {
353 printk(KERN_NOTICE "executable not page aligned\n");
354 error_time2 = jiffies;
355 }
356
357 if ((fd_offset & ~PAGE_MASK) != 0 &&
358 (jiffies-error_time) > 5*HZ)
359 {
360 printk(KERN_WARNING
361 "fd_offset is not page aligned. Please convert program: %s\n",
362 bprm->file->f_dentry->d_name.name);
363 error_time = jiffies;
364 }
365#endif
366
367 if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
368 loff_t pos = fd_offset;
369 down_write(&current->mm->mmap_sem);
370 do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
371 up_write(&current->mm->mmap_sem);
372 bprm->file->f_op->read(bprm->file,(char *)N_TXTADDR(ex),
373 ex.a_text+ex.a_data, &pos);
374 flush_icache_range((unsigned long) N_TXTADDR(ex),
375 (unsigned long) N_TXTADDR(ex) +
376 ex.a_text+ex.a_data);
377 goto beyond_if;
378 }
379
380 down_write(&current->mm->mmap_sem);
381 error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
382 PROT_READ | PROT_EXEC,
383 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT,
384 fd_offset);
385 up_write(&current->mm->mmap_sem);
386
387 if (error != N_TXTADDR(ex)) {
388 send_sig(SIGKILL, current, 0);
389 return error;
390 }
391
392 down_write(&current->mm->mmap_sem);
393 error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
394 PROT_READ | PROT_WRITE | PROT_EXEC,
395 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT,
396 fd_offset + ex.a_text);
397 up_write(&current->mm->mmap_sem);
398 if (error != N_DATADDR(ex)) {
399 send_sig(SIGKILL, current, 0);
400 return error;
401 }
402 }
403beyond_if:
404 set_binfmt(&aout_format);
405
406 set_brk(current->mm->start_brk, current->mm->brk);
407
408 retval = ia32_setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
409 if (retval < 0) {
410 /* FIXME: is this error path sufficient? */
411 send_sig(SIGKILL, current, 0);
412 return retval;
413 }
414
415 current->mm->start_stack =
416 (unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
417 /* start thread */
418 asm volatile("movl %0,%%fs" :: "r" (0));
419 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS));
420 load_gs_index(0);
421 (regs)->rip = ex.a_entry;
422 (regs)->rsp = current->mm->start_stack;
423 (regs)->eflags = 0x200;
424 (regs)->cs = __USER32_CS;
425 (regs)->ss = __USER32_DS;
426 set_fs(USER_DS);
427 if (unlikely(current->ptrace & PT_PTRACED)) {
428 if (current->ptrace & PT_TRACE_EXEC)
429 ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
430 else
431 send_sig(SIGTRAP, current, 0);
432 }
433 return 0;
434}
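[Editor's note] The header check at the top of load_aout_binary() is the whole of a.out format detection. A minimal userspace sketch of the same test, assuming a toolchain that still ships <a.out.h> with the N_MAGIC/ZMAGIC family of macros:

	#include <a.out.h>
	#include <stdio.h>

	int main(int argc, char **argv)
	{
		struct exec ex;
		FILE *f;

		if (argc < 2) {
			fprintf(stderr, "usage: %s file\n", argv[0]);
			return 1;
		}
		f = fopen(argv[1], "rb");
		if (!f || fread(&ex, sizeof(ex), 1, f) != 1) {
			perror(argv[1]);
			return 1;
		}
		/* same magic test the loader applies before anything else */
		if (N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
		    N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC)
			printf("%s: not a.out (the loader returns -ENOEXEC)\n", argv[1]);
		else
			printf("%s: a.out, text=%u data=%u bss=%u\n",
			       argv[1], ex.a_text, ex.a_data, ex.a_bss);
		fclose(f);
		return 0;
	}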
435
436static int load_aout_library(struct file *file)
437{
438 struct inode * inode;
439 unsigned long bss, start_addr, len;
440 unsigned long error;
441 int retval;
442 struct exec ex;
443
444 inode = file->f_dentry->d_inode;
445
446 retval = -ENOEXEC;
447 error = kernel_read(file, 0, (char *) &ex, sizeof(ex));
448 if (error != sizeof(ex))
449 goto out;
450
451 /* We come in here for the regular a.out style of shared libraries */
452 if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
453 N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
454 i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
455 goto out;
456 }
457
458 if (N_FLAGS(ex))
459 goto out;
460
461 /* For QMAGIC, the starting address is 0x20 into the page. Mask
462 that off to get the page-aligned start address. */
463
464 start_addr = ex.a_entry & 0xfffff000;
465
466 if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
467 loff_t pos = N_TXTOFF(ex);
468
469#ifdef WARN_OLD
470 static unsigned long error_time;
471 if ((jiffies-error_time) > 5*HZ)
472 {
473 printk(KERN_WARNING
474 "N_TXTOFF is not page aligned. Please convert library: %s\n",
475 file->f_dentry->d_name.name);
476 error_time = jiffies;
477 }
478#endif
479 down_write(&current->mm->mmap_sem);
480 do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
481 up_write(&current->mm->mmap_sem);
482
483 file->f_op->read(file, (char *)start_addr,
484 ex.a_text + ex.a_data, &pos);
485 flush_icache_range((unsigned long) start_addr,
486 (unsigned long) start_addr + ex.a_text + ex.a_data);
487
488 retval = 0;
489 goto out;
490 }
491 /* Now use mmap to map the library into memory. */
492 down_write(&current->mm->mmap_sem);
493 error = do_mmap(file, start_addr, ex.a_text + ex.a_data,
494 PROT_READ | PROT_WRITE | PROT_EXEC,
495 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT,
496 N_TXTOFF(ex));
497 up_write(&current->mm->mmap_sem);
498 retval = error;
499 if (error != start_addr)
500 goto out;
501
502 len = PAGE_ALIGN(ex.a_text + ex.a_data);
503 bss = ex.a_text + ex.a_data + ex.a_bss;
504 if (bss > len) {
505 down_write(&current->mm->mmap_sem);
506 error = do_brk(start_addr + len, bss - len);
507 up_write(&current->mm->mmap_sem);
508 retval = error;
509 if (error != start_addr + len)
510 goto out;
511 }
512 retval = 0;
513out:
514 return retval;
515}
516
517static int __init init_aout_binfmt(void)
518{
519 return register_binfmt(&aout_format);
520}
521
522static void __exit exit_aout_binfmt(void)
523{
524 unregister_binfmt(&aout_format);
525}
526
527module_init(init_aout_binfmt);
528module_exit(exit_aout_binfmt);
529MODULE_LICENSE("GPL");
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
new file mode 100644
index 000000000000..93d568dfa762
--- /dev/null
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -0,0 +1,434 @@
1/*
2 * Written 2000,2002 by Andi Kleen.
3 *
4 * Loosely based on the sparc64 and IA64 32bit emulation loaders.
5 * This tricks binfmt_elf.c into loading 32bit binaries using lots
6 * of ugly preprocessor tricks. Talk about very very poor man's inheritance.
7 */
8#include <linux/types.h>
9#include <linux/config.h>
10#include <linux/stddef.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h>
13#include <linux/compat.h>
14#include <linux/string.h>
15#include <linux/binfmts.h>
16#include <linux/mm.h>
17#include <linux/security.h>
18
19#include <asm/segment.h>
20#include <asm/ptrace.h>
21#include <asm/processor.h>
22#include <asm/user32.h>
23#include <asm/sigcontext32.h>
24#include <asm/fpu32.h>
25#include <asm/i387.h>
26#include <asm/uaccess.h>
27#include <asm/ia32.h>
28#include <asm/vsyscall32.h>
29
30#define ELF_NAME "elf/i386"
31
32#define AT_SYSINFO 32
33#define AT_SYSINFO_EHDR 33
34
35int sysctl_vsyscall32 = 1;
36
37#define ARCH_DLINFO do { \
38 if (sysctl_vsyscall32) { \
39 NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \
40 NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL32_BASE); \
41 } \
42} while(0)
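[Editor's note] ARCH_DLINFO is what makes the vsyscall page discoverable: the two auxv entries land in the new process's auxiliary vector. A hedged sketch of how a process could read them back; getauxval() is a glibc 2.16+ convenience, not something this code assumes, and it must run as a 32-bit binary to see a nonzero AT_SYSINFO:

	#include <sys/auxv.h>
	#include <stdio.h>

	int main(void)
	{
		/* both return 0 when the kernel did not install the entry */
		unsigned long vsyscall = getauxval(AT_SYSINFO);
		unsigned long ehdr = getauxval(AT_SYSINFO_EHDR);

		printf("AT_SYSINFO      = %#lx\n", vsyscall);
		printf("AT_SYSINFO_EHDR = %#lx\n", ehdr);
		return 0;
	}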
43
44struct file;
45struct elf_phdr;
46
47#define IA32_EMULATOR 1
48
49#define ELF_ET_DYN_BASE (TASK_UNMAPPED_32 + 0x1000000)
50
51#undef ELF_ARCH
52#define ELF_ARCH EM_386
53
54#undef ELF_CLASS
55#define ELF_CLASS ELFCLASS32
56
57#define ELF_DATA ELFDATA2LSB
58
59#define USE_ELF_CORE_DUMP 1
60
61/* Overwrite elfcore.h */
62#define _LINUX_ELFCORE_H 1
63typedef unsigned int elf_greg_t;
64
65#define ELF_NGREG (sizeof (struct user_regs_struct32) / sizeof(elf_greg_t))
66typedef elf_greg_t elf_gregset_t[ELF_NGREG];
67
68/*
69 * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out
70 * extra segments containing the vsyscall DSO contents. Dumping its
71 * contents makes a post-mortem dump fully interpretable later, without
72 * having to match up the same kernel and hardware config to see what the
73 * PC values meant. Dumping its extra ELF program headers also gives a
74 * debugger all the information it needs to see how the vsyscall DSO was used.
75 */
76#define ELF_CORE_EXTRA_PHDRS (VSYSCALL32_EHDR->e_phnum)
77#define ELF_CORE_WRITE_EXTRA_PHDRS \
78do { \
79 const struct elf32_phdr *const vsyscall_phdrs = \
80 (const struct elf32_phdr *) (VSYSCALL32_BASE \
81 + VSYSCALL32_EHDR->e_phoff); \
82 int i; \
83 Elf32_Off ofs = 0; \
84 for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) { \
85 struct elf32_phdr phdr = vsyscall_phdrs[i]; \
86 if (phdr.p_type == PT_LOAD) { \
87 BUG_ON(ofs != 0); \
88 ofs = phdr.p_offset = offset; \
89 phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); \
90 phdr.p_filesz = phdr.p_memsz; \
91 offset += phdr.p_filesz; \
92 } \
93 else \
94 phdr.p_offset += ofs; \
95 phdr.p_paddr = 0; /* match other core phdrs */ \
96 DUMP_WRITE(&phdr, sizeof(phdr)); \
97 } \
98} while (0)
99#define ELF_CORE_WRITE_EXTRA_DATA \
100do { \
101 const struct elf32_phdr *const vsyscall_phdrs = \
102 (const struct elf32_phdr *) (VSYSCALL32_BASE \
103 + VSYSCALL32_EHDR->e_phoff); \
104 int i; \
105 for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) { \
106 if (vsyscall_phdrs[i].p_type == PT_LOAD) \
107 DUMP_WRITE((void *) (u64) vsyscall_phdrs[i].p_vaddr, \
108 PAGE_ALIGN(vsyscall_phdrs[i].p_memsz)); \
109 } \
110} while (0)
111
112struct elf_siginfo
113{
114 int si_signo; /* signal number */
115 int si_code; /* extra code */
116 int si_errno; /* errno */
117};
118
119#define jiffies_to_timeval(a,b) do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; }while(0)
120
121struct elf_prstatus
122{
123 struct elf_siginfo pr_info; /* Info associated with signal */
124 short pr_cursig; /* Current signal */
125 unsigned int pr_sigpend; /* Set of pending signals */
126 unsigned int pr_sighold; /* Set of held signals */
127 pid_t pr_pid;
128 pid_t pr_ppid;
129 pid_t pr_pgrp;
130 pid_t pr_sid;
131 struct compat_timeval pr_utime; /* User time */
132 struct compat_timeval pr_stime; /* System time */
133 struct compat_timeval pr_cutime; /* Cumulative user time */
134 struct compat_timeval pr_cstime; /* Cumulative system time */
135 elf_gregset_t pr_reg; /* GP registers */
136 int pr_fpvalid; /* True if math co-processor being used. */
137};
138
139#define ELF_PRARGSZ (80) /* Number of chars for args */
140
141struct elf_prpsinfo
142{
143 char pr_state; /* numeric process state */
144 char pr_sname; /* char for pr_state */
145 char pr_zomb; /* zombie */
146 char pr_nice; /* nice val */
147 unsigned int pr_flag; /* flags */
148 __u16 pr_uid;
149 __u16 pr_gid;
150 pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid;
151 /* Lots missing */
152 char pr_fname[16]; /* filename of executable */
153 char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */
154};
155
156#define __STR(x) #x
157#define STR(x) __STR(x)
158
159#define _GET_SEG(x) \
160 ({ __u32 seg; asm("movl %%" STR(x) ",%0" : "=r"(seg)); seg; })
161
162/* Assumes current==process to be dumped */
163#define ELF_CORE_COPY_REGS(pr_reg, regs) \
164 pr_reg[0] = regs->rbx; \
165 pr_reg[1] = regs->rcx; \
166 pr_reg[2] = regs->rdx; \
167 pr_reg[3] = regs->rsi; \
168 pr_reg[4] = regs->rdi; \
169 pr_reg[5] = regs->rbp; \
170 pr_reg[6] = regs->rax; \
171 pr_reg[7] = _GET_SEG(ds); \
172 pr_reg[8] = _GET_SEG(es); \
173 pr_reg[9] = _GET_SEG(fs); \
174 pr_reg[10] = _GET_SEG(gs); \
175 pr_reg[11] = regs->orig_rax; \
176 pr_reg[12] = regs->rip; \
177 pr_reg[13] = regs->cs; \
178 pr_reg[14] = regs->eflags; \
179 pr_reg[15] = regs->rsp; \
180 pr_reg[16] = regs->ss;
181
182#define user user32
183
184#define __ASM_X86_64_ELF_H 1
185#define elf_read_implies_exec(ex, have_pt_gnu_stack) (!(have_pt_gnu_stack))
186//#include <asm/ia32.h>
187#include <linux/elf.h>
188
189typedef struct user_i387_ia32_struct elf_fpregset_t;
190typedef struct user32_fxsr_struct elf_fpxregset_t;
191
192
193static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *regs)
194{
195 ELF_CORE_COPY_REGS((*elfregs), regs)
196}
197
198static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs)
199{
200 struct pt_regs *pp = (struct pt_regs *)(t->thread.rsp0);
201 --pp;
202 ELF_CORE_COPY_REGS((*elfregs), pp);
203 /* fix wrong segments */
204 (*elfregs)[7] = t->thread.ds;
205 (*elfregs)[9] = t->thread.fsindex;
206 (*elfregs)[10] = t->thread.gsindex;
207 (*elfregs)[8] = t->thread.es;
208 return 1;
209}
210
211static inline int
212elf_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, elf_fpregset_t *fpu)
213{
214 struct _fpstate_ia32 *fpstate = (void*)fpu;
215 mm_segment_t oldfs = get_fs();
216
217 if (!tsk_used_math(tsk))
218 return 0;
219 if (!regs)
220 regs = (struct pt_regs *)tsk->thread.rsp0;
221 --regs;
222 if (tsk == current)
223 unlazy_fpu(tsk);
224 set_fs(KERNEL_DS);
225 save_i387_ia32(tsk, fpstate, regs, 1);
226 /* Work around an i386 quirk: i386 puts the fop into the upper 16 bits
227 of the tag word (as FXSAVE does), not into the fcs. */
228 fpstate->cssel |= fpstate->tag & 0xffff0000;
229 set_fs(oldfs);
230 return 1;
231}
232
233#define ELF_CORE_COPY_XFPREGS 1
234static inline int
235elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu)
236{
237 struct pt_regs *regs = ((struct pt_regs *)(t->thread.rsp0))-1;
238 if (!tsk_used_math(t))
239 return 0;
240 if (t == current)
241 unlazy_fpu(t);
242 memcpy(xfpu, &t->thread.i387.fxsave, sizeof(elf_fpxregset_t));
243 xfpu->fcs = regs->cs;
244 xfpu->fos = t->thread.ds; /* XXX: is ds the right selector here? */
245 return 1;
246}
247
248#undef elf_check_arch
249#define elf_check_arch(x) \
250 ((x)->e_machine == EM_386)
251
252extern int force_personality32;
253
254#define ELF_EXEC_PAGESIZE PAGE_SIZE
255#define ELF_HWCAP (boot_cpu_data.x86_capability[0])
256#define ELF_PLATFORM ("i686")
257#define SET_PERSONALITY(ex, ibcs2) \
258do { \
259 unsigned long new_flags = 0; \
260 if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \
261 new_flags = _TIF_IA32; \
262 if ((current_thread_info()->flags & _TIF_IA32) \
263 != new_flags) \
264 set_thread_flag(TIF_ABI_PENDING); \
265 else \
266 clear_thread_flag(TIF_ABI_PENDING); \
267 /* XXX This overwrites the user-set personality */ \
268 current->personality |= force_personality32; \
269} while (0)
270
271/* Override some function names */
272#define elf_format elf32_format
273
274#define init_elf_binfmt init_elf32_binfmt
275#define exit_elf_binfmt exit_elf32_binfmt
276
277#define load_elf_binary load_elf32_binary
278
279#define ELF_PLAT_INIT(r, load_addr) elf32_init(r)
280#define setup_arg_pages(bprm, stack_top, exec_stack) \
281 ia32_setup_arg_pages(bprm, stack_top, exec_stack)
282int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack);
283
284#undef start_thread
285#define start_thread(regs,new_rip,new_rsp) do { \
286 asm volatile("movl %0,%%fs" :: "r" (0)); \
287 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); \
288 load_gs_index(0); \
289 (regs)->rip = (new_rip); \
290 (regs)->rsp = (new_rsp); \
291 (regs)->eflags = 0x200; \
292 (regs)->cs = __USER32_CS; \
293 (regs)->ss = __USER32_DS; \
294 set_fs(USER_DS); \
295} while(0)
296
297
298#define elf_map elf32_map
299
300#include <linux/module.h>
301
302MODULE_DESCRIPTION("Binary format loader for compatibility with IA32 ELF binaries.");
303MODULE_AUTHOR("Eric Youngdale, Andi Kleen");
304
305#undef MODULE_DESCRIPTION
306#undef MODULE_AUTHOR
307
308#define elf_addr_t __u32
309
310#undef TASK_SIZE
311#define TASK_SIZE 0xffffffff
312
313static void elf32_init(struct pt_regs *);
314
315#include "../../../fs/binfmt_elf.c"
316
317static void elf32_init(struct pt_regs *regs)
318{
319 struct task_struct *me = current;
320 regs->rdi = 0;
321 regs->rsi = 0;
322 regs->rdx = 0;
323 regs->rcx = 0;
324 regs->rax = 0;
325 regs->rbx = 0;
326 regs->rbp = 0;
327 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
328 regs->r13 = regs->r14 = regs->r15 = 0;
329 me->thread.fs = 0;
330 me->thread.gs = 0;
331 me->thread.fsindex = 0;
332 me->thread.gsindex = 0;
333 me->thread.ds = __USER_DS;
334 me->thread.es = __USER_DS;
335}
336
337int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack)
338{
339 unsigned long stack_base;
340 struct vm_area_struct *mpnt;
341 struct mm_struct *mm = current->mm;
342 int i, ret;
343
344 stack_base = IA32_STACK_TOP - MAX_ARG_PAGES * PAGE_SIZE;
345 mm->arg_start = bprm->p + stack_base;
346
347 bprm->p += stack_base;
348 if (bprm->loader)
349 bprm->loader += stack_base;
350 bprm->exec += stack_base;
351
352 mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
353 if (!mpnt)
354 return -ENOMEM;
355
356 if (security_vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) {
357 kmem_cache_free(vm_area_cachep, mpnt);
358 return -ENOMEM;
359 }
360
361 memset(mpnt, 0, sizeof(*mpnt));
362
363 down_write(&mm->mmap_sem);
364 {
365 mpnt->vm_mm = mm;
366 mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
367 mpnt->vm_end = IA32_STACK_TOP;
368 if (executable_stack == EXSTACK_ENABLE_X)
369 mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC;
370 else if (executable_stack == EXSTACK_DISABLE_X)
371 mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC;
372 else
373 mpnt->vm_flags = VM_STACK_FLAGS;
374 mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ?
375 PAGE_COPY_EXEC : PAGE_COPY;
376 if ((ret = insert_vm_struct(mm, mpnt))) {
377 up_write(&mm->mmap_sem);
378 kmem_cache_free(vm_area_cachep, mpnt);
379 return ret;
380 }
381 mm->stack_vm = mm->total_vm = vma_pages(mpnt);
382 }
383
384 for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
385 struct page *page = bprm->page[i];
386 if (page) {
387 bprm->page[i] = NULL;
388 install_arg_page(mpnt, page, stack_base);
389 }
390 stack_base += PAGE_SIZE;
391 }
392 up_write(&mm->mmap_sem);
393
394 return 0;
395}
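[Editor's note] The arithmetic above carves the argument pages out of the top of the 32-bit address space. A sketch with illustrative numbers only: IA32_STACK_TOP's real value lives in <asm/ia32.h>, so 0xc0000000 below is a placeholder, as are the page count and page size:

	#include <stdio.h>

	int main(void)
	{
		/* placeholder values; the kernel's own constants may differ */
		unsigned long ia32_stack_top = 0xc0000000UL;
		unsigned long max_arg_pages = 32, page_size = 4096;
		unsigned long stack_base = ia32_stack_top - max_arg_pages * page_size;

		printf("argument pages occupy [%#lx, %#lx)\n",
		       stack_base, ia32_stack_top);
		return 0;
	}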
396
397static unsigned long
398elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type)
399{
400 unsigned long map_addr;
401 struct task_struct *me = current;
402
403 down_write(&me->mm->mmap_sem);
404 map_addr = do_mmap(filep, ELF_PAGESTART(addr),
405 eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr), prot,
406 type,
407 eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr));
408 up_write(&me->mm->mmap_sem);
409 return(map_addr);
410}
411
412#ifdef CONFIG_SYSCTL
413/* Register vsyscall32 into the ABI table */
414#include <linux/sysctl.h>
415
416static ctl_table abi_table2[] = {
417 { 99, "vsyscall32", &sysctl_vsyscall32, sizeof(int), 0644, NULL,
418 proc_dointvec },
419 { 0, }
420};
421
422static ctl_table abi_root_table2[] = {
423 { .ctl_name = CTL_ABI, .procname = "abi", .mode = 0555,
424 .child = abi_table2 },
425 { 0 },
426};
427
428static __init int ia32_binfmt_init(void)
429{
430 register_sysctl_table(abi_root_table2, 1);
431 return 0;
432}
433__initcall(ia32_binfmt_init);
434#endif
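[Editor's note] With CONFIG_SYSCTL, the tables above surface sysctl_vsyscall32 as /proc/sys/abi/vsyscall32. A small sketch that reads the knob back, assuming procfs is mounted:

	#include <stdio.h>

	int main(void)
	{
		int val;
		FILE *f = fopen("/proc/sys/abi/vsyscall32", "r");

		if (!f) {
			perror("/proc/sys/abi/vsyscall32");
			return 1;
		}
		if (fscanf(f, "%d", &val) == 1)
			printf("vsyscall32 = %d\n", val);
		fclose(f);
		return 0;
	}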
diff --git a/arch/x86_64/ia32/ia32_ioctl.c b/arch/x86_64/ia32/ia32_ioctl.c
new file mode 100644
index 000000000000..d259f8a6f811
--- /dev/null
+++ b/arch/x86_64/ia32/ia32_ioctl.c
@@ -0,0 +1,201 @@
1/* $Id: ia32_ioctl.c,v 1.25 2002/10/11 07:17:06 ak Exp $
2 * ioctl32.c: Conversion between 32bit and 64bit native ioctls.
3 *
4 * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com)
5 * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be)
6 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs
7 *
8 * These routines maintain argument size conversion between 32bit and 64bit
9 * ioctls.
10 */
11
12#define INCLUDES
13#include <linux/syscalls.h>
14#include "compat_ioctl.c"
15#include <asm/mtrr.h>
16#include <asm/ia32.h>
17
18#define CODE
19#include "compat_ioctl.c"
20
21#ifndef TIOCGDEV
22#define TIOCGDEV _IOR('T',0x32, unsigned int)
23#endif
24static int tiocgdev(unsigned fd, unsigned cmd, unsigned int __user *ptr)
25{
26 struct file *file = fget(fd);
27 struct tty_struct *real_tty;
28 int ret = -EINVAL;
29
30 if (!file)
31 return -EBADF;
32 real_tty = (struct tty_struct *)file->private_data;
33 /* drop the reference taken by fget() on every exit path */
34 if (file->f_op->ioctl == tty_ioctl && real_tty)
35 ret = put_user(new_encode_dev(tty_devnum(real_tty)), ptr);
36 fput(file);
37 return ret;
38}
39
40#define RTC_IRQP_READ32 _IOR('p', 0x0b, unsigned int) /* Read IRQ rate */
41#define RTC_IRQP_SET32 _IOW('p', 0x0c, unsigned int) /* Set IRQ rate */
42#define RTC_EPOCH_READ32 _IOR('p', 0x0d, unsigned) /* Read epoch */
43#define RTC_EPOCH_SET32 _IOW('p', 0x0e, unsigned) /* Set epoch */
44
45static int rtc32_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
46{
47 unsigned long val;
48 mm_segment_t oldfs = get_fs();
49 int ret;
50
51 switch (cmd) {
52 case RTC_IRQP_READ32:
53 set_fs(KERNEL_DS);
54 ret = sys_ioctl(fd, RTC_IRQP_READ, (unsigned long)&val);
55 set_fs(oldfs);
56 if (!ret)
57 ret = put_user(val, (unsigned int __user *) arg);
58 return ret;
59
60 case RTC_IRQP_SET32:
61 cmd = RTC_IRQP_SET;
62 break;
63
64 case RTC_EPOCH_READ32:
65 set_fs(KERNEL_DS);
66 ret = sys_ioctl(fd, RTC_EPOCH_READ, (unsigned long) &val);
67 set_fs(oldfs);
68 if (!ret)
69 ret = put_user(val, (unsigned int __user *) arg);
70 return ret;
71
72 case RTC_EPOCH_SET32:
73 cmd = RTC_EPOCH_SET;
74 break;
75 }
76 return sys_ioctl(fd,cmd,arg);
77}
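[Editor's note] The reason rtc32_ioctl() exists is baked into the ioctl numbers themselves: _IOR()/_IOW() encode sizeof() of the argument type, so the 4-byte 32-bit encoding can never match the kernel's 8-byte native one. A sketch that makes the mismatch visible on a 64-bit build, assuming the Linux UAPI headers are installed:

	#include <stdio.h>
	#include <linux/ioctl.h>

	#define RTC_IRQP_READ32	_IOR('p', 0x0b, unsigned int)	/* ia32 view */
	#define RTC_IRQP_READ64	_IOR('p', 0x0b, unsigned long)	/* native view */

	int main(void)
	{
		/* same 'p'/0x0b pair, different encoded size field */
		printf("32-bit RTC_IRQP_READ = %#lx\n", (unsigned long)RTC_IRQP_READ32);
		printf("64-bit RTC_IRQP_READ = %#lx\n", (unsigned long)RTC_IRQP_READ64);
		return 0;
	}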
78
79/* /proc/mtrr ioctls */
80
81
82struct mtrr_sentry32
83{
84 compat_ulong_t base; /* Base address */
85 compat_uint_t size; /* Size of region */
86 compat_uint_t type; /* Type of region */
87};
88
89struct mtrr_gentry32
90{
91 compat_ulong_t regnum; /* Register number */
92 compat_uint_t base; /* Base address */
93 compat_uint_t size; /* Size of region */
94 compat_uint_t type; /* Type of region */
95};
96
97#define MTRR_IOCTL_BASE 'M'
98
99#define MTRRIOC32_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry32)
100#define MTRRIOC32_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry32)
101#define MTRRIOC32_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry32)
102#define MTRRIOC32_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry32)
103#define MTRRIOC32_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry32)
104#define MTRRIOC32_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry32)
105#define MTRRIOC32_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry32)
106#define MTRRIOC32_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry32)
107#define MTRRIOC32_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry32)
108#define MTRRIOC32_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32)
109
110
111static int mtrr_ioctl32(unsigned int fd, unsigned int cmd, unsigned long arg)
112{
113 struct mtrr_gentry g;
114 struct mtrr_sentry s;
115 int get = 0, err = 0;
116 struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)arg;
117 mm_segment_t oldfs = get_fs();
118
119 switch (cmd) {
120#define SET(x) case MTRRIOC32_ ## x ## _ENTRY: cmd = MTRRIOC_ ## x ## _ENTRY; break
121#define GET(x) case MTRRIOC32_ ## x ## _ENTRY: cmd = MTRRIOC_ ## x ## _ENTRY; get=1; break
122 SET(ADD);
123 SET(SET);
124 SET(DEL);
125 GET(GET);
126 SET(KILL);
127 SET(ADD_PAGE);
128 SET(SET_PAGE);
129 SET(DEL_PAGE);
130 GET(GET_PAGE);
131 SET(KILL_PAGE);
132 }
133
134 if (get) {
135 err = get_user(g.regnum, &g32->regnum);
136 err |= get_user(g.base, &g32->base);
137 err |= get_user(g.size, &g32->size);
138 err |= get_user(g.type, &g32->type);
139
140 arg = (unsigned long)&g;
141 } else {
142 struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)arg;
143 err = get_user(s.base, &s32->base);
144 err |= get_user(s.size, &s32->size);
145 err |= get_user(s.type, &s32->type);
146
147 arg = (unsigned long)&s;
148 }
149 if (err) return err;
150
151 set_fs(KERNEL_DS);
152 err = sys_ioctl(fd, cmd, arg);
153 set_fs(oldfs);
154
155 if (!err && get) {
156 err = put_user(g.base, &g32->base);
157 err |= put_user(g.size, &g32->size);
158 err |= put_user(g.regnum, &g32->regnum);
159 err |= put_user(g.type, &g32->type);
160 }
161 return err;
162}
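[Editor's note] mtrr_ioctl32() has to repack the structures because 'unsigned long base' shrinks from 8 to 4 bytes under ia32, which changes both the layout and the generated command numbers. A sketch of the two layouts, using fixed-width stand-ins for the compat types:

	#include <stdio.h>
	#include <stdint.h>

	struct mtrr_sentry32 {		/* what a 32-bit caller passes */
		uint32_t base;
		uint32_t size;
		uint32_t type;
	};

	struct mtrr_sentry64 {		/* what the 64-bit kernel expects */
		uint64_t base;
		uint32_t size;
		uint32_t type;
	};

	int main(void)
	{
		printf("sizeof(mtrr_sentry32) = %zu\n", sizeof(struct mtrr_sentry32));
		printf("sizeof(mtrr_sentry64) = %zu\n", sizeof(struct mtrr_sentry64));
		return 0;
	}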
163
164#define HANDLE_IOCTL(cmd,handler) { (cmd), (ioctl_trans_handler_t)(handler) },
165#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd,sys_ioctl)
166
167struct ioctl_trans ioctl_start[] = {
168#include <linux/compat_ioctl.h>
169#define DECLARES
170#include "compat_ioctl.c"
171COMPATIBLE_IOCTL(HDIO_SET_KEEPSETTINGS)
172COMPATIBLE_IOCTL(HDIO_SCAN_HWIF)
173COMPATIBLE_IOCTL(BLKRASET)
174COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */
175COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */
176COMPATIBLE_IOCTL(FIOQSIZE)
177
178/* And these ioctls need translation */
179HANDLE_IOCTL(TIOCGDEV, tiocgdev)
180/* realtime device */
181HANDLE_IOCTL(RTC_IRQP_READ, rtc32_ioctl)
182HANDLE_IOCTL(RTC_IRQP_READ32,rtc32_ioctl)
183HANDLE_IOCTL(RTC_IRQP_SET32, rtc32_ioctl)
184HANDLE_IOCTL(RTC_EPOCH_READ32, rtc32_ioctl)
185HANDLE_IOCTL(RTC_EPOCH_SET32, rtc32_ioctl)
186/* take care of sizeof(sizeof()) breakage */
187/* mtrr */
188HANDLE_IOCTL(MTRRIOC32_ADD_ENTRY, mtrr_ioctl32)
189HANDLE_IOCTL(MTRRIOC32_SET_ENTRY, mtrr_ioctl32)
190HANDLE_IOCTL(MTRRIOC32_DEL_ENTRY, mtrr_ioctl32)
191HANDLE_IOCTL(MTRRIOC32_GET_ENTRY, mtrr_ioctl32)
192HANDLE_IOCTL(MTRRIOC32_KILL_ENTRY, mtrr_ioctl32)
193HANDLE_IOCTL(MTRRIOC32_ADD_PAGE_ENTRY, mtrr_ioctl32)
194HANDLE_IOCTL(MTRRIOC32_SET_PAGE_ENTRY, mtrr_ioctl32)
195HANDLE_IOCTL(MTRRIOC32_DEL_PAGE_ENTRY, mtrr_ioctl32)
196HANDLE_IOCTL(MTRRIOC32_GET_PAGE_ENTRY, mtrr_ioctl32)
197HANDLE_IOCTL(MTRRIOC32_KILL_PAGE_ENTRY, mtrr_ioctl32)
198};
199
200int ioctl_table_size = ARRAY_SIZE(ioctl_start);
201
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
new file mode 100644
index 000000000000..fbd09b5126ce
--- /dev/null
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -0,0 +1,621 @@
1/*
2 * linux/arch/x86_64/ia32/ia32_signal.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
7 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
8 * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen
9 *
10 * $Id: ia32_signal.c,v 1.22 2002/07/29 10:34:03 ak Exp $
11 */
12
13#include <linux/sched.h>
14#include <linux/mm.h>
15#include <linux/smp.h>
16#include <linux/smp_lock.h>
17#include <linux/kernel.h>
18#include <linux/signal.h>
19#include <linux/errno.h>
20#include <linux/wait.h>
21#include <linux/ptrace.h>
22#include <linux/unistd.h>
23#include <linux/stddef.h>
24#include <linux/personality.h>
25#include <linux/compat.h>
26#include <asm/ucontext.h>
27#include <asm/uaccess.h>
28#include <asm/i387.h>
29#include <asm/ia32.h>
30#include <asm/ptrace.h>
31#include <asm/ia32_unistd.h>
32#include <asm/user32.h>
33#include <asm/sigcontext32.h>
34#include <asm/fpu32.h>
35#include <asm/proto.h>
36#include <asm/vsyscall32.h>
37
38#define DEBUG_SIG 0
39
40#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
41
42asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
43void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
44
45int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
46{
47 int err;
48 if (!access_ok (VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
49 return -EFAULT;
50
51 /* If you change the siginfo_t structure, please make sure this
52 code is updated accordingly.
53 To avoid leaking kernel data it must never copy any padding
54 contained in the structure; it must copy only the generic
55 three ints plus the relevant union member. */
56 err = __put_user(from->si_signo, &to->si_signo);
57 err |= __put_user(from->si_errno, &to->si_errno);
58 err |= __put_user((short)from->si_code, &to->si_code);
59
60 if (from->si_code < 0) {
61 err |= __put_user(from->si_pid, &to->si_pid);
62 err |= __put_user(from->si_uid, &to->si_uid);
63 err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr);
64 } else {
65 /* First 32bits of unions are always present:
66 * si_pid === si_band === si_tid === si_addr(LS half) */
67 err |= __put_user(from->_sifields._pad[0], &to->_sifields._pad[0]);
68 switch (from->si_code >> 16) {
69 case __SI_FAULT >> 16:
70 break;
71 case __SI_CHLD >> 16:
72 err |= __put_user(from->si_utime, &to->si_utime);
73 err |= __put_user(from->si_stime, &to->si_stime);
74 err |= __put_user(from->si_status, &to->si_status);
75 /* FALL THROUGH */
76 default:
77 case __SI_KILL >> 16:
78 err |= __put_user(from->si_uid, &to->si_uid);
79 break;
80 case __SI_POLL >> 16:
81 err |= __put_user(from->si_fd, &to->si_fd);
82 break;
83 case __SI_TIMER >> 16:
84 err |= __put_user(from->si_overrun, &to->si_overrun);
85 err |= __put_user(ptr_to_compat(from->si_ptr),
86 &to->si_ptr);
87 break;
88 case __SI_RT >> 16: /* Not currently generated by the kernel. */
89 case __SI_MESGQ >> 16:
90 err |= __put_user(from->si_uid, &to->si_uid);
91 err |= __put_user(from->si_int, &to->si_int);
92 break;
93 }
94 }
95 return err;
96}
97
98int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
99{
100 int err;
101 u32 ptr32;
102 if (!access_ok (VERIFY_READ, from, sizeof(compat_siginfo_t)))
103 return -EFAULT;
104
105 err = __get_user(to->si_signo, &from->si_signo);
106 err |= __get_user(to->si_errno, &from->si_errno);
107 err |= __get_user(to->si_code, &from->si_code);
108
109 err |= __get_user(to->si_pid, &from->si_pid);
110 err |= __get_user(to->si_uid, &from->si_uid);
111 err |= __get_user(ptr32, &from->si_ptr);
112 to->si_ptr = compat_ptr(ptr32);
113
114 return err;
115}
116
117asmlinkage long
118sys32_sigsuspend(int history0, int history1, old_sigset_t mask,
119 struct pt_regs *regs)
120{
121 sigset_t saveset;
122
123 mask &= _BLOCKABLE;
124 spin_lock_irq(&current->sighand->siglock);
125 saveset = current->blocked;
126 siginitset(&current->blocked, mask);
127 recalc_sigpending();
128 spin_unlock_irq(&current->sighand->siglock);
129
130 regs->rax = -EINTR;
131 while (1) {
132 current->state = TASK_INTERRUPTIBLE;
133 schedule();
134 if (do_signal(regs, &saveset))
135 return -EINTR;
136 }
137}
138
139asmlinkage long
140sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
141 stack_ia32_t __user *uoss_ptr,
142 struct pt_regs *regs)
143{
144 stack_t uss,uoss;
145 int ret;
146 mm_segment_t seg;
147 if (uss_ptr) {
148 u32 ptr;
149 memset(&uss,0,sizeof(stack_t));
150 if (!access_ok(VERIFY_READ,uss_ptr,sizeof(stack_ia32_t)) ||
151 __get_user(ptr, &uss_ptr->ss_sp) ||
152 __get_user(uss.ss_flags, &uss_ptr->ss_flags) ||
153 __get_user(uss.ss_size, &uss_ptr->ss_size))
154 return -EFAULT;
155 uss.ss_sp = compat_ptr(ptr);
156 }
157 seg = get_fs();
158 set_fs(KERNEL_DS);
159 ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->rsp);
160 set_fs(seg);
161 if (ret >= 0 && uoss_ptr) {
162 if (!access_ok(VERIFY_WRITE,uoss_ptr,sizeof(stack_ia32_t)) ||
163 __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
164 __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
165 __put_user(uoss.ss_size, &uoss_ptr->ss_size))
166 ret = -EFAULT;
167 }
168 return ret;
169}
170
171/*
172 * Do a signal return; undo the signal stack.
173 */
174
175struct sigframe
176{
177 u32 pretcode;
178 int sig;
179 struct sigcontext_ia32 sc;
180 struct _fpstate_ia32 fpstate;
181 unsigned int extramask[_COMPAT_NSIG_WORDS-1];
182 char retcode[8];
183};
184
185struct rt_sigframe
186{
187 u32 pretcode;
188 int sig;
189 u32 pinfo;
190 u32 puc;
191 compat_siginfo_t info;
192 struct ucontext_ia32 uc;
193 struct _fpstate_ia32 fpstate;
194 char retcode[8];
195};
196
197static int
198ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, unsigned int *peax)
199{
200 unsigned int err = 0;
201
202 /* Always make any pending restarted system calls return -EINTR */
203 current_thread_info()->restart_block.fn = do_no_restart_syscall;
204
205#if DEBUG_SIG
206 printk("SIG restore_sigcontext: sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n",
207 sc, sc->err, sc->eip, sc->cs, sc->eflags);
208#endif
209#define COPY(x) { \
210 unsigned int reg; \
211 err |= __get_user(reg, &sc->e ##x); \
212 regs->r ## x = reg; \
213}
214
215#define RELOAD_SEG(seg,mask) \
216 { unsigned int cur; \
217 unsigned short pre; \
218 err |= __get_user(pre, &sc->seg); \
219 asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \
220 pre |= mask; \
221 if (pre != cur) loadsegment(seg,pre); }
222
223 /* Reload fs and gs if they have changed in the signal handler.
224 This does not handle long fs/gs base changes in the handler,
225 but at least does not clobber them in the normal case. */
226
227 {
228 unsigned gs, oldgs;
229 err |= __get_user(gs, &sc->gs);
230 gs |= 3;
231 asm("movl %%gs,%0" : "=r" (oldgs));
232 if (gs != oldgs)
233 load_gs_index(gs);
234 }
235 RELOAD_SEG(fs,3);
236 RELOAD_SEG(ds,3);
237 RELOAD_SEG(es,3);
238
239 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
240 COPY(dx); COPY(cx); COPY(ip);
241 /* Don't touch extended registers */
242
243 err |= __get_user(regs->cs, &sc->cs);
244 regs->cs |= 3;
245 err |= __get_user(regs->ss, &sc->ss);
246 regs->ss |= 3;
247
248 {
249 unsigned int tmpflags;
250 err |= __get_user(tmpflags, &sc->eflags);
251 regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
252 regs->orig_rax = -1; /* disable syscall checks */
253 }
254
255 {
256 u32 tmp;
257 struct _fpstate_ia32 __user * buf;
258 err |= __get_user(tmp, &sc->fpstate);
259 buf = compat_ptr(tmp);
260 if (buf) {
261 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
262 goto badframe;
263 err |= restore_i387_ia32(current, buf, 0);
264 } else {
265 struct task_struct *me = current;
266 if (used_math()) {
267 clear_fpu(me);
268 clear_used_math();
269 }
270 }
271 }
272
273 {
274 u32 tmp;
275 err |= __get_user(tmp, &sc->eax);
276 *peax = tmp;
277 }
278 return err;
279
280badframe:
281 return 1;
282}
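[Editor's note] The 0x40DD5 mask used in ia32_restore_sigcontext() above is the set of user-modifiable arithmetic and control flags; every other EFLAGS bit is preserved from the kernel's view. A sketch that decodes the mask bit by bit:

	#include <stdio.h>

	int main(void)
	{
		static const struct { unsigned long bit; const char *name; } flags[] = {
			{ 0x00001, "CF" }, { 0x00004, "PF" }, { 0x00010, "AF" },
			{ 0x00040, "ZF" }, { 0x00080, "SF" }, { 0x00100, "TF" },
			{ 0x00400, "DF" }, { 0x00800, "OF" }, { 0x40000, "AC" },
		};
		unsigned long mask = 0x40DD5;
		unsigned int i;

		/* the nine bits above sum to exactly 0x40DD5 */
		for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++)
			if (mask & flags[i].bit)
				printf("%s ", flags[i].name);
		printf("\n");
		return 0;
	}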
283
284asmlinkage long sys32_sigreturn(struct pt_regs *regs)
285{
286 struct sigframe __user *frame = (struct sigframe __user *)(regs->rsp-8);
287 sigset_t set;
288 unsigned int eax;
289
290 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
291 goto badframe;
292 if (__get_user(set.sig[0], &frame->sc.oldmask)
293 || (_COMPAT_NSIG_WORDS > 1
294 && __copy_from_user((((char *) &set.sig) + 4), &frame->extramask,
295 sizeof(frame->extramask))))
296 goto badframe;
297
298 sigdelsetmask(&set, ~_BLOCKABLE);
299 spin_lock_irq(&current->sighand->siglock);
300 current->blocked = set;
301 recalc_sigpending();
302 spin_unlock_irq(&current->sighand->siglock);
303
304 if (ia32_restore_sigcontext(regs, &frame->sc, &eax))
305 goto badframe;
306 return eax;
307
308badframe:
309 signal_fault(regs, frame, "32bit sigreturn");
310 return 0;
311}
312
313asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
314{
315 struct rt_sigframe __user *frame;
316 sigset_t set;
317 unsigned int eax;
318 struct pt_regs tregs;
319
320 frame = (struct rt_sigframe __user *)(regs->rsp - 4);
321
322 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
323 goto badframe;
324 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
325 goto badframe;
326
327 sigdelsetmask(&set, ~_BLOCKABLE);
328 spin_lock_irq(&current->sighand->siglock);
329 current->blocked = set;
330 recalc_sigpending();
331 spin_unlock_irq(&current->sighand->siglock);
332
333 if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
334 goto badframe;
335
336 tregs = *regs;
337 if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT)
338 goto badframe;
339
340 return eax;
341
342badframe:
343 signal_fault(regs,frame,"32bit rt sigreturn");
344 return 0;
345}
346
347/*
348 * Set up a signal frame.
349 */
350
351static int
352ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __user *fpstate,
353 struct pt_regs *regs, unsigned int mask)
354{
355 int tmp, err = 0;
356 u32 eflags;
357
358 tmp = 0;
359 __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
360 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
361 __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp));
362 err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
363 __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp));
364 err |= __put_user(tmp, (unsigned int __user *)&sc->ds);
365 __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp));
366 err |= __put_user(tmp, (unsigned int __user *)&sc->es);
367
368 err |= __put_user((u32)regs->rdi, &sc->edi);
369 err |= __put_user((u32)regs->rsi, &sc->esi);
370 err |= __put_user((u32)regs->rbp, &sc->ebp);
371 err |= __put_user((u32)regs->rsp, &sc->esp);
372 err |= __put_user((u32)regs->rbx, &sc->ebx);
373 err |= __put_user((u32)regs->rdx, &sc->edx);
374 err |= __put_user((u32)regs->rcx, &sc->ecx);
375 err |= __put_user((u32)regs->rax, &sc->eax);
376 err |= __put_user((u32)regs->cs, &sc->cs);
377 err |= __put_user((u32)regs->ss, &sc->ss);
378 err |= __put_user(current->thread.trap_no, &sc->trapno);
379 err |= __put_user(current->thread.error_code, &sc->err);
380 err |= __put_user((u32)regs->rip, &sc->eip);
381 eflags = regs->eflags;
382 if (current->ptrace & PT_PTRACED)
383 eflags &= ~TF_MASK;
384 err |= __put_user((u32)eflags, &sc->eflags);
385 err |= __put_user((u32)regs->rsp, &sc->esp_at_signal);
386
387 tmp = save_i387_ia32(current, fpstate, regs, 0);
388 if (tmp < 0)
389 err = -EFAULT;
390 else {
391 clear_used_math();
392 stts();
393 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
394 &sc->fpstate);
395 }
396
397 /* non-iBCS2 extensions.. */
398 err |= __put_user(mask, &sc->oldmask);
399 err |= __put_user(current->thread.cr2, &sc->cr2);
400
401 return err;
402}
403
404/*
405 * Determine which stack to use..
406 */
407static void __user *
408get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
409{
410 unsigned long rsp;
411
412 /* Default to using normal stack */
413 rsp = regs->rsp;
414
415 /* This is the X/Open sanctioned signal stack switching. */
416 if (ka->sa.sa_flags & SA_ONSTACK) {
417 if (sas_ss_flags(rsp) == 0)
418 rsp = current->sas_ss_sp + current->sas_ss_size;
419 }
420
421 /* This is the legacy signal stack switching. */
422 else if ((regs->ss & 0xffff) != __USER_DS &&
423 !(ka->sa.sa_flags & SA_RESTORER) &&
424 ka->sa.sa_restorer) {
425 rsp = (unsigned long) ka->sa.sa_restorer;
426 }
427
428 return (void __user *)((rsp - frame_size) & -8UL);
429}
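[Editor's note] The final expression rounds the chosen stack address down to an 8-byte boundary; & -8UL clears the low three bits. A quick illustration with an arbitrary example value:

	#include <stdio.h>

	int main(void)
	{
		unsigned long rsp = 0xffffd967;		/* arbitrary example */
		unsigned long frame_size = 0x2a0;
		unsigned long frame = (rsp - frame_size) & -8UL;

		printf("frame at %#lx (aligned: %s)\n",
		       frame, (frame & 7) == 0 ? "yes" : "no");
		return 0;
	}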
430
431void ia32_setup_frame(int sig, struct k_sigaction *ka,
432 compat_sigset_t *set, struct pt_regs * regs)
433{
434 struct sigframe __user *frame;
435 int err = 0;
436
437 frame = get_sigframe(ka, regs, sizeof(*frame));
438
439 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
440 goto give_sigsegv;
441
442 {
443 struct exec_domain *ed = current_thread_info()->exec_domain;
444 err |= __put_user((ed
445 && ed->signal_invmap
446 && sig < 32
447 ? ed->signal_invmap[sig]
448 : sig),
449 &frame->sig);
450 }
451 if (err)
452 goto give_sigsegv;
453
454 err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs,
455 set->sig[0]);
456 if (err)
457 goto give_sigsegv;
458
459 if (_COMPAT_NSIG_WORDS > 1) {
460 err |= __copy_to_user(frame->extramask, &set->sig[1],
461 sizeof(frame->extramask));
462 }
463 if (err)
464 goto give_sigsegv;
465
466 /* Return stub is in 32bit vsyscall page */
467 {
468 void __user *restorer = VSYSCALL32_SIGRETURN;
469 if (ka->sa.sa_flags & SA_RESTORER)
470 restorer = ka->sa.sa_restorer;
471 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
472 }
473 /* These are actually not used anymore, but left because some
474 gdb versions depend on them as a marker. */
475 {
476 /* copy_to_user optimizes that into a single 8 byte store */
477 static const struct {
478 u16 poplmovl;
479 u32 val;
480 u16 int80;
481 u16 pad;
482 } __attribute__((packed)) code = {
483 0xb858, /* popl %eax ; movl $...,%eax */
484 __NR_ia32_sigreturn,
485 0x80cd, /* int $0x80 */
486 0,
487 };
488 err |= __copy_to_user(frame->retcode, &code, 8);
489 }
490 if (err)
491 goto give_sigsegv;
492
493 /* Set up registers for signal handler */
494 regs->rsp = (unsigned long) frame;
495 regs->rip = (unsigned long) ka->sa.sa_handler;
496
497 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
498 asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
499
500 regs->cs = __USER32_CS;
501 regs->ss = __USER32_DS;
502
503 set_fs(USER_DS);
504 if (regs->eflags & TF_MASK) {
505 if (current->ptrace & PT_PTRACED) {
506 ptrace_notify(SIGTRAP);
507 } else {
508 regs->eflags &= ~TF_MASK;
509 }
510 }
511
512#if DEBUG_SIG
513 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
514 current->comm, current->pid, frame, regs->rip, frame->pretcode);
515#endif
516
517 return;
518
519give_sigsegv:
520 force_sigsegv(sig, current);
521}
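[Editor's note] The packed 'code' struct above stores the bytes 58 b8 <nr> cd 80, i.e. popl %eax; movl $nr,%eax; int $0x80. A sketch decoding such a stub, assuming the i386 value __NR_sigreturn == 119:

	#include <stdio.h>
	#include <string.h>
	#include <stdint.h>

	int main(void)
	{
		/* same byte layout the kernel writes into frame->retcode */
		unsigned char stub[8] = { 0x58, 0xb8, 119, 0, 0, 0, 0xcd, 0x80 };
		uint32_t nr;

		memcpy(&nr, stub + 2, sizeof(nr));	/* little-endian immediate */
		printf("popl %%eax; movl $%u,%%eax; int $0x80\n", nr);
		return 0;
	}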
522
523void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
524 compat_sigset_t *set, struct pt_regs * regs)
525{
526 struct rt_sigframe __user *frame;
527 int err = 0;
528
529 frame = get_sigframe(ka, regs, sizeof(*frame));
530
531 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
532 goto give_sigsegv;
533
534 {
535 struct exec_domain *ed = current_thread_info()->exec_domain;
536 err |= __put_user((ed
537 && ed->signal_invmap
538 && sig < 32
539 ? ed->signal_invmap[sig]
540 : sig),
541 &frame->sig);
542 }
543 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
544 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
545 err |= copy_siginfo_to_user32(&frame->info, info);
546 if (err)
547 goto give_sigsegv;
548
549 /* Create the ucontext. */
550 err |= __put_user(0, &frame->uc.uc_flags);
551 err |= __put_user(0, &frame->uc.uc_link);
552 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
553 err |= __put_user(sas_ss_flags(regs->rsp),
554 &frame->uc.uc_stack.ss_flags);
555 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
556 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
557 regs, set->sig[0]);
558 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
559 if (err)
560 goto give_sigsegv;
561
562
563 {
564 void __user *restorer = VSYSCALL32_RTSIGRETURN;
565 if (ka->sa.sa_flags & SA_RESTORER)
566 restorer = ka->sa.sa_restorer;
567 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
568 }
569
570 /* This is movl $__NR_ia32_rt_sigreturn,%eax ; int $0x80 */
571 /* Not actually used anymore, but left because some gdb versions
572 need it. */
573 {
574 /* __copy_to_user optimizes that into a single 8 byte store */
575 static const struct {
576 u8 movl;
577 u32 val;
578 u16 int80;
579 u16 pad;
580 u8 pad2;
581 } __attribute__((packed)) code = {
582 0xb8,
583 __NR_ia32_rt_sigreturn,
584 0x80cd,
585 0,
586 };
587 err |= __copy_to_user(frame->retcode, &code, 8);
588 }
589 if (err)
590 goto give_sigsegv;
591
592 /* Set up registers for signal handler */
593 regs->rsp = (unsigned long) frame;
594 regs->rip = (unsigned long) ka->sa.sa_handler;
595
596 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
597 asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
598
599 regs->cs = __USER32_CS;
600 regs->ss = __USER32_DS;
601
602 set_fs(USER_DS);
603 if (regs->eflags & TF_MASK) {
604 if (current->ptrace & PT_PTRACED) {
605 ptrace_notify(SIGTRAP);
606 } else {
607 regs->eflags &= ~TF_MASK;
608 }
609 }
610
611#if DEBUG_SIG
612 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
613 current->comm, current->pid, frame, regs->rip, frame->pretcode);
614#endif
615
616 return;
617
618give_sigsegv:
619 force_sigsegv(sig, current);
620}
621
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
new file mode 100644
index 000000000000..f3ca0db85b5b
--- /dev/null
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -0,0 +1,602 @@
1/*
2 * Compatibility mode system call entry point for x86-64.
3 *
4 * Copyright 2000-2002 Andi Kleen, SuSE Labs.
5 */
6
7#include <asm/dwarf2.h>
8#include <asm/calling.h>
9#include <asm/offset.h>
10#include <asm/current.h>
11#include <asm/errno.h>
12#include <asm/ia32_unistd.h>
13#include <asm/thread_info.h>
14#include <asm/segment.h>
15#include <asm/vsyscall32.h>
16#include <linux/linkage.h>
17
18 .macro IA32_ARG_FIXUP noebp=0
19 movl %edi,%r8d
20 .if \noebp
21 .else
22 movl %ebp,%r9d
23 .endif
24 xchg %ecx,%esi
25 movl %ebx,%edi
26 movl %edx,%edx /* zero extension */
27 .endm
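[Editor's note] IA32_ARG_FIXUP translates from the i386 convention (args in %ebx, %ecx, %edx, %esi, %edi, %ebp) to the x86-64 C convention the sys_* handlers expect (%rdi, %rsi, %rdx, %rcx, %r8, %r9). A hedged userspace sketch of the source convention; compile with -m32 so int $0x80 takes the 32-bit path:

	#include <unistd.h>

	/* __NR_write is 4 on i386; eax carries the number, ebx/ecx/edx the args */
	static long int80_write(int fd, const void *buf, unsigned long len)
	{
		long ret;
		__asm__ volatile ("int $0x80"
				  : "=a" (ret)
				  : "a" (4), "b" (fd), "c" (buf), "d" (len)
				  : "memory");
		return ret;
	}

	int main(void)
	{
		return int80_write(1, "hello\n", 6) == 6 ? 0 : 1;
	}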
28
29 /* clobbers %eax */
30 .macro CLEAR_RREGS
31 xorl %eax,%eax
32 movq %rax,R11(%rsp)
33 movq %rax,R10(%rsp)
34 movq %rax,R9(%rsp)
35 movq %rax,R8(%rsp)
36 .endm
37
38/*
39 * 32bit SYSENTER instruction entry.
40 *
41 * Arguments:
42 * %eax System call number.
43 * %ebx Arg1
44 * %ecx Arg2
45 * %edx Arg3
46 * %esi Arg4
47 * %edi Arg5
48 * %ebp user stack
49 * 0(%ebp) Arg6
50 *
51 * Interrupts off.
52 *
53 * This is purely a fast path. For anything complicated we use the int 0x80
54 * path below. Set up a complete hardware stack frame to share code
55 * with the int 0x80 path.
56 */
57ENTRY(ia32_sysenter_target)
58 CFI_STARTPROC
59 swapgs
60 movq %gs:pda_kernelstack, %rsp
61 addq $(PDA_STACKOFFSET),%rsp
62 sti
63 movl %ebp,%ebp /* zero extension */
64 pushq $__USER32_DS
65 pushq %rbp
66 pushfq
67 movl $VSYSCALL32_SYSEXIT, %r10d
68 pushq $__USER32_CS
69 movl %eax, %eax
70 pushq %r10
71 pushq %rax
72 cld
73 SAVE_ARGS 0,0,1
74 /* no need to do an access_ok check here because rbp has been
75 32bit zero extended */
761: movl (%rbp),%r9d
77 .section __ex_table,"a"
78 .quad 1b,ia32_badarg
79 .previous
80 GET_THREAD_INFO(%r10)
81 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
82 jnz sysenter_tracesys
83sysenter_do_call:
84 cmpl $(IA32_NR_syscalls),%eax
85 jae ia32_badsys
86 IA32_ARG_FIXUP 1
87 call *ia32_sys_call_table(,%rax,8)
88 movq %rax,RAX-ARGOFFSET(%rsp)
89 GET_THREAD_INFO(%r10)
90 cli
91 testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
92 jnz int_ret_from_sys_call
93 /* clear IF so that popfq doesn't enable interrupts early */
94 andl $~0x200,EFLAGS-R11(%rsp)
95 RESTORE_ARGS 1,24,1,1,1,1
96 popfq
97 popq %rcx /* User %esp */
98 movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */
99 swapgs
100 sti /* sti only takes effect after the next instruction */
101 /* sysexit */
102 .byte 0xf, 0x35
103
104sysenter_tracesys:
105 SAVE_REST
106 CLEAR_RREGS
107 movq $-ENOSYS,RAX(%rsp) /* really needed? */
108 movq %rsp,%rdi /* &pt_regs -> arg1 */
109 call syscall_trace_enter
110 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
111 RESTORE_REST
112 movl %ebp, %ebp
113 /* no need to do an access_ok check here because rbp has been
114 32bit zero extended */
1151: movl (%rbp),%r9d
116 .section __ex_table,"a"
117 .quad 1b,ia32_badarg
118 .previous
119 jmp sysenter_do_call
120 CFI_ENDPROC
121
122/*
123 * 32bit SYSCALL instruction entry.
124 *
125 * Arguments:
126 * %eax System call number.
127 * %ebx Arg1
128 * %ecx return EIP
129 * %edx Arg3
130 * %esi Arg4
131 * %edi Arg5
132 * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
133 * %esp user stack
134 * 0(%esp) Arg6
135 *
136 * Interrupts off.
137 *
138 * This is purely a fast path. For anything complicated we use the int 0x80
139 * path below. Set up a complete hardware stack frame to share code
140 * with the int 0x80 path.
141 */
142ENTRY(ia32_cstar_target)
143 CFI_STARTPROC
144 swapgs
145 movl %esp,%r8d
146 movq %gs:pda_kernelstack,%rsp
147 sti
148 SAVE_ARGS 8,1,1
149 movl %eax,%eax /* zero extension */
150 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
151 movq %rcx,RIP-ARGOFFSET(%rsp)
152 movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
153 movl %ebp,%ecx
154 movq $__USER32_CS,CS-ARGOFFSET(%rsp)
155 movq $__USER32_DS,SS-ARGOFFSET(%rsp)
156 movq %r11,EFLAGS-ARGOFFSET(%rsp)
157 movq %r8,RSP-ARGOFFSET(%rsp)
158 /* no need to do an access_ok check here because r8 has been
159 32bit zero extended */
160 /* hardware stack frame is complete now */
1611: movl (%r8),%r9d
162 .section __ex_table,"a"
163 .quad 1b,ia32_badarg
164 .previous
165 GET_THREAD_INFO(%r10)
166 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
167 jnz cstar_tracesys
168cstar_do_call:
169 cmpl $IA32_NR_syscalls,%eax
170 jae ia32_badsys
171 IA32_ARG_FIXUP 1
172 call *ia32_sys_call_table(,%rax,8)
173 movq %rax,RAX-ARGOFFSET(%rsp)
174 GET_THREAD_INFO(%r10)
175 cli
176 testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
177 jnz int_ret_from_sys_call
178 RESTORE_ARGS 1,-ARG_SKIP,1,1,1
179 movl RIP-ARGOFFSET(%rsp),%ecx
180 movl EFLAGS-ARGOFFSET(%rsp),%r11d
181 movl RSP-ARGOFFSET(%rsp),%esp
182 swapgs
183 sysretl
184
185cstar_tracesys:
186 SAVE_REST
187 CLEAR_RREGS
188 movq $-ENOSYS,RAX(%rsp) /* really needed? */
189 movq %rsp,%rdi /* &pt_regs -> arg1 */
190 call syscall_trace_enter
191 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
192 RESTORE_REST
193 movl RSP-ARGOFFSET(%rsp), %r8d
194 /* no need to do an access_ok check here because r8 has been
195 32bit zero extended */
1961: movl (%r8),%r9d
197 .section __ex_table,"a"
198 .quad 1b,ia32_badarg
199 .previous
200 jmp cstar_do_call
201
202ia32_badarg:
203 movq $-EFAULT,%rax
204 jmp ia32_sysret
205 CFI_ENDPROC
206
207/*
208 * Emulated IA32 system calls via int 0x80.
209 *
210 * Arguments:
211 * %eax System call number.
212 * %ebx Arg1
213 * %ecx Arg2
214 * %edx Arg3
215 * %esi Arg4
216 * %edi Arg5
217 * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
218 *
219 * Notes:
220 * Uses the same stack frame as the x86-64 version.
221 * All registers except %eax must be saved (but ptrace may violate that)
222 * Arguments are zero extended. For system calls that want sign extension and
223 * take long arguments a wrapper is needed. Most calls can just be called
224 * directly.
225 * Assumes it is only called from user space and entered with interrupts off.
226 */
227
228ENTRY(ia32_syscall)
229 CFI_STARTPROC
230 swapgs
231 sti
232 movl %eax,%eax
233 pushq %rax
234 cld
235 /* note: the registers are not zero extended into the stack
236 frame; this could be a problem. */
237 SAVE_ARGS 0,0,1
238 GET_THREAD_INFO(%r10)
239 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
240 jnz ia32_tracesys
241ia32_do_syscall:
242 cmpl $(IA32_NR_syscalls),%eax
243 jae ia32_badsys
244 IA32_ARG_FIXUP
245 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
246ia32_sysret:
247 movq %rax,RAX-ARGOFFSET(%rsp)
248 jmp int_ret_from_sys_call
249
250ia32_tracesys:
251 SAVE_REST
252 movq $-ENOSYS,RAX(%rsp) /* really needed? */
253 movq %rsp,%rdi /* &pt_regs -> arg1 */
254 call syscall_trace_enter
255 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
256 RESTORE_REST
257 jmp ia32_do_syscall
258
259ia32_badsys:
260 movq $0,ORIG_RAX-ARGOFFSET(%rsp)
261 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
262 jmp int_ret_from_sys_call
263
264ni_syscall:
265 movq %rax,%rdi
266 jmp sys32_ni_syscall
267
268quiet_ni_syscall:
269 movq $-ENOSYS,%rax
270 ret
271 CFI_ENDPROC
272
273 .macro PTREGSCALL label, func, arg
274 .globl \label
275\label:
276 leaq \func(%rip),%rax
277 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
278 jmp ia32_ptregs_common
279 .endm
280
281 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
282 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
283 PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
284 PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
285 PTREGSCALL stub32_execve, sys32_execve, %rcx
286 PTREGSCALL stub32_fork, sys_fork, %rdi
287 PTREGSCALL stub32_clone, sys32_clone, %rdx
288 PTREGSCALL stub32_vfork, sys_vfork, %rdi
289 PTREGSCALL stub32_iopl, sys_iopl, %rsi
290 PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
291
292ENTRY(ia32_ptregs_common)
293 CFI_STARTPROC
294 popq %r11
295 SAVE_REST
296 call *%rax
297 RESTORE_REST
298 jmp ia32_sysret /* misbalances the return cache */
299 CFI_ENDPROC
300
301 .data
302 .align 8
303 .globl ia32_sys_call_table
304ia32_sys_call_table:
305 .quad sys_restart_syscall
306 .quad sys_exit
307 .quad stub32_fork
308 .quad sys_read
309 .quad sys_write
310 .quad sys32_open /* 5 */
311 .quad sys_close
312 .quad sys32_waitpid
313 .quad sys_creat
314 .quad sys_link
315 .quad sys_unlink /* 10 */
316 .quad stub32_execve
317 .quad sys_chdir
318 .quad compat_sys_time
319 .quad sys_mknod
320 .quad sys_chmod /* 15 */
321 .quad sys_lchown16
322 .quad quiet_ni_syscall /* old break syscall holder */
323 .quad sys_stat
324 .quad sys32_lseek
325 .quad sys_getpid /* 20 */
326 .quad compat_sys_mount /* mount */
327 .quad sys_oldumount /* old_umount */
328 .quad sys_setuid16
329 .quad sys_getuid16
330 .quad compat_sys_stime /* stime */ /* 25 */
331 .quad sys32_ptrace /* ptrace */
332 .quad sys_alarm
333 .quad sys_fstat /* (old)fstat */
334 .quad sys_pause
335 .quad compat_sys_utime /* 30 */
336 .quad quiet_ni_syscall /* old stty syscall holder */
337 .quad quiet_ni_syscall /* old gtty syscall holder */
338 .quad sys_access
339 .quad sys_nice
340 .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */
341 .quad sys_sync
342 .quad sys32_kill
343 .quad sys_rename
344 .quad sys_mkdir
345 .quad sys_rmdir /* 40 */
346 .quad sys_dup
347 .quad sys32_pipe
348 .quad compat_sys_times
349 .quad quiet_ni_syscall /* old prof syscall holder */
350 .quad sys_brk /* 45 */
351 .quad sys_setgid16
352 .quad sys_getgid16
353 .quad sys_signal
354 .quad sys_geteuid16
355 .quad sys_getegid16 /* 50 */
356 .quad sys_acct
357 .quad sys_umount /* new_umount */
358 .quad quiet_ni_syscall /* old lock syscall holder */
359 .quad compat_sys_ioctl
360 .quad compat_sys_fcntl64 /* 55 */
361 .quad quiet_ni_syscall /* old mpx syscall holder */
362 .quad sys_setpgid
363 .quad quiet_ni_syscall /* old ulimit syscall holder */
364 .quad sys32_olduname
365 .quad sys_umask /* 60 */
366 .quad sys_chroot
367 .quad sys32_ustat
368 .quad sys_dup2
369 .quad sys_getppid
370 .quad sys_getpgrp /* 65 */
371 .quad sys_setsid
372 .quad sys32_sigaction
373 .quad sys_sgetmask
374 .quad sys_ssetmask
375 .quad sys_setreuid16 /* 70 */
376 .quad sys_setregid16
377 .quad stub32_sigsuspend
378 .quad compat_sys_sigpending
379 .quad sys_sethostname
380 .quad compat_sys_setrlimit /* 75 */
381 .quad compat_sys_old_getrlimit /* old_getrlimit */
382 .quad compat_sys_getrusage
383 .quad sys32_gettimeofday
384 .quad sys32_settimeofday
385 .quad sys_getgroups16 /* 80 */
386 .quad sys_setgroups16
387 .quad sys32_old_select
388 .quad sys_symlink
389 .quad sys_lstat
390 .quad sys_readlink /* 85 */
391#ifdef CONFIG_IA32_AOUT
392 .quad sys_uselib
393#else
394 .quad quiet_ni_syscall
395#endif
396 .quad sys_swapon
397 .quad sys_reboot
398 .quad compat_sys_old_readdir
399 .quad sys32_mmap /* 90 */
400 .quad sys_munmap
401 .quad sys_truncate
402 .quad sys_ftruncate
403 .quad sys_fchmod
404 .quad sys_fchown16 /* 95 */
405 .quad sys_getpriority
406 .quad sys_setpriority
407 .quad quiet_ni_syscall /* old profil syscall holder */
408 .quad compat_sys_statfs
409 .quad compat_sys_fstatfs /* 100 */
410 .quad sys_ioperm
411 .quad compat_sys_socketcall
412 .quad sys_syslog
413 .quad compat_sys_setitimer
414 .quad compat_sys_getitimer /* 105 */
415 .quad compat_sys_newstat
416 .quad compat_sys_newlstat
417 .quad compat_sys_newfstat
418 .quad sys32_uname
419 .quad stub32_iopl /* 110 */
420 .quad sys_vhangup
421 .quad quiet_ni_syscall /* old "idle" system call */
422 .quad sys32_vm86_warning /* vm86old */
423 .quad compat_sys_wait4
424 .quad sys_swapoff /* 115 */
425 .quad sys32_sysinfo
426 .quad sys32_ipc
427 .quad sys_fsync
428 .quad stub32_sigreturn
429 .quad stub32_clone /* 120 */
430 .quad sys_setdomainname
431 .quad sys_uname
432 .quad sys_modify_ldt
433 .quad sys32_adjtimex
434 .quad sys32_mprotect /* 125 */
435 .quad compat_sys_sigprocmask
436 .quad quiet_ni_syscall /* create_module */
437 .quad sys_init_module
438 .quad sys_delete_module
439 .quad quiet_ni_syscall /* 130 get_kernel_syms */
440 .quad sys_quotactl
441 .quad sys_getpgid
442 .quad sys_fchdir
443 .quad quiet_ni_syscall /* bdflush */
444 .quad sys_sysfs /* 135 */
445 .quad sys_personality
446 .quad quiet_ni_syscall /* for afs_syscall */
447 .quad sys_setfsuid16
448 .quad sys_setfsgid16
449 .quad sys_llseek /* 140 */
450 .quad compat_sys_getdents
451 .quad compat_sys_select
452 .quad sys_flock
453 .quad sys_msync
454 .quad compat_sys_readv /* 145 */
455 .quad compat_sys_writev
456 .quad sys_getsid
457 .quad sys_fdatasync
458 .quad sys32_sysctl /* sysctl */
459 .quad sys_mlock /* 150 */
460 .quad sys_munlock
461 .quad sys_mlockall
462 .quad sys_munlockall
463 .quad sys_sched_setparam
464 .quad sys_sched_getparam /* 155 */
465 .quad sys_sched_setscheduler
466 .quad sys_sched_getscheduler
467 .quad sys_sched_yield
468 .quad sys_sched_get_priority_max
469 .quad sys_sched_get_priority_min /* 160 */
470 .quad sys_sched_rr_get_interval
471 .quad compat_sys_nanosleep
472 .quad sys_mremap
473 .quad sys_setresuid16
474 .quad sys_getresuid16 /* 165 */
475 .quad sys32_vm86_warning /* vm86 */
476 .quad quiet_ni_syscall /* query_module */
477 .quad sys_poll
478 .quad compat_sys_nfsservctl
479 .quad sys_setresgid16 /* 170 */
480 .quad sys_getresgid16
481 .quad sys_prctl
482 .quad stub32_rt_sigreturn
483 .quad sys32_rt_sigaction
484 .quad sys32_rt_sigprocmask /* 175 */
485 .quad sys32_rt_sigpending
486 .quad compat_sys_rt_sigtimedwait
487 .quad sys32_rt_sigqueueinfo
488 .quad stub32_rt_sigsuspend
489 .quad sys32_pread /* 180 */
490 .quad sys32_pwrite
491 .quad sys_chown16
492 .quad sys_getcwd
493 .quad sys_capget
494 .quad sys_capset
495 .quad stub32_sigaltstack
496 .quad sys32_sendfile
497 .quad quiet_ni_syscall /* streams1 */
498 .quad quiet_ni_syscall /* streams2 */
499 .quad stub32_vfork /* 190 */
500 .quad compat_sys_getrlimit
501 .quad sys32_mmap2
502 .quad sys32_truncate64
503 .quad sys32_ftruncate64
504 .quad sys32_stat64 /* 195 */
505 .quad sys32_lstat64
506 .quad sys32_fstat64
507 .quad sys_lchown
508 .quad sys_getuid
509 .quad sys_getgid /* 200 */
510 .quad sys_geteuid
511 .quad sys_getegid
512 .quad sys_setreuid
513 .quad sys_setregid
514 .quad sys_getgroups /* 205 */
515 .quad sys_setgroups
516 .quad sys_fchown
517 .quad sys_setresuid
518 .quad sys_getresuid
519 .quad sys_setresgid /* 210 */
520 .quad sys_getresgid
521 .quad sys_chown
522 .quad sys_setuid
523 .quad sys_setgid
524 .quad sys_setfsuid /* 215 */
525 .quad sys_setfsgid
526 .quad sys_pivot_root
527 .quad sys_mincore
528 .quad sys_madvise
529 .quad compat_sys_getdents64 /* 220 getdents64 */
530 .quad compat_sys_fcntl64
531 .quad quiet_ni_syscall /* tux */
532 .quad quiet_ni_syscall /* security */
533 .quad sys_gettid
534 .quad sys_readahead /* 225 */
535 .quad sys_setxattr
536 .quad sys_lsetxattr
537 .quad sys_fsetxattr
538 .quad sys_getxattr
539 .quad sys_lgetxattr /* 230 */
540 .quad sys_fgetxattr
541 .quad sys_listxattr
542 .quad sys_llistxattr
543 .quad sys_flistxattr
544 .quad sys_removexattr /* 235 */
545 .quad sys_lremovexattr
546 .quad sys_fremovexattr
547 .quad sys_tkill
548 .quad sys_sendfile64
549 .quad compat_sys_futex /* 240 */
550 .quad compat_sys_sched_setaffinity
551 .quad compat_sys_sched_getaffinity
552 .quad sys32_set_thread_area
553 .quad sys32_get_thread_area
554 .quad compat_sys_io_setup /* 245 */
555 .quad sys_io_destroy
556 .quad compat_sys_io_getevents
557 .quad compat_sys_io_submit
558 .quad sys_io_cancel
559 .quad sys_fadvise64 /* 250 */
560 .quad quiet_ni_syscall /* free_huge_pages */
561 .quad sys_exit_group
562 .quad sys32_lookup_dcookie
563 .quad sys_epoll_create
564 .quad sys_epoll_ctl /* 255 */
565 .quad sys_epoll_wait
566 .quad sys_remap_file_pages
567 .quad sys_set_tid_address
568 .quad sys32_timer_create
569 .quad compat_sys_timer_settime /* 260 */
570 .quad compat_sys_timer_gettime
571 .quad sys_timer_getoverrun
572 .quad sys_timer_delete
573 .quad compat_sys_clock_settime
574 .quad compat_sys_clock_gettime /* 265 */
575 .quad compat_sys_clock_getres
576 .quad compat_sys_clock_nanosleep
577 .quad compat_sys_statfs64
578 .quad compat_sys_fstatfs64
579 .quad sys_tgkill /* 270 */
580 .quad compat_sys_utimes
581 .quad sys32_fadvise64_64
582 .quad quiet_ni_syscall /* sys_vserver */
583 .quad sys_mbind
584 .quad compat_sys_get_mempolicy /* 275 */
585 .quad sys_set_mempolicy
586 .quad compat_sys_mq_open
587 .quad sys_mq_unlink
588 .quad compat_sys_mq_timedsend
589 .quad compat_sys_mq_timedreceive /* 280 */
590 .quad compat_sys_mq_notify
591 .quad compat_sys_mq_getsetattr
592 .quad quiet_ni_syscall /* reserved for kexec */
593 .quad compat_sys_waitid
594 .quad quiet_ni_syscall /* sys_altroot */
595 .quad sys_add_key
596 .quad sys_request_key
597 .quad sys_keyctl
598 /* don't forget to change IA32_NR_syscalls */
599ia32_syscall_end:
600 .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
601 .quad ni_syscall
602 .endr
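
The .rept arithmetic above pads the table out to IA32_NR_syscalls entries: (ia32_syscall_end - ia32_sys_call_table)/8 counts the .quad slots already emitted, and every remaining slot gets the not-implemented stub, so an out-of-range 32-bit syscall number still lands on a valid handler. A minimal C analogue of that padding, as a sketch only (the table size and entry names here are stand-ins, not kernel symbols):

#include <stdio.h>

#define IA32_NR_SYSCALLS 294		/* stand-in, not the real constant */

typedef long (*syscall_fn)(void);

static long ni_syscall(void)
{
	return -38;			/* -ENOSYS */
}

static long some_real_entry(void)
{
	return 0;
}

int main(void)
{
	syscall_fn table[IA32_NR_SYSCALLS];
	int populated = 1, i;	/* stands in for the entries spelled out above */

	table[0] = some_real_entry;
	/* the .rept equivalent: fill every slot not explicitly emitted */
	for (i = populated; i < IA32_NR_SYSCALLS; i++)
		table[i] = ni_syscall;
	printf("padded %d of %d slots\n",
	       IA32_NR_SYSCALLS - populated, IA32_NR_SYSCALLS);
	return 0;
}
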
diff --git a/arch/x86_64/ia32/ipc32.c b/arch/x86_64/ia32/ipc32.c
new file mode 100644
index 000000000000..369151dc3213
--- /dev/null
+++ b/arch/x86_64/ia32/ipc32.c
@@ -0,0 +1,57 @@
1#include <linux/kernel.h>
2#include <linux/spinlock.h>
3#include <linux/list.h>
4#include <linux/syscalls.h>
5#include <linux/time.h>
6#include <linux/sem.h>
7#include <linux/msg.h>
8#include <linux/shm.h>
9#include <linux/ipc.h>
10#include <linux/compat.h>
11
12#include <asm-i386/ipc.h>
13
14asmlinkage long
15sys32_ipc(u32 call, int first, int second, int third,
16 compat_uptr_t ptr, u32 fifth)
17{
18 int version;
19
20 version = call >> 16; /* hack for backward compatibility */
21 call &= 0xffff;
22
23 switch (call) {
24 case SEMOP:
25 /* struct sembuf is the same on 32 and 64bit :)) */
26 return sys_semtimedop(first, compat_ptr(ptr), second, NULL);
27 case SEMTIMEDOP:
28 return compat_sys_semtimedop(first, compat_ptr(ptr), second,
29 compat_ptr(fifth));
30 case SEMGET:
31 return sys_semget(first, second, third);
32 case SEMCTL:
33 return compat_sys_semctl(first, second, third, compat_ptr(ptr));
34
35 case MSGSND:
36 return compat_sys_msgsnd(first, second, third, compat_ptr(ptr));
37 case MSGRCV:
38 return compat_sys_msgrcv(first, second, fifth, third,
39 version, compat_ptr(ptr));
40 case MSGGET:
41 return sys_msgget((key_t) first, second);
42 case MSGCTL:
43 return compat_sys_msgctl(first, second, compat_ptr(ptr));
44
45 case SHMAT:
46 return compat_sys_shmat(first, second, third, version,
47 compat_ptr(ptr));
48 break;
49 case SHMDT:
50 return sys_shmdt(compat_ptr(ptr));
51 case SHMGET:
52 return sys_shmget(first, (unsigned)second, third);
53 case SHMCTL:
54 return compat_sys_shmctl(first, second, compat_ptr(ptr));
55 }
56 return -ENOSYS;
57}
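
sys32_ipc() above is the demultiplexer behind syscall 117 in the table earlier in this diff: old i386 libcs funnel every SysV IPC primitive through that single entry point, with the operation in the low 16 bits of call and an ABI version packed into the high 16 bits. A hedged user-space sketch of reaching SEMGET along this path (build with -m32; the SEMGET value is the kernel-internal constant and is assumed here):

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/ipc.h>

#ifndef SEMGET
#define SEMGET 2	/* op code from linux/ipc.h, assumed */
#endif

int main(void)
{
	/* ipc(call, first, second, third, ptr, fifth): SEMGET maps to
	 * sys_semget(first = key, second = nsems, third = semflg). */
	long id = syscall(117 /* __NR_ipc on ia32 */, SEMGET,
			  IPC_PRIVATE, 1, 0600, (void *)0);

	printf("semget via ipc(2): %ld\n", id);
	return 0;
}
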
diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c
new file mode 100644
index 000000000000..b98b6d2462f6
--- /dev/null
+++ b/arch/x86_64/ia32/ptrace32.c
@@ -0,0 +1,379 @@
1/*
2 * 32bit ptrace for x86-64.
3 *
4 * Copyright 2001,2002 Andi Kleen, SuSE Labs.
5 * Some parts copied from arch/i386/kernel/ptrace.c. See that file for earlier
6 * copyright.
7 *
8 * This allows access to 64bit processes too, but there is no way to see the
9 * extended register contents.
10 *
11 * $Id: ptrace32.c,v 1.16 2003/03/14 16:06:35 ak Exp $
12 */
13
14#include <linux/kernel.h>
15#include <linux/stddef.h>
16#include <linux/sched.h>
17#include <linux/syscalls.h>
18#include <linux/unistd.h>
19#include <linux/mm.h>
20#include <linux/ptrace.h>
21#include <asm/ptrace.h>
22#include <asm/compat.h>
23#include <asm/uaccess.h>
24#include <asm/user32.h>
25#include <asm/user.h>
26#include <asm/errno.h>
27#include <asm/debugreg.h>
28#include <asm/i387.h>
29#include <asm/fpu32.h>
30
31/* determines which flags the user has access to. */
32/* 1 = access 0 = no access */
33#define FLAG_MASK 0x44dd5UL
34
35#define R32(l,q) \
36 case offsetof(struct user32, regs.l): stack[offsetof(struct pt_regs, q)/8] = val; break
37
38static int putreg32(struct task_struct *child, unsigned regno, u32 val)
39{
40 int i;
41 __u64 *stack = (__u64 *)(child->thread.rsp0 - sizeof(struct pt_regs));
42
43 switch (regno) {
44 case offsetof(struct user32, regs.fs):
45 if (val && (val & 3) != 3) return -EIO;
46 child->thread.fs = val & 0xffff;
47 break;
48 case offsetof(struct user32, regs.gs):
49 if (val && (val & 3) != 3) return -EIO;
50 child->thread.gs = val & 0xffff;
51 break;
52 case offsetof(struct user32, regs.ds):
53 if (val && (val & 3) != 3) return -EIO;
54 child->thread.ds = val & 0xffff;
55 break;
56 case offsetof(struct user32, regs.es):
57 child->thread.es = val & 0xffff;
58 break;
59 case offsetof(struct user32, regs.ss):
60 if ((val & 3) != 3) return -EIO;
61 stack[offsetof(struct pt_regs, ss)/8] = val & 0xffff;
62 break;
63 case offsetof(struct user32, regs.cs):
64 if ((val & 3) != 3) return -EIO;
65 stack[offsetof(struct pt_regs, cs)/8] = val & 0xffff;
66 break;
67
68 R32(ebx, rbx);
69 R32(ecx, rcx);
70 R32(edx, rdx);
71 R32(edi, rdi);
72 R32(esi, rsi);
73 R32(ebp, rbp);
74 R32(eax, rax);
75 R32(orig_eax, orig_rax);
76 R32(eip, rip);
77 R32(esp, rsp);
78
79 case offsetof(struct user32, regs.eflags): {
80 __u64 *flags = &stack[offsetof(struct pt_regs, eflags)/8];
81 val &= FLAG_MASK;
82 *flags = val | (*flags & ~FLAG_MASK);
83 break;
84 }
85
86 case offsetof(struct user32, u_debugreg[4]):
87 case offsetof(struct user32, u_debugreg[5]):
88 return -EIO;
89
90 case offsetof(struct user32, u_debugreg[0]):
91 child->thread.debugreg0 = val;
92 break;
93
94 case offsetof(struct user32, u_debugreg[1]):
95 child->thread.debugreg1 = val;
96 break;
97
98 case offsetof(struct user32, u_debugreg[2]):
99 child->thread.debugreg2 = val;
100 break;
101
102 case offsetof(struct user32, u_debugreg[3]):
103 child->thread.debugreg3 = val;
104 break;
105
106 case offsetof(struct user32, u_debugreg[6]):
107 child->thread.debugreg6 = val;
108 break;
109
110 case offsetof(struct user32, u_debugreg[7]):
111 val &= ~DR_CONTROL_RESERVED;
112 /* See arch/i386/kernel/ptrace.c for an explanation of
113		 * this awkward check. */
114		for (i = 0; i < 4; i++)
115 if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1)
116 return -EIO;
117 child->thread.debugreg7 = val;
118 break;
119
120 default:
121 if (regno > sizeof(struct user32) || (regno & 3))
122 return -EIO;
123
124 /* Other dummy fields in the virtual user structure are ignored */
125 break;
126 }
127 return 0;
128}
129
130#undef R32
131
132#define R32(l,q) \
133 case offsetof(struct user32, regs.l): *val = stack[offsetof(struct pt_regs, q)/8]; break
134
135static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
136{
137 __u64 *stack = (__u64 *)(child->thread.rsp0 - sizeof(struct pt_regs));
138
139 switch (regno) {
140 case offsetof(struct user32, regs.fs):
141 *val = child->thread.fs;
142 break;
143 case offsetof(struct user32, regs.gs):
144 *val = child->thread.gs;
145 break;
146 case offsetof(struct user32, regs.ds):
147 *val = child->thread.ds;
148 break;
149 case offsetof(struct user32, regs.es):
150 *val = child->thread.es;
151 break;
152
153 R32(cs, cs);
154 R32(ss, ss);
155 R32(ebx, rbx);
156 R32(ecx, rcx);
157 R32(edx, rdx);
158 R32(edi, rdi);
159 R32(esi, rsi);
160 R32(ebp, rbp);
161 R32(eax, rax);
162 R32(orig_eax, orig_rax);
163 R32(eip, rip);
164 R32(eflags, eflags);
165 R32(esp, rsp);
166
167 case offsetof(struct user32, u_debugreg[0]):
168 *val = child->thread.debugreg0;
169 break;
170 case offsetof(struct user32, u_debugreg[1]):
171 *val = child->thread.debugreg1;
172 break;
173 case offsetof(struct user32, u_debugreg[2]):
174 *val = child->thread.debugreg2;
175 break;
176 case offsetof(struct user32, u_debugreg[3]):
177 *val = child->thread.debugreg3;
178 break;
179 case offsetof(struct user32, u_debugreg[6]):
180 *val = child->thread.debugreg6;
181 break;
182 case offsetof(struct user32, u_debugreg[7]):
183 *val = child->thread.debugreg7;
184 break;
185
186 default:
187 if (regno > sizeof(struct user32) || (regno & 3))
188 return -EIO;
189
190 /* Other dummy fields in the virtual user structure are ignored */
191 *val = 0;
192 break;
193 }
194 return 0;
195}
196
197#undef R32
198
199static struct task_struct *find_target(int request, int pid, int *err)
200{
201 struct task_struct *child;
202
203 *err = -EPERM;
204 if (pid == 1)
205 return NULL;
206
207 *err = -ESRCH;
208 read_lock(&tasklist_lock);
209 child = find_task_by_pid(pid);
210 if (child)
211 get_task_struct(child);
212 read_unlock(&tasklist_lock);
213 if (child) {
214 *err = -EPERM;
215 if (child->pid == 1)
216 goto out;
217 *err = ptrace_check_attach(child, request == PTRACE_KILL);
218 if (*err < 0)
219 goto out;
220 return child;
221 }
222 out:
223 if (child)
224 put_task_struct(child);
225 return NULL;
226
227}
228
229asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
230{
231 struct task_struct *child;
232 struct pt_regs *childregs;
233 void __user *datap = compat_ptr(data);
234 int ret;
235 __u32 val;
236
237 switch (request) {
238 default:
239 return sys_ptrace(request, pid, addr, data);
240
241 case PTRACE_PEEKTEXT:
242 case PTRACE_PEEKDATA:
243 case PTRACE_POKEDATA:
244 case PTRACE_POKETEXT:
245 case PTRACE_POKEUSR:
246 case PTRACE_PEEKUSR:
247 case PTRACE_GETREGS:
248 case PTRACE_SETREGS:
249 case PTRACE_SETFPREGS:
250 case PTRACE_GETFPREGS:
251 case PTRACE_SETFPXREGS:
252 case PTRACE_GETFPXREGS:
253 case PTRACE_GETEVENTMSG:
254 break;
255 }
256
257 child = find_target(request, pid, &ret);
258 if (!child)
259 return ret;
260
261 childregs = (struct pt_regs *)(child->thread.rsp0 - sizeof(struct pt_regs));
262
263 switch (request) {
264 case PTRACE_PEEKDATA:
265 case PTRACE_PEEKTEXT:
266 ret = 0;
267 if (access_process_vm(child, addr, &val, sizeof(u32), 0)!=sizeof(u32))
268 ret = -EIO;
269 else
270 ret = put_user(val, (unsigned int __user *)datap);
271 break;
272
273 case PTRACE_POKEDATA:
274 case PTRACE_POKETEXT:
275 ret = 0;
276 if (access_process_vm(child, addr, &data, sizeof(u32), 1)!=sizeof(u32))
277 ret = -EIO;
278 break;
279
280 case PTRACE_PEEKUSR:
281 ret = getreg32(child, addr, &val);
282 if (ret == 0)
283 ret = put_user(val, (__u32 __user *)datap);
284 break;
285
286 case PTRACE_POKEUSR:
287 ret = putreg32(child, addr, data);
288 break;
289
290 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
291 int i;
292 if (!access_ok(VERIFY_WRITE, datap, 16*4)) {
293 ret = -EIO;
294 break;
295 }
296 ret = 0;
297		for (i = 0; i <= 16*4; i += sizeof(__u32)) {
298 getreg32(child, i, &val);
299 ret |= __put_user(val,(u32 __user *)datap);
300 datap += sizeof(u32);
301 }
302 break;
303 }
304
305 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
306 unsigned long tmp;
307 int i;
308 if (!access_ok(VERIFY_READ, datap, 16*4)) {
309 ret = -EIO;
310 break;
311 }
312 ret = 0;
313		for (i = 0; i <= 16*4; i += sizeof(u32)) {
314 ret |= __get_user(tmp, (u32 __user *)datap);
315 putreg32(child, i, tmp);
316 datap += sizeof(u32);
317 }
318 break;
319 }
320
321 case PTRACE_GETFPREGS:
322 ret = -EIO;
323		if (!access_ok(VERIFY_WRITE, compat_ptr(data),
324 sizeof(struct user_i387_struct)))
325 break;
326 save_i387_ia32(child, datap, childregs, 1);
327 ret = 0;
328 break;
329
330 case PTRACE_SETFPREGS:
331 ret = -EIO;
332		if (!access_ok(VERIFY_READ, datap,
333 sizeof(struct user_i387_struct)))
334 break;
335 ret = 0;
336 /* don't check EFAULT to be bug-to-bug compatible to i386 */
337 restore_i387_ia32(child, datap, 1);
338 break;
339
340 case PTRACE_GETFPXREGS: {
341 struct user32_fxsr_struct __user *u = datap;
342 init_fpu(child);
343 ret = -EIO;
344 if (!access_ok(VERIFY_WRITE, u, sizeof(*u)))
345 break;
346 ret = -EFAULT;
347 if (__copy_to_user(u, &child->thread.i387.fxsave, sizeof(*u)))
348 break;
349 ret = __put_user(childregs->cs, &u->fcs);
350 ret |= __put_user(child->thread.ds, &u->fos);
351 break;
352 }
353 case PTRACE_SETFPXREGS: {
354 struct user32_fxsr_struct __user *u = datap;
355 unlazy_fpu(child);
356 ret = -EIO;
357 if (!access_ok(VERIFY_READ, u, sizeof(*u)))
358 break;
359 /* no checking to be bug-to-bug compatible with i386 */
360 __copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u));
361 set_stopped_child_used_math(child);
362 child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
363 ret = 0;
364 break;
365 }
366
367 case PTRACE_GETEVENTMSG:
368 ret = put_user(child->ptrace_message,(unsigned int __user *)compat_ptr(data));
369 break;
370
371 default:
372 ret = -EINVAL;
373 break;
374 }
375
376 put_task_struct(child);
377 return ret;
378}
379
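
putreg32() and getreg32() above key their switch on offsetof(struct user32, ...) and redirect each 32-bit register slot into the matching quadword of the 64-bit pt_regs frame saved at the top of the child's kernel stack, with the R32() macro stamping out one case per straightforwardly-renamed register. A self-contained sketch of that offsetof-keyed dispatch; the two structs are toy stand-ins, not the real user32/pt_regs layouts:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct regs32 { uint32_t ebx, ecx, edx; };	/* 32-bit view */
struct regs64 { uint64_t rbx, rcx, rdx; };	/* 64-bit stack frame */

#define R32(l, q) \
	case offsetof(struct regs32, l): \
		frame[offsetof(struct regs64, q) / 8] = val; break

static int putreg32(uint64_t *frame, unsigned int regno, uint32_t val)
{
	switch (regno) {
	R32(ebx, rbx);
	R32(ecx, rcx);
	R32(edx, rdx);
	default:
		return -1;	/* unknown or misaligned offset */
	}
	return 0;
}

int main(void)
{
	uint64_t frame[3] = { 0 };

	putreg32(frame, offsetof(struct regs32, ecx), 0xdeadbeef);
	printf("rcx slot = %#llx\n", (unsigned long long)frame[1]);
	return 0;
}
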
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
new file mode 100644
index 000000000000..68a9ab06ee7c
--- /dev/null
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -0,0 +1,1050 @@
1/*
2 * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on
3 * sys_sparc32
4 *
5 * Copyright (C) 2000 VA Linux Co
6 * Copyright (C) 2000 Don Dugger <n0ano@valinux.com>
7 * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com>
8 * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
9 * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
10 * Copyright (C) 2000 Hewlett-Packard Co.
11 * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
12 * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port)
13 *
14 * These routines maintain argument size conversion between 32bit and 64bit
15 * environment. In 2.5 most of this should be moved to a generic directory.
16 *
17 * This file assumes that there is a hole at the end of user address space.
18 *
19 * Some of the functions are LE specific currently. These are hopefully all marked.
20 * This should be fixed.
21 */
22
23#include <linux/config.h>
24#include <linux/kernel.h>
25#include <linux/sched.h>
26#include <linux/fs.h>
27#include <linux/file.h>
28#include <linux/signal.h>
29#include <linux/syscalls.h>
30#include <linux/resource.h>
31#include <linux/times.h>
32#include <linux/utsname.h>
33#include <linux/timex.h>
34#include <linux/smp.h>
35#include <linux/smp_lock.h>
36#include <linux/sem.h>
37#include <linux/msg.h>
38#include <linux/mm.h>
39#include <linux/shm.h>
40#include <linux/slab.h>
41#include <linux/uio.h>
42#include <linux/nfs_fs.h>
43#include <linux/quota.h>
44#include <linux/module.h>
45#include <linux/sunrpc/svc.h>
46#include <linux/nfsd/nfsd.h>
47#include <linux/nfsd/cache.h>
48#include <linux/nfsd/xdr.h>
49#include <linux/nfsd/syscall.h>
50#include <linux/poll.h>
51#include <linux/personality.h>
52#include <linux/stat.h>
53#include <linux/ipc.h>
54#include <linux/rwsem.h>
55#include <linux/binfmts.h>
56#include <linux/init.h>
57#include <linux/aio_abi.h>
58#include <linux/aio.h>
59#include <linux/compat.h>
60#include <linux/vfs.h>
61#include <linux/ptrace.h>
62#include <linux/highuid.h>
63#include <linux/vmalloc.h>
64#include <asm/mman.h>
65#include <asm/types.h>
66#include <asm/uaccess.h>
67#include <asm/semaphore.h>
68#include <asm/atomic.h>
69#include <asm/ldt.h>
70
71#include <net/scm.h>
72#include <net/sock.h>
73#include <asm/ia32.h>
74
75#define AA(__x) ((unsigned long)(__x))
76
77int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf)
78{
79 typeof(ubuf->st_uid) uid = 0;
80 typeof(ubuf->st_gid) gid = 0;
81 SET_UID(uid, kbuf->uid);
82 SET_GID(gid, kbuf->gid);
83 if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev))
84 return -EOVERFLOW;
85 if (kbuf->size >= 0x7fffffff)
86 return -EOVERFLOW;
87 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) ||
88 __put_user (old_encode_dev(kbuf->dev), &ubuf->st_dev) ||
89 __put_user (kbuf->ino, &ubuf->st_ino) ||
90 __put_user (kbuf->mode, &ubuf->st_mode) ||
91 __put_user (kbuf->nlink, &ubuf->st_nlink) ||
92 __put_user (uid, &ubuf->st_uid) ||
93 __put_user (gid, &ubuf->st_gid) ||
94 __put_user (old_encode_dev(kbuf->rdev), &ubuf->st_rdev) ||
95 __put_user (kbuf->size, &ubuf->st_size) ||
96 __put_user (kbuf->atime.tv_sec, &ubuf->st_atime) ||
97 __put_user (kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) ||
98 __put_user (kbuf->mtime.tv_sec, &ubuf->st_mtime) ||
99 __put_user (kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
100 __put_user (kbuf->ctime.tv_sec, &ubuf->st_ctime) ||
101 __put_user (kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
102 __put_user (kbuf->blksize, &ubuf->st_blksize) ||
103 __put_user (kbuf->blocks, &ubuf->st_blocks))
104 return -EFAULT;
105 return 0;
106}
107
108asmlinkage long
109sys32_truncate64(char __user * filename, unsigned long offset_low, unsigned long offset_high)
110{
111 return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low);
112}
113
114asmlinkage long
115sys32_ftruncate64(unsigned int fd, unsigned long offset_low, unsigned long offset_high)
116{
117 return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low);
118}
119
120/* Another set for IA32/LFS -- x86_64 struct stat is different due to
121 support for 64bit inode numbers. */
122
123static int
124cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
125{
126 typeof(ubuf->st_uid) uid = 0;
127 typeof(ubuf->st_gid) gid = 0;
128 SET_UID(uid, stat->uid);
129 SET_GID(gid, stat->gid);
130 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) ||
131 __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) ||
132 __put_user (stat->ino, &ubuf->__st_ino) ||
133 __put_user (stat->ino, &ubuf->st_ino) ||
134 __put_user (stat->mode, &ubuf->st_mode) ||
135 __put_user (stat->nlink, &ubuf->st_nlink) ||
136 __put_user (uid, &ubuf->st_uid) ||
137 __put_user (gid, &ubuf->st_gid) ||
138 __put_user (huge_encode_dev(stat->rdev), &ubuf->st_rdev) ||
139 __put_user (stat->size, &ubuf->st_size) ||
140 __put_user (stat->atime.tv_sec, &ubuf->st_atime) ||
141 __put_user (stat->atime.tv_nsec, &ubuf->st_atime_nsec) ||
142 __put_user (stat->mtime.tv_sec, &ubuf->st_mtime) ||
143 __put_user (stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
144 __put_user (stat->ctime.tv_sec, &ubuf->st_ctime) ||
145 __put_user (stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
146 __put_user (stat->blksize, &ubuf->st_blksize) ||
147 __put_user (stat->blocks, &ubuf->st_blocks))
148 return -EFAULT;
149 return 0;
150}
151
152asmlinkage long
153sys32_stat64(char __user * filename, struct stat64 __user *statbuf)
154{
155 struct kstat stat;
156 int ret = vfs_stat(filename, &stat);
157 if (!ret)
158 ret = cp_stat64(statbuf, &stat);
159 return ret;
160}
161
162asmlinkage long
163sys32_lstat64(char __user * filename, struct stat64 __user *statbuf)
164{
165 struct kstat stat;
166 int ret = vfs_lstat(filename, &stat);
167 if (!ret)
168 ret = cp_stat64(statbuf, &stat);
169 return ret;
170}
171
172asmlinkage long
173sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
174{
175 struct kstat stat;
176 int ret = vfs_fstat(fd, &stat);
177 if (!ret)
178 ret = cp_stat64(statbuf, &stat);
179 return ret;
180}
181
182/*
183 * Linux/i386 originally could not handle more than 4 system call
184 * parameters, so these system calls use a memory block for parameter
185 * passing.
186 */
187
188struct mmap_arg_struct {
189 unsigned int addr;
190 unsigned int len;
191 unsigned int prot;
192 unsigned int flags;
193 unsigned int fd;
194 unsigned int offset;
195};
196
197asmlinkage long
198sys32_mmap(struct mmap_arg_struct __user *arg)
199{
200 struct mmap_arg_struct a;
201 struct file *file = NULL;
202 unsigned long retval;
203	struct mm_struct *mm;
204
205 if (copy_from_user(&a, arg, sizeof(a)))
206 return -EFAULT;
207
208 if (a.offset & ~PAGE_MASK)
209 return -EINVAL;
210
211 if (!(a.flags & MAP_ANONYMOUS)) {
212 file = fget(a.fd);
213 if (!file)
214 return -EBADF;
215 }
216
217 mm = current->mm;
218 down_write(&mm->mmap_sem);
219 retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, a.offset>>PAGE_SHIFT);
220 if (file)
221 fput(file);
222
223 up_write(&mm->mmap_sem);
224
225 return retval;
226}
227
228asmlinkage long
229sys32_mprotect(unsigned long start, size_t len, unsigned long prot)
230{
231	return sys_mprotect(start, len, prot);
232}
233
234asmlinkage long
235sys32_pipe(int __user *fd)
236{
237 int retval;
238 int fds[2];
239
240 retval = do_pipe(fds);
241 if (retval)
242 goto out;
243 if (copy_to_user(fd, fds, sizeof(fds)))
244 retval = -EFAULT;
245 out:
246 return retval;
247}
248
249asmlinkage long
250sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
251 struct sigaction32 __user *oact, unsigned int sigsetsize)
252{
253 struct k_sigaction new_ka, old_ka;
254 int ret;
255 compat_sigset_t set32;
256
257 /* XXX: Don't preclude handling different sized sigset_t's. */
258 if (sigsetsize != sizeof(compat_sigset_t))
259 return -EINVAL;
260
261 if (act) {
262 compat_uptr_t handler, restorer;
263
264 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
265 __get_user(handler, &act->sa_handler) ||
266 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
267 __get_user(restorer, &act->sa_restorer)||
268 __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t)))
269 return -EFAULT;
270 new_ka.sa.sa_handler = compat_ptr(handler);
271 new_ka.sa.sa_restorer = compat_ptr(restorer);
272		/* FIXME: here we rely on _COMPAT_NSIG_WORDS being >= _NSIG_WORDS << 1 */
273 switch (_NSIG_WORDS) {
274 case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6]
275 | (((long)set32.sig[7]) << 32);
276 case 3: new_ka.sa.sa_mask.sig[2] = set32.sig[4]
277 | (((long)set32.sig[5]) << 32);
278 case 2: new_ka.sa.sa_mask.sig[1] = set32.sig[2]
279 | (((long)set32.sig[3]) << 32);
280 case 1: new_ka.sa.sa_mask.sig[0] = set32.sig[0]
281 | (((long)set32.sig[1]) << 32);
282 }
283 }
284
285 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
286
287 if (!ret && oact) {
288		/* FIXME: here we rely on _COMPAT_NSIG_WORDS being >= _NSIG_WORDS << 1 */
289 switch (_NSIG_WORDS) {
290 case 4:
291 set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32);
292 set32.sig[6] = old_ka.sa.sa_mask.sig[3];
293 case 3:
294 set32.sig[5] = (old_ka.sa.sa_mask.sig[2] >> 32);
295 set32.sig[4] = old_ka.sa.sa_mask.sig[2];
296 case 2:
297 set32.sig[3] = (old_ka.sa.sa_mask.sig[1] >> 32);
298 set32.sig[2] = old_ka.sa.sa_mask.sig[1];
299 case 1:
300 set32.sig[1] = (old_ka.sa.sa_mask.sig[0] >> 32);
301 set32.sig[0] = old_ka.sa.sa_mask.sig[0];
302 }
303 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
304 __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) ||
305 __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) ||
306 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
307 __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t)))
308 return -EFAULT;
309 }
310
311 return ret;
312}
313
314asmlinkage long
315sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigaction32 __user *oact)
316{
317 struct k_sigaction new_ka, old_ka;
318 int ret;
319
320 if (act) {
321 compat_old_sigset_t mask;
322 compat_uptr_t handler, restorer;
323
324 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
325 __get_user(handler, &act->sa_handler) ||
326 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
327 __get_user(restorer, &act->sa_restorer) ||
328 __get_user(mask, &act->sa_mask))
329 return -EFAULT;
330
331 new_ka.sa.sa_handler = compat_ptr(handler);
332 new_ka.sa.sa_restorer = compat_ptr(restorer);
333
334 siginitset(&new_ka.sa.sa_mask, mask);
335 }
336
337 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
338
339 if (!ret && oact) {
340 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
341 __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) ||
342 __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) ||
343 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
344 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
345 return -EFAULT;
346 }
347
348 return ret;
349}
350
351asmlinkage long
352sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
353 compat_sigset_t __user *oset, unsigned int sigsetsize)
354{
355 sigset_t s;
356 compat_sigset_t s32;
357 int ret;
358 mm_segment_t old_fs = get_fs();
359
360 if (set) {
361 if (copy_from_user (&s32, set, sizeof(compat_sigset_t)))
362 return -EFAULT;
363 switch (_NSIG_WORDS) {
364 case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32);
365 case 3: s.sig[2] = s32.sig[4] | (((long)s32.sig[5]) << 32);
366 case 2: s.sig[1] = s32.sig[2] | (((long)s32.sig[3]) << 32);
367 case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32);
368 }
369 }
370 set_fs (KERNEL_DS);
371 ret = sys_rt_sigprocmask(how, set ? &s : NULL, oset ? &s : NULL,
372 sigsetsize);
373 set_fs (old_fs);
374 if (ret) return ret;
375 if (oset) {
376 switch (_NSIG_WORDS) {
377 case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
378 case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2];
379 case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
380 case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
381 }
382 if (copy_to_user (oset, &s32, sizeof(compat_sigset_t)))
383 return -EFAULT;
384 }
385 return 0;
386}
387
388static inline long
389get_tv32(struct timeval *o, struct compat_timeval __user *i)
390{
391 int err = -EFAULT;
392 if (access_ok(VERIFY_READ, i, sizeof(*i))) {
393 err = __get_user(o->tv_sec, &i->tv_sec);
394 err |= __get_user(o->tv_usec, &i->tv_usec);
395 }
396 return err;
397}
398
399static inline long
400put_tv32(struct compat_timeval __user *o, struct timeval *i)
401{
402 int err = -EFAULT;
403 if (access_ok(VERIFY_WRITE, o, sizeof(*o))) {
404 err = __put_user(i->tv_sec, &o->tv_sec);
405 err |= __put_user(i->tv_usec, &o->tv_usec);
406 }
407 return err;
408}
409
410extern int do_setitimer(int which, struct itimerval *, struct itimerval *);
411
412asmlinkage long
413sys32_alarm(unsigned int seconds)
414{
415 struct itimerval it_new, it_old;
416 unsigned int oldalarm;
417
418 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
419 it_new.it_value.tv_sec = seconds;
420 it_new.it_value.tv_usec = 0;
421 do_setitimer(ITIMER_REAL, &it_new, &it_old);
422 oldalarm = it_old.it_value.tv_sec;
423 /* ehhh.. We can't return 0 if we have an alarm pending.. */
424 /* And we'd better return too much than too little anyway */
425 if (it_old.it_value.tv_usec)
426 oldalarm++;
427 return oldalarm;
428}
429
430/* Translations due to time_t size differences, which affect all
431   sorts of things, like timeval and itimerval. */
432
433extern struct timezone sys_tz;
434
435asmlinkage long
436sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
437{
438 if (tv) {
439 struct timeval ktv;
440 do_gettimeofday(&ktv);
441 if (put_tv32(tv, &ktv))
442 return -EFAULT;
443 }
444 if (tz) {
445 if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
446 return -EFAULT;
447 }
448 return 0;
449}
450
451asmlinkage long
452sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
453{
454 struct timeval ktv;
455 struct timespec kts;
456 struct timezone ktz;
457
458 if (tv) {
459 if (get_tv32(&ktv, tv))
460 return -EFAULT;
461 kts.tv_sec = ktv.tv_sec;
462 kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC;
463 }
464 if (tz) {
465 if (copy_from_user(&ktz, tz, sizeof(ktz)))
466 return -EFAULT;
467 }
468
469 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
470}
471
472struct sel_arg_struct {
473 unsigned int n;
474 unsigned int inp;
475 unsigned int outp;
476 unsigned int exp;
477 unsigned int tvp;
478};
479
480asmlinkage long
481sys32_old_select(struct sel_arg_struct __user *arg)
482{
483 struct sel_arg_struct a;
484
485 if (copy_from_user(&a, arg, sizeof(a)))
486 return -EFAULT;
487 return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
488 compat_ptr(a.exp), compat_ptr(a.tvp));
489}
490
491extern asmlinkage long
492compat_sys_wait4(compat_pid_t pid, compat_uint_t * stat_addr, int options,
493 struct compat_rusage *ru);
494
495asmlinkage long
496sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options)
497{
498 return compat_sys_wait4(pid, stat_addr, options, NULL);
499}
500
501int sys32_ni_syscall(int call)
502{
503 struct task_struct *me = current;
504 static char lastcomm[sizeof(me->comm)];
505
506 if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
507 printk(KERN_INFO "IA32 syscall %d from %s not implemented\n",
508 call, me->comm);
509 strncpy(lastcomm, me->comm, sizeof(lastcomm));
510 }
511 return -ENOSYS;
512}
513
514/* 32-bit timeval and related flotsam. */
515
516asmlinkage long
517sys32_sysfs(int option, u32 arg1, u32 arg2)
518{
519 return sys_sysfs(option, arg1, arg2);
520}
521
522struct sysinfo32 {
523 s32 uptime;
524 u32 loads[3];
525 u32 totalram;
526 u32 freeram;
527 u32 sharedram;
528 u32 bufferram;
529 u32 totalswap;
530 u32 freeswap;
531 unsigned short procs;
532 unsigned short pad;
533 u32 totalhigh;
534 u32 freehigh;
535 u32 mem_unit;
536 char _f[20-2*sizeof(u32)-sizeof(int)];
537};
538
539asmlinkage long
540sys32_sysinfo(struct sysinfo32 __user *info)
541{
542 struct sysinfo s;
543 int ret;
544 mm_segment_t old_fs = get_fs ();
545 int bitcount = 0;
546
547 set_fs (KERNEL_DS);
548 ret = sys_sysinfo(&s);
549 set_fs (old_fs);
550
551 /* Check to see if any memory value is too large for 32-bit and scale
552 * down if needed
553 */
554 if ((s.totalram >> 32) || (s.totalswap >> 32)) {
555 while (s.mem_unit < PAGE_SIZE) {
556 s.mem_unit <<= 1;
557 bitcount++;
558 }
559 s.totalram >>= bitcount;
560 s.freeram >>= bitcount;
561 s.sharedram >>= bitcount;
562 s.bufferram >>= bitcount;
563 s.totalswap >>= bitcount;
564 s.freeswap >>= bitcount;
565 s.totalhigh >>= bitcount;
566 s.freehigh >>= bitcount;
567 }
568
569 if (!access_ok(VERIFY_WRITE, info, sizeof(struct sysinfo32)) ||
570 __put_user (s.uptime, &info->uptime) ||
571 __put_user (s.loads[0], &info->loads[0]) ||
572 __put_user (s.loads[1], &info->loads[1]) ||
573 __put_user (s.loads[2], &info->loads[2]) ||
574 __put_user (s.totalram, &info->totalram) ||
575 __put_user (s.freeram, &info->freeram) ||
576 __put_user (s.sharedram, &info->sharedram) ||
577 __put_user (s.bufferram, &info->bufferram) ||
578 __put_user (s.totalswap, &info->totalswap) ||
579 __put_user (s.freeswap, &info->freeswap) ||
580 __put_user (s.procs, &info->procs) ||
581 __put_user (s.totalhigh, &info->totalhigh) ||
582 __put_user (s.freehigh, &info->freehigh) ||
583 __put_user (s.mem_unit, &info->mem_unit))
584 return -EFAULT;
585 return 0;
586}
587
588asmlinkage long
589sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval)
590{
591 struct timespec t;
592 int ret;
593 mm_segment_t old_fs = get_fs ();
594
595 set_fs (KERNEL_DS);
596 ret = sys_sched_rr_get_interval(pid, &t);
597 set_fs (old_fs);
598 if (put_compat_timespec(&t, interval))
599 return -EFAULT;
600 return ret;
601}
602
603asmlinkage long
604sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize)
605{
606 sigset_t s;
607 compat_sigset_t s32;
608 int ret;
609 mm_segment_t old_fs = get_fs();
610
611 set_fs (KERNEL_DS);
612 ret = sys_rt_sigpending(&s, sigsetsize);
613 set_fs (old_fs);
614 if (!ret) {
615 switch (_NSIG_WORDS) {
616 case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
617 case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2];
618 case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
619 case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
620 }
621 if (copy_to_user (set, &s32, sizeof(compat_sigset_t)))
622 return -EFAULT;
623 }
624 return ret;
625}
626
627asmlinkage long
628sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo)
629{
630 siginfo_t info;
631 int ret;
632 mm_segment_t old_fs = get_fs();
633
634 if (copy_siginfo_from_user32(&info, uinfo))
635 return -EFAULT;
636 set_fs (KERNEL_DS);
637 ret = sys_rt_sigqueueinfo(pid, sig, &info);
638 set_fs (old_fs);
639 return ret;
640}
641
642/* This is here just in case some old ia32 binary calls it. */
643asmlinkage long
644sys32_pause(void)
645{
646 current->state = TASK_INTERRUPTIBLE;
647 schedule();
648 return -ERESTARTNOHAND;
649}
650
651
652#ifdef CONFIG_SYSCTL
653struct sysctl_ia32 {
654 unsigned int name;
655 int nlen;
656 unsigned int oldval;
657 unsigned int oldlenp;
658 unsigned int newval;
659 unsigned int newlen;
660 unsigned int __unused[4];
661};
662
663
664asmlinkage long
665sys32_sysctl(struct sysctl_ia32 __user *args32)
666{
667 struct sysctl_ia32 a32;
668 mm_segment_t old_fs = get_fs ();
669 void __user *oldvalp, *newvalp;
670 size_t oldlen;
671 int __user *namep;
672 long ret;
673 extern int do_sysctl(int *name, int nlen, void *oldval, size_t *oldlenp,
674 void *newval, size_t newlen);
675
676
677 if (copy_from_user(&a32, args32, sizeof (a32)))
678 return -EFAULT;
679
680 /*
681 * We need to pre-validate these because we have to disable address checking
682 * before calling do_sysctl() because of OLDLEN but we can't run the risk of the
683 * user specifying bad addresses here. Well, since we're dealing with 32 bit
684 * addresses, we KNOW that access_ok() will always succeed, so this is an
685 * expensive NOP, but so what...
686 */
687 namep = compat_ptr(a32.name);
688 oldvalp = compat_ptr(a32.oldval);
689 newvalp = compat_ptr(a32.newval);
690
691 if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
692 || !access_ok(VERIFY_WRITE, namep, 0)
693 || !access_ok(VERIFY_WRITE, oldvalp, 0)
694 || !access_ok(VERIFY_WRITE, newvalp, 0))
695 return -EFAULT;
696
697 set_fs(KERNEL_DS);
698 lock_kernel();
699 ret = do_sysctl(namep, a32.nlen, oldvalp, &oldlen, newvalp, (size_t) a32.newlen);
700 unlock_kernel();
701 set_fs(old_fs);
702
703 if (oldvalp && put_user (oldlen, (int __user *)compat_ptr(a32.oldlenp)))
704 return -EFAULT;
705
706 return ret;
707}
708#endif
709
710/* warning: next two assume little endian */
711asmlinkage long
712sys32_pread(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi)
713{
714 return sys_pread64(fd, ubuf, count,
715 ((loff_t)AA(poshi) << 32) | AA(poslo));
716}
717
718asmlinkage long
719sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi)
720{
721 return sys_pwrite64(fd, ubuf, count,
722 ((loff_t)AA(poshi) << 32) | AA(poslo));
723}
724
725
726asmlinkage long
727sys32_personality(unsigned long personality)
728{
729 int ret;
730 if (personality(current->personality) == PER_LINUX32 &&
731 personality == PER_LINUX)
732 personality = PER_LINUX32;
733 ret = sys_personality(personality);
734 if (ret == PER_LINUX32)
735 ret = PER_LINUX;
736 return ret;
737}
738
739asmlinkage long
740sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count)
741{
742 mm_segment_t old_fs = get_fs();
743 int ret;
744 off_t of;
745
746 if (offset && get_user(of, offset))
747 return -EFAULT;
748
749 set_fs(KERNEL_DS);
750 ret = sys_sendfile(out_fd, in_fd, offset ? &of : NULL, count);
751 set_fs(old_fs);
752
753 if (!ret && offset && put_user(of, offset))
754 return -EFAULT;
755
756 return ret;
757}
758
759/* Handle adjtimex compatibility. */
760
761struct timex32 {
762 u32 modes;
763 s32 offset, freq, maxerror, esterror;
764 s32 status, constant, precision, tolerance;
765 struct compat_timeval time;
766 s32 tick;
767 s32 ppsfreq, jitter, shift, stabil;
768 s32 jitcnt, calcnt, errcnt, stbcnt;
769 s32 :32; s32 :32; s32 :32; s32 :32;
770 s32 :32; s32 :32; s32 :32; s32 :32;
771 s32 :32; s32 :32; s32 :32; s32 :32;
772};
773
774extern int do_adjtimex(struct timex *);
775
776asmlinkage long
777sys32_adjtimex(struct timex32 __user *utp)
778{
779 struct timex txc;
780 int ret;
781
782 memset(&txc, 0, sizeof(struct timex));
783
784 if (!access_ok(VERIFY_READ, utp, sizeof(struct timex32)) ||
785 __get_user(txc.modes, &utp->modes) ||
786 __get_user(txc.offset, &utp->offset) ||
787 __get_user(txc.freq, &utp->freq) ||
788 __get_user(txc.maxerror, &utp->maxerror) ||
789 __get_user(txc.esterror, &utp->esterror) ||
790 __get_user(txc.status, &utp->status) ||
791 __get_user(txc.constant, &utp->constant) ||
792 __get_user(txc.precision, &utp->precision) ||
793 __get_user(txc.tolerance, &utp->tolerance) ||
794 __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
795 __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
796 __get_user(txc.tick, &utp->tick) ||
797 __get_user(txc.ppsfreq, &utp->ppsfreq) ||
798 __get_user(txc.jitter, &utp->jitter) ||
799 __get_user(txc.shift, &utp->shift) ||
800 __get_user(txc.stabil, &utp->stabil) ||
801 __get_user(txc.jitcnt, &utp->jitcnt) ||
802 __get_user(txc.calcnt, &utp->calcnt) ||
803 __get_user(txc.errcnt, &utp->errcnt) ||
804 __get_user(txc.stbcnt, &utp->stbcnt))
805 return -EFAULT;
806
807 ret = do_adjtimex(&txc);
808
809 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct timex32)) ||
810 __put_user(txc.modes, &utp->modes) ||
811 __put_user(txc.offset, &utp->offset) ||
812 __put_user(txc.freq, &utp->freq) ||
813 __put_user(txc.maxerror, &utp->maxerror) ||
814 __put_user(txc.esterror, &utp->esterror) ||
815 __put_user(txc.status, &utp->status) ||
816 __put_user(txc.constant, &utp->constant) ||
817 __put_user(txc.precision, &utp->precision) ||
818 __put_user(txc.tolerance, &utp->tolerance) ||
819 __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
820 __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
821 __put_user(txc.tick, &utp->tick) ||
822 __put_user(txc.ppsfreq, &utp->ppsfreq) ||
823 __put_user(txc.jitter, &utp->jitter) ||
824 __put_user(txc.shift, &utp->shift) ||
825 __put_user(txc.stabil, &utp->stabil) ||
826 __put_user(txc.jitcnt, &utp->jitcnt) ||
827 __put_user(txc.calcnt, &utp->calcnt) ||
828 __put_user(txc.errcnt, &utp->errcnt) ||
829 __put_user(txc.stbcnt, &utp->stbcnt))
830 ret = -EFAULT;
831
832 return ret;
833}
834
835asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
836 unsigned long prot, unsigned long flags,
837 unsigned long fd, unsigned long pgoff)
838{
839 struct mm_struct *mm = current->mm;
840 unsigned long error;
841 struct file * file = NULL;
842
843 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
844 if (!(flags & MAP_ANONYMOUS)) {
845 file = fget(fd);
846 if (!file)
847 return -EBADF;
848 }
849
850 down_write(&mm->mmap_sem);
851 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
852 up_write(&mm->mmap_sem);
853
854 if (file)
855 fput(file);
856 return error;
857}
858
859asmlinkage long sys32_olduname(struct oldold_utsname __user * name)
860{
861 int error;
862
863 if (!name)
864 return -EFAULT;
865 if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
866 return -EFAULT;
867
868 down_read(&uts_sem);
869
870 error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
871 __put_user(0,name->sysname+__OLD_UTS_LEN);
872 __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
873 __put_user(0,name->nodename+__OLD_UTS_LEN);
874 __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
875 __put_user(0,name->release+__OLD_UTS_LEN);
876 __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
877 __put_user(0,name->version+__OLD_UTS_LEN);
878 {
879 char *arch = "x86_64";
880 if (personality(current->personality) == PER_LINUX32)
881 arch = "i686";
882
883 __copy_to_user(&name->machine,arch,strlen(arch)+1);
884 }
885
886 up_read(&uts_sem);
887
888 error = error ? -EFAULT : 0;
889
890 return error;
891}
892
893long sys32_uname(struct old_utsname __user * name)
894{
895 int err;
896 if (!name)
897 return -EFAULT;
898 down_read(&uts_sem);
899	err = copy_to_user(name, &system_utsname, sizeof(*name));
900 up_read(&uts_sem);
901 if (personality(current->personality) == PER_LINUX32)
902 err |= copy_to_user(&name->machine, "i686", 5);
903	return err ? -EFAULT : 0;
904}
905
906long sys32_ustat(unsigned dev, struct ustat32 __user *u32p)
907{
908 struct ustat u;
909 mm_segment_t seg;
910 int ret;
911
912 seg = get_fs();
913 set_fs(KERNEL_DS);
914 ret = sys_ustat(dev,&u);
915 set_fs(seg);
916 if (ret >= 0) {
917 if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) ||
918 __put_user((__u32) u.f_tfree, &u32p->f_tfree) ||
919		    __put_user((__u32) u.f_tinode, &u32p->f_tinode) ||
920 __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) ||
921 __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack)))
922 ret = -EFAULT;
923 }
924 return ret;
925}
926
927asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
928 compat_uptr_t __user *envp, struct pt_regs *regs)
929{
930 long error;
931 char * filename;
932
933 filename = getname(name);
934 error = PTR_ERR(filename);
935 if (IS_ERR(filename))
936 return error;
937 error = compat_do_execve(filename, argv, envp, regs);
938 if (error == 0) {
939 task_lock(current);
940 current->ptrace &= ~PT_DTRACE;
941 task_unlock(current);
942 }
943 putname(filename);
944 return error;
945}
946
947asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp,
948 struct pt_regs *regs)
949{
950 void __user *parent_tid = (void __user *)regs->rdx;
951 void __user *child_tid = (void __user *)regs->rdi;
952 if (!newsp)
953 newsp = regs->rsp;
954 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
955}
956
957/*
958 * Some system calls need sign-extended arguments; this could be done by a generic wrapper.
959 */
960
961long sys32_lseek (unsigned int fd, int offset, unsigned int whence)
962{
963 return sys_lseek(fd, offset, whence);
964}
965
966long sys32_kill(int pid, int sig)
967{
968 return sys_kill(pid, sig);
969}
970
971asmlinkage long sys32_open(const char __user * filename, int flags, int mode)
972{
973 char * tmp;
974 int fd, error;
975
976 /* don't force O_LARGEFILE */
977 tmp = getname(filename);
978 fd = PTR_ERR(tmp);
979 if (!IS_ERR(tmp)) {
980 fd = get_unused_fd();
981 if (fd >= 0) {
982 struct file *f = filp_open(tmp, flags, mode);
983 error = PTR_ERR(f);
984 if (IS_ERR(f)) {
985 put_unused_fd(fd);
986 fd = error;
987 } else
988 fd_install(fd, f);
989 }
990 putname(tmp);
991 }
992 return fd;
993}
994
995extern asmlinkage long
996sys_timer_create(clockid_t which_clock,
997 struct sigevent __user *timer_event_spec,
998 timer_t __user * created_timer_id);
999
1000long
1001sys32_timer_create(u32 clock, struct compat_sigevent __user *se32, timer_t __user *timer_id)
1002{
1003 struct sigevent __user *p = NULL;
1004 if (se32) {
1005 struct sigevent se;
1006 p = compat_alloc_user_space(sizeof(struct sigevent));
1007 if (get_compat_sigevent(&se, se32) ||
1008 copy_to_user(p, &se, sizeof(se)))
1009 return -EFAULT;
1010 }
1011 return sys_timer_create(clock, p, timer_id);
1012}
1013
1014long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
1015 __u32 len_low, __u32 len_high, int advice)
1016{
1017 return sys_fadvise64_64(fd,
1018 (((u64)offset_high)<<32) | offset_low,
1019 (((u64)len_high)<<32) | len_low,
1020 advice);
1021}
1022
1023long sys32_vm86_warning(void)
1024{
1025 struct task_struct *me = current;
1026 static char lastcomm[sizeof(me->comm)];
1027 if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
1028 printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n",
1029 me->comm);
1030 strncpy(lastcomm, me->comm, sizeof(lastcomm));
1031 }
1032 return -ENOSYS;
1033}
1034
1035long sys32_lookup_dcookie(u32 addr_low, u32 addr_high,
1036 char __user * buf, size_t len)
1037{
1038 return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len);
1039}
1040
1041static int __init ia32_init (void)
1042{
1043 printk("IA32 emulation $Id: sys_ia32.c,v 1.32 2002/03/24 13:02:28 ak Exp $\n");
1044 return 0;
1045}
1046
1047__initcall(ia32_init);
1048
1049extern unsigned long ia32_sys_call_table[];
1050EXPORT_SYMBOL(ia32_sys_call_table);
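
One pattern recurs throughout the file above (sys32_truncate64, sys32_pread, sys32_pwrite, sys32_fadvise64_64, sys32_lookup_dcookie): the ia32 ABI can pass only 32 bits per argument register, so a 64-bit offset arrives split into low and high halves and is reassembled with a shift and an or. A standalone demonstration of that reassembly:

#include <stdio.h>
#include <stdint.h>

static int64_t join64(uint32_t lo, uint32_t hi)
{
	/* same expression as ((loff_t)offset_high << 32) | offset_low */
	return ((int64_t)hi << 32) | lo;
}

int main(void)
{
	/* A 6 GiB offset does not fit in 32 bits, so it is passed as
	 * two registers: lo = 0x80000000, hi = 0x1. */
	int64_t off = join64(0x80000000u, 0x1u);

	printf("offset = %lld (0x%llx)\n",
	       (long long)off, (unsigned long long)off);
	return 0;
}
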
diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c
new file mode 100644
index 000000000000..399ff4985099
--- /dev/null
+++ b/arch/x86_64/ia32/syscall32.c
@@ -0,0 +1,111 @@
1/* Copyright 2002,2003 Andi Kleen, SuSE Labs */
2
3/* vsyscall handling for 32bit processes. Map a stub page into their
4   address space on demand because 32bit code cannot reach the kernel's fixmaps */
5
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/kernel.h>
9#include <linux/gfp.h>
10#include <linux/init.h>
11#include <linux/stringify.h>
12#include <asm/proto.h>
13#include <asm/tlbflush.h>
14#include <asm/ia32_unistd.h>
15
16/* 32bit VDSOs mapped into user space. */
17asm(".section \".init.data\",\"aw\"\n"
18 "syscall32_syscall:\n"
19 ".incbin \"arch/x86_64/ia32/vsyscall-syscall.so\"\n"
20 "syscall32_syscall_end:\n"
21 "syscall32_sysenter:\n"
22 ".incbin \"arch/x86_64/ia32/vsyscall-sysenter.so\"\n"
23 "syscall32_sysenter_end:\n"
24 ".previous");
25
26extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
27extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
28extern int sysctl_vsyscall32;
29
30char *syscall32_page;
31static int use_sysenter = -1;
32
33/*
34 * Map the 32bit vsyscall page on demand.
35 *
36 * RED-PEN: This knows too much about high level VM.
37 *
38 * An alternative would be to generate a vma with appropriate backing options
39 * and let it be handled by generic VM.
40 */
41int __map_syscall32(struct mm_struct *mm, unsigned long address)
42{
43 pgd_t *pgd;
44 pud_t *pud;
45 pte_t *pte;
46 pmd_t *pmd;
47 int err = -ENOMEM;
48
49 spin_lock(&mm->page_table_lock);
50 pgd = pgd_offset(mm, address);
51 pud = pud_alloc(mm, pgd, address);
52 if (pud) {
53 pmd = pmd_alloc(mm, pud, address);
54 if (pmd && (pte = pte_alloc_map(mm, pmd, address)) != NULL) {
55 if (pte_none(*pte)) {
56 set_pte(pte,
57 mk_pte(virt_to_page(syscall32_page),
58 PAGE_KERNEL_VSYSCALL32));
59 }
60			/* Flush only the local CPU. Other CPUs taking a
61			   fault will just end up here again.
62			   This is probably not needed and just paranoia. */
63 __flush_tlb_one(address);
64 err = 0;
65 }
66 }
67 spin_unlock(&mm->page_table_lock);
68 return err;
69}
70
71int map_syscall32(struct mm_struct *mm, unsigned long address)
72{
73 int err;
74 down_read(&mm->mmap_sem);
75 err = __map_syscall32(mm, address);
76 up_read(&mm->mmap_sem);
77 return err;
78}
79
80static int __init init_syscall32(void)
81{
82 syscall32_page = (void *)get_zeroed_page(GFP_KERNEL);
83 if (!syscall32_page)
84 panic("Cannot allocate syscall32 page");
85 SetPageReserved(virt_to_page(syscall32_page));
86 if (use_sysenter > 0) {
87 memcpy(syscall32_page, syscall32_sysenter,
88 syscall32_sysenter_end - syscall32_sysenter);
89 } else {
90 memcpy(syscall32_page, syscall32_syscall,
91 syscall32_syscall_end - syscall32_syscall);
92 }
93 return 0;
94}
95
96__initcall(init_syscall32);
97
98/* May not be __init: called during resume */
99void syscall32_cpu_init(void)
100{
101 if (use_sysenter < 0)
102 use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
103
104 /* Load these always in case some future AMD CPU supports
105 SYSENTER from compat mode too. */
106 checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
107 checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
108 checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
109
110 wrmsrl(MSR_CSTAR, ia32_cstar_target);
111}
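
Tying the pieces above together: init_syscall32() copies the matching vsyscall blob into syscall32_page, __map_syscall32() faults that page into a 32-bit process's address space, and syscall32_cpu_init() aims the SYSENTER/SYSCALL MSRs at the kernel entry stubs. A 32-bit process normally finds __kernel_vsyscall through its ELF auxiliary vector rather than a hard-coded address; the sketch below is a hedged demo of calling it directly (i386 only, and it assumes glibc's getauxval() and the int $0x80 register convention, with __NR_getpid = 20 in the ia32 ABI):

#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
	long entry = (long)getauxval(AT_SYSINFO);	/* __kernel_vsyscall */
	long ret;

	if (!entry) {
		fprintf(stderr, "no AT_SYSINFO entry\n");
		return 1;
	}
	/* Enter the kernel exactly as int $0x80 would: syscall number in
	 * eax, arguments (none for getpid) in ebx/ecx/edx/esi/edi. */
	asm volatile("call *%1"
		     : "=a" (ret)
		     : "r" (entry), "0" (20L)
		     : "ecx", "edx", "memory");
	printf("pid via __kernel_vsyscall: %ld\n", ret);
	return 0;
}
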
diff --git a/arch/x86_64/ia32/tls32.c b/arch/x86_64/ia32/tls32.c
new file mode 100644
index 000000000000..1cc4340de3ca
--- /dev/null
+++ b/arch/x86_64/ia32/tls32.c
@@ -0,0 +1,163 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/sched.h>
4#include <linux/user.h>
5
6#include <asm/uaccess.h>
7#include <asm/desc.h>
8#include <asm/system.h>
9#include <asm/ldt.h>
10#include <asm/processor.h>
11#include <asm/proto.h>
12
13/*
14 * get_free_idx: find a yet-unused TLS descriptor index for set_thread_area().
15 */
16static int get_free_idx(void)
17{
18 struct thread_struct *t = &current->thread;
19 int idx;
20
21 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
22 if (desc_empty((struct n_desc_struct *)(t->tls_array) + idx))
23 return idx + GDT_ENTRY_TLS_MIN;
24 return -ESRCH;
25}
26
27/*
28 * Set a given TLS descriptor:
29 * When you want addresses > 32bit use arch_prctl()
30 */
31int do_set_thread_area(struct thread_struct *t, struct user_desc __user *u_info)
32{
33 struct user_desc info;
34 struct n_desc_struct *desc;
35 int cpu, idx;
36
37 if (copy_from_user(&info, u_info, sizeof(info)))
38 return -EFAULT;
39
40 idx = info.entry_number;
41
42 /*
43 * index -1 means the kernel should try to find and
44 * allocate an empty descriptor:
45 */
46 if (idx == -1) {
47 idx = get_free_idx();
48 if (idx < 0)
49 return idx;
50 if (put_user(idx, &u_info->entry_number))
51 return -EFAULT;
52 }
53
54 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
55 return -EINVAL;
56
57 desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
58
59 /*
60 * We must not get preempted while modifying the TLS.
61 */
62 cpu = get_cpu();
63
64 if (LDT_empty(&info)) {
65 desc->a = 0;
66 desc->b = 0;
67 } else {
68 desc->a = LDT_entry_a(&info);
69 desc->b = LDT_entry_b(&info);
70 }
71 if (t == &current->thread)
72 load_TLS(t, cpu);
73
74 put_cpu();
75 return 0;
76}
77
78asmlinkage long sys32_set_thread_area(struct user_desc __user *u_info)
79{
80 return do_set_thread_area(&current->thread, u_info);
81}
82
83
84/*
85 * Get the current Thread-Local Storage area:
86 */
87
88#define GET_BASE(desc) ( \
89 (((desc)->a >> 16) & 0x0000ffff) | \
90 (((desc)->b << 16) & 0x00ff0000) | \
91 ( (desc)->b & 0xff000000) )
92
93#define GET_LIMIT(desc) ( \
94 ((desc)->a & 0x0ffff) | \
95 ((desc)->b & 0xf0000) )
96
97#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
98#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
99#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
100#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
101#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
102#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
103#define GET_LONGMODE(desc) (((desc)->b >> 21) & 1)
104
105int do_get_thread_area(struct thread_struct *t, struct user_desc __user *u_info)
106{
107 struct user_desc info;
108 struct n_desc_struct *desc;
109 int idx;
110
111 if (get_user(idx, &u_info->entry_number))
112 return -EFAULT;
113 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
114 return -EINVAL;
115
116 desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
117
118 memset(&info, 0, sizeof(struct user_desc));
119 info.entry_number = idx;
120 info.base_addr = GET_BASE(desc);
121 info.limit = GET_LIMIT(desc);
122 info.seg_32bit = GET_32BIT(desc);
123 info.contents = GET_CONTENTS(desc);
124 info.read_exec_only = !GET_WRITABLE(desc);
125 info.limit_in_pages = GET_LIMIT_PAGES(desc);
126 info.seg_not_present = !GET_PRESENT(desc);
127 info.useable = GET_USEABLE(desc);
128 info.lm = GET_LONGMODE(desc);
129
130 if (copy_to_user(u_info, &info, sizeof(info)))
131 return -EFAULT;
132 return 0;
133}
134
135asmlinkage long sys32_get_thread_area(struct user_desc __user *u_info)
136{
137 return do_get_thread_area(&current->thread, u_info);
138}
139
140
141int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs)
142{
143 struct n_desc_struct *desc;
144 struct user_desc info;
145 struct user_desc __user *cp;
146 int idx;
147
148 cp = (void __user *)childregs->rsi;
149 if (copy_from_user(&info, cp, sizeof(info)))
150 return -EFAULT;
151 if (LDT_empty(&info))
152 return -EINVAL;
153
154 idx = info.entry_number;
155 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
156 return -EINVAL;
157
158 desc = (struct n_desc_struct *)(p->thread.tls_array) + idx - GDT_ENTRY_TLS_MIN;
159 desc->a = LDT_entry_a(&info);
160 desc->b = LDT_entry_b(&info);
161
162 return 0;
163}
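
The GET_BASE()/GET_LIMIT() macros above undo the way an x86 segment descriptor scatters its fields across its two 32-bit words: base bits 15..0 live in the top half of word a, bits 23..16 in the low byte of word b, and bits 31..24 in the top byte of word b. A standalone round-trip check of that layout (the packing helper is written here for illustration; only the GET_BASE expression mirrors the code above):

#include <stdio.h>
#include <stdint.h>

struct desc { uint32_t a, b; };

#define GET_BASE(d) ( \
	(((d)->a >> 16) & 0x0000ffff) | \
	(((d)->b << 16) & 0x00ff0000) | \
	( (d)->b & 0xff000000) )

static void set_base(struct desc *d, uint32_t base)
{
	d->a = (d->a & 0x0000ffff) | (base << 16);	    /* base 15..0  */
	d->b = (d->b & 0xffffff00) | ((base >> 16) & 0xff); /* base 23..16 */
	d->b = (d->b & 0x00ffffff) | (base & 0xff000000);   /* base 31..24 */
}

int main(void)
{
	struct desc d = { 0, 0 };
	uint32_t base = 0xdeadbeef;

	set_base(&d, base);
	printf("stored %#x, decoded %#x\n", base, GET_BASE(&d));
	return 0;
}
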
diff --git a/arch/x86_64/ia32/vsyscall-sigreturn.S b/arch/x86_64/ia32/vsyscall-sigreturn.S
new file mode 100644
index 000000000000..ba4067d350e4
--- /dev/null
+++ b/arch/x86_64/ia32/vsyscall-sigreturn.S
@@ -0,0 +1,120 @@
1/*
2 * Common code for the sigreturn entry points on the vsyscall page.
3 * This code uses SYSCALL_ENTER_KERNEL (either syscall or int $0x80)
4 * to enter the kernel.
5 * This file is #include'd by vsyscall-*.S to define them after the
6 * vsyscall entry point. The addresses we get for these entry points
7 * by doing ".balign 32" must match in both versions of the page.
8 */
9
10 .section .text.sigreturn,"ax"
11 .balign 32
12 .globl __kernel_sigreturn
13 .type __kernel_sigreturn,@function
14__kernel_sigreturn:
15.LSTART_sigreturn:
16 popl %eax
17 movl $__NR_ia32_sigreturn, %eax
18 SYSCALL_ENTER_KERNEL
19.LEND_sigreturn:
20 .size __kernel_sigreturn,.-.LSTART_sigreturn
21
22 .section .text.rtsigreturn,"ax"
23 .balign 32
24 .globl __kernel_rt_sigreturn
25 .type __kernel_rt_sigreturn,@function
26__kernel_rt_sigreturn:
27.LSTART_rt_sigreturn:
28 movl $__NR_ia32_rt_sigreturn, %eax
29 SYSCALL_ENTER_KERNEL
30.LEND_rt_sigreturn:
31 .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
32
33 .section .eh_frame,"a",@progbits
34 .long .LENDFDE2-.LSTARTFDE2 /* Length FDE */
35.LSTARTFDE2:
36 .long .LSTARTFDE2-.LSTARTFRAME /* CIE pointer */
37 /* HACK: The dwarf2 unwind routines will subtract 1 from the
38 return address to get an address in the middle of the
39 presumed call instruction. Since we didn't get here via
40 a call, we need to include the nop before the real start
41 to make up for it. */
42 .long .LSTART_sigreturn-1-. /* PC-relative start address */
43 .long .LEND_sigreturn-.LSTART_sigreturn+1
44 .uleb128 0 /* Augmentation length */
45 /* What follows are the instructions for the table generation.
46 We record the locations of each register saved. This is
47 complicated by the fact that the "CFA" is always assumed to
48 be the value of the stack pointer in the caller. This means
49 that we must define the CFA of this body of code to be the
50 saved value of the stack pointer in the sigcontext. Which
51 also means that there is no fixed relation to the other
52 saved registers, which means that we must use DW_CFA_expression
53 to compute their addresses. It also means that when we
54 adjust the stack with the popl, we have to do it all over again. */
55
56#define do_cfa_expr(offset) \
57 .byte 0x0f; /* DW_CFA_def_cfa_expression */ \
58 .uleb128 1f-0f; /* length */ \
590: .byte 0x74; /* DW_OP_breg4 */ \
60 .sleb128 offset; /* offset */ \
61 .byte 0x06; /* DW_OP_deref */ \
621:
63
64#define do_expr(regno, offset) \
65 .byte 0x10; /* DW_CFA_expression */ \
66 .uleb128 regno; /* regno */ \
67 .uleb128 1f-0f; /* length */ \
680: .byte 0x74; /* DW_OP_breg4 */ \
69 .sleb128 offset; /* offset */ \
701:
71
72 do_cfa_expr(IA32_SIGCONTEXT_esp+4)
73 do_expr(0, IA32_SIGCONTEXT_eax+4)
74 do_expr(1, IA32_SIGCONTEXT_ecx+4)
75 do_expr(2, IA32_SIGCONTEXT_edx+4)
76 do_expr(3, IA32_SIGCONTEXT_ebx+4)
77 do_expr(5, IA32_SIGCONTEXT_ebp+4)
78 do_expr(6, IA32_SIGCONTEXT_esi+4)
79 do_expr(7, IA32_SIGCONTEXT_edi+4)
80 do_expr(8, IA32_SIGCONTEXT_eip+4)
81
82 .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
83
84 do_cfa_expr(IA32_SIGCONTEXT_esp)
85 do_expr(0, IA32_SIGCONTEXT_eax)
86 do_expr(1, IA32_SIGCONTEXT_ecx)
87 do_expr(2, IA32_SIGCONTEXT_edx)
88 do_expr(3, IA32_SIGCONTEXT_ebx)
89 do_expr(5, IA32_SIGCONTEXT_ebp)
90 do_expr(6, IA32_SIGCONTEXT_esi)
91 do_expr(7, IA32_SIGCONTEXT_edi)
92 do_expr(8, IA32_SIGCONTEXT_eip)
93
94 .align 4
95.LENDFDE2:
96
97 .long .LENDFDE3-.LSTARTFDE3 /* Length FDE */
98.LSTARTFDE3:
99 .long .LSTARTFDE3-.LSTARTFRAME /* CIE pointer */
100 /* HACK: See above wrt unwind library assumptions. */
101 .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
102 .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
103 .uleb128 0 /* Augmentation */
104 /* What follows are the instructions for the table generation.
105 We record the locations of each register saved. This is
106 slightly less complicated than the above, since we don't
107 modify the stack pointer in the process. */
108
109 do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esp)
110 do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eax)
111 do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ecx)
112 do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edx)
113 do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebx)
114 do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebp)
115 do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esi)
116 do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edi)
117 do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eip)
118
119 .align 4
120.LENDFDE3:
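
The .uleb128/.sleb128 directives used throughout the frame data above emit DWARF's variable-length integers: seven payload bits per byte, least-significant group first, with bit 7 set on every byte except the last. A standalone encoder for the unsigned form, checked against the classic example value from the DWARF specification:

#include <stdio.h>
#include <stdint.h>

static int uleb128_encode(uint32_t value, uint8_t out[5])
{
	int n = 0;

	do {
		uint8_t byte = value & 0x7f;	/* low seven bits */

		value >>= 7;
		if (value)
			byte |= 0x80;		/* more bytes follow */
		out[n++] = byte;
	} while (value);
	return n;
}

int main(void)
{
	uint8_t buf[5];
	int i, n = uleb128_encode(624485, buf);

	for (i = 0; i < n; i++)
		printf("%02x ", buf[i]);	/* expect: e5 8e 26 */
	printf("\n");
	return 0;
}
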
diff --git a/arch/x86_64/ia32/vsyscall-syscall.S b/arch/x86_64/ia32/vsyscall-syscall.S
new file mode 100644
index 000000000000..e2aaf3de8a42
--- /dev/null
+++ b/arch/x86_64/ia32/vsyscall-syscall.S
@@ -0,0 +1,68 @@
1/*
2 * Code for the vsyscall page. This version uses the syscall instruction.
3 */
4
5#include <asm/ia32_unistd.h>
6#include <asm/offset.h>
7#include <asm/segment.h>
8
9 .text
10 .section .text.vsyscall,"ax"
11 .globl __kernel_vsyscall
12 .type __kernel_vsyscall,@function
13__kernel_vsyscall:
14.LSTART_vsyscall:
15 push %ebp
16.Lpush_ebp:
17 movl %ecx, %ebp
18 syscall
19 movl $__USER32_DS, %ecx
20 movl %ecx, %ss
21 movl %ebp, %ecx
22 popl %ebp
23.Lpop_ebp:
24 ret
25.LEND_vsyscall:
26 .size __kernel_vsyscall,.-.LSTART_vsyscall
27
28 .section .eh_frame,"a",@progbits
29.LSTARTFRAME:
30 .long .LENDCIE-.LSTARTCIE
31.LSTARTCIE:
32 .long 0 /* CIE ID */
33 .byte 1 /* Version number */
34 .string "zR" /* NUL-terminated augmentation string */
35 .uleb128 1 /* Code alignment factor */
36 .sleb128 -4 /* Data alignment factor */
37 .byte 8 /* Return address register column */
38 .uleb128 1 /* Augmentation value length */
39 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
40 .byte 0x0c /* DW_CFA_def_cfa */
41 .uleb128 4
42 .uleb128 4
43 .byte 0x88 /* DW_CFA_offset, column 0x8 */
44 .uleb128 1
45 .align 4
46.LENDCIE:
47
48 .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
49.LSTARTFDE1:
50 .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
51 .long .LSTART_vsyscall-. /* PC-relative start address */
52 .long .LEND_vsyscall-.LSTART_vsyscall
53 .uleb128 0 /* Augmentation length */
54 /* What follows are the instructions for the table generation.
55 We have to record all changes of the stack pointer. */
56 .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */
57 .byte 0x0e /* DW_CFA_def_cfa_offset */
58 .uleb128 8
59 .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */
60 .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */
61 .byte 0xc5 /* DW_CFA_restore %ebp */
62 .byte 0x0e /* DW_CFA_def_cfa_offset */
63 .uleb128 4
64 .align 4
65.LENDFDE1:
66
67#define SYSCALL_ENTER_KERNEL syscall
68#include "vsyscall-sigreturn.S"
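
The FDE above packs its CFI instructions into single bytes: DW_CFA_advance_loc is 0x40 plus the code delta (hence the `.byte 0x40 + ...` arithmetic), and DW_CFA_offset is 0x80 plus the register number followed by a uleb128 offset in data-alignment units, so `0x85, 0x02` means %ebp is saved at 2 * (-4) = -8 from the CFA. A standalone C sketch of that decoding (illustrative only; single-byte uleb128 operands assumed):

#include <stdio.h>

int main(void)
{
	/* The byte stream from the FDE above: advance_loc 1 (push %ebp is one
	   byte), def_cfa_offset 8, then "%ebp saved at factored offset 2". */
	unsigned char prog[] = { 0x41, 0x0e, 0x08, 0x85, 0x02 };
	int data_align = -4;		/* data alignment factor from the CIE */
	unsigned int i = 0;

	while (i < sizeof prog) {
		unsigned char op = prog[i++];

		switch (op >> 6) {
		case 1:		/* DW_CFA_advance_loc: 0x40 + code delta */
			printf("advance_loc %d\n", op & 0x3f);
			break;
		case 2:		/* DW_CFA_offset: 0x80 + regno, factored offset */
			printf("reg %d saved at CFA%+d\n", op & 0x3f,
			       (int)prog[i++] * data_align);
			break;
		default:	/* primary opcodes; only def_cfa_offset appears here */
			if (op == 0x0e)
				printf("def_cfa_offset %d\n", prog[i++]);
			break;
		}
	}
	return 0;
}
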
diff --git a/arch/x86_64/ia32/vsyscall-sysenter.S b/arch/x86_64/ia32/vsyscall-sysenter.S
new file mode 100644
index 000000000000..8fb8e0ff3afa
--- /dev/null
+++ b/arch/x86_64/ia32/vsyscall-sysenter.S
@@ -0,0 +1,94 @@
1/*
2 * Code for the vsyscall page. This version uses the sysenter instruction.
3 */
4
5#include <asm/ia32_unistd.h>
6#include <asm/offset.h>
7
8 .text
9 .section .text.vsyscall,"ax"
10 .globl __kernel_vsyscall
11 .type __kernel_vsyscall,@function
12__kernel_vsyscall:
13.LSTART_vsyscall:
14 push %ecx
15.Lpush_ecx:
16 push %edx
17.Lpush_edx:
18 push %ebp
19.Lenter_kernel:
20 movl %esp,%ebp
21 sysenter
22 .space 7,0x90
23 jmp .Lenter_kernel
24 /* 16: System call normal return point is here! */
25 pop %ebp
26.Lpop_ebp:
27 pop %edx
28.Lpop_edx:
29 pop %ecx
30.Lpop_ecx:
31 ret
32.LEND_vsyscall:
33 .size __kernel_vsyscall,.-.LSTART_vsyscall
34
35 .section .eh_frame,"a",@progbits
36.LSTARTFRAME:
37 .long .LENDCIE-.LSTARTCIE
38.LSTARTCIE:
39 .long 0 /* CIE ID */
40 .byte 1 /* Version number */
41 .string "zR" /* NUL-terminated augmentation string */
42 .uleb128 1 /* Code alignment factor */
43 .sleb128 -4 /* Data alignment factor */
44 .byte 8 /* Return address register column */
45 .uleb128 1 /* Augmentation value length */
46 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
47 .byte 0x0c /* DW_CFA_def_cfa */
48 .uleb128 4
49 .uleb128 4
50 .byte 0x88 /* DW_CFA_offset, column 0x8 */
51 .uleb128 1
52 .align 4
53.LENDCIE:
54
55 .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
56.LSTARTFDE1:
57 .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
58 .long .LSTART_vsyscall-. /* PC-relative start address */
59 .long .LEND_vsyscall-.LSTART_vsyscall
60 .uleb128 0 /* Augmentation length */
61 /* What follows are the instructions for the table generation.
62 We have to record all changes of the stack pointer. */
63 .byte 0x04 /* DW_CFA_advance_loc4 */
64 .long .Lpush_ecx-.LSTART_vsyscall
65 .byte 0x0e /* DW_CFA_def_cfa_offset */
66 .byte 0x08 /* RA at offset 8 now */
67 .byte 0x04 /* DW_CFA_advance_loc4 */
68 .long .Lpush_edx-.Lpush_ecx
69 .byte 0x0e /* DW_CFA_def_cfa_offset */
70 .byte 0x0c /* RA at offset 12 now */
71 .byte 0x04 /* DW_CFA_advance_loc4 */
72 .long .Lenter_kernel-.Lpush_edx
73 .byte 0x0e /* DW_CFA_def_cfa_offset */
74 .byte 0x10 /* RA at offset 16 now */
75 .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
76 /* Finally the epilogue. */
77 .byte 0x04 /* DW_CFA_advance_loc4 */
78 .long .Lpop_ebp-.Lenter_kernel
79 .byte 0x0e /* DW_CFA_def_cfa_offset */
80 	.byte 0x0c		/* RA at offset 12 now */
81 .byte 0xc5 /* DW_CFA_restore %ebp */
82 .byte 0x04 /* DW_CFA_advance_loc4 */
83 .long .Lpop_edx-.Lpop_ebp
84 .byte 0x0e /* DW_CFA_def_cfa_offset */
85 .byte 0x08 /* RA at offset 8 now */
86 .byte 0x04 /* DW_CFA_advance_loc4 */
87 .long .Lpop_ecx-.Lpop_edx
88 .byte 0x0e /* DW_CFA_def_cfa_offset */
89 .byte 0x04 /* RA at offset 4 now */
90 .align 4
91.LENDFDE1:
92
93#define SYSCALL_ENTER_KERNEL int $0x80
94#include "vsyscall-sigreturn.S"
diff --git a/arch/x86_64/ia32/vsyscall.lds b/arch/x86_64/ia32/vsyscall.lds
new file mode 100644
index 000000000000..fa4b4dd4a9ff
--- /dev/null
+++ b/arch/x86_64/ia32/vsyscall.lds
@@ -0,0 +1,77 @@
1/*
2 * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
3 * object prelinked to its virtual address. This script controls its layout.
4 */
5
6/* This must match <asm/fixmap.h>. */
7VSYSCALL_BASE = 0xffffe000;
8
9SECTIONS
10{
11 . = VSYSCALL_BASE + SIZEOF_HEADERS;
12
13 .hash : { *(.hash) } :text
14 .dynsym : { *(.dynsym) }
15 .dynstr : { *(.dynstr) }
16 .gnu.version : { *(.gnu.version) }
17 .gnu.version_d : { *(.gnu.version_d) }
18 .gnu.version_r : { *(.gnu.version_r) }
19
20 /* This linker script is used both with -r and with -shared.
21 For the layouts to match, we need to skip more than enough
22 space for the dynamic symbol table et al. If this amount
23 is insufficient, ld -shared will barf. Just increase it here. */
24 . = VSYSCALL_BASE + 0x400;
25
26 .text.vsyscall : { *(.text.vsyscall) } :text =0x90909090
27
28  /* This is a 32bit object and we cannot easily get the offsets
29     into the 64bit kernel. Just hardcode them here. This assumes
30     that none of the stubs needs more than 0x100 bytes. */
31 . = VSYSCALL_BASE + 0x500;
32
33 .text.sigreturn : { *(.text.sigreturn) } :text =0x90909090
34
35 . = VSYSCALL_BASE + 0x600;
36
37 .text.rtsigreturn : { *(.text.rtsigreturn) } :text =0x90909090
38
39 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
40 .eh_frame : { KEEP (*(.eh_frame)) } :text
41 .dynamic : { *(.dynamic) } :text :dynamic
42 .useless : {
43 *(.got.plt) *(.got)
44 *(.data .data.* .gnu.linkonce.d.*)
45 *(.dynbss)
46 *(.bss .bss.* .gnu.linkonce.b.*)
47 } :text
48}
49
50/*
51 * We must supply the ELF program headers explicitly to get just one
52 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
53 */
54PHDRS
55{
56 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
57 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
58 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
59}
60
61/*
62 * This controls what symbols we export from the DSO.
63 */
64VERSION
65{
66 LINUX_2.5 {
67 global:
68 __kernel_vsyscall;
69 __kernel_sigreturn;
70 __kernel_rt_sigreturn;
71
72 local: *;
73 };
74}
75
76/* The ELF entry point can be used to set the AT_SYSINFO value. */
77ENTRY(__kernel_vsyscall);
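
The ENTRY(__kernel_vsyscall) directive is what lets the kernel advertise the vsyscall entry point to 32-bit processes via the AT_SYSINFO aux-vector entry. A minimal userspace sketch that merely locates it (assumptions: glibc's getauxval(), which postdates this patch; build as a 32-bit binary with -m32):

#include <stdio.h>
#include <elf.h>
#include <sys/auxv.h>

int main(void)
{
	/* AT_SYSINFO carries the address of __kernel_vsyscall, if provided. */
	unsigned long entry = getauxval(AT_SYSINFO);

	if (!entry) {
		puts("no AT_SYSINFO entry from the kernel");
		return 1;
	}
	printf("__kernel_vsyscall is at %#lx\n", entry);
	return 0;
}
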
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
new file mode 100644
index 000000000000..0a3318e08ab6
--- /dev/null
+++ b/arch/x86_64/kernel/Makefile
@@ -0,0 +1,45 @@
1#
2# Makefile for the linux kernel.
3#
4
5extra-y := head.o head64.o init_task.o vmlinux.lds
6EXTRA_AFLAGS := -traditional
7obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \
8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
9 x8664_ksyms.o i387.o syscall.o vsyscall.o \
10 setup64.o bootflag.o e820.o reboot.o quirks.o
11
12obj-$(CONFIG_X86_MCE) += mce.o
13obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
14obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/
15obj-$(CONFIG_ACPI_BOOT) += acpi/
16obj-$(CONFIG_X86_MSR) += msr.o
17obj-$(CONFIG_MICROCODE) += microcode.o
18obj-$(CONFIG_X86_CPUID) += cpuid.o
19obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
20obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
21obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
22 genapic.o genapic_cluster.o genapic_flat.o
23obj-$(CONFIG_PM) += suspend.o
24obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
25obj-$(CONFIG_CPU_FREQ) += cpufreq/
26obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
27obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o
28obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o
29obj-$(CONFIG_SWIOTLB) += swiotlb.o
30obj-$(CONFIG_KPROBES) += kprobes.o
31
32obj-$(CONFIG_MODULES) += module.o
33
34obj-y += topology.o
35obj-y += intel_cacheinfo.o
36
37CFLAGS_vsyscall.o := $(PROFILING) -g0
38
39bootflag-y += ../../i386/kernel/bootflag.o
40cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o
41topology-y += ../../i386/mach-default/topology.o
42swiotlb-$(CONFIG_SWIOTLB) += ../../ia64/lib/swiotlb.o
43microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o
44intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o
45quirks-y += ../../i386/kernel/quirks.o
diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile
new file mode 100644
index 000000000000..d2c2ee5f9a88
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_ACPI_BOOT) := boot.o
2boot-$(CONFIG_ACPI_BOOT) := ../../../i386/kernel/acpi/boot.o
3obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o
diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c
new file mode 100644
index 000000000000..7a275de6df22
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/sleep.c
@@ -0,0 +1,132 @@
1/*
2 * acpi.c - Architecture-Specific Low-Level ACPI Support
3 *
4 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
5 * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
6 * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
7 * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
8 * Copyright (C) 2003 Pavel Machek, SuSE Labs
9 *
10 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 *
26 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 */
28
29#include <linux/config.h>
30#include <linux/kernel.h>
31#include <linux/init.h>
32#include <linux/types.h>
33#include <linux/stddef.h>
34#include <linux/slab.h>
35#include <linux/pci.h>
36#include <linux/bootmem.h>
37#include <linux/irq.h>
38#include <linux/acpi.h>
39#include <asm/mpspec.h>
40#include <asm/io.h>
41#include <asm/apic.h>
42#include <asm/apicdef.h>
43#include <asm/page.h>
44#include <asm/pgtable.h>
45#include <asm/pgalloc.h>
46#include <asm/io_apic.h>
47#include <asm/proto.h>
48#include <asm/tlbflush.h>
49
50
51/* --------------------------------------------------------------------------
52 Low-Level Sleep Support
53 -------------------------------------------------------------------------- */
54
55#ifdef CONFIG_ACPI_SLEEP
56
57/* address in low memory of the wakeup routine. */
58unsigned long acpi_wakeup_address = 0;
59unsigned long acpi_video_flags;
60extern char wakeup_start, wakeup_end;
61
62extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
63
64static pgd_t low_ptr;
65
66static void init_low_mapping(void)
67{
68 pgd_t *slot0 = pgd_offset(current->mm, 0UL);
69 low_ptr = *slot0;
70 set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET));
71 flush_tlb_all();
72}
73
74/**
75 * acpi_save_state_mem - save kernel state
76 *
77 * Create an identity mapped page table and copy the wakeup routine to
78 * low memory.
79 */
80int acpi_save_state_mem (void)
81{
82 init_low_mapping();
83
84 memcpy((void *) acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start);
85 acpi_copy_wakeup_routine(acpi_wakeup_address);
86
87 return 0;
88}
89
90/*
91 * acpi_restore_state
92 */
93void acpi_restore_state_mem (void)
94{
95 set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
96 flush_tlb_all();
97}
98
99/**
100 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
101 *
102 * We allocate a page in low memory for the wakeup
103 * routine for when we come back from a sleep state. The
104 * runtime allocator allows specification of <16M pages, but not
105 * <1M pages.
106 */
107void __init acpi_reserve_bootmem(void)
108{
109 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
110 if ((&wakeup_end - &wakeup_start) > PAGE_SIZE)
111 printk(KERN_CRIT "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
112}
113
114static int __init acpi_sleep_setup(char *str)
115{
116 while ((str != NULL) && (*str != '\0')) {
117 if (strncmp(str, "s3_bios", 7) == 0)
118 acpi_video_flags = 1;
119 if (strncmp(str, "s3_mode", 7) == 0)
120 acpi_video_flags |= 2;
121 str = strchr(str, ',');
122 if (str != NULL)
123 str += strspn(str, ", \t");
124 }
125 return 1;
126}
127
128__setup("acpi_sleep=", acpi_sleep_setup);
129
130#endif /*CONFIG_ACPI_SLEEP*/
131
132void acpi_pci_link_exit(void) {}
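
The acpi_sleep_setup() handler above parses a comma-separated boot option, so booting with acpi_sleep=s3_bios,s3_mode leaves acpi_video_flags equal to 3. A standalone harness reproducing the loop, for illustration only (hypothetical, not kernel code):

#include <stdio.h>
#include <string.h>

/* Mirror of the flag parsing in acpi_sleep_setup() above. */
static unsigned long parse_acpi_sleep(const char *str)
{
	unsigned long flags = 0;

	while (str != NULL && *str != '\0') {
		if (strncmp(str, "s3_bios", 7) == 0)
			flags = 1;	/* plain assignment, as in the original */
		if (strncmp(str, "s3_mode", 7) == 0)
			flags |= 2;
		str = strchr(str, ',');
		if (str != NULL)
			str += strspn(str, ", \t");
	}
	return flags;
}

int main(void)
{
	printf("%lu\n", parse_acpi_sleep("s3_bios"));		/* prints 1 */
	printf("%lu\n", parse_acpi_sleep("s3_bios,s3_mode"));	/* prints 3 */
	return 0;
}
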
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
new file mode 100644
index 000000000000..a4c630034cd4
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -0,0 +1,527 @@
1.text
2#include <linux/linkage.h>
3#include <asm/segment.h>
4#include <asm/page.h>
5#include <asm/msr.h>
6
7# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
8#
9# wakeup_code runs in real mode, and at an unknown address (determined at run-time).
10# Therefore it must only use relative jumps/calls.
11#
12# Do we need to deal with A20? It is okay: the ACPI spec says A20 must be enabled
13#
14# If the physical address of wakeup_code is 0x12345, the BIOS should call us with
15# cs = 0x1234, eip = 0x05
16#
17
18
19ALIGN
20 .align 16
21ENTRY(wakeup_start)
22wakeup_code:
23 wakeup_code_start = .
24 .code16
25
26# Running in *copy* of this code, somewhere in low 1MB.
27
28 movb $0xa1, %al ; outb %al, $0x80
29 cli
30 cld
31 # setup data segment
32 movw %cs, %ax
33 movw %ax, %ds # Make ds:0 point to wakeup_start
34 movw %ax, %ss
35 mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board
36
37 pushl $0 # Kill any dangerous flags
38 popfl
39
40 movl real_magic - wakeup_code, %eax
41 cmpl $0x12345678, %eax
42 jne bogus_real_magic
43
44 testl $1, video_flags - wakeup_code
45 jz 1f
46 lcall $0xc000,$3
47 movw %cs, %ax
48 movw %ax, %ds # Bios might have played with that
49 movw %ax, %ss
501:
51
52 testl $2, video_flags - wakeup_code
53 jz 1f
54 mov video_mode - wakeup_code, %ax
55 call mode_seta
561:
57
58 movw $0xb800, %ax
59 movw %ax,%fs
60 movw $0x0e00 + 'L', %fs:(0x10)
61
62 movb $0xa2, %al ; outb %al, $0x80
63
64 lidt %ds:idt_48a - wakeup_code
65 xorl %eax, %eax
66 movw %ds, %ax # (Convert %ds:gdt to a linear ptr)
67 shll $4, %eax
68 addl $(gdta - wakeup_code), %eax
69 movl %eax, gdt_48a +2 - wakeup_code
70 lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is
71 # appropriate
72
73 movl $1, %eax # protected mode (PE) bit
74 lmsw %ax # This is it!
75 jmp 1f
761:
77
78 .byte 0x66, 0xea # prefix + jmpi-opcode
79 .long wakeup_32 - __START_KERNEL_map
80 .word __KERNEL_CS
81
82 .code32
83wakeup_32:
84# Running in this code, but at low address; paging is not yet turned on.
85 movb $0xa5, %al ; outb %al, $0x80
86
87 /* Check if extended functions are implemented */
88 movl $0x80000000, %eax
89 cpuid
90 cmpl $0x80000000, %eax
91 jbe bogus_cpu
92 wbinvd
93 mov $0x80000001, %eax
94 cpuid
95 btl $29, %edx
96 jnc bogus_cpu
97 movl %edx,%edi
98
99 movw $__KERNEL_DS, %ax
100 movw %ax, %ds
101 movw %ax, %es
102 movw %ax, %fs
103 movw %ax, %gs
104
105 movw $__KERNEL_DS, %ax
106 movw %ax, %ss
107
108 mov $(wakeup_stack - __START_KERNEL_map), %esp
109 movl saved_magic - __START_KERNEL_map, %eax
110 cmpl $0x9abcdef0, %eax
111 jne bogus_32_magic
112
113 /*
114 * Prepare for entering 64bits mode
115 */
116
117 /* Enable PAE mode and PGE */
118 xorl %eax, %eax
119 btsl $5, %eax
120 btsl $7, %eax
121 movl %eax, %cr4
122
123 /* Setup early boot stage 4 level pagetables */
124 movl $(wakeup_level4_pgt - __START_KERNEL_map), %eax
125 movl %eax, %cr3
126
127 /* Setup EFER (Extended Feature Enable Register) */
128 movl $MSR_EFER, %ecx
129 rdmsr
130 /* Fool rdmsr and reset %eax to avoid dependences */
131 xorl %eax, %eax
132 /* Enable Long Mode */
133 btsl $_EFER_LME, %eax
134 /* Enable System Call */
135 btsl $_EFER_SCE, %eax
136
137 /* No Execute supported? */
138 btl $20,%edi
139 jnc 1f
140 btsl $_EFER_NX, %eax
1411:
142
143 /* Make changes effective */
144 wrmsr
145 wbinvd
146
147 xorl %eax, %eax
148 btsl $31, %eax /* Enable paging and in turn activate Long Mode */
149 btsl $0, %eax /* Enable protected mode */
150 btsl $1, %eax /* Enable MP */
151 btsl $4, %eax /* Enable ET */
152 btsl $5, %eax /* Enable NE */
153 btsl $16, %eax /* Enable WP */
154 btsl $18, %eax /* Enable AM */
155
156 /* Make changes effective */
157 movl %eax, %cr0
158 /* At this point:
159 CR4.PAE must be 1
160 CS.L must be 0
161 CR3 must point to PML4
162 Next instruction must be a branch
163 This must be on identity-mapped page
164 */
165 jmp reach_compatibility_mode
166reach_compatibility_mode:
167 movw $0x0e00 + 'i', %ds:(0xb8012)
168 movb $0xa8, %al ; outb %al, $0x80;
169
170 /*
171 * At this point we're in long mode but in 32bit compatibility mode
172 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
173	 * EFER.LMA = 1). Now we want to jump into 64bit mode; to do that we load
174 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
175 */
176
177 movw $0x0e00 + 'n', %ds:(0xb8014)
178 movb $0xa9, %al ; outb %al, $0x80
179
180 /* Load new GDT with the 64bit segment using 32bit descriptor */
181 movl $(pGDT32 - __START_KERNEL_map), %eax
182 lgdt (%eax)
183
184 movl $(wakeup_jumpvector - __START_KERNEL_map), %eax
185 /* Finally jump in 64bit mode */
186 ljmp *(%eax)
187
188wakeup_jumpvector:
189 .long wakeup_long64 - __START_KERNEL_map
190 .word __KERNEL_CS
191
192.code64
193
194 /* Hooray, we are in Long 64-bit mode (but still running in low memory) */
195wakeup_long64:
196 /*
197 * We must switch to a new descriptor in kernel space for the GDT
198	 * because the kernel will soon no longer have access to the userspace
199	 * addresses we're currently running at. We have to do that here
200 * because in 32bit we couldn't load a 64bit linear address.
201 */
202 lgdt cpu_gdt_descr - __START_KERNEL_map
203
204 movw $0x0e00 + 'u', %ds:(0xb8016)
205
206 nop
207 nop
208 movw $__KERNEL_DS, %ax
209 movw %ax, %ss
210 movw %ax, %ds
211 movw %ax, %es
212 movw %ax, %fs
213 movw %ax, %gs
214 movq saved_esp, %rsp
215
216 movw $0x0e00 + 'x', %ds:(0xb8018)
217 movq saved_ebx, %rbx
218 movq saved_edi, %rdi
219 movq saved_esi, %rsi
220 movq saved_ebp, %rbp
221
222 movw $0x0e00 + '!', %ds:(0xb801a)
223 movq saved_eip, %rax
224 jmp *%rax
225
226.code32
227
228 .align 64
229gdta:
230 .word 0, 0, 0, 0 # dummy
231
232 .word 0, 0, 0, 0 # unused
233
234 .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
235 .word 0 # base address = 0
236	.word 0x9B00			# code read/exec. ??? Why do I need 0x9B00 (as opposed to 0x9A00) for this to work?
237 .word 0x00CF # granularity = 4096, 386
238 # (+5th nibble of limit)
239
240 .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
241 .word 0 # base address = 0
242 .word 0x9200 # data read/write
243 .word 0x00CF # granularity = 4096, 386
244 # (+5th nibble of limit)
245# this is 64bit descriptor for code
246 .word 0xFFFF
247 .word 0
248 .word 0x9A00 # code read/exec
249 .word 0x00AF # as above, but it is long mode and with D=0
250
251idt_48a:
252 .word 0 # idt limit = 0
253 .word 0, 0 # idt base = 0L
254
255gdt_48a:
256 .word 0x8000 # gdt limit=2048,
257 # 256 GDT entries
258 .word 0, 0 # gdt base (filled in later)
259
260
261real_save_gdt: .word 0
262 .quad 0
263real_magic: .quad 0
264video_mode: .quad 0
265video_flags: .quad 0
266
267bogus_real_magic:
268 movb $0xba,%al ; outb %al,$0x80
269 jmp bogus_real_magic
270
271bogus_32_magic:
272 movb $0xb3,%al ; outb %al,$0x80
273 jmp bogus_32_magic
274
275bogus_31_magic:
276 movb $0xb1,%al ; outb %al,$0x80
277 jmp bogus_31_magic
278
279bogus_cpu:
280 movb $0xbc,%al ; outb %al,$0x80
281 jmp bogus_cpu
282
283
284/* This code uses an extended set of video mode numbers. These include:
285 * Aliases for standard modes
286 * NORMAL_VGA (-1)
287 * EXTENDED_VGA (-2)
288 * ASK_VGA (-3)
289 * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
290 * of compatibility when extending the table. These are between 0x00 and 0xff.
291 */
292#define VIDEO_FIRST_MENU 0x0000
293
294/* Standard BIOS video modes (BIOS number + 0x0100) */
295#define VIDEO_FIRST_BIOS 0x0100
296
297/* VESA BIOS video modes (VESA number + 0x0200) */
298#define VIDEO_FIRST_VESA 0x0200
299
300/* Video7 special modes (BIOS number + 0x0900) */
301#define VIDEO_FIRST_V7 0x0900
302
303# Setting of user mode (AX=mode ID) => CF=success
304mode_seta:
305 movw %ax, %bx
306#if 0
307 cmpb $0xff, %ah
308 jz setalias
309
310 testb $VIDEO_RECALC>>8, %ah
311 jnz _setrec
312
313 cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah
314 jnc setres
315
316 cmpb $VIDEO_FIRST_SPECIAL>>8, %ah
317 jz setspc
318
319 cmpb $VIDEO_FIRST_V7>>8, %ah
320 jz setv7
321#endif
322
323 cmpb $VIDEO_FIRST_VESA>>8, %ah
324 jnc check_vesaa
325#if 0
326 orb %ah, %ah
327 jz setmenu
328#endif
329
330 decb %ah
331# jz setbios Add bios modes later
332
333setbada: clc
334 ret
335
336check_vesaa:
337 subb $VIDEO_FIRST_VESA>>8, %bh
338 orw $0x4000, %bx # Use linear frame buffer
339 movw $0x4f02, %ax # VESA BIOS mode set call
340 int $0x10
341 cmpw $0x004f, %ax # AL=4f if implemented
342 jnz _setbada # AH=0 if OK
343
344 stc
345 ret
346
347_setbada: jmp setbada
348
349 .code64
350bogus_magic:
351 movw $0x0e00 + 'B', %ds:(0xb8018)
352 jmp bogus_magic
353
354bogus_magic2:
355 movw $0x0e00 + '2', %ds:(0xb8018)
356 jmp bogus_magic2
357
358
359wakeup_stack_begin: # Stack grows down
360
361.org 0xff0
362wakeup_stack: # Just below end of page
363
364ENTRY(wakeup_end)
365
366##
367# acpi_copy_wakeup_routine
368#
369# Copy the above routine to low memory.
370#
371# Parameters:
372# %rdi: place to copy wakeup routine to
373#
374# Returned address is location of code in low memory (past data and stack)
375#
376ENTRY(acpi_copy_wakeup_routine)
377 pushq %rax
378 pushq %rcx
379 pushq %rdx
380
381 sgdt saved_gdt
382 sidt saved_idt
383 sldt saved_ldt
384 str saved_tss
385
386 movq %cr3, %rdx
387 movq %rdx, saved_cr3
388 movq %cr4, %rdx
389 movq %rdx, saved_cr4
390 movq %cr0, %rdx
391 movq %rdx, saved_cr0
392 sgdt real_save_gdt - wakeup_start (,%rdi)
393 movl $MSR_EFER, %ecx
394 rdmsr
395 movl %eax, saved_efer
396 movl %edx, saved_efer2
397
398 movl saved_video_mode, %edx
399 movl %edx, video_mode - wakeup_start (,%rdi)
400 movl acpi_video_flags, %edx
401 movl %edx, video_flags - wakeup_start (,%rdi)
402 movq $0x12345678, real_magic - wakeup_start (,%rdi)
403 movq $0x123456789abcdef0, %rdx
404 movq %rdx, saved_magic
405
406 movl saved_magic - __START_KERNEL_map, %eax
407 cmpl $0x9abcdef0, %eax
408 jne bogus_32_magic
409
410 # make sure %cr4 is set correctly (features, etc)
411 movl saved_cr4 - __START_KERNEL_map, %eax
412 movq %rax, %cr4
413
414 movl saved_cr0 - __START_KERNEL_map, %eax
415 movq %rax, %cr0
416 jmp 1f # Flush pipelines
4171:
418 # restore the regs we used
419 popq %rdx
420 popq %rcx
421 popq %rax
422ENTRY(do_suspend_lowlevel_s4bios)
423 ret
424
425 .align 2
426 .p2align 4,,15
427.globl do_suspend_lowlevel
428 .type do_suspend_lowlevel,@function
429do_suspend_lowlevel:
430.LFB5:
431 subq $8, %rsp
432 xorl %eax, %eax
433 call save_processor_state
434
435 movq %rsp, saved_context_esp(%rip)
436 movq %rax, saved_context_eax(%rip)
437 movq %rbx, saved_context_ebx(%rip)
438 movq %rcx, saved_context_ecx(%rip)
439 movq %rdx, saved_context_edx(%rip)
440 movq %rbp, saved_context_ebp(%rip)
441 movq %rsi, saved_context_esi(%rip)
442 movq %rdi, saved_context_edi(%rip)
443 movq %r8, saved_context_r08(%rip)
444 movq %r9, saved_context_r09(%rip)
445 movq %r10, saved_context_r10(%rip)
446 movq %r11, saved_context_r11(%rip)
447 movq %r12, saved_context_r12(%rip)
448 movq %r13, saved_context_r13(%rip)
449 movq %r14, saved_context_r14(%rip)
450 movq %r15, saved_context_r15(%rip)
451 pushfq ; popq saved_context_eflags(%rip)
452
453 movq $.L97, saved_eip(%rip)
454
455 movq %rsp,saved_esp
456 movq %rbp,saved_ebp
457 movq %rbx,saved_ebx
458 movq %rdi,saved_edi
459 movq %rsi,saved_esi
460
461 addq $8, %rsp
462 movl $3, %edi
463 xorl %eax, %eax
464 jmp acpi_enter_sleep_state
465.L97:
466 .p2align 4,,7
467.L99:
468 .align 4
469 movl $24, %eax
470 movw %ax, %ds
471 movq saved_context+58(%rip), %rax
472 movq %rax, %cr4
473 movq saved_context+50(%rip), %rax
474 movq %rax, %cr3
475 movq saved_context+42(%rip), %rax
476 movq %rax, %cr2
477 movq saved_context+34(%rip), %rax
478 movq %rax, %cr0
479 pushq saved_context_eflags(%rip) ; popfq
480 movq saved_context_esp(%rip), %rsp
481 movq saved_context_ebp(%rip), %rbp
482 movq saved_context_eax(%rip), %rax
483 movq saved_context_ebx(%rip), %rbx
484 movq saved_context_ecx(%rip), %rcx
485 movq saved_context_edx(%rip), %rdx
486 movq saved_context_esi(%rip), %rsi
487 movq saved_context_edi(%rip), %rdi
488 movq saved_context_r08(%rip), %r8
489 movq saved_context_r09(%rip), %r9
490 movq saved_context_r10(%rip), %r10
491 movq saved_context_r11(%rip), %r11
492 movq saved_context_r12(%rip), %r12
493 movq saved_context_r13(%rip), %r13
494 movq saved_context_r14(%rip), %r14
495 movq saved_context_r15(%rip), %r15
496
497 xorl %eax, %eax
498 addq $8, %rsp
499 jmp restore_processor_state
500.LFE5:
501.Lfe5:
502 .size do_suspend_lowlevel,.Lfe5-do_suspend_lowlevel
503
504.data
505ALIGN
506ENTRY(saved_ebp) .quad 0
507ENTRY(saved_esi) .quad 0
508ENTRY(saved_edi) .quad 0
509ENTRY(saved_ebx) .quad 0
510
511ENTRY(saved_eip) .quad 0
512ENTRY(saved_esp) .quad 0
513
514ENTRY(saved_magic) .quad 0
515
516ALIGN
517# saved registers
518saved_gdt: .quad 0,0
519saved_idt: .quad 0,0
520saved_ldt: .quad 0
521saved_tss: .quad 0
522
523saved_cr0: .quad 0
524saved_cr3: .quad 0
525saved_cr4: .quad 0
526saved_efer: .quad 0
527saved_efer2: .quad 0
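
The header of wakeup.S notes that a copy at physical address 0x12345 is entered with cs = 0x1234 and eip = 0x05: real mode resolves cs:ip as (cs << 4) + ip, so any sub-1MB physical address splits into a paragraph-aligned segment and a small offset. A standalone sketch of that split (illustrative only, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long phys = 0x12345;	/* example from the comment above */
	unsigned int cs = phys >> 4;	/* segment: paragraph number */
	unsigned int ip = phys & 0xf;	/* offset within the paragraph */

	printf("phys %#lx -> cs = %#x, ip = %#04x\n", phys, cs, ip);
	printf("check: (cs << 4) + ip = %#lx\n",
	       ((unsigned long)cs << 4) + ip);
	return 0;
}
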
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
new file mode 100644
index 000000000000..4baa99fe1e5c
--- /dev/null
+++ b/arch/x86_64/kernel/aperture.c
@@ -0,0 +1,286 @@
1/*
2 * Firmware replacement code.
3 *
4 * Work around broken BIOSes that don't set an aperture or only set the
5 * aperture in the AGP bridge.
6 * If all else fails, map the aperture over some low memory. This is cheaper than
7 * doing bounce buffering. The memory is lost. This is done at early boot
8 * because only the bootmem allocator can allocate 32+MB.
9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $
12 */
13#include <linux/config.h>
14#include <linux/kernel.h>
15#include <linux/types.h>
16#include <linux/init.h>
17#include <linux/bootmem.h>
18#include <linux/mmzone.h>
19#include <linux/pci_ids.h>
20#include <linux/pci.h>
21#include <linux/bitops.h>
22#include <asm/e820.h>
23#include <asm/io.h>
24#include <asm/proto.h>
25#include <asm/pci-direct.h>
26
27int iommu_aperture;
28int iommu_aperture_disabled __initdata = 0;
29int iommu_aperture_allowed __initdata = 0;
30
31int fallback_aper_order __initdata = 1; /* 64MB */
32int fallback_aper_force __initdata = 0;
33
34int fix_aperture __initdata = 1;
35
36/* This code runs before the PCI subsystem is initialized, so just
37 access the northbridge directly. */
38
39#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16))
40
41static u32 __init allocate_aperture(void)
42{
43#ifdef CONFIG_DISCONTIGMEM
44 pg_data_t *nd0 = NODE_DATA(0);
45#else
46 pg_data_t *nd0 = &contig_page_data;
47#endif
48 u32 aper_size;
49 void *p;
50
51 if (fallback_aper_order > 7)
52 fallback_aper_order = 7;
53 aper_size = (32 * 1024 * 1024) << fallback_aper_order;
54
55 /*
56	 * Aperture has to be naturally aligned. This means a 2GB aperture won't
57	 * have much chance of finding a place in the lower 4GB of memory.
58 * Unfortunately we cannot move it up because that would make the
59 * IOMMU useless.
60 */
61 p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0);
62 if (!p || __pa(p)+aper_size > 0xffffffff) {
63 printk("Cannot allocate aperture memory hole (%p,%uK)\n",
64 p, aper_size>>10);
65 if (p)
66 free_bootmem_node(nd0, (unsigned long)p, aper_size);
67 return 0;
68 }
69 printk("Mapping aperture over %d KB of RAM @ %lx\n",
70 aper_size >> 10, __pa(p));
71 return (u32)__pa(p);
72}
73
74static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size)
75{
76 if (!aper_base)
77 return 0;
78 if (aper_size < 64*1024*1024) {
79 printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20);
80 return 0;
81 }
82 if (aper_base + aper_size >= 0xffffffff) {
83 printk("Aperture from %s beyond 4GB. Ignoring.\n",name);
84 return 0;
85 }
86 if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
87 printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name);
88 return 0;
89 }
90 return 1;
91}
92
93/* Find a PCI capability */
94static __u32 __init find_cap(int num, int slot, int func, int cap)
95{
96 u8 pos;
97 int bytes;
98 if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST))
99 return 0;
100 pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST);
101 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
102 u8 id;
103 pos &= ~3;
104 id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID);
105 if (id == 0xff)
106 break;
107 if (id == cap)
108 return pos;
109 pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT);
110 }
111 return 0;
112}
113
114/* Read a standard AGPv3 bridge header */
115static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
116{
117 u32 apsize;
118 u32 apsizereg;
119 int nbits;
120 u32 aper_low, aper_hi;
121 u64 aper;
122
123 printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func);
124 apsizereg = read_pci_config_16(num,slot,func, cap + 0x14);
125 if (apsizereg == 0xffffffff) {
126 printk("APSIZE in AGP bridge unreadable\n");
127 return 0;
128 }
129
130 apsize = apsizereg & 0xfff;
131	/* Some BIOSes use weird encodings not in the AGPv3 table. */
132 if (apsize & 0xff)
133 apsize |= 0xf00;
134 nbits = hweight16(apsize);
135 *order = 7 - nbits;
136 if ((int)*order < 0) /* < 32MB */
137 *order = 0;
138
139 aper_low = read_pci_config(num,slot,func, 0x10);
140 aper_hi = read_pci_config(num,slot,func,0x14);
141 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
142
143 printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
144 aper, 32 << *order, apsizereg);
145
146 if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order))
147 return 0;
148 return (u32)aper;
149}
150
151/* Look for an AGP bridge. Windows only expects the aperture in the
152   AGP bridge, and some BIOSes forget to initialize the Northbridge too.
153   Work around this here.
154
155   Do a PCI bus scan by hand because we're running before the PCI
156   subsystem.
157
158   All K8 AGP bridges are AGPv3 compliant, so we can do this scan
159   generically. It's probably overkill to always scan all slots because
160   the AGP bridges should always be on their own bus in the HT hierarchy,
161 but do it here for future safety. */
162static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
163{
164 int num, slot, func;
165
166 /* Poor man's PCI discovery */
167 for (num = 0; num < 32; num++) {
168 for (slot = 0; slot < 32; slot++) {
169 for (func = 0; func < 8; func++) {
170 u32 class, cap;
171 u8 type;
172 class = read_pci_config(num,slot,func,
173 PCI_CLASS_REVISION);
174 if (class == 0xffffffff)
175 break;
176
177 switch (class >> 16) {
178 case PCI_CLASS_BRIDGE_HOST:
179 case PCI_CLASS_BRIDGE_OTHER: /* needed? */
180 /* AGP bridge? */
181 cap = find_cap(num,slot,func,PCI_CAP_ID_AGP);
182 if (!cap)
183 break;
184 *valid_agp = 1;
185 return read_agp(num,slot,func,cap,order);
186 }
187
188 /* No multi-function device? */
189 type = read_pci_config_byte(num,slot,func,
190 PCI_HEADER_TYPE);
191 if (!(type & 0x80))
192 break;
193 }
194 }
195 }
196 printk("No AGP bridge found\n");
197 return 0;
198}
199
200void __init iommu_hole_init(void)
201{
202 int fix, num;
203 u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0;
204 u64 aper_base, last_aper_base = 0;
205 int valid_agp = 0;
206
207 if (iommu_aperture_disabled || !fix_aperture)
208 return;
209
210 printk("Checking aperture...\n");
211
212 fix = 0;
213 for (num = 24; num < 32; num++) {
214 char name[30];
215 if (read_pci_config(0, num, 3, 0x00) != NB_ID_3)
216 continue;
217
218 iommu_aperture = 1;
219
220 aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
221 aper_size = (32 * 1024 * 1024) << aper_order;
222 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
223 aper_base <<= 25;
224
225 printk("CPU %d: aperture @ %Lx size %u MB\n", num-24,
226 aper_base, aper_size>>20);
227
228 sprintf(name, "northbridge cpu %d", num-24);
229
230 if (!aperture_valid(name, aper_base, aper_size)) {
231 fix = 1;
232 break;
233 }
234
235 if ((last_aper_order && aper_order != last_aper_order) ||
236 (last_aper_base && aper_base != last_aper_base)) {
237 fix = 1;
238 break;
239 }
240 last_aper_order = aper_order;
241 last_aper_base = aper_base;
242 }
243
244 if (!fix && !fallback_aper_force)
245 return;
246
247 if (!fallback_aper_force)
248 aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
249
250 if (aper_alloc) {
251 /* Got the aperture from the AGP bridge */
252 } else if ((!no_iommu && end_pfn >= 0xffffffff>>PAGE_SHIFT) ||
253 force_iommu ||
254 valid_agp ||
255 fallback_aper_force) {
256		printk("Your BIOS doesn't leave an aperture memory hole\n");
257 printk("Please enable the IOMMU option in the BIOS setup\n");
258 printk("This costs you %d MB of RAM\n",
259 32 << fallback_aper_order);
260
261 aper_order = fallback_aper_order;
262 aper_alloc = allocate_aperture();
263 if (!aper_alloc) {
264 /* Could disable AGP and IOMMU here, but it's probably
265 not worth it. But the later users cannot deal with
266 bad apertures and turning on the aperture over memory
267 causes very strange problems, so it's better to
268 panic early. */
269 panic("Not enough memory for aperture");
270 }
271 } else {
272 return;
273 }
274
275 /* Fix up the north bridges */
276 for (num = 24; num < 32; num++) {
277 if (read_pci_config(0, num, 3, 0x00) != NB_ID_3)
278 continue;
279
280 /* Don't enable translation yet. That is done later.
281 Assume this BIOS didn't initialise the GART so
282 just overwrite all previous bits */
283 write_pci_config(0, num, 3, 0x90, aper_order<<1);
284 write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
285 }
286}
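
For reference, the register layout iommu_hole_init() reads back from each K8 northbridge: the aperture order sits in bits 3:1 of config register 0x90 (size = 32MB << order), and register 0x94 holds the base in 32MB granules, i.e. shifted left by 25 when decoded. A standalone sketch with made-up register values (not kernel API):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical values as read_pci_config() might return them. */
	uint32_t reg90 = 0x00000002;	/* order field in bits 3:1 -> order 1 */
	uint32_t reg94 = 0x00000020;	/* base in 32MB units */

	uint32_t order = (reg90 >> 1) & 7;
	uint32_t size  = (32u * 1024 * 1024) << order;
	uint64_t base  = (uint64_t)(reg94 & 0x7fff) << 25;

	/* prints: aperture @ 0x40000000 size 64 MB */
	printf("aperture @ %#llx size %u MB\n",
	       (unsigned long long)base, size >> 20);
	return 0;
}
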
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
new file mode 100644
index 000000000000..9826f03dc9a3
--- /dev/null
+++ b/arch/x86_64/kernel/apic.c
@@ -0,0 +1,1088 @@
1/*
2 * Local APIC handling, local APIC timers
3 *
4 * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes
7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
8 * thanks to Eric Gilmore
9 * and Rolf G. Tews
10 * for testing these extensively.
11 * Maciej W. Rozycki : Various updates and fixes.
12 * Mikael Pettersson : Power Management for UP-APIC.
13 * Pavel Machek and
14 * Mikael Pettersson : PM converted to driver model.
15 */
16
17#include <linux/config.h>
18#include <linux/init.h>
19
20#include <linux/mm.h>
21#include <linux/irq.h>
22#include <linux/delay.h>
23#include <linux/bootmem.h>
24#include <linux/smp_lock.h>
25#include <linux/interrupt.h>
26#include <linux/mc146818rtc.h>
27#include <linux/kernel_stat.h>
28#include <linux/sysdev.h>
29
30#include <asm/atomic.h>
31#include <asm/smp.h>
32#include <asm/mtrr.h>
33#include <asm/mpspec.h>
34#include <asm/pgalloc.h>
35#include <asm/mach_apic.h>
36
37int apic_verbosity;
38
39int disable_apic_timer __initdata;
40
41/* Using APIC to generate smp_local_timer_interrupt? */
42int using_apic_timer = 0;
43
44static DEFINE_PER_CPU(int, prof_multiplier) = 1;
45static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
46static DEFINE_PER_CPU(int, prof_counter) = 1;
47
48static void apic_pm_activate(void);
49
50void enable_NMI_through_LVT0 (void * dummy)
51{
52 unsigned int v, ver;
53
54 ver = apic_read(APIC_LVR);
55 ver = GET_APIC_VERSION(ver);
56 v = APIC_DM_NMI; /* unmask and set to NMI */
57 apic_write_around(APIC_LVT0, v);
58}
59
60int get_maxlvt(void)
61{
62 unsigned int v, ver, maxlvt;
63
64 v = apic_read(APIC_LVR);
65 ver = GET_APIC_VERSION(v);
66 maxlvt = GET_APIC_MAXLVT(v);
67 return maxlvt;
68}
69
70void clear_local_APIC(void)
71{
72 int maxlvt;
73 unsigned int v;
74
75 maxlvt = get_maxlvt();
76
77 /*
78 * Masking an LVT entry on a P6 can trigger a local APIC error
79 * if the vector is zero. Mask LVTERR first to prevent this.
80 */
81 if (maxlvt >= 3) {
82 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
83 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
84 }
85 /*
86 * Careful: we have to set masks only first to deassert
87 * any level-triggered sources.
88 */
89 v = apic_read(APIC_LVTT);
90 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
91 v = apic_read(APIC_LVT0);
92 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
93 v = apic_read(APIC_LVT1);
94 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
95 if (maxlvt >= 4) {
96 v = apic_read(APIC_LVTPC);
97 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
98 }
99
100 /*
101 * Clean APIC state for other OSs:
102 */
103 apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
104 apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
105 apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
106 if (maxlvt >= 3)
107 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
108 if (maxlvt >= 4)
109 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
110 v = GET_APIC_VERSION(apic_read(APIC_LVR));
111 if (APIC_INTEGRATED(v)) { /* !82489DX */
112 if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */
113 apic_write(APIC_ESR, 0);
114 apic_read(APIC_ESR);
115 }
116}
117
118void __init connect_bsp_APIC(void)
119{
120 if (pic_mode) {
121 /*
122 * Do not trust the local APIC being empty at bootup.
123 */
124 clear_local_APIC();
125 /*
126 * PIC mode, enable APIC mode in the IMCR, i.e.
127 * connect BSP's local APIC to INT and NMI lines.
128 */
129 apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n");
130 outb(0x70, 0x22);
131 outb(0x01, 0x23);
132 }
133}
134
135void disconnect_bsp_APIC(void)
136{
137 if (pic_mode) {
138 /*
139 * Put the board back into PIC mode (has an effect
140 * only on certain older boards). Note that APIC
141 * interrupts, including IPIs, won't work beyond
142		 * this point! The only exceptions are INIT IPIs.
143 */
144 apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n");
145 outb(0x70, 0x22);
146 outb(0x00, 0x23);
147 }
148}
149
150void disable_local_APIC(void)
151{
152 unsigned int value;
153
154 clear_local_APIC();
155
156 /*
157 * Disable APIC (implies clearing of registers
158 * for 82489DX!).
159 */
160 value = apic_read(APIC_SPIV);
161 value &= ~APIC_SPIV_APIC_ENABLED;
162 apic_write_around(APIC_SPIV, value);
163}
164
165/*
166 * This is to verify that we're looking at a real local APIC.
167 * Check these against your board if the CPUs aren't getting
168 * started for no apparent reason.
169 */
170int __init verify_local_APIC(void)
171{
172 unsigned int reg0, reg1;
173
174 /*
175 * The version register is read-only in a real APIC.
176 */
177 reg0 = apic_read(APIC_LVR);
178 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
179 apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
180 reg1 = apic_read(APIC_LVR);
181 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
182
183 /*
184 * The two version reads above should print the same
185 * numbers. If the second one is different, then we
186 * poke at a non-APIC.
187 */
188 if (reg1 != reg0)
189 return 0;
190
191 /*
192	 * Check if the version looks reasonable.
193 */
194 reg1 = GET_APIC_VERSION(reg0);
195 if (reg1 == 0x00 || reg1 == 0xff)
196 return 0;
197 reg1 = get_maxlvt();
198 if (reg1 < 0x02 || reg1 == 0xff)
199 return 0;
200
201 /*
202 * The ID register is read/write in a real APIC.
203 */
204 reg0 = apic_read(APIC_ID);
205 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
206 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
207 reg1 = apic_read(APIC_ID);
208 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
209 apic_write(APIC_ID, reg0);
210 if (reg1 != (reg0 ^ APIC_ID_MASK))
211 return 0;
212
213 /*
214 * The next two are just to see if we have sane values.
215 * They're only really relevant if we're in Virtual Wire
216	 * compatibility mode, but most boxes aren't anymore.
217 */
218 reg0 = apic_read(APIC_LVT0);
219 apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0);
220 reg1 = apic_read(APIC_LVT1);
221 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
222
223 return 1;
224}
225
226void __init sync_Arb_IDs(void)
227{
228 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
229 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
230 if (ver >= 0x14) /* P4 or higher */
231 return;
232
233 /*
234 * Wait for idle.
235 */
236 apic_wait_icr_idle();
237
238 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
239 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
240 | APIC_DM_INIT);
241}
242
243extern void __error_in_apic_c (void);
244
245/*
246 * An initial setup of the virtual wire mode.
247 */
248void __init init_bsp_APIC(void)
249{
250 unsigned int value, ver;
251
252 /*
253 * Don't do the setup now if we have a SMP BIOS as the
254 * through-I/O-APIC virtual wire mode might be active.
255 */
256 if (smp_found_config || !cpu_has_apic)
257 return;
258
259 value = apic_read(APIC_LVR);
260 ver = GET_APIC_VERSION(value);
261
262 /*
263 * Do not trust the local APIC being empty at bootup.
264 */
265 clear_local_APIC();
266
267 /*
268 * Enable APIC.
269 */
270 value = apic_read(APIC_SPIV);
271 value &= ~APIC_VECTOR_MASK;
272 value |= APIC_SPIV_APIC_ENABLED;
273 value |= APIC_SPIV_FOCUS_DISABLED;
274 value |= SPURIOUS_APIC_VECTOR;
275 apic_write_around(APIC_SPIV, value);
276
277 /*
278 * Set up the virtual wire mode.
279 */
280 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
281 value = APIC_DM_NMI;
282 if (!APIC_INTEGRATED(ver)) /* 82489DX */
283 value |= APIC_LVT_LEVEL_TRIGGER;
284 apic_write_around(APIC_LVT1, value);
285}
286
287void __init setup_local_APIC (void)
288{
289 unsigned int value, ver, maxlvt;
290
291 /* Pound the ESR really hard over the head with a big hammer - mbligh */
292 if (esr_disable) {
293 apic_write(APIC_ESR, 0);
294 apic_write(APIC_ESR, 0);
295 apic_write(APIC_ESR, 0);
296 apic_write(APIC_ESR, 0);
297 }
298
299 value = apic_read(APIC_LVR);
300 ver = GET_APIC_VERSION(value);
301
302 if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
303 __error_in_apic_c();
304
305 /*
306 * Double-check whether this APIC is really registered.
307 * This is meaningless in clustered apic mode, so we skip it.
308 */
309 if (!apic_id_registered())
310 BUG();
311
312 /*
313	 * Intel recommends setting DFR, LDR and TPR before enabling
314 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
315 * document number 292116). So here it goes...
316 */
317 init_apic_ldr();
318
319 /*
320 * Set Task Priority to 'accept all'. We never change this
321 * later on.
322 */
323 value = apic_read(APIC_TASKPRI);
324 value &= ~APIC_TPRI_MASK;
325 apic_write_around(APIC_TASKPRI, value);
326
327 /*
328 * Now that we are all set up, enable the APIC
329 */
330 value = apic_read(APIC_SPIV);
331 value &= ~APIC_VECTOR_MASK;
332 /*
333 * Enable APIC
334 */
335 value |= APIC_SPIV_APIC_ENABLED;
336
337 /*
338 * Some unknown Intel IO/APIC (or APIC) errata is biting us with
339 * certain networking cards. If high frequency interrupts are
340 * happening on a particular IOAPIC pin, plus the IOAPIC routing
341 * entry is masked/unmasked at a high rate as well then sooner or
342 * later IOAPIC line gets 'stuck', no more interrupts are received
343 * from the device. If focus CPU is disabled then the hang goes
344 * away, oh well :-(
345 *
346 * [ This bug can be reproduced easily with a level-triggered
347 * PCI Ne2000 networking cards and PII/PIII processors, dual
348 * BX chipset. ]
349 */
350 /*
351 * Actually disabling the focus CPU check just makes the hang less
352	 * frequent, as it makes the interrupt distribution model more
353 * like LRU than MRU (the short-term load is more even across CPUs).
354 * See also the comment in end_level_ioapic_irq(). --macro
355 */
356#if 1
357 /* Enable focus processor (bit==0) */
358 value &= ~APIC_SPIV_FOCUS_DISABLED;
359#else
360 /* Disable focus processor (bit==1) */
361 value |= APIC_SPIV_FOCUS_DISABLED;
362#endif
363 /*
364 * Set spurious IRQ vector
365 */
366 value |= SPURIOUS_APIC_VECTOR;
367 apic_write_around(APIC_SPIV, value);
368
369 /*
370 * Set up LVT0, LVT1:
371 *
372 * set up through-local-APIC on the BP's LINT0. This is not
373 * strictly necessary in pure symmetric-IO mode, but sometimes
374 * we delegate interrupts to the 8259A.
375 */
376 /*
377 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
378 */
379 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
380 if (!smp_processor_id() && (pic_mode || !value)) {
381 value = APIC_DM_EXTINT;
382 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id());
383 } else {
384 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
385 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id());
386 }
387 apic_write_around(APIC_LVT0, value);
388
389 /*
390 * only the BP should see the LINT1 NMI signal, obviously.
391 */
392 if (!smp_processor_id())
393 value = APIC_DM_NMI;
394 else
395 value = APIC_DM_NMI | APIC_LVT_MASKED;
396 if (!APIC_INTEGRATED(ver)) /* 82489DX */
397 value |= APIC_LVT_LEVEL_TRIGGER;
398 apic_write_around(APIC_LVT1, value);
399
400 if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */
401 unsigned oldvalue;
402 maxlvt = get_maxlvt();
403 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
404 apic_write(APIC_ESR, 0);
405 oldvalue = apic_read(APIC_ESR);
406		value = ERROR_APIC_VECTOR;	/* enables sending errors */
407 apic_write_around(APIC_LVTERR, value);
408 /*
409 * spec says clear errors after enabling vector.
410 */
411 if (maxlvt > 3)
412 apic_write(APIC_ESR, 0);
413 value = apic_read(APIC_ESR);
414 if (value != oldvalue)
415 apic_printk(APIC_VERBOSE,
416 "ESR value after enabling vector: %08x, after %08x\n",
417 oldvalue, value);
418 } else {
419 if (esr_disable)
420 /*
421			 * Something untraceable is creating bad interrupts on
422 * secondary quads ... for the moment, just leave the
423 * ESR disabled - we can't do anything useful with the
424 * errors anyway - mbligh
425 */
426 apic_printk(APIC_DEBUG, "Leaving ESR disabled.\n");
427 else
428 apic_printk(APIC_DEBUG, "No ESR for 82489DX.\n");
429 }
430
431 nmi_watchdog_default();
432 if (nmi_watchdog == NMI_LOCAL_APIC)
433 setup_apic_nmi_watchdog();
434 apic_pm_activate();
435}
436
437#ifdef CONFIG_PM
438
439static struct {
440 /* 'active' is true if the local APIC was enabled by us and
441 not the BIOS; this signifies that we are also responsible
442 for disabling it before entering apm/acpi suspend */
443 int active;
444 /* r/w apic fields */
445 unsigned int apic_id;
446 unsigned int apic_taskpri;
447 unsigned int apic_ldr;
448 unsigned int apic_dfr;
449 unsigned int apic_spiv;
450 unsigned int apic_lvtt;
451 unsigned int apic_lvtpc;
452 unsigned int apic_lvt0;
453 unsigned int apic_lvt1;
454 unsigned int apic_lvterr;
455 unsigned int apic_tmict;
456 unsigned int apic_tdcr;
457 unsigned int apic_thmr;
458} apic_pm_state;
459
460static int lapic_suspend(struct sys_device *dev, u32 state)
461{
462 unsigned long flags;
463
464 if (!apic_pm_state.active)
465 return 0;
466
467 apic_pm_state.apic_id = apic_read(APIC_ID);
468 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
469 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
470 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
471 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
472 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
473 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
474 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
475 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
476 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
477 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
478 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
479 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
480 local_save_flags(flags);
481 local_irq_disable();
482 disable_local_APIC();
483 local_irq_restore(flags);
484 return 0;
485}
486
487static int lapic_resume(struct sys_device *dev)
488{
489 unsigned int l, h;
490 unsigned long flags;
491
492 if (!apic_pm_state.active)
493 return 0;
494
495 /* XXX: Pavel needs this for S3 resume, but can't explain why */
496 set_fixmap_nocache(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
497
498 local_irq_save(flags);
499 rdmsr(MSR_IA32_APICBASE, l, h);
500 l &= ~MSR_IA32_APICBASE_BASE;
501 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
502 wrmsr(MSR_IA32_APICBASE, l, h);
503 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
504 apic_write(APIC_ID, apic_pm_state.apic_id);
505 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
506 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
507 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
508 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
509 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
510 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
511 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
512 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
513 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
514 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
515 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
516 apic_write(APIC_ESR, 0);
517 apic_read(APIC_ESR);
518 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
519 apic_write(APIC_ESR, 0);
520 apic_read(APIC_ESR);
521 local_irq_restore(flags);
522 return 0;
523}
524
525static struct sysdev_class lapic_sysclass = {
526 set_kset_name("lapic"),
527 .resume = lapic_resume,
528 .suspend = lapic_suspend,
529};
530
531static struct sys_device device_lapic = {
532 .id = 0,
533 .cls = &lapic_sysclass,
534};
535
536static void __init apic_pm_activate(void)
537{
538 apic_pm_state.active = 1;
539}
540
541static int __init init_lapic_sysfs(void)
542{
543 int error;
544 if (!cpu_has_apic)
545 return 0;
546 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
547 error = sysdev_class_register(&lapic_sysclass);
548 if (!error)
549 error = sysdev_register(&device_lapic);
550 return error;
551}
552device_initcall(init_lapic_sysfs);
553
554#else /* CONFIG_PM */
555
556static void apic_pm_activate(void) { }
557
558#endif /* CONFIG_PM */
559
560static int __init apic_set_verbosity(char *str)
561{
562 if (strcmp("debug", str) == 0)
563 apic_verbosity = APIC_DEBUG;
564 else if (strcmp("verbose", str) == 0)
565 apic_verbosity = APIC_VERBOSE;
566 else
567		printk(KERN_WARNING "APIC Verbosity level %s not recognised,"
568			" use apic=verbose or apic=debug\n", str);
569
570 return 0;
571}
572
573__setup("apic=", apic_set_verbosity);
574
575/*
576 * Detect and enable local APICs on non-SMP boards.
577 * Original code written by Keir Fraser.
578 * On AMD64 we trust the BIOS - if it says no APIC it is likely
579 * not correctly set up (usually the APIC timer won't work etc.)
580 */
581
582static int __init detect_init_APIC (void)
583{
584 if (!cpu_has_apic) {
585 printk(KERN_INFO "No local APIC present\n");
586 return -1;
587 }
588
589 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
590 boot_cpu_id = 0;
591 return 0;
592}
593
594void __init init_apic_mappings(void)
595{
596 unsigned long apic_phys;
597
598 /*
599 * If no local APIC can be found then set up a fake all
600 * zeroes page to simulate the local APIC and another
601 * one for the IO-APIC.
602 */
603 if (!smp_found_config && detect_init_APIC()) {
604 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
605 apic_phys = __pa(apic_phys);
606 } else
607 apic_phys = mp_lapic_addr;
608
609 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
610 apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
611
612 /*
613 * Fetch the APIC ID of the BSP in case we have a
614 * default configuration (or the MP table is broken).
615 */
616 if (boot_cpu_id == -1U)
617 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
618
619#ifdef CONFIG_X86_IO_APIC
620 {
621 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
622 int i;
623
624 for (i = 0; i < nr_ioapics; i++) {
625 if (smp_found_config) {
626 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
627 } else {
628 ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
629 ioapic_phys = __pa(ioapic_phys);
630 }
631 set_fixmap_nocache(idx, ioapic_phys);
632 apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
633 __fix_to_virt(idx), ioapic_phys);
634 idx++;
635 }
636 }
637#endif
638}
639
640/*
641 * This function sets up the local APIC timer, with a timeout of
642 * 'clocks' APIC bus clocks. During calibration we actually call
643 * this function twice on the boot CPU, once with a bogus timeout
644 * value and a second time for real. The other (noncalibrating) CPUs
645 * call this function only once, with the real, calibrated value.
646 *
647 * We do reads before writes even if unnecessary, to get around the
648 * P5 APIC double write bug.
649 */
650
651#define APIC_DIVISOR 16
652
653static void __setup_APIC_LVTT(unsigned int clocks)
654{
655 unsigned int lvtt_value, tmp_value, ver;
656
657 ver = GET_APIC_VERSION(apic_read(APIC_LVR));
658 lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
659 if (!APIC_INTEGRATED(ver))
660 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
661 apic_write_around(APIC_LVTT, lvtt_value);
662
663 /*
664 * Divide PICLK by 16
665 */
666 tmp_value = apic_read(APIC_TDCR);
667 apic_write_around(APIC_TDCR, (tmp_value
668 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
669 | APIC_TDR_DIV_16);
670
671 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
672}
673
674static void setup_APIC_timer(unsigned int clocks)
675{
676 unsigned long flags;
677
678 local_irq_save(flags);
679
680	/* For some reason this doesn't work on Simics, so fake it for now */
681 if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) {
682 __setup_APIC_LVTT(clocks);
683		local_irq_restore(flags); return;
684 }
685
686 /* wait for irq slice */
687 if (vxtime.hpet_address) {
688 int trigger = hpet_readl(HPET_T0_CMP);
689 while (hpet_readl(HPET_COUNTER) >= trigger)
690 /* do nothing */ ;
691 while (hpet_readl(HPET_COUNTER) < trigger)
692 /* do nothing */ ;
693 } else {
694 int c1, c2;
695 outb_p(0x00, 0x43);
696 c2 = inb_p(0x40);
697 c2 |= inb_p(0x40) << 8;
698 do {
699 c1 = c2;
700 outb_p(0x00, 0x43);
701 c2 = inb_p(0x40);
702 c2 |= inb_p(0x40) << 8;
703 } while (c2 - c1 < 300);
704 }
705
706 __setup_APIC_LVTT(clocks);
707
708 local_irq_restore(flags);
709}
710
711/*
712 * In this function we calibrate APIC bus clocks to the external
713 * timer. Unfortunately we cannot use jiffies and the timer irq
714 * to calibrate, since some later bootup code depends on getting
715 * the first irq? Ugh.
716 *
717 * We want to do the calibration only once since we
718 * want to have the local timer irqs synchronous. CPUs connected
719 * by the same APIC bus have the very same bus frequency.
720 * And we want to have irqs off anyway, so there's no accidental
721 * APIC irq that way.
722 */
723
724#define TICK_COUNT 100000000
725
726static int __init calibrate_APIC_clock(void)
727{
728 int apic, apic_start, tsc, tsc_start;
729 int result;
730 /*
731 * Put whatever arbitrary (but long enough) timeout
732 * value into the APIC clock, we just want to get the
733 * counter running for calibration.
734 */
735 __setup_APIC_LVTT(1000000000);
736
737 apic_start = apic_read(APIC_TMCCT);
738 rdtscl(tsc_start);
739
740 do {
741 apic = apic_read(APIC_TMCCT);
742 rdtscl(tsc);
743 } while ((tsc - tsc_start) < TICK_COUNT && (apic - apic_start) < TICK_COUNT);
744
745 result = (apic_start - apic) * 1000L * cpu_khz / (tsc - tsc_start);
746
747 printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
748 result / 1000 / 1000, result / 1000 % 1000);
749
750 return result * APIC_DIVISOR / HZ;
751}
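
/*
 * Illustrative arithmetic (hypothetical numbers): TMCCT counts down,
 * so (apic_start - apic) is the number of divided APIC-timer ticks
 * elapsed while (tsc - tsc_start) TSC cycles passed. On a 2 GHz CPU
 * (cpu_khz = 2000000), if 100,000,000 TSC cycles (50 ms) saw
 * 5,000,000 timer ticks, then result = 5e6 * 1000 * 2e6 / 1e8 =
 * 100,000,000, i.e. a 100 MHz divided timer clock. The return value,
 * result * APIC_DIVISOR / HZ, is the bus-clock count per jiffy that
 * setup_APIC_timer() later programs.
 */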
752
753static unsigned int calibration_result;
754
755void __init setup_boot_APIC_clock(void)
756{
757 if (disable_apic_timer) {
758 printk(KERN_INFO "Disabling APIC timer\n");
759 return;
760 }
761
762 printk(KERN_INFO "Using local APIC timer interrupts.\n");
763 using_apic_timer = 1;
764
765 local_irq_disable();
766
767 calibration_result = calibrate_APIC_clock();
768 /*
769 * Now set up the timer for real.
770 */
771 setup_APIC_timer(calibration_result);
772
773 local_irq_enable();
774}
775
776void __init setup_secondary_APIC_clock(void)
777{
778 local_irq_disable(); /* FIXME: Do we need this? --RR */
779 setup_APIC_timer(calibration_result);
780 local_irq_enable();
781}
782
783void __init disable_APIC_timer(void)
784{
785 if (using_apic_timer) {
786 unsigned long v;
787
788 v = apic_read(APIC_LVTT);
789 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
790 }
791}
792
793void enable_APIC_timer(void)
794{
795 if (using_apic_timer) {
796 unsigned long v;
797
798 v = apic_read(APIC_LVTT);
799 apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
800 }
801}
802
803/*
804 * the frequency of the profiling timer can be changed
805 * by writing a multiplier value into /proc/profile.
806 */
807int setup_profiling_timer(unsigned int multiplier)
808{
809 int i;
810
811 /*
812	 * Sanity check. [at least 500 APIC cycles should elapse
813	 * between APIC interrupts as a rule of thumb, to avoid
814 * irqs flooding us]
815 */
816	if (!multiplier || calibration_result / multiplier < 500)
817 return -EINVAL;
818
819 /*
820 * Set the new multiplier for each CPU. CPUs don't start using the
821 * new values until the next timer interrupt in which they do process
822 * accounting. At that time they also adjust their APIC timers
823 * accordingly.
824 */
825 for (i = 0; i < NR_CPUS; ++i)
826 per_cpu(prof_multiplier, i) = multiplier;
827
828 return 0;
829}
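
/*
 * Example use (illustrative numbers): with calibration_result =
 * 1,000,000, writing a multiplier of 4 to /proc/profile makes each
 * CPU reprogram its timer to calibration_result/4, so profiling
 * interrupts arrive 4x as often while process accounting still runs
 * once per 4 ticks; the largest multiplier the sanity check above
 * would accept is 2000 (1,000,000 / 500).
 */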
830
831#undef APIC_DIVISOR
832
833/*
834 * Local timer interrupt handler. It does both profiling and
835 * process statistics/rescheduling.
836 *
837 * We do profiling on every local tick; statistics/rescheduling
838 * happens only every 'profiling multiplier' ticks. The default
839 * multiplier is 1 and it can be changed by writing the new multiplier
840 * value into /proc/profile.
841 */
842
843void smp_local_timer_interrupt(struct pt_regs *regs)
844{
845 int cpu = smp_processor_id();
846
847 profile_tick(CPU_PROFILING, regs);
848 if (--per_cpu(prof_counter, cpu) <= 0) {
849 /*
850 * The multiplier may have changed since the last time we got
851 * to this point as a result of the user writing to
852 * /proc/profile. In this case we need to adjust the APIC
853 * timer accordingly.
854 *
855 * Interrupts are already masked off at this point.
856 */
857 per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu);
858 if (per_cpu(prof_counter, cpu) !=
859 per_cpu(prof_old_multiplier, cpu)) {
860 __setup_APIC_LVTT(calibration_result/
861 per_cpu(prof_counter, cpu));
862 per_cpu(prof_old_multiplier, cpu) =
863 per_cpu(prof_counter, cpu);
864 }
865
866#ifdef CONFIG_SMP
867 update_process_times(user_mode(regs));
868#endif
869 }
870
871 /*
872 * We take the 'long' return path, and there every subsystem
873 * grabs the appropriate locks (kernel lock/ irq lock).
874 *
875 * we might want to decouple profiling from the 'long path',
876 * and do the profiling totally in assembly.
877 *
878 * Currently this isn't too much of an issue (performance wise),
879 * we can take more than 100K local irqs per second on a 100 MHz P5.
880 */
881}
882
883/*
884 * Local APIC timer interrupt. This is the most natural way of doing
885 * local interrupts, but local timer interrupts can be emulated by
886 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
887 *
888 * [ if a single-CPU system runs an SMP kernel then we call the local
889 * interrupt as well. Thus we cannot inline the local irq ... ]
890 */
891void smp_apic_timer_interrupt(struct pt_regs *regs)
892{
893 /*
894 * the NMI deadlock-detector uses this.
895 */
896 add_pda(apic_timer_irqs, 1);
897
898 /*
899 * NOTE! We'd better ACK the irq immediately,
900 * because timer handling can be slow.
901 */
902 ack_APIC_irq();
903 /*
904 * update_process_times() expects us to have done irq_enter().
905	 * Besides, if we don't, timer interrupts ignore the global
906 * interrupt lock, which is the WrongThing (tm) to do.
907 */
908 irq_enter();
909 smp_local_timer_interrupt(regs);
910 irq_exit();
911}
912
913/*
914 * oem_force_hpet_timer -- force HPET mode for some boxes.
915 *
916 * Thus far, the major user of this is IBM's Summit2 series:
917 *
918 * Clustered boxes may have unsynced TSC problems if they are
919 * multi-chassis. Use available data to take a good guess.
920 * If in doubt, go HPET.
921 */
922__init int oem_force_hpet_timer(void)
923{
924 int i, clusters, zeros;
925 unsigned id;
926 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
927
928	bitmap_zero(clustermap, NUM_APIC_CLUSTERS); /* bitmap_empty() only tests, it does not clear */
929
930 for (i = 0; i < NR_CPUS; i++) {
931 id = bios_cpu_apicid[i];
932 if (id != BAD_APICID)
933 __set_bit(APIC_CLUSTERID(id), clustermap);
934 }
935
936 /* Problem: Partially populated chassis may not have CPUs in some of
937 * the APIC clusters they have been allocated. Only present CPUs have
938 * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since
939 * clusters are allocated sequentially, count zeros only if they are
940 * bounded by ones.
941 */
942 clusters = 0;
943 zeros = 0;
944 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
945 if (test_bit(i, clustermap)) {
946 clusters += 1 + zeros;
947 zeros = 0;
948 } else
949 ++zeros;
950 }
951
952 /*
953	 * If clusters > 2, then this is likely multi-chassis. Return 1 for HPET.
954 * Else return 0 to use TSC.
955 * May have to revisit this when multi-core + hyperthreaded CPUs come
956 * out, but AFAIK this will work even for them.
957 */
958 return (clusters > 2);
959}
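
/*
 * Worked example of the gap counting above (hypothetical bitmap):
 * if clusters 0, 1 and 3 are populated, the loop sees set, set,
 * clear, set and computes clusters = 1 + 1 + (1 + 1) = 4 -- the
 * hole at cluster 2 is counted because it is bounded by populated
 * clusters, while trailing empty clusters are never added.
 */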
960
961/*
962 * This interrupt should _never_ happen with our APIC/SMP architecture
963 */
964asmlinkage void smp_spurious_interrupt(void)
965{
966 unsigned int v;
967 irq_enter();
968 /*
969 * Check if this really is a spurious interrupt and ACK it
970 * if it is a vectored one. Just in case...
971 * Spurious interrupts should not be ACKed.
972 */
973 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
974 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
975 ack_APIC_irq();
976
977#if 0
978 static unsigned long last_warning;
979 static unsigned long skipped;
980
981 /* see sw-dev-man vol 3, chapter 7.4.13.5 */
982 if (time_before(last_warning+30*HZ,jiffies)) {
983 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
984 smp_processor_id(), skipped);
985 last_warning = jiffies;
986 skipped = 0;
987 } else {
988 skipped++;
989 }
990#endif
991 irq_exit();
992}
993
994/*
995 * This interrupt should never happen with our APIC/SMP architecture
996 */
997
998asmlinkage void smp_error_interrupt(void)
999{
1000 unsigned int v, v1;
1001
1002 irq_enter();
1003 /* First tickle the hardware, only then report what went on. -- REW */
1004 v = apic_read(APIC_ESR);
1005 apic_write(APIC_ESR, 0);
1006 v1 = apic_read(APIC_ESR);
1007 ack_APIC_irq();
1008 atomic_inc(&irq_err_count);
1009
1010 /* Here is what the APIC error bits mean:
1011 0: Send CS error
1012 1: Receive CS error
1013 2: Send accept error
1014 3: Receive accept error
1015 4: Reserved
1016 5: Send illegal vector
1017 6: Received illegal vector
1018 7: Illegal register address
1019 */
1020	printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
1021	        smp_processor_id(), v, v1);
1022 irq_exit();
1023}
1024
1025int disable_apic;
1026
1027/*
1028 * This initializes the IO-APIC and APIC hardware if this is
1029 * a UP kernel.
1030 */
1031int __init APIC_init_uniprocessor(void)
1032{
1033 if (disable_apic) {
1034 printk(KERN_INFO "Apic disabled\n");
1035 return -1;
1036 }
1037 if (!cpu_has_apic) {
1038 disable_apic = 1;
1039 printk(KERN_INFO "Apic disabled by BIOS\n");
1040 return -1;
1041 }
1042
1043 verify_local_APIC();
1044
1045 connect_bsp_APIC();
1046
1047 phys_cpu_present_map = physid_mask_of_physid(0);
1048 apic_write_around(APIC_ID, boot_cpu_id);
1049
1050 setup_local_APIC();
1051
1052#ifdef CONFIG_X86_IO_APIC
1053 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1054 setup_IO_APIC();
1055 else
1056 nr_ioapics = 0;
1057#endif
1058 setup_boot_APIC_clock();
1059
1060 return 0;
1061}
1062
1063static __init int setup_disableapic(char *str)
1064{
1065 disable_apic = 1;
1066 return 0;
1067}
1068
1069static __init int setup_nolapic(char *str)
1070{
1071 disable_apic = 1;
1072 return 0;
1073}
1074
1075static __init int setup_noapictimer(char *str)
1076{
1077 disable_apic_timer = 1;
1078 return 0;
1079}
1080
1081/* dummy parsing: see setup.c */
1082
1083__setup("disableapic", setup_disableapic);
1084__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */
1085
1086__setup("noapictimer", setup_noapictimer);
1087
1088/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */
diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c
new file mode 100644
index 000000000000..35b4c3fcbb37
--- /dev/null
+++ b/arch/x86_64/kernel/asm-offsets.c
@@ -0,0 +1,69 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6
7#include <linux/sched.h>
8#include <linux/stddef.h>
9#include <linux/errno.h>
10#include <linux/hardirq.h>
11#include <linux/suspend.h>
12#include <asm/pda.h>
13#include <asm/processor.h>
14#include <asm/segment.h>
15#include <asm/thread_info.h>
16#include <asm/ia32.h>
17
18#define DEFINE(sym, val) \
19 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
20
21#define BLANK() asm volatile("\n->" : : )
22
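
/*
 * A rough sketch of what this generates (offsets are illustrative):
 * the compiled asm output contains marker lines such as
 *
 *	->tsk_state 0 offsetof(struct task_struct, state)
 *
 * which the kbuild post-processing step turns into
 *
 *	#define tsk_state 0
 *
 * definitions that assembly files like entry.S can include.
 */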
23int main(void)
24{
25#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
26 ENTRY(state);
27 ENTRY(flags);
28 ENTRY(thread);
29 ENTRY(pid);
30 BLANK();
31#undef ENTRY
32#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry))
33 ENTRY(flags);
34 ENTRY(addr_limit);
35 ENTRY(preempt_count);
36 BLANK();
37#undef ENTRY
38#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
39 ENTRY(kernelstack);
40 ENTRY(oldrsp);
41 ENTRY(pcurrent);
42 ENTRY(irqrsp);
43 ENTRY(irqcount);
44 ENTRY(cpunumber);
45 ENTRY(irqstackptr);
46 BLANK();
47#undef ENTRY
48#ifdef CONFIG_IA32_EMULATION
49#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
50 ENTRY(eax);
51 ENTRY(ebx);
52 ENTRY(ecx);
53 ENTRY(edx);
54 ENTRY(esi);
55 ENTRY(edi);
56 ENTRY(ebp);
57 ENTRY(esp);
58 ENTRY(eip);
59 BLANK();
60#undef ENTRY
61 DEFINE(IA32_RT_SIGFRAME_sigcontext,
62 offsetof (struct rt_sigframe32, uc.uc_mcontext));
63 BLANK();
64#endif
65 DEFINE(pbe_address, offsetof(struct pbe, address));
66 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
67 DEFINE(pbe_next, offsetof(struct pbe, next));
68 return 0;
69}
diff --git a/arch/x86_64/kernel/cpufreq/Kconfig b/arch/x86_64/kernel/cpufreq/Kconfig
new file mode 100644
index 000000000000..81f1562e5393
--- /dev/null
+++ b/arch/x86_64/kernel/cpufreq/Kconfig
@@ -0,0 +1,96 @@
1#
2# CPU Frequency scaling
3#
4
5menu "CPU Frequency scaling"
6
7source "drivers/cpufreq/Kconfig"
8
9if CPU_FREQ
10
11comment "CPUFreq processor drivers"
12
13config X86_POWERNOW_K8
14 tristate "AMD Opteron/Athlon64 PowerNow!"
15 select CPU_FREQ_TABLE
16 help
17 This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors.
18
19 For details, take a look at <file:Documentation/cpu-freq/>.
20
21 If in doubt, say N.
22
23config X86_POWERNOW_K8_ACPI
24 bool
25 depends on X86_POWERNOW_K8 && ACPI_PROCESSOR
26 depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m)
27 default y
28
29config X86_SPEEDSTEP_CENTRINO
30 tristate "Intel Enhanced SpeedStep"
31 select CPU_FREQ_TABLE
32 depends on ACPI_PROCESSOR
33 help
34 This adds the CPUFreq driver for Enhanced SpeedStep enabled
35 mobile CPUs. This means Intel Pentium M (Centrino) CPUs
36	  or 64-bit enabled Intel Xeons.
37
38 For details, take a look at <file:Documentation/cpu-freq/>.
39
40 If in doubt, say N.
41
42config X86_SPEEDSTEP_CENTRINO_ACPI
43 bool
44 depends on X86_SPEEDSTEP_CENTRINO
45 default y
46
47config X86_ACPI_CPUFREQ
48 tristate "ACPI Processor P-States driver"
49 depends on ACPI_PROCESSOR
50 help
51 This driver adds a CPUFreq driver which utilizes the ACPI
52 Processor Performance States.
53
54 For details, take a look at <file:Documentation/cpu-freq/>.
55
56 If in doubt, say N.
57
58comment "shared options"
59
60config X86_ACPI_CPUFREQ_PROC_INTF
61 bool "/proc/acpi/processor/../performance interface (deprecated)"
62 depends on PROC_FS
63 depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K8_ACPI
64 help
65 This enables the deprecated /proc/acpi/processor/../performance
66 interface. While it is helpful for debugging, the generic,
67 cross-architecture cpufreq interfaces should be used.
68
69 If in doubt, say N.
70
71config X86_P4_CLOCKMOD
72 tristate "Intel Pentium 4 clock modulation"
73 depends on EMBEDDED
74 help
75 This adds the clock modulation driver for Intel Pentium 4 / XEON
76 processors. When enabled it will lower CPU temperature by skipping
77 clocks.
78
79	  This driver should only be used in exceptional
80 circumstances when very low power is needed because it causes severe
81 slowdowns and noticeable latencies. Normally Speedstep should be used
82 instead.
83
84 For details, take a look at <file:Documentation/cpu-freq/>.
85
86	  Unless you are absolutely sure, say N.
87
88
89config X86_SPEEDSTEP_LIB
90 tristate
91 default X86_P4_CLOCKMOD
92
93endif
94
95endmenu
96
diff --git a/arch/x86_64/kernel/cpufreq/Makefile b/arch/x86_64/kernel/cpufreq/Makefile
new file mode 100644
index 000000000000..d8b593879224
--- /dev/null
+++ b/arch/x86_64/kernel/cpufreq/Makefile
@@ -0,0 +1,17 @@
1#
2# Reuse the i386 cpufreq drivers
3#
4
5SRCDIR := ../../../i386/kernel/cpu/cpufreq
6
7obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
8obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
9obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
10obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
11obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
12
13powernow-k8-objs := ${SRCDIR}/powernow-k8.o
14speedstep-centrino-objs := ${SRCDIR}/speedstep-centrino.o
15acpi-cpufreq-objs := ${SRCDIR}/acpi-cpufreq.o
16p4-clockmod-objs := ${SRCDIR}/p4-clockmod.o
17speedstep-lib-objs := ${SRCDIR}/speedstep-lib.o
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
new file mode 100644
index 000000000000..657003e461e6
--- /dev/null
+++ b/arch/x86_64/kernel/e820.c
@@ -0,0 +1,513 @@
1/*
2 * Handle the memory map.
3 * The functions here do the job until bootmem takes over.
4 * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $
5 */
6#include <linux/config.h>
7#include <linux/kernel.h>
8#include <linux/types.h>
9#include <linux/init.h>
10#include <linux/bootmem.h>
11#include <linux/ioport.h>
12#include <linux/string.h>
13#include <asm/page.h>
14#include <asm/e820.h>
15#include <asm/proto.h>
16#include <asm/bootsetup.h>
17
18extern char _end[];
19
20/*
21 * PFN of last memory page.
22 */
23unsigned long end_pfn;
24
25/*
26 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
27 * The direct mapping extends to end_pfn_map, so that we can directly access
28 * apertures, ACPI and other tables without having to play with fixmaps.
29 */
30unsigned long end_pfn_map;
31
32/*
33 * Last pfn which the user wants to use.
34 */
35unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;
36
37extern struct resource code_resource, data_resource;
38
39/* Check for some hardcoded bad areas that early boot is not allowed to touch */
40static inline int bad_addr(unsigned long *addrp, unsigned long size)
41{
42 unsigned long addr = *addrp, last = addr + size;
43
44 /* various gunk below that needed for SMP startup */
45 if (addr < 0x8000) {
46 *addrp = 0x8000;
47 return 1;
48 }
49
50 /* direct mapping tables of the kernel */
51 if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
52 *addrp = table_end << PAGE_SHIFT;
53 return 1;
54 }
55
56 /* initrd */
57#ifdef CONFIG_BLK_DEV_INITRD
58 if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
59 addr < INITRD_START+INITRD_SIZE) {
60 *addrp = INITRD_START + INITRD_SIZE;
61 return 1;
62 }
63#endif
64 /* kernel code + 640k memory hole (later should not be needed, but
65 be paranoid for now) */
66 if (last >= 640*1024 && addr < __pa_symbol(&_end)) {
67 *addrp = __pa_symbol(&_end);
68 return 1;
69 }
70 /* XXX ramdisk image here? */
71 return 0;
72}
73
74int __init e820_mapped(unsigned long start, unsigned long end, unsigned type)
75{
76 int i;
77 for (i = 0; i < e820.nr_map; i++) {
78 struct e820entry *ei = &e820.map[i];
79 if (type && ei->type != type)
80 continue;
81 if (ei->addr >= end || ei->addr + ei->size < start)
82 continue;
83 return 1;
84 }
85 return 0;
86}
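
/*
 * Illustrative caller (hypothetical): check whether anything in the
 * legacy VGA hole is described as RAM before assuming it is free.
 *
 *	if (e820_mapped(0xA0000, 0x100000, E820_RAM))
 *		printk(KERN_WARNING "e820 claims RAM in the VGA hole\n");
 *
 * Passing type == 0 matches entries of any type.
 */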
87
88/*
89 * Find a free area in a specific range.
90 */
91unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
92{
93 int i;
94 for (i = 0; i < e820.nr_map; i++) {
95 struct e820entry *ei = &e820.map[i];
96 unsigned long addr = ei->addr, last;
97 if (ei->type != E820_RAM)
98 continue;
99 if (addr < start)
100 addr = start;
101 if (addr > ei->addr + ei->size)
102 continue;
103 while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size)
104 ;
105 last = addr + size;
106 if (last > ei->addr + ei->size)
107 continue;
108 if (last > end)
109 continue;
110 return addr;
111 }
112 return -1UL;
113}
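
/*
 * Sketch of intended use (hypothetical values): callers must check
 * for the -1UL failure return before using the address.
 *
 *	unsigned long addr = find_e820_area(0, 0xffffffff, 0x4000);
 *	if (addr == -1UL)
 *		panic("cannot find 16K of usable RAM below 4G");
 */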
114
115/*
116 * Free bootmem based on the e820 table for a node.
117 */
118void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
119{
120 int i;
121 for (i = 0; i < e820.nr_map; i++) {
122 struct e820entry *ei = &e820.map[i];
123 unsigned long last, addr;
124
125 if (ei->type != E820_RAM ||
126 ei->addr+ei->size <= start ||
127 ei->addr > end)
128 continue;
129
130 addr = round_up(ei->addr, PAGE_SIZE);
131 if (addr < start)
132 addr = start;
133
134 last = round_down(ei->addr + ei->size, PAGE_SIZE);
135 if (last >= end)
136 last = end;
137
138 if (last > addr && last-addr >= PAGE_SIZE)
139 free_bootmem_node(pgdat, addr, last-addr);
140 }
141}
142
143/*
144 * Find the highest page frame number we have available
145 */
146unsigned long __init e820_end_of_ram(void)
147{
148 int i;
149 unsigned long end_pfn = 0;
150
151 for (i = 0; i < e820.nr_map; i++) {
152 struct e820entry *ei = &e820.map[i];
153 unsigned long start, end;
154
155 start = round_up(ei->addr, PAGE_SIZE);
156 end = round_down(ei->addr + ei->size, PAGE_SIZE);
157 if (start >= end)
158 continue;
159 if (ei->type == E820_RAM) {
160 if (end > end_pfn<<PAGE_SHIFT)
161 end_pfn = end>>PAGE_SHIFT;
162 } else {
163 if (end > end_pfn_map<<PAGE_SHIFT)
164 end_pfn_map = end>>PAGE_SHIFT;
165 }
166 }
167
168 if (end_pfn > end_pfn_map)
169 end_pfn_map = end_pfn;
170 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
171 end_pfn_map = MAXMEM>>PAGE_SHIFT;
172 if (end_pfn > end_user_pfn)
173 end_pfn = end_user_pfn;
174 if (end_pfn > end_pfn_map)
175 end_pfn = end_pfn_map;
176
177 return end_pfn;
178}
179
180/*
181 * Mark e820 reserved areas as busy for the resource manager.
182 */
183void __init e820_reserve_resources(void)
184{
185 int i;
186 for (i = 0; i < e820.nr_map; i++) {
187 struct resource *res;
188 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
189 continue;
190 res = alloc_bootmem_low(sizeof(struct resource));
191 switch (e820.map[i].type) {
192 case E820_RAM: res->name = "System RAM"; break;
193 case E820_ACPI: res->name = "ACPI Tables"; break;
194 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
195 default: res->name = "reserved";
196 }
197 res->start = e820.map[i].addr;
198 res->end = res->start + e820.map[i].size - 1;
199 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
200 request_resource(&iomem_resource, res);
201 if (e820.map[i].type == E820_RAM) {
202 /*
203 * We don't know which RAM region contains kernel data,
204 * so we try it repeatedly and let the resource manager
205 * test it.
206 */
207 request_resource(res, &code_resource);
208 request_resource(res, &data_resource);
209 }
210 }
211}
212
213/*
214 * Add a memory region to the kernel e820 map.
215 */
216void __init add_memory_region(unsigned long start, unsigned long size, int type)
217{
218 int x = e820.nr_map;
219
220 if (x == E820MAX) {
221 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
222 return;
223 }
224
225 e820.map[x].addr = start;
226 e820.map[x].size = size;
227 e820.map[x].type = type;
228 e820.nr_map++;
229}
230
231void __init e820_print_map(char *who)
232{
233 int i;
234
235 for (i = 0; i < e820.nr_map; i++) {
236 printk(" %s: %016Lx - %016Lx ", who,
237 (unsigned long long) e820.map[i].addr,
238 (unsigned long long) (e820.map[i].addr + e820.map[i].size));
239 switch (e820.map[i].type) {
240 case E820_RAM: printk("(usable)\n");
241 break;
242 case E820_RESERVED:
243 printk("(reserved)\n");
244 break;
245 case E820_ACPI:
246 printk("(ACPI data)\n");
247 break;
248 case E820_NVS:
249 printk("(ACPI NVS)\n");
250 break;
251 default: printk("type %u\n", e820.map[i].type);
252 break;
253 }
254 }
255}
256
257/*
258 * Sanitize the BIOS e820 map.
259 *
260 * Some e820 responses include overlapping entries. The following
261 * replaces the original e820 map with a new one, removing overlaps.
262 *
263 */
264static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
265{
266 struct change_member {
267 struct e820entry *pbios; /* pointer to original bios entry */
268 unsigned long long addr; /* address for this change point */
269 };
270 static struct change_member change_point_list[2*E820MAX] __initdata;
271 static struct change_member *change_point[2*E820MAX] __initdata;
272 static struct e820entry *overlap_list[E820MAX] __initdata;
273 static struct e820entry new_bios[E820MAX] __initdata;
274 struct change_member *change_tmp;
275 unsigned long current_type, last_type;
276 unsigned long long last_addr;
277 int chgidx, still_changing;
278 int overlap_entries;
279 int new_bios_entry;
280 int old_nr, new_nr;
281 int i;
282
283 /*
284 Visually we're performing the following (1,2,3,4 = memory types)...
285
286 Sample memory map (w/overlaps):
287 ____22__________________
288 ______________________4_
289 ____1111________________
290 _44_____________________
291 11111111________________
292 ____________________33__
293 ___________44___________
294 __________33333_________
295 ______________22________
296 ___________________2222_
297 _________111111111______
298 _____________________11_
299 _________________4______
300
301 Sanitized equivalent (no overlap):
302 1_______________________
303 _44_____________________
304 ___1____________________
305 ____22__________________
306 ______11________________
307 _________1______________
308 __________3_____________
309 ___________44___________
310 _____________33_________
311 _______________2________
312 ________________1_______
313 _________________4______
314 ___________________2____
315 ____________________33__
316 ______________________4_
317 */
318
319 /* if there's only one memory region, don't bother */
320 if (*pnr_map < 2)
321 return -1;
322
323 old_nr = *pnr_map;
324
325 /* bail out if we find any unreasonable addresses in bios map */
326 for (i=0; i<old_nr; i++)
327 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
328 return -1;
329
330 /* create pointers for initial change-point information (for sorting) */
331 for (i=0; i < 2*old_nr; i++)
332 change_point[i] = &change_point_list[i];
333
334 /* record all known change-points (starting and ending addresses) */
335 chgidx = 0;
336 for (i=0; i < old_nr; i++) {
337 change_point[chgidx]->addr = biosmap[i].addr;
338 change_point[chgidx++]->pbios = &biosmap[i];
339 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
340 change_point[chgidx++]->pbios = &biosmap[i];
341 }
342
343 /* sort change-point list by memory addresses (low -> high) */
344 still_changing = 1;
345 while (still_changing) {
346 still_changing = 0;
347 for (i=1; i < 2*old_nr; i++) {
348 /* if <current_addr> > <last_addr>, swap */
349 /* or, if current=<start_addr> & last=<end_addr>, swap */
350 if ((change_point[i]->addr < change_point[i-1]->addr) ||
351 ((change_point[i]->addr == change_point[i-1]->addr) &&
352 (change_point[i]->addr == change_point[i]->pbios->addr) &&
353 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
354 )
355 {
356 change_tmp = change_point[i];
357 change_point[i] = change_point[i-1];
358 change_point[i-1] = change_tmp;
359 still_changing=1;
360 }
361 }
362 }
363
364 /* create a new bios memory map, removing overlaps */
365 overlap_entries=0; /* number of entries in the overlap table */
366 new_bios_entry=0; /* index for creating new bios map entries */
367 last_type = 0; /* start with undefined memory type */
368 last_addr = 0; /* start with 0 as last starting address */
369	/* loop through change-points, determining the effect on the new bios map */
370 for (chgidx=0; chgidx < 2*old_nr; chgidx++)
371 {
372 /* keep track of all overlapping bios entries */
373 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
374 {
375 /* add map entry to overlap list (> 1 entry implies an overlap) */
376 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
377 }
378 else
379 {
380 /* remove entry from list (order independent, so swap with last) */
381 for (i=0; i<overlap_entries; i++)
382 {
383 if (overlap_list[i] == change_point[chgidx]->pbios)
384 overlap_list[i] = overlap_list[overlap_entries-1];
385 }
386 overlap_entries--;
387 }
388 /* if there are overlapping entries, decide which "type" to use */
389 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
390 current_type = 0;
391 for (i=0; i<overlap_entries; i++)
392 if (overlap_list[i]->type > current_type)
393 current_type = overlap_list[i]->type;
394 /* continue building up new bios map based on this information */
395 if (current_type != last_type) {
396 if (last_type != 0) {
397 new_bios[new_bios_entry].size =
398 change_point[chgidx]->addr - last_addr;
399 /* move forward only if the new size was non-zero */
400 if (new_bios[new_bios_entry].size != 0)
401 if (++new_bios_entry >= E820MAX)
402 break; /* no more space left for new bios entries */
403 }
404 if (current_type != 0) {
405 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
406 new_bios[new_bios_entry].type = current_type;
407 last_addr=change_point[chgidx]->addr;
408 }
409 last_type = current_type;
410 }
411 }
412 new_nr = new_bios_entry; /* retain count for new bios entries */
413
414 /* copy new bios mapping into original location */
415 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
416 *pnr_map = new_nr;
417
418 return 0;
419}
420
421/*
422 * Copy the BIOS e820 map into a safe place.
423 *
424 * Sanity-check it while we're at it.
425 *
426 * If we're lucky and live on a modern system, the setup code
427 * will have given us a memory map that we can use to properly
428 * set up memory. If we aren't, we'll fake a memory map.
429 *
430 * We check to see that the memory map contains at least 2 elements
431 * before we'll use it, because the detection code in setup.S may
432 * not be perfect and most every PC known to man has two memory
433 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
434 * thinkpad 560x, for example, does not cooperate with the memory
435 * detection code.)
436 */
437static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
438{
439 /* Only one memory region (or negative)? Ignore it */
440 if (nr_map < 2)
441 return -1;
442
443 do {
444 unsigned long start = biosmap->addr;
445 unsigned long size = biosmap->size;
446 unsigned long end = start + size;
447 unsigned long type = biosmap->type;
448
449 /* Overflow in 64 bits? Ignore the memory map. */
450 if (start > end)
451 return -1;
452
453 /*
454 * Some BIOSes claim RAM in the 640k - 1M region.
455 * Not right. Fix it up.
456 *
457 * This should be removed on Hammer which is supposed to not
458 * have non e820 covered ISA mappings there, but I had some strange
459 * problems so it stays for now. -AK
460 */
461 if (type == E820_RAM) {
462 if (start < 0x100000ULL && end > 0xA0000ULL) {
463 if (start < 0xA0000ULL)
464 add_memory_region(start, 0xA0000ULL-start, type);
465 if (end <= 0x100000ULL)
466 continue;
467 start = 0x100000ULL;
468 size = end - start;
469 }
470 }
471
472 add_memory_region(start, size, type);
473 } while (biosmap++,--nr_map);
474 return 0;
475}
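
/*
 * Worked example of the 640k-1M fixup above (hypothetical entry):
 * a BIOS entry claiming RAM from 0x90000 to 0x110000 is split into
 * RAM at [0x90000, 0xA0000) plus RAM at [0x100000, 0x110000); the
 * [0xA0000, 0x100000) ISA hole is simply dropped.
 */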
476
477void __init setup_memory_region(void)
478{
479 char *who = "BIOS-e820";
480
481 /*
482 * Try to copy the BIOS-supplied E820-map.
483 *
484 * Otherwise fake a memory map; one section from 0k->640k,
485 * the next section from 1mb->appropriate_mem_k
486 */
487 sanitize_e820_map(E820_MAP, &E820_MAP_NR);
488 if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
489 unsigned long mem_size;
490
491 /* compare results from other methods and take the greater */
492 if (ALT_MEM_K < EXT_MEM_K) {
493 mem_size = EXT_MEM_K;
494 who = "BIOS-88";
495 } else {
496 mem_size = ALT_MEM_K;
497 who = "BIOS-e801";
498 }
499
500 e820.nr_map = 0;
501 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
502 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
503 }
504 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
505 e820_print_map(who);
506}
507
508void __init parse_memopt(char *p, char **from)
509{
510 end_user_pfn = memparse(p, from);
511 end_user_pfn >>= PAGE_SHIFT;
512}
513
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
new file mode 100644
index 000000000000..750bcd0655dc
--- /dev/null
+++ b/arch/x86_64/kernel/early_printk.c
@@ -0,0 +1,220 @@
1#include <linux/console.h>
2#include <linux/kernel.h>
3#include <linux/init.h>
4#include <linux/string.h>
5#include <asm/io.h>
6#include <asm/processor.h>
7
8/* Simple VGA output */
9
10#ifdef __i386__
11#define VGABASE (__ISA_IO_base + 0xb8000)
12#else
13#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
14#endif
15
16#define MAX_YPOS 25
17#define MAX_XPOS 80
18
19static int current_ypos = 1, current_xpos = 0;
20
21static void early_vga_write(struct console *con, const char *str, unsigned n)
22{
23 char c;
24 int i, k, j;
25
26 while ((c = *str++) != '\0' && n-- > 0) {
27 if (current_ypos >= MAX_YPOS) {
28 /* scroll 1 line up */
29 for (k = 1, j = 0; k < MAX_YPOS; k++, j++) {
30 for (i = 0; i < MAX_XPOS; i++) {
31 writew(readw(VGABASE + 2*(MAX_XPOS*k + i)),
32 VGABASE + 2*(MAX_XPOS*j + i));
33 }
34 }
35 for (i = 0; i < MAX_XPOS; i++)
36 writew(0x720, VGABASE + 2*(MAX_XPOS*j + i));
37 current_ypos = MAX_YPOS-1;
38 }
39 if (c == '\n') {
40 current_xpos = 0;
41 current_ypos++;
42 } else if (c != '\r') {
43 writew(((0x7 << 8) | (unsigned short) c),
44 VGABASE + 2*(MAX_XPOS*current_ypos +
45 current_xpos++));
46 if (current_xpos >= MAX_XPOS) {
47 current_xpos = 0;
48 current_ypos++;
49 }
50 }
51 }
52}
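
/*
 * Note on the magic numbers above: each VGA text cell is two bytes,
 * character then attribute, so 0x720 is a space (0x20) with the
 * standard grey-on-black attribute (0x07), and (0x7 << 8) | c writes
 * character c with that same attribute.
 */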
53
54static struct console early_vga_console = {
55 .name = "earlyvga",
56 .write = early_vga_write,
57 .flags = CON_PRINTBUFFER,
58 .index = -1,
59};
60
61/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
62
63int early_serial_base = 0x3f8; /* ttyS0 */
64
65#define XMTRDY 0x20
66
67#define DLAB 0x80
68
69#define TXR 0 /* Transmit register (WRITE) */
70#define RXR 0 /* Receive register (READ) */
71#define IER 1 /* Interrupt Enable */
72#define IIR 2 /* Interrupt ID */
73#define FCR 2 /* FIFO control */
74#define LCR 3 /* Line control */
75#define MCR 4 /* Modem control */
76#define LSR 5 /* Line Status */
77#define MSR 6 /* Modem Status */
78#define DLL 0 /* Divisor Latch Low */
79#define DLH 1 /* Divisor latch High */
80
81static int early_serial_putc(unsigned char ch)
82{
83 unsigned timeout = 0xffff;
84 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
85 cpu_relax();
86 outb(ch, early_serial_base + TXR);
87 return timeout ? 0 : -1;
88}
89
90static void early_serial_write(struct console *con, const char *s, unsigned n)
91{
92 while (*s && n-- > 0) {
93 early_serial_putc(*s);
94 if (*s == '\n')
95 early_serial_putc('\r');
96 s++;
97 }
98}
99
100#define DEFAULT_BAUD 9600
101
102static __init void early_serial_init(char *s)
103{
104 unsigned char c;
105 unsigned divisor;
106 unsigned baud = DEFAULT_BAUD;
107 char *e;
108
109 if (*s == ',')
110 ++s;
111
112 if (*s) {
113 unsigned port;
114 if (!strncmp(s,"0x",2)) {
115 early_serial_base = simple_strtoul(s, &e, 16);
116 } else {
117 static int bases[] = { 0x3f8, 0x2f8 };
118
119 if (!strncmp(s,"ttyS",4))
120 s += 4;
121 port = simple_strtoul(s, &e, 10);
122 if (port > 1 || s == e)
123 port = 0;
124 early_serial_base = bases[port];
125 }
126 s += strcspn(s, ",");
127 if (*s == ',')
128 s++;
129 }
130
131 outb(0x3, early_serial_base + LCR); /* 8n1 */
132 outb(0, early_serial_base + IER); /* no interrupt */
133 outb(0, early_serial_base + FCR); /* no fifo */
134 outb(0x3, early_serial_base + MCR); /* DTR + RTS */
135
136 if (*s) {
137 baud = simple_strtoul(s, &e, 0);
138 if (baud == 0 || s == e)
139 baud = DEFAULT_BAUD;
140 }
141
142 divisor = 115200 / baud;
143 c = inb(early_serial_base + LCR);
144 outb(c | DLAB, early_serial_base + LCR);
145 outb(divisor & 0xff, early_serial_base + DLL);
146 outb((divisor >> 8) & 0xff, early_serial_base + DLH);
147 outb(c & ~DLAB, early_serial_base + LCR);
148}
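
/*
 * Divisor arithmetic, for reference: the UART base rate is fixed at
 * 115200 here, so baud 115200 gives divisor 1, 38400 gives 3, and
 * the 9600 default gives 12.
 */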
149
150static struct console early_serial_console = {
151 .name = "earlyser",
152 .write = early_serial_write,
153 .flags = CON_PRINTBUFFER,
154 .index = -1,
155};
156
157/* Direct interface for emergencies */
158struct console *early_console = &early_vga_console;
159static int early_console_initialized = 0;
160
161void early_printk(const char *fmt, ...)
162{
163 char buf[512];
164 int n;
165 va_list ap;
166
167 va_start(ap,fmt);
168 n = vscnprintf(buf,512,fmt,ap);
169 early_console->write(early_console,buf,n);
170 va_end(ap);
171}
172
173static int keep_early;
174
175int __init setup_early_printk(char *opt)
176{
177 char *space;
178 char buf[256];
179
180 if (early_console_initialized)
181 return -1;
182
183 opt = strchr(opt, '=') + 1;
184
185 strlcpy(buf,opt,sizeof(buf));
186 space = strchr(buf, ' ');
187 if (space)
188 *space = 0;
189
190 if (strstr(buf,"keep"))
191 keep_early = 1;
192
193 if (!strncmp(buf, "serial", 6)) {
194 early_serial_init(buf + 6);
195 early_console = &early_serial_console;
196 } else if (!strncmp(buf, "ttyS", 4)) {
197 early_serial_init(buf);
198 early_console = &early_serial_console;
199 } else if (!strncmp(buf, "vga", 3)) {
200 early_console = &early_vga_console;
201 }
202 early_console_initialized = 1;
203 register_console(early_console);
204 return 0;
205}
206
207void __init disable_early_printk(void)
208{
209 if (!early_console_initialized || !early_console)
210 return;
211 if (!keep_early) {
212 printk("disabling early console\n");
213 unregister_console(early_console);
214 early_console_initialized = 0;
215 } else {
216 printk("keeping early console\n");
217 }
218}
219
220__setup("earlyprintk=", setup_early_printk);
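
/*
 * Example option strings the parser above accepts (illustrative):
 *
 *	earlyprintk=vga
 *	earlyprintk=serial,ttyS0,115200
 *	earlyprintk=serial,0x3f8,57600,keep
 *
 * Including "keep" in the option string prevents the early console
 * from being unregistered once the real console takes over.
 */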
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
new file mode 100644
index 000000000000..e126284db7a8
--- /dev/null
+++ b/arch/x86_64/kernel/entry.S
@@ -0,0 +1,920 @@
1/*
2 * linux/arch/x86_64/entry.S
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
7 *
8 * $Id$
9 */
10
11/*
12 * entry.S contains the system-call and fault low-level handling routines.
13 *
14 * NOTE: This code handles signal-recognition, which happens every time
15 * after an interrupt and after each system call.
16 *
17 * Normal syscalls and interrupts don't save a full stack frame; this is
18 * only done for syscall tracing, signals, or fork/exec et al.
19 *
20 * A note on terminology:
21 * - top of stack: Architecture defined interrupt frame from SS to RIP
22 * at the top of the kernel process stack.
23 * - partial stack frame: partially saved registers up to R11.
24 * - full stack frame: like a partial stack frame, but with all registers saved.
25 *
26 * TODO:
27 * - schedule it carefully for the final hardware.
28 */
29
30#define ASSEMBLY 1
31#include <linux/config.h>
32#include <linux/linkage.h>
33#include <asm/segment.h>
34#include <asm/smp.h>
35#include <asm/cache.h>
36#include <asm/errno.h>
37#include <asm/dwarf2.h>
38#include <asm/calling.h>
39#include <asm/offset.h>
40#include <asm/msr.h>
41#include <asm/unistd.h>
42#include <asm/thread_info.h>
43#include <asm/hw_irq.h>
44
45 .code64
46
47#ifdef CONFIG_PREEMPT
48#define preempt_stop cli
49#else
50#define preempt_stop
51#define retint_kernel retint_restore_args
52#endif
53
54/*
55 * C code is not supposed to know about undefined top of stack. Every time
56 * a C function with a pt_regs argument is called from the SYSCALL based
57 * fast path FIXUP_TOP_OF_STACK is needed.
58 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
59 * manipulation.
60 */
61
62 /* %rsp:at FRAMEEND */
63 .macro FIXUP_TOP_OF_STACK tmp
64 movq %gs:pda_oldrsp,\tmp
65 movq \tmp,RSP(%rsp)
66 movq $__USER_DS,SS(%rsp)
67 movq $__USER_CS,CS(%rsp)
68 movq $-1,RCX(%rsp)
69 movq R11(%rsp),\tmp /* get eflags */
70 movq \tmp,EFLAGS(%rsp)
71 .endm
72
73 .macro RESTORE_TOP_OF_STACK tmp,offset=0
74 movq RSP-\offset(%rsp),\tmp
75 movq \tmp,%gs:pda_oldrsp
76 movq EFLAGS-\offset(%rsp),\tmp
77 movq \tmp,R11-\offset(%rsp)
78 .endm
79
80 .macro FAKE_STACK_FRAME child_rip
81 /* push in order ss, rsp, eflags, cs, rip */
82 xorq %rax, %rax
83 pushq %rax /* ss */
84 CFI_ADJUST_CFA_OFFSET 8
85 pushq %rax /* rsp */
86 CFI_ADJUST_CFA_OFFSET 8
87 CFI_OFFSET rip,0
88 pushq $(1<<9) /* eflags - interrupts on */
89 CFI_ADJUST_CFA_OFFSET 8
90 pushq $__KERNEL_CS /* cs */
91 CFI_ADJUST_CFA_OFFSET 8
92 pushq \child_rip /* rip */
93 CFI_ADJUST_CFA_OFFSET 8
94 CFI_OFFSET rip,0
95 pushq %rax /* orig rax */
96 CFI_ADJUST_CFA_OFFSET 8
97 .endm
98
99 .macro UNFAKE_STACK_FRAME
100 addq $8*6, %rsp
101 CFI_ADJUST_CFA_OFFSET -(6*8)
102 .endm
103
104 .macro CFI_DEFAULT_STACK
105 CFI_ADJUST_CFA_OFFSET (SS)
106 CFI_OFFSET r15,R15-SS
107 CFI_OFFSET r14,R14-SS
108 CFI_OFFSET r13,R13-SS
109 CFI_OFFSET r12,R12-SS
110 CFI_OFFSET rbp,RBP-SS
111 CFI_OFFSET rbx,RBX-SS
112 CFI_OFFSET r11,R11-SS
113 CFI_OFFSET r10,R10-SS
114 CFI_OFFSET r9,R9-SS
115 CFI_OFFSET r8,R8-SS
116 CFI_OFFSET rax,RAX-SS
117 CFI_OFFSET rcx,RCX-SS
118 CFI_OFFSET rdx,RDX-SS
119 CFI_OFFSET rsi,RSI-SS
120 CFI_OFFSET rdi,RDI-SS
121 CFI_OFFSET rsp,RSP-SS
122 CFI_OFFSET rip,RIP-SS
123 .endm
124/*
125 * A newly forked process directly context switches into this.
126 */
127/* rdi: prev */
128ENTRY(ret_from_fork)
129 CFI_STARTPROC
130 CFI_DEFAULT_STACK
131 call schedule_tail
132 GET_THREAD_INFO(%rcx)
133 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
134 jnz rff_trace
135rff_action:
136 RESTORE_REST
137 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
138 je int_ret_from_sys_call
139 testl $_TIF_IA32,threadinfo_flags(%rcx)
140 jnz int_ret_from_sys_call
141 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
142 jmp ret_from_sys_call
143rff_trace:
144 movq %rsp,%rdi
145 call syscall_trace_leave
146 GET_THREAD_INFO(%rcx)
147 jmp rff_action
148 CFI_ENDPROC
149
150/*
151 * System call entry. Up to 6 arguments in registers are supported.
152 *
153 * SYSCALL does not save anything on the stack and does not change the
154 * stack pointer.
155 */
156
157/*
158 * Register setup:
159 * rax system call number
160 * rdi arg0
161 * rcx return address for syscall/sysret, C arg3
162 * rsi arg1
163 * rdx arg2
164 * r10 arg3 (--> moved to rcx for C)
165 * r8 arg4
166 * r9 arg5
167 * r11 eflags for syscall/sysret, temporary for C
168 * r12-r15,rbp,rbx saved by C code, not touched.
169 *
170 * Interrupts are off on entry.
171 * Only called from user space.
172 *
173 * XXX if we had a free scratch register we could save the RSP into the stack frame
174 * and report it properly in ps. Unfortunately we don't have one.
175 */
176
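/*
 * Concrete example (standard x86-64 syscall ABI): a user-space
 * write(fd, buf, count) arrives here with rax = __NR_write,
 * rdi = fd, rsi = buf, rdx = count, rcx = return address and
 * r11 = saved eflags, the last two filled in by the SYSCALL
 * instruction itself.
 */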
177ENTRY(system_call)
178 CFI_STARTPROC
179 swapgs
180 movq %rsp,%gs:pda_oldrsp
181 movq %gs:pda_kernelstack,%rsp
182 sti
183 SAVE_ARGS 8,1
184 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
185 movq %rcx,RIP-ARGOFFSET(%rsp)
186 GET_THREAD_INFO(%rcx)
187 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
188 jnz tracesys
189 cmpq $__NR_syscall_max,%rax
190 ja badsys
191 movq %r10,%rcx
192 call *sys_call_table(,%rax,8) # XXX: rip relative
193 movq %rax,RAX-ARGOFFSET(%rsp)
194/*
195 * Syscall return path ending with SYSRET (fast path)
196 * Has incomplete stack frame and undefined top of stack.
197 */
198 .globl ret_from_sys_call
199ret_from_sys_call:
200 movl $_TIF_WORK_MASK,%edi
201 /* edi: flagmask */
202sysret_check:
203 GET_THREAD_INFO(%rcx)
204 cli
205 movl threadinfo_flags(%rcx),%edx
206 andl %edi,%edx
207 jnz sysret_careful
208 movq RIP-ARGOFFSET(%rsp),%rcx
209 RESTORE_ARGS 0,-ARG_SKIP,1
210 movq %gs:pda_oldrsp,%rsp
211 swapgs
212 sysretq
213
214 /* Handle reschedules */
215 /* edx: work, edi: workmask */
216sysret_careful:
217 bt $TIF_NEED_RESCHED,%edx
218 jnc sysret_signal
219 sti
220 pushq %rdi
221 call schedule
222 popq %rdi
223 jmp sysret_check
224
225 /* Handle a signal */
226sysret_signal:
227 sti
228 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
229 jz 1f
230
231 /* Really a signal */
232 /* edx: work flags (arg3) */
233 leaq do_notify_resume(%rip),%rax
234 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
235 xorl %esi,%esi # oldset -> arg2
236 call ptregscall_common
2371: movl $_TIF_NEED_RESCHED,%edi
238 jmp sysret_check
239
240 /* Do syscall tracing */
241tracesys:
242 SAVE_REST
243 movq $-ENOSYS,RAX(%rsp)
244 FIXUP_TOP_OF_STACK %rdi
245 movq %rsp,%rdi
246 call syscall_trace_enter
247 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
248 RESTORE_REST
249 cmpq $__NR_syscall_max,%rax
250 ja 1f
251 movq %r10,%rcx /* fixup for C */
252 call *sys_call_table(,%rax,8)
253 movq %rax,RAX-ARGOFFSET(%rsp)
2541: SAVE_REST
255 movq %rsp,%rdi
256 call syscall_trace_leave
257 RESTORE_TOP_OF_STACK %rbx
258 RESTORE_REST
259 jmp ret_from_sys_call
260
261badsys:
262 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
263 jmp ret_from_sys_call
264
265/*
266 * Syscall return path ending with IRET.
267 * Has correct top of stack, but partial stack frame.
268 */
269ENTRY(int_ret_from_sys_call)
270 cli
271 testl $3,CS-ARGOFFSET(%rsp)
272 je retint_restore_args
273 movl $_TIF_ALLWORK_MASK,%edi
274 /* edi: mask to check */
275int_with_check:
276 GET_THREAD_INFO(%rcx)
277 movl threadinfo_flags(%rcx),%edx
278 andl %edi,%edx
279 jnz int_careful
280 jmp retint_swapgs
281
282 /* Either reschedule or signal or syscall exit tracking needed. */
283 /* First do a reschedule test. */
284 /* edx: work, edi: workmask */
285int_careful:
286 bt $TIF_NEED_RESCHED,%edx
287 jnc int_very_careful
288 sti
289 pushq %rdi
290 call schedule
291 popq %rdi
292 jmp int_with_check
293
294 /* handle signals and tracing -- both require a full stack frame */
295int_very_careful:
296 sti
297 SAVE_REST
298 /* Check for syscall exit trace */
299 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
300 jz int_signal
301 pushq %rdi
302 leaq 8(%rsp),%rdi # &ptregs -> arg1
303 call syscall_trace_leave
304 popq %rdi
305 btr $TIF_SYSCALL_TRACE,%edi
306 btr $TIF_SYSCALL_AUDIT,%edi
307 btr $TIF_SINGLESTEP,%edi
308 jmp int_restore_rest
309
310int_signal:
311 testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
312 jz 1f
313 movq %rsp,%rdi # &ptregs -> arg1
314 xorl %esi,%esi # oldset -> arg2
315 call do_notify_resume
3161: movl $_TIF_NEED_RESCHED,%edi
317int_restore_rest:
318 RESTORE_REST
319 jmp int_with_check
320 CFI_ENDPROC
321
322/*
323 * Certain special system calls that need to save a complete full stack frame.
324 */
325
326 .macro PTREGSCALL label,func,arg
327 .globl \label
328\label:
329 leaq \func(%rip),%rax
330 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
331 jmp ptregscall_common
332 .endm
333
334 PTREGSCALL stub_clone, sys_clone, %r8
335 PTREGSCALL stub_fork, sys_fork, %rdi
336 PTREGSCALL stub_vfork, sys_vfork, %rdi
337 PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
338 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
339 PTREGSCALL stub_iopl, sys_iopl, %rsi
340
341ENTRY(ptregscall_common)
342 CFI_STARTPROC
343 popq %r11
344 CFI_ADJUST_CFA_OFFSET -8
345 SAVE_REST
346 movq %r11, %r15
347 FIXUP_TOP_OF_STACK %r11
348 call *%rax
349 RESTORE_TOP_OF_STACK %r11
350 movq %r15, %r11
351 RESTORE_REST
352 pushq %r11
353 CFI_ADJUST_CFA_OFFSET 8
354 ret
355 CFI_ENDPROC
356
357ENTRY(stub_execve)
358 CFI_STARTPROC
359 popq %r11
360 CFI_ADJUST_CFA_OFFSET -8
361 SAVE_REST
362 movq %r11, %r15
363 FIXUP_TOP_OF_STACK %r11
364 call sys_execve
365 GET_THREAD_INFO(%rcx)
366 bt $TIF_IA32,threadinfo_flags(%rcx)
367 jc exec_32bit
368 RESTORE_TOP_OF_STACK %r11
369 movq %r15, %r11
370 RESTORE_REST
371 push %r11
372 ret
373
374exec_32bit:
375 CFI_ADJUST_CFA_OFFSET REST_SKIP
376 movq %rax,RAX(%rsp)
377 RESTORE_REST
378 jmp int_ret_from_sys_call
379 CFI_ENDPROC
380
381/*
382 * sigreturn is special because it needs to restore all registers on return.
383 * This cannot be done with SYSRET, so use the IRET return path instead.
384 */
385ENTRY(stub_rt_sigreturn)
386 CFI_STARTPROC
387 addq $8, %rsp
388 SAVE_REST
389 movq %rsp,%rdi
390 FIXUP_TOP_OF_STACK %r11
391 call sys_rt_sigreturn
392 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
393 RESTORE_REST
394 jmp int_ret_from_sys_call
395 CFI_ENDPROC
396
397/*
398 * Interrupt entry/exit.
399 *
400 * Interrupt entry points save only callee clobbered registers in fast path.
401 *
402 * Entry runs with interrupts off.
403 */
404
405/* 0(%rsp): interrupt number */
406 .macro interrupt func
407 CFI_STARTPROC simple
408 CFI_DEF_CFA rsp,(SS-RDI)
409 CFI_REL_OFFSET rsp,(RSP-ORIG_RAX)
410 CFI_REL_OFFSET rip,(RIP-ORIG_RAX)
411 cld
412#ifdef CONFIG_DEBUG_INFO
413 SAVE_ALL
414 movq %rsp,%rdi
415 /*
416	 * Set up a stack frame pointer. This allows gdb to trace
417 * back to the original stack.
418 */
419 movq %rsp,%rbp
420 CFI_DEF_CFA_REGISTER rbp
421#else
422 SAVE_ARGS
423 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
424#endif
425 testl $3,CS(%rdi)
426 je 1f
427 swapgs
4281: addl $1,%gs:pda_irqcount # RED-PEN should check preempt count
429 movq %gs:pda_irqstackptr,%rax
430 cmoveq %rax,%rsp
431 pushq %rdi # save old stack
432 call \func
433 .endm
434
435ENTRY(common_interrupt)
436 interrupt do_IRQ
437 /* 0(%rsp): oldrsp-ARGOFFSET */
438ret_from_intr:
439 popq %rdi
440 cli
441 subl $1,%gs:pda_irqcount
442#ifdef CONFIG_DEBUG_INFO
443 movq RBP(%rdi),%rbp
444#endif
445 leaq ARGOFFSET(%rdi),%rsp
446exit_intr:
447 GET_THREAD_INFO(%rcx)
448 testl $3,CS-ARGOFFSET(%rsp)
449 je retint_kernel
450
451 /* Interrupt came from user space */
452 /*
453 * Has a correct top of stack, but a partial stack frame
454 * %rcx: thread info. Interrupts off.
455 */
456retint_with_reschedule:
457 movl $_TIF_WORK_MASK,%edi
458retint_check:
459 movl threadinfo_flags(%rcx),%edx
460 andl %edi,%edx
461 jnz retint_careful
462retint_swapgs:
463 cli
464 swapgs
465retint_restore_args:
466 cli
467 RESTORE_ARGS 0,8,0
468iret_label:
469 iretq
470
471 .section __ex_table,"a"
472 .quad iret_label,bad_iret
473 .previous
474 .section .fixup,"ax"
475 /* force a signal here? this matches i386 behaviour */
476 /* running with kernel gs */
477bad_iret:
478 movq $-9999,%rdi /* better code? */
479 jmp do_exit
480 .previous
481
482 /* edi: workmask, edx: work */
483retint_careful:
484 bt $TIF_NEED_RESCHED,%edx
485 jnc retint_signal
486 sti
487 pushq %rdi
488 call schedule
489 popq %rdi
490 GET_THREAD_INFO(%rcx)
491 cli
492 jmp retint_check
493
494retint_signal:
495 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
496 jz retint_swapgs
497 sti
498 SAVE_REST
499 movq $-1,ORIG_RAX(%rsp)
500 xorq %rsi,%rsi # oldset
501 movq %rsp,%rdi # &pt_regs
502 call do_notify_resume
503 RESTORE_REST
504 cli
505 movl $_TIF_NEED_RESCHED,%edi
506 GET_THREAD_INFO(%rcx)
507 jmp retint_check
508
509#ifdef CONFIG_PREEMPT
510 /* Returning to kernel space. Check if we need preemption */
511 /* rcx: threadinfo. interrupts off. */
512 .p2align
513retint_kernel:
514 cmpl $0,threadinfo_preempt_count(%rcx)
515 jnz retint_restore_args
516 bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
517 jnc retint_restore_args
518 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
519 jnc retint_restore_args
520 call preempt_schedule_irq
521 jmp exit_intr
522#endif
523 CFI_ENDPROC
524
525/*
526 * APIC interrupts.
527 */
528 .macro apicinterrupt num,func
529 pushq $\num-256
530 interrupt \func
531 jmp ret_from_intr
532 CFI_ENDPROC
533 .endm
534
535ENTRY(thermal_interrupt)
536 apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
537
538#ifdef CONFIG_SMP
539ENTRY(reschedule_interrupt)
540 apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
541
542ENTRY(invalidate_interrupt)
543 apicinterrupt INVALIDATE_TLB_VECTOR,smp_invalidate_interrupt
544
545ENTRY(call_function_interrupt)
546 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
547#endif
548
549#ifdef CONFIG_X86_LOCAL_APIC
550ENTRY(apic_timer_interrupt)
551 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
552
553ENTRY(error_interrupt)
554 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
555
556ENTRY(spurious_interrupt)
557 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
558#endif
559
560/*
561 * Exception entry points.
562 */
563 .macro zeroentry sym
564 pushq $0 /* push error code/oldrax */
565 pushq %rax /* push real oldrax to the rdi slot */
566 leaq \sym(%rip),%rax
567 jmp error_entry
568 .endm
569
570 .macro errorentry sym
571 pushq %rax
572 leaq \sym(%rip),%rax
573 jmp error_entry
574 .endm
575
576 /* error code is on the stack already */
577 /* handle NMI like exceptions that can happen everywhere */
578 .macro paranoidentry sym
579 SAVE_ALL
580 cld
581 movl $1,%ebx
582 movl $MSR_GS_BASE,%ecx
583 rdmsr
584 testl %edx,%edx
585 js 1f
586 swapgs
587 xorl %ebx,%ebx
5881: movq %rsp,%rdi
589 movq ORIG_RAX(%rsp),%rsi
590 movq $-1,ORIG_RAX(%rsp)
591 call \sym
592 .endm
593
594/*
595 * Exception entry point. This expects an error code/orig_rax on the stack
596 * and the exception handler in %rax.
597 */
598ENTRY(error_entry)
599 CFI_STARTPROC simple
600 CFI_DEF_CFA rsp,(SS-RDI)
601 CFI_REL_OFFSET rsp,(RSP-RDI)
602 CFI_REL_OFFSET rip,(RIP-RDI)
603 /* rdi slot contains rax, oldrax contains error code */
604 cld
605 subq $14*8,%rsp
606 CFI_ADJUST_CFA_OFFSET (14*8)
607 movq %rsi,13*8(%rsp)
608 CFI_REL_OFFSET rsi,RSI
609 movq 14*8(%rsp),%rsi /* load rax from rdi slot */
610 movq %rdx,12*8(%rsp)
611 CFI_REL_OFFSET rdx,RDX
612 movq %rcx,11*8(%rsp)
613 CFI_REL_OFFSET rcx,RCX
614 movq %rsi,10*8(%rsp) /* store rax */
615 CFI_REL_OFFSET rax,RAX
616 movq %r8, 9*8(%rsp)
617 CFI_REL_OFFSET r8,R8
618 movq %r9, 8*8(%rsp)
619 CFI_REL_OFFSET r9,R9
620 movq %r10,7*8(%rsp)
621 CFI_REL_OFFSET r10,R10
622 movq %r11,6*8(%rsp)
623 CFI_REL_OFFSET r11,R11
624 movq %rbx,5*8(%rsp)
625 CFI_REL_OFFSET rbx,RBX
626 movq %rbp,4*8(%rsp)
627 CFI_REL_OFFSET rbp,RBP
628 movq %r12,3*8(%rsp)
629 CFI_REL_OFFSET r12,R12
630 movq %r13,2*8(%rsp)
631 CFI_REL_OFFSET r13,R13
632 movq %r14,1*8(%rsp)
633 CFI_REL_OFFSET r14,R14
634 movq %r15,(%rsp)
635 CFI_REL_OFFSET r15,R15
636 xorl %ebx,%ebx
637 testl $3,CS(%rsp)
638 je error_kernelspace
639error_swapgs:
640 swapgs
641error_sti:
642 movq %rdi,RDI(%rsp)
643 movq %rsp,%rdi
644 movq ORIG_RAX(%rsp),%rsi /* get error code */
645 movq $-1,ORIG_RAX(%rsp)
646 call *%rax
647 /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
648error_exit:
649 movl %ebx,%eax
650 RESTORE_REST
651 cli
652 GET_THREAD_INFO(%rcx)
653 testl %eax,%eax
654 jne retint_kernel
655 movl threadinfo_flags(%rcx),%edx
656 movl $_TIF_WORK_MASK,%edi
657 andl %edi,%edx
658 jnz retint_careful
659 swapgs
660 RESTORE_ARGS 0,8,0
661 iretq
662 CFI_ENDPROC
663
664error_kernelspace:
665 incl %ebx
666 /* There are two places in the kernel that can potentially fault with
667 usergs. Handle them here. The exception handlers after
668 iret run with kernel gs again, so don't set the user space flag.
669	   B stepping K8s sometimes report a truncated RIP for IRET
670 exceptions returning to compat mode. Check for these here too. */
671 leaq iret_label(%rip),%rbp
672 cmpq %rbp,RIP(%rsp)
673 je error_swapgs
674 movl %ebp,%ebp /* zero extend */
675 cmpq %rbp,RIP(%rsp)
676 je error_swapgs
677 cmpq $gs_change,RIP(%rsp)
678 je error_swapgs
679 jmp error_sti
680
681 /* Reload gs selector with exception handling */
682 /* edi: new selector */
683ENTRY(load_gs_index)
684 pushf
685 cli
686 swapgs
687gs_change:
688 movl %edi,%gs
6892: mfence /* workaround */
690 swapgs
691 popf
692 ret
693
694 .section __ex_table,"a"
695 .align 8
696 .quad gs_change,bad_gs
697 .previous
698 .section .fixup,"ax"
699 /* running with kernelgs */
700bad_gs:
701 swapgs /* switch back to user gs */
702 xorl %eax,%eax
703 movl %eax,%gs
704 jmp 2b
705 .previous
706
707/*
708 * Create a kernel thread.
709 *
710 * C extern interface:
711 * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
712 *
713 * asm input arguments:
714 * rdi: fn, rsi: arg, rdx: flags
715 */
716ENTRY(kernel_thread)
717 CFI_STARTPROC
718 FAKE_STACK_FRAME $child_rip
719 SAVE_ALL
720
721 # rdi: flags, rsi: usp, rdx: will be &pt_regs
722 movq %rdx,%rdi
723 orq kernel_thread_flags(%rip),%rdi
724 movq $-1, %rsi
725 movq %rsp, %rdx
726
727 xorl %r8d,%r8d
728 xorl %r9d,%r9d
729
730 # clone now
731 call do_fork
732 movq %rax,RAX(%rsp)
733 xorl %edi,%edi
734
735 /*
736	 * It isn't worth checking for a reschedule here,
737	 * so internally to the x86_64 port you can rely on kernel_thread()
738	 * not rescheduling the child before returning; this avoids the need
739	 * for hacks, for example to fork off the per-CPU idle tasks.
740 * [Hopefully no generic code relies on the reschedule -AK]
741 */
742 RESTORE_ALL
743 UNFAKE_STACK_FRAME
744 ret
745 CFI_ENDPROC
746
747
748child_rip:
749 /*
750 * Here we are in the child and the registers are set as they were
751 * at kernel_thread() invocation in the parent.
752 */
753 movq %rdi, %rax
754 movq %rsi, %rdi
755 call *%rax
756 # exit
757 xorq %rdi, %rdi
758 call do_exit
759
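/*
 * Illustrative C-side use (hypothetical caller):
 *
 *	static int worker(void *arg) { ... ; return 0; }
 *	pid = kernel_thread(worker, NULL, CLONE_FS | CLONE_FILES);
 *
 * The child begins in child_rip above with fn still in %rdi and
 * arg in %rsi, exactly as they were at the kernel_thread() call.
 */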
760/*
761 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
762 *
763 * C extern interface:
764 * extern long execve(char *name, char **argv, char **envp)
765 *
766 * asm input arguments:
767 * rdi: name, rsi: argv, rdx: envp
768 *
769 * We want to fall back into:
770 * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
771 *
772 * do_sys_execve asm fallback arguments:
773 * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
774 */
775ENTRY(execve)
776 CFI_STARTPROC
777 FAKE_STACK_FRAME $0
778 SAVE_ALL
779 call sys_execve
780 movq %rax, RAX(%rsp)
781 RESTORE_REST
782 testq %rax,%rax
783 je int_ret_from_sys_call
784 RESTORE_ARGS
785 UNFAKE_STACK_FRAME
786 ret
787 CFI_ENDPROC
788
789ENTRY(page_fault)
790 errorentry do_page_fault
791
792ENTRY(coprocessor_error)
793 zeroentry do_coprocessor_error
794
795ENTRY(simd_coprocessor_error)
796 zeroentry do_simd_coprocessor_error
797
798ENTRY(device_not_available)
799 zeroentry math_state_restore
800
801 /* runs on exception stack */
802ENTRY(debug)
803 CFI_STARTPROC
804 pushq $0
805 CFI_ADJUST_CFA_OFFSET 8
806 paranoidentry do_debug
807 /* switch back to process stack to restore the state ptrace touched */
808 movq %rax,%rsp
809 testl $3,CS(%rsp)
810 jnz paranoid_userspace
811 jmp paranoid_exit
812 CFI_ENDPROC
813
814 /* runs on exception stack */
815ENTRY(nmi)
816 CFI_STARTPROC
817 pushq $-1
818 CFI_ADJUST_CFA_OFFSET 8
819 paranoidentry do_nmi
820 /* ebx: no swapgs flag */
821paranoid_exit:
822 testl %ebx,%ebx /* swapgs needed? */
823 jnz paranoid_restore
824paranoid_swapgs:
825 cli
826 swapgs
827paranoid_restore:
828 RESTORE_ALL 8
829 iretq
830paranoid_userspace:
831 cli
832 GET_THREAD_INFO(%rcx)
833 movl threadinfo_flags(%rcx),%edx
834 testl $_TIF_NEED_RESCHED,%edx
835 jnz paranoid_resched
836 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
837 jnz paranoid_signal
838 jmp paranoid_swapgs
839paranoid_resched:
840 sti
841 call schedule
842 jmp paranoid_exit
843paranoid_signal:
844 sti
845 xorl %esi,%esi /* oldset */
846 movq %rsp,%rdi /* &pt_regs */
847 call do_notify_resume
848 jmp paranoid_exit
849 CFI_ENDPROC
850
851ENTRY(int3)
852 zeroentry do_int3
853
854ENTRY(overflow)
855 zeroentry do_overflow
856
857ENTRY(bounds)
858 zeroentry do_bounds
859
860ENTRY(invalid_op)
861 zeroentry do_invalid_op
862
863ENTRY(coprocessor_segment_overrun)
864 zeroentry do_coprocessor_segment_overrun
865
866ENTRY(reserved)
867 zeroentry do_reserved
868
869 /* runs on exception stack */
870ENTRY(double_fault)
871 CFI_STARTPROC
872 paranoidentry do_double_fault
873 movq %rax,%rsp
874 testl $3,CS(%rsp)
875 jnz paranoid_userspace
876 jmp paranoid_exit
877 CFI_ENDPROC
878
879ENTRY(invalid_TSS)
880 errorentry do_invalid_TSS
881
882ENTRY(segment_not_present)
883 errorentry do_segment_not_present
884
885 /* runs on exception stack */
886ENTRY(stack_segment)
887 CFI_STARTPROC
888 paranoidentry do_stack_segment
889 movq %rax,%rsp
890 testl $3,CS(%rsp)
891 jnz paranoid_userspace
892 jmp paranoid_exit
893 CFI_ENDPROC
894
895ENTRY(general_protection)
896 errorentry do_general_protection
897
898ENTRY(alignment_check)
899 errorentry do_alignment_check
900
901ENTRY(divide_error)
902 zeroentry do_divide_error
903
904ENTRY(spurious_interrupt_bug)
905 zeroentry do_spurious_interrupt_bug
906
907#ifdef CONFIG_X86_MCE
908 /* runs on exception stack */
909ENTRY(machine_check)
910 CFI_STARTPROC
911 pushq $0
912 CFI_ADJUST_CFA_OFFSET 8
913 paranoidentry do_machine_check
914 jmp paranoid_exit
915 CFI_ENDPROC
916#endif
917
918ENTRY(call_debug)
919 zeroentry do_call_debug
920
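
The stub above is easier to follow next to a user-space analogue: clone(2) with CLONE_VM is the same shape of primitive that kernel_thread() builds out of do_fork(), namely a child sharing the caller's address space that runs fn(arg) and then exits (in the kernel path, child_rip ends with do_exit(0), discarding fn's return value). A minimal sketch, not the kernel's code; worker, STACK_SIZE and the argument string are illustrative:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <signal.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/wait.h>

	static int worker(void *arg)
	{
		printf("child running, arg=%s\n", (char *)arg);
		return 0;	/* the kernel stub instead ends with do_exit(0) */
	}

	int main(void)
	{
		enum { STACK_SIZE = 64 * 1024 };
		char *stack = malloc(STACK_SIZE);

		if (!stack)
			return 1;
		/* clone() takes the *top* of the child stack on x86-64 (stacks grow down) */
		pid_t pid = clone(worker, stack + STACK_SIZE, CLONE_VM | SIGCHLD, "hello");
		if (pid < 0)
			return 1;
		waitpid(pid, NULL, 0);
		free(stack);
		return 0;
	}
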
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
new file mode 100644
index 000000000000..d2c42fb99cb3
--- /dev/null
+++ b/arch/x86_64/kernel/genapic.c
@@ -0,0 +1,89 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Generic APIC sub-arch probe layer.
6 *
7 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
8 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
9 * James Cleverdon.
10 */
11#include <linux/config.h>
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/string.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h>
17#include <linux/init.h>
18#include <linux/module.h>
19
20#include <asm/smp.h>
21#include <asm/ipi.h>
22
23/* which logical CPU number maps to which CPU (physical APIC ID) */
24u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
25EXPORT_SYMBOL(x86_cpu_to_apicid);
26u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
27
28extern struct genapic apic_cluster;
29extern struct genapic apic_flat;
30
31struct genapic *genapic = &apic_flat;
32
33
34/*
35 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
36 */
37void __init clustered_apic_check(void)
38{
39 long i;
40 u8 clusters, max_cluster;
41 u8 id;
42 u8 cluster_cnt[NUM_APIC_CLUSTERS];
43
44 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
45 /* AMD always uses flat mode right now */
46 genapic = &apic_flat;
47 goto print;
48 }
49
50 memset(cluster_cnt, 0, sizeof(cluster_cnt));
51
52 for (i = 0; i < NR_CPUS; i++) {
53 id = bios_cpu_apicid[i];
54 if (id != BAD_APICID)
55 cluster_cnt[APIC_CLUSTERID(id)]++;
56 }
57
58 clusters = 0;
59 max_cluster = 0;
60 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
61 if (cluster_cnt[i] > 0) {
62 ++clusters;
63 if (cluster_cnt[i] > max_cluster)
64 max_cluster = cluster_cnt[i];
65 }
66 }
67
68 /*
69 * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
70 * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
71 * else physical mode.
72 * (We don't use lowest priority delivery + HW APIC IRQ steering, so
73 * can ignore the clustered logical case and go straight to physical.)
74 */
75 if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster)
76 genapic = &apic_flat;
77 else
78 genapic = &apic_cluster;
79
80print:
81 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
82}
83
84/* Same for both flat and clustered. */
85
86void send_IPI_self(int vector)
87{
88 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
89}
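
The counting heuristic above is easier to see with concrete numbers. Below is a small stand-alone sketch of the same loop; it assumes the xAPIC cluster convention that APIC_CLUSTERID() encodes (the upper four bits of the 8-bit APIC ID select one of 16 clusters, consistent with the APIC_CLUSTER(id) = id & 0xF0 usage in genapic_cluster.c); CLUSTERID and the sample IDs are illustrative stand-ins:

	#include <stdio.h>

	#define NUM_CLUSTERS	16
	#define CLUSTERID(id)	(((id) >> 4) & 0xf)	/* assumed xAPIC layout */

	int main(void)
	{
		unsigned char ids[] = { 0x00, 0x01, 0x10, 0x11, 0x12, 0x23 };
		int cnt[NUM_CLUSTERS] = { 0 };
		int clusters = 0, max = 0;

		for (unsigned i = 0; i < sizeof(ids); i++)
			cnt[CLUSTERID(ids[i])]++;
		for (int c = 0; c < NUM_CLUSTERS; c++) {
			if (cnt[c]) {
				clusters++;
				if (cnt[c] > max)
					max = cnt[c];
			}
		}
		/* 3 clusters, largest holds 3 CPUs -> clustered mode would be chosen */
		printf("%d clusters, largest has %d CPUs\n", clusters, max);
		return 0;
	}
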
diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c
new file mode 100644
index 000000000000..9703da7202e3
--- /dev/null
+++ b/arch/x86_64/kernel/genapic_cluster.c
@@ -0,0 +1,130 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Clustered APIC subarch code. Up to 255 CPUs, physical delivery.
6 * (A more realistic maximum is around 230 CPUs.)
7 *
8 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
9 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
10 * James Cleverdon.
11 */
12#include <linux/config.h>
13#include <linux/threads.h>
14#include <linux/cpumask.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/ctype.h>
18#include <linux/init.h>
19#include <asm/smp.h>
20#include <asm/ipi.h>
21
22
23/*
24 * Set up the logical destination ID.
25 *
26 * Intel recommends to set DFR, LDR and TPR before enabling
27 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
28 * document number 292116). So here it goes...
29 */
30static void cluster_init_apic_ldr(void)
31{
32 unsigned long val, id;
33 long i, count;
34 u8 lid;
35 u8 my_id = hard_smp_processor_id();
36 u8 my_cluster = APIC_CLUSTER(my_id);
37
38 /* Create logical APIC IDs by counting CPUs already in cluster. */
39 for (count = 0, i = NR_CPUS; --i >= 0; ) {
40 lid = x86_cpu_to_log_apicid[i];
41 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
42 ++count;
43 }
44 /*
45 * We only have a 4-bit-wide bitmap in cluster mode. There's no way
46 * to get above 60 CPUs and still give each one its own bit.
47 * But, we're using physical IRQ delivery, so we don't care.
48 * Use bit 3 for the 4th through Nth CPU in each cluster.
49 */
50 if (count >= XAPIC_DEST_CPUS_SHIFT)
51 count = 3;
52 id = my_cluster | (1UL << count);
53 x86_cpu_to_log_apicid[smp_processor_id()] = id;
54 apic_write_around(APIC_DFR, APIC_DFR_CLUSTER);
55 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
56 val |= SET_APIC_LOGICAL_ID(id);
57 apic_write_around(APIC_LDR, val);
58}
59
60/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
61
62static cpumask_t cluster_target_cpus(void)
63{
64 return cpumask_of_cpu(0);
65}
66
67static void cluster_send_IPI_mask(cpumask_t mask, int vector)
68{
69 send_IPI_mask_sequence(mask, vector);
70}
71
72static void cluster_send_IPI_allbutself(int vector)
73{
74 cpumask_t mask = cpu_online_map;
75 cpu_clear(smp_processor_id(), mask);
76
77 if (!cpus_empty(mask))
78 cluster_send_IPI_mask(mask, vector);
79}
80
81static void cluster_send_IPI_all(int vector)
82{
83 cluster_send_IPI_mask(cpu_online_map, vector);
84}
85
86static int cluster_apic_id_registered(void)
87{
88 return 1;
89}
90
91static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask)
92{
93 int cpu;
94
95 /*
96 * We're using fixed IRQ delivery, can only return one phys APIC ID.
97 * May as well be the first.
98 */
99 cpu = first_cpu(cpumask);
100 if ((unsigned)cpu < NR_CPUS)
101 return x86_cpu_to_apicid[cpu];
102 else
103 return BAD_APICID;
104}
105
106/* cpuid returns the value latched in the HW at reset, not the APIC ID
107 * register's value. For any box whose BIOS changes APIC IDs, like
108 * clustered APIC systems, we must use hard_smp_processor_id.
109 *
110 * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
111 */
112static unsigned int phys_pkg_id(int index_msb)
113{
114 return hard_smp_processor_id() >> index_msb;
115}
116
117struct genapic apic_cluster = {
118 .name = "clustered",
119 .int_delivery_mode = dest_Fixed,
120 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
121 .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED,
122 .target_cpus = cluster_target_cpus,
123 .apic_id_registered = cluster_apic_id_registered,
124 .init_apic_ldr = cluster_init_apic_ldr,
125 .send_IPI_all = cluster_send_IPI_all,
126 .send_IPI_allbutself = cluster_send_IPI_allbutself,
127 .send_IPI_mask = cluster_send_IPI_mask,
128 .cpu_mask_to_apicid = cluster_cpu_mask_to_apicid,
129 .phys_pkg_id = phys_pkg_id,
130};
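
A worked example of the logical-ID packing performed by cluster_init_apic_ldr() above, assuming XAPIC_DEST_CPUS_SHIFT == 4 (the 4-bit per-cluster bitmap the comment describes); make_logical_id is an illustrative name, not a kernel function:

	#include <stdio.h>

	static unsigned make_logical_id(unsigned cluster, int already_in_cluster)
	{
		int count = already_in_cluster;

		if (count >= 4)		/* only 4 bits: 4th..Nth CPU share bit 3 */
			count = 3;
		return cluster | (1u << count);
	}

	int main(void)
	{
		/* Cluster 0x30: first four CPUs get bits 0..3, later ones share bit 3. */
		for (int n = 0; n < 6; n++)
			printf("cluster 0x30, %d CPU(s) already there -> logical id 0x%02x\n",
			       n, make_logical_id(0x30, n));
		return 0;
	}
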
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
new file mode 100644
index 000000000000..b4cbbad04226
--- /dev/null
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -0,0 +1,127 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Flat APIC subarch code. Maximum 8 CPUs, logical delivery.
6 *
7 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
8 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
9 * James Cleverdon.
10 */
11#include <linux/config.h>
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/string.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h>
17#include <linux/init.h>
18#include <asm/smp.h>
19#include <asm/ipi.h>
20
21
22static cpumask_t flat_target_cpus(void)
23{
24 return cpu_online_map;
25}
26
27/*
28 * Set up the logical destination ID.
29 *
30 * Intel recommends to set DFR, LDR and TPR before enabling
31 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
32 * document number 292116). So here it goes...
33 */
34static void flat_init_apic_ldr(void)
35{
36 unsigned long val;
37 unsigned long num, id;
38
39 num = smp_processor_id();
40 id = 1UL << num;
41 x86_cpu_to_log_apicid[num] = id;
42 apic_write_around(APIC_DFR, APIC_DFR_FLAT);
43 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
44 val |= SET_APIC_LOGICAL_ID(id);
45 apic_write_around(APIC_LDR, val);
46}
47
48static void flat_send_IPI_allbutself(int vector)
49{
50 /*
51 * if there are no other CPUs in the system then
52 * we get an APIC send error if we try to broadcast.
53 * thus we have to avoid sending IPIs in this case.
54 */
55 if (num_online_cpus() > 1)
56 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
57}
58
59static void flat_send_IPI_all(int vector)
60{
61 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
62}
63
64static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
65{
66 unsigned long mask = cpus_addr(cpumask)[0];
67 unsigned long cfg;
68 unsigned long flags;
69
70 local_save_flags(flags);
71 local_irq_disable();
72
73 /*
74 * Wait for idle.
75 */
76 apic_wait_icr_idle();
77
78 /*
79 * prepare target chip field
80 */
81 cfg = __prepare_ICR2(mask);
82 apic_write_around(APIC_ICR2, cfg);
83
84 /*
85 * program the ICR
86 */
87 cfg = __prepare_ICR(0, vector, APIC_DEST_LOGICAL);
88
89 /*
90 * Send the IPI. The write to APIC_ICR fires this off.
91 */
92 apic_write_around(APIC_ICR, cfg);
93 local_irq_restore(flags);
94}
95
96static int flat_apic_id_registered(void)
97{
98 return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
99}
100
101static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
102{
103 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
104}
105
106static unsigned int phys_pkg_id(int index_msb)
107{
108 u32 ebx;
109
110 ebx = cpuid_ebx(1);
111 return ((ebx >> 24) & 0xFF) >> index_msb;
112}
113
114struct genapic apic_flat = {
115 .name = "flat",
116 .int_delivery_mode = dest_LowestPrio,
117 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
118 .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
119 .target_cpus = flat_target_cpus,
120 .apic_id_registered = flat_apic_id_registered,
121 .init_apic_ldr = flat_init_apic_ldr,
122 .send_IPI_all = flat_send_IPI_all,
123 .send_IPI_allbutself = flat_send_IPI_allbutself,
124 .send_IPI_mask = flat_send_IPI_mask,
125 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
126 .phys_pkg_id = phys_pkg_id,
127};
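
In flat mode the logical APIC ID of CPU n is simply 1 << n, which is why flat_cpu_mask_to_apicid() can return the low byte of the cpumask unchanged: the online-CPU mask already is the logical destination bitmap. A one-line demonstration (0xff standing in for APIC_ALL_CPUS):

	#include <stdio.h>

	int main(void)
	{
		unsigned long cpumask = (1ul << 0) | (1ul << 2) | (1ul << 5);
		unsigned dest = cpumask & 0xffu;	/* as flat_cpu_mask_to_apicid() does */

		printf("IPI destination byte for CPUs {0,2,5}: 0x%02x\n", dest);	/* 0x25 */
		return 0;
	}
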
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
new file mode 100644
index 000000000000..b6d8725c1f61
--- /dev/null
+++ b/arch/x86_64/kernel/head.S
@@ -0,0 +1,396 @@
1/*
2 * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
7 * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
8 *
9 * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
10 */
11
12
13#include <linux/linkage.h>
14#include <linux/threads.h>
15#include <asm/desc.h>
16#include <asm/segment.h>
17#include <asm/page.h>
18#include <asm/msr.h>
19#include <asm/cache.h>
20
21/* We are not able to switch in one step to the final KERNEL ADDRESS SPACE
22 * because we need identity-mapped pages during setup, so __START_KERNEL is
23 * defined as 0x100000 for this stage.
24 *
25 */
26
27 .text
28 .code32
29 .globl startup_32
30/* %bx: 1 if coming from smp trampoline on secondary cpu */
31startup_32:
32
33 /*
34 * At this point the CPU runs in 32bit protected mode (CS.D = 1) with
35 * paging disabled and the point of this file is to switch to 64bit
36 * long mode with a mapping for the kernel, and to jump into the
37 * kernel virtual addresses.
38 * There is no stack until we set one up.
39 */
40
41 /* Initialize the %ds segment register */
42 movl $__KERNEL_DS,%eax
43 movl %eax,%ds
44
45 /* Load new GDT with the 64bit segments using 32bit descriptor */
46 lgdt pGDT32 - __START_KERNEL_map
47
48 /* If the CPU doesn't support CPUID this will double fault.
49 * Unfortunately it is hard to check for CPUID without a stack.
50 */
51
52 /* Check if extended functions are implemented */
53 movl $0x80000000, %eax
54 cpuid
55 cmpl $0x80000000, %eax
56 jbe no_long_mode
57 /* Check if long mode is implemented */
58 mov $0x80000001, %eax
59 cpuid
60 btl $29, %edx
61 jnc no_long_mode
62
63 /*
64 * Prepare for entering 64bit mode
65 */
66
67 /* Enable PAE mode */
68 xorl %eax, %eax
69 btsl $5, %eax
70 movl %eax, %cr4
71
72 /* Setup early boot stage 4 level pagetables */
73 movl $(init_level4_pgt - __START_KERNEL_map), %eax
74 movl %eax, %cr3
75
76 /* Setup EFER (Extended Feature Enable Register) */
77 movl $MSR_EFER, %ecx
78 rdmsr
79
80 /* Enable Long Mode */
81 btsl $_EFER_LME, %eax
82
83 /* Make changes effective */
84 wrmsr
85
86 xorl %eax, %eax
87 btsl $31, %eax /* Enable paging and in turn activate Long Mode */
88 btsl $0, %eax /* Enable protected mode */
89 /* Make changes effective */
90 movl %eax, %cr0
91 /*
92 * At this point we're in long mode but in 32bit compatibility mode
93 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
94 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
95 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
96 */
97 ljmp $__KERNEL_CS, $(startup_64 - __START_KERNEL_map)
98
99 .code64
100 .org 0x100
101 .globl startup_64
102startup_64:
103 /* We come here either from startup_32
104 * or directly from a 64bit bootloader.
105 * Since we may have come directly from a bootloader we
106 * reload the page tables here.
107 */
108
109 /* Enable PAE mode and PGE */
110 xorq %rax, %rax
111 btsq $5, %rax
112 btsq $7, %rax
113 movq %rax, %cr4
114
115 /* Setup early boot stage 4 level pagetables. */
116 movq $(init_level4_pgt - __START_KERNEL_map), %rax
117 movq %rax, %cr3
118
119 /* Check if nx is implemented */
120 movl $0x80000001, %eax
121 cpuid
122 movl %edx,%edi
123
124 /* Setup EFER (Extended Feature Enable Register) */
125 movl $MSR_EFER, %ecx
126 rdmsr
127
128 /* Enable System Call */
129 btsl $_EFER_SCE, %eax
130
131 /* No Execute supported? */
132 btl $20,%edi
133 jnc 1f
134 btsl $_EFER_NX, %eax
1351:
136 /* Make changes effective */
137 wrmsr
138
139 /* Setup cr0 */
140 xorq %rax, %rax
141 btsq $31, %rax /* Enable paging */
142 btsq $0, %rax /* Enable protected mode */
143 btsq $1, %rax /* Enable MP */
144 btsq $4, %rax /* Enable ET */
145 btsq $5, %rax /* Enable NE */
146 btsq $16, %rax /* Enable WP */
147 btsq $18, %rax /* Enable AM */
148 /* Make changes effective */
149 movq %rax, %cr0
150
151 /* Setup a boot time stack */
152 movq init_rsp(%rip),%rsp
153
154 /* zero EFLAGS after setting rsp */
155 pushq $0
156 popfq
157
158 /*
159 * We must switch to a new descriptor in kernel space for the GDT
160 * because soon the kernel will no longer have access to the userspace
161 * addresses we're currently running at. We have to do that here
162 * because in 32bit we couldn't load a 64bit linear address.
163 */
164 lgdt cpu_gdt_descr
165
166 /*
167 * Set up a dummy PDA. This is just for some early bootup code
168 * that does in_interrupt()
169 */
170 movl $MSR_GS_BASE,%ecx
171 movq $empty_zero_page,%rax
172 movq %rax,%rdx
173 shrq $32,%rdx
174 wrmsr
175
176 /* set up data segments. actually 0 would do too */
177 movl $__KERNEL_DS,%eax
178 movl %eax,%ds
179 movl %eax,%ss
180 movl %eax,%es
181
182 /* esi is a pointer to the real mode structure with interesting info;
183 pass it to C */
184 movl %esi, %edi
185
186 /* Finally jump to run C code and to be on the real kernel address.
187 * Since we are running on identity-mapped space we have to jump
188 * to the full 64bit address; this is only possible as an indirect
189 * jump.
190 */
191 movq initial_code(%rip),%rax
192 jmp *%rax
193
194 /* SMP bootup changes these two */
195 .globl initial_code
196initial_code:
197 .quad x86_64_start_kernel
198 .globl init_rsp
199init_rsp:
200 .quad init_thread_union+THREAD_SIZE-8
201
202ENTRY(early_idt_handler)
203 xorl %eax,%eax
204 movq 8(%rsp),%rsi # get rip
205 movq (%rsp),%rdx
206 movq %cr2,%rcx
207 leaq early_idt_msg(%rip),%rdi
208 call early_printk
2091: hlt
210 jmp 1b
211
212early_idt_msg:
213 .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n"
214
215.code32
216ENTRY(no_long_mode)
217 /* This isn't an x86-64 CPU so hang */
2181:
219 jmp 1b
220
221.org 0xf00
222 .globl pGDT32
223pGDT32:
224 .word gdt_end-cpu_gdt_table
225 .long cpu_gdt_table-__START_KERNEL_map
226
227.org 0xf10
228ljumpvector:
229 .long startup_64-__START_KERNEL_map
230 .word __KERNEL_CS
231
232ENTRY(stext)
233ENTRY(_stext)
234
235 /*
236 * This default setting generates an ident mapping at address 0x100000
237 * and a mapping for the kernel that precisely maps virtual address
238 * 0xffffffff80000000 to physical address 0x000000. (always using
239 * 2Mbyte large pages provided by PAE mode)
240 */
241.org 0x1000
242ENTRY(init_level4_pgt)
243 .quad 0x0000000000102007 /* -> level3_ident_pgt */
244 .fill 255,8,0
245 .quad 0x000000000010a007
246 .fill 254,8,0
247 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
248 .quad 0x0000000000103007 /* -> level3_kernel_pgt */
249
250.org 0x2000
251ENTRY(level3_ident_pgt)
252 .quad 0x0000000000104007
253 .fill 511,8,0
254
255.org 0x3000
256ENTRY(level3_kernel_pgt)
257 .fill 510,8,0
258 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
259 .quad 0x0000000000105007 /* -> level2_kernel_pgt */
260 .fill 1,8,0
261
262.org 0x4000
263ENTRY(level2_ident_pgt)
264 /* 40MB for bootup. */
265 .quad 0x0000000000000283
266 .quad 0x0000000000200183
267 .quad 0x0000000000400183
268 .quad 0x0000000000600183
269 .quad 0x0000000000800183
270 .quad 0x0000000000A00183
271 .quad 0x0000000000C00183
272 .quad 0x0000000000E00183
273 .quad 0x0000000001000183
274 .quad 0x0000000001200183
275 .quad 0x0000000001400183
276 .quad 0x0000000001600183
277 .quad 0x0000000001800183
278 .quad 0x0000000001A00183
279 .quad 0x0000000001C00183
280 .quad 0x0000000001E00183
281 .quad 0x0000000002000183
282 .quad 0x0000000002200183
283 .quad 0x0000000002400183
284 .quad 0x0000000002600183
285 /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */
286 .globl temp_boot_pmds
287temp_boot_pmds:
288 .fill 492,8,0
289
290.org 0x5000
291ENTRY(level2_kernel_pgt)
292 /* 40MB kernel mapping. The kernel code cannot be bigger than that.
293 When you change this, change KERNEL_TEXT_SIZE in page.h too. */
294 /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
295 .quad 0x0000000000000183
296 .quad 0x0000000000200183
297 .quad 0x0000000000400183
298 .quad 0x0000000000600183
299 .quad 0x0000000000800183
300 .quad 0x0000000000A00183
301 .quad 0x0000000000C00183
302 .quad 0x0000000000E00183
303 .quad 0x0000000001000183
304 .quad 0x0000000001200183
305 .quad 0x0000000001400183
306 .quad 0x0000000001600183
307 .quad 0x0000000001800183
308 .quad 0x0000000001A00183
309 .quad 0x0000000001C00183
310 .quad 0x0000000001E00183
311 .quad 0x0000000002000183
312 .quad 0x0000000002200183
313 .quad 0x0000000002400183
314 .quad 0x0000000002600183
315 /* Module mapping starts here */
316 .fill 492,8,0
317
318.org 0x6000
319ENTRY(empty_zero_page)
320
321.org 0x7000
322ENTRY(empty_bad_page)
323
324.org 0x8000
325ENTRY(empty_bad_pte_table)
326
327.org 0x9000
328ENTRY(empty_bad_pmd_table)
329
330.org 0xa000
331ENTRY(level3_physmem_pgt)
332 .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
333
334 .org 0xb000
335#ifdef CONFIG_ACPI_SLEEP
336ENTRY(wakeup_level4_pgt)
337 .quad 0x0000000000102007 /* -> level3_ident_pgt */
338 .fill 255,8,0
339 .quad 0x000000000010a007
340 .fill 254,8,0
341 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
342 .quad 0x0000000000103007 /* -> level3_kernel_pgt */
343#endif
344
345 .data
346
347 .align 16
348 .globl cpu_gdt_descr
349cpu_gdt_descr:
350 .word gdt_end-cpu_gdt_table
351gdt:
352 .quad cpu_gdt_table
353#ifdef CONFIG_SMP
354 .rept NR_CPUS-1
355 .word 0
356 .quad 0
357 .endr
358#endif
359
360/* We need valid kernel segments for data and code in long mode too;
361 * IRET will check the segment types (kkeil 2000/10/28).
362 * Also, sysret mandates a special GDT layout.
363 */
364
365.align L1_CACHE_BYTES
366
367/* The TLS descriptors are currently at a different place compared to i386.
368 Hopefully nobody expects them at a fixed place (Wine?) */
369
370ENTRY(cpu_gdt_table)
371 .quad 0x0000000000000000 /* NULL descriptor */
372 .quad 0x008f9a000000ffff /* __KERNEL_COMPAT32_CS */
373 .quad 0x00af9a000000ffff /* __KERNEL_CS */
374 .quad 0x00cf92000000ffff /* __KERNEL_DS */
375 .quad 0x00cffa000000ffff /* __USER32_CS */
376 .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
377 .quad 0x00affa000000ffff /* __USER_CS */
378 .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
379 .quad 0,0 /* TSS */
380 .quad 0,0 /* LDT */
381 .quad 0,0,0 /* three TLS descriptors */
382 .quad 0x00009a000000ffff /* __KERNEL16_CS - 16bit PM for S3 wakeup. */
383 /* base must be patched for real base address. */
384gdt_end:
385 /* asm/segment.h:GDT_ENTRIES must match this */
386 /* This should be a multiple of the cache line size */
387 /* GDTs of other CPUs: */
388 .fill (GDT_SIZE * NR_CPUS) - (gdt_end - cpu_gdt_table)
389
390 .align L1_CACHE_BYTES
391ENTRY(idt_table)
392 .rept 256
393 .quad 0
394 .quad 0
395 .endr
396
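
The 0x183 flag value used throughout the 2MB mappings above decodes, per the architectural long-mode PDE layout, to Present | Writable | PageSize | Global (bits 0, 1, 7 and 8); the low 21 bits of a 2MB entry are flags, and the remaining bits hold the 2MB-aligned physical base. A small stand-alone decoder sketch:

	#include <stdio.h>

	#define PTE_P	(1ul << 0)	/* present */
	#define PTE_RW	(1ul << 1)	/* writable */
	#define PTE_PS	(1ul << 7)	/* page size: 2MB page in a PDE */
	#define PTE_G	(1ul << 8)	/* global: survives CR3 reloads in the TLB */

	int main(void)
	{
		unsigned long e = 0x0000000001A00183ul;	/* an entry from level2_kernel_pgt */

		printf("phys base=%#lx P=%d RW=%d PS=%d G=%d\n",
		       e & ~0x1ffffful,
		       !!(e & PTE_P), !!(e & PTE_RW), !!(e & PTE_PS), !!(e & PTE_G));
		return 0;
	}
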
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
new file mode 100644
index 000000000000..6cad46c98a23
--- /dev/null
+++ b/arch/x86_64/kernel/head64.c
@@ -0,0 +1,117 @@
1/*
2 * linux/arch/x86_64/kernel/head64.c -- prepare to run common code
3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 *
6 * $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $
7 */
8
9#include <linux/init.h>
10#include <linux/linkage.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/string.h>
14#include <linux/percpu.h>
15
16#include <asm/processor.h>
17#include <asm/proto.h>
18#include <asm/smp.h>
19#include <asm/bootsetup.h>
20#include <asm/setup.h>
21#include <asm/desc.h>
22
23/* Don't add a printk in here. printk relies on the PDA, which is not
24 initialized yet. */
25static void __init clear_bss(void)
26{
27 extern char __bss_start[], __bss_end[];
28 memset(__bss_start, 0,
29 (unsigned long) __bss_end - (unsigned long) __bss_start);
30}
31
32extern char x86_boot_params[2048];
33
34#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
35#define OLD_CL_MAGIC_ADDR 0x90020
36#define OLD_CL_MAGIC 0xA33F
37#define OLD_CL_BASE_ADDR 0x90000
38#define OLD_CL_OFFSET 0x90022
39
40extern char saved_command_line[];
41
42static void __init copy_bootdata(char *real_mode_data)
43{
44 int new_data;
45 char * command_line;
46
47 memcpy(x86_boot_params, real_mode_data, 2048);
48 new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
49 if (!new_data) {
50 if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
51 printk("so old bootloader that it does not support commandline?!\n");
52 return;
53 }
54 new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
55 printk("old bootloader convention, maybe loadlin?\n");
56 }
57 command_line = (char *) ((u64)(new_data));
58 memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
59 printk("Bootdata ok (command line is %s)\n", saved_command_line);
60}
61
62static void __init setup_boot_cpu_data(void)
63{
64 unsigned int dummy, eax;
65
66 /* get vendor info */
67 cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
68 (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
69 (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
70 (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
71
72 /* get cpu type */
73 cpuid(1, &eax, &dummy, &dummy,
74 (unsigned int *) &boot_cpu_data.x86_capability);
75 boot_cpu_data.x86 = (eax >> 8) & 0xf;
76 boot_cpu_data.x86_model = (eax >> 4) & 0xf;
77 boot_cpu_data.x86_mask = eax & 0xf;
78}
79
80extern char _end[];
81
82void __init x86_64_start_kernel(char * real_mode_data)
83{
84 char *s;
85 int i;
86
87 for (i = 0; i < 256; i++)
88 set_intr_gate(i, early_idt_handler);
89 asm volatile("lidt %0" :: "m" (idt_descr));
90 clear_bss();
91 pda_init(0);
92 copy_bootdata(real_mode_data);
93#ifdef CONFIG_SMP
94 cpu_set(0, cpu_online_map);
95#endif
96 /* default console: */
97 if (!strstr(saved_command_line, "console="))
98 strcat(saved_command_line, " console=tty0");
99 s = strstr(saved_command_line, "earlyprintk=");
100 if (s != NULL)
101 setup_early_printk(s);
102#ifdef CONFIG_DISCONTIGMEM
103 s = strstr(saved_command_line, "numa=");
104 if (s != NULL)
105 numa_setup(s+5);
106#endif
107#ifdef CONFIG_X86_IO_APIC
108 if (strstr(saved_command_line, "disableapic"))
109 disable_apic = 1;
110#endif
111 /* You need early console to see that */
112 if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
113 panic("Kernel too big for kernel mapping\n");
114
115 setup_boot_cpu_data();
116 start_kernel();
117}
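
A sketch of the two command-line conventions copy_bootdata() above distinguishes: the new boot protocol stores a 32-bit pointer at offset 0x228 of the real-mode data, while the old convention is recognized by the 0xA33F magic word and uses a 16-bit offset relative to segment base 0x90000. cmdline_addr and the sample values are illustrative:

	#include <stdio.h>
	#include <stdint.h>

	#define OLD_CL_MAGIC		0xA33F
	#define OLD_CL_BASE_ADDR	0x90000

	static uint32_t cmdline_addr(uint32_t new_ptr, uint16_t magic, uint16_t off)
	{
		if (new_ptr)			/* new protocol: pointer at 0x228 */
			return new_ptr;
		if (magic != OLD_CL_MAGIC)	/* bootloader too old, no command line */
			return 0;
		return OLD_CL_BASE_ADDR + off;	/* old protocol: 0x90000 + offset */
	}

	int main(void)
	{
		printf("new-style: %#x\n", cmdline_addr(0x1e000, 0, 0));
		printf("old-style: %#x\n", cmdline_addr(0, OLD_CL_MAGIC, 0x1000));
		return 0;
	}
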
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c
new file mode 100644
index 000000000000..ba139cac57ce
--- /dev/null
+++ b/arch/x86_64/kernel/i387.c
@@ -0,0 +1,155 @@
1/*
2 * linux/arch/x86_64/kernel/i387.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 * Copyright (C) 2002 Andi Kleen, SuSE Labs
6 *
7 * Pentium III FXSR, SSE support
8 * General FPU state handling cleanups
9 * Gareth Hughes <gareth@valinux.com>, May 2000
10 *
11 * x86-64 rework 2002 Andi Kleen.
12 * Does direct fxsave in and out of user space now for signal handlers.
13 * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation,
14 * the 64bit user space sees an FXSAVE frame directly.
15 */
16
17#include <linux/config.h>
18#include <linux/sched.h>
19#include <linux/init.h>
20#include <asm/processor.h>
21#include <asm/i387.h>
22#include <asm/sigcontext.h>
23#include <asm/user.h>
24#include <asm/ptrace.h>
25#include <asm/uaccess.h>
26
27unsigned int mxcsr_feature_mask = 0xffffffff;
28
29void mxcsr_feature_mask_init(void)
30{
31 unsigned int mask;
32 clts();
33 memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
34 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
35 mask = current->thread.i387.fxsave.mxcsr_mask;
36 if (mask == 0) mask = 0x0000ffbf;
37 mxcsr_feature_mask &= mask;
38 stts();
39}
40
41/*
42 * Called at bootup to set up the initial FPU state that is later cloned
43 * into all processes.
44 */
45void __init fpu_init(void)
46{
47 unsigned long oldcr0 = read_cr0();
48 extern void __bad_fxsave_alignment(void);
49
50 if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
51 __bad_fxsave_alignment();
52 set_in_cr4(X86_CR4_OSFXSR);
53 set_in_cr4(X86_CR4_OSXMMEXCPT);
54
55 write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
56
57 mxcsr_feature_mask_init();
58 /* clean state in init */
59 current_thread_info()->status = 0;
60 clear_used_math();
61}
62
63void init_fpu(struct task_struct *child)
64{
65 if (tsk_used_math(child)) {
66 if (child == current)
67 unlazy_fpu(child);
68 return;
69 }
70 memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
71 child->thread.i387.fxsave.cwd = 0x37f;
72 child->thread.i387.fxsave.mxcsr = 0x1f80;
73 /* only the device not available exception or ptrace can call init_fpu */
74 set_stopped_child_used_math(child);
75}
76
77/*
78 * Signal frame handlers.
79 */
80
81int save_i387(struct _fpstate __user *buf)
82{
83 struct task_struct *tsk = current;
84 int err = 0;
85
86 {
87 extern void bad_user_i387_struct(void);
88 if (sizeof(struct user_i387_struct) != sizeof(tsk->thread.i387.fxsave))
89 bad_user_i387_struct();
90 }
91
92 if ((unsigned long)buf % 16)
93 printk("save_i387: bad fpstate %p\n",buf);
94
95 if (!used_math())
96 return 0;
97 clear_used_math(); /* trigger finit */
98 if (tsk->thread_info->status & TS_USEDFPU) {
99 err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
100 if (err) return err;
101 stts();
102 } else {
103 if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
104 sizeof(struct i387_fxsave_struct)))
105 return -1;
106 }
107 return 1;
108}
109
110/*
111 * ptrace request handlers.
112 */
113
114int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk)
115{
116 init_fpu(tsk);
117 return __copy_to_user(buf, &tsk->thread.i387.fxsave,
118 sizeof(struct user_i387_struct)) ? -EFAULT : 0;
119}
120
121int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf)
122{
123 if (__copy_from_user(&tsk->thread.i387.fxsave, buf,
124 sizeof(struct user_i387_struct)))
125 return -EFAULT;
126 return 0;
127}
128
129/*
130 * FPU state for core dumps.
131 */
132
133int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
134{
135 struct task_struct *tsk = current;
136
137 if (!used_math())
138 return 0;
139
140 unlazy_fpu(tsk);
141 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
142 return 1;
143}
144
145int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
146{
147 int fpvalid = !!tsk_used_math(tsk);
148
149 if (fpvalid) {
150 if (tsk == current)
151 unlazy_fpu(tsk);
152 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
153	}
154 return fpvalid;
155}
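
The call to the never-defined __bad_fxsave_alignment() in fpu_init() above is a link-time assertion: fxsave/fxrstor fault on operands that are not 16-byte aligned, so if the fxsave area ever loses its alignment, the call survives dead-code elimination and the kernel fails to link. A user-space sketch of the same invariant using C11's _Static_assert (the struct layout here is illustrative, not the kernel's):

	#include <stddef.h>
	#include <stdio.h>

	struct fxsave_area { unsigned char bytes[512]; } __attribute__((aligned(16)));
	struct thread { long pad; struct fxsave_area fx; };

	int main(void)
	{
		/* fxsave/fxrstor #GP on a misaligned operand, hence the check. */
		_Static_assert(offsetof(struct thread, fx) % 16 == 0,
			       "fxsave area must be 16-byte aligned");
		printf("fx offset = %zu\n", offsetof(struct thread, fx));
		return 0;
	}
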
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
new file mode 100644
index 000000000000..d52701246010
--- /dev/null
+++ b/arch/x86_64/kernel/i8259.c
@@ -0,0 +1,579 @@
1#include <linux/linkage.h>
2#include <linux/config.h>
3#include <linux/errno.h>
4#include <linux/signal.h>
5#include <linux/sched.h>
6#include <linux/ioport.h>
7#include <linux/interrupt.h>
8#include <linux/timex.h>
9#include <linux/slab.h>
10#include <linux/random.h>
11#include <linux/smp_lock.h>
12#include <linux/init.h>
13#include <linux/kernel_stat.h>
14#include <linux/sysdev.h>
15#include <linux/bitops.h>
16
17#include <asm/acpi.h>
18#include <asm/atomic.h>
19#include <asm/system.h>
20#include <asm/io.h>
21#include <asm/irq.h>
22#include <asm/hw_irq.h>
23#include <asm/pgtable.h>
24#include <asm/delay.h>
25#include <asm/desc.h>
26#include <asm/apic.h>
27
28#include <linux/irq.h>
29
30/*
31 * Common place to define all x86 IRQ vectors
32 *
33 * This builds up the IRQ handler stubs using some ugly macros in irq.h
34 *
35 * These macros create the low-level assembly IRQ routines that save
36 * register context and call do_IRQ(). do_IRQ() then does all the
37 * operations that are needed to keep the AT (or SMP IOAPIC)
38 * interrupt-controller happy.
39 */
40
41#define BI(x,y) \
42 BUILD_IRQ(x##y)
43
44#define BUILD_16_IRQS(x) \
45 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
46 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
47 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
48 BI(x,c) BI(x,d) BI(x,e) BI(x,f)
49
50#define BUILD_14_IRQS(x) \
51 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
52 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
53 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
54 BI(x,c) BI(x,d)
55
56/*
57 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
58 * (these are usually mapped to vectors 0x20-0x2f)
59 */
60BUILD_16_IRQS(0x0)
61
62#ifdef CONFIG_X86_LOCAL_APIC
63/*
64 * The IO-APIC gives us many more interrupt sources. Most of these
65 * are unused but an SMP system is supposed to have enough memory ...
66 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
67 * across the spectrum, so we really want to be prepared to get all
68 * of these. Plus, more powerful systems might have more than 64
69 * IO-APIC registers.
70 *
71 * (these are usually mapped into the 0x30-0xff vector range)
72 */
73 BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
74BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
75BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
76BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)
77
78#ifdef CONFIG_PCI_MSI
79 BUILD_14_IRQS(0xe)
80#endif
81
82#endif
83
84#undef BUILD_16_IRQS
85#undef BUILD_14_IRQS
86#undef BI
87
88
89#define IRQ(x,y) \
90 IRQ##x##y##_interrupt
91
92#define IRQLIST_16(x) \
93 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
94 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
95 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
96 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
97
98#define IRQLIST_14(x) \
99 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
100 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
101 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
102 IRQ(x,c), IRQ(x,d)
103
104void (*interrupt[NR_IRQS])(void) = {
105 IRQLIST_16(0x0),
106
107#ifdef CONFIG_X86_IO_APIC
108 IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3),
109 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
110 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
111 IRQLIST_16(0xc), IRQLIST_16(0xd)
112
113#ifdef CONFIG_PCI_MSI
114 , IRQLIST_14(0xe)
115#endif
116
117#endif
118};
119
120#undef IRQ
121#undef IRQLIST_16
122#undef IRQLIST_14
123
124/*
125 * This is the 'legacy' 8259A Programmable Interrupt Controller,
126 * present in the majority of PC/AT boxes,
127 * plus some generic x86-specific things, if generic specifics make
128 * any sense at all.
129 * This file should become arch/i386/kernel/irq.c when the old irq.c
130 * moves to arch-independent land.
131 */
132
133DEFINE_SPINLOCK(i8259A_lock);
134
135static void end_8259A_irq (unsigned int irq)
136{
137 if (irq > 256) {
138 char var;
139 printk("return %p stack %p ti %p\n", __builtin_return_address(0), &var, current->thread_info);
140
141 BUG();
142 }
143
144 if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) &&
145 irq_desc[irq].action)
146 enable_8259A_irq(irq);
147}
148
149#define shutdown_8259A_irq disable_8259A_irq
150
151static void mask_and_ack_8259A(unsigned int);
152
153static unsigned int startup_8259A_irq(unsigned int irq)
154{
155 enable_8259A_irq(irq);
156 return 0; /* never anything pending */
157}
158
159static struct hw_interrupt_type i8259A_irq_type = {
160 "XT-PIC",
161 startup_8259A_irq,
162 shutdown_8259A_irq,
163 enable_8259A_irq,
164 disable_8259A_irq,
165 mask_and_ack_8259A,
166 end_8259A_irq,
167 NULL
168};
169
170/*
171 * 8259A PIC functions to handle ISA devices:
172 */
173
174/*
175 * This contains the irq mask for both 8259A irq controllers,
176 */
177static unsigned int cached_irq_mask = 0xffff;
178
179#define __byte(x,y) (((unsigned char *)&(y))[x])
180#define cached_21 (__byte(0,cached_irq_mask))
181#define cached_A1 (__byte(1,cached_irq_mask))
182
183/*
184 * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
185 * boards the timer interrupt is not really connected to any IO-APIC pin,
186 * it's fed to the master 8259A's IR0 line only.
187 *
188 * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
189 * this 'mixed mode' IRQ handling costs nothing because it's only used
190 * at IRQ setup time.
191 */
192unsigned long io_apic_irqs;
193
194void disable_8259A_irq(unsigned int irq)
195{
196 unsigned int mask = 1 << irq;
197 unsigned long flags;
198
199 spin_lock_irqsave(&i8259A_lock, flags);
200 cached_irq_mask |= mask;
201 if (irq & 8)
202 outb(cached_A1,0xA1);
203 else
204 outb(cached_21,0x21);
205 spin_unlock_irqrestore(&i8259A_lock, flags);
206}
207
208void enable_8259A_irq(unsigned int irq)
209{
210 unsigned int mask = ~(1 << irq);
211 unsigned long flags;
212
213 spin_lock_irqsave(&i8259A_lock, flags);
214 cached_irq_mask &= mask;
215 if (irq & 8)
216 outb(cached_A1,0xA1);
217 else
218 outb(cached_21,0x21);
219 spin_unlock_irqrestore(&i8259A_lock, flags);
220}
221
222int i8259A_irq_pending(unsigned int irq)
223{
224 unsigned int mask = 1<<irq;
225 unsigned long flags;
226 int ret;
227
228 spin_lock_irqsave(&i8259A_lock, flags);
229 if (irq < 8)
230 ret = inb(0x20) & mask;
231 else
232 ret = inb(0xA0) & (mask >> 8);
233 spin_unlock_irqrestore(&i8259A_lock, flags);
234
235 return ret;
236}
237
238void make_8259A_irq(unsigned int irq)
239{
240 disable_irq_nosync(irq);
241 io_apic_irqs &= ~(1<<irq);
242 irq_desc[irq].handler = &i8259A_irq_type;
243 enable_irq(irq);
244}
245
246/*
247 * This function is expected to be called rarely. Switching between
248 * 8259A registers is slow.
249 * This has to be protected by the irq controller spinlock
250 * before being called.
251 */
252static inline int i8259A_irq_real(unsigned int irq)
253{
254 int value;
255 int irqmask = 1<<irq;
256
257 if (irq < 8) {
258 outb(0x0B,0x20); /* ISR register */
259 value = inb(0x20) & irqmask;
260 outb(0x0A,0x20); /* back to the IRR register */
261 return value;
262 }
263 outb(0x0B,0xA0); /* ISR register */
264 value = inb(0xA0) & (irqmask >> 8);
265 outb(0x0A,0xA0); /* back to the IRR register */
266 return value;
267}
268
269/*
270 * Careful! The 8259A is a fragile beast, it pretty
271 * much _has_ to be done exactly like this (mask it
272 * first, _then_ send the EOI, and the order of EOI
273 * to the two 8259s is important!
274 */
275static void mask_and_ack_8259A(unsigned int irq)
276{
277 unsigned int irqmask = 1 << irq;
278 unsigned long flags;
279
280 spin_lock_irqsave(&i8259A_lock, flags);
281 /*
282 * Lightweight spurious IRQ detection. We do not want
283 * to overdo spurious IRQ handling - it's usually a sign
284 * of hardware problems, so we only do the checks we can
285 * do without slowing down good hardware unnecessarily.
286 *
287 * Note that IRQ7 and IRQ15 (the two spurious IRQs
288 * usually resulting from the 8259A-1|2 PICs) occur
289 * even if the IRQ is masked in the 8259A. Thus we
290 * can check spurious 8259A IRQs without doing the
291 * quite slow i8259A_irq_real() call for every IRQ.
292 * This does not cover 100% of spurious interrupts,
293 * but should be enough to warn the user that there
294 * is something bad going on ...
295 */
296 if (cached_irq_mask & irqmask)
297 goto spurious_8259A_irq;
298 cached_irq_mask |= irqmask;
299
300handle_real_irq:
301 if (irq & 8) {
302 inb(0xA1); /* DUMMY - (do we need this?) */
303 outb(cached_A1,0xA1);
304 outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */
305 outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */
306 } else {
307 inb(0x21); /* DUMMY - (do we need this?) */
308 outb(cached_21,0x21);
309 outb(0x60+irq,0x20); /* 'Specific EOI' to master */
310 }
311 spin_unlock_irqrestore(&i8259A_lock, flags);
312 return;
313
314spurious_8259A_irq:
315 /*
316 * this is the slow path - should happen rarely.
317 */
318 if (i8259A_irq_real(irq))
319 /*
320 * oops, the IRQ _is_ in service according to the
321 * 8259A - not spurious, go handle it.
322 */
323 goto handle_real_irq;
324
325 {
326 static int spurious_irq_mask;
327 /*
328 * At this point we can be sure the IRQ is spurious,
329 * let's ACK and report it. [once per IRQ]
330 */
331 if (!(spurious_irq_mask & irqmask)) {
332 printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
333 spurious_irq_mask |= irqmask;
334 }
335 atomic_inc(&irq_err_count);
336 /*
337 * Theoretically we do not have to handle this IRQ,
338 * but in Linux this does not cause problems and is
339 * simpler for us.
340 */
341 goto handle_real_irq;
342 }
343}
344
345void init_8259A(int auto_eoi)
346{
347 unsigned long flags;
348
349 spin_lock_irqsave(&i8259A_lock, flags);
350
351 outb(0xff, 0x21); /* mask all of 8259A-1 */
352 outb(0xff, 0xA1); /* mask all of 8259A-2 */
353
354 /*
355 * outb_p - this has to work on a wide range of PC hardware.
356 */
357 outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */
358 outb_p(0x20 + 0, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
359 outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */
360 if (auto_eoi)
361 outb_p(0x03, 0x21); /* master does Auto EOI */
362 else
363 outb_p(0x01, 0x21); /* master expects normal EOI */
364
365 outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */
366 outb_p(0x20 + 8, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
367 outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */
368 outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode
369 is to be investigated) */
370
371 if (auto_eoi)
372 /*
373 * in AEOI mode we just have to mask the interrupt
374 * when acking.
375 */
376 i8259A_irq_type.ack = disable_8259A_irq;
377 else
378 i8259A_irq_type.ack = mask_and_ack_8259A;
379
380 udelay(100); /* wait for 8259A to initialize */
381
382 outb(cached_21, 0x21); /* restore master IRQ mask */
383 outb(cached_A1, 0xA1); /* restore slave IRQ mask */
384
385 spin_unlock_irqrestore(&i8259A_lock, flags);
386}
387
388static char irq_trigger[2];
389/*
390 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
391 */
392static void restore_ELCR(char *trigger)
393{
394 outb(trigger[0], 0x4d0);
395 outb(trigger[1], 0x4d1);
396}
397
398static void save_ELCR(char *trigger)
399{
400 /* IRQ 0,1,2,8,13 are marked as reserved */
401 trigger[0] = inb(0x4d0) & 0xF8;
402 trigger[1] = inb(0x4d1) & 0xDE;
403}
404
405static int i8259A_resume(struct sys_device *dev)
406{
407 init_8259A(0);
408 restore_ELCR(irq_trigger);
409 return 0;
410}
411
412static int i8259A_suspend(struct sys_device *dev, u32 state)
413{
414 save_ELCR(irq_trigger);
415 return 0;
416}
417
418static struct sysdev_class i8259_sysdev_class = {
419 set_kset_name("i8259"),
420 .suspend = i8259A_suspend,
421 .resume = i8259A_resume,
422};
423
424static struct sys_device device_i8259A = {
425 .id = 0,
426 .cls = &i8259_sysdev_class,
427};
428
429static int __init i8259A_init_sysfs(void)
430{
431 int error = sysdev_class_register(&i8259_sysdev_class);
432 if (!error)
433 error = sysdev_register(&device_i8259A);
434 return error;
435}
436
437device_initcall(i8259A_init_sysfs);
438
439/*
440 * IRQ2 is cascade interrupt to second interrupt controller
441 */
442
443static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
444
445void __init init_ISA_irqs (void)
446{
447 int i;
448
449#ifdef CONFIG_X86_LOCAL_APIC
450 init_bsp_APIC();
451#endif
452 init_8259A(0);
453
454 for (i = 0; i < NR_IRQS; i++) {
455 irq_desc[i].status = IRQ_DISABLED;
456 irq_desc[i].action = NULL;
457 irq_desc[i].depth = 1;
458
459 if (i < 16) {
460 /*
461 * 16 old-style INTA-cycle interrupts:
462 */
463 irq_desc[i].handler = &i8259A_irq_type;
464 } else {
465 /*
466 * 'high' PCI IRQs filled in on demand
467 */
468 irq_desc[i].handler = &no_irq_type;
469 }
470 }
471}
472
473void apic_timer_interrupt(void);
474void spurious_interrupt(void);
475void error_interrupt(void);
476void reschedule_interrupt(void);
477void call_function_interrupt(void);
478void invalidate_interrupt(void);
479void thermal_interrupt(void);
480void i8254_timer_resume(void);
481
482static void setup_timer(void)
483{
484 outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */
485 udelay(10);
486 outb_p(LATCH & 0xff , 0x40); /* LSB */
487 udelay(10);
488 outb(LATCH >> 8 , 0x40); /* MSB */
489}
490
491static int timer_resume(struct sys_device *dev)
492{
493 setup_timer();
494 return 0;
495}
496
497void i8254_timer_resume(void)
498{
499 setup_timer();
500}
501
502static struct sysdev_class timer_sysclass = {
503 set_kset_name("timer"),
504 .resume = timer_resume,
505};
506
507static struct sys_device device_timer = {
508 .id = 0,
509 .cls = &timer_sysclass,
510};
511
512static int __init init_timer_sysfs(void)
513{
514 int error = sysdev_class_register(&timer_sysclass);
515 if (!error)
516 error = sysdev_register(&device_timer);
517 return error;
518}
519
520device_initcall(init_timer_sysfs);
521
522void __init init_IRQ(void)
523{
524 int i;
525
526 init_ISA_irqs();
527 /*
528 * Cover the whole vector space, no vector can escape
529 * us. (some of these will be overridden and become
530 * 'special' SMP interrupts)
531 */
532 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
533 int vector = FIRST_EXTERNAL_VECTOR + i;
534 if (i >= NR_IRQS)
535 break;
536 if (vector != IA32_SYSCALL_VECTOR && vector != KDB_VECTOR) {
537 set_intr_gate(vector, interrupt[i]);
538 }
539 }
540
541#ifdef CONFIG_SMP
542 /*
543 * IRQ0 must be given a fixed assignment and initialized,
544 * because it's used before the IO-APIC is set up.
545 */
546 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
547
548 /*
549 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
550 * IPI, driven by wakeup.
551 */
552 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
553
554 /* IPI for invalidation */
555 set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
556
557 /* IPI for generic function call */
558 set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
559#endif
560 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
561
562#ifdef CONFIG_X86_LOCAL_APIC
563 /* self generated IPI for local APIC timer */
564 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
565
566 /* IPI vectors for APIC spurious and error interrupts */
567 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
568 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
569#endif
570
571 /*
572 * Set the clock to HZ Hz, we already have a valid
573 * vector now:
574 */
575 setup_timer();
576
577 if (!acpi_ioapic)
578 setup_irq(2, &irq2);
579}
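
The specific-EOI traffic in mask_and_ack_8259A() above follows the 8259A's OCW2 encoding, 0x60 | level: for a slave IRQ (8-15) the slave PIC gets 0x60 | (irq & 7) on port 0xA0, and the master additionally gets 0x62, the specific EOI for the cascade input IR2. A stand-alone sketch printing the exact byte/port pairs:

	#include <stdio.h>

	int main(void)
	{
		for (int irq = 0; irq < 16; irq++) {
			if (irq & 8)	/* slave IRQ: EOI slave, then master IR2 */
				printf("IRQ%-2d: outb(0x%02x, 0xA0); outb(0x62, 0x20)\n",
				       irq, 0x60 + (irq & 7));
			else		/* master IRQ: single specific EOI */
				printf("IRQ%-2d: outb(0x%02x, 0x20)\n", irq, 0x60 + irq);
		}
		return 0;
	}
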
diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c
new file mode 100644
index 000000000000..c4dc91491157
--- /dev/null
+++ b/arch/x86_64/kernel/init_task.c
@@ -0,0 +1,49 @@
1#include <linux/mm.h>
2#include <linux/module.h>
3#include <linux/sched.h>
4#include <linux/init.h>
5#include <linux/init_task.h>
6#include <linux/fs.h>
7#include <linux/mqueue.h>
8
9#include <asm/uaccess.h>
10#include <asm/pgtable.h>
11#include <asm/desc.h>
12
13static struct fs_struct init_fs = INIT_FS;
14static struct files_struct init_files = INIT_FILES;
15static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
17struct mm_struct init_mm = INIT_MM(init_mm);
18
19EXPORT_SYMBOL(init_mm);
20
21/*
22 * Initial task structure.
23 *
24 * We need to make sure that this is 8192-byte aligned due to the
25 * way process stacks are handled. This is done by having a special
26 * "init_task" linker map entry.
27 */
28union thread_union init_thread_union
29 __attribute__((__section__(".data.init_task"))) =
30 { INIT_THREAD_INFO(init_task) };
31
32/*
33 * Initial task structure.
34 *
35 * All other task structs will be allocated on slabs in fork.c
36 */
37struct task_struct init_task = INIT_TASK(init_task);
38
39EXPORT_SYMBOL(init_task);
40/*
41 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
42 * no more per-task TSS's. The TSS size is kept cacheline-aligned
43 * so they are allowed to end up in the .data.cacheline_aligned
44 * section. Since TSS's are completely CPU-local, we want them
45 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
46 */
47DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp;
48
49#define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
new file mode 100644
index 000000000000..adebe70c180e
--- /dev/null
+++ b/arch/x86_64/kernel/io_apic.c
@@ -0,0 +1,1982 @@
1/*
2 * Intel IO-APIC support for multi-Pentium hosts.
3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
5 *
6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently!
8 *
9 * (c) 1999, Multiple IO-APIC support, developed by
10 * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
11 * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
12 * further tested and cleaned up by Zach Brown <zab@redhat.com>
13 * and Ingo Molnar <mingo@redhat.com>
14 *
15 * Fixes
16 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
17 * thanks to Eric Gilmore
18 * and Rolf G. Tews
19 * for testing these extensively
20 * Paul Diefenbaugh : Added full ACPI support
21 */
22
23#include <linux/mm.h>
24#include <linux/irq.h>
25#include <linux/interrupt.h>
26#include <linux/init.h>
27#include <linux/delay.h>
28#include <linux/sched.h>
29#include <linux/config.h>
30#include <linux/smp_lock.h>
31#include <linux/mc146818rtc.h>
32#include <linux/acpi.h>
33#include <linux/sysdev.h>
34
35#include <asm/io.h>
36#include <asm/smp.h>
37#include <asm/desc.h>
38#include <asm/proto.h>
39#include <asm/mach_apic.h>
40
41#define __apicdebuginit __init
42
43int sis_apic_bug; /* not actually supported, dummy for compile */
44
45static DEFINE_SPINLOCK(ioapic_lock);
46
47/*
48 * # of IRQ routing registers
49 */
50int nr_ioapic_registers[MAX_IO_APICS];
51
52/*
53 * Rough estimation of how many shared IRQs there are, can
54 * be changed anytime.
55 */
56#define MAX_PLUS_SHARED_IRQS NR_IRQS
57#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
58
59/*
60 * This is performance-critical, we want to do it O(1)
61 *
62 * the indexing order of this array favors 1:1 mappings
63 * between pins and IRQs.
64 */
65
66static struct irq_pin_list {
67 short apic, pin, next;
68} irq_2_pin[PIN_MAP_SIZE];
69
70int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
71#ifdef CONFIG_PCI_MSI
72#define vector_to_irq(vector) \
73 (platform_legacy_irq(vector) ? vector : vector_irq[vector])
74#else
75#define vector_to_irq(vector) (vector)
76#endif
77
78/*
79 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
80 * shared ISA-space IRQs, so we have to support them. We are super
81 * fast in the common case, and fast for shared ISA-space IRQs.
82 */
83static void add_pin_to_irq(unsigned int irq, int apic, int pin)
84{
85 static int first_free_entry = NR_IRQS;
86 struct irq_pin_list *entry = irq_2_pin + irq;
87
88 while (entry->next)
89 entry = irq_2_pin + entry->next;
90
91 if (entry->pin != -1) {
92 entry->next = first_free_entry;
93 entry = irq_2_pin + entry->next;
94 if (++first_free_entry >= PIN_MAP_SIZE)
95 panic("io_apic.c: whoops");
96 }
97 entry->apic = apic;
98 entry->pin = pin;
99}
100
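
/*
 * Sketch of the irq_2_pin chaining implemented by add_pin_to_irq()
 * above: entries 0..NR_IRQS-1 are per-IRQ list heads, and shared IRQs
 * chain further (apic, pin) pairs through 'next' indices into the
 * spare half of the array. Stand-alone illustration with tiny sizes;
 * the real code panics when the map overflows, a check omitted here:
 */
#include <stdio.h>

#define SK_NR_IRQS	4
#define SK_PIN_MAP	(2 * SK_NR_IRQS)

static struct { short apic, pin, next; } sk_map[SK_PIN_MAP];
static int sk_first_free = SK_NR_IRQS;

static void sk_add_pin(int irq, short apic, short pin)
{
	int e = irq;

	while (sk_map[e].next)			/* walk to the end of the chain */
		e = sk_map[e].next;
	if (sk_map[e].pin != -1) {		/* head taken: append a node */
		sk_map[e].next = sk_first_free++;
		e = sk_map[e].next;
	}
	sk_map[e].apic = apic;
	sk_map[e].pin = pin;
}

int main(void)
{
	for (int i = 0; i < SK_PIN_MAP; i++)
		sk_map[i].pin = -1;
	sk_add_pin(1, 0, 1);	/* IRQ1 <- ioapic 0, pin 1 */
	sk_add_pin(1, 1, 3);	/* same IRQ shared with ioapic 1, pin 3 */
	for (int e = 1; ; e = sk_map[e].next) {
		printf("irq1 -> apic %d pin %d\n", sk_map[e].apic, sk_map[e].pin);
		if (!sk_map[e].next)
			break;
	}
	return 0;
}
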
101#define __DO_ACTION(R, ACTION, FINAL) \
102 \
103{ \
104 int pin; \
105 struct irq_pin_list *entry = irq_2_pin + irq; \
106 \
107 for (;;) { \
108 unsigned int reg; \
109 pin = entry->pin; \
110 if (pin == -1) \
111 break; \
112 reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
113 reg ACTION; \
114 io_apic_modify(entry->apic, reg); \
115 if (!entry->next) \
116 break; \
117 entry = irq_2_pin + entry->next; \
118 } \
119 FINAL; \
120}
121
122#define DO_ACTION(name,R,ACTION, FINAL) \
123 \
124 static void name##_IO_APIC_irq (unsigned int irq) \
125 __DO_ACTION(R, ACTION, FINAL)
126
127DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
128 /* mask = 1 */
129DO_ACTION( __unmask, 0, &= 0xfffeffff, )
130 /* mask = 0 */
131
132static void mask_IO_APIC_irq (unsigned int irq)
133{
134 unsigned long flags;
135
136 spin_lock_irqsave(&ioapic_lock, flags);
137 __mask_IO_APIC_irq(irq);
138 spin_unlock_irqrestore(&ioapic_lock, flags);
139}
140
141static void unmask_IO_APIC_irq (unsigned int irq)
142{
143 unsigned long flags;
144
145 spin_lock_irqsave(&ioapic_lock, flags);
146 __unmask_IO_APIC_irq(irq);
147 spin_unlock_irqrestore(&ioapic_lock, flags);
148}
149
150static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
151{
152 struct IO_APIC_route_entry entry;
153 unsigned long flags;
154
155 /* Check delivery_mode to be sure we're not clearing an SMI pin */
156 spin_lock_irqsave(&ioapic_lock, flags);
157 *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
158 *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
159 spin_unlock_irqrestore(&ioapic_lock, flags);
160 if (entry.delivery_mode == dest_SMI)
161 return;
162 /*
163 * Disable it in the IO-APIC irq-routing table:
164 */
165 memset(&entry, 0, sizeof(entry));
166 entry.mask = 1;
167 spin_lock_irqsave(&ioapic_lock, flags);
168 io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
169 io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
170 spin_unlock_irqrestore(&ioapic_lock, flags);
171}
172
173static void clear_IO_APIC (void)
174{
175 int apic, pin;
176
177 for (apic = 0; apic < nr_ioapics; apic++)
178 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
179 clear_IO_APIC_pin(apic, pin);
180}
181
182/*
183 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
184 * specific CPU-side IRQs.
185 */
186
187#define MAX_PIRQS 8
188static int pirq_entries [MAX_PIRQS];
189static int pirqs_enabled;
190int skip_ioapic_setup;
191int ioapic_force;
192
193/* dummy parsing: see setup.c */
194
195static int __init disable_ioapic_setup(char *str)
196{
197 skip_ioapic_setup = 1;
198 return 1;
199}
200
201static int __init enable_ioapic_setup(char *str)
202{
203 ioapic_force = 1;
204 skip_ioapic_setup = 0;
205 return 1;
206}
207
208__setup("noapic", disable_ioapic_setup);
209__setup("apic", enable_ioapic_setup);
210
211#include <asm/pci-direct.h>
212#include <linux/pci_ids.h>
213#include <linux/pci.h>
214
215/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
216 off. Check for an Nvidia or VIA PCI bridge and turn it off.
217 Use pci direct infrastructure because this runs before the PCI subsystem.
218
219 Can be overridden with "apic".
220
221 And another hack to disable the IOMMU on VIA chipsets.
222
223 Kludge-O-Rama. */
224void __init check_ioapic(void)
225{
226 int num,slot,func;
227 if (ioapic_force)
228 return;
229
230 /* Poor man's PCI discovery */
231 for (num = 0; num < 32; num++) {
232 for (slot = 0; slot < 32; slot++) {
233 for (func = 0; func < 8; func++) {
234 u32 class;
235 u32 vendor;
236 u8 type;
237 class = read_pci_config(num,slot,func,
238 PCI_CLASS_REVISION);
239 if (class == 0xffffffff)
240 break;
241
242 if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
243 continue;
244
245 vendor = read_pci_config(num, slot, func,
246 PCI_VENDOR_ID);
247 vendor &= 0xffff;
248 switch (vendor) {
249 case PCI_VENDOR_ID_VIA:
250#ifdef CONFIG_GART_IOMMU
251 if ((end_pfn >= (0xffffffff>>PAGE_SHIFT) ||
252 force_iommu) &&
253 !iommu_aperture_allowed) {
254 printk(KERN_INFO
255 "Looks like a VIA chipset. Disabling IOMMU. Overwrite with \"iommu=allowed\"\n");
256 iommu_aperture_disabled = 1;
257 }
258#endif
259 return;
260 case PCI_VENDOR_ID_NVIDIA:
261#ifdef CONFIG_ACPI
262 /* All timer overrides on Nvidia
263 seem to be wrong. Skip them. */
264 acpi_skip_timer_override = 1;
265 printk(KERN_INFO
266 "Nvidia board detected. Ignoring ACPI timer override.\n");
267#endif
268 /* RED-PEN skip them on mptables too? */
269 return;
270 }
271
272 /* No multi-function device? */
273 type = read_pci_config_byte(num,slot,func,
274 PCI_HEADER_TYPE);
275 if (!(type & 0x80))
276 break;
277 }
278 }
279 }
280}
281
282static int __init ioapic_pirq_setup(char *str)
283{
284 int i, max;
285 int ints[MAX_PIRQS+1];
286
287 get_options(str, ARRAY_SIZE(ints), ints);
288
289 for (i = 0; i < MAX_PIRQS; i++)
290 pirq_entries[i] = -1;
291
292 pirqs_enabled = 1;
293 apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
294 max = MAX_PIRQS;
295 if (ints[0] < MAX_PIRQS)
296 max = ints[0];
297
298 for (i = 0; i < max; i++) {
299 apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
300 /*
301 * PIRQs are mapped upside down, usually.
302 */
303 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
304 }
305 return 1;
306}
307
308__setup("pirq=", ioapic_pirq_setup);
309
310/*
311 * Find the IRQ entry number of a certain pin.
312 */
313static int find_irq_entry(int apic, int pin, int type)
314{
315 int i;
316
317 for (i = 0; i < mp_irq_entries; i++)
318 if (mp_irqs[i].mpc_irqtype == type &&
319 (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
320 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
321 mp_irqs[i].mpc_dstirq == pin)
322 return i;
323
324 return -1;
325}
326
327/*
328 * Find the pin to which IRQ[irq] (ISA) is connected
329 */
330static int __init find_isa_irq_pin(int irq, int type)
331{
332 int i;
333
334 for (i = 0; i < mp_irq_entries; i++) {
335 int lbus = mp_irqs[i].mpc_srcbus;
336
337 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
338 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
339 mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
340 (mp_irqs[i].mpc_irqtype == type) &&
341 (mp_irqs[i].mpc_srcbusirq == irq))
342
343 return mp_irqs[i].mpc_dstirq;
344 }
345 return -1;
346}
347
348/*
349 * Find a specific PCI IRQ entry.
350 * Not an __init, possibly needed by modules
351 */
352static int pin_2_irq(int idx, int apic, int pin);
353
354int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
355{
356 int apic, i, best_guess = -1;
357
358 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
359 bus, slot, pin);
360 if (mp_bus_id_to_pci_bus[bus] == -1) {
361 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
362 return -1;
363 }
364 for (i = 0; i < mp_irq_entries; i++) {
365 int lbus = mp_irqs[i].mpc_srcbus;
366
367 for (apic = 0; apic < nr_ioapics; apic++)
368 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
369 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
370 break;
371
372 if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
373 !mp_irqs[i].mpc_irqtype &&
374 (bus == lbus) &&
375 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
376 int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
377
378 if (!(apic || IO_APIC_IRQ(irq)))
379 continue;
380
381 if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
382 return irq;
383 /*
384 * Use the first all-but-pin matching entry as a
385 * best-guess fuzzy result for broken mptables.
386 */
387 if (best_guess < 0)
388 best_guess = irq;
389 }
390 }
391 return best_guess;
392}
393
394/*
395 * EISA Edge/Level control register, ELCR
396 */
397static int EISA_ELCR(unsigned int irq)
398{
399 if (irq < 16) {
400 unsigned int port = 0x4d0 + (irq >> 3);
401 return (inb(port) >> (irq & 7)) & 1;
402 }
403 apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
404 return 0;
405}
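/*
 * Worked example: for ISA IRQ 9 the ELCR bit lives in the second
 * register, port 0x4d0 + (9 >> 3) == 0x4d1, at bit 9 & 7 == 1 of that
 * byte: 1 means level triggered, 0 means edge triggered.
 */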
406
407/* EISA interrupts are always polarity zero and can be edge or level
408 * trigger depending on the ELCR value. If an interrupt is listed as
409 * EISA conforming in the MP table, that means its trigger type must
410 * be read in from the ELCR */
411
412#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
413#define default_EISA_polarity(idx) (0)
414
415/* ISA interrupts are always polarity zero edge triggered,
416 * when listed as conforming in the MP table. */
417
418#define default_ISA_trigger(idx) (0)
419#define default_ISA_polarity(idx) (0)
420
421/* PCI interrupts are always polarity one level triggered,
422 * when listed as conforming in the MP table. */
423
424#define default_PCI_trigger(idx) (1)
425#define default_PCI_polarity(idx) (1)
426
427/* MCA interrupts are always polarity zero level triggered,
428 * when listed as conforming in the MP table. */
429
430#define default_MCA_trigger(idx) (1)
431#define default_MCA_polarity(idx) (0)
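/* Summary of the conforming defaults defined above:
 *
 *	bus	trigger			polarity
 *	ISA	0 (edge)		0 (active high)
 *	EISA	ELCR (per IRQ)		0 (active high)
 *	PCI	1 (level)		1 (active low)
 *	MCA	1 (level)		0 (active high)
 */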
432
433static int __init MPBIOS_polarity(int idx)
434{
435 int bus = mp_irqs[idx].mpc_srcbus;
436 int polarity;
437
438 /*
439 * Determine IRQ line polarity (high active or low active):
440 */
441 switch (mp_irqs[idx].mpc_irqflag & 3)
442 {
443 case 0: /* conforms, i.e. bus-type dependent polarity */
444 {
445 switch (mp_bus_id_to_type[bus])
446 {
447 case MP_BUS_ISA: /* ISA pin */
448 {
449 polarity = default_ISA_polarity(idx);
450 break;
451 }
452 case MP_BUS_EISA: /* EISA pin */
453 {
454 polarity = default_EISA_polarity(idx);
455 break;
456 }
457 case MP_BUS_PCI: /* PCI pin */
458 {
459 polarity = default_PCI_polarity(idx);
460 break;
461 }
462 case MP_BUS_MCA: /* MCA pin */
463 {
464 polarity = default_MCA_polarity(idx);
465 break;
466 }
467 default:
468 {
469 printk(KERN_WARNING "broken BIOS!!\n");
470 polarity = 1;
471 break;
472 }
473 }
474 break;
475 }
476 case 1: /* high active */
477 {
478 polarity = 0;
479 break;
480 }
481 case 2: /* reserved */
482 {
483 printk(KERN_WARNING "broken BIOS!!\n");
484 polarity = 1;
485 break;
486 }
487 case 3: /* low active */
488 {
489 polarity = 1;
490 break;
491 }
492 default: /* invalid */
493 {
494 printk(KERN_WARNING "broken BIOS!!\n");
495 polarity = 1;
496 break;
497 }
498 }
499 return polarity;
500}
501
502static int MPBIOS_trigger(int idx)
503{
504 int bus = mp_irqs[idx].mpc_srcbus;
505 int trigger;
506
507 /*
508 * Determine IRQ trigger mode (edge or level sensitive):
509 */
510 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
511 {
512 case 0: /* conforms, i.e. bus-type dependent */
513 {
514 switch (mp_bus_id_to_type[bus])
515 {
516 case MP_BUS_ISA: /* ISA pin */
517 {
518 trigger = default_ISA_trigger(idx);
519 break;
520 }
521 case MP_BUS_EISA: /* EISA pin */
522 {
523 trigger = default_EISA_trigger(idx);
524 break;
525 }
526 case MP_BUS_PCI: /* PCI pin */
527 {
528 trigger = default_PCI_trigger(idx);
529 break;
530 }
531 case MP_BUS_MCA: /* MCA pin */
532 {
533 trigger = default_MCA_trigger(idx);
534 break;
535 }
536 default:
537 {
538 printk(KERN_WARNING "broken BIOS!!\n");
539 trigger = 1;
540 break;
541 }
542 }
543 break;
544 }
545 case 1: /* edge */
546 {
547 trigger = 0;
548 break;
549 }
550 case 2: /* reserved */
551 {
552 printk(KERN_WARNING "broken BIOS!!\n");
553 trigger = 1;
554 break;
555 }
556 case 3: /* level */
557 {
558 trigger = 1;
559 break;
560 }
561 default: /* invalid */
562 {
563 printk(KERN_WARNING "broken BIOS!!\n");
564 trigger = 0;
565 break;
566 }
567 }
568 return trigger;
569}
570
571static inline int irq_polarity(int idx)
572{
573 return MPBIOS_polarity(idx);
574}
575
576static inline int irq_trigger(int idx)
577{
578 return MPBIOS_trigger(idx);
579}
580
581static int pin_2_irq(int idx, int apic, int pin)
582{
583 int irq, i;
584 int bus = mp_irqs[idx].mpc_srcbus;
585
586 /*
587 * Debugging check, we are in big trouble if this message pops up!
588 */
589 if (mp_irqs[idx].mpc_dstirq != pin)
590 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
591
592 switch (mp_bus_id_to_type[bus])
593 {
594 case MP_BUS_ISA: /* ISA pin */
595 case MP_BUS_EISA:
596 case MP_BUS_MCA:
597 {
598 irq = mp_irqs[idx].mpc_srcbusirq;
599 break;
600 }
601 case MP_BUS_PCI: /* PCI pin */
602 {
603 /*
604 * PCI IRQs are mapped in order
605 */
606 i = irq = 0;
607 while (i < apic)
608 irq += nr_ioapic_registers[i++];
609 irq += pin;
610 break;
611 }
612 default:
613 {
614 printk(KERN_ERR "unknown bus type %d.\n",bus);
615 irq = 0;
616 break;
617 }
618 }
619
620 /*
621 * PCI IRQ command line redirection. Yes, limits are hardcoded.
622 */
623 if ((pin >= 16) && (pin <= 23)) {
624 if (pirq_entries[pin-16] != -1) {
625 if (!pirq_entries[pin-16]) {
626 apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
627 } else {
628 irq = pirq_entries[pin-16];
629 apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
630 pin-16, irq);
631 }
632 }
633 }
634 return irq;
635}
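/*
 * Worked example: with two IO-APICs of 24 pins each
 * (nr_ioapic_registers[] == {24, 24}), a PCI interrupt arriving on
 * apic 1, pin 3 yields irq = 24 + 3 = 27 -- pins are simply numbered
 * consecutively across all IO-APICs.
 */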
636
637static inline int IO_APIC_irq_trigger(int irq)
638{
639 int apic, idx, pin;
640
641 for (apic = 0; apic < nr_ioapics; apic++) {
642 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
643 idx = find_irq_entry(apic,pin,mp_INT);
644 if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
645 return irq_trigger(idx);
646 }
647 }
648 /*
649 * nonexistent IRQs are edge default
650 */
651 return 0;
652}
653
654/* irq_vector is indexed by the sum of all RTEs in all I/O APICs. */
655u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR, 0 };
656
657int assign_irq_vector(int irq)
658{
659 static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
660
661 BUG_ON(irq >= NR_IRQ_VECTORS);
662 if (IO_APIC_VECTOR(irq) > 0)
663 return IO_APIC_VECTOR(irq);
664next:
665 current_vector += 8;
666 if (current_vector == IA32_SYSCALL_VECTOR)
667 goto next;
668
669 if (current_vector >= FIRST_SYSTEM_VECTOR) {
670 offset++;
671 if (!(offset%8))
672 return -ENOSPC;
673 current_vector = FIRST_DEVICE_VECTOR + offset;
674 }
675
676 vector_irq[current_vector] = irq;
677 if (irq != AUTO_ASSIGN)
678 IO_APIC_VECTOR(irq) = current_vector;
679
680 return current_vector;
681}
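/*
 * Example allocation pattern (assuming FIRST_DEVICE_VECTOR == 0x31, as
 * on contemporary kernels): successive calls hand out 0x39, 0x41, 0x49,
 * ... -- stepping by 8 spreads consecutive IRQs across APIC priority
 * classes, since the priority class is vector >> 4. Once
 * FIRST_SYSTEM_VECTOR is reached, allocation restarts at
 * FIRST_DEVICE_VECTOR + offset to fill in the remaining gaps.
 */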
682
683extern void (*interrupt[NR_IRQS])(void);
684static struct hw_interrupt_type ioapic_level_type;
685static struct hw_interrupt_type ioapic_edge_type;
686
687#define IOAPIC_AUTO -1
688#define IOAPIC_EDGE 0
689#define IOAPIC_LEVEL 1
690
691static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
692{
693 if (use_pci_vector() && !platform_legacy_irq(irq)) {
694 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
695 trigger == IOAPIC_LEVEL)
696 irq_desc[vector].handler = &ioapic_level_type;
697 else
698 irq_desc[vector].handler = &ioapic_edge_type;
699 set_intr_gate(vector, interrupt[vector]);
700 } else {
701 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
702 trigger == IOAPIC_LEVEL)
703 irq_desc[irq].handler = &ioapic_level_type;
704 else
705 irq_desc[irq].handler = &ioapic_edge_type;
706 set_intr_gate(vector, interrupt[irq]);
707 }
708}
709
710static void __init setup_IO_APIC_irqs(void)
711{
712 struct IO_APIC_route_entry entry;
713 int apic, pin, idx, irq, first_notcon = 1, vector;
714 unsigned long flags;
715
716 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
717
718 for (apic = 0; apic < nr_ioapics; apic++) {
719 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
720
721 /*
722 * add it to the IO-APIC irq-routing table:
723 */
724 memset(&entry,0,sizeof(entry));
725
726 entry.delivery_mode = INT_DELIVERY_MODE;
727 entry.dest_mode = INT_DEST_MODE;
728 entry.mask = 0; /* enable IRQ */
729 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
730
731 idx = find_irq_entry(apic,pin,mp_INT);
732 if (idx == -1) {
733 if (first_notcon) {
734 apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
735 first_notcon = 0;
736 } else
737 apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
738 continue;
739 }
740
741 entry.trigger = irq_trigger(idx);
742 entry.polarity = irq_polarity(idx);
743
744 if (irq_trigger(idx)) {
745 entry.trigger = 1;
746 entry.mask = 1;
747 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
748 }
749
750 irq = pin_2_irq(idx, apic, pin);
751 add_pin_to_irq(irq, apic, pin);
752
753 if (!apic && !IO_APIC_IRQ(irq))
754 continue;
755
756 if (IO_APIC_IRQ(irq)) {
757 vector = assign_irq_vector(irq);
758 entry.vector = vector;
759
760 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
761 if (!apic && (irq < 16))
762 disable_8259A_irq(irq);
763 }
764 spin_lock_irqsave(&ioapic_lock, flags);
765 io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
766 io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
767 spin_unlock_irqrestore(&ioapic_lock, flags);
768 }
769 }
770
771 if (!first_notcon)
772 apic_printk(APIC_VERBOSE," not connected.\n");
773}
774
775/*
776 * Set up the 8259A-master output pin as broadcast to all
777 * CPUs.
778 */
779static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
780{
781 struct IO_APIC_route_entry entry;
782 unsigned long flags;
783
784 memset(&entry,0,sizeof(entry));
785
786 disable_8259A_irq(0);
787
788 /* mask LVT0 */
789 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
790
791 /*
792 * We use logical delivery to get the timer IRQ
793 * to the first CPU.
794 */
795 entry.dest_mode = INT_DEST_MODE;
796 entry.mask = 0; /* unmask IRQ now */
797 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
798 entry.delivery_mode = INT_DELIVERY_MODE;
799 entry.polarity = 0;
800 entry.trigger = 0;
801 entry.vector = vector;
802
803 /*
804 * The timer IRQ doesn't have to know that behind the
805 * scenes we have an 8259A master in AEOI mode ...
806 */
807 irq_desc[0].handler = &ioapic_edge_type;
808
809 /*
810 * Add it to the IO-APIC irq-routing table:
811 */
812 spin_lock_irqsave(&ioapic_lock, flags);
813 io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
814 io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
815 spin_unlock_irqrestore(&ioapic_lock, flags);
816
817 enable_8259A_irq(0);
818}
819
820void __init UNEXPECTED_IO_APIC(void)
821{
822}
823
824void __apicdebuginit print_IO_APIC(void)
825{
826 int apic, i;
827 union IO_APIC_reg_00 reg_00;
828 union IO_APIC_reg_01 reg_01;
829 union IO_APIC_reg_02 reg_02;
830 unsigned long flags;
831
832 if (apic_verbosity == APIC_QUIET)
833 return;
834
835 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
836 for (i = 0; i < nr_ioapics; i++)
837 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
838 mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
839
840 /*
841 * We are a bit conservative about what we expect. We have to
842 * know about every hardware change ASAP.
843 */
844 printk(KERN_INFO "testing the IO APIC.......................\n");
845
846 for (apic = 0; apic < nr_ioapics; apic++) {
847
848 spin_lock_irqsave(&ioapic_lock, flags);
849 reg_00.raw = io_apic_read(apic, 0);
850 reg_01.raw = io_apic_read(apic, 1);
851 if (reg_01.bits.version >= 0x10)
852 reg_02.raw = io_apic_read(apic, 2);
853 spin_unlock_irqrestore(&ioapic_lock, flags);
854
855 printk("\n");
856 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
857 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
858 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
859 if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
860 UNEXPECTED_IO_APIC();
861
862 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
863 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
864 if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
865 (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
866 (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
867 (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
868 (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
869 (reg_01.bits.entries != 0x2E) &&
870 (reg_01.bits.entries != 0x3F) &&
871 (reg_01.bits.entries != 0x03)
872 )
873 UNEXPECTED_IO_APIC();
874
875 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
876 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
877 if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
878 (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
879 (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
880 (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
881 (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
882 (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
883 )
884 UNEXPECTED_IO_APIC();
885 if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
886 UNEXPECTED_IO_APIC();
887
888 if (reg_01.bits.version >= 0x10) {
889 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
890 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
891 if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
892 UNEXPECTED_IO_APIC();
893 }
894
895 printk(KERN_DEBUG ".... IRQ redirection table:\n");
896
897 printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
898 " Stat Dest Deli Vect: \n");
899
900 for (i = 0; i <= reg_01.bits.entries; i++) {
901 struct IO_APIC_route_entry entry;
902
903 spin_lock_irqsave(&ioapic_lock, flags);
904 *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
905 *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
906 spin_unlock_irqrestore(&ioapic_lock, flags);
907
908 printk(KERN_DEBUG " %02x %03X %02X ",
909 i,
910 entry.dest.logical.logical_dest,
911 entry.dest.physical.physical_dest
912 );
913
914 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
915 entry.mask,
916 entry.trigger,
917 entry.irr,
918 entry.polarity,
919 entry.delivery_status,
920 entry.dest_mode,
921 entry.delivery_mode,
922 entry.vector
923 );
924 }
925 }
926 if (use_pci_vector())
927 printk(KERN_INFO "Using vector-based indexing\n");
928 printk(KERN_DEBUG "IRQ to pin mappings:\n");
929 for (i = 0; i < NR_IRQS; i++) {
930 struct irq_pin_list *entry = irq_2_pin + i;
931 if (entry->pin < 0)
932 continue;
933 if (use_pci_vector() && !platform_legacy_irq(i))
934 printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
935 else
936 printk(KERN_DEBUG "IRQ%d ", i);
937 for (;;) {
938 printk("-> %d:%d", entry->apic, entry->pin);
939 if (!entry->next)
940 break;
941 entry = irq_2_pin + entry->next;
942 }
943 printk("\n");
944 }
945
946 printk(KERN_INFO ".................................... done.\n");
947
948 return;
949}
950
951#if 0
952
953static __apicdebuginit void print_APIC_bitfield (int base)
954{
955 unsigned int v;
956 int i, j;
957
958 if (apic_verbosity == APIC_QUIET)
959 return;
960
961 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
962 for (i = 0; i < 8; i++) {
963 v = apic_read(base + i*0x10);
964 for (j = 0; j < 32; j++) {
965 if (v & (1<<j))
966 printk("1");
967 else
968 printk("0");
969 }
970 printk("\n");
971 }
972}
973
974void __apicdebuginit print_local_APIC(void * dummy)
975{
976 unsigned int v, ver, maxlvt;
977
978 if (apic_verbosity == APIC_QUIET)
979 return;
980
981 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
982 smp_processor_id(), hard_smp_processor_id());
983 v = apic_read(APIC_ID);
984 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
985 v = apic_read(APIC_LVR);
986 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
987 ver = GET_APIC_VERSION(v);
988 maxlvt = get_maxlvt();
989
990 v = apic_read(APIC_TASKPRI);
991 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
992
993 if (APIC_INTEGRATED(ver)) { /* !82489DX */
994 v = apic_read(APIC_ARBPRI);
995 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
996 v & APIC_ARBPRI_MASK);
997 v = apic_read(APIC_PROCPRI);
998 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
999 }
1000
1001 v = apic_read(APIC_EOI);
1002 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1003 v = apic_read(APIC_RRR);
1004 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1005 v = apic_read(APIC_LDR);
1006 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1007 v = apic_read(APIC_DFR);
1008 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1009 v = apic_read(APIC_SPIV);
1010 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1011
1012 printk(KERN_DEBUG "... APIC ISR field:\n");
1013 print_APIC_bitfield(APIC_ISR);
1014 printk(KERN_DEBUG "... APIC TMR field:\n");
1015 print_APIC_bitfield(APIC_TMR);
1016 printk(KERN_DEBUG "... APIC IRR field:\n");
1017 print_APIC_bitfield(APIC_IRR);
1018
1019 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1020 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1021 apic_write(APIC_ESR, 0);
1022 v = apic_read(APIC_ESR);
1023 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1024 }
1025
1026 v = apic_read(APIC_ICR);
1027 printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
1028 v = apic_read(APIC_ICR2);
1029 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1030
1031 v = apic_read(APIC_LVTT);
1032 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
1033
1034 if (maxlvt > 3) { /* PC is LVT#4. */
1035 v = apic_read(APIC_LVTPC);
1036 printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
1037 }
1038 v = apic_read(APIC_LVT0);
1039 printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
1040 v = apic_read(APIC_LVT1);
1041 printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
1042
1043 if (maxlvt > 2) { /* ERR is LVT#3. */
1044 v = apic_read(APIC_LVTERR);
1045 printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
1046 }
1047
1048 v = apic_read(APIC_TMICT);
1049 printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
1050 v = apic_read(APIC_TMCCT);
1051 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1052 v = apic_read(APIC_TDCR);
1053 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1054 printk("\n");
1055}
1056
1057void print_all_local_APICs (void)
1058{
1059 on_each_cpu(print_local_APIC, NULL, 1, 1);
1060}
1061
1062void __apicdebuginit print_PIC(void)
1063{
1064 extern spinlock_t i8259A_lock;
1065 unsigned int v;
1066 unsigned long flags;
1067
1068 if (apic_verbosity == APIC_QUIET)
1069 return;
1070
1071 printk(KERN_DEBUG "\nprinting PIC contents\n");
1072
1073 spin_lock_irqsave(&i8259A_lock, flags);
1074
1075 v = inb(0xa1) << 8 | inb(0x21);
1076 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
1077
1078 v = inb(0xa0) << 8 | inb(0x20);
1079 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
1080
1081 outb(0x0b,0xa0);
1082 outb(0x0b,0x20);
1083 v = inb(0xa0) << 8 | inb(0x20);
1084 outb(0x0a,0xa0);
1085 outb(0x0a,0x20);
1086
1087 spin_unlock_irqrestore(&i8259A_lock, flags);
1088
1089 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1090
1091 v = inb(0x4d1) << 8 | inb(0x4d0);
1092 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1093}
1094
1095#endif /* 0 */
1096
1097static void __init enable_IO_APIC(void)
1098{
1099 union IO_APIC_reg_01 reg_01;
1100 int i;
1101 unsigned long flags;
1102
1103 for (i = 0; i < PIN_MAP_SIZE; i++) {
1104 irq_2_pin[i].pin = -1;
1105 irq_2_pin[i].next = 0;
1106 }
1107 if (!pirqs_enabled)
1108 for (i = 0; i < MAX_PIRQS; i++)
1109 pirq_entries[i] = -1;
1110
1111 /*
1112 * The number of IO-APIC IRQ registers (== #pins):
1113 */
1114 for (i = 0; i < nr_ioapics; i++) {
1115 spin_lock_irqsave(&ioapic_lock, flags);
1116 reg_01.raw = io_apic_read(i, 1);
1117 spin_unlock_irqrestore(&ioapic_lock, flags);
1118 nr_ioapic_registers[i] = reg_01.bits.entries+1;
1119 }
1120
1121 /*
1122 * Do not trust the IO-APIC being empty at bootup
1123 */
1124 clear_IO_APIC();
1125}
1126
1127/*
1128 * Not an __init, needed by the reboot code
1129 */
1130void disable_IO_APIC(void)
1131{
1132 /*
1133 * Clear the IO-APIC before rebooting:
1134 */
1135 clear_IO_APIC();
1136
1137 disconnect_bsp_APIC();
1138}
1139
1140/*
1141 * function to set the IO-APIC physical IDs based on the
1142 * values stored in the MPC table.
1143 *
1144 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1145 */
1146
1147static void __init setup_ioapic_ids_from_mpc (void)
1148{
1149 union IO_APIC_reg_00 reg_00;
1150 int apic;
1151 int i;
1152 unsigned char old_id;
1153 unsigned long flags;
1154
1155 /*
1156 * Set the IOAPIC ID to the value stored in the MPC table.
1157 */
1158 for (apic = 0; apic < nr_ioapics; apic++) {
1159
1160 /* Read the register 0 value */
1161 spin_lock_irqsave(&ioapic_lock, flags);
1162 reg_00.raw = io_apic_read(apic, 0);
1163 spin_unlock_irqrestore(&ioapic_lock, flags);
1164
1165 old_id = mp_ioapics[apic].mpc_apicid;
1166
1167
1168 printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
1169
1170
1171 /*
1172 * We need to adjust the IRQ routing table
1173 * if the ID changed.
1174 */
1175 if (old_id != mp_ioapics[apic].mpc_apicid)
1176 for (i = 0; i < mp_irq_entries; i++)
1177 if (mp_irqs[i].mpc_dstapic == old_id)
1178 mp_irqs[i].mpc_dstapic
1179 = mp_ioapics[apic].mpc_apicid;
1180
1181 /*
1182 * Read the right value from the MPC table and
1183 * write it into the ID register.
1184 */
1185 apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
1186 mp_ioapics[apic].mpc_apicid);
1187
1188 reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
1189 spin_lock_irqsave(&ioapic_lock, flags);
1190 io_apic_write(apic, 0, reg_00.raw);
1191 spin_unlock_irqrestore(&ioapic_lock, flags);
1192
1193 /*
1194 * Sanity check
1195 */
1196 spin_lock_irqsave(&ioapic_lock, flags);
1197 reg_00.raw = io_apic_read(apic, 0);
1198 spin_unlock_irqrestore(&ioapic_lock, flags);
1199 if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
1200 printk("could not set ID!\n");
1201 else
1202 apic_printk(APIC_VERBOSE," ok.\n");
1203 }
1204}
1205
1206/*
1207 * There is a nasty bug in some older SMP boards, their mptable lies
1208 * about the timer IRQ. We do the following to work around the situation:
1209 *
1210 * - timer IRQ defaults to IO-APIC IRQ
1211 * - if this function detects that timer IRQs are defunct, then we fall
1212 * back to ISA timer IRQs
1213 */
1214static int __init timer_irq_works(void)
1215{
1216 unsigned long t1 = jiffies;
1217
1218 local_irq_enable();
1219 /* Let ten ticks pass... */
1220 mdelay((10 * 1000) / HZ);
1221
1222 /*
1223 * Expect a few ticks at least, to be sure some possible
1224 * glue logic does not lock up after one or two first
1225 * ticks in a non-ExtINT mode. Also the local APIC
1226 * might have cached one ExtINT interrupt. Finally, at
1227 * least one tick may be lost due to delays.
1228 */
1229
1230 /* jiffies wrap? */
1231 if (jiffies - t1 > 4)
1232 return 1;
1233 return 0;
1234}
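/*
 * Worked example: with HZ == 1000 the mdelay() above spins for
 * (10 * 1000) / 1000 == 10ms, i.e. roughly ten timer ticks. Demanding
 * only jiffies - t1 > 4 leaves generous slack for ticks lost to the
 * cached-ExtINT and glue-logic effects described above.
 */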
1235
1236/*
1237 * In the SMP+IOAPIC case it might happen that there are an unspecified
1238 * number of pending IRQ events unhandled. These cases are very rare,
1239 * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
1240 * better to do it this way as thus we do not have to be aware of
1241 * 'pending' interrupts in the IRQ path, except at this point.
1242 */
1243/*
1244 * Edge triggered needs to resend any interrupt
1245 * that was delayed but this is now handled in the device
1246 * independent code.
1247 */
1248
1249/*
1250 * Starting up an edge-triggered IO-APIC interrupt is
1251 * nasty - we need to make sure that we get the edge.
1252 * If it is already asserted for some reason, we need to
1253 * return 1 to indicate that it was pending.
1254 *
1255 * This is not complete - we should be able to fake
1256 * an edge even if it isn't on the 8259A...
1257 */
1258
1259static unsigned int startup_edge_ioapic_irq(unsigned int irq)
1260{
1261 int was_pending = 0;
1262 unsigned long flags;
1263
1264 spin_lock_irqsave(&ioapic_lock, flags);
1265 if (irq < 16) {
1266 disable_8259A_irq(irq);
1267 if (i8259A_irq_pending(irq))
1268 was_pending = 1;
1269 }
1270 __unmask_IO_APIC_irq(irq);
1271 spin_unlock_irqrestore(&ioapic_lock, flags);
1272
1273 return was_pending;
1274}
1275
1276/*
1277 * Once we have recorded IRQ_PENDING already, we can mask the
1278 * interrupt for real. This prevents IRQ storms from unhandled
1279 * devices.
1280 */
1281static void ack_edge_ioapic_irq(unsigned int irq)
1282{
1283 if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
1284 == (IRQ_PENDING | IRQ_DISABLED))
1285 mask_IO_APIC_irq(irq);
1286 ack_APIC_irq();
1287}
1288
1289/*
1290 * Level triggered interrupts can just be masked,
1291 * and shutting down and starting up the interrupt
1292 * is the same as enabling and disabling them -- except
1293 * that startup needs to return a "was pending" value.
1294 *
1295 * Level triggered interrupts are special because we
1296 * do not touch any IO-APIC register while handling
1297 * them. We ack the APIC in the end-IRQ handler, not
1298 * in the start-IRQ-handler. Protection against reentrance
1299 * from the same interrupt is still provided, both by the
1300 * generic IRQ layer and by the fact that an unacked local
1301 * APIC does not accept IRQs.
1302 */
1303static unsigned int startup_level_ioapic_irq (unsigned int irq)
1304{
1305 unmask_IO_APIC_irq(irq);
1306
1307 return 0; /* don't check for pending */
1308}
1309
1310static void end_level_ioapic_irq (unsigned int irq)
1311{
1312 ack_APIC_irq();
1313}
1314
1315static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
1316{
1317 unsigned long flags;
1318 unsigned int dest;
1319
1320 dest = cpu_mask_to_apicid(mask);
1321
1322 /*
1323 * Only the high 8 bits are valid.
1324 */
1325 dest = SET_APIC_LOGICAL_ID(dest);
1326
1327 spin_lock_irqsave(&ioapic_lock, flags);
1328 __DO_ACTION(1, = dest, )
1329 spin_unlock_irqrestore(&ioapic_lock, flags);
1330}
1331
1332#ifdef CONFIG_PCI_MSI
1333static unsigned int startup_edge_ioapic_vector(unsigned int vector)
1334{
1335 int irq = vector_to_irq(vector);
1336
1337 return startup_edge_ioapic_irq(irq);
1338}
1339
1340static void ack_edge_ioapic_vector(unsigned int vector)
1341{
1342 int irq = vector_to_irq(vector);
1343
1344 ack_edge_ioapic_irq(irq);
1345}
1346
1347static unsigned int startup_level_ioapic_vector (unsigned int vector)
1348{
1349 int irq = vector_to_irq(vector);
1350
1351 return startup_level_ioapic_irq (irq);
1352}
1353
1354static void end_level_ioapic_vector (unsigned int vector)
1355{
1356 int irq = vector_to_irq(vector);
1357
1358 end_level_ioapic_irq(irq);
1359}
1360
1361static void mask_IO_APIC_vector (unsigned int vector)
1362{
1363 int irq = vector_to_irq(vector);
1364
1365 mask_IO_APIC_irq(irq);
1366}
1367
1368static void unmask_IO_APIC_vector (unsigned int vector)
1369{
1370 int irq = vector_to_irq(vector);
1371
1372 unmask_IO_APIC_irq(irq);
1373}
1374
1375static void set_ioapic_affinity_vector (unsigned int vector,
1376 cpumask_t cpu_mask)
1377{
1378 int irq = vector_to_irq(vector);
1379
1380 set_ioapic_affinity_irq(irq, cpu_mask);
1381}
1382#endif
1383
1384/*
1385 * Level and edge triggered IO-APIC interrupts need different handling,
1386 * so we use two separate IRQ descriptors. Edge triggered IRQs can be
1387 * handled with the level-triggered descriptor, but that one has slightly
1388 * more overhead. Level-triggered interrupts cannot be handled with the
1389 * edge-triggered handler, without risking IRQ storms and other ugly
1390 * races.
1391 */
1392
1393static struct hw_interrupt_type ioapic_edge_type = {
1394 .typename = "IO-APIC-edge",
1395 .startup = startup_edge_ioapic,
1396 .shutdown = shutdown_edge_ioapic,
1397 .enable = enable_edge_ioapic,
1398 .disable = disable_edge_ioapic,
1399 .ack = ack_edge_ioapic,
1400 .end = end_edge_ioapic,
1401 .set_affinity = set_ioapic_affinity,
1402};
1403
1404static struct hw_interrupt_type ioapic_level_type = {
1405 .typename = "IO-APIC-level",
1406 .startup = startup_level_ioapic,
1407 .shutdown = shutdown_level_ioapic,
1408 .enable = enable_level_ioapic,
1409 .disable = disable_level_ioapic,
1410 .ack = mask_and_ack_level_ioapic,
1411 .end = end_level_ioapic,
1412 .set_affinity = set_ioapic_affinity,
1413};
1414
1415static inline void init_IO_APIC_traps(void)
1416{
1417 int irq;
1418
1419 /*
1420 * NOTE! The local APIC isn't very good at handling
1421 * multiple interrupts at the same interrupt level.
1422 * As the interrupt level is determined by taking the
1423 * vector number and shifting that right by 4, we
1424 * want to spread these out a bit so that they don't
1425 * all fall in the same interrupt level.
1426 *
1427 * Also, we've got to be careful not to trash gate
1428 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1429 */
1430 for (irq = 0; irq < NR_IRQS ; irq++) {
1431 int tmp = irq;
1432 if (use_pci_vector()) {
1433 if (!platform_legacy_irq(tmp))
1434 if ((tmp = vector_to_irq(tmp)) == -1)
1435 continue;
1436 }
1437 if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
1438 /*
1439 * Hmm.. We don't have an entry for this,
1440 * so default to an old-fashioned 8259
1441 * interrupt if we can..
1442 */
1443 if (irq < 16)
1444 make_8259A_irq(irq);
1445 else
1446 /* Strange. Oh, well.. */
1447 irq_desc[irq].handler = &no_irq_type;
1448 }
1449 }
1450}
1451
1452static void enable_lapic_irq (unsigned int irq)
1453{
1454 unsigned long v;
1455
1456 v = apic_read(APIC_LVT0);
1457 apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
1458}
1459
1460static void disable_lapic_irq (unsigned int irq)
1461{
1462 unsigned long v;
1463
1464 v = apic_read(APIC_LVT0);
1465 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
1466}
1467
1468static void ack_lapic_irq (unsigned int irq)
1469{
1470 ack_APIC_irq();
1471}
1472
1473static void end_lapic_irq (unsigned int i) { /* nothing */ }
1474
1475static struct hw_interrupt_type lapic_irq_type = {
1476 .typename = "local-APIC-edge",
1477 .startup = NULL, /* startup_irq() not used for IRQ0 */
1478 .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
1479 .enable = enable_lapic_irq,
1480 .disable = disable_lapic_irq,
1481 .ack = ack_lapic_irq,
1482 .end = end_lapic_irq,
1483};
1484
1485static void setup_nmi (void)
1486{
1487 /*
1488 * Dirty trick to enable the NMI watchdog ...
1489 * We put the 8259A master into AEOI mode and
1490 * unmask on all local APICs LVT0 as NMI.
1491 *
1492 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
1493 * is from Maciej W. Rozycki - so we do not have to EOI from
1494 * the NMI handler or the timer interrupt.
1495 */
1496 printk(KERN_INFO "activating NMI Watchdog ...");
1497
1498 enable_NMI_through_LVT0(NULL);
1499
1500 printk(" done.\n");
1501}
1502
1503/*
1504 * This looks a bit hackish, but it's about the only way of sending
1505 * a few INTA cycles to 8259As and any associated glue logic. ICR does
1506 * not support the ExtINT mode, unfortunately. We need to send these
1507 * cycles as some i82489DX-based boards have glue logic that keeps the
1508 * 8259A interrupt line asserted until INTA. --macro
1509 */
1510static inline void unlock_ExtINT_logic(void)
1511{
1512 int pin, i;
1513 struct IO_APIC_route_entry entry0, entry1;
1514 unsigned char save_control, save_freq_select;
1515 unsigned long flags;
1516
1517 pin = find_isa_irq_pin(8, mp_INT);
1518 if (pin == -1)
1519 return;
1520
1521 spin_lock_irqsave(&ioapic_lock, flags);
1522 *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin);
1523 *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin);
1524 spin_unlock_irqrestore(&ioapic_lock, flags);
1525 clear_IO_APIC_pin(0, pin);
1526
1527 memset(&entry1, 0, sizeof(entry1));
1528
1529 entry1.dest_mode = 0; /* physical delivery */
1530 entry1.mask = 0; /* unmask IRQ now */
1531 entry1.dest.physical.physical_dest = hard_smp_processor_id();
1532 entry1.delivery_mode = dest_ExtINT;
1533 entry1.polarity = entry0.polarity;
1534 entry1.trigger = 0;
1535 entry1.vector = 0;
1536
1537 spin_lock_irqsave(&ioapic_lock, flags);
1538 io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
1539 io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
1540 spin_unlock_irqrestore(&ioapic_lock, flags);
1541
1542 save_control = CMOS_READ(RTC_CONTROL);
1543 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
1544 CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
1545 RTC_FREQ_SELECT);
1546 CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
1547
1548 i = 100;
1549 while (i-- > 0) {
1550 mdelay(10);
1551 if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
1552 i -= 10;
1553 }
1554
1555 CMOS_WRITE(save_control, RTC_CONTROL);
1556 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
1557 clear_IO_APIC_pin(0, pin);
1558
1559 spin_lock_irqsave(&ioapic_lock, flags);
1560 io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
1561 io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
1562 spin_unlock_irqrestore(&ioapic_lock, flags);
1563}
1564
1565/*
1566 * This code may look a bit paranoid, but it's supposed to cooperate with
1567 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
1568 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
1569 * fanatically on his truly buggy board.
1570 */
1571static inline void check_timer(void)
1572{
1573 int pin1, pin2;
1574 int vector;
1575
1576 /*
1577 * get/set the timer IRQ vector:
1578 */
1579 disable_8259A_irq(0);
1580 vector = assign_irq_vector(0);
1581 set_intr_gate(vector, interrupt[0]);
1582
1583 /*
1584 * Subtle, code in do_timer_interrupt() expects an AEOI
1585 * mode for the 8259A whenever interrupts are routed
1586 * through I/O APICs. Also IRQ0 has to be enabled in
1587 * the 8259A which implies the virtual wire has to be
1588 * disabled in the local APIC.
1589 */
1590 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1591 init_8259A(1);
1592 enable_8259A_irq(0);
1593
1594 pin1 = find_isa_irq_pin(0, mp_INT);
1595 pin2 = find_isa_irq_pin(0, mp_ExtINT);
1596
1597 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2);
1598
1599 if (pin1 != -1) {
1600 /*
1601 * Ok, does IRQ0 through the IOAPIC work?
1602 */
1603 unmask_IO_APIC_irq(0);
1604 if (timer_irq_works()) {
1605 nmi_watchdog_default();
1606 if (nmi_watchdog == NMI_IO_APIC) {
1607 disable_8259A_irq(0);
1608 setup_nmi();
1609 enable_8259A_irq(0);
1610 check_nmi_watchdog();
1611 }
1612 return;
1613 }
1614 clear_IO_APIC_pin(0, pin1);
1615 apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n");
1616 }
1617
1618 apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
1619 if (pin2 != -1) {
1620 apic_printk(APIC_VERBOSE,"\n..... (found pin %d) ...", pin2);
1621 /*
1622 * legacy devices should be connected to IO APIC #0
1623 */
1624 setup_ExtINT_IRQ0_pin(pin2, vector);
1625 if (timer_irq_works()) {
1626 printk("works.\n");
1627 nmi_watchdog_default();
1628 if (nmi_watchdog == NMI_IO_APIC) {
1629 setup_nmi();
1630 check_nmi_watchdog();
1631 }
1632 return;
1633 }
1634 /*
1635 * Cleanup, just in case ...
1636 */
1637 clear_IO_APIC_pin(0, pin2);
1638 }
1639 printk(" failed.\n");
1640
1641 if (nmi_watchdog) {
1642 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
1643 nmi_watchdog = 0;
1644 }
1645
1646 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
1647
1648 disable_8259A_irq(0);
1649 irq_desc[0].handler = &lapic_irq_type;
1650 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
1651 enable_8259A_irq(0);
1652
1653 if (timer_irq_works()) {
1654 apic_printk(APIC_QUIET, " works.\n");
1655 return;
1656 }
1657 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
1658 apic_printk(APIC_VERBOSE," failed.\n");
1659
1660 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
1661
1662 init_8259A(0);
1663 make_8259A_irq(0);
1664 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
1665
1666 unlock_ExtINT_logic();
1667
1668 if (timer_irq_works()) {
1669 apic_printk(APIC_VERBOSE," works.\n");
1670 return;
1671 }
1672 apic_printk(APIC_VERBOSE," failed :(.\n");
1673 panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
1674}
1675
1676/*
1677 *
1678 * IRQs that are handled by the PIC in the MPS IOAPIC case.
1679 * - IRQ2 is the cascade IRQ, and cannot be an IO-APIC IRQ.
1680 * Linux doesn't really care, as it's not actually used
1681 * for any interrupt handling anyway.
1682 */
1683#define PIC_IRQS (1<<2)
1684
1685void __init setup_IO_APIC(void)
1686{
1687 enable_IO_APIC();
1688
1689 if (acpi_ioapic)
1690 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
1691 else
1692 io_apic_irqs = ~PIC_IRQS;
1693
1694 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
1695
1696 /*
1697 * Set up the IO-APIC IRQ routing table.
1698 */
1699 if (!acpi_ioapic)
1700 setup_ioapic_ids_from_mpc();
1701 sync_Arb_IDs();
1702 setup_IO_APIC_irqs();
1703 init_IO_APIC_traps();
1704 check_timer();
1705 if (!acpi_ioapic)
1706 print_IO_APIC();
1707}
1708
1709struct sysfs_ioapic_data {
1710 struct sys_device dev;
1711 struct IO_APIC_route_entry entry[0];
1712};
1713static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
1714
1715static int ioapic_suspend(struct sys_device *dev, u32 state)
1716{
1717 struct IO_APIC_route_entry *entry;
1718 struct sysfs_ioapic_data *data;
1719 unsigned long flags;
1720 int i;
1721
1722 data = container_of(dev, struct sysfs_ioapic_data, dev);
1723 entry = data->entry;
1724 spin_lock_irqsave(&ioapic_lock, flags);
1725 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
1726 *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
1727 *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
1728 }
1729 spin_unlock_irqrestore(&ioapic_lock, flags);
1730
1731 return 0;
1732}
1733
1734static int ioapic_resume(struct sys_device *dev)
1735{
1736 struct IO_APIC_route_entry *entry;
1737 struct sysfs_ioapic_data *data;
1738 unsigned long flags;
1739 union IO_APIC_reg_00 reg_00;
1740 int i;
1741
1742 data = container_of(dev, struct sysfs_ioapic_data, dev);
1743 entry = data->entry;
1744
1745 spin_lock_irqsave(&ioapic_lock, flags);
1746 reg_00.raw = io_apic_read(dev->id, 0);
1747 if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
1748 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
1749 io_apic_write(dev->id, 0, reg_00.raw);
1750 }
1751 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
1752 io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
1753 io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
1754 }
1755 spin_unlock_irqrestore(&ioapic_lock, flags);
1756
1757 return 0;
1758}
1759
1760static struct sysdev_class ioapic_sysdev_class = {
1761 set_kset_name("ioapic"),
1762 .suspend = ioapic_suspend,
1763 .resume = ioapic_resume,
1764};
1765
1766static int __init ioapic_init_sysfs(void)
1767{
1768 struct sys_device * dev;
1769 int i, size, error = 0;
1770
1771 error = sysdev_class_register(&ioapic_sysdev_class);
1772 if (error)
1773 return error;
1774
1775 for (i = 0; i < nr_ioapics; i++ ) {
1776 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
1777 * sizeof(struct IO_APIC_route_entry);
1778 mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
1779 if (!mp_ioapic_data[i]) {
1780 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
1781 continue;
1782 }
1783 memset(mp_ioapic_data[i], 0, size);
1784 dev = &mp_ioapic_data[i]->dev;
1785 dev->id = i;
1786 dev->cls = &ioapic_sysdev_class;
1787 error = sysdev_register(dev);
1788 if (error) {
1789 kfree(mp_ioapic_data[i]);
1790 mp_ioapic_data[i] = NULL;
1791 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
1792 continue;
1793 }
1794 }
1795
1796 return 0;
1797}
1798
1799device_initcall(ioapic_init_sysfs);
1800
1801/* --------------------------------------------------------------------------
1802 ACPI-based IOAPIC Configuration
1803 -------------------------------------------------------------------------- */
1804
1805#ifdef CONFIG_ACPI_BOOT
1806
1807#define IO_APIC_MAX_ID 0xFE
1808
1809int __init io_apic_get_unique_id (int ioapic, int apic_id)
1810{
1811 union IO_APIC_reg_00 reg_00;
1812 static physid_mask_t apic_id_map;
1813 unsigned long flags;
1814 int i = 0;
1815
1816 /*
1817 * The P4 platform supports up to 256 APIC IDs on two separate APIC
1818 * buses (one for LAPICs, one for IOAPICs), where predecessors only
1819 * support up to 16 on one shared APIC bus.
1820 *
1821 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
1822 * advantage of new APIC bus architecture.
1823 */
1824
1825 if (physids_empty(apic_id_map))
1826 apic_id_map = phys_cpu_present_map;
1827
1828 spin_lock_irqsave(&ioapic_lock, flags);
1829 reg_00.raw = io_apic_read(ioapic, 0);
1830 spin_unlock_irqrestore(&ioapic_lock, flags);
1831
1832 if (apic_id >= IO_APIC_MAX_ID) {
1833 apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
1834 "%d\n", ioapic, apic_id, reg_00.bits.ID);
1835 apic_id = reg_00.bits.ID;
1836 }
1837
1838 /*
1839 * Every APIC in a system must have a unique ID or we get lots of nice
1840 * 'stuck on smp_invalidate_needed IPI wait' messages.
1841 */
1842 if (physid_isset(apic_id, apic_id_map)) {
1843
1844 for (i = 0; i < IO_APIC_MAX_ID; i++) {
1845 if (!physid_isset(i, apic_id_map))
1846 break;
1847 }
1848
1849 if (i == IO_APIC_MAX_ID)
1850 panic("Max apic_id exceeded!\n");
1851
1852 apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
1853 "trying %d\n", ioapic, apic_id, i);
1854
1855 apic_id = i;
1856 }
1857
1858 physid_set(apic_id, apic_id_map);
1859
1860 if (reg_00.bits.ID != apic_id) {
1861 reg_00.bits.ID = apic_id;
1862
1863 spin_lock_irqsave(&ioapic_lock, flags);
1864 io_apic_write(ioapic, 0, reg_00.raw);
1865 reg_00.raw = io_apic_read(ioapic, 0);
1866 spin_unlock_irqrestore(&ioapic_lock, flags);
1867
1868 /* Sanity check */
1869 if (reg_00.bits.ID != apic_id)
1870 panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic);
1871 }
1872
1873 apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
1874
1875 return apic_id;
1876}
1877
1878
1879int __init io_apic_get_version (int ioapic)
1880{
1881 union IO_APIC_reg_01 reg_01;
1882 unsigned long flags;
1883
1884 spin_lock_irqsave(&ioapic_lock, flags);
1885 reg_01.raw = io_apic_read(ioapic, 1);
1886 spin_unlock_irqrestore(&ioapic_lock, flags);
1887
1888 return reg_01.bits.version;
1889}
1890
1891
1892int __init io_apic_get_redir_entries (int ioapic)
1893{
1894 union IO_APIC_reg_01 reg_01;
1895 unsigned long flags;
1896
1897 spin_lock_irqsave(&ioapic_lock, flags);
1898 reg_01.raw = io_apic_read(ioapic, 1);
1899 spin_unlock_irqrestore(&ioapic_lock, flags);
1900
1901 return reg_01.bits.entries;
1902}
1903
1904
1905int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
1906{
1907 struct IO_APIC_route_entry entry;
1908 unsigned long flags;
1909
1910 if (!IO_APIC_IRQ(irq)) {
1911 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
1912 ioapic);
1913 return -EINVAL;
1914 }
1915
1916 /*
1917 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
1918 * Note that we mask (disable) IRQs now -- these get enabled when the
1919 * corresponding device driver registers for this IRQ.
1920 */
1921
1922 memset(&entry,0,sizeof(entry));
1923
1924 entry.delivery_mode = INT_DELIVERY_MODE;
1925 entry.dest_mode = INT_DEST_MODE;
1926 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
1927 entry.trigger = edge_level;
1928 entry.polarity = active_high_low;
1929 entry.mask = 1; /* Disabled (masked) */
1930
1931 /*
1932 * IRQs < 16 are already in the irq_2_pin[] map
1933 */
1934 if (irq >= 16)
1935 add_pin_to_irq(irq, ioapic, pin);
1936
1937 entry.vector = assign_irq_vector(irq);
1938
1939 apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
1940 "IRQ %d Mode:%i Active:%i)\n", ioapic,
1941 mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
1942 edge_level, active_high_low);
1943
1944 ioapic_register_intr(irq, entry.vector, edge_level);
1945
1946 if (!ioapic && (irq < 16))
1947 disable_8259A_irq(irq);
1948
1949 spin_lock_irqsave(&ioapic_lock, flags);
1950 io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
1951 io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
1952 spin_unlock_irqrestore(&ioapic_lock, flags);
1953
1954 return 0;
1955}
1956
1957#endif /*CONFIG_ACPI_BOOT*/
1958
1959
1960/*
1961 * This function is currently only a helper for the SMP boot process, where
1962 * we need to reprogram the ioredtbls to cater for the CPUs which have come
1963 * online, so the mask in all cases should simply be TARGET_CPUS.
1964 */
1965void __init setup_ioapic_dest(void)
1966{
1967 int pin, ioapic, irq, irq_entry;
1968
1969 if (skip_ioapic_setup == 1)
1970 return;
1971
1972 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
1973 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
1974 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
1975 if (irq_entry == -1)
1976 continue;
1977 irq = pin_2_irq(irq_entry, ioapic, pin);
1978 set_ioapic_affinity_irq(irq, TARGET_CPUS);
1979 }
1980
1981 }
1982}
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
new file mode 100644
index 000000000000..cde0e868efe3
--- /dev/null
+++ b/arch/x86_64/kernel/ioport.c
@@ -0,0 +1,117 @@
1/*
2 * linux/arch/x86_64/kernel/ioport.c
3 *
4 * This contains the io-permission bitmap code - written by obz, with changes
5 * by Linus.
6 */
7
8#include <linux/sched.h>
9#include <linux/kernel.h>
10#include <linux/errno.h>
11#include <linux/types.h>
12#include <linux/ioport.h>
13#include <linux/smp.h>
14#include <linux/smp_lock.h>
15#include <linux/stddef.h>
16#include <linux/slab.h>
17#include <linux/thread_info.h>
18
19/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
20static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
21{
22 int i;
23 if (new_value)
24 for (i = base; i < base + extent; i++)
25 __set_bit(i, bitmap);
26 else
27 for (i = base; i < base + extent; i++)
28 clear_bit(i, bitmap);
29}
30
31/*
32 * This changes the I/O permission bitmap in the current task.
33 */
34asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
35{
36 unsigned int i, max_long, bytes, bytes_updated;
37 struct thread_struct * t = &current->thread;
38 struct tss_struct * tss;
39 unsigned long *bitmap;
40
41 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
42 return -EINVAL;
43 if (turn_on && !capable(CAP_SYS_RAWIO))
44 return -EPERM;
45
46 /*
47 * If it's the first ioperm() call in this thread's lifetime, set the
48 * IO bitmap up. ioperm() is much less timing critical than clone(),
49 * this is why we delay this operation until now:
50 */
51 if (!t->io_bitmap_ptr) {
52 bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
53 if (!bitmap)
54 return -ENOMEM;
55
56 memset(bitmap, 0xff, IO_BITMAP_BYTES);
57 t->io_bitmap_ptr = bitmap;
58 }
59
60 /*
61 * do it in the per-thread copy and in the TSS ...
62 *
63 * Disable preemption via get_cpu() - we must not switch away
64 * because the ->io_bitmap_max value must match the bitmap
65 * contents:
66 */
67 tss = &per_cpu(init_tss, get_cpu());
68
69 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
70
71 /*
72 * Search for a (possibly new) maximum. This is simple and stupid,
73 * to keep it obviously correct:
74 */
75 max_long = 0;
76 for (i = 0; i < IO_BITMAP_LONGS; i++)
77 if (t->io_bitmap_ptr[i] != ~0UL)
78 max_long = i;
79
80 bytes = (max_long + 1) * sizeof(long);
81 bytes_updated = max(bytes, t->io_bitmap_max);
82
83 t->io_bitmap_max = bytes;
84
85 /* Update the TSS: */
86 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
87
88 put_cpu();
89
90 return 0;
91}
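/*
 * Illustrative user-space counterpart (not code from this file; port
 * 0x378, the traditional parallel-port base, is just an example value):
 * a process with CAP_SYS_RAWIO can ask for a small port range and then
 * drive it directly via the glibc wrappers in <sys/io.h>:
 *
 *	#include <sys/io.h>
 *
 *	int main(void)
 *	{
 *		if (ioperm(0x378, 3, 1))	// enable ports 0x378-0x37a
 *			return 1;
 *		outb(0xff, 0x378);		// drive all data lines high
 *		ioperm(0x378, 3, 0);		// drop access again
 *		return 0;
 *	}
 */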
92
93/*
94 * sys_iopl has to be used when you want to access the IO ports
95 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
96 * you'd need 8kB of bitmaps/process, which is a bit excessive.
97 *
98 * Here we just change the eflags value on the stack: we allow
99 * only the super-user to do it. This depends on the stack-layout
100 * on system-call entry - see also fork() and the signal handling
101 * code.
102 */
103
104asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
105{
106 unsigned int old = (regs->eflags >> 12) & 3;
107
108 if (level > 3)
109 return -EINVAL;
110 /* Trying to gain more privileges? */
111 if (level > old) {
112 if (!capable(CAP_SYS_RAWIO))
113 return -EPERM;
114 }
115 regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12);
116 return 0;
117}
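/*
 * Illustrative user-space usage (hypothetical snippet): iopl(3) raises
 * the IOPL field -- bits 12-13 of eflags, hence the masking with 0x3000
 * above -- to ring 3, granting access to all 65536 ports (and, as a
 * side effect, to cli/sti):
 *
 *	#include <sys/io.h>
 *
 *	if (iopl(3) == 0)
 *		outb(0, 0x80);	// POST diagnostic port, harmless target
 */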
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
new file mode 100644
index 000000000000..62b112e4deb4
--- /dev/null
+++ b/arch/x86_64/kernel/irq.c
@@ -0,0 +1,108 @@
1/*
2 * linux/arch/x86_64/kernel/irq.c
3 *
4 * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
5 *
6 * This file contains the lowest level x86_64-specific interrupt
7 * entry and irq statistics code. All the remaining irq logic is
8 * done by the generic kernel/irq/ code and in the
9 * x86_64-specific irq controller code. (e.g. i8259.c and
10 * io_apic.c.)
11 */
12
13#include <linux/kernel_stat.h>
14#include <linux/interrupt.h>
15#include <linux/seq_file.h>
16#include <linux/module.h>
17#include <asm/uaccess.h>
18#include <asm/io_apic.h>
19
20atomic_t irq_err_count;
21#ifdef CONFIG_X86_IO_APIC
22#ifdef APIC_MISMATCH_DEBUG
23atomic_t irq_mis_count;
24#endif
25#endif
26
27/*
28 * Generic, controller-independent functions:
29 */
30
31int show_interrupts(struct seq_file *p, void *v)
32{
33 int i = *(loff_t *) v, j;
34 struct irqaction * action;
35 unsigned long flags;
36
37 if (i == 0) {
38 seq_printf(p, " ");
39 for (j=0; j<NR_CPUS; j++)
40 if (cpu_online(j))
41 seq_printf(p, "CPU%d ",j);
42 seq_putc(p, '\n');
43 }
44
45 if (i < NR_IRQS) {
46 spin_lock_irqsave(&irq_desc[i].lock, flags);
47 action = irq_desc[i].action;
48 if (!action)
49 goto skip;
50 seq_printf(p, "%3d: ",i);
51#ifndef CONFIG_SMP
52 seq_printf(p, "%10u ", kstat_irqs(i));
53#else
54 for (j=0; j<NR_CPUS; j++)
55 if (cpu_online(j))
56 seq_printf(p, "%10u ",
57 kstat_cpu(j).irqs[i]);
58#endif
59 seq_printf(p, " %14s", irq_desc[i].handler->typename);
60
61 seq_printf(p, " %s", action->name);
62 for (action=action->next; action; action = action->next)
63 seq_printf(p, ", %s", action->name);
64 seq_putc(p, '\n');
65skip:
66 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
67 } else if (i == NR_IRQS) {
68 seq_printf(p, "NMI: ");
69 for (j = 0; j < NR_CPUS; j++)
70 if (cpu_online(j))
71 seq_printf(p, "%10u ", cpu_pda[j].__nmi_count);
72 seq_putc(p, '\n');
73#ifdef CONFIG_X86_LOCAL_APIC
74 seq_printf(p, "LOC: ");
75 for (j = 0; j < NR_CPUS; j++)
76 if (cpu_online(j))
77 seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs);
78 seq_putc(p, '\n');
79#endif
80 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
81#ifdef CONFIG_X86_IO_APIC
82#ifdef APIC_MISMATCH_DEBUG
83 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
84#endif
85#endif
86 }
87 return 0;
88}
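/*
 * Illustrative /proc/interrupts excerpt produced by the code above on a
 * hypothetical two-CPU box (all counts are made up):
 *
 *	           CPU0       CPU1
 *	  0:    5812456          0    IO-APIC-edge  timer
 *	 14:      12034        877    IO-APIC-edge  ide0
 *	NMI:          0          0
 *	LOC:    5811901    5811900
 *	ERR:          0
 */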
89
90/*
91 * do_IRQ handles all normal device IRQs (the special
92 * SMP cross-CPU interrupts have their own specific
93 * handlers).
94 */
95asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
96{
97 /* high bits used in ret_from_ code */
98 unsigned irq = regs->orig_rax & 0xff;
99
100 irq_enter();
101 BUG_ON(irq > 256);
102
103 __do_IRQ(irq, regs);
104 irq_exit();
105
106 return 1;
107}
108
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
new file mode 100644
index 000000000000..4f2a852299b6
--- /dev/null
+++ b/arch/x86_64/kernel/kprobes.c
@@ -0,0 +1,631 @@
1/*
2 * Kernel Probes (KProbes)
3 * arch/x86_64/kernel/kprobes.c
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright (C) IBM Corporation, 2002, 2004
20 *
21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
22 * Probes initial implementation (includes contributions from
23 * Rusty Russell).
24 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
25 * interface to access function arguments.
26 * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi
27 * <prasanna@in.ibm.com> adapted for x86_64
28 * 2005-Mar Roland McGrath <roland@redhat.com>
29 * Fixed to handle %rip-relative addressing mode correctly.
30 */
31
32#include <linux/config.h>
33#include <linux/kprobes.h>
34#include <linux/ptrace.h>
35#include <linux/spinlock.h>
36#include <linux/string.h>
37#include <linux/slab.h>
38#include <linux/preempt.h>
39#include <linux/moduleloader.h>
40
41#include <asm/pgtable.h>
42#include <asm/kdebug.h>
43
44static DECLARE_MUTEX(kprobe_mutex);
45
46/* kprobe_status settings */
47#define KPROBE_HIT_ACTIVE 0x00000001
48#define KPROBE_HIT_SS 0x00000002
49
50static struct kprobe *current_kprobe;
51static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags;
52static struct pt_regs jprobe_saved_regs;
53static long *jprobe_saved_rsp;
54static kprobe_opcode_t *get_insn_slot(void);
55static void free_insn_slot(kprobe_opcode_t *slot);
56void jprobe_return_end(void);
57
58/* copy of the kernel stack at the probe fire time */
59static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
60
61/*
62 * Returns non-zero if the opcode modifies the interrupt flag.
63 */
64static inline int is_IF_modifier(kprobe_opcode_t *insn)
65{
66 switch (*insn) {
67 case 0xfa: /* cli */
68 case 0xfb: /* sti */
69 case 0xcf: /* iret/iretd */
70 case 0x9d: /* popf/popfd */
71 return 1;
72 }
73
74 if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf)
75 return 1;
76 return 0;
77}
78
79int arch_prepare_kprobe(struct kprobe *p)
80{
81 /* insn: must be on special executable page on x86_64. */
82 up(&kprobe_mutex);
83 p->ainsn.insn = get_insn_slot();
84 down(&kprobe_mutex);
85 if (!p->ainsn.insn) {
86 return -ENOMEM;
87 }
88 return 0;
89}
90
91/*
92 * Determine if the instruction uses the %rip-relative addressing mode.
93 * If it does, return the address of the 32-bit displacement word.
94 * If not, return null.
95 */
96static inline s32 *is_riprel(u8 *insn)
97{
98#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
99 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
100 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
101 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
102 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
103 << (row % 64))
104 static const u64 onebyte_has_modrm[256 / 64] = {
105 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
106 /* ------------------------------- */
107 W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
108 W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
109 W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
110 W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
111 W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
112 W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
113 W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
114 W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
115 W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
116 W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
117 W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
118 W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
119 W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
120 W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
121 W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
122 W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */
123 /* ------------------------------- */
124 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
125 };
126 static const u64 twobyte_has_modrm[256 / 64] = {
127 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
128 /* ------------------------------- */
129 W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
130 W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
131 W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
132 W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
133 W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
134 W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
135 W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
136 W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
137 W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
138 W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
139 W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
140 W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
141 W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
142 W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
143 W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
144 W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */
145 /* ------------------------------- */
146 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
147 };
148#undef W
149 int need_modrm;
150
151 /* Skip legacy instruction prefixes. */
152 while (1) {
153 switch (*insn) {
154 case 0x66:
155 case 0x67:
156 case 0x2e:
157 case 0x3e:
158 case 0x26:
159 case 0x64:
160 case 0x65:
161 case 0x36:
162 case 0xf0:
163 case 0xf3:
164 case 0xf2:
165 ++insn;
166 continue;
167 }
168 break;
169 }
170
171 /* Skip REX instruction prefix. */
172 if ((*insn & 0xf0) == 0x40)
173 ++insn;
174
175 if (*insn == 0x0f) { /* Two-byte opcode. */
176 ++insn;
177 need_modrm = test_bit(*insn, twobyte_has_modrm);
178 } else { /* One-byte opcode. */
179 need_modrm = test_bit(*insn, onebyte_has_modrm);
180 }
181
182 if (need_modrm) {
183 u8 modrm = *++insn;
184 if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
185 /* Displacement follows ModRM byte. */
186 return (s32 *) ++insn;
187 }
188 }
189
190 /* No %rip-relative addressing mode here. */
191 return NULL;
192}
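/*
 * Illustrative sketch (not part of the original file, assuming
 * kprobe_opcode_t is a byte): "mov 0x12345678(%rip),%rax" encodes as
 * 48 8b 05 78 56 34 12 - REX.W, opcode 0x8b, ModRM 0x05 (mod=00,
 * rm=101), then the 32-bit displacement.  is_riprel() skips the REX
 * prefix, sees in the one-byte table that 0x8b takes a ModRM byte,
 * matches the %rip+disp32 form and returns a pointer into the
 * displacement:
 */
#if 0	/* example only */
static void riprel_example(void)
{
	u8 insn[] = { 0x48, 0x8b, 0x05, 0x78, 0x56, 0x34, 0x12 };
	s32 *disp = is_riprel(insn);

	BUG_ON(disp != (s32 *)&insn[3]);	/* *disp == 0x12345678 */
}
#endif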
193
194void arch_copy_kprobe(struct kprobe *p)
195{
196 s32 *ripdisp;
197 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
198 ripdisp = is_riprel(p->ainsn.insn);
199 if (ripdisp) {
200 /*
201 * The copied instruction uses the %rip-relative
202 * addressing mode. Adjust the displacement for the
203 * difference between the original location of this
204 * instruction and the location of the copy that will
205 * actually be run. The tricky bit here is making sure
206 * that the sign extension happens correctly in this
207 * calculation, since we need a signed 32-bit result to
208 * be sign-extended to 64 bits when it's added to the
209 * %rip value and yield the same 64-bit result that the
210 * sign-extension of the original signed 32-bit
211 * displacement would have given.
212 */
213 s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
214 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
215 *ripdisp = disp;
216 }
217}
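/*
 * Worked example of the fixup above (illustrative numbers): if the
 * probed instruction sits at 0xffffffff80100000 with an original
 * displacement of 0x1000, and the copy sits at 0xffffffffa0000000,
 * then
 *
 *	disp = 0xffffffff80100000 + 0x1000 - 0xffffffffa0000000
 *	     = -0x1feff000
 *
 * so copy + disp addresses exactly the byte that original + 0x1000
 * addressed.  The BUG_ON() fires only if the copy landed more than
 * 2GB away, which module_alloc() (see get_insn_slot() below) is
 * meant to prevent.
 */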
218
219void arch_remove_kprobe(struct kprobe *p)
220{
221 down(&kprobe_mutex);
222 free_insn_slot(p->ainsn.insn);
223 up(&kprobe_mutex);
224}
225
226static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
227{
228 *p->addr = p->opcode;
229 regs->rip = (unsigned long)p->addr;
230}
231
232static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
233{
234 regs->eflags |= TF_MASK;
235 regs->eflags &= ~IF_MASK;
236 /* single step inline if the instruction is an int3 */
237 if (p->opcode == BREAKPOINT_INSTRUCTION)
238 regs->rip = (unsigned long)p->addr;
239 else
240 regs->rip = (unsigned long)p->ainsn.insn;
241}
242
243/*
244 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
245 * remain disabled throughout this function.
246 */
247int kprobe_handler(struct pt_regs *regs)
248{
249 struct kprobe *p;
250 int ret = 0;
251 kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
252
253 /* We're in an interrupt, but this is clear and BUG()-safe. */
254 preempt_disable();
255
256 /* Check we're not actually recursing */
257 if (kprobe_running()) {
258 /* We *are* holding lock here, so this is safe.
259 Disarm the probe we just hit, and ignore it. */
260 p = get_kprobe(addr);
261 if (p) {
262 if (kprobe_status == KPROBE_HIT_SS) {
263 regs->eflags &= ~TF_MASK;
264 regs->eflags |= kprobe_saved_rflags;
265 unlock_kprobes();
266 goto no_kprobe;
267 }
268 disarm_kprobe(p, regs);
269 ret = 1;
270 } else {
271 p = current_kprobe;
272 if (p->break_handler && p->break_handler(p, regs)) {
273 goto ss_probe;
274 }
275 }
276 /* If it's not ours, it can't be a delete race (we hold the lock). */
277 goto no_kprobe;
278 }
279
280 lock_kprobes();
281 p = get_kprobe(addr);
282 if (!p) {
283 unlock_kprobes();
284 if (*addr != BREAKPOINT_INSTRUCTION) {
285 /*
286 * The breakpoint instruction was removed right
287 * after we hit it. Another cpu has removed
288 * either a probepoint or a debugger breakpoint
289 * at this address. In either case, no further
290 * handling of this interrupt is appropriate.
291 */
292 ret = 1;
293 }
294 /* Not one of ours: let kernel handle it */
295 goto no_kprobe;
296 }
297
298 kprobe_status = KPROBE_HIT_ACTIVE;
299 current_kprobe = p;
300 kprobe_saved_rflags = kprobe_old_rflags
301 = (regs->eflags & (TF_MASK | IF_MASK));
302 if (is_IF_modifier(p->ainsn.insn))
303 kprobe_saved_rflags &= ~IF_MASK;
304
305 if (p->pre_handler && p->pre_handler(p, regs))
306 /* handler has already set things up, so skip ss setup */
307 return 1;
308
309ss_probe:
310 prepare_singlestep(p, regs);
311 kprobe_status = KPROBE_HIT_SS;
312 return 1;
313
314no_kprobe:
315 preempt_enable_no_resched();
316 return ret;
317}
318
319/*
320 * Called after single-stepping. p->addr is the address of the
321 * instruction whose first byte has been replaced by the "int 3"
322 * instruction. To avoid the SMP problems that can occur when we
323 * temporarily put back the original opcode to single-step, we
324 * single-stepped a copy of the instruction. The address of this
325 * copy is p->ainsn.insn.
326 *
327 * This function prepares to return from the post-single-step
328 * interrupt. We have to fix up the stack as follows:
329 *
330 * 0) Except in the case of absolute or indirect jump or call instructions,
331 * the new rip is relative to the copied instruction. We need to make
332 * it relative to the original instruction.
333 *
334 * 1) If the single-stepped instruction was pushfl, then the TF and IF
335 * flags are set in the just-pushed eflags, and may need to be cleared.
336 *
337 * 2) If the single-stepped instruction was a call, the return address
338 * that is atop the stack is the address following the copied instruction.
339 * We need to make it the address following the original instruction.
340 */
341static void resume_execution(struct kprobe *p, struct pt_regs *regs)
342{
343 unsigned long *tos = (unsigned long *)regs->rsp;
344 unsigned long next_rip = 0;
345 unsigned long copy_rip = (unsigned long)p->ainsn.insn;
346 unsigned long orig_rip = (unsigned long)p->addr;
347 kprobe_opcode_t *insn = p->ainsn.insn;
348
349 /* skip the REX prefix */
350 if (*insn >= 0x40 && *insn <= 0x4f)
351 insn++;
352
353 switch (*insn) {
354 case 0x9c: /* pushfl */
355 *tos &= ~(TF_MASK | IF_MASK);
356 *tos |= kprobe_old_rflags;
357 break;
358 case 0xe8: /* call relative - Fix return addr */
359 *tos = orig_rip + (*tos - copy_rip);
360 break;
361 case 0xff:
362 if ((*insn & 0x30) == 0x10) {
363 /* call absolute, indirect */
364 /* Fix return addr; rip is correct. */
365 next_rip = regs->rip;
366 *tos = orig_rip + (*tos - copy_rip);
367 } else if (((*insn & 0x31) == 0x20) || /* jmp near, absolute indirect */
368 ((*insn & 0x31) == 0x21)) { /* jmp far, absolute indirect */
369 /* rip is correct. */
370 next_rip = regs->rip;
371 }
372 break;
373 case 0xea: /* jmp absolute -- rip is correct */
374 next_rip = regs->rip;
375 break;
376 default:
377 break;
378 }
379
380 regs->eflags &= ~TF_MASK;
381 if (next_rip) {
382 regs->rip = next_rip;
383 } else {
384 regs->rip = orig_rip + (regs->rip - copy_rip);
385 }
386}
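/*
 * Worked example for the default case above (illustrative numbers):
 * after single-stepping a 5-byte copy at copy_rip = 0xffffffffa0000000
 * of an instruction at orig_rip = 0xffffffff80100000, the trap leaves
 * regs->rip == copy_rip + 5, so
 *
 *	regs->rip = orig_rip + (copy_rip + 5 - copy_rip)
 *	          = orig_rip + 5
 *
 * i.e. execution resumes at the instruction following the original.
 */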
387
388/*
389 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
390 * remain disabled throughout this function, and we hold the kprobe lock.
391 */
392int post_kprobe_handler(struct pt_regs *regs)
393{
394 if (!kprobe_running())
395 return 0;
396
397 if (current_kprobe->post_handler)
398 current_kprobe->post_handler(current_kprobe, regs, 0);
399
400 resume_execution(current_kprobe, regs);
401 regs->eflags |= kprobe_saved_rflags;
402
403 unlock_kprobes();
404 preempt_enable_no_resched();
405
406 /*
407 * if somebody else is singlestepping across a probe point, eflags
408 * will have TF set, in which case, continue the remaining processing
409 * of do_debug, as if this is not a probe hit.
410 */
411 if (regs->eflags & TF_MASK)
412 return 0;
413
414 return 1;
415}
416
417/* Interrupts disabled, kprobe_lock held. */
418int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
419{
420 if (current_kprobe->fault_handler
421 && current_kprobe->fault_handler(current_kprobe, regs, trapnr))
422 return 1;
423
424 if (kprobe_status & KPROBE_HIT_SS) {
425 resume_execution(current_kprobe, regs);
426 regs->eflags |= kprobe_old_rflags;
427
428 unlock_kprobes();
429 preempt_enable_no_resched();
430 }
431 return 0;
432}
433
434/*
435 * Wrapper routine for handling exceptions.
436 */
437int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
438 void *data)
439{
440 struct die_args *args = (struct die_args *)data;
441 switch (val) {
442 case DIE_INT3:
443 if (kprobe_handler(args->regs))
444 return NOTIFY_STOP;
445 break;
446 case DIE_DEBUG:
447 if (post_kprobe_handler(args->regs))
448 return NOTIFY_STOP;
449 break;
450 case DIE_GPF:
451 if (kprobe_running() &&
452 kprobe_fault_handler(args->regs, args->trapnr))
453 return NOTIFY_STOP;
454 break;
455 case DIE_PAGE_FAULT:
456 if (kprobe_running() &&
457 kprobe_fault_handler(args->regs, args->trapnr))
458 return NOTIFY_STOP;
459 break;
460 default:
461 break;
462 }
463 return NOTIFY_DONE;
464}
465
466int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
467{
468 struct jprobe *jp = container_of(p, struct jprobe, kp);
469 unsigned long addr;
470
471 jprobe_saved_regs = *regs;
472 jprobe_saved_rsp = (long *) regs->rsp;
473 addr = (unsigned long)jprobe_saved_rsp;
474 /*
475 * As Linus pointed out, gcc assumes that the callee
476 * owns the argument space and could overwrite it, e.g.
477 * tailcall optimization. So, to be absolutely safe
478 * we also save and restore enough stack bytes to cover
479 * the argument area.
480 */
481 memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr));
482 regs->eflags &= ~IF_MASK;
483 regs->rip = (unsigned long)(jp->entry);
484 return 1;
485}
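/*
 * Note (assumption based on the historical asm-x86_64/kprobes.h):
 * MIN_STACK_SIZE(addr) evaluates to the smaller of MAX_STACK_SIZE and
 * the bytes remaining between addr and the top of the current kernel
 * stack, so the memcpy above cannot run past the stack even when the
 * probe fires deep in a call chain.
 */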
486
487void jprobe_return(void)
488{
489 preempt_enable_no_resched();
490 asm volatile (" xchg %%rbx,%%rsp \n"
491 " int3 \n"
492 " .globl jprobe_return_end \n"
493 " jprobe_return_end: \n"
494 " nop \n"::"b"
495 (jprobe_saved_rsp):"memory");
496}
497
498int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
499{
500 u8 *addr = (u8 *) (regs->rip - 1);
501 unsigned long stack_addr = (unsigned long)jprobe_saved_rsp;
502 struct jprobe *jp = container_of(p, struct jprobe, kp);
503
504 if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
505 if ((long *)regs->rsp != jprobe_saved_rsp) {
506 struct pt_regs *saved_regs =
507 container_of(jprobe_saved_rsp, struct pt_regs, rsp);
508 printk("current rsp %p does not match saved rsp %p\n",
509 (long *)regs->rsp, jprobe_saved_rsp);
510 printk("Saved registers for jprobe %p\n", jp);
511 show_registers(saved_regs);
512 printk("Current registers\n");
513 show_registers(regs);
514 BUG();
515 }
516 *regs = jprobe_saved_regs;
517 memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack,
518 MIN_STACK_SIZE(stack_addr));
519 return 1;
520 }
521 return 0;
522}
523
524/*
525 * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped.
526 * By default on x86_64, pages we get from kmalloc or vmalloc are not
527 * executable. Single-stepping an instruction on such a page yields an
528 * oops. So instead of storing the instruction copies in their respective
529 * kprobe objects, we allocate a page, map it executable, and store all the
530 * instruction copies there. (We can allocate additional pages if somebody
531 * inserts a huge number of probes.) Each page can hold up to INSNS_PER_PAGE
532 * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t)
533 * bytes.
534 */
535#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t)))
536struct kprobe_insn_page {
537 struct hlist_node hlist;
538 kprobe_opcode_t *insns; /* page of instruction slots */
539 char slot_used[INSNS_PER_PAGE];
540 int nused;
541};
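/*
 * Worked example (assuming the historical x86_64 values PAGE_SIZE ==
 * 4096, MAX_INSN_SIZE == 15 and sizeof(kprobe_opcode_t) == 1):
 * INSNS_PER_PAGE == 4096 / 15 == 273, so one executable page holds
 * 273 instruction slots tracked by a 273-byte slot_used[] array.
 */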
542
543static struct hlist_head kprobe_insn_pages;
544
545/**
546 * get_insn_slot() - Find a slot on an executable page for an instruction.
547 * We allocate an executable page if there's no room on existing ones.
548 */
549static kprobe_opcode_t *get_insn_slot(void)
550{
551 struct kprobe_insn_page *kip;
552 struct hlist_node *pos;
553
554 hlist_for_each(pos, &kprobe_insn_pages) {
555 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
556 if (kip->nused < INSNS_PER_PAGE) {
557 int i;
558 for (i = 0; i < INSNS_PER_PAGE; i++) {
559 if (!kip->slot_used[i]) {
560 kip->slot_used[i] = 1;
561 kip->nused++;
562 return kip->insns + (i*MAX_INSN_SIZE);
563 }
564 }
565 /* Surprise! No unused slots. Fix kip->nused. */
566 kip->nused = INSNS_PER_PAGE;
567 }
568 }
569
570 /* All out of space. Need to allocate a new page. Use slot 0.*/
571 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
572 if (!kip) {
573 return NULL;
574 }
575
576 /*
577 * For the %rip-relative displacement fixups to be doable, we
578 * need our instruction copy to be within +/- 2GB of any data it
579 * might access via %rip. That is, within 2GB of where the
580 * kernel image and loaded module images reside. So we allocate
581 * a page in the module loading area.
582 */
583 kip->insns = module_alloc(PAGE_SIZE);
584 if (!kip->insns) {
585 kfree(kip);
586 return NULL;
587 }
588 INIT_HLIST_NODE(&kip->hlist);
589 hlist_add_head(&kip->hlist, &kprobe_insn_pages);
590 memset(kip->slot_used, 0, INSNS_PER_PAGE);
591 kip->slot_used[0] = 1;
592 kip->nused = 1;
593 return kip->insns;
594}
595
596/**
597 * free_insn_slot() - Free instruction slot obtained from get_insn_slot().
598 */
599static void free_insn_slot(kprobe_opcode_t *slot)
600{
601 struct kprobe_insn_page *kip;
602 struct hlist_node *pos;
603
604 hlist_for_each(pos, &kprobe_insn_pages) {
605 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
606 if (kip->insns <= slot
607 && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) {
608 int i = (slot - kip->insns) / MAX_INSN_SIZE;
609 kip->slot_used[i] = 0;
610 kip->nused--;
611 if (kip->nused == 0) {
612 /*
613 * Page is no longer in use. Free it unless
614 * it's the last one. We keep the last one
615 * so as not to have to set it up again the
616 * next time somebody inserts a probe.
617 */
618 hlist_del(&kip->hlist);
619 if (hlist_empty(&kprobe_insn_pages)) {
620 INIT_HLIST_NODE(&kip->hlist);
621 hlist_add_head(&kip->hlist,
622 &kprobe_insn_pages);
623 } else {
624 module_free(NULL, kip->insns);
625 kfree(kip);
626 }
627 }
628 return;
629 }
630 }
631}
diff --git a/arch/x86_64/kernel/ldt.c b/arch/x86_64/kernel/ldt.c
new file mode 100644
index 000000000000..d7e5d0cf4285
--- /dev/null
+++ b/arch/x86_64/kernel/ldt.c
@@ -0,0 +1,253 @@
1/*
2 * linux/arch/x86_64/kernel/ldt.c
3 *
4 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
5 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
6 * Copyright (C) 2002 Andi Kleen
7 *
8 * This handles calls from both 32bit and 64bit mode.
9 */
10
11#include <linux/errno.h>
12#include <linux/sched.h>
13#include <linux/string.h>
14#include <linux/mm.h>
15#include <linux/smp.h>
16#include <linux/smp_lock.h>
17#include <linux/vmalloc.h>
18#include <linux/slab.h>
19
20#include <asm/uaccess.h>
21#include <asm/system.h>
22#include <asm/ldt.h>
23#include <asm/desc.h>
24#include <asm/proto.h>
25
26#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
27static void flush_ldt(void *null)
28{
29 if (current->active_mm)
30 load_LDT(&current->active_mm->context);
31}
32#endif
33
34static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
35{
36 void *oldldt;
37 void *newldt;
38 unsigned oldsize;
39
40 if (mincount <= (unsigned)pc->size)
41 return 0;
42 oldsize = pc->size;
43 mincount = (mincount+511)&(~511);
44 if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
45 newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
46 else
47 newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
48
49 if (!newldt)
50 return -ENOMEM;
51
52 if (oldsize)
53 memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
54 oldldt = pc->ldt;
55 memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
56 wmb();
57 pc->ldt = newldt;
58 wmb();
59 pc->size = mincount;
60 wmb();
61 if (reload) {
62#ifdef CONFIG_SMP
63 cpumask_t mask;
64
65 preempt_disable();
66 mask = cpumask_of_cpu(smp_processor_id());
67 load_LDT(pc);
68 if (!cpus_equal(current->mm->cpu_vm_mask, mask))
69 smp_call_function(flush_ldt, NULL, 1, 1);
70 preempt_enable();
71#else
72 load_LDT(pc);
73#endif
74 }
75 if (oldsize) {
76 if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
77 vfree(oldldt);
78 else
79 kfree(oldldt);
80 }
81 return 0;
82}
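/*
 * Worked example of the sizing above: the requested entry count is
 * rounded up to a multiple of 512, so mincount == 1 allocates 512
 * entries.  With LDT_ENTRY_SIZE == 8 that is 4096 bytes - exactly one
 * page, hence the kmalloc() path; 513 entries round up to 1024 (8KB)
 * and take the vmalloc() path instead.
 */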
83
84static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
85{
86 int err = alloc_ldt(new, old->size, 0);
87 if (err < 0)
88 return err;
89 memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
90 return 0;
91}
92
93/*
94 * we do not have to muck with descriptors here, that is
95 * done in switch_mm() as needed.
96 */
97int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
98{
99 struct mm_struct * old_mm;
100 int retval = 0;
101
102 init_MUTEX(&mm->context.sem);
103 mm->context.size = 0;
104 old_mm = current->mm;
105 if (old_mm && old_mm->context.size > 0) {
106 down(&old_mm->context.sem);
107 retval = copy_ldt(&mm->context, &old_mm->context);
108 up(&old_mm->context.sem);
109 }
110 return retval;
111}
112
113/*
114 * Don't touch the LDT register - we're already in
115 * the next thread.
116 */
117void destroy_context(struct mm_struct *mm)
118{
119 if (mm->context.size) {
120 if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
121 vfree(mm->context.ldt);
122 else
123 kfree(mm->context.ldt);
124 mm->context.size = 0;
125 }
126}
127
128static int read_ldt(void __user * ptr, unsigned long bytecount)
129{
130 int err;
131 unsigned long size;
132 struct mm_struct * mm = current->mm;
133
134 if (!mm->context.size)
135 return 0;
136 if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
137 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
138
139 down(&mm->context.sem);
140 size = mm->context.size*LDT_ENTRY_SIZE;
141 if (size > bytecount)
142 size = bytecount;
143
144 err = 0;
145 if (copy_to_user(ptr, mm->context.ldt, size))
146 err = -EFAULT;
147 up(&mm->context.sem);
148 if (err < 0)
149 goto error_return;
150 if (size != bytecount) {
151 /* zero-fill the rest */
152 if (clear_user(ptr+size, bytecount-size) != 0) {
153 err = -EFAULT;
154 goto error_return;
155 }
156 }
157 return bytecount;
158error_return:
159 return err;
160}
161
162static int read_default_ldt(void __user * ptr, unsigned long bytecount)
163{
164 /* Arbitrary number */
165 /* x86-64 default LDT is all zeros */
166 if (bytecount > 128)
167 bytecount = 128;
168 if (clear_user(ptr, bytecount))
169 return -EFAULT;
170 return bytecount;
171}
172
173static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
174{
175 struct task_struct *me = current;
176 struct mm_struct * mm = me->mm;
177 __u32 entry_1, entry_2, *lp;
178 int error;
179 struct user_desc ldt_info;
180
181 error = -EINVAL;
182
183 if (bytecount != sizeof(ldt_info))
184 goto out;
185 error = -EFAULT;
186 if (copy_from_user(&ldt_info, ptr, bytecount))
187 goto out;
188
189 error = -EINVAL;
190 if (ldt_info.entry_number >= LDT_ENTRIES)
191 goto out;
192 if (ldt_info.contents == 3) {
193 if (oldmode)
194 goto out;
195 if (ldt_info.seg_not_present == 0)
196 goto out;
197 }
198
199 down(&mm->context.sem);
200 if (ldt_info.entry_number >= (unsigned)mm->context.size) {
201 error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
202 if (error < 0)
203 goto out_unlock;
204 }
205
206 lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
207
208 /* Allow LDTs to be cleared by the user. */
209 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
210 if (oldmode || LDT_empty(&ldt_info)) {
211 entry_1 = 0;
212 entry_2 = 0;
213 goto install;
214 }
215 }
216
217 entry_1 = LDT_entry_a(&ldt_info);
218 entry_2 = LDT_entry_b(&ldt_info);
219 if (oldmode)
220 entry_2 &= ~(1 << 20);
221
222 /* Install the new entry ... */
223install:
224 *lp = entry_1;
225 *(lp+1) = entry_2;
226 error = 0;
227
228out_unlock:
229 up(&mm->context.sem);
230out:
231 return error;
232}
233
234asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
235{
236 int ret = -ENOSYS;
237
238 switch (func) {
239 case 0:
240 ret = read_ldt(ptr, bytecount);
241 break;
242 case 1:
243 ret = write_ldt(ptr, bytecount, 1);
244 break;
245 case 2:
246 ret = read_default_ldt(ptr, bytecount);
247 break;
248 case 0x11:
249 ret = write_ldt(ptr, bytecount, 0);
250 break;
251 }
252 return ret;
253}
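/*
 * Illustrative user-space sketch (not part of this file): driving
 * sys_modify_ldt() through the syscall interface.  func 0x11 reaches
 * write_ldt() in new mode, func 0 reads the table back.  Field names
 * follow the kernel's struct user_desc from asm/ldt.h.
 */
#if 0	/* example only */
#include <asm/ldt.h>
#include <sys/syscall.h>
#include <unistd.h>

static int install_data_segment(void)
{
	struct user_desc d = {
		.entry_number	= 0,
		.base_addr	= 0x1000,
		.limit		= 0xfff,
		.seg_32bit	= 1,
		.contents	= 0,	/* data segment */
		.useable	= 1,
	};

	/* modify_ldt(0x11, ...) -> write_ldt(ptr, bytecount, oldmode=0) */
	return syscall(SYS_modify_ldt, 0x11, &d, sizeof(d));
}
#endif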
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
new file mode 100644
index 000000000000..86f9fd85016a
--- /dev/null
+++ b/arch/x86_64/kernel/mce.c
@@ -0,0 +1,548 @@
1/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 */
7
8#include <linux/init.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/string.h>
13#include <linux/rcupdate.h>
14#include <linux/kallsyms.h>
15#include <linux/sysdev.h>
16#include <linux/miscdevice.h>
17#include <linux/fs.h>
18#include <asm/processor.h>
19#include <asm/msr.h>
20#include <asm/mce.h>
21#include <asm/kdebug.h>
22#include <asm/uaccess.h>
23
24#define MISC_MCELOG_MINOR 227
25#define NR_BANKS 5
26
27static int mce_dont_init;
28
29/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
30 3: never panic or exit (for testing only) */
31static int tolerant = 1;
32static int banks;
33static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
34static unsigned long console_logged;
35static int notify_user;
36
37/*
38 * Lockless MCE logging infrastructure.
39 * This avoids deadlocks on printk locks without having to break locks. Also
40 * separate MCEs from kernel messages to avoid bogus bug reports.
41 */
42
43struct mce_log mcelog = {
44 MCE_LOG_SIGNATURE,
45 MCE_LOG_LEN,
46};
47
48void mce_log(struct mce *mce)
49{
50 unsigned next, entry;
51 mce->finished = 0;
52 smp_wmb();
53 for (;;) {
54 entry = rcu_dereference(mcelog.next);
 55 /* When the buffer fills up, discard new entries. Assume
 56 that the earlier errors are the more interesting ones. */
57 if (entry >= MCE_LOG_LEN) {
58 set_bit(MCE_OVERFLOW, &mcelog.flags);
59 return;
60 }
61 /* Old left over entry. Skip. */
62 if (mcelog.entry[entry].finished)
63 continue;
64 smp_rmb();
65 next = entry + 1;
66 if (cmpxchg(&mcelog.next, entry, next) == entry)
67 break;
68 }
69 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
70 smp_wmb();
71 mcelog.entry[entry].finished = 1;
72 smp_wmb();
73
74 if (!test_and_set_bit(0, &console_logged))
75 notify_user = 1;
76}
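/*
 * Illustrative timeline for the reservation above: two CPUs that take
 * machine checks simultaneously both read mcelog.next == N; only one
 * cmpxchg(&mcelog.next, N, N+1) succeeds, and the loser re-reads next
 * and claims slot N+1.  Readers ignore a slot until its finished flag
 * is set after the memcpy(), so a half-written entry is never
 * consumed.
 */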
77
78static void print_mce(struct mce *m)
79{
80 printk(KERN_EMERG "\n"
81 KERN_EMERG
82 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
83 m->cpu, m->mcgstatus, m->bank, m->status);
84 if (m->rip) {
85 printk(KERN_EMERG
86 "RIP%s %02x:<%016Lx> ",
87 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
88 m->cs, m->rip);
89 if (m->cs == __KERNEL_CS)
90 print_symbol("{%s}", m->rip);
91 printk("\n");
92 }
93 printk(KERN_EMERG "TSC %Lx ", m->tsc);
94 if (m->addr)
95 printk("ADDR %Lx ", m->addr);
96 if (m->misc)
97 printk("MISC %Lx ", m->misc);
98 printk("\n");
99}
100
101static void mce_panic(char *msg, struct mce *backup, unsigned long start)
102{
103 int i;
104 oops_begin();
105 for (i = 0; i < MCE_LOG_LEN; i++) {
106 unsigned long tsc = mcelog.entry[i].tsc;
107 if (time_before(tsc, start))
108 continue;
109 print_mce(&mcelog.entry[i]);
110 if (backup && mcelog.entry[i].tsc == backup->tsc)
111 backup = NULL;
112 }
113 if (backup)
114 print_mce(backup);
115 if (tolerant >= 3)
116 printk("Fake panic: %s\n", msg);
117 else
118 panic(msg);
119}
120
121static int mce_available(struct cpuinfo_x86 *c)
122{
123 return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
124 test_bit(X86_FEATURE_MCA, &c->x86_capability);
125}
126
127/*
128 * The actual machine check handler
129 */
130
131void do_machine_check(struct pt_regs * regs, long error_code)
132{
133 struct mce m, panicm;
134 int nowayout = (tolerant < 1);
135 int kill_it = 0;
136 u64 mcestart = 0;
137 int i;
138 int panicm_found = 0;
139
140 if (regs)
141 notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
142 if (!banks)
143 return;
144
145 memset(&m, 0, sizeof(struct mce));
146 m.cpu = hard_smp_processor_id();
147 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
148 if (!(m.mcgstatus & MCG_STATUS_RIPV))
149 kill_it = 1;
150
151 rdtscll(mcestart);
152 barrier();
153
154 for (i = 0; i < banks; i++) {
155 if (!bank[i])
156 continue;
157
158 m.misc = 0;
159 m.addr = 0;
160 m.bank = i;
161 m.tsc = 0;
162
163 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
164 if ((m.status & MCI_STATUS_VAL) == 0)
165 continue;
166
167 if (m.status & MCI_STATUS_EN) {
168 /* In theory _OVER could be a nowayout too, but
169 assume any overflowed errors were not fatal. */
170 nowayout |= !!(m.status & MCI_STATUS_PCC);
171 kill_it |= !!(m.status & MCI_STATUS_UC);
172 }
173
174 if (m.status & MCI_STATUS_MISCV)
175 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
176 if (m.status & MCI_STATUS_ADDRV)
177 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
178
179 if (regs && (m.mcgstatus & MCG_STATUS_RIPV)) {
180 m.rip = regs->rip;
181 m.cs = regs->cs;
182 } else {
183 m.rip = 0;
184 m.cs = 0;
185 }
186
187 if (error_code != -1)
188 rdtscll(m.tsc);
189 wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
190 mce_log(&m);
191
192 /* Did this bank cause the exception? */
193 /* Assume that the bank with uncorrectable errors did it,
194 and that there is only a single one. */
195 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
196 panicm = m;
197 panicm_found = 1;
198 }
199
200 tainted |= TAINT_MACHINE_CHECK;
201 }
202
203 /* Never do anything final in the polling timer */
204 if (!regs)
205 goto out;
206
207 /* If we didn't find an uncorrectable error, pick
208 the last one (shouldn't happen, just being safe). */
209 if (!panicm_found)
210 panicm = m;
211 if (nowayout)
212 mce_panic("Machine check", &panicm, mcestart);
213 if (kill_it) {
214 int user_space = 0;
215
216 if (m.mcgstatus & MCG_STATUS_RIPV)
217 user_space = panicm.rip && (panicm.cs & 3);
218
219 /* When the machine was in user space and the CPU didn't get
220 confused it's normally not necessary to panic, unless you
221 are paranoid (tolerant == 0)
222
223 RED-PEN could be more tolerant for MCEs in idle,
224 but most likely they occur at boot anyways, where
225 it is best to just halt the machine. */
226 if ((!user_space && (panic_on_oops || tolerant < 2)) ||
227 (unsigned)current->pid <= 1)
228 mce_panic("Uncorrected machine check", &panicm, mcestart);
229
230 /* do_exit takes an awful lot of locks and has a
231 slight risk of deadlocking. If you don't want that,
232 don't set tolerant >= 2. */
233 if (tolerant < 3)
234 do_exit(SIGBUS);
235 }
236
237 out:
238 /* Last thing done in the machine check exception to clear state. */
239 wrmsrl(MSR_IA32_MCG_STATUS, 0);
240}
241
242/*
243 * Periodic polling timer for "silent" machine check errors.
244 */
245
246static int check_interval = 5 * 60; /* 5 minutes */
247static void mcheck_timer(void *data);
248static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
249
250static void mcheck_check_cpu(void *info)
251{
252 if (mce_available(&current_cpu_data))
253 do_machine_check(NULL, 0);
254}
255
256static void mcheck_timer(void *data)
257{
258 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
259 schedule_delayed_work(&mcheck_work, check_interval * HZ);
260
261 /*
262 * It's ok to read stale data here for notify_user and
263 * console_logged as we'll simply get the updated versions
264 * on the next mcheck_timer execution and atomic operations
265 * on console_logged act as synchronization for notify_user
266 * writes.
267 */
268 if (notify_user && console_logged) {
269 notify_user = 0;
270 clear_bit(0, &console_logged);
271 printk(KERN_INFO "Machine check events logged\n");
272 }
273}
274
275
276static __init int periodic_mcheck_init(void)
277{
278 if (check_interval)
279 schedule_delayed_work(&mcheck_work, check_interval*HZ);
280 return 0;
281}
282__initcall(periodic_mcheck_init);
283
284
285/*
286 * Initialize Machine Checks for a CPU.
287 */
288static void mce_init(void *dummy)
289{
290 u64 cap;
291 int i;
292
293 rdmsrl(MSR_IA32_MCG_CAP, cap);
294 banks = cap & 0xff;
295 if (banks > NR_BANKS) {
296 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
297 banks = NR_BANKS;
298 }
299
300 /* Log the machine checks left over from the previous reset.
301 This also clears all registers */
302 do_machine_check(NULL, -1);
303
304 set_in_cr4(X86_CR4_MCE);
305
306 if (cap & MCG_CTL_P)
307 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
308
309 for (i = 0; i < banks; i++) {
310 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
311 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
312 }
313}
314
315/* Add per CPU specific workarounds here */
316static void __init mce_cpu_quirks(struct cpuinfo_x86 *c)
317{
318 /* This should be disabled by the BIOS, but isn't always */
319 if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
320 /* disable GART TBL walk error reporting, which trips off
321 incorrectly with the IOMMU & 3ware & Cerberus. */
322 clear_bit(10, &bank[4]);
323 }
324}
325
326static void __init mce_cpu_features(struct cpuinfo_x86 *c)
327{
328 switch (c->x86_vendor) {
329 case X86_VENDOR_INTEL:
330 mce_intel_feature_init(c);
331 break;
332 default:
333 break;
334 }
335}
336
337/*
338 * Called for each booted CPU to set up machine checks.
339 * Must be called with preempt off.
340 */
341void __init mcheck_init(struct cpuinfo_x86 *c)
342{
343 static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;
344
345 mce_cpu_quirks(c);
346
347 if (mce_dont_init ||
348 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
349 !mce_available(c))
350 return;
351
352 mce_init(NULL);
353 mce_cpu_features(c);
354}
355
356/*
357 * Character device to read and clear the MCE log.
358 */
359
360static void collect_tscs(void *data)
361{
362 unsigned long *cpu_tsc = (unsigned long *)data;
363 rdtscll(cpu_tsc[smp_processor_id()]);
364}
365
366static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
367{
368 unsigned long cpu_tsc[NR_CPUS];
369 static DECLARE_MUTEX(mce_read_sem);
370 unsigned next;
371 char __user *buf = ubuf;
372 int i, err;
373
374 down(&mce_read_sem);
375 next = rcu_dereference(mcelog.next);
376
377 /* Only supports full reads right now */
378 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
379 up(&mce_read_sem);
380 return -EINVAL;
381 }
382
383 err = 0;
384 for (i = 0; i < next; i++) {
385 if (!mcelog.entry[i].finished)
386 continue;
387 smp_rmb();
388 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
389 buf += sizeof(struct mce);
390 }
391
392 memset(mcelog.entry, 0, next * sizeof(struct mce));
393 mcelog.next = 0;
394
395 synchronize_kernel();
396
397 /* Collect entries that were still getting written before the synchronize. */
398
399 on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
400 for (i = next; i < MCE_LOG_LEN; i++) {
401 if (mcelog.entry[i].finished &&
402 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
403 err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
404 smp_rmb();
405 buf += sizeof(struct mce);
406 memset(&mcelog.entry[i], 0, sizeof(struct mce));
407 }
408 }
409 up(&mce_read_sem);
410 return err ? -EFAULT : buf - ubuf;
411}
412
413static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
414{
415 int __user *p = (int __user *)arg;
416 if (!capable(CAP_SYS_ADMIN))
417 return -EPERM;
418 switch (cmd) {
419 case MCE_GET_RECORD_LEN:
420 return put_user(sizeof(struct mce), p);
421 case MCE_GET_LOG_LEN:
422 return put_user(MCE_LOG_LEN, p);
423 case MCE_GETCLEAR_FLAGS: {
424 unsigned flags;
425 do {
426 flags = mcelog.flags;
427 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
428 return put_user(flags, p);
429 }
430 default:
431 return -ENOTTY;
432 }
433}
434
435static struct file_operations mce_chrdev_ops = {
436 .read = mce_read,
437 .ioctl = mce_ioctl,
438};
439
440static struct miscdevice mce_log_device = {
441 MISC_MCELOG_MINOR,
442 "mcelog",
443 &mce_chrdev_ops,
444};
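/*
 * Illustrative user-space sketch (not part of this file): draining
 * /dev/mcelog.  mce_read() rejects partial reads, so the buffer must
 * cover all MCE_LOG_LEN records at once; the sizes are queried via
 * the ioctls above rather than hardcoded (this is roughly what the
 * mcelog utility does).  The MCE_GET_* constants come from the
 * kernel's asm/mce.h.
 */
#if 0	/* example only */
#include <asm/mce.h>
#include <fcntl.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

static void drain_mcelog(void)
{
	int fd = open("/dev/mcelog", O_RDONLY);
	int rlen = 0, llen = 0;
	char *buf;

	if (fd < 0)
		return;
	ioctl(fd, MCE_GET_RECORD_LEN, &rlen);
	ioctl(fd, MCE_GET_LOG_LEN, &llen);
	buf = malloc(rlen * llen);
	if (buf)
		read(fd, buf, rlen * llen);	/* full read only */
	free(buf);
	close(fd);
}
#endif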
445
446/*
447 * Old style boot options parsing. Only for compatibility.
448 */
449
450static int __init mcheck_disable(char *str)
451{
452 mce_dont_init = 1;
453 return 0;
454}
455
456/* mce=off disables machine check. Note you can reenable it later
457 using sysfs */
458static int __init mcheck_enable(char *str)
459{
460 if (!strcmp(str, "off"))
461 mce_dont_init = 1;
462 else
463 printk("mce= argument %s ignored. Please use /sys", str);
464 return 0;
465}
466
467__setup("nomce", mcheck_disable);
468__setup("mce", mcheck_enable);
469
470/*
471 * Sysfs support
472 */
473
474/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */
475static int mce_resume(struct sys_device *dev)
476{
477 on_each_cpu(mce_init, NULL, 1, 1);
478 return 0;
479}
480
481/* Reinit MCEs after user configuration changes */
482static void mce_restart(void)
483{
484 if (check_interval)
485 cancel_delayed_work(&mcheck_work);
486 /* Timer race is harmless here */
487 on_each_cpu(mce_init, NULL, 1, 1);
488 if (check_interval)
489 schedule_delayed_work(&mcheck_work, check_interval*HZ);
490}
491
492static struct sysdev_class mce_sysclass = {
493 .resume = mce_resume,
494 set_kset_name("machinecheck"),
495};
496
497static struct sys_device device_mce = {
498 .id = 0,
499 .cls = &mce_sysclass,
500};
501
502/* Why are there no generic functions for this? */
503#define ACCESSOR(name, var, start) \
504 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
505 return sprintf(buf, "%lx\n", (unsigned long)var); \
506 } \
507 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
508 char *end; \
509 unsigned long new = simple_strtoul(buf, &end, 0); \
510 if (end == buf) return -EINVAL; \
511 var = new; \
512 start; \
513 return end-buf; \
514 } \
515 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
516
517ACCESSOR(bank0ctl,bank[0],mce_restart())
518ACCESSOR(bank1ctl,bank[1],mce_restart())
519ACCESSOR(bank2ctl,bank[2],mce_restart())
520ACCESSOR(bank3ctl,bank[3],mce_restart())
521ACCESSOR(bank4ctl,bank[4],mce_restart())
522ACCESSOR(tolerant,tolerant,)
523ACCESSOR(check_interval,check_interval,mce_restart())
524
525static __init int mce_init_device(void)
526{
527 int err;
528 if (!mce_available(&boot_cpu_data))
529 return -EIO;
530 err = sysdev_class_register(&mce_sysclass);
531 if (!err)
532 err = sysdev_register(&device_mce);
533 if (!err) {
534 /* could create per CPU objects, but it is not worth it. */
535 sysdev_create_file(&device_mce, &attr_bank0ctl);
536 sysdev_create_file(&device_mce, &attr_bank1ctl);
537 sysdev_create_file(&device_mce, &attr_bank2ctl);
538 sysdev_create_file(&device_mce, &attr_bank3ctl);
539 sysdev_create_file(&device_mce, &attr_bank4ctl);
540 sysdev_create_file(&device_mce, &attr_tolerant);
541 sysdev_create_file(&device_mce, &attr_check_interval);
542 }
543
544 misc_register(&mce_log_device);
545 return err;
546
547}
548device_initcall(mce_init_device);
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
new file mode 100644
index 000000000000..4db9a640069f
--- /dev/null
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -0,0 +1,99 @@
1/*
2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 */
5
6#include <linux/init.h>
7#include <linux/interrupt.h>
8#include <linux/percpu.h>
9#include <asm/processor.h>
10#include <asm/msr.h>
11#include <asm/mce.h>
12#include <asm/hw_irq.h>
13
14static DEFINE_PER_CPU(unsigned long, next_check);
15
16asmlinkage void smp_thermal_interrupt(void)
17{
18 struct mce m;
19
20 ack_APIC_irq();
21
22 irq_enter();
23 if (time_before(jiffies, __get_cpu_var(next_check)))
24 goto done;
25
26 __get_cpu_var(next_check) = jiffies + HZ*300;
27 memset(&m, 0, sizeof(m));
28 m.cpu = smp_processor_id();
29 m.bank = MCE_THERMAL_BANK;
30 rdtscll(m.tsc);
31 rdmsrl(MSR_IA32_THERM_STATUS, m.status);
32 if (m.status & 0x1) {
33 printk(KERN_EMERG
34 "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu);
35 add_taint(TAINT_MACHINE_CHECK);
36 } else {
37 printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu);
38 }
39
40 mce_log(&m);
41done:
42 irq_exit();
43}
44
45static void __init intel_init_thermal(struct cpuinfo_x86 *c)
46{
47 u32 l, h;
48 int tm2 = 0;
49 unsigned int cpu = smp_processor_id();
50
51 if (!cpu_has(c, X86_FEATURE_ACPI))
52 return;
53
54 if (!cpu_has(c, X86_FEATURE_ACC))
55 return;
56
57 /* first check if TM1 is already enabled by the BIOS, in which
58 * case there might be some SMM goo which handles it, so we can't even
59 * put a handler since it might be delivered via SMI already.
60 */
61 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
62 h = apic_read(APIC_LVTTHMR);
63 if ((l & (1 << 3)) && (h & APIC_DM_SMI)) {
64 printk(KERN_DEBUG
65 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
66 return;
67 }
68
69 if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
70 tm2 = 1;
71
72 if (h & APIC_VECTOR_MASK) {
73 printk(KERN_DEBUG
74 "CPU%d: Thermal LVT vector (%#x) already "
75 "installed\n", cpu, (h & APIC_VECTOR_MASK));
76 return;
77 }
78
79 h = THERMAL_APIC_VECTOR;
80 h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
81 apic_write_around(APIC_LVTTHMR, h);
82
83 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
84 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
85
86 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
87 wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h);
88
89 l = apic_read(APIC_LVTTHMR);
90 apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
91 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
92 cpu, tm2 ? "TM2" : "TM1");
93 return;
94}
95
96void __init mce_intel_feature_init(struct cpuinfo_x86 *c)
97{
98 intel_init_thermal(c);
99}
diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c
new file mode 100644
index 000000000000..c2ffea8845ed
--- /dev/null
+++ b/arch/x86_64/kernel/module.c
@@ -0,0 +1,166 @@
1/* Kernel module help for x86-64
2 Copyright (C) 2001 Rusty Russell.
3 Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/
19#include <linux/moduleloader.h>
20#include <linux/elf.h>
21#include <linux/vmalloc.h>
22#include <linux/fs.h>
23#include <linux/string.h>
24#include <linux/kernel.h>
25#include <linux/slab.h>
26
27#include <asm/system.h>
28#include <asm/page.h>
29#include <asm/pgtable.h>
30
31#define DEBUGP(fmt...)
32
33void module_free(struct module *mod, void *module_region)
34{
35 vfree(module_region);
36}
37
38void *module_alloc(unsigned long size)
39{
40 struct vm_struct *area;
41
42 if (!size)
43 return NULL;
44 size = PAGE_ALIGN(size);
45 if (size > MODULES_LEN)
46 return NULL;
47
48 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
49 if (!area)
50 return NULL;
51
52 return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
53}
54
55/* We don't need anything special. */
56int module_frob_arch_sections(Elf_Ehdr *hdr,
57 Elf_Shdr *sechdrs,
58 char *secstrings,
59 struct module *mod)
60{
61 return 0;
62}
63
64int apply_relocate_add(Elf64_Shdr *sechdrs,
65 const char *strtab,
66 unsigned int symindex,
67 unsigned int relsec,
68 struct module *me)
69{
70 unsigned int i;
71 Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
72 Elf64_Sym *sym;
73 void *loc;
74 u64 val;
75
76 DEBUGP("Applying relocate section %u to %u\n", relsec,
77 sechdrs[relsec].sh_info);
78 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
79 /* This is where to make the change */
80 loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
81 + rel[i].r_offset;
82
83 /* This is the symbol it is referring to. Note that all
84 undefined symbols have been resolved. */
85 sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
86 + ELF64_R_SYM(rel[i].r_info);
87
88 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
89 (int)ELF64_R_TYPE(rel[i].r_info),
90 sym->st_value, rel[i].r_addend, (u64)loc);
91
92 val = sym->st_value + rel[i].r_addend;
93
94 switch (ELF64_R_TYPE(rel[i].r_info)) {
95 case R_X86_64_NONE:
96 break;
97 case R_X86_64_64:
98 *(u64 *)loc = val;
99 break;
100 case R_X86_64_32:
101 *(u32 *)loc = val;
102 if (val != *(u32 *)loc)
103 goto overflow;
104 break;
105 case R_X86_64_32S:
106 *(s32 *)loc = val;
107 if ((s64)val != *(s32 *)loc)
108 goto overflow;
109 break;
110 case R_X86_64_PC32:
111 val -= (u64)loc;
112 *(u32 *)loc = val;
113#if 0
114 if ((s64)val != *(s32 *)loc)
115 goto overflow;
116#endif
117 break;
118 default:
119 printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n",
120 me->name, ELF64_R_TYPE(rel[i].r_info));
121 return -ENOEXEC;
122 }
123 }
124 return 0;
125
126overflow:
127 printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
128 (int)ELF64_R_TYPE(rel[i].r_info), val);
129 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
130 me->name);
131 return -ENOEXEC;
132}
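/*
 * Worked example for R_X86_64_PC32 above (illustrative numbers): for
 * a call to a symbol at st_value = 0xffffffff80200000 with the usual
 * addend of -4, patched at loc = 0xffffffffa0000010,
 *
 *	val = 0xffffffff80200000 - 4 - 0xffffffffa0000010
 *	    = -0x1fe00014
 *
 * which fits in the signed 32 bits a call instruction provides.  The
 * #if 0'd overflow check would catch targets more than 2GB away,
 * i.e. modules not built with -mcmodel=kernel.
 */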
133
134int apply_relocate(Elf_Shdr *sechdrs,
135 const char *strtab,
136 unsigned int symindex,
137 unsigned int relsec,
138 struct module *me)
139{
140 printk("non add relocation not supported\n");
141 return -ENOSYS;
142}
143
144extern void apply_alternatives(void *start, void *end);
145
146int module_finalize(const Elf_Ehdr *hdr,
147 const Elf_Shdr *sechdrs,
148 struct module *me)
149{
150 const Elf_Shdr *s;
151 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
152
153 /* look for .altinstructions to patch */
154 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
155 void *seg;
156 if (strcmp(".altinstructions", secstrings + s->sh_name))
157 continue;
158 seg = (void *)s->sh_addr;
159 apply_alternatives(seg, seg + s->sh_size);
160 }
161 return 0;
162}
163
164void module_arch_cleanup(struct module *mod)
165{
166}
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
new file mode 100644
index 000000000000..7ec031c6ca10
--- /dev/null
+++ b/arch/x86_64/kernel/mpparse.c
@@ -0,0 +1,949 @@
1/*
2 * Intel Multiprocessor Specification 1.1 and 1.4
3 * compliant MP-table parsing routines.
4 *
5 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes
9 * Erich Boleyn : MP v1.4 and additional changes.
10 * Alan Cox : Added EBDA scanning
11 * Ingo Molnar : various cleanups and rewrites
12 * Maciej W. Rozycki: Bits for default MP configurations
13 * Paul Diefenbaugh: Added full ACPI support
14 */
15
16#include <linux/mm.h>
17#include <linux/irq.h>
18#include <linux/init.h>
19#include <linux/delay.h>
20#include <linux/config.h>
21#include <linux/bootmem.h>
22#include <linux/smp_lock.h>
23#include <linux/kernel_stat.h>
24#include <linux/mc146818rtc.h>
25#include <linux/acpi.h>
26
27#include <asm/smp.h>
28#include <asm/mtrr.h>
29#include <asm/mpspec.h>
30#include <asm/pgalloc.h>
31#include <asm/io_apic.h>
32#include <asm/proto.h>
33
34/* Have we found an MP table */
35int smp_found_config;
36unsigned int __initdata maxcpus = NR_CPUS;
37
38int acpi_found_madt;
39
40/*
41 * Various Linux-internal data structures created from the
42 * MP-table.
43 */
44int apic_version [MAX_APICS];
45unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
46int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
47cpumask_t pci_bus_to_cpumask [256] = { [0 ... 255] = CPU_MASK_ALL };
48
49static int mp_current_pci_id = 0;
50/* I/O APIC entries */
51struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
52
53/* # of MP IRQ source entries */
54struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
55
56/* MP IRQ source entries */
57int mp_irq_entries;
58
59int nr_ioapics;
60int pic_mode;
61unsigned long mp_lapic_addr = 0;
62
63
64
65/* Processor that is doing the boot up */
66unsigned int boot_cpu_id = -1U;
67/* Internal processor count */
68static unsigned int num_processors = 0;
69
70/* Bitmask of physically existing CPUs */
71physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
72
73/* ACPI MADT entry parsing functions */
74#ifdef CONFIG_ACPI_BOOT
75extern struct acpi_boot_flags acpi_boot;
76#ifdef CONFIG_X86_LOCAL_APIC
77extern int acpi_parse_lapic (acpi_table_entry_header *header);
78extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
79extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
80#endif /*CONFIG_X86_LOCAL_APIC*/
81#ifdef CONFIG_X86_IO_APIC
82extern int acpi_parse_ioapic (acpi_table_entry_header *header);
83#endif /*CONFIG_X86_IO_APIC*/
84#endif /*CONFIG_ACPI_BOOT*/
85
86u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
87
88
89/*
90 * Intel MP BIOS table parsing routines:
91 */
92
93/*
94 * Checksum an MP configuration block.
95 */
96
97static int __init mpf_checksum(unsigned char *mp, int len)
98{
99 int sum = 0;
100
101 while (len--)
102 sum += *mp++;
103
104 return sum & 0xFF;
105}
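/*
 * Note: the MP specification lays the table out so that all of its
 * bytes, including the stored checksum byte itself, sum to 0 modulo
 * 256.  A return value of 0 here therefore means "table intact".
 */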
106
107static void __init MP_processor_info (struct mpc_config_processor *m)
108{
109 int ver;
110
111 if (!(m->mpc_cpuflag & CPU_ENABLED))
112 return;
113
114 printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
115 m->mpc_apicid,
116 (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
117 (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
118 m->mpc_apicver);
119
120 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
121 Dprintk(" Bootup CPU\n");
122 boot_cpu_id = m->mpc_apicid;
123 }
124 if (num_processors >= NR_CPUS) {
125 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
126 " Processor ignored.\n", NR_CPUS);
127 return;
128 }
129 if (num_processors >= maxcpus) {
130 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
131 " Processor ignored.\n", maxcpus);
132 return;
133 }
134
135 num_processors++;
136
137 if (m->mpc_apicid > MAX_APICS) {
138 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
139 m->mpc_apicid, MAX_APICS);
140 return;
141 }
142 ver = m->mpc_apicver;
143
144 physid_set(m->mpc_apicid, phys_cpu_present_map);
145 /*
146 * Validate version
147 */
148 if (ver == 0x0) {
149 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
150 ver = 0x10;
151 }
152 apic_version[m->mpc_apicid] = ver;
153 bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
154}
155
156static void __init MP_bus_info (struct mpc_config_bus *m)
157{
158 char str[7];
159
160 memcpy(str, m->mpc_bustype, 6);
161 str[6] = 0;
162 Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
163
164 if (strncmp(str, "ISA", 3) == 0) {
165 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
166 } else if (strncmp(str, "EISA", 4) == 0) {
167 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
168 } else if (strncmp(str, "PCI", 3) == 0) {
169 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
170 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
171 mp_current_pci_id++;
172 } else if (strncmp(str, "MCA", 3) == 0) {
173 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
174 } else {
175 printk(KERN_ERR "Unknown bustype %s\n", str);
176 }
177}
178
179static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
180{
181 if (!(m->mpc_flags & MPC_APIC_USABLE))
182 return;
183
184 printk("I/O APIC #%d Version %d at 0x%X.\n",
185 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
186 if (nr_ioapics >= MAX_IO_APICS) {
187 printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
188 MAX_IO_APICS, nr_ioapics);
189 panic("Recompile kernel with a bigger MAX_IO_APICS!\n");
190 }
191 if (!m->mpc_apicaddr) {
192 printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
193 " found in MP table, skipping!\n");
194 return;
195 }
196 mp_ioapics[nr_ioapics] = *m;
197 nr_ioapics++;
198}
199
200static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
201{
202 mp_irqs [mp_irq_entries] = *m;
203 Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
204 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
205 m->mpc_irqtype, m->mpc_irqflag & 3,
206 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
207 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
208 if (++mp_irq_entries == MAX_IRQ_SOURCES)
209 panic("Max # of irq sources exceeded!!\n");
210}
211
212static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
213{
214 Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
215 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
216 m->mpc_irqtype, m->mpc_irqflag & 3,
217 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
218 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
219 /*
220 * Well it seems all SMP boards in existence
221 * use ExtINT/LVT1 == LINT0 and
222 * NMI/LVT2 == LINT1 - the following check
223 * will show us if this assumption is false.
224 * Until then we do not have to add baggage.
225 */
226 if ((m->mpc_irqtype == mp_ExtINT) &&
227 (m->mpc_destapiclint != 0))
228 BUG();
229 if ((m->mpc_irqtype == mp_NMI) &&
230 (m->mpc_destapiclint != 1))
231 BUG();
232}
233
234/*
235 * Read/parse the MPC
236 */
237
238static int __init smp_read_mpc(struct mp_config_table *mpc)
239{
240 char str[16];
241 int count=sizeof(*mpc);
242 unsigned char *mpt=((unsigned char *)mpc)+count;
243
244 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
245 printk("SMP mptable: bad signature [%c%c%c%c]!\n",
246 mpc->mpc_signature[0],
247 mpc->mpc_signature[1],
248 mpc->mpc_signature[2],
249 mpc->mpc_signature[3]);
250 return 0;
251 }
252 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
253 printk("SMP mptable: checksum error!\n");
254 return 0;
255 }
256 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
257 printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
258 mpc->mpc_spec);
259 return 0;
260 }
261 if (!mpc->mpc_lapic) {
262 printk(KERN_ERR "SMP mptable: null local APIC address!\n");
263 return 0;
264 }
265 memcpy(str,mpc->mpc_oem,8);
266 str[8]=0;
267 printk(KERN_INFO "OEM ID: %s ",str);
268
269 memcpy(str,mpc->mpc_productid,12);
270 str[12]=0;
271 printk(KERN_INFO "Product ID: %s ",str);
272
273 printk(KERN_INFO "APIC at: 0x%X\n",mpc->mpc_lapic);
274
275 /* save the local APIC address, it might be non-default */
276 if (!acpi_lapic)
277 mp_lapic_addr = mpc->mpc_lapic;
278
279 /*
280 * Now process the configuration blocks.
281 */
282 while (count < mpc->mpc_length) {
283 switch(*mpt) {
284 case MP_PROCESSOR:
285 {
286 struct mpc_config_processor *m=
287 (struct mpc_config_processor *)mpt;
288 if (!acpi_lapic)
289 MP_processor_info(m);
290 mpt += sizeof(*m);
291 count += sizeof(*m);
292 break;
293 }
294 case MP_BUS:
295 {
296 struct mpc_config_bus *m=
297 (struct mpc_config_bus *)mpt;
298 MP_bus_info(m);
299 mpt += sizeof(*m);
300 count += sizeof(*m);
301 break;
302 }
303 case MP_IOAPIC:
304 {
305 struct mpc_config_ioapic *m=
306 (struct mpc_config_ioapic *)mpt;
307 MP_ioapic_info(m);
308 mpt+=sizeof(*m);
309 count+=sizeof(*m);
310 break;
311 }
312 case MP_INTSRC:
313 {
314 struct mpc_config_intsrc *m=
315 (struct mpc_config_intsrc *)mpt;
316
317 MP_intsrc_info(m);
318 mpt+=sizeof(*m);
319 count+=sizeof(*m);
320 break;
321 }
322 case MP_LINTSRC:
323 {
324 struct mpc_config_lintsrc *m=
325 (struct mpc_config_lintsrc *)mpt;
326 MP_lintsrc_info(m);
327 mpt+=sizeof(*m);
328 count+=sizeof(*m);
329 break;
330 }
331 }
332 }
333 clustered_apic_check();
334 if (!num_processors)
335 printk(KERN_ERR "SMP mptable: no processors registered!\n");
336 return num_processors;
337}
338
339static int __init ELCR_trigger(unsigned int irq)
340{
341 unsigned int port;
342
343 port = 0x4d0 + (irq >> 3);
344 return (inb(port) >> (irq & 7)) & 1;
345}
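/*
 * Worked example: the ELCR is a pair of I/O ports, 0x4d0 for IRQ 0-7
 * and 0x4d1 for IRQ 8-15, one bit per IRQ.  For irq == 9 this reads
 * port 0x4d0 + (9 >> 3) == 0x4d1 and tests bit 9 & 7 == 1; a set bit
 * means the line is level-triggered.
 */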
346
347static void __init construct_default_ioirq_mptable(int mpc_default_type)
348{
349 struct mpc_config_intsrc intsrc;
350 int i;
351 int ELCR_fallback = 0;
352
353 intsrc.mpc_type = MP_INTSRC;
354 intsrc.mpc_irqflag = 0; /* conforming */
355 intsrc.mpc_srcbus = 0;
356 intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
357
358 intsrc.mpc_irqtype = mp_INT;
359
360 /*
361 * If true, we have an ISA/PCI system with no IRQ entries
362 * in the MP table. To prevent the PCI interrupts from being set up
363 * incorrectly, we try to use the ELCR. The sanity check to see if
364 * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
365 * never be level sensitive, so we simply see if the ELCR agrees.
366 * If it does, we assume it's valid.
367 */
368 if (mpc_default_type == 5) {
369 printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
370
371 if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
372 printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
373 else {
374 printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
375 ELCR_fallback = 1;
376 }
377 }
378
379 for (i = 0; i < 16; i++) {
380 switch (mpc_default_type) {
381 case 2:
382 if (i == 0 || i == 13)
383 continue; /* IRQ0 & IRQ13 not connected */
384 /* fall through */
385 default:
386 if (i == 2)
387 continue; /* IRQ2 is never connected */
388 }
389
390 if (ELCR_fallback) {
391 /*
392 * If the ELCR indicates a level-sensitive interrupt, we
393 * copy that information over to the MP table in the
394 * irqflag field (level sensitive, active high polarity).
395 */
396 if (ELCR_trigger(i))
397 intsrc.mpc_irqflag = 13;
398 else
399 intsrc.mpc_irqflag = 0;
400 }
401
402 intsrc.mpc_srcbusirq = i;
403 intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
404 MP_intsrc_info(&intsrc);
405 }
406
407 intsrc.mpc_irqtype = mp_ExtINT;
408 intsrc.mpc_srcbusirq = 0;
409 intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
410 MP_intsrc_info(&intsrc);
411}
412
413static inline void __init construct_default_ISA_mptable(int mpc_default_type)
414{
415 struct mpc_config_processor processor;
416 struct mpc_config_bus bus;
417 struct mpc_config_ioapic ioapic;
418 struct mpc_config_lintsrc lintsrc;
419 int linttypes[2] = { mp_ExtINT, mp_NMI };
420 int i;
421
422 /*
423 * local APIC has default address
424 */
425 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
426
427 /*
428 * 2 CPUs, numbered 0 & 1.
429 */
430 processor.mpc_type = MP_PROCESSOR;
431 /* Either an integrated APIC or a discrete 82489DX. */
432 processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
433 processor.mpc_cpuflag = CPU_ENABLED;
434 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
435 (boot_cpu_data.x86_model << 4) |
436 boot_cpu_data.x86_mask;
437 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
438 processor.mpc_reserved[0] = 0;
439 processor.mpc_reserved[1] = 0;
440 for (i = 0; i < 2; i++) {
441 processor.mpc_apicid = i;
442 MP_processor_info(&processor);
443 }
444
445 bus.mpc_type = MP_BUS;
446 bus.mpc_busid = 0;
447 switch (mpc_default_type) {
448 default:
449 printk(KERN_ERR "???\nUnknown standard configuration %d\n",
450 mpc_default_type);
451 /* fall through */
452 case 1:
453 case 5:
454 memcpy(bus.mpc_bustype, "ISA ", 6);
455 break;
456 case 2:
457 case 6:
458 case 3:
459 memcpy(bus.mpc_bustype, "EISA ", 6);
460 break;
461 case 4:
462 case 7:
463 memcpy(bus.mpc_bustype, "MCA ", 6);
464 }
465 MP_bus_info(&bus);
466 if (mpc_default_type > 4) {
467 bus.mpc_busid = 1;
468 memcpy(bus.mpc_bustype, "PCI ", 6);
469 MP_bus_info(&bus);
470 }
471
472 ioapic.mpc_type = MP_IOAPIC;
473 ioapic.mpc_apicid = 2;
474 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
475 ioapic.mpc_flags = MPC_APIC_USABLE;
476 ioapic.mpc_apicaddr = 0xFEC00000;
477 MP_ioapic_info(&ioapic);
478
479 /*
480 * We set up most of the low 16 IO-APIC pins according to MPS rules.
481 */
482 construct_default_ioirq_mptable(mpc_default_type);
483
484 lintsrc.mpc_type = MP_LINTSRC;
485 lintsrc.mpc_irqflag = 0; /* conforming */
486 lintsrc.mpc_srcbusid = 0;
487 lintsrc.mpc_srcbusirq = 0;
488 lintsrc.mpc_destapic = MP_APIC_ALL;
489 for (i = 0; i < 2; i++) {
490 lintsrc.mpc_irqtype = linttypes[i];
491 lintsrc.mpc_destapiclint = i;
492 MP_lintsrc_info(&lintsrc);
493 }
494}
495
496static struct intel_mp_floating *mpf_found;
497
498/*
499 * Scan the memory blocks for an SMP configuration block.
500 */
501void __init get_smp_config (void)
502{
503 struct intel_mp_floating *mpf = mpf_found;
504
505 /*
506 * ACPI may be used to obtain the entire SMP configuration or just to
507 * enumerate/configure processors (CONFIG_ACPI_BOOT). Note that
508 * ACPI supports both logical (e.g. Hyper-Threading) and physical
509 * processors, where MPS only supports physical.
510 */
511 if (acpi_lapic && acpi_ioapic) {
512 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
513 return;
514 }
515 else if (acpi_lapic)
516 printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
517
518 printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
519 if (mpf->mpf_feature2 & (1<<7)) {
520 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
521 pic_mode = 1;
522 } else {
523 printk(KERN_INFO " Virtual Wire compatibility mode.\n");
524 pic_mode = 0;
525 }
526
527 /*
528 * Now see if we need to read further.
529 */
530 if (mpf->mpf_feature1 != 0) {
531
532 printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
533 construct_default_ISA_mptable(mpf->mpf_feature1);
534
535 } else if (mpf->mpf_physptr) {
536
537 /*
538 * Read the physical hardware table. Anything here will
539 * override the defaults.
540 */
541 if (!smp_read_mpc((void *)(unsigned long)mpf->mpf_physptr)) {
542 smp_found_config = 0;
543 printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
544 printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
545 return;
546 }
547 /*
548 * If there are no explicit MP IRQ entries, then we are
549 * broken. We set up most of the low 16 IO-APIC pins to
550 * ISA defaults and hope it will work.
551 */
552 if (!mp_irq_entries) {
553 struct mpc_config_bus bus;
554
555 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
556
557 bus.mpc_type = MP_BUS;
558 bus.mpc_busid = 0;
559 memcpy(bus.mpc_bustype, "ISA ", 6);
560 MP_bus_info(&bus);
561
562 construct_default_ioirq_mptable(0);
563 }
564
565 } else
566 BUG();
567
568 printk(KERN_INFO "Processors: %d\n", num_processors);
569 /*
570 * Only use the first configuration found.
571 */
572}
573
574static int __init smp_scan_config (unsigned long base, unsigned long length)
575{
576 extern void __bad_mpf_size(void);
577 unsigned int *bp = phys_to_virt(base);
578 struct intel_mp_floating *mpf;
579
580 Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
581 if (sizeof(*mpf) != 16)
582 __bad_mpf_size();
583
584 while (length > 0) {
585 mpf = (struct intel_mp_floating *)bp;
586 if ((*bp == SMP_MAGIC_IDENT) &&
587 (mpf->mpf_length == 1) &&
588 !mpf_checksum((unsigned char *)bp, 16) &&
589 ((mpf->mpf_specification == 1)
590 || (mpf->mpf_specification == 4)) ) {
591
592 smp_found_config = 1;
593 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
594 if (mpf->mpf_physptr)
595 reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE);
596 mpf_found = mpf;
597 return 1;
598 }
599 bp += 4;
600 length -= 16;
601 }
602 return 0;
603}
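/*
 * Sketch (assumption, not part of this file): SMP_MAGIC_IDENT is the
 * four ASCII bytes "_MP_" packed into one 32-bit word, so the loop
 * above simply walks 16-byte-aligned candidates comparing the first
 * word against that signature, roughly:
 *
 *	#define SMP_MAGIC_IDENT	(('_'<<24)|('P'<<16)|('M'<<8)|'_')
 */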
604
605void __init find_intel_smp (void)
606{
607 unsigned int address;
608
609 /*
610 * FIXME: Linux assumes you have 640K of base ram..
611 * this continues the error...
612 *
613 * 1) Scan the bottom 1K for a signature
614 * 2) Scan the top 1K of base RAM
615 * 3) Scan the 64K of bios
616 */
617 if (smp_scan_config(0x0,0x400) ||
618 smp_scan_config(639*0x400,0x400) ||
619 smp_scan_config(0xF0000,0x10000))
620 return;
621 /*
622 * If it is an SMP machine we should know now, unless the
623 * configuration is in an EISA/MCA bus machine with an
624 * extended bios data area.
625 *
626 * there is a real-mode segmented pointer pointing to the
627 * 4K EBDA area at 0x40E, calculate and scan it here.
628 *
629 * NOTE! There are Linux loaders that will corrupt the EBDA
630 * area, and as such this kind of SMP config may be less
631 * trustworthy, simply because the SMP table may have been
632 * stomped on during early boot. These loaders are buggy and
633 * should be fixed.
634 */
635
636 address = *(unsigned short *)phys_to_virt(0x40E);
637 address <<= 4;
638 if (smp_scan_config(address, 0x1000))
639 return;
640
641 /* If we have come this far, we did not find an MP table */
642 printk(KERN_INFO "No mptable found.\n");
643}
644
645/*
646 * - Intel MP Configuration Table
647 */
648void __init find_smp_config (void)
649{
650#ifdef CONFIG_X86_LOCAL_APIC
651 find_intel_smp();
652#endif
653}
654
655
656/* --------------------------------------------------------------------------
657 ACPI-based MP Configuration
658 -------------------------------------------------------------------------- */
659
660#ifdef CONFIG_ACPI_BOOT
661
662void __init mp_register_lapic_address (
663 u64 address)
664{
665 mp_lapic_addr = (unsigned long) address;
666
667 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
668
669 if (boot_cpu_id == -1U)
670 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
671
672 Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
673}
674
675
676void __init mp_register_lapic (
677 u8 id,
678 u8 enabled)
679{
680 struct mpc_config_processor processor;
681 int boot_cpu = 0;
682
683 if (id >= MAX_APICS) {
684 printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
685 id, MAX_APICS);
686 return;
687 }
688
689 if (id == boot_cpu_physical_apicid)
690 boot_cpu = 1;
691
692 processor.mpc_type = MP_PROCESSOR;
693 processor.mpc_apicid = id;
694 processor.mpc_apicver = 0x10; /* TBD: lapic version */
695 processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
696 processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
697 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
698 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
699 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
700 processor.mpc_reserved[0] = 0;
701 processor.mpc_reserved[1] = 0;
702
703 MP_processor_info(&processor);
704}
705
706#ifdef CONFIG_X86_IO_APIC
707
708#define MP_ISA_BUS 0
709#define MP_MAX_IOAPIC_PIN 127
710
711static struct mp_ioapic_routing {
712 int apic_id;
713 int gsi_start;
714 int gsi_end;
715 u32 pin_programmed[4];
716} mp_ioapic_routing[MAX_IO_APICS];
717
718
719static int mp_find_ioapic (
720 int gsi)
721{
722 int i = 0;
723
724 /* Find the IOAPIC that manages this GSI. */
725 for (i = 0; i < nr_ioapics; i++) {
726 if ((gsi >= mp_ioapic_routing[i].gsi_start)
727 && (gsi <= mp_ioapic_routing[i].gsi_end))
728 return i;
729 }
730
731 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
732
733 return -1;
734}
735
736
737void __init mp_register_ioapic (
738 u8 id,
739 u32 address,
740 u32 gsi_base)
741{
742 int idx = 0;
743
744 if (nr_ioapics >= MAX_IO_APICS) {
745 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
746 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
747 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
748 }
749 if (!address) {
750 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
751 " found in MADT table, skipping!\n");
752 return;
753 }
754
755 idx = nr_ioapics++;
756
757 mp_ioapics[idx].mpc_type = MP_IOAPIC;
758 mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
759 mp_ioapics[idx].mpc_apicaddr = address;
760
761 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
762 mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id);
763 mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
764
765 /*
766 * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
767 * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
768 */
769 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
770 mp_ioapic_routing[idx].gsi_start = gsi_base;
771 mp_ioapic_routing[idx].gsi_end = gsi_base +
772 io_apic_get_redir_entries(idx);
773
774 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
775 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
776 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
777 mp_ioapic_routing[idx].gsi_start,
778 mp_ioapic_routing[idx].gsi_end);
779
780 return;
781}
782
783
784void __init mp_override_legacy_irq (
785 u8 bus_irq,
786 u8 polarity,
787 u8 trigger,
788 u32 gsi)
789{
790 struct mpc_config_intsrc intsrc;
791 int ioapic = -1;
792 int pin = -1;
793
794 /*
795 * Convert 'gsi' to 'ioapic.pin'.
796 */
797 ioapic = mp_find_ioapic(gsi);
798 if (ioapic < 0)
799 return;
800 pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
801
802 /*
803 * TBD: This check is for faulty timer entries, where the override
804 * erroneously sets the trigger to level, resulting in a HUGE
805 * increase of timer interrupts!
806 */
807 if ((bus_irq == 0) && (trigger == 3))
808 trigger = 1;
809
810 intsrc.mpc_type = MP_INTSRC;
811 intsrc.mpc_irqtype = mp_INT;
812 intsrc.mpc_irqflag = (trigger << 2) | polarity;
813 intsrc.mpc_srcbus = MP_ISA_BUS;
814 intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
815 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
816 intsrc.mpc_dstirq = pin; /* INTIN# */
817
818 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
819 intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
820 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
821 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
822
823 mp_irqs[mp_irq_entries] = intsrc;
824 if (++mp_irq_entries == MAX_IRQ_SOURCES)
825 panic("Max # of irq sources exceeded!\n");
826
827 return;
828}
829
830
831void __init mp_config_acpi_legacy_irqs (void)
832{
833 struct mpc_config_intsrc intsrc;
834 int i = 0;
835 int ioapic = -1;
836
837 /*
838 * Fabricate the legacy ISA bus (bus #31).
839 */
840 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
841 Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
842
843 /*
844 * Locate the IOAPIC that manages the ISA IRQs (0-15).
845 */
846 ioapic = mp_find_ioapic(0);
847 if (ioapic < 0)
848 return;
849
850 intsrc.mpc_type = MP_INTSRC;
851 intsrc.mpc_irqflag = 0; /* Conforming */
852 intsrc.mpc_srcbus = MP_ISA_BUS;
853 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
854
855 /*
 856 * Use the default configuration for the IRQs 0-15, unless
857 * overridden by (MADT) interrupt source override entries.
858 */
859 for (i = 0; i < 16; i++) {
860 int idx;
861
862 for (idx = 0; idx < mp_irq_entries; idx++) {
863 struct mpc_config_intsrc *irq = mp_irqs + idx;
864
865 /* Do we already have a mapping for this ISA IRQ? */
866 if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
867 break;
868
869 /* Do we already have a mapping for this IOAPIC pin */
870 if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
871 (irq->mpc_dstirq == i))
872 break;
873 }
874
875 if (idx != mp_irq_entries) {
876 printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
877 continue; /* IRQ already used */
878 }
879
880 intsrc.mpc_irqtype = mp_INT;
881 intsrc.mpc_srcbusirq = i; /* Identity mapped */
882 intsrc.mpc_dstirq = i;
883
884 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
885 "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
886 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
887 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
888 intsrc.mpc_dstirq);
889
890 mp_irqs[mp_irq_entries] = intsrc;
891 if (++mp_irq_entries == MAX_IRQ_SOURCES)
892 panic("Max # of irq sources exceeded!\n");
893 }
894
895 return;
896}
897
898int mp_register_gsi(u32 gsi, int edge_level, int active_high_low)
899{
900 int ioapic = -1;
901 int ioapic_pin = 0;
902 int idx, bit = 0;
903
904 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
905 return gsi;
906
907#ifdef CONFIG_ACPI_BUS
908 /* Don't set up the ACPI SCI because it's already set up */
909 if (acpi_fadt.sci_int == gsi)
910 return gsi;
911#endif
912
913 ioapic = mp_find_ioapic(gsi);
914 if (ioapic < 0) {
915 printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
916 return gsi;
917 }
918
919 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
920
921 /*
922 * Avoid pin reprogramming. PRTs typically include entries
923 * with redundant pin->gsi mappings (but unique PCI devices);
924 * we only program the IOAPIC on the first.
925 */
926 bit = ioapic_pin % 32;
927 idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
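	/*
	 * Worked example (illustrative): ioapic_pin 40 gives
	 * bit = 40 % 32 = 8 and idx = 40 / 32 = 1, i.e. bit 8 of
	 * pin_programmed[1]; the four 32-bit words cover pins
	 * 0..MP_MAX_IOAPIC_PIN (127).
	 */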
928 if (idx > 3) {
929 printk(KERN_ERR "Invalid reference to IOAPIC pin "
930 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
931 ioapic_pin);
932 return gsi;
933 }
934 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
935 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
936 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
937 return gsi;
938 }
939
940 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
941
942 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
943 edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1,
944 active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1);
945 return gsi;
946}
947
948#endif /*CONFIG_X86_IO_APIC*/
949#endif /*CONFIG_ACPI_BOOT*/
diff --git a/arch/x86_64/kernel/msr.c b/arch/x86_64/kernel/msr.c
new file mode 100644
index 000000000000..598953ab0154
--- /dev/null
+++ b/arch/x86_64/kernel/msr.c
@@ -0,0 +1,279 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright 2000 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
8 * USA; either version 2 of the License, or (at your option) any later
9 * version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * msr.c
15 *
16 * x86 MSR access device
17 *
18 * This device is accessed by lseek() to the appropriate register number
19 * and then read/write in chunks of 8 bytes. A larger size means multiple
20 * reads or writes of the same register.
21 *
22 * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on
23 * an SMP box will direct the access to CPU %d.
24 */
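/*
 * Minimal userspace sketch of the protocol above (illustrative only,
 * not part of this driver; assumes the device node exists and that
 * MSR 0xc0000080, IA32_EFER, may be read):
 *
 *	int fd = open("/dev/cpu/0/msr", O_RDONLY);
 *	unsigned long long val;
 *	lseek(fd, 0xc0000080, SEEK_SET);	// select the MSR
 *	read(fd, &val, 8);			// one 8-byte chunk
 *	close(fd);
 */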
25
26#include <linux/module.h>
27#include <linux/config.h>
28
29#include <linux/types.h>
30#include <linux/errno.h>
31#include <linux/fcntl.h>
32#include <linux/init.h>
33#include <linux/poll.h>
34#include <linux/smp.h>
35#include <linux/smp_lock.h>
36#include <linux/major.h>
37#include <linux/fs.h>
38
39#include <asm/processor.h>
40#include <asm/msr.h>
41#include <asm/uaccess.h>
42#include <asm/system.h>
43
44/* Note: "err" is handled in a funny way below. Otherwise one version
45 of gcc or another breaks. */
46
47static inline int wrmsr_eio(u32 reg, u32 eax, u32 edx)
48{
49 int err;
50
51 asm volatile ("1: wrmsr\n"
52 "2:\n"
53 ".section .fixup,\"ax\"\n"
54 "3: movl %4,%0\n"
55 " jmp 2b\n"
56 ".previous\n"
57 ".section __ex_table,\"a\"\n"
58 " .align 8\n" " .quad 1b,3b\n" ".previous":"=&bDS" (err)
59 :"a"(eax), "d"(edx), "c"(reg), "i"(-EIO), "0"(0));
60
61 return err;
62}
63
64static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx)
65{
66 int err;
67
68 asm volatile ("1: rdmsr\n"
69 "2:\n"
70 ".section .fixup,\"ax\"\n"
71 "3: movl %4,%0\n"
72 " jmp 2b\n"
73 ".previous\n"
74 ".section __ex_table,\"a\"\n"
75 " .align 8\n"
76 " .quad 1b,3b\n"
77 ".previous":"=&bDS" (err), "=a"(*eax), "=d"(*edx)
78 :"c"(reg), "i"(-EIO), "0"(0));
79
80 return err;
81}
82
83#ifdef CONFIG_SMP
84
85struct msr_command {
86 int cpu;
87 int err;
88 u32 reg;
89 u32 data[2];
90};
91
92static void msr_smp_wrmsr(void *cmd_block)
93{
94 struct msr_command *cmd = (struct msr_command *)cmd_block;
95
96 if (cmd->cpu == smp_processor_id())
97 cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
98}
99
100static void msr_smp_rdmsr(void *cmd_block)
101{
102 struct msr_command *cmd = (struct msr_command *)cmd_block;
103
104 if (cmd->cpu == smp_processor_id())
105 cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]);
106}
107
108static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
109{
110 struct msr_command cmd;
111 int ret;
112
113 preempt_disable();
114 if (cpu == smp_processor_id()) {
115 ret = wrmsr_eio(reg, eax, edx);
116 } else {
117 cmd.cpu = cpu;
118 cmd.reg = reg;
119 cmd.data[0] = eax;
120 cmd.data[1] = edx;
121
122 smp_call_function(msr_smp_wrmsr, &cmd, 1, 1);
123 ret = cmd.err;
124 }
125 preempt_enable();
126 return ret;
127}
128
129static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx)
130{
131 struct msr_command cmd;
132 int ret;
133
134 preempt_disable();
135 if (cpu == smp_processor_id()) {
136 ret = rdmsr_eio(reg, eax, edx);
137 } else {
138 cmd.cpu = cpu;
139 cmd.reg = reg;
140
141 smp_call_function(msr_smp_rdmsr, &cmd, 1, 1);
142
143 *eax = cmd.data[0];
144 *edx = cmd.data[1];
145
146 ret = cmd.err;
147 }
148 preempt_enable();
149 return ret;
150}
151
152#else /* ! CONFIG_SMP */
153
154static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
155{
156 return wrmsr_eio(reg, eax, edx);
157}
158
159static inline int do_rdmsr(int cpu, u32 reg, u32 *eax, u32 *edx)
160{
161 return rdmsr_eio(reg, eax, edx);
162}
163
164#endif /* ! CONFIG_SMP */
165
166static loff_t msr_seek(struct file *file, loff_t offset, int orig)
167{
168 loff_t ret = -EINVAL;
169
170 lock_kernel();
171 switch (orig) {
172 case 0:
173 file->f_pos = offset;
174 ret = file->f_pos;
175 break;
176 case 1:
177 file->f_pos += offset;
178 ret = file->f_pos;
179 }
180 unlock_kernel();
181 return ret;
182}
183
184static ssize_t msr_read(struct file *file, char __user * buf,
185 size_t count, loff_t * ppos)
186{
187 u32 __user *tmp = (u32 __user *) buf;
188 u32 data[2];
189 size_t rv;
190 u32 reg = *ppos;
191 int cpu = iminor(file->f_dentry->d_inode);
192 int err;
193
194 if (count % 8)
195 return -EINVAL; /* Invalid chunk size */
196
197 for (rv = 0; count; count -= 8) {
198 err = do_rdmsr(cpu, reg, &data[0], &data[1]);
199 if (err)
200 return err;
201 if (copy_to_user(tmp, &data, 8))
202 return -EFAULT;
203 tmp += 2;
204 }
205
206 return ((char __user *)tmp) - buf;
207}
208
209static ssize_t msr_write(struct file *file, const char __user *buf,
210 size_t count, loff_t *ppos)
211{
212 const u32 __user *tmp = (const u32 __user *)buf;
213 u32 data[2];
214 size_t rv;
215 u32 reg = *ppos;
216 int cpu = iminor(file->f_dentry->d_inode);
217 int err;
218
219 if (count % 8)
220 return -EINVAL; /* Invalid chunk size */
221
222 for (rv = 0; count; count -= 8) {
223 if (copy_from_user(&data, tmp, 8))
224 return -EFAULT;
225 err = do_wrmsr(cpu, reg, data[0], data[1]);
226 if (err)
227 return err;
228 tmp += 2;
229 }
230
231 return ((char __user *)tmp) - buf;
232}
233
234static int msr_open(struct inode *inode, struct file *file)
235{
236 unsigned int cpu = iminor(file->f_dentry->d_inode);
237 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
238
239 if (cpu >= NR_CPUS || !cpu_online(cpu))
240 return -ENXIO; /* No such CPU */
241 if (!cpu_has(c, X86_FEATURE_MSR))
242 return -EIO; /* MSR not supported */
243
244 return 0;
245}
246
247/*
248 * File operations we support
249 */
250static struct file_operations msr_fops = {
251 .owner = THIS_MODULE,
252 .llseek = msr_seek,
253 .read = msr_read,
254 .write = msr_write,
255 .open = msr_open,
256};
257
258static int __init msr_init(void)
259{
260 if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) {
261 printk(KERN_ERR "msr: unable to get major %d for msr\n",
262 MSR_MAJOR);
263 return -EBUSY;
264 }
265
266 return 0;
267}
268
269static void __exit msr_exit(void)
270{
271 unregister_chrdev(MSR_MAJOR, "cpu/msr");
272}
273
274module_init(msr_init);
 275module_exit(msr_exit);
276
277MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
278MODULE_DESCRIPTION("x86 generic MSR driver");
279MODULE_LICENSE("GPL");
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
new file mode 100644
index 000000000000..d9867de6a626
--- /dev/null
+++ b/arch/x86_64/kernel/nmi.c
@@ -0,0 +1,488 @@
1/*
2 * linux/arch/x86_64/nmi.c
3 *
4 * NMI watchdog support on APIC systems
5 *
6 * Started by Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes:
9 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
10 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
11 * Pavel Machek and
12 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
13 */
14
15#include <linux/config.h>
16#include <linux/mm.h>
17#include <linux/irq.h>
18#include <linux/delay.h>
19#include <linux/bootmem.h>
20#include <linux/smp_lock.h>
21#include <linux/interrupt.h>
22#include <linux/mc146818rtc.h>
23#include <linux/kernel_stat.h>
24#include <linux/module.h>
25#include <linux/sysdev.h>
26#include <linux/nmi.h>
27#include <linux/sysctl.h>
28
29#include <asm/smp.h>
30#include <asm/mtrr.h>
31#include <asm/mpspec.h>
32#include <asm/nmi.h>
33#include <asm/msr.h>
34#include <asm/proto.h>
35#include <asm/kdebug.h>
36
37/*
38 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
39 * - it may be reserved by some other driver, or not
40 * - when not reserved by some other driver, it may be used for
41 * the NMI watchdog, or not
42 *
43 * This is maintained separately from nmi_active because the NMI
44 * watchdog may also be driven from the I/O APIC timer.
45 */
46static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
47static unsigned int lapic_nmi_owner;
48#define LAPIC_NMI_WATCHDOG (1<<0)
49#define LAPIC_NMI_RESERVED (1<<1)
50
51/* nmi_active:
52 * +1: the lapic NMI watchdog is active, but can be disabled
53 * 0: the lapic NMI watchdog has not been set up, and cannot
54 * be enabled
55 * -1: the lapic NMI watchdog is disabled, but can be enabled
56 */
57int nmi_active; /* oprofile uses this */
58int panic_on_timeout;
59
60unsigned int nmi_watchdog = NMI_DEFAULT;
61static unsigned int nmi_hz = HZ;
62unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
63
64/* Note that these events don't tick when the CPU idles. This means
65 the frequency varies with CPU load. */
66
67#define K7_EVNTSEL_ENABLE (1 << 22)
68#define K7_EVNTSEL_INT (1 << 20)
69#define K7_EVNTSEL_OS (1 << 17)
70#define K7_EVNTSEL_USR (1 << 16)
71#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
72#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
73
74#define P6_EVNTSEL0_ENABLE (1 << 22)
75#define P6_EVNTSEL_INT (1 << 20)
76#define P6_EVNTSEL_OS (1 << 17)
77#define P6_EVNTSEL_USR (1 << 16)
78#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
79#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
80
81/* Run after command line and cpu_init init, but before all other checks */
82void __init nmi_watchdog_default(void)
83{
84 if (nmi_watchdog != NMI_DEFAULT)
85 return;
86
87 /* For some reason the IO APIC watchdog doesn't work on the AMD
88 8111 chipset. For now switch to local APIC mode using
89 perfctr0 there. On Intel CPUs we don't have code to handle
90 the perfctr and the IO-APIC seems to work, so use that. */
91
92 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
93 nmi_watchdog = NMI_LOCAL_APIC;
94 printk(KERN_INFO
95 "Using local APIC NMI watchdog using perfctr0\n");
96 } else {
97 printk(KERN_INFO "Using IO APIC NMI watchdog\n");
98 nmi_watchdog = NMI_IO_APIC;
99 }
100}
101
102/* Why is there no CPUID flag for this? */
103static __init int cpu_has_lapic(void)
104{
105 switch (boot_cpu_data.x86_vendor) {
106 case X86_VENDOR_INTEL:
107 case X86_VENDOR_AMD:
108 return boot_cpu_data.x86 >= 6;
109 /* .... add more cpus here or find a different way to figure this out. */
110 default:
111 return 0;
112 }
113}
114
115int __init check_nmi_watchdog (void)
116{
117 int counts[NR_CPUS];
118 int cpu;
119
120 if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) {
121 nmi_watchdog = NMI_NONE;
122 return -1;
123 }
124
125 printk(KERN_INFO "testing NMI watchdog ... ");
126
127 for (cpu = 0; cpu < NR_CPUS; cpu++)
128 counts[cpu] = cpu_pda[cpu].__nmi_count;
129 local_irq_enable();
 130 mdelay((10*1000)/nmi_hz); /* wait 10 ticks */
131
132 for (cpu = 0; cpu < NR_CPUS; cpu++) {
133#ifdef CONFIG_SMP
134 /* Check cpu_callin_map here because that is set
135 after the timer is started. */
136 if (!cpu_isset(cpu, cpu_callin_map))
137 continue;
138#endif
139 if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) {
140 printk("CPU#%d: NMI appears to be stuck (%d)!\n",
141 cpu,
142 cpu_pda[cpu].__nmi_count);
143 nmi_active = 0;
144 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
145 return -1;
146 }
147 }
148 printk("OK.\n");
149
150 /* now that we know it works we can reduce NMI frequency to
151 something more reasonable; makes a difference in some configs */
152 if (nmi_watchdog == NMI_LOCAL_APIC)
153 nmi_hz = 1;
154
155 return 0;
156}
157
158int __init setup_nmi_watchdog(char *str)
159{
160 int nmi;
161
162 if (!strncmp(str,"panic",5)) {
163 panic_on_timeout = 1;
164 str = strchr(str, ',');
165 if (!str)
166 return 1;
167 ++str;
168 }
169
170 get_option(&str, &nmi);
171
172 if (nmi >= NMI_INVALID)
173 return 0;
174 nmi_watchdog = nmi;
175 return 1;
176}
177
178__setup("nmi_watchdog=", setup_nmi_watchdog);
179
180static void disable_lapic_nmi_watchdog(void)
181{
182 if (nmi_active <= 0)
183 return;
184 switch (boot_cpu_data.x86_vendor) {
185 case X86_VENDOR_AMD:
186 wrmsr(MSR_K7_EVNTSEL0, 0, 0);
187 break;
188 case X86_VENDOR_INTEL:
189 wrmsr(MSR_IA32_EVNTSEL0, 0, 0);
190 break;
191 }
192 nmi_active = -1;
193 /* tell do_nmi() and others that we're not active any more */
194 nmi_watchdog = 0;
195}
196
197static void enable_lapic_nmi_watchdog(void)
198{
199 if (nmi_active < 0) {
200 nmi_watchdog = NMI_LOCAL_APIC;
201 setup_apic_nmi_watchdog();
202 }
203}
204
205int reserve_lapic_nmi(void)
206{
207 unsigned int old_owner;
208
209 spin_lock(&lapic_nmi_owner_lock);
210 old_owner = lapic_nmi_owner;
211 lapic_nmi_owner |= LAPIC_NMI_RESERVED;
212 spin_unlock(&lapic_nmi_owner_lock);
213 if (old_owner & LAPIC_NMI_RESERVED)
214 return -EBUSY;
215 if (old_owner & LAPIC_NMI_WATCHDOG)
216 disable_lapic_nmi_watchdog();
217 return 0;
218}
219
220void release_lapic_nmi(void)
221{
222 unsigned int new_owner;
223
224 spin_lock(&lapic_nmi_owner_lock);
225 new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
226 lapic_nmi_owner = new_owner;
227 spin_unlock(&lapic_nmi_owner_lock);
228 if (new_owner & LAPIC_NMI_WATCHDOG)
229 enable_lapic_nmi_watchdog();
230}
231
232void disable_timer_nmi_watchdog(void)
233{
234 if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0))
235 return;
236
237 disable_irq(0);
238 unset_nmi_callback();
239 nmi_active = -1;
240 nmi_watchdog = NMI_NONE;
241}
242
243void enable_timer_nmi_watchdog(void)
244{
245 if (nmi_active < 0) {
246 nmi_watchdog = NMI_IO_APIC;
247 touch_nmi_watchdog();
248 nmi_active = 1;
249 enable_irq(0);
250 }
251}
252
253#ifdef CONFIG_PM
254
255static int nmi_pm_active; /* nmi_active before suspend */
256
257static int lapic_nmi_suspend(struct sys_device *dev, u32 state)
258{
259 nmi_pm_active = nmi_active;
260 disable_lapic_nmi_watchdog();
261 return 0;
262}
263
264static int lapic_nmi_resume(struct sys_device *dev)
265{
266 if (nmi_pm_active > 0)
267 enable_lapic_nmi_watchdog();
268 return 0;
269}
270
271static struct sysdev_class nmi_sysclass = {
272 set_kset_name("lapic_nmi"),
273 .resume = lapic_nmi_resume,
274 .suspend = lapic_nmi_suspend,
275};
276
277static struct sys_device device_lapic_nmi = {
278 .id = 0,
279 .cls = &nmi_sysclass,
280};
281
282static int __init init_lapic_nmi_sysfs(void)
283{
284 int error;
285
286 if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC)
287 return 0;
288
289 error = sysdev_class_register(&nmi_sysclass);
290 if (!error)
291 error = sysdev_register(&device_lapic_nmi);
292 return error;
293}
294/* must come after the local APIC's device_initcall() */
295late_initcall(init_lapic_nmi_sysfs);
296
297#endif /* CONFIG_PM */
298
299/*
300 * Activate the NMI watchdog via the local APIC.
301 * Original code written by Keith Owens.
302 */
303
304static void setup_k7_watchdog(void)
305{
306 int i;
307 unsigned int evntsel;
308
309 /* No check, so can start with slow frequency */
310 nmi_hz = 1;
311
312 /* XXX should check these in EFER */
313
314 nmi_perfctr_msr = MSR_K7_PERFCTR0;
315
 316 for (i = 0; i < 4; ++i) {
317 /* Simulator may not support it */
318 if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL))
319 return;
320 wrmsrl(MSR_K7_PERFCTR0+i, 0UL);
321 }
322
323 evntsel = K7_EVNTSEL_INT
324 | K7_EVNTSEL_OS
325 | K7_EVNTSEL_USR
326 | K7_NMI_EVENT;
327
328 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
329 wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz);
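	/*
	 * Illustrative arithmetic (addition, not in the original): the
	 * counter is preloaded with minus the number of unhalted cycles
	 * per NMI period, so it overflows and fires the NMI after that
	 * many cycles. E.g. cpu_khz = 2000000 (a 2 GHz CPU) and
	 * nmi_hz = 1 give a preload of -2000000000, i.e. one watchdog
	 * NMI per second.
	 */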
330 apic_write(APIC_LVTPC, APIC_DM_NMI);
331 evntsel |= K7_EVNTSEL_ENABLE;
332 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
333}
334
335void setup_apic_nmi_watchdog(void)
336{
337 switch (boot_cpu_data.x86_vendor) {
338 case X86_VENDOR_AMD:
339 if (boot_cpu_data.x86 < 6)
340 return;
341 if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
342 return;
343 setup_k7_watchdog();
344 break;
345 default:
346 return;
347 }
348 lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
349 nmi_active = 1;
350}
351
352/*
353 * the best way to detect whether a CPU has a 'hard lockup' problem
 354 * is to check its local APIC timer IRQ counts. If they are not
355 * changing then that CPU has some problem.
356 *
357 * as these watchdog NMI IRQs are generated on every CPU, we only
358 * have to check the current processor.
359 *
360 * since NMIs don't listen to _any_ locks, we have to be extremely
361 * careful not to rely on unsafe variables. The printk might lock
362 * up though, so we have to break up any console locks first ...
363 * [when there will be more tty-related locks, break them up
364 * here too!]
365 */
366
 367static unsigned int
 368 last_irq_sums[NR_CPUS],
 369 alert_counter[NR_CPUS];
370
371void touch_nmi_watchdog (void)
372{
373 int i;
374
375 /*
376 * Just reset the alert counters, (other CPUs might be
377 * spinning on locks we hold):
378 */
379 for (i = 0; i < NR_CPUS; i++)
380 alert_counter[i] = 0;
381}
382
383void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
384{
385 int sum, cpu;
386
387 cpu = safe_smp_processor_id();
388 sum = read_pda(apic_timer_irqs);
389 if (last_irq_sums[cpu] == sum) {
390 /*
391 * Ayiee, looks like this CPU is stuck ...
392 * wait a few IRQs (5 seconds) before doing the oops ...
393 */
394 alert_counter[cpu]++;
395 if (alert_counter[cpu] == 5*nmi_hz) {
396 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
397 == NOTIFY_STOP) {
398 alert_counter[cpu] = 0;
399 return;
400 }
401 die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs);
402 }
403 } else {
404 last_irq_sums[cpu] = sum;
405 alert_counter[cpu] = 0;
406 }
407 if (nmi_perfctr_msr)
408 wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
409}
410
411static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
412{
413 return 0;
414}
415
416static nmi_callback_t nmi_callback = dummy_nmi_callback;
417
418asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
419{
420 int cpu = safe_smp_processor_id();
421
422 nmi_enter();
423 add_pda(__nmi_count,1);
424 if (!nmi_callback(regs, cpu))
425 default_do_nmi(regs);
426 nmi_exit();
427}
428
429void set_nmi_callback(nmi_callback_t callback)
430{
431 nmi_callback = callback;
432}
433
434void unset_nmi_callback(void)
435{
436 nmi_callback = dummy_nmi_callback;
437}
438
439#ifdef CONFIG_SYSCTL
440
441static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
442{
443 unsigned char reason = get_nmi_reason();
444 char buf[64];
445
446 if (!(reason & 0xc0)) {
447 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
448 die_nmi(buf,regs);
449 }
450 return 0;
451}
452
453/*
454 * proc handler for /proc/sys/kernel/unknown_nmi_panic
455 */
456int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file,
457 void __user *buffer, size_t *length, loff_t *ppos)
458{
459 int old_state;
460
461 old_state = unknown_nmi_panic;
462 proc_dointvec(table, write, file, buffer, length, ppos);
463 if (!!old_state == !!unknown_nmi_panic)
464 return 0;
465
466 if (unknown_nmi_panic) {
467 if (reserve_lapic_nmi() < 0) {
468 unknown_nmi_panic = 0;
469 return -EBUSY;
470 } else {
471 set_nmi_callback(unknown_nmi_panic_callback);
472 }
473 } else {
474 release_lapic_nmi();
475 unset_nmi_callback();
476 }
477 return 0;
478}
479
480#endif
481
482EXPORT_SYMBOL(nmi_active);
483EXPORT_SYMBOL(nmi_watchdog);
484EXPORT_SYMBOL(reserve_lapic_nmi);
485EXPORT_SYMBOL(release_lapic_nmi);
486EXPORT_SYMBOL(disable_timer_nmi_watchdog);
487EXPORT_SYMBOL(enable_timer_nmi_watchdog);
488EXPORT_SYMBOL(touch_nmi_watchdog);
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
new file mode 100644
index 000000000000..cab471cf3edb
--- /dev/null
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -0,0 +1,60 @@
1/*
2 * Dynamic DMA mapping support.
3 */
4
5#include <linux/types.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/pci.h>
9#include <linux/module.h>
10#include <asm/io.h>
11
12/* Map a set of buffers described by scatterlist in streaming
13 * mode for DMA. This is the scatter-gather version of the
14 * above pci_map_single interface. Here the scatter gather list
15 * elements are each tagged with the appropriate dma address
16 * and length. They are obtained via sg_dma_{address,length}(SG).
17 *
18 * NOTE: An implementation may be able to use a smaller number of
19 * DMA address/length pairs than there are SG table elements.
20 * (for example via virtual mapping capabilities)
21 * The routine returns the number of addr/length pairs actually
22 * used, at most nents.
23 *
24 * Device ownership issues as mentioned above for pci_map_single are
25 * the same here.
26 */
27int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
28 int nents, int direction)
29{
30 int i;
31
32 BUG_ON(direction == DMA_NONE);
33 for (i = 0; i < nents; i++ ) {
34 struct scatterlist *s = &sg[i];
35 BUG_ON(!s->page);
36 s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
37 s->dma_length = s->length;
38 }
39 return nents;
40}
41
42EXPORT_SYMBOL(dma_map_sg);
43
44/* Unmap a set of streaming mode DMA translations.
45 * Again, cpu read rules concerning calls here are the same as for
46 * pci_unmap_single() above.
47 */
48void dma_unmap_sg(struct device *dev, struct scatterlist *sg,
49 int nents, int dir)
50{
51 int i;
52 for (i = 0; i < nents; i++) {
53 struct scatterlist *s = &sg[i];
54 BUG_ON(s->page == NULL);
55 BUG_ON(s->dma_address == 0);
56 dma_unmap_single(dev, s->dma_address, s->dma_length, dir);
57 }
58}
59
60EXPORT_SYMBOL(dma_unmap_sg);
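/*
 * Usage sketch (illustrative; program_hw_descriptor is a hypothetical
 * driver helper): map a scatterlist for device DMA, feed the resulting
 * bus addresses to the hardware, then unmap once the transfer is done.
 *
 *	int i, n = dma_map_sg(dev, sglist, nents, DMA_TO_DEVICE);
 *	for (i = 0; i < n; i++)
 *		program_hw_descriptor(sg_dma_address(&sglist[i]),
 *				      sg_dma_length(&sglist[i]));
 *	...
 *	dma_unmap_sg(dev, sglist, nents, DMA_TO_DEVICE);
 */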
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
new file mode 100644
index 000000000000..57f35c68aa34
--- /dev/null
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -0,0 +1,980 @@
1/*
2 * Dynamic DMA mapping support for AMD Hammer.
3 *
4 * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
 5 * This allows using PCI devices that only support 32-bit addresses on systems
6 * with more than 4GB.
7 *
8 * See Documentation/DMA-mapping.txt for the interface specification.
9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 */
12
13#include <linux/config.h>
14#include <linux/types.h>
15#include <linux/ctype.h>
16#include <linux/agp_backend.h>
17#include <linux/init.h>
18#include <linux/mm.h>
19#include <linux/string.h>
20#include <linux/spinlock.h>
21#include <linux/pci.h>
22#include <linux/module.h>
23#include <linux/topology.h>
24#include <linux/interrupt.h>
25#include <linux/bitops.h>
26#include <asm/atomic.h>
27#include <asm/io.h>
28#include <asm/mtrr.h>
29#include <asm/pgtable.h>
30#include <asm/proto.h>
31#include <asm/cacheflush.h>
32#include <asm/kdebug.h>
33
34dma_addr_t bad_dma_address;
35
36unsigned long iommu_bus_base; /* GART remapping area (physical) */
37static unsigned long iommu_size; /* size of remapping area bytes */
38static unsigned long iommu_pages; /* .. and in pages */
39
40u32 *iommu_gatt_base; /* Remapping table */
41
42int no_iommu;
43static int no_agp;
44#ifdef CONFIG_IOMMU_DEBUG
45int panic_on_overflow = 1;
46int force_iommu = 1;
47#else
48int panic_on_overflow = 0;
49int force_iommu = 0;
50#endif
51int iommu_merge = 1;
52int iommu_sac_force = 0;
53
54/* If this is disabled the IOMMU will use an optimized flushing strategy
 55 of only flushing when a mapping is reused. With it true the GART is flushed
 56 for every mapping. Problem is that doing the lazy flush seems to trigger
 57 bugs with some popular PCI cards, in particular 3ware (but it has also
 58 been seen with Qlogic at least). */
59int iommu_fullflush = 1;
60
61/* This tells the BIO block layer to assume merging. Default to off
62 because we cannot guarantee merging later. */
63int iommu_bio_merge = 0;
64
65#define MAX_NB 8
66
67/* Allocation bitmap for the remapping area */
68static DEFINE_SPINLOCK(iommu_bitmap_lock);
69static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */
70
71static u32 gart_unmapped_entry;
72
73#define GPTE_VALID 1
74#define GPTE_COHERENT 2
75#define GPTE_ENCODE(x) \
76 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
77#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
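/*
 * Worked example (illustrative): for a physical address of
 * 0x1234567000, GPTE_ENCODE keeps the low bits 0x34567000, folds the
 * high bits 0x12 into bits 4..11 (0x120) and sets VALID|COHERENT,
 * yielding 0x34567123; GPTE_DECODE reverses the transformation.
 */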
78
79#define to_pages(addr,size) \
80 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
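/*
 * Worked example (illustrative): to_pages(0x1ff0, 0x20) computes
 * (0x1ff0 & ~PAGE_MASK) + 0x20 = 0x1010, rounds up to 0x2000 and
 * shifts by PAGE_SHIFT, i.e. a 32-byte buffer straddling a page
 * boundary needs 2 GART pages.
 */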
81
82#define for_all_nb(dev) \
83 dev = NULL; \
84 while ((dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL)\
85 if (dev->bus->number == 0 && \
86 (PCI_SLOT(dev->devfn) >= 24) && (PCI_SLOT(dev->devfn) <= 31))
87
88static struct pci_dev *northbridges[MAX_NB];
89static u32 northbridge_flush_word[MAX_NB];
90
91#define EMERGENCY_PAGES 32 /* = 128KB */
92
93#ifdef CONFIG_AGP
94#define AGPEXTERN extern
95#else
96#define AGPEXTERN
97#endif
98
99/* backdoor interface to AGP driver */
100AGPEXTERN int agp_memory_reserved;
101AGPEXTERN __u32 *agp_gatt_table;
102
103static unsigned long next_bit; /* protected by iommu_bitmap_lock */
104static int need_flush; /* global flush state. set for each gart wrap */
105static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem,
106 size_t size, int dir, int do_panic);
107
108/* Dummy device used for NULL arguments (normally ISA). Better would
 109 probably be a smaller DMA mask, but this is bug-to-bug compatible with i386. */
110static struct device fallback_dev = {
111 .bus_id = "fallback device",
112 .coherent_dma_mask = 0xffffffff,
113 .dma_mask = &fallback_dev.coherent_dma_mask,
114};
115
116static unsigned long alloc_iommu(int size)
117{
118 unsigned long offset, flags;
119
120 spin_lock_irqsave(&iommu_bitmap_lock, flags);
121 offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size);
122 if (offset == -1) {
123 need_flush = 1;
124 offset = find_next_zero_string(iommu_gart_bitmap,0,next_bit,size);
125 }
126 if (offset != -1) {
127 set_bit_string(iommu_gart_bitmap, offset, size);
128 next_bit = offset+size;
129 if (next_bit >= iommu_pages) {
130 next_bit = 0;
131 need_flush = 1;
132 }
133 }
134 if (iommu_fullflush)
135 need_flush = 1;
136 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
137 return offset;
138}
139
140static void free_iommu(unsigned long offset, int size)
141{
142 unsigned long flags;
143 if (size == 1) {
144 clear_bit(offset, iommu_gart_bitmap);
145 return;
146 }
147 spin_lock_irqsave(&iommu_bitmap_lock, flags);
148 __clear_bit_string(iommu_gart_bitmap, offset, size);
149 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
150}
151
152/*
153 * Use global flush state to avoid races with multiple flushers.
154 */
155static void flush_gart(struct device *dev)
156{
157 unsigned long flags;
158 int flushed = 0;
159 int i, max;
160
161 spin_lock_irqsave(&iommu_bitmap_lock, flags);
162 if (need_flush) {
163 max = 0;
164 for (i = 0; i < MAX_NB; i++) {
165 if (!northbridges[i])
166 continue;
167 pci_write_config_dword(northbridges[i], 0x9c,
168 northbridge_flush_word[i] | 1);
169 flushed++;
170 max = i;
171 }
172 for (i = 0; i <= max; i++) {
173 u32 w;
174 if (!northbridges[i])
175 continue;
176 /* Make sure the hardware actually executed the flush. */
177 do {
178 pci_read_config_dword(northbridges[i], 0x9c, &w);
179 } while (w & 1);
180 }
181 if (!flushed)
182 printk("nothing to flush?\n");
183 need_flush = 0;
184 }
185 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
186}
187
188/* Allocate DMA memory on node near device */
189noinline
190static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order)
191{
192 struct page *page;
193 int node;
194 if (dev->bus == &pci_bus_type) {
195 cpumask_t mask;
196 mask = pcibus_to_cpumask(to_pci_dev(dev)->bus);
197 node = cpu_to_node(first_cpu(mask));
198 } else
199 node = numa_node_id();
200 page = alloc_pages_node(node, gfp, order);
201 return page ? page_address(page) : NULL;
202}
203
204/*
205 * Allocate memory for a coherent mapping.
206 */
207void *
208dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
209 unsigned gfp)
210{
211 void *memory;
212 unsigned long dma_mask = 0;
213 u64 bus;
214
215 if (!dev)
216 dev = &fallback_dev;
217 dma_mask = dev->coherent_dma_mask;
218 if (dma_mask == 0)
219 dma_mask = 0xffffffff;
220
221 /* Kludge to make it bug-to-bug compatible with i386. i386
222 uses the normal dma_mask for alloc_coherent. */
223 dma_mask &= *dev->dma_mask;
224
225 again:
226 memory = dma_alloc_pages(dev, gfp, get_order(size));
227 if (memory == NULL)
228 return NULL;
229
230 {
231 int high, mmu;
232 bus = virt_to_bus(memory);
233 high = (bus + size) >= dma_mask;
234 mmu = high;
235 if (force_iommu && !(gfp & GFP_DMA))
236 mmu = 1;
237 if (no_iommu || dma_mask < 0xffffffffUL) {
238 if (high) {
239 free_pages((unsigned long)memory,
240 get_order(size));
241
242 if (swiotlb) {
243 return
244 swiotlb_alloc_coherent(dev, size,
245 dma_handle,
246 gfp);
247 }
248
249 if (!(gfp & GFP_DMA)) {
250 gfp |= GFP_DMA;
251 goto again;
252 }
253 return NULL;
254 }
255 mmu = 0;
256 }
257 memset(memory, 0, size);
258 if (!mmu) {
259 *dma_handle = virt_to_bus(memory);
260 return memory;
261 }
262 }
263
264 *dma_handle = dma_map_area(dev, bus, size, PCI_DMA_BIDIRECTIONAL, 0);
265 if (*dma_handle == bad_dma_address)
266 goto error;
267 flush_gart(dev);
268 return memory;
269
270error:
271 if (panic_on_overflow)
272 panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", size);
273 free_pages((unsigned long)memory, get_order(size));
274 return NULL;
275}
276
277/*
278 * Unmap coherent memory.
279 * The caller must ensure that the device has finished accessing the mapping.
280 */
281void dma_free_coherent(struct device *dev, size_t size,
282 void *vaddr, dma_addr_t bus)
283{
284 if (swiotlb) {
285 swiotlb_free_coherent(dev, size, vaddr, bus);
286 return;
287 }
288
289 dma_unmap_single(dev, bus, size, 0);
290 free_pages((unsigned long)vaddr, get_order(size));
291}
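/*
 * Usage sketch (illustrative; RING_BYTES is a hypothetical size): a
 * driver allocates a coherent descriptor ring, hands ring_dma to the
 * device while accessing ring from the CPU, and frees both on teardown.
 *
 *	dma_addr_t ring_dma;
 *	void *ring = dma_alloc_coherent(dev, RING_BYTES, &ring_dma,
 *					GFP_KERNEL);
 *	...
 *	dma_free_coherent(dev, RING_BYTES, ring, ring_dma);
 */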
292
293#ifdef CONFIG_IOMMU_LEAK
294
295#define SET_LEAK(x) if (iommu_leak_tab) \
296 iommu_leak_tab[x] = __builtin_return_address(0);
297#define CLEAR_LEAK(x) if (iommu_leak_tab) \
298 iommu_leak_tab[x] = NULL;
299
300/* Debugging aid for drivers that don't free their IOMMU tables */
301static void **iommu_leak_tab;
302static int leak_trace;
303int iommu_leak_pages = 20;
304void dump_leak(void)
305{
306 int i;
307 static int dump;
308 if (dump || !iommu_leak_tab) return;
309 dump = 1;
310 show_stack(NULL,NULL);
311 /* Very crude. dump some from the end of the table too */
312 printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages);
313 for (i = 0; i < iommu_leak_pages; i+=2) {
314 printk("%lu: ", iommu_pages-i);
315 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]);
316 printk("%c", (i+1)%2 == 0 ? '\n' : ' ');
317 }
318 printk("\n");
319}
320#else
321#define SET_LEAK(x)
322#define CLEAR_LEAK(x)
323#endif
324
325static void iommu_full(struct device *dev, size_t size, int dir, int do_panic)
326{
327 /*
328 * Ran out of IOMMU space for this operation. This is very bad.
329 * Unfortunately the drivers cannot handle this operation properly.
 330 * Return some non-mapped prereserved space in the aperture and
 331 * let the Northbridge deal with it. This will result in garbage
 332 * in the IO operation. When the size exceeds the prereserved space,
333 * memory corruption will occur or random memory will be DMAed
334 * out. Hopefully no network devices use single mappings that big.
335 */
336
337 printk(KERN_ERR
338 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
339 size, dev->bus_id);
340
341 if (size > PAGE_SIZE*EMERGENCY_PAGES && do_panic) {
342 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
343 panic("PCI-DMA: Memory would be corrupted\n");
344 if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
345 panic("PCI-DMA: Random memory would be DMAed\n");
346 }
347
348#ifdef CONFIG_IOMMU_LEAK
349 dump_leak();
350#endif
351}
352
353static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
354{
355 u64 mask = *dev->dma_mask;
356 int high = addr + size >= mask;
357 int mmu = high;
358 if (force_iommu)
359 mmu = 1;
360 if (no_iommu) {
361 if (high)
362 panic("PCI-DMA: high address but no IOMMU.\n");
363 mmu = 0;
364 }
365 return mmu;
366}
367
368static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
369{
370 u64 mask = *dev->dma_mask;
371 int high = addr + size >= mask;
372 int mmu = high;
373 if (no_iommu) {
374 if (high)
375 panic("PCI-DMA: high address but no IOMMU.\n");
376 mmu = 0;
377 }
378 return mmu;
379}
380
 381/* Map a single contiguous physical area into the IOMMU.
382 * Caller needs to check if the iommu is needed and flush.
383 */
384static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem,
385 size_t size, int dir, int do_panic)
386{
387 unsigned long npages = to_pages(phys_mem, size);
388 unsigned long iommu_page = alloc_iommu(npages);
389 int i;
390 if (iommu_page == -1) {
391 if (!nonforced_iommu(dev, phys_mem, size))
392 return phys_mem;
393 if (panic_on_overflow)
394 panic("dma_map_area overflow %lu bytes\n", size);
395 iommu_full(dev, size, dir, do_panic);
396 return bad_dma_address;
397 }
398
399 for (i = 0; i < npages; i++) {
400 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
401 SET_LEAK(iommu_page + i);
402 phys_mem += PAGE_SIZE;
403 }
404 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
405}
406
407/* Map a single area into the IOMMU */
408dma_addr_t dma_map_single(struct device *dev, void *addr, size_t size, int dir)
409{
410 unsigned long phys_mem, bus;
411
412 BUG_ON(dir == DMA_NONE);
413
414 if (swiotlb)
415 return swiotlb_map_single(dev,addr,size,dir);
416 if (!dev)
417 dev = &fallback_dev;
418
419 phys_mem = virt_to_phys(addr);
420 if (!need_iommu(dev, phys_mem, size))
421 return phys_mem;
422
423 bus = dma_map_area(dev, phys_mem, size, dir, 1);
424 flush_gart(dev);
425 return bus;
426}
427
428/* Fallback for dma_map_sg in case of overflow */
429static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
430 int nents, int dir)
431{
432 int i;
433
434#ifdef CONFIG_IOMMU_DEBUG
435 printk(KERN_DEBUG "dma_map_sg overflow\n");
436#endif
437
438 for (i = 0; i < nents; i++ ) {
439 struct scatterlist *s = &sg[i];
440 unsigned long addr = page_to_phys(s->page) + s->offset;
441 if (nonforced_iommu(dev, addr, s->length)) {
442 addr = dma_map_area(dev, addr, s->length, dir, 0);
443 if (addr == bad_dma_address) {
444 if (i > 0)
445 dma_unmap_sg(dev, sg, i, dir);
446 nents = 0;
447 sg[0].dma_length = 0;
448 break;
449 }
450 }
451 s->dma_address = addr;
452 s->dma_length = s->length;
453 }
454 flush_gart(dev);
455 return nents;
456}
457
 458/* Map multiple scatterlist entries contiguously into the first. */
459static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
460 struct scatterlist *sout, unsigned long pages)
461{
462 unsigned long iommu_start = alloc_iommu(pages);
463 unsigned long iommu_page = iommu_start;
464 int i;
465
466 if (iommu_start == -1)
467 return -1;
468
469 for (i = start; i < stopat; i++) {
470 struct scatterlist *s = &sg[i];
471 unsigned long pages, addr;
472 unsigned long phys_addr = s->dma_address;
473
474 BUG_ON(i > start && s->offset);
475 if (i == start) {
476 *sout = *s;
477 sout->dma_address = iommu_bus_base;
478 sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
479 sout->dma_length = s->length;
480 } else {
481 sout->dma_length += s->length;
482 }
483
484 addr = phys_addr;
485 pages = to_pages(s->offset, s->length);
486 while (pages--) {
487 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
488 SET_LEAK(iommu_page);
489 addr += PAGE_SIZE;
490 iommu_page++;
491 }
492 }
493 BUG_ON(iommu_page - iommu_start != pages);
494 return 0;
495}
496
497static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat,
498 struct scatterlist *sout,
499 unsigned long pages, int need)
500{
501 if (!need) {
502 BUG_ON(stopat - start != 1);
503 *sout = sg[start];
504 sout->dma_length = sg[start].length;
505 return 0;
506 }
507 return __dma_map_cont(sg, start, stopat, sout, pages);
508}
509
510/*
511 * DMA map all entries in a scatterlist.
 512 * Merge chunks that have page-aligned sizes into a contiguous mapping.
513 */
514int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
515{
516 int i;
517 int out;
518 int start;
519 unsigned long pages = 0;
520 int need = 0, nextneed;
521
522 BUG_ON(dir == DMA_NONE);
523 if (nents == 0)
524 return 0;
525
526 if (swiotlb)
527 return swiotlb_map_sg(dev,sg,nents,dir);
528 if (!dev)
529 dev = &fallback_dev;
530
531 out = 0;
532 start = 0;
533 for (i = 0; i < nents; i++) {
534 struct scatterlist *s = &sg[i];
535 dma_addr_t addr = page_to_phys(s->page) + s->offset;
536 s->dma_address = addr;
537 BUG_ON(s->length == 0);
538
539 nextneed = need_iommu(dev, addr, s->length);
540
541 /* Handle the previous not yet processed entries */
542 if (i > start) {
543 struct scatterlist *ps = &sg[i-1];
544 /* Can only merge when the last chunk ends on a page
545 boundary and the new one doesn't have an offset. */
546 if (!iommu_merge || !nextneed || !need || s->offset ||
547 (ps->offset + ps->length) % PAGE_SIZE) {
548 if (dma_map_cont(sg, start, i, sg+out, pages,
549 need) < 0)
550 goto error;
551 out++;
552 pages = 0;
553 start = i;
554 }
555 }
556
557 need = nextneed;
558 pages += to_pages(s->offset, s->length);
559 }
560 if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
561 goto error;
562 out++;
563 flush_gart(dev);
564 if (out < nents)
565 sg[out].dma_length = 0;
566 return out;
567
568error:
569 flush_gart(NULL);
570 dma_unmap_sg(dev, sg, nents, dir);
571 /* When it was forced try again unforced */
572 if (force_iommu)
573 return dma_map_sg_nonforce(dev, sg, nents, dir);
574 if (panic_on_overflow)
575 panic("dma_map_sg: overflow on %lu pages\n", pages);
576 iommu_full(dev, pages << PAGE_SHIFT, dir, 0);
577 for (i = 0; i < nents; i++)
578 sg[i].dma_address = bad_dma_address;
579 return 0;
580}
581
582/*
583 * Free a DMA mapping.
584 */
585void dma_unmap_single(struct device *dev, dma_addr_t dma_addr,
586 size_t size, int direction)
587{
588 unsigned long iommu_page;
589 int npages;
590 int i;
591
592 if (swiotlb) {
593 swiotlb_unmap_single(dev,dma_addr,size,direction);
594 return;
595 }
596
597 if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
598 dma_addr >= iommu_bus_base + iommu_size)
599 return;
600 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
601 npages = to_pages(dma_addr, size);
602 for (i = 0; i < npages; i++) {
603 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
604 CLEAR_LEAK(iommu_page + i);
605 }
606 free_iommu(iommu_page, npages);
607}
608
609/*
610 * Wrapper for pci_unmap_single working with scatterlists.
611 */
612void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
613{
614 int i;
615 if (swiotlb) {
616 swiotlb_unmap_sg(dev,sg,nents,dir);
617 return;
618 }
619 for (i = 0; i < nents; i++) {
620 struct scatterlist *s = &sg[i];
621 if (!s->dma_length || !s->length)
622 break;
623 dma_unmap_single(dev, s->dma_address, s->dma_length, dir);
624 }
625}
626
627int dma_supported(struct device *dev, u64 mask)
628{
629 /* Copied from i386. Doesn't make much sense, because it will
630 only work for pci_alloc_coherent.
631 The caller just has to use GFP_DMA in this case. */
632 if (mask < 0x00ffffff)
633 return 0;
634
635 /* Tell the device to use SAC when IOMMU force is on.
636 This allows the driver to use cheaper accesses in some cases.
637
638 Problem with this is that if we overflow the IOMMU area
639 and return DAC as fallback address the device may not handle it correctly.
640
641 As a special case some controllers have a 39bit address mode
642 that is as efficient as 32bit (aic79xx). Don't force SAC for these.
643 Assume all masks <= 40 bits are of this type. Normally this doesn't
644 make any difference, but gives more gentle handling of IOMMU overflow. */
645 if (iommu_sac_force && (mask >= 0xffffffffffULL)) {
646 printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
647 return 0;
648 }
649
650 return 1;
651}
652
653int dma_get_cache_alignment(void)
654{
655 return boot_cpu_data.x86_clflush_size;
656}
657
658EXPORT_SYMBOL(dma_unmap_sg);
659EXPORT_SYMBOL(dma_map_sg);
660EXPORT_SYMBOL(dma_map_single);
661EXPORT_SYMBOL(dma_unmap_single);
662EXPORT_SYMBOL(dma_supported);
663EXPORT_SYMBOL(no_iommu);
664EXPORT_SYMBOL(force_iommu);
665EXPORT_SYMBOL(bad_dma_address);
666EXPORT_SYMBOL(iommu_bio_merge);
667EXPORT_SYMBOL(iommu_sac_force);
668EXPORT_SYMBOL(dma_get_cache_alignment);
669EXPORT_SYMBOL(dma_alloc_coherent);
670EXPORT_SYMBOL(dma_free_coherent);
671
672static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
673{
674 unsigned long a;
675 if (!iommu_size) {
676 iommu_size = aper_size;
677 if (!no_agp)
678 iommu_size /= 2;
679 }
680
681 a = aper + iommu_size;
682 iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a;
683
684 if (iommu_size < 64*1024*1024)
685 printk(KERN_WARNING
686 "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20);
687
688 return iommu_size;
689}
690
691static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
692{
693 unsigned aper_size = 0, aper_base_32;
694 u64 aper_base;
695 unsigned aper_order;
696
697 pci_read_config_dword(dev, 0x94, &aper_base_32);
698 pci_read_config_dword(dev, 0x90, &aper_order);
699 aper_order = (aper_order >> 1) & 7;
700
701 aper_base = aper_base_32 & 0x7fff;
702 aper_base <<= 25;
703
704 aper_size = (32 * 1024 * 1024) << aper_order;
705 if (aper_base + aper_size >= 0xffffffff || !aper_size)
706 aper_base = 0;
707
708 *size = aper_size;
709 return aper_base;
710}
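/*
 * Worked example (illustrative): if config register 0x90 reads back
 * 0x0004 then aper_order = (0x0004 >> 1) & 7 = 2, i.e. a
 * (32MB << 2) = 128MB aperture; register 0x94 holds the base in 32MB
 * units, so a value of 0x40 decodes to 0x40 << 25 = 2GB.
 */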
711
712/*
713 * Private Northbridge GATT initialization in case we cannot use the
714 * AGP driver for some reason.
715 */
716static __init int init_k8_gatt(struct agp_kern_info *info)
717{
718 struct pci_dev *dev;
719 void *gatt;
720 unsigned aper_base, new_aper_base;
721 unsigned aper_size, gatt_size, new_aper_size;
722
723 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
724 aper_size = aper_base = info->aper_size = 0;
725 for_all_nb(dev) {
726 new_aper_base = read_aperture(dev, &new_aper_size);
727 if (!new_aper_base)
728 goto nommu;
729
730 if (!aper_base) {
731 aper_size = new_aper_size;
732 aper_base = new_aper_base;
733 }
734 if (aper_size != new_aper_size || aper_base != new_aper_base)
735 goto nommu;
736 }
737 if (!aper_base)
738 goto nommu;
739 info->aper_base = aper_base;
740 info->aper_size = aper_size>>20;
741
742 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
743 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size));
744 if (!gatt)
745 panic("Cannot allocate GATT table");
746 memset(gatt, 0, gatt_size);
747 agp_gatt_table = gatt;
748
749 for_all_nb(dev) {
750 u32 ctl;
751 u32 gatt_reg;
752
753 gatt_reg = __pa(gatt) >> 12;
754 gatt_reg <<= 4;
755 pci_write_config_dword(dev, 0x98, gatt_reg);
756 pci_read_config_dword(dev, 0x90, &ctl);
757
758 ctl |= 1;
759 ctl &= ~((1<<4) | (1<<5));
760
761 pci_write_config_dword(dev, 0x90, ctl);
762 }
763 flush_gart(NULL);
764
765 printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10);
766 return 0;
767
768 nommu:
769 /* Should not happen anymore */
770 printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
771 	       KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n");
772 return -1;
773}
774
775extern int agp_amd64_init(void);
776
777static int __init pci_iommu_init(void)
778{
779 struct agp_kern_info info;
780 unsigned long aper_size;
781 unsigned long iommu_start;
782 struct pci_dev *dev;
783 unsigned long scratch;
784 long i;
785
786#ifndef CONFIG_AGP_AMD64
787 no_agp = 1;
788#else
789 /* Makefile puts PCI initialization via subsys_initcall first. */
790 /* Add other K8 AGP bridge drivers here */
791 no_agp = no_agp ||
792 (agp_amd64_init() < 0) ||
793 (agp_copy_info(agp_bridge, &info) < 0);
794#endif
795
796 if (swiotlb) {
797 no_iommu = 1;
798 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
799 return -1;
800 }
801
802 if (no_iommu ||
803 (!force_iommu && end_pfn < 0xffffffff>>PAGE_SHIFT) ||
804 !iommu_aperture ||
805 (no_agp && init_k8_gatt(&info) < 0)) {
806 printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n");
807 no_iommu = 1;
808 return -1;
809 }
810
811 aper_size = info.aper_size * 1024 * 1024;
812 iommu_size = check_iommu_size(info.aper_base, aper_size);
813 iommu_pages = iommu_size >> PAGE_SHIFT;
814
815 iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL,
816 get_order(iommu_pages/8));
817 if (!iommu_gart_bitmap)
818 panic("Cannot allocate iommu bitmap\n");
819 memset(iommu_gart_bitmap, 0, iommu_pages/8);
820
821#ifdef CONFIG_IOMMU_LEAK
822 if (leak_trace) {
823 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL,
824 get_order(iommu_pages*sizeof(void *)));
825 if (iommu_leak_tab)
826 			memset(iommu_leak_tab, 0, iommu_pages * sizeof(void *));
827 else
828 printk("PCI-DMA: Cannot allocate leak trace area\n");
829 }
830#endif
831
832 /*
833 * Out of IOMMU space handling.
834 * Reserve some invalid pages at the beginning of the GART.
835 */
836 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
837
838 agp_memory_reserved = iommu_size;
839 printk(KERN_INFO
840 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
841 iommu_size>>20);
842
843 iommu_start = aper_size - iommu_size;
844 iommu_bus_base = info.aper_base + iommu_start;
845 bad_dma_address = iommu_bus_base;
846 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
847
848 /*
849 * Unmap the IOMMU part of the GART. The alias of the page is
850 * always mapped with cache enabled and there is no full cache
851 * coherency across the GART remapping. The unmapping avoids
852 * automatic prefetches from the CPU allocating cache lines in
853 * there. All CPU accesses are done via the direct mapping to
854 * the backing memory. The GART address is only used by PCI
855 * devices.
856 */
857 clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size);
858
859 /*
860  * Try to work around a bug (thanks to BenH):
861  * set unmapped entries to a scratch page instead of 0.
862  * Any prefetches that hit unmapped entries won't get a bus abort
863  * then.
864 */
865 scratch = get_zeroed_page(GFP_KERNEL);
866 if (!scratch)
867 panic("Cannot allocate iommu scratch page");
868 gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
869 for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
870 iommu_gatt_base[i] = gart_unmapped_entry;
871
872 for_all_nb(dev) {
873 u32 flag;
874 int cpu = PCI_SLOT(dev->devfn) - 24;
875 if (cpu >= MAX_NB)
876 continue;
877 northbridges[cpu] = dev;
878 pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */
879 northbridge_flush_word[cpu] = flag;
880 }
881
882 flush_gart(NULL);
883
884 return 0;
885}
886
887/* Must execute after PCI subsystem */
888fs_initcall(pci_iommu_init);
889
890/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
891    [,forcesac][,fullflush][,nofullflush][,nomerge][,biomerge][,panic][,nopanic][,allowed][,soft][,noaperture]
892 size set size of iommu (in bytes)
893 noagp don't initialize the AGP driver and use full aperture.
894 off don't use the IOMMU
895 leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
896    memaper[=order] allocate a private aperture over RAM of size 32MB<<order.
897 noforce don't force IOMMU usage. Default.
898 force Force IOMMU.
899 merge Do lazy merging. This may improve performance on some block devices.
900 Implies force (experimental)
901 biomerge Do merging at the BIO layer. This is more efficient than merge,
902             but should only be done with very big IOMMUs. Implies merge,force.
903 nomerge Don't do SG merging.
904    forcesac Force SAC mode for masks <40bits (experimental)
905 fullflush Flush IOMMU on each allocation (default)
906 nofullflush Don't use IOMMU fullflush
907    allowed Override iommu-off workarounds for specific chipsets.
908 soft Use software bounce buffering (default for Intel machines)
909 noaperture Don't touch the aperture for AGP.
910*/
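/* Example (hypothetical command line): "iommu=force,memaper=2,merge"
   forces the IOMMU on, requests a private (32MB << 2) = 128MB fallback
   aperture over RAM and enables lazy SG merging. The parser below scans
   comma separated tokens. */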
911__init int iommu_setup(char *p)
912{
913 int arg;
914
915 while (*p) {
916 if (!strncmp(p,"noagp",5))
917 no_agp = 1;
918 if (!strncmp(p,"off",3))
919 no_iommu = 1;
920 if (!strncmp(p,"force",5)) {
921 force_iommu = 1;
922 iommu_aperture_allowed = 1;
923 }
924 if (!strncmp(p,"allowed",7))
925 iommu_aperture_allowed = 1;
926 if (!strncmp(p,"noforce",7)) {
927 iommu_merge = 0;
928 force_iommu = 0;
929 }
930 if (!strncmp(p, "memaper", 7)) {
931 fallback_aper_force = 1;
932 p += 7;
933 if (*p == '=') {
934 ++p;
935 if (get_option(&p, &arg))
936 fallback_aper_order = arg;
937 }
938 }
939 if (!strncmp(p, "biomerge",8)) {
940 iommu_bio_merge = 4096;
941 iommu_merge = 1;
942 force_iommu = 1;
943 }
944 if (!strncmp(p, "panic",5))
945 panic_on_overflow = 1;
946 if (!strncmp(p, "nopanic",7))
947 panic_on_overflow = 0;
948 if (!strncmp(p, "merge",5)) {
949 iommu_merge = 1;
950 force_iommu = 1;
951 }
952 if (!strncmp(p, "nomerge",7))
953 iommu_merge = 0;
954 if (!strncmp(p, "forcesac",8))
955 iommu_sac_force = 1;
956 		if (!strncmp(p, "fullflush",9))
957 iommu_fullflush = 1;
958 if (!strncmp(p, "nofullflush",11))
959 iommu_fullflush = 0;
960 if (!strncmp(p, "soft",4))
961 swiotlb = 1;
962 if (!strncmp(p, "noaperture",10))
963 fix_aperture = 0;
964#ifdef CONFIG_IOMMU_LEAK
965 if (!strncmp(p,"leak",4)) {
966 leak_trace = 1;
967 p += 4;
968 if (*p == '=') ++p;
969 if (isdigit(*p) && get_option(&p, &arg))
970 iommu_leak_pages = arg;
971 } else
972#endif
973 if (isdigit(*p) && get_option(&p, &arg))
974 iommu_size = arg;
975 p += strcspn(p, ",");
976 if (*p == ',')
977 ++p;
978 }
979 return 1;
980}
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
new file mode 100644
index 000000000000..67d90b89af0b
--- /dev/null
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -0,0 +1,94 @@
1/* Fallback functions when the main IOMMU code is not compiled in. This
2 code is roughly equivalent to i386. */
3#include <linux/mm.h>
4#include <linux/init.h>
5#include <linux/pci.h>
6#include <linux/string.h>
7#include <asm/proto.h>
8#include <asm/processor.h>
9
10int iommu_merge = 0;
11EXPORT_SYMBOL(iommu_merge);
12
13dma_addr_t bad_dma_address;
14EXPORT_SYMBOL(bad_dma_address);
15
16int iommu_bio_merge = 0;
17EXPORT_SYMBOL(iommu_bio_merge);
18
19int iommu_sac_force = 0;
20EXPORT_SYMBOL(iommu_sac_force);
21
22/*
23 * Dummy IO MMU functions
24 */
25
26void *dma_alloc_coherent(struct device *hwdev, size_t size,
27 dma_addr_t *dma_handle, unsigned gfp)
28{
29 void *ret;
30 u64 mask;
31 int order = get_order(size);
32
33 if (hwdev)
34 mask = hwdev->coherent_dma_mask & *hwdev->dma_mask;
35 else
36 mask = 0xffffffff;
37 for (;;) {
38 ret = (void *)__get_free_pages(gfp, order);
39 if (ret == NULL)
40 return NULL;
41 *dma_handle = virt_to_bus(ret);
42 if ((*dma_handle & ~mask) == 0)
43 break;
44 free_pages((unsigned long)ret, order);
45 if (gfp & GFP_DMA)
46 return NULL;
47 gfp |= GFP_DMA;
48 }
49
50 memset(ret, 0, size);
51 return ret;
52}
53EXPORT_SYMBOL(dma_alloc_coherent);
54
55void dma_free_coherent(struct device *hwdev, size_t size,
56 void *vaddr, dma_addr_t dma_handle)
57{
58 free_pages((unsigned long)vaddr, get_order(size));
59}
60EXPORT_SYMBOL(dma_free_coherent);
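/*
 * Minimal usage sketch (hypothetical caller, not part of this file):
 * the GFP_DMA retry loop above is invisible to drivers, which simply
 * allocate, check for NULL and later free with the same size:
 */
#if 0
static int example_dma_buffer(struct device *dev)
{
	dma_addr_t bus;
	void *buf = dma_alloc_coherent(dev, 4096, &bus, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;	/* nothing allocatable below the device's mask */
	/* ... hand 'bus' to the device, touch 'buf' from the CPU ... */
	dma_free_coherent(dev, 4096, buf, bus);
	return 0;
}
#endif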
61
62int dma_supported(struct device *hwdev, u64 mask)
63{
64 /*
65 * we fall back to GFP_DMA when the mask isn't all 1s,
66 * so we can't guarantee allocations that must be
67 * within a tighter range than GFP_DMA..
68 * RED-PEN this won't work for pci_map_single. Caller has to
69 * use GFP_DMA in the first place.
70 */
71 if (mask < 0x00ffffff)
72 return 0;
73
74 return 1;
75}
76EXPORT_SYMBOL(dma_supported);
77
78int dma_get_cache_alignment(void)
79{
80 return boot_cpu_data.x86_clflush_size;
81}
82EXPORT_SYMBOL(dma_get_cache_alignment);
83
84static int __init check_ram(void)
85{
86 if (end_pfn >= 0xffffffff>>PAGE_SHIFT) {
87 printk(
88 	KERN_ERR "WARNING: more than 4GB of memory but IOMMU not compiled in.\n"
89 	KERN_ERR "WARNING: 32bit PCI may malfunction.\n");
90 }
91 return 0;
92}
93__initcall(check_ram);
94
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
new file mode 100644
index 000000000000..9922d2ba24a3
--- /dev/null
+++ b/arch/x86_64/kernel/process.c
@@ -0,0 +1,770 @@
1/*
2 * linux/arch/x86-64/kernel/process.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 *
9 * X86-64 port
10 * Andi Kleen.
11 *
12 * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
13 */
14
15/*
16 * This file handles the architecture-dependent parts of process handling..
17 */
18
19#include <stdarg.h>
20
21#include <linux/errno.h>
22#include <linux/sched.h>
23#include <linux/kernel.h>
24#include <linux/mm.h>
25#include <linux/elfcore.h>
26#include <linux/smp.h>
27#include <linux/slab.h>
28#include <linux/user.h>
29#include <linux/module.h>
30#include <linux/a.out.h>
31#include <linux/interrupt.h>
32#include <linux/delay.h>
33#include <linux/irq.h>
34#include <linux/ptrace.h>
35#include <linux/utsname.h>
36#include <linux/random.h>
37
38#include <asm/uaccess.h>
39#include <asm/pgtable.h>
40#include <asm/system.h>
41#include <asm/io.h>
42#include <asm/processor.h>
43#include <asm/i387.h>
44#include <asm/mmu_context.h>
45#include <asm/pda.h>
46#include <asm/prctl.h>
47#include <asm/kdebug.h>
48#include <asm/desc.h>
49#include <asm/proto.h>
50#include <asm/ia32.h>
51
52asmlinkage extern void ret_from_fork(void);
53
54unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
55
56static atomic_t hlt_counter = ATOMIC_INIT(0);
57
58unsigned long boot_option_idle_override = 0;
59EXPORT_SYMBOL(boot_option_idle_override);
60
61/*
62 * Powermanagement idle function, if any..
63 */
64void (*pm_idle)(void);
65static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
66
67void disable_hlt(void)
68{
69 atomic_inc(&hlt_counter);
70}
71
72EXPORT_SYMBOL(disable_hlt);
73
74void enable_hlt(void)
75{
76 atomic_dec(&hlt_counter);
77}
78
79EXPORT_SYMBOL(enable_hlt);
80
81/*
82 * We use this if we don't have any better
83 * idle routine..
84 */
85void default_idle(void)
86{
87 if (!atomic_read(&hlt_counter)) {
88 local_irq_disable();
89 if (!need_resched())
90 safe_halt();
91 else
92 local_irq_enable();
93 }
94}
95
96/*
97 * On SMP it's slightly faster (but much more power-consuming!)
98 * to poll the ->need_resched flag instead of waiting for the
99 * cross-CPU IPI to arrive. Use this option with caution.
100 */
101static void poll_idle (void)
102{
103 int oldval;
104
105 local_irq_enable();
106
107 /*
108 * Deal with another CPU just having chosen a thread to
109 * run here:
110 */
111 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
112
113 if (!oldval) {
114 set_thread_flag(TIF_POLLING_NRFLAG);
115 asm volatile(
116 "2:"
117 "testl %0,%1;"
118 "rep; nop;"
119 "je 2b;"
120 : :
121 "i" (_TIF_NEED_RESCHED),
122 "m" (current_thread_info()->flags));
123 } else {
124 set_need_resched();
125 }
126}
127
128void cpu_idle_wait(void)
129{
130 unsigned int cpu, this_cpu = get_cpu();
131 cpumask_t map;
132
133 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
134 put_cpu();
135
136 cpus_clear(map);
137 for_each_online_cpu(cpu) {
138 per_cpu(cpu_idle_state, cpu) = 1;
139 cpu_set(cpu, map);
140 }
141
142 __get_cpu_var(cpu_idle_state) = 0;
143
144 wmb();
145 do {
146 ssleep(1);
147 for_each_online_cpu(cpu) {
148 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
149 cpu_clear(cpu, map);
150 }
151 cpus_and(map, map, cpu_online_map);
152 } while (!cpus_empty(map));
153}
154EXPORT_SYMBOL_GPL(cpu_idle_wait);
155
156/*
157 * The idle thread. There's no useful work to be
158 * done, so just try to conserve power and have a
159 * low exit latency (ie sit in a loop waiting for
160 * somebody to say that they'd like to reschedule)
161 */
162void cpu_idle (void)
163{
164 /* endless idle loop with no priority at all */
165 while (1) {
166 while (!need_resched()) {
167 void (*idle)(void);
168
169 if (__get_cpu_var(cpu_idle_state))
170 __get_cpu_var(cpu_idle_state) = 0;
171
172 rmb();
173 idle = pm_idle;
174 if (!idle)
175 idle = default_idle;
176 idle();
177 }
178
179 schedule();
180 }
181}
182
183/*
184 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
185  * which can obviate the IPI used to trigger checking of need_resched.
186 * We execute MONITOR against need_resched and enter optimized wait state
187 * through MWAIT. Whenever someone changes need_resched, we would be woken
188 * up from MWAIT (without an IPI).
189 */
190static void mwait_idle(void)
191{
192 local_irq_enable();
193
194 if (!need_resched()) {
195 set_thread_flag(TIF_POLLING_NRFLAG);
196 do {
197 __monitor((void *)&current_thread_info()->flags, 0, 0);
198 if (need_resched())
199 break;
200 __mwait(0, 0);
201 } while (!need_resched());
202 clear_thread_flag(TIF_POLLING_NRFLAG);
203 }
204}
205
206void __init select_idle_routine(const struct cpuinfo_x86 *c)
207{
208 static int printed;
209 if (cpu_has(c, X86_FEATURE_MWAIT)) {
210 /*
211 * Skip, if setup has overridden idle.
212 		 * One CPU supports mwait => all CPUs support mwait
213 */
214 if (!pm_idle) {
215 if (!printed) {
216 printk("using mwait in idle threads.\n");
217 printed = 1;
218 }
219 pm_idle = mwait_idle;
220 }
221 }
222}
223
224static int __init idle_setup (char *str)
225{
226 if (!strncmp(str, "poll", 4)) {
227 printk("using polling idle threads.\n");
228 pm_idle = poll_idle;
229 }
230
231 boot_option_idle_override = 1;
232 return 1;
233}
234
235__setup("idle=", idle_setup);
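/* Example (hypothetical): booting with "idle=poll" installs poll_idle
   above; pm_idle is then non-NULL, so select_idle_routine() will not
   replace it with mwait_idle later. */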
236
237/* Prints also some state that isn't saved in the pt_regs */
238void __show_regs(struct pt_regs * regs)
239{
240 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
241 unsigned int fsindex,gsindex;
242 unsigned int ds,cs,es;
243
244 printk("\n");
245 print_modules();
246 printk("Pid: %d, comm: %.20s %s %s\n",
247 current->pid, current->comm, print_tainted(), system_utsname.release);
248 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
249 printk_address(regs->rip);
250 printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
251 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
252 regs->rax, regs->rbx, regs->rcx);
253 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
254 regs->rdx, regs->rsi, regs->rdi);
255 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
256 regs->rbp, regs->r8, regs->r9);
257 printk("R10: %016lx R11: %016lx R12: %016lx\n",
258 regs->r10, regs->r11, regs->r12);
259 printk("R13: %016lx R14: %016lx R15: %016lx\n",
260 regs->r13, regs->r14, regs->r15);
261
262 asm("movl %%ds,%0" : "=r" (ds));
263 asm("movl %%cs,%0" : "=r" (cs));
264 asm("movl %%es,%0" : "=r" (es));
265 asm("movl %%fs,%0" : "=r" (fsindex));
266 asm("movl %%gs,%0" : "=r" (gsindex));
267
268 rdmsrl(MSR_FS_BASE, fs);
269 rdmsrl(MSR_GS_BASE, gs);
270 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
271
272 asm("movq %%cr0, %0": "=r" (cr0));
273 asm("movq %%cr2, %0": "=r" (cr2));
274 asm("movq %%cr3, %0": "=r" (cr3));
275 asm("movq %%cr4, %0": "=r" (cr4));
276
277 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
278 fs,fsindex,gs,gsindex,shadowgs);
279 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
280 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
281}
282
283void show_regs(struct pt_regs *regs)
284{
285 __show_regs(regs);
286 show_trace(&regs->rsp);
287}
288
289/*
290 * Free current thread data structures etc..
291 */
292void exit_thread(void)
293{
294 struct task_struct *me = current;
295 struct thread_struct *t = &me->thread;
296 if (me->thread.io_bitmap_ptr) {
297 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
298
299 kfree(t->io_bitmap_ptr);
300 t->io_bitmap_ptr = NULL;
301 /*
302 * Careful, clear this in the TSS too:
303 */
304 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
305 t->io_bitmap_max = 0;
306 put_cpu();
307 }
308}
309
310void flush_thread(void)
311{
312 struct task_struct *tsk = current;
313 struct thread_info *t = current_thread_info();
314
315 if (t->flags & _TIF_ABI_PENDING)
316 t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
317
318 tsk->thread.debugreg0 = 0;
319 tsk->thread.debugreg1 = 0;
320 tsk->thread.debugreg2 = 0;
321 tsk->thread.debugreg3 = 0;
322 tsk->thread.debugreg6 = 0;
323 tsk->thread.debugreg7 = 0;
324 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
325 /*
326 * Forget coprocessor state..
327 */
328 clear_fpu(tsk);
329 clear_used_math();
330}
331
332void release_thread(struct task_struct *dead_task)
333{
334 if (dead_task->mm) {
335 if (dead_task->mm->context.size) {
336 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
337 dead_task->comm,
338 dead_task->mm->context.ldt,
339 dead_task->mm->context.size);
340 BUG();
341 }
342 }
343}
344
345static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
346{
347 struct user_desc ud = {
348 .base_addr = addr,
349 .limit = 0xfffff,
350 .seg_32bit = 1,
351 .limit_in_pages = 1,
352 .useable = 1,
353 };
354 struct n_desc_struct *desc = (void *)t->thread.tls_array;
355 desc += tls;
356 desc->a = LDT_entry_a(&ud);
357 desc->b = LDT_entry_b(&ud);
358}
359
360static inline u32 read_32bit_tls(struct task_struct *t, int tls)
361{
362 struct desc_struct *desc = (void *)t->thread.tls_array;
363 desc += tls;
364 return desc->base0 |
365 (((u32)desc->base1) << 16) |
366 (((u32)desc->base2) << 24);
367}
368
369/*
370 * This gets called before we allocate a new thread and copy
371 * the current task into it.
372 */
373void prepare_to_copy(struct task_struct *tsk)
374{
375 unlazy_fpu(tsk);
376}
377
378int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
379 unsigned long unused,
380 struct task_struct * p, struct pt_regs * regs)
381{
382 int err;
383 struct pt_regs * childregs;
384 struct task_struct *me = current;
385
386 childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
387
388 *childregs = *regs;
389
390 childregs->rax = 0;
391 childregs->rsp = rsp;
392 if (rsp == ~0UL) {
393 childregs->rsp = (unsigned long)childregs;
394 }
395
396 p->thread.rsp = (unsigned long) childregs;
397 p->thread.rsp0 = (unsigned long) (childregs+1);
398 p->thread.userrsp = me->thread.userrsp;
399
400 set_ti_thread_flag(p->thread_info, TIF_FORK);
401
402 p->thread.fs = me->thread.fs;
403 p->thread.gs = me->thread.gs;
404
405 asm("movl %%gs,%0" : "=m" (p->thread.gsindex));
406 asm("movl %%fs,%0" : "=m" (p->thread.fsindex));
407 asm("movl %%es,%0" : "=m" (p->thread.es));
408 asm("movl %%ds,%0" : "=m" (p->thread.ds));
409
410 if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
411 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
412 if (!p->thread.io_bitmap_ptr) {
413 p->thread.io_bitmap_max = 0;
414 return -ENOMEM;
415 }
416 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
417 }
418
419 /*
420 * Set a new TLS for the child thread?
421 */
422 if (clone_flags & CLONE_SETTLS) {
423#ifdef CONFIG_IA32_EMULATION
424 if (test_thread_flag(TIF_IA32))
425 err = ia32_child_tls(p, childregs);
426 else
427#endif
428 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
429 if (err)
430 goto out;
431 }
432 err = 0;
433out:
434 if (err && p->thread.io_bitmap_ptr) {
435 kfree(p->thread.io_bitmap_ptr);
436 p->thread.io_bitmap_max = 0;
437 }
438 return err;
439}
440
441/*
442 * This special macro can be used to load a debugging register
443 */
444#define loaddebug(thread,r) set_debug(thread->debugreg ## r, r)
445
446/*
447 * switch_to(x,y) should switch tasks from x to y.
448 *
449 * This could still be optimized:
450 * - fold all the options into a flag word and test it with a single test.
451 * - could test fs/gs bitsliced
452 */
453struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
454{
455 struct thread_struct *prev = &prev_p->thread,
456 *next = &next_p->thread;
457 int cpu = smp_processor_id();
458 struct tss_struct *tss = &per_cpu(init_tss, cpu);
459
460 unlazy_fpu(prev_p);
461
462 /*
463 * Reload esp0, LDT and the page table pointer:
464 */
465 tss->rsp0 = next->rsp0;
466
467 /*
468 * Switch DS and ES.
469 * This won't pick up thread selector changes, but I guess that is ok.
470 */
471 asm volatile("movl %%es,%0" : "=m" (prev->es));
472 if (unlikely(next->es | prev->es))
473 loadsegment(es, next->es);
474
475 asm volatile ("movl %%ds,%0" : "=m" (prev->ds));
476 if (unlikely(next->ds | prev->ds))
477 loadsegment(ds, next->ds);
478
479 load_TLS(next, cpu);
480
481 /*
482 * Switch FS and GS.
483 */
484 {
485 unsigned fsindex;
486 asm volatile("movl %%fs,%0" : "=r" (fsindex));
487 /* segment register != 0 always requires a reload.
488 also reload when it has changed.
489 when prev process used 64bit base always reload
490 to avoid an information leak. */
491 if (unlikely(fsindex | next->fsindex | prev->fs)) {
492 loadsegment(fs, next->fsindex);
493 /* check if the user used a selector != 0
494 * if yes clear 64bit base, since overloaded base
495 * is always mapped to the Null selector
496 */
497 if (fsindex)
498 prev->fs = 0;
499 }
500 /* when next process has a 64bit base use it */
501 if (next->fs)
502 wrmsrl(MSR_FS_BASE, next->fs);
503 prev->fsindex = fsindex;
504 }
505 {
506 unsigned gsindex;
507 asm volatile("movl %%gs,%0" : "=r" (gsindex));
508 if (unlikely(gsindex | next->gsindex | prev->gs)) {
509 load_gs_index(next->gsindex);
510 if (gsindex)
511 prev->gs = 0;
512 }
513 if (next->gs)
514 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
515 prev->gsindex = gsindex;
516 }
517
518 /*
519 * Switch the PDA context.
520 */
521 prev->userrsp = read_pda(oldrsp);
522 write_pda(oldrsp, next->userrsp);
523 write_pda(pcurrent, next_p);
524 write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
525
526 /*
527 * Now maybe reload the debug registers
528 */
529 if (unlikely(next->debugreg7)) {
530 loaddebug(next, 0);
531 loaddebug(next, 1);
532 loaddebug(next, 2);
533 loaddebug(next, 3);
534 /* no 4 and 5 */
535 loaddebug(next, 6);
536 loaddebug(next, 7);
537 }
538
539
540 /*
541 * Handle the IO bitmap
542 */
543 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
544 if (next->io_bitmap_ptr)
545 /*
546 * Copy the relevant range of the IO bitmap.
547 * Normally this is 128 bytes or less:
548 */
549 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
550 max(prev->io_bitmap_max, next->io_bitmap_max));
551 else {
552 /*
553 * Clear any possible leftover bits:
554 */
555 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
556 }
557 }
558
559 return prev_p;
560}
561
562/*
563 * sys_execve() executes a new program.
564 */
565asmlinkage
566long sys_execve(char __user *name, char __user * __user *argv,
567 char __user * __user *envp, struct pt_regs regs)
568{
569 long error;
570 char * filename;
571
572 filename = getname(name);
573 error = PTR_ERR(filename);
574 if (IS_ERR(filename))
575 return error;
576 error = do_execve(filename, argv, envp, &regs);
577 if (error == 0) {
578 task_lock(current);
579 current->ptrace &= ~PT_DTRACE;
580 task_unlock(current);
581 }
582 putname(filename);
583 return error;
584}
585
586void set_personality_64bit(void)
587{
588 /* inherit personality from parent */
589
590 /* Make sure to be in 64bit mode */
591 clear_thread_flag(TIF_IA32);
592
593 /* TBD: overwrites user setup. Should have two bits.
594 But 64bit processes have always behaved this way,
595 so it's not too bad. The main problem is just that
596    32bit children are affected again. */
597 current->personality &= ~READ_IMPLIES_EXEC;
598}
599
600asmlinkage long sys_fork(struct pt_regs *regs)
601{
602 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
603}
604
605asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
606{
607 if (!newsp)
608 newsp = regs->rsp;
609 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
610}
611
612/*
613 * This is trivial, and on the face of it looks like it
614 * could equally well be done in user mode.
615 *
616 * Not so, for quite unobvious reasons - register pressure.
617 * In user mode vfork() cannot have a stack frame, and if
618 * done by calling the "clone()" system call directly, you
619 * do not have enough call-clobbered registers to hold all
620 * the information you need.
621 */
622asmlinkage long sys_vfork(struct pt_regs *regs)
623{
624 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
625 NULL, NULL);
626}
627
628unsigned long get_wchan(struct task_struct *p)
629{
630 unsigned long stack;
631 u64 fp,rip;
632 int count = 0;
633
634 if (!p || p == current || p->state==TASK_RUNNING)
635 return 0;
636 stack = (unsigned long)p->thread_info;
637 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
638 return 0;
639 fp = *(u64 *)(p->thread.rsp);
640 do {
641 if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
642 return 0;
643 rip = *(u64 *)(fp+8);
644 if (!in_sched_functions(rip))
645 return rip;
646 fp = *(u64 *)fp;
647 } while (count++ < 16);
648 return 0;
649}
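/*
 * Editorial note on the walk above: it assumes the scheduler path is
 * built with frame pointers, so each frame stores the caller's frame
 * pointer at fp and the return address at fp+8; the first return
 * address outside the scheduler functions is reported as the wait
 * channel.
 */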
650
651long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
652{
653 int ret = 0;
654 int doit = task == current;
655 int cpu;
656
657 switch (code) {
658 case ARCH_SET_GS:
659 if (addr >= TASK_SIZE)
660 return -EPERM;
661 cpu = get_cpu();
662 /* handle small bases via the GDT because that's faster to
663 switch. */
664 if (addr <= 0xffffffff) {
665 set_32bit_tls(task, GS_TLS, addr);
666 if (doit) {
667 load_TLS(&task->thread, cpu);
668 load_gs_index(GS_TLS_SEL);
669 }
670 task->thread.gsindex = GS_TLS_SEL;
671 task->thread.gs = 0;
672 } else {
673 task->thread.gsindex = 0;
674 task->thread.gs = addr;
675 if (doit) {
676 load_gs_index(0);
677 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
678 }
679 }
680 put_cpu();
681 break;
682 case ARCH_SET_FS:
683 /* Not strictly needed for fs, but do it for symmetry
684 with gs */
685 if (addr >= TASK_SIZE)
686 return -EPERM;
687 cpu = get_cpu();
688 /* handle small bases via the GDT because that's faster to
689 switch. */
690 if (addr <= 0xffffffff) {
691 set_32bit_tls(task, FS_TLS, addr);
692 if (doit) {
693 load_TLS(&task->thread, cpu);
694 asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
695 }
696 task->thread.fsindex = FS_TLS_SEL;
697 task->thread.fs = 0;
698 } else {
699 task->thread.fsindex = 0;
700 task->thread.fs = addr;
701 if (doit) {
702 /* set the selector to 0 to not confuse
703 __switch_to */
704 asm volatile("movl %0,%%fs" :: "r" (0));
705 ret = checking_wrmsrl(MSR_FS_BASE, addr);
706 }
707 }
708 put_cpu();
709 break;
710 case ARCH_GET_FS: {
711 unsigned long base;
712 if (task->thread.fsindex == FS_TLS_SEL)
713 base = read_32bit_tls(task, FS_TLS);
714 else if (doit) {
715 rdmsrl(MSR_FS_BASE, base);
716 } else
717 base = task->thread.fs;
718 ret = put_user(base, (unsigned long __user *)addr);
719 break;
720 }
721 case ARCH_GET_GS: {
722 unsigned long base;
723 if (task->thread.gsindex == GS_TLS_SEL)
724 base = read_32bit_tls(task, GS_TLS);
725 else if (doit) {
726 rdmsrl(MSR_KERNEL_GS_BASE, base);
727 } else
728 base = task->thread.gs;
729 ret = put_user(base, (unsigned long __user *)addr);
730 break;
731 }
732
733 default:
734 ret = -EINVAL;
735 break;
736 }
737
738 return ret;
739}
740
741long sys_arch_prctl(int code, unsigned long addr)
742{
743 return do_arch_prctl(current, code, addr);
744}
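/*
 * Userspace view (editorial sketch; assumes <sys/syscall.h> defines
 * SYS_arch_prctl and <asm/prctl.h> provides the ARCH_* codes):
 */
#if 0
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/prctl.h>

static unsigned long get_fs_base(void)
{
	unsigned long base = 0;

	/* note the argument order: code first, then the address */
	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
	return base;
}
#endif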
745
746/*
747 * Capture the user space registers if the task is not running (in user space)
748 */
749int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
750{
751 struct pt_regs *pp, ptregs;
752
753 pp = (struct pt_regs *)(tsk->thread.rsp0);
754 --pp;
755
756 ptregs = *pp;
757 ptregs.cs &= 0xffff;
758 ptregs.ss &= 0xffff;
759
760 elf_core_copy_regs(regs, &ptregs);
761
762 return 1;
763}
764
765unsigned long arch_align_stack(unsigned long sp)
766{
767 if (randomize_va_space)
768 sp -= get_random_int() % 8192;
769 return sp & ~0xf;
770}
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
new file mode 100644
index 000000000000..0b7b101debdf
--- /dev/null
+++ b/arch/x86_64/kernel/ptrace.c
@@ -0,0 +1,547 @@
1/* ptrace.c */
2/* By Ross Biro 1/23/92 */
3/*
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * x86-64 port 2000-2002 Andi Kleen
8 */
9
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/mm.h>
13#include <linux/smp.h>
14#include <linux/smp_lock.h>
15#include <linux/errno.h>
16#include <linux/ptrace.h>
17#include <linux/user.h>
18#include <linux/security.h>
19#include <linux/audit.h>
20#include <linux/seccomp.h>
21
22#include <asm/uaccess.h>
23#include <asm/pgtable.h>
24#include <asm/system.h>
25#include <asm/processor.h>
26#include <asm/i387.h>
27#include <asm/debugreg.h>
28#include <asm/ldt.h>
29#include <asm/desc.h>
30#include <asm/proto.h>
31#include <asm/ia32.h>
32
33/*
34  * does not yet catch signals sent when the child dies;
35  * that belongs in exit.c or in signal.c.
36 */
37
38/* determines which flags the user has access to. */
39/* 1 = access 0 = no access */
40#define FLAG_MASK 0x44dd5UL
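/* Editorial note: 0x44dd5 = CF|PF|AF|ZF|SF|TF|DF|OF|NT|AC, i.e. the
   arithmetic and direction flags plus the trap flag; IF, IOPL, RF, VM
   and the VIF/VIP/ID bits stay under kernel control. */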
41
42/* sets the trap flag. */
43#define TRAP_FLAG 0x100UL
44
45/*
46 * eflags and offset of eflags on child stack..
47 */
48#define EFLAGS offsetof(struct pt_regs, eflags)
49#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs)))
50
51/*
52  * this routine will get a word off the process's privileged stack.
53 * the offset is how far from the base addr as stored in the TSS.
54 * this routine assumes that all the privileged stacks are in our
55 * data space.
56 */
57static inline unsigned long get_stack_long(struct task_struct *task, int offset)
58{
59 unsigned char *stack;
60
61 stack = (unsigned char *)task->thread.rsp0;
62 stack += offset;
63 return (*((unsigned long *)stack));
64}
65
66/*
67  * this routine will put a word on the process's privileged stack.
68 * the offset is how far from the base addr as stored in the TSS.
69 * this routine assumes that all the privileged stacks are in our
70 * data space.
71 */
72static inline long put_stack_long(struct task_struct *task, int offset,
73 unsigned long data)
74{
75 unsigned char * stack;
76
77 stack = (unsigned char *) task->thread.rsp0;
78 stack += offset;
79 *(unsigned long *) stack = data;
80 return 0;
81}
82
83/*
84 * Called by kernel/ptrace.c when detaching..
85 *
86 * Make sure the single step bit is not set.
87 */
88void ptrace_disable(struct task_struct *child)
89{
90 long tmp;
91
92 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
93 tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG;
94 put_stack_long(child, EFL_OFFSET, tmp);
95}
96
97static int putreg(struct task_struct *child,
98 unsigned long regno, unsigned long value)
99{
100 unsigned long tmp;
101
102 /* Some code in the 64bit emulation may not be 64bit clean.
103 Don't take any chances. */
104 if (test_tsk_thread_flag(child, TIF_IA32))
105 value &= 0xffffffff;
106 switch (regno) {
107 case offsetof(struct user_regs_struct,fs):
108 if (value && (value & 3) != 3)
109 return -EIO;
110 child->thread.fsindex = value & 0xffff;
111 return 0;
112 case offsetof(struct user_regs_struct,gs):
113 if (value && (value & 3) != 3)
114 return -EIO;
115 child->thread.gsindex = value & 0xffff;
116 return 0;
117 case offsetof(struct user_regs_struct,ds):
118 if (value && (value & 3) != 3)
119 return -EIO;
120 child->thread.ds = value & 0xffff;
121 return 0;
122 case offsetof(struct user_regs_struct,es):
123 if (value && (value & 3) != 3)
124 return -EIO;
125 child->thread.es = value & 0xffff;
126 return 0;
127 case offsetof(struct user_regs_struct,ss):
128 if ((value & 3) != 3)
129 return -EIO;
130 value &= 0xffff;
131 return 0;
132 case offsetof(struct user_regs_struct,fs_base):
133 if (!((value >> 48) == 0 || (value >> 48) == 0xffff))
134 return -EIO;
135 child->thread.fs = value;
136 return 0;
137 case offsetof(struct user_regs_struct,gs_base):
138 if (!((value >> 48) == 0 || (value >> 48) == 0xffff))
139 return -EIO;
140 child->thread.gs = value;
141 return 0;
142 case offsetof(struct user_regs_struct, eflags):
143 value &= FLAG_MASK;
144 tmp = get_stack_long(child, EFL_OFFSET);
145 tmp &= ~FLAG_MASK;
146 value |= tmp;
147 break;
148 case offsetof(struct user_regs_struct,cs):
149 if ((value & 3) != 3)
150 return -EIO;
151 value &= 0xffff;
152 break;
153 }
154 put_stack_long(child, regno - sizeof(struct pt_regs), value);
155 return 0;
156}
157
158static unsigned long getreg(struct task_struct *child, unsigned long regno)
159{
160 unsigned long val;
161 switch (regno) {
162 case offsetof(struct user_regs_struct, fs):
163 return child->thread.fsindex;
164 case offsetof(struct user_regs_struct, gs):
165 return child->thread.gsindex;
166 case offsetof(struct user_regs_struct, ds):
167 return child->thread.ds;
168 case offsetof(struct user_regs_struct, es):
169 return child->thread.es;
170 case offsetof(struct user_regs_struct, fs_base):
171 return child->thread.fs;
172 case offsetof(struct user_regs_struct, gs_base):
173 return child->thread.gs;
174 default:
175 regno = regno - sizeof(struct pt_regs);
176 val = get_stack_long(child, regno);
177 if (test_tsk_thread_flag(child, TIF_IA32))
178 val &= 0xffffffff;
179 return val;
180 }
181
182}
183
184asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data)
185{
186 struct task_struct *child;
187 long i, ret;
188 unsigned ui;
189
190 /* This lock_kernel fixes a subtle race with suid exec */
191 lock_kernel();
192 ret = -EPERM;
193 if (request == PTRACE_TRACEME) {
194 /* are we already being traced? */
195 if (current->ptrace & PT_PTRACED)
196 goto out;
197 ret = security_ptrace(current->parent, current);
198 if (ret)
199 goto out;
200 /* set the ptrace bit in the process flags. */
201 current->ptrace |= PT_PTRACED;
202 ret = 0;
203 goto out;
204 }
205 ret = -ESRCH;
206 read_lock(&tasklist_lock);
207 child = find_task_by_pid(pid);
208 if (child)
209 get_task_struct(child);
210 read_unlock(&tasklist_lock);
211 if (!child)
212 goto out;
213
214 ret = -EPERM;
215 if (pid == 1) /* you may not mess with init */
216 goto out_tsk;
217
218 if (request == PTRACE_ATTACH) {
219 ret = ptrace_attach(child);
220 goto out_tsk;
221 }
222 ret = ptrace_check_attach(child, request == PTRACE_KILL);
223 if (ret < 0)
224 goto out_tsk;
225
226 switch (request) {
227 /* when I and D space are separate, these will need to be fixed. */
228 case PTRACE_PEEKTEXT: /* read word at location addr. */
229 case PTRACE_PEEKDATA: {
230 unsigned long tmp;
231 int copied;
232
233 copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
234 ret = -EIO;
235 if (copied != sizeof(tmp))
236 break;
237 ret = put_user(tmp,(unsigned long __user *) data);
238 break;
239 }
240
241 /* read the word at location addr in the USER area. */
242 case PTRACE_PEEKUSR: {
243 unsigned long tmp;
244
245 ret = -EIO;
246 if ((addr & 7) ||
247 addr > sizeof(struct user) - 7)
248 break;
249
250 switch (addr) {
251 case 0 ... sizeof(struct user_regs_struct):
252 tmp = getreg(child, addr);
253 break;
254 case offsetof(struct user, u_debugreg[0]):
255 tmp = child->thread.debugreg0;
256 break;
257 case offsetof(struct user, u_debugreg[1]):
258 tmp = child->thread.debugreg1;
259 break;
260 case offsetof(struct user, u_debugreg[2]):
261 tmp = child->thread.debugreg2;
262 break;
263 case offsetof(struct user, u_debugreg[3]):
264 tmp = child->thread.debugreg3;
265 break;
266 case offsetof(struct user, u_debugreg[6]):
267 tmp = child->thread.debugreg6;
268 break;
269 case offsetof(struct user, u_debugreg[7]):
270 tmp = child->thread.debugreg7;
271 break;
272 default:
273 tmp = 0;
274 break;
275 }
276 ret = put_user(tmp,(unsigned long __user *) data);
277 break;
278 }
279
280 /* when I and D space are separate, this will have to be fixed. */
281 case PTRACE_POKETEXT: /* write the word at location addr. */
282 case PTRACE_POKEDATA:
283 ret = 0;
284 if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data))
285 break;
286 ret = -EIO;
287 break;
288
289 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
290 ret = -EIO;
291 if ((addr & 7) ||
292 addr > sizeof(struct user) - 7)
293 break;
294
295 switch (addr) {
296 case 0 ... sizeof(struct user_regs_struct):
297 ret = putreg(child, addr, data);
298 break;
299 		  /* Disallow setting a breakpoint in the vsyscall area */
300 case offsetof(struct user, u_debugreg[0]):
301 if (data >= TASK_SIZE-7) break;
302 child->thread.debugreg0 = data;
303 ret = 0;
304 break;
305 case offsetof(struct user, u_debugreg[1]):
306 if (data >= TASK_SIZE-7) break;
307 child->thread.debugreg1 = data;
308 ret = 0;
309 break;
310 case offsetof(struct user, u_debugreg[2]):
311 if (data >= TASK_SIZE-7) break;
312 child->thread.debugreg2 = data;
313 ret = 0;
314 break;
315 case offsetof(struct user, u_debugreg[3]):
316 if (data >= TASK_SIZE-7) break;
317 child->thread.debugreg3 = data;
318 ret = 0;
319 break;
320 case offsetof(struct user, u_debugreg[6]):
321 if (data >> 32)
322 break;
323 child->thread.debugreg6 = data;
324 ret = 0;
325 break;
326 case offsetof(struct user, u_debugreg[7]):
327 /* See arch/i386/kernel/ptrace.c for an explanation of
328 * this awkward check.*/
329 data &= ~DR_CONTROL_RESERVED;
330 for(i=0; i<4; i++)
331 if ((0x5454 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
332 break;
333 if (i == 4) {
334 child->thread.debugreg7 = data;
335 ret = 0;
336 }
337 break;
338 }
339 break;
340 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
341 case PTRACE_CONT: { /* restart after signal. */
342 long tmp;
343
344 ret = -EIO;
345 if ((unsigned long) data > _NSIG)
346 break;
347 if (request == PTRACE_SYSCALL)
348 set_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
349 else
350 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
351 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
352 child->exit_code = data;
353 /* make sure the single step bit is not set. */
354 tmp = get_stack_long(child, EFL_OFFSET);
355 tmp &= ~TRAP_FLAG;
356 put_stack_long(child, EFL_OFFSET,tmp);
357 wake_up_process(child);
358 ret = 0;
359 break;
360 }
361
362#ifdef CONFIG_IA32_EMULATION
363 	/* This only makes sense with 32bit programs. Allow a
364 	   64bit debugger to fully examine them too. Better
365 	   not to use it against 64bit processes; use
366 	   PTRACE_ARCH_PRCTL instead. */
367 case PTRACE_SET_THREAD_AREA: {
368 struct user_desc __user *p;
369 int old;
370 p = (struct user_desc __user *)data;
371 get_user(old, &p->entry_number);
372 put_user(addr, &p->entry_number);
373 ret = do_set_thread_area(&child->thread, p);
374 put_user(old, &p->entry_number);
375 break;
376 case PTRACE_GET_THREAD_AREA:
377 p = (struct user_desc __user *)data;
378 get_user(old, &p->entry_number);
379 put_user(addr, &p->entry_number);
380 ret = do_get_thread_area(&child->thread, p);
381 put_user(old, &p->entry_number);
382 break;
383 }
384#endif
385 /* normal 64bit interface to access TLS data.
386 Works just like arch_prctl, except that the arguments
387 are reversed. */
388 case PTRACE_ARCH_PRCTL:
389 ret = do_arch_prctl(child, data, addr);
390 break;
391
392/*
393 * make the child exit. Best I can do is send it a sigkill.
394 * perhaps it should be put in the status that it wants to
395 * exit.
396 */
397 case PTRACE_KILL: {
398 long tmp;
399
400 ret = 0;
401 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
402 break;
403 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
404 child->exit_code = SIGKILL;
405 /* make sure the single step bit is not set. */
406 tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG;
407 put_stack_long(child, EFL_OFFSET, tmp);
408 wake_up_process(child);
409 break;
410 }
411
412 case PTRACE_SINGLESTEP: { /* set the trap flag. */
413 long tmp;
414
415 ret = -EIO;
416 if ((unsigned long) data > _NSIG)
417 break;
418 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
419 if ((child->ptrace & PT_DTRACE) == 0) {
420 /* Spurious delayed TF traps may occur */
421 child->ptrace |= PT_DTRACE;
422 }
423 tmp = get_stack_long(child, EFL_OFFSET) | TRAP_FLAG;
424 put_stack_long(child, EFL_OFFSET, tmp);
425 set_tsk_thread_flag(child, TIF_SINGLESTEP);
426 child->exit_code = data;
427 /* give it a chance to run. */
428 wake_up_process(child);
429 ret = 0;
430 break;
431 }
432
433 case PTRACE_DETACH:
434 /* detach a process that was attached. */
435 ret = ptrace_detach(child, data);
436 break;
437
438 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
439 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
440 sizeof(struct user_regs_struct))) {
441 ret = -EIO;
442 break;
443 }
444 ret = 0;
445 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
446 ret |= __put_user(getreg(child, ui),(unsigned long __user *) data);
447 data += sizeof(long);
448 }
449 break;
450 }
451
452 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
453 unsigned long tmp;
454 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
455 sizeof(struct user_regs_struct))) {
456 ret = -EIO;
457 break;
458 }
459 ret = 0;
460 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
461 ret |= __get_user(tmp, (unsigned long __user *) data);
462 putreg(child, ui, tmp);
463 data += sizeof(long);
464 }
465 break;
466 }
467
468 case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */
469 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
470 sizeof(struct user_i387_struct))) {
471 ret = -EIO;
472 break;
473 }
474 ret = get_fpregs((struct user_i387_struct __user *)data, child);
475 break;
476 }
477
478 case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */
479 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
480 sizeof(struct user_i387_struct))) {
481 ret = -EIO;
482 break;
483 }
484 set_stopped_child_used_math(child);
485 ret = set_fpregs(child, (struct user_i387_struct __user *)data);
486 break;
487 }
488
489 default:
490 ret = ptrace_request(child, request, addr, data);
491 break;
492 }
493out_tsk:
494 put_task_struct(child);
495out:
496 unlock_kernel();
497 return ret;
498}
499
500static void syscall_trace(struct pt_regs *regs)
501{
502
503#if 0
504 	printk("trace %s rip %lx rsp %lx rax %ld origrax %ld caller %lx tiflags %x ptrace %x\n",
505 current->comm,
506 regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0),
507 current_thread_info()->flags, current->ptrace);
508#endif
509
510 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
511 ? 0x80 : 0));
512 /*
513 * this isn't the same as continuing with a signal, but it will do
514 * for normal use. strace only continues with a signal if the
515 * stopping signal is not SIGTRAP. -brl
516 */
517 if (current->exit_code) {
518 send_sig(current->exit_code, current, 1);
519 current->exit_code = 0;
520 }
521}
522
523asmlinkage void syscall_trace_enter(struct pt_regs *regs)
524{
525 /* do the secure computing check first */
526 secure_computing(regs->orig_rax);
527
528 if (unlikely(current->audit_context))
529 audit_syscall_entry(current, regs->orig_rax,
530 regs->rdi, regs->rsi,
531 regs->rdx, regs->r10);
532
533 if (test_thread_flag(TIF_SYSCALL_TRACE)
534 && (current->ptrace & PT_PTRACED))
535 syscall_trace(regs);
536}
537
538asmlinkage void syscall_trace_leave(struct pt_regs *regs)
539{
540 if (unlikely(current->audit_context))
541 audit_syscall_exit(current, regs->rax);
542
543 if ((test_thread_flag(TIF_SYSCALL_TRACE)
544 || test_thread_flag(TIF_SINGLESTEP))
545 && (current->ptrace & PT_PTRACED))
546 syscall_trace(regs);
547}
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
new file mode 100644
index 000000000000..be4b36f762cf
--- /dev/null
+++ b/arch/x86_64/kernel/reboot.c
@@ -0,0 +1,163 @@
1/* Various gunk just to reboot the machine. */
2#include <linux/module.h>
3#include <linux/reboot.h>
4#include <linux/init.h>
5#include <linux/smp.h>
6#include <linux/kernel.h>
7#include <linux/ctype.h>
8#include <linux/string.h>
9#include <asm/io.h>
10#include <asm/kdebug.h>
11#include <asm/delay.h>
12#include <asm/hw_irq.h>
13#include <asm/system.h>
14#include <asm/pgtable.h>
15#include <asm/tlbflush.h>
16#include <asm/apic.h>
17
18/*
19 * Power off function, if any
20 */
21void (*pm_power_off)(void);
22
23static long no_idt[3];
24static enum {
25 BOOT_TRIPLE = 't',
26 BOOT_KBD = 'k'
27} reboot_type = BOOT_KBD;
28static int reboot_mode = 0;
29int reboot_force;
30
31/* reboot=t[riple] | k[bd] [, [w]arm | [c]old]
32 warm Don't set the cold reboot flag
33 cold Set the cold reboot flag
34 triple Force a triple fault (init)
35    kbd Use the keyboard controller; cold reset (default)
36 force Avoid anything that could hang.
37 */
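/* Example (hypothetical): "reboot=t,c" forces a triple fault with the
   cold-boot flag set; adding "f" ("reboot=t,c,f") also skips anything
   that could hang, such as waiting for the other CPUs. */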
38static int __init reboot_setup(char *str)
39{
40 for (;;) {
41 switch (*str) {
42 case 'w':
43 reboot_mode = 0x1234;
44 break;
45
46 case 'c':
47 reboot_mode = 0;
48 break;
49
50 case 't':
51 case 'b':
52 case 'k':
53 reboot_type = *str;
54 break;
55 case 'f':
56 reboot_force = 1;
57 break;
58 }
59 if((str = strchr(str,',')) != NULL)
60 str++;
61 else
62 break;
63 }
64 return 1;
65}
66
67__setup("reboot=", reboot_setup);
68
69#ifdef CONFIG_SMP
70static void smp_halt(void)
71{
72 int cpuid = safe_smp_processor_id();
73 static int first_entry = 1;
74
75 if (reboot_force)
76 return;
77
78 if (first_entry) {
79 first_entry = 0;
80 smp_call_function((void *)machine_restart, NULL, 1, 0);
81 }
82
83 smp_stop_cpu();
84
85 /* AP calling this. Just halt */
86 if (cpuid != boot_cpu_id) {
87 for (;;)
88 asm("hlt");
89 }
90
91 /* Wait for all other CPUs to have run smp_stop_cpu */
92 while (!cpus_empty(cpu_online_map))
93 rep_nop();
94}
95#endif
96
97static inline void kb_wait(void)
98{
99 int i;
100
101 for (i=0; i<0x10000; i++)
102 if ((inb_p(0x64) & 0x02) == 0)
103 break;
104}
105
106void machine_restart(char * __unused)
107{
108 int i;
109
110 printk("machine restart\n");
111
112#ifdef CONFIG_SMP
113 smp_halt();
114#endif
115
116 if (!reboot_force) {
117 local_irq_disable();
118#ifndef CONFIG_SMP
119 disable_local_APIC();
120#endif
121 disable_IO_APIC();
122 local_irq_enable();
123 }
124
125 /* Tell the BIOS if we want cold or warm reboot */
126 *((unsigned short *)__va(0x472)) = reboot_mode;
127
128 for (;;) {
129 /* Could also try the reset bit in the Hammer NB */
130 switch (reboot_type) {
131 case BOOT_KBD:
132 for (i=0; i<100; i++) {
133 kb_wait();
134 udelay(50);
135 outb(0xfe,0x64); /* pulse reset low */
136 udelay(50);
137 }
138
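		/* fall through: if the keyboard reset did not take, force a triple fault */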
139 case BOOT_TRIPLE:
140 __asm__ __volatile__("lidt (%0)": :"r" (&no_idt));
141 __asm__ __volatile__("int3");
142
143 reboot_type = BOOT_KBD;
144 break;
145 }
146 }
147}
148
149EXPORT_SYMBOL(machine_restart);
150
151void machine_halt(void)
152{
153}
154
155EXPORT_SYMBOL(machine_halt);
156
157void machine_power_off(void)
158{
159 if (pm_power_off)
160 pm_power_off();
161}
162
163EXPORT_SYMBOL(machine_power_off);
diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c
new file mode 100644
index 000000000000..48f7c18172b9
--- /dev/null
+++ b/arch/x86_64/kernel/semaphore.c
@@ -0,0 +1,180 @@
1/*
2 * x86_64 semaphore implementation.
3 *
4 * (C) Copyright 1999 Linus Torvalds
5 *
6 * Portions Copyright 1999 Red Hat, Inc.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
14 */
15#include <linux/config.h>
16#include <linux/sched.h>
17#include <linux/init.h>
18#include <asm/errno.h>
19
20#include <asm/semaphore.h>
21
22/*
23 * Semaphores are implemented using a two-way counter:
24 * The "count" variable is decremented for each process
25 * that tries to acquire the semaphore, while the "sleeping"
26 * variable is a count of such acquires.
27 *
28 * Notably, the inline "up()" and "down()" functions can
29 * efficiently test if they need to do any extra work (up
30 * needs to do something only if count was negative before
31  * the increment operation).
32 *
33 * "sleeping" and the contention routine ordering is protected
34 * by the spinlock in the semaphore's waitqueue head.
35 *
36 * Note that these functions are only called when there is
37 * contention on the lock, and as such all this is the
38 * "non-critical" part of the whole semaphore business. The
39 * critical part is the inline stuff in <asm/semaphore.h>
40 * where we want to avoid any extra jumps and calls.
41 */
42
43/*
44 * Logic:
45 * - only on a boundary condition do we need to care. When we go
46 * from a negative count to a non-negative, we wake people up.
47 * - when we go from a non-negative count to a negative do we
48 * (a) synchronize with the "sleeper" count and (b) make sure
49 * that we're on the wakeup list before we synchronize so that
50 * we cannot lose wakeup events.
51 */
52
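/*
 * Worked example (editorial): with count initialized to 1, task A's
 * down() moves count 1 -> 0 and proceeds. Task B's down() moves it to
 * -1 and enters __down(): sleepers becomes 1 and the add of
 * sleepers - 1 = 0 leaves count negative, so B sleeps with sleepers
 * still 1. A's up() moves count back to 0 and, having seen it
 * negative, calls __up() to wake B. B loops: adding sleepers - 1 = 0
 * now finds count non-negative, so it zeroes sleepers, breaks out and
 * owns the semaphore.
 */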
53void __up(struct semaphore *sem)
54{
55 wake_up(&sem->wait);
56}
57
58void __sched __down(struct semaphore * sem)
59{
60 struct task_struct *tsk = current;
61 DECLARE_WAITQUEUE(wait, tsk);
62 unsigned long flags;
63
64 tsk->state = TASK_UNINTERRUPTIBLE;
65 spin_lock_irqsave(&sem->wait.lock, flags);
66 add_wait_queue_exclusive_locked(&sem->wait, &wait);
67
68 sem->sleepers++;
69 for (;;) {
70 int sleepers = sem->sleepers;
71
72 /*
73 * Add "everybody else" into it. They aren't
74 * playing, because we own the spinlock in
75 * the wait_queue_head.
76 */
77 if (!atomic_add_negative(sleepers - 1, &sem->count)) {
78 sem->sleepers = 0;
79 break;
80 }
81 sem->sleepers = 1; /* us - see -1 above */
82 spin_unlock_irqrestore(&sem->wait.lock, flags);
83
84 schedule();
85
86 spin_lock_irqsave(&sem->wait.lock, flags);
87 tsk->state = TASK_UNINTERRUPTIBLE;
88 }
89 remove_wait_queue_locked(&sem->wait, &wait);
90 wake_up_locked(&sem->wait);
91 spin_unlock_irqrestore(&sem->wait.lock, flags);
92 tsk->state = TASK_RUNNING;
93}
94
95int __sched __down_interruptible(struct semaphore * sem)
96{
97 int retval = 0;
98 struct task_struct *tsk = current;
99 DECLARE_WAITQUEUE(wait, tsk);
100 unsigned long flags;
101
102 tsk->state = TASK_INTERRUPTIBLE;
103 spin_lock_irqsave(&sem->wait.lock, flags);
104 add_wait_queue_exclusive_locked(&sem->wait, &wait);
105
106 sem->sleepers++;
107 for (;;) {
108 int sleepers = sem->sleepers;
109
110 /*
111 * With signals pending, this turns into
112 * the trylock failure case - we won't be
113 		 * sleeping, and we can't get the lock as
114 * it has contention. Just correct the count
115 * and exit.
116 */
117 if (signal_pending(current)) {
118 retval = -EINTR;
119 sem->sleepers = 0;
120 atomic_add(sleepers, &sem->count);
121 break;
122 }
123
124 /*
125 * Add "everybody else" into it. They aren't
126 * playing, because we own the spinlock in
127 * wait_queue_head. The "-1" is because we're
128 * still hoping to get the semaphore.
129 */
130 if (!atomic_add_negative(sleepers - 1, &sem->count)) {
131 sem->sleepers = 0;
132 break;
133 }
134 sem->sleepers = 1; /* us - see -1 above */
135 spin_unlock_irqrestore(&sem->wait.lock, flags);
136
137 schedule();
138
139 spin_lock_irqsave(&sem->wait.lock, flags);
140 tsk->state = TASK_INTERRUPTIBLE;
141 }
142 remove_wait_queue_locked(&sem->wait, &wait);
143 wake_up_locked(&sem->wait);
144 spin_unlock_irqrestore(&sem->wait.lock, flags);
145
146 tsk->state = TASK_RUNNING;
147 return retval;
148}
149
150/*
151 * Trylock failed - make sure we correct for
152 * having decremented the count.
153 *
154 * We could have done the trylock with a
155 * single "cmpxchg" without failure cases,
156 * but then it wouldn't work on a 386.
157 */
158int __down_trylock(struct semaphore * sem)
159{
160 int sleepers;
161 unsigned long flags;
162
163 spin_lock_irqsave(&sem->wait.lock, flags);
164 sleepers = sem->sleepers + 1;
165 sem->sleepers = 0;
166
167 /*
168 * Add "everybody else" and us into it. They aren't
169 * playing, because we own the spinlock in the
170 * wait_queue_head.
171 */
172 if (!atomic_add_negative(sleepers, &sem->count)) {
173 wake_up_locked(&sem->wait);
174 }
175
176 spin_unlock_irqrestore(&sem->wait.lock, flags);
177 return 1;
178}
179
180
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
new file mode 100644
index 000000000000..e50fc41de6b9
--- /dev/null
+++ b/arch/x86_64/kernel/setup.c
@@ -0,0 +1,1189 @@
1/*
2 * linux/arch/x86-64/kernel/setup.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Nov 2001 Dave Jones <davej@suse.de>
7 * Forked from i386 setup code.
8 *
9 * $Id$
10 */
11
12/*
13 * This file handles the architecture-dependent parts of initialization
14 */
15
16#include <linux/errno.h>
17#include <linux/sched.h>
18#include <linux/kernel.h>
19#include <linux/mm.h>
20#include <linux/stddef.h>
21#include <linux/unistd.h>
22#include <linux/ptrace.h>
23#include <linux/slab.h>
24#include <linux/user.h>
25#include <linux/a.out.h>
26#include <linux/tty.h>
27#include <linux/ioport.h>
28#include <linux/delay.h>
29#include <linux/config.h>
30#include <linux/init.h>
31#include <linux/initrd.h>
32#include <linux/highmem.h>
33#include <linux/bootmem.h>
34#include <linux/module.h>
35#include <asm/processor.h>
36#include <linux/console.h>
37#include <linux/seq_file.h>
38#include <linux/root_dev.h>
39#include <linux/pci.h>
40#include <linux/acpi.h>
41#include <linux/kallsyms.h>
42#include <linux/edd.h>
43#include <asm/mtrr.h>
44#include <asm/uaccess.h>
45#include <asm/system.h>
46#include <asm/io.h>
47#include <asm/smp.h>
48#include <asm/msr.h>
49#include <asm/desc.h>
50#include <video/edid.h>
51#include <asm/e820.h>
52#include <asm/dma.h>
53#include <asm/mpspec.h>
54#include <asm/mmu_context.h>
55#include <asm/bootsetup.h>
56#include <asm/proto.h>
57#include <asm/setup.h>
58#include <asm/mach_apic.h>
59#include <asm/numa.h>
60
61/*
62 * Machine setup..
63 */
64
65struct cpuinfo_x86 boot_cpu_data;
66
67unsigned long mmu_cr4_features;
68
69int acpi_disabled;
70EXPORT_SYMBOL(acpi_disabled);
71#ifdef CONFIG_ACPI_BOOT
72extern int __initdata acpi_ht;
73extern acpi_interrupt_flags acpi_sci_flags;
74int __initdata acpi_force = 0;
75#endif
76
77int acpi_numa __initdata;
78
79/* For PCI or other memory-mapped resources */
80unsigned long pci_mem_start = 0x10000000;
81
82/* Boot loader ID as an integer, for the benefit of proc_dointvec */
83int bootloader_type;
84
85unsigned long saved_video_mode;
86
87#ifdef CONFIG_SWIOTLB
88int swiotlb;
89EXPORT_SYMBOL(swiotlb);
90#endif
91
92/*
93 * Setup options
94 */
95struct drive_info_struct { char dummy[32]; } drive_info;
96struct screen_info screen_info;
97struct sys_desc_table_struct {
98 unsigned short length;
99 unsigned char table[0];
100};
101
102struct edid_info edid_info;
103struct e820map e820;
104
105extern int root_mountflags;
106extern char _text, _etext, _edata, _end;
107
108char command_line[COMMAND_LINE_SIZE];
109
110struct resource standard_io_resources[] = {
111 { .name = "dma1", .start = 0x00, .end = 0x1f,
112 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
113 { .name = "pic1", .start = 0x20, .end = 0x21,
114 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
115 { .name = "timer0", .start = 0x40, .end = 0x43,
116 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
117 { .name = "timer1", .start = 0x50, .end = 0x53,
118 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
119 { .name = "keyboard", .start = 0x60, .end = 0x6f,
120 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
121 { .name = "dma page reg", .start = 0x80, .end = 0x8f,
122 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
123 { .name = "pic2", .start = 0xa0, .end = 0xa1,
124 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
125 { .name = "dma2", .start = 0xc0, .end = 0xdf,
126 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
127 { .name = "fpu", .start = 0xf0, .end = 0xff,
128 .flags = IORESOURCE_BUSY | IORESOURCE_IO }
129};
130
131#define STANDARD_IO_RESOURCES \
132 (sizeof standard_io_resources / sizeof standard_io_resources[0])
133
134#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
135
136struct resource data_resource = {
137 .name = "Kernel data",
138 .start = 0,
139 .end = 0,
140 .flags = IORESOURCE_RAM,
141};
142struct resource code_resource = {
143 .name = "Kernel code",
144 .start = 0,
145 .end = 0,
146 .flags = IORESOURCE_RAM,
147};
148
149#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
150
151static struct resource system_rom_resource = {
152 .name = "System ROM",
153 .start = 0xf0000,
154 .end = 0xfffff,
155 .flags = IORESOURCE_ROM,
156};
157
158static struct resource extension_rom_resource = {
159 .name = "Extension ROM",
160 .start = 0xe0000,
161 .end = 0xeffff,
162 .flags = IORESOURCE_ROM,
163};
164
165static struct resource adapter_rom_resources[] = {
166 { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
167 .flags = IORESOURCE_ROM },
168 { .name = "Adapter ROM", .start = 0, .end = 0,
169 .flags = IORESOURCE_ROM },
170 { .name = "Adapter ROM", .start = 0, .end = 0,
171 .flags = IORESOURCE_ROM },
172 { .name = "Adapter ROM", .start = 0, .end = 0,
173 .flags = IORESOURCE_ROM },
174 { .name = "Adapter ROM", .start = 0, .end = 0,
175 .flags = IORESOURCE_ROM },
176 { .name = "Adapter ROM", .start = 0, .end = 0,
177 .flags = IORESOURCE_ROM }
178};
179
180#define ADAPTER_ROM_RESOURCES \
181 (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
182
183static struct resource video_rom_resource = {
184 .name = "Video ROM",
185 .start = 0xc0000,
186 .end = 0xc7fff,
187 .flags = IORESOURCE_ROM,
188};
189
190static struct resource video_ram_resource = {
191 .name = "Video RAM area",
192 .start = 0xa0000,
193 .end = 0xbffff,
194 .flags = IORESOURCE_RAM,
195};
196
197#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
198
199static int __init romchecksum(unsigned char *rom, unsigned long length)
200{
201 unsigned char *p, sum = 0;
202
203 for (p = rom; p < rom + length; p++)
204 sum += *p;
205 return sum == 0;
206}
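
/*
 * A minimal standalone sketch of the rule romchecksum() enforces: a valid
 * option ROM starts with the 0xAA55 signature and all of its bytes sum to
 * zero modulo 256. The toy ROM contents below are assumed values, not read
 * from real hardware.
 */
#include <stdio.h>

static int rom_ok(const unsigned char *rom, unsigned long len)
{
	unsigned char sum = 0;
	unsigned long i;

	if (len < 3 || rom[0] != 0x55 || rom[1] != 0xaa)
		return 0;			/* missing 0xAA55 signature */
	for (i = 0; i < len; i++)
		sum += rom[i];			/* unsigned char wraps mod 256 */
	return sum == 0;
}

int main(void)
{
	/* toy 1K ROM: signature, length byte 2 (2 * 512 bytes), zero padding */
	static unsigned char rom[1024] = { 0x55, 0xaa, 0x02 };

	rom[1023] = (unsigned char)-(0x55 + 0xaa + 0x02);	/* force sum == 0 */
	printf("checksum %s\n", rom_ok(rom, sizeof rom) ? "ok" : "bad");
	return 0;
}
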
207
208static void __init probe_roms(void)
209{
210 unsigned long start, length, upper;
211 unsigned char *rom;
212 int i;
213
214 /* video rom */
215 upper = adapter_rom_resources[0].start;
216 for (start = video_rom_resource.start; start < upper; start += 2048) {
217 rom = isa_bus_to_virt(start);
218 if (!romsignature(rom))
219 continue;
220
221 video_rom_resource.start = start;
222
223 /* 0 < length <= 0x7f * 512, historically */
224 length = rom[2] * 512;
225
226 /* if checksum okay, trust length byte */
227 if (length && romchecksum(rom, length))
228 video_rom_resource.end = start + length - 1;
229
230 request_resource(&iomem_resource, &video_rom_resource);
231 break;
232 }
233
234 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
235 if (start < upper)
236 start = upper;
237
238 /* system rom */
239 request_resource(&iomem_resource, &system_rom_resource);
240 upper = system_rom_resource.start;
241
242 /* check for extension rom (ignore length byte!) */
243 rom = isa_bus_to_virt(extension_rom_resource.start);
244 if (romsignature(rom)) {
245 length = extension_rom_resource.end - extension_rom_resource.start + 1;
246 if (romchecksum(rom, length)) {
247 request_resource(&iomem_resource, &extension_rom_resource);
248 upper = extension_rom_resource.start;
249 }
250 }
251
252 /* check for adapter roms on 2k boundaries */
253 for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
254 rom = isa_bus_to_virt(start);
255 if (!romsignature(rom))
256 continue;
257
258 /* 0 < length <= 0x7f * 512, historically */
259 length = rom[2] * 512;
260
261 /* but accept any length that fits if checksum okay */
262 if (!length || start + length > upper || !romchecksum(rom, length))
263 continue;
264
265 adapter_rom_resources[i].start = start;
266 adapter_rom_resources[i].end = start + length - 1;
267 request_resource(&iomem_resource, &adapter_rom_resources[i]);
268
269 start = adapter_rom_resources[i++].end & ~2047UL;
270 }
271}
272
273static __init void parse_cmdline_early (char ** cmdline_p)
274{
275 char c = ' ', *to = command_line, *from = COMMAND_LINE;
276 int len = 0;
277
278 /* Save unparsed command line copy for /proc/cmdline */
279 memcpy(saved_command_line, COMMAND_LINE, COMMAND_LINE_SIZE);
280 saved_command_line[COMMAND_LINE_SIZE-1] = '\0';
281
282 for (;;) {
283 if (c != ' ')
284 goto next_char;
285
286#ifdef CONFIG_SMP
287 /*
288 * If the BIOS enumerates physical processors before logical,
289 * maxcpus=N at enumeration-time can be used to disable HT.
290 */
291 else if (!memcmp(from, "maxcpus=", 8)) {
292 extern unsigned int maxcpus;
293
294 maxcpus = simple_strtoul(from + 8, NULL, 0);
295 }
296#endif
297#ifdef CONFIG_ACPI_BOOT
298 /* "acpi=off" disables both ACPI table parsing and interpreter init */
299 if (!memcmp(from, "acpi=off", 8))
300 disable_acpi();
301
302 if (!memcmp(from, "acpi=force", 10)) {
303 /* add later when we do DMI horrors: */
304 acpi_force = 1;
305 acpi_disabled = 0;
306 }
307
308 /* acpi=ht just means: do ACPI MADT parsing
309 at bootup, but don't enable the full ACPI interpreter */
310 if (!memcmp(from, "acpi=ht", 7)) {
311 if (!acpi_force)
312 disable_acpi();
313 acpi_ht = 1;
314 }
315 else if (!memcmp(from, "pci=noacpi", 10))
316 acpi_disable_pci();
317 else if (!memcmp(from, "acpi=noirq", 10))
318 acpi_noirq_set();
319
320 else if (!memcmp(from, "acpi_sci=edge", 13))
321 acpi_sci_flags.trigger = 1;
322 else if (!memcmp(from, "acpi_sci=level", 14))
323 acpi_sci_flags.trigger = 3;
324 else if (!memcmp(from, "acpi_sci=high", 13))
325 acpi_sci_flags.polarity = 1;
326 else if (!memcmp(from, "acpi_sci=low", 12))
327 acpi_sci_flags.polarity = 3;
328
329 /* acpi=strict disables out-of-spec workarounds */
330 else if (!memcmp(from, "acpi=strict", 11)) {
331 acpi_strict = 1;
332 }
333#endif
334
335 if (!memcmp(from, "nolapic", 7) ||
336 !memcmp(from, "disableapic", 11))
337 disable_apic = 1;
338
339 if (!memcmp(from, "noapic", 6))
340 skip_ioapic_setup = 1;
341
342 if (!memcmp(from, "apic", 4)) {
343 skip_ioapic_setup = 0;
344 ioapic_force = 1;
345 }
346
347 if (!memcmp(from, "mem=", 4))
348 parse_memopt(from+4, &from);
349
350#ifdef CONFIG_DISCONTIGMEM
351 if (!memcmp(from, "numa=", 5))
352 numa_setup(from+5);
353#endif
354
355#ifdef CONFIG_GART_IOMMU
356 if (!memcmp(from,"iommu=",6)) {
357 iommu_setup(from+6);
358 }
359#endif
360
361 if (!memcmp(from,"oops=panic", 10))
362 panic_on_oops = 1;
363
364 if (!memcmp(from, "noexec=", 7))
365 nonx_setup(from + 7);
366
367 next_char:
368 c = *(from++);
369 if (!c)
370 break;
371 if (COMMAND_LINE_SIZE <= ++len)
372 break;
373 *(to++) = c;
374 }
375 *to = '\0';
376 *cmdline_p = command_line;
377}
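
/*
 * The loop above recognizes an option only when the preceding character was
 * a space, i.e. at the start of a word; everything else is copied through.
 * A userspace sketch of the same memcmp-at-word-start scheme follows, with
 * two made-up options standing in for the kernel's table.
 */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

static void parse(const char *cmdline)
{
	char c = ' ';
	const char *from = cmdline;

	for (;;) {
		if (c == ' ') {		/* previous char was a space: word start */
			if (!memcmp(from, "mem=", 4))
				printf("mem = %lu bytes\n", strtoul(from + 4, NULL, 0));
			else if (!memcmp(from, "noapic", 6))
				printf("noapic set\n");
		}
		c = *from++;
		if (!c)
			break;
	}
}

int main(void)
{
	parse("root=/dev/sda1 mem=0x10000000 noapic");
	return 0;
}
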
378
379#ifndef CONFIG_DISCONTIGMEM
380static void __init contig_initmem_init(void)
381{
382 unsigned long bootmap_size, bootmap;
383 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
384 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
385 if (bootmap == -1L)
386 panic("Cannot find bootmem map of size %ld\n",bootmap_size);
387 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
388 e820_bootmem_free(&contig_page_data, 0, end_pfn << PAGE_SHIFT);
389 reserve_bootmem(bootmap, bootmap_size);
390}
391#endif
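
/*
 * bootmem_bootmap_pages() sizes the boot allocator's bitmap at one bit per
 * page frame, rounded up to whole pages. The arithmetic checked in
 * isolation, assuming 4K pages and an example memory size:
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long end_pfn = 1UL << 20;	/* 4GB of RAM in 4K pages */
	unsigned long map_bytes = (end_pfn + 7) / 8;	/* one bit per pfn */
	unsigned long map_pages = (map_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;

	printf("bootmem bitmap: %lu bytes = %lu pages\n", map_bytes, map_pages);
	return 0;	/* 131072 bytes = 32 pages */
}
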
392
393/* Use inline assembly to define this because the nops are defined
394 as inline assembly strings in the include files and we cannot
395 get them easily into strings. */
396asm("\t.data\nk8nops: "
397 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
398 K8_NOP7 K8_NOP8);
399
400extern unsigned char k8nops[];
401static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
402 NULL,
403 k8nops,
404 k8nops + 1,
405 k8nops + 1 + 2,
406 k8nops + 1 + 2 + 3,
407 k8nops + 1 + 2 + 3 + 4,
408 k8nops + 1 + 2 + 3 + 4 + 5,
409 k8nops + 1 + 2 + 3 + 4 + 5 + 6,
410 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
411};
412
413/* Replace instructions with better alternatives for this CPU type.
414
415 This runs before SMP is initialized to avoid SMP problems with
416 self-modifying code. This implies that asymmetric systems where
417 APs have fewer capabilities than the boot processor are not handled.
418 In this case boot with "noreplacement". */
419void apply_alternatives(void *start, void *end)
420{
421 struct alt_instr *a;
422 int diff, i, k;
423 for (a = start; (void *)a < end; a++) {
424 if (!boot_cpu_has(a->cpuid))
425 continue;
426
427 BUG_ON(a->replacementlen > a->instrlen);
428 __inline_memcpy(a->instr, a->replacement, a->replacementlen);
429 diff = a->instrlen - a->replacementlen;
430
431 /* Pad the rest with nops */
432 for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
433 k = diff;
434 if (k > ASM_NOP_MAX)
435 k = ASM_NOP_MAX;
436 __inline_memcpy(a->instr + i, k8_nops[k], k);
437 }
438 }
439}
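
/*
 * The padding loop above splits the slack left by a shorter replacement into
 * chunks of at most ASM_NOP_MAX bytes and copies an equally long NOP from
 * k8_nops[] for each. The chunking arithmetic alone, with a made-up 3-byte
 * cap standing in for ASM_NOP_MAX:
 */
#include <stdio.h>

#define NOP_MAX 3	/* stand-in for ASM_NOP_MAX */

int main(void)
{
	int diff = 8, i = 0, k;	/* 8 bytes of slack after the replacement */

	for (; diff > 0; diff -= k, i += k) {
		k = diff > NOP_MAX ? NOP_MAX : diff;
		printf("copy a %d-byte nop to offset %d\n", k, i);
	}
	return 0;	/* chunks of 3, 3 and 2 bytes */
}
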
440
441static int no_replacement __initdata = 0;
442
443void __init alternative_instructions(void)
444{
445 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
446 if (no_replacement)
447 return;
448 apply_alternatives(__alt_instructions, __alt_instructions_end);
449}
450
451static int __init noreplacement_setup(char *s)
452{
453 no_replacement = 1;
454 return 0;
455}
456
457__setup("noreplacement", noreplacement_setup);
458
459#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
460struct edd edd;
461#ifdef CONFIG_EDD_MODULE
462EXPORT_SYMBOL(edd);
463#endif
464/**
465 * copy_edd() - Copy the BIOS EDD information
466 * from boot_params into a safe place.
467 *
468 */
469static inline void copy_edd(void)
470{
471 memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
472 memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
473 edd.mbr_signature_nr = EDD_MBR_SIG_NR;
474 edd.edd_info_nr = EDD_NR;
475}
476#else
477static inline void copy_edd(void)
478{
479}
480#endif
481
482#define EBDA_ADDR_POINTER 0x40E
483static void __init reserve_ebda_region(void)
484{
485 unsigned int addr;
486 /*
487 * a real-mode segment pointer at 0x40E points to the
488 * 4K EBDA area
489 */
490 addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER);
491 addr <<= 4;
492 if (addr)
493 reserve_bootmem_generic(addr, PAGE_SIZE);
494}
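
/*
 * Real-mode segment values turn into physical addresses by a shift of four,
 * which is all the addr <<= 4 above does. A sketch with a typical (assumed)
 * segment value:
 */
#include <stdio.h>

int main(void)
{
	unsigned short ebda_seg = 0x9fc0;	/* assumed value read from 0x40E */
	unsigned int addr = (unsigned int)ebda_seg << 4;

	printf("EBDA at %#x\n", addr);		/* 0x9fc00, just below 640K */
	return 0;
}
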
495
496void __init setup_arch(char **cmdline_p)
497{
498 unsigned long low_mem_size;
499 unsigned long kernel_end;
500
501 ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
502 drive_info = DRIVE_INFO;
503 screen_info = SCREEN_INFO;
504 edid_info = EDID_INFO;
505 saved_video_mode = SAVED_VIDEO_MODE;
506 bootloader_type = LOADER_TYPE;
507
508#ifdef CONFIG_BLK_DEV_RAM
509 rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
510 rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
511 rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
512#endif
513 setup_memory_region();
514 copy_edd();
515
516 if (!MOUNT_ROOT_RDONLY)
517 root_mountflags &= ~MS_RDONLY;
518 init_mm.start_code = (unsigned long) &_text;
519 init_mm.end_code = (unsigned long) &_etext;
520 init_mm.end_data = (unsigned long) &_edata;
521 init_mm.brk = (unsigned long) &_end;
522
523 code_resource.start = virt_to_phys(&_text);
524 code_resource.end = virt_to_phys(&_etext)-1;
525 data_resource.start = virt_to_phys(&_etext);
526 data_resource.end = virt_to_phys(&_edata)-1;
527
528 parse_cmdline_early(cmdline_p);
529
530 early_identify_cpu(&boot_cpu_data);
531
532 /*
533 * partially used pages are not usable - thus
534 * we are rounding upwards:
535 */
536 end_pfn = e820_end_of_ram();
537
538 check_efer();
539
540 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
541
542#ifdef CONFIG_ACPI_BOOT
543 /*
544 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
545 * Call this early for SRAT node setup.
546 */
547 acpi_boot_table_init();
548#endif
549
550#ifdef CONFIG_ACPI_NUMA
551 /*
552 * Parse SRAT to discover nodes.
553 */
554 acpi_numa_init();
555#endif
556
557#ifdef CONFIG_DISCONTIGMEM
558 numa_initmem_init(0, end_pfn);
559#else
560 contig_initmem_init();
561#endif
562
563 /* Reserve direct mapping */
564 reserve_bootmem_generic(table_start << PAGE_SHIFT,
565 (table_end - table_start) << PAGE_SHIFT);
566
567 /* reserve kernel */
568 kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE);
569 reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY);
570
571 /*
572 * reserve physical page 0 - it's a special BIOS page on many boxes,
573 * enabling clean reboots, SMP operation, laptop functions.
574 */
575 reserve_bootmem_generic(0, PAGE_SIZE);
576
577 /* reserve ebda region */
578 reserve_ebda_region();
579
580#ifdef CONFIG_SMP
581 /*
582 * But first pinch a few for the stack/trampoline stuff
583 * FIXME: Don't need the extra page at 4K, but need to fix
584 * trampoline before removing it. (see the GDT stuff)
585 */
586 reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
587
588 /* Reserve SMP trampoline */
589 reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
590#endif
591
592#ifdef CONFIG_ACPI_SLEEP
593 /*
594 * Reserve low memory region for sleep support.
595 */
596 acpi_reserve_bootmem();
597#endif
598#ifdef CONFIG_X86_LOCAL_APIC
599 /*
600 * Find and reserve possible boot-time SMP configuration:
601 */
602 find_smp_config();
603#endif
604#ifdef CONFIG_BLK_DEV_INITRD
605 if (LOADER_TYPE && INITRD_START) {
606 if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
607 reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
608 initrd_start =
609 INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
610 initrd_end = initrd_start+INITRD_SIZE;
611 }
612 else {
613 printk(KERN_ERR "initrd extends beyond end of memory "
614 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
615 (unsigned long)(INITRD_START + INITRD_SIZE),
616 (unsigned long)(end_pfn << PAGE_SHIFT));
617 initrd_start = 0;
618 }
619 }
620#endif
621 paging_init();
622
623 check_ioapic();
624
625#ifdef CONFIG_ACPI_BOOT
626 /*
627 * Read APIC and some other early information from ACPI tables.
628 */
629 acpi_boot_init();
630#endif
631
632#ifdef CONFIG_X86_LOCAL_APIC
633 /*
634 * get boot-time SMP configuration:
635 */
636 if (smp_found_config)
637 get_smp_config();
638 init_apic_mappings();
639#endif
640
641 /*
642 * Request address space for all standard RAM and ROM resources
643 * and also for regions reported as reserved by the e820.
644 */
645 probe_roms();
646 e820_reserve_resources();
647
648 request_resource(&iomem_resource, &video_ram_resource);
649
650 {
651 unsigned i;
652 /* request I/O space for devices used on all i[345]86 PCs */
653 for (i = 0; i < STANDARD_IO_RESOURCES; i++)
654 request_resource(&ioport_resource, &standard_io_resources[i]);
655 }
656
657 /* Will likely break when you have unassigned resources with more
658 than 4GB memory and bridges that don't support more than 4GB.
659 Doing it properly would require using pci_alloc_consistent
660 in this case. */
661 low_mem_size = ((end_pfn << PAGE_SHIFT) + 0xfffff) & ~0xfffff;
662 if (low_mem_size > pci_mem_start)
663 pci_mem_start = low_mem_size;
664
665#ifdef CONFIG_GART_IOMMU
666 iommu_hole_init();
667#endif
668
669#ifdef CONFIG_VT
670#if defined(CONFIG_VGA_CONSOLE)
671 conswitchp = &vga_con;
672#elif defined(CONFIG_DUMMY_CONSOLE)
673 conswitchp = &dummy_con;
674#endif
675#endif
676}
677
678static int __init get_model_name(struct cpuinfo_x86 *c)
679{
680 unsigned int *v;
681
682 if (c->x86_cpuid_level < 0x80000004)
683 return 0;
684
685 v = (unsigned int *) c->x86_model_id;
686 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
687 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
688 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
689 c->x86_model_id[48] = 0;
690 return 1;
691}
692
693
694static void __init display_cacheinfo(struct cpuinfo_x86 *c)
695{
696 unsigned int n, dummy, eax, ebx, ecx, edx;
697
698 n = c->x86_cpuid_level;
699
700 if (n >= 0x80000005) {
701 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
702 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
703 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
704 c->x86_cache_size=(ecx>>24)+(edx>>24);
705 /* On K8 L1 TLB is inclusive, so don't count it */
706 c->x86_tlbsize = 0;
707 }
708
709 if (n >= 0x80000006) {
710 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
711 ecx = cpuid_ecx(0x80000006);
712 c->x86_cache_size = ecx >> 16;
713 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
714
715 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
716 c->x86_cache_size, ecx & 0xFF);
717 }
718
719 if (n >= 0x80000007)
720 cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
721 if (n >= 0x80000008) {
722 cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
723 c->x86_virt_bits = (eax >> 8) & 0xff;
724 c->x86_phys_bits = eax & 0xff;
725 }
726}
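
/*
 * The field unpacking in display_cacheinfo() can be exercised with canned
 * register values: cache size sits in the top byte/word, line size in the
 * low byte. The two constants below are plausible K8-style encodings
 * (assumed, not read from hardware).
 */
#include <stdio.h>

int main(void)
{
	unsigned int l1_edx = 0x40040140;	/* assumed cpuid 0x80000005 EDX */
	unsigned int l2_ecx = 0x02006140;	/* assumed cpuid 0x80000006 ECX */

	printf("L1 D: %uK, %u bytes/line\n", l1_edx >> 24, l1_edx & 0xff);
	printf("L2:   %uK, %u bytes/line\n", l2_ecx >> 16, l2_ecx & 0xff);
	return 0;	/* 64K/64B and 512K/64B */
}
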
727
728
729static int __init init_amd(struct cpuinfo_x86 *c)
730{
731 int r;
732 int level;
733#ifdef CONFIG_NUMA
734 int cpu;
735#endif
736
737 /* Bit 31 in normal CPUID is used for a nonstandard 3DNow ID;
738 3DNow is identified by bit 31 in extended CPUID (1*32+31) anyway */
739 clear_bit(0*32+31, &c->x86_capability);
740
741 /* C-stepping K8? */
742 level = cpuid_eax(1);
743 if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
744 set_bit(X86_FEATURE_K8_C, &c->x86_capability);
745
746 r = get_model_name(c);
747 if (!r) {
748 switch (c->x86) {
749 case 15:
750 /* Should distinguish Models here, but this is only
751 a fallback anyways. */
752 strcpy(c->x86_model_id, "Hammer");
753 break;
754 }
755 }
756 display_cacheinfo(c);
757
758 if (c->x86_cpuid_level >= 0x80000008) {
759 c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
760 if (c->x86_num_cores & (c->x86_num_cores - 1))
761 c->x86_num_cores = 1;
762
763#ifdef CONFIG_NUMA
764 /* On a dual core setup the lower bits of apic id
765 distinguish the cores. Fix up the CPU<->node mappings
766 here based on that.
767 Assumes number of cores is a power of two.
768 When using SRAT use mapping from SRAT. */
769 cpu = c->x86_apicid;
770 if (acpi_numa <= 0 && c->x86_num_cores > 1) {
771 cpu_to_node[cpu] = cpu >> hweight32(c->x86_num_cores - 1);
772 if (!node_online(cpu_to_node[cpu]))
773 cpu_to_node[cpu] = first_node(node_online_map);
774 }
775 printk(KERN_INFO "CPU %d(%d) -> Node %d\n",
776 cpu, c->x86_num_cores, cpu_to_node[cpu]);
777#endif
778 }
779
780 return r;
781}
782
783static void __init detect_ht(struct cpuinfo_x86 *c)
784{
785#ifdef CONFIG_SMP
786 u32 eax, ebx, ecx, edx;
787 int index_lsb, index_msb, tmp;
788 int cpu = smp_processor_id();
789
790 if (!cpu_has(c, X86_FEATURE_HT))
791 return;
792
793 cpuid(1, &eax, &ebx, &ecx, &edx);
794 smp_num_siblings = (ebx & 0xff0000) >> 16;
795
796 if (smp_num_siblings == 1) {
797 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
798 } else if (smp_num_siblings > 1) {
799 index_lsb = 0;
800 index_msb = 31;
801 /*
802 * At this point we only support two siblings per
803 * processor package.
804 */
805 if (smp_num_siblings > NR_CPUS) {
806 printk(KERN_WARNING "CPU: Unsupported number of siblings %d\n", smp_num_siblings);
807 smp_num_siblings = 1;
808 return;
809 }
810 tmp = smp_num_siblings;
811 while ((tmp & 1) == 0) {
812 tmp >>= 1;
813 index_lsb++;
814 }
815 tmp = smp_num_siblings;
816 while ((tmp & 0x80000000) == 0) {
817 tmp <<= 1;
818 index_msb--;
819 }
820 if (index_lsb != index_msb)
821 index_msb++;
822 phys_proc_id[cpu] = phys_pkg_id(index_msb);
823
824 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
825 phys_proc_id[cpu]);
826 }
827#endif
828}
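
/*
 * The index_lsb/index_msb loops above compute ceil(log2(smp_num_siblings)):
 * the number of low APIC-id bits that distinguish siblings within one
 * package. Extracted as a standalone helper:
 */
#include <stdio.h>

static int sibling_bits(unsigned int siblings)
{
	unsigned int tmp = siblings;
	int lsb = 0, msb = 31;

	while ((tmp & 1) == 0) {
		tmp >>= 1;
		lsb++;
	}
	tmp = siblings;
	while ((tmp & 0x80000000u) == 0) {
		tmp <<= 1;
		msb--;
	}
	if (lsb != msb)			/* non-power-of-two: round up */
		msb++;
	return msb;
}

int main(void)
{
	printf("%d %d %d\n", sibling_bits(2), sibling_bits(4), sibling_bits(3));
	return 0;			/* 1 2 2 */
}
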
829
830static void __init sched_cmp_hack(struct cpuinfo_x86 *c)
831{
832#ifdef CONFIG_SMP
833 /* AMD dual core looks like HT but isn't really. Hide it from the
834 scheduler. This works around problems with the domain scheduler.
835 Also probably gives slightly better scheduling and disables
836 SMT nice, which is harmful on dual core.
837 TBD: tune the domain scheduler for dual core. */
838 if (c->x86_vendor == X86_VENDOR_AMD && cpu_has(c, X86_FEATURE_CMP_LEGACY))
839 smp_num_siblings = 1;
840#endif
841}
842
843static void __init init_intel(struct cpuinfo_x86 *c)
844{
845 /* Cache sizes */
846 unsigned n;
847
848 init_intel_cacheinfo(c);
849 n = c->x86_cpuid_level;
850 if (n >= 0x80000008) {
851 unsigned eax = cpuid_eax(0x80000008);
852 c->x86_virt_bits = (eax >> 8) & 0xff;
853 c->x86_phys_bits = eax & 0xff;
854 }
855
856 if (c->x86 == 15)
857 c->x86_cache_alignment = c->x86_clflush_size * 2;
858}
859
860void __init get_cpu_vendor(struct cpuinfo_x86 *c)
861{
862 char *v = c->x86_vendor_id;
863
864 if (!strcmp(v, "AuthenticAMD"))
865 c->x86_vendor = X86_VENDOR_AMD;
866 else if (!strcmp(v, "GenuineIntel"))
867 c->x86_vendor = X86_VENDOR_INTEL;
868 else
869 c->x86_vendor = X86_VENDOR_UNKNOWN;
870}
871
872struct cpu_model_info {
873 int vendor;
874 int family;
875 char *model_names[16];
876};
877
878/* Do some early cpuid on the boot CPU to get some parameters that are
879 needed before check_bugs. Everything advanced is in identify_cpu
880 below. */
881void __init early_identify_cpu(struct cpuinfo_x86 *c)
882{
883 u32 tfms;
884
885 c->loops_per_jiffy = loops_per_jiffy;
886 c->x86_cache_size = -1;
887 c->x86_vendor = X86_VENDOR_UNKNOWN;
888 c->x86_model = c->x86_mask = 0; /* So far unknown... */
889 c->x86_vendor_id[0] = '\0'; /* Unset */
890 c->x86_model_id[0] = '\0'; /* Unset */
891 c->x86_clflush_size = 64;
892 c->x86_cache_alignment = c->x86_clflush_size;
893 c->x86_num_cores = 1;
894 c->x86_apicid = c == &boot_cpu_data ? 0 : c - cpu_data;
895 c->x86_cpuid_level = 0;
896 memset(&c->x86_capability, 0, sizeof c->x86_capability);
897
898 /* Get vendor name */
899 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
900 (unsigned int *)&c->x86_vendor_id[0],
901 (unsigned int *)&c->x86_vendor_id[8],
902 (unsigned int *)&c->x86_vendor_id[4]);
903
904 get_cpu_vendor(c);
905
906 /* Initialize the standard set of capabilities */
907 /* Note that the vendor-specific code below might override */
908
909 /* Intel-defined flags: level 0x00000001 */
910 if (c->cpuid_level >= 0x00000001) {
911 __u32 misc;
912 cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
913 &c->x86_capability[0]);
914 c->x86 = (tfms >> 8) & 0xf;
915 c->x86_model = (tfms >> 4) & 0xf;
916 c->x86_mask = tfms & 0xf;
917 if (c->x86 == 0xf) {
918 c->x86 += (tfms >> 20) & 0xff;
919 c->x86_model += ((tfms >> 16) & 0xF) << 4;
920 }
921 if (c->x86_capability[0] & (1<<19))
922 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
923 c->x86_apicid = misc >> 24;
924 } else {
925 /* Have CPUID level 0 only - unheard of */
926 c->x86 = 4;
927 }
928}
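
/*
 * The family/model/stepping decode above, including the extended-field
 * adjustment used when the base family is 0xf, applied to a sample leaf-1
 * EAX value (K8-style, assumed):
 */
#include <stdio.h>

int main(void)
{
	unsigned int tfms = 0x00020f51;		/* assumed cpuid(1) EAX value */
	unsigned int family = (tfms >> 8) & 0xf;
	unsigned int model = (tfms >> 4) & 0xf;
	unsigned int stepping = tfms & 0xf;

	if (family == 0xf) {			/* use the extended fields */
		family += (tfms >> 20) & 0xff;
		model += ((tfms >> 16) & 0xf) << 4;
	}
	printf("family %u model %u stepping %u\n", family, model, stepping);
	return 0;				/* family 15 model 37 stepping 1 */
}
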
929
930/*
931 * This does the hard work of actually picking apart the CPU stuff...
932 */
933void __init identify_cpu(struct cpuinfo_x86 *c)
934{
935 int i;
936 u32 xlvl;
937
938 early_identify_cpu(c);
939
940 /* AMD-defined flags: level 0x80000001 */
941 xlvl = cpuid_eax(0x80000000);
942 c->x86_cpuid_level = xlvl;
943 if ((xlvl & 0xffff0000) == 0x80000000) {
944 if (xlvl >= 0x80000001) {
945 c->x86_capability[1] = cpuid_edx(0x80000001);
946 c->x86_capability[5] = cpuid_ecx(0x80000001);
947 }
948 if (xlvl >= 0x80000004)
949 get_model_name(c); /* Default name */
950 }
951
952 /* Transmeta-defined flags: level 0x80860001 */
953 xlvl = cpuid_eax(0x80860000);
954 if ((xlvl & 0xffff0000) == 0x80860000) {
955 /* Don't set x86_cpuid_level here for now, to avoid confusion. */
956 if (xlvl >= 0x80860001)
957 c->x86_capability[2] = cpuid_edx(0x80860001);
958 }
959
960 /*
961 * Vendor-specific initialization. In this section we
962 * canonicalize the feature flags: if there are
963 * features a certain CPU supports which CPUID doesn't
964 * report, CPUID claims incorrect flags, or there are other bugs,
965 * we handle them here.
966 *
967 * At the end of this section, c->x86_capability better
968 * indicate the features this CPU genuinely supports!
969 */
970 switch (c->x86_vendor) {
971 case X86_VENDOR_AMD:
972 init_amd(c);
973 break;
974
975 case X86_VENDOR_INTEL:
976 init_intel(c);
977 break;
978
979 case X86_VENDOR_UNKNOWN:
980 default:
981 display_cacheinfo(c);
982 break;
983 }
984
985 select_idle_routine(c);
986 detect_ht(c);
987 sched_cmp_hack(c);
988
989 /*
990 * On SMP, boot_cpu_data holds the common feature set between
991 * all CPUs; so make sure that we indicate which features are
992 * common between the CPUs. The first time this routine gets
993 * executed, c == &boot_cpu_data.
994 */
995 if (c != &boot_cpu_data) {
996 /* AND the already accumulated flags with these */
997 for (i = 0 ; i < NCAPINTS ; i++)
998 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
999 }
1000
1001#ifdef CONFIG_X86_MCE
1002 mcheck_init(c);
1003#endif
1004#ifdef CONFIG_NUMA
1005 if (c != &boot_cpu_data)
1006 numa_add_cpu(c - cpu_data);
1007#endif
1008}
1009
1010
1011void __init print_cpu_info(struct cpuinfo_x86 *c)
1012{
1013 if (c->x86_model_id[0])
1014 printk("%s", c->x86_model_id);
1015
1016 if (c->x86_mask || c->cpuid_level >= 0)
1017 printk(" stepping %02x\n", c->x86_mask);
1018 else
1019 printk("\n");
1020}
1021
1022/*
1023 * Get CPU information for use by the procfs.
1024 */
1025
1026static int show_cpuinfo(struct seq_file *m, void *v)
1027{
1028 struct cpuinfo_x86 *c = v;
1029
1030 /*
1031 * These flag bits must match the definitions in <asm/cpufeature.h>.
1032 * NULL means this bit is undefined or reserved; either way it doesn't
1033 * have meaning as far as Linux is concerned. Note that it's important
1034 * to realize there is a difference between this table and CPUID -- if
1035 * applications want to get the raw CPUID data, they should access
1036 * /dev/cpu/<cpu_nr>/cpuid instead.
1037 */
1038 static char *x86_cap_flags[] = {
1039 /* Intel-defined */
1040 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
1041 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
1042 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
1043 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
1044
1045 /* AMD-defined */
1046 "pni", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1047 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
1048 NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
1049 NULL, "fxsr_opt", NULL, NULL, NULL, "lm", "3dnowext", "3dnow",
1050
1051 /* Transmeta-defined */
1052 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
1053 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1054 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1055 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1056
1057 /* Other (Linux-defined) */
1058 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", NULL, NULL, NULL, NULL,
1059 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1060 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1061 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1062
1063 /* Intel-defined (#2) */
1064 "pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "est",
1065 "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
1066 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1067 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1068
1069 /* AMD-defined (#2) */
1070 "lahf_lm", "cmp_legacy", NULL, NULL, NULL, NULL, NULL, NULL,
1071 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1072 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1073 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
1074 };
1075 static char *x86_power_flags[] = {
1076 "ts", /* temperature sensor */
1077 "fid", /* frequency id control */
1078 "vid", /* voltage id control */
1079 "ttp", /* thermal trip */
1080 "tm",
1081 "stc"
1082 };
1083
1084
1085#ifdef CONFIG_SMP
1086 if (!cpu_online(c-cpu_data))
1087 return 0;
1088#endif
1089
1090 seq_printf(m,"processor\t: %u\n"
1091 "vendor_id\t: %s\n"
1092 "cpu family\t: %d\n"
1093 "model\t\t: %d\n"
1094 "model name\t: %s\n",
1095 (unsigned)(c-cpu_data),
1096 c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
1097 c->x86,
1098 (int)c->x86_model,
1099 c->x86_model_id[0] ? c->x86_model_id : "unknown");
1100
1101 if (c->x86_mask || c->cpuid_level >= 0)
1102 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
1103 else
1104 seq_printf(m, "stepping\t: unknown\n");
1105
1106 if (cpu_has(c,X86_FEATURE_TSC)) {
1107 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
1108 cpu_khz / 1000, (cpu_khz % 1000));
1109 }
1110
1111 /* Cache size */
1112 if (c->x86_cache_size >= 0)
1113 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
1114
1115#ifdef CONFIG_SMP
1116 seq_printf(m, "physical id\t: %d\n", phys_proc_id[c - cpu_data]);
1117 seq_printf(m, "siblings\t: %d\n", c->x86_num_cores * smp_num_siblings);
1118#endif
1119
1120 seq_printf(m,
1121 "fpu\t\t: yes\n"
1122 "fpu_exception\t: yes\n"
1123 "cpuid level\t: %d\n"
1124 "wp\t\t: yes\n"
1125 "flags\t\t:",
1126 c->cpuid_level);
1127
1128 {
1129 int i;
1130 for ( i = 0 ; i < 32*NCAPINTS ; i++ )
1131 if ( test_bit(i, &c->x86_capability) &&
1132 x86_cap_flags[i] != NULL )
1133 seq_printf(m, " %s", x86_cap_flags[i]);
1134 }
1135
1136 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
1137 c->loops_per_jiffy/(500000/HZ),
1138 (c->loops_per_jiffy/(5000/HZ)) % 100);
1139
1140 if (c->x86_tlbsize > 0)
1141 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
1142 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
1143 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
1144
1145 seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
1146 c->x86_phys_bits, c->x86_virt_bits);
1147
1148 seq_printf(m, "power management:");
1149 {
1150 unsigned i;
1151 for (i = 0; i < 32; i++)
1152 if (c->x86_power & (1 << i)) {
1153 if (i < ARRAY_SIZE(x86_power_flags))
1154 seq_printf(m, " %s", x86_power_flags[i]);
1155 else
1156 seq_printf(m, " [%d]", i);
1157 }
1158 }
1159 seq_printf(m, "\n");
1160
1161 if (c->x86_num_cores > 1)
1162 seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores);
1163
1164 seq_printf(m, "\n\n");
1165
1166 return 0;
1167}
1168
1169static void *c_start(struct seq_file *m, loff_t *pos)
1170{
1171 return *pos < NR_CPUS ? cpu_data + *pos : NULL;
1172}
1173
1174static void *c_next(struct seq_file *m, void *v, loff_t *pos)
1175{
1176 ++*pos;
1177 return c_start(m, pos);
1178}
1179
1180static void c_stop(struct seq_file *m, void *v)
1181{
1182}
1183
1184struct seq_operations cpuinfo_op = {
1185 .start = c_start,
1186 .next = c_next,
1187 .stop = c_stop,
1188 .show = show_cpuinfo,
1189};
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
new file mode 100644
index 000000000000..b5305b04bc40
--- /dev/null
+++ b/arch/x86_64/kernel/setup64.c
@@ -0,0 +1,292 @@
1/*
2 * X86-64 specific CPU setup.
3 * Copyright (C) 1995 Linus Torvalds
4 * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
5 * See setup.c for older changelog.
6 * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $
7 */
8#include <linux/config.h>
9#include <linux/init.h>
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/string.h>
13#include <linux/bootmem.h>
14#include <linux/bitops.h>
15#include <asm/pda.h>
16#include <asm/pgtable.h>
17#include <asm/processor.h>
18#include <asm/desc.h>
19#include <asm/atomic.h>
20#include <asm/mmu_context.h>
21#include <asm/smp.h>
22#include <asm/i387.h>
23#include <asm/percpu.h>
24#include <asm/mtrr.h>
25#include <asm/proto.h>
26#include <asm/mman.h>
27#include <asm/numa.h>
28
29char x86_boot_params[2048] __initdata = {0,};
30
31cpumask_t cpu_initialized __initdata = CPU_MASK_NONE;
32
33struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned;
34
35extern struct task_struct init_task;
36
37extern unsigned char __per_cpu_start[], __per_cpu_end[];
38
39extern struct desc_ptr cpu_gdt_descr[];
40struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table };
41
42char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
43
44unsigned long __supported_pte_mask = ~0UL;
45static int do_not_nx __initdata = 0;
46
47/* noexec=on|off
48Control non-executable mappings for 64-bit processes.
49
50on Enable (default)
51off Disable
52*/
53int __init nonx_setup(char *str)
54{
55 if (!strncmp(str, "on", 2)) {
56 __supported_pte_mask |= _PAGE_NX;
57 do_not_nx = 0;
58 } else if (!strncmp(str, "off", 3)) {
59 do_not_nx = 1;
60 __supported_pte_mask &= ~_PAGE_NX;
61 }
62 return 0;
63}
64__setup("noexec=", nonx_setup); /* parsed early actually */
65
66int force_personality32 = READ_IMPLIES_EXEC;
67
68/* noexec32=on|off
69Control non-executable heap for 32-bit processes.
70To control the stack too, use noexec=off.
71
72on PROT_READ does not imply PROT_EXEC for 32-bit processes
73off PROT_READ implies PROT_EXEC (default)
74*/
75static int __init nonx32_setup(char *str)
76{
77 if (!strcmp(str, "on"))
78 force_personality32 &= ~READ_IMPLIES_EXEC;
79 else if (!strcmp(str, "off"))
80 force_personality32 |= READ_IMPLIES_EXEC;
81 return 0;
82}
83__setup("noexec32=", nonx32_setup);
84
85/*
86 * Great future plan:
87 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
88 * Always point %gs to its beginning
89 */
90void __init setup_per_cpu_areas(void)
91{
92 int i;
93 unsigned long size;
94
95 /* Copy section for each CPU (we discard the original) */
96 size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
97#ifdef CONFIG_MODULES
98 if (size < PERCPU_ENOUGH_ROOM)
99 size = PERCPU_ENOUGH_ROOM;
100#endif
101
102 for (i = 0; i < NR_CPUS; i++) {
103 unsigned char *ptr;
104
105 if (!NODE_DATA(cpu_to_node(i))) {
106 printk("cpu %d has no node, num_online_nodes %d\n",
107 i, num_online_nodes());
108 ptr = alloc_bootmem(size);
109 } else {
110 ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
111 }
112 if (!ptr)
113 panic("Cannot allocate cpu data for CPU %d\n", i);
114 cpu_pda[i].data_offset = ptr - __per_cpu_start;
115 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
116 }
117}
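
/*
 * The scheme above gives each CPU a private copy of the per-cpu section and
 * records only the distance from the template, so a per-cpu variable is
 * reached by adding the CPU's offset to the variable's template address.
 * A rough malloc-based model: intptr_t arithmetic stands in for the
 * kernel's pointer games, and all names below are illustrative.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define NCPUS 4

static char percpu_template[64] = "template value";	/* ~ __per_cpu_start.. */
static intptr_t data_offset[NCPUS];			/* ~ cpu_pda[i].data_offset */

int main(void)
{
	char *cpu2_var;
	int i;

	for (i = 0; i < NCPUS; i++) {
		char *copy = malloc(sizeof percpu_template);

		memcpy(copy, percpu_template, sizeof percpu_template);
		data_offset[i] = (intptr_t)copy - (intptr_t)percpu_template;
	}
	/* access the "per-cpu variable" at template offset 0 as CPU 2 */
	cpu2_var = (char *)((intptr_t)percpu_template + data_offset[2]);
	strcpy(cpu2_var, "cpu2 private");	/* touches only CPU 2's copy */
	printf("%s / %s\n", cpu2_var, percpu_template);
	return 0;
}
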
118
119void pda_init(int cpu)
120{
121 struct x8664_pda *pda = &cpu_pda[cpu];
122
123 /* Set up data that may be needed in __get_free_pages early */
124 asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
125 wrmsrl(MSR_GS_BASE, cpu_pda + cpu);
126
127 pda->me = pda;
128 pda->cpunumber = cpu;
129 pda->irqcount = -1;
130 pda->kernelstack =
131 (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
132 pda->active_mm = &init_mm;
133 pda->mmu_state = 0;
134
135 if (cpu == 0) {
136 /* others are initialized in smpboot.c */
137 pda->pcurrent = &init_task;
138 pda->irqstackptr = boot_cpu_stack;
139 } else {
140 pda->irqstackptr = (char *)
141 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
142 if (!pda->irqstackptr)
143 panic("cannot allocate irqstack for cpu %d", cpu);
144 }
145
146 asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
147
148 pda->irqstackptr += IRQSTACKSIZE-64;
149}
150
151char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ]
152__attribute__((section(".bss.page_aligned")));
153
154/* May not be marked __init: used by software suspend */
155void syscall_init(void)
156{
157 /*
158 * LSTAR and STAR live in a somewhat strange symbiosis.
159 * They both write to the same internal register. STAR allows setting CS/DS,
160 * but only a 32-bit target. LSTAR sets the 64-bit rip.
161 */
162 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
163 wrmsrl(MSR_LSTAR, system_call);
164
165#ifdef CONFIG_IA32_EMULATION
166 syscall32_cpu_init ();
167#endif
168
169 /* Flags to clear on syscall */
170 wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
171}
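
/*
 * The STAR packing above places the sysret base selector in bits 63:48 and
 * the syscall base in bits 47:32; the CPU derives CS and SS from those
 * bases. A sketch with illustrative selector values (assumed, not
 * necessarily this kernel's GDT layout):
 */
#include <stdio.h>

int main(void)
{
	unsigned long long user32_cs = 0x23;	/* assumed __USER32_CS */
	unsigned long long kernel_cs = 0x10;	/* assumed __KERNEL_CS */
	unsigned long long star = (user32_cs << 48) | (kernel_cs << 32);

	printf("STAR = %#llx\n", star);
	printf("sysret base = %#llx\n", star >> 48);
	printf("syscall base = %#llx\n", (star >> 32) & 0xffff);
	return 0;
}
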
172
173void __init check_efer(void)
174{
175 unsigned long efer;
176
177 rdmsrl(MSR_EFER, efer);
178 if (!(efer & EFER_NX) || do_not_nx) {
179 __supported_pte_mask &= ~_PAGE_NX;
180 }
181}
182
183/*
184 * cpu_init() initializes state that is per-CPU. Some data is already
185 * initialized (naturally) in the bootstrap process, such as the GDT
186 * and IDT. We reload them nevertheless; this function acts as a
187 * 'CPU state barrier': nothing should get across.
188 * A lot of state is already set up in PDA init.
189 */
190void __init cpu_init (void)
191{
192#ifdef CONFIG_SMP
193 int cpu = stack_smp_processor_id();
194#else
195 int cpu = smp_processor_id();
196#endif
197 struct tss_struct *t = &per_cpu(init_tss, cpu);
198 unsigned long v;
199 char *estacks = NULL;
200 struct task_struct *me;
201 int i;
202
203 /* CPU 0 is initialised in head64.c */
204 if (cpu != 0) {
205 pda_init(cpu);
206 } else
207 estacks = boot_exception_stacks;
208
209 me = current;
210
211 if (cpu_test_and_set(cpu, cpu_initialized))
212 panic("CPU#%d already initialized!\n", cpu);
213
214 printk("Initializing CPU#%d\n", cpu);
215
216 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
217
218 /*
219 * Initialize the per-CPU GDT with the boot GDT,
220 * and set up the GDT descriptor:
221 */
222 if (cpu) {
223 memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE);
224 }
225
226 cpu_gdt_descr[cpu].size = GDT_SIZE;
227 cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu];
228 asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu]));
229 asm volatile("lidt %0" :: "m" (idt_descr));
230
231 memcpy(me->thread.tls_array, cpu_gdt_table[cpu], GDT_ENTRY_TLS_ENTRIES * 8);
232
233 /*
234 * Delete NT
235 */
236
237 asm volatile("pushfq ; popq %%rax ; btr $14,%%rax ; pushq %%rax ; popfq" ::: "eax");
238
239 syscall_init();
240
241 wrmsrl(MSR_FS_BASE, 0);
242 wrmsrl(MSR_KERNEL_GS_BASE, 0);
243 barrier();
244
245 check_efer();
246
247 /*
248 * set up and load the per-CPU TSS
249 */
250 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
251 if (cpu) {
252 estacks = (char *)__get_free_pages(GFP_ATOMIC,
253 EXCEPTION_STACK_ORDER);
254 if (!estacks)
255 panic("Cannot allocate exception stack %ld %d\n",
256 v, cpu);
257 }
258 estacks += EXCEPTION_STKSZ;
259 t->ist[v] = (unsigned long)estacks;
260 }
261
262 t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
263 /*
264 * <= is required because the CPU will access up to
265 * 8 bits beyond the end of the IO permission bitmap.
266 */
267 for (i = 0; i <= IO_BITMAP_LONGS; i++)
268 t->io_bitmap[i] = ~0UL;
269
270 atomic_inc(&init_mm.mm_count);
271 me->active_mm = &init_mm;
272 if (me->mm)
273 BUG();
274 enter_lazy_tlb(&init_mm, me);
275
276 set_tss_desc(cpu, t);
277 load_TR_desc();
278 load_LDT(&init_mm.context);
279
280 /*
281 * Clear all 6 debug registers:
282 */
283
284 set_debug(0UL, 0);
285 set_debug(0UL, 1);
286 set_debug(0UL, 2);
287 set_debug(0UL, 3);
288 set_debug(0UL, 6);
289 set_debug(0UL, 7);
290
291 fpu_init();
292}
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
new file mode 100644
index 000000000000..5ace32c6ac5c
--- /dev/null
+++ b/arch/x86_64/kernel/signal.c
@@ -0,0 +1,486 @@
1/*
2 * linux/arch/x86_64/kernel/signal.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 *
7 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
8 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
9 * 2000-2002 x86-64 support by Andi Kleen
10 *
11 * $Id: signal.c,v 1.18 2001/10/17 22:30:37 ak Exp $
12 */
13
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/smp.h>
17#include <linux/smp_lock.h>
18#include <linux/kernel.h>
19#include <linux/signal.h>
20#include <linux/errno.h>
21#include <linux/wait.h>
22#include <linux/ptrace.h>
23#include <linux/unistd.h>
24#include <linux/stddef.h>
25#include <linux/personality.h>
26#include <linux/compiler.h>
27#include <asm/ucontext.h>
28#include <asm/uaccess.h>
29#include <asm/i387.h>
30#include <asm/proto.h>
31
32/* #define DEBUG_SIG 1 */
33
34#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
35
36void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
37 sigset_t *set, struct pt_regs * regs);
38void ia32_setup_frame(int sig, struct k_sigaction *ka,
39 sigset_t *set, struct pt_regs * regs);
40
41asmlinkage long
42sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs)
43{
44 sigset_t saveset, newset;
45
46 /* XXX: Don't preclude handling different sized sigset_t's. */
47 if (sigsetsize != sizeof(sigset_t))
48 return -EINVAL;
49
50 if (copy_from_user(&newset, unewset, sizeof(newset)))
51 return -EFAULT;
52 sigdelsetmask(&newset, ~_BLOCKABLE);
53
54 spin_lock_irq(&current->sighand->siglock);
55 saveset = current->blocked;
56 current->blocked = newset;
57 recalc_sigpending();
58 spin_unlock_irq(&current->sighand->siglock);
59#ifdef DEBUG_SIG
60 printk("rt_sigsuspend saveset(%lx) newset(%lx) regs(%p) rip(%lx)\n",
61 saveset, newset, regs, regs->rip);
62#endif
63 regs->rax = -EINTR;
64 while (1) {
65 current->state = TASK_INTERRUPTIBLE;
66 schedule();
67 if (do_signal(regs, &saveset))
68 return -EINTR;
69 }
70}
71
72asmlinkage long
73sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
74 struct pt_regs *regs)
75{
76 return do_sigaltstack(uss, uoss, regs->rsp);
77}
78
79
80/*
81 * Do a signal return; undo the signal stack.
82 */
83
84struct rt_sigframe
85{
86 char *pretcode;
87 struct ucontext uc;
88 struct siginfo info;
89};
90
91static int
92restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax)
93{
94 unsigned int err = 0;
95
96 /* Always make any pending restarted system calls return -EINTR */
97 current_thread_info()->restart_block.fn = do_no_restart_syscall;
98
99#define COPY(x) err |= __get_user(regs->x, &sc->x)
100
101 COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx);
102 COPY(rdx); COPY(rcx); COPY(rip);
103 COPY(r8);
104 COPY(r9);
105 COPY(r10);
106 COPY(r11);
107 COPY(r12);
108 COPY(r13);
109 COPY(r14);
110 COPY(r15);
111
112 {
113 unsigned int tmpflags;
114 err |= __get_user(tmpflags, &sc->eflags);
115 regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
116 regs->orig_rax = -1; /* disable syscall checks */
117 }
118
119 {
120 struct _fpstate __user * buf;
121 err |= __get_user(buf, &sc->fpstate);
122
123 if (buf) {
124 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
125 goto badframe;
126 err |= restore_i387(buf);
127 } else {
128 struct task_struct *me = current;
129 if (used_math()) {
130 clear_fpu(me);
131 clear_used_math();
132 }
133 }
134 }
135
136 err |= __get_user(*prax, &sc->rax);
137 return err;
138
139badframe:
140 return 1;
141}
142
143asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
144{
145 struct rt_sigframe __user *frame;
146 sigset_t set;
147 unsigned long eax;
148
149 frame = (struct rt_sigframe __user *)(regs->rsp - 8);
150 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) {
151 goto badframe;
152 }
153 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) {
154 goto badframe;
155 }
156
157 sigdelsetmask(&set, ~_BLOCKABLE);
158 spin_lock_irq(&current->sighand->siglock);
159 current->blocked = set;
160 recalc_sigpending();
161 spin_unlock_irq(&current->sighand->siglock);
162
163 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
164 goto badframe;
165
166#ifdef DEBUG_SIG
167 printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax);
168#endif
169
170 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT)
171 goto badframe;
172
173 return eax;
174
175badframe:
176 signal_fault(regs,frame,"sigreturn");
177 return 0;
178}
179
180/*
181 * Set up a signal frame.
182 */
183
184static inline int
185setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
186{
187 int err = 0;
188 unsigned long eflags;
189
190 err |= __put_user(0, &sc->gs);
191 err |= __put_user(0, &sc->fs);
192
193 err |= __put_user(regs->rdi, &sc->rdi);
194 err |= __put_user(regs->rsi, &sc->rsi);
195 err |= __put_user(regs->rbp, &sc->rbp);
196 err |= __put_user(regs->rsp, &sc->rsp);
197 err |= __put_user(regs->rbx, &sc->rbx);
198 err |= __put_user(regs->rdx, &sc->rdx);
199 err |= __put_user(regs->rcx, &sc->rcx);
200 err |= __put_user(regs->rax, &sc->rax);
201 err |= __put_user(regs->r8, &sc->r8);
202 err |= __put_user(regs->r9, &sc->r9);
203 err |= __put_user(regs->r10, &sc->r10);
204 err |= __put_user(regs->r11, &sc->r11);
205 err |= __put_user(regs->r12, &sc->r12);
206 err |= __put_user(regs->r13, &sc->r13);
207 err |= __put_user(regs->r14, &sc->r14);
208 err |= __put_user(regs->r15, &sc->r15);
209 err |= __put_user(me->thread.trap_no, &sc->trapno);
210 err |= __put_user(me->thread.error_code, &sc->err);
211 err |= __put_user(regs->rip, &sc->rip);
212 eflags = regs->eflags;
213 if (current->ptrace & PT_PTRACED) {
214 eflags &= ~TF_MASK;
215 }
216 err |= __put_user(eflags, &sc->eflags);
217 err |= __put_user(mask, &sc->oldmask);
218 err |= __put_user(me->thread.cr2, &sc->cr2);
219
220 return err;
221}
222
223/*
224 * Determine which stack to use..
225 */
226
227static void __user *
228get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
229{
230 unsigned long rsp;
231
232 /* Default to using the normal stack minus the 128-byte red zone */
233 rsp = regs->rsp - 128;
234
235 /* This is the X/Open sanctioned signal stack switching. */
236 /* RED-PEN: redzone on that stack? */
237 if (ka->sa.sa_flags & SA_ONSTACK) {
238 if (sas_ss_flags(rsp) == 0)
239 rsp = current->sas_ss_sp + current->sas_ss_size;
240 }
241
242 return (void __user *)round_down(rsp - size, 16);
243}
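
/*
 * get_stack() steps over the 128-byte ABI red zone below the interrupted
 * rsp, and setup_rt_frame() then rounds the frame down to 16 bytes and
 * subtracts 8, leaving the alignment a call instruction would have
 * produced. The combined arithmetic with sample numbers (frame size
 * assumed):
 */
#include <stdio.h>

#define round_down(x, a) ((x) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long rsp = 0x7fffffffe4d3;	/* example interrupted user %rsp */
	unsigned long size = 0x4d0;		/* pretend sizeof(struct rt_sigframe) */
	unsigned long frame;

	frame = round_down(rsp - 128 - size, 16) - 8;
	/* frame mod 16 == 8: as if an aligned stack had a return address pushed */
	printf("frame at %#lx, frame %% 16 = %lu\n", frame, frame % 16);
	return 0;
}
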
244
245static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
246 sigset_t *set, struct pt_regs * regs)
247{
248 struct rt_sigframe __user *frame;
249 struct _fpstate __user *fp = NULL;
250 int err = 0;
251 struct task_struct *me = current;
252
253 if (used_math()) {
254 fp = get_stack(ka, regs, sizeof(struct _fpstate));
255 frame = (void __user *)round_down(
256 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
257
258 if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
259 goto give_sigsegv;
260
261 if (save_i387(fp) < 0)
262 err |= -1;
263 } else
264 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
265
266 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
267 goto give_sigsegv;
268
269 if (ka->sa.sa_flags & SA_SIGINFO) {
270 err |= copy_siginfo_to_user(&frame->info, info);
271 if (err)
272 goto give_sigsegv;
273 }
274
275 /* Create the ucontext. */
276 err |= __put_user(0, &frame->uc.uc_flags);
277 err |= __put_user(0, &frame->uc.uc_link);
278 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
279 err |= __put_user(sas_ss_flags(regs->rsp),
280 &frame->uc.uc_stack.ss_flags);
281 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
282 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
283 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
284 if (sizeof(*set) == 16) {
285 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
286 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
287 } else
288 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
289
290 /* Set up to return from userspace. If provided, use a stub
291 already in userspace. */
292 /* x86-64 should always use SA_RESTORER. */
293 if (ka->sa.sa_flags & SA_RESTORER) {
294 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
295 } else {
296 /* could use a vstub here */
297 goto give_sigsegv;
298 }
299
300 if (err)
301 goto give_sigsegv;
302
303#ifdef DEBUG_SIG
304 printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax);
305#endif
306
307 /* Set up registers for signal handler */
308 {
309 struct exec_domain *ed = current_thread_info()->exec_domain;
310 if (unlikely(ed && ed->signal_invmap && sig < 32))
311 sig = ed->signal_invmap[sig];
312 }
313 regs->rdi = sig;
314 /* In case the signal handler was declared without prototypes */
315 regs->rax = 0;
316
317 /* This also works for non SA_SIGINFO handlers because they expect the
318 next argument after the signal number on the stack. */
319 regs->rsi = (unsigned long)&frame->info;
320 regs->rdx = (unsigned long)&frame->uc;
321 regs->rip = (unsigned long) ka->sa.sa_handler;
322
323 regs->rsp = (unsigned long)frame;
324
325 set_fs(USER_DS);
326 if (regs->eflags & TF_MASK) {
327 if ((current->ptrace & (PT_PTRACED | PT_DTRACE)) == (PT_PTRACED | PT_DTRACE)) {
328 ptrace_notify(SIGTRAP);
329 } else {
330 regs->eflags &= ~TF_MASK;
331 }
332 }
333
334#ifdef DEBUG_SIG
335 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
336 current->comm, current->pid, frame, regs->rip, frame->pretcode);
337#endif
338
339 return;
340
341give_sigsegv:
342 force_sigsegv(sig, current);
343}
344
345/*
346 * OK, we're invoking a handler
347 */
348
349static void
350handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
351 sigset_t *oldset, struct pt_regs *regs)
352{
353#ifdef DEBUG_SIG
354 printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n",
355 current->pid, sig,
356 regs->rip, regs->rsp, regs);
357#endif
358
359 /* Are we from a system call? */
360 if ((long)regs->orig_rax >= 0) {
361 /* If so, check system call restarting.. */
362 switch (regs->rax) {
363 case -ERESTART_RESTARTBLOCK:
364 case -ERESTARTNOHAND:
365 regs->rax = -EINTR;
366 break;
367
368 case -ERESTARTSYS:
369 if (!(ka->sa.sa_flags & SA_RESTART)) {
370 regs->rax = -EINTR;
371 break;
372 }
373 /* fallthrough */
374 case -ERESTARTNOINTR:
375 regs->rax = regs->orig_rax;
376 regs->rip -= 2;
377 break;
378 }
379 }
380
381#ifdef CONFIG_IA32_EMULATION
382 if (test_thread_flag(TIF_IA32)) {
383 if (ka->sa.sa_flags & SA_SIGINFO)
384 ia32_setup_rt_frame(sig, ka, info, oldset, regs);
385 else
386 ia32_setup_frame(sig, ka, oldset, regs);
387 } else
388#endif
389 setup_rt_frame(sig, ka, info, oldset, regs);
390
391 if (!(ka->sa.sa_flags & SA_NODEFER)) {
392 spin_lock_irq(&current->sighand->siglock);
393 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
394 sigaddset(&current->blocked,sig);
395 recalc_sigpending();
396 spin_unlock_irq(&current->sighand->siglock);
397 }
398}
399
400/*
401 * Note that 'init' is a special process: it doesn't get signals it doesn't
402 * want to handle. Thus you cannot kill init, even with a SIGKILL, even by
403 * mistake.
404 */
405int do_signal(struct pt_regs *regs, sigset_t *oldset)
406{
407 struct k_sigaction ka;
408 siginfo_t info;
409 int signr;
410
411 /*
412 * We want the common case to go fast, which
413 * is why we may in certain cases get here from
414 * kernel mode. Just return without doing anything
415 * if so.
416 */
417 if ((regs->cs & 3) != 3)
418 return 1;
419
420 if (try_to_freeze(0))
421 goto no_signal;
422
423 if (!oldset)
424 oldset = &current->blocked;
425
426 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
427 if (signr > 0) {
428 /* Reenable any watchpoints before delivering the
429 * signal to user space. The processor register will
430 * have been cleared if the watchpoint triggered
431 * inside the kernel.
432 */
433 if (current->thread.debugreg7)
434 asm volatile("movq %0,%%db7" : : "r" (current->thread.debugreg7));
435
436 /* Whee! Actually deliver the signal. */
437 handle_signal(signr, &info, &ka, oldset, regs);
438 return 1;
439 }
440
441 no_signal:
442 /* Did we come from a system call? */
443 if ((long)regs->orig_rax >= 0) {
444 /* Restart the system call - no handlers present */
445 long res = regs->rax;
446 if (res == -ERESTARTNOHAND ||
447 res == -ERESTARTSYS ||
448 res == -ERESTARTNOINTR) {
449 regs->rax = regs->orig_rax;
450 regs->rip -= 2;
451 }
452 if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) {
453 regs->rax = __NR_restart_syscall;
454 regs->rip -= 2;
455 }
456 }
457 return 0;
458}
459
460void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags)
461{
462#ifdef DEBUG_SIG
463 printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n",
464 thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current));
465#endif
466
467 /* Pending single-step? */
468 if (thread_info_flags & _TIF_SINGLESTEP) {
469 regs->eflags |= TF_MASK;
470 clear_thread_flag(TIF_SINGLESTEP);
471 }
472
473 /* deal with pending signal delivery */
474 if (thread_info_flags & _TIF_SIGPENDING)
475 do_signal(regs,oldset);
476}
477
478void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
479{
480 struct task_struct *me = current;
481 if (exception_trace)
482 printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n",
483 me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax);
484
485 force_sig(SIGSEGV, me);
486}
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
new file mode 100644
index 000000000000..e3cdbf9a88bd
--- /dev/null
+++ b/arch/x86_64/kernel/smp.c
@@ -0,0 +1,415 @@
1/*
2 * Intel SMP support routines.
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
6 * (c) 2002,2003 Andi Kleen, SuSE Labs.
7 *
8 * This code is released under the GNU General Public License version 2 or
9 * later.
10 */
11
12#include <linux/init.h>
13
14#include <linux/mm.h>
15#include <linux/irq.h>
16#include <linux/delay.h>
17#include <linux/spinlock.h>
18#include <linux/smp_lock.h>
19#include <linux/smp.h>
20#include <linux/kernel_stat.h>
21#include <linux/mc146818rtc.h>
22#include <linux/interrupt.h>
23
24#include <asm/mtrr.h>
25#include <asm/pgalloc.h>
26#include <asm/tlbflush.h>
27#include <asm/mach_apic.h>
28#include <asm/mmu_context.h>
29#include <asm/proto.h>
30
31/*
32 * Smarter SMP flushing macros.
33 * c/o Linus Torvalds.
34 *
35 * These mean you can really definitely utterly forget about
36 * writing to user space from interrupts. (It's not allowed anyway.)
37 *
38 * Optimizations Manfred Spraul <manfred@colorfullife.com>
39 */
40
41static cpumask_t flush_cpumask;
42static struct mm_struct * flush_mm;
43static unsigned long flush_va;
44static DEFINE_SPINLOCK(tlbstate_lock);
45#define FLUSH_ALL -1ULL
46
47/*
48 * We cannot call mmdrop() because we are in interrupt context,
49 * instead update mm->cpu_vm_mask.
50 */
51static inline void leave_mm (unsigned long cpu)
52{
53 if (read_pda(mmu_state) == TLBSTATE_OK)
54 BUG();
55 clear_bit(cpu, &read_pda(active_mm)->cpu_vm_mask);
56 load_cr3(swapper_pg_dir);
57}
58
59/*
60 *
61 * The flush IPI assumes that a thread switch happens in this order:
62 * [cpu0: the cpu that switches]
63 * 1) switch_mm() either 1a) or 1b)
64 * 1a) thread switch to a different mm
65 * 1a1) clear_bit(cpu, &old_mm->cpu_vm_mask);
66 * Stop ipi delivery for the old mm. This is not synchronized with
67 * the other cpus, but smp_invalidate_interrupt ignores flush ipis
68 * for the wrong mm, and in the worst case we perform a superfluous
69 * tlb flush.
70 * 1a2) set cpu mmu_state to TLBSTATE_OK
71 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
72 * was in lazy tlb mode.
73 * 1a3) update cpu active_mm
74 * Now cpu0 accepts tlb flushes for the new mm.
75 * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask);
76 * Now the other cpus will send tlb flush ipis.
77 * 1a5) change cr3.
78 * 1b) thread switch without mm change
79 * cpu active_mm is correct, cpu0 already handles
80 * flush ipis.
81 * 1b1) set cpu mmu_state to TLBSTATE_OK
82 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
83 * Atomically set the bit [other cpus will start sending flush ipis],
84 * and test the bit.
85 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
86 * 2) switch %%esp, ie current
87 *
88 * The interrupt must handle 2 special cases:
89 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
90 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
91 * runs in kernel space, the cpu could load tlb entries for user space
92 * pages.
93 *
94 * The good news is that cpu mmu_state is local to each cpu, no
95 * write/read ordering problems.
96 */
97
98/*
99 * TLB flush IPI:
100 *
101 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
102 * 2) Leave the mm if we are in the lazy tlb mode.
103 */
104
105asmlinkage void smp_invalidate_interrupt (void)
106{
107 unsigned long cpu;
108
109 cpu = get_cpu();
110
111 if (!cpu_isset(cpu, flush_cpumask))
112 goto out;
113 /*
114 * This was a BUG() but until someone can quote me the
115 * line from the intel manual that guarantees an IPI to
116 * multiple CPUs is retried _only_ on the erroring CPUs
117 * it's staying as a return
118 *
119 * BUG();
120 */
121
122 if (flush_mm == read_pda(active_mm)) {
123 if (read_pda(mmu_state) == TLBSTATE_OK) {
124 if (flush_va == FLUSH_ALL)
125 local_flush_tlb();
126 else
127 __flush_tlb_one(flush_va);
128 } else
129 leave_mm(cpu);
130 }
131 ack_APIC_irq();
132 cpu_clear(cpu, flush_cpumask);
133
134out:
135 put_cpu_no_resched();
136}
137
138static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
139 unsigned long va)
140{
141 cpumask_t tmp;
142 /*
143 * A couple of (to be removed) sanity checks:
144 *
145 * - we do not send IPIs to not-yet booted CPUs.
146 * - current CPU must not be in mask
147 * - mask must exist :)
148 */
149 BUG_ON(cpus_empty(cpumask));
150 cpus_and(tmp, cpumask, cpu_online_map);
151 BUG_ON(!cpus_equal(tmp, cpumask));
152 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
153 if (!mm)
154 BUG();
155
156 /*
157 * I'm not happy about this global shared spinlock in the
158 * MM hot path, but we'll see how contended it is.
159 * Temporarily this turns IRQs off, so that lockups are
160 * detected by the NMI watchdog.
161 */
162 spin_lock(&tlbstate_lock);
163
164 flush_mm = mm;
165 flush_va = va;
166 cpus_or(flush_cpumask, cpumask, flush_cpumask);
167
168 /*
169 * We have to send the IPI only to
170 * CPUs affected.
171 */
172 send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
173
174 while (!cpus_empty(flush_cpumask))
175 mb(); /* nothing; lockup detection does not belong here */
176
177 flush_mm = NULL;
178 flush_va = 0;
179 spin_unlock(&tlbstate_lock);
180}
181
182void flush_tlb_current_task(void)
183{
184 struct mm_struct *mm = current->mm;
185 cpumask_t cpu_mask;
186
187 preempt_disable();
188 cpu_mask = mm->cpu_vm_mask;
189 cpu_clear(smp_processor_id(), cpu_mask);
190
191 local_flush_tlb();
192 if (!cpus_empty(cpu_mask))
193 flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
194 preempt_enable();
195}
196
197void flush_tlb_mm (struct mm_struct * mm)
198{
199 cpumask_t cpu_mask;
200
201 preempt_disable();
202 cpu_mask = mm->cpu_vm_mask;
203 cpu_clear(smp_processor_id(), cpu_mask);
204
205 if (current->active_mm == mm) {
206 if (current->mm)
207 local_flush_tlb();
208 else
209 leave_mm(smp_processor_id());
210 }
211 if (!cpus_empty(cpu_mask))
212 flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
213
214 preempt_enable();
215}
216
217void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
218{
219 struct mm_struct *mm = vma->vm_mm;
220 cpumask_t cpu_mask;
221
222 preempt_disable();
223 cpu_mask = mm->cpu_vm_mask;
224 cpu_clear(smp_processor_id(), cpu_mask);
225
226 if (current->active_mm == mm) {
227 		if (current->mm)
228 __flush_tlb_one(va);
229 else
230 leave_mm(smp_processor_id());
231 }
232
233 if (!cpus_empty(cpu_mask))
234 flush_tlb_others(cpu_mask, mm, va);
235
236 preempt_enable();
237}
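/*
 * Hedged caller sketch: generic mm code is expected to call this after
 * changing a user PTE, roughly
 *
 *	set_pte(ptep, new_pte);
 *	flush_tlb_page(vma, address);
 *
 * so that every CPU which may still cache the old translation drops it.
 */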
238
239static void do_flush_tlb_all(void* info)
240{
241 unsigned long cpu = smp_processor_id();
242
243 __flush_tlb_all();
244 if (read_pda(mmu_state) == TLBSTATE_LAZY)
245 leave_mm(cpu);
246}
247
248void flush_tlb_all(void)
249{
250 on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
251}
252
253void smp_kdb_stop(void)
254{
255 send_IPI_allbutself(KDB_VECTOR);
256}
257
258/*
259 * this function sends a 'reschedule' IPI to another CPU.
260 * it goes straight through and wastes no time serializing
261 * anything. Worst case is that we lose a reschedule ...
262 */
263
264void smp_send_reschedule(int cpu)
265{
266 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
267}
268
269/*
270 * Structure and data for smp_call_function(). This is designed to minimise
271 * static memory requirements. It also looks cleaner.
272 */
273static DEFINE_SPINLOCK(call_lock);
274
275struct call_data_struct {
276 void (*func) (void *info);
277 void *info;
278 atomic_t started;
279 atomic_t finished;
280 int wait;
281};
282
283static struct call_data_struct * call_data;
284
285/*
286 * this function sends a 'generic call function' IPI to all other CPUs
287 * in the system.
288 */
289static void __smp_call_function (void (*func) (void *info), void *info,
290 int nonatomic, int wait)
291{
292 struct call_data_struct data;
293 int cpus = num_online_cpus()-1;
294
295 if (!cpus)
296 return;
297
298 data.func = func;
299 data.info = info;
300 atomic_set(&data.started, 0);
301 data.wait = wait;
302 if (wait)
303 atomic_set(&data.finished, 0);
304
305 call_data = &data;
306 wmb();
307 /* Send a message to all other CPUs and wait for them to respond */
308 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
309
310 /* Wait for response */
311 while (atomic_read(&data.started) != cpus)
312 cpu_relax();
313
314 if (!wait)
315 return;
316
317 while (atomic_read(&data.finished) != cpus)
318 cpu_relax();
319}
320
321/*
322 * smp_call_function - run a function on all other CPUs.
323 * @func: The function to run. This must be fast and non-blocking.
324 * @info: An arbitrary pointer to pass to the function.
325 * @nonatomic: currently unused.
326 * @wait: If true, wait (atomically) until function has completed on other
327 * CPUs.
328 *
329 * Returns 0 on success, else a negative status code. Does not return until
330 * remote CPUs are nearly ready to execute func or have already executed it.
331 *
332 * You must not call this function with disabled interrupts or from a
333 * hardware interrupt handler or from a bottom half handler.
334 * Actually there are a few legal cases, like panic.
335 */
336int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
337 int wait)
338{
339 spin_lock(&call_lock);
340 __smp_call_function(func,info,nonatomic,wait);
341 spin_unlock(&call_lock);
342 return 0;
343}
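/*
 * Minimal usage sketch; drain_counters() and local_counter() are
 * hypothetical names made up for illustration:
 *
 *	static void drain_counters(void *info)
 *	{
 *		atomic_add(local_counter(), (atomic_t *)info);
 *	}
 *
 *	smp_call_function(drain_counters, &total, 0, 1);
 *
 * The callback runs in interrupt context on each remote CPU, so it must
 * be fast, non-blocking and must not take sleeping locks.
 */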
344
345void smp_stop_cpu(void)
346{
347 /*
348 * Remove this CPU:
349 */
350 cpu_clear(smp_processor_id(), cpu_online_map);
351 local_irq_disable();
352 disable_local_APIC();
353 local_irq_enable();
354}
355
356static void smp_really_stop_cpu(void *dummy)
357{
358 smp_stop_cpu();
359 for (;;)
360 asm("hlt");
361}
362
363void smp_send_stop(void)
364{
365 int nolock = 0;
366 if (reboot_force)
367 return;
368 /* Don't deadlock on the call lock in panic */
369 if (!spin_trylock(&call_lock)) {
370 		/* ignore locking because we have panicked anyway */
371 nolock = 1;
372 }
373 __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
374 if (!nolock)
375 spin_unlock(&call_lock);
376
377 local_irq_disable();
378 disable_local_APIC();
379 local_irq_enable();
380}
381
382/*
383 * Reschedule call back. Nothing to do,
384 * all the work is done automatically when
385 * we return from the interrupt.
386 */
387asmlinkage void smp_reschedule_interrupt(void)
388{
389 ack_APIC_irq();
390}
391
392asmlinkage void smp_call_function_interrupt(void)
393{
394 void (*func) (void *info) = call_data->func;
395 void *info = call_data->info;
396 int wait = call_data->wait;
397
398 ack_APIC_irq();
399 /*
400 * Notify initiating CPU that I've grabbed the data and am
401 * about to execute the function
402 */
403 mb();
404 atomic_inc(&call_data->started);
405 /*
406 * At this point the info structure may be out of scope unless wait==1
407 */
408 irq_enter();
409 (*func)(info);
410 irq_exit();
411 if (wait) {
412 mb();
413 atomic_inc(&call_data->finished);
414 }
415}
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
new file mode 100644
index 000000000000..afd3600b919e
--- /dev/null
+++ b/arch/x86_64/kernel/smpboot.c
@@ -0,0 +1,938 @@
1/*
2 * x86 SMP booting functions
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6 * Copyright 2001 Andi Kleen, SuSE Labs.
7 *
8 * Much of the core SMP work is based on previous work by Thomas Radke, to
9 * whom a great many thanks are extended.
10 *
11 * Thanks to Intel for making available several different Pentium,
12 * Pentium Pro and Pentium-II/Xeon MP machines.
13 * Original development of Linux SMP code supported by Caldera.
14 *
15 * This code is released under the GNU General Public License version 2 or
16 * later.
17 *
18 * Fixes
19 * Felix Koop : NR_CPUS used properly
20 * Jose Renau : Handle single CPU case.
21 * Alan Cox : By repeated request 8) - Total BogoMIP report.
22 * Greg Wright : Fix for kernel stacks panic.
23 * Erich Boleyn : MP v1.4 and additional changes.
24 * Matthias Sattler : Changes for 2.1 kernel map.
25 * Michel Lespinasse : Changes for 2.1 kernel map.
26 * Michael Chastain : Change trampoline.S to gnu as.
27 * Alan Cox : Dumb bug: 'B' step PPro's are fine
28 * Ingo Molnar : Added APIC timers, based on code
29 * from Jose Renau
30 * Ingo Molnar : various cleanups and rewrites
31 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
32 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
33 * Andi Kleen : Changed for SMP boot into long mode.
34 * Rusty Russell : Hacked into shape for new "hotplug" boot process.
35 */
36
37#include <linux/config.h>
38#include <linux/init.h>
39
40#include <linux/mm.h>
41#include <linux/kernel_stat.h>
42#include <linux/smp_lock.h>
43#include <linux/irq.h>
44#include <linux/bootmem.h>
45#include <linux/thread_info.h>
46#include <linux/module.h>
47
48#include <linux/delay.h>
49#include <linux/mc146818rtc.h>
50#include <asm/mtrr.h>
51#include <asm/pgalloc.h>
52#include <asm/desc.h>
53#include <asm/kdebug.h>
54#include <asm/tlbflush.h>
55#include <asm/proto.h>
56
57/* Number of siblings per CPU package */
58int smp_num_siblings = 1;
59/* Package ID of each logical CPU */
60u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
61EXPORT_SYMBOL(phys_proc_id);
62
63/* Bitmask of currently online CPUs */
64cpumask_t cpu_online_map;
65
66cpumask_t cpu_callin_map;
67cpumask_t cpu_callout_map;
68static cpumask_t smp_commenced_mask;
69
70/* Per CPU bogomips and other parameters */
71struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
72
73cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
74
75/*
76 * Trampoline 80x86 program as an array.
77 */
78
79extern unsigned char trampoline_data [];
80extern unsigned char trampoline_end [];
81
82/*
83 * Currently trivial. Write the real->protected mode
84 * bootstrap into the page concerned. The caller
85 * has made sure it's suitably aligned.
86 */
87
88static unsigned long __init setup_trampoline(void)
89{
90 void *tramp = __va(SMP_TRAMPOLINE_BASE);
91 memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);
92 return virt_to_phys(tramp);
93}
94
95/*
96 * The bootstrap kernel entry code has set these up. Save them for
97 * a given CPU
98 */
99
100static void __init smp_store_cpu_info(int id)
101{
102 struct cpuinfo_x86 *c = cpu_data + id;
103
104 *c = boot_cpu_data;
105 identify_cpu(c);
106}
107
108/*
109 * TSC synchronization.
110 *
111 * We first check whether all CPUs have their TSCs synchronized,
112 * then we print a warning if not, and always resync.
113 */
114
115static atomic_t tsc_start_flag = ATOMIC_INIT(0);
116static atomic_t tsc_count_start = ATOMIC_INIT(0);
117static atomic_t tsc_count_stop = ATOMIC_INIT(0);
118static unsigned long long tsc_values[NR_CPUS];
119
120#define NR_LOOPS 5
121
122extern unsigned int fast_gettimeoffset_quotient;
123
124static void __init synchronize_tsc_bp (void)
125{
126 int i;
127 unsigned long long t0;
128 unsigned long long sum, avg;
129 long long delta;
130 long one_usec;
131 int buggy = 0;
132
133 printk(KERN_INFO "checking TSC synchronization across %u CPUs: ",num_booting_cpus());
134
135 one_usec = cpu_khz;
136
137 atomic_set(&tsc_start_flag, 1);
138 wmb();
139
140 /*
141 * We loop a few times to get a primed instruction cache,
142 * then the last pass is more or less synchronized and
143 * the BP and APs set their cycle counters to zero all at
144 * once. This reduces the chance of having random offsets
145 * between the processors, and guarantees that the maximum
146 * delay between the cycle counters is never bigger than
147 * the latency of information-passing (cachelines) between
148 * two CPUs.
149 */
150 for (i = 0; i < NR_LOOPS; i++) {
151 /*
152 * all APs synchronize but they loop on '== num_cpus'
153 */
154 while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) mb();
155 atomic_set(&tsc_count_stop, 0);
156 wmb();
157 /*
158 * this lets the APs save their current TSC:
159 */
160 atomic_inc(&tsc_count_start);
161
162 sync_core();
163 rdtscll(tsc_values[smp_processor_id()]);
164 /*
165 * We clear the TSC in the last loop:
166 */
167 if (i == NR_LOOPS-1)
168 write_tsc(0, 0);
169
170 /*
171 * Wait for all APs to leave the synchronization point:
172 */
173 while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) mb();
174 atomic_set(&tsc_count_start, 0);
175 wmb();
176 atomic_inc(&tsc_count_stop);
177 }
178
179 sum = 0;
180 for (i = 0; i < NR_CPUS; i++) {
181 if (cpu_isset(i, cpu_callout_map)) {
182 t0 = tsc_values[i];
183 sum += t0;
184 }
185 }
186 avg = sum / num_booting_cpus();
187
188 sum = 0;
189 for (i = 0; i < NR_CPUS; i++) {
190 if (!cpu_isset(i, cpu_callout_map))
191 continue;
192
193 delta = tsc_values[i] - avg;
194 if (delta < 0)
195 delta = -delta;
196 /*
197 		 * We report clock differences bigger than 2 microseconds.
198 */
199 if (delta > 2*one_usec) {
200 long realdelta;
201 if (!buggy) {
202 buggy = 1;
203 printk("\n");
204 }
205 realdelta = delta / one_usec;
206 if (tsc_values[i] < avg)
207 realdelta = -realdelta;
208
209 printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
210 i, realdelta);
211 }
212
213 sum += delta;
214 }
215 if (!buggy)
216 printk("passed.\n");
217}
218
219static void __init synchronize_tsc_ap (void)
220{
221 int i;
222
223 /*
224 * Not every cpu is online at the time
225 * this gets called, so we first wait for the BP to
226 * finish SMP initialization:
227 */
228 while (!atomic_read(&tsc_start_flag)) mb();
229
230 for (i = 0; i < NR_LOOPS; i++) {
231 atomic_inc(&tsc_count_start);
232 while (atomic_read(&tsc_count_start) != num_booting_cpus()) mb();
233
234 sync_core();
235 rdtscll(tsc_values[smp_processor_id()]);
236 if (i == NR_LOOPS-1)
237 write_tsc(0, 0);
238
239 atomic_inc(&tsc_count_stop);
240 while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
241 }
242}
243#undef NR_LOOPS
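/*
 * Each pass above is a pair of counter barriers; roughly, from the BP's
 * point of view (the APs run the mirror image in synchronize_tsc_ap()):
 *
 *	wait for tsc_count_start == num_booting_cpus() - 1;
 *	atomic_set(&tsc_count_stop, 0);
 *	atomic_inc(&tsc_count_start);	// releases the APs
 *	rdtscll(tsc_values[cpu]);	// everyone samples close together
 *	wait for tsc_count_stop == num_booting_cpus() - 1;
 *	atomic_set(&tsc_count_start, 0);
 *	atomic_inc(&tsc_count_stop);
 */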
244
245static atomic_t init_deasserted;
246
247static void __init smp_callin(void)
248{
249 int cpuid, phys_id;
250 unsigned long timeout;
251
252 /*
253 	 * If woken up by an INIT in an 82489DX configuration
254 * we may get here before an INIT-deassert IPI reaches
255 * our local APIC. We have to wait for the IPI or we'll
256 * lock up on an APIC access.
257 */
258 while (!atomic_read(&init_deasserted));
259
260 /*
261 * (This works even if the APIC is not enabled.)
262 */
263 phys_id = GET_APIC_ID(apic_read(APIC_ID));
264 cpuid = smp_processor_id();
265 if (cpu_isset(cpuid, cpu_callin_map)) {
266 panic("smp_callin: phys CPU#%d, CPU#%d already present??\n",
267 phys_id, cpuid);
268 }
269 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
270
271 /*
272 * STARTUP IPIs are fragile beasts as they might sometimes
273 * trigger some glue motherboard logic. Complete APIC bus
274 * silence for 1 second, this overestimates the time the
275 * boot CPU is spending to send the up to 2 STARTUP IPIs
276 * by a factor of two. This should be enough.
277 */
278
279 /*
280 * Waiting 2s total for startup (udelay is not yet working)
281 */
282 timeout = jiffies + 2*HZ;
283 while (time_before(jiffies, timeout)) {
284 /*
285 		 * Has the boot CPU finished its STARTUP sequence?
286 */
287 if (cpu_isset(cpuid, cpu_callout_map))
288 break;
289 rep_nop();
290 }
291
292 if (!time_before(jiffies, timeout)) {
293 panic("smp_callin: CPU%d started up but did not get a callout!\n",
294 cpuid);
295 }
296
297 /*
298 * the boot CPU has finished the init stage and is spinning
299 * on callin_map until we finish. We are free to set up this
300 * CPU, first the APIC. (this is probably redundant on most
301 * boards)
302 */
303
304 Dprintk("CALLIN, before setup_local_APIC().\n");
305 setup_local_APIC();
306
307 local_irq_enable();
308
309 /*
310 * Get our bogomips.
311 */
312 calibrate_delay();
313 Dprintk("Stack at about %p\n",&cpuid);
314
315 disable_APIC_timer();
316
317 /*
318 * Save our processor parameters
319 */
320 smp_store_cpu_info(cpuid);
321
322 local_irq_disable();
323
324 /*
325 * Allow the master to continue.
326 */
327 cpu_set(cpuid, cpu_callin_map);
328
329 /*
330 * Synchronize the TSC with the BP
331 */
332 if (cpu_has_tsc)
333 synchronize_tsc_ap();
334}
335
336static int cpucount;
337
338/*
339 * Activate a secondary processor.
340 */
341void __init start_secondary(void)
342{
343 /*
344 	 * Don't put anything before smp_callin(); SMP
345 	 * booting is so fragile that we want to limit
346 	 * the work done here to the bare minimum.
347 */
348 cpu_init();
349 smp_callin();
350
351 	/* otherwise gcc will move up smp_processor_id() before cpu_init() */
352 barrier();
353
354 Dprintk("cpu %d: waiting for commence\n", smp_processor_id());
355 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
356 rep_nop();
357
358 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
359 setup_secondary_APIC_clock();
360
361 Dprintk("cpu %d: enabling apic timer\n", smp_processor_id());
362
363 if (nmi_watchdog == NMI_IO_APIC) {
364 disable_8259A_irq(0);
365 enable_NMI_through_LVT0(NULL);
366 enable_8259A_irq(0);
367 }
368
369
370 enable_APIC_timer();
371
372 /*
373 * low-memory mappings have been cleared, flush them from
374 * the local TLBs too.
375 */
376 local_flush_tlb();
377
378 	Dprintk("cpu %d: setting cpu_online_map\n", smp_processor_id());
379 cpu_set(smp_processor_id(), cpu_online_map);
380 wmb();
381
382 cpu_idle();
383}
384
385extern volatile unsigned long init_rsp;
386extern void (*initial_code)(void);
387
388#if APIC_DEBUG
389static inline void inquire_remote_apic(int apicid)
390{
391 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
392 char *names[] = { "ID", "VERSION", "SPIV" };
393 int timeout, status;
394
395 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
396
397 for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
398 printk("... APIC #%d %s: ", apicid, names[i]);
399
400 /*
401 * Wait for idle.
402 */
403 apic_wait_icr_idle();
404
405 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
406 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
407
408 timeout = 0;
409 do {
410 udelay(100);
411 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
412 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
413
414 switch (status) {
415 case APIC_ICR_RR_VALID:
416 status = apic_read(APIC_RRR);
417 printk("%08x\n", status);
418 break;
419 default:
420 printk("failed\n");
421 }
422 }
423}
424#endif
425
426static int __init wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
427{
428 unsigned long send_status = 0, accept_status = 0;
429 int maxlvt, timeout, num_starts, j;
430
431 Dprintk("Asserting INIT.\n");
432
433 /*
434 * Turn INIT on target chip
435 */
436 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
437
438 /*
439 * Send IPI
440 */
441 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
442 | APIC_DM_INIT);
443
444 Dprintk("Waiting for send to finish...\n");
445 timeout = 0;
446 do {
447 Dprintk("+");
448 udelay(100);
449 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
450 } while (send_status && (timeout++ < 1000));
451
452 mdelay(10);
453
454 Dprintk("Deasserting INIT.\n");
455
456 /* Target chip */
457 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
458
459 /* Send IPI */
460 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
461
462 Dprintk("Waiting for send to finish...\n");
463 timeout = 0;
464 do {
465 Dprintk("+");
466 udelay(100);
467 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
468 } while (send_status && (timeout++ < 1000));
469
470 atomic_set(&init_deasserted, 1);
471
472 /*
473 * Should we send STARTUP IPIs ?
474 *
475 * Determine this based on the APIC version.
476 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
477 */
478 if (APIC_INTEGRATED(apic_version[phys_apicid]))
479 num_starts = 2;
480 else
481 num_starts = 0;
482
483 /*
484 * Run STARTUP IPI loop.
485 */
486 Dprintk("#startup loops: %d.\n", num_starts);
487
488 maxlvt = get_maxlvt();
489
490 for (j = 1; j <= num_starts; j++) {
491 Dprintk("Sending STARTUP #%d.\n",j);
492 apic_read_around(APIC_SPIV);
493 apic_write(APIC_ESR, 0);
494 apic_read(APIC_ESR);
495 Dprintk("After apic_write.\n");
496
497 /*
498 * STARTUP IPI
499 */
500
501 /* Target chip */
502 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
503
504 /* Boot on the stack */
505 /* Kick the second */
506 apic_write_around(APIC_ICR, APIC_DM_STARTUP
507 | (start_rip >> 12));
508
509 /*
510 * Give the other CPU some time to accept the IPI.
511 */
512 udelay(300);
513
514 Dprintk("Startup point 1.\n");
515
516 Dprintk("Waiting for send to finish...\n");
517 timeout = 0;
518 do {
519 Dprintk("+");
520 udelay(100);
521 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
522 } while (send_status && (timeout++ < 1000));
523
524 /*
525 * Give the other CPU some time to accept the IPI.
526 */
527 udelay(200);
528 /*
529 * Due to the Pentium erratum 3AP.
530 */
531 if (maxlvt > 3) {
532 apic_read_around(APIC_SPIV);
533 apic_write(APIC_ESR, 0);
534 }
535 accept_status = (apic_read(APIC_ESR) & 0xEF);
536 if (send_status || accept_status)
537 break;
538 }
539 Dprintk("After Startup.\n");
540
541 if (send_status)
542 printk(KERN_ERR "APIC never delivered???\n");
543 if (accept_status)
544 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
545
546 return (send_status | accept_status);
547}
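/*
 * In short, the start-up algorithm implemented above: assert INIT
 * (level triggered), wait 10 ms, de-assert INIT, then -- for integrated
 * APICs only -- send up to two STARTUP IPIs whose vector field carries
 * start_rip >> 12, the 4KB-aligned real-mode page where the AP begins
 * executing.
 */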
548
549static void __init do_boot_cpu (int apicid)
550{
551 struct task_struct *idle;
552 unsigned long boot_error;
553 int timeout, cpu;
554 unsigned long start_rip;
555
556 cpu = ++cpucount;
557 /*
558 	 * We can't use kernel_thread() since we must not
559 	 * reschedule the child.
560 */
561 idle = fork_idle(cpu);
562 if (IS_ERR(idle))
563 panic("failed fork for CPU %d", cpu);
564 x86_cpu_to_apicid[cpu] = apicid;
565
566 cpu_pda[cpu].pcurrent = idle;
567
568 start_rip = setup_trampoline();
569
570 init_rsp = idle->thread.rsp;
571 per_cpu(init_tss,cpu).rsp0 = init_rsp;
572 initial_code = start_secondary;
573 clear_ti_thread_flag(idle->thread_info, TIF_FORK);
574
575 printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
576 start_rip, init_rsp);
577
578 /*
579 * This grunge runs the startup process for
580 * the targeted processor.
581 */
582
583 atomic_set(&init_deasserted, 0);
584
585 Dprintk("Setting warm reset code and vector.\n");
586
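	/*
	 * CMOS shutdown-status byte 0xf = 0xa requests a warm reset that
	 * resumes via a far jump through the 40:67 vector, whose segment
	 * (0x469) and offset (0x467) are pointed at the trampoline below.
	 */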
587 CMOS_WRITE(0xa, 0xf);
588 local_flush_tlb();
589 Dprintk("1.\n");
590 *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4;
591 Dprintk("2.\n");
592 *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf;
593 Dprintk("3.\n");
594
595 /*
596 * Be paranoid about clearing APIC errors.
597 */
598 if (APIC_INTEGRATED(apic_version[apicid])) {
599 apic_read_around(APIC_SPIV);
600 apic_write(APIC_ESR, 0);
601 apic_read(APIC_ESR);
602 }
603
604 /*
605 * Status is now clean
606 */
607 boot_error = 0;
608
609 /*
610 * Starting actual IPI sequence...
611 */
612 boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
613
614 if (!boot_error) {
615 /*
616 * allow APs to start initializing.
617 */
618 Dprintk("Before Callout %d.\n", cpu);
619 cpu_set(cpu, cpu_callout_map);
620 Dprintk("After Callout %d.\n", cpu);
621
622 /*
623 * Wait 5s total for a response
624 */
625 for (timeout = 0; timeout < 50000; timeout++) {
626 if (cpu_isset(cpu, cpu_callin_map))
627 break; /* It has booted */
628 udelay(100);
629 }
630
631 if (cpu_isset(cpu, cpu_callin_map)) {
632 /* number CPUs logically, starting from 1 (BSP is 0) */
633 Dprintk("OK.\n");
634 print_cpu_info(&cpu_data[cpu]);
635 Dprintk("CPU has booted.\n");
636 } else {
637 boot_error = 1;
638 if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE))
639 == 0xA5)
640 /* trampoline started but...? */
641 printk("Stuck ??\n");
642 else
643 /* trampoline code not run */
644 printk("Not responding.\n");
645#if APIC_DEBUG
646 inquire_remote_apic(apicid);
647#endif
648 }
649 }
650 if (boot_error) {
651 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
652 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
653 cpucount--;
654 x86_cpu_to_apicid[cpu] = BAD_APICID;
655 x86_cpu_to_log_apicid[cpu] = BAD_APICID;
656 }
657}
658
659static void smp_tune_scheduling (void)
660{
661 int cachesize; /* kB */
662 unsigned long bandwidth = 1000; /* MB/s */
663 /*
664 	 * Rough estimate for SMP scheduling: this is the number of
665 * cycles it takes for a fully memory-limited process to flush
666 * the SMP-local cache.
667 *
668 	 * (For a P5 this pretty much means we will almost always choose
669 	 * another idle CPU at wakeup time; this is due to the small
670 	 * L1 cache. On PIIs it's around 50-100 usecs, depending on
671 	 * the cache size.)
672 */
673
674 if (!cpu_khz) {
675 return;
676 } else {
677 cachesize = boot_cpu_data.x86_cache_size;
678 if (cachesize == -1) {
679 cachesize = 16; /* Pentiums, 2x8kB cache */
680 bandwidth = 100;
681 }
682 }
683}
684
685/*
686 * Cycle through the processors sending APIC IPIs to boot each.
687 */
688
689static void __init smp_boot_cpus(unsigned int max_cpus)
690{
691 unsigned apicid, cpu, bit, kicked;
692
693 nmi_watchdog_default();
694
695 /*
696 * Setup boot CPU information
697 */
698 smp_store_cpu_info(0); /* Final full version of the data */
699 printk(KERN_INFO "CPU%d: ", 0);
700 print_cpu_info(&cpu_data[0]);
701
702 current_thread_info()->cpu = 0;
703 smp_tune_scheduling();
704
705 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
706 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
707 hard_smp_processor_id());
708 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
709 }
710
711 /*
712 * If we couldn't find an SMP configuration at boot time,
713 * get out of here now!
714 */
715 if (!smp_found_config) {
716 printk(KERN_NOTICE "SMP motherboard not detected.\n");
717 io_apic_irqs = 0;
718 cpu_online_map = cpumask_of_cpu(0);
719 cpu_set(0, cpu_sibling_map[0]);
720 phys_cpu_present_map = physid_mask_of_physid(0);
721 if (APIC_init_uniprocessor())
722 printk(KERN_NOTICE "Local APIC not detected."
723 " Using dummy APIC emulation.\n");
724 goto smp_done;
725 }
726
727 /*
728 * Should not be necessary because the MP table should list the boot
729 * CPU too, but we do it for the sake of robustness anyway.
730 */
731 if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) {
732 printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n",
733 boot_cpu_id);
734 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
735 }
736
737 /*
738 * If we couldn't find a local APIC, then get out of here now!
739 */
740 if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) {
741 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
742 boot_cpu_id);
743 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
744 io_apic_irqs = 0;
745 cpu_online_map = cpumask_of_cpu(0);
746 cpu_set(0, cpu_sibling_map[0]);
747 phys_cpu_present_map = physid_mask_of_physid(0);
748 disable_apic = 1;
749 goto smp_done;
750 }
751
752 verify_local_APIC();
753
754 /*
755 * If SMP should be disabled, then really disable it!
756 */
757 if (!max_cpus) {
758 smp_found_config = 0;
759 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
760 io_apic_irqs = 0;
761 cpu_online_map = cpumask_of_cpu(0);
762 cpu_set(0, cpu_sibling_map[0]);
763 phys_cpu_present_map = physid_mask_of_physid(0);
764 disable_apic = 1;
765 goto smp_done;
766 }
767
768 connect_bsp_APIC();
769 setup_local_APIC();
770
771 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id)
772 BUG();
773
774 x86_cpu_to_apicid[0] = boot_cpu_id;
775
776 /*
777 * Now scan the CPU present map and fire up the other CPUs.
778 */
779 Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
780
781 kicked = 1;
782 for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
783 apicid = cpu_present_to_apicid(bit);
784 /*
785 * Don't even attempt to start the boot CPU!
786 */
787 if (apicid == boot_cpu_id || (apicid == BAD_APICID))
788 continue;
789
790 if (!physid_isset(apicid, phys_cpu_present_map))
791 continue;
792 if ((max_cpus >= 0) && (max_cpus <= cpucount+1))
793 continue;
794
795 do_boot_cpu(apicid);
796 ++kicked;
797 }
798
799 /*
800 * Cleanup possible dangling ends...
801 */
802 {
803 /*
804 * Install writable page 0 entry to set BIOS data area.
805 */
806 local_flush_tlb();
807
808 /*
809 * Paranoid: Set warm reset code and vector here back
810 * to default values.
811 */
812 CMOS_WRITE(0, 0xf);
813
814 *((volatile int *) phys_to_virt(0x467)) = 0;
815 }
816
817 /*
818 * Allow the user to impress friends.
819 */
820
821 Dprintk("Before bogomips.\n");
822 if (!cpucount) {
823 printk(KERN_INFO "Only one processor found.\n");
824 } else {
825 unsigned long bogosum = 0;
826 for (cpu = 0; cpu < NR_CPUS; cpu++)
827 if (cpu_isset(cpu, cpu_callout_map))
828 bogosum += cpu_data[cpu].loops_per_jiffy;
829 printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
830 cpucount+1,
831 bogosum/(500000/HZ),
832 (bogosum/(5000/HZ))%100);
833 Dprintk("Before bogocount - setting activated=1.\n");
834 }
835
836 /*
837 * Construct cpu_sibling_map[], so that we can tell the
838 * sibling CPU efficiently.
839 */
840 for (cpu = 0; cpu < NR_CPUS; cpu++)
841 cpus_clear(cpu_sibling_map[cpu]);
842
843 for (cpu = 0; cpu < NR_CPUS; cpu++) {
844 int siblings = 0;
845 int i;
846 if (!cpu_isset(cpu, cpu_callout_map))
847 continue;
848
849 if (smp_num_siblings > 1) {
850 for (i = 0; i < NR_CPUS; i++) {
851 if (!cpu_isset(i, cpu_callout_map))
852 continue;
853 if (phys_proc_id[cpu] == phys_proc_id[i]) {
854 siblings++;
855 cpu_set(i, cpu_sibling_map[cpu]);
856 }
857 }
858 } else {
859 siblings++;
860 cpu_set(cpu, cpu_sibling_map[cpu]);
861 }
862
863 if (siblings != smp_num_siblings) {
864 printk(KERN_WARNING
865 "WARNING: %d siblings found for CPU%d, should be %d\n",
866 siblings, cpu, smp_num_siblings);
867 smp_num_siblings = siblings;
868 }
869 }
870
871 Dprintk("Boot done.\n");
872
873 /*
874 * Here we can be sure that there is an IO-APIC in the system. Let's
875 * go and set it up:
876 */
877 if (!skip_ioapic_setup && nr_ioapics)
878 setup_IO_APIC();
879 else
880 nr_ioapics = 0;
881
882 setup_boot_APIC_clock();
883
884 /*
885 * Synchronize the TSC with the AP
886 */
887 if (cpu_has_tsc && cpucount)
888 synchronize_tsc_bp();
889
890 smp_done:
891 time_init_smp();
892}
893
894/* These are wrappers to interface to the new boot process. Someone
895 who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
896void __init smp_prepare_cpus(unsigned int max_cpus)
897{
898 smp_boot_cpus(max_cpus);
899}
900
901void __devinit smp_prepare_boot_cpu(void)
902{
903 cpu_set(smp_processor_id(), cpu_online_map);
904 cpu_set(smp_processor_id(), cpu_callout_map);
905}
906
907int __devinit __cpu_up(unsigned int cpu)
908{
909 /* This only works at boot for x86. See "rewrite" above. */
910 if (cpu_isset(cpu, smp_commenced_mask)) {
911 local_irq_enable();
912 return -ENOSYS;
913 }
914
915 /* In case one didn't come up */
916 if (!cpu_isset(cpu, cpu_callin_map)) {
917 local_irq_enable();
918 return -EIO;
919 }
920 local_irq_enable();
921
922 /* Unleash the CPU! */
923 Dprintk("waiting for cpu %d\n", cpu);
924
925 cpu_set(cpu, smp_commenced_mask);
926 while (!cpu_isset(cpu, cpu_online_map))
927 mb();
928 return 0;
929}
930
931void __init smp_cpus_done(unsigned int max_cpus)
932{
933#ifdef CONFIG_X86_IO_APIC
934 setup_ioapic_dest();
935#endif
936 zap_low_mappings();
937}
938
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
new file mode 100644
index 000000000000..ebaa1e37d657
--- /dev/null
+++ b/arch/x86_64/kernel/suspend.c
@@ -0,0 +1,157 @@
1/*
2 * Suspend support specific for x86-64.
3 *
4 * Distribute under GPLv2
5 *
6 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
7 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
8 */
9
10#include <linux/config.h>
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/types.h>
15#include <linux/spinlock.h>
16#include <linux/poll.h>
17#include <linux/delay.h>
18#include <linux/sysrq.h>
19#include <linux/proc_fs.h>
20#include <linux/irq.h>
21#include <linux/pm.h>
22#include <linux/device.h>
23#include <linux/suspend.h>
24#include <asm/uaccess.h>
25#include <asm/acpi.h>
26#include <asm/tlbflush.h>
27#include <asm/io.h>
28#include <asm/proto.h>
29
30struct saved_context saved_context;
31
32unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx;
33unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi;
34unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11;
35unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15;
36unsigned long saved_context_eflags;
37
38void __save_processor_state(struct saved_context *ctxt)
39{
40 kernel_fpu_begin();
41
42 /*
43 * descriptor tables
44 */
45 asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
46 asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
47 asm volatile ("sldt %0" : "=m" (ctxt->ldt));
48 asm volatile ("str %0" : "=m" (ctxt->tr));
49
50 /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
51 /* EFER should be constant for kernel version, no need to handle it. */
52 /*
53 * segment registers
54 */
55 asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
56 asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
57 asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
58 asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs));
59 asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss));
60
61 rdmsrl(MSR_FS_BASE, ctxt->fs_base);
62 rdmsrl(MSR_GS_BASE, ctxt->gs_base);
63 rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
64
65 /*
66 * control registers
67 */
68 asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0));
69 asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2));
70 asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3));
71 asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4));
72}
73
74void save_processor_state(void)
75{
76 __save_processor_state(&saved_context);
77}
78
79static void
80do_fpu_end(void)
81{
82 /* restore FPU regs if necessary */
83 /* Do it out of line so that gcc does not move cr0 load to some stupid place */
84 kernel_fpu_end();
85 mxcsr_feature_mask_init();
86}
87
88void __restore_processor_state(struct saved_context *ctxt)
89{
90 /*
91 * control registers
92 */
93 asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4));
94 asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3));
95 asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2));
96 asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0));
97
98 /*
99 * segment registers
100 */
101 asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
102 asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
103 asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
104 load_gs_index(ctxt->gs);
105 asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss));
106
107 wrmsrl(MSR_FS_BASE, ctxt->fs_base);
108 wrmsrl(MSR_GS_BASE, ctxt->gs_base);
109 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
110
111 /*
112 * now restore the descriptor tables to their proper values
113 	 * ltr is done in fix_processor_context().
114 */
115 asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
116 asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
117 asm volatile ("lldt %0" :: "m" (ctxt->ldt));
118
119 fix_processor_context();
120
121 do_fpu_end();
122}
123
124void restore_processor_state(void)
125{
126 __restore_processor_state(&saved_context);
127}
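/*
 * A hedged sketch of the expected pairing; the real call sites live in
 * the swsusp/ACPI sleep code, and do_suspend_step() is a placeholder
 * name:
 *
 *	save_processor_state();
 *	error = do_suspend_step();
 *	restore_processor_state();
 *
 * The point is only that every save is matched by a restore on the
 * same CPU.
 */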
128
129void fix_processor_context(void)
130{
131 int cpu = smp_processor_id();
132 struct tss_struct *t = &per_cpu(init_tss, cpu);
133
134 	set_tss_desc(cpu,t);	/* This just modifies memory; it should not be necessary. But it is, because 386 hardware has the concept of a busy TSS, or some similar stupidity. */
135
136 cpu_gdt_table[cpu][GDT_ENTRY_TSS].type = 9;
137
138 syscall_init(); /* This sets MSR_*STAR and related */
139 load_TR_desc(); /* This does ltr */
140 load_LDT(&current->active_mm->context); /* This does lldt */
141
142 /*
143 * Now maybe reload the debug registers
144 */
145 	if (current->thread.debugreg7) {
146 loaddebug(&current->thread, 0);
147 loaddebug(&current->thread, 1);
148 loaddebug(&current->thread, 2);
149 loaddebug(&current->thread, 3);
150 /* no 4 and 5 */
151 loaddebug(&current->thread, 6);
152 loaddebug(&current->thread, 7);
153 }
154
155}
156
157
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
new file mode 100644
index 000000000000..53f8e1659511
--- /dev/null
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -0,0 +1,104 @@
1/* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl>
2 *
3 * Distribute under GPLv2.
4 *
5 * swsusp_arch_resume may not use any stack, nor any variable that is
6 * not "NoSave" during copying pages:
7 *
8 * It's rewriting one kernel image with another. What is the stack in the "old"
9 * image could very well be a data page in the "new" image, and overwriting
10 * your own stack from under yourself is a bad idea.
11 */
12
13 .text
14#include <linux/linkage.h>
15#include <asm/segment.h>
16#include <asm/page.h>
17#include <asm/offset.h>
18
19ENTRY(swsusp_arch_suspend)
20
21 movq %rsp, saved_context_esp(%rip)
22 movq %rax, saved_context_eax(%rip)
23 movq %rbx, saved_context_ebx(%rip)
24 movq %rcx, saved_context_ecx(%rip)
25 movq %rdx, saved_context_edx(%rip)
26 movq %rbp, saved_context_ebp(%rip)
27 movq %rsi, saved_context_esi(%rip)
28 movq %rdi, saved_context_edi(%rip)
29 movq %r8, saved_context_r08(%rip)
30 movq %r9, saved_context_r09(%rip)
31 movq %r10, saved_context_r10(%rip)
32 movq %r11, saved_context_r11(%rip)
33 movq %r12, saved_context_r12(%rip)
34 movq %r13, saved_context_r13(%rip)
35 movq %r14, saved_context_r14(%rip)
36 movq %r15, saved_context_r15(%rip)
37 pushfq ; popq saved_context_eflags(%rip)
38
39 call swsusp_save
40 ret
41
42ENTRY(swsusp_arch_resume)
43 /* set up cr3 */
44 leaq init_level4_pgt(%rip),%rax
45 subq $__START_KERNEL_map,%rax
46 movq %rax,%cr3
47
48 movq mmu_cr4_features(%rip), %rax
49 movq %rax, %rdx
50 andq $~(1<<7), %rdx # PGE
51 movq %rdx, %cr4; # turn off PGE
52 movq %cr3, %rcx; # flush TLB
53 movq %rcx, %cr3;
54 movq %rax, %cr4; # turn PGE back on
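	# Clearing CR4.PGE and then reloading CR3 invalidates all TLB
	# entries, including global ones; restoring CR4 re-enables global
	# pages. The same sequence is repeated after the copy loop below.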
55
56 movq pagedir_nosave(%rip), %rdx
57loop:
58 testq %rdx, %rdx
59 jz done
60
61 /* get addresses from the pbe and copy the page */
62 movq pbe_address(%rdx), %rsi
63 movq pbe_orig_address(%rdx), %rdi
64 movq $512, %rcx
65 rep
66 movsq
67
68 /* progress to the next pbe */
69 movq pbe_next(%rdx), %rdx
70 jmp loop
71done:
72 /* Flush TLB, including "global" things (vmalloc) */
73 movq mmu_cr4_features(%rip), %rax
74 movq %rax, %rdx
75 andq $~(1<<7), %rdx; # PGE
76 movq %rdx, %cr4; # turn off PGE
77 movq %cr3, %rcx; # flush TLB
78 movq %rcx, %cr3
79 movq %rax, %cr4; # turn PGE back on
80
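	# 24 (0x18) is assumed here to be the kernel data segment selector
	# (__KERNEL_DS) in this GDT layout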
81 movl $24, %eax
82 movl %eax, %ds
83
84 movq saved_context_esp(%rip), %rsp
85 movq saved_context_ebp(%rip), %rbp
86 /* Don't restore %rax, it must be 0 anyway */
87 movq saved_context_ebx(%rip), %rbx
88 movq saved_context_ecx(%rip), %rcx
89 movq saved_context_edx(%rip), %rdx
90 movq saved_context_esi(%rip), %rsi
91 movq saved_context_edi(%rip), %rdi
92 movq saved_context_r08(%rip), %r8
93 movq saved_context_r09(%rip), %r9
94 movq saved_context_r10(%rip), %r10
95 movq saved_context_r11(%rip), %r11
96 movq saved_context_r12(%rip), %r12
97 movq saved_context_r13(%rip), %r13
98 movq saved_context_r14(%rip), %r14
99 movq saved_context_r15(%rip), %r15
100 pushq saved_context_eflags(%rip) ; popfq
101
102 xorq %rax, %rax
103
104 ret
diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c
new file mode 100644
index 000000000000..477d8be57d64
--- /dev/null
+++ b/arch/x86_64/kernel/sys_x86_64.c
@@ -0,0 +1,173 @@
1/*
2 * linux/arch/x86_64/kernel/sys_x86_64.c
3 */
4
5#include <linux/errno.h>
6#include <linux/sched.h>
7#include <linux/syscalls.h>
8#include <linux/mm.h>
9#include <linux/smp.h>
10#include <linux/smp_lock.h>
11#include <linux/sem.h>
12#include <linux/msg.h>
13#include <linux/shm.h>
14#include <linux/stat.h>
15#include <linux/mman.h>
16#include <linux/file.h>
17#include <linux/utsname.h>
18#include <linux/personality.h>
19
20#include <asm/uaccess.h>
21#include <asm/ia32.h>
22
23/*
24 * sys_pipe() is the normal C calling standard for creating
25 * a pipe. It's not the way Unix traditionally does this, though.
26 */
27asmlinkage long sys_pipe(int __user *fildes)
28{
29 int fd[2];
30 int error;
31
32 error = do_pipe(fd);
33 if (!error) {
34 if (copy_to_user(fildes, fd, 2*sizeof(int)))
35 error = -EFAULT;
36 }
37 return error;
38}
39
40asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
41 unsigned long fd, unsigned long off)
42{
43 long error;
44 struct file * file;
45
46 error = -EINVAL;
47 if (off & ~PAGE_MASK)
48 goto out;
49
50 error = -EBADF;
51 file = NULL;
52 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
53 if (!(flags & MAP_ANONYMOUS)) {
54 file = fget(fd);
55 if (!file)
56 goto out;
57 }
58 down_write(&current->mm->mmap_sem);
59 error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
60 up_write(&current->mm->mmap_sem);
61
62 if (file)
63 fput(file);
64out:
65 return error;
66}
67
68static void find_start_end(unsigned long flags, unsigned long *begin,
69 unsigned long *end)
70{
71#ifdef CONFIG_IA32_EMULATION
72 if (test_thread_flag(TIF_IA32)) {
73 *begin = TASK_UNMAPPED_32;
74 *end = IA32_PAGE_OFFSET;
75 } else
76#endif
77 if (flags & MAP_32BIT) {
78 		/* This is usually needed to map code in the small
79 		   model, so it needs to be in the first 31 bits. Limit
80 it to that. This means we need to move the
81 unmapped base down for this case. This can give
82 conflicts with the heap, but we assume that glibc
83 malloc knows how to fall back to mmap. Give it 1GB
84 of playground for now. -AK */
85 *begin = 0x40000000;
86 *end = 0x80000000;
87 } else {
88 *begin = TASK_UNMAPPED_64;
89 *end = TASK_SIZE;
90 }
91}
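/*
 * A hypothetical userspace sketch of why MAP_32BIT exists: small-model
 * JITs request the low window explicitly, e.g.
 *
 *	p = mmap(NULL, len, PROT_READ|PROT_WRITE|PROT_EXEC,
 *		 MAP_PRIVATE|MAP_ANONYMOUS|MAP_32BIT, -1, 0);
 *
 * so that 32-bit relative branches into the region remain reachable.
 */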
92
93unsigned long
94arch_get_unmapped_area(struct file *filp, unsigned long addr,
95 unsigned long len, unsigned long pgoff, unsigned long flags)
96{
97 struct mm_struct *mm = current->mm;
98 struct vm_area_struct *vma;
99 unsigned long start_addr;
100 unsigned long begin, end;
101
102 find_start_end(flags, &begin, &end);
103
104 if (len > end)
105 return -ENOMEM;
106
107 if (addr) {
108 addr = PAGE_ALIGN(addr);
109 vma = find_vma(mm, addr);
110 if (end - len >= addr &&
111 (!vma || addr + len <= vma->vm_start))
112 return addr;
113 }
114 addr = mm->free_area_cache;
115 if (addr < begin)
116 addr = begin;
117 start_addr = addr;
118
119full_search:
120 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
121 /* At this point: (!vma || addr < vma->vm_end). */
122 if (end - len < addr) {
123 /*
124 * Start a new search - just in case we missed
125 * some holes.
126 */
127 if (start_addr != begin) {
128 start_addr = addr = begin;
129 goto full_search;
130 }
131 return -ENOMEM;
132 }
133 if (!vma || addr + len <= vma->vm_start) {
134 /*
135 * Remember the place where we stopped the search:
136 */
137 mm->free_area_cache = addr + len;
138 return addr;
139 }
140 addr = vma->vm_end;
141 }
142}
143
144asmlinkage long sys_uname(struct new_utsname __user * name)
145{
146 int err;
147 down_read(&uts_sem);
148 err = copy_to_user(name, &system_utsname, sizeof (*name));
149 up_read(&uts_sem);
150 if (personality(current->personality) == PER_LINUX32)
151 err |= copy_to_user(&name->machine, "i686", 5);
152 return err ? -EFAULT : 0;
153}
154
155asmlinkage long wrap_sys_shmat(int shmid, char __user *shmaddr, int shmflg)
156{
157 unsigned long raddr;
158 return do_shmat(shmid,shmaddr,shmflg,&raddr) ?: (long)raddr;
159}
160
161asmlinkage long sys_time64(long __user * tloc)
162{
163 struct timeval now;
164 int i;
165
166 do_gettimeofday(&now);
167 i = now.tv_sec;
168 if (tloc) {
169 if (put_user(i,tloc))
170 i = -EFAULT;
171 }
172 return i;
173}
diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c
new file mode 100644
index 000000000000..e263685f864c
--- /dev/null
+++ b/arch/x86_64/kernel/syscall.c
@@ -0,0 +1,26 @@
1/* System call table for x86-64. */
2
3#include <linux/linkage.h>
4#include <linux/sys.h>
5#include <linux/cache.h>
6#include <linux/config.h>
7
8#define __NO_STUBS
9
10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
11#undef _ASM_X86_64_UNISTD_H_
12#include <asm-x86_64/unistd.h>
13
14#undef __SYSCALL
15#define __SYSCALL(nr, sym) [ nr ] = sym,
16#undef _ASM_X86_64_UNISTD_H_
17
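/*
 * unistd.h expands every syscall as __SYSCALL(nr, sym). Including it
 * twice with two different macro definitions first declares all the
 * handlers, then emits "[nr] = sym," designated initializers for the
 * table below; undefining the include guard makes the second pass
 * possible.
 */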
18typedef void (*sys_call_ptr_t)(void);
19
20extern void sys_ni_syscall(void);
21
22sys_call_ptr_t sys_call_table[__NR_syscall_max+1] __cacheline_aligned = {
23 	/* Smells like a compiler bug -- it doesn't work when the & below is removed. */
24 [0 ... __NR_syscall_max] = &sys_ni_syscall,
25#include <asm-x86_64/unistd.h>
26};
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
new file mode 100644
index 000000000000..3bafe438fa75
--- /dev/null
+++ b/arch/x86_64/kernel/time.c
@@ -0,0 +1,1262 @@
1/*
2 * linux/arch/x86_64/kernel/time.c
3 *
4 * "High Precision Event Timer" based timekeeping.
5 *
6 * Copyright (c) 1991,1992,1995 Linus Torvalds
7 * Copyright (c) 1994 Alan Modra
8 * Copyright (c) 1995 Markus Kuhn
9 * Copyright (c) 1996 Ingo Molnar
10 * Copyright (c) 1998 Andrea Arcangeli
11 * Copyright (c) 2002 Vojtech Pavlik
12 * Copyright (c) 2003 Andi Kleen
13 * RTC support code taken from arch/i386/kernel/timers/time_hpet.c
14 */
15
16#include <linux/kernel.h>
17#include <linux/sched.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/mc146818rtc.h>
21#include <linux/irq.h>
22#include <linux/time.h>
23#include <linux/ioport.h>
24#include <linux/module.h>
25#include <linux/device.h>
26#include <linux/sysdev.h>
27#include <linux/bcd.h>
28#include <linux/kallsyms.h>
29#include <asm/8253pit.h>
30#include <asm/pgtable.h>
31#include <asm/vsyscall.h>
32#include <asm/timex.h>
33#include <asm/proto.h>
34#include <asm/hpet.h>
35#include <asm/sections.h>
36#include <linux/cpufreq.h>
37#include <linux/hpet.h>
38#ifdef CONFIG_X86_LOCAL_APIC
39#include <asm/apic.h>
40#endif
41
42u64 jiffies_64 = INITIAL_JIFFIES;
43
44EXPORT_SYMBOL(jiffies_64);
45
46#ifdef CONFIG_CPU_FREQ
47static void cpufreq_delayed_get(void);
48#endif
49extern void i8254_timer_resume(void);
50extern int using_apic_timer;
51
52DEFINE_SPINLOCK(rtc_lock);
53DEFINE_SPINLOCK(i8253_lock);
54
55static int nohpet __initdata = 0;
56static int notsc __initdata = 0;
57
58#undef HPET_HACK_ENABLE_DANGEROUS
59
60unsigned int cpu_khz; /* TSC clocks / usec, not used here */
61static unsigned long hpet_period; /* fsecs / HPET clock */
62unsigned long hpet_tick; /* HPET clocks / interrupt */
63unsigned long vxtime_hz = PIT_TICK_RATE;
64int report_lost_ticks; /* command line option */
65unsigned long long monotonic_base;
66
67struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
68
69volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
70unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
71struct timespec __xtime __section_xtime;
72struct timezone __sys_tz __section_sys_tz;
73
74static inline void rdtscll_sync(unsigned long *tsc)
75{
76#ifdef CONFIG_SMP
77 sync_core();
78#endif
79 rdtscll(*tsc);
80}
81
82/*
83 * do_gettimeoffset() returns microseconds since last timer interrupt was
84 * triggered by hardware. A memory read of HPET is slower than a register read
85 * of TSC, but much more reliable. It's also synchronized to the timer
86 * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a
87 * timer interrupt has happened already, but vxtime.trigger wasn't updated yet.
88 * This is not a problem, because jiffies hasn't been updated either. They are bound
89 * together by xtime_lock.
90 */
91
92static inline unsigned int do_gettimeoffset_tsc(void)
93{
94 unsigned long t;
95 unsigned long x;
96 rdtscll_sync(&t);
97 if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */
98 x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32;
99 return x;
100}
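/*
 * vxtime.tsc_quot is microseconds-per-TSC-tick in 32.32 fixed point.
 * Rough worked example, assuming a 2 GHz TSC: one tick is 0.0005 us,
 * so tsc_quot ~= 0.0005 * 2^32 ~= 2147484, and 2,000,000 elapsed ticks
 * give (2000000 * 2147484) >> 32 ~= 1000 us.
 */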
101
102static inline unsigned int do_gettimeoffset_hpet(void)
103{
104 return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32;
105}
106
107unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;
108
109/*
110 * This version of gettimeofday() has microsecond resolution and better than
111 * microsecond precision, as we're using at least a 10 MHz (usually 14.31818
112 * MHz) HPET timer.
113 */
114
115void do_gettimeofday(struct timeval *tv)
116{
117 unsigned long seq, t;
118 unsigned int sec, usec;
119
120 do {
121 seq = read_seqbegin(&xtime_lock);
122
123 sec = xtime.tv_sec;
124 usec = xtime.tv_nsec / 1000;
125
126 		/* i386 does some correction here to keep the clock
127 		   monotonic even when ntpd is fixing drift.
128 		   But that didn't work for me; there is a non-monotonic
129 		   clock with ntp anyway.
130 		   I dropped all corrections for now, until a real solution can
131 		   be found. Note that when you fix it here you need to do the same
132 in arch/x86_64/kernel/vsyscall.c and export all needed
133 variables in vmlinux.lds. -AK */
134
135 t = (jiffies - wall_jiffies) * (1000000L / HZ) +
136 do_gettimeoffset();
137 usec += t;
138
139 } while (read_seqretry(&xtime_lock, seq));
140
141 tv->tv_sec = sec + usec / 1000000;
142 tv->tv_usec = usec % 1000000;
143}
144
145EXPORT_SYMBOL(do_gettimeofday);
146
147/*
148 * settimeofday() first undoes the correction that gettimeofday would do
149 * on the time, and then saves it. This is ugly, but has been like this for
150 * ages already.
151 */
152
153int do_settimeofday(struct timespec *tv)
154{
155 time_t wtm_sec, sec = tv->tv_sec;
156 long wtm_nsec, nsec = tv->tv_nsec;
157
158 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
159 return -EINVAL;
160
161 write_seqlock_irq(&xtime_lock);
162
163 nsec -= do_gettimeoffset() * 1000 +
164 (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ);
165
166 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
167 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
168
169 set_normalized_timespec(&xtime, sec, nsec);
170 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
171
172 time_adjust = 0; /* stop active adjtime() */
173 time_status |= STA_UNSYNC;
174 time_maxerror = NTP_PHASE_LIMIT;
175 time_esterror = NTP_PHASE_LIMIT;
176
177 write_sequnlock_irq(&xtime_lock);
178 clock_was_set();
179 return 0;
180}
181
182EXPORT_SYMBOL(do_settimeofday);
183
184unsigned long profile_pc(struct pt_regs *regs)
185{
186 unsigned long pc = instruction_pointer(regs);
187
188 /* Assume the lock function has either no stack frame or only a single word.
189 This checks if the address on the stack looks like a kernel text address.
190 There is a small window for false hits, but in that case the tick
191 is just accounted to the spinlock function.
192 Better would be to write these functions in assembler again
193 and check exactly. */
194 if (in_lock_functions(pc)) {
195 char *v = *(char **)regs->rsp;
196 if ((v >= _stext && v <= _etext) ||
197 (v >= _sinittext && v <= _einittext) ||
198 (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
199 return (unsigned long)v;
200 return ((unsigned long *)regs->rsp)[1];
201 }
202 return pc;
203}
204EXPORT_SYMBOL(profile_pc);
205
206/*
207 * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
208 * ms after the second nowtime has started, because when nowtime is written
209 * into the registers of the CMOS clock, it will jump to the next second
210 * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data
211 * sheet for details.
212 */
213
214static void set_rtc_mmss(unsigned long nowtime)
215{
216 int real_seconds, real_minutes, cmos_minutes;
217 unsigned char control, freq_select;
218
219/*
220 * IRQs are disabled when we're called from the timer interrupt,
221 * no need for spin_lock_irqsave()
222 */
223
224 spin_lock(&rtc_lock);
225
226/*
227 * Tell the clock it's being set and stop it.
228 */
229
230 control = CMOS_READ(RTC_CONTROL);
231 CMOS_WRITE(control | RTC_SET, RTC_CONTROL);
232
233 freq_select = CMOS_READ(RTC_FREQ_SELECT);
234 CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT);
235
236 cmos_minutes = CMOS_READ(RTC_MINUTES);
237 BCD_TO_BIN(cmos_minutes);
238
239/*
240 * since we're only adjusting minutes and seconds, don't interfere with hour
241 * overflow. This avoids messing with unknown time zones but requires your RTC
242 * not to be off by more than 15 minutes. Since we're calling it only when
243 * our clock is externally synchronized using NTP, this shouldn't be a problem.
244 */
245
246 real_seconds = nowtime % 60;
247 real_minutes = nowtime / 60;
248 if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
249 real_minutes += 30; /* correct for half hour time zone */
250 real_minutes %= 60;
251
252#if 0
253 /* AMD 8111 is a really bad time keeper and hits this regularly.
254 It probably was an attempt to avoid screwing up DST, but ignore
255 that for now. */
256 if (abs(real_minutes - cmos_minutes) >= 30) {
257 printk(KERN_WARNING "time.c: can't update CMOS clock "
258 "from %d to %d\n", cmos_minutes, real_minutes);
259 } else
260#endif
261
262 {
263 BIN_TO_BCD(real_seconds);
264 BIN_TO_BCD(real_minutes);
265 CMOS_WRITE(real_seconds, RTC_SECONDS);
266 CMOS_WRITE(real_minutes, RTC_MINUTES);
267 }
268
269/*
270 * The following flags have to be released exactly in this order, otherwise the
271 * DS12887 (popular MC146818A clone with integrated battery and quartz) will
272 * not reset the oscillator and will not update precisely 500 ms later. You
273 * won't find this mentioned in the Dallas Semiconductor data sheets, but who
274 * believes data sheets anyway ... -- Markus Kuhn
275 */
276
277 CMOS_WRITE(control, RTC_CONTROL);
278 CMOS_WRITE(freq_select, RTC_FREQ_SELECT);
279
280 spin_unlock(&rtc_lock);
281}
282
283
284/* monotonic_clock(): returns # of nanoseconds passed since time_init()
285 * Note: This function is required to return accurate
286 * time even in the absence of multiple timer ticks.
287 */
288unsigned long long monotonic_clock(void)
289{
290 unsigned long seq;
291 u32 last_offset, this_offset, offset;
292 unsigned long long base;
293
294 if (vxtime.mode == VXTIME_HPET) {
295 do {
296 seq = read_seqbegin(&xtime_lock);
297
298 last_offset = vxtime.last;
299 base = monotonic_base;
300 this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
301
302 } while (read_seqretry(&xtime_lock, seq));
303 offset = (this_offset - last_offset);
304 		offset *= (NSEC_PER_SEC/HZ) / hpet_tick;
305 		return base + offset;
306 	} else {
307 do {
308 seq = read_seqbegin(&xtime_lock);
309
310 last_offset = vxtime.last_tsc;
311 base = monotonic_base;
312 } while (read_seqretry(&xtime_lock, seq));
313 sync_core();
314 rdtscll(this_offset);
315 offset = (this_offset - last_offset)*1000/cpu_khz;
316 return base + offset;
317 }
318
319
320}
321EXPORT_SYMBOL(monotonic_clock);
322
323static noinline void handle_lost_ticks(int lost, struct pt_regs *regs)
324{
325 static long lost_count;
326 static int warned;
327
328 if (report_lost_ticks) {
329 printk(KERN_WARNING "time.c: Lost %d timer "
330 "tick(s)! ", lost);
331 print_symbol("rip %s)\n", regs->rip);
332 }
333
334 if (lost_count == 1000 && !warned) {
335 printk(KERN_WARNING
336 "warning: many lost ticks.\n"
337 		       KERN_WARNING "Your time source seems to be unstable or "
338 		       "some driver is hogging interrupts\n");
339 print_symbol("rip %s\n", regs->rip);
340 if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) {
341 printk(KERN_WARNING "Falling back to HPET\n");
342 vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
343 vxtime.mode = VXTIME_HPET;
344 do_gettimeoffset = do_gettimeoffset_hpet;
345 }
346 /* else should fall back to PIT, but code missing. */
347 warned = 1;
348 } else
349 lost_count++;
350
351#ifdef CONFIG_CPU_FREQ
352 /* In some cases the CPU can change frequency without us noticing
353 	   (like going into thermal throttle).
354 	   Give cpufreq a chance to catch up. */
355 if ((lost_count+1) % 25 == 0) {
356 cpufreq_delayed_get();
357 }
358#endif
359}
360
361static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
362{
363 static unsigned long rtc_update = 0;
364 unsigned long tsc;
365 int delay, offset = 0, lost = 0;
366
367/*
368 * Here we are in the timer irq handler. We have irqs locally disabled (so we
369 * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
370 * on the other CPU, so we need a lock. We also need to lock the vsyscall
371 * variables, because both do_timer() and us change them -arca+vojtech
372 * variables, because both do_timer() and we change them. -arca+vojtech
373
374 write_seqlock(&xtime_lock);
375
376 if (vxtime.hpet_address) {
377 offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
378 delay = hpet_readl(HPET_COUNTER) - offset;
379 } else {
380 spin_lock(&i8253_lock);
381 outb_p(0x00, 0x43);
382 delay = inb_p(0x40);
383 delay |= inb(0x40) << 8;
384 spin_unlock(&i8253_lock);
385 delay = LATCH - 1 - delay;
386 }
387
388 rdtscll_sync(&tsc);
389
390 if (vxtime.mode == VXTIME_HPET) {
391 if (offset - vxtime.last > hpet_tick) {
392 lost = (offset - vxtime.last) / hpet_tick - 1;
393 }
394
395 monotonic_base +=
396 (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick;
397
398 vxtime.last = offset;
399 } else {
400 offset = (((tsc - vxtime.last_tsc) *
401 vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ);
402
403 if (offset < 0)
404 offset = 0;
405
406 if (offset > (USEC_PER_SEC / HZ)) {
407 lost = offset / (USEC_PER_SEC / HZ);
408 offset %= (USEC_PER_SEC / HZ);
409 }
410
411 		monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz;
412
413 vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot;
414
415 if ((((tsc - vxtime.last_tsc) *
416 vxtime.tsc_quot) >> 32) < offset)
417 vxtime.last_tsc = tsc -
418 (((long) offset << 32) / vxtime.tsc_quot) - 1;
419 }
420
421 if (lost > 0) {
422 handle_lost_ticks(lost, regs);
423 jiffies += lost;
424 }
425
426/*
427 * Do the timer stuff.
428 */
429
430 do_timer(regs);
431#ifndef CONFIG_SMP
432 update_process_times(user_mode(regs));
433#endif
434
435/*
436 * In the SMP case we use the local APIC timer interrupt to do the profiling,
437 * except when we simulate SMP mode on a uniprocessor system, in that case we
438 * have to call the local interrupt handler.
439 */
440
441#ifndef CONFIG_X86_LOCAL_APIC
442 profile_tick(CPU_PROFILING, regs);
443#else
444 if (!using_apic_timer)
445 smp_local_timer_interrupt(regs);
446#endif
447
448/*
449 * If we have an externally synchronized Linux clock, then update CMOS clock
450 * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy
451 * closest to exactly 500 ms before the next second. If the update fails, we
452 * don't care, as it'll be updated on the next turn, and the problem (time way
453 * off) isn't likely to go away much sooner anyway.
454 */
455
456 if ((~time_status & STA_UNSYNC) && xtime.tv_sec > rtc_update &&
457 abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) {
458 set_rtc_mmss(xtime.tv_sec);
459 rtc_update = xtime.tv_sec + 660;
460 }
461
462 write_sequnlock(&xtime_lock);
463
464 return IRQ_HANDLED;
465}
466
467static unsigned int cyc2ns_scale;
468#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
469
470static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
471{
472 cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
473}
474
475static inline unsigned long long cycles_2_ns(unsigned long long cyc)
476{
477 return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
478}
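/*
 * For illustration: set_cyc2ns_scale() stores ns-per-cycle as a 10-bit
 * fixed-point value, scale = (1000 << 10) / cpu_mhz, so cycles_2_ns() is
 * a multiply and a shift instead of a divide.  Worked example, assuming a
 * hypothetical 2000 MHz CPU:
 *
 *	scale = (1000 << 10) / 2000 = 1024000 / 2000 = 512
 *	cycles_2_ns(1000) = (1000 * 512) >> 10 = 500 ns
 *
 * which matches the expected 0.5 ns per cycle at 2 GHz.
 */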
479
480unsigned long long sched_clock(void)
481{
482 unsigned long a = 0;
483
484#if 0
485 /* Don't do a HPET read here. Using TSC always is much faster
486 and HPET may not be mapped yet when the scheduler first runs.
487 Disadvantage is a small drift between CPUs in some configurations,
488 but that should be tolerable. */
489 if (__vxtime.mode == VXTIME_HPET)
490 return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32;
491#endif
492
493 /* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
 494	   which means it is not completely exact and may not be monotonic between
495 CPUs. But the errors should be too small to matter for scheduling
496 purposes. */
497
498 rdtscll(a);
499 return cycles_2_ns(a);
500}
501
502unsigned long get_cmos_time(void)
503{
504 unsigned int timeout, year, mon, day, hour, min, sec;
505 unsigned char last, this;
506 unsigned long flags;
507
508/*
509 * The Linux interpretation of the CMOS clock register contents: When the
510 * Update-In-Progress (UIP) flag goes from 1 to 0, the RTC registers show the
511 * second which has precisely just started. Waiting for this can take up to 1
 512 * second; we time out after approximately 2.4 seconds on a machine with
513 * standard 8.3 MHz ISA bus.
514 */
515
516 spin_lock_irqsave(&rtc_lock, flags);
517
518 timeout = 1000000;
519 last = this = 0;
520
 521	while (timeout && (!last || this)) {
522 last = this;
523 this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP;
524 timeout--;
525 }
526
527/*
528 * Here we are safe to assume the registers won't change for a whole second, so
529 * we just go ahead and read them.
530 */
531
532 sec = CMOS_READ(RTC_SECONDS);
533 min = CMOS_READ(RTC_MINUTES);
534 hour = CMOS_READ(RTC_HOURS);
535 day = CMOS_READ(RTC_DAY_OF_MONTH);
536 mon = CMOS_READ(RTC_MONTH);
537 year = CMOS_READ(RTC_YEAR);
538
539 spin_unlock_irqrestore(&rtc_lock, flags);
540
541/*
542 * We know that x86-64 always uses BCD format, no need to check the config
543 * register.
544 */
545
546 BCD_TO_BIN(sec);
547 BCD_TO_BIN(min);
548 BCD_TO_BIN(hour);
549 BCD_TO_BIN(day);
550 BCD_TO_BIN(mon);
551 BCD_TO_BIN(year);
552
553/*
 554 * x86-64 systems have only existed since 2002.
555 * This will work up to Dec 31, 2100
556 */
557 year += 2000;
558
559 return mktime(year, mon, day, hour, min, sec);
560}
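/*
 * For illustration: BCD_TO_BIN converts a two-digit binary-coded-decimal
 * byte to plain binary.  A minimal sketch of the same conversion:
 *
 *	static unsigned int bcd_to_bin(unsigned int bcd)
 *	{
 *		return (bcd & 0x0f) + (bcd >> 4) * 10;
 *	}
 *
 * so an RTC seconds register reading 0x59 decodes to 59.
 */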
561
562#ifdef CONFIG_CPU_FREQ
563
564/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
565 changes.
566
567 RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
568 not that important because current Opteron setups do not support
 569	   scaling on SMP anyway.
570
571 Should fix up last_tsc too. Currently gettimeofday in the
572 first tick after the change will be slightly wrong. */
573
574#include <linux/workqueue.h>
575
576static unsigned int cpufreq_delayed_issched = 0;
577static unsigned int cpufreq_init = 0;
578static struct work_struct cpufreq_delayed_get_work;
579
580static void handle_cpufreq_delayed_get(void *v)
581{
582 unsigned int cpu;
583 for_each_online_cpu(cpu) {
584 cpufreq_get(cpu);
585 }
586 cpufreq_delayed_issched = 0;
587}
588
 589/* If we notice lost ticks, schedule a call to cpufreq_get(), which
 590 * verifies that the CPU frequency the timing core thinks the CPU is
 591 * running at is still correct.
592 */
593static void cpufreq_delayed_get(void)
594{
595 static int warned;
596 if (cpufreq_init && !cpufreq_delayed_issched) {
597 cpufreq_delayed_issched = 1;
598 if (!warned) {
599 warned = 1;
600 printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n");
601 }
602 schedule_work(&cpufreq_delayed_get_work);
603 }
604}
605
606static unsigned int ref_freq = 0;
607static unsigned long loops_per_jiffy_ref = 0;
608
609static unsigned long cpu_khz_ref = 0;
610
611static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
612 void *data)
613{
614 struct cpufreq_freqs *freq = data;
615 unsigned long *lpj, dummy;
616
617 lpj = &dummy;
618 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
619#ifdef CONFIG_SMP
620 lpj = &cpu_data[freq->cpu].loops_per_jiffy;
621#else
622 lpj = &boot_cpu_data.loops_per_jiffy;
623#endif
624
625
626
627 if (!ref_freq) {
628 ref_freq = freq->old;
629 loops_per_jiffy_ref = *lpj;
630 cpu_khz_ref = cpu_khz;
631 }
632 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
633 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
634 (val == CPUFREQ_RESUMECHANGE)) {
635 *lpj =
636 cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
637
638 cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
639 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
640 vxtime.tsc_quot = (1000L << 32) / cpu_khz;
641 }
642
643 set_cyc2ns_scale(cpu_khz_ref / 1000);
644
645 return 0;
646}
647
648static struct notifier_block time_cpufreq_notifier_block = {
649 .notifier_call = time_cpufreq_notifier
650};
651
652static int __init cpufreq_tsc(void)
653{
654 INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL);
655 if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
656 CPUFREQ_TRANSITION_NOTIFIER))
657 cpufreq_init = 1;
658 return 0;
659}
660
661core_initcall(cpufreq_tsc);
662
663#endif
664
665/*
666 * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing
667 * it to the HPET timer of known frequency.
668 */
669
670#define TICK_COUNT 100000000
671
672static unsigned int __init hpet_calibrate_tsc(void)
673{
674 int tsc_start, hpet_start;
675 int tsc_now, hpet_now;
676 unsigned long flags;
677
678 local_irq_save(flags);
679 local_irq_disable();
680
681 hpet_start = hpet_readl(HPET_COUNTER);
682 rdtscl(tsc_start);
683
684 do {
685 local_irq_disable();
686 hpet_now = hpet_readl(HPET_COUNTER);
687 sync_core();
688 rdtscl(tsc_now);
689 local_irq_restore(flags);
690 } while ((tsc_now - tsc_start) < TICK_COUNT &&
691 (hpet_now - hpet_start) < TICK_COUNT);
692
693 return (tsc_now - tsc_start) * 1000000000L
694 / ((hpet_now - hpet_start) * hpet_period / 1000);
695}
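/*
 * For illustration: hpet_period is femtoseconds per HPET tick, so
 * hpet_delta * hpet_period / 1000 is the elapsed time in picoseconds, and
 * tsc_delta * 10^9 / elapsed_ps is cycles per millisecond, i.e. kHz.
 * Worked example, assuming a hypothetical 14.31818 MHz HPET
 * (period ~= 69841279 fs) and a 2 GHz TSC: after 50 ms the deltas are
 * roughly 715909 HPET ticks and 100000000 TSC cycles, giving
 *
 *	100000000 * 10^9 / (715909 * 69841279 / 1000) ~= 2000000 kHz
 */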
696
697
698/*
699 * pit_calibrate_tsc() uses the speaker output (channel 2) of
700 * the PIT. This is better than using the timer interrupt output,
701 * because we can read the value of the speaker with just one inb(),
702 * where we need three i/o operations for the interrupt channel.
703 * We count how many ticks the TSC does in 50 ms.
704 */
705
706static unsigned int __init pit_calibrate_tsc(void)
707{
708 unsigned long start, end;
709 unsigned long flags;
710
711 spin_lock_irqsave(&i8253_lock, flags);
712
713 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
714
715 outb(0xb0, 0x43);
716 outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
717 outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42);
718 rdtscll(start);
719 sync_core();
720 while ((inb(0x61) & 0x20) == 0);
721 sync_core();
722 rdtscll(end);
723
724 spin_unlock_irqrestore(&i8253_lock, flags);
725
726 return (end - start) / 50;
727}
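/*
 * For illustration: the PIT one-shot above is programmed for 50 ms
 * (PIT_TICK_RATE / (1000/50) counts), so (end - start) is TSC cycles per
 * 50 ms and dividing by 50 gives cycles per millisecond, i.e. cpu_khz.
 * E.g. a hypothetical 2 GHz CPU accumulates ~100,000,000 cycles in 50 ms,
 * and 100000000 / 50 = 2000000 kHz.
 */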
728
729#ifdef CONFIG_HPET
730static __init int late_hpet_init(void)
731{
732 struct hpet_data hd;
733 unsigned int ntimer;
734
735 if (!vxtime.hpet_address)
736 return -1;
737
738 memset(&hd, 0, sizeof (hd));
739
740 ntimer = hpet_readl(HPET_ID);
741 ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
742 ntimer++;
743
744 /*
745 * Register with driver.
 746	 * Timer0 and Timer1 are used by the platform.
747 */
748 hd.hd_phys_address = vxtime.hpet_address;
749 hd.hd_address = (void *)fix_to_virt(FIX_HPET_BASE);
750 hd.hd_nirqs = ntimer;
751 hd.hd_flags = HPET_DATA_PLATFORM;
752 hpet_reserve_timer(&hd, 0);
753#ifdef CONFIG_HPET_EMULATE_RTC
754 hpet_reserve_timer(&hd, 1);
755#endif
756 hd.hd_irq[0] = HPET_LEGACY_8254;
757 hd.hd_irq[1] = HPET_LEGACY_RTC;
758 if (ntimer > 2) {
759 struct hpet *hpet;
760 struct hpet_timer *timer;
761 int i;
762
763 hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
764
765 for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer;
766 timer++, i++)
767 hd.hd_irq[i] = (timer->hpet_config &
768 Tn_INT_ROUTE_CNF_MASK) >>
769 Tn_INT_ROUTE_CNF_SHIFT;
770
771 }
772
773 hpet_alloc(&hd);
774 return 0;
775}
776fs_initcall(late_hpet_init);
777#endif
778
779static int hpet_timer_stop_set_go(unsigned long tick)
780{
781 unsigned int cfg;
782
783/*
784 * Stop the timers and reset the main counter.
785 */
786
787 cfg = hpet_readl(HPET_CFG);
788 cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
789 hpet_writel(cfg, HPET_CFG);
790 hpet_writel(0, HPET_COUNTER);
791 hpet_writel(0, HPET_COUNTER + 4);
792
793/*
794 * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
795 * and period also hpet_tick.
796 */
797
798 hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
799 HPET_TN_32BIT, HPET_T0_CFG);
800 hpet_writel(hpet_tick, HPET_T0_CMP);
801 hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */
802
803/*
804 * Go!
805 */
806
807 cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY;
808 hpet_writel(cfg, HPET_CFG);
809
810 return 0;
811}
812
813static int hpet_init(void)
814{
815 unsigned int id;
816
817 if (!vxtime.hpet_address)
818 return -1;
819 set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address);
820 __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
821
822/*
823 * Read the period, compute tick and quotient.
824 */
825
826 id = hpet_readl(HPET_ID);
827
828 if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER) ||
829 !(id & HPET_ID_LEGSUP))
830 return -1;
831
832 hpet_period = hpet_readl(HPET_PERIOD);
833 if (hpet_period < 100000 || hpet_period > 100000000)
834 return -1;
835
836 hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) /
837 hpet_period;
838
839 return hpet_timer_stop_set_go(hpet_tick);
840}
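/*
 * For illustration: hpet_period is femtoseconds per HPET tick and
 * USEC_PER_SEC/HZ is microseconds per jiffy (1 us = 10^9 fs), so
 * hpet_tick works out to HPET ticks per jiffy, rounded to nearest.
 * Worked example, assuming HZ=1000 and a hypothetical 14.31818 MHz HPET
 * (period ~= 69841279 fs):
 *
 *	hpet_tick = (10^9 * 1000 + 69841279/2) / 69841279 ~= 14318
 */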
841
842static int hpet_reenable(void)
843{
844 return hpet_timer_stop_set_go(hpet_tick);
845}
846
847void __init pit_init(void)
848{
849 unsigned long flags;
850
851 spin_lock_irqsave(&i8253_lock, flags);
852 outb_p(0x34, 0x43); /* binary, mode 2, LSB/MSB, ch 0 */
853 outb_p(LATCH & 0xff, 0x40); /* LSB */
854 outb_p(LATCH >> 8, 0x40); /* MSB */
855 spin_unlock_irqrestore(&i8253_lock, flags);
856}
857
858int __init time_setup(char *str)
859{
860 report_lost_ticks = 1;
861 return 1;
862}
863
864static struct irqaction irq0 = {
865 timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL
866};
867
868extern void __init config_acpi_tables(void);
869
870void __init time_init(void)
871{
872 char *timename;
873
874#ifdef HPET_HACK_ENABLE_DANGEROUS
875 if (!vxtime.hpet_address) {
876 printk(KERN_WARNING "time.c: WARNING: Enabling HPET base "
877 "manually!\n");
878 outl(0x800038a0, 0xcf8);
879 outl(0xff000001, 0xcfc);
880 outl(0x800038a0, 0xcf8);
881 vxtime.hpet_address = inl(0xcfc) & 0xfffffffe;
882 printk(KERN_WARNING "time.c: WARNING: Enabled HPET "
883 "at %#lx.\n", vxtime.hpet_address);
884 }
885#endif
886 if (nohpet)
887 vxtime.hpet_address = 0;
888
889 xtime.tv_sec = get_cmos_time();
890 xtime.tv_nsec = 0;
891
892 set_normalized_timespec(&wall_to_monotonic,
893 -xtime.tv_sec, -xtime.tv_nsec);
894
895 if (!hpet_init()) {
896 vxtime_hz = (1000000000000000L + hpet_period / 2) /
897 hpet_period;
898 cpu_khz = hpet_calibrate_tsc();
899 timename = "HPET";
900 } else {
901 pit_init();
902 cpu_khz = pit_calibrate_tsc();
903 timename = "PIT";
904 }
905
906 printk(KERN_INFO "time.c: Using %ld.%06ld MHz %s timer.\n",
907 vxtime_hz / 1000000, vxtime_hz % 1000000, timename);
908 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
909 cpu_khz / 1000, cpu_khz % 1000);
910 vxtime.mode = VXTIME_TSC;
911 vxtime.quot = (1000000L << 32) / vxtime_hz;
912 vxtime.tsc_quot = (1000L << 32) / cpu_khz;
913 vxtime.hz = vxtime_hz;
914 rdtscll_sync(&vxtime.last_tsc);
915 setup_irq(0, &irq0);
916
917 set_cyc2ns_scale(cpu_khz / 1000);
918}
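/*
 * For illustration: vxtime.tsc_quot is microseconds per TSC cycle in
 * 32.32 fixed point, (1000 << 32) / cpu_khz, consumed elsewhere as
 * (cycle_delta * tsc_quot) >> 32.  Worked example, assuming a
 * hypothetical 2,000,000 kHz CPU:
 *
 *	tsc_quot = (1000 << 32) / 2000000 = 2147483
 *	(2000000 cycles * 2147483) >> 32 ~= 999 us, i.e. ~1 ms
 */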
919
920void __init time_init_smp(void)
921{
922 char *timetype;
923
924 /*
925 * AMD systems with more than one CPU don't have fully synchronized
926 * TSCs. Always use HPET gettimeofday for these, although it is slower.
 927	 * Intel SMP systems usually have synchronized TSCs, so always use
 928	 * the TSC.
929 *
930 * Exceptions:
931 * IBM Summit2 checked by oem_force_hpet_timer().
932 * AMD dual core may also not need HPET. Check me.
933 *
934 * Can be turned off with "notsc".
935 */
936 if (num_online_cpus() > 1 &&
937 boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
938 notsc = 1;
939 /* Some systems will want to disable TSC and use HPET. */
940 if (oem_force_hpet_timer())
941 notsc = 1;
942 if (vxtime.hpet_address && notsc) {
943 timetype = "HPET";
944 vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
945 vxtime.mode = VXTIME_HPET;
946 do_gettimeoffset = do_gettimeoffset_hpet;
947 } else {
948 timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC";
949 vxtime.mode = VXTIME_TSC;
950 }
951
952 printk(KERN_INFO "time.c: Using %s based timekeeping.\n", timetype);
953}
954
955__setup("report_lost_ticks", time_setup);
956
957static long clock_cmos_diff;
958static unsigned long sleep_start;
959
960static int timer_suspend(struct sys_device *dev, u32 state)
961{
962 /*
963 * Estimate time zone so that set_time can update the clock
964 */
965 long cmos_time = get_cmos_time();
966
967 clock_cmos_diff = -cmos_time;
968 clock_cmos_diff += get_seconds();
969 sleep_start = cmos_time;
970 return 0;
971}
972
973static int timer_resume(struct sys_device *dev)
974{
975 unsigned long flags;
976 unsigned long sec;
977 unsigned long ctime = get_cmos_time();
978 unsigned long sleep_length = (ctime - sleep_start) * HZ;
979
980 if (vxtime.hpet_address)
981 hpet_reenable();
982 else
983 i8254_timer_resume();
984
985 sec = ctime + clock_cmos_diff;
986 write_seqlock_irqsave(&xtime_lock,flags);
987 xtime.tv_sec = sec;
988 xtime.tv_nsec = 0;
989 write_sequnlock_irqrestore(&xtime_lock,flags);
990 jiffies += sleep_length;
991 wall_jiffies += sleep_length;
992 return 0;
993}
994
995static struct sysdev_class timer_sysclass = {
996 .resume = timer_resume,
997 .suspend = timer_suspend,
998 set_kset_name("timer"),
999};
1000
1001
1002/* XXX this driverfs stuff should probably go elsewhere later -john */
1003static struct sys_device device_timer = {
1004 .id = 0,
1005 .cls = &timer_sysclass,
1006};
1007
1008static int time_init_device(void)
1009{
1010 int error = sysdev_class_register(&timer_sysclass);
1011 if (!error)
1012 error = sysdev_register(&device_timer);
1013 return error;
1014}
1015
1016device_initcall(time_init_device);
1017
1018#ifdef CONFIG_HPET_EMULATE_RTC
 1019/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When HPET
1020 * is enabled, we support RTC interrupt functionality in software.
1021 * RTC has 3 kinds of interrupts:
1022 * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
1023 * is updated
1024 * 2) Alarm Interrupt - generate an interrupt at a specific time of day
1025 * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
1026 * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
1027 * (1) and (2) above are implemented using polling at a frequency of
1028 * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
1029 * overhead. (DEFAULT_RTC_INT_FREQ)
1030 * For (3), we use interrupts at 64Hz or user specified periodic
1031 * frequency, whichever is higher.
1032 */
1033#include <linux/rtc.h>
1034
1035extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs);
1036
1037#define DEFAULT_RTC_INT_FREQ 64
1038#define RTC_NUM_INTS 1
1039
1040static unsigned long UIE_on;
1041static unsigned long prev_update_sec;
1042
1043static unsigned long AIE_on;
1044static struct rtc_time alarm_time;
1045
1046static unsigned long PIE_on;
1047static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
1048static unsigned long PIE_count;
1049
1050static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
1051
1052int is_hpet_enabled(void)
1053{
1054 return vxtime.hpet_address != 0;
1055}
1056
1057/*
1058 * Timer 1 for RTC, we do not use periodic interrupt feature,
1059 * even if HPET supports periodic interrupts on Timer 1.
1060 * The reason being, to set up a periodic interrupt in HPET, we need to
 1061 * stop the main counter.  And if we did that every time someone disables/enables
 1062 * the RTC, it would adversely affect the main kernel timer running on Timer 0.
1063 * So, for the time being, simulate the periodic interrupt in software.
1064 *
 1065 * hpet_rtc_timer_init() is called for the first interrupt; on subsequent
 1066 * interrupts, reinitialization happens through hpet_rtc_timer_reinit().
1067 */
1068int hpet_rtc_timer_init(void)
1069{
1070 unsigned int cfg, cnt;
1071 unsigned long flags;
1072
1073 if (!is_hpet_enabled())
1074 return 0;
1075 /*
1076 * Set the counter 1 and enable the interrupts.
1077 */
1078 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
1079 hpet_rtc_int_freq = PIE_freq;
1080 else
1081 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
1082
1083 local_irq_save(flags);
1084 cnt = hpet_readl(HPET_COUNTER);
1085 cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
1086 hpet_writel(cnt, HPET_T1_CMP);
1087 local_irq_restore(flags);
1088
1089 cfg = hpet_readl(HPET_T1_CFG);
1090 cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
1091 hpet_writel(cfg, HPET_T1_CFG);
1092
1093 return 1;
1094}
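/*
 * For illustration: hpet_tick is HPET ticks per jiffy, so hpet_tick * HZ
 * is HPET ticks per second and dividing by hpet_rtc_int_freq gives the
 * comparator increment per simulated RTC interrupt.  E.g. with a
 * hypothetical 14.318 MHz HPET (hpet_tick * HZ ~= 14318000) and the
 * default 64 Hz RTC rate, the comparator advances by ~223718 ticks,
 * i.e. one interrupt every 1/64 s.
 */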
1095
1096static void hpet_rtc_timer_reinit(void)
1097{
1098 unsigned int cfg, cnt;
1099
1100 if (!(PIE_on | AIE_on | UIE_on))
1101 return;
1102
1103 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
1104 hpet_rtc_int_freq = PIE_freq;
1105 else
1106 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
1107
 1108	/* It is more accurate to use the comparator value than the current count. */
1109 cnt = hpet_readl(HPET_T1_CMP);
1110 cnt += hpet_tick*HZ/hpet_rtc_int_freq;
1111 hpet_writel(cnt, HPET_T1_CMP);
1112
1113 cfg = hpet_readl(HPET_T1_CFG);
1114 cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
1115 hpet_writel(cfg, HPET_T1_CFG);
1116
1117 return;
1118}
1119
1120/*
1121 * The functions below are called from rtc driver.
1122 * Return 0 if HPET is not being used.
1123 * Otherwise do the necessary changes and return 1.
1124 */
1125int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
1126{
1127 if (!is_hpet_enabled())
1128 return 0;
1129
1130 if (bit_mask & RTC_UIE)
1131 UIE_on = 0;
1132 if (bit_mask & RTC_PIE)
1133 PIE_on = 0;
1134 if (bit_mask & RTC_AIE)
1135 AIE_on = 0;
1136
1137 return 1;
1138}
1139
1140int hpet_set_rtc_irq_bit(unsigned long bit_mask)
1141{
1142 int timer_init_reqd = 0;
1143
1144 if (!is_hpet_enabled())
1145 return 0;
1146
1147 if (!(PIE_on | AIE_on | UIE_on))
1148 timer_init_reqd = 1;
1149
1150 if (bit_mask & RTC_UIE) {
1151 UIE_on = 1;
1152 }
1153 if (bit_mask & RTC_PIE) {
1154 PIE_on = 1;
1155 PIE_count = 0;
1156 }
1157 if (bit_mask & RTC_AIE) {
1158 AIE_on = 1;
1159 }
1160
1161 if (timer_init_reqd)
1162 hpet_rtc_timer_init();
1163
1164 return 1;
1165}
1166
1167int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
1168{
1169 if (!is_hpet_enabled())
1170 return 0;
1171
1172 alarm_time.tm_hour = hrs;
1173 alarm_time.tm_min = min;
1174 alarm_time.tm_sec = sec;
1175
1176 return 1;
1177}
1178
1179int hpet_set_periodic_freq(unsigned long freq)
1180{
1181 if (!is_hpet_enabled())
1182 return 0;
1183
1184 PIE_freq = freq;
1185 PIE_count = 0;
1186
1187 return 1;
1188}
1189
1190int hpet_rtc_dropped_irq(void)
1191{
1192 if (!is_hpet_enabled())
1193 return 0;
1194
1195 return 1;
1196}
1197
1198irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
1199{
1200 struct rtc_time curr_time;
1201 unsigned long rtc_int_flag = 0;
1202 int call_rtc_interrupt = 0;
1203
1204 hpet_rtc_timer_reinit();
1205
1206 if (UIE_on | AIE_on) {
1207 rtc_get_rtc_time(&curr_time);
1208 }
1209 if (UIE_on) {
1210 if (curr_time.tm_sec != prev_update_sec) {
1211 /* Set update int info, call real rtc int routine */
1212 call_rtc_interrupt = 1;
1213 rtc_int_flag = RTC_UF;
1214 prev_update_sec = curr_time.tm_sec;
1215 }
1216 }
1217 if (PIE_on) {
1218 PIE_count++;
1219 if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
1220 /* Set periodic int info, call real rtc int routine */
1221 call_rtc_interrupt = 1;
1222 rtc_int_flag |= RTC_PF;
1223 PIE_count = 0;
1224 }
1225 }
1226 if (AIE_on) {
1227 if ((curr_time.tm_sec == alarm_time.tm_sec) &&
1228 (curr_time.tm_min == alarm_time.tm_min) &&
1229 (curr_time.tm_hour == alarm_time.tm_hour)) {
1230 /* Set alarm int info, call real rtc int routine */
1231 call_rtc_interrupt = 1;
1232 rtc_int_flag |= RTC_AF;
1233 }
1234 }
1235 if (call_rtc_interrupt) {
1236 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
1237 rtc_interrupt(rtc_int_flag, dev_id, regs);
1238 }
1239 return IRQ_HANDLED;
1240}
1241#endif
1242
1243
1244
1245static int __init nohpet_setup(char *s)
1246{
1247 nohpet = 1;
1248 return 0;
1249}
1250
1251__setup("nohpet", nohpet_setup);
1252
1253
1254static int __init notsc_setup(char *s)
1255{
1256 notsc = 1;
1257 return 0;
1258}
1259
1260__setup("notsc", notsc_setup);
1261
1262
diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S
new file mode 100644
index 000000000000..6d9c9a8e7d0e
--- /dev/null
+++ b/arch/x86_64/kernel/trampoline.S
@@ -0,0 +1,64 @@
1/*
2 *
3 * Trampoline.S Derived from Setup.S by Linus Torvalds
4 *
5 * 4 Jan 1997 Michael Chastain: changed to gnu as.
6 *
7 * Entry: CS:IP point to the start of our code, we are
 8 * in real mode with no stack, but with the rest of the
 9 * trampoline page available to make our stack; everything else
 10 * is a mystery.
11 *
12 * In fact we don't actually need a stack so we don't
13 * set one up.
14 *
15 * On entry to trampoline_data, the processor is in real mode
16 * with 16-bit addressing and 16-bit data. CS has some value
17 * and IP is zero. Thus, data addresses need to be absolute
18 * (no relocation) and are taken with regard to r_base.
19 *
20 * If you work on this file, check the object module with objdump
21 * --full-contents --reloc to make sure there are no relocation
22 * entries. For the GDT entry we do hand relocation in smpboot.c
23 * because of 64bit linker limitations.
24 */
25
26#include <linux/linkage.h>
27#include <asm/segment.h>
28#include <asm/page.h>
29
30.data
31
32.code16
33
34ENTRY(trampoline_data)
35r_base = .
36 wbinvd
37 mov %cs, %ax # Code and data in the same place
38 mov %ax, %ds
39
40 cli # We should be safe anyway
41
42 movl $0xA5A5A5A5, trampoline_data - r_base
 43					# write marker so the master knows we're running
44
45 lidt idt_48 - r_base # load idt with 0, 0
46 lgdt gdt_48 - r_base # load gdt with whatever is appropriate
47
48 xor %ax, %ax
49 inc %ax # protected mode (PE) bit
50 lmsw %ax # into protected mode
 51	# flush the prefetch queue and jump to startup_32 in arch/x86_64/kernel/head.S
52 ljmpl $__KERNEL32_CS, $(startup_32-__START_KERNEL_map)
53
 54	# Careful: these need to be in the same 64K segment as the code above
55idt_48:
56 .word 0 # idt limit = 0
57 .word 0, 0 # idt base = 0L
58
59gdt_48:
60 .short __KERNEL32_CS + 7 # gdt limit
61 .long cpu_gdt_table-__START_KERNEL_map
62
63.globl trampoline_end
64trampoline_end:
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
new file mode 100644
index 000000000000..d87ac6496cc2
--- /dev/null
+++ b/arch/x86_64/kernel/traps.c
@@ -0,0 +1,948 @@
1/*
2 * linux/arch/x86-64/traps.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
6 *
7 * Pentium III FXSR, SSE support
8 * Gareth Hughes <gareth@valinux.com>, May 2000
9 *
10 * $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $
11 */
12
13/*
14 * 'Traps.c' handles hardware traps and faults after we have saved some
15 * state in 'entry.S'.
16 */
17#include <linux/config.h>
18#include <linux/sched.h>
19#include <linux/kernel.h>
20#include <linux/string.h>
21#include <linux/errno.h>
22#include <linux/ptrace.h>
23#include <linux/timer.h>
24#include <linux/mm.h>
25#include <linux/init.h>
26#include <linux/delay.h>
27#include <linux/spinlock.h>
28#include <linux/interrupt.h>
29#include <linux/module.h>
30#include <linux/moduleparam.h>
31
32#include <asm/system.h>
33#include <asm/uaccess.h>
34#include <asm/io.h>
35#include <asm/atomic.h>
36#include <asm/debugreg.h>
37#include <asm/desc.h>
38#include <asm/i387.h>
39#include <asm/kdebug.h>
40#include <asm/processor.h>
41
42#include <asm/smp.h>
43#include <asm/pgalloc.h>
44#include <asm/pda.h>
45#include <asm/proto.h>
46#include <asm/nmi.h>
47
48#include <linux/irq.h>
49
50
51extern struct gate_struct idt_table[256];
52
53asmlinkage void divide_error(void);
54asmlinkage void debug(void);
55asmlinkage void nmi(void);
56asmlinkage void int3(void);
57asmlinkage void overflow(void);
58asmlinkage void bounds(void);
59asmlinkage void invalid_op(void);
60asmlinkage void device_not_available(void);
61asmlinkage void double_fault(void);
62asmlinkage void coprocessor_segment_overrun(void);
63asmlinkage void invalid_TSS(void);
64asmlinkage void segment_not_present(void);
65asmlinkage void stack_segment(void);
66asmlinkage void general_protection(void);
67asmlinkage void page_fault(void);
68asmlinkage void coprocessor_error(void);
69asmlinkage void simd_coprocessor_error(void);
70asmlinkage void reserved(void);
71asmlinkage void alignment_check(void);
72asmlinkage void machine_check(void);
73asmlinkage void spurious_interrupt_bug(void);
74asmlinkage void call_debug(void);
75
76struct notifier_block *die_chain;
77static DEFINE_SPINLOCK(die_notifier_lock);
78
79int register_die_notifier(struct notifier_block *nb)
80{
81 int err = 0;
82 unsigned long flags;
83 spin_lock_irqsave(&die_notifier_lock, flags);
84 err = notifier_chain_register(&die_chain, nb);
85 spin_unlock_irqrestore(&die_notifier_lock, flags);
86 return err;
87}
88
89static inline void conditional_sti(struct pt_regs *regs)
90{
91 if (regs->eflags & X86_EFLAGS_IF)
92 local_irq_enable();
93}
94
95static int kstack_depth_to_print = 10;
96
97#ifdef CONFIG_KALLSYMS
98#include <linux/kallsyms.h>
99int printk_address(unsigned long address)
100{
101 unsigned long offset = 0, symsize;
102 const char *symname;
103 char *modname;
104 char *delim = ":";
105 char namebuf[128];
106
107 symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf);
108 if (!symname)
109 return printk("[<%016lx>]", address);
110 if (!modname)
111 modname = delim = "";
112 return printk("<%016lx>{%s%s%s%s%+ld}",
113 address,delim,modname,delim,symname,offset);
114}
115#else
116int printk_address(unsigned long address)
117{
118 return printk("[<%016lx>]", address);
119}
120#endif
121
122unsigned long *in_exception_stack(int cpu, unsigned long stack)
123{
124 int k;
125 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
126 struct tss_struct *tss = &per_cpu(init_tss, cpu);
127 unsigned long start = tss->ist[k] - EXCEPTION_STKSZ;
128
129 if (stack >= start && stack < tss->ist[k])
130 return (unsigned long *)tss->ist[k];
131 }
132 return NULL;
133}
134
135/*
 136 * x86-64 can have up to three kernel stacks:
137 * process stack
138 * interrupt stack
139 * severe exception (double fault, nmi, stack fault) hardware stack
140 * Check and process them in order.
141 */
142
143void show_trace(unsigned long *stack)
144{
145 unsigned long addr;
146 unsigned long *irqstack, *irqstack_end, *estack_end;
147 const int cpu = safe_smp_processor_id();
148 int i;
149
150 printk("\nCall Trace:");
151 i = 0;
152
153 estack_end = in_exception_stack(cpu, (unsigned long)stack);
154 if (estack_end) {
155 while (stack < estack_end) {
156 addr = *stack++;
157 if (__kernel_text_address(addr)) {
158 i += printk_address(addr);
159 i += printk(" ");
160 if (i > 50) {
161 printk("\n");
162 i = 0;
163 }
164 }
165 }
166 i += printk(" <EOE> ");
167 i += 7;
168 stack = (unsigned long *) estack_end[-2];
169 }
170
171 irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr);
172 irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE + 64);
173
174 if (stack >= irqstack && stack < irqstack_end) {
175 printk("<IRQ> ");
176 while (stack < irqstack_end) {
177 addr = *stack++;
178 /*
179 * If the address is either in the text segment of the
180 * kernel, or in the region which contains vmalloc'ed
181 * memory, it *may* be the address of a calling
182 * routine; if so, print it so that someone tracing
183 * down the cause of the crash will be able to figure
184 * out the call path that was taken.
185 */
186 if (__kernel_text_address(addr)) {
187 i += printk_address(addr);
188 i += printk(" ");
189 if (i > 50) {
190 printk("\n ");
191 i = 0;
192 }
193 }
194 }
195 stack = (unsigned long *) (irqstack_end[-1]);
196 printk(" <EOI> ");
197 i += 7;
198 }
199
200 while (((long) stack & (THREAD_SIZE-1)) != 0) {
201 addr = *stack++;
202 if (__kernel_text_address(addr)) {
203 i += printk_address(addr);
204 i += printk(" ");
205 if (i > 50) {
206 printk("\n ");
207 i = 0;
208 }
209 }
210 }
211 printk("\n");
212}
213
214void show_stack(struct task_struct *tsk, unsigned long * rsp)
215{
216 unsigned long *stack;
217 int i;
218 const int cpu = safe_smp_processor_id();
219 unsigned long *irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr);
220 unsigned long *irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE);
221
222 // debugging aid: "show_stack(NULL, NULL);" prints the
223 // back trace for this cpu.
224
225 if (rsp == NULL) {
226 if (tsk)
227 rsp = (unsigned long *)tsk->thread.rsp;
228 else
229 rsp = (unsigned long *)&rsp;
230 }
231
232 stack = rsp;
233 for(i=0; i < kstack_depth_to_print; i++) {
234 if (stack >= irqstack && stack <= irqstack_end) {
235 if (stack == irqstack_end) {
236 stack = (unsigned long *) (irqstack_end[-1]);
237 printk(" <EOI> ");
238 }
239 } else {
240 if (((long) stack & (THREAD_SIZE-1)) == 0)
241 break;
242 }
243 if (i && ((i % 4) == 0))
244 printk("\n ");
245 printk("%016lx ", *stack++);
246 }
247 show_trace((unsigned long *)rsp);
248}
249
250/*
251 * The architecture-independent dump_stack generator
252 */
253void dump_stack(void)
254{
255 unsigned long dummy;
256 show_trace(&dummy);
257}
258
259EXPORT_SYMBOL(dump_stack);
260
261void show_registers(struct pt_regs *regs)
262{
263 int i;
264 int in_kernel = (regs->cs & 3) == 0;
265 unsigned long rsp;
266 const int cpu = safe_smp_processor_id();
267 struct task_struct *cur = cpu_pda[cpu].pcurrent;
268
269 rsp = regs->rsp;
270
271 printk("CPU %d ", cpu);
272 __show_regs(regs);
273 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
274 cur->comm, cur->pid, cur->thread_info, cur);
275
276 /*
277 * When in-kernel, we also print out the stack and code at the
 278	 * time of the fault.
279 */
280 if (in_kernel) {
281
282 printk("Stack: ");
283 show_stack(NULL, (unsigned long*)rsp);
284
285 printk("\nCode: ");
286 if(regs->rip < PAGE_OFFSET)
287 goto bad;
288
289 for(i=0;i<20;i++)
290 {
291 unsigned char c;
292 if(__get_user(c, &((unsigned char*)regs->rip)[i])) {
293bad:
294 printk(" Bad RIP value.");
295 break;
296 }
297 printk("%02x ", c);
298 }
299 }
300 printk("\n");
301}
302
303void handle_BUG(struct pt_regs *regs)
304{
305 struct bug_frame f;
306 char tmp;
307
308 if (regs->cs & 3)
309 return;
310 if (__copy_from_user(&f, (struct bug_frame *) regs->rip,
311 sizeof(struct bug_frame)))
312 return;
313 if ((unsigned long)f.filename < __PAGE_OFFSET ||
314 f.ud2[0] != 0x0f || f.ud2[1] != 0x0b)
315 return;
316 if (__get_user(tmp, f.filename))
317 f.filename = "unmapped filename";
318 printk("----------- [cut here ] --------- [please bite here ] ---------\n");
319 printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", f.filename, f.line);
320}
321
322void out_of_line_bug(void)
323{
324 BUG();
325}
326
327static DEFINE_SPINLOCK(die_lock);
328static int die_owner = -1;
329
330void oops_begin(void)
331{
332 int cpu = safe_smp_processor_id();
333 /* racy, but better than risking deadlock. */
334 local_irq_disable();
335 if (!spin_trylock(&die_lock)) {
336 if (cpu == die_owner)
337 /* nested oops. should stop eventually */;
338 else
339 spin_lock(&die_lock);
340 }
341 die_owner = cpu;
342 console_verbose();
343 bust_spinlocks(1);
344}
345
346void oops_end(void)
347{
348 die_owner = -1;
349 bust_spinlocks(0);
350 spin_unlock(&die_lock);
351 if (panic_on_oops)
352 panic("Oops");
353}
354
355void __die(const char * str, struct pt_regs * regs, long err)
356{
357 static int die_counter;
358 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
359#ifdef CONFIG_PREEMPT
360 printk("PREEMPT ");
361#endif
362#ifdef CONFIG_SMP
363 printk("SMP ");
364#endif
365#ifdef CONFIG_DEBUG_PAGEALLOC
366 printk("DEBUG_PAGEALLOC");
367#endif
368 printk("\n");
369 notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV);
370 show_registers(regs);
371 /* Executive summary in case the oops scrolled away */
372 printk(KERN_ALERT "RIP ");
373 printk_address(regs->rip);
374 printk(" RSP <%016lx>\n", regs->rsp);
375}
376
377void die(const char * str, struct pt_regs * regs, long err)
378{
379 oops_begin();
380 handle_BUG(regs);
381 __die(str, regs, err);
382 oops_end();
383 do_exit(SIGSEGV);
384}
385static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
386{
387 if (!(regs->eflags & VM_MASK) && (regs->cs == __KERNEL_CS))
388 die(str, regs, err);
389}
390
391void die_nmi(char *str, struct pt_regs *regs)
392{
393 oops_begin();
394 /*
395 * We are in trouble anyway, lets at least try
396 * to get a message out.
397 */
398 printk(str, safe_smp_processor_id());
399 show_registers(regs);
400 if (panic_on_timeout || panic_on_oops)
401 panic("nmi watchdog");
402 printk("console shuts up ...\n");
403 oops_end();
404 do_exit(SIGSEGV);
405}
406
407static void do_trap(int trapnr, int signr, char *str,
408 struct pt_regs * regs, long error_code, siginfo_t *info)
409{
410 conditional_sti(regs);
411
412#ifdef CONFIG_CHECKING
413 {
414 unsigned long gs;
415 struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
416 rdmsrl(MSR_GS_BASE, gs);
417 if (gs != (unsigned long)pda) {
418 wrmsrl(MSR_GS_BASE, pda);
419 printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda,
420 regs->rip);
421 }
422 }
423#endif
424
425 if ((regs->cs & 3) != 0) {
426 struct task_struct *tsk = current;
427
428 if (exception_trace && unhandled_signal(tsk, signr))
429 printk(KERN_INFO
430 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
431 tsk->comm, tsk->pid, str,
432 regs->rip,regs->rsp,error_code);
433
434 tsk->thread.error_code = error_code;
435 tsk->thread.trap_no = trapnr;
436 if (info)
437 force_sig_info(signr, info, tsk);
438 else
439 force_sig(signr, tsk);
440 return;
441 }
442
443
444 /* kernel trap */
445 {
446 const struct exception_table_entry *fixup;
447 fixup = search_exception_tables(regs->rip);
448 if (fixup) {
449 regs->rip = fixup->fixup;
450 } else
451 die(str, regs, error_code);
452 return;
453 }
454}
455
456#define DO_ERROR(trapnr, signr, str, name) \
457asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
458{ \
459 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
460 == NOTIFY_STOP) \
461 return; \
462 do_trap(trapnr, signr, str, regs, error_code, NULL); \
463}
464
465#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
466asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
467{ \
468 siginfo_t info; \
469 info.si_signo = signr; \
470 info.si_errno = 0; \
471 info.si_code = sicode; \
472 info.si_addr = (void __user *)siaddr; \
473 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
474 == NOTIFY_STOP) \
475 return; \
476 do_trap(trapnr, signr, str, regs, error_code, &info); \
477}
478
479DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
480DO_ERROR( 4, SIGSEGV, "overflow", overflow)
481DO_ERROR( 5, SIGSEGV, "bounds", bounds)
482DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->rip)
483DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
484DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
485DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
486DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
487DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
488DO_ERROR(18, SIGSEGV, "reserved", reserved)
489
490#define DO_ERROR_STACK(trapnr, signr, str, name) \
491asmlinkage void *do_##name(struct pt_regs * regs, long error_code) \
492{ \
493 struct pt_regs *pr = ((struct pt_regs *)(current->thread.rsp0))-1; \
494 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
495 == NOTIFY_STOP) \
496 return regs; \
497 if (regs->cs & 3) { \
498 memcpy(pr, regs, sizeof(struct pt_regs)); \
499 regs = pr; \
500 } \
501 do_trap(trapnr, signr, str, regs, error_code, NULL); \
502 return regs; \
503}
504
505DO_ERROR_STACK(12, SIGBUS, "stack segment", stack_segment)
506DO_ERROR_STACK( 8, SIGSEGV, "double fault", double_fault)
507
508asmlinkage void do_general_protection(struct pt_regs * regs, long error_code)
509{
510 conditional_sti(regs);
511
512#ifdef CONFIG_CHECKING
513 {
514 unsigned long gs;
515 struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
516 rdmsrl(MSR_GS_BASE, gs);
517 if (gs != (unsigned long)pda) {
518 wrmsrl(MSR_GS_BASE, pda);
519 oops_in_progress++;
520 printk("general protection handler: wrong gs %lx expected %p\n", gs, pda);
521 oops_in_progress--;
522 }
523 }
524#endif
525
526 if ((regs->cs & 3)!=0) {
527 struct task_struct *tsk = current;
528
529 if (exception_trace && unhandled_signal(tsk, SIGSEGV))
530 printk(KERN_INFO
531 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
532 tsk->comm, tsk->pid,
533 regs->rip,regs->rsp,error_code);
534
535 tsk->thread.error_code = error_code;
536 tsk->thread.trap_no = 13;
537 force_sig(SIGSEGV, tsk);
538 return;
539 }
540
541 /* kernel gp */
542 {
543 const struct exception_table_entry *fixup;
544 fixup = search_exception_tables(regs->rip);
545 if (fixup) {
546 regs->rip = fixup->fixup;
547 return;
548 }
549 if (notify_die(DIE_GPF, "general protection fault", regs,
550 error_code, 13, SIGSEGV) == NOTIFY_STOP)
551 return;
552 die("general protection fault", regs, error_code);
553 }
554}
555
556static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
557{
558 printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
559 printk("You probably have a hardware problem with your RAM chips\n");
560
561 /* Clear and disable the memory parity error line. */
562 reason = (reason & 0xf) | 4;
563 outb(reason, 0x61);
564}
565
566static void io_check_error(unsigned char reason, struct pt_regs * regs)
567{
568 printk("NMI: IOCK error (debug interrupt?)\n");
569 show_registers(regs);
570
571 /* Re-enable the IOCK line, wait for a few seconds */
572 reason = (reason & 0xf) | 8;
573 outb(reason, 0x61);
574 mdelay(2000);
575 reason &= ~8;
576 outb(reason, 0x61);
577}
578
579static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
580{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
581 printk("Dazed and confused, but trying to continue\n");
582 printk("Do you have a strange power saving mode enabled?\n");
583}
584
585asmlinkage void default_do_nmi(struct pt_regs *regs)
586{
587 unsigned char reason = 0;
588
589 /* Only the BSP gets external NMIs from the system. */
590 if (!smp_processor_id())
591 reason = get_nmi_reason();
592
593 if (!(reason & 0xc0)) {
594 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
595 == NOTIFY_STOP)
596 return;
597#ifdef CONFIG_X86_LOCAL_APIC
598 /*
599 * Ok, so this is none of the documented NMI sources,
600 * so it must be the NMI watchdog.
601 */
602 if (nmi_watchdog > 0) {
603 nmi_watchdog_tick(regs,reason);
604 return;
605 }
606#endif
607 unknown_nmi_error(reason, regs);
608 return;
609 }
610 if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP)
611 return;
612
613 /* AK: following checks seem to be broken on modern chipsets. FIXME */
614
615 if (reason & 0x80)
616 mem_parity_error(reason, regs);
617 if (reason & 0x40)
618 io_check_error(reason, regs);
619}
620
621asmlinkage void do_int3(struct pt_regs * regs, long error_code)
622{
623 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
624 return;
625 }
626 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
627 return;
628}
629
630/* runs on IST stack. */
631asmlinkage void *do_debug(struct pt_regs * regs, unsigned long error_code)
632{
633 struct pt_regs *pr;
634 unsigned long condition;
635 struct task_struct *tsk = current;
636 siginfo_t info;
637
638 pr = (struct pt_regs *)(current->thread.rsp0)-1;
639 if (regs->cs & 3) {
640 memcpy(pr, regs, sizeof(struct pt_regs));
641 regs = pr;
642 }
643
644#ifdef CONFIG_CHECKING
645 {
646 /* RED-PEN interaction with debugger - could destroy gs */
647 unsigned long gs;
648 struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
649 rdmsrl(MSR_GS_BASE, gs);
650 if (gs != (unsigned long)pda) {
651 wrmsrl(MSR_GS_BASE, pda);
652 printk("debug handler: wrong gs %lx expected %p\n", gs, pda);
653 }
654 }
655#endif
656
657 asm("movq %%db6,%0" : "=r" (condition));
658
659 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
660 SIGTRAP) == NOTIFY_STOP) {
661 return regs;
662 }
663 conditional_sti(regs);
664
665 /* Mask out spurious debug traps due to lazy DR7 setting */
666 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
667 if (!tsk->thread.debugreg7) {
668 goto clear_dr7;
669 }
670 }
671
672 tsk->thread.debugreg6 = condition;
673
674 /* Mask out spurious TF errors due to lazy TF clearing */
675 if ((condition & DR_STEP) &&
676 (notify_die(DIE_DEBUGSTEP, "debugstep", regs, condition,
677 1, SIGTRAP) != NOTIFY_STOP)) {
678 /*
679 * The TF error should be masked out only if the current
680 * process is not traced and if the TRAP flag has been set
681 * previously by a tracing process (condition detected by
682 * the PT_DTRACE flag); remember that the i386 TRAP flag
683 * can be modified by the process itself in user mode,
684 * allowing programs to debug themselves without the ptrace()
685 * interface.
686 */
687 if ((regs->cs & 3) == 0)
688 goto clear_TF_reenable;
689 if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE)
690 goto clear_TF;
691 }
692
693 /* Ok, finally something we can handle */
694 tsk->thread.trap_no = 1;
695 tsk->thread.error_code = error_code;
696 info.si_signo = SIGTRAP;
697 info.si_errno = 0;
698 info.si_code = TRAP_BRKPT;
699 if ((regs->cs & 3) == 0)
700 goto clear_dr7;
701
702 info.si_addr = (void __user *)regs->rip;
703 force_sig_info(SIGTRAP, &info, tsk);
704clear_dr7:
705 asm volatile("movq %0,%%db7"::"r"(0UL));
706 notify_die(DIE_DEBUG, "debug", regs, condition, 1, SIGTRAP);
707 return regs;
708
709clear_TF_reenable:
710 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
711
712clear_TF:
713 /* RED-PEN could cause spurious errors */
714 if (notify_die(DIE_DEBUG, "debug2", regs, condition, 1, SIGTRAP)
715 != NOTIFY_STOP)
716 regs->eflags &= ~TF_MASK;
717 return regs;
718}
719
720static int kernel_math_error(struct pt_regs *regs, char *str)
721{
722 const struct exception_table_entry *fixup;
723 fixup = search_exception_tables(regs->rip);
724 if (fixup) {
725 regs->rip = fixup->fixup;
726 return 1;
727 }
728 notify_die(DIE_GPF, str, regs, 0, 16, SIGFPE);
729#if 0
730 /* This should be a die, but warn only for now */
731 die(str, regs, 0);
732#else
733 printk(KERN_DEBUG "%s: %s at ", current->comm, str);
734 printk_address(regs->rip);
735 printk("\n");
736#endif
737 return 0;
738}
739
740/*
741 * Note that we play around with the 'TS' bit in an attempt to get
742 * the correct behaviour even in the presence of the asynchronous
743 * IRQ13 behaviour
744 */
745asmlinkage void do_coprocessor_error(struct pt_regs *regs)
746{
747 void __user *rip = (void __user *)(regs->rip);
748 struct task_struct * task;
749 siginfo_t info;
750 unsigned short cwd, swd;
751
752 conditional_sti(regs);
753 if ((regs->cs & 3) == 0 &&
754 kernel_math_error(regs, "kernel x87 math error"))
755 return;
756
757 /*
758 * Save the info for the exception handler and clear the error.
759 */
760 task = current;
761 save_init_fpu(task);
762 task->thread.trap_no = 16;
763 task->thread.error_code = 0;
764 info.si_signo = SIGFPE;
765 info.si_errno = 0;
766 info.si_code = __SI_FAULT;
767 info.si_addr = rip;
768 /*
769 * (~cwd & swd) will mask out exceptions that are not set to unmasked
770 * status. 0x3f is the exception bits in these regs, 0x200 is the
771 * C1 reg you need in case of a stack fault, 0x040 is the stack
772 * fault bit. We should only be taking one exception at a time,
773 * so if this combination doesn't produce any single exception,
774 * then we have a bad program that isn't synchronizing its FPU usage
775 * and it will suffer the consequences since we won't be able to
776 * fully reproduce the context of the exception
777 */
778 cwd = get_fpu_cwd(task);
779 swd = get_fpu_swd(task);
780 switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) {
781 case 0x000:
782 default:
783 break;
784 case 0x001: /* Invalid Op */
785 case 0x041: /* Stack Fault */
786 case 0x241: /* Stack Fault | Direction */
787 info.si_code = FPE_FLTINV;
788 break;
789 case 0x002: /* Denormalize */
790 case 0x010: /* Underflow */
791 info.si_code = FPE_FLTUND;
792 break;
793 case 0x004: /* Zero Divide */
794 info.si_code = FPE_FLTDIV;
795 break;
796 case 0x008: /* Overflow */
797 info.si_code = FPE_FLTOVF;
798 break;
799 case 0x020: /* Precision */
800 info.si_code = FPE_FLTRES;
801 break;
802 }
803 force_sig_info(SIGFPE, &info, task);
804}
805
806asmlinkage void bad_intr(void)
807{
808 printk("bad interrupt");
809}
810
811asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
812{
813 void __user *rip = (void __user *)(regs->rip);
814 struct task_struct * task;
815 siginfo_t info;
816 unsigned short mxcsr;
817
818 conditional_sti(regs);
819 if ((regs->cs & 3) == 0 &&
820 kernel_math_error(regs, "simd math error"))
821 return;
822
823 /*
824 * Save the info for the exception handler and clear the error.
825 */
826 task = current;
827 save_init_fpu(task);
828 task->thread.trap_no = 19;
829 task->thread.error_code = 0;
830 info.si_signo = SIGFPE;
831 info.si_errno = 0;
832 info.si_code = __SI_FAULT;
833 info.si_addr = rip;
834 /*
835 * The SIMD FPU exceptions are handled a little differently, as there
836 * is only a single status/control register. Thus, to determine which
837 * unmasked exception was caught we must mask the exception mask bits
838 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
839 */
840 mxcsr = get_fpu_mxcsr(task);
841 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
842 case 0x000:
843 default:
844 break;
845 case 0x001: /* Invalid Op */
846 info.si_code = FPE_FLTINV;
847 break;
848 case 0x002: /* Denormalize */
849 case 0x010: /* Underflow */
850 info.si_code = FPE_FLTUND;
851 break;
852 case 0x004: /* Zero Divide */
853 info.si_code = FPE_FLTDIV;
854 break;
855 case 0x008: /* Overflow */
856 info.si_code = FPE_FLTOVF;
857 break;
858 case 0x020: /* Precision */
859 info.si_code = FPE_FLTRES;
860 break;
861 }
862 force_sig_info(SIGFPE, &info, task);
863}
864
865asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
866{
867}
868
869asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
870{
871}
872
873/*
874 * 'math_state_restore()' saves the current math information in the
875 * old math state array, and gets the new ones from the current task
876 *
877 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
878 * Don't touch unless you *really* know how it works.
879 */
880asmlinkage void math_state_restore(void)
881{
882 struct task_struct *me = current;
883 clts(); /* Allow maths ops (or we recurse) */
884
885 if (!used_math())
886 init_fpu(me);
887 restore_fpu_checking(&me->thread.i387.fxsave);
888 me->thread_info->status |= TS_USEDFPU;
889}
890
891void do_call_debug(struct pt_regs *regs)
892{
893 notify_die(DIE_CALL, "debug call", regs, 0, 255, SIGINT);
894}
895
896void __init trap_init(void)
897{
898 set_intr_gate(0,&divide_error);
899 set_intr_gate_ist(1,&debug,DEBUG_STACK);
900 set_intr_gate_ist(2,&nmi,NMI_STACK);
901 set_system_gate(3,&int3);
902 set_system_gate(4,&overflow); /* int4-5 can be called from all */
903 set_system_gate(5,&bounds);
904 set_intr_gate(6,&invalid_op);
905 set_intr_gate(7,&device_not_available);
906 set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK);
907 set_intr_gate(9,&coprocessor_segment_overrun);
908 set_intr_gate(10,&invalid_TSS);
909 set_intr_gate(11,&segment_not_present);
910 set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK);
911 set_intr_gate(13,&general_protection);
912 set_intr_gate(14,&page_fault);
913 set_intr_gate(15,&spurious_interrupt_bug);
914 set_intr_gate(16,&coprocessor_error);
915 set_intr_gate(17,&alignment_check);
916#ifdef CONFIG_X86_MCE
917 set_intr_gate_ist(18,&machine_check, MCE_STACK);
918#endif
919 set_intr_gate(19,&simd_coprocessor_error);
920
921#ifdef CONFIG_IA32_EMULATION
922 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
923#endif
924
925 set_intr_gate(KDB_VECTOR, call_debug);
926
927 /*
928 * Should be a barrier for any external CPU state.
929 */
930 cpu_init();
931}
932
933
934/* Actual parsing is done early in setup.c. */
935static int __init oops_dummy(char *s)
936{
937 panic_on_oops = 1;
938 return -1;
939}
940__setup("oops=", oops_dummy);
941
942static int __init kstack_setup(char *s)
943{
944 kstack_depth_to_print = simple_strtoul(s,NULL,0);
945 return 0;
946}
947__setup("kstack=", kstack_setup);
948
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
new file mode 100644
index 000000000000..59ebd5beda87
--- /dev/null
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -0,0 +1,164 @@
1/* ld script to make x86-64 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 */
4
5#include <asm-generic/vmlinux.lds.h>
6#include <linux/config.h>
7
8OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
9OUTPUT_ARCH(i386:x86-64)
10ENTRY(phys_startup_64)
11jiffies_64 = jiffies;
12SECTIONS
13{
14 . = 0xffffffff80100000;
15 phys_startup_64 = startup_64 - LOAD_OFFSET;
16 _text = .; /* Text and read-only data */
17 .text : {
18 *(.text)
19 SCHED_TEXT
20 LOCK_TEXT
21 *(.fixup)
22 *(.gnu.warning)
23 } = 0x9090
24 .text.lock : { *(.text.lock) } /* out-of-line lock text */
25
26 _etext = .; /* End of text section */
27
28 . = ALIGN(16); /* Exception table */
29 __start___ex_table = .;
30 __ex_table : { *(__ex_table) }
31 __stop___ex_table = .;
32
33 RODATA
34
35 .data : { /* Data */
36 *(.data)
37 CONSTRUCTORS
38 }
39
40 _edata = .; /* End of data section */
41
42 __bss_start = .; /* BSS */
43 .bss : {
44 *(.bss.page_aligned)
45 *(.bss)
46 }
47 __bss_end = .;
48
49 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
50 .data.cacheline_aligned : { *(.data.cacheline_aligned) }
51
52#define AFTER(x) BINALIGN(LOADADDR(x) + SIZEOF(x), 16)
53#define BINALIGN(x,y) (((x) + (y) - 1) & ~((y) - 1))
54#define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES)
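/* For illustration: BINALIGN rounds x up to a multiple of the power-of-two
   y, e.g. BINALIGN(0x1234, 16) = (0x1234 + 15) & ~15 = 0x1240, so AFTER(x)
   places the next section at the 16-byte-aligned address just past
   section x. */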
55
56 .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) }
57 __vsyscall_0 = LOADADDR(.vsyscall_0);
58 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
59 .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) }
60 xtime_lock = LOADADDR(.xtime_lock);
61 .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) }
62 vxtime = LOADADDR(.vxtime);
63 .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) }
64 wall_jiffies = LOADADDR(.wall_jiffies);
65 .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) }
66 sys_tz = LOADADDR(.sys_tz);
67 .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) }
68 sysctl_vsyscall = LOADADDR(.sysctl_vsyscall);
69 .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) }
70 xtime = LOADADDR(.xtime);
71 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
72 .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) }
73 jiffies = LOADADDR(.jiffies);
74 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) }
75 . = LOADADDR(.vsyscall_0) + 4096;
76
77 . = ALIGN(8192); /* init_task */
78 .data.init_task : { *(.data.init_task) }
79
80 . = ALIGN(4096);
81 .data.page_aligned : { *(.data.page_aligned) }
82
83 . = ALIGN(4096); /* Init code and data */
84 __init_begin = .;
85 .init.text : {
86 _sinittext = .;
87 *(.init.text)
88 _einittext = .;
89 }
90 __initdata_begin = .;
91 .init.data : { *(.init.data) }
92 __initdata_end = .;
93 . = ALIGN(16);
94 __setup_start = .;
95 .init.setup : { *(.init.setup) }
96 __setup_end = .;
97 __initcall_start = .;
98 .initcall.init : {
99 *(.initcall1.init)
100 *(.initcall2.init)
101 *(.initcall3.init)
102 *(.initcall4.init)
103 *(.initcall5.init)
104 *(.initcall6.init)
105 *(.initcall7.init)
106 }
107 __initcall_end = .;
108 __con_initcall_start = .;
109 .con_initcall.init : { *(.con_initcall.init) }
110 __con_initcall_end = .;
111 SECURITY_INIT
112 . = ALIGN(8);
113 __alt_instructions = .;
114 .altinstructions : { *(.altinstructions) }
115 __alt_instructions_end = .;
116 .altinstr_replacement : { *(.altinstr_replacement) }
 117  /* .exit.text is discarded at runtime, not link time, to deal with references
118 from .altinstructions and .eh_frame */
119 .exit.text : { *(.exit.text) }
120 .exit.data : { *(.exit.data) }
121 . = ALIGN(4096);
122 __initramfs_start = .;
123 .init.ramfs : { *(.init.ramfs) }
124 __initramfs_end = .;
125 . = ALIGN(32);
126 __per_cpu_start = .;
127 .data.percpu : { *(.data.percpu) }
128 __per_cpu_end = .;
129 . = ALIGN(4096);
130 __init_end = .;
131
132 . = ALIGN(4096);
133 __nosave_begin = .;
134 .data_nosave : { *(.data.nosave) }
135 . = ALIGN(4096);
136 __nosave_end = .;
137
138 _end = . ;
139
140 /* Sections to be discarded */
141 /DISCARD/ : {
142 *(.exitcall.exit)
143#ifndef CONFIG_DEBUG_INFO
144 *(.eh_frame)
145#endif
146 }
147
148 /* DWARF 2 */
149 .debug_info 0 : { *(.debug_info) }
150 .debug_abbrev 0 : { *(.debug_abbrev) }
151 .debug_line 0 : { *(.debug_line) }
152 .debug_frame 0 : { *(.debug_frame) }
153 .debug_str 0 : { *(.debug_str) }
154 .debug_loc 0 : { *(.debug_loc) }
155 .debug_macinfo 0 : { *(.debug_macinfo) }
156 /* SGI/MIPS DWARF 2 extensions */
157 .debug_weaknames 0 : { *(.debug_weaknames) }
158 .debug_funcnames 0 : { *(.debug_funcnames) }
159 .debug_typenames 0 : { *(.debug_typenames) }
160 .debug_varnames 0 : { *(.debug_varnames) }
161
162
163 .comment 0 : { *(.comment) }
164}
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
new file mode 100644
index 000000000000..b4b8dc59663a
--- /dev/null
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -0,0 +1,225 @@
1/*
2 * linux/arch/x86_64/kernel/vsyscall.c
3 *
4 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Copyright 2003 Andi Kleen, SuSE Labs.
6 *
 7 * Thanks to hpa@transmeta.com for some useful hints.
8 * Special thanks to Ingo Molnar for his early experience with
9 * a different vsyscall implementation for Linux/IA32 and for the name.
10 *
11 * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 12 * at virtual address -10Mbyte+1024bytes etc... There are at most 4
13 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
14 * jumping out of line if necessary. We cannot add more with this
15 * mechanism because older kernels won't return -ENOSYS.
16 * If we want more than four we need a vDSO.
17 *
18 * Note: the concept clashes with user mode linux. If you use UML and
19 * want per guest time just set the kernel.vsyscall64 sysctl to 0.
20 */
21
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/kernel.h>
25#include <linux/timer.h>
26#include <linux/seqlock.h>
27#include <linux/jiffies.h>
28#include <linux/sysctl.h>
29
30#include <asm/vsyscall.h>
31#include <asm/pgtable.h>
32#include <asm/page.h>
33#include <asm/fixmap.h>
34#include <asm/errno.h>
35#include <asm/io.h>
36
37#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
38#define force_inline __attribute__((always_inline)) inline
39
40int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
41seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
42
43#include <asm/unistd.h>
44
45static force_inline void timeval_normalize(struct timeval * tv)
46{
47 time_t __sec;
48
49 __sec = tv->tv_usec / 1000000;
50 if (__sec) {
51 tv->tv_usec %= 1000000;
52 tv->tv_sec += __sec;
53 }
54}
55
56static force_inline void do_vgettimeofday(struct timeval * tv)
57{
58 long sequence, t;
59 unsigned long sec, usec;
60
61 do {
62 sequence = read_seqbegin(&__xtime_lock);
63
64 sec = __xtime.tv_sec;
65 usec = (__xtime.tv_nsec / 1000) +
66 (__jiffies - __wall_jiffies) * (1000000 / HZ);
67
68 if (__vxtime.mode == VXTIME_TSC) {
69 sync_core();
70 rdtscll(t);
71 if (t < __vxtime.last_tsc)
72 t = __vxtime.last_tsc;
73 usec += ((t - __vxtime.last_tsc) *
74 __vxtime.tsc_quot) >> 32;
75 /* See comment in x86_64 do_gettimeofday. */
76 } else {
77 usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
78 __vxtime.last) * __vxtime.quot) >> 32;
79 }
80 } while (read_seqretry(&__xtime_lock, sequence));
81
82 tv->tv_sec = sec + usec / 1000000;
83 tv->tv_usec = usec % 1000000;
84}
85
 86/* RED-PEN: may want to re-add seq locking, but then the variable should be write-once. */
87static force_inline void do_get_tz(struct timezone * tz)
88{
89 *tz = __sys_tz;
90}
91
92static force_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
93{
94 int ret;
95 asm volatile("vsysc2: syscall"
96 : "=a" (ret)
97 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
98 return ret;
99}
100
101static force_inline long time_syscall(long *t)
102{
103 long secs;
104 asm volatile("vsysc1: syscall"
105 : "=a" (secs)
106 : "0" (__NR_time),"D" (t) : __syscall_clobber);
107 return secs;
108}
109
110static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
111{
112 if (unlikely(!__sysctl_vsyscall))
113 return gettimeofday(tv,tz);
114 if (tv)
115 do_vgettimeofday(tv);
116 if (tz)
117 do_get_tz(tz);
118 return 0;
119}
120
121/* This will break when the xtime seconds get inaccurate, but that is
122 * unlikely */
123static time_t __vsyscall(1) vtime(time_t *t)
124{
125 if (unlikely(!__sysctl_vsyscall))
126 return time_syscall(t);
127 else if (t)
128 *t = __xtime.tv_sec;
129 return __xtime.tv_sec;
130}
131
132static long __vsyscall(2) venosys_0(void)
133{
134 return -ENOSYS;
135}
136
137static long __vsyscall(3) venosys_1(void)
138{
139 return -ENOSYS;
140}
141
142#ifdef CONFIG_SYSCTL
143
144#define SYSCALL 0x050f
145#define NOP2 0x9090
146
147/*
148 * NOP out syscall in vsyscall page when not needed.
149 */
150static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
151 void __user *buffer, size_t *lenp, loff_t *ppos)
152{
153 extern u16 vsysc1, vsysc2;
154 u16 *map1, *map2;
155 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
156 if (!write)
157 return ret;
158 /* gcc has some trouble with __va(__pa()), so just do it this
159 way. */
160 map1 = ioremap(__pa_symbol(&vsysc1), 2);
161 if (!map1)
162 return -ENOMEM;
163 map2 = ioremap(__pa_symbol(&vsysc2), 2);
164 if (!map2) {
165 ret = -ENOMEM;
166 goto out;
167 }
168 if (!sysctl_vsyscall) {
169 *map1 = SYSCALL;
170 *map2 = SYSCALL;
171 } else {
172 *map1 = NOP2;
173 *map2 = NOP2;
174 }
175 iounmap(map2);
176out:
177 iounmap(map1);
178 return ret;
179}
180
181static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
182 void __user *oldval, size_t __user *oldlenp,
183 void __user *newval, size_t newlen,
184 void **context)
185{
186 return -ENOSYS;
187}
188
189static ctl_table kernel_table2[] = {
190 { .ctl_name = 99, .procname = "vsyscall64",
191 .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
192 .strategy = vsyscall_sysctl_nostrat,
193 .proc_handler = vsyscall_sysctl_change },
194 { 0, }
195};
196
197static ctl_table kernel_root_table2[] = {
198 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
199 .child = kernel_table2 },
200 { 0 },
201};
202
203#endif
204
205static void __init map_vsyscall(void)
206{
207 extern char __vsyscall_0;
208 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
209
210 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
211}
212
213static int __init vsyscall_init(void)
214{
215 BUG_ON(((unsigned long) &vgettimeofday !=
216 VSYSCALL_ADDR(__NR_vgettimeofday)));
217 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
218 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
219 map_vsyscall();
220 sysctl_vsyscall = 1;
221 register_sysctl_table(kernel_root_table2, 0);
222 return 0;
223}
224
225__initcall(vsyscall_init);
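
Aside: userspace reaches these entry points by calling the fixed virtual
addresses directly. A minimal sketch, assuming the historical layout where
slot N lives at 0xffffffffff600000 + N*1024 (libc of this era did this under
the hood for gettimeofday/time; only works where the legacy vsyscall page is
mapped executable):

	#include <stdio.h>
	#include <sys/time.h>
	#include <time.h>

	#define VSYSCALL_BASE	 0xffffffffff600000UL
	#define VSYSCALL_SLOT(n) (VSYSCALL_BASE + (n) * 1024UL)

	typedef int (*vgtod_t)(struct timeval *, struct timezone *);
	typedef time_t (*vtime_t)(time_t *);

	int main(void)
	{
		vgtod_t vgtod = (vgtod_t)VSYSCALL_SLOT(0);	/* vgettimeofday */
		vtime_t vtm   = (vtime_t)VSYSCALL_SLOT(1);	/* vtime */
		struct timeval tv;

		if (vgtod(&tv, NULL) == 0)
			printf("sec=%ld usec=%ld\n",
			       (long)tv.tv_sec, (long)tv.tv_usec);
		printf("time()=%ld\n", (long)vtm(NULL));
		return 0;
	}
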
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
new file mode 100644
index 000000000000..88626e626886
--- /dev/null
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -0,0 +1,221 @@
1#include <linux/config.h>
2#include <linux/module.h>
3#include <linux/smp.h>
4#include <linux/user.h>
5#include <linux/sched.h>
6#include <linux/in6.h>
7#include <linux/interrupt.h>
8#include <linux/smp_lock.h>
9#include <linux/pm.h>
10#include <linux/pci.h>
11#include <linux/apm_bios.h>
12#include <linux/kernel.h>
13#include <linux/string.h>
14#include <linux/syscalls.h>
15#include <linux/tty.h>
16#include <linux/ioctl32.h>
17
18#include <asm/semaphore.h>
19#include <asm/processor.h>
20#include <asm/i387.h>
21#include <asm/uaccess.h>
22#include <asm/checksum.h>
23#include <asm/io.h>
24#include <asm/delay.h>
25#include <asm/irq.h>
26#include <asm/mmx.h>
27#include <asm/desc.h>
28#include <asm/pgtable.h>
29#include <asm/pgalloc.h>
30#include <asm/nmi.h>
31#include <asm/kdebug.h>
32#include <asm/unistd.h>
33#include <asm/tlbflush.h>
34#include <asm/kdebug.h>
35
36extern spinlock_t rtc_lock;
37
38#ifdef CONFIG_SMP
39extern void __write_lock_failed(rwlock_t *rw);
40extern void __read_lock_failed(rwlock_t *rw);
41#endif
42
43#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
44extern struct drive_info_struct drive_info;
45EXPORT_SYMBOL(drive_info);
46#endif
47
48extern unsigned long get_cmos_time(void);
49
50/* platform dependent support */
51EXPORT_SYMBOL(boot_cpu_data);
52//EXPORT_SYMBOL(dump_fpu);
53EXPORT_SYMBOL(__ioremap);
54EXPORT_SYMBOL(ioremap_nocache);
55EXPORT_SYMBOL(iounmap);
56EXPORT_SYMBOL(enable_irq);
57EXPORT_SYMBOL(disable_irq);
58EXPORT_SYMBOL(disable_irq_nosync);
59EXPORT_SYMBOL(probe_irq_mask);
60EXPORT_SYMBOL(kernel_thread);
61EXPORT_SYMBOL(pm_idle);
62EXPORT_SYMBOL(pm_power_off);
63EXPORT_SYMBOL(get_cmos_time);
64
65EXPORT_SYMBOL(__down_failed);
66EXPORT_SYMBOL(__down_failed_interruptible);
67EXPORT_SYMBOL(__down_failed_trylock);
68EXPORT_SYMBOL(__up_wakeup);
69/* Networking helper routines. */
70EXPORT_SYMBOL(csum_partial_copy_nocheck);
71EXPORT_SYMBOL(ip_compute_csum);
72/* Delay loops */
73EXPORT_SYMBOL(__udelay);
74EXPORT_SYMBOL(__ndelay);
75EXPORT_SYMBOL(__delay);
76EXPORT_SYMBOL(__const_udelay);
77
78EXPORT_SYMBOL(__get_user_1);
79EXPORT_SYMBOL(__get_user_2);
80EXPORT_SYMBOL(__get_user_4);
81EXPORT_SYMBOL(__get_user_8);
82EXPORT_SYMBOL(__put_user_1);
83EXPORT_SYMBOL(__put_user_2);
84EXPORT_SYMBOL(__put_user_4);
85EXPORT_SYMBOL(__put_user_8);
86
87EXPORT_SYMBOL(strpbrk);
88EXPORT_SYMBOL(strstr);
89
90EXPORT_SYMBOL(strncpy_from_user);
91EXPORT_SYMBOL(__strncpy_from_user);
92EXPORT_SYMBOL(clear_user);
93EXPORT_SYMBOL(__clear_user);
94EXPORT_SYMBOL(copy_user_generic);
95EXPORT_SYMBOL(copy_from_user);
96EXPORT_SYMBOL(copy_to_user);
97EXPORT_SYMBOL(copy_in_user);
98EXPORT_SYMBOL(strnlen_user);
99
100#ifdef CONFIG_PCI
101EXPORT_SYMBOL(pci_alloc_consistent);
102EXPORT_SYMBOL(pci_free_consistent);
103#endif
104
105#ifdef CONFIG_PCI
106EXPORT_SYMBOL(pci_mem_start);
107#endif
108
109EXPORT_SYMBOL(copy_page);
110EXPORT_SYMBOL(clear_page);
111
112EXPORT_SYMBOL(cpu_pda);
113#ifdef CONFIG_SMP
114EXPORT_SYMBOL(cpu_data);
115EXPORT_SYMBOL(cpu_online_map);
116EXPORT_SYMBOL(__write_lock_failed);
117EXPORT_SYMBOL(__read_lock_failed);
118
119EXPORT_SYMBOL(synchronize_irq);
120EXPORT_SYMBOL(smp_call_function);
121EXPORT_SYMBOL(cpu_callout_map);
122#endif
123
124#ifdef CONFIG_VT
125EXPORT_SYMBOL(screen_info);
126#endif
127
128EXPORT_SYMBOL(get_wchan);
129
130EXPORT_SYMBOL(rtc_lock);
131
132EXPORT_SYMBOL_GPL(set_nmi_callback);
133EXPORT_SYMBOL_GPL(unset_nmi_callback);
134
 135/* Export string functions. We normally rely on gcc builtins for most of these,
136 but gcc sometimes decides not to inline them. */
137#undef memcpy
138#undef memset
139#undef memmove
140#undef memchr
141#undef strlen
142#undef strcpy
143#undef strncmp
144#undef strncpy
145#undef strchr
146#undef strcmp
147#undef strcpy
148#undef strcat
149#undef memcmp
150
151extern void * memset(void *,int,__kernel_size_t);
152extern size_t strlen(const char *);
153extern void * memmove(void * dest,const void *src,size_t count);
154extern char * strcpy(char * dest,const char *src);
155extern int strcmp(const char * cs,const char * ct);
156extern void *memchr(const void *s, int c, size_t n);
157extern void * memcpy(void *,const void *,__kernel_size_t);
158extern void * __memcpy(void *,const void *,__kernel_size_t);
159extern char * strcat(char *, const char *);
160extern int memcmp(const void * cs,const void * ct,size_t count);
161
162EXPORT_SYMBOL(memset);
163EXPORT_SYMBOL(strlen);
164EXPORT_SYMBOL(memmove);
165EXPORT_SYMBOL(strcpy);
166EXPORT_SYMBOL(strncmp);
167EXPORT_SYMBOL(strncpy);
168EXPORT_SYMBOL(strchr);
169EXPORT_SYMBOL(strcmp);
170EXPORT_SYMBOL(strcat);
171EXPORT_SYMBOL(strncat);
172EXPORT_SYMBOL(memchr);
173EXPORT_SYMBOL(strrchr);
174EXPORT_SYMBOL(strnlen);
175EXPORT_SYMBOL(memscan);
176EXPORT_SYMBOL(memcpy);
177EXPORT_SYMBOL(__memcpy);
178EXPORT_SYMBOL(memcmp);
179
180#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
 181/* prototypes are wrong; these are assembly functions with custom calling conventions */
182extern void rwsem_down_read_failed_thunk(void);
183extern void rwsem_wake_thunk(void);
184extern void rwsem_downgrade_thunk(void);
185extern void rwsem_down_write_failed_thunk(void);
186EXPORT_SYMBOL(rwsem_down_read_failed_thunk);
187EXPORT_SYMBOL(rwsem_wake_thunk);
188EXPORT_SYMBOL(rwsem_downgrade_thunk);
189EXPORT_SYMBOL(rwsem_down_write_failed_thunk);
190#endif
191
192EXPORT_SYMBOL(empty_zero_page);
193
194#ifdef CONFIG_HAVE_DEC_LOCK
195EXPORT_SYMBOL(_atomic_dec_and_lock);
196#endif
197
198EXPORT_SYMBOL(die_chain);
199EXPORT_SYMBOL(register_die_notifier);
200
201#ifdef CONFIG_SMP
202EXPORT_SYMBOL(cpu_sibling_map);
203EXPORT_SYMBOL(smp_num_siblings);
204#endif
205
206extern void do_softirq_thunk(void);
207EXPORT_SYMBOL(do_softirq_thunk);
208
209void out_of_line_bug(void);
210EXPORT_SYMBOL(out_of_line_bug);
211
212EXPORT_SYMBOL(init_level4_pgt);
213
214extern unsigned long __supported_pte_mask;
215EXPORT_SYMBOL(__supported_pte_mask);
216
217#ifdef CONFIG_SMP
218EXPORT_SYMBOL(flush_tlb_page);
219#endif
220
221EXPORT_SYMBOL(cpu_khz);
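
Aside: each EXPORT_SYMBOL above is what lets a loadable module link against
the symbol at insmod time. A minimal sketch of a consumer module (the module
itself is hypothetical and assumes a 2.6-era build environment):

	#include <linux/module.h>
	#include <linux/kernel.h>
	#include <linux/init.h>

	/* both exported above */
	extern unsigned long get_cmos_time(void);
	extern unsigned int cpu_khz;

	static int __init demo_init(void)
	{
		printk(KERN_INFO "cmos time: %lu, cpu_khz: %u\n",
		       get_cmos_time(), cpu_khz);
		return 0;
	}

	static void __exit demo_exit(void)
	{
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");
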
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
new file mode 100644
index 000000000000..6b26a1c1e9ff
--- /dev/null
+++ b/arch/x86_64/lib/Makefile
@@ -0,0 +1,14 @@
1#
2# Makefile for x86_64-specific library files.
3#
4
5CFLAGS_csum-partial.o := -funroll-loops
6
7obj-y := io.o
8
9lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
10 usercopy.o getuser.o putuser.o \
11 thunk.o clear_page.o copy_page.o bitstr.o bitops.o
12lib-y += memcpy.o memmove.o memset.o copy_user.o
13
14lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
diff --git a/arch/x86_64/lib/bitops.c b/arch/x86_64/lib/bitops.c
new file mode 100644
index 000000000000..a29fb75b33ac
--- /dev/null
+++ b/arch/x86_64/lib/bitops.c
@@ -0,0 +1,141 @@
1#include <linux/bitops.h>
2
3#undef find_first_zero_bit
4#undef find_next_zero_bit
5#undef find_first_bit
6#undef find_next_bit
7
8/**
9 * find_first_zero_bit - find the first zero bit in a memory region
10 * @addr: The address to start the search at
11 * @size: The maximum size to search
12 *
13 * Returns the bit-number of the first zero bit, not the number of the byte
14 * containing a bit.
15 */
16inline long find_first_zero_bit(const unsigned long * addr, unsigned long size)
17{
18 long d0, d1, d2;
19 long res;
20
21 if (!size)
22 return 0;
23 asm volatile(
24 " repe; scasq\n"
25 " je 1f\n"
26 " xorq -8(%%rdi),%%rax\n"
27 " subq $8,%%rdi\n"
28 " bsfq %%rax,%%rdx\n"
29 "1: subq %[addr],%%rdi\n"
30 " shlq $3,%%rdi\n"
31 " addq %%rdi,%%rdx"
32 :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
33 :"0" (0ULL), "1" ((size + 63) >> 6), "2" (addr), "3" (-1ULL),
34 [addr] "r" (addr) : "memory");
35 return res;
36}
37
38/**
 39 * find_next_zero_bit - find the next zero bit in a memory region
 40 * @addr: The address to base the search on
 41 * @size: The maximum size to search
 42 * @offset: The bit number to start searching at
43 */
44long find_next_zero_bit (const unsigned long * addr, long size, long offset)
45{
46 unsigned long * p = ((unsigned long *) addr) + (offset >> 6);
47 unsigned long set = 0;
48 unsigned long res, bit = offset&63;
49
50 if (bit) {
51 /*
52 * Look for zero in first word
53 */
54 asm("bsfq %1,%0\n\t"
55 "cmoveq %2,%0"
56 : "=r" (set)
57 : "r" (~(*p >> bit)), "r"(64L));
58 if (set < (64 - bit))
59 return set + offset;
60 set = 64 - bit;
61 p++;
62 }
63 /*
64 * No zero yet, search remaining full words for a zero
65 */
66 res = find_first_zero_bit ((const unsigned long *)p,
67 size - 64 * (p - (unsigned long *) addr));
68 return (offset + set + res);
69}
70
71static inline long
72__find_first_bit(const unsigned long * addr, unsigned long size)
73{
74 long d0, d1;
75 long res;
76
77 asm volatile(
78 " repe; scasq\n"
79 " jz 1f\n"
80 " subq $8,%%rdi\n"
81 " bsfq (%%rdi),%%rax\n"
82 "1: subq %[addr],%%rdi\n"
83 " shlq $3,%%rdi\n"
84 " addq %%rdi,%%rax"
85 :"=a" (res), "=&c" (d0), "=&D" (d1)
86 :"0" (0ULL),
87 "1" ((size + 63) >> 6), "2" (addr),
88 [addr] "r" (addr) : "memory");
89 return res;
90}
91
92/**
93 * find_first_bit - find the first set bit in a memory region
94 * @addr: The address to start the search at
95 * @size: The maximum size to search
96 *
97 * Returns the bit-number of the first set bit, not the number of the byte
98 * containing a bit.
99 */
100long find_first_bit(const unsigned long * addr, unsigned long size)
101{
102 return __find_first_bit(addr,size);
103}
104
105/**
 106 * find_next_bit - find the next set bit in a memory region
 107 * @addr: The address to base the search on
 108 * @size: The maximum size to search
 109 * @offset: The bit number to start searching at
110 */
111long find_next_bit(const unsigned long * addr, long size, long offset)
112{
113 const unsigned long * p = addr + (offset >> 6);
114 unsigned long set = 0, bit = offset & 63, res;
115
116 if (bit) {
117 /*
118 * Look for nonzero in the first 64 bits:
119 */
120 asm("bsfq %1,%0\n\t"
121 "cmoveq %2,%0\n\t"
122 : "=r" (set)
123 : "r" (*p >> bit), "r" (64L));
124 if (set < (64 - bit))
125 return set + offset;
126 set = 64 - bit;
127 p++;
128 }
129 /*
130 * No set bit yet, search remaining full words for a bit
131 */
132 res = __find_first_bit (p, size - 64 * (p - addr));
133 return (offset + set + res);
134}
135
136#include <linux/module.h>
137
138EXPORT_SYMBOL(find_next_bit);
139EXPORT_SYMBOL(find_first_bit);
140EXPORT_SYMBOL(find_first_zero_bit);
141EXPORT_SYMBOL(find_next_zero_bit);
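
Aside: the asm above is a word-at-a-time scan, with bsfq picking the bit
inside the first interesting word. A portable C reference for the same
contract (an illustration, not the kernel's implementation; like the asm,
it can return a value >= size when no matching bit exists):

	#include <limits.h>

	#define WORD_BITS (sizeof(unsigned long) * CHAR_BIT)

	static long ref_find_first_zero_bit(const unsigned long *addr,
	                                    unsigned long size)
	{
		unsigned long i, bit;

		for (i = 0; i * WORD_BITS < size; i++) {
			unsigned long w = addr[i];
			if (~w == 0)
				continue;	/* all ones, keep scanning */
			for (bit = 0; bit < WORD_BITS; bit++)
				if (!(w & (1UL << bit)))
					return i * WORD_BITS + bit;
		}
		return size;			/* no zero bit found */
	}
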
diff --git a/arch/x86_64/lib/bitstr.c b/arch/x86_64/lib/bitstr.c
new file mode 100644
index 000000000000..24676609a6ac
--- /dev/null
+++ b/arch/x86_64/lib/bitstr.c
@@ -0,0 +1,28 @@
1#include <linux/module.h>
2#include <linux/bitops.h>
3
4/* Find string of zero bits in a bitmap */
5unsigned long
6find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
7{
8 unsigned long n, end, i;
9
10 again:
11 n = find_next_zero_bit(bitmap, nbits, start);
12 if (n == -1)
13 return -1;
14
15 /* could test bitsliced, but it's hardly worth it */
16 end = n+len;
17 if (end >= nbits)
18 return -1;
19 for (i = n+1; i < end; i++) {
20 if (test_bit(i, bitmap)) {
21 start = i+1;
22 goto again;
23 }
24 }
25 return n;
26}
27
28EXPORT_SYMBOL(find_next_zero_string);
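
Aside: the typical caller wants a run of free slots in an allocation bitmap,
as in the GART IOMMU allocator. A sketch of that usage pattern (the helper
and its names are hypothetical):

	#include <linux/bitops.h>

	/* claim `pages` contiguous free slots; returns start index or -1 */
	static long claim_range(unsigned long *bitmap, long nbits, int pages)
	{
		long i, n = find_next_zero_string(bitmap, 0, nbits, pages);

		if (n != -1)
			for (i = n; i < n + pages; i++)
				__set_bit(i, bitmap);
		return n;
	}
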
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
new file mode 100644
index 000000000000..30a9da458c15
--- /dev/null
+++ b/arch/x86_64/lib/clear_page.S
@@ -0,0 +1,50 @@
1/*
2 * Zero a page.
3 * rdi page
4 */
5 .globl clear_page
6 .p2align 4
7clear_page:
8 xorl %eax,%eax
9 movl $4096/64,%ecx
10 .p2align 4
11.Lloop:
12 decl %ecx
13#define PUT(x) movq %rax,x*8(%rdi)
14 movq %rax,(%rdi)
15 PUT(1)
16 PUT(2)
17 PUT(3)
18 PUT(4)
19 PUT(5)
20 PUT(6)
21 PUT(7)
22 leaq 64(%rdi),%rdi
23 jnz .Lloop
24 nop
25 ret
26clear_page_end:
27
 28 	/* C-stepping K8 CPUs run faster using the string instructions.
29 It is also a lot simpler. Use this when possible */
30
31#include <asm/cpufeature.h>
32
33 .section .altinstructions,"a"
34 .align 8
35 .quad clear_page
36 .quad clear_page_c
37 .byte X86_FEATURE_K8_C
38 .byte clear_page_end-clear_page
39 .byte clear_page_c_end-clear_page_c
40 .previous
41
42 .section .altinstr_replacement,"ax"
43clear_page_c:
44 movl $4096/8,%ecx
45 xorl %eax,%eax
46 rep
47 stosq
48 ret
49clear_page_c_end:
50 .previous
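
Aside: in C terms the two strategies patched between here look roughly like
this (a sketch only; the kernel uses the asm above):

	#include <stdint.h>
	#include <string.h>

	static void clear_page_unrolled(void *page)	/* .Lloop */
	{
		uint64_t *p = page;
		long i;

		/* eight 8-byte stores (64 bytes) per iteration */
		for (i = 0; i < 4096 / 64; i++, p += 8) {
			p[0] = 0; p[1] = 0; p[2] = 0; p[3] = 0;
			p[4] = 0; p[5] = 0; p[6] = 0; p[7] = 0;
		}
	}

	static void clear_page_string(void *page)	/* clear_page_c */
	{
		memset(page, 0, 4096);	/* rep stosq in the asm */
	}
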
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
new file mode 100644
index 000000000000..dd3aa47b6bf5
--- /dev/null
+++ b/arch/x86_64/lib/copy_page.S
@@ -0,0 +1,101 @@
1/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
2
 3/* Don't use streaming stores; plain stores are better when the target
4 ends up in cache. */
5
6/* Could vary the prefetch distance based on SMP/UP */
7
8 .globl copy_page
9 .p2align 4
10copy_page:
11 subq $3*8,%rsp
12 movq %rbx,(%rsp)
13 movq %r12,1*8(%rsp)
14 movq %r13,2*8(%rsp)
15
16 movl $(4096/64)-5,%ecx
17 .p2align 4
18.Loop64:
19 dec %rcx
20
21 movq (%rsi), %rax
22 movq 8 (%rsi), %rbx
23 movq 16 (%rsi), %rdx
24 movq 24 (%rsi), %r8
25 movq 32 (%rsi), %r9
26 movq 40 (%rsi), %r10
27 movq 48 (%rsi), %r11
28 movq 56 (%rsi), %r12
29
30 prefetcht0 5*64(%rsi)
31
32 movq %rax, (%rdi)
33 movq %rbx, 8 (%rdi)
34 movq %rdx, 16 (%rdi)
35 movq %r8, 24 (%rdi)
36 movq %r9, 32 (%rdi)
37 movq %r10, 40 (%rdi)
38 movq %r11, 48 (%rdi)
39 movq %r12, 56 (%rdi)
40
41 leaq 64 (%rsi), %rsi
42 leaq 64 (%rdi), %rdi
43
44 jnz .Loop64
45
46 movl $5,%ecx
47 .p2align 4
48.Loop2:
49 decl %ecx
50
51 movq (%rsi), %rax
52 movq 8 (%rsi), %rbx
53 movq 16 (%rsi), %rdx
54 movq 24 (%rsi), %r8
55 movq 32 (%rsi), %r9
56 movq 40 (%rsi), %r10
57 movq 48 (%rsi), %r11
58 movq 56 (%rsi), %r12
59
60 movq %rax, (%rdi)
61 movq %rbx, 8 (%rdi)
62 movq %rdx, 16 (%rdi)
63 movq %r8, 24 (%rdi)
64 movq %r9, 32 (%rdi)
65 movq %r10, 40 (%rdi)
66 movq %r11, 48 (%rdi)
67 movq %r12, 56 (%rdi)
68
69 leaq 64(%rdi),%rdi
70 leaq 64(%rsi),%rsi
71
72 jnz .Loop2
73
74 movq (%rsp),%rbx
75 movq 1*8(%rsp),%r12
76 movq 2*8(%rsp),%r13
77 addq $3*8,%rsp
78 ret
79
 80 	/* C-stepping K8 CPUs run faster using the string copy instructions.
81 It is also a lot simpler. Use this when possible */
82
83#include <asm/cpufeature.h>
84
85 .section .altinstructions,"a"
86 .align 8
87 .quad copy_page
88 .quad copy_page_c
89 .byte X86_FEATURE_K8_C
90 .byte copy_page_c_end-copy_page_c
91 .byte copy_page_c_end-copy_page_c
92 .previous
93
94 .section .altinstr_replacement,"ax"
95copy_page_c:
96 movl $4096/8,%ecx
97 rep
98 movsq
99 ret
100copy_page_c_end:
101 .previous
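
Aside: the loop structure above is "copy 64 bytes, prefetch 5 blocks (320
bytes) ahead", with the final 5 blocks copied without prefetch so the
prefetches never run past the end of the source page. In C the pattern
looks roughly like this (sketch, using the gcc builtin for prefetcht0):

	#include <stdint.h>

	static void copy_page_ref(void *dst, const void *src)
	{
		const uint64_t *s = src;
		uint64_t *d = dst;
		long i;

		for (i = 0; i < 4096 / 64; i++, s += 8, d += 8) {
			if (i < 4096 / 64 - 5)	/* .Loop64 vs .Loop2 */
				__builtin_prefetch(s + 5 * 8, 0, 3);
			d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
			d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
		}
	}
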
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
new file mode 100644
index 000000000000..bd556c804248
--- /dev/null
+++ b/arch/x86_64/lib/copy_user.S
@@ -0,0 +1,294 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v2.
3 *
4 * Functions to copy from and to user space.
5 */
6
7#define FIX_ALIGNMENT 1
8
9 #include <asm/current.h>
10 #include <asm/offset.h>
11 #include <asm/thread_info.h>
12 #include <asm/cpufeature.h>
13
14/* Standard copy_to_user with segment limit checking */
15 .globl copy_to_user
16 .p2align 4
17copy_to_user:
18 GET_THREAD_INFO(%rax)
19 movq %rdi,%rcx
20 addq %rdx,%rcx
21 jc bad_to_user
22 cmpq threadinfo_addr_limit(%rax),%rcx
23 jae bad_to_user
242:
25 .byte 0xe9 /* 32bit jump */
26 .long .Lcug-1f
271:
28
29 .section .altinstr_replacement,"ax"
 303:	.byte 0xe9			/* replacement jmp with 32bit immediate */
31 .long copy_user_generic_c-1b /* offset */
32 .previous
33 .section .altinstructions,"a"
34 .align 8
35 .quad 2b
36 .quad 3b
37 .byte X86_FEATURE_K8_C
38 .byte 5
39 .byte 5
40 .previous
41
42/* Standard copy_from_user with segment limit checking */
43 .globl copy_from_user
44 .p2align 4
45copy_from_user:
46 GET_THREAD_INFO(%rax)
47 movq %rsi,%rcx
48 addq %rdx,%rcx
49 jc bad_from_user
50 cmpq threadinfo_addr_limit(%rax),%rcx
51 jae bad_from_user
52 /* FALL THROUGH to copy_user_generic */
53
54 .section .fixup,"ax"
55 /* must zero dest */
56bad_from_user:
57 movl %edx,%ecx
58 xorl %eax,%eax
59 rep
60 stosb
61bad_to_user:
62 movl %edx,%eax
63 ret
64 .previous
65
66
67/*
68 * copy_user_generic - memory copy with exception handling.
69 *
70 * Input:
71 * rdi destination
72 * rsi source
73 * rdx count
74 *
75 * Output:
76 * eax uncopied bytes or 0 if successful.
77 */
78 .globl copy_user_generic
79 .p2align 4
80copy_user_generic:
81 .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */
82 .byte 0x66,0x90
831:
84 .section .altinstr_replacement,"ax"
852: .byte 0xe9 /* near jump with 32bit immediate */
86 .long copy_user_generic_c-1b /* offset */
87 .previous
88 .section .altinstructions,"a"
89 .align 8
90 .quad copy_user_generic
91 .quad 2b
92 .byte X86_FEATURE_K8_C
93 .byte 5
94 .byte 5
95 .previous
96.Lcug:
97 pushq %rbx
 98	xorl %eax,%eax		/* zero for the exception handler */
99
100#ifdef FIX_ALIGNMENT
101 /* check for bad alignment of destination */
102 movl %edi,%ecx
103 andl $7,%ecx
104 jnz .Lbad_alignment
105.Lafter_bad_alignment:
106#endif
107
108 movq %rdx,%rcx
109
110 movl $64,%ebx
111 shrq $6,%rdx
112 decq %rdx
113 js .Lhandle_tail
114
115 .p2align 4
116.Lloop:
117.Ls1: movq (%rsi),%r11
118.Ls2: movq 1*8(%rsi),%r8
119.Ls3: movq 2*8(%rsi),%r9
120.Ls4: movq 3*8(%rsi),%r10
121.Ld1: movq %r11,(%rdi)
122.Ld2: movq %r8,1*8(%rdi)
123.Ld3: movq %r9,2*8(%rdi)
124.Ld4: movq %r10,3*8(%rdi)
125
126.Ls5: movq 4*8(%rsi),%r11
127.Ls6: movq 5*8(%rsi),%r8
128.Ls7: movq 6*8(%rsi),%r9
129.Ls8: movq 7*8(%rsi),%r10
130.Ld5: movq %r11,4*8(%rdi)
131.Ld6: movq %r8,5*8(%rdi)
132.Ld7: movq %r9,6*8(%rdi)
133.Ld8: movq %r10,7*8(%rdi)
134
135 decq %rdx
136
137 leaq 64(%rsi),%rsi
138 leaq 64(%rdi),%rdi
139
140 jns .Lloop
141
142 .p2align 4
143.Lhandle_tail:
144 movl %ecx,%edx
145 andl $63,%ecx
146 shrl $3,%ecx
147 jz .Lhandle_7
148 movl $8,%ebx
149 .p2align 4
150.Lloop_8:
151.Ls9: movq (%rsi),%r8
152.Ld9: movq %r8,(%rdi)
153 decl %ecx
154 leaq 8(%rdi),%rdi
155 leaq 8(%rsi),%rsi
156 jnz .Lloop_8
157
158.Lhandle_7:
159 movl %edx,%ecx
160 andl $7,%ecx
161 jz .Lende
162 .p2align 4
163.Lloop_1:
164.Ls10: movb (%rsi),%bl
165.Ld10: movb %bl,(%rdi)
166 incq %rdi
167 incq %rsi
168 decl %ecx
169 jnz .Lloop_1
170
171.Lende:
172 popq %rbx
173 ret
174
175#ifdef FIX_ALIGNMENT
176 /* align destination */
177 .p2align 4
178.Lbad_alignment:
179 movl $8,%r9d
180 subl %ecx,%r9d
181 movl %r9d,%ecx
182 cmpq %r9,%rdx
183 jz .Lhandle_7
184 js .Lhandle_7
185.Lalign_1:
186.Ls11: movb (%rsi),%bl
187.Ld11: movb %bl,(%rdi)
188 incq %rsi
189 incq %rdi
190 decl %ecx
191 jnz .Lalign_1
192 subq %r9,%rdx
193 jmp .Lafter_bad_alignment
194#endif
195
196 /* table sorted by exception address */
197 .section __ex_table,"a"
198 .align 8
199 .quad .Ls1,.Ls1e
200 .quad .Ls2,.Ls2e
201 .quad .Ls3,.Ls3e
202 .quad .Ls4,.Ls4e
203 .quad .Ld1,.Ls1e
204 .quad .Ld2,.Ls2e
205 .quad .Ld3,.Ls3e
206 .quad .Ld4,.Ls4e
207 .quad .Ls5,.Ls5e
208 .quad .Ls6,.Ls6e
209 .quad .Ls7,.Ls7e
210 .quad .Ls8,.Ls8e
211 .quad .Ld5,.Ls5e
212 .quad .Ld6,.Ls6e
213 .quad .Ld7,.Ls7e
214 .quad .Ld8,.Ls8e
215 .quad .Ls9,.Le_quad
216 .quad .Ld9,.Le_quad
217 .quad .Ls10,.Le_byte
218 .quad .Ld10,.Le_byte
219#ifdef FIX_ALIGNMENT
220 .quad .Ls11,.Lzero_rest
221 .quad .Ld11,.Lzero_rest
222#endif
223 .quad .Le5,.Le_zero
224 .previous
225
 226	/* compute the offset for the 64-byte main loop. Accurate to 8 bytes,
 227	   with the error on the pessimistic side. This is gross; it would be
 228	   better to fix the interface. */
229 /* eax: zero, ebx: 64 */
230.Ls1e: addl $8,%eax
231.Ls2e: addl $8,%eax
232.Ls3e: addl $8,%eax
233.Ls4e: addl $8,%eax
234.Ls5e: addl $8,%eax
235.Ls6e: addl $8,%eax
236.Ls7e: addl $8,%eax
237.Ls8e: addl $8,%eax
238 addq %rbx,%rdi /* +64 */
239 subq %rax,%rdi /* correct destination with computed offset */
240
241 shlq $6,%rdx /* loop counter * 64 (stride length) */
242 addq %rax,%rdx /* add offset to loopcnt */
243 andl $63,%ecx /* remaining bytes */
244 addq %rcx,%rdx /* add them */
245 jmp .Lzero_rest
246
247 /* exception on quad word loop in tail handling */
248 /* ecx: loopcnt/8, %edx: length, rdi: correct */
249.Le_quad:
250 shll $3,%ecx
251 andl $7,%edx
252 addl %ecx,%edx
253 /* edx: bytes to zero, rdi: dest, eax:zero */
254.Lzero_rest:
255 movq %rdx,%rcx
256.Le_byte:
257 xorl %eax,%eax
258.Le5: rep
259 stosb
260 /* when there is another exception while zeroing the rest just return */
261.Le_zero:
262 movq %rdx,%rax
263 jmp .Lende
264
 265	/* C-stepping K8 CPUs run faster using the string copy instructions.
266 This is also a lot simpler. Use them when possible.
267 Patch in jmps to this code instead of copying it fully
268 to avoid unwanted aliasing in the exception tables. */
269
270 /* rdi destination
271 * rsi source
272 * rdx count
273 *
274 * Output:
 275	 * eax uncopied bytes or 0 if successful.
276 */
277copy_user_generic_c:
278 movl %edx,%ecx
279 shrl $3,%ecx
280 andl $7,%edx
2811: rep
282 movsq
283 movl %edx,%ecx
2842: rep
285 movsb
2864: movl %ecx,%eax
287 ret
2883: lea (%rdx,%rcx,8),%rax
289 ret
290
291 .section __ex_table,"a"
292 .quad 1b,3b
293 .quad 2b,4b
294 .previous
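
Aside: the contract implemented by the exception tables above is "return the
number of bytes left uncopied, zeroing the destination tail on a read
fault". A typical caller looks roughly like this (sketch; the wrapper
function is hypothetical):

	#include <asm/uaccess.h>

	static long fetch_from_user(void *kbuf, const void __user *ubuf,
	                            unsigned long len)
	{
		unsigned long left = copy_from_user(kbuf, ubuf, len);

		/* any shortfall is usually treated as a hard error */
		return left ? -EFAULT : 0;
	}
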
diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S
new file mode 100644
index 000000000000..01808ec37836
--- /dev/null
+++ b/arch/x86_64/lib/csum-copy.S
@@ -0,0 +1,233 @@
1/*
2 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file COPYING in the main directory of this archive
6 * for more details. No warranty for anything given at all.
7 */
8 #include <linux/linkage.h>
9 #include <asm/errno.h>
10
11/*
12 * Checksum copy with exception handling.
13 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
14 * destination is zeroed.
15 *
16 * Input
17 * rdi source
18 * rsi destination
19 * edx len (32bit)
20 * ecx sum (32bit)
21 * r8 src_err_ptr (int)
22 * r9 dst_err_ptr (int)
23 *
24 * Output
 25 * eax  32bit unfolded sum; undefined in case of an exception.
26 *
27 * Wrappers need to take care of valid exception sum and zeroing.
28 * They also should align source or destination to 8 bytes.
29 */
30
31 .macro source
3210:
33 .section __ex_table,"a"
34 .align 8
35 .quad 10b,.Lbad_source
36 .previous
37 .endm
38
39 .macro dest
4020:
41 .section __ex_table,"a"
42 .align 8
43 .quad 20b,.Lbad_dest
44 .previous
45 .endm
46
47 .macro ignore L=.Lignore
4830:
49 .section __ex_table,"a"
50 .align 8
51 .quad 30b,\L
52 .previous
53 .endm
54
55
56 .globl csum_partial_copy_generic
57 .p2align 4
58csum_partial_copy_generic:
59 cmpl $3*64,%edx
60 jle .Lignore
61
62.Lignore:
63 subq $7*8,%rsp
64 movq %rbx,2*8(%rsp)
65 movq %r12,3*8(%rsp)
66 movq %r14,4*8(%rsp)
67 movq %r13,5*8(%rsp)
68 movq %rbp,6*8(%rsp)
69
70 movq %r8,(%rsp)
71 movq %r9,1*8(%rsp)
72
73 movl %ecx,%eax
74 movl %edx,%ecx
75
76 xorl %r9d,%r9d
77 movq %rcx,%r12
78
79 shrq $6,%r12
80 jz .Lhandle_tail /* < 64 */
81
82 clc
83
84 /* main loop. clear in 64 byte blocks */
85 /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
86 /* r11: temp3, rdx: temp4, r12 loopcnt */
87 /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
88 .p2align 4
89.Lloop:
90 source
91 movq (%rdi),%rbx
92 source
93 movq 8(%rdi),%r8
94 source
95 movq 16(%rdi),%r11
96 source
97 movq 24(%rdi),%rdx
98
99 source
100 movq 32(%rdi),%r10
101 source
102 movq 40(%rdi),%rbp
103 source
104 movq 48(%rdi),%r14
105 source
106 movq 56(%rdi),%r13
107
108 ignore 2f
109 prefetcht0 5*64(%rdi)
1102:
111 adcq %rbx,%rax
112 adcq %r8,%rax
113 adcq %r11,%rax
114 adcq %rdx,%rax
115 adcq %r10,%rax
116 adcq %rbp,%rax
117 adcq %r14,%rax
118 adcq %r13,%rax
119
120 decl %r12d
121
122 dest
123 movq %rbx,(%rsi)
124 dest
125 movq %r8,8(%rsi)
126 dest
127 movq %r11,16(%rsi)
128 dest
129 movq %rdx,24(%rsi)
130
131 dest
132 movq %r10,32(%rsi)
133 dest
134 movq %rbp,40(%rsi)
135 dest
136 movq %r14,48(%rsi)
137 dest
138 movq %r13,56(%rsi)
139
1403:
141
142 leaq 64(%rdi),%rdi
143 leaq 64(%rsi),%rsi
144
145 jnz .Lloop
146
147 adcq %r9,%rax
148
 149	/* do last up to 56 bytes */
150.Lhandle_tail:
151 /* ecx: count */
152 movl %ecx,%r10d
153 andl $63,%ecx
154 shrl $3,%ecx
155 jz .Lfold
156 clc
157 .p2align 4
158.Lloop_8:
159 source
160 movq (%rdi),%rbx
161 adcq %rbx,%rax
162 decl %ecx
163 dest
164 movq %rbx,(%rsi)
165 leaq 8(%rsi),%rsi /* preserve carry */
166 leaq 8(%rdi),%rdi
167 jnz .Lloop_8
168 adcq %r9,%rax /* add in carry */
169
170.Lfold:
171 /* reduce checksum to 32bits */
172 movl %eax,%ebx
173 shrq $32,%rax
174 addl %ebx,%eax
175 adcl %r9d,%eax
176
 177	/* do last up to 6 bytes */
178.Lhandle_7:
179 movl %r10d,%ecx
180 andl $7,%ecx
181 shrl $1,%ecx
182 jz .Lhandle_1
183 movl $2,%edx
184 xorl %ebx,%ebx
185 clc
186 .p2align 4
187.Lloop_1:
188 source
189 movw (%rdi),%bx
190 adcl %ebx,%eax
191 dest
192 decl %ecx
193 movw %bx,(%rsi)
194 leaq 2(%rdi),%rdi
195 leaq 2(%rsi),%rsi
196 jnz .Lloop_1
197 adcl %r9d,%eax /* add in carry */
198
199 /* handle last odd byte */
200.Lhandle_1:
201 testl $1,%r10d
202 jz .Lende
203 xorl %ebx,%ebx
204 source
205 movb (%rdi),%bl
206 dest
207 movb %bl,(%rsi)
208 addl %ebx,%eax
209 adcl %r9d,%eax /* carry */
210
211.Lende:
212 movq 2*8(%rsp),%rbx
213 movq 3*8(%rsp),%r12
214 movq 4*8(%rsp),%r14
215 movq 5*8(%rsp),%r13
216 movq 6*8(%rsp),%rbp
217 addq $7*8,%rsp
218 ret
219
220 /* Exception handlers. Very simple, zeroing is done in the wrappers */
221.Lbad_source:
222 movq (%rsp),%rax
223 testq %rax,%rax
224 jz .Lende
225 movl $-EFAULT,(%rax)
226 jmp .Lende
227
228.Lbad_dest:
229 movq 8(%rsp),%rax
230 testq %rax,%rax
231 jz .Lende
232 movl $-EFAULT,(%rax)
233 jmp .Lende
diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c
new file mode 100644
index 000000000000..5384e227cdf6
--- /dev/null
+++ b/arch/x86_64/lib/csum-partial.c
@@ -0,0 +1,150 @@
1/*
2 * arch/x86_64/lib/csum-partial.c
3 *
4 * This file contains network checksum routines that are better done
5 * in an architecture-specific manner due to speed.
6 */
7
8#include <linux/compiler.h>
9#include <linux/module.h>
10#include <asm/checksum.h>
11
12#define __force_inline inline __attribute__((always_inline))
13
14static inline unsigned short from32to16(unsigned a)
15{
16 unsigned short b = a >> 16;
17 asm("addw %w2,%w0\n\t"
18 "adcw $0,%w0\n"
19 : "=r" (b)
20 : "0" (b), "r" (a));
21 return b;
22}
23
24/*
25 * Do a 64-bit checksum on an arbitrary memory area.
26 * Returns a 32bit checksum.
27 *
28 * This isn't as time critical as it used to be because many NICs
29 * do hardware checksumming these days.
30 *
31 * Things tried and found to not make it faster:
32 * Manual Prefetching
 33 * Unrolling to a 128 byte inner loop.
34 * Using interleaving with more registers to break the carry chains.
35 */
36static __force_inline unsigned do_csum(const unsigned char *buff, unsigned len)
37{
38 unsigned odd, count;
39 unsigned long result = 0;
40
41 if (unlikely(len == 0))
42 return result;
43 odd = 1 & (unsigned long) buff;
44 if (unlikely(odd)) {
45 result = *buff << 8;
46 len--;
47 buff++;
48 }
49 count = len >> 1; /* nr of 16-bit words.. */
50 if (count) {
51 if (2 & (unsigned long) buff) {
52 result += *(unsigned short *)buff;
53 count--;
54 len -= 2;
55 buff += 2;
56 }
57 count >>= 1; /* nr of 32-bit words.. */
58 if (count) {
59 unsigned long zero;
60 unsigned count64;
61 if (4 & (unsigned long) buff) {
62 result += *(unsigned int *) buff;
63 count--;
64 len -= 4;
65 buff += 4;
66 }
67 count >>= 1; /* nr of 64-bit words.. */
68
69 /* main loop using 64byte blocks */
70 zero = 0;
71 count64 = count >> 3;
72 while (count64) {
73 asm("addq 0*8(%[src]),%[res]\n\t"
74 "adcq 1*8(%[src]),%[res]\n\t"
75 "adcq 2*8(%[src]),%[res]\n\t"
76 "adcq 3*8(%[src]),%[res]\n\t"
77 "adcq 4*8(%[src]),%[res]\n\t"
78 "adcq 5*8(%[src]),%[res]\n\t"
79 "adcq 6*8(%[src]),%[res]\n\t"
80 "adcq 7*8(%[src]),%[res]\n\t"
81 "adcq %[zero],%[res]"
82 : [res] "=r" (result)
83 : [src] "r" (buff), [zero] "r" (zero),
84 "[res]" (result));
85 buff += 64;
86 count64--;
87 }
88
 89			/* last up to 7 8-byte blocks */
90 count %= 8;
91 while (count) {
92 asm("addq %1,%0\n\t"
93 "adcq %2,%0\n"
94 : "=r" (result)
95 : "m" (*(unsigned long *)buff),
96 "r" (zero), "0" (result));
97 --count;
98 buff += 8;
99 }
100 result = add32_with_carry(result>>32,
101 result&0xffffffff);
102
103 if (len & 4) {
104 result += *(unsigned int *) buff;
105 buff += 4;
106 }
107 }
108 if (len & 2) {
109 result += *(unsigned short *) buff;
110 buff += 2;
111 }
112 }
113 if (len & 1)
114 result += *buff;
115 result = add32_with_carry(result>>32, result & 0xffffffff);
116 if (unlikely(odd)) {
117 result = from32to16(result);
118 result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
119 }
120 return result;
121}
122
123/*
124 * computes the checksum of a memory block at buff, length len,
125 * and adds in "sum" (32-bit)
126 *
127 * returns a 32-bit number suitable for feeding into itself
128 * or csum_tcpudp_magic
129 *
130 * this function must be called with even lengths, except
131 * for the last fragment, which may be odd
132 *
133 * it's best to have buff aligned on a 64-bit boundary
134 */
135unsigned csum_partial(const unsigned char *buff, unsigned len, unsigned sum)
136{
137 return add32_with_carry(do_csum(buff, len), sum);
138}
139
140EXPORT_SYMBOL(csum_partial);
141
142/*
143 * this routine is used for miscellaneous IP-like checksums, mainly
144 * in icmp.c
145 */
146unsigned short ip_compute_csum(unsigned char * buff, int len)
147{
148 return csum_fold(csum_partial(buff,len,0));
149}
150
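
Aside: the final folding steps (add32_with_carry here, plus from32to16 and
csum_fold in the headers) reduce the 64-bit ones-complement accumulator to
the 16-bit Internet checksum. A standalone sketch of that reduction
(reference only, not the kernel's code):

	#include <stdint.h>
	#include <stdio.h>

	static uint32_t add32_carry(uint32_t a, uint32_t b)
	{
		uint64_t s = (uint64_t)a + b;
		return (uint32_t)s + (uint32_t)(s >> 32);	/* wrap carry */
	}

	static uint16_t fold16(uint64_t sum64)
	{
		uint32_t s = add32_carry(sum64 >> 32, sum64 & 0xffffffff);

		s = (s & 0xffff) + (s >> 16);	/* 32 -> at most 17 bits */
		s = (s & 0xffff) + (s >> 16);	/* absorb the last carry */
		return (uint16_t)~s;		/* csum_fold complements */
	}

	int main(void)
	{
		printf("%04x\n", fold16(0x1234567890abcdefULL));
		return 0;
	}
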
diff --git a/arch/x86_64/lib/csum-wrappers.c b/arch/x86_64/lib/csum-wrappers.c
new file mode 100644
index 000000000000..94323f20816e
--- /dev/null
+++ b/arch/x86_64/lib/csum-wrappers.c
@@ -0,0 +1,129 @@
1/* Copyright 2002,2003 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v.2
3 *
4 * Wrappers of assembly checksum functions for x86-64.
5 */
6
7#include <asm/checksum.h>
8#include <linux/module.h>
9
10/**
11 * csum_partial_copy_from_user - Copy and checksum from user space.
12 * @src: source address (user space)
13 * @dst: destination address
14 * @len: number of bytes to be copied.
15 * @isum: initial sum that is added into the result (32bit unfolded)
 16 * @errp: set to -EFAULT for a bad source address.
 17 *
 18 * Returns a 32bit unfolded checksum of the buffer.
19 * src and dst are best aligned to 64bits.
20 */
21unsigned int
22csum_partial_copy_from_user(const unsigned char __user *src, unsigned char *dst,
23 int len, unsigned int isum, int *errp)
24{
25 might_sleep();
26 *errp = 0;
27 if (likely(access_ok(VERIFY_READ,src, len))) {
 28		/* Why 6, not 7? To handle odd addresses we would
 29		   need considerable complications to fix the checksum,
 30		   which is defined as a 16bit accumulator. The
 31		   alignment-fix code is primarily for performance
 32		   compatibility with 32bit, which handles odd
 33		   addresses slowly too. */
34 if (unlikely((unsigned long)src & 6)) {
35 while (((unsigned long)src & 6) && len >= 2) {
36 __u16 val16;
37 *errp = __get_user(val16, (__u16 __user *)src);
38 if (*errp)
39 return isum;
40 *(__u16 *)dst = val16;
41 isum = add32_with_carry(isum, val16);
42 src += 2;
43 dst += 2;
44 len -= 2;
45 }
46 }
47 isum = csum_partial_copy_generic((__force void *)src,dst,len,isum,errp,NULL);
48 if (likely(*errp == 0))
49 return isum;
50 }
51 *errp = -EFAULT;
52 memset(dst,0,len);
53 return isum;
54}
55
56EXPORT_SYMBOL(csum_partial_copy_from_user);
57
58/**
59 * csum_partial_copy_to_user - Copy and checksum to user space.
60 * @src: source address
61 * @dst: destination address (user space)
62 * @len: number of bytes to be copied.
63 * @isum: initial sum that is added into the result (32bit unfolded)
 64 * @errp: set to -EFAULT for a bad destination address.
 65 *
 66 * Returns a 32bit unfolded checksum of the buffer.
67 * src and dst are best aligned to 64bits.
68 */
69unsigned int
70csum_partial_copy_to_user(unsigned const char *src, unsigned char __user *dst,
71 int len, unsigned int isum, int *errp)
72{
73 might_sleep();
74 if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
75 *errp = -EFAULT;
76 return 0;
77 }
78
79 if (unlikely((unsigned long)dst & 6)) {
80 while (((unsigned long)dst & 6) && len >= 2) {
81 __u16 val16 = *(__u16 *)src;
82 isum = add32_with_carry(isum, val16);
83 *errp = __put_user(val16, (__u16 __user *)dst);
84 if (*errp)
85 return isum;
86 src += 2;
87 dst += 2;
88 len -= 2;
89 }
90 }
91
92 *errp = 0;
93 return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp);
94}
95
96EXPORT_SYMBOL(csum_partial_copy_to_user);
97
98/**
99 * csum_partial_copy_nocheck - Copy and checksum.
100 * @src: source address
101 * @dst: destination address
102 * @len: number of bytes to be copied.
103 * @isum: initial sum that is added into the result (32bit unfolded)
104 *
 105 * Returns a 32bit unfolded checksum of the buffer.
106 */
107unsigned int
108csum_partial_copy_nocheck(const unsigned char *src, unsigned char *dst, int len, unsigned int sum)
109{
110 return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL);
111}
112
113unsigned short csum_ipv6_magic(struct in6_addr *saddr, struct in6_addr *daddr,
114 __u32 len, unsigned short proto, unsigned int sum)
115{
116 __u64 rest, sum64;
117
118 rest = (__u64)htonl(len) + (__u64)htons(proto) + (__u64)sum;
119 asm(" addq (%[saddr]),%[sum]\n"
120 " adcq 8(%[saddr]),%[sum]\n"
121 " adcq (%[daddr]),%[sum]\n"
122 " adcq 8(%[daddr]),%[sum]\n"
123 " adcq $0,%[sum]\n"
124 : [sum] "=r" (sum64)
125 : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr));
126 return csum_fold(add32_with_carry(sum64 & 0xffffffff, sum64>>32));
127}
128
129EXPORT_SYMBOL(csum_ipv6_magic);
diff --git a/arch/x86_64/lib/dec_and_lock.c b/arch/x86_64/lib/dec_and_lock.c
new file mode 100644
index 000000000000..ab43394dc775
--- /dev/null
+++ b/arch/x86_64/lib/dec_and_lock.c
@@ -0,0 +1,40 @@
1/*
2 * x86 version of "atomic_dec_and_lock()" using
3 * the atomic "cmpxchg" instruction.
4 *
 5 * (For CPUs lacking cmpxchg, we use the slow
6 * generic version, and this one never even gets
7 * compiled).
8 */
9
10#include <linux/spinlock.h>
11#include <asm/atomic.h>
12
13int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
14{
15 int counter;
16 int newcount;
17
18repeat:
19 counter = atomic_read(atomic);
20 newcount = counter-1;
21
22 if (!newcount)
23 goto slow_path;
24
25 asm volatile("lock; cmpxchgl %1,%2"
26 :"=a" (newcount)
27 :"r" (newcount), "m" (atomic->counter), "0" (counter));
28
29 /* If the above failed, "eax" will have changed */
30 if (newcount != counter)
31 goto repeat;
32 return 0;
33
34slow_path:
35 spin_lock(lock);
36 if (atomic_dec_and_test(atomic))
37 return 1;
38 spin_unlock(lock);
39 return 0;
40}
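
Aside: the point of the cmpxchg fast path is that the caller only pays for
the spinlock on the final reference drop. Typical usage (sketch; the object
and lock names are hypothetical):

	#include <linux/spinlock.h>
	#include <asm/atomic.h>

	struct obj {
		atomic_t refcnt;
		/* ... list linkage, payload ... */
	};

	static spinlock_t obj_lock = SPIN_LOCK_UNLOCKED;

	static void obj_put(struct obj *o)
	{
		/* atomic_dec_and_lock() maps to _atomic_dec_and_lock() */
		if (atomic_dec_and_lock(&o->refcnt, &obj_lock)) {
			/* last reference; obj_lock held: unlink, then free */
			spin_unlock(&obj_lock);
			/* kfree(o); */
		}
	}
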
diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c
new file mode 100644
index 000000000000..6e2d66472eb1
--- /dev/null
+++ b/arch/x86_64/lib/delay.c
@@ -0,0 +1,48 @@
1/*
2 * Precise Delay Loops for x86-64
3 *
4 * Copyright (C) 1993 Linus Torvalds
5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
6 *
7 * The __delay function must _NOT_ be inlined as its execution time
8 * depends wildly on alignment on many x86 processors.
9 */
10
11#include <linux/config.h>
12#include <linux/sched.h>
13#include <linux/delay.h>
14#include <asm/delay.h>
15
16#ifdef CONFIG_SMP
17#include <asm/smp.h>
18#endif
19
20int x86_udelay_tsc = 0; /* Delay via TSC */
21
22void __delay(unsigned long loops)
23{
24 unsigned bclock, now;
25
26 rdtscl(bclock);
27 do
28 {
29 rep_nop();
30 rdtscl(now);
31 }
32 while((now-bclock) < loops);
33}
34
35inline void __const_udelay(unsigned long xloops)
36{
37 __delay(((xloops * cpu_data[_smp_processor_id()].loops_per_jiffy) >> 32) * HZ);
38}
39
40void __udelay(unsigned long usecs)
41{
42 __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */
43}
44
45void __ndelay(unsigned long nsecs)
46{
47 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
48}
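
Aside: the magic constants are .32 fixed-point fractions of a second per
microsecond and nanosecond: 2^32/10^6 = 4294 = 0x10c6, and 2^32/10^9
rounded up = 5. A quick standalone check:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long two32 = 1ULL << 32;

		printf("2^32/1e6 = 0x%llx\n", two32 / 1000000);
		printf("2^32/1e9 (rounded up) = 0x%llx\n",
		       (two32 + 999999999) / 1000000000);
		return 0;	/* prints 0x10c6 and 0x5 */
	}
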
diff --git a/arch/x86_64/lib/getuser.S b/arch/x86_64/lib/getuser.S
new file mode 100644
index 000000000000..f94ea8a44051
--- /dev/null
+++ b/arch/x86_64/lib/getuser.S
@@ -0,0 +1,101 @@
1/*
2 * __get_user functions.
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen
6 *
7 * These functions have a non-standard call interface
8 * to make them more efficient, especially as they
9 * return an error value in addition to the "real"
10 * return value.
11 */
12
13/*
14 * __get_user_X
15 *
16 * Inputs: %rcx contains the address.
17 * The register is modified, but all changes are undone
18 * before returning because the C code doesn't know about it.
19 *
20 * Outputs: %rax is error code (0 or -EFAULT)
21 * %rdx contains zero-extended value
22 *
23 * %r8 is destroyed.
24 *
25 * These functions should not modify any other registers,
26 * as they get called from within inline assembly.
27 */
28
29#include <linux/linkage.h>
30#include <asm/page.h>
31#include <asm/errno.h>
32#include <asm/offset.h>
33#include <asm/thread_info.h>
34
35 .text
36 .p2align 4
37.globl __get_user_1
38__get_user_1:
39 GET_THREAD_INFO(%r8)
40 cmpq threadinfo_addr_limit(%r8),%rcx
41 jae bad_get_user
421: movzb (%rcx),%edx
43 xorl %eax,%eax
44 ret
45
46 .p2align 4
47.globl __get_user_2
48__get_user_2:
49 GET_THREAD_INFO(%r8)
50 addq $1,%rcx
51 jc 20f
52 cmpq threadinfo_addr_limit(%r8),%rcx
53 jae 20f
54 decq %rcx
552: movzwl (%rcx),%edx
56 xorl %eax,%eax
57 ret
5820: decq %rcx
59 jmp bad_get_user
60
61 .p2align 4
62.globl __get_user_4
63__get_user_4:
64 GET_THREAD_INFO(%r8)
65 addq $3,%rcx
66 jc 30f
67 cmpq threadinfo_addr_limit(%r8),%rcx
68 jae 30f
69 subq $3,%rcx
703: movl (%rcx),%edx
71 xorl %eax,%eax
72 ret
7330: subq $3,%rcx
74 jmp bad_get_user
75
76 .p2align 4
77.globl __get_user_8
78__get_user_8:
79 GET_THREAD_INFO(%r8)
80 addq $7,%rcx
81 jc bad_get_user
82 cmpq threadinfo_addr_limit(%r8),%rcx
83 jae bad_get_user
84 subq $7,%rcx
854: movq (%rcx),%rdx
86 xorl %eax,%eax
87 ret
8840: subq $7,%rcx
89 jmp bad_get_user
90
91bad_get_user:
92 xorl %edx,%edx
93 movq $(-EFAULT),%rax
94 ret
95
96.section __ex_table,"a"
97 .quad 1b,bad_get_user
98 .quad 2b,bad_get_user
99 .quad 3b,bad_get_user
100 .quad 4b,bad_get_user
101.previous
diff --git a/arch/x86_64/lib/io.c b/arch/x86_64/lib/io.c
new file mode 100644
index 000000000000..87b4a4e18039
--- /dev/null
+++ b/arch/x86_64/lib/io.c
@@ -0,0 +1,23 @@
1#include <linux/string.h>
2#include <asm/io.h>
3#include <linux/module.h>
4
5void __memcpy_toio(unsigned long dst,const void*src,unsigned len)
6{
7 __inline_memcpy((void *) dst,src,len);
8}
9EXPORT_SYMBOL(__memcpy_toio);
10
11void __memcpy_fromio(void *dst,unsigned long src,unsigned len)
12{
13 __inline_memcpy(dst,(const void *) src,len);
14}
15EXPORT_SYMBOL(__memcpy_fromio);
16
17void memset_io(volatile void __iomem *a, int b, size_t c)
18{
 19	/* XXX: memset can mangle the IO patterns quite a bit.
 20	   Perhaps it would be better to use a dumb one. */
21 memset((void *)a,b,c);
22}
23EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
new file mode 100644
index 000000000000..c6c46494fef5
--- /dev/null
+++ b/arch/x86_64/lib/memcpy.S
@@ -0,0 +1,121 @@
1/* Copyright 2002 Andi Kleen */
2
3 #include <asm/cpufeature.h>
4/*
5 * memcpy - Copy a memory block.
6 *
7 * Input:
8 * rdi destination
9 * rsi source
10 * rdx count
11 *
12 * Output:
13 * rax original destination
14 */
15
16 .globl __memcpy
17 .globl memcpy
18 .p2align 4
19__memcpy:
20memcpy:
21 pushq %rbx
22 movq %rdi,%rax
23
24 movl %edx,%ecx
25 shrl $6,%ecx
26 jz .Lhandle_tail
27
28 .p2align 4
29.Lloop_64:
30 decl %ecx
31
32 movq (%rsi),%r11
33 movq 8(%rsi),%r8
34
35 movq %r11,(%rdi)
36 movq %r8,1*8(%rdi)
37
38 movq 2*8(%rsi),%r9
39 movq 3*8(%rsi),%r10
40
41 movq %r9,2*8(%rdi)
42 movq %r10,3*8(%rdi)
43
44 movq 4*8(%rsi),%r11
45 movq 5*8(%rsi),%r8
46
47 movq %r11,4*8(%rdi)
48 movq %r8,5*8(%rdi)
49
50 movq 6*8(%rsi),%r9
51 movq 7*8(%rsi),%r10
52
53 movq %r9,6*8(%rdi)
54 movq %r10,7*8(%rdi)
55
56 leaq 64(%rsi),%rsi
57 leaq 64(%rdi),%rdi
58 jnz .Lloop_64
59
60.Lhandle_tail:
61 movl %edx,%ecx
62 andl $63,%ecx
63 shrl $3,%ecx
64 jz .Lhandle_7
65 .p2align 4
66.Lloop_8:
67 decl %ecx
68 movq (%rsi),%r8
69 movq %r8,(%rdi)
70 leaq 8(%rdi),%rdi
71 leaq 8(%rsi),%rsi
72 jnz .Lloop_8
73
74.Lhandle_7:
75 movl %edx,%ecx
76 andl $7,%ecx
77 jz .Lende
78 .p2align 4
79.Lloop_1:
80 movb (%rsi),%r8b
81 movb %r8b,(%rdi)
82 incq %rdi
83 incq %rsi
84 decl %ecx
85 jnz .Lloop_1
86
87.Lende:
88 popq %rbx
89 ret
90.Lfinal:
91
 92 	/* C-stepping K8 CPUs run faster using the string copy instructions.
93 It is also a lot simpler. Use this when possible */
94
95 .section .altinstructions,"a"
96 .align 8
97 .quad memcpy
98 .quad memcpy_c
99 .byte X86_FEATURE_K8_C
100 .byte .Lfinal-memcpy
101 .byte memcpy_c_end-memcpy_c
102 .previous
103
104 .section .altinstr_replacement,"ax"
105 /* rdi destination
106 * rsi source
107 * rdx count
108 */
109memcpy_c:
110 movq %rdi,%rax
111 movl %edx,%ecx
112 shrl $3,%ecx
113 andl $7,%edx
114 rep
115 movsq
116 movl %edx,%ecx
117 rep
118 movsb
119 ret
120memcpy_c_end:
121 .previous
diff --git a/arch/x86_64/lib/memmove.c b/arch/x86_64/lib/memmove.c
new file mode 100644
index 000000000000..e93d5255fdc9
--- /dev/null
+++ b/arch/x86_64/lib/memmove.c
@@ -0,0 +1,19 @@
 1/* Normally compiler builtins are used, but sometimes the compiler calls
 2   out-of-line code. Based on asm-i386/string.h.
3 */
4#define _STRING_C
5#include <linux/string.h>
6
7#undef memmove
8void *memmove(void * dest,const void *src,size_t count)
9{
10 if (dest < src) {
11 __inline_memcpy(dest,src,count);
12 } else {
13 char *p = (char *) dest + count;
14 char *s = (char *) src + count;
15 while (count--)
16 *--p = *--s;
17 }
18 return dest;
19}
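
Aside: the backwards copy is what makes overlapping moves safe when the
destination sits above the source. A standalone demonstration (sketch):

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char buf[8] = "abcdef";

		memmove(buf + 1, buf, 6);	/* shift right by one */
		buf[7] = '\0';
		printf("%s\n", buf);		/* prints "aabcdef" */
		return 0;
	}
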
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
new file mode 100644
index 000000000000..4b4c40638640
--- /dev/null
+++ b/arch/x86_64/lib/memset.S
@@ -0,0 +1,125 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs */
2/*
3 * ISO C memset - set a memory block to a byte value.
4 *
5 * rdi destination
6 * rsi value (char)
7 * rdx count (bytes)
8 *
9 * rax original destination
10 */
11 .globl __memset
12 .globl memset
13 .p2align 4
14memset:
15__memset:
16 movq %rdi,%r10
17 movq %rdx,%r11
18
19 /* expand byte value */
20 movzbl %sil,%ecx
21 movabs $0x0101010101010101,%rax
22 mul %rcx /* with rax, clobbers rdx */
23
24 /* align dst */
25 movl %edi,%r9d
26 andl $7,%r9d
27 jnz .Lbad_alignment
28.Lafter_bad_alignment:
29
30 movl %r11d,%ecx
31 shrl $6,%ecx
32 jz .Lhandle_tail
33
34 .p2align 4
35.Lloop_64:
36 decl %ecx
37 movq %rax,(%rdi)
38 movq %rax,8(%rdi)
39 movq %rax,16(%rdi)
40 movq %rax,24(%rdi)
41 movq %rax,32(%rdi)
42 movq %rax,40(%rdi)
43 movq %rax,48(%rdi)
44 movq %rax,56(%rdi)
45 leaq 64(%rdi),%rdi
46 jnz .Lloop_64
47
 48	/* Handle the tail in loops. The loops should be faster than
 49	   hard-to-predict jump tables. */
50 .p2align 4
51.Lhandle_tail:
52 movl %r11d,%ecx
53 andl $63&(~7),%ecx
54 jz .Lhandle_7
55 shrl $3,%ecx
56 .p2align 4
57.Lloop_8:
58 decl %ecx
59 movq %rax,(%rdi)
60 leaq 8(%rdi),%rdi
61 jnz .Lloop_8
62
63.Lhandle_7:
64 movl %r11d,%ecx
65 andl $7,%ecx
66 jz .Lende
67 .p2align 4
68.Lloop_1:
69 decl %ecx
70 movb %al,(%rdi)
71 leaq 1(%rdi),%rdi
72 jnz .Lloop_1
73
74.Lende:
75 movq %r10,%rax
76 ret
77
78.Lbad_alignment:
79 cmpq $7,%r11
80 jbe .Lhandle_7
81 movq %rax,(%rdi) /* unaligned store */
82 movq $8,%r8
83 subq %r9,%r8
84 addq %r8,%rdi
85 subq %r8,%r11
86 jmp .Lafter_bad_alignment
87
 88 	/* C-stepping K8 CPUs run faster using the string instructions.
89 It is also a lot simpler. Use this when possible */
90
91#include <asm/cpufeature.h>
92
93 .section .altinstructions,"a"
94 .align 8
95 .quad memset
96 .quad memset_c
97 .byte X86_FEATURE_K8_C
98 .byte memset_c_end-memset_c
99 .byte memset_c_end-memset_c
100 .previous
101
102 .section .altinstr_replacement,"ax"
103 /* rdi destination
104 * rsi value
105 * rdx count
106 */
107memset_c:
108 movq %rdi,%r9
109 movl %edx,%r8d
110 andl $7,%r8d
111 movl %edx,%ecx
112 shrl $3,%ecx
113 /* expand byte value */
114 movzbl %sil,%esi
115 movabs $0x0101010101010101,%rax
116 mulq %rsi /* with rax, clobbers rdx */
117 rep
118 stosq
119 movl %r8d,%ecx
120 rep
121 stosb
122 movq %r9,%rax
123 ret
124memset_c_end:
125 .previous
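
Aside: the byte-expansion multiply used in both variants replicates the fill
byte into all eight lanes of a 64-bit word: b * 0x0101010101010101 places b
at every byte position, with no carries possible. A quick check:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint8_t c = 0xab;
		uint64_t pattern = (uint64_t)c * 0x0101010101010101ULL;

		printf("0x%016llx\n", (unsigned long long)pattern);
		return 0;	/* prints 0xabababababababab */
	}
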
diff --git a/arch/x86_64/lib/putuser.S b/arch/x86_64/lib/putuser.S
new file mode 100644
index 000000000000..0dee1fdcb162
--- /dev/null
+++ b/arch/x86_64/lib/putuser.S
@@ -0,0 +1,89 @@
1/*
2 * __put_user functions.
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen
6 *
7 * These functions have a non-standard call interface
8 * to make them more efficient, especially as they
9 * return an error value in addition to the "real"
10 * return value.
11 */
12
13/*
14 * __put_user_X
15 *
16 * Inputs: %rcx contains the address
17 * %rdx contains new value
18 *
19 * Outputs: %rax is error code (0 or -EFAULT)
20 *
21 * %r8 is destroyed.
22 *
23 * These functions should not modify any other registers,
24 * as they get called from within inline assembly.
25 */
26
27#include <linux/linkage.h>
28#include <asm/page.h>
29#include <asm/errno.h>
30#include <asm/offset.h>
31#include <asm/thread_info.h>
32
33 .text
34 .p2align 4
35.globl __put_user_1
36__put_user_1:
37 GET_THREAD_INFO(%r8)
38 cmpq threadinfo_addr_limit(%r8),%rcx
39 jae bad_put_user
401: movb %dl,(%rcx)
41 xorl %eax,%eax
42 ret
43
44 .p2align 4
45.globl __put_user_2
46__put_user_2:
47 GET_THREAD_INFO(%r8)
48 addq $1,%rcx
49 jc bad_put_user
50 cmpq threadinfo_addr_limit(%r8),%rcx
51 jae bad_put_user
522: movw %dx,-1(%rcx)
53 xorl %eax,%eax
54 ret
55
56 .p2align 4
57.globl __put_user_4
58__put_user_4:
59 GET_THREAD_INFO(%r8)
60 addq $3,%rcx
61 jc bad_put_user
62 cmpq threadinfo_addr_limit(%r8),%rcx
63 jae bad_put_user
643: movl %edx,-3(%rcx)
65 xorl %eax,%eax
66 ret
67
68 .p2align 4
69.globl __put_user_8
70__put_user_8:
71 GET_THREAD_INFO(%r8)
72 addq $7,%rcx
73 jc bad_put_user
74 cmpq threadinfo_addr_limit(%r8),%rcx
75 jae bad_put_user
764: movq %rdx,-7(%rcx)
77 xorl %eax,%eax
78 ret
79
80bad_put_user:
81 movq $(-EFAULT),%rax
82 ret
83
84.section __ex_table,"a"
85 .quad 1b,bad_put_user
86 .quad 2b,bad_put_user
87 .quad 3b,bad_put_user
88 .quad 4b,bad_put_user
89.previous
diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S
new file mode 100644
index 000000000000..acc1e2ca7ed7
--- /dev/null
+++ b/arch/x86_64/lib/thunk.S
@@ -0,0 +1,95 @@
1 /*
2 * Save registers before calling assembly functions. This avoids
3 * disturbance of register allocation in some inline assembly constructs.
4 * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
5 * Subject to the GNU public license, v.2. No warranty of any kind.
6 * $Id: thunk.S,v 1.2 2002/03/13 20:06:58 ak Exp $
7 */
8
9 #include <linux/config.h>
10 #include <linux/linkage.h>
11 #include <asm/dwarf2.h>
12 #include <asm/calling.h>
13 #include <asm/rwlock.h>
14
15 /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
16 .macro thunk name,func
17 .globl \name
18\name:
19 CFI_STARTPROC
20 SAVE_ARGS
21 call \func
22 jmp restore
23 CFI_ENDPROC
24 .endm
25
26 /* rdi: arg1 ... normal C conventions. rax is passed from C. */
27 .macro thunk_retrax name,func
28 .globl \name
29\name:
30 CFI_STARTPROC
31 SAVE_ARGS
32 call \func
33 jmp restore_norax
34 CFI_ENDPROC
35 .endm
36
37
38 .section .sched.text
39#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
40 thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
41 thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
42 thunk rwsem_wake_thunk,rwsem_wake
43 thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
44#endif
45 thunk do_softirq_thunk,do_softirq
46
47 thunk __down_failed,__down
48 thunk_retrax __down_failed_interruptible,__down_interruptible
49 thunk_retrax __down_failed_trylock,__down_trylock
50 thunk __up_wakeup,__up
51
52 /* SAVE_ARGS below is used only for the .cfi directives it contains. */
53 CFI_STARTPROC
54 SAVE_ARGS
55restore:
56 RESTORE_ARGS
57 ret
58 CFI_ENDPROC
59
60 CFI_STARTPROC
61 SAVE_ARGS
62restore_norax:
63 RESTORE_ARGS 1
64 ret
65 CFI_ENDPROC
66
67#ifdef CONFIG_SMP
68/* Support for read/write spinlocks. */
69 .text
70/* rax: pointer to rwlock_t */
71ENTRY(__write_lock_failed)
72 lock
73 addl $RW_LOCK_BIAS,(%rax)
741: rep
75 nop
76 cmpl $RW_LOCK_BIAS,(%rax)
77 jne 1b
78 lock
79 subl $RW_LOCK_BIAS,(%rax)
80 jnz __write_lock_failed
81 ret
82
83/* rax: pointer to rwlock_t */
84ENTRY(__read_lock_failed)
85 lock
86 incl (%rax)
871: rep
88 nop
89 cmpl $1,(%rax)
90 js 1b
91 lock
92 decl (%rax)
93 js __read_lock_failed
94 ret
95#endif
diff --git a/arch/x86_64/lib/usercopy.c b/arch/x86_64/lib/usercopy.c
new file mode 100644
index 000000000000..db8abba1ad81
--- /dev/null
+++ b/arch/x86_64/lib/usercopy.c
@@ -0,0 +1,153 @@
1/*
2 * User address space access functions.
3 *
4 * Copyright 1997 Andi Kleen <ak@muc.de>
5 * Copyright 1997 Linus Torvalds
6 * Copyright 2002 Andi Kleen <ak@suse.de>
7 */
8#include <asm/uaccess.h>
9
10/*
11 * Copy a null terminated string from userspace.
12 */
13
14#define __do_strncpy_from_user(dst,src,count,res) \
15do { \
16 long __d0, __d1, __d2; \
17 might_sleep(); \
18 __asm__ __volatile__( \
19 " testq %1,%1\n" \
20 " jz 2f\n" \
21 "0: lodsb\n" \
22 " stosb\n" \
23 " testb %%al,%%al\n" \
24 " jz 1f\n" \
25 " decq %1\n" \
26 " jnz 0b\n" \
27 "1: subq %1,%0\n" \
28 "2:\n" \
29 ".section .fixup,\"ax\"\n" \
30 "3: movq %5,%0\n" \
31 " jmp 2b\n" \
32 ".previous\n" \
33 ".section __ex_table,\"a\"\n" \
34 " .align 8\n" \
35 " .quad 0b,3b\n" \
36 ".previous" \
37 : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
38 "=&D" (__d2) \
39 : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
40 : "memory"); \
41} while (0)
42
43long
44__strncpy_from_user(char *dst, const char __user *src, long count)
45{
46 long res;
47 __do_strncpy_from_user(dst, src, count, res);
48 return res;
49}
50
51long
52strncpy_from_user(char *dst, const char __user *src, long count)
53{
54 long res = -EFAULT;
55 if (access_ok(VERIFY_READ, src, 1))
56 __do_strncpy_from_user(dst, src, count, res);
57 return res;
58}
59
60/*
61 * Zero Userspace
62 */
63
64unsigned long __clear_user(void __user *addr, unsigned long size)
65{
66 long __d0;
67 might_sleep();
68 /* no memory constraint because it doesn't change any memory gcc knows
69 about */
70 asm volatile(
71 " testq %[size8],%[size8]\n"
72 " jz 4f\n"
73 "0: movq %[zero],(%[dst])\n"
74 " addq %[eight],%[dst]\n"
75 " decl %%ecx ; jnz 0b\n"
76 "4: movq %[size1],%%rcx\n"
77 " testl %%ecx,%%ecx\n"
78 " jz 2f\n"
79 "1: movb %b[zero],(%[dst])\n"
80 " incq %[dst]\n"
81 " decl %%ecx ; jnz 1b\n"
82 "2:\n"
83 ".section .fixup,\"ax\"\n"
84 "3: lea 0(%[size1],%[size8],8),%[size8]\n"
85 " jmp 2b\n"
86 ".previous\n"
87 ".section __ex_table,\"a\"\n"
88 " .align 8\n"
89 " .quad 0b,3b\n"
90 " .quad 1b,2b\n"
91 ".previous"
92 : [size8] "=c"(size), [dst] "=&D" (__d0)
93 : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
94 [zero] "r" (0UL), [eight] "r" (8UL));
95 return size;
96}
97
98
99unsigned long clear_user(void __user *to, unsigned long n)
100{
101 if (access_ok(VERIFY_WRITE, to, n))
102 return __clear_user(to, n);
103 return n;
104}
105
106/*
107 * Return the size of a string (including the ending 0)
108 *
109 * Return 0 on exception, a value greater than N if too long
110 */
111
112long strnlen_user(const char __user *s, long n)
113{
114 long res = 0;
115 char c;
116
117 if (!access_ok(VERIFY_READ, s, n))
118 return 0;
119
120 while (1) {
121 if (res>n)
122 return n+1;
123 if (__get_user(c, s))
124 return 0;
125 if (!c)
126 return res+1;
127 res++;
128 s++;
129 }
130}
131
132long strlen_user(const char __user *s)
133{
134 long res = 0;
135 char c;
136
137 for (;;) {
138 if (get_user(c, s))
139 return 0;
140 if (!c)
141 return res+1;
142 res++;
143 s++;
144 }
145}
146
147unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
148{
149 if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) {
150 return copy_user_generic((__force void *)to, (__force void *)from, len);
151 }
152 return len;
153}
diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile
new file mode 100644
index 000000000000..66c354ad80ca
--- /dev/null
+++ b/arch/x86_64/mm/Makefile
@@ -0,0 +1,11 @@
1#
2# Makefile for the linux x86_64-specific parts of the memory manager.
3#
4
5obj-y := init.o fault.o ioremap.o extable.o pageattr.o
6obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
7obj-$(CONFIG_DISCONTIGMEM) += numa.o
8obj-$(CONFIG_K8_NUMA) += k8topology.o
9obj-$(CONFIG_ACPI_NUMA) += srat.o
10
11hugetlbpage-y = ../../i386/mm/hugetlbpage.o
diff --git a/arch/x86_64/mm/extable.c b/arch/x86_64/mm/extable.c
new file mode 100644
index 000000000000..2d78f9fb4035
--- /dev/null
+++ b/arch/x86_64/mm/extable.c
@@ -0,0 +1,35 @@
1/*
2 * linux/arch/x86_64/mm/extable.c
3 */
4
5#include <linux/config.h>
6#include <linux/module.h>
7#include <linux/spinlock.h>
8#include <linux/init.h>
9#include <asm/uaccess.h>
10
11/* Simple binary search */
12const struct exception_table_entry *
13search_extable(const struct exception_table_entry *first,
14 const struct exception_table_entry *last,
15 unsigned long value)
16{
17 /* Work around a B stepping K8 bug */
18 if ((value >> 32) == 0)
19 value |= 0xffffffffUL << 32;
20
21 while (first <= last) {
22 const struct exception_table_entry *mid;
23 long diff;
24
25 mid = (last - first) / 2 + first;
26 diff = mid->insn - value;
27 if (diff == 0)
28 return mid;
29 else if (diff < 0)
30 first = mid+1;
31 else
32 last = mid-1;
33 }
34 return NULL;
35}
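
The (value >> 32) == 0 test above papers over the B-stepping K8 bug: the CPU can report a fault address with the upper 32 bits cleared, while the exception table stores full kernel addresses whose top 32 bits are all ones. A small standalone illustration of the repair (the address is made up):

#include <stdio.h>

int main(void)
{
        unsigned long value = 0x80110234UL;     /* truncated kernel RIP (hypothetical) */

        if ((value >> 32) == 0)                 /* upper half lost by the CPU */
                value |= 0xffffffffUL << 32;    /* restore the all-ones top bits */
        printf("%lx\n", value);                 /* ffffffff80110234 */
        return 0;
}
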
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
new file mode 100644
index 000000000000..5724370475cc
--- /dev/null
+++ b/arch/x86_64/mm/fault.c
@@ -0,0 +1,579 @@
1/*
2 * linux/arch/x86-64/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
6 */
7
8#include <linux/config.h>
9#include <linux/signal.h>
10#include <linux/sched.h>
11#include <linux/kernel.h>
12#include <linux/errno.h>
13#include <linux/string.h>
14#include <linux/types.h>
15#include <linux/ptrace.h>
16#include <linux/mman.h>
17#include <linux/mm.h>
18#include <linux/smp.h>
19#include <linux/smp_lock.h>
20#include <linux/interrupt.h>
21#include <linux/init.h>
22#include <linux/tty.h>
23#include <linux/vt_kern.h> /* For unblank_screen() */
24#include <linux/compiler.h>
25#include <linux/module.h>
26#include <linux/kprobes.h>
27
28#include <asm/system.h>
29#include <asm/uaccess.h>
30#include <asm/pgalloc.h>
31#include <asm/smp.h>
32#include <asm/tlbflush.h>
33#include <asm/proto.h>
34#include <asm/kdebug.h>
35#include <asm-generic/sections.h>
36#include <asm/kdebug.h>
37
38void bust_spinlocks(int yes)
39{
40 int loglevel_save = console_loglevel;
41 if (yes) {
42 oops_in_progress = 1;
43 } else {
44#ifdef CONFIG_VT
45 unblank_screen();
46#endif
47 oops_in_progress = 0;
48 /*
49 * OK, the message is on the console. Now we call printk()
50 * without oops_in_progress set so that printk will give klogd
51 * a poke. Hold onto your hats...
52 */
53 console_loglevel = 15; /* NMI oopser may have shut the console up */
54 printk(" ");
55 console_loglevel = loglevel_save;
56 }
57}
58
59/* Sometimes the CPU reports invalid exceptions on prefetch.
 60   Check for that here and ignore it.
61 Opcode checker based on code by Richard Brunner */
62static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
63 unsigned long error_code)
64{
65 unsigned char *instr = (unsigned char *)(regs->rip);
66 int scan_more = 1;
67 int prefetch = 0;
68 unsigned char *max_instr = instr + 15;
69
 70	/* If it was an exec fault, ignore it */
71 if (error_code & (1<<4))
72 return 0;
73
 74	/* Code segments in LDT could have a non-zero base. Don't check
75 when that's possible */
76 if (regs->cs & (1<<2))
77 return 0;
78
79 if ((regs->cs & 3) != 0 && regs->rip >= TASK_SIZE)
80 return 0;
81
82 while (scan_more && instr < max_instr) {
83 unsigned char opcode;
84 unsigned char instr_hi;
85 unsigned char instr_lo;
86
87 if (__get_user(opcode, instr))
88 break;
89
90 instr_hi = opcode & 0xf0;
91 instr_lo = opcode & 0x0f;
92 instr++;
93
94 switch (instr_hi) {
95 case 0x20:
96 case 0x30:
97 /* Values 0x26,0x2E,0x36,0x3E are valid x86
98 prefixes. In long mode, the CPU will signal
99 invalid opcode if some of these prefixes are
100 present so we will never get here anyway */
101 scan_more = ((instr_lo & 7) == 0x6);
102 break;
103
104 case 0x40:
 105		/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes.
 106		   We need to figure out under which instruction mode the
 107		   instruction was issued ... */
108 /* Could check the LDT for lm, but for now it's good
109 enough to assume that long mode only uses well known
110 segments or kernel. */
111 scan_more = ((regs->cs & 3) == 0) || (regs->cs == __USER_CS);
112 break;
113
114 case 0x60:
 115		/* 0x64 through 0x67 are valid prefixes in all modes. */
116 scan_more = (instr_lo & 0xC) == 0x4;
117 break;
118 case 0xF0:
119 /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
120 scan_more = !instr_lo || (instr_lo>>1) == 1;
121 break;
122 case 0x00:
123 /* Prefetch instruction is 0x0F0D or 0x0F18 */
124 scan_more = 0;
125 if (__get_user(opcode, instr))
126 break;
127 prefetch = (instr_lo == 0xF) &&
128 (opcode == 0x0D || opcode == 0x18);
129 break;
130 default:
131 scan_more = 0;
132 break;
133 }
134 }
135 return prefetch;
136}
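
The decoder above can be exercised outside the kernel by feeding it a byte buffer instead of a faulting RIP. A hedged sketch (the cs-based mode checks are simplified away, so this only mirrors the opcode walk):

#include <stdio.h>

static int is_prefetch_sketch(const unsigned char *instr, int len)
{
        int scan_more = 1, prefetch = 0, i = 0;

        while (scan_more && i < len) {
                unsigned char opcode = instr[i++];
                unsigned char hi = opcode & 0xf0, lo = opcode & 0x0f;

                switch (hi) {
                case 0x20:
                case 0x30:                 /* segment override prefixes */
                        scan_more = ((lo & 7) == 0x6);
                        break;
                case 0x40:                 /* REX prefixes; assume long mode */
                        scan_more = 1;
                        break;
                case 0x60:                 /* 0x64 through 0x67 prefixes */
                        scan_more = (lo & 0xC) == 0x4;
                        break;
                case 0xF0:                 /* lock and rep prefixes */
                        scan_more = !lo || (lo >> 1) == 1;
                        break;
                case 0x00:                 /* 0x0F: two-byte opcode escape */
                        scan_more = 0;
                        prefetch = (lo == 0xF) && i < len &&
                                   (instr[i] == 0x0D || instr[i] == 0x18);
                        break;
                default:
                        scan_more = 0;
                        break;
                }
        }
        return prefetch;
}

int main(void)
{
        unsigned char insn[] = { 0x0F, 0x18, 0x01 };           /* prefetchnta (%rcx) */
        printf("%d\n", is_prefetch_sketch(insn, sizeof insn)); /* prints 1 */
        return 0;
}
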
137
138static int bad_address(void *p)
139{
140 unsigned long dummy;
141 return __get_user(dummy, (unsigned long *)p);
142}
143
144void dump_pagetable(unsigned long address)
145{
146 pgd_t *pgd;
147 pud_t *pud;
148 pmd_t *pmd;
149 pte_t *pte;
150
151 asm("movq %%cr3,%0" : "=r" (pgd));
152
153 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
154 pgd += pgd_index(address);
155 printk("PGD %lx ", pgd_val(*pgd));
156 if (bad_address(pgd)) goto bad;
157 if (!pgd_present(*pgd)) goto ret;
158
159 pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
160 if (bad_address(pud)) goto bad;
161 printk("PUD %lx ", pud_val(*pud));
162 if (!pud_present(*pud)) goto ret;
163
164 pmd = pmd_offset(pud, address);
165 if (bad_address(pmd)) goto bad;
166 printk("PMD %lx ", pmd_val(*pmd));
167 if (!pmd_present(*pmd)) goto ret;
168
169 pte = pte_offset_kernel(pmd, address);
170 if (bad_address(pte)) goto bad;
171 printk("PTE %lx", pte_val(*pte));
172ret:
173 printk("\n");
174 return;
175bad:
176 printk("BAD\n");
177}
178
179static const char errata93_warning[] =
180KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
181KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
182KERN_ERR "******* Please consider a BIOS update.\n"
183KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
184
185/* Workaround for K8 erratum #93 & buggy BIOS.
186 BIOS SMM functions are required to use a specific workaround
187 to avoid corruption of the 64bit RIP register on C stepping K8.
 188   A lot of BIOSes that didn't get tested properly miss this.
189 The OS sees this as a page fault with the upper 32bits of RIP cleared.
190 Try to work around it here.
191 Note we only handle faults in kernel here. */
192
193static int is_errata93(struct pt_regs *regs, unsigned long address)
194{
195 static int warned;
196 if (address != regs->rip)
197 return 0;
198 if ((address >> 32) != 0)
199 return 0;
200 address |= 0xffffffffUL << 32;
201 if ((address >= (u64)_stext && address <= (u64)_etext) ||
202 (address >= MODULES_VADDR && address <= MODULES_END)) {
203 if (!warned) {
204 printk(errata93_warning);
205 warned = 1;
206 }
207 regs->rip = address;
208 return 1;
209 }
210 return 0;
211}
212
213int unhandled_signal(struct task_struct *tsk, int sig)
214{
215 if (tsk->pid == 1)
216 return 1;
217 /* Warn for strace, but not for gdb */
218 if (!test_ti_thread_flag(tsk->thread_info, TIF_SYSCALL_TRACE) &&
219 (tsk->ptrace & PT_PTRACED))
220 return 0;
221 return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
222 (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
223}
224
225static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
226 unsigned long error_code)
227{
228 oops_begin();
229 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
230 current->comm, address);
231 dump_pagetable(address);
232 __die("Bad pagetable", regs, error_code);
233 oops_end();
234 do_exit(SIGKILL);
235}
236
237/*
238 * Handle a fault on the vmalloc or module mapping area
239 */
240static int vmalloc_fault(unsigned long address)
241{
242 pgd_t *pgd, *pgd_ref;
243 pud_t *pud, *pud_ref;
244 pmd_t *pmd, *pmd_ref;
245 pte_t *pte, *pte_ref;
246
 247	/* Copy kernel mappings over when needed. This can also
 248	   happen due to a race in a page table update. In the latter
 249	   case just flush. */
250
251 pgd = pgd_offset(current->mm ?: &init_mm, address);
252 pgd_ref = pgd_offset_k(address);
253 if (pgd_none(*pgd_ref))
254 return -1;
255 if (pgd_none(*pgd))
256 set_pgd(pgd, *pgd_ref);
257
258 /* Below here mismatches are bugs because these lower tables
259 are shared */
260
261 pud = pud_offset(pgd, address);
262 pud_ref = pud_offset(pgd_ref, address);
263 if (pud_none(*pud_ref))
264 return -1;
265 if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
266 BUG();
267 pmd = pmd_offset(pud, address);
268 pmd_ref = pmd_offset(pud_ref, address);
269 if (pmd_none(*pmd_ref))
270 return -1;
271 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
272 BUG();
273 pte_ref = pte_offset_kernel(pmd_ref, address);
274 if (!pte_present(*pte_ref))
275 return -1;
276 pte = pte_offset_kernel(pmd, address);
277 if (!pte_present(*pte) || pte_page(*pte) != pte_page(*pte_ref))
278 BUG();
279 __flush_tlb_all();
280 return 0;
281}
282
283int page_fault_trace = 0;
284int exception_trace = 1;
285
286/*
287 * This routine handles page faults. It determines the address,
288 * and the problem, and then passes it off to one of the appropriate
289 * routines.
290 *
291 * error_code:
292 * bit 0 == 0 means no page found, 1 means protection fault
293 * bit 1 == 0 means read, 1 means write
294 * bit 2 == 0 means kernel, 1 means user-mode
 295 *	bit 3 == 1 means reserved bit set, bit 4 == 1 means instruction fetch
296 */
297asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
298{
299 struct task_struct *tsk;
300 struct mm_struct *mm;
301 struct vm_area_struct * vma;
302 unsigned long address;
303 const struct exception_table_entry *fixup;
304 int write;
305 siginfo_t info;
306
307#ifdef CONFIG_CHECKING
308 {
309 unsigned long gs;
310 struct x8664_pda *pda = cpu_pda + stack_smp_processor_id();
311 rdmsrl(MSR_GS_BASE, gs);
312 if (gs != (unsigned long)pda) {
313 wrmsrl(MSR_GS_BASE, pda);
314 printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
315 }
316 }
317#endif
318
319 /* get the address */
320 __asm__("movq %%cr2,%0":"=r" (address));
321 if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
322 SIGSEGV) == NOTIFY_STOP)
323 return;
324
325 if (likely(regs->eflags & X86_EFLAGS_IF))
326 local_irq_enable();
327
328 if (unlikely(page_fault_trace))
329 printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
330 regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
331
332 tsk = current;
333 mm = tsk->mm;
334 info.si_code = SEGV_MAPERR;
335
336
337 /*
338 * We fault-in kernel-space virtual memory on-demand. The
339 * 'reference' page table is init_mm.pgd.
340 *
341 * NOTE! We MUST NOT take any locks for this case. We may
342 * be in an interrupt or a critical region, and should
343 * only copy the information from the master page table,
344 * nothing more.
345 *
346 * This verifies that the fault happens in kernel space
347 * (error_code & 4) == 0, and that the fault was not a
348 * protection error (error_code & 1) == 0.
349 */
350 if (unlikely(address >= TASK_SIZE)) {
351 if (!(error_code & 5)) {
352 if (vmalloc_fault(address) < 0)
353 goto bad_area_nosemaphore;
354 return;
355 }
356 /*
357 * Don't take the mm semaphore here. If we fixup a prefetch
358 * fault we could otherwise deadlock.
359 */
360 goto bad_area_nosemaphore;
361 }
362
363 if (unlikely(error_code & (1 << 3)))
364 pgtable_bad(address, regs, error_code);
365
366 /*
367 * If we're in an interrupt or have no user
368 * context, we must not take the fault..
369 */
370 if (unlikely(in_atomic() || !mm))
371 goto bad_area_nosemaphore;
372
373 again:
374 /* When running in the kernel we expect faults to occur only to
375 * addresses in user space. All other faults represent errors in the
 376	 * kernel and should generate an OOPS. Unfortunately, in the case of an
 377	 * erroneous fault occurring in a code path which already holds mmap_sem
378 * we will deadlock attempting to validate the fault against the
379 * address space. Luckily the kernel only validly references user
380 * space from well defined areas of code, which are listed in the
381 * exceptions table.
382 *
383 * As the vast majority of faults will be valid we will only perform
 384	 * the source reference check when there is a possibility of a deadlock.
385 * Attempt to lock the address space, if we cannot we then validate the
386 * source. If this is invalid we can skip the address space check,
387 * thus avoiding the deadlock.
388 */
389 if (!down_read_trylock(&mm->mmap_sem)) {
390 if ((error_code & 4) == 0 &&
391 !search_exception_tables(regs->rip))
392 goto bad_area_nosemaphore;
393 down_read(&mm->mmap_sem);
394 }
395
396 vma = find_vma(mm, address);
397 if (!vma)
398 goto bad_area;
399 if (likely(vma->vm_start <= address))
400 goto good_area;
401 if (!(vma->vm_flags & VM_GROWSDOWN))
402 goto bad_area;
403 if (error_code & 4) {
404 // XXX: align red zone size with ABI
405 if (address + 128 < regs->rsp)
406 goto bad_area;
407 }
408 if (expand_stack(vma, address))
409 goto bad_area;
410/*
411 * Ok, we have a good vm_area for this memory access, so
412 * we can handle it..
413 */
414good_area:
415 info.si_code = SEGV_ACCERR;
416 write = 0;
417 switch (error_code & 3) {
418 default: /* 3: write, present */
419 /* fall through */
420 case 2: /* write, not present */
421 if (!(vma->vm_flags & VM_WRITE))
422 goto bad_area;
423 write++;
424 break;
425 case 1: /* read, present */
426 goto bad_area;
427 case 0: /* read, not present */
428 if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
429 goto bad_area;
430 }
431
432 /*
433 * If for any reason at all we couldn't handle the fault,
434 * make sure we exit gracefully rather than endlessly redo
435 * the fault.
436 */
437 switch (handle_mm_fault(mm, vma, address, write)) {
438 case 1:
439 tsk->min_flt++;
440 break;
441 case 2:
442 tsk->maj_flt++;
443 break;
444 case 0:
445 goto do_sigbus;
446 default:
447 goto out_of_memory;
448 }
449
450 up_read(&mm->mmap_sem);
451 return;
452
453/*
454 * Something tried to access memory that isn't in our memory map..
455 * Fix it, but check if it's kernel or user first..
456 */
457bad_area:
458 up_read(&mm->mmap_sem);
459
460bad_area_nosemaphore:
461
462#ifdef CONFIG_IA32_EMULATION
463 /* 32bit vsyscall. map on demand. */
464 if (test_thread_flag(TIF_IA32) &&
465 address >= VSYSCALL32_BASE && address < VSYSCALL32_END) {
466 if (map_syscall32(mm, address) < 0)
467 goto out_of_memory2;
468 return;
469 }
470#endif
471
472 /* User mode accesses just cause a SIGSEGV */
473 if (error_code & 4) {
474 if (is_prefetch(regs, address, error_code))
475 return;
476
 477		/* Work around K8 erratum #100: K8 in compat mode
478 occasionally jumps to illegal addresses >4GB. We
479 catch this here in the page fault handler because
480 these addresses are not reachable. Just detect this
481 case and return. Any code segment in LDT is
482 compatibility mode. */
483 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
484 (address >> 32))
485 return;
486
487 if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
488 printk(
489 "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
490 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
491 tsk->comm, tsk->pid, address, regs->rip,
492 regs->rsp, error_code);
493 }
494
495 tsk->thread.cr2 = address;
496 /* Kernel addresses are always protection faults */
497 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
498 tsk->thread.trap_no = 14;
499 info.si_signo = SIGSEGV;
500 info.si_errno = 0;
501 /* info.si_code has been set above */
502 info.si_addr = (void __user *)address;
503 force_sig_info(SIGSEGV, &info, tsk);
504 return;
505 }
506
507no_context:
508
509 /* Are we prepared to handle this kernel fault? */
510 fixup = search_exception_tables(regs->rip);
511 if (fixup) {
512 regs->rip = fixup->fixup;
513 return;
514 }
515
516 /*
517 * Hall of shame of CPU/BIOS bugs.
518 */
519
520 if (is_prefetch(regs, address, error_code))
521 return;
522
523 if (is_errata93(regs, address))
524 return;
525
526/*
527 * Oops. The kernel tried to access some bad page. We'll have to
528 * terminate things with extreme prejudice.
529 */
530
531 oops_begin();
532
533 if (address < PAGE_SIZE)
534 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
535 else
536 printk(KERN_ALERT "Unable to handle kernel paging request");
537 printk(" at %016lx RIP: \n" KERN_ALERT,address);
538 printk_address(regs->rip);
539 printk("\n");
540 dump_pagetable(address);
541 __die("Oops", regs, error_code);
542 /* Executive summary in case the body of the oops scrolled away */
543 printk(KERN_EMERG "CR2: %016lx\n", address);
544 oops_end();
545 do_exit(SIGKILL);
546
547/*
548 * We ran out of memory, or some other thing happened to us that made
549 * us unable to handle the page fault gracefully.
550 */
551out_of_memory:
552 up_read(&mm->mmap_sem);
553out_of_memory2:
554 if (current->pid == 1) {
555 yield();
556 goto again;
557 }
558 printk("VM: killing process %s\n", tsk->comm);
559 if (error_code & 4)
560 do_exit(SIGKILL);
561 goto no_context;
562
563do_sigbus:
564 up_read(&mm->mmap_sem);
565
566 /* Kernel mode? Handle exceptions or die */
567 if (!(error_code & 4))
568 goto no_context;
569
570 tsk->thread.cr2 = address;
571 tsk->thread.error_code = error_code;
572 tsk->thread.trap_no = 14;
573 info.si_signo = SIGBUS;
574 info.si_errno = 0;
575 info.si_code = BUS_ADRERR;
576 info.si_addr = (void __user *)address;
577 force_sig_info(SIGBUS, &info, tsk);
578 return;
579}
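
For quick reference when reading oopses, the error_code layout documented at the top of do_page_fault decodes mechanically. A small userspace helper (the bit meanings follow what this handler actually tests: bit 3 for reserved bits, bit 4 for instruction fetches):

#include <stdio.h>

static void decode_pf_error(unsigned long ec)
{
        printf("%s, %s, %s%s%s\n",
               (ec & 1)  ? "protection violation" : "page not present",
               (ec & 2)  ? "write"                : "read",
               (ec & 4)  ? "user mode"            : "kernel mode",
               (ec & 8)  ? ", reserved bit set"   : "",
               (ec & 16) ? ", instruction fetch"  : "");
}

int main(void)
{
        decode_pf_error(0x6);   /* page not present, write, user mode */
        decode_pf_error(0x11);  /* protection violation, read, kernel, ifetch */
        return 0;
}
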
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
new file mode 100644
index 000000000000..b0d604551d86
--- /dev/null
+++ b/arch/x86_64/mm/init.c
@@ -0,0 +1,630 @@
1/*
2 * linux/arch/x86_64/mm/init.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7 */
8
9#include <linux/config.h>
10#include <linux/signal.h>
11#include <linux/sched.h>
12#include <linux/kernel.h>
13#include <linux/errno.h>
14#include <linux/string.h>
15#include <linux/types.h>
16#include <linux/ptrace.h>
17#include <linux/mman.h>
18#include <linux/mm.h>
19#include <linux/swap.h>
20#include <linux/smp.h>
21#include <linux/init.h>
22#include <linux/pagemap.h>
23#include <linux/bootmem.h>
24#include <linux/proc_fs.h>
25
26#include <asm/processor.h>
27#include <asm/system.h>
28#include <asm/uaccess.h>
29#include <asm/pgtable.h>
30#include <asm/pgalloc.h>
31#include <asm/dma.h>
32#include <asm/fixmap.h>
33#include <asm/e820.h>
34#include <asm/apic.h>
35#include <asm/tlb.h>
36#include <asm/mmu_context.h>
37#include <asm/proto.h>
38#include <asm/smp.h>
39
40#ifndef Dprintk
41#define Dprintk(x...)
42#endif
43
44#ifdef CONFIG_GART_IOMMU
45extern int swiotlb;
46#endif
47
48extern char _stext[];
49
50DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
51
52/*
 53 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
 54 * physical space so we can cache the location of the first one and move
55 * around without checking the pgd every time.
56 */
57
58void show_mem(void)
59{
60 int i, total = 0, reserved = 0;
61 int shared = 0, cached = 0;
62 pg_data_t *pgdat;
63 struct page *page;
64
65 printk("Mem-info:\n");
66 show_free_areas();
67 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
68
69 for_each_pgdat(pgdat) {
70 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
71 page = pfn_to_page(pgdat->node_start_pfn + i);
72 total++;
73 if (PageReserved(page))
74 reserved++;
75 else if (PageSwapCache(page))
76 cached++;
77 else if (page_count(page))
78 shared += page_count(page) - 1;
79 }
80 }
81 printk("%d pages of RAM\n", total);
82 printk("%d reserved pages\n",reserved);
83 printk("%d pages shared\n",shared);
84 printk("%d pages swap cached\n",cached);
85}
86
87/* References to section boundaries */
88
89extern char _text, _etext, _edata, __bss_start, _end[];
90extern char __init_begin, __init_end;
91
92int after_bootmem;
93
94static void *spp_getpage(void)
95{
96 void *ptr;
97 if (after_bootmem)
98 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
99 else
100 ptr = alloc_bootmem_pages(PAGE_SIZE);
101 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
102 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
103
104 Dprintk("spp_getpage %p\n", ptr);
105 return ptr;
106}
107
108static void set_pte_phys(unsigned long vaddr,
109 unsigned long phys, pgprot_t prot)
110{
111 pgd_t *pgd;
112 pud_t *pud;
113 pmd_t *pmd;
114 pte_t *pte, new_pte;
115
116 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
117
118 pgd = pgd_offset_k(vaddr);
119 if (pgd_none(*pgd)) {
120 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
121 return;
122 }
123 pud = pud_offset(pgd, vaddr);
124 if (pud_none(*pud)) {
125 pmd = (pmd_t *) spp_getpage();
126 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
127 if (pmd != pmd_offset(pud, 0)) {
128 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
129 return;
130 }
131 }
132 pmd = pmd_offset(pud, vaddr);
133 if (pmd_none(*pmd)) {
134 pte = (pte_t *) spp_getpage();
135 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
136 if (pte != pte_offset_kernel(pmd, 0)) {
137 printk("PAGETABLE BUG #02!\n");
138 return;
139 }
140 }
141 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
142
143 pte = pte_offset_kernel(pmd, vaddr);
144 if (!pte_none(*pte) &&
145 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
146 pte_ERROR(*pte);
147 set_pte(pte, new_pte);
148
149 /*
150 * It's enough to flush this one mapping.
151 * (PGE mappings get flushed as well)
152 */
153 __flush_tlb_one(vaddr);
154}
155
156/* NOTE: this is meant to be run only at boot */
157void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
158{
159 unsigned long address = __fix_to_virt(idx);
160
161 if (idx >= __end_of_fixed_addresses) {
162 printk("Invalid __set_fixmap\n");
163 return;
164 }
165 set_pte_phys(address, phys, prot);
166}
167
168unsigned long __initdata table_start, table_end;
169
170extern pmd_t temp_boot_pmds[];
171
172static struct temp_map {
173 pmd_t *pmd;
174 void *address;
175 int allocated;
176} temp_mappings[] __initdata = {
177 { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
178 { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
179 {}
180};
181
182static __init void *alloc_low_page(int *index, unsigned long *phys)
183{
184 struct temp_map *ti;
185 int i;
186 unsigned long pfn = table_end++, paddr;
187 void *adr;
188
189 if (pfn >= end_pfn)
190 panic("alloc_low_page: ran out of memory");
191 for (i = 0; temp_mappings[i].allocated; i++) {
192 if (!temp_mappings[i].pmd)
193 panic("alloc_low_page: ran out of temp mappings");
194 }
195 ti = &temp_mappings[i];
196 paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
197 set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
198 ti->allocated = 1;
199 __flush_tlb();
200 adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
201 *index = i;
202 *phys = pfn * PAGE_SIZE;
203 return adr;
204}
205
206static __init void unmap_low_page(int i)
207{
208 struct temp_map *ti = &temp_mappings[i];
209 set_pmd(ti->pmd, __pmd(0));
210 ti->allocated = 0;
211}
212
213static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
214{
215 long i, j;
216
217 i = pud_index(address);
218 pud = pud + i;
219 for (; i < PTRS_PER_PUD; pud++, i++) {
220 int map;
221 unsigned long paddr, pmd_phys;
222 pmd_t *pmd;
223
224 paddr = address + i*PUD_SIZE;
225 if (paddr >= end) {
226 for (; i < PTRS_PER_PUD; i++, pud++)
227 set_pud(pud, __pud(0));
228 break;
229 }
230
231 if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
232 set_pud(pud, __pud(0));
233 continue;
234 }
235
236 pmd = alloc_low_page(&map, &pmd_phys);
237 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
238 for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
239 unsigned long pe;
240
241 if (paddr >= end) {
242 for (; j < PTRS_PER_PMD; j++, pmd++)
243 set_pmd(pmd, __pmd(0));
244 break;
245 }
246 pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
247 pe &= __supported_pte_mask;
248 set_pmd(pmd, __pmd(pe));
249 }
250 unmap_low_page(map);
251 }
252 __flush_tlb();
253}
254
255static void __init find_early_table_space(unsigned long end)
256{
257 unsigned long puds, pmds, tables;
258
259 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
260 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
261 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
262 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
263
264 table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
265 if (table_start == -1UL)
266 panic("Cannot find space for the kernel page tables");
267
268 table_start >>= PAGE_SHIFT;
269 table_end = table_start;
270}
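
The sizing in find_early_table_space is easy to sanity-check by hand: mapping 4GB needs 4 pud entries and 2048 pmd entries, i.e. one page of puds plus four pages of pmds. A standalone check (constants assumed to match this tree; entries are 8 bytes):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PUD_SHIFT 30                /* 1GB per pud entry */
#define PMD_SHIFT 21                /* 2MB per pmd entry */
#define ROUND_UP(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
        unsigned long end = 4UL << 30;      /* 4GB of physical memory */
        unsigned long puds = (end + (1UL << PUD_SHIFT) - 1) >> PUD_SHIFT;
        unsigned long pmds = (end + (1UL << PMD_SHIFT) - 1) >> PMD_SHIFT;
        unsigned long tables = ROUND_UP(puds * 8, PAGE_SIZE) +
                               ROUND_UP(pmds * 8, PAGE_SIZE);

        printf("%lu puds, %lu pmds, %lu bytes\n", puds, pmds, tables);
        /* prints: 4 puds, 2048 pmds, 20480 bytes */
        return 0;
}
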
271
272/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
273 This runs before bootmem is initialized and gets pages directly from the
274 physical memory. To access them they are temporarily mapped. */
275void __init init_memory_mapping(unsigned long start, unsigned long end)
276{
277 unsigned long next;
278
279 Dprintk("init_memory_mapping\n");
280
281 /*
282 * Find space for the kernel direct mapping tables.
283 * Later we should allocate these tables in the local node of the memory
284 * mapped. Unfortunately this is done currently before the nodes are
285 * discovered.
286 */
287 find_early_table_space(end);
288
289 start = (unsigned long)__va(start);
290 end = (unsigned long)__va(end);
291
292 for (; start < end; start = next) {
293 int map;
294 unsigned long pud_phys;
295 pud_t *pud = alloc_low_page(&map, &pud_phys);
296 next = start + PGDIR_SIZE;
297 if (next > end)
298 next = end;
299 phys_pud_init(pud, __pa(start), __pa(next));
300 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
301 unmap_low_page(map);
302 }
303
304 asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
305 __flush_tlb_all();
 306	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", end,
307 table_start<<PAGE_SHIFT,
308 table_end<<PAGE_SHIFT);
309}
310
311extern struct x8664_pda cpu_pda[NR_CPUS];
312
313/* Assumes all CPUs still execute in init_mm */
314void zap_low_mappings(void)
315{
316 pgd_t *pgd = pgd_offset_k(0UL);
317 pgd_clear(pgd);
318 flush_tlb_all();
319}
320
321#ifndef CONFIG_DISCONTIGMEM
322void __init paging_init(void)
323{
324 {
325 unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
326 unsigned int max_dma;
327
328 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
329
330 if (end_pfn < max_dma)
331 zones_size[ZONE_DMA] = end_pfn;
332 else {
333 zones_size[ZONE_DMA] = max_dma;
334 zones_size[ZONE_NORMAL] = end_pfn - max_dma;
335 }
336 free_area_init(zones_size);
337 }
338 return;
339}
340#endif
341
342/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
343 from the CPU leading to inconsistent cache lines. address and size
344 must be aligned to 2MB boundaries.
345 Does nothing when the mapping doesn't exist. */
346void __init clear_kernel_mapping(unsigned long address, unsigned long size)
347{
348 unsigned long end = address + size;
349
350 BUG_ON(address & ~LARGE_PAGE_MASK);
351 BUG_ON(size & ~LARGE_PAGE_MASK);
352
353 for (; address < end; address += LARGE_PAGE_SIZE) {
354 pgd_t *pgd = pgd_offset_k(address);
355 pud_t *pud;
356 pmd_t *pmd;
357 if (pgd_none(*pgd))
358 continue;
359 pud = pud_offset(pgd, address);
360 if (pud_none(*pud))
361 continue;
362 pmd = pmd_offset(pud, address);
363 if (!pmd || pmd_none(*pmd))
364 continue;
365 if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
366 /* Could handle this, but it should not happen currently. */
367 printk(KERN_ERR
368 "clear_kernel_mapping: mapping has been split. will leak memory\n");
369 pmd_ERROR(*pmd);
370 }
371 set_pmd(pmd, __pmd(0));
372 }
373 __flush_tlb_all();
374}
375
376static inline int page_is_ram (unsigned long pagenr)
377{
378 int i;
379
380 for (i = 0; i < e820.nr_map; i++) {
381 unsigned long addr, end;
382
383 if (e820.map[i].type != E820_RAM) /* not usable memory */
384 continue;
385 /*
386 * !!!FIXME!!! Some BIOSen report areas as RAM that
 387	 * are not. Notably the 640KB->1MB area. We need a sanity
388 * check here.
389 */
390 addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
391 end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
392 if ((pagenr >= addr) && (pagenr < end))
393 return 1;
394 }
395 return 0;
396}
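
Note how the range check rounds the region start up and truncates the end, so partial pages at the edges of an e820 entry are never treated as RAM. The same arithmetic in isolation, with an invented map:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)

struct range { unsigned long addr, size; };

static int page_is_ram_sketch(const struct range *map, int n, unsigned long pagenr)
{
        int i;
        for (i = 0; i < n; i++) {
                /* round the start up and the end down to whole pages */
                unsigned long start = (map[i].addr + PAGE_SIZE - 1) >> PAGE_SHIFT;
                unsigned long end   = (map[i].addr + map[i].size) >> PAGE_SHIFT;
                if (pagenr >= start && pagenr < end)
                        return 1;
        }
        return 0;
}

int main(void)
{
        struct range map[] = { { 0x1000, 0x5000 } };       /* RAM at 4KB..24KB */
        printf("%d %d\n", page_is_ram_sketch(map, 1, 1),   /* 1 */
                          page_is_ram_sketch(map, 1, 6));  /* 0 */
        return 0;
}
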
397
398extern int swiotlb_force;
399
400static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
401 kcore_vsyscall;
402
403void __init mem_init(void)
404{
405 int codesize, reservedpages, datasize, initsize;
406 int tmp;
407
408#ifdef CONFIG_SWIOTLB
409 if (swiotlb_force)
410 swiotlb = 1;
411 if (!iommu_aperture &&
412 (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu))
413 swiotlb = 1;
414 if (swiotlb)
415 swiotlb_init();
416#endif
417
418 /* How many end-of-memory variables you have, grandma! */
419 max_low_pfn = end_pfn;
420 max_pfn = end_pfn;
421 num_physpages = end_pfn;
422 high_memory = (void *) __va(end_pfn * PAGE_SIZE);
423
424 /* clear the zero-page */
425 memset(empty_zero_page, 0, PAGE_SIZE);
426
427 reservedpages = 0;
428
429 /* this will put all low memory onto the freelists */
430#ifdef CONFIG_DISCONTIGMEM
431 totalram_pages += numa_free_all_bootmem();
432 tmp = 0;
433 /* should count reserved pages here for all nodes */
434#else
435 max_mapnr = end_pfn;
436 if (!mem_map) BUG();
437
438 totalram_pages += free_all_bootmem();
439
440 for (tmp = 0; tmp < end_pfn; tmp++)
441 /*
442 * Only count reserved RAM pages
443 */
444 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
445 reservedpages++;
446#endif
447
448 after_bootmem = 1;
449
450 codesize = (unsigned long) &_etext - (unsigned long) &_text;
451 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
452 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
453
454 /* Register memory areas for /proc/kcore */
455 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
456 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
457 VMALLOC_END-VMALLOC_START);
458 kclist_add(&kcore_kernel, &_stext, _end - _stext);
459 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
460 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
461 VSYSCALL_END - VSYSCALL_START);
462
463 printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n",
464 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
465 end_pfn << (PAGE_SHIFT-10),
466 codesize >> 10,
467 reservedpages << (PAGE_SHIFT-10),
468 datasize >> 10,
469 initsize >> 10);
470
471 /*
472 * Subtle. SMP is doing its boot stuff late (because it has to
473 * fork idle threads) - but it also needs low mappings for the
474 * protected-mode entry to work. We zap these entries only after
475 * the WP-bit has been tested.
476 */
477#ifndef CONFIG_SMP
478 zap_low_mappings();
479#endif
480}
481
482extern char __initdata_begin[], __initdata_end[];
483
484void free_initmem(void)
485{
486 unsigned long addr;
487
488 addr = (unsigned long)(&__init_begin);
489 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
490 ClearPageReserved(virt_to_page(addr));
491 set_page_count(virt_to_page(addr), 1);
492 memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
493 free_page(addr);
494 totalram_pages++;
495 }
496 memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
497 printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10);
498}
499
500#ifdef CONFIG_BLK_DEV_INITRD
501void free_initrd_mem(unsigned long start, unsigned long end)
502{
503 if (start < (unsigned long)&_end)
504 return;
505 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
506 for (; start < end; start += PAGE_SIZE) {
507 ClearPageReserved(virt_to_page(start));
508 set_page_count(virt_to_page(start), 1);
509 free_page(start);
510 totalram_pages++;
511 }
512}
513#endif
514
515void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
516{
517 /* Should check here against the e820 map to avoid double free */
518#ifdef CONFIG_DISCONTIGMEM
519 int nid = phys_to_nid(phys);
520 reserve_bootmem_node(NODE_DATA(nid), phys, len);
521#else
522 reserve_bootmem(phys, len);
523#endif
524}
525
526int kern_addr_valid(unsigned long addr)
527{
528 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
529 pgd_t *pgd;
530 pud_t *pud;
531 pmd_t *pmd;
532 pte_t *pte;
533
534 if (above != 0 && above != -1UL)
535 return 0;
536
537 pgd = pgd_offset_k(addr);
538 if (pgd_none(*pgd))
539 return 0;
540
541 pud = pud_offset(pgd, addr);
542 if (pud_none(*pud))
543 return 0;
544
545 pmd = pmd_offset(pud, addr);
546 if (pmd_none(*pmd))
547 return 0;
548 if (pmd_large(*pmd))
549 return pfn_valid(pmd_pfn(*pmd));
550
551 pte = pte_offset_kernel(pmd, addr);
552 if (pte_none(*pte))
553 return 0;
554 return pfn_valid(pte_pfn(*pte));
555}
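
The first test in kern_addr_valid is a canonical-address filter: shifting the address arithmetically by __VIRTUAL_MASK_SHIFT must leave all zeros (lower half) or all ones (upper half). Standalone, assuming the 48-bit value this tree uses and an arithmetic right shift (true of gcc on x86-64, as the kernel code itself assumes):

#include <stdio.h>

#define VIRTUAL_MASK_SHIFT 48

static int plausible(unsigned long addr)
{
        long above = ((long)addr) >> VIRTUAL_MASK_SHIFT;   /* arithmetic shift */
        return above == 0 || above == -1L;
}

int main(void)
{
        printf("%d\n", plausible(0x00007fffffffe000UL));   /* 1: lower half */
        printf("%d\n", plausible(0xffffffff80100000UL));   /* 1: upper half */
        printf("%d\n", plausible(0x0001000000000000UL));   /* 0: neither */
        return 0;
}
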
556
557#ifdef CONFIG_SYSCTL
558#include <linux/sysctl.h>
559
560extern int exception_trace, page_fault_trace;
561
562static ctl_table debug_table2[] = {
563 { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
564 proc_dointvec },
565#ifdef CONFIG_CHECKING
566 { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
567 proc_dointvec },
568#endif
569 { 0, }
570};
571
572static ctl_table debug_root_table2[] = {
573 { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
574 .child = debug_table2 },
575 { 0 },
576};
577
578static __init int x8664_sysctl_init(void)
579{
580 register_sysctl_table(debug_root_table2, 1);
581 return 0;
582}
583__initcall(x8664_sysctl_init);
584#endif
585
586/* Pseudo VMAs to allow ptrace access for the vsyscall pages. x86-64 has two
 587   different ones: one for 32bit and one for 64bit. Use the appropriate
 588   one for the target task. */
589
590static struct vm_area_struct gate_vma = {
591 .vm_start = VSYSCALL_START,
592 .vm_end = VSYSCALL_END,
593 .vm_page_prot = PAGE_READONLY
594};
595
596static struct vm_area_struct gate32_vma = {
597 .vm_start = VSYSCALL32_BASE,
598 .vm_end = VSYSCALL32_END,
599 .vm_page_prot = PAGE_READONLY
600};
601
602struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
603{
604#ifdef CONFIG_IA32_EMULATION
605 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
606 /* lookup code assumes the pages are present. set them up
607 now */
608 if (__map_syscall32(tsk->mm, VSYSCALL32_BASE) < 0)
609 return NULL;
610 return &gate32_vma;
611 }
612#endif
613 return &gate_vma;
614}
615
616int in_gate_area(struct task_struct *task, unsigned long addr)
617{
618 struct vm_area_struct *vma = get_gate_vma(task);
619 return (addr >= vma->vm_start) && (addr < vma->vm_end);
620}
621
622/* Use this when you have no reliable task/vma, typically from interrupt
623 * context. It is less reliable than using the task's vma and may give
624 * false positives.
625 */
626int in_gate_area_no_task(unsigned long addr)
627{
628 return (((addr >= VSYSCALL_START) && (addr < VSYSCALL_END)) ||
629 ((addr >= VSYSCALL32_BASE) && (addr < VSYSCALL32_END)));
630}
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c
new file mode 100644
index 000000000000..74ec8554b195
--- /dev/null
+++ b/arch/x86_64/mm/ioremap.c
@@ -0,0 +1,283 @@
1/*
2 * arch/x86_64/mm/ioremap.c
3 *
4 * Re-map IO memory to kernel address space so that we can access it.
5 * This is needed for high PCI addresses that aren't mapped in the
 6 * 640k-1MB IO memory area on PCs
7 *
8 * (C) Copyright 1995 1996 Linus Torvalds
9 */
10
11#include <linux/vmalloc.h>
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <asm/io.h>
15#include <asm/pgalloc.h>
16#include <asm/fixmap.h>
17#include <asm/cacheflush.h>
18#include <asm/tlbflush.h>
19#include <asm/proto.h>
20
21#define ISA_START_ADDRESS 0xa0000
22#define ISA_END_ADDRESS 0x100000
23
24static inline void remap_area_pte(pte_t * pte, unsigned long address, unsigned long size,
25 unsigned long phys_addr, unsigned long flags)
26{
27 unsigned long end;
28 unsigned long pfn;
29
30 address &= ~PMD_MASK;
31 end = address + size;
32 if (end > PMD_SIZE)
33 end = PMD_SIZE;
34 if (address >= end)
35 BUG();
36 pfn = phys_addr >> PAGE_SHIFT;
37 do {
38 if (!pte_none(*pte)) {
39 printk("remap_area_pte: page already exists\n");
40 BUG();
41 }
42 set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW |
43 _PAGE_GLOBAL | _PAGE_DIRTY | _PAGE_ACCESSED | flags)));
44 address += PAGE_SIZE;
45 pfn++;
46 pte++;
47 } while (address && (address < end));
48}
49
50static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
51 unsigned long phys_addr, unsigned long flags)
52{
53 unsigned long end;
54
55 address &= ~PUD_MASK;
56 end = address + size;
57 if (end > PUD_SIZE)
58 end = PUD_SIZE;
59 phys_addr -= address;
60 if (address >= end)
61 BUG();
62 do {
63 pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
64 if (!pte)
65 return -ENOMEM;
66 remap_area_pte(pte, address, end - address, address + phys_addr, flags);
67 address = (address + PMD_SIZE) & PMD_MASK;
68 pmd++;
69 } while (address && (address < end));
70 return 0;
71}
72
73static inline int remap_area_pud(pud_t * pud, unsigned long address, unsigned long size,
74 unsigned long phys_addr, unsigned long flags)
75{
76 unsigned long end;
77
78 address &= ~PGDIR_MASK;
79 end = address + size;
80 if (end > PGDIR_SIZE)
81 end = PGDIR_SIZE;
82 phys_addr -= address;
83 if (address >= end)
84 BUG();
85 do {
86 pmd_t * pmd = pmd_alloc(&init_mm, pud, address);
87 if (!pmd)
88 return -ENOMEM;
89 remap_area_pmd(pmd, address, end - address, address + phys_addr, flags);
90 address = (address + PUD_SIZE) & PUD_MASK;
91 pud++;
92 } while (address && (address < end));
93 return 0;
94}
95
96static int remap_area_pages(unsigned long address, unsigned long phys_addr,
97 unsigned long size, unsigned long flags)
98{
99 int error;
100 pgd_t *pgd;
101 unsigned long end = address + size;
102
103 phys_addr -= address;
104 pgd = pgd_offset_k(address);
105 flush_cache_all();
106 if (address >= end)
107 BUG();
108 spin_lock(&init_mm.page_table_lock);
109 do {
110 pud_t *pud;
111 pud = pud_alloc(&init_mm, pgd, address);
112 error = -ENOMEM;
113 if (!pud)
114 break;
115 if (remap_area_pud(pud, address, end - address,
116 phys_addr + address, flags))
117 break;
118 error = 0;
119 address = (address + PGDIR_SIZE) & PGDIR_MASK;
120 pgd++;
121 } while (address && (address < end));
122 spin_unlock(&init_mm.page_table_lock);
123 flush_tlb_all();
124 return error;
125}
126
127/*
128 * Fix up the linear direct mapping of the kernel to avoid cache attribute
129 * conflicts.
130 */
131static int
132ioremap_change_attr(unsigned long phys_addr, unsigned long size,
133 unsigned long flags)
134{
135 int err = 0;
136 if (flags && phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) {
137 unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
138 unsigned long vaddr = (unsigned long) __va(phys_addr);
139
140 /*
 141		 * Must use an address here and not a struct page because the phys addr
 142		 * can be in a hole between nodes and not have a memmap entry.
143 */
144 err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
145 if (!err)
146 global_flush_tlb();
147 }
148 return err;
149}
150
151/*
152 * Generic mapping function
153 */
154
155/*
156 * Remap an arbitrary physical address space into the kernel virtual
157 * address space. Needed when the kernel wants to access high addresses
158 * directly.
159 *
160 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
161 * have to convert them into an offset in a page-aligned mapping, but the
162 * caller shouldn't need to know that small detail.
163 */
164void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
165{
166 void * addr;
167 struct vm_struct * area;
168 unsigned long offset, last_addr;
169
170 /* Don't allow wraparound or zero size */
171 last_addr = phys_addr + size - 1;
172 if (!size || last_addr < phys_addr)
173 return NULL;
174
175 /*
176 * Don't remap the low PCI/ISA area, it's always mapped..
177 */
178 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
179 return (__force void __iomem *)phys_to_virt(phys_addr);
180
181#ifndef CONFIG_DISCONTIGMEM
182 /*
183 * Don't allow anybody to remap normal RAM that we're using..
184 */
185 if (last_addr < virt_to_phys(high_memory)) {
186 char *t_addr, *t_end;
187 struct page *page;
188
189 t_addr = __va(phys_addr);
190 t_end = t_addr + (size - 1);
191
192 for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
193 if(!PageReserved(page))
194 return NULL;
195 }
196#endif
197
198 /*
199 * Mappings have to be page-aligned
200 */
201 offset = phys_addr & ~PAGE_MASK;
202 phys_addr &= PAGE_MASK;
203 size = PAGE_ALIGN(last_addr+1) - phys_addr;
204
205 /*
206 * Ok, go for it..
207 */
208 area = get_vm_area(size, VM_IOREMAP | (flags << 20));
209 if (!area)
210 return NULL;
211 area->phys_addr = phys_addr;
212 addr = area->addr;
213 if (remap_area_pages((unsigned long) addr, phys_addr, size, flags)) {
214 remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
215 return NULL;
216 }
217 if (ioremap_change_attr(phys_addr, size, flags) < 0) {
218 area->flags &= 0xffffff;
219 vunmap(addr);
220 return NULL;
221 }
222 return (__force void __iomem *) (offset + (char *)addr);
223}
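
The alignment bookkeeping at the end of __ioremap — carve the sub-page offset out, map whole pages, add the offset back to the returned pointer — is worth seeing with numbers. A sketch with a made-up device address:

#include <stdio.h>

#define PAGE_SIZE  4096UL
#define PAGE_MASK  (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
        unsigned long phys = 0xfebc0010UL, size = 0x20UL;  /* hypothetical BAR */
        unsigned long last = phys + size - 1;
        unsigned long offset = phys & ~PAGE_MASK;

        phys &= PAGE_MASK;
        size = PAGE_ALIGN(last + 1) - phys;
        printf("map %#lx bytes at %#lx, return base+%#lx\n",
               size, phys, offset);     /* 0x1000 at 0xfebc0000, +0x10 */
        return 0;
}
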
224
225/**
226 * ioremap_nocache - map bus memory into CPU space
227 * @offset: bus address of the memory
228 * @size: size of the resource to map
229 *
230 * ioremap_nocache performs a platform specific sequence of operations to
231 * make bus memory CPU accessible via the readb/readw/readl/writeb/
232 * writew/writel functions and the other mmio helpers. The returned
233 * address is not guaranteed to be usable directly as a virtual
234 * address.
235 *
 236 * This version of ioremap ensures that the memory is marked uncacheable
237 * on the CPU as well as honouring existing caching rules from things like
238 * the PCI bus. Note that there are other caches and buffers on many
239 * busses. In particular driver authors should read up on PCI writes
240 *
241 * It's useful if some control registers are in such an area and
242 * write combining or read caching is not desirable:
243 *
244 * Must be freed with iounmap.
245 */
246
247void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
248{
249 return __ioremap(phys_addr, size, _PAGE_PCD);
250}
251
252void iounmap(volatile void __iomem *addr)
253{
254 struct vm_struct *p, **pprev;
255
256 if (addr <= high_memory)
257 return;
258 if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
259 addr < phys_to_virt(ISA_END_ADDRESS))
260 return;
261
262 write_lock(&vmlist_lock);
263 for (p = vmlist, pprev = &vmlist; p != NULL; pprev = &p->next, p = *pprev)
264 if (p->addr == (void *)(PAGE_MASK & (unsigned long)addr))
265 break;
266 if (!p) {
267 printk("__iounmap: bad address %p\n", addr);
268 goto out_unlock;
269 }
270 *pprev = p->next;
271 unmap_vm_area(p);
272 if ((p->flags >> 20) &&
273 p->phys_addr + p->size - 1 < virt_to_phys(high_memory)) {
274 /* p->size includes the guard page, but cpa doesn't like that */
275 change_page_attr(virt_to_page(__va(p->phys_addr)),
276 p->size >> PAGE_SHIFT,
277 PAGE_KERNEL);
278 global_flush_tlb();
279 }
280out_unlock:
281 write_unlock(&vmlist_lock);
282 kfree(p);
283}
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
new file mode 100644
index 000000000000..ec35747aacd7
--- /dev/null
+++ b/arch/x86_64/mm/k8topology.c
@@ -0,0 +1,168 @@
1/*
2 * AMD K8 NUMA support.
3 * Discover the memory map and associated nodes.
4 *
5 * This version reads it directly from the K8 northbridge.
6 *
7 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
8 */
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/string.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14#include <asm/io.h>
15#include <linux/pci_ids.h>
16#include <asm/types.h>
17#include <asm/mmzone.h>
18#include <asm/proto.h>
19#include <asm/e820.h>
20#include <asm/pci-direct.h>
21#include <asm/numa.h>
22
23static __init int find_northbridge(void)
24{
25 int num;
26
27 for (num = 0; num < 32; num++) {
28 u32 header;
29
30 header = read_pci_config(0, num, 0, 0x00);
31 if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)))
32 continue;
33
34 header = read_pci_config(0, num, 1, 0x00);
35 if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)))
36 continue;
37 return num;
38 }
39
40 return -1;
41}
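
The header words compared above are PCI config dwords packing the vendor ID in the low half and the device ID in the high half: function 0 of a K8 northbridge reads as 1022:1100 and function 1 (the address map) as 1022:1101. A quick illustration (0x1022 is the AMD vendor ID):

#include <stdio.h>

#define PCI_VENDOR_ID_AMD 0x1022

int main(void)
{
        unsigned int header = PCI_VENDOR_ID_AMD | (0x1100 << 16);

        printf("%04x:%04x\n", header & 0xffff, header >> 16);  /* 1022:1100 */
        return 0;
}
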
42
43int __init k8_scan_nodes(unsigned long start, unsigned long end)
44{
45 unsigned long prevbase;
46 struct node nodes[8];
47 int nodeid, i, nb;
48 int found = 0;
49 u32 reg;
50 unsigned numnodes;
51 nodemask_t nodes_parsed;
52
53 nodes_clear(nodes_parsed);
54
55 nb = find_northbridge();
56 if (nb < 0)
57 return nb;
58
59 printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
60
61 reg = read_pci_config(0, nb, 0, 0x60);
62 numnodes = ((reg >> 4) & 0xF) + 1;
63
64 printk(KERN_INFO "Number of nodes %d\n", numnodes);
65
66 memset(&nodes,0,sizeof(nodes));
67 prevbase = 0;
68 for (i = 0; i < 8; i++) {
69 unsigned long base,limit;
70
71 base = read_pci_config(0, nb, 1, 0x40 + i*8);
72 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
73
74 nodeid = limit & 7;
75 if ((base & 3) == 0) {
76 if (i < numnodes)
77 printk("Skipping disabled node %d\n", i);
78 continue;
79 }
80 if (nodeid >= numnodes) {
81 printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
82 base, limit);
83 continue;
84 }
85
86 if (!limit) {
87 printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i,
88 base);
89 continue;
90 }
91 if ((base >> 8) & 3 || (limit >> 8) & 3) {
92 printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
93 nodeid, (base>>8)&3, (limit>>8) & 3);
94 return -1;
95 }
96 if (node_isset(nodeid, nodes_parsed)) {
97 printk(KERN_INFO "Node %d already present. Skipping\n",
98 nodeid);
99 continue;
100 }
101
102 limit >>= 16;
103 limit <<= 24;
104 limit |= (1<<24)-1;
105
106 if (limit > end_pfn << PAGE_SHIFT)
107 limit = end_pfn << PAGE_SHIFT;
108 if (limit <= base)
109 continue;
110
111 base >>= 16;
112 base <<= 24;
113
114 if (base < start)
115 base = start;
116 if (limit > end)
117 limit = end;
118 if (limit == base) {
119 printk(KERN_ERR "Empty node %d\n", nodeid);
120 continue;
121 }
122 if (limit < base) {
123 printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
124 nodeid, base, limit);
125 continue;
126 }
127
 128		/* Could sort here, but punt for now. Should not happen anyway. */
129 if (prevbase > base) {
130 printk(KERN_ERR "Node map not sorted %lx,%lx\n",
131 prevbase,base);
132 return -1;
133 }
134
135 printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
136 nodeid, base, limit);
137
138 found++;
139
140 nodes[nodeid].start = base;
141 nodes[nodeid].end = limit;
142
143 prevbase = base;
144
145 node_set(nodeid, nodes_parsed);
146 }
147
148 if (!found)
149 return -1;
150
151 memnode_shift = compute_hash_shift(nodes, numnodes);
152 if (memnode_shift < 0) {
153 printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
154 return -1;
155 }
156 printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
157
158 for (i = 0; i < 8; i++) {
159 if (nodes[i].start != nodes[i].end) {
160 /* assume 1:1 NODE:CPU */
161 cpu_to_node[i] = i;
162 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
163 }
164 }
165
166 numa_init_array();
167 return 0;
168}
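
The base/limit registers read in the loop hold address bits 39:24 in their top halves, which is what the shift-by-16-then-24 dance extracts; the limit additionally gets its low 24 bits filled with ones so it points at the last byte of the node. A decode of a hypothetical register pair:

#include <stdio.h>

int main(void)
{
        unsigned int base_reg  = 0x00000003;   /* base 0, read/write enabled */
        unsigned int limit_reg = 0x003f0000;   /* node 0, limit field 0x003f */
        unsigned long base  = ((unsigned long)base_reg  >> 16) << 24;
        unsigned long limit = (((unsigned long)limit_reg >> 16) << 24)
                              | ((1UL << 24) - 1);

        printf("node %u: %#lx-%#lx\n", limit_reg & 7, base, limit);
        /* prints: node 0: 0-0x3fffffff, i.e. the first gigabyte */
        return 0;
}
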
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
new file mode 100644
index 000000000000..fd9f25d7a6c4
--- /dev/null
+++ b/arch/x86_64/mm/numa.c
@@ -0,0 +1,294 @@
1/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
21#ifndef Dprintk
22#define Dprintk(x...)
23#endif
24
25struct pglist_data *node_data[MAX_NUMNODES];
26bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
28int memnode_shift;
29u8 memnodemap[NODEMAPSIZE];
30
31unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
32cpumask_t node_to_cpumask[MAX_NUMNODES];
33
34int numa_off __initdata;
35
36int __init compute_hash_shift(struct node *nodes, int numnodes)
37{
38 int i;
39 int shift = 24;
40 u64 addr;
41
42 /* When in doubt use brute force. */
43 while (shift < 48) {
44 memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
45 for (i = 0; i < numnodes; i++) {
46 if (nodes[i].start == nodes[i].end)
47 continue;
48 for (addr = nodes[i].start;
49 addr < nodes[i].end;
50 addr += (1UL << shift)) {
51 if (memnodemap[addr >> shift] != 0xff &&
52 memnodemap[addr >> shift] != i) {
53 printk(KERN_INFO
54 "node %d shift %d addr %Lx conflict %d\n",
55 i, shift, addr, memnodemap[addr>>shift]);
56 goto next;
57 }
58 memnodemap[addr >> shift] = i;
59 }
60 }
61 return shift;
62 next:
63 shift++;
64 }
65 memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE);
66 return -1;
67}
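
Once a shift survives the conflict scan, node lookup is a single table index: memnodemap[addr >> shift]. A toy population with two nodes (the layout is invented for the example; real tables are sized by NODEMAPSIZE):

#include <stdio.h>

static unsigned char memnodemap[64];

int main(void)
{
        int shift = 24;                            /* 16MB granularity */
        unsigned long addr;

        /* node 0 owns 0-256MB, node 1 owns 256-512MB */
        for (addr = 0; addr < (512UL << 20); addr += 1UL << shift)
                memnodemap[addr >> shift] = (addr < (256UL << 20)) ? 0 : 1;

        printf("node %d\n", memnodemap[(300UL << 20) >> shift]);  /* node 1 */
        return 0;
}
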
68
69/* Initialize bootmem allocator for a node */
70void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
71{
72 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
73 unsigned long nodedata_phys;
74 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
75
76 start = round_up(start, ZONE_ALIGN);
77
78 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
79
80 start_pfn = start >> PAGE_SHIFT;
81 end_pfn = end >> PAGE_SHIFT;
82
83 nodedata_phys = find_e820_area(start, end, pgdat_size);
84 if (nodedata_phys == -1L)
85 panic("Cannot find memory pgdat in node %d\n", nodeid);
86
87 Dprintk("nodedata_phys %lx\n", nodedata_phys);
88
89 node_data[nodeid] = phys_to_virt(nodedata_phys);
90 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
91 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
92 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
93 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
94
95 /* Find a place for the bootmem map */
96 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
97 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
98 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
99 if (bootmap_start == -1L)
 100 		panic("Not enough contiguous space for bootmap on node %d", nodeid);
101 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
102
103 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
104 bootmap_start >> PAGE_SHIFT,
105 start_pfn, end_pfn);
106
107 e820_bootmem_free(NODE_DATA(nodeid), start, end);
108
109 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
110 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
111 node_set_online(nodeid);
112}
113
114/* Initialize final allocator for a zone */
115void __init setup_node_zones(int nodeid)
116{
117 unsigned long start_pfn, end_pfn;
118 unsigned long zones[MAX_NR_ZONES];
119 unsigned long dma_end_pfn;
120
121 memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
122
123 start_pfn = node_start_pfn(nodeid);
124 end_pfn = node_end_pfn(nodeid);
125
126 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
127
128 /* All nodes > 0 have a zero length zone DMA */
129 dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
130 if (start_pfn < dma_end_pfn) {
131 zones[ZONE_DMA] = dma_end_pfn - start_pfn;
132 zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
133 } else {
134 zones[ZONE_NORMAL] = end_pfn - start_pfn;
135 }
136
137 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
138 start_pfn, NULL);
139}
140
141void __init numa_init_array(void)
142{
143 int rr, i;
 144	/* There are unfortunately some poorly designed mainboards around
 145	   that only connect memory to a single CPU. This breaks the 1:1 cpu->node
 146	   mapping. To avoid this, fill in the mapping for all possible
 147	   CPUs, as the number of CPUs is not known yet.
 148	   We round-robin over the existing nodes. */
149 rr = 0;
150 for (i = 0; i < NR_CPUS; i++) {
151 if (cpu_to_node[i] != NUMA_NO_NODE)
152 continue;
153 rr = next_node(rr, node_online_map);
154 if (rr == MAX_NUMNODES)
155 rr = first_node(node_online_map);
156 cpu_to_node[i] = rr;
157 rr++;
158 }
159
160 set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
161}
162
163#ifdef CONFIG_NUMA_EMU
164int numa_fake __initdata = 0;
165
166/* Numa emulation */
167static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
168{
169 int i;
170 struct node nodes[MAX_NUMNODES];
171 unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
172
173 /* Kludge needed for the hash function */
174 if (hweight64(sz) > 1) {
175 unsigned long x = 1;
176 while ((x << 1) < sz)
177 x <<= 1;
178 if (x < sz/2)
179 printk("Numa emulation unbalanced. Complain to maintainer\n");
180 sz = x;
181 }
182
183 memset(&nodes,0,sizeof(nodes));
184 for (i = 0; i < numa_fake; i++) {
185 nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
186 if (i == numa_fake-1)
187 sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
188 nodes[i].end = nodes[i].start + sz;
189 if (i != numa_fake-1)
190 nodes[i].end--;
191 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
192 i,
193 nodes[i].start, nodes[i].end,
194 (nodes[i].end - nodes[i].start) >> 20);
195 node_set_online(i);
196 }
197 memnode_shift = compute_hash_shift(nodes, numa_fake);
198 if (memnode_shift < 0) {
199 memnode_shift = 0;
200 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
201 return -1;
202 }
203 for_each_online_node(i)
204 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
205 numa_init_array();
206 return 0;
207}
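
The hweight64 kludge above rounds the fake node size down to a power of two, because compute_hash_shift can only describe chunks whose size is a power of two. The rounding loop, worked through with an example size:

#include <stdio.h>

int main(void)
{
        unsigned long sz = 0x30000000UL;   /* 768MB per fake node, say */
        unsigned long x = 1;

        while ((x << 1) < sz)
                x <<= 1;                   /* largest power of two below sz */
        printf("%#lx -> %#lx\n", sz, x);   /* 0x30000000 -> 0x20000000 (512MB) */
        return 0;
}
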
208#endif
209
210void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
211{
212 int i;
213
214#ifdef CONFIG_NUMA_EMU
215 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
216 return;
217#endif
218
219#ifdef CONFIG_ACPI_NUMA
220 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
221 end_pfn << PAGE_SHIFT))
222 return;
223#endif
224
225#ifdef CONFIG_K8_NUMA
226 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
227 return;
228#endif
229 printk(KERN_INFO "%s\n",
230 numa_off ? "NUMA turned off" : "No NUMA configuration found");
231
232 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
233 start_pfn << PAGE_SHIFT,
234 end_pfn << PAGE_SHIFT);
235 /* setup dummy node covering all memory */
236 memnode_shift = 63;
237 memnodemap[0] = 0;
238 nodes_clear(node_online_map);
239 node_set_online(0);
240 for (i = 0; i < NR_CPUS; i++)
241 cpu_to_node[i] = 0;
242 node_to_cpumask[0] = cpumask_of_cpu(0);
243 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
244}
245
246__init void numa_add_cpu(int cpu)
247{
248 /* BP is initialized elsewhere */
249 if (cpu)
250 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
251}
252
253unsigned long __init numa_free_all_bootmem(void)
254{
255 int i;
256 unsigned long pages = 0;
257 for_each_online_node(i) {
258 pages += free_all_bootmem_node(NODE_DATA(i));
259 }
260 return pages;
261}
262
263void __init paging_init(void)
264{
265 int i;
266 for_each_online_node(i) {
267 setup_node_zones(i);
268 }
269}
270
271/* [numa=off] */
272__init int numa_setup(char *opt)
273{
274 if (!strncmp(opt,"off",3))
275 numa_off = 1;
276#ifdef CONFIG_NUMA_EMU
277 if(!strncmp(opt, "fake=", 5)) {
 278		numa_fake = simple_strtoul(opt+5, NULL, 0);
279 if (numa_fake >= MAX_NUMNODES)
280 numa_fake = MAX_NUMNODES;
281 }
282#endif
283#ifdef CONFIG_ACPI_NUMA
284 if (!strncmp(opt,"noacpi",6))
285 acpi_numa = -1;
286#endif
287 return 1;
288}
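
numa_setup consumes the numa= boot parameter, so the accepted spellings are numa=off, numa=fake=<N> (with CONFIG_NUMA_EMU) and numa=noacpi (with CONFIG_ACPI_NUMA). The fake= parsing reduces to a strncmp plus a strtoul, sketched standalone:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        const char *opt = "fake=4";        /* the text after "numa=" */

        if (!strncmp(opt, "fake=", 5))
                printf("emulating %lu nodes\n", strtoul(opt + 5, NULL, 0));
        return 0;
}
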
289
290EXPORT_SYMBOL(cpu_to_node);
291EXPORT_SYMBOL(node_to_cpumask);
292EXPORT_SYMBOL(memnode_shift);
293EXPORT_SYMBOL(memnodemap);
294EXPORT_SYMBOL(node_data);
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
new file mode 100644
index 000000000000..94862e1ec032
--- /dev/null
+++ b/arch/x86_64/mm/pageattr.c
@@ -0,0 +1,235 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback.
4 */
5
6#include <linux/config.h>
7#include <linux/mm.h>
8#include <linux/sched.h>
9#include <linux/highmem.h>
10#include <linux/module.h>
11#include <linux/slab.h>
12#include <asm/uaccess.h>
13#include <asm/processor.h>
14#include <asm/tlbflush.h>
15#include <asm/io.h>
16
17static inline pte_t *lookup_address(unsigned long address)
18{
19 pgd_t *pgd = pgd_offset_k(address);
20 pud_t *pud;
21 pmd_t *pmd;
22 pte_t *pte;
23 if (pgd_none(*pgd))
24 return NULL;
25 pud = pud_offset(pgd, address);
26 if (!pud_present(*pud))
27 return NULL;
28 pmd = pmd_offset(pud, address);
29 if (!pmd_present(*pmd))
30 return NULL;
31 if (pmd_large(*pmd))
32 return (pte_t *)pmd;
33 pte = pte_offset_kernel(pmd, address);
34 if (pte && !pte_present(*pte))
35 pte = NULL;
36 return pte;
37}
38
39static struct page *split_large_page(unsigned long address, pgprot_t prot,
40 pgprot_t ref_prot)
41{
42 int i;
43 unsigned long addr;
44 struct page *base = alloc_pages(GFP_KERNEL, 0);
45 pte_t *pbase;
46 if (!base)
47 return NULL;
48 address = __pa(address);
49 addr = address & LARGE_PAGE_MASK;
50 pbase = (pte_t *)page_address(base);
51 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
52 pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
53 addr == address ? prot : ref_prot);
54 }
55 return base;
56}
57
58
59static void flush_kernel_map(void *address)
60{
61 if (0 && address && cpu_has_clflush) {
62 /* is this worth it? */
63 int i;
64 for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
65 asm volatile("clflush (%0)" :: "r" (address + i));
66 } else
67 asm volatile("wbinvd":::"memory");
68 if (address)
69 __flush_tlb_one(address);
70 else
71 __flush_tlb_all();
72}
73
74
75static inline void flush_map(unsigned long address)
76{
77 on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
78}
79
80struct deferred_page {
81 struct deferred_page *next;
82 struct page *fpage;
83 unsigned long address;
84};
85static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */
86
87static inline void save_page(unsigned long address, struct page *fpage)
88{
89 struct deferred_page *df;
90 df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL);
91 if (!df) {
92 flush_map(address);
93 __free_page(fpage);
94 } else {
95 df->next = df_list;
96 df->fpage = fpage;
97 df->address = address;
98 df_list = df;
99 }
100}
101
102/*
103 * No more special protections in this 2/4MB area - revert to a
104 * large page again.
105 */
106static void revert_page(unsigned long address, pgprot_t ref_prot)
107{
108 pgd_t *pgd;
109 pud_t *pud;
110 pmd_t *pmd;
111 pte_t large_pte;
112
113 pgd = pgd_offset_k(address);
114 BUG_ON(pgd_none(*pgd));
115 pud = pud_offset(pgd,address);
116 BUG_ON(pud_none(*pud));
117 pmd = pmd_offset(pud, address);
118 BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
119 pgprot_val(ref_prot) |= _PAGE_PSE;
120 large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
121 set_pte((pte_t *)pmd, large_pte);
122}
123
124static int
125__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
126 pgprot_t ref_prot)
127{
128 pte_t *kpte;
129 struct page *kpte_page;
130 unsigned kpte_flags;
131 kpte = lookup_address(address);
132 if (!kpte) return 0;
133 kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
134 kpte_flags = pte_val(*kpte);
135 if (pgprot_val(prot) != pgprot_val(ref_prot)) {
136 if ((kpte_flags & _PAGE_PSE) == 0) {
137 set_pte(kpte, pfn_pte(pfn, prot));
138 } else {
139 /*
140 * split_large_page will take the reference for this change_page_attr
141 * on the split page.
142 */
143 struct page *split = split_large_page(address, prot, ref_prot);
144 if (!split)
145 return -ENOMEM;
146 set_pte(kpte,mk_pte(split, ref_prot));
147 kpte_page = split;
148 }
149 get_page(kpte_page);
150 } else if ((kpte_flags & _PAGE_PSE) == 0) {
151 set_pte(kpte, pfn_pte(pfn, ref_prot));
152 __put_page(kpte_page);
153 } else
154 BUG();
155
156 /* on x86-64 the direct mapping set at boot is not using 4k pages */
157 BUG_ON(PageReserved(kpte_page));
158
159 switch (page_count(kpte_page)) {
160 case 1:
161 save_page(address, kpte_page);
162 revert_page(address, ref_prot);
163 break;
164 case 0:
165 BUG(); /* memleak and failed 2M page regeneration */
166 }
167 return 0;
168}
169
170/*
171 * Change the page attributes of a page in the kernel linear mapping.
172 *
173 * This should be used when a page is mapped with a caching policy other
174 * than write-back somewhere - some CPUs do not like it when mappings with
175 * different caching policies exist. This changes the page attributes of
176 * the kernel linear mapping too.
177 *
178 * The caller needs to ensure that there are no conflicting mappings elsewhere.
179 * This function only deals with the kernel linear map.
180 *
181 * Caller must call global_flush_tlb() after this.
182 */
183int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
184{
185 int err = 0;
186 int i;
187
188 down_write(&init_mm.mmap_sem);
189 for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
190 unsigned long pfn = __pa(address) >> PAGE_SHIFT;
191
192 err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
193 if (err)
194 break;
195		/* Handle the kernel text mapping too, which aliases
196		 * the low KERNEL_TEXT_SIZE bytes of memory */
197 if (__pa(address) < KERNEL_TEXT_SIZE) {
198 unsigned long addr2;
199 pgprot_t prot2 = prot;
200 addr2 = __START_KERNEL_map + __pa(address);
201 pgprot_val(prot2) &= ~_PAGE_NX;
202 err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
203 }
204 }
205 up_write(&init_mm.mmap_sem);
206 return err;
207}
208
209/* Don't call this for MMIO areas that may not have a mem_map entry */
210int change_page_attr(struct page *page, int numpages, pgprot_t prot)
211{
212 unsigned long addr = (unsigned long)page_address(page);
213 return change_page_attr_addr(addr, numpages, prot);
214}
215
216void global_flush_tlb(void)
217{
218 struct deferred_page *df, *next_df;
219
220 down_read(&init_mm.mmap_sem);
221 df = xchg(&df_list, NULL);
222 up_read(&init_mm.mmap_sem);
223 if (!df)
224 return;
225 flush_map((df && !df->next) ? df->address : 0);
226 for (; df; df = next_df) {
227 next_df = df->next;
228 if (df->fpage)
229 __free_page(df->fpage);
230 kfree(df);
231 }
232}
233
234EXPORT_SYMBOL(change_page_attr);
235EXPORT_SYMBOL(global_flush_tlb);
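
As the comment above change_page_attr_addr() requires, the PTE rewrite and the flush are decoupled: the caller changes attributes first, then invokes global_flush_tlb() to drain the deferred-page list. A hedged sketch of a typical caller, assuming PAGE_KERNEL_NOCACHE as the target protection; the wrapper function itself is invented for illustration:

#include <linux/mm.h>
#include <asm/cacheflush.h>

/* Hypothetical helper: map one kernel page uncached, e.g. for RAM
 * that is also accessed with a conflicting caching policy elsewhere. */
static int make_page_uncached(struct page *page)
{
	int err = change_page_attr(page, 1, PAGE_KERNEL_NOCACHE);

	/* change_page_attr() only rewrites PTEs; flushing the TLBs and
	 * caches is the caller's job, per the comment above. */
	global_flush_tlb();
	return err;
}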
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
new file mode 100644
index 000000000000..5d01b31472e1
--- /dev/null
+++ b/arch/x86_64/mm/srat.c
@@ -0,0 +1,217 @@
1/*
2 * ACPI 3.0 based NUMA setup
3 * Copyright 2004 Andi Kleen, SuSE Labs.
4 *
5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6 *
7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8 * Assumes all memory regions belonging to a single proximity domain
9 * are in one chunk. Holes between them will be included in the node.
10 */
11
12#include <linux/kernel.h>
13#include <linux/acpi.h>
14#include <linux/mmzone.h>
15#include <linux/bitmap.h>
16#include <linux/module.h>
17#include <linux/topology.h>
18#include <asm/proto.h>
19#include <asm/numa.h>
20
21static struct acpi_table_slit *acpi_slit;
22
23static nodemask_t nodes_parsed __initdata;
24static nodemask_t nodes_found __initdata;
25static struct node nodes[MAX_NUMNODES] __initdata;
26static __u8 pxm2node[256] = { [0 ... 255] = 0xff };
27
28static __init int setup_node(int pxm)
29{
30 unsigned node = pxm2node[pxm];
31 if (node == 0xff) {
32 if (nodes_weight(nodes_found) >= MAX_NUMNODES)
33 return -1;
34 node = first_unset_node(nodes_found);
35 node_set(node, nodes_found);
36 pxm2node[pxm] = node;
37 }
38 return pxm2node[pxm];
39}
40
41static __init int conflicting_nodes(unsigned long start, unsigned long end)
42{
43 int i;
44 for_each_online_node(i) {
45 struct node *nd = &nodes[i];
46 if (nd->start == nd->end)
47 continue;
48		if (nd->end > start && nd->start < end)
49			return i;
50		if (nd->end == end && nd->start == start)
51			return i;
52 }
53 return -1;
54}
55
56static __init void cutoff_node(int i, unsigned long start, unsigned long end)
57{
58 struct node *nd = &nodes[i];
59 if (nd->start < start) {
60 nd->start = start;
61 if (nd->end < nd->start)
62 nd->start = nd->end;
63 }
64 if (nd->end > end) {
65 if (!(end & 0xfff))
66 end--;
67 nd->end = end;
68 if (nd->start > nd->end)
69 nd->start = nd->end;
70 }
71}
72
73static __init void bad_srat(void)
74{
75 printk(KERN_ERR "SRAT: SRAT not used.\n");
76 acpi_numa = -1;
77}
78
79static inline __init int srat_disabled(void)
80{
81 return numa_off || acpi_numa < 0;
82}
83
84/* Callback for SLIT parsing */
85void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
86{
87 acpi_slit = slit;
88}
89
90/* Callback for Proximity Domain -> LAPIC mapping */
91void __init
92acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
93{
94 int pxm, node;
95 if (srat_disabled() || pa->flags.enabled == 0)
96 return;
97 pxm = pa->proximity_domain;
98 node = setup_node(pxm);
99 if (node < 0) {
100 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
101 bad_srat();
102 return;
103 }
104 if (pa->apic_id >= NR_CPUS) {
105 printk(KERN_ERR "SRAT: lapic %u too large.\n",
106 pa->apic_id);
107 bad_srat();
108 return;
109 }
110 cpu_to_node[pa->apic_id] = node;
111 acpi_numa = 1;
112 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
113 pxm, pa->apic_id, node);
114}
115
116/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
117void __init
118acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
119{
120 struct node *nd;
121 unsigned long start, end;
122 int node, pxm;
123 int i;
124
125 if (srat_disabled() || ma->flags.enabled == 0)
126 return;
127 /* hotplug bit is ignored for now */
128 pxm = ma->proximity_domain;
129 node = setup_node(pxm);
130 if (node < 0) {
131 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
132 bad_srat();
133 return;
134 }
135 start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
136 end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
137 i = conflicting_nodes(start, end);
138 if (i >= 0) {
139 printk(KERN_ERR
140 "SRAT: pxm %d overlap %lx-%lx with node %d(%Lx-%Lx)\n",
141 pxm, start, end, i, nodes[i].start, nodes[i].end);
142 bad_srat();
143 return;
144 }
145 nd = &nodes[node];
146 if (!node_test_and_set(node, nodes_parsed)) {
147 nd->start = start;
148 nd->end = end;
149 } else {
150 if (start < nd->start)
151 nd->start = start;
152 if (nd->end < end)
153 nd->end = end;
154 }
155 if (!(nd->end & 0xfff))
156 nd->end--;
157 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
158 nd->start, nd->end);
159}
160
161void __init acpi_numa_arch_fixup(void) {}
162
163/* Use the information discovered above to actually set up the nodes. */
164int __init acpi_scan_nodes(unsigned long start, unsigned long end)
165{
166 int i;
167 if (acpi_numa <= 0)
168 return -1;
169 memnode_shift = compute_hash_shift(nodes, nodes_weight(nodes_parsed));
170 if (memnode_shift < 0) {
171 printk(KERN_ERR
172 "SRAT: No NUMA node hash function found. Contact maintainer\n");
173 bad_srat();
174 return -1;
175 }
176 for (i = 0; i < MAX_NUMNODES; i++) {
177 if (!node_isset(i, nodes_parsed))
178 continue;
179 cutoff_node(i, start, end);
180 if (nodes[i].start == nodes[i].end) {
181 node_clear(i, nodes_parsed);
182 continue;
183 }
184 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
185 }
186 for (i = 0; i < NR_CPUS; i++) {
187 if (cpu_to_node[i] == NUMA_NO_NODE)
188 continue;
189 if (!node_isset(cpu_to_node[i], nodes_parsed))
190 cpu_to_node[i] = NUMA_NO_NODE;
191 }
192 numa_init_array();
193 return 0;
194}
195
196int node_to_pxm(int n)
197{
198 int i;
199 if (pxm2node[n] == n)
200 return n;
201 for (i = 0; i < 256; i++)
202 if (pxm2node[i] == n)
203 return i;
204 return 0;
205}
206
207int __node_distance(int a, int b)
208{
209 int index;
210
211 if (!acpi_slit)
212 return a == b ? 10 : 20;
213 index = acpi_slit->localities * node_to_pxm(a);
214 return acpi_slit->entry[index + node_to_pxm(b)];
215}
216
217EXPORT_SYMBOL(__node_distance);
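
__node_distance() treats the SLIT as a row-major localities x localities matrix indexed by proximity domain, falling back to 10 (local) / 20 (remote) when no SLIT was provided. A small userspace model of that lookup, with a hypothetical two-domain matrix:

/* Userspace model of the SLIT lookup in __node_distance(). */
#include <stdio.h>

#define LOCALITIES 2
static const unsigned char slit[LOCALITIES * LOCALITIES] = {
	10, 20,		/* distances from pxm 0 to pxm 0, 1 */
	20, 10,		/* distances from pxm 1 to pxm 0, 1 */
};

static int node_distance(int a, int b)
{
	/* row = source domain, column = target domain */
	return slit[LOCALITIES * a + b];
}

int main(void)
{
	printf("0->0: %d  0->1: %d\n", node_distance(0, 0),
	       node_distance(0, 1));
	return 0;
}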
diff --git a/arch/x86_64/oprofile/Kconfig b/arch/x86_64/oprofile/Kconfig
new file mode 100644
index 000000000000..5ade19801b97
--- /dev/null
+++ b/arch/x86_64/oprofile/Kconfig
@@ -0,0 +1,23 @@
1
2menu "Profiling support"
3 depends on EXPERIMENTAL
4
5config PROFILING
6 bool "Profiling support (EXPERIMENTAL)"
7 help
8 Say Y here to enable the extended profiling support mechanisms used
9 by profilers such as OProfile.
10
11
12config OPROFILE
13 tristate "OProfile system profiling (EXPERIMENTAL)"
14 depends on PROFILING
15 help
16 OProfile is a profiling system capable of profiling the
17	  whole system, including the kernel, kernel modules, libraries,
18 and applications.
19
20 If unsure, say N.
21
22endmenu
23
diff --git a/arch/x86_64/oprofile/Makefile b/arch/x86_64/oprofile/Makefile
new file mode 100644
index 000000000000..6be32683e1bc
--- /dev/null
+++ b/arch/x86_64/oprofile/Makefile
@@ -0,0 +1,19 @@
1#
2# oprofile for x86-64.
3# Just reuse the one from i386.
4#
5
6obj-$(CONFIG_OPROFILE) += oprofile.o
7
8DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
9 oprof.o cpu_buffer.o buffer_sync.o \
10 event_buffer.o oprofile_files.o \
11 oprofilefs.o oprofile_stats.o \
12 timer_int.o )
13
14OPROFILE-y := init.o backtrace.o
15OPROFILE-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o op_model_p4.o \
16 op_model_ppro.o
17OPROFILE-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
18
19oprofile-y = $(DRIVER_OBJS) $(addprefix ../../i386/oprofile/, $(OPROFILE-y))
diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile
new file mode 100644
index 000000000000..37c92e841dec
--- /dev/null
+++ b/arch/x86_64/pci/Makefile
@@ -0,0 +1,24 @@
1#
2# Makefile for X86_64 specific PCI routines
3#
4# Reuse the i386 PCI subsystem
5#
6CFLAGS += -Iarch/i386/pci
7
8obj-y := i386.o
9obj-$(CONFIG_PCI_DIRECT)+= direct.o
10obj-y += fixup.o
11obj-$(CONFIG_ACPI_PCI) += acpi.o
12obj-y += legacy.o irq.o common.o
13# mmconfig has a 64bit special
14obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o
15
16obj-$(CONFIG_NUMA) += k8-bus.o
17
18direct-y += ../../i386/pci/direct.o
19acpi-y += ../../i386/pci/acpi.o
20legacy-y += ../../i386/pci/legacy.o
21irq-y += ../../i386/pci/irq.o
22common-y += ../../i386/pci/common.o
23fixup-y += ../../i386/pci/fixup.o
24i386-y += ../../i386/pci/i386.o
diff --git a/arch/x86_64/pci/Makefile-BUS b/arch/x86_64/pci/Makefile-BUS
new file mode 100644
index 000000000000..291985f0d2e4
--- /dev/null
+++ b/arch/x86_64/pci/Makefile-BUS
@@ -0,0 +1,22 @@
1#
2# Makefile for X86_64 specific PCI routines
3#
4# Reuse the i386 PCI subsystem
5#
6CFLAGS += -I arch/i386/pci
7
8obj-y := i386.o
9obj-$(CONFIG_PCI_DIRECT)+= direct.o
10obj-y += fixup.o
11obj-$(CONFIG_ACPI_PCI) += acpi.o
12obj-y += legacy.o irq.o common.o
13# mmconfig has a 64bit special
14obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o
15
16direct-y += ../../i386/pci/direct.o
17acpi-y += ../../i386/pci/acpi.o
18legacy-y += ../../i386/pci/legacy.o
19irq-y += ../../i386/pci/irq.o
20common-y += ../../i386/pci/common.o
21fixup-y += ../../i386/pci/fixup.o
22i386-y += ../../i386/pci/i386.o
diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c
new file mode 100644
index 000000000000..62349c78db57
--- /dev/null
+++ b/arch/x86_64/pci/k8-bus.c
@@ -0,0 +1,78 @@
1#include <linux/init.h>
2#include <linux/pci.h>
3#include <asm/mpspec.h>
4#include <linux/cpumask.h>
5
6/*
7 * This discovers the pcibus <-> node mapping on AMD K8.
8 *
9 * RED-PEN need to call this again on PCI hotplug
10 * RED-PEN empty cpus get reported wrong
11 */
12
13#define NODE_ID_REGISTER 0x60
14#define NODE_ID(dword) (dword & 0x07)
15#define LDT_BUS_NUMBER_REGISTER_0 0x94
16#define LDT_BUS_NUMBER_REGISTER_1 0xB4
17#define LDT_BUS_NUMBER_REGISTER_2 0xD4
18#define NR_LDT_BUS_NUMBER_REGISTERS 3
19#define SECONDARY_LDT_BUS_NUMBER(dword) ((dword >> 8) & 0xFF)
20#define SUBORDINATE_LDT_BUS_NUMBER(dword) ((dword >> 16) & 0xFF)
21#define PCI_DEVICE_ID_K8HTCONFIG 0x1100
22
23/**
24 * fill_mp_bus_to_cpumask()
25 * fills the mp_bus_to_cpumask array according to the LDT Bus Number
26 * Registers found in the K8 northbridge.
27 */
28__init static int
29fill_mp_bus_to_cpumask(void)
30{
31 struct pci_dev *nb_dev = NULL;
32 int i, j, printed;
33 u32 ldtbus, nid;
34 static int lbnr[3] = {
35 LDT_BUS_NUMBER_REGISTER_0,
36 LDT_BUS_NUMBER_REGISTER_1,
37 LDT_BUS_NUMBER_REGISTER_2
38 };
39
40 while ((nb_dev = pci_get_device(PCI_VENDOR_ID_AMD,
41 PCI_DEVICE_ID_K8HTCONFIG, nb_dev))) {
42 pci_read_config_dword(nb_dev, NODE_ID_REGISTER, &nid);
43
44 for (i = 0; i < NR_LDT_BUS_NUMBER_REGISTERS; i++) {
45 pci_read_config_dword(nb_dev, lbnr[i], &ldtbus);
46 /*
47			 * If there are no busses hanging off the current
48			 * LDT link, both the secondary and subordinate
49			 * bus number fields are set to 0.
50 */
51 if (!(SECONDARY_LDT_BUS_NUMBER(ldtbus) == 0
52 && SUBORDINATE_LDT_BUS_NUMBER(ldtbus) == 0)) {
53 for (j = SECONDARY_LDT_BUS_NUMBER(ldtbus);
54 j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus);
55 j++)
56 pci_bus_to_cpumask[j] =
57 node_to_cpumask(NODE_ID(nid));
58 }
59 }
60 }
61
62 /* quick sanity check */
63 printed = 0;
64 for (i = 0; i < 256; i++) {
65 if (cpus_empty(pci_bus_to_cpumask[i])) {
66 pci_bus_to_cpumask[i] = CPU_MASK_ALL;
67 if (printed)
68 continue;
69 printk(KERN_ERR
70 "k8-bus.c: some busses have empty cpu mask\n");
71 printed = 1;
72 }
73 }
74
75 return 0;
76}
77
78fs_initcall(fill_mp_bus_to_cpumask);
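
Each LDT Bus Number Register packs the secondary and subordinate bus numbers into bits 8-15 and 16-23 of one dword, and the loop above assigns the node's cpumask to every bus in that inclusive range. A quick userspace check of the decode, using a made-up register value:

/* Decode of an LDT Bus Number Register dword (value is hypothetical). */
#include <stdio.h>

#define SECONDARY_LDT_BUS_NUMBER(dword)   (((dword) >> 8) & 0xFF)
#define SUBORDINATE_LDT_BUS_NUMBER(dword) (((dword) >> 16) & 0xFF)

int main(void)
{
	unsigned int ldtbus = 0x00040200;	/* secondary 2, subordinate 4 */

	printf("buses %u..%u belong to this node\n",
	       SECONDARY_LDT_BUS_NUMBER(ldtbus),
	       SUBORDINATE_LDT_BUS_NUMBER(ldtbus));
	return 0;
}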
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
new file mode 100644
index 000000000000..b693c232fd07
--- /dev/null
+++ b/arch/x86_64/pci/mmconfig.c
@@ -0,0 +1,104 @@
1/*
2 * mmconfig.c - Low-level direct PCI config space access via MMCONFIG
3 *
4 * This is a 64-bit optimized version that always keeps the full mmconfig
5 * space mapped. This allows lockless config space operation.
6 */
7
8#include <linux/pci.h>
9#include <linux/init.h>
10#include "pci.h"
11
12#define MMCONFIG_APER_SIZE (256*1024*1024)
13
14/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
15u32 pci_mmcfg_base_addr;
16
17/* Static virtual mapping of the MMCONFIG aperture */
18char *pci_mmcfg_virt;
19
20static inline char *pci_dev_base(unsigned int bus, unsigned int devfn)
21{
22 return pci_mmcfg_virt + ((bus << 20) | (devfn << 12));
23}
24
25static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
26 unsigned int devfn, int reg, int len, u32 *value)
27{
28 char *addr = pci_dev_base(bus, devfn);
29
30 if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095)))
31 return -EINVAL;
32
33 switch (len) {
34 case 1:
35 *value = readb(addr + reg);
36 break;
37 case 2:
38 *value = readw(addr + reg);
39 break;
40 case 4:
41 *value = readl(addr + reg);
42 break;
43 }
44
45 return 0;
46}
47
48static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
49 unsigned int devfn, int reg, int len, u32 value)
50{
51 char *addr = pci_dev_base(bus,devfn);
52
53 if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095)))
54 return -EINVAL;
55
56 switch (len) {
57 case 1:
58 writeb(value, addr + reg);
59 break;
60 case 2:
61 writew(value, addr + reg);
62 break;
63 case 4:
64 writel(value, addr + reg);
65 break;
66 }
67
68 return 0;
69}
70
71static struct pci_raw_ops pci_mmcfg = {
72 .read = pci_mmcfg_read,
73 .write = pci_mmcfg_write,
74};
75
76static int __init pci_mmcfg_init(void)
77{
78 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
79 return 0;
80 if (!pci_mmcfg_base_addr)
81 return 0;
82
83 /* Kludge for now. Don't use mmconfig on AMD systems because
84 those have some busses where mmconfig doesn't work,
85 and we don't parse ACPI MCFG well enough to handle that.
86 Remove when proper handling is added. */
87 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
88 return 0;
89
90 /* RED-PEN i386 doesn't do _nocache right now */
91 pci_mmcfg_virt = ioremap_nocache(pci_mmcfg_base_addr, MMCONFIG_APER_SIZE);
92 if (!pci_mmcfg_virt) {
93		printk(KERN_ERR "PCI: Cannot map mmconfig aperture\n");
94 return 0;
95 }
96
97 printk(KERN_INFO "PCI: Using MMCONFIG at %x\n", pci_mmcfg_base_addr);
98 raw_pci_ops = &pci_mmcfg;
99 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
100
101 return 0;
102}
103
104arch_initcall(pci_mmcfg_init);
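
pci_dev_base() relies on the fixed MMCONFIG geometry: 1MB of config space per bus and 4KB per (device, function) pair, which is exactly why a 256-bus segment needs the 256MB aperture mapped above. A userspace model of the offset arithmetic:

/* Model of pci_dev_base()'s offset math: (bus << 20) | (devfn << 12). */
#include <stdio.h>

static unsigned long mmcfg_offset(unsigned int bus, unsigned int dev,
				  unsigned int fn, unsigned int reg)
{
	unsigned int devfn = (dev << 3) | fn;	/* standard PCI_DEVFN() encoding */

	return ((unsigned long)bus << 20) | (devfn << 12) | reg;
}

int main(void)
{
	/* bus 1, device 2, function 0, config register 0x10 (BAR0) */
	printf("offset = 0x%lx\n", mmcfg_offset(1, 2, 0, 0x10));
	return 0;
}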