author    Dave Kleikamp <shaggy@austin.ibm.com>  2006-01-24 15:34:47 -0500
committer Dave Kleikamp <shaggy@austin.ibm.com>  2006-01-24 15:34:47 -0500
commit    0a0fc0ddbe732779366ab6b1b879f62195e65967
tree      7b42490a676cf39ae0691b6859ecf7fd410f229b /arch/x86_64
parent    4d5dbd0945d9e0833dd7964a3d6ee33157f7cc7a
parent    3ee68c4af3fd7228c1be63254b9f884614f9ebb2

    Merge with /home/shaggy/git/linus-clean/
Diffstat (limited to 'arch/x86_64')
-rw-r--r--  arch/x86_64/Kconfig                      | 143
-rw-r--r--  arch/x86_64/Kconfig.debug                |  19
-rw-r--r--  arch/x86_64/Makefile                     |  12
-rw-r--r--  arch/x86_64/boot/.gitignore              |   3
-rw-r--r--  arch/x86_64/boot/Makefile                |   2
-rw-r--r--  arch/x86_64/boot/compressed/misc.c       |   2
-rw-r--r--  arch/x86_64/boot/compressed/miscsetup.h  |  39
-rw-r--r--  arch/x86_64/boot/install.sh              |  40
-rw-r--r--  arch/x86_64/boot/tools/.gitignore        |   1
-rw-r--r--  arch/x86_64/crypto/aes.c                 |  25
-rw-r--r--  arch/x86_64/defconfig                    | 212
-rw-r--r--  arch/x86_64/ia32/Makefile                |   7
-rw-r--r--  arch/x86_64/ia32/ia32_aout.c             |   3
-rw-r--r--  arch/x86_64/ia32/ia32_binfmt.c           |  29
-rw-r--r--  arch/x86_64/ia32/ia32_ioctl.c            |  79
-rw-r--r--  arch/x86_64/ia32/ia32_signal.c           |  26
-rw-r--r--  arch/x86_64/ia32/ia32entry.S             |  56
-rw-r--r--  arch/x86_64/ia32/mmap32.c                |  78
-rw-r--r--  arch/x86_64/ia32/ptrace32.c              |  59
-rw-r--r--  arch/x86_64/ia32/sys_ia32.c              |  19
-rw-r--r--  arch/x86_64/ia32/vsyscall-sigreturn.S    |   1
-rw-r--r--  arch/x86_64/ia32/vsyscall-syscall.S      |   1
-rw-r--r--  arch/x86_64/ia32/vsyscall-sysenter.S     |   1
-rw-r--r--  arch/x86_64/kernel/Makefile              |  10
-rw-r--r--  arch/x86_64/kernel/aperture.c            |   5
-rw-r--r--  arch/x86_64/kernel/apic.c                | 208
-rw-r--r--  arch/x86_64/kernel/asm-offsets.c         |   4
-rw-r--r--  arch/x86_64/kernel/crash.c               | 156
-rw-r--r--  arch/x86_64/kernel/crash_dump.c          |  47
-rw-r--r--  arch/x86_64/kernel/e820.c                |  24
-rw-r--r--  arch/x86_64/kernel/early_printk.c        |   4
-rw-r--r--  arch/x86_64/kernel/entry.S               |  45
-rw-r--r--  arch/x86_64/kernel/genapic_cluster.c     |   5
-rw-r--r--  arch/x86_64/kernel/genapic_flat.c        |  10
-rw-r--r--  arch/x86_64/kernel/head.S                | 153
-rw-r--r--  arch/x86_64/kernel/head64.c              |  19
-rw-r--r--  arch/x86_64/kernel/i387.c                |   2
-rw-r--r--  arch/x86_64/kernel/i8259.c               |   9
-rw-r--r--  arch/x86_64/kernel/init_task.c           |   2
-rw-r--r--  arch/x86_64/kernel/io_apic.c             | 247
-rw-r--r--  arch/x86_64/kernel/ioport.c              |   1
-rw-r--r--  arch/x86_64/kernel/irq.c                 |   6
-rw-r--r--  arch/x86_64/kernel/kprobes.c             |  18
-rw-r--r--  arch/x86_64/kernel/mce.c                 |  52
-rw-r--r--  arch/x86_64/kernel/mce_amd.c             | 540
-rw-r--r--  arch/x86_64/kernel/mce_intel.c           |   6
-rw-r--r--  arch/x86_64/kernel/mpparse.c             |  23
-rw-r--r--  arch/x86_64/kernel/nmi.c                 |   8
-rw-r--r--  arch/x86_64/kernel/pci-dma.c             | 286
-rw-r--r--  arch/x86_64/kernel/pci-gart.c            | 407
-rw-r--r--  arch/x86_64/kernel/pci-nommu.c           | 145
-rw-r--r--  arch/x86_64/kernel/pci-swiotlb.c         |  42
-rw-r--r--  arch/x86_64/kernel/process.c             | 137
-rw-r--r--  arch/x86_64/kernel/ptrace.c              |  19
-rw-r--r--  arch/x86_64/kernel/reboot.c              |  17
-rw-r--r--  arch/x86_64/kernel/setup.c               | 205
-rw-r--r--  arch/x86_64/kernel/setup64.c             |  50
-rw-r--r--  arch/x86_64/kernel/signal.c              |  17
-rw-r--r--  arch/x86_64/kernel/smp.c                 |  14
-rw-r--r--  arch/x86_64/kernel/smpboot.c             | 160
-rw-r--r--  arch/x86_64/kernel/suspend.c             |   2
-rw-r--r--  arch/x86_64/kernel/sys_x86_64.c          |  14
-rw-r--r--  arch/x86_64/kernel/syscall.c             |   2
-rw-r--r--  arch/x86_64/kernel/time.c                | 127
-rw-r--r--  arch/x86_64/kernel/trampoline.S          |  11
-rw-r--r--  arch/x86_64/kernel/traps.c               | 204
-rw-r--r--  arch/x86_64/kernel/vmlinux.lds.S         |   6
-rw-r--r--  arch/x86_64/kernel/vsmp.c                |  45
-rw-r--r--  arch/x86_64/kernel/vsyscall.c            |  14
-rw-r--r--  arch/x86_64/kernel/x8664_ksyms.c         |  36
-rw-r--r--  arch/x86_64/lib/clear_page.S             |  38
-rw-r--r--  arch/x86_64/lib/copy_page.S              |  87
-rw-r--r--  arch/x86_64/lib/copy_user.S              | 244
-rw-r--r--  arch/x86_64/lib/delay.c                  |   2
-rw-r--r--  arch/x86_64/lib/memcpy.S                 |  93
-rw-r--r--  arch/x86_64/lib/memset.S                 |  94
-rw-r--r--  arch/x86_64/lib/usercopy.c               |  12
-rw-r--r--  arch/x86_64/mm/Makefile                  |   2
-rw-r--r--  arch/x86_64/mm/fault.c                   |  73
-rw-r--r--  arch/x86_64/mm/init.c                    | 333
-rw-r--r--  arch/x86_64/mm/ioremap.c                 |  37
-rw-r--r--  arch/x86_64/mm/k8topology.c              |   1
-rw-r--r--  arch/x86_64/mm/mmap.c                    |  30
-rw-r--r--  arch/x86_64/mm/numa.c                    | 189
-rw-r--r--  arch/x86_64/mm/pageattr.c                |   9
-rw-r--r--  arch/x86_64/mm/srat.c                    |  73
-rw-r--r--  arch/x86_64/pci/Makefile                 |   2
-rw-r--r--  arch/x86_64/pci/Makefile-BUS             |  22
-rw-r--r--  arch/x86_64/pci/mmconfig.c               |  65
89 files changed, 3475 insertions(+), 2352 deletions(-)
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 4cce2f6f170c..2f9deca31cc9 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -69,12 +69,34 @@ config ARCH_MAY_HAVE_PC_FDC
 	bool
 	default y
 
+config DMI
+	bool
+	default y
+
 source "init/Kconfig"
 
 
 menu "Processor type and features"
 
 choice
+	prompt "Subarchitecture Type"
+	default X86_PC
+
+config X86_PC
+	bool "PC-compatible"
+	help
+	  Choose this option if your computer is a standard PC or compatible.
+
+config X86_VSMP
+	bool "Support for ScaleMP vSMP"
+	help
+	  Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
+	  supposed to run on these EM64T-based machines. Only choose this option
+	  if you have one of these machines.
+
+endchoice
+
+choice
 	prompt "Processor family"
 	default MK8
 
@@ -226,22 +248,42 @@ config SCHED_SMT
 
 source "kernel/Kconfig.preempt"
 
-config K8_NUMA
-	bool "K8 NUMA support"
-	select NUMA
+config NUMA
+	bool "Non Uniform Memory Access (NUMA) Support"
 	depends on SMP
 	help
-	  Enable NUMA (Non Unified Memory Architecture) support for
-	  AMD Opteron Multiprocessor systems. The kernel will try to allocate
-	  memory used by a CPU on the local memory controller of the CPU
-	  and add some more NUMA awareness to the kernel.
-	  This code is recommended on all multiprocessor Opteron systems
-	  and normally doesn't hurt on others.
+	  Enable NUMA (Non Uniform Memory Access) support. The kernel
+	  will try to allocate memory used by a CPU on the local memory
+	  controller of the CPU and add some more NUMA awareness to the kernel.
+	  This code is recommended on all multiprocessor Opteron systems.
+	  If the system is EM64T, you should say N unless your system is EM64T
+	  NUMA.
+
+config K8_NUMA
+	bool "Old style AMD Opteron NUMA detection"
+	depends on NUMA
+	default y
+	help
+	  Enable K8 NUMA node topology detection. You should say Y here if
+	  you have a multi processor AMD K8 system. This uses an old
+	  method to read the NUMA configurtion directly from the builtin
+	  Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
+	  instead, which also takes priority if both are compiled in.
+
+# Dummy CONFIG option to select ACPI_NUMA from drivers/acpi/Kconfig.
+
+config X86_64_ACPI_NUMA
+	bool "ACPI NUMA detection"
+	depends on NUMA
+	select ACPI
+	select ACPI_NUMA
+	default y
+	help
+	  Enable ACPI SRAT based node topology detection.
 
 config NUMA_EMU
-	bool "NUMA emulation support"
-	select NUMA
-	depends on SMP
+	bool "NUMA emulation"
+	depends on NUMA
 	help
 	  Enable NUMA emulation. A flat machine will be split
 	  into virtual nodes when booted with "numa=fake=N", where N is the
@@ -252,9 +294,6 @@ config ARCH_DISCONTIGMEM_ENABLE
 	depends on NUMA
 	default y
 
-config NUMA
-	bool
-	default n
 
 config ARCH_DISCONTIGMEM_ENABLE
 	def_bool y
@@ -266,7 +305,11 @@ config ARCH_DISCONTIGMEM_DEFAULT
 
 config ARCH_SPARSEMEM_ENABLE
 	def_bool y
-	depends on NUMA
+	depends on (NUMA || EXPERIMENTAL)
+
+config ARCH_MEMORY_PROBE
+	def_bool y
+	depends on MEMORY_HOTPLUG
 
 config ARCH_FLATMEM_ENABLE
 	def_bool y
@@ -276,6 +319,7 @@ source "mm/Kconfig"
 
 config HAVE_ARCH_EARLY_PFN_TO_NID
 	def_bool y
+	depends on NUMA
 
 config NR_CPUS
 	int "Maximum number of CPUs (2-256)"
@@ -311,7 +355,7 @@ config HPET_TIMER
 	  <http://www.intel.com/hardwaredesign/hpetspec.htm>.
 
 config X86_PM_TIMER
-	bool "PM timer"
+	bool "PM timer" if EMBEDDED
 	depends on ACPI
 	default y
 	help
@@ -330,32 +374,24 @@ config HPET_EMULATE_RTC
 	depends on HPET_TIMER && RTC=y
 
 config GART_IOMMU
-	bool "IOMMU support"
+	bool "K8 GART IOMMU support"
 	default y
+	select SWIOTLB
 	depends on PCI
 	help
 	  Support the IOMMU. Needed to run systems with more than 3GB of memory
 	  properly with 32-bit PCI devices that do not support DAC (Double Address
 	  Cycle). The IOMMU can be turned off at runtime with the iommu=off parameter.
 	  Normally the kernel will take the right choice by itself.
-	  This option includes a driver for the AMD Opteron/Athlon64 IOMMU
-	  and a software emulation used on some other systems.
+	  This option includes a driver for the AMD Opteron/Athlon64 northbridge IOMMU
+	  and a software emulation used on other systems.
 	  If unsure, say Y.
 
 # need this always enabled with GART_IOMMU for the VIA workaround
 config SWIOTLB
-	bool
-	depends on GART_IOMMU
-	default y
-
-config DUMMY_IOMMU
 	bool
-	depends on !GART_IOMMU && !SWIOTLB
 	default y
-	help
-	  Don't use IOMMU code. This will cause problems when you have more than 4GB
-	  of memory and any 32-bit devices. Don't turn on unless you know what you
-	  are doing.
+	depends on GART_IOMMU
 
 config X86_MCE
 	bool "Machine check support" if EMBEDDED
@@ -374,16 +410,13 @@ config X86_MCE_INTEL
 	  Additional support for intel specific MCE features such as
 	  the thermal monitor.
 
-config PHYSICAL_START
-	hex "Physical address where the kernel is loaded" if EMBEDDED
-	default "0x100000"
+config X86_MCE_AMD
+	bool "AMD MCE features"
+	depends on X86_MCE && X86_LOCAL_APIC
+	default y
 	help
-	  This gives the physical address where the kernel is loaded.
-	  Primarily used in the case of kexec on panic where the
-	  fail safe kernel needs to run at a different address than
-	  the panic-ed kernel.
-
-	  Don't change this unless you know what you are doing.
+	  Additional support for AMD specific MCE features such as
+	  the DRAM Error Threshold.
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
@@ -402,6 +435,31 @@ config KEXEC
 	  support. As of this writing the exact hardware interface is
 	  strongly in flux, so no good recommendation can be made.
 
+config CRASH_DUMP
+	bool "kernel crash dumps (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  Generate crash dump after being started by kexec.
+
+config PHYSICAL_START
+	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
+	default "0x1000000" if CRASH_DUMP
+	default "0x100000"
+	help
+	  This gives the physical address where the kernel is loaded. Normally
+	  for regular kernels this value is 0x100000 (1MB). But in the case
+	  of kexec on panic the fail safe kernel needs to run at a different
+	  address than the panic-ed kernel. This option is used to set the load
+	  address for kernels used to capture crash dump on being kexec'ed
+	  after panic. The default value for crash dump kernels is
+	  0x1000000 (16MB). This can also be set based on the "X" value as
+	  specified in the "crashkernel=YM@XM" command line boot parameter
+	  passed to the panic-ed kernel. Typically this parameter is set as
+	  crashkernel=64M@16M. Please take a look at
+	  Documentation/kdump/kdump.txt for more details about crash dumps.
+
+	  Don't change this unless you know what you are doing.
+
 config SECCOMP
 	bool "Enable seccomp to safely compute untrusted bytecode"
 	depends on PROC_FS
@@ -502,7 +560,7 @@ config IA32_EMULATION
 	  left.
 
 config IA32_AOUT
-	bool "IA32 a.out support"
+	tristate "IA32 a.out support"
 	depends on IA32_EMULATION
 	help
 	  Support old a.out binaries in the 32bit emulation.
@@ -517,11 +575,6 @@ config SYSVIPC_COMPAT
 	depends on COMPAT && SYSVIPC
 	default y
 
-config UID16
-	bool
-	depends on IA32_EMULATION
-	default y
-
 endmenu
 
 source "net/Kconfig"
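The new CRASH_DUMP/PHYSICAL_START help text above ties the capture kernel's load address to the region the primary kernel reserves via crashkernel=YM@XM (typically crashkernel=64M@16M, matching the 0x1000000 default). The "Crash kernel" name below is the resource the kernel registers in /proc/iomem; the rest of this small user-space program is only an illustrative sketch of how one might confirm the reservation took effect.

	#include <stdio.h>
	#include <string.h>

	/* Illustrative check: scan /proc/iomem for the "Crash kernel"
	 * region that a crashkernel=64M@16M boot parameter should have
	 * reserved for the kdump capture kernel. */
	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/iomem", "r");

		if (!f) {
			perror("/proc/iomem");
			return 1;
		}
		while (fgets(line, sizeof(line), f)) {
			if (strstr(line, "Crash kernel")) {
				/* e.g. "01000000-04ffffff : Crash kernel" */
				printf("reserved: %s", line);
				fclose(f);
				return 0;
			}
		}
		fclose(f);
		fprintf(stderr, "no crash kernel region reserved\n");
		return 1;
	}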
diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug
index d584ecc27ea1..fcb06a50fdd2 100644
--- a/arch/x86_64/Kconfig.debug
+++ b/arch/x86_64/Kconfig.debug
@@ -2,15 +2,6 @@ menu "Kernel hacking"
 
 source "lib/Kconfig.debug"
 
-# !SMP for now because the context switch early causes GPF in segment reloading
-# and the GS base checking does the wrong thing then, causing a hang.
-config CHECKING
-	bool "Additional run-time checks"
-	depends on DEBUG_KERNEL && !SMP
-	help
-	  Enables some internal consistency checks for kernel debugging.
-	  You should normally say N.
-
 config INIT_DEBUG
 	bool "Debug __init statements"
 	depends on DEBUG_KERNEL
@@ -18,6 +9,16 @@ config INIT_DEBUG
 	  Fill __init and __initdata at the end of boot. This helps debugging
 	  illegal uses of __init and __initdata after initialization.
 
+config DEBUG_RODATA
+	bool "Write protect kernel read-only data structures"
+	depends on DEBUG_KERNEL
+	help
+	  Mark the kernel read-only data as write-protected in the pagetables,
+	  in order to catch accidental (and incorrect) writes to such const data.
+	  This option may have a slight performance impact because a portion
+	  of the kernel code won't be covered by a 2MB TLB anymore.
+	  If in doubt, say "N".
+
 config IOMMU_DEBUG
 	depends on GART_IOMMU && DEBUG_KERNEL
 	bool "Enable IOMMU debugging"
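DEBUG_RODATA, added above, write-protects the kernel's const data in the pagetables so that a stray store faults immediately instead of silently corrupting state. The standalone sketch below shows the class of bug it catches; in user space the same pattern faults because .rodata is mapped read-only, which is exactly the behavior the option gives the kernel's own read-only data.

	#include <stdio.h>

	/* The compiler places this in .rodata, which the loader maps
	 * read-only. Casting away const silences the compiler but the
	 * write still traps, just as a kernel write would with
	 * DEBUG_RODATA enabled. */
	static const int table[4] = { 1, 2, 3, 4 };

	int main(void)
	{
		int *p = (int *)table;	/* cast away const */

		*p = 42;		/* faults here (SIGSEGV) instead of
					 * silently corrupting table[0] */
		printf("%d\n", table[0]);	/* never reached */
		return 0;
	}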
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index a9cd42e61828..d7fd46479c55 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -31,6 +31,7 @@ cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
 cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
 CFLAGS += $(cflags-y)
 
+CFLAGS += -m64
 CFLAGS += -mno-red-zone
 CFLAGS += -mcmodel=kernel
 CFLAGS += -pipe
@@ -38,8 +39,10 @@ CFLAGS += -pipe
 # actually it makes the kernel smaller too.
 CFLAGS += -fno-reorder-blocks
 CFLAGS += -Wno-sign-compare
-ifneq ($(CONFIG_DEBUG_INFO),y)
+ifneq ($(CONFIG_UNWIND_INFO),y)
 CFLAGS += -fno-asynchronous-unwind-tables
+endif
+ifneq ($(CONFIG_DEBUG_INFO),y)
 # -fweb shrinks the kernel a bit, but the difference is very small
 # it also messes up debugging, so don't use it for now.
 #CFLAGS += $(call cc-option,-fweb)
@@ -50,6 +53,8 @@ CFLAGS += $(call cc-option,-funit-at-a-time)
 # prevent gcc from generating any FP code by mistake
 CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
 
+AFLAGS += -m64
+
 head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o
 
 libs-y += arch/x86_64/lib/
@@ -80,9 +85,12 @@ bzlilo: vmlinux
 bzdisk: vmlinux
 	$(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zdisk
 
-install fdimage fdimage144 fdimage288: vmlinux
+fdimage fdimage144 fdimage288: vmlinux
 	$(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@
 
+install:
+	$(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@
+
 archclean:
 	$(Q)$(MAKE) $(clean)=$(boot)
 
diff --git a/arch/x86_64/boot/.gitignore b/arch/x86_64/boot/.gitignore
new file mode 100644
index 000000000000..495f20c085de
--- /dev/null
+++ b/arch/x86_64/boot/.gitignore
@@ -0,0 +1,3 @@
+bootsect
+bzImage
+setup
diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile
index 18c6e915d69b..29f8396ed151 100644
--- a/arch/x86_64/boot/Makefile
+++ b/arch/x86_64/boot/Makefile
@@ -98,5 +98,5 @@ zlilo: $(BOOTIMAGE)
 	cp System.map $(INSTALL_PATH)/
 	if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
 
-install: $(BOOTIMAGE)
+install:
 	sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)"
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c
index 0e10fd84c7cc..cf4b88c416dc 100644
--- a/arch/x86_64/boot/compressed/misc.c
+++ b/arch/x86_64/boot/compressed/misc.c
@@ -9,7 +9,7 @@
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
 
-#include "miscsetup.h"
+#include <linux/screen_info.h>
 #include <asm/io.h>
 #include <asm/page.h>
 
diff --git a/arch/x86_64/boot/compressed/miscsetup.h b/arch/x86_64/boot/compressed/miscsetup.h
deleted file mode 100644
index bb1620531703..000000000000
--- a/arch/x86_64/boot/compressed/miscsetup.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#define NULL 0
-//typedef unsigned int size_t;
-
-
-struct screen_info {
-	unsigned char  orig_x;			/* 0x00 */
-	unsigned char  orig_y;			/* 0x01 */
-	unsigned short dontuse1;		/* 0x02 -- EXT_MEM_K sits here */
-	unsigned short orig_video_page;		/* 0x04 */
-	unsigned char  orig_video_mode;		/* 0x06 */
-	unsigned char  orig_video_cols;		/* 0x07 */
-	unsigned short unused2;			/* 0x08 */
-	unsigned short orig_video_ega_bx;	/* 0x0a */
-	unsigned short unused3;			/* 0x0c */
-	unsigned char  orig_video_lines;	/* 0x0e */
-	unsigned char  orig_video_isVGA;	/* 0x0f */
-	unsigned short orig_video_points;	/* 0x10 */
-
-	/* VESA graphic mode -- linear frame buffer */
-	unsigned short lfb_width;		/* 0x12 */
-	unsigned short lfb_height;		/* 0x14 */
-	unsigned short lfb_depth;		/* 0x16 */
-	unsigned long  lfb_base;		/* 0x18 */
-	unsigned long  lfb_size;		/* 0x1c */
-	unsigned short dontuse2, dontuse3;	/* 0x20 -- CL_MAGIC and CL_OFFSET here */
-	unsigned short lfb_linelength;		/* 0x24 */
-	unsigned char  red_size;		/* 0x26 */
-	unsigned char  red_pos;			/* 0x27 */
-	unsigned char  green_size;		/* 0x28 */
-	unsigned char  green_pos;		/* 0x29 */
-	unsigned char  blue_size;		/* 0x2a */
-	unsigned char  blue_pos;		/* 0x2b */
-	unsigned char  rsvd_size;		/* 0x2c */
-	unsigned char  rsvd_pos;		/* 0x2d */
-	unsigned short vesapm_seg;		/* 0x2e */
-	unsigned short vesapm_off;		/* 0x30 */
-	unsigned short pages;			/* 0x32 */
-						/* 0x34 -- 0x3f reserved for future expansion */
-};
diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh
index 198af15a7758..baaa2369bdb8 100644
--- a/arch/x86_64/boot/install.sh
+++ b/arch/x86_64/boot/install.sh
@@ -1,40 +1,2 @@
 #!/bin/sh
-#
-# arch/x86_64/boot/install.sh
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 1995 by Linus Torvalds
-#
-# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin
-#
-# "make install" script for i386 architecture
-#
-# Arguments:
-#   $1 - kernel version
-#   $2 - kernel image file
-#   $3 - kernel map file
-#   $4 - default install path (blank if root directory)
-#
-
-# User may have a custom install script
-
-if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi
-if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi
-
-# Default install - same as make zlilo
-
-if [ -f $4/vmlinuz ]; then
-	mv $4/vmlinuz $4/vmlinuz.old
-fi
-
-if [ -f $4/System.map ]; then
-	mv $4/System.map $4/System.old
-fi
-
-cat $2 > $4/vmlinuz
-cp $3 $4/System.map
-
-if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
+. $srctree/arch/i386/boot/install.sh
diff --git a/arch/x86_64/boot/tools/.gitignore b/arch/x86_64/boot/tools/.gitignore
new file mode 100644
index 000000000000..378eac25d311
--- /dev/null
+++ b/arch/x86_64/boot/tools/.gitignore
@@ -0,0 +1 @@
+build
diff --git a/arch/x86_64/crypto/aes.c b/arch/x86_64/crypto/aes.c
index acfdaa28791e..fb1b961a2e2f 100644
--- a/arch/x86_64/crypto/aes.c
+++ b/arch/x86_64/crypto/aes.c
@@ -74,8 +74,6 @@ static inline u8 byte(const u32 x, const unsigned n)
 	return x >> (n << 3);
 }
 
-#define u32_in(x) le32_to_cpu(*(const __le32 *)(x))
-
 struct aes_ctx
 {
 	u32 key_length;
@@ -234,6 +232,7 @@ static int aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len,
 			  u32 *flags)
 {
 	struct aes_ctx *ctx = ctx_arg;
+	const __le32 *key = (const __le32 *)in_key;
 	u32 i, j, t, u, v, w;
 
 	if (key_len != 16 && key_len != 24 && key_len != 32) {
@@ -243,10 +242,10 @@ static int aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len,
 
 	ctx->key_length = key_len;
 
-	D_KEY[key_len + 24] = E_KEY[0] = u32_in(in_key);
-	D_KEY[key_len + 25] = E_KEY[1] = u32_in(in_key + 4);
-	D_KEY[key_len + 26] = E_KEY[2] = u32_in(in_key + 8);
-	D_KEY[key_len + 27] = E_KEY[3] = u32_in(in_key + 12);
+	D_KEY[key_len + 24] = E_KEY[0] = le32_to_cpu(key[0]);
+	D_KEY[key_len + 25] = E_KEY[1] = le32_to_cpu(key[1]);
+	D_KEY[key_len + 26] = E_KEY[2] = le32_to_cpu(key[2]);
+	D_KEY[key_len + 27] = E_KEY[3] = le32_to_cpu(key[3]);
 
 	switch (key_len) {
 	case 16:
@@ -256,17 +255,17 @@ static int aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len,
 		break;
 
 	case 24:
-		E_KEY[4] = u32_in(in_key + 16);
-		t = E_KEY[5] = u32_in(in_key + 20);
+		E_KEY[4] = le32_to_cpu(key[4]);
+		t = E_KEY[5] = le32_to_cpu(key[5]);
 		for (i = 0; i < 8; ++i)
 			loop6 (i);
 		break;
 
 	case 32:
-		E_KEY[4] = u32_in(in_key + 16);
-		E_KEY[5] = u32_in(in_key + 20);
-		E_KEY[6] = u32_in(in_key + 24);
-		t = E_KEY[7] = u32_in(in_key + 28);
+		E_KEY[4] = le32_to_cpu(key[4]);
+		E_KEY[5] = le32_to_cpu(key[5]);
+		E_KEY[6] = le32_to_cpu(key[6]);
+		t = E_KEY[7] = le32_to_cpu(key[7]);
 		for (i = 0; i < 7; ++i)
 			loop8(i);
 		break;
@@ -290,6 +289,8 @@ extern void aes_decrypt(void *ctx_arg, u8 *out, const u8 *in);
 
 static struct crypto_alg aes_alg = {
 	.cra_name	=	"aes",
+	.cra_driver_name =	"aes-x86_64",
+	.cra_priority	=	200,
 	.cra_flags	=	CRYPTO_ALG_TYPE_CIPHER,
 	.cra_blocksize	=	AES_BLOCK_SIZE,
 	.cra_ctxsize	=	sizeof(struct aes_ctx),
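The aes.c change above swaps the u32_in() macro for a typed __le32 pointer plus le32_to_cpu(). The loaded key words are identical either way; the point of the change is that the __le32 annotation lets sparse verify endianness handling. The user-space sketch below mimics le32_to_cpu() with plain shifts (the kernel types and helpers are only imitated here) and shows the words both idioms produce from the same key bytes.

	#include <stdint.h>
	#include <stdio.h>

	/* Stand-in for the kernel's le32_to_cpu(): assemble a 32-bit
	 * little-endian word from bytes, independent of host order. */
	static uint32_t le32_sketch(const uint8_t *p)
	{
		return (uint32_t)p[0] | (uint32_t)p[1] << 8 |
		       (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
	}

	int main(void)
	{
		/* FIPS-197 example key bytes, just as sample data. */
		static const uint8_t in_key[16] = {
			0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6,
			0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c,
		};
		unsigned i;

		/* Both the removed u32_in(in_key + 4*i) and the new
		 * le32_to_cpu(key[i]) yield exactly these values. */
		for (i = 0; i < 4; i++)
			printf("E_KEY[%u] = %08x\n", i,
			       le32_sketch(in_key + 4 * i));
		return 0;
	}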
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index f8db7e500fbf..09a3eb743315 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.13-git11
-# Mon Sep 12 16:16:16 2005
+# Linux kernel version: 2.6.15-git12
+# Mon Jan 16 13:09:08 2006
 #
 CONFIG_X86_64=y
 CONFIG_64BIT=y
@@ -15,6 +15,7 @@ CONFIG_EARLY_PRINTK=y
 CONFIG_GENERIC_ISA_DMA=y
 CONFIG_GENERIC_IOMAP=y
 CONFIG_ARCH_MAY_HAVE_PC_FDC=y
+CONFIG_DMI=y
 
 #
 # Code maturity level options
@@ -35,18 +36,21 @@ CONFIG_POSIX_MQUEUE=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 CONFIG_SYSCTL=y
 # CONFIG_AUDIT is not set
-# CONFIG_HOTPLUG is not set
-CONFIG_KOBJECT_UEVENT=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 # CONFIG_CPUSETS is not set
 CONFIG_INITRAMFS_SOURCE=""
+CONFIG_UID16=y
+CONFIG_VM86=y
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 # CONFIG_EMBEDDED is not set
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
+CONFIG_ELF_CORE=y
 CONFIG_BASE_FULL=y
 CONFIG_FUTEX=y
 CONFIG_EPOLL=y
@@ -55,8 +59,10 @@ CONFIG_CC_ALIGN_FUNCTIONS=0
 CONFIG_CC_ALIGN_LABELS=0
 CONFIG_CC_ALIGN_LOOPS=0
 CONFIG_CC_ALIGN_JUMPS=0
+CONFIG_SLAB=y
 # CONFIG_TINY_SHMEM is not set
 CONFIG_BASE_SMALL=0
+# CONFIG_SLOB is not set
 
 #
 # Loadable module support
@@ -71,8 +77,28 @@ CONFIG_OBSOLETE_MODPARM=y
 CONFIG_STOP_MACHINE=y
 
 #
+# Block layer
+#
+CONFIG_LBD=y
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+# CONFIG_IOSCHED_AS is not set
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+# CONFIG_DEFAULT_AS is not set
+# CONFIG_DEFAULT_DEADLINE is not set
+CONFIG_DEFAULT_CFQ=y
+# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_IOSCHED="cfq"
+
+#
 # Processor type and features
 #
+CONFIG_X86_PC=y
+# CONFIG_X86_VSMP is not set
 # CONFIG_MK8 is not set
 # CONFIG_MPSC is not set
 CONFIG_GENERIC_CPU=y
@@ -89,14 +115,15 @@ CONFIG_X86_LOCAL_APIC=y
 CONFIG_MTRR=y
 CONFIG_SMP=y
 CONFIG_SCHED_SMT=y
-CONFIG_PREEMPT_NONE=y
-# CONFIG_PREEMPT_VOLUNTARY is not set
+# CONFIG_PREEMPT_NONE is not set
+CONFIG_PREEMPT_VOLUNTARY=y
 # CONFIG_PREEMPT is not set
 CONFIG_PREEMPT_BKL=y
+CONFIG_NUMA=y
 CONFIG_K8_NUMA=y
-# CONFIG_NUMA_EMU is not set
+CONFIG_X86_64_ACPI_NUMA=y
+CONFIG_NUMA_EMU=y
 CONFIG_ARCH_DISCONTIGMEM_ENABLE=y
-CONFIG_NUMA=y
 CONFIG_ARCH_DISCONTIGMEM_DEFAULT=y
 CONFIG_ARCH_SPARSEMEM_ENABLE=y
 CONFIG_SELECT_MEMORY_MODEL=y
@@ -107,9 +134,11 @@ CONFIG_DISCONTIGMEM=y
 CONFIG_FLAT_NODE_MEM_MAP=y
 CONFIG_NEED_MULTIPLE_NODES=y
 # CONFIG_SPARSEMEM_STATIC is not set
+CONFIG_SPLIT_PTLOCK_CPUS=4
+CONFIG_MIGRATION=y
 CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
-CONFIG_HAVE_DEC_LOCK=y
 CONFIG_NR_CPUS=32
+CONFIG_HOTPLUG_CPU=y
 CONFIG_HPET_TIMER=y
 CONFIG_X86_PM_TIMER=y
 CONFIG_HPET_EMULATE_RTC=y
@@ -117,8 +146,10 @@ CONFIG_GART_IOMMU=y
 CONFIG_SWIOTLB=y
 CONFIG_X86_MCE=y
 CONFIG_X86_MCE_INTEL=y
-CONFIG_PHYSICAL_START=0x100000
+CONFIG_X86_MCE_AMD=y
 # CONFIG_KEXEC is not set
+# CONFIG_CRASH_DUMP is not set
+CONFIG_PHYSICAL_START=0x100000
 CONFIG_SECCOMP=y
 # CONFIG_HZ_100 is not set
 CONFIG_HZ_250=y
@@ -133,21 +164,27 @@ CONFIG_GENERIC_PENDING_IRQ=y
 # Power management options
 #
 CONFIG_PM=y
+# CONFIG_PM_LEGACY is not set
 # CONFIG_PM_DEBUG is not set
 CONFIG_SOFTWARE_SUSPEND=y
 CONFIG_PM_STD_PARTITION=""
+CONFIG_SUSPEND_SMP=y
 
 #
 # ACPI (Advanced Configuration and Power Interface) Support
 #
 CONFIG_ACPI=y
+CONFIG_ACPI_SLEEP=y
+CONFIG_ACPI_SLEEP_PROC_FS=y
+CONFIG_ACPI_SLEEP_PROC_SLEEP=y
 CONFIG_ACPI_AC=y
 CONFIG_ACPI_BATTERY=y
 CONFIG_ACPI_BUTTON=y
 # CONFIG_ACPI_VIDEO is not set
-CONFIG_ACPI_HOTKEY=m
+# CONFIG_ACPI_HOTKEY is not set
 CONFIG_ACPI_FAN=y
 CONFIG_ACPI_PROCESSOR=y
+CONFIG_ACPI_HOTPLUG_CPU=y
 CONFIG_ACPI_THERMAL=y
 CONFIG_ACPI_NUMA=y
 # CONFIG_ACPI_ASUS is not set
@@ -158,7 +195,7 @@ CONFIG_ACPI_BLACKLIST_YEAR=2001
 CONFIG_ACPI_EC=y
 CONFIG_ACPI_POWER=y
 CONFIG_ACPI_SYSTEM=y
-# CONFIG_ACPI_CONTAINER is not set
+CONFIG_ACPI_CONTAINER=y
 
 #
 # CPU Frequency scaling
@@ -197,7 +234,7 @@ CONFIG_PCI=y
 CONFIG_PCI_DIRECT=y
 CONFIG_PCI_MMCONFIG=y
 CONFIG_UNORDERED_IO=y
-# CONFIG_PCIEPORTBUS is not set
+CONFIG_PCIEPORTBUS=y
 CONFIG_PCI_MSI=y
 # CONFIG_PCI_LEGACY_PROC is not set
 # CONFIG_PCI_DEBUG is not set
@@ -221,7 +258,6 @@ CONFIG_IA32_EMULATION=y
 CONFIG_IA32_AOUT=y
 CONFIG_COMPAT=y
 CONFIG_SYSVIPC_COMPAT=y
-CONFIG_UID16=y
 
 #
 # Networking
@@ -283,17 +319,24 @@ CONFIG_IPV6=y
 # CONFIG_ATALK is not set
 # CONFIG_X25 is not set
 # CONFIG_LAPB is not set
+
+#
+# TIPC Configuration (EXPERIMENTAL)
+#
+# CONFIG_TIPC is not set
 # CONFIG_NET_DIVERT is not set
 # CONFIG_ECONET is not set
 # CONFIG_WAN_ROUTER is not set
+
+#
+# QoS and/or fair queueing
+#
 # CONFIG_NET_SCHED is not set
-# CONFIG_NET_CLS_ROUTE is not set
 
 #
 # Network testing
 #
 # CONFIG_NET_PKTGEN is not set
-# CONFIG_NETFILTER_NETLINK is not set
 # CONFIG_HAMRADIO is not set
 # CONFIG_IRDA is not set
 # CONFIG_BT is not set
@@ -308,10 +351,15 @@ CONFIG_IPV6=y
 #
 CONFIG_STANDALONE=y
 CONFIG_PREVENT_FIRMWARE_BUILD=y
-# CONFIG_FW_LOADER is not set
+CONFIG_FW_LOADER=y
 # CONFIG_DEBUG_DRIVER is not set
 
 #
+# Connector - unified userspace <-> kernelspace linker
+#
+# CONFIG_CONNECTOR is not set
+
+#
 # Memory Technology Devices (MTD)
 #
 # CONFIG_MTD is not set
@@ -344,16 +392,7 @@ CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_COUNT=16
 CONFIG_BLK_DEV_RAM_SIZE=4096
 CONFIG_BLK_DEV_INITRD=y
-CONFIG_LBD=y
 # CONFIG_CDROM_PKTCDVD is not set
-
-#
-# IO Schedulers
-#
-CONFIG_IOSCHED_NOOP=y
-# CONFIG_IOSCHED_AS is not set
-CONFIG_IOSCHED_DEADLINE=y
-CONFIG_IOSCHED_CFQ=y
 # CONFIG_ATA_OVER_ETH is not set
 
 #
@@ -393,7 +432,7 @@ CONFIG_IDEDMA_PCI_AUTO=y
 # CONFIG_BLK_DEV_AEC62XX is not set
 # CONFIG_BLK_DEV_ALI15X3 is not set
 CONFIG_BLK_DEV_AMD74XX=y
-# CONFIG_BLK_DEV_ATIIXP is not set
+CONFIG_BLK_DEV_ATIIXP=y
 # CONFIG_BLK_DEV_CMD64X is not set
 # CONFIG_BLK_DEV_TRIFLEX is not set
 # CONFIG_BLK_DEV_CY82C693 is not set
@@ -441,19 +480,21 @@ CONFIG_BLK_DEV_SD=y
 # Some SCSI devices (e.g. CD jukebox) support multiple LUNs
 #
 # CONFIG_SCSI_MULTI_LUN is not set
-# CONFIG_SCSI_CONSTANTS is not set
+CONFIG_SCSI_CONSTANTS=y
 # CONFIG_SCSI_LOGGING is not set
 
 #
 # SCSI Transport Attributes
 #
 CONFIG_SCSI_SPI_ATTRS=y
-# CONFIG_SCSI_FC_ATTRS is not set
+CONFIG_SCSI_FC_ATTRS=y
 # CONFIG_SCSI_ISCSI_ATTRS is not set
+# CONFIG_SCSI_SAS_ATTRS is not set
 
 #
 # SCSI low-level drivers
 #
+# CONFIG_ISCSI_TCP is not set
 # CONFIG_BLK_DEV_3W_XXXX_RAID is not set
 # CONFIG_SCSI_3W_9XXX is not set
 # CONFIG_SCSI_ACARD is not set
@@ -467,22 +508,28 @@ CONFIG_AIC79XX_RESET_DELAY_MS=4000
 # CONFIG_AIC79XX_DEBUG_ENABLE is not set
 CONFIG_AIC79XX_DEBUG_MASK=0
 # CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
-# CONFIG_MEGARAID_NEWGEN is not set
+CONFIG_MEGARAID_NEWGEN=y
+CONFIG_MEGARAID_MM=y
+CONFIG_MEGARAID_MAILBOX=y
 # CONFIG_MEGARAID_LEGACY is not set
+CONFIG_MEGARAID_SAS=y
 CONFIG_SCSI_SATA=y
-# CONFIG_SCSI_SATA_AHCI is not set
+CONFIG_SCSI_SATA_AHCI=y
 # CONFIG_SCSI_SATA_SVW is not set
 CONFIG_SCSI_ATA_PIIX=y
 # CONFIG_SCSI_SATA_MV is not set
-# CONFIG_SCSI_SATA_NV is not set
-# CONFIG_SCSI_SATA_PROMISE is not set
+CONFIG_SCSI_SATA_NV=y
+# CONFIG_SCSI_PDC_ADMA is not set
 # CONFIG_SCSI_SATA_QSTOR is not set
+# CONFIG_SCSI_SATA_PROMISE is not set
 # CONFIG_SCSI_SATA_SX4 is not set
-# CONFIG_SCSI_SATA_SIL is not set
+CONFIG_SCSI_SATA_SIL=y
+# CONFIG_SCSI_SATA_SIL24 is not set
 # CONFIG_SCSI_SATA_SIS is not set
 # CONFIG_SCSI_SATA_ULI is not set
 CONFIG_SCSI_SATA_VIA=y
 # CONFIG_SCSI_SATA_VITESSE is not set
+CONFIG_SCSI_SATA_INTEL_COMBINED=y
 # CONFIG_SCSI_BUSLOGIC is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_EATA is not set
@@ -495,13 +542,7 @@ CONFIG_SCSI_SATA_VIA=y
 # CONFIG_SCSI_IPR is not set
 # CONFIG_SCSI_QLOGIC_FC is not set
 # CONFIG_SCSI_QLOGIC_1280 is not set
-CONFIG_SCSI_QLA2XXX=y
-# CONFIG_SCSI_QLA21XX is not set
-# CONFIG_SCSI_QLA22XX is not set
-# CONFIG_SCSI_QLA2300 is not set
-# CONFIG_SCSI_QLA2322 is not set
-# CONFIG_SCSI_QLA6312 is not set
-# CONFIG_SCSI_QLA24XX is not set
+# CONFIG_SCSI_QLA_FC is not set
 # CONFIG_SCSI_LPFC is not set
 # CONFIG_SCSI_DC395x is not set
 # CONFIG_SCSI_DC390T is not set
@@ -525,6 +566,7 @@ CONFIG_BLK_DEV_DM=y
 CONFIG_FUSION=y
 CONFIG_FUSION_SPI=y
 # CONFIG_FUSION_FC is not set
+# CONFIG_FUSION_SAS is not set
 CONFIG_FUSION_MAX_SGE=128
 # CONFIG_FUSION_CTL is not set
 
@@ -564,6 +606,7 @@ CONFIG_NET_ETHERNET=y
 CONFIG_MII=y
 # CONFIG_HAPPYMEAL is not set
 # CONFIG_SUNGEM is not set
+# CONFIG_CASSINI is not set
 CONFIG_NET_VENDOR_3COM=y
 CONFIG_VORTEX=y
 # CONFIG_TYPHOON is not set
@@ -603,12 +646,14 @@ CONFIG_8139TOO=y
 # CONFIG_DL2K is not set
 CONFIG_E1000=y
 # CONFIG_E1000_NAPI is not set
+# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
 # CONFIG_R8169 is not set
 # CONFIG_SIS190 is not set
 # CONFIG_SKGE is not set
+# CONFIG_SKY2 is not set
 # CONFIG_SK98LIN is not set
 # CONFIG_VIA_VELOCITY is not set
 CONFIG_TIGON3=y
@@ -621,7 +666,6 @@ CONFIG_TIGON3=y
 # CONFIG_IXGB is not set
 CONFIG_S2IO=m
 # CONFIG_S2IO_NAPI is not set
-# CONFIG_2BUFF_MODE is not set
 
 #
 # Token Ring devices
@@ -720,6 +764,7 @@ CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 # CONFIG_SERIAL_8250_ACPI is not set
 CONFIG_SERIAL_8250_NR_UARTS=4
+CONFIG_SERIAL_8250_RUNTIME_UARTS=4
 # CONFIG_SERIAL_8250_EXTENDED is not set
 
725# 770#
@@ -727,7 +772,6 @@ CONFIG_SERIAL_8250_NR_UARTS=4
727# 772#
728CONFIG_SERIAL_CORE=y 773CONFIG_SERIAL_CORE=y
729CONFIG_SERIAL_CORE_CONSOLE=y 774CONFIG_SERIAL_CORE_CONSOLE=y
730# CONFIG_SERIAL_JSM is not set
731CONFIG_UNIX98_PTYS=y 775CONFIG_UNIX98_PTYS=y
732CONFIG_LEGACY_PTYS=y 776CONFIG_LEGACY_PTYS=y
733CONFIG_LEGACY_PTY_COUNT=256 777CONFIG_LEGACY_PTY_COUNT=256
@@ -740,7 +784,44 @@ CONFIG_LEGACY_PTY_COUNT=256
 #
 # Watchdog Cards
 #
-# CONFIG_WATCHDOG is not set
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+
+#
+# Watchdog Device Drivers
+#
+CONFIG_SOFT_WATCHDOG=y
+# CONFIG_ACQUIRE_WDT is not set
+# CONFIG_ADVANTECH_WDT is not set
+# CONFIG_ALIM1535_WDT is not set
+# CONFIG_ALIM7101_WDT is not set
+# CONFIG_SC520_WDT is not set
+# CONFIG_EUROTECH_WDT is not set
+# CONFIG_IB700_WDT is not set
+# CONFIG_IBMASR is not set
+# CONFIG_WAFER_WDT is not set
+# CONFIG_I6300ESB_WDT is not set
+# CONFIG_I8XX_TCO is not set
+# CONFIG_SC1200_WDT is not set
+# CONFIG_60XX_WDT is not set
+# CONFIG_SBC8360_WDT is not set
+# CONFIG_CPU5_WDT is not set
+# CONFIG_W83627HF_WDT is not set
+# CONFIG_W83877F_WDT is not set
+# CONFIG_W83977F_WDT is not set
+# CONFIG_MACHZ_WDT is not set
+# CONFIG_SBC_EPX_C3_WATCHDOG is not set
+
+#
+# PCI-based Watchdog Cards
+#
+# CONFIG_PCIPCWATCHDOG is not set
+# CONFIG_WDTPCI is not set
+
+#
+# USB-based Watchdog Cards
+#
+# CONFIG_USBPCWATCHDOG is not set
 CONFIG_HW_RANDOM=y
 # CONFIG_NVRAM is not set
 CONFIG_RTC=y
@@ -757,16 +838,17 @@ CONFIG_AGP_INTEL=y
 # CONFIG_DRM is not set
 # CONFIG_MWAVE is not set
 CONFIG_RAW_DRIVER=y
+CONFIG_MAX_RAW_DEVS=256
 CONFIG_HPET=y
 # CONFIG_HPET_RTC_IRQ is not set
 CONFIG_HPET_MMAP=y
-CONFIG_MAX_RAW_DEVS=256
 # CONFIG_HANGCHECK_TIMER is not set
 
 #
 # TPM devices
 #
 # CONFIG_TCG_TPM is not set
+# CONFIG_TELCLOCK is not set
 
 #
 # I2C support
@@ -774,6 +856,12 @@ CONFIG_MAX_RAW_DEVS=256
 # CONFIG_I2C is not set
 
 #
+# SPI support
+#
+# CONFIG_SPI is not set
+# CONFIG_SPI_MASTER is not set
+
+#
 # Dallas's 1-wire bus
 #
 # CONFIG_W1 is not set
@@ -783,6 +871,7 @@ CONFIG_MAX_RAW_DEVS=256
 #
 CONFIG_HWMON=y
 # CONFIG_HWMON_VID is not set
+# CONFIG_SENSORS_HDAPS is not set
 # CONFIG_HWMON_DEBUG_CHIP is not set
 
 #
@@ -830,6 +919,7 @@ CONFIG_SOUND=y
 # Open Sound System
 #
 CONFIG_SOUND_PRIME=y
+CONFIG_OBSOLETE_OSS_DRIVER=y
 # CONFIG_SOUND_BT878 is not set
 # CONFIG_SOUND_CMPCI is not set
 # CONFIG_SOUND_EMU10K1 is not set
@@ -886,12 +976,15 @@ CONFIG_USB_UHCI_HCD=y
 # USB Device Class drivers
 #
 # CONFIG_OBSOLETE_OSS_USB_DRIVER is not set
-# CONFIG_USB_BLUETOOTH_TTY is not set
 # CONFIG_USB_ACM is not set
 CONFIG_USB_PRINTER=y
 
 #
-# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information
+# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
+#
+
+#
+# may also be needed; see USB_STORAGE Help for more information
 #
 CONFIG_USB_STORAGE=y
 # CONFIG_USB_STORAGE_DEBUG is not set
@@ -903,13 +996,15 @@ CONFIG_USB_STORAGE=y
 # CONFIG_USB_STORAGE_SDDR09 is not set
 # CONFIG_USB_STORAGE_SDDR55 is not set
 # CONFIG_USB_STORAGE_JUMPSHOT is not set
-# CONFIG_USB_STORAGE_ONETOUCH is not set
+# CONFIG_USB_STORAGE_ALAUDA is not set
+# CONFIG_USB_LIBUSUAL is not set
 
 #
 # USB Input Devices
 #
 CONFIG_USB_HID=y
 CONFIG_USB_HIDINPUT=y
+# CONFIG_USB_HIDINPUT_POWERBOOK is not set
 # CONFIG_HID_FF is not set
 # CONFIG_USB_HIDDEV is not set
 # CONFIG_USB_AIPTEK is not set
@@ -923,7 +1018,9 @@ CONFIG_USB_HIDINPUT=y
 # CONFIG_USB_YEALINK is not set
 # CONFIG_USB_XPAD is not set
 # CONFIG_USB_ATI_REMOTE is not set
+# CONFIG_USB_ATI_REMOTE2 is not set
 # CONFIG_USB_KEYSPAN_REMOTE is not set
+# CONFIG_USB_APPLETOUCH is not set
 
 #
 # USB Imaging devices
@@ -1005,7 +1102,7 @@ CONFIG_USB_MON=y
 #
 # CONFIG_EDD is not set
 # CONFIG_DELL_RBU is not set
-CONFIG_DCDBAS=m
+# CONFIG_DCDBAS is not set
 
 #
 # File systems
@@ -1031,13 +1128,14 @@ CONFIG_REISERFS_FS_POSIX_ACL=y
 # CONFIG_JFS_FS is not set
 CONFIG_FS_POSIX_ACL=y
 # CONFIG_XFS_FS is not set
+# CONFIG_OCFS2_FS is not set
 # CONFIG_MINIX_FS is not set
 # CONFIG_ROMFS_FS is not set
 CONFIG_INOTIFY=y
 # CONFIG_QUOTA is not set
 CONFIG_DNOTIFY=y
 CONFIG_AUTOFS_FS=y
-# CONFIG_AUTOFS4_FS is not set
+CONFIG_AUTOFS4_FS=y
 # CONFIG_FUSE_FS is not set
 
 #
@@ -1068,7 +1166,8 @@ CONFIG_TMPFS=y
 CONFIG_HUGETLBFS=y
 CONFIG_HUGETLB_PAGE=y
 CONFIG_RAMFS=y
-# CONFIG_RELAYFS_FS is not set
+CONFIG_RELAYFS_FS=y
+# CONFIG_CONFIGFS_FS is not set
 
 #
 # Miscellaneous filesystems
@@ -1166,30 +1265,35 @@ CONFIG_NLS_ISO8859_15=y
 CONFIG_NLS_UTF8=y
 
 #
-# Profiling support
+# Instrumentation Support
 #
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=y
+CONFIG_KPROBES=y
 
 #
 # Kernel hacking
 #
 # CONFIG_PRINTK_TIME is not set
-CONFIG_DEBUG_KERNEL=y
 CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_KERNEL=y
 CONFIG_LOG_BUF_SHIFT=18
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_MUTEXES is not set
 # CONFIG_DEBUG_SPINLOCK is not set
 # CONFIG_DEBUG_SPINLOCK_SLEEP is not set
 # CONFIG_DEBUG_KOBJECT is not set
 # CONFIG_DEBUG_INFO is not set
 CONFIG_DEBUG_FS=y
+# CONFIG_DEBUG_VM is not set
 # CONFIG_FRAME_POINTER is not set
+# CONFIG_FORCED_INLINING is not set
+# CONFIG_RCU_TORTURE_TEST is not set
 CONFIG_INIT_DEBUG=y
+# CONFIG_DEBUG_RODATA is not set
 # CONFIG_IOMMU_DEBUG is not set
-CONFIG_KPROBES=y
 
 #
 # Security options
diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile
index f76217d8f579..929e6b0771f8 100644
--- a/arch/x86_64/ia32/Makefile
+++ b/arch/x86_64/ia32/Makefile
@@ -2,9 +2,9 @@
 # Makefile for the ia32 kernel emulation subsystem.
 #
 
-obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_ioctl.o \
-	ia32_signal.o tls32.o \
-	ia32_binfmt.o fpu32.o ptrace32.o syscall32.o syscall32_syscall.o
+obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o tls32.o \
+	ia32_binfmt.o fpu32.o ptrace32.o syscall32.o syscall32_syscall.o \
+	mmap32.o
 
 sysv-$(CONFIG_SYSVIPC) := ipc32.o
 obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
@@ -29,4 +29,3 @@ $(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
 
 AFLAGS_vsyscall-sysenter.o = -m32
 AFLAGS_vsyscall-syscall.o = -m32
-CFLAGS_ia32_ioctl.o += -Ifs/
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index 93c60f4aa47a..3bf58af98936 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -36,9 +36,6 @@
 #undef WARN_OLD
 #undef CORE_DUMP /* probably broken */
 
-extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
-				unsigned long stack_top, int exec_stack);
-
 static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
 static int load_aout_library(struct file*);
 
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
index d9161e395978..572b3b28772d 100644
--- a/arch/x86_64/ia32/ia32_binfmt.c
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -197,8 +197,7 @@ static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *re
 
 static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs)
 {
-	struct pt_regs *pp = (struct pt_regs *)(t->thread.rsp0);
-	--pp;
+	struct pt_regs *pp = task_pt_regs(t);
 	ELF_CORE_COPY_REGS((*elfregs), pp);
 	/* fix wrong segments */
 	(*elfregs)[7] = t->thread.ds;
@@ -217,8 +216,7 @@ elf_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, elf_fpr
 	if (!tsk_used_math(tsk))
 		return 0;
 	if (!regs)
-		regs = (struct pt_regs *)tsk->thread.rsp0;
-	--regs;
+		regs = task_pt_regs(tsk);
 	if (tsk == current)
 		unlazy_fpu(tsk);
 	set_fs(KERNEL_DS);
@@ -234,7 +232,7 @@ elf_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, elf_fpr
 static inline int
 elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu)
 {
-	struct pt_regs *regs = ((struct pt_regs *)(t->thread.rsp0))-1;
+	struct pt_regs *regs = task_pt_regs(t);
 	if (!tsk_used_math(t))
 		return 0;
 	if (t == current)
@@ -295,8 +293,6 @@ int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int
 } while(0)
 
 
-#define elf_map elf32_map
-
 #include <linux/module.h>
 
 MODULE_DESCRIPTION("Binary format loader for compatibility with IA32 ELF binaries.");
@@ -335,7 +331,8 @@ static void elf32_init(struct pt_regs *regs)
 	me->thread.es = __USER_DS;
 }
 
-int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack)
+int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top,
+			 int executable_stack)
 {
 	unsigned long stack_base;
 	struct vm_area_struct *mpnt;
@@ -389,21 +386,7 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int exec
 
 	return 0;
 }
-
-static unsigned long
-elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type)
-{
-	unsigned long map_addr;
-	struct task_struct *me = current;
-
-	down_write(&me->mm->mmap_sem);
-	map_addr = do_mmap(filep, ELF_PAGESTART(addr),
-		eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr), prot,
-		type,
-		eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr));
-	up_write(&me->mm->mmap_sem);
-	return(map_addr);
-}
+EXPORT_SYMBOL(ia32_setup_arg_pages);
 
 #ifdef CONFIG_SYSCTL
 /* Register vsyscall32 into the ABI table */
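The three ia32_binfmt.c hunks above replace open-coded "rsp0 minus one pt_regs" arithmetic with the task_pt_regs() helper. The helper itself is a one-liner; the mock below restates the layout it relies on (the exact header it lived in, include/asm-x86_64/processor.h in trees of this era, is stated from memory, and the struct definitions are stand-ins for the demo).

	#include <stdio.h>

	/* Mock of the layout task_pt_regs() assumes: the user-mode
	 * register frame sits at the very top of the kernel stack, so
	 * the helper is just "rsp0 minus one struct pt_regs". */
	struct pt_regs { unsigned long regs[21]; };
	struct thread_struct { unsigned long rsp0; };
	struct task_struct { struct thread_struct thread; };

	#define task_pt_regs(tsk) \
		((struct pt_regs *)(tsk)->thread.rsp0 - 1)

	int main(void)
	{
		unsigned char stack[8192];
		struct task_struct t;

		t.thread.rsp0 = (unsigned long)(stack + sizeof(stack));
		printf("stack top:  %p\n", (void *)t.thread.rsp0);
		printf("pt_regs at: %p\n", (void *)task_pt_regs(&t));
		return 0;
	}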
diff --git a/arch/x86_64/ia32/ia32_ioctl.c b/arch/x86_64/ia32/ia32_ioctl.c
deleted file mode 100644
index e335bd0b637d..000000000000
--- a/arch/x86_64/ia32/ia32_ioctl.c
+++ /dev/null
@@ -1,79 +0,0 @@
1/* $Id: ia32_ioctl.c,v 1.25 2002/10/11 07:17:06 ak Exp $
2 * ioctl32.c: Conversion between 32bit and 64bit native ioctls.
3 *
4 * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com)
5 * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be)
6 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs
7 *
8 * These routines maintain argument size conversion between 32bit and 64bit
9 * ioctls.
10 */
11
12#define INCLUDES
13#include <linux/syscalls.h>
14#include "compat_ioctl.c"
15#include <asm/ia32.h>
16
17#define CODE
18#include "compat_ioctl.c"
19
20#define RTC_IRQP_READ32 _IOR('p', 0x0b, unsigned int) /* Read IRQ rate */
21#define RTC_IRQP_SET32 _IOW('p', 0x0c, unsigned int) /* Set IRQ rate */
22#define RTC_EPOCH_READ32 _IOR('p', 0x0d, unsigned) /* Read epoch */
23#define RTC_EPOCH_SET32 _IOW('p', 0x0e, unsigned) /* Set epoch */
24
25static int rtc32_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
26{
27 unsigned long val;
28 mm_segment_t oldfs = get_fs();
29 int ret;
30
31 switch (cmd) {
32 case RTC_IRQP_READ32:
33 set_fs(KERNEL_DS);
34 ret = sys_ioctl(fd, RTC_IRQP_READ, (unsigned long)&val);
35 set_fs(oldfs);
36 if (!ret)
37 ret = put_user(val, (unsigned int __user *) arg);
38 return ret;
39
40 case RTC_IRQP_SET32:
41 cmd = RTC_IRQP_SET;
42 break;
43
44 case RTC_EPOCH_READ32:
45 set_fs(KERNEL_DS);
46 ret = sys_ioctl(fd, RTC_EPOCH_READ, (unsigned long) &val);
47 set_fs(oldfs);
48 if (!ret)
49 ret = put_user(val, (unsigned int __user *) arg);
50 return ret;
51
52 case RTC_EPOCH_SET32:
53 cmd = RTC_EPOCH_SET;
54 break;
55 }
56 return sys_ioctl(fd,cmd,arg);
57}
58
59
60#define HANDLE_IOCTL(cmd,handler) { (cmd), (ioctl_trans_handler_t)(handler) },
61#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd,sys_ioctl)
62
63struct ioctl_trans ioctl_start[] = {
64#include <linux/compat_ioctl.h>
65#define DECLARES
66#include "compat_ioctl.c"
67
68/* And these ioctls need translation */
69/* realtime device */
70HANDLE_IOCTL(RTC_IRQP_READ, rtc32_ioctl)
71HANDLE_IOCTL(RTC_IRQP_READ32,rtc32_ioctl)
72HANDLE_IOCTL(RTC_IRQP_SET32, rtc32_ioctl)
73HANDLE_IOCTL(RTC_EPOCH_READ32, rtc32_ioctl)
74HANDLE_IOCTL(RTC_EPOCH_SET32, rtc32_ioctl)
75/* take care of sizeof(sizeof()) breakage */
76};
77
78int ioctl_table_size = ARRAY_SIZE(ioctl_start);
79
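
The file deleted above carried the IA32 ioctl argument translation layer, and its rtc32_ioctl() shows the core idiom. As a hedged sketch (helper name and framing are mine, not from this patch), a "read an unsigned long through the native ioctl" translation looks like this:

    /*
     * Sketch of the 32->64 ioctl translation idiom used by the deleted
     * rtc32_ioctl(): run the native ioctl against a kernel buffer under
     * set_fs(KERNEL_DS), then copy the result out to the 32-bit user
     * pointer with put_user(). Illustrative only.
     */
    static int compat_read_ulong(unsigned fd, unsigned native_cmd,
                                 unsigned int __user *uarg)
    {
            unsigned long val;
            mm_segment_t oldfs = get_fs();
            int ret;

            set_fs(KERNEL_DS);      /* let sys_ioctl accept &val */
            ret = sys_ioctl(fd, native_cmd, (unsigned long)&val);
            set_fs(oldfs);
            if (!ret)
                    ret = put_user(val, uarg);  /* narrow to 32 bits */
            return ret;
    }
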
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index 0903cc1faef2..e0a92439f634 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -353,7 +353,6 @@ ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __
353 struct pt_regs *regs, unsigned int mask) 353 struct pt_regs *regs, unsigned int mask)
354{ 354{
355 int tmp, err = 0; 355 int tmp, err = 0;
356 u32 eflags;
357 356
358 tmp = 0; 357 tmp = 0;
359 __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); 358 __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
@@ -378,10 +377,7 @@ ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __
378 err |= __put_user(current->thread.trap_no, &sc->trapno); 377 err |= __put_user(current->thread.trap_no, &sc->trapno);
379 err |= __put_user(current->thread.error_code, &sc->err); 378 err |= __put_user(current->thread.error_code, &sc->err);
380 err |= __put_user((u32)regs->rip, &sc->eip); 379 err |= __put_user((u32)regs->rip, &sc->eip);
381 eflags = regs->eflags; 380 err |= __put_user((u32)regs->eflags, &sc->eflags);
382 if (current->ptrace & PT_PTRACED)
383 eflags &= ~TF_MASK;
384 err |= __put_user((u32)eflags, &sc->eflags);
385 err |= __put_user((u32)regs->rsp, &sc->esp_at_signal); 381 err |= __put_user((u32)regs->rsp, &sc->esp_at_signal);
386 382
387 tmp = save_i387_ia32(current, fpstate, regs, 0); 383 tmp = save_i387_ia32(current, fpstate, regs, 0);
@@ -505,13 +501,9 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
505 regs->ss = __USER32_DS; 501 regs->ss = __USER32_DS;
506 502
507 set_fs(USER_DS); 503 set_fs(USER_DS);
508 if (regs->eflags & TF_MASK) { 504 regs->eflags &= ~TF_MASK;
509 if (current->ptrace & PT_PTRACED) { 505 if (test_thread_flag(TIF_SINGLESTEP))
510 ptrace_notify(SIGTRAP); 506 ptrace_notify(SIGTRAP);
511 } else {
512 regs->eflags &= ~TF_MASK;
513 }
514 }
515 507
516#if DEBUG_SIG 508#if DEBUG_SIG
517 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", 509 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
@@ -605,13 +597,9 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
605 regs->ss = __USER32_DS; 597 regs->ss = __USER32_DS;
606 598
607 set_fs(USER_DS); 599 set_fs(USER_DS);
608 if (regs->eflags & TF_MASK) { 600 regs->eflags &= ~TF_MASK;
609 if (current->ptrace & PT_PTRACED) { 601 if (test_thread_flag(TIF_SINGLESTEP))
610 ptrace_notify(SIGTRAP); 602 ptrace_notify(SIGTRAP);
611 } else {
612 regs->eflags &= ~TF_MASK;
613 }
614 }
615 603
616#if DEBUG_SIG 604#if DEBUG_SIG
617 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", 605 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
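
Both signal-frame paths above now handle single-stepping the same way: TF is cleared unconditionally before the handler runs, and the tracer notification is keyed off the TIF_SINGLESTEP thread flag rather than PT_PTRACED. An annotated restatement (the comments are mine; TIF_SINGLESTEP is presumably set by the ptrace single-step path):

    regs->eflags &= ~TF_MASK;               /* handler must not inherit TF */
    if (test_thread_flag(TIF_SINGLESTEP))   /* tracer asked for a step */
            ptrace_notify(SIGTRAP);         /* report before the handler runs */
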
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index e0eb0c712fe9..f05c2a802489 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -35,6 +35,18 @@
35 movq %rax,R8(%rsp) 35 movq %rax,R8(%rsp)
36 .endm 36 .endm
37 37
38 .macro CFI_STARTPROC32 simple
39 CFI_STARTPROC \simple
40 CFI_UNDEFINED r8
41 CFI_UNDEFINED r9
42 CFI_UNDEFINED r10
43 CFI_UNDEFINED r11
44 CFI_UNDEFINED r12
45 CFI_UNDEFINED r13
46 CFI_UNDEFINED r14
47 CFI_UNDEFINED r15
48 .endm
49
38/* 50/*
39 * 32bit SYSENTER instruction entry. 51 * 32bit SYSENTER instruction entry.
40 * 52 *
@@ -55,7 +67,7 @@
55 * with the int 0x80 path. 67 * with the int 0x80 path.
56 */ 68 */
57ENTRY(ia32_sysenter_target) 69ENTRY(ia32_sysenter_target)
58 CFI_STARTPROC simple 70 CFI_STARTPROC32 simple
59 CFI_DEF_CFA rsp,0 71 CFI_DEF_CFA rsp,0
60 CFI_REGISTER rsp,rbp 72 CFI_REGISTER rsp,rbp
61 swapgs 73 swapgs
@@ -92,6 +104,7 @@ ENTRY(ia32_sysenter_target)
92 .quad 1b,ia32_badarg 104 .quad 1b,ia32_badarg
93 .previous 105 .previous
94 GET_THREAD_INFO(%r10) 106 GET_THREAD_INFO(%r10)
107 orl $TS_COMPAT,threadinfo_status(%r10)
95 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) 108 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
96 CFI_REMEMBER_STATE 109 CFI_REMEMBER_STATE
97 jnz sysenter_tracesys 110 jnz sysenter_tracesys
@@ -105,6 +118,7 @@ sysenter_do_call:
105 cli 118 cli
106 testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) 119 testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
107 jnz int_ret_from_sys_call 120 jnz int_ret_from_sys_call
121 andl $~TS_COMPAT,threadinfo_status(%r10)
108 /* clear IF, that popfq doesn't enable interrupts early */ 122 /* clear IF, that popfq doesn't enable interrupts early */
109 andl $~0x200,EFLAGS-R11(%rsp) 123 andl $~0x200,EFLAGS-R11(%rsp)
110 RESTORE_ARGS 1,24,1,1,1,1 124 RESTORE_ARGS 1,24,1,1,1,1
@@ -161,7 +175,7 @@ sysenter_tracesys:
161 * with the int 0x80 path. 175 * with the int 0x80 path.
162 */ 176 */
163ENTRY(ia32_cstar_target) 177ENTRY(ia32_cstar_target)
164 CFI_STARTPROC simple 178 CFI_STARTPROC32 simple
165 CFI_DEF_CFA rsp,0 179 CFI_DEF_CFA rsp,0
166 CFI_REGISTER rip,rcx 180 CFI_REGISTER rip,rcx
167 /*CFI_REGISTER rflags,r11*/ 181 /*CFI_REGISTER rflags,r11*/
@@ -191,6 +205,7 @@ ENTRY(ia32_cstar_target)
191 .quad 1b,ia32_badarg 205 .quad 1b,ia32_badarg
192 .previous 206 .previous
193 GET_THREAD_INFO(%r10) 207 GET_THREAD_INFO(%r10)
208 orl $TS_COMPAT,threadinfo_status(%r10)
194 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) 209 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
195 CFI_REMEMBER_STATE 210 CFI_REMEMBER_STATE
196 jnz cstar_tracesys 211 jnz cstar_tracesys
@@ -204,6 +219,7 @@ cstar_do_call:
204 cli 219 cli
205 testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) 220 testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
206 jnz int_ret_from_sys_call 221 jnz int_ret_from_sys_call
222 andl $~TS_COMPAT,threadinfo_status(%r10)
207 RESTORE_ARGS 1,-ARG_SKIP,1,1,1 223 RESTORE_ARGS 1,-ARG_SKIP,1,1,1
208 movl RIP-ARGOFFSET(%rsp),%ecx 224 movl RIP-ARGOFFSET(%rsp),%ecx
209 CFI_REGISTER rip,rcx 225 CFI_REGISTER rip,rcx
@@ -276,6 +292,7 @@ ENTRY(ia32_syscall)
276 this could be a problem. */ 292 this could be a problem. */
277 SAVE_ARGS 0,0,1 293 SAVE_ARGS 0,0,1
278 GET_THREAD_INFO(%r10) 294 GET_THREAD_INFO(%r10)
295 orl $TS_COMPAT,threadinfo_status(%r10)
279 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) 296 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
280 jnz ia32_tracesys 297 jnz ia32_tracesys
281ia32_do_syscall: 298ia32_do_syscall:
@@ -318,7 +335,7 @@ quiet_ni_syscall:
318 jmp ia32_ptregs_common 335 jmp ia32_ptregs_common
319 .endm 336 .endm
320 337
321 CFI_STARTPROC 338 CFI_STARTPROC32
322 339
323 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi 340 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
324 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi 341 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
@@ -333,15 +350,26 @@ quiet_ni_syscall:
333 350
334ENTRY(ia32_ptregs_common) 351ENTRY(ia32_ptregs_common)
335 popq %r11 352 popq %r11
336 CFI_ADJUST_CFA_OFFSET -8 353 CFI_ENDPROC
337 CFI_REGISTER rip, r11 354 CFI_STARTPROC32 simple
355 CFI_DEF_CFA rsp,SS+8-ARGOFFSET
356 CFI_REL_OFFSET rax,RAX-ARGOFFSET
357 CFI_REL_OFFSET rcx,RCX-ARGOFFSET
358 CFI_REL_OFFSET rdx,RDX-ARGOFFSET
359 CFI_REL_OFFSET rsi,RSI-ARGOFFSET
360 CFI_REL_OFFSET rdi,RDI-ARGOFFSET
361 CFI_REL_OFFSET rip,RIP-ARGOFFSET
362/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/
363/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
364 CFI_REL_OFFSET rsp,RSP-ARGOFFSET
365/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
338 SAVE_REST 366 SAVE_REST
339 call *%rax 367 call *%rax
340 RESTORE_REST 368 RESTORE_REST
341 jmp ia32_sysret /* misbalances the return cache */ 369 jmp ia32_sysret /* misbalances the return cache */
342 CFI_ENDPROC 370 CFI_ENDPROC
343 371
344 .data 372 .section .rodata,"a"
345 .align 8 373 .align 8
346 .globl ia32_sys_call_table 374 .globl ia32_sys_call_table
347ia32_sys_call_table: 375ia32_sys_call_table:
@@ -608,7 +636,7 @@ ia32_sys_call_table:
608 .quad sys_epoll_wait 636 .quad sys_epoll_wait
609 .quad sys_remap_file_pages 637 .quad sys_remap_file_pages
610 .quad sys_set_tid_address 638 .quad sys_set_tid_address
611 .quad sys32_timer_create 639 .quad compat_sys_timer_create
612 .quad compat_sys_timer_settime /* 260 */ 640 .quad compat_sys_timer_settime /* 260 */
613 .quad compat_sys_timer_gettime 641 .quad compat_sys_timer_gettime
614 .quad sys_timer_getoverrun 642 .quad sys_timer_getoverrun
@@ -643,6 +671,20 @@ ia32_sys_call_table:
643 .quad sys_inotify_init 671 .quad sys_inotify_init
644 .quad sys_inotify_add_watch 672 .quad sys_inotify_add_watch
645 .quad sys_inotify_rm_watch 673 .quad sys_inotify_rm_watch
674 .quad sys_migrate_pages
675 .quad compat_sys_openat /* 295 */
676 .quad sys_mkdirat
677 .quad sys_mknodat
678 .quad sys_fchownat
679 .quad sys_futimesat
680 .quad compat_sys_newfstatat /* 300 */
681 .quad sys_unlinkat
682 .quad sys_renameat
683 .quad sys_linkat
684 .quad sys_symlinkat
685 .quad sys_readlinkat /* 305 */
686 .quad sys_fchmodat
687 .quad sys_faccessat
646ia32_syscall_end: 688ia32_syscall_end:
647 .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8 689 .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
648 .quad ni_syscall 690 .quad ni_syscall
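
The three entry stubs above now set TS_COMPAT in thread_info->status on entry and clear it on the fast exit path, so 64-bit code can tell that the current syscall arrived through a 32-bit entry point; the syscall table also moves from .data to .rodata, and new entries up through sys_faccessat are wired in. A minimal sketch of how such a test could look, assuming the usual thread_info accessors (this helper is an illustration, not part of the patch):

    /* sketch: is the current syscall a 32-bit (compat) one? */
    static inline int in_ia32_syscall(void)
    {
            return current_thread_info()->status & TS_COMPAT;
    }
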
diff --git a/arch/x86_64/ia32/mmap32.c b/arch/x86_64/ia32/mmap32.c
new file mode 100644
index 000000000000..079f4132575c
--- /dev/null
+++ b/arch/x86_64/ia32/mmap32.c
@@ -0,0 +1,78 @@
1/*
2 * linux/arch/x86_64/ia32/mm/mmap.c
3 *
4 * flexible mmap layout support
5 *
6 * Based on the i386 version which was
7 *
8 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
9 * All Rights Reserved.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 *
25 *
26 * Started by Ingo Molnar <mingo@elte.hu>
27 */
28
29#include <linux/personality.h>
30#include <linux/mm.h>
31#include <linux/random.h>
32
33/*
34 * Top of mmap area (just below the process stack).
35 *
36 * Leave at least a ~128 MB hole.
37 */
38#define MIN_GAP (128*1024*1024)
39#define MAX_GAP (TASK_SIZE/6*5)
40
41static inline unsigned long mmap_base(struct mm_struct *mm)
42{
43 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
44 unsigned long random_factor = 0;
45
46 if (current->flags & PF_RANDOMIZE)
47 random_factor = get_random_int() % (1024*1024);
48
49 if (gap < MIN_GAP)
50 gap = MIN_GAP;
51 else if (gap > MAX_GAP)
52 gap = MAX_GAP;
53
54 return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
55}
56
57/*
58 * This function, called very early during the creation of a new
59 * process VM image, sets up which VM layout function to use:
60 */
61void ia32_pick_mmap_layout(struct mm_struct *mm)
62{
63 /*
64 * Fall back to the standard layout if the personality
65 * bit is set, or if the expected stack growth is unlimited:
66 */
67 if (sysctl_legacy_va_layout ||
68 (current->personality & ADDR_COMPAT_LAYOUT) ||
69 current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
70 mm->mmap_base = TASK_UNMAPPED_BASE;
71 mm->get_unmapped_area = arch_get_unmapped_area;
72 mm->unmap_area = arch_unmap_area;
73 } else {
74 mm->mmap_base = mmap_base(mm);
75 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
76 mm->unmap_area = arch_unmap_area_topdown;
77 }
78}
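
To make the clamping in mmap_base() concrete, here is a standalone demo under assumed values (the 3 GB IA32 TASK_SIZE, the common 8 MB stack rlimit, randomization ignored); it is an illustration, not kernel code:

    #include <stdio.h>

    #define TASK_SIZE     0xC0000000UL              /* assumed IA32 value */
    #define PAGE_ALIGN(x) (((x) + 0xFFFUL) & ~0xFFFUL)
    #define MIN_GAP       (128UL * 1024 * 1024)
    #define MAX_GAP       (TASK_SIZE / 6 * 5)

    int main(void)
    {
            unsigned long gap = 8UL * 1024 * 1024;  /* RLIMIT_STACK = 8 MB */

            if (gap < MIN_GAP)
                    gap = MIN_GAP;                  /* clamped up to 128 MB */
            else if (gap > MAX_GAP)
                    gap = MAX_GAP;

            /* prints 0xb8000000: mmap grows down, ~128 MB below the stack */
            printf("mmap_base = %#lx\n", PAGE_ALIGN(TASK_SIZE - gap));
            return 0;
    }
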
diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c
index 2a925e2af390..23a4515a73b4 100644
--- a/arch/x86_64/ia32/ptrace32.c
+++ b/arch/x86_64/ia32/ptrace32.c
@@ -28,9 +28,12 @@
28#include <asm/i387.h> 28#include <asm/i387.h>
29#include <asm/fpu32.h> 29#include <asm/fpu32.h>
30 30
31/* determines which flags the user has access to. */ 31/*
32/* 1 = access 0 = no access */ 32 * Determines which flags the user has access to [1 = access, 0 = no access].
33#define FLAG_MASK 0x44dd5UL 33 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
34 * Also masks reserved bits (31-22, 15, 5, 3, 1).
35 */
36#define FLAG_MASK 0x54dd5UL
34 37
35#define R32(l,q) \ 38#define R32(l,q) \
36 case offsetof(struct user32, regs.l): stack[offsetof(struct pt_regs, q)/8] = val; break 39 case offsetof(struct user32, regs.l): stack[offsetof(struct pt_regs, q)/8] = val; break
@@ -38,7 +41,7 @@
38static int putreg32(struct task_struct *child, unsigned regno, u32 val) 41static int putreg32(struct task_struct *child, unsigned regno, u32 val)
39{ 42{
40 int i; 43 int i;
41 __u64 *stack = (__u64 *)(child->thread.rsp0 - sizeof(struct pt_regs)); 44 __u64 *stack = (__u64 *)task_pt_regs(child);
42 45
43 switch (regno) { 46 switch (regno) {
44 case offsetof(struct user32, regs.fs): 47 case offsetof(struct user32, regs.fs):
@@ -134,7 +137,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 val)
134 137
135static int getreg32(struct task_struct *child, unsigned regno, u32 *val) 138static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
136{ 139{
137 __u64 *stack = (__u64 *)(child->thread.rsp0 - sizeof(struct pt_regs)); 140 __u64 *stack = (__u64 *)task_pt_regs(child);
138 141
139 switch (regno) { 142 switch (regno) {
140 case offsetof(struct user32, regs.fs): 143 case offsetof(struct user32, regs.fs):
@@ -196,36 +199,6 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
196 199
197#undef R32 200#undef R32
198 201
199static struct task_struct *find_target(int request, int pid, int *err)
200{
201 struct task_struct *child;
202
203 *err = -EPERM;
204 if (pid == 1)
205 return NULL;
206
207 *err = -ESRCH;
208 read_lock(&tasklist_lock);
209 child = find_task_by_pid(pid);
210 if (child)
211 get_task_struct(child);
212 read_unlock(&tasklist_lock);
213 if (child) {
214 *err = -EPERM;
215 if (child->pid == 1)
216 goto out;
217 *err = ptrace_check_attach(child, request == PTRACE_KILL);
218 if (*err < 0)
219 goto out;
220 return child;
221 }
222 out:
223 if (child)
224 put_task_struct(child);
225 return NULL;
226
227}
228
229asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) 202asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
230{ 203{
231 struct task_struct *child; 204 struct task_struct *child;
@@ -254,11 +227,18 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
254 break; 227 break;
255 } 228 }
256 229
257 child = find_target(request, pid, &ret); 230 if (request == PTRACE_TRACEME)
258 if (!child) 231 return ptrace_traceme();
259 return ret; 232
233 child = ptrace_get_task_struct(pid);
234 if (IS_ERR(child))
235 return PTR_ERR(child);
260 236
261 childregs = (struct pt_regs *)(child->thread.rsp0 - sizeof(struct pt_regs)); 237 ret = ptrace_check_attach(child, request == PTRACE_KILL);
238 if (ret < 0)
239 goto out;
240
241 childregs = task_pt_regs(child);
262 242
263 switch (request) { 243 switch (request) {
264 case PTRACE_PEEKDATA: 244 case PTRACE_PEEKDATA:
@@ -373,6 +353,7 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
373 break; 353 break;
374 } 354 }
375 355
356 out:
376 put_task_struct(child); 357 put_task_struct(child);
377 return ret; 358 return ret;
378} 359}
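
On FLAG_MASK: the widened value 0x54dd5 permits CF(0), PF(2), AF(4), ZF(6), SF(7), TF(8), DF(10), OF(11), NT(14), RF(16) and AC(18); relative to the old 0x44dd5, the only newly writable bit is RF(16). The mask is applied in the usual putreg fashion, merging user-supplied bits into the preserved ones, roughly like this (a sketch of the pattern, not a quote from this file):

    /* keep protected bits, take only FLAG_MASK bits from the debugger */
    new_eflags = (user_val & FLAG_MASK) | (old_eflags & ~FLAG_MASK);
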
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index 5389df610e78..54481af5344a 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -969,25 +969,6 @@ long sys32_kill(int pid, int sig)
969 return sys_kill(pid, sig); 969 return sys_kill(pid, sig);
970} 970}
971 971
972extern asmlinkage long
973sys_timer_create(clockid_t which_clock,
974 struct sigevent __user *timer_event_spec,
975 timer_t __user * created_timer_id);
976
977long
978sys32_timer_create(u32 clock, struct compat_sigevent __user *se32, timer_t __user *timer_id)
979{
980 struct sigevent __user *p = NULL;
981 if (se32) {
982 struct sigevent se;
983 p = compat_alloc_user_space(sizeof(struct sigevent));
984 if (get_compat_sigevent(&se, se32) ||
985 copy_to_user(p, &se, sizeof(se)))
986 return -EFAULT;
987 }
988 return sys_timer_create(clock, p, timer_id);
989}
990
991long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, 972long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
992 __u32 len_low, __u32 len_high, int advice) 973 __u32 len_low, __u32 len_high, int advice)
993{ 974{
diff --git a/arch/x86_64/ia32/vsyscall-sigreturn.S b/arch/x86_64/ia32/vsyscall-sigreturn.S
index 8b5a4b060bb5..d90321fe9bba 100644
--- a/arch/x86_64/ia32/vsyscall-sigreturn.S
+++ b/arch/x86_64/ia32/vsyscall-sigreturn.S
@@ -7,6 +7,7 @@
7 * by doing ".balign 32" must match in both versions of the page. 7 * by doing ".balign 32" must match in both versions of the page.
8 */ 8 */
9 9
10 .code32
10 .section .text.sigreturn,"ax" 11 .section .text.sigreturn,"ax"
11 .balign 32 12 .balign 32
12 .globl __kernel_sigreturn 13 .globl __kernel_sigreturn
diff --git a/arch/x86_64/ia32/vsyscall-syscall.S b/arch/x86_64/ia32/vsyscall-syscall.S
index b024965bb689..cf9ef678de3e 100644
--- a/arch/x86_64/ia32/vsyscall-syscall.S
+++ b/arch/x86_64/ia32/vsyscall-syscall.S
@@ -6,6 +6,7 @@
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7#include <asm/segment.h> 7#include <asm/segment.h>
8 8
9 .code32
9 .text 10 .text
10 .section .text.vsyscall,"ax" 11 .section .text.vsyscall,"ax"
11 .globl __kernel_vsyscall 12 .globl __kernel_vsyscall
diff --git a/arch/x86_64/ia32/vsyscall-sysenter.S b/arch/x86_64/ia32/vsyscall-sysenter.S
index 71f3de586b56..ae056e553d13 100644
--- a/arch/x86_64/ia32/vsyscall-sysenter.S
+++ b/arch/x86_64/ia32/vsyscall-sysenter.S
@@ -5,6 +5,7 @@
5#include <asm/ia32_unistd.h> 5#include <asm/ia32_unistd.h>
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7 7
8 .code32
8 .text 9 .text
9 .section .text.vsyscall,"ax" 10 .section .text.vsyscall,"ax"
10 .globl __kernel_vsyscall 11 .globl __kernel_vsyscall
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 14328cab5d3a..72fe60c20d39 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -7,10 +7,12 @@ EXTRA_AFLAGS := -traditional
7obj-y := process.o signal.o entry.o traps.o irq.o \ 7obj-y := process.o signal.o entry.o traps.o irq.o \
8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ 8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
9 x8664_ksyms.o i387.o syscall.o vsyscall.o \ 9 x8664_ksyms.o i387.o syscall.o vsyscall.o \
10 setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o 10 setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
11 dmi_scan.o pci-dma.o pci-nommu.o
11 12
12obj-$(CONFIG_X86_MCE) += mce.o 13obj-$(CONFIG_X86_MCE) += mce.o
13obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o 14obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
15obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
14obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ 16obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/
15obj-$(CONFIG_ACPI) += acpi/ 17obj-$(CONFIG_ACPI) += acpi/
16obj-$(CONFIG_X86_MSR) += msr.o 18obj-$(CONFIG_X86_MSR) += msr.o
@@ -21,14 +23,16 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
21obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ 23obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
22 genapic.o genapic_cluster.o genapic_flat.o 24 genapic.o genapic_cluster.o genapic_flat.o
23obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o 25obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
26obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
24obj-$(CONFIG_PM) += suspend.o 27obj-$(CONFIG_PM) += suspend.o
25obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o 28obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
26obj-$(CONFIG_CPU_FREQ) += cpufreq/ 29obj-$(CONFIG_CPU_FREQ) += cpufreq/
27obj-$(CONFIG_EARLY_PRINTK) += early_printk.o 30obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
28obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o 31obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o
29obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o 32obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
30obj-$(CONFIG_KPROBES) += kprobes.o 33obj-$(CONFIG_KPROBES) += kprobes.o
31obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o 34obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o
35obj-$(CONFIG_X86_VSMP) += vsmp.o
32 36
33obj-$(CONFIG_MODULES) += module.o 37obj-$(CONFIG_MODULES) += module.o
34 38
@@ -45,3 +49,5 @@ intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o
45quirks-y += ../../i386/kernel/quirks.o 49quirks-y += ../../i386/kernel/quirks.o
46i8237-y += ../../i386/kernel/i8237.o 50i8237-y += ../../i386/kernel/i8237.o
47msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o 51msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o
52dmi_scan-y += ../../i386/kernel/dmi_scan.o
53
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index 962ad4823b6a..e4e2b7d01f89 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -23,6 +23,7 @@
23#include <asm/io.h> 23#include <asm/io.h>
24#include <asm/proto.h> 24#include <asm/proto.h>
25#include <asm/pci-direct.h> 25#include <asm/pci-direct.h>
26#include <asm/dma.h>
26 27
27int iommu_aperture; 28int iommu_aperture;
28int iommu_aperture_disabled __initdata = 0; 29int iommu_aperture_disabled __initdata = 0;
@@ -196,7 +197,7 @@ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
196void __init iommu_hole_init(void) 197void __init iommu_hole_init(void)
197{ 198{
198 int fix, num; 199 int fix, num;
199 u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0; 200 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
200 u64 aper_base, last_aper_base = 0; 201 u64 aper_base, last_aper_base = 0;
201 int valid_agp = 0; 202 int valid_agp = 0;
202 203
@@ -247,7 +248,7 @@ void __init iommu_hole_init(void)
247 /* Got the aperture from the AGP bridge */ 248 /* Got the aperture from the AGP bridge */
248 } else if (swiotlb && !valid_agp) { 249 } else if (swiotlb && !valid_agp) {
249 /* Do nothing */ 250 /* Do nothing */
250 } else if ((!no_iommu && end_pfn >= 0xffffffff>>PAGE_SHIFT) || 251 } else if ((!no_iommu && end_pfn >= MAX_DMA32_PFN) ||
251 force_iommu || 252 force_iommu ||
252 valid_agp || 253 valid_agp ||
253 fallback_aper_force) { 254 fallback_aper_force) {
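
Two small cleanups above: aper_order is now initialized, presumably to silence a may-be-used-uninitialized warning on the fixup path, and the open-coded 4 GB boundary test is replaced by a named constant. For reference, with 4 KB pages the two expressions describe essentially the same boundary (the definition below is an assumption, shown only for illustration):

    #define PAGE_SHIFT    12
    /* first pfn at or above 4 GB; assumed definition */
    #define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
    /* old test: end_pfn >= 0xffffffff >> PAGE_SHIFT  (one pfn lower) */
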
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index b6e7715d877f..5d3c5b07b8db 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -25,6 +25,7 @@
25#include <linux/mc146818rtc.h> 25#include <linux/mc146818rtc.h>
26#include <linux/kernel_stat.h> 26#include <linux/kernel_stat.h>
27#include <linux/sysdev.h> 27#include <linux/sysdev.h>
28#include <linux/module.h>
28 29
29#include <asm/atomic.h> 30#include <asm/atomic.h>
30#include <asm/smp.h> 31#include <asm/smp.h>
@@ -33,36 +34,36 @@
33#include <asm/pgalloc.h> 34#include <asm/pgalloc.h>
34#include <asm/mach_apic.h> 35#include <asm/mach_apic.h>
35#include <asm/nmi.h> 36#include <asm/nmi.h>
37#include <asm/idle.h>
36 38
37int apic_verbosity; 39int apic_verbosity;
38 40
39int disable_apic_timer __initdata; 41int disable_apic_timer __initdata;
40 42
43/*
44 * cpu_mask that denotes the CPUs that need the timer interrupt coming in as
45 * IPIs in place of local APIC timers
46 */
47static cpumask_t timer_interrupt_broadcast_ipi_mask;
48
41/* Using APIC to generate smp_local_timer_interrupt? */ 49/* Using APIC to generate smp_local_timer_interrupt? */
42int using_apic_timer = 0; 50int using_apic_timer = 0;
43 51
44static DEFINE_PER_CPU(int, prof_multiplier) = 1;
45static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
46static DEFINE_PER_CPU(int, prof_counter) = 1;
47
48static void apic_pm_activate(void); 52static void apic_pm_activate(void);
49 53
50void enable_NMI_through_LVT0 (void * dummy) 54void enable_NMI_through_LVT0 (void * dummy)
51{ 55{
52 unsigned int v, ver; 56 unsigned int v;
53 57
54 ver = apic_read(APIC_LVR);
55 ver = GET_APIC_VERSION(ver);
56 v = APIC_DM_NMI; /* unmask and set to NMI */ 58 v = APIC_DM_NMI; /* unmask and set to NMI */
57 apic_write_around(APIC_LVT0, v); 59 apic_write(APIC_LVT0, v);
58} 60}
59 61
60int get_maxlvt(void) 62int get_maxlvt(void)
61{ 63{
62 unsigned int v, ver, maxlvt; 64 unsigned int v, maxlvt;
63 65
64 v = apic_read(APIC_LVR); 66 v = apic_read(APIC_LVR);
65 ver = GET_APIC_VERSION(v);
66 maxlvt = GET_APIC_MAXLVT(v); 67 maxlvt = GET_APIC_MAXLVT(v);
67 return maxlvt; 68 return maxlvt;
68} 69}
@@ -80,33 +81,33 @@ void clear_local_APIC(void)
80 */ 81 */
81 if (maxlvt >= 3) { 82 if (maxlvt >= 3) {
82 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ 83 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
83 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); 84 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
84 } 85 }
85 /* 86 /*
86 * Careful: we have to set masks only first to deassert 87 * Careful: we have to set masks only first to deassert
87 * any level-triggered sources. 88 * any level-triggered sources.
88 */ 89 */
89 v = apic_read(APIC_LVTT); 90 v = apic_read(APIC_LVTT);
90 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); 91 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
91 v = apic_read(APIC_LVT0); 92 v = apic_read(APIC_LVT0);
92 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); 93 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
93 v = apic_read(APIC_LVT1); 94 v = apic_read(APIC_LVT1);
94 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); 95 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
95 if (maxlvt >= 4) { 96 if (maxlvt >= 4) {
96 v = apic_read(APIC_LVTPC); 97 v = apic_read(APIC_LVTPC);
97 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); 98 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
98 } 99 }
99 100
100 /* 101 /*
101 * Clean APIC state for other OSs: 102 * Clean APIC state for other OSs:
102 */ 103 */
103 apic_write_around(APIC_LVTT, APIC_LVT_MASKED); 104 apic_write(APIC_LVTT, APIC_LVT_MASKED);
104 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 105 apic_write(APIC_LVT0, APIC_LVT_MASKED);
105 apic_write_around(APIC_LVT1, APIC_LVT_MASKED); 106 apic_write(APIC_LVT1, APIC_LVT_MASKED);
106 if (maxlvt >= 3) 107 if (maxlvt >= 3)
107 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); 108 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
108 if (maxlvt >= 4) 109 if (maxlvt >= 4)
109 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); 110 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
110 v = GET_APIC_VERSION(apic_read(APIC_LVR)); 111 v = GET_APIC_VERSION(apic_read(APIC_LVR));
111 apic_write(APIC_ESR, 0); 112 apic_write(APIC_ESR, 0);
112 apic_read(APIC_ESR); 113 apic_read(APIC_ESR);
@@ -151,7 +152,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
151 value &= ~APIC_VECTOR_MASK; 152 value &= ~APIC_VECTOR_MASK;
152 value |= APIC_SPIV_APIC_ENABLED; 153 value |= APIC_SPIV_APIC_ENABLED;
153 value |= 0xf; 154 value |= 0xf;
154 apic_write_around(APIC_SPIV, value); 155 apic_write(APIC_SPIV, value);
155 156
156 if (!virt_wire_setup) { 157 if (!virt_wire_setup) {
157 /* For LVT0 make it edge triggered, active high, external and enabled */ 158 /* For LVT0 make it edge triggered, active high, external and enabled */
@@ -161,11 +162,11 @@ void disconnect_bsp_APIC(int virt_wire_setup)
161 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); 162 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
162 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 163 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
163 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); 164 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
164 apic_write_around(APIC_LVT0, value); 165 apic_write(APIC_LVT0, value);
165 } 166 }
166 else { 167 else {
167 /* Disable LVT0 */ 168 /* Disable LVT0 */
168 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 169 apic_write(APIC_LVT0, APIC_LVT_MASKED);
169 } 170 }
170 171
171 /* For LVT1 make it edge triggered, active high, nmi and enabled */ 172 /* For LVT1 make it edge triggered, active high, nmi and enabled */
@@ -176,7 +177,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
176 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 177 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
177 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 178 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
178 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 179 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
179 apic_write_around(APIC_LVT1, value); 180 apic_write(APIC_LVT1, value);
180 } 181 }
181} 182}
182 183
@@ -192,7 +193,7 @@ void disable_local_APIC(void)
192 */ 193 */
193 value = apic_read(APIC_SPIV); 194 value = apic_read(APIC_SPIV);
194 value &= ~APIC_SPIV_APIC_ENABLED; 195 value &= ~APIC_SPIV_APIC_ENABLED;
195 apic_write_around(APIC_SPIV, value); 196 apic_write(APIC_SPIV, value);
196} 197}
197 198
198/* 199/*
@@ -269,7 +270,7 @@ void __init sync_Arb_IDs(void)
269 apic_wait_icr_idle(); 270 apic_wait_icr_idle();
270 271
271 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 272 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
272 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG 273 apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
273 | APIC_DM_INIT); 274 | APIC_DM_INIT);
274} 275}
275 276
@@ -280,7 +281,7 @@ extern void __error_in_apic_c (void);
280 */ 281 */
281void __init init_bsp_APIC(void) 282void __init init_bsp_APIC(void)
282{ 283{
283 unsigned int value, ver; 284 unsigned int value;
284 285
285 /* 286 /*
286 * Don't do the setup now if we have a SMP BIOS as the 287 * Don't do the setup now if we have a SMP BIOS as the
@@ -290,7 +291,6 @@ void __init init_bsp_APIC(void)
290 return; 291 return;
291 292
292 value = apic_read(APIC_LVR); 293 value = apic_read(APIC_LVR);
293 ver = GET_APIC_VERSION(value);
294 294
295 /* 295 /*
296 * Do not trust the local APIC being empty at bootup. 296 * Do not trust the local APIC being empty at bootup.
@@ -305,22 +305,21 @@ void __init init_bsp_APIC(void)
305 value |= APIC_SPIV_APIC_ENABLED; 305 value |= APIC_SPIV_APIC_ENABLED;
306 value |= APIC_SPIV_FOCUS_DISABLED; 306 value |= APIC_SPIV_FOCUS_DISABLED;
307 value |= SPURIOUS_APIC_VECTOR; 307 value |= SPURIOUS_APIC_VECTOR;
308 apic_write_around(APIC_SPIV, value); 308 apic_write(APIC_SPIV, value);
309 309
310 /* 310 /*
311 * Set up the virtual wire mode. 311 * Set up the virtual wire mode.
312 */ 312 */
313 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 313 apic_write(APIC_LVT0, APIC_DM_EXTINT);
314 value = APIC_DM_NMI; 314 value = APIC_DM_NMI;
315 apic_write_around(APIC_LVT1, value); 315 apic_write(APIC_LVT1, value);
316} 316}
317 317
318void __cpuinit setup_local_APIC (void) 318void __cpuinit setup_local_APIC (void)
319{ 319{
320 unsigned int value, ver, maxlvt; 320 unsigned int value, maxlvt;
321 321
322 value = apic_read(APIC_LVR); 322 value = apic_read(APIC_LVR);
323 ver = GET_APIC_VERSION(value);
324 323
325 if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) 324 if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
326 __error_in_apic_c(); 325 __error_in_apic_c();
@@ -345,7 +344,7 @@ void __cpuinit setup_local_APIC (void)
345 */ 344 */
346 value = apic_read(APIC_TASKPRI); 345 value = apic_read(APIC_TASKPRI);
347 value &= ~APIC_TPRI_MASK; 346 value &= ~APIC_TPRI_MASK;
348 apic_write_around(APIC_TASKPRI, value); 347 apic_write(APIC_TASKPRI, value);
349 348
350 /* 349 /*
351 * Now that we are all set up, enable the APIC 350 * Now that we are all set up, enable the APIC
@@ -387,7 +386,7 @@ void __cpuinit setup_local_APIC (void)
387 * Set spurious IRQ vector 386 * Set spurious IRQ vector
388 */ 387 */
389 value |= SPURIOUS_APIC_VECTOR; 388 value |= SPURIOUS_APIC_VECTOR;
390 apic_write_around(APIC_SPIV, value); 389 apic_write(APIC_SPIV, value);
391 390
392 /* 391 /*
393 * Set up LVT0, LVT1: 392 * Set up LVT0, LVT1:
@@ -407,7 +406,7 @@ void __cpuinit setup_local_APIC (void)
407 value = APIC_DM_EXTINT | APIC_LVT_MASKED; 406 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
408 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id()); 407 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id());
409 } 408 }
410 apic_write_around(APIC_LVT0, value); 409 apic_write(APIC_LVT0, value);
411 410
412 /* 411 /*
413 * only the BP should see the LINT1 NMI signal, obviously. 412 * only the BP should see the LINT1 NMI signal, obviously.
@@ -416,14 +415,14 @@ void __cpuinit setup_local_APIC (void)
416 value = APIC_DM_NMI; 415 value = APIC_DM_NMI;
417 else 416 else
418 value = APIC_DM_NMI | APIC_LVT_MASKED; 417 value = APIC_DM_NMI | APIC_LVT_MASKED;
419 apic_write_around(APIC_LVT1, value); 418 apic_write(APIC_LVT1, value);
420 419
421 { 420 {
422 unsigned oldvalue; 421 unsigned oldvalue;
423 maxlvt = get_maxlvt(); 422 maxlvt = get_maxlvt();
424 oldvalue = apic_read(APIC_ESR); 423 oldvalue = apic_read(APIC_ESR);
425 value = ERROR_APIC_VECTOR; // enables sending errors 424 value = ERROR_APIC_VECTOR; // enables sending errors
426 apic_write_around(APIC_LVTERR, value); 425 apic_write(APIC_LVTERR, value);
427 /* 426 /*
428 * spec says clear errors after enabling vector. 427 * spec says clear errors after enabling vector.
429 */ 428 */
@@ -500,13 +499,10 @@ static int lapic_resume(struct sys_device *dev)
500 if (!apic_pm_state.active) 499 if (!apic_pm_state.active)
501 return 0; 500 return 0;
502 501
503 /* XXX: Pavel needs this for S3 resume, but can't explain why */
504 set_fixmap_nocache(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
505
506 local_irq_save(flags); 502 local_irq_save(flags);
507 rdmsr(MSR_IA32_APICBASE, l, h); 503 rdmsr(MSR_IA32_APICBASE, l, h);
508 l &= ~MSR_IA32_APICBASE_BASE; 504 l &= ~MSR_IA32_APICBASE_BASE;
509 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; 505 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
510 wrmsr(MSR_IA32_APICBASE, l, h); 506 wrmsr(MSR_IA32_APICBASE, l, h);
511 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 507 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
512 apic_write(APIC_ID, apic_pm_state.apic_id); 508 apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -660,20 +656,25 @@ void __init init_apic_mappings(void)
660static void __setup_APIC_LVTT(unsigned int clocks) 656static void __setup_APIC_LVTT(unsigned int clocks)
661{ 657{
662 unsigned int lvtt_value, tmp_value, ver; 658 unsigned int lvtt_value, tmp_value, ver;
659 int cpu = smp_processor_id();
663 660
664 ver = GET_APIC_VERSION(apic_read(APIC_LVR)); 661 ver = GET_APIC_VERSION(apic_read(APIC_LVR));
665 lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; 662 lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
666 apic_write_around(APIC_LVTT, lvtt_value); 663
664 if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask))
665 lvtt_value |= APIC_LVT_MASKED;
666
667 apic_write(APIC_LVTT, lvtt_value);
667 668
668 /* 669 /*
669 * Divide PICLK by 16 670 * Divide PICLK by 16
670 */ 671 */
671 tmp_value = apic_read(APIC_TDCR); 672 tmp_value = apic_read(APIC_TDCR);
672 apic_write_around(APIC_TDCR, (tmp_value 673 apic_write(APIC_TDCR, (tmp_value
673 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 674 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
674 | APIC_TDR_DIV_16); 675 | APIC_TDR_DIV_16);
675 676
676 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); 677 apic_write(APIC_TMICT, clocks/APIC_DIVISOR);
677} 678}
678 679
679static void setup_APIC_timer(unsigned int clocks) 680static void setup_APIC_timer(unsigned int clocks)
@@ -682,12 +683,6 @@ static void setup_APIC_timer(unsigned int clocks)
682 683
683 local_irq_save(flags); 684 local_irq_save(flags);
684 685
685 /* For some reasons this doesn't work on Simics, so fake it for now */
686 if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) {
687 __setup_APIC_LVTT(clocks);
688 return;
689 }
690
691 /* wait for irq slice */ 686 /* wait for irq slice */
692 if (vxtime.hpet_address) { 687 if (vxtime.hpet_address) {
693 int trigger = hpet_readl(HPET_T0_CMP); 688 int trigger = hpet_readl(HPET_T0_CMP);
@@ -700,7 +695,7 @@ static void setup_APIC_timer(unsigned int clocks)
700 outb_p(0x00, 0x43); 695 outb_p(0x00, 0x43);
701 c2 = inb_p(0x40); 696 c2 = inb_p(0x40);
702 c2 |= inb_p(0x40) << 8; 697 c2 |= inb_p(0x40) << 8;
703 do { 698 do {
704 c1 = c2; 699 c1 = c2;
705 outb_p(0x00, 0x43); 700 outb_p(0x00, 0x43);
706 c2 = inb_p(0x40); 701 c2 = inb_p(0x40);
@@ -785,54 +780,80 @@ void __cpuinit setup_secondary_APIC_clock(void)
785 local_irq_enable(); 780 local_irq_enable();
786} 781}
787 782
788void __cpuinit disable_APIC_timer(void) 783void disable_APIC_timer(void)
789{ 784{
790 if (using_apic_timer) { 785 if (using_apic_timer) {
791 unsigned long v; 786 unsigned long v;
792 787
793 v = apic_read(APIC_LVTT); 788 v = apic_read(APIC_LVTT);
794 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); 789 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
795 } 790 }
796} 791}
797 792
798void enable_APIC_timer(void) 793void enable_APIC_timer(void)
799{ 794{
800 if (using_apic_timer) { 795 int cpu = smp_processor_id();
796
797 if (using_apic_timer &&
798 !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
801 unsigned long v; 799 unsigned long v;
802 800
803 v = apic_read(APIC_LVTT); 801 v = apic_read(APIC_LVTT);
804 apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); 802 apic_write(APIC_LVTT, v & ~APIC_LVT_MASKED);
805 } 803 }
806} 804}
807 805
808/* 806void switch_APIC_timer_to_ipi(void *cpumask)
809 * the frequency of the profiling timer can be changed
810 * by writing a multiplier value into /proc/profile.
811 */
812int setup_profiling_timer(unsigned int multiplier)
813{ 807{
814 int i; 808 cpumask_t mask = *(cpumask_t *)cpumask;
809 int cpu = smp_processor_id();
815 810
816 /* 811 if (cpu_isset(cpu, mask) &&
817 * Sanity check. [at least 500 APIC cycles should be 812 !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
818 * between APIC interrupts as a rule of thumb, to avoid 813 disable_APIC_timer();
819 * irqs flooding us] 814 cpu_set(cpu, timer_interrupt_broadcast_ipi_mask);
820 */ 815 }
821 if ( (!multiplier) || (calibration_result/multiplier < 500)) 816}
822 return -EINVAL; 817EXPORT_SYMBOL(switch_APIC_timer_to_ipi);
823
824 /*
825 * Set the new multiplier for each CPU. CPUs don't start using the
826 * new values until the next timer interrupt in which they do process
827 * accounting. At that time they also adjust their APIC timers
828 * accordingly.
829 */
830 for (i = 0; i < NR_CPUS; ++i)
831 per_cpu(prof_multiplier, i) = multiplier;
832 818
833 return 0; 819void smp_send_timer_broadcast_ipi(void)
820{
821 cpumask_t mask;
822
823 cpus_and(mask, cpu_online_map, timer_interrupt_broadcast_ipi_mask);
824 if (!cpus_empty(mask)) {
825 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
826 }
827}
828
829void switch_ipi_to_APIC_timer(void *cpumask)
830{
831 cpumask_t mask = *(cpumask_t *)cpumask;
832 int cpu = smp_processor_id();
833
834 if (cpu_isset(cpu, mask) &&
835 cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
836 cpu_clear(cpu, timer_interrupt_broadcast_ipi_mask);
837 enable_APIC_timer();
838 }
839}
840EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
841
842int setup_profiling_timer(unsigned int multiplier)
843{
844 return -EINVAL;
834} 845}
835 846
847#ifdef CONFIG_X86_MCE_AMD
848void setup_threshold_lvt(unsigned long lvt_off)
849{
850 unsigned int v = 0;
851 unsigned long reg = (lvt_off << 4) + 0x500;
852 v |= THRESHOLD_APIC_VECTOR;
853 apic_write(reg, v);
854}
855#endif /* CONFIG_X86_MCE_AMD */
856
836#undef APIC_DIVISOR 857#undef APIC_DIVISOR
837 858
838/* 859/*
@@ -847,32 +868,10 @@ int setup_profiling_timer(unsigned int multiplier)
847 868
848void smp_local_timer_interrupt(struct pt_regs *regs) 869void smp_local_timer_interrupt(struct pt_regs *regs)
849{ 870{
850 int cpu = smp_processor_id();
851
852 profile_tick(CPU_PROFILING, regs); 871 profile_tick(CPU_PROFILING, regs);
853 if (--per_cpu(prof_counter, cpu) <= 0) {
854 /*
855 * The multiplier may have changed since the last time we got
856 * to this point as a result of the user writing to
857 * /proc/profile. In this case we need to adjust the APIC
858 * timer accordingly.
859 *
860 * Interrupts are already masked off at this point.
861 */
862 per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu);
863 if (per_cpu(prof_counter, cpu) !=
864 per_cpu(prof_old_multiplier, cpu)) {
865 __setup_APIC_LVTT(calibration_result/
866 per_cpu(prof_counter, cpu));
867 per_cpu(prof_old_multiplier, cpu) =
868 per_cpu(prof_counter, cpu);
869 }
870
871#ifdef CONFIG_SMP 872#ifdef CONFIG_SMP
872 update_process_times(user_mode(regs)); 873 update_process_times(user_mode(regs));
873#endif 874#endif
874 }
875
876 /* 875 /*
877 * We take the 'long' return path, and there every subsystem 876 * We take the 'long' return path, and there every subsystem
878 * grabs the appropriate locks (kernel lock/ irq lock). 877 * grabs the appropriate locks (kernel lock/ irq lock).
@@ -910,6 +909,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
910 * Besides, if we don't, timer interrupts ignore the global 909 * Besides, if we don't, timer interrupts ignore the global
911 * interrupt lock, which is the WrongThing (tm) to do. 910 * interrupt lock, which is the WrongThing (tm) to do.
912 */ 911 */
912 exit_idle();
913 irq_enter(); 913 irq_enter();
914 smp_local_timer_interrupt(regs); 914 smp_local_timer_interrupt(regs);
915 irq_exit(); 915 irq_exit();
@@ -969,6 +969,7 @@ __init int oem_force_hpet_timer(void)
969asmlinkage void smp_spurious_interrupt(void) 969asmlinkage void smp_spurious_interrupt(void)
970{ 970{
971 unsigned int v; 971 unsigned int v;
972 exit_idle();
972 irq_enter(); 973 irq_enter();
973 /* 974 /*
974 * Check if this really is a spurious interrupt and ACK it 975 * Check if this really is a spurious interrupt and ACK it
@@ -1004,6 +1005,7 @@ asmlinkage void smp_error_interrupt(void)
1004{ 1005{
1005 unsigned int v, v1; 1006 unsigned int v, v1;
1006 1007
1008 exit_idle();
1007 irq_enter(); 1009 irq_enter();
1008 /* First tickle the hardware, only then report what went on. -- REW */ 1010 /* First tickle the hardware, only then report what went on. -- REW */
1009 v = apic_read(APIC_ESR); 1011 v = apic_read(APIC_ESR);
@@ -1050,7 +1052,7 @@ int __init APIC_init_uniprocessor (void)
1050 connect_bsp_APIC(); 1052 connect_bsp_APIC();
1051 1053
1052 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); 1054 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
1053 apic_write_around(APIC_ID, boot_cpu_id); 1055 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
1054 1056
1055 setup_local_APIC(); 1057 setup_local_APIC();
1056 1058
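
The broadcast-IPI machinery added above exists for CPUs whose local APIC timer cannot be relied on (for example in deep idle states): such CPUs are collected in timer_interrupt_broadcast_ipi_mask and receive their ticks as LOCAL_TIMER_VECTOR IPIs via smp_send_timer_broadcast_ipi() from the global timer path. The two exported switch functions are written to run on each CPU with the affected mask as argument; a hedged usage sketch follows (the caller is an assumption, only the two exports come from the patch):

    static void timers_to_ipi(cpumask_t *affected)
    {
            /* each CPU checks the mask itself and flips its own timer */
            on_each_cpu(switch_APIC_timer_to_ipi, affected, 1, 1);
    }

    static void timers_to_lapic(cpumask_t *affected)
    {
            on_each_cpu(switch_ipi_to_APIC_timer, affected, 1, 1);
    }
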
diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c
index aaa6d3833517..38834bbbae11 100644
--- a/arch/x86_64/kernel/asm-offsets.c
+++ b/arch/x86_64/kernel/asm-offsets.c
@@ -33,6 +33,7 @@ int main(void)
33 ENTRY(flags); 33 ENTRY(flags);
34 ENTRY(addr_limit); 34 ENTRY(addr_limit);
35 ENTRY(preempt_count); 35 ENTRY(preempt_count);
36 ENTRY(status);
36 BLANK(); 37 BLANK();
37#undef ENTRY 38#undef ENTRY
38#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) 39#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
@@ -42,6 +43,7 @@ int main(void)
42 ENTRY(irqcount); 43 ENTRY(irqcount);
43 ENTRY(cpunumber); 44 ENTRY(cpunumber);
44 ENTRY(irqstackptr); 45 ENTRY(irqstackptr);
46 ENTRY(data_offset);
45 BLANK(); 47 BLANK();
46#undef ENTRY 48#undef ENTRY
47#ifdef CONFIG_IA32_EMULATION 49#ifdef CONFIG_IA32_EMULATION
@@ -64,5 +66,7 @@ int main(void)
64 DEFINE(pbe_address, offsetof(struct pbe, address)); 66 DEFINE(pbe_address, offsetof(struct pbe, address));
65 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); 67 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
66 DEFINE(pbe_next, offsetof(struct pbe, next)); 68 DEFINE(pbe_next, offsetof(struct pbe, next));
69 BLANK();
70 DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
67 return 0; 71 return 0;
68} 72}
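
The new ENTRY(status), ENTRY(data_offset) and DEFINE(TSS_ist, ...) lines are what let the assembly elsewhere in this merge (threadinfo_status, pda_data_offset, TSS_ist) use symbolic offsets. The mechanism, shown for context, is the standard asm-offsets idiom: this file is compiled to assembly only, and markers like the one below are scraped by the build into a generated header of #defines:

    /* the usual definition behind DEFINE(); shown for context, not new */
    #define DEFINE(sym, val) \
            asm volatile("\n->" #sym " %0 " #val : : "i" (val))
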
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
index 535e04466079..4e6c3b729e39 100644
--- a/arch/x86_64/kernel/crash.c
+++ b/arch/x86_64/kernel/crash.c
@@ -11,19 +11,156 @@
11#include <linux/types.h> 11#include <linux/types.h>
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/irq.h>
14#include <linux/reboot.h> 15#include <linux/reboot.h>
15#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/delay.h>
18#include <linux/elf.h>
19#include <linux/elfcore.h>
16 20
17#include <asm/processor.h> 21#include <asm/processor.h>
18#include <asm/hardirq.h> 22#include <asm/hardirq.h>
19#include <asm/nmi.h> 23#include <asm/nmi.h>
20#include <asm/hw_irq.h> 24#include <asm/hw_irq.h>
25#include <asm/mach_apic.h>
21 26
22note_buf_t crash_notes[NR_CPUS]; 27/* Keeps track of which cpu is the crashing cpu. */

28static int crashing_cpu;
29
30static u32 *append_elf_note(u32 *buf, char *name, unsigned type,
31 void *data, size_t data_len)
32{
33 struct elf_note note;
34
35 note.n_namesz = strlen(name) + 1;
36 note.n_descsz = data_len;
37 note.n_type = type;
38 memcpy(buf, &note, sizeof(note));
39 buf += (sizeof(note) +3)/4;
40 memcpy(buf, name, note.n_namesz);
41 buf += (note.n_namesz + 3)/4;
42 memcpy(buf, data, note.n_descsz);
43 buf += (note.n_descsz + 3)/4;
44
45 return buf;
46}
47
48static void final_note(u32 *buf)
49{
50 struct elf_note note;
51
52 note.n_namesz = 0;
53 note.n_descsz = 0;
54 note.n_type = 0;
55 memcpy(buf, &note, sizeof(note));
56}
57
58static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
59{
60 struct elf_prstatus prstatus;
61 u32 *buf;
62
63 if ((cpu < 0) || (cpu >= NR_CPUS))
64 return;
65
66 /* Using ELF notes here is opportunistic.
67 * I need a well defined structure format
68 * for the data I pass, and I need tags
69 * on the data to indicate what information I have
70 * squirrelled away. ELF notes happen to provide
 71 * all of that, so there is no need to invent something new.
72 */
73
74 buf = (u32*)per_cpu_ptr(crash_notes, cpu);
75
76 if (!buf)
77 return;
78
79 memset(&prstatus, 0, sizeof(prstatus));
80 prstatus.pr_pid = current->pid;
81 elf_core_copy_regs(&prstatus.pr_reg, regs);
82 buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
83 sizeof(prstatus));
84 final_note(buf);
85}
86
87static void crash_save_self(struct pt_regs *regs)
88{
89 int cpu;
90
91 cpu = smp_processor_id();
92 crash_save_this_cpu(regs, cpu);
93}
94
95#ifdef CONFIG_SMP
96static atomic_t waiting_for_crash_ipi;
97
98static int crash_nmi_callback(struct pt_regs *regs, int cpu)
99{
100 /*
 101 * Don't do anything if this handler is invoked on the crashing cpu.
 102 * Otherwise, the system will completely hang. The crashing cpu can get
 103 * an NMI if the system was initially booted with the nmi_watchdog parameter.
104 */
105 if (cpu == crashing_cpu)
106 return 1;
107 local_irq_disable();
108
109 crash_save_this_cpu(regs, cpu);
110 disable_local_APIC();
111 atomic_dec(&waiting_for_crash_ipi);
112 /* Assume hlt works */
113 for(;;)
114 asm("hlt");
115
116 return 1;
117}
118
119static void smp_send_nmi_allbutself(void)
120{
121 send_IPI_allbutself(APIC_DM_NMI);
122}
123
124/*
125 * This code is a best effort heuristic to get the
126 * other cpus to stop executing. So races with
127 * cpu hotplug shouldn't matter.
128 */
129
130static void nmi_shootdown_cpus(void)
131{
132 unsigned long msecs;
133
134 atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
135 set_nmi_callback(crash_nmi_callback);
136
137 /*
138 * Ensure the new callback function is set before sending
139 * out the NMI
140 */
141 wmb();
142
143 smp_send_nmi_allbutself();
144
145 msecs = 1000; /* Wait at most a second for the other cpus to stop */
146 while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
147 mdelay(1);
148 msecs--;
149 }
150 /* Leave the nmi callback set */
151 disable_local_APIC();
152}
153#else
154static void nmi_shootdown_cpus(void)
155{
 156 /* There are no cpus to shoot down */
157}
158#endif
23 159
24void machine_crash_shutdown(struct pt_regs *regs) 160void machine_crash_shutdown(struct pt_regs *regs)
25{ 161{
26 /* This function is only called after the system 162 /*
163 * This function is only called after the system
27 * has panicked or is otherwise in a critical state. 164 * has panicked or is otherwise in a critical state.
28 * The minimum amount of code to allow a kexec'd kernel 165 * The minimum amount of code to allow a kexec'd kernel
29 * to run successfully needs to happen here. 166 * to run successfully needs to happen here.
@@ -31,4 +168,19 @@ void machine_crash_shutdown(struct pt_regs *regs)
31 * In practice this means shooting down the other cpus in 168 * In practice this means shooting down the other cpus in
32 * an SMP system. 169 * an SMP system.
33 */ 170 */
171 /* The kernel is broken so disable interrupts */
172 local_irq_disable();
173
 174 /* Make a note of the crashing cpu. Will be used in the NMI callback. */
175 crashing_cpu = smp_processor_id();
176 nmi_shootdown_cpus();
177
178 if(cpu_has_apic)
179 disable_local_APIC();
180
181#if defined(CONFIG_X86_IO_APIC)
182 disable_IO_APIC();
183#endif
184
185 crash_save_self(regs);
34} 186}
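
append_elf_note() advances buf in 4-byte units because it is a u32 pointer: the header, name and descriptor are each rounded up to whole words, as ELF notes require. A small standalone illustration of that arithmetic (not kernel code):

    #include <stdio.h>

    /* same rounding as the (x + 3) / 4 expressions above */
    static unsigned words(unsigned bytes) { return (bytes + 3) / 4; }

    int main(void)
    {
            unsigned hdr  = words(12); /* struct elf_note: three 32-bit fields */
            unsigned name = words(5);  /* "CORE" + NUL -> 2 words, 3 pad bytes */
            printf("descriptor starts at word %u\n", hdr + name);  /* 5 */
            return 0;
    }
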
diff --git a/arch/x86_64/kernel/crash_dump.c b/arch/x86_64/kernel/crash_dump.c
new file mode 100644
index 000000000000..942deac4d43a
--- /dev/null
+++ b/arch/x86_64/kernel/crash_dump.c
@@ -0,0 +1,47 @@
1/*
2 * kernel/crash_dump.c - Memory preserving reboot related code.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */
7
8#include <linux/errno.h>
9#include <linux/crash_dump.h>
10
11#include <asm/uaccess.h>
12#include <asm/io.h>
13
14/**
15 * copy_oldmem_page - copy one page from "oldmem"
16 * @pfn: page frame number to be copied
17 * @buf: target memory address for the copy; this can be in kernel address
18 * space or user address space (see @userbuf)
19 * @csize: number of bytes to copy
20 * @offset: offset in bytes into the page (based on pfn) to begin the copy
21 * @userbuf: if set, @buf is in user address space, use copy_to_user(),
22 * otherwise @buf is in kernel address space, use memcpy().
23 *
24 * Copy a page from "oldmem". For this page, there is no pte mapped
25 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
26 */
27ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
28 size_t csize, unsigned long offset, int userbuf)
29{
30 void *vaddr;
31
32 if (!csize)
33 return 0;
34
35 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
36
37 if (userbuf) {
38 if (copy_to_user(buf, (vaddr + offset), csize)) {
39 iounmap(vaddr);
40 return -EFAULT;
41 }
42 } else
43 memcpy(buf, (vaddr + offset), csize);
44
45 iounmap(vaddr);
46 return csize;
47}
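
copy_oldmem_page() is the arch hook the generic crash-dump code (presumably the /proc/vmcore reader) uses to reach the previous kernel's memory, one temporarily mapped page at a time. A hedged sketch of a conforming caller, to show the contract; the helper and loop are illustrative only:

    static ssize_t read_oldmem_span(char __user *ubuf, size_t count,
                                    unsigned long paddr)
    {
            size_t done = 0;

            while (done < count) {
                    unsigned long pfn    = (paddr + done) >> PAGE_SHIFT;
                    unsigned long offset = (paddr + done) & ~PAGE_MASK;
                    size_t chunk = min_t(size_t, count - done,
                                         PAGE_SIZE - offset);
                    ssize_t rc = copy_oldmem_page(pfn, ubuf + done, chunk,
                                                  offset, 1 /* userbuf */);
                    if (rc < 0)
                            return rc;
                    done += rc;
            }
            return done;
    }
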
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index ab3f87aaff70..293cd71a266a 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -23,8 +23,7 @@
23#include <asm/e820.h> 23#include <asm/e820.h>
24#include <asm/proto.h> 24#include <asm/proto.h>
25#include <asm/bootsetup.h> 25#include <asm/bootsetup.h>
26 26#include <asm/sections.h>
27extern char _end[];
28 27
29/* 28/*
30 * PFN of last memory page. 29 * PFN of last memory page.
@@ -560,6 +559,27 @@ void __init parse_memopt(char *p, char **from)
560 end_user_pfn >>= PAGE_SHIFT; 559 end_user_pfn >>= PAGE_SHIFT;
561} 560}
562 561
562void __init parse_memmapopt(char *p, char **from)
563{
564 unsigned long long start_at, mem_size;
565
566 mem_size = memparse(p, from);
567 p = *from;
568 if (*p == '@') {
569 start_at = memparse(p+1, from);
570 add_memory_region(start_at, mem_size, E820_RAM);
571 } else if (*p == '#') {
572 start_at = memparse(p+1, from);
573 add_memory_region(start_at, mem_size, E820_ACPI);
574 } else if (*p == '$') {
575 start_at = memparse(p+1, from);
576 add_memory_region(start_at, mem_size, E820_RESERVED);
577 } else {
578 end_user_pfn = (mem_size >> PAGE_SHIFT);
579 }
580 p = *from;
581}
582
563unsigned long pci_mem_start = 0xaeedbabe; 583unsigned long pci_mem_start = 0xaeedbabe;
564 584
565/* 585/*
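
As parse_memmapopt() reads, the new memmap= boot option accepts four forms, with sizes parsed by memparse() (so K/M/G suffixes apply):

    memmap=nn[KMG]@ss[KMG]    add region ss..ss+nn as usable RAM  (E820_RAM)
    memmap=nn[KMG]#ss[KMG]    mark region as ACPI data            (E820_ACPI)
    memmap=nn[KMG]$ss[KMG]    reserve region                      (E820_RESERVED)
    memmap=nn[KMG]            no separator: cap memory at nn, like mem=
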
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index 9cd968dd0f5a..6dffb498ccd7 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -216,7 +216,7 @@ void early_printk(const char *fmt, ...)
216 va_end(ap); 216 va_end(ap);
217} 217}
218 218
219static int keep_early; 219static int __initdata keep_early;
220 220
221int __init setup_early_printk(char *opt) 221int __init setup_early_printk(char *opt)
222{ 222{
@@ -226,8 +226,6 @@ int __init setup_early_printk(char *opt)
226 if (early_console_initialized) 226 if (early_console_initialized)
227 return -1; 227 return -1;
228 228
229 opt = strchr(opt, '=') + 1;
230
231 strlcpy(buf,opt,sizeof(buf)); 229 strlcpy(buf,opt,sizeof(buf));
232 space = strchr(buf, ' '); 230 space = strchr(buf, ' ');
233 if (space) 231 if (space)
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 7937971d1853..dbdba56e8faa 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -41,6 +41,7 @@
41#include <asm/unistd.h> 41#include <asm/unistd.h>
42#include <asm/thread_info.h> 42#include <asm/thread_info.h>
43#include <asm/hw_irq.h> 43#include <asm/hw_irq.h>
44#include <asm/page.h>
44 45
45 .code64 46 .code64
46 47
@@ -313,6 +314,7 @@ int_with_check:
313 movl threadinfo_flags(%rcx),%edx 314 movl threadinfo_flags(%rcx),%edx
314 andl %edi,%edx 315 andl %edi,%edx
315 jnz int_careful 316 jnz int_careful
317 andl $~TS_COMPAT,threadinfo_status(%rcx)
316 jmp retint_swapgs 318 jmp retint_swapgs
317 319
318 /* Either reschedule or signal or syscall exit tracking needed. */ 320 /* Either reschedule or signal or syscall exit tracking needed. */
@@ -612,6 +614,9 @@ retint_kernel:
612ENTRY(thermal_interrupt) 614ENTRY(thermal_interrupt)
613 apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt 615 apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
614 616
617ENTRY(threshold_interrupt)
618 apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
619
615#ifdef CONFIG_SMP 620#ifdef CONFIG_SMP
616ENTRY(reschedule_interrupt) 621ENTRY(reschedule_interrupt)
617 apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt 622 apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
@@ -670,7 +675,7 @@ ENTRY(spurious_interrupt)
670 675
671 /* error code is on the stack already */ 676 /* error code is on the stack already */
672 /* handle NMI like exceptions that can happen everywhere */ 677 /* handle NMI like exceptions that can happen everywhere */
673 .macro paranoidentry sym 678 .macro paranoidentry sym, ist=0
674 SAVE_ALL 679 SAVE_ALL
675 cld 680 cld
676 movl $1,%ebx 681 movl $1,%ebx
@@ -680,10 +685,20 @@ ENTRY(spurious_interrupt)
680 js 1f 685 js 1f
681 swapgs 686 swapgs
682 xorl %ebx,%ebx 687 xorl %ebx,%ebx
6831: movq %rsp,%rdi 6881:
689 .if \ist
690 movq %gs:pda_data_offset, %rbp
691 .endif
692 movq %rsp,%rdi
684 movq ORIG_RAX(%rsp),%rsi 693 movq ORIG_RAX(%rsp),%rsi
685 movq $-1,ORIG_RAX(%rsp) 694 movq $-1,ORIG_RAX(%rsp)
695 .if \ist
696 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
697 .endif
686 call \sym 698 call \sym
699 .if \ist
700 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
701 .endif
687 cli 702 cli
688 .endm 703 .endm
689 704
@@ -751,7 +766,7 @@ error_exit:
 	jnz retint_careful
 	swapgs
 	RESTORE_ARGS 0,8,0
-	iretq
+	jmp iret_label
 	CFI_ENDPROC
 
error_kernelspace:
@@ -901,7 +916,7 @@ KPROBE_ENTRY(debug)
 	INTR_FRAME
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8
-	paranoidentry do_debug
+	paranoidentry do_debug, DEBUG_STACK
 	jmp paranoid_exit
 	CFI_ENDPROC
 	.previous .text
@@ -956,7 +971,12 @@ paranoid_schedule:
 	CFI_ENDPROC
 
KPROBE_ENTRY(int3)
-	zeroentry do_int3
+	INTR_FRAME
+	pushq $0
+	CFI_ADJUST_CFA_OFFSET 8
+	paranoidentry do_int3, DEBUG_STACK
+	jmp paranoid_exit
+	CFI_ENDPROC
 	.previous .text
 
ENTRY(overflow)
@@ -1018,23 +1038,18 @@ ENTRY(machine_check)
 	CFI_ENDPROC
 #endif
 
-ENTRY(call_debug)
-	zeroentry do_call_debug
-
ENTRY(call_softirq)
 	CFI_STARTPROC
 	movq %gs:pda_irqstackptr,%rax
-	pushq %r15
-	CFI_ADJUST_CFA_OFFSET 8
-	movq %rsp,%r15
-	CFI_DEF_CFA_REGISTER r15
+	movq %rsp,%rdx
+	CFI_DEF_CFA_REGISTER rdx
 	incl %gs:pda_irqcount
 	cmove %rax,%rsp
+	pushq %rdx
+	/*todo CFI_DEF_CFA_EXPRESSION ...*/
 	call __do_softirq
-	movq %r15,%rsp
+	popq %rsp
 	CFI_DEF_CFA_REGISTER rsp
 	decl %gs:pda_irqcount
-	popq %r15
-	CFI_ADJUST_CFA_OFFSET -8
 	ret
 	CFI_ENDPROC
diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c
index a472d62f899a..43fcf62fef0f 100644
--- a/arch/x86_64/kernel/genapic_cluster.c
+++ b/arch/x86_64/kernel/genapic_cluster.c
@@ -72,14 +72,11 @@ static void cluster_send_IPI_mask(cpumask_t mask, int vector)
 static void cluster_send_IPI_allbutself(int vector)
 {
 	cpumask_t mask = cpu_online_map;
-	int me = get_cpu(); /* Ensure we are not preempted when we clear */
 
-	cpu_clear(me, mask);
+	cpu_clear(smp_processor_id(), mask);
 
 	if (!cpus_empty(mask))
 		cluster_send_IPI_mask(mask, vector);
-
-	put_cpu();
 }
 
 static void cluster_send_IPI_all(int vector)
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index 9da3edb799ea..1a2ab825be98 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -83,12 +83,11 @@ static void flat_send_IPI_allbutself(int vector)
 	__send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
 #else
 	cpumask_t allbutme = cpu_online_map;
-	int me = get_cpu(); /* Ensure we are not preempted when we clear */
-	cpu_clear(me, allbutme);
+
+	cpu_clear(smp_processor_id(), allbutme);
 
 	if (!cpus_empty(allbutme))
 		flat_send_IPI_mask(allbutme, vector);
-	put_cpu();
 #endif
 }
 
@@ -149,10 +148,9 @@ static void physflat_send_IPI_mask(cpumask_t cpumask, int vector)
 static void physflat_send_IPI_allbutself(int vector)
 {
 	cpumask_t allbutme = cpu_online_map;
-	int me = get_cpu();
-	cpu_clear(me, allbutme);
+
+	cpu_clear(smp_processor_id(), allbutme);
 	physflat_send_IPI_mask(allbutme, vector);
-	put_cpu();
 }
 
 static void physflat_send_IPI_all(int vector)
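All three send_IPI_allbutself() variants drop the get_cpu()/put_cpu() bracket and clear smp_processor_id() directly: their callers already run with preemption or interrupts disabled, so the CPU number cannot change underneath them. The shape of the pattern, as a hedged sketch with stand-in cpumask helpers rather than the kernel's:

typedef unsigned long cpumask_model;    /* stand-in for cpumask_t */

static void cpu_clear_bit(int cpu, cpumask_model *m) { *m &= ~(1UL << cpu); }

/* Assumes the caller guarantees we cannot migrate off this CPU. */
static void send_allbutself_model(cpumask_model online, int self_cpu,
				  void (*send_mask)(cpumask_model, int),
				  int vector)
{
	cpumask_model mask = online;

	cpu_clear_bit(self_cpu, &mask);
	if (mask != 0)
		send_mask(mask, vector);
}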
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index b92e5f45ed46..692c737feddb 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -12,6 +12,7 @@
 
 #include <linux/linkage.h>
 #include <linux/threads.h>
+#include <linux/init.h>
 #include <asm/desc.h>
 #include <asm/segment.h>
 #include <asm/page.h>
@@ -70,7 +71,7 @@ startup_32:
 	movl %eax, %cr4
 
 	/* Setup early boot stage 4 level pagetables */
-	movl $(init_level4_pgt - __START_KERNEL_map), %eax
+	movl $(boot_level4_pgt - __START_KERNEL_map), %eax
 	movl %eax, %cr3
 
 	/* Setup EFER (Extended Feature Enable Register) */
@@ -113,7 +114,7 @@ startup_64:
 	movq %rax, %cr4
 
 	/* Setup early boot stage 4 level pagetables. */
-	movq $(init_level4_pgt - __START_KERNEL_map), %rax
+	movq $(boot_level4_pgt - __START_KERNEL_map), %rax
 	movq %rax, %cr3
 
 	/* Check if nx is implemented */
@@ -240,116 +241,90 @@ ljumpvector:
ENTRY(stext)
ENTRY(_stext)
 
-	/*
-	 * This default setting generates an ident mapping at address 0x100000
-	 * and a mapping for the kernel that precisely maps virtual address
-	 * 0xffffffff80000000 to physical address 0x000000. (always using
-	 * 2Mbyte large pages provided by PAE mode)
-	 */
-.org 0x1000
-ENTRY(init_level4_pgt)
-	.quad 0x0000000000002007 + __PHYSICAL_START	/* -> level3_ident_pgt */
-	.fill 255,8,0
-	.quad 0x000000000000a007 + __PHYSICAL_START
-	.fill 254,8,0
-	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
-	.quad 0x0000000000003007 + __PHYSICAL_START	/* -> level3_kernel_pgt */
+$page = 0
+#define NEXT_PAGE(name) \
+	$page = $page + 1; \
+	.org $page * 0x1000; \
+	phys_/**/name = $page * 0x1000 + __PHYSICAL_START; \
+ENTRY(name)
 
-.org 0x2000
-ENTRY(level3_ident_pgt)
-	.quad 0x0000000000004007 + __PHYSICAL_START
+NEXT_PAGE(init_level4_pgt)
+	/* This gets initialized in x86_64_start_kernel */
+	.fill 512,8,0
+
+NEXT_PAGE(level3_ident_pgt)
+	.quad phys_level2_ident_pgt | 0x007
 	.fill 511,8,0
 
-.org 0x3000
-ENTRY(level3_kernel_pgt)
+NEXT_PAGE(level3_kernel_pgt)
 	.fill 510,8,0
 	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
-	.quad 0x0000000000005007 + __PHYSICAL_START	/* -> level2_kernel_pgt */
+	.quad phys_level2_kernel_pgt | 0x007
 	.fill 1,8,0
 
-.org 0x4000
-ENTRY(level2_ident_pgt)
+NEXT_PAGE(level2_ident_pgt)
 	/* 40MB for bootup. */
-	.quad 0x0000000000000083
-	.quad 0x0000000000200083
-	.quad 0x0000000000400083
-	.quad 0x0000000000600083
-	.quad 0x0000000000800083
-	.quad 0x0000000000A00083
-	.quad 0x0000000000C00083
-	.quad 0x0000000000E00083
-	.quad 0x0000000001000083
-	.quad 0x0000000001200083
-	.quad 0x0000000001400083
-	.quad 0x0000000001600083
-	.quad 0x0000000001800083
-	.quad 0x0000000001A00083
-	.quad 0x0000000001C00083
-	.quad 0x0000000001E00083
-	.quad 0x0000000002000083
-	.quad 0x0000000002200083
-	.quad 0x0000000002400083
-	.quad 0x0000000002600083
+	i = 0
+	.rept 20
+	.quad i << 21 | 0x083
+	i = i + 1
+	.endr
 	/* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */
 	.globl temp_boot_pmds
temp_boot_pmds:
 	.fill 492,8,0
 
-.org 0x5000
-ENTRY(level2_kernel_pgt)
+NEXT_PAGE(level2_kernel_pgt)
 	/* 40MB kernel mapping. The kernel code cannot be bigger than that.
 	   When you change this change KERNEL_TEXT_SIZE in page.h too. */
 	/* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
-	.quad 0x0000000000000183
-	.quad 0x0000000000200183
-	.quad 0x0000000000400183
-	.quad 0x0000000000600183
-	.quad 0x0000000000800183
-	.quad 0x0000000000A00183
-	.quad 0x0000000000C00183
-	.quad 0x0000000000E00183
-	.quad 0x0000000001000183
-	.quad 0x0000000001200183
-	.quad 0x0000000001400183
-	.quad 0x0000000001600183
-	.quad 0x0000000001800183
-	.quad 0x0000000001A00183
-	.quad 0x0000000001C00183
-	.quad 0x0000000001E00183
-	.quad 0x0000000002000183
-	.quad 0x0000000002200183
-	.quad 0x0000000002400183
-	.quad 0x0000000002600183
+	i = 0
+	.rept 20
+	.quad i << 21 | 0x183
+	i = i + 1
+	.endr
 	/* Module mapping starts here */
 	.fill 492,8,0
 
-.org 0x6000
-ENTRY(empty_zero_page)
-
-.org 0x7000
-ENTRY(empty_bad_page)
+NEXT_PAGE(empty_zero_page)
 
-.org 0x8000
-ENTRY(empty_bad_pte_table)
+NEXT_PAGE(level3_physmem_pgt)
+	.quad phys_level2_kernel_pgt | 0x007	/* so that __va works even before pagetable_init */
+	.fill 511,8,0
 
-.org 0x9000
-ENTRY(empty_bad_pmd_table)
+#undef NEXT_PAGE
 
-.org 0xa000
-ENTRY(level3_physmem_pgt)
-	.quad 0x0000000000005007 + __PHYSICAL_START	/* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
+	.data
 
-	.org 0xb000
 #ifdef CONFIG_ACPI_SLEEP
+	.align PAGE_SIZE
ENTRY(wakeup_level4_pgt)
-	.quad 0x0000000000002007 + __PHYSICAL_START	/* -> level3_ident_pgt */
+	.quad phys_level3_ident_pgt | 0x007
 	.fill 255,8,0
-	.quad 0x000000000000a007 + __PHYSICAL_START
+	.quad phys_level3_physmem_pgt | 0x007
 	.fill 254,8,0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
-	.quad 0x0000000000003007 + __PHYSICAL_START	/* -> level3_kernel_pgt */
+	.quad phys_level3_kernel_pgt | 0x007
 #endif
 
+#ifndef CONFIG_HOTPLUG_CPU
+	__INITDATA
+#endif
+	/*
+	 * This default setting generates an ident mapping at address 0x100000
+	 * and a mapping for the kernel that precisely maps virtual address
+	 * 0xffffffff80000000 to physical address 0x000000. (always using
+	 * 2Mbyte large pages provided by PAE mode)
+	 */
+	.align PAGE_SIZE
+ENTRY(boot_level4_pgt)
+	.quad phys_level3_ident_pgt | 0x007
+	.fill 255,8,0
+	.quad phys_level3_physmem_pgt | 0x007
+	.fill 254,8,0
+	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+	.quad phys_level3_kernel_pgt | 0x007
+
 	.data
 
 	.align 16
@@ -370,14 +345,14 @@ gdt:
  * Also sysret mandates a special GDT layout
  */
 
-.align L1_CACHE_BYTES
+.align PAGE_SIZE
 
 /* The TLS descriptors are currently at a different place compared to i386.
    Hopefully nobody expects them at a fixed place (Wine?) */
 
ENTRY(cpu_gdt_table)
 	.quad 0x0000000000000000	/* NULL descriptor */
-	.quad 0x008f9a000000ffff	/* __KERNEL_COMPAT32_CS */
+	.quad 0x0			/* unused */
 	.quad 0x00af9a000000ffff	/* __KERNEL_CS */
 	.quad 0x00cf92000000ffff	/* __KERNEL_DS */
 	.quad 0x00cffa000000ffff	/* __USER32_CS */
@@ -387,15 +362,15 @@ ENTRY(cpu_gdt_table)
 	.quad 0,0			/* TSS */
 	.quad 0,0			/* LDT */
 	.quad 0,0,0			/* three TLS descriptors */
-	.quad 0x00009a000000ffff	/* __KERNEL16_CS - 16bit PM for S3 wakeup. */
-					/* base must be patched for real base address. */
+	.quad 0				/* unused */
gdt_end:
 	/* asm/segment.h:GDT_ENTRIES must match this */
 	/* This should be a multiple of the cache line size */
-	/* GDTs of other CPUs: */
-	.fill (GDT_SIZE * NR_CPUS) - (gdt_end - cpu_gdt_table)
+	/* GDTs of other CPUs are now dynamically allocated */
+
+	/* zero the remaining page */
+	.fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
 
-	.align L1_CACHE_BYTES
ENTRY(idt_table)
 	.rept 256
 	.quad 0
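The twenty hand-written .quad entries per PMD collapse into .rept loops: slot i maps the 2MB page at i << 21, with flags 0x083 (present, writable, PSE) for the ident map and 0x183 (the same plus global) for the kernel map, giving the same 40MB bootup window either way. The table generation, restated in C as an illustrative cross-check (function and macro names here are stand-ins, with flag values copied from the diff):

#include <stdint.h>

#define PMD_IDENT_FLAGS  0x083ULL   /* present | writable | 2MB page */
#define PMD_KERNEL_FLAGS 0x183ULL   /* present | writable | 2MB page | global */

/* Fill the first n PMD slots with consecutive 2MB mappings, exactly
 * what the .rept/.endr loop above emits at assembly time. */
static void fill_pmd(uint64_t *pmd, int n, uint64_t flags)
{
	for (int i = 0; i < n; i++)
		pmd[i] = ((uint64_t)i << 21) | flags;   /* i * 2MB */
}

Calling fill_pmd(pmd, 20, PMD_IDENT_FLAGS) reproduces the 20-entry, 40MB identity window the boot tables rely on.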
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index cf6ab147a2a5..cea20a66c150 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -19,14 +19,15 @@
 #include <asm/bootsetup.h>
 #include <asm/setup.h>
 #include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
 
 /* Don't add a printk in there. printk relies on the PDA which is not initialized
    yet. */
 static void __init clear_bss(void)
 {
-	extern char __bss_start[], __bss_end[];
 	memset(__bss_start, 0,
-	       (unsigned long) __bss_end - (unsigned long) __bss_start);
+	       (unsigned long) __bss_stop - (unsigned long) __bss_start);
 }
 
 #define NEW_CL_POINTER 0x228 /* Relative to real mode data */
@@ -75,8 +76,6 @@ static void __init setup_boot_cpu_data(void)
 	boot_cpu_data.x86_mask = eax & 0xf;
 }
 
-extern char _end[];
-
 void __init x86_64_start_kernel(char * real_mode_data)
 {
 	char *s;
@@ -86,6 +85,16 @@ void __init x86_64_start_kernel(char * real_mode_data)
 		set_intr_gate(i, early_idt_handler);
 	asm volatile("lidt %0" :: "m" (idt_descr));
 	clear_bss();
+
+	/*
+	 * switch to init_level4_pgt from boot_level4_pgt
+	 */
+	memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t));
+	asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
+
+	for (i = 0; i < NR_CPUS; i++)
+		cpu_pda(i) = &boot_cpu_pda[i];
+
 	pda_init(0);
 	copy_bootdata(real_mode_data);
 #ifdef CONFIG_SMP
@@ -93,7 +102,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
 #endif
 	s = strstr(saved_command_line, "earlyprintk=");
 	if (s != NULL)
-		setup_early_printk(s);
+		setup_early_printk(strchr(s, '=') + 1);
 #ifdef CONFIG_NUMA
 	s = strstr(saved_command_line, "numa=");
 	if (s != NULL)
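The earlyprintk change is easy to miss: strstr() leaves s pointing at the whole "earlyprintk=..." token, so the parser used to see its own option name. Passing strchr(s, '=') + 1 hands it just the value. A standalone demonstration of the pointer arithmetic:

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *saved_command_line =
		"root=/dev/sda1 earlyprintk=serial,ttyS0,115200";
	const char *s = strstr(saved_command_line, "earlyprintk=");

	if (s != NULL)
		printf("%s\n", strchr(s, '=') + 1);
	/* prints: serial,ttyS0,115200 */
	return 0;
}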
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c
index d9b22b633e39..a5d7e16b928e 100644
--- a/arch/x86_64/kernel/i387.c
+++ b/arch/x86_64/kernel/i387.c
@@ -95,7 +95,7 @@ int save_i387(struct _fpstate __user *buf)
 	if (!used_math())
 		return 0;
 	clear_used_math(); /* trigger finit */
-	if (tsk->thread_info->status & TS_USEDFPU) {
+	if (task_thread_info(tsk)->status & TS_USEDFPU) {
 		err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
 		if (err) return err;
 		stts();
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index c6c9791d77c1..5ecd34ab8c2b 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -133,7 +133,7 @@ static void end_8259A_irq (unsigned int irq)
 {
 	if (irq > 256) {
 		char var;
-		printk("return %p stack %p ti %p\n", __builtin_return_address(0), &var, current->thread_info);
+		printk("return %p stack %p ti %p\n", __builtin_return_address(0), &var, task_thread_info(current));
 
 		BUG();
 	}
@@ -492,6 +492,7 @@ void invalidate_interrupt5(void);
 void invalidate_interrupt6(void);
 void invalidate_interrupt7(void);
 void thermal_interrupt(void);
+void threshold_interrupt(void);
 void i8254_timer_resume(void);
 
 static void setup_timer_hardware(void)
@@ -515,7 +516,7 @@ void i8254_timer_resume(void)
 }
 
 static struct sysdev_class timer_sysclass = {
-	set_kset_name("timer"),
+	set_kset_name("timer_pit"),
 	.resume = timer_resume,
 };
 
@@ -548,10 +549,9 @@ void __init init_IRQ(void)
 		int vector = FIRST_EXTERNAL_VECTOR + i;
 		if (i >= NR_IRQS)
 			break;
-		if (vector != IA32_SYSCALL_VECTOR && vector != KDB_VECTOR) {
+		if (vector != IA32_SYSCALL_VECTOR)
 			set_intr_gate(vector, interrupt[i]);
-		}
 	}
 
 #ifdef CONFIG_SMP
@@ -580,6 +580,7 @@ void __init init_IRQ(void)
 	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
 #endif
 	set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+	set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	/* self generated IPI for local APIC timer */
diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c
index e0ba5c1043fd..ce31d904d601 100644
--- a/arch/x86_64/kernel/init_task.c
+++ b/arch/x86_64/kernel/init_task.c
@@ -44,6 +44,6 @@ EXPORT_SYMBOL(init_task);
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp = INIT_TSS;
+DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
 
 #define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index c8eee20cd519..e8cf44ef8778 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -37,6 +37,7 @@
 #include <asm/proto.h>
 #include <asm/mach_apic.h>
 #include <asm/acpi.h>
+#include <asm/dma.h>
 
 #define __apicdebuginit __init
 
@@ -46,6 +47,9 @@ static int no_timer_check;
 
 int disable_timer_pin_1 __initdata;
 
+/* Where if anywhere is the i8259 connect in external int mode */
+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+
 static DEFINE_SPINLOCK(ioapic_lock);
 
 /*
@@ -57,7 +61,7 @@ int nr_ioapic_registers[MAX_IO_APICS];
  * Rough estimation of how many shared IRQs there are, can
  * be changed anytime.
  */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
 #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
 
 /*
@@ -85,6 +89,7 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
 	int pin;						\
 	struct irq_pin_list *entry = irq_2_pin + irq;		\
 								\
+	BUG_ON(irq >= NR_IRQS);					\
 	for (;;) {						\
 		unsigned int reg;				\
 		pin = entry->pin;				\
@@ -127,6 +132,8 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 }
 #endif
 
+static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
+
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
  * shared ISA-space IRQs, so we have to support them. We are super
@@ -137,6 +144,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 	static int first_free_entry = NR_IRQS;
 	struct irq_pin_list *entry = irq_2_pin + irq;
 
+	BUG_ON(irq >= NR_IRQS);
 	while (entry->next)
 		entry = irq_2_pin + entry->next;
 
@@ -144,7 +152,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 		entry->next = first_free_entry;
 		entry = irq_2_pin + entry->next;
 		if (++first_free_entry >= PIN_MAP_SIZE)
-			panic("io_apic.c: whoops");
+			panic("io_apic.c: ran out of irq_2_pin entries!");
 	}
 	entry->apic = apic;
 	entry->pin = pin;
@@ -256,9 +264,6 @@ __setup("apic", enable_ioapic_setup);
 void __init check_ioapic(void)
 {
 	int num,slot,func;
-	if (ioapic_force)
-		return;
-
 	/* Poor man's PCI discovery */
 	for (num = 0; num < 32; num++) {
 		for (slot = 0; slot < 32; slot++) {
@@ -280,7 +285,7 @@ void __init check_ioapic(void)
 				switch (vendor) {
 				case PCI_VENDOR_ID_VIA:
 #ifdef CONFIG_GART_IOMMU
-					if ((end_pfn >= (0xffffffff>>PAGE_SHIFT) ||
+					if ((end_pfn > MAX_DMA32_PFN ||
 					     force_iommu) &&
 					    !iommu_aperture_allowed) {
 						printk(KERN_INFO
@@ -359,7 +364,7 @@ static int find_irq_entry(int apic, int pin, int type)
 /*
  * Find the pin to which IRQ[irq] (ISA) is connected
 */
-static int find_isa_irq_pin(int irq, int type)
+static int __init find_isa_irq_pin(int irq, int type)
 {
 	int i;
 
@@ -377,6 +382,31 @@ static int find_isa_irq_pin(int irq, int type)
 	return -1;
 }
 
+static int __init find_isa_irq_apic(int irq, int type)
+{
+	int i;
+
+	for (i = 0; i < mp_irq_entries; i++) {
+		int lbus = mp_irqs[i].mpc_srcbus;
+
+		if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
+		     mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
+		     mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
+		    (mp_irqs[i].mpc_irqtype == type) &&
+		    (mp_irqs[i].mpc_srcbusirq == irq))
+			break;
+	}
+	if (i < mp_irq_entries) {
+		int apic;
+		for(apic = 0; apic < nr_ioapics; apic++) {
+			if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+				return apic;
+		}
+	}
+
+	return -1;
+}
+
 /*
  * Find a specific PCI IRQ entry.
  * Not an __init, possibly needed by modules
@@ -420,6 +450,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 				best_guess = irq;
 		}
 	}
+	BUG_ON(best_guess >= NR_IRQS);
 	return best_guess;
 }
 
@@ -610,6 +641,64 @@ static inline int irq_trigger(int idx)
 	return MPBIOS_trigger(idx);
 }
 
+static int next_irq = 16;
+
+/*
+ * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ
+ * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
+ * from ACPI, which can reach 800 in large boxen.
+ *
+ * Compact the sparse GSI space into a sequential IRQ series and reuse
+ * vectors if possible.
+ */
+int gsi_irq_sharing(int gsi)
+{
+	int i, tries, vector;
+
+	BUG_ON(gsi >= NR_IRQ_VECTORS);
+
+	if (platform_legacy_irq(gsi))
+		return gsi;
+
+	if (gsi_2_irq[gsi] != 0xFF)
+		return (int)gsi_2_irq[gsi];
+
+	tries = NR_IRQS;
+ try_again:
+	vector = assign_irq_vector(gsi);
+
+	/*
+	 * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
+	 * use of vector and if found, return that IRQ. However, we never want
+	 * to share legacy IRQs, which usually have a different trigger mode
+	 * than PCI.
+	 */
+	for (i = 0; i < NR_IRQS; i++)
+		if (IO_APIC_VECTOR(i) == vector)
+			break;
+	if (platform_legacy_irq(i)) {
+		if (--tries >= 0) {
+			IO_APIC_VECTOR(i) = 0;
+			goto try_again;
+		}
+		panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
+	}
+	if (i < NR_IRQS) {
+		gsi_2_irq[gsi] = i;
+		printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
+				gsi, vector, i);
+		return i;
+	}
+
+	i = next_irq++;
+	BUG_ON(i >= NR_IRQS);
+	gsi_2_irq[gsi] = i;
+	IO_APIC_VECTOR(i) = vector;
+	printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
+			gsi, vector, i);
+	return i;
+}
+
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;
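gsi_irq_sharing() keeps two pieces of state: the gsi_2_irq[] cache (0xFF meaning unassigned) and next_irq, the next free linux IRQ above the legacy range. A stripped-down model of just the compaction-and-cache step (the vector-sharing scan is omitted, and the sizes and names are illustrative, not the kernel's):

#include <stdio.h>

#define NR_GSI 1024                 /* illustrative bound, not the kernel's */

static unsigned char gsi_2_irq_cache[NR_GSI];   /* 0 == unassigned here */
static int next_free_irq = 16;

static int gsi_to_irq(int gsi)
{
	if (gsi < 16)                    /* legacy IRQs map 1:1 */
		return gsi;
	if (gsi_2_irq_cache[gsi])        /* already compacted: reuse */
		return gsi_2_irq_cache[gsi];
	return gsi_2_irq_cache[gsi] = next_free_irq++;
}

int main(void)
{
	printf("%d %d %d\n", gsi_to_irq(9), gsi_to_irq(800), gsi_to_irq(800));
	/* prints: 9 16 16 -- sparse GSI 800 compacts to IRQ 16 and stays there */
	return 0;
}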
@@ -639,6 +728,7 @@ static int pin_2_irq(int idx, int apic, int pin)
 			while (i < apic)
 				irq += nr_ioapic_registers[i++];
 			irq += pin;
+			irq = gsi_irq_sharing(irq);
 			break;
 		}
 		default:
@@ -648,6 +738,7 @@ static int pin_2_irq(int idx, int apic, int pin)
 			break;
 		}
 	}
+	BUG_ON(irq >= NR_IRQS);
 
 	/*
 	 * PCI IRQ command line redirection. Yes, limits are hardcoded.
@@ -663,6 +754,7 @@ static int pin_2_irq(int idx, int apic, int pin)
 			}
 		}
 	}
+	BUG_ON(irq >= NR_IRQS);
 	return irq;
 }
 
@@ -690,8 +782,8 @@ int assign_irq_vector(int irq)
 {
 	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
 
-	BUG_ON(irq >= NR_IRQ_VECTORS);
-	if (IO_APIC_VECTOR(irq) > 0)
+	BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
+	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
 		return IO_APIC_VECTOR(irq);
next:
 	current_vector += 8;
@@ -699,9 +791,8 @@ next:
 		goto next;
 
 	if (current_vector >= FIRST_SYSTEM_VECTOR) {
-		offset++;
-		if (!(offset%8))
-			return -ENOSPC;
+		/* If we run out of vectors on large boxen, must share them. */
+		offset = (offset + 1) % 8;
 		current_vector = FIRST_DEVICE_VECTOR + offset;
 	}
 
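assign_irq_vector() walks the vector space in strides of 8 so that consecutive IRQs land in different interrupt priority classes; the change above makes exhaustion wrap the offset instead of returning -ENOSPC, trading failure for vector sharing on very large machines. The walk in miniature (the vector constants here are illustrative stand-ins):

#define FIRST_DEVICE_VECTOR 0x31    /* illustrative values, not authoritative */
#define FIRST_SYSTEM_VECTOR 0xef
#define IA32_SYSCALL_VECTOR 0x80

static int next_vector(void)
{
	static int current_vector = FIRST_DEVICE_VECTOR, offset;
next:
	current_vector += 8;
	if (current_vector == IA32_SYSCALL_VECTOR)
		goto next;                      /* never hand out int 0x80 */
	if (current_vector >= FIRST_SYSTEM_VECTOR) {
		offset = (offset + 1) % 8;      /* wrap and share, don't fail */
		current_vector = FIRST_DEVICE_VECTOR + offset;
	}
	return current_vector;
}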
@@ -809,7 +900,7 @@ static void __init setup_IO_APIC_irqs(void)
  * Set up the 8259A-master output pin as broadcast to all
  * CPUs.
  */
-static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
+static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
 {
 	struct IO_APIC_route_entry entry;
 	unsigned long flags;
@@ -819,7 +910,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
 	disable_8259A_irq(0);
 
 	/* mask LVT0 */
-	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 
 	/*
 	 * We use logical delivery to get the timer IRQ
@@ -843,8 +934,8 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
 	 * Add it to the IO-APIC irq-routing table:
 	 */
 	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
-	io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+	io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
+	io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	enable_8259A_irq(0);
@@ -1123,7 +1214,8 @@ void __apicdebuginit print_PIC(void)
 static void __init enable_IO_APIC(void)
 {
 	union IO_APIC_reg_01 reg_01;
-	int i;
+	int i8259_apic, i8259_pin;
+	int i, apic;
 	unsigned long flags;
 
 	for (i = 0; i < PIN_MAP_SIZE; i++) {
@@ -1137,11 +1229,48 @@ static void __init enable_IO_APIC(void)
 	/*
 	 * The number of IO-APIC IRQ registers (== #pins):
 	 */
-	for (i = 0; i < nr_ioapics; i++) {
+	for (apic = 0; apic < nr_ioapics; apic++) {
 		spin_lock_irqsave(&ioapic_lock, flags);
-		reg_01.raw = io_apic_read(i, 1);
+		reg_01.raw = io_apic_read(apic, 1);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
-		nr_ioapic_registers[i] = reg_01.bits.entries+1;
+		nr_ioapic_registers[apic] = reg_01.bits.entries+1;
+	}
+	for(apic = 0; apic < nr_ioapics; apic++) {
+		int pin;
+		/* See if any of the pins is in ExtINT mode */
+		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+			struct IO_APIC_route_entry entry;
+			spin_lock_irqsave(&ioapic_lock, flags);
+			*(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
+			*(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
+			spin_unlock_irqrestore(&ioapic_lock, flags);
+
+
+			/* If the interrupt line is enabled and in ExtInt mode
+			 * I have found the pin where the i8259 is connected.
+			 */
+			if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+				ioapic_i8259.apic = apic;
+				ioapic_i8259.pin = pin;
+				goto found_i8259;
+			}
+		}
+	}
+ found_i8259:
+	/* Look to see what if the MP table has reported the ExtINT */
+	i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
+	i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
+	/* Trust the MP table if nothing is setup in the hardware */
+	if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
+		printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+		ioapic_i8259.pin = i8259_pin;
+		ioapic_i8259.apic = i8259_apic;
+	}
+	/* Complain if the MP table and the hardware disagree */
+	if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
+		(i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
+	{
+		printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
 	}
 
 	/*
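The discovery loop above asks one question per pin: is the routing entry unmasked and in ExtINT delivery mode? The first such pin is where the 8259 hangs off the IO-APIC. The predicate in isolation, with a stand-in for the routing-entry bitfields:

struct route_entry_model {          /* stand-in for IO_APIC_route_entry */
	unsigned mask : 1;
	unsigned delivery_mode : 3;
};

#define DEST_EXTINT 7               /* ExtINT delivery mode encoding */

static int looks_like_8259_pin(struct route_entry_model e)
{
	return e.mask == 0 && e.delivery_mode == DEST_EXTINT;
}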
@@ -1155,7 +1284,6 @@ static void __init enable_IO_APIC(void)
  */
 void disable_IO_APIC(void)
 {
-	int pin;
 	/*
 	 * Clear the IO-APIC before rebooting:
 	 */
@@ -1166,8 +1294,7 @@ void disable_IO_APIC(void)
 	 * Put that IOAPIC in virtual wire mode
 	 * so legacy interrupts can be delivered.
 	 */
-	pin = find_isa_irq_pin(0, mp_ExtINT);
-	if (pin != -1) {
+	if (ioapic_i8259.pin != -1) {
 		struct IO_APIC_route_entry entry;
 		unsigned long flags;
 
@@ -1178,21 +1305,23 @@ void disable_IO_APIC(void)
 		entry.polarity        = 0; /* High */
 		entry.delivery_status = 0;
 		entry.dest_mode       = 0; /* Physical */
-		entry.delivery_mode   = 7; /* ExtInt */
+		entry.delivery_mode   = dest_ExtINT; /* ExtInt */
 		entry.vector          = 0;
-		entry.dest.physical.physical_dest = 0;
-
+		entry.dest.physical.physical_dest =
+			GET_APIC_ID(apic_read(APIC_ID));
 
 		/*
 		 * Add it to the IO-APIC irq-routing table:
 		 */
 		spin_lock_irqsave(&ioapic_lock, flags);
-		io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
-		io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+		io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
+			*(((int *)&entry)+1));
+		io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
+			*(((int *)&entry)+0));
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 
-	disconnect_bsp_APIC(pin != -1);
+	disconnect_bsp_APIC(ioapic_i8259.pin != -1);
 }
 
 /*
@@ -1506,7 +1635,7 @@ static void enable_lapic_irq (unsigned int irq)
 	unsigned long v;
 
 	v = apic_read(APIC_LVT0);
-	apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
+	apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
 }
 
 static void disable_lapic_irq (unsigned int irq)
@@ -1514,7 +1643,7 @@ static void disable_lapic_irq (unsigned int irq)
 	unsigned long v;
 
 	v = apic_read(APIC_LVT0);
-	apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+	apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
 }
 
 static void ack_lapic_irq (unsigned int irq)
@@ -1561,20 +1690,21 @@ static void setup_nmi (void)
  */
 static inline void unlock_ExtINT_logic(void)
 {
-	int pin, i;
+	int apic, pin, i;
 	struct IO_APIC_route_entry entry0, entry1;
 	unsigned char save_control, save_freq_select;
 	unsigned long flags;
 
 	pin = find_isa_irq_pin(8, mp_INT);
+	apic = find_isa_irq_apic(8, mp_INT);
 	if (pin == -1)
 		return;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	*(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin);
-	*(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin);
+	*(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
+	*(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
-	clear_IO_APIC_pin(0, pin);
+	clear_IO_APIC_pin(apic, pin);
 
 	memset(&entry1, 0, sizeof(entry1));
 
@@ -1587,8 +1717,8 @@ static inline void unlock_ExtINT_logic(void)
 	entry1.vector = 0;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
-	io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
+	io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
+	io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	save_control = CMOS_READ(RTC_CONTROL);
@@ -1606,11 +1736,11 @@ static inline void unlock_ExtINT_logic(void)
 
 	CMOS_WRITE(save_control, RTC_CONTROL);
 	CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
-	clear_IO_APIC_pin(0, pin);
+	clear_IO_APIC_pin(apic, pin);
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
-	io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
+	io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
+	io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
@@ -1622,7 +1752,7 @@ static inline void unlock_ExtINT_logic(void)
  */
 static inline void check_timer(void)
 {
-	int pin1, pin2;
+	int apic1, pin1, apic2, pin2;
 	int vector;
 
 	/*
@@ -1639,14 +1769,17 @@ static inline void check_timer(void)
 	 * the 8259A which implies the virtual wire has to be
 	 * disabled in the local APIC.
 	 */
-	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 	init_8259A(1);
 	enable_8259A_irq(0);
 
 	pin1 = find_isa_irq_pin(0, mp_INT);
-	pin2 = find_isa_irq_pin(0, mp_ExtINT);
+	apic1 = find_isa_irq_apic(0, mp_INT);
+	pin2 = ioapic_i8259.pin;
+	apic2 = ioapic_i8259.apic;
 
-	apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2);
+	apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
+		vector, apic1, pin1, apic2, pin2);
 
 	if (pin1 != -1) {
 		/*
@@ -1664,17 +1797,20 @@ static inline void check_timer(void)
 			clear_IO_APIC_pin(0, pin1);
 			return;
 		}
-		clear_IO_APIC_pin(0, pin1);
-		apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n");
+		clear_IO_APIC_pin(apic1, pin1);
+		apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
+				"connected to IO-APIC\n");
 	}
 
-	apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
+	apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
+			"through the 8259A ... ");
 	if (pin2 != -1) {
-		apic_printk(APIC_VERBOSE,"\n..... (found pin %d) ...", pin2);
+		apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
+			apic2, pin2);
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		setup_ExtINT_IRQ0_pin(pin2, vector);
+		setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
 		if (timer_irq_works()) {
 			printk("works.\n");
 			nmi_watchdog_default();
@@ -1686,7 +1822,7 @@ static inline void check_timer(void)
 		/*
 		 * Cleanup, just in case ...
 		 */
-		clear_IO_APIC_pin(0, pin2);
+		clear_IO_APIC_pin(apic2, pin2);
 	}
 	printk(" failed.\n");
 
@@ -1699,21 +1835,21 @@ static inline void check_timer(void)
 
 	disable_8259A_irq(0);
 	irq_desc[0].handler = &lapic_irq_type;
-	apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
+	apic_write(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 
 	if (timer_irq_works()) {
 		apic_printk(APIC_QUIET, " works.\n");
 		return;
 	}
-	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
+	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
 	apic_printk(APIC_VERBOSE," failed.\n");
 
 	apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
 
 	init_8259A(0);
 	make_8259A_irq(0);
-	apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+	apic_write(APIC_LVT0, APIC_DM_EXTINT);
 
 	unlock_ExtINT_logic();
 
@@ -1917,6 +2053,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 	entry.polarity = active_high_low;
 	entry.mask = 1;					 /* Disabled (masked) */
 
+	irq = gsi_irq_sharing(irq);
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
index cde0e868efe3..b81614970ecc 100644
--- a/arch/x86_64/kernel/ioport.c
+++ b/arch/x86_64/kernel/ioport.c
@@ -7,6 +7,7 @@
 
 #include <linux/sched.h>
 #include <linux/kernel.h>
+#include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/types.h>
 #include <linux/ioport.h>
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index d6a04a8320a3..30d2a1e545fe 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -17,6 +17,7 @@
 #include <linux/delay.h>
 #include <asm/uaccess.h>
 #include <asm/io_apic.h>
+#include <asm/idle.h>
 
 atomic_t irq_err_count;
 #ifdef CONFIG_X86_IO_APIC
@@ -69,13 +70,13 @@ skip:
 		seq_printf(p, "NMI: ");
 		for (j = 0; j < NR_CPUS; j++)
 			if (cpu_online(j))
-				seq_printf(p, "%10u ", cpu_pda[j].__nmi_count);
+				seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
 		seq_putc(p, '\n');
 #ifdef CONFIG_X86_LOCAL_APIC
 		seq_printf(p, "LOC: ");
 		for (j = 0; j < NR_CPUS; j++)
 			if (cpu_online(j))
-				seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs);
+				seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
 		seq_putc(p, '\n');
 #endif
 		seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
@@ -98,6 +99,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
 	/* high bits used in ret_from_ code  */
 	unsigned irq = regs->orig_rax & 0xff;
 
+	exit_idle();
 	irq_enter();
 
 	__do_IRQ(irq, regs);
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index dddeb678b440..8b866a8572cf 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -42,8 +42,8 @@
 #include <asm/pgtable.h>
 #include <asm/kdebug.h>
 
-static DECLARE_MUTEX(kprobe_mutex);
 void jprobe_return_end(void);
+static void __kprobes arch_copy_kprobe(struct kprobe *p);
 
 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
 DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
@@ -69,12 +69,11 @@ static inline int is_IF_modifier(kprobe_opcode_t *insn)
 int __kprobes arch_prepare_kprobe(struct kprobe *p)
 {
 	/* insn: must be on special executable page on x86_64. */
-	down(&kprobe_mutex);
 	p->ainsn.insn = get_insn_slot();
-	up(&kprobe_mutex);
 	if (!p->ainsn.insn) {
 		return -ENOMEM;
 	}
+	arch_copy_kprobe(p);
 	return 0;
 }
 
@@ -181,7 +180,7 @@ static inline s32 *is_riprel(u8 *insn)
 	return NULL;
 }
 
-void __kprobes arch_copy_kprobe(struct kprobe *p)
+static void __kprobes arch_copy_kprobe(struct kprobe *p)
 {
 	s32 *ripdisp;
 	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
@@ -329,12 +328,21 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 			 */
 			save_previous_kprobe(kcb);
 			set_current_kprobe(p, regs, kcb);
-			p->nmissed++;
+			kprobes_inc_nmissed_count(p);
 			prepare_singlestep(p, regs);
 			kcb->kprobe_status = KPROBE_REENTER;
 			return 1;
 		}
 	} else {
+		if (*addr != BREAKPOINT_INSTRUCTION) {
+			/* The breakpoint instruction was removed by
+			 * another cpu right after we hit, no further
+			 * handling of this interrupt is appropriate
+			 */
+			regs->rip = (unsigned long)addr;
+			ret = 1;
+			goto no_kprobe;
+		}
 		p = __get_cpu_var(current_kprobe);
 		if (p->break_handler && p->break_handler(p, regs)) {
 			goto ss_probe;
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 69541db5ff2c..13a2eada6c95 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -15,6 +15,7 @@
 #include <linux/sysdev.h>
 #include <linux/miscdevice.h>
 #include <linux/fs.h>
+#include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/percpu.h>
 #include <linux/ctype.h>
@@ -23,9 +24,10 @@
 #include <asm/mce.h>
 #include <asm/kdebug.h>
 #include <asm/uaccess.h>
+#include <asm/smp.h>
 
 #define MISC_MCELOG_MINOR 227
-#define NR_BANKS 5
+#define NR_BANKS 6
 
 static int mce_dont_init;
 
@@ -37,7 +39,7 @@ static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
 static unsigned long console_logged;
 static int notify_user;
 static int rip_msr;
-static int mce_bootlog;
+static int mce_bootlog = 1;
 
 /*
  * Lockless MCE logging infrastructure.
@@ -91,6 +93,7 @@ void mce_log(struct mce *mce)
 static void print_mce(struct mce *m)
 {
 	printk(KERN_EMERG "\n"
+	       KERN_EMERG "HARDWARE ERROR\n"
 	       KERN_EMERG
 	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
 	       m->cpu, m->mcgstatus, m->bank, m->status);
@@ -109,6 +112,9 @@ static void print_mce(struct mce *m)
 	if (m->misc)
 		printk("MISC %Lx ", m->misc);
 	printk("\n");
+	printk(KERN_EMERG "This is not a software problem!\n");
+	printk(KERN_EMERG
+	       "Run through mcelog --ascii to decode and contact your hardware vendor\n");
 }
 
 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
@@ -168,12 +174,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	int panicm_found = 0;
 
 	if (regs)
-		notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
+		notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
 	if (!banks)
 		return;
 
 	memset(&m, 0, sizeof(struct mce));
-	m.cpu = hard_smp_processor_id();
+	m.cpu = safe_smp_processor_id();
 	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
 		kill_it = 1;
@@ -347,7 +353,11 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 		/* disable GART TBL walk error reporting, which trips off
 		   incorrectly with the IOMMU & 3ware & Cerberus. */
 		clear_bit(10, &bank[4]);
+		/* Lots of broken BIOS around that don't clear them
+		   by default and leave crap in there. Don't log. */
+		mce_bootlog = 0;
 	}
+
 }
 
 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
@@ -356,6 +366,9 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
 	case X86_VENDOR_INTEL:
 		mce_intel_feature_init(c);
 		break;
+	case X86_VENDOR_AMD:
+		mce_amd_feature_init(c);
+		break;
 	default:
 		break;
 	}
@@ -495,16 +508,16 @@ static int __init mcheck_disable(char *str)
 /* mce=off disables machine check. Note you can reenable it later
    using sysfs.
   mce=TOLERANCELEVEL (number, see above)
-   mce=bootlog Log MCEs from before booting. Disabled by default to work
-   around buggy BIOS that leave bogus MCEs.  */
+   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
+   mce=nobootlog Don't log MCEs from before booting. */
 static int __init mcheck_enable(char *str)
 {
 	if (*str == '=')
 		str++;
 	if (!strcmp(str, "off"))
 		mce_dont_init = 1;
-	else if (!strcmp(str, "bootlog"))
-		mce_bootlog = 1;
+	else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
+		mce_bootlog = str[0] == 'b';
 	else if (isdigit(str[0]))
 		get_option(&str, &tolerant);
 	else
@@ -566,6 +579,10 @@ ACCESSOR(bank1ctl,bank[1],mce_restart())
 ACCESSOR(bank2ctl,bank[2],mce_restart())
 ACCESSOR(bank3ctl,bank[3],mce_restart())
 ACCESSOR(bank4ctl,bank[4],mce_restart())
+ACCESSOR(bank5ctl,bank[5],mce_restart())
+static struct sysdev_attribute * bank_attributes[NR_BANKS] = {
+	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
+	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl};
 ACCESSOR(tolerant,tolerant,)
 ACCESSOR(check_interval,check_interval,mce_restart())
 
@@ -573,6 +590,7 @@ ACCESSOR(check_interval,check_interval,mce_restart())
 static __cpuinit int mce_create_device(unsigned int cpu)
 {
 	int err;
+	int i;
 	if (!mce_available(&cpu_data[cpu]))
 		return -EIO;
 
@@ -582,11 +600,9 @@ static __cpuinit int mce_create_device(unsigned int cpu)
 	err = sysdev_register(&per_cpu(device_mce,cpu));
 
 	if (!err) {
-		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
-		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
-		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
-		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
-		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
+		for (i = 0; i < banks; i++)
+			sysdev_create_file(&per_cpu(device_mce,cpu),
+				bank_attributes[i]);
 		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
 		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
 	}
@@ -596,11 +612,11 @@ static __cpuinit int mce_create_device(unsigned int cpu)
 #ifdef CONFIG_HOTPLUG_CPU
 static __cpuinit void mce_remove_device(unsigned int cpu)
 {
-	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
-	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
-	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
-	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
-	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
+	int i;
+
+	for (i = 0; i < banks; i++)
+		sysdev_remove_file(&per_cpu(device_mce,cpu),
+			bank_attributes[i]);
 	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
 	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
 	sysdev_unregister(&per_cpu(device_mce,cpu));
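The mce= parser change is compact: "bootlog" and "nobootlog" share one strcmp branch, and the first character alone picks the polarity. Checked standalone:

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *opts[] = { "bootlog", "nobootlog" };

	for (int i = 0; i < 2; i++) {
		const char *str = opts[i];
		if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
			printf("mce=%s -> mce_bootlog=%d\n", str, str[0] == 'b');
	}
	return 0;   /* prints 1 for bootlog, 0 for nobootlog */
}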
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
new file mode 100644
index 000000000000..d3ad7d81266d
--- /dev/null
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -0,0 +1,540 @@
1/*
2 * (c) 2005 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 *
7 * Written by Jacob Shin - AMD, Inc.
8 *
9 * Support : jacob.shin@amd.com
10 *
11 * MC4_MISC0 DRAM ECC Error Threshold available under AMD K8 Rev F.
12 * MC4_MISC0 exists per physical processor.
13 *
14 */
15
16#include <linux/cpu.h>
17#include <linux/errno.h>
18#include <linux/init.h>
19#include <linux/interrupt.h>
20#include <linux/kobject.h>
21#include <linux/notifier.h>
22#include <linux/sched.h>
23#include <linux/smp.h>
24#include <linux/sysdev.h>
25#include <linux/sysfs.h>
26#include <asm/apic.h>
27#include <asm/mce.h>
28#include <asm/msr.h>
29#include <asm/percpu.h>
30#include <asm/idle.h>
31
32#define PFX "mce_threshold: "
33#define VERSION "version 1.00.9"
34#define NR_BANKS 5
35#define THRESHOLD_MAX 0xFFF
36#define INT_TYPE_APIC 0x00020000
37#define MASK_VALID_HI 0x80000000
38#define MASK_LVTOFF_HI 0x00F00000
39#define MASK_COUNT_EN_HI 0x00080000
40#define MASK_INT_TYPE_HI 0x00060000
41#define MASK_OVERFLOW_HI 0x00010000
42#define MASK_ERR_COUNT_HI 0x00000FFF
43#define MASK_OVERFLOW 0x0001000000000000L
44
45struct threshold_bank {
46 unsigned int cpu;
47 u8 bank;
48 u8 interrupt_enable;
49 u16 threshold_limit;
50 struct kobject kobj;
51};
52
53static struct threshold_bank threshold_defaults = {
54 .interrupt_enable = 0,
55 .threshold_limit = THRESHOLD_MAX,
56};
57
58#ifdef CONFIG_SMP
59static unsigned char shared_bank[NR_BANKS] = {
60 0, 0, 0, 0, 1
61};
62#endif
63
64static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
65
66/*
67 * CPU Initialization
68 */
69
70/* must be called with correct cpu affinity */
71static void threshold_restart_bank(struct threshold_bank *b,
72 int reset, u16 old_limit)
73{
74 u32 mci_misc_hi, mci_misc_lo;
75
76 rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi);
77
78 if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
79 reset = 1; /* limit cannot be lower than err count */
80
81 if (reset) { /* reset err count and overflow bit */
82 mci_misc_hi =
83 (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
84 (THRESHOLD_MAX - b->threshold_limit);
85 } else if (old_limit) { /* change limit w/o reset */
86 int new_count = (mci_misc_hi & THRESHOLD_MAX) +
87 (old_limit - b->threshold_limit);
88 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
89 (new_count & THRESHOLD_MAX);
90 }
91
92 b->interrupt_enable ?
93 (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
94 (mci_misc_hi &= ~MASK_INT_TYPE_HI);
95
96 mci_misc_hi |= MASK_COUNT_EN_HI;
97 wrmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi);
98}
99
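Worth spelling out the counter convention threshold_restart_bank() relies on: the 12-bit error-count field counts upward and the interrupt fires when it passes THRESHOLD_MAX, so a limit of N is programmed as THRESHOLD_MAX - N. A standalone sketch of that arithmetic (plain C; the helper names are invented for illustration and are not kernel API):

    #include <stdint.h>
    #include <stdio.h>

    #define THRESHOLD_MAX     0xFFF
    #define MASK_ERR_COUNT_HI 0x00000FFF

    /* Program a limit: the hardware counts up and interrupts when the
     * 12-bit field passes THRESHOLD_MAX, so store THRESHOLD_MAX - limit. */
    static uint32_t program_limit(uint32_t hi, unsigned limit)
    {
            return (hi & ~MASK_ERR_COUNT_HI) | (THRESHOLD_MAX - limit);
    }

    /* Recover how many errors were seen, as show_error_count() does below. */
    static unsigned errors_seen(uint32_t hi, unsigned limit)
    {
            return (hi & MASK_ERR_COUNT_HI) - (THRESHOLD_MAX - limit);
    }

    int main(void)
    {
            uint32_t hi = program_limit(0, 10);  /* limit of 10 errors */
            hi += 3;                             /* hardware counted 3 errors */
            printf("errors seen: %u\n", errors_seen(hi, 10));  /* prints 3 */
            return 0;
    }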
100void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
101{
102 int bank;
103 u32 mci_misc_lo, mci_misc_hi;
104 unsigned int cpu = smp_processor_id();
105
106 for (bank = 0; bank < NR_BANKS; ++bank) {
107 rdmsr(MSR_IA32_MC0_MISC + bank * 4, mci_misc_lo, mci_misc_hi);
108
 109 /* skip if not valid, no counter present, or locked by the BIOS */
110 if (!(mci_misc_hi & MASK_VALID_HI) ||
111 !(mci_misc_hi & MASK_VALID_HI >> 1) ||
112 (mci_misc_hi & MASK_VALID_HI >> 2))
113 continue;
114
115 per_cpu(bank_map, cpu) |= (1 << bank);
116
117#ifdef CONFIG_SMP
118 if (shared_bank[bank] && cpu_core_id[cpu])
119 continue;
120#endif
121
122 setup_threshold_lvt((mci_misc_hi & MASK_LVTOFF_HI) >> 20);
123 threshold_defaults.cpu = cpu;
124 threshold_defaults.bank = bank;
125 threshold_restart_bank(&threshold_defaults, 0, 0);
126 }
127}
128
129/*
130 * APIC Interrupt Handler
131 */
132
133/*
 134 * The threshold interrupt handler services THRESHOLD_APIC_VECTOR.
 135 * The interrupt fires when error_count reaches threshold_limit.
 136 * The handler simply logs an mcelog entry with a software-defined bank number.
137 */
138asmlinkage void mce_threshold_interrupt(void)
139{
140 int bank;
141 struct mce m;
142
143 ack_APIC_irq();
144 exit_idle();
145 irq_enter();
146
147 memset(&m, 0, sizeof(m));
148 rdtscll(m.tsc);
149 m.cpu = smp_processor_id();
150
151 /* assume first bank caused it */
152 for (bank = 0; bank < NR_BANKS; ++bank) {
153 m.bank = MCE_THRESHOLD_BASE + bank;
154 rdmsrl(MSR_IA32_MC0_MISC + bank * 4, m.misc);
155
156 if (m.misc & MASK_OVERFLOW) {
157 mce_log(&m);
158 goto out;
159 }
160 }
161 out:
162 irq_exit();
163}
164
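The handler's bank scan boils down to testing bit 48 (MASK_OVERFLOW) of each bank's MCi_MISC value and reporting the first hit. A standalone model of that loop (plain C; the canned MSR values are invented for illustration):

    #include <stdint.h>
    #include <stdio.h>

    #define NR_BANKS      5
    #define MASK_OVERFLOW 0x0001000000000000ULL  /* bit 48 of MCi_MISC */

    int main(void)
    {
            /* stand-ins for the per-bank MCi_MISC MSRs; bank 4 overflowed */
            uint64_t misc[NR_BANKS] = { 0, 0, 0, 0, MASK_OVERFLOW | 0x00A };
            int bank;

            /* like the handler: report the first bank whose overflow bit is set */
            for (bank = 0; bank < NR_BANKS; bank++) {
                    if (misc[bank] & MASK_OVERFLOW) {
                            printf("bank %d overflowed\n", bank);
                            break;
                    }
            }
            return 0;
    }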
165/*
166 * Sysfs Interface
167 */
168
169static struct sysdev_class threshold_sysclass = {
170 set_kset_name("threshold"),
171};
172
173static DEFINE_PER_CPU(struct sys_device, device_threshold);
174
175struct threshold_attr {
176 struct attribute attr;
177 ssize_t(*show) (struct threshold_bank *, char *);
178 ssize_t(*store) (struct threshold_bank *, const char *, size_t count);
179};
180
181static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
182
183static cpumask_t affinity_set(unsigned int cpu)
184{
185 cpumask_t oldmask = current->cpus_allowed;
186 cpumask_t newmask = CPU_MASK_NONE;
187 cpu_set(cpu, newmask);
188 set_cpus_allowed(current, newmask);
189 return oldmask;
190}
191
192static void affinity_restore(cpumask_t oldmask)
193{
194 set_cpus_allowed(current, oldmask);
195}
196
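affinity_set()/affinity_restore() exist because the MSRs are per-CPU: the thread must run on the target CPU while touching them, and must put its old mask back afterwards. The same save/pin/restore shape in userspace (a sketch using sched_setaffinity(), not the kernel's set_cpus_allowed()):

    #define _GNU_SOURCE
    #include <sched.h>

    /* Pin the calling thread to one CPU, returning the previous mask
     * so the caller can restore it afterwards. */
    static cpu_set_t affinity_set(int cpu)
    {
            cpu_set_t old, new;
            sched_getaffinity(0, sizeof(old), &old);
            CPU_ZERO(&new);
            CPU_SET(cpu, &new);
            sched_setaffinity(0, sizeof(new), &new);
            return old;
    }

    static void affinity_restore(cpu_set_t old)
    {
            sched_setaffinity(0, sizeof(old), &old);
    }

    int main(void)
    {
            cpu_set_t old = affinity_set(0);
            /* ... per-CPU work goes here (the kernel code does MSR accesses) ... */
            affinity_restore(old);
            return 0;
    }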
197#define SHOW_FIELDS(name) \
198 static ssize_t show_ ## name(struct threshold_bank * b, char *buf) \
199 { \
200 return sprintf(buf, "%lx\n", (unsigned long) b->name); \
201 }
202SHOW_FIELDS(interrupt_enable)
203SHOW_FIELDS(threshold_limit)
204
205static ssize_t store_interrupt_enable(struct threshold_bank *b,
206 const char *buf, size_t count)
207{
208 char *end;
209 cpumask_t oldmask;
210 unsigned long new = simple_strtoul(buf, &end, 0);
211 if (end == buf)
212 return -EINVAL;
213 b->interrupt_enable = !!new;
214
215 oldmask = affinity_set(b->cpu);
216 threshold_restart_bank(b, 0, 0);
217 affinity_restore(oldmask);
218
219 return end - buf;
220}
221
222static ssize_t store_threshold_limit(struct threshold_bank *b,
223 const char *buf, size_t count)
224{
225 char *end;
226 cpumask_t oldmask;
227 u16 old;
228 unsigned long new = simple_strtoul(buf, &end, 0);
229 if (end == buf)
230 return -EINVAL;
231 if (new > THRESHOLD_MAX)
232 new = THRESHOLD_MAX;
233 if (new < 1)
234 new = 1;
235 old = b->threshold_limit;
236 b->threshold_limit = new;
237
238 oldmask = affinity_set(b->cpu);
239 threshold_restart_bank(b, 0, old);
240 affinity_restore(oldmask);
241
242 return end - buf;
243}
244
245static ssize_t show_error_count(struct threshold_bank *b, char *buf)
246{
247 u32 high, low;
248 cpumask_t oldmask;
249 oldmask = affinity_set(b->cpu);
250 rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, low, high); /* ignore low 32 */
251 affinity_restore(oldmask);
252 return sprintf(buf, "%x\n",
253 (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
254}
255
256static ssize_t store_error_count(struct threshold_bank *b,
257 const char *buf, size_t count)
258{
259 cpumask_t oldmask;
260 oldmask = affinity_set(b->cpu);
261 threshold_restart_bank(b, 1, 0);
262 affinity_restore(oldmask);
263 return 1;
264}
265
266#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \
267 .attr = {.name = __stringify(_name), .mode = _mode }, \
268 .show = _show, \
269 .store = _store, \
270};
271
272#define ATTR_FIELDS(name) \
273 static struct threshold_attr name = \
274 THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
275
276ATTR_FIELDS(interrupt_enable);
277ATTR_FIELDS(threshold_limit);
278ATTR_FIELDS(error_count);
279
280static struct attribute *default_attrs[] = {
281 &interrupt_enable.attr,
282 &threshold_limit.attr,
283 &error_count.attr,
284 NULL
285};
286
287#define to_bank(k) container_of(k,struct threshold_bank,kobj)
288#define to_attr(a) container_of(a,struct threshold_attr,attr)
289
290static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
291{
292 struct threshold_bank *b = to_bank(kobj);
293 struct threshold_attr *a = to_attr(attr);
294 ssize_t ret;
295 ret = a->show ? a->show(b, buf) : -EIO;
296 return ret;
297}
298
299static ssize_t store(struct kobject *kobj, struct attribute *attr,
300 const char *buf, size_t count)
301{
302 struct threshold_bank *b = to_bank(kobj);
303 struct threshold_attr *a = to_attr(attr);
304 ssize_t ret;
305 ret = a->store ? a->store(b, buf, count) : -EIO;
306 return ret;
307}
308
309static struct sysfs_ops threshold_ops = {
310 .show = show,
311 .store = store,
312};
313
314static struct kobj_type threshold_ktype = {
315 .sysfs_ops = &threshold_ops,
316 .default_attrs = default_attrs,
317};
318
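The show()/store() trampolines above rely on container_of() to step from the struct attribute and struct kobject pointers sysfs hands back to the enclosing threshold_attr and threshold_bank. A self-contained demo of that pointer arithmetic (the demo struct is invented; container_of is spelled out rather than taken from the kernel headers):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct attribute { const char *name; };

    struct bank_demo {
            unsigned int cpu;
            struct attribute attr;  /* embedded member, like kobj above */
    };

    int main(void)
    {
            struct bank_demo b = { .cpu = 3, .attr = { "bank4" } };
            struct attribute *a = &b.attr;

            /* recover the enclosing object from a pointer to its member */
            struct bank_demo *owner = container_of(a, struct bank_demo, attr);
            printf("attr %s belongs to cpu %u\n", a->name, owner->cpu);
            return 0;
    }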
 319/* Symlink sibling shared banks to the first core; the first core owns the dir/files. */
320static __cpuinit int threshold_create_bank(unsigned int cpu, int bank)
321{
322 int err = 0;
323 struct threshold_bank *b = NULL;
324
325#ifdef CONFIG_SMP
326 if (cpu_core_id[cpu] && shared_bank[bank]) { /* symlink */
327 char name[16];
328 unsigned lcpu = first_cpu(cpu_core_map[cpu]);
329 if (cpu_core_id[lcpu])
330 goto out; /* first core not up yet */
331
332 b = per_cpu(threshold_banks, lcpu)[bank];
333 if (!b)
334 goto out;
335 sprintf(name, "bank%i", bank);
336 err = sysfs_create_link(&per_cpu(device_threshold, cpu).kobj,
337 &b->kobj, name);
338 if (err)
339 goto out;
340 per_cpu(threshold_banks, cpu)[bank] = b;
341 goto out;
342 }
343#endif
344
345 b = kmalloc(sizeof(struct threshold_bank), GFP_KERNEL);
346 if (!b) {
347 err = -ENOMEM;
348 goto out;
349 }
350 memset(b, 0, sizeof(struct threshold_bank));
351
352 b->cpu = cpu;
353 b->bank = bank;
354 b->interrupt_enable = 0;
355 b->threshold_limit = THRESHOLD_MAX;
356 kobject_set_name(&b->kobj, "bank%i", bank);
357 b->kobj.parent = &per_cpu(device_threshold, cpu).kobj;
358 b->kobj.ktype = &threshold_ktype;
359
360 err = kobject_register(&b->kobj);
361 if (err) {
362 kfree(b);
363 goto out;
364 }
365 per_cpu(threshold_banks, cpu)[bank] = b;
366 out:
367 return err;
368}
369
370/* create dir/files for all valid threshold banks */
371static __cpuinit int threshold_create_device(unsigned int cpu)
372{
373 int bank;
374 int err = 0;
375
376 per_cpu(device_threshold, cpu).id = cpu;
377 per_cpu(device_threshold, cpu).cls = &threshold_sysclass;
378 err = sysdev_register(&per_cpu(device_threshold, cpu));
379 if (err)
380 goto out;
381
382 for (bank = 0; bank < NR_BANKS; ++bank) {
383 if (!(per_cpu(bank_map, cpu) & 1 << bank))
384 continue;
385 err = threshold_create_bank(cpu, bank);
386 if (err)
387 goto out;
388 }
389 out:
390 return err;
391}
392
393#ifdef CONFIG_HOTPLUG_CPU
394/*
 395 * Let's be hotplug friendly.
 396 * On multi-core processors, the first core always takes ownership
 397 * of the shared sysfs dir/files, and the rest of the cores are symlinked to it.
398 */
399
400/* cpu hotplug call removes all symlinks before first core dies */
401static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank)
402{
403 struct threshold_bank *b;
404 char name[16];
405
406 b = per_cpu(threshold_banks, cpu)[bank];
407 if (!b)
408 return;
409 if (shared_bank[bank] && atomic_read(&b->kobj.kref.refcount) > 2) {
410 sprintf(name, "bank%i", bank);
411 sysfs_remove_link(&per_cpu(device_threshold, cpu).kobj, name);
412 per_cpu(threshold_banks, cpu)[bank] = NULL;
413 } else {
414 kobject_unregister(&b->kobj);
415 kfree(per_cpu(threshold_banks, cpu)[bank]);
416 }
417}
418
419static __cpuinit void threshold_remove_device(unsigned int cpu)
420{
421 int bank;
422
423 for (bank = 0; bank < NR_BANKS; ++bank) {
424 if (!(per_cpu(bank_map, cpu) & 1 << bank))
425 continue;
426 threshold_remove_bank(cpu, bank);
427 }
428 sysdev_unregister(&per_cpu(device_threshold, cpu));
429}
430
431/* link all existing siblings when first core comes up */
432static __cpuinit int threshold_create_symlinks(unsigned int cpu)
433{
434 int bank, err = 0;
435 unsigned int lcpu = 0;
436
437 if (cpu_core_id[cpu])
438 return 0;
439 for_each_cpu_mask(lcpu, cpu_core_map[cpu]) {
440 if (lcpu == cpu)
441 continue;
442 for (bank = 0; bank < NR_BANKS; ++bank) {
443 if (!(per_cpu(bank_map, cpu) & 1 << bank))
444 continue;
445 if (!shared_bank[bank])
446 continue;
447 err = threshold_create_bank(lcpu, bank);
448 }
449 }
450 return err;
451}
452
453/* remove all symlinks before first core dies. */
454static __cpuinit void threshold_remove_symlinks(unsigned int cpu)
455{
456 int bank;
457 unsigned int lcpu = 0;
458 if (cpu_core_id[cpu])
459 return;
460 for_each_cpu_mask(lcpu, cpu_core_map[cpu]) {
461 if (lcpu == cpu)
462 continue;
463 for (bank = 0; bank < NR_BANKS; ++bank) {
464 if (!(per_cpu(bank_map, cpu) & 1 << bank))
465 continue;
466 if (!shared_bank[bank])
467 continue;
468 threshold_remove_bank(lcpu, bank);
469 }
470 }
471}
472#else /* !CONFIG_HOTPLUG_CPU */
473static __cpuinit void threshold_create_symlinks(unsigned int cpu)
474{
475}
476static __cpuinit void threshold_remove_symlinks(unsigned int cpu)
477{
478}
479static void threshold_remove_device(unsigned int cpu)
480{
481}
482#endif
483
484/* get notified when a cpu comes on/off */
485static __cpuinit int threshold_cpu_callback(struct notifier_block *nfb,
486 unsigned long action, void *hcpu)
487{
488 /* cpu was unsigned int to begin with */
489 unsigned int cpu = (unsigned long)hcpu;
490
491 if (cpu >= NR_CPUS)
492 goto out;
493
494 switch (action) {
495 case CPU_ONLINE:
496 threshold_create_device(cpu);
497 threshold_create_symlinks(cpu);
498 break;
499 case CPU_DOWN_PREPARE:
500 threshold_remove_symlinks(cpu);
501 break;
502 case CPU_DOWN_FAILED:
503 threshold_create_symlinks(cpu);
504 break;
505 case CPU_DEAD:
506 threshold_remove_device(cpu);
507 break;
508 default:
509 break;
510 }
511 out:
512 return NOTIFY_OK;
513}
514
515static struct notifier_block threshold_cpu_notifier = {
516 .notifier_call = threshold_cpu_callback,
517};
518
519static __init int threshold_init_device(void)
520{
521 int err;
522 int lcpu = 0;
523
524 err = sysdev_class_register(&threshold_sysclass);
525 if (err)
526 goto out;
527
 528 /* handle CPUs already online before the notifier is registered */
529 for_each_online_cpu(lcpu) {
530 err = threshold_create_device(lcpu);
531 if (err)
532 goto out;
533 }
534 register_cpu_notifier(&threshold_cpu_notifier);
535
536 out:
537 return err;
538}
539
540device_initcall(threshold_init_device);
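From userspace the result of all this is one directory per CPU with a bankN subdirectory (or symlink) holding interrupt_enable, threshold_limit and error_count. A minimal reader, assuming the sysdev class above lands under /sys/devices/system/threshold (path inferred from the names above, not verified):

    #include <stdio.h>

    int main(void)
    {
            /* path assumed from the "threshold" sysdev class and the
             * "bank%i" kobject names above; verify before relying on it */
            const char *p =
                "/sys/devices/system/threshold/threshold0/bank4/error_count";
            char buf[32];
            FILE *f = fopen(p, "r");

            if (!f) {
                    perror(p);
                    return 1;
            }
            if (fgets(buf, sizeof(buf), f))
                    printf("error_count: %s", buf);
            fclose(f);
            return 0;
    }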
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
index 0be0a7959814..8f533d2c40cb 100644
--- a/arch/x86_64/kernel/mce_intel.c
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -10,6 +10,7 @@
10#include <asm/msr.h> 10#include <asm/msr.h>
11#include <asm/mce.h> 11#include <asm/mce.h>
12#include <asm/hw_irq.h> 12#include <asm/hw_irq.h>
13#include <asm/idle.h>
13 14
14static DEFINE_PER_CPU(unsigned long, next_check); 15static DEFINE_PER_CPU(unsigned long, next_check);
15 16
@@ -19,6 +20,7 @@ asmlinkage void smp_thermal_interrupt(void)
19 20
20 ack_APIC_irq(); 21 ack_APIC_irq();
21 22
23 exit_idle();
22 irq_enter(); 24 irq_enter();
23 if (time_before(jiffies, __get_cpu_var(next_check))) 25 if (time_before(jiffies, __get_cpu_var(next_check)))
24 goto done; 26 goto done;
@@ -78,7 +80,7 @@ static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
78 80
79 h = THERMAL_APIC_VECTOR; 81 h = THERMAL_APIC_VECTOR;
80 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); 82 h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
81 apic_write_around(APIC_LVTTHMR, h); 83 apic_write(APIC_LVTTHMR, h);
82 84
83 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 85 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
84 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); 86 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
@@ -87,7 +89,7 @@ static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
87 wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); 89 wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h);
88 90
89 l = apic_read(APIC_LVTTHMR); 91 l = apic_read(APIC_LVTTHMR);
90 apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 92 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
91 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", 93 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
92 cpu, tm2 ? "TM2" : "TM1"); 94 cpu, tm2 ? "TM2" : "TM1");
93 return; 95 return;
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index f16d38d09daf..1105250bf02c 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -42,7 +42,7 @@ int acpi_found_madt;
42 * Various Linux-internal data structures created from the 42 * Various Linux-internal data structures created from the
43 * MP-table. 43 * MP-table.
44 */ 44 */
45int apic_version [MAX_APICS]; 45unsigned char apic_version [MAX_APICS];
46unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; 46unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
47int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; 47int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
48 48
@@ -65,7 +65,9 @@ unsigned long mp_lapic_addr = 0;
65/* Processor that is doing the boot up */ 65/* Processor that is doing the boot up */
66unsigned int boot_cpu_id = -1U; 66unsigned int boot_cpu_id = -1U;
67/* Internal processor count */ 67/* Internal processor count */
68static unsigned int num_processors = 0; 68unsigned int num_processors __initdata = 0;
69
70unsigned disabled_cpus __initdata;
69 71
70/* Bitmask of physically existing CPUs */ 72/* Bitmask of physically existing CPUs */
71physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; 73physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
@@ -106,11 +108,14 @@ static int __init mpf_checksum(unsigned char *mp, int len)
106 108
107static void __init MP_processor_info (struct mpc_config_processor *m) 109static void __init MP_processor_info (struct mpc_config_processor *m)
108{ 110{
109 int ver, cpu; 111 int cpu;
112 unsigned char ver;
110 static int found_bsp=0; 113 static int found_bsp=0;
111 114
112 if (!(m->mpc_cpuflag & CPU_ENABLED)) 115 if (!(m->mpc_cpuflag & CPU_ENABLED)) {
116 disabled_cpus++;
113 return; 117 return;
118 }
114 119
115 printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", 120 printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
116 m->mpc_apicid, 121 m->mpc_apicid,
@@ -129,12 +134,14 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
129 } 134 }
130 135
131 cpu = num_processors++; 136 cpu = num_processors++;
132 137
133 if (m->mpc_apicid > MAX_APICS) { 138#if MAX_APICS < 255
139 if ((int)m->mpc_apicid > MAX_APICS) {
134 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", 140 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
135 m->mpc_apicid, MAX_APICS); 141 m->mpc_apicid, MAX_APICS);
136 return; 142 return;
137 } 143 }
144#endif
138 ver = m->mpc_apicver; 145 ver = m->mpc_apicver;
139 146
140 physid_set(m->mpc_apicid, phys_cpu_present_map); 147 physid_set(m->mpc_apicid, phys_cpu_present_map);
@@ -218,7 +225,7 @@ static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
218 m->mpc_irqtype, m->mpc_irqflag & 3, 225 m->mpc_irqtype, m->mpc_irqflag & 3,
219 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, 226 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
220 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); 227 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
221 if (++mp_irq_entries == MAX_IRQ_SOURCES) 228 if (++mp_irq_entries >= MAX_IRQ_SOURCES)
222 panic("Max # of irq sources exceeded!!\n"); 229 panic("Max # of irq sources exceeded!!\n");
223} 230}
224 231
@@ -549,7 +556,7 @@ void __init get_smp_config (void)
549 * Read the physical hardware table. Anything here will 556 * Read the physical hardware table. Anything here will
550 * override the defaults. 557 * override the defaults.
551 */ 558 */
552 if (!smp_read_mpc((void *)(unsigned long)mpf->mpf_physptr)) { 559 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
553 smp_found_config = 0; 560 smp_found_config = 0;
554 printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); 561 printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
555 printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); 562 printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
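The smp_read_mpc() change above is the interesting one: mpf_physptr is a physical address handed over by the BIOS, and the old bare cast only worked while low memory happened to be identity-mapped; phys_to_virt() applies the kernel's direct-map offset instead. A toy model of that translation (the offset shown matches x86_64 kernels of this era but is illustrative only):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_OFFSET_DEMO UINT64_C(0xffff810000000000)

    int main(void)
    {
            uint64_t mpf_physptr = 0x9fc00;  /* typical low-memory MP table spot */
            uint64_t virt = mpf_physptr + PAGE_OFFSET_DEMO;  /* phys_to_virt() */
            printf("phys %#" PRIx64 " -> virt %#" PRIx64 "\n", mpf_physptr, virt);
            return 0;
    }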
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 39d445e16f22..5fae6f0cd994 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -151,23 +151,25 @@ int __init check_nmi_watchdog (void)
151 151
152 printk(KERN_INFO "testing NMI watchdog ... "); 152 printk(KERN_INFO "testing NMI watchdog ... ");
153 153
154#ifdef CONFIG_SMP
154 if (nmi_watchdog == NMI_LOCAL_APIC) 155 if (nmi_watchdog == NMI_LOCAL_APIC)
155 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); 156 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
157#endif
156 158
157 for (cpu = 0; cpu < NR_CPUS; cpu++) 159 for (cpu = 0; cpu < NR_CPUS; cpu++)
158 counts[cpu] = cpu_pda[cpu].__nmi_count; 160 counts[cpu] = cpu_pda(cpu)->__nmi_count;
159 local_irq_enable(); 161 local_irq_enable();
160 mdelay((10*1000)/nmi_hz); // wait 10 ticks 162 mdelay((10*1000)/nmi_hz); // wait 10 ticks
161 163
162 for (cpu = 0; cpu < NR_CPUS; cpu++) { 164 for (cpu = 0; cpu < NR_CPUS; cpu++) {
163 if (!cpu_online(cpu)) 165 if (!cpu_online(cpu))
164 continue; 166 continue;
165 if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { 167 if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
166 endflag = 1; 168 endflag = 1;
167 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", 169 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
168 cpu, 170 cpu,
169 counts[cpu], 171 counts[cpu],
170 cpu_pda[cpu].__nmi_count); 172 cpu_pda(cpu)->__nmi_count);
171 nmi_active = 0; 173 nmi_active = 0;
172 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; 174 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
173 nmi_perfctr_msr = 0; 175 nmi_perfctr_msr = 0;
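The cpu_pda[cpu].field to cpu_pda(cpu)->field conversion here swaps direct array indexing for an accessor macro, so the PDA's storage can change later without touching call sites. The shape of that indirection (a standalone sketch; the struct and macro are simplified stand-ins):

    #include <stdio.h>

    struct pda_demo { unsigned int __nmi_count; };

    static struct pda_demo pda_array[4];

    /* accessor: callers no longer care whether the PDA lives in a flat
     * array or behind a per-CPU pointer */
    #define cpu_pda(i) (&pda_array[i])

    int main(void)
    {
            cpu_pda(0)->__nmi_count = 42;
            printf("cpu0 nmi count: %u\n", cpu_pda(0)->__nmi_count);
            return 0;
    }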
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index cab471cf3edb..2f5d8328e2b9 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -8,53 +8,259 @@
8#include <linux/pci.h> 8#include <linux/pci.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <asm/io.h> 10#include <asm/io.h>
11#include <asm/proto.h>
11 12
12/* Map a set of buffers described by scatterlist in streaming 13int iommu_merge __read_mostly = 0;
13 * mode for DMA. This is the scatter-gather version of the 14EXPORT_SYMBOL(iommu_merge);
14 * above pci_map_single interface. Here the scatter gather list 15
15 * elements are each tagged with the appropriate dma address 16dma_addr_t bad_dma_address __read_mostly;
16 * and length. They are obtained via sg_dma_{address,length}(SG). 17EXPORT_SYMBOL(bad_dma_address);
17 * 18
18 * NOTE: An implementation may be able to use a smaller number of 19/* This tells the BIO block layer to assume merging. Default to off
19 * DMA address/length pairs than there are SG table elements. 20 because we cannot guarantee merging later. */
20 * (for example via virtual mapping capabilities) 21int iommu_bio_merge __read_mostly = 0;
21 * The routine returns the number of addr/length pairs actually 22EXPORT_SYMBOL(iommu_bio_merge);
22 * used, at most nents. 23
23 * 24int iommu_sac_force __read_mostly = 0;
24 * Device ownership issues as mentioned above for pci_map_single are 25EXPORT_SYMBOL(iommu_sac_force);
25 * the same here. 26
27int no_iommu __read_mostly;
28#ifdef CONFIG_IOMMU_DEBUG
29int panic_on_overflow __read_mostly = 1;
30int force_iommu __read_mostly = 1;
31#else
32int panic_on_overflow __read_mostly = 0;
33int force_iommu __read_mostly= 0;
34#endif
35
 36/* Dummy device used for NULL arguments (normally ISA). A smaller
 37 DMA mask would probably be better, but this is kept bug-for-bug
 38 compatible with i386. */
39struct device fallback_dev = {
40 .bus_id = "fallback device",
41 .coherent_dma_mask = 0xffffffff,
42 .dma_mask = &fallback_dev.coherent_dma_mask,
43};
44
45/* Allocate DMA memory on node near device */
46noinline static void *
47dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
48{
49 struct page *page;
50 int node;
51 if (dev->bus == &pci_bus_type)
52 node = pcibus_to_node(to_pci_dev(dev)->bus);
53 else
54 node = numa_node_id();
55 page = alloc_pages_node(node, gfp, order);
56 return page ? page_address(page) : NULL;
57}
58
59/*
60 * Allocate memory for a coherent mapping.
26 */ 61 */
27int dma_map_sg(struct device *hwdev, struct scatterlist *sg, 62void *
28 int nents, int direction) 63dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
64 gfp_t gfp)
29{ 65{
30 int i; 66 void *memory;
31 67 unsigned long dma_mask = 0;
32 BUG_ON(direction == DMA_NONE); 68 u64 bus;
33 for (i = 0; i < nents; i++ ) { 69
34 struct scatterlist *s = &sg[i]; 70 if (!dev)
35 BUG_ON(!s->page); 71 dev = &fallback_dev;
36 s->dma_address = virt_to_bus(page_address(s->page) +s->offset); 72 dma_mask = dev->coherent_dma_mask;
37 s->dma_length = s->length; 73 if (dma_mask == 0)
74 dma_mask = 0xffffffff;
75
76 /* Kludge to make it bug-to-bug compatible with i386. i386
77 uses the normal dma_mask for alloc_coherent. */
78 dma_mask &= *dev->dma_mask;
79
80 /* Why <=? Even when the mask is smaller than 4GB it is often
 81 larger than 16MB, and in that case we have a chance of
 82 finding memory that fits in the next higher zone first. If
 83 not, retry with true GFP_DMA. -AK */
84 if (dma_mask <= 0xffffffff)
85 gfp |= GFP_DMA32;
86
87 again:
88 memory = dma_alloc_pages(dev, gfp, get_order(size));
89 if (memory == NULL)
90 return NULL;
91
92 {
93 int high, mmu;
94 bus = virt_to_bus(memory);
95 high = (bus + size) >= dma_mask;
96 mmu = high;
97 if (force_iommu && !(gfp & GFP_DMA))
98 mmu = 1;
99 else if (high) {
100 free_pages((unsigned long)memory,
101 get_order(size));
102
103 /* Don't use the 16MB ZONE_DMA unless absolutely
104 needed. It's better to use remapping first. */
105 if (dma_mask < 0xffffffff && !(gfp & GFP_DMA)) {
106 gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
107 goto again;
108 }
109
110 if (dma_ops->alloc_coherent)
111 return dma_ops->alloc_coherent(dev, size,
112 dma_handle, gfp);
113 return NULL;
114 }
115
116 memset(memory, 0, size);
117 if (!mmu) {
118 *dma_handle = virt_to_bus(memory);
119 return memory;
120 }
121 }
122
123 if (dma_ops->alloc_coherent) {
124 free_pages((unsigned long)memory, get_order(size));
125 gfp &= ~(GFP_DMA|GFP_DMA32);
126 return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
127 }
128
129 if (dma_ops->map_simple) {
130 *dma_handle = dma_ops->map_simple(dev, memory,
131 size,
132 PCI_DMA_BIDIRECTIONAL);
133 if (*dma_handle != bad_dma_address)
134 return memory;
38 } 135 }
39 return nents;
40}
41 136
42EXPORT_SYMBOL(dma_map_sg); 137 if (panic_on_overflow)
138 panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size);
139 free_pages((unsigned long)memory, get_order(size));
140 return NULL;
141}
142EXPORT_SYMBOL(dma_alloc_coherent);
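The allocation strategy above is a ladder: try GFP_DMA32 first, and if the page still lands beyond the device mask, free it and retry from the 16MB GFP_DMA zone before handing off to dma_ops->alloc_coherent(). The control flow in miniature (standalone C with canned addresses; not the kernel allocator):

    #include <stdint.h>
    #include <stdio.h>

    static int fits(uint64_t bus, uint64_t size, uint64_t mask)
    {
            return bus + size < mask;  /* mirrors the "high" test above */
    }

    int main(void)
    {
            uint64_t mask = 0xffffffffULL;   /* 32-bit coherent mask */
            uint64_t size = 4096;
            uint64_t bus  = 0x123456000ULL;  /* pretend GFP_DMA32 gave us >4GB */

            if (!fits(bus, size, mask))
                    bus = 0x00f00000ULL;     /* free, retry with true GFP_DMA */

            printf("usable: %s\n", fits(bus, size, mask) ? "yes" : "no");
            return 0;
    }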
43 143
44/* Unmap a set of streaming mode DMA translations. 144/*
45 * Again, cpu read rules concerning calls here are the same as for 145 * Unmap coherent memory.
46 * pci_unmap_single() above. 146 * The caller must ensure that the device has finished accessing the mapping.
47 */ 147 */
48void dma_unmap_sg(struct device *dev, struct scatterlist *sg, 148void dma_free_coherent(struct device *dev, size_t size,
49 int nents, int dir) 149 void *vaddr, dma_addr_t bus)
150{
151 if (dma_ops->unmap_single)
152 dma_ops->unmap_single(dev, bus, size, 0);
153 free_pages((unsigned long)vaddr, get_order(size));
154}
155EXPORT_SYMBOL(dma_free_coherent);
156
157int dma_supported(struct device *dev, u64 mask)
158{
159 if (dma_ops->dma_supported)
160 return dma_ops->dma_supported(dev, mask);
161
162 /* Copied from i386. Doesn't make much sense, because it will
163 only work for pci_alloc_coherent.
164 The caller just has to use GFP_DMA in this case. */
165 if (mask < 0x00ffffff)
166 return 0;
167
168 /* Tell the device to use SAC when IOMMU force is on. This
169 allows the driver to use cheaper accesses in some cases.
170
171 Problem with this is that if we overflow the IOMMU area and
172 return DAC as fallback address the device may not handle it
173 correctly.
174
175 As a special case some controllers have a 39bit address
176 mode that is as efficient as 32bit (aic79xx). Don't force
177 SAC for these. Assume all masks <= 40 bits are of this
178 type. Normally this doesn't make any difference, but gives
179 more gentle handling of IOMMU overflow. */
180 if (iommu_sac_force && (mask >= 0xffffffffffULL)) {
181 printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
182 return 0;
183 }
184
185 return 1;
186}
187EXPORT_SYMBOL(dma_supported);
188
189int dma_set_mask(struct device *dev, u64 mask)
50{ 190{
51 int i; 191 if (!dev->dma_mask || !dma_supported(dev, mask))
52 for (i = 0; i < nents; i++) { 192 return -EIO;
53 struct scatterlist *s = &sg[i]; 193 *dev->dma_mask = mask;
54 BUG_ON(s->page == NULL); 194 return 0;
55 BUG_ON(s->dma_address == 0);
56 dma_unmap_single(dev, s->dma_address, s->dma_length, dir);
57 }
58} 195}
196EXPORT_SYMBOL(dma_set_mask);
197
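dma_set_mask() is the driver-facing half of dma_supported(): a driver offers masks from widest to narrowest and keeps the first one accepted. The usual probing idiom, with a stub standing in for the kernel call:

    #include <stdint.h>
    #include <stdio.h>

    /* stub standing in for dma_set_mask(); accepts masks up to 32 bits */
    static int dma_set_mask_stub(uint64_t mask)
    {
            return mask <= 0xffffffffULL ? 0 : -5 /* -EIO */;
    }

    int main(void)
    {
            if (dma_set_mask_stub(~0ULL) == 0)
                    printf("using 64-bit DMA\n");
            else if (dma_set_mask_stub(0xffffffffULL) == 0)
                    printf("falling back to 32-bit DMA\n");
            else
                    printf("no usable DMA mask\n");
            return 0;
    }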
198/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
199 [,forcesac][,fullflush][,nomerge][,biomerge]
200 size set size of iommu (in bytes)
201 noagp don't initialize the AGP driver and use full aperture.
202 off don't use the IOMMU
203 leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
 204 memaper[=order] allocate its own aperture over RAM with size 32MB<<order.
205 noforce don't force IOMMU usage. Default.
206 force Force IOMMU.
207 merge Do lazy merging. This may improve performance on some block devices.
208 Implies force (experimental)
209 biomerge Do merging at the BIO layer. This is more efficient than merge,
 210 but should only be done with very big IOMMUs. Implies merge,force.
211 nomerge Don't do SG merging.
 212 forcesac Force SAC mode for masks <40bits (experimental)
213 fullflush Flush IOMMU on each allocation (default)
214 nofullflush Don't use IOMMU fullflush
 215 allowed override iommu-off workarounds for specific chipsets.
216 soft Use software bounce buffering (default for Intel machines)
217 noaperture Don't touch the aperture for AGP.
218*/
219__init int iommu_setup(char *p)
220{
221 iommu_merge = 1;
59 222
60EXPORT_SYMBOL(dma_unmap_sg); 223 while (*p) {
224 if (!strncmp(p,"off",3))
225 no_iommu = 1;
226 /* gart_parse_options has more force support */
227 if (!strncmp(p,"force",5))
228 force_iommu = 1;
229 if (!strncmp(p,"noforce",7)) {
230 iommu_merge = 0;
231 force_iommu = 0;
232 }
233
234 if (!strncmp(p, "biomerge",8)) {
235 iommu_bio_merge = 4096;
236 iommu_merge = 1;
237 force_iommu = 1;
238 }
239 if (!strncmp(p, "panic",5))
240 panic_on_overflow = 1;
241 if (!strncmp(p, "nopanic",7))
242 panic_on_overflow = 0;
243 if (!strncmp(p, "merge",5)) {
244 iommu_merge = 1;
245 force_iommu = 1;
246 }
247 if (!strncmp(p, "nomerge",7))
248 iommu_merge = 0;
249 if (!strncmp(p, "forcesac",8))
250 iommu_sac_force = 1;
251
252#ifdef CONFIG_SWIOTLB
253 if (!strncmp(p, "soft",4))
254 swiotlb = 1;
255#endif
256
257#ifdef CONFIG_GART_IOMMU
258 gart_parse_options(p);
259#endif
260
261 p += strcspn(p, ",");
262 if (*p == ',')
263 ++p;
264 }
265 return 1;
266}
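The parsing loop above follows a common boot-option pattern: match each token with strncmp(), then hop to the next comma with strcspn(). The same skeleton as a runnable program (option handling shortened; this is a model, not the kernel parser):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char opts[] = "noforce,memaper=2,nomerge";
            char *p = opts;

            while (*p) {
                    if (!strncmp(p, "noforce", 7))
                            printf("force off\n");
                    if (!strncmp(p, "memaper", 7))
                            printf("own aperture requested\n");
                    if (!strncmp(p, "nomerge", 7))
                            printf("sg merging off\n");
                    p += strcspn(p, ",");  /* skip to the next option */
                    if (*p == ',')
                            ++p;
            }
            return 0;
    }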
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 88be97c96987..c37fc7726ba6 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -30,8 +30,8 @@
30#include <asm/proto.h> 30#include <asm/proto.h>
31#include <asm/cacheflush.h> 31#include <asm/cacheflush.h>
32#include <asm/kdebug.h> 32#include <asm/kdebug.h>
33 33#include <asm/swiotlb.h>
34dma_addr_t bad_dma_address; 34#include <asm/dma.h>
35 35
36unsigned long iommu_bus_base; /* GART remapping area (physical) */ 36unsigned long iommu_bus_base; /* GART remapping area (physical) */
37static unsigned long iommu_size; /* size of remapping area bytes */ 37static unsigned long iommu_size; /* size of remapping area bytes */
@@ -39,18 +39,6 @@ static unsigned long iommu_pages; /* .. and in pages */
39 39
40u32 *iommu_gatt_base; /* Remapping table */ 40u32 *iommu_gatt_base; /* Remapping table */
41 41
42int no_iommu;
43static int no_agp;
44#ifdef CONFIG_IOMMU_DEBUG
45int panic_on_overflow = 1;
46int force_iommu = 1;
47#else
48int panic_on_overflow = 0;
49int force_iommu = 0;
50#endif
51int iommu_merge = 1;
52int iommu_sac_force = 0;
53
54/* If this is disabled the IOMMU will use an optimized flushing strategy 42/* If this is disabled the IOMMU will use an optimized flushing strategy
 55 of only flushing when a mapping is reused. With it true the GART is flushed 43 of only flushing when a mapping is reused. With it true the GART is flushed
56 for every mapping. Problem is that doing the lazy flush seems to trigger 44 for every mapping. Problem is that doing the lazy flush seems to trigger
@@ -58,10 +46,6 @@ int iommu_sac_force = 0;
58 also seen with Qlogic at least). */ 46 also seen with Qlogic at least). */
59int iommu_fullflush = 1; 47int iommu_fullflush = 1;
60 48
61/* This tells the BIO block layer to assume merging. Default to off
62 because we cannot guarantee merging later. */
63int iommu_bio_merge = 0;
64
65#define MAX_NB 8 49#define MAX_NB 8
66 50
67/* Allocation bitmap for the remapping area */ 51/* Allocation bitmap for the remapping area */
@@ -102,16 +86,6 @@ AGPEXTERN __u32 *agp_gatt_table;
102 86
103static unsigned long next_bit; /* protected by iommu_bitmap_lock */ 87static unsigned long next_bit; /* protected by iommu_bitmap_lock */
104static int need_flush; /* global flush state. set for each gart wrap */ 88static int need_flush; /* global flush state. set for each gart wrap */
105static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem,
106 size_t size, int dir, int do_panic);
107
108/* Dummy device used for NULL arguments (normally ISA). Better would
109 be probably a smaller DMA mask, but this is bug-to-bug compatible to i386. */
110static struct device fallback_dev = {
111 .bus_id = "fallback device",
112 .coherent_dma_mask = 0xffffffff,
113 .dma_mask = &fallback_dev.coherent_dma_mask,
114};
115 89
116static unsigned long alloc_iommu(int size) 90static unsigned long alloc_iommu(int size)
117{ 91{
@@ -185,108 +159,7 @@ static void flush_gart(struct device *dev)
185 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 159 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
186} 160}
187 161
188/* Allocate DMA memory on node near device */
189noinline
190static void *dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
191{
192 struct page *page;
193 int node;
194 if (dev->bus == &pci_bus_type)
195 node = pcibus_to_node(to_pci_dev(dev)->bus);
196 else
197 node = numa_node_id();
198 page = alloc_pages_node(node, gfp, order);
199 return page ? page_address(page) : NULL;
200}
201
202/*
203 * Allocate memory for a coherent mapping.
204 */
205void *
206dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
207 gfp_t gfp)
208{
209 void *memory;
210 unsigned long dma_mask = 0;
211 u64 bus;
212 162
213 if (!dev)
214 dev = &fallback_dev;
215 dma_mask = dev->coherent_dma_mask;
216 if (dma_mask == 0)
217 dma_mask = 0xffffffff;
218
219 /* Kludge to make it bug-to-bug compatible with i386. i386
220 uses the normal dma_mask for alloc_coherent. */
221 dma_mask &= *dev->dma_mask;
222
223 again:
224 memory = dma_alloc_pages(dev, gfp, get_order(size));
225 if (memory == NULL)
226 return NULL;
227
228 {
229 int high, mmu;
230 bus = virt_to_bus(memory);
231 high = (bus + size) >= dma_mask;
232 mmu = high;
233 if (force_iommu && !(gfp & GFP_DMA))
234 mmu = 1;
235 if (no_iommu || dma_mask < 0xffffffffUL) {
236 if (high) {
237 free_pages((unsigned long)memory,
238 get_order(size));
239
240 if (swiotlb) {
241 return
242 swiotlb_alloc_coherent(dev, size,
243 dma_handle,
244 gfp);
245 }
246
247 if (!(gfp & GFP_DMA)) {
248 gfp |= GFP_DMA;
249 goto again;
250 }
251 return NULL;
252 }
253 mmu = 0;
254 }
255 memset(memory, 0, size);
256 if (!mmu) {
257 *dma_handle = virt_to_bus(memory);
258 return memory;
259 }
260 }
261
262 *dma_handle = dma_map_area(dev, bus, size, PCI_DMA_BIDIRECTIONAL, 0);
263 if (*dma_handle == bad_dma_address)
264 goto error;
265 flush_gart(dev);
266 return memory;
267
268error:
269 if (panic_on_overflow)
270 panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", size);
271 free_pages((unsigned long)memory, get_order(size));
272 return NULL;
273}
274
275/*
276 * Unmap coherent memory.
277 * The caller must ensure that the device has finished accessing the mapping.
278 */
279void dma_free_coherent(struct device *dev, size_t size,
280 void *vaddr, dma_addr_t bus)
281{
282 if (swiotlb) {
283 swiotlb_free_coherent(dev, size, vaddr, bus);
284 return;
285 }
286
287 dma_unmap_single(dev, bus, size, 0);
288 free_pages((unsigned long)vaddr, get_order(size));
289}
290 163
291#ifdef CONFIG_IOMMU_LEAK 164#ifdef CONFIG_IOMMU_LEAK
292 165
@@ -320,7 +193,7 @@ void dump_leak(void)
320#define CLEAR_LEAK(x) 193#define CLEAR_LEAK(x)
321#endif 194#endif
322 195
323static void iommu_full(struct device *dev, size_t size, int dir, int do_panic) 196static void iommu_full(struct device *dev, size_t size, int dir)
324{ 197{
325 /* 198 /*
326 * Ran out of IOMMU space for this operation. This is very bad. 199 * Ran out of IOMMU space for this operation. This is very bad.
@@ -336,11 +209,11 @@ static void iommu_full(struct device *dev, size_t size, int dir, int do_panic)
336 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", 209 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
337 size, dev->bus_id); 210 size, dev->bus_id);
338 211
339 if (size > PAGE_SIZE*EMERGENCY_PAGES && do_panic) { 212 if (size > PAGE_SIZE*EMERGENCY_PAGES) {
340 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) 213 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
341 panic("PCI-DMA: Memory would be corrupted\n"); 214 panic("PCI-DMA: Memory would be corrupted\n");
342 if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) 215 if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
 343 panic("PCI-DMA: Random memory would be DMAed\n"); 216 panic("PCI-DMA: Random memory would be DMAed\n");
344 } 217 }
345 218
346#ifdef CONFIG_IOMMU_LEAK 219#ifdef CONFIG_IOMMU_LEAK
@@ -379,8 +252,8 @@ static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t
379/* Map a single continuous physical area into the IOMMU. 252/* Map a single continuous physical area into the IOMMU.
380 * Caller needs to check if the iommu is needed and flush. 253 * Caller needs to check if the iommu is needed and flush.
381 */ 254 */
382static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem, 255static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
383 size_t size, int dir, int do_panic) 256 size_t size, int dir)
384{ 257{
385 unsigned long npages = to_pages(phys_mem, size); 258 unsigned long npages = to_pages(phys_mem, size);
386 unsigned long iommu_page = alloc_iommu(npages); 259 unsigned long iommu_page = alloc_iommu(npages);
@@ -390,7 +263,7 @@ static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem,
390 return phys_mem; 263 return phys_mem;
391 if (panic_on_overflow) 264 if (panic_on_overflow)
392 panic("dma_map_area overflow %lu bytes\n", size); 265 panic("dma_map_area overflow %lu bytes\n", size);
393 iommu_full(dev, size, dir, do_panic); 266 iommu_full(dev, size, dir);
394 return bad_dma_address; 267 return bad_dma_address;
395 } 268 }
396 269
@@ -402,15 +275,21 @@ static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem,
402 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 275 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
403} 276}
404 277
278static dma_addr_t gart_map_simple(struct device *dev, char *buf,
279 size_t size, int dir)
280{
281 dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir);
282 flush_gart(dev);
283 return map;
284}
285
405/* Map a single area into the IOMMU */ 286/* Map a single area into the IOMMU */
406dma_addr_t dma_map_single(struct device *dev, void *addr, size_t size, int dir) 287dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
407{ 288{
408 unsigned long phys_mem, bus; 289 unsigned long phys_mem, bus;
409 290
410 BUG_ON(dir == DMA_NONE); 291 BUG_ON(dir == DMA_NONE);
411 292
412 if (swiotlb)
413 return swiotlb_map_single(dev,addr,size,dir);
414 if (!dev) 293 if (!dev)
415 dev = &fallback_dev; 294 dev = &fallback_dev;
416 295
@@ -418,10 +297,24 @@ dma_addr_t dma_map_single(struct device *dev, void *addr, size_t size, int dir)
418 if (!need_iommu(dev, phys_mem, size)) 297 if (!need_iommu(dev, phys_mem, size))
419 return phys_mem; 298 return phys_mem;
420 299
421 bus = dma_map_area(dev, phys_mem, size, dir, 1); 300 bus = gart_map_simple(dev, addr, size, dir);
422 flush_gart(dev);
423 return bus; 301 return bus;
424} 302}
303
304/*
305 * Wrapper for pci_unmap_single working with scatterlists.
306 */
307void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
308{
309 int i;
310
311 for (i = 0; i < nents; i++) {
312 struct scatterlist *s = &sg[i];
313 if (!s->dma_length || !s->length)
314 break;
315 dma_unmap_single(dev, s->dma_address, s->dma_length, dir);
316 }
317}
425 318
426/* Fallback for dma_map_sg in case of overflow */ 319/* Fallback for dma_map_sg in case of overflow */
427static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, 320static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
@@ -437,10 +330,10 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
437 struct scatterlist *s = &sg[i]; 330 struct scatterlist *s = &sg[i];
438 unsigned long addr = page_to_phys(s->page) + s->offset; 331 unsigned long addr = page_to_phys(s->page) + s->offset;
439 if (nonforced_iommu(dev, addr, s->length)) { 332 if (nonforced_iommu(dev, addr, s->length)) {
440 addr = dma_map_area(dev, addr, s->length, dir, 0); 333 addr = dma_map_area(dev, addr, s->length, dir);
441 if (addr == bad_dma_address) { 334 if (addr == bad_dma_address) {
442 if (i > 0) 335 if (i > 0)
443 dma_unmap_sg(dev, sg, i, dir); 336 gart_unmap_sg(dev, sg, i, dir);
444 nents = 0; 337 nents = 0;
445 sg[0].dma_length = 0; 338 sg[0].dma_length = 0;
446 break; 339 break;
@@ -509,7 +402,7 @@ static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat,
509 * DMA map all entries in a scatterlist. 402 * DMA map all entries in a scatterlist.
510 * Merge chunks that have page aligned sizes into a continuous mapping. 403 * Merge chunks that have page aligned sizes into a continuous mapping.
511 */ 404 */
512int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) 405int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
513{ 406{
514 int i; 407 int i;
515 int out; 408 int out;
@@ -521,8 +414,6 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
521 if (nents == 0) 414 if (nents == 0)
522 return 0; 415 return 0;
523 416
524 if (swiotlb)
525 return swiotlb_map_sg(dev,sg,nents,dir);
526 if (!dev) 417 if (!dev)
527 dev = &fallback_dev; 418 dev = &fallback_dev;
528 419
@@ -565,13 +456,13 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
565 456
566error: 457error:
567 flush_gart(NULL); 458 flush_gart(NULL);
568 dma_unmap_sg(dev, sg, nents, dir); 459 gart_unmap_sg(dev, sg, nents, dir);
 569 /* When it was forced, try again unforced */ 460 /* When it was forced, try again unforced */
570 if (force_iommu) 461 if (force_iommu)
571 return dma_map_sg_nonforce(dev, sg, nents, dir); 462 return dma_map_sg_nonforce(dev, sg, nents, dir);
572 if (panic_on_overflow) 463 if (panic_on_overflow)
573 panic("dma_map_sg: overflow on %lu pages\n", pages); 464 panic("dma_map_sg: overflow on %lu pages\n", pages);
574 iommu_full(dev, pages << PAGE_SHIFT, dir, 0); 465 iommu_full(dev, pages << PAGE_SHIFT, dir);
575 for (i = 0; i < nents; i++) 466 for (i = 0; i < nents; i++)
576 sg[i].dma_address = bad_dma_address; 467 sg[i].dma_address = bad_dma_address;
577 return 0; 468 return 0;
@@ -580,18 +471,13 @@ error:
580/* 471/*
581 * Free a DMA mapping. 472 * Free a DMA mapping.
582 */ 473 */
583void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, 474void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
584 size_t size, int direction) 475 size_t size, int direction)
585{ 476{
586 unsigned long iommu_page; 477 unsigned long iommu_page;
587 int npages; 478 int npages;
588 int i; 479 int i;
589 480
590 if (swiotlb) {
591 swiotlb_unmap_single(dev,dma_addr,size,direction);
592 return;
593 }
594
595 if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || 481 if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
596 dma_addr >= iommu_bus_base + iommu_size) 482 dma_addr >= iommu_bus_base + iommu_size)
597 return; 483 return;
@@ -604,68 +490,7 @@ void dma_unmap_single(struct device *dev, dma_addr_t dma_addr,
604 free_iommu(iommu_page, npages); 490 free_iommu(iommu_page, npages);
605} 491}
606 492
607/* 493static int no_agp;
608 * Wrapper for pci_unmap_single working with scatterlists.
609 */
610void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
611{
612 int i;
613 if (swiotlb) {
614 swiotlb_unmap_sg(dev,sg,nents,dir);
615 return;
616 }
617 for (i = 0; i < nents; i++) {
618 struct scatterlist *s = &sg[i];
619 if (!s->dma_length || !s->length)
620 break;
621 dma_unmap_single(dev, s->dma_address, s->dma_length, dir);
622 }
623}
624
625int dma_supported(struct device *dev, u64 mask)
626{
627 /* Copied from i386. Doesn't make much sense, because it will
628 only work for pci_alloc_coherent.
629 The caller just has to use GFP_DMA in this case. */
630 if (mask < 0x00ffffff)
631 return 0;
632
633 /* Tell the device to use SAC when IOMMU force is on.
634 This allows the driver to use cheaper accesses in some cases.
635
636 Problem with this is that if we overflow the IOMMU area
637 and return DAC as fallback address the device may not handle it correctly.
638
639 As a special case some controllers have a 39bit address mode
640 that is as efficient as 32bit (aic79xx). Don't force SAC for these.
641 Assume all masks <= 40 bits are of this type. Normally this doesn't
642 make any difference, but gives more gentle handling of IOMMU overflow. */
643 if (iommu_sac_force && (mask >= 0xffffffffffULL)) {
644 printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
645 return 0;
646 }
647
648 return 1;
649}
650
651int dma_get_cache_alignment(void)
652{
653 return boot_cpu_data.x86_clflush_size;
654}
655
656EXPORT_SYMBOL(dma_unmap_sg);
657EXPORT_SYMBOL(dma_map_sg);
658EXPORT_SYMBOL(dma_map_single);
659EXPORT_SYMBOL(dma_unmap_single);
660EXPORT_SYMBOL(dma_supported);
661EXPORT_SYMBOL(no_iommu);
662EXPORT_SYMBOL(force_iommu);
663EXPORT_SYMBOL(bad_dma_address);
664EXPORT_SYMBOL(iommu_bio_merge);
665EXPORT_SYMBOL(iommu_sac_force);
666EXPORT_SYMBOL(dma_get_cache_alignment);
667EXPORT_SYMBOL(dma_alloc_coherent);
668EXPORT_SYMBOL(dma_free_coherent);
669 494
670static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 495static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
671{ 496{
@@ -766,12 +591,27 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
766 nommu: 591 nommu:
767 /* Should not happen anymore */ 592 /* Should not happen anymore */
768 printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 593 printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
769 KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction."); 594 KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n");
770 return -1; 595 return -1;
771} 596}
772 597
773extern int agp_amd64_init(void); 598extern int agp_amd64_init(void);
774 599
600static struct dma_mapping_ops gart_dma_ops = {
601 .mapping_error = NULL,
602 .map_single = gart_map_single,
603 .map_simple = gart_map_simple,
604 .unmap_single = gart_unmap_single,
605 .sync_single_for_cpu = NULL,
606 .sync_single_for_device = NULL,
607 .sync_single_range_for_cpu = NULL,
608 .sync_single_range_for_device = NULL,
609 .sync_sg_for_cpu = NULL,
610 .sync_sg_for_device = NULL,
611 .map_sg = gart_map_sg,
612 .unmap_sg = gart_unmap_sg,
613};
614
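This ops table is the heart of the commit: the generic dma_* entry points stop being GART functions full of swiotlb special cases and instead dispatch through whichever dma_mapping_ops a backend installed at init time. The dispatch pattern in miniature (standalone C; struct and names are simplified stand-ins, not the kernel types):

    #include <stdio.h>

    struct demo_dma_ops {
            unsigned long (*map_single)(void *addr, unsigned long size);
    };

    static unsigned long gart_map(void *addr, unsigned long size)
    {
            printf("gart: mapping %lu bytes\n", size);
            return (unsigned long)addr;  /* pretend identity mapping */
    }

    static struct demo_dma_ops gart_ops = { .map_single = gart_map };
    static struct demo_dma_ops *dma_ops;  /* chosen once at init time */

    static unsigned long dma_map_single_demo(void *addr, unsigned long size)
    {
            return dma_ops->map_single(addr, size);  /* one indirect call */
    }

    int main(void)
    {
            char buf[64];

            dma_ops = &gart_ops;  /* what pci_iommu_init() does below */
            dma_map_single_demo(buf, sizeof(buf));
            return 0;
    }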
775static int __init pci_iommu_init(void) 615static int __init pci_iommu_init(void)
776{ 616{
777 struct agp_kern_info info; 617 struct agp_kern_info info;
@@ -793,16 +633,15 @@ static int __init pci_iommu_init(void)
793 633
794 if (swiotlb) { 634 if (swiotlb) {
795 no_iommu = 1; 635 no_iommu = 1;
796 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
797 return -1; 636 return -1;
798 } 637 }
799 638
800 if (no_iommu || 639 if (no_iommu ||
801 (!force_iommu && end_pfn < 0xffffffff>>PAGE_SHIFT) || 640 (!force_iommu && end_pfn <= MAX_DMA32_PFN) ||
802 !iommu_aperture || 641 !iommu_aperture ||
803 (no_agp && init_k8_gatt(&info) < 0)) { 642 (no_agp && init_k8_gatt(&info) < 0)) {
804 printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n");
805 no_iommu = 1; 643 no_iommu = 1;
644 no_iommu_init();
806 return -1; 645 return -1;
807 } 646 }
808 647
@@ -879,100 +718,50 @@ static int __init pci_iommu_init(void)
879 718
880 flush_gart(NULL); 719 flush_gart(NULL);
881 720
721 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
722 dma_ops = &gart_dma_ops;
723
882 return 0; 724 return 0;
883} 725}
884 726
885/* Must execute after PCI subsystem */ 727/* Must execute after PCI subsystem */
886fs_initcall(pci_iommu_init); 728fs_initcall(pci_iommu_init);
887 729
888/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge] 730void gart_parse_options(char *p)
889 [,forcesac][,fullflush][,nomerge][,biomerge] 731{
890 size set size of iommu (in bytes) 732 int arg;
891 noagp don't initialize the AGP driver and use full aperture. 733
892 off don't use the IOMMU
893 leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
894 memaper[=order] allocate an own aperture over RAM with size 32MB^order.
895 noforce don't force IOMMU usage. Default.
896 force Force IOMMU.
897 merge Do lazy merging. This may improve performance on some block devices.
898 Implies force (experimental)
899 biomerge Do merging at the BIO layer. This is more efficient than merge,
900 but should be only done with very big IOMMUs. Implies merge,force.
901 nomerge Don't do SG merging.
902 forcesac For SAC mode for masks <40bits (experimental)
903 fullflush Flush IOMMU on each allocation (default)
904 nofullflush Don't use IOMMU fullflush
905 allowed overwrite iommu off workarounds for specific chipsets.
906 soft Use software bounce buffering (default for Intel machines)
907 noaperture Don't touch the aperture for AGP.
908*/
909__init int iommu_setup(char *p)
910{
911 int arg;
912
913 while (*p) {
914 if (!strncmp(p,"noagp",5))
915 no_agp = 1;
916 if (!strncmp(p,"off",3))
917 no_iommu = 1;
918 if (!strncmp(p,"force",5)) {
919 force_iommu = 1;
920 iommu_aperture_allowed = 1;
921 }
922 if (!strncmp(p,"allowed",7))
923 iommu_aperture_allowed = 1;
924 if (!strncmp(p,"noforce",7)) {
925 iommu_merge = 0;
926 force_iommu = 0;
927 }
928 if (!strncmp(p, "memaper", 7)) {
929 fallback_aper_force = 1;
930 p += 7;
931 if (*p == '=') {
932 ++p;
933 if (get_option(&p, &arg))
934 fallback_aper_order = arg;
935 }
936 }
937 if (!strncmp(p, "biomerge",8)) {
938 iommu_bio_merge = 4096;
939 iommu_merge = 1;
940 force_iommu = 1;
941 }
942 if (!strncmp(p, "panic",5))
943 panic_on_overflow = 1;
944 if (!strncmp(p, "nopanic",7))
945 panic_on_overflow = 0;
946 if (!strncmp(p, "merge",5)) {
947 iommu_merge = 1;
948 force_iommu = 1;
949 }
950 if (!strncmp(p, "nomerge",7))
951 iommu_merge = 0;
952 if (!strncmp(p, "forcesac",8))
953 iommu_sac_force = 1;
954 if (!strncmp(p, "fullflush",8))
955 iommu_fullflush = 1;
956 if (!strncmp(p, "nofullflush",11))
957 iommu_fullflush = 0;
958 if (!strncmp(p, "soft",4))
959 swiotlb = 1;
960 if (!strncmp(p, "noaperture",10))
961 fix_aperture = 0;
962#ifdef CONFIG_IOMMU_LEAK 734#ifdef CONFIG_IOMMU_LEAK
963 if (!strncmp(p,"leak",4)) { 735 if (!strncmp(p,"leak",4)) {
964 leak_trace = 1; 736 leak_trace = 1;
965 p += 4; 737 p += 4;
966 if (*p == '=') ++p; 738 if (*p == '=') ++p;
967 if (isdigit(*p) && get_option(&p, &arg)) 739 if (isdigit(*p) && get_option(&p, &arg))
968 iommu_leak_pages = arg; 740 iommu_leak_pages = arg;
969 } else 741 }
970#endif 742#endif
971 if (isdigit(*p) && get_option(&p, &arg)) 743 if (isdigit(*p) && get_option(&p, &arg))
972 iommu_size = arg; 744 iommu_size = arg;
973 p += strcspn(p, ","); 745 if (!strncmp(p, "fullflush",8))
974 if (*p == ',') 746 iommu_fullflush = 1;
975 ++p; 747 if (!strncmp(p, "nofullflush",11))
976 } 748 iommu_fullflush = 0;
977 return 1; 749 if (!strncmp(p,"noagp",5))
978} 750 no_agp = 1;
751 if (!strncmp(p, "noaperture",10))
752 fix_aperture = 0;
753 /* duplicated from pci-dma.c */
754 if (!strncmp(p,"force",5))
755 iommu_aperture_allowed = 1;
756 if (!strncmp(p,"allowed",7))
757 iommu_aperture_allowed = 1;
758 if (!strncmp(p, "memaper", 7)) {
759 fallback_aper_force = 1;
760 p += 7;
761 if (*p == '=') {
762 ++p;
763 if (get_option(&p, &arg))
764 fallback_aper_order = arg;
765 }
766 }
767}
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index 5a981dca87ff..e41564975195 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -6,89 +6,94 @@
6#include <linux/string.h> 6#include <linux/string.h>
7#include <asm/proto.h> 7#include <asm/proto.h>
8#include <asm/processor.h> 8#include <asm/processor.h>
9#include <asm/dma.h>
9 10
10int iommu_merge = 0; 11static int
11EXPORT_SYMBOL(iommu_merge); 12check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
12
13dma_addr_t bad_dma_address;
14EXPORT_SYMBOL(bad_dma_address);
15
16int iommu_bio_merge = 0;
17EXPORT_SYMBOL(iommu_bio_merge);
18
19int iommu_sac_force = 0;
20EXPORT_SYMBOL(iommu_sac_force);
21
22/*
23 * Dummy IO MMU functions
24 */
25
26void *dma_alloc_coherent(struct device *hwdev, size_t size,
27 dma_addr_t *dma_handle, gfp_t gfp)
28{ 13{
29 void *ret; 14 if (hwdev && bus + size > *hwdev->dma_mask) {
30 u64 mask; 15 printk(KERN_ERR
31 int order = get_order(size); 16 "nommu_%s: overflow %Lx+%lu of device mask %Lx\n",
32 17 name, (long long)bus, size, (long long)*hwdev->dma_mask);
33 if (hwdev) 18 return 0;
34 mask = hwdev->coherent_dma_mask & *hwdev->dma_mask;
35 else
36 mask = 0xffffffff;
37 for (;;) {
38 ret = (void *)__get_free_pages(gfp, order);
-		if (ret == NULL)
-			return NULL;
-		*dma_handle = virt_to_bus(ret);
-		if ((*dma_handle & ~mask) == 0)
-			break;
-		free_pages((unsigned long)ret, order);
-		if (gfp & GFP_DMA)
-			return NULL;
-		gfp |= GFP_DMA;
-	}
-
-	memset(ret, 0, size);
-	return ret;
-}
-EXPORT_SYMBOL(dma_alloc_coherent);
-
-void dma_free_coherent(struct device *hwdev, size_t size,
-			 void *vaddr, dma_addr_t dma_handle)
-{
-	free_pages((unsigned long)vaddr, get_order(size));
-}
-EXPORT_SYMBOL(dma_free_coherent);
-
-int dma_supported(struct device *hwdev, u64 mask)
-{
-	/*
-	 * we fall back to GFP_DMA when the mask isn't all 1s,
-	 * so we can't guarantee allocations that must be
-	 * within a tighter range than GFP_DMA..
-	 * RED-PEN this won't work for pci_map_single. Caller has to
-	 * use GFP_DMA in the first place.
-	 */
-	if (mask < 0x00ffffff)
-		return 0;
-
-	return 1;
-}
-EXPORT_SYMBOL(dma_supported);
-
-int dma_get_cache_alignment(void)
-{
-	return boot_cpu_data.x86_clflush_size;
-}
-EXPORT_SYMBOL(dma_get_cache_alignment);
-
-static int __init check_ram(void)
-{
-	if (end_pfn >= 0xffffffff>>PAGE_SHIFT) {
-		printk(
-		KERN_ERR "WARNING more than 4GB of memory but IOMMU not compiled in.\n"
-		KERN_ERR "WARNING 32bit PCI may malfunction.\n");
-	}
-	return 0;
-}
-__initcall(check_ram);
-
+	}
+	return 1;
+}
+
+static dma_addr_t
+nommu_map_single(struct device *hwdev, void *ptr, size_t size,
+		int direction)
+{
+	dma_addr_t bus = virt_to_bus(ptr);
+	if (!check_addr("map_single", hwdev, bus, size))
+		return bad_dma_address;
+	return bus;
+}
+
+void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
+			int direction)
+{
+}
+
+/* Map a set of buffers described by scatterlist in streaming
+ * mode for DMA.  This is the scatter-gather version of the
+ * above pci_map_single interface.  Here the scatter gather list
+ * elements are each tagged with the appropriate dma address
+ * and length.  They are obtained via sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ *       DMA address/length pairs than there are SG table elements.
+ *       (for example via virtual mapping capabilities)
+ *       The routine returns the number of addr/length pairs actually
+ *       used, at most nents.
+ *
+ * Device ownership issues as mentioned above for pci_map_single are
+ * the same here.
+ */
+int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
+	       int nents, int direction)
+{
+	int i;
+
+	BUG_ON(direction == DMA_NONE);
+	for (i = 0; i < nents; i++ ) {
+		struct scatterlist *s = &sg[i];
+		BUG_ON(!s->page);
+		s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
+		if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
+			return 0;
+		s->dma_length = s->length;
+	}
+	return nents;
+}
+
+/* Unmap a set of streaming mode DMA translations.
+ * Again, cpu read rules concerning calls here are the same as for
+ * pci_unmap_single() above.
+ */
+void nommu_unmap_sg(struct device *dev, struct scatterlist *sg,
+		int nents, int dir)
+{
+}
+
+struct dma_mapping_ops nommu_dma_ops = {
+	.map_single = nommu_map_single,
+	.unmap_single = nommu_unmap_single,
+	.map_sg = nommu_map_sg,
+	.unmap_sg = nommu_unmap_sg,
+	.is_phys = 1,
+};
+
+void __init no_iommu_init(void)
+{
+	if (dma_ops)
+		return;
+	printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n");
+	dma_ops = &nommu_dma_ops;
+	if (end_pfn > MAX_DMA32_PFN) {
+		printk(KERN_ERR
+		       "WARNING more than 4GB of memory but IOMMU not compiled in.\n"
+		       KERN_ERR "WARNING 32bit PCI may malfunction.\n"
+		       KERN_ERR "You might want to enable CONFIG_GART_IOMMU\n");
+	}
+}
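
The nommu path above can only hand back virt_to_bus(ptr) unchanged, so its one safety net is the check_addr()-style test that the whole buffer sits below the device's DMA mask. A standalone sketch of that bounds check; the struct and mask value are illustrative stand-ins, not the kernel's definitions:

/* Sketch of the mask test behind a check_addr()-style helper.
 * Standalone C; fake_dev and the mask are illustrative. */
#include <stdint.h>
#include <stdio.h>

struct fake_dev { uint64_t dma_mask; };

/* Returns 1 when [addr, addr+size) fits under the device's DMA mask. */
static int addr_ok(struct fake_dev *dev, uint64_t addr, uint64_t size)
{
	return (addr + size - 1) <= dev->dma_mask;
}

int main(void)
{
	struct fake_dev pci32 = { .dma_mask = 0xffffffffULL };
	printf("%d\n", addr_ok(&pci32, 0xfffff000ULL, 0x1000)); /* 1: fits */
	printf("%d\n", addr_ok(&pci32, 0xfffff001ULL, 0x1000)); /* 0: crosses 4GB */
	return 0;
}

A device whose buffer fails this test has no recourse here - that is exactly why no_iommu_init() warns when memory extends past 4GB.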
diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c
new file mode 100644
index 000000000000..3569a25ad7fb
--- /dev/null
+++ b/arch/x86_64/kernel/pci-swiotlb.c
@@ -0,0 +1,42 @@
+/* Glue code to lib/swiotlb.c */
+
+#include <linux/pci.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <asm/dma-mapping.h>
+#include <asm/proto.h>
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+
+int swiotlb __read_mostly;
+EXPORT_SYMBOL(swiotlb);
+
+struct dma_mapping_ops swiotlb_dma_ops = {
+	.mapping_error = swiotlb_dma_mapping_error,
+	.alloc_coherent = swiotlb_alloc_coherent,
+	.free_coherent = swiotlb_free_coherent,
+	.map_single = swiotlb_map_single,
+	.unmap_single = swiotlb_unmap_single,
+	.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+	.sync_single_for_device = swiotlb_sync_single_for_device,
+	.sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
+	.sync_single_range_for_device = swiotlb_sync_single_range_for_device,
+	.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+	.sync_sg_for_device = swiotlb_sync_sg_for_device,
+	.map_sg = swiotlb_map_sg,
+	.unmap_sg = swiotlb_unmap_sg,
+	.dma_supported = NULL,
+};
+
+void pci_swiotlb_init(void)
+{
+	/* don't initialize swiotlb if iommu=off (no_iommu=1) */
+	if (!iommu_aperture && !no_iommu &&
+	    (end_pfn > MAX_DMA32_PFN || force_iommu))
+		swiotlb = 1;
+	if (swiotlb) {
+		swiotlb_init();
+		printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
+		dma_ops = &swiotlb_dma_ops;
+	}
+}
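
pci_swiotlb_init() and no_iommu_init() resolve to the same pattern: boot code points one global ops pointer at a backend once, and every later DMA mapping call dispatches through it. A hedged userspace sketch of that dispatch, with an illustrative ops struct and backends rather than the kernel's:

/* Sketch of a dma_mapping_ops-style dispatch table. Standalone C;
 * the struct, backends and the memory_above_4g flag are illustrative. */
#include <stdio.h>
#include <stdint.h>

struct mapping_ops {
	uint64_t (*map_single)(void *ptr, uint64_t size);
	const char *name;
};

static uint64_t nommu_map(void *ptr, uint64_t size)
{
	(void)size;
	return (uintptr_t)ptr;		/* identity: virt == bus */
}

static uint64_t bounce_map(void *ptr, uint64_t size)
{
	(void)ptr; (void)size;
	return 0x1000;			/* pretend bounce-buffer address */
}

static struct mapping_ops nommu_ops  = { nommu_map,  "nommu"   };
static struct mapping_ops bounce_ops = { bounce_map, "swiotlb" };
static struct mapping_ops *ops;		/* chosen once at init, like dma_ops */

int main(void)
{
	int memory_above_4g = 1;	/* stand-in for end_pfn > MAX_DMA32_PFN */
	char buf[64];

	ops = memory_above_4g ? &bounce_ops : &nommu_ops;
	printf("%s: bus=%#llx\n", ops->name,
	       (unsigned long long)ops->map_single(buf, sizeof buf));
	return 0;
}

The design pays off in this very merge: nommu, GART and swiotlb all plug into the same table without touching callers.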
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 59be85d9a4bc..8ded407e4a94 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -36,6 +36,7 @@
 #include <linux/utsname.h>
 #include <linux/random.h>
 #include <linux/kprobes.h>
+#include <linux/notifier.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -50,13 +51,12 @@
 #include <asm/desc.h>
 #include <asm/proto.h>
 #include <asm/ia32.h>
+#include <asm/idle.h>
 
 asmlinkage extern void ret_from_fork(void);
 
 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
 
-static atomic_t hlt_counter = ATOMIC_INIT(0);
-
 unsigned long boot_option_idle_override = 0;
 EXPORT_SYMBOL(boot_option_idle_override);
 
@@ -66,19 +66,49 @@ EXPORT_SYMBOL(boot_option_idle_override);
 void (*pm_idle)(void);
 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
 
-void disable_hlt(void)
-{
-	atomic_inc(&hlt_counter);
-}
-
-EXPORT_SYMBOL(disable_hlt);
-
-void enable_hlt(void)
-{
-	atomic_dec(&hlt_counter);
-}
-
-EXPORT_SYMBOL(enable_hlt);
+static struct notifier_block *idle_notifier;
+static DEFINE_SPINLOCK(idle_notifier_lock);
+
+void idle_notifier_register(struct notifier_block *n)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&idle_notifier_lock, flags);
+	notifier_chain_register(&idle_notifier, n);
+	spin_unlock_irqrestore(&idle_notifier_lock, flags);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+void idle_notifier_unregister(struct notifier_block *n)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&idle_notifier_lock, flags);
+	notifier_chain_unregister(&idle_notifier, n);
+	spin_unlock_irqrestore(&idle_notifier_lock, flags);
+}
+EXPORT_SYMBOL(idle_notifier_unregister);
+
+enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
+static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
+
+void enter_idle(void)
+{
+	__get_cpu_var(idle_state) = CPU_IDLE;
+	notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+}
+
+static void __exit_idle(void)
+{
+	__get_cpu_var(idle_state) = CPU_NOT_IDLE;
+	notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+}
+
+/* Called from interrupts to signify idle end */
+void exit_idle(void)
+{
+	if (current->pid | read_pda(irqcount))
+		return;
+	__exit_idle();
+}
 
 /*
  * We use this if we don't have any better
@@ -88,21 +118,16 @@ void default_idle(void)
 {
 	local_irq_enable();
 
-	if (!atomic_read(&hlt_counter)) {
-		clear_thread_flag(TIF_POLLING_NRFLAG);
-		smp_mb__after_clear_bit();
-		while (!need_resched()) {
-			local_irq_disable();
-			if (!need_resched())
-				safe_halt();
-			else
-				local_irq_enable();
-		}
-		set_thread_flag(TIF_POLLING_NRFLAG);
-	} else {
-		while (!need_resched())
-			cpu_relax();
-	}
+	clear_thread_flag(TIF_POLLING_NRFLAG);
+	smp_mb__after_clear_bit();
+	while (!need_resched()) {
+		local_irq_disable();
+		if (!need_resched())
+			safe_halt();
+		else
+			local_irq_enable();
+	}
+	set_thread_flag(TIF_POLLING_NRFLAG);
 }
 
 /*
@@ -144,7 +169,8 @@ void cpu_idle_wait(void)
 	do {
 		ssleep(1);
 		for_each_online_cpu(cpu) {
-			if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
+			if (cpu_isset(cpu, map) &&
+					!per_cpu(cpu_idle_state, cpu))
 				cpu_clear(cpu, map);
 		}
 		cpus_and(map, map, cpu_online_map);
@@ -156,7 +182,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
 DECLARE_PER_CPU(int, cpu_state);
 
 #include <asm/nmi.h>
-/* We don't actually take CPU down, just spin without interrupts. */
+/* We halt the CPU with physical CPU hotplug */
 static inline void play_dead(void)
 {
 	idle_task_exit();
@@ -165,8 +191,9 @@ static inline void play_dead(void)
 	/* Ack it */
 	__get_cpu_var(cpu_state) = CPU_DEAD;
 
+	local_irq_disable();
 	while (1)
-		safe_halt();
+		halt();
 }
 #else
 static inline void play_dead(void)
@@ -199,7 +226,9 @@ void cpu_idle (void)
 			idle = default_idle;
 		if (cpu_is_offline(smp_processor_id()))
 			play_dead();
+		enter_idle();
 		idle();
+		__exit_idle();
 	}
 
 	preempt_enable_no_resched();
@@ -275,7 +304,8 @@ void __show_regs(struct pt_regs * regs)
 		system_utsname.version);
 	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
 	printk_address(regs->rip);
-	printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
+	printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
+		regs->eflags);
 	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
 	       regs->rax, regs->rbx, regs->rcx);
 	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
@@ -349,13 +379,6 @@ void flush_thread(void)
 	struct task_struct *tsk = current;
 	struct thread_info *t = current_thread_info();
 
-	/*
-	 * Remove function-return probe instances associated with this task
-	 * and put them back on the free list. Do not insert an exit probe for
-	 * this function, it will be disabled by kprobe_flush_task if you do.
-	 */
-	kprobe_flush_task(tsk);
-
 	if (t->flags & _TIF_ABI_PENDING)
 		t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
 
@@ -427,21 +450,20 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
 	struct pt_regs * childregs;
 	struct task_struct *me = current;
 
-	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
-
+	childregs = ((struct pt_regs *)
+			(THREAD_SIZE + task_stack_page(p))) - 1;
 	*childregs = *regs;
 
 	childregs->rax = 0;
 	childregs->rsp = rsp;
-	if (rsp == ~0UL) {
+	if (rsp == ~0UL)
 		childregs->rsp = (unsigned long)childregs;
-	}
 
 	p->thread.rsp = (unsigned long) childregs;
 	p->thread.rsp0 = (unsigned long) (childregs+1);
 	p->thread.userrsp = me->thread.userrsp;
 
-	set_ti_thread_flag(p->thread_info, TIF_FORK);
+	set_tsk_thread_flag(p, TIF_FORK);
 
 	p->thread.fs = me->thread.fs;
 	p->thread.gs = me->thread.gs;
@@ -457,7 +479,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
 			p->thread.io_bitmap_max = 0;
 			return -ENOMEM;
 		}
-		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
+		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
+				IO_BITMAP_BYTES);
 	}
 
 	/*
@@ -494,7 +517,8 @@ out:
  * - fold all the options into a flag word and test it with a single test.
  * - could test fs/gs bitsliced
  */
-struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
 	struct thread_struct *prev = &prev_p->thread,
 				 *next = &next_p->thread;
@@ -565,7 +589,8 @@ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *
 	prev->userrsp = read_pda(oldrsp);
 	write_pda(oldrsp, next->userrsp);
 	write_pda(pcurrent, next_p);
-	write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
+	write_pda(kernelstack,
+	    task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
 
 	/*
 	 * Now maybe reload the debug registers
@@ -646,7 +671,9 @@ asmlinkage long sys_fork(struct pt_regs *regs)
 	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
 }
 
-asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+asmlinkage long
+sys_clone(unsigned long clone_flags, unsigned long newsp,
+	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
 {
 	if (!newsp)
 		newsp = regs->rsp;
@@ -677,12 +704,13 @@ unsigned long get_wchan(struct task_struct *p)
 
 	if (!p || p == current || p->state==TASK_RUNNING)
 		return 0;
-	stack = (unsigned long)p->thread_info;
+	stack = (unsigned long)task_stack_page(p);
 	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
 		return 0;
 	fp = *(u64 *)(p->thread.rsp);
 	do {
-		if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
+		if (fp < (unsigned long)stack ||
+		    fp > (unsigned long)stack+THREAD_SIZE)
 			return 0;
 		rip = *(u64 *)(fp+8);
 		if (!in_sched_functions(rip))
@@ -717,8 +745,8 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 			task->thread.gsindex = 0;
 			task->thread.gs = addr;
 			if (doit) {
			load_gs_index(0);
			ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
 			}
 		}
 		put_cpu();
@@ -735,7 +763,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 			set_32bit_tls(task, FS_TLS, addr);
 			if (doit) {
 				load_TLS(&task->thread, cpu);
-				asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
+				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
 			}
 			task->thread.fsindex = FS_TLS_SEL;
 			task->thread.fs = 0;
@@ -745,8 +773,8 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 			if (doit) {
 				/* set the selector to 0 to not confuse
 				   __switch_to */
			asm volatile("movl %0,%%fs" :: "r" (0));
			ret = checking_wrmsrl(MSR_FS_BASE, addr);
 			}
 		}
 		put_cpu();
@@ -755,9 +783,9 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 		unsigned long base;
 		if (task->thread.fsindex == FS_TLS_SEL)
 			base = read_32bit_tls(task, FS_TLS);
-		else if (doit) {
+		else if (doit)
 			rdmsrl(MSR_FS_BASE, base);
-		} else
+		else
 			base = task->thread.fs;
 		ret = put_user(base, (unsigned long __user *)addr);
 		break;
@@ -766,9 +794,9 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 		unsigned long base;
 		if (task->thread.gsindex == GS_TLS_SEL)
 			base = read_32bit_tls(task, GS_TLS);
-		else if (doit) {
+		else if (doit)
 			rdmsrl(MSR_KERNEL_GS_BASE, base);
-		} else
+		else
 			base = task->thread.gs;
 		ret = put_user(base, (unsigned long __user *)addr);
 		break;
@@ -794,8 +822,7 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
 {
 	struct pt_regs *pp, ptregs;
 
-	pp = (struct pt_regs *)(tsk->thread.rsp0);
-	--pp;
+	pp = task_pt_regs(tsk);
 
 	ptregs = *pp;
 	ptregs.cs &= 0xffff;
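
The idle notifier machinery added above is a plain notifier chain: enter_idle() broadcasts IDLE_START, __exit_idle() broadcasts IDLE_END, and exit_idle() filters out calls that arrive from nested interrupts. A standalone sketch of the chain walk, assuming a simplified single-threaded model rather than the kernel's notifier_block API:

/* Sketch of the notifier-chain pattern behind enter_idle()/exit_idle().
 * Standalone C; only the event names mirror the diff, the rest is a
 * deliberately simplified model. Compile with -std=c99. */
#include <stdio.h>

enum { IDLE_START, IDLE_END };

struct notifier {
	int (*call)(unsigned long event);
	struct notifier *next;
};

static struct notifier *idle_chain;	/* head of the chain */

static void chain_register(struct notifier *n)
{
	n->next = idle_chain;		/* push onto singly-linked chain */
	idle_chain = n;
}

static void chain_call(unsigned long event)
{
	for (struct notifier *n = idle_chain; n; n = n->next)
		n->call(event);
}

static int trace_idle(unsigned long event)
{
	printf(event == IDLE_START ? "cpu went idle\n" : "cpu left idle\n");
	return 0;
}

int main(void)
{
	struct notifier n = { trace_idle, NULL };
	chain_register(&n);
	chain_call(IDLE_START);		/* what enter_idle() broadcasts */
	chain_call(IDLE_END);		/* what __exit_idle() broadcasts */
	return 0;
}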
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index a87b6cebe80f..53205622351c 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -36,9 +36,12 @@
  * in exit.c or in signal.c.
  */
 
-/* determines which flags the user has access to. */
-/* 1 = access 0 = no access */
-#define FLAG_MASK 0x44dd5UL
+/*
+ * Determines which flags the user has access to [1 = access, 0 = no access].
+ * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
+ * Also masks reserved bits (63-22, 15, 5, 3, 1).
+ */
+#define FLAG_MASK 0x54dd5UL
 
 /* set's the trap flag. */
 #define TRAP_FLAG 0x100UL
@@ -64,12 +67,6 @@ static inline unsigned long get_stack_long(struct task_struct *task, int offset)
 	return (*((unsigned long *)stack));
 }
 
-static inline struct pt_regs *get_child_regs(struct task_struct *task)
-{
-	struct pt_regs *regs = (void *)task->thread.rsp0;
-	return regs - 1;
-}
-
 /*
  * this routine will put a word on the processes privileged stack.
  * the offset is how far from the base addr as stored in the TSS.
@@ -167,7 +164,7 @@ static int is_at_popf(struct task_struct *child, struct pt_regs *regs)
 
 static void set_singlestep(struct task_struct *child)
 {
-	struct pt_regs *regs = get_child_regs(child);
+	struct pt_regs *regs = task_pt_regs(child);
 
 	/*
 	 * Always set TIF_SINGLESTEP - this guarantees that
@@ -205,7 +202,7 @@ static void clear_singlestep(struct task_struct *child)
 
 	/* But touch TF only if it was set by us.. */
 	if (child->ptrace & PT_DTRACE) {
-		struct pt_regs *regs = get_child_regs(child);
+		struct pt_regs *regs = task_pt_regs(child);
 		regs->eflags &= ~TRAP_FLAG;
 		child->ptrace &= ~PT_DTRACE;
 	}
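
The widened FLAG_MASK above matters wherever a debugger pokes eflags: privileged bits are kept from the current value, and only masked bits are taken from the request. A standalone sketch of that merge with illustrative values; 0x54dd5 is the constant from the patch, the merge expression is the usual ptrace pattern rather than the kernel's exact code:

/* Sketch of FLAG_MASK-confined eflags writes. Standalone C. */
#include <stdio.h>

#define FLAG_MASK 0x54dd5UL

static unsigned long poke_eflags(unsigned long cur, unsigned long req)
{
	/* keep privileged bits from cur, take only allowed bits from req */
	return (cur & ~FLAG_MASK) | (req & FLAG_MASK);
}

int main(void)
{
	unsigned long cur = 0x202;		/* IF set: typical user eflags */
	unsigned long req = 0xffffffff;		/* debugger asks for everything */

	/* IF, IOPL, VM, VIF, VIP and ID survive from cur unchanged */
	printf("%#lx\n", poke_eflags(cur, req));
	return 0;
}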
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
index 47f95687905f..57117b8beb2b 100644
--- a/arch/x86_64/kernel/reboot.c
+++ b/arch/x86_64/kernel/reboot.c
@@ -6,6 +6,7 @@
 #include <linux/kernel.h>
 #include <linux/ctype.h>
 #include <linux/string.h>
+#include <linux/pm.h>
 #include <asm/io.h>
 #include <asm/kdebug.h>
 #include <asm/delay.h>
@@ -77,6 +78,7 @@ static inline void kb_wait(void)
 
 void machine_shutdown(void)
 {
+	unsigned long flags;
 	/* Stop the cpus and apics */
 #ifdef CONFIG_SMP
 	int reboot_cpu_id;
@@ -98,7 +100,7 @@ void machine_shutdown(void)
 	smp_send_stop();
 #endif
 
-	local_irq_disable();
+	local_irq_save(flags);
 
 #ifndef CONFIG_SMP
 	disable_local_APIC();
@@ -106,7 +108,7 @@ void machine_shutdown(void)
 
 	disable_IO_APIC();
 
-	local_irq_enable();
+	local_irq_restore(flags);
 }
 
 void machine_emergency_restart(void)
@@ -120,7 +122,7 @@ void machine_emergency_restart(void)
 	/* Could also try the reset bit in the Hammer NB */
 	switch (reboot_type) {
 	case BOOT_KBD:
-	for (i=0; i<100; i++) {
+	for (i=0; i<10; i++) {
 		kb_wait();
 		udelay(50);
 		outb(0xfe,0x64);         /* pulse reset low */
@@ -153,10 +155,11 @@ void machine_halt(void)
 
 void machine_power_off(void)
 {
-	if (!reboot_force) {
-		machine_shutdown();
-	}
-	if (pm_power_off)
+	if (pm_power_off) {
+		if (!reboot_force) {
+			machine_shutdown();
+		}
 		pm_power_off();
+	}
 }
 
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index da0bc3e7bdf5..28895c03cb11 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -35,6 +35,7 @@
 #include <asm/processor.h>
 #include <linux/console.h>
 #include <linux/seq_file.h>
+#include <linux/crash_dump.h>
 #include <linux/root_dev.h>
 #include <linux/pci.h>
 #include <linux/acpi.h>
@@ -42,6 +43,9 @@
 #include <linux/edd.h>
 #include <linux/mmzone.h>
 #include <linux/kexec.h>
+#include <linux/cpufreq.h>
+#include <linux/dmi.h>
+#include <linux/dma-mapping.h>
 
@@ -60,6 +64,9 @@
 #include <asm/setup.h>
 #include <asm/mach_apic.h>
 #include <asm/numa.h>
+#include <asm/swiotlb.h>
+#include <asm/sections.h>
+#include <asm/gart-mapping.h>
 
 /*
  * Machine setup..
@@ -84,11 +91,6 @@ int bootloader_type;
 
 unsigned long saved_video_mode;
 
-#ifdef CONFIG_SWIOTLB
-int swiotlb;
-EXPORT_SYMBOL(swiotlb);
-#endif
-
 /*
  * Setup options
  */
@@ -103,7 +105,6 @@ struct edid_info edid_info;
 struct e820map e820;
 
 extern int root_mountflags;
-extern char _text, _etext, _edata, _end;
 
 char command_line[COMMAND_LINE_SIZE];
 
@@ -274,10 +275,7 @@ static __init void parse_cmdline_early (char ** cmdline_p)
 {
 	char c = ' ', *to = command_line, *from = COMMAND_LINE;
 	int len = 0;
-
-	/* Save unparsed command line copy for /proc/cmdline */
-	memcpy(saved_command_line, COMMAND_LINE, COMMAND_LINE_SIZE);
-	saved_command_line[COMMAND_LINE_SIZE-1] = '\0';
+	int userdef = 0;
 
 	for (;;) {
 		if (c != ' ')
@@ -345,10 +343,14 @@ static __init void parse_cmdline_early (char ** cmdline_p)
 		    !memcmp(from, "disableapic", 11))
 			disable_apic = 1;
 
-		if (!memcmp(from, "noapic", 6))
+		/* Don't confuse with noapictimer */
+		if (!memcmp(from, "noapic", 6) &&
+				(from[6] == ' ' || from[6] == 0))
 			skip_ioapic_setup = 1;
 
-		if (!memcmp(from, "apic", 4)) {
+		/* Make sure to not confuse with apic= */
+		if (!memcmp(from, "apic", 4) &&
+			(from[4] == ' ' || from[4] == 0)) {
 			skip_ioapic_setup = 0;
 			ioapic_force = 1;
 		}
@@ -356,16 +358,36 @@ static __init void parse_cmdline_early (char ** cmdline_p)
 		if (!memcmp(from, "mem=", 4))
 			parse_memopt(from+4, &from);
 
+		if (!memcmp(from, "memmap=", 7)) {
+			/* exactmap option is for used defined memory */
+			if (!memcmp(from+7, "exactmap", 8)) {
+#ifdef CONFIG_CRASH_DUMP
+				/* If we are doing a crash dump, we
+				 * still need to know the real mem
+				 * size before original memory map is
+				 * reset.
+				 */
+				saved_max_pfn = e820_end_of_ram();
+#endif
+				from += 8+7;
+				end_pfn_map = 0;
+				e820.nr_map = 0;
+				userdef = 1;
+			}
+			else {
+				parse_memmapopt(from+7, &from);
+				userdef = 1;
+			}
+		}
+
 #ifdef CONFIG_NUMA
 		if (!memcmp(from, "numa=", 5))
 			numa_setup(from+5);
 #endif
 
-#ifdef CONFIG_GART_IOMMU
 		if (!memcmp(from,"iommu=",6)) {
 			iommu_setup(from+6);
 		}
-#endif
 
 		if (!memcmp(from,"oops=panic", 10))
 			panic_on_oops = 1;
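
memparse(), used by the mem=/memmap= handlers here and by elfcorehdr= in the next hunk, is strtoul plus a K/M/G suffix, advancing the caller's cursor past whatever it consumed. A standalone approximation (my_memparse is a local stand-in, not the kernel function):

/* Sketch of memparse()-style size parsing. Standalone C. */
#include <stdio.h>
#include <stdlib.h>

static unsigned long long my_memparse(const char *s, char **retptr)
{
	unsigned long long v = strtoull(s, retptr, 0);	/* base 0: 0x works */

	switch (**retptr) {
	case 'G': case 'g':
		v <<= 10;	/* fall through */
	case 'M': case 'm':
		v <<= 10;	/* fall through */
	case 'K': case 'k':
		v <<= 10;
		(*retptr)++;	/* consume the suffix */
	default:
		break;
	}
	return v;
}

int main(void)
{
	char *rest;
	printf("%llu\n", my_memparse("64M@0x1000000", &rest)); /* 67108864 */
	printf("rest: %s\n", rest);				/* "@0x1000000" */
	return 0;
}

Leaving the cursor on the unconsumed tail is what lets memmap=64M@16M style options be parsed piecewise.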
@@ -394,6 +416,14 @@ static __init void parse_cmdline_early (char ** cmdline_p)
 		}
 #endif
 
+#ifdef CONFIG_PROC_VMCORE
+		/* elfcorehdr= specifies the location of elf core header
+		 * stored by the crashed kernel. This option will be passed
+		 * by kexec loader to the capture kernel.
+		 */
+		else if(!memcmp(from, "elfcorehdr=", 11))
+			elfcorehdr_addr = memparse(from+11, &from);
+#endif
 	next_char:
 		c = *(from++);
 		if (!c)
@@ -402,6 +432,10 @@ static __init void parse_cmdline_early (char ** cmdline_p)
 			break;
 		*(to++) = c;
 	}
+	if (userdef) {
+		printk(KERN_INFO "user-defined physical RAM map:\n");
+		e820_print_map("user");
+	}
 	*to = '\0';
 	*cmdline_p = command_line;
 }
@@ -412,7 +446,6 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long bootmap_size, bootmap;
 
-	memory_present(0, start_pfn, end_pfn);
 	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
 	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
 	if (bootmap == -1L)
@@ -443,6 +476,8 @@ static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
 	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 };
 
+extern char __vsyscall_0;
+
 /* Replace instructions with better alternatives for this CPU type.
 
    This runs before SMP is initialized to avoid SMP problems with
@@ -454,11 +489,17 @@ void apply_alternatives(void *start, void *end)
 	struct alt_instr *a;
 	int diff, i, k;
 	for (a = start; (void *)a < end; a++) {
+		u8 *instr;
+
 		if (!boot_cpu_has(a->cpuid))
 			continue;
 
 		BUG_ON(a->replacementlen > a->instrlen);
-		__inline_memcpy(a->instr, a->replacement, a->replacementlen);
+		instr = a->instr;
+		/* vsyscall code is not mapped yet. resolve it manually. */
+		if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END)
+			instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
+		__inline_memcpy(instr, a->replacement, a->replacementlen);
 		diff = a->instrlen - a->replacementlen;
 
 		/* Pad the rest with nops */
@@ -466,7 +507,7 @@ void apply_alternatives(void *start, void *end)
 			k = diff;
 			if (k > ASM_NOP_MAX)
 				k = ASM_NOP_MAX;
-			__inline_memcpy(a->instr + i, k8_nops[k], k);
+			__inline_memcpy(instr + i, k8_nops[k], k);
 		}
 	}
 }
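
The vsyscall fixup above is pointer rebasing: the alternative's recorded address lies in the fixed vsyscall mapping, which is not writable at this point, so the patch site is recomputed relative to a base that is. A standalone sketch of just that arithmetic, with made-up base addresses:

/* Sketch of the instr rebase in apply_alternatives. Standalone C;
 * both bases below are illustrative, not real kernel addresses. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t mapped_base   = 0xffffffffff600000ULL;	/* fixed mapping */
	uint64_t writable_base = 0x0000000001234000ULL;	/* illustrative */
	uint64_t instr         = mapped_base + 0x42;	/* site to patch */

	/* same offset, different base: instr' = wb + (instr - mb) */
	uint64_t patch_at = writable_base + (instr - mapped_base);

	printf("patch at %#llx\n", (unsigned long long)patch_at);
	return 0;
}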
@@ -571,6 +612,8 @@ void __init setup_arch(char **cmdline_p)
 
 	init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
 
+	zap_low_mappings(0);
+
 #ifdef CONFIG_ACPI
 	/*
 	 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
@@ -657,8 +700,6 @@ void __init setup_arch(char **cmdline_p)
 	}
 #endif
 
-	sparse_init();
-
 	paging_init();
 
 	check_ioapic();
@@ -670,6 +711,8 @@ void __init setup_arch(char **cmdline_p)
 	acpi_boot_init();
 #endif
 
+	init_cpu_to_node();
+
 #ifdef CONFIG_X86_LOCAL_APIC
 	/*
 	 * get boot-time SMP configuration:
@@ -793,7 +836,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
 #endif
 
 	bits = 0;
-	while ((1 << bits) < c->x86_num_cores)
+	while ((1 << bits) < c->x86_max_cores)
 		bits++;
 
 	/* Low order bits define the core id (index of core in socket) */
@@ -823,10 +866,10 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
 		if (!node_online(node))
 			node = nearby_node(apicid);
 	}
-	cpu_to_node[cpu] = node;
+	numa_set_node(cpu, node);
 
 	printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n",
-			cpu, c->x86_num_cores, node, cpu_core_id[cpu]);
+			cpu, c->x86_max_cores, node, cpu_core_id[cpu]);
 #endif
 #endif
 }
@@ -834,7 +877,6 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
 static int __init init_amd(struct cpuinfo_x86 *c)
 {
 	int r;
-	int level;
 
 #ifdef CONFIG_SMP
 	unsigned long value;
@@ -857,11 +899,6 @@ static int __init init_amd(struct cpuinfo_x86 *c)
 	   3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
 	clear_bit(0*32+31, &c->x86_capability);
 
-	/* C-stepping K8? */
-	level = cpuid_eax(1);
-	if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
-		set_bit(X86_FEATURE_K8_C, &c->x86_capability);
-
 	r = get_model_name(c);
 	if (!r) {
 		switch (c->x86) {
@@ -874,10 +911,14 @@ static int __init init_amd(struct cpuinfo_x86 *c)
 	}
 	display_cacheinfo(c);
 
+	/* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
+	if (c->x86_power & (1<<8))
+		set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
+
 	if (c->extended_cpuid_level >= 0x80000008) {
-		c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
-		if (c->x86_num_cores & (c->x86_num_cores - 1))
-			c->x86_num_cores = 1;
+		c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
+		if (c->x86_max_cores & (c->x86_max_cores - 1))
+			c->x86_max_cores = 1;
 
 		amd_detect_cmp(c);
 	}
@@ -889,54 +930,44 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
 	u32 eax, ebx, ecx, edx;
-	int index_msb, tmp;
+	int index_msb, core_bits;
 	int cpu = smp_processor_id();
 
+	cpuid(1, &eax, &ebx, &ecx, &edx);
+
+	c->apicid = phys_pkg_id(0);
+
 	if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
 		return;
 
-	cpuid(1, &eax, &ebx, &ecx, &edx);
 	smp_num_siblings = (ebx & 0xff0000) >> 16;
 
 	if (smp_num_siblings == 1) {
 		printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
-	} else if (smp_num_siblings > 1) {
-		index_msb = 31;
-		/*
-		 * At this point we only support two siblings per
-		 * processor package.
-		 */
+	} else if (smp_num_siblings > 1 ) {
+
 		if (smp_num_siblings > NR_CPUS) {
 			printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
 			smp_num_siblings = 1;
 			return;
 		}
-		tmp = smp_num_siblings;
-		while ((tmp & 0x80000000 ) == 0) {
-			tmp <<=1 ;
-			index_msb--;
-		}
-		if (smp_num_siblings & (smp_num_siblings - 1))
-			index_msb++;
+
+		index_msb = get_count_order(smp_num_siblings);
 		phys_proc_id[cpu] = phys_pkg_id(index_msb);
 
 		printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
 		       phys_proc_id[cpu]);
 
-		smp_num_siblings = smp_num_siblings / c->x86_num_cores;
+		smp_num_siblings = smp_num_siblings / c->x86_max_cores;
 
-		tmp = smp_num_siblings;
-		index_msb = 31;
-		while ((tmp & 0x80000000) == 0) {
-			tmp <<=1 ;
-			index_msb--;
-		}
-		if (smp_num_siblings & (smp_num_siblings - 1))
-			index_msb++;
+		index_msb = get_count_order(smp_num_siblings) ;
+
+		core_bits = get_count_order(c->x86_max_cores);
 
-		cpu_core_id[cpu] = phys_pkg_id(index_msb);
+		cpu_core_id[cpu] = phys_pkg_id(index_msb) &
+					       ((1 << core_bits) - 1);
 
-		if (c->x86_num_cores > 1)
+		if (c->x86_max_cores > 1)
 			printk(KERN_INFO "CPU: Processor Core ID: %d\n",
 			       cpu_core_id[cpu]);
 	}
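
get_count_order(n), which replaces the hand-rolled MSB loops above, is just ceil(log2(n)); detect_ht() uses it to slice the APIC id into package, core and thread fields. A standalone sketch of that decomposition under an assumed 2-core/2-thread topology; the ids and topology are illustrative, not read from hardware:

/* Sketch of the get_count_order()-based APIC id math. Standalone C. */
#include <stdio.h>

static int count_order(unsigned int n)	/* ceil(log2(n)) */
{
	int order = 0;
	while ((1u << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned int apicid = 0x5;		/* 0b101, illustrative */
	unsigned int siblings_per_pkg = 4;	/* 2 cores x 2 HT threads */
	unsigned int cores_per_pkg = 2;

	int pkg_shift = count_order(siblings_per_pkg);	/* 2 */
	int core_bits = count_order(cores_per_pkg);	/* 1 */
	int thr_shift = count_order(siblings_per_pkg / cores_per_pkg); /* 1 */

	unsigned int phys_proc = apicid >> pkg_shift;
	unsigned int core_id   = (apicid >> thr_shift) & ((1u << core_bits) - 1);

	printf("package %u, core %u\n", phys_proc, core_id); /* package 1, core 0 */
	return 0;
}

Masking with ((1 << core_bits) - 1) is the part the old code lacked: it keeps the core id from absorbing package bits.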
@@ -975,7 +1006,7 @@ static void srat_detect_node(void)
 		node = apicid_to_node[hard_smp_processor_id()];
 		if (node == NUMA_NO_NODE)
 			node = 0;
-	cpu_to_node[cpu] = node;
+	numa_set_node(cpu, node);
 
 	if (acpi_numa > 0)
 		printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node);
@@ -993,13 +1024,20 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 		unsigned eax = cpuid_eax(0x80000008);
 		c->x86_virt_bits = (eax >> 8) & 0xff;
 		c->x86_phys_bits = eax & 0xff;
+		/* CPUID workaround for Intel 0F34 CPU */
+		if (c->x86_vendor == X86_VENDOR_INTEL &&
+		    c->x86 == 0xF && c->x86_model == 0x3 &&
+		    c->x86_mask == 0x4)
+			c->x86_phys_bits = 36;
 	}
 
 	if (c->x86 == 15)
 		c->x86_cache_alignment = c->x86_clflush_size * 2;
-	if (c->x86 >= 15)
+	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
+	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
 		set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
-	c->x86_num_cores = intel_num_cpu_cores(c);
+	set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+	c->x86_max_cores = intel_num_cpu_cores(c);
 
 	srat_detect_node();
 }
@@ -1037,7 +1075,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 	c->x86_model_id[0] = '\0';  /* Unset */
 	c->x86_clflush_size = 64;
 	c->x86_cache_alignment = c->x86_clflush_size;
-	c->x86_num_cores = 1;
+	c->x86_max_cores = 1;
 	c->extended_cpuid_level = 0;
 	memset(&c->x86_capability, 0, sizeof c->x86_capability);
 
@@ -1060,10 +1098,10 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 		c->x86 = (tfms >> 8) & 0xf;
 		c->x86_model = (tfms >> 4) & 0xf;
 		c->x86_mask = tfms & 0xf;
-		if (c->x86 == 0xf) {
+		if (c->x86 == 0xf)
 			c->x86 += (tfms >> 20) & 0xff;
+		if (c->x86 >= 0x6)
 			c->x86_model += ((tfms >> 16) & 0xF) << 4;
-		}
 		if (c->x86_capability[0] & (1<<19))
 			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
 	} else {
@@ -1197,7 +1235,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
-		NULL, "fxsr_opt", NULL, NULL, NULL, "lm", "3dnowext", "3dnow",
+		NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow",
 
 		/* Transmeta-defined */
 		"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
@@ -1225,7 +1263,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
 		/* AMD-defined (#2) */
-		"lahf_lm", "cmp_legacy", NULL, NULL, NULL, NULL, NULL, NULL,
+		"lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
@@ -1236,7 +1274,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		"vid", /* voltage id control */
 		"ttp", /* thermal trip */
 		"tm",
-		"stc"
+		"stc",
+		NULL,
+		/* nothing */ /* constant_tsc - moved to flags */
 	};
 
 
@@ -1262,8 +1302,11 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		seq_printf(m, "stepping\t: unknown\n");
 
 	if (cpu_has(c,X86_FEATURE_TSC)) {
+		unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
+		if (!freq)
+			freq = cpu_khz;
 		seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
-			cpu_khz / 1000, (cpu_khz % 1000));
+			freq / 1000, (freq % 1000));
 	}
 
 	/* Cache size */
@@ -1271,13 +1314,12 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
 
 #ifdef CONFIG_SMP
-	if (smp_num_siblings * c->x86_num_cores > 1) {
+	if (smp_num_siblings * c->x86_max_cores > 1) {
 		int cpu = c - cpu_data;
 		seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]);
-		seq_printf(m, "siblings\t: %d\n",
-			c->x86_num_cores * smp_num_siblings);
+		seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
 		seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]);
-		seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores);
+		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
 	}
 #endif
 
@@ -1314,8 +1356,11 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		unsigned i;
 		for (i = 0; i < 32; i++)
 			if (c->x86_power & (1 << i)) {
-				if (i < ARRAY_SIZE(x86_power_flags))
-					seq_printf(m, " %s", x86_power_flags[i]);
+				if (i < ARRAY_SIZE(x86_power_flags) &&
+				    x86_power_flags[i])
+					seq_printf(m, "%s%s",
+						   x86_power_flags[i][0]?" ":"",
+						   x86_power_flags[i]);
 				else
 					seq_printf(m, " [%d]", i);
 			}
@@ -1347,3 +1392,11 @@ struct seq_operations cpuinfo_op = {
 	.stop =	c_stop,
 	.show =	show_cpuinfo,
 };
+
+static int __init run_dmi_scan(void)
+{
+	dmi_scan_machine();
+	return 0;
+}
+core_initcall(run_dmi_scan);
+
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 79190891fbc5..70f1bb808a20 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -30,14 +30,15 @@ char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
 
 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
 
-struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned;
+struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
+struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
 
 struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table };
 
 char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
 
 unsigned long __supported_pte_mask __read_mostly = ~0UL;
-static int do_not_nx __initdata = 0;
+static int do_not_nx __cpuinitdata = 0;
 
 /* noexec=on|off
 Control non executable mappings for 64bit processes.
@@ -110,18 +111,18 @@ void __init setup_per_cpu_areas(void)
 		}
 		if (!ptr)
 			panic("Cannot allocate cpu data for CPU %d\n", i);
-		cpu_pda[i].data_offset = ptr - __per_cpu_start;
+		cpu_pda(i)->data_offset = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 	}
 }
 
 void pda_init(int cpu)
 {
-	struct x8664_pda *pda = &cpu_pda[cpu];
+	struct x8664_pda *pda = cpu_pda(cpu);
 
 	/* Setup up data that may be needed in __get_free_pages early */
 	asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
-	wrmsrl(MSR_GS_BASE, cpu_pda + cpu);
+	wrmsrl(MSR_GS_BASE, pda);
 
 	pda->cpunumber = cpu;
 	pda->irqcount = -1;
@@ -141,12 +142,11 @@ void pda_init(int cpu)
 		panic("cannot allocate irqstack for cpu %d", cpu);
 	}
 
-	asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
 
 	pda->irqstackptr += IRQSTACKSIZE-64;
 }
 
-char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ]
+char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
 __attribute__((section(".bss.page_aligned")));
 
 /* May not be marked __init: used by software suspend */
@@ -197,6 +197,7 @@ void __cpuinit cpu_init (void)
 	/* CPU 0 is initialised in head64.c */
 	if (cpu != 0) {
 		pda_init(cpu);
+		zap_low_mappings(cpu);
 	} else
 		estacks = boot_exception_stacks;
 
@@ -213,23 +214,14 @@ void __cpuinit cpu_init (void)
 	 * Initialize the per-CPU GDT with the boot GDT,
 	 * and set up the GDT descriptor:
 	 */
-	if (cpu) {
-		memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE);
-	}
+	if (cpu)
+		memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
 
 	cpu_gdt_descr[cpu].size = GDT_SIZE;
-	cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu];
 	asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu]));
 	asm volatile("lidt %0" :: "m" (idt_descr));
 
-	memcpy(me->thread.tls_array, cpu_gdt_table[cpu], GDT_ENTRY_TLS_ENTRIES * 8);
-
-	/*
-	 * Delete NT
-	 */
-
-	asm volatile("pushfq ; popq %%rax ; btr $14,%%rax ; pushq %%rax ; popfq" ::: "eax");
-
+	memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
 	syscall_init();
 
 	wrmsrl(MSR_FS_BASE, 0);
@@ -243,13 +235,27 @@ void __cpuinit cpu_init (void)
 	 */
 	for (v = 0; v < N_EXCEPTION_STACKS; v++) {
 		if (cpu) {
-			estacks = (char *)__get_free_pages(GFP_ATOMIC,
-					EXCEPTION_STACK_ORDER);
+			static const unsigned int order[N_EXCEPTION_STACKS] = {
+				[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+				[DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+			};
+
+			estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
 			if (!estacks)
 				panic("Cannot allocate exception stack %ld %d\n",
 				      v, cpu);
 		}
-		estacks += EXCEPTION_STKSZ;
+		switch (v + 1) {
+#if DEBUG_STKSZ > EXCEPTION_STKSZ
+		case DEBUG_STACK:
+			cpu_pda[cpu].debugstack = (unsigned long)estacks;
+			estacks += DEBUG_STKSZ;
+			break;
+#endif
+		default:
+			estacks += EXCEPTION_STKSZ;
+			break;
+		}
 		t->ist[v] = (unsigned long)estacks;
 	}
 
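
The order[] table above leans on C's designated initializers plus the GNU range extension: one range entry sets a default for every slot, and a later entry overrides a single slot. A standalone example (compiles with gcc or clang; the sizes are illustrative, not the kernel's):

/* Sketch of the range-designated-initializer idiom. Standalone C. */
#include <stdio.h>

#define N_STACKS      5
#define DEFAULT_ORDER 0
#define DEBUG_SLOT    3
#define DEBUG_ORDER   1

static const unsigned int order[N_STACKS] = {
	[0 ... N_STACKS - 1] = DEFAULT_ORDER,	/* GNU range extension */
	[DEBUG_SLOT] = DEBUG_ORDER,		/* later entry wins */
};

int main(void)
{
	for (int i = 0; i < N_STACKS; i++)
		printf("stack %d: order %u\n", i, order[i]);
	return 0;
}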
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index d642fbf3da29..5876df116c92 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -110,6 +110,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
 	COPY(r14);
 	COPY(r15);
 
+	/* Kernel saves and restores only the CS segment register on signals,
+	 * which is the bare minimum needed to allow mixed 32/64-bit code.
+	 * App's signal handler can save/restore other segments if needed. */
+	{
+		unsigned cs;
+		err |= __get_user(cs, &sc->cs);
+		regs->cs = cs | 3;	/* Force into user mode */
+	}
+
 	{
 		unsigned int tmpflags;
 		err |= __get_user(tmpflags, &sc->eflags);
@@ -187,6 +196,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo
 {
 	int err = 0;
 
+	err |= __put_user(regs->cs, &sc->cs);
 	err |= __put_user(0, &sc->gs);
 	err |= __put_user(0, &sc->fs);
 
@@ -318,7 +328,14 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 
 	regs->rsp = (unsigned long)frame;
 
+	/* Set up the CS register to run signal handlers in 64-bit mode,
+	   even if the handler happens to be interrupting 32-bit code. */
+	regs->cs = __USER_CS;
+
+	/* This, by contrast, has nothing to do with segment registers -
+	   see include/asm-x86_64/uaccess.h for details. */
 	set_fs(USER_DS);
+
 	regs->eflags &= ~TF_MASK;
 	if (test_thread_flag(TIF_SINGLESTEP))
 		ptrace_notify(SIGTRAP);
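
Forcing regs->cs = cs | 3 above works because the low two bits of a segment selector are its RPL: ORing in 3 pins any restored selector to user privilege, so a forged signal frame cannot smuggle in a kernel selector. A standalone illustration with typical selector values:

/* Sketch of the RPL-forcing OR in restore_sigcontext. Standalone C;
 * the selector values are typical examples, not pulled from a live task. */
#include <stdio.h>

int main(void)
{
	unsigned int user_cs   = 0x33;	/* common x86-64 user code selector */
	unsigned int forged_cs = 0x10;	/* RPL 0: a kernel-style selector */

	printf("%#x\n", user_cs | 3);	/* 0x33: already RPL 3, unchanged */
	printf("%#x\n", forged_cs | 3);	/* 0x13: demoted to RPL 3 */
	return 0;
}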
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 9db9dda161b4..19ef012b1f17 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -27,8 +27,7 @@
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
 #include <asm/apicdef.h>
-
-#define __cpuinit __init
+#include <asm/idle.h>
 
 /*
  * Smarter SMP flushing macros.
@@ -282,11 +281,6 @@ void flush_tlb_all(void)
 	on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
 }
 
-void smp_kdb_stop(void)
-{
-	send_IPI_allbutself(KDB_VECTOR);
-}
-
 /*
  * this function sends a 'reschedule' IPI to another CPU.
  * it goes straight through and wastes no time serializing
@@ -452,13 +446,14 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
 
 void smp_stop_cpu(void)
 {
+	unsigned long flags;
 	/*
 	 * Remove this CPU:
 	 */
 	cpu_clear(smp_processor_id(), cpu_online_map);
-	local_irq_disable();
+	local_irq_save(flags);
 	disable_local_APIC();
-	local_irq_enable();
+	local_irq_restore(flags);
 }
 
 static void smp_really_stop_cpu(void *dummy)
@@ -513,6 +508,7 @@ asmlinkage void smp_call_function_interrupt(void)
 	/*
 	 * At this point the info structure may be out of scope unless wait==1
 	 */
+	exit_idle();
 	irq_enter();
 	(*func)(info);
 	irq_exit();
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index c4e59bbdc187..a28756ef7cef 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -64,6 +64,7 @@
 int smp_num_siblings = 1;
 /* Package ID of each logical CPU */
 u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+/* core ID of each logical CPU */
 u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 
 /* Bitmask of currently online CPUs */
@@ -87,7 +88,10 @@ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 /* Set when the idlers are all forked */
 int smp_threads_ready;
 
+/* representing HT siblings of each logical CPU */
 cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
+
+/* representing HT and core siblings of each logical CPU */
 cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(cpu_core_map);
 
@@ -331,7 +335,13 @@ static __cpuinit void sync_tsc(unsigned int master)
 
 static void __cpuinit tsc_sync_wait(void)
 {
-	if (notscsync || !cpu_has_tsc)
+	/*
+	 * When the CPU has synchronized TSCs assume the BIOS
+	 * or the hardware already synced.  Otherwise we could
+	 * mess up a possible perfect synchronization with a
+	 * not-quite-perfect algorithm.
+	 */
+	if (notscsync || !cpu_has_tsc || !unsynchronized_tsc())
 		return;
 	sync_tsc(0);
 }
@@ -434,30 +444,59 @@ void __cpuinit smp_callin(void)
 	cpu_set(cpuid, cpu_callin_map);
 }
 
+/* representing cpus for which sibling maps can be computed */
+static cpumask_t cpu_sibling_setup_map;
+
 static inline void set_cpu_sibling_map(int cpu)
 {
 	int i;
+	struct cpuinfo_x86 *c = cpu_data;
+
+	cpu_set(cpu, cpu_sibling_setup_map);
 
 	if (smp_num_siblings > 1) {
-		for_each_cpu(i) {
-			if (cpu_core_id[cpu] == cpu_core_id[i]) {
+		for_each_cpu_mask(i, cpu_sibling_setup_map) {
+			if (phys_proc_id[cpu] == phys_proc_id[i] &&
+			    cpu_core_id[cpu] == cpu_core_id[i]) {
 				cpu_set(i, cpu_sibling_map[cpu]);
 				cpu_set(cpu, cpu_sibling_map[i]);
+				cpu_set(i, cpu_core_map[cpu]);
+				cpu_set(cpu, cpu_core_map[i]);
 			}
 		}
 	} else {
 		cpu_set(cpu, cpu_sibling_map[cpu]);
 	}
 
-	if (current_cpu_data.x86_num_cores > 1) {
-		for_each_cpu(i) {
-			if (phys_proc_id[cpu] == phys_proc_id[i]) {
-				cpu_set(i, cpu_core_map[cpu]);
-				cpu_set(cpu, cpu_core_map[i]);
-			}
-		}
-	} else {
+	if (current_cpu_data.x86_max_cores == 1) {
 		cpu_core_map[cpu] = cpu_sibling_map[cpu];
+		c[cpu].booted_cores = 1;
+		return;
+	}
+
+	for_each_cpu_mask(i, cpu_sibling_setup_map) {
+		if (phys_proc_id[cpu] == phys_proc_id[i]) {
+			cpu_set(i, cpu_core_map[cpu]);
+			cpu_set(cpu, cpu_core_map[i]);
+			/*
+			 * Does this new cpu bringup a new core?
+			 */
+			if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
+				/*
+				 * for each core in package, increment
+				 * the booted_cores for this new cpu
+				 */
+				if (first_cpu(cpu_sibling_map[i]) == i)
+					c[cpu].booted_cores++;
+				/*
+				 * increment the core count for all
+				 * the other cpus in this package
+				 */
+				if (i != cpu)
+					c[i].booted_cores++;
+			} else if (i != cpu && !c[cpu].booted_cores)
+				c[cpu].booted_cores = c[i].booted_cores;
+		}
 	}
 }
 
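
The booted_cores accounting above counts a core once per package, when its first HT sibling comes online, and lets later siblings inherit the package's running count. A standalone model of that bookkeeping, assuming small fixed topology arrays instead of the kernel's cpumasks; the topology is illustrative:

/* Sketch of booted_cores bookkeeping: 1 package, 2 cores, 2 HT threads. */
#include <stdio.h>

#define NCPUS 4
static int pkg[NCPUS]  = { 0, 0, 0, 0 };
static int core[NCPUS] = { 0, 0, 1, 1 };

int main(void)
{
	int booted_cores[NCPUS] = { 0 };
	int online = 0;

	for (int cpu = 0; cpu < NCPUS; cpu++, online++) {
		int new_core = 1;
		for (int i = 0; i < online; i++)
			if (pkg[i] == pkg[cpu] && core[i] == core[cpu])
				new_core = 0;	/* a sibling beat us online */

		if (new_core) {
			/* everyone already online in the package sees one more core */
			for (int i = 0; i < online; i++)
				if (pkg[i] == pkg[cpu])
					booted_cores[i]++;
			/* the new cpu counts all distinct cores, its own included */
			int seen[NCPUS] = { 0 }, cores = 1;
			for (int i = 0; i < online; i++)
				if (pkg[i] == pkg[cpu] && !seen[core[i]]) {
					seen[core[i]] = 1;
					cores++;
				}
			booted_cores[cpu] = cores;
		} else {
			/* joining an existing core: inherit the package count */
			for (int i = 0; i < online; i++)
				if (pkg[i] == pkg[cpu]) {
					booted_cores[cpu] = booted_cores[i];
					break;
				}
		}
		printf("cpu %d online: booted_cores=%d\n", cpu, booted_cores[cpu]);
	}
	return 0;
}

This is what /proc/cpuinfo's new "cpu cores" line (c->booted_cores in the setup.c hunks) reports instead of the raw x86_max_cores maximum.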
@@ -613,6 +652,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
613 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; 652 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
614 } while (send_status && (timeout++ < 1000)); 653 } while (send_status && (timeout++ < 1000));
615 654
655 mb();
616 atomic_set(&init_deasserted, 1); 656 atomic_set(&init_deasserted, 1);
617 657
618 num_starts = 2; 658 num_starts = 2;
@@ -626,7 +666,6 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
626 666
627 for (j = 1; j <= num_starts; j++) { 667 for (j = 1; j <= num_starts; j++) {
628 Dprintk("Sending STARTUP #%d.\n",j); 668 Dprintk("Sending STARTUP #%d.\n",j);
629 apic_read_around(APIC_SPIV);
630 apic_write(APIC_ESR, 0); 669 apic_write(APIC_ESR, 0);
631 apic_read(APIC_ESR); 670 apic_read(APIC_ESR);
632 Dprintk("After apic_write.\n"); 671 Dprintk("After apic_write.\n");
@@ -665,7 +704,6 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
665 * Due to the Pentium erratum 3AP. 704 * Due to the Pentium erratum 3AP.
666 */ 705 */
667 if (maxlvt > 3) { 706 if (maxlvt > 3) {
668 apic_read_around(APIC_SPIV);
669 apic_write(APIC_ESR, 0); 707 apic_write(APIC_ESR, 0);
670 } 708 }
671 accept_status = (apic_read(APIC_ESR) & 0xEF); 709 accept_status = (apic_read(APIC_ESR) & 0xEF);
@@ -710,11 +748,35 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
710 }; 748 };
711 DECLARE_WORK(work, do_fork_idle, &c_idle); 749 DECLARE_WORK(work, do_fork_idle, &c_idle);
712 750
751 /* allocate memory for gdts of secondary cpus. Hotplug is considered */
752 if (!cpu_gdt_descr[cpu].address &&
753 !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
754 printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
755 return -1;
756 }
757
758 /* Allocate node local memory for AP pdas */
759 if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) {
760 struct x8664_pda *newpda, *pda;
761 int node = cpu_to_node(cpu);
762 pda = cpu_pda(cpu);
763 newpda = kmalloc_node(sizeof (struct x8664_pda), GFP_ATOMIC,
764 node);
765 if (newpda) {
766 memcpy(newpda, pda, sizeof (struct x8664_pda));
767 cpu_pda(cpu) = newpda;
768 } else
769 printk(KERN_ERR
770 "Could not allocate node local PDA for CPU %d on node %d\n",
771 cpu, node);
772 }
773
774
713 c_idle.idle = get_idle_for_cpu(cpu); 775 c_idle.idle = get_idle_for_cpu(cpu);
714 776
715 if (c_idle.idle) { 777 if (c_idle.idle) {
716 c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) 778 c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
717 (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1); 779 (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
718 init_idle(c_idle.idle, cpu); 780 init_idle(c_idle.idle, cpu);
719 goto do_rest; 781 goto do_rest;
720 } 782 }
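
The new block above upgrades each AP from the statically allocated boot PDA to node-local memory and keeps the static copy as a fallback. A userspace sketch of that pattern, with malloc standing in for kmalloc_node and placeholder struct fields:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct pda { int cpunumber; void *irqstackptr; };

    #define NCPUS 2
    static struct pda boot_pda[NCPUS]; /* always-valid fallback */
    static struct pda *cpu_pda[NCPUS] = { &boot_pda[0], &boot_pda[1] };

    static void upgrade_pda(int cpu, int node)
    {
        struct pda *newpda;

        if (cpu_pda[cpu] != &boot_pda[cpu])
            return; /* already node-local */
        newpda = malloc(sizeof(*newpda)); /* kmalloc_node(..., node) in the kernel */
        if (newpda) {
            memcpy(newpda, cpu_pda[cpu], sizeof(*newpda));
            cpu_pda[cpu] = newpda;
        } else
            fprintf(stderr, "no node-local PDA for cpu %d on node %d\n",
                    cpu, node); /* silently keep using boot_pda[cpu] */
    }

    int main(void)
    {
        upgrade_pda(1, 0);
        return 0;
    }
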
@@ -745,14 +807,14 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
745 807
746do_rest: 808do_rest:
747 809
748 cpu_pda[cpu].pcurrent = c_idle.idle; 810 cpu_pda(cpu)->pcurrent = c_idle.idle;
749 811
750 start_rip = setup_trampoline(); 812 start_rip = setup_trampoline();
751 813
752 init_rsp = c_idle.idle->thread.rsp; 814 init_rsp = c_idle.idle->thread.rsp;
753 per_cpu(init_tss,cpu).rsp0 = init_rsp; 815 per_cpu(init_tss,cpu).rsp0 = init_rsp;
754 initial_code = start_secondary; 816 initial_code = start_secondary;
755 clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK); 817 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
756 818
757 printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu, 819 printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu,
758 cpus_weight(cpu_present_map), 820 cpus_weight(cpu_present_map),
@@ -778,11 +840,8 @@ do_rest:
778 /* 840 /*
779 * Be paranoid about clearing APIC errors. 841 * Be paranoid about clearing APIC errors.
780 */ 842 */
781 if (APIC_INTEGRATED(apic_version[apicid])) { 843 apic_write(APIC_ESR, 0);
782 apic_read_around(APIC_SPIV); 844 apic_read(APIC_ESR);
783 apic_write(APIC_ESR, 0);
784 apic_read(APIC_ESR);
785 }
786 845
787 /* 846 /*
788 * Status is now clean 847 * Status is now clean
@@ -879,6 +938,9 @@ static __init void disable_smp(void)
879} 938}
880 939
881#ifdef CONFIG_HOTPLUG_CPU 940#ifdef CONFIG_HOTPLUG_CPU
941
942int additional_cpus __initdata = -1;
943
882/* 944/*
883 * cpu_possible_map should be static, it cannot change as cpu's 945 * cpu_possible_map should be static, it cannot change as cpu's
884 * are onlined, or offlined. The reason is per-cpu data-structures 946 * are onlined, or offlined. The reason is per-cpu data-structures
@@ -887,14 +949,35 @@ static __init void disable_smp(void)
887 * cpu_present_map on the other hand can change dynamically. 949 * cpu_present_map on the other hand can change dynamically.
888 * In case when cpu_hotplug is not compiled, then we resort to current 950 * In case when cpu_hotplug is not compiled, then we resort to current
889 * behaviour, which is cpu_possible == cpu_present. 951 * behaviour, which is cpu_possible == cpu_present.
890 * If cpu-hotplug is supported, then we need to preallocate for all
891 * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
892 * - Ashok Raj 952 * - Ashok Raj
953 *
954 * Three ways to find out the number of additional hotplug CPUs:
 955 * - If the BIOS specified disabled CPUs in ACPI/mptables, use that.
 956 * - The user can override it with additional_cpus=NUM.
957 * - Otherwise don't reserve additional CPUs.
958 * We do this because additional CPUs waste a lot of memory.
959 * -AK
893 */ 960 */
894__init void prefill_possible_map(void) 961__init void prefill_possible_map(void)
895{ 962{
896 int i; 963 int i;
897 for (i = 0; i < NR_CPUS; i++) 964 int possible;
965
966 if (additional_cpus == -1) {
967 if (disabled_cpus > 0)
968 additional_cpus = disabled_cpus;
969 else
970 additional_cpus = 0;
971 }
972 possible = num_processors + additional_cpus;
973 if (possible > NR_CPUS)
974 possible = NR_CPUS;
975
976 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
977 possible,
978 max_t(int, possible - num_processors, 0));
979
980 for (i = 0; i < possible; i++)
898 cpu_set(i, cpu_possible_map); 981 cpu_set(i, cpu_possible_map);
899} 982}
900#endif 983#endif
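
The sizing rule of prefill_possible_map() above, isolated as a self-contained function; the inputs in main() are made-up sample values:

    #include <stdio.h>

    #define NR_CPUS 8

    static int possible_cpus(int num_processors, int disabled_cpus,
                             int additional_cpus /* -1 = not set on cmdline */)
    {
        int possible;

        if (additional_cpus == -1)
            additional_cpus = disabled_cpus > 0 ? disabled_cpus : 0;
        possible = num_processors + additional_cpus;
        if (possible > NR_CPUS)
            possible = NR_CPUS;
        return possible;
    }

    int main(void)
    {
        /* 4 CPUs up, 2 disabled in ACPI, no additional_cpus= override */
        printf("possible=%d\n", possible_cpus(4, 2, -1)); /* prints 6 */
        return 0;
    }
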
@@ -936,7 +1019,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
936 /* 1019 /*
937 * If we couldn't find a local APIC, then get out of here now! 1020 * If we couldn't find a local APIC, then get out of here now!
938 */ 1021 */
939 if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) { 1022 if (!cpu_has_apic) {
940 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 1023 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
941 boot_cpu_id); 1024 boot_cpu_id);
942 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); 1025 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
@@ -965,6 +1048,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
965 nmi_watchdog_default(); 1048 nmi_watchdog_default();
966 current_cpu_data = boot_cpu_data; 1049 current_cpu_data = boot_cpu_data;
967 current_thread_info()->cpu = 0; /* needed? */ 1050 current_thread_info()->cpu = 0; /* needed? */
1051 set_cpu_sibling_map(0);
968 1052
969 if (smp_sanity_check(max_cpus) < 0) { 1053 if (smp_sanity_check(max_cpus) < 0) {
970 printk(KERN_INFO "SMP disabled\n"); 1054 printk(KERN_INFO "SMP disabled\n");
@@ -1008,8 +1092,6 @@ void __init smp_prepare_boot_cpu(void)
1008 int me = smp_processor_id(); 1092 int me = smp_processor_id();
1009 cpu_set(me, cpu_online_map); 1093 cpu_set(me, cpu_online_map);
1010 cpu_set(me, cpu_callout_map); 1094 cpu_set(me, cpu_callout_map);
1011 cpu_set(0, cpu_sibling_map[0]);
1012 cpu_set(0, cpu_core_map[0]);
1013 per_cpu(cpu_state, me) = CPU_ONLINE; 1095 per_cpu(cpu_state, me) = CPU_ONLINE;
1014} 1096}
1015 1097
@@ -1062,9 +1144,6 @@ int __cpuinit __cpu_up(unsigned int cpu)
1062 */ 1144 */
1063void __init smp_cpus_done(unsigned int max_cpus) 1145void __init smp_cpus_done(unsigned int max_cpus)
1064{ 1146{
1065#ifndef CONFIG_HOTPLUG_CPU
1066 zap_low_mappings();
1067#endif
1068 smp_cleanup_boot(); 1147 smp_cleanup_boot();
1069 1148
1070#ifdef CONFIG_X86_IO_APIC 1149#ifdef CONFIG_X86_IO_APIC
@@ -1081,15 +1160,24 @@ void __init smp_cpus_done(unsigned int max_cpus)
1081static void remove_siblinginfo(int cpu) 1160static void remove_siblinginfo(int cpu)
1082{ 1161{
1083 int sibling; 1162 int sibling;
1163 struct cpuinfo_x86 *c = cpu_data;
1084 1164
1165 for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
1166 cpu_clear(cpu, cpu_core_map[sibling]);
1167 /*
 1168 * the last thread sibling in this cpu core is going down
1169 */
1170 if (cpus_weight(cpu_sibling_map[cpu]) == 1)
1171 c[sibling].booted_cores--;
1172 }
1173
1085 for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) 1174 for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
1086 cpu_clear(cpu, cpu_sibling_map[sibling]); 1175 cpu_clear(cpu, cpu_sibling_map[sibling]);
1087 for_each_cpu_mask(sibling, cpu_core_map[cpu])
1088 cpu_clear(cpu, cpu_core_map[sibling]);
1089 cpus_clear(cpu_sibling_map[cpu]); 1176 cpus_clear(cpu_sibling_map[cpu]);
1090 cpus_clear(cpu_core_map[cpu]); 1177 cpus_clear(cpu_core_map[cpu]);
1091 phys_proc_id[cpu] = BAD_APICID; 1178 phys_proc_id[cpu] = BAD_APICID;
1092 cpu_core_id[cpu] = BAD_APICID; 1179 cpu_core_id[cpu] = BAD_APICID;
1180 cpu_clear(cpu, cpu_sibling_setup_map);
1093} 1181}
1094 1182
1095void remove_cpu_from_maps(void) 1183void remove_cpu_from_maps(void)
@@ -1116,7 +1204,7 @@ int __cpu_disable(void)
1116 if (cpu == 0) 1204 if (cpu == 0)
1117 return -EBUSY; 1205 return -EBUSY;
1118 1206
1119 disable_APIC_timer(); 1207 clear_local_APIC();
1120 1208
1121 /* 1209 /*
1122 * HACK: 1210 * HACK:
@@ -1153,6 +1241,12 @@ void __cpu_die(unsigned int cpu)
1153 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1241 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1154} 1242}
1155 1243
1244static __init int setup_additional_cpus(char *s)
1245{
1246 return get_option(&s, &additional_cpus);
1247}
1248__setup("additional_cpus=", setup_additional_cpus);
1249
1156#else /* ... !CONFIG_HOTPLUG_CPU */ 1250#else /* ... !CONFIG_HOTPLUG_CPU */
1157 1251
1158int __cpu_disable(void) 1252int __cpu_disable(void)
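
For reference, the __setup handler above parses a single integer from the kernel command line via get_option(). A hypothetical boot entry reserving two hotplug slots could look like:

    kernel /boot/vmlinuz root=/dev/sda1 additional_cpus=2

With that setting, prefill_possible_map() marks num_processors + 2 CPUs possible (clamped to NR_CPUS).
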
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index fd2bef780882..ecbd34c1093d 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -120,7 +120,7 @@ void fix_processor_context(void)
120 120
 121 set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has the concept of a busy TSS or some similar stupidity. */ 121
122 122
123 cpu_gdt_table[cpu][GDT_ENTRY_TSS].type = 9; 123 cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9;
124 124
125 syscall_init(); /* This sets MSR_*STAR and related */ 125 syscall_init(); /* This sets MSR_*STAR and related */
126 load_TR_desc(); /* This does ltr */ 126 load_TR_desc(); /* This does ltr */
diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c
index cc7821c68851..6449ea8fe756 100644
--- a/arch/x86_64/kernel/sys_x86_64.c
+++ b/arch/x86_64/kernel/sys_x86_64.c
@@ -154,17 +154,3 @@ asmlinkage long sys_uname(struct new_utsname __user * name)
154 err |= copy_to_user(&name->machine, "i686", 5); 154 err |= copy_to_user(&name->machine, "i686", 5);
155 return err ? -EFAULT : 0; 155 return err ? -EFAULT : 0;
156} 156}
157
158asmlinkage long sys_time64(long __user * tloc)
159{
160 struct timeval now;
161 int i;
162
163 do_gettimeofday(&now);
164 i = now.tv_sec;
165 if (tloc) {
166 if (put_user(i,tloc))
167 i = -EFAULT;
168 }
169 return i;
170}
diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c
index e263685f864c..7c176b3edde0 100644
--- a/arch/x86_64/kernel/syscall.c
+++ b/arch/x86_64/kernel/syscall.c
@@ -19,7 +19,7 @@ typedef void (*sys_call_ptr_t)(void);
19 19
20extern void sys_ni_syscall(void); 20extern void sys_ni_syscall(void);
21 21
22sys_call_ptr_t sys_call_table[__NR_syscall_max+1] __cacheline_aligned = { 22const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
 23 /* Smells like a compiler bug -- it doesn't work when the & below is removed. */ 23 /* Smells like a compiler bug -- it doesn't work when the & below is removed. */
24 [0 ... __NR_syscall_max] = &sys_ni_syscall, 24 [0 ... __NR_syscall_max] = &sys_ni_syscall,
25#include <asm-x86_64/unistd.h> 25#include <asm-x86_64/unistd.h>
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index fdaddc4e5284..f8c47c688443 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -59,7 +59,7 @@ static int notsc __initdata = 0;
59unsigned int cpu_khz; /* TSC clocks / usec, not used here */ 59unsigned int cpu_khz; /* TSC clocks / usec, not used here */
60static unsigned long hpet_period; /* fsecs / HPET clock */ 60static unsigned long hpet_period; /* fsecs / HPET clock */
61unsigned long hpet_tick; /* HPET clocks / interrupt */ 61unsigned long hpet_tick; /* HPET clocks / interrupt */
 62static int hpet_use_timer; 62static int hpet_use_timer; /* Use the HPET counter for timekeeping, otherwise the PIT */
63unsigned long vxtime_hz = PIT_TICK_RATE; 63unsigned long vxtime_hz = PIT_TICK_RATE;
64int report_lost_ticks; /* command line option */ 64int report_lost_ticks; /* command line option */
65unsigned long long monotonic_base; 65unsigned long long monotonic_base;
@@ -71,14 +71,6 @@ unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
71struct timespec __xtime __section_xtime; 71struct timespec __xtime __section_xtime;
72struct timezone __sys_tz __section_sys_tz; 72struct timezone __sys_tz __section_sys_tz;
73 73
74static inline void rdtscll_sync(unsigned long *tsc)
75{
76#ifdef CONFIG_SMP
77 sync_core();
78#endif
79 rdtscll(*tsc);
80}
81
82/* 74/*
83 * do_gettimeoffset() returns microseconds since last timer interrupt was 75 * do_gettimeoffset() returns microseconds since last timer interrupt was
84 * triggered by hardware. A memory read of HPET is slower than a register read 76 * triggered by hardware. A memory read of HPET is slower than a register read
@@ -93,7 +85,7 @@ static inline unsigned int do_gettimeoffset_tsc(void)
93{ 85{
94 unsigned long t; 86 unsigned long t;
95 unsigned long x; 87 unsigned long x;
96 rdtscll_sync(&t); 88 t = get_cycles_sync();
97 if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */ 89 if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */
98 x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; 90 x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32;
99 return x; 91 return x;
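
This hunk (and several below) replaces the deleted rdtscll_sync() with get_cycles_sync(); both boil down to a serialized TSC read. A userspace sketch of the idea for gcc on x86-64, using cpuid as the serializing instruction (the helper name matches the kernel's, but this body is only an approximation of it):

    #include <stdint.h>
    #include <stdio.h>

    /* Serialize first so earlier instructions cannot be reordered
     * past the rdtsc, then read the 64-bit timestamp counter. */
    static inline uint64_t get_cycles_sync(void)
    {
        uint32_t a, b, c, d, lo, hi;

        __asm__ volatile("cpuid"
                         : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                         : "a"(0) : "memory");
        (void)a; (void)b; (void)c; (void)d;
        __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
        return ((uint64_t)hi << 32) | lo;
    }

    int main(void)
    {
        uint64_t t0 = get_cycles_sync();
        uint64_t t1 = get_cycles_sync();
        printf("delta=%llu cycles\n", (unsigned long long)(t1 - t0));
        return 0;
    }
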
@@ -259,8 +251,8 @@ static void set_rtc_mmss(unsigned long nowtime)
259#endif 251#endif
260 252
261 { 253 {
262 BIN_TO_BCD(real_seconds); 254 BIN_TO_BCD(real_seconds);
263 BIN_TO_BCD(real_minutes); 255 BIN_TO_BCD(real_minutes);
264 CMOS_WRITE(real_seconds, RTC_SECONDS); 256 CMOS_WRITE(real_seconds, RTC_SECONDS);
265 CMOS_WRITE(real_minutes, RTC_MINUTES); 257 CMOS_WRITE(real_minutes, RTC_MINUTES);
266 } 258 }
@@ -297,25 +289,21 @@ unsigned long long monotonic_clock(void)
297 last_offset = vxtime.last; 289 last_offset = vxtime.last;
298 base = monotonic_base; 290 base = monotonic_base;
299 this_offset = hpet_readl(HPET_COUNTER); 291 this_offset = hpet_readl(HPET_COUNTER);
300
301 } while (read_seqretry(&xtime_lock, seq)); 292 } while (read_seqretry(&xtime_lock, seq));
302 offset = (this_offset - last_offset); 293 offset = (this_offset - last_offset);
303 offset *=(NSEC_PER_SEC/HZ)/hpet_tick; 294 offset *=(NSEC_PER_SEC/HZ)/hpet_tick;
304 return base + offset; 295 return base + offset;
305 }else{ 296 } else {
306 do { 297 do {
307 seq = read_seqbegin(&xtime_lock); 298 seq = read_seqbegin(&xtime_lock);
308 299
309 last_offset = vxtime.last_tsc; 300 last_offset = vxtime.last_tsc;
310 base = monotonic_base; 301 base = monotonic_base;
311 } while (read_seqretry(&xtime_lock, seq)); 302 } while (read_seqretry(&xtime_lock, seq));
312 sync_core(); 303 this_offset = get_cycles_sync();
313 rdtscll(this_offset);
314 offset = (this_offset - last_offset)*1000/cpu_khz; 304 offset = (this_offset - last_offset)*1000/cpu_khz;
315 return base + offset; 305 return base + offset;
316 } 306 }
317
318
319} 307}
320EXPORT_SYMBOL(monotonic_clock); 308EXPORT_SYMBOL(monotonic_clock);
321 309
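
monotonic_clock() above relies on the xtime seqlock to snapshot last_tsc/monotonic_base consistently. A stripped-down model of that retry loop (the struct and barrier choice are illustrative; the kernel uses read_seqbegin()/read_seqretry()):

    #include <stdint.h>
    #include <stdio.h>

    struct clock_state {
        volatile unsigned seq; /* odd while a writer is mid-update */
        uint64_t last_tsc, base;
    };

    static uint64_t read_monotonic(const struct clock_state *s,
                                   uint64_t now_tsc, unsigned cpu_khz)
    {
        unsigned seq;
        uint64_t last, base;

        do {
            seq = s->seq;
            __sync_synchronize(); /* read_seqbegin() */
            last = s->last_tsc;
            base = s->base;
            __sync_synchronize(); /* read_seqretry() */
        } while ((seq & 1) || seq != s->seq);

        return base + (now_tsc - last) * 1000 / cpu_khz;
    }

    int main(void)
    {
        struct clock_state s = { 0, 1000, 5000000 };
        printf("%llu ns\n",
               (unsigned long long)read_monotonic(&s, 3000, 2000));
        return 0;
    }
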
@@ -391,7 +379,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
391 delay = LATCH - 1 - delay; 379 delay = LATCH - 1 - delay;
392 } 380 }
393 381
394 rdtscll_sync(&tsc); 382 tsc = get_cycles_sync();
395 383
396 if (vxtime.mode == VXTIME_HPET) { 384 if (vxtime.mode == VXTIME_HPET) {
397 if (offset - vxtime.last > hpet_tick) { 385 if (offset - vxtime.last > hpet_tick) {
@@ -471,6 +459,11 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
471 459
472 write_sequnlock(&xtime_lock); 460 write_sequnlock(&xtime_lock);
473 461
462#ifdef CONFIG_X86_LOCAL_APIC
463 if (using_apic_timer)
464 smp_send_timer_broadcast_ipi();
465#endif
466
474 return IRQ_HANDLED; 467 return IRQ_HANDLED;
475} 468}
476 469
@@ -509,10 +502,10 @@ unsigned long long sched_clock(void)
509 return cycles_2_ns(a); 502 return cycles_2_ns(a);
510} 503}
511 504
512unsigned long get_cmos_time(void) 505static unsigned long get_cmos_time(void)
513{ 506{
514 unsigned int timeout, year, mon, day, hour, min, sec; 507 unsigned int timeout = 1000000, year, mon, day, hour, min, sec;
515 unsigned char last, this; 508 unsigned char uip = 0, this = 0;
516 unsigned long flags; 509 unsigned long flags;
517 510
518/* 511/*
@@ -525,45 +518,41 @@ unsigned long get_cmos_time(void)
525 518
526 spin_lock_irqsave(&rtc_lock, flags); 519 spin_lock_irqsave(&rtc_lock, flags);
527 520
528 timeout = 1000000; 521 while (timeout && (!uip || this)) {
529 last = this = 0; 522 uip |= this;
530
531 while (timeout && last && !this) {
532 last = this;
533 this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP; 523 this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP;
534 timeout--; 524 timeout--;
535 } 525 }
536 526
537/* 527 /*
538 * Here we are safe to assume the registers won't change for a whole second, so 528 * Here we are safe to assume the registers won't change for a whole
539 * we just go ahead and read them. 529 * second, so we just go ahead and read them.
540 */ 530 */
541 531 sec = CMOS_READ(RTC_SECONDS);
542 sec = CMOS_READ(RTC_SECONDS); 532 min = CMOS_READ(RTC_MINUTES);
543 min = CMOS_READ(RTC_MINUTES); 533 hour = CMOS_READ(RTC_HOURS);
544 hour = CMOS_READ(RTC_HOURS); 534 day = CMOS_READ(RTC_DAY_OF_MONTH);
545 day = CMOS_READ(RTC_DAY_OF_MONTH); 535 mon = CMOS_READ(RTC_MONTH);
546 mon = CMOS_READ(RTC_MONTH); 536 year = CMOS_READ(RTC_YEAR);
547 year = CMOS_READ(RTC_YEAR);
548 537
549 spin_unlock_irqrestore(&rtc_lock, flags); 538 spin_unlock_irqrestore(&rtc_lock, flags);
550 539
551/* 540 /*
552 * We know that x86-64 always uses BCD format, no need to check the config 541 * We know that x86-64 always uses BCD format, no need to check the
553 * register. 542 * config register.
554 */ 543 */
555 544
556 BCD_TO_BIN(sec); 545 BCD_TO_BIN(sec);
557 BCD_TO_BIN(min); 546 BCD_TO_BIN(min);
558 BCD_TO_BIN(hour); 547 BCD_TO_BIN(hour);
559 BCD_TO_BIN(day); 548 BCD_TO_BIN(day);
560 BCD_TO_BIN(mon); 549 BCD_TO_BIN(mon);
561 BCD_TO_BIN(year); 550 BCD_TO_BIN(year);
562 551
563/* 552 /*
 564 * x86-64 systems have only existed since 2002. 553 * x86-64 systems have only existed since 2002.
565 * This will work up to Dec 31, 2100 554 * This will work up to Dec 31, 2100
566 */ 555 */
567 year += 2000; 556 year += 2000;
568 557
569 return mktime(year, mon, day, hour, min, sec); 558 return mktime(year, mon, day, hour, min, sec);
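
The rewritten wait loop above latches whether the RTC update-in-progress (UIP) flag has ever been seen and stops once it has been seen and then dropped, i.e. right after an update completes. The same logic as a standalone sketch with a stubbed register read:

    #include <stdio.h>

    /* rtc_uip() stands in for CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP. */
    static int wait_uip_edge(int (*rtc_uip)(void))
    {
        int timeout = 1000000, uip = 0, this = 0;

        while (timeout && (!uip || this)) {
            uip |= this;
            this = rtc_uip();
            timeout--;
        }
        return timeout > 0; /* nonzero: a fresh second just started */
    }

    static int fake_uip(void)
    {
        static int n;
        n++;
        return n > 3 && n < 8; /* one simulated UIP pulse */
    }

    int main(void)
    {
        printf("safe to read RTC: %d\n", wait_uip_edge(fake_uip));
        return 0;
    }
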
@@ -695,8 +684,7 @@ static unsigned int __init hpet_calibrate_tsc(void)
695 do { 684 do {
696 local_irq_disable(); 685 local_irq_disable();
697 hpet_now = hpet_readl(HPET_COUNTER); 686 hpet_now = hpet_readl(HPET_COUNTER);
698 sync_core(); 687 tsc_now = get_cycles_sync();
699 rdtscl(tsc_now);
700 local_irq_restore(flags); 688 local_irq_restore(flags);
701 } while ((tsc_now - tsc_start) < TICK_COUNT && 689 } while ((tsc_now - tsc_start) < TICK_COUNT &&
702 (hpet_now - hpet_start) < TICK_COUNT); 690 (hpet_now - hpet_start) < TICK_COUNT);
@@ -726,11 +714,9 @@ static unsigned int __init pit_calibrate_tsc(void)
726 outb(0xb0, 0x43); 714 outb(0xb0, 0x43);
727 outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42); 715 outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
728 outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42); 716 outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42);
729 rdtscll(start); 717 start = get_cycles_sync();
730 sync_core();
731 while ((inb(0x61) & 0x20) == 0); 718 while ((inb(0x61) & 0x20) == 0);
732 sync_core(); 719 end = get_cycles_sync();
733 rdtscll(end);
734 720
735 spin_unlock_irqrestore(&i8253_lock, flags); 721 spin_unlock_irqrestore(&i8253_lock, flags);
736 722
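
pit_calibrate_tsc() above gates PIT channel 2 for a fixed 50 ms window and divides the TSC delta by 50 to obtain kHz. The arithmetic, isolated with made-up sample counts:

    #include <stdio.h>

    int main(void)
    {
        /* Sample values only: TSC reads around a 50 ms PIT gate. */
        unsigned long long start = 1000000ULL, end = 121000000ULL;
        unsigned long cpu_khz = (unsigned long)((end - start) / 50);

        printf("cpu_khz = %lu (~%lu MHz)\n", cpu_khz, cpu_khz / 1000);
        return 0;
    }
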
@@ -744,7 +730,7 @@ static __init int late_hpet_init(void)
744 unsigned int ntimer; 730 unsigned int ntimer;
745 731
746 if (!vxtime.hpet_address) 732 if (!vxtime.hpet_address)
747 return -1; 733 return -1;
748 734
749 memset(&hd, 0, sizeof (hd)); 735 memset(&hd, 0, sizeof (hd));
750 736
@@ -878,8 +864,6 @@ static struct irqaction irq0 = {
878 timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL 864 timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL
879}; 865};
880 866
881extern void __init config_acpi_tables(void);
882
883void __init time_init(void) 867void __init time_init(void)
884{ 868{
885 char *timename; 869 char *timename;
@@ -908,12 +892,14 @@ void __init time_init(void)
908 if (!hpet_init()) 892 if (!hpet_init())
909 vxtime_hz = (1000000000000000L + hpet_period / 2) / 893 vxtime_hz = (1000000000000000L + hpet_period / 2) /
910 hpet_period; 894 hpet_period;
895 else
896 vxtime.hpet_address = 0;
911 897
912 if (hpet_use_timer) { 898 if (hpet_use_timer) {
913 cpu_khz = hpet_calibrate_tsc(); 899 cpu_khz = hpet_calibrate_tsc();
914 timename = "HPET"; 900 timename = "HPET";
915#ifdef CONFIG_X86_PM_TIMER 901#ifdef CONFIG_X86_PM_TIMER
916 } else if (pmtmr_ioport) { 902 } else if (pmtmr_ioport && !vxtime.hpet_address) {
917 vxtime_hz = PM_TIMER_FREQUENCY; 903 vxtime_hz = PM_TIMER_FREQUENCY;
918 timename = "PM"; 904 timename = "PM";
919 pit_init(); 905 pit_init();
@@ -932,7 +918,7 @@ void __init time_init(void)
932 vxtime.mode = VXTIME_TSC; 918 vxtime.mode = VXTIME_TSC;
933 vxtime.quot = (1000000L << 32) / vxtime_hz; 919 vxtime.quot = (1000000L << 32) / vxtime_hz;
934 vxtime.tsc_quot = (1000L << 32) / cpu_khz; 920 vxtime.tsc_quot = (1000L << 32) / cpu_khz;
935 rdtscll_sync(&vxtime.last_tsc); 921 vxtime.last_tsc = get_cycles_sync();
936 setup_irq(0, &irq0); 922 setup_irq(0, &irq0);
937 923
938 set_cyc2ns_scale(cpu_khz); 924 set_cyc2ns_scale(cpu_khz);
@@ -946,7 +932,7 @@ void __init time_init(void)
946 * Make an educated guess if the TSC is trustworthy and synchronized 932 * Make an educated guess if the TSC is trustworthy and synchronized
947 * over all CPUs. 933 * over all CPUs.
948 */ 934 */
949static __init int unsynchronized_tsc(void) 935__init int unsynchronized_tsc(void)
950{ 936{
951#ifdef CONFIG_SMP 937#ifdef CONFIG_SMP
952 if (oem_force_hpet_timer()) 938 if (oem_force_hpet_timer())
@@ -957,7 +943,7 @@ static __init int unsynchronized_tsc(void)
957 return 0; 943 return 0;
958#endif 944#endif
959 /* Assume multi socket systems are not synchronized */ 945 /* Assume multi socket systems are not synchronized */
960 return num_online_cpus() > 1; 946 return num_present_cpus() > 1;
961} 947}
962 948
963/* 949/*
@@ -997,6 +983,10 @@ __setup("report_lost_ticks", time_setup);
997static long clock_cmos_diff; 983static long clock_cmos_diff;
998static unsigned long sleep_start; 984static unsigned long sleep_start;
999 985
986/*
987 * sysfs support for the timer.
988 */
989
1000static int timer_suspend(struct sys_device *dev, pm_message_t state) 990static int timer_suspend(struct sys_device *dev, pm_message_t state)
1001{ 991{
1002 /* 992 /*
@@ -1039,7 +1029,6 @@ static struct sysdev_class timer_sysclass = {
1039 set_kset_name("timer"), 1029 set_kset_name("timer"),
1040}; 1030};
1041 1031
1042
1043/* XXX this driverfs stuff should probably go elsewhere later -john */ 1032/* XXX this driverfs stuff should probably go elsewhere later -john */
1044static struct sys_device device_timer = { 1033static struct sys_device device_timer = {
1045 .id = 0, 1034 .id = 0,
@@ -1073,8 +1062,6 @@ device_initcall(time_init_device);
1073 */ 1062 */
1074#include <linux/rtc.h> 1063#include <linux/rtc.h>
1075 1064
1076extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs);
1077
1078#define DEFAULT_RTC_INT_FREQ 64 1065#define DEFAULT_RTC_INT_FREQ 64
1079#define RTC_NUM_INTS 1 1066#define RTC_NUM_INTS 1
1080 1067
@@ -1283,8 +1270,6 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
1283} 1270}
1284#endif 1271#endif
1285 1272
1286
1287
1288static int __init nohpet_setup(char *s) 1273static int __init nohpet_setup(char *s)
1289{ 1274{
1290 nohpet = 1; 1275 nohpet = 1;
@@ -1301,5 +1286,3 @@ static int __init notsc_setup(char *s)
1301} 1286}
1302 1287
1303__setup("notsc", notsc_setup); 1288__setup("notsc", notsc_setup);
1304
1305
diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S
index 6d9c9a8e7d0e..23a03eb91fc7 100644
--- a/arch/x86_64/kernel/trampoline.S
+++ b/arch/x86_64/kernel/trampoline.S
@@ -42,8 +42,15 @@ r_base = .
42 movl $0xA5A5A5A5, trampoline_data - r_base 42 movl $0xA5A5A5A5, trampoline_data - r_base
 43 # write marker so the master knows we're running 43 # write marker so the master knows we're running
44 44
45 lidt idt_48 - r_base # load idt with 0, 0 45 /*
46 lgdt gdt_48 - r_base # load gdt with whatever is appropriate 46 * GDT tables in non default location kernel can be beyond 16MB and
47 * lgdt will not be able to load the address as in real mode default
48 * operand size is 16bit. Use lgdtl instead to force operand size
49 * to 32 bit.
50 */
51
52 lidtl idt_48 - r_base # load idt with 0, 0
53 lgdtl gdt_48 - r_base # load gdt with whatever is appropriate
47 54
48 xor %ax, %ax 55 xor %ax, %ax
49 inc %ax # protected mode (PE) bit 56 inc %ax # protected mode (PE) bit
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index b5e09e6b5536..8bb0aeda78b9 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -70,7 +70,6 @@ asmlinkage void reserved(void);
70asmlinkage void alignment_check(void); 70asmlinkage void alignment_check(void);
71asmlinkage void machine_check(void); 71asmlinkage void machine_check(void);
72asmlinkage void spurious_interrupt_bug(void); 72asmlinkage void spurious_interrupt_bug(void);
73asmlinkage void call_debug(void);
74 73
75struct notifier_block *die_chain; 74struct notifier_block *die_chain;
76static DEFINE_SPINLOCK(die_notifier_lock); 75static DEFINE_SPINLOCK(die_notifier_lock);
@@ -121,19 +120,31 @@ int printk_address(unsigned long address)
121static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 120static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
122 unsigned *usedp, const char **idp) 121 unsigned *usedp, const char **idp)
123{ 122{
124 static const char ids[N_EXCEPTION_STACKS][8] = { 123 static char ids[][8] = {
125 [DEBUG_STACK - 1] = "#DB", 124 [DEBUG_STACK - 1] = "#DB",
126 [NMI_STACK - 1] = "NMI", 125 [NMI_STACK - 1] = "NMI",
127 [DOUBLEFAULT_STACK - 1] = "#DF", 126 [DOUBLEFAULT_STACK - 1] = "#DF",
128 [STACKFAULT_STACK - 1] = "#SS", 127 [STACKFAULT_STACK - 1] = "#SS",
129 [MCE_STACK - 1] = "#MC", 128 [MCE_STACK - 1] = "#MC",
129#if DEBUG_STKSZ > EXCEPTION_STKSZ
130 [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
131#endif
130 }; 132 };
131 unsigned k; 133 unsigned k;
132 134
133 for (k = 0; k < N_EXCEPTION_STACKS; k++) { 135 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
134 unsigned long end; 136 unsigned long end;
135 137
136 end = per_cpu(init_tss, cpu).ist[k]; 138 switch (k + 1) {
139#if DEBUG_STKSZ > EXCEPTION_STKSZ
140 case DEBUG_STACK:
141 end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
142 break;
143#endif
144 default:
145 end = per_cpu(init_tss, cpu).ist[k];
146 break;
147 }
137 if (stack >= end) 148 if (stack >= end)
138 continue; 149 continue;
139 if (stack >= end - EXCEPTION_STKSZ) { 150 if (stack >= end - EXCEPTION_STKSZ) {
@@ -143,6 +154,22 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
143 *idp = ids[k]; 154 *idp = ids[k];
144 return (unsigned long *)end; 155 return (unsigned long *)end;
145 } 156 }
157#if DEBUG_STKSZ > EXCEPTION_STKSZ
158 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
159 unsigned j = N_EXCEPTION_STACKS - 1;
160
161 do {
162 ++j;
163 end -= EXCEPTION_STKSZ;
164 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
165 } while (stack < end - EXCEPTION_STKSZ);
166 if (*usedp & (1U << j))
167 break;
168 *usedp |= 1U << j;
169 *idp = ids[j];
170 return (unsigned long *)end;
171 }
172#endif
146 } 173 }
147 return NULL; 174 return NULL;
148} 175}
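
The #DB[?] handling added above treats the oversized debug stack as several EXCEPTION_STKSZ sub-stacks and labels the one containing the stack pointer. A sketch of that index computation with illustrative sizes (the kernel patches the label string in place instead of returning an index):

    #include <stdio.h>

    #define EXCEPTION_STKSZ 4096UL
    #define DEBUG_STKSZ     (4 * EXCEPTION_STKSZ)

    static int debug_substack(unsigned long base, unsigned long sp)
    {
        unsigned long end = base + DEBUG_STKSZ;
        int idx = 0;

        if (sp < base || sp >= end)
            return -1;
        while (sp < end - EXCEPTION_STKSZ) { /* mirrors the loop above */
            end -= EXCEPTION_STKSZ;
            idx++;
        }
        return idx; /* 0 = topmost piece; printed as "#DB[idx]" */
    }

    int main(void)
    {
        unsigned long base = 0x10000;
        printf("#DB[%d]\n", debug_substack(base, base + 5000));
        return 0;
    }
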
@@ -156,9 +183,8 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
156 183
157void show_trace(unsigned long *stack) 184void show_trace(unsigned long *stack)
158{ 185{
159 unsigned long addr;
160 const unsigned cpu = safe_smp_processor_id(); 186 const unsigned cpu = safe_smp_processor_id();
161 unsigned long *irqstack_end = (unsigned long *)cpu_pda[cpu].irqstackptr; 187 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
162 int i; 188 int i;
163 unsigned used = 0; 189 unsigned used = 0;
164 190
@@ -166,8 +192,14 @@ void show_trace(unsigned long *stack)
166 192
167#define HANDLE_STACK(cond) \ 193#define HANDLE_STACK(cond) \
168 do while (cond) { \ 194 do while (cond) { \
169 addr = *stack++; \ 195 unsigned long addr = *stack++; \
170 if (kernel_text_address(addr)) { \ 196 if (kernel_text_address(addr)) { \
197 if (i > 50) { \
198 printk("\n "); \
199 i = 0; \
200 } \
201 else \
202 i += printk(" "); \
171 /* \ 203 /* \
172 * If the address is either in the text segment of the \ 204 * If the address is either in the text segment of the \
173 * kernel, or in the region which contains vmalloc'ed \ 205 * kernel, or in the region which contains vmalloc'ed \
@@ -177,25 +209,19 @@ void show_trace(unsigned long *stack)
177 * out the call path that was taken. \ 209 * out the call path that was taken. \
178 */ \ 210 */ \
179 i += printk_address(addr); \ 211 i += printk_address(addr); \
180 if (i > 50) { \
181 printk("\n "); \
182 i = 0; \
183 } \
184 else \
185 i += printk(" "); \
186 } \ 212 } \
187 } while (0) 213 } while (0)
188 214
189 for(i = 0; ; ) { 215 for(i = 11; ; ) {
190 const char *id; 216 const char *id;
191 unsigned long *estack_end; 217 unsigned long *estack_end;
192 estack_end = in_exception_stack(cpu, (unsigned long)stack, 218 estack_end = in_exception_stack(cpu, (unsigned long)stack,
193 &used, &id); 219 &used, &id);
194 220
195 if (estack_end) { 221 if (estack_end) {
196 i += printk(" <%s> ", id); 222 i += printk(" <%s>", id);
197 HANDLE_STACK (stack < estack_end); 223 HANDLE_STACK (stack < estack_end);
198 i += printk(" <EOE> "); 224 i += printk(" <EOE>");
199 stack = (unsigned long *) estack_end[-2]; 225 stack = (unsigned long *) estack_end[-2];
200 continue; 226 continue;
201 } 227 }
@@ -205,11 +231,11 @@ void show_trace(unsigned long *stack)
205 (IRQSTACKSIZE - 64) / sizeof(*irqstack); 231 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
206 232
207 if (stack >= irqstack && stack < irqstack_end) { 233 if (stack >= irqstack && stack < irqstack_end) {
208 i += printk(" <IRQ> "); 234 i += printk(" <IRQ>");
209 HANDLE_STACK (stack < irqstack_end); 235 HANDLE_STACK (stack < irqstack_end);
210 stack = (unsigned long *) (irqstack_end[-1]); 236 stack = (unsigned long *) (irqstack_end[-1]);
211 irqstack_end = NULL; 237 irqstack_end = NULL;
212 i += printk(" <EOI> "); 238 i += printk(" <EOI>");
213 continue; 239 continue;
214 } 240 }
215 } 241 }
@@ -226,8 +252,8 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp)
226 unsigned long *stack; 252 unsigned long *stack;
227 int i; 253 int i;
228 const int cpu = safe_smp_processor_id(); 254 const int cpu = safe_smp_processor_id();
229 unsigned long *irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr); 255 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
230 unsigned long *irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE); 256 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
231 257
232 // debugging aid: "show_stack(NULL, NULL);" prints the 258 // debugging aid: "show_stack(NULL, NULL);" prints the
233 // back trace for this cpu. 259 // back trace for this cpu.
@@ -275,14 +301,14 @@ void show_registers(struct pt_regs *regs)
275 int in_kernel = !user_mode(regs); 301 int in_kernel = !user_mode(regs);
276 unsigned long rsp; 302 unsigned long rsp;
277 const int cpu = safe_smp_processor_id(); 303 const int cpu = safe_smp_processor_id();
278 struct task_struct *cur = cpu_pda[cpu].pcurrent; 304 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
279 305
280 rsp = regs->rsp; 306 rsp = regs->rsp;
281 307
282 printk("CPU %d ", cpu); 308 printk("CPU %d ", cpu);
283 __show_regs(regs); 309 __show_regs(regs);
284 printk("Process %s (pid: %d, threadinfo %p, task %p)\n", 310 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
285 cur->comm, cur->pid, cur->thread_info, cur); 311 cur->comm, cur->pid, task_thread_info(cur), cur);
286 312
287 /* 313 /*
288 * When in-kernel, we also print out the stack and code at the 314 * When in-kernel, we also print out the stack and code at the
@@ -314,20 +340,26 @@ bad:
314void handle_BUG(struct pt_regs *regs) 340void handle_BUG(struct pt_regs *regs)
315{ 341{
316 struct bug_frame f; 342 struct bug_frame f;
317 char tmp; 343 long len;
344 const char *prefix = "";
318 345
319 if (user_mode(regs)) 346 if (user_mode(regs))
320 return; 347 return;
321 if (__copy_from_user(&f, (struct bug_frame *) regs->rip, 348 if (__copy_from_user(&f, (const void __user *) regs->rip,
322 sizeof(struct bug_frame))) 349 sizeof(struct bug_frame)))
323 return; 350 return;
324 if (f.filename >= 0 || 351 if (f.filename >= 0 ||
325 f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) 352 f.ud2[0] != 0x0f || f.ud2[1] != 0x0b)
326 return; 353 return;
327 if (__get_user(tmp, (char *)(long)f.filename)) 354 len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1;
355 if (len < 0 || len >= PATH_MAX)
328 f.filename = (int)(long)"unmapped filename"; 356 f.filename = (int)(long)"unmapped filename";
357 else if (len > 50) {
358 f.filename += len - 50;
359 prefix = "...";
360 }
329 printk("----------- [cut here ] --------- [please bite here ] ---------\n"); 361 printk("----------- [cut here ] --------- [please bite here ] ---------\n");
330 printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", (char *)(long)f.filename, f.line); 362 printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line);
331} 363}
332 364
333#ifdef CONFIG_BUG 365#ifdef CONFIG_BUG
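
handle_BUG() above now shortens long source paths to their last 50 characters and marks the cut with "...". The same trick in isolation (the sample path is made up):

    #include <stdio.h>
    #include <string.h>

    static void print_bug(const char *filename, int line)
    {
        const char *prefix = "";
        size_t len = strlen(filename);

        if (len > 50) {
            filename += len - 50; /* keep only the tail of the path */
            prefix = "...";
        }
        printf("Kernel BUG at %s%.50s:%d\n", prefix, filename, line);
    }

    int main(void)
    {
        print_bug("drivers/net/some/very/deeply/nested/path/to/a/driver_file.c", 42);
        return 0;
    }
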
@@ -382,7 +414,7 @@ void __die(const char * str, struct pt_regs * regs, long err)
382 printk("DEBUG_PAGEALLOC"); 414 printk("DEBUG_PAGEALLOC");
383#endif 415#endif
384 printk("\n"); 416 printk("\n");
385 notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); 417 notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
386 show_registers(regs); 418 show_registers(regs);
387 /* Executive summary in case the oops scrolled away */ 419 /* Executive summary in case the oops scrolled away */
388 printk(KERN_ALERT "RIP "); 420 printk(KERN_ALERT "RIP ");
@@ -399,11 +431,6 @@ void die(const char * str, struct pt_regs * regs, long err)
399 oops_end(flags); 431 oops_end(flags);
400 do_exit(SIGSEGV); 432 do_exit(SIGSEGV);
401} 433}
402static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
403{
404 if (!(regs->eflags & VM_MASK) && (regs->cs == __KERNEL_CS))
405 die(str, regs, err);
406}
407 434
408void die_nmi(char *str, struct pt_regs *regs) 435void die_nmi(char *str, struct pt_regs *regs)
409{ 436{
@@ -426,32 +453,20 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
426 struct pt_regs * regs, long error_code, 453 struct pt_regs * regs, long error_code,
427 siginfo_t *info) 454 siginfo_t *info)
428{ 455{
456 struct task_struct *tsk = current;
457
429 conditional_sti(regs); 458 conditional_sti(regs);
430 459
431#ifdef CONFIG_CHECKING 460 tsk->thread.error_code = error_code;
432 { 461 tsk->thread.trap_no = trapnr;
433 unsigned long gs;
434 struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
435 rdmsrl(MSR_GS_BASE, gs);
436 if (gs != (unsigned long)pda) {
437 wrmsrl(MSR_GS_BASE, pda);
438 printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda,
439 regs->rip);
440 }
441 }
442#endif
443 462
444 if (user_mode(regs)) { 463 if (user_mode(regs)) {
445 struct task_struct *tsk = current;
446
447 if (exception_trace && unhandled_signal(tsk, signr)) 464 if (exception_trace && unhandled_signal(tsk, signr))
448 printk(KERN_INFO 465 printk(KERN_INFO
449 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", 466 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
450 tsk->comm, tsk->pid, str, 467 tsk->comm, tsk->pid, str,
451 regs->rip,regs->rsp,error_code); 468 regs->rip,regs->rsp,error_code);
452 469
453 tsk->thread.error_code = error_code;
454 tsk->thread.trap_no = trapnr;
455 if (info) 470 if (info)
456 force_sig_info(signr, info, tsk); 471 force_sig_info(signr, info, tsk);
457 else 472 else
@@ -498,7 +513,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
498DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) 513DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
499DO_ERROR( 4, SIGSEGV, "overflow", overflow) 514DO_ERROR( 4, SIGSEGV, "overflow", overflow)
500DO_ERROR( 5, SIGSEGV, "bounds", bounds) 515DO_ERROR( 5, SIGSEGV, "bounds", bounds)
501DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->rip) 516DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
502DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) 517DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
503DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 518DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
504DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 519DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
@@ -506,38 +521,41 @@ DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
506DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) 521DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
507DO_ERROR(18, SIGSEGV, "reserved", reserved) 522DO_ERROR(18, SIGSEGV, "reserved", reserved)
508DO_ERROR(12, SIGBUS, "stack segment", stack_segment) 523DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
509DO_ERROR( 8, SIGSEGV, "double fault", double_fault) 524
525asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
526{
527 static const char str[] = "double fault";
528 struct task_struct *tsk = current;
529
 530 /* Return not checked because a double fault cannot be ignored */
531 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
532
533 tsk->thread.error_code = error_code;
534 tsk->thread.trap_no = 8;
535
536 /* This is always a kernel trap and never fixable (and thus must
537 never return). */
538 for (;;)
539 die(str, regs, error_code);
540}
510 541
511asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, 542asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
512 long error_code) 543 long error_code)
513{ 544{
545 struct task_struct *tsk = current;
546
514 conditional_sti(regs); 547 conditional_sti(regs);
515 548
516#ifdef CONFIG_CHECKING 549 tsk->thread.error_code = error_code;
517 { 550 tsk->thread.trap_no = 13;
518 unsigned long gs;
519 struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
520 rdmsrl(MSR_GS_BASE, gs);
521 if (gs != (unsigned long)pda) {
522 wrmsrl(MSR_GS_BASE, pda);
523 oops_in_progress++;
524 printk("general protection handler: wrong gs %lx expected %p\n", gs, pda);
525 oops_in_progress--;
526 }
527 }
528#endif
529 551
530 if (user_mode(regs)) { 552 if (user_mode(regs)) {
531 struct task_struct *tsk = current;
532
533 if (exception_trace && unhandled_signal(tsk, SIGSEGV)) 553 if (exception_trace && unhandled_signal(tsk, SIGSEGV))
534 printk(KERN_INFO 554 printk(KERN_INFO
535 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", 555 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
536 tsk->comm, tsk->pid, 556 tsk->comm, tsk->pid,
537 regs->rip,regs->rsp,error_code); 557 regs->rip,regs->rsp,error_code);
538 558
539 tsk->thread.error_code = error_code;
540 tsk->thread.trap_no = 13;
541 force_sig(SIGSEGV, tsk); 559 force_sig(SIGSEGV, tsk);
542 return; 560 return;
543 } 561 }
@@ -600,7 +618,7 @@ asmlinkage void default_do_nmi(struct pt_regs *regs)
600 reason = get_nmi_reason(); 618 reason = get_nmi_reason();
601 619
602 if (!(reason & 0xc0)) { 620 if (!(reason & 0xc0)) {
603 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) 621 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
604 == NOTIFY_STOP) 622 == NOTIFY_STOP)
605 return; 623 return;
606#ifdef CONFIG_X86_LOCAL_APIC 624#ifdef CONFIG_X86_LOCAL_APIC
@@ -616,7 +634,7 @@ asmlinkage void default_do_nmi(struct pt_regs *regs)
616 unknown_nmi_error(reason, regs); 634 unknown_nmi_error(reason, regs);
617 return; 635 return;
618 } 636 }
619 if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP) 637 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
620 return; 638 return;
621 639
622 /* AK: following checks seem to be broken on modern chipsets. FIXME */ 640 /* AK: following checks seem to be broken on modern chipsets. FIXME */
@@ -627,6 +645,7 @@ asmlinkage void default_do_nmi(struct pt_regs *regs)
627 io_check_error(reason, regs); 645 io_check_error(reason, regs);
628} 646}
629 647
648/* runs on IST stack. */
630asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) 649asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
631{ 650{
632 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { 651 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
@@ -647,7 +666,7 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *eregs)
647 ; 666 ;
648 /* Exception from user space */ 667 /* Exception from user space */
649 else if (user_mode(eregs)) 668 else if (user_mode(eregs))
650 regs = ((struct pt_regs *)current->thread.rsp0) - 1; 669 regs = task_pt_regs(current);
651 /* Exception from kernel and interrupts are enabled. Move to 670 /* Exception from kernel and interrupts are enabled. Move to
652 kernel process stack. */ 671 kernel process stack. */
653 else if (eregs->eflags & X86_EFLAGS_IF) 672 else if (eregs->eflags & X86_EFLAGS_IF)
@@ -665,19 +684,6 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
665 struct task_struct *tsk = current; 684 struct task_struct *tsk = current;
666 siginfo_t info; 685 siginfo_t info;
667 686
668#ifdef CONFIG_CHECKING
669 {
670 /* RED-PEN interaction with debugger - could destroy gs */
671 unsigned long gs;
672 struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
673 rdmsrl(MSR_GS_BASE, gs);
674 if (gs != (unsigned long)pda) {
675 wrmsrl(MSR_GS_BASE, pda);
676 printk("debug handler: wrong gs %lx expected %p\n", gs, pda);
677 }
678 }
679#endif
680
681 get_debugreg(condition, 6); 687 get_debugreg(condition, 6);
682 688
683 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 689 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
@@ -724,11 +730,9 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
724 info.si_signo = SIGTRAP; 730 info.si_signo = SIGTRAP;
725 info.si_errno = 0; 731 info.si_errno = 0;
726 info.si_code = TRAP_BRKPT; 732 info.si_code = TRAP_BRKPT;
727 if (!user_mode(regs)) 733 info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
728 goto clear_dr7; 734 force_sig_info(SIGTRAP, &info, tsk);
729 735
730 info.si_addr = (void __user *)regs->rip;
731 force_sig_info(SIGTRAP, &info, tsk);
732clear_dr7: 736clear_dr7:
733 set_debugreg(0UL, 7); 737 set_debugreg(0UL, 7);
734 return; 738 return;
@@ -738,7 +742,7 @@ clear_TF_reenable:
738 regs->eflags &= ~TF_MASK; 742 regs->eflags &= ~TF_MASK;
739} 743}
740 744
741static int kernel_math_error(struct pt_regs *regs, char *str) 745static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
742{ 746{
743 const struct exception_table_entry *fixup; 747 const struct exception_table_entry *fixup;
744 fixup = search_exception_tables(regs->rip); 748 fixup = search_exception_tables(regs->rip);
@@ -746,8 +750,9 @@ static int kernel_math_error(struct pt_regs *regs, char *str)
746 regs->rip = fixup->fixup; 750 regs->rip = fixup->fixup;
747 return 1; 751 return 1;
748 } 752 }
749 notify_die(DIE_GPF, str, regs, 0, 16, SIGFPE); 753 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
750 /* Illegal floating point operation in the kernel */ 754 /* Illegal floating point operation in the kernel */
755 current->thread.trap_no = trapnr;
751 die(str, regs, 0); 756 die(str, regs, 0);
752 return 0; 757 return 0;
753} 758}
@@ -766,7 +771,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
766 771
767 conditional_sti(regs); 772 conditional_sti(regs);
768 if (!user_mode(regs) && 773 if (!user_mode(regs) &&
769 kernel_math_error(regs, "kernel x87 math error")) 774 kernel_math_error(regs, "kernel x87 math error", 16))
770 return; 775 return;
771 776
772 /* 777 /*
@@ -835,7 +840,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
835 840
836 conditional_sti(regs); 841 conditional_sti(regs);
837 if (!user_mode(regs) && 842 if (!user_mode(regs) &&
838 kernel_math_error(regs, "kernel simd math error")) 843 kernel_math_error(regs, "kernel simd math error", 19))
839 return; 844 return;
840 845
841 /* 846 /*
@@ -888,6 +893,10 @@ asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
888{ 893{
889} 894}
890 895
896asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
897{
898}
899
891/* 900/*
892 * 'math_state_restore()' saves the current math information in the 901 * 'math_state_restore()' saves the current math information in the
893 * old math state array, and gets the new ones from the current task 902 * old math state array, and gets the new ones from the current task
@@ -903,12 +912,7 @@ asmlinkage void math_state_restore(void)
903 if (!used_math()) 912 if (!used_math())
904 init_fpu(me); 913 init_fpu(me);
905 restore_fpu_checking(&me->thread.i387.fxsave); 914 restore_fpu_checking(&me->thread.i387.fxsave);
906 me->thread_info->status |= TS_USEDFPU; 915 task_thread_info(me)->status |= TS_USEDFPU;
907}
908
909void do_call_debug(struct pt_regs *regs)
910{
911 notify_die(DIE_CALL, "debug call", regs, 0, 255, SIGINT);
912} 916}
913 917
914void __init trap_init(void) 918void __init trap_init(void)
@@ -916,9 +920,9 @@ void __init trap_init(void)
916 set_intr_gate(0,&divide_error); 920 set_intr_gate(0,&divide_error);
917 set_intr_gate_ist(1,&debug,DEBUG_STACK); 921 set_intr_gate_ist(1,&debug,DEBUG_STACK);
918 set_intr_gate_ist(2,&nmi,NMI_STACK); 922 set_intr_gate_ist(2,&nmi,NMI_STACK);
919 set_system_gate(3,&int3); 923 set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */
920 set_system_gate(4,&overflow); /* int4-5 can be called from all */ 924 set_system_gate(4,&overflow); /* int4 can be called from all */
921 set_system_gate(5,&bounds); 925 set_intr_gate(5,&bounds);
922 set_intr_gate(6,&invalid_op); 926 set_intr_gate(6,&invalid_op);
923 set_intr_gate(7,&device_not_available); 927 set_intr_gate(7,&device_not_available);
924 set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK); 928 set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK);
@@ -940,8 +944,6 @@ void __init trap_init(void)
940 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 944 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
941#endif 945#endif
942 946
943 set_intr_gate(KDB_VECTOR, call_debug);
944
945 /* 947 /*
946 * Should be a barrier for any external CPU state. 948 * Should be a barrier for any external CPU state.
947 */ 949 */
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 6dd642cad2ef..b0eed1faf740 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -8,6 +8,8 @@
8#include <asm/page.h> 8#include <asm/page.h>
9#include <linux/config.h> 9#include <linux/config.h>
10 10
11#undef i386 /* in case the preprocessor is a 32bit one */
12
11OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") 13OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
12OUTPUT_ARCH(i386:x86-64) 14OUTPUT_ARCH(i386:x86-64)
13ENTRY(phys_startup_64) 15ENTRY(phys_startup_64)
@@ -50,7 +52,7 @@ SECTIONS
50 *(.bss.page_aligned) 52 *(.bss.page_aligned)
51 *(.bss) 53 *(.bss)
52 } 54 }
53 __bss_end = .; 55 __bss_stop = .;
54 56
55 . = ALIGN(PAGE_SIZE); 57 . = ALIGN(PAGE_SIZE);
56 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 58 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
@@ -189,7 +191,7 @@ SECTIONS
189 /* Sections to be discarded */ 191 /* Sections to be discarded */
190 /DISCARD/ : { 192 /DISCARD/ : {
191 *(.exitcall.exit) 193 *(.exitcall.exit)
192#ifndef CONFIG_DEBUG_INFO 194#ifndef CONFIG_UNWIND_INFO
193 *(.eh_frame) 195 *(.eh_frame)
194#endif 196#endif
195 } 197 }
diff --git a/arch/x86_64/kernel/vsmp.c b/arch/x86_64/kernel/vsmp.c
new file mode 100644
index 000000000000..92f70c74965f
--- /dev/null
+++ b/arch/x86_64/kernel/vsmp.c
@@ -0,0 +1,45 @@
1/*
2 * vSMPowered(tm) systems specific initialization
3 * Copyright (C) 2005 ScaleMP Inc.
4 *
5 * Use of this code is subject to the terms and conditions of the
6 * GNU general public license version 2. See "COPYING" or
7 * http://www.gnu.org/licenses/gpl.html
8 *
9 * Ravikiran Thirumalai <kiran@scalemp.com>,
10 * Shai Fultheim <shai@scalemp.com>
11 */
12
13#include <linux/init.h>
14#include <linux/pci_ids.h>
15#include <linux/pci_regs.h>
16#include <asm/pci-direct.h>
17
18static int __init vsmp_init(void)
19{
20 void *address;
21 unsigned int cap, ctl;
22
23 /* Check if we are running on a ScaleMP vSMP box */
24 if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) ||
25 (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
26 return 0;
27
28 /* set vSMP magic bits to indicate vSMP capable kernel */
29 address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8);
30 cap = readl(address);
31 ctl = readl(address + 4);
32 printk("vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl);
33 if (cap & ctl & (1 << 4)) {
34 /* Turn on vSMP IRQ fastpath handling (see system.h) */
35 ctl &= ~(1 << 4);
36 writel(ctl, address + 4);
37 ctl = readl(address + 4);
38 printk("vSMP CTL: control set to:0x%08x\n", ctl);
39 }
40
41 iounmap(address);
42 return 0;
43}
44
45core_initcall(vsmp_init);
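
vsmp_init() above probes the chipset with read_pci_config_16() from asm/pci-direct.h. For reference, a userspace sketch of the underlying type-1 configuration mechanism on ports 0xCF8/0xCFC; running it for real needs root and iopl(3), and the layout constants below are the standard PCI ones, not taken from this patch:

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/io.h>

    static uint32_t read_pci_config(uint8_t bus, uint8_t dev, uint8_t fn,
                                    uint8_t off)
    {
        uint32_t addr = 0x80000000u | (bus << 16) | (dev << 11) |
                        (fn << 8) | (off & 0xfc);

        outl(addr, 0xcf8); /* select bus/device/function/register */
        return inl(0xcfc); /* read the 32-bit config dword */
    }

    int main(void)
    {
        if (iopl(3)) { perror("iopl"); return 1; }
        /* device 0x1f function 0 on bus 0, vendor/device IDs at offset 0 */
        printf("id dword: 0x%08x\n", read_pci_config(0, 0x1f, 0, 0));
        return 0;
    }
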
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 70a0bd16085f..9468fb20b0bc 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -35,14 +35,13 @@
35#include <asm/io.h> 35#include <asm/io.h>
36 36
37#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) 37#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
38#define force_inline __attribute__((always_inline)) inline
39 38
40int __sysctl_vsyscall __section_sysctl_vsyscall = 1; 39int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
41seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; 40seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
42 41
43#include <asm/unistd.h> 42#include <asm/unistd.h>
44 43
45static force_inline void timeval_normalize(struct timeval * tv) 44static __always_inline void timeval_normalize(struct timeval * tv)
46{ 45{
47 time_t __sec; 46 time_t __sec;
48 47
@@ -53,7 +52,7 @@ static force_inline void timeval_normalize(struct timeval * tv)
53 } 52 }
54} 53}
55 54
56static force_inline void do_vgettimeofday(struct timeval * tv) 55static __always_inline void do_vgettimeofday(struct timeval * tv)
57{ 56{
58 long sequence, t; 57 long sequence, t;
59 unsigned long sec, usec; 58 unsigned long sec, usec;
@@ -66,8 +65,7 @@ static force_inline void do_vgettimeofday(struct timeval * tv)
66 (__jiffies - __wall_jiffies) * (1000000 / HZ); 65 (__jiffies - __wall_jiffies) * (1000000 / HZ);
67 66
68 if (__vxtime.mode != VXTIME_HPET) { 67 if (__vxtime.mode != VXTIME_HPET) {
69 sync_core(); 68 t = get_cycles_sync();
70 rdtscll(t);
71 if (t < __vxtime.last_tsc) 69 if (t < __vxtime.last_tsc)
72 t = __vxtime.last_tsc; 70 t = __vxtime.last_tsc;
73 usec += ((t - __vxtime.last_tsc) * 71 usec += ((t - __vxtime.last_tsc) *
@@ -84,12 +82,12 @@ static force_inline void do_vgettimeofday(struct timeval * tv)
84} 82}
85 83
86/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */ 84/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
87static force_inline void do_get_tz(struct timezone * tz) 85static __always_inline void do_get_tz(struct timezone * tz)
88{ 86{
89 *tz = __sys_tz; 87 *tz = __sys_tz;
90} 88}
91 89
92static force_inline int gettimeofday(struct timeval *tv, struct timezone *tz) 90static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
93{ 91{
94 int ret; 92 int ret;
95 asm volatile("vsysc2: syscall" 93 asm volatile("vsysc2: syscall"
@@ -98,7 +96,7 @@ static force_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
98 return ret; 96 return ret;
99} 97}
100 98
101static force_inline long time_syscall(long *t) 99static __always_inline long time_syscall(long *t)
102{ 100{
103 long secs; 101 long secs;
104 asm volatile("vsysc1: syscall" 102 asm volatile("vsysc1: syscall"
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index fd99ddd009bc..b614d54d2ae4 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -13,7 +13,6 @@
13#include <linux/string.h> 13#include <linux/string.h>
14#include <linux/syscalls.h> 14#include <linux/syscalls.h>
15#include <linux/tty.h> 15#include <linux/tty.h>
16#include <linux/ioctl32.h>
17 16
18#include <asm/semaphore.h> 17#include <asm/semaphore.h>
19#include <asm/processor.h> 18#include <asm/processor.h>
@@ -45,22 +44,15 @@ extern struct drive_info_struct drive_info;
45EXPORT_SYMBOL(drive_info); 44EXPORT_SYMBOL(drive_info);
46#endif 45#endif
47 46
48extern unsigned long get_cmos_time(void);
49
50/* platform dependent support */ 47/* platform dependent support */
51EXPORT_SYMBOL(boot_cpu_data); 48EXPORT_SYMBOL(boot_cpu_data);
52//EXPORT_SYMBOL(dump_fpu); 49//EXPORT_SYMBOL(dump_fpu);
 EXPORT_SYMBOL(__ioremap);
 EXPORT_SYMBOL(ioremap_nocache);
 EXPORT_SYMBOL(iounmap);
-EXPORT_SYMBOL(enable_irq);
-EXPORT_SYMBOL(disable_irq);
-EXPORT_SYMBOL(disable_irq_nosync);
-EXPORT_SYMBOL(probe_irq_mask);
 EXPORT_SYMBOL(kernel_thread);
 EXPORT_SYMBOL(pm_idle);
 EXPORT_SYMBOL(pm_power_off);
-EXPORT_SYMBOL(get_cmos_time);
 
 EXPORT_SYMBOL(__down_failed);
 EXPORT_SYMBOL(__down_failed_interruptible);
@@ -84,9 +76,6 @@ EXPORT_SYMBOL(__put_user_2);
 EXPORT_SYMBOL(__put_user_4);
 EXPORT_SYMBOL(__put_user_8);
 
-EXPORT_SYMBOL(strpbrk);
-EXPORT_SYMBOL(strstr);
-
 EXPORT_SYMBOL(strncpy_from_user);
 EXPORT_SYMBOL(__strncpy_from_user);
 EXPORT_SYMBOL(clear_user);
@@ -98,25 +87,18 @@ EXPORT_SYMBOL(copy_in_user);
 EXPORT_SYMBOL(strnlen_user);
 
 #ifdef CONFIG_PCI
-EXPORT_SYMBOL(pci_alloc_consistent);
-EXPORT_SYMBOL(pci_free_consistent);
-#endif
-
-#ifdef CONFIG_PCI
 EXPORT_SYMBOL(pci_mem_start);
 #endif
 
 EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
-EXPORT_SYMBOL(cpu_pda);
+EXPORT_SYMBOL(_cpu_pda);
 #ifdef CONFIG_SMP
 EXPORT_SYMBOL(cpu_data);
-EXPORT_SYMBOL(cpu_online_map);
 EXPORT_SYMBOL(__write_lock_failed);
 EXPORT_SYMBOL(__read_lock_failed);
 
-EXPORT_SYMBOL(synchronize_irq);
 EXPORT_SYMBOL(smp_call_function);
 EXPORT_SYMBOL(cpu_callout_map);
 #endif
@@ -137,30 +119,17 @@ EXPORT_SYMBOL_GPL(unset_nmi_callback);
 #undef memcpy
 #undef memset
 #undef memmove
-#undef memchr
 #undef strlen
-#undef strncmp
-#undef strncpy
-#undef strchr
 
 extern void * memset(void *,int,__kernel_size_t);
 extern size_t strlen(const char *);
 extern void * memmove(void * dest,const void *src,size_t count);
-extern void *memchr(const void *s, int c, size_t n);
 extern void * memcpy(void *,const void *,__kernel_size_t);
 extern void * __memcpy(void *,const void *,__kernel_size_t);
 
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(strlen);
 EXPORT_SYMBOL(memmove);
-EXPORT_SYMBOL(strncmp);
-EXPORT_SYMBOL(strncpy);
-EXPORT_SYMBOL(strchr);
-EXPORT_SYMBOL(strncat);
-EXPORT_SYMBOL(memchr);
-EXPORT_SYMBOL(strrchr);
-EXPORT_SYMBOL(strnlen);
-EXPORT_SYMBOL(memscan);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__memcpy);
 
@@ -203,3 +172,6 @@ EXPORT_SYMBOL(flush_tlb_page);
 #endif
 
 EXPORT_SYMBOL(cpu_khz);
+
+EXPORT_SYMBOL(load_gs_index);
+
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
index 30a9da458c15..43d9fa136180 100644
--- a/arch/x86_64/lib/clear_page.S
+++ b/arch/x86_64/lib/clear_page.S
@@ -5,46 +5,8 @@
 	.globl clear_page
 	.p2align 4
 clear_page:
-	xorl %eax,%eax
-	movl $4096/64,%ecx
-	.p2align 4
-.Lloop:
-	decl %ecx
-#define PUT(x) movq %rax,x*8(%rdi)
-	movq %rax,(%rdi)
-	PUT(1)
-	PUT(2)
-	PUT(3)
-	PUT(4)
-	PUT(5)
-	PUT(6)
-	PUT(7)
-	leaq 64(%rdi),%rdi
-	jnz .Lloop
-	nop
-	ret
-clear_page_end:
-
-	/* C stepping K8 run faster using the string instructions.
-	   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
-	.section .altinstructions,"a"
-	.align 8
-	.quad clear_page
-	.quad clear_page_c
-	.byte X86_FEATURE_K8_C
-	.byte clear_page_end-clear_page
-	.byte clear_page_c_end-clear_page_c
-	.previous
-
-	.section .altinstr_replacement,"ax"
-clear_page_c:
 	movl $4096/8,%ecx
 	xorl %eax,%eax
 	rep
 	stosq
 	ret
-clear_page_c_end:
-	.previous
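
The `.altinstructions` entries removed above follow a fixed record layout that the boot-time alternatives patcher walks to swap in the `rep stosq` variant on K8 C-stepping CPUs. A hedged C sketch of that record, with field names chosen for illustration (the kernel's actual struct alt_instr may differ), as implied by the `.quad`/`.quad`/`.byte`/`.byte`/`.byte` directives:

    /* One .altinstructions record as implied by the removed directives.
     * Field names are illustrative, not the kernel's exact definition. */
    struct alt_instr_sketch {
            unsigned char *instr;          /* original code, e.g. clear_page */
            unsigned char *replacement;    /* variant, e.g. clear_page_c */
            unsigned char  cpuid;          /* feature bit: X86_FEATURE_K8_C */
            unsigned char  instrlen;       /* clear_page_end - clear_page */
            unsigned char  replacementlen; /* clear_page_c_end - clear_page_c */
    };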
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
index dd3aa47b6bf5..621a19769406 100644
--- a/arch/x86_64/lib/copy_page.S
+++ b/arch/x86_64/lib/copy_page.S
@@ -8,94 +8,7 @@
 	.globl copy_page
 	.p2align 4
 copy_page:
-	subq	$3*8,%rsp
-	movq	%rbx,(%rsp)
-	movq	%r12,1*8(%rsp)
-	movq	%r13,2*8(%rsp)
-
-	movl	$(4096/64)-5,%ecx
-	.p2align 4
-.Loop64:
-	dec	%rcx
-
-	movq	(%rsi), %rax
-	movq	8 (%rsi), %rbx
-	movq	16 (%rsi), %rdx
-	movq	24 (%rsi), %r8
-	movq	32 (%rsi), %r9
-	movq	40 (%rsi), %r10
-	movq	48 (%rsi), %r11
-	movq	56 (%rsi), %r12
-
-	prefetcht0 5*64(%rsi)
-
-	movq	%rax, (%rdi)
-	movq	%rbx, 8 (%rdi)
-	movq	%rdx, 16 (%rdi)
-	movq	%r8, 24 (%rdi)
-	movq	%r9, 32 (%rdi)
-	movq	%r10, 40 (%rdi)
-	movq	%r11, 48 (%rdi)
-	movq	%r12, 56 (%rdi)
-
-	leaq	64 (%rsi), %rsi
-	leaq	64 (%rdi), %rdi
-
-	jnz	.Loop64
-
-	movl	$5,%ecx
-	.p2align 4
-.Loop2:
-	decl	%ecx
-
-	movq	(%rsi), %rax
-	movq	8 (%rsi), %rbx
-	movq	16 (%rsi), %rdx
-	movq	24 (%rsi), %r8
-	movq	32 (%rsi), %r9
-	movq	40 (%rsi), %r10
-	movq	48 (%rsi), %r11
-	movq	56 (%rsi), %r12
-
-	movq	%rax, (%rdi)
-	movq	%rbx, 8 (%rdi)
-	movq	%rdx, 16 (%rdi)
-	movq	%r8, 24 (%rdi)
-	movq	%r9, 32 (%rdi)
-	movq	%r10, 40 (%rdi)
-	movq	%r11, 48 (%rdi)
-	movq	%r12, 56 (%rdi)
-
-	leaq	64(%rdi),%rdi
-	leaq	64(%rsi),%rsi
-
-	jnz	.Loop2
-
-	movq	(%rsp),%rbx
-	movq	1*8(%rsp),%r12
-	movq	2*8(%rsp),%r13
-	addq	$3*8,%rsp
-	ret
-
-	/* C stepping K8 run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
-	.section .altinstructions,"a"
-	.align 8
-	.quad copy_page
-	.quad copy_page_c
-	.byte X86_FEATURE_K8_C
-	.byte copy_page_c_end-copy_page_c
-	.byte copy_page_c_end-copy_page_c
-	.previous
-
-	.section .altinstr_replacement,"ax"
-copy_page_c:
 	movl $4096/8,%ecx
 	rep
 	movsq
 	ret
-copy_page_c_end:
-	.previous
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
index dfa358b05c89..79422b6559c3 100644
--- a/arch/x86_64/lib/copy_user.S
+++ b/arch/x86_64/lib/copy_user.S
@@ -4,12 +4,9 @@
  * Functions to copy from and to user space.
  */
 
-#define FIX_ALIGNMENT 1
-
 #include <asm/current.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
-#include <asm/cpufeature.h>
 
 /* Standard copy_to_user with segment limit checking */
 	.globl copy_to_user
@@ -21,23 +18,7 @@ copy_to_user:
 	jc  bad_to_user
 	cmpq threadinfo_addr_limit(%rax),%rcx
 	jae bad_to_user
-2:
-	.byte 0xe9	/* 32bit jump */
-	.long .Lcug-1f
-1:
-
-	.section .altinstr_replacement,"ax"
-3:	.byte 0xe9	/* replacement jmp with 8 bit immediate */
-	.long copy_user_generic_c-1b	/* offset */
-	.previous
-	.section .altinstructions,"a"
-	.align 8
-	.quad 2b
-	.quad 3b
-	.byte X86_FEATURE_K8_C
-	.byte 5
-	.byte 5
-	.previous
+	jmp copy_user_generic
 
 /* Standard copy_from_user with segment limit checking */
 	.globl copy_from_user
@@ -72,223 +53,44 @@ bad_to_user:
  * rsi source
  * rdx count
  *
+ * Only 4GB of copy is supported. This shouldn't be a problem
+ * because the kernel normally only writes from/to page sized chunks
+ * even if user space passed a longer buffer.
+ * And more would be dangerous because both Intel and AMD have
+ * errata with rep movsq > 4GB. If someone feels the need to fix
+ * this please consider this.
+ *
  * Output:
  * eax uncopied bytes or 0 if successful.
  */
-	.globl copy_user_generic
-	.p2align 4
-copy_user_generic:
-	.byte 0x66,0x66,0x90	/* 5 byte nop for replacement jump */
-	.byte 0x66,0x90
-1:
-	.section .altinstr_replacement,"ax"
-2:	.byte 0xe9	/* near jump with 32bit immediate */
-	.long copy_user_generic_c-1b	/* offset */
-	.previous
-	.section .altinstructions,"a"
-	.align 8
-	.quad copy_user_generic
-	.quad 2b
-	.byte X86_FEATURE_K8_C
-	.byte 5
-	.byte 5
-	.previous
-.Lcug:
-	pushq %rbx
-	xorl %eax,%eax	/*zero for the exception handler */
-
-#ifdef FIX_ALIGNMENT
-	/* check for bad alignment of destination */
-	movl %edi,%ecx
-	andl $7,%ecx
-	jnz  .Lbad_alignment
-.Lafter_bad_alignment:
-#endif
 
-	movq %rdx,%rcx
-
-	movl $64,%ebx
-	shrq $6,%rdx
-	decq %rdx
-	js   .Lhandle_tail
-
-	.p2align 4
-.Lloop:
-.Ls1:	movq (%rsi),%r11
-.Ls2:	movq 1*8(%rsi),%r8
-.Ls3:	movq 2*8(%rsi),%r9
-.Ls4:	movq 3*8(%rsi),%r10
-.Ld1:	movq %r11,(%rdi)
-.Ld2:	movq %r8,1*8(%rdi)
-.Ld3:	movq %r9,2*8(%rdi)
-.Ld4:	movq %r10,3*8(%rdi)
-
-.Ls5:	movq 4*8(%rsi),%r11
-.Ls6:	movq 5*8(%rsi),%r8
-.Ls7:	movq 6*8(%rsi),%r9
-.Ls8:	movq 7*8(%rsi),%r10
-.Ld5:	movq %r11,4*8(%rdi)
-.Ld6:	movq %r8,5*8(%rdi)
-.Ld7:	movq %r9,6*8(%rdi)
-.Ld8:	movq %r10,7*8(%rdi)
-
-	decq %rdx
-
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
-
-	jns  .Lloop
-
-	.p2align 4
-.Lhandle_tail:
-	movl %ecx,%edx
-	andl $63,%ecx
-	shrl $3,%ecx
-	jz   .Lhandle_7
-	movl $8,%ebx
-	.p2align 4
-.Lloop_8:
-.Ls9:	movq (%rsi),%r8
-.Ld9:	movq %r8,(%rdi)
-	decl %ecx
-	leaq 8(%rdi),%rdi
-	leaq 8(%rsi),%rsi
-	jnz .Lloop_8
-
-.Lhandle_7:
-	movl %edx,%ecx
-	andl $7,%ecx
-	jz   .Lende
-	.p2align 4
-.Lloop_1:
-.Ls10:	movb (%rsi),%bl
-.Ld10:	movb %bl,(%rdi)
-	incq %rdi
-	incq %rsi
-	decl %ecx
-	jnz .Lloop_1
-
-.Lende:
-	popq %rbx
-	ret
-
-#ifdef FIX_ALIGNMENT
-	/* align destination */
-	.p2align 4
-.Lbad_alignment:
-	movl $8,%r9d
-	subl %ecx,%r9d
-	movl %r9d,%ecx
-	cmpq %r9,%rdx
-	jz   .Lhandle_7
-	js   .Lhandle_7
-.Lalign_1:
-.Ls11:	movb (%rsi),%bl
-.Ld11:	movb %bl,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz .Lalign_1
-	subq %r9,%rdx
-	jmp .Lafter_bad_alignment
-#endif
-
-	/* table sorted by exception address */
-	.section __ex_table,"a"
-	.align 8
-	.quad .Ls1,.Ls1e
-	.quad .Ls2,.Ls2e
-	.quad .Ls3,.Ls3e
-	.quad .Ls4,.Ls4e
-	.quad .Ld1,.Ls1e
-	.quad .Ld2,.Ls2e
-	.quad .Ld3,.Ls3e
-	.quad .Ld4,.Ls4e
-	.quad .Ls5,.Ls5e
-	.quad .Ls6,.Ls6e
-	.quad .Ls7,.Ls7e
-	.quad .Ls8,.Ls8e
-	.quad .Ld5,.Ls5e
-	.quad .Ld6,.Ls6e
-	.quad .Ld7,.Ls7e
-	.quad .Ld8,.Ls8e
-	.quad .Ls9,.Le_quad
-	.quad .Ld9,.Le_quad
-	.quad .Ls10,.Le_byte
-	.quad .Ld10,.Le_byte
-#ifdef FIX_ALIGNMENT
-	.quad .Ls11,.Lzero_rest
-	.quad .Ld11,.Lzero_rest
-#endif
-	.quad .Le5,.Le_zero
-	.previous
-
-	/* compute 64-offset for main loop. 8 bytes accuracy with error on the
-	   pessimistic side. this is gross. it would be better to fix the
-	   interface. */
-	/* eax: zero, ebx: 64 */
-.Ls1e:	addl $8,%eax
-.Ls2e:	addl $8,%eax
-.Ls3e:	addl $8,%eax
-.Ls4e:	addl $8,%eax
-.Ls5e:	addl $8,%eax
-.Ls6e:	addl $8,%eax
-.Ls7e:	addl $8,%eax
-.Ls8e:	addl $8,%eax
-	addq %rbx,%rdi	/* +64 */
-	subq %rax,%rdi	/* correct destination with computed offset */
-
-	shlq $6,%rdx	/* loop counter * 64 (stride length) */
-	addq %rax,%rdx	/* add offset to loopcnt */
-	andl $63,%ecx	/* remaining bytes */
-	addq %rcx,%rdx	/* add them */
-	jmp .Lzero_rest
-
-	/* exception on quad word loop in tail handling */
-	/* ecx: loopcnt/8, %edx: length, rdi: correct */
-.Le_quad:
-	shll $3,%ecx
-	andl $7,%edx
-	addl %ecx,%edx
-	/* edx: bytes to zero, rdi: dest, eax:zero */
-.Lzero_rest:
-	movq %rdx,%rcx
-.Le_byte:
-	xorl %eax,%eax
-.Le5:	rep
-	stosb
-	/* when there is another exception while zeroing the rest just return */
-.Le_zero:
-	movq %rdx,%rax
-	jmp .Lende
-
-	/* C stepping K8 run faster using the string copy instructions.
-	   This is also a lot simpler. Use them when possible.
-	   Patch in jmps to this code instead of copying it fully
-	   to avoid unwanted aliasing in the exception tables. */
-
-	/* rdi destination
-	 * rsi source
-	 * rdx count
-	 *
-	 * Output:
-	 * eax uncopied bytes or 0 if successful.
-	 */
-copy_user_generic_c:
+	.globl copy_user_generic
+copy_user_generic:
 	movl %edx,%ecx
 	shrl $3,%ecx
 	andl $7,%edx
+	jz   5f
 1:	rep
 	movsq
 	movl %edx,%ecx
+	xor %eax,%eax
 2:	rep
 	movsb
-4:	movl %ecx,%eax
 	ret
+	/* align here? */
+5:	xorl %eax,%eax
+6:	rep movsq
+	ret
+
+	.section .fixup,"ax"
 3:	lea (%rdx,%rcx,8),%rax
 	ret
+4:	movl %ecx,%eax
+	ret
+	.previous
 
 	.section __ex_table,"a"
 	.quad 1b,3b
 	.quad 2b,4b
+	.quad 6b,4b
 	.previous
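
For readers tracing the new `copy_user_generic` above: it splits the byte count into a qword count and a remainder before the two string moves. A hedged C equivalent of the fault-free fast path, with `dst`/`src`/`len` as illustrative names (the `.fixup`/`__ex_table` machinery is deliberately omitted):

    #include <stddef.h>
    #include <string.h>

    /* Sketch of the fast path: copy len/8 qwords with one string op,
     * then the len%8 leftover bytes. Returns 0, matching "eax:
     * uncopied bytes or 0 if successful" on the no-fault path. */
    static size_t copy_sketch(void *dst, const void *src, size_t len)
    {
            size_t qwords = len >> 3;   /* shrl $3,%ecx */
            size_t rest   = len & 7;    /* andl $7,%edx */

            memcpy(dst, src, qwords * 8);                 /* rep movsq */
            memcpy((char *)dst + qwords * 8,
                   (const char *)src + qwords * 8, rest); /* rep movsb */
            return 0;
    }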
diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c
index 841bd738a189..03c460cbdd1c 100644
--- a/arch/x86_64/lib/delay.c
+++ b/arch/x86_64/lib/delay.c
@@ -39,7 +39,7 @@ void __delay(unsigned long loops)
 
 inline void __const_udelay(unsigned long xloops)
 {
-	__delay(((xloops * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) * HZ);
+	__delay((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32);
 }
 
 void __udelay(unsigned long usecs)
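
The one-line change above reorders a 32.32 fixed-point computation: multiplying the `>> 32` result by HZ afterwards throws away the fractional part before scaling, while multiplying by HZ first keeps it. A rough worked example with illustrative numbers (assumes 64-bit unsigned long, as on x86_64):

    #include <stdio.h>

    int main(void)
    {
            /* Illustrative values: xloops is a 32.32 fixed-point count,
             * lpj a hypothetical loops_per_jiffy. */
            unsigned long xloops = 0x10c7UL;   /* ~1 usec, as udelay() passes */
            unsigned long lpj = 2500000UL;
            unsigned long HZ_ = 1000UL;

            unsigned long old = ((xloops * lpj) >> 32) * HZ_; /* truncates early: 2000 */
            unsigned long new = (xloops * HZ_ * lpj) >> 32;   /* scales first:   2499 */

            printf("old=%lu loops, new=%lu loops\n", old, new);
            return 0;
    }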
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index c6c46494fef5..92dd80544602 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -11,6 +11,8 @@
  *
  * Output:
  * rax original destination
+ *
+ * TODO: check best memcpy for PSC
  */
 
 	.globl __memcpy
@@ -18,95 +20,6 @@
 	.p2align 4
 __memcpy:
 memcpy:
-	pushq %rbx
-	movq %rdi,%rax
-
-	movl %edx,%ecx
-	shrl $6,%ecx
-	jz .Lhandle_tail
-
-	.p2align 4
-.Lloop_64:
-	decl %ecx
-
-	movq (%rsi),%r11
-	movq 8(%rsi),%r8
-
-	movq %r11,(%rdi)
-	movq %r8,1*8(%rdi)
-
-	movq 2*8(%rsi),%r9
-	movq 3*8(%rsi),%r10
-
-	movq %r9,2*8(%rdi)
-	movq %r10,3*8(%rdi)
-
-	movq 4*8(%rsi),%r11
-	movq 5*8(%rsi),%r8
-
-	movq %r11,4*8(%rdi)
-	movq %r8,5*8(%rdi)
-
-	movq 6*8(%rsi),%r9
-	movq 7*8(%rsi),%r10
-
-	movq %r9,6*8(%rdi)
-	movq %r10,7*8(%rdi)
-
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
-	jnz .Lloop_64
-
-.Lhandle_tail:
-	movl %edx,%ecx
-	andl $63,%ecx
-	shrl $3,%ecx
-	jz .Lhandle_7
-	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq (%rsi),%r8
-	movq %r8,(%rdi)
-	leaq 8(%rdi),%rdi
-	leaq 8(%rsi),%rsi
-	jnz .Lloop_8
-
-.Lhandle_7:
-	movl %edx,%ecx
-	andl $7,%ecx
-	jz .Lende
-	.p2align 4
-.Lloop_1:
-	movb (%rsi),%r8b
-	movb %r8b,(%rdi)
-	incq %rdi
-	incq %rsi
-	decl %ecx
-	jnz .Lloop_1
-
-.Lende:
-	popq %rbx
-	ret
-.Lfinal:
-
-	/* C stepping K8 run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
-
-	.section .altinstructions,"a"
-	.align 8
-	.quad memcpy
-	.quad memcpy_c
-	.byte X86_FEATURE_K8_C
-	.byte .Lfinal-memcpy
-	.byte memcpy_c_end-memcpy_c
-	.previous
-
-	.section .altinstr_replacement,"ax"
-	/* rdi destination
-	 * rsi source
-	 * rdx count
-	 */
-memcpy_c:
 	movq %rdi,%rax
 	movl %edx,%ecx
 	shrl $3,%ecx
@@ -117,5 +30,3 @@ memcpy_c:
 	rep
 	movsb
 	ret
-memcpy_c_end:
-	.previous
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 4b4c40638640..2aa48f24ed1e 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -13,98 +13,6 @@
 	.p2align 4
 memset:
 __memset:
-	movq %rdi,%r10
-	movq %rdx,%r11
-
-	/* expand byte value */
-	movzbl %sil,%ecx
-	movabs $0x0101010101010101,%rax
-	mul %rcx	/* with rax, clobbers rdx */
-
-	/* align dst */
-	movl %edi,%r9d
-	andl $7,%r9d
-	jnz .Lbad_alignment
-.Lafter_bad_alignment:
-
-	movl %r11d,%ecx
-	shrl $6,%ecx
-	jz .Lhandle_tail
-
-	.p2align 4
-.Lloop_64:
-	decl %ecx
-	movq %rax,(%rdi)
-	movq %rax,8(%rdi)
-	movq %rax,16(%rdi)
-	movq %rax,24(%rdi)
-	movq %rax,32(%rdi)
-	movq %rax,40(%rdi)
-	movq %rax,48(%rdi)
-	movq %rax,56(%rdi)
-	leaq 64(%rdi),%rdi
-	jnz .Lloop_64
-
-	/* Handle tail in loops. The loops should be faster than hard
-	   to predict jump tables. */
-	.p2align 4
-.Lhandle_tail:
-	movl %r11d,%ecx
-	andl $63&(~7),%ecx
-	jz .Lhandle_7
-	shrl $3,%ecx
-	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq %rax,(%rdi)
-	leaq 8(%rdi),%rdi
-	jnz .Lloop_8
-
-.Lhandle_7:
-	movl %r11d,%ecx
-	andl $7,%ecx
-	jz .Lende
-	.p2align 4
-.Lloop_1:
-	decl %ecx
-	movb %al,(%rdi)
-	leaq 1(%rdi),%rdi
-	jnz .Lloop_1
-
-.Lende:
-	movq %r10,%rax
-	ret
-
-.Lbad_alignment:
-	cmpq $7,%r11
-	jbe .Lhandle_7
-	movq %rax,(%rdi)	/* unaligned store */
-	movq $8,%r8
-	subq %r9,%r8
-	addq %r8,%rdi
-	subq %r8,%r11
-	jmp .Lafter_bad_alignment
-
-	/* C stepping K8 run faster using the string instructions.
-	   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
-	.section .altinstructions,"a"
-	.align 8
-	.quad memset
-	.quad memset_c
-	.byte X86_FEATURE_K8_C
-	.byte memset_c_end-memset_c
-	.byte memset_c_end-memset_c
-	.previous
-
-	.section .altinstr_replacement,"ax"
-	/* rdi destination
-	 * rsi value
-	 * rdx count
-	 */
-memset_c:
 	movq %rdi,%r9
 	movl %edx,%r8d
 	andl $7,%r8d
@@ -121,5 +29,3 @@ memset_c:
 	stosb
 	movq %r9,%rax
 	ret
-memset_c_end:
-	.previous
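
The removed memset prologue expanded the fill byte into all eight lanes of a qword by multiplying with 0x0101010101010101. The same trick in C, as a hedged sketch:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint8_t c = 0xAB;
            /* movzbl %sil,%ecx; movabs $0x0101010101010101,%rax; mul %rcx */
            uint64_t pattern = (uint64_t)c * 0x0101010101010101ULL;
            printf("%016llx\n", (unsigned long long)pattern); /* abababab... */
            return 0;
    }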
diff --git a/arch/x86_64/lib/usercopy.c b/arch/x86_64/lib/usercopy.c
index db8abba1ad81..9bc2c295818e 100644
--- a/arch/x86_64/lib/usercopy.c
+++ b/arch/x86_64/lib/usercopy.c
@@ -109,14 +109,11 @@ unsigned long clear_user(void __user *to, unsigned long n)
  * Return 0 on exception, a value greater than N if too long
  */
 
-long strnlen_user(const char __user *s, long n)
+long __strnlen_user(const char __user *s, long n)
 {
 	long res = 0;
 	char c;
 
-	if (!access_ok(VERIFY_READ, s, n))
-		return 0;
-
 	while (1) {
 		if (res>n)
 			return n+1;
@@ -129,6 +126,13 @@ long strnlen_user(const char __user *s, long n)
 	}
 }
 
+long strnlen_user(const char __user *s, long n)
+{
+	if (!access_ok(VERIFY_READ, s, n))
+		return 0;
+	return __strnlen_user(s, n);
+}
+
 long strlen_user(const char __user *s)
 {
 	long res = 0;
diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile
index 1d232a87f113..d25ac86fe27a 100644
--- a/arch/x86_64/mm/Makefile
+++ b/arch/x86_64/mm/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the linux x86_64-specific parts of the memory manager.
 #
 
-obj-y	 := init.o fault.o ioremap.o extable.o pageattr.o
+obj-y	 := init.o fault.o ioremap.o extable.o pageattr.o mmap.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 obj-$(CONFIG_NUMA) += numa.o
 obj-$(CONFIG_K8_NUMA) += k8topology.o
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index b75b872ec154..26eac194064b 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -35,6 +35,13 @@
 #include <asm-generic/sections.h>
 #include <asm/kdebug.h>
 
+/* Page fault error code bits */
+#define PF_PROT	(1<<0)		/* or no page found */
+#define PF_WRITE	(1<<1)
+#define PF_USER	(1<<2)
+#define PF_RSVD	(1<<3)
+#define PF_INSTR	(1<<4)
+
 void bust_spinlocks(int yes)
 {
 	int loglevel_save = console_loglevel;
@@ -68,7 +75,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
 	unsigned char *max_instr;
 
 	/* If it was a exec fault ignore */
-	if (error_code & (1<<4))
+	if (error_code & PF_INSTR)
 		return 0;
 
 	instr = (unsigned char *)convert_rip_to_linear(current, regs);
@@ -222,17 +229,22 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
 			      unsigned long error_code)
 {
 	unsigned long flags = oops_begin();
+	struct task_struct *tsk;
 
 	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
 	       current->comm, address);
 	dump_pagetable(address);
+	tsk = current;
+	tsk->thread.cr2 = address;
+	tsk->thread.trap_no = 14;
+	tsk->thread.error_code = error_code;
 	__die("Bad pagetable", regs, error_code);
 	oops_end(flags);
 	do_exit(SIGKILL);
 }
 
 /*
- * Handle a fault on the vmalloc or module mapping area
+ * Handle a fault on the vmalloc area
  *
  * This assumes no large pages in there.
  */
@@ -278,7 +290,6 @@ static int vmalloc_fault(unsigned long address)
 	   that. */
 	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
 		BUG();
-	__flush_tlb_all();
 	return 0;
 }
 
@@ -289,12 +300,6 @@ int exception_trace = 1;
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
- *
- * error_code:
- *	bit 0 == 0 means no page found, 1 means protection fault
- *	bit 1 == 0 means read, 1 means write
- *	bit 2 == 0 means kernel, 1 means user-mode
- *	bit 3 == 1 means fault was an instruction fetch
 */
 asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 					unsigned long error_code)
@@ -308,18 +313,6 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	unsigned long flags;
 	siginfo_t info;
 
-#ifdef CONFIG_CHECKING
-	{
-		unsigned long gs;
-		struct x8664_pda *pda = cpu_pda + stack_smp_processor_id();
-		rdmsrl(MSR_GS_BASE, gs);
-		if (gs != (unsigned long)pda) {
-			wrmsrl(MSR_GS_BASE, pda);
-			printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
-		}
-	}
-#endif
-
 	/* get the address */
 	__asm__("movq %%cr2,%0":"=r" (address));
 	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
@@ -349,12 +342,16 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	 *
 	 * This verifies that the fault happens in kernel space
 	 * (error_code & 4) == 0, and that the fault was not a
-	 * protection error (error_code & 1) == 0.
+	 * protection error (error_code & 9) == 0.
 	 */
 	if (unlikely(address >= TASK_SIZE64)) {
-		if (!(error_code & 5) &&
-		      ((address >= VMALLOC_START && address < VMALLOC_END) ||
-		       (address >= MODULES_VADDR && address < MODULES_END))) {
+		/*
+		 * Don't check for the module range here: its PML4
+		 * is always initialized because it's shared with the main
+		 * kernel text. Only vmalloc may need PML4 syncups.
+		 */
+		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
+		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
 			if (vmalloc_fault(address) < 0)
 				goto bad_area_nosemaphore;
 			return;
@@ -366,7 +363,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 		goto bad_area_nosemaphore;
 	}
 
-	if (unlikely(error_code & (1 << 3)))
+	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(address, regs, error_code);
 
 	/*
@@ -393,7 +390,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	 * thus avoiding the deadlock.
 	 */
 	if (!down_read_trylock(&mm->mmap_sem)) {
-		if ((error_code & 4) == 0 &&
+		if ((error_code & PF_USER) == 0 &&
 		    !search_exception_tables(regs->rip))
 			goto bad_area_nosemaphore;
 		down_read(&mm->mmap_sem);
@@ -420,17 +417,17 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 good_area:
 	info.si_code = SEGV_ACCERR;
 	write = 0;
-	switch (error_code & 3) {
+	switch (error_code & (PF_PROT|PF_WRITE)) {
 		default:	/* 3: write, present */
 			/* fall through */
-		case 2:		/* write, not present */
+		case PF_WRITE:		/* write, not present */
 			if (!(vma->vm_flags & VM_WRITE))
 				goto bad_area;
 			write++;
 			break;
-		case 1:		/* read, present */
+		case PF_PROT:		/* read, present */
 			goto bad_area;
 		case 0:		/* read, not present */
 			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 				goto bad_area;
 	}
@@ -465,7 +462,7 @@ bad_area:
 
 bad_area_nosemaphore:
 	/* User mode accesses just cause a SIGSEGV */
-	if (error_code & 4) {
+	if (error_code & PF_USER) {
 		if (is_prefetch(regs, address, error_code))
 			return;
 
@@ -533,6 +530,9 @@ no_context:
 	printk_address(regs->rip);
 	printk("\n");
 	dump_pagetable(address);
+	tsk->thread.cr2 = address;
+	tsk->thread.trap_no = 14;
+	tsk->thread.error_code = error_code;
 	__die("Oops", regs, error_code);
 	/* Executive summary in case the body of the oops scrolled away */
 	printk(KERN_EMERG "CR2: %016lx\n", address);
@@ -558,7 +558,7 @@ do_sigbus:
 	up_read(&mm->mmap_sem);
 
 	/* Kernel mode? Handle exceptions or die */
-	if (!(error_code & 4))
+	if (!(error_code & PF_USER))
 		goto no_context;
 
 	tsk->thread.cr2 = address;
@@ -571,3 +571,10 @@ do_sigbus:
 	force_sig_info(SIGBUS, &info, tsk);
 	return;
 }
+
+static int __init enable_pagefaulttrace(char *str)
+{
+	page_fault_trace = 1;
+	return 0;
+}
+__setup("pagefaulttrace", enable_pagefaulttrace);
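
With the PF_* names introduced above, the error-code tests scattered through do_page_fault() become readable at a glance. A small user-space sketch that decodes an error code with the same bit assignments (the bit meanings come from the comment block the patch removes):

    #include <stdio.h>

    #define PF_PROT  (1 << 0) /* protection fault (else: page not present) */
    #define PF_WRITE (1 << 1) /* write access (else: read) */
    #define PF_USER  (1 << 2) /* fault from user mode */
    #define PF_RSVD  (1 << 3) /* reserved bit set in a page table entry */
    #define PF_INSTR (1 << 4) /* instruction fetch */

    static void decode(unsigned long ec)
    {
            printf("%s %s from %s%s%s\n",
                   ec & PF_PROT ? "protection" : "not-present",
                   ec & PF_WRITE ? "write" : "read",
                   ec & PF_USER ? "user" : "kernel",
                   ec & PF_RSVD ? ", reserved-bit" : "",
                   ec & PF_INSTR ? ", ifetch" : "");
    }

    int main(void) { decode(PF_USER | PF_WRITE); return 0; }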
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index e60a1a848de8..7af1742aa958 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -22,6 +22,10 @@
 #include <linux/pagemap.h>
 #include <linux/bootmem.h>
 #include <linux/proc_fs.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/module.h>
+#include <linux/memory_hotplug.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -36,16 +40,18 @@
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
 #include <asm/smp.h>
+#include <asm/sections.h>
+#include <asm/dma-mapping.h>
+#include <asm/swiotlb.h>
 
 #ifndef Dprintk
 #define Dprintk(x...)
 #endif
 
-#ifdef CONFIG_GART_IOMMU
-extern int swiotlb;
-#endif
+struct dma_mapping_ops* dma_ops;
+EXPORT_SYMBOL(dma_ops);
 
-extern char _stext[];
+static unsigned long dma_reserve __initdata;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
@@ -86,9 +92,6 @@ void show_mem(void)
 
 /* References to section boundaries */
 
-extern char _text, _etext, _edata, __bss_start, _end[];
-extern char __init_begin, __init_end;
-
 int after_bootmem;
 
 static void *spp_getpage(void)
@@ -179,13 +182,19 @@ static struct temp_map {
 	{}
 };
 
-static __init void *alloc_low_page(int *index, unsigned long *phys)
+static __meminit void *alloc_low_page(int *index, unsigned long *phys)
 {
 	struct temp_map *ti;
 	int i;
 	unsigned long pfn = table_end++, paddr;
 	void *adr;
 
+	if (after_bootmem) {
+		adr = (void *)get_zeroed_page(GFP_ATOMIC);
+		*phys = __pa(adr);
+		return adr;
+	}
+
 	if (pfn >= end_pfn)
 		panic("alloc_low_page: ran out of memory");
 	for (i = 0; temp_mappings[i].allocated; i++) {
@@ -198,55 +207,86 @@ static __init void *alloc_low_page(int *index, unsigned long *phys)
 	ti->allocated = 1;
 	__flush_tlb();
 	adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
+	memset(adr, 0, PAGE_SIZE);
 	*index = i;
 	*phys = pfn * PAGE_SIZE;
 	return adr;
 }
 
-static __init void unmap_low_page(int i)
+static __meminit void unmap_low_page(int i)
 {
-	struct temp_map *ti = &temp_mappings[i];
+	struct temp_map *ti;
+
+	if (after_bootmem)
+		return;
+
+	ti = &temp_mappings[i];
 	set_pmd(ti->pmd, __pmd(0));
 	ti->allocated = 0;
 }
 
-static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+static void __meminit
+phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
+{
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) {
+		unsigned long entry;
+
+		if (address > end) {
+			for (; i < PTRS_PER_PMD; i++, pmd++)
+				set_pmd(pmd, __pmd(0));
+			break;
+		}
+		entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
+		entry &= __supported_pte_mask;
+		set_pmd(pmd, __pmd(entry));
+	}
+}
+
+static void __meminit
+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
+{
+	pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
+
+	if (pmd_none(*pmd)) {
+		spin_lock(&init_mm.page_table_lock);
+		phys_pmd_init(pmd, address, end);
+		spin_unlock(&init_mm.page_table_lock);
+		__flush_tlb_all();
+	}
+}
+
+static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
 {
-	long i, j;
+	long i = pud_index(address);
 
-	i = pud_index(address);
 	pud = pud + i;
+
+	if (after_bootmem && pud_val(*pud)) {
+		phys_pmd_update(pud, address, end);
+		return;
+	}
+
 	for (; i < PTRS_PER_PUD; pud++, i++) {
 		int map;
 		unsigned long paddr, pmd_phys;
 		pmd_t *pmd;
 
-		paddr = address + i*PUD_SIZE;
-		if (paddr >= end) {
-			for (; i < PTRS_PER_PUD; i++, pud++)
-				set_pud(pud, __pud(0));
+		paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
+		if (paddr >= end)
 			break;
-		}
 
-		if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
+		if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
 			set_pud(pud, __pud(0));
 			continue;
 		}
 
 		pmd = alloc_low_page(&map, &pmd_phys);
+		spin_lock(&init_mm.page_table_lock);
 		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
-		for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
-			unsigned long pe;
-
-			if (paddr >= end) {
-				for (; j < PTRS_PER_PMD; j++, pmd++)
-					set_pmd(pmd, __pmd(0));
-				break;
-			}
-			pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
-			pe &= __supported_pte_mask;
-			set_pmd(pmd, __pmd(pe));
-		}
+		phys_pmd_init(pmd, paddr, end);
+		spin_unlock(&init_mm.page_table_lock);
 		unmap_low_page(map);
 	}
 	__flush_tlb();
@@ -254,25 +294,32 @@ static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned lon
 
 static void __init find_early_table_space(unsigned long end)
 {
-	unsigned long puds, pmds, tables;
+	unsigned long puds, pmds, tables, start;
 
 	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
 	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
 	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
		 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
 
-	table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
+	/* RED-PEN putting page tables only on node 0 could
+	   cause a hotspot and fill up ZONE_DMA. The page tables
+	   need roughly 0.5KB per GB. */
+	start = 0x8000;
+	table_start = find_e820_area(start, end, tables);
 	if (table_start == -1UL)
 		panic("Cannot find space for the kernel page tables");
 
 	table_start >>= PAGE_SHIFT;
 	table_end = table_start;
+
+	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
+		end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
 }
 
 /* Setup the direct mapping of the physical memory at PAGE_OFFSET.
    This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
-void __init init_memory_mapping(unsigned long start, unsigned long end)
+void __meminit init_memory_mapping(unsigned long start, unsigned long end)
 {
 	unsigned long next;
 
@@ -284,7 +331,8 @@ void __init init_memory_mapping(unsigned long start, unsigned long end)
	 * mapped. Unfortunately this is done currently before the nodes are
	 * discovered.
	 */
-	find_early_table_space(end);
+	if (!after_bootmem)
+		find_early_table_space(end);
 
 	start = (unsigned long)__va(start);
 	end = (unsigned long)__va(end);
@@ -292,58 +340,106 @@ void __init init_memory_mapping(unsigned long start, unsigned long end)
 	for (; start < end; start = next) {
 		int map;
 		unsigned long pud_phys;
-		pud_t *pud = alloc_low_page(&map, &pud_phys);
+		pgd_t *pgd = pgd_offset_k(start);
+		pud_t *pud;
+
+		if (after_bootmem)
+			pud = pud_offset_k(pgd, __PAGE_OFFSET);
+		else
+			pud = alloc_low_page(&map, &pud_phys);
+
 		next = start + PGDIR_SIZE;
 		if (next > end)
 			next = end;
 		phys_pud_init(pud, __pa(start), __pa(next));
-		set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
+		if (!after_bootmem)
+			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
 		unmap_low_page(map);
 	}
 
-	asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
+	if (!after_bootmem)
+		asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
 	__flush_tlb_all();
-	early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end,
-	       table_start<<PAGE_SHIFT,
-	       table_end<<PAGE_SHIFT);
 }
 
-extern struct x8664_pda cpu_pda[NR_CPUS];
+void __cpuinit zap_low_mappings(int cpu)
+{
+	if (cpu == 0) {
+		pgd_t *pgd = pgd_offset_k(0UL);
+		pgd_clear(pgd);
+	} else {
+		/*
+		 * For AP's, zap the low identity mappings by changing the cr3
+		 * to init_level4_pgt and doing local flush tlb all
+		 */
+		asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
+	}
+	__flush_tlb_all();
+}
 
-/* Assumes all CPUs still execute in init_mm */
-void zap_low_mappings(void)
+/* Compute zone sizes for the DMA and DMA32 zones in a node. */
+__init void
+size_zones(unsigned long *z, unsigned long *h,
+	   unsigned long start_pfn, unsigned long end_pfn)
 {
-	pgd_t *pgd = pgd_offset_k(0UL);
-	pgd_clear(pgd);
-	flush_tlb_all();
+	int i;
+	unsigned long w;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		z[i] = 0;
+
+	if (start_pfn < MAX_DMA_PFN)
+		z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
+	if (start_pfn < MAX_DMA32_PFN) {
+		unsigned long dma32_pfn = MAX_DMA32_PFN;
+		if (dma32_pfn > end_pfn)
+			dma32_pfn = end_pfn;
+		z[ZONE_DMA32] = dma32_pfn - start_pfn;
+	}
+	z[ZONE_NORMAL] = end_pfn - start_pfn;
+
+	/* Remove lower zones from higher ones. */
+	w = 0;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		if (z[i])
+			z[i] -= w;
+		w += z[i];
+	}
+
+	/* Compute holes */
+	w = start_pfn;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		unsigned long s = w;
+		w += z[i];
+		h[i] = e820_hole_size(s, w);
+	}
+
+	/* Add the space pace needed for mem_map to the holes too. */
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
+
+	/* The 16MB DMA zone has the kernel and other misc mappings.
+	   Account them too */
+	if (h[ZONE_DMA]) {
+		h[ZONE_DMA] += dma_reserve;
+		if (h[ZONE_DMA] >= z[ZONE_DMA]) {
+			printk(KERN_WARNING
+				"Kernel too large and filling up ZONE_DMA?\n");
+			h[ZONE_DMA] = z[ZONE_DMA];
+		}
+	}
 }
 
 #ifndef CONFIG_NUMA
 void __init paging_init(void)
 {
-	{
-		unsigned long zones_size[MAX_NR_ZONES];
-		unsigned long holes[MAX_NR_ZONES];
-		unsigned int max_dma;
-
-		memset(zones_size, 0, sizeof(zones_size));
-		memset(holes, 0, sizeof(holes));
-
-		max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
-		if (end_pfn < max_dma) {
-			zones_size[ZONE_DMA] = end_pfn;
-			holes[ZONE_DMA] = e820_hole_size(0, end_pfn);
-		} else {
-			zones_size[ZONE_DMA] = max_dma;
-			holes[ZONE_DMA] = e820_hole_size(0, max_dma);
-			zones_size[ZONE_NORMAL] = end_pfn - max_dma;
-			holes[ZONE_NORMAL] = e820_hole_size(max_dma, end_pfn);
-		}
-		free_area_init_node(0, NODE_DATA(0), zones_size,
-			__pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
-	}
-	return;
+	unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
+
+	memory_present(0, 0, end_pfn);
+	sparse_init();
+	size_zones(zones, holes, 0, end_pfn);
+	free_area_init_node(0, NODE_DATA(0), zones,
+			    __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
 }
 #endif
 
@@ -381,6 +477,50 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size)
 	__flush_tlb_all();
 }
 
+/*
+ * Memory hotplug specific functions
+ * These are only for non-NUMA machines right now.
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+void online_page(struct page *page)
+{
+	ClearPageReserved(page);
+	set_page_count(page, 1);
+	__free_page(page);
+	totalram_pages++;
+	num_physpages++;
+}
+
+int add_memory(u64 start, u64 size)
+{
+	struct pglist_data *pgdat = NODE_DATA(0);
+	struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	int ret;
+
+	ret = __add_pages(zone, start_pfn, nr_pages);
+	if (ret)
+		goto error;
+
+	init_memory_mapping(start, (start + size -1));
+
+	return ret;
+error:
+	printk("%s: Problem encountered in __add_pages!\n", __func__);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(add_memory);
+
+int remove_memory(u64 start, u64 size)
+{
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(remove_memory);
+
+#endif
+
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
 			 kcore_vsyscall;
 
@@ -389,12 +529,9 @@ void __init mem_init(void)
 	long codesize, reservedpages, datasize, initsize;
 
 #ifdef CONFIG_SWIOTLB
-	if (!iommu_aperture &&
-	    (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu))
-	       swiotlb = 1;
-	if (swiotlb)
-		swiotlb_init();
+	pci_swiotlb_init();
 #endif
+	no_iommu_init();
 
 	/* How many end-of-memory variables you have, grandma! */
 	max_low_pfn = end_pfn;
@@ -438,19 +575,16 @@ void __init mem_init(void)
 		datasize >> 10,
 		initsize >> 10);
 
+#ifdef CONFIG_SMP
 	/*
-	 * Subtle. SMP is doing its boot stuff late (because it has to
-	 * fork idle threads) - but it also needs low mappings for the
-	 * protected-mode entry to work. We zap these entries only after
-	 * the WP-bit has been tested.
+	 * Sync boot_level4_pgt mappings with the init_level4_pgt
+	 * except for the low identity mappings which are already zapped
+	 * in init_level4_pgt. This sync-up is essential for AP's bringup
	 */
-#ifndef CONFIG_SMP
-	zap_low_mappings();
+	memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
 #endif
 }
 
-extern char __initdata_begin[], __initdata_end[];
-
 void free_initmem(void)
 {
 	unsigned long addr;
@@ -464,13 +598,36 @@ void free_initmem(void)
 		totalram_pages++;
 	}
 	memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
-	printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10);
+	printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
 }
 
+#ifdef CONFIG_DEBUG_RODATA
+
+extern char __start_rodata, __end_rodata;
+void mark_rodata_ro(void)
+{
+	unsigned long addr = (unsigned long)&__start_rodata;
+
+	for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
+		change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
+
+	printk ("Write protecting the kernel read-only data: %luk\n",
+			(&__end_rodata - &__start_rodata) >> 10);
+
+	/*
+	 * change_page_attr_addr() requires a global_flush_tlb() call after it.
	 * We do this after the printk so that if something went wrong in the
	 * change, the printk gets out at least to give a better debug hint
	 * of who is the culprit.
	 */
+	global_flush_tlb();
+}
+#endif
+
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	if (start < (unsigned long)&_end)
+	if (start >= end)
 		return;
 	printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
 	for (; start < end; start += PAGE_SIZE) {
@@ -491,6 +648,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 #else
 	reserve_bootmem(phys, len);
 #endif
+	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
+		dma_reserve += len / PAGE_SIZE;
 }
 
 int kern_addr_valid(unsigned long addr)
@@ -532,10 +691,6 @@ extern int exception_trace, page_fault_trace;
 static ctl_table debug_table2[] = {
 	{ 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
 	  proc_dointvec },
-#ifdef CONFIG_CHECKING
-	{ 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
-	  proc_dointvec },
-#endif
 	{ 0, }
 };
 
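
The new size_zones() above first computes each zone size as a span from start_pfn up to that zone's ceiling, then subtracts the lower zones so the entries become disjoint. A worked sketch with illustrative pfn limits (4096 = 16MB DMA ceiling and 1048576 = 4GB DMA32 ceiling at 4KB pages; the real constants live in the kernel headers):

    #include <stdio.h>

    int main(void)
    {
            unsigned long MAX_DMA_PFN = 4096;              /* 16MB / 4KB */
            unsigned long MAX_DMA32_PFN = 1048576;         /* 4GB / 4KB */
            unsigned long start_pfn = 0, end_pfn = 2097152; /* an 8GB box */
            unsigned long z[3] = {0, 0, 0}, w = 0;
            int i;

            /* Cumulative spans, as in the first half of size_zones(). */
            z[0] = MAX_DMA_PFN - start_pfn;
            z[1] = MAX_DMA32_PFN - start_pfn;
            z[2] = end_pfn - start_pfn;

            /* Remove lower zones from higher ones. */
            for (i = 0; i < 3; i++) {
                    if (z[i])
                            z[i] -= w;
                    w += z[i];
            }
            /* Prints DMA=4096 DMA32=1044480 NORMAL=1048576 pages. */
            printf("DMA=%lu DMA32=%lu NORMAL=%lu pages\n", z[0], z[1], z[2]);
            return 0;
    }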
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c
index ecf7acb5db9b..ae207064201e 100644
--- a/arch/x86_64/mm/ioremap.c
+++ b/arch/x86_64/mm/ioremap.c
@@ -247,9 +247,15 @@ void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
 	return __ioremap(phys_addr, size, _PAGE_PCD);
 }
 
+/**
+ * iounmap - Free a IO remapping
+ * @addr: virtual address from ioremap_*
+ *
+ * Caller must ensure there is only one unmapping for the same pointer.
+ */
 void iounmap(volatile void __iomem *addr)
 {
-	struct vm_struct *p;
+	struct vm_struct *p, *o;
 
 	if (addr <= high_memory)
 		return;
@@ -257,12 +263,31 @@ void iounmap(volatile void __iomem *addr)
 	    addr < phys_to_virt(ISA_END_ADDRESS))
 		return;
 
-	write_lock(&vmlist_lock);
-	p = __remove_vm_area((void *)((unsigned long)addr & PAGE_MASK));
-	if (!p)
+	addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
+	/* Use the vm area unlocked, assuming the caller
+	   ensures there isn't another iounmap for the same address
+	   in parallel. Reuse of the virtual address is prevented by
+	   leaving it in the global lists until we're done with it.
+	   cpa takes care of the direct mappings. */
+	read_lock(&vmlist_lock);
+	for (p = vmlist; p; p = p->next) {
+		if (p->addr == addr)
+			break;
+	}
+	read_unlock(&vmlist_lock);
+
+	if (!p) {
 		printk("iounmap: bad address %p\n", addr);
-	else if (p->flags >> 20)
+		dump_stack();
+		return;
+	}
+
+	/* Reset the direct mapping. Can block */
+	if (p->flags >> 20)
 		ioremap_change_attr(p->phys_addr, p->size, 0);
-	write_unlock(&vmlist_lock);
+
+	/* Finally remove it */
+	o = remove_vm_area((void *)addr);
+	BUG_ON(p != o || o == NULL);
 	kfree(p);
 }
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index 65417b040c1b..a5663e0bb01c 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -108,6 +108,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 	limit >>= 16;
 	limit <<= 24;
 	limit |= (1<<24)-1;
+	limit++;
 
 	if (limit > end_pfn << PAGE_SHIFT)
 		limit = end_pfn << PAGE_SHIFT;
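
The `limit++` added above turns the register's inclusive top address (its low bits just filled with ones) into an exclusive end, so the comparison against `end_pfn << PAGE_SHIFT` lines up. A small worked example with a hypothetical node limit decoding to 1GB:

    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical top bits read from the K8 limit register. */
            unsigned long limit = 0x3f000000UL;
            limit |= (1UL << 24) - 1;  /* 0x3fffffff: last byte, inclusive */
            limit++;                   /* 0x40000000: exclusive end = 1GB  */
            printf("node ends at %#lx\n", limit);
            return 0;
    }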
diff --git a/arch/x86_64/mm/mmap.c b/arch/x86_64/mm/mmap.c
new file mode 100644
index 000000000000..43e9b99bdf25
--- /dev/null
+++ b/arch/x86_64/mm/mmap.c
@@ -0,0 +1,30 @@
+/* Copyright 2005 Andi Kleen, SuSE Labs.
+ * Licensed under GPL, v.2
+ */
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/random.h>
+#include <asm/ia32.h>
+
+/* Notebook: move the mmap code from sys_x86_64.c over here. */
+
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+#ifdef CONFIG_IA32_EMULATION
+	if (current_thread_info()->flags & _TIF_IA32)
+		return ia32_pick_mmap_layout(mm);
+#endif
+	mm->mmap_base = TASK_UNMAPPED_BASE;
+	if (current->flags & PF_RANDOMIZE) {
+		/* Add 28bit randomness which is about 40bits of address space
+		   because mmap base has to be page aligned.
+		   or ~1/128 of the total user VM
+		   (total user address space is 47bits) */
+		unsigned rnd = get_random_int() & 0xfffffff;
+		mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT;
+	}
+	mm->get_unmapped_area = arch_get_unmapped_area;
+	mm->unmap_area = arch_unmap_area;
+}
+
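
To make the 28-bit comment in the new file concrete: the random value is masked to 28 bits and shifted left by PAGE_SHIFT (12), so the mmap base can land anywhere in a 2^40-byte (1TB) window, which is 1/128 of the 2^47-byte user address space. A quick check:

    #include <stdio.h>

    int main(void)
    {
            unsigned long PAGE_SHIFT_ = 12;        /* 4KB pages */
            unsigned long rnd = 0xfffffffUL;       /* max 28-bit value */
            unsigned long span = (rnd + 1) << PAGE_SHIFT_; /* 2^40 bytes */

            printf("randomization window: %lu bytes (2^40)\n", span);
            printf("fraction of 47-bit VA: 1/%lu\n",
                   (1UL << 47) / span);            /* prints 128 */
            return 0;
    }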
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 214803821001..6ef9f9a76235 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -38,38 +38,59 @@ cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
38 38
39int numa_off __initdata; 39int numa_off __initdata;
40 40
41int __init compute_hash_shift(struct node *nodes, int numnodes) 41
42/*
43 * Given a shift value, try to populate memnodemap[]
44 * Returns :
45 * 1 if OK
46 * 0 if memnodmap[] too small (of shift too small)
47 * -1 if node overlap or lost ram (shift too big)
48 */
49static int __init
50populate_memnodemap(const struct node *nodes, int numnodes, int shift)
42{ 51{
43 int i; 52 int i;
44 int shift = 20; 53 int res = -1;
45 unsigned long addr,maxend=0; 54 unsigned long addr, end;
46
47 for (i = 0; i < numnodes; i++)
48 if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend))
49 maxend = nodes[i].end;
50 55
51 while ((1UL << shift) < (maxend / NODEMAPSIZE)) 56 if (shift >= 64)
52 shift++; 57 return -1;
53 58 memset(memnodemap, 0xff, sizeof(memnodemap));
54 printk (KERN_DEBUG"Using %d for the hash shift. Max adder is %lx \n",
55 shift,maxend);
56 memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
57 for (i = 0; i < numnodes; i++) { 59 for (i = 0; i < numnodes; i++) {
58 if (nodes[i].start == nodes[i].end) 60 addr = nodes[i].start;
61 end = nodes[i].end;
62 if (addr >= end)
59 continue; 63 continue;
60 for (addr = nodes[i].start; 64 if ((end >> shift) >= NODEMAPSIZE)
61 addr < nodes[i].end; 65 return 0;
62 addr += (1UL << shift)) { 66 do {
63 if (memnodemap[addr >> shift] != 0xff) { 67 if (memnodemap[addr >> shift] != 0xff)
64 printk(KERN_INFO
65 "Your memory is not aligned you need to rebuild your kernel "
66 "with a bigger NODEMAPSIZE shift=%d adder=%lu\n",
67 shift,addr);
68 return -1; 68 return -1;
69 }
70 memnodemap[addr >> shift] = i; 69 memnodemap[addr >> shift] = i;
71 } 70 addr += (1UL << shift);
71 } while (addr < end);
72 res = 1;
72 } 73 }
74 return res;
75}
76
77int __init compute_hash_shift(struct node *nodes, int numnodes)
78{
79 int shift = 20;
80
81 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
82 shift++;
83
84 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
85 shift);
86
87 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
88 printk(KERN_INFO
89 "Your memory is not aligned you need to rebuild your kernel "
90 "with a bigger NODEMAPSIZE shift=%d\n",
91 shift);
92 return -1;
93 }
73 return shift; 94 return shift;
74} 95}
75 96
@@ -89,12 +110,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
89 110
90 start = round_up(start, ZONE_ALIGN); 111 start = round_up(start, ZONE_ALIGN);
91 112
92 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); 113 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
93 114
94 start_pfn = start >> PAGE_SHIFT; 115 start_pfn = start >> PAGE_SHIFT;
95 end_pfn = end >> PAGE_SHIFT; 116 end_pfn = end >> PAGE_SHIFT;
96 117
97 memory_present(nodeid, start_pfn, end_pfn);
98 nodedata_phys = find_e820_area(start, end, pgdat_size); 118 nodedata_phys = find_e820_area(start, end, pgdat_size);
99 if (nodedata_phys == -1L) 119 if (nodedata_phys == -1L)
100 panic("Cannot find memory pgdat in node %d\n", nodeid); 120 panic("Cannot find memory pgdat in node %d\n", nodeid);
@@ -132,29 +152,14 @@ void __init setup_node_zones(int nodeid)
132 unsigned long start_pfn, end_pfn; 152 unsigned long start_pfn, end_pfn;
133 unsigned long zones[MAX_NR_ZONES]; 153 unsigned long zones[MAX_NR_ZONES];
134 unsigned long holes[MAX_NR_ZONES]; 154 unsigned long holes[MAX_NR_ZONES];
135 unsigned long dma_end_pfn;
136 155
137 memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); 156 start_pfn = node_start_pfn(nodeid);
138 memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES); 157 end_pfn = node_end_pfn(nodeid);
139 158
140 start_pfn = node_start_pfn(nodeid); 159 Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
141 end_pfn = node_end_pfn(nodeid); 160 nodeid, start_pfn, end_pfn);
142 161
143 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn); 162 size_zones(zones, holes, start_pfn, end_pfn);
144
145 /* All nodes > 0 have a zero length zone DMA */
146 dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
147 if (start_pfn < dma_end_pfn) {
148 zones[ZONE_DMA] = dma_end_pfn - start_pfn;
149 holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn);
150 zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
151 holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn);
152
153 } else {
154 zones[ZONE_NORMAL] = end_pfn - start_pfn;
155 holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn);
156 }
157
158 free_area_init_node(nodeid, NODE_DATA(nodeid), zones, 163 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
159 start_pfn, holes); 164 start_pfn, holes);
160} 165}
@@ -171,7 +176,7 @@ void __init numa_init_array(void)
171 for (i = 0; i < NR_CPUS; i++) { 176 for (i = 0; i < NR_CPUS; i++) {
172 if (cpu_to_node[i] != NUMA_NO_NODE) 177 if (cpu_to_node[i] != NUMA_NO_NODE)
173 continue; 178 continue;
174 cpu_to_node[i] = rr; 179 numa_set_node(i, rr);
175 rr = next_node(rr, node_online_map); 180 rr = next_node(rr, node_online_map);
176 if (rr == MAX_NUMNODES) 181 if (rr == MAX_NUMNODES)
177 rr = first_node(node_online_map); 182 rr = first_node(node_online_map);
@@ -195,7 +200,7 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
195 while ((x << 1) < sz) 200 while ((x << 1) < sz)
196 x <<= 1; 201 x <<= 1;
197 if (x < sz/2) 202 if (x < sz/2)
198 printk("Numa emulation unbalanced. Complain to maintainer\n"); 203 printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
199 sz = x; 204 sz = x;
200 } 205 }
201 206
@@ -205,8 +210,6 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
205 if (i == numa_fake-1) 210 if (i == numa_fake-1)
206 sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; 211 sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
207 nodes[i].end = nodes[i].start + sz; 212 nodes[i].end = nodes[i].start + sz;
208 if (i != numa_fake-1)
209 nodes[i].end--;
210 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", 213 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
211 i, 214 i,
212 nodes[i].start, nodes[i].end, 215 nodes[i].start, nodes[i].end,
@@ -257,7 +260,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
257 nodes_clear(node_online_map); 260 nodes_clear(node_online_map);
258 node_set_online(0); 261 node_set_online(0);
259 for (i = 0; i < NR_CPUS; i++) 262 for (i = 0; i < NR_CPUS; i++)
260 cpu_to_node[i] = 0; 263 numa_set_node(i, 0);
261 node_to_cpumask[0] = cpumask_of_cpu(0); 264 node_to_cpumask[0] = cpumask_of_cpu(0);
262 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); 265 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
263} 266}
@@ -267,6 +270,12 @@ __cpuinit void numa_add_cpu(int cpu)
267 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); 270 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
268} 271}
269 272
273void __cpuinit numa_set_node(int cpu, int node)
274{
275 cpu_pda(cpu)->nodenumber = node;
276 cpu_to_node[cpu] = node;
277}
278
270unsigned long __init numa_free_all_bootmem(void) 279unsigned long __init numa_free_all_bootmem(void)
271{ 280{
272 int i; 281 int i;
@@ -277,9 +286,26 @@ unsigned long __init numa_free_all_bootmem(void)
277 return pages; 286 return pages;
278} 287}
279 288
289#ifdef CONFIG_SPARSEMEM
290static void __init arch_sparse_init(void)
291{
292 int i;
293
294 for_each_online_node(i)
295 memory_present(i, node_start_pfn(i), node_end_pfn(i));
296
297 sparse_init();
298}
299#else
300#define arch_sparse_init() do {} while (0)
301#endif
302
280void __init paging_init(void) 303void __init paging_init(void)
281{ 304{
282 int i; 305 int i;
306
307 arch_sparse_init();
308
283 for_each_online_node(i) { 309 for_each_online_node(i) {
284 setup_node_zones(i); 310 setup_node_zones(i);
285 } 311 }
@@ -304,8 +330,69 @@ __init int numa_setup(char *opt)
304 return 1; 330 return 1;
305} 331}
306 332
333/*
334 * Setup early cpu_to_node.
335 *
336 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
337 * and apicid_to_node[] tables have valid entries for a CPU.
338 * This means we skip cpu_to_node[] initialisation for NUMA
339 * emulation and faking node case (when running a kernel compiled
340 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
341 * is already initialized in a round robin manner at numa_init_array,
342 * prior to this call, and this initialization is good enough
343 * for the fake NUMA cases.
344 */
345void __init init_cpu_to_node(void)
346{
347 int i;
348 for (i = 0; i < NR_CPUS; i++) {
349 u8 apicid = x86_cpu_to_apicid[i];
350 if (apicid == BAD_APICID)
351 continue;
352 if (apicid_to_node[apicid] == NUMA_NO_NODE)
353 continue;
354 cpu_to_node[i] = apicid_to_node[apicid];
355 }
356}
357
307EXPORT_SYMBOL(cpu_to_node); 358EXPORT_SYMBOL(cpu_to_node);
308EXPORT_SYMBOL(node_to_cpumask); 359EXPORT_SYMBOL(node_to_cpumask);
309EXPORT_SYMBOL(memnode_shift); 360EXPORT_SYMBOL(memnode_shift);
310EXPORT_SYMBOL(memnodemap); 361EXPORT_SYMBOL(memnodemap);
311EXPORT_SYMBOL(node_data); 362EXPORT_SYMBOL(node_data);
363
364#ifdef CONFIG_DISCONTIGMEM
365/*
366 * Functions to convert PFNs from/to per node page addresses.
367 * These are out of line because they are quite big.
368 * They could be all tuned by pre caching more state.
369 * Should do that.
370 */
371
372/* Requires pfn_valid(pfn) to be true */
373struct page *pfn_to_page(unsigned long pfn)
374{
375 int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT);
376 return (pfn - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map;
377}
378EXPORT_SYMBOL(pfn_to_page);
379
380unsigned long page_to_pfn(struct page *page)
381{
382 return (long)(((page) - page_zone(page)->zone_mem_map) +
383 page_zone(page)->zone_start_pfn);
384}
385EXPORT_SYMBOL(page_to_pfn);
386
387int pfn_valid(unsigned long pfn)
388{
389 unsigned nid;
390 if (pfn >= num_physpages)
391 return 0;
392 nid = pfn_to_nid(pfn);
393 if (nid == 0xff)
394 return 0;
395 return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
396}
397EXPORT_SYMBOL(pfn_valid);
398#endif
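
The two out-of-line conversions above must be exact inverses for every valid pfn, since generic mm code freely round-trips between struct page pointers and pfns. A sketch of that invariant (hypothetical self-check, not in the patch):

/* Sketch: DISCONTIGMEM pfn <-> page conversions must round-trip. */
static void check_pfn_roundtrip(unsigned long pfn)
{
	if (!pfn_valid(pfn))
		return;
	BUG_ON(page_to_pfn(pfn_to_page(pfn)) != pfn);
}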
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index b90e8fe9eeb0..35f1f1aab063 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -128,6 +128,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
128 pte_t *kpte; 128 pte_t *kpte;
129 struct page *kpte_page; 129 struct page *kpte_page;
130 unsigned kpte_flags; 130 unsigned kpte_flags;
131 pgprot_t ref_prot2;
131 kpte = lookup_address(address); 132 kpte = lookup_address(address);
132 if (!kpte) return 0; 133 if (!kpte) return 0;
133 kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); 134 kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
@@ -140,10 +141,14 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
140 * split_large_page will take the reference for this change_page_attr 141 * split_large_page will take the reference for this change_page_attr
141 * on the split page. 142 * on the split page.
142 */ 143 */
143 struct page *split = split_large_page(address, prot, ref_prot); 144
145 struct page *split;
146 ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
147
148 split = split_large_page(address, prot, ref_prot2);
144 if (!split) 149 if (!split)
145 return -ENOMEM; 150 return -ENOMEM;
146 set_pte(kpte,mk_pte(split, ref_prot)); 151 set_pte(kpte,mk_pte(split, ref_prot2));
147 kpte_page = split; 152 kpte_page = split;
148 } 153 }
149 get_page(kpte_page); 154 get_page(kpte_page);
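
The fix derives ref_prot2 from the live large-page PTE rather than trusting the caller's ref_prot, clearing only the PSE bit so the 4KB PTEs inherit whatever protections the 2MB mapping really had. A sketch of the bit manipulation (bit position assumed from the x86_64 pgtable headers, where _PAGE_BIT_PSE is 7):

/* Sketch: strip PSE so a 2MB-page protection is valid for 4KB PTEs. */
static pgprot_t make_4k_prot(pgprot_t large)
{
	return __pgprot(pgprot_val(large) & ~(1UL << 7 /* _PAGE_BIT_PSE */));
}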
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 4b2e844c15a7..8b7f85608fa8 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -17,21 +17,23 @@
17#include <linux/topology.h> 17#include <linux/topology.h>
18#include <asm/proto.h> 18#include <asm/proto.h>
19#include <asm/numa.h> 19#include <asm/numa.h>
20#include <asm/e820.h>
20 21
21static struct acpi_table_slit *acpi_slit; 22static struct acpi_table_slit *acpi_slit;
22 23
23static nodemask_t nodes_parsed __initdata; 24static nodemask_t nodes_parsed __initdata;
24static nodemask_t nodes_found __initdata; 25static nodemask_t nodes_found __initdata;
25static struct node nodes[MAX_NUMNODES] __initdata; 26static struct node nodes[MAX_NUMNODES] __initdata;
26static __u8 pxm2node[256] = { [0 ... 255] = 0xff }; 27static u8 pxm2node[256] = { [0 ... 255] = 0xff };
27 28
28static int node_to_pxm(int n); 29static int node_to_pxm(int n);
29 30
30int pxm_to_node(int pxm) 31int pxm_to_node(int pxm)
31{ 32{
32 if ((unsigned)pxm >= 256) 33 if ((unsigned)pxm >= 256)
33 return 0; 34 return -1;
34 return pxm2node[pxm]; 35 /* Extend 0xff to (int)-1 */
36 return (signed char)pxm2node[pxm];
35} 37}
36 38
37static __init int setup_node(int pxm) 39static __init int setup_node(int pxm)
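
The cast in pxm_to_node() relies on C sign extension: pxm2node[] keeps 0xff as its "no node" marker, and going through signed char widens that to -1 while leaving real node numbers untouched. A standalone illustration (plain C, not from the patch):

#include <assert.h>

int main(void)
{
	unsigned char raw = 0xff;	/* the pxm2node[] "no node" marker */
	assert((signed char)raw == -1);	/* 0xff sign-extends to -1 */

	raw = 3;			/* an ordinary node number */
	assert((signed char)raw == 3);	/* small values pass through */
	return 0;
}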
@@ -71,8 +73,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end)
71 nd->start = nd->end; 73 nd->start = nd->end;
72 } 74 }
73 if (nd->end > end) { 75 if (nd->end > end) {
74 if (!(end & 0xfff))
75 end--;
76 nd->end = end; 76 nd->end = end;
77 if (nd->start > nd->end) 77 if (nd->start > nd->end)
78 nd->start = nd->end; 78 nd->start = nd->end;
@@ -93,9 +93,36 @@ static __init inline int srat_disabled(void)
93 return numa_off || acpi_numa < 0; 93 return numa_off || acpi_numa < 0;
94} 94}
95 95
96/*
 97 * A lot of BIOSes fill in 10 (= no distance) everywhere. This messes
 98 * up the NUMA heuristics, which want the local node to have a smaller
 99 * distance than the others.
100 * Do some quick checks here and only use the SLIT if it passes.
101 */
102static __init int slit_valid(struct acpi_table_slit *slit)
103{
104 int i, j;
105 int d = slit->localities;
106 for (i = 0; i < d; i++) {
107 for (j = 0; j < d; j++) {
108 u8 val = slit->entry[d*i + j];
109 if (i == j) {
110 if (val != 10)
111 return 0;
112 } else if (val <= 10)
113 return 0;
114 }
115 }
116 return 1;
117}
118
96/* Callback for SLIT parsing */ 119/* Callback for SLIT parsing */
97void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 120void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
98{ 121{
122 if (!slit_valid(slit)) {
123 printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
124 return;
125 }
99 acpi_slit = slit; 126 acpi_slit = slit;
100} 127}
101 128
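
For reference, here is the smallest locality matrix slit_valid() accepts next to the degenerate one it guards against (example values, two nodes, row-major as in slit->entry[]):

/* Accepted: self-distance exactly 10, remote distances greater. */
static const unsigned char good_slit[4] = {
	10, 20,
	20, 10,
};

/* Rejected: the common BIOS bug of 10 (= no distance) everywhere. */
static const unsigned char bad_slit[4] = {
	10, 10,
	10, 10,
};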
@@ -166,18 +193,43 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
166 if (nd->end < end) 193 if (nd->end < end)
167 nd->end = end; 194 nd->end = end;
168 } 195 }
169 if (!(nd->end & 0xfff))
170 nd->end--;
171 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, 196 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
172 nd->start, nd->end); 197 nd->start, nd->end);
173} 198}
174 199
200/* Sanity check to catch more bad SRATs (they are amazingly common).
201 Make sure the PXMs cover all memory. */
202static int nodes_cover_memory(void)
203{
204 int i;
205 unsigned long pxmram, e820ram;
206
207 pxmram = 0;
208 for_each_node_mask(i, nodes_parsed) {
209 unsigned long s = nodes[i].start >> PAGE_SHIFT;
210 unsigned long e = nodes[i].end >> PAGE_SHIFT;
211 pxmram += e - s;
212 pxmram -= e820_hole_size(s, e);
213 }
214
215 e820ram = end_pfn - e820_hole_size(0, end_pfn);
216 if (pxmram < e820ram) {
217 printk(KERN_ERR
218 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
219 (pxmram << PAGE_SHIFT) >> 20,
220 (e820ram << PAGE_SHIFT) >> 20);
221 return 0;
222 }
223 return 1;
224}
225
175void __init acpi_numa_arch_fixup(void) {} 226void __init acpi_numa_arch_fixup(void) {}
176 227
177/* Use the information discovered above to actually set up the nodes. */ 228/* Use the information discovered above to actually set up the nodes. */
178int __init acpi_scan_nodes(unsigned long start, unsigned long end) 229int __init acpi_scan_nodes(unsigned long start, unsigned long end)
179{ 230{
180 int i; 231 int i;
232
181 if (acpi_numa <= 0) 233 if (acpi_numa <= 0)
182 return -1; 234 return -1;
183 235
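
The coverage check in nodes_cover_memory() compares page counts with e820 holes subtracted on both sides. A worked example with made-up numbers (4KB pages):

/* Hypothetical box: e820 reports pfns 0..1048575 (4GB) with a
 * 32768-pfn (128MB) hole; the SRAT PXMs only span 786432 pfns (3GB).
 */
unsigned long e820ram = 1048576 - 32768;	/* 1015808 usable pfns */
unsigned long pxmram = 786432;

if (pxmram < e820ram)	/* 3072MB < 3968MB: SRAT rejected */
	bad_srat();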
@@ -188,6 +240,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
188 node_clear(i, nodes_parsed); 240 node_clear(i, nodes_parsed);
189 } 241 }
190 242
243 if (!nodes_cover_memory()) {
244 bad_srat();
245 return -1;
246 }
247
191 memnode_shift = compute_hash_shift(nodes, nodes_weight(nodes_parsed)); 248 memnode_shift = compute_hash_shift(nodes, nodes_weight(nodes_parsed));
192 if (memnode_shift < 0) { 249 if (memnode_shift < 0) {
193 printk(KERN_ERR 250 printk(KERN_ERR
@@ -203,7 +260,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
203 if (cpu_to_node[i] == NUMA_NO_NODE) 260 if (cpu_to_node[i] == NUMA_NO_NODE)
204 continue; 261 continue;
205 if (!node_isset(cpu_to_node[i], nodes_parsed)) 262 if (!node_isset(cpu_to_node[i], nodes_parsed))
206 cpu_to_node[i] = NUMA_NO_NODE; 263 numa_set_node(i, NUMA_NO_NODE);
207 } 264 }
208 numa_init_array(); 265 numa_init_array();
209 return 0; 266 return 0;
diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile
index bb34e5ef916c..a8f75a2a0f6f 100644
--- a/arch/x86_64/pci/Makefile
+++ b/arch/x86_64/pci/Makefile
@@ -11,7 +11,7 @@ obj-y += fixup.o
11obj-$(CONFIG_ACPI) += acpi.o 11obj-$(CONFIG_ACPI) += acpi.o
12obj-y += legacy.o irq.o common.o 12obj-y += legacy.o irq.o common.o
13# mmconfig has a 64bit special 13# mmconfig has a 64bit special
14obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o 14obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o
15 15
16obj-$(CONFIG_NUMA) += k8-bus.o 16obj-$(CONFIG_NUMA) += k8-bus.o
17 17
diff --git a/arch/x86_64/pci/Makefile-BUS b/arch/x86_64/pci/Makefile-BUS
deleted file mode 100644
index 4f0c05abd408..000000000000
--- a/arch/x86_64/pci/Makefile-BUS
+++ /dev/null
@@ -1,22 +0,0 @@
1#
2# Makefile for X86_64 specific PCI routines
3#
4# Reuse the i386 PCI subsystem
5#
6CFLAGS += -I arch/i386/pci
7
8obj-y := i386.o
9obj-$(CONFIG_PCI_DIRECT)+= direct.o
10obj-y += fixup.o
11obj-$(CONFIG_ACPI) += acpi.o
12obj-y += legacy.o irq.o common.o
13# mmconfig has a 64bit special
14obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o
15
16direct-y += ../../i386/pci/direct.o
17acpi-y += ../../i386/pci/acpi.o
18legacy-y += ../../i386/pci/legacy.o
19irq-y += ../../i386/pci/irq.o
20common-y += ../../i386/pci/common.o
21fixup-y += ../../i386/pci/fixup.o
22i386-y += ../../i386/pci/i386.o
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index a0838c4a94e4..f16c0d57c552 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -8,18 +8,21 @@
8#include <linux/pci.h> 8#include <linux/pci.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/acpi.h> 10#include <linux/acpi.h>
11#include <linux/bitmap.h>
11#include "pci.h" 12#include "pci.h"
12 13
13#define MMCONFIG_APER_SIZE (256*1024*1024) 14#define MMCONFIG_APER_SIZE (256*1024*1024)
14 15
16static DECLARE_BITMAP(fallback_slots, 32);
17
15/* Static virtual mapping of the MMCONFIG aperture */ 18/* Static virtual mapping of the MMCONFIG aperture */
16struct mmcfg_virt { 19struct mmcfg_virt {
17 struct acpi_table_mcfg_config *cfg; 20 struct acpi_table_mcfg_config *cfg;
18 char *virt; 21 char __iomem *virt;
19}; 22};
20static struct mmcfg_virt *pci_mmcfg_virt; 23static struct mmcfg_virt *pci_mmcfg_virt;
21 24
22static char *get_virt(unsigned int seg, int bus) 25static char __iomem *get_virt(unsigned int seg, unsigned bus)
23{ 26{
24 int cfg_num = -1; 27 int cfg_num = -1;
25 struct acpi_table_mcfg_config *cfg; 28 struct acpi_table_mcfg_config *cfg;
@@ -27,10 +30,9 @@ static char *get_virt(unsigned int seg, int bus)
27 while (1) { 30 while (1) {
28 ++cfg_num; 31 ++cfg_num;
29 if (cfg_num >= pci_mmcfg_config_num) { 32 if (cfg_num >= pci_mmcfg_config_num) {
30 /* something bad is going on, no cfg table is found. */ 33 /* Not found - fall back to type 1. This happens
31 /* so we fall back to the old way we used to do this */ 34 e.g. on the internal devices of a K8 northbridge. */
32 /* and just rely on the first entry to be correct. */ 35 return NULL;
33 return pci_mmcfg_virt[0].virt;
34 } 36 }
35 cfg = pci_mmcfg_virt[cfg_num].cfg; 37 cfg = pci_mmcfg_virt[cfg_num].cfg;
36 if (cfg->pci_segment_group_number != seg) 38 if (cfg->pci_segment_group_number != seg)
@@ -41,20 +43,30 @@ static char *get_virt(unsigned int seg, int bus)
41 } 43 }
42} 44}
43 45
44static inline char *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) 46static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
45{ 47{
46 48 char __iomem *addr;
47 return get_virt(seg, bus) + ((bus << 20) | (devfn << 12)); 49 if (seg == 0 && bus == 0 && test_bit(PCI_SLOT(devfn), &fallback_slots))
50 return NULL;
51 addr = get_virt(seg, bus);
52 if (!addr)
53 return NULL;
54 return addr + ((bus << 20) | (devfn << 12));
48} 55}
49 56
50static int pci_mmcfg_read(unsigned int seg, unsigned int bus, 57static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
51 unsigned int devfn, int reg, int len, u32 *value) 58 unsigned int devfn, int reg, int len, u32 *value)
52{ 59{
53 char *addr = pci_dev_base(seg, bus, devfn); 60 char __iomem *addr;
54 61
62 /* Why do we have this when nobody checks it. How about a BUG()!? -AK */
55 if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095))) 63 if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095)))
56 return -EINVAL; 64 return -EINVAL;
57 65
66 addr = pci_dev_base(seg, bus, devfn);
67 if (!addr)
68 return pci_conf1_read(seg,bus,devfn,reg,len,value);
69
58 switch (len) { 70 switch (len) {
59 case 1: 71 case 1:
60 *value = readb(addr + reg); 72 *value = readb(addr + reg);
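
pci_dev_base() plus the register offset implements the standard MMCONFIG layout: 1MB of config space per bus, 4KB per device/function, registers indexed within that page. A small sketch of the address arithmetic (helper name hypothetical):

/* Sketch: byte offset into the MMCONFIG aperture for one access. */
static unsigned long mmcfg_offset(unsigned int bus, unsigned int devfn,
				  int reg)
{
	return ((unsigned long)bus << 20) | (devfn << 12) | reg;
}

/* e.g. bus 0, device 24, function 3, register 0x40:
 * devfn = (24 << 3) | 3 = 195, offset = 195 * 4096 + 0x40 = 0xc3040.
 */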
@@ -73,11 +85,16 @@ static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
73static int pci_mmcfg_write(unsigned int seg, unsigned int bus, 85static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
74 unsigned int devfn, int reg, int len, u32 value) 86 unsigned int devfn, int reg, int len, u32 value)
75{ 87{
76 char *addr = pci_dev_base(seg, bus, devfn); 88 char __iomem *addr;
77 89
90 /* Why do we have this when nobody checks it. How about a BUG()!? -AK */
78 if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) 91 if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095)))
79 return -EINVAL; 92 return -EINVAL;
80 93
94 addr = pci_dev_base(seg, bus, devfn);
95 if (!addr)
96 return pci_conf1_write(seg,bus,devfn,reg,len,value);
97
81 switch (len) { 98 switch (len) {
82 case 1: 99 case 1:
83 writeb(value, addr + reg); 100 writeb(value, addr + reg);
@@ -98,6 +115,30 @@ static struct pci_raw_ops pci_mmcfg = {
98 .write = pci_mmcfg_write, 115 .write = pci_mmcfg_write,
99}; 116};
100 117
 118/* K8 systems have some devices (typically in the built-in northbridge)
 119 that are only accessible using type 1 config cycles.
 120 Normally this can be expressed in the MCFG by not listing them
 121 and assigning suitable _SEGs, but some BIOSes don't implement that.
 122 Instead, try to discover all devices on bus 0 that are unreachable
 123 via MMCONFIG and fall back to type 1 for them.
 124 We only do this for bus 0/seg 0. */
125static __init void unreachable_devices(void)
126{
127 int i;
128 for (i = 0; i < 32; i++) {
129 u32 val1;
130 char __iomem *addr;
131
132 pci_conf1_read(0, 0, PCI_DEVFN(i,0), 0, 4, &val1);
133 if (val1 == 0xffffffff)
134 continue;
135 addr = pci_dev_base(0, 0, PCI_DEVFN(i, 0));
 136 if (addr == NULL || readl(addr) != val1) {
137 set_bit(i, &fallback_slots);
138 }
139 }
140}
141
101static int __init pci_mmcfg_init(void) 142static int __init pci_mmcfg_init(void)
102{ 143{
103 int i; 144 int i;
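
The effect of the fallback machinery is invisible to callers of the raw ops. A sketch of one read after this patch (flow hypothetical, register name from pci_regs.h):

u32 vendor;
int rc = pci_mmcfg_read(0, 0, PCI_DEVFN(24, 0), PCI_VENDOR_ID, 4, &vendor);
/* If slot 24 was flagged by unreachable_devices() (the K8 northbridge
 * case), pci_dev_base() returned NULL and the access was transparently
 * routed through pci_conf1_read(); rc and vendor look the same either
 * way.
 */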
@@ -128,6 +169,8 @@ static int __init pci_mmcfg_init(void)
128 printk(KERN_INFO "PCI: Using MMCONFIG at %x\n", pci_mmcfg_config[i].base_address); 169 printk(KERN_INFO "PCI: Using MMCONFIG at %x\n", pci_mmcfg_config[i].base_address);
129 } 170 }
130 171
172 unreachable_devices();
173
131 raw_pci_ops = &pci_mmcfg; 174 raw_pci_ops = &pci_mmcfg;
132 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; 175 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
133 176