aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64')
-rw-r--r--arch/x86_64/Kconfig49
-rw-r--r--arch/x86_64/Makefile10
-rw-r--r--arch/x86_64/boot/compressed/Makefile3
-rw-r--r--arch/x86_64/boot/setup.S4
-rw-r--r--arch/x86_64/defconfig109
-rw-r--r--arch/x86_64/ia32/ia32_aout.c8
-rw-r--r--arch/x86_64/ia32/ia32_signal.c53
-rw-r--r--arch/x86_64/ia32/ia32entry.S9
-rw-r--r--arch/x86_64/ia32/ptrace32.c10
-rw-r--r--arch/x86_64/ia32/sys_ia32.c54
-rw-r--r--arch/x86_64/kernel/Makefile9
-rw-r--r--arch/x86_64/kernel/aperture.c25
-rw-r--r--arch/x86_64/kernel/apic.c229
-rw-r--r--arch/x86_64/kernel/crash.c26
-rw-r--r--arch/x86_64/kernel/e820.c243
-rw-r--r--arch/x86_64/kernel/early-quirks.c122
-rw-r--r--arch/x86_64/kernel/early_printk.c20
-rw-r--r--arch/x86_64/kernel/entry.S63
-rw-r--r--arch/x86_64/kernel/genapic_cluster.c1
-rw-r--r--arch/x86_64/kernel/genapic_flat.c5
-rw-r--r--arch/x86_64/kernel/head.S15
-rw-r--r--arch/x86_64/kernel/head64.c44
-rw-r--r--arch/x86_64/kernel/i8259.c15
-rw-r--r--arch/x86_64/kernel/io_apic.c482
-rw-r--r--arch/x86_64/kernel/ioport.c1
-rw-r--r--arch/x86_64/kernel/irq.c12
-rw-r--r--arch/x86_64/kernel/machine_kexec.c99
-rw-r--r--arch/x86_64/kernel/mce.c29
-rw-r--r--arch/x86_64/kernel/mce_intel.c30
-rw-r--r--arch/x86_64/kernel/mpparse.c238
-rw-r--r--arch/x86_64/kernel/nmi.c840
-rw-r--r--arch/x86_64/kernel/pci-calgary.c142
-rw-r--r--arch/x86_64/kernel/pci-dma.c7
-rw-r--r--arch/x86_64/kernel/pci-gart.c3
-rw-r--r--arch/x86_64/kernel/pci-nommu.c1
-rw-r--r--arch/x86_64/kernel/process.c110
-rw-r--r--arch/x86_64/kernel/ptrace.c29
-rw-r--r--arch/x86_64/kernel/relocate_kernel.S171
-rw-r--r--arch/x86_64/kernel/setup.c248
-rw-r--r--arch/x86_64/kernel/setup64.c45
-rw-r--r--arch/x86_64/kernel/signal.c87
-rw-r--r--arch/x86_64/kernel/smp.c23
-rw-r--r--arch/x86_64/kernel/smpboot.c14
-rw-r--r--arch/x86_64/kernel/stacktrace.c220
-rw-r--r--arch/x86_64/kernel/tce.c12
-rw-r--r--arch/x86_64/kernel/time.c94
-rw-r--r--arch/x86_64/kernel/trampoline.S2
-rw-r--r--arch/x86_64/kernel/traps.c204
-rw-r--r--arch/x86_64/kernel/vmlinux.lds.S25
-rw-r--r--arch/x86_64/kernel/vsmp.c3
-rw-r--r--arch/x86_64/kernel/vsyscall.c98
-rw-r--r--arch/x86_64/kernel/x8664_ksyms.c1
-rw-r--r--arch/x86_64/lib/Makefile2
-rw-r--r--arch/x86_64/lib/clear_page.S47
-rw-r--r--arch/x86_64/lib/copy_page.S53
-rw-r--r--arch/x86_64/lib/copy_user.S153
-rw-r--r--arch/x86_64/lib/csum-copy.S26
-rw-r--r--arch/x86_64/lib/getuser.S32
-rw-r--r--arch/x86_64/lib/iomap_copy.S10
-rw-r--r--arch/x86_64/lib/memcpy.S69
-rw-r--r--arch/x86_64/lib/memset.S79
-rw-r--r--arch/x86_64/lib/putuser.S32
-rw-r--r--arch/x86_64/lib/rwlock.S38
-rw-r--r--arch/x86_64/lib/thunk.S43
-rw-r--r--arch/x86_64/mm/fault.c22
-rw-r--r--arch/x86_64/mm/init.c127
-rw-r--r--arch/x86_64/mm/k8topology.c6
-rw-r--r--arch/x86_64/mm/numa.c32
-rw-r--r--arch/x86_64/mm/pageattr.c24
-rw-r--r--arch/x86_64/mm/srat.c19
-rw-r--r--arch/x86_64/pci/Makefile3
-rw-r--r--arch/x86_64/pci/mmconfig.c44
72 files changed, 2705 insertions, 2552 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 581ce9af0ec8..32ae1378f35c 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -85,6 +85,9 @@ config ARCH_MAY_HAVE_PC_FDC
85 bool 85 bool
86 default y 86 default y
87 87
88config ARCH_POPULATES_NODE_MAP
89 def_bool y
90
88config DMI 91config DMI
89 bool 92 bool
90 default y 93 default y
@@ -109,6 +112,7 @@ config X86_PC
109 112
110config X86_VSMP 113config X86_VSMP
111 bool "Support for ScaleMP vSMP" 114 bool "Support for ScaleMP vSMP"
115 depends on PCI
112 help 116 help
113 Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is 117 Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
114 supposed to run on these EM64T-based machines. Only choose this option 118 supposed to run on these EM64T-based machines. Only choose this option
@@ -167,6 +171,7 @@ config X86_GOOD_APIC
167 171
168config MICROCODE 172config MICROCODE
169 tristate "/dev/cpu/microcode - Intel CPU microcode support" 173 tristate "/dev/cpu/microcode - Intel CPU microcode support"
174 select FW_LOADER
170 ---help--- 175 ---help---
171 If you say Y here the 'File systems' section, you will be 176 If you say Y here the 'File systems' section, you will be
172 able to update the microcode on Intel processors. You will 177 able to update the microcode on Intel processors. You will
@@ -182,6 +187,11 @@ config MICROCODE
182 If you use modprobe or kmod you may also want to add the line 187 If you use modprobe or kmod you may also want to add the line
183 'alias char-major-10-184 microcode' to your /etc/modules.conf file. 188 'alias char-major-10-184 microcode' to your /etc/modules.conf file.
184 189
190config MICROCODE_OLD_INTERFACE
191 bool
192 depends on MICROCODE
193 default y
194
185config X86_MSR 195config X86_MSR
186 tristate "/dev/cpu/*/msr - Model-specific register support" 196 tristate "/dev/cpu/*/msr - Model-specific register support"
187 help 197 help
@@ -295,7 +305,7 @@ config NUMA
295 305
296config K8_NUMA 306config K8_NUMA
297 bool "Old style AMD Opteron NUMA detection" 307 bool "Old style AMD Opteron NUMA detection"
298 depends on NUMA 308 depends on NUMA && PCI
299 default y 309 default y
300 help 310 help
301 Enable K8 NUMA node topology detection. You should say Y here if 311 Enable K8 NUMA node topology detection. You should say Y here if
@@ -425,7 +435,6 @@ config IOMMU
425 435
426config CALGARY_IOMMU 436config CALGARY_IOMMU
427 bool "IBM Calgary IOMMU support" 437 bool "IBM Calgary IOMMU support"
428 default y
429 select SWIOTLB 438 select SWIOTLB
430 depends on PCI && EXPERIMENTAL 439 depends on PCI && EXPERIMENTAL
431 help 440 help
@@ -472,8 +481,7 @@ config X86_MCE_AMD
472 the DRAM Error Threshold. 481 the DRAM Error Threshold.
473 482
474config KEXEC 483config KEXEC
475 bool "kexec system call (EXPERIMENTAL)" 484 bool "kexec system call"
476 depends on EXPERIMENTAL
477 help 485 help
478 kexec is a system call that implements the ability to shutdown your 486 kexec is a system call that implements the ability to shutdown your
479 current kernel, and to start another kernel. It is like a reboot 487 current kernel, and to start another kernel. It is like a reboot
@@ -492,7 +500,14 @@ config CRASH_DUMP
492 bool "kernel crash dumps (EXPERIMENTAL)" 500 bool "kernel crash dumps (EXPERIMENTAL)"
493 depends on EXPERIMENTAL 501 depends on EXPERIMENTAL
494 help 502 help
495 Generate crash dump after being started by kexec. 503 Generate crash dump after being started by kexec.
504 This should be normally only set in special crash dump kernels
505 which are loaded in the main kernel with kexec-tools into
506 a specially reserved region and then later executed after
507 a crash by kdump/kexec. The crash dump kernel must be compiled
508 to a memory address not used by the main kernel or BIOS using
509 PHYSICAL_START.
510 For more details see Documentation/kdump/kdump.txt
496 511
497config PHYSICAL_START 512config PHYSICAL_START
498 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 513 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
@@ -530,6 +545,30 @@ config SECCOMP
530 545
531 If unsure, say Y. Only embedded should say N here. 546 If unsure, say Y. Only embedded should say N here.
532 547
548config CC_STACKPROTECTOR
549 bool "Enable -fstack-protector buffer overflow detection (EXPRIMENTAL)"
550 depends on EXPERIMENTAL
551 help
552 This option turns on the -fstack-protector GCC feature. This
553 feature puts, at the beginning of critical functions, a canary
554 value on the stack just before the return address, and validates
555 the value just before actually returning. Stack based buffer
556 overflows (that need to overwrite this return address) now also
557 overwrite the canary, which gets detected and the attack is then
558 neutralized via a kernel panic.
559
560 This feature requires gcc version 4.2 or above, or a distribution
561 gcc with the feature backported. Older versions are automatically
562 detected and for those versions, this configuration option is ignored.
563
564config CC_STACKPROTECTOR_ALL
565 bool "Use stack-protector for all functions"
566 depends on CC_STACKPROTECTOR
567 help
568 Normally, GCC only inserts the canary value protection for
569 functions that use large-ish on-stack buffers. By enabling
570 this option, GCC will be asked to do this for ALL functions.
571
533source kernel/Kconfig.hz 572source kernel/Kconfig.hz
534 573
535config REORDER 574config REORDER
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 431bb4bc36cd..1c0f18d4f887 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -54,6 +54,16 @@ endif
54cflags-y += $(call cc-option,-funit-at-a-time) 54cflags-y += $(call cc-option,-funit-at-a-time)
55# prevent gcc from generating any FP code by mistake 55# prevent gcc from generating any FP code by mistake
56cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) 56cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
57# do binutils support CFI?
58cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
59AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
60
61# is .cfi_signal_frame supported too?
62cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
63AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
64
65cflags-$(CONFIG_CC_STACKPROTECTOR) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector )
66cflags-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector-all )
57 67
58CFLAGS += $(cflags-y) 68CFLAGS += $(cflags-y)
59CFLAGS_KERNEL += $(cflags-kernel-y) 69CFLAGS_KERNEL += $(cflags-kernel-y)
diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile
index f89d96f11a9f..e70fa6e1da08 100644
--- a/arch/x86_64/boot/compressed/Makefile
+++ b/arch/x86_64/boot/compressed/Makefile
@@ -7,7 +7,8 @@
7# 7#
8 8
9targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o 9targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o
10EXTRA_AFLAGS := -traditional -m32 10EXTRA_AFLAGS := -traditional
11AFLAGS := $(subst -m64,-m32,$(AFLAGS))
11 12
12# cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with 13# cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with
13# -m32 14# -m32
diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S
index a50b631f4d2b..c3bfd223ab49 100644
--- a/arch/x86_64/boot/setup.S
+++ b/arch/x86_64/boot/setup.S
@@ -526,12 +526,12 @@ is_disk1:
526 movw %cs, %ax # aka SETUPSEG 526 movw %cs, %ax # aka SETUPSEG
527 subw $DELTA_INITSEG, %ax # aka INITSEG 527 subw $DELTA_INITSEG, %ax # aka INITSEG
528 movw %ax, %ds 528 movw %ax, %ds
529 movw $0, (0x1ff) # default is no pointing device 529 movb $0, (0x1ff) # default is no pointing device
530 int $0x11 # int 0x11: equipment list 530 int $0x11 # int 0x11: equipment list
531 testb $0x04, %al # check if mouse installed 531 testb $0x04, %al # check if mouse installed
532 jz no_psmouse 532 jz no_psmouse
533 533
534 movw $0xAA, (0x1ff) # device present 534 movb $0xAA, (0x1ff) # device present
535no_psmouse: 535no_psmouse:
536 536
537#include "../../i386/boot/edd.S" 537#include "../../i386/boot/edd.S"
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index 5fb970715941..647610ecb580 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.18-rc4 3# Linux kernel version: 2.6.18-git5
4# Thu Aug 24 21:05:55 2006 4# Tue Sep 26 09:30:47 2006
5# 5#
6CONFIG_X86_64=y 6CONFIG_X86_64=y
7CONFIG_64BIT=y 7CONFIG_64BIT=y
@@ -19,6 +19,7 @@ CONFIG_GENERIC_ISA_DMA=y
19CONFIG_GENERIC_IOMAP=y 19CONFIG_GENERIC_IOMAP=y
20CONFIG_ARCH_MAY_HAVE_PC_FDC=y 20CONFIG_ARCH_MAY_HAVE_PC_FDC=y
21CONFIG_DMI=y 21CONFIG_DMI=y
22CONFIG_AUDIT_ARCH=y
22CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" 23CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
23 24
24# 25#
@@ -38,16 +39,16 @@ CONFIG_SYSVIPC=y
38CONFIG_POSIX_MQUEUE=y 39CONFIG_POSIX_MQUEUE=y
39# CONFIG_BSD_PROCESS_ACCT is not set 40# CONFIG_BSD_PROCESS_ACCT is not set
40# CONFIG_TASKSTATS is not set 41# CONFIG_TASKSTATS is not set
41CONFIG_SYSCTL=y
42# CONFIG_AUDIT is not set 42# CONFIG_AUDIT is not set
43CONFIG_IKCONFIG=y 43CONFIG_IKCONFIG=y
44CONFIG_IKCONFIG_PROC=y 44CONFIG_IKCONFIG_PROC=y
45# CONFIG_CPUSETS is not set 45# CONFIG_CPUSETS is not set
46# CONFIG_RELAY is not set 46# CONFIG_RELAY is not set
47CONFIG_INITRAMFS_SOURCE="" 47CONFIG_INITRAMFS_SOURCE=""
48CONFIG_UID16=y
49CONFIG_CC_OPTIMIZE_FOR_SIZE=y 48CONFIG_CC_OPTIMIZE_FOR_SIZE=y
50# CONFIG_EMBEDDED is not set 49# CONFIG_EMBEDDED is not set
50CONFIG_UID16=y
51CONFIG_SYSCTL=y
51CONFIG_KALLSYMS=y 52CONFIG_KALLSYMS=y
52CONFIG_KALLSYMS_ALL=y 53CONFIG_KALLSYMS_ALL=y
53# CONFIG_KALLSYMS_EXTRA_PASS is not set 54# CONFIG_KALLSYMS_EXTRA_PASS is not set
@@ -56,12 +57,12 @@ CONFIG_PRINTK=y
56CONFIG_BUG=y 57CONFIG_BUG=y
57CONFIG_ELF_CORE=y 58CONFIG_ELF_CORE=y
58CONFIG_BASE_FULL=y 59CONFIG_BASE_FULL=y
59CONFIG_RT_MUTEXES=y
60CONFIG_FUTEX=y 60CONFIG_FUTEX=y
61CONFIG_EPOLL=y 61CONFIG_EPOLL=y
62CONFIG_SHMEM=y 62CONFIG_SHMEM=y
63CONFIG_SLAB=y 63CONFIG_SLAB=y
64CONFIG_VM_EVENT_COUNTERS=y 64CONFIG_VM_EVENT_COUNTERS=y
65CONFIG_RT_MUTEXES=y
65# CONFIG_TINY_SHMEM is not set 66# CONFIG_TINY_SHMEM is not set
66CONFIG_BASE_SMALL=0 67CONFIG_BASE_SMALL=0
67# CONFIG_SLOB is not set 68# CONFIG_SLOB is not set
@@ -160,6 +161,7 @@ CONFIG_X86_MCE_AMD=y
160# CONFIG_CRASH_DUMP is not set 161# CONFIG_CRASH_DUMP is not set
161CONFIG_PHYSICAL_START=0x200000 162CONFIG_PHYSICAL_START=0x200000
162CONFIG_SECCOMP=y 163CONFIG_SECCOMP=y
164# CONFIG_CC_STACKPROTECTOR is not set
163# CONFIG_HZ_100 is not set 165# CONFIG_HZ_100 is not set
164CONFIG_HZ_250=y 166CONFIG_HZ_250=y
165# CONFIG_HZ_1000 is not set 167# CONFIG_HZ_1000 is not set
@@ -307,18 +309,23 @@ CONFIG_IP_PNP_DHCP=y
307CONFIG_INET_DIAG=y 309CONFIG_INET_DIAG=y
308CONFIG_INET_TCP_DIAG=y 310CONFIG_INET_TCP_DIAG=y
309# CONFIG_TCP_CONG_ADVANCED is not set 311# CONFIG_TCP_CONG_ADVANCED is not set
310CONFIG_TCP_CONG_BIC=y 312CONFIG_TCP_CONG_CUBIC=y
313CONFIG_DEFAULT_TCP_CONG="cubic"
311CONFIG_IPV6=y 314CONFIG_IPV6=y
312# CONFIG_IPV6_PRIVACY is not set 315# CONFIG_IPV6_PRIVACY is not set
313# CONFIG_IPV6_ROUTER_PREF is not set 316# CONFIG_IPV6_ROUTER_PREF is not set
314# CONFIG_INET6_AH is not set 317# CONFIG_INET6_AH is not set
315# CONFIG_INET6_ESP is not set 318# CONFIG_INET6_ESP is not set
316# CONFIG_INET6_IPCOMP is not set 319# CONFIG_INET6_IPCOMP is not set
320# CONFIG_IPV6_MIP6 is not set
317# CONFIG_INET6_XFRM_TUNNEL is not set 321# CONFIG_INET6_XFRM_TUNNEL is not set
318# CONFIG_INET6_TUNNEL is not set 322# CONFIG_INET6_TUNNEL is not set
319# CONFIG_INET6_XFRM_MODE_TRANSPORT is not set 323# CONFIG_INET6_XFRM_MODE_TRANSPORT is not set
320# CONFIG_INET6_XFRM_MODE_TUNNEL is not set 324# CONFIG_INET6_XFRM_MODE_TUNNEL is not set
325# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
321# CONFIG_IPV6_TUNNEL is not set 326# CONFIG_IPV6_TUNNEL is not set
327# CONFIG_IPV6_SUBTREES is not set
328# CONFIG_IPV6_MULTIPLE_TABLES is not set
322# CONFIG_NETWORK_SECMARK is not set 329# CONFIG_NETWORK_SECMARK is not set
323# CONFIG_NETFILTER is not set 330# CONFIG_NETFILTER is not set
324 331
@@ -345,7 +352,6 @@ CONFIG_IPV6=y
345# CONFIG_ATALK is not set 352# CONFIG_ATALK is not set
346# CONFIG_X25 is not set 353# CONFIG_X25 is not set
347# CONFIG_LAPB is not set 354# CONFIG_LAPB is not set
348# CONFIG_NET_DIVERT is not set
349# CONFIG_ECONET is not set 355# CONFIG_ECONET is not set
350# CONFIG_WAN_ROUTER is not set 356# CONFIG_WAN_ROUTER is not set
351 357
@@ -487,6 +493,7 @@ CONFIG_IDEDMA_AUTO=y
487# 493#
488# CONFIG_RAID_ATTRS is not set 494# CONFIG_RAID_ATTRS is not set
489CONFIG_SCSI=y 495CONFIG_SCSI=y
496CONFIG_SCSI_NETLINK=y
490# CONFIG_SCSI_PROC_FS is not set 497# CONFIG_SCSI_PROC_FS is not set
491 498
492# 499#
@@ -508,12 +515,13 @@ CONFIG_SCSI_CONSTANTS=y
508# CONFIG_SCSI_LOGGING is not set 515# CONFIG_SCSI_LOGGING is not set
509 516
510# 517#
511# SCSI Transport Attributes 518# SCSI Transports
512# 519#
513CONFIG_SCSI_SPI_ATTRS=y 520CONFIG_SCSI_SPI_ATTRS=y
514CONFIG_SCSI_FC_ATTRS=y 521CONFIG_SCSI_FC_ATTRS=y
515# CONFIG_SCSI_ISCSI_ATTRS is not set 522# CONFIG_SCSI_ISCSI_ATTRS is not set
516CONFIG_SCSI_SAS_ATTRS=y 523CONFIG_SCSI_SAS_ATTRS=y
524# CONFIG_SCSI_SAS_LIBSAS is not set
517 525
518# 526#
519# SCSI low-level drivers 527# SCSI low-level drivers
@@ -532,29 +540,14 @@ CONFIG_AIC79XX_RESET_DELAY_MS=4000
532# CONFIG_AIC79XX_DEBUG_ENABLE is not set 540# CONFIG_AIC79XX_DEBUG_ENABLE is not set
533CONFIG_AIC79XX_DEBUG_MASK=0 541CONFIG_AIC79XX_DEBUG_MASK=0
534# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set 542# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
543# CONFIG_SCSI_AIC94XX is not set
544# CONFIG_SCSI_ARCMSR is not set
535CONFIG_MEGARAID_NEWGEN=y 545CONFIG_MEGARAID_NEWGEN=y
536CONFIG_MEGARAID_MM=y 546CONFIG_MEGARAID_MM=y
537CONFIG_MEGARAID_MAILBOX=y 547CONFIG_MEGARAID_MAILBOX=y
538# CONFIG_MEGARAID_LEGACY is not set 548# CONFIG_MEGARAID_LEGACY is not set
539CONFIG_MEGARAID_SAS=y 549CONFIG_MEGARAID_SAS=y
540CONFIG_SCSI_SATA=y
541CONFIG_SCSI_SATA_AHCI=y
542CONFIG_SCSI_SATA_SVW=y
543CONFIG_SCSI_ATA_PIIX=y
544# CONFIG_SCSI_SATA_MV is not set
545CONFIG_SCSI_SATA_NV=y
546# CONFIG_SCSI_PDC_ADMA is not set
547# CONFIG_SCSI_HPTIOP is not set 550# CONFIG_SCSI_HPTIOP is not set
548# CONFIG_SCSI_SATA_QSTOR is not set
549# CONFIG_SCSI_SATA_PROMISE is not set
550# CONFIG_SCSI_SATA_SX4 is not set
551CONFIG_SCSI_SATA_SIL=y
552# CONFIG_SCSI_SATA_SIL24 is not set
553# CONFIG_SCSI_SATA_SIS is not set
554# CONFIG_SCSI_SATA_ULI is not set
555CONFIG_SCSI_SATA_VIA=y
556# CONFIG_SCSI_SATA_VITESSE is not set
557CONFIG_SCSI_SATA_INTEL_COMBINED=y
558# CONFIG_SCSI_BUSLOGIC is not set 551# CONFIG_SCSI_BUSLOGIC is not set
559# CONFIG_SCSI_DMX3191D is not set 552# CONFIG_SCSI_DMX3191D is not set
560# CONFIG_SCSI_EATA is not set 553# CONFIG_SCSI_EATA is not set
@@ -563,6 +556,7 @@ CONFIG_SCSI_SATA_INTEL_COMBINED=y
563# CONFIG_SCSI_IPS is not set 556# CONFIG_SCSI_IPS is not set
564# CONFIG_SCSI_INITIO is not set 557# CONFIG_SCSI_INITIO is not set
565# CONFIG_SCSI_INIA100 is not set 558# CONFIG_SCSI_INIA100 is not set
559# CONFIG_SCSI_STEX is not set
566# CONFIG_SCSI_SYM53C8XX_2 is not set 560# CONFIG_SCSI_SYM53C8XX_2 is not set
567# CONFIG_SCSI_IPR is not set 561# CONFIG_SCSI_IPR is not set
568# CONFIG_SCSI_QLOGIC_1280 is not set 562# CONFIG_SCSI_QLOGIC_1280 is not set
@@ -573,6 +567,62 @@ CONFIG_SCSI_SATA_INTEL_COMBINED=y
573# CONFIG_SCSI_DEBUG is not set 567# CONFIG_SCSI_DEBUG is not set
574 568
575# 569#
570# Serial ATA (prod) and Parallel ATA (experimental) drivers
571#
572CONFIG_ATA=y
573CONFIG_SATA_AHCI=y
574CONFIG_SATA_SVW=y
575CONFIG_ATA_PIIX=y
576# CONFIG_SATA_MV is not set
577CONFIG_SATA_NV=y
578# CONFIG_PDC_ADMA is not set
579# CONFIG_SATA_QSTOR is not set
580# CONFIG_SATA_PROMISE is not set
581# CONFIG_SATA_SX4 is not set
582CONFIG_SATA_SIL=y
583# CONFIG_SATA_SIL24 is not set
584# CONFIG_SATA_SIS is not set
585# CONFIG_SATA_ULI is not set
586CONFIG_SATA_VIA=y
587# CONFIG_SATA_VITESSE is not set
588CONFIG_SATA_INTEL_COMBINED=y
589# CONFIG_PATA_ALI is not set
590# CONFIG_PATA_AMD is not set
591# CONFIG_PATA_ARTOP is not set
592# CONFIG_PATA_ATIIXP is not set
593# CONFIG_PATA_CMD64X is not set
594# CONFIG_PATA_CS5520 is not set
595# CONFIG_PATA_CS5530 is not set
596# CONFIG_PATA_CYPRESS is not set
597# CONFIG_PATA_EFAR is not set
598# CONFIG_ATA_GENERIC is not set
599# CONFIG_PATA_HPT366 is not set
600# CONFIG_PATA_HPT37X is not set
601# CONFIG_PATA_HPT3X2N is not set
602# CONFIG_PATA_HPT3X3 is not set
603# CONFIG_PATA_IT821X is not set
604# CONFIG_PATA_JMICRON is not set
605# CONFIG_PATA_LEGACY is not set
606# CONFIG_PATA_TRIFLEX is not set
607# CONFIG_PATA_MPIIX is not set
608# CONFIG_PATA_OLDPIIX is not set
609# CONFIG_PATA_NETCELL is not set
610# CONFIG_PATA_NS87410 is not set
611# CONFIG_PATA_OPTI is not set
612# CONFIG_PATA_OPTIDMA is not set
613# CONFIG_PATA_PDC_OLD is not set
614# CONFIG_PATA_QDI is not set
615# CONFIG_PATA_RADISYS is not set
616# CONFIG_PATA_RZ1000 is not set
617# CONFIG_PATA_SC1200 is not set
618# CONFIG_PATA_SERVERWORKS is not set
619# CONFIG_PATA_PDC2027X is not set
620# CONFIG_PATA_SIL680 is not set
621# CONFIG_PATA_SIS is not set
622# CONFIG_PATA_VIA is not set
623# CONFIG_PATA_WINBOND is not set
624
625#
576# Multi-device support (RAID and LVM) 626# Multi-device support (RAID and LVM)
577# 627#
578CONFIG_MD=y 628CONFIG_MD=y
@@ -678,6 +728,7 @@ CONFIG_NET_PCI=y
678# CONFIG_ADAPTEC_STARFIRE is not set 728# CONFIG_ADAPTEC_STARFIRE is not set
679CONFIG_B44=y 729CONFIG_B44=y
680CONFIG_FORCEDETH=y 730CONFIG_FORCEDETH=y
731# CONFIG_FORCEDETH_NAPI is not set
681# CONFIG_DGRS is not set 732# CONFIG_DGRS is not set
682# CONFIG_EEPRO100 is not set 733# CONFIG_EEPRO100 is not set
683CONFIG_E100=y 734CONFIG_E100=y
@@ -714,6 +765,7 @@ CONFIG_E1000=y
714# CONFIG_VIA_VELOCITY is not set 765# CONFIG_VIA_VELOCITY is not set
715CONFIG_TIGON3=y 766CONFIG_TIGON3=y
716CONFIG_BNX2=y 767CONFIG_BNX2=y
768# CONFIG_QLA3XXX is not set
717 769
718# 770#
719# Ethernet (10000 Mbit) 771# Ethernet (10000 Mbit)
@@ -1036,6 +1088,7 @@ CONFIG_SOUND=y
1036# Open Sound System 1088# Open Sound System
1037# 1089#
1038CONFIG_SOUND_PRIME=y 1090CONFIG_SOUND_PRIME=y
1091CONFIG_OSS_OBSOLETE_DRIVER=y
1039# CONFIG_SOUND_BT878 is not set 1092# CONFIG_SOUND_BT878 is not set
1040# CONFIG_SOUND_EMU10K1 is not set 1093# CONFIG_SOUND_EMU10K1 is not set
1041# CONFIG_SOUND_FUSION is not set 1094# CONFIG_SOUND_FUSION is not set
@@ -1046,7 +1099,6 @@ CONFIG_SOUND_ICH=y
1046# CONFIG_SOUND_MSNDPIN is not set 1099# CONFIG_SOUND_MSNDPIN is not set
1047# CONFIG_SOUND_VIA82CXXX is not set 1100# CONFIG_SOUND_VIA82CXXX is not set
1048# CONFIG_SOUND_OSS is not set 1101# CONFIG_SOUND_OSS is not set
1049# CONFIG_SOUND_TVMIXER is not set
1050 1102
1051# 1103#
1052# USB support 1104# USB support
@@ -1203,7 +1255,6 @@ CONFIG_USB_MON=y
1203# InfiniBand support 1255# InfiniBand support
1204# 1256#
1205# CONFIG_INFINIBAND is not set 1257# CONFIG_INFINIBAND is not set
1206# CONFIG_IPATH_CORE is not set
1207 1258
1208# 1259#
1209# EDAC - error detection and reporting (RAS) (EXPERIMENTAL) 1260# EDAC - error detection and reporting (RAS) (EXPERIMENTAL)
@@ -1449,10 +1500,6 @@ CONFIG_DEBUG_STACKOVERFLOW=y
1449# CONFIG_CRYPTO is not set 1500# CONFIG_CRYPTO is not set
1450 1501
1451# 1502#
1452# Hardware crypto devices
1453#
1454
1455#
1456# Library routines 1503# Library routines
1457# 1504#
1458# CONFIG_CRC_CCITT is not set 1505# CONFIG_CRC_CCITT is not set
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index 3bf58af98936..396d3c100011 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -333,7 +333,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
333 return error; 333 return error;
334 } 334 }
335 335
336 error = bprm->file->f_op->read(bprm->file, (char *)text_addr, 336 error = bprm->file->f_op->read(bprm->file,
337 (char __user *)text_addr,
337 ex.a_text+ex.a_data, &pos); 338 ex.a_text+ex.a_data, &pos);
338 if ((signed long)error < 0) { 339 if ((signed long)error < 0) {
339 send_sig(SIGKILL, current, 0); 340 send_sig(SIGKILL, current, 0);
@@ -366,7 +367,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
366 down_write(&current->mm->mmap_sem); 367 down_write(&current->mm->mmap_sem);
367 do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); 368 do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
368 up_write(&current->mm->mmap_sem); 369 up_write(&current->mm->mmap_sem);
369 bprm->file->f_op->read(bprm->file,(char *)N_TXTADDR(ex), 370 bprm->file->f_op->read(bprm->file,
371 (char __user *)N_TXTADDR(ex),
370 ex.a_text+ex.a_data, &pos); 372 ex.a_text+ex.a_data, &pos);
371 flush_icache_range((unsigned long) N_TXTADDR(ex), 373 flush_icache_range((unsigned long) N_TXTADDR(ex),
372 (unsigned long) N_TXTADDR(ex) + 374 (unsigned long) N_TXTADDR(ex) +
@@ -477,7 +479,7 @@ static int load_aout_library(struct file *file)
477 do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); 479 do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
478 up_write(&current->mm->mmap_sem); 480 up_write(&current->mm->mmap_sem);
479 481
480 file->f_op->read(file, (char *)start_addr, 482 file->f_op->read(file, (char __user *)start_addr,
481 ex.a_text + ex.a_data, &pos); 483 ex.a_text + ex.a_data, &pos);
482 flush_icache_range((unsigned long) start_addr, 484 flush_icache_range((unsigned long) start_addr,
483 (unsigned long) start_addr + ex.a_text + ex.a_data); 485 (unsigned long) start_addr + ex.a_text + ex.a_data);
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index 25e5ca22204c..a6ba9951e86c 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -113,25 +113,19 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
113} 113}
114 114
115asmlinkage long 115asmlinkage long
116sys32_sigsuspend(int history0, int history1, old_sigset_t mask, 116sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
117 struct pt_regs *regs)
118{ 117{
119 sigset_t saveset;
120
121 mask &= _BLOCKABLE; 118 mask &= _BLOCKABLE;
122 spin_lock_irq(&current->sighand->siglock); 119 spin_lock_irq(&current->sighand->siglock);
123 saveset = current->blocked; 120 current->saved_sigmask = current->blocked;
124 siginitset(&current->blocked, mask); 121 siginitset(&current->blocked, mask);
125 recalc_sigpending(); 122 recalc_sigpending();
126 spin_unlock_irq(&current->sighand->siglock); 123 spin_unlock_irq(&current->sighand->siglock);
127 124
128 regs->rax = -EINTR; 125 current->state = TASK_INTERRUPTIBLE;
129 while (1) { 126 schedule();
130 current->state = TASK_INTERRUPTIBLE; 127 set_thread_flag(TIF_RESTORE_SIGMASK);
131 schedule(); 128 return -ERESTARTNOHAND;
132 if (do_signal(regs, &saveset))
133 return -EINTR;
134 }
135} 129}
136 130
137asmlinkage long 131asmlinkage long
@@ -437,15 +431,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
437 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 431 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
438 goto give_sigsegv; 432 goto give_sigsegv;
439 433
440 { 434 err |= __put_user(sig, &frame->sig);
441 struct exec_domain *ed = current_thread_info()->exec_domain;
442 err |= __put_user((ed
443 && ed->signal_invmap
444 && sig < 32
445 ? ed->signal_invmap[sig]
446 : sig),
447 &frame->sig);
448 }
449 if (err) 435 if (err)
450 goto give_sigsegv; 436 goto give_sigsegv;
451 437
@@ -492,6 +478,11 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
492 regs->rsp = (unsigned long) frame; 478 regs->rsp = (unsigned long) frame;
493 regs->rip = (unsigned long) ka->sa.sa_handler; 479 regs->rip = (unsigned long) ka->sa.sa_handler;
494 480
481 /* Make -mregparm=3 work */
482 regs->rax = sig;
483 regs->rdx = 0;
484 regs->rcx = 0;
485
495 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 486 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
496 asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 487 asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
497 488
@@ -499,20 +490,20 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
499 regs->ss = __USER32_DS; 490 regs->ss = __USER32_DS;
500 491
501 set_fs(USER_DS); 492 set_fs(USER_DS);
502 regs->eflags &= ~TF_MASK; 493 regs->eflags &= ~TF_MASK;
503 if (test_thread_flag(TIF_SINGLESTEP)) 494 if (test_thread_flag(TIF_SINGLESTEP))
504 ptrace_notify(SIGTRAP); 495 ptrace_notify(SIGTRAP);
505 496
506#if DEBUG_SIG 497#if DEBUG_SIG
507 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", 498 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
508 current->comm, current->pid, frame, regs->rip, frame->pretcode); 499 current->comm, current->pid, frame, regs->rip, frame->pretcode);
509#endif 500#endif
510 501
511 return 1; 502 return 0;
512 503
513give_sigsegv: 504give_sigsegv:
514 force_sigsegv(sig, current); 505 force_sigsegv(sig, current);
515 return 0; 506 return -EFAULT;
516} 507}
517 508
518int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 509int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
@@ -595,18 +586,18 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
595 regs->ss = __USER32_DS; 586 regs->ss = __USER32_DS;
596 587
597 set_fs(USER_DS); 588 set_fs(USER_DS);
598 regs->eflags &= ~TF_MASK; 589 regs->eflags &= ~TF_MASK;
599 if (test_thread_flag(TIF_SINGLESTEP)) 590 if (test_thread_flag(TIF_SINGLESTEP))
600 ptrace_notify(SIGTRAP); 591 ptrace_notify(SIGTRAP);
601 592
602#if DEBUG_SIG 593#if DEBUG_SIG
603 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", 594 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
604 current->comm, current->pid, frame, regs->rip, frame->pretcode); 595 current->comm, current->pid, frame, regs->rip, frame->pretcode);
605#endif 596#endif
606 597
607 return 1; 598 return 0;
608 599
609give_sigsegv: 600give_sigsegv:
610 force_sigsegv(sig, current); 601 force_sigsegv(sig, current);
611 return 0; 602 return -EFAULT;
612} 603}
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d125ed0..b4aa875e175b 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -71,6 +71,7 @@
71 */ 71 */
72ENTRY(ia32_sysenter_target) 72ENTRY(ia32_sysenter_target)
73 CFI_STARTPROC32 simple 73 CFI_STARTPROC32 simple
74 CFI_SIGNAL_FRAME
74 CFI_DEF_CFA rsp,0 75 CFI_DEF_CFA rsp,0
75 CFI_REGISTER rsp,rbp 76 CFI_REGISTER rsp,rbp
76 swapgs 77 swapgs
@@ -186,6 +187,7 @@ ENDPROC(ia32_sysenter_target)
186 */ 187 */
187ENTRY(ia32_cstar_target) 188ENTRY(ia32_cstar_target)
188 CFI_STARTPROC32 simple 189 CFI_STARTPROC32 simple
190 CFI_SIGNAL_FRAME
189 CFI_DEF_CFA rsp,PDA_STACKOFFSET 191 CFI_DEF_CFA rsp,PDA_STACKOFFSET
190 CFI_REGISTER rip,rcx 192 CFI_REGISTER rip,rcx
191 /*CFI_REGISTER rflags,r11*/ 193 /*CFI_REGISTER rflags,r11*/
@@ -293,6 +295,7 @@ ia32_badarg:
293 295
294ENTRY(ia32_syscall) 296ENTRY(ia32_syscall)
295 CFI_STARTPROC simple 297 CFI_STARTPROC simple
298 CFI_SIGNAL_FRAME
296 CFI_DEF_CFA rsp,SS+8-RIP 299 CFI_DEF_CFA rsp,SS+8-RIP
297 /*CFI_REL_OFFSET ss,SS-RIP*/ 300 /*CFI_REL_OFFSET ss,SS-RIP*/
298 CFI_REL_OFFSET rsp,RSP-RIP 301 CFI_REL_OFFSET rsp,RSP-RIP
@@ -370,6 +373,7 @@ ENTRY(ia32_ptregs_common)
370 popq %r11 373 popq %r11
371 CFI_ENDPROC 374 CFI_ENDPROC
372 CFI_STARTPROC32 simple 375 CFI_STARTPROC32 simple
376 CFI_SIGNAL_FRAME
373 CFI_DEF_CFA rsp,SS+8-ARGOFFSET 377 CFI_DEF_CFA rsp,SS+8-ARGOFFSET
374 CFI_REL_OFFSET rax,RAX-ARGOFFSET 378 CFI_REL_OFFSET rax,RAX-ARGOFFSET
375 CFI_REL_OFFSET rcx,RCX-ARGOFFSET 379 CFI_REL_OFFSET rcx,RCX-ARGOFFSET
@@ -703,8 +707,8 @@ ia32_sys_call_table:
703 .quad sys_readlinkat /* 305 */ 707 .quad sys_readlinkat /* 305 */
704 .quad sys_fchmodat 708 .quad sys_fchmodat
705 .quad sys_faccessat 709 .quad sys_faccessat
706 .quad quiet_ni_syscall /* pselect6 for now */ 710 .quad compat_sys_pselect6
707 .quad quiet_ni_syscall /* ppoll for now */ 711 .quad compat_sys_ppoll
708 .quad sys_unshare /* 310 */ 712 .quad sys_unshare /* 310 */
709 .quad compat_sys_set_robust_list 713 .quad compat_sys_set_robust_list
710 .quad compat_sys_get_robust_list 714 .quad compat_sys_get_robust_list
@@ -713,4 +717,5 @@ ia32_sys_call_table:
713 .quad sys_tee 717 .quad sys_tee
714 .quad compat_sys_vmsplice 718 .quad compat_sys_vmsplice
715 .quad compat_sys_move_pages 719 .quad compat_sys_move_pages
720 .quad sys_getcpu
716ia32_syscall_end: 721ia32_syscall_end:
diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c
index 659c0722f6b8..d18198ed636b 100644
--- a/arch/x86_64/ia32/ptrace32.c
+++ b/arch/x86_64/ia32/ptrace32.c
@@ -117,6 +117,10 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 val)
117 if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1) 117 if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1)
118 return -EIO; 118 return -EIO;
119 child->thread.debugreg7 = val; 119 child->thread.debugreg7 = val;
120 if (val)
121 set_tsk_thread_flag(child, TIF_DEBUG);
122 else
123 clear_tsk_thread_flag(child, TIF_DEBUG);
120 break; 124 break;
121 125
122 default: 126 default:
@@ -371,8 +375,10 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
371 ret = -EIO; 375 ret = -EIO;
372 if (!access_ok(VERIFY_READ, u, sizeof(*u))) 376 if (!access_ok(VERIFY_READ, u, sizeof(*u)))
373 break; 377 break;
374 /* no checking to be bug-to-bug compatible with i386 */ 378 /* no checking to be bug-to-bug compatible with i386. */
375 __copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u)); 379 /* but silence warning */
380 if (__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u)))
381 ;
376 set_stopped_child_used_math(child); 382 set_stopped_child_used_math(child);
377 child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; 383 child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
378 ret = 0; 384 ret = 0;
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index 9c130993380d..f280d3665f4b 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -60,6 +60,7 @@
60#include <linux/highuid.h> 60#include <linux/highuid.h>
61#include <linux/vmalloc.h> 61#include <linux/vmalloc.h>
62#include <linux/fsnotify.h> 62#include <linux/fsnotify.h>
63#include <linux/sysctl.h>
63#include <asm/mman.h> 64#include <asm/mman.h>
64#include <asm/types.h> 65#include <asm/types.h>
65#include <asm/uaccess.h> 66#include <asm/uaccess.h>
@@ -389,7 +390,9 @@ sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
389 } 390 }
390 } 391 }
391 set_fs (KERNEL_DS); 392 set_fs (KERNEL_DS);
392 ret = sys_rt_sigprocmask(how, set ? &s : NULL, oset ? &s : NULL, 393 ret = sys_rt_sigprocmask(how,
394 set ? (sigset_t __user *)&s : NULL,
395 oset ? (sigset_t __user *)&s : NULL,
393 sigsetsize); 396 sigsetsize);
394 set_fs (old_fs); 397 set_fs (old_fs);
395 if (ret) return ret; 398 if (ret) return ret;
@@ -541,7 +544,7 @@ sys32_sysinfo(struct sysinfo32 __user *info)
541 int bitcount = 0; 544 int bitcount = 0;
542 545
543 set_fs (KERNEL_DS); 546 set_fs (KERNEL_DS);
544 ret = sys_sysinfo(&s); 547 ret = sys_sysinfo((struct sysinfo __user *)&s);
545 set_fs (old_fs); 548 set_fs (old_fs);
546 549
547 /* Check to see if any memory value is too large for 32-bit and scale 550 /* Check to see if any memory value is too large for 32-bit and scale
@@ -589,7 +592,7 @@ sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *int
589 mm_segment_t old_fs = get_fs (); 592 mm_segment_t old_fs = get_fs ();
590 593
591 set_fs (KERNEL_DS); 594 set_fs (KERNEL_DS);
592 ret = sys_sched_rr_get_interval(pid, &t); 595 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
593 set_fs (old_fs); 596 set_fs (old_fs);
594 if (put_compat_timespec(&t, interval)) 597 if (put_compat_timespec(&t, interval))
595 return -EFAULT; 598 return -EFAULT;
@@ -605,7 +608,7 @@ sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize)
605 mm_segment_t old_fs = get_fs(); 608 mm_segment_t old_fs = get_fs();
606 609
607 set_fs (KERNEL_DS); 610 set_fs (KERNEL_DS);
608 ret = sys_rt_sigpending(&s, sigsetsize); 611 ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize);
609 set_fs (old_fs); 612 set_fs (old_fs);
610 if (!ret) { 613 if (!ret) {
611 switch (_NSIG_WORDS) { 614 switch (_NSIG_WORDS) {
@@ -630,7 +633,7 @@ sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo)
630 if (copy_siginfo_from_user32(&info, uinfo)) 633 if (copy_siginfo_from_user32(&info, uinfo))
631 return -EFAULT; 634 return -EFAULT;
632 set_fs (KERNEL_DS); 635 set_fs (KERNEL_DS);
633 ret = sys_rt_sigqueueinfo(pid, sig, &info); 636 ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info);
634 set_fs (old_fs); 637 set_fs (old_fs);
635 return ret; 638 return ret;
636} 639}
@@ -645,7 +648,7 @@ sys32_pause(void)
645} 648}
646 649
647 650
648#ifdef CONFIG_SYSCTL 651#ifdef CONFIG_SYSCTL_SYSCALL
649struct sysctl_ia32 { 652struct sysctl_ia32 {
650 unsigned int name; 653 unsigned int name;
651 int nlen; 654 int nlen;
@@ -666,9 +669,6 @@ sys32_sysctl(struct sysctl_ia32 __user *args32)
666 size_t oldlen; 669 size_t oldlen;
667 int __user *namep; 670 int __user *namep;
668 long ret; 671 long ret;
669 extern int do_sysctl(int *name, int nlen, void *oldval, size_t *oldlenp,
670 void *newval, size_t newlen);
671
672 672
673 if (copy_from_user(&a32, args32, sizeof (a32))) 673 if (copy_from_user(&a32, args32, sizeof (a32)))
674 return -EFAULT; 674 return -EFAULT;
@@ -692,7 +692,8 @@ sys32_sysctl(struct sysctl_ia32 __user *args32)
692 692
693 set_fs(KERNEL_DS); 693 set_fs(KERNEL_DS);
694 lock_kernel(); 694 lock_kernel();
695 ret = do_sysctl(namep, a32.nlen, oldvalp, &oldlen, newvalp, (size_t) a32.newlen); 695 ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen,
696 newvalp, (size_t) a32.newlen);
696 unlock_kernel(); 697 unlock_kernel();
697 set_fs(old_fs); 698 set_fs(old_fs);
698 699
@@ -743,7 +744,8 @@ sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count)
743 return -EFAULT; 744 return -EFAULT;
744 745
745 set_fs(KERNEL_DS); 746 set_fs(KERNEL_DS);
746 ret = sys_sendfile(out_fd, in_fd, offset ? &of : NULL, count); 747 ret = sys_sendfile(out_fd, in_fd, offset ? (off_t __user *)&of : NULL,
748 count);
747 set_fs(old_fs); 749 set_fs(old_fs);
748 750
749 if (offset && put_user(of, offset)) 751 if (offset && put_user(of, offset))
@@ -778,7 +780,7 @@ asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
778 780
779asmlinkage long sys32_olduname(struct oldold_utsname __user * name) 781asmlinkage long sys32_olduname(struct oldold_utsname __user * name)
780{ 782{
781 int error; 783 int err;
782 784
783 if (!name) 785 if (!name)
784 return -EFAULT; 786 return -EFAULT;
@@ -787,27 +789,31 @@ asmlinkage long sys32_olduname(struct oldold_utsname __user * name)
787 789
788 down_read(&uts_sem); 790 down_read(&uts_sem);
789 791
790 error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); 792 err = __copy_to_user(&name->sysname,&system_utsname.sysname,
791 __put_user(0,name->sysname+__OLD_UTS_LEN); 793 __OLD_UTS_LEN);
792 __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); 794 err |= __put_user(0,name->sysname+__OLD_UTS_LEN);
793 __put_user(0,name->nodename+__OLD_UTS_LEN); 795 err |= __copy_to_user(&name->nodename,&system_utsname.nodename,
794 __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); 796 __OLD_UTS_LEN);
795 __put_user(0,name->release+__OLD_UTS_LEN); 797 err |= __put_user(0,name->nodename+__OLD_UTS_LEN);
796 __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); 798 err |= __copy_to_user(&name->release,&system_utsname.release,
797 __put_user(0,name->version+__OLD_UTS_LEN); 799 __OLD_UTS_LEN);
800 err |= __put_user(0,name->release+__OLD_UTS_LEN);
801 err |= __copy_to_user(&name->version,&system_utsname.version,
802 __OLD_UTS_LEN);
803 err |= __put_user(0,name->version+__OLD_UTS_LEN);
798 { 804 {
799 char *arch = "x86_64"; 805 char *arch = "x86_64";
800 if (personality(current->personality) == PER_LINUX32) 806 if (personality(current->personality) == PER_LINUX32)
801 arch = "i686"; 807 arch = "i686";
802 808
803 __copy_to_user(&name->machine,arch,strlen(arch)+1); 809 err |= __copy_to_user(&name->machine,arch,strlen(arch)+1);
804 } 810 }
805 811
806 up_read(&uts_sem); 812 up_read(&uts_sem);
807 813
808 error = error ? -EFAULT : 0; 814 err = err ? -EFAULT : 0;
809 815
810 return error; 816 return err;
811} 817}
812 818
813long sys32_uname(struct old_utsname __user * name) 819long sys32_uname(struct old_utsname __user * name)
@@ -831,7 +837,7 @@ long sys32_ustat(unsigned dev, struct ustat32 __user *u32p)
831 837
832 seg = get_fs(); 838 seg = get_fs();
833 set_fs(KERNEL_DS); 839 set_fs(KERNEL_DS);
834 ret = sys_ustat(dev,&u); 840 ret = sys_ustat(dev, (struct ustat __user *)&u);
835 set_fs(seg); 841 set_fs(seg);
836 if (ret >= 0) { 842 if (ret >= 0) {
837 if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) || 843 if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) ||
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index b5aaeafc1cd3..3c7cbff04d3d 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -11,7 +11,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \
11 pci-dma.o pci-nommu.o alternative.o 11 pci-dma.o pci-nommu.o alternative.o
12 12
13obj-$(CONFIG_STACKTRACE) += stacktrace.o 13obj-$(CONFIG_STACKTRACE) += stacktrace.o
14obj-$(CONFIG_X86_MCE) += mce.o 14obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o
15obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o 15obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
16obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o 16obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
17obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ 17obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/
@@ -20,8 +20,8 @@ obj-$(CONFIG_X86_MSR) += msr.o
20obj-$(CONFIG_MICROCODE) += microcode.o 20obj-$(CONFIG_MICROCODE) += microcode.o
21obj-$(CONFIG_X86_CPUID) += cpuid.o 21obj-$(CONFIG_X86_CPUID) += cpuid.o
22obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o 22obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
23obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o 23obj-y += apic.o nmi.o
24obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ 24obj-y += io_apic.o mpparse.o \
25 genapic.o genapic_cluster.o genapic_flat.o 25 genapic.o genapic_cluster.o genapic_flat.o
26obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o 26obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
27obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 27obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
@@ -39,12 +39,14 @@ obj-$(CONFIG_K8_NB) += k8.o
39obj-$(CONFIG_AUDIT) += audit.o 39obj-$(CONFIG_AUDIT) += audit.o
40 40
41obj-$(CONFIG_MODULES) += module.o 41obj-$(CONFIG_MODULES) += module.o
42obj-$(CONFIG_PCI) += early-quirks.o
42 43
43obj-y += topology.o 44obj-y += topology.o
44obj-y += intel_cacheinfo.o 45obj-y += intel_cacheinfo.o
45 46
46CFLAGS_vsyscall.o := $(PROFILING) -g0 47CFLAGS_vsyscall.o := $(PROFILING) -g0
47 48
49therm_throt-y += ../../i386/kernel/cpu/mcheck/therm_throt.o
48bootflag-y += ../../i386/kernel/bootflag.o 50bootflag-y += ../../i386/kernel/bootflag.o
49cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o 51cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o
50topology-y += ../../i386/kernel/topology.o 52topology-y += ../../i386/kernel/topology.o
@@ -54,4 +56,3 @@ quirks-y += ../../i386/kernel/quirks.o
54i8237-y += ../../i386/kernel/i8237.o 56i8237-y += ../../i386/kernel/i8237.o
55msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o 57msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o
56alternative-y += ../../i386/kernel/alternative.o 58alternative-y += ../../i386/kernel/alternative.o
57
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index 58af8e73738b..b487396c4c5b 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -17,6 +17,7 @@
17#include <linux/pci_ids.h> 17#include <linux/pci_ids.h>
18#include <linux/pci.h> 18#include <linux/pci.h>
19#include <linux/bitops.h> 19#include <linux/bitops.h>
20#include <linux/ioport.h>
20#include <asm/e820.h> 21#include <asm/e820.h>
21#include <asm/io.h> 22#include <asm/io.h>
22#include <asm/proto.h> 23#include <asm/proto.h>
@@ -33,6 +34,18 @@ int fallback_aper_force __initdata = 0;
33 34
34int fix_aperture __initdata = 1; 35int fix_aperture __initdata = 1;
35 36
37static struct resource gart_resource = {
38 .name = "GART",
39 .flags = IORESOURCE_MEM,
40};
41
42static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
43{
44 gart_resource.start = aper_base;
45 gart_resource.end = aper_base + aper_size - 1;
46 insert_resource(&iomem_resource, &gart_resource);
47}
48
36/* This code runs before the PCI subsystem is initialized, so just 49/* This code runs before the PCI subsystem is initialized, so just
37 access the northbridge directly. */ 50 access the northbridge directly. */
38 51
@@ -48,7 +61,7 @@ static u32 __init allocate_aperture(void)
48 61
49 /* 62 /*
50 * Aperture has to be naturally aligned. This means an 2GB aperture won't 63 * Aperture has to be naturally aligned. This means an 2GB aperture won't
51 * have much chances to find a place in the lower 4GB of memory. 64 * have much chance of finding a place in the lower 4GB of memory.
52 * Unfortunately we cannot move it up because that would make the 65 * Unfortunately we cannot move it up because that would make the
53 * IOMMU useless. 66 * IOMMU useless.
54 */ 67 */
@@ -62,6 +75,7 @@ static u32 __init allocate_aperture(void)
62 } 75 }
63 printk("Mapping aperture over %d KB of RAM @ %lx\n", 76 printk("Mapping aperture over %d KB of RAM @ %lx\n",
64 aper_size >> 10, __pa(p)); 77 aper_size >> 10, __pa(p));
78 insert_aperture_resource((u32)__pa(p), aper_size);
65 return (u32)__pa(p); 79 return (u32)__pa(p);
66} 80}
67 81
@@ -198,7 +212,7 @@ void __init iommu_hole_init(void)
198 u64 aper_base, last_aper_base = 0; 212 u64 aper_base, last_aper_base = 0;
199 int valid_agp = 0; 213 int valid_agp = 0;
200 214
201 if (iommu_aperture_disabled || !fix_aperture) 215 if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed())
202 return; 216 return;
203 217
204 printk("Checking aperture...\n"); 218 printk("Checking aperture...\n");
@@ -233,8 +247,13 @@ void __init iommu_hole_init(void)
233 last_aper_base = aper_base; 247 last_aper_base = aper_base;
234 } 248 }
235 249
236 if (!fix && !fallback_aper_force) 250 if (!fix && !fallback_aper_force) {
251 if (last_aper_base) {
252 unsigned long n = (32 * 1024 * 1024) << last_aper_order;
253 insert_aperture_resource((u32)last_aper_base, n);
254 }
237 return; 255 return;
256 }
238 257
239 if (!fallback_aper_force) 258 if (!fallback_aper_force)
240 aper_alloc = search_agp_bridge(&aper_order, &valid_agp); 259 aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 2b8cef037a65..135ff25e6b44 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -25,6 +25,7 @@
25#include <linux/kernel_stat.h> 25#include <linux/kernel_stat.h>
26#include <linux/sysdev.h> 26#include <linux/sysdev.h>
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/ioport.h>
28 29
29#include <asm/atomic.h> 30#include <asm/atomic.h>
30#include <asm/smp.h> 31#include <asm/smp.h>
@@ -36,13 +37,20 @@
36#include <asm/idle.h> 37#include <asm/idle.h>
37#include <asm/proto.h> 38#include <asm/proto.h>
38#include <asm/timex.h> 39#include <asm/timex.h>
40#include <asm/apic.h>
39 41
42int apic_mapped;
40int apic_verbosity; 43int apic_verbosity;
41int apic_runs_main_timer; 44int apic_runs_main_timer;
42int apic_calibrate_pmtmr __initdata; 45int apic_calibrate_pmtmr __initdata;
43 46
44int disable_apic_timer __initdata; 47int disable_apic_timer __initdata;
45 48
49static struct resource lapic_resource = {
50 .name = "Local APIC",
51 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
52};
53
46/* 54/*
47 * cpu_mask that denotes the CPUs that needs timer interrupt coming in as 55 * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
48 * IPIs in place of local APIC timers 56 * IPIs in place of local APIC timers
@@ -136,72 +144,40 @@ void clear_local_APIC(void)
136 apic_read(APIC_ESR); 144 apic_read(APIC_ESR);
137} 145}
138 146
139void __init connect_bsp_APIC(void)
140{
141 if (pic_mode) {
142 /*
143 * Do not trust the local APIC being empty at bootup.
144 */
145 clear_local_APIC();
146 /*
147 * PIC mode, enable APIC mode in the IMCR, i.e.
148 * connect BSP's local APIC to INT and NMI lines.
149 */
150 apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n");
151 outb(0x70, 0x22);
152 outb(0x01, 0x23);
153 }
154}
155
156void disconnect_bsp_APIC(int virt_wire_setup) 147void disconnect_bsp_APIC(int virt_wire_setup)
157{ 148{
158 if (pic_mode) { 149 /* Go back to Virtual Wire compatibility mode */
159 /* 150 unsigned long value;
160 * Put the board back into PIC mode (has an effect 151
161 * only on certain older boards). Note that APIC 152 /* For the spurious interrupt use vector F, and enable it */
162 * interrupts, including IPIs, won't work beyond 153 value = apic_read(APIC_SPIV);
163 * this point! The only exception are INIT IPIs. 154 value &= ~APIC_VECTOR_MASK;
164 */ 155 value |= APIC_SPIV_APIC_ENABLED;
165 apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n"); 156 value |= 0xf;
166 outb(0x70, 0x22); 157 apic_write(APIC_SPIV, value);
167 outb(0x00, 0x23);
168 }
169 else {
170 /* Go back to Virtual Wire compatibility mode */
171 unsigned long value;
172
173 /* For the spurious interrupt use vector F, and enable it */
174 value = apic_read(APIC_SPIV);
175 value &= ~APIC_VECTOR_MASK;
176 value |= APIC_SPIV_APIC_ENABLED;
177 value |= 0xf;
178 apic_write(APIC_SPIV, value);
179
180 if (!virt_wire_setup) {
181 /* For LVT0 make it edge triggered, active high, external and enabled */
182 value = apic_read(APIC_LVT0);
183 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
184 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
185 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
186 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
187 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
188 apic_write(APIC_LVT0, value);
189 }
190 else {
191 /* Disable LVT0 */
192 apic_write(APIC_LVT0, APIC_LVT_MASKED);
193 }
194 158
195 /* For LVT1 make it edge triggered, active high, nmi and enabled */ 159 if (!virt_wire_setup) {
196 value = apic_read(APIC_LVT1); 160 /* For LVT0 make it edge triggered, active high, external and enabled */
197 value &= ~( 161 value = apic_read(APIC_LVT0);
198 APIC_MODE_MASK | APIC_SEND_PENDING | 162 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
199 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 163 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
200 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 164 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
201 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 165 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
202 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 166 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
203 apic_write(APIC_LVT1, value); 167 apic_write(APIC_LVT0, value);
168 } else {
169 /* Disable LVT0 */
170 apic_write(APIC_LVT0, APIC_LVT_MASKED);
204 } 171 }
172
173 /* For LVT1 make it edge triggered, active high, nmi and enabled */
174 value = apic_read(APIC_LVT1);
175 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
176 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
177 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
178 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
179 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
180 apic_write(APIC_LVT1, value);
205} 181}
206 182
207void disable_local_APIC(void) 183void disable_local_APIC(void)
@@ -297,8 +273,6 @@ void __init sync_Arb_IDs(void)
297 | APIC_DM_INIT); 273 | APIC_DM_INIT);
298} 274}
299 275
300extern void __error_in_apic_c (void);
301
302/* 276/*
303 * An initial setup of the virtual wire mode. 277 * An initial setup of the virtual wire mode.
304 */ 278 */
@@ -345,8 +319,7 @@ void __cpuinit setup_local_APIC (void)
345 319
346 value = apic_read(APIC_LVR); 320 value = apic_read(APIC_LVR);
347 321
348 if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) 322 BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
349 __error_in_apic_c();
350 323
351 /* 324 /*
352 * Double-check whether this APIC is really registered. 325 * Double-check whether this APIC is really registered.
@@ -399,32 +372,8 @@ void __cpuinit setup_local_APIC (void)
399 */ 372 */
400 value |= APIC_SPIV_APIC_ENABLED; 373 value |= APIC_SPIV_APIC_ENABLED;
401 374
402 /* 375 /* We always use processor focus */
403 * Some unknown Intel IO/APIC (or APIC) errata is biting us with 376
404 * certain networking cards. If high frequency interrupts are
405 * happening on a particular IOAPIC pin, plus the IOAPIC routing
406 * entry is masked/unmasked at a high rate as well then sooner or
407 * later IOAPIC line gets 'stuck', no more interrupts are received
408 * from the device. If focus CPU is disabled then the hang goes
409 * away, oh well :-(
410 *
411 * [ This bug can be reproduced easily with a level-triggered
412 * PCI Ne2000 networking cards and PII/PIII processors, dual
413 * BX chipset. ]
414 */
415 /*
416 * Actually disabling the focus CPU check just makes the hang less
417 * frequent as it makes the interrupt distributon model be more
418 * like LRU than MRU (the short-term load is more even across CPUs).
419 * See also the comment in end_level_ioapic_irq(). --macro
420 */
421#if 1
422 /* Enable focus processor (bit==0) */
423 value &= ~APIC_SPIV_FOCUS_DISABLED;
424#else
425 /* Disable focus processor (bit==1) */
426 value |= APIC_SPIV_FOCUS_DISABLED;
427#endif
428 /* 377 /*
429 * Set spurious IRQ vector 378 * Set spurious IRQ vector
430 */ 379 */
@@ -442,7 +391,7 @@ void __cpuinit setup_local_APIC (void)
442 * TODO: set up through-local-APIC from through-I/O-APIC? --macro 391 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
443 */ 392 */
444 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; 393 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
445 if (!smp_processor_id() && (pic_mode || !value)) { 394 if (!smp_processor_id() && !value) {
446 value = APIC_DM_EXTINT; 395 value = APIC_DM_EXTINT;
447 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id()); 396 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id());
448 } else { 397 } else {
@@ -479,8 +428,7 @@ void __cpuinit setup_local_APIC (void)
479 } 428 }
480 429
481 nmi_watchdog_default(); 430 nmi_watchdog_default();
482 if (nmi_watchdog == NMI_LOCAL_APIC) 431 setup_apic_nmi_watchdog(NULL);
483 setup_apic_nmi_watchdog();
484 apic_pm_activate(); 432 apic_pm_activate();
485} 433}
486 434
@@ -527,8 +475,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
527 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 475 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
528 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 476 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
529 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 477 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
530 local_save_flags(flags); 478 local_irq_save(flags);
531 local_irq_disable();
532 disable_local_APIC(); 479 disable_local_APIC();
533 local_irq_restore(flags); 480 local_irq_restore(flags);
534 return 0; 481 return 0;
@@ -606,18 +553,24 @@ static void apic_pm_activate(void) { }
606 553
607static int __init apic_set_verbosity(char *str) 554static int __init apic_set_verbosity(char *str)
608{ 555{
556 if (str == NULL) {
557 skip_ioapic_setup = 0;
558 ioapic_force = 1;
559 return 0;
560 }
609 if (strcmp("debug", str) == 0) 561 if (strcmp("debug", str) == 0)
610 apic_verbosity = APIC_DEBUG; 562 apic_verbosity = APIC_DEBUG;
611 else if (strcmp("verbose", str) == 0) 563 else if (strcmp("verbose", str) == 0)
612 apic_verbosity = APIC_VERBOSE; 564 apic_verbosity = APIC_VERBOSE;
613 else 565 else {
614 printk(KERN_WARNING "APIC Verbosity level %s not recognised" 566 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
615 " use apic=verbose or apic=debug", str); 567 " use apic=verbose or apic=debug\n", str);
568 return -EINVAL;
569 }
616 570
617 return 1; 571 return 0;
618} 572}
619 573early_param("apic", apic_set_verbosity);
620__setup("apic=", apic_set_verbosity);
621 574
622/* 575/*
623 * Detect and enable local APICs on non-SMP boards. 576 * Detect and enable local APICs on non-SMP boards.
@@ -638,6 +591,40 @@ static int __init detect_init_APIC (void)
638 return 0; 591 return 0;
639} 592}
640 593
594#ifdef CONFIG_X86_IO_APIC
595static struct resource * __init ioapic_setup_resources(void)
596{
597#define IOAPIC_RESOURCE_NAME_SIZE 11
598 unsigned long n;
599 struct resource *res;
600 char *mem;
601 int i;
602
603 if (nr_ioapics <= 0)
604 return NULL;
605
606 n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
607 n *= nr_ioapics;
608
609 res = alloc_bootmem(n);
610
611 if (!res)
612 return NULL;
613
614 memset(res, 0, n);
615 mem = (void *)&res[nr_ioapics];
616
617 for (i = 0; i < nr_ioapics; i++) {
618 res[i].name = mem;
619 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
620 snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
621 mem += IOAPIC_RESOURCE_NAME_SIZE;
622 }
623
624 return res;
625}
626#endif
627
641void __init init_apic_mappings(void) 628void __init init_apic_mappings(void)
642{ 629{
643 unsigned long apic_phys; 630 unsigned long apic_phys;
@@ -654,19 +641,26 @@ void __init init_apic_mappings(void)
654 apic_phys = mp_lapic_addr; 641 apic_phys = mp_lapic_addr;
655 642
656 set_fixmap_nocache(FIX_APIC_BASE, apic_phys); 643 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
644 apic_mapped = 1;
657 apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); 645 apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
658 646
647 /* Put local APIC into the resource map. */
648 lapic_resource.start = apic_phys;
649 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
650 insert_resource(&iomem_resource, &lapic_resource);
651
659 /* 652 /*
660 * Fetch the APIC ID of the BSP in case we have a 653 * Fetch the APIC ID of the BSP in case we have a
661 * default configuration (or the MP table is broken). 654 * default configuration (or the MP table is broken).
662 */ 655 */
663 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); 656 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
664 657
665#ifdef CONFIG_X86_IO_APIC
666 { 658 {
667 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; 659 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
668 int i; 660 int i;
661 struct resource *ioapic_res;
669 662
663 ioapic_res = ioapic_setup_resources();
670 for (i = 0; i < nr_ioapics; i++) { 664 for (i = 0; i < nr_ioapics; i++) {
671 if (smp_found_config) { 665 if (smp_found_config) {
672 ioapic_phys = mp_ioapics[i].mpc_apicaddr; 666 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
@@ -678,9 +672,15 @@ void __init init_apic_mappings(void)
678 apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n", 672 apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
679 __fix_to_virt(idx), ioapic_phys); 673 __fix_to_virt(idx), ioapic_phys);
680 idx++; 674 idx++;
675
676 if (ioapic_res) {
677 ioapic_res->start = ioapic_phys;
678 ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
679 insert_resource(&iomem_resource, ioapic_res);
680 ioapic_res++;
681 }
681 } 682 }
682 } 683 }
683#endif
684} 684}
685 685
686/* 686/*
@@ -951,7 +951,7 @@ void smp_local_timer_interrupt(struct pt_regs *regs)
951 * We take the 'long' return path, and there every subsystem 951 * We take the 'long' return path, and there every subsystem
952 * grabs the appropriate locks (kernel lock/ irq lock). 952 * grabs the appropriate locks (kernel lock/ irq lock).
953 * 953 *
954 * we might want to decouple profiling from the 'long path', 954 * We might want to decouple profiling from the 'long path',
955 * and do the profiling totally in assembly. 955 * and do the profiling totally in assembly.
956 * 956 *
957 * Currently this isn't too much of an issue (performance wise), 957 * Currently this isn't too much of an issue (performance wise),
@@ -1123,19 +1123,15 @@ int __init APIC_init_uniprocessor (void)
1123 1123
1124 verify_local_APIC(); 1124 verify_local_APIC();
1125 1125
1126 connect_bsp_APIC();
1127
1128 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); 1126 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
1129 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id)); 1127 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
1130 1128
1131 setup_local_APIC(); 1129 setup_local_APIC();
1132 1130
1133#ifdef CONFIG_X86_IO_APIC
1134 if (smp_found_config && !skip_ioapic_setup && nr_ioapics) 1131 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1135 setup_IO_APIC(); 1132 setup_IO_APIC();
1136 else 1133 else
1137 nr_ioapics = 0; 1134 nr_ioapics = 0;
1138#endif
1139 setup_boot_APIC_clock(); 1135 setup_boot_APIC_clock();
1140 check_nmi_watchdog(); 1136 check_nmi_watchdog();
1141 return 0; 1137 return 0;
@@ -1144,14 +1140,17 @@ int __init APIC_init_uniprocessor (void)
1144static __init int setup_disableapic(char *str) 1140static __init int setup_disableapic(char *str)
1145{ 1141{
1146 disable_apic = 1; 1142 disable_apic = 1;
1147 return 1; 1143 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1148} 1144 return 0;
1145}
1146early_param("disableapic", setup_disableapic);
1149 1147
1148/* same as disableapic, for compatibility */
1150static __init int setup_nolapic(char *str) 1149static __init int setup_nolapic(char *str)
1151{ 1150{
1152 disable_apic = 1; 1151 return setup_disableapic(str);
1153 return 1;
1154} 1152}
1153early_param("nolapic", setup_nolapic);
1155 1154
1156static __init int setup_noapictimer(char *str) 1155static __init int setup_noapictimer(char *str)
1157{ 1156{
@@ -1184,11 +1183,5 @@ static __init int setup_apicpmtimer(char *s)
1184} 1183}
1185__setup("apicpmtimer", setup_apicpmtimer); 1184__setup("apicpmtimer", setup_apicpmtimer);
1186 1185
1187/* dummy parsing: see setup.c */
1188
1189__setup("disableapic", setup_disableapic);
1190__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */
1191
1192__setup("noapictimer", setup_noapictimer); 1186__setup("noapictimer", setup_noapictimer);
1193 1187
1194/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
index d8d5750d6106..3525f884af82 100644
--- a/arch/x86_64/kernel/crash.c
+++ b/arch/x86_64/kernel/crash.c
@@ -23,6 +23,7 @@
23#include <asm/nmi.h> 23#include <asm/nmi.h>
24#include <asm/hw_irq.h> 24#include <asm/hw_irq.h>
25#include <asm/mach_apic.h> 25#include <asm/mach_apic.h>
26#include <asm/kdebug.h>
26 27
27/* This keeps a track of which one is crashing cpu. */ 28/* This keeps a track of which one is crashing cpu. */
28static int crashing_cpu; 29static int crashing_cpu;
@@ -68,7 +69,7 @@ static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
68 * for the data I pass, and I need tags 69 * for the data I pass, and I need tags
69 * on the data to indicate what information I have 70 * on the data to indicate what information I have
70 * squirrelled away. ELF notes happen to provide 71 * squirrelled away. ELF notes happen to provide
71 * all of that that no need to invent something new. 72 * all of that, no need to invent something new.
72 */ 73 */
73 74
74 buf = (u32*)per_cpu_ptr(crash_notes, cpu); 75 buf = (u32*)per_cpu_ptr(crash_notes, cpu);
@@ -95,15 +96,25 @@ static void crash_save_self(struct pt_regs *regs)
95#ifdef CONFIG_SMP 96#ifdef CONFIG_SMP
96static atomic_t waiting_for_crash_ipi; 97static atomic_t waiting_for_crash_ipi;
97 98
98static int crash_nmi_callback(struct pt_regs *regs, int cpu) 99static int crash_nmi_callback(struct notifier_block *self,
100 unsigned long val, void *data)
99{ 101{
102 struct pt_regs *regs;
103 int cpu;
104
105 if (val != DIE_NMI_IPI)
106 return NOTIFY_OK;
107
108 regs = ((struct die_args *)data)->regs;
109 cpu = raw_smp_processor_id();
110
100 /* 111 /*
101 * Don't do anything if this handler is invoked on crashing cpu. 112 * Don't do anything if this handler is invoked on crashing cpu.
102 * Otherwise, system will completely hang. Crashing cpu can get 113 * Otherwise, system will completely hang. Crashing cpu can get
103 * an NMI if system was initially booted with nmi_watchdog parameter. 114 * an NMI if system was initially booted with nmi_watchdog parameter.
104 */ 115 */
105 if (cpu == crashing_cpu) 116 if (cpu == crashing_cpu)
106 return 1; 117 return NOTIFY_STOP;
107 local_irq_disable(); 118 local_irq_disable();
108 119
109 crash_save_this_cpu(regs, cpu); 120 crash_save_this_cpu(regs, cpu);
@@ -127,12 +138,17 @@ static void smp_send_nmi_allbutself(void)
127 * cpu hotplug shouldn't matter. 138 * cpu hotplug shouldn't matter.
128 */ 139 */
129 140
141static struct notifier_block crash_nmi_nb = {
142 .notifier_call = crash_nmi_callback,
143};
144
130static void nmi_shootdown_cpus(void) 145static void nmi_shootdown_cpus(void)
131{ 146{
132 unsigned long msecs; 147 unsigned long msecs;
133 148
134 atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); 149 atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
135 set_nmi_callback(crash_nmi_callback); 150 if (register_die_notifier(&crash_nmi_nb))
151 return; /* return what? */
136 152
137 /* 153 /*
138 * Ensure the new callback function is set before sending 154 * Ensure the new callback function is set before sending
@@ -178,9 +194,7 @@ void machine_crash_shutdown(struct pt_regs *regs)
178 if(cpu_has_apic) 194 if(cpu_has_apic)
179 disable_local_APIC(); 195 disable_local_APIC();
180 196
181#if defined(CONFIG_X86_IO_APIC)
182 disable_IO_APIC(); 197 disable_IO_APIC();
183#endif
184 198
185 crash_save_self(regs); 199 crash_save_self(regs);
186} 200}
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 708a3cd9a27e..b3f0908668ec 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -25,6 +25,8 @@
25#include <asm/bootsetup.h> 25#include <asm/bootsetup.h>
26#include <asm/sections.h> 26#include <asm/sections.h>
27 27
28struct e820map e820 __initdata;
29
28/* 30/*
29 * PFN of last memory page. 31 * PFN of last memory page.
30 */ 32 */
@@ -41,7 +43,7 @@ unsigned long end_pfn_map;
41/* 43/*
42 * Last pfn which the user wants to use. 44 * Last pfn which the user wants to use.
43 */ 45 */
44unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT; 46static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
45 47
46extern struct resource code_resource, data_resource; 48extern struct resource code_resource, data_resource;
47 49
@@ -70,12 +72,7 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
70 return 1; 72 return 1;
71 } 73 }
72#endif 74#endif
73 /* kernel code + 640k memory hole (later should not be needed, but 75 /* kernel code */
74 be paranoid for now) */
75 if (last >= 640*1024 && addr < 1024*1024) {
76 *addrp = 1024*1024;
77 return 1;
78 }
79 if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) { 76 if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) {
80 *addrp = __pa_symbol(&_end); 77 *addrp = __pa_symbol(&_end);
81 return 1; 78 return 1;
@@ -165,59 +162,14 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsi
165 return -1UL; 162 return -1UL;
166} 163}
167 164
168/*
169 * Free bootmem based on the e820 table for a node.
170 */
171void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
172{
173 int i;
174 for (i = 0; i < e820.nr_map; i++) {
175 struct e820entry *ei = &e820.map[i];
176 unsigned long last, addr;
177
178 if (ei->type != E820_RAM ||
179 ei->addr+ei->size <= start ||
180 ei->addr >= end)
181 continue;
182
183 addr = round_up(ei->addr, PAGE_SIZE);
184 if (addr < start)
185 addr = start;
186
187 last = round_down(ei->addr + ei->size, PAGE_SIZE);
188 if (last >= end)
189 last = end;
190
191 if (last > addr && last-addr >= PAGE_SIZE)
192 free_bootmem_node(pgdat, addr, last-addr);
193 }
194}
195
196/* 165/*
197 * Find the highest page frame number we have available 166 * Find the highest page frame number we have available
198 */ 167 */
199unsigned long __init e820_end_of_ram(void) 168unsigned long __init e820_end_of_ram(void)
200{ 169{
201 int i;
202 unsigned long end_pfn = 0; 170 unsigned long end_pfn = 0;
171 end_pfn = find_max_pfn_with_active_regions();
203 172
204 for (i = 0; i < e820.nr_map; i++) {
205 struct e820entry *ei = &e820.map[i];
206 unsigned long start, end;
207
208 start = round_up(ei->addr, PAGE_SIZE);
209 end = round_down(ei->addr + ei->size, PAGE_SIZE);
210 if (start >= end)
211 continue;
212 if (ei->type == E820_RAM) {
213 if (end > end_pfn<<PAGE_SHIFT)
214 end_pfn = end>>PAGE_SHIFT;
215 } else {
216 if (end > end_pfn_map<<PAGE_SHIFT)
217 end_pfn_map = end>>PAGE_SHIFT;
218 }
219 }
220
221 if (end_pfn > end_pfn_map) 173 if (end_pfn > end_pfn_map)
222 end_pfn_map = end_pfn; 174 end_pfn_map = end_pfn;
223 if (end_pfn_map > MAXMEM>>PAGE_SHIFT) 175 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
@@ -227,43 +179,10 @@ unsigned long __init e820_end_of_ram(void)
227 if (end_pfn > end_pfn_map) 179 if (end_pfn > end_pfn_map)
228 end_pfn = end_pfn_map; 180 end_pfn = end_pfn_map;
229 181
182 printk("end_pfn_map = %lu\n", end_pfn_map);
230 return end_pfn; 183 return end_pfn;
231} 184}
232 185
233/*
234 * Compute how much memory is missing in a range.
235 * Unlike the other functions in this file the arguments are in page numbers.
236 */
237unsigned long __init
238e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
239{
240 unsigned long ram = 0;
241 unsigned long start = start_pfn << PAGE_SHIFT;
242 unsigned long end = end_pfn << PAGE_SHIFT;
243 int i;
244 for (i = 0; i < e820.nr_map; i++) {
245 struct e820entry *ei = &e820.map[i];
246 unsigned long last, addr;
247
248 if (ei->type != E820_RAM ||
249 ei->addr+ei->size <= start ||
250 ei->addr >= end)
251 continue;
252
253 addr = round_up(ei->addr, PAGE_SIZE);
254 if (addr < start)
255 addr = start;
256
257 last = round_down(ei->addr + ei->size, PAGE_SIZE);
258 if (last >= end)
259 last = end;
260
261 if (last > addr)
262 ram += last - addr;
263 }
264 return ((end - start) - ram) >> PAGE_SHIFT;
265}
266
267/* 186/*
268 * Mark e820 reserved areas as busy for the resource manager. 187 * Mark e820 reserved areas as busy for the resource manager.
269 */ 188 */
@@ -345,6 +264,49 @@ void __init e820_mark_nosave_regions(void)
345 } 264 }
346} 265}
347 266
267/* Walk the e820 map and register active regions within a node */
268void __init
269e820_register_active_regions(int nid, unsigned long start_pfn,
270 unsigned long end_pfn)
271{
272 int i;
273 unsigned long ei_startpfn, ei_endpfn;
274 for (i = 0; i < e820.nr_map; i++) {
275 struct e820entry *ei = &e820.map[i];
276 ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
277 ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE)
278 >> PAGE_SHIFT;
279
280 /* Skip map entries smaller than a page */
281 if (ei_startpfn > ei_endpfn)
282 continue;
283
284 /* Check if end_pfn_map should be updated */
285 if (ei->type != E820_RAM && ei_endpfn > end_pfn_map)
286 end_pfn_map = ei_endpfn;
287
288 /* Skip if map is outside the node */
289 if (ei->type != E820_RAM ||
290 ei_endpfn <= start_pfn ||
291 ei_startpfn >= end_pfn)
292 continue;
293
294 /* Check for overlaps */
295 if (ei_startpfn < start_pfn)
296 ei_startpfn = start_pfn;
297 if (ei_endpfn > end_pfn)
298 ei_endpfn = end_pfn;
299
300 /* Obey end_user_pfn to save on memmap */
301 if (ei_startpfn >= end_user_pfn)
302 continue;
303 if (ei_endpfn > end_user_pfn)
304 ei_endpfn = end_user_pfn;
305
306 add_active_range(nid, ei_startpfn, ei_endpfn);
307 }
308}
309
348/* 310/*
349 * Add a memory region to the kernel e820 map. 311 * Add a memory region to the kernel e820 map.
350 */ 312 */
@@ -565,13 +527,6 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
565 * If we're lucky and live on a modern system, the setup code 527 * If we're lucky and live on a modern system, the setup code
566 * will have given us a memory map that we can use to properly 528 * will have given us a memory map that we can use to properly
567 * set up memory. If we aren't, we'll fake a memory map. 529 * set up memory. If we aren't, we'll fake a memory map.
568 *
569 * We check to see that the memory map contains at least 2 elements
570 * before we'll use it, because the detection code in setup.S may
571 * not be perfect and most every PC known to man has two memory
572 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
573 * thinkpad 560x, for example, does not cooperate with the memory
574 * detection code.)
575 */ 530 */
576static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) 531static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
577{ 532{
@@ -589,34 +544,19 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
589 if (start > end) 544 if (start > end)
590 return -1; 545 return -1;
591 546
592 /*
593 * Some BIOSes claim RAM in the 640k - 1M region.
594 * Not right. Fix it up.
595 *
596 * This should be removed on Hammer which is supposed to not
597 * have non e820 covered ISA mappings there, but I had some strange
598 * problems so it stays for now. -AK
599 */
600 if (type == E820_RAM) {
601 if (start < 0x100000ULL && end > 0xA0000ULL) {
602 if (start < 0xA0000ULL)
603 add_memory_region(start, 0xA0000ULL-start, type);
604 if (end <= 0x100000ULL)
605 continue;
606 start = 0x100000ULL;
607 size = end - start;
608 }
609 }
610
611 add_memory_region(start, size, type); 547 add_memory_region(start, size, type);
612 } while (biosmap++,--nr_map); 548 } while (biosmap++,--nr_map);
613 return 0; 549 return 0;
614} 550}
615 551
616void __init setup_memory_region(void) 552void early_panic(char *msg)
617{ 553{
618 char *who = "BIOS-e820"; 554 early_printk(msg);
555 panic(msg);
556}
619 557
558void __init setup_memory_region(void)
559{
620 /* 560 /*
621 * Try to copy the BIOS-supplied E820-map. 561 * Try to copy the BIOS-supplied E820-map.
622 * 562 *
@@ -624,51 +564,70 @@ void __init setup_memory_region(void)
624 * the next section from 1mb->appropriate_mem_k 564 * the next section from 1mb->appropriate_mem_k
625 */ 565 */
626 sanitize_e820_map(E820_MAP, &E820_MAP_NR); 566 sanitize_e820_map(E820_MAP, &E820_MAP_NR);
627 if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { 567 if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0)
628 unsigned long mem_size; 568 early_panic("Cannot find a valid memory map");
629
630 /* compare results from other methods and take the greater */
631 if (ALT_MEM_K < EXT_MEM_K) {
632 mem_size = EXT_MEM_K;
633 who = "BIOS-88";
634 } else {
635 mem_size = ALT_MEM_K;
636 who = "BIOS-e801";
637 }
638
639 e820.nr_map = 0;
640 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
641 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
642 }
643 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 569 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
644 e820_print_map(who); 570 e820_print_map("BIOS-e820");
645} 571}
646 572
647void __init parse_memopt(char *p, char **from) 573static int __init parse_memopt(char *p)
648{ 574{
649 end_user_pfn = memparse(p, from); 575 if (!p)
576 return -EINVAL;
577 end_user_pfn = memparse(p, &p);
650 end_user_pfn >>= PAGE_SHIFT; 578 end_user_pfn >>= PAGE_SHIFT;
579 return 0;
651} 580}
581early_param("mem", parse_memopt);
652 582
653void __init parse_memmapopt(char *p, char **from) 583static int userdef __initdata;
584
585static int __init parse_memmap_opt(char *p)
654{ 586{
587 char *oldp;
655 unsigned long long start_at, mem_size; 588 unsigned long long start_at, mem_size;
656 589
657 mem_size = memparse(p, from); 590 if (!strcmp(p, "exactmap")) {
658 p = *from; 591#ifdef CONFIG_CRASH_DUMP
592 /* If we are doing a crash dump, we
593 * still need to know the real mem
594 * size before original memory map is
595 * reset.
596 */
597 saved_max_pfn = e820_end_of_ram();
598#endif
599 end_pfn_map = 0;
600 e820.nr_map = 0;
601 userdef = 1;
602 return 0;
603 }
604
605 oldp = p;
606 mem_size = memparse(p, &p);
607 if (p == oldp)
608 return -EINVAL;
659 if (*p == '@') { 609 if (*p == '@') {
660 start_at = memparse(p+1, from); 610 start_at = memparse(p+1, &p);
661 add_memory_region(start_at, mem_size, E820_RAM); 611 add_memory_region(start_at, mem_size, E820_RAM);
662 } else if (*p == '#') { 612 } else if (*p == '#') {
663 start_at = memparse(p+1, from); 613 start_at = memparse(p+1, &p);
664 add_memory_region(start_at, mem_size, E820_ACPI); 614 add_memory_region(start_at, mem_size, E820_ACPI);
665 } else if (*p == '$') { 615 } else if (*p == '$') {
666 start_at = memparse(p+1, from); 616 start_at = memparse(p+1, &p);
667 add_memory_region(start_at, mem_size, E820_RESERVED); 617 add_memory_region(start_at, mem_size, E820_RESERVED);
668 } else { 618 } else {
669 end_user_pfn = (mem_size >> PAGE_SHIFT); 619 end_user_pfn = (mem_size >> PAGE_SHIFT);
670 } 620 }
671 p = *from; 621 return *p == '\0' ? 0 : -EINVAL;
622}
623early_param("memmap", parse_memmap_opt);
624
625void finish_e820_parsing(void)
626{
627 if (userdef) {
628 printk(KERN_INFO "user-defined physical RAM map:\n");
629 e820_print_map("user");
630 }
672} 631}
673 632
674unsigned long pci_mem_start = 0xaeedbabe; 633unsigned long pci_mem_start = 0xaeedbabe;
diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c
new file mode 100644
index 000000000000..208e38a372c1
--- /dev/null
+++ b/arch/x86_64/kernel/early-quirks.c
@@ -0,0 +1,122 @@
1/* Various workarounds for chipset bugs.
2 This code runs very early and can't use the regular PCI subsystem
3 The entries are keyed to PCI bridges which usually identify chipsets
4 uniquely.
5 This is only for whole classes of chipsets with specific problems which
6 need early invasive action (e.g. before the timers are initialized).
7 Most PCI device specific workarounds can be done later and should be
8 in standard PCI quirks
9 Mainboard specific bugs should be handled by DMI entries.
10 CPU specific bugs in setup.c */
11
12#include <linux/pci.h>
13#include <linux/acpi.h>
14#include <linux/pci_ids.h>
15#include <asm/pci-direct.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18
19static void via_bugs(void)
20{
21#ifdef CONFIG_IOMMU
22 if ((end_pfn > MAX_DMA32_PFN || force_iommu) &&
23 !iommu_aperture_allowed) {
24 printk(KERN_INFO
25 "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n");
26 iommu_aperture_disabled = 1;
27 }
28#endif
29}
30
31#ifdef CONFIG_ACPI
32
33static int nvidia_hpet_detected __initdata;
34
35static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
36{
37 nvidia_hpet_detected = 1;
38 return 0;
39}
40#endif
41
42static void nvidia_bugs(void)
43{
44#ifdef CONFIG_ACPI
45 /*
46 * All timer overrides on Nvidia are
47 * wrong unless HPET is enabled.
48 */
49 nvidia_hpet_detected = 0;
50 acpi_table_parse(ACPI_HPET, nvidia_hpet_check);
51 if (nvidia_hpet_detected == 0) {
52 acpi_skip_timer_override = 1;
53 printk(KERN_INFO "Nvidia board "
54 "detected. Ignoring ACPI "
55 "timer override.\n");
56 }
57#endif
58 /* RED-PEN skip them on mptables too? */
59
60}
61
62static void ati_bugs(void)
63{
64#if 1 /* for testing */
65 printk("ATI board detected\n");
66#endif
67 /* No bugs right now */
68}
69
70struct chipset {
71 u16 vendor;
72 void (*f)(void);
73};
74
75static struct chipset early_qrk[] = {
76 { PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
77 { PCI_VENDOR_ID_VIA, via_bugs },
78 { PCI_VENDOR_ID_ATI, ati_bugs },
79 {}
80};
81
82void __init early_quirks(void)
83{
84 int num, slot, func;
85
86 if (!early_pci_allowed())
87 return;
88
89 /* Poor man's PCI discovery */
90 for (num = 0; num < 32; num++) {
91 for (slot = 0; slot < 32; slot++) {
92 for (func = 0; func < 8; func++) {
93 u32 class;
94 u32 vendor;
95 u8 type;
96 int i;
97 class = read_pci_config(num,slot,func,
98 PCI_CLASS_REVISION);
99 if (class == 0xffffffff)
100 break;
101
102 if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
103 continue;
104
105 vendor = read_pci_config(num, slot, func,
106 PCI_VENDOR_ID);
107 vendor &= 0xffff;
108
109 for (i = 0; early_qrk[i].f; i++)
110 if (early_qrk[i].vendor == vendor) {
111 early_qrk[i].f();
112 return;
113 }
114
115 type = read_pci_config_byte(num, slot, func,
116 PCI_HEADER_TYPE);
117 if (!(type & 0x80))
118 break;
119 }
120 }
121 }
122}
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index 140051e07fa6..e22ecd54870d 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -215,20 +215,16 @@ void early_printk(const char *fmt, ...)
215 215
216static int __initdata keep_early; 216static int __initdata keep_early;
217 217
218int __init setup_early_printk(char *opt) 218static int __init setup_early_printk(char *buf)
219{ 219{
220 char *space; 220 if (!buf)
221 char buf[256]; 221 return 0;
222 222
223 if (early_console_initialized) 223 if (early_console_initialized)
224 return 1; 224 return 0;
225 225 early_console_initialized = 1;
226 strlcpy(buf,opt,sizeof(buf));
227 space = strchr(buf, ' ');
228 if (space)
229 *space = 0;
230 226
231 if (strstr(buf,"keep")) 227 if (!strcmp(buf,"keep"))
232 keep_early = 1; 228 keep_early = 1;
233 229
234 if (!strncmp(buf, "serial", 6)) { 230 if (!strncmp(buf, "serial", 6)) {
@@ -248,11 +244,12 @@ int __init setup_early_printk(char *opt)
248 early_console = &simnow_console; 244 early_console = &simnow_console;
249 keep_early = 1; 245 keep_early = 1;
250 } 246 }
251 early_console_initialized = 1;
252 register_console(early_console); 247 register_console(early_console);
253 return 0; 248 return 0;
254} 249}
255 250
251early_param("earlyprintk", setup_early_printk);
252
256void __init disable_early_printk(void) 253void __init disable_early_printk(void)
257{ 254{
258 if (!early_console_initialized || !early_console) 255 if (!early_console_initialized || !early_console)
@@ -266,4 +263,3 @@ void __init disable_early_printk(void)
266 } 263 }
267} 264}
268 265
269__setup("earlyprintk=", setup_early_printk);
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index aa8d8939abc1..2802524104f3 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -4,8 +4,6 @@
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs 5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
7 *
8 * $Id$
9 */ 7 */
10 8
11/* 9/*
@@ -22,15 +20,25 @@
22 * at the top of the kernel process stack. 20 * at the top of the kernel process stack.
23 * - partial stack frame: partially saved registers upto R11. 21 * - partial stack frame: partially saved registers upto R11.
24 * - full stack frame: Like partial stack frame, but all register saved. 22 * - full stack frame: Like partial stack frame, but all register saved.
25 * 23 *
26 * TODO: 24 * Some macro usage:
27 * - schedule it carefully for the final hardware. 25 * - CFI macros are used to generate dwarf2 unwind information for better
26 * backtraces. They don't change any code.
27 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
28 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
29 * There are unfortunately lots of special cases where some registers
30 * not touched. The macro is a big mess that should be cleaned up.
31 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
32 * Gives a full stack frame.
33 * - ENTRY/END Define functions in the symbol table.
34 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
35 * frame that is otherwise undefined after a SYSCALL
36 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
37 * - errorentry/paranoidentry/zeroentry - Define exception entry points.
28 */ 38 */
29 39
30#define ASSEMBLY 1
31#include <linux/linkage.h> 40#include <linux/linkage.h>
32#include <asm/segment.h> 41#include <asm/segment.h>
33#include <asm/smp.h>
34#include <asm/cache.h> 42#include <asm/cache.h>
35#include <asm/errno.h> 43#include <asm/errno.h>
36#include <asm/dwarf2.h> 44#include <asm/dwarf2.h>
@@ -115,6 +123,7 @@
115 .macro CFI_DEFAULT_STACK start=1 123 .macro CFI_DEFAULT_STACK start=1
116 .if \start 124 .if \start
117 CFI_STARTPROC simple 125 CFI_STARTPROC simple
126 CFI_SIGNAL_FRAME
118 CFI_DEF_CFA rsp,SS+8 127 CFI_DEF_CFA rsp,SS+8
119 .else 128 .else
120 CFI_DEF_CFA_OFFSET SS+8 129 CFI_DEF_CFA_OFFSET SS+8
@@ -146,6 +155,10 @@
146/* rdi: prev */ 155/* rdi: prev */
147ENTRY(ret_from_fork) 156ENTRY(ret_from_fork)
148 CFI_DEFAULT_STACK 157 CFI_DEFAULT_STACK
158 push kernel_eflags(%rip)
159 CFI_ADJUST_CFA_OFFSET 4
160 popf # reset kernel eflags
161 CFI_ADJUST_CFA_OFFSET -4
149 call schedule_tail 162 call schedule_tail
150 GET_THREAD_INFO(%rcx) 163 GET_THREAD_INFO(%rcx)
151 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) 164 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
@@ -199,6 +212,7 @@ END(ret_from_fork)
199 212
200ENTRY(system_call) 213ENTRY(system_call)
201 CFI_STARTPROC simple 214 CFI_STARTPROC simple
215 CFI_SIGNAL_FRAME
202 CFI_DEF_CFA rsp,PDA_STACKOFFSET 216 CFI_DEF_CFA rsp,PDA_STACKOFFSET
203 CFI_REGISTER rip,rcx 217 CFI_REGISTER rip,rcx
204 /*CFI_REGISTER rflags,r11*/ 218 /*CFI_REGISTER rflags,r11*/
@@ -316,6 +330,7 @@ END(system_call)
316 */ 330 */
317ENTRY(int_ret_from_sys_call) 331ENTRY(int_ret_from_sys_call)
318 CFI_STARTPROC simple 332 CFI_STARTPROC simple
333 CFI_SIGNAL_FRAME
319 CFI_DEF_CFA rsp,SS+8-ARGOFFSET 334 CFI_DEF_CFA rsp,SS+8-ARGOFFSET
320 /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/ 335 /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
321 CFI_REL_OFFSET rsp,RSP-ARGOFFSET 336 CFI_REL_OFFSET rsp,RSP-ARGOFFSET
@@ -476,6 +491,7 @@ END(stub_rt_sigreturn)
476 */ 491 */
477 .macro _frame ref 492 .macro _frame ref
478 CFI_STARTPROC simple 493 CFI_STARTPROC simple
494 CFI_SIGNAL_FRAME
479 CFI_DEF_CFA rsp,SS+8-\ref 495 CFI_DEF_CFA rsp,SS+8-\ref
480 /*CFI_REL_OFFSET ss,SS-\ref*/ 496 /*CFI_REL_OFFSET ss,SS-\ref*/
481 CFI_REL_OFFSET rsp,RSP-\ref 497 CFI_REL_OFFSET rsp,RSP-\ref
@@ -511,7 +527,12 @@ END(stub_rt_sigreturn)
511 testl $3,CS(%rdi) 527 testl $3,CS(%rdi)
512 je 1f 528 je 1f
513 swapgs 529 swapgs
5141: incl %gs:pda_irqcount # RED-PEN should check preempt count 530 /* irqcount is used to check if a CPU is already on an interrupt
531 stack or not. While this is essentially redundant with preempt_count
532 it is a little cheaper to use a separate counter in the PDA
533 (short of moving irq_enter into assembly, which would be too
534 much work) */
5351: incl %gs:pda_irqcount
515 cmoveq %gs:pda_irqstackptr,%rsp 536 cmoveq %gs:pda_irqstackptr,%rsp
516 push %rbp # backlink for old unwinder 537 push %rbp # backlink for old unwinder
517 /* 538 /*
@@ -619,8 +640,7 @@ retint_signal:
619#ifdef CONFIG_PREEMPT 640#ifdef CONFIG_PREEMPT
620 /* Returning to kernel space. Check if we need preemption */ 641 /* Returning to kernel space. Check if we need preemption */
621 /* rcx: threadinfo. interrupts off. */ 642 /* rcx: threadinfo. interrupts off. */
622 .p2align 643ENTRY(retint_kernel)
623retint_kernel:
624 cmpl $0,threadinfo_preempt_count(%rcx) 644 cmpl $0,threadinfo_preempt_count(%rcx)
625 jnz retint_restore_args 645 jnz retint_restore_args
626 bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) 646 bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
@@ -679,7 +699,6 @@ ENTRY(call_function_interrupt)
679END(call_function_interrupt) 699END(call_function_interrupt)
680#endif 700#endif
681 701
682#ifdef CONFIG_X86_LOCAL_APIC
683ENTRY(apic_timer_interrupt) 702ENTRY(apic_timer_interrupt)
684 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt 703 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
685END(apic_timer_interrupt) 704END(apic_timer_interrupt)
@@ -691,7 +710,6 @@ END(error_interrupt)
691ENTRY(spurious_interrupt) 710ENTRY(spurious_interrupt)
692 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt 711 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
693END(spurious_interrupt) 712END(spurious_interrupt)
694#endif
695 713
696/* 714/*
697 * Exception entry points. 715 * Exception entry points.
@@ -768,7 +786,9 @@ paranoid_exit\trace:
768 testl $3,CS(%rsp) 786 testl $3,CS(%rsp)
769 jnz paranoid_userspace\trace 787 jnz paranoid_userspace\trace
770paranoid_swapgs\trace: 788paranoid_swapgs\trace:
789 .if \trace
771 TRACE_IRQS_IRETQ 0 790 TRACE_IRQS_IRETQ 0
791 .endif
772 swapgs 792 swapgs
773paranoid_restore\trace: 793paranoid_restore\trace:
774 RESTORE_ALL 8 794 RESTORE_ALL 8
@@ -814,7 +834,7 @@ paranoid_schedule\trace:
814 * Exception entry point. This expects an error code/orig_rax on the stack 834 * Exception entry point. This expects an error code/orig_rax on the stack
815 * and the exception handler in %rax. 835 * and the exception handler in %rax.
816 */ 836 */
817ENTRY(error_entry) 837KPROBE_ENTRY(error_entry)
818 _frame RDI 838 _frame RDI
819 /* rdi slot contains rax, oldrax contains error code */ 839 /* rdi slot contains rax, oldrax contains error code */
820 cld 840 cld
@@ -898,7 +918,7 @@ error_kernelspace:
898 cmpq $gs_change,RIP(%rsp) 918 cmpq $gs_change,RIP(%rsp)
899 je error_swapgs 919 je error_swapgs
900 jmp error_sti 920 jmp error_sti
901END(error_entry) 921KPROBE_END(error_entry)
902 922
903 /* Reload gs selector with exception handling */ 923 /* Reload gs selector with exception handling */
904 /* edi: new selector */ 924 /* edi: new selector */
@@ -1020,8 +1040,7 @@ ENDPROC(execve)
1020 1040
1021KPROBE_ENTRY(page_fault) 1041KPROBE_ENTRY(page_fault)
1022 errorentry do_page_fault 1042 errorentry do_page_fault
1023END(page_fault) 1043KPROBE_END(page_fault)
1024 .previous .text
1025 1044
1026ENTRY(coprocessor_error) 1045ENTRY(coprocessor_error)
1027 zeroentry do_coprocessor_error 1046 zeroentry do_coprocessor_error
@@ -1042,8 +1061,7 @@ KPROBE_ENTRY(debug)
1042 CFI_ADJUST_CFA_OFFSET 8 1061 CFI_ADJUST_CFA_OFFSET 8
1043 paranoidentry do_debug, DEBUG_STACK 1062 paranoidentry do_debug, DEBUG_STACK
1044 paranoidexit 1063 paranoidexit
1045END(debug) 1064KPROBE_END(debug)
1046 .previous .text
1047 1065
1048 /* runs on exception stack */ 1066 /* runs on exception stack */
1049KPROBE_ENTRY(nmi) 1067KPROBE_ENTRY(nmi)
@@ -1057,8 +1075,7 @@ KPROBE_ENTRY(nmi)
1057 jmp paranoid_exit1 1075 jmp paranoid_exit1
1058 CFI_ENDPROC 1076 CFI_ENDPROC
1059#endif 1077#endif
1060END(nmi) 1078KPROBE_END(nmi)
1061 .previous .text
1062 1079
1063KPROBE_ENTRY(int3) 1080KPROBE_ENTRY(int3)
1064 INTR_FRAME 1081 INTR_FRAME
@@ -1067,8 +1084,7 @@ KPROBE_ENTRY(int3)
1067 paranoidentry do_int3, DEBUG_STACK 1084 paranoidentry do_int3, DEBUG_STACK
1068 jmp paranoid_exit1 1085 jmp paranoid_exit1
1069 CFI_ENDPROC 1086 CFI_ENDPROC
1070END(int3) 1087KPROBE_END(int3)
1071 .previous .text
1072 1088
1073ENTRY(overflow) 1089ENTRY(overflow)
1074 zeroentry do_overflow 1090 zeroentry do_overflow
@@ -1116,8 +1132,7 @@ END(stack_segment)
1116 1132
1117KPROBE_ENTRY(general_protection) 1133KPROBE_ENTRY(general_protection)
1118 errorentry do_general_protection 1134 errorentry do_general_protection
1119END(general_protection) 1135KPROBE_END(general_protection)
1120 .previous .text
1121 1136
1122ENTRY(alignment_check) 1137ENTRY(alignment_check)
1123 errorentry do_alignment_check 1138 errorentry do_alignment_check
diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c
index 3020917546de..cdb90e671b88 100644
--- a/arch/x86_64/kernel/genapic_cluster.c
+++ b/arch/x86_64/kernel/genapic_cluster.c
@@ -118,7 +118,6 @@ struct genapic apic_cluster = {
118 .name = "clustered", 118 .name = "clustered",
119 .int_delivery_mode = dest_Fixed, 119 .int_delivery_mode = dest_Fixed,
120 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 120 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
121 .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED,
122 .target_cpus = cluster_target_cpus, 121 .target_cpus = cluster_target_cpus,
123 .apic_id_registered = cluster_apic_id_registered, 122 .apic_id_registered = cluster_apic_id_registered,
124 .init_apic_ldr = cluster_init_apic_ldr, 123 .init_apic_ldr = cluster_init_apic_ldr,
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index eb86d374813a..50ad153eaac4 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -49,8 +49,7 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
49 unsigned long cfg; 49 unsigned long cfg;
50 unsigned long flags; 50 unsigned long flags;
51 51
52 local_save_flags(flags); 52 local_irq_save(flags);
53 local_irq_disable();
54 53
55 /* 54 /*
56 * Wait for idle. 55 * Wait for idle.
@@ -121,7 +120,6 @@ struct genapic apic_flat = {
121 .name = "flat", 120 .name = "flat",
122 .int_delivery_mode = dest_LowestPrio, 121 .int_delivery_mode = dest_LowestPrio,
123 .int_dest_mode = (APIC_DEST_LOGICAL != 0), 122 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
124 .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
125 .target_cpus = flat_target_cpus, 123 .target_cpus = flat_target_cpus,
126 .apic_id_registered = flat_apic_id_registered, 124 .apic_id_registered = flat_apic_id_registered,
127 .init_apic_ldr = flat_init_apic_ldr, 125 .init_apic_ldr = flat_init_apic_ldr,
@@ -180,7 +178,6 @@ struct genapic apic_physflat = {
180 .name = "physical flat", 178 .name = "physical flat",
181 .int_delivery_mode = dest_Fixed, 179 .int_delivery_mode = dest_Fixed,
182 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 180 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
183 .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED,
184 .target_cpus = physflat_target_cpus, 181 .target_cpus = physflat_target_cpus,
185 .apic_id_registered = flat_apic_id_registered, 182 .apic_id_registered = flat_apic_id_registered,
186 .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ 183 .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index c9739ca81d06..1e6f80870679 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -5,8 +5,6 @@
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> 5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> 6 * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
7 * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> 7 * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
8 *
9 * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
10 */ 8 */
11 9
12 10
@@ -187,12 +185,15 @@ startup_64:
187 185
188 /* Finally jump to run C code and to be on real kernel address 186 /* Finally jump to run C code and to be on real kernel address
189 * Since we are running on identity-mapped space we have to jump 187 * Since we are running on identity-mapped space we have to jump
190 * to the full 64bit address , this is only possible as indirect 188 * to the full 64bit address, this is only possible as indirect
191 * jump 189 * jump. In addition we need to ensure %cs is set so we make this
190 * a far return.
192 */ 191 */
193 movq initial_code(%rip),%rax 192 movq initial_code(%rip),%rax
194 pushq $0 # fake return address 193 pushq $0 # fake return address to stop unwinder
195 jmp *%rax 194 pushq $__KERNEL_CS # set correct cs
195 pushq %rax # target address in negative space
196 lretq
196 197
197 /* SMP bootup changes these two */ 198 /* SMP bootup changes these two */
198 .align 8 199 .align 8
@@ -371,7 +372,7 @@ ENTRY(cpu_gdt_table)
371 .quad 0,0 /* TSS */ 372 .quad 0,0 /* TSS */
372 .quad 0,0 /* LDT */ 373 .quad 0,0 /* LDT */
373 .quad 0,0,0 /* three TLS descriptors */ 374 .quad 0,0,0 /* three TLS descriptors */
374 .quad 0 /* unused */ 375 .quad 0x0000f40000000000 /* node/CPU stored in limit */
375gdt_end: 376gdt_end:
376 /* asm/segment.h:GDT_ENTRIES must match this */ 377 /* asm/segment.h:GDT_ENTRIES must match this */
377 /* This should be a multiple of the cache line size */ 378 /* This should be a multiple of the cache line size */
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index 36647ce6aecb..9561eb3c5b5c 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -45,38 +45,16 @@ static void __init copy_bootdata(char *real_mode_data)
45 new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); 45 new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
46 if (!new_data) { 46 if (!new_data) {
47 if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { 47 if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
48 printk("so old bootloader that it does not support commandline?!\n");
49 return; 48 return;
50 } 49 }
51 new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; 50 new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
52 printk("old bootloader convention, maybe loadlin?\n");
53 } 51 }
54 command_line = (char *) ((u64)(new_data)); 52 command_line = (char *) ((u64)(new_data));
55 memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE); 53 memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
56 printk("Bootdata ok (command line is %s)\n", saved_command_line);
57}
58
59static void __init setup_boot_cpu_data(void)
60{
61 unsigned int dummy, eax;
62
63 /* get vendor info */
64 cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
65 (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
66 (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
67 (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
68
69 /* get cpu type */
70 cpuid(1, &eax, &dummy, &dummy,
71 (unsigned int *) &boot_cpu_data.x86_capability);
72 boot_cpu_data.x86 = (eax >> 8) & 0xf;
73 boot_cpu_data.x86_model = (eax >> 4) & 0xf;
74 boot_cpu_data.x86_mask = eax & 0xf;
75} 54}
76 55
77void __init x86_64_start_kernel(char * real_mode_data) 56void __init x86_64_start_kernel(char * real_mode_data)
78{ 57{
79 char *s;
80 int i; 58 int i;
81 59
82 for (i = 0; i < 256; i++) 60 for (i = 0; i < 256; i++)
@@ -84,10 +62,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
84 asm volatile("lidt %0" :: "m" (idt_descr)); 62 asm volatile("lidt %0" :: "m" (idt_descr));
85 clear_bss(); 63 clear_bss();
86 64
87 /* 65 early_printk("Kernel alive\n");
88 * This must be called really, really early:
89 */
90 lockdep_init();
91 66
92 /* 67 /*
93 * switch to init_level4_pgt from boot_level4_pgt 68 * switch to init_level4_pgt from boot_level4_pgt
@@ -103,22 +78,5 @@ void __init x86_64_start_kernel(char * real_mode_data)
103#ifdef CONFIG_SMP 78#ifdef CONFIG_SMP
104 cpu_set(0, cpu_online_map); 79 cpu_set(0, cpu_online_map);
105#endif 80#endif
106 s = strstr(saved_command_line, "earlyprintk=");
107 if (s != NULL)
108 setup_early_printk(strchr(s, '=') + 1);
109#ifdef CONFIG_NUMA
110 s = strstr(saved_command_line, "numa=");
111 if (s != NULL)
112 numa_setup(s+5);
113#endif
114#ifdef CONFIG_X86_IO_APIC
115 if (strstr(saved_command_line, "disableapic"))
116 disable_apic = 1;
117#endif
118 /* You need early console to see that */
119 if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
120 panic("Kernel too big for kernel mapping\n");
121
122 setup_boot_cpu_data();
123 start_kernel(); 81 start_kernel();
124} 82}
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 0434b1f8e3dd..2dd51f364ea2 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -55,7 +55,6 @@
55 */ 55 */
56BUILD_16_IRQS(0x0) 56BUILD_16_IRQS(0x0)
57 57
58#ifdef CONFIG_X86_LOCAL_APIC
59/* 58/*
60 * The IO-APIC gives us many more interrupt sources. Most of these 59 * The IO-APIC gives us many more interrupt sources. Most of these
61 * are unused but an SMP system is supposed to have enough memory ... 60 * are unused but an SMP system is supposed to have enough memory ...
@@ -75,8 +74,6 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)
75 BUILD_15_IRQS(0xe) 74 BUILD_15_IRQS(0xe)
76#endif 75#endif
77 76
78#endif
79
80#undef BUILD_16_IRQS 77#undef BUILD_16_IRQS
81#undef BUILD_15_IRQS 78#undef BUILD_15_IRQS
82#undef BI 79#undef BI
@@ -100,7 +97,6 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)
100void (*interrupt[NR_IRQS])(void) = { 97void (*interrupt[NR_IRQS])(void) = {
101 IRQLIST_16(0x0), 98 IRQLIST_16(0x0),
102 99
103#ifdef CONFIG_X86_IO_APIC
104 IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), 100 IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3),
105 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), 101 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
106 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), 102 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
@@ -110,7 +106,6 @@ void (*interrupt[NR_IRQS])(void) = {
110 , IRQLIST_15(0xe) 106 , IRQLIST_15(0xe)
111#endif 107#endif
112 108
113#endif
114}; 109};
115 110
116#undef IRQ 111#undef IRQ
@@ -128,6 +123,8 @@ void (*interrupt[NR_IRQS])(void) = {
128 123
129DEFINE_SPINLOCK(i8259A_lock); 124DEFINE_SPINLOCK(i8259A_lock);
130 125
126static int i8259A_auto_eoi;
127
131static void end_8259A_irq (unsigned int irq) 128static void end_8259A_irq (unsigned int irq)
132{ 129{
133 if (irq > 256) { 130 if (irq > 256) {
@@ -341,6 +338,8 @@ void init_8259A(int auto_eoi)
341{ 338{
342 unsigned long flags; 339 unsigned long flags;
343 340
341 i8259A_auto_eoi = auto_eoi;
342
344 spin_lock_irqsave(&i8259A_lock, flags); 343 spin_lock_irqsave(&i8259A_lock, flags);
345 344
346 outb(0xff, 0x21); /* mask all of 8259A-1 */ 345 outb(0xff, 0x21); /* mask all of 8259A-1 */
@@ -399,7 +398,7 @@ static void save_ELCR(char *trigger)
399 398
400static int i8259A_resume(struct sys_device *dev) 399static int i8259A_resume(struct sys_device *dev)
401{ 400{
402 init_8259A(0); 401 init_8259A(i8259A_auto_eoi);
403 restore_ELCR(irq_trigger); 402 restore_ELCR(irq_trigger);
404 return 0; 403 return 0;
405} 404}
@@ -453,9 +452,7 @@ void __init init_ISA_irqs (void)
453{ 452{
454 int i; 453 int i;
455 454
456#ifdef CONFIG_X86_LOCAL_APIC
457 init_bsp_APIC(); 455 init_bsp_APIC();
458#endif
459 init_8259A(0); 456 init_8259A(0);
460 457
461 for (i = 0; i < NR_IRQS; i++) { 458 for (i = 0; i < NR_IRQS; i++) {
@@ -581,14 +578,12 @@ void __init init_IRQ(void)
581 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 578 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
582 set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 579 set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
583 580
584#ifdef CONFIG_X86_LOCAL_APIC
585 /* self generated IPI for local APIC timer */ 581 /* self generated IPI for local APIC timer */
586 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 582 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
587 583
588 /* IPI vectors for APIC spurious and error interrupts */ 584 /* IPI vectors for APIC spurious and error interrupts */
589 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 585 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
590 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 586 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
591#endif
592 587
593 /* 588 /*
594 * Set the clock to HZ Hz, we already have a valid 589 * Set the clock to HZ Hz, we already have a valid
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 924a4a332954..0491019d4c8d 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -48,7 +48,7 @@ int sis_apic_bug; /* not actually supported, dummy for compile */
48 48
49static int no_timer_check; 49static int no_timer_check;
50 50
51int disable_timer_pin_1 __initdata; 51static int disable_timer_pin_1 __initdata;
52 52
53int timer_over_8254 __initdata = 0; 53int timer_over_8254 __initdata = 0;
54 54
@@ -111,6 +111,33 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
111 FINAL; \ 111 FINAL; \
112} 112}
113 113
114union entry_union {
115 struct { u32 w1, w2; };
116 struct IO_APIC_route_entry entry;
117};
118
119static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
120{
121 union entry_union eu;
122 unsigned long flags;
123 spin_lock_irqsave(&ioapic_lock, flags);
124 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
125 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
126 spin_unlock_irqrestore(&ioapic_lock, flags);
127 return eu.entry;
128}
129
130static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
131{
132 unsigned long flags;
133 union entry_union eu;
134 eu.entry = e;
135 spin_lock_irqsave(&ioapic_lock, flags);
136 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
137 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
138 spin_unlock_irqrestore(&ioapic_lock, flags);
139}
140
114#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
115static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) 142static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
116{ 143{
@@ -196,13 +223,9 @@ static void unmask_IO_APIC_irq (unsigned int irq)
196static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) 223static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
197{ 224{
198 struct IO_APIC_route_entry entry; 225 struct IO_APIC_route_entry entry;
199 unsigned long flags;
200 226
201 /* Check delivery_mode to be sure we're not clearing an SMI pin */ 227 /* Check delivery_mode to be sure we're not clearing an SMI pin */
202 spin_lock_irqsave(&ioapic_lock, flags); 228 entry = ioapic_read_entry(apic, pin);
203 *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
204 *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
205 spin_unlock_irqrestore(&ioapic_lock, flags);
206 if (entry.delivery_mode == dest_SMI) 229 if (entry.delivery_mode == dest_SMI)
207 return; 230 return;
208 /* 231 /*
@@ -210,10 +233,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
210 */ 233 */
211 memset(&entry, 0, sizeof(entry)); 234 memset(&entry, 0, sizeof(entry));
212 entry.mask = 1; 235 entry.mask = 1;
213 spin_lock_irqsave(&ioapic_lock, flags); 236 ioapic_write_entry(apic, pin, entry);
214 io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
215 io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
216 spin_unlock_irqrestore(&ioapic_lock, flags);
217} 237}
218 238
219static void clear_IO_APIC (void) 239static void clear_IO_APIC (void)
@@ -225,14 +245,6 @@ static void clear_IO_APIC (void)
225 clear_IO_APIC_pin(apic, pin); 245 clear_IO_APIC_pin(apic, pin);
226} 246}
227 247
228/*
229 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
230 * specific CPU-side IRQs.
231 */
232
233#define MAX_PIRQS 8
234static int pirq_entries [MAX_PIRQS];
235static int pirqs_enabled;
236int skip_ioapic_setup; 248int skip_ioapic_setup;
237int ioapic_force; 249int ioapic_force;
238 250
@@ -241,18 +253,17 @@ int ioapic_force;
241static int __init disable_ioapic_setup(char *str) 253static int __init disable_ioapic_setup(char *str)
242{ 254{
243 skip_ioapic_setup = 1; 255 skip_ioapic_setup = 1;
244 return 1; 256 return 0;
245} 257}
258early_param("noapic", disable_ioapic_setup);
246 259
247static int __init enable_ioapic_setup(char *str) 260/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
261static int __init disable_timer_pin_setup(char *arg)
248{ 262{
249 ioapic_force = 1; 263 disable_timer_pin_1 = 1;
250 skip_ioapic_setup = 0;
251 return 1; 264 return 1;
252} 265}
253 266__setup("disable_timer_pin_1", disable_timer_pin_setup);
254__setup("noapic", disable_ioapic_setup);
255__setup("apic", enable_ioapic_setup);
256 267
257static int __init setup_disable_8254_timer(char *s) 268static int __init setup_disable_8254_timer(char *s)
258{ 269{
@@ -268,135 +279,6 @@ static int __init setup_enable_8254_timer(char *s)
268__setup("disable_8254_timer", setup_disable_8254_timer); 279__setup("disable_8254_timer", setup_disable_8254_timer);
269__setup("enable_8254_timer", setup_enable_8254_timer); 280__setup("enable_8254_timer", setup_enable_8254_timer);
270 281
271#include <asm/pci-direct.h>
272#include <linux/pci_ids.h>
273#include <linux/pci.h>
274
275
276#ifdef CONFIG_ACPI
277
278static int nvidia_hpet_detected __initdata;
279
280static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
281{
282 nvidia_hpet_detected = 1;
283 return 0;
284}
285#endif
286
287/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
288 off. Check for an Nvidia or VIA PCI bridge and turn it off.
289 Use pci direct infrastructure because this runs before the PCI subsystem.
290
291 Can be overwritten with "apic"
292
293 And another hack to disable the IOMMU on VIA chipsets.
294
295 ... and others. Really should move this somewhere else.
296
297 Kludge-O-Rama. */
298void __init check_ioapic(void)
299{
300 int num,slot,func;
301 /* Poor man's PCI discovery */
302 for (num = 0; num < 32; num++) {
303 for (slot = 0; slot < 32; slot++) {
304 for (func = 0; func < 8; func++) {
305 u32 class;
306 u32 vendor;
307 u8 type;
308 class = read_pci_config(num,slot,func,
309 PCI_CLASS_REVISION);
310 if (class == 0xffffffff)
311 break;
312
313 if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
314 continue;
315
316 vendor = read_pci_config(num, slot, func,
317 PCI_VENDOR_ID);
318 vendor &= 0xffff;
319 switch (vendor) {
320 case PCI_VENDOR_ID_VIA:
321#ifdef CONFIG_IOMMU
322 if ((end_pfn > MAX_DMA32_PFN ||
323 force_iommu) &&
324 !iommu_aperture_allowed) {
325 printk(KERN_INFO
326 "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n");
327 iommu_aperture_disabled = 1;
328 }
329#endif
330 return;
331 case PCI_VENDOR_ID_NVIDIA:
332#ifdef CONFIG_ACPI
333 /*
334 * All timer overrides on Nvidia are
335 * wrong unless HPET is enabled.
336 */
337 nvidia_hpet_detected = 0;
338 acpi_table_parse(ACPI_HPET,
339 nvidia_hpet_check);
340 if (nvidia_hpet_detected == 0) {
341 acpi_skip_timer_override = 1;
342 printk(KERN_INFO "Nvidia board "
343 "detected. Ignoring ACPI "
344 "timer override.\n");
345 }
346#endif
347 /* RED-PEN skip them on mptables too? */
348 return;
349
350 /* This should be actually default, but
351 for 2.6.16 let's do it for ATI only where
352 it's really needed. */
353 case PCI_VENDOR_ID_ATI:
354 if (timer_over_8254 == 1) {
355 timer_over_8254 = 0;
356 printk(KERN_INFO
357 "ATI board detected. Disabling timer routing over 8254.\n");
358 }
359 return;
360 }
361
362
363 /* No multi-function device? */
364 type = read_pci_config_byte(num,slot,func,
365 PCI_HEADER_TYPE);
366 if (!(type & 0x80))
367 break;
368 }
369 }
370 }
371}
372
373static int __init ioapic_pirq_setup(char *str)
374{
375 int i, max;
376 int ints[MAX_PIRQS+1];
377
378 get_options(str, ARRAY_SIZE(ints), ints);
379
380 for (i = 0; i < MAX_PIRQS; i++)
381 pirq_entries[i] = -1;
382
383 pirqs_enabled = 1;
384 apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
385 max = MAX_PIRQS;
386 if (ints[0] < MAX_PIRQS)
387 max = ints[0];
388
389 for (i = 0; i < max; i++) {
390 apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
391 /*
392 * PIRQs are mapped upside down, usually.
393 */
394 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
395 }
396 return 1;
397}
398
399__setup("pirq=", ioapic_pirq_setup);
400 282
401/* 283/*
402 * Find the IRQ entry number of a certain pin. 284 * Find the IRQ entry number of a certain pin.
@@ -425,9 +307,7 @@ static int __init find_isa_irq_pin(int irq, int type)
425 for (i = 0; i < mp_irq_entries; i++) { 307 for (i = 0; i < mp_irq_entries; i++) {
426 int lbus = mp_irqs[i].mpc_srcbus; 308 int lbus = mp_irqs[i].mpc_srcbus;
427 309
428 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || 310 if (test_bit(lbus, mp_bus_not_pci) &&
429 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
430 mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
431 (mp_irqs[i].mpc_irqtype == type) && 311 (mp_irqs[i].mpc_irqtype == type) &&
432 (mp_irqs[i].mpc_srcbusirq == irq)) 312 (mp_irqs[i].mpc_srcbusirq == irq))
433 313
@@ -443,9 +323,7 @@ static int __init find_isa_irq_apic(int irq, int type)
443 for (i = 0; i < mp_irq_entries; i++) { 323 for (i = 0; i < mp_irq_entries; i++) {
444 int lbus = mp_irqs[i].mpc_srcbus; 324 int lbus = mp_irqs[i].mpc_srcbus;
445 325
446 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || 326 if (test_bit(lbus, mp_bus_not_pci) &&
447 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
448 mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
449 (mp_irqs[i].mpc_irqtype == type) && 327 (mp_irqs[i].mpc_irqtype == type) &&
450 (mp_irqs[i].mpc_srcbusirq == irq)) 328 (mp_irqs[i].mpc_srcbusirq == irq))
451 break; 329 break;
@@ -485,7 +363,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
485 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) 363 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
486 break; 364 break;
487 365
488 if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && 366 if (!test_bit(lbus, mp_bus_not_pci) &&
489 !mp_irqs[i].mpc_irqtype && 367 !mp_irqs[i].mpc_irqtype &&
490 (bus == lbus) && 368 (bus == lbus) &&
491 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { 369 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
@@ -508,27 +386,6 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
508 return best_guess; 386 return best_guess;
509} 387}
510 388
511/*
512 * EISA Edge/Level control register, ELCR
513 */
514static int EISA_ELCR(unsigned int irq)
515{
516 if (irq < 16) {
517 unsigned int port = 0x4d0 + (irq >> 3);
518 return (inb(port) >> (irq & 7)) & 1;
519 }
520 apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
521 return 0;
522}
523
524/* EISA interrupts are always polarity zero and can be edge or level
525 * trigger depending on the ELCR value. If an interrupt is listed as
526 * EISA conforming in the MP table, that means its trigger type must
527 * be read in from the ELCR */
528
529#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
530#define default_EISA_polarity(idx) (0)
531
532/* ISA interrupts are always polarity zero edge triggered, 389/* ISA interrupts are always polarity zero edge triggered,
533 * when listed as conforming in the MP table. */ 390 * when listed as conforming in the MP table. */
534 391
@@ -541,12 +398,6 @@ static int EISA_ELCR(unsigned int irq)
541#define default_PCI_trigger(idx) (1) 398#define default_PCI_trigger(idx) (1)
542#define default_PCI_polarity(idx) (1) 399#define default_PCI_polarity(idx) (1)
543 400
544/* MCA interrupts are always polarity zero level triggered,
545 * when listed as conforming in the MP table. */
546
547#define default_MCA_trigger(idx) (1)
548#define default_MCA_polarity(idx) (0)
549
550static int __init MPBIOS_polarity(int idx) 401static int __init MPBIOS_polarity(int idx)
551{ 402{
552 int bus = mp_irqs[idx].mpc_srcbus; 403 int bus = mp_irqs[idx].mpc_srcbus;
@@ -558,38 +409,11 @@ static int __init MPBIOS_polarity(int idx)
558 switch (mp_irqs[idx].mpc_irqflag & 3) 409 switch (mp_irqs[idx].mpc_irqflag & 3)
559 { 410 {
560 case 0: /* conforms, ie. bus-type dependent polarity */ 411 case 0: /* conforms, ie. bus-type dependent polarity */
561 { 412 if (test_bit(bus, mp_bus_not_pci))
562 switch (mp_bus_id_to_type[bus]) 413 polarity = default_ISA_polarity(idx);
563 { 414 else
564 case MP_BUS_ISA: /* ISA pin */ 415 polarity = default_PCI_polarity(idx);
565 {
566 polarity = default_ISA_polarity(idx);
567 break;
568 }
569 case MP_BUS_EISA: /* EISA pin */
570 {
571 polarity = default_EISA_polarity(idx);
572 break;
573 }
574 case MP_BUS_PCI: /* PCI pin */
575 {
576 polarity = default_PCI_polarity(idx);
577 break;
578 }
579 case MP_BUS_MCA: /* MCA pin */
580 {
581 polarity = default_MCA_polarity(idx);
582 break;
583 }
584 default:
585 {
586 printk(KERN_WARNING "broken BIOS!!\n");
587 polarity = 1;
588 break;
589 }
590 }
591 break; 416 break;
592 }
593 case 1: /* high active */ 417 case 1: /* high active */
594 { 418 {
595 polarity = 0; 419 polarity = 0;
@@ -627,38 +451,11 @@ static int MPBIOS_trigger(int idx)
627 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) 451 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
628 { 452 {
629 case 0: /* conforms, ie. bus-type dependent */ 453 case 0: /* conforms, ie. bus-type dependent */
630 { 454 if (test_bit(bus, mp_bus_not_pci))
631 switch (mp_bus_id_to_type[bus]) 455 trigger = default_ISA_trigger(idx);
632 { 456 else
633 case MP_BUS_ISA: /* ISA pin */ 457 trigger = default_PCI_trigger(idx);
634 {
635 trigger = default_ISA_trigger(idx);
636 break;
637 }
638 case MP_BUS_EISA: /* EISA pin */
639 {
640 trigger = default_EISA_trigger(idx);
641 break;
642 }
643 case MP_BUS_PCI: /* PCI pin */
644 {
645 trigger = default_PCI_trigger(idx);
646 break;
647 }
648 case MP_BUS_MCA: /* MCA pin */
649 {
650 trigger = default_MCA_trigger(idx);
651 break;
652 }
653 default:
654 {
655 printk(KERN_WARNING "broken BIOS!!\n");
656 trigger = 1;
657 break;
658 }
659 }
660 break; 458 break;
661 }
662 case 1: /* edge */ 459 case 1: /* edge */
663 { 460 {
664 trigger = 0; 461 trigger = 0;
@@ -764,49 +561,17 @@ static int pin_2_irq(int idx, int apic, int pin)
764 if (mp_irqs[idx].mpc_dstirq != pin) 561 if (mp_irqs[idx].mpc_dstirq != pin)
765 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); 562 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
766 563
767 switch (mp_bus_id_to_type[bus]) 564 if (test_bit(bus, mp_bus_not_pci)) {
768 { 565 irq = mp_irqs[idx].mpc_srcbusirq;
769 case MP_BUS_ISA: /* ISA pin */ 566 } else {
770 case MP_BUS_EISA: 567 /*
771 case MP_BUS_MCA: 568 * PCI IRQs are mapped in order
772 { 569 */
773 irq = mp_irqs[idx].mpc_srcbusirq; 570 i = irq = 0;
774 break; 571 while (i < apic)
775 } 572 irq += nr_ioapic_registers[i++];
776 case MP_BUS_PCI: /* PCI pin */ 573 irq += pin;
777 { 574 irq = gsi_irq_sharing(irq);
778 /*
779 * PCI IRQs are mapped in order
780 */
781 i = irq = 0;
782 while (i < apic)
783 irq += nr_ioapic_registers[i++];
784 irq += pin;
785 irq = gsi_irq_sharing(irq);
786 break;
787 }
788 default:
789 {
790 printk(KERN_ERR "unknown bus type %d.\n",bus);
791 irq = 0;
792 break;
793 }
794 }
795 BUG_ON(irq >= NR_IRQS);
796
797 /*
798 * PCI IRQ command line redirection. Yes, limits are hardcoded.
799 */
800 if ((pin >= 16) && (pin <= 23)) {
801 if (pirq_entries[pin-16] != -1) {
802 if (!pirq_entries[pin-16]) {
803 apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
804 } else {
805 irq = pirq_entries[pin-16];
806 apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
807 pin-16, irq);
808 }
809 }
810 } 575 }
811 BUG_ON(irq >= NR_IRQS); 576 BUG_ON(irq >= NR_IRQS);
812 return irq; 577 return irq;
@@ -943,9 +708,9 @@ static void __init setup_IO_APIC_irqs(void)
943 if (!apic && (irq < 16)) 708 if (!apic && (irq < 16))
944 disable_8259A_irq(irq); 709 disable_8259A_irq(irq);
945 } 710 }
711 ioapic_write_entry(apic, pin, entry);
712
946 spin_lock_irqsave(&ioapic_lock, flags); 713 spin_lock_irqsave(&ioapic_lock, flags);
947 io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
948 io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
949 set_native_irq_info(irq, TARGET_CPUS); 714 set_native_irq_info(irq, TARGET_CPUS);
950 spin_unlock_irqrestore(&ioapic_lock, flags); 715 spin_unlock_irqrestore(&ioapic_lock, flags);
951 } 716 }
@@ -1083,10 +848,7 @@ void __apicdebuginit print_IO_APIC(void)
1083 for (i = 0; i <= reg_01.bits.entries; i++) { 848 for (i = 0; i <= reg_01.bits.entries; i++) {
1084 struct IO_APIC_route_entry entry; 849 struct IO_APIC_route_entry entry;
1085 850
1086 spin_lock_irqsave(&ioapic_lock, flags); 851 entry = ioapic_read_entry(apic, i);
1087 *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
1088 *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
1089 spin_unlock_irqrestore(&ioapic_lock, flags);
1090 852
1091 printk(KERN_DEBUG " %02x %03X %02X ", 853 printk(KERN_DEBUG " %02x %03X %02X ",
1092 i, 854 i,
@@ -1281,9 +1043,6 @@ static void __init enable_IO_APIC(void)
1281 irq_2_pin[i].pin = -1; 1043 irq_2_pin[i].pin = -1;
1282 irq_2_pin[i].next = 0; 1044 irq_2_pin[i].next = 0;
1283 } 1045 }
1284 if (!pirqs_enabled)
1285 for (i = 0; i < MAX_PIRQS; i++)
1286 pirq_entries[i] = -1;
1287 1046
1288 /* 1047 /*
1289 * The number of IO-APIC IRQ registers (== #pins): 1048 * The number of IO-APIC IRQ registers (== #pins):
@@ -1299,11 +1058,7 @@ static void __init enable_IO_APIC(void)
1299 /* See if any of the pins is in ExtINT mode */ 1058 /* See if any of the pins is in ExtINT mode */
1300 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1059 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1301 struct IO_APIC_route_entry entry; 1060 struct IO_APIC_route_entry entry;
1302 spin_lock_irqsave(&ioapic_lock, flags); 1061 entry = ioapic_read_entry(apic, pin);
1303 *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
1304 *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
1305 spin_unlock_irqrestore(&ioapic_lock, flags);
1306
1307 1062
1308 /* If the interrupt line is enabled and in ExtInt mode 1063 /* If the interrupt line is enabled and in ExtInt mode
1309 * I have found the pin where the i8259 is connected. 1064 * I have found the pin where the i8259 is connected.
@@ -1355,7 +1110,6 @@ void disable_IO_APIC(void)
1355 */ 1110 */
1356 if (ioapic_i8259.pin != -1) { 1111 if (ioapic_i8259.pin != -1) {
1357 struct IO_APIC_route_entry entry; 1112 struct IO_APIC_route_entry entry;
1358 unsigned long flags;
1359 1113
1360 memset(&entry, 0, sizeof(entry)); 1114 memset(&entry, 0, sizeof(entry));
1361 entry.mask = 0; /* Enabled */ 1115 entry.mask = 0; /* Enabled */
@@ -1372,84 +1126,13 @@ void disable_IO_APIC(void)
1372 /* 1126 /*
1373 * Add it to the IO-APIC irq-routing table: 1127 * Add it to the IO-APIC irq-routing table:
1374 */ 1128 */
1375 spin_lock_irqsave(&ioapic_lock, flags); 1129 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
1376 io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
1377 *(((int *)&entry)+1));
1378 io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
1379 *(((int *)&entry)+0));
1380 spin_unlock_irqrestore(&ioapic_lock, flags);
1381 } 1130 }
1382 1131
1383 disconnect_bsp_APIC(ioapic_i8259.pin != -1); 1132 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1384} 1133}
1385 1134
1386/* 1135/*
1387 * function to set the IO-APIC physical IDs based on the
1388 * values stored in the MPC table.
1389 *
1390 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1391 */
1392
1393static void __init setup_ioapic_ids_from_mpc (void)
1394{
1395 union IO_APIC_reg_00 reg_00;
1396 int apic;
1397 int i;
1398 unsigned char old_id;
1399 unsigned long flags;
1400
1401 /*
1402 * Set the IOAPIC ID to the value stored in the MPC table.
1403 */
1404 for (apic = 0; apic < nr_ioapics; apic++) {
1405
1406 /* Read the register 0 value */
1407 spin_lock_irqsave(&ioapic_lock, flags);
1408 reg_00.raw = io_apic_read(apic, 0);
1409 spin_unlock_irqrestore(&ioapic_lock, flags);
1410
1411 old_id = mp_ioapics[apic].mpc_apicid;
1412
1413
1414 printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
1415
1416
1417 /*
1418 * We need to adjust the IRQ routing table
1419 * if the ID changed.
1420 */
1421 if (old_id != mp_ioapics[apic].mpc_apicid)
1422 for (i = 0; i < mp_irq_entries; i++)
1423 if (mp_irqs[i].mpc_dstapic == old_id)
1424 mp_irqs[i].mpc_dstapic
1425 = mp_ioapics[apic].mpc_apicid;
1426
1427 /*
1428 * Read the right value from the MPC table and
1429 * write it into the ID register.
1430 */
1431 apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
1432 mp_ioapics[apic].mpc_apicid);
1433
1434 reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
1435 spin_lock_irqsave(&ioapic_lock, flags);
1436 io_apic_write(apic, 0, reg_00.raw);
1437 spin_unlock_irqrestore(&ioapic_lock, flags);
1438
1439 /*
1440 * Sanity check
1441 */
1442 spin_lock_irqsave(&ioapic_lock, flags);
1443 reg_00.raw = io_apic_read(apic, 0);
1444 spin_unlock_irqrestore(&ioapic_lock, flags);
1445 if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
1446 printk("could not set ID!\n");
1447 else
1448 apic_printk(APIC_VERBOSE," ok.\n");
1449 }
1450}
1451
1452/*
1453 * There is a nasty bug in some older SMP boards, their mptable lies 1136 * There is a nasty bug in some older SMP boards, their mptable lies
1454 * about the timer IRQ. We do the following to work around the situation: 1137 * about the timer IRQ. We do the following to work around the situation:
1455 * 1138 *
@@ -1964,11 +1647,6 @@ void __init setup_IO_APIC(void)
1964 1647
1965 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 1648 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
1966 1649
1967 /*
1968 * Set up the IO-APIC IRQ routing table.
1969 */
1970 if (!acpi_ioapic)
1971 setup_ioapic_ids_from_mpc();
1972 sync_Arb_IDs(); 1650 sync_Arb_IDs();
1973 setup_IO_APIC_irqs(); 1651 setup_IO_APIC_irqs();
1974 init_IO_APIC_traps(); 1652 init_IO_APIC_traps();
@@ -1987,17 +1665,12 @@ static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
1987{ 1665{
1988 struct IO_APIC_route_entry *entry; 1666 struct IO_APIC_route_entry *entry;
1989 struct sysfs_ioapic_data *data; 1667 struct sysfs_ioapic_data *data;
1990 unsigned long flags;
1991 int i; 1668 int i;
1992 1669
1993 data = container_of(dev, struct sysfs_ioapic_data, dev); 1670 data = container_of(dev, struct sysfs_ioapic_data, dev);
1994 entry = data->entry; 1671 entry = data->entry;
1995 spin_lock_irqsave(&ioapic_lock, flags); 1672 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
1996 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { 1673 *entry = ioapic_read_entry(dev->id, i);
1997 *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
1998 *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
1999 }
2000 spin_unlock_irqrestore(&ioapic_lock, flags);
2001 1674
2002 return 0; 1675 return 0;
2003} 1676}
@@ -2019,11 +1692,9 @@ static int ioapic_resume(struct sys_device *dev)
2019 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; 1692 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
2020 io_apic_write(dev->id, 0, reg_00.raw); 1693 io_apic_write(dev->id, 0, reg_00.raw);
2021 } 1694 }
2022 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
2023 io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
2024 io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
2025 }
2026 spin_unlock_irqrestore(&ioapic_lock, flags); 1695 spin_unlock_irqrestore(&ioapic_lock, flags);
1696 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
1697 ioapic_write_entry(dev->id, i, entry[i]);
2027 1698
2028 return 0; 1699 return 0;
2029} 1700}
@@ -2077,19 +1748,6 @@ device_initcall(ioapic_init_sysfs);
2077 1748
2078#define IO_APIC_MAX_ID 0xFE 1749#define IO_APIC_MAX_ID 0xFE
2079 1750
2080int __init io_apic_get_version (int ioapic)
2081{
2082 union IO_APIC_reg_01 reg_01;
2083 unsigned long flags;
2084
2085 spin_lock_irqsave(&ioapic_lock, flags);
2086 reg_01.raw = io_apic_read(ioapic, 1);
2087 spin_unlock_irqrestore(&ioapic_lock, flags);
2088
2089 return reg_01.bits.version;
2090}
2091
2092
2093int __init io_apic_get_redir_entries (int ioapic) 1751int __init io_apic_get_redir_entries (int ioapic)
2094{ 1752{
2095 union IO_APIC_reg_01 reg_01; 1753 union IO_APIC_reg_01 reg_01;
@@ -2148,10 +1806,10 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
2148 if (!ioapic && (irq < 16)) 1806 if (!ioapic && (irq < 16))
2149 disable_8259A_irq(irq); 1807 disable_8259A_irq(irq);
2150 1808
1809 ioapic_write_entry(ioapic, pin, entry);
1810
2151 spin_lock_irqsave(&ioapic_lock, flags); 1811 spin_lock_irqsave(&ioapic_lock, flags);
2152 io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); 1812 set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
2153 io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
2154 set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
2155 spin_unlock_irqrestore(&ioapic_lock, flags); 1813 spin_unlock_irqrestore(&ioapic_lock, flags);
2156 1814
2157 return 0; 1815 return 0;
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
index b81614970ecc..fe063d3cfe42 100644
--- a/arch/x86_64/kernel/ioport.c
+++ b/arch/x86_64/kernel/ioport.c
@@ -56,6 +56,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
56 56
57 memset(bitmap, 0xff, IO_BITMAP_BYTES); 57 memset(bitmap, 0xff, IO_BITMAP_BYTES);
58 t->io_bitmap_ptr = bitmap; 58 t->io_bitmap_ptr = bitmap;
59 set_thread_flag(TIF_IO_BITMAP);
59 } 60 }
60 61
61 /* 62 /*
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 5221a53e90c1..b3677e6ccc6e 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -20,11 +20,6 @@
20#include <asm/idle.h> 20#include <asm/idle.h>
21 21
22atomic_t irq_err_count; 22atomic_t irq_err_count;
23#ifdef CONFIG_X86_IO_APIC
24#ifdef APIC_MISMATCH_DEBUG
25atomic_t irq_mis_count;
26#endif
27#endif
28 23
29#ifdef CONFIG_DEBUG_STACKOVERFLOW 24#ifdef CONFIG_DEBUG_STACKOVERFLOW
30/* 25/*
@@ -92,18 +87,11 @@ skip:
92 for_each_online_cpu(j) 87 for_each_online_cpu(j)
93 seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); 88 seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
94 seq_putc(p, '\n'); 89 seq_putc(p, '\n');
95#ifdef CONFIG_X86_LOCAL_APIC
96 seq_printf(p, "LOC: "); 90 seq_printf(p, "LOC: ");
97 for_each_online_cpu(j) 91 for_each_online_cpu(j)
98 seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); 92 seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
99 seq_putc(p, '\n'); 93 seq_putc(p, '\n');
100#endif
101 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); 94 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
102#ifdef CONFIG_X86_IO_APIC
103#ifdef APIC_MISMATCH_DEBUG
104 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
105#endif
106#endif
107 } 95 }
108 return 0; 96 return 0;
109} 97}
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
index 106076b370fc..0497e3bd5bff 100644
--- a/arch/x86_64/kernel/machine_kexec.c
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -15,6 +15,15 @@
15#include <asm/mmu_context.h> 15#include <asm/mmu_context.h>
16#include <asm/io.h> 16#include <asm/io.h>
17 17
18#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
19static u64 kexec_pgd[512] PAGE_ALIGNED;
20static u64 kexec_pud0[512] PAGE_ALIGNED;
21static u64 kexec_pmd0[512] PAGE_ALIGNED;
22static u64 kexec_pte0[512] PAGE_ALIGNED;
23static u64 kexec_pud1[512] PAGE_ALIGNED;
24static u64 kexec_pmd1[512] PAGE_ALIGNED;
25static u64 kexec_pte1[512] PAGE_ALIGNED;
26
18static void init_level2_page(pmd_t *level2p, unsigned long addr) 27static void init_level2_page(pmd_t *level2p, unsigned long addr)
19{ 28{
20 unsigned long end_addr; 29 unsigned long end_addr;
@@ -144,32 +153,19 @@ static void load_segments(void)
144 ); 153 );
145} 154}
146 155
147typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
148 unsigned long control_code_buffer,
149 unsigned long start_address,
150 unsigned long pgtable) ATTRIB_NORET;
151
152extern const unsigned char relocate_new_kernel[];
153extern const unsigned long relocate_new_kernel_size;
154
155int machine_kexec_prepare(struct kimage *image) 156int machine_kexec_prepare(struct kimage *image)
156{ 157{
157 unsigned long start_pgtable, control_code_buffer; 158 unsigned long start_pgtable;
158 int result; 159 int result;
159 160
160 /* Calculate the offsets */ 161 /* Calculate the offsets */
161 start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; 162 start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
162 control_code_buffer = start_pgtable + PAGE_SIZE;
163 163
164 /* Setup the identity mapped 64bit page table */ 164 /* Setup the identity mapped 64bit page table */
165 result = init_pgtable(image, start_pgtable); 165 result = init_pgtable(image, start_pgtable);
166 if (result) 166 if (result)
167 return result; 167 return result;
168 168
169 /* Place the code in the reboot code buffer */
170 memcpy(__va(control_code_buffer), relocate_new_kernel,
171 relocate_new_kernel_size);
172
173 return 0; 169 return 0;
174} 170}
175 171
@@ -184,28 +180,34 @@ void machine_kexec_cleanup(struct kimage *image)
184 */ 180 */
185NORET_TYPE void machine_kexec(struct kimage *image) 181NORET_TYPE void machine_kexec(struct kimage *image)
186{ 182{
187 unsigned long page_list; 183 unsigned long page_list[PAGES_NR];
188 unsigned long control_code_buffer; 184 void *control_page;
189 unsigned long start_pgtable;
190 relocate_new_kernel_t rnk;
191 185
192 /* Interrupts aren't acceptable while we reboot */ 186 /* Interrupts aren't acceptable while we reboot */
193 local_irq_disable(); 187 local_irq_disable();
194 188
195 /* Calculate the offsets */ 189 control_page = page_address(image->control_code_page) + PAGE_SIZE;
196 page_list = image->head; 190 memcpy(control_page, relocate_kernel, PAGE_SIZE);
197 start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; 191
198 control_code_buffer = start_pgtable + PAGE_SIZE; 192 page_list[PA_CONTROL_PAGE] = __pa(control_page);
199 193 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
200 /* Set the low half of the page table to my identity mapped 194 page_list[PA_PGD] = __pa(kexec_pgd);
201 * page table for kexec. Leave the high half pointing at the 195 page_list[VA_PGD] = (unsigned long)kexec_pgd;
202 * kernel pages. Don't bother to flush the global pages 196 page_list[PA_PUD_0] = __pa(kexec_pud0);
203 * as that will happen when I fully switch to my identity mapped 197 page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
204 * page table anyway. 198 page_list[PA_PMD_0] = __pa(kexec_pmd0);
205 */ 199 page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
206 memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2); 200 page_list[PA_PTE_0] = __pa(kexec_pte0);
207 __flush_tlb(); 201 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
208 202 page_list[PA_PUD_1] = __pa(kexec_pud1);
203 page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
204 page_list[PA_PMD_1] = __pa(kexec_pmd1);
205 page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
206 page_list[PA_PTE_1] = __pa(kexec_pte1);
207 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
208
209 page_list[PA_TABLE_PAGE] =
210 (unsigned long)__pa(page_address(image->control_code_page));
209 211
210 /* The segment registers are funny things, they have both a 212 /* The segment registers are funny things, they have both a
211 * visible and an invisible part. Whenever the visible part is 213 * visible and an invisible part. Whenever the visible part is
@@ -222,7 +224,36 @@ NORET_TYPE void machine_kexec(struct kimage *image)
222 */ 224 */
223 set_gdt(phys_to_virt(0),0); 225 set_gdt(phys_to_virt(0),0);
224 set_idt(phys_to_virt(0),0); 226 set_idt(phys_to_virt(0),0);
227
225 /* now call it */ 228 /* now call it */
226 rnk = (relocate_new_kernel_t) control_code_buffer; 229 relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
227 (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); 230 image->start);
228} 231}
232
233/* crashkernel=size@addr specifies the location to reserve for
234 * a crash kernel. By reserving this memory we guarantee
235 * that linux never set's it up as a DMA target.
236 * Useful for holding code to do something appropriate
237 * after a kernel panic.
238 */
239static int __init setup_crashkernel(char *arg)
240{
241 unsigned long size, base;
242 char *p;
243 if (!arg)
244 return -EINVAL;
245 size = memparse(arg, &p);
246 if (arg == p)
247 return -EINVAL;
248 if (*p == '@') {
249 base = memparse(p+1, &p);
250 /* FIXME: Do I want a sanity check to validate the
251 * memory range? Yes you do, but it's too early for
252 * e820 -AK */
253 crashk_res.start = base;
254 crashk_res.end = base + size - 1;
255 }
256 return 0;
257}
258early_param("crashkernel", setup_crashkernel);
259
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 4e017fb30fb3..bbea88801d88 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -182,7 +182,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
182 goto out2; 182 goto out2;
183 183
184 memset(&m, 0, sizeof(struct mce)); 184 memset(&m, 0, sizeof(struct mce));
185 m.cpu = safe_smp_processor_id(); 185 m.cpu = smp_processor_id();
186 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); 186 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
187 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 187 if (!(m.mcgstatus & MCG_STATUS_RIPV))
188 kill_it = 1; 188 kill_it = 1;
@@ -274,6 +274,33 @@ void do_machine_check(struct pt_regs * regs, long error_code)
274 atomic_dec(&mce_entry); 274 atomic_dec(&mce_entry);
275} 275}
276 276
277#ifdef CONFIG_X86_MCE_INTEL
278/***
279 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
280 * @cpu: The CPU on which the event occured.
281 * @status: Event status information
282 *
283 * This function should be called by the thermal interrupt after the
284 * event has been processed and the decision was made to log the event
285 * further.
286 *
287 * The status parameter will be saved to the 'status' field of 'struct mce'
288 * and historically has been the register value of the
289 * MSR_IA32_THERMAL_STATUS (Intel) msr.
290 */
291void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
292{
293 struct mce m;
294
295 memset(&m, 0, sizeof(m));
296 m.cpu = cpu;
297 m.bank = MCE_THERMAL_BANK;
298 m.status = status;
299 rdtscll(m.tsc);
300 mce_log(&m);
301}
302#endif /* CONFIG_X86_MCE_INTEL */
303
277/* 304/*
278 * Periodic polling timer for "silent" machine check errors. 305 * Periodic polling timer for "silent" machine check errors.
279 */ 306 */
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
index 8f533d2c40cb..6551505d8a2c 100644
--- a/arch/x86_64/kernel/mce_intel.c
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -11,36 +11,21 @@
11#include <asm/mce.h> 11#include <asm/mce.h>
12#include <asm/hw_irq.h> 12#include <asm/hw_irq.h>
13#include <asm/idle.h> 13#include <asm/idle.h>
14 14#include <asm/therm_throt.h>
15static DEFINE_PER_CPU(unsigned long, next_check);
16 15
17asmlinkage void smp_thermal_interrupt(void) 16asmlinkage void smp_thermal_interrupt(void)
18{ 17{
19 struct mce m; 18 __u64 msr_val;
20 19
21 ack_APIC_irq(); 20 ack_APIC_irq();
22 21
23 exit_idle(); 22 exit_idle();
24 irq_enter(); 23 irq_enter();
25 if (time_before(jiffies, __get_cpu_var(next_check)))
26 goto done;
27
28 __get_cpu_var(next_check) = jiffies + HZ*300;
29 memset(&m, 0, sizeof(m));
30 m.cpu = smp_processor_id();
31 m.bank = MCE_THERMAL_BANK;
32 rdtscll(m.tsc);
33 rdmsrl(MSR_IA32_THERM_STATUS, m.status);
34 if (m.status & 0x1) {
35 printk(KERN_EMERG
36 "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu);
37 add_taint(TAINT_MACHINE_CHECK);
38 } else {
39 printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu);
40 }
41 24
42 mce_log(&m); 25 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
43done: 26 if (therm_throt_process(msr_val & 1))
27 mce_log_therm_throt_event(smp_processor_id(), msr_val);
28
44 irq_exit(); 29 irq_exit();
45} 30}
46 31
@@ -92,6 +77,9 @@ static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
92 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 77 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
93 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", 78 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
94 cpu, tm2 ? "TM2" : "TM1"); 79 cpu, tm2 ? "TM2" : "TM1");
80
81 /* enable thermal throttle processing */
82 atomic_set(&therm_throt_en, 1);
95 return; 83 return;
96} 84}
97 85
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index a1ab4197f8a1..20e88f4b564b 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -41,8 +41,7 @@ int acpi_found_madt;
41 * Various Linux-internal data structures created from the 41 * Various Linux-internal data structures created from the
42 * MP-table. 42 * MP-table.
43 */ 43 */
44unsigned char apic_version [MAX_APICS]; 44DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
45unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
46int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; 45int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
47 46
48static int mp_current_pci_id = 0; 47static int mp_current_pci_id = 0;
@@ -56,7 +55,6 @@ struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
56int mp_irq_entries; 55int mp_irq_entries;
57 56
58int nr_ioapics; 57int nr_ioapics;
59int pic_mode;
60unsigned long mp_lapic_addr = 0; 58unsigned long mp_lapic_addr = 0;
61 59
62 60
@@ -71,19 +69,6 @@ unsigned disabled_cpus __initdata;
71/* Bitmask of physically existing CPUs */ 69/* Bitmask of physically existing CPUs */
72physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; 70physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
73 71
74/* ACPI MADT entry parsing functions */
75#ifdef CONFIG_ACPI
76extern struct acpi_boot_flags acpi_boot;
77#ifdef CONFIG_X86_LOCAL_APIC
78extern int acpi_parse_lapic (acpi_table_entry_header *header);
79extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
80extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
81#endif /*CONFIG_X86_LOCAL_APIC*/
82#ifdef CONFIG_X86_IO_APIC
83extern int acpi_parse_ioapic (acpi_table_entry_header *header);
84#endif /*CONFIG_X86_IO_APIC*/
85#endif /*CONFIG_ACPI*/
86
87u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; 72u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
88 73
89 74
@@ -108,24 +93,20 @@ static int __init mpf_checksum(unsigned char *mp, int len)
108static void __cpuinit MP_processor_info (struct mpc_config_processor *m) 93static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
109{ 94{
110 int cpu; 95 int cpu;
111 unsigned char ver;
112 cpumask_t tmp_map; 96 cpumask_t tmp_map;
97 char *bootup_cpu = "";
113 98
114 if (!(m->mpc_cpuflag & CPU_ENABLED)) { 99 if (!(m->mpc_cpuflag & CPU_ENABLED)) {
115 disabled_cpus++; 100 disabled_cpus++;
116 return; 101 return;
117 } 102 }
118
119 printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
120 m->mpc_apicid,
121 (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
122 (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
123 m->mpc_apicver);
124
125 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 103 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
126 Dprintk(" Bootup CPU\n"); 104 bootup_cpu = " (Bootup-CPU)";
127 boot_cpu_id = m->mpc_apicid; 105 boot_cpu_id = m->mpc_apicid;
128 } 106 }
107
108 printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
109
129 if (num_processors >= NR_CPUS) { 110 if (num_processors >= NR_CPUS) {
130 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 111 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
131 " Processor ignored.\n", NR_CPUS); 112 " Processor ignored.\n", NR_CPUS);
@@ -136,24 +117,7 @@ static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
136 cpus_complement(tmp_map, cpu_present_map); 117 cpus_complement(tmp_map, cpu_present_map);
137 cpu = first_cpu(tmp_map); 118 cpu = first_cpu(tmp_map);
138 119
139#if MAX_APICS < 255
140 if ((int)m->mpc_apicid > MAX_APICS) {
141 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
142 m->mpc_apicid, MAX_APICS);
143 return;
144 }
145#endif
146 ver = m->mpc_apicver;
147
148 physid_set(m->mpc_apicid, phys_cpu_present_map); 120 physid_set(m->mpc_apicid, phys_cpu_present_map);
149 /*
150 * Validate version
151 */
152 if (ver == 0x0) {
153 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
154 ver = 0x10;
155 }
156 apic_version[m->mpc_apicid] = ver;
157 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 121 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
158 /* 122 /*
159 * bios_cpu_apicid is required to have processors listed 123 * bios_cpu_apicid is required to have processors listed
@@ -178,15 +142,11 @@ static void __init MP_bus_info (struct mpc_config_bus *m)
178 Dprintk("Bus #%d is %s\n", m->mpc_busid, str); 142 Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
179 143
180 if (strncmp(str, "ISA", 3) == 0) { 144 if (strncmp(str, "ISA", 3) == 0) {
181 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; 145 set_bit(m->mpc_busid, mp_bus_not_pci);
182 } else if (strncmp(str, "EISA", 4) == 0) {
183 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
184 } else if (strncmp(str, "PCI", 3) == 0) { 146 } else if (strncmp(str, "PCI", 3) == 0) {
185 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; 147 clear_bit(m->mpc_busid, mp_bus_not_pci);
186 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; 148 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
187 mp_current_pci_id++; 149 mp_current_pci_id++;
188 } else if (strncmp(str, "MCA", 3) == 0) {
189 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
190 } else { 150 } else {
191 printk(KERN_ERR "Unknown bustype %s\n", str); 151 printk(KERN_ERR "Unknown bustype %s\n", str);
192 } 152 }
@@ -197,8 +157,8 @@ static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
197 if (!(m->mpc_flags & MPC_APIC_USABLE)) 157 if (!(m->mpc_flags & MPC_APIC_USABLE))
198 return; 158 return;
199 159
200 printk("I/O APIC #%d Version %d at 0x%X.\n", 160 printk("I/O APIC #%d at 0x%X.\n",
201 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); 161 m->mpc_apicid, m->mpc_apicaddr);
202 if (nr_ioapics >= MAX_IO_APICS) { 162 if (nr_ioapics >= MAX_IO_APICS) {
203 printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n", 163 printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
204 MAX_IO_APICS, nr_ioapics); 164 MAX_IO_APICS, nr_ioapics);
@@ -232,19 +192,6 @@ static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
232 m->mpc_irqtype, m->mpc_irqflag & 3, 192 m->mpc_irqtype, m->mpc_irqflag & 3,
233 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, 193 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
234 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); 194 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
235 /*
236 * Well it seems all SMP boards in existence
237 * use ExtINT/LVT1 == LINT0 and
238 * NMI/LVT2 == LINT1 - the following check
239 * will show us if this assumptions is false.
240 * Until then we do not have to add baggage.
241 */
242 if ((m->mpc_irqtype == mp_ExtINT) &&
243 (m->mpc_destapiclint != 0))
244 BUG();
245 if ((m->mpc_irqtype == mp_NMI) &&
246 (m->mpc_destapiclint != 1))
247 BUG();
248} 195}
249 196
250/* 197/*
@@ -258,7 +205,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
258 unsigned char *mpt=((unsigned char *)mpc)+count; 205 unsigned char *mpt=((unsigned char *)mpc)+count;
259 206
260 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { 207 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
261 printk("SMP mptable: bad signature [%c%c%c%c]!\n", 208 printk("MPTABLE: bad signature [%c%c%c%c]!\n",
262 mpc->mpc_signature[0], 209 mpc->mpc_signature[0],
263 mpc->mpc_signature[1], 210 mpc->mpc_signature[1],
264 mpc->mpc_signature[2], 211 mpc->mpc_signature[2],
@@ -266,31 +213,31 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
266 return 0; 213 return 0;
267 } 214 }
268 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { 215 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
269 printk("SMP mptable: checksum error!\n"); 216 printk("MPTABLE: checksum error!\n");
270 return 0; 217 return 0;
271 } 218 }
272 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { 219 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
273 printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", 220 printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
274 mpc->mpc_spec); 221 mpc->mpc_spec);
275 return 0; 222 return 0;
276 } 223 }
277 if (!mpc->mpc_lapic) { 224 if (!mpc->mpc_lapic) {
278 printk(KERN_ERR "SMP mptable: null local APIC address!\n"); 225 printk(KERN_ERR "MPTABLE: null local APIC address!\n");
279 return 0; 226 return 0;
280 } 227 }
281 memcpy(str,mpc->mpc_oem,8); 228 memcpy(str,mpc->mpc_oem,8);
282 str[8]=0; 229 str[8] = 0;
283 printk(KERN_INFO "OEM ID: %s ",str); 230 printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
284 231
285 memcpy(str,mpc->mpc_productid,12); 232 memcpy(str,mpc->mpc_productid,12);
286 str[12]=0; 233 str[12] = 0;
287 printk("Product ID: %s ",str); 234 printk("MPTABLE: Product ID: %s ",str);
288 235
289 printk("APIC at: 0x%X\n",mpc->mpc_lapic); 236 printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
290 237
291 /* save the local APIC address, it might be non-default */ 238 /* save the local APIC address, it might be non-default */
292 if (!acpi_lapic) 239 if (!acpi_lapic)
293 mp_lapic_addr = mpc->mpc_lapic; 240 mp_lapic_addr = mpc->mpc_lapic;
294 241
295 /* 242 /*
296 * Now process the configuration blocks. 243 * Now process the configuration blocks.
@@ -302,7 +249,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
302 struct mpc_config_processor *m= 249 struct mpc_config_processor *m=
303 (struct mpc_config_processor *)mpt; 250 (struct mpc_config_processor *)mpt;
304 if (!acpi_lapic) 251 if (!acpi_lapic)
305 MP_processor_info(m); 252 MP_processor_info(m);
306 mpt += sizeof(*m); 253 mpt += sizeof(*m);
307 count += sizeof(*m); 254 count += sizeof(*m);
308 break; 255 break;
@@ -321,8 +268,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
321 struct mpc_config_ioapic *m= 268 struct mpc_config_ioapic *m=
322 (struct mpc_config_ioapic *)mpt; 269 (struct mpc_config_ioapic *)mpt;
323 MP_ioapic_info(m); 270 MP_ioapic_info(m);
324 mpt+=sizeof(*m); 271 mpt += sizeof(*m);
325 count+=sizeof(*m); 272 count += sizeof(*m);
326 break; 273 break;
327 } 274 }
328 case MP_INTSRC: 275 case MP_INTSRC:
@@ -331,8 +278,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
331 (struct mpc_config_intsrc *)mpt; 278 (struct mpc_config_intsrc *)mpt;
332 279
333 MP_intsrc_info(m); 280 MP_intsrc_info(m);
334 mpt+=sizeof(*m); 281 mpt += sizeof(*m);
335 count+=sizeof(*m); 282 count += sizeof(*m);
336 break; 283 break;
337 } 284 }
338 case MP_LINTSRC: 285 case MP_LINTSRC:
@@ -340,15 +287,15 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
340 struct mpc_config_lintsrc *m= 287 struct mpc_config_lintsrc *m=
341 (struct mpc_config_lintsrc *)mpt; 288 (struct mpc_config_lintsrc *)mpt;
342 MP_lintsrc_info(m); 289 MP_lintsrc_info(m);
343 mpt+=sizeof(*m); 290 mpt += sizeof(*m);
344 count+=sizeof(*m); 291 count += sizeof(*m);
345 break; 292 break;
346 } 293 }
347 } 294 }
348 } 295 }
349 clustered_apic_check(); 296 clustered_apic_check();
350 if (!num_processors) 297 if (!num_processors)
351 printk(KERN_ERR "SMP mptable: no processors registered!\n"); 298 printk(KERN_ERR "MPTABLE: no processors registered!\n");
352 return num_processors; 299 return num_processors;
353} 300}
354 301
@@ -444,13 +391,10 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
444 * 2 CPUs, numbered 0 & 1. 391 * 2 CPUs, numbered 0 & 1.
445 */ 392 */
446 processor.mpc_type = MP_PROCESSOR; 393 processor.mpc_type = MP_PROCESSOR;
447 /* Either an integrated APIC or a discrete 82489DX. */ 394 processor.mpc_apicver = 0;
448 processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
449 processor.mpc_cpuflag = CPU_ENABLED; 395 processor.mpc_cpuflag = CPU_ENABLED;
450 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | 396 processor.mpc_cpufeature = 0;
451 (boot_cpu_data.x86_model << 4) | 397 processor.mpc_featureflag = 0;
452 boot_cpu_data.x86_mask;
453 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
454 processor.mpc_reserved[0] = 0; 398 processor.mpc_reserved[0] = 0;
455 processor.mpc_reserved[1] = 0; 399 processor.mpc_reserved[1] = 0;
456 for (i = 0; i < 2; i++) { 400 for (i = 0; i < 2; i++) {
@@ -469,14 +413,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
469 case 5: 413 case 5:
470 memcpy(bus.mpc_bustype, "ISA ", 6); 414 memcpy(bus.mpc_bustype, "ISA ", 6);
471 break; 415 break;
472 case 2:
473 case 6:
474 case 3:
475 memcpy(bus.mpc_bustype, "EISA ", 6);
476 break;
477 case 4:
478 case 7:
479 memcpy(bus.mpc_bustype, "MCA ", 6);
480 } 416 }
481 MP_bus_info(&bus); 417 MP_bus_info(&bus);
482 if (mpc_default_type > 4) { 418 if (mpc_default_type > 4) {
@@ -487,7 +423,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
487 423
488 ioapic.mpc_type = MP_IOAPIC; 424 ioapic.mpc_type = MP_IOAPIC;
489 ioapic.mpc_apicid = 2; 425 ioapic.mpc_apicid = 2;
490 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; 426 ioapic.mpc_apicver = 0;
491 ioapic.mpc_flags = MPC_APIC_USABLE; 427 ioapic.mpc_flags = MPC_APIC_USABLE;
492 ioapic.mpc_apicaddr = 0xFEC00000; 428 ioapic.mpc_apicaddr = 0xFEC00000;
493 MP_ioapic_info(&ioapic); 429 MP_ioapic_info(&ioapic);
@@ -530,13 +466,6 @@ void __init get_smp_config (void)
530 printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); 466 printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
531 467
532 printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); 468 printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
533 if (mpf->mpf_feature2 & (1<<7)) {
534 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
535 pic_mode = 1;
536 } else {
537 printk(KERN_INFO " Virtual Wire compatibility mode.\n");
538 pic_mode = 0;
539 }
540 469
541 /* 470 /*
542 * Now see if we need to read further. 471 * Now see if we need to read further.
@@ -616,7 +545,7 @@ static int __init smp_scan_config (unsigned long base, unsigned long length)
616 return 0; 545 return 0;
617} 546}
618 547
619void __init find_intel_smp (void) 548void __init find_smp_config(void)
620{ 549{
621 unsigned int address; 550 unsigned int address;
622 551
@@ -633,9 +562,7 @@ void __init find_intel_smp (void)
633 smp_scan_config(0xF0000,0x10000)) 562 smp_scan_config(0xF0000,0x10000))
634 return; 563 return;
635 /* 564 /*
636 * If it is an SMP machine we should know now, unless the 565 * If it is an SMP machine we should know now.
637 * configuration is in an EISA/MCA bus machine with an
638 * extended bios data area.
639 * 566 *
640 * there is a real-mode segmented pointer pointing to the 567 * there is a real-mode segmented pointer pointing to the
641 * 4K EBDA area at 0x40E, calculate and scan it here. 568 * 4K EBDA area at 0x40E, calculate and scan it here.
@@ -656,69 +583,41 @@ void __init find_intel_smp (void)
656 printk(KERN_INFO "No mptable found.\n"); 583 printk(KERN_INFO "No mptable found.\n");
657} 584}
658 585
659/*
660 * - Intel MP Configuration Table
661 */
662void __init find_smp_config (void)
663{
664#ifdef CONFIG_X86_LOCAL_APIC
665 find_intel_smp();
666#endif
667}
668
669
670/* -------------------------------------------------------------------------- 586/* --------------------------------------------------------------------------
671 ACPI-based MP Configuration 587 ACPI-based MP Configuration
672 -------------------------------------------------------------------------- */ 588 -------------------------------------------------------------------------- */
673 589
674#ifdef CONFIG_ACPI 590#ifdef CONFIG_ACPI
675 591
676void __init mp_register_lapic_address ( 592void __init mp_register_lapic_address(u64 address)
677 u64 address)
678{ 593{
679 mp_lapic_addr = (unsigned long) address; 594 mp_lapic_addr = (unsigned long) address;
680
681 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); 595 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
682
683 if (boot_cpu_id == -1U) 596 if (boot_cpu_id == -1U)
684 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); 597 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
685
686 Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
687} 598}
688 599
689 600void __cpuinit mp_register_lapic (u8 id, u8 enabled)
690void __cpuinit mp_register_lapic (
691 u8 id,
692 u8 enabled)
693{ 601{
694 struct mpc_config_processor processor; 602 struct mpc_config_processor processor;
695 int boot_cpu = 0; 603 int boot_cpu = 0;
696 604
697 if (id >= MAX_APICS) { 605 if (id == boot_cpu_id)
698 printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
699 id, MAX_APICS);
700 return;
701 }
702
703 if (id == boot_cpu_physical_apicid)
704 boot_cpu = 1; 606 boot_cpu = 1;
705 607
706 processor.mpc_type = MP_PROCESSOR; 608 processor.mpc_type = MP_PROCESSOR;
707 processor.mpc_apicid = id; 609 processor.mpc_apicid = id;
708 processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); 610 processor.mpc_apicver = 0;
709 processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); 611 processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
710 processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); 612 processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
711 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | 613 processor.mpc_cpufeature = 0;
712 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; 614 processor.mpc_featureflag = 0;
713 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
714 processor.mpc_reserved[0] = 0; 615 processor.mpc_reserved[0] = 0;
715 processor.mpc_reserved[1] = 0; 616 processor.mpc_reserved[1] = 0;
716 617
717 MP_processor_info(&processor); 618 MP_processor_info(&processor);
718} 619}
719 620
720#ifdef CONFIG_X86_IO_APIC
721
722#define MP_ISA_BUS 0 621#define MP_ISA_BUS 0
723#define MP_MAX_IOAPIC_PIN 127 622#define MP_MAX_IOAPIC_PIN 127
724 623
@@ -729,11 +628,9 @@ static struct mp_ioapic_routing {
729 u32 pin_programmed[4]; 628 u32 pin_programmed[4];
730} mp_ioapic_routing[MAX_IO_APICS]; 629} mp_ioapic_routing[MAX_IO_APICS];
731 630
732 631static int mp_find_ioapic(int gsi)
733static int mp_find_ioapic (
734 int gsi)
735{ 632{
736 int i = 0; 633 int i = 0;
737 634
738 /* Find the IOAPIC that manages this GSI. */ 635 /* Find the IOAPIC that manages this GSI. */
739 for (i = 0; i < nr_ioapics; i++) { 636 for (i = 0; i < nr_ioapics; i++) {
@@ -743,17 +640,12 @@ static int mp_find_ioapic (
743 } 640 }
744 641
745 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); 642 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
746
747 return -1; 643 return -1;
748} 644}
749
750 645
751void __init mp_register_ioapic ( 646void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
752 u8 id,
753 u32 address,
754 u32 gsi_base)
755{ 647{
756 int idx = 0; 648 int idx = 0;
757 649
758 if (nr_ioapics >= MAX_IO_APICS) { 650 if (nr_ioapics >= MAX_IO_APICS) {
759 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " 651 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
@@ -774,7 +666,7 @@ void __init mp_register_ioapic (
774 666
775 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 667 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
776 mp_ioapics[idx].mpc_apicid = id; 668 mp_ioapics[idx].mpc_apicid = id;
777 mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); 669 mp_ioapics[idx].mpc_apicver = 0;
778 670
779 /* 671 /*
780 * Build basic IRQ lookup table to facilitate gsi->io_apic lookups 672 * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
@@ -785,21 +677,15 @@ void __init mp_register_ioapic (
785 mp_ioapic_routing[idx].gsi_end = gsi_base + 677 mp_ioapic_routing[idx].gsi_end = gsi_base +
786 io_apic_get_redir_entries(idx); 678 io_apic_get_redir_entries(idx);
787 679
788 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " 680 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
789 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 681 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
790 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, 682 mp_ioapics[idx].mpc_apicaddr,
791 mp_ioapic_routing[idx].gsi_start, 683 mp_ioapic_routing[idx].gsi_start,
792 mp_ioapic_routing[idx].gsi_end); 684 mp_ioapic_routing[idx].gsi_end);
793
794 return;
795} 685}
796 686
797 687void __init
798void __init mp_override_legacy_irq ( 688mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
799 u8 bus_irq,
800 u8 polarity,
801 u8 trigger,
802 u32 gsi)
803{ 689{
804 struct mpc_config_intsrc intsrc; 690 struct mpc_config_intsrc intsrc;
805 int ioapic = -1; 691 int ioapic = -1;
@@ -837,22 +723,18 @@ void __init mp_override_legacy_irq (
837 mp_irqs[mp_irq_entries] = intsrc; 723 mp_irqs[mp_irq_entries] = intsrc;
838 if (++mp_irq_entries == MAX_IRQ_SOURCES) 724 if (++mp_irq_entries == MAX_IRQ_SOURCES)
839 panic("Max # of irq sources exceeded!\n"); 725 panic("Max # of irq sources exceeded!\n");
840
841 return;
842} 726}
843 727
844 728void __init mp_config_acpi_legacy_irqs(void)
845void __init mp_config_acpi_legacy_irqs (void)
846{ 729{
847 struct mpc_config_intsrc intsrc; 730 struct mpc_config_intsrc intsrc;
848 int i = 0; 731 int i = 0;
849 int ioapic = -1; 732 int ioapic = -1;
850 733
851 /* 734 /*
852 * Fabricate the legacy ISA bus (bus #31). 735 * Fabricate the legacy ISA bus (bus #31).
853 */ 736 */
854 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; 737 set_bit(MP_ISA_BUS, mp_bus_not_pci);
855 Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
856 738
857 /* 739 /*
858 * Locate the IOAPIC that manages the ISA IRQs (0-15). 740 * Locate the IOAPIC that manages the ISA IRQs (0-15).
@@ -905,24 +787,22 @@ void __init mp_config_acpi_legacy_irqs (void)
905 if (++mp_irq_entries == MAX_IRQ_SOURCES) 787 if (++mp_irq_entries == MAX_IRQ_SOURCES)
906 panic("Max # of irq sources exceeded!\n"); 788 panic("Max # of irq sources exceeded!\n");
907 } 789 }
908
909 return;
910} 790}
911 791
912#define MAX_GSI_NUM 4096 792#define MAX_GSI_NUM 4096
913 793
914int mp_register_gsi(u32 gsi, int triggering, int polarity) 794int mp_register_gsi(u32 gsi, int triggering, int polarity)
915{ 795{
916 int ioapic = -1; 796 int ioapic = -1;
917 int ioapic_pin = 0; 797 int ioapic_pin = 0;
918 int idx, bit = 0; 798 int idx, bit = 0;
919 static int pci_irq = 16; 799 static int pci_irq = 16;
920 /* 800 /*
921 * Mapping between Global System Interrupts, which 801 * Mapping between Global System Interrupts, which
922 * represent all possible interrupts, to the IRQs 802 * represent all possible interrupts, to the IRQs
923 * assigned to actual devices. 803 * assigned to actual devices.
924 */ 804 */
925 static int gsi_to_irq[MAX_GSI_NUM]; 805 static int gsi_to_irq[MAX_GSI_NUM];
926 806
927 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) 807 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
928 return gsi; 808 return gsi;
@@ -996,6 +876,4 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
996 polarity == ACPI_ACTIVE_HIGH ? 0 : 1); 876 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
997 return gsi; 877 return gsi;
998} 878}
999
1000#endif /*CONFIG_X86_IO_APIC*/
1001#endif /*CONFIG_ACPI*/ 879#endif /*CONFIG_ACPI*/
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 5baa0c726e97..4d6fb047952e 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -28,71 +28,138 @@
28#include <asm/mce.h> 28#include <asm/mce.h>
29#include <asm/intel_arch_perfmon.h> 29#include <asm/intel_arch_perfmon.h>
30 30
31/* 31/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
32 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: 32 * evtsel_nmi_owner tracks the ownership of the event selection
33 * - it may be reserved by some other driver, or not 33 * - different performance counters/ event selection may be reserved for
34 * - when not reserved by some other driver, it may be used for 34 * different subsystems this reservation system just tries to coordinate
35 * the NMI watchdog, or not 35 * things a little
36 *
37 * This is maintained separately from nmi_active because the NMI
38 * watchdog may also be driven from the I/O APIC timer.
39 */ 36 */
40static DEFINE_SPINLOCK(lapic_nmi_owner_lock); 37static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner);
41static unsigned int lapic_nmi_owner; 38static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]);
42#define LAPIC_NMI_WATCHDOG (1<<0) 39
43#define LAPIC_NMI_RESERVED (1<<1) 40/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
41 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
42 */
43#define NMI_MAX_COUNTER_BITS 66
44 44
45/* nmi_active: 45/* nmi_active:
46 * +1: the lapic NMI watchdog is active, but can be disabled 46 * >0: the lapic NMI watchdog is active, but can be disabled
47 * 0: the lapic NMI watchdog has not been set up, and cannot 47 * <0: the lapic NMI watchdog has not been set up, and cannot
48 * be enabled 48 * be enabled
49 * -1: the lapic NMI watchdog is disabled, but can be enabled 49 * 0: the lapic NMI watchdog is disabled, but can be enabled
50 */ 50 */
51int nmi_active; /* oprofile uses this */ 51atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
52int panic_on_timeout; 52int panic_on_timeout;
53 53
54unsigned int nmi_watchdog = NMI_DEFAULT; 54unsigned int nmi_watchdog = NMI_DEFAULT;
55static unsigned int nmi_hz = HZ; 55static unsigned int nmi_hz = HZ;
56static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
57static unsigned int nmi_p4_cccr_val;
58 56
59/* Note that these events don't tick when the CPU idles. This means 57struct nmi_watchdog_ctlblk {
60 the frequency varies with CPU load. */ 58 int enabled;
59 u64 check_bit;
60 unsigned int cccr_msr;
61 unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
62 unsigned int evntsel_msr; /* the MSR to select the events to handle */
63};
64static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
61 65
62#define K7_EVNTSEL_ENABLE (1 << 22) 66/* local prototypes */
63#define K7_EVNTSEL_INT (1 << 20) 67static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
64#define K7_EVNTSEL_OS (1 << 17)
65#define K7_EVNTSEL_USR (1 << 16)
66#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
67#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
68 68
69#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 69/* converts an msr to an appropriate reservation bit */
70#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK 70static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
71{
72 /* returns the bit offset of the performance counter register */
73 switch (boot_cpu_data.x86_vendor) {
74 case X86_VENDOR_AMD:
75 return (msr - MSR_K7_PERFCTR0);
76 case X86_VENDOR_INTEL:
77 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
78 return (msr - MSR_ARCH_PERFMON_PERFCTR0);
79 else
80 return (msr - MSR_P4_BPU_PERFCTR0);
81 }
82 return 0;
83}
71 84
72#define MSR_P4_MISC_ENABLE 0x1A0 85/* converts an msr to an appropriate reservation bit */
73#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) 86static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
74#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) 87{
75#define MSR_P4_PERFCTR0 0x300 88 /* returns the bit offset of the event selection register */
76#define MSR_P4_CCCR0 0x360 89 switch (boot_cpu_data.x86_vendor) {
77#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) 90 case X86_VENDOR_AMD:
78#define P4_ESCR_OS (1<<3) 91 return (msr - MSR_K7_EVNTSEL0);
79#define P4_ESCR_USR (1<<2) 92 case X86_VENDOR_INTEL:
80#define P4_CCCR_OVF_PMI0 (1<<26) 93 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
81#define P4_CCCR_OVF_PMI1 (1<<27) 94 return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
82#define P4_CCCR_THRESHOLD(N) ((N)<<20) 95 else
83#define P4_CCCR_COMPLEMENT (1<<19) 96 return (msr - MSR_P4_BSU_ESCR0);
84#define P4_CCCR_COMPARE (1<<18) 97 }
85#define P4_CCCR_REQUIRED (3<<16) 98 return 0;
86#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) 99}
87#define P4_CCCR_ENABLE (1<<12) 100
88/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter 101/* checks for a bit availability (hack for oprofile) */
89 CRU_ESCR0 (with any non-null event selector) through a complemented 102int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
90 max threshold. [IA32-Vol3, Section 14.9.9] */ 103{
91#define MSR_P4_IQ_COUNTER0 0x30C 104 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
92#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) 105
93#define P4_NMI_IQ_CCCR0 \ 106 return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
94 (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ 107}
95 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) 108
109/* checks the an msr for availability */
110int avail_to_resrv_perfctr_nmi(unsigned int msr)
111{
112 unsigned int counter;
113
114 counter = nmi_perfctr_msr_to_bit(msr);
115 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
116
117 return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
118}
119
120int reserve_perfctr_nmi(unsigned int msr)
121{
122 unsigned int counter;
123
124 counter = nmi_perfctr_msr_to_bit(msr);
125 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
126
127 if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
128 return 1;
129 return 0;
130}
131
132void release_perfctr_nmi(unsigned int msr)
133{
134 unsigned int counter;
135
136 counter = nmi_perfctr_msr_to_bit(msr);
137 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
138
139 clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
140}
141
142int reserve_evntsel_nmi(unsigned int msr)
143{
144 unsigned int counter;
145
146 counter = nmi_evntsel_msr_to_bit(msr);
147 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
148
149 if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)))
150 return 1;
151 return 0;
152}
153
154void release_evntsel_nmi(unsigned int msr)
155{
156 unsigned int counter;
157
158 counter = nmi_evntsel_msr_to_bit(msr);
159 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
160
161 clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner));
162}
96 163
97static __cpuinit inline int nmi_known_cpu(void) 164static __cpuinit inline int nmi_known_cpu(void)
98{ 165{
@@ -109,7 +176,7 @@ static __cpuinit inline int nmi_known_cpu(void)
109} 176}
110 177
111/* Run after command line and cpu_init init, but before all other checks */ 178/* Run after command line and cpu_init init, but before all other checks */
112void __cpuinit nmi_watchdog_default(void) 179void nmi_watchdog_default(void)
113{ 180{
114 if (nmi_watchdog != NMI_DEFAULT) 181 if (nmi_watchdog != NMI_DEFAULT)
115 return; 182 return;
@@ -145,6 +212,12 @@ int __init check_nmi_watchdog (void)
145 int *counts; 212 int *counts;
146 int cpu; 213 int cpu;
147 214
215 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
216 return 0;
217
218 if (!atomic_read(&nmi_active))
219 return 0;
220
148 counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); 221 counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
149 if (!counts) 222 if (!counts)
150 return -1; 223 return -1;
@@ -162,26 +235,43 @@ int __init check_nmi_watchdog (void)
162 mdelay((10*1000)/nmi_hz); // wait 10 ticks 235 mdelay((10*1000)/nmi_hz); // wait 10 ticks
163 236
164 for_each_online_cpu(cpu) { 237 for_each_online_cpu(cpu) {
238 if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
239 continue;
165 if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { 240 if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
166 endflag = 1;
167 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", 241 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
168 cpu, 242 cpu,
169 counts[cpu], 243 counts[cpu],
170 cpu_pda(cpu)->__nmi_count); 244 cpu_pda(cpu)->__nmi_count);
171 nmi_active = 0; 245 per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
172 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; 246 atomic_dec(&nmi_active);
173 nmi_perfctr_msr = 0;
174 kfree(counts);
175 return -1;
176 } 247 }
177 } 248 }
249 if (!atomic_read(&nmi_active)) {
250 kfree(counts);
251 atomic_set(&nmi_active, -1);
252 return -1;
253 }
178 endflag = 1; 254 endflag = 1;
179 printk("OK.\n"); 255 printk("OK.\n");
180 256
181 /* now that we know it works we can reduce NMI frequency to 257 /* now that we know it works we can reduce NMI frequency to
182 something more reasonable; makes a difference in some configs */ 258 something more reasonable; makes a difference in some configs */
183 if (nmi_watchdog == NMI_LOCAL_APIC) 259 if (nmi_watchdog == NMI_LOCAL_APIC) {
260 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
261
184 nmi_hz = 1; 262 nmi_hz = 1;
263 /*
264 * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
265 * are writable, with higher bits sign extending from bit 31.
266 * So, we can only program the counter with 31 bit values and
267 * 32nd bit should be 1, for 33.. to be 1.
268 * Find the appropriate nmi_hz
269 */
270 if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
271 ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
272 nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1;
273 }
274 }
185 275
186 kfree(counts); 276 kfree(counts);
187 return 0; 277 return 0;
@@ -201,91 +291,65 @@ int __init setup_nmi_watchdog(char *str)
201 291
202 get_option(&str, &nmi); 292 get_option(&str, &nmi);
203 293
204 if (nmi >= NMI_INVALID) 294 if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
205 return 0; 295 return 0;
296
297 if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0))
298 return 0; /* no lapic support */
206 nmi_watchdog = nmi; 299 nmi_watchdog = nmi;
207 return 1; 300 return 1;
208} 301}
209 302
210__setup("nmi_watchdog=", setup_nmi_watchdog); 303__setup("nmi_watchdog=", setup_nmi_watchdog);
211 304
212static void disable_intel_arch_watchdog(void);
213
214static void disable_lapic_nmi_watchdog(void) 305static void disable_lapic_nmi_watchdog(void)
215{ 306{
216 if (nmi_active <= 0) 307 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
308
309 if (atomic_read(&nmi_active) <= 0)
217 return; 310 return;
218 switch (boot_cpu_data.x86_vendor) {
219 case X86_VENDOR_AMD:
220 wrmsr(MSR_K7_EVNTSEL0, 0, 0);
221 break;
222 case X86_VENDOR_INTEL:
223 if (boot_cpu_data.x86 == 15) {
224 wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
225 wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
226 } else if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
227 disable_intel_arch_watchdog();
228 }
229 break;
230 }
231 nmi_active = -1;
232 /* tell do_nmi() and others that we're not active any more */
233 nmi_watchdog = 0;
234}
235 311
236static void enable_lapic_nmi_watchdog(void) 312 on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
237{ 313
238 if (nmi_active < 0) { 314 BUG_ON(atomic_read(&nmi_active) != 0);
239 nmi_watchdog = NMI_LOCAL_APIC;
240 touch_nmi_watchdog();
241 setup_apic_nmi_watchdog();
242 }
243} 315}
244 316
245int reserve_lapic_nmi(void) 317static void enable_lapic_nmi_watchdog(void)
246{ 318{
247 unsigned int old_owner; 319 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
248 320
249 spin_lock(&lapic_nmi_owner_lock); 321 /* are we already enabled */
250 old_owner = lapic_nmi_owner; 322 if (atomic_read(&nmi_active) != 0)
251 lapic_nmi_owner |= LAPIC_NMI_RESERVED; 323 return;
252 spin_unlock(&lapic_nmi_owner_lock);
253 if (old_owner & LAPIC_NMI_RESERVED)
254 return -EBUSY;
255 if (old_owner & LAPIC_NMI_WATCHDOG)
256 disable_lapic_nmi_watchdog();
257 return 0;
258}
259 324
260void release_lapic_nmi(void) 325 /* are we lapic aware */
261{ 326 if (nmi_known_cpu() <= 0)
262 unsigned int new_owner; 327 return;
263 328
264 spin_lock(&lapic_nmi_owner_lock); 329 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
265 new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; 330 touch_nmi_watchdog();
266 lapic_nmi_owner = new_owner;
267 spin_unlock(&lapic_nmi_owner_lock);
268 if (new_owner & LAPIC_NMI_WATCHDOG)
269 enable_lapic_nmi_watchdog();
270} 331}
271 332
272void disable_timer_nmi_watchdog(void) 333void disable_timer_nmi_watchdog(void)
273{ 334{
274 if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0)) 335 BUG_ON(nmi_watchdog != NMI_IO_APIC);
336
337 if (atomic_read(&nmi_active) <= 0)
275 return; 338 return;
276 339
277 disable_irq(0); 340 disable_irq(0);
278 unset_nmi_callback(); 341 on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
279 nmi_active = -1; 342
280 nmi_watchdog = NMI_NONE; 343 BUG_ON(atomic_read(&nmi_active) != 0);
281} 344}
282 345
283void enable_timer_nmi_watchdog(void) 346void enable_timer_nmi_watchdog(void)
284{ 347{
285 if (nmi_active < 0) { 348 BUG_ON(nmi_watchdog != NMI_IO_APIC);
286 nmi_watchdog = NMI_IO_APIC; 349
350 if (atomic_read(&nmi_active) == 0) {
287 touch_nmi_watchdog(); 351 touch_nmi_watchdog();
288 nmi_active = 1; 352 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
289 enable_irq(0); 353 enable_irq(0);
290 } 354 }
291} 355}
@@ -296,15 +360,20 @@ static int nmi_pm_active; /* nmi_active before suspend */
296 360
297static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) 361static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
298{ 362{
299 nmi_pm_active = nmi_active; 363 /* only CPU0 goes here, other CPUs should be offline */
300 disable_lapic_nmi_watchdog(); 364 nmi_pm_active = atomic_read(&nmi_active);
365 stop_apic_nmi_watchdog(NULL);
366 BUG_ON(atomic_read(&nmi_active) != 0);
301 return 0; 367 return 0;
302} 368}
303 369
304static int lapic_nmi_resume(struct sys_device *dev) 370static int lapic_nmi_resume(struct sys_device *dev)
305{ 371{
306 if (nmi_pm_active > 0) 372 /* only CPU0 goes here, other CPUs should be offline */
307 enable_lapic_nmi_watchdog(); 373 if (nmi_pm_active > 0) {
374 setup_apic_nmi_watchdog(NULL);
375 touch_nmi_watchdog();
376 }
308 return 0; 377 return 0;
309} 378}
310 379
@@ -323,7 +392,13 @@ static int __init init_lapic_nmi_sysfs(void)
323{ 392{
324 int error; 393 int error;
325 394
326 if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC) 395 /* should really be a BUG_ON but b/c this is an
396 * init call, it just doesn't work. -dcz
397 */
398 if (nmi_watchdog != NMI_LOCAL_APIC)
399 return 0;
400
401 if ( atomic_read(&nmi_active) < 0 )
327 return 0; 402 return 0;
328 403
329 error = sysdev_class_register(&nmi_sysclass); 404 error = sysdev_class_register(&nmi_sysclass);
@@ -341,74 +416,209 @@ late_initcall(init_lapic_nmi_sysfs);
341 * Original code written by Keith Owens. 416 * Original code written by Keith Owens.
342 */ 417 */
343 418
344static void clear_msr_range(unsigned int base, unsigned int n) 419/* Note that these events don't tick when the CPU idles. This means
345{ 420 the frequency varies with CPU load. */
346 unsigned int i;
347 421
348 for(i = 0; i < n; ++i) 422#define K7_EVNTSEL_ENABLE (1 << 22)
349 wrmsr(base+i, 0, 0); 423#define K7_EVNTSEL_INT (1 << 20)
350} 424#define K7_EVNTSEL_OS (1 << 17)
425#define K7_EVNTSEL_USR (1 << 16)
426#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
427#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
351 428
352static void setup_k7_watchdog(void) 429static int setup_k7_watchdog(void)
353{ 430{
354 int i; 431 unsigned int perfctr_msr, evntsel_msr;
355 unsigned int evntsel; 432 unsigned int evntsel;
433 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
356 434
357 nmi_perfctr_msr = MSR_K7_PERFCTR0; 435 perfctr_msr = MSR_K7_PERFCTR0;
436 evntsel_msr = MSR_K7_EVNTSEL0;
437 if (!reserve_perfctr_nmi(perfctr_msr))
438 goto fail;
358 439
359 for(i = 0; i < 4; ++i) { 440 if (!reserve_evntsel_nmi(evntsel_msr))
360 /* Simulator may not support it */ 441 goto fail1;
361 if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) { 442
362 nmi_perfctr_msr = 0; 443 /* Simulator may not support it */
363 return; 444 if (checking_wrmsrl(evntsel_msr, 0UL))
364 } 445 goto fail2;
365 wrmsrl(MSR_K7_PERFCTR0+i, 0UL); 446 wrmsrl(perfctr_msr, 0UL);
366 }
367 447
368 evntsel = K7_EVNTSEL_INT 448 evntsel = K7_EVNTSEL_INT
369 | K7_EVNTSEL_OS 449 | K7_EVNTSEL_OS
370 | K7_EVNTSEL_USR 450 | K7_EVNTSEL_USR
371 | K7_NMI_EVENT; 451 | K7_NMI_EVENT;
372 452
373 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); 453 /* setup the timer */
374 wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz)); 454 wrmsr(evntsel_msr, evntsel, 0);
455 wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
375 apic_write(APIC_LVTPC, APIC_DM_NMI); 456 apic_write(APIC_LVTPC, APIC_DM_NMI);
376 evntsel |= K7_EVNTSEL_ENABLE; 457 evntsel |= K7_EVNTSEL_ENABLE;
377 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); 458 wrmsr(evntsel_msr, evntsel, 0);
459
460 wd->perfctr_msr = perfctr_msr;
461 wd->evntsel_msr = evntsel_msr;
462 wd->cccr_msr = 0; //unused
463 wd->check_bit = 1ULL<<63;
464 return 1;
465fail2:
466 release_evntsel_nmi(evntsel_msr);
467fail1:
468 release_perfctr_nmi(perfctr_msr);
469fail:
470 return 0;
378} 471}
379 472
380static void disable_intel_arch_watchdog(void) 473static void stop_k7_watchdog(void)
381{ 474{
382 unsigned ebx; 475 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
383 476
384 /* 477 wrmsr(wd->evntsel_msr, 0, 0);
385 * Check whether the Architectural PerfMon supports 478
386 * Unhalted Core Cycles Event or not. 479 release_evntsel_nmi(wd->evntsel_msr);
387 * NOTE: Corresponding bit = 0 in ebp indicates event present. 480 release_perfctr_nmi(wd->perfctr_msr);
481}
482
483/* Note that these events don't tick when the CPU idles. This means
484 the frequency varies with CPU load. */
485
486#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
487#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
488#define P4_ESCR_OS (1<<3)
489#define P4_ESCR_USR (1<<2)
490#define P4_CCCR_OVF_PMI0 (1<<26)
491#define P4_CCCR_OVF_PMI1 (1<<27)
492#define P4_CCCR_THRESHOLD(N) ((N)<<20)
493#define P4_CCCR_COMPLEMENT (1<<19)
494#define P4_CCCR_COMPARE (1<<18)
495#define P4_CCCR_REQUIRED (3<<16)
496#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
497#define P4_CCCR_ENABLE (1<<12)
498#define P4_CCCR_OVF (1<<31)
499/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
500 CRU_ESCR0 (with any non-null event selector) through a complemented
501 max threshold. [IA32-Vol3, Section 14.9.9] */
502
503static int setup_p4_watchdog(void)
504{
505 unsigned int perfctr_msr, evntsel_msr, cccr_msr;
506 unsigned int evntsel, cccr_val;
507 unsigned int misc_enable, dummy;
508 unsigned int ht_num;
509 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
510
511 rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
512 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
513 return 0;
514
515#ifdef CONFIG_SMP
516 /* detect which hyperthread we are on */
517 if (smp_num_siblings == 2) {
518 unsigned int ebx, apicid;
519
520 ebx = cpuid_ebx(1);
521 apicid = (ebx >> 24) & 0xff;
522 ht_num = apicid & 1;
523 } else
524#endif
525 ht_num = 0;
526
527 /* performance counters are shared resources
528 * assign each hyperthread its own set
529 * (re-use the ESCR0 register, seems safe
530 * and keeps the cccr_val the same)
388 */ 531 */
389 ebx = cpuid_ebx(10); 532 if (!ht_num) {
390 if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) 533 /* logical cpu 0 */
391 wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0); 534 perfctr_msr = MSR_P4_IQ_PERFCTR0;
535 evntsel_msr = MSR_P4_CRU_ESCR0;
536 cccr_msr = MSR_P4_IQ_CCCR0;
537 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
538 } else {
539 /* logical cpu 1 */
540 perfctr_msr = MSR_P4_IQ_PERFCTR1;
541 evntsel_msr = MSR_P4_CRU_ESCR0;
542 cccr_msr = MSR_P4_IQ_CCCR1;
543 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
544 }
545
546 if (!reserve_perfctr_nmi(perfctr_msr))
547 goto fail;
548
549 if (!reserve_evntsel_nmi(evntsel_msr))
550 goto fail1;
551
552 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
553 | P4_ESCR_OS
554 | P4_ESCR_USR;
555
556 cccr_val |= P4_CCCR_THRESHOLD(15)
557 | P4_CCCR_COMPLEMENT
558 | P4_CCCR_COMPARE
559 | P4_CCCR_REQUIRED;
560
561 wrmsr(evntsel_msr, evntsel, 0);
562 wrmsr(cccr_msr, cccr_val, 0);
563 wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
564 apic_write(APIC_LVTPC, APIC_DM_NMI);
565 cccr_val |= P4_CCCR_ENABLE;
566 wrmsr(cccr_msr, cccr_val, 0);
567
568 wd->perfctr_msr = perfctr_msr;
569 wd->evntsel_msr = evntsel_msr;
570 wd->cccr_msr = cccr_msr;
571 wd->check_bit = 1ULL<<39;
572 return 1;
573fail1:
574 release_perfctr_nmi(perfctr_msr);
575fail:
576 return 0;
577}
578
579static void stop_p4_watchdog(void)
580{
581 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
582
583 wrmsr(wd->cccr_msr, 0, 0);
584 wrmsr(wd->evntsel_msr, 0, 0);
585
586 release_evntsel_nmi(wd->evntsel_msr);
587 release_perfctr_nmi(wd->perfctr_msr);
392} 588}
393 589
590#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
591#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
592
394static int setup_intel_arch_watchdog(void) 593static int setup_intel_arch_watchdog(void)
395{ 594{
595 unsigned int ebx;
596 union cpuid10_eax eax;
597 unsigned int unused;
598 unsigned int perfctr_msr, evntsel_msr;
396 unsigned int evntsel; 599 unsigned int evntsel;
397 unsigned ebx; 600 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
398 601
399 /* 602 /*
400 * Check whether the Architectural PerfMon supports 603 * Check whether the Architectural PerfMon supports
401 * Unhalted Core Cycles Event or not. 604 * Unhalted Core Cycles Event or not.
402 * NOTE: Corresponding bit = 0 in ebp indicates event present. 605 * NOTE: Corresponding bit = 0 in ebx indicates event present.
403 */ 606 */
404 ebx = cpuid_ebx(10); 607 cpuid(10, &(eax.full), &ebx, &unused, &unused);
405 if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) 608 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
406 return 0; 609 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
610 goto fail;
611
612 perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
613 evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
407 614
408 nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; 615 if (!reserve_perfctr_nmi(perfctr_msr))
616 goto fail;
409 617
410 clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2); 618 if (!reserve_evntsel_nmi(evntsel_msr))
411 clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2); 619 goto fail1;
620
621 wrmsrl(perfctr_msr, 0UL);
412 622
413 evntsel = ARCH_PERFMON_EVENTSEL_INT 623 evntsel = ARCH_PERFMON_EVENTSEL_INT
414 | ARCH_PERFMON_EVENTSEL_OS 624 | ARCH_PERFMON_EVENTSEL_OS
@@ -416,84 +626,122 @@ static int setup_intel_arch_watchdog(void)
416 | ARCH_PERFMON_NMI_EVENT_SEL 626 | ARCH_PERFMON_NMI_EVENT_SEL
417 | ARCH_PERFMON_NMI_EVENT_UMASK; 627 | ARCH_PERFMON_NMI_EVENT_UMASK;
418 628
419 wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); 629 /* setup the timer */
420 wrmsrl(MSR_ARCH_PERFMON_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz)); 630 wrmsr(evntsel_msr, evntsel, 0);
631 wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
632
421 apic_write(APIC_LVTPC, APIC_DM_NMI); 633 apic_write(APIC_LVTPC, APIC_DM_NMI);
422 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; 634 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
423 wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); 635 wrmsr(evntsel_msr, evntsel, 0);
636
637 wd->perfctr_msr = perfctr_msr;
638 wd->evntsel_msr = evntsel_msr;
639 wd->cccr_msr = 0; //unused
640 wd->check_bit = 1ULL << (eax.split.bit_width - 1);
424 return 1; 641 return 1;
642fail1:
643 release_perfctr_nmi(perfctr_msr);
644fail:
645 return 0;
425} 646}
426 647
427 648static void stop_intel_arch_watchdog(void)
428static int setup_p4_watchdog(void)
429{ 649{
430 unsigned int misc_enable, dummy; 650 unsigned int ebx;
651 union cpuid10_eax eax;
652 unsigned int unused;
653 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
431 654
432 rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy); 655 /*
433 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) 656 * Check whether the Architectural PerfMon supports
434 return 0; 657 * Unhalted Core Cycles Event or not.
658 * NOTE: Corresponding bit = 0 in ebx indicates event present.
659 */
660 cpuid(10, &(eax.full), &ebx, &unused, &unused);
661 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
662 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
663 return;
435 664
436 nmi_perfctr_msr = MSR_P4_IQ_COUNTER0; 665 wrmsr(wd->evntsel_msr, 0, 0);
437 nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
438#ifdef CONFIG_SMP
439 if (smp_num_siblings == 2)
440 nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
441#endif
442 666
443 if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL)) 667 release_evntsel_nmi(wd->evntsel_msr);
444 clear_msr_range(0x3F1, 2); 668 release_perfctr_nmi(wd->perfctr_msr);
445 /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
446 docs doesn't fully define it, so leave it alone for now. */
447 if (boot_cpu_data.x86_model >= 0x3) {
448 /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
449 clear_msr_range(0x3A0, 26);
450 clear_msr_range(0x3BC, 3);
451 } else {
452 clear_msr_range(0x3A0, 31);
453 }
454 clear_msr_range(0x3C0, 6);
455 clear_msr_range(0x3C8, 6);
456 clear_msr_range(0x3E0, 2);
457 clear_msr_range(MSR_P4_CCCR0, 18);
458 clear_msr_range(MSR_P4_PERFCTR0, 18);
459
460 wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
461 wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
462 Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz * 1000UL / nmi_hz));
463 wrmsrl(MSR_P4_IQ_COUNTER0, -((u64)cpu_khz * 1000 / nmi_hz));
464 apic_write(APIC_LVTPC, APIC_DM_NMI);
465 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
466 return 1;
467} 669}
468 670
469void setup_apic_nmi_watchdog(void) 671void setup_apic_nmi_watchdog(void *unused)
470{ 672{
471 switch (boot_cpu_data.x86_vendor) { 673 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
472 case X86_VENDOR_AMD: 674
473 if (boot_cpu_data.x86 != 15) 675 /* only support LOCAL and IO APICs for now */
474 return; 676 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
475 if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) 677 (nmi_watchdog != NMI_IO_APIC))
476 return; 678 return;
477 setup_k7_watchdog(); 679
478 break; 680 if (wd->enabled == 1)
479 case X86_VENDOR_INTEL: 681 return;
480 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { 682
481 if (!setup_intel_arch_watchdog()) 683 /* cheap hack to support suspend/resume */
684 /* if cpu0 is not active neither should the other cpus */
685 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
686 return;
687
688 if (nmi_watchdog == NMI_LOCAL_APIC) {
689 switch (boot_cpu_data.x86_vendor) {
690 case X86_VENDOR_AMD:
691 if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
482 return; 692 return;
483 } else if (boot_cpu_data.x86 == 15) { 693 if (!setup_k7_watchdog())
694 return;
695 break;
696 case X86_VENDOR_INTEL:
697 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
698 if (!setup_intel_arch_watchdog())
699 return;
700 break;
701 }
484 if (!setup_p4_watchdog()) 702 if (!setup_p4_watchdog())
485 return; 703 return;
486 } else { 704 break;
705 default:
487 return; 706 return;
488 } 707 }
708 }
709 wd->enabled = 1;
710 atomic_inc(&nmi_active);
711}
712
713void stop_apic_nmi_watchdog(void *unused)
714{
715 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
489 716
490 break; 717 /* only support LOCAL and IO APICs for now */
718 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
719 (nmi_watchdog != NMI_IO_APIC))
720 return;
491 721
492 default: 722 if (wd->enabled == 0)
493 return; 723 return;
724
725 if (nmi_watchdog == NMI_LOCAL_APIC) {
726 switch (boot_cpu_data.x86_vendor) {
727 case X86_VENDOR_AMD:
728 if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
729 return;
730 stop_k7_watchdog();
731 break;
732 case X86_VENDOR_INTEL:
733 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
734 stop_intel_arch_watchdog();
735 break;
736 }
737 stop_p4_watchdog();
738 break;
739 default:
740 return;
741 }
494 } 742 }
495 lapic_nmi_owner = LAPIC_NMI_WATCHDOG; 743 wd->enabled = 0;
496 nmi_active = 1; 744 atomic_dec(&nmi_active);
497} 745}
498 746
499/* 747/*
@@ -526,93 +774,109 @@ void touch_nmi_watchdog (void)
526 touch_softlockup_watchdog(); 774 touch_softlockup_watchdog();
527} 775}
528 776
529void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) 777int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
530{ 778{
531 int sum; 779 int sum;
532 int touched = 0; 780 int touched = 0;
781 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
782 u64 dummy;
783 int rc=0;
784
785 /* check for other users first */
786 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
787 == NOTIFY_STOP) {
788 rc = 1;
789 touched = 1;
790 }
533 791
534 sum = read_pda(apic_timer_irqs); 792 sum = read_pda(apic_timer_irqs);
535 if (__get_cpu_var(nmi_touch)) { 793 if (__get_cpu_var(nmi_touch)) {
536 __get_cpu_var(nmi_touch) = 0; 794 __get_cpu_var(nmi_touch) = 0;
537 touched = 1; 795 touched = 1;
538 } 796 }
797
539#ifdef CONFIG_X86_MCE 798#ifdef CONFIG_X86_MCE
540 /* Could check oops_in_progress here too, but it's safer 799 /* Could check oops_in_progress here too, but it's safer
541 not too */ 800 not too */
542 if (atomic_read(&mce_entry) > 0) 801 if (atomic_read(&mce_entry) > 0)
543 touched = 1; 802 touched = 1;
544#endif 803#endif
804 /* if the apic timer isn't firing, this cpu isn't doing much */
545 if (!touched && __get_cpu_var(last_irq_sum) == sum) { 805 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
546 /* 806 /*
547 * Ayiee, looks like this CPU is stuck ... 807 * Ayiee, looks like this CPU is stuck ...
548 * wait a few IRQs (5 seconds) before doing the oops ... 808 * wait a few IRQs (5 seconds) before doing the oops ...
549 */ 809 */
550 local_inc(&__get_cpu_var(alert_counter)); 810 local_inc(&__get_cpu_var(alert_counter));
551 if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { 811 if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
552 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 812 die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs,
553 == NOTIFY_STOP) { 813 panic_on_timeout);
554 local_set(&__get_cpu_var(alert_counter), 0);
555 return;
556 }
557 die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs);
558 }
559 } else { 814 } else {
560 __get_cpu_var(last_irq_sum) = sum; 815 __get_cpu_var(last_irq_sum) = sum;
561 local_set(&__get_cpu_var(alert_counter), 0); 816 local_set(&__get_cpu_var(alert_counter), 0);
562 } 817 }
563 if (nmi_perfctr_msr) { 818
564 if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) { 819 /* see if the nmi watchdog went off */
565 /* 820 if (wd->enabled) {
566 * P4 quirks: 821 if (nmi_watchdog == NMI_LOCAL_APIC) {
567 * - An overflown perfctr will assert its interrupt 822 rdmsrl(wd->perfctr_msr, dummy);
568 * until the OVF flag in its CCCR is cleared. 823 if (dummy & wd->check_bit){
569 * - LVTPC is masked on interrupt and must be 824 /* this wasn't a watchdog timer interrupt */
570 * unmasked by the LVTPC handler. 825 goto done;
571 */ 826 }
572 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); 827
573 apic_write(APIC_LVTPC, APIC_DM_NMI); 828 /* only Intel uses the cccr msr */
574 } else if (nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { 829 if (wd->cccr_msr != 0) {
575 /* 830 /*
576 * For Intel based architectural perfmon 831 * P4 quirks:
577 * - LVTPC is masked on interrupt and must be 832 * - An overflown perfctr will assert its interrupt
578 * unmasked by the LVTPC handler. 833 * until the OVF flag in its CCCR is cleared.
834 * - LVTPC is masked on interrupt and must be
835 * unmasked by the LVTPC handler.
836 */
837 rdmsrl(wd->cccr_msr, dummy);
838 dummy &= ~P4_CCCR_OVF;
839 wrmsrl(wd->cccr_msr, dummy);
840 apic_write(APIC_LVTPC, APIC_DM_NMI);
841 } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
842 /*
843 * ArchPerfom/Core Duo needs to re-unmask
844 * the apic vector
845 */
846 apic_write(APIC_LVTPC, APIC_DM_NMI);
847 }
848 /* start the cycle over again */
849 wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
850 rc = 1;
851 } else if (nmi_watchdog == NMI_IO_APIC) {
852 /* don't know how to accurately check for this.
853 * just assume it was a watchdog timer interrupt
854 * This matches the old behaviour.
579 */ 855 */
580 apic_write(APIC_LVTPC, APIC_DM_NMI); 856 rc = 1;
581 } 857 } else
582 wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); 858 printk(KERN_WARNING "Unknown enabled NMI hardware?!\n");
583 } 859 }
860done:
861 return rc;
584} 862}
585 863
586static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu)
587{
588 return 0;
589}
590
591static nmi_callback_t nmi_callback = dummy_nmi_callback;
592
593asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) 864asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
594{ 865{
595 int cpu = safe_smp_processor_id();
596
597 nmi_enter(); 866 nmi_enter();
598 add_pda(__nmi_count,1); 867 add_pda(__nmi_count,1);
599 if (!rcu_dereference(nmi_callback)(regs, cpu)) 868 default_do_nmi(regs);
600 default_do_nmi(regs);
601 nmi_exit(); 869 nmi_exit();
602} 870}
603 871
604void set_nmi_callback(nmi_callback_t callback) 872int do_nmi_callback(struct pt_regs * regs, int cpu)
605{ 873{
606 vmalloc_sync_all(); 874#ifdef CONFIG_SYSCTL
607 rcu_assign_pointer(nmi_callback, callback); 875 if (unknown_nmi_panic)
608} 876 return unknown_nmi_panic_callback(regs, cpu);
609EXPORT_SYMBOL_GPL(set_nmi_callback); 877#endif
610 878 return 0;
611void unset_nmi_callback(void)
612{
613 nmi_callback = dummy_nmi_callback;
614} 879}
615EXPORT_SYMBOL_GPL(unset_nmi_callback);
616 880
617#ifdef CONFIG_SYSCTL 881#ifdef CONFIG_SYSCTL
618 882
@@ -621,36 +885,42 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
621 unsigned char reason = get_nmi_reason(); 885 unsigned char reason = get_nmi_reason();
622 char buf[64]; 886 char buf[64];
623 887
624 if (!(reason & 0xc0)) { 888 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
625 sprintf(buf, "NMI received for unknown reason %02x\n", reason); 889 die_nmi(buf, regs, 1); /* Always panic here */
626 die_nmi(buf,regs);
627 }
628 return 0; 890 return 0;
629} 891}
630 892
631/* 893/*
632 * proc handler for /proc/sys/kernel/unknown_nmi_panic 894 * proc handler for /proc/sys/kernel/nmi
633 */ 895 */
634int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file, 896int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
635 void __user *buffer, size_t *length, loff_t *ppos) 897 void __user *buffer, size_t *length, loff_t *ppos)
636{ 898{
637 int old_state; 899 int old_state;
638 900
639 old_state = unknown_nmi_panic; 901 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
902 old_state = nmi_watchdog_enabled;
640 proc_dointvec(table, write, file, buffer, length, ppos); 903 proc_dointvec(table, write, file, buffer, length, ppos);
641 if (!!old_state == !!unknown_nmi_panic) 904 if (!!old_state == !!nmi_watchdog_enabled)
642 return 0; 905 return 0;
643 906
644 if (unknown_nmi_panic) { 907 if (atomic_read(&nmi_active) < 0) {
645 if (reserve_lapic_nmi() < 0) { 908 printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
646 unknown_nmi_panic = 0; 909 return -EIO;
647 return -EBUSY; 910 }
648 } else { 911
649 set_nmi_callback(unknown_nmi_panic_callback); 912 /* if nmi_watchdog is not set yet, then set it */
650 } 913 nmi_watchdog_default();
914
915 if (nmi_watchdog == NMI_LOCAL_APIC) {
916 if (nmi_watchdog_enabled)
917 enable_lapic_nmi_watchdog();
918 else
919 disable_lapic_nmi_watchdog();
651 } else { 920 } else {
652 release_lapic_nmi(); 921 printk( KERN_WARNING
653 unset_nmi_callback(); 922 "NMI watchdog doesn't know what hardware to touch\n");
923 return -EIO;
654 } 924 }
655 return 0; 925 return 0;
656} 926}
@@ -659,8 +929,12 @@ int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file
659 929
660EXPORT_SYMBOL(nmi_active); 930EXPORT_SYMBOL(nmi_active);
661EXPORT_SYMBOL(nmi_watchdog); 931EXPORT_SYMBOL(nmi_watchdog);
662EXPORT_SYMBOL(reserve_lapic_nmi); 932EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
663EXPORT_SYMBOL(release_lapic_nmi); 933EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
934EXPORT_SYMBOL(reserve_perfctr_nmi);
935EXPORT_SYMBOL(release_perfctr_nmi);
936EXPORT_SYMBOL(reserve_evntsel_nmi);
937EXPORT_SYMBOL(release_evntsel_nmi);
664EXPORT_SYMBOL(disable_timer_nmi_watchdog); 938EXPORT_SYMBOL(disable_timer_nmi_watchdog);
665EXPORT_SYMBOL(enable_timer_nmi_watchdog); 939EXPORT_SYMBOL(enable_timer_nmi_watchdog);
666EXPORT_SYMBOL(touch_nmi_watchdog); 940EXPORT_SYMBOL(touch_nmi_watchdog);
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 146924ba5df5..cfb09b07ae99 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -86,7 +86,8 @@
86 86
87#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ 87#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */
88#define MAX_NUM_CHASSIS 8 /* max number of chassis */ 88#define MAX_NUM_CHASSIS 8 /* max number of chassis */
89#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) /* max dev->bus->number */ 89/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */
90#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2)
90#define PHBS_PER_CALGARY 4 91#define PHBS_PER_CALGARY 4
91 92
92/* register offsets in Calgary's internal register space */ 93/* register offsets in Calgary's internal register space */
@@ -111,31 +112,49 @@ static const unsigned long phb_offsets[] = {
111 0xB000 /* PHB3 */ 112 0xB000 /* PHB3 */
112}; 113};
113 114
114static char bus_to_phb[MAX_PHB_BUS_NUM];
115void* tce_table_kva[MAX_PHB_BUS_NUM];
116unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; 115unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
117static int translate_empty_slots __read_mostly = 0; 116static int translate_empty_slots __read_mostly = 0;
118static int calgary_detected __read_mostly = 0; 117static int calgary_detected __read_mostly = 0;
119 118
120/* 119struct calgary_bus_info {
121 * the bitmap of PHBs the user requested that we disable 120 void *tce_space;
122 * translation on. 121 unsigned char translation_disabled;
123 */ 122 signed char phbid;
124static DECLARE_BITMAP(translation_disabled, MAX_PHB_BUS_NUM); 123};
124
125static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
125 126
126static void tce_cache_blast(struct iommu_table *tbl); 127static void tce_cache_blast(struct iommu_table *tbl);
127 128
128/* enable this to stress test the chip's TCE cache */ 129/* enable this to stress test the chip's TCE cache */
129#ifdef CONFIG_IOMMU_DEBUG 130#ifdef CONFIG_IOMMU_DEBUG
130static inline void tce_cache_blast_stress(struct iommu_table *tbl) 131int debugging __read_mostly = 1;
132
133static inline unsigned long verify_bit_range(unsigned long* bitmap,
134 int expected, unsigned long start, unsigned long end)
131{ 135{
132 tce_cache_blast(tbl); 136 unsigned long idx = start;
137
138 BUG_ON(start >= end);
139
140 while (idx < end) {
141 if (!!test_bit(idx, bitmap) != expected)
142 return idx;
143 ++idx;
144 }
145
146 /* all bits have the expected value */
147 return ~0UL;
133} 148}
134#else 149#else /* debugging is disabled */
135static inline void tce_cache_blast_stress(struct iommu_table *tbl) 150int debugging __read_mostly = 0;
151
152static inline unsigned long verify_bit_range(unsigned long* bitmap,
153 int expected, unsigned long start, unsigned long end)
136{ 154{
155 return ~0UL;
137} 156}
138#endif /* BLAST_TCE_CACHE_ON_UNMAP */ 157#endif /* CONFIG_IOMMU_DEBUG */
139 158
140static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) 159static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
141{ 160{
@@ -149,7 +168,7 @@ static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
149 168
150static inline int translate_phb(struct pci_dev* dev) 169static inline int translate_phb(struct pci_dev* dev)
151{ 170{
152 int disabled = test_bit(dev->bus->number, translation_disabled); 171 int disabled = bus_info[dev->bus->number].translation_disabled;
153 return !disabled; 172 return !disabled;
154} 173}
155 174
@@ -158,6 +177,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
158{ 177{
159 unsigned long index; 178 unsigned long index;
160 unsigned long end; 179 unsigned long end;
180 unsigned long badbit;
161 181
162 index = start_addr >> PAGE_SHIFT; 182 index = start_addr >> PAGE_SHIFT;
163 183
@@ -169,14 +189,15 @@ static void iommu_range_reserve(struct iommu_table *tbl,
169 if (end > tbl->it_size) /* don't go off the table */ 189 if (end > tbl->it_size) /* don't go off the table */
170 end = tbl->it_size; 190 end = tbl->it_size;
171 191
172 while (index < end) { 192 badbit = verify_bit_range(tbl->it_map, 0, index, end);
173 if (test_bit(index, tbl->it_map)) 193 if (badbit != ~0UL) {
194 if (printk_ratelimit())
174 printk(KERN_ERR "Calgary: entry already allocated at " 195 printk(KERN_ERR "Calgary: entry already allocated at "
175 "0x%lx tbl %p dma 0x%lx npages %u\n", 196 "0x%lx tbl %p dma 0x%lx npages %u\n",
176 index, tbl, start_addr, npages); 197 badbit, tbl, start_addr, npages);
177 ++index;
178 } 198 }
179 set_bit_string(tbl->it_map, start_addr >> PAGE_SHIFT, npages); 199
200 set_bit_string(tbl->it_map, index, npages);
180} 201}
181 202
182static unsigned long iommu_range_alloc(struct iommu_table *tbl, 203static unsigned long iommu_range_alloc(struct iommu_table *tbl,
@@ -243,7 +264,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
243 unsigned int npages) 264 unsigned int npages)
244{ 265{
245 unsigned long entry; 266 unsigned long entry;
246 unsigned long i; 267 unsigned long badbit;
247 268
248 entry = dma_addr >> PAGE_SHIFT; 269 entry = dma_addr >> PAGE_SHIFT;
249 270
@@ -251,16 +272,15 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
251 272
252 tce_free(tbl, entry, npages); 273 tce_free(tbl, entry, npages);
253 274
254 for (i = 0; i < npages; ++i) { 275 badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
255 if (!test_bit(entry + i, tbl->it_map)) 276 if (badbit != ~0UL) {
277 if (printk_ratelimit())
256 printk(KERN_ERR "Calgary: bit is off at 0x%lx " 278 printk(KERN_ERR "Calgary: bit is off at 0x%lx "
257 "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", 279 "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
258 entry + i, tbl, dma_addr, entry, npages); 280 badbit, tbl, dma_addr, entry, npages);
259 } 281 }
260 282
261 __clear_bit_string(tbl->it_map, entry, npages); 283 __clear_bit_string(tbl->it_map, entry, npages);
262
263 tce_cache_blast_stress(tbl);
264} 284}
265 285
266static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 286static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -454,7 +474,7 @@ static struct dma_mapping_ops calgary_dma_ops = {
454 474
455static inline int busno_to_phbid(unsigned char num) 475static inline int busno_to_phbid(unsigned char num)
456{ 476{
457 return bus_to_phb[num]; 477 return bus_info[num].phbid;
458} 478}
459 479
460static inline unsigned long split_queue_offset(unsigned char num) 480static inline unsigned long split_queue_offset(unsigned char num)
@@ -631,6 +651,10 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
631 if (ret) 651 if (ret)
632 return ret; 652 return ret;
633 653
654 tbl = dev->sysdata;
655 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
656 tce_free(tbl, 0, tbl->it_size);
657
634 calgary_reserve_regions(dev); 658 calgary_reserve_regions(dev);
635 659
636 /* set TARs for each PHB */ 660 /* set TARs for each PHB */
@@ -654,11 +678,12 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
654 return 0; 678 return 0;
655} 679}
656 680
657static void __init calgary_free_tar(struct pci_dev *dev) 681static void __init calgary_free_bus(struct pci_dev *dev)
658{ 682{
659 u64 val64; 683 u64 val64;
660 struct iommu_table *tbl = dev->sysdata; 684 struct iommu_table *tbl = dev->sysdata;
661 void __iomem *target; 685 void __iomem *target;
686 unsigned int bitmapsz;
662 687
663 target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); 688 target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number));
664 val64 = be64_to_cpu(readq(target)); 689 val64 = be64_to_cpu(readq(target));
@@ -666,8 +691,15 @@ static void __init calgary_free_tar(struct pci_dev *dev)
666 writeq(cpu_to_be64(val64), target); 691 writeq(cpu_to_be64(val64), target);
667 readq(target); /* flush */ 692 readq(target); /* flush */
668 693
694 bitmapsz = tbl->it_size / BITS_PER_BYTE;
695 free_pages((unsigned long)tbl->it_map, get_order(bitmapsz));
696 tbl->it_map = NULL;
697
669 kfree(tbl); 698 kfree(tbl);
670 dev->sysdata = NULL; 699 dev->sysdata = NULL;
700
701 /* Can't free bootmem allocated memory after system is up :-( */
702 bus_info[dev->bus->number].tce_space = NULL;
671} 703}
672 704
673static void calgary_watchdog(unsigned long data) 705static void calgary_watchdog(unsigned long data)
@@ -772,12 +804,11 @@ static inline unsigned int __init locate_register_space(struct pci_dev *dev)
772 return address; 804 return address;
773} 805}
774 806
775static int __init calgary_init_one_nontraslated(struct pci_dev *dev) 807static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
776{ 808{
809 pci_dev_get(dev);
777 dev->sysdata = NULL; 810 dev->sysdata = NULL;
778 dev->bus->self = dev; 811 dev->bus->self = dev;
779
780 return 0;
781} 812}
782 813
783static int __init calgary_init_one(struct pci_dev *dev) 814static int __init calgary_init_one(struct pci_dev *dev)
@@ -798,6 +829,7 @@ static int __init calgary_init_one(struct pci_dev *dev)
798 if (ret) 829 if (ret)
799 goto iounmap; 830 goto iounmap;
800 831
832 pci_dev_get(dev);
801 dev->bus->self = dev; 833 dev->bus->self = dev;
802 calgary_enable_translation(dev); 834 calgary_enable_translation(dev);
803 835
@@ -824,10 +856,9 @@ static int __init calgary_init(void)
824 calgary_init_one_nontraslated(dev); 856 calgary_init_one_nontraslated(dev);
825 continue; 857 continue;
826 } 858 }
827 if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) { 859 if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots)
828 pci_dev_put(dev);
829 continue; 860 continue;
830 } 861
831 ret = calgary_init_one(dev); 862 ret = calgary_init_one(dev);
832 if (ret) 863 if (ret)
833 goto error; 864 goto error;
@@ -840,15 +871,18 @@ error:
840 dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM, 871 dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM,
841 PCI_DEVICE_ID_IBM_CALGARY, 872 PCI_DEVICE_ID_IBM_CALGARY,
842 dev); 873 dev);
874 if (!dev)
875 break;
843 if (!translate_phb(dev)) { 876 if (!translate_phb(dev)) {
844 pci_dev_put(dev); 877 pci_dev_put(dev);
845 continue; 878 continue;
846 } 879 }
847 if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) 880 if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots)
848 continue; 881 continue;
882
849 calgary_disable_translation(dev); 883 calgary_disable_translation(dev);
850 calgary_free_tar(dev); 884 calgary_free_bus(dev);
851 pci_dev_put(dev); 885 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
852 } 886 }
853 887
854 return ret; 888 return ret;
@@ -890,13 +924,15 @@ void __init detect_calgary(void)
890 if (swiotlb || no_iommu || iommu_detected) 924 if (swiotlb || no_iommu || iommu_detected)
891 return; 925 return;
892 926
927 if (!early_pci_allowed())
928 return;
929
893 specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); 930 specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
894 931
895 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { 932 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
896 int dev; 933 int dev;
897 934 struct calgary_bus_info *info = &bus_info[bus];
898 tce_table_kva[bus] = NULL; 935 info->phbid = -1;
899 bus_to_phb[bus] = -1;
900 936
901 if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) 937 if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY)
902 continue; 938 continue;
@@ -907,12 +943,9 @@ void __init detect_calgary(void)
907 */ 943 */
908 phb = (phb + 1) % PHBS_PER_CALGARY; 944 phb = (phb + 1) % PHBS_PER_CALGARY;
909 945
910 if (test_bit(bus, translation_disabled)) { 946 if (info->translation_disabled)
911 printk(KERN_INFO "Calgary: translation is disabled for "
912 "PHB 0x%x\n", bus);
913 /* skip this phb, don't allocate a tbl for it */
914 continue; 947 continue;
915 } 948
916 /* 949 /*
917 * Scan the slots of the PCI bus to see if there is a device present. 950 * Scan the slots of the PCI bus to see if there is a device present.
918 * The parent bus will be the zero-ith device, so start at 1. 951 * The parent bus will be the zero-ith device, so start at 1.
@@ -923,8 +956,8 @@ void __init detect_calgary(void)
923 tbl = alloc_tce_table(); 956 tbl = alloc_tce_table();
924 if (!tbl) 957 if (!tbl)
925 goto cleanup; 958 goto cleanup;
926 tce_table_kva[bus] = tbl; 959 info->tce_space = tbl;
927 bus_to_phb[bus] = phb; 960 info->phbid = phb;
928 calgary_found = 1; 961 calgary_found = 1;
929 break; 962 break;
930 } 963 }
@@ -934,15 +967,20 @@ void __init detect_calgary(void)
934 if (calgary_found) { 967 if (calgary_found) {
935 iommu_detected = 1; 968 iommu_detected = 1;
936 calgary_detected = 1; 969 calgary_detected = 1;
937 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. " 970 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
938 "TCE table spec is %d.\n", specified_table_size); 971 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
972 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
973 debugging ? "enabled" : "disabled");
939 } 974 }
940 return; 975 return;
941 976
942cleanup: 977cleanup:
943 for (--bus; bus >= 0; --bus) 978 for (--bus; bus >= 0; --bus) {
944 if (tce_table_kva[bus]) 979 struct calgary_bus_info *info = &bus_info[bus];
945 free_tce_table(tce_table_kva[bus]); 980
981 if (info->tce_space)
982 free_tce_table(info->tce_space);
983 }
946} 984}
947 985
948int __init calgary_iommu_init(void) 986int __init calgary_iommu_init(void)
@@ -1016,7 +1054,7 @@ static int __init calgary_parse_options(char *p)
1016 if (bridge < MAX_PHB_BUS_NUM) { 1054 if (bridge < MAX_PHB_BUS_NUM) {
1017 printk(KERN_INFO "Calgary: disabling " 1055 printk(KERN_INFO "Calgary: disabling "
1018 "translation for PHB 0x%x\n", bridge); 1056 "translation for PHB 0x%x\n", bridge);
1019 set_bit(bridge, translation_disabled); 1057 bus_info[bridge].translation_disabled = 1;
1020 } 1058 }
1021 } 1059 }
1022 1060
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index 9c44f4f2433d..4dcb671bd19f 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -236,6 +236,9 @@ __init int iommu_setup(char *p)
236{ 236{
237 iommu_merge = 1; 237 iommu_merge = 1;
238 238
239 if (!p)
240 return -EINVAL;
241
239 while (*p) { 242 while (*p) {
240 if (!strncmp(p,"off",3)) 243 if (!strncmp(p,"off",3))
241 no_iommu = 1; 244 no_iommu = 1;
@@ -278,9 +281,9 @@ __init int iommu_setup(char *p)
278 if (*p == ',') 281 if (*p == ',')
279 ++p; 282 ++p;
280 } 283 }
281 return 1; 284 return 0;
282} 285}
283__setup("iommu=", iommu_setup); 286early_param("iommu", iommu_setup);
284 287
285void __init pci_iommu_alloc(void) 288void __init pci_iommu_alloc(void)
286{ 289{
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 6d3e61baf7a0..16261a8a3303 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -239,8 +239,6 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
239{ 239{
240 unsigned long phys_mem, bus; 240 unsigned long phys_mem, bus;
241 241
242 BUG_ON(dir == DMA_NONE);
243
244 if (!dev) 242 if (!dev)
245 dev = &fallback_dev; 243 dev = &fallback_dev;
246 244
@@ -383,7 +381,6 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
383 unsigned long pages = 0; 381 unsigned long pages = 0;
384 int need = 0, nextneed; 382 int need = 0, nextneed;
385 383
386 BUG_ON(dir == DMA_NONE);
387 if (nents == 0) 384 if (nents == 0)
388 return 0; 385 return 0;
389 386
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index aad7609d8e92..df09ab05a1bd 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -59,7 +59,6 @@ int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
59{ 59{
60 int i; 60 int i;
61 61
62 BUG_ON(direction == DMA_NONE);
63 for (i = 0; i < nents; i++ ) { 62 for (i = 0; i < nents; i++ ) {
64 struct scatterlist *s = &sg[i]; 63 struct scatterlist *s = &sg[i];
65 BUG_ON(!s->page); 64 BUG_ON(!s->page);
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index bb6745d13b8f..458006ae19f3 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -80,25 +80,25 @@ void idle_notifier_unregister(struct notifier_block *n)
80} 80}
81EXPORT_SYMBOL(idle_notifier_unregister); 81EXPORT_SYMBOL(idle_notifier_unregister);
82 82
83enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
84static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
85
86void enter_idle(void) 83void enter_idle(void)
87{ 84{
88 __get_cpu_var(idle_state) = CPU_IDLE; 85 write_pda(isidle, 1);
89 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); 86 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
90} 87}
91 88
92static void __exit_idle(void) 89static void __exit_idle(void)
93{ 90{
94 __get_cpu_var(idle_state) = CPU_NOT_IDLE; 91 if (read_pda(isidle) == 0)
92 return;
93 write_pda(isidle, 0);
95 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); 94 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
96} 95}
97 96
98/* Called from interrupts to signify idle end */ 97/* Called from interrupts to signify idle end */
99void exit_idle(void) 98void exit_idle(void)
100{ 99{
101 if (current->pid | read_pda(irqcount)) 100 /* idle loop has pid 0 */
101 if (current->pid)
102 return; 102 return;
103 __exit_idle(); 103 __exit_idle();
104} 104}
@@ -220,6 +220,9 @@ void cpu_idle (void)
220 play_dead(); 220 play_dead();
221 enter_idle(); 221 enter_idle();
222 idle(); 222 idle();
223 /* In many cases the interrupt that ended idle
224 has already called exit_idle. But some idle
225 loops can be woken up without interrupt. */
223 __exit_idle(); 226 __exit_idle();
224 } 227 }
225 228
@@ -350,6 +353,7 @@ void exit_thread(void)
350 353
351 kfree(t->io_bitmap_ptr); 354 kfree(t->io_bitmap_ptr);
352 t->io_bitmap_ptr = NULL; 355 t->io_bitmap_ptr = NULL;
356 clear_thread_flag(TIF_IO_BITMAP);
353 /* 357 /*
354 * Careful, clear this in the TSS too: 358 * Careful, clear this in the TSS too:
355 */ 359 */
@@ -369,6 +373,7 @@ void flush_thread(void)
369 if (t->flags & _TIF_IA32) 373 if (t->flags & _TIF_IA32)
370 current_thread_info()->status |= TS_COMPAT; 374 current_thread_info()->status |= TS_COMPAT;
371 } 375 }
376 t->flags &= ~_TIF_DEBUG;
372 377
373 tsk->thread.debugreg0 = 0; 378 tsk->thread.debugreg0 = 0;
374 tsk->thread.debugreg1 = 0; 379 tsk->thread.debugreg1 = 0;
@@ -461,7 +466,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
461 asm("mov %%es,%0" : "=m" (p->thread.es)); 466 asm("mov %%es,%0" : "=m" (p->thread.es));
462 asm("mov %%ds,%0" : "=m" (p->thread.ds)); 467 asm("mov %%ds,%0" : "=m" (p->thread.ds));
463 468
464 if (unlikely(me->thread.io_bitmap_ptr != NULL)) { 469 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
465 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 470 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
466 if (!p->thread.io_bitmap_ptr) { 471 if (!p->thread.io_bitmap_ptr) {
467 p->thread.io_bitmap_max = 0; 472 p->thread.io_bitmap_max = 0;
@@ -469,6 +474,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
469 } 474 }
470 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, 475 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
471 IO_BITMAP_BYTES); 476 IO_BITMAP_BYTES);
477 set_tsk_thread_flag(p, TIF_IO_BITMAP);
472 } 478 }
473 479
474 /* 480 /*
@@ -498,6 +504,40 @@ out:
498 */ 504 */
499#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) 505#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
500 506
507static inline void __switch_to_xtra(struct task_struct *prev_p,
508 struct task_struct *next_p,
509 struct tss_struct *tss)
510{
511 struct thread_struct *prev, *next;
512
513 prev = &prev_p->thread,
514 next = &next_p->thread;
515
516 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
517 loaddebug(next, 0);
518 loaddebug(next, 1);
519 loaddebug(next, 2);
520 loaddebug(next, 3);
521 /* no 4 and 5 */
522 loaddebug(next, 6);
523 loaddebug(next, 7);
524 }
525
526 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
527 /*
528 * Copy the relevant range of the IO bitmap.
529 * Normally this is 128 bytes or less:
530 */
531 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
532 max(prev->io_bitmap_max, next->io_bitmap_max));
533 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
534 /*
535 * Clear any possible leftover bits:
536 */
537 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
538 }
539}
540
501/* 541/*
502 * switch_to(x,y) should switch tasks from x to y. 542 * switch_to(x,y) should switch tasks from x to y.
503 * 543 *
@@ -515,6 +555,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
515 int cpu = smp_processor_id(); 555 int cpu = smp_processor_id();
516 struct tss_struct *tss = &per_cpu(init_tss, cpu); 556 struct tss_struct *tss = &per_cpu(init_tss, cpu);
517 557
558 /* we're going to use this soon, after a few expensive things */
559 if (next_p->fpu_counter>5)
560 prefetch(&next->i387.fxsave);
561
518 /* 562 /*
519 * Reload esp0, LDT and the page table pointer: 563 * Reload esp0, LDT and the page table pointer:
520 */ 564 */
@@ -583,41 +627,29 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
583 And the AMD workaround requires it to be after DS reload. */ 627 And the AMD workaround requires it to be after DS reload. */
584 unlazy_fpu(prev_p); 628 unlazy_fpu(prev_p);
585 write_pda(kernelstack, 629 write_pda(kernelstack,
586 task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); 630 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
587 631#ifdef CONFIG_CC_STACKPROTECTOR
632 write_pda(stack_canary, next_p->stack_canary);
588 /* 633 /*
589 * Now maybe reload the debug registers 634 * Build time only check to make sure the stack_canary is at
635 * offset 40 in the pda; this is a gcc ABI requirement
590 */ 636 */
591 if (unlikely(next->debugreg7)) { 637 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
592 loaddebug(next, 0); 638#endif
593 loaddebug(next, 1);
594 loaddebug(next, 2);
595 loaddebug(next, 3);
596 /* no 4 and 5 */
597 loaddebug(next, 6);
598 loaddebug(next, 7);
599 }
600
601 639
602 /* 640 /*
603 * Handle the IO bitmap 641 * Now maybe reload the debug registers and handle I/O bitmaps
604 */ 642 */
605 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { 643 if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
606 if (next->io_bitmap_ptr) 644 || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
607 /* 645 __switch_to_xtra(prev_p, next_p, tss);
608 * Copy the relevant range of the IO bitmap.
609 * Normally this is 128 bytes or less:
610 */
611 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
612 max(prev->io_bitmap_max, next->io_bitmap_max));
613 else {
614 /*
615 * Clear any possible leftover bits:
616 */
617 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
618 }
619 }
620 646
647 /* If the task has used fpu the last 5 timeslices, just do a full
648 * restore of the math state immediately to avoid the trap; the
649 * chances of needing FPU soon are obviously high now
650 */
651 if (next_p->fpu_counter>5)
652 math_state_restore();
621 return prev_p; 653 return prev_p;
622} 654}
623 655
@@ -834,7 +866,7 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
834 866
835unsigned long arch_align_stack(unsigned long sp) 867unsigned long arch_align_stack(unsigned long sp)
836{ 868{
837 if (randomize_va_space) 869 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
838 sp -= get_random_int() % 8192; 870 sp -= get_random_int() % 8192;
839 return sp & ~0xf; 871 return sp & ~0xf;
840} 872}
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index 2d50024c9f30..addc14af0c56 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -116,17 +116,17 @@ unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *r
116 return addr; 116 return addr;
117} 117}
118 118
119static int is_at_popf(struct task_struct *child, struct pt_regs *regs) 119static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
120{ 120{
121 int i, copied; 121 int i, copied;
122 unsigned char opcode[16]; 122 unsigned char opcode[15];
123 unsigned long addr = convert_rip_to_linear(child, regs); 123 unsigned long addr = convert_rip_to_linear(child, regs);
124 124
125 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); 125 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
126 for (i = 0; i < copied; i++) { 126 for (i = 0; i < copied; i++) {
127 switch (opcode[i]) { 127 switch (opcode[i]) {
128 /* popf */ 128 /* popf and iret */
129 case 0x9d: 129 case 0x9d: case 0xcf:
130 return 1; 130 return 1;
131 131
132 /* CHECKME: 64 65 */ 132 /* CHECKME: 64 65 */
@@ -138,14 +138,17 @@ static int is_at_popf(struct task_struct *child, struct pt_regs *regs)
138 case 0x26: case 0x2e: 138 case 0x26: case 0x2e:
139 case 0x36: case 0x3e: 139 case 0x36: case 0x3e:
140 case 0x64: case 0x65: 140 case 0x64: case 0x65:
141 case 0xf0: case 0xf2: case 0xf3: 141 case 0xf2: case 0xf3:
142 continue; 142 continue;
143 143
144 /* REX prefixes */
145 case 0x40 ... 0x4f: 144 case 0x40 ... 0x4f:
145 if (regs->cs != __USER_CS)
146 /* 32-bit mode: register increment */
147 return 0;
148 /* 64-bit mode: REX prefix */
146 continue; 149 continue;
147 150
148 /* CHECKME: f0, f2, f3 */ 151 /* CHECKME: f2, f3 */
149 152
150 /* 153 /*
151 * pushf: NOTE! We should probably not let 154 * pushf: NOTE! We should probably not let
@@ -186,10 +189,8 @@ static void set_singlestep(struct task_struct *child)
186 * ..but if TF is changed by the instruction we will trace, 189 * ..but if TF is changed by the instruction we will trace,
187 * don't mark it as being "us" that set it, so that we 190 * don't mark it as being "us" that set it, so that we
188 * won't clear it by hand later. 191 * won't clear it by hand later.
189 *
190 * AK: this is not enough, LAHF and IRET can change TF in user space too.
191 */ 192 */
192 if (is_at_popf(child, regs)) 193 if (is_setting_trap_flag(child, regs))
193 return; 194 return;
194 195
195 child->ptrace |= PT_DTRACE; 196 child->ptrace |= PT_DTRACE;
@@ -420,9 +421,13 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
420 if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1) 421 if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
421 break; 422 break;
422 if (i == 4) { 423 if (i == 4) {
423 child->thread.debugreg7 = data; 424 child->thread.debugreg7 = data;
425 if (data)
426 set_tsk_thread_flag(child, TIF_DEBUG);
427 else
428 clear_tsk_thread_flag(child, TIF_DEBUG);
424 ret = 0; 429 ret = 0;
425 } 430 }
426 break; 431 break;
427 } 432 }
428 break; 433 break;
diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S
index d24fa9b72a2b..14e95872c6a3 100644
--- a/arch/x86_64/kernel/relocate_kernel.S
+++ b/arch/x86_64/kernel/relocate_kernel.S
@@ -7,31 +7,169 @@
7 */ 7 */
8 8
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <asm/page.h>
11#include <asm/kexec.h>
10 12
11 /* 13/*
12 * Must be relocatable PIC code callable as a C function, that once 14 * Must be relocatable PIC code callable as a C function
13 * it starts can not use the previous processes stack. 15 */
14 */ 16
15 .globl relocate_new_kernel 17#define PTR(x) (x << 3)
18#define PAGE_ALIGNED (1 << PAGE_SHIFT)
19#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
20
21 .text
22 .align PAGE_ALIGNED
16 .code64 23 .code64
24 .globl relocate_kernel
25relocate_kernel:
26 /* %rdi indirection_page
27 * %rsi page_list
28 * %rdx start address
29 */
30
31 /* map the control page at its virtual address */
32
33 movq $0x0000ff8000000000, %r10 /* mask */
34 mov $(39 - 3), %cl /* bits to shift */
35 movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
36
37 movq %r11, %r9
38 andq %r10, %r9
39 shrq %cl, %r9
40
41 movq PTR(VA_PGD)(%rsi), %r8
42 addq %r8, %r9
43 movq PTR(PA_PUD_0)(%rsi), %r8
44 orq $PAGE_ATTR, %r8
45 movq %r8, (%r9)
46
47 shrq $9, %r10
48 sub $9, %cl
49
50 movq %r11, %r9
51 andq %r10, %r9
52 shrq %cl, %r9
53
54 movq PTR(VA_PUD_0)(%rsi), %r8
55 addq %r8, %r9
56 movq PTR(PA_PMD_0)(%rsi), %r8
57 orq $PAGE_ATTR, %r8
58 movq %r8, (%r9)
59
60 shrq $9, %r10
61 sub $9, %cl
62
63 movq %r11, %r9
64 andq %r10, %r9
65 shrq %cl, %r9
66
67 movq PTR(VA_PMD_0)(%rsi), %r8
68 addq %r8, %r9
69 movq PTR(PA_PTE_0)(%rsi), %r8
70 orq $PAGE_ATTR, %r8
71 movq %r8, (%r9)
72
73 shrq $9, %r10
74 sub $9, %cl
75
76 movq %r11, %r9
77 andq %r10, %r9
78 shrq %cl, %r9
79
80 movq PTR(VA_PTE_0)(%rsi), %r8
81 addq %r8, %r9
82 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
83 orq $PAGE_ATTR, %r8
84 movq %r8, (%r9)
85
86 /* identity map the control page at its physical address */
87
88 movq $0x0000ff8000000000, %r10 /* mask */
89 mov $(39 - 3), %cl /* bits to shift */
90 movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
91
92 movq %r11, %r9
93 andq %r10, %r9
94 shrq %cl, %r9
95
96 movq PTR(VA_PGD)(%rsi), %r8
97 addq %r8, %r9
98 movq PTR(PA_PUD_1)(%rsi), %r8
99 orq $PAGE_ATTR, %r8
100 movq %r8, (%r9)
101
102 shrq $9, %r10
103 sub $9, %cl
104
105 movq %r11, %r9
106 andq %r10, %r9
107 shrq %cl, %r9
108
109 movq PTR(VA_PUD_1)(%rsi), %r8
110 addq %r8, %r9
111 movq PTR(PA_PMD_1)(%rsi), %r8
112 orq $PAGE_ATTR, %r8
113 movq %r8, (%r9)
114
115 shrq $9, %r10
116 sub $9, %cl
117
118 movq %r11, %r9
119 andq %r10, %r9
120 shrq %cl, %r9
121
122 movq PTR(VA_PMD_1)(%rsi), %r8
123 addq %r8, %r9
124 movq PTR(PA_PTE_1)(%rsi), %r8
125 orq $PAGE_ATTR, %r8
126 movq %r8, (%r9)
127
128 shrq $9, %r10
129 sub $9, %cl
130
131 movq %r11, %r9
132 andq %r10, %r9
133 shrq %cl, %r9
134
135 movq PTR(VA_PTE_1)(%rsi), %r8
136 addq %r8, %r9
137 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
138 orq $PAGE_ATTR, %r8
139 movq %r8, (%r9)
140
17relocate_new_kernel: 141relocate_new_kernel:
18 /* %rdi page_list 142 /* %rdi indirection_page
19 * %rsi reboot_code_buffer 143 * %rsi page_list
20 * %rdx start address 144 * %rdx start address
21 * %rcx page_table
22 * %r8 arg5
23 * %r9 arg6
24 */ 145 */
25 146
26 /* zero out flags, and disable interrupts */ 147 /* zero out flags, and disable interrupts */
27 pushq $0 148 pushq $0
28 popfq 149 popfq
29 150
30 /* set a new stack at the bottom of our page... */ 151 /* get physical address of control page now */
31 lea 4096(%rsi), %rsp 152 /* this is impossible after page table switch */
153 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
154
155 /* get physical address of page table now too */
156 movq PTR(PA_TABLE_PAGE)(%rsi), %rcx
32 157
33 /* store the parameters back on the stack */ 158 /* switch to new set of page tables */
34 pushq %rdx /* store the start address */ 159 movq PTR(PA_PGD)(%rsi), %r9
160 movq %r9, %cr3
161
162 /* setup a new stack at the end of the physical control page */
163 lea 4096(%r8), %rsp
164
165 /* jump to identity mapped page */
166 addq $(identity_mapped - relocate_kernel), %r8
167 pushq %r8
168 ret
169
170identity_mapped:
171 /* store the start address on the stack */
172 pushq %rdx
35 173
36 /* Set cr0 to a known state: 174 /* Set cr0 to a known state:
37 * 31 1 == Paging enabled 175 * 31 1 == Paging enabled
@@ -136,8 +274,3 @@ relocate_new_kernel:
136 xorq %r15, %r15 274 xorq %r15, %r15
137 275
138 ret 276 ret
139relocate_new_kernel_end:
140
141 .globl relocate_new_kernel_size
142relocate_new_kernel_size:
143 .quad relocate_new_kernel_end - relocate_new_kernel
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 4b39f0da17f3..0b00bb2ea576 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -74,16 +74,6 @@ EXPORT_SYMBOL(boot_cpu_data);
74 74
75unsigned long mmu_cr4_features; 75unsigned long mmu_cr4_features;
76 76
77int acpi_disabled;
78EXPORT_SYMBOL(acpi_disabled);
79#ifdef CONFIG_ACPI
80extern int __initdata acpi_ht;
81extern acpi_interrupt_flags acpi_sci_flags;
82int __initdata acpi_force = 0;
83#endif
84
85int acpi_numa __initdata;
86
87/* Boot loader ID as an integer, for the benefit of proc_dointvec */ 77/* Boot loader ID as an integer, for the benefit of proc_dointvec */
88int bootloader_type; 78int bootloader_type;
89 79
@@ -107,7 +97,6 @@ struct sys_desc_table_struct {
107 97
108struct edid_info edid_info; 98struct edid_info edid_info;
109EXPORT_SYMBOL_GPL(edid_info); 99EXPORT_SYMBOL_GPL(edid_info);
110struct e820map e820;
111 100
112extern int root_mountflags; 101extern int root_mountflags;
113 102
@@ -276,185 +265,22 @@ static void __init probe_roms(void)
276 } 265 }
277} 266}
278 267
279/* Check for full argument with no trailing characters */ 268#ifdef CONFIG_PROC_VMCORE
280static int fullarg(char *p, char *arg) 269/* elfcorehdr= specifies the location of elf core header
270 * stored by the crashed kernel. This option will be passed
271 * by kexec loader to the capture kernel.
272 */
273static int __init setup_elfcorehdr(char *arg)
281{ 274{
282 int l = strlen(arg); 275 char *end;
283 return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l])); 276 if (!arg)
277 return -EINVAL;
278 elfcorehdr_addr = memparse(arg, &end);
279 return end > arg ? 0 : -EINVAL;
284} 280}
285 281early_param("elfcorehdr", setup_elfcorehdr);
286static __init void parse_cmdline_early (char ** cmdline_p)
287{
288 char c = ' ', *to = command_line, *from = COMMAND_LINE;
289 int len = 0;
290 int userdef = 0;
291
292 for (;;) {
293 if (c != ' ')
294 goto next_char;
295
296#ifdef CONFIG_SMP
297 /*
298 * If the BIOS enumerates physical processors before logical,
299 * maxcpus=N at enumeration-time can be used to disable HT.
300 */
301 else if (!memcmp(from, "maxcpus=", 8)) {
302 extern unsigned int maxcpus;
303
304 maxcpus = simple_strtoul(from + 8, NULL, 0);
305 }
306#endif
307#ifdef CONFIG_ACPI
308 /* "acpi=off" disables both ACPI table parsing and interpreter init */
309 if (fullarg(from,"acpi=off"))
310 disable_acpi();
311
312 if (fullarg(from, "acpi=force")) {
313 /* add later when we do DMI horrors: */
314 acpi_force = 1;
315 acpi_disabled = 0;
316 }
317
318 /* acpi=ht just means: do ACPI MADT parsing
319 at bootup, but don't enable the full ACPI interpreter */
320 if (fullarg(from, "acpi=ht")) {
321 if (!acpi_force)
322 disable_acpi();
323 acpi_ht = 1;
324 }
325 else if (fullarg(from, "pci=noacpi"))
326 acpi_disable_pci();
327 else if (fullarg(from, "acpi=noirq"))
328 acpi_noirq_set();
329
330 else if (fullarg(from, "acpi_sci=edge"))
331 acpi_sci_flags.trigger = 1;
332 else if (fullarg(from, "acpi_sci=level"))
333 acpi_sci_flags.trigger = 3;
334 else if (fullarg(from, "acpi_sci=high"))
335 acpi_sci_flags.polarity = 1;
336 else if (fullarg(from, "acpi_sci=low"))
337 acpi_sci_flags.polarity = 3;
338
339 /* acpi=strict disables out-of-spec workarounds */
340 else if (fullarg(from, "acpi=strict")) {
341 acpi_strict = 1;
342 }
343#ifdef CONFIG_X86_IO_APIC
344 else if (fullarg(from, "acpi_skip_timer_override"))
345 acpi_skip_timer_override = 1;
346#endif
347#endif
348
349 if (fullarg(from, "disable_timer_pin_1"))
350 disable_timer_pin_1 = 1;
351 if (fullarg(from, "enable_timer_pin_1"))
352 disable_timer_pin_1 = -1;
353
354 if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
355 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
356 disable_apic = 1;
357 }
358
359 if (fullarg(from, "noapic"))
360 skip_ioapic_setup = 1;
361
362 if (fullarg(from,"apic")) {
363 skip_ioapic_setup = 0;
364 ioapic_force = 1;
365 }
366
367 if (!memcmp(from, "mem=", 4))
368 parse_memopt(from+4, &from);
369
370 if (!memcmp(from, "memmap=", 7)) {
371 /* exactmap option is for used defined memory */
372 if (!memcmp(from+7, "exactmap", 8)) {
373#ifdef CONFIG_CRASH_DUMP
374 /* If we are doing a crash dump, we
375 * still need to know the real mem
376 * size before original memory map is
377 * reset.
378 */
379 saved_max_pfn = e820_end_of_ram();
380#endif
381 from += 8+7;
382 end_pfn_map = 0;
383 e820.nr_map = 0;
384 userdef = 1;
385 }
386 else {
387 parse_memmapopt(from+7, &from);
388 userdef = 1;
389 }
390 }
391
392#ifdef CONFIG_NUMA
393 if (!memcmp(from, "numa=", 5))
394 numa_setup(from+5);
395#endif 282#endif
396 283
397 if (!memcmp(from,"iommu=",6)) {
398 iommu_setup(from+6);
399 }
400
401 if (fullarg(from,"oops=panic"))
402 panic_on_oops = 1;
403
404 if (!memcmp(from, "noexec=", 7))
405 nonx_setup(from + 7);
406
407#ifdef CONFIG_KEXEC
408 /* crashkernel=size@addr specifies the location to reserve for
409 * a crash kernel. By reserving this memory we guarantee
410 * that linux never set's it up as a DMA target.
411 * Useful for holding code to do something appropriate
412 * after a kernel panic.
413 */
414 else if (!memcmp(from, "crashkernel=", 12)) {
415 unsigned long size, base;
416 size = memparse(from+12, &from);
417 if (*from == '@') {
418 base = memparse(from+1, &from);
419 /* FIXME: Do I want a sanity check
420 * to validate the memory range?
421 */
422 crashk_res.start = base;
423 crashk_res.end = base + size - 1;
424 }
425 }
426#endif
427
428#ifdef CONFIG_PROC_VMCORE
429 /* elfcorehdr= specifies the location of elf core header
430 * stored by the crashed kernel. This option will be passed
431 * by kexec loader to the capture kernel.
432 */
433 else if(!memcmp(from, "elfcorehdr=", 11))
434 elfcorehdr_addr = memparse(from+11, &from);
435#endif
436
437#ifdef CONFIG_HOTPLUG_CPU
438 else if (!memcmp(from, "additional_cpus=", 16))
439 setup_additional_cpus(from+16);
440#endif
441
442 next_char:
443 c = *(from++);
444 if (!c)
445 break;
446 if (COMMAND_LINE_SIZE <= ++len)
447 break;
448 *(to++) = c;
449 }
450 if (userdef) {
451 printk(KERN_INFO "user-defined physical RAM map:\n");
452 e820_print_map("user");
453 }
454 *to = '\0';
455 *cmdline_p = command_line;
456}
457
458#ifndef CONFIG_NUMA 284#ifndef CONFIG_NUMA
459static void __init 285static void __init
460contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 286contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
@@ -466,7 +292,8 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
466 if (bootmap == -1L) 292 if (bootmap == -1L)
467 panic("Cannot find bootmem map of size %ld\n",bootmap_size); 293 panic("Cannot find bootmem map of size %ld\n",bootmap_size);
468 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); 294 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
469 e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); 295 e820_register_active_regions(0, start_pfn, end_pfn);
296 free_bootmem_with_active_regions(0, end_pfn);
470 reserve_bootmem(bootmap, bootmap_size); 297 reserve_bootmem(bootmap, bootmap_size);
471} 298}
472#endif 299#endif
@@ -521,6 +348,8 @@ static void discover_ebda(void)
521 348
522void __init setup_arch(char **cmdline_p) 349void __init setup_arch(char **cmdline_p)
523{ 350{
351 printk(KERN_INFO "Command line: %s\n", saved_command_line);
352
524 ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); 353 ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
525 screen_info = SCREEN_INFO; 354 screen_info = SCREEN_INFO;
526 edid_info = EDID_INFO; 355 edid_info = EDID_INFO;
@@ -547,16 +376,22 @@ void __init setup_arch(char **cmdline_p)
547 data_resource.start = virt_to_phys(&_etext); 376 data_resource.start = virt_to_phys(&_etext);
548 data_resource.end = virt_to_phys(&_edata)-1; 377 data_resource.end = virt_to_phys(&_edata)-1;
549 378
550 parse_cmdline_early(cmdline_p);
551
552 early_identify_cpu(&boot_cpu_data); 379 early_identify_cpu(&boot_cpu_data);
553 380
381 strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
382 *cmdline_p = command_line;
383
384 parse_early_param();
385
386 finish_e820_parsing();
387
388 e820_register_active_regions(0, 0, -1UL);
554 /* 389 /*
555 * partially used pages are not usable - thus 390 * partially used pages are not usable - thus
556 * we are rounding upwards: 391 * we are rounding upwards:
557 */ 392 */
558 end_pfn = e820_end_of_ram(); 393 end_pfn = e820_end_of_ram();
559 num_physpages = end_pfn; /* for pfn_valid */ 394 num_physpages = end_pfn;
560 395
561 check_efer(); 396 check_efer();
562 397
@@ -576,6 +411,14 @@ void __init setup_arch(char **cmdline_p)
576 acpi_boot_table_init(); 411 acpi_boot_table_init();
577#endif 412#endif
578 413
414 /* How many end-of-memory variables you have, grandma! */
415 max_low_pfn = end_pfn;
416 max_pfn = end_pfn;
417 high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
418
419 /* Remove active ranges so rediscovery with NUMA-awareness happens */
420 remove_all_active_ranges();
421
579#ifdef CONFIG_ACPI_NUMA 422#ifdef CONFIG_ACPI_NUMA
580 /* 423 /*
581 * Parse SRAT to discover nodes. 424 * Parse SRAT to discover nodes.
@@ -625,12 +468,10 @@ void __init setup_arch(char **cmdline_p)
625 */ 468 */
626 acpi_reserve_bootmem(); 469 acpi_reserve_bootmem();
627#endif 470#endif
628#ifdef CONFIG_X86_LOCAL_APIC
629 /* 471 /*
630 * Find and reserve possible boot-time SMP configuration: 472 * Find and reserve possible boot-time SMP configuration:
631 */ 473 */
632 find_smp_config(); 474 find_smp_config();
633#endif
634#ifdef CONFIG_BLK_DEV_INITRD 475#ifdef CONFIG_BLK_DEV_INITRD
635 if (LOADER_TYPE && INITRD_START) { 476 if (LOADER_TYPE && INITRD_START) {
636 if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { 477 if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
@@ -657,7 +498,9 @@ void __init setup_arch(char **cmdline_p)
657 498
658 paging_init(); 499 paging_init();
659 500
660 check_ioapic(); 501#ifdef CONFIG_PCI
502 early_quirks();
503#endif
661 504
662 /* 505 /*
663 * set this early, so we dont allocate cpu0 506 * set this early, so we dont allocate cpu0
@@ -674,14 +517,12 @@ void __init setup_arch(char **cmdline_p)
674 517
675 init_cpu_to_node(); 518 init_cpu_to_node();
676 519
677#ifdef CONFIG_X86_LOCAL_APIC
678 /* 520 /*
679 * get boot-time SMP configuration: 521 * get boot-time SMP configuration:
680 */ 522 */
681 if (smp_found_config) 523 if (smp_found_config)
682 get_smp_config(); 524 get_smp_config();
683 init_apic_mappings(); 525 init_apic_mappings();
684#endif
685 526
686 /* 527 /*
687 * Request address space for all standard RAM and ROM resources 528 * Request address space for all standard RAM and ROM resources
@@ -839,7 +680,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
839#endif 680#endif
840} 681}
841 682
842static void __init init_amd(struct cpuinfo_x86 *c) 683static void __cpuinit init_amd(struct cpuinfo_x86 *c)
843{ 684{
844 unsigned level; 685 unsigned level;
845 686
@@ -895,6 +736,12 @@ static void __init init_amd(struct cpuinfo_x86 *c)
895 736
896 /* Fix cpuid4 emulation for more */ 737 /* Fix cpuid4 emulation for more */
897 num_cache_leaves = 3; 738 num_cache_leaves = 3;
739
740 /* When there is only one core no need to synchronize RDTSC */
741 if (num_possible_cpus() == 1)
742 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
743 else
744 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
898} 745}
899 746
900static void __cpuinit detect_ht(struct cpuinfo_x86 *c) 747static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -976,8 +823,7 @@ static void srat_detect_node(void)
976 node = first_node(node_online_map); 823 node = first_node(node_online_map);
977 numa_set_node(cpu, node); 824 numa_set_node(cpu, node);
978 825
979 if (acpi_numa > 0) 826 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
980 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
981#endif 827#endif
982} 828}
983 829
@@ -1011,6 +857,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
1011 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 857 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
1012 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 858 (c->x86 == 0x6 && c->x86_model >= 0x0e))
1013 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); 859 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
860 if (c->x86 == 6)
861 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
1014 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); 862 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
1015 c->x86_max_cores = intel_num_cpu_cores(c); 863 c->x86_max_cores = intel_num_cpu_cores(c);
1016 864
@@ -1229,8 +1077,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
1229 1077
1230 /* Intel-defined (#2) */ 1078 /* Intel-defined (#2) */
1231 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", 1079 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
1232 "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, 1080 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
1233 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1081 NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
1234 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1082 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1235 1083
1236 /* VIA/Cyrix/Centaur-defined */ 1084 /* VIA/Cyrix/Centaur-defined */
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 417de564456e..8c4b80fe71a1 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -24,7 +24,7 @@
24#include <asm/proto.h> 24#include <asm/proto.h>
25#include <asm/sections.h> 25#include <asm/sections.h>
26 26
27char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; 27char x86_boot_params[BOOT_PARAM_SIZE] __initdata;
28 28
29cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 29cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
30 30
@@ -46,8 +46,10 @@ Control non executable mappings for 64bit processes.
46on Enable(default) 46on Enable(default)
47off Disable 47off Disable
48*/ 48*/
49int __init nonx_setup(char *str) 49static int __init nonx_setup(char *str)
50{ 50{
51 if (!str)
52 return -EINVAL;
51 if (!strncmp(str, "on", 2)) { 53 if (!strncmp(str, "on", 2)) {
52 __supported_pte_mask |= _PAGE_NX; 54 __supported_pte_mask |= _PAGE_NX;
53 do_not_nx = 0; 55 do_not_nx = 0;
@@ -55,9 +57,9 @@ int __init nonx_setup(char *str)
55 do_not_nx = 1; 57 do_not_nx = 1;
56 __supported_pte_mask &= ~_PAGE_NX; 58 __supported_pte_mask &= ~_PAGE_NX;
57 } 59 }
58 return 1; 60 return 0;
59} 61}
60__setup("noexec=", nonx_setup); /* parsed early actually */ 62early_param("noexec", nonx_setup);
61 63
62int force_personality32 = 0; 64int force_personality32 = 0;
63 65
@@ -93,12 +95,9 @@ void __init setup_per_cpu_areas(void)
93#endif 95#endif
94 96
95 /* Copy section for each CPU (we discard the original) */ 97 /* Copy section for each CPU (we discard the original) */
96 size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); 98 size = PERCPU_ENOUGH_ROOM;
97#ifdef CONFIG_MODULES
98 if (size < PERCPU_ENOUGH_ROOM)
99 size = PERCPU_ENOUGH_ROOM;
100#endif
101 99
100 printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
102 for_each_cpu_mask (i, cpu_possible_map) { 101 for_each_cpu_mask (i, cpu_possible_map) {
103 char *ptr; 102 char *ptr;
104 103
@@ -122,7 +121,10 @@ void pda_init(int cpu)
122 121
123 /* Setup up data that may be needed in __get_free_pages early */ 122 /* Setup up data that may be needed in __get_free_pages early */
124 asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 123 asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
124 /* Memory clobbers used to order PDA accessed */
125 mb();
125 wrmsrl(MSR_GS_BASE, pda); 126 wrmsrl(MSR_GS_BASE, pda);
127 mb();
126 128
127 pda->cpunumber = cpu; 129 pda->cpunumber = cpu;
128 pda->irqcount = -1; 130 pda->irqcount = -1;
@@ -178,6 +180,8 @@ void __cpuinit check_efer(void)
178 } 180 }
179} 181}
180 182
183unsigned long kernel_eflags;
184
181/* 185/*
182 * cpu_init() initializes state that is per-CPU. Some data is already 186 * cpu_init() initializes state that is per-CPU. Some data is already
183 * initialized (naturally) in the bootstrap process, such as the GDT 187 * initialized (naturally) in the bootstrap process, such as the GDT
@@ -235,28 +239,17 @@ void __cpuinit cpu_init (void)
235 * set up and load the per-CPU TSS 239 * set up and load the per-CPU TSS
236 */ 240 */
237 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 241 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
242 static const unsigned int order[N_EXCEPTION_STACKS] = {
243 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
244 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
245 };
238 if (cpu) { 246 if (cpu) {
239 static const unsigned int order[N_EXCEPTION_STACKS] = {
240 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
241 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
242 };
243
244 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); 247 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
245 if (!estacks) 248 if (!estacks)
246 panic("Cannot allocate exception stack %ld %d\n", 249 panic("Cannot allocate exception stack %ld %d\n",
247 v, cpu); 250 v, cpu);
248 } 251 }
249 switch (v + 1) { 252 estacks += PAGE_SIZE << order[v];
250#if DEBUG_STKSZ > EXCEPTION_STKSZ
251 case DEBUG_STACK:
252 cpu_pda(cpu)->debugstack = (unsigned long)estacks;
253 estacks += DEBUG_STKSZ;
254 break;
255#endif
256 default:
257 estacks += EXCEPTION_STKSZ;
258 break;
259 }
260 orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; 253 orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
261 } 254 }
262 255
@@ -290,4 +283,6 @@ void __cpuinit cpu_init (void)
290 set_debugreg(0UL, 7); 283 set_debugreg(0UL, 7);
291 284
292 fpu_init(); 285 fpu_init();
286
287 raw_local_save_flags(kernel_eflags);
293} 288}
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index 28161170fb0a..49ec324cd141 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -38,37 +38,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
38 sigset_t *set, struct pt_regs * regs); 38 sigset_t *set, struct pt_regs * regs);
39 39
40asmlinkage long 40asmlinkage long
41sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs)
42{
43 sigset_t saveset, newset;
44
45 /* XXX: Don't preclude handling different sized sigset_t's. */
46 if (sigsetsize != sizeof(sigset_t))
47 return -EINVAL;
48
49 if (copy_from_user(&newset, unewset, sizeof(newset)))
50 return -EFAULT;
51 sigdelsetmask(&newset, ~_BLOCKABLE);
52
53 spin_lock_irq(&current->sighand->siglock);
54 saveset = current->blocked;
55 current->blocked = newset;
56 recalc_sigpending();
57 spin_unlock_irq(&current->sighand->siglock);
58#ifdef DEBUG_SIG
59 printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n",
60 saveset, newset, regs, regs->rip);
61#endif
62 regs->rax = -EINTR;
63 while (1) {
64 current->state = TASK_INTERRUPTIBLE;
65 schedule();
66 if (do_signal(regs, &saveset))
67 return -EINTR;
68 }
69}
70
71asmlinkage long
72sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 41sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
73 struct pt_regs *regs) 42 struct pt_regs *regs)
74{ 43{
@@ -308,11 +277,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
308#endif 277#endif
309 278
310 /* Set up registers for signal handler */ 279 /* Set up registers for signal handler */
311 {
312 struct exec_domain *ed = current_thread_info()->exec_domain;
313 if (unlikely(ed && ed->signal_invmap && sig < 32))
314 sig = ed->signal_invmap[sig];
315 }
316 regs->rdi = sig; 280 regs->rdi = sig;
317 /* In case the signal handler was declared without prototypes */ 281 /* In case the signal handler was declared without prototypes */
318 regs->rax = 0; 282 regs->rax = 0;
@@ -341,11 +305,11 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
341 current->comm, current->pid, frame, regs->rip, frame->pretcode); 305 current->comm, current->pid, frame, regs->rip, frame->pretcode);
342#endif 306#endif
343 307
344 return 1; 308 return 0;
345 309
346give_sigsegv: 310give_sigsegv:
347 force_sigsegv(sig, current); 311 force_sigsegv(sig, current);
348 return 0; 312 return -EFAULT;
349} 313}
350 314
351/* 315/*
@@ -408,7 +372,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
408#endif 372#endif
409 ret = setup_rt_frame(sig, ka, info, oldset, regs); 373 ret = setup_rt_frame(sig, ka, info, oldset, regs);
410 374
411 if (ret) { 375 if (ret == 0) {
412 spin_lock_irq(&current->sighand->siglock); 376 spin_lock_irq(&current->sighand->siglock);
413 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask); 377 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
414 if (!(ka->sa.sa_flags & SA_NODEFER)) 378 if (!(ka->sa.sa_flags & SA_NODEFER))
@@ -425,11 +389,12 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
425 * want to handle. Thus you cannot kill init even with a SIGKILL even by 389 * want to handle. Thus you cannot kill init even with a SIGKILL even by
426 * mistake. 390 * mistake.
427 */ 391 */
428int do_signal(struct pt_regs *regs, sigset_t *oldset) 392static void do_signal(struct pt_regs *regs)
429{ 393{
430 struct k_sigaction ka; 394 struct k_sigaction ka;
431 siginfo_t info; 395 siginfo_t info;
432 int signr; 396 int signr;
397 sigset_t *oldset;
433 398
434 /* 399 /*
435 * We want the common case to go fast, which 400 * We want the common case to go fast, which
@@ -438,9 +403,11 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
438 * if so. 403 * if so.
439 */ 404 */
440 if (!user_mode(regs)) 405 if (!user_mode(regs))
441 return 1; 406 return;
442 407
443 if (!oldset) 408 if (test_thread_flag(TIF_RESTORE_SIGMASK))
409 oldset = &current->saved_sigmask;
410 else
444 oldset = &current->blocked; 411 oldset = &current->blocked;
445 412
446 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 413 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
@@ -454,30 +421,46 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
454 set_debugreg(current->thread.debugreg7, 7); 421 set_debugreg(current->thread.debugreg7, 7);
455 422
456 /* Whee! Actually deliver the signal. */ 423 /* Whee! Actually deliver the signal. */
457 return handle_signal(signr, &info, &ka, oldset, regs); 424 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
425 /* a signal was successfully delivered; the saved
426 * sigmask will have been stored in the signal frame,
427 * and will be restored by sigreturn, so we can simply
428 * clear the TIF_RESTORE_SIGMASK flag */
429 clear_thread_flag(TIF_RESTORE_SIGMASK);
430 }
431 return;
458 } 432 }
459 433
460 /* Did we come from a system call? */ 434 /* Did we come from a system call? */
461 if ((long)regs->orig_rax >= 0) { 435 if ((long)regs->orig_rax >= 0) {
462 /* Restart the system call - no handlers present */ 436 /* Restart the system call - no handlers present */
463 long res = regs->rax; 437 long res = regs->rax;
464 if (res == -ERESTARTNOHAND || 438 switch (res) {
465 res == -ERESTARTSYS || 439 case -ERESTARTNOHAND:
466 res == -ERESTARTNOINTR) { 440 case -ERESTARTSYS:
441 case -ERESTARTNOINTR:
467 regs->rax = regs->orig_rax; 442 regs->rax = regs->orig_rax;
468 regs->rip -= 2; 443 regs->rip -= 2;
469 } 444 break;
470 if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) { 445 case -ERESTART_RESTARTBLOCK:
471 regs->rax = test_thread_flag(TIF_IA32) ? 446 regs->rax = test_thread_flag(TIF_IA32) ?
472 __NR_ia32_restart_syscall : 447 __NR_ia32_restart_syscall :
473 __NR_restart_syscall; 448 __NR_restart_syscall;
474 regs->rip -= 2; 449 regs->rip -= 2;
450 break;
475 } 451 }
476 } 452 }
477 return 0; 453
454 /* if there's no signal to deliver, we just put the saved sigmask
455 back. */
456 if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
457 clear_thread_flag(TIF_RESTORE_SIGMASK);
458 sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
459 }
478} 460}
479 461
480void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags) 462void
463do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
481{ 464{
482#ifdef DEBUG_SIG 465#ifdef DEBUG_SIG
483 printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", 466 printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n",
@@ -491,8 +474,8 @@ void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_
491 } 474 }
492 475
493 /* deal with pending signal delivery */ 476 /* deal with pending signal delivery */
494 if (thread_info_flags & _TIF_SIGPENDING) 477 if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
495 do_signal(regs,oldset); 478 do_signal(regs);
496} 479}
497 480
498void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 481void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 06af6ca60129..4f67697f5036 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -522,26 +522,3 @@ asmlinkage void smp_call_function_interrupt(void)
522 } 522 }
523} 523}
524 524
525int safe_smp_processor_id(void)
526{
527 unsigned apicid, i;
528
529 if (disable_apic)
530 return 0;
531
532 apicid = hard_smp_processor_id();
533 if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid)
534 return apicid;
535
536 for (i = 0; i < NR_CPUS; ++i) {
537 if (x86_cpu_to_apicid[i] == apicid)
538 return i;
539 }
540
541 /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI,
542 * or called too early. Either way, we must be CPU 0. */
543 if (x86_cpu_to_apicid[0] == BAD_APICID)
544 return 0;
545
546 return 0; /* Should not happen */
547}
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 3ae9ffddddc0..7b7a6870288a 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -1091,7 +1091,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
1091 /* 1091 /*
1092 * Switch from PIC to APIC mode. 1092 * Switch from PIC to APIC mode.
1093 */ 1093 */
1094 connect_bsp_APIC();
1095 setup_local_APIC(); 1094 setup_local_APIC();
1096 1095
1097 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { 1096 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
@@ -1176,12 +1175,9 @@ int __cpuinit __cpu_up(unsigned int cpu)
1176void __init smp_cpus_done(unsigned int max_cpus) 1175void __init smp_cpus_done(unsigned int max_cpus)
1177{ 1176{
1178 smp_cleanup_boot(); 1177 smp_cleanup_boot();
1179
1180#ifdef CONFIG_X86_IO_APIC
1181 setup_ioapic_dest(); 1178 setup_ioapic_dest();
1182#endif
1183
1184 check_nmi_watchdog(); 1179 check_nmi_watchdog();
1180 time_init_gtod();
1185} 1181}
1186 1182
1187#ifdef CONFIG_HOTPLUG_CPU 1183#ifdef CONFIG_HOTPLUG_CPU
@@ -1234,6 +1230,8 @@ int __cpu_disable(void)
1234 if (cpu == 0) 1230 if (cpu == 0)
1235 return -EBUSY; 1231 return -EBUSY;
1236 1232
1233 if (nmi_watchdog == NMI_LOCAL_APIC)
1234 stop_apic_nmi_watchdog(NULL);
1237 clear_local_APIC(); 1235 clear_local_APIC();
1238 1236
1239 /* 1237 /*
@@ -1273,11 +1271,11 @@ void __cpu_die(unsigned int cpu)
1273 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1271 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1274} 1272}
1275 1273
1276__init int setup_additional_cpus(char *s) 1274static __init int setup_additional_cpus(char *s)
1277{ 1275{
1278 return get_option(&s, &additional_cpus); 1276 return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
1279} 1277}
1280__setup("additional_cpus=", setup_additional_cpus); 1278early_param("additional_cpus", setup_additional_cpus);
1281 1279
1282#else /* ... !CONFIG_HOTPLUG_CPU */ 1280#else /* ... !CONFIG_HOTPLUG_CPU */
1283 1281
diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c
index 32cf55eb9af8..6026b31d037e 100644
--- a/arch/x86_64/kernel/stacktrace.c
+++ b/arch/x86_64/kernel/stacktrace.c
@@ -7,215 +7,49 @@
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/stacktrace.h> 9#include <linux/stacktrace.h>
10#include <linux/module.h>
11#include <asm/stacktrace.h>
10 12
11#include <asm/smp.h> 13static void save_stack_warning(void *data, char *msg)
12
13static inline int
14in_range(unsigned long start, unsigned long addr, unsigned long end)
15{ 14{
16 return addr >= start && addr <= end;
17} 15}
18 16
19static unsigned long 17static void
20get_stack_end(struct task_struct *task, unsigned long stack) 18save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
21{ 19{
22 unsigned long stack_start, stack_end, flags;
23 int i, cpu;
24
25 /*
26 * The most common case is that we are in the task stack:
27 */
28 stack_start = (unsigned long)task->thread_info;
29 stack_end = stack_start + THREAD_SIZE;
30
31 if (in_range(stack_start, stack, stack_end))
32 return stack_end;
33
34 /*
35 * We are in an interrupt if irqstackptr is set:
36 */
37 raw_local_irq_save(flags);
38 cpu = safe_smp_processor_id();
39 stack_end = (unsigned long)cpu_pda(cpu)->irqstackptr;
40
41 if (stack_end) {
42 stack_start = stack_end & ~(IRQSTACKSIZE-1);
43 if (in_range(stack_start, stack, stack_end))
44 goto out_restore;
45 /*
46 * We get here if we are in an IRQ context but we
47 * are also in an exception stack.
48 */
49 }
50
51 /*
52 * Iterate over all exception stacks, and figure out whether
53 * 'stack' is in one of them:
54 */
55 for (i = 0; i < N_EXCEPTION_STACKS; i++) {
56 /*
57 * set 'end' to the end of the exception stack.
58 */
59 stack_end = per_cpu(init_tss, cpu).ist[i];
60 stack_start = stack_end - EXCEPTION_STKSZ;
61
62 /*
63 * Is 'stack' above this exception frame's end?
64 * If yes then skip to the next frame.
65 */
66 if (stack >= stack_end)
67 continue;
68 /*
69 * Is 'stack' above this exception frame's start address?
70 * If yes then we found the right frame.
71 */
72 if (stack >= stack_start)
73 goto out_restore;
74
75 /*
76 * If this is a debug stack, and if it has a larger size than
77 * the usual exception stacks, then 'stack' might still
78 * be within the lower portion of the debug stack:
79 */
80#if DEBUG_STKSZ > EXCEPTION_STKSZ
81 if (i == DEBUG_STACK - 1 && stack >= stack_end - DEBUG_STKSZ) {
82 /*
83 * Black magic. A large debug stack is composed of
84 * multiple exception stack entries, which we
85 * iterate through now. Dont look:
86 */
87 do {
88 stack_end -= EXCEPTION_STKSZ;
89 stack_start -= EXCEPTION_STKSZ;
90 } while (stack < stack_start);
91
92 goto out_restore;
93 }
94#endif
95 }
96 /*
97 * Ok, 'stack' is not pointing to any of the system stacks.
98 */
99 stack_end = 0;
100
101out_restore:
102 raw_local_irq_restore(flags);
103
104 return stack_end;
105} 20}
106 21
107 22static int save_stack_stack(void *data, char *name)
108/*
109 * Save stack-backtrace addresses into a stack_trace buffer:
110 */
111static inline unsigned long
112save_context_stack(struct stack_trace *trace, unsigned int skip,
113 unsigned long stack, unsigned long stack_end)
114{ 23{
115 unsigned long addr; 24 struct stack_trace *trace = (struct stack_trace *)data;
116 25 return trace->all_contexts ? 0 : -1;
117#ifdef CONFIG_FRAME_POINTER 26}
118 unsigned long prev_stack = 0;
119 27
120 while (in_range(prev_stack, stack, stack_end)) { 28static void save_stack_address(void *data, unsigned long addr)
121 pr_debug("stack: %p\n", (void *)stack); 29{
122 addr = (unsigned long)(((unsigned long *)stack)[1]); 30 struct stack_trace *trace = (struct stack_trace *)data;
123 pr_debug("addr: %p\n", (void *)addr); 31 if (trace->skip > 0) {
124 if (!skip) 32 trace->skip--;
125 trace->entries[trace->nr_entries++] = addr-1; 33 return;
126 else
127 skip--;
128 if (trace->nr_entries >= trace->max_entries)
129 break;
130 if (!addr)
131 return 0;
132 /*
133 * Stack frames must go forwards (otherwise a loop could
134 * happen if the stackframe is corrupted), so we move
135 * prev_stack forwards:
136 */
137 prev_stack = stack;
138 stack = (unsigned long)(((unsigned long *)stack)[0]);
139 }
140 pr_debug("invalid: %p\n", (void *)stack);
141#else
142 while (stack < stack_end) {
143 addr = ((unsigned long *)stack)[0];
144 stack += sizeof(long);
145 if (__kernel_text_address(addr)) {
146 if (!skip)
147 trace->entries[trace->nr_entries++] = addr-1;
148 else
149 skip--;
150 if (trace->nr_entries >= trace->max_entries)
151 break;
152 }
153 } 34 }
154#endif 35 if (trace->nr_entries < trace->max_entries - 1)
155 return stack; 36 trace->entries[trace->nr_entries++] = addr;
156} 37}
157 38
158#define MAX_STACKS 10 39static struct stacktrace_ops save_stack_ops = {
40 .warning = save_stack_warning,
41 .warning_symbol = save_stack_warning_symbol,
42 .stack = save_stack_stack,
43 .address = save_stack_address,
44};
159 45
160/* 46/*
161 * Save stack-backtrace addresses into a stack_trace buffer. 47 * Save stack-backtrace addresses into a stack_trace buffer.
162 * If all_contexts is set, all contexts (hardirq, softirq and process)
163 * are saved. If not set then only the current context is saved.
164 */ 48 */
165void save_stack_trace(struct stack_trace *trace, 49void save_stack_trace(struct stack_trace *trace, struct task_struct *task)
166 struct task_struct *task, int all_contexts,
167 unsigned int skip)
168{ 50{
169 unsigned long stack = (unsigned long)&stack; 51 dump_trace(task, NULL, NULL, &save_stack_ops, trace);
170 int i, nr_stacks = 0, stacks_done[MAX_STACKS]; 52 trace->entries[trace->nr_entries++] = ULONG_MAX;
171
172 WARN_ON(trace->nr_entries || !trace->max_entries);
173
174 if (!task)
175 task = current;
176
177 pr_debug("task: %p, ti: %p\n", task, task->thread_info);
178
179 if (!task || task == current) {
180 /* Grab rbp right from our regs: */
181 asm ("mov %%rbp, %0" : "=r" (stack));
182 pr_debug("rbp: %p\n", (void *)stack);
183 } else {
184 /* rbp is the last reg pushed by switch_to(): */
185 stack = task->thread.rsp;
186 pr_debug("other task rsp: %p\n", (void *)stack);
187 stack = (unsigned long)(((unsigned long *)stack)[0]);
188 pr_debug("other task rbp: %p\n", (void *)stack);
189 }
190
191 while (1) {
192 unsigned long stack_end = get_stack_end(task, stack);
193
194 pr_debug("stack: %p\n", (void *)stack);
195 pr_debug("stack end: %p\n", (void *)stack_end);
196
197 /*
198 * Invalid stack addres?
199 */
200 if (!stack_end)
201 return;
202 /*
203 * Were we in this stack already? (recursion)
204 */
205 for (i = 0; i < nr_stacks; i++)
206 if (stacks_done[i] == stack_end)
207 return;
208 stacks_done[nr_stacks] = stack_end;
209
210 stack = save_context_stack(trace, skip, stack, stack_end);
211 if (!all_contexts || !stack ||
212 trace->nr_entries >= trace->max_entries)
213 return;
214 trace->entries[trace->nr_entries++] = ULONG_MAX;
215 if (trace->nr_entries >= trace->max_entries)
216 return;
217 if (++nr_stacks >= MAX_STACKS)
218 return;
219 }
220} 53}
54EXPORT_SYMBOL(save_stack_trace);
221 55
diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c
index 5530dda3f27a..cbabfdf78e06 100644
--- a/arch/x86_64/kernel/tce.c
+++ b/arch/x86_64/kernel/tce.c
@@ -1,4 +1,6 @@
1/* 1/*
2 * This file manages the translation entries for the IBM Calgary IOMMU.
3 *
2 * Derived from arch/powerpc/platforms/pseries/iommu.c 4 * Derived from arch/powerpc/platforms/pseries/iommu.c
3 * 5 *
4 * Copyright (C) IBM Corporation, 2006 6 * Copyright (C) IBM Corporation, 2006
@@ -104,14 +106,6 @@ static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
104 /* set the tce table size - measured in entries */ 106 /* set the tce table size - measured in entries */
105 tbl->it_size = table_size_to_number_of_entries(specified_table_size); 107 tbl->it_size = table_size_to_number_of_entries(specified_table_size);
106 108
107 tbl->it_base = (unsigned long)tce_table_kva[dev->bus->number];
108 if (!tbl->it_base) {
109 printk(KERN_ERR "Calgary: iommu_table_setparms: "
110 "no table allocated?!\n");
111 ret = -ENOMEM;
112 goto done;
113 }
114
115 /* 109 /*
116 * number of bytes needed for the bitmap size in number of 110 * number of bytes needed for the bitmap size in number of
117 * entries; we need one bit per entry 111 * entries; we need one bit per entry
@@ -160,8 +154,6 @@ int build_tce_table(struct pci_dev *dev, void __iomem *bbar)
160 if (ret) 154 if (ret)
161 goto free_tbl; 155 goto free_tbl;
162 156
163 tce_free(tbl, 0, tbl->it_size);
164
165 tbl->bbar = bbar; 157 tbl->bbar = bbar;
166 158
167 /* 159 /*
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 7700e6cd2bd9..1c255ee76e7c 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -24,6 +24,8 @@
24#include <linux/device.h> 24#include <linux/device.h>
25#include <linux/sysdev.h> 25#include <linux/sysdev.h>
26#include <linux/bcd.h> 26#include <linux/bcd.h>
27#include <linux/notifier.h>
28#include <linux/cpu.h>
27#include <linux/kallsyms.h> 29#include <linux/kallsyms.h>
28#include <linux/acpi.h> 30#include <linux/acpi.h>
29#ifdef CONFIG_ACPI 31#ifdef CONFIG_ACPI
@@ -39,9 +41,7 @@
39#include <asm/sections.h> 41#include <asm/sections.h>
40#include <linux/cpufreq.h> 42#include <linux/cpufreq.h>
41#include <linux/hpet.h> 43#include <linux/hpet.h>
42#ifdef CONFIG_X86_LOCAL_APIC
43#include <asm/apic.h> 44#include <asm/apic.h>
44#endif
45 45
46#ifdef CONFIG_CPU_FREQ 46#ifdef CONFIG_CPU_FREQ
47static void cpufreq_delayed_get(void); 47static void cpufreq_delayed_get(void);
@@ -49,7 +49,7 @@ static void cpufreq_delayed_get(void);
49extern void i8254_timer_resume(void); 49extern void i8254_timer_resume(void);
50extern int using_apic_timer; 50extern int using_apic_timer;
51 51
52static char *time_init_gtod(void); 52static char *timename = NULL;
53 53
54DEFINE_SPINLOCK(rtc_lock); 54DEFINE_SPINLOCK(rtc_lock);
55EXPORT_SYMBOL(rtc_lock); 55EXPORT_SYMBOL(rtc_lock);
@@ -187,20 +187,15 @@ unsigned long profile_pc(struct pt_regs *regs)
187{ 187{
188 unsigned long pc = instruction_pointer(regs); 188 unsigned long pc = instruction_pointer(regs);
189 189
190 /* Assume the lock function has either no stack frame or only a single 190 /* Assume the lock function has either no stack frame or a copy
191 word. This checks if the address on the stack looks like a kernel 191 of eflags from PUSHF
192 text address. 192 Eflags always has bits 22 and up cleared unlike kernel addresses. */
193 There is a small window for false hits, but in that case the tick
194 is just accounted to the spinlock function.
195 Better would be to write these functions in assembler again
196 and check exactly. */
197 if (!user_mode(regs) && in_lock_functions(pc)) { 193 if (!user_mode(regs) && in_lock_functions(pc)) {
198 char *v = *(char **)regs->rsp; 194 unsigned long *sp = (unsigned long *)regs->rsp;
199 if ((v >= _stext && v <= _etext) || 195 if (sp[0] >> 22)
200 (v >= _sinittext && v <= _einittext) || 196 return sp[0];
201 (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END)) 197 if (sp[1] >> 22)
202 return (unsigned long)v; 198 return sp[1];
203 return ((unsigned long *)regs->rsp)[1];
204 } 199 }
205 return pc; 200 return pc;
206} 201}
@@ -281,6 +276,7 @@ static void set_rtc_mmss(unsigned long nowtime)
281 * Note: This function is required to return accurate 276 * Note: This function is required to return accurate
282 * time even in the absence of multiple timer ticks. 277 * time even in the absence of multiple timer ticks.
283 */ 278 */
279static inline unsigned long long cycles_2_ns(unsigned long long cyc);
284unsigned long long monotonic_clock(void) 280unsigned long long monotonic_clock(void)
285{ 281{
286 unsigned long seq; 282 unsigned long seq;
@@ -305,8 +301,7 @@ unsigned long long monotonic_clock(void)
305 base = monotonic_base; 301 base = monotonic_base;
306 } while (read_seqretry(&xtime_lock, seq)); 302 } while (read_seqretry(&xtime_lock, seq));
307 this_offset = get_cycles_sync(); 303 this_offset = get_cycles_sync();
308 /* FIXME: 1000 or 1000000? */ 304 offset = cycles_2_ns(this_offset - last_offset);
309 offset = (this_offset - last_offset)*1000 / cpu_khz;
310 } 305 }
311 return base + offset; 306 return base + offset;
312} 307}
@@ -410,8 +405,7 @@ void main_timer_handler(struct pt_regs *regs)
410 offset %= USEC_PER_TICK; 405 offset %= USEC_PER_TICK;
411 } 406 }
412 407
413 /* FIXME: 1000 or 1000000? */ 408 monotonic_base += cycles_2_ns(tsc - vxtime.last_tsc);
414 monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz;
415 409
416 vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; 410 vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot;
417 411
@@ -441,12 +435,8 @@ void main_timer_handler(struct pt_regs *regs)
441 * have to call the local interrupt handler. 435 * have to call the local interrupt handler.
442 */ 436 */
443 437
444#ifndef CONFIG_X86_LOCAL_APIC
445 profile_tick(CPU_PROFILING, regs);
446#else
447 if (!using_apic_timer) 438 if (!using_apic_timer)
448 smp_local_timer_interrupt(regs); 439 smp_local_timer_interrupt(regs);
449#endif
450 440
451/* 441/*
452 * If we have an externally synchronized Linux clock, then update CMOS clock 442 * If we have an externally synchronized Linux clock, then update CMOS clock
@@ -470,10 +460,8 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
470 if (apic_runs_main_timer > 1) 460 if (apic_runs_main_timer > 1)
471 return IRQ_HANDLED; 461 return IRQ_HANDLED;
472 main_timer_handler(regs); 462 main_timer_handler(regs);
473#ifdef CONFIG_X86_LOCAL_APIC
474 if (using_apic_timer) 463 if (using_apic_timer)
475 smp_send_timer_broadcast_ipi(); 464 smp_send_timer_broadcast_ipi();
476#endif
477 return IRQ_HANDLED; 465 return IRQ_HANDLED;
478} 466}
479 467
@@ -893,11 +881,17 @@ static struct irqaction irq0 = {
893 timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL 881 timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL
894}; 882};
895 883
896void __init time_init(void) 884static int __cpuinit
885time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu)
897{ 886{
898 char *timename; 887 unsigned cpu = (unsigned long) hcpu;
899 char *gtod; 888 if (action == CPU_ONLINE)
889 vsyscall_set_cpu(cpu);
890 return NOTIFY_DONE;
891}
900 892
893void __init time_init(void)
894{
901 if (nohpet) 895 if (nohpet)
902 vxtime.hpet_address = 0; 896 vxtime.hpet_address = 0;
903 897
@@ -931,18 +925,17 @@ void __init time_init(void)
931 } 925 }
932 926
933 vxtime.mode = VXTIME_TSC; 927 vxtime.mode = VXTIME_TSC;
934 gtod = time_init_gtod();
935
936 printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n",
937 vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod);
938 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
939 cpu_khz / 1000, cpu_khz % 1000);
940 vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; 928 vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz;
941 vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; 929 vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
942 vxtime.last_tsc = get_cycles_sync(); 930 vxtime.last_tsc = get_cycles_sync();
931 set_cyc2ns_scale(cpu_khz);
943 setup_irq(0, &irq0); 932 setup_irq(0, &irq0);
933 hotcpu_notifier(time_cpu_notifier, 0);
934 time_cpu_notifier(NULL, CPU_ONLINE, (void *)(long)smp_processor_id());
944 935
945 set_cyc2ns_scale(cpu_khz); 936#ifndef CONFIG_SMP
937 time_init_gtod();
938#endif
946} 939}
947 940
948/* 941/*
@@ -973,12 +966,18 @@ __cpuinit int unsynchronized_tsc(void)
973/* 966/*
974 * Decide what mode gettimeofday should use. 967 * Decide what mode gettimeofday should use.
975 */ 968 */
976__init static char *time_init_gtod(void) 969void time_init_gtod(void)
977{ 970{
978 char *timetype; 971 char *timetype;
979 972
980 if (unsynchronized_tsc()) 973 if (unsynchronized_tsc())
981 notsc = 1; 974 notsc = 1;
975
976 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
977 vgetcpu_mode = VGETCPU_RDTSCP;
978 else
979 vgetcpu_mode = VGETCPU_LSL;
980
982 if (vxtime.hpet_address && notsc) { 981 if (vxtime.hpet_address && notsc) {
983 timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; 982 timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
984 if (hpet_use_timer) 983 if (hpet_use_timer)
@@ -1001,7 +1000,16 @@ __init static char *time_init_gtod(void)
1001 timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; 1000 timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC";
1002 vxtime.mode = VXTIME_TSC; 1001 vxtime.mode = VXTIME_TSC;
1003 } 1002 }
1004 return timetype; 1003
1004 printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n",
1005 vxtime_hz / 1000000, vxtime_hz % 1000000, timename, timetype);
1006 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
1007 cpu_khz / 1000, cpu_khz % 1000);
1008 vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz;
1009 vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
1010 vxtime.last_tsc = get_cycles_sync();
1011
1012 set_cyc2ns_scale(cpu_khz);
1005} 1013}
1006 1014
1007__setup("report_lost_ticks", time_setup); 1015__setup("report_lost_ticks", time_setup);
@@ -1031,8 +1039,16 @@ static int timer_resume(struct sys_device *dev)
1031 unsigned long flags; 1039 unsigned long flags;
1032 unsigned long sec; 1040 unsigned long sec;
1033 unsigned long ctime = get_cmos_time(); 1041 unsigned long ctime = get_cmos_time();
1034 unsigned long sleep_length = (ctime - sleep_start) * HZ; 1042 long sleep_length = (ctime - sleep_start) * HZ;
1035 1043
1044 if (sleep_length < 0) {
1045 printk(KERN_WARNING "Time skew detected in timer resume!\n");
1046 /* The time after the resume must not be earlier than the time
1047 * before the suspend or some nasty things will happen
1048 */
1049 sleep_length = 0;
1050 ctime = sleep_start;
1051 }
1036 if (vxtime.hpet_address) 1052 if (vxtime.hpet_address)
1037 hpet_reenable(); 1053 hpet_reenable();
1038 else 1054 else
diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S
index 23a03eb91fc7..c79b99a9e2f6 100644
--- a/arch/x86_64/kernel/trampoline.S
+++ b/arch/x86_64/kernel/trampoline.S
@@ -64,7 +64,7 @@ idt_48:
64 .word 0, 0 # idt base = 0L 64 .word 0, 0 # idt base = 0L
65 65
66gdt_48: 66gdt_48:
67 .short __KERNEL32_CS + 7 # gdt limit 67 .short GDT_ENTRIES*8 - 1 # gdt limit
68 .long cpu_gdt_table-__START_KERNEL_map 68 .long cpu_gdt_table-__START_KERNEL_map
69 69
70.globl trampoline_end 70.globl trampoline_end
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index b1249774d1e8..01f2a8d254c2 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -45,6 +45,7 @@
45#include <asm/pda.h> 45#include <asm/pda.h>
46#include <asm/proto.h> 46#include <asm/proto.h>
47#include <asm/nmi.h> 47#include <asm/nmi.h>
48#include <asm/stacktrace.h>
48 49
49asmlinkage void divide_error(void); 50asmlinkage void divide_error(void);
50asmlinkage void debug(void); 51asmlinkage void debug(void);
@@ -142,7 +143,7 @@ void printk_address(unsigned long address)
142#endif 143#endif
143 144
144static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 145static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
145 unsigned *usedp, const char **idp) 146 unsigned *usedp, char **idp)
146{ 147{
147 static char ids[][8] = { 148 static char ids[][8] = {
148 [DEBUG_STACK - 1] = "#DB", 149 [DEBUG_STACK - 1] = "#DB",
@@ -161,26 +162,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
161 * 'stack' is in one of them: 162 * 'stack' is in one of them:
162 */ 163 */
163 for (k = 0; k < N_EXCEPTION_STACKS; k++) { 164 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
164 unsigned long end; 165 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
165
166 /*
167 * set 'end' to the end of the exception stack.
168 */
169 switch (k + 1) {
170 /*
171 * TODO: this block is not needed i think, because
172 * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK]
173 * properly too.
174 */
175#if DEBUG_STKSZ > EXCEPTION_STKSZ
176 case DEBUG_STACK:
177 end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
178 break;
179#endif
180 default:
181 end = per_cpu(orig_ist, cpu).ist[k];
182 break;
183 }
184 /* 166 /*
185 * Is 'stack' above this exception frame's end? 167 * Is 'stack' above this exception frame's end?
186 * If yes then skip to the next frame. 168 * If yes then skip to the next frame.
@@ -234,13 +216,19 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
234 return NULL; 216 return NULL;
235} 217}
236 218
237static int show_trace_unwind(struct unwind_frame_info *info, void *context) 219struct ops_and_data {
220 struct stacktrace_ops *ops;
221 void *data;
222};
223
224static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
238{ 225{
226 struct ops_and_data *oad = (struct ops_and_data *)context;
239 int n = 0; 227 int n = 0;
240 228
241 while (unwind(info) == 0 && UNW_PC(info)) { 229 while (unwind(info) == 0 && UNW_PC(info)) {
242 n++; 230 n++;
243 printk_address(UNW_PC(info)); 231 oad->ops->address(oad->data, UNW_PC(info));
244 if (arch_unw_user_mode(info)) 232 if (arch_unw_user_mode(info))
245 break; 233 break;
246 } 234 }
@@ -254,45 +242,53 @@ static int show_trace_unwind(struct unwind_frame_info *info, void *context)
254 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 242 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
255 */ 243 */
256 244
257void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack) 245void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack,
246 struct stacktrace_ops *ops, void *data)
258{ 247{
259 const unsigned cpu = safe_smp_processor_id(); 248 const unsigned cpu = smp_processor_id();
260 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; 249 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
261 unsigned used = 0; 250 unsigned used = 0;
262 251
263 printk("\nCall Trace:\n");
264
265 if (!tsk) 252 if (!tsk)
266 tsk = current; 253 tsk = current;
267 254
268 if (call_trace >= 0) { 255 if (call_trace >= 0) {
269 int unw_ret = 0; 256 int unw_ret = 0;
270 struct unwind_frame_info info; 257 struct unwind_frame_info info;
258 struct ops_and_data oad = { .ops = ops, .data = data };
271 259
272 if (regs) { 260 if (regs) {
273 if (unwind_init_frame_info(&info, tsk, regs) == 0) 261 if (unwind_init_frame_info(&info, tsk, regs) == 0)
274 unw_ret = show_trace_unwind(&info, NULL); 262 unw_ret = dump_trace_unwind(&info, &oad);
275 } else if (tsk == current) 263 } else if (tsk == current)
276 unw_ret = unwind_init_running(&info, show_trace_unwind, NULL); 264 unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
277 else { 265 else {
278 if (unwind_init_blocked(&info, tsk) == 0) 266 if (unwind_init_blocked(&info, tsk) == 0)
279 unw_ret = show_trace_unwind(&info, NULL); 267 unw_ret = dump_trace_unwind(&info, &oad);
280 } 268 }
281 if (unw_ret > 0) { 269 if (unw_ret > 0) {
282 if (call_trace == 1 && !arch_unw_user_mode(&info)) { 270 if (call_trace == 1 && !arch_unw_user_mode(&info)) {
283 print_symbol("DWARF2 unwinder stuck at %s\n", 271 ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
284 UNW_PC(&info)); 272 UNW_PC(&info));
285 if ((long)UNW_SP(&info) < 0) { 273 if ((long)UNW_SP(&info) < 0) {
286 printk("Leftover inexact backtrace:\n"); 274 ops->warning(data, "Leftover inexact backtrace:\n");
287 stack = (unsigned long *)UNW_SP(&info); 275 stack = (unsigned long *)UNW_SP(&info);
276 if (!stack)
277 return;
288 } else 278 } else
289 printk("Full inexact backtrace again:\n"); 279 ops->warning(data, "Full inexact backtrace again:\n");
290 } else if (call_trace >= 1) 280 } else if (call_trace >= 1)
291 return; 281 return;
292 else 282 else
293 printk("Full inexact backtrace again:\n"); 283 ops->warning(data, "Full inexact backtrace again:\n");
294 } else 284 } else
295 printk("Inexact backtrace:\n"); 285 ops->warning(data, "Inexact backtrace:\n");
286 }
287 if (!stack) {
288 unsigned long dummy;
289 stack = &dummy;
290 if (tsk && tsk != current)
291 stack = (unsigned long *)tsk->thread.rsp;
296 } 292 }
297 293
298 /* 294 /*
@@ -303,7 +299,9 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
303#define HANDLE_STACK(cond) \ 299#define HANDLE_STACK(cond) \
304 do while (cond) { \ 300 do while (cond) { \
305 unsigned long addr = *stack++; \ 301 unsigned long addr = *stack++; \
306 if (kernel_text_address(addr)) { \ 302 if (oops_in_progress ? \
303 __kernel_text_address(addr) : \
304 kernel_text_address(addr)) { \
307 /* \ 305 /* \
308 * If the address is either in the text segment of the \ 306 * If the address is either in the text segment of the \
309 * kernel, or in the region which contains vmalloc'ed \ 307 * kernel, or in the region which contains vmalloc'ed \
@@ -312,7 +310,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
312 * down the cause of the crash will be able to figure \ 310 * down the cause of the crash will be able to figure \
313 * out the call path that was taken. \ 311 * out the call path that was taken. \
314 */ \ 312 */ \
315 printk_address(addr); \ 313 ops->address(data, addr); \
316 } \ 314 } \
317 } while (0) 315 } while (0)
318 316
@@ -321,16 +319,17 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
321 * current stack address. If the stacks consist of nested 319 * current stack address. If the stacks consist of nested
322 * exceptions 320 * exceptions
323 */ 321 */
324 for ( ; ; ) { 322 for (;;) {
325 const char *id; 323 char *id;
326 unsigned long *estack_end; 324 unsigned long *estack_end;
327 estack_end = in_exception_stack(cpu, (unsigned long)stack, 325 estack_end = in_exception_stack(cpu, (unsigned long)stack,
328 &used, &id); 326 &used, &id);
329 327
330 if (estack_end) { 328 if (estack_end) {
331 printk(" <%s>", id); 329 if (ops->stack(data, id) < 0)
330 break;
332 HANDLE_STACK (stack < estack_end); 331 HANDLE_STACK (stack < estack_end);
333 printk(" <EOE>"); 332 ops->stack(data, "<EOE>");
334 /* 333 /*
335 * We link to the next stack via the 334 * We link to the next stack via the
336 * second-to-last pointer (index -2 to end) in the 335 * second-to-last pointer (index -2 to end) in the
@@ -345,7 +344,8 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
345 (IRQSTACKSIZE - 64) / sizeof(*irqstack); 344 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
346 345
347 if (stack >= irqstack && stack < irqstack_end) { 346 if (stack >= irqstack && stack < irqstack_end) {
348 printk(" <IRQ>"); 347 if (ops->stack(data, "IRQ") < 0)
348 break;
349 HANDLE_STACK (stack < irqstack_end); 349 HANDLE_STACK (stack < irqstack_end);
350 /* 350 /*
351 * We link to the next stack (which would be 351 * We link to the next stack (which would be
@@ -354,7 +354,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
354 */ 354 */
355 stack = (unsigned long *) (irqstack_end[-1]); 355 stack = (unsigned long *) (irqstack_end[-1]);
356 irqstack_end = NULL; 356 irqstack_end = NULL;
357 printk(" <EOI>"); 357 ops->stack(data, "EOI");
358 continue; 358 continue;
359 } 359 }
360 } 360 }
@@ -362,19 +362,57 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
362 } 362 }
363 363
364 /* 364 /*
365 * This prints the process stack: 365 * This handles the process stack:
366 */ 366 */
367 HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); 367 HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
368#undef HANDLE_STACK 368#undef HANDLE_STACK
369}
370EXPORT_SYMBOL(dump_trace);
371
372static void
373print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
374{
375 print_symbol(msg, symbol);
376 printk("\n");
377}
378
379static void print_trace_warning(void *data, char *msg)
380{
381 printk("%s\n", msg);
382}
383
384static int print_trace_stack(void *data, char *name)
385{
386 printk(" <%s> ", name);
387 return 0;
388}
389
390static void print_trace_address(void *data, unsigned long addr)
391{
392 printk_address(addr);
393}
394
395static struct stacktrace_ops print_trace_ops = {
396 .warning = print_trace_warning,
397 .warning_symbol = print_trace_warning_symbol,
398 .stack = print_trace_stack,
399 .address = print_trace_address,
400};
369 401
402void
403show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
404{
405 printk("\nCall Trace:\n");
406 dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
370 printk("\n"); 407 printk("\n");
371} 408}
372 409
373static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp) 410static void
411_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
374{ 412{
375 unsigned long *stack; 413 unsigned long *stack;
376 int i; 414 int i;
377 const int cpu = safe_smp_processor_id(); 415 const int cpu = smp_processor_id();
378 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); 416 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
379 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); 417 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
380 418
@@ -428,7 +466,7 @@ void show_registers(struct pt_regs *regs)
428 int i; 466 int i;
429 int in_kernel = !user_mode(regs); 467 int in_kernel = !user_mode(regs);
430 unsigned long rsp; 468 unsigned long rsp;
431 const int cpu = safe_smp_processor_id(); 469 const int cpu = smp_processor_id();
432 struct task_struct *cur = cpu_pda(cpu)->pcurrent; 470 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
433 471
434 rsp = regs->rsp; 472 rsp = regs->rsp;
@@ -503,9 +541,11 @@ static unsigned int die_nest_count;
503 541
504unsigned __kprobes long oops_begin(void) 542unsigned __kprobes long oops_begin(void)
505{ 543{
506 int cpu = safe_smp_processor_id(); 544 int cpu = smp_processor_id();
507 unsigned long flags; 545 unsigned long flags;
508 546
547 oops_enter();
548
509 /* racy, but better than risking deadlock. */ 549 /* racy, but better than risking deadlock. */
510 local_irq_save(flags); 550 local_irq_save(flags);
511 if (!spin_trylock(&die_lock)) { 551 if (!spin_trylock(&die_lock)) {
@@ -534,6 +574,7 @@ void __kprobes oops_end(unsigned long flags)
534 spin_unlock_irqrestore(&die_lock, flags); 574 spin_unlock_irqrestore(&die_lock, flags);
535 if (panic_on_oops) 575 if (panic_on_oops)
536 panic("Fatal exception"); 576 panic("Fatal exception");
577 oops_exit();
537} 578}
538 579
539void __kprobes __die(const char * str, struct pt_regs * regs, long err) 580void __kprobes __die(const char * str, struct pt_regs * regs, long err)
@@ -570,7 +611,7 @@ void die(const char * str, struct pt_regs * regs, long err)
570 do_exit(SIGSEGV); 611 do_exit(SIGSEGV);
571} 612}
572 613
573void __kprobes die_nmi(char *str, struct pt_regs *regs) 614void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
574{ 615{
575 unsigned long flags = oops_begin(); 616 unsigned long flags = oops_begin();
576 617
@@ -578,13 +619,12 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs)
578 * We are in trouble anyway, lets at least try 619 * We are in trouble anyway, lets at least try
579 * to get a message out. 620 * to get a message out.
580 */ 621 */
581 printk(str, safe_smp_processor_id()); 622 printk(str, smp_processor_id());
582 show_registers(regs); 623 show_registers(regs);
583 if (kexec_should_crash(current)) 624 if (kexec_should_crash(current))
584 crash_kexec(regs); 625 crash_kexec(regs);
585 if (panic_on_timeout || panic_on_oops) 626 if (do_panic || panic_on_oops)
586 panic("nmi watchdog"); 627 panic("Non maskable interrupt");
587 printk("console shuts up ...\n");
588 oops_end(flags); 628 oops_end(flags);
589 nmi_exit(); 629 nmi_exit();
590 local_irq_enable(); 630 local_irq_enable();
@@ -730,8 +770,15 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
730static __kprobes void 770static __kprobes void
731mem_parity_error(unsigned char reason, struct pt_regs * regs) 771mem_parity_error(unsigned char reason, struct pt_regs * regs)
732{ 772{
733 printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); 773 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
734 printk("You probably have a hardware problem with your RAM chips\n"); 774 reason);
775 printk(KERN_EMERG "You probably have a hardware problem with your "
776 "RAM chips\n");
777
778 if (panic_on_unrecovered_nmi)
779 panic("NMI: Not continuing");
780
781 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
735 782
736 /* Clear and disable the memory parity error line. */ 783 /* Clear and disable the memory parity error line. */
737 reason = (reason & 0xf) | 4; 784 reason = (reason & 0xf) | 4;
@@ -754,9 +801,15 @@ io_check_error(unsigned char reason, struct pt_regs * regs)
754 801
755static __kprobes void 802static __kprobes void
756unknown_nmi_error(unsigned char reason, struct pt_regs * regs) 803unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
757{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); 804{
758 printk("Dazed and confused, but trying to continue\n"); 805 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
759 printk("Do you have a strange power saving mode enabled?\n"); 806 reason);
807 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
808
809 if (panic_on_unrecovered_nmi)
810 panic("NMI: Not continuing");
811
812 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
760} 813}
761 814
762/* Runs on IST stack. This code must keep interrupts off all the time. 815/* Runs on IST stack. This code must keep interrupts off all the time.
@@ -776,17 +829,15 @@ asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
776 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) 829 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
777 == NOTIFY_STOP) 830 == NOTIFY_STOP)
778 return; 831 return;
779#ifdef CONFIG_X86_LOCAL_APIC
780 /* 832 /*
781 * Ok, so this is none of the documented NMI sources, 833 * Ok, so this is none of the documented NMI sources,
782 * so it must be the NMI watchdog. 834 * so it must be the NMI watchdog.
783 */ 835 */
784 if (nmi_watchdog > 0) { 836 if (nmi_watchdog_tick(regs,reason))
785 nmi_watchdog_tick(regs,reason);
786 return; 837 return;
787 } 838 if (!do_nmi_callback(regs,cpu))
788#endif 839 unknown_nmi_error(reason, regs);
789 unknown_nmi_error(reason, regs); 840
790 return; 841 return;
791 } 842 }
792 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 843 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
@@ -1071,6 +1122,7 @@ asmlinkage void math_state_restore(void)
1071 init_fpu(me); 1122 init_fpu(me);
1072 restore_fpu_checking(&me->thread.i387.fxsave); 1123 restore_fpu_checking(&me->thread.i387.fxsave);
1073 task_thread_info(me)->status |= TS_USEDFPU; 1124 task_thread_info(me)->status |= TS_USEDFPU;
1125 me->fpu_counter++;
1074} 1126}
1075 1127
1076void __init trap_init(void) 1128void __init trap_init(void)
@@ -1109,24 +1161,30 @@ void __init trap_init(void)
1109} 1161}
1110 1162
1111 1163
1112/* Actual parsing is done early in setup.c. */ 1164static int __init oops_setup(char *s)
1113static int __init oops_dummy(char *s)
1114{ 1165{
1115 panic_on_oops = 1; 1166 if (!s)
1116 return 1; 1167 return -EINVAL;
1168 if (!strcmp(s, "panic"))
1169 panic_on_oops = 1;
1170 return 0;
1117} 1171}
1118__setup("oops=", oops_dummy); 1172early_param("oops", oops_setup);
1119 1173
1120static int __init kstack_setup(char *s) 1174static int __init kstack_setup(char *s)
1121{ 1175{
1176 if (!s)
1177 return -EINVAL;
1122 kstack_depth_to_print = simple_strtoul(s,NULL,0); 1178 kstack_depth_to_print = simple_strtoul(s,NULL,0);
1123 return 1; 1179 return 0;
1124} 1180}
1125__setup("kstack=", kstack_setup); 1181early_param("kstack", kstack_setup);
1126 1182
1127#ifdef CONFIG_STACK_UNWIND 1183#ifdef CONFIG_STACK_UNWIND
1128static int __init call_trace_setup(char *s) 1184static int __init call_trace_setup(char *s)
1129{ 1185{
1186 if (!s)
1187 return -EINVAL;
1130 if (strcmp(s, "old") == 0) 1188 if (strcmp(s, "old") == 0)
1131 call_trace = -1; 1189 call_trace = -1;
1132 else if (strcmp(s, "both") == 0) 1190 else if (strcmp(s, "both") == 0)
@@ -1135,7 +1193,7 @@ static int __init call_trace_setup(char *s)
1135 call_trace = 1; 1193 call_trace = 1;
1136 else if (strcmp(s, "new") == 0) 1194 else if (strcmp(s, "new") == 0)
1137 call_trace = 2; 1195 call_trace = 2;
1138 return 1; 1196 return 0;
1139} 1197}
1140__setup("call_trace=", call_trace_setup); 1198early_param("call_trace", call_trace_setup);
1141#endif 1199#endif
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 7c4de31471d4..d0564f1bcb0b 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -13,6 +13,12 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
13OUTPUT_ARCH(i386:x86-64) 13OUTPUT_ARCH(i386:x86-64)
14ENTRY(phys_startup_64) 14ENTRY(phys_startup_64)
15jiffies_64 = jiffies; 15jiffies_64 = jiffies;
16PHDRS {
17 text PT_LOAD FLAGS(5); /* R_E */
18 data PT_LOAD FLAGS(7); /* RWE */
19 user PT_LOAD FLAGS(7); /* RWE */
20 note PT_NOTE FLAGS(4); /* R__ */
21}
16SECTIONS 22SECTIONS
17{ 23{
18 . = __START_KERNEL; 24 . = __START_KERNEL;
@@ -31,7 +37,7 @@ SECTIONS
31 KPROBES_TEXT 37 KPROBES_TEXT
32 *(.fixup) 38 *(.fixup)
33 *(.gnu.warning) 39 *(.gnu.warning)
34 } = 0x9090 40 } :text = 0x9090
35 /* out-of-line lock text */ 41 /* out-of-line lock text */
36 .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } 42 .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
37 43
@@ -57,7 +63,7 @@ SECTIONS
57 .data : AT(ADDR(.data) - LOAD_OFFSET) { 63 .data : AT(ADDR(.data) - LOAD_OFFSET) {
58 *(.data) 64 *(.data)
59 CONSTRUCTORS 65 CONSTRUCTORS
60 } 66 } :data
61 67
62 _edata = .; /* End of data section */ 68 _edata = .; /* End of data section */
63 69
@@ -89,7 +95,7 @@ SECTIONS
89#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) 95#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
90 96
91 . = VSYSCALL_ADDR; 97 . = VSYSCALL_ADDR;
92 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } 98 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
93 __vsyscall_0 = VSYSCALL_VIRT_ADDR; 99 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
94 100
95 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 101 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
@@ -99,6 +105,9 @@ SECTIONS
99 .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } 105 .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
100 vxtime = VVIRT(.vxtime); 106 vxtime = VVIRT(.vxtime);
101 107
108 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
109 vgetcpu_mode = VVIRT(.vgetcpu_mode);
110
102 .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) } 111 .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) }
103 wall_jiffies = VVIRT(.wall_jiffies); 112 wall_jiffies = VVIRT(.wall_jiffies);
104 113
@@ -132,7 +141,7 @@ SECTIONS
132 . = ALIGN(8192); /* init_task */ 141 . = ALIGN(8192); /* init_task */
133 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { 142 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
134 *(.data.init_task) 143 *(.data.init_task)
135 } 144 } :data
136 145
137 . = ALIGN(4096); 146 . = ALIGN(4096);
138 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { 147 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
@@ -207,14 +216,12 @@ SECTIONS
207 __initramfs_start = .; 216 __initramfs_start = .;
208 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } 217 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
209 __initramfs_end = .; 218 __initramfs_end = .;
210 /* temporary here to work around NR_CPUS. If you see this comment in 2.6.17+ 219 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
211 complain */
212 . = ALIGN(4096);
213 __init_end = .;
214 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
215 __per_cpu_start = .; 220 __per_cpu_start = .;
216 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } 221 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
217 __per_cpu_end = .; 222 __per_cpu_end = .;
223 . = ALIGN(4096);
224 __init_end = .;
218 225
219 . = ALIGN(4096); 226 . = ALIGN(4096);
220 __nosave_begin = .; 227 __nosave_begin = .;
diff --git a/arch/x86_64/kernel/vsmp.c b/arch/x86_64/kernel/vsmp.c
index 92f70c74965f..044e852bd25e 100644
--- a/arch/x86_64/kernel/vsmp.c
+++ b/arch/x86_64/kernel/vsmp.c
@@ -20,6 +20,9 @@ static int __init vsmp_init(void)
20 void *address; 20 void *address;
21 unsigned int cap, ctl; 21 unsigned int cap, ctl;
22 22
23 if (!early_pci_allowed())
24 return 0;
25
23 /* Check if we are running on a ScaleMP vSMP box */ 26 /* Check if we are running on a ScaleMP vSMP box */
24 if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) || 27 if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) ||
25 (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) 28 (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index f603037df162..ac48c3857ddb 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -26,6 +26,7 @@
26#include <linux/seqlock.h> 26#include <linux/seqlock.h>
27#include <linux/jiffies.h> 27#include <linux/jiffies.h>
28#include <linux/sysctl.h> 28#include <linux/sysctl.h>
29#include <linux/getcpu.h>
29 30
30#include <asm/vsyscall.h> 31#include <asm/vsyscall.h>
31#include <asm/pgtable.h> 32#include <asm/pgtable.h>
@@ -33,11 +34,15 @@
33#include <asm/fixmap.h> 34#include <asm/fixmap.h>
34#include <asm/errno.h> 35#include <asm/errno.h>
35#include <asm/io.h> 36#include <asm/io.h>
37#include <asm/segment.h>
38#include <asm/desc.h>
39#include <asm/topology.h>
36 40
37#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) 41#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
38 42
39int __sysctl_vsyscall __section_sysctl_vsyscall = 1; 43int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
40seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; 44seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
45int __vgetcpu_mode __section_vgetcpu_mode;
41 46
42#include <asm/unistd.h> 47#include <asm/unistd.h>
43 48
@@ -72,7 +77,8 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
72 __vxtime.tsc_quot) >> 32; 77 __vxtime.tsc_quot) >> 32;
73 /* See comment in x86_64 do_gettimeofday. */ 78 /* See comment in x86_64 do_gettimeofday. */
74 } else { 79 } else {
75 usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - 80 usec += ((readl((void __iomem *)
81 fix_to_virt(VSYSCALL_HPET) + 0xf0) -
76 __vxtime.last) * __vxtime.quot) >> 32; 82 __vxtime.last) * __vxtime.quot) >> 32;
77 } 83 }
78 } while (read_seqretry(&__xtime_lock, sequence)); 84 } while (read_seqretry(&__xtime_lock, sequence));
@@ -127,9 +133,46 @@ time_t __vsyscall(1) vtime(time_t *t)
127 return __xtime.tv_sec; 133 return __xtime.tv_sec;
128} 134}
129 135
130long __vsyscall(2) venosys_0(void) 136/* Fast way to get current CPU and node.
137 This helps to do per node and per CPU caches in user space.
138 The result is not guaranteed without CPU affinity, but usually
139 works out because the scheduler tries to keep a thread on the same
140 CPU.
141
142 tcache must point to a two element sized long array.
143 All arguments can be NULL. */
144long __vsyscall(2)
145vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
131{ 146{
132 return -ENOSYS; 147 unsigned int dummy, p;
148 unsigned long j = 0;
149
150 /* Fast cache - only recompute value once per jiffies and avoid
151 relatively costly rdtscp/cpuid otherwise.
152 This works because the scheduler usually keeps the process
153 on the same CPU and this syscall doesn't guarantee its
154 results anyways.
155 We do this here because otherwise user space would do it on
156 its own in a likely inferior way (no access to jiffies).
157 If you don't like it pass NULL. */
158 if (tcache && tcache->t0 == (j = __jiffies)) {
159 p = tcache->t1;
160 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
161 /* Load per CPU data from RDTSCP */
162 rdtscp(dummy, dummy, p);
163 } else {
164 /* Load per CPU data from GDT */
165 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
166 }
167 if (tcache) {
168 tcache->t0 = j;
169 tcache->t1 = p;
170 }
171 if (cpu)
172 *cpu = p & 0xfff;
173 if (node)
174 *node = p >> 12;
175 return 0;
133} 176}
134 177
135long __vsyscall(3) venosys_1(void) 178long __vsyscall(3) venosys_1(void)
@@ -149,7 +192,8 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
149 void __user *buffer, size_t *lenp, loff_t *ppos) 192 void __user *buffer, size_t *lenp, loff_t *ppos)
150{ 193{
151 extern u16 vsysc1, vsysc2; 194 extern u16 vsysc1, vsysc2;
152 u16 *map1, *map2; 195 u16 __iomem *map1;
196 u16 __iomem *map2;
153 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); 197 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
154 if (!write) 198 if (!write)
155 return ret; 199 return ret;
@@ -164,11 +208,11 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
164 goto out; 208 goto out;
165 } 209 }
166 if (!sysctl_vsyscall) { 210 if (!sysctl_vsyscall) {
167 *map1 = SYSCALL; 211 writew(SYSCALL, map1);
168 *map2 = SYSCALL; 212 writew(SYSCALL, map2);
169 } else { 213 } else {
170 *map1 = NOP2; 214 writew(NOP2, map1);
171 *map2 = NOP2; 215 writew(NOP2, map2);
172 } 216 }
173 iounmap(map2); 217 iounmap(map2);
174out: 218out:
@@ -200,6 +244,43 @@ static ctl_table kernel_root_table2[] = {
200 244
201#endif 245#endif
202 246
247static void __cpuinit write_rdtscp_cb(void *info)
248{
249 write_rdtscp_aux((unsigned long)info);
250}
251
252void __cpuinit vsyscall_set_cpu(int cpu)
253{
254 unsigned long *d;
255 unsigned long node = 0;
256#ifdef CONFIG_NUMA
257 node = cpu_to_node[cpu];
258#endif
259 if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) {
260 void *info = (void *)((node << 12) | cpu);
261 /* Can happen on preemptive kernel */
262 if (get_cpu() == cpu)
263 write_rdtscp_cb(info);
264#ifdef CONFIG_SMP
265 else {
266 /* the notifier is unfortunately not executed on the
267 target CPU */
268 smp_call_function_single(cpu,write_rdtscp_cb,info,0,1);
269 }
270#endif
271 put_cpu();
272 }
273
274 /* Store cpu number in limit so that it can be loaded quickly
275 in user space in vgetcpu.
276 12 bits for the CPU and 8 bits for the node. */
277 d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
278 *d = 0x0f40000000000ULL;
279 *d |= cpu;
280 *d |= (node & 0xf) << 12;
281 *d |= (node >> 4) << 48;
282}
283
203static void __init map_vsyscall(void) 284static void __init map_vsyscall(void)
204{ 285{
205 extern char __vsyscall_0; 286 extern char __vsyscall_0;
@@ -214,6 +295,7 @@ static int __init vsyscall_init(void)
214 VSYSCALL_ADDR(__NR_vgettimeofday))); 295 VSYSCALL_ADDR(__NR_vgettimeofday)));
215 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); 296 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
216 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); 297 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
298 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
217 map_vsyscall(); 299 map_vsyscall();
218#ifdef CONFIG_SYSCTL 300#ifdef CONFIG_SYSCTL
219 register_sysctl_table(kernel_root_table2, 0); 301 register_sysctl_table(kernel_root_table2, 0);
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 370952c4ff22..c3454af5e3a2 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -29,6 +29,7 @@ EXPORT_SYMBOL(__put_user_8);
29EXPORT_SYMBOL(copy_user_generic); 29EXPORT_SYMBOL(copy_user_generic);
30EXPORT_SYMBOL(copy_from_user); 30EXPORT_SYMBOL(copy_from_user);
31EXPORT_SYMBOL(copy_to_user); 31EXPORT_SYMBOL(copy_to_user);
32EXPORT_SYMBOL(__copy_from_user_inatomic);
32 33
33EXPORT_SYMBOL(copy_page); 34EXPORT_SYMBOL(copy_page);
34EXPORT_SYMBOL(clear_page); 35EXPORT_SYMBOL(clear_page);
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
index ccef6ae747a3..b78d4170fce2 100644
--- a/arch/x86_64/lib/Makefile
+++ b/arch/x86_64/lib/Makefile
@@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o
9lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ 9lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
10 usercopy.o getuser.o putuser.o \ 10 usercopy.o getuser.o putuser.o \
11 thunk.o clear_page.o copy_page.o bitstr.o bitops.o 11 thunk.o clear_page.o copy_page.o bitstr.o bitops.o
12lib-y += memcpy.o memmove.o memset.o copy_user.o 12lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
index 1f81b79b796c..9a10a78bb4a4 100644
--- a/arch/x86_64/lib/clear_page.S
+++ b/arch/x86_64/lib/clear_page.S
@@ -1,10 +1,22 @@
1#include <linux/linkage.h>
2#include <asm/dwarf2.h>
3
1/* 4/*
2 * Zero a page. 5 * Zero a page.
3 * rdi page 6 * rdi page
4 */ 7 */
5 .globl clear_page 8 ALIGN
6 .p2align 4 9clear_page_c:
7clear_page: 10 CFI_STARTPROC
11 movl $4096/8,%ecx
12 xorl %eax,%eax
13 rep stosq
14 ret
15 CFI_ENDPROC
16ENDPROC(clear_page)
17
18ENTRY(clear_page)
19 CFI_STARTPROC
8 xorl %eax,%eax 20 xorl %eax,%eax
9 movl $4096/64,%ecx 21 movl $4096/64,%ecx
10 .p2align 4 22 .p2align 4
@@ -23,28 +35,25 @@ clear_page:
23 jnz .Lloop 35 jnz .Lloop
24 nop 36 nop
25 ret 37 ret
26clear_page_end: 38 CFI_ENDPROC
39.Lclear_page_end:
40ENDPROC(clear_page)
27 41
28 /* Some CPUs run faster using the string instructions. 42 /* Some CPUs run faster using the string instructions.
29 It is also a lot simpler. Use this when possible */ 43 It is also a lot simpler. Use this when possible */
30 44
31#include <asm/cpufeature.h> 45#include <asm/cpufeature.h>
32 46
47 .section .altinstr_replacement,"ax"
481: .byte 0xeb /* jmp <disp8> */
49 .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
502:
51 .previous
33 .section .altinstructions,"a" 52 .section .altinstructions,"a"
34 .align 8 53 .align 8
35 .quad clear_page 54 .quad clear_page
36 .quad clear_page_c 55 .quad 1b
37 .byte X86_FEATURE_REP_GOOD 56 .byte X86_FEATURE_REP_GOOD
38 .byte clear_page_end-clear_page 57 .byte .Lclear_page_end - clear_page
39 .byte clear_page_c_end-clear_page_c 58 .byte 2b - 1b
40 .previous
41
42 .section .altinstr_replacement,"ax"
43clear_page_c:
44 movl $4096/8,%ecx
45 xorl %eax,%eax
46 rep
47 stosq
48 ret
49clear_page_c_end:
50 .previous 59 .previous
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
index 8fa19d96a7ee..0ebb03b60e79 100644
--- a/arch/x86_64/lib/copy_page.S
+++ b/arch/x86_64/lib/copy_page.S
@@ -1,17 +1,33 @@
1/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ 1/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
2 2
3#include <linux/config.h>
4#include <linux/linkage.h>
5#include <asm/dwarf2.h>
6
7 ALIGN
8copy_page_c:
9 CFI_STARTPROC
10 movl $4096/8,%ecx
11 rep movsq
12 ret
13 CFI_ENDPROC
14ENDPROC(copy_page_c)
15
3/* Don't use streaming store because it's better when the target 16/* Don't use streaming store because it's better when the target
4 ends up in cache. */ 17 ends up in cache. */
5 18
6/* Could vary the prefetch distance based on SMP/UP */ 19/* Could vary the prefetch distance based on SMP/UP */
7 20
8 .globl copy_page 21ENTRY(copy_page)
9 .p2align 4 22 CFI_STARTPROC
10copy_page:
11 subq $3*8,%rsp 23 subq $3*8,%rsp
24 CFI_ADJUST_CFA_OFFSET 3*8
12 movq %rbx,(%rsp) 25 movq %rbx,(%rsp)
26 CFI_REL_OFFSET rbx, 0
13 movq %r12,1*8(%rsp) 27 movq %r12,1*8(%rsp)
28 CFI_REL_OFFSET r12, 1*8
14 movq %r13,2*8(%rsp) 29 movq %r13,2*8(%rsp)
30 CFI_REL_OFFSET r13, 2*8
15 31
16 movl $(4096/64)-5,%ecx 32 movl $(4096/64)-5,%ecx
17 .p2align 4 33 .p2align 4
@@ -72,30 +88,33 @@ copy_page:
72 jnz .Loop2 88 jnz .Loop2
73 89
74 movq (%rsp),%rbx 90 movq (%rsp),%rbx
91 CFI_RESTORE rbx
75 movq 1*8(%rsp),%r12 92 movq 1*8(%rsp),%r12
93 CFI_RESTORE r12
76 movq 2*8(%rsp),%r13 94 movq 2*8(%rsp),%r13
95 CFI_RESTORE r13
77 addq $3*8,%rsp 96 addq $3*8,%rsp
97 CFI_ADJUST_CFA_OFFSET -3*8
78 ret 98 ret
99.Lcopy_page_end:
100 CFI_ENDPROC
101ENDPROC(copy_page)
79 102
80 /* Some CPUs run faster using the string copy instructions. 103 /* Some CPUs run faster using the string copy instructions.
81 It is also a lot simpler. Use this when possible */ 104 It is also a lot simpler. Use this when possible */
82 105
83#include <asm/cpufeature.h> 106#include <asm/cpufeature.h>
84 107
108 .section .altinstr_replacement,"ax"
1091: .byte 0xeb /* jmp <disp8> */
110 .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */
1112:
112 .previous
85 .section .altinstructions,"a" 113 .section .altinstructions,"a"
86 .align 8 114 .align 8
87 .quad copy_page 115 .quad copy_page
88 .quad copy_page_c 116 .quad 1b
89 .byte X86_FEATURE_REP_GOOD 117 .byte X86_FEATURE_REP_GOOD
90 .byte copy_page_c_end-copy_page_c 118 .byte .Lcopy_page_end - copy_page
91 .byte copy_page_c_end-copy_page_c 119 .byte 2b - 1b
92 .previous
93
94 .section .altinstr_replacement,"ax"
95copy_page_c:
96 movl $4096/8,%ecx
97 rep
98 movsq
99 ret
100copy_page_c_end:
101 .previous 120 .previous
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
index f64569b83b54..70bebd310408 100644
--- a/arch/x86_64/lib/copy_user.S
+++ b/arch/x86_64/lib/copy_user.S
@@ -4,56 +4,78 @@
4 * Functions to copy from and to user space. 4 * Functions to copy from and to user space.
5 */ 5 */
6 6
7#include <linux/linkage.h>
8#include <asm/dwarf2.h>
9
7#define FIX_ALIGNMENT 1 10#define FIX_ALIGNMENT 1
8 11
9 #include <asm/current.h> 12#include <asm/current.h>
10 #include <asm/asm-offsets.h> 13#include <asm/asm-offsets.h>
11 #include <asm/thread_info.h> 14#include <asm/thread_info.h>
12 #include <asm/cpufeature.h> 15#include <asm/cpufeature.h>
13 16
14/* Standard copy_to_user with segment limit checking */ 17 .macro ALTERNATIVE_JUMP feature,orig,alt
15 .globl copy_to_user 180:
16 .p2align 4
17copy_to_user:
18 GET_THREAD_INFO(%rax)
19 movq %rdi,%rcx
20 addq %rdx,%rcx
21 jc bad_to_user
22 cmpq threadinfo_addr_limit(%rax),%rcx
23 jae bad_to_user
242:
25 .byte 0xe9 /* 32bit jump */ 19 .byte 0xe9 /* 32bit jump */
26 .long .Lcug-1f 20 .long \orig-1f /* by default jump to orig */
271: 211:
28
29 .section .altinstr_replacement,"ax" 22 .section .altinstr_replacement,"ax"
303: .byte 0xe9 /* replacement jmp with 8 bit immediate */ 232: .byte 0xe9 /* near jump with 32bit immediate */
31 .long copy_user_generic_c-1b /* offset */ 24 .long \alt-1b /* offset */ /* or alternatively to alt */
32 .previous 25 .previous
33 .section .altinstructions,"a" 26 .section .altinstructions,"a"
34 .align 8 27 .align 8
28 .quad 0b
35 .quad 2b 29 .quad 2b
36 .quad 3b 30 .byte \feature /* when feature is set */
37 .byte X86_FEATURE_REP_GOOD
38 .byte 5 31 .byte 5
39 .byte 5 32 .byte 5
40 .previous 33 .previous
34 .endm
35
36/* Standard copy_to_user with segment limit checking */
37ENTRY(copy_to_user)
38 CFI_STARTPROC
39 GET_THREAD_INFO(%rax)
40 movq %rdi,%rcx
41 addq %rdx,%rcx
42 jc bad_to_user
43 cmpq threadinfo_addr_limit(%rax),%rcx
44 jae bad_to_user
45 xorl %eax,%eax /* clear zero flag */
46 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
47 CFI_ENDPROC
48
49ENTRY(copy_user_generic)
50 CFI_STARTPROC
51 movl $1,%ecx /* set zero flag */
52 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
53 CFI_ENDPROC
54
55ENTRY(__copy_from_user_inatomic)
56 CFI_STARTPROC
57 xorl %ecx,%ecx /* clear zero flag */
58 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
59 CFI_ENDPROC
41 60
42/* Standard copy_from_user with segment limit checking */ 61/* Standard copy_from_user with segment limit checking */
43 .globl copy_from_user 62ENTRY(copy_from_user)
44 .p2align 4 63 CFI_STARTPROC
45copy_from_user:
46 GET_THREAD_INFO(%rax) 64 GET_THREAD_INFO(%rax)
47 movq %rsi,%rcx 65 movq %rsi,%rcx
48 addq %rdx,%rcx 66 addq %rdx,%rcx
49 jc bad_from_user 67 jc bad_from_user
50 cmpq threadinfo_addr_limit(%rax),%rcx 68 cmpq threadinfo_addr_limit(%rax),%rcx
51 jae bad_from_user 69 jae bad_from_user
52 /* FALL THROUGH to copy_user_generic */ 70 movl $1,%ecx /* set zero flag */
71 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
72 CFI_ENDPROC
73ENDPROC(copy_from_user)
53 74
54 .section .fixup,"ax" 75 .section .fixup,"ax"
55 /* must zero dest */ 76 /* must zero dest */
56bad_from_user: 77bad_from_user:
78 CFI_STARTPROC
57 movl %edx,%ecx 79 movl %edx,%ecx
58 xorl %eax,%eax 80 xorl %eax,%eax
59 rep 81 rep
@@ -61,40 +83,32 @@ bad_from_user:
61bad_to_user: 83bad_to_user:
62 movl %edx,%eax 84 movl %edx,%eax
63 ret 85 ret
86 CFI_ENDPROC
87END(bad_from_user)
64 .previous 88 .previous
65 89
66 90
67/* 91/*
68 * copy_user_generic - memory copy with exception handling. 92 * copy_user_generic_unrolled - memory copy with exception handling.
93 * This version is for CPUs like P4 that don't have efficient micro code for rep movsq
69 * 94 *
70 * Input: 95 * Input:
71 * rdi destination 96 * rdi destination
72 * rsi source 97 * rsi source
73 * rdx count 98 * rdx count
99 * ecx zero flag -- if true zero destination on error
74 * 100 *
75 * Output: 101 * Output:
76 * eax uncopied bytes or 0 if successful. 102 * eax uncopied bytes or 0 if successful.
77 */ 103 */
78 .globl copy_user_generic 104ENTRY(copy_user_generic_unrolled)
79 .p2align 4 105 CFI_STARTPROC
80copy_user_generic:
81 .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */
82 .byte 0x66,0x90
831:
84 .section .altinstr_replacement,"ax"
852: .byte 0xe9 /* near jump with 32bit immediate */
86 .long copy_user_generic_c-1b /* offset */
87 .previous
88 .section .altinstructions,"a"
89 .align 8
90 .quad copy_user_generic
91 .quad 2b
92 .byte X86_FEATURE_REP_GOOD
93 .byte 5
94 .byte 5
95 .previous
96.Lcug:
97 pushq %rbx 106 pushq %rbx
107 CFI_ADJUST_CFA_OFFSET 8
108 CFI_REL_OFFSET rbx, 0
109 pushq %rcx
110 CFI_ADJUST_CFA_OFFSET 8
111 CFI_REL_OFFSET rcx, 0
98 xorl %eax,%eax /*zero for the exception handler */ 112 xorl %eax,%eax /*zero for the exception handler */
99 113
100#ifdef FIX_ALIGNMENT 114#ifdef FIX_ALIGNMENT
@@ -168,9 +182,16 @@ copy_user_generic:
168 decl %ecx 182 decl %ecx
169 jnz .Lloop_1 183 jnz .Lloop_1
170 184
185 CFI_REMEMBER_STATE
171.Lende: 186.Lende:
187 popq %rcx
188 CFI_ADJUST_CFA_OFFSET -8
189 CFI_RESTORE rcx
172 popq %rbx 190 popq %rbx
191 CFI_ADJUST_CFA_OFFSET -8
192 CFI_RESTORE rbx
173 ret 193 ret
194 CFI_RESTORE_STATE
174 195
175#ifdef FIX_ALIGNMENT 196#ifdef FIX_ALIGNMENT
176 /* align destination */ 197 /* align destination */
@@ -252,6 +273,8 @@ copy_user_generic:
252 addl %ecx,%edx 273 addl %ecx,%edx
253 /* edx: bytes to zero, rdi: dest, eax:zero */ 274 /* edx: bytes to zero, rdi: dest, eax:zero */
254.Lzero_rest: 275.Lzero_rest:
276 cmpl $0,(%rsp)
277 jz .Le_zero
255 movq %rdx,%rcx 278 movq %rdx,%rcx
256.Le_byte: 279.Le_byte:
257 xorl %eax,%eax 280 xorl %eax,%eax
@@ -261,6 +284,9 @@ copy_user_generic:
261.Le_zero: 284.Le_zero:
262 movq %rdx,%rax 285 movq %rdx,%rax
263 jmp .Lende 286 jmp .Lende
287 CFI_ENDPROC
288ENDPROC(copy_user_generic)
289
264 290
265 /* Some CPUs run faster using the string copy instructions. 291 /* Some CPUs run faster using the string copy instructions.
266 This is also a lot simpler. Use them when possible. 292 This is also a lot simpler. Use them when possible.
@@ -270,6 +296,7 @@ copy_user_generic:
270 /* rdi destination 296 /* rdi destination
271 * rsi source 297 * rsi source
272 * rdx count 298 * rdx count
299 * ecx zero flag
273 * 300 *
274 * Output: 301 * Output:
275 * eax uncopied bytes or 0 if successfull. 302 * eax uncopied bytes or 0 if successfull.
@@ -280,22 +307,48 @@ copy_user_generic:
280 * And more would be dangerous because both Intel and AMD have 307 * And more would be dangerous because both Intel and AMD have
281 * errata with rep movsq > 4GB. If someone feels the need to fix 308 * errata with rep movsq > 4GB. If someone feels the need to fix
282 * this please consider this. 309 * this please consider this.
283 */ 310 */
284copy_user_generic_c: 311ENTRY(copy_user_generic_string)
312 CFI_STARTPROC
313 movl %ecx,%r8d /* save zero flag */
285 movl %edx,%ecx 314 movl %edx,%ecx
286 shrl $3,%ecx 315 shrl $3,%ecx
287 andl $7,%edx 316 andl $7,%edx
317 jz 10f
2881: rep 3181: rep
289 movsq 319 movsq
290 movl %edx,%ecx 320 movl %edx,%ecx
2912: rep 3212: rep
292 movsb 322 movsb
2934: movl %ecx,%eax 3239: movl %ecx,%eax
294 ret 324 ret
2953: lea (%rdx,%rcx,8),%rax 325
326 /* multiple of 8 byte */
32710: rep
328 movsq
329 xor %eax,%eax
296 ret 330 ret
297 331
332 /* exception handling */
3333: lea (%rdx,%rcx,8),%rax /* exception on quad loop */
334 jmp 6f
3355: movl %ecx,%eax /* exception on byte loop */
336 /* eax: left over bytes */
3376: testl %r8d,%r8d /* zero flag set? */
338 jz 7f
339 movl %eax,%ecx /* initialize x86 loop counter */
340 push %rax
341 xorl %eax,%eax
3428: rep
343 stosb /* zero the rest */
34411: pop %rax
3457: ret
346 CFI_ENDPROC
347END(copy_user_generic_c)
348
298 .section __ex_table,"a" 349 .section __ex_table,"a"
299 .quad 1b,3b 350 .quad 1b,3b
300 .quad 2b,4b 351 .quad 2b,5b
352 .quad 8b,11b
353 .quad 10b,3b
301 .previous 354 .previous
diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S
index 72fd55ee896e..f0dba36578ea 100644
--- a/arch/x86_64/lib/csum-copy.S
+++ b/arch/x86_64/lib/csum-copy.S
@@ -5,8 +5,9 @@
5 * License. See the file COPYING in the main directory of this archive 5 * License. See the file COPYING in the main directory of this archive
6 * for more details. No warranty for anything given at all. 6 * for more details. No warranty for anything given at all.
7 */ 7 */
8 #include <linux/linkage.h> 8#include <linux/linkage.h>
9 #include <asm/errno.h> 9#include <asm/dwarf2.h>
10#include <asm/errno.h>
10 11
11/* 12/*
12 * Checksum copy with exception handling. 13 * Checksum copy with exception handling.
@@ -53,19 +54,24 @@
53 .endm 54 .endm
54 55
55 56
56 .globl csum_partial_copy_generic 57ENTRY(csum_partial_copy_generic)
57 .p2align 4 58 CFI_STARTPROC
58csum_partial_copy_generic:
59 cmpl $3*64,%edx 59 cmpl $3*64,%edx
60 jle .Lignore 60 jle .Lignore
61 61
62.Lignore: 62.Lignore:
63 subq $7*8,%rsp 63 subq $7*8,%rsp
64 CFI_ADJUST_CFA_OFFSET 7*8
64 movq %rbx,2*8(%rsp) 65 movq %rbx,2*8(%rsp)
66 CFI_REL_OFFSET rbx, 2*8
65 movq %r12,3*8(%rsp) 67 movq %r12,3*8(%rsp)
68 CFI_REL_OFFSET r12, 3*8
66 movq %r14,4*8(%rsp) 69 movq %r14,4*8(%rsp)
70 CFI_REL_OFFSET r14, 4*8
67 movq %r13,5*8(%rsp) 71 movq %r13,5*8(%rsp)
72 CFI_REL_OFFSET r13, 5*8
68 movq %rbp,6*8(%rsp) 73 movq %rbp,6*8(%rsp)
74 CFI_REL_OFFSET rbp, 6*8
69 75
70 movq %r8,(%rsp) 76 movq %r8,(%rsp)
71 movq %r9,1*8(%rsp) 77 movq %r9,1*8(%rsp)
@@ -208,14 +214,22 @@ csum_partial_copy_generic:
208 addl %ebx,%eax 214 addl %ebx,%eax
209 adcl %r9d,%eax /* carry */ 215 adcl %r9d,%eax /* carry */
210 216
217 CFI_REMEMBER_STATE
211.Lende: 218.Lende:
212 movq 2*8(%rsp),%rbx 219 movq 2*8(%rsp),%rbx
220 CFI_RESTORE rbx
213 movq 3*8(%rsp),%r12 221 movq 3*8(%rsp),%r12
222 CFI_RESTORE r12
214 movq 4*8(%rsp),%r14 223 movq 4*8(%rsp),%r14
224 CFI_RESTORE r14
215 movq 5*8(%rsp),%r13 225 movq 5*8(%rsp),%r13
226 CFI_RESTORE r13
216 movq 6*8(%rsp),%rbp 227 movq 6*8(%rsp),%rbp
228 CFI_RESTORE rbp
217 addq $7*8,%rsp 229 addq $7*8,%rsp
230 CFI_ADJUST_CFA_OFFSET -7*8
218 ret 231 ret
232 CFI_RESTORE_STATE
219 233
220 /* Exception handlers. Very simple, zeroing is done in the wrappers */ 234 /* Exception handlers. Very simple, zeroing is done in the wrappers */
221.Lbad_source: 235.Lbad_source:
@@ -231,3 +245,5 @@ csum_partial_copy_generic:
231 jz .Lende 245 jz .Lende
232 movl $-EFAULT,(%rax) 246 movl $-EFAULT,(%rax)
233 jmp .Lende 247 jmp .Lende
248 CFI_ENDPROC
249ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86_64/lib/getuser.S b/arch/x86_64/lib/getuser.S
index 3844d5e885a4..5448876261f8 100644
--- a/arch/x86_64/lib/getuser.S
+++ b/arch/x86_64/lib/getuser.S
@@ -27,25 +27,26 @@
27 */ 27 */
28 28
29#include <linux/linkage.h> 29#include <linux/linkage.h>
30#include <asm/dwarf2.h>
30#include <asm/page.h> 31#include <asm/page.h>
31#include <asm/errno.h> 32#include <asm/errno.h>
32#include <asm/asm-offsets.h> 33#include <asm/asm-offsets.h>
33#include <asm/thread_info.h> 34#include <asm/thread_info.h>
34 35
35 .text 36 .text
36 .p2align 4 37ENTRY(__get_user_1)
37.globl __get_user_1 38 CFI_STARTPROC
38__get_user_1:
39 GET_THREAD_INFO(%r8) 39 GET_THREAD_INFO(%r8)
40 cmpq threadinfo_addr_limit(%r8),%rcx 40 cmpq threadinfo_addr_limit(%r8),%rcx
41 jae bad_get_user 41 jae bad_get_user
421: movzb (%rcx),%edx 421: movzb (%rcx),%edx
43 xorl %eax,%eax 43 xorl %eax,%eax
44 ret 44 ret
45 CFI_ENDPROC
46ENDPROC(__get_user_1)
45 47
46 .p2align 4 48ENTRY(__get_user_2)
47.globl __get_user_2 49 CFI_STARTPROC
48__get_user_2:
49 GET_THREAD_INFO(%r8) 50 GET_THREAD_INFO(%r8)
50 addq $1,%rcx 51 addq $1,%rcx
51 jc 20f 52 jc 20f
@@ -57,10 +58,11 @@ __get_user_2:
57 ret 58 ret
5820: decq %rcx 5920: decq %rcx
59 jmp bad_get_user 60 jmp bad_get_user
61 CFI_ENDPROC
62ENDPROC(__get_user_2)
60 63
61 .p2align 4 64ENTRY(__get_user_4)
62.globl __get_user_4 65 CFI_STARTPROC
63__get_user_4:
64 GET_THREAD_INFO(%r8) 66 GET_THREAD_INFO(%r8)
65 addq $3,%rcx 67 addq $3,%rcx
66 jc 30f 68 jc 30f
@@ -72,10 +74,11 @@ __get_user_4:
72 ret 74 ret
7330: subq $3,%rcx 7530: subq $3,%rcx
74 jmp bad_get_user 76 jmp bad_get_user
77 CFI_ENDPROC
78ENDPROC(__get_user_4)
75 79
76 .p2align 4 80ENTRY(__get_user_8)
77.globl __get_user_8 81 CFI_STARTPROC
78__get_user_8:
79 GET_THREAD_INFO(%r8) 82 GET_THREAD_INFO(%r8)
80 addq $7,%rcx 83 addq $7,%rcx
81 jc 40f 84 jc 40f
@@ -87,11 +90,16 @@ __get_user_8:
87 ret 90 ret
8840: subq $7,%rcx 9140: subq $7,%rcx
89 jmp bad_get_user 92 jmp bad_get_user
93 CFI_ENDPROC
94ENDPROC(__get_user_8)
90 95
91bad_get_user: 96bad_get_user:
97 CFI_STARTPROC
92 xorl %edx,%edx 98 xorl %edx,%edx
93 movq $(-EFAULT),%rax 99 movq $(-EFAULT),%rax
94 ret 100 ret
101 CFI_ENDPROC
102END(bad_get_user)
95 103
96.section __ex_table,"a" 104.section __ex_table,"a"
97 .quad 1b,bad_get_user 105 .quad 1b,bad_get_user
diff --git a/arch/x86_64/lib/iomap_copy.S b/arch/x86_64/lib/iomap_copy.S
index 8bbade5fea05..05a95e713da8 100644
--- a/arch/x86_64/lib/iomap_copy.S
+++ b/arch/x86_64/lib/iomap_copy.S
@@ -15,12 +15,16 @@
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
16 */ 16 */
17 17
18#include <linux/linkage.h>
19#include <asm/dwarf2.h>
20
18/* 21/*
19 * override generic version in lib/iomap_copy.c 22 * override generic version in lib/iomap_copy.c
20 */ 23 */
21 .globl __iowrite32_copy 24ENTRY(__iowrite32_copy)
22 .p2align 4 25 CFI_STARTPROC
23__iowrite32_copy:
24 movl %edx,%ecx 26 movl %edx,%ecx
25 rep movsd 27 rep movsd
26 ret 28 ret
29 CFI_ENDPROC
30ENDPROC(__iowrite32_copy)
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index 5554948b5554..967b22fa7d07 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -1,6 +1,10 @@
1/* Copyright 2002 Andi Kleen */ 1/* Copyright 2002 Andi Kleen */
2 2
3 #include <asm/cpufeature.h> 3#include <linux/config.h>
4#include <linux/linkage.h>
5#include <asm/dwarf2.h>
6#include <asm/cpufeature.h>
7
4/* 8/*
5 * memcpy - Copy a memory block. 9 * memcpy - Copy a memory block.
6 * 10 *
@@ -13,12 +17,26 @@
13 * rax original destination 17 * rax original destination
14 */ 18 */
15 19
16 .globl __memcpy 20 ALIGN
17 .globl memcpy 21memcpy_c:
18 .p2align 4 22 CFI_STARTPROC
19__memcpy: 23 movq %rdi,%rax
20memcpy: 24 movl %edx,%ecx
25 shrl $3,%ecx
26 andl $7,%edx
27 rep movsq
28 movl %edx,%ecx
29 rep movsb
30 ret
31 CFI_ENDPROC
32ENDPROC(memcpy_c)
33
34ENTRY(__memcpy)
35ENTRY(memcpy)
36 CFI_STARTPROC
21 pushq %rbx 37 pushq %rbx
38 CFI_ADJUST_CFA_OFFSET 8
39 CFI_REL_OFFSET rbx, 0
22 movq %rdi,%rax 40 movq %rdi,%rax
23 41
24 movl %edx,%ecx 42 movl %edx,%ecx
@@ -86,36 +104,27 @@ memcpy:
86 104
87.Lende: 105.Lende:
88 popq %rbx 106 popq %rbx
107 CFI_ADJUST_CFA_OFFSET -8
108 CFI_RESTORE rbx
89 ret 109 ret
90.Lfinal: 110.Lfinal:
111 CFI_ENDPROC
112ENDPROC(memcpy)
113ENDPROC(__memcpy)
91 114
92 /* Some CPUs run faster using the string copy instructions. 115 /* Some CPUs run faster using the string copy instructions.
93 It is also a lot simpler. Use this when possible */ 116 It is also a lot simpler. Use this when possible */
94 117
118 .section .altinstr_replacement,"ax"
1191: .byte 0xeb /* jmp <disp8> */
120 .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
1212:
122 .previous
95 .section .altinstructions,"a" 123 .section .altinstructions,"a"
96 .align 8 124 .align 8
97 .quad memcpy 125 .quad memcpy
98 .quad memcpy_c 126 .quad 1b
99 .byte X86_FEATURE_REP_GOOD 127 .byte X86_FEATURE_REP_GOOD
100 .byte .Lfinal-memcpy 128 .byte .Lfinal - memcpy
101 .byte memcpy_c_end-memcpy_c 129 .byte 2b - 1b
102 .previous
103
104 .section .altinstr_replacement,"ax"
105 /* rdi destination
106 * rsi source
107 * rdx count
108 */
109memcpy_c:
110 movq %rdi,%rax
111 movl %edx,%ecx
112 shrl $3,%ecx
113 andl $7,%edx
114 rep
115 movsq
116 movl %edx,%ecx
117 rep
118 movsb
119 ret
120memcpy_c_end:
121 .previous 130 .previous
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index ad397f2c7de8..09ed1f6b0eaa 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -1,4 +1,9 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs */ 1/* Copyright 2002 Andi Kleen, SuSE Labs */
2
3#include <linux/config.h>
4#include <linux/linkage.h>
5#include <asm/dwarf2.h>
6
2/* 7/*
3 * ISO C memset - set a memory block to a byte value. 8 * ISO C memset - set a memory block to a byte value.
4 * 9 *
@@ -8,11 +13,29 @@
8 * 13 *
9 * rax original destination 14 * rax original destination
10 */ 15 */
11 .globl __memset 16 ALIGN
12 .globl memset 17memset_c:
13 .p2align 4 18 CFI_STARTPROC
14memset: 19 movq %rdi,%r9
15__memset: 20 movl %edx,%r8d
21 andl $7,%r8d
22 movl %edx,%ecx
23 shrl $3,%ecx
24 /* expand byte value */
25 movzbl %sil,%esi
26 movabs $0x0101010101010101,%rax
27 mulq %rsi /* with rax, clobbers rdx */
28 rep stosq
29 movl %r8d,%ecx
30 rep stosb
31 movq %r9,%rax
32 ret
33 CFI_ENDPROC
34ENDPROC(memset_c)
35
36ENTRY(memset)
37ENTRY(__memset)
38 CFI_STARTPROC
16 movq %rdi,%r10 39 movq %rdi,%r10
17 movq %rdx,%r11 40 movq %rdx,%r11
18 41
@@ -25,6 +48,7 @@ __memset:
25 movl %edi,%r9d 48 movl %edi,%r9d
26 andl $7,%r9d 49 andl $7,%r9d
27 jnz .Lbad_alignment 50 jnz .Lbad_alignment
51 CFI_REMEMBER_STATE
28.Lafter_bad_alignment: 52.Lafter_bad_alignment:
29 53
30 movl %r11d,%ecx 54 movl %r11d,%ecx
@@ -75,6 +99,7 @@ __memset:
75 movq %r10,%rax 99 movq %r10,%rax
76 ret 100 ret
77 101
102 CFI_RESTORE_STATE
78.Lbad_alignment: 103.Lbad_alignment:
79 cmpq $7,%r11 104 cmpq $7,%r11
80 jbe .Lhandle_7 105 jbe .Lhandle_7
@@ -84,42 +109,26 @@ __memset:
84 addq %r8,%rdi 109 addq %r8,%rdi
85 subq %r8,%r11 110 subq %r8,%r11
86 jmp .Lafter_bad_alignment 111 jmp .Lafter_bad_alignment
112.Lfinal:
113 CFI_ENDPROC
114ENDPROC(memset)
115ENDPROC(__memset)
87 116
88 /* Some CPUs run faster using the string instructions. 117 /* Some CPUs run faster using the string instructions.
89 It is also a lot simpler. Use this when possible */ 118 It is also a lot simpler. Use this when possible */
90 119
91#include <asm/cpufeature.h> 120#include <asm/cpufeature.h>
92 121
122 .section .altinstr_replacement,"ax"
1231: .byte 0xeb /* jmp <disp8> */
124 .byte (memset_c - memset) - (2f - 1b) /* offset */
1252:
126 .previous
93 .section .altinstructions,"a" 127 .section .altinstructions,"a"
94 .align 8 128 .align 8
95 .quad memset 129 .quad memset
96 .quad memset_c 130 .quad 1b
97 .byte X86_FEATURE_REP_GOOD 131 .byte X86_FEATURE_REP_GOOD
98 .byte memset_c_end-memset_c 132 .byte .Lfinal - memset
99 .byte memset_c_end-memset_c 133 .byte 2b - 1b
100 .previous
101
102 .section .altinstr_replacement,"ax"
103 /* rdi destination
104 * rsi value
105 * rdx count
106 */
107memset_c:
108 movq %rdi,%r9
109 movl %edx,%r8d
110 andl $7,%r8d
111 movl %edx,%ecx
112 shrl $3,%ecx
113 /* expand byte value */
114 movzbl %sil,%esi
115 movabs $0x0101010101010101,%rax
116 mulq %rsi /* with rax, clobbers rdx */
117 rep
118 stosq
119 movl %r8d,%ecx
120 rep
121 stosb
122 movq %r9,%rax
123 ret
124memset_c_end:
125 .previous 134 .previous
diff --git a/arch/x86_64/lib/putuser.S b/arch/x86_64/lib/putuser.S
index 7f5593974e2d..4989f5a8fa9b 100644
--- a/arch/x86_64/lib/putuser.S
+++ b/arch/x86_64/lib/putuser.S
@@ -25,25 +25,26 @@
25 */ 25 */
26 26
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <asm/dwarf2.h>
28#include <asm/page.h> 29#include <asm/page.h>
29#include <asm/errno.h> 30#include <asm/errno.h>
30#include <asm/asm-offsets.h> 31#include <asm/asm-offsets.h>
31#include <asm/thread_info.h> 32#include <asm/thread_info.h>
32 33
33 .text 34 .text
34 .p2align 4 35ENTRY(__put_user_1)
35.globl __put_user_1 36 CFI_STARTPROC
36__put_user_1:
37 GET_THREAD_INFO(%r8) 37 GET_THREAD_INFO(%r8)
38 cmpq threadinfo_addr_limit(%r8),%rcx 38 cmpq threadinfo_addr_limit(%r8),%rcx
39 jae bad_put_user 39 jae bad_put_user
401: movb %dl,(%rcx) 401: movb %dl,(%rcx)
41 xorl %eax,%eax 41 xorl %eax,%eax
42 ret 42 ret
43 CFI_ENDPROC
44ENDPROC(__put_user_1)
43 45
44 .p2align 4 46ENTRY(__put_user_2)
45.globl __put_user_2 47 CFI_STARTPROC
46__put_user_2:
47 GET_THREAD_INFO(%r8) 48 GET_THREAD_INFO(%r8)
48 addq $1,%rcx 49 addq $1,%rcx
49 jc 20f 50 jc 20f
@@ -55,10 +56,11 @@ __put_user_2:
55 ret 56 ret
5620: decq %rcx 5720: decq %rcx
57 jmp bad_put_user 58 jmp bad_put_user
59 CFI_ENDPROC
60ENDPROC(__put_user_2)
58 61
59 .p2align 4 62ENTRY(__put_user_4)
60.globl __put_user_4 63 CFI_STARTPROC
61__put_user_4:
62 GET_THREAD_INFO(%r8) 64 GET_THREAD_INFO(%r8)
63 addq $3,%rcx 65 addq $3,%rcx
64 jc 30f 66 jc 30f
@@ -70,10 +72,11 @@ __put_user_4:
70 ret 72 ret
7130: subq $3,%rcx 7330: subq $3,%rcx
72 jmp bad_put_user 74 jmp bad_put_user
75 CFI_ENDPROC
76ENDPROC(__put_user_4)
73 77
74 .p2align 4 78ENTRY(__put_user_8)
75.globl __put_user_8 79 CFI_STARTPROC
76__put_user_8:
77 GET_THREAD_INFO(%r8) 80 GET_THREAD_INFO(%r8)
78 addq $7,%rcx 81 addq $7,%rcx
79 jc 40f 82 jc 40f
@@ -85,10 +88,15 @@ __put_user_8:
85 ret 88 ret
8640: subq $7,%rcx 8940: subq $7,%rcx
87 jmp bad_put_user 90 jmp bad_put_user
91 CFI_ENDPROC
92ENDPROC(__put_user_8)
88 93
89bad_put_user: 94bad_put_user:
95 CFI_STARTPROC
90 movq $(-EFAULT),%rax 96 movq $(-EFAULT),%rax
91 ret 97 ret
98 CFI_ENDPROC
99END(bad_put_user)
92 100
93.section __ex_table,"a" 101.section __ex_table,"a"
94 .quad 1b,bad_put_user 102 .quad 1b,bad_put_user
diff --git a/arch/x86_64/lib/rwlock.S b/arch/x86_64/lib/rwlock.S
new file mode 100644
index 000000000000..0cde1f807314
--- /dev/null
+++ b/arch/x86_64/lib/rwlock.S
@@ -0,0 +1,38 @@
1/* Slow paths of read/write spinlocks. */
2
3#include <linux/linkage.h>
4#include <asm/rwlock.h>
5#include <asm/alternative-asm.i>
6#include <asm/dwarf2.h>
7
8/* rdi: pointer to rwlock_t */
9ENTRY(__write_lock_failed)
10 CFI_STARTPROC
11 LOCK_PREFIX
12 addl $RW_LOCK_BIAS,(%rdi)
131: rep
14 nop
15 cmpl $RW_LOCK_BIAS,(%rdi)
16 jne 1b
17 LOCK_PREFIX
18 subl $RW_LOCK_BIAS,(%rdi)
19 jnz __write_lock_failed
20 ret
21 CFI_ENDPROC
22END(__write_lock_failed)
23
24/* rdi: pointer to rwlock_t */
25ENTRY(__read_lock_failed)
26 CFI_STARTPROC
27 LOCK_PREFIX
28 incl (%rdi)
291: rep
30 nop
31 cmpl $1,(%rdi)
32 js 1b
33 LOCK_PREFIX
34 decl (%rdi)
35 js __read_lock_failed
36 ret
37 CFI_ENDPROC
38END(__read_lock_failed)
diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S
index 332ea5dff916..0025535cac8d 100644
--- a/arch/x86_64/lib/thunk.S
+++ b/arch/x86_64/lib/thunk.S
@@ -1,10 +1,9 @@
1 /* 1/*
2 * Save registers before calling assembly functions. This avoids 2 * Save registers before calling assembly functions. This avoids
3 * disturbance of register allocation in some inline assembly constructs. 3 * disturbance of register allocation in some inline assembly constructs.
4 * Copyright 2001,2002 by Andi Kleen, SuSE Labs. 4 * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
5 * Subject to the GNU public license, v.2. No warranty of any kind. 5 * Subject to the GNU public license, v.2. No warranty of any kind.
6 * $Id: thunk.S,v 1.2 2002/03/13 20:06:58 ak Exp $ 6 */
7 */
8 7
9 #include <linux/config.h> 8 #include <linux/config.h>
10 #include <linux/linkage.h> 9 #include <linux/linkage.h>
@@ -67,33 +66,3 @@ restore_norax:
67 RESTORE_ARGS 1 66 RESTORE_ARGS 1
68 ret 67 ret
69 CFI_ENDPROC 68 CFI_ENDPROC
70
71#ifdef CONFIG_SMP
72/* Support for read/write spinlocks. */
73 .text
74/* rax: pointer to rwlock_t */
75ENTRY(__write_lock_failed)
76 lock
77 addl $RW_LOCK_BIAS,(%rax)
781: rep
79 nop
80 cmpl $RW_LOCK_BIAS,(%rax)
81 jne 1b
82 lock
83 subl $RW_LOCK_BIAS,(%rax)
84 jnz __write_lock_failed
85 ret
86
87/* rax: pointer to rwlock_t */
88ENTRY(__read_lock_failed)
89 lock
90 incl (%rax)
911: rep
92 nop
93 cmpl $1,(%rax)
94 js 1b
95 lock
96 decl (%rax)
97 js __read_lock_failed
98 ret
99#endif
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 4198798e1469..1a17b0733ab5 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -40,8 +40,7 @@
40#define PF_RSVD (1<<3) 40#define PF_RSVD (1<<3)
41#define PF_INSTR (1<<4) 41#define PF_INSTR (1<<4)
42 42
43#ifdef CONFIG_KPROBES 43static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
44ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
45 44
46/* Hook to register for page fault notifications */ 45/* Hook to register for page fault notifications */
47int register_page_fault_notifier(struct notifier_block *nb) 46int register_page_fault_notifier(struct notifier_block *nb)
@@ -49,11 +48,13 @@ int register_page_fault_notifier(struct notifier_block *nb)
49 vmalloc_sync_all(); 48 vmalloc_sync_all();
50 return atomic_notifier_chain_register(&notify_page_fault_chain, nb); 49 return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
51} 50}
51EXPORT_SYMBOL_GPL(register_page_fault_notifier);
52 52
53int unregister_page_fault_notifier(struct notifier_block *nb) 53int unregister_page_fault_notifier(struct notifier_block *nb)
54{ 54{
55 return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb); 55 return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
56} 56}
57EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
57 58
58static inline int notify_page_fault(enum die_val val, const char *str, 59static inline int notify_page_fault(enum die_val val, const char *str,
59 struct pt_regs *regs, long err, int trap, int sig) 60 struct pt_regs *regs, long err, int trap, int sig)
@@ -67,13 +68,6 @@ static inline int notify_page_fault(enum die_val val, const char *str,
67 }; 68 };
68 return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args); 69 return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
69} 70}
70#else
71static inline int notify_page_fault(enum die_val val, const char *str,
72 struct pt_regs *regs, long err, int trap, int sig)
73{
74 return NOTIFY_DONE;
75}
76#endif
77 71
78void bust_spinlocks(int yes) 72void bust_spinlocks(int yes)
79{ 73{
@@ -102,7 +96,7 @@ void bust_spinlocks(int yes)
102static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, 96static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
103 unsigned long error_code) 97 unsigned long error_code)
104{ 98{
105 unsigned char *instr; 99 unsigned char __user *instr;
106 int scan_more = 1; 100 int scan_more = 1;
107 int prefetch = 0; 101 int prefetch = 0;
108 unsigned char *max_instr; 102 unsigned char *max_instr;
@@ -111,7 +105,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
111 if (error_code & PF_INSTR) 105 if (error_code & PF_INSTR)
112 return 0; 106 return 0;
113 107
114 instr = (unsigned char *)convert_rip_to_linear(current, regs); 108 instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
115 max_instr = instr + 15; 109 max_instr = instr + 15;
116 110
117 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) 111 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
@@ -122,7 +116,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
122 unsigned char instr_hi; 116 unsigned char instr_hi;
123 unsigned char instr_lo; 117 unsigned char instr_lo;
124 118
125 if (__get_user(opcode, instr)) 119 if (__get_user(opcode, (char __user *)instr))
126 break; 120 break;
127 121
128 instr_hi = opcode & 0xf0; 122 instr_hi = opcode & 0xf0;
@@ -160,7 +154,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
160 case 0x00: 154 case 0x00:
161 /* Prefetch instruction is 0x0F0D or 0x0F18 */ 155 /* Prefetch instruction is 0x0F0D or 0x0F18 */
162 scan_more = 0; 156 scan_more = 0;
163 if (__get_user(opcode, instr)) 157 if (__get_user(opcode, (char __user *)instr))
164 break; 158 break;
165 prefetch = (instr_lo == 0xF) && 159 prefetch = (instr_lo == 0xF) &&
166 (opcode == 0x0D || opcode == 0x18); 160 (opcode == 0x0D || opcode == 0x18);
@@ -176,7 +170,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
176static int bad_address(void *p) 170static int bad_address(void *p)
177{ 171{
178 unsigned long dummy; 172 unsigned long dummy;
179 return __get_user(dummy, (unsigned long *)p); 173 return __get_user(dummy, (unsigned long __user *)p);
180} 174}
181 175
182void dump_pagetable(unsigned long address) 176void dump_pagetable(unsigned long address)
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 52fd42c40c86..3e16fe08150e 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -229,7 +229,6 @@ __init void *early_ioremap(unsigned long addr, unsigned long size)
229 229
230 /* actually usually some more */ 230 /* actually usually some more */
231 if (size >= LARGE_PAGE_SIZE) { 231 if (size >= LARGE_PAGE_SIZE) {
232 printk("SMBIOS area too long %lu\n", size);
233 return NULL; 232 return NULL;
234 } 233 }
235 set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); 234 set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
@@ -250,12 +249,13 @@ __init void early_iounmap(void *addr, unsigned long size)
250} 249}
251 250
252static void __meminit 251static void __meminit
253phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) 252phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
254{ 253{
255 int i; 254 int i = pmd_index(address);
256 255
257 for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) { 256 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
258 unsigned long entry; 257 unsigned long entry;
258 pmd_t *pmd = pmd_page + pmd_index(address);
259 259
260 if (address >= end) { 260 if (address >= end) {
261 if (!after_bootmem) 261 if (!after_bootmem)
@@ -263,6 +263,10 @@ phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
263 set_pmd(pmd, __pmd(0)); 263 set_pmd(pmd, __pmd(0));
264 break; 264 break;
265 } 265 }
266
267 if (pmd_val(*pmd))
268 continue;
269
266 entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; 270 entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
267 entry &= __supported_pte_mask; 271 entry &= __supported_pte_mask;
268 set_pmd(pmd, __pmd(entry)); 272 set_pmd(pmd, __pmd(entry));
@@ -272,45 +276,41 @@ phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
272static void __meminit 276static void __meminit
273phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) 277phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
274{ 278{
275 pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address)); 279 pmd_t *pmd = pmd_offset(pud,0);
276 280 spin_lock(&init_mm.page_table_lock);
277 if (pmd_none(*pmd)) { 281 phys_pmd_init(pmd, address, end);
278 spin_lock(&init_mm.page_table_lock); 282 spin_unlock(&init_mm.page_table_lock);
279 phys_pmd_init(pmd, address, end); 283 __flush_tlb_all();
280 spin_unlock(&init_mm.page_table_lock);
281 __flush_tlb_all();
282 }
283} 284}
284 285
285static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) 286static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
286{ 287{
287 long i = pud_index(address); 288 int i = pud_index(addr);
288 289
289 pud = pud + i;
290 290
291 if (after_bootmem && pud_val(*pud)) { 291 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
292 phys_pmd_update(pud, address, end);
293 return;
294 }
295
296 for (; i < PTRS_PER_PUD; pud++, i++) {
297 int map; 292 int map;
298 unsigned long paddr, pmd_phys; 293 unsigned long pmd_phys;
294 pud_t *pud = pud_page + pud_index(addr);
299 pmd_t *pmd; 295 pmd_t *pmd;
300 296
301 paddr = (address & PGDIR_MASK) + i*PUD_SIZE; 297 if (addr >= end)
302 if (paddr >= end)
303 break; 298 break;
304 299
305 if (!after_bootmem && !e820_any_mapped(paddr, paddr+PUD_SIZE, 0)) { 300 if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) {
306 set_pud(pud, __pud(0)); 301 set_pud(pud, __pud(0));
307 continue; 302 continue;
308 } 303 }
309 304
305 if (pud_val(*pud)) {
306 phys_pmd_update(pud, addr, end);
307 continue;
308 }
309
310 pmd = alloc_low_page(&map, &pmd_phys); 310 pmd = alloc_low_page(&map, &pmd_phys);
311 spin_lock(&init_mm.page_table_lock); 311 spin_lock(&init_mm.page_table_lock);
312 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); 312 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
313 phys_pmd_init(pmd, paddr, end); 313 phys_pmd_init(pmd, addr, end);
314 spin_unlock(&init_mm.page_table_lock); 314 spin_unlock(&init_mm.page_table_lock);
315 unmap_low_page(map); 315 unmap_low_page(map);
316 } 316 }
@@ -403,69 +403,15 @@ void __cpuinit zap_low_mappings(int cpu)
403 __flush_tlb_all(); 403 __flush_tlb_all();
404} 404}
405 405
406/* Compute zone sizes for the DMA and DMA32 zones in a node. */
407__init void
408size_zones(unsigned long *z, unsigned long *h,
409 unsigned long start_pfn, unsigned long end_pfn)
410{
411 int i;
412 unsigned long w;
413
414 for (i = 0; i < MAX_NR_ZONES; i++)
415 z[i] = 0;
416
417 if (start_pfn < MAX_DMA_PFN)
418 z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
419 if (start_pfn < MAX_DMA32_PFN) {
420 unsigned long dma32_pfn = MAX_DMA32_PFN;
421 if (dma32_pfn > end_pfn)
422 dma32_pfn = end_pfn;
423 z[ZONE_DMA32] = dma32_pfn - start_pfn;
424 }
425 z[ZONE_NORMAL] = end_pfn - start_pfn;
426
427 /* Remove lower zones from higher ones. */
428 w = 0;
429 for (i = 0; i < MAX_NR_ZONES; i++) {
430 if (z[i])
431 z[i] -= w;
432 w += z[i];
433 }
434
435 /* Compute holes */
436 w = start_pfn;
437 for (i = 0; i < MAX_NR_ZONES; i++) {
438 unsigned long s = w;
439 w += z[i];
440 h[i] = e820_hole_size(s, w);
441 }
442
443 /* Add the space pace needed for mem_map to the holes too. */
444 for (i = 0; i < MAX_NR_ZONES; i++)
445 h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
446
447 /* The 16MB DMA zone has the kernel and other misc mappings.
448 Account them too */
449 if (h[ZONE_DMA]) {
450 h[ZONE_DMA] += dma_reserve;
451 if (h[ZONE_DMA] >= z[ZONE_DMA]) {
452 printk(KERN_WARNING
453 "Kernel too large and filling up ZONE_DMA?\n");
454 h[ZONE_DMA] = z[ZONE_DMA];
455 }
456 }
457}
458
459#ifndef CONFIG_NUMA 406#ifndef CONFIG_NUMA
460void __init paging_init(void) 407void __init paging_init(void)
461{ 408{
462 unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES]; 409 unsigned long max_zone_pfns[MAX_NR_ZONES] = {MAX_DMA_PFN,
463 410 MAX_DMA32_PFN,
411 end_pfn};
464 memory_present(0, 0, end_pfn); 412 memory_present(0, 0, end_pfn);
465 sparse_init(); 413 sparse_init();
466 size_zones(zones, holes, 0, end_pfn); 414 free_area_init_nodes(max_zone_pfns);
467 free_area_init_node(0, NODE_DATA(0), zones,
468 __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
469} 415}
470#endif 416#endif
471 417
@@ -597,12 +543,6 @@ void __init mem_init(void)
597 543
598 pci_iommu_alloc(); 544 pci_iommu_alloc();
599 545
600 /* How many end-of-memory variables you have, grandma! */
601 max_low_pfn = end_pfn;
602 max_pfn = end_pfn;
603 num_physpages = end_pfn;
604 high_memory = (void *) __va(end_pfn * PAGE_SIZE);
605
606 /* clear the zero-page */ 546 /* clear the zero-page */
607 memset(empty_zero_page, 0, PAGE_SIZE); 547 memset(empty_zero_page, 0, PAGE_SIZE);
608 548
@@ -614,7 +554,8 @@ void __init mem_init(void)
614#else 554#else
615 totalram_pages = free_all_bootmem(); 555 totalram_pages = free_all_bootmem();
616#endif 556#endif
617 reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn); 557 reservedpages = end_pfn - totalram_pages -
558 absent_pages_in_range(0, end_pfn);
618 559
619 after_bootmem = 1; 560 after_bootmem = 1;
620 561
@@ -714,8 +655,10 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
714#else 655#else
715 reserve_bootmem(phys, len); 656 reserve_bootmem(phys, len);
716#endif 657#endif
717 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) 658 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
718 dma_reserve += len / PAGE_SIZE; 659 dma_reserve += len / PAGE_SIZE;
660 set_dma_reserve(dma_reserve);
661 }
719} 662}
720 663
721int kern_addr_valid(unsigned long addr) 664int kern_addr_valid(unsigned long addr)
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index 7c45c2d2b8b2..b5b8dba28b4e 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -54,6 +54,9 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
54 54
55 nodes_clear(nodes_parsed); 55 nodes_clear(nodes_parsed);
56 56
57 if (!early_pci_allowed())
58 return -1;
59
57 nb = find_northbridge(); 60 nb = find_northbridge();
58 if (nb < 0) 61 if (nb < 0)
59 return nb; 62 return nb;
@@ -146,6 +149,9 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
146 149
147 nodes[nodeid].start = base; 150 nodes[nodeid].start = base;
148 nodes[nodeid].end = limit; 151 nodes[nodeid].end = limit;
152 e820_register_active_regions(nodeid,
153 nodes[nodeid].start >> PAGE_SHIFT,
154 nodes[nodeid].end >> PAGE_SHIFT);
149 155
150 prevbase = base; 156 prevbase = base;
151 157
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index b2fac14baac0..829a008bd39b 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -161,7 +161,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
161 bootmap_start >> PAGE_SHIFT, 161 bootmap_start >> PAGE_SHIFT,
162 start_pfn, end_pfn); 162 start_pfn, end_pfn);
163 163
164 e820_bootmem_free(NODE_DATA(nodeid), start, end); 164 free_bootmem_with_active_regions(nodeid, end);
165 165
166 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 166 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
167 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); 167 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
@@ -175,13 +175,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
175void __init setup_node_zones(int nodeid) 175void __init setup_node_zones(int nodeid)
176{ 176{
177 unsigned long start_pfn, end_pfn, memmapsize, limit; 177 unsigned long start_pfn, end_pfn, memmapsize, limit;
178 unsigned long zones[MAX_NR_ZONES];
179 unsigned long holes[MAX_NR_ZONES];
180 178
181 start_pfn = node_start_pfn(nodeid); 179 start_pfn = node_start_pfn(nodeid);
182 end_pfn = node_end_pfn(nodeid); 180 end_pfn = node_end_pfn(nodeid);
183 181
184 Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n", 182 Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
185 nodeid, start_pfn, end_pfn); 183 nodeid, start_pfn, end_pfn);
186 184
187 /* Try to allocate mem_map at end to not fill up precious <4GB 185 /* Try to allocate mem_map at end to not fill up precious <4GB
@@ -195,10 +193,6 @@ void __init setup_node_zones(int nodeid)
195 round_down(limit - memmapsize, PAGE_SIZE), 193 round_down(limit - memmapsize, PAGE_SIZE),
196 limit); 194 limit);
197#endif 195#endif
198
199 size_zones(zones, holes, start_pfn, end_pfn);
200 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
201 start_pfn, holes);
202} 196}
203 197
204void __init numa_init_array(void) 198void __init numa_init_array(void)
@@ -225,7 +219,7 @@ void __init numa_init_array(void)
225int numa_fake __initdata = 0; 219int numa_fake __initdata = 0;
226 220
227/* Numa emulation */ 221/* Numa emulation */
228static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 222static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
229{ 223{
230 int i; 224 int i;
231 struct bootnode nodes[MAX_NUMNODES]; 225 struct bootnode nodes[MAX_NUMNODES];
@@ -259,8 +253,11 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
259 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); 253 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
260 return -1; 254 return -1;
261 } 255 }
262 for_each_online_node(i) 256 for_each_online_node(i) {
257 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
258 nodes[i].end >> PAGE_SHIFT);
263 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 259 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
260 }
264 numa_init_array(); 261 numa_init_array();
265 return 0; 262 return 0;
266} 263}
@@ -299,6 +296,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
299 for (i = 0; i < NR_CPUS; i++) 296 for (i = 0; i < NR_CPUS; i++)
300 numa_set_node(i, 0); 297 numa_set_node(i, 0);
301 node_to_cpumask[0] = cpumask_of_cpu(0); 298 node_to_cpumask[0] = cpumask_of_cpu(0);
299 e820_register_active_regions(0, start_pfn, end_pfn);
302 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); 300 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
303} 301}
304 302
@@ -340,17 +338,23 @@ static void __init arch_sparse_init(void)
340void __init paging_init(void) 338void __init paging_init(void)
341{ 339{
342 int i; 340 int i;
341 unsigned long max_zone_pfns[MAX_NR_ZONES] = { MAX_DMA_PFN,
342 MAX_DMA32_PFN,
343 end_pfn};
343 344
344 arch_sparse_init(); 345 arch_sparse_init();
345 346
346 for_each_online_node(i) { 347 for_each_online_node(i) {
347 setup_node_zones(i); 348 setup_node_zones(i);
348 } 349 }
350
351 free_area_init_nodes(max_zone_pfns);
349} 352}
350 353
351/* [numa=off] */ 354static __init int numa_setup(char *opt)
352__init int numa_setup(char *opt)
353{ 355{
356 if (!opt)
357 return -EINVAL;
354 if (!strncmp(opt,"off",3)) 358 if (!strncmp(opt,"off",3))
355 numa_off = 1; 359 numa_off = 1;
356#ifdef CONFIG_NUMA_EMU 360#ifdef CONFIG_NUMA_EMU
@@ -366,9 +370,11 @@ __init int numa_setup(char *opt)
366 if (!strncmp(opt,"hotadd=", 7)) 370 if (!strncmp(opt,"hotadd=", 7))
367 hotadd_percent = simple_strtoul(opt+7, NULL, 10); 371 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
368#endif 372#endif
369 return 1; 373 return 0;
370} 374}
371 375
376early_param("numa", numa_setup);
377
372/* 378/*
373 * Setup early cpu_to_node. 379 * Setup early cpu_to_node.
374 * 380 *
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 2685b1f3671c..3e231d762aaa 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -108,8 +108,8 @@ static void revert_page(unsigned long address, pgprot_t ref_prot)
108 BUG_ON(pud_none(*pud)); 108 BUG_ON(pud_none(*pud));
109 pmd = pmd_offset(pud, address); 109 pmd = pmd_offset(pud, address);
110 BUG_ON(pmd_val(*pmd) & _PAGE_PSE); 110 BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
111 pgprot_val(ref_prot) |= _PAGE_PSE;
112 large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); 111 large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
112 large_pte = pte_mkhuge(large_pte);
113 set_pte((pte_t *)pmd, large_pte); 113 set_pte((pte_t *)pmd, large_pte);
114} 114}
115 115
@@ -119,32 +119,28 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
119{ 119{
120 pte_t *kpte; 120 pte_t *kpte;
121 struct page *kpte_page; 121 struct page *kpte_page;
122 unsigned kpte_flags;
123 pgprot_t ref_prot2; 122 pgprot_t ref_prot2;
124 kpte = lookup_address(address); 123 kpte = lookup_address(address);
125 if (!kpte) return 0; 124 if (!kpte) return 0;
126 kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); 125 kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
127 kpte_flags = pte_val(*kpte);
128 if (pgprot_val(prot) != pgprot_val(ref_prot)) { 126 if (pgprot_val(prot) != pgprot_val(ref_prot)) {
129 if ((kpte_flags & _PAGE_PSE) == 0) { 127 if (!pte_huge(*kpte)) {
130 set_pte(kpte, pfn_pte(pfn, prot)); 128 set_pte(kpte, pfn_pte(pfn, prot));
131 } else { 129 } else {
132 /* 130 /*
133 * split_large_page will take the reference for this 131 * split_large_page will take the reference for this
134 * change_page_attr on the split page. 132 * change_page_attr on the split page.
135 */ 133 */
136
137 struct page *split; 134 struct page *split;
138 ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE)); 135 ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
139
140 split = split_large_page(address, prot, ref_prot2); 136 split = split_large_page(address, prot, ref_prot2);
141 if (!split) 137 if (!split)
142 return -ENOMEM; 138 return -ENOMEM;
143 set_pte(kpte,mk_pte(split, ref_prot2)); 139 set_pte(kpte, mk_pte(split, ref_prot2));
144 kpte_page = split; 140 kpte_page = split;
145 } 141 }
146 page_private(kpte_page)++; 142 page_private(kpte_page)++;
147 } else if ((kpte_flags & _PAGE_PSE) == 0) { 143 } else if (!pte_huge(*kpte)) {
148 set_pte(kpte, pfn_pte(pfn, ref_prot)); 144 set_pte(kpte, pfn_pte(pfn, ref_prot));
149 BUG_ON(page_private(kpte_page) == 0); 145 BUG_ON(page_private(kpte_page) == 0);
150 page_private(kpte_page)--; 146 page_private(kpte_page)--;
@@ -190,10 +186,12 @@ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
190 * lowmem */ 186 * lowmem */
191 if (__pa(address) < KERNEL_TEXT_SIZE) { 187 if (__pa(address) < KERNEL_TEXT_SIZE) {
192 unsigned long addr2; 188 unsigned long addr2;
193 pgprot_t prot2 = prot; 189 pgprot_t prot2;
194 addr2 = __START_KERNEL_map + __pa(address); 190 addr2 = __START_KERNEL_map + __pa(address);
195 pgprot_val(prot2) &= ~_PAGE_NX; 191 /* Make sure the kernel mappings stay executable */
196 err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC); 192 prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
193 err = __change_page_attr(addr2, pfn, prot2,
194 PAGE_KERNEL_EXEC);
197 } 195 }
198 } 196 }
199 up_write(&init_mm.mmap_sem); 197 up_write(&init_mm.mmap_sem);
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 502fce65e96a..f8c04d6935c9 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -21,6 +21,8 @@
21#include <asm/numa.h> 21#include <asm/numa.h>
22#include <asm/e820.h> 22#include <asm/e820.h>
23 23
24int acpi_numa __initdata;
25
24#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \ 26#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
25 defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \ 27 defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
26 && !defined(CONFIG_MEMORY_HOTPLUG) 28 && !defined(CONFIG_MEMORY_HOTPLUG)
@@ -91,6 +93,7 @@ static __init void bad_srat(void)
91 apicid_to_node[i] = NUMA_NO_NODE; 93 apicid_to_node[i] = NUMA_NO_NODE;
92 for (i = 0; i < MAX_NUMNODES; i++) 94 for (i = 0; i < MAX_NUMNODES; i++)
93 nodes_add[i].start = nodes[i].end = 0; 95 nodes_add[i].start = nodes[i].end = 0;
96 remove_all_active_ranges();
94} 97}
95 98
96static __init inline int srat_disabled(void) 99static __init inline int srat_disabled(void)
@@ -173,7 +176,7 @@ static int hotadd_enough_memory(struct bootnode *nd)
173 176
174 if (mem < 0) 177 if (mem < 0)
175 return 0; 178 return 0;
176 allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE; 179 allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
177 allowed = (allowed / 100) * hotadd_percent; 180 allowed = (allowed / 100) * hotadd_percent;
178 if (allocated + mem > allowed) { 181 if (allocated + mem > allowed) {
179 unsigned long range; 182 unsigned long range;
@@ -223,8 +226,10 @@ static int reserve_hotadd(int node, unsigned long start, unsigned long end)
223 } 226 }
224 227
225 /* This check might be a bit too strict, but I'm keeping it for now. */ 228 /* This check might be a bit too strict, but I'm keeping it for now. */
226 if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) { 229 if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
227 printk(KERN_ERR "SRAT: Hotplug area has existing memory\n"); 230 printk(KERN_ERR
231 "SRAT: Hotplug area %lu -> %lu has existing memory\n",
232 s_pfn, e_pfn);
228 return -1; 233 return -1;
229 } 234 }
230 235
@@ -317,6 +322,10 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
317 322
318 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, 323 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
319 nd->start, nd->end); 324 nd->start, nd->end);
325 e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
326 nd->end >> PAGE_SHIFT);
327 push_node_boundaries(node, nd->start >> PAGE_SHIFT,
328 nd->end >> PAGE_SHIFT);
320 329
321#ifdef RESERVE_HOTADD 330#ifdef RESERVE_HOTADD
322 if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) { 331 if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
@@ -341,13 +350,13 @@ static int nodes_cover_memory(void)
341 unsigned long s = nodes[i].start >> PAGE_SHIFT; 350 unsigned long s = nodes[i].start >> PAGE_SHIFT;
342 unsigned long e = nodes[i].end >> PAGE_SHIFT; 351 unsigned long e = nodes[i].end >> PAGE_SHIFT;
343 pxmram += e - s; 352 pxmram += e - s;
344 pxmram -= e820_hole_size(s, e); 353 pxmram -= absent_pages_in_range(s, e);
345 pxmram -= nodes_add[i].end - nodes_add[i].start; 354 pxmram -= nodes_add[i].end - nodes_add[i].start;
346 if ((long)pxmram < 0) 355 if ((long)pxmram < 0)
347 pxmram = 0; 356 pxmram = 0;
348 } 357 }
349 358
350 e820ram = end_pfn - e820_hole_size(0, end_pfn); 359 e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
351 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ 360 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
352 if ((long)(e820ram - pxmram) >= 1*1024*1024) { 361 if ((long)(e820ram - pxmram) >= 1*1024*1024) {
353 printk(KERN_ERR 362 printk(KERN_ERR
diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile
index a3f6ad570179..1eb18f421edf 100644
--- a/arch/x86_64/pci/Makefile
+++ b/arch/x86_64/pci/Makefile
@@ -9,7 +9,7 @@ obj-y := i386.o
9obj-$(CONFIG_PCI_DIRECT)+= direct.o 9obj-$(CONFIG_PCI_DIRECT)+= direct.o
10obj-y += fixup.o init.o 10obj-y += fixup.o init.o
11obj-$(CONFIG_ACPI) += acpi.o 11obj-$(CONFIG_ACPI) += acpi.o
12obj-y += legacy.o irq.o common.o 12obj-y += legacy.o irq.o common.o early.o
13# mmconfig has a 64bit special 13# mmconfig has a 64bit special
14obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o 14obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o
15 15
@@ -23,3 +23,4 @@ common-y += ../../i386/pci/common.o
23fixup-y += ../../i386/pci/fixup.o 23fixup-y += ../../i386/pci/fixup.o
24i386-y += ../../i386/pci/i386.o 24i386-y += ../../i386/pci/i386.o
25init-y += ../../i386/pci/init.o 25init-y += ../../i386/pci/init.o
26early-y += ../../i386/pci/early.o
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index 3c55c76c6fd5..7732f4254d21 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -156,15 +156,45 @@ static __init void unreachable_devices(void)
156 addr = pci_dev_base(0, k, PCI_DEVFN(i, 0)); 156 addr = pci_dev_base(0, k, PCI_DEVFN(i, 0));
157 if (addr == NULL|| readl(addr) != val1) { 157 if (addr == NULL|| readl(addr) != val1) {
158 set_bit(i + 32*k, fallback_slots); 158 set_bit(i + 32*k, fallback_slots);
159 printk(KERN_NOTICE 159 printk(KERN_NOTICE "PCI: No mmconfig possible"
160 "PCI: No mmconfig possible on device %x:%x\n", 160 " on device %02x:%02x\n", k, i);
161 k, i);
162 } 161 }
163 } 162 }
164 } 163 }
165} 164}
166 165
167void __init pci_mmcfg_init(void) 166static __init void pci_mmcfg_insert_resources(void)
167{
168#define PCI_MMCFG_RESOURCE_NAME_LEN 19
169 int i;
170 struct resource *res;
171 char *names;
172 unsigned num_buses;
173
174 res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res),
175 pci_mmcfg_config_num, GFP_KERNEL);
176
177 if (!res) {
178 printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n");
179 return;
180 }
181
182 names = (void *)&res[pci_mmcfg_config_num];
183 for (i = 0; i < pci_mmcfg_config_num; i++, res++) {
184 num_buses = pci_mmcfg_config[i].end_bus_number -
185 pci_mmcfg_config[i].start_bus_number + 1;
186 res->name = names;
187 snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u",
188 pci_mmcfg_config[i].pci_segment_group_number);
189 res->start = pci_mmcfg_config[i].base_address;
190 res->end = res->start + (num_buses << 20) - 1;
191 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
192 insert_resource(&iomem_resource, res);
193 names += PCI_MMCFG_RESOURCE_NAME_LEN;
194 }
195}
196
197void __init pci_mmcfg_init(int type)
168{ 198{
169 int i; 199 int i;
170 200
@@ -177,7 +207,9 @@ void __init pci_mmcfg_init(void)
177 (pci_mmcfg_config[0].base_address == 0)) 207 (pci_mmcfg_config[0].base_address == 0))
178 return; 208 return;
179 209
180 if (!e820_all_mapped(pci_mmcfg_config[0].base_address, 210 /* Only do this check when type 1 works. If it doesn't work
211 assume we run on a Mac and always use MCFG */
212 if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].base_address,
181 pci_mmcfg_config[0].base_address + MMCONFIG_APER_MIN, 213 pci_mmcfg_config[0].base_address + MMCONFIG_APER_MIN,
182 E820_RESERVED)) { 214 E820_RESERVED)) {
183 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %x is not E820-reserved\n", 215 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %x is not E820-reserved\n",
@@ -186,7 +218,6 @@ void __init pci_mmcfg_init(void)
186 return; 218 return;
187 } 219 }
188 220
189 /* RED-PEN i386 doesn't do _nocache right now */
190 pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL); 221 pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL);
191 if (pci_mmcfg_virt == NULL) { 222 if (pci_mmcfg_virt == NULL) {
192 printk("PCI: Can not allocate memory for mmconfig structures\n"); 223 printk("PCI: Can not allocate memory for mmconfig structures\n");
@@ -205,6 +236,7 @@ void __init pci_mmcfg_init(void)
205 } 236 }
206 237
207 unreachable_devices(); 238 unreachable_devices();
239 pci_mmcfg_insert_resources();
208 240
209 raw_pci_ops = &pci_mmcfg; 241 raw_pci_ops = &pci_mmcfg;
210 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; 242 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;