aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64
diff options
context:
space:
mode:
author <jgarzik@pretzel.yyz.us>2005-05-26 13:03:24 -0400
committerJeff Garzik <jgarzik@pobox.com>2005-05-26 13:03:24 -0400
commit8973a585aec125beb2a3de50bb491004299f53d5 (patch)
tree3f069a9c7eff2c916e02427fd9800ce2b55a4a90 /arch/x86_64
parent907f4678c114a125fe4584758681c31bf3d627da (diff)
parentbef9c558841604116704e10b3d9ff3dbf4939423 (diff)
Automatic merge of rsync://rsync.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git branch HEAD
Diffstat (limited to 'arch/x86_64')
-rw-r--r--arch/x86_64/Kconfig14
-rw-r--r--arch/x86_64/defconfig58
-rw-r--r--arch/x86_64/kernel/Makefile1
-rw-r--r--arch/x86_64/kernel/apic.c5
-rw-r--r--arch/x86_64/kernel/entry.S11
-rw-r--r--arch/x86_64/kernel/io_apic.c81
-rw-r--r--arch/x86_64/kernel/mpparse.c22
-rw-r--r--arch/x86_64/kernel/nmi.c248
-rw-r--r--arch/x86_64/kernel/pmtimer.c101
-rw-r--r--arch/x86_64/kernel/ptrace.c17
-rw-r--r--arch/x86_64/kernel/setup.c30
-rw-r--r--arch/x86_64/kernel/signal.c4
-rw-r--r--arch/x86_64/kernel/smpboot.c263
-rw-r--r--arch/x86_64/kernel/time.c62
-rw-r--r--arch/x86_64/kernel/traps.c2
-rw-r--r--arch/x86_64/kernel/vsyscall.c5
-rw-r--r--arch/x86_64/kernel/x8664_ksyms.c3
-rw-r--r--arch/x86_64/mm/fault.c11
-rw-r--r--arch/x86_64/mm/ioremap.c29
19 files changed, 645 insertions, 322 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 44ee7f6acf7b..82cb2a3f127a 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -303,6 +303,20 @@ config HPET_TIMER
303 as it is off-chip. You can find the HPET spec at 303 as it is off-chip. You can find the HPET spec at
304 <http://www.intel.com/labs/platcomp/hpet/hpetspec.htm>. 304 <http://www.intel.com/labs/platcomp/hpet/hpetspec.htm>.
305 305
306config X86_PM_TIMER
307 bool "PM timer"
308 default y
309 help
310 Support the ACPI PM timer for time keeping. This is slow,
311 but is useful on some chipsets without HPET on systems with more
312 than one CPU. On a single processor or single socket multi core
313 system it is normally not required.
314 When the PM timer is active 64bit vsyscalls are disabled
315 and should not be enabled (/proc/sys/kernel/vsyscall64 should
316 not be changed).
317 The kernel selects the PM timer only as a last resort, so it is
318 useful to enable just in case.
319
306config HPET_EMULATE_RTC 320config HPET_EMULATE_RTC
307 bool "Provide RTC interrupt" 321 bool "Provide RTC interrupt"
308 depends on HPET_TIMER && RTC=y 322 depends on HPET_TIMER && RTC=y
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index 9ce51dee30b3..569595b74c7c 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.11-bk7 3# Linux kernel version: 2.6.12-rc4
4# Sat Mar 12 23:43:44 2005 4# Fri May 13 06:39:11 2005
5# 5#
6CONFIG_X86_64=y 6CONFIG_X86_64=y
7CONFIG_64BIT=y 7CONFIG_64BIT=y
@@ -11,8 +11,6 @@ CONFIG_RWSEM_GENERIC_SPINLOCK=y
11CONFIG_GENERIC_CALIBRATE_DELAY=y 11CONFIG_GENERIC_CALIBRATE_DELAY=y
12CONFIG_X86_CMPXCHG=y 12CONFIG_X86_CMPXCHG=y
13CONFIG_EARLY_PRINTK=y 13CONFIG_EARLY_PRINTK=y
14CONFIG_HPET_TIMER=y
15CONFIG_HPET_EMULATE_RTC=y
16CONFIG_GENERIC_ISA_DMA=y 14CONFIG_GENERIC_ISA_DMA=y
17CONFIG_GENERIC_IOMAP=y 15CONFIG_GENERIC_IOMAP=y
18 16
@@ -22,6 +20,7 @@ CONFIG_GENERIC_IOMAP=y
22CONFIG_EXPERIMENTAL=y 20CONFIG_EXPERIMENTAL=y
23CONFIG_CLEAN_COMPILE=y 21CONFIG_CLEAN_COMPILE=y
24CONFIG_LOCK_KERNEL=y 22CONFIG_LOCK_KERNEL=y
23CONFIG_INIT_ENV_ARG_LIMIT=32
25 24
26# 25#
27# General setup 26# General setup
@@ -33,7 +32,6 @@ CONFIG_POSIX_MQUEUE=y
33# CONFIG_BSD_PROCESS_ACCT is not set 32# CONFIG_BSD_PROCESS_ACCT is not set
34CONFIG_SYSCTL=y 33CONFIG_SYSCTL=y
35# CONFIG_AUDIT is not set 34# CONFIG_AUDIT is not set
36CONFIG_LOG_BUF_SHIFT=18
37# CONFIG_HOTPLUG is not set 35# CONFIG_HOTPLUG is not set
38CONFIG_KOBJECT_UEVENT=y 36CONFIG_KOBJECT_UEVENT=y
39CONFIG_IKCONFIG=y 37CONFIG_IKCONFIG=y
@@ -43,10 +41,11 @@ CONFIG_IKCONFIG_PROC=y
43CONFIG_KALLSYMS=y 41CONFIG_KALLSYMS=y
44CONFIG_KALLSYMS_ALL=y 42CONFIG_KALLSYMS_ALL=y
45# CONFIG_KALLSYMS_EXTRA_PASS is not set 43# CONFIG_KALLSYMS_EXTRA_PASS is not set
44CONFIG_PRINTK=y
45CONFIG_BUG=y
46CONFIG_BASE_FULL=y 46CONFIG_BASE_FULL=y
47CONFIG_FUTEX=y 47CONFIG_FUTEX=y
48CONFIG_EPOLL=y 48CONFIG_EPOLL=y
49# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
50CONFIG_SHMEM=y 49CONFIG_SHMEM=y
51CONFIG_CC_ALIGN_FUNCTIONS=0 50CONFIG_CC_ALIGN_FUNCTIONS=0
52CONFIG_CC_ALIGN_LABELS=0 51CONFIG_CC_ALIGN_LABELS=0
@@ -93,6 +92,9 @@ CONFIG_DISCONTIGMEM=y
93CONFIG_NUMA=y 92CONFIG_NUMA=y
94CONFIG_HAVE_DEC_LOCK=y 93CONFIG_HAVE_DEC_LOCK=y
95CONFIG_NR_CPUS=8 94CONFIG_NR_CPUS=8
95CONFIG_HPET_TIMER=y
96CONFIG_X86_PM_TIMER=y
97CONFIG_HPET_EMULATE_RTC=y
96CONFIG_GART_IOMMU=y 98CONFIG_GART_IOMMU=y
97CONFIG_SWIOTLB=y 99CONFIG_SWIOTLB=y
98CONFIG_X86_MCE=y 100CONFIG_X86_MCE=y
@@ -100,6 +102,7 @@ CONFIG_X86_MCE_INTEL=y
100CONFIG_SECCOMP=y 102CONFIG_SECCOMP=y
101CONFIG_GENERIC_HARDIRQS=y 103CONFIG_GENERIC_HARDIRQS=y
102CONFIG_GENERIC_IRQ_PROBE=y 104CONFIG_GENERIC_IRQ_PROBE=y
105CONFIG_ISA_DMA_API=y
103 106
104# 107#
105# Power management options 108# Power management options
@@ -129,7 +132,7 @@ CONFIG_ACPI_NUMA=y
129# CONFIG_ACPI_IBM is not set 132# CONFIG_ACPI_IBM is not set
130CONFIG_ACPI_TOSHIBA=y 133CONFIG_ACPI_TOSHIBA=y
131CONFIG_ACPI_BLACKLIST_YEAR=2001 134CONFIG_ACPI_BLACKLIST_YEAR=2001
132CONFIG_ACPI_DEBUG=y 135# CONFIG_ACPI_DEBUG is not set
133CONFIG_ACPI_BUS=y 136CONFIG_ACPI_BUS=y
134CONFIG_ACPI_EC=y 137CONFIG_ACPI_EC=y
135CONFIG_ACPI_POWER=y 138CONFIG_ACPI_POWER=y
@@ -141,6 +144,7 @@ CONFIG_ACPI_SYSTEM=y
141# CPU Frequency scaling 144# CPU Frequency scaling
142# 145#
143CONFIG_CPU_FREQ=y 146CONFIG_CPU_FREQ=y
147CONFIG_CPU_FREQ_TABLE=y
144# CONFIG_CPU_FREQ_DEBUG is not set 148# CONFIG_CPU_FREQ_DEBUG is not set
145CONFIG_CPU_FREQ_STAT=y 149CONFIG_CPU_FREQ_STAT=y
146# CONFIG_CPU_FREQ_STAT_DETAILS is not set 150# CONFIG_CPU_FREQ_STAT_DETAILS is not set
@@ -150,7 +154,6 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
150# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set 154# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
151CONFIG_CPU_FREQ_GOV_USERSPACE=y 155CONFIG_CPU_FREQ_GOV_USERSPACE=y
152CONFIG_CPU_FREQ_GOV_ONDEMAND=y 156CONFIG_CPU_FREQ_GOV_ONDEMAND=y
153CONFIG_CPU_FREQ_TABLE=y
154 157
155# 158#
156# CPUFreq processor drivers 159# CPUFreq processor drivers
@@ -164,6 +167,7 @@ CONFIG_X86_ACPI_CPUFREQ=y
164# shared options 167# shared options
165# 168#
166CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y 169CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y
170# CONFIG_X86_SPEEDSTEP_LIB is not set
167 171
168# 172#
169# Bus options (PCI etc.) 173# Bus options (PCI etc.)
@@ -172,9 +176,11 @@ CONFIG_PCI=y
172CONFIG_PCI_DIRECT=y 176CONFIG_PCI_DIRECT=y
173CONFIG_PCI_MMCONFIG=y 177CONFIG_PCI_MMCONFIG=y
174CONFIG_UNORDERED_IO=y 178CONFIG_UNORDERED_IO=y
179# CONFIG_PCIEPORTBUS is not set
175CONFIG_PCI_MSI=y 180CONFIG_PCI_MSI=y
176# CONFIG_PCI_LEGACY_PROC is not set 181# CONFIG_PCI_LEGACY_PROC is not set
177# CONFIG_PCI_NAMES is not set 182# CONFIG_PCI_NAMES is not set
183# CONFIG_PCI_DEBUG is not set
178 184
179# 185#
180# PCCARD (PCMCIA/CardBus) support 186# PCCARD (PCMCIA/CardBus) support
@@ -182,10 +188,6 @@ CONFIG_PCI_MSI=y
182# CONFIG_PCCARD is not set 188# CONFIG_PCCARD is not set
183 189
184# 190#
185# PC-card bridges
186#
187
188#
189# PCI Hotplug Support 191# PCI Hotplug Support
190# 192#
191# CONFIG_HOTPLUG_PCI is not set 193# CONFIG_HOTPLUG_PCI is not set
@@ -254,7 +256,7 @@ CONFIG_LBD=y
254# IO Schedulers 256# IO Schedulers
255# 257#
256CONFIG_IOSCHED_NOOP=y 258CONFIG_IOSCHED_NOOP=y
257CONFIG_IOSCHED_AS=y 259# CONFIG_IOSCHED_AS is not set
258CONFIG_IOSCHED_DEADLINE=y 260CONFIG_IOSCHED_DEADLINE=y
259CONFIG_IOSCHED_CFQ=y 261CONFIG_IOSCHED_CFQ=y
260# CONFIG_ATA_OVER_ETH is not set 262# CONFIG_ATA_OVER_ETH is not set
@@ -308,7 +310,8 @@ CONFIG_BLK_DEV_AMD74XX=y
308CONFIG_BLK_DEV_PIIX=y 310CONFIG_BLK_DEV_PIIX=y
309# CONFIG_BLK_DEV_NS87415 is not set 311# CONFIG_BLK_DEV_NS87415 is not set
310# CONFIG_BLK_DEV_PDC202XX_OLD is not set 312# CONFIG_BLK_DEV_PDC202XX_OLD is not set
311# CONFIG_BLK_DEV_PDC202XX_NEW is not set 313CONFIG_BLK_DEV_PDC202XX_NEW=y
314# CONFIG_PDC202XX_FORCE is not set
312# CONFIG_BLK_DEV_SVWKS is not set 315# CONFIG_BLK_DEV_SVWKS is not set
313# CONFIG_BLK_DEV_SIIMAGE is not set 316# CONFIG_BLK_DEV_SIIMAGE is not set
314# CONFIG_BLK_DEV_SIS5513 is not set 317# CONFIG_BLK_DEV_SIS5513 is not set
@@ -353,7 +356,7 @@ CONFIG_BLK_DEV_SD=y
353# 356#
354# SCSI low-level drivers 357# SCSI low-level drivers
355# 358#
356CONFIG_BLK_DEV_3W_XXXX_RAID=y 359# CONFIG_BLK_DEV_3W_XXXX_RAID is not set
357# CONFIG_SCSI_3W_9XXX is not set 360# CONFIG_SCSI_3W_9XXX is not set
358# CONFIG_SCSI_ACARD is not set 361# CONFIG_SCSI_ACARD is not set
359# CONFIG_SCSI_AACRAID is not set 362# CONFIG_SCSI_AACRAID is not set
@@ -384,7 +387,6 @@ CONFIG_SCSI_SATA_VIA=y
384# CONFIG_SCSI_BUSLOGIC is not set 387# CONFIG_SCSI_BUSLOGIC is not set
385# CONFIG_SCSI_DMX3191D is not set 388# CONFIG_SCSI_DMX3191D is not set
386# CONFIG_SCSI_EATA is not set 389# CONFIG_SCSI_EATA is not set
387# CONFIG_SCSI_EATA_PIO is not set
388# CONFIG_SCSI_FUTURE_DOMAIN is not set 390# CONFIG_SCSI_FUTURE_DOMAIN is not set
389# CONFIG_SCSI_GDTH is not set 391# CONFIG_SCSI_GDTH is not set
390# CONFIG_SCSI_IPS is not set 392# CONFIG_SCSI_IPS is not set
@@ -392,7 +394,6 @@ CONFIG_SCSI_SATA_VIA=y
392# CONFIG_SCSI_INIA100 is not set 394# CONFIG_SCSI_INIA100 is not set
393# CONFIG_SCSI_SYM53C8XX_2 is not set 395# CONFIG_SCSI_SYM53C8XX_2 is not set
394# CONFIG_SCSI_IPR is not set 396# CONFIG_SCSI_IPR is not set
395# CONFIG_SCSI_QLOGIC_ISP is not set
396# CONFIG_SCSI_QLOGIC_FC is not set 397# CONFIG_SCSI_QLOGIC_FC is not set
397# CONFIG_SCSI_QLOGIC_1280 is not set 398# CONFIG_SCSI_QLOGIC_1280 is not set
398CONFIG_SCSI_QLA2XXX=y 399CONFIG_SCSI_QLA2XXX=y
@@ -401,6 +402,7 @@ CONFIG_SCSI_QLA2XXX=y
401# CONFIG_SCSI_QLA2300 is not set 402# CONFIG_SCSI_QLA2300 is not set
402# CONFIG_SCSI_QLA2322 is not set 403# CONFIG_SCSI_QLA2322 is not set
403# CONFIG_SCSI_QLA6312 is not set 404# CONFIG_SCSI_QLA6312 is not set
405# CONFIG_SCSI_LPFC is not set
404# CONFIG_SCSI_DC395x is not set 406# CONFIG_SCSI_DC395x is not set
405# CONFIG_SCSI_DC390T is not set 407# CONFIG_SCSI_DC390T is not set
406# CONFIG_SCSI_DEBUG is not set 408# CONFIG_SCSI_DEBUG is not set
@@ -437,7 +439,6 @@ CONFIG_NET=y
437# 439#
438CONFIG_PACKET=y 440CONFIG_PACKET=y
439# CONFIG_PACKET_MMAP is not set 441# CONFIG_PACKET_MMAP is not set
440# CONFIG_NETLINK_DEV is not set
441CONFIG_UNIX=y 442CONFIG_UNIX=y
442# CONFIG_NET_KEY is not set 443# CONFIG_NET_KEY is not set
443CONFIG_INET=y 444CONFIG_INET=y
@@ -502,7 +503,7 @@ CONFIG_NETDEVICES=y
502# CONFIG_DUMMY is not set 503# CONFIG_DUMMY is not set
503# CONFIG_BONDING is not set 504# CONFIG_BONDING is not set
504# CONFIG_EQUALIZER is not set 505# CONFIG_EQUALIZER is not set
505# CONFIG_TUN is not set 506CONFIG_TUN=y
506 507
507# 508#
508# ARCnet devices 509# ARCnet devices
@@ -525,8 +526,7 @@ CONFIG_MII=y
525# CONFIG_HP100 is not set 526# CONFIG_HP100 is not set
526CONFIG_NET_PCI=y 527CONFIG_NET_PCI=y
527# CONFIG_PCNET32 is not set 528# CONFIG_PCNET32 is not set
528CONFIG_AMD8111_ETH=y 529# CONFIG_AMD8111_ETH is not set
529# CONFIG_AMD8111E_NAPI is not set
530# CONFIG_ADAPTEC_STARFIRE is not set 530# CONFIG_ADAPTEC_STARFIRE is not set
531# CONFIG_B44 is not set 531# CONFIG_B44 is not set
532CONFIG_FORCEDETH=y 532CONFIG_FORCEDETH=y
@@ -536,7 +536,7 @@ CONFIG_FORCEDETH=y
536# CONFIG_FEALNX is not set 536# CONFIG_FEALNX is not set
537# CONFIG_NATSEMI is not set 537# CONFIG_NATSEMI is not set
538# CONFIG_NE2K_PCI is not set 538# CONFIG_NE2K_PCI is not set
539CONFIG_8139CP=m 539CONFIG_8139CP=y
540CONFIG_8139TOO=y 540CONFIG_8139TOO=y
541# CONFIG_8139TOO_PIO is not set 541# CONFIG_8139TOO_PIO is not set
542# CONFIG_8139TOO_TUNE_TWISTER is not set 542# CONFIG_8139TOO_TUNE_TWISTER is not set
@@ -671,6 +671,7 @@ CONFIG_SERIAL_8250_NR_UARTS=4
671# 671#
672CONFIG_SERIAL_CORE=y 672CONFIG_SERIAL_CORE=y
673CONFIG_SERIAL_CORE_CONSOLE=y 673CONFIG_SERIAL_CORE_CONSOLE=y
674# CONFIG_SERIAL_JSM is not set
674CONFIG_UNIX98_PTYS=y 675CONFIG_UNIX98_PTYS=y
675CONFIG_LEGACY_PTYS=y 676CONFIG_LEGACY_PTYS=y
676CONFIG_LEGACY_PTY_COUNT=256 677CONFIG_LEGACY_PTY_COUNT=256
@@ -696,6 +697,7 @@ CONFIG_RTC=y
696# 697#
697CONFIG_AGP=y 698CONFIG_AGP=y
698CONFIG_AGP_AMD64=y 699CONFIG_AGP_AMD64=y
700CONFIG_AGP_INTEL=y
699# CONFIG_DRM is not set 701# CONFIG_DRM is not set
700# CONFIG_MWAVE is not set 702# CONFIG_MWAVE is not set
701CONFIG_RAW_DRIVER=y 703CONFIG_RAW_DRIVER=y
@@ -703,7 +705,7 @@ CONFIG_HPET=y
703# CONFIG_HPET_RTC_IRQ is not set 705# CONFIG_HPET_RTC_IRQ is not set
704CONFIG_HPET_MMAP=y 706CONFIG_HPET_MMAP=y
705CONFIG_MAX_RAW_DEVS=256 707CONFIG_MAX_RAW_DEVS=256
706CONFIG_HANGCHECK_TIMER=y 708# CONFIG_HANGCHECK_TIMER is not set
707 709
708# 710#
709# TPM devices 711# TPM devices
@@ -786,6 +788,8 @@ CONFIG_SOUND_ICH=y
786# 788#
787# USB support 789# USB support
788# 790#
791CONFIG_USB_ARCH_HAS_HCD=y
792CONFIG_USB_ARCH_HAS_OHCI=y
789CONFIG_USB=y 793CONFIG_USB=y
790# CONFIG_USB_DEBUG is not set 794# CONFIG_USB_DEBUG is not set
791 795
@@ -797,8 +801,6 @@ CONFIG_USB_DEVICEFS=y
797# CONFIG_USB_DYNAMIC_MINORS is not set 801# CONFIG_USB_DYNAMIC_MINORS is not set
798# CONFIG_USB_SUSPEND is not set 802# CONFIG_USB_SUSPEND is not set
799# CONFIG_USB_OTG is not set 803# CONFIG_USB_OTG is not set
800CONFIG_USB_ARCH_HAS_HCD=y
801CONFIG_USB_ARCH_HAS_OHCI=y
802 804
803# 805#
804# USB Host Controller Drivers 806# USB Host Controller Drivers
@@ -826,7 +828,6 @@ CONFIG_USB_PRINTER=y
826# 828#
827CONFIG_USB_STORAGE=y 829CONFIG_USB_STORAGE=y
828# CONFIG_USB_STORAGE_DEBUG is not set 830# CONFIG_USB_STORAGE_DEBUG is not set
829# CONFIG_USB_STORAGE_RW_DETECT is not set
830# CONFIG_USB_STORAGE_DATAFAB is not set 831# CONFIG_USB_STORAGE_DATAFAB is not set
831# CONFIG_USB_STORAGE_FREECOM is not set 832# CONFIG_USB_STORAGE_FREECOM is not set
832# CONFIG_USB_STORAGE_ISD200 is not set 833# CONFIG_USB_STORAGE_ISD200 is not set
@@ -965,7 +966,7 @@ CONFIG_AUTOFS_FS=y
965# CD-ROM/DVD Filesystems 966# CD-ROM/DVD Filesystems
966# 967#
967CONFIG_ISO9660_FS=y 968CONFIG_ISO9660_FS=y
968# CONFIG_JOLIET is not set 969CONFIG_JOLIET=y
969# CONFIG_ZISOFS is not set 970# CONFIG_ZISOFS is not set
970# CONFIG_UDF_FS is not set 971# CONFIG_UDF_FS is not set
971 972
@@ -1092,9 +1093,10 @@ CONFIG_OPROFILE=y
1092# 1093#
1093# Kernel hacking 1094# Kernel hacking
1094# 1095#
1096# CONFIG_PRINTK_TIME is not set
1095CONFIG_DEBUG_KERNEL=y 1097CONFIG_DEBUG_KERNEL=y
1096CONFIG_MAGIC_SYSRQ=y 1098CONFIG_MAGIC_SYSRQ=y
1097# CONFIG_PRINTK_TIME is not set 1099CONFIG_LOG_BUF_SHIFT=18
1098# CONFIG_SCHEDSTATS is not set 1100# CONFIG_SCHEDSTATS is not set
1099# CONFIG_DEBUG_SLAB is not set 1101# CONFIG_DEBUG_SLAB is not set
1100# CONFIG_DEBUG_SPINLOCK is not set 1102# CONFIG_DEBUG_SPINLOCK is not set
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 0a3318e08ab6..5ca4a4598fda 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o
28obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o 28obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o
29obj-$(CONFIG_SWIOTLB) += swiotlb.o 29obj-$(CONFIG_SWIOTLB) += swiotlb.o
30obj-$(CONFIG_KPROBES) += kprobes.o 30obj-$(CONFIG_KPROBES) += kprobes.o
31obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o
31 32
32obj-$(CONFIG_MODULES) += module.o 33obj-$(CONFIG_MODULES) += module.o
33 34
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 7e13545748e0..f8e6cc4fecd4 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -33,6 +33,7 @@
33#include <asm/mpspec.h> 33#include <asm/mpspec.h>
34#include <asm/pgalloc.h> 34#include <asm/pgalloc.h>
35#include <asm/mach_apic.h> 35#include <asm/mach_apic.h>
36#include <asm/nmi.h>
36 37
37int apic_verbosity; 38int apic_verbosity;
38 39
@@ -925,7 +926,7 @@ __init int oem_force_hpet_timer(void)
925 unsigned id; 926 unsigned id;
926 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); 927 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
927 928
928 bitmap_empty(clustermap, NUM_APIC_CLUSTERS); 929 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
929 930
930 for (i = 0; i < NR_CPUS; i++) { 931 for (i = 0; i < NR_CPUS; i++) {
931 id = bios_cpu_apicid[i]; 932 id = bios_cpu_apicid[i];
@@ -1056,7 +1057,7 @@ int __init APIC_init_uniprocessor (void)
1056 nr_ioapics = 0; 1057 nr_ioapics = 0;
1057#endif 1058#endif
1058 setup_boot_APIC_clock(); 1059 setup_boot_APIC_clock();
1059 1060 check_nmi_watchdog();
1060 return 0; 1061 return 0;
1061} 1062}
1062 1063
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 1086b5fcac21..28817490fdc6 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -220,13 +220,18 @@ sysret_careful:
220 jmp sysret_check 220 jmp sysret_check
221 221
222 /* Handle a signal */ 222 /* Handle a signal */
223 /* edx: work flags (arg3) */
224sysret_signal: 223sysret_signal:
225 sti 224 sti
225 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
226 jz 1f
227
228 /* Really a signal */
229 /* edx: work flags (arg3) */
226 leaq do_notify_resume(%rip),%rax 230 leaq do_notify_resume(%rip),%rax
227 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 231 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
228 xorl %esi,%esi # oldset -> arg2 232 xorl %esi,%esi # oldset -> arg2
229 call ptregscall_common 233 call ptregscall_common
2341: movl $_TIF_NEED_RESCHED,%edi
230 jmp sysret_check 235 jmp sysret_check
231 236
232 /* Do syscall tracing */ 237 /* Do syscall tracing */
@@ -484,6 +489,8 @@ retint_careful:
484 jmp retint_check 489 jmp retint_check
485 490
486retint_signal: 491retint_signal:
492 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
493 jz retint_swapgs
487 sti 494 sti
488 SAVE_REST 495 SAVE_REST
489 movq $-1,ORIG_RAX(%rsp) 496 movq $-1,ORIG_RAX(%rsp)
@@ -492,8 +499,8 @@ retint_signal:
492 call do_notify_resume 499 call do_notify_resume
493 RESTORE_REST 500 RESTORE_REST
494 cli 501 cli
502 movl $_TIF_NEED_RESCHED,%edi
495 GET_THREAD_INFO(%rcx) 503 GET_THREAD_INFO(%rcx)
496 movl $_TIF_WORK_MASK,%edi
497 jmp retint_check 504 jmp retint_check
498 505
499#ifdef CONFIG_PREEMPT 506#ifdef CONFIG_PREEMPT
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 60be58617eb9..80e9b498c443 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -42,6 +42,8 @@
42 42
43int sis_apic_bug; /* not actually supported, dummy for compile */ 43int sis_apic_bug; /* not actually supported, dummy for compile */
44 44
45static int no_timer_check;
46
45static DEFINE_SPINLOCK(ioapic_lock); 47static DEFINE_SPINLOCK(ioapic_lock);
46 48
47/* 49/*
@@ -1601,7 +1603,7 @@ static inline void check_timer(void)
1601 * Ok, does IRQ0 through the IOAPIC work? 1603 * Ok, does IRQ0 through the IOAPIC work?
1602 */ 1604 */
1603 unmask_IO_APIC_irq(0); 1605 unmask_IO_APIC_irq(0);
1604 if (timer_irq_works()) { 1606 if (!no_timer_check && timer_irq_works()) {
1605 nmi_watchdog_default(); 1607 nmi_watchdog_default();
1606 if (nmi_watchdog == NMI_IO_APIC) { 1608 if (nmi_watchdog == NMI_IO_APIC) {
1607 disable_8259A_irq(0); 1609 disable_8259A_irq(0);
@@ -1671,6 +1673,13 @@ static inline void check_timer(void)
1671 panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); 1673 panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
1672} 1674}
1673 1675
1676static int __init notimercheck(char *s)
1677{
1678 no_timer_check = 1;
1679 return 1;
1680}
1681__setup("no_timer_check", notimercheck);
1682
1674/* 1683/*
1675 * 1684 *
1676 * IRQ's that are handled by the PIC in the MPS IOAPIC case. 1685 * IRQ's that are handled by the PIC in the MPS IOAPIC case.
@@ -1804,76 +1813,6 @@ device_initcall(ioapic_init_sysfs);
1804 1813
1805#define IO_APIC_MAX_ID 0xFE 1814#define IO_APIC_MAX_ID 0xFE
1806 1815
1807int __init io_apic_get_unique_id (int ioapic, int apic_id)
1808{
1809 union IO_APIC_reg_00 reg_00;
1810 static physid_mask_t apic_id_map;
1811 unsigned long flags;
1812 int i = 0;
1813
1814 /*
1815 * The P4 platform supports up to 256 APIC IDs on two separate APIC
1816 * buses (one for LAPICs, one for IOAPICs), where predecessors only
1817 * supports up to 16 on one shared APIC bus.
1818 *
1819 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
1820 * advantage of new APIC bus architecture.
1821 */
1822
1823 if (physids_empty(apic_id_map))
1824 apic_id_map = phys_cpu_present_map;
1825
1826 spin_lock_irqsave(&ioapic_lock, flags);
1827 reg_00.raw = io_apic_read(ioapic, 0);
1828 spin_unlock_irqrestore(&ioapic_lock, flags);
1829
1830 if (apic_id >= IO_APIC_MAX_ID) {
1831 apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
1832 "%d\n", ioapic, apic_id, reg_00.bits.ID);
1833 apic_id = reg_00.bits.ID;
1834 }
1835
1836 /*
1837 * Every APIC in a system must have a unique ID or we get lots of nice
1838 * 'stuck on smp_invalidate_needed IPI wait' messages.
1839 */
1840 if (physid_isset(apic_id, apic_id_map)) {
1841
1842 for (i = 0; i < IO_APIC_MAX_ID; i++) {
1843 if (!physid_isset(i, apic_id_map))
1844 break;
1845 }
1846
1847 if (i == IO_APIC_MAX_ID)
1848 panic("Max apic_id exceeded!\n");
1849
1850 apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
1851 "trying %d\n", ioapic, apic_id, i);
1852
1853 apic_id = i;
1854 }
1855
1856 physid_set(apic_id, apic_id_map);
1857
1858 if (reg_00.bits.ID != apic_id) {
1859 reg_00.bits.ID = apic_id;
1860
1861 spin_lock_irqsave(&ioapic_lock, flags);
1862 io_apic_write(ioapic, 0, reg_00.raw);
1863 reg_00.raw = io_apic_read(ioapic, 0);
1864 spin_unlock_irqrestore(&ioapic_lock, flags);
1865
1866 /* Sanity check */
1867 if (reg_00.bits.ID != apic_id)
1868 panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic);
1869 }
1870
1871 apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
1872
1873 return apic_id;
1874}
1875
1876
1877int __init io_apic_get_version (int ioapic) 1816int __init io_apic_get_version (int ioapic)
1878{ 1817{
1879 union IO_APIC_reg_01 reg_01; 1818 union IO_APIC_reg_01 reg_01;
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 7ec031c6ca10..f86d9db94bfc 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -107,6 +107,7 @@ static int __init mpf_checksum(unsigned char *mp, int len)
107static void __init MP_processor_info (struct mpc_config_processor *m) 107static void __init MP_processor_info (struct mpc_config_processor *m)
108{ 108{
109 int ver; 109 int ver;
110 static int found_bsp=0;
110 111
111 if (!(m->mpc_cpuflag & CPU_ENABLED)) 112 if (!(m->mpc_cpuflag & CPU_ENABLED))
112 return; 113 return;
@@ -126,11 +127,6 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
126 " Processor ignored.\n", NR_CPUS); 127 " Processor ignored.\n", NR_CPUS);
127 return; 128 return;
128 } 129 }
129 if (num_processors >= maxcpus) {
130 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
131 " Processor ignored.\n", maxcpus);
132 return;
133 }
134 130
135 num_processors++; 131 num_processors++;
136 132
@@ -150,7 +146,19 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
150 ver = 0x10; 146 ver = 0x10;
151 } 147 }
152 apic_version[m->mpc_apicid] = ver; 148 apic_version[m->mpc_apicid] = ver;
153 bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; 149 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
150 /*
151 * bios_cpu_apicid is required to have processors listed
152 * in same order as logical cpu numbers. Hence the first
153 * entry is BSP, and so on.
154 */
155 bios_cpu_apicid[0] = m->mpc_apicid;
156 x86_cpu_to_apicid[0] = m->mpc_apicid;
157 found_bsp = 1;
158 } else {
159 bios_cpu_apicid[num_processors - found_bsp] = m->mpc_apicid;
160 x86_cpu_to_apicid[num_processors - found_bsp] = m->mpc_apicid;
161 }
154} 162}
155 163
156static void __init MP_bus_info (struct mpc_config_bus *m) 164static void __init MP_bus_info (struct mpc_config_bus *m)
@@ -759,7 +767,7 @@ void __init mp_register_ioapic (
759 mp_ioapics[idx].mpc_apicaddr = address; 767 mp_ioapics[idx].mpc_apicaddr = address;
760 768
761 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 769 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
762 mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); 770 mp_ioapics[idx].mpc_apicid = id;
763 mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); 771 mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
764 772
765 /* 773 /*
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 61de0b34a01e..31c0f2e6ac91 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -33,6 +33,7 @@
33#include <asm/msr.h> 33#include <asm/msr.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/kdebug.h> 35#include <asm/kdebug.h>
36#include <asm/local.h>
36 37
37/* 38/*
38 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: 39 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
@@ -59,7 +60,8 @@ int panic_on_timeout;
59 60
60unsigned int nmi_watchdog = NMI_DEFAULT; 61unsigned int nmi_watchdog = NMI_DEFAULT;
61static unsigned int nmi_hz = HZ; 62static unsigned int nmi_hz = HZ;
62unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ 63static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
64static unsigned int nmi_p4_cccr_val;
63 65
64/* Note that these events don't tick when the CPU idles. This means 66/* Note that these events don't tick when the CPU idles. This means
65 the frequency varies with CPU load. */ 67 the frequency varies with CPU load. */
@@ -71,61 +73,87 @@ unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
71#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 73#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
72#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 74#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
73 75
74#define P6_EVNTSEL0_ENABLE (1 << 22) 76#define MSR_P4_MISC_ENABLE 0x1A0
75#define P6_EVNTSEL_INT (1 << 20) 77#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
76#define P6_EVNTSEL_OS (1 << 17) 78#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12)
77#define P6_EVNTSEL_USR (1 << 16) 79#define MSR_P4_PERFCTR0 0x300
78#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 80#define MSR_P4_CCCR0 0x360
79#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED 81#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
82#define P4_ESCR_OS (1<<3)
83#define P4_ESCR_USR (1<<2)
84#define P4_CCCR_OVF_PMI0 (1<<26)
85#define P4_CCCR_OVF_PMI1 (1<<27)
86#define P4_CCCR_THRESHOLD(N) ((N)<<20)
87#define P4_CCCR_COMPLEMENT (1<<19)
88#define P4_CCCR_COMPARE (1<<18)
89#define P4_CCCR_REQUIRED (3<<16)
90#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
91#define P4_CCCR_ENABLE (1<<12)
92/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
93 CRU_ESCR0 (with any non-null event selector) through a complemented
94 max threshold. [IA32-Vol3, Section 14.9.9] */
95#define MSR_P4_IQ_COUNTER0 0x30C
96#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
97#define P4_NMI_IQ_CCCR0 \
98 (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
99 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
100
101static __init inline int nmi_known_cpu(void)
102{
103 switch (boot_cpu_data.x86_vendor) {
104 case X86_VENDOR_AMD:
105 return boot_cpu_data.x86 == 15;
106 case X86_VENDOR_INTEL:
107 return boot_cpu_data.x86 == 15;
108 }
109 return 0;
110}
80 111
81/* Run after command line and cpu_init init, but before all other checks */ 112/* Run after command line and cpu_init init, but before all other checks */
82void __init nmi_watchdog_default(void) 113void __init nmi_watchdog_default(void)
83{ 114{
84 if (nmi_watchdog != NMI_DEFAULT) 115 if (nmi_watchdog != NMI_DEFAULT)
85 return; 116 return;
86 117 if (nmi_known_cpu())
87 /* For some reason the IO APIC watchdog doesn't work on the AMD 118 nmi_watchdog = NMI_LOCAL_APIC;
88 8111 chipset. For now switch to local APIC mode using 119 else
89 perfctr0 there. On Intel CPUs we don't have code to handle
90 the perfctr and the IO-APIC seems to work, so use that. */
91
92 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
93 nmi_watchdog = NMI_LOCAL_APIC;
94 printk(KERN_INFO
95 "Using local APIC NMI watchdog using perfctr0\n");
96 } else {
97 printk(KERN_INFO "Using IO APIC NMI watchdog\n");
98 nmi_watchdog = NMI_IO_APIC; 120 nmi_watchdog = NMI_IO_APIC;
99 }
100} 121}
101 122
102/* Why is there no CPUID flag for this? */ 123#ifdef CONFIG_SMP
103static __init int cpu_has_lapic(void) 124/* The performance counters used by NMI_LOCAL_APIC don't trigger when
125 * the CPU is idle. To make sure the NMI watchdog really ticks on all
126 * CPUs during the test make them busy.
127 */
128static __init void nmi_cpu_busy(void *data)
104{ 129{
105 switch (boot_cpu_data.x86_vendor) { 130 volatile int *endflag = data;
106 case X86_VENDOR_INTEL: 131 local_irq_enable();
107 case X86_VENDOR_AMD: 132 /* Intentionally don't use cpu_relax here. This is
108 return boot_cpu_data.x86 >= 6; 133 to make sure that the performance counter really ticks,
109 /* .... add more cpus here or find a different way to figure this out. */ 134 even if there is a simulator or similar that catches the
110 default: 135 pause instruction. On a real HT machine this is fine because
111 return 0; 136 all other CPUs are busy with "useless" delay loops and don't
112 } 137 care if they get somewhat less cycles. */
138 while (*endflag == 0)
139 barrier();
113} 140}
141#endif
114 142
115static int __init check_nmi_watchdog (void) 143int __init check_nmi_watchdog (void)
116{ 144{
117 int counts[NR_CPUS]; 145 volatile int endflag = 0;
146 int *counts;
118 int cpu; 147 int cpu;
119 148
120 if (nmi_watchdog == NMI_NONE) 149 counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
121 return 0; 150 if (!counts)
151 return -1;
122 152
123 if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) { 153 printk(KERN_INFO "testing NMI watchdog ... ");
124 nmi_watchdog = NMI_NONE;
125 return -1;
126 }
127 154
128 printk(KERN_INFO "Testing NMI watchdog ... "); 155 if (nmi_watchdog == NMI_LOCAL_APIC)
156 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
129 157
130 for (cpu = 0; cpu < NR_CPUS; cpu++) 158 for (cpu = 0; cpu < NR_CPUS; cpu++)
131 counts[cpu] = cpu_pda[cpu].__nmi_count; 159 counts[cpu] = cpu_pda[cpu].__nmi_count;
@@ -133,15 +161,22 @@ static int __init check_nmi_watchdog (void)
133 mdelay((10*1000)/nmi_hz); // wait 10 ticks 161 mdelay((10*1000)/nmi_hz); // wait 10 ticks
134 162
135 for (cpu = 0; cpu < NR_CPUS; cpu++) { 163 for (cpu = 0; cpu < NR_CPUS; cpu++) {
164 if (!cpu_online(cpu))
165 continue;
136 if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { 166 if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) {
137 printk("CPU#%d: NMI appears to be stuck (%d)!\n", 167 endflag = 1;
168 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
138 cpu, 169 cpu,
170 counts[cpu],
139 cpu_pda[cpu].__nmi_count); 171 cpu_pda[cpu].__nmi_count);
140 nmi_active = 0; 172 nmi_active = 0;
141 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; 173 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
174 nmi_perfctr_msr = 0;
175 kfree(counts);
142 return -1; 176 return -1;
143 } 177 }
144 } 178 }
179 endflag = 1;
145 printk("OK.\n"); 180 printk("OK.\n");
146 181
147 /* now that we know it works we can reduce NMI frequency to 182 /* now that we know it works we can reduce NMI frequency to
@@ -149,10 +184,9 @@ static int __init check_nmi_watchdog (void)
149 if (nmi_watchdog == NMI_LOCAL_APIC) 184 if (nmi_watchdog == NMI_LOCAL_APIC)
150 nmi_hz = 1; 185 nmi_hz = 1;
151 186
187 kfree(counts);
152 return 0; 188 return 0;
153} 189}
154/* Have this called later during boot so counters are updating */
155late_initcall(check_nmi_watchdog);
156 190
157int __init setup_nmi_watchdog(char *str) 191int __init setup_nmi_watchdog(char *str)
158{ 192{
@@ -170,7 +204,7 @@ int __init setup_nmi_watchdog(char *str)
170 204
171 if (nmi >= NMI_INVALID) 205 if (nmi >= NMI_INVALID)
172 return 0; 206 return 0;
173 nmi_watchdog = nmi; 207 nmi_watchdog = nmi;
174 return 1; 208 return 1;
175} 209}
176 210
@@ -185,7 +219,10 @@ static void disable_lapic_nmi_watchdog(void)
185 wrmsr(MSR_K7_EVNTSEL0, 0, 0); 219 wrmsr(MSR_K7_EVNTSEL0, 0, 0);
186 break; 220 break;
187 case X86_VENDOR_INTEL: 221 case X86_VENDOR_INTEL:
188 wrmsr(MSR_IA32_EVNTSEL0, 0, 0); 222 if (boot_cpu_data.x86 == 15) {
223 wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
224 wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
225 }
189 break; 226 break;
190 } 227 }
191 nmi_active = -1; 228 nmi_active = -1;
@@ -253,7 +290,7 @@ void enable_timer_nmi_watchdog(void)
253 290
254static int nmi_pm_active; /* nmi_active before suspend */ 291static int nmi_pm_active; /* nmi_active before suspend */
255 292
256static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) 293static int lapic_nmi_suspend(struct sys_device *dev, u32 state)
257{ 294{
258 nmi_pm_active = nmi_active; 295 nmi_pm_active = nmi_active;
259 disable_lapic_nmi_watchdog(); 296 disable_lapic_nmi_watchdog();
@@ -300,22 +337,27 @@ late_initcall(init_lapic_nmi_sysfs);
300 * Original code written by Keith Owens. 337 * Original code written by Keith Owens.
301 */ 338 */
302 339
340static void clear_msr_range(unsigned int base, unsigned int n)
341{
342 unsigned int i;
343
344 for(i = 0; i < n; ++i)
345 wrmsr(base+i, 0, 0);
346}
347
303static void setup_k7_watchdog(void) 348static void setup_k7_watchdog(void)
304{ 349{
305 int i; 350 int i;
306 unsigned int evntsel; 351 unsigned int evntsel;
307 352
308 /* No check, so can start with slow frequency */
309 nmi_hz = 1;
310
311 /* XXX should check these in EFER */
312
313 nmi_perfctr_msr = MSR_K7_PERFCTR0; 353 nmi_perfctr_msr = MSR_K7_PERFCTR0;
314 354
315 for(i = 0; i < 4; ++i) { 355 for(i = 0; i < 4; ++i) {
316 /* Simulator may not support it */ 356 /* Simulator may not support it */
317 if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) 357 if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) {
358 nmi_perfctr_msr = 0;
318 return; 359 return;
360 }
319 wrmsrl(MSR_K7_PERFCTR0+i, 0UL); 361 wrmsrl(MSR_K7_PERFCTR0+i, 0UL);
320 } 362 }
321 363
@@ -325,12 +367,54 @@ static void setup_k7_watchdog(void)
325 | K7_NMI_EVENT; 367 | K7_NMI_EVENT;
326 368
327 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); 369 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
328 wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz); 370 wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
329 apic_write(APIC_LVTPC, APIC_DM_NMI); 371 apic_write(APIC_LVTPC, APIC_DM_NMI);
330 evntsel |= K7_EVNTSEL_ENABLE; 372 evntsel |= K7_EVNTSEL_ENABLE;
331 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); 373 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
332} 374}
333 375
376
377static int setup_p4_watchdog(void)
378{
379 unsigned int misc_enable, dummy;
380
381 rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
382 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
383 return 0;
384
385 nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
386 nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
387#ifdef CONFIG_SMP
388 if (smp_num_siblings == 2)
389 nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
390#endif
391
392 if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
393 clear_msr_range(0x3F1, 2);
394 /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
395 docs doesn't fully define it, so leave it alone for now. */
396 if (boot_cpu_data.x86_model >= 0x3) {
397 /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
398 clear_msr_range(0x3A0, 26);
399 clear_msr_range(0x3BC, 3);
400 } else {
401 clear_msr_range(0x3A0, 31);
402 }
403 clear_msr_range(0x3C0, 6);
404 clear_msr_range(0x3C8, 6);
405 clear_msr_range(0x3E0, 2);
406 clear_msr_range(MSR_P4_CCCR0, 18);
407 clear_msr_range(MSR_P4_PERFCTR0, 18);
408
409 wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
410 wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
411 Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000));
412 wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
413 apic_write(APIC_LVTPC, APIC_DM_NMI);
414 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
415 return 1;
416}
417
334void setup_apic_nmi_watchdog(void) 418void setup_apic_nmi_watchdog(void)
335{ 419{
336 switch (boot_cpu_data.x86_vendor) { 420 switch (boot_cpu_data.x86_vendor) {
@@ -341,6 +425,13 @@ void setup_apic_nmi_watchdog(void)
341 return; 425 return;
342 setup_k7_watchdog(); 426 setup_k7_watchdog();
343 break; 427 break;
428 case X86_VENDOR_INTEL:
429 if (boot_cpu_data.x86 != 15)
430 return;
431 if (!setup_p4_watchdog())
432 return;
433 break;
434
344 default: 435 default:
345 return; 436 return;
346 } 437 }
@@ -355,56 +446,67 @@ void setup_apic_nmi_watchdog(void)
355 * 446 *
356 * as these watchdog NMI IRQs are generated on every CPU, we only 447 * as these watchdog NMI IRQs are generated on every CPU, we only
357 * have to check the current processor. 448 * have to check the current processor.
358 *
359 * since NMIs don't listen to _any_ locks, we have to be extremely
360 * careful not to rely on unsafe variables. The printk might lock
361 * up though, so we have to break up any console locks first ...
362 * [when there will be more tty-related locks, break them up
363 * here too!]
364 */ 449 */
365 450
366static unsigned int 451static DEFINE_PER_CPU(unsigned, last_irq_sum);
367 last_irq_sums [NR_CPUS], 452static DEFINE_PER_CPU(local_t, alert_counter);
368 alert_counter [NR_CPUS]; 453static DEFINE_PER_CPU(int, nmi_touch);
369 454
370void touch_nmi_watchdog (void) 455void touch_nmi_watchdog (void)
371{ 456{
372 int i; 457 int i;
373 458
374 /* 459 /*
375 * Just reset the alert counters, (other CPUs might be 460 * Tell other CPUs to reset their alert counters. We cannot
376 * spinning on locks we hold): 461 * do it ourselves because the alert count increase is not
462 * atomic.
377 */ 463 */
378 for (i = 0; i < NR_CPUS; i++) 464 for (i = 0; i < NR_CPUS; i++)
379 alert_counter[i] = 0; 465 per_cpu(nmi_touch, i) = 1;
380} 466}
381 467
382void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) 468void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
383{ 469{
384 int sum, cpu; 470 int sum;
471 int touched = 0;
385 472
386 cpu = safe_smp_processor_id();
387 sum = read_pda(apic_timer_irqs); 473 sum = read_pda(apic_timer_irqs);
388 if (last_irq_sums[cpu] == sum) { 474 if (__get_cpu_var(nmi_touch)) {
475 __get_cpu_var(nmi_touch) = 0;
476 touched = 1;
477 }
478 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
389 /* 479 /*
390 * Ayiee, looks like this CPU is stuck ... 480 * Ayiee, looks like this CPU is stuck ...
391 * wait a few IRQs (5 seconds) before doing the oops ... 481 * wait a few IRQs (5 seconds) before doing the oops ...
392 */ 482 */
393 alert_counter[cpu]++; 483 local_inc(&__get_cpu_var(alert_counter));
394 if (alert_counter[cpu] == 5*nmi_hz) { 484 if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) {
395 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 485 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
396 == NOTIFY_STOP) { 486 == NOTIFY_STOP) {
397 alert_counter[cpu] = 0; 487 local_set(&__get_cpu_var(alert_counter), 0);
398 return; 488 return;
399 } 489 }
400 die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs); 490 die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs);
401 } 491 }
402 } else { 492 } else {
403 last_irq_sums[cpu] = sum; 493 __get_cpu_var(last_irq_sum) = sum;
404 alert_counter[cpu] = 0; 494 local_set(&__get_cpu_var(alert_counter), 0);
405 } 495 }
406 if (nmi_perfctr_msr) 496 if (nmi_perfctr_msr) {
497 if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
498 /*
499 * P4 quirks:
500 * - An overflown perfctr will assert its interrupt
501 * until the OVF flag in its CCCR is cleared.
502 * - LVTPC is masked on interrupt and must be
503 * unmasked by the LVTPC handler.
504 */
505 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
506 apic_write(APIC_LVTPC, APIC_DM_NMI);
507 }
407 wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); 508 wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
509 }
408} 510}
409 511
410static int dummy_nmi_callback(struct pt_regs * regs, int cpu) 512static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c
new file mode 100644
index 000000000000..feb5f108dd26
--- /dev/null
+++ b/arch/x86_64/kernel/pmtimer.c
@@ -0,0 +1,101 @@
1/* Ported over from i386 by AK, original copyright was:
2 *
3 * (C) Dominik Brodowski <linux@brodo.de> 2003
4 *
5 * Driver to use the Power Management Timer (PMTMR) available in some
6 * southbridges as primary timing source for the Linux kernel.
7 *
8 * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
9 * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
10 *
11 * This file is licensed under the GPL v2.
12 *
13 * Dropped all the hardware bug workarounds for now. Hopefully they
14 * are not needed on 64bit chipsets.
15 */
16
17#include <linux/jiffies.h>
18#include <linux/kernel.h>
19#include <linux/time.h>
20#include <linux/init.h>
21#include <linux/cpumask.h>
22#include <asm/io.h>
23#include <asm/proto.h>
24#include <asm/msr.h>
25#include <asm/vsyscall.h>
26
27/* The I/O port the PMTMR resides at.
28 * The location is detected during setup_arch(),
29 * in arch/i386/kernel/acpi/boot.c */
30u32 pmtmr_ioport;
31
32/* value of the Power timer at last timer interrupt */
33static u32 offset_delay;
34static u32 last_pmtmr_tick;
35
36#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
37
38static inline u32 cyc2us(u32 cycles)
39{
40 /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
41 * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
42 *
43 * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
44 * easily be multiplied with 286 (=0x11E) without having to fear
45 * u32 overflows.
46 */
47 cycles *= 286;
48 return (cycles >> 10);
49}
50
51int pmtimer_mark_offset(void)
52{
53 static int first_run = 1;
54 unsigned long tsc;
55 u32 lost;
56
57 u32 tick = inl(pmtmr_ioport);
58 u32 delta;
59
60 delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK);
61
62 last_pmtmr_tick = tick;
63 monotonic_base += delta * NSEC_PER_USEC;
64
65 delta += offset_delay;
66
67 lost = delta / (USEC_PER_SEC / HZ);
68 offset_delay = delta % (USEC_PER_SEC / HZ);
69
70 rdtscll(tsc);
71 vxtime.last_tsc = tsc - offset_delay * cpu_khz;
72
73 /* don't calculate delay for first run,
74 or if we've got less then a tick */
75 if (first_run || (lost < 1)) {
76 first_run = 0;
77 offset_delay = 0;
78 }
79
80 return lost - 1;
81}
82
83unsigned int do_gettimeoffset_pm(void)
84{
85 u32 now, offset, delta = 0;
86
87 offset = last_pmtmr_tick;
88 now = inl(pmtmr_ioport);
89 delta = (now - offset) & ACPI_PM_MASK;
90
91 return offset_delay + cyc2us(delta);
92}
93
94
95static int __init nopmtimer_setup(char *s)
96{
97 pmtmr_ioport = 0;
98 return 0;
99}
100
101__setup("nopmtimer", nopmtimer_setup);
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index e26e86bb56fe..525f6a128a27 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -257,13 +257,13 @@ static int putreg(struct task_struct *child,
257 value &= 0xffff; 257 value &= 0xffff;
258 return 0; 258 return 0;
259 case offsetof(struct user_regs_struct,fs_base): 259 case offsetof(struct user_regs_struct,fs_base):
260 if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) 260 if (value >= TASK_SIZE)
261 return -EIO; 261 return -EIO;
262 child->thread.fs = value; 262 child->thread.fs = value;
263 return 0; 263 return 0;
264 case offsetof(struct user_regs_struct,gs_base): 264 case offsetof(struct user_regs_struct,gs_base):
265 if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) 265 if (value >= TASK_SIZE)
266 return -EIO; 266 return -EIO;
267 child->thread.gs = value; 267 child->thread.gs = value;
268 return 0; 268 return 0;
269 case offsetof(struct user_regs_struct, eflags): 269 case offsetof(struct user_regs_struct, eflags):
@@ -277,6 +277,11 @@ static int putreg(struct task_struct *child,
277 return -EIO; 277 return -EIO;
278 value &= 0xffff; 278 value &= 0xffff;
279 break; 279 break;
280 case offsetof(struct user_regs_struct, rip):
281 /* Check if the new RIP address is canonical */
282 if (value >= TASK_SIZE)
283 return -EIO;
284 break;
280 } 285 }
281 put_stack_long(child, regno - sizeof(struct pt_regs), value); 286 put_stack_long(child, regno - sizeof(struct pt_regs), value);
282 return 0; 287 return 0;
@@ -375,7 +380,7 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data
375 break; 380 break;
376 381
377 switch (addr) { 382 switch (addr) {
378 case 0 ... sizeof(struct user_regs_struct): 383 case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
379 tmp = getreg(child, addr); 384 tmp = getreg(child, addr);
380 break; 385 break;
381 case offsetof(struct user, u_debugreg[0]): 386 case offsetof(struct user, u_debugreg[0]):
@@ -420,7 +425,7 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data
420 break; 425 break;
421 426
422 switch (addr) { 427 switch (addr) {
423 case 0 ... sizeof(struct user_regs_struct): 428 case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
424 ret = putreg(child, addr, data); 429 ret = putreg(child, addr, data);
425 break; 430 break;
426 /* Disallows to set a breakpoint into the vsyscall */ 431 /* Disallows to set a breakpoint into the vsyscall */
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 2129cf9ba6b2..99f038ede23c 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -719,7 +719,6 @@ static void __init display_cacheinfo(struct cpuinfo_x86 *c)
719 } 719 }
720} 720}
721 721
722#ifdef CONFIG_SMP
723/* 722/*
724 * On a AMD dual core setup the lower bits of the APIC id distingush the cores. 723 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
725 * Assumes number of cores is a power of two. 724 * Assumes number of cores is a power of two.
@@ -727,17 +726,26 @@ static void __init display_cacheinfo(struct cpuinfo_x86 *c)
727static void __init amd_detect_cmp(struct cpuinfo_x86 *c) 726static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
728{ 727{
729#ifdef CONFIG_SMP 728#ifdef CONFIG_SMP
730 int cpu = c->x86_apicid; 729 int cpu = smp_processor_id();
731 int node = 0; 730 int node = 0;
731 unsigned bits;
732 if (c->x86_num_cores == 1) 732 if (c->x86_num_cores == 1)
733 return; 733 return;
734 cpu_core_id[cpu] = cpu >> hweight32(c->x86_num_cores - 1); 734
735 bits = 0;
736 while ((1 << bits) < c->x86_num_cores)
737 bits++;
738
739 /* Low order bits define the core id (index of core in socket) */
740 cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1);
741 /* Convert the APIC ID into the socket ID */
742 phys_proc_id[cpu] >>= bits;
735 743
736#ifdef CONFIG_NUMA 744#ifdef CONFIG_NUMA
737 /* When an ACPI SRAT table is available use the mappings from SRAT 745 /* When an ACPI SRAT table is available use the mappings from SRAT
738 instead. */ 746 instead. */
739 if (acpi_numa <= 0) { 747 if (acpi_numa <= 0) {
740 node = cpu_core_id[cpu]; 748 node = phys_proc_id[cpu];
741 if (!node_online(node)) 749 if (!node_online(node))
742 node = first_node(node_online_map); 750 node = first_node(node_online_map);
743 cpu_to_node[cpu] = node; 751 cpu_to_node[cpu] = node;
@@ -745,15 +753,11 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
745 node = cpu_to_node[cpu]; 753 node = cpu_to_node[cpu];
746 } 754 }
747#endif 755#endif
756
748 printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n", 757 printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n",
749 cpu, c->x86_num_cores, node, cpu_core_id[cpu]); 758 cpu, c->x86_num_cores, node, cpu_core_id[cpu]);
750#endif 759#endif
751} 760}
752#else
753static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
754{
755}
756#endif
757 761
758static int __init init_amd(struct cpuinfo_x86 *c) 762static int __init init_amd(struct cpuinfo_x86 *c)
759{ 763{
@@ -925,7 +929,6 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c)
925 c->x86_clflush_size = 64; 929 c->x86_clflush_size = 64;
926 c->x86_cache_alignment = c->x86_clflush_size; 930 c->x86_cache_alignment = c->x86_clflush_size;
927 c->x86_num_cores = 1; 931 c->x86_num_cores = 1;
928 c->x86_apicid = c == &boot_cpu_data ? 0 : c - cpu_data;
929 c->extended_cpuid_level = 0; 932 c->extended_cpuid_level = 0;
930 memset(&c->x86_capability, 0, sizeof c->x86_capability); 933 memset(&c->x86_capability, 0, sizeof c->x86_capability);
931 934
@@ -954,11 +957,14 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c)
954 } 957 }
955 if (c->x86_capability[0] & (1<<19)) 958 if (c->x86_capability[0] & (1<<19))
956 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; 959 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
957 c->x86_apicid = misc >> 24;
958 } else { 960 } else {
959 /* Have CPUID level 0 only - unheard of */ 961 /* Have CPUID level 0 only - unheard of */
960 c->x86 = 4; 962 c->x86 = 4;
961 } 963 }
964
965#ifdef CONFIG_SMP
966 phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
967#endif
962} 968}
963 969
964/* 970/*
@@ -1088,7 +1094,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
1088 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1094 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1089 1095
1090 /* Other (Linux-defined) */ 1096 /* Other (Linux-defined) */
1091 "cxmmx", NULL, "cyrix_arr", "centaur_mcr", "k8c+", 1097 "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,
1092 "constant_tsc", NULL, NULL, 1098 "constant_tsc", NULL, NULL,
1093 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1099 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1094 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1100 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index d439ced150c6..3fdcdba0fec5 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -452,7 +452,9 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
452 regs->rip -= 2; 452 regs->rip -= 2;
453 } 453 }
454 if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) { 454 if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) {
455 regs->rax = __NR_restart_syscall; 455 regs->rax = test_thread_flag(TIF_IA32) ?
456 __NR_ia32_restart_syscall :
457 __NR_restart_syscall;
456 regs->rip -= 2; 458 regs->rip -= 2;
457 } 459 }
458 } 460 }
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 73f7e8b9543a..f1ec0f345941 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -56,6 +56,7 @@
56#include <asm/kdebug.h> 56#include <asm/kdebug.h>
57#include <asm/tlbflush.h> 57#include <asm/tlbflush.h>
58#include <asm/proto.h> 58#include <asm/proto.h>
59#include <asm/nmi.h>
59 60
60/* Change for real CPU hotplug. Note other files need to be fixed 61/* Change for real CPU hotplug. Note other files need to be fixed
61 first too. */ 62 first too. */
@@ -93,6 +94,7 @@ int smp_threads_ready;
93 94
94cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; 95cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
95cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; 96cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
97EXPORT_SYMBOL(cpu_core_map);
96 98
97/* 99/*
98 * Trampoline 80x86 program as an array. 100 * Trampoline 80x86 program as an array.
@@ -125,96 +127,210 @@ static void __cpuinit smp_store_cpu_info(int id)
125 127
126 *c = boot_cpu_data; 128 *c = boot_cpu_data;
127 identify_cpu(c); 129 identify_cpu(c);
130 print_cpu_info(c);
128} 131}
129 132
130/* 133/*
131 * Synchronize TSCs of CPUs 134 * New Funky TSC sync algorithm borrowed from IA64.
135 * Main advantage is that it doesn't reset the TSCs fully and
136 * in general looks more robust and it works better than my earlier
137 * attempts. I believe it was written by David Mosberger. Some minor
138 * adjustments for x86-64 by me -AK
132 * 139 *
133 * This new algorithm is less accurate than the old "zero TSCs" 140 * Original comment reproduced below.
134 * one, but we cannot zero TSCs anymore in the new hotplug CPU 141 *
135 * model. 142 * Synchronize TSC of the current (slave) CPU with the TSC of the
143 * MASTER CPU (normally the time-keeper CPU). We use a closed loop to
144 * eliminate the possibility of unaccounted-for errors (such as
145 * getting a machine check in the middle of a calibration step). The
146 * basic idea is for the slave to ask the master what itc value it has
147 * and to read its own itc before and after the master responds. Each
148 * iteration gives us three timestamps:
149 *
150 * slave master
151 *
152 * t0 ---\
153 * ---\
154 * --->
155 * tm
156 * /---
157 * /---
158 * t1 <---
159 *
160 *
161 * The goal is to adjust the slave's TSC such that tm falls exactly
162 * half-way between t0 and t1. If we achieve this, the clocks are
163 * synchronized provided the interconnect between the slave and the
164 * master is symmetric. Even if the interconnect were asymmetric, we
165 * would still know that the synchronization error is smaller than the
166 * roundtrip latency (t0 - t1).
167 *
168 * When the interconnect is quiet and symmetric, this lets us
169 * synchronize the TSC to within one or two cycles. However, we can
170 * only *guarantee* that the synchronization is accurate to within a
171 * round-trip time, which is typically in the range of several hundred
172 * cycles (e.g., ~500 cycles). In practice, this means that the TSCs
173 * are usually almost perfectly synchronized, but we shouldn't assume
174 * that the accuracy is much better than half a micro second or so.
175 *
176 * [there are other errors like the latency of RDTSC and of the
177 * WRMSR. These can also account to hundreds of cycles. So it's
178 * probably worse. It claims 153 cycles error on a dual Opteron,
179 * but I suspect the numbers are actually somewhat worse -AK]
136 */ 180 */
137 181
138static atomic_t __cpuinitdata tsc_flag; 182#define MASTER 0
183#define SLAVE (SMP_CACHE_BYTES/8)
184
185/* Intentionally don't use cpu_relax() while TSC synchronization
186 because we don't want to go into funky power save modi or cause
187 hypervisors to schedule us away. Going to sleep would likely affect
188 latency and low latency is the primary objective here. -AK */
189#define no_cpu_relax() barrier()
190
139static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); 191static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
140static unsigned long long __cpuinitdata bp_tsc, ap_tsc; 192static volatile __cpuinitdata unsigned long go[SLAVE + 1];
193static int notscsync __cpuinitdata;
194
195#undef DEBUG_TSC_SYNC
141 196
142#define NR_LOOPS 5 197#define NUM_ROUNDS 64 /* magic value */
198#define NUM_ITERS 5 /* likewise */
143 199
144static void __cpuinit sync_tsc_bp_init(int init) 200/* Callback on boot CPU */
201static __cpuinit void sync_master(void *arg)
145{ 202{
146 if (init) 203 unsigned long flags, i;
147 _raw_spin_lock(&tsc_sync_lock); 204
148 else 205 if (smp_processor_id() != boot_cpu_id)
149 _raw_spin_unlock(&tsc_sync_lock); 206 return;
150 atomic_set(&tsc_flag, 0); 207
208 go[MASTER] = 0;
209
210 local_irq_save(flags);
211 {
212 for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
213 while (!go[MASTER])
214 no_cpu_relax();
215 go[MASTER] = 0;
216 rdtscll(go[SLAVE]);
217 }
218 }
219 local_irq_restore(flags);
151} 220}
152 221
153/* 222/*
154 * Synchronize TSC on AP with BP. 223 * Return the number of cycles by which our tsc differs from the tsc
224 * on the master (time-keeper) CPU. A positive number indicates our
225 * tsc is ahead of the master, negative that it is behind.
155 */ 226 */
156static void __cpuinit __sync_tsc_ap(void) 227static inline long
228get_delta(long *rt, long *master)
157{ 229{
158 if (!cpu_has_tsc) 230 unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
159 return; 231 unsigned long tcenter, t0, t1, tm;
160 Dprintk("AP %d syncing TSC\n", smp_processor_id()); 232 int i;
161 233
162 while (atomic_read(&tsc_flag) != 0) 234 for (i = 0; i < NUM_ITERS; ++i) {
163 cpu_relax(); 235 rdtscll(t0);
164 atomic_inc(&tsc_flag); 236 go[MASTER] = 1;
165 mb(); 237 while (!(tm = go[SLAVE]))
166 _raw_spin_lock(&tsc_sync_lock); 238 no_cpu_relax();
167 wrmsrl(MSR_IA32_TSC, bp_tsc); 239 go[SLAVE] = 0;
168 _raw_spin_unlock(&tsc_sync_lock); 240 rdtscll(t1);
169 rdtscll(ap_tsc); 241
170 mb(); 242 if (t1 - t0 < best_t1 - best_t0)
171 atomic_inc(&tsc_flag); 243 best_t0 = t0, best_t1 = t1, best_tm = tm;
172 mb(); 244 }
245
246 *rt = best_t1 - best_t0;
247 *master = best_tm - best_t0;
248
249 /* average best_t0 and best_t1 without overflow: */
250 tcenter = (best_t0/2 + best_t1/2);
251 if (best_t0 % 2 + best_t1 % 2 == 2)
252 ++tcenter;
253 return tcenter - best_tm;
173} 254}
174 255
175static void __cpuinit sync_tsc_ap(void) 256static __cpuinit void sync_tsc(void)
176{ 257{
177 int i; 258 int i, done = 0;
178 for (i = 0; i < NR_LOOPS; i++) 259 long delta, adj, adjust_latency = 0;
179 __sync_tsc_ap(); 260 unsigned long flags, rt, master_time_stamp, bound;
261#if DEBUG_TSC_SYNC
262 static struct syncdebug {
263 long rt; /* roundtrip time */
264 long master; /* master's timestamp */
265 long diff; /* difference between midpoint and master's timestamp */
266 long lat; /* estimate of tsc adjustment latency */
267 } t[NUM_ROUNDS] __cpuinitdata;
268#endif
269
270 go[MASTER] = 1;
271
272 smp_call_function(sync_master, NULL, 1, 0);
273
274 while (go[MASTER]) /* wait for master to be ready */
275 no_cpu_relax();
276
277 spin_lock_irqsave(&tsc_sync_lock, flags);
278 {
279 for (i = 0; i < NUM_ROUNDS; ++i) {
280 delta = get_delta(&rt, &master_time_stamp);
281 if (delta == 0) {
282 done = 1; /* let's lock on to this... */
283 bound = rt;
284 }
285
286 if (!done) {
287 unsigned long t;
288 if (i > 0) {
289 adjust_latency += -delta;
290 adj = -delta + adjust_latency/4;
291 } else
292 adj = -delta;
293
294 rdtscll(t);
295 wrmsrl(MSR_IA32_TSC, t + adj);
296 }
297#if DEBUG_TSC_SYNC
298 t[i].rt = rt;
299 t[i].master = master_time_stamp;
300 t[i].diff = delta;
301 t[i].lat = adjust_latency/4;
302#endif
303 }
304 }
305 spin_unlock_irqrestore(&tsc_sync_lock, flags);
306
307#if DEBUG_TSC_SYNC
308 for (i = 0; i < NUM_ROUNDS; ++i)
309 printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
310 t[i].rt, t[i].master, t[i].diff, t[i].lat);
311#endif
312
313 printk(KERN_INFO
314 "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
315 "maxerr %lu cycles)\n",
316 smp_processor_id(), boot_cpu_id, delta, rt);
180} 317}
181 318
182/* 319static void __cpuinit tsc_sync_wait(void)
183 * Synchronize TSC from BP to AP.
184 */
185static void __cpuinit __sync_tsc_bp(int cpu)
186{ 320{
187 if (!cpu_has_tsc) 321 if (notscsync || !cpu_has_tsc)
188 return; 322 return;
189 323 printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(),
190 /* Wait for AP */ 324 boot_cpu_id);
191 while (atomic_read(&tsc_flag) == 0) 325 sync_tsc();
192 cpu_relax();
193 /* Save BPs TSC */
194 sync_core();
195 rdtscll(bp_tsc);
196 /* Don't do the sync core here to avoid too much latency. */
197 mb();
198 /* Start the AP */
199 _raw_spin_unlock(&tsc_sync_lock);
200 /* Wait for AP again */
201 while (atomic_read(&tsc_flag) < 2)
202 cpu_relax();
203 rdtscl(bp_tsc);
204 barrier();
205} 326}
206 327
207static void __cpuinit sync_tsc_bp(int cpu) 328static __init int notscsync_setup(char *s)
208{ 329{
209 int i; 330 notscsync = 1;
210 for (i = 0; i < NR_LOOPS - 1; i++) { 331 return 0;
211 __sync_tsc_bp(cpu);
212 sync_tsc_bp_init(1);
213 }
214 __sync_tsc_bp(cpu);
215 printk(KERN_INFO "Synced TSC of CPU %d difference %Ld\n",
216 cpu, ap_tsc - bp_tsc);
217} 332}
333__setup("notscsync", notscsync_setup);
218 334
219static atomic_t init_deasserted __cpuinitdata; 335static atomic_t init_deasserted __cpuinitdata;
220 336
@@ -315,11 +431,6 @@ void __cpuinit start_secondary(void)
315 cpu_init(); 431 cpu_init();
316 smp_callin(); 432 smp_callin();
317 433
318 /*
319 * Synchronize the TSC with the BP
320 */
321 sync_tsc_ap();
322
323 /* otherwise gcc will move up the smp_processor_id before the cpu_init */ 434 /* otherwise gcc will move up the smp_processor_id before the cpu_init */
324 barrier(); 435 barrier();
325 436
@@ -334,7 +445,6 @@ void __cpuinit start_secondary(void)
334 enable_8259A_irq(0); 445 enable_8259A_irq(0);
335 } 446 }
336 447
337
338 enable_APIC_timer(); 448 enable_APIC_timer();
339 449
340 /* 450 /*
@@ -343,6 +453,11 @@ void __cpuinit start_secondary(void)
343 cpu_set(smp_processor_id(), cpu_online_map); 453 cpu_set(smp_processor_id(), cpu_online_map);
344 mb(); 454 mb();
345 455
456 /* Wait for TSC sync to not schedule things before.
457 We still process interrupts, which could see an inconsistent
458 time in that window unfortunately. */
459 tsc_sync_wait();
460
346 cpu_idle(); 461 cpu_idle();
347} 462}
348 463
@@ -531,7 +646,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
531 printk("failed fork for CPU %d\n", cpu); 646 printk("failed fork for CPU %d\n", cpu);
532 return PTR_ERR(idle); 647 return PTR_ERR(idle);
533 } 648 }
534 x86_cpu_to_apicid[cpu] = apicid;
535 649
536 cpu_pda[cpu].pcurrent = idle; 650 cpu_pda[cpu].pcurrent = idle;
537 651
@@ -600,8 +714,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
600 714
601 if (cpu_isset(cpu, cpu_callin_map)) { 715 if (cpu_isset(cpu, cpu_callin_map)) {
602 /* number CPUs logically, starting from 1 (BSP is 0) */ 716 /* number CPUs logically, starting from 1 (BSP is 0) */
603 Dprintk("OK.\n");
604 print_cpu_info(&cpu_data[cpu]);
605 Dprintk("CPU has booted.\n"); 717 Dprintk("CPU has booted.\n");
606 } else { 718 } else {
607 boot_error = 1; 719 boot_error = 1;
@@ -842,7 +954,6 @@ void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
842 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); 954 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
843 /* Or can we switch back to PIC here? */ 955 /* Or can we switch back to PIC here? */
844 } 956 }
845 x86_cpu_to_apicid[0] = boot_cpu_id;
846 957
847 /* 958 /*
848 * Now start the IO-APICs 959 * Now start the IO-APICs
@@ -889,18 +1000,14 @@ int __cpuinit __cpu_up(unsigned int cpu)
889 printk("__cpu_up: bad cpu %d\n", cpu); 1000 printk("__cpu_up: bad cpu %d\n", cpu);
890 return -EINVAL; 1001 return -EINVAL;
891 } 1002 }
892 sync_tsc_bp_init(1);
893 1003
894 /* Boot it! */ 1004 /* Boot it! */
895 err = do_boot_cpu(cpu, apicid); 1005 err = do_boot_cpu(cpu, apicid);
896 if (err < 0) { 1006 if (err < 0) {
897 sync_tsc_bp_init(0);
898 Dprintk("do_boot_cpu failed %d\n", err); 1007 Dprintk("do_boot_cpu failed %d\n", err);
899 return err; 1008 return err;
900 } 1009 }
901 1010
902 sync_tsc_bp(cpu);
903
904 /* Unleash the CPU! */ 1011 /* Unleash the CPU! */
905 Dprintk("waiting for cpu %d\n", cpu); 1012 Dprintk("waiting for cpu %d\n", cpu);
906 1013
@@ -923,4 +1030,6 @@ void __cpuinit smp_cpus_done(unsigned int max_cpus)
923 1030
924 detect_siblings(); 1031 detect_siblings();
925 time_init_gtod(); 1032 time_init_gtod();
1033
1034 check_nmi_watchdog();
926} 1035}
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 0e3b60e1ac5b..735b6767c8ed 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -26,6 +26,8 @@
26#include <linux/sysdev.h> 26#include <linux/sysdev.h>
27#include <linux/bcd.h> 27#include <linux/bcd.h>
28#include <linux/kallsyms.h> 28#include <linux/kallsyms.h>
29#include <linux/acpi.h>
30#include <acpi/achware.h> /* for PM timer frequency */
29#include <asm/8253pit.h> 31#include <asm/8253pit.h>
30#include <asm/pgtable.h> 32#include <asm/pgtable.h>
31#include <asm/vsyscall.h> 33#include <asm/vsyscall.h>
@@ -396,6 +398,10 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
396 (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; 398 (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick;
397 399
398 vxtime.last = offset; 400 vxtime.last = offset;
401#ifdef CONFIG_X86_PM_TIMER
402 } else if (vxtime.mode == VXTIME_PMTMR) {
403 lost = pmtimer_mark_offset();
404#endif
399 } else { 405 } else {
400 offset = (((tsc - vxtime.last_tsc) * 406 offset = (((tsc - vxtime.last_tsc) *
401 vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); 407 vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ);
@@ -898,6 +904,13 @@ void __init time_init(void)
898 hpet_period; 904 hpet_period;
899 cpu_khz = hpet_calibrate_tsc(); 905 cpu_khz = hpet_calibrate_tsc();
900 timename = "HPET"; 906 timename = "HPET";
907#ifdef CONFIG_X86_PM_TIMER
908 } else if (pmtmr_ioport) {
909 vxtime_hz = PM_TIMER_FREQUENCY;
910 timename = "PM";
911 pit_init();
912 cpu_khz = pit_calibrate_tsc();
913#endif
901 } else { 914 } else {
902 pit_init(); 915 pit_init();
903 cpu_khz = pit_calibrate_tsc(); 916 cpu_khz = pit_calibrate_tsc();
@@ -923,35 +936,50 @@ void __init time_init(void)
923} 936}
924 937
925/* 938/*
939 * Make an educated guess if the TSC is trustworthy and synchronized
940 * over all CPUs.
941 */
942static __init int unsynchronized_tsc(void)
943{
944#ifdef CONFIG_SMP
945 if (oem_force_hpet_timer())
946 return 1;
947 /* Intel systems are normally all synchronized. Exceptions
948 are handled in the OEM check above. */
949 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
950 return 0;
951 /* All in a single socket - should be synchronized */
952 if (cpus_weight(cpu_core_map[0]) == num_online_cpus())
953 return 0;
954#endif
955 /* Assume multi socket systems are not synchronized */
956 return num_online_cpus() > 1;
957}
958
959/*
926 * Decide after all CPUs are booted what mode gettimeofday should use. 960 * Decide after all CPUs are booted what mode gettimeofday should use.
927 */ 961 */
928void __init time_init_gtod(void) 962void __init time_init_gtod(void)
929{ 963{
930 char *timetype; 964 char *timetype;
931 965
932 /* 966 if (unsynchronized_tsc())
933 * AMD systems with more than one CPU don't have fully synchronized
934 * TSCs. Always use HPET gettimeofday for these, although it is slower.
935 * Intel SMP systems usually have synchronized TSCs, so use always
936 * the TSC.
937 *
938 * Exceptions:
939 * IBM Summit2 checked by oem_force_hpet_timer().
940 * AMD dual core may also not need HPET. Check me.
941 *
942 * Can be turned off with "notsc".
943 */
944 if (num_online_cpus() > 1 &&
945 boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
946 notsc = 1;
947 /* Some systems will want to disable TSC and use HPET. */
948 if (oem_force_hpet_timer())
949 notsc = 1; 967 notsc = 1;
950 if (vxtime.hpet_address && notsc) { 968 if (vxtime.hpet_address && notsc) {
951 timetype = "HPET"; 969 timetype = "HPET";
952 vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; 970 vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
953 vxtime.mode = VXTIME_HPET; 971 vxtime.mode = VXTIME_HPET;
954 do_gettimeoffset = do_gettimeoffset_hpet; 972 do_gettimeoffset = do_gettimeoffset_hpet;
973#ifdef CONFIG_X86_PM_TIMER
974 /* Using PM for gettimeofday is quite slow, but we have no other
975 choice because the TSC is too unreliable on some systems. */
976 } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) {
977 timetype = "PM";
978 do_gettimeoffset = do_gettimeoffset_pm;
979 vxtime.mode = VXTIME_PMTMR;
980 sysctl_vsyscall = 0;
981 printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n");
982#endif
955 } else { 983 } else {
956 timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC"; 984 timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC";
957 vxtime.mode = VXTIME_TSC; 985 vxtime.mode = VXTIME_TSC;
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 65a37f52c56e..3dfec8fdabcd 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -332,10 +332,12 @@ void handle_BUG(struct pt_regs *regs)
332 printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", f.filename, f.line); 332 printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", f.filename, f.line);
333} 333}
334 334
335#ifdef CONFIG_BUG
335void out_of_line_bug(void) 336void out_of_line_bug(void)
336{ 337{
337 BUG(); 338 BUG();
338} 339}
340#endif
339 341
340static DEFINE_SPINLOCK(die_lock); 342static DEFINE_SPINLOCK(die_lock);
341static int die_owner = -1; 343static int die_owner = -1;
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index b4b8dc59663a..2e5734425949 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -65,7 +65,7 @@ static force_inline void do_vgettimeofday(struct timeval * tv)
65 usec = (__xtime.tv_nsec / 1000) + 65 usec = (__xtime.tv_nsec / 1000) +
66 (__jiffies - __wall_jiffies) * (1000000 / HZ); 66 (__jiffies - __wall_jiffies) * (1000000 / HZ);
67 67
68 if (__vxtime.mode == VXTIME_TSC) { 68 if (__vxtime.mode != VXTIME_HPET) {
69 sync_core(); 69 sync_core();
70 rdtscll(t); 70 rdtscll(t);
71 if (t < __vxtime.last_tsc) 71 if (t < __vxtime.last_tsc)
@@ -217,8 +217,9 @@ static int __init vsyscall_init(void)
217 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); 217 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
218 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); 218 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
219 map_vsyscall(); 219 map_vsyscall();
220 sysctl_vsyscall = 1; 220#ifdef CONFIG_SYSCTL
221 register_sysctl_table(kernel_root_table2, 0); 221 register_sysctl_table(kernel_root_table2, 0);
222#endif
222 return 0; 223 return 0;
223} 224}
224 225
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index a43dedb58fa2..68ec03070e5a 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -193,8 +193,9 @@ EXPORT_SYMBOL(smp_num_siblings);
193extern void do_softirq_thunk(void); 193extern void do_softirq_thunk(void);
194EXPORT_SYMBOL(do_softirq_thunk); 194EXPORT_SYMBOL(do_softirq_thunk);
195 195
196void out_of_line_bug(void); 196#ifdef CONFIG_BUG
197EXPORT_SYMBOL(out_of_line_bug); 197EXPORT_SYMBOL(out_of_line_bug);
198#endif
198 199
199EXPORT_SYMBOL(init_level4_pgt); 200EXPORT_SYMBOL(init_level4_pgt);
200 201
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index e03309216764..5d6b2114cc9a 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -234,6 +234,8 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
234 234
235/* 235/*
236 * Handle a fault on the vmalloc or module mapping area 236 * Handle a fault on the vmalloc or module mapping area
237 *
238 * This assumes no large pages in there.
237 */ 239 */
238static int vmalloc_fault(unsigned long address) 240static int vmalloc_fault(unsigned long address)
239{ 241{
@@ -272,7 +274,10 @@ static int vmalloc_fault(unsigned long address)
272 if (!pte_present(*pte_ref)) 274 if (!pte_present(*pte_ref))
273 return -1; 275 return -1;
274 pte = pte_offset_kernel(pmd, address); 276 pte = pte_offset_kernel(pmd, address);
275 if (!pte_present(*pte) || pte_page(*pte) != pte_page(*pte_ref)) 277 /* Don't use pte_page here, because the mappings can point
278 outside mem_map, and the NUMA hash lookup cannot handle
279 that. */
280 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
276 BUG(); 281 BUG();
277 __flush_tlb_all(); 282 __flush_tlb_all();
278 return 0; 283 return 0;
@@ -346,7 +351,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
346 * protection error (error_code & 1) == 0. 351 * protection error (error_code & 1) == 0.
347 */ 352 */
348 if (unlikely(address >= TASK_SIZE)) { 353 if (unlikely(address >= TASK_SIZE)) {
349 if (!(error_code & 5)) { 354 if (!(error_code & 5) &&
355 ((address >= VMALLOC_START && address < VMALLOC_END) ||
356 (address >= MODULES_VADDR && address < MODULES_END))) {
350 if (vmalloc_fault(address) < 0) 357 if (vmalloc_fault(address) < 0)
351 goto bad_area_nosemaphore; 358 goto bad_area_nosemaphore;
352 return; 359 return;
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c
index 74ec8554b195..58aac23760ef 100644
--- a/arch/x86_64/mm/ioremap.c
+++ b/arch/x86_64/mm/ioremap.c
@@ -133,7 +133,7 @@ ioremap_change_attr(unsigned long phys_addr, unsigned long size,
133 unsigned long flags) 133 unsigned long flags)
134{ 134{
135 int err = 0; 135 int err = 0;
136 if (flags && phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) { 136 if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) {
137 unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 137 unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
138 unsigned long vaddr = (unsigned long) __va(phys_addr); 138 unsigned long vaddr = (unsigned long) __va(phys_addr);
139 139
@@ -214,7 +214,7 @@ void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned l
214 remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); 214 remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
215 return NULL; 215 return NULL;
216 } 216 }
217 if (ioremap_change_attr(phys_addr, size, flags) < 0) { 217 if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) {
218 area->flags &= 0xffffff; 218 area->flags &= 0xffffff;
219 vunmap(addr); 219 vunmap(addr);
220 return NULL; 220 return NULL;
@@ -251,7 +251,7 @@ void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
251 251
252void iounmap(volatile void __iomem *addr) 252void iounmap(volatile void __iomem *addr)
253{ 253{
254 struct vm_struct *p, **pprev; 254 struct vm_struct *p;
255 255
256 if (addr <= high_memory) 256 if (addr <= high_memory)
257 return; 257 return;
@@ -260,24 +260,11 @@ void iounmap(volatile void __iomem *addr)
260 return; 260 return;
261 261
262 write_lock(&vmlist_lock); 262 write_lock(&vmlist_lock);
263 for (p = vmlist, pprev = &vmlist; p != NULL; pprev = &p->next, p = *pprev) 263 p = __remove_vm_area((void *)((unsigned long)addr & PAGE_MASK));
264 if (p->addr == (void *)(PAGE_MASK & (unsigned long)addr)) 264 if (!p)
265 break; 265 printk("iounmap: bad address %p\n", addr);
266 if (!p) { 266 else if (p->flags >> 20)
267 printk("__iounmap: bad address %p\n", addr); 267 ioremap_change_attr(p->phys_addr, p->size, 0);
268 goto out_unlock;
269 }
270 *pprev = p->next;
271 unmap_vm_area(p);
272 if ((p->flags >> 20) &&
273 p->phys_addr + p->size - 1 < virt_to_phys(high_memory)) {
274 /* p->size includes the guard page, but cpa doesn't like that */
275 change_page_attr(virt_to_page(__va(p->phys_addr)),
276 p->size >> PAGE_SHIFT,
277 PAGE_KERNEL);
278 global_flush_tlb();
279 }
280out_unlock:
281 write_unlock(&vmlist_lock); 268 write_unlock(&vmlist_lock);
282 kfree(p); 269 kfree(p);
283} 270}