aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64')
-rw-r--r--arch/x86_64/Kconfig10
-rw-r--r--arch/x86_64/Makefile24
-rw-r--r--arch/x86_64/boot/video.S5
-rw-r--r--arch/x86_64/defconfig57
-rw-r--r--arch/x86_64/ia32/Makefile4
-rw-r--r--arch/x86_64/ia32/ia32entry.S24
-rw-r--r--arch/x86_64/kernel/Makefile4
-rw-r--r--arch/x86_64/kernel/aperture.c2
-rw-r--r--arch/x86_64/kernel/e820.c36
-rw-r--r--arch/x86_64/kernel/entry.S28
-rw-r--r--arch/x86_64/kernel/kprobes.c10
-rw-r--r--arch/x86_64/kernel/mce.c10
-rw-r--r--arch/x86_64/kernel/mce_amd.c2
-rw-r--r--arch/x86_64/kernel/nmi.c7
-rw-r--r--arch/x86_64/kernel/pci-dma.c2
-rw-r--r--arch/x86_64/kernel/pci-gart.c4
-rw-r--r--arch/x86_64/kernel/process.c14
-rw-r--r--arch/x86_64/kernel/ptrace.c5
-rw-r--r--arch/x86_64/kernel/setup.c8
-rw-r--r--arch/x86_64/kernel/time.c4
-rw-r--r--arch/x86_64/kernel/traps.c5
-rw-r--r--arch/x86_64/kernel/vmlinux.lds.S2
-rw-r--r--arch/x86_64/kernel/x8664_ksyms.c3
-rw-r--r--arch/x86_64/mm/init.c37
-rw-r--r--arch/x86_64/mm/numa.c48
-rw-r--r--arch/x86_64/mm/srat.c170
-rw-r--r--arch/x86_64/pci/mmconfig.c53
27 files changed, 448 insertions, 130 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 4310b4a311a5..408d44a59756 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -136,6 +136,11 @@ config X86_L1_CACHE_SHIFT
136 default "7" if GENERIC_CPU || MPSC 136 default "7" if GENERIC_CPU || MPSC
137 default "6" if MK8 137 default "6" if MK8
138 138
139config X86_INTERNODE_CACHE_BYTES
140 int
141 default "4096" if X86_VSMP
142 default X86_L1_CACHE_BYTES if !X86_VSMP
143
139config X86_TSC 144config X86_TSC
140 bool 145 bool
141 default y 146 default y
@@ -283,6 +288,11 @@ config K8_NUMA
283 Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA 288 Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
284 instead, which also takes priority if both are compiled in. 289 instead, which also takes priority if both are compiled in.
285 290
291config NODES_SHIFT
292 int
293 default "6"
294 depends on NEED_MULTIPLE_NODES
295
286# Dummy CONFIG option to select ACPI_NUMA from drivers/acpi/Kconfig. 296# Dummy CONFIG option to select ACPI_NUMA from drivers/acpi/Kconfig.
287 297
288config X86_64_ACPI_NUMA 298config X86_64_ACPI_NUMA
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 585fd4a559c8..e573e2ab5510 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -24,37 +24,37 @@
24LDFLAGS := -m elf_x86_64 24LDFLAGS := -m elf_x86_64
25OBJCOPYFLAGS := -O binary -R .note -R .comment -S 25OBJCOPYFLAGS := -O binary -R .note -R .comment -S
26LDFLAGS_vmlinux := 26LDFLAGS_vmlinux :=
27
28CHECKFLAGS += -D__x86_64__ -m64 27CHECKFLAGS += -D__x86_64__ -m64
29 28
29cflags-y :=
30cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) 30cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
31cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) 31cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
32cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) 32cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
33CFLAGS += $(cflags-y)
34 33
35CFLAGS += -m64 34cflags-y += -m64
36CFLAGS += -mno-red-zone 35cflags-y += -mno-red-zone
37CFLAGS += -mcmodel=kernel 36cflags-y += -mcmodel=kernel
38CFLAGS += -pipe 37cflags-y += -pipe
39cflags-$(CONFIG_REORDER) += -ffunction-sections 38cflags-$(CONFIG_REORDER) += -ffunction-sections
40# this makes reading assembly source easier, but produces worse code 39# this makes reading assembly source easier, but produces worse code
41# actually it makes the kernel smaller too. 40# actually it makes the kernel smaller too.
42CFLAGS += -fno-reorder-blocks 41cflags-y += -fno-reorder-blocks
43CFLAGS += -Wno-sign-compare 42cflags-y += -Wno-sign-compare
44ifneq ($(CONFIG_UNWIND_INFO),y) 43ifneq ($(CONFIG_UNWIND_INFO),y)
45CFLAGS += -fno-asynchronous-unwind-tables 44cflags-y += -fno-asynchronous-unwind-tables
46endif 45endif
47ifneq ($(CONFIG_DEBUG_INFO),y) 46ifneq ($(CONFIG_DEBUG_INFO),y)
48# -fweb shrinks the kernel a bit, but the difference is very small 47# -fweb shrinks the kernel a bit, but the difference is very small
49# it also messes up debugging, so don't use it for now. 48# it also messes up debugging, so don't use it for now.
50#CFLAGS += $(call cc-option,-fweb) 49#cflags-y += $(call cc-option,-fweb)
51endif 50endif
52# -funit-at-a-time shrinks the kernel .text considerably 51# -funit-at-a-time shrinks the kernel .text considerably
53# unfortunately it makes reading oopses harder. 52# unfortunately it makes reading oopses harder.
54CFLAGS += $(call cc-option,-funit-at-a-time) 53cflags-y += $(call cc-option,-funit-at-a-time)
55# prevent gcc from generating any FP code by mistake 54# prevent gcc from generating any FP code by mistake
56CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) 55cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
57 56
57CFLAGS += $(cflags-y)
58AFLAGS += -m64 58AFLAGS += -m64
59 59
60head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o 60head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o
diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S
index 0587477c99f2..32327bb37aff 100644
--- a/arch/x86_64/boot/video.S
+++ b/arch/x86_64/boot/video.S
@@ -97,6 +97,7 @@
97#define PARAM_VESAPM_OFF 0x30 97#define PARAM_VESAPM_OFF 0x30
98#define PARAM_LFB_PAGES 0x32 98#define PARAM_LFB_PAGES 0x32
99#define PARAM_VESA_ATTRIB 0x34 99#define PARAM_VESA_ATTRIB 0x34
100#define PARAM_CAPABILITIES 0x36
100 101
101/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */ 102/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */
102#ifdef CONFIG_VIDEO_RETAIN 103#ifdef CONFIG_VIDEO_RETAIN
@@ -233,6 +234,10 @@ mopar_gr:
233 movw 18(%di), %ax 234 movw 18(%di), %ax
234 movl %eax, %fs:(PARAM_LFB_SIZE) 235 movl %eax, %fs:(PARAM_LFB_SIZE)
235 236
237# store mode capabilities
238 movl 10(%di), %eax
239 movl %eax, %fs:(PARAM_CAPABILITIES)
240
236# switching the DAC to 8-bit is for <= 8 bpp only 241# switching the DAC to 8-bit is for <= 8 bpp only
237 movw %fs:(PARAM_LFB_DEPTH), %ax 242 movw %fs:(PARAM_LFB_DEPTH), %ax
238 cmpw $8, %ax 243 cmpw $8, %ax
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index 566ecc97ee5a..69db0c0721d1 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.16-git9 3# Linux kernel version: 2.6.17-rc1-git11
4# Sat Mar 25 15:18:40 2006 4# Sun Apr 16 07:22:36 2006
5# 5#
6CONFIG_X86_64=y 6CONFIG_X86_64=y
7CONFIG_64BIT=y 7CONFIG_64BIT=y
@@ -9,6 +9,7 @@ CONFIG_X86=y
9CONFIG_SEMAPHORE_SLEEPERS=y 9CONFIG_SEMAPHORE_SLEEPERS=y
10CONFIG_MMU=y 10CONFIG_MMU=y
11CONFIG_RWSEM_GENERIC_SPINLOCK=y 11CONFIG_RWSEM_GENERIC_SPINLOCK=y
12CONFIG_GENERIC_HWEIGHT=y
12CONFIG_GENERIC_CALIBRATE_DELAY=y 13CONFIG_GENERIC_CALIBRATE_DELAY=y
13CONFIG_X86_CMPXCHG=y 14CONFIG_X86_CMPXCHG=y
14CONFIG_EARLY_PRINTK=y 15CONFIG_EARLY_PRINTK=y
@@ -55,11 +56,8 @@ CONFIG_BASE_FULL=y
55CONFIG_FUTEX=y 56CONFIG_FUTEX=y
56CONFIG_EPOLL=y 57CONFIG_EPOLL=y
57CONFIG_SHMEM=y 58CONFIG_SHMEM=y
58CONFIG_CC_ALIGN_FUNCTIONS=0
59CONFIG_CC_ALIGN_LABELS=0
60CONFIG_CC_ALIGN_LOOPS=0
61CONFIG_CC_ALIGN_JUMPS=0
62CONFIG_SLAB=y 59CONFIG_SLAB=y
60CONFIG_DOUBLEFAULT=y
63# CONFIG_TINY_SHMEM is not set 61# CONFIG_TINY_SHMEM is not set
64CONFIG_BASE_SMALL=0 62CONFIG_BASE_SMALL=0
65# CONFIG_SLOB is not set 63# CONFIG_SLOB is not set
@@ -70,7 +68,6 @@ CONFIG_BASE_SMALL=0
70CONFIG_MODULES=y 68CONFIG_MODULES=y
71CONFIG_MODULE_UNLOAD=y 69CONFIG_MODULE_UNLOAD=y
72CONFIG_MODULE_FORCE_UNLOAD=y 70CONFIG_MODULE_FORCE_UNLOAD=y
73CONFIG_OBSOLETE_MODPARM=y
74# CONFIG_MODVERSIONS is not set 71# CONFIG_MODVERSIONS is not set
75# CONFIG_MODULE_SRCVERSION_ALL is not set 72# CONFIG_MODULE_SRCVERSION_ALL is not set
76# CONFIG_KMOD is not set 73# CONFIG_KMOD is not set
@@ -81,6 +78,7 @@ CONFIG_STOP_MACHINE=y
81# 78#
82CONFIG_LBD=y 79CONFIG_LBD=y
83# CONFIG_BLK_DEV_IO_TRACE is not set 80# CONFIG_BLK_DEV_IO_TRACE is not set
81# CONFIG_LSF is not set
84 82
85# 83#
86# IO Schedulers 84# IO Schedulers
@@ -105,6 +103,7 @@ CONFIG_X86_PC=y
105CONFIG_GENERIC_CPU=y 103CONFIG_GENERIC_CPU=y
106CONFIG_X86_L1_CACHE_BYTES=128 104CONFIG_X86_L1_CACHE_BYTES=128
107CONFIG_X86_L1_CACHE_SHIFT=7 105CONFIG_X86_L1_CACHE_SHIFT=7
106CONFIG_X86_INTERNODE_CACHE_BYTES=128
108CONFIG_X86_TSC=y 107CONFIG_X86_TSC=y
109CONFIG_X86_GOOD_APIC=y 108CONFIG_X86_GOOD_APIC=y
110# CONFIG_MICROCODE is not set 109# CONFIG_MICROCODE is not set
@@ -116,12 +115,14 @@ CONFIG_X86_LOCAL_APIC=y
116CONFIG_MTRR=y 115CONFIG_MTRR=y
117CONFIG_SMP=y 116CONFIG_SMP=y
118CONFIG_SCHED_SMT=y 117CONFIG_SCHED_SMT=y
118CONFIG_SCHED_MC=y
119# CONFIG_PREEMPT_NONE is not set 119# CONFIG_PREEMPT_NONE is not set
120CONFIG_PREEMPT_VOLUNTARY=y 120CONFIG_PREEMPT_VOLUNTARY=y
121# CONFIG_PREEMPT is not set 121# CONFIG_PREEMPT is not set
122CONFIG_PREEMPT_BKL=y 122CONFIG_PREEMPT_BKL=y
123CONFIG_NUMA=y 123CONFIG_NUMA=y
124CONFIG_K8_NUMA=y 124CONFIG_K8_NUMA=y
125CONFIG_NODES_SHIFT=6
125CONFIG_X86_64_ACPI_NUMA=y 126CONFIG_X86_64_ACPI_NUMA=y
126CONFIG_NUMA_EMU=y 127CONFIG_NUMA_EMU=y
127CONFIG_ARCH_DISCONTIGMEM_ENABLE=y 128CONFIG_ARCH_DISCONTIGMEM_ENABLE=y
@@ -138,6 +139,7 @@ CONFIG_NEED_MULTIPLE_NODES=y
138CONFIG_SPLIT_PTLOCK_CPUS=4 139CONFIG_SPLIT_PTLOCK_CPUS=4
139CONFIG_MIGRATION=y 140CONFIG_MIGRATION=y
140CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y 141CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
142CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y
141CONFIG_NR_CPUS=32 143CONFIG_NR_CPUS=32
142CONFIG_HOTPLUG_CPU=y 144CONFIG_HOTPLUG_CPU=y
143CONFIG_HPET_TIMER=y 145CONFIG_HPET_TIMER=y
@@ -289,6 +291,7 @@ CONFIG_IP_PNP_DHCP=y
289# CONFIG_INET_AH is not set 291# CONFIG_INET_AH is not set
290# CONFIG_INET_ESP is not set 292# CONFIG_INET_ESP is not set
291# CONFIG_INET_IPCOMP is not set 293# CONFIG_INET_IPCOMP is not set
294# CONFIG_INET_XFRM_TUNNEL is not set
292# CONFIG_INET_TUNNEL is not set 295# CONFIG_INET_TUNNEL is not set
293CONFIG_INET_DIAG=y 296CONFIG_INET_DIAG=y
294CONFIG_INET_TCP_DIAG=y 297CONFIG_INET_TCP_DIAG=y
@@ -300,6 +303,7 @@ CONFIG_IPV6=y
300# CONFIG_INET6_AH is not set 303# CONFIG_INET6_AH is not set
301# CONFIG_INET6_ESP is not set 304# CONFIG_INET6_ESP is not set
302# CONFIG_INET6_IPCOMP is not set 305# CONFIG_INET6_IPCOMP is not set
306# CONFIG_INET6_XFRM_TUNNEL is not set
303# CONFIG_INET6_TUNNEL is not set 307# CONFIG_INET6_TUNNEL is not set
304# CONFIG_IPV6_TUNNEL is not set 308# CONFIG_IPV6_TUNNEL is not set
305# CONFIG_NETFILTER is not set 309# CONFIG_NETFILTER is not set
@@ -542,7 +546,6 @@ CONFIG_SCSI_SATA_INTEL_COMBINED=y
542# CONFIG_SCSI_INIA100 is not set 546# CONFIG_SCSI_INIA100 is not set
543# CONFIG_SCSI_SYM53C8XX_2 is not set 547# CONFIG_SCSI_SYM53C8XX_2 is not set
544# CONFIG_SCSI_IPR is not set 548# CONFIG_SCSI_IPR is not set
545# CONFIG_SCSI_QLOGIC_FC is not set
546# CONFIG_SCSI_QLOGIC_1280 is not set 549# CONFIG_SCSI_QLOGIC_1280 is not set
547# CONFIG_SCSI_QLA_FC is not set 550# CONFIG_SCSI_QLA_FC is not set
548# CONFIG_SCSI_LPFC is not set 551# CONFIG_SCSI_LPFC is not set
@@ -704,7 +707,6 @@ CONFIG_S2IO=m
704# Wireless LAN (non-hamradio) 707# Wireless LAN (non-hamradio)
705# 708#
706# CONFIG_NET_RADIO is not set 709# CONFIG_NET_RADIO is not set
707# CONFIG_NET_WIRELESS_RTNETLINK is not set
708 710
709# 711#
710# Wan interfaces 712# Wan interfaces
@@ -791,7 +793,7 @@ CONFIG_HW_CONSOLE=y
791# 793#
792CONFIG_SERIAL_8250=y 794CONFIG_SERIAL_8250=y
793CONFIG_SERIAL_8250_CONSOLE=y 795CONFIG_SERIAL_8250_CONSOLE=y
794# CONFIG_SERIAL_8250_ACPI is not set 796CONFIG_SERIAL_8250_PCI=y
795CONFIG_SERIAL_8250_NR_UARTS=4 797CONFIG_SERIAL_8250_NR_UARTS=4
796CONFIG_SERIAL_8250_RUNTIME_UARTS=4 798CONFIG_SERIAL_8250_RUNTIME_UARTS=4
797# CONFIG_SERIAL_8250_EXTENDED is not set 799# CONFIG_SERIAL_8250_EXTENDED is not set
@@ -921,6 +923,7 @@ CONFIG_HWMON=y
921# Digital Video Broadcasting Devices 923# Digital Video Broadcasting Devices
922# 924#
923# CONFIG_DVB is not set 925# CONFIG_DVB is not set
926# CONFIG_USB_DABUSB is not set
924 927
925# 928#
926# Graphics support 929# Graphics support
@@ -932,6 +935,8 @@ CONFIG_VIDEO_SELECT=y
932# Console display driver support 935# Console display driver support
933# 936#
934CONFIG_VGA_CONSOLE=y 937CONFIG_VGA_CONSOLE=y
938CONFIG_VGACON_SOFT_SCROLLBACK=y
939CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=256
935CONFIG_DUMMY_CONSOLE=y 940CONFIG_DUMMY_CONSOLE=y
936 941
937# 942#
@@ -1041,9 +1046,7 @@ CONFIG_USB_HIDINPUT=y
1041# CONFIG_USB_ACECAD is not set 1046# CONFIG_USB_ACECAD is not set
1042# CONFIG_USB_KBTAB is not set 1047# CONFIG_USB_KBTAB is not set
1043# CONFIG_USB_POWERMATE is not set 1048# CONFIG_USB_POWERMATE is not set
1044# CONFIG_USB_MTOUCH is not set 1049# CONFIG_USB_TOUCHSCREEN is not set
1045# CONFIG_USB_ITMTOUCH is not set
1046# CONFIG_USB_EGALAX is not set
1047# CONFIG_USB_YEALINK is not set 1050# CONFIG_USB_YEALINK is not set
1048# CONFIG_USB_XPAD is not set 1051# CONFIG_USB_XPAD is not set
1049# CONFIG_USB_ATI_REMOTE is not set 1052# CONFIG_USB_ATI_REMOTE is not set
@@ -1058,15 +1061,6 @@ CONFIG_USB_HIDINPUT=y
1058# CONFIG_USB_MICROTEK is not set 1061# CONFIG_USB_MICROTEK is not set
1059 1062
1060# 1063#
1061# USB Multimedia devices
1062#
1063# CONFIG_USB_DABUSB is not set
1064
1065#
1066# Video4Linux support is needed for USB Multimedia device support
1067#
1068
1069#
1070# USB Network Adapters 1064# USB Network Adapters
1071# 1065#
1072# CONFIG_USB_CATC is not set 1066# CONFIG_USB_CATC is not set
@@ -1118,9 +1112,23 @@ CONFIG_USB_MON=y
1118# CONFIG_MMC is not set 1112# CONFIG_MMC is not set
1119 1113
1120# 1114#
1115# LED devices
1116#
1117# CONFIG_NEW_LEDS is not set
1118
1119#
1120# LED drivers
1121#
1122
1123#
1124# LED Triggers
1125#
1126
1127#
1121# InfiniBand support 1128# InfiniBand support
1122# 1129#
1123# CONFIG_INFINIBAND is not set 1130# CONFIG_INFINIBAND is not set
1131# CONFIG_IPATH_CORE is not set
1124 1132
1125# 1133#
1126# EDAC - error detection and reporting (RAS) (EXPERIMENTAL) 1134# EDAC - error detection and reporting (RAS) (EXPERIMENTAL)
@@ -1128,6 +1136,11 @@ CONFIG_USB_MON=y
1128# CONFIG_EDAC is not set 1136# CONFIG_EDAC is not set
1129 1137
1130# 1138#
1139# Real Time Clock
1140#
1141# CONFIG_RTC_CLASS is not set
1142
1143#
1131# Firmware Drivers 1144# Firmware Drivers
1132# 1145#
1133# CONFIG_EDD is not set 1146# CONFIG_EDD is not set
diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile
index 929e6b0771f8..e9263b4975e0 100644
--- a/arch/x86_64/ia32/Makefile
+++ b/arch/x86_64/ia32/Makefile
@@ -27,5 +27,5 @@ $(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \
27$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE 27$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
28 $(call if_changed,syscall) 28 $(call if_changed,syscall)
29 29
30AFLAGS_vsyscall-sysenter.o = -m32 30AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
31AFLAGS_vsyscall-syscall.o = -m32 31AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 35b2faccdc6c..57fc37e0fb9c 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -15,6 +15,8 @@
15#include <asm/vsyscall32.h> 15#include <asm/vsyscall32.h>
16#include <linux/linkage.h> 16#include <linux/linkage.h>
17 17
18#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
19
18 .macro IA32_ARG_FIXUP noebp=0 20 .macro IA32_ARG_FIXUP noebp=0
19 movl %edi,%r8d 21 movl %edi,%r8d
20 .if \noebp 22 .if \noebp
@@ -109,8 +111,8 @@ ENTRY(ia32_sysenter_target)
109 CFI_REMEMBER_STATE 111 CFI_REMEMBER_STATE
110 jnz sysenter_tracesys 112 jnz sysenter_tracesys
111sysenter_do_call: 113sysenter_do_call:
112 cmpl $(IA32_NR_syscalls),%eax 114 cmpl $(IA32_NR_syscalls-1),%eax
113 jae ia32_badsys 115 ja ia32_badsys
114 IA32_ARG_FIXUP 1 116 IA32_ARG_FIXUP 1
115 call *ia32_sys_call_table(,%rax,8) 117 call *ia32_sys_call_table(,%rax,8)
116 movq %rax,RAX-ARGOFFSET(%rsp) 118 movq %rax,RAX-ARGOFFSET(%rsp)
@@ -210,8 +212,8 @@ ENTRY(ia32_cstar_target)
210 CFI_REMEMBER_STATE 212 CFI_REMEMBER_STATE
211 jnz cstar_tracesys 213 jnz cstar_tracesys
212cstar_do_call: 214cstar_do_call:
213 cmpl $IA32_NR_syscalls,%eax 215 cmpl $IA32_NR_syscalls-1,%eax
214 jae ia32_badsys 216 ja ia32_badsys
215 IA32_ARG_FIXUP 1 217 IA32_ARG_FIXUP 1
216 call *ia32_sys_call_table(,%rax,8) 218 call *ia32_sys_call_table(,%rax,8)
217 movq %rax,RAX-ARGOFFSET(%rsp) 219 movq %rax,RAX-ARGOFFSET(%rsp)
@@ -296,8 +298,8 @@ ENTRY(ia32_syscall)
296 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) 298 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
297 jnz ia32_tracesys 299 jnz ia32_tracesys
298ia32_do_syscall: 300ia32_do_syscall:
299 cmpl $(IA32_NR_syscalls),%eax 301 cmpl $(IA32_NR_syscalls-1),%eax
300 jae ia32_badsys 302 ja ia32_badsys
301 IA32_ARG_FIXUP 303 IA32_ARG_FIXUP
302 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative 304 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
303ia32_sysret: 305ia32_sysret:
@@ -685,12 +687,12 @@ ia32_sys_call_table:
685 .quad sys_readlinkat /* 305 */ 687 .quad sys_readlinkat /* 305 */
686 .quad sys_fchmodat 688 .quad sys_fchmodat
687 .quad sys_faccessat 689 .quad sys_faccessat
688 .quad sys_ni_syscall /* pselect6 for now */ 690 .quad quiet_ni_syscall /* pselect6 for now */
689 .quad sys_ni_syscall /* ppoll for now */ 691 .quad quiet_ni_syscall /* ppoll for now */
690 .quad sys_unshare /* 310 */ 692 .quad sys_unshare /* 310 */
691 .quad compat_sys_set_robust_list 693 .quad compat_sys_set_robust_list
692 .quad compat_sys_get_robust_list 694 .quad compat_sys_get_robust_list
695 .quad sys_splice
696 .quad sys_sync_file_range
697 .quad sys_tee
693ia32_syscall_end: 698ia32_syscall_end:
694 .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
695 .quad ni_syscall
696 .endr
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index a098a11e7755..059c88313f4e 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \
8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ 8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
9 x8664_ksyms.o i387.o syscall.o vsyscall.o \ 9 x8664_ksyms.o i387.o syscall.o vsyscall.o \
10 setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ 10 setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
11 dmi_scan.o pci-dma.o pci-nommu.o 11 pci-dma.o pci-nommu.o
12 12
13obj-$(CONFIG_X86_MCE) += mce.o 13obj-$(CONFIG_X86_MCE) += mce.o
14obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o 14obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
@@ -49,5 +49,3 @@ intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o
49quirks-y += ../../i386/kernel/quirks.o 49quirks-y += ../../i386/kernel/quirks.o
50i8237-y += ../../i386/kernel/i8237.o 50i8237-y += ../../i386/kernel/i8237.o
51msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o 51msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o
52dmi_scan-y += ../../i386/kernel/dmi_scan.o
53
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index fffd6b0a2fab..70b9d21ed675 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -80,7 +80,7 @@ static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size)
80 printk("Aperture from %s beyond 4GB. Ignoring.\n",name); 80 printk("Aperture from %s beyond 4GB. Ignoring.\n",name);
81 return 0; 81 return 0;
82 } 82 }
83 if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) { 83 if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
84 printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name); 84 printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name);
85 return 0; 85 return 0;
86 } 86 }
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 293cd71a266a..62776c07cff1 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -80,7 +80,12 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
80 return 0; 80 return 0;
81} 81}
82 82
83int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) 83/*
84 * This function checks if any part of the range <start,end> is mapped
85 * with type.
86 */
87int __meminit
88e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
84{ 89{
85 int i; 90 int i;
86 for (i = 0; i < e820.nr_map; i++) { 91 for (i = 0; i < e820.nr_map; i++) {
@@ -94,6 +99,35 @@ int __init e820_mapped(unsigned long start, unsigned long end, unsigned type)
94 return 0; 99 return 0;
95} 100}
96 101
102/*
103 * This function checks if the entire range <start,end> is mapped with type.
104 *
105 * Note: this function only works correct if the e820 table is sorted and
106 * not-overlapping, which is the case
107 */
108int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
109{
110 int i;
111 for (i = 0; i < e820.nr_map; i++) {
112 struct e820entry *ei = &e820.map[i];
113 if (type && ei->type != type)
114 continue;
115 /* is the region (part) in overlap with the current region ?*/
116 if (ei->addr >= end || ei->addr + ei->size <= start)
117 continue;
118
119 /* if the region is at the beginning of <start,end> we move
120 * start to the end of the region since it's ok until there
121 */
122 if (ei->addr <= start)
123 start = ei->addr + ei->size;
124 /* if start is now at or beyond end, we're done, full coverage */
125 if (start >= end)
126 return 1; /* we're done */
127 }
128 return 0;
129}
130
97/* 131/*
98 * Find a free area in a specific range. 132 * Find a free area in a specific range.
99 */ 133 */
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 8538bfea30e6..c946e4fe67a7 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -180,6 +180,10 @@ rff_trace:
180 * 180 *
181 * XXX if we had a free scratch register we could save the RSP into the stack frame 181 * XXX if we had a free scratch register we could save the RSP into the stack frame
182 * and report it properly in ps. Unfortunately we haven't. 182 * and report it properly in ps. Unfortunately we haven't.
183 *
184 * When user can change the frames always force IRET. That is because
185 * it deals with uncanonical addresses better. SYSRET has trouble
186 * with them due to bugs in both AMD and Intel CPUs.
183 */ 187 */
184 188
185ENTRY(system_call) 189ENTRY(system_call)
@@ -254,7 +258,10 @@ sysret_signal:
254 xorl %esi,%esi # oldset -> arg2 258 xorl %esi,%esi # oldset -> arg2
255 call ptregscall_common 259 call ptregscall_common
2561: movl $_TIF_NEED_RESCHED,%edi 2601: movl $_TIF_NEED_RESCHED,%edi
257 jmp sysret_check 261 /* Use IRET because user could have changed frame. This
262 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
263 cli
264 jmp int_with_check
258 265
259badsys: 266badsys:
260 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 267 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
@@ -280,7 +287,8 @@ tracesys:
280 call syscall_trace_leave 287 call syscall_trace_leave
281 RESTORE_TOP_OF_STACK %rbx 288 RESTORE_TOP_OF_STACK %rbx
282 RESTORE_REST 289 RESTORE_REST
283 jmp ret_from_sys_call 290 /* Use IRET because user could have changed frame */
291 jmp int_ret_from_sys_call
284 CFI_ENDPROC 292 CFI_ENDPROC
285 293
286/* 294/*
@@ -408,25 +416,9 @@ ENTRY(stub_execve)
408 CFI_ADJUST_CFA_OFFSET -8 416 CFI_ADJUST_CFA_OFFSET -8
409 CFI_REGISTER rip, r11 417 CFI_REGISTER rip, r11
410 SAVE_REST 418 SAVE_REST
411 movq %r11, %r15
412 CFI_REGISTER rip, r15
413 FIXUP_TOP_OF_STACK %r11 419 FIXUP_TOP_OF_STACK %r11
414 call sys_execve 420 call sys_execve
415 GET_THREAD_INFO(%rcx)
416 bt $TIF_IA32,threadinfo_flags(%rcx)
417 CFI_REMEMBER_STATE
418 jc exec_32bit
419 RESTORE_TOP_OF_STACK %r11 421 RESTORE_TOP_OF_STACK %r11
420 movq %r15, %r11
421 CFI_REGISTER rip, r11
422 RESTORE_REST
423 pushq %r11
424 CFI_ADJUST_CFA_OFFSET 8
425 CFI_REL_OFFSET rip, 0
426 ret
427
428exec_32bit:
429 CFI_RESTORE_STATE
430 movq %rax,RAX(%rsp) 422 movq %rax,RAX(%rsp)
431 RESTORE_REST 423 RESTORE_REST
432 jmp int_ret_from_sys_call 424 jmp int_ret_from_sys_call
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index accbff3fec49..1eaa5dae6174 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -53,7 +53,7 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
53/* 53/*
54 * returns non-zero if opcode modifies the interrupt flag. 54 * returns non-zero if opcode modifies the interrupt flag.
55 */ 55 */
56static inline int is_IF_modifier(kprobe_opcode_t *insn) 56static __always_inline int is_IF_modifier(kprobe_opcode_t *insn)
57{ 57{
58 switch (*insn) { 58 switch (*insn) {
59 case 0xfa: /* cli */ 59 case 0xfa: /* cli */
@@ -84,7 +84,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
84 * If it does, return the address of the 32-bit displacement word. 84 * If it does, return the address of the 32-bit displacement word.
85 * If not, return null. 85 * If not, return null.
86 */ 86 */
87static inline s32 *is_riprel(u8 *insn) 87static s32 __kprobes *is_riprel(u8 *insn)
88{ 88{
89#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ 89#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
90 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ 90 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -229,7 +229,7 @@ void __kprobes arch_remove_kprobe(struct kprobe *p)
229 mutex_unlock(&kprobe_mutex); 229 mutex_unlock(&kprobe_mutex);
230} 230}
231 231
232static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb) 232static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
233{ 233{
234 kcb->prev_kprobe.kp = kprobe_running(); 234 kcb->prev_kprobe.kp = kprobe_running();
235 kcb->prev_kprobe.status = kcb->kprobe_status; 235 kcb->prev_kprobe.status = kcb->kprobe_status;
@@ -237,7 +237,7 @@ static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
237 kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags; 237 kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags;
238} 238}
239 239
240static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb) 240static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
241{ 241{
242 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; 242 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
243 kcb->kprobe_status = kcb->prev_kprobe.status; 243 kcb->kprobe_status = kcb->prev_kprobe.status;
@@ -245,7 +245,7 @@ static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb)
245 kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags; 245 kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags;
246} 246}
247 247
248static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs, 248static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
249 struct kprobe_ctlblk *kcb) 249 struct kprobe_ctlblk *kcb)
250{ 250{
251 __get_cpu_var(current_kprobe) = p; 251 __get_cpu_var(current_kprobe) = p;
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 10b3e348fc99..c69fc43cee7b 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -29,6 +29,8 @@
29#define MISC_MCELOG_MINOR 227 29#define MISC_MCELOG_MINOR 227
30#define NR_BANKS 6 30#define NR_BANKS 6
31 31
32atomic_t mce_entry;
33
32static int mce_dont_init; 34static int mce_dont_init;
33 35
34/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, 36/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
@@ -172,10 +174,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
172 int i; 174 int i;
173 int panicm_found = 0; 175 int panicm_found = 0;
174 176
177 atomic_inc(&mce_entry);
178
175 if (regs) 179 if (regs)
176 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL); 180 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
177 if (!banks) 181 if (!banks)
178 return; 182 goto out2;
179 183
180 memset(&m, 0, sizeof(struct mce)); 184 memset(&m, 0, sizeof(struct mce));
181 m.cpu = safe_smp_processor_id(); 185 m.cpu = safe_smp_processor_id();
@@ -266,6 +270,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
266 out: 270 out:
267 /* Last thing done in the machine check exception to clear state. */ 271 /* Last thing done in the machine check exception to clear state. */
268 wrmsrl(MSR_IA32_MCG_STATUS, 0); 272 wrmsrl(MSR_IA32_MCG_STATUS, 0);
273 out2:
274 atomic_dec(&mce_entry);
269} 275}
270 276
271/* 277/*
@@ -623,7 +629,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
623#endif 629#endif
624 630
625/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 631/* Get notified when a cpu comes on/off. Be hotplug friendly. */
626static __cpuinit int 632static int
627mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 633mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
628{ 634{
629 unsigned int cpu = (unsigned long)hcpu; 635 unsigned int cpu = (unsigned long)hcpu;
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index d3ad7d81266d..d13b241ad094 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -482,7 +482,7 @@ static void threshold_remove_device(unsigned int cpu)
482#endif 482#endif
483 483
484/* get notified when a cpu comes on/off */ 484/* get notified when a cpu comes on/off */
485static __cpuinit int threshold_cpu_callback(struct notifier_block *nfb, 485static int threshold_cpu_callback(struct notifier_block *nfb,
486 unsigned long action, void *hcpu) 486 unsigned long action, void *hcpu)
487{ 487{
488 /* cpu was unsigned int to begin with */ 488 /* cpu was unsigned int to begin with */
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index d9e4067faf05..4e6357fe0ec3 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -34,6 +34,7 @@
34#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/kdebug.h> 35#include <asm/kdebug.h>
36#include <asm/local.h> 36#include <asm/local.h>
37#include <asm/mce.h>
37 38
38/* 39/*
39 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: 40 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
@@ -480,6 +481,12 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
480 __get_cpu_var(nmi_touch) = 0; 481 __get_cpu_var(nmi_touch) = 0;
481 touched = 1; 482 touched = 1;
482 } 483 }
484#ifdef CONFIG_X86_MCE
485 /* Could check oops_in_progress here too, but it's safer
486 not too */
487 if (atomic_read(&mce_entry) > 0)
488 touched = 1;
489#endif
483 if (!touched && __get_cpu_var(last_irq_sum) == sum) { 490 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
484 /* 491 /*
485 * Ayiee, looks like this CPU is stuck ... 492 * Ayiee, looks like this CPU is stuck ...
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index 03c9eeedb0f3..af035ede70cd 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -48,9 +48,11 @@ dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
48{ 48{
49 struct page *page; 49 struct page *page;
50 int node; 50 int node;
51#ifdef CONFIG_PCI
51 if (dev->bus == &pci_bus_type) 52 if (dev->bus == &pci_bus_type)
52 node = pcibus_to_node(to_pci_dev(dev)->bus); 53 node = pcibus_to_node(to_pci_dev(dev)->bus);
53 else 54 else
55#endif
54 node = numa_node_id(); 56 node = numa_node_id();
55 page = alloc_pages_node(node, gfp, order); 57 page = alloc_pages_node(node, gfp, order);
56 return page ? page_address(page) : NULL; 58 return page ? page_address(page) : NULL;
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index a6c01e121266..9d3d76c85ae7 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -112,10 +112,6 @@ static unsigned long alloc_iommu(int size)
112static void free_iommu(unsigned long offset, int size) 112static void free_iommu(unsigned long offset, int size)
113{ 113{
114 unsigned long flags; 114 unsigned long flags;
115 if (size == 1) {
116 clear_bit(offset, iommu_gart_bitmap);
117 return;
118 }
119 spin_lock_irqsave(&iommu_bitmap_lock, flags); 115 spin_lock_irqsave(&iommu_bitmap_lock, flags);
120 __clear_bit_string(iommu_gart_bitmap, offset, size); 116 __clear_bit_string(iommu_gart_bitmap, offset, size);
121 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 117 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 70dd8e5c6889..fb903e65e079 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -575,8 +575,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
575 prev->userrsp = read_pda(oldrsp); 575 prev->userrsp = read_pda(oldrsp);
576 write_pda(oldrsp, next->userrsp); 576 write_pda(oldrsp, next->userrsp);
577 write_pda(pcurrent, next_p); 577 write_pda(pcurrent, next_p);
578
578 /* This must be here to ensure both math_state_restore() and 579 /* This must be here to ensure both math_state_restore() and
579 kernel_fpu_begin() work consistently. */ 580 kernel_fpu_begin() work consistently.
581 And the AMD workaround requires it to be after DS reload. */
580 unlazy_fpu(prev_p); 582 unlazy_fpu(prev_p);
581 write_pda(kernelstack, 583 write_pda(kernelstack,
582 task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); 584 task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
@@ -781,10 +783,16 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
781 } 783 }
782 case ARCH_GET_GS: { 784 case ARCH_GET_GS: {
783 unsigned long base; 785 unsigned long base;
786 unsigned gsindex;
784 if (task->thread.gsindex == GS_TLS_SEL) 787 if (task->thread.gsindex == GS_TLS_SEL)
785 base = read_32bit_tls(task, GS_TLS); 788 base = read_32bit_tls(task, GS_TLS);
786 else if (doit) 789 else if (doit) {
787 rdmsrl(MSR_KERNEL_GS_BASE, base); 790 asm("movl %%gs,%0" : "=r" (gsindex));
791 if (gsindex)
792 rdmsrl(MSR_KERNEL_GS_BASE, base);
793 else
794 base = task->thread.gs;
795 }
788 else 796 else
789 base = task->thread.gs; 797 base = task->thread.gs;
790 ret = put_user(base, (unsigned long __user *)addr); 798 ret = put_user(base, (unsigned long __user *)addr);
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index d44b2c1e63a6..da8e7903d817 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -274,11 +274,6 @@ static int putreg(struct task_struct *child,
274 return -EIO; 274 return -EIO;
275 value &= 0xffff; 275 value &= 0xffff;
276 break; 276 break;
277 case offsetof(struct user_regs_struct, rip):
278 /* Check if the new RIP address is canonical */
279 if (value >= TASK_SIZE_OF(child))
280 return -EIO;
281 break;
282 } 277 }
283 put_stack_long(child, regno - sizeof(struct pt_regs), value); 278 put_stack_long(child, regno - sizeof(struct pt_regs), value);
284 return 0; 279 return 0;
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 0856ad444f90..759070c82751 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -353,8 +353,10 @@ static __init void parse_cmdline_early (char ** cmdline_p)
353 if (fullarg(from, "enable_timer_pin_1")) 353 if (fullarg(from, "enable_timer_pin_1"))
354 disable_timer_pin_1 = -1; 354 disable_timer_pin_1 = -1;
355 355
356 if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) 356 if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
357 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
357 disable_apic = 1; 358 disable_apic = 1;
359 }
358 360
359 if (fullarg(from, "noapic")) 361 if (fullarg(from, "noapic"))
360 skip_ioapic_setup = 1; 362 skip_ioapic_setup = 1;
@@ -928,6 +930,10 @@ static int __init init_amd(struct cpuinfo_x86 *c)
928 if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) 930 if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
929 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); 931 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
930 932
933 /* Enable workaround for FXSAVE leak */
934 if (c->x86 >= 6)
935 set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
936
931 r = get_model_name(c); 937 r = get_model_name(c);
932 if (!r) { 938 if (!r) {
933 switch (c->x86) { 939 switch (c->x86) {
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index ef8bc46dc140..7392570f975d 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -726,7 +726,7 @@ static __init int late_hpet_init(void)
726 unsigned int ntimer; 726 unsigned int ntimer;
727 727
728 if (!vxtime.hpet_address) 728 if (!vxtime.hpet_address)
729 return -1; 729 return 0;
730 730
731 memset(&hd, 0, sizeof (hd)); 731 memset(&hd, 0, sizeof (hd));
732 732
@@ -917,6 +917,8 @@ void __init time_init(void)
917 vxtime.hpet_address = 0; 917 vxtime.hpet_address = 0;
918 918
919 if (hpet_use_timer) { 919 if (hpet_use_timer) {
920 /* set tick_nsec to use the proper rate for HPET */
921 tick_nsec = TICK_NSEC_HPET;
920 cpu_khz = hpet_calibrate_tsc(); 922 cpu_khz = hpet_calibrate_tsc();
921 timename = "HPET"; 923 timename = "HPET";
922#ifdef CONFIG_X86_PM_TIMER 924#ifdef CONFIG_X86_PM_TIMER
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 6bda322d3caf..2700b1375c1f 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -30,6 +30,7 @@
30#include <linux/moduleparam.h> 30#include <linux/moduleparam.h>
31#include <linux/nmi.h> 31#include <linux/nmi.h>
32#include <linux/kprobes.h> 32#include <linux/kprobes.h>
33#include <linux/kexec.h>
33 34
34#include <asm/system.h> 35#include <asm/system.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
@@ -433,6 +434,8 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err)
433 printk(KERN_ALERT "RIP "); 434 printk(KERN_ALERT "RIP ");
434 printk_address(regs->rip); 435 printk_address(regs->rip);
435 printk(" RSP <%016lx>\n", regs->rsp); 436 printk(" RSP <%016lx>\n", regs->rsp);
437 if (kexec_should_crash(current))
438 crash_kexec(regs);
436} 439}
437 440
438void die(const char * str, struct pt_regs * regs, long err) 441void die(const char * str, struct pt_regs * regs, long err)
@@ -455,6 +458,8 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs)
455 */ 458 */
456 printk(str, safe_smp_processor_id()); 459 printk(str, safe_smp_processor_id());
457 show_registers(regs); 460 show_registers(regs);
461 if (kexec_should_crash(current))
462 crash_kexec(regs);
458 if (panic_on_timeout || panic_on_oops) 463 if (panic_on_timeout || panic_on_oops)
459 panic("nmi watchdog"); 464 panic("nmi watchdog");
460 printk("console shuts up ...\n"); 465 printk("console shuts up ...\n");
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 39ff0708f803..b81f473c4a19 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -65,7 +65,7 @@ SECTIONS
65 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { 65 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
66 *(.data.cacheline_aligned) 66 *(.data.cacheline_aligned)
67 } 67 }
68 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 68 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
69 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { 69 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
70 *(.data.read_mostly) 70 *(.data.read_mostly)
71 } 71 }
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index d78f46056bda..1def21c9f7cd 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -112,7 +112,6 @@ EXPORT_SYMBOL_GPL(unset_nmi_callback);
112#undef memcpy 112#undef memcpy
113#undef memset 113#undef memset
114#undef memmove 114#undef memmove
115#undef strlen
116 115
117extern void * memset(void *,int,__kernel_size_t); 116extern void * memset(void *,int,__kernel_size_t);
118extern size_t strlen(const char *); 117extern size_t strlen(const char *);
@@ -121,8 +120,6 @@ extern void * memcpy(void *,const void *,__kernel_size_t);
121extern void * __memcpy(void *,const void *,__kernel_size_t); 120extern void * __memcpy(void *,const void *,__kernel_size_t);
122 121
123EXPORT_SYMBOL(memset); 122EXPORT_SYMBOL(memset);
124EXPORT_SYMBOL(strlen);
125EXPORT_SYMBOL(strpbrk);
126EXPORT_SYMBOL(memmove); 123EXPORT_SYMBOL(memmove);
127EXPORT_SYMBOL(memcpy); 124EXPORT_SYMBOL(memcpy);
128EXPORT_SYMBOL(__memcpy); 125EXPORT_SYMBOL(__memcpy);
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index e5f7f1c34462..4ba34e95d835 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -305,7 +305,7 @@ static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned
305 if (paddr >= end) 305 if (paddr >= end)
306 break; 306 break;
307 307
308 if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) { 308 if (!after_bootmem && !e820_any_mapped(paddr, paddr+PUD_SIZE, 0)) {
309 set_pud(pud, __pud(0)); 309 set_pud(pud, __pud(0));
310 continue; 310 continue;
311 } 311 }
@@ -507,9 +507,8 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size)
507 507
508/* 508/*
509 * Memory hotplug specific functions 509 * Memory hotplug specific functions
510 * These are only for non-NUMA machines right now.
511 */ 510 */
512#ifdef CONFIG_MEMORY_HOTPLUG 511#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
513 512
514void online_page(struct page *page) 513void online_page(struct page *page)
515{ 514{
@@ -520,6 +519,38 @@ void online_page(struct page *page)
520 num_physpages++; 519 num_physpages++;
521} 520}
522 521
522#ifndef CONFIG_MEMORY_HOTPLUG
523/*
524 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
525 * just online the pages.
526 */
527int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
528{
529 int err = -EIO;
530 unsigned long pfn;
531 unsigned long total = 0, mem = 0;
532 for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
533 if (pfn_valid(pfn)) {
534 online_page(pfn_to_page(pfn));
535 err = 0;
536 mem++;
537 }
538 total++;
539 }
540 if (!err) {
541 z->spanned_pages += total;
542 z->present_pages += mem;
543 z->zone_pgdat->node_spanned_pages += total;
544 z->zone_pgdat->node_present_pages += mem;
545 }
546 return err;
547}
548#endif
549
550/*
551 * Memory is added always to NORMAL zone. This means you will never get
552 * additional DMA/DMA32 memory.
553 */
523int add_memory(u64 start, u64 size) 554int add_memory(u64 start, u64 size)
524{ 555{
525 struct pglist_data *pgdat = NODE_DATA(0); 556 struct pglist_data *pgdat = NODE_DATA(0);
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 4be82d6e2b48..b2fac14baac0 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -100,11 +100,30 @@ int early_pfn_to_nid(unsigned long pfn)
100} 100}
101#endif 101#endif
102 102
103static void * __init
104early_node_mem(int nodeid, unsigned long start, unsigned long end,
105 unsigned long size)
106{
107 unsigned long mem = find_e820_area(start, end, size);
108 void *ptr;
109 if (mem != -1L)
110 return __va(mem);
111 ptr = __alloc_bootmem_nopanic(size,
112 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
113 if (ptr == 0) {
114 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
115 size, nodeid);
116 return NULL;
117 }
118 return ptr;
119}
120
103/* Initialize bootmem allocator for a node */ 121/* Initialize bootmem allocator for a node */
104void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 122void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
105{ 123{
106 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 124 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
107 unsigned long nodedata_phys; 125 unsigned long nodedata_phys;
126 void *bootmap;
108 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 127 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
109 128
110 start = round_up(start, ZONE_ALIGN); 129 start = round_up(start, ZONE_ALIGN);
@@ -114,13 +133,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
114 start_pfn = start >> PAGE_SHIFT; 133 start_pfn = start >> PAGE_SHIFT;
115 end_pfn = end >> PAGE_SHIFT; 134 end_pfn = end >> PAGE_SHIFT;
116 135
117 nodedata_phys = find_e820_area(start, end, pgdat_size); 136 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
118 if (nodedata_phys == -1L) 137 if (node_data[nodeid] == NULL)
119 panic("Cannot find memory pgdat in node %d\n", nodeid); 138 return;
120 139 nodedata_phys = __pa(node_data[nodeid]);
121 Dprintk("nodedata_phys %lx\n", nodedata_phys);
122 140
123 node_data[nodeid] = phys_to_virt(nodedata_phys);
124 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 141 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
125 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 142 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
126 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 143 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
@@ -129,9 +146,15 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
129 /* Find a place for the bootmem map */ 146 /* Find a place for the bootmem map */
130 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 147 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
131 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 148 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
132 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT); 149 bootmap = early_node_mem(nodeid, bootmap_start, end,
133 if (bootmap_start == -1L) 150 bootmap_pages<<PAGE_SHIFT);
134 panic("Not enough continuous space for bootmap on node %d", nodeid); 151 if (bootmap == NULL) {
152 if (nodedata_phys < start || nodedata_phys >= end)
153 free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
154 node_data[nodeid] = NULL;
155 return;
156 }
157 bootmap_start = __pa(bootmap);
135 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); 158 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
136 159
137 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 160 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
@@ -142,6 +165,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
142 165
143 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 166 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
144 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); 167 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
168#ifdef CONFIG_ACPI_NUMA
169 srat_reserve_add_area(nodeid);
170#endif
145 node_set_online(nodeid); 171 node_set_online(nodeid);
146} 172}
147 173
@@ -162,11 +188,13 @@ void __init setup_node_zones(int nodeid)
162 memory. */ 188 memory. */
163 memmapsize = sizeof(struct page) * (end_pfn-start_pfn); 189 memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
164 limit = end_pfn << PAGE_SHIFT; 190 limit = end_pfn << PAGE_SHIFT;
191#ifdef CONFIG_FLAT_NODE_MEM_MAP
165 NODE_DATA(nodeid)->node_mem_map = 192 NODE_DATA(nodeid)->node_mem_map =
166 __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, 193 __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
167 memmapsize, SMP_CACHE_BYTES, 194 memmapsize, SMP_CACHE_BYTES,
168 round_down(limit - memmapsize, PAGE_SIZE), 195 round_down(limit - memmapsize, PAGE_SIZE),
169 limit); 196 limit);
197#endif
170 198
171 size_zones(zones, holes, start_pfn, end_pfn); 199 size_zones(zones, holes, start_pfn, end_pfn);
172 free_area_init_node(nodeid, NODE_DATA(nodeid), zones, 200 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
@@ -335,6 +363,8 @@ __init int numa_setup(char *opt)
335#ifdef CONFIG_ACPI_NUMA 363#ifdef CONFIG_ACPI_NUMA
336 if (!strncmp(opt,"noacpi",6)) 364 if (!strncmp(opt,"noacpi",6))
337 acpi_numa = -1; 365 acpi_numa = -1;
366 if (!strncmp(opt,"hotadd=", 7))
367 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
338#endif 368#endif
339 return 1; 369 return 1;
340} 370}
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 2eb879590dc4..15ae9fcd65a7 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -15,15 +15,26 @@
15#include <linux/bitmap.h> 15#include <linux/bitmap.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/topology.h> 17#include <linux/topology.h>
18#include <linux/bootmem.h>
19#include <linux/mm.h>
18#include <asm/proto.h> 20#include <asm/proto.h>
19#include <asm/numa.h> 21#include <asm/numa.h>
20#include <asm/e820.h> 22#include <asm/e820.h>
21 23
24#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
25 defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
26 && !defined(CONFIG_MEMORY_HOTPLUG)
27#define RESERVE_HOTADD 1
28#endif
29
22static struct acpi_table_slit *acpi_slit; 30static struct acpi_table_slit *acpi_slit;
23 31
24static nodemask_t nodes_parsed __initdata; 32static nodemask_t nodes_parsed __initdata;
25static nodemask_t nodes_found __initdata; 33static nodemask_t nodes_found __initdata;
26static struct bootnode nodes[MAX_NUMNODES] __initdata; 34static struct bootnode nodes[MAX_NUMNODES] __initdata;
35static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
36static int found_add_area __initdata;
37int hotadd_percent __initdata = 10;
27static u8 pxm2node[256] = { [0 ... 255] = 0xff }; 38static u8 pxm2node[256] = { [0 ... 255] = 0xff };
28 39
29/* Too small nodes confuse the VM badly. Usually they result 40/* Too small nodes confuse the VM badly. Usually they result
@@ -71,6 +82,10 @@ static __init int conflicting_nodes(unsigned long start, unsigned long end)
71static __init void cutoff_node(int i, unsigned long start, unsigned long end) 82static __init void cutoff_node(int i, unsigned long start, unsigned long end)
72{ 83{
73 struct bootnode *nd = &nodes[i]; 84 struct bootnode *nd = &nodes[i];
85
86 if (found_add_area)
87 return;
88
74 if (nd->start < start) { 89 if (nd->start < start) {
75 nd->start = start; 90 nd->start = start;
76 if (nd->end < nd->start) 91 if (nd->end < nd->start)
@@ -90,6 +105,8 @@ static __init void bad_srat(void)
90 acpi_numa = -1; 105 acpi_numa = -1;
91 for (i = 0; i < MAX_LOCAL_APIC; i++) 106 for (i = 0; i < MAX_LOCAL_APIC; i++)
92 apicid_to_node[i] = NUMA_NO_NODE; 107 apicid_to_node[i] = NUMA_NO_NODE;
108 for (i = 0; i < MAX_NUMNODES; i++)
109 nodes_add[i].start = nodes[i].end = 0;
93} 110}
94 111
95static __init inline int srat_disabled(void) 112static __init inline int srat_disabled(void)
@@ -155,11 +172,114 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
155 pxm, pa->apic_id, node); 172 pxm, pa->apic_id, node);
156} 173}
157 174
175#ifdef RESERVE_HOTADD
176/*
177 * Protect against too large hotadd areas that would fill up memory.
178 */
179static int hotadd_enough_memory(struct bootnode *nd)
180{
181 static unsigned long allocated;
182 static unsigned long last_area_end;
183 unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
184 long mem = pages * sizeof(struct page);
185 unsigned long addr;
186 unsigned long allowed;
187 unsigned long oldpages = pages;
188
189 if (mem < 0)
190 return 0;
191 allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
192 allowed = (allowed / 100) * hotadd_percent;
193 if (allocated + mem > allowed) {
194 /* Give them at least part of their hotadd memory upto hotadd_percent
195 It would be better to spread the limit out
196 over multiple hotplug areas, but that is too complicated
197 right now */
198 if (allocated >= allowed)
199 return 0;
200 pages = (allowed - allocated + mem) / sizeof(struct page);
201 mem = pages * sizeof(struct page);
202 nd->end = nd->start + pages*PAGE_SIZE;
203 }
204 /* Not completely fool proof, but a good sanity check */
205 addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
206 if (addr == -1UL)
207 return 0;
208 if (pages != oldpages)
209 printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
210 pages << PAGE_SHIFT);
211 last_area_end = addr + mem;
212 allocated += mem;
213 return 1;
214}
215
216/*
217 * It is fine to add this area to the nodes data it will be used later
218 * This code supports one contigious hot add area per node.
219 */
220static int reserve_hotadd(int node, unsigned long start, unsigned long end)
221{
222 unsigned long s_pfn = start >> PAGE_SHIFT;
223 unsigned long e_pfn = end >> PAGE_SHIFT;
224 int changed = 0;
225 struct bootnode *nd = &nodes_add[node];
226
227 /* I had some trouble with strange memory hotadd regions breaking
228 the boot. Be very strict here and reject anything unexpected.
229 If you want working memory hotadd write correct SRATs.
230
231 The node size check is a basic sanity check to guard against
232 mistakes */
233 if ((signed long)(end - start) < NODE_MIN_SIZE) {
234 printk(KERN_ERR "SRAT: Hotplug area too small\n");
235 return -1;
236 }
237
238 /* This check might be a bit too strict, but I'm keeping it for now. */
239 if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
240 printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
241 return -1;
242 }
243
244 if (!hotadd_enough_memory(&nodes_add[node])) {
245 printk(KERN_ERR "SRAT: Hotplug area too large\n");
246 return -1;
247 }
248
249 /* Looks good */
250
251 found_add_area = 1;
252 if (nd->start == nd->end) {
253 nd->start = start;
254 nd->end = end;
255 changed = 1;
256 } else {
257 if (nd->start == end) {
258 nd->start = start;
259 changed = 1;
260 }
261 if (nd->end == start) {
262 nd->end = end;
263 changed = 1;
264 }
265 if (!changed)
266 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
267 }
268
269 if ((nd->end >> PAGE_SHIFT) > end_pfn)
270 end_pfn = nd->end >> PAGE_SHIFT;
271
272 if (changed)
273 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
274 return 0;
275}
276#endif
277
158/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 278/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
159void __init 279void __init
160acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) 280acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
161{ 281{
162 struct bootnode *nd; 282 struct bootnode *nd, oldnode;
163 unsigned long start, end; 283 unsigned long start, end;
164 int node, pxm; 284 int node, pxm;
165 int i; 285 int i;
@@ -172,6 +292,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
172 } 292 }
173 if (ma->flags.enabled == 0) 293 if (ma->flags.enabled == 0)
174 return; 294 return;
295 if (ma->flags.hot_pluggable && hotadd_percent == 0)
296 return;
175 start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); 297 start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
176 end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); 298 end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
177 pxm = ma->proximity_domain; 299 pxm = ma->proximity_domain;
@@ -181,10 +303,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
181 bad_srat(); 303 bad_srat();
182 return; 304 return;
183 } 305 }
184 /* It is fine to add this area to the nodes data it will be used later*/
185 if (ma->flags.hot_pluggable == 1)
186 printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n",
187 start, end);
188 i = conflicting_nodes(start, end); 306 i = conflicting_nodes(start, end);
189 if (i == node) { 307 if (i == node) {
190 printk(KERN_WARNING 308 printk(KERN_WARNING
@@ -199,6 +317,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
199 return; 317 return;
200 } 318 }
201 nd = &nodes[node]; 319 nd = &nodes[node];
320 oldnode = *nd;
202 if (!node_test_and_set(node, nodes_parsed)) { 321 if (!node_test_and_set(node, nodes_parsed)) {
203 nd->start = start; 322 nd->start = start;
204 nd->end = end; 323 nd->end = end;
@@ -208,8 +327,19 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
208 if (nd->end < end) 327 if (nd->end < end)
209 nd->end = end; 328 nd->end = end;
210 } 329 }
330
211 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, 331 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
212 nd->start, nd->end); 332 nd->start, nd->end);
333
334#ifdef RESERVE_HOTADD
335 if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
336 /* Ignore hotadd region. Undo damage */
337 printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
338 *nd = oldnode;
339 if ((nd->start | nd->end) == 0)
340 node_clear(node, nodes_parsed);
341 }
342#endif
213} 343}
214 344
215/* Sanity check to catch more bad SRATs (they are amazingly common). 345/* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -225,6 +355,9 @@ static int nodes_cover_memory(void)
225 unsigned long e = nodes[i].end >> PAGE_SHIFT; 355 unsigned long e = nodes[i].end >> PAGE_SHIFT;
226 pxmram += e - s; 356 pxmram += e - s;
227 pxmram -= e820_hole_size(s, e); 357 pxmram -= e820_hole_size(s, e);
358 pxmram -= nodes_add[i].end - nodes_add[i].start;
359 if ((long)pxmram < 0)
360 pxmram = 0;
228 } 361 }
229 362
230 e820ram = end_pfn - e820_hole_size(0, end_pfn); 363 e820ram = end_pfn - e820_hole_size(0, end_pfn);
@@ -258,7 +391,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
258 391
259 /* First clean up the node list */ 392 /* First clean up the node list */
260 for (i = 0; i < MAX_NUMNODES; i++) { 393 for (i = 0; i < MAX_NUMNODES; i++) {
261 cutoff_node(i, start, end); 394 cutoff_node(i, start, end);
262 if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) 395 if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
263 unparse_node(i); 396 unparse_node(i);
264 } 397 }
@@ -282,6 +415,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
282 /* Finally register nodes */ 415 /* Finally register nodes */
283 for_each_node_mask(i, nodes_parsed) 416 for_each_node_mask(i, nodes_parsed)
284 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 417 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
418 /* Try again in case setup_node_bootmem missed one due
419 to missing bootmem */
420 for_each_node_mask(i, nodes_parsed)
421 if (!node_online(i))
422 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
423
285 for (i = 0; i < NR_CPUS; i++) { 424 for (i = 0; i < NR_CPUS; i++) {
286 if (cpu_to_node[i] == NUMA_NO_NODE) 425 if (cpu_to_node[i] == NUMA_NO_NODE)
287 continue; 426 continue;
@@ -303,6 +442,25 @@ static int node_to_pxm(int n)
303 return 0; 442 return 0;
304} 443}
305 444
445void __init srat_reserve_add_area(int nodeid)
446{
447 if (found_add_area && nodes_add[nodeid].end) {
448 u64 total_mb;
449
450 printk(KERN_INFO "SRAT: Reserving hot-add memory space "
451 "for node %d at %Lx-%Lx\n",
452 nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
453 total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
454 >> PAGE_SHIFT;
455 total_mb *= sizeof(struct page);
456 total_mb >>= 20;
457 printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
458 "pre-allocated memory.\n", (unsigned long long)total_mb);
459 reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
460 nodes_add[nodeid].end - nodes_add[nodeid].start);
461 }
462}
463
306int __node_distance(int a, int b) 464int __node_distance(int a, int b)
307{ 465{
308 int index; 466 int index;
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index e616500207e4..a2060e4d5de6 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -9,11 +9,16 @@
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/acpi.h> 10#include <linux/acpi.h>
11#include <linux/bitmap.h> 11#include <linux/bitmap.h>
12#include <asm/e820.h>
13
12#include "pci.h" 14#include "pci.h"
13 15
14#define MMCONFIG_APER_SIZE (256*1024*1024) 16#define MMCONFIG_APER_SIZE (256*1024*1024)
17/* Verify the first 16 busses. We assume that systems with more busses
18 get MCFG right. */
19#define MAX_CHECK_BUS 16
15 20
16static DECLARE_BITMAP(fallback_slots, 32); 21static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS);
17 22
18/* Static virtual mapping of the MMCONFIG aperture */ 23/* Static virtual mapping of the MMCONFIG aperture */
19struct mmcfg_virt { 24struct mmcfg_virt {
@@ -55,7 +60,8 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus)
55static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) 60static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
56{ 61{
57 char __iomem *addr; 62 char __iomem *addr;
58 if (seg == 0 && bus == 0 && test_bit(PCI_SLOT(devfn), fallback_slots)) 63 if (seg == 0 && bus < MAX_CHECK_BUS &&
64 test_bit(32*bus + PCI_SLOT(devfn), fallback_slots))
59 return NULL; 65 return NULL;
60 addr = get_virt(seg, bus); 66 addr = get_virt(seg, bus);
61 if (!addr) 67 if (!addr)
@@ -69,8 +75,10 @@ static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
69 char __iomem *addr; 75 char __iomem *addr;
70 76
71 /* Why do we have this when nobody checks it. How about a BUG()!? -AK */ 77 /* Why do we have this when nobody checks it. How about a BUG()!? -AK */
72 if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095))) 78 if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) {
79 *value = -1;
73 return -EINVAL; 80 return -EINVAL;
81 }
74 82
75 addr = pci_dev_base(seg, bus, devfn); 83 addr = pci_dev_base(seg, bus, devfn);
76 if (!addr) 84 if (!addr)
@@ -129,21 +137,26 @@ static struct pci_raw_ops pci_mmcfg = {
129 Normally this can be expressed in the MCFG by not listing them 137 Normally this can be expressed in the MCFG by not listing them
130 and assigning suitable _SEGs, but this isn't implemented in some BIOS. 138 and assigning suitable _SEGs, but this isn't implemented in some BIOS.
131 Instead try to discover all devices on bus 0 that are unreachable using MM 139 Instead try to discover all devices on bus 0 that are unreachable using MM
132 and fallback for them. 140 and fallback for them. */
133 We only do this for bus 0/seg 0 */
134static __init void unreachable_devices(void) 141static __init void unreachable_devices(void)
135{ 142{
136 int i; 143 int i, k;
137 for (i = 0; i < 32; i++) { 144 /* Use the max bus number from ACPI here? */
138 u32 val1; 145 for (k = 0; k < MAX_CHECK_BUS; k++) {
139 char __iomem *addr; 146 for (i = 0; i < 32; i++) {
140 147 u32 val1;
141 pci_conf1_read(0, 0, PCI_DEVFN(i,0), 0, 4, &val1); 148 char __iomem *addr;
142 if (val1 == 0xffffffff) 149
143 continue; 150 pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1);
144 addr = pci_dev_base(0, 0, PCI_DEVFN(i, 0)); 151 if (val1 == 0xffffffff)
145 if (addr == NULL|| readl(addr) != val1) { 152 continue;
146 set_bit(i, fallback_slots); 153 addr = pci_dev_base(0, k, PCI_DEVFN(i, 0));
154 if (addr == NULL|| readl(addr) != val1) {
155 set_bit(i + 32*k, fallback_slots);
156 printk(KERN_NOTICE
157 "PCI: No mmconfig possible on device %x:%x\n",
158 k, i);
159 }
147 } 160 }
148 } 161 }
149} 162}
@@ -161,6 +174,14 @@ void __init pci_mmcfg_init(void)
161 (pci_mmcfg_config[0].base_address == 0)) 174 (pci_mmcfg_config[0].base_address == 0))
162 return; 175 return;
163 176
177 if (!e820_all_mapped(pci_mmcfg_config[0].base_address,
178 pci_mmcfg_config[0].base_address + MMCONFIG_APER_SIZE,
179 E820_RESERVED)) {
180 printk(KERN_ERR "PCI: BIOS Bug: MCFG area is not E820-reserved\n");
181 printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
182 return;
183 }
184
164 /* RED-PEN i386 doesn't do _nocache right now */ 185 /* RED-PEN i386 doesn't do _nocache right now */
165 pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL); 186 pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL);
166 if (pci_mmcfg_virt == NULL) { 187 if (pci_mmcfg_virt == NULL) {