aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64
diff options
context:
space:
mode:
author Jeff Garzik <jeff@garzik.org> 2006-04-11 12:51:40 -0400
committer Jeff Garzik <jeff@garzik.org> 2006-04-11 12:51:40 -0400
commit 10a5fd5e6b7e2d464c9f95f67cade4ddbd63f4e1 (patch)
tree eddf856286234f28cac747d20eb59d918e1bc8b5 /arch/x86_64
parent c2a6585296009379e0f4eff39cdcb108b457ebf2 (diff)
parent a145410dccdb44f81d3b56763ef9b6f721f4e47c (diff)
Merge branch 'master'
Conflicts: drivers/scsi/libata-scsi.c include/linux/libata.h
Diffstat (limited to 'arch/x86_64')
-rw-r--r-- arch/x86_64/Kconfig 10
-rw-r--r-- arch/x86_64/Makefile 24
-rw-r--r-- arch/x86_64/boot/video.S 5
-rw-r--r-- arch/x86_64/defconfig 42
-rw-r--r-- arch/x86_64/ia32/ia32entry.S 23
-rw-r--r-- arch/x86_64/kernel/aperture.c 2
-rw-r--r-- arch/x86_64/kernel/e820.c 36
-rw-r--r-- arch/x86_64/kernel/entry.S 28
-rw-r--r-- arch/x86_64/kernel/mce.c 8
-rw-r--r-- arch/x86_64/kernel/nmi.c 7
-rw-r--r-- arch/x86_64/kernel/pci-dma.c 2
-rw-r--r-- arch/x86_64/kernel/process.c 10
-rw-r--r-- arch/x86_64/kernel/ptrace.c 5
-rw-r--r-- arch/x86_64/kernel/setup.c 4
-rw-r--r-- arch/x86_64/kernel/time.c 4
-rw-r--r-- arch/x86_64/kernel/vmlinux.lds.S 2
-rw-r--r-- arch/x86_64/kernel/x8664_ksyms.c 3
-rw-r--r-- arch/x86_64/mm/init.c 37
-rw-r--r-- arch/x86_64/mm/numa.c 46
-rw-r--r-- arch/x86_64/mm/srat.c 170
-rw-r--r-- arch/x86_64/pci/mmconfig.c 53
21 files changed, 412 insertions, 109 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 4310b4a311a5..408d44a59756 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -136,6 +136,11 @@ config X86_L1_CACHE_SHIFT
136 default "7" if GENERIC_CPU || MPSC 136 default "7" if GENERIC_CPU || MPSC
137 default "6" if MK8 137 default "6" if MK8
138 138
139config X86_INTERNODE_CACHE_BYTES
140 int
141 default "4096" if X86_VSMP
142 default X86_L1_CACHE_BYTES if !X86_VSMP
143
139config X86_TSC 144config X86_TSC
140 bool 145 bool
141 default y 146 default y
@@ -283,6 +288,11 @@ config K8_NUMA
283 Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA 288 Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
284 instead, which also takes priority if both are compiled in. 289 instead, which also takes priority if both are compiled in.
285 290
291config NODES_SHIFT
292 int
293 default "6"
294 depends on NEED_MULTIPLE_NODES
295
286# Dummy CONFIG option to select ACPI_NUMA from drivers/acpi/Kconfig. 296# Dummy CONFIG option to select ACPI_NUMA from drivers/acpi/Kconfig.
287 297
288config X86_64_ACPI_NUMA 298config X86_64_ACPI_NUMA
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 585fd4a559c8..e573e2ab5510 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -24,37 +24,37 @@
24LDFLAGS := -m elf_x86_64 24LDFLAGS := -m elf_x86_64
25OBJCOPYFLAGS := -O binary -R .note -R .comment -S 25OBJCOPYFLAGS := -O binary -R .note -R .comment -S
26LDFLAGS_vmlinux := 26LDFLAGS_vmlinux :=
27
28CHECKFLAGS += -D__x86_64__ -m64 27CHECKFLAGS += -D__x86_64__ -m64
29 28
29cflags-y :=
30cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) 30cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
31cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) 31cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
32cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) 32cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
33CFLAGS += $(cflags-y)
34 33
35CFLAGS += -m64 34cflags-y += -m64
36CFLAGS += -mno-red-zone 35cflags-y += -mno-red-zone
37CFLAGS += -mcmodel=kernel 36cflags-y += -mcmodel=kernel
38CFLAGS += -pipe 37cflags-y += -pipe
39cflags-$(CONFIG_REORDER) += -ffunction-sections 38cflags-$(CONFIG_REORDER) += -ffunction-sections
40# this makes reading assembly source easier, but produces worse code 39# this makes reading assembly source easier, but produces worse code
41# actually it makes the kernel smaller too. 40# actually it makes the kernel smaller too.
42CFLAGS += -fno-reorder-blocks 41cflags-y += -fno-reorder-blocks
43CFLAGS += -Wno-sign-compare 42cflags-y += -Wno-sign-compare
44ifneq ($(CONFIG_UNWIND_INFO),y) 43ifneq ($(CONFIG_UNWIND_INFO),y)
45CFLAGS += -fno-asynchronous-unwind-tables 44cflags-y += -fno-asynchronous-unwind-tables
46endif 45endif
47ifneq ($(CONFIG_DEBUG_INFO),y) 46ifneq ($(CONFIG_DEBUG_INFO),y)
48# -fweb shrinks the kernel a bit, but the difference is very small 47# -fweb shrinks the kernel a bit, but the difference is very small
49# it also messes up debugging, so don't use it for now. 48# it also messes up debugging, so don't use it for now.
50#CFLAGS += $(call cc-option,-fweb) 49#cflags-y += $(call cc-option,-fweb)
51endif 50endif
52# -funit-at-a-time shrinks the kernel .text considerably 51# -funit-at-a-time shrinks the kernel .text considerably
53# unfortunately it makes reading oopses harder. 52# unfortunately it makes reading oopses harder.
54CFLAGS += $(call cc-option,-funit-at-a-time) 53cflags-y += $(call cc-option,-funit-at-a-time)
55# prevent gcc from generating any FP code by mistake 54# prevent gcc from generating any FP code by mistake
56CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) 55cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
57 56
57CFLAGS += $(cflags-y)
58AFLAGS += -m64 58AFLAGS += -m64
59 59
60head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o 60head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o
diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S
index 0587477c99f2..32327bb37aff 100644
--- a/arch/x86_64/boot/video.S
+++ b/arch/x86_64/boot/video.S
@@ -97,6 +97,7 @@
97#define PARAM_VESAPM_OFF 0x30 97#define PARAM_VESAPM_OFF 0x30
98#define PARAM_LFB_PAGES 0x32 98#define PARAM_LFB_PAGES 0x32
99#define PARAM_VESA_ATTRIB 0x34 99#define PARAM_VESA_ATTRIB 0x34
100#define PARAM_CAPABILITIES 0x36
100 101
101/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */ 102/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */
102#ifdef CONFIG_VIDEO_RETAIN 103#ifdef CONFIG_VIDEO_RETAIN
@@ -233,6 +234,10 @@ mopar_gr:
233 movw 18(%di), %ax 234 movw 18(%di), %ax
234 movl %eax, %fs:(PARAM_LFB_SIZE) 235 movl %eax, %fs:(PARAM_LFB_SIZE)
235 236
237# store mode capabilities
238 movl 10(%di), %eax
239 movl %eax, %fs:(PARAM_CAPABILITIES)
240
236# switching the DAC to 8-bit is for <= 8 bpp only 241# switching the DAC to 8-bit is for <= 8 bpp only
237 movw %fs:(PARAM_LFB_DEPTH), %ax 242 movw %fs:(PARAM_LFB_DEPTH), %ax
238 cmpw $8, %ax 243 cmpw $8, %ax
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index 566ecc97ee5a..3c45ec22b3fe 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.16-git9 3# Linux kernel version: 2.6.17-rc1
4# Sat Mar 25 15:18:40 2006 4# Mon Apr 3 16:11:14 2006
5# 5#
6CONFIG_X86_64=y 6CONFIG_X86_64=y
7CONFIG_64BIT=y 7CONFIG_64BIT=y
@@ -9,6 +9,7 @@ CONFIG_X86=y
9CONFIG_SEMAPHORE_SLEEPERS=y 9CONFIG_SEMAPHORE_SLEEPERS=y
10CONFIG_MMU=y 10CONFIG_MMU=y
11CONFIG_RWSEM_GENERIC_SPINLOCK=y 11CONFIG_RWSEM_GENERIC_SPINLOCK=y
12CONFIG_GENERIC_HWEIGHT=y
12CONFIG_GENERIC_CALIBRATE_DELAY=y 13CONFIG_GENERIC_CALIBRATE_DELAY=y
13CONFIG_X86_CMPXCHG=y 14CONFIG_X86_CMPXCHG=y
14CONFIG_EARLY_PRINTK=y 15CONFIG_EARLY_PRINTK=y
@@ -55,10 +56,6 @@ CONFIG_BASE_FULL=y
55CONFIG_FUTEX=y 56CONFIG_FUTEX=y
56CONFIG_EPOLL=y 57CONFIG_EPOLL=y
57CONFIG_SHMEM=y 58CONFIG_SHMEM=y
58CONFIG_CC_ALIGN_FUNCTIONS=0
59CONFIG_CC_ALIGN_LABELS=0
60CONFIG_CC_ALIGN_LOOPS=0
61CONFIG_CC_ALIGN_JUMPS=0
62CONFIG_SLAB=y 59CONFIG_SLAB=y
63# CONFIG_TINY_SHMEM is not set 60# CONFIG_TINY_SHMEM is not set
64CONFIG_BASE_SMALL=0 61CONFIG_BASE_SMALL=0
@@ -70,7 +67,6 @@ CONFIG_BASE_SMALL=0
70CONFIG_MODULES=y 67CONFIG_MODULES=y
71CONFIG_MODULE_UNLOAD=y 68CONFIG_MODULE_UNLOAD=y
72CONFIG_MODULE_FORCE_UNLOAD=y 69CONFIG_MODULE_FORCE_UNLOAD=y
73CONFIG_OBSOLETE_MODPARM=y
74# CONFIG_MODVERSIONS is not set 70# CONFIG_MODVERSIONS is not set
75# CONFIG_MODULE_SRCVERSION_ALL is not set 71# CONFIG_MODULE_SRCVERSION_ALL is not set
76# CONFIG_KMOD is not set 72# CONFIG_KMOD is not set
@@ -81,6 +77,7 @@ CONFIG_STOP_MACHINE=y
81# 77#
82CONFIG_LBD=y 78CONFIG_LBD=y
83# CONFIG_BLK_DEV_IO_TRACE is not set 79# CONFIG_BLK_DEV_IO_TRACE is not set
80# CONFIG_LSF is not set
84 81
85# 82#
86# IO Schedulers 83# IO Schedulers
@@ -105,6 +102,7 @@ CONFIG_X86_PC=y
105CONFIG_GENERIC_CPU=y 102CONFIG_GENERIC_CPU=y
106CONFIG_X86_L1_CACHE_BYTES=128 103CONFIG_X86_L1_CACHE_BYTES=128
107CONFIG_X86_L1_CACHE_SHIFT=7 104CONFIG_X86_L1_CACHE_SHIFT=7
105CONFIG_X86_INTERNODE_CACHE_BYTES=128
108CONFIG_X86_TSC=y 106CONFIG_X86_TSC=y
109CONFIG_X86_GOOD_APIC=y 107CONFIG_X86_GOOD_APIC=y
110# CONFIG_MICROCODE is not set 108# CONFIG_MICROCODE is not set
@@ -116,6 +114,7 @@ CONFIG_X86_LOCAL_APIC=y
116CONFIG_MTRR=y 114CONFIG_MTRR=y
117CONFIG_SMP=y 115CONFIG_SMP=y
118CONFIG_SCHED_SMT=y 116CONFIG_SCHED_SMT=y
117CONFIG_SCHED_MC=y
119# CONFIG_PREEMPT_NONE is not set 118# CONFIG_PREEMPT_NONE is not set
120CONFIG_PREEMPT_VOLUNTARY=y 119CONFIG_PREEMPT_VOLUNTARY=y
121# CONFIG_PREEMPT is not set 120# CONFIG_PREEMPT is not set
@@ -138,6 +137,7 @@ CONFIG_NEED_MULTIPLE_NODES=y
138CONFIG_SPLIT_PTLOCK_CPUS=4 137CONFIG_SPLIT_PTLOCK_CPUS=4
139CONFIG_MIGRATION=y 138CONFIG_MIGRATION=y
140CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y 139CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
140CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y
141CONFIG_NR_CPUS=32 141CONFIG_NR_CPUS=32
142CONFIG_HOTPLUG_CPU=y 142CONFIG_HOTPLUG_CPU=y
143CONFIG_HPET_TIMER=y 143CONFIG_HPET_TIMER=y
@@ -289,6 +289,7 @@ CONFIG_IP_PNP_DHCP=y
289# CONFIG_INET_AH is not set 289# CONFIG_INET_AH is not set
290# CONFIG_INET_ESP is not set 290# CONFIG_INET_ESP is not set
291# CONFIG_INET_IPCOMP is not set 291# CONFIG_INET_IPCOMP is not set
292# CONFIG_INET_XFRM_TUNNEL is not set
292# CONFIG_INET_TUNNEL is not set 293# CONFIG_INET_TUNNEL is not set
293CONFIG_INET_DIAG=y 294CONFIG_INET_DIAG=y
294CONFIG_INET_TCP_DIAG=y 295CONFIG_INET_TCP_DIAG=y
@@ -300,6 +301,7 @@ CONFIG_IPV6=y
300# CONFIG_INET6_AH is not set 301# CONFIG_INET6_AH is not set
301# CONFIG_INET6_ESP is not set 302# CONFIG_INET6_ESP is not set
302# CONFIG_INET6_IPCOMP is not set 303# CONFIG_INET6_IPCOMP is not set
304# CONFIG_INET6_XFRM_TUNNEL is not set
303# CONFIG_INET6_TUNNEL is not set 305# CONFIG_INET6_TUNNEL is not set
304# CONFIG_IPV6_TUNNEL is not set 306# CONFIG_IPV6_TUNNEL is not set
305# CONFIG_NETFILTER is not set 307# CONFIG_NETFILTER is not set
@@ -704,7 +706,6 @@ CONFIG_S2IO=m
704# Wireless LAN (non-hamradio) 706# Wireless LAN (non-hamradio)
705# 707#
706# CONFIG_NET_RADIO is not set 708# CONFIG_NET_RADIO is not set
707# CONFIG_NET_WIRELESS_RTNETLINK is not set
708 709
709# 710#
710# Wan interfaces 711# Wan interfaces
@@ -791,7 +792,7 @@ CONFIG_HW_CONSOLE=y
791# 792#
792CONFIG_SERIAL_8250=y 793CONFIG_SERIAL_8250=y
793CONFIG_SERIAL_8250_CONSOLE=y 794CONFIG_SERIAL_8250_CONSOLE=y
794# CONFIG_SERIAL_8250_ACPI is not set 795CONFIG_SERIAL_8250_PCI=y
795CONFIG_SERIAL_8250_NR_UARTS=4 796CONFIG_SERIAL_8250_NR_UARTS=4
796CONFIG_SERIAL_8250_RUNTIME_UARTS=4 797CONFIG_SERIAL_8250_RUNTIME_UARTS=4
797# CONFIG_SERIAL_8250_EXTENDED is not set 798# CONFIG_SERIAL_8250_EXTENDED is not set
@@ -921,6 +922,7 @@ CONFIG_HWMON=y
921# Digital Video Broadcasting Devices 922# Digital Video Broadcasting Devices
922# 923#
923# CONFIG_DVB is not set 924# CONFIG_DVB is not set
925# CONFIG_USB_DABUSB is not set
924 926
925# 927#
926# Graphics support 928# Graphics support
@@ -932,6 +934,8 @@ CONFIG_VIDEO_SELECT=y
932# Console display driver support 934# Console display driver support
933# 935#
934CONFIG_VGA_CONSOLE=y 936CONFIG_VGA_CONSOLE=y
937CONFIG_VGACON_SOFT_SCROLLBACK=y
938CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=256
935CONFIG_DUMMY_CONSOLE=y 939CONFIG_DUMMY_CONSOLE=y
936 940
937# 941#
@@ -1058,15 +1062,6 @@ CONFIG_USB_HIDINPUT=y
1058# CONFIG_USB_MICROTEK is not set 1062# CONFIG_USB_MICROTEK is not set
1059 1063
1060# 1064#
1061# USB Multimedia devices
1062#
1063# CONFIG_USB_DABUSB is not set
1064
1065#
1066# Video4Linux support is needed for USB Multimedia device support
1067#
1068
1069#
1070# USB Network Adapters 1065# USB Network Adapters
1071# 1066#
1072# CONFIG_USB_CATC is not set 1067# CONFIG_USB_CATC is not set
@@ -1118,9 +1113,15 @@ CONFIG_USB_MON=y
1118# CONFIG_MMC is not set 1113# CONFIG_MMC is not set
1119 1114
1120# 1115#
1116# LED devices
1117#
1118# CONFIG_NEW_LEDS is not set
1119
1120#
1121# InfiniBand support 1121# InfiniBand support
1122# 1122#
1123# CONFIG_INFINIBAND is not set 1123# CONFIG_INFINIBAND is not set
1124# CONFIG_IPATH_CORE is not set
1124 1125
1125# 1126#
1126# EDAC - error detection and reporting (RAS) (EXPERIMENTAL) 1127# EDAC - error detection and reporting (RAS) (EXPERIMENTAL)
@@ -1128,6 +1129,11 @@ CONFIG_USB_MON=y
1128# CONFIG_EDAC is not set 1129# CONFIG_EDAC is not set
1129 1130
1130# 1131#
1132# Real Time Clock
1133#
1134# CONFIG_RTC_CLASS is not set
1135
1136#
1131# Firmware Drivers 1137# Firmware Drivers
1132# 1138#
1133# CONFIG_EDD is not set 1139# CONFIG_EDD is not set
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 35b2faccdc6c..5a9802676689 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -15,6 +15,8 @@
15#include <asm/vsyscall32.h> 15#include <asm/vsyscall32.h>
16#include <linux/linkage.h> 16#include <linux/linkage.h>
17 17
18#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
19
18 .macro IA32_ARG_FIXUP noebp=0 20 .macro IA32_ARG_FIXUP noebp=0
19 movl %edi,%r8d 21 movl %edi,%r8d
20 .if \noebp 22 .if \noebp
@@ -109,8 +111,8 @@ ENTRY(ia32_sysenter_target)
109 CFI_REMEMBER_STATE 111 CFI_REMEMBER_STATE
110 jnz sysenter_tracesys 112 jnz sysenter_tracesys
111sysenter_do_call: 113sysenter_do_call:
112 cmpl $(IA32_NR_syscalls),%eax 114 cmpl $(IA32_NR_syscalls-1),%eax
113 jae ia32_badsys 115 ja ia32_badsys
114 IA32_ARG_FIXUP 1 116 IA32_ARG_FIXUP 1
115 call *ia32_sys_call_table(,%rax,8) 117 call *ia32_sys_call_table(,%rax,8)
116 movq %rax,RAX-ARGOFFSET(%rsp) 118 movq %rax,RAX-ARGOFFSET(%rsp)
@@ -210,8 +212,8 @@ ENTRY(ia32_cstar_target)
210 CFI_REMEMBER_STATE 212 CFI_REMEMBER_STATE
211 jnz cstar_tracesys 213 jnz cstar_tracesys
212cstar_do_call: 214cstar_do_call:
213 cmpl $IA32_NR_syscalls,%eax 215 cmpl $IA32_NR_syscalls-1,%eax
214 jae ia32_badsys 216 ja ia32_badsys
215 IA32_ARG_FIXUP 1 217 IA32_ARG_FIXUP 1
216 call *ia32_sys_call_table(,%rax,8) 218 call *ia32_sys_call_table(,%rax,8)
217 movq %rax,RAX-ARGOFFSET(%rsp) 219 movq %rax,RAX-ARGOFFSET(%rsp)
@@ -296,8 +298,8 @@ ENTRY(ia32_syscall)
296 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) 298 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
297 jnz ia32_tracesys 299 jnz ia32_tracesys
298ia32_do_syscall: 300ia32_do_syscall:
299 cmpl $(IA32_NR_syscalls),%eax 301 cmpl $(IA32_NR_syscalls-1),%eax
300 jae ia32_badsys 302 ja ia32_badsys
301 IA32_ARG_FIXUP 303 IA32_ARG_FIXUP
302 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative 304 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
303ia32_sysret: 305ia32_sysret:
@@ -685,12 +687,11 @@ ia32_sys_call_table:
685 .quad sys_readlinkat /* 305 */ 687 .quad sys_readlinkat /* 305 */
686 .quad sys_fchmodat 688 .quad sys_fchmodat
687 .quad sys_faccessat 689 .quad sys_faccessat
688 .quad sys_ni_syscall /* pselect6 for now */ 690 .quad quiet_ni_syscall /* pselect6 for now */
689 .quad sys_ni_syscall /* ppoll for now */ 691 .quad quiet_ni_syscall /* ppoll for now */
690 .quad sys_unshare /* 310 */ 692 .quad sys_unshare /* 310 */
691 .quad compat_sys_set_robust_list 693 .quad compat_sys_set_robust_list
692 .quad compat_sys_get_robust_list 694 .quad compat_sys_get_robust_list
695 .quad sys_splice
696 .quad sys_sync_file_range
693ia32_syscall_end: 697ia32_syscall_end:
694 .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
695 .quad ni_syscall
696 .endr
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index fffd6b0a2fab..70b9d21ed675 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -80,7 +80,7 @@ static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size)
80 printk("Aperture from %s beyond 4GB. Ignoring.\n",name); 80 printk("Aperture from %s beyond 4GB. Ignoring.\n",name);
81 return 0; 81 return 0;
82 } 82 }
83 if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) { 83 if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
84 printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name); 84 printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name);
85 return 0; 85 return 0;
86 } 86 }
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 293cd71a266a..62776c07cff1 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -80,7 +80,12 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
80 return 0; 80 return 0;
81} 81}
82 82
83int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) 83/*
84 * This function checks if any part of the range <start,end> is mapped
85 * with type.
86 */
87int __meminit
88e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
84{ 89{
85 int i; 90 int i;
86 for (i = 0; i < e820.nr_map; i++) { 91 for (i = 0; i < e820.nr_map; i++) {
@@ -94,6 +99,35 @@ int __init e820_mapped(unsigned long start, unsigned long end, unsigned type)
94 return 0; 99 return 0;
95} 100}
96 101
102/*
103 * This function checks if the entire range <start,end> is mapped with type.
104 *
105 * Note: this function only works correct if the e820 table is sorted and
106 * not-overlapping, which is the case
107 */
108int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
109{
110 int i;
111 for (i = 0; i < e820.nr_map; i++) {
112 struct e820entry *ei = &e820.map[i];
113 if (type && ei->type != type)
114 continue;
115 /* is the region (part) in overlap with the current region ?*/
116 if (ei->addr >= end || ei->addr + ei->size <= start)
117 continue;
118
119 /* if the region is at the beginning of <start,end> we move
120 * start to the end of the region since it's ok until there
121 */
122 if (ei->addr <= start)
123 start = ei->addr + ei->size;
124 /* if start is now at or beyond end, we're done, full coverage */
125 if (start >= end)
126 return 1; /* we're done */
127 }
128 return 0;
129}
130
97/* 131/*
98 * Find a free area in a specific range. 132 * Find a free area in a specific range.
99 */ 133 */
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 8538bfea30e6..c946e4fe67a7 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -180,6 +180,10 @@ rff_trace:
180 * 180 *
181 * XXX if we had a free scratch register we could save the RSP into the stack frame 181 * XXX if we had a free scratch register we could save the RSP into the stack frame
182 * and report it properly in ps. Unfortunately we haven't. 182 * and report it properly in ps. Unfortunately we haven't.
183 *
184 * When user can change the frames always force IRET. That is because
185 * it deals with uncanonical addresses better. SYSRET has trouble
186 * with them due to bugs in both AMD and Intel CPUs.
183 */ 187 */
184 188
185ENTRY(system_call) 189ENTRY(system_call)
@@ -254,7 +258,10 @@ sysret_signal:
254 xorl %esi,%esi # oldset -> arg2 258 xorl %esi,%esi # oldset -> arg2
255 call ptregscall_common 259 call ptregscall_common
2561: movl $_TIF_NEED_RESCHED,%edi 2601: movl $_TIF_NEED_RESCHED,%edi
257 jmp sysret_check 261 /* Use IRET because user could have changed frame. This
262 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
263 cli
264 jmp int_with_check
258 265
259badsys: 266badsys:
260 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 267 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
@@ -280,7 +287,8 @@ tracesys:
280 call syscall_trace_leave 287 call syscall_trace_leave
281 RESTORE_TOP_OF_STACK %rbx 288 RESTORE_TOP_OF_STACK %rbx
282 RESTORE_REST 289 RESTORE_REST
283 jmp ret_from_sys_call 290 /* Use IRET because user could have changed frame */
291 jmp int_ret_from_sys_call
284 CFI_ENDPROC 292 CFI_ENDPROC
285 293
286/* 294/*
@@ -408,25 +416,9 @@ ENTRY(stub_execve)
408 CFI_ADJUST_CFA_OFFSET -8 416 CFI_ADJUST_CFA_OFFSET -8
409 CFI_REGISTER rip, r11 417 CFI_REGISTER rip, r11
410 SAVE_REST 418 SAVE_REST
411 movq %r11, %r15
412 CFI_REGISTER rip, r15
413 FIXUP_TOP_OF_STACK %r11 419 FIXUP_TOP_OF_STACK %r11
414 call sys_execve 420 call sys_execve
415 GET_THREAD_INFO(%rcx)
416 bt $TIF_IA32,threadinfo_flags(%rcx)
417 CFI_REMEMBER_STATE
418 jc exec_32bit
419 RESTORE_TOP_OF_STACK %r11 421 RESTORE_TOP_OF_STACK %r11
420 movq %r15, %r11
421 CFI_REGISTER rip, r11
422 RESTORE_REST
423 pushq %r11
424 CFI_ADJUST_CFA_OFFSET 8
425 CFI_REL_OFFSET rip, 0
426 ret
427
428exec_32bit:
429 CFI_RESTORE_STATE
430 movq %rax,RAX(%rsp) 422 movq %rax,RAX(%rsp)
431 RESTORE_REST 423 RESTORE_REST
432 jmp int_ret_from_sys_call 424 jmp int_ret_from_sys_call
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 10b3e348fc99..6f0790e8b6d3 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -29,6 +29,8 @@
29#define MISC_MCELOG_MINOR 227 29#define MISC_MCELOG_MINOR 227
30#define NR_BANKS 6 30#define NR_BANKS 6
31 31
32atomic_t mce_entry;
33
32static int mce_dont_init; 34static int mce_dont_init;
33 35
34/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, 36/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
@@ -172,10 +174,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
172 int i; 174 int i;
173 int panicm_found = 0; 175 int panicm_found = 0;
174 176
177 atomic_inc(&mce_entry);
178
175 if (regs) 179 if (regs)
176 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL); 180 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
177 if (!banks) 181 if (!banks)
178 return; 182 goto out2;
179 183
180 memset(&m, 0, sizeof(struct mce)); 184 memset(&m, 0, sizeof(struct mce));
181 m.cpu = safe_smp_processor_id(); 185 m.cpu = safe_smp_processor_id();
@@ -266,6 +270,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
266 out: 270 out:
267 /* Last thing done in the machine check exception to clear state. */ 271 /* Last thing done in the machine check exception to clear state. */
268 wrmsrl(MSR_IA32_MCG_STATUS, 0); 272 wrmsrl(MSR_IA32_MCG_STATUS, 0);
273 out2:
274 atomic_dec(&mce_entry);
269} 275}
270 276
271/* 277/*
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index d9e4067faf05..4e6357fe0ec3 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -34,6 +34,7 @@
34#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/kdebug.h> 35#include <asm/kdebug.h>
36#include <asm/local.h> 36#include <asm/local.h>
37#include <asm/mce.h>
37 38
38/* 39/*
39 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: 40 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
@@ -480,6 +481,12 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
480 __get_cpu_var(nmi_touch) = 0; 481 __get_cpu_var(nmi_touch) = 0;
481 touched = 1; 482 touched = 1;
482 } 483 }
484#ifdef CONFIG_X86_MCE
485 /* Could check oops_in_progress here too, but it's safer
486 not too */
487 if (atomic_read(&mce_entry) > 0)
488 touched = 1;
489#endif
483 if (!touched && __get_cpu_var(last_irq_sum) == sum) { 490 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
484 /* 491 /*
485 * Ayiee, looks like this CPU is stuck ... 492 * Ayiee, looks like this CPU is stuck ...
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index 03c9eeedb0f3..af035ede70cd 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -48,9 +48,11 @@ dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
48{ 48{
49 struct page *page; 49 struct page *page;
50 int node; 50 int node;
51#ifdef CONFIG_PCI
51 if (dev->bus == &pci_bus_type) 52 if (dev->bus == &pci_bus_type)
52 node = pcibus_to_node(to_pci_dev(dev)->bus); 53 node = pcibus_to_node(to_pci_dev(dev)->bus);
53 else 54 else
55#endif
54 node = numa_node_id(); 56 node = numa_node_id();
55 page = alloc_pages_node(node, gfp, order); 57 page = alloc_pages_node(node, gfp, order);
56 return page ? page_address(page) : NULL; 58 return page ? page_address(page) : NULL;
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 70dd8e5c6889..1c44b53cb15b 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -781,10 +781,16 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
781 } 781 }
782 case ARCH_GET_GS: { 782 case ARCH_GET_GS: {
783 unsigned long base; 783 unsigned long base;
784 unsigned gsindex;
784 if (task->thread.gsindex == GS_TLS_SEL) 785 if (task->thread.gsindex == GS_TLS_SEL)
785 base = read_32bit_tls(task, GS_TLS); 786 base = read_32bit_tls(task, GS_TLS);
786 else if (doit) 787 else if (doit) {
787 rdmsrl(MSR_KERNEL_GS_BASE, base); 788 asm("movl %%gs,%0" : "=r" (gsindex));
789 if (gsindex)
790 rdmsrl(MSR_KERNEL_GS_BASE, base);
791 else
792 base = task->thread.gs;
793 }
788 else 794 else
789 base = task->thread.gs; 795 base = task->thread.gs;
790 ret = put_user(base, (unsigned long __user *)addr); 796 ret = put_user(base, (unsigned long __user *)addr);
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index d44b2c1e63a6..da8e7903d817 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -274,11 +274,6 @@ static int putreg(struct task_struct *child,
274 return -EIO; 274 return -EIO;
275 value &= 0xffff; 275 value &= 0xffff;
276 break; 276 break;
277 case offsetof(struct user_regs_struct, rip):
278 /* Check if the new RIP address is canonical */
279 if (value >= TASK_SIZE_OF(child))
280 return -EIO;
281 break;
282 } 277 }
283 put_stack_long(child, regno - sizeof(struct pt_regs), value); 278 put_stack_long(child, regno - sizeof(struct pt_regs), value);
284 return 0; 279 return 0;
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 0856ad444f90..c50b06765a80 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -353,8 +353,10 @@ static __init void parse_cmdline_early (char ** cmdline_p)
353 if (fullarg(from, "enable_timer_pin_1")) 353 if (fullarg(from, "enable_timer_pin_1"))
354 disable_timer_pin_1 = -1; 354 disable_timer_pin_1 = -1;
355 355
356 if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) 356 if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
357 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
357 disable_apic = 1; 358 disable_apic = 1;
359 }
358 360
359 if (fullarg(from, "noapic")) 361 if (fullarg(from, "noapic"))
360 skip_ioapic_setup = 1; 362 skip_ioapic_setup = 1;
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index ef8bc46dc140..7392570f975d 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -726,7 +726,7 @@ static __init int late_hpet_init(void)
726 unsigned int ntimer; 726 unsigned int ntimer;
727 727
728 if (!vxtime.hpet_address) 728 if (!vxtime.hpet_address)
729 return -1; 729 return 0;
730 730
731 memset(&hd, 0, sizeof (hd)); 731 memset(&hd, 0, sizeof (hd));
732 732
@@ -917,6 +917,8 @@ void __init time_init(void)
917 vxtime.hpet_address = 0; 917 vxtime.hpet_address = 0;
918 918
919 if (hpet_use_timer) { 919 if (hpet_use_timer) {
920 /* set tick_nsec to use the proper rate for HPET */
921 tick_nsec = TICK_NSEC_HPET;
920 cpu_khz = hpet_calibrate_tsc(); 922 cpu_khz = hpet_calibrate_tsc();
921 timename = "HPET"; 923 timename = "HPET";
922#ifdef CONFIG_X86_PM_TIMER 924#ifdef CONFIG_X86_PM_TIMER
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 39ff0708f803..b81f473c4a19 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -65,7 +65,7 @@ SECTIONS
65 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { 65 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
66 *(.data.cacheline_aligned) 66 *(.data.cacheline_aligned)
67 } 67 }
68 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 68 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
69 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { 69 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
70 *(.data.read_mostly) 70 *(.data.read_mostly)
71 } 71 }
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index d78f46056bda..1def21c9f7cd 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -112,7 +112,6 @@ EXPORT_SYMBOL_GPL(unset_nmi_callback);
112#undef memcpy 112#undef memcpy
113#undef memset 113#undef memset
114#undef memmove 114#undef memmove
115#undef strlen
116 115
117extern void * memset(void *,int,__kernel_size_t); 116extern void * memset(void *,int,__kernel_size_t);
118extern size_t strlen(const char *); 117extern size_t strlen(const char *);
@@ -121,8 +120,6 @@ extern void * memcpy(void *,const void *,__kernel_size_t);
121extern void * __memcpy(void *,const void *,__kernel_size_t); 120extern void * __memcpy(void *,const void *,__kernel_size_t);
122 121
123EXPORT_SYMBOL(memset); 122EXPORT_SYMBOL(memset);
124EXPORT_SYMBOL(strlen);
125EXPORT_SYMBOL(strpbrk);
126EXPORT_SYMBOL(memmove); 123EXPORT_SYMBOL(memmove);
127EXPORT_SYMBOL(memcpy); 124EXPORT_SYMBOL(memcpy);
128EXPORT_SYMBOL(__memcpy); 125EXPORT_SYMBOL(__memcpy);
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index e5f7f1c34462..4ba34e95d835 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -305,7 +305,7 @@ static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned
305 if (paddr >= end) 305 if (paddr >= end)
306 break; 306 break;
307 307
308 if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) { 308 if (!after_bootmem && !e820_any_mapped(paddr, paddr+PUD_SIZE, 0)) {
309 set_pud(pud, __pud(0)); 309 set_pud(pud, __pud(0));
310 continue; 310 continue;
311 } 311 }
@@ -507,9 +507,8 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size)
507 507
508/* 508/*
509 * Memory hotplug specific functions 509 * Memory hotplug specific functions
510 * These are only for non-NUMA machines right now.
511 */ 510 */
512#ifdef CONFIG_MEMORY_HOTPLUG 511#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
513 512
514void online_page(struct page *page) 513void online_page(struct page *page)
515{ 514{
@@ -520,6 +519,38 @@ void online_page(struct page *page)
520 num_physpages++; 519 num_physpages++;
521} 520}
522 521
522#ifndef CONFIG_MEMORY_HOTPLUG
523/*
524 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
525 * just online the pages.
526 */
527int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
528{
529 int err = -EIO;
530 unsigned long pfn;
531 unsigned long total = 0, mem = 0;
532 for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
533 if (pfn_valid(pfn)) {
534 online_page(pfn_to_page(pfn));
535 err = 0;
536 mem++;
537 }
538 total++;
539 }
540 if (!err) {
541 z->spanned_pages += total;
542 z->present_pages += mem;
543 z->zone_pgdat->node_spanned_pages += total;
544 z->zone_pgdat->node_present_pages += mem;
545 }
546 return err;
547}
548#endif
549
550/*
551 * Memory is added always to NORMAL zone. This means you will never get
552 * additional DMA/DMA32 memory.
553 */
523int add_memory(u64 start, u64 size) 554int add_memory(u64 start, u64 size)
524{ 555{
525 struct pglist_data *pgdat = NODE_DATA(0); 556 struct pglist_data *pgdat = NODE_DATA(0);
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 4be82d6e2b48..cc02573a3271 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -100,11 +100,30 @@ int early_pfn_to_nid(unsigned long pfn)
100} 100}
101#endif 101#endif
102 102
103static void * __init
104early_node_mem(int nodeid, unsigned long start, unsigned long end,
105 unsigned long size)
106{
107 unsigned long mem = find_e820_area(start, end, size);
108 void *ptr;
109 if (mem != -1L)
110 return __va(mem);
111 ptr = __alloc_bootmem_nopanic(size,
112 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
113 if (ptr == 0) {
114 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
115 size, nodeid);
116 return NULL;
117 }
118 return ptr;
119}
120
103/* Initialize bootmem allocator for a node */ 121/* Initialize bootmem allocator for a node */
104void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 122void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
105{ 123{
106 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 124 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
107 unsigned long nodedata_phys; 125 unsigned long nodedata_phys;
126 void *bootmap;
108 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 127 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
109 128
110 start = round_up(start, ZONE_ALIGN); 129 start = round_up(start, ZONE_ALIGN);
@@ -114,13 +133,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
114 start_pfn = start >> PAGE_SHIFT; 133 start_pfn = start >> PAGE_SHIFT;
115 end_pfn = end >> PAGE_SHIFT; 134 end_pfn = end >> PAGE_SHIFT;
116 135
117 nodedata_phys = find_e820_area(start, end, pgdat_size); 136 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
118 if (nodedata_phys == -1L) 137 if (node_data[nodeid] == NULL)
119 panic("Cannot find memory pgdat in node %d\n", nodeid); 138 return;
120 139 nodedata_phys = __pa(node_data[nodeid]);
121 Dprintk("nodedata_phys %lx\n", nodedata_phys);
122 140
123 node_data[nodeid] = phys_to_virt(nodedata_phys);
124 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 141 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
125 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 142 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
126 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 143 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
@@ -129,9 +146,15 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
129 /* Find a place for the bootmem map */ 146 /* Find a place for the bootmem map */
130 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 147 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
131 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 148 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
132 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT); 149 bootmap = early_node_mem(nodeid, bootmap_start, end,
133 if (bootmap_start == -1L) 150 bootmap_pages<<PAGE_SHIFT);
134 panic("Not enough continuous space for bootmap on node %d", nodeid); 151 if (bootmap == NULL) {
152 if (nodedata_phys < start || nodedata_phys >= end)
153 free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
154 node_data[nodeid] = NULL;
155 return;
156 }
157 bootmap_start = __pa(bootmap);
135 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); 158 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
136 159
137 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 160 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
@@ -142,6 +165,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
142 165
143 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 166 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
144 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); 167 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
168#ifdef CONFIG_ACPI_NUMA
169 srat_reserve_add_area(nodeid);
170#endif
145 node_set_online(nodeid); 171 node_set_online(nodeid);
146} 172}
147 173
@@ -335,6 +361,8 @@ __init int numa_setup(char *opt)
335#ifdef CONFIG_ACPI_NUMA 361#ifdef CONFIG_ACPI_NUMA
336 if (!strncmp(opt,"noacpi",6)) 362 if (!strncmp(opt,"noacpi",6))
337 acpi_numa = -1; 363 acpi_numa = -1;
364 if (!strncmp(opt,"hotadd=", 7))
365 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
338#endif 366#endif
339 return 1; 367 return 1;
340} 368}
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 2eb879590dc4..15ae9fcd65a7 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -15,15 +15,26 @@
15#include <linux/bitmap.h> 15#include <linux/bitmap.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/topology.h> 17#include <linux/topology.h>
18#include <linux/bootmem.h>
19#include <linux/mm.h>
18#include <asm/proto.h> 20#include <asm/proto.h>
19#include <asm/numa.h> 21#include <asm/numa.h>
20#include <asm/e820.h> 22#include <asm/e820.h>
21 23
24#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
25 defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
26 && !defined(CONFIG_MEMORY_HOTPLUG)
27#define RESERVE_HOTADD 1
28#endif
29
22static struct acpi_table_slit *acpi_slit; 30static struct acpi_table_slit *acpi_slit;
23 31
24static nodemask_t nodes_parsed __initdata; 32static nodemask_t nodes_parsed __initdata;
25static nodemask_t nodes_found __initdata; 33static nodemask_t nodes_found __initdata;
26static struct bootnode nodes[MAX_NUMNODES] __initdata; 34static struct bootnode nodes[MAX_NUMNODES] __initdata;
35static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
36static int found_add_area __initdata;
37int hotadd_percent __initdata = 10;
27static u8 pxm2node[256] = { [0 ... 255] = 0xff }; 38static u8 pxm2node[256] = { [0 ... 255] = 0xff };
28 39
29/* Too small nodes confuse the VM badly. Usually they result 40/* Too small nodes confuse the VM badly. Usually they result
@@ -71,6 +82,10 @@ static __init int conflicting_nodes(unsigned long start, unsigned long end)
71static __init void cutoff_node(int i, unsigned long start, unsigned long end) 82static __init void cutoff_node(int i, unsigned long start, unsigned long end)
72{ 83{
73 struct bootnode *nd = &nodes[i]; 84 struct bootnode *nd = &nodes[i];
85
86 if (found_add_area)
87 return;
88
74 if (nd->start < start) { 89 if (nd->start < start) {
75 nd->start = start; 90 nd->start = start;
76 if (nd->end < nd->start) 91 if (nd->end < nd->start)
@@ -90,6 +105,8 @@ static __init void bad_srat(void)
90 acpi_numa = -1; 105 acpi_numa = -1;
91 for (i = 0; i < MAX_LOCAL_APIC; i++) 106 for (i = 0; i < MAX_LOCAL_APIC; i++)
92 apicid_to_node[i] = NUMA_NO_NODE; 107 apicid_to_node[i] = NUMA_NO_NODE;
108 for (i = 0; i < MAX_NUMNODES; i++)
109 nodes_add[i].start = nodes[i].end = 0;
93} 110}
94 111
95static __init inline int srat_disabled(void) 112static __init inline int srat_disabled(void)
@@ -155,11 +172,114 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
155 pxm, pa->apic_id, node); 172 pxm, pa->apic_id, node);
156} 173}
157 174
175#ifdef RESERVE_HOTADD
176/*
177 * Protect against too large hotadd areas that would fill up memory.
178 */
179static int hotadd_enough_memory(struct bootnode *nd)
180{
181 static unsigned long allocated;
182 static unsigned long last_area_end;
183 unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
184 long mem = pages * sizeof(struct page);
185 unsigned long addr;
186 unsigned long allowed;
187 unsigned long oldpages = pages;
188
189 if (mem < 0)
190 return 0;
191 allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
192 allowed = (allowed / 100) * hotadd_percent;
193 if (allocated + mem > allowed) {
194 /* Give them at least part of their hotadd memory up to hotadd_percent
195 It would be better to spread the limit out
196 over multiple hotplug areas, but that is too complicated
197 right now */
198 if (allocated >= allowed)
199 return 0;
200 pages = (allowed - allocated + mem) / sizeof(struct page);
201 mem = pages * sizeof(struct page);
202 nd->end = nd->start + pages*PAGE_SIZE;
203 }
204 /* Not completely fool proof, but a good sanity check */
205 addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
206 if (addr == -1UL)
207 return 0;
208 if (pages != oldpages)
209 printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
210 pages << PAGE_SHIFT);
211 last_area_end = addr + mem;
212 allocated += mem;
213 return 1;
214}
215
216/*
217 * It is fine to add this area to the nodes data it will be used later
218 * This code supports one contiguous hot add area per node.
219 */
220static int reserve_hotadd(int node, unsigned long start, unsigned long end)
221{
222 unsigned long s_pfn = start >> PAGE_SHIFT;
223 unsigned long e_pfn = end >> PAGE_SHIFT;
224 int changed = 0;
225 struct bootnode *nd = &nodes_add[node];
226
227 /* I had some trouble with strange memory hotadd regions breaking
228 the boot. Be very strict here and reject anything unexpected.
229 If you want working memory hotadd write correct SRATs.
230
231 The node size check is a basic sanity check to guard against
232 mistakes */
233 if ((signed long)(end - start) < NODE_MIN_SIZE) {
234 printk(KERN_ERR "SRAT: Hotplug area too small\n");
235 return -1;
236 }
237
238 /* This check might be a bit too strict, but I'm keeping it for now. */
239 if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
240 printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
241 return -1;
242 }
243
244 if (!hotadd_enough_memory(&nodes_add[node])) {
245 printk(KERN_ERR "SRAT: Hotplug area too large\n");
246 return -1;
247 }
248
249 /* Looks good */
250
251 found_add_area = 1;
252 if (nd->start == nd->end) {
253 nd->start = start;
254 nd->end = end;
255 changed = 1;
256 } else {
257 if (nd->start == end) {
258 nd->start = start;
259 changed = 1;
260 }
261 if (nd->end == start) {
262 nd->end = end;
263 changed = 1;
264 }
265 if (!changed)
266 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
267 }
268
269 if ((nd->end >> PAGE_SHIFT) > end_pfn)
270 end_pfn = nd->end >> PAGE_SHIFT;
271
272 if (changed)
273 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
274 return 0;
275}
276#endif
277
158/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 278/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
159void __init 279void __init
160acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) 280acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
161{ 281{
162 struct bootnode *nd; 282 struct bootnode *nd, oldnode;
163 unsigned long start, end; 283 unsigned long start, end;
164 int node, pxm; 284 int node, pxm;
165 int i; 285 int i;
@@ -172,6 +292,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
172 } 292 }
173 if (ma->flags.enabled == 0) 293 if (ma->flags.enabled == 0)
174 return; 294 return;
295 if (ma->flags.hot_pluggable && hotadd_percent == 0)
296 return;
175 start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); 297 start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
176 end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); 298 end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
177 pxm = ma->proximity_domain; 299 pxm = ma->proximity_domain;
@@ -181,10 +303,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
181 bad_srat(); 303 bad_srat();
182 return; 304 return;
183 } 305 }
184 /* It is fine to add this area to the nodes data it will be used later*/
185 if (ma->flags.hot_pluggable == 1)
186 printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n",
187 start, end);
188 i = conflicting_nodes(start, end); 306 i = conflicting_nodes(start, end);
189 if (i == node) { 307 if (i == node) {
190 printk(KERN_WARNING 308 printk(KERN_WARNING
@@ -199,6 +317,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
199 return; 317 return;
200 } 318 }
201 nd = &nodes[node]; 319 nd = &nodes[node];
320 oldnode = *nd;
202 if (!node_test_and_set(node, nodes_parsed)) { 321 if (!node_test_and_set(node, nodes_parsed)) {
203 nd->start = start; 322 nd->start = start;
204 nd->end = end; 323 nd->end = end;
@@ -208,8 +327,19 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
208 if (nd->end < end) 327 if (nd->end < end)
209 nd->end = end; 328 nd->end = end;
210 } 329 }
330
211 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, 331 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
212 nd->start, nd->end); 332 nd->start, nd->end);
333
334#ifdef RESERVE_HOTADD
335 if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
336 /* Ignore hotadd region. Undo damage */
337 printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
338 *nd = oldnode;
339 if ((nd->start | nd->end) == 0)
340 node_clear(node, nodes_parsed);
341 }
342#endif
213} 343}
214 344
215/* Sanity check to catch more bad SRATs (they are amazingly common). 345/* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -225,6 +355,9 @@ static int nodes_cover_memory(void)
225 unsigned long e = nodes[i].end >> PAGE_SHIFT; 355 unsigned long e = nodes[i].end >> PAGE_SHIFT;
226 pxmram += e - s; 356 pxmram += e - s;
227 pxmram -= e820_hole_size(s, e); 357 pxmram -= e820_hole_size(s, e);
358 pxmram -= nodes_add[i].end - nodes_add[i].start;
359 if ((long)pxmram < 0)
360 pxmram = 0;
228 } 361 }
229 362
230 e820ram = end_pfn - e820_hole_size(0, end_pfn); 363 e820ram = end_pfn - e820_hole_size(0, end_pfn);
@@ -258,7 +391,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
258 391
259 /* First clean up the node list */ 392 /* First clean up the node list */
260 for (i = 0; i < MAX_NUMNODES; i++) { 393 for (i = 0; i < MAX_NUMNODES; i++) {
261 cutoff_node(i, start, end); 394 cutoff_node(i, start, end);
262 if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) 395 if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
263 unparse_node(i); 396 unparse_node(i);
264 } 397 }
@@ -282,6 +415,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
282 /* Finally register nodes */ 415 /* Finally register nodes */
283 for_each_node_mask(i, nodes_parsed) 416 for_each_node_mask(i, nodes_parsed)
284 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 417 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
418 /* Try again in case setup_node_bootmem missed one due
419 to missing bootmem */
420 for_each_node_mask(i, nodes_parsed)
421 if (!node_online(i))
422 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
423
285 for (i = 0; i < NR_CPUS; i++) { 424 for (i = 0; i < NR_CPUS; i++) {
286 if (cpu_to_node[i] == NUMA_NO_NODE) 425 if (cpu_to_node[i] == NUMA_NO_NODE)
287 continue; 426 continue;
@@ -303,6 +442,25 @@ static int node_to_pxm(int n)
303 return 0; 442 return 0;
304} 443}
305 444
445void __init srat_reserve_add_area(int nodeid)
446{
447 if (found_add_area && nodes_add[nodeid].end) {
448 u64 total_mb;
449
450 printk(KERN_INFO "SRAT: Reserving hot-add memory space "
451 "for node %d at %Lx-%Lx\n",
452 nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
453 total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
454 >> PAGE_SHIFT;
455 total_mb *= sizeof(struct page);
456 total_mb >>= 20;
457 printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
458 "pre-allocated memory.\n", (unsigned long long)total_mb);
459 reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
460 nodes_add[nodeid].end - nodes_add[nodeid].start);
461 }
462}
463
306int __node_distance(int a, int b) 464int __node_distance(int a, int b)
307{ 465{
308 int index; 466 int index;
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index e616500207e4..a2060e4d5de6 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -9,11 +9,16 @@
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/acpi.h> 10#include <linux/acpi.h>
11#include <linux/bitmap.h> 11#include <linux/bitmap.h>
12#include <asm/e820.h>
13
12#include "pci.h" 14#include "pci.h"
13 15
14#define MMCONFIG_APER_SIZE (256*1024*1024) 16#define MMCONFIG_APER_SIZE (256*1024*1024)
17/* Verify the first 16 busses. We assume that systems with more busses
18 get MCFG right. */
19#define MAX_CHECK_BUS 16
15 20
16static DECLARE_BITMAP(fallback_slots, 32); 21static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS);
17 22
18/* Static virtual mapping of the MMCONFIG aperture */ 23/* Static virtual mapping of the MMCONFIG aperture */
19struct mmcfg_virt { 24struct mmcfg_virt {
@@ -55,7 +60,8 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus)
55static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) 60static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
56{ 61{
57 char __iomem *addr; 62 char __iomem *addr;
58 if (seg == 0 && bus == 0 && test_bit(PCI_SLOT(devfn), fallback_slots)) 63 if (seg == 0 && bus < MAX_CHECK_BUS &&
64 test_bit(32*bus + PCI_SLOT(devfn), fallback_slots))
59 return NULL; 65 return NULL;
60 addr = get_virt(seg, bus); 66 addr = get_virt(seg, bus);
61 if (!addr) 67 if (!addr)
@@ -69,8 +75,10 @@ static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
69 char __iomem *addr; 75 char __iomem *addr;
70 76
71 /* Why do we have this when nobody checks it. How about a BUG()!? -AK */ 77 /* Why do we have this when nobody checks it. How about a BUG()!? -AK */
72 if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095))) 78 if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) {
79 *value = -1;
73 return -EINVAL; 80 return -EINVAL;
81 }
74 82
75 addr = pci_dev_base(seg, bus, devfn); 83 addr = pci_dev_base(seg, bus, devfn);
76 if (!addr) 84 if (!addr)
@@ -129,21 +137,26 @@ static struct pci_raw_ops pci_mmcfg = {
129 Normally this can be expressed in the MCFG by not listing them 137 Normally this can be expressed in the MCFG by not listing them
130 and assigning suitable _SEGs, but this isn't implemented in some BIOS. 138 and assigning suitable _SEGs, but this isn't implemented in some BIOS.
131 Instead try to discover all devices on bus 0 that are unreachable using MM 139 Instead try to discover all devices on bus 0 that are unreachable using MM
132 and fallback for them. 140 and fallback for them. */
133 We only do this for bus 0/seg 0 */
134static __init void unreachable_devices(void) 141static __init void unreachable_devices(void)
135{ 142{
136 int i; 143 int i, k;
137 for (i = 0; i < 32; i++) { 144 /* Use the max bus number from ACPI here? */
138 u32 val1; 145 for (k = 0; k < MAX_CHECK_BUS; k++) {
139 char __iomem *addr; 146 for (i = 0; i < 32; i++) {
140 147 u32 val1;
141 pci_conf1_read(0, 0, PCI_DEVFN(i,0), 0, 4, &val1); 148 char __iomem *addr;
142 if (val1 == 0xffffffff) 149
143 continue; 150 pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1);
144 addr = pci_dev_base(0, 0, PCI_DEVFN(i, 0)); 151 if (val1 == 0xffffffff)
145 if (addr == NULL|| readl(addr) != val1) { 152 continue;
146 set_bit(i, fallback_slots); 153 addr = pci_dev_base(0, k, PCI_DEVFN(i, 0));
154 if (addr == NULL|| readl(addr) != val1) {
155 set_bit(i + 32*k, fallback_slots);
156 printk(KERN_NOTICE
157 "PCI: No mmconfig possible on device %x:%x\n",
158 k, i);
159 }
147 } 160 }
148 } 161 }
149} 162}
@@ -161,6 +174,14 @@ void __init pci_mmcfg_init(void)
161 (pci_mmcfg_config[0].base_address == 0)) 174 (pci_mmcfg_config[0].base_address == 0))
162 return; 175 return;
163 176
177 if (!e820_all_mapped(pci_mmcfg_config[0].base_address,
178 pci_mmcfg_config[0].base_address + MMCONFIG_APER_SIZE,
179 E820_RESERVED)) {
180 printk(KERN_ERR "PCI: BIOS Bug: MCFG area is not E820-reserved\n");
181 printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
182 return;
183 }
184
164 /* RED-PEN i386 doesn't do _nocache right now */ 185 /* RED-PEN i386 doesn't do _nocache right now */
165 pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL); 186 pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL);
166 if (pci_mmcfg_virt == NULL) { 187 if (pci_mmcfg_virt == NULL) {