Diffstat (limited to 'arch/x86_64')
 arch/x86_64/Kconfig                  |  66
 arch/x86_64/Makefile                 |   2
 arch/x86_64/boot/compressed/head.S   |  11
 arch/x86_64/boot/compressed/misc.c   |  12
 arch/x86_64/boot/install.sh          |   2
 arch/x86_64/boot/setup.S             |   8
 arch/x86_64/boot/tools/build.c       |   5
 arch/x86_64/ia32/ia32entry.S         |   2
 arch/x86_64/kernel/Makefile          |   1
 arch/x86_64/kernel/acpi/wakeup.S     |   2
 arch/x86_64/kernel/apic.c            |  46
 arch/x86_64/kernel/crash.c           |  35
 arch/x86_64/kernel/e820.c            |   6
 arch/x86_64/kernel/genapic_flat.c    | 110
 arch/x86_64/kernel/head.S            |  18
 arch/x86_64/kernel/i387.c            |   2
 arch/x86_64/kernel/i8259.c           |  12
 arch/x86_64/kernel/io_apic.c         |  36
 arch/x86_64/kernel/irq.c             |  29
 arch/x86_64/kernel/machine_kexec.c   | 250
 arch/x86_64/kernel/mce.c             |  10
 arch/x86_64/kernel/mce_intel.c       |   4
 arch/x86_64/kernel/nmi.c             |   4
 arch/x86_64/kernel/process.c         |  31
 arch/x86_64/kernel/reboot.c          |  62
 arch/x86_64/kernel/relocate_kernel.S | 143
 arch/x86_64/kernel/setup.c           |  47
 arch/x86_64/kernel/setup64.c         |   6
 arch/x86_64/kernel/signal.c          |   2
 arch/x86_64/kernel/smp.c             |  10
 arch/x86_64/kernel/smpboot.c         | 317
 arch/x86_64/kernel/suspend.c         |  18
 arch/x86_64/kernel/traps.c           |   8
 arch/x86_64/kernel/vmlinux.lds.S     | 128
 arch/x86_64/mm/numa.c                |   2
 35 files changed, 1187 insertions(+), 260 deletions(-)
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index db259757dc8a..d09437b5c48f 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -207,33 +207,6 @@ config SMP
 
	  If you don't know what to do here, say N.
 
-config PREEMPT
-	bool "Preemptible Kernel"
-	---help---
-	  This option reduces the latency of the kernel when reacting to
-	  real-time or interactive events by allowing a low priority process to
-	  be preempted even if it is in kernel mode executing a system call.
-	  This allows applications to run more reliably even when the system is
-	  under load. On contrary it may also break your drivers and add
-	  priority inheritance problems to your system. Don't select it if
-	  you rely on a stable system or have slightly obscure hardware.
-	  It's also not very well tested on x86-64 currently.
-	  You have been warned.
-
-	  Say Y here if you are feeling brave and building a kernel for a
-	  desktop, embedded or real-time system. Say N if you are unsure.
-
-config PREEMPT_BKL
-	bool "Preempt The Big Kernel Lock"
-	depends on PREEMPT
-	default y
-	help
-	  This option reduces the latency of the kernel by making the
-	  big kernel lock preemptible.
-
-	  Say Y here if you are building a kernel for a desktop system.
-	  Say N if you are unsure.
-
 config SCHED_SMT
	bool "SMT (Hyperthreading) scheduler support"
	depends on SMP
@@ -244,6 +217,8 @@ config SCHED_SMT
	  cost of slightly increased overhead in some places. If unsure say
	  N here.
 
+source "kernel/Kconfig.preempt"
+
 config K8_NUMA
	bool "K8 NUMA support"
	select NUMA
@@ -313,6 +288,15 @@ config NR_CPUS
	  This is purely to save memory - each supported CPU requires
	  memory in the static kernel configuration.
 
+config HOTPLUG_CPU
+	bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
+	depends on SMP && HOTPLUG && EXPERIMENTAL
+	help
+	  Say Y here to experiment with turning CPUs off and on. CPUs
+	  can be controlled through /sys/devices/system/cpu/cpu#.
+	  Say N if you want to disable CPU hotplug.
+
+
 config HPET_TIMER
	bool
	default y
@@ -385,6 +369,34 @@ config X86_MCE_INTEL
	   Additional support for intel specific MCE features such as
	   the thermal monitor.
 
+config PHYSICAL_START
+	hex "Physical address where the kernel is loaded" if EMBEDDED
+	default "0x100000"
+	help
+	  This gives the physical address where the kernel is loaded.
+	  Primarily used in the case of kexec on panic where the
+	  fail-safe kernel needs to run at a different address than
+	  the panicked kernel.
+
+	  Don't change this unless you know what you are doing.
+
+config KEXEC
+	bool "kexec system call (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  kexec is a system call that implements the ability to shut down
+	  your current kernel, and to start another kernel. It is like a
+	  reboot but it is independent of the system firmware. And like a
+	  reboot you can start any kernel with it, not just Linux.
+
+	  The name comes from the similarity to the exec system call.
+
+	  It is an ongoing process to be certain the hardware in a machine
+	  is properly shut down, so do not be surprised if this code does not
+	  initially work for you. It may help to enable device hotplugging
+	  support. As of this writing the exact hardware interface is
+	  strongly in flux, so no good recommendation can be made.
+
 config SECCOMP
	bool "Enable seccomp to safely compute untrusted bytecode"
	depends on PROC_FS
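
For orientation, the KEXEC option added above is driven from userspace through the kexec_load system call (whose ia32 compat entry is wired up later in this patch). The sketch below is illustrative only: the image buffer, entry point, and load address are placeholder assumptions, and a real caller such as kexec-tools builds the segment list from an actual kernel image.

/* Minimal userspace sketch of the kexec_load syscall enabled by CONFIG_KEXEC.
 * The segment contents and addresses are placeholders; real callers build
 * them from a genuine kernel image.  Needs CAP_SYS_BOOT to succeed.
 */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

struct kexec_segment {
	const void *buf;	/* source buffer in this process */
	size_t bufsz;
	const void *mem;	/* destination physical address */
	size_t memsz;
};

int main(void)
{
	static char image[4096];		/* stand-in for kernel code */
	struct kexec_segment seg = {
		.buf   = image,
		.bufsz = sizeof(image),
		.mem   = (void *)0x100000,	/* the PHYSICAL_START default above */
		.memsz = sizeof(image),
	};

	/* entry = where the new kernel starts; flags 0 = default arch.
	 * A loaded image would later be launched via reboot(2). */
	if (syscall(SYS_kexec_load, 0x100000UL, 1UL, &seg, 0UL) < 0)
		perror("kexec_load");
	return 0;
}

Run without CAP_SYS_BOOT this simply fails with EPERM, which makes it a harmless way to confirm the syscall is wired up.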
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 6f90c246c418..8a73794f9b90 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -35,7 +35,7 @@ export IA32_CC IA32_LD IA32_AS IA32_OBJCOPY IA32_CPP
 
 LDFLAGS		:= -m elf_x86_64
 OBJCOPYFLAGS	:= -O binary -R .note -R .comment -S
-LDFLAGS_vmlinux := -e stext
+LDFLAGS_vmlinux :=
 
 CHECKFLAGS	+= -D__x86_64__ -m64
 
diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S
index 27264dbd575c..6f55565e4d42 100644
--- a/arch/x86_64/boot/compressed/head.S
+++ b/arch/x86_64/boot/compressed/head.S
@@ -2,8 +2,6 @@
  * linux/boot/head.S
  *
  * Copyright (C) 1991, 1992, 1993  Linus Torvalds
- *
- * $Id: head.S,v 1.3 2001/04/20 00:59:28 ak Exp $
  */
 
 /*
@@ -21,13 +19,14 @@
  */
 
 /*
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
 .code32
 .text
 
 #include <linux/linkage.h>
 #include <asm/segment.h>
+#include <asm/page.h>
 
	.code32
	.globl startup_32
@@ -77,7 +76,7 @@ startup_32:
	jnz  3f
	addl $8,%esp
	xorl %ebx,%ebx
-	ljmp $(__KERNEL_CS), $0x100000
+	ljmp $(__KERNEL_CS), $__PHYSICAL_START
 
 /*
  * We come here, if we were loaded high.
@@ -103,7 +102,7 @@ startup_32:
	popl %ecx	# lcount
	popl %edx	# high_buffer_start
	popl %eax	# hcount
-	movl $0x100000,%edi
+	movl $__PHYSICAL_START,%edi
	cli		# make sure we don't get interrupted
	ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine
 
@@ -128,7 +127,7 @@ move_routine_start:
	movsl
	movl %ebx,%esi	# Restore setup pointer
	xorl %ebx,%ebx
-	ljmp $(__KERNEL_CS), $0x100000
+	ljmp $(__KERNEL_CS), $__PHYSICAL_START
 move_routine_end:
 
 
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c
index c8b9216f9e63..b38d5b8b5fb8 100644
--- a/arch/x86_64/boot/compressed/misc.c
+++ b/arch/x86_64/boot/compressed/misc.c
@@ -11,6 +11,7 @@
 
 #include "miscsetup.h"
 #include <asm/io.h>
+#include <asm/page.h>
 
 /*
  * gzip declarations
@@ -92,8 +93,11 @@ static unsigned long output_ptr = 0;
 static void *malloc(int size);
 static void free(void *where);
 
+void* memset(void* s, int c, unsigned n);
+void* memcpy(void* dest, const void* src, unsigned n);
+
 static void putstr(const char *);
 
 extern int end;
 static long free_mem_ptr = (long)&end;
 static long free_mem_end_ptr;
@@ -284,7 +288,7 @@ void setup_normal_output_buffer(void)
 #else
	if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory");
 #endif
-	output_data = (char *)0x100000; /* Points to 1M */
+	output_data = (char *)__PHYSICAL_START; /* Normally Points to 1M */
	free_mem_end_ptr = (long)real_mode;
 }
 
@@ -307,8 +311,8 @@ void setup_output_buffer_if_we_run_high(struct moveparams *mv)
	low_buffer_size = low_buffer_end - LOW_BUFFER_START;
	high_loaded = 1;
	free_mem_end_ptr = (long)high_buffer_start;
-	if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) {
-		high_buffer_start = (uch *)(0x100000 + low_buffer_size);
+	if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) {
+		high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size);
		mv->hcount = 0; /* say: we need not to move high_buffer */
	}
	else mv->hcount = -1;
diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh
index f17b40dfc0f4..198af15a7758 100644
--- a/arch/x86_64/boot/install.sh
+++ b/arch/x86_64/boot/install.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 #
-# arch/i386/boot/install.sh
+# arch/x86_64/boot/install.sh
 #
 # This file is subject to the terms and conditions of the GNU General Public
 # License.  See the file "COPYING" in the main directory of this archive
diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S
index 75d4d2ad93b3..ff58b2832b75 100644
--- a/arch/x86_64/boot/setup.S
+++ b/arch/x86_64/boot/setup.S
@@ -33,7 +33,7 @@
  * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999.
  * <stiker@northlink.com>
  *
- * Fix to work around buggy BIOSes which dont use carry bit correctly
+ * Fix to work around buggy BIOSes which don't use carry bit correctly
  * and/or report extended memory in CX/DX for e801h memory size detection
  * call. As a result the kernel got wrong figures. The int15/e801h docs
  * from Ralf Brown interrupt list seem to indicate AX/BX should be used
@@ -383,7 +383,7 @@ sse_ok:
 # a whole bunch of different types, and allows memory holes and
 # everything. We scan through this memory map and build a list
 # of the first 32 memory areas, which we return at [E820MAP].
-# This is documented at http://www.teleport.com/~acpi/acpihtml/topic245.htm
+# This is documented at http://www.acpi.info/, in the ACPI 2.0 specification.
 
 #define SMAP  0x534d4150
 
@@ -436,7 +436,7 @@ bail820:
 
 meme801:
	stc					# fix to work around buggy
-	xorw	%cx,%cx				# BIOSes which dont clear/set
+	xorw	%cx,%cx				# BIOSes which don't clear/set
	xorw	%dx,%dx				# carry on pass/error of
						# e801h memory size call
						# or merely pass cx,dx though
@@ -733,7 +733,7 @@ flush_instr:
 #
 # but we yet haven't reloaded the CS register, so the default size
 # of the target offset still is 16 bit.
-# However, using an operant prefix (0x66), the CPU will properly
+# However, using an operand prefix (0x66), the CPU will properly
 # take our 48 bit far pointer. (INTeL 80386 Programmer's Reference
 # Manual, Mixing 16-bit and 32-bit code, page 16-6)
 
diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c
index c2fa66313170..18b5bac1c428 100644
--- a/arch/x86_64/boot/tools/build.c
+++ b/arch/x86_64/boot/tools/build.c
@@ -1,6 +1,4 @@
 /*
- * $Id: build.c,v 1.3 2001/06/26 15:14:50 pavel Exp $
- *
  * Copyright (C) 1991, 1992  Linus Torvalds
  * Copyright (C) 1997 Martin Mares
  */
@@ -8,7 +6,8 @@
 
 /*
  * This file builds a disk-image from three different files:
  *
- * - bootsect: exactly 512 bytes of 8086 machine code, loads the rest
+ * - bootsect: compatibility mbr which prints an error message if
+ *	someone tries to boot the kernel directly.
  * - setup: 8086 machine code, sets up system parm
  * - system: 80386 code for actual system
  *
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index f3ca0db85b5b..cc935427d532 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -589,7 +589,7 @@ ia32_sys_call_table:
	.quad compat_sys_mq_timedreceive	/* 280 */
	.quad compat_sys_mq_notify
	.quad compat_sys_mq_getsetattr
-	.quad quiet_ni_syscall		/* reserved for kexec */
+	.quad compat_sys_kexec_load	/* reserved for kexec */
	.quad compat_sys_waitid
	.quad quiet_ni_syscall		/* sys_altroot */
	.quad sys_add_key
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 5ca4a4598fda..48f9e2c19cd6 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o mpparse.o \
		genapic.o genapic_cluster.o genapic_flat.o
+obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o crash.o
 obj-$(CONFIG_PM)		+= suspend.o
 obj-$(CONFIG_SOFTWARE_SUSPEND)	+= suspend_asm.o
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
index a4c630034cd4..185faa911db5 100644
--- a/arch/x86_64/kernel/acpi/wakeup.S
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -67,7 +67,7 @@ wakeup_code:
	shll	$4, %eax
	addl	$(gdta - wakeup_code), %eax
	movl	%eax, gdt_48a +2 - wakeup_code
-	lgdt	%ds:gdt_48a - wakeup_code	# load gdt with whatever is
+	lgdtl	%ds:gdt_48a - wakeup_code	# load gdt with whatever is
						# appropriate
 
	movl	$1, %eax			# protected mode (PE) bit
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index f8e6cc4fecd4..375d369570ca 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -133,7 +133,7 @@ void __init connect_bsp_APIC(void)
	}
 }
 
-void disconnect_bsp_APIC(void)
+void disconnect_bsp_APIC(int virt_wire_setup)
 {
	if (pic_mode) {
		/*
@@ -146,6 +146,42 @@ void disconnect_bsp_APIC(void)
		outb(0x70, 0x22);
		outb(0x00, 0x23);
	}
+	else {
+		/* Go back to Virtual Wire compatibility mode */
+		unsigned long value;
+
+		/* For the spurious interrupt use vector F, and enable it */
+		value = apic_read(APIC_SPIV);
+		value &= ~APIC_VECTOR_MASK;
+		value |= APIC_SPIV_APIC_ENABLED;
+		value |= 0xf;
+		apic_write_around(APIC_SPIV, value);
+
+		if (!virt_wire_setup) {
+			/* For LVT0 make it edge triggered, active high, external and enabled */
+			value = apic_read(APIC_LVT0);
+			value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+				APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+				APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
+			value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+			value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+			apic_write_around(APIC_LVT0, value);
+		}
+		else {
+			/* Disable LVT0 */
+			apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
+		}
+
+		/* For LVT1 make it edge triggered, active high, nmi and enabled */
+		value = apic_read(APIC_LVT1);
+		value &= ~(
+			APIC_MODE_MASK | APIC_SEND_PENDING |
+			APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+			APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+		value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+		value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+		apic_write_around(APIC_LVT1, value);
+	}
 }
 
 void disable_local_APIC(void)
@@ -285,7 +321,7 @@ void __init init_bsp_APIC(void)
	apic_write_around(APIC_LVT1, value);
 }
 
-void __init setup_local_APIC (void)
+void __cpuinit setup_local_APIC (void)
 {
	unsigned int value, ver, maxlvt;
 
@@ -534,7 +570,7 @@ static struct sys_device device_lapic = {
	.cls	= &lapic_sysclass,
 };
 
-static void __init apic_pm_activate(void)
+static void __cpuinit apic_pm_activate(void)
 {
	apic_pm_state.active = 1;
 }
@@ -774,14 +810,14 @@ void __init setup_boot_APIC_clock (void)
	local_irq_enable();
 }
 
-void __init setup_secondary_APIC_clock(void)
+void __cpuinit setup_secondary_APIC_clock(void)
 {
	local_irq_disable(); /* FIXME: Do we need this? --RR */
	setup_APIC_timer(calibration_result);
	local_irq_enable();
 }
 
-void __init disable_APIC_timer(void)
+void __cpuinit disable_APIC_timer(void)
 {
	if (using_apic_timer) {
		unsigned long v;
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
new file mode 100644
index 000000000000..d7fa4248501c
--- /dev/null
+++ b/arch/x86_64/kernel/crash.c
@@ -0,0 +1,35 @@
+/*
+ * Architecture specific (x86_64) functions for kexec based crash dumps.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *
+ * Copyright (C) IBM Corporation, 2004. All rights reserved.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+
+#include <asm/processor.h>
+#include <asm/hardirq.h>
+#include <asm/nmi.h>
+#include <asm/hw_irq.h>
+
+note_buf_t crash_notes[NR_CPUS];
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	/* This function is only called after the system
+	 * has panicked or is otherwise in a critical state.
+	 * The minimum amount of code to allow a kexec'd kernel
+	 * to run successfully needs to happen here.
+	 *
+	 * In practice this means shooting down the other cpus in
+	 * an SMP system.
+	 */
+}
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 7c154dfff64a..6ded3a50dfe6 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -16,6 +16,7 @@
 #include <linux/bootmem.h>
 #include <linux/ioport.h>
 #include <linux/string.h>
+#include <linux/kexec.h>
 #include <asm/page.h>
 #include <asm/e820.h>
 #include <asm/proto.h>
@@ -191,8 +192,6 @@ void __init e820_reserve_resources(void)
	int i;
	for (i = 0; i < e820.nr_map; i++) {
		struct resource *res;
-		if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
-			continue;
		res = alloc_bootmem_low(sizeof(struct resource));
		switch (e820.map[i].type) {
		case E820_RAM:	res->name = "System RAM"; break;
@@ -212,6 +211,9 @@ void __init e820_reserve_resources(void)
			 */
			request_resource(res, &code_resource);
			request_resource(res, &data_resource);
+#ifdef CONFIG_KEXEC
+			request_resource(res, &crashk_res);
+#endif
		}
	}
 }
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index b4cbbad04226..282846965080 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -7,6 +7,8 @@
  * Hacked for x86-64 by James Cleverdon from i386 architecture code by
  * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
  * James Cleverdon.
+ * Ashok Raj <ashok.raj@intel.com>
+ *	Removed IPI broadcast shortcut to support CPU hotplug
  */
 #include <linux/config.h>
 #include <linux/threads.h>
@@ -18,6 +20,46 @@
 #include <asm/smp.h>
 #include <asm/ipi.h>
 
+/*
+ * The following permit choosing broadcast IPI shortcut vs. sending IPI only
+ * to online cpus via the send_IPI_mask variant.
+ * The mask version is my preferred option, since it eliminates a lot of
+ * other extra code that would need to be written to cleanup intrs sent
+ * to a CPU while offline.
+ *
+ * Sending broadcast introduces lots of trouble in CPU hotplug situations.
+ * These IPIs are delivered to cpus irrespective of their offline status
+ * and could pickup stale intr data when these CPUs are turned online.
+ *
+ * Not using broadcast is a cleaner approach IMO, but Andi Kleen disagrees with
+ * the idea of not using broadcast IPIs anymore. Hence the run time check
+ * is introduced, on his request so we can choose an alternate mechanism.
+ *
+ * Initial wacky performance tests that collect cycle counts show
+ * no increase in using mask vs. broadcast version. In fact they seem
+ * identical in terms of cycle counts.
+ *
+ * If we need to use broadcast, we need to do the following.
+ *
+ * cli;
+ * hold call_lock;
+ * clear any pending IPI, just ack and clear all pending intr
+ * set cpu_online_map;
+ * release call_lock;
+ * sti;
+ *
+ * The complicated dummy irq processing shown above is not required if
+ * we didn't send IPIs to wrong CPUs in the first place.
+ *
+ * - Ashok Raj <ashok.raj@intel.com>
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+#define DEFAULT_SEND_IPI	(1)
+#else
+#define DEFAULT_SEND_IPI	(0)
+#endif
+
+static int no_broadcast = DEFAULT_SEND_IPI;
 
 static cpumask_t flat_target_cpus(void)
 {
@@ -45,22 +87,6 @@ static void flat_init_apic_ldr(void)
	apic_write_around(APIC_LDR, val);
 }
 
-static void flat_send_IPI_allbutself(int vector)
-{
-	/*
-	 * if there are no other CPUs in the system then
-	 * we get an APIC send error if we try to broadcast.
-	 * thus we have to avoid sending IPIs in this case.
-	 */
-	if (num_online_cpus() > 1)
-		__send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
-}
-
-static void flat_send_IPI_all(int vector)
-{
-	__send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
-}
-
 static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
 {
	unsigned long mask = cpus_addr(cpumask)[0];
@@ -93,6 +119,39 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
	local_irq_restore(flags);
 }
 
+static inline void __local_flat_send_IPI_allbutself(int vector)
+{
+	if (no_broadcast) {
+		cpumask_t mask = cpu_online_map;
+		int this_cpu = get_cpu();
+
+		cpu_clear(this_cpu, mask);
+		flat_send_IPI_mask(mask, vector);
+		put_cpu();
+	}
+	else
+		__send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
+}
+
+static inline void __local_flat_send_IPI_all(int vector)
+{
+	if (no_broadcast)
+		flat_send_IPI_mask(cpu_online_map, vector);
+	else
+		__send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
+}
+
+static void flat_send_IPI_allbutself(int vector)
+{
+	if (((num_online_cpus()) - 1) >= 1)
+		__local_flat_send_IPI_allbutself(vector);
+}
+
+static void flat_send_IPI_all(int vector)
+{
+	__local_flat_send_IPI_all(vector);
+}
+
 static int flat_apic_id_registered(void)
 {
	return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
@@ -111,6 +170,16 @@ static unsigned int phys_pkg_id(int index_msb)
	return ((ebx >> 24) & 0xFF) >> index_msb;
 }
 
+static __init int no_ipi_broadcast(char *str)
+{
+	get_option(&str, &no_broadcast);
+	printk("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
+		"IPI Broadcast");
+	return 1;
+}
+
+__setup("no_ipi_broadcast", no_ipi_broadcast);
+
 struct genapic apic_flat = {
	.name = "flat",
	.int_delivery_mode = dest_LowestPrio,
@@ -125,3 +194,12 @@ struct genapic apic_flat = {
	.cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
	.phys_pkg_id = phys_pkg_id,
 };
+
+static int __init print_ipi_mode(void)
+{
+	printk("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
+		"Shortcut");
+	return 0;
+}
+
+late_initcall(print_ipi_mode);
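
The mask-based path above amounts to "every online CPU except the sender". A userspace sketch of that bitmap arithmetic, with a plain unsigned long standing in for cpumask_t and invented CPU numbers:

/* Userspace sketch of the "online minus self" mask that
 * __local_flat_send_IPI_allbutself() computes above; an unsigned long
 * stands in for cpumask_t and the CPU numbers are made up.
 */
#include <assert.h>

int main(void)
{
	unsigned long cpu_online_map = 0x0f;	/* CPUs 0-3 online (example) */
	int this_cpu = 2;			/* the sending CPU */

	unsigned long mask = cpu_online_map & ~(1UL << this_cpu);

	assert(mask == 0x0b);	/* CPUs 0, 1 and 3: everyone but us */
	return 0;
}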
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 9bd2e7a4b81e..8d765aa77a26 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -248,23 +248,23 @@ ENTRY(_stext)
  */
 .org 0x1000
 ENTRY(init_level4_pgt)
-	.quad	0x0000000000102007		/* -> level3_ident_pgt */
+	.quad	0x0000000000002007 + __PHYSICAL_START	/* -> level3_ident_pgt */
	.fill	255,8,0
-	.quad	0x000000000010a007
+	.quad	0x000000000000a007 + __PHYSICAL_START
	.fill	254,8,0
	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
-	.quad	0x0000000000103007		/* -> level3_kernel_pgt */
+	.quad	0x0000000000003007 + __PHYSICAL_START	/* -> level3_kernel_pgt */
 
 .org 0x2000
 ENTRY(level3_ident_pgt)
-	.quad	0x0000000000104007
+	.quad	0x0000000000004007 + __PHYSICAL_START
	.fill	511,8,0
 
 .org 0x3000
 ENTRY(level3_kernel_pgt)
	.fill	510,8,0
	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
-	.quad	0x0000000000105007		/* -> level2_kernel_pgt */
+	.quad	0x0000000000005007 + __PHYSICAL_START	/* -> level2_kernel_pgt */
	.fill	1,8,0
 
 .org 0x4000
@@ -337,17 +337,17 @@ ENTRY(empty_bad_pmd_table)
 
 .org 0xa000
 ENTRY(level3_physmem_pgt)
-	.quad	0x0000000000105007		/* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
+	.quad	0x0000000000005007 + __PHYSICAL_START	/* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
 
	.org 0xb000
 #ifdef CONFIG_ACPI_SLEEP
 ENTRY(wakeup_level4_pgt)
-	.quad	0x0000000000102007		/* -> level3_ident_pgt */
+	.quad	0x0000000000002007 + __PHYSICAL_START	/* -> level3_ident_pgt */
	.fill	255,8,0
-	.quad	0x000000000010a007
+	.quad	0x000000000000a007 + __PHYSICAL_START
	.fill	254,8,0
	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
-	.quad	0x0000000000103007		/* -> level3_kernel_pgt */
+	.quad	0x0000000000003007 + __PHYSICAL_START	/* -> level3_kernel_pgt */
 #endif
 
	.data
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c
index ba139cac57ce..d9b22b633e39 100644
--- a/arch/x86_64/kernel/i387.c
+++ b/arch/x86_64/kernel/i387.c
@@ -42,7 +42,7 @@ void mxcsr_feature_mask_init(void)
  * Called at bootup to set up the initial FPU state that is later cloned
  * into all processes.
  */
-void __init fpu_init(void)
+void __cpuinit fpu_init(void)
 {
	unsigned long oldcr0 = read_cr0();
	extern void __bad_fxsave_alignment(void);
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 19eafa0aa95c..a89169095129 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -414,10 +414,22 @@ static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
	return 0;
 }
 
+static int i8259A_shutdown(struct sys_device *dev)
+{
+	/* Put the i8259A into a quiescent state that
+	 * the kernel initialization code can get it
+	 * out of.
+	 */
+	outb(0xff, 0x21);	/* mask all of 8259A-1 */
+	outb(0xff, 0xA1);	/* mask all of 8259A-2 */
+	return 0;
+}
+
 static struct sysdev_class i8259_sysdev_class = {
	set_kset_name("i8259"),
	.suspend = i8259A_suspend,
	.resume = i8259A_resume,
+	.shutdown = i8259A_shutdown,
 };
 
 static struct sys_device device_i8259A = {
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index afd87e64d0a8..157190d986bb 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -330,7 +330,7 @@ static int find_irq_entry(int apic, int pin, int type)
 /*
  * Find the pin to which IRQ[irq] (ISA) is connected
  */
-static int __init find_isa_irq_pin(int irq, int type)
+static int find_isa_irq_pin(int irq, int type)
 {
	int i;
 
@@ -1132,12 +1132,44 @@
  */
 void disable_IO_APIC(void)
 {
+	int pin;
	/*
	 * Clear the IO-APIC before rebooting:
	 */
	clear_IO_APIC();
 
-	disconnect_bsp_APIC();
+	/*
+	 * If the i8259 is routed through an IOAPIC
+	 * Put that IOAPIC in virtual wire mode
+	 * so legacy interrupts can be delivered.
+	 */
+	pin = find_isa_irq_pin(0, mp_ExtINT);
+	if (pin != -1) {
+		struct IO_APIC_route_entry entry;
+		unsigned long flags;
+
+		memset(&entry, 0, sizeof(entry));
+		entry.mask            = 0; /* Enabled */
+		entry.trigger         = 0; /* Edge */
+		entry.irr             = 0;
+		entry.polarity        = 0; /* High */
+		entry.delivery_status = 0;
+		entry.dest_mode       = 0; /* Physical */
+		entry.delivery_mode   = 7; /* ExtInt */
+		entry.vector          = 0;
+		entry.dest.physical.physical_dest = 0;
+
+
+		/*
+		 * Add it to the IO-APIC irq-routing table:
+		 */
+		spin_lock_irqsave(&ioapic_lock, flags);
+		io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
+		io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+	}
+
+	disconnect_bsp_APIC(pin != -1);
 }
 
 /*
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 62b112e4deb4..cc3fb85f5145 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -14,6 +14,7 @@
 #include <linux/interrupt.h>
 #include <linux/seq_file.h>
 #include <linux/module.h>
+#include <linux/delay.h>
 #include <asm/uaccess.h>
 #include <asm/io_apic.h>
 
@@ -106,3 +107,31 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
	return 1;
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+void fixup_irqs(cpumask_t map)
+{
+	unsigned int irq;
+	static int warned;
+
+	for (irq = 0; irq < NR_IRQS; irq++) {
+		cpumask_t mask;
+		if (irq == 2)
+			continue;
+
+		cpus_and(mask, irq_affinity[irq], map);
+		if (any_online_cpu(mask) == NR_CPUS) {
+			printk("Breaking affinity for irq %i\n", irq);
+			mask = map;
+		}
+		if (irq_desc[irq].handler->set_affinity)
+			irq_desc[irq].handler->set_affinity(irq, mask);
+		else if (irq_desc[irq].action && !(warned++))
+			printk("Cannot set affinity for irq %i\n", irq);
+	}
+
+	/* That doesn't seem sufficient.  Give it 1ms. */
+	local_irq_enable();
+	mdelay(1);
+	local_irq_disable();
+}
+#endif
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
new file mode 100644
index 000000000000..60d1eff41567
--- /dev/null
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -0,0 +1,250 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/reboot.h>
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/hw_irq.h>
+
+#define LEVEL0_SIZE (1UL << 12UL)
+#define LEVEL1_SIZE (1UL << 21UL)
+#define LEVEL2_SIZE (1UL << 30UL)
+#define LEVEL3_SIZE (1UL << 39UL)
+#define LEVEL4_SIZE (1UL << 48UL)
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE)
+#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+static void init_level2_page(u64 *level2p, unsigned long addr)
+{
+	unsigned long end_addr;
+
+	addr &= PAGE_MASK;
+	end_addr = addr + LEVEL2_SIZE;
+	while (addr < end_addr) {
+		*(level2p++) = addr | L1_ATTR;
+		addr += LEVEL1_SIZE;
+	}
+}
+
+static int init_level3_page(struct kimage *image, u64 *level3p,
+	unsigned long addr, unsigned long last_addr)
+{
+	unsigned long end_addr;
+	int result;
+
+	result = 0;
+	addr &= PAGE_MASK;
+	end_addr = addr + LEVEL3_SIZE;
+	while ((addr < last_addr) && (addr < end_addr)) {
+		struct page *page;
+		u64 *level2p;
+
+		page = kimage_alloc_control_pages(image, 0);
+		if (!page) {
+			result = -ENOMEM;
+			goto out;
+		}
+		level2p = (u64 *)page_address(page);
+		init_level2_page(level2p, addr);
+		*(level3p++) = __pa(level2p) | L2_ATTR;
+		addr += LEVEL2_SIZE;
+	}
+	/* clear the unused entries */
+	while (addr < end_addr) {
+		*(level3p++) = 0;
+		addr += LEVEL2_SIZE;
+	}
+out:
+	return result;
+}
+
+
+static int init_level4_page(struct kimage *image, u64 *level4p,
+	unsigned long addr, unsigned long last_addr)
+{
+	unsigned long end_addr;
+	int result;
+
+	result = 0;
+	addr &= PAGE_MASK;
+	end_addr = addr + LEVEL4_SIZE;
+	while ((addr < last_addr) && (addr < end_addr)) {
+		struct page *page;
+		u64 *level3p;
+
+		page = kimage_alloc_control_pages(image, 0);
+		if (!page) {
+			result = -ENOMEM;
+			goto out;
+		}
+		level3p = (u64 *)page_address(page);
+		result = init_level3_page(image, level3p, addr, last_addr);
+		if (result) {
+			goto out;
+		}
+		*(level4p++) = __pa(level3p) | L3_ATTR;
+		addr += LEVEL3_SIZE;
+	}
+	/* clear the unused entries */
+	while (addr < end_addr) {
+		*(level4p++) = 0;
+		addr += LEVEL3_SIZE;
+	}
+out:
+	return result;
+}
+
+
+static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+{
+	u64 *level4p;
+	level4p = (u64 *)__va(start_pgtable);
+	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+}
+
+static void set_idt(void *newidt, u16 limit)
+{
+	unsigned char curidt[10];
+
+	/* x86-64 supports unaligned loads & stores */
+	(*(u16 *)(curidt)) = limit;
+	(*(u64 *)(curidt +2)) = (unsigned long)(newidt);
+
+	__asm__ __volatile__ (
+		"lidt %0\n"
+		: "=m" (curidt)
+		);
+};
+
+
+static void set_gdt(void *newgdt, u16 limit)
+{
+	unsigned char curgdt[10];
+
+	/* x86-64 supports unaligned loads & stores */
+	(*(u16 *)(curgdt)) = limit;
+	(*(u64 *)(curgdt +2)) = (unsigned long)(newgdt);
+
+	__asm__ __volatile__ (
+		"lgdt %0\n"
+		: "=m" (curgdt)
+		);
+};
+
+static void load_segments(void)
+{
+	__asm__ __volatile__ (
+		"\tmovl $"STR(__KERNEL_DS)",%eax\n"
+		"\tmovl %eax,%ds\n"
+		"\tmovl %eax,%es\n"
+		"\tmovl %eax,%ss\n"
+		"\tmovl %eax,%fs\n"
+		"\tmovl %eax,%gs\n"
+		);
+#undef STR
+#undef __STR
+}
+
+typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+	unsigned long control_code_buffer,
+	unsigned long start_address,
+	unsigned long pgtable) ATTRIB_NORET;
+
+const extern unsigned char relocate_new_kernel[];
+const extern unsigned long relocate_new_kernel_size;
+
+int machine_kexec_prepare(struct kimage *image)
+{
+	unsigned long start_pgtable, control_code_buffer;
+	int result;
+
+	/* Calculate the offsets */
+	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+	control_code_buffer = start_pgtable + 4096UL;
+
+	/* Setup the identity mapped 64bit page table */
+	result = init_pgtable(image, start_pgtable);
+	if (result)
+		return result;
+
+	/* Place the code in the reboot code buffer */
+	memcpy(__va(control_code_buffer), relocate_new_kernel,
+		relocate_new_kernel_size);
+
+	return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+	return;
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+	unsigned long page_list;
+	unsigned long control_code_buffer;
+	unsigned long start_pgtable;
+	relocate_new_kernel_t rnk;
+
+	/* Interrupts aren't acceptable while we reboot */
+	local_irq_disable();
+
+	/* Calculate the offsets */
+	page_list = image->head;
+	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+	control_code_buffer = start_pgtable + 4096UL;
+
+	/* Set the low half of the page table to my identity mapped
+	 * page table for kexec.  Leave the high half pointing at the
+	 * kernel pages.  Don't bother to flush the global pages
+	 * as that will happen when I fully switch to my identity mapped
+	 * page table anyway.
+	 */
+	memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
+	__flush_tlb();
+
+
+	/* The segment registers are funny things, they are
+	 * automatically loaded from a table, in memory wherever you
+	 * set them to a specific selector, but this table is never
+	 * accessed again unless you set the segment to a different selector.
+	 *
+	 * The more common model are caches where the behind
+	 * the scenes work is done, but is also dropped at arbitrary
+	 * times.
+	 *
+	 * I take advantage of this here by force loading the
+	 * segments, before I zap the gdt with an invalid value.
+	 */
+	load_segments();
+	/* The gdt & idt are now invalid.
+	 * If you want to load them you must set up your own idt & gdt.
+	 */
+	set_gdt(phys_to_virt(0), 0);
+	set_idt(phys_to_virt(0), 0);
+	/* now call it */
+	rnk = (relocate_new_kernel_t) control_code_buffer;
+	(*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
+}
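
As a cross-check of the LEVEL*_SIZE constants above: each x86-64 paging level fans out by a factor of 512 (a 4 KiB table of 8-byte entries), which is exactly the stride arithmetic that init_level2_page() and the levels above it rely on. A standalone sanity check, with the constants copied from the file and nothing else taken from the patch:

/* Standalone check of the page-table span arithmetic used by the
 * identity-map builder in machine_kexec.c; constants copied from the file.
 */
#include <assert.h>

#define LEVEL0_SIZE (1UL << 12UL)	/* 4 KiB page */
#define LEVEL1_SIZE (1UL << 21UL)	/* 2 MiB page (L1_ATTR sets _PAGE_PSE) */
#define LEVEL2_SIZE (1UL << 30UL)	/* 1 GiB covered per PMD-level table */
#define LEVEL3_SIZE (1UL << 39UL)	/* 512 GiB per PUD-level table */
#define LEVEL4_SIZE (1UL << 48UL)	/* 256 TiB: the whole 48-bit span */

int main(void)
{
	/* Every level holds 512 eight-byte entries, so each level covers
	 * 512 times the one below it; the loop bounds in init_level2_page()
	 * and init_level3_page() depend on exactly this. */
	assert(LEVEL1_SIZE / LEVEL0_SIZE == 512);
	assert(LEVEL2_SIZE / LEVEL1_SIZE == 512);
	assert(LEVEL3_SIZE / LEVEL2_SIZE == 512);
	assert(LEVEL4_SIZE / LEVEL3_SIZE == 512);
	return 0;
}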
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 3a89d735a4f6..21e70625a495 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -327,7 +327,7 @@ static void mce_init(void *dummy)
 }
 
 /* Add per CPU specific workarounds here */
-static void __init mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 {
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
@@ -337,7 +337,7 @@ static void __init mce_cpu_quirks(struct cpuinfo_x86 *c)
	}
 }
 
-static void __init mce_cpu_features(struct cpuinfo_x86 *c)
+static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
 {
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
@@ -352,7 +352,7 @@ static void __init mce_cpu_features(struct cpuinfo_x86 *c)
  * Called for each booted CPU to set up machine checks.
  * Must be called with preempt off.
  */
-void __init mcheck_init(struct cpuinfo_x86 *c)
+void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 {
	static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;
 
@@ -411,7 +411,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff
	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;
 
-	synchronize_kernel();
+	synchronize_sched();
 
	/* Collect entries that were still getting written before the synchronize. */
 
@@ -542,7 +542,7 @@ ACCESSOR(bank4ctl,bank[4],mce_restart())
 ACCESSOR(tolerant,tolerant,)
 ACCESSOR(check_interval,check_interval,mce_restart())
 
-static __init int mce_init_device(void)
+static __cpuinit int mce_init_device(void)
 {
	int err;
	if (!mce_available(&boot_cpu_data))
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
index 4db9a640069f..0be0a7959814 100644
--- a/arch/x86_64/kernel/mce_intel.c
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -42,7 +42,7 @@ done:
	irq_exit();
 }
 
-static void __init intel_init_thermal(struct cpuinfo_x86 *c)
+static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
 {
	u32 l, h;
	int tm2 = 0;
@@ -93,7 +93,7 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c)
	return;
 }
 
-void __init mce_intel_feature_init(struct cpuinfo_x86 *c)
+void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
	intel_init_thermal(c);
 }
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 31c0f2e6ac91..4e44d6e6b7e5 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -98,7 +98,7 @@ static unsigned int nmi_p4_cccr_val;
	(P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT|	\
	 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
 
-static __init inline int nmi_known_cpu(void)
+static __cpuinit inline int nmi_known_cpu(void)
 {
	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_AMD:
@@ -110,7 +110,7 @@ static __init inline int nmi_known_cpu(void)
 }
 
 /* Run after command line and cpu_init init, but before all other checks */
-void __init nmi_watchdog_default(void)
+void __cpuinit nmi_watchdog_default(void)
 {
	if (nmi_watchdog != NMI_DEFAULT)
		return;
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index e59d1f9d6163..1d91271796e5 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -8,7 +8,8 @@
  *
  *  X86-64 port
  *	Andi Kleen.
  *
+ *	CPU hotplug support - ashok.raj@intel.com
  *  $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
  */
 
@@ -18,6 +19,7 @@
 
 #include <stdarg.h>
 
+#include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
@@ -154,6 +156,29 @@ void cpu_idle_wait(void)
 }
 EXPORT_SYMBOL_GPL(cpu_idle_wait);
 
+#ifdef CONFIG_HOTPLUG_CPU
+DECLARE_PER_CPU(int, cpu_state);
+
+#include <asm/nmi.h>
+/* We don't actually take CPU down, just spin without interrupts. */
+static inline void play_dead(void)
+{
+	idle_task_exit();
+	wbinvd();
+	mb();
+	/* Ack it */
+	__get_cpu_var(cpu_state) = CPU_DEAD;
+
+	while (1)
+		safe_halt();
+}
+#else
+static inline void play_dead(void)
+{
+	BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
 /*
  * The idle thread. There's no useful work to be
  * done, so just try to conserve power and have a
@@ -174,6 +199,8 @@ void cpu_idle (void)
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
+			if (cpu_is_offline(smp_processor_id()))
+				play_dead();
			idle();
		}
 
@@ -204,7 +231,7 @@ static void mwait_idle(void)
	}
 }
 
-void __init select_idle_routine(const struct cpuinfo_x86 *c)
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 {
	static int printed;
	if (cpu_has(c, X86_FEATURE_MWAIT)) {
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
index be4b36f762cf..57e71dbdfd69 100644
--- a/arch/x86_64/kernel/reboot.c
+++ b/arch/x86_64/kernel/reboot.c
@@ -66,41 +66,47 @@ static int __init reboot_setup(char *str)
 
 __setup("reboot=", reboot_setup);
 
-#ifdef CONFIG_SMP
-static void smp_halt(void)
-{
-	int cpuid = safe_smp_processor_id();
-	static int first_entry = 1;
-
-	if (reboot_force)
-		return;
-
-	if (first_entry) {
-		first_entry = 0;
-		smp_call_function((void *)machine_restart, NULL, 1, 0);
-	}
-
-	smp_stop_cpu();
-
-	/* AP calling this. Just halt */
-	if (cpuid != boot_cpu_id) {
-		for (;;)
-			asm("hlt");
-	}
-
-	/* Wait for all other CPUs to have run smp_stop_cpu */
-	while (!cpus_empty(cpu_online_map))
-		rep_nop();
-}
-#endif
-
 static inline void kb_wait(void)
 {
	int i;
 
	for (i=0; i<0x10000; i++)
		if ((inb_p(0x64) & 0x02) == 0)
			break;
 }
 
+void machine_shutdown(void)
+{
+	/* Stop the cpus and apics */
+#ifdef CONFIG_SMP
+	int reboot_cpu_id;
+
+	/* The boot cpu is always logical cpu 0 */
+	reboot_cpu_id = 0;
+
+	/* Make certain the cpu I'm about to reboot on is online */
+	if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
+		reboot_cpu_id = smp_processor_id();
+	}
+
+	/* Make certain I only run on the appropriate processor */
+	set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
+
+	/* O.K Now that I'm on the appropriate processor,
+	 * stop all of the others.
+	 */
+	smp_send_stop();
+#endif
+
+	local_irq_disable();
+
+#ifndef CONFIG_SMP
+	disable_local_APIC();
+#endif
+
+	disable_IO_APIC();
+
+	local_irq_enable();
+}
+
 void machine_restart(char * __unused)
@@ -109,9 +115,7 @@ void machine_restart(char * __unused)
 
	printk("machine restart\n");
 
-#ifdef CONFIG_SMP
-	smp_halt();
-#endif
+	machine_shutdown();
 
	if (!reboot_force) {
		local_irq_disable();
diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S
new file mode 100644
index 000000000000..d24fa9b72a2b
--- /dev/null
+++ b/arch/x86_64/kernel/relocate_kernel.S
@@ -0,0 +1,143 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+
+	/*
+	 * Must be relocatable PIC code callable as a C function, that once
+	 * it starts can not use the previous processes stack.
+	 */
+	.globl relocate_new_kernel
+	.code64
+relocate_new_kernel:
+	/* %rdi page_list
+	 * %rsi reboot_code_buffer
+	 * %rdx start address
+	 * %rcx page_table
+	 * %r8  arg5
+	 * %r9  arg6
+	 */
+
+	/* zero out flags, and disable interrupts */
+	pushq $0
+	popfq
+
+	/* set a new stack at the bottom of our page... */
+	lea   4096(%rsi), %rsp
+
+	/* store the parameters back on the stack */
+	pushq %rdx /* store the start address */
+
+	/* Set cr0 to a known state:
+	 * 31 1 == Paging enabled
+	 * 18 0 == Alignment check disabled
+	 * 16 0 == Write protect disabled
+	 *  3 0 == No task switch
+	 *  2 0 == Don't do FP software emulation.
+	 *  0 1 == Protected mode enabled
+	 */
+	movq	%cr0, %rax
+	andq	$~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
+	orl	$((1<<31)|(1<<0)), %eax
+	movq	%rax, %cr0
+
+	/* Set cr4 to a known state:
+	 * 10 0 == xmm exceptions disabled
+	 *  9 0 == xmm registers instructions disabled
+	 *  8 0 == performance monitoring counter disabled
+	 *  7 0 == page global disabled
+	 *  6 0 == machine check exceptions disabled
+	 *  5 1 == physical address extension enabled
+	 *  4 0 == page size extensions disabled
+	 *  3 0 == Debug extensions disabled
+	 *  2 0 == Time stamp disable (disabled)
+	 *  1 0 == Protected mode virtual interrupts disabled
+	 *  0 0 == VME disabled
+	 */
+
+	movq	$((1<<5)), %rax
+	movq	%rax, %cr4
+
+	jmp 1f
+1:
+
+	/* Switch to the identity mapped page tables,
+	 * and flush the TLB.
+	 */
+	movq	%rcx, %cr3
+
+	/* Do the copies */
+	movq	%rdi, %rcx	/* Put the page_list in %rcx */
+	xorq	%rdi, %rdi
+	xorq	%rsi, %rsi
+	jmp	1f
+
+0:	/* top, read another word for the indirection page */
+
+	movq	(%rbx), %rcx
+	addq	$8, %rbx
+1:
+	testq	$0x1, %rcx	/* is it a destination page? */
+	jz	2f
+	movq	%rcx, %rdi
+	andq	$0xfffffffffffff000, %rdi
+	jmp	0b
+2:
+	testq	$0x2, %rcx	/* is it an indirection page? */
+	jz	2f
+	movq	%rcx, %rbx
+	andq	$0xfffffffffffff000, %rbx
+	jmp	0b
+2:
+	testq	$0x4, %rcx	/* is it the done indicator? */
+	jz	2f
+	jmp	3f
+2:
+	testq	$0x8, %rcx	/* is it the source indicator? */
+	jz	0b		/* Ignore it otherwise */
+	movq	%rcx, %rsi	/* For every source page do a copy */
+	andq	$0xfffffffffffff000, %rsi
+
+	movq	$512, %rcx
+	rep ; movsq
+	jmp	0b
+3:
+
+	/* To be certain of avoiding problems with self-modifying code
+	 * I need to execute a serializing instruction here.
+	 * So I flush the TLB by reloading %cr3 here, it's handy,
+	 * and not processor dependent.
+	 */
+	movq	%cr3, %rax
+	movq	%rax, %cr3
+
+	/* set all of the registers to known values */
+	/* leave %rsp alone */
+
+	xorq	%rax, %rax
+	xorq	%rbx, %rbx
+	xorq	%rcx, %rcx
+	xorq	%rdx, %rdx
+	xorq	%rsi, %rsi
+	xorq	%rdi, %rdi
+	xorq	%rbp, %rbp
+	xorq	%r8,  %r8
+	xorq	%r9,  %r9
+	xorq	%r10, %r10
+	xorq	%r11, %r11
+	xorq	%r12, %r12
+	xorq	%r13, %r13
+	xorq	%r14, %r14
+	xorq	%r15, %r15
+
+	ret
+relocate_new_kernel_end:
+
+	.globl relocate_new_kernel_size
+relocate_new_kernel_size:
+	.quad relocate_new_kernel_end - relocate_new_kernel
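
The testq chain above walks kexec's indirection-page format: the low bits of each 64-bit entry tag its meaning, and the rest is a page address. A hedged C rendering of the same decode loop follows; the flag values mirror the testq immediates (they match the IND_* constants in the generic kexec code), while the entry stream and the printf standing in for the rep movsq copy are illustrative only:

/* C rendering of the indirection-page walk in relocate_new_kernel above.
 * Flag values match the testq immediates; entries[] is made-up data and
 * printf stands in for the 512 x movsq page copy.
 */
#include <stdint.h>
#include <stdio.h>

#define IND_DESTINATION	0x1	/* sets the copy destination (%rdi) */
#define IND_INDIRECTION	0x2	/* points at the next entry page (%rbx) */
#define IND_DONE	0x4	/* terminates the walk */
#define IND_SOURCE	0x8	/* names a source page to copy (%rsi) */
#define PAGE_ADDR(e)	((e) & 0xfffffffffffff000UL)

int main(void)
{
	uint64_t entries[] = {			/* illustrative stream */
		0x200000 | IND_DESTINATION,
		0x300000 | IND_SOURCE,
		IND_DONE,
	};
	uint64_t dest = 0;

	for (size_t i = 0; i < sizeof(entries) / sizeof(entries[0]); i++) {
		uint64_t e = entries[i];

		if (e & IND_DESTINATION)
			dest = PAGE_ADDR(e);
		else if (e & IND_SOURCE) {
			printf("copy page 0x%llx -> 0x%llx\n",
			       (unsigned long long)PAGE_ADDR(e),
			       (unsigned long long)dest);
			dest += 4096;	/* movsq leaves %rdi past the page */
		} else if (e & IND_DONE)
			break;
		/* IND_INDIRECTION would redirect the read pointer instead. */
	}
	return 0;
}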
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 000015dd5a8b..b02d921da4f7 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -41,6 +41,7 @@
 #include <linux/kallsyms.h>
 #include <linux/edd.h>
 #include <linux/mmzone.h>
+#include <linux/kexec.h>
 
 #include <asm/mtrr.h>
 #include <asm/uaccess.h>
@@ -367,6 +368,27 @@ static __init void parse_cmdline_early (char ** cmdline_p)
		if (!memcmp(from, "noexec=", 7))
			nonx_setup(from + 7);
 
+#ifdef CONFIG_KEXEC
+		/* crashkernel=size@addr specifies the location to reserve for
+		 * a crash kernel.  By reserving this memory we guarantee
+		 * that linux never sets it up as a DMA target.
+		 * Useful for holding code to do something appropriate
+		 * after a kernel panic.
+		 */
+		else if (!memcmp(from, "crashkernel=", 12)) {
+			unsigned long size, base;
+			size = memparse(from+12, &from);
+			if (*from == '@') {
+				base = memparse(from+1, &from);
+				/* FIXME: Do I want a sanity check
+				 * to validate the memory range?
+				 */
+				crashk_res.start = base;
+				crashk_res.end   = base + size - 1;
+			}
+		}
+#endif
+
	next_char:
		c = *(from++);
		if (!c)
@@ -625,6 +647,13 @@ void __init setup_arch(char **cmdline_p)
625#endif 647#endif
626 648
627 sparse_init(); 649 sparse_init();
650
651#ifdef CONFIG_KEXEC
652 if (crashk_res.start != crashk_res.end) {
653 reserve_bootmem(crashk_res.start,
654 crashk_res.end - crashk_res.start + 1);
655 }
656#endif
628 paging_init(); 657 paging_init();
629 658
630 check_ioapic(); 659 check_ioapic();
@@ -676,7 +705,7 @@ void __init setup_arch(char **cmdline_p)
676#endif 705#endif
677} 706}
678 707
679static int __init get_model_name(struct cpuinfo_x86 *c) 708static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
680{ 709{
681 unsigned int *v; 710 unsigned int *v;
682 711
@@ -692,7 +721,7 @@ static int __init get_model_name(struct cpuinfo_x86 *c)
692} 721}
693 722
694 723
695static void __init display_cacheinfo(struct cpuinfo_x86 *c) 724static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
696{ 725{
697 unsigned int n, dummy, eax, ebx, ecx, edx; 726 unsigned int n, dummy, eax, ebx, ecx, edx;
698 727
@@ -803,7 +832,7 @@ static int __init init_amd(struct cpuinfo_x86 *c)
803 return r; 832 return r;
804} 833}
805 834
806static void __init detect_ht(struct cpuinfo_x86 *c) 835static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
807{ 836{
808#ifdef CONFIG_SMP 837#ifdef CONFIG_SMP
809 u32 eax, ebx, ecx, edx; 838 u32 eax, ebx, ecx, edx;
@@ -864,7 +893,7 @@ static void __init detect_ht(struct cpuinfo_x86 *c)
864/* 893/*
865 * find out the number of processor cores on the die 894 * find out the number of processor cores on the die
866 */ 895 */
867static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c) 896static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
868{ 897{
869 unsigned int eax; 898 unsigned int eax;
870 899
@@ -882,7 +911,7 @@ static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c)
882 return 1; 911 return 1;
883} 912}
884 913
885static void __init init_intel(struct cpuinfo_x86 *c) 914static void __cpuinit init_intel(struct cpuinfo_x86 *c)
886{ 915{
887 /* Cache sizes */ 916 /* Cache sizes */
888 unsigned n; 917 unsigned n;
@@ -902,7 +931,7 @@ static void __init init_intel(struct cpuinfo_x86 *c)
902 c->x86_num_cores = intel_num_cpu_cores(c); 931 c->x86_num_cores = intel_num_cpu_cores(c);
903} 932}
904 933
905void __init get_cpu_vendor(struct cpuinfo_x86 *c) 934void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
906{ 935{
907 char *v = c->x86_vendor_id; 936 char *v = c->x86_vendor_id;
908 937
@@ -923,7 +952,7 @@ struct cpu_model_info {
923/* Do some early cpuid on the boot CPU to get some parameters that are 952
924 needed before check_bugs. Everything advanced is in identify_cpu 953 needed before check_bugs. Everything advanced is in identify_cpu
925 below. */ 954 below. */
926void __init early_identify_cpu(struct cpuinfo_x86 *c) 955void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
927{ 956{
928 u32 tfms; 957 u32 tfms;
929 958
@@ -977,7 +1006,7 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c)
977/* 1006/*
978 * This does the hard work of actually picking apart the CPU stuff... 1007 * This does the hard work of actually picking apart the CPU stuff...
979 */ 1008 */
980void __init identify_cpu(struct cpuinfo_x86 *c) 1009void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
981{ 1010{
982 int i; 1011 int i;
983 u32 xlvl; 1012 u32 xlvl;
@@ -1054,7 +1083,7 @@ void __init identify_cpu(struct cpuinfo_x86 *c)
1054} 1083}
1055 1084
1056 1085
1057void __init print_cpu_info(struct cpuinfo_x86 *c) 1086void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1058{ 1087{
1059 if (c->x86_model_id[0]) 1088 if (c->x86_model_id[0])
1060 printk("%s", c->x86_model_id); 1089 printk("%s", c->x86_model_id);
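
The crashkernel= parser added above leans on the kernel's memparse(), which reads a number with an optional K/M/G suffix and advances the cursor, so booting with crashkernel=64M@16M reserves 64 MB starting at physical address 16 MB. A stand-alone C sketch of the same size@addr grammar, with parse_size() as a hypothetical stand-in for memparse():

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical stand-in for the kernel's memparse(): parse a number
     * with an optional K/M/G suffix and advance *cursor past it. */
    static unsigned long parse_size(const char *s, char **cursor)
    {
        unsigned long v = strtoul(s, cursor, 0);

        switch (**cursor) {
        case 'G': case 'g':
            v <<= 10;           /* fall through */
        case 'M': case 'm':
            v <<= 10;           /* fall through */
        case 'K': case 'k':
            v <<= 10;
            (*cursor)++;
        }
        return v;
    }

    int main(void)
    {
        const char *arg = "64M@16M";    /* as in crashkernel=64M@16M */
        char *p;
        unsigned long size = parse_size(arg, &p);

        if (*p == '@') {
            unsigned long base = parse_size(p + 1, &p);
            printf("reserve %lu bytes at 0x%lx\n", size, base);
            /* -> reserve 67108864 bytes at 0x1000000 */
        }
        return 0;
    }
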
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 678b7ac33b8b..f2b9c6bc999a 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -29,7 +29,7 @@
29 29
30char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; 30char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
31 31
32cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; 32cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
33 33
34struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; 34struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned;
35 35
@@ -171,7 +171,7 @@ void syscall_init(void)
171 wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); 171 wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
172} 172}
173 173
174void __init check_efer(void) 174void __cpuinit check_efer(void)
175{ 175{
176 unsigned long efer; 176 unsigned long efer;
177 177
@@ -188,7 +188,7 @@ void __init check_efer(void)
188 * 'CPU state barrier', nothing should get across. 188 * 'CPU state barrier', nothing should get across.
189 * A lot of state is already set up in PDA init. 189 * A lot of state is already set up in PDA init.
190 */ 190 */
191void __init cpu_init (void) 191void __cpuinit cpu_init (void)
192{ 192{
193#ifdef CONFIG_SMP 193#ifdef CONFIG_SMP
194 int cpu = stack_smp_processor_id(); 194 int cpu = stack_smp_processor_id();
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index 98b7ba95d581..98590a989f3d 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -425,7 +425,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
425 if (!user_mode(regs)) 425 if (!user_mode(regs))
426 return 1; 426 return 1;
427 427
428 if (try_to_freeze(0)) 428 if (try_to_freeze())
429 goto no_signal; 429 goto no_signal;
430 430
431 if (!oldset) 431 if (!oldset)
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 1e379ed17b1d..ccae392886af 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -283,6 +283,16 @@ struct call_data_struct {
283 283
284static struct call_data_struct * call_data; 284static struct call_data_struct * call_data;
285 285
286void lock_ipi_call_lock(void)
287{
288 spin_lock_irq(&call_lock);
289}
290
291void unlock_ipi_call_lock(void)
292{
293 spin_unlock_irq(&call_lock);
294}
295
286/* 296/*
287 * this function sends a 'generic call function' IPI to all other CPUs 297 * this function sends a 'generic call function' IPI to all other CPUs
288 * in the system. 298 * in the system.
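
lock_ipi_call_lock()/unlock_ipi_call_lock() exist so a CPU coming online can serialize against an in-flight smp_call_function(); their caller is in the smpboot.c hunk below. The race they close, as a schematic in comment form only, not kernel code:

    /*
     * smp_call_function(), under call_lock:
     *     cpus = num_online_cpus() - 1;   // count the IPI recipients
     *     send_IPI_allbutself();          // deliver to all online CPUs
     *     wait for 'cpus' acknowledgements;
     *
     * A secondary CPU running start_secondary() without the lock could
     * slip in between the count and the send:
     *     cpu_set(me, cpu_online_map);    // becomes a recipient...
     *                                     // ...yet was never counted
     *
     * Taking call_lock around the cpu_online_map update makes the
     * recipient count and the delivery set agree.
     */
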
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index f1ec0f345941..b969ee128728 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -34,6 +34,7 @@
34 * Andi Kleen : Converted to new state machine. 34 * Andi Kleen : Converted to new state machine.
35 * Various cleanups. 35 * Various cleanups.
36 * Probably mostly hotplug CPU ready now. 36 * Probably mostly hotplug CPU ready now.
37 * Ashok Raj : CPU hotplug support
37 */ 38 */
38 39
39 40
@@ -58,11 +59,6 @@
58#include <asm/proto.h> 59#include <asm/proto.h>
59#include <asm/nmi.h> 60#include <asm/nmi.h>
60 61
61/* Change for real CPU hotplug. Note other files need to be fixed
62 first too. */
63#define __cpuinit __init
64#define __cpuinitdata __initdata
65
66/* Number of siblings per CPU package */ 62/* Number of siblings per CPU package */
67int smp_num_siblings = 1; 63int smp_num_siblings = 1;
68/* Package ID of each logical CPU */ 64/* Package ID of each logical CPU */
@@ -103,6 +99,37 @@ EXPORT_SYMBOL(cpu_core_map);
103extern unsigned char trampoline_data[]; 99extern unsigned char trampoline_data[];
104extern unsigned char trampoline_end[]; 100extern unsigned char trampoline_end[];
105 101
102/* State of each CPU */
103DEFINE_PER_CPU(int, cpu_state) = { 0 };
104
105/*
106 * Store all idle threads; these can be reused instead of creating
107 * a new thread. This also avoids complicated thread-destroy handling
108 * for idle threads.
109 */
110struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata;
111
112#define get_idle_for_cpu(x) (idle_thread_array[(x)])
113#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
114
115/*
116 * cpu_possible_map should be static; it cannot change as CPUs
117 * are onlined or offlined. The reason is that per-cpu data structures
118 * are allocated by some modules at init time, and they don't expect to
119 * do this dynamically on CPU arrival/departure.
120 * cpu_present_map, on the other hand, can change dynamically.
121 * When CPU hotplug is not compiled in, we keep the current
122 * behaviour, which is cpu_possible == cpu_present.
123 * If CPU hotplug is supported, we need to preallocate for all
124 * NR_CPUS, hence cpu_possible_map represents the entire NR_CPUS range.
125 * - Ashok Raj
126 */
127#ifdef CONFIG_HOTPLUG_CPU
128#define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map)
129#else
130#define fixup_cpu_possible_map(x)
131#endif
132
106/* 133/*
107 * Currently trivial. Write the real->protected mode 134 * Currently trivial. Write the real->protected mode
108 * bootstrap into the page concerned. The caller 135 * bootstrap into the page concerned. The caller
@@ -418,6 +445,33 @@ void __cpuinit smp_callin(void)
418 cpu_set(cpuid, cpu_callin_map); 445 cpu_set(cpuid, cpu_callin_map);
419} 446}
420 447
448static inline void set_cpu_sibling_map(int cpu)
449{
450 int i;
451
452 if (smp_num_siblings > 1) {
453 for_each_cpu(i) {
454 if (cpu_core_id[cpu] == cpu_core_id[i]) {
455 cpu_set(i, cpu_sibling_map[cpu]);
456 cpu_set(cpu, cpu_sibling_map[i]);
457 }
458 }
459 } else {
460 cpu_set(cpu, cpu_sibling_map[cpu]);
461 }
462
463 if (current_cpu_data.x86_num_cores > 1) {
464 for_each_cpu(i) {
465 if (phys_proc_id[cpu] == phys_proc_id[i]) {
466 cpu_set(i, cpu_core_map[cpu]);
467 cpu_set(cpu, cpu_core_map[i]);
468 }
469 }
470 } else {
471 cpu_core_map[cpu] = cpu_sibling_map[cpu];
472 }
473}
474
421/* 475/*
422 * Setup code on secondary processor (after coming out of the trampoline) 476
423 */ 477 */
@@ -448,9 +502,28 @@ void __cpuinit start_secondary(void)
448 enable_APIC_timer(); 502 enable_APIC_timer();
449 503
450 /* 504 /*
505 * The sibling maps must be set before turning the online map on for
506 * this CPU.
507 */
508 set_cpu_sibling_map(smp_processor_id());
509
510 /*
511 * We need to hold call_lock, so there is no inconsistency
512 * between the time smp_call_function() determines the number of
513 * IPI recipients and the time when the determination is made
514 * for which CPUs receive the IPI in genapic_flat.c. Holding this
515 * lock keeps this CPU out of a currently in-progress
516 * smp_call_function().
517 */
518 lock_ipi_call_lock();
519
520 /*
451 * Allow the master to continue. 521 * Allow the master to continue.
452 */ 522 */
453 cpu_set(smp_processor_id(), cpu_online_map); 523 cpu_set(smp_processor_id(), cpu_online_map);
524 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
525 unlock_ipi_call_lock();
526
454 mb(); 527 mb();
455 528
456 /* Wait for TSC sync to not schedule things before. 529 /* Wait for TSC sync to not schedule things before.
@@ -628,33 +701,77 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
628 return (send_status | accept_status); 701 return (send_status | accept_status);
629} 702}
630 703
704struct create_idle {
705 struct task_struct *idle;
706 struct completion done;
707 int cpu;
708};
709
710void do_fork_idle(void *_c_idle)
711{
712 struct create_idle *c_idle = _c_idle;
713
714 c_idle->idle = fork_idle(c_idle->cpu);
715 complete(&c_idle->done);
716}
717
631/* 718/*
632 * Boot one CPU. 719 * Boot one CPU.
633 */ 720 */
634static int __cpuinit do_boot_cpu(int cpu, int apicid) 721static int __cpuinit do_boot_cpu(int cpu, int apicid)
635{ 722{
636 struct task_struct *idle;
637 unsigned long boot_error; 723 unsigned long boot_error;
638 int timeout; 724 int timeout;
639 unsigned long start_rip; 725 unsigned long start_rip;
726 struct create_idle c_idle = {
727 .cpu = cpu,
728 .done = COMPLETION_INITIALIZER(c_idle.done),
729 };
730 DECLARE_WORK(work, do_fork_idle, &c_idle);
731
732 c_idle.idle = get_idle_for_cpu(cpu);
733
734 if (c_idle.idle) {
735 c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
736 (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1);
737 init_idle(c_idle.idle, cpu);
738 goto do_rest;
739 }
740
640 /* 741 /*
641 * We can't use kernel_thread since we must avoid to 742 * During the cold boot process, the keventd thread is not spun up yet.
642 * reschedule the child. 743 * When we do CPU hot-add, we create idle threads on the fly and should
744 * not acquire any attributes from the calling context. Hence the clean
745 * way to create kernel threads is to do so from keventd().
746 * We check current_is_keventd() because the ACPI notifier
747 * also queues work to keventd(), and when the caller is already
748 * running in the context of keventd(), we would end up locking up
749 * the keventd thread.
643 */ 750 */
644 idle = fork_idle(cpu); 751 if (!keventd_up() || current_is_keventd())
645 if (IS_ERR(idle)) { 752 work.func(work.data);
753 else {
754 schedule_work(&work);
755 wait_for_completion(&c_idle.done);
756 }
757
758 if (IS_ERR(c_idle.idle)) {
646 printk("failed fork for CPU %d\n", cpu); 759 printk("failed fork for CPU %d\n", cpu);
647 return PTR_ERR(idle); 760 return PTR_ERR(c_idle.idle);
648 } 761 }
649 762
650 cpu_pda[cpu].pcurrent = idle; 763 set_idle_for_cpu(cpu, c_idle.idle);
764
765do_rest:
766
767 cpu_pda[cpu].pcurrent = c_idle.idle;
651 768
652 start_rip = setup_trampoline(); 769 start_rip = setup_trampoline();
653 770
654 init_rsp = idle->thread.rsp; 771 init_rsp = c_idle.idle->thread.rsp;
655 per_cpu(init_tss,cpu).rsp0 = init_rsp; 772 per_cpu(init_tss,cpu).rsp0 = init_rsp;
656 initial_code = start_secondary; 773 initial_code = start_secondary;
657 clear_ti_thread_flag(idle->thread_info, TIF_FORK); 774 clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK);
658 775
659 printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, 776 printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
660 start_rip, init_rsp); 777 start_rip, init_rsp);
@@ -746,51 +863,6 @@ cycles_t cacheflush_time;
746unsigned long cache_decay_ticks; 863unsigned long cache_decay_ticks;
747 864
748/* 865/*
749 * Construct cpu_sibling_map[], so that we can tell the sibling CPU
750 * on SMT systems efficiently.
751 */
752static __cpuinit void detect_siblings(void)
753{
754 int cpu;
755
756 for (cpu = 0; cpu < NR_CPUS; cpu++) {
757 cpus_clear(cpu_sibling_map[cpu]);
758 cpus_clear(cpu_core_map[cpu]);
759 }
760
761 for_each_online_cpu (cpu) {
762 struct cpuinfo_x86 *c = cpu_data + cpu;
763 int siblings = 0;
764 int i;
765 if (smp_num_siblings > 1) {
766 for_each_online_cpu (i) {
767 if (cpu_core_id[cpu] == cpu_core_id[i]) {
768 siblings++;
769 cpu_set(i, cpu_sibling_map[cpu]);
770 }
771 }
772 } else {
773 siblings++;
774 cpu_set(cpu, cpu_sibling_map[cpu]);
775 }
776
777 if (siblings != smp_num_siblings) {
778 printk(KERN_WARNING
779 "WARNING: %d siblings found for CPU%d, should be %d\n",
780 siblings, cpu, smp_num_siblings);
781 smp_num_siblings = siblings;
782 }
783 if (c->x86_num_cores > 1) {
784 for_each_online_cpu(i) {
785 if (phys_proc_id[cpu] == phys_proc_id[i])
786 cpu_set(i, cpu_core_map[cpu]);
787 }
788 } else
789 cpu_core_map[cpu] = cpu_sibling_map[cpu];
790 }
791}
792
793/*
794 * Cleanup possible dangling ends... 866 * Cleanup possible dangling ends...
795 */ 867 */
796static __cpuinit void smp_cleanup_boot(void) 868static __cpuinit void smp_cleanup_boot(void)
@@ -823,7 +895,7 @@ static __cpuinit void smp_cleanup_boot(void)
823 * 895 *
824 * RED-PEN audit/test this more. I bet there is more state messed up here. 896 * RED-PEN audit/test this more. I bet there is more state messed up here.
825 */ 897 */
826static __cpuinit void disable_smp(void) 898static __init void disable_smp(void)
827{ 899{
828 cpu_present_map = cpumask_of_cpu(0); 900 cpu_present_map = cpumask_of_cpu(0);
829 cpu_possible_map = cpumask_of_cpu(0); 901 cpu_possible_map = cpumask_of_cpu(0);
@@ -838,7 +910,7 @@ static __cpuinit void disable_smp(void)
838/* 910/*
839 * Handle user cpus=... parameter. 911 * Handle user cpus=... parameter.
840 */ 912 */
841static __cpuinit void enforce_max_cpus(unsigned max_cpus) 913static __init void enforce_max_cpus(unsigned max_cpus)
842{ 914{
843 int i, k; 915 int i, k;
844 k = 0; 916 k = 0;
@@ -855,7 +927,7 @@ static __cpuinit void enforce_max_cpus(unsigned max_cpus)
855/* 927/*
856 * Various sanity checks. 928 * Various sanity checks.
857 */ 929 */
858static int __cpuinit smp_sanity_check(unsigned max_cpus) 930static int __init smp_sanity_check(unsigned max_cpus)
859{ 931{
860 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { 932 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
861 printk("weird, boot CPU (#%d) not listed by the BIOS.\n", 933 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
@@ -913,7 +985,7 @@ static int __cpuinit smp_sanity_check(unsigned max_cpus)
913 * Prepare for SMP bootup. The MP table or ACPI has been read 985 * Prepare for SMP bootup. The MP table or ACPI has been read
914 * earlier. Just do some sanity checking here and enable APIC mode. 986 * earlier. Just do some sanity checking here and enable APIC mode.
915 */ 987 */
916void __cpuinit smp_prepare_cpus(unsigned int max_cpus) 988void __init smp_prepare_cpus(unsigned int max_cpus)
917{ 989{
918 int i; 990 int i;
919 991
@@ -930,10 +1002,9 @@ void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
930 int apicid = cpu_present_to_apicid(i); 1002 int apicid = cpu_present_to_apicid(i);
931 if (physid_isset(apicid, phys_cpu_present_map)) { 1003 if (physid_isset(apicid, phys_cpu_present_map)) {
932 cpu_set(i, cpu_present_map); 1004 cpu_set(i, cpu_present_map);
933 /* possible map would be different if we supported real
934 CPU hotplug. */
935 cpu_set(i, cpu_possible_map); 1005 cpu_set(i, cpu_possible_map);
936 } 1006 }
1007 fixup_cpu_possible_map(i);
937 } 1008 }
938 1009
939 if (smp_sanity_check(max_cpus) < 0) { 1010 if (smp_sanity_check(max_cpus) < 0) {
@@ -978,13 +1049,13 @@ void __init smp_prepare_boot_cpu(void)
978 int me = smp_processor_id(); 1049 int me = smp_processor_id();
979 cpu_set(me, cpu_online_map); 1050 cpu_set(me, cpu_online_map);
980 cpu_set(me, cpu_callout_map); 1051 cpu_set(me, cpu_callout_map);
1052 cpu_set(0, cpu_sibling_map[0]);
1053 cpu_set(0, cpu_core_map[0]);
1054 per_cpu(cpu_state, me) = CPU_ONLINE;
981} 1055}
982 1056
983/* 1057/*
984 * Entry point to boot a CPU. 1058 * Entry point to boot a CPU.
985 *
986 * This is all __cpuinit, not __devinit for now because we don't support
987 * CPU hotplug (yet).
988 */ 1059 */
989int __cpuinit __cpu_up(unsigned int cpu) 1060int __cpuinit __cpu_up(unsigned int cpu)
990{ 1061{
@@ -1001,6 +1072,15 @@ int __cpuinit __cpu_up(unsigned int cpu)
1001 return -EINVAL; 1072 return -EINVAL;
1002 } 1073 }
1003 1074
1075 /*
1076 * Already booted CPU?
1077 */
1078 if (cpu_isset(cpu, cpu_callin_map)) {
1079 Dprintk("do_boot_cpu %d Already started\n", cpu);
1080 return -ENOSYS;
1081 }
1082
1083 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
1004 /* Boot it! */ 1084 /* Boot it! */
1005 err = do_boot_cpu(cpu, apicid); 1085 err = do_boot_cpu(cpu, apicid);
1006 if (err < 0) { 1086 if (err < 0) {
@@ -1013,23 +1093,118 @@ int __cpuinit __cpu_up(unsigned int cpu)
1013 1093
1014 while (!cpu_isset(cpu, cpu_online_map)) 1094 while (!cpu_isset(cpu, cpu_online_map))
1015 cpu_relax(); 1095 cpu_relax();
1016 return 0; 1096 err = 0;
1097
1098 return err;
1017} 1099}
1018 1100
1019/* 1101/*
1020 * Finish the SMP boot. 1102 * Finish the SMP boot.
1021 */ 1103 */
1022void __cpuinit smp_cpus_done(unsigned int max_cpus) 1104void __init smp_cpus_done(unsigned int max_cpus)
1023{ 1105{
1106#ifndef CONFIG_HOTPLUG_CPU
1024 zap_low_mappings(); 1107 zap_low_mappings();
1108#endif
1025 smp_cleanup_boot(); 1109 smp_cleanup_boot();
1026 1110
1027#ifdef CONFIG_X86_IO_APIC 1111#ifdef CONFIG_X86_IO_APIC
1028 setup_ioapic_dest(); 1112 setup_ioapic_dest();
1029#endif 1113#endif
1030 1114
1031 detect_siblings();
1032 time_init_gtod(); 1115 time_init_gtod();
1033 1116
1034 check_nmi_watchdog(); 1117 check_nmi_watchdog();
1035} 1118}
1119
1120#ifdef CONFIG_HOTPLUG_CPU
1121
1122static void remove_siblinginfo(int cpu)
1123{
1124 int sibling;
1125
1126 for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
1127 cpu_clear(cpu, cpu_sibling_map[sibling]);
1128 for_each_cpu_mask(sibling, cpu_core_map[cpu])
1129 cpu_clear(cpu, cpu_core_map[sibling]);
1130 cpus_clear(cpu_sibling_map[cpu]);
1131 cpus_clear(cpu_core_map[cpu]);
1132 phys_proc_id[cpu] = BAD_APICID;
1133 cpu_core_id[cpu] = BAD_APICID;
1134}
1135
1136void remove_cpu_from_maps(void)
1137{
1138 int cpu = smp_processor_id();
1139
1140 cpu_clear(cpu, cpu_callout_map);
1141 cpu_clear(cpu, cpu_callin_map);
1142 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
1143}
1144
1145int __cpu_disable(void)
1146{
1147 int cpu = smp_processor_id();
1148
1149 /*
1150 * Perhaps use cpufreq to drop frequency, but that could go
1151 * into generic code.
1152 *
1153 * We won't take down the boot processor on i386 because some
1154 * interrupts can only be serviced by the BSP,
1155 * especially if we're not using an IOAPIC. -zwane
1156 */
1157 if (cpu == 0)
1158 return -EBUSY;
1159
1160 disable_APIC_timer();
1161
1162 /*
1163 * HACK:
1164 * Allow any queued timer interrupts to get serviced
1165 * This is only a temporary solution until we cleanup
1166 * fixup_irqs as we do for IA64.
1167 */
1168 local_irq_enable();
1169 mdelay(1);
1170
1171 local_irq_disable();
1172 remove_siblinginfo(cpu);
1173
1174 /* It's now safe to remove this processor from the online map */
1175 cpu_clear(cpu, cpu_online_map);
1176 remove_cpu_from_maps();
1177 fixup_irqs(cpu_online_map);
1178 return 0;
1179}
1180
1181void __cpu_die(unsigned int cpu)
1182{
1183 /* We don't do anything here: idle task is faking death itself. */
1184 unsigned int i;
1185
1186 for (i = 0; i < 10; i++) {
1187 /* They ack this in play_dead by setting CPU_DEAD */
1188 if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1189 printk ("CPU %d is now offline\n", cpu);
1190 return;
1191 }
1192 current->state = TASK_UNINTERRUPTIBLE;
1193 schedule_timeout(HZ/10);
1194 }
1195 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1196}
1197
1198#else /* ... !CONFIG_HOTPLUG_CPU */
1199
1200int __cpu_disable(void)
1201{
1202 return -ENOSYS;
1203}
1204
1205void __cpu_die(unsigned int cpu)
1206{
1207 /* We said "no" in __cpu_disable */
1208 BUG();
1209}
1210#endif /* CONFIG_HOTPLUG_CPU */
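
__cpu_die() above polls the per-cpu cpu_state up to ten times, 100 ms apart, waiting for the dying CPU to store CPU_DEAD from play_dead() (touched in process.c in this same series, not shown here). The same rendezvous in portable, runnable C, purely as an analogy; the names mirror the kernel's, but the values and mechanism are illustrative:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    enum { CPU_ONLINE, CPU_DEAD };          /* illustrative state values */

    static _Atomic int cpu_state = CPU_ONLINE;

    static void *dying_cpu(void *arg)
    {
        (void)arg;
        /* ...tear down local state, as remove_siblinginfo() does... */
        atomic_store(&cpu_state, CPU_DEAD); /* the play_dead()-style ack */
        return NULL;                        /* a real CPU would hlt forever */
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, dying_cpu, NULL);

        /* __cpu_die()-style poll: ten tries, 100 ms apart */
        for (int i = 0; i < 10; i++) {
            if (atomic_load(&cpu_state) == CPU_DEAD) {
                puts("CPU is now offline");
                pthread_join(t, NULL);
                return 0;
            }
            usleep(100 * 1000);
        }
        fprintf(stderr, "CPU didn't die...\n");
        return 1;
    }
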
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index ebaa1e37d657..6c0f402e3a88 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -44,7 +44,6 @@ void __save_processor_state(struct saved_context *ctxt)
44 */ 44 */
45 asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); 45 asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
46 asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); 46 asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
47 asm volatile ("sldt %0" : "=m" (ctxt->ldt));
48 asm volatile ("str %0" : "=m" (ctxt->tr)); 47 asm volatile ("str %0" : "=m" (ctxt->tr));
49 48
50 /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ 49 /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
@@ -69,6 +68,7 @@ void __save_processor_state(struct saved_context *ctxt)
69 asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); 68 asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2));
70 asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); 69 asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3));
71 asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4)); 70 asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4));
71 asm volatile ("movq %%cr8, %0" : "=r" (ctxt->cr8));
72} 72}
73 73
74void save_processor_state(void) 74void save_processor_state(void)
@@ -90,12 +90,20 @@ void __restore_processor_state(struct saved_context *ctxt)
90 /* 90 /*
91 * control registers 91 * control registers
92 */ 92 */
93 asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8));
93 asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); 94 asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4));
94 asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); 95 asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3));
95 asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2)); 96 asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2));
96 asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0)); 97 asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0));
97 98
98 /* 99 /*
100 * now restore the descriptor tables to their proper values
101 * ltr is done in fix_processor_context().
102 */
103 asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
104 asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
105
106 /*
99 * segment registers 107 * segment registers
100 */ 108 */
101 asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); 109 asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
@@ -108,14 +116,6 @@ void __restore_processor_state(struct saved_context *ctxt)
108 wrmsrl(MSR_GS_BASE, ctxt->gs_base); 116 wrmsrl(MSR_GS_BASE, ctxt->gs_base);
109 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); 117 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
110 118
111 /*
112 * now restore the descriptor tables to their proper values
113 * ltr is done i fix_processor_context().
114 */
115 asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
116 asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
117 asm volatile ("lldt %0" :: "m" (ctxt->ldt));
118
119 fix_processor_context(); 119 fix_processor_context();
120 120
121 do_fpu_end(); 121 do_fpu_end();
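
Moving the lgdt/lidt pair ahead of the segment register writes (and restoring %cr8 first) is not cosmetic: a mov into %ds, %es or %ss makes the CPU fetch the descriptor through whatever table GDTR currently points at. The resulting restore order, sketched as comments (not the literal kernel code):

    /*
     * __restore_processor_state(), after this patch:
     *
     *   write %cr8, %cr4, %cr3, %cr2, %cr0   control registers first
     *   lgdt saved_gdt; lidt saved_idt       GDTR/IDTR now valid again
     *   write %ds, %es, %fs, %gs, %ss        each load fetches its
     *                                        descriptor via GDTR
     *   fix_processor_context()              ltr (task register) last
     *
     * Loading a segment register before lgdt would pull a descriptor
     * from a stale table.
     */
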
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 121646fc43f6..102736630002 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -586,11 +586,17 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
586asmlinkage void default_do_nmi(struct pt_regs *regs) 586asmlinkage void default_do_nmi(struct pt_regs *regs)
587{ 587{
588 unsigned char reason = 0; 588 unsigned char reason = 0;
589 int cpu;
590
591 cpu = smp_processor_id();
589 592
590 /* Only the BSP gets external NMIs from the system. */ 593 /* Only the BSP gets external NMIs from the system. */
591 if (!smp_processor_id()) 594 if (!cpu)
592 reason = get_nmi_reason(); 595 reason = get_nmi_reason();
593 596
597 if (!cpu_online(cpu))
598 return;
599
594 if (!(reason & 0xc0)) { 600 if (!(reason & 0xc0)) {
595 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) 601 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
596 == NOTIFY_STOP) 602 == NOTIFY_STOP)
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 59ebd5beda87..73389f51c4e5 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -2,7 +2,10 @@
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; 2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 */ 3 */
4 4
5#define LOAD_OFFSET __START_KERNEL_map
6
5#include <asm-generic/vmlinux.lds.h> 7#include <asm-generic/vmlinux.lds.h>
8#include <asm/page.h>
6#include <linux/config.h> 9#include <linux/config.h>
7 10
8OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") 11OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
@@ -11,28 +14,30 @@ ENTRY(phys_startup_64)
11jiffies_64 = jiffies; 14jiffies_64 = jiffies;
12SECTIONS 15SECTIONS
13{ 16{
14 . = 0xffffffff80100000; 17 . = __START_KERNEL;
15 phys_startup_64 = startup_64 - LOAD_OFFSET; 18 phys_startup_64 = startup_64 - LOAD_OFFSET;
16 _text = .; /* Text and read-only data */ 19 _text = .; /* Text and read-only data */
17 .text : { 20 .text : AT(ADDR(.text) - LOAD_OFFSET) {
18 *(.text) 21 *(.text)
19 SCHED_TEXT 22 SCHED_TEXT
20 LOCK_TEXT 23 LOCK_TEXT
21 *(.fixup) 24 *(.fixup)
22 *(.gnu.warning) 25 *(.gnu.warning)
23 } = 0x9090 26 } = 0x9090
24 .text.lock : { *(.text.lock) } /* out-of-line lock text */ 27 /* out-of-line lock text */
28 .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
25 29
26 _etext = .; /* End of text section */ 30 _etext = .; /* End of text section */
27 31
28 . = ALIGN(16); /* Exception table */ 32 . = ALIGN(16); /* Exception table */
29 __start___ex_table = .; 33 __start___ex_table = .;
30 __ex_table : { *(__ex_table) } 34 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
31 __stop___ex_table = .; 35 __stop___ex_table = .;
32 36
33 RODATA 37 RODATA
34 38
35 .data : { /* Data */ 39 /* Data */
40 .data : AT(ADDR(.data) - LOAD_OFFSET) {
36 *(.data) 41 *(.data)
37 CONSTRUCTORS 42 CONSTRUCTORS
38 } 43 }
@@ -40,62 +45,95 @@ SECTIONS
40 _edata = .; /* End of data section */ 45 _edata = .; /* End of data section */
41 46
42 __bss_start = .; /* BSS */ 47 __bss_start = .; /* BSS */
43 .bss : { 48 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
44 *(.bss.page_aligned) 49 *(.bss.page_aligned)
45 *(.bss) 50 *(.bss)
46 } 51 }
47 __bss_end = .; 52 __bss_end = .;
48 53
54 . = ALIGN(PAGE_SIZE);
49 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 55 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
50 .data.cacheline_aligned : { *(.data.cacheline_aligned) } 56 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
57 *(.data.cacheline_aligned)
58 }
59
60#define VSYSCALL_ADDR (-10*1024*1024)
61#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095))
62#define VSYSCALL_VIRT_ADDR ((ADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095))
63
64#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
65#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
66
67#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
68#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
51 69
52#define AFTER(x) BINALIGN(LOADADDR(x) + SIZEOF(x), 16) 70 . = VSYSCALL_ADDR;
53#define BINALIGN(x,y) (((x) + (y) - 1) & ~((y) - 1)) 71 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) }
54#define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES) 72 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
55 73
56 .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) }
57 __vsyscall_0 = LOADADDR(.vsyscall_0);
58 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 74 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
59 .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) } 75 .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) }
60 xtime_lock = LOADADDR(.xtime_lock); 76 xtime_lock = VVIRT(.xtime_lock);
61 .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) } 77
62 vxtime = LOADADDR(.vxtime); 78 .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
63 .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) } 79 vxtime = VVIRT(.vxtime);
64 wall_jiffies = LOADADDR(.wall_jiffies); 80
65 .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) } 81 .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) }
66 sys_tz = LOADADDR(.sys_tz); 82 wall_jiffies = VVIRT(.wall_jiffies);
67 .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) } 83
68 sysctl_vsyscall = LOADADDR(.sysctl_vsyscall); 84 .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) }
69 .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) } 85 sys_tz = VVIRT(.sys_tz);
70 xtime = LOADADDR(.xtime); 86
87 .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) }
88 sysctl_vsyscall = VVIRT(.sysctl_vsyscall);
89
90 .xtime : AT(VLOAD(.xtime)) { *(.xtime) }
91 xtime = VVIRT(.xtime);
92
71 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 93 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
72 .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) } 94 .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
73 jiffies = LOADADDR(.jiffies); 95 jiffies = VVIRT(.jiffies);
74 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) } 96
75 . = LOADADDR(.vsyscall_0) + 4096; 97 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) }
98 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) }
99 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) }
100
101 . = VSYSCALL_VIRT_ADDR + 4096;
102
103#undef VSYSCALL_ADDR
104#undef VSYSCALL_PHYS_ADDR
105#undef VSYSCALL_VIRT_ADDR
106#undef VLOAD_OFFSET
107#undef VLOAD
108#undef VVIRT_OFFSET
109#undef VVIRT
76 110
77 . = ALIGN(8192); /* init_task */ 111 . = ALIGN(8192); /* init_task */
78 .data.init_task : { *(.data.init_task) } 112 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
113 *(.data.init_task)
114 }
79 115
80 . = ALIGN(4096); 116 . = ALIGN(4096);
81 .data.page_aligned : { *(.data.page_aligned) } 117 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
118 *(.data.page_aligned)
119 }
82 120
83 . = ALIGN(4096); /* Init code and data */ 121 . = ALIGN(4096); /* Init code and data */
84 __init_begin = .; 122 __init_begin = .;
85 .init.text : { 123 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
86 _sinittext = .; 124 _sinittext = .;
87 *(.init.text) 125 *(.init.text)
88 _einittext = .; 126 _einittext = .;
89 } 127 }
90 __initdata_begin = .; 128 __initdata_begin = .;
91 .init.data : { *(.init.data) } 129 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
92 __initdata_end = .; 130 __initdata_end = .;
93 . = ALIGN(16); 131 . = ALIGN(16);
94 __setup_start = .; 132 __setup_start = .;
95 .init.setup : { *(.init.setup) } 133 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
96 __setup_end = .; 134 __setup_end = .;
97 __initcall_start = .; 135 __initcall_start = .;
98 .initcall.init : { 136 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
99 *(.initcall1.init) 137 *(.initcall1.init)
100 *(.initcall2.init) 138 *(.initcall2.init)
101 *(.initcall3.init) 139 *(.initcall3.init)
@@ -106,32 +144,38 @@ SECTIONS
106 } 144 }
107 __initcall_end = .; 145 __initcall_end = .;
108 __con_initcall_start = .; 146 __con_initcall_start = .;
109 .con_initcall.init : { *(.con_initcall.init) } 147 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
148 *(.con_initcall.init)
149 }
110 __con_initcall_end = .; 150 __con_initcall_end = .;
111 SECURITY_INIT 151 SECURITY_INIT
112 . = ALIGN(8); 152 . = ALIGN(8);
113 __alt_instructions = .; 153 __alt_instructions = .;
114 .altinstructions : { *(.altinstructions) } 154 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
155 *(.altinstructions)
156 }
115 __alt_instructions_end = .; 157 __alt_instructions_end = .;
116 .altinstr_replacement : { *(.altinstr_replacement) } 158 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
159 *(.altinstr_replacement)
160 }
117 /* .exit.text is discarded at runtime, not link time, to deal with references 161
118 from .altinstructions and .eh_frame */ 162 from .altinstructions and .eh_frame */
119 .exit.text : { *(.exit.text) } 163 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
120 .exit.data : { *(.exit.data) } 164 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
121 . = ALIGN(4096); 165 . = ALIGN(4096);
122 __initramfs_start = .; 166 __initramfs_start = .;
123 .init.ramfs : { *(.init.ramfs) } 167 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
124 __initramfs_end = .; 168 __initramfs_end = .;
125 . = ALIGN(32); 169 . = ALIGN(32);
126 __per_cpu_start = .; 170 __per_cpu_start = .;
127 .data.percpu : { *(.data.percpu) } 171 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
128 __per_cpu_end = .; 172 __per_cpu_end = .;
129 . = ALIGN(4096); 173 . = ALIGN(4096);
130 __init_end = .; 174 __init_end = .;
131 175
132 . = ALIGN(4096); 176 . = ALIGN(4096);
133 __nosave_begin = .; 177 __nosave_begin = .;
134 .data_nosave : { *(.data.nosave) } 178 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
135 . = ALIGN(4096); 179 . = ALIGN(4096);
136 __nosave_end = .; 180 __nosave_end = .;
137 181
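
The VLOAD()/VVIRT() macros introduced above juggle the three addresses each vsyscall section has: the fixed user-visible address (VSYSCALL_ADDR = -10*1024*1024 = 0xffffffffff600000), the physical load address (the 4 KiB-rounded page after .data.cacheline_aligned), and the kernel-mapping alias its symbol gets. A worked instance with assumed numbers, purely for illustration, taking VSYSCALL_PHYS_ADDR = 0x600000 (so VSYSCALL_VIRT_ADDR = LOAD_OFFSET + 0x600000 = 0xffffffff80600000) and ADDR(.xtime_lock) = 0xffffffffff600040:

    VLOAD_OFFSET       = 0xffffffffff600000 - 0x600000
                       = 0xffffffffff000000
    VLOAD(.xtime_lock) = 0xffffffffff600040 - 0xffffffffff000000
                       = 0x600040                      (where it is loaded)
    VVIRT_OFFSET       = 0xffffffffff600000 - 0xffffffff80600000
                       = 0x7f000000
    VVIRT(.xtime_lock) = 0xffffffffff600040 - 0x7f000000
                       = 0xffffffff80600040            (the xtime_lock symbol)

The same 64 bytes into the vsyscall page are thus reachable at the user-visible address, at the physical load address, and through the normal kernel mapping.
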
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 84cde796ecb1..ac61c186eb02 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -251,7 +251,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
251 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); 251 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
252} 252}
253 253
254__init void numa_add_cpu(int cpu) 254__cpuinit void numa_add_cpu(int cpu)
255{ 255{
256 /* BP is initialized elsewhere */ 256 /* BP is initialized elsewhere */
257 if (cpu) 257 if (cpu)