Diffstat (limited to 'arch/x86_64')
35 files changed, 1187 insertions, 260 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index db259757dc8a..d09437b5c48f 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig | |||
@@ -207,33 +207,6 @@ config SMP | |||
207 | 207 | ||
208 | If you don't know what to do here, say N. | 208 | If you don't know what to do here, say N. |
209 | 209 | ||
210 | config PREEMPT | ||
211 | bool "Preemptible Kernel" | ||
212 | ---help--- | ||
213 | This option reduces the latency of the kernel when reacting to | ||
214 | real-time or interactive events by allowing a low priority process to | ||
215 | be preempted even if it is in kernel mode executing a system call. | ||
216 | This allows applications to run more reliably even when the system is | ||
217 | under load. On the contrary, it may also break your drivers and add | ||
218 | priority inheritance problems to your system. Don't select it if | ||
219 | you rely on a stable system or have slightly obscure hardware. | ||
220 | It's also not very well tested on x86-64 currently. | ||
221 | You have been warned. | ||
222 | |||
223 | Say Y here if you are feeling brave and building a kernel for a | ||
224 | desktop, embedded or real-time system. Say N if you are unsure. | ||
225 | |||
226 | config PREEMPT_BKL | ||
227 | bool "Preempt The Big Kernel Lock" | ||
228 | depends on PREEMPT | ||
229 | default y | ||
230 | help | ||
231 | This option reduces the latency of the kernel by making the | ||
232 | big kernel lock preemptible. | ||
233 | |||
234 | Say Y here if you are building a kernel for a desktop system. | ||
235 | Say N if you are unsure. | ||
236 | |||
237 | config SCHED_SMT | 210 | config SCHED_SMT |
238 | bool "SMT (Hyperthreading) scheduler support" | 211 | bool "SMT (Hyperthreading) scheduler support" |
239 | depends on SMP | 212 | depends on SMP |
@@ -244,6 +217,8 @@ config SCHED_SMT | |||
244 | cost of slightly increased overhead in some places. If unsure say | 217 | cost of slightly increased overhead in some places. If unsure say |
245 | N here. | 218 | N here. |
246 | 219 | ||
220 | source "kernel/Kconfig.preempt" | ||
221 | |||
247 | config K8_NUMA | 222 | config K8_NUMA |
248 | bool "K8 NUMA support" | 223 | bool "K8 NUMA support" |
249 | select NUMA | 224 | select NUMA |
@@ -313,6 +288,15 @@ config NR_CPUS | |||
313 | This is purely to save memory - each supported CPU requires | 288 | This is purely to save memory - each supported CPU requires |
314 | memory in the static kernel configuration. | 289 | memory in the static kernel configuration. |
315 | 290 | ||
291 | config HOTPLUG_CPU | ||
292 | bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" | ||
293 | depends on SMP && HOTPLUG && EXPERIMENTAL | ||
294 | help | ||
295 | Say Y here to experiment with turning CPUs off and on. CPUs | ||
296 | can be controlled through /sys/devices/system/cpu/cpu#. | ||
297 | Say N if you want to disable CPU hotplug. | ||
298 | |||
299 | |||
316 | config HPET_TIMER | 300 | config HPET_TIMER |
317 | bool | 301 | bool |
318 | default y | 302 | default y |
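The sysfs path named in the HOTPLUG_CPU help text above is the whole user-visible interface: writing 0 or 1 to a CPU's "online" file takes that processor down or brings it back. As a minimal user-space sketch (assuming a hotpluggable cpu1, root privileges, and a kernel built with this option; the CPU number is illustrative):

    #include <stdio.h>

    /* Toggle a CPU through /sys/devices/system/cpu/cpu<N>/online,
     * the control file named in the HOTPLUG_CPU help text. */
    static int set_cpu_online(int cpu, int online)
    {
            char path[64];
            FILE *f;

            snprintf(path, sizeof(path),
                     "/sys/devices/system/cpu/cpu%d/online", cpu);
            f = fopen(path, "w");
            if (!f)
                    return -1;
            fprintf(f, "%d\n", online);
            return fclose(f);
    }

    int main(void)
    {
            if (set_cpu_online(1, 0))       /* take cpu1 offline */
                    perror("offline cpu1");
            if (set_cpu_online(1, 1))       /* and bring it back */
                    perror("online cpu1");
            return 0;
    }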
@@ -385,6 +369,34 @@ config X86_MCE_INTEL | |||
385 | Additional support for intel specific MCE features such as | 369 | Additional support for intel specific MCE features such as |
386 | the thermal monitor. | 370 | the thermal monitor. |
387 | 371 | ||
372 | config PHYSICAL_START | ||
373 | hex "Physical address where the kernel is loaded" if EMBEDDED | ||
374 | default "0x100000" | ||
375 | help | ||
376 | This gives the physical address where the kernel is loaded. | ||
377 | Primarily used in the case of kexec on panic where the | ||
378 | fail-safe kernel needs to run at a different address than | ||
379 | the panicked kernel. | ||
380 | |||
381 | Don't change this unless you know what you are doing. | ||
382 | |||
383 | config KEXEC | ||
384 | bool "kexec system call (EXPERIMENTAL)" | ||
385 | depends on EXPERIMENTAL | ||
386 | help | ||
387 | kexec is a system call that implements the ability to shut down your | ||
388 | current kernel, and to start another kernel. It is like a reboot | ||
389 | but it is independent of the system firmware. And like a reboot | ||
390 | you can start any kernel with it, not just Linux. | ||
391 | |||
392 | The name comes from the similarity to the exec system call. | ||
393 | |||
394 | It is an ongoing process to be certain the hardware in a machine | ||
395 | is properly shut down, so do not be surprised if this code does not | ||
396 | initially work for you. It may help to enable device hotplugging | ||
397 | support. As of this writing the exact hardware interface is | ||
398 | strongly in flux, so no good recommendation can be made. | ||
399 | |||
388 | config SECCOMP | 400 | config SECCOMP |
389 | bool "Enable seccomp to safely compute untrusted bytecode" | 401 | bool "Enable seccomp to safely compute untrusted bytecode" |
390 | depends on PROC_FS | 402 | depends on PROC_FS |
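For orientation (not part of the patch itself): user space reaches the new KEXEC code through the kexec_load(2) system call, and the kexec-on-panic case described under PHYSICAL_START amounts to building a second kernel with a different load address and staging it in reserved memory. A minimal sketch that probes for kexec support by asking the kernel to unload any staged image (nr_segments == 0 is a valid unload/no-op), assuming the libc headers expose SYS_kexec_load and the caller has CAP_SYS_BOOT:

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    int main(void)
    {
            /* entry = 0, no segments, default flags: unloads any staged
             * kexec image; fails with ENOSYS if KEXEC is not built in. */
            long ret = syscall(SYS_kexec_load, 0UL, 0UL, NULL, 0UL);

            if (ret == 0)
                    printf("kexec_load is available\n");
            else
                    perror("kexec_load");
            return 0;
    }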
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 6f90c246c418..8a73794f9b90 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile | |||
@@ -35,7 +35,7 @@ export IA32_CC IA32_LD IA32_AS IA32_OBJCOPY IA32_CPP | |||
35 | 35 | ||
36 | LDFLAGS := -m elf_x86_64 | 36 | LDFLAGS := -m elf_x86_64 |
37 | OBJCOPYFLAGS := -O binary -R .note -R .comment -S | 37 | OBJCOPYFLAGS := -O binary -R .note -R .comment -S |
38 | LDFLAGS_vmlinux := -e stext | 38 | LDFLAGS_vmlinux := |
39 | 39 | ||
40 | CHECKFLAGS += -D__x86_64__ -m64 | 40 | CHECKFLAGS += -D__x86_64__ -m64 |
41 | 41 | ||
diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S index 27264dbd575c..6f55565e4d42 100644 --- a/arch/x86_64/boot/compressed/head.S +++ b/arch/x86_64/boot/compressed/head.S | |||
@@ -2,8 +2,6 @@ | |||
2 | * linux/boot/head.S | 2 | * linux/boot/head.S |
3 | * | 3 | * |
4 | * Copyright (C) 1991, 1992, 1993 Linus Torvalds | 4 | * Copyright (C) 1991, 1992, 1993 Linus Torvalds |
5 | * | ||
6 | * $Id: head.S,v 1.3 2001/04/20 00:59:28 ak Exp $ | ||
7 | */ | 5 | */ |
8 | 6 | ||
9 | /* | 7 | /* |
@@ -21,13 +19,14 @@ | |||
21 | */ | 19 | */ |
22 | 20 | ||
23 | /* | 21 | /* |
24 | * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 | 22 | * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 |
25 | */ | 23 | */ |
26 | .code32 | 24 | .code32 |
27 | .text | 25 | .text |
28 | 26 | ||
29 | #include <linux/linkage.h> | 27 | #include <linux/linkage.h> |
30 | #include <asm/segment.h> | 28 | #include <asm/segment.h> |
29 | #include <asm/page.h> | ||
31 | 30 | ||
32 | .code32 | 31 | .code32 |
33 | .globl startup_32 | 32 | .globl startup_32 |
@@ -77,7 +76,7 @@ startup_32: | |||
77 | jnz 3f | 76 | jnz 3f |
78 | addl $8,%esp | 77 | addl $8,%esp |
79 | xorl %ebx,%ebx | 78 | xorl %ebx,%ebx |
80 | ljmp $(__KERNEL_CS), $0x100000 | 79 | ljmp $(__KERNEL_CS), $__PHYSICAL_START |
81 | 80 | ||
82 | /* | 81 | /* |
83 | * We come here, if we were loaded high. | 82 | * We come here, if we were loaded high. |
@@ -103,7 +102,7 @@ startup_32: | |||
103 | popl %ecx # lcount | 102 | popl %ecx # lcount |
104 | popl %edx # high_buffer_start | 103 | popl %edx # high_buffer_start |
105 | popl %eax # hcount | 104 | popl %eax # hcount |
106 | movl $0x100000,%edi | 105 | movl $__PHYSICAL_START,%edi |
107 | cli # make sure we don't get interrupted | 106 | cli # make sure we don't get interrupted |
108 | ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine | 107 | ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine |
109 | 108 | ||
@@ -128,7 +127,7 @@ move_routine_start: | |||
128 | movsl | 127 | movsl |
129 | movl %ebx,%esi # Restore setup pointer | 128 | movl %ebx,%esi # Restore setup pointer |
130 | xorl %ebx,%ebx | 129 | xorl %ebx,%ebx |
131 | ljmp $(__KERNEL_CS), $0x100000 | 130 | ljmp $(__KERNEL_CS), $__PHYSICAL_START |
132 | move_routine_end: | 131 | move_routine_end: |
133 | 132 | ||
134 | 133 | ||
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index c8b9216f9e63..b38d5b8b5fb8 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c | |||
@@ -11,6 +11,7 @@ | |||
11 | 11 | ||
12 | #include "miscsetup.h" | 12 | #include "miscsetup.h" |
13 | #include <asm/io.h> | 13 | #include <asm/io.h> |
14 | #include <asm/page.h> | ||
14 | 15 | ||
15 | /* | 16 | /* |
16 | * gzip declarations | 17 | * gzip declarations |
@@ -92,8 +93,11 @@ static unsigned long output_ptr = 0; | |||
92 | static void *malloc(int size); | 93 | static void *malloc(int size); |
93 | static void free(void *where); | 94 | static void free(void *where); |
94 | 95 | ||
96 | void* memset(void* s, int c, unsigned n); | ||
97 | void* memcpy(void* dest, const void* src, unsigned n); | ||
98 | |||
95 | static void putstr(const char *); | 99 | static void putstr(const char *); |
96 | 100 | ||
97 | extern int end; | 101 | extern int end; |
98 | static long free_mem_ptr = (long)&end; | 102 | static long free_mem_ptr = (long)&end; |
99 | static long free_mem_end_ptr; | 103 | static long free_mem_end_ptr; |
@@ -284,7 +288,7 @@ void setup_normal_output_buffer(void) | |||
284 | #else | 288 | #else |
285 | if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory"); | 289 | if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory"); |
286 | #endif | 290 | #endif |
287 | output_data = (char *)0x100000; /* Points to 1M */ | 291 | output_data = (char *)__PHYSICAL_START; /* Normally Points to 1M */ |
288 | free_mem_end_ptr = (long)real_mode; | 292 | free_mem_end_ptr = (long)real_mode; |
289 | } | 293 | } |
290 | 294 | ||
@@ -307,8 +311,8 @@ void setup_output_buffer_if_we_run_high(struct moveparams *mv) | |||
307 | low_buffer_size = low_buffer_end - LOW_BUFFER_START; | 311 | low_buffer_size = low_buffer_end - LOW_BUFFER_START; |
308 | high_loaded = 1; | 312 | high_loaded = 1; |
309 | free_mem_end_ptr = (long)high_buffer_start; | 313 | free_mem_end_ptr = (long)high_buffer_start; |
310 | if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) { | 314 | if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { |
311 | high_buffer_start = (uch *)(0x100000 + low_buffer_size); | 315 | high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size); |
312 | mv->hcount = 0; /* say: we need not to move high_buffer */ | 316 | mv->hcount = 0; /* say: we need not to move high_buffer */ |
313 | } | 317 | } |
314 | else mv->hcount = -1; | 318 | else mv->hcount = -1; |
diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh index f17b40dfc0f4..198af15a7758 100644 --- a/arch/x86_64/boot/install.sh +++ b/arch/x86_64/boot/install.sh | |||
@@ -1,6 +1,6 @@ | |||
1 | #!/bin/sh | 1 | #!/bin/sh |
2 | # | 2 | # |
3 | # arch/i386/boot/install.sh | 3 | # arch/x86_64/boot/install.sh |
4 | # | 4 | # |
5 | # This file is subject to the terms and conditions of the GNU General Public | 5 | # This file is subject to the terms and conditions of the GNU General Public |
6 | # License. See the file "COPYING" in the main directory of this archive | 6 | # License. See the file "COPYING" in the main directory of this archive |
diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index 75d4d2ad93b3..ff58b2832b75 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S | |||
@@ -33,7 +33,7 @@ | |||
33 | * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999. | 33 | * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999. |
34 | * <stiker@northlink.com> | 34 | * <stiker@northlink.com> |
35 | * | 35 | * |
36 | * Fix to work around buggy BIOSes which dont use carry bit correctly | 36 | * Fix to work around buggy BIOSes which don't use carry bit correctly |
37 | * and/or report extended memory in CX/DX for e801h memory size detection | 37 | * and/or report extended memory in CX/DX for e801h memory size detection |
38 | * call. As a result the kernel got wrong figures. The int15/e801h docs | 38 | * call. As a result the kernel got wrong figures. The int15/e801h docs |
39 | * from Ralf Brown interrupt list seem to indicate AX/BX should be used | 39 | * from Ralf Brown interrupt list seem to indicate AX/BX should be used |
@@ -383,7 +383,7 @@ sse_ok: | |||
383 | # a whole bunch of different types, and allows memory holes and | 383 | # a whole bunch of different types, and allows memory holes and |
384 | # everything. We scan through this memory map and build a list | 384 | # everything. We scan through this memory map and build a list |
385 | # of the first 32 memory areas, which we return at [E820MAP]. | 385 | # of the first 32 memory areas, which we return at [E820MAP]. |
386 | # This is documented at http://www.teleport.com/~acpi/acpihtml/topic245.htm | 386 | # This is documented at http://www.acpi.info/, in the ACPI 2.0 specification. |
387 | 387 | ||
388 | #define SMAP 0x534d4150 | 388 | #define SMAP 0x534d4150 |
389 | 389 | ||
@@ -436,7 +436,7 @@ bail820: | |||
436 | 436 | ||
437 | meme801: | 437 | meme801: |
438 | stc # fix to work around buggy | 438 | stc # fix to work around buggy |
439 | xorw %cx,%cx # BIOSes which dont clear/set | 439 | xorw %cx,%cx # BIOSes which don't clear/set |
440 | xorw %dx,%dx # carry on pass/error of | 440 | xorw %dx,%dx # carry on pass/error of |
441 | # e801h memory size call | 441 | # e801h memory size call |
442 | # or merely pass cx,dx through | 442 | # or merely pass cx,dx through |
@@ -733,7 +733,7 @@ flush_instr: | |||
733 | # | 733 | # |
734 | # but we haven't yet reloaded the CS register, so the default size | 734 | # but we haven't yet reloaded the CS register, so the default size |
735 | # of the target offset still is 16 bit. | 735 | # of the target offset still is 16 bit. |
736 | # However, using an operant prefix (0x66), the CPU will properly | 736 | # However, using an operand prefix (0x66), the CPU will properly |
737 | # take our 48 bit far pointer. (INTeL 80386 Programmer's Reference | 737 | # take our 48 bit far pointer. (INTeL 80386 Programmer's Reference |
738 | # Manual, Mixing 16-bit and 32-bit code, page 16-6) | 738 | # Manual, Mixing 16-bit and 32-bit code, page 16-6) |
739 | 739 | ||
diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c index c2fa66313170..18b5bac1c428 100644 --- a/arch/x86_64/boot/tools/build.c +++ b/arch/x86_64/boot/tools/build.c | |||
@@ -1,6 +1,4 @@ | |||
1 | /* | 1 | /* |
2 | * $Id: build.c,v 1.3 2001/06/26 15:14:50 pavel Exp $ | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | 2 | * Copyright (C) 1991, 1992 Linus Torvalds |
5 | * Copyright (C) 1997 Martin Mares | 3 | * Copyright (C) 1997 Martin Mares |
6 | */ | 4 | */ |
@@ -8,7 +6,8 @@ | |||
8 | /* | 6 | /* |
9 | * This file builds a disk-image from three different files: | 7 | * This file builds a disk-image from three different files: |
10 | * | 8 | * |
11 | * - bootsect: exactly 512 bytes of 8086 machine code, loads the rest | 9 | * - bootsect: compatibility MBR which prints an error message if |
10 | * someone tries to boot the kernel directly. | ||
12 | * - setup: 8086 machine code, sets up system parm | 11 | * - setup: 8086 machine code, sets up system parm |
13 | * - system: 80386 code for actual system | 12 | * - system: 80386 code for actual system |
14 | * | 13 | * |
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index f3ca0db85b5b..cc935427d532 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S | |||
@@ -589,7 +589,7 @@ ia32_sys_call_table: | |||
589 | .quad compat_sys_mq_timedreceive /* 280 */ | 589 | .quad compat_sys_mq_timedreceive /* 280 */ |
590 | .quad compat_sys_mq_notify | 590 | .quad compat_sys_mq_notify |
591 | .quad compat_sys_mq_getsetattr | 591 | .quad compat_sys_mq_getsetattr |
592 | .quad quiet_ni_syscall /* reserved for kexec */ | 592 | .quad compat_sys_kexec_load /* reserved for kexec */ |
593 | .quad compat_sys_waitid | 593 | .quad compat_sys_waitid |
594 | .quad quiet_ni_syscall /* sys_altroot */ | 594 | .quad quiet_ni_syscall /* sys_altroot */ |
595 | .quad sys_add_key | 595 | .quad sys_add_key |
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 5ca4a4598fda..48f9e2c19cd6 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile | |||
@@ -20,6 +20,7 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o | |||
20 | obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o | 20 | obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o |
21 | obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ | 21 | obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ |
22 | genapic.o genapic_cluster.o genapic_flat.o | 22 | genapic.o genapic_cluster.o genapic_flat.o |
23 | obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o | ||
23 | obj-$(CONFIG_PM) += suspend.o | 24 | obj-$(CONFIG_PM) += suspend.o |
24 | obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o | 25 | obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o |
25 | obj-$(CONFIG_CPU_FREQ) += cpufreq/ | 26 | obj-$(CONFIG_CPU_FREQ) += cpufreq/ |
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index a4c630034cd4..185faa911db5 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S | |||
@@ -67,7 +67,7 @@ wakeup_code: | |||
67 | shll $4, %eax | 67 | shll $4, %eax |
68 | addl $(gdta - wakeup_code), %eax | 68 | addl $(gdta - wakeup_code), %eax |
69 | movl %eax, gdt_48a +2 - wakeup_code | 69 | movl %eax, gdt_48a +2 - wakeup_code |
70 | lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is | 70 | lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is |
71 | # appropriate | 71 | # appropriate |
72 | 72 | ||
73 | movl $1, %eax # protected mode (PE) bit | 73 | movl $1, %eax # protected mode (PE) bit |
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index f8e6cc4fecd4..375d369570ca 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c | |||
@@ -133,7 +133,7 @@ void __init connect_bsp_APIC(void) | |||
133 | } | 133 | } |
134 | } | 134 | } |
135 | 135 | ||
136 | void disconnect_bsp_APIC(void) | 136 | void disconnect_bsp_APIC(int virt_wire_setup) |
137 | { | 137 | { |
138 | if (pic_mode) { | 138 | if (pic_mode) { |
139 | /* | 139 | /* |
@@ -146,6 +146,42 @@ void disconnect_bsp_APIC(void) | |||
146 | outb(0x70, 0x22); | 146 | outb(0x70, 0x22); |
147 | outb(0x00, 0x23); | 147 | outb(0x00, 0x23); |
148 | } | 148 | } |
149 | else { | ||
150 | /* Go back to Virtual Wire compatibility mode */ | ||
151 | unsigned long value; | ||
152 | |||
153 | /* For the spurious interrupt use vector F, and enable it */ | ||
154 | value = apic_read(APIC_SPIV); | ||
155 | value &= ~APIC_VECTOR_MASK; | ||
156 | value |= APIC_SPIV_APIC_ENABLED; | ||
157 | value |= 0xf; | ||
158 | apic_write_around(APIC_SPIV, value); | ||
159 | |||
160 | if (!virt_wire_setup) { | ||
161 | /* For LVT0 make it edge triggered, active high, external and enabled */ | ||
162 | value = apic_read(APIC_LVT0); | ||
163 | value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | | ||
164 | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | | ||
165 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); | ||
166 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; | ||
167 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); | ||
168 | apic_write_around(APIC_LVT0, value); | ||
169 | } | ||
170 | else { | ||
171 | /* Disable LVT0 */ | ||
172 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED); | ||
173 | } | ||
174 | |||
175 | /* For LVT1 make it edge triggered, active high, nmi and enabled */ | ||
176 | value = apic_read(APIC_LVT1); | ||
177 | value &= ~( | ||
178 | APIC_MODE_MASK | APIC_SEND_PENDING | | ||
179 | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | | ||
180 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); | ||
181 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; | ||
182 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); | ||
183 | apic_write_around(APIC_LVT1, value); | ||
184 | } | ||
149 | } | 185 | } |
150 | 186 | ||
151 | void disable_local_APIC(void) | 187 | void disable_local_APIC(void) |
@@ -285,7 +321,7 @@ void __init init_bsp_APIC(void) | |||
285 | apic_write_around(APIC_LVT1, value); | 321 | apic_write_around(APIC_LVT1, value); |
286 | } | 322 | } |
287 | 323 | ||
288 | void __init setup_local_APIC (void) | 324 | void __cpuinit setup_local_APIC (void) |
289 | { | 325 | { |
290 | unsigned int value, ver, maxlvt; | 326 | unsigned int value, ver, maxlvt; |
291 | 327 | ||
@@ -534,7 +570,7 @@ static struct sys_device device_lapic = { | |||
534 | .cls = &lapic_sysclass, | 570 | .cls = &lapic_sysclass, |
535 | }; | 571 | }; |
536 | 572 | ||
537 | static void __init apic_pm_activate(void) | 573 | static void __cpuinit apic_pm_activate(void) |
538 | { | 574 | { |
539 | apic_pm_state.active = 1; | 575 | apic_pm_state.active = 1; |
540 | } | 576 | } |
@@ -774,14 +810,14 @@ void __init setup_boot_APIC_clock (void) | |||
774 | local_irq_enable(); | 810 | local_irq_enable(); |
775 | } | 811 | } |
776 | 812 | ||
777 | void __init setup_secondary_APIC_clock(void) | 813 | void __cpuinit setup_secondary_APIC_clock(void) |
778 | { | 814 | { |
779 | local_irq_disable(); /* FIXME: Do we need this? --RR */ | 815 | local_irq_disable(); /* FIXME: Do we need this? --RR */ |
780 | setup_APIC_timer(calibration_result); | 816 | setup_APIC_timer(calibration_result); |
781 | local_irq_enable(); | 817 | local_irq_enable(); |
782 | } | 818 | } |
783 | 819 | ||
784 | void __init disable_APIC_timer(void) | 820 | void __cpuinit disable_APIC_timer(void) |
785 | { | 821 | { |
786 | if (using_apic_timer) { | 822 | if (using_apic_timer) { |
787 | unsigned long v; | 823 | unsigned long v; |
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c new file mode 100644 index 000000000000..d7fa4248501c --- /dev/null +++ b/arch/x86_64/kernel/crash.c | |||
@@ -0,0 +1,35 @@ | |||
1 | /* | ||
2 | * Architecture specific (x86_64) functions for kexec based crash dumps. | ||
3 | * | ||
4 | * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) | ||
5 | * | ||
6 | * Copyright (C) IBM Corporation, 2004. All rights reserved. | ||
7 | * | ||
8 | */ | ||
9 | |||
10 | #include <linux/init.h> | ||
11 | #include <linux/types.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/smp.h> | ||
14 | #include <linux/irq.h> | ||
15 | #include <linux/reboot.h> | ||
16 | #include <linux/kexec.h> | ||
17 | |||
18 | #include <asm/processor.h> | ||
19 | #include <asm/hardirq.h> | ||
20 | #include <asm/nmi.h> | ||
21 | #include <asm/hw_irq.h> | ||
22 | |||
23 | note_buf_t crash_notes[NR_CPUS]; | ||
24 | |||
25 | void machine_crash_shutdown(struct pt_regs *regs) | ||
26 | { | ||
27 | /* This function is only called after the system | ||
28 | * has panicked or is otherwise in a critical state. | ||
29 | * The minimum amount of code to allow a kexec'd kernel | ||
30 | * to run successfully needs to happen here. | ||
31 | * | ||
32 | * In practice this means shooting down the other cpus in | ||
33 | * an SMP system. | ||
34 | */ | ||
35 | } | ||
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 7c154dfff64a..6ded3a50dfe6 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/bootmem.h> | 16 | #include <linux/bootmem.h> |
17 | #include <linux/ioport.h> | 17 | #include <linux/ioport.h> |
18 | #include <linux/string.h> | 18 | #include <linux/string.h> |
19 | #include <linux/kexec.h> | ||
19 | #include <asm/page.h> | 20 | #include <asm/page.h> |
20 | #include <asm/e820.h> | 21 | #include <asm/e820.h> |
21 | #include <asm/proto.h> | 22 | #include <asm/proto.h> |
@@ -191,8 +192,6 @@ void __init e820_reserve_resources(void) | |||
191 | int i; | 192 | int i; |
192 | for (i = 0; i < e820.nr_map; i++) { | 193 | for (i = 0; i < e820.nr_map; i++) { |
193 | struct resource *res; | 194 | struct resource *res; |
194 | if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) | ||
195 | continue; | ||
196 | res = alloc_bootmem_low(sizeof(struct resource)); | 195 | res = alloc_bootmem_low(sizeof(struct resource)); |
197 | switch (e820.map[i].type) { | 196 | switch (e820.map[i].type) { |
198 | case E820_RAM: res->name = "System RAM"; break; | 197 | case E820_RAM: res->name = "System RAM"; break; |
@@ -212,6 +211,9 @@ void __init e820_reserve_resources(void) | |||
212 | */ | 211 | */ |
213 | request_resource(res, &code_resource); | 212 | request_resource(res, &code_resource); |
214 | request_resource(res, &data_resource); | 213 | request_resource(res, &data_resource); |
214 | #ifdef CONFIG_KEXEC | ||
215 | request_resource(res, &crashk_res); | ||
216 | #endif | ||
215 | } | 217 | } |
216 | } | 218 | } |
217 | } | 219 | } |
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index b4cbbad04226..282846965080 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c | |||
@@ -7,6 +7,8 @@ | |||
7 | * Hacked for x86-64 by James Cleverdon from i386 architecture code by | 7 | * Hacked for x86-64 by James Cleverdon from i386 architecture code by |
8 | * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and | 8 | * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and |
9 | * James Cleverdon. | 9 | * James Cleverdon. |
10 | * Ashok Raj <ashok.raj@intel.com> | ||
11 | * Removed IPI broadcast shortcut to support CPU hotplug | ||
10 | */ | 12 | */ |
11 | #include <linux/config.h> | 13 | #include <linux/config.h> |
12 | #include <linux/threads.h> | 14 | #include <linux/threads.h> |
@@ -18,6 +20,46 @@ | |||
18 | #include <asm/smp.h> | 20 | #include <asm/smp.h> |
19 | #include <asm/ipi.h> | 21 | #include <asm/ipi.h> |
20 | 22 | ||
23 | /* | ||
24 | * The following permits choosing the broadcast IPI shortcut vs. sending IPIs | ||
25 | * only to online CPUs via the send_IPI_mask variant. | ||
26 | * The mask version is my preferred option, since it eliminates a lot of | ||
27 | * other extra code that would need to be written to clean up interrupts | ||
28 | * sent to a CPU while it is offline. | ||
29 | * | ||
30 | * Sending a broadcast introduces lots of trouble in CPU hotplug situations. | ||
31 | * These IPIs are delivered to CPUs irrespective of their offline status | ||
32 | * and could pick up stale interrupt data when those CPUs come back online. | ||
33 | * | ||
34 | * Not using broadcast is a cleaner approach IMO, but Andi Kleen disagrees | ||
35 | * with the idea of not using broadcast IPIs anymore. Hence the runtime | ||
36 | * check is introduced, at his request, so we can choose an alternate mechanism. | ||
37 | * | ||
38 | * Initial wacky performance tests that collect cycle counts show | ||
39 | * no increase in using the mask vs. the broadcast version. In fact they | ||
40 | * seem identical in terms of cycle counts. | ||
41 | * | ||
42 | * If we need to use broadcast, we need to do the following: | ||
43 | * | ||
44 | * cli; | ||
45 | * hold call_lock; | ||
46 | * clear any pending IPI, just ack and clear all pending intr | ||
47 | * set cpu_online_map; | ||
48 | * release call_lock; | ||
49 | * sti; | ||
50 | * | ||
51 | * The complicated dummy irq processing shown above is not required if | ||
52 | * we didn't send IPIs to the wrong CPUs in the first place. | ||
53 | * | ||
54 | * - Ashok Raj <ashok.raj@intel.com> | ||
55 | */ | ||
56 | #ifdef CONFIG_HOTPLUG_CPU | ||
57 | #define DEFAULT_SEND_IPI (1) | ||
58 | #else | ||
59 | #define DEFAULT_SEND_IPI (0) | ||
60 | #endif | ||
61 | |||
62 | static int no_broadcast=DEFAULT_SEND_IPI; | ||
21 | 63 | ||
22 | static cpumask_t flat_target_cpus(void) | 64 | static cpumask_t flat_target_cpus(void) |
23 | { | 65 | { |
@@ -45,22 +87,6 @@ static void flat_init_apic_ldr(void) | |||
45 | apic_write_around(APIC_LDR, val); | 87 | apic_write_around(APIC_LDR, val); |
46 | } | 88 | } |
47 | 89 | ||
48 | static void flat_send_IPI_allbutself(int vector) | ||
49 | { | ||
50 | /* | ||
51 | * if there are no other CPUs in the system then | ||
52 | * we get an APIC send error if we try to broadcast. | ||
53 | * thus we have to avoid sending IPIs in this case. | ||
54 | */ | ||
55 | if (num_online_cpus() > 1) | ||
56 | __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); | ||
57 | } | ||
58 | |||
59 | static void flat_send_IPI_all(int vector) | ||
60 | { | ||
61 | __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); | ||
62 | } | ||
63 | |||
64 | static void flat_send_IPI_mask(cpumask_t cpumask, int vector) | 90 | static void flat_send_IPI_mask(cpumask_t cpumask, int vector) |
65 | { | 91 | { |
66 | unsigned long mask = cpus_addr(cpumask)[0]; | 92 | unsigned long mask = cpus_addr(cpumask)[0]; |
@@ -93,6 +119,39 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) | |||
93 | local_irq_restore(flags); | 119 | local_irq_restore(flags); |
94 | } | 120 | } |
95 | 121 | ||
122 | static inline void __local_flat_send_IPI_allbutself(int vector) | ||
123 | { | ||
124 | if (no_broadcast) { | ||
125 | cpumask_t mask = cpu_online_map; | ||
126 | int this_cpu = get_cpu(); | ||
127 | |||
128 | cpu_clear(this_cpu, mask); | ||
129 | flat_send_IPI_mask(mask, vector); | ||
130 | put_cpu(); | ||
131 | } | ||
132 | else | ||
133 | __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); | ||
134 | } | ||
135 | |||
136 | static inline void __local_flat_send_IPI_all(int vector) | ||
137 | { | ||
138 | if (no_broadcast) | ||
139 | flat_send_IPI_mask(cpu_online_map, vector); | ||
140 | else | ||
141 | __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); | ||
142 | } | ||
143 | |||
144 | static void flat_send_IPI_allbutself(int vector) | ||
145 | { | ||
146 | if (((num_online_cpus()) - 1) >= 1) | ||
147 | __local_flat_send_IPI_allbutself(vector); | ||
148 | } | ||
149 | |||
150 | static void flat_send_IPI_all(int vector) | ||
151 | { | ||
152 | __local_flat_send_IPI_all(vector); | ||
153 | } | ||
154 | |||
96 | static int flat_apic_id_registered(void) | 155 | static int flat_apic_id_registered(void) |
97 | { | 156 | { |
98 | return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); | 157 | return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); |
@@ -111,6 +170,16 @@ static unsigned int phys_pkg_id(int index_msb) | |||
111 | return ((ebx >> 24) & 0xFF) >> index_msb; | 170 | return ((ebx >> 24) & 0xFF) >> index_msb; |
112 | } | 171 | } |
113 | 172 | ||
173 | static __init int no_ipi_broadcast(char *str) | ||
174 | { | ||
175 | get_option(&str, &no_broadcast); | ||
176 | printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" : | ||
177 | "IPI Broadcast"); | ||
178 | return 1; | ||
179 | } | ||
180 | |||
181 | __setup("no_ipi_broadcast", no_ipi_broadcast); | ||
182 | |||
114 | struct genapic apic_flat = { | 183 | struct genapic apic_flat = { |
115 | .name = "flat", | 184 | .name = "flat", |
116 | .int_delivery_mode = dest_LowestPrio, | 185 | .int_delivery_mode = dest_LowestPrio, |
@@ -125,3 +194,12 @@ struct genapic apic_flat = { | |||
125 | .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, | 194 | .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, |
126 | .phys_pkg_id = phys_pkg_id, | 195 | .phys_pkg_id = phys_pkg_id, |
127 | }; | 196 | }; |
197 | |||
198 | static int __init print_ipi_mode(void) | ||
199 | { | ||
200 | printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" : | ||
201 | "Shortcut"); | ||
202 | return 0; | ||
203 | } | ||
204 | |||
205 | late_initcall(print_ipi_mode); | ||
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 9bd2e7a4b81e..8d765aa77a26 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S | |||
@@ -248,23 +248,23 @@ ENTRY(_stext) | |||
248 | */ | 248 | */ |
249 | .org 0x1000 | 249 | .org 0x1000 |
250 | ENTRY(init_level4_pgt) | 250 | ENTRY(init_level4_pgt) |
251 | .quad 0x0000000000102007 /* -> level3_ident_pgt */ | 251 | .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ |
252 | .fill 255,8,0 | 252 | .fill 255,8,0 |
253 | .quad 0x000000000010a007 | 253 | .quad 0x000000000000a007 + __PHYSICAL_START |
254 | .fill 254,8,0 | 254 | .fill 254,8,0 |
255 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ | 255 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ |
256 | .quad 0x0000000000103007 /* -> level3_kernel_pgt */ | 256 | .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ |
257 | 257 | ||
258 | .org 0x2000 | 258 | .org 0x2000 |
259 | ENTRY(level3_ident_pgt) | 259 | ENTRY(level3_ident_pgt) |
260 | .quad 0x0000000000104007 | 260 | .quad 0x0000000000004007 + __PHYSICAL_START |
261 | .fill 511,8,0 | 261 | .fill 511,8,0 |
262 | 262 | ||
263 | .org 0x3000 | 263 | .org 0x3000 |
264 | ENTRY(level3_kernel_pgt) | 264 | ENTRY(level3_kernel_pgt) |
265 | .fill 510,8,0 | 265 | .fill 510,8,0 |
266 | /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ | 266 | /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ |
267 | .quad 0x0000000000105007 /* -> level2_kernel_pgt */ | 267 | .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt */ |
268 | .fill 1,8,0 | 268 | .fill 1,8,0 |
269 | 269 | ||
270 | .org 0x4000 | 270 | .org 0x4000 |
@@ -337,17 +337,17 @@ ENTRY(empty_bad_pmd_table) | |||
337 | 337 | ||
338 | .org 0xa000 | 338 | .org 0xa000 |
339 | ENTRY(level3_physmem_pgt) | 339 | ENTRY(level3_physmem_pgt) |
340 | .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ | 340 | .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ |
341 | 341 | ||
342 | .org 0xb000 | 342 | .org 0xb000 |
343 | #ifdef CONFIG_ACPI_SLEEP | 343 | #ifdef CONFIG_ACPI_SLEEP |
344 | ENTRY(wakeup_level4_pgt) | 344 | ENTRY(wakeup_level4_pgt) |
345 | .quad 0x0000000000102007 /* -> level3_ident_pgt */ | 345 | .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ |
346 | .fill 255,8,0 | 346 | .fill 255,8,0 |
347 | .quad 0x000000000010a007 | 347 | .quad 0x000000000000a007 + __PHYSICAL_START |
348 | .fill 254,8,0 | 348 | .fill 254,8,0 |
349 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ | 349 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ |
350 | .quad 0x0000000000103007 /* -> level3_kernel_pgt */ | 350 | .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ |
351 | #endif | 351 | #endif |
352 | 352 | ||
353 | .data | 353 | .data |
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c index ba139cac57ce..d9b22b633e39 100644 --- a/arch/x86_64/kernel/i387.c +++ b/arch/x86_64/kernel/i387.c | |||
@@ -42,7 +42,7 @@ void mxcsr_feature_mask_init(void) | |||
42 | * Called at bootup to set up the initial FPU state that is later cloned | 42 | * Called at bootup to set up the initial FPU state that is later cloned |
43 | * into all processes. | 43 | * into all processes. |
44 | */ | 44 | */ |
45 | void __init fpu_init(void) | 45 | void __cpuinit fpu_init(void) |
46 | { | 46 | { |
47 | unsigned long oldcr0 = read_cr0(); | 47 | unsigned long oldcr0 = read_cr0(); |
48 | extern void __bad_fxsave_alignment(void); | 48 | extern void __bad_fxsave_alignment(void); |
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 19eafa0aa95c..a89169095129 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c | |||
@@ -414,10 +414,22 @@ static int i8259A_suspend(struct sys_device *dev, pm_message_t state) | |||
414 | return 0; | 414 | return 0; |
415 | } | 415 | } |
416 | 416 | ||
417 | static int i8259A_shutdown(struct sys_device *dev) | ||
418 | { | ||
419 | /* Put the i8259A into a quiescent state that | ||
420 | * the kernel initialization code can get it | ||
421 | * out of. | ||
422 | */ | ||
423 | outb(0xff, 0x21); /* mask all of 8259A-1 */ | ||
424 | outb(0xff, 0xA1); /* mask all of 8259A-2 */ | ||
425 | return 0; | ||
426 | } | ||
427 | |||
417 | static struct sysdev_class i8259_sysdev_class = { | 428 | static struct sysdev_class i8259_sysdev_class = { |
418 | set_kset_name("i8259"), | 429 | set_kset_name("i8259"), |
419 | .suspend = i8259A_suspend, | 430 | .suspend = i8259A_suspend, |
420 | .resume = i8259A_resume, | 431 | .resume = i8259A_resume, |
432 | .shutdown = i8259A_shutdown, | ||
421 | }; | 433 | }; |
422 | 434 | ||
423 | static struct sys_device device_i8259A = { | 435 | static struct sys_device device_i8259A = { |
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index afd87e64d0a8..157190d986bb 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c | |||
@@ -330,7 +330,7 @@ static int find_irq_entry(int apic, int pin, int type) | |||
330 | /* | 330 | /* |
331 | * Find the pin to which IRQ[irq] (ISA) is connected | 331 | * Find the pin to which IRQ[irq] (ISA) is connected |
332 | */ | 332 | */ |
333 | static int __init find_isa_irq_pin(int irq, int type) | 333 | static int find_isa_irq_pin(int irq, int type) |
334 | { | 334 | { |
335 | int i; | 335 | int i; |
336 | 336 | ||
@@ -1132,12 +1132,44 @@ static void __init enable_IO_APIC(void) | |||
1132 | */ | 1132 | */ |
1133 | void disable_IO_APIC(void) | 1133 | void disable_IO_APIC(void) |
1134 | { | 1134 | { |
1135 | int pin; | ||
1135 | /* | 1136 | /* |
1136 | * Clear the IO-APIC before rebooting: | 1137 | * Clear the IO-APIC before rebooting: |
1137 | */ | 1138 | */ |
1138 | clear_IO_APIC(); | 1139 | clear_IO_APIC(); |
1139 | 1140 | ||
1140 | disconnect_bsp_APIC(); | 1141 | /* |
1142 | * If the i8259 is routed through an IOAPIC, | ||
1143 | * put that IOAPIC in virtual wire mode | ||
1144 | * so legacy interrupts can be delivered. | ||
1145 | */ | ||
1146 | pin = find_isa_irq_pin(0, mp_ExtINT); | ||
1147 | if (pin != -1) { | ||
1148 | struct IO_APIC_route_entry entry; | ||
1149 | unsigned long flags; | ||
1150 | |||
1151 | memset(&entry, 0, sizeof(entry)); | ||
1152 | entry.mask = 0; /* Enabled */ | ||
1153 | entry.trigger = 0; /* Edge */ | ||
1154 | entry.irr = 0; | ||
1155 | entry.polarity = 0; /* High */ | ||
1156 | entry.delivery_status = 0; | ||
1157 | entry.dest_mode = 0; /* Physical */ | ||
1158 | entry.delivery_mode = 7; /* ExtInt */ | ||
1159 | entry.vector = 0; | ||
1160 | entry.dest.physical.physical_dest = 0; | ||
1161 | |||
1162 | |||
1163 | /* | ||
1164 | * Add it to the IO-APIC irq-routing table: | ||
1165 | */ | ||
1166 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1167 | io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); | ||
1168 | io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); | ||
1169 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1170 | } | ||
1171 | |||
1172 | disconnect_bsp_APIC(pin != -1); | ||
1141 | } | 1173 | } |
1142 | 1174 | ||
1143 | /* | 1175 | /* |
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 62b112e4deb4..cc3fb85f5145 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/interrupt.h> | 14 | #include <linux/interrupt.h> |
15 | #include <linux/seq_file.h> | 15 | #include <linux/seq_file.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/delay.h> | ||
17 | #include <asm/uaccess.h> | 18 | #include <asm/uaccess.h> |
18 | #include <asm/io_apic.h> | 19 | #include <asm/io_apic.h> |
19 | 20 | ||
@@ -106,3 +107,31 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) | |||
106 | return 1; | 107 | return 1; |
107 | } | 108 | } |
108 | 109 | ||
110 | #ifdef CONFIG_HOTPLUG_CPU | ||
111 | void fixup_irqs(cpumask_t map) | ||
112 | { | ||
113 | unsigned int irq; | ||
114 | static int warned; | ||
115 | |||
116 | for (irq = 0; irq < NR_IRQS; irq++) { | ||
117 | cpumask_t mask; | ||
118 | if (irq == 2) | ||
119 | continue; | ||
120 | |||
121 | cpus_and(mask, irq_affinity[irq], map); | ||
122 | if (any_online_cpu(mask) == NR_CPUS) { | ||
123 | printk("Breaking affinity for irq %i\n", irq); | ||
124 | mask = map; | ||
125 | } | ||
126 | if (irq_desc[irq].handler->set_affinity) | ||
127 | irq_desc[irq].handler->set_affinity(irq, mask); | ||
128 | else if (irq_desc[irq].action && !(warned++)) | ||
129 | printk("Cannot set affinity for irq %i\n", irq); | ||
130 | } | ||
131 | |||
132 | /* That doesn't seem sufficient. Give it 1ms. */ | ||
133 | local_irq_enable(); | ||
134 | mdelay(1); | ||
135 | local_irq_disable(); | ||
136 | } | ||
137 | #endif | ||
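The affinity breaking that fixup_irqs() does above has a close user-space analogue: per-IRQ affinity is exposed through /proc/irq/<n>/smp_affinity, and retargeting an interrupt is just rewriting that bitmask. A small sketch of the same idea (the IRQ number and mask are illustrative, and the file only exists for interrupts the platform allows to be steered):

    #include <stdio.h>

    /* Steer IRQ 19 to CPU 0 only: the same kind of retargeting that
     * fixup_irqs() performs in-kernel when a CPU is taken offline. */
    int main(void)
    {
            FILE *f = fopen("/proc/irq/19/smp_affinity", "w");

            if (!f) {
                    perror("open smp_affinity");
                    return 1;
            }
            fprintf(f, "%x\n", 0x1u);       /* bitmask: CPU 0 */
            return fclose(f);
    }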
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c new file mode 100644 index 000000000000..60d1eff41567 --- /dev/null +++ b/arch/x86_64/kernel/machine_kexec.c | |||
@@ -0,0 +1,250 @@ | |||
1 | /* | ||
2 | * machine_kexec.c - handle transition of Linux booting another kernel | ||
3 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> | ||
4 | * | ||
5 | * This source code is licensed under the GNU General Public License, | ||
6 | * Version 2. See the file COPYING for more details. | ||
7 | */ | ||
8 | |||
9 | #include <linux/mm.h> | ||
10 | #include <linux/kexec.h> | ||
11 | #include <linux/delay.h> | ||
12 | #include <linux/string.h> | ||
13 | #include <linux/reboot.h> | ||
14 | #include <asm/pda.h> | ||
15 | #include <asm/pgtable.h> | ||
16 | #include <asm/pgalloc.h> | ||
17 | #include <asm/tlbflush.h> | ||
18 | #include <asm/mmu_context.h> | ||
19 | #include <asm/io.h> | ||
20 | #include <asm/apic.h> | ||
21 | #include <asm/cpufeature.h> | ||
22 | #include <asm/hw_irq.h> | ||
23 | |||
24 | #define LEVEL0_SIZE (1UL << 12UL) | ||
25 | #define LEVEL1_SIZE (1UL << 21UL) | ||
26 | #define LEVEL2_SIZE (1UL << 30UL) | ||
27 | #define LEVEL3_SIZE (1UL << 39UL) | ||
28 | #define LEVEL4_SIZE (1UL << 48UL) | ||
29 | |||
30 | #define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) | ||
31 | #define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE) | ||
32 | #define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) | ||
33 | #define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) | ||
34 | |||
35 | static void init_level2_page(u64 *level2p, unsigned long addr) | ||
36 | { | ||
37 | unsigned long end_addr; | ||
38 | |||
39 | addr &= PAGE_MASK; | ||
40 | end_addr = addr + LEVEL2_SIZE; | ||
41 | while (addr < end_addr) { | ||
42 | *(level2p++) = addr | L1_ATTR; | ||
43 | addr += LEVEL1_SIZE; | ||
44 | } | ||
45 | } | ||
46 | |||
47 | static int init_level3_page(struct kimage *image, u64 *level3p, | ||
48 | unsigned long addr, unsigned long last_addr) | ||
49 | { | ||
50 | unsigned long end_addr; | ||
51 | int result; | ||
52 | |||
53 | result = 0; | ||
54 | addr &= PAGE_MASK; | ||
55 | end_addr = addr + LEVEL3_SIZE; | ||
56 | while ((addr < last_addr) && (addr < end_addr)) { | ||
57 | struct page *page; | ||
58 | u64 *level2p; | ||
59 | |||
60 | page = kimage_alloc_control_pages(image, 0); | ||
61 | if (!page) { | ||
62 | result = -ENOMEM; | ||
63 | goto out; | ||
64 | } | ||
65 | level2p = (u64 *)page_address(page); | ||
66 | init_level2_page(level2p, addr); | ||
67 | *(level3p++) = __pa(level2p) | L2_ATTR; | ||
68 | addr += LEVEL2_SIZE; | ||
69 | } | ||
70 | /* clear the unused entries */ | ||
71 | while (addr < end_addr) { | ||
72 | *(level3p++) = 0; | ||
73 | addr += LEVEL2_SIZE; | ||
74 | } | ||
75 | out: | ||
76 | return result; | ||
77 | } | ||
78 | |||
79 | |||
80 | static int init_level4_page(struct kimage *image, u64 *level4p, | ||
81 | unsigned long addr, unsigned long last_addr) | ||
82 | { | ||
83 | unsigned long end_addr; | ||
84 | int result; | ||
85 | |||
86 | result = 0; | ||
87 | addr &= PAGE_MASK; | ||
88 | end_addr = addr + LEVEL4_SIZE; | ||
89 | while ((addr < last_addr) && (addr < end_addr)) { | ||
90 | struct page *page; | ||
91 | u64 *level3p; | ||
92 | |||
93 | page = kimage_alloc_control_pages(image, 0); | ||
94 | if (!page) { | ||
95 | result = -ENOMEM; | ||
96 | goto out; | ||
97 | } | ||
98 | level3p = (u64 *)page_address(page); | ||
99 | result = init_level3_page(image, level3p, addr, last_addr); | ||
100 | if (result) { | ||
101 | goto out; | ||
102 | } | ||
103 | *(level4p++) = __pa(level3p) | L3_ATTR; | ||
104 | addr += LEVEL3_SIZE; | ||
105 | } | ||
106 | /* clear the unused entries */ | ||
107 | while (addr < end_addr) { | ||
108 | *(level4p++) = 0; | ||
109 | addr += LEVEL3_SIZE; | ||
110 | } | ||
111 | out: | ||
112 | return result; | ||
113 | } | ||
114 | |||
115 | |||
116 | static int init_pgtable(struct kimage *image, unsigned long start_pgtable) | ||
117 | { | ||
118 | u64 *level4p; | ||
119 | level4p = (u64 *)__va(start_pgtable); | ||
120 | return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); | ||
121 | } | ||
122 | |||
123 | static void set_idt(void *newidt, u16 limit) | ||
124 | { | ||
125 | unsigned char curidt[10]; | ||
126 | |||
127 | /* x86-64 supports unaligned loads & stores */ | ||
128 | (*(u16 *)(curidt)) = limit; | ||
129 | (*(u64 *)(curidt +2)) = (unsigned long)(newidt); | ||
130 | |||
131 | __asm__ __volatile__ ( | ||
132 | "lidt %0\n" | ||
133 | : "=m" (curidt) | ||
134 | ); | ||
135 | }; | ||
136 | |||
137 | |||
138 | static void set_gdt(void *newgdt, u16 limit) | ||
139 | { | ||
140 | unsigned char curgdt[10]; | ||
141 | |||
142 | /* x86-64 supports unaligned loads & stores */ | ||
143 | (*(u16 *)(curgdt)) = limit; | ||
144 | (*(u64 *)(curgdt +2)) = (unsigned long)(newgdt); | ||
145 | |||
146 | __asm__ __volatile__ ( | ||
147 | "lgdt %0\n" | ||
148 | : "=m" (curgdt) | ||
149 | ); | ||
150 | }; | ||
151 | |||
152 | static void load_segments(void) | ||
153 | { | ||
154 | __asm__ __volatile__ ( | ||
155 | "\tmovl $"STR(__KERNEL_DS)",%eax\n" | ||
156 | "\tmovl %eax,%ds\n" | ||
157 | "\tmovl %eax,%es\n" | ||
158 | "\tmovl %eax,%ss\n" | ||
159 | "\tmovl %eax,%fs\n" | ||
160 | "\tmovl %eax,%gs\n" | ||
161 | ); | ||
162 | #undef STR | ||
163 | #undef __STR | ||
164 | } | ||
165 | |||
166 | typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, | ||
167 | unsigned long control_code_buffer, | ||
168 | unsigned long start_address, | ||
169 | unsigned long pgtable) ATTRIB_NORET; | ||
170 | |||
171 | const extern unsigned char relocate_new_kernel[]; | ||
172 | const extern unsigned long relocate_new_kernel_size; | ||
173 | |||
174 | int machine_kexec_prepare(struct kimage *image) | ||
175 | { | ||
176 | unsigned long start_pgtable, control_code_buffer; | ||
177 | int result; | ||
178 | |||
179 | /* Calculate the offsets */ | ||
180 | start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; | ||
181 | control_code_buffer = start_pgtable + 4096UL; | ||
182 | |||
183 | /* Setup the identity mapped 64bit page table */ | ||
184 | result = init_pgtable(image, start_pgtable); | ||
185 | if (result) | ||
186 | return result; | ||
187 | |||
188 | /* Place the code in the reboot code buffer */ | ||
189 | memcpy(__va(control_code_buffer), relocate_new_kernel, | ||
190 | relocate_new_kernel_size); | ||
191 | |||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | void machine_kexec_cleanup(struct kimage *image) | ||
196 | { | ||
197 | return; | ||
198 | } | ||
199 | |||
200 | /* | ||
201 | * Do not allocate memory (or fail in any way) in machine_kexec(). | ||
202 | * We are past the point of no return, committed to rebooting now. | ||
203 | */ | ||
204 | NORET_TYPE void machine_kexec(struct kimage *image) | ||
205 | { | ||
206 | unsigned long page_list; | ||
207 | unsigned long control_code_buffer; | ||
208 | unsigned long start_pgtable; | ||
209 | relocate_new_kernel_t rnk; | ||
210 | |||
211 | /* Interrupts aren't acceptable while we reboot */ | ||
212 | local_irq_disable(); | ||
213 | |||
214 | /* Calculate the offsets */ | ||
215 | page_list = image->head; | ||
216 | start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; | ||
217 | control_code_buffer = start_pgtable + 4096UL; | ||
218 | |||
219 | /* Set the low half of the page table to my identity mapped | ||
220 | * page table for kexec. Leave the high half pointing at the | ||
221 | * kernel pages. Don't bother to flush the global pages | ||
222 | * as that will happen when I fully switch to my identity mapped | ||
223 | * page table anyway. | ||
224 | */ | ||
225 | memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2); | ||
226 | __flush_tlb(); | ||
227 | |||
228 | |||
229 | /* The segment registers are funny things: they are | ||
230 | * automatically loaded from a table in memory whenever you | ||
231 | * set them to a specific selector, but the table is never | ||
232 | * accessed again unless you set the segment to a different selector. | ||
233 | * | ||
234 | * The more common model is a cache where the behind-the-scenes | ||
235 | * work is done, but is also dropped at arbitrary | ||
236 | * times. | ||
237 | * | ||
238 | * I take advantage of this here by force loading the | ||
239 | * segments, before I zap the gdt with an invalid value. | ||
240 | */ | ||
241 | load_segments(); | ||
242 | /* The gdt & idt are now invalid. | ||
243 | * If you want to load them you must set up your own idt & gdt. | ||
244 | */ | ||
245 | set_gdt(phys_to_virt(0),0); | ||
246 | set_idt(phys_to_virt(0),0); | ||
247 | /* now call it */ | ||
248 | rnk = (relocate_new_kernel_t) control_code_buffer; | ||
249 | (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); | ||
250 | } | ||
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 3a89d735a4f6..21e70625a495 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c | |||
@@ -327,7 +327,7 @@ static void mce_init(void *dummy) | |||
327 | } | 327 | } |
328 | 328 | ||
329 | /* Add per CPU specific workarounds here */ | 329 | /* Add per CPU specific workarounds here */ |
330 | static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) | 330 | static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) |
331 | { | 331 | { |
332 | /* This should be disabled by the BIOS, but isn't always */ | 332 | /* This should be disabled by the BIOS, but isn't always */ |
333 | if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { | 333 | if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { |
@@ -337,7 +337,7 @@ static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) | |||
337 | } | 337 | } |
338 | } | 338 | } |
339 | 339 | ||
340 | static void __init mce_cpu_features(struct cpuinfo_x86 *c) | 340 | static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) |
341 | { | 341 | { |
342 | switch (c->x86_vendor) { | 342 | switch (c->x86_vendor) { |
343 | case X86_VENDOR_INTEL: | 343 | case X86_VENDOR_INTEL: |
@@ -352,7 +352,7 @@ static void __init mce_cpu_features(struct cpuinfo_x86 *c) | |||
352 | * Called for each booted CPU to set up machine checks. | 352 | * Called for each booted CPU to set up machine checks. |
353 | * Must be called with preempt off. | 353 | * Must be called with preempt off. |
354 | */ | 354 | */ |
355 | void __init mcheck_init(struct cpuinfo_x86 *c) | 355 | void __cpuinit mcheck_init(struct cpuinfo_x86 *c) |
356 | { | 356 | { |
357 | static cpumask_t mce_cpus __initdata = CPU_MASK_NONE; | 357 | static cpumask_t mce_cpus __initdata = CPU_MASK_NONE; |
358 | 358 | ||
@@ -411,7 +411,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff | |||
411 | memset(mcelog.entry, 0, next * sizeof(struct mce)); | 411 | memset(mcelog.entry, 0, next * sizeof(struct mce)); |
412 | mcelog.next = 0; | 412 | mcelog.next = 0; |
413 | 413 | ||
414 | synchronize_kernel(); | 414 | synchronize_sched(); |
415 | 415 | ||
416 | /* Collect entries that were still getting written before the synchronize. */ | 416 | /* Collect entries that were still getting written before the synchronize. */ |
417 | 417 | ||
@@ -542,7 +542,7 @@ ACCESSOR(bank4ctl,bank[4],mce_restart()) | |||
542 | ACCESSOR(tolerant,tolerant,) | 542 | ACCESSOR(tolerant,tolerant,) |
543 | ACCESSOR(check_interval,check_interval,mce_restart()) | 543 | ACCESSOR(check_interval,check_interval,mce_restart()) |
544 | 544 | ||
545 | static __init int mce_init_device(void) | 545 | static __cpuinit int mce_init_device(void) |
546 | { | 546 | { |
547 | int err; | 547 | int err; |
548 | if (!mce_available(&boot_cpu_data)) | 548 | if (!mce_available(&boot_cpu_data)) |
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c index 4db9a640069f..0be0a7959814 100644 --- a/arch/x86_64/kernel/mce_intel.c +++ b/arch/x86_64/kernel/mce_intel.c | |||
@@ -42,7 +42,7 @@ done: | |||
42 | irq_exit(); | 42 | irq_exit(); |
43 | } | 43 | } |
44 | 44 | ||
45 | static void __init intel_init_thermal(struct cpuinfo_x86 *c) | 45 | static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c) |
46 | { | 46 | { |
47 | u32 l, h; | 47 | u32 l, h; |
48 | int tm2 = 0; | 48 | int tm2 = 0; |
@@ -93,7 +93,7 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c) | |||
93 | return; | 93 | return; |
94 | } | 94 | } |
95 | 95 | ||
96 | void __init mce_intel_feature_init(struct cpuinfo_x86 *c) | 96 | void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c) |
97 | { | 97 | { |
98 | intel_init_thermal(c); | 98 | intel_init_thermal(c); |
99 | } | 99 | } |
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 31c0f2e6ac91..4e44d6e6b7e5 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c | |||
@@ -98,7 +98,7 @@ static unsigned int nmi_p4_cccr_val; | |||
98 | (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ | 98 | (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ |
99 | P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) | 99 | P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) |
100 | 100 | ||
101 | static __init inline int nmi_known_cpu(void) | 101 | static __cpuinit inline int nmi_known_cpu(void) |
102 | { | 102 | { |
103 | switch (boot_cpu_data.x86_vendor) { | 103 | switch (boot_cpu_data.x86_vendor) { |
104 | case X86_VENDOR_AMD: | 104 | case X86_VENDOR_AMD: |
@@ -110,7 +110,7 @@ static __init inline int nmi_known_cpu(void) | |||
110 | } | 110 | } |
111 | 111 | ||
112 | /* Run after command line and cpu_init init, but before all other checks */ | 112 | /* Run after command line and cpu_init init, but before all other checks */ |
113 | void __init nmi_watchdog_default(void) | 113 | void __cpuinit nmi_watchdog_default(void) |
114 | { | 114 | { |
115 | if (nmi_watchdog != NMI_DEFAULT) | 115 | if (nmi_watchdog != NMI_DEFAULT) |
116 | return; | 116 | return; |
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index e59d1f9d6163..1d91271796e5 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c | |||
@@ -8,7 +8,8 @@ | |||
8 | * | 8 | * |
9 | * X86-64 port | 9 | * X86-64 port |
10 | * Andi Kleen. | 10 | * Andi Kleen. |
11 | * | 11 | * |
12 | * CPU hotplug support - ashok.raj@intel.com | ||
12 | * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ | 13 | * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ |
13 | */ | 14 | */ |
14 | 15 | ||
@@ -18,6 +19,7 @@ | |||
18 | 19 | ||
19 | #include <stdarg.h> | 20 | #include <stdarg.h> |
20 | 21 | ||
22 | #include <linux/cpu.h> | ||
21 | #include <linux/errno.h> | 23 | #include <linux/errno.h> |
22 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
23 | #include <linux/kernel.h> | 25 | #include <linux/kernel.h> |
@@ -154,6 +156,29 @@ void cpu_idle_wait(void) | |||
154 | } | 156 | } |
155 | EXPORT_SYMBOL_GPL(cpu_idle_wait); | 157 | EXPORT_SYMBOL_GPL(cpu_idle_wait); |
156 | 158 | ||
159 | #ifdef CONFIG_HOTPLUG_CPU | ||
160 | DECLARE_PER_CPU(int, cpu_state); | ||
161 | |||
162 | #include <asm/nmi.h> | ||
163 | /* We don't actually take the CPU down, just spin without interrupts. */ | ||
164 | static inline void play_dead(void) | ||
165 | { | ||
166 | idle_task_exit(); | ||
167 | wbinvd(); | ||
168 | mb(); | ||
169 | /* Ack it */ | ||
170 | __get_cpu_var(cpu_state) = CPU_DEAD; | ||
171 | |||
172 | while (1) | ||
173 | safe_halt(); | ||
174 | } | ||
175 | #else | ||
176 | static inline void play_dead(void) | ||
177 | { | ||
178 | BUG(); | ||
179 | } | ||
180 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
181 | |||
157 | /* | 182 | /* |
158 | * The idle thread. There's no useful work to be | 183 | * The idle thread. There's no useful work to be |
159 | * done, so just try to conserve power and have a | 184 | * done, so just try to conserve power and have a |
@@ -174,6 +199,8 @@ void cpu_idle (void) | |||
174 | idle = pm_idle; | 199 | idle = pm_idle; |
175 | if (!idle) | 200 | if (!idle) |
176 | idle = default_idle; | 201 | idle = default_idle; |
202 | if (cpu_is_offline(smp_processor_id())) | ||
203 | play_dead(); | ||
177 | idle(); | 204 | idle(); |
178 | } | 205 | } |
179 | 206 | ||
@@ -204,7 +231,7 @@ static void mwait_idle(void) | |||
204 | } | 231 | } |
205 | } | 232 | } |
206 | 233 | ||
207 | void __init select_idle_routine(const struct cpuinfo_x86 *c) | 234 | void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) |
208 | { | 235 | { |
209 | static int printed; | 236 | static int printed; |
210 | if (cpu_has(c, X86_FEATURE_MWAIT)) { | 237 | if (cpu_has(c, X86_FEATURE_MWAIT)) { |
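The play_dead() added above and the __cpu_die() poll loop later in smpboot.c form a two-sided handshake: the dying CPU flushes its caches, publishes CPU_DEAD through the per-cpu cpu_state variable, and halts forever, while the CPU that initiated the offline polls for that acknowledgment. A minimal user-space model of the handshake, with a pthread standing in for the dying CPU (cpu_state, CPU_DEAD and the ten-poll bound come from the patch; everything else is invented for illustration):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    enum { UP, OFFLINE_REQUESTED, DEAD };       /* stands in for cpu_state */
    static atomic_int cpu_state = UP;

    static void *fake_cpu(void *arg)            /* plays the cpu_idle role */
    {
        (void)arg;
        for (;;) {
            if (atomic_load(&cpu_state) == OFFLINE_REQUESTED) {
                atomic_store(&cpu_state, DEAD); /* "Ack it", as in play_dead() */
                return NULL;                    /* a real CPU would halt here */
            }
            usleep(1000);                       /* idle loop */
        }
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, fake_cpu, NULL);
        atomic_store(&cpu_state, OFFLINE_REQUESTED);

        /* __cpu_die(): poll up to 10 times for the CPU_DEAD acknowledgment */
        for (int i = 0; i < 10; i++) {
            if (atomic_load(&cpu_state) == DEAD) {
                printf("CPU is now offline\n");
                pthread_join(t, NULL);
                return 0;
            }
            usleep(100 * 1000);                 /* schedule_timeout(HZ/10) */
        }
        printf("CPU didn't die...\n");
        return 1;
    }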
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index be4b36f762cf..57e71dbdfd69 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c | |||
@@ -66,41 +66,47 @@ static int __init reboot_setup(char *str) | |||
66 | 66 | ||
67 | __setup("reboot=", reboot_setup); | 67 | __setup("reboot=", reboot_setup); |
68 | 68 | ||
69 | #ifdef CONFIG_SMP | 69 | static inline void kb_wait(void) |
70 | static void smp_halt(void) | ||
71 | { | 70 | { |
72 | int cpuid = safe_smp_processor_id(); | 71 | int i; |
73 | static int first_entry = 1; | ||
74 | 72 | ||
75 | if (reboot_force) | 73 | for (i=0; i<0x10000; i++) |
76 | return; | 74 | if ((inb_p(0x64) & 0x02) == 0) |
75 | break; | ||
76 | } | ||
77 | 77 | ||
78 | if (first_entry) { | 78 | void machine_shutdown(void) |
79 | first_entry = 0; | 79 | { |
80 | smp_call_function((void *)machine_restart, NULL, 1, 0); | 80 | /* Stop the cpus and apics */ |
81 | } | 81 | #ifdef CONFIG_SMP |
82 | 82 | int reboot_cpu_id; | |
83 | smp_stop_cpu(); | ||
84 | 83 | ||
85 | /* AP calling this. Just halt */ | 84 | /* The boot cpu is always logical cpu 0 */ |
86 | if (cpuid != boot_cpu_id) { | 85 | reboot_cpu_id = 0; |
87 | for (;;) | 86 | |
88 | asm("hlt"); | 87 | /* Make certain the cpu I'm about to reboot on is online */ |
88 | if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { | ||
89 | reboot_cpu_id = smp_processor_id(); | ||
89 | } | 90 | } |
90 | 91 | ||
91 | /* Wait for all other CPUs to have run smp_stop_cpu */ | 92 | /* Make certain I only run on the appropriate processor */ |
92 | while (!cpus_empty(cpu_online_map)) | 93 | set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); |
93 | rep_nop(); | 94 | |
94 | } | 95 | /* OK. Now that I'm on the appropriate processor, |
96 | * stop all of the others. | ||
97 | */ | ||
98 | smp_send_stop(); | ||
95 | #endif | 99 | #endif |
96 | 100 | ||
97 | static inline void kb_wait(void) | 101 | local_irq_disable(); |
98 | { | ||
99 | int i; | ||
100 | 102 | ||
101 | for (i=0; i<0x10000; i++) | 103 | #ifndef CONFIG_SMP |
102 | if ((inb_p(0x64) & 0x02) == 0) | 104 | disable_local_APIC(); |
103 | break; | 105 | #endif |
106 | |||
107 | disable_IO_APIC(); | ||
108 | |||
109 | local_irq_enable(); | ||
104 | } | 110 | } |
105 | 111 | ||
106 | void machine_restart(char * __unused) | 112 | void machine_restart(char * __unused) |
@@ -109,9 +115,7 @@ void machine_restart(char * __unused) | |||
109 | 115 | ||
110 | printk("machine restart\n"); | 116 | printk("machine restart\n"); |
111 | 117 | ||
112 | #ifdef CONFIG_SMP | 118 | machine_shutdown(); |
113 | smp_halt(); | ||
114 | #endif | ||
115 | 119 | ||
116 | if (!reboot_force) { | 120 | if (!reboot_force) { |
117 | local_irq_disable(); | 121 | local_irq_disable(); |
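machine_shutdown() above replaces the old smp_halt() path: it first migrates itself onto the boot CPU (logical CPU 0 when that is online) with set_cpus_allowed(), then stops the other CPUs and tears down the APICs in a defined order, which kexec relies on. The pin-yourself-first idiom has a direct user-space analogue; a small sketch using sched_setaffinity (an illustration of the idea only, not the kernel path):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
        int reboot_cpu_id = 0;   /* "The boot cpu is always logical cpu 0" */
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(reboot_cpu_id, &set);

        /* Make certain we only run on the appropriate processor */
        if (sched_setaffinity(0, sizeof(set), &set) != 0) {
            perror("sched_setaffinity");
            return 1;
        }
        printf("pinned to CPU %d (now running on %d)\n",
               reboot_cpu_id, sched_getcpu());
        return 0;
    }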
diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S new file mode 100644 index 000000000000..d24fa9b72a2b --- /dev/null +++ b/arch/x86_64/kernel/relocate_kernel.S | |||
@@ -0,0 +1,143 @@ | |||
1 | /* | ||
2 | * relocate_kernel.S - put the kernel image in place to boot | ||
3 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> | ||
4 | * | ||
5 | * This source code is licensed under the GNU General Public License, | ||
6 | * Version 2. See the file COPYING for more details. | ||
7 | */ | ||
8 | |||
9 | #include <linux/linkage.h> | ||
10 | |||
11 | /* | ||
12 | * Must be relocatable PIC code callable as a C function, that once | ||
13 | * it starts cannot use the previous process's stack. | ||
14 | */ | ||
15 | .globl relocate_new_kernel | ||
16 | .code64 | ||
17 | relocate_new_kernel: | ||
18 | /* %rdi page_list | ||
19 | * %rsi reboot_code_buffer | ||
20 | * %rdx start address | ||
21 | * %rcx page_table | ||
22 | * %r8 arg5 | ||
23 | * %r9 arg6 | ||
24 | */ | ||
25 | |||
26 | /* zero out flags, and disable interrupts */ | ||
27 | pushq $0 | ||
28 | popfq | ||
29 | |||
30 | /* set a new stack at the bottom of our page... */ | ||
31 | lea 4096(%rsi), %rsp | ||
32 | |||
33 | /* store the parameters back on the stack */ | ||
34 | pushq %rdx /* store the start address */ | ||
35 | |||
36 | /* Set cr0 to a known state: | ||
37 | * 31 1 == Paging enabled | ||
38 | * 18 0 == Alignment check disabled | ||
39 | * 16 0 == Write protect disabled | ||
40 | * 3 0 == No task switch | ||
41 | * 2 0 == Don't do FP software emulation. | ||
42 | * 0 1 == Protected mode enabled | ||
43 | */ | ||
44 | movq %cr0, %rax | ||
45 | andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax | ||
46 | orl $((1<<31)|(1<<0)), %eax | ||
47 | movq %rax, %cr0 | ||
48 | |||
49 | /* Set cr4 to a known state: | ||
50 | * 10 0 == xmm exceptions disabled | ||
51 | * 9 0 == xmm registers instructions disabled | ||
52 | * 8 0 == performance monitoring counter disabled | ||
53 | * 7 0 == page global disabled | ||
54 | * 6 0 == machine check exceptions disabled | ||
55 | * 5 1 == physical address extension enabled | ||
56 | * 4 0 == page size extensions disabled | ||
57 | * 3 0 == Debug extensions disabled | ||
58 | * 2 0 == Time stamp disable (disabled) | ||
59 | * 1 0 == Protected mode virtual interrupts disabled | ||
60 | * 0 0 == VME disabled | ||
61 | */ | ||
62 | |||
63 | movq $((1<<5)), %rax | ||
64 | movq %rax, %cr4 | ||
65 | |||
66 | jmp 1f | ||
67 | 1: | ||
68 | |||
69 | /* Switch to the identity mapped page tables, | ||
70 | * and flush the TLB. | ||
71 | */ | ||
72 | movq %rcx, %cr3 | ||
73 | |||
74 | /* Do the copies */ | ||
75 | movq %rdi, %rcx /* Put the page_list in %rcx */ | ||
76 | xorq %rdi, %rdi | ||
77 | xorq %rsi, %rsi | ||
78 | jmp 1f | ||
79 | |||
80 | 0: /* top, read another word for the indirection page */ | ||
81 | |||
82 | movq (%rbx), %rcx | ||
83 | addq $8, %rbx | ||
84 | 1: | ||
85 | testq $0x1, %rcx /* is it a destination page? */ | ||
86 | jz 2f | ||
87 | movq %rcx, %rdi | ||
88 | andq $0xfffffffffffff000, %rdi | ||
89 | jmp 0b | ||
90 | 2: | ||
91 | testq $0x2, %rcx /* is it an indirection page? */ | ||
92 | jz 2f | ||
93 | movq %rcx, %rbx | ||
94 | andq $0xfffffffffffff000, %rbx | ||
95 | jmp 0b | ||
96 | 2: | ||
97 | testq $0x4, %rcx /* is it the done indicator? */ | ||
98 | jz 2f | ||
99 | jmp 3f | ||
100 | 2: | ||
101 | testq $0x8, %rcx /* is it the source indicator? */ | ||
102 | jz 0b /* Ignore it otherwise */ | ||
103 | movq %rcx, %rsi /* For every source page do a copy */ | ||
104 | andq $0xfffffffffffff000, %rsi | ||
105 | |||
106 | movq $512, %rcx | ||
107 | rep ; movsq | ||
108 | jmp 0b | ||
109 | 3: | ||
110 | |||
111 | /* To be certain of avoiding problems with self-modifying code | ||
112 | * I need to execute a serializing instruction here. | ||
113 | * So I flush the TLB by reloading %cr3 here, it's handy, | ||
114 | * and not processor dependent. | ||
115 | */ | ||
116 | movq %cr3, %rax | ||
117 | movq %rax, %cr3 | ||
118 | |||
119 | /* set all of the registers to known values */ | ||
120 | /* leave %rsp alone */ | ||
121 | |||
122 | xorq %rax, %rax | ||
123 | xorq %rbx, %rbx | ||
124 | xorq %rcx, %rcx | ||
125 | xorq %rdx, %rdx | ||
126 | xorq %rsi, %rsi | ||
127 | xorq %rdi, %rdi | ||
128 | xorq %rbp, %rbp | ||
129 | xorq %r8, %r8 | ||
130 | xorq %r9, %r9 | ||
131 | xorq %r10, %r10 | ||
132 | xorq %r11, %r11 | ||
133 | xorq %r12, %r12 | ||
134 | xorq %r13, %r13 | ||
135 | xorq %r14, %r14 | ||
136 | xorq %r15, %r15 | ||
137 | |||
138 | ret | ||
139 | relocate_new_kernel_end: | ||
140 | |||
141 | .globl relocate_new_kernel_size | ||
142 | relocate_new_kernel_size: | ||
143 | .quad relocate_new_kernel_end - relocate_new_kernel | ||
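relocate_new_kernel above is an interpreter for kexec's indirection list: each 8-byte entry is a page-aligned address whose low bits say what it is (0x1 = set the destination, 0x2 = switch to another indirection page, 0x4 = done, 0x8 = copy this source page), and every source page is copied as 512 quadwords, i.e. one 4 KB page. A C rendering of the same loop, handy for sanity-checking the flag handling (slightly simplified: the assembly receives the first entry by value in %rdi; all names here are hypothetical):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define IND_DESTINATION 0x1UL   /* low-bit encodings tested by the asm */
    #define IND_INDIRECTION 0x2UL
    #define IND_DONE        0x4UL
    #define IND_SOURCE      0x8UL
    #define PAGE_ADDR(e)    ((e) & 0xfffffffffffff000UL)

    static void process_page_list(uint64_t e)   /* first entry, as in %rdi */
    {
        uint64_t *cursor = 0;   /* %rbx: position within the indirection page */
        char *dest = 0;         /* masked %rdi: current copy destination */

        for (;;) {
            if (e & IND_DESTINATION) {
                dest = (char *)PAGE_ADDR(e);
            } else if (e & IND_INDIRECTION) {
                cursor = (uint64_t *)PAGE_ADDR(e);
            } else if (e & IND_DONE) {
                return;
            } else if (e & IND_SOURCE) {
                /* "movq $512, %rcx; rep movsq": one 4 KB page; rep movsq
                 * leaves the destination advanced, hence dest += 4096 */
                memcpy(dest, (void *)PAGE_ADDR(e), 4096);
                dest += 4096;
            }                   /* other bits are ignored, as in the asm */
            e = *cursor++;      /* "read another word for the indirection page" */
        }
    }

    static uint64_t page[512] __attribute__((aligned(4096)));
    static char src[4096] __attribute__((aligned(4096))) = "old kernel image";
    static char dst[4096] __attribute__((aligned(4096)));

    int main(void)
    {
        page[0] = (uint64_t)dst | IND_DESTINATION;
        page[1] = (uint64_t)src | IND_SOURCE;
        page[2] = IND_DONE;
        process_page_list((uint64_t)page | IND_INDIRECTION);
        puts(dst);              /* prints the copied page contents */
        return 0;
    }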
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 000015dd5a8b..b02d921da4f7 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/kallsyms.h> | 41 | #include <linux/kallsyms.h> |
42 | #include <linux/edd.h> | 42 | #include <linux/edd.h> |
43 | #include <linux/mmzone.h> | 43 | #include <linux/mmzone.h> |
44 | #include <linux/kexec.h> | ||
44 | 45 | ||
45 | #include <asm/mtrr.h> | 46 | #include <asm/mtrr.h> |
46 | #include <asm/uaccess.h> | 47 | #include <asm/uaccess.h> |
@@ -367,6 +368,27 @@ static __init void parse_cmdline_early (char ** cmdline_p) | |||
367 | if (!memcmp(from, "noexec=", 7)) | 368 | if (!memcmp(from, "noexec=", 7)) |
368 | nonx_setup(from + 7); | 369 | nonx_setup(from + 7); |
369 | 370 | ||
371 | #ifdef CONFIG_KEXEC | ||
372 | /* crashkernel=size@addr specifies the location to reserve for | ||
373 | * a crash kernel. By reserving this memory we guarantee | ||
374 | * that linux never sets it up as a DMA target. | ||
375 | * Useful for holding code to do something appropriate | ||
376 | * after a kernel panic. | ||
377 | */ | ||
378 | else if (!memcmp(from, "crashkernel=", 12)) { | ||
379 | unsigned long size, base; | ||
380 | size = memparse(from+12, &from); | ||
381 | if (*from == '@') { | ||
382 | base = memparse(from+1, &from); | ||
383 | /* FIXME: Do I want a sanity check | ||
384 | * to validate the memory range? | ||
385 | */ | ||
386 | crashk_res.start = base; | ||
387 | crashk_res.end = base + size - 1; | ||
388 | } | ||
389 | } | ||
390 | #endif | ||
391 | |||
370 | next_char: | 392 | next_char: |
371 | c = *(from++); | 393 | c = *(from++); |
372 | if (!c) | 394 | if (!c) |
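The crashkernel=size@addr option parsed above takes human-readable sizes: booting with crashkernel=64M@16M asks for 64 MB starting at physical 16 MB, which setup_arch() later pins down with reserve_bootmem(). memparse() consumes an optional K/M/G suffix and advances the cursor, which is why the code can then test for '@'. A stand-alone sketch of the same parsing (memparse_sketch is a stand-in written for this note, not the kernel helper):

    #include <stdio.h>
    #include <stdlib.h>

    /* Parse "<number>[KMG]" and advance *cursor past it, like memparse(). */
    static unsigned long memparse_sketch(const char *s, char **cursor)
    {
        unsigned long v = strtoul(s, cursor, 0);

        switch (**cursor) {
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10;
            (*cursor)++;
            break;
        }
        return v;
    }

    int main(void)
    {
        char buf[] = "64M@16M";         /* as passed via crashkernel= */
        char *from = buf;
        unsigned long size = memparse_sketch(from, &from);

        if (*from == '@') {
            unsigned long base = memparse_sketch(from + 1, &from);
            printf("reserve %lu bytes at 0x%lx..0x%lx\n",
                   size, base, base + size - 1);
        }
        return 0;
    }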
@@ -625,6 +647,13 @@ void __init setup_arch(char **cmdline_p) | |||
625 | #endif | 647 | #endif |
626 | 648 | ||
627 | sparse_init(); | 649 | sparse_init(); |
650 | |||
651 | #ifdef CONFIG_KEXEC | ||
652 | if (crashk_res.start != crashk_res.end) { | ||
653 | reserve_bootmem(crashk_res.start, | ||
654 | crashk_res.end - crashk_res.start + 1); | ||
655 | } | ||
656 | #endif | ||
628 | paging_init(); | 657 | paging_init(); |
629 | 658 | ||
630 | check_ioapic(); | 659 | check_ioapic(); |
@@ -676,7 +705,7 @@ void __init setup_arch(char **cmdline_p) | |||
676 | #endif | 705 | #endif |
677 | } | 706 | } |
678 | 707 | ||
679 | static int __init get_model_name(struct cpuinfo_x86 *c) | 708 | static int __cpuinit get_model_name(struct cpuinfo_x86 *c) |
680 | { | 709 | { |
681 | unsigned int *v; | 710 | unsigned int *v; |
682 | 711 | ||
@@ -692,7 +721,7 @@ static int __init get_model_name(struct cpuinfo_x86 *c) | |||
692 | } | 721 | } |
693 | 722 | ||
694 | 723 | ||
695 | static void __init display_cacheinfo(struct cpuinfo_x86 *c) | 724 | static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) |
696 | { | 725 | { |
697 | unsigned int n, dummy, eax, ebx, ecx, edx; | 726 | unsigned int n, dummy, eax, ebx, ecx, edx; |
698 | 727 | ||
@@ -803,7 +832,7 @@ static int __init init_amd(struct cpuinfo_x86 *c) | |||
803 | return r; | 832 | return r; |
804 | } | 833 | } |
805 | 834 | ||
806 | static void __init detect_ht(struct cpuinfo_x86 *c) | 835 | static void __cpuinit detect_ht(struct cpuinfo_x86 *c) |
807 | { | 836 | { |
808 | #ifdef CONFIG_SMP | 837 | #ifdef CONFIG_SMP |
809 | u32 eax, ebx, ecx, edx; | 838 | u32 eax, ebx, ecx, edx; |
@@ -864,7 +893,7 @@ static void __init detect_ht(struct cpuinfo_x86 *c) | |||
864 | /* | 893 | /* |
865 | * find out the number of processor cores on the die | 894 | * find out the number of processor cores on the die |
866 | */ | 895 | */ |
867 | static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c) | 896 | static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) |
868 | { | 897 | { |
869 | unsigned int eax; | 898 | unsigned int eax; |
870 | 899 | ||
@@ -882,7 +911,7 @@ static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c) | |||
882 | return 1; | 911 | return 1; |
883 | } | 912 | } |
884 | 913 | ||
885 | static void __init init_intel(struct cpuinfo_x86 *c) | 914 | static void __cpuinit init_intel(struct cpuinfo_x86 *c) |
886 | { | 915 | { |
887 | /* Cache sizes */ | 916 | /* Cache sizes */ |
888 | unsigned n; | 917 | unsigned n; |
@@ -902,7 +931,7 @@ static void __init init_intel(struct cpuinfo_x86 *c) | |||
902 | c->x86_num_cores = intel_num_cpu_cores(c); | 931 | c->x86_num_cores = intel_num_cpu_cores(c); |
903 | } | 932 | } |
904 | 933 | ||
905 | void __init get_cpu_vendor(struct cpuinfo_x86 *c) | 934 | void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) |
906 | { | 935 | { |
907 | char *v = c->x86_vendor_id; | 936 | char *v = c->x86_vendor_id; |
908 | 937 | ||
@@ -923,7 +952,7 @@ struct cpu_model_info { | |||
923 | /* Do some early cpuid on the boot CPU to get some parameters that are | 952 | /* Do some early cpuid on the boot CPU to get some parameters that are |
924 | needed before check_bugs. Everything advanced is in identify_cpu | 953 | needed before check_bugs. Everything advanced is in identify_cpu |
925 | below. */ | 954 | below. */ |
926 | void __init early_identify_cpu(struct cpuinfo_x86 *c) | 955 | void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) |
927 | { | 956 | { |
928 | u32 tfms; | 957 | u32 tfms; |
929 | 958 | ||
@@ -977,7 +1006,7 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c) | |||
977 | /* | 1006 | /* |
978 | * This does the hard work of actually picking apart the CPU stuff... | 1007 | * This does the hard work of actually picking apart the CPU stuff... |
979 | */ | 1008 | */ |
980 | void __init identify_cpu(struct cpuinfo_x86 *c) | 1009 | void __cpuinit identify_cpu(struct cpuinfo_x86 *c) |
981 | { | 1010 | { |
982 | int i; | 1011 | int i; |
983 | u32 xlvl; | 1012 | u32 xlvl; |
@@ -1054,7 +1083,7 @@ void __init identify_cpu(struct cpuinfo_x86 *c) | |||
1054 | } | 1083 | } |
1055 | 1084 | ||
1056 | 1085 | ||
1057 | void __init print_cpu_info(struct cpuinfo_x86 *c) | 1086 | void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) |
1058 | { | 1087 | { |
1059 | if (c->x86_model_id[0]) | 1088 | if (c->x86_model_id[0]) |
1060 | printk("%s", c->x86_model_id); | 1089 | printk("%s", c->x86_model_id); |
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 678b7ac33b8b..f2b9c6bc999a 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c | |||
@@ -29,7 +29,7 @@ | |||
29 | 29 | ||
30 | char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; | 30 | char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; |
31 | 31 | ||
32 | cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; | 32 | cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; |
33 | 33 | ||
34 | struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; | 34 | struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; |
35 | 35 | ||
@@ -171,7 +171,7 @@ void syscall_init(void) | |||
171 | wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); | 171 | wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); |
172 | } | 172 | } |
173 | 173 | ||
174 | void __init check_efer(void) | 174 | void __cpuinit check_efer(void) |
175 | { | 175 | { |
176 | unsigned long efer; | 176 | unsigned long efer; |
177 | 177 | ||
@@ -188,7 +188,7 @@ void __init check_efer(void) | |||
188 | * 'CPU state barrier', nothing should get across. | 188 | * 'CPU state barrier', nothing should get across. |
189 | * A lot of state is already set up in PDA init. | 189 | * A lot of state is already set up in PDA init. |
190 | */ | 190 | */ |
191 | void __init cpu_init (void) | 191 | void __cpuinit cpu_init (void) |
192 | { | 192 | { |
193 | #ifdef CONFIG_SMP | 193 | #ifdef CONFIG_SMP |
194 | int cpu = stack_smp_processor_id(); | 194 | int cpu = stack_smp_processor_id(); |
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 98b7ba95d581..98590a989f3d 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c | |||
@@ -425,7 +425,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) | |||
425 | if (!user_mode(regs)) | 425 | if (!user_mode(regs)) |
426 | return 1; | 426 | return 1; |
427 | 427 | ||
428 | if (try_to_freeze(0)) | 428 | if (try_to_freeze()) |
429 | goto no_signal; | 429 | goto no_signal; |
430 | 430 | ||
431 | if (!oldset) | 431 | if (!oldset) |
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 1e379ed17b1d..ccae392886af 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c | |||
@@ -283,6 +283,16 @@ struct call_data_struct { | |||
283 | 283 | ||
284 | static struct call_data_struct * call_data; | 284 | static struct call_data_struct * call_data; |
285 | 285 | ||
286 | void lock_ipi_call_lock(void) | ||
287 | { | ||
288 | spin_lock_irq(&call_lock); | ||
289 | } | ||
290 | |||
291 | void unlock_ipi_call_lock(void) | ||
292 | { | ||
293 | spin_unlock_irq(&call_lock); | ||
294 | } | ||
295 | |||
286 | /* | 296 | /* |
287 | * this function sends a 'generic call function' IPI to all other CPUs | 297 | * this function sends a 'generic call function' IPI to all other CPUs |
288 | * in the system. | 298 | * in the system. |
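lock_ipi_call_lock()/unlock_ipi_call_lock() added above export smp.c's call_lock so that smpboot.c can make "mark myself online" atomic with respect to a concurrent smp_call_function(): the broadcaster counts its recipients under the same lock, so a CPU can never slip into the online map after the count was taken yet still receive (or miss) the IPI. A user-space model of the invariant, with a pthread mutex standing in for the spinlock (illustrative only):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t call_lock = PTHREAD_MUTEX_INITIALIZER;
    static int online_cpus = 1;             /* stands in for cpu_online_map */

    static int broadcast(void)              /* smp_call_function() */
    {
        pthread_mutex_lock(&call_lock);
        int recipients = online_cpus - 1;   /* everyone but myself */
        /* ... send IPIs, then wait for exactly 'recipients' acks ... */
        pthread_mutex_unlock(&call_lock);
        return recipients;
    }

    static void come_online(void)           /* start_secondary() */
    {
        pthread_mutex_lock(&call_lock);     /* lock_ipi_call_lock() */
        online_cpus++;                      /* cpu_set(me, cpu_online_map) */
        pthread_mutex_unlock(&call_lock);   /* unlock_ipi_call_lock() */
    }

    int main(void)
    {
        come_online();
        printf("broadcast reaches %d cpu(s)\n", broadcast());
        return 0;
    }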
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index f1ec0f345941..b969ee128728 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c | |||
@@ -34,6 +34,7 @@ | |||
34 | * Andi Kleen : Converted to new state machine. | 34 | * Andi Kleen : Converted to new state machine. |
35 | * Various cleanups. | 35 | * Various cleanups. |
36 | * Probably mostly hotplug CPU ready now. | 36 | * Probably mostly hotplug CPU ready now. |
37 | * Ashok Raj : CPU hotplug support | ||
37 | */ | 38 | */ |
38 | 39 | ||
39 | 40 | ||
@@ -58,11 +59,6 @@ | |||
58 | #include <asm/proto.h> | 59 | #include <asm/proto.h> |
59 | #include <asm/nmi.h> | 60 | #include <asm/nmi.h> |
60 | 61 | ||
61 | /* Change for real CPU hotplug. Note other files need to be fixed | ||
62 | first too. */ | ||
63 | #define __cpuinit __init | ||
64 | #define __cpuinitdata __initdata | ||
65 | |||
66 | /* Number of siblings per CPU package */ | 62 | /* Number of siblings per CPU package */ |
67 | int smp_num_siblings = 1; | 63 | int smp_num_siblings = 1; |
68 | /* Package ID of each logical CPU */ | 64 | /* Package ID of each logical CPU */ |
@@ -103,6 +99,37 @@ EXPORT_SYMBOL(cpu_core_map); | |||
103 | extern unsigned char trampoline_data[]; | 99 | extern unsigned char trampoline_data[]; |
104 | extern unsigned char trampoline_end[]; | 100 | extern unsigned char trampoline_end[]; |
105 | 101 | ||
102 | /* State of each CPU */ | ||
103 | DEFINE_PER_CPU(int, cpu_state) = { 0 }; | ||
104 | |||
105 | /* | ||
106 | * Store all idle threads; they can be reused instead of creating | ||
107 | * a new thread each time. This also avoids complicated thread-destroy | ||
108 | * functionality for idle threads. | ||
109 | */ | ||
110 | struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata; | ||
111 | |||
112 | #define get_idle_for_cpu(x) (idle_thread_array[(x)]) | ||
113 | #define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) | ||
114 | |||
115 | /* | ||
116 | * cpu_possible_map should be static; it cannot change as CPUs | ||
117 | * are onlined or offlined. The reason is that per-cpu data structures | ||
118 | * are allocated by some modules at init time, and they don't expect to | ||
119 | * redo this dynamically on cpu arrival/departure. | ||
120 | * cpu_present_map, on the other hand, can change dynamically. | ||
121 | * When cpu hotplug is not compiled in, we resort to the current | ||
122 | * behaviour, which is cpu_possible == cpu_present. | ||
123 | * If cpu hotplug is supported, we need to preallocate for all | ||
124 | * of NR_CPUS, hence cpu_possible_map covers the entire NR_CPUS range. | ||
125 | * - Ashok Raj | ||
126 | */ | ||
127 | #ifdef CONFIG_HOTPLUG_CPU | ||
128 | #define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map) | ||
129 | #else | ||
130 | #define fixup_cpu_possible_map(x) | ||
131 | #endif | ||
132 | |||
106 | /* | 133 | /* |
107 | * Currently trivial. Write the real->protected mode | 134 | * Currently trivial. Write the real->protected mode |
108 | * bootstrap into the page concerned. The caller | 135 | * bootstrap into the page concerned. The caller |
@@ -418,6 +445,33 @@ void __cpuinit smp_callin(void) | |||
418 | cpu_set(cpuid, cpu_callin_map); | 445 | cpu_set(cpuid, cpu_callin_map); |
419 | } | 446 | } |
420 | 447 | ||
448 | static inline void set_cpu_sibling_map(int cpu) | ||
449 | { | ||
450 | int i; | ||
451 | |||
452 | if (smp_num_siblings > 1) { | ||
453 | for_each_cpu(i) { | ||
454 | if (cpu_core_id[cpu] == cpu_core_id[i]) { | ||
455 | cpu_set(i, cpu_sibling_map[cpu]); | ||
456 | cpu_set(cpu, cpu_sibling_map[i]); | ||
457 | } | ||
458 | } | ||
459 | } else { | ||
460 | cpu_set(cpu, cpu_sibling_map[cpu]); | ||
461 | } | ||
462 | |||
463 | if (current_cpu_data.x86_num_cores > 1) { | ||
464 | for_each_cpu(i) { | ||
465 | if (phys_proc_id[cpu] == phys_proc_id[i]) { | ||
466 | cpu_set(i, cpu_core_map[cpu]); | ||
467 | cpu_set(cpu, cpu_core_map[i]); | ||
468 | } | ||
469 | } | ||
470 | } else { | ||
471 | cpu_core_map[cpu] = cpu_sibling_map[cpu]; | ||
472 | } | ||
473 | } | ||
474 | |||
421 | /* | 475 | /* |
422 | * Setup code on secondary processor (after coming out of the trampoline) | 476 | * Setup code on secondary processor (after coming out of the trampoline) |
423 | */ | 477 | */ |
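set_cpu_sibling_map() above records two relations for the scheduler: CPUs that share a cpu_core_id are SMT siblings, and CPUs that share a phys_proc_id sit on the same package, which fills the core map. A condensed stand-alone rendering of the pairing logic over plain bitmasks (the smp_num_siblings / x86_num_cores fast paths are folded away, and the sample IDs are invented):

    #include <stdio.h>

    #define NCPUS 4

    static int cpu_core_id[NCPUS]  = { 0, 0, 1, 1 };  /* two cores, two threads each */
    static int phys_proc_id[NCPUS] = { 0, 0, 0, 0 };  /* all on one package */
    static unsigned sibling_map[NCPUS], core_map[NCPUS];

    static void set_cpu_sibling_map(int cpu)
    {
        for (int i = 0; i < NCPUS; i++) {
            if (cpu_core_id[cpu] == cpu_core_id[i]) {
                sibling_map[cpu] |= 1u << i;   /* set both ways, like cpu_set() */
                sibling_map[i]   |= 1u << cpu;
            }
            if (phys_proc_id[cpu] == phys_proc_id[i]) {
                core_map[cpu] |= 1u << i;
                core_map[i]   |= 1u << cpu;
            }
        }
    }

    int main(void)
    {
        for (int cpu = 0; cpu < NCPUS; cpu++)
            set_cpu_sibling_map(cpu);
        for (int cpu = 0; cpu < NCPUS; cpu++)
            printf("cpu%d: siblings=%#x core=%#x\n",
                   cpu, sibling_map[cpu], core_map[cpu]);
        return 0;
    }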
@@ -448,9 +502,28 @@ void __cpuinit start_secondary(void) | |||
448 | enable_APIC_timer(); | 502 | enable_APIC_timer(); |
449 | 503 | ||
450 | /* | 504 | /* |
505 | * The sibling maps must be set before turning the online map on for | ||
506 | * this cpu | ||
507 | */ | ||
508 | set_cpu_sibling_map(smp_processor_id()); | ||
509 | |||
510 | /* | ||
511 | * We need to hold call_lock, so there is no inconsistency | ||
512 | * between the time smp_call_function() determines the number of | ||
513 | * IPI recipients and the time when the determination is made | ||
514 | * for which cpus receive the IPI in genapic_flat.c. Holding this | ||
515 | * lock keeps this cpu out of any smp_call_function() that is | ||
516 | * currently in progress. | ||
517 | */ | ||
518 | lock_ipi_call_lock(); | ||
519 | |||
520 | /* | ||
451 | * Allow the master to continue. | 521 | * Allow the master to continue. |
452 | */ | 522 | */ |
453 | cpu_set(smp_processor_id(), cpu_online_map); | 523 | cpu_set(smp_processor_id(), cpu_online_map); |
524 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; | ||
525 | unlock_ipi_call_lock(); | ||
526 | |||
454 | mb(); | 527 | mb(); |
455 | 528 | ||
456 | /* Wait for TSC sync to not schedule things before. | 529 | /* Wait for TSC sync to not schedule things before. |
@@ -628,33 +701,77 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta | |||
628 | return (send_status | accept_status); | 701 | return (send_status | accept_status); |
629 | } | 702 | } |
630 | 703 | ||
704 | struct create_idle { | ||
705 | struct task_struct *idle; | ||
706 | struct completion done; | ||
707 | int cpu; | ||
708 | }; | ||
709 | |||
710 | void do_fork_idle(void *_c_idle) | ||
711 | { | ||
712 | struct create_idle *c_idle = _c_idle; | ||
713 | |||
714 | c_idle->idle = fork_idle(c_idle->cpu); | ||
715 | complete(&c_idle->done); | ||
716 | } | ||
717 | |||
631 | /* | 718 | /* |
632 | * Boot one CPU. | 719 | * Boot one CPU. |
633 | */ | 720 | */ |
634 | static int __cpuinit do_boot_cpu(int cpu, int apicid) | 721 | static int __cpuinit do_boot_cpu(int cpu, int apicid) |
635 | { | 722 | { |
636 | struct task_struct *idle; | ||
637 | unsigned long boot_error; | 723 | unsigned long boot_error; |
638 | int timeout; | 724 | int timeout; |
639 | unsigned long start_rip; | 725 | unsigned long start_rip; |
726 | struct create_idle c_idle = { | ||
727 | .cpu = cpu, | ||
728 | .done = COMPLETION_INITIALIZER(c_idle.done), | ||
729 | }; | ||
730 | DECLARE_WORK(work, do_fork_idle, &c_idle); | ||
731 | |||
732 | c_idle.idle = get_idle_for_cpu(cpu); | ||
733 | |||
734 | if (c_idle.idle) { | ||
735 | c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) | ||
736 | (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1); | ||
737 | init_idle(c_idle.idle, cpu); | ||
738 | goto do_rest; | ||
739 | } | ||
740 | |||
640 | /* | 741 | /* |
641 | * We can't use kernel_thread since we must avoid to | 742 | * During the cold boot process, the keventd thread is not spun up yet. |
642 | * reschedule the child. | 743 | * When we do cpu hot-add, we create idle threads on the fly, and we should |
744 | * not acquire any attributes from the calling context. Hence the clean | ||
745 | * way to create kernel threads is to do that from keventd(). | ||
746 | * We check current_is_keventd() because the ACPI notifier | ||
747 | * also queues to keventd(), and if the caller is already running | ||
748 | * in the context of keventd(), we would end up deadlocking the keventd | ||
749 | * thread. | ||
643 | */ | 750 | */ |
644 | idle = fork_idle(cpu); | 751 | if (!keventd_up() || current_is_keventd()) |
645 | if (IS_ERR(idle)) { | 752 | work.func(work.data); |
753 | else { | ||
754 | schedule_work(&work); | ||
755 | wait_for_completion(&c_idle.done); | ||
756 | } | ||
757 | |||
758 | if (IS_ERR(c_idle.idle)) { | ||
646 | printk("failed fork for CPU %d\n", cpu); | 759 | printk("failed fork for CPU %d\n", cpu); |
647 | return PTR_ERR(idle); | 760 | return PTR_ERR(c_idle.idle); |
648 | } | 761 | } |
649 | 762 | ||
650 | cpu_pda[cpu].pcurrent = idle; | 763 | set_idle_for_cpu(cpu, c_idle.idle); |
764 | |||
765 | do_rest: | ||
766 | |||
767 | cpu_pda[cpu].pcurrent = c_idle.idle; | ||
651 | 768 | ||
652 | start_rip = setup_trampoline(); | 769 | start_rip = setup_trampoline(); |
653 | 770 | ||
654 | init_rsp = idle->thread.rsp; | 771 | init_rsp = c_idle.idle->thread.rsp; |
655 | per_cpu(init_tss,cpu).rsp0 = init_rsp; | 772 | per_cpu(init_tss,cpu).rsp0 = init_rsp; |
656 | initial_code = start_secondary; | 773 | initial_code = start_secondary; |
657 | clear_ti_thread_flag(idle->thread_info, TIF_FORK); | 774 | clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK); |
658 | 775 | ||
659 | printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, | 776 | printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, |
660 | start_rip, init_rsp); | 777 | start_rip, init_rsp); |
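The create_idle/do_fork_idle machinery above makes sure a hot-added CPU's idle thread is forked from keventd's clean context rather than inheriting attributes from whichever task wrote to sysfs: do_boot_cpu() runs the helper directly during cold boot (or when it is already keventd) and otherwise schedules it as work and sleeps on the completion. A user-space model of that schedule-then-wait handshake, with pthread primitives standing in for the workqueue and struct completion:

    #include <pthread.h>
    #include <stdio.h>

    struct create_idle {
        pthread_mutex_t lock;
        pthread_cond_t  done;       /* stands in for struct completion */
        int             ready;
        int             cpu;
        int             idle;       /* the result; a task pointer in the kernel */
    };

    static void *worker(void *arg)  /* keventd running do_fork_idle() */
    {
        struct create_idle *c = arg;

        pthread_mutex_lock(&c->lock);
        c->idle = 1000 + c->cpu;    /* placeholder for fork_idle(cpu) */
        c->ready = 1;
        pthread_cond_signal(&c->done);      /* complete(&c_idle.done) */
        pthread_mutex_unlock(&c->lock);
        return NULL;
    }

    int main(void)
    {
        struct create_idle c = {
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .done = PTHREAD_COND_INITIALIZER,
            .cpu  = 2,
        };
        pthread_t t;

        pthread_create(&t, NULL, worker, &c);   /* schedule_work(&work) */

        pthread_mutex_lock(&c.lock);            /* wait_for_completion() */
        while (!c.ready)
            pthread_cond_wait(&c.done, &c.lock);
        pthread_mutex_unlock(&c.lock);

        printf("idle thread %d created for cpu %d\n", c.idle, c.cpu);
        pthread_join(t, NULL);
        return 0;
    }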
@@ -746,51 +863,6 @@ cycles_t cacheflush_time; | |||
746 | unsigned long cache_decay_ticks; | 863 | unsigned long cache_decay_ticks; |
747 | 864 | ||
748 | /* | 865 | /* |
749 | * Construct cpu_sibling_map[], so that we can tell the sibling CPU | ||
750 | * on SMT systems efficiently. | ||
751 | */ | ||
752 | static __cpuinit void detect_siblings(void) | ||
753 | { | ||
754 | int cpu; | ||
755 | |||
756 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
757 | cpus_clear(cpu_sibling_map[cpu]); | ||
758 | cpus_clear(cpu_core_map[cpu]); | ||
759 | } | ||
760 | |||
761 | for_each_online_cpu (cpu) { | ||
762 | struct cpuinfo_x86 *c = cpu_data + cpu; | ||
763 | int siblings = 0; | ||
764 | int i; | ||
765 | if (smp_num_siblings > 1) { | ||
766 | for_each_online_cpu (i) { | ||
767 | if (cpu_core_id[cpu] == cpu_core_id[i]) { | ||
768 | siblings++; | ||
769 | cpu_set(i, cpu_sibling_map[cpu]); | ||
770 | } | ||
771 | } | ||
772 | } else { | ||
773 | siblings++; | ||
774 | cpu_set(cpu, cpu_sibling_map[cpu]); | ||
775 | } | ||
776 | |||
777 | if (siblings != smp_num_siblings) { | ||
778 | printk(KERN_WARNING | ||
779 | "WARNING: %d siblings found for CPU%d, should be %d\n", | ||
780 | siblings, cpu, smp_num_siblings); | ||
781 | smp_num_siblings = siblings; | ||
782 | } | ||
783 | if (c->x86_num_cores > 1) { | ||
784 | for_each_online_cpu(i) { | ||
785 | if (phys_proc_id[cpu] == phys_proc_id[i]) | ||
786 | cpu_set(i, cpu_core_map[cpu]); | ||
787 | } | ||
788 | } else | ||
789 | cpu_core_map[cpu] = cpu_sibling_map[cpu]; | ||
790 | } | ||
791 | } | ||
792 | |||
793 | /* | ||
794 | * Cleanup possible dangling ends... | 866 | * Cleanup possible dangling ends... |
795 | */ | 867 | */ |
796 | static __cpuinit void smp_cleanup_boot(void) | 868 | static __cpuinit void smp_cleanup_boot(void) |
@@ -823,7 +895,7 @@ static __cpuinit void smp_cleanup_boot(void) | |||
823 | * | 895 | * |
824 | * RED-PEN audit/test this more. I bet there is more state messed up here. | 896 | * RED-PEN audit/test this more. I bet there is more state messed up here. |
825 | */ | 897 | */ |
826 | static __cpuinit void disable_smp(void) | 898 | static __init void disable_smp(void) |
827 | { | 899 | { |
828 | cpu_present_map = cpumask_of_cpu(0); | 900 | cpu_present_map = cpumask_of_cpu(0); |
829 | cpu_possible_map = cpumask_of_cpu(0); | 901 | cpu_possible_map = cpumask_of_cpu(0); |
@@ -838,7 +910,7 @@ static __cpuinit void disable_smp(void) | |||
838 | /* | 910 | /* |
839 | * Handle user cpus=... parameter. | 911 | * Handle user cpus=... parameter. |
840 | */ | 912 | */ |
841 | static __cpuinit void enforce_max_cpus(unsigned max_cpus) | 913 | static __init void enforce_max_cpus(unsigned max_cpus) |
842 | { | 914 | { |
843 | int i, k; | 915 | int i, k; |
844 | k = 0; | 916 | k = 0; |
@@ -855,7 +927,7 @@ static __cpuinit void enforce_max_cpus(unsigned max_cpus) | |||
855 | /* | 927 | /* |
856 | * Various sanity checks. | 928 | * Various sanity checks. |
857 | */ | 929 | */ |
858 | static int __cpuinit smp_sanity_check(unsigned max_cpus) | 930 | static int __init smp_sanity_check(unsigned max_cpus) |
859 | { | 931 | { |
860 | if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { | 932 | if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { |
861 | printk("weird, boot CPU (#%d) not listed by the BIOS.\n", | 933 | printk("weird, boot CPU (#%d) not listed by the BIOS.\n", |
@@ -913,7 +985,7 @@ static int __cpuinit smp_sanity_check(unsigned max_cpus) | |||
913 | * Prepare for SMP bootup. The MP table or ACPI has been read | 985 | * Prepare for SMP bootup. The MP table or ACPI has been read |
914 | * earlier. Just do some sanity checking here and enable APIC mode. | 986 | * earlier. Just do some sanity checking here and enable APIC mode. |
915 | */ | 987 | */ |
916 | void __cpuinit smp_prepare_cpus(unsigned int max_cpus) | 988 | void __init smp_prepare_cpus(unsigned int max_cpus) |
917 | { | 989 | { |
918 | int i; | 990 | int i; |
919 | 991 | ||
@@ -930,10 +1002,9 @@ void __cpuinit smp_prepare_cpus(unsigned int max_cpus) | |||
930 | int apicid = cpu_present_to_apicid(i); | 1002 | int apicid = cpu_present_to_apicid(i); |
931 | if (physid_isset(apicid, phys_cpu_present_map)) { | 1003 | if (physid_isset(apicid, phys_cpu_present_map)) { |
932 | cpu_set(i, cpu_present_map); | 1004 | cpu_set(i, cpu_present_map); |
933 | /* possible map would be different if we supported real | ||
934 | CPU hotplug. */ | ||
935 | cpu_set(i, cpu_possible_map); | 1005 | cpu_set(i, cpu_possible_map); |
936 | } | 1006 | } |
1007 | fixup_cpu_possible_map(i); | ||
937 | } | 1008 | } |
938 | 1009 | ||
939 | if (smp_sanity_check(max_cpus) < 0) { | 1010 | if (smp_sanity_check(max_cpus) < 0) { |
@@ -978,13 +1049,13 @@ void __init smp_prepare_boot_cpu(void) | |||
978 | int me = smp_processor_id(); | 1049 | int me = smp_processor_id(); |
979 | cpu_set(me, cpu_online_map); | 1050 | cpu_set(me, cpu_online_map); |
980 | cpu_set(me, cpu_callout_map); | 1051 | cpu_set(me, cpu_callout_map); |
1052 | cpu_set(0, cpu_sibling_map[0]); | ||
1053 | cpu_set(0, cpu_core_map[0]); | ||
1054 | per_cpu(cpu_state, me) = CPU_ONLINE; | ||
981 | } | 1055 | } |
982 | 1056 | ||
983 | /* | 1057 | /* |
984 | * Entry point to boot a CPU. | 1058 | * Entry point to boot a CPU. |
985 | * | ||
986 | * This is all __cpuinit, not __devinit for now because we don't support | ||
987 | * CPU hotplug (yet). | ||
988 | */ | 1059 | */ |
989 | int __cpuinit __cpu_up(unsigned int cpu) | 1060 | int __cpuinit __cpu_up(unsigned int cpu) |
990 | { | 1061 | { |
@@ -1001,6 +1072,15 @@ int __cpuinit __cpu_up(unsigned int cpu) | |||
1001 | return -EINVAL; | 1072 | return -EINVAL; |
1002 | } | 1073 | } |
1003 | 1074 | ||
1075 | /* | ||
1076 | * Already booted CPU? | ||
1077 | */ | ||
1078 | if (cpu_isset(cpu, cpu_callin_map)) { | ||
1079 | Dprintk("do_boot_cpu %d Already started\n", cpu); | ||
1080 | return -ENOSYS; | ||
1081 | } | ||
1082 | |||
1083 | per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; | ||
1004 | /* Boot it! */ | 1084 | /* Boot it! */ |
1005 | err = do_boot_cpu(cpu, apicid); | 1085 | err = do_boot_cpu(cpu, apicid); |
1006 | if (err < 0) { | 1086 | if (err < 0) { |
@@ -1013,23 +1093,118 @@ int __cpuinit __cpu_up(unsigned int cpu) | |||
1013 | 1093 | ||
1014 | while (!cpu_isset(cpu, cpu_online_map)) | 1094 | while (!cpu_isset(cpu, cpu_online_map)) |
1015 | cpu_relax(); | 1095 | cpu_relax(); |
1016 | return 0; | 1096 | err = 0; |
1097 | |||
1098 | return err; | ||
1017 | } | 1099 | } |
1018 | 1100 | ||
1019 | /* | 1101 | /* |
1020 | * Finish the SMP boot. | 1102 | * Finish the SMP boot. |
1021 | */ | 1103 | */ |
1022 | void __cpuinit smp_cpus_done(unsigned int max_cpus) | 1104 | void __init smp_cpus_done(unsigned int max_cpus) |
1023 | { | 1105 | { |
1106 | #ifndef CONFIG_HOTPLUG_CPU | ||
1024 | zap_low_mappings(); | 1107 | zap_low_mappings(); |
1108 | #endif | ||
1025 | smp_cleanup_boot(); | 1109 | smp_cleanup_boot(); |
1026 | 1110 | ||
1027 | #ifdef CONFIG_X86_IO_APIC | 1111 | #ifdef CONFIG_X86_IO_APIC |
1028 | setup_ioapic_dest(); | 1112 | setup_ioapic_dest(); |
1029 | #endif | 1113 | #endif |
1030 | 1114 | ||
1031 | detect_siblings(); | ||
1032 | time_init_gtod(); | 1115 | time_init_gtod(); |
1033 | 1116 | ||
1034 | check_nmi_watchdog(); | 1117 | check_nmi_watchdog(); |
1035 | } | 1118 | } |
1119 | |||
1120 | #ifdef CONFIG_HOTPLUG_CPU | ||
1121 | |||
1122 | static void remove_siblinginfo(int cpu) | ||
1123 | { | ||
1124 | int sibling; | ||
1125 | |||
1126 | for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) | ||
1127 | cpu_clear(cpu, cpu_sibling_map[sibling]); | ||
1128 | for_each_cpu_mask(sibling, cpu_core_map[cpu]) | ||
1129 | cpu_clear(cpu, cpu_core_map[sibling]); | ||
1130 | cpus_clear(cpu_sibling_map[cpu]); | ||
1131 | cpus_clear(cpu_core_map[cpu]); | ||
1132 | phys_proc_id[cpu] = BAD_APICID; | ||
1133 | cpu_core_id[cpu] = BAD_APICID; | ||
1134 | } | ||
1135 | |||
1136 | void remove_cpu_from_maps(void) | ||
1137 | { | ||
1138 | int cpu = smp_processor_id(); | ||
1139 | |||
1140 | cpu_clear(cpu, cpu_callout_map); | ||
1141 | cpu_clear(cpu, cpu_callin_map); | ||
1142 | clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ | ||
1143 | } | ||
1144 | |||
1145 | int __cpu_disable(void) | ||
1146 | { | ||
1147 | int cpu = smp_processor_id(); | ||
1148 | |||
1149 | /* | ||
1150 | * Perhaps use cpufreq to drop frequency, but that could go | ||
1151 | * into generic code. | ||
1152 | * | ||
1153 | * We won't take down the boot processor on i386 due to some | ||
1154 | * interrupts only being able to be serviced by the BSP. | ||
1155 | * Especially so if we're not using an IOAPIC -zwane | ||
1156 | */ | ||
1157 | if (cpu == 0) | ||
1158 | return -EBUSY; | ||
1159 | |||
1160 | disable_APIC_timer(); | ||
1161 | |||
1162 | /* | ||
1163 | * HACK: | ||
1164 | * Allow any queued timer interrupts to get serviced | ||
1165 | * This is only a temporary solution until we clean up | ||
1166 | * fixup_irqs as we do for IA64. | ||
1167 | */ | ||
1168 | local_irq_enable(); | ||
1169 | mdelay(1); | ||
1170 | |||
1171 | local_irq_disable(); | ||
1172 | remove_siblinginfo(cpu); | ||
1173 | |||
1174 | /* It's now safe to remove this processor from the online map */ | ||
1175 | cpu_clear(cpu, cpu_online_map); | ||
1176 | remove_cpu_from_maps(); | ||
1177 | fixup_irqs(cpu_online_map); | ||
1178 | return 0; | ||
1179 | } | ||
1180 | |||
1181 | void __cpu_die(unsigned int cpu) | ||
1182 | { | ||
1183 | /* We don't do anything here: the idle task is faking death itself. */ | ||
1184 | unsigned int i; | ||
1185 | |||
1186 | for (i = 0; i < 10; i++) { | ||
1187 | /* They ack this in play_dead by setting CPU_DEAD */ | ||
1188 | if (per_cpu(cpu_state, cpu) == CPU_DEAD) { | ||
1189 | printk ("CPU %d is now offline\n", cpu); | ||
1190 | return; | ||
1191 | } | ||
1192 | current->state = TASK_UNINTERRUPTIBLE; | ||
1193 | schedule_timeout(HZ/10); | ||
1194 | } | ||
1195 | printk(KERN_ERR "CPU %u didn't die...\n", cpu); | ||
1196 | } | ||
1197 | |||
1198 | #else /* ... !CONFIG_HOTPLUG_CPU */ | ||
1199 | |||
1200 | int __cpu_disable(void) | ||
1201 | { | ||
1202 | return -ENOSYS; | ||
1203 | } | ||
1204 | |||
1205 | void __cpu_die(unsigned int cpu) | ||
1206 | { | ||
1207 | /* We said "no" in __cpu_disable */ | ||
1208 | BUG(); | ||
1209 | } | ||
1210 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
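With __cpu_disable()/__cpu_die() wired up, offlining is driven entirely from user space through sysfs. A minimal trigger (needs root, and assumes cpu1 exists and the kernel was built with CONFIG_HOTPLUG_CPU; writing "1" brings the CPU back):

    #include <stdio.h>

    int main(void)
    {
        /* '0' offlines the CPU via __cpu_disable()/play_dead(),
         * '1' boots it again via __cpu_up(). */
        FILE *f = fopen("/sys/devices/system/cpu/cpu1/online", "w");

        if (!f) {
            perror("cpu1/online");
            return 1;
        }
        fputs("0\n", f);
        return fclose(f) != 0;
    }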
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index ebaa1e37d657..6c0f402e3a88 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c | |||
@@ -44,7 +44,6 @@ void __save_processor_state(struct saved_context *ctxt) | |||
44 | */ | 44 | */ |
45 | asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); | 45 | asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); |
46 | asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); | 46 | asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); |
47 | asm volatile ("sldt %0" : "=m" (ctxt->ldt)); | ||
48 | asm volatile ("str %0" : "=m" (ctxt->tr)); | 47 | asm volatile ("str %0" : "=m" (ctxt->tr)); |
49 | 48 | ||
50 | /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ | 49 | /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ |
@@ -69,6 +68,7 @@ void __save_processor_state(struct saved_context *ctxt) | |||
69 | asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); | 68 | asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); |
70 | asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); | 69 | asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); |
71 | asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4)); | 70 | asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4)); |
71 | asm volatile ("movq %%cr8, %0" : "=r" (ctxt->cr8)); | ||
72 | } | 72 | } |
73 | 73 | ||
74 | void save_processor_state(void) | 74 | void save_processor_state(void) |
@@ -90,12 +90,20 @@ void __restore_processor_state(struct saved_context *ctxt) | |||
90 | /* | 90 | /* |
91 | * control registers | 91 | * control registers |
92 | */ | 92 | */ |
93 | asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8)); | ||
93 | asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); | 94 | asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); |
94 | asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); | 95 | asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); |
95 | asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2)); | 96 | asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2)); |
96 | asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0)); | 97 | asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0)); |
97 | 98 | ||
98 | /* | 99 | /* |
100 | * now restore the descriptor tables to their proper values | ||
101 | * ltr is done in fix_processor_context(). | ||
102 | */ | ||
103 | asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); | ||
104 | asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); | ||
105 | |||
106 | /* | ||
99 | * segment registers | 107 | * segment registers |
100 | */ | 108 | */ |
101 | asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); | 109 | asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); |
@@ -108,14 +116,6 @@ void __restore_processor_state(struct saved_context *ctxt) | |||
108 | wrmsrl(MSR_GS_BASE, ctxt->gs_base); | 116 | wrmsrl(MSR_GS_BASE, ctxt->gs_base); |
109 | wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); | 117 | wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); |
110 | 118 | ||
111 | /* | ||
112 | * now restore the descriptor tables to their proper values | ||
113 | * ltr is done i fix_processor_context(). | ||
114 | */ | ||
115 | asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); | ||
116 | asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); | ||
117 | asm volatile ("lldt %0" :: "m" (ctxt->ldt)); | ||
118 | |||
119 | fix_processor_context(); | 119 | fix_processor_context(); |
120 | 120 | ||
121 | do_fpu_end(); | 121 | do_fpu_end(); |
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 121646fc43f6..102736630002 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c | |||
@@ -586,11 +586,17 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) | |||
586 | asmlinkage void default_do_nmi(struct pt_regs *regs) | 586 | asmlinkage void default_do_nmi(struct pt_regs *regs) |
587 | { | 587 | { |
588 | unsigned char reason = 0; | 588 | unsigned char reason = 0; |
589 | int cpu; | ||
590 | |||
591 | cpu = smp_processor_id(); | ||
589 | 592 | ||
590 | /* Only the BSP gets external NMIs from the system. */ | 593 | /* Only the BSP gets external NMIs from the system. */ |
591 | if (!smp_processor_id()) | 594 | if (!cpu) |
592 | reason = get_nmi_reason(); | 595 | reason = get_nmi_reason(); |
593 | 596 | ||
597 | if (!cpu_online(cpu)) | ||
598 | return; | ||
599 | |||
594 | if (!(reason & 0xc0)) { | 600 | if (!(reason & 0xc0)) { |
595 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) | 601 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) |
596 | == NOTIFY_STOP) | 602 | == NOTIFY_STOP) |
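The cpu_online() test added to default_do_nmi() closes a hotplug hole: a CPU parked in play_dead() still takes NMIs (the watchdog keeps firing), and it must not fall through into the notifier chain or the unknown-NMI path while it is officially offline. A toy model of the guard ordering (not the kernel function; get_reason() is a placeholder for get_nmi_reason()):

    #include <stdio.h>

    static int get_reason(void) { return 0; }   /* placeholder */

    static void nmi_sketch(int cpu, int online)
    {
        int reason = 0;

        if (cpu == 0)           /* only the BSP sees external NMIs */
            reason = get_reason();

        if (!online) {          /* offlined CPU in play_dead(): ignore it */
            printf("cpu%d: NMI ignored while offline\n", cpu);
            return;
        }
        printf("cpu%d: NMI handled, reason=%#x\n", cpu, reason);
    }

    int main(void)
    {
        nmi_sketch(0, 1);       /* BSP, online */
        nmi_sketch(1, 0);       /* AP that has been offlined */
        return 0;
    }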
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 59ebd5beda87..73389f51c4e5 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S | |||
@@ -2,7 +2,10 @@ | |||
2 | * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; | 2 | * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; |
3 | */ | 3 | */ |
4 | 4 | ||
5 | #define LOAD_OFFSET __START_KERNEL_map | ||
6 | |||
5 | #include <asm-generic/vmlinux.lds.h> | 7 | #include <asm-generic/vmlinux.lds.h> |
8 | #include <asm/page.h> | ||
6 | #include <linux/config.h> | 9 | #include <linux/config.h> |
7 | 10 | ||
8 | OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") | 11 | OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") |
@@ -11,28 +14,30 @@ ENTRY(phys_startup_64) | |||
11 | jiffies_64 = jiffies; | 14 | jiffies_64 = jiffies; |
12 | SECTIONS | 15 | SECTIONS |
13 | { | 16 | { |
14 | . = 0xffffffff80100000; | 17 | . = __START_KERNEL; |
15 | phys_startup_64 = startup_64 - LOAD_OFFSET; | 18 | phys_startup_64 = startup_64 - LOAD_OFFSET; |
16 | _text = .; /* Text and read-only data */ | 19 | _text = .; /* Text and read-only data */ |
17 | .text : { | 20 | .text : AT(ADDR(.text) - LOAD_OFFSET) { |
18 | *(.text) | 21 | *(.text) |
19 | SCHED_TEXT | 22 | SCHED_TEXT |
20 | LOCK_TEXT | 23 | LOCK_TEXT |
21 | *(.fixup) | 24 | *(.fixup) |
22 | *(.gnu.warning) | 25 | *(.gnu.warning) |
23 | } = 0x9090 | 26 | } = 0x9090 |
24 | .text.lock : { *(.text.lock) } /* out-of-line lock text */ | 27 | /* out-of-line lock text */ |
28 | .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } | ||
25 | 29 | ||
26 | _etext = .; /* End of text section */ | 30 | _etext = .; /* End of text section */ |
27 | 31 | ||
28 | . = ALIGN(16); /* Exception table */ | 32 | . = ALIGN(16); /* Exception table */ |
29 | __start___ex_table = .; | 33 | __start___ex_table = .; |
30 | __ex_table : { *(__ex_table) } | 34 | __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } |
31 | __stop___ex_table = .; | 35 | __stop___ex_table = .; |
32 | 36 | ||
33 | RODATA | 37 | RODATA |
34 | 38 | ||
35 | .data : { /* Data */ | 39 | /* Data */ |
40 | .data : AT(ADDR(.data) - LOAD_OFFSET) { | ||
36 | *(.data) | 41 | *(.data) |
37 | CONSTRUCTORS | 42 | CONSTRUCTORS |
38 | } | 43 | } |
@@ -40,62 +45,95 @@ SECTIONS | |||
40 | _edata = .; /* End of data section */ | 45 | _edata = .; /* End of data section */ |
41 | 46 | ||
42 | __bss_start = .; /* BSS */ | 47 | __bss_start = .; /* BSS */ |
43 | .bss : { | 48 | .bss : AT(ADDR(.bss) - LOAD_OFFSET) { |
44 | *(.bss.page_aligned) | 49 | *(.bss.page_aligned) |
45 | *(.bss) | 50 | *(.bss) |
46 | } | 51 | } |
47 | __bss_end = .; | 52 | __bss_end = .; |
48 | 53 | ||
54 | . = ALIGN(PAGE_SIZE); | ||
49 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | 55 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); |
50 | .data.cacheline_aligned : { *(.data.cacheline_aligned) } | 56 | .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { |
57 | *(.data.cacheline_aligned) | ||
58 | } | ||
59 | |||
60 | #define VSYSCALL_ADDR (-10*1024*1024) | ||
61 | #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) | ||
62 | #define VSYSCALL_VIRT_ADDR ((ADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) | ||
63 | |||
64 | #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) | ||
65 | #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) | ||
66 | |||
67 | #define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) | ||
68 | #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) | ||
51 | 69 | ||
52 | #define AFTER(x) BINALIGN(LOADADDR(x) + SIZEOF(x), 16) | 70 | . = VSYSCALL_ADDR; |
53 | #define BINALIGN(x,y) (((x) + (y) - 1) & ~((y) - 1)) | 71 | .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } |
54 | #define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES) | 72 | __vsyscall_0 = VSYSCALL_VIRT_ADDR; |
55 | 73 | ||
56 | .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) } | ||
57 | __vsyscall_0 = LOADADDR(.vsyscall_0); | ||
58 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | 74 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); |
59 | .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) } | 75 | .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) } |
60 | xtime_lock = LOADADDR(.xtime_lock); | 76 | xtime_lock = VVIRT(.xtime_lock); |
61 | .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) } | 77 | |
62 | vxtime = LOADADDR(.vxtime); | 78 | .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } |
63 | .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) } | 79 | vxtime = VVIRT(.vxtime); |
64 | wall_jiffies = LOADADDR(.wall_jiffies); | 80 | |
65 | .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) } | 81 | .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) } |
66 | sys_tz = LOADADDR(.sys_tz); | 82 | wall_jiffies = VVIRT(.wall_jiffies); |
67 | .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) } | 83 | |
68 | sysctl_vsyscall = LOADADDR(.sysctl_vsyscall); | 84 | .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) } |
69 | .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) } | 85 | sys_tz = VVIRT(.sys_tz); |
70 | xtime = LOADADDR(.xtime); | 86 | |
87 | .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) } | ||
88 | sysctl_vsyscall = VVIRT(.sysctl_vsyscall); | ||
89 | |||
90 | .xtime : AT(VLOAD(.xtime)) { *(.xtime) } | ||
91 | xtime = VVIRT(.xtime); | ||
92 | |||
71 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | 93 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); |
72 | .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) } | 94 | .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } |
73 | jiffies = LOADADDR(.jiffies); | 95 | jiffies = VVIRT(.jiffies); |
74 | .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) } | 96 | |
75 | . = LOADADDR(.vsyscall_0) + 4096; | 97 | .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } |
98 | .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } | ||
99 | .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } | ||
100 | |||
101 | . = VSYSCALL_VIRT_ADDR + 4096; | ||
102 | |||
103 | #undef VSYSCALL_ADDR | ||
104 | #undef VSYSCALL_PHYS_ADDR | ||
105 | #undef VSYSCALL_VIRT_ADDR | ||
106 | #undef VLOAD_OFFSET | ||
107 | #undef VLOAD | ||
108 | #undef VVIRT_OFFSET | ||
109 | #undef VVIRT | ||
76 | 110 | ||
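The VLOAD()/VVIRT() macros above translate a vsyscall section's link-time address (pinned at VSYSCALL_ADDR, i.e. -10 MB, in every address space) into, respectively, where the bytes sit in the loaded image and where the section's symbols live in the normal kernel mapping. A worked example of the arithmetic with invented sample addresses:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t VSYSCALL_ADDR = (uint64_t)-(10 * 1024 * 1024); /* 0xffffffffff600000 */
        /* sample end of .data.cacheline_aligned, rounded up to a 4 KB page */
        uint64_t phys = (0x59a123ULL           + 4095) & ~4095ULL;
        uint64_t virt = (0xffffffff8059a123ULL + 4095) & ~4095ULL;

        uint64_t vload_off = VSYSCALL_ADDR - phys;  /* VLOAD(x) = ADDR(x) - this */
        uint64_t vvirt_off = VSYSCALL_ADDR - virt;  /* VVIRT(x) = ADDR(x) - this */

        uint64_t x = VSYSCALL_ADDR + 1024;          /* e.g. .vsyscall_1 */
        printf("link/user address %#" PRIx64 "\n", x);
        printf("load address      %#" PRIx64 "  (VLOAD)\n", x - vload_off);
        printf("kernel-map alias  %#" PRIx64 "  (VVIRT)\n", x - vvirt_off);
        return 0;
    }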
77 | . = ALIGN(8192); /* init_task */ | 111 | . = ALIGN(8192); /* init_task */ |
78 | .data.init_task : { *(.data.init_task) } | 112 | .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { |
113 | *(.data.init_task) | ||
114 | } | ||
79 | 115 | ||
80 | . = ALIGN(4096); | 116 | . = ALIGN(4096); |
81 | .data.page_aligned : { *(.data.page_aligned) } | 117 | .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { |
118 | *(.data.page_aligned) | ||
119 | } | ||
82 | 120 | ||
83 | . = ALIGN(4096); /* Init code and data */ | 121 | . = ALIGN(4096); /* Init code and data */ |
84 | __init_begin = .; | 122 | __init_begin = .; |
85 | .init.text : { | 123 | .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { |
86 | _sinittext = .; | 124 | _sinittext = .; |
87 | *(.init.text) | 125 | *(.init.text) |
88 | _einittext = .; | 126 | _einittext = .; |
89 | } | 127 | } |
90 | __initdata_begin = .; | 128 | __initdata_begin = .; |
91 | .init.data : { *(.init.data) } | 129 | .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } |
92 | __initdata_end = .; | 130 | __initdata_end = .; |
93 | . = ALIGN(16); | 131 | . = ALIGN(16); |
94 | __setup_start = .; | 132 | __setup_start = .; |
95 | .init.setup : { *(.init.setup) } | 133 | .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } |
96 | __setup_end = .; | 134 | __setup_end = .; |
97 | __initcall_start = .; | 135 | __initcall_start = .; |
98 | .initcall.init : { | 136 | .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { |
99 | *(.initcall1.init) | 137 | *(.initcall1.init) |
100 | *(.initcall2.init) | 138 | *(.initcall2.init) |
101 | *(.initcall3.init) | 139 | *(.initcall3.init) |
@@ -106,32 +144,38 @@ SECTIONS | |||
106 | } | 144 | } |
107 | __initcall_end = .; | 145 | __initcall_end = .; |
108 | __con_initcall_start = .; | 146 | __con_initcall_start = .; |
109 | .con_initcall.init : { *(.con_initcall.init) } | 147 | .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { |
148 | *(.con_initcall.init) | ||
149 | } | ||
110 | __con_initcall_end = .; | 150 | __con_initcall_end = .; |
111 | SECURITY_INIT | 151 | SECURITY_INIT |
112 | . = ALIGN(8); | 152 | . = ALIGN(8); |
113 | __alt_instructions = .; | 153 | __alt_instructions = .; |
114 | .altinstructions : { *(.altinstructions) } | 154 | .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { |
155 | *(.altinstructions) | ||
156 | } | ||
115 | __alt_instructions_end = .; | 157 | __alt_instructions_end = .; |
116 | .altinstr_replacement : { *(.altinstr_replacement) } | 158 | .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { |
159 | *(.altinstr_replacement) | ||
160 | } | ||
117 | /* .exit.text is discarded at runtime, not link time, to deal with references | 161 | /* .exit.text is discarded at runtime, not link time, to deal with references |
118 | from .altinstructions and .eh_frame */ | 162 | from .altinstructions and .eh_frame */ |
119 | .exit.text : { *(.exit.text) } | 163 | .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } |
120 | .exit.data : { *(.exit.data) } | 164 | .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } |
121 | . = ALIGN(4096); | 165 | . = ALIGN(4096); |
122 | __initramfs_start = .; | 166 | __initramfs_start = .; |
123 | .init.ramfs : { *(.init.ramfs) } | 167 | .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } |
124 | __initramfs_end = .; | 168 | __initramfs_end = .; |
125 | . = ALIGN(32); | 169 | . = ALIGN(32); |
126 | __per_cpu_start = .; | 170 | __per_cpu_start = .; |
127 | .data.percpu : { *(.data.percpu) } | 171 | .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } |
128 | __per_cpu_end = .; | 172 | __per_cpu_end = .; |
129 | . = ALIGN(4096); | 173 | . = ALIGN(4096); |
130 | __init_end = .; | 174 | __init_end = .; |
131 | 175 | ||
132 | . = ALIGN(4096); | 176 | . = ALIGN(4096); |
133 | __nosave_begin = .; | 177 | __nosave_begin = .; |
134 | .data_nosave : { *(.data.nosave) } | 178 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } |
135 | . = ALIGN(4096); | 179 | . = ALIGN(4096); |
136 | __nosave_end = .; | 180 | __nosave_end = .; |
137 | 181 | ||
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 84cde796ecb1..ac61c186eb02 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c | |||
@@ -251,7 +251,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | |||
251 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); | 251 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); |
252 | } | 252 | } |
253 | 253 | ||
254 | __init void numa_add_cpu(int cpu) | 254 | __cpuinit void numa_add_cpu(int cpu) |
255 | { | 255 | { |
256 | /* BP is initialized elsewhere */ | 256 | /* BP is initialized elsewhere */ |
257 | if (cpu) | 257 | if (cpu) |