author    Eric W. Biederman <ebiederm@xmission.com>    2005-06-25 17:58:02 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org>    2005-06-25 19:24:50 -0400
commit    5234f5eb04abbbfa306ccfbc2ccbb6e73f515b15 (patch)
tree      60f4701fdc501955ccff198f84913b96e3bbf5bf
parent    d89559589a588d1a654329d8cd9a3ad33aaad9be (diff)
[PATCH] kexec: x86_64 kexec implementation
This is the x86_64 implementation of machine kexec.  32-bit compatibility
support has been implemented, and machine_kexec has been enhanced so that it
does not care about the changing internal kernel page table structures.

From: Alexander Nyberg <alexn@dsv.su.se>

  build fix

Signed-off-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--  arch/x86_64/Kconfig                    17
-rw-r--r--  arch/x86_64/ia32/ia32entry.S            2
-rw-r--r--  arch/x86_64/kernel/Makefile             1
-rw-r--r--  arch/x86_64/kernel/crash.c             38
-rw-r--r--  arch/x86_64/kernel/machine_kexec.c    245
-rw-r--r--  arch/x86_64/kernel/relocate_kernel.S  143
-rw-r--r--  include/asm-x86_64/kexec.h             28
-rw-r--r--  include/asm-x86_64/unistd.h             2

8 files changed, 474 insertions(+), 2 deletions(-)
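
For context, the syscall this patch wires up on x86-64 (__NR_kexec_load, 246) is normally driven by kexec-tools rather than called directly.  The following is a minimal, hypothetical userspace sketch of the interface only; the segment contents, addresses, and error handling are placeholders and not part of this patch.  It requires CAP_SYS_BOOT and CONFIG_KEXEC.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SYS_kexec_load
#define SYS_kexec_load 246              /* x86-64 number added by this patch */
#endif

/* Mirrors struct kexec_segment from include/linux/kexec.h. */
struct kexec_segment {
        const void *buf;                /* user buffer holding the segment   */
        size_t bufsz;                   /* bytes to copy from buf            */
        const void *mem;                /* physical destination address      */
        size_t memsz;                   /* size reserved at the destination  */
};

#define KEXEC_ARCH_X86_64 (62 << 16)    /* matches KEXEC_ARCH in the new kexec.h */

int main(void)
{
        static char payload[4096] = "not a real kernel image";  /* placeholder */
        struct kexec_segment seg = {
                .buf = payload,  .bufsz = sizeof(payload),
                .mem = (void *)0x100000, .memsz = sizeof(payload), /* placeholder */
        };

        /* entry point, one segment, arch flag; real loaders pass many segments */
        long ret = syscall(SYS_kexec_load, 0x100000UL, 1UL, &seg,
                           (unsigned long)KEXEC_ARCH_X86_64);
        if (ret < 0)
                perror("kexec_load");
        return 0;
}
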
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 56c02cf6397d..d09437b5c48f 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -380,6 +380,23 @@ config PHYSICAL_START
 
           Don't change this unless you know what you are doing.
 
+config KEXEC
+        bool "kexec system call (EXPERIMENTAL)"
+        depends on EXPERIMENTAL
+        help
+          kexec is a system call that implements the ability to shut down your
+          current kernel, and to start another kernel.  It is like a reboot
+          but it is independent of the system firmware.  And like a reboot
+          you can start any kernel with it, not just Linux.
+
+          The name comes from the similarity to the exec system call.
+
+          It is an ongoing process to be certain the hardware in a machine
+          is properly shut down, so do not be surprised if this code does not
+          initially work for you.  It may help to enable device hotplugging
+          support.  As of this writing the exact hardware interface is
+          strongly in flux, so no good recommendation can be made.
+
 config SECCOMP
         bool "Enable seccomp to safely compute untrusted bytecode"
         depends on PROC_FS
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index f3ca0db85b5b..cc935427d532 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -589,7 +589,7 @@ ia32_sys_call_table:
 	.quad compat_sys_mq_timedreceive	/* 280 */
 	.quad compat_sys_mq_notify
 	.quad compat_sys_mq_getsetattr
-	.quad quiet_ni_syscall			/* reserved for kexec */
+	.quad compat_sys_kexec_load		/* reserved for kexec */
 	.quad compat_sys_waitid
 	.quad quiet_ni_syscall			/* sys_altroot */
 	.quad sys_add_key
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 5ca4a4598fda..48f9e2c19cd6 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
 obj-$(CONFIG_X86_LOCAL_APIC)    += apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)       += io_apic.o mpparse.o \
                 genapic.o genapic_cluster.o genapic_flat.o
+obj-$(CONFIG_KEXEC)             += machine_kexec.o relocate_kernel.o crash.o
 obj-$(CONFIG_PM)                += suspend.o
 obj-$(CONFIG_SOFTWARE_SUSPEND)  += suspend_asm.o
 obj-$(CONFIG_CPU_FREQ)          += cpufreq/
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
new file mode 100644
index 000000000000..7caf8a49d0cb
--- /dev/null
+++ b/arch/x86_64/kernel/crash.c
@@ -0,0 +1,38 @@
+/*
+ * Architecture specific (x86_64) functions for kexec based crash dumps.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *
+ * Copyright (C) IBM Corporation, 2004. All rights reserved.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+
+#include <asm/processor.h>
+#include <asm/hardirq.h>
+#include <asm/nmi.h>
+#include <asm/hw_irq.h>
+
+#define MAX_NOTE_BYTES 1024
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+
+note_buf_t crash_notes[NR_CPUS];
+
+void machine_crash_shutdown(void)
+{
+        /* This function is only called after the system
+         * has panicked or is otherwise in a critical state.
+         * The minimum amount of code to allow a kexec'd kernel
+         * to run successfully needs to happen here.
+         *
+         * In practice this means shooting down the other cpus in
+         * an SMP system.
+         */
+}
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
new file mode 100644
index 000000000000..200b5993f8d9
--- /dev/null
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -0,0 +1,245 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/reboot.h>
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/hw_irq.h>
+
+#define LEVEL0_SIZE (1UL << 12UL)
+#define LEVEL1_SIZE (1UL << 21UL)
+#define LEVEL2_SIZE (1UL << 30UL)
+#define LEVEL3_SIZE (1UL << 39UL)
+#define LEVEL4_SIZE (1UL << 48UL)
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE)
+#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+static void init_level2_page(
+        u64 *level2p, unsigned long addr)
+{
+        unsigned long end_addr;
+        addr &= PAGE_MASK;
+        end_addr = addr + LEVEL2_SIZE;
+        while (addr < end_addr) {
+                *(level2p++) = addr | L1_ATTR;
+                addr += LEVEL1_SIZE;
+        }
+}
+
+static int init_level3_page(struct kimage *image,
+        u64 *level3p, unsigned long addr, unsigned long last_addr)
+{
+        unsigned long end_addr;
+        int result;
+        result = 0;
+        addr &= PAGE_MASK;
+        end_addr = addr + LEVEL3_SIZE;
+        while ((addr < last_addr) && (addr < end_addr)) {
+                struct page *page;
+                u64 *level2p;
+                page = kimage_alloc_control_pages(image, 0);
+                if (!page) {
+                        result = -ENOMEM;
+                        goto out;
+                }
+                level2p = (u64 *)page_address(page);
+                init_level2_page(level2p, addr);
+                *(level3p++) = __pa(level2p) | L2_ATTR;
+                addr += LEVEL2_SIZE;
+        }
+        /* clear the unused entries */
+        while (addr < end_addr) {
+                *(level3p++) = 0;
+                addr += LEVEL2_SIZE;
+        }
+out:
+        return result;
+}
+
+
+static int init_level4_page(struct kimage *image,
+        u64 *level4p, unsigned long addr, unsigned long last_addr)
+{
+        unsigned long end_addr;
+        int result;
+        result = 0;
+        addr &= PAGE_MASK;
+        end_addr = addr + LEVEL4_SIZE;
+        while ((addr < last_addr) && (addr < end_addr)) {
+                struct page *page;
+                u64 *level3p;
+                page = kimage_alloc_control_pages(image, 0);
+                if (!page) {
+                        result = -ENOMEM;
+                        goto out;
+                }
+                level3p = (u64 *)page_address(page);
+                result = init_level3_page(image, level3p, addr, last_addr);
+                if (result) {
+                        goto out;
+                }
+                *(level4p++) = __pa(level3p) | L3_ATTR;
+                addr += LEVEL3_SIZE;
+        }
+        /* clear the unused entries */
+        while (addr < end_addr) {
+                *(level4p++) = 0;
+                addr += LEVEL3_SIZE;
+        }
+out:
+        return result;
+}
+
+
+static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+{
+        u64 *level4p;
+        level4p = (u64 *)__va(start_pgtable);
+        return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+}
+
+static void set_idt(void *newidt, u16 limit)
+{
+        unsigned char curidt[10];
+
+        /* x86-64 supports unaligned loads & stores */
+        (*(u16 *)(curidt)) = limit;
+        (*(u64 *)(curidt + 2)) = (unsigned long)(newidt);
+
+        __asm__ __volatile__ (
+                "lidt %0\n"
+                : "=m" (curidt)
+                );
+};
+
+
+static void set_gdt(void *newgdt, u16 limit)
+{
+        unsigned char curgdt[10];
+
+        /* x86-64 supports unaligned loads & stores */
+        (*(u16 *)(curgdt)) = limit;
+        (*(u64 *)(curgdt + 2)) = (unsigned long)(newgdt);
+
+        __asm__ __volatile__ (
+                "lgdt %0\n"
+                : "=m" (curgdt)
+                );
+};
+
+static void load_segments(void)
+{
+        __asm__ __volatile__ (
+                "\tmovl $"STR(__KERNEL_DS)",%eax\n"
+                "\tmovl %eax,%ds\n"
+                "\tmovl %eax,%es\n"
+                "\tmovl %eax,%ss\n"
+                "\tmovl %eax,%fs\n"
+                "\tmovl %eax,%gs\n"
+                );
+#undef STR
+#undef __STR
+}
+
+typedef NORET_TYPE void (*relocate_new_kernel_t)(
+        unsigned long indirection_page, unsigned long control_code_buffer,
+        unsigned long start_address, unsigned long pgtable) ATTRIB_NORET;
+
+const extern unsigned char relocate_new_kernel[];
+const extern unsigned long relocate_new_kernel_size;
+
+int machine_kexec_prepare(struct kimage *image)
+{
+        unsigned long start_pgtable, control_code_buffer;
+        int result;
+
+        /* Calculate the offsets */
+        start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+        control_code_buffer = start_pgtable + 4096UL;
+
+        /* Setup the identity mapped 64bit page table */
+        result = init_pgtable(image, start_pgtable);
+        if (result) {
+                return result;
+        }
+
+        /* Place the code in the reboot code buffer */
+        memcpy(__va(control_code_buffer), relocate_new_kernel, relocate_new_kernel_size);
+
+        return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+        return;
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+        unsigned long page_list;
+        unsigned long control_code_buffer;
+        unsigned long start_pgtable;
+        relocate_new_kernel_t rnk;
+
+        /* Interrupts aren't acceptable while we reboot */
+        local_irq_disable();
+
+        /* Calculate the offsets */
+        page_list = image->head;
+        start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+        control_code_buffer = start_pgtable + 4096UL;
+
+        /* Set the low half of the page table to my identity mapped
+         * page table for kexec.  Leave the high half pointing at the
+         * kernel pages.  Don't bother to flush the global pages
+         * as that will happen when I fully switch to my identity mapped
+         * page table anyway.
+         */
+        memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
+        __flush_tlb();
+
+
+        /* The segment registers are funny things: they are
+         * automatically loaded from a table, in memory, whenever you
+         * set them to a specific selector, but this table is never
+         * accessed again unless you set the segment to a different selector.
+         *
+         * The more common model is a cache where the behind-the-scenes
+         * work is done, but the cached state is also dropped at arbitrary
+         * times.
+         *
+         * I take advantage of this here by force loading the
+         * segments, before I zap the gdt with an invalid value.
+         */
+        load_segments();
+        /* The gdt & idt are now invalid.
+         * If you want to load them you must set up your own idt & gdt.
+         */
+        set_gdt(phys_to_virt(0), 0);
+        set_idt(phys_to_virt(0), 0);
+        /* now call it */
+        rnk = (relocate_new_kernel_t) control_code_buffer;
+        (*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
+}
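
The identity mapping built by init_pgtable()/init_level4_page() above covers physical memory with 2 MB (LEVEL1_SIZE) pages, one level-2 table per 1 GB (LEVEL2_SIZE) and one level-3 table per 512 GB (LEVEL3_SIZE).  The following is a small, hypothetical helper (not part of the patch) that only illustrates the index arithmetic implied by those constants:

#include <stdint.h>
#include <stdio.h>

/* Shifts implied by the LEVEL*_SIZE constants in machine_kexec.c */
#define LEVEL1_SHIFT 21   /* 2 MB pages used for the identity map */
#define LEVEL2_SHIFT 30   /* one level-2 table spans 1 GB         */
#define LEVEL3_SHIFT 39   /* one level-3 table spans 512 GB       */

/* For a physical address, show which entry at each level maps it. */
static void show_identity_indices(uint64_t phys)
{
        unsigned l4 = (phys >> LEVEL3_SHIFT) & 0x1ff; /* entry in the top-level table */
        unsigned l3 = (phys >> LEVEL2_SHIFT) & 0x1ff; /* entry in the level-3 table   */
        unsigned l2 = (phys >> LEVEL1_SHIFT) & 0x1ff; /* entry in the level-2 table   */

        printf("%#018llx -> level4[%u] level3[%u] level2[%u] (2 MB page)\n",
               (unsigned long long)phys, l4, l3, l2);
}

int main(void)
{
        show_identity_indices(0x0);          /* first 2 MB of RAM */
        show_identity_indices(0x40000000);   /* the 1 GB boundary */
        return 0;
}
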
diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S
new file mode 100644
index 000000000000..d24fa9b72a2b
--- /dev/null
+++ b/arch/x86_64/kernel/relocate_kernel.S
@@ -0,0 +1,143 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+
+        /*
+         * Must be relocatable PIC code callable as a C function, that once
+         * it starts cannot use the previous process's stack.
+         */
+        .globl relocate_new_kernel
+        .code64
+relocate_new_kernel:
+        /* %rdi page_list
+         * %rsi reboot_code_buffer
+         * %rdx start address
+         * %rcx page_table
+         * %r8  arg5
+         * %r9  arg6
+         */
+
+        /* zero out flags, and disable interrupts */
+        pushq   $0
+        popfq
+
+        /* set a new stack at the bottom of our page... */
+        lea     4096(%rsi), %rsp
+
+        /* store the parameters back on the stack */
+        pushq   %rdx            /* store the start address */
+
+        /* Set cr0 to a known state:
+         * 31 1 == Paging enabled
+         * 18 0 == Alignment check disabled
+         * 16 0 == Write protect disabled
+         *  3 0 == No task switch
+         *  2 0 == Don't do FP software emulation.
+         *  0 1 == Protected mode enabled
+         */
+        movq    %cr0, %rax
+        andq    $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
+        orl     $((1<<31)|(1<<0)), %eax
+        movq    %rax, %cr0
+
+        /* Set cr4 to a known state:
+         * 10 0 == xmm exceptions disabled
+         *  9 0 == xmm registers instructions disabled
+         *  8 0 == performance monitoring counter disabled
+         *  7 0 == page global disabled
+         *  6 0 == machine check exceptions disabled
+         *  5 1 == physical address extension enabled
+         *  4 0 == page size extensions disabled
+         *  3 0 == Debug extensions disabled
+         *  2 0 == Time stamp disable (disabled)
+         *  1 0 == Protected mode virtual interrupts disabled
+         *  0 0 == VME disabled
+         */
+
+        movq    $((1<<5)), %rax
+        movq    %rax, %cr4
+
+        jmp 1f
+1:
+
+        /* Switch to the identity mapped page tables,
+         * and flush the TLB.
+         */
+        movq    %rcx, %cr3
+
+        /* Do the copies */
+        movq    %rdi, %rcx      /* Put the page_list in %rcx */
+        xorq    %rdi, %rdi
+        xorq    %rsi, %rsi
+        jmp     1f
+
+0:      /* top, read another word from the indirection page */
+
+        movq    (%rbx), %rcx
+        addq    $8, %rbx
+1:
+        testq   $0x1, %rcx      /* is it a destination page? */
+        jz      2f
+        movq    %rcx, %rdi
+        andq    $0xfffffffffffff000, %rdi
+        jmp     0b
+2:
+        testq   $0x2, %rcx      /* is it an indirection page? */
+        jz      2f
+        movq    %rcx, %rbx
+        andq    $0xfffffffffffff000, %rbx
+        jmp     0b
+2:
+        testq   $0x4, %rcx      /* is it the done indicator? */
+        jz      2f
+        jmp     3f
+2:
+        testq   $0x8, %rcx      /* is it the source indicator? */
+        jz      0b              /* Ignore it otherwise */
+        movq    %rcx, %rsi      /* For every source page do a copy */
+        andq    $0xfffffffffffff000, %rsi
+
+        movq    $512, %rcx
+        rep ; movsq
+        jmp     0b
+3:
+
+        /* To be certain of avoiding problems with self-modifying code
+         * I need to execute a serializing instruction here.
+         * So I flush the TLB by reloading %cr3 here, it's handy,
+         * and not processor dependent.
+         */
+        movq    %cr3, %rax
+        movq    %rax, %cr3
+
+        /* set all of the registers to known values */
+        /* leave %rsp alone */
+
+        xorq    %rax, %rax
+        xorq    %rbx, %rbx
+        xorq    %rcx, %rcx
+        xorq    %rdx, %rdx
+        xorq    %rsi, %rsi
+        xorq    %rdi, %rdi
+        xorq    %rbp, %rbp
+        xorq    %r8,  %r8
+        xorq    %r9,  %r9
+        xorq    %r10, %r10
+        xorq    %r11, %r11
+        xorq    %r12, %r12
+        xorq    %r13, %r13
+        xorq    %r14, %r14
+        xorq    %r15, %r15
+
+        ret
+relocate_new_kernel_end:
+
+        .globl relocate_new_kernel_size
+relocate_new_kernel_size:
+        .quad relocate_new_kernel_end - relocate_new_kernel
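
The copy loop above walks the kexec indirection page list, dispatching on the low bits of each entry; the flag values match the generic kexec code in include/linux/kexec.h (0x1 destination, 0x2 indirection, 0x4 done, 0x8 source).  Below is a rough, hypothetical C restatement of what the assembly does, for readability only; it assumes 4 KB pages and that the first entry is an indirection pointer, as the generic kexec code arranges when it builds the list.

#include <string.h>

/* Rough C restatement of the relocate_new_kernel copy loop (illustration only). */
static void copy_pages(unsigned long entry)
{
        unsigned long *ind = NULL;      /* %rbx: current indirection page  */
        char *dest = NULL;              /* %rdi: current destination page  */

        for (;;) {
                unsigned long page = entry & ~0xfffUL;

                if (entry & 0x1) {                      /* IND_DESTINATION */
                        dest = (char *)page;
                } else if (entry & 0x2) {               /* IND_INDIRECTION */
                        ind = (unsigned long *)page;
                } else if (entry & 0x4) {               /* IND_DONE        */
                        break;
                } else if (entry & 0x8) {               /* IND_SOURCE      */
                        memcpy(dest, (void *)page, 4096);  /* the 512 movsq */
                        dest += 4096;   /* movsq advances %rdi as it copies */
                }
                entry = *ind++;         /* read the next word               */
        }
}
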
diff --git a/include/asm-x86_64/kexec.h b/include/asm-x86_64/kexec.h
new file mode 100644
index 000000000000..dc33646dc7dd
--- /dev/null
+++ b/include/asm-x86_64/kexec.h
@@ -0,0 +1,28 @@
+#ifndef _X86_64_KEXEC_H
+#define _X86_64_KEXEC_H
+
+#include <asm/page.h>
+#include <asm/proto.h>
+
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT is the maximum page get_free_page can return,
+ * i.e. the highest page that is mapped directly into kernel memory,
+ * so that kmap is not required.
+ *
+ * So far x86_64 is limited to 40 physical address bits.
+ */
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT      (0xFFFFFFFFFFUL)
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+/* Maximum address we can use for the control pages */
+#define KEXEC_CONTROL_MEMORY_LIMIT     (0xFFFFFFFFFFUL)
+
+/* Allocate one page for the pdp and the second for the code */
+#define KEXEC_CONTROL_CODE_SIZE (4096UL + 4096UL)
+
+/* The native architecture */
+#define KEXEC_ARCH KEXEC_ARCH_X86_64
+
+#endif /* _X86_64_KEXEC_H */
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 3c9af6fd4332..d767adcbf0ff 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -552,7 +552,7 @@ __SYSCALL(__NR_mq_notify, sys_mq_notify)
 #define __NR_mq_getsetattr     245
 __SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr)
 #define __NR_kexec_load        246
-__SYSCALL(__NR_kexec_load, sys_ni_syscall)
+__SYSCALL(__NR_kexec_load, sys_kexec_load)
 #define __NR_waitid             247
 __SYSCALL(__NR_waitid, sys_waitid)
 #define __NR_add_key            248