diff options
author | Eric W. Biederman <ebiederm@xmission.com> | 2005-06-25 17:57:56 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-06-25 19:24:49 -0400 |
commit | 5033cba087f6ac773002123aafbea1aad4267682 (patch) | |
tree | fa0301c28c004e81d3aad597f23ea2407db8396c /arch | |
parent | dd2a13054ffc25783a74afb5e4a0f2115e45f9cd (diff) |
[PATCH] kexec: x86 kexec core
This is the i386 implementation of kexec.
Signed-off-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/i386/Kconfig | 17 | ||||
-rw-r--r-- | arch/i386/kernel/Makefile | 1 | ||||
-rw-r--r-- | arch/i386/kernel/crash.c | 42 | ||||
-rw-r--r-- | arch/i386/kernel/machine_kexec.c | 220 | ||||
-rw-r--r-- | arch/i386/kernel/relocate_kernel.S | 120 | ||||
-rw-r--r-- | arch/i386/kernel/syscall_table.S | 2 |
6 files changed, 401 insertions, 1 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 31567f4d333a..0f391cbf116e 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig | |||
@@ -953,6 +953,23 @@ config PHYSICAL_START | |||
953 | 953 | ||
954 | Don't change this unless you know what you are doing. | 954 | Don't change this unless you know what you are doing. |
955 | 955 | ||
956 | config KEXEC | ||
957 | bool "kexec system call (EXPERIMENTAL)" | ||
958 | depends on EXPERIMENTAL | ||
959 | help | ||
960 | kexec is a system call that implements the ability to shut down your | ||
961 | current kernel, and to start another kernel. It is like a reboot | ||
962 | but it is independent of the system firmware. And like a reboot | ||
963 | you can start any kernel with it, not just Linux. | ||
964 | |||
965 | The name comes from the similarity to the exec system call. | ||
966 | |||
967 | It is an ongoing process to be certain the hardware in a machine | ||
968 | is properly shut down, so do not be surprised if this code does not | ||
969 | initially work for you. It may help to enable device hotplugging | ||
970 | support. As of this writing the exact hardware interface is | ||
971 | strongly in flux, so no good recommendation can be made. | ||
972 | |||
956 | endmenu | 973 | endmenu |
957 | 974 | ||
958 | 975 | ||
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 51ecd512603d..4cc83b322b36 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile | |||
@@ -24,6 +24,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o | |||
24 | obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o | 24 | obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o |
25 | obj-$(CONFIG_X86_IO_APIC) += io_apic.o | 25 | obj-$(CONFIG_X86_IO_APIC) += io_apic.o |
26 | obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups.o | 26 | obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups.o |
27 | obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o | ||
27 | obj-$(CONFIG_X86_NUMAQ) += numaq.o | 28 | obj-$(CONFIG_X86_NUMAQ) += numaq.o |
28 | obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o | 29 | obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o |
29 | obj-$(CONFIG_KPROBES) += kprobes.o | 30 | obj-$(CONFIG_KPROBES) += kprobes.o |
diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c new file mode 100644 index 000000000000..fa27a6c2abb6 --- /dev/null +++ b/arch/i386/kernel/crash.c | |||
@@ -0,0 +1,42 @@ | |||
1 | /* | ||
2 | * Architecture specific (i386) functions for kexec based crash dumps. | ||
3 | * | ||
4 | * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) | ||
5 | * | ||
6 | * Copyright (C) IBM Corporation, 2004. All rights reserved. | ||
7 | * | ||
8 | */ | ||
9 | |||
10 | #include <linux/init.h> | ||
11 | #include <linux/types.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/smp.h> | ||
14 | #include <linux/irq.h> | ||
15 | #include <linux/reboot.h> | ||
16 | #include <linux/kexec.h> | ||
17 | #include <linux/irq.h> | ||
18 | #include <linux/delay.h> | ||
19 | #include <linux/elf.h> | ||
20 | #include <linux/elfcore.h> | ||
21 | |||
22 | #include <asm/processor.h> | ||
23 | #include <asm/hardirq.h> | ||
24 | #include <asm/nmi.h> | ||
25 | #include <asm/hw_irq.h> | ||
26 | |||
27 | #define MAX_NOTE_BYTES 1024 | ||
28 | typedef u32 note_buf_t[MAX_NOTE_BYTES/4]; | ||
29 | |||
30 | note_buf_t crash_notes[NR_CPUS]; | ||
31 | |||
32 | void machine_crash_shutdown(void) | ||
33 | { | ||
34 | /* This function is only called after the system | ||
35 | * has panicked or is otherwise in a critical state. | ||
36 | * The minimum amount of code to allow a kexec'd kernel | ||
37 | * to run successfully needs to happen here. | ||
38 | * | ||
39 | * In practice this means shooting down the other cpus in | ||
40 | * an SMP system. | ||
41 | */ | ||
42 | } | ||
diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c new file mode 100644 index 000000000000..671880415d1c --- /dev/null +++ b/arch/i386/kernel/machine_kexec.c | |||
@@ -0,0 +1,220 @@ | |||
1 | /* | ||
2 | * machine_kexec.c - handle transition of Linux booting another kernel | ||
3 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> | ||
4 | * | ||
5 | * This source code is licensed under the GNU General Public License, | ||
6 | * Version 2. See the file COPYING for more details. | ||
7 | */ | ||
8 | |||
9 | #include <linux/mm.h> | ||
10 | #include <linux/kexec.h> | ||
11 | #include <linux/delay.h> | ||
12 | #include <asm/pgtable.h> | ||
13 | #include <asm/pgalloc.h> | ||
14 | #include <asm/tlbflush.h> | ||
15 | #include <asm/mmu_context.h> | ||
16 | #include <asm/io.h> | ||
17 | #include <asm/apic.h> | ||
18 | #include <asm/cpufeature.h> | ||
19 | |||
20 | static inline unsigned long read_cr3(void) | ||
21 | { | ||
22 | unsigned long cr3; | ||
23 | asm volatile("movl %%cr3,%0": "=r"(cr3)); | ||
24 | return cr3; | ||
25 | } | ||
26 | |||
27 | #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) | ||
28 | |||
29 | #define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) | ||
30 | #define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) | ||
31 | #define L2_ATTR (_PAGE_PRESENT) | ||
32 | |||
33 | #define LEVEL0_SIZE (1UL << 12UL) | ||
34 | |||
35 | #ifndef CONFIG_X86_PAE | ||
36 | #define LEVEL1_SIZE (1UL << 22UL) | ||
37 | static u32 pgtable_level1[1024] PAGE_ALIGNED; | ||
38 | |||
39 | static void identity_map_page(unsigned long address) | ||
40 | { | ||
41 | unsigned long level1_index, level2_index; | ||
42 | u32 *pgtable_level2; | ||
43 | |||
44 | /* Find the current page table */ | ||
45 | pgtable_level2 = __va(read_cr3()); | ||
46 | |||
47 | /* Find the indexes of the physical address to identity map */ | ||
48 | level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; | ||
49 | level2_index = address / LEVEL1_SIZE; | ||
50 | |||
51 | /* Identity map the page table entry */ | ||
52 | pgtable_level1[level1_index] = address | L0_ATTR; | ||
53 | pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; | ||
54 | |||
55 | /* Flush the tlb so the new mapping takes effect. | ||
56 | * Global tlb entries are not flushed but that is not an issue. | ||
57 | */ | ||
58 | load_cr3(pgtable_level2); | ||
59 | } | ||
60 | |||
61 | #else | ||
62 | #define LEVEL1_SIZE (1UL << 21UL) | ||
63 | #define LEVEL2_SIZE (1UL << 30UL) | ||
64 | static u64 pgtable_level1[512] PAGE_ALIGNED; | ||
65 | static u64 pgtable_level2[512] PAGE_ALIGNED; | ||
66 | |||
67 | static void identity_map_page(unsigned long address) | ||
68 | { | ||
69 | unsigned long level1_index, level2_index, level3_index; | ||
70 | u64 *pgtable_level3; | ||
71 | |||
72 | /* Find the current page table */ | ||
73 | pgtable_level3 = __va(read_cr3()); | ||
74 | |||
75 | /* Find the indexes of the physical address to identity map */ | ||
76 | level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; | ||
77 | level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE; | ||
78 | level3_index = address / LEVEL2_SIZE; | ||
79 | |||
80 | /* Identity map the page table entry */ | ||
81 | pgtable_level1[level1_index] = address | L0_ATTR; | ||
82 | pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; | ||
83 | set_64bit(&pgtable_level3[level3_index], __pa(pgtable_level2) | L2_ATTR); | ||
84 | |||
85 | /* Flush the tlb so the new mapping takes effect. | ||
86 | * Global tlb entries are not flushed but that is not an issue. | ||
87 | */ | ||
88 | load_cr3(pgtable_level3); | ||
89 | } | ||
90 | #endif | ||
91 | |||
92 | |||
93 | static void set_idt(void *newidt, __u16 limit) | ||
94 | { | ||
95 | unsigned char curidt[6]; /* 6-byte pseudo-descriptor: 16-bit limit, then 32-bit base */ | ||
96 | |||
97 | /* ia32 supports unaligned loads & stores */ | ||
98 | (*(__u16 *)(curidt)) = limit; | ||
99 | (*(__u32 *)(curidt +2)) = (unsigned long)(newidt); | ||
100 | /* lidt only reads the descriptor: pass it as an input so the stores above are kept */ | ||
101 | __asm__ __volatile__ ( | ||
102 | "lidt %0\n" | ||
103 | : : "m" (curidt) | ||
104 | ); | ||
105 | } | ||
106 | |||
107 | |||
108 | static void set_gdt(void *newgdt, __u16 limit) | ||
109 | { | ||
110 | unsigned char curgdt[6]; /* 6-byte pseudo-descriptor: 16-bit limit, then 32-bit base */ | ||
111 | |||
112 | /* ia32 supports unaligned loads & stores */ | ||
113 | (*(__u16 *)(curgdt)) = limit; | ||
114 | (*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt); | ||
115 | /* lgdt only reads the descriptor: pass it as an input so the stores above are kept */ | ||
116 | __asm__ __volatile__ ( | ||
117 | "lgdt %0\n" | ||
118 | : : "m" (curgdt) | ||
119 | ); | ||
120 | } | ||
121 | |||
122 | static void load_segments(void) | ||
123 | { | ||
124 | #define __STR(X) #X | ||
125 | #define STR(X) __STR(X) | ||
126 | |||
127 | __asm__ __volatile__ ( | ||
128 | "\tljmp $"STR(__KERNEL_CS)",$1f\n" | ||
129 | "\t1:\n" | ||
130 | "\tmovl $"STR(__KERNEL_DS)",%eax\n" | ||
131 | "\tmovl %eax,%ds\n" | ||
132 | "\tmovl %eax,%es\n" | ||
133 | "\tmovl %eax,%fs\n" | ||
134 | "\tmovl %eax,%gs\n" | ||
135 | "\tmovl %eax,%ss\n" | ||
136 | ); | ||
137 | #undef STR | ||
138 | #undef __STR | ||
139 | } | ||
140 | |||
141 | typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)( | ||
142 | unsigned long indirection_page, unsigned long reboot_code_buffer, | ||
143 | unsigned long start_address, unsigned int has_pae) ATTRIB_NORET; | ||
144 | |||
145 | const extern unsigned char relocate_new_kernel[]; | ||
146 | extern void relocate_new_kernel_end(void); | ||
147 | const extern unsigned int relocate_new_kernel_size; | ||
148 | |||
149 | /* | ||
150 | * An architecture hook called to validate the | ||
151 | * proposed image and prepare the control pages | ||
152 | * as needed. The pages for KEXEC_CONTROL_CODE_SIZE | ||
153 | * have been allocated, but the segments have not yet | ||
154 | * been copied into the kernel. | ||
155 | * | ||
156 | * Do whatever setup is needed on image and the | ||
157 | * reboot code buffer to allow us to avoid allocations | ||
158 | * later. | ||
159 | * | ||
160 | * Currently nothing. | ||
161 | */ | ||
162 | int machine_kexec_prepare(struct kimage *image) | ||
163 | { | ||
164 | return 0; | ||
165 | } | ||
166 | |||
167 | /* | ||
168 | * Undo anything leftover by machine_kexec_prepare | ||
169 | * when an image is freed. | ||
170 | */ | ||
171 | void machine_kexec_cleanup(struct kimage *image) | ||
172 | { | ||
173 | } | ||
174 | |||
175 | /* | ||
176 | * Do not allocate memory (or fail in any way) in machine_kexec(). | ||
177 | * We are past the point of no return, committed to rebooting now. | ||
178 | */ | ||
179 | NORET_TYPE void machine_kexec(struct kimage *image) | ||
180 | { | ||
181 | unsigned long page_list; | ||
182 | unsigned long reboot_code_buffer; | ||
183 | relocate_new_kernel_t rnk; | ||
184 | |||
185 | /* Interrupts aren't acceptable while we reboot */ | ||
186 | local_irq_disable(); | ||
187 | |||
188 | /* Compute some offsets */ | ||
189 | reboot_code_buffer = page_to_pfn(image->control_code_page) << PAGE_SHIFT; | ||
190 | page_list = image->head; | ||
191 | |||
192 | /* Set up an identity mapping for the reboot_code_buffer */ | ||
193 | identity_map_page(reboot_code_buffer); | ||
194 | |||
195 | /* copy it out */ | ||
196 | memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size); | ||
197 | |||
198 | /* The segment registers are funny things: they are | ||
199 | * automatically loaded from a table in memory whenever you | ||
200 | * set them to a specific selector, but this table is never | ||
201 | * accessed again until you set the segment to a different selector. | ||
202 | * | ||
203 | * The more common model is a cache, where the behind-the-scenes | ||
204 | * work is done, but is also dropped at arbitrary | ||
205 | * times. | ||
206 | * | ||
207 | * I take advantage of this here by force loading the | ||
208 | * segments, before I zap the gdt with an invalid value. | ||
209 | */ | ||
210 | load_segments(); | ||
211 | /* The gdt & idt are now invalid. | ||
212 | * If you want to load them you must set up your own idt & gdt. | ||
213 | */ | ||
214 | set_gdt(phys_to_virt(0),0); | ||
215 | set_idt(phys_to_virt(0),0); | ||
216 | |||
217 | /* now call it */ | ||
218 | rnk = (relocate_new_kernel_t) reboot_code_buffer; | ||
219 | (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae); | ||
220 | } | ||
diff --git a/arch/i386/kernel/relocate_kernel.S b/arch/i386/kernel/relocate_kernel.S new file mode 100644 index 000000000000..d312616effa1 --- /dev/null +++ b/arch/i386/kernel/relocate_kernel.S | |||
@@ -0,0 +1,120 @@ | |||
1 | /* | ||
2 | * relocate_kernel.S - put the kernel image in place to boot | ||
3 | * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> | ||
4 | * | ||
5 | * This source code is licensed under the GNU General Public License, | ||
6 | * Version 2. See the file COPYING for more details. | ||
7 | */ | ||
8 | |||
9 | #include <linux/linkage.h> | ||
10 | |||
11 | /* | ||
12 | * Must be relocatable PIC code callable as a C function, that once | ||
13 | * it starts can not use the previous process's stack. | ||
14 | */ | ||
15 | .globl relocate_new_kernel | ||
16 | relocate_new_kernel: | ||
17 | /* read the arguments and say goodbye to the stack */ | ||
18 | movl 4(%esp), %ebx /* page_list */ | ||
19 | movl 8(%esp), %ebp /* reboot_code_buffer */ | ||
20 | movl 12(%esp), %edx /* start address */ | ||
21 | movl 16(%esp), %ecx /* cpu_has_pae */ | ||
22 | |||
23 | /* zero out flags, and disable interrupts */ | ||
24 | pushl $0 | ||
25 | popfl | ||
26 | |||
27 | /* set a new stack at the bottom of our page... */ | ||
28 | lea 4096(%ebp), %esp | ||
29 | |||
30 | /* store the parameters back on the stack */ | ||
31 | pushl %edx /* store the start address */ | ||
32 | |||
33 | /* Set cr0 to a known state: | ||
34 | * 31 0 == Paging disabled | ||
35 | * 18 0 == Alignment check disabled | ||
36 | * 16 0 == Write protect disabled | ||
37 | * 3 0 == No task switch | ||
38 | * 2 0 == Don't do FP software emulation. | ||
39 | * 0 1 == Protected mode enabled | ||
40 | */ | ||
41 | movl %cr0, %eax | ||
42 | andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax | ||
43 | orl $(1<<0), %eax | ||
44 | movl %eax, %cr0 | ||
45 | |||
46 | /* clear cr4 if applicable */ | ||
47 | testl %ecx, %ecx | ||
48 | jz 1f | ||
49 | /* Set cr4 to a known state: | ||
50 | * Setting everything to zero seems safe. | ||
51 | */ | ||
52 | movl %cr4, %eax | ||
53 | andl $0, %eax | ||
54 | movl %eax, %cr4 | ||
55 | |||
56 | jmp 1f | ||
57 | 1: | ||
58 | |||
59 | /* Flush the TLB (needed?) */ | ||
60 | xorl %eax, %eax | ||
61 | movl %eax, %cr3 | ||
62 | |||
63 | /* Do the copies */ | ||
64 | movl %ebx, %ecx | ||
65 | jmp 1f | ||
66 | |||
67 | 0: /* top, read another word from the indirection page */ | ||
68 | movl (%ebx), %ecx | ||
69 | addl $4, %ebx | ||
70 | 1: | ||
71 | testl $0x1, %ecx /* is it a destination page */ | ||
72 | jz 2f | ||
73 | movl %ecx, %edi | ||
74 | andl $0xfffff000, %edi | ||
75 | jmp 0b | ||
76 | 2: | ||
77 | testl $0x2, %ecx /* is it an indirection page */ | ||
78 | jz 2f | ||
79 | movl %ecx, %ebx | ||
80 | andl $0xfffff000, %ebx | ||
81 | jmp 0b | ||
82 | 2: | ||
83 | testl $0x4, %ecx /* is it the done indicator */ | ||
84 | jz 2f | ||
85 | jmp 3f | ||
86 | 2: | ||
87 | testl $0x8, %ecx /* is it the source indicator */ | ||
88 | jz 0b /* Ignore it otherwise */ | ||
89 | movl %ecx, %esi /* For every source page do a copy */ | ||
90 | andl $0xfffff000, %esi | ||
91 | |||
92 | movl $1024, %ecx | ||
93 | rep ; movsl | ||
94 | jmp 0b | ||
95 | |||
96 | 3: | ||
97 | |||
98 | /* To be certain of avoiding problems with self-modifying code | ||
99 | * I need to execute a serializing instruction here. | ||
100 | * So I flush the TLB, it's handy, and not processor dependent. | ||
101 | */ | ||
102 | xorl %eax, %eax | ||
103 | movl %eax, %cr3 | ||
104 | |||
105 | /* set all of the registers to known values */ | ||
106 | /* leave %esp alone */ | ||
107 | |||
108 | xorl %eax, %eax | ||
109 | xorl %ebx, %ebx | ||
110 | xorl %ecx, %ecx | ||
111 | xorl %edx, %edx | ||
112 | xorl %esi, %esi | ||
113 | xorl %edi, %edi | ||
114 | xorl %ebp, %ebp | ||
115 | ret | ||
116 | relocate_new_kernel_end: | ||
117 | |||
118 | .globl relocate_new_kernel_size | ||
119 | relocate_new_kernel_size: | ||
120 | .long relocate_new_kernel_end - relocate_new_kernel | ||
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index d408afaf6495..442a6e937b19 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S | |||
@@ -283,7 +283,7 @@ ENTRY(sys_call_table) | |||
283 | .long sys_mq_timedreceive /* 280 */ | 283 | .long sys_mq_timedreceive /* 280 */ |
284 | .long sys_mq_notify | 284 | .long sys_mq_notify |
285 | .long sys_mq_getsetattr | 285 | .long sys_mq_getsetattr |
286 | .long sys_ni_syscall /* reserved for kexec */ | 286 | .long sys_kexec_load |
287 | .long sys_waitid | 287 | .long sys_waitid |
288 | .long sys_ni_syscall /* 285 */ /* available */ | 288 | .long sys_ni_syscall /* 285 */ /* available */ |
289 | .long sys_add_key | 289 | .long sys_add_key |