aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorEric W. Biederman <ebiederm@xmission.com>2005-06-25 17:57:56 -0400
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-06-25 19:24:49 -0400
commit5033cba087f6ac773002123aafbea1aad4267682 (patch)
treefa0301c28c004e81d3aad597f23ea2407db8396c /arch
parentdd2a13054ffc25783a74afb5e4a0f2115e45f9cd (diff)
[PATCH] kexec: x86 kexec core
This is the i386 implementation of kexec. Signed-off-by: Eric Biederman <ebiederm@xmission.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch')
-rw-r--r--arch/i386/Kconfig17
-rw-r--r--arch/i386/kernel/Makefile1
-rw-r--r--arch/i386/kernel/crash.c42
-rw-r--r--arch/i386/kernel/machine_kexec.c220
-rw-r--r--arch/i386/kernel/relocate_kernel.S120
-rw-r--r--arch/i386/kernel/syscall_table.S2
6 files changed, 401 insertions, 1 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 31567f4d333..0f391cbf116 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -953,6 +953,23 @@ config PHYSICAL_START
953 953
954 Don't change this unless you know what you are doing. 954 Don't change this unless you know what you are doing.
955 955
956config KEXEC
957 bool "kexec system call (EXPERIMENTAL)"
958 depends on EXPERIMENTAL
959 help
960 kexec is a system call that implements the ability to shutdown your
961 current kernel, and to start another kernel. It is like a reboot
962 but it is indepedent of the system firmware. And like a reboot
963 you can start any kernel with it, not just Linux.
964
965 The name comes from the similiarity to the exec system call.
966
967 It is an ongoing process to be certain the hardware in a machine
968 is properly shutdown, so do not be surprised if this code does not
969 initially work for you. It may help to enable device hotplugging
970 support. As of this writing the exact hardware interface is
971 strongly in flux, so no good recommendation can be made.
972
956endmenu 973endmenu
957 974
958 975
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 51ecd512603..4cc83b322b3 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o
24obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o 24obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
25obj-$(CONFIG_X86_IO_APIC) += io_apic.o 25obj-$(CONFIG_X86_IO_APIC) += io_apic.o
26obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups.o 26obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups.o
27obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
27obj-$(CONFIG_X86_NUMAQ) += numaq.o 28obj-$(CONFIG_X86_NUMAQ) += numaq.o
28obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o 29obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o
29obj-$(CONFIG_KPROBES) += kprobes.o 30obj-$(CONFIG_KPROBES) += kprobes.o
diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
new file mode 100644
index 00000000000..fa27a6c2abb
--- /dev/null
+++ b/arch/i386/kernel/crash.c
@@ -0,0 +1,42 @@
1/*
2 * Architecture specific (i386) functions for kexec based crash dumps.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 *
6 * Copyright (C) IBM Corporation, 2004. All rights reserved.
7 *
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/smp.h>
14#include <linux/irq.h>
15#include <linux/reboot.h>
16#include <linux/kexec.h>
17#include <linux/irq.h>
18#include <linux/delay.h>
19#include <linux/elf.h>
20#include <linux/elfcore.h>
21
22#include <asm/processor.h>
23#include <asm/hardirq.h>
24#include <asm/nmi.h>
25#include <asm/hw_irq.h>
26
27#define MAX_NOTE_BYTES 1024
28typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
29
30note_buf_t crash_notes[NR_CPUS];
31
32void machine_crash_shutdown(void)
33{
34 /* This function is only called after the system
35 * has paniced or is otherwise in a critical state.
36 * The minimum amount of code to allow a kexec'd kernel
37 * to run successfully needs to happen here.
38 *
39 * In practice this means shooting down the other cpus in
40 * an SMP system.
41 */
42}
diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c
new file mode 100644
index 00000000000..671880415d1
--- /dev/null
+++ b/arch/i386/kernel/machine_kexec.c
@@ -0,0 +1,220 @@
1/*
2 * machine_kexec.c - handle transition of Linux booting another kernel
3 * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/mm.h>
10#include <linux/kexec.h>
11#include <linux/delay.h>
12#include <asm/pgtable.h>
13#include <asm/pgalloc.h>
14#include <asm/tlbflush.h>
15#include <asm/mmu_context.h>
16#include <asm/io.h>
17#include <asm/apic.h>
18#include <asm/cpufeature.h>
19
20static inline unsigned long read_cr3(void)
21{
22 unsigned long cr3;
23 asm volatile("movl %%cr3,%0": "=r"(cr3));
24 return cr3;
25}
26
27#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
28
29#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
30#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
31#define L2_ATTR (_PAGE_PRESENT)
32
33#define LEVEL0_SIZE (1UL << 12UL)
34
35#ifndef CONFIG_X86_PAE
36#define LEVEL1_SIZE (1UL << 22UL)
37static u32 pgtable_level1[1024] PAGE_ALIGNED;
38
39static void identity_map_page(unsigned long address)
40{
41 unsigned long level1_index, level2_index;
42 u32 *pgtable_level2;
43
44 /* Find the current page table */
45 pgtable_level2 = __va(read_cr3());
46
47 /* Find the indexes of the physical address to identity map */
48 level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
49 level2_index = address / LEVEL1_SIZE;
50
51 /* Identity map the page table entry */
52 pgtable_level1[level1_index] = address | L0_ATTR;
53 pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
54
55 /* Flush the tlb so the new mapping takes effect.
56 * Global tlb entries are not flushed but that is not an issue.
57 */
58 load_cr3(pgtable_level2);
59}
60
61#else
62#define LEVEL1_SIZE (1UL << 21UL)
63#define LEVEL2_SIZE (1UL << 30UL)
64static u64 pgtable_level1[512] PAGE_ALIGNED;
65static u64 pgtable_level2[512] PAGE_ALIGNED;
66
67static void identity_map_page(unsigned long address)
68{
69 unsigned long level1_index, level2_index, level3_index;
70 u64 *pgtable_level3;
71
72 /* Find the current page table */
73 pgtable_level3 = __va(read_cr3());
74
75 /* Find the indexes of the physical address to identity map */
76 level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
77 level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
78 level3_index = address / LEVEL2_SIZE;
79
80 /* Identity map the page table entry */
81 pgtable_level1[level1_index] = address | L0_ATTR;
82 pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
83 set_64bit(&pgtable_level3[level3_index], __pa(pgtable_level2) | L2_ATTR);
84
85 /* Flush the tlb so the new mapping takes effect.
86 * Global tlb entries are not flushed but that is not an issue.
87 */
88 load_cr3(pgtable_level3);
89}
90#endif
91
92
93static void set_idt(void *newidt, __u16 limit)
94{
95 unsigned char curidt[6];
96
97 /* ia32 supports unaliged loads & stores */
98 (*(__u16 *)(curidt)) = limit;
99 (*(__u32 *)(curidt +2)) = (unsigned long)(newidt);
100
101 __asm__ __volatile__ (
102 "lidt %0\n"
103 : "=m" (curidt)
104 );
105};
106
107
108static void set_gdt(void *newgdt, __u16 limit)
109{
110 unsigned char curgdt[6];
111
112 /* ia32 supports unaligned loads & stores */
113 (*(__u16 *)(curgdt)) = limit;
114 (*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt);
115
116 __asm__ __volatile__ (
117 "lgdt %0\n"
118 : "=m" (curgdt)
119 );
120};
121
122static void load_segments(void)
123{
124#define __STR(X) #X
125#define STR(X) __STR(X)
126
127 __asm__ __volatile__ (
128 "\tljmp $"STR(__KERNEL_CS)",$1f\n"
129 "\t1:\n"
130 "\tmovl $"STR(__KERNEL_DS)",%eax\n"
131 "\tmovl %eax,%ds\n"
132 "\tmovl %eax,%es\n"
133 "\tmovl %eax,%fs\n"
134 "\tmovl %eax,%gs\n"
135 "\tmovl %eax,%ss\n"
136 );
137#undef STR
138#undef __STR
139}
140
141typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
142 unsigned long indirection_page, unsigned long reboot_code_buffer,
143 unsigned long start_address, unsigned int has_pae) ATTRIB_NORET;
144
145const extern unsigned char relocate_new_kernel[];
146extern void relocate_new_kernel_end(void);
147const extern unsigned int relocate_new_kernel_size;
148
149/*
150 * A architecture hook called to validate the
151 * proposed image and prepare the control pages
152 * as needed. The pages for KEXEC_CONTROL_CODE_SIZE
153 * have been allocated, but the segments have yet
154 * been copied into the kernel.
155 *
156 * Do what every setup is needed on image and the
157 * reboot code buffer to allow us to avoid allocations
158 * later.
159 *
160 * Currently nothing.
161 */
162int machine_kexec_prepare(struct kimage *image)
163{
164 return 0;
165}
166
167/*
168 * Undo anything leftover by machine_kexec_prepare
169 * when an image is freed.
170 */
171void machine_kexec_cleanup(struct kimage *image)
172{
173}
174
175/*
176 * Do not allocate memory (or fail in any way) in machine_kexec().
177 * We are past the point of no return, committed to rebooting now.
178 */
179NORET_TYPE void machine_kexec(struct kimage *image)
180{
181 unsigned long page_list;
182 unsigned long reboot_code_buffer;
183 relocate_new_kernel_t rnk;
184
185 /* Interrupts aren't acceptable while we reboot */
186 local_irq_disable();
187
188 /* Compute some offsets */
189 reboot_code_buffer = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
190 page_list = image->head;
191
192 /* Set up an identity mapping for the reboot_code_buffer */
193 identity_map_page(reboot_code_buffer);
194
195 /* copy it out */
196 memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size);
197
198 /* The segment registers are funny things, they are
199 * automatically loaded from a table, in memory wherever you
200 * set them to a specific selector, but this table is never
201 * accessed again you set the segment to a different selector.
202 *
203 * The more common model is are caches where the behide
204 * the scenes work is done, but is also dropped at arbitrary
205 * times.
206 *
207 * I take advantage of this here by force loading the
208 * segments, before I zap the gdt with an invalid value.
209 */
210 load_segments();
211 /* The gdt & idt are now invalid.
212 * If you want to load them you must set up your own idt & gdt.
213 */
214 set_gdt(phys_to_virt(0),0);
215 set_idt(phys_to_virt(0),0);
216
217 /* now call it */
218 rnk = (relocate_new_kernel_t) reboot_code_buffer;
219 (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
220}
diff --git a/arch/i386/kernel/relocate_kernel.S b/arch/i386/kernel/relocate_kernel.S
new file mode 100644
index 00000000000..d312616effa
--- /dev/null
+++ b/arch/i386/kernel/relocate_kernel.S
@@ -0,0 +1,120 @@
1/*
2 * relocate_kernel.S - put the kernel image in place to boot
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/linkage.h>
10
11 /*
12 * Must be relocatable PIC code callable as a C function, that once
13 * it starts can not use the previous processes stack.
14 */
15 .globl relocate_new_kernel
16relocate_new_kernel:
17 /* read the arguments and say goodbye to the stack */
18 movl 4(%esp), %ebx /* page_list */
19 movl 8(%esp), %ebp /* reboot_code_buffer */
20 movl 12(%esp), %edx /* start address */
21 movl 16(%esp), %ecx /* cpu_has_pae */
22
23 /* zero out flags, and disable interrupts */
24 pushl $0
25 popfl
26
27 /* set a new stack at the bottom of our page... */
28 lea 4096(%ebp), %esp
29
30 /* store the parameters back on the stack */
31 pushl %edx /* store the start address */
32
33 /* Set cr0 to a known state:
34 * 31 0 == Paging disabled
35 * 18 0 == Alignment check disabled
36 * 16 0 == Write protect disabled
37 * 3 0 == No task switch
38 * 2 0 == Don't do FP software emulation.
39 * 0 1 == Proctected mode enabled
40 */
41 movl %cr0, %eax
42 andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
43 orl $(1<<0), %eax
44 movl %eax, %cr0
45
46 /* clear cr4 if applicable */
47 testl %ecx, %ecx
48 jz 1f
49 /* Set cr4 to a known state:
50 * Setting everything to zero seems safe.
51 */
52 movl %cr4, %eax
53 andl $0, %eax
54 movl %eax, %cr4
55
56 jmp 1f
571:
58
59 /* Flush the TLB (needed?) */
60 xorl %eax, %eax
61 movl %eax, %cr3
62
63 /* Do the copies */
64 movl %ebx, %ecx
65 jmp 1f
66
670: /* top, read another word from the indirection page */
68 movl (%ebx), %ecx
69 addl $4, %ebx
701:
71 testl $0x1, %ecx /* is it a destination page */
72 jz 2f
73 movl %ecx, %edi
74 andl $0xfffff000, %edi
75 jmp 0b
762:
77 testl $0x2, %ecx /* is it an indirection page */
78 jz 2f
79 movl %ecx, %ebx
80 andl $0xfffff000, %ebx
81 jmp 0b
822:
83 testl $0x4, %ecx /* is it the done indicator */
84 jz 2f
85 jmp 3f
862:
87 testl $0x8, %ecx /* is it the source indicator */
88 jz 0b /* Ignore it otherwise */
89 movl %ecx, %esi /* For every source page do a copy */
90 andl $0xfffff000, %esi
91
92 movl $1024, %ecx
93 rep ; movsl
94 jmp 0b
95
963:
97
98 /* To be certain of avoiding problems with self-modifying code
99 * I need to execute a serializing instruction here.
100 * So I flush the TLB, it's handy, and not processor dependent.
101 */
102 xorl %eax, %eax
103 movl %eax, %cr3
104
105 /* set all of the registers to known values */
106 /* leave %esp alone */
107
108 xorl %eax, %eax
109 xorl %ebx, %ebx
110 xorl %ecx, %ecx
111 xorl %edx, %edx
112 xorl %esi, %esi
113 xorl %edi, %edi
114 xorl %ebp, %ebp
115 ret
116relocate_new_kernel_end:
117
118 .globl relocate_new_kernel_size
119relocate_new_kernel_size:
120 .long relocate_new_kernel_end - relocate_new_kernel
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index d408afaf649..442a6e937b1 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -283,7 +283,7 @@ ENTRY(sys_call_table)
283 .long sys_mq_timedreceive /* 280 */ 283 .long sys_mq_timedreceive /* 280 */
284 .long sys_mq_notify 284 .long sys_mq_notify
285 .long sys_mq_getsetattr 285 .long sys_mq_getsetattr
286 .long sys_ni_syscall /* reserved for kexec */ 286 .long sys_kexec_load
287 .long sys_waitid 287 .long sys_waitid
288 .long sys_ni_syscall /* 285 */ /* available */ 288 .long sys_ni_syscall /* 285 */ /* available */
289 .long sys_add_key 289 .long sys_add_key