author		Eric W. Biederman <ebiederm@xmission.com>	2005-06-25 17:58:02 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-06-25 19:24:50 -0400
commit		5234f5eb04abbbfa306ccfbc2ccbb6e73f515b15
tree		60f4701fdc501955ccff198f84913b96e3bbf5bf	/arch/x86_64/kernel
parent		d89559589a588d1a654329d8cd9a3ad33aaad9be
[PATCH] kexec: x86_64 kexec implementation
This is the x86_64 implementation of machine kexec.  32-bit compatibility
support has been implemented, and machine_kexec has been enhanced so that it
no longer cares about the changing internal kernel page table structures.
From: Alexander Nyberg <alexn@dsv.su.se>
build fix
Signed-off-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
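
For context only, and not part of this patch: the machine_kexec() path added
below is normally driven from userspace by the kexec-tools utility.  An image
is first staged with the kexec_load() syscall (e.g. "kexec -l <kernel>") and
then booted by asking sys_reboot() for a kexec reboot.  A minimal, hedged
sketch of that last step, assuming an image has already been loaded:

	/* Userspace sketch, illustration only -- not part of this patch.
	 * Assumes a kexec image was staged with kexec_load(), e.g. via
	 * "kexec -l <kernel>", and that the caller has CAP_SYS_BOOT. */
	#include <stdio.h>
	#include <sys/reboot.h>
	#include <linux/reboot.h>

	int main(void)
	{
		/* glibc's reboot() supplies the LINUX_REBOOT_MAGIC values. */
		if (reboot(LINUX_REBOOT_CMD_KEXEC) < 0) {
			perror("reboot(LINUX_REBOOT_CMD_KEXEC)");
			return 1;
		}
		return 0;	/* not reached when the kexec succeeds */
	}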
Diffstat (limited to 'arch/x86_64/kernel')
-rw-r--r--	arch/x86_64/kernel/Makefile		  1
-rw-r--r--	arch/x86_64/kernel/crash.c		 38
-rw-r--r--	arch/x86_64/kernel/machine_kexec.c	245
-rw-r--r--	arch/x86_64/kernel/relocate_kernel.S	143
4 files changed, 427 insertions, 0 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 5ca4a4598fda..48f9e2c19cd6 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o mpparse.o \
 		genapic.o genapic_cluster.o genapic_flat.o
+obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o crash.o
 obj-$(CONFIG_PM)		+= suspend.o
 obj-$(CONFIG_SOFTWARE_SUSPEND)	+= suspend_asm.o
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
new file mode 100644
index 000000000000..7caf8a49d0cb
--- /dev/null
+++ b/arch/x86_64/kernel/crash.c
@@ -0,0 +1,38 @@
+/*
+ * Architecture specific (x86_64) functions for kexec based crash dumps.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *
+ * Copyright (C) IBM Corporation, 2004. All rights reserved.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+
+#include <asm/processor.h>
+#include <asm/hardirq.h>
+#include <asm/nmi.h>
+#include <asm/hw_irq.h>
+
+#define MAX_NOTE_BYTES 1024
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+
+note_buf_t crash_notes[NR_CPUS];
+
+void machine_crash_shutdown(void)
+{
+	/* This function is only called after the system
+	 * has panicked or is otherwise in a critical state.
+	 * The minimum amount of code to allow a kexec'd kernel
+	 * to run successfully needs to happen here.
+	 *
+	 * In practice this means shooting down the other cpus in
+	 * an SMP system.
+	 */
+}
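
The hook above is deliberately left empty by this patch.  For illustration
only, a minimal machine_crash_shutdown() along the lines the comment describes
might disable interrupts on the crashing CPU and ask the remaining CPUs to
stop.  A hedged sketch using generic kernel primitives, not code from this
patch:

	/* Hedged sketch only -- not part of this patch.  Illustrates the
	 * "shoot down the other cpus" step the comment above refers to. */
	void machine_crash_shutdown(void)
	{
		/* The crashing CPU must not be interrupted while it hands
		 * control over to the capture kernel. */
		local_irq_disable();

		/* Ask the remaining CPUs to stop.  A production version
		 * would typically use NMIs so wedged CPUs are caught too. */
		smp_send_stop();
	}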
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
new file mode 100644
index 000000000000..200b5993f8d9
--- /dev/null
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -0,0 +1,245 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/reboot.h>
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/hw_irq.h>
+
+#define LEVEL0_SIZE (1UL << 12UL)
+#define LEVEL1_SIZE (1UL << 21UL)
+#define LEVEL2_SIZE (1UL << 30UL)
+#define LEVEL3_SIZE (1UL << 39UL)
+#define LEVEL4_SIZE (1UL << 48UL)
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE)
+#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+static void init_level2_page(
+	u64 *level2p, unsigned long addr)
+{
+	unsigned long end_addr;
+	addr &= PAGE_MASK;
+	end_addr = addr + LEVEL2_SIZE;
+	while (addr < end_addr) {
+		*(level2p++) = addr | L1_ATTR;
+		addr += LEVEL1_SIZE;
+	}
+}
+
+static int init_level3_page(struct kimage *image,
+	u64 *level3p, unsigned long addr, unsigned long last_addr)
+{
+	unsigned long end_addr;
+	int result;
+	result = 0;
+	addr &= PAGE_MASK;
+	end_addr = addr + LEVEL3_SIZE;
+	while ((addr < last_addr) && (addr < end_addr)) {
+		struct page *page;
+		u64 *level2p;
+		page = kimage_alloc_control_pages(image, 0);
+		if (!page) {
+			result = -ENOMEM;
+			goto out;
+		}
+		level2p = (u64 *)page_address(page);
+		init_level2_page(level2p, addr);
+		*(level3p++) = __pa(level2p) | L2_ATTR;
+		addr += LEVEL2_SIZE;
+	}
+	/* clear the unused entries */
+	while (addr < end_addr) {
+		*(level3p++) = 0;
+		addr += LEVEL2_SIZE;
+	}
+out:
+	return result;
+}
+
+
+static int init_level4_page(struct kimage *image,
+	u64 *level4p, unsigned long addr, unsigned long last_addr)
+{
+	unsigned long end_addr;
+	int result;
+	result = 0;
+	addr &= PAGE_MASK;
+	end_addr = addr + LEVEL4_SIZE;
+	while ((addr < last_addr) && (addr < end_addr)) {
+		struct page *page;
+		u64 *level3p;
+		page = kimage_alloc_control_pages(image, 0);
+		if (!page) {
+			result = -ENOMEM;
+			goto out;
+		}
+		level3p = (u64 *)page_address(page);
+		result = init_level3_page(image, level3p, addr, last_addr);
+		if (result) {
+			goto out;
+		}
+		*(level4p++) = __pa(level3p) | L3_ATTR;
+		addr += LEVEL3_SIZE;
+	}
+	/* clear the unused entries */
+	while (addr < end_addr) {
+		*(level4p++) = 0;
+		addr += LEVEL3_SIZE;
+	}
+out:
+	return result;
+}
+
+
+static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+{
+	u64 *level4p;
+	level4p = (u64 *)__va(start_pgtable);
+	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+}
+
+static void set_idt(void *newidt, u16 limit)
+{
+	unsigned char curidt[10];
+
+	/* x86-64 supports unaligned loads & stores */
+	(*(u16 *)(curidt)) = limit;
+	(*(u64 *)(curidt + 2)) = (unsigned long)(newidt);
+
+	__asm__ __volatile__ (
+		"lidt %0\n"
+		: "=m" (curidt)
+		);
+}
+
+
+static void set_gdt(void *newgdt, u16 limit)
+{
+	unsigned char curgdt[10];
+
+	/* x86-64 supports unaligned loads & stores */
+	(*(u16 *)(curgdt)) = limit;
+	(*(u64 *)(curgdt + 2)) = (unsigned long)(newgdt);
+
+	__asm__ __volatile__ (
+		"lgdt %0\n"
+		: "=m" (curgdt)
+		);
+}
+
+static void load_segments(void)
+{
+	__asm__ __volatile__ (
+		"\tmovl $"STR(__KERNEL_DS)",%eax\n"
+		"\tmovl %eax,%ds\n"
+		"\tmovl %eax,%es\n"
+		"\tmovl %eax,%ss\n"
+		"\tmovl %eax,%fs\n"
+		"\tmovl %eax,%gs\n"
+		);
+#undef STR
+#undef __STR
+}
+
+typedef NORET_TYPE void (*relocate_new_kernel_t)(
+	unsigned long indirection_page, unsigned long control_code_buffer,
+	unsigned long start_address, unsigned long pgtable) ATTRIB_NORET;
+
+const extern unsigned char relocate_new_kernel[];
+const extern unsigned long relocate_new_kernel_size;
+
+int machine_kexec_prepare(struct kimage *image)
+{
+	unsigned long start_pgtable, control_code_buffer;
+	int result;
+
+	/* Calculate the offsets */
+	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+	control_code_buffer = start_pgtable + 4096UL;
+
+	/* Setup the identity mapped 64bit page table */
+	result = init_pgtable(image, start_pgtable);
+	if (result) {
+		return result;
+	}
+
+	/* Place the code in the reboot code buffer */
+	memcpy(__va(control_code_buffer), relocate_new_kernel, relocate_new_kernel_size);
+
+	return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+	return;
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+	unsigned long page_list;
+	unsigned long control_code_buffer;
+	unsigned long start_pgtable;
+	relocate_new_kernel_t rnk;
+
+	/* Interrupts aren't acceptable while we reboot */
+	local_irq_disable();
+
+	/* Calculate the offsets */
+	page_list = image->head;
+	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+	control_code_buffer = start_pgtable + 4096UL;
+
+	/* Set the low half of the page table to my identity mapped
+	 * page table for kexec.  Leave the high half pointing at the
+	 * kernel pages.  Don't bother to flush the global pages
+	 * as that will happen when I fully switch to my identity mapped
+	 * page table anyway.
+	 */
+	memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
+	__flush_tlb();
+
+
+	/* The segment registers are funny things: they are
+	 * automatically loaded from a table in memory whenever you
+	 * set them to a specific selector, but this table is never
+	 * accessed again unless you set the segment to a different selector.
+	 *
+	 * The more common model is a cache, where the behind-the-scenes
+	 * work is done but the cached contents are also dropped at
+	 * arbitrary times.
+	 *
+	 * I take advantage of this here by force loading the
+	 * segments, before I zap the gdt with an invalid value.
+	 */
+	load_segments();
+	/* The gdt & idt are now invalid.
+	 * If you want to load them you must set up your own idt & gdt.
+	 */
+	set_gdt(phys_to_virt(0), 0);
+	set_idt(phys_to_virt(0), 0);
+	/* now call it */
+	rnk = (relocate_new_kernel_t) control_code_buffer;
+	(*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
+}
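
As an aside (not part of the patch): the LEVELn_SIZE constants above encode
the reach of each level of the 4-level x86_64 page table, which is what
init_level{2,3,4}_page() iterate over.  Each table holds 512 eight-byte
entries, and the level-2 entries carry _PAGE_PSE, so every entry maps a 2 MB
chunk of the identity mapping.  A small standalone C model of that arithmetic:

	/* Standalone userspace model (illustration only, not kernel code):
	 * splits an address into the indices the identity-map builder above
	 * walks, using the same per-level reach. */
	#include <stdio.h>

	#define LEVEL1_SIZE (1UL << 21)	/* one PSE (2 MB) level-1 entry  */
	#define LEVEL2_SIZE (1UL << 30)	/* one level-2 table maps 1 GB   */
	#define LEVEL3_SIZE (1UL << 39)	/* one level-3 table maps 512 GB */

	int main(void)
	{
		unsigned long addr = 0x12345678abcUL;	/* example address */

		/* 512 entries per table: divide by an entry's reach, mod 512. */
		printf("level4 (PML4) index: %lu\n", (addr / LEVEL3_SIZE) % 512);
		printf("level3 (PUD)  index: %lu\n", (addr / LEVEL2_SIZE) % 512);
		printf("level2 (PMD)  index: %lu\n", (addr / LEVEL1_SIZE) % 512);
		printf("offset in 2MB page : %lu\n", addr % LEVEL1_SIZE);
		return 0;
	}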
diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S
new file mode 100644
index 000000000000..d24fa9b72a2b
--- /dev/null
+++ b/arch/x86_64/kernel/relocate_kernel.S
@@ -0,0 +1,143 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+
+/*
+ * Must be relocatable PIC code callable as a C function, that once
+ * it starts can not use the previous process's stack.
+ */
+	.globl relocate_new_kernel
+	.code64
+relocate_new_kernel:
+	/* %rdi page_list
+	 * %rsi reboot_code_buffer
+	 * %rdx start address
+	 * %rcx page_table
+	 * %r8  arg5
+	 * %r9  arg6
+	 */
+
+	/* zero out flags, and disable interrupts */
+	pushq $0
+	popfq
+
+	/* set a new stack at the bottom of our page... */
+	lea   4096(%rsi), %rsp
+
+	/* store the parameters back on the stack */
+	pushq %rdx /* store the start address */
+
+	/* Set cr0 to a known state:
+	 * 31 1 == Paging enabled
+	 * 18 0 == Alignment check disabled
+	 * 16 0 == Write protect disabled
+	 *  3 0 == No task switch
+	 *  2 0 == Don't do FP software emulation.
+	 *  0 1 == Protected mode enabled
+	 */
+	movq	%cr0, %rax
+	andq	$~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
+	orl	$((1<<31)|(1<<0)), %eax
+	movq	%rax, %cr0
+
+	/* Set cr4 to a known state:
+	 * 10 0 == xmm exceptions disabled
+	 *  9 0 == xmm registers instructions disabled
+	 *  8 0 == performance monitoring counter disabled
+	 *  7 0 == page global disabled
+	 *  6 0 == machine check exceptions disabled
+	 *  5 1 == physical address extension enabled
+	 *  4 0 == page size extensions disabled
+	 *  3 0 == Debug extensions disabled
+	 *  2 0 == Time stamp disable (disabled)
+	 *  1 0 == Protected mode virtual interrupts disabled
+	 *  0 0 == VME disabled
+	 */
+
+	movq	$((1<<5)), %rax
+	movq	%rax, %cr4
+
+	jmp 1f
+1:
+
+	/* Switch to the identity mapped page tables,
+	 * and flush the TLB.
+	 */
+	movq	%rcx, %cr3
+
+	/* Do the copies */
+	movq	%rdi, %rcx	/* Put the page_list in %rcx */
+	xorq	%rdi, %rdi
+	xorq	%rsi, %rsi
+	jmp	1f
+
+0:	/* top, read another word from the indirection page */
+
+	movq	(%rbx), %rcx
+	addq	$8,	%rbx
+1:
+	testq	$0x1,	%rcx  /* is it a destination page? */
+	jz	2f
+	movq	%rcx,	%rdi
+	andq	$0xfffffffffffff000, %rdi
+	jmp	0b
+2:
+	testq	$0x2,	%rcx  /* is it an indirection page? */
+	jz	2f
+	movq	%rcx,	%rbx
+	andq	$0xfffffffffffff000, %rbx
+	jmp	0b
+2:
+	testq	$0x4,	%rcx  /* is it the done indicator? */
+	jz	2f
+	jmp	3f
+2:
+	testq	$0x8,	%rcx  /* is it the source indicator? */
+	jz	0b	      /* Ignore it otherwise */
+	movq	%rcx,	%rsi  /* For every source page do a copy */
+	andq	$0xfffffffffffff000, %rsi
+
+	movq	$512,	%rcx
+	rep ; movsq
+	jmp	0b
+3:
+
+	/* To be certain of avoiding problems with self-modifying code
+	 * I need to execute a serializing instruction here.
+	 * So I flush the TLB by reloading %cr3 here, it's handy,
+	 * and not processor dependent.
+	 */
+	movq	%cr3, %rax
+	movq	%rax, %cr3
+
+	/* set all of the registers to known values */
+	/* leave %rsp alone */
+
+	xorq	%rax, %rax
+	xorq	%rbx, %rbx
+	xorq	%rcx, %rcx
+	xorq	%rdx, %rdx
+	xorq	%rsi, %rsi
+	xorq	%rdi, %rdi
+	xorq	%rbp, %rbp
+	xorq	%r8,  %r8
+	xorq	%r9,  %r9
+	xorq	%r10, %r10
+	xorq	%r11, %r11
+	xorq	%r12, %r12
+	xorq	%r13, %r13
+	xorq	%r14, %r14
+	xorq	%r15, %r15
+
+	ret
+relocate_new_kernel_end:
+
+	.globl relocate_new_kernel_size
+relocate_new_kernel_size:
+	.quad relocate_new_kernel_end - relocate_new_kernel
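
To make the copy loop above easier to follow, here is a hedged C rendering
(illustration only, not kernel code) of the indirection-page walk that
relocate_new_kernel performs after switching to the identity-mapped page
table.  The flag values match the bits tested in the assembly: 0x1 marks a
destination page, 0x2 the next indirection page, 0x4 the done marker and 0x8
a source page to copy; the upper bits of each entry are a page-aligned
physical address.  The first entry handed over in %rdi (image->head) is
assumed to point at an indirection page or to carry the done flag.

	/* C model of the relocate_new_kernel copy loop -- illustration only;
	 * the real addresses are physical, which is why the assembly runs
	 * from an identity-mapped page table. */
	#include <string.h>

	#define KEXEC_PAGE_SIZE 4096UL
	#define KEXEC_PAGE_MASK (~(KEXEC_PAGE_SIZE - 1))

	static void process_page_list(unsigned long entry)
	{
		unsigned long *ind = 0;	/* current indirection page (%rbx) */
		char *dest = 0;		/* current destination page (%rdi) */

		for (;;) {
			if (entry & 0x1) {		/* destination page */
				dest = (char *)(entry & KEXEC_PAGE_MASK);
			} else if (entry & 0x2) {	/* next indirection page */
				ind = (unsigned long *)(entry & KEXEC_PAGE_MASK);
			} else if (entry & 0x4) {	/* done */
				return;
			} else if (entry & 0x8) {	/* source page: copy it */
				memcpy(dest, (void *)(entry & KEXEC_PAGE_MASK),
				       KEXEC_PAGE_SIZE);
				dest += KEXEC_PAGE_SIZE; /* rep movsq leaves %rdi here */
			}
			entry = *ind++;			/* read the next word */
		}
	}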