aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64/kernel
diff options
context:
space:
mode:
authorEric W. Biederman <ebiederm@xmission.com>2005-06-25 17:58:02 -0400
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-06-25 19:24:50 -0400
commit5234f5eb04abbbfa306ccfbc2ccbb6e73f515b15 (patch)
tree60f4701fdc501955ccff198f84913b96e3bbf5bf /arch/x86_64/kernel
parentd89559589a588d1a654329d8cd9a3ad33aaad9be (diff)
[PATCH] kexec: x86_64 kexec implementation
This is the x86_64 implementation of machine kexec. 32bit compatibility support has been implemented, and machine_kexec has been enhanced to not care about the changing internal kernel page table structures. From: Alexander Nyberg <alexn@dsv.su.se> build fix Signed-off-by: Eric Biederman <ebiederm@xmission.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/x86_64/kernel')
-rw-r--r--arch/x86_64/kernel/Makefile1
-rw-r--r--arch/x86_64/kernel/crash.c38
-rw-r--r--arch/x86_64/kernel/machine_kexec.c245
-rw-r--r--arch/x86_64/kernel/relocate_kernel.S143
4 files changed, 427 insertions, 0 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 5ca4a4598fda..48f9e2c19cd6 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
20obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o 20obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
21obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ 21obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
22 genapic.o genapic_cluster.o genapic_flat.o 22 genapic.o genapic_cluster.o genapic_flat.o
23obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
23obj-$(CONFIG_PM) += suspend.o 24obj-$(CONFIG_PM) += suspend.o
24obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o 25obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
25obj-$(CONFIG_CPU_FREQ) += cpufreq/ 26obj-$(CONFIG_CPU_FREQ) += cpufreq/
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
new file mode 100644
index 000000000000..7caf8a49d0cb
--- /dev/null
+++ b/arch/x86_64/kernel/crash.c
@@ -0,0 +1,38 @@
1/*
2 * Architecture specific (x86_64) functions for kexec based crash dumps.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 *
6 * Copyright (C) IBM Corporation, 2004. All rights reserved.
7 *
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/smp.h>
14#include <linux/irq.h>
15#include <linux/reboot.h>
16#include <linux/kexec.h>
17
18#include <asm/processor.h>
19#include <asm/hardirq.h>
20#include <asm/nmi.h>
21#include <asm/hw_irq.h>
22
23#define MAX_NOTE_BYTES 1024
24typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
25
26note_buf_t crash_notes[NR_CPUS];
27
/*
 * machine_crash_shutdown - architecture-specific crash shutdown hook.
 *
 * Only ever called once the system has panicked or is otherwise in a
 * critical state, so it should do no more than the minimum required
 * for a kexec'd kernel to run successfully.  In practice that means
 * shooting down the other CPUs on an SMP system.
 *
 * The body is intentionally empty for now.
 */
void machine_crash_shutdown(void)
{
}
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
new file mode 100644
index 000000000000..200b5993f8d9
--- /dev/null
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -0,0 +1,245 @@
1/*
2 * machine_kexec.c - handle transition of Linux booting another kernel
3 * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/mm.h>
10#include <linux/kexec.h>
11#include <linux/delay.h>
12#include <linux/string.h>
13#include <linux/reboot.h>
14#include <asm/pda.h>
15#include <asm/pgtable.h>
16#include <asm/pgalloc.h>
17#include <asm/tlbflush.h>
18#include <asm/mmu_context.h>
19#include <asm/io.h>
20#include <asm/apic.h>
21#include <asm/cpufeature.h>
22#include <asm/hw_irq.h>
23
/*
 * Bytes of address space covered by a single entry at each level of
 * the page-table hierarchy: 4KiB, 2MiB, 1GiB, 512GiB and 256TiB.
 */
#define LEVEL0_SIZE (1UL << 12)
#define LEVEL1_SIZE (1UL << 21)
#define LEVEL2_SIZE (1UL << 30)
#define LEVEL3_SIZE (1UL << 39)
#define LEVEL4_SIZE (1UL << 48)

/*
 * Entry attribute bits per level.  Every level is present, writable,
 * accessed and dirty; level 1 additionally sets _PAGE_PSE.
 */
#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
#define L1_ATTR (L0_ATTR | _PAGE_PSE)
#define L2_ATTR (L0_ATTR)
#define L3_ATTR (L0_ATTR)
34
35static void init_level2_page(
36 u64 *level2p, unsigned long addr)
37{
38 unsigned long end_addr;
39 addr &= PAGE_MASK;
40 end_addr = addr + LEVEL2_SIZE;
41 while(addr < end_addr) {
42 *(level2p++) = addr | L1_ATTR;
43 addr += LEVEL1_SIZE;
44 }
45}
46
47static int init_level3_page(struct kimage *image,
48 u64 *level3p, unsigned long addr, unsigned long last_addr)
49{
50 unsigned long end_addr;
51 int result;
52 result = 0;
53 addr &= PAGE_MASK;
54 end_addr = addr + LEVEL3_SIZE;
55 while((addr < last_addr) && (addr < end_addr)) {
56 struct page *page;
57 u64 *level2p;
58 page = kimage_alloc_control_pages(image, 0);
59 if (!page) {
60 result = -ENOMEM;
61 goto out;
62 }
63 level2p = (u64 *)page_address(page);
64 init_level2_page(level2p, addr);
65 *(level3p++) = __pa(level2p) | L2_ATTR;
66 addr += LEVEL2_SIZE;
67 }
68 /* clear the unused entries */
69 while(addr < end_addr) {
70 *(level3p++) = 0;
71 addr += LEVEL2_SIZE;
72 }
73out:
74 return result;
75}
76
77
78static int init_level4_page(struct kimage *image,
79 u64 *level4p, unsigned long addr, unsigned long last_addr)
80{
81 unsigned long end_addr;
82 int result;
83 result = 0;
84 addr &= PAGE_MASK;
85 end_addr = addr + LEVEL4_SIZE;
86 while((addr < last_addr) && (addr < end_addr)) {
87 struct page *page;
88 u64 *level3p;
89 page = kimage_alloc_control_pages(image, 0);
90 if (!page) {
91 result = -ENOMEM;
92 goto out;
93 }
94 level3p = (u64 *)page_address(page);
95 result = init_level3_page(image, level3p, addr, last_addr);
96 if (result) {
97 goto out;
98 }
99 *(level4p++) = __pa(level3p) | L3_ATTR;
100 addr += LEVEL3_SIZE;
101 }
102 /* clear the unused entries */
103 while(addr < end_addr) {
104 *(level4p++) = 0;
105 addr += LEVEL3_SIZE;
106 }
107 out:
108 return result;
109}
110
111
112static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
113{
114 u64 *level4p;
115 level4p = (u64 *)__va(start_pgtable);
116 return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
117}
118
119static void set_idt(void *newidt, u16 limit)
120{
121 unsigned char curidt[10];
122
123 /* x86-64 supports unaliged loads & stores */
124 (*(u16 *)(curidt)) = limit;
125 (*(u64 *)(curidt +2)) = (unsigned long)(newidt);
126
127 __asm__ __volatile__ (
128 "lidt %0\n"
129 : "=m" (curidt)
130 );
131};
132
133
134static void set_gdt(void *newgdt, u16 limit)
135{
136 unsigned char curgdt[10];
137
138 /* x86-64 supports unaligned loads & stores */
139 (*(u16 *)(curgdt)) = limit;
140 (*(u64 *)(curgdt +2)) = (unsigned long)(newgdt);
141
142 __asm__ __volatile__ (
143 "lgdt %0\n"
144 : "=m" (curgdt)
145 );
146};
147
148static void load_segments(void)
149{
150 __asm__ __volatile__ (
151 "\tmovl $"STR(__KERNEL_DS)",%eax\n"
152 "\tmovl %eax,%ds\n"
153 "\tmovl %eax,%es\n"
154 "\tmovl %eax,%ss\n"
155 "\tmovl %eax,%fs\n"
156 "\tmovl %eax,%gs\n"
157 );
158#undef STR
159#undef __STR
160}
161
162typedef NORET_TYPE void (*relocate_new_kernel_t)(
163 unsigned long indirection_page, unsigned long control_code_buffer,
164 unsigned long start_address, unsigned long pgtable) ATTRIB_NORET;
165
166const extern unsigned char relocate_new_kernel[];
167const extern unsigned long relocate_new_kernel_size;
168
169int machine_kexec_prepare(struct kimage *image)
170{
171 unsigned long start_pgtable, control_code_buffer;
172 int result;
173
174 /* Calculate the offsets */
175 start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
176 control_code_buffer = start_pgtable + 4096UL;
177
178 /* Setup the identity mapped 64bit page table */
179 result = init_pgtable(image, start_pgtable);
180 if (result) {
181 return result;
182 }
183
184 /* Place the code in the reboot code buffer */
185 memcpy(__va(control_code_buffer), relocate_new_kernel, relocate_new_kernel_size);
186
187 return 0;
188}
189
/* Counterpart of machine_kexec_prepare(); there is nothing to undo here. */
void machine_kexec_cleanup(struct kimage *image)
{
}
194
195/*
196 * Do not allocate memory (or fail in any way) in machine_kexec().
197 * We are past the point of no return, committed to rebooting now.
198 */
199NORET_TYPE void machine_kexec(struct kimage *image)
200{
201 unsigned long page_list;
202 unsigned long control_code_buffer;
203 unsigned long start_pgtable;
204 relocate_new_kernel_t rnk;
205
206 /* Interrupts aren't acceptable while we reboot */
207 local_irq_disable();
208
209 /* Calculate the offsets */
210 page_list = image->head;
211 start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
212 control_code_buffer = start_pgtable + 4096UL;
213
214 /* Set the low half of the page table to my identity mapped
215 * page table for kexec. Leave the high half pointing at the
216 * kernel pages. Don't bother to flush the global pages
217 * as that will happen when I fully switch to my identity mapped
218 * page table anyway.
219 */
220 memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
221 __flush_tlb();
222
223
224 /* The segment registers are funny things, they are
225 * automatically loaded from a table, in memory wherever you
226 * set them to a specific selector, but this table is never
227 * accessed again unless you set the segment to a different selector.
228 *
229 * The more common model are caches where the behide
230 * the scenes work is done, but is also dropped at arbitrary
231 * times.
232 *
233 * I take advantage of this here by force loading the
234 * segments, before I zap the gdt with an invalid value.
235 */
236 load_segments();
237 /* The gdt & idt are now invalid.
238 * If you want to load them you must set up your own idt & gdt.
239 */
240 set_gdt(phys_to_virt(0),0);
241 set_idt(phys_to_virt(0),0);
242 /* now call it */
243 rnk = (relocate_new_kernel_t) control_code_buffer;
244 (*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
245}
diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S
new file mode 100644
index 000000000000..d24fa9b72a2b
--- /dev/null
+++ b/arch/x86_64/kernel/relocate_kernel.S
@@ -0,0 +1,143 @@
1/*
2 * relocate_kernel.S - put the kernel image in place to boot
3 * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/linkage.h>
10
11 /*
12 * Must be relocatable PIC code callable as a C function, that once
13 * it starts can not use the previous processes stack.
14 */
15 .globl relocate_new_kernel
16 .code64
relocate_new_kernel:
	/*
	 * Arguments on entry:
	 * %rdi page_list
	 * %rsi reboot_code_buffer
	 * %rdx start address
	 * %rcx page_table
	 * %r8  arg5
	 * %r9  arg6
	 */

	/* zero out flags, and disable interrupts */
	pushq	$0
	popfq

	/* set a new stack at the end of our page; it grows down from there */
	lea	4096(%rsi), %rsp

	/* push the start address: the final ret below pops it and jumps there */
	pushq	%rdx

	/* Set cr0 to a known state:
	 * 31 1 == Paging enabled
	 * 18 0 == Alignment check disabled
	 * 16 0 == Write protect disabled
	 * 3  0 == No task switch
	 * 2  0 == Don't do FP software emulation.
	 * 0  1 == Protected mode enabled
	 */
	movq	%cr0, %rax
	andq	$~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
	orl	$((1<<31)|(1<<0)), %eax
	movq	%rax, %cr0

	/* Set cr4 to a known state:
	 * 10 0 == xmm exceptions disabled
	 * 9  0 == xmm registers instructions disabled
	 * 8  0 == performance monitoring counter disabled
	 * 7  0 == page global disabled
	 * 6  0 == machine check exceptions disabled
	 * 5  1 == physical address extension enabled
	 * 4  0 == page size extensions disabled
	 * 3  0 == Debug extensions disabled
	 * 2  0 == Time stamp disable (disabled)
	 * 1  0 == Protected mode virtual interrupts disabled
	 * 0  0 == VME disabled
	 */

	movq	$((1<<5)), %rax
	movq	%rax, %cr4

	jmp 1f
1:

	/* Switch to the identity mapped page tables,
	 * and flush the TLB.
	 */
	movq	%rcx, %cr3

	/* Do the copies */
	movq	%rdi, %rcx	/* Put the page_list in %rcx */
	xorq	%rdi, %rdi
	xorq	%rsi, %rsi
	jmp	1f

0:	/* top, read another word from the indirection page */

	movq	(%rbx), %rcx
	addq	$8, %rbx
1:
	testq	$0x1, %rcx	/* is it a destination page? */
	jz	2f
	movq	%rcx, %rdi
	andq	$0xfffffffffffff000, %rdi
	jmp	0b
2:
	testq	$0x2, %rcx	/* is it an indirection page? */
	jz	2f
	movq	%rcx, %rbx
	andq	$0xfffffffffffff000, %rbx
	jmp	0b
2:
	testq	$0x4, %rcx	/* is it the done indicator? */
	jz	2f
	jmp	3f
2:
	testq	$0x8, %rcx	/* is it the source indicator? */
	jz	0b		/* Ignore it otherwise */
	movq	%rcx, %rsi	/* For every source page do a copy */
	andq	$0xfffffffffffff000, %rsi

	/* copy 512 quadwords (4096 bytes) from (%rsi) to (%rdi) */
	movq	$512, %rcx
	rep ; movsq
	jmp	0b
3:

	/* To be certain of avoiding problems with self-modifying code
	 * I need to execute a serializing instruction here.
	 * So I flush the TLB by reloading %cr3 here, it's handy,
	 * and not processor dependent.
	 */
	movq	%cr3, %rax
	movq	%rax, %cr3

	/* set all of the registers to known values */
	/* leave %rsp alone */

	xorq	%rax, %rax
	xorq	%rbx, %rbx
	xorq	%rcx, %rcx
	xorq	%rdx, %rdx
	xorq	%rsi, %rsi
	xorq	%rdi, %rdi
	xorq	%rbp, %rbp
	xorq	%r8,  %r8
	xorq	%r9,  %r9
	xorq	%r10, %r10	/* both operands must be %r10 for it to be zeroed */
	xorq	%r11, %r11
	xorq	%r12, %r12
	xorq	%r13, %r13
	xorq	%r14, %r14
	xorq	%r15, %r15

	ret
relocate_new_kernel_end:

	/* length in bytes of the routine above */
	.globl relocate_new_kernel_size
relocate_new_kernel_size:
	.quad relocate_new_kernel_end - relocate_new_kernel