diff options
| author | Huang Ying <ying.huang@intel.com> | 2009-03-09 22:57:16 -0400 |
|---|---|---|
| committer | H. Peter Anvin <hpa@zytor.com> | 2009-03-10 21:13:25 -0400 |
| commit | fee7b0d84cc8c7bc5dc212901c79e93eaf83a5b5 (patch) | |
| tree | f855b0b5057c3dff7e26c840218cb22bfe965a7a | |
| parent | 5359454701ce51a4626b1ef6eb7b16ec35bd458d (diff) | |
x86, kexec: x86_64: add kexec jump support for x86_64
Impact: New major feature
This patch adds kexec jump support for x86_64. More information about
kexec jump can be found in corresponding x86_32 support patch.
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
| -rw-r--r-- | arch/x86/Kconfig | 2 | ||||
| -rw-r--r-- | arch/x86/include/asm/kexec.h | 13 | ||||
| -rw-r--r-- | arch/x86/kernel/machine_kexec_64.c | 42 | ||||
| -rw-r--r-- | arch/x86/kernel/relocate_kernel_64.S | 177 | ||||
| -rw-r--r-- | arch/x86/kernel/vmlinux_64.lds.S | 7 |
5 files changed, 197 insertions, 44 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 31758378bcd2..87717f3687d2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
| @@ -1431,7 +1431,7 @@ config CRASH_DUMP | |||
| 1431 | config KEXEC_JUMP | 1431 | config KEXEC_JUMP |
| 1432 | bool "kexec jump (EXPERIMENTAL)" | 1432 | bool "kexec jump (EXPERIMENTAL)" |
| 1433 | depends on EXPERIMENTAL | 1433 | depends on EXPERIMENTAL |
| 1434 | depends on KEXEC && HIBERNATION && X86_32 | 1434 | depends on KEXEC && HIBERNATION |
| 1435 | ---help--- | 1435 | ---help--- |
| 1436 | Jump between original kernel and kexeced kernel and invoke | 1436 | Jump between original kernel and kexeced kernel and invoke |
| 1437 | code in physical address mode via KEXEC | 1437 | code in physical address mode via KEXEC |
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 0ceb6d19ed30..317ff1703d0b 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h | |||
| @@ -9,13 +9,13 @@ | |||
| 9 | # define PAGES_NR 4 | 9 | # define PAGES_NR 4 |
| 10 | #else | 10 | #else |
| 11 | # define PA_CONTROL_PAGE 0 | 11 | # define PA_CONTROL_PAGE 0 |
| 12 | # define PA_TABLE_PAGE 1 | 12 | # define VA_CONTROL_PAGE 1 |
| 13 | # define PAGES_NR 2 | 13 | # define PA_TABLE_PAGE 2 |
| 14 | # define PA_SWAP_PAGE 3 | ||
| 15 | # define PAGES_NR 4 | ||
| 14 | #endif | 16 | #endif |
| 15 | 17 | ||
| 16 | #ifdef CONFIG_X86_32 | ||
| 17 | # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 | 18 | # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 |
| 18 | #endif | ||
| 19 | 19 | ||
| 20 | #ifndef __ASSEMBLY__ | 20 | #ifndef __ASSEMBLY__ |
| 21 | 21 | ||
| @@ -136,10 +136,11 @@ relocate_kernel(unsigned long indirection_page, | |||
| 136 | unsigned int has_pae, | 136 | unsigned int has_pae, |
| 137 | unsigned int preserve_context); | 137 | unsigned int preserve_context); |
| 138 | #else | 138 | #else |
| 139 | NORET_TYPE void | 139 | unsigned long |
| 140 | relocate_kernel(unsigned long indirection_page, | 140 | relocate_kernel(unsigned long indirection_page, |
| 141 | unsigned long page_list, | 141 | unsigned long page_list, |
| 142 | unsigned long start_address) ATTRIB_NORET; | 142 | unsigned long start_address, |
| 143 | unsigned int preserve_context); | ||
| 143 | #endif | 144 | #endif |
| 144 | 145 | ||
| 145 | #define ARCH_HAS_KIMAGE_ARCH | 146 | #define ARCH_HAS_KIMAGE_ARCH |
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 7cc5d3d01483..89cea4d44679 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/numa.h> | 13 | #include <linux/numa.h> |
| 14 | #include <linux/ftrace.h> | 14 | #include <linux/ftrace.h> |
| 15 | #include <linux/io.h> | 15 | #include <linux/io.h> |
| 16 | #include <linux/suspend.h> | ||
| 16 | 17 | ||
| 17 | #include <asm/pgtable.h> | 18 | #include <asm/pgtable.h> |
| 18 | #include <asm/tlbflush.h> | 19 | #include <asm/tlbflush.h> |
| @@ -270,19 +271,43 @@ void machine_kexec(struct kimage *image) | |||
| 270 | { | 271 | { |
| 271 | unsigned long page_list[PAGES_NR]; | 272 | unsigned long page_list[PAGES_NR]; |
| 272 | void *control_page; | 273 | void *control_page; |
| 274 | int save_ftrace_enabled; | ||
| 273 | 275 | ||
| 274 | tracer_disable(); | 276 | #ifdef CONFIG_KEXEC_JUMP |
| 277 | if (kexec_image->preserve_context) | ||
| 278 | save_processor_state(); | ||
| 279 | #endif | ||
| 280 | |||
| 281 | save_ftrace_enabled = __ftrace_enabled_save(); | ||
| 275 | 282 | ||
| 276 | /* Interrupts aren't acceptable while we reboot */ | 283 | /* Interrupts aren't acceptable while we reboot */ |
| 277 | local_irq_disable(); | 284 | local_irq_disable(); |
| 278 | 285 | ||
| 286 | if (image->preserve_context) { | ||
| 287 | #ifdef CONFIG_X86_IO_APIC | ||
| 288 | /* | ||
| 289 | * We need to put APICs in legacy mode so that we can | ||
| 290 | * get timer interrupts in second kernel. kexec/kdump | ||
| 291 | * paths already have calls to disable_IO_APIC() in | ||
| 292 | * one form or other. kexec jump path also need | ||
| 293 | * one. | ||
| 294 | */ | ||
| 295 | disable_IO_APIC(); | ||
| 296 | #endif | ||
| 297 | } | ||
| 298 | |||
| 279 | control_page = page_address(image->control_code_page) + PAGE_SIZE; | 299 | control_page = page_address(image->control_code_page) + PAGE_SIZE; |
| 280 | memcpy(control_page, relocate_kernel, PAGE_SIZE); | 300 | memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); |
| 281 | 301 | ||
| 282 | page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); | 302 | page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); |
| 303 | page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; | ||
| 283 | page_list[PA_TABLE_PAGE] = | 304 | page_list[PA_TABLE_PAGE] = |
| 284 | (unsigned long)__pa(page_address(image->control_code_page)); | 305 | (unsigned long)__pa(page_address(image->control_code_page)); |
| 285 | 306 | ||
| 307 | if (image->type == KEXEC_TYPE_DEFAULT) | ||
| 308 | page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) | ||
| 309 | << PAGE_SHIFT); | ||
| 310 | |||
| 286 | /* | 311 | /* |
| 287 | * The segment registers are funny things, they have both a | 312 | * The segment registers are funny things, they have both a |
| 288 | * visible and an invisible part. Whenever the visible part is | 313 | * visible and an invisible part. Whenever the visible part is |
| @@ -302,8 +327,17 @@ void machine_kexec(struct kimage *image) | |||
| 302 | set_idt(phys_to_virt(0), 0); | 327 | set_idt(phys_to_virt(0), 0); |
| 303 | 328 | ||
| 304 | /* now call it */ | 329 | /* now call it */ |
| 305 | relocate_kernel((unsigned long)image->head, (unsigned long)page_list, | 330 | image->start = relocate_kernel((unsigned long)image->head, |
| 306 | image->start); | 331 | (unsigned long)page_list, |
| 332 | image->start, | ||
| 333 | image->preserve_context); | ||
| 334 | |||
| 335 | #ifdef CONFIG_KEXEC_JUMP | ||
| 336 | if (kexec_image->preserve_context) | ||
| 337 | restore_processor_state(); | ||
| 338 | #endif | ||
| 339 | |||
| 340 | __ftrace_enabled_restore(save_ftrace_enabled); | ||
| 307 | } | 341 | } |
| 308 | 342 | ||
| 309 | void arch_crash_save_vmcoreinfo(void) | 343 | void arch_crash_save_vmcoreinfo(void) |
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index cfc0d24003dc..4de8f5b3d476 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S | |||
| @@ -19,6 +19,24 @@ | |||
| 19 | #define PTR(x) (x << 3) | 19 | #define PTR(x) (x << 3) |
| 20 | #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) | 20 | #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) |
| 21 | 21 | ||
| 22 | /* | ||
| 23 | * control_page + KEXEC_CONTROL_CODE_MAX_SIZE | ||
| 24 | * ~ control_page + PAGE_SIZE are used as data storage and stack for | ||
| 25 | * jumping back | ||
| 26 | */ | ||
| 27 | #define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) | ||
| 28 | |||
| 29 | /* Minimal CPU state */ | ||
| 30 | #define RSP DATA(0x0) | ||
| 31 | #define CR0 DATA(0x8) | ||
| 32 | #define CR3 DATA(0x10) | ||
| 33 | #define CR4 DATA(0x18) | ||
| 34 | |||
| 35 | /* other data */ | ||
| 36 | #define CP_PA_TABLE_PAGE DATA(0x20) | ||
| 37 | #define CP_PA_SWAP_PAGE DATA(0x28) | ||
| 38 | #define CP_PA_BACKUP_PAGES_MAP DATA(0x30) | ||
| 39 | |||
| 22 | .text | 40 | .text |
| 23 | .align PAGE_SIZE | 41 | .align PAGE_SIZE |
| 24 | .code64 | 42 | .code64 |
| @@ -28,8 +46,27 @@ relocate_kernel: | |||
| 28 | * %rdi indirection_page | 46 | * %rdi indirection_page |
| 29 | * %rsi page_list | 47 | * %rsi page_list |
| 30 | * %rdx start address | 48 | * %rdx start address |
| 49 | * %rcx preserve_context | ||
| 31 | */ | 50 | */ |
| 32 | 51 | ||
| 52 | /* Save the CPU context, used for jumping back */ | ||
| 53 | pushq %rbx | ||
| 54 | pushq %rbp | ||
| 55 | pushq %r12 | ||
| 56 | pushq %r13 | ||
| 57 | pushq %r14 | ||
| 58 | pushq %r15 | ||
| 59 | pushf | ||
| 60 | |||
| 61 | movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 | ||
| 62 | movq %rsp, RSP(%r11) | ||
| 63 | movq %cr0, %rax | ||
| 64 | movq %rax, CR0(%r11) | ||
| 65 | movq %cr3, %rax | ||
| 66 | movq %rax, CR3(%r11) | ||
| 67 | movq %cr4, %rax | ||
| 68 | movq %rax, CR4(%r11) | ||
| 69 | |||
| 33 | /* zero out flags, and disable interrupts */ | 70 | /* zero out flags, and disable interrupts */ |
| 34 | pushq $0 | 71 | pushq $0 |
| 35 | popfq | 72 | popfq |
| @@ -41,10 +78,18 @@ relocate_kernel: | |||
| 41 | movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 | 78 | movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 |
| 42 | 79 | ||
| 43 | /* get physical address of page table now too */ | 80 | /* get physical address of page table now too */ |
| 44 | movq PTR(PA_TABLE_PAGE)(%rsi), %rcx | 81 | movq PTR(PA_TABLE_PAGE)(%rsi), %r9 |
| 82 | |||
| 83 | /* get physical address of swap page now */ | ||
| 84 | movq PTR(PA_SWAP_PAGE)(%rsi), %r10 | ||
| 85 | |||
| 86 | /* save some information for jumping back */ | ||
| 87 | movq %r9, CP_PA_TABLE_PAGE(%r11) | ||
| 88 | movq %r10, CP_PA_SWAP_PAGE(%r11) | ||
| 89 | movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) | ||
| 45 | 90 | ||
| 46 | /* Switch to the identity mapped page tables */ | 91 | /* Switch to the identity mapped page tables */ |
| 47 | movq %rcx, %cr3 | 92 | movq %r9, %cr3 |
| 48 | 93 | ||
| 49 | /* setup a new stack at the end of the physical control page */ | 94 | /* setup a new stack at the end of the physical control page */ |
| 50 | lea PAGE_SIZE(%r8), %rsp | 95 | lea PAGE_SIZE(%r8), %rsp |
| @@ -83,9 +128,87 @@ identity_mapped: | |||
| 83 | 1: | 128 | 1: |
| 84 | 129 | ||
| 85 | /* Flush the TLB (needed?) */ | 130 | /* Flush the TLB (needed?) */ |
| 86 | movq %rcx, %cr3 | 131 | movq %r9, %cr3 |
| 132 | |||
| 133 | movq %rcx, %r11 | ||
| 134 | call swap_pages | ||
| 135 | |||
| 136 | /* | ||
| 137 | * To be certain of avoiding problems with self-modifying code | ||
| 138 | * I need to execute a serializing instruction here. | ||
| 139 | * So I flush the TLB by reloading %cr3 here, it's handy, | ||
| 140 | * and not processor dependent. | ||
| 141 | */ | ||
| 142 | movq %cr3, %rax | ||
| 143 | movq %rax, %cr3 | ||
| 144 | |||
| 145 | /* | ||
| 146 | * set all of the registers to known values | ||
| 147 | * leave %rsp alone | ||
| 148 | */ | ||
| 149 | |||
| 150 | testq %r11, %r11 | ||
| 151 | jnz 1f | ||
| 152 | xorq %rax, %rax | ||
| 153 | xorq %rbx, %rbx | ||
| 154 | xorq %rcx, %rcx | ||
| 155 | xorq %rdx, %rdx | ||
| 156 | xorq %rsi, %rsi | ||
| 157 | xorq %rdi, %rdi | ||
| 158 | xorq %rbp, %rbp | ||
| 159 | xorq %r8, %r8 | ||
| 160 | xorq %r9, %r9 | ||
| 161 | xorq %r10, %r9 | ||
| 162 | xorq %r11, %r11 | ||
| 163 | xorq %r12, %r12 | ||
| 164 | xorq %r13, %r13 | ||
| 165 | xorq %r14, %r14 | ||
| 166 | xorq %r15, %r15 | ||
| 167 | |||
| 168 | ret | ||
| 169 | |||
| 170 | 1: | ||
| 171 | popq %rdx | ||
| 172 | leaq PAGE_SIZE(%r10), %rsp | ||
| 173 | call *%rdx | ||
| 174 | |||
| 175 | /* get the re-entry point of the peer system */ | ||
| 176 | movq 0(%rsp), %rbp | ||
| 177 | call 1f | ||
| 178 | 1: | ||
| 179 | popq %r8 | ||
| 180 | subq $(1b - relocate_kernel), %r8 | ||
| 181 | movq CP_PA_SWAP_PAGE(%r8), %r10 | ||
| 182 | movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi | ||
| 183 | movq CP_PA_TABLE_PAGE(%r8), %rax | ||
| 184 | movq %rax, %cr3 | ||
| 185 | lea PAGE_SIZE(%r8), %rsp | ||
| 186 | call swap_pages | ||
| 187 | movq $virtual_mapped, %rax | ||
| 188 | pushq %rax | ||
| 189 | ret | ||
| 190 | |||
| 191 | virtual_mapped: | ||
| 192 | movq RSP(%r8), %rsp | ||
| 193 | movq CR4(%r8), %rax | ||
| 194 | movq %rax, %cr4 | ||
| 195 | movq CR3(%r8), %rax | ||
| 196 | movq CR0(%r8), %r8 | ||
| 197 | movq %rax, %cr3 | ||
| 198 | movq %r8, %cr0 | ||
| 199 | movq %rbp, %rax | ||
| 200 | |||
| 201 | popf | ||
| 202 | popq %r15 | ||
| 203 | popq %r14 | ||
| 204 | popq %r13 | ||
| 205 | popq %r12 | ||
| 206 | popq %rbp | ||
| 207 | popq %rbx | ||
| 208 | ret | ||
| 87 | 209 | ||
| 88 | /* Do the copies */ | 210 | /* Do the copies */ |
| 211 | swap_pages: | ||
| 89 | movq %rdi, %rcx /* Put the page_list in %rcx */ | 212 | movq %rdi, %rcx /* Put the page_list in %rcx */ |
| 90 | xorq %rdi, %rdi | 213 | xorq %rdi, %rdi |
| 91 | xorq %rsi, %rsi | 214 | xorq %rsi, %rsi |
| @@ -117,39 +240,27 @@ identity_mapped: | |||
| 117 | movq %rcx, %rsi /* For ever source page do a copy */ | 240 | movq %rcx, %rsi /* For ever source page do a copy */ |
| 118 | andq $0xfffffffffffff000, %rsi | 241 | andq $0xfffffffffffff000, %rsi |
| 119 | 242 | ||
| 243 | movq %rdi, %rdx | ||
| 244 | movq %rsi, %rax | ||
| 245 | |||
| 246 | movq %r10, %rdi | ||
| 120 | movq $512, %rcx | 247 | movq $512, %rcx |
| 121 | rep ; movsq | 248 | rep ; movsq |
| 122 | jmp 0b | ||
| 123 | 3: | ||
| 124 | 249 | ||
| 125 | /* | 250 | movq %rax, %rdi |
| 126 | * To be certain of avoiding problems with self-modifying code | 251 | movq %rdx, %rsi |
| 127 | * I need to execute a serializing instruction here. | 252 | movq $512, %rcx |
| 128 | * So I flush the TLB by reloading %cr3 here, it's handy, | 253 | rep ; movsq |
| 129 | * and not processor dependent. | ||
| 130 | */ | ||
| 131 | movq %cr3, %rax | ||
| 132 | movq %rax, %cr3 | ||
| 133 | |||
| 134 | /* | ||
| 135 | * set all of the registers to known values | ||
| 136 | * leave %rsp alone | ||
| 137 | */ | ||
| 138 | 254 | ||
| 139 | xorq %rax, %rax | 255 | movq %rdx, %rdi |
| 140 | xorq %rbx, %rbx | 256 | movq %r10, %rsi |
| 141 | xorq %rcx, %rcx | 257 | movq $512, %rcx |
| 142 | xorq %rdx, %rdx | 258 | rep ; movsq |
| 143 | xorq %rsi, %rsi | ||
| 144 | xorq %rdi, %rdi | ||
| 145 | xorq %rbp, %rbp | ||
| 146 | xorq %r8, %r8 | ||
| 147 | xorq %r9, %r9 | ||
| 148 | xorq %r10, %r9 | ||
| 149 | xorq %r11, %r11 | ||
| 150 | xorq %r12, %r12 | ||
| 151 | xorq %r13, %r13 | ||
| 152 | xorq %r14, %r14 | ||
| 153 | xorq %r15, %r15 | ||
| 154 | 259 | ||
| 260 | lea PAGE_SIZE(%rax), %rsi | ||
| 261 | jmp 0b | ||
| 262 | 3: | ||
| 155 | ret | 263 | ret |
| 264 | |||
| 265 | .globl kexec_control_code_size | ||
| 266 | .set kexec_control_code_size, . - relocate_kernel | ||
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index fbfced6f6800..5bf54e40c6ef 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S | |||
| @@ -275,3 +275,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), | |||
| 275 | ASSERT((per_cpu__irq_stack_union == 0), | 275 | ASSERT((per_cpu__irq_stack_union == 0), |
| 276 | "irq_stack_union is not at start of per-cpu area"); | 276 | "irq_stack_union is not at start of per-cpu area"); |
| 277 | #endif | 277 | #endif |
| 278 | |||
| 279 | #ifdef CONFIG_KEXEC | ||
| 280 | #include <asm/kexec.h> | ||
| 281 | |||
| 282 | ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, | ||
| 283 | "kexec control code size is too big") | ||
| 284 | #endif | ||
