author    | Dave Young <dyoung@redhat.com> | 2015-09-09 18:38:55 -0400
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-09-10 16:29:01 -0400
commit    | 2965faa5e03d1e71e9ff9aa143fff39e0a77543a (patch)
tree      | 78b12008d7078a9cd40e157d5b18b795b14d5d9c /kernel/kexec_core.c
parent    | a43cac0d9dc2073ff2245a171429ddbe1accece7 (diff)
kexec: split kexec_load syscall from kexec core code
There are two kexec load syscalls, kexec_load and kexec_file_load.
kexec_file_load has already been split out into kernel/kexec_file.c; this
patch splits the kexec_load syscall code out into kernel/kexec.c.

It also adds a new Kconfig option, KEXEC_CORE, so that kexec_load can be
disabled and only kexec_file_load used, or vice versa.

The original requirement came from Ted Ts'o: he wants the kexec kernel
signature to be checked when CONFIG_KEXEC_VERIFY_SIG is enabled, but
kexec-tools can bypass that check by using the kexec_load syscall.

Vivek Goyal proposed creating a common Kconfig option so users can compile
in only one of the syscalls for loading a kexec kernel. KEXEC and
KEXEC_FILE select KEXEC_CORE so that old config files still work.

Because generic code needs CONFIG_KEXEC_CORE, I updated all the
architecture Kconfig files with the new KEXEC_CORE option and made KEXEC
select KEXEC_CORE in each arch Kconfig; the generic kernel code that
refers to the kexec_load syscall is updated accordingly.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Dave Young <dyoung@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Petr Tesarik <ptesarik@suse.cz>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Josh Boyer <jwboyer@fedoraproject.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
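
For context, here is a rough user-space sketch of the two load paths the message above keeps apart. Neither syscall has a glibc wrapper, so the helper names below are purely illustrative, and SYS_kexec_file_load is only available where the kernel and headers provide that syscall:

```c
#include <linux/kexec.h>        /* struct kexec_segment, KEXEC_* flags */
#include <sys/syscall.h>
#include <unistd.h>

/* Hypothetical wrappers; see kexec_load(2) and kexec_file_load(2). */
static long load_via_segments(unsigned long entry, unsigned long nr_segments,
			      struct kexec_segment *segments,
			      unsigned long flags)
{
	/* Pre-built segments from user space: the traditional kexec-tools
	 * path, and the one that can bypass signature checking. */
	return syscall(SYS_kexec_load, entry, nr_segments, segments, flags);
}

static long load_via_files(int kernel_fd, int initrd_fd,
			   unsigned long cmdline_len, const char *cmdline,
			   unsigned long flags)
{
	/* File descriptors instead of raw segments, so the kernel itself can
	 * parse and, with CONFIG_KEXEC_VERIFY_SIG, verify the image. */
	return syscall(SYS_kexec_file_load, kernel_fd, initrd_fd,
		       cmdline_len, cmdline, flags);
}
```

On the Kconfig side, both KEXEC and KEXEC_FILE simply select KEXEC_CORE, so either syscall (or both) can be built on top of the shared code added below.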
Diffstat (limited to 'kernel/kexec_core.c')
-rw-r--r-- | kernel/kexec_core.c | 1511
1 file changed, 1511 insertions, 0 deletions
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
new file mode 100644
index 000000000000..9aa25c034b2e
--- /dev/null
+++ b/kernel/kexec_core.c
@@ -0,0 +1,1511 @@
1 | /* | ||
2 | * kexec.c - kexec system call core code. | ||
3 | * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> | ||
4 | * | ||
5 | * This source code is licensed under the GNU General Public License, | ||
6 | * Version 2. See the file COPYING for more details. | ||
7 | */ | ||
8 | |||
9 | #define pr_fmt(fmt) "kexec: " fmt | ||
10 | |||
11 | #include <linux/capability.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/file.h> | ||
14 | #include <linux/slab.h> | ||
15 | #include <linux/fs.h> | ||
16 | #include <linux/kexec.h> | ||
17 | #include <linux/mutex.h> | ||
18 | #include <linux/list.h> | ||
19 | #include <linux/highmem.h> | ||
20 | #include <linux/syscalls.h> | ||
21 | #include <linux/reboot.h> | ||
22 | #include <linux/ioport.h> | ||
23 | #include <linux/hardirq.h> | ||
24 | #include <linux/elf.h> | ||
25 | #include <linux/elfcore.h> | ||
26 | #include <linux/utsname.h> | ||
27 | #include <linux/numa.h> | ||
28 | #include <linux/suspend.h> | ||
29 | #include <linux/device.h> | ||
30 | #include <linux/freezer.h> | ||
31 | #include <linux/pm.h> | ||
32 | #include <linux/cpu.h> | ||
33 | #include <linux/uaccess.h> | ||
34 | #include <linux/io.h> | ||
35 | #include <linux/console.h> | ||
36 | #include <linux/vmalloc.h> | ||
37 | #include <linux/swap.h> | ||
38 | #include <linux/syscore_ops.h> | ||
39 | #include <linux/compiler.h> | ||
40 | #include <linux/hugetlb.h> | ||
41 | |||
42 | #include <asm/page.h> | ||
43 | #include <asm/sections.h> | ||
44 | |||
45 | #include <crypto/hash.h> | ||
46 | #include <crypto/sha.h> | ||
47 | #include "kexec_internal.h" | ||
48 | |||
49 | DEFINE_MUTEX(kexec_mutex); | ||
50 | |||
51 | /* Per cpu memory for storing cpu states in case of system crash. */ | ||
52 | note_buf_t __percpu *crash_notes; | ||
53 | |||
54 | /* vmcoreinfo stuff */ | ||
55 | static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; | ||
56 | u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; | ||
57 | size_t vmcoreinfo_size; | ||
58 | size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); | ||
59 | |||
60 | /* Flag to indicate we are going to kexec a new kernel */ | ||
61 | bool kexec_in_progress = false; | ||
62 | |||
63 | |||
64 | /* Location of the reserved area for the crash kernel */ | ||
65 | struct resource crashk_res = { | ||
66 | .name = "Crash kernel", | ||
67 | .start = 0, | ||
68 | .end = 0, | ||
69 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
70 | }; | ||
71 | struct resource crashk_low_res = { | ||
72 | .name = "Crash kernel", | ||
73 | .start = 0, | ||
74 | .end = 0, | ||
75 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
76 | }; | ||
77 | |||
78 | int kexec_should_crash(struct task_struct *p) | ||
79 | { | ||
80 | /* | ||
81 | * If crash_kexec_post_notifiers is enabled, don't run | ||
82 | * crash_kexec() here yet, which must be run after panic | ||
83 | * notifiers in panic(). | ||
84 | */ | ||
85 | if (crash_kexec_post_notifiers) | ||
86 | return 0; | ||
87 | /* | ||
88 | * There are 4 panic() calls in the do_exit() path, each of which | ||
89 | * corresponds to each of these 4 conditions. | ||
90 | */ | ||
91 | if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) | ||
92 | return 1; | ||
93 | return 0; | ||
94 | } | ||
95 | |||
96 | /* | ||
97 | * When kexec transitions to the new kernel there is a one-to-one | ||
98 | * mapping between physical and virtual addresses. On processors | ||
99 | * where you can disable the MMU this is trivial, and easy. For | ||
100 | * others it is still a simple predictable page table to setup. | ||
101 | * | ||
102 | * In that environment kexec copies the new kernel to its final | ||
103 | * resting place. This means I can only support memory whose | ||
104 | * physical address can fit in an unsigned long. In particular | ||
105 | * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. | ||
106 | * If the assembly stub has more restrictive requirements | ||
107 | * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be | ||
108 | * defined more restrictively in <asm/kexec.h>. | ||
109 | * | ||
110 | * The code for the transition from the current kernel to | ||
111 | * the new kernel is placed in the control_code_buffer, whose size | ||
112 | * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single | ||
113 | * page of memory is necessary, but some architectures require more. | ||
114 | * Because this memory must be identity mapped in the transition from | ||
115 | * virtual to physical addresses it must live in the range | ||
116 | * 0 - TASK_SIZE, as only the user space mappings are arbitrarily | ||
117 | * modifiable. | ||
118 | * | ||
119 | * The assembly stub in the control code buffer is passed a linked list | ||
120 | * of descriptor pages detailing the source pages of the new kernel, | ||
121 | * and the destination addresses of those source pages. As this data | ||
122 | * structure is not used in the context of the current OS, it must | ||
123 | * be self-contained. | ||
124 | * | ||
125 | * The code has been made to work with highmem pages and will use a | ||
126 | * destination page in its final resting place (if it happens | ||
127 | * to allocate it). The end product of this is that most of the | ||
128 | * physical address space, and most of RAM can be used. | ||
129 | * | ||
130 | * Future directions include: | ||
131 | * - allocating a page table with the control code buffer identity | ||
132 | * mapped, to simplify machine_kexec and make kexec_on_panic more | ||
133 | * reliable. | ||
134 | */ | ||
135 | |||
136 | /* | ||
137 | * KIMAGE_NO_DEST is an impossible destination address, used for | ||
138 | * allocating pages whose destination address we do not care about. | ||
139 | */ | ||
140 | #define KIMAGE_NO_DEST (-1UL) | ||
141 | |||
142 | static struct page *kimage_alloc_page(struct kimage *image, | ||
143 | gfp_t gfp_mask, | ||
144 | unsigned long dest); | ||
145 | |||
146 | int sanity_check_segment_list(struct kimage *image) | ||
147 | { | ||
148 | int result, i; | ||
149 | unsigned long nr_segments = image->nr_segments; | ||
150 | |||
151 | /* | ||
152 | * Verify we have good destination addresses. The caller is | ||
153 | * responsible for making certain we don't attempt to load | ||
154 | * the new image into invalid or reserved areas of RAM. This | ||
155 | * just verifies it is an address we can use. | ||
156 | * | ||
157 | * Since the kernel does everything in page size chunks, ensure | ||
158 | * the destination addresses are page aligned. Too many | ||
159 | * special cases crop up when we don't do this. The most | ||
160 | * insidious is getting overlapping destination addresses | ||
161 | * simply because addresses are changed to page size | ||
162 | * granularity. | ||
163 | */ | ||
164 | result = -EADDRNOTAVAIL; | ||
165 | for (i = 0; i < nr_segments; i++) { | ||
166 | unsigned long mstart, mend; | ||
167 | |||
168 | mstart = image->segment[i].mem; | ||
169 | mend = mstart + image->segment[i].memsz; | ||
170 | if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) | ||
171 | return result; | ||
172 | if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) | ||
173 | return result; | ||
174 | } | ||
175 | |||
176 | /* Verify our destination addresses do not overlap. | ||
177 | * If we allowed overlapping destination addresses | ||
178 | * through, very weird things can happen with no | ||
179 | * easy explanation as one segment stops on another. | ||
180 | */ | ||
181 | result = -EINVAL; | ||
182 | for (i = 0; i < nr_segments; i++) { | ||
183 | unsigned long mstart, mend; | ||
184 | unsigned long j; | ||
185 | |||
186 | mstart = image->segment[i].mem; | ||
187 | mend = mstart + image->segment[i].memsz; | ||
188 | for (j = 0; j < i; j++) { | ||
189 | unsigned long pstart, pend; | ||
190 | |||
191 | pstart = image->segment[j].mem; | ||
192 | pend = pstart + image->segment[j].memsz; | ||
193 | /* Do the segments overlap ? */ | ||
194 | if ((mend > pstart) && (mstart < pend)) | ||
195 | return result; | ||
196 | } | ||
197 | } | ||
198 | |||
199 | /* Ensure our buffer sizes do not exceed | ||
200 | * our memory sizes. This should always be the case, | ||
201 | * and it is easier to check up front than to be surprised | ||
202 | * later on. | ||
203 | */ | ||
204 | result = -EINVAL; | ||
205 | for (i = 0; i < nr_segments; i++) { | ||
206 | if (image->segment[i].bufsz > image->segment[i].memsz) | ||
207 | return result; | ||
208 | } | ||
209 | |||
210 | /* | ||
211 | * Verify we have good destination addresses. Normally | ||
212 | * the caller is responsible for making certain we don't | ||
213 | * attempt to load the new image into invalid or reserved | ||
214 | * areas of RAM. But crash kernels are preloaded into a | ||
215 | * reserved area of RAM. We must ensure the addresses | ||
216 | * are in the reserved area, otherwise preloading the | ||
217 | * kernel could corrupt things. | ||
218 | */ | ||
219 | |||
220 | if (image->type == KEXEC_TYPE_CRASH) { | ||
221 | result = -EADDRNOTAVAIL; | ||
222 | for (i = 0; i < nr_segments; i++) { | ||
223 | unsigned long mstart, mend; | ||
224 | |||
225 | mstart = image->segment[i].mem; | ||
226 | mend = mstart + image->segment[i].memsz - 1; | ||
227 | /* Ensure we are within the crash kernel limits */ | ||
228 | if ((mstart < crashk_res.start) || | ||
229 | (mend > crashk_res.end)) | ||
230 | return result; | ||
231 | } | ||
232 | } | ||
233 | |||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | struct kimage *do_kimage_alloc_init(void) | ||
238 | { | ||
239 | struct kimage *image; | ||
240 | |||
241 | /* Allocate a controlling structure */ | ||
242 | image = kzalloc(sizeof(*image), GFP_KERNEL); | ||
243 | if (!image) | ||
244 | return NULL; | ||
245 | |||
246 | image->head = 0; | ||
247 | image->entry = &image->head; | ||
248 | image->last_entry = &image->head; | ||
249 | image->control_page = ~0; /* By default this does not apply */ | ||
250 | image->type = KEXEC_TYPE_DEFAULT; | ||
251 | |||
252 | /* Initialize the list of control pages */ | ||
253 | INIT_LIST_HEAD(&image->control_pages); | ||
254 | |||
255 | /* Initialize the list of destination pages */ | ||
256 | INIT_LIST_HEAD(&image->dest_pages); | ||
257 | |||
258 | /* Initialize the list of unusable pages */ | ||
259 | INIT_LIST_HEAD(&image->unusable_pages); | ||
260 | |||
261 | return image; | ||
262 | } | ||
263 | |||
264 | int kimage_is_destination_range(struct kimage *image, | ||
265 | unsigned long start, | ||
266 | unsigned long end) | ||
267 | { | ||
268 | unsigned long i; | ||
269 | |||
270 | for (i = 0; i < image->nr_segments; i++) { | ||
271 | unsigned long mstart, mend; | ||
272 | |||
273 | mstart = image->segment[i].mem; | ||
274 | mend = mstart + image->segment[i].memsz; | ||
275 | if ((end > mstart) && (start < mend)) | ||
276 | return 1; | ||
277 | } | ||
278 | |||
279 | return 0; | ||
280 | } | ||
281 | |||
282 | static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) | ||
283 | { | ||
284 | struct page *pages; | ||
285 | |||
286 | pages = alloc_pages(gfp_mask, order); | ||
287 | if (pages) { | ||
288 | unsigned int count, i; | ||
289 | |||
290 | pages->mapping = NULL; | ||
291 | set_page_private(pages, order); | ||
292 | count = 1 << order; | ||
293 | for (i = 0; i < count; i++) | ||
294 | SetPageReserved(pages + i); | ||
295 | } | ||
296 | |||
297 | return pages; | ||
298 | } | ||
299 | |||
300 | static void kimage_free_pages(struct page *page) | ||
301 | { | ||
302 | unsigned int order, count, i; | ||
303 | |||
304 | order = page_private(page); | ||
305 | count = 1 << order; | ||
306 | for (i = 0; i < count; i++) | ||
307 | ClearPageReserved(page + i); | ||
308 | __free_pages(page, order); | ||
309 | } | ||
310 | |||
311 | void kimage_free_page_list(struct list_head *list) | ||
312 | { | ||
313 | struct list_head *pos, *next; | ||
314 | |||
315 | list_for_each_safe(pos, next, list) { | ||
316 | struct page *page; | ||
317 | |||
318 | page = list_entry(pos, struct page, lru); | ||
319 | list_del(&page->lru); | ||
320 | kimage_free_pages(page); | ||
321 | } | ||
322 | } | ||
323 | |||
324 | static struct page *kimage_alloc_normal_control_pages(struct kimage *image, | ||
325 | unsigned int order) | ||
326 | { | ||
327 | /* Control pages are special: they are the intermediaries | ||
328 | * that are needed while we copy the rest of the pages | ||
329 | * to their final resting place. As such they must | ||
330 | * not conflict with either the destination addresses | ||
331 | * or memory the kernel is already using. | ||
332 | * | ||
333 | * The only case where we really need more than one of | ||
334 | * these is for architectures where we cannot disable | ||
335 | * the MMU and must instead generate an identity mapped | ||
336 | * page table for all of the memory. | ||
337 | * | ||
338 | * At worst this runs in O(N) of the image size. | ||
339 | */ | ||
340 | struct list_head extra_pages; | ||
341 | struct page *pages; | ||
342 | unsigned int count; | ||
343 | |||
344 | count = 1 << order; | ||
345 | INIT_LIST_HEAD(&extra_pages); | ||
346 | |||
347 | /* Loop while I can allocate a page and the page allocated | ||
348 | * is a destination page. | ||
349 | */ | ||
350 | do { | ||
351 | unsigned long pfn, epfn, addr, eaddr; | ||
352 | |||
353 | pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); | ||
354 | if (!pages) | ||
355 | break; | ||
356 | pfn = page_to_pfn(pages); | ||
357 | epfn = pfn + count; | ||
358 | addr = pfn << PAGE_SHIFT; | ||
359 | eaddr = epfn << PAGE_SHIFT; | ||
360 | if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || | ||
361 | kimage_is_destination_range(image, addr, eaddr)) { | ||
362 | list_add(&pages->lru, &extra_pages); | ||
363 | pages = NULL; | ||
364 | } | ||
365 | } while (!pages); | ||
366 | |||
367 | if (pages) { | ||
368 | /* Remember the allocated page... */ | ||
369 | list_add(&pages->lru, &image->control_pages); | ||
370 | |||
371 | /* Because the page is already in its destination | ||
372 | * location we will never allocate another page at | ||
373 | * that address. Therefore kimage_alloc_pages | ||
374 | * will not return it (again) and we don't need | ||
375 | * to give it an entry in image->segment[]. | ||
376 | */ | ||
377 | } | ||
378 | /* Deal with the destination pages I have inadvertently allocated. | ||
379 | * | ||
380 | * Ideally I would convert multi-page allocations into single | ||
381 | * page allocations, and add everything to image->dest_pages. | ||
382 | * | ||
383 | * For now it is simpler to just free the pages. | ||
384 | */ | ||
385 | kimage_free_page_list(&extra_pages); | ||
386 | |||
387 | return pages; | ||
388 | } | ||
389 | |||
390 | static struct page *kimage_alloc_crash_control_pages(struct kimage *image, | ||
391 | unsigned int order) | ||
392 | { | ||
393 | /* Control pages are special: they are the intermediaries | ||
394 | * that are needed while we copy the rest of the pages | ||
395 | * to their final resting place. As such they must | ||
396 | * not conflict with either the destination addresses | ||
397 | * or memory the kernel is already using. | ||
398 | * | ||
399 | * Control pages are also the only pages we must allocate | ||
400 | * when loading a crash kernel. All of the other pages | ||
401 | * are specified by the segments and we just memcpy | ||
402 | * into them directly. | ||
403 | * | ||
404 | * The only case where we really need more than one of | ||
405 | * these is for architectures where we cannot disable | ||
406 | * the MMU and must instead generate an identity mapped | ||
407 | * page table for all of the memory. | ||
408 | * | ||
409 | * Given the low demand this implements a very simple | ||
410 | * allocator that finds the first hole of the appropriate | ||
411 | * size in the reserved memory region, and allocates all | ||
412 | * of the memory up to and including the hole. | ||
413 | */ | ||
414 | unsigned long hole_start, hole_end, size; | ||
415 | struct page *pages; | ||
416 | |||
417 | pages = NULL; | ||
418 | size = (1 << order) << PAGE_SHIFT; | ||
419 | hole_start = (image->control_page + (size - 1)) & ~(size - 1); | ||
420 | hole_end = hole_start + size - 1; | ||
421 | while (hole_end <= crashk_res.end) { | ||
422 | unsigned long i; | ||
423 | |||
424 | if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) | ||
425 | break; | ||
426 | /* See if I overlap any of the segments */ | ||
427 | for (i = 0; i < image->nr_segments; i++) { | ||
428 | unsigned long mstart, mend; | ||
429 | |||
430 | mstart = image->segment[i].mem; | ||
431 | mend = mstart + image->segment[i].memsz - 1; | ||
432 | if ((hole_end >= mstart) && (hole_start <= mend)) { | ||
433 | /* Advance the hole to the end of the segment */ | ||
434 | hole_start = (mend + (size - 1)) & ~(size - 1); | ||
435 | hole_end = hole_start + size - 1; | ||
436 | break; | ||
437 | } | ||
438 | } | ||
439 | /* If I don't overlap any segments I have found my hole! */ | ||
440 | if (i == image->nr_segments) { | ||
441 | pages = pfn_to_page(hole_start >> PAGE_SHIFT); | ||
442 | break; | ||
443 | } | ||
444 | } | ||
445 | if (pages) | ||
446 | image->control_page = hole_end; | ||
447 | |||
448 | return pages; | ||
449 | } | ||
450 | |||
451 | |||
452 | struct page *kimage_alloc_control_pages(struct kimage *image, | ||
453 | unsigned int order) | ||
454 | { | ||
455 | struct page *pages = NULL; | ||
456 | |||
457 | switch (image->type) { | ||
458 | case KEXEC_TYPE_DEFAULT: | ||
459 | pages = kimage_alloc_normal_control_pages(image, order); | ||
460 | break; | ||
461 | case KEXEC_TYPE_CRASH: | ||
462 | pages = kimage_alloc_crash_control_pages(image, order); | ||
463 | break; | ||
464 | } | ||
465 | |||
466 | return pages; | ||
467 | } | ||
468 | |||
469 | static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) | ||
470 | { | ||
471 | if (*image->entry != 0) | ||
472 | image->entry++; | ||
473 | |||
474 | if (image->entry == image->last_entry) { | ||
475 | kimage_entry_t *ind_page; | ||
476 | struct page *page; | ||
477 | |||
478 | page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); | ||
479 | if (!page) | ||
480 | return -ENOMEM; | ||
481 | |||
482 | ind_page = page_address(page); | ||
483 | *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; | ||
484 | image->entry = ind_page; | ||
485 | image->last_entry = ind_page + | ||
486 | ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); | ||
487 | } | ||
488 | *image->entry = entry; | ||
489 | image->entry++; | ||
490 | *image->entry = 0; | ||
491 | |||
492 | return 0; | ||
493 | } | ||
494 | |||
495 | static int kimage_set_destination(struct kimage *image, | ||
496 | unsigned long destination) | ||
497 | { | ||
498 | int result; | ||
499 | |||
500 | destination &= PAGE_MASK; | ||
501 | result = kimage_add_entry(image, destination | IND_DESTINATION); | ||
502 | |||
503 | return result; | ||
504 | } | ||
505 | |||
506 | |||
507 | static int kimage_add_page(struct kimage *image, unsigned long page) | ||
508 | { | ||
509 | int result; | ||
510 | |||
511 | page &= PAGE_MASK; | ||
512 | result = kimage_add_entry(image, page | IND_SOURCE); | ||
513 | |||
514 | return result; | ||
515 | } | ||
516 | |||
517 | |||
518 | static void kimage_free_extra_pages(struct kimage *image) | ||
519 | { | ||
520 | /* Walk through and free any extra destination pages I may have */ | ||
521 | kimage_free_page_list(&image->dest_pages); | ||
522 | |||
523 | /* Walk through and free any unusable pages I have cached */ | ||
524 | kimage_free_page_list(&image->unusable_pages); | ||
525 | |||
526 | } | ||
527 | void kimage_terminate(struct kimage *image) | ||
528 | { | ||
529 | if (*image->entry != 0) | ||
530 | image->entry++; | ||
531 | |||
532 | *image->entry = IND_DONE; | ||
533 | } | ||
534 | |||
535 | #define for_each_kimage_entry(image, ptr, entry) \ | ||
536 | for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ | ||
537 | ptr = (entry & IND_INDIRECTION) ? \ | ||
538 | phys_to_virt((entry & PAGE_MASK)) : ptr + 1) | ||
539 | |||
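As a reading aid for the traversal macro above, here is a hypothetical dump helper; it is not part of the patch and only builds on definitions already present in this file plus the IND_* flags from <linux/kexec.h>:

```c
/*
 * Sketch only: print the entry list in the order the assembly stub
 * will see it.  Each entry is a physical address with low flag bits:
 * IND_DESTINATION starts a run of contiguous destination pages,
 * IND_SOURCE names a page whose contents belong in the next
 * destination slot, IND_INDIRECTION points at the next page of
 * entries, and IND_DONE (consumed by the loop condition) ends the list.
 */
static void __maybe_unused dump_kimage_entries(struct kimage *image)
{
	kimage_entry_t *ptr, entry;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			pr_info("dest 0x%lx\n", entry & PAGE_MASK);
		else if (entry & IND_INDIRECTION)
			pr_info("ind  0x%lx\n", entry & PAGE_MASK);
		else if (entry & IND_SOURCE)
			pr_info("src  0x%lx\n", entry & PAGE_MASK);
	}
}
```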
540 | static void kimage_free_entry(kimage_entry_t entry) | ||
541 | { | ||
542 | struct page *page; | ||
543 | |||
544 | page = pfn_to_page(entry >> PAGE_SHIFT); | ||
545 | kimage_free_pages(page); | ||
546 | } | ||
547 | |||
548 | void kimage_free(struct kimage *image) | ||
549 | { | ||
550 | kimage_entry_t *ptr, entry; | ||
551 | kimage_entry_t ind = 0; | ||
552 | |||
553 | if (!image) | ||
554 | return; | ||
555 | |||
556 | kimage_free_extra_pages(image); | ||
557 | for_each_kimage_entry(image, ptr, entry) { | ||
558 | if (entry & IND_INDIRECTION) { | ||
559 | /* Free the previous indirection page */ | ||
560 | if (ind & IND_INDIRECTION) | ||
561 | kimage_free_entry(ind); | ||
562 | /* Save this indirection page until we are | ||
563 | * done with it. | ||
564 | */ | ||
565 | ind = entry; | ||
566 | } else if (entry & IND_SOURCE) | ||
567 | kimage_free_entry(entry); | ||
568 | } | ||
569 | /* Free the final indirection page */ | ||
570 | if (ind & IND_INDIRECTION) | ||
571 | kimage_free_entry(ind); | ||
572 | |||
573 | /* Handle any machine specific cleanup */ | ||
574 | machine_kexec_cleanup(image); | ||
575 | |||
576 | /* Free the kexec control pages... */ | ||
577 | kimage_free_page_list(&image->control_pages); | ||
578 | |||
579 | /* | ||
580 | * Free up any temporary buffers allocated. This might hit if | ||
581 | * an error occurred much later after buffer allocation. | ||
582 | */ | ||
583 | if (image->file_mode) | ||
584 | kimage_file_post_load_cleanup(image); | ||
585 | |||
586 | kfree(image); | ||
587 | } | ||
588 | |||
589 | static kimage_entry_t *kimage_dst_used(struct kimage *image, | ||
590 | unsigned long page) | ||
591 | { | ||
592 | kimage_entry_t *ptr, entry; | ||
593 | unsigned long destination = 0; | ||
594 | |||
595 | for_each_kimage_entry(image, ptr, entry) { | ||
596 | if (entry & IND_DESTINATION) | ||
597 | destination = entry & PAGE_MASK; | ||
598 | else if (entry & IND_SOURCE) { | ||
599 | if (page == destination) | ||
600 | return ptr; | ||
601 | destination += PAGE_SIZE; | ||
602 | } | ||
603 | } | ||
604 | |||
605 | return NULL; | ||
606 | } | ||
607 | |||
608 | static struct page *kimage_alloc_page(struct kimage *image, | ||
609 | gfp_t gfp_mask, | ||
610 | unsigned long destination) | ||
611 | { | ||
612 | /* | ||
613 | * Here we implement safeguards to ensure that a source page | ||
614 | * is not copied to its destination page before the data on | ||
615 | * the destination page is no longer useful. | ||
616 | * | ||
617 | * To do this we maintain the invariant that a source page is | ||
618 | * either its own destination page, or it is not a | ||
619 | * destination page at all. | ||
620 | * | ||
621 | * That is slightly stronger than required, but the proof | ||
622 | * that no problems can occur is trivial, and the | ||
623 | * implementation is simple to verify. | ||
624 | * | ||
625 | * When allocating all pages normally this algorithm will run | ||
626 | * in O(N) time, but in the worst case it will run in O(N^2) | ||
627 | * time. If the runtime is a problem the data structures can | ||
628 | * be fixed. | ||
629 | */ | ||
630 | struct page *page; | ||
631 | unsigned long addr; | ||
632 | |||
633 | /* | ||
634 | * Walk through the list of destination pages, and see if I | ||
635 | * have a match. | ||
636 | */ | ||
637 | list_for_each_entry(page, &image->dest_pages, lru) { | ||
638 | addr = page_to_pfn(page) << PAGE_SHIFT; | ||
639 | if (addr == destination) { | ||
640 | list_del(&page->lru); | ||
641 | return page; | ||
642 | } | ||
643 | } | ||
644 | page = NULL; | ||
645 | while (1) { | ||
646 | kimage_entry_t *old; | ||
647 | |||
648 | /* Allocate a page, if we run out of memory give up */ | ||
649 | page = kimage_alloc_pages(gfp_mask, 0); | ||
650 | if (!page) | ||
651 | return NULL; | ||
652 | /* If the page cannot be used, file it away */ | ||
653 | if (page_to_pfn(page) > | ||
654 | (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { | ||
655 | list_add(&page->lru, &image->unusable_pages); | ||
656 | continue; | ||
657 | } | ||
658 | addr = page_to_pfn(page) << PAGE_SHIFT; | ||
659 | |||
660 | /* If it is the destination page we want, use it */ | ||
661 | if (addr == destination) | ||
662 | break; | ||
663 | |||
664 | /* If the page is not a destination page use it */ | ||
665 | if (!kimage_is_destination_range(image, addr, | ||
666 | addr + PAGE_SIZE)) | ||
667 | break; | ||
668 | |||
669 | /* | ||
670 | * I know that the page is someone's destination page. | ||
671 | * See if there is already a source page for this | ||
672 | * destination page. And if so swap the source pages. | ||
673 | */ | ||
674 | old = kimage_dst_used(image, addr); | ||
675 | if (old) { | ||
676 | /* If so move it */ | ||
677 | unsigned long old_addr; | ||
678 | struct page *old_page; | ||
679 | |||
680 | old_addr = *old & PAGE_MASK; | ||
681 | old_page = pfn_to_page(old_addr >> PAGE_SHIFT); | ||
682 | copy_highpage(page, old_page); | ||
683 | *old = addr | (*old & ~PAGE_MASK); | ||
684 | |||
685 | /* The old page I have found cannot be a | ||
686 | * destination page, so return it if its | ||
687 | * gfp_flags honor the ones passed in. | ||
688 | */ | ||
689 | if (!(gfp_mask & __GFP_HIGHMEM) && | ||
690 | PageHighMem(old_page)) { | ||
691 | kimage_free_pages(old_page); | ||
692 | continue; | ||
693 | } | ||
694 | addr = old_addr; | ||
695 | page = old_page; | ||
696 | break; | ||
697 | } | ||
698 | /* Place the page on the destination list, to be used later */ | ||
699 | list_add(&page->lru, &image->dest_pages); | ||
700 | } | ||
701 | |||
702 | return page; | ||
703 | } | ||
704 | |||
705 | static int kimage_load_normal_segment(struct kimage *image, | ||
706 | struct kexec_segment *segment) | ||
707 | { | ||
708 | unsigned long maddr; | ||
709 | size_t ubytes, mbytes; | ||
710 | int result; | ||
711 | unsigned char __user *buf = NULL; | ||
712 | unsigned char *kbuf = NULL; | ||
713 | |||
714 | result = 0; | ||
715 | if (image->file_mode) | ||
716 | kbuf = segment->kbuf; | ||
717 | else | ||
718 | buf = segment->buf; | ||
719 | ubytes = segment->bufsz; | ||
720 | mbytes = segment->memsz; | ||
721 | maddr = segment->mem; | ||
722 | |||
723 | result = kimage_set_destination(image, maddr); | ||
724 | if (result < 0) | ||
725 | goto out; | ||
726 | |||
727 | while (mbytes) { | ||
728 | struct page *page; | ||
729 | char *ptr; | ||
730 | size_t uchunk, mchunk; | ||
731 | |||
732 | page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); | ||
733 | if (!page) { | ||
734 | result = -ENOMEM; | ||
735 | goto out; | ||
736 | } | ||
737 | result = kimage_add_page(image, page_to_pfn(page) | ||
738 | << PAGE_SHIFT); | ||
739 | if (result < 0) | ||
740 | goto out; | ||
741 | |||
742 | ptr = kmap(page); | ||
743 | /* Start with a clear page */ | ||
744 | clear_page(ptr); | ||
745 | ptr += maddr & ~PAGE_MASK; | ||
746 | mchunk = min_t(size_t, mbytes, | ||
747 | PAGE_SIZE - (maddr & ~PAGE_MASK)); | ||
748 | uchunk = min(ubytes, mchunk); | ||
749 | |||
750 | /* For file based kexec, source pages are in kernel memory */ | ||
751 | if (image->file_mode) | ||
752 | memcpy(ptr, kbuf, uchunk); | ||
753 | else | ||
754 | result = copy_from_user(ptr, buf, uchunk); | ||
755 | kunmap(page); | ||
756 | if (result) { | ||
757 | result = -EFAULT; | ||
758 | goto out; | ||
759 | } | ||
760 | ubytes -= uchunk; | ||
761 | maddr += mchunk; | ||
762 | if (image->file_mode) | ||
763 | kbuf += mchunk; | ||
764 | else | ||
765 | buf += mchunk; | ||
766 | mbytes -= mchunk; | ||
767 | } | ||
768 | out: | ||
769 | return result; | ||
770 | } | ||
771 | |||
772 | static int kimage_load_crash_segment(struct kimage *image, | ||
773 | struct kexec_segment *segment) | ||
774 | { | ||
775 | /* For crash dump kernels we simply copy the data from | ||
776 | * user space to its destination. | ||
777 | * We do things a page at a time for the sake of kmap. | ||
778 | */ | ||
779 | unsigned long maddr; | ||
780 | size_t ubytes, mbytes; | ||
781 | int result; | ||
782 | unsigned char __user *buf = NULL; | ||
783 | unsigned char *kbuf = NULL; | ||
784 | |||
785 | result = 0; | ||
786 | if (image->file_mode) | ||
787 | kbuf = segment->kbuf; | ||
788 | else | ||
789 | buf = segment->buf; | ||
790 | ubytes = segment->bufsz; | ||
791 | mbytes = segment->memsz; | ||
792 | maddr = segment->mem; | ||
793 | while (mbytes) { | ||
794 | struct page *page; | ||
795 | char *ptr; | ||
796 | size_t uchunk, mchunk; | ||
797 | |||
798 | page = pfn_to_page(maddr >> PAGE_SHIFT); | ||
799 | if (!page) { | ||
800 | result = -ENOMEM; | ||
801 | goto out; | ||
802 | } | ||
803 | ptr = kmap(page); | ||
804 | ptr += maddr & ~PAGE_MASK; | ||
805 | mchunk = min_t(size_t, mbytes, | ||
806 | PAGE_SIZE - (maddr & ~PAGE_MASK)); | ||
807 | uchunk = min(ubytes, mchunk); | ||
808 | if (mchunk > uchunk) { | ||
809 | /* Zero the trailing part of the page */ | ||
810 | memset(ptr + uchunk, 0, mchunk - uchunk); | ||
811 | } | ||
812 | |||
813 | /* For file based kexec, source pages are in kernel memory */ | ||
814 | if (image->file_mode) | ||
815 | memcpy(ptr, kbuf, uchunk); | ||
816 | else | ||
817 | result = copy_from_user(ptr, buf, uchunk); | ||
818 | kexec_flush_icache_page(page); | ||
819 | kunmap(page); | ||
820 | if (result) { | ||
821 | result = -EFAULT; | ||
822 | goto out; | ||
823 | } | ||
824 | ubytes -= uchunk; | ||
825 | maddr += mchunk; | ||
826 | if (image->file_mode) | ||
827 | kbuf += mchunk; | ||
828 | else | ||
829 | buf += mchunk; | ||
830 | mbytes -= mchunk; | ||
831 | } | ||
832 | out: | ||
833 | return result; | ||
834 | } | ||
835 | |||
836 | int kimage_load_segment(struct kimage *image, | ||
837 | struct kexec_segment *segment) | ||
838 | { | ||
839 | int result = -ENOMEM; | ||
840 | |||
841 | switch (image->type) { | ||
842 | case KEXEC_TYPE_DEFAULT: | ||
843 | result = kimage_load_normal_segment(image, segment); | ||
844 | break; | ||
845 | case KEXEC_TYPE_CRASH: | ||
846 | result = kimage_load_crash_segment(image, segment); | ||
847 | break; | ||
848 | } | ||
849 | |||
850 | return result; | ||
851 | } | ||
852 | |||
853 | struct kimage *kexec_image; | ||
854 | struct kimage *kexec_crash_image; | ||
855 | int kexec_load_disabled; | ||
856 | |||
857 | void crash_kexec(struct pt_regs *regs) | ||
858 | { | ||
859 | /* Take the kexec_mutex here to prevent sys_kexec_load | ||
860 | * running on one cpu from replacing the crash kernel | ||
861 | * we are using after a panic on a different cpu. | ||
862 | * | ||
863 | * If the crash kernel was not located in a fixed area | ||
864 | * of memory the xchg(&kexec_crash_image) would be | ||
865 | * sufficient. But since I reuse the memory... | ||
866 | */ | ||
867 | if (mutex_trylock(&kexec_mutex)) { | ||
868 | if (kexec_crash_image) { | ||
869 | struct pt_regs fixed_regs; | ||
870 | |||
871 | crash_setup_regs(&fixed_regs, regs); | ||
872 | crash_save_vmcoreinfo(); | ||
873 | machine_crash_shutdown(&fixed_regs); | ||
874 | machine_kexec(kexec_crash_image); | ||
875 | } | ||
876 | mutex_unlock(&kexec_mutex); | ||
877 | } | ||
878 | } | ||
879 | |||
880 | size_t crash_get_memory_size(void) | ||
881 | { | ||
882 | size_t size = 0; | ||
883 | |||
884 | mutex_lock(&kexec_mutex); | ||
885 | if (crashk_res.end != crashk_res.start) | ||
886 | size = resource_size(&crashk_res); | ||
887 | mutex_unlock(&kexec_mutex); | ||
888 | return size; | ||
889 | } | ||
890 | |||
891 | void __weak crash_free_reserved_phys_range(unsigned long begin, | ||
892 | unsigned long end) | ||
893 | { | ||
894 | unsigned long addr; | ||
895 | |||
896 | for (addr = begin; addr < end; addr += PAGE_SIZE) | ||
897 | free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); | ||
898 | } | ||
899 | |||
900 | int crash_shrink_memory(unsigned long new_size) | ||
901 | { | ||
902 | int ret = 0; | ||
903 | unsigned long start, end; | ||
904 | unsigned long old_size; | ||
905 | struct resource *ram_res; | ||
906 | |||
907 | mutex_lock(&kexec_mutex); | ||
908 | |||
909 | if (kexec_crash_image) { | ||
910 | ret = -ENOENT; | ||
911 | goto unlock; | ||
912 | } | ||
913 | start = crashk_res.start; | ||
914 | end = crashk_res.end; | ||
915 | old_size = (end == 0) ? 0 : end - start + 1; | ||
916 | if (new_size >= old_size) { | ||
917 | ret = (new_size == old_size) ? 0 : -EINVAL; | ||
918 | goto unlock; | ||
919 | } | ||
920 | |||
921 | ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); | ||
922 | if (!ram_res) { | ||
923 | ret = -ENOMEM; | ||
924 | goto unlock; | ||
925 | } | ||
926 | |||
927 | start = roundup(start, KEXEC_CRASH_MEM_ALIGN); | ||
928 | end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); | ||
929 | |||
930 | crash_map_reserved_pages(); | ||
931 | crash_free_reserved_phys_range(end, crashk_res.end); | ||
932 | |||
933 | if ((start == end) && (crashk_res.parent != NULL)) | ||
934 | release_resource(&crashk_res); | ||
935 | |||
936 | ram_res->start = end; | ||
937 | ram_res->end = crashk_res.end; | ||
938 | ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; | ||
939 | ram_res->name = "System RAM"; | ||
940 | |||
941 | crashk_res.end = end - 1; | ||
942 | |||
943 | insert_resource(&iomem_resource, ram_res); | ||
944 | crash_unmap_reserved_pages(); | ||
945 | |||
946 | unlock: | ||
947 | mutex_unlock(&kexec_mutex); | ||
948 | return ret; | ||
949 | } | ||
950 | |||
951 | static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, | ||
952 | size_t data_len) | ||
953 | { | ||
954 | struct elf_note note; | ||
955 | |||
956 | note.n_namesz = strlen(name) + 1; | ||
957 | note.n_descsz = data_len; | ||
958 | note.n_type = type; | ||
959 | memcpy(buf, ¬e, sizeof(note)); | ||
960 | buf += (sizeof(note) + 3)/4; | ||
961 | memcpy(buf, name, note.n_namesz); | ||
962 | buf += (note.n_namesz + 3)/4; | ||
963 | memcpy(buf, data, note.n_descsz); | ||
964 | buf += (note.n_descsz + 3)/4; | ||
965 | |||
966 | return buf; | ||
967 | } | ||
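The repeated (x + 3)/4 arithmetic above rounds each piece of the note up to a whole number of 32-bit words, which is the alignment the ELF note format requires: a five-byte name such as "CORE" (KEXEC_CORE_NOTE_NAME plus its terminating NUL), for example, advances buf by two words and leaves the three pad bytes untouched.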
968 | |||
969 | static void final_note(u32 *buf) | ||
970 | { | ||
971 | struct elf_note note; | ||
972 | |||
973 | note.n_namesz = 0; | ||
974 | note.n_descsz = 0; | ||
975 | note.n_type = 0; | ||
976 | memcpy(buf, ¬e, sizeof(note)); | ||
977 | } | ||
978 | |||
979 | void crash_save_cpu(struct pt_regs *regs, int cpu) | ||
980 | { | ||
981 | struct elf_prstatus prstatus; | ||
982 | u32 *buf; | ||
983 | |||
984 | if ((cpu < 0) || (cpu >= nr_cpu_ids)) | ||
985 | return; | ||
986 | |||
987 | /* Using ELF notes here is opportunistic. | ||
988 | * I need a well defined structure format | ||
989 | * for the data I pass, and I need tags | ||
990 | * on the data to indicate what information I have | ||
991 | * squirrelled away. ELF notes happen to provide | ||
992 | * all of that, so there is no need to invent something new. | ||
993 | */ | ||
994 | buf = (u32 *)per_cpu_ptr(crash_notes, cpu); | ||
995 | if (!buf) | ||
996 | return; | ||
997 | memset(&prstatus, 0, sizeof(prstatus)); | ||
998 | prstatus.pr_pid = current->pid; | ||
999 | elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); | ||
1000 | buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, | ||
1001 | &prstatus, sizeof(prstatus)); | ||
1002 | final_note(buf); | ||
1003 | } | ||
1004 | |||
1005 | static int __init crash_notes_memory_init(void) | ||
1006 | { | ||
1007 | /* Allocate memory for saving cpu registers. */ | ||
1008 | crash_notes = alloc_percpu(note_buf_t); | ||
1009 | if (!crash_notes) { | ||
1010 | pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); | ||
1011 | return -ENOMEM; | ||
1012 | } | ||
1013 | return 0; | ||
1014 | } | ||
1015 | subsys_initcall(crash_notes_memory_init); | ||
1016 | |||
1017 | |||
1018 | /* | ||
1019 | * parsing the "crashkernel" commandline | ||
1020 | * | ||
1021 | * this code is intended to be called from architecture specific code | ||
1022 | */ | ||
1023 | |||
1024 | |||
1025 | /* | ||
1026 | * This function parses command lines in the format | ||
1027 | * | ||
1028 | * crashkernel=ramsize-range:size[,...][@offset] | ||
1029 | * | ||
1030 | * The function returns 0 on success and -EINVAL on failure. | ||
1031 | */ | ||
1032 | static int __init parse_crashkernel_mem(char *cmdline, | ||
1033 | unsigned long long system_ram, | ||
1034 | unsigned long long *crash_size, | ||
1035 | unsigned long long *crash_base) | ||
1036 | { | ||
1037 | char *cur = cmdline, *tmp; | ||
1038 | |||
1039 | /* for each entry of the comma-separated list */ | ||
1040 | do { | ||
1041 | unsigned long long start, end = ULLONG_MAX, size; | ||
1042 | |||
1043 | /* get the start of the range */ | ||
1044 | start = memparse(cur, &tmp); | ||
1045 | if (cur == tmp) { | ||
1046 | pr_warn("crashkernel: Memory value expected\n"); | ||
1047 | return -EINVAL; | ||
1048 | } | ||
1049 | cur = tmp; | ||
1050 | if (*cur != '-') { | ||
1051 | pr_warn("crashkernel: '-' expected\n"); | ||
1052 | return -EINVAL; | ||
1053 | } | ||
1054 | cur++; | ||
1055 | |||
1056 | /* if no ':' is here, then we read the end */ | ||
1057 | if (*cur != ':') { | ||
1058 | end = memparse(cur, &tmp); | ||
1059 | if (cur == tmp) { | ||
1060 | pr_warn("crashkernel: Memory value expected\n"); | ||
1061 | return -EINVAL; | ||
1062 | } | ||
1063 | cur = tmp; | ||
1064 | if (end <= start) { | ||
1065 | pr_warn("crashkernel: end <= start\n"); | ||
1066 | return -EINVAL; | ||
1067 | } | ||
1068 | } | ||
1069 | |||
1070 | if (*cur != ':') { | ||
1071 | pr_warn("crashkernel: ':' expected\n"); | ||
1072 | return -EINVAL; | ||
1073 | } | ||
1074 | cur++; | ||
1075 | |||
1076 | size = memparse(cur, &tmp); | ||
1077 | if (cur == tmp) { | ||
1078 | pr_warn("Memory value expected\n"); | ||
1079 | return -EINVAL; | ||
1080 | } | ||
1081 | cur = tmp; | ||
1082 | if (size >= system_ram) { | ||
1083 | pr_warn("crashkernel: invalid size\n"); | ||
1084 | return -EINVAL; | ||
1085 | } | ||
1086 | |||
1087 | /* match ? */ | ||
1088 | if (system_ram >= start && system_ram < end) { | ||
1089 | *crash_size = size; | ||
1090 | break; | ||
1091 | } | ||
1092 | } while (*cur++ == ','); | ||
1093 | |||
1094 | if (*crash_size > 0) { | ||
1095 | while (*cur && *cur != ' ' && *cur != '@') | ||
1096 | cur++; | ||
1097 | if (*cur == '@') { | ||
1098 | cur++; | ||
1099 | *crash_base = memparse(cur, &tmp); | ||
1100 | if (cur == tmp) { | ||
1101 | pr_warn("Memory value expected after '@'\n"); | ||
1102 | return -EINVAL; | ||
1103 | } | ||
1104 | } | ||
1105 | } | ||
1106 | |||
1107 | return 0; | ||
1108 | } | ||
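For example, crashkernel=512M-2G:64M,2G-:128M makes this parser reserve 64M of crash-kernel memory when system RAM falls in the 512M-2G range and 128M when there is 2G or more; an optional trailing @offset (handled just above) pins the base address.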
1109 | |||
1110 | /* | ||
1111 | * This function parses "simple" (old) crashkernel command lines like | ||
1112 | * | ||
1113 | * crashkernel=size[@offset] | ||
1114 | * | ||
1115 | * It returns 0 on success and -EINVAL on failure. | ||
1116 | */ | ||
1117 | static int __init parse_crashkernel_simple(char *cmdline, | ||
1118 | unsigned long long *crash_size, | ||
1119 | unsigned long long *crash_base) | ||
1120 | { | ||
1121 | char *cur = cmdline; | ||
1122 | |||
1123 | *crash_size = memparse(cmdline, &cur); | ||
1124 | if (cmdline == cur) { | ||
1125 | pr_warn("crashkernel: memory value expected\n"); | ||
1126 | return -EINVAL; | ||
1127 | } | ||
1128 | |||
1129 | if (*cur == '@') | ||
1130 | *crash_base = memparse(cur+1, &cur); | ||
1131 | else if (*cur != ' ' && *cur != '\0') { | ||
1132 | pr_warn("crashkernel: unrecognized char\n"); | ||
1133 | return -EINVAL; | ||
1134 | } | ||
1135 | |||
1136 | return 0; | ||
1137 | } | ||
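The simple form handled here would be, for example, crashkernel=128M@16M: reserve 128M starting at physical address 16M, or leave the base for the architecture code to choose when @offset is omitted.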
1138 | |||
1139 | #define SUFFIX_HIGH 0 | ||
1140 | #define SUFFIX_LOW 1 | ||
1141 | #define SUFFIX_NULL 2 | ||
1142 | static __initdata char *suffix_tbl[] = { | ||
1143 | [SUFFIX_HIGH] = ",high", | ||
1144 | [SUFFIX_LOW] = ",low", | ||
1145 | [SUFFIX_NULL] = NULL, | ||
1146 | }; | ||
1147 | |||
1148 | /* | ||
1149 | * This function parses "suffix" crashkernel command lines like | ||
1150 | * | ||
1151 | * crashkernel=size,[high|low] | ||
1152 | * | ||
1153 | * It returns 0 on success and -EINVAL on failure. | ||
1154 | */ | ||
1155 | static int __init parse_crashkernel_suffix(char *cmdline, | ||
1156 | unsigned long long *crash_size, | ||
1157 | const char *suffix) | ||
1158 | { | ||
1159 | char *cur = cmdline; | ||
1160 | |||
1161 | *crash_size = memparse(cmdline, &cur); | ||
1162 | if (cmdline == cur) { | ||
1163 | pr_warn("crashkernel: memory value expected\n"); | ||
1164 | return -EINVAL; | ||
1165 | } | ||
1166 | |||
1167 | /* check with suffix */ | ||
1168 | if (strncmp(cur, suffix, strlen(suffix))) { | ||
1169 | pr_warn("crashkernel: unrecognized char\n"); | ||
1170 | return -EINVAL; | ||
1171 | } | ||
1172 | cur += strlen(suffix); | ||
1173 | if (*cur != ' ' && *cur != '\0') { | ||
1174 | pr_warn("crashkernel: unrecognized char\n"); | ||
1175 | return -EINVAL; | ||
1176 | } | ||
1177 | |||
1178 | return 0; | ||
1179 | } | ||
1180 | |||
1181 | static __init char *get_last_crashkernel(char *cmdline, | ||
1182 | const char *name, | ||
1183 | const char *suffix) | ||
1184 | { | ||
1185 | char *p = cmdline, *ck_cmdline = NULL; | ||
1186 | |||
1187 | /* find crashkernel and use the last one if there are more */ | ||
1188 | p = strstr(p, name); | ||
1189 | while (p) { | ||
1190 | char *end_p = strchr(p, ' '); | ||
1191 | char *q; | ||
1192 | |||
1193 | if (!end_p) | ||
1194 | end_p = p + strlen(p); | ||
1195 | |||
1196 | if (!suffix) { | ||
1197 | int i; | ||
1198 | |||
1199 | /* skip the one with any known suffix */ | ||
1200 | for (i = 0; suffix_tbl[i]; i++) { | ||
1201 | q = end_p - strlen(suffix_tbl[i]); | ||
1202 | if (!strncmp(q, suffix_tbl[i], | ||
1203 | strlen(suffix_tbl[i]))) | ||
1204 | goto next; | ||
1205 | } | ||
1206 | ck_cmdline = p; | ||
1207 | } else { | ||
1208 | q = end_p - strlen(suffix); | ||
1209 | if (!strncmp(q, suffix, strlen(suffix))) | ||
1210 | ck_cmdline = p; | ||
1211 | } | ||
1212 | next: | ||
1213 | p = strstr(p+1, name); | ||
1214 | } | ||
1215 | |||
1216 | if (!ck_cmdline) | ||
1217 | return NULL; | ||
1218 | |||
1219 | return ck_cmdline; | ||
1220 | } | ||
1221 | |||
1222 | static int __init __parse_crashkernel(char *cmdline, | ||
1223 | unsigned long long system_ram, | ||
1224 | unsigned long long *crash_size, | ||
1225 | unsigned long long *crash_base, | ||
1226 | const char *name, | ||
1227 | const char *suffix) | ||
1228 | { | ||
1229 | char *first_colon, *first_space; | ||
1230 | char *ck_cmdline; | ||
1231 | |||
1232 | BUG_ON(!crash_size || !crash_base); | ||
1233 | *crash_size = 0; | ||
1234 | *crash_base = 0; | ||
1235 | |||
1236 | ck_cmdline = get_last_crashkernel(cmdline, name, suffix); | ||
1237 | |||
1238 | if (!ck_cmdline) | ||
1239 | return -EINVAL; | ||
1240 | |||
1241 | ck_cmdline += strlen(name); | ||
1242 | |||
1243 | if (suffix) | ||
1244 | return parse_crashkernel_suffix(ck_cmdline, crash_size, | ||
1245 | suffix); | ||
1246 | /* | ||
1247 | * if the commandline contains a ':', then that's the extended | ||
1248 | * syntax -- if not, it must be the classic syntax | ||
1249 | */ | ||
1250 | first_colon = strchr(ck_cmdline, ':'); | ||
1251 | first_space = strchr(ck_cmdline, ' '); | ||
1252 | if (first_colon && (!first_space || first_colon < first_space)) | ||
1253 | return parse_crashkernel_mem(ck_cmdline, system_ram, | ||
1254 | crash_size, crash_base); | ||
1255 | |||
1256 | return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); | ||
1257 | } | ||
1258 | |||
1259 | /* | ||
1260 | * This function is the entry point for command line parsing and should be | ||
1261 | * called from the arch-specific code. | ||
1262 | */ | ||
1263 | int __init parse_crashkernel(char *cmdline, | ||
1264 | unsigned long long system_ram, | ||
1265 | unsigned long long *crash_size, | ||
1266 | unsigned long long *crash_base) | ||
1267 | { | ||
1268 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
1269 | "crashkernel=", NULL); | ||
1270 | } | ||
1271 | |||
1272 | int __init parse_crashkernel_high(char *cmdline, | ||
1273 | unsigned long long system_ram, | ||
1274 | unsigned long long *crash_size, | ||
1275 | unsigned long long *crash_base) | ||
1276 | { | ||
1277 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
1278 | "crashkernel=", suffix_tbl[SUFFIX_HIGH]); | ||
1279 | } | ||
1280 | |||
1281 | int __init parse_crashkernel_low(char *cmdline, | ||
1282 | unsigned long long system_ram, | ||
1283 | unsigned long long *crash_size, | ||
1284 | unsigned long long *crash_base) | ||
1285 | { | ||
1286 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
1287 | "crashkernel=", suffix_tbl[SUFFIX_LOW]); | ||
1288 | } | ||
1289 | |||
1290 | static void update_vmcoreinfo_note(void) | ||
1291 | { | ||
1292 | u32 *buf = vmcoreinfo_note; | ||
1293 | |||
1294 | if (!vmcoreinfo_size) | ||
1295 | return; | ||
1296 | buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, | ||
1297 | vmcoreinfo_size); | ||
1298 | final_note(buf); | ||
1299 | } | ||
1300 | |||
1301 | void crash_save_vmcoreinfo(void) | ||
1302 | { | ||
1303 | vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); | ||
1304 | update_vmcoreinfo_note(); | ||
1305 | } | ||
1306 | |||
1307 | void vmcoreinfo_append_str(const char *fmt, ...) | ||
1308 | { | ||
1309 | va_list args; | ||
1310 | char buf[0x50]; | ||
1311 | size_t r; | ||
1312 | |||
1313 | va_start(args, fmt); | ||
1314 | r = vscnprintf(buf, sizeof(buf), fmt, args); | ||
1315 | va_end(args); | ||
1316 | |||
1317 | r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); | ||
1318 | |||
1319 | memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); | ||
1320 | |||
1321 | vmcoreinfo_size += r; | ||
1322 | } | ||
1323 | |||
1324 | /* | ||
1325 | * provide an empty default implementation here -- architecture | ||
1326 | * code may override this | ||
1327 | */ | ||
1328 | void __weak arch_crash_save_vmcoreinfo(void) | ||
1329 | {} | ||
1330 | |||
1331 | unsigned long __weak paddr_vmcoreinfo_note(void) | ||
1332 | { | ||
1333 | return __pa((unsigned long)(char *)&vmcoreinfo_note); | ||
1334 | } | ||
1335 | |||
1336 | static int __init crash_save_vmcoreinfo_init(void) | ||
1337 | { | ||
1338 | VMCOREINFO_OSRELEASE(init_uts_ns.name.release); | ||
1339 | VMCOREINFO_PAGESIZE(PAGE_SIZE); | ||
1340 | |||
1341 | VMCOREINFO_SYMBOL(init_uts_ns); | ||
1342 | VMCOREINFO_SYMBOL(node_online_map); | ||
1343 | #ifdef CONFIG_MMU | ||
1344 | VMCOREINFO_SYMBOL(swapper_pg_dir); | ||
1345 | #endif | ||
1346 | VMCOREINFO_SYMBOL(_stext); | ||
1347 | VMCOREINFO_SYMBOL(vmap_area_list); | ||
1348 | |||
1349 | #ifndef CONFIG_NEED_MULTIPLE_NODES | ||
1350 | VMCOREINFO_SYMBOL(mem_map); | ||
1351 | VMCOREINFO_SYMBOL(contig_page_data); | ||
1352 | #endif | ||
1353 | #ifdef CONFIG_SPARSEMEM | ||
1354 | VMCOREINFO_SYMBOL(mem_section); | ||
1355 | VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); | ||
1356 | VMCOREINFO_STRUCT_SIZE(mem_section); | ||
1357 | VMCOREINFO_OFFSET(mem_section, section_mem_map); | ||
1358 | #endif | ||
1359 | VMCOREINFO_STRUCT_SIZE(page); | ||
1360 | VMCOREINFO_STRUCT_SIZE(pglist_data); | ||
1361 | VMCOREINFO_STRUCT_SIZE(zone); | ||
1362 | VMCOREINFO_STRUCT_SIZE(free_area); | ||
1363 | VMCOREINFO_STRUCT_SIZE(list_head); | ||
1364 | VMCOREINFO_SIZE(nodemask_t); | ||
1365 | VMCOREINFO_OFFSET(page, flags); | ||
1366 | VMCOREINFO_OFFSET(page, _count); | ||
1367 | VMCOREINFO_OFFSET(page, mapping); | ||
1368 | VMCOREINFO_OFFSET(page, lru); | ||
1369 | VMCOREINFO_OFFSET(page, _mapcount); | ||
1370 | VMCOREINFO_OFFSET(page, private); | ||
1371 | VMCOREINFO_OFFSET(pglist_data, node_zones); | ||
1372 | VMCOREINFO_OFFSET(pglist_data, nr_zones); | ||
1373 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | ||
1374 | VMCOREINFO_OFFSET(pglist_data, node_mem_map); | ||
1375 | #endif | ||
1376 | VMCOREINFO_OFFSET(pglist_data, node_start_pfn); | ||
1377 | VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); | ||
1378 | VMCOREINFO_OFFSET(pglist_data, node_id); | ||
1379 | VMCOREINFO_OFFSET(zone, free_area); | ||
1380 | VMCOREINFO_OFFSET(zone, vm_stat); | ||
1381 | VMCOREINFO_OFFSET(zone, spanned_pages); | ||
1382 | VMCOREINFO_OFFSET(free_area, free_list); | ||
1383 | VMCOREINFO_OFFSET(list_head, next); | ||
1384 | VMCOREINFO_OFFSET(list_head, prev); | ||
1385 | VMCOREINFO_OFFSET(vmap_area, va_start); | ||
1386 | VMCOREINFO_OFFSET(vmap_area, list); | ||
1387 | VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); | ||
1388 | log_buf_kexec_setup(); | ||
1389 | VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); | ||
1390 | VMCOREINFO_NUMBER(NR_FREE_PAGES); | ||
1391 | VMCOREINFO_NUMBER(PG_lru); | ||
1392 | VMCOREINFO_NUMBER(PG_private); | ||
1393 | VMCOREINFO_NUMBER(PG_swapcache); | ||
1394 | VMCOREINFO_NUMBER(PG_slab); | ||
1395 | #ifdef CONFIG_MEMORY_FAILURE | ||
1396 | VMCOREINFO_NUMBER(PG_hwpoison); | ||
1397 | #endif | ||
1398 | VMCOREINFO_NUMBER(PG_head_mask); | ||
1399 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); | ||
1400 | #ifdef CONFIG_HUGETLBFS | ||
1401 | VMCOREINFO_SYMBOL(free_huge_page); | ||
1402 | #endif | ||
1403 | |||
1404 | arch_crash_save_vmcoreinfo(); | ||
1405 | update_vmcoreinfo_note(); | ||
1406 | |||
1407 | return 0; | ||
1408 | } | ||
1409 | |||
1410 | subsys_initcall(crash_save_vmcoreinfo_init); | ||
1411 | |||
1412 | /* | ||
1413 | * Move into place and start executing a preloaded standalone | ||
1414 | * executable. If nothing was preloaded return an error. | ||
1415 | */ | ||
1416 | int kernel_kexec(void) | ||
1417 | { | ||
1418 | int error = 0; | ||
1419 | |||
1420 | if (!mutex_trylock(&kexec_mutex)) | ||
1421 | return -EBUSY; | ||
1422 | if (!kexec_image) { | ||
1423 | error = -EINVAL; | ||
1424 | goto Unlock; | ||
1425 | } | ||
1426 | |||
1427 | #ifdef CONFIG_KEXEC_JUMP | ||
1428 | if (kexec_image->preserve_context) { | ||
1429 | lock_system_sleep(); | ||
1430 | pm_prepare_console(); | ||
1431 | error = freeze_processes(); | ||
1432 | if (error) { | ||
1433 | error = -EBUSY; | ||
1434 | goto Restore_console; | ||
1435 | } | ||
1436 | suspend_console(); | ||
1437 | error = dpm_suspend_start(PMSG_FREEZE); | ||
1438 | if (error) | ||
1439 | goto Resume_console; | ||
1440 | /* At this point, dpm_suspend_start() has been called, | ||
1441 | * but *not* dpm_suspend_end(). We *must* call | ||
1442 | * dpm_suspend_end() now. Otherwise, drivers for | ||
1443 | * some devices (e.g. interrupt controllers) become | ||
1444 | * desynchronized with the actual state of the | ||
1445 | * hardware at resume time, and evil weirdness ensues. | ||
1446 | */ | ||
1447 | error = dpm_suspend_end(PMSG_FREEZE); | ||
1448 | if (error) | ||
1449 | goto Resume_devices; | ||
1450 | error = disable_nonboot_cpus(); | ||
1451 | if (error) | ||
1452 | goto Enable_cpus; | ||
1453 | local_irq_disable(); | ||
1454 | error = syscore_suspend(); | ||
1455 | if (error) | ||
1456 | goto Enable_irqs; | ||
1457 | } else | ||
1458 | #endif | ||
1459 | { | ||
1460 | kexec_in_progress = true; | ||
1461 | kernel_restart_prepare(NULL); | ||
1462 | migrate_to_reboot_cpu(); | ||
1463 | |||
1464 | /* | ||
1465 | * migrate_to_reboot_cpu() disables CPU hotplug assuming that | ||
1466 | * no further code needs to use CPU hotplug (which is true in | ||
1467 | * the reboot case). However, the kexec path depends on using | ||
1468 | * CPU hotplug again; so re-enable it here. | ||
1469 | */ | ||
1470 | cpu_hotplug_enable(); | ||
1471 | pr_emerg("Starting new kernel\n"); | ||
1472 | machine_shutdown(); | ||
1473 | } | ||
1474 | |||
1475 | machine_kexec(kexec_image); | ||
1476 | |||
1477 | #ifdef CONFIG_KEXEC_JUMP | ||
1478 | if (kexec_image->preserve_context) { | ||
1479 | syscore_resume(); | ||
1480 | Enable_irqs: | ||
1481 | local_irq_enable(); | ||
1482 | Enable_cpus: | ||
1483 | enable_nonboot_cpus(); | ||
1484 | dpm_resume_start(PMSG_RESTORE); | ||
1485 | Resume_devices: | ||
1486 | dpm_resume_end(PMSG_RESTORE); | ||
1487 | Resume_console: | ||
1488 | resume_console(); | ||
1489 | thaw_processes(); | ||
1490 | Restore_console: | ||
1491 | pm_restore_console(); | ||
1492 | unlock_system_sleep(); | ||
1493 | } | ||
1494 | #endif | ||
1495 | |||
1496 | Unlock: | ||
1497 | mutex_unlock(&kexec_mutex); | ||
1498 | return error; | ||
1499 | } | ||
1500 | |||
1501 | /* | ||
1502 | * Add and remove page tables for crashkernel memory | ||
1503 | * | ||
1504 | * Provide an empty default implementation here -- architecture | ||
1505 | * code may override this | ||
1506 | */ | ||
1507 | void __weak crash_map_reserved_pages(void) | ||
1508 | {} | ||
1509 | |||
1510 | void __weak crash_unmap_reserved_pages(void) | ||
1511 | {} | ||