aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorEric W. Biederman <ebiederm@xmission.com>2005-06-25 17:57:52 -0400
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-06-25 19:24:48 -0400
commitdc009d92435f99498cbc579ce76bf28e837e2c14 (patch)
tree2ba8732b28225593d996b8faa079dc6ab4bbc9bc /kernel
parentd0537508a9921efced238b20967e50e519ac34af (diff)
[PATCH] kexec: add kexec syscalls
This patch introduces the architecture independent implementation of the sys_kexec_load and compat_sys_kexec_load system calls. Kexec on panic support has been integrated into the core patch and is relatively clean. In addition the hopefully architecture independent option crashkernel=size@location has been documented. Its purpose is to reserve space for the panic kernel to live, where no DMA transfer will ever be set up to access it. Signed-off-by: Eric Biederman <ebiederm@xmission.com> Signed-off-by: Alexander Nyberg <alexn@telia.com> Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/kexec.c1036
-rw-r--r--kernel/panic.c23
-rw-r--r--kernel/sys.c20
-rw-r--r--kernel/sys_ni.c2
5 files changed, 1080 insertions, 2 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index b01d26fe8db7..cfc8b0dea950 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_MODULES) += module.o
17obj-$(CONFIG_KALLSYMS) += kallsyms.o 17obj-$(CONFIG_KALLSYMS) += kallsyms.o
18obj-$(CONFIG_PM) += power/ 18obj-$(CONFIG_PM) += power/
19obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 19obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
20obj-$(CONFIG_KEXEC) += kexec.o
20obj-$(CONFIG_COMPAT) += compat.o 21obj-$(CONFIG_COMPAT) += compat.o
21obj-$(CONFIG_CPUSETS) += cpuset.o 22obj-$(CONFIG_CPUSETS) += cpuset.o
22obj-$(CONFIG_IKCONFIG) += configs.o 23obj-$(CONFIG_IKCONFIG) += configs.o
diff --git a/kernel/kexec.c b/kernel/kexec.c
new file mode 100644
index 000000000000..def9c73ec9a6
--- /dev/null
+++ b/kernel/kexec.c
@@ -0,0 +1,1036 @@
1/*
2 * kexec.c - kexec system call
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/mm.h>
10#include <linux/file.h>
11#include <linux/slab.h>
12#include <linux/fs.h>
13#include <linux/kexec.h>
14#include <linux/spinlock.h>
15#include <linux/list.h>
16#include <linux/highmem.h>
17#include <linux/syscalls.h>
18#include <linux/reboot.h>
19#include <linux/syscalls.h>
20#include <linux/ioport.h>
21#include <asm/page.h>
22#include <asm/uaccess.h>
23#include <asm/io.h>
24#include <asm/system.h>
25#include <asm/semaphore.h>
26
27/* Location of the reserved area for the crash kernel */
28struct resource crashk_res = {
29 .name = "Crash kernel",
30 .start = 0,
31 .end = 0,
32 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
33};
34
35/*
36 * When kexec transitions to the new kernel there is a one-to-one
37 * mapping between physical and virtual addresses. On processors
38 * where you can disable the MMU this is trivial, and easy. For
39 * others it is still a simple predictable page table to setup.
40 *
41 * In that environment kexec copies the new kernel to its final
42 * resting place. This means I can only support memory whose
43 * physical address can fit in an unsigned long. In particular
44 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
45 * If the assembly stub has more restrictive requirements
46 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
47 * defined more restrictively in <asm/kexec.h>.
48 *
49 * The code for the transition from the current kernel to the
50 * new kernel is placed in the control_code_buffer, whose size
51 * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single
52 * page of memory is necessary, but some architectures require more.
53 * Because this memory must be identity mapped in the transition from
54 * virtual to physical addresses it must live in the range
55 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
56 * modifiable.
57 *
58 * The assembly stub in the control code buffer is passed a linked list
59 * of descriptor pages detailing the source pages of the new kernel,
60 * and the destination addresses of those source pages. As this data
61 * structure is not used in the context of the current OS, it must
62 * be self-contained.
63 *
64 * The code has been made to work with highmem pages and will use a
65 * destination page in its final resting place (if it happens
66 * to allocate it). The end product of this is that most of the
67 * physical address space, and most of RAM can be used.
68 *
69 * Future directions include:
70 * - allocating a page table with the control code buffer identity
71 * mapped, to simplify machine_kexec and make kexec_on_panic more
72 * reliable.
73 */
74
/*
 * KIMAGE_NO_DEST is an impossible destination address.  It is passed
 * when allocating a page whose destination address we do not care
 * about, so that it never matches a real page.
 */
#define KIMAGE_NO_DEST (-1UL)

/* Forward declarations for helpers that are used before they are defined. */
static int kimage_is_destination_range(struct kimage *image,
	unsigned long start, unsigned long end);
static struct page *kimage_alloc_page(struct kimage *image,
	unsigned int gfp_mask, unsigned long dest);
84
85static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
86 unsigned long nr_segments, struct kexec_segment __user *segments)
87{
88 size_t segment_bytes;
89 struct kimage *image;
90 unsigned long i;
91 int result;
92
93 /* Allocate a controlling structure */
94 result = -ENOMEM;
95 image = kmalloc(sizeof(*image), GFP_KERNEL);
96 if (!image) {
97 goto out;
98 }
99 memset(image, 0, sizeof(*image));
100 image->head = 0;
101 image->entry = &image->head;
102 image->last_entry = &image->head;
103 image->control_page = ~0; /* By default this does not apply */
104 image->start = entry;
105 image->type = KEXEC_TYPE_DEFAULT;
106
107 /* Initialize the list of control pages */
108 INIT_LIST_HEAD(&image->control_pages);
109
110 /* Initialize the list of destination pages */
111 INIT_LIST_HEAD(&image->dest_pages);
112
113 /* Initialize the list of unuseable pages */
114 INIT_LIST_HEAD(&image->unuseable_pages);
115
116 /* Read in the segments */
117 image->nr_segments = nr_segments;
118 segment_bytes = nr_segments * sizeof(*segments);
119 result = copy_from_user(image->segment, segments, segment_bytes);
120 if (result)
121 goto out;
122
123 /*
124 * Verify we have good destination addresses. The caller is
125 * responsible for making certain we don't attempt to load
126 * the new image into invalid or reserved areas of RAM. This
127 * just verifies it is an address we can use.
128 *
129 * Since the kernel does everything in page size chunks ensure
130 * the destination addreses are page aligned. Too many
131 * special cases crop of when we don't do this. The most
132 * insidious is getting overlapping destination addresses
133 * simply because addresses are changed to page size
134 * granularity.
135 */
136 result = -EADDRNOTAVAIL;
137 for (i = 0; i < nr_segments; i++) {
138 unsigned long mstart, mend;
139 mstart = image->segment[i].mem;
140 mend = mstart + image->segment[i].memsz;
141 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
142 goto out;
143 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
144 goto out;
145 }
146
147 /* Verify our destination addresses do not overlap.
148 * If we alloed overlapping destination addresses
149 * through very weird things can happen with no
150 * easy explanation as one segment stops on another.
151 */
152 result = -EINVAL;
153 for(i = 0; i < nr_segments; i++) {
154 unsigned long mstart, mend;
155 unsigned long j;
156 mstart = image->segment[i].mem;
157 mend = mstart + image->segment[i].memsz;
158 for(j = 0; j < i; j++) {
159 unsigned long pstart, pend;
160 pstart = image->segment[j].mem;
161 pend = pstart + image->segment[j].memsz;
162 /* Do the segments overlap ? */
163 if ((mend > pstart) && (mstart < pend))
164 goto out;
165 }
166 }
167
168 /* Ensure our buffer sizes are strictly less than
169 * our memory sizes. This should always be the case,
170 * and it is easier to check up front than to be surprised
171 * later on.
172 */
173 result = -EINVAL;
174 for(i = 0; i < nr_segments; i++) {
175 if (image->segment[i].bufsz > image->segment[i].memsz)
176 goto out;
177 }
178
179
180 result = 0;
181 out:
182 if (result == 0) {
183 *rimage = image;
184 } else {
185 kfree(image);
186 }
187 return result;
188
189}
190
191static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
192 unsigned long nr_segments, struct kexec_segment __user *segments)
193{
194 int result;
195 struct kimage *image;
196
197 /* Allocate and initialize a controlling structure */
198 image = NULL;
199 result = do_kimage_alloc(&image, entry, nr_segments, segments);
200 if (result) {
201 goto out;
202 }
203 *rimage = image;
204
205 /*
206 * Find a location for the control code buffer, and add it
207 * the vector of segments so that it's pages will also be
208 * counted as destination pages.
209 */
210 result = -ENOMEM;
211 image->control_code_page = kimage_alloc_control_pages(image,
212 get_order(KEXEC_CONTROL_CODE_SIZE));
213 if (!image->control_code_page) {
214 printk(KERN_ERR "Could not allocate control_code_buffer\n");
215 goto out;
216 }
217
218 result = 0;
219 out:
220 if (result == 0) {
221 *rimage = image;
222 } else {
223 kfree(image);
224 }
225 return result;
226}
227
228static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
229 unsigned long nr_segments, struct kexec_segment *segments)
230{
231 int result;
232 struct kimage *image;
233 unsigned long i;
234
235 image = NULL;
236 /* Verify we have a valid entry point */
237 if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
238 result = -EADDRNOTAVAIL;
239 goto out;
240 }
241
242 /* Allocate and initialize a controlling structure */
243 result = do_kimage_alloc(&image, entry, nr_segments, segments);
244 if (result) {
245 goto out;
246 }
247
248 /* Enable the special crash kernel control page
249 * allocation policy.
250 */
251 image->control_page = crashk_res.start;
252 image->type = KEXEC_TYPE_CRASH;
253
254 /*
255 * Verify we have good destination addresses. Normally
256 * the caller is responsible for making certain we don't
257 * attempt to load the new image into invalid or reserved
258 * areas of RAM. But crash kernels are preloaded into a
259 * reserved area of ram. We must ensure the addresses
260 * are in the reserved area otherwise preloading the
261 * kernel could corrupt things.
262 */
263 result = -EADDRNOTAVAIL;
264 for (i = 0; i < nr_segments; i++) {
265 unsigned long mstart, mend;
266 mstart = image->segment[i].mem;
267 mend = mstart + image->segment[i].memsz;
268 /* Ensure we are within the crash kernel limits */
269 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
270 goto out;
271 }
272
273
274 /*
275 * Find a location for the control code buffer, and add
276 * the vector of segments so that it's pages will also be
277 * counted as destination pages.
278 */
279 result = -ENOMEM;
280 image->control_code_page = kimage_alloc_control_pages(image,
281 get_order(KEXEC_CONTROL_CODE_SIZE));
282 if (!image->control_code_page) {
283 printk(KERN_ERR "Could not allocate control_code_buffer\n");
284 goto out;
285 }
286
287 result = 0;
288 out:
289 if (result == 0) {
290 *rimage = image;
291 } else {
292 kfree(image);
293 }
294 return result;
295}
296
297static int kimage_is_destination_range(
298 struct kimage *image, unsigned long start, unsigned long end)
299{
300 unsigned long i;
301
302 for (i = 0; i < image->nr_segments; i++) {
303 unsigned long mstart, mend;
304 mstart = image->segment[i].mem;
305 mend = mstart + image->segment[i].memsz;
306 if ((end > mstart) && (start < mend)) {
307 return 1;
308 }
309 }
310 return 0;
311}
312
313static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
314{
315 struct page *pages;
316 pages = alloc_pages(gfp_mask, order);
317 if (pages) {
318 unsigned int count, i;
319 pages->mapping = NULL;
320 pages->private = order;
321 count = 1 << order;
322 for(i = 0; i < count; i++) {
323 SetPageReserved(pages + i);
324 }
325 }
326 return pages;
327}
328
329static void kimage_free_pages(struct page *page)
330{
331 unsigned int order, count, i;
332 order = page->private;
333 count = 1 << order;
334 for(i = 0; i < count; i++) {
335 ClearPageReserved(page + i);
336 }
337 __free_pages(page, order);
338}
339
340static void kimage_free_page_list(struct list_head *list)
341{
342 struct list_head *pos, *next;
343 list_for_each_safe(pos, next, list) {
344 struct page *page;
345
346 page = list_entry(pos, struct page, lru);
347 list_del(&page->lru);
348
349 kimage_free_pages(page);
350 }
351}
352
353static struct page *kimage_alloc_normal_control_pages(
354 struct kimage *image, unsigned int order)
355{
356 /* Control pages are special, they are the intermediaries
357 * that are needed while we copy the rest of the pages
358 * to their final resting place. As such they must
359 * not conflict with either the destination addresses
360 * or memory the kernel is already using.
361 *
362 * The only case where we really need more than one of
363 * these are for architectures where we cannot disable
364 * the MMU and must instead generate an identity mapped
365 * page table for all of the memory.
366 *
367 * At worst this runs in O(N) of the image size.
368 */
369 struct list_head extra_pages;
370 struct page *pages;
371 unsigned int count;
372
373 count = 1 << order;
374 INIT_LIST_HEAD(&extra_pages);
375
376 /* Loop while I can allocate a page and the page allocated
377 * is a destination page.
378 */
379 do {
380 unsigned long pfn, epfn, addr, eaddr;
381 pages = kimage_alloc_pages(GFP_KERNEL, order);
382 if (!pages)
383 break;
384 pfn = page_to_pfn(pages);
385 epfn = pfn + count;
386 addr = pfn << PAGE_SHIFT;
387 eaddr = epfn << PAGE_SHIFT;
388 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
389 kimage_is_destination_range(image, addr, eaddr))
390 {
391 list_add(&pages->lru, &extra_pages);
392 pages = NULL;
393 }
394 } while(!pages);
395 if (pages) {
396 /* Remember the allocated page... */
397 list_add(&pages->lru, &image->control_pages);
398
399 /* Because the page is already in it's destination
400 * location we will never allocate another page at
401 * that address. Therefore kimage_alloc_pages
402 * will not return it (again) and we don't need
403 * to give it an entry in image->segment[].
404 */
405 }
406 /* Deal with the destination pages I have inadvertently allocated.
407 *
408 * Ideally I would convert multi-page allocations into single
409 * page allocations, and add everyting to image->dest_pages.
410 *
411 * For now it is simpler to just free the pages.
412 */
413 kimage_free_page_list(&extra_pages);
414 return pages;
415
416}
417
418static struct page *kimage_alloc_crash_control_pages(
419 struct kimage *image, unsigned int order)
420{
421 /* Control pages are special, they are the intermediaries
422 * that are needed while we copy the rest of the pages
423 * to their final resting place. As such they must
424 * not conflict with either the destination addresses
425 * or memory the kernel is already using.
426 *
427 * Control pages are also the only pags we must allocate
428 * when loading a crash kernel. All of the other pages
429 * are specified by the segments and we just memcpy
430 * into them directly.
431 *
432 * The only case where we really need more than one of
433 * these are for architectures where we cannot disable
434 * the MMU and must instead generate an identity mapped
435 * page table for all of the memory.
436 *
437 * Given the low demand this implements a very simple
438 * allocator that finds the first hole of the appropriate
439 * size in the reserved memory region, and allocates all
440 * of the memory up to and including the hole.
441 */
442 unsigned long hole_start, hole_end, size;
443 struct page *pages;
444 pages = NULL;
445 size = (1 << order) << PAGE_SHIFT;
446 hole_start = (image->control_page + (size - 1)) & ~(size - 1);
447 hole_end = hole_start + size - 1;
448 while(hole_end <= crashk_res.end) {
449 unsigned long i;
450 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) {
451 break;
452 }
453 if (hole_end > crashk_res.end) {
454 break;
455 }
456 /* See if I overlap any of the segments */
457 for(i = 0; i < image->nr_segments; i++) {
458 unsigned long mstart, mend;
459 mstart = image->segment[i].mem;
460 mend = mstart + image->segment[i].memsz - 1;
461 if ((hole_end >= mstart) && (hole_start <= mend)) {
462 /* Advance the hole to the end of the segment */
463 hole_start = (mend + (size - 1)) & ~(size - 1);
464 hole_end = hole_start + size - 1;
465 break;
466 }
467 }
468 /* If I don't overlap any segments I have found my hole! */
469 if (i == image->nr_segments) {
470 pages = pfn_to_page(hole_start >> PAGE_SHIFT);
471 break;
472 }
473 }
474 if (pages) {
475 image->control_page = hole_end;
476 }
477 return pages;
478}
479
480
481struct page *kimage_alloc_control_pages(
482 struct kimage *image, unsigned int order)
483{
484 struct page *pages = NULL;
485 switch(image->type) {
486 case KEXEC_TYPE_DEFAULT:
487 pages = kimage_alloc_normal_control_pages(image, order);
488 break;
489 case KEXEC_TYPE_CRASH:
490 pages = kimage_alloc_crash_control_pages(image, order);
491 break;
492 }
493 return pages;
494}
495
496static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
497{
498 if (*image->entry != 0) {
499 image->entry++;
500 }
501 if (image->entry == image->last_entry) {
502 kimage_entry_t *ind_page;
503 struct page *page;
504 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
505 if (!page) {
506 return -ENOMEM;
507 }
508 ind_page = page_address(page);
509 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
510 image->entry = ind_page;
511 image->last_entry =
512 ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
513 }
514 *image->entry = entry;
515 image->entry++;
516 *image->entry = 0;
517 return 0;
518}
519
520static int kimage_set_destination(
521 struct kimage *image, unsigned long destination)
522{
523 int result;
524
525 destination &= PAGE_MASK;
526 result = kimage_add_entry(image, destination | IND_DESTINATION);
527 if (result == 0) {
528 image->destination = destination;
529 }
530 return result;
531}
532
533
534static int kimage_add_page(struct kimage *image, unsigned long page)
535{
536 int result;
537
538 page &= PAGE_MASK;
539 result = kimage_add_entry(image, page | IND_SOURCE);
540 if (result == 0) {
541 image->destination += PAGE_SIZE;
542 }
543 return result;
544}
545
546
547static void kimage_free_extra_pages(struct kimage *image)
548{
549 /* Walk through and free any extra destination pages I may have */
550 kimage_free_page_list(&image->dest_pages);
551
552 /* Walk through and free any unuseable pages I have cached */
553 kimage_free_page_list(&image->unuseable_pages);
554
555}
556static int kimage_terminate(struct kimage *image)
557{
558 if (*image->entry != 0) {
559 image->entry++;
560 }
561 *image->entry = IND_DONE;
562 return 0;
563}
564
/*
 * Walk the entry list of @image.  @ptr is set to each slot in turn and
 * @entry to its value; the walk stops at a zero entry or at IND_DONE.
 * After an IND_INDIRECTION entry the walk continues in the page that
 * entry points to, otherwise with the next slot.
 *
 * Every macro parameter is parenthesized so that arguments which are
 * expressions expand with the intended precedence.
 */
#define for_each_kimage_entry(image, ptr, entry) \
	for ((ptr) = &(image)->head; \
		((entry) = *(ptr)) && !((entry) & IND_DONE); \
		(ptr) = ((entry) & IND_INDIRECTION) ? \
			phys_to_virt(((entry) & PAGE_MASK)) : (ptr) + 1)
569
570static void kimage_free_entry(kimage_entry_t entry)
571{
572 struct page *page;
573
574 page = pfn_to_page(entry >> PAGE_SHIFT);
575 kimage_free_pages(page);
576}
577
578static void kimage_free(struct kimage *image)
579{
580 kimage_entry_t *ptr, entry;
581 kimage_entry_t ind = 0;
582
583 if (!image)
584 return;
585 kimage_free_extra_pages(image);
586 for_each_kimage_entry(image, ptr, entry) {
587 if (entry & IND_INDIRECTION) {
588 /* Free the previous indirection page */
589 if (ind & IND_INDIRECTION) {
590 kimage_free_entry(ind);
591 }
592 /* Save this indirection page until we are
593 * done with it.
594 */
595 ind = entry;
596 }
597 else if (entry & IND_SOURCE) {
598 kimage_free_entry(entry);
599 }
600 }
601 /* Free the final indirection page */
602 if (ind & IND_INDIRECTION) {
603 kimage_free_entry(ind);
604 }
605
606 /* Handle any machine specific cleanup */
607 machine_kexec_cleanup(image);
608
609 /* Free the kexec control pages... */
610 kimage_free_page_list(&image->control_pages);
611 kfree(image);
612}
613
614static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
615{
616 kimage_entry_t *ptr, entry;
617 unsigned long destination = 0;
618
619 for_each_kimage_entry(image, ptr, entry) {
620 if (entry & IND_DESTINATION) {
621 destination = entry & PAGE_MASK;
622 }
623 else if (entry & IND_SOURCE) {
624 if (page == destination) {
625 return ptr;
626 }
627 destination += PAGE_SIZE;
628 }
629 }
630 return 0;
631}
632
633static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
634{
635 /*
636 * Here we implement safeguards to ensure that a source page
637 * is not copied to its destination page before the data on
638 * the destination page is no longer useful.
639 *
640 * To do this we maintain the invariant that a source page is
641 * either its own destination page, or it is not a
642 * destination page at all.
643 *
644 * That is slightly stronger than required, but the proof
645 * that no problems will not occur is trivial, and the
646 * implementation is simply to verify.
647 *
648 * When allocating all pages normally this algorithm will run
649 * in O(N) time, but in the worst case it will run in O(N^2)
650 * time. If the runtime is a problem the data structures can
651 * be fixed.
652 */
653 struct page *page;
654 unsigned long addr;
655
656 /*
657 * Walk through the list of destination pages, and see if I
658 * have a match.
659 */
660 list_for_each_entry(page, &image->dest_pages, lru) {
661 addr = page_to_pfn(page) << PAGE_SHIFT;
662 if (addr == destination) {
663 list_del(&page->lru);
664 return page;
665 }
666 }
667 page = NULL;
668 while (1) {
669 kimage_entry_t *old;
670
671 /* Allocate a page, if we run out of memory give up */
672 page = kimage_alloc_pages(gfp_mask, 0);
673 if (!page) {
674 return 0;
675 }
676 /* If the page cannot be used file it away */
677 if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
678 list_add(&page->lru, &image->unuseable_pages);
679 continue;
680 }
681 addr = page_to_pfn(page) << PAGE_SHIFT;
682
683 /* If it is the destination page we want use it */
684 if (addr == destination)
685 break;
686
687 /* If the page is not a destination page use it */
688 if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
689 break;
690
691 /*
692 * I know that the page is someones destination page.
693 * See if there is already a source page for this
694 * destination page. And if so swap the source pages.
695 */
696 old = kimage_dst_used(image, addr);
697 if (old) {
698 /* If so move it */
699 unsigned long old_addr;
700 struct page *old_page;
701
702 old_addr = *old & PAGE_MASK;
703 old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
704 copy_highpage(page, old_page);
705 *old = addr | (*old & ~PAGE_MASK);
706
707 /* The old page I have found cannot be a
708 * destination page, so return it.
709 */
710 addr = old_addr;
711 page = old_page;
712 break;
713 }
714 else {
715 /* Place the page on the destination list I
716 * will use it later.
717 */
718 list_add(&page->lru, &image->dest_pages);
719 }
720 }
721 return page;
722}
723
724static int kimage_load_normal_segment(struct kimage *image,
725 struct kexec_segment *segment)
726{
727 unsigned long maddr;
728 unsigned long ubytes, mbytes;
729 int result;
730 unsigned char *buf;
731
732 result = 0;
733 buf = segment->buf;
734 ubytes = segment->bufsz;
735 mbytes = segment->memsz;
736 maddr = segment->mem;
737
738 result = kimage_set_destination(image, maddr);
739 if (result < 0) {
740 goto out;
741 }
742 while(mbytes) {
743 struct page *page;
744 char *ptr;
745 size_t uchunk, mchunk;
746 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
747 if (page == 0) {
748 result = -ENOMEM;
749 goto out;
750 }
751 result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
752 if (result < 0) {
753 goto out;
754 }
755 ptr = kmap(page);
756 /* Start with a clear page */
757 memset(ptr, 0, PAGE_SIZE);
758 ptr += maddr & ~PAGE_MASK;
759 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
760 if (mchunk > mbytes) {
761 mchunk = mbytes;
762 }
763 uchunk = mchunk;
764 if (uchunk > ubytes) {
765 uchunk = ubytes;
766 }
767 result = copy_from_user(ptr, buf, uchunk);
768 kunmap(page);
769 if (result) {
770 result = (result < 0) ? result : -EIO;
771 goto out;
772 }
773 ubytes -= uchunk;
774 maddr += mchunk;
775 buf += mchunk;
776 mbytes -= mchunk;
777 }
778 out:
779 return result;
780}
781
782static int kimage_load_crash_segment(struct kimage *image,
783 struct kexec_segment *segment)
784{
785 /* For crash dumps kernels we simply copy the data from
786 * user space to it's destination.
787 * We do things a page at a time for the sake of kmap.
788 */
789 unsigned long maddr;
790 unsigned long ubytes, mbytes;
791 int result;
792 unsigned char *buf;
793
794 result = 0;
795 buf = segment->buf;
796 ubytes = segment->bufsz;
797 mbytes = segment->memsz;
798 maddr = segment->mem;
799 while(mbytes) {
800 struct page *page;
801 char *ptr;
802 size_t uchunk, mchunk;
803 page = pfn_to_page(maddr >> PAGE_SHIFT);
804 if (page == 0) {
805 result = -ENOMEM;
806 goto out;
807 }
808 ptr = kmap(page);
809 ptr += maddr & ~PAGE_MASK;
810 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
811 if (mchunk > mbytes) {
812 mchunk = mbytes;
813 }
814 uchunk = mchunk;
815 if (uchunk > ubytes) {
816 uchunk = ubytes;
817 /* Zero the trailing part of the page */
818 memset(ptr + uchunk, 0, mchunk - uchunk);
819 }
820 result = copy_from_user(ptr, buf, uchunk);
821 kunmap(page);
822 if (result) {
823 result = (result < 0) ? result : -EIO;
824 goto out;
825 }
826 ubytes -= uchunk;
827 maddr += mchunk;
828 buf += mchunk;
829 mbytes -= mchunk;
830 }
831 out:
832 return result;
833}
834
835static int kimage_load_segment(struct kimage *image,
836 struct kexec_segment *segment)
837{
838 int result = -ENOMEM;
839 switch(image->type) {
840 case KEXEC_TYPE_DEFAULT:
841 result = kimage_load_normal_segment(image, segment);
842 break;
843 case KEXEC_TYPE_CRASH:
844 result = kimage_load_crash_segment(image, segment);
845 break;
846 }
847 return result;
848}
849
850/*
851 * Exec Kernel system call: for obvious reasons only root may call it.
852 *
853 * This call breaks up into three pieces.
854 * - A generic part which loads the new kernel from the current
855 * address space, and very carefully places the data in the
856 * allocated pages.
857 *
858 * - A generic part that interacts with the kernel and tells all of
859 * the devices to shut down. Preventing on-going dmas, and placing
860 * the devices in a consistent state so a later kernel can
861 * reinitialize them.
862 *
863 * - A machine specific part that includes the syscall number
864 * and then copies the image to its final destination. And
865 * jumps into the image at entry.
866 *
867 * kexec does not sync, or unmount filesystems so if you need
868 * that to happen you need to do that yourself.
869 */
/* Image installed by sys_kexec_load() without KEXEC_ON_CRASH; NULL if none. */
struct kimage *kexec_image;
/* Image installed by sys_kexec_load() with KEXEC_ON_CRASH; NULL if none. */
static struct kimage *kexec_crash_image;
/*
 * A home grown binary mutex: 0 means free, 1 means held.  It is only
 * ever taken with xchg() and a caller that finds it held gives up
 * instead of waiting.  Nothing can wait, so this mutex is safe to use
 * in interrupt context :)
 */
static int kexec_lock;
878
879asmlinkage long sys_kexec_load(unsigned long entry,
880 unsigned long nr_segments, struct kexec_segment __user *segments,
881 unsigned long flags)
882{
883 struct kimage **dest_image, *image;
884 int locked;
885 int result;
886
887 /* We only trust the superuser with rebooting the system. */
888 if (!capable(CAP_SYS_BOOT))
889 return -EPERM;
890
891 /*
892 * Verify we have a legal set of flags
893 * This leaves us room for future extensions.
894 */
895 if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
896 return -EINVAL;
897
898 /* Verify we are on the appropriate architecture */
899 if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
900 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
901 {
902 return -EINVAL;
903 }
904
905 /* Put an artificial cap on the number
906 * of segments passed to kexec_load.
907 */
908 if (nr_segments > KEXEC_SEGMENT_MAX)
909 return -EINVAL;
910
911 image = NULL;
912 result = 0;
913
914 /* Because we write directly to the reserved memory
915 * region when loading crash kernels we need a mutex here to
916 * prevent multiple crash kernels from attempting to load
917 * simultaneously, and to prevent a crash kernel from loading
918 * over the top of a in use crash kernel.
919 *
920 * KISS: always take the mutex.
921 */
922 locked = xchg(&kexec_lock, 1);
923 if (locked) {
924 return -EBUSY;
925 }
926 dest_image = &kexec_image;
927 if (flags & KEXEC_ON_CRASH) {
928 dest_image = &kexec_crash_image;
929 }
930 if (nr_segments > 0) {
931 unsigned long i;
932 /* Loading another kernel to reboot into */
933 if ((flags & KEXEC_ON_CRASH) == 0) {
934 result = kimage_normal_alloc(&image, entry, nr_segments, segments);
935 }
936 /* Loading another kernel to switch to if this one crashes */
937 else if (flags & KEXEC_ON_CRASH) {
938 /* Free any current crash dump kernel before
939 * we corrupt it.
940 */
941 kimage_free(xchg(&kexec_crash_image, NULL));
942 result = kimage_crash_alloc(&image, entry, nr_segments, segments);
943 }
944 if (result) {
945 goto out;
946 }
947 result = machine_kexec_prepare(image);
948 if (result) {
949 goto out;
950 }
951 for(i = 0; i < nr_segments; i++) {
952 result = kimage_load_segment(image, &image->segment[i]);
953 if (result) {
954 goto out;
955 }
956 }
957 result = kimage_terminate(image);
958 if (result) {
959 goto out;
960 }
961 }
962 /* Install the new kernel, and Uninstall the old */
963 image = xchg(dest_image, image);
964
965 out:
966 xchg(&kexec_lock, 0); /* Release the mutex */
967 kimage_free(image);
968 return result;
969}
970
#ifdef CONFIG_COMPAT
/*
 * compat_sys_kexec_load - kexec_load entry point for compat callers
 *
 * Converts each compat segment descriptor to the native layout in a
 * scratch area obtained from compat_alloc_user_space() and then calls
 * sys_kexec_load() on the converted array.
 *
 * Returns -EINVAL if the caller asked for KEXEC_ARCH_DEFAULT or passed
 * too many segments, -EFAULT if a descriptor could not be copied, and
 * otherwise the result of sys_kexec_load().
 */
asmlinkage long compat_sys_kexec_load(unsigned long entry,
	unsigned long nr_segments, struct compat_kexec_segment __user *segments,
	unsigned long flags)
{
	struct compat_kexec_segment compat_seg;
	struct kexec_segment native_seg, __user *ksegments;
	unsigned long idx;

	/* Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	ksegments = compat_alloc_user_space(nr_segments * sizeof(native_seg));
	for (idx = 0; idx < nr_segments; idx++) {
		if (copy_from_user(&compat_seg, &segments[idx], sizeof(compat_seg)))
			return -EFAULT;

		native_seg.buf   = compat_ptr(compat_seg.buf);
		native_seg.bufsz = compat_seg.bufsz;
		native_seg.mem   = compat_seg.mem;
		native_seg.memsz = compat_seg.memsz;

		if (copy_to_user(&ksegments[idx], &native_seg, sizeof(native_seg)))
			return -EFAULT;
	}

	return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif
1012
1013void crash_kexec(void)
1014{
1015 struct kimage *image;
1016 int locked;
1017
1018
1019 /* Take the kexec_lock here to prevent sys_kexec_load
1020 * running on one cpu from replacing the crash kernel
1021 * we are using after a panic on a different cpu.
1022 *
1023 * If the crash kernel was not located in a fixed area
1024 * of memory the xchg(&kexec_crash_image) would be
1025 * sufficient. But since I reuse the memory...
1026 */
1027 locked = xchg(&kexec_lock, 1);
1028 if (!locked) {
1029 image = xchg(&kexec_crash_image, NULL);
1030 if (image) {
1031 machine_crash_shutdown();
1032 machine_kexec(image);
1033 }
1034 xchg(&kexec_lock, 0);
1035 }
1036}
diff --git a/kernel/panic.c b/kernel/panic.c
index 081f7465fc8d..66f43d33cd80 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -18,6 +18,7 @@
18#include <linux/sysrq.h> 18#include <linux/sysrq.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/nmi.h> 20#include <linux/nmi.h>
21#include <linux/kexec.h>
21 22
22int panic_timeout; 23int panic_timeout;
23int panic_on_oops; 24int panic_on_oops;
@@ -63,6 +64,13 @@ NORET_TYPE void panic(const char * fmt, ...)
63 unsigned long caller = (unsigned long) __builtin_return_address(0); 64 unsigned long caller = (unsigned long) __builtin_return_address(0);
64#endif 65#endif
65 66
67 /*
68 * It's possible to come here directly from a panic-assertion and not
69 * have preempt disabled. Some functions called from here want
70 * preempt to be disabled. No point enabling it later though...
71 */
72 preempt_disable();
73
66 bust_spinlocks(1); 74 bust_spinlocks(1);
67 va_start(args, fmt); 75 va_start(args, fmt);
68 vsnprintf(buf, sizeof(buf), fmt, args); 76 vsnprintf(buf, sizeof(buf), fmt, args);
@@ -70,7 +78,19 @@ NORET_TYPE void panic(const char * fmt, ...)
70 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 78 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
71 bust_spinlocks(0); 79 bust_spinlocks(0);
72 80
81 /*
82 * If we have crashed and we have a crash kernel loaded let it handle
83 * everything else.
84 * Do we want to call this before we try to display a message?
85 */
86 crash_kexec();
87
73#ifdef CONFIG_SMP 88#ifdef CONFIG_SMP
89 /*
90 * Note smp_send_stop is the usual smp shutdown function, which
91 * unfortunately means it may not be hardened to work in a panic
92 * situation.
93 */
74 smp_send_stop(); 94 smp_send_stop();
75#endif 95#endif
76 96
@@ -79,8 +99,7 @@ NORET_TYPE void panic(const char * fmt, ...)
79 if (!panic_blink) 99 if (!panic_blink)
80 panic_blink = no_blink; 100 panic_blink = no_blink;
81 101
82 if (panic_timeout > 0) 102 if (panic_timeout > 0) {
83 {
84 /* 103 /*
85 * Delay timeout seconds before rebooting the machine. 104 * Delay timeout seconds before rebooting the machine.
86 * We can't use the "normal" timers since we just panicked.. 105 * We can't use the "normal" timers since we just panicked..
diff --git a/kernel/sys.c b/kernel/sys.c
index dac10161ca23..9a24374c23bc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,6 +16,8 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/highuid.h> 17#include <linux/highuid.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/kernel.h>
20#include <linux/kexec.h>
19#include <linux/workqueue.h> 21#include <linux/workqueue.h>
20#include <linux/device.h> 22#include <linux/device.h>
21#include <linux/key.h> 23#include <linux/key.h>
@@ -439,6 +441,24 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
439 machine_restart(buffer); 441 machine_restart(buffer);
440 break; 442 break;
441 443
444#ifdef CONFIG_KEXEC
445 case LINUX_REBOOT_CMD_KEXEC:
446 {
447 struct kimage *image;
448 image = xchg(&kexec_image, 0);
449 if (!image) {
450 unlock_kernel();
451 return -EINVAL;
452 }
453 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
454 system_state = SYSTEM_RESTART;
455 device_shutdown();
456 printk(KERN_EMERG "Starting new kernel\n");
457 machine_shutdown();
458 machine_kexec(image);
459 break;
460 }
461#endif
442#ifdef CONFIG_SOFTWARE_SUSPEND 462#ifdef CONFIG_SOFTWARE_SUSPEND
443 case LINUX_REBOOT_CMD_SW_SUSPEND: 463 case LINUX_REBOOT_CMD_SW_SUSPEND:
444 { 464 {
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6f15bea7d1a8..29196ce9b40f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -18,6 +18,8 @@ cond_syscall(sys_acct);
18cond_syscall(sys_lookup_dcookie); 18cond_syscall(sys_lookup_dcookie);
19cond_syscall(sys_swapon); 19cond_syscall(sys_swapon);
20cond_syscall(sys_swapoff); 20cond_syscall(sys_swapoff);
21cond_syscall(sys_kexec_load);
22cond_syscall(compat_sys_kexec_load);
21cond_syscall(sys_init_module); 23cond_syscall(sys_init_module);
22cond_syscall(sys_delete_module); 24cond_syscall(sys_delete_module);
23cond_syscall(sys_socketpair); 25cond_syscall(sys_socketpair);