aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/lguest/boot.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/lguest/boot.c')
-rw-r--r--arch/x86/lguest/boot.c108
1 files changed, 62 insertions, 46 deletions
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index a104c532ff70..3335b4595efd 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -10,21 +10,19 @@
10 * (such as the example in Documentation/lguest/lguest.c) is called the 10 * (such as the example in Documentation/lguest/lguest.c) is called the
11 * Launcher. 11 * Launcher.
12 * 12 *
13 * Secondly, we only run specially modified Guests, not normal kernels. When 13 * Secondly, we only run specially modified Guests, not normal kernels: setting
14 * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets 14 * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows
15 * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows 15 * how to be a Guest at boot time. This means that you can use the same kernel
16 * how to be a Guest. This means that you can use the same kernel you boot 16 * you boot normally (ie. as a Host) as a Guest.
17 * normally (ie. as a Host) as a Guest.
18 * 17 *
19 * These Guests know that they cannot do privileged operations, such as disable 18 * These Guests know that they cannot do privileged operations, such as disable
20 * interrupts, and that they have to ask the Host to do such things explicitly. 19 * interrupts, and that they have to ask the Host to do such things explicitly.
21 * This file consists of all the replacements for such low-level native 20 * This file consists of all the replacements for such low-level native
22 * hardware operations: these special Guest versions call the Host. 21 * hardware operations: these special Guest versions call the Host.
23 * 22 *
24 * So how does the kernel know it's a Guest? The Guest starts at a special 23 * So how does the kernel know it's a Guest? We'll see that later, but let's
25 * entry point marked with a magic string, which sets up a few things then 24 * just say that we end up here where we replace the native functions various
26 * calls here. We replace the native functions various "paravirt" structures 25 * "paravirt" structures with our Guest versions, then boot like normal. :*/
27 * with our Guest versions, then boot like normal. :*/
28 26
29/* 27/*
30 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. 28 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
@@ -134,7 +132,7 @@ static void async_hcall(unsigned long call, unsigned long arg1,
134 * lguest_leave_lazy_mode(). 132 * lguest_leave_lazy_mode().
135 * 133 *
136 * So, when we're in lazy mode, we call async_hcall() to store the call for 134 * So, when we're in lazy mode, we call async_hcall() to store the call for
137 * future processing. */ 135 * future processing: */
138static void lazy_hcall(unsigned long call, 136static void lazy_hcall(unsigned long call,
139 unsigned long arg1, 137 unsigned long arg1,
140 unsigned long arg2, 138 unsigned long arg2,
@@ -147,7 +145,7 @@ static void lazy_hcall(unsigned long call,
147} 145}
148 146
149/* When lazy mode is turned off reset the per-cpu lazy mode variable and then 147/* When lazy mode is turned off reset the per-cpu lazy mode variable and then
150 * issue a hypercall to flush any stored calls. */ 148 * issue the do-nothing hypercall to flush any stored calls. */
151static void lguest_leave_lazy_mode(void) 149static void lguest_leave_lazy_mode(void)
152{ 150{
153 paravirt_leave_lazy(paravirt_get_lazy_mode()); 151 paravirt_leave_lazy(paravirt_get_lazy_mode());
@@ -164,7 +162,7 @@ static void lguest_leave_lazy_mode(void)
164 * 162 *
165 * So instead we keep an "irq_enabled" field inside our "struct lguest_data", 163 * So instead we keep an "irq_enabled" field inside our "struct lguest_data",
166 * which the Guest can update with a single instruction. The Host knows to 164 * which the Guest can update with a single instruction. The Host knows to
167 * check there when it wants to deliver an interrupt. 165 * check there before it tries to deliver an interrupt.
168 */ 166 */
169 167
170/* save_flags() is expected to return the processor state (ie. "flags"). The 168/* save_flags() is expected to return the processor state (ie. "flags"). The
@@ -196,10 +194,15 @@ static void irq_enable(void)
196/*M:003 Note that we don't check for outstanding interrupts when we re-enable 194/*M:003 Note that we don't check for outstanding interrupts when we re-enable
197 * them (or when we unmask an interrupt). This seems to work for the moment, 195 * them (or when we unmask an interrupt). This seems to work for the moment,
198 * since interrupts are rare and we'll just get the interrupt on the next timer 196 * since interrupts are rare and we'll just get the interrupt on the next timer
199 * tick, but when we turn on CONFIG_NO_HZ, we should revisit this. One way 197 * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way
200 * would be to put the "irq_enabled" field in a page by itself, and have the 198 * would be to put the "irq_enabled" field in a page by itself, and have the
201 * Host write-protect it when an interrupt comes in when irqs are disabled. 199 * Host write-protect it when an interrupt comes in when irqs are disabled.
202 * There will then be a page fault as soon as interrupts are re-enabled. :*/ 200 * There will then be a page fault as soon as interrupts are re-enabled.
201 *
202 * A better method is to implement soft interrupt disable generally for x86:
203 * instead of disabling interrupts, we set a flag. If an interrupt does come
204 * in, we then disable them for real. This is uncommon, so we could simply use
205 * a hypercall for interrupt control and not worry about efficiency. :*/
203 206
204/*G:034 207/*G:034
205 * The Interrupt Descriptor Table (IDT). 208 * The Interrupt Descriptor Table (IDT).
@@ -212,6 +215,10 @@ static void irq_enable(void)
212static void lguest_write_idt_entry(gate_desc *dt, 215static void lguest_write_idt_entry(gate_desc *dt,
213 int entrynum, const gate_desc *g) 216 int entrynum, const gate_desc *g)
214{ 217{
218 /* The gate_desc structure is 8 bytes long: we hand it to the Host in
219 * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors
220 * around like this; typesafety wasn't a big concern in Linux's early
221 * years. */
215 u32 *desc = (u32 *)g; 222 u32 *desc = (u32 *)g;
216 /* Keep the local copy up to date. */ 223 /* Keep the local copy up to date. */
217 native_write_idt_entry(dt, entrynum, g); 224 native_write_idt_entry(dt, entrynum, g);
@@ -243,7 +250,8 @@ static void lguest_load_idt(const struct desc_ptr *desc)
243 * 250 *
244 * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY 251 * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY
245 * hypercall and use that repeatedly to load a new IDT. I don't think it 252 * hypercall and use that repeatedly to load a new IDT. I don't think it
246 * really matters, but wouldn't it be nice if they were the same? 253 * really matters, but wouldn't it be nice if they were the same? Wouldn't
254 * it be even better if you were the one to send the patch to fix it?
247 */ 255 */
248static void lguest_load_gdt(const struct desc_ptr *desc) 256static void lguest_load_gdt(const struct desc_ptr *desc)
249{ 257{
@@ -298,9 +306,9 @@ static void lguest_load_tr_desc(void)
298 306
299/* The "cpuid" instruction is a way of querying both the CPU identity 307/* The "cpuid" instruction is a way of querying both the CPU identity
300 * (manufacturer, model, etc) and its features. It was introduced before the 308 * (manufacturer, model, etc) and its features. It was introduced before the
301 * Pentium in 1993 and keeps getting extended by both Intel and AMD. As you 309 * Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
302 * might imagine, after a decade and a half this treatment, it is now a giant 310 * As you might imagine, after a decade and a half this treatment, it is now a
303 * ball of hair. Its entry in the current Intel manual runs to 28 pages. 311 * giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
304 * 312 *
305 * This instruction even it has its own Wikipedia entry. The Wikipedia entry 313 * This instruction even it has its own Wikipedia entry. The Wikipedia entry
306 * has been translated into 4 languages. I am not making this up! 314 * has been translated into 4 languages. I am not making this up!
@@ -594,17 +602,17 @@ static unsigned long lguest_get_wallclock(void)
594 return lguest_data.time.tv_sec; 602 return lguest_data.time.tv_sec;
595} 603}
596 604
597/* The TSC is a Time Stamp Counter. The Host tells us what speed it runs at, 605/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us
598 * or 0 if it's unusable as a reliable clock source. This matches what we want 606 * what speed it runs at, or 0 if it's unusable as a reliable clock source.
599 * here: if we return 0 from this function, the x86 TSC clock will not register 607 * This matches what we want here: if we return 0 from this function, the x86
600 * itself. */ 608 * TSC clock will give up and not register itself. */
601static unsigned long lguest_cpu_khz(void) 609static unsigned long lguest_cpu_khz(void)
602{ 610{
603 return lguest_data.tsc_khz; 611 return lguest_data.tsc_khz;
604} 612}
605 613
606/* If we can't use the TSC, the kernel falls back to our "lguest_clock", where 614/* If we can't use the TSC, the kernel falls back to our lower-priority
607 * we read the time value given to us by the Host. */ 615 * "lguest_clock", where we read the time value given to us by the Host. */
608static cycle_t lguest_clock_read(void) 616static cycle_t lguest_clock_read(void)
609{ 617{
610 unsigned long sec, nsec; 618 unsigned long sec, nsec;
@@ -648,12 +656,16 @@ static struct clocksource lguest_clock = {
648static int lguest_clockevent_set_next_event(unsigned long delta, 656static int lguest_clockevent_set_next_event(unsigned long delta,
649 struct clock_event_device *evt) 657 struct clock_event_device *evt)
650{ 658{
659 /* FIXME: I don't think this can ever happen, but James tells me he had
660 * to put this code in. Maybe we should remove it now. Anyone? */
651 if (delta < LG_CLOCK_MIN_DELTA) { 661 if (delta < LG_CLOCK_MIN_DELTA) {
652 if (printk_ratelimit()) 662 if (printk_ratelimit())
653 printk(KERN_DEBUG "%s: small delta %lu ns\n", 663 printk(KERN_DEBUG "%s: small delta %lu ns\n",
654 __FUNCTION__, delta); 664 __FUNCTION__, delta);
655 return -ETIME; 665 return -ETIME;
656 } 666 }
667
668 /* Please wake us this far in the future. */
657 hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0); 669 hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
658 return 0; 670 return 0;
659} 671}
@@ -738,7 +750,7 @@ static void lguest_time_init(void)
738 * will not tolerate us trying to use that), the stack pointer, and the number 750 * will not tolerate us trying to use that), the stack pointer, and the number
739 * of pages in the stack. */ 751 * of pages in the stack. */
740static void lguest_load_sp0(struct tss_struct *tss, 752static void lguest_load_sp0(struct tss_struct *tss,
741 struct thread_struct *thread) 753 struct thread_struct *thread)
742{ 754{
743 lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0, 755 lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0,
744 THREAD_SIZE/PAGE_SIZE); 756 THREAD_SIZE/PAGE_SIZE);
@@ -786,9 +798,8 @@ static void lguest_safe_halt(void)
786 hcall(LHCALL_HALT, 0, 0, 0); 798 hcall(LHCALL_HALT, 0, 0, 0);
787} 799}
788 800
789/* Perhaps CRASH isn't the best name for this hypercall, but we use it to get a 801/* The SHUTDOWN hypercall takes a string to describe what's happening, and
790 * message out when we're crashing as well as elegant termination like powering 802 * an argument which says whether this to restart (reboot) the Guest or not.
791 * off.
792 * 803 *
793 * Note that the Host always prefers that the Guest speak in physical addresses 804 * Note that the Host always prefers that the Guest speak in physical addresses
794 * rather than virtual addresses, so we use __pa() here. */ 805 * rather than virtual addresses, so we use __pa() here. */
@@ -816,8 +827,9 @@ static struct notifier_block paniced = {
816/* Setting up memory is fairly easy. */ 827/* Setting up memory is fairly easy. */
817static __init char *lguest_memory_setup(void) 828static __init char *lguest_memory_setup(void)
818{ 829{
819 /* We do this here and not earlier because lockcheck barfs if we do it 830 /* We do this here and not earlier because lockcheck used to barf if we
820 * before start_kernel() */ 831 * did it before start_kernel(). I think we fixed that, so it'd be
832 * nice to move it back to lguest_init. Patch welcome... */
821 atomic_notifier_chain_register(&panic_notifier_list, &paniced); 833 atomic_notifier_chain_register(&panic_notifier_list, &paniced);
822 834
823 /* The Linux bootloader header contains an "e820" memory map: the 835 /* The Linux bootloader header contains an "e820" memory map: the
@@ -850,12 +862,19 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
850 return len; 862 return len;
851} 863}
852 864
865/* Rebooting also tells the Host we're finished, but the RESTART flag tells the
866 * Launcher to reboot us. */
867static void lguest_restart(char *reason)
868{
869 hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0);
870}
871
853/*G:050 872/*G:050
854 * Patching (Powerfully Placating Performance Pedants) 873 * Patching (Powerfully Placating Performance Pedants)
855 * 874 *
856 * We have already seen that pv_ops structures let us replace simple 875 * We have already seen that pv_ops structures let us replace simple native
857 * native instructions with calls to the appropriate back end all throughout 876 * instructions with calls to the appropriate back end all throughout the
858 * the kernel. This allows the same kernel to run as a Guest and as a native 877 * kernel. This allows the same kernel to run as a Guest and as a native
859 * kernel, but it's slow because of all the indirect branches. 878 * kernel, but it's slow because of all the indirect branches.
860 * 879 *
861 * Remember that David Wheeler quote about "Any problem in computer science can 880 * Remember that David Wheeler quote about "Any problem in computer science can
@@ -908,14 +927,9 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
908 return insn_len; 927 return insn_len;
909} 928}
910 929
911static void lguest_restart(char *reason) 930/*G:030 Once we get to lguest_init(), we know we're a Guest. The various
912{ 931 * pv_ops structures in the kernel provide points for (almost) every routine we
913 hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0); 932 * have to override to avoid privileged instructions. */
914}
915
916/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops
917 * structures in the kernel provide points for (almost) every routine we have
918 * to override to avoid privileged instructions. */
919__init void lguest_init(void) 933__init void lguest_init(void)
920{ 934{
921 /* We're under lguest, paravirt is enabled, and we're running at 935 /* We're under lguest, paravirt is enabled, and we're running at
@@ -1003,9 +1017,9 @@ __init void lguest_init(void)
1003 * the normal data segment to get through booting. */ 1017 * the normal data segment to get through booting. */
1004 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); 1018 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
1005 1019
1006 /* The Host uses the top of the Guest's virtual address space for the 1020 /* The Host<->Guest Switcher lives at the top of our address space, and
1007 * Host<->Guest Switcher, and it tells us how big that is in 1021 * the Host told us how big it is when we made LGUEST_INIT hypercall:
1008 * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */ 1022 * it put the answer in lguest_data.reserve_mem */
1009 reserve_top_address(lguest_data.reserve_mem); 1023 reserve_top_address(lguest_data.reserve_mem);
1010 1024
1011 /* If we don't initialize the lock dependency checker now, it crashes 1025 /* If we don't initialize the lock dependency checker now, it crashes
@@ -1027,6 +1041,7 @@ __init void lguest_init(void)
1027 /* Math is always hard! */ 1041 /* Math is always hard! */
1028 new_cpu_data.hard_math = 1; 1042 new_cpu_data.hard_math = 1;
1029 1043
1044 /* We don't have features. We have puppies! Puppies! */
1030#ifdef CONFIG_X86_MCE 1045#ifdef CONFIG_X86_MCE
1031 mce_disabled = 1; 1046 mce_disabled = 1;
1032#endif 1047#endif
@@ -1044,10 +1059,11 @@ __init void lguest_init(void)
1044 virtio_cons_early_init(early_put_chars); 1059 virtio_cons_early_init(early_put_chars);
1045 1060
1046 /* Last of all, we set the power management poweroff hook to point to 1061 /* Last of all, we set the power management poweroff hook to point to
1047 * the Guest routine to power off. */ 1062 * the Guest routine to power off, and the reboot hook to our restart
1063 * routine. */
1048 pm_power_off = lguest_power_off; 1064 pm_power_off = lguest_power_off;
1049
1050 machine_ops.restart = lguest_restart; 1065 machine_ops.restart = lguest_restart;
1066
1051 /* Now we're set up, call start_kernel() in init/main.c and we proceed 1067 /* Now we're set up, call start_kernel() in init/main.c and we proceed
1052 * to boot as normal. It never returns. */ 1068 * to boot as normal. It never returns. */
1053 start_kernel(); 1069 start_kernel();