commit     eadb8a091b27a840de7450f84ecff5ef13476424
tree       58c3782d40def63baa8167f3d31e3048cb4c7660
parent     73874005cd8800440be4299bd095387fff4b90ac
parent     65795efbd380a832ae508b04dba8f8e53f0b84d9
author     Ingo Molnar <mingo@elte.hu>   2009-06-17 06:52:15 -0400
committer  Ingo Molnar <mingo@elte.hu>   2009-06-17 06:56:49 -0400

Merge branch 'linus' into tracing/hw-breakpoints

Conflicts:
        arch/x86/Kconfig
        arch/x86/kernel/traps.c
        arch/x86/power/cpu.c
        arch/x86/power/cpu_32.c
        kernel/Makefile

Semantic conflict:
        arch/x86/kernel/hw_breakpoint.c

Merge reason: Resolve the conflicts, move from put_cpu_no_sched() to
put_cpu() in arch/x86/kernel/hw_breakpoint.c.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/lguest')
-rw-r--r--   arch/x86/lguest/Kconfig        1
-rw-r--r--   arch/x86/lguest/Makefile       1
-rw-r--r--   arch/x86/lguest/boot.c       193
-rw-r--r--   arch/x86/lguest/i386_head.S   60
4 files changed, 197 insertions(+), 58 deletions(-)
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 8dab8f7844d3..38718041efc3 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,7 +2,6 @@ config LGUEST_GUEST
         bool "Lguest guest support"
         select PARAVIRT
         depends on X86_32
-        depends on !X86_PAE
         select VIRTIO
         select VIRTIO_RING
         select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile
index 27f0c9ed7f60..94e0e54056a9 100644
--- a/arch/x86/lguest/Makefile
+++ b/arch/x86/lguest/Makefile
@@ -1 +1,2 @@
 obj-y           := i386_head.o boot.o
+CFLAGS_boot.o   := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ca7ec44bafc3..7bc65f0f62c4 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -67,6 +67,7 @@
 #include <asm/mce.h>
 #include <asm/io.h>
 #include <asm/i387.h>
+#include <asm/stackprotector.h>
 #include <asm/reboot.h>		/* for struct machine_ops */
 
 /*G:010 Welcome to the Guest!
@@ -86,7 +87,7 @@ struct lguest_data lguest_data = {
 
 /*G:037 async_hcall() is pretty simple: I'm quite proud of it really.  We have a
  * ring buffer of stored hypercalls which the Host will run through next time we
- * do a normal hypercall.  Each entry in the ring has 4 slots for the hypercall
+ * do a normal hypercall.  Each entry in the ring has 5 slots for the hypercall
  * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
  * and 255 once the Host has finished with it.
  *
@@ -95,7 +96,8 @@ struct lguest_data lguest_data = {
  * effect of causing the Host to run all the stored calls in the ring buffer
  * which empties it for next time! */
 static void async_hcall(unsigned long call, unsigned long arg1,
-                        unsigned long arg2, unsigned long arg3)
+                        unsigned long arg2, unsigned long arg3,
+                        unsigned long arg4)
 {
         /* Note: This code assumes we're uniprocessor. */
         static unsigned int next_call;
@@ -107,12 +109,13 @@ static void async_hcall(unsigned long call, unsigned long arg1,
         local_irq_save(flags);
         if (lguest_data.hcall_status[next_call] != 0xFF) {
                 /* Table full, so do normal hcall which will flush table. */
-                kvm_hypercall3(call, arg1, arg2, arg3);
+                kvm_hypercall4(call, arg1, arg2, arg3, arg4);
         } else {
                 lguest_data.hcalls[next_call].arg0 = call;
                 lguest_data.hcalls[next_call].arg1 = arg1;
                 lguest_data.hcalls[next_call].arg2 = arg2;
                 lguest_data.hcalls[next_call].arg3 = arg3;
+                lguest_data.hcalls[next_call].arg4 = arg4;
                 /* Arguments must all be written before we mark it to go */
                 wmb();
                 lguest_data.hcall_status[next_call] = 0;
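
For readers who want the batching idea outside of kernel context, here is a minimal user-space sketch of the same scheme: a fixed ring of argument slots, a status byte per slot (0 meaning "ready for the Host", 0xFF meaning "consumed"), and a fallback to an immediate call when the ring is full. Everything here (pending_call, ring_submit, hypercall_now) is made-up illustration rather than lguest code; the real version also runs with interrupts disabled and needs the wmb() barrier noted below.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define RING_SIZE 64

struct pending_call {
        unsigned long arg0, arg1, arg2, arg3, arg4;
};

static struct pending_call ring[RING_SIZE];
static uint8_t ring_status[RING_SIZE];          /* 0xFF == free, 0 == ready */
static unsigned int next_slot;

/* Stand-in for a synchronous hypercall; the real one also makes the Host
 * drain whatever is already queued in the ring. */
static void hypercall_now(unsigned long call, unsigned long a1,
                          unsigned long a2, unsigned long a3, unsigned long a4)
{
        printf("immediate call %lu(%lu, %lu, %lu, %lu)\n", call, a1, a2, a3, a4);
}

static void ring_submit(unsigned long call, unsigned long a1,
                        unsigned long a2, unsigned long a3, unsigned long a4)
{
        if (ring_status[next_slot] != 0xFF) {
                /* Ring full: fall back to a synchronous call. */
                hypercall_now(call, a1, a2, a3, a4);
                return;
        }
        ring[next_slot] = (struct pending_call){ call, a1, a2, a3, a4 };
        /* The kernel version issues wmb() here so the arguments are visible
         * before the slot is marked ready for the Host. */
        ring_status[next_slot] = 0;
        next_slot = (next_slot + 1) % RING_SIZE;
}

int main(void)
{
        memset(ring_status, 0xFF, sizeof(ring_status)); /* all slots free */
        ring_submit(1, 2, 3, 4, 5);                     /* queued in slot 0 */
        printf("slot 0 status = %u\n", ring_status[0]);
        return 0;
}
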
@@ -140,7 +143,7 @@ static void lazy_hcall1(unsigned long call,
         if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
                 kvm_hypercall1(call, arg1);
         else
-                async_hcall(call, arg1, 0, 0);
+                async_hcall(call, arg1, 0, 0, 0);
 }
 
 static void lazy_hcall2(unsigned long call,
@@ -150,7 +153,7 @@ static void lazy_hcall2(unsigned long call,
         if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
                 kvm_hypercall2(call, arg1, arg2);
         else
-                async_hcall(call, arg1, arg2, 0);
+                async_hcall(call, arg1, arg2, 0, 0);
 }
 
 static void lazy_hcall3(unsigned long call,
@@ -161,18 +164,38 @@ static void lazy_hcall3(unsigned long call,
         if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
                 kvm_hypercall3(call, arg1, arg2, arg3);
         else
-                async_hcall(call, arg1, arg2, arg3);
+                async_hcall(call, arg1, arg2, arg3, 0);
 }
 
+#ifdef CONFIG_X86_PAE
+static void lazy_hcall4(unsigned long call,
+                        unsigned long arg1,
+                        unsigned long arg2,
+                        unsigned long arg3,
+                        unsigned long arg4)
+{
+        if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+                kvm_hypercall4(call, arg1, arg2, arg3, arg4);
+        else
+                async_hcall(call, arg1, arg2, arg3, arg4);
+}
+#endif
+
 /* When lazy mode is turned off reset the per-cpu lazy mode variable and then
  * issue the do-nothing hypercall to flush any stored calls. */
-static void lguest_leave_lazy_mode(void)
+static void lguest_leave_lazy_mmu_mode(void)
+{
+        kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+        paravirt_leave_lazy_mmu();
+}
+
+static void lguest_end_context_switch(struct task_struct *next)
 {
-        paravirt_leave_lazy(paravirt_get_lazy_mode());
         kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+        paravirt_end_context_switch(next);
 }
 
-/*G:033
+/*G:032
  * After that diversion we return to our first native-instruction
  * replacements: four functions for interrupt control.
  *
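
The enter/leave pair above is what turns individual page-table hypercalls into a batch: while lazy mode is active the calls are queued via async_hcall(), and leaving the mode issues the do-nothing LHCALL_FLUSH_ASYNC so the Host drains the queue at one well-defined point. A toy model of that control flow, with invented names (lazy_mmu_active, flush_queue, page_table_hypercall) used only to show the shape:

#include <stdbool.h>
#include <stdio.h>

static bool lazy_mmu_active;
static int queued;                              /* calls waiting for the Host */

static void flush_queue(void)
{
        /* Stands in for the do-nothing LHCALL_FLUSH_ASYNC hypercall: its only
         * effect is to make the Host run whatever has been queued. */
        printf("Host runs %d queued call(s)\n", queued);
        queued = 0;
}

static void page_table_hypercall(const char *what)
{
        if (lazy_mmu_active)
                queued++;                       /* batched; Host sees it later */
        else
                printf("Host runs %s immediately\n", what);
}

static void enter_lazy_mmu(void)
{
        lazy_mmu_active = true;
}

static void leave_lazy_mmu(void)
{
        lazy_mmu_active = false;
        flush_queue();                          /* one flush drains the batch */
}

int main(void)
{
        page_table_hypercall("set_pte");        /* outside lazy mode: immediate */
        enter_lazy_mmu();
        page_table_hypercall("set_pte");        /* inside: queued */
        page_table_hypercall("set_pmd");
        leave_lazy_mmu();
        return 0;
}
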
@@ -192,30 +215,28 @@ static unsigned long save_fl(void)
 {
         return lguest_data.irq_enabled;
 }
-PV_CALLEE_SAVE_REGS_THUNK(save_fl);
-
-/* restore_flags() just sets the flags back to the value given. */
-static void restore_fl(unsigned long flags)
-{
-        lguest_data.irq_enabled = flags;
-}
-PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
 
 /* Interrupts go off... */
 static void irq_disable(void)
 {
         lguest_data.irq_enabled = 0;
 }
+
+/* Let's pause a moment.  Remember how I said these are called so often?
+ * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
+ * break some rules.  In particular, these functions are assumed to save their
+ * own registers if they need to: normal C functions assume they can trash the
+ * eax register.  To use normal C functions, we use
+ * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
+ * C function, then restores it. */
+PV_CALLEE_SAVE_REGS_THUNK(save_fl);
 PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
+/*:*/
 
-/* Interrupts go on... */
-static void irq_enable(void)
-{
-        lguest_data.irq_enabled = X86_EFLAGS_IF;
-}
-PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
+/* These are in i386_head.S */
+extern void lg_irq_enable(void);
+extern void lg_restore_fl(unsigned long flags);
 
-/*:*/
 /*M:003 Note that we don't check for outstanding interrupts when we re-enable
  * them (or when we unmask an interrupt).  This seems to work for the moment,
  * since interrupts are rare and we'll just get the interrupt on the next timer
@@ -361,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
         case 1:	/* Basic feature request. */
                 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
                 *cx &= 0x00002201;
-                /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */
-                *dx &= 0x07808111;
+                /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
+                *dx &= 0x07808151;
                 /* The Host can do a nice optimization if it knows that the
                  * kernel mappings (addresses above 0xC0000000 or whatever
                  * PAGE_OFFSET is set to) haven't changed.  But Linux calls
@@ -381,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
                 if (*ax > 0x80000008)
                         *ax = 0x80000008;
                 break;
+        case 0x80000001:
+                /* Here we should fix nx cap depending on host. */
+                /* For this version of PAE, we just clear NX bit. */
+                *dx &= ~(1 << 20);
+                break;
         }
 }
 
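
The widened leaf-1 EDX mask differs from the old one by exactly one bit: 0x07808151 ^ 0x07808111 = 0x40, which is bit 6 of CPUID.01H:EDX, the PAE feature flag, matching the updated comment. The new 0x80000001 case then hides NX (bit 20 of extended EDX) because this first PAE version does not pass it through from the Host. A small stand-alone check of the leaf-1 mask, using the standard architectural bit positions:

#include <stdio.h>

int main(void)
{
        /* CPUID.01H:EDX feature bits the Guest is now allowed to see. */
        struct { int bit; const char *name; } feat[] = {
                { 0, "FPU" }, { 4, "TSC" }, { 6, "PAE" }, { 8, "CMPXCHG8B" },
                { 15, "CMOV" }, { 23, "MMX" }, { 24, "FXSR" },
                { 25, "SSE" }, { 26, "SSE2" },
        };
        unsigned int old_mask = 0x07808111, new_mask = 0x07808151;
        unsigned int i;

        printf("newly exposed bits: %#x\n", old_mask ^ new_mask);  /* 0x40: PAE */
        for (i = 0; i < sizeof(feat) / sizeof(feat[0]); i++)
                printf("bit %2d (%s): %s\n", feat[i].bit, feat[i].name,
                       (new_mask >> feat[i].bit) & 1 ? "allowed" : "hidden");
        return 0;
}
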
@@ -514,25 +540,52 @@ static void lguest_write_cr4(unsigned long val)
 static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
                               pte_t *ptep)
 {
+#ifdef CONFIG_X86_PAE
+        lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
+                    ptep->pte_low, ptep->pte_high);
+#else
         lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
+#endif
 }
 
 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
                               pte_t *ptep, pte_t pteval)
 {
-        *ptep = pteval;
+        native_set_pte(ptep, pteval);
         lguest_pte_update(mm, addr, ptep);
 }
 
-/* The Guest calls this to set a top-level entry.  Again, we set the entry then
- * tell the Host which top-level page we changed, and the index of the entry we
- * changed. */
+/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
+ * to set a middle-level entry when PAE is activated.
+ * Again, we set the entry then tell the Host which page we changed,
+ * and the index of the entry we changed. */
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pud(pud_t *pudp, pud_t pudval)
+{
+        native_set_pud(pudp, pudval);
+
+        /* 32 bytes aligned pdpt address and the index. */
+        lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
+                    (__pa(pudp) & 0x1F) / sizeof(pud_t));
+}
+
 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
-        *pmdp = pmdval;
+        native_set_pmd(pmdp, pmdval);
         lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
-                    (__pa(pmdp) & (PAGE_SIZE - 1)) / 4);
+                    (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
 }
+#else
+
+/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not
+ * activated. */
+static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+        native_set_pmd(pmdp, pmdval);
+        lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
+                    (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
+}
+#endif
 
 /* There are a couple of legacy places where the kernel sets a PTE, but we
  * don't know the top level any more.  This is useless for us, since we don't
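
The arithmetic in lguest_set_pud() is compact but easy to unpack: a PAE page-directory-pointer table is a 32-byte-aligned array of four 8-byte entries, so masking the entry's physical address with 0xFFFFFFE0 recovers the table base, and the low five bits divided by sizeof(pud_t) give the entry index the Host needs. A tiny stand-alone check, using a made-up physical address:

#include <stdio.h>

int main(void)
{
        /* Hypothetical physical address of the third pdpt entry (index 2):
         * the table starts at 0x1c3fe0 and each entry is 8 bytes. */
        unsigned long pa = 0x1c3ff0;

        unsigned long table = pa & 0xFFFFFFE0;  /* 32-byte aligned table base */
        unsigned long index = (pa & 0x1F) / 8;  /* 8 == sizeof(pud_t) under PAE */

        printf("table=%#lx index=%lu\n", table, index); /* table=0x1c3fe0 index=2 */
        return 0;
}
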
@@ -545,11 +598,31 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
  * which brings boot back to 0.25 seconds. */
 static void lguest_set_pte(pte_t *ptep, pte_t pteval)
 {
-        *ptep = pteval;
+        native_set_pte(ptep, pteval);
         if (cr3_changed)
                 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
 }
 
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+        native_set_pte_atomic(ptep, pte);
+        if (cr3_changed)
+                lazy_hcall1(LHCALL_FLUSH_TLB, 1);
+}
+
+void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+        native_pte_clear(mm, addr, ptep);
+        lguest_pte_update(mm, addr, ptep);
+}
+
+void lguest_pmd_clear(pmd_t *pmdp)
+{
+        lguest_set_pmd(pmdp, __pmd(0));
+}
+#endif
+
 /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
  * native page table operations.  On native hardware you can set a new page
  * table entry whenever you want, but if you want to remove one you have to do
@@ -621,13 +694,12 @@ static void __init lguest_init_IRQ(void)
 {
         unsigned int i;
 
-        for (i = 0; i < LGUEST_IRQS; i++) {
-                int vector = FIRST_EXTERNAL_VECTOR + i;
+        for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
                 /* Some systems map "vectors" to interrupts weirdly.  Lguest has
                  * a straightforward 1 to 1 mapping, so force that here. */
-                __get_cpu_var(vector_irq)[vector] = i;
-                if (vector != SYSCALL_VECTOR)
-                        set_intr_gate(vector, interrupt[i]);
+                __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
+                if (i != SYSCALL_VECTOR)
+                        set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
         }
         /* This call is required to set up for 4k stacks, where we have
          * separate stacks for hard and soft interrupts. */
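
The rewritten loop walks hardware vectors and derives the irq number from the vector rather than the other way around; external vectors start at FIRST_EXTERNAL_VECTOR (0x20 on x86, stated here as background rather than taken from the hunk), so vector 0x20 maps to irq 0, 0x21 to irq 1, and so on. A trivial illustration of that 1:1 mapping:

#include <stdio.h>

#define FIRST_EXTERNAL_VECTOR 0x20      /* assumed value, as on 32-bit x86 */

int main(void)
{
        unsigned int vector;

        for (vector = FIRST_EXTERNAL_VECTOR;
             vector < FIRST_EXTERNAL_VECTOR + 4; vector++)
                printf("vector 0x%02x -> irq %u\n",
                       vector, vector - FIRST_EXTERNAL_VECTOR);
        return 0;
}
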
@@ -636,7 +708,7 @@ static void __init lguest_init_IRQ(void)
 
 void lguest_setup_irq(unsigned int irq)
 {
-        irq_to_desc_alloc_cpu(irq, 0);
+        irq_to_desc_alloc_node(irq, 0);
         set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
                                       handle_level_irq, "level");
 }
@@ -966,10 +1038,10 @@ static void lguest_restart(char *reason)
  *
  * Our current solution is to allow the paravirt back end to optionally patch
  * over the indirect calls to replace them with something more efficient.  We
- * patch the four most commonly called functions: disable interrupts, enable
- * interrupts, restore interrupts and save interrupts.  We usually have 6 or 10
- * bytes to patch into: the Guest versions of these operations are small enough
- * that we can fit comfortably.
+ * patch two of the simplest of the most commonly called functions: disable
+ * interrupts and save interrupts.  We usually have 6 or 10 bytes to patch
+ * into: the Guest versions of these operations are small enough that we can
+ * fit comfortably.
  *
  * First we need assembly templates of each of the patchable Guest operations,
  * and these are in i386_head.S. */
@@ -980,8 +1052,6 @@ static const struct lguest_insns
         const char *start, *end;
 } lguest_insns[] = {
         [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
-        [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
-        [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
         [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
 };
 
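
The patcher that consumes lguest_insns[] is not part of this hunk, but its job is easy to sketch: each patchable operation has an assembly template bounded by lgstart_*/lgend_* labels (defined by LGUEST_PATCH in i386_head.S), and patching copies the template bytes over the call site when they fit, otherwise leaving the indirect call for the generic fallback. A rough, self-contained sketch of that shape (insn_template and patch_site are illustrative names, not the ones in boot.c):

#include <string.h>

/* Illustrative only -- a simplified template-copying patcher, not a verbatim
 * copy of lguest's. */
struct insn_template {
        const char *start, *end;        /* bounds of the assembly template */
};

static unsigned int patch_site(const struct insn_template *tmpl,
                               void *site, unsigned int site_len)
{
        unsigned int tmpl_len;

        /* No template registered for this operation: leave the call alone
         * (the real code falls back to the generic paravirt patcher). */
        if (!tmpl->start)
                return 0;

        tmpl_len = tmpl->end - tmpl->start;
        if (tmpl_len > site_len)        /* does not fit in the call site */
                return 0;

        memcpy(site, tmpl->start, tmpl_len);    /* overwrite the indirect call */
        return tmpl_len;
}
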
@@ -1019,6 +1089,7 @@ __init void lguest_init(void)
         pv_info.name = "lguest";
         pv_info.paravirt_enabled = 1;
         pv_info.kernel_rpl = 1;
+        pv_info.shared_kernel_pmd = 1;
 
         /* We set up all the lguest overrides for sensitive operations.  These
          * are detailed with the operations themselves. */
@@ -1026,9 +1097,9 @@ __init void lguest_init(void)
         /* interrupt-related operations */
         pv_irq_ops.init_IRQ = lguest_init_IRQ;
         pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
-        pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+        pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
         pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
-        pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
+        pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
         pv_irq_ops.safe_halt = lguest_safe_halt;
 
         /* init-time operations */
@@ -1053,8 +1124,8 @@ __init void lguest_init(void)
         pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
         pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
         pv_cpu_ops.wbinvd = lguest_wbinvd;
-        pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
-        pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+        pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
+        pv_cpu_ops.end_context_switch = lguest_end_context_switch;
 
         /* pagetable management */
         pv_mmu_ops.write_cr3 = lguest_write_cr3;
@@ -1064,10 +1135,16 @@ __init void lguest_init(void)
         pv_mmu_ops.set_pte = lguest_set_pte;
         pv_mmu_ops.set_pte_at = lguest_set_pte_at;
         pv_mmu_ops.set_pmd = lguest_set_pmd;
+#ifdef CONFIG_X86_PAE
+        pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
+        pv_mmu_ops.pte_clear = lguest_pte_clear;
+        pv_mmu_ops.pmd_clear = lguest_pmd_clear;
+        pv_mmu_ops.set_pud = lguest_set_pud;
+#endif
         pv_mmu_ops.read_cr2 = lguest_read_cr2;
         pv_mmu_ops.read_cr3 = lguest_read_cr3;
         pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
-        pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+        pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
         pv_mmu_ops.pte_update = lguest_pte_update;
         pv_mmu_ops.pte_update_defer = lguest_pte_update;
 
@@ -1088,13 +1165,21 @@ __init void lguest_init(void)
          * lguest_init() where the rest of the fairly chaotic boot setup
          * occurs. */
 
+        /* The stack protector is a weird thing where gcc places a canary
+         * value on the stack and then checks it on return.  This file is
+         * compiled with -fno-stack-protector, so we got this far without
+         * problems.  The value of the canary is kept at offset 20 from the
+         * %gs register, so we need to set that up before calling C functions
+         * in other files. */
+        setup_stack_canary_segment(0);
+        /* We could just call load_stack_canary_segment(), but we might as
+         * well call switch_to_new_gdt() which loads the whole table and sets
+         * up the per-cpu segment descriptor register %fs as well. */
+        switch_to_new_gdt(0);
+
         /* As described in head_32.S, we map the first 128M of memory. */
         max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
 
-        /* Load the %fs segment register (the per-cpu segment register) with
-         * the normal data segment to get through booting. */
-        asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
-
         /* The Host<->Guest Switcher lives at the top of our address space, and
          * the Host told us how big it is when we made LGUEST_INIT hypercall:
          * it put the answer in lguest_data.reserve_mem */
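
For context on the comment above: with gcc's -fstack-protector on 32-bit x86, every protected function copies a canary from %gs:20 into its stack frame on entry and compares it again before returning, calling __stack_chk_fail() if the two differ, which is why %gs must point at a valid per-cpu area before any stack-protected C code runs. Roughly, the generated code behaves like this hand-written approximation (read_canary() and do_real_work() are hypothetical stand-ins; this is not code from boot.c):

extern void __stack_chk_fail(void);             /* real gcc/libc helper */
extern unsigned long read_canary(void);         /* hypothetical: reads %gs:20 */
extern void do_real_work(char *buf);            /* hypothetical workload */

void protected_function(void)
{
        unsigned long canary = read_canary();   /* prologue: stash the canary */
        char buf[64];

        do_real_work(buf);

        if (canary != read_canary())            /* epilogue: check before return */
                __stack_chk_fail();             /* mismatch == stack smashed */
}
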
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index f79541989471..a9c8cfe61cd4 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -46,10 +46,64 @@ ENTRY(lguest_entry)
         .globl lgstart_##name; .globl lgend_##name
 
 LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
 LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-/*:*/
+
+/*G:033 But using those wrappers is inefficient (we'll see why that doesn't
+ * matter for save_fl and irq_disable later).  If we write our routines
+ * carefully in assembler, we can avoid clobbering any registers and avoid
+ * jumping through the wrapper functions.
+ *
+ * I skipped over our first piece of assembler, but this one is worth studying
+ * in a bit more detail so I'll describe in easy stages.  First, the routine
+ * to enable interrupts: */
+ENTRY(lg_irq_enable)
+        /* The reverse of irq_disable, this sets lguest_data.irq_enabled to
+         * X86_EFLAGS_IF (ie. "Interrupts enabled"). */
+        movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
+        /* But now we need to check if the Host wants to know: there might have
+         * been interrupts waiting to be delivered, in which case it will have
+         * set lguest_data.irq_pending to X86_EFLAGS_IF.  If it's not zero, we
+         * jump to send_interrupts, otherwise we're done. */
+        testl $0, lguest_data+LGUEST_DATA_irq_pending
+        jnz send_interrupts
+        /* One cool thing about x86 is that you can do many things without using
+         * a register.  In this case, the normal path hasn't needed to save or
+         * restore any registers at all! */
+        ret
+send_interrupts:
+        /* OK, now we need a register: eax is used for the hypercall number,
+         * which is LHCALL_SEND_INTERRUPTS.
+         *
+         * We used not to bother with this pending detection at all, which was
+         * much simpler.  Sooner or later the Host would realize it had to
+         * send us an interrupt.  But that turns out to make performance 7
+         * times worse on a simple tcp benchmark.  So now we do this the hard
+         * way. */
+        pushl %eax
+        movl $LHCALL_SEND_INTERRUPTS, %eax
+        /* This is a vmcall instruction (same thing that KVM uses).  Older
+         * assembler versions might not know the "vmcall" instruction, so we
+         * create one manually here. */
+        .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+        popl %eax
+        ret
+
+/* Finally, the "popf" or "restore flags" routine.  The %eax register holds the
+ * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
+ * enabling interrupts again, if it's 0 we're leaving them off. */
+ENTRY(lg_restore_fl)
+        /* This is just "lguest_data.irq_enabled = flags;" */
+        movl %eax, lguest_data+LGUEST_DATA_irq_enabled
+        /* Now, if the %eax value has enabled interrupts and
+         * lguest_data.irq_pending is set, we want to tell the Host so it can
+         * deliver any outstanding interrupts.  Fortunately, both values will
+         * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
+         * instruction will AND them together for us.  If both are set, we
+         * jump to send_interrupts. */
+        testl lguest_data+LGUEST_DATA_irq_pending, %eax
+        jnz send_interrupts
+        /* Again, the normal path has used no extra registers.  Clever, huh? */
+        ret
 
 /* These demark the EIP range where host should never deliver interrupts. */
 .global lguest_noirq_start
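
The testl trick in lg_restore_fl works because both operands are constrained to be either 0 or X86_EFLAGS_IF (0x200, decimal 512), so the bitwise AND that testl computes is non-zero exactly when interrupts are being re-enabled while an interrupt is already pending. The four possible cases, checked in a few lines of C:

#include <stdio.h>

#define X86_EFLAGS_IF 0x200     /* "interrupts enabled" flag, decimal 512 */

int main(void)
{
        unsigned long flags[]   = { 0, 0, X86_EFLAGS_IF, X86_EFLAGS_IF };
        unsigned long pending[] = { 0, X86_EFLAGS_IF, 0, X86_EFLAGS_IF };
        int i;

        for (i = 0; i < 4; i++)
                printf("eax=%#5lx irq_pending=%#5lx -> testl result %#5lx (%s)\n",
                       flags[i], pending[i], flags[i] & pending[i],
                       (flags[i] & pending[i]) ? "notify the Host" : "nothing to do");
        return 0;
}
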