author		Ingo Molnar <mingo@elte.hu>	2009-06-17 06:52:15 -0400
committer	Ingo Molnar <mingo@elte.hu>	2009-06-17 06:56:49 -0400
commit		eadb8a091b27a840de7450f84ecff5ef13476424 (patch)
tree		58c3782d40def63baa8167f3d31e3048cb4c7660 /arch/x86/lguest
parent		73874005cd8800440be4299bd095387fff4b90ac (diff)
parent		65795efbd380a832ae508b04dba8f8e53f0b84d9 (diff)

Merge branch 'linus' into tracing/hw-breakpoints

Conflicts:
	arch/x86/Kconfig
	arch/x86/kernel/traps.c
	arch/x86/power/cpu.c
	arch/x86/power/cpu_32.c
	kernel/Makefile

Semantic conflict:
	arch/x86/kernel/hw_breakpoint.c

Merge reason: Resolve the conflicts, move from put_cpu_no_sched() to
put_cpu() in arch/x86/kernel/hw_breakpoint.c.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/lguest')

-rw-r--r--	arch/x86/lguest/Kconfig     |   1
-rw-r--r--	arch/x86/lguest/Makefile    |   1
-rw-r--r--	arch/x86/lguest/boot.c      | 193
-rw-r--r--	arch/x86/lguest/i386_head.S |  60

4 files changed, 197 insertions, 58 deletions
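A note before the per-file diffs: the bulk of the boot.c change widens lguest's asynchronous hypercall ring so each deferred entry carries a fourth argument (PAE page-table writes need to pass pte_low and pte_high together). The batching idea itself is small; here is a self-contained sketch of it under the same ready/consumed status convention, using made-up names (queue_call, sync_call, NCALLS) rather than the kernel's lguest_data layout:

#include <stdio.h>
#include <string.h>

#define NCALLS		8	/* illustrative ring size, not LHCALL_RING_SIZE */
#define SLOT_READY	0x00	/* the host may run this entry */
#define SLOT_DONE	0xFF	/* the host has consumed this entry */

struct pending_call {
	unsigned long call, arg1, arg2, arg3, arg4;
};

static struct pending_call ring[NCALLS];
static unsigned char status[NCALLS];
static unsigned int next_call;

/* Stand-in for a synchronous hypercall; in the guest it also drains the ring. */
static void sync_call(unsigned long call, unsigned long a1, unsigned long a2,
		      unsigned long a3, unsigned long a4)
{
	printf("sync hypercall %lu(%lu, %lu, %lu, %lu)\n", call, a1, a2, a3, a4);
}

/* Queue a call if there is room, otherwise fall back to a direct call,
 * the same shape as async_hcall() in the patch, minus irq and SMP concerns. */
static void queue_call(unsigned long call, unsigned long a1, unsigned long a2,
		       unsigned long a3, unsigned long a4)
{
	if (status[next_call] != SLOT_DONE) {
		sync_call(call, a1, a2, a3, a4);	/* ring full: flush now */
		return;
	}
	ring[next_call].call = call;
	ring[next_call].arg1 = a1;
	ring[next_call].arg2 = a2;
	ring[next_call].arg3 = a3;
	ring[next_call].arg4 = a4;	/* the extra slot this series adds */
	status[next_call] = SLOT_READY;	/* written last, like the wmb()-guarded store */
	if (++next_call == NCALLS)
		next_call = 0;
}

int main(void)
{
	memset(status, SLOT_DONE, sizeof(status));	/* all slots start consumed */
	queue_call(1, 0x1000, 0x2000, 3, 4);
	queue_call(2, 0xdead, 0, 0, 0);
	return 0;
}

In the real guest the ring lives in lguest_data, the Host drains it on the next synchronous hypercall, and the status byte is written only after a wmb() so the Host never sees a half-filled entry.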
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 8dab8f7844d3..38718041efc3 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,7 +2,6 @@ config LGUEST_GUEST
 	bool "Lguest guest support"
 	select PARAVIRT
 	depends on X86_32
-	depends on !X86_PAE
 	select VIRTIO
 	select VIRTIO_RING
 	select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile
index 27f0c9ed7f60..94e0e54056a9 100644
--- a/arch/x86/lguest/Makefile
+++ b/arch/x86/lguest/Makefile
@@ -1 +1,2 @@
 obj-y		:= i386_head.o boot.o
+CFLAGS_boot.o	:= $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ca7ec44bafc3..7bc65f0f62c4 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -67,6 +67,7 @@
 #include <asm/mce.h>
 #include <asm/io.h>
 #include <asm/i387.h>
+#include <asm/stackprotector.h>
 #include <asm/reboot.h>		/* for struct machine_ops */
 
 /*G:010 Welcome to the Guest!
@@ -86,7 +87,7 @@ struct lguest_data lguest_data = {
 
 /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a
  * ring buffer of stored hypercalls which the Host will run though next time we
- * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall
+ * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
  * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
  * and 255 once the Host has finished with it.
  *
@@ -95,7 +96,8 @@ struct lguest_data lguest_data = {
  * effect of causing the Host to run all the stored calls in the ring buffer
  * which empties it for next time! */
 static void async_hcall(unsigned long call, unsigned long arg1,
-			unsigned long arg2, unsigned long arg3)
+			unsigned long arg2, unsigned long arg3,
+			unsigned long arg4)
 {
 	/* Note: This code assumes we're uniprocessor. */
 	static unsigned int next_call;
@@ -107,12 +109,13 @@ static void async_hcall(unsigned long call, unsigned long arg1,
 	local_irq_save(flags);
 	if (lguest_data.hcall_status[next_call] != 0xFF) {
 		/* Table full, so do normal hcall which will flush table. */
-		kvm_hypercall3(call, arg1, arg2, arg3);
+		kvm_hypercall4(call, arg1, arg2, arg3, arg4);
 	} else {
 		lguest_data.hcalls[next_call].arg0 = call;
 		lguest_data.hcalls[next_call].arg1 = arg1;
 		lguest_data.hcalls[next_call].arg2 = arg2;
 		lguest_data.hcalls[next_call].arg3 = arg3;
+		lguest_data.hcalls[next_call].arg4 = arg4;
 		/* Arguments must all be written before we mark it to go */
 		wmb();
 		lguest_data.hcall_status[next_call] = 0;
@@ -140,7 +143,7 @@ static void lazy_hcall1(unsigned long call,
 	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
 		kvm_hypercall1(call, arg1);
 	else
-		async_hcall(call, arg1, 0, 0);
+		async_hcall(call, arg1, 0, 0, 0);
 }
 
 static void lazy_hcall2(unsigned long call,
@@ -150,7 +153,7 @@ static void lazy_hcall2(unsigned long call,
 	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
 		kvm_hypercall2(call, arg1, arg2);
 	else
-		async_hcall(call, arg1, arg2, 0);
+		async_hcall(call, arg1, arg2, 0, 0);
 }
 
 static void lazy_hcall3(unsigned long call,
@@ -161,18 +164,38 @@ static void lazy_hcall3(unsigned long call,
 	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
 		kvm_hypercall3(call, arg1, arg2, arg3);
 	else
-		async_hcall(call, arg1, arg2, arg3);
+		async_hcall(call, arg1, arg2, arg3, 0);
 }
 
+#ifdef CONFIG_X86_PAE
+static void lazy_hcall4(unsigned long call,
+		       unsigned long arg1,
+		       unsigned long arg2,
+		       unsigned long arg3,
+		       unsigned long arg4)
+{
+	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+		kvm_hypercall4(call, arg1, arg2, arg3, arg4);
+	else
+		async_hcall(call, arg1, arg2, arg3, arg4);
+}
+#endif
+
 /* When lazy mode is turned off reset the per-cpu lazy mode variable and then
  * issue the do-nothing hypercall to flush any stored calls. */
-static void lguest_leave_lazy_mode(void)
+static void lguest_leave_lazy_mmu_mode(void)
+{
+	kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+	paravirt_leave_lazy_mmu();
+}
+
+static void lguest_end_context_switch(struct task_struct *next)
 {
-	paravirt_leave_lazy(paravirt_get_lazy_mode());
 	kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+	paravirt_end_context_switch(next);
 }
 
-/*G:033
+/*G:032
  * After that diversion we return to our first native-instruction
  * replacements: four functions for interrupt control.
  *
@@ -192,30 +215,28 @@ static unsigned long save_fl(void)
 {
 	return lguest_data.irq_enabled;
 }
-PV_CALLEE_SAVE_REGS_THUNK(save_fl);
-
-/* restore_flags() just sets the flags back to the value given. */
-static void restore_fl(unsigned long flags)
-{
-	lguest_data.irq_enabled = flags;
-}
-PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
 
 /* Interrupts go off... */
 static void irq_disable(void)
 {
 	lguest_data.irq_enabled = 0;
 }
+
+/* Let's pause a moment. Remember how I said these are called so often?
+ * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
+ * break some rules. In particular, these functions are assumed to save their
+ * own registers if they need to: normal C functions assume they can trash the
+ * eax register. To use normal C functions, we use
+ * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
+ * C function, then restores it. */
+PV_CALLEE_SAVE_REGS_THUNK(save_fl);
 PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
+/*:*/
 
-/* Interrupts go on... */
-static void irq_enable(void)
-{
-	lguest_data.irq_enabled = X86_EFLAGS_IF;
-}
-PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
+/* These are in i386_head.S */
+extern void lg_irq_enable(void);
+extern void lg_restore_fl(unsigned long flags);
 
-/*:*/
 /*M:003 Note that we don't check for outstanding interrupts when we re-enable
  * them (or when we unmask an interrupt). This seems to work for the moment,
  * since interrupts are rare and we'll just get the interrupt on the next timer
@@ -361,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 	case 1:	/* Basic feature request. */
 		/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
 		*cx &= 0x00002201;
-		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */
-		*dx &= 0x07808111;
+		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
+		*dx &= 0x07808151;
 		/* The Host can do a nice optimization if it knows that the
 		 * kernel mappings (addresses above 0xC0000000 or whatever
 		 * PAGE_OFFSET is set to) haven't changed. But Linux calls
@@ -381,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 		if (*ax > 0x80000008)
 			*ax = 0x80000008;
 		break;
+	case 0x80000001:
+		/* Here we should fix nx cap depending on host. */
+		/* For this version of PAE, we just clear NX bit. */
+		*dx &= ~(1 << 20);
+		break;
 	}
 }
 
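(An aside on the *dx mask in the cpuid hunk above: the only bit the new value 0x07808151 adds over 0x07808111 is 0x40, bit 6 of CPUID leaf 1's EDX, which is the PAE feature flag. A throwaway check; only the two constants come from the patch:)

#include <stdio.h>

int main(void)
{
	unsigned int old_mask = 0x07808111;	/* FPU, TSC, CX8, CMOV, MMX, FXSR, SSE, SSE2 */
	unsigned int new_mask = 0x07808151;	/* the same, plus one extra feature bit */
	unsigned int added = old_mask ^ new_mask;

	/* Prints "added: 0x40 (bit 6)"; bit 6 of CPUID.1:EDX is PAE. */
	printf("added: 0x%x (bit %d)\n", added, __builtin_ctz(added));
	return 0;
}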
@@ -514,25 +540,52 @@ static void lguest_write_cr4(unsigned long val)
 static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
 			       pte_t *ptep)
 {
+#ifdef CONFIG_X86_PAE
+	lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
+		    ptep->pte_low, ptep->pte_high);
+#else
 	lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
+#endif
 }
 
 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep, pte_t pteval)
 {
-	*ptep = pteval;
+	native_set_pte(ptep, pteval);
 	lguest_pte_update(mm, addr, ptep);
 }
 
-/* The Guest calls this to set a top-level entry. Again, we set the entry then
- * tell the Host which top-level page we changed, and the index of the entry we
- * changed. */
+/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
+ * to set a middle-level entry when PAE is activated.
+ * Again, we set the entry then tell the Host which page we changed,
+ * and the index of the entry we changed. */
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pud(pud_t *pudp, pud_t pudval)
+{
+	native_set_pud(pudp, pudval);
+
+	/* 32 bytes aligned pdpt address and the index. */
+	lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
+		   (__pa(pudp) & 0x1F) / sizeof(pud_t));
+}
+
 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
-	*pmdp = pmdval;
+	native_set_pmd(pmdp, pmdval);
 	lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
-		   (__pa(pmdp) & (PAGE_SIZE - 1)) / 4);
+		   (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
 }
+#else
+
+/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not
+ * activated. */
+static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	native_set_pmd(pmdp, pmdval);
+	lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
+		   (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
+}
+#endif
 
 /* There are a couple of legacy places where the kernel sets a PTE, but we
  * don't know the top level any more. This is useless for us, since we don't
@@ -545,11 +598,31 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
  * which brings boot back to 0.25 seconds. */
 static void lguest_set_pte(pte_t *ptep, pte_t pteval)
 {
-	*ptep = pteval;
+	native_set_pte(ptep, pteval);
 	if (cr3_changed)
 		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
 }
 
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+	native_set_pte_atomic(ptep, pte);
+	if (cr3_changed)
+		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
+}
+
+void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	native_pte_clear(mm, addr, ptep);
+	lguest_pte_update(mm, addr, ptep);
+}
+
+void lguest_pmd_clear(pmd_t *pmdp)
+{
+	lguest_set_pmd(pmdp, __pmd(0));
+}
+#endif
+
 /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
  * native page table operations. On native hardware you can set a new page
  * table entry whenever you want, but if you want to remove one you have to do
@@ -621,13 +694,12 @@ static void __init lguest_init_IRQ(void)
 {
 	unsigned int i;
 
-	for (i = 0; i < LGUEST_IRQS; i++) {
-		int vector = FIRST_EXTERNAL_VECTOR + i;
+	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
 		/* Some systems map "vectors" to interrupts weirdly. Lguest has
 		 * a straightforward 1 to 1 mapping, so force that here. */
-		__get_cpu_var(vector_irq)[vector] = i;
-		if (vector != SYSCALL_VECTOR)
-			set_intr_gate(vector, interrupt[i]);
+		__get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
+		if (i != SYSCALL_VECTOR)
+			set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
 	}
 	/* This call is required to set up for 4k stacks, where we have
 	 * separate stacks for hard and soft interrupts. */
@@ -636,7 +708,7 @@ static void __init lguest_init_IRQ(void)
 
 void lguest_setup_irq(unsigned int irq)
 {
-	irq_to_desc_alloc_cpu(irq, 0);
+	irq_to_desc_alloc_node(irq, 0);
 	set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
 				      handle_level_irq, "level");
 }
@@ -966,10 +1038,10 @@ static void lguest_restart(char *reason)
  *
  * Our current solution is to allow the paravirt back end to optionally patch
  * over the indirect calls to replace them with something more efficient. We
- * patch the four most commonly called functions: disable interrupts, enable
- * interrupts, restore interrupts and save interrupts. We usually have 6 or 10
- * bytes to patch into: the Guest versions of these operations are small enough
- * that we can fit comfortably.
+ * patch two of the simplest of the most commonly called functions: disable
+ * interrupts and save interrupts. We usually have 6 or 10 bytes to patch
+ * into: the Guest versions of these operations are small enough that we can
+ * fit comfortably.
  *
  * First we need assembly templates of each of the patchable Guest operations,
  * and these are in i386_head.S. */
@@ -980,8 +1052,6 @@ static const struct lguest_insns
 	const char *start, *end;
 } lguest_insns[] = {
 	[PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
-	[PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
-	[PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
 	[PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
 };
 
@@ -1019,6 +1089,7 @@ __init void lguest_init(void)
 	pv_info.name = "lguest";
 	pv_info.paravirt_enabled = 1;
 	pv_info.kernel_rpl = 1;
+	pv_info.shared_kernel_pmd = 1;
 
 	/* We set up all the lguest overrides for sensitive operations. These
 	 * are detailed with the operations themselves. */
@@ -1026,9 +1097,9 @@ __init void lguest_init(void)
 	/* interrupt-related operations */
 	pv_irq_ops.init_IRQ = lguest_init_IRQ;
 	pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
-	pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+	pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
 	pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
-	pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
+	pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
 	pv_irq_ops.safe_halt = lguest_safe_halt;
 
 	/* init-time operations */
@@ -1053,8 +1124,8 @@ __init void lguest_init(void)
 	pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
 	pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
 	pv_cpu_ops.wbinvd = lguest_wbinvd;
-	pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
-	pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+	pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
+	pv_cpu_ops.end_context_switch = lguest_end_context_switch;
 
 	/* pagetable management */
 	pv_mmu_ops.write_cr3 = lguest_write_cr3;
@@ -1064,10 +1135,16 @@ __init void lguest_init(void)
 	pv_mmu_ops.set_pte = lguest_set_pte;
 	pv_mmu_ops.set_pte_at = lguest_set_pte_at;
 	pv_mmu_ops.set_pmd = lguest_set_pmd;
+#ifdef CONFIG_X86_PAE
+	pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
+	pv_mmu_ops.pte_clear = lguest_pte_clear;
+	pv_mmu_ops.pmd_clear = lguest_pmd_clear;
+	pv_mmu_ops.set_pud = lguest_set_pud;
+#endif
 	pv_mmu_ops.read_cr2 = lguest_read_cr2;
 	pv_mmu_ops.read_cr3 = lguest_read_cr3;
 	pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
-	pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+	pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
 	pv_mmu_ops.pte_update = lguest_pte_update;
 	pv_mmu_ops.pte_update_defer = lguest_pte_update;
 
@@ -1088,13 +1165,21 @@ __init void lguest_init(void)
 	 * lguest_init() where the rest of the fairly chaotic boot setup
 	 * occurs. */
 
+	/* The stack protector is a weird thing where gcc places a canary
+	 * value on the stack and then checks it on return. This file is
+	 * compiled with -fno-stack-protector it, so we got this far without
+	 * problems. The value of the canary is kept at offset 20 from the
+	 * %gs register, so we need to set that up before calling C functions
+	 * in other files. */
+	setup_stack_canary_segment(0);
+	/* We could just call load_stack_canary_segment(), but we might as
+	 * call switch_to_new_gdt() which loads the whole table and sets up
+	 * the per-cpu segment descriptor register %fs as well. */
+	switch_to_new_gdt(0);
+
 	/* As described in head_32.S, we map the first 128M of memory. */
 	max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
 
-	/* Load the %fs segment register (the per-cpu segment register) with
-	 * the normal data segment to get through booting. */
-	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
-
 	/* The Host<->Guest Switcher lives at the top of our address space, and
 	 * the Host told us how big it is when we made LGUEST_INIT hypercall:
 	 * it put the answer in lguest_data.reserve_mem */
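The i386_head.S diff that follows replaces the patched sti and popf templates with real assembly routines that also watch lguest_data.irq_pending. Going by the comments in that hunk, their intent maps onto this C sketch; lg_irq_enable_sketch, lg_restore_fl_sketch and hcall_send_interrupts are made-up stand-ins for illustration, not kernel symbols:

#include <stdio.h>

#define X86_EFLAGS_IF	0x200UL

/* Toy stand-ins for the lguest_data fields the assembly touches. */
static struct {
	unsigned long irq_enabled;
	unsigned long irq_pending;
} lguest_data;

/* Stand-in for the LHCALL_SEND_INTERRUPTS vmcall the real routines issue. */
static void hcall_send_interrupts(void)
{
	printf("LHCALL_SEND_INTERRUPTS\n");
	lguest_data.irq_pending = 0;
}

/* lg_irq_enable: turn interrupts on, then poke the Host only if it flagged
 * pending interrupts while they were off. */
static void lg_irq_enable_sketch(void)
{
	lguest_data.irq_enabled = X86_EFLAGS_IF;
	if (lguest_data.irq_pending)
		hcall_send_interrupts();
}

/* lg_restore_fl: flags is either X86_EFLAGS_IF or 0; the testl in the assembly
 * ANDs flags with irq_pending, so the hypercall only happens when both are set. */
static void lg_restore_fl_sketch(unsigned long flags)
{
	lguest_data.irq_enabled = flags;
	if (flags & lguest_data.irq_pending)
		hcall_send_interrupts();
}

int main(void)
{
	lguest_data.irq_pending = X86_EFLAGS_IF;	/* pretend the Host queued one */
	lg_restore_fl_sketch(0);			/* interrupts stay off: no hypercall */
	lg_irq_enable_sketch();				/* now the hypercall fires */
	return 0;
}

The point of the detour through assembly (rather than the C wrappers plus PV_CALLEE_SAVE_REGS_THUNK) is that the common path clobbers no registers at all, which matters because these two operations run constantly.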
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index f79541989471..a9c8cfe61cd4 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -46,10 +46,64 @@ ENTRY(lguest_entry)
 .globl lgstart_##name; .globl lgend_##name
 
 LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
 LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-/*:*/
+
+/*G:033 But using those wrappers is inefficient (we'll see why that doesn't
+ * matter for save_fl and irq_disable later). If we write our routines
+ * carefully in assembler, we can avoid clobbering any registers and avoid
+ * jumping through the wrapper functions.
+ *
+ * I skipped over our first piece of assembler, but this one is worth studying
+ * in a bit more detail so I'll describe in easy stages. First, the routine
+ * to enable interrupts: */
+ENTRY(lg_irq_enable)
+	/* The reverse of irq_disable, this sets lguest_data.irq_enabled to
+	 * X86_EFLAGS_IF (ie. "Interrupts enabled"). */
+	movl	$X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
+	/* But now we need to check if the Host wants to know: there might have
+	 * been interrupts waiting to be delivered, in which case it will have
+	 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
+	 * jump to send_interrupts, otherwise we're done. */
+	testl	$0, lguest_data+LGUEST_DATA_irq_pending
+	jnz	send_interrupts
+	/* One cool thing about x86 is that you can do many things without using
+	 * a register. In this case, the normal path hasn't needed to save or
+	 * restore any registers at all! */
+	ret
+send_interrupts:
+	/* OK, now we need a register: eax is used for the hypercall number,
+	 * which is LHCALL_SEND_INTERRUPTS.
+	 *
+	 * We used not to bother with this pending detection at all, which was
+	 * much simpler. Sooner or later the Host would realize it had to
+	 * send us an interrupt. But that turns out to make performance 7
+	 * times worse on a simple tcp benchmark. So now we do this the hard
+	 * way. */
+	pushl	%eax
+	movl	$LHCALL_SEND_INTERRUPTS, %eax
+	/* This is a vmcall instruction (same thing that KVM uses). Older
+	 * assembler versions might not know the "vmcall" instruction, so we
+	 * create one manually here. */
+	.byte	0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+	popl	%eax
+	ret
+
+/* Finally, the "popf" or "restore flags" routine. The %eax register holds the
+ * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
+ * enabling interrupts again, if it's 0 we're leaving them off. */
+ENTRY(lg_restore_fl)
+	/* This is just "lguest_data.irq_enabled = flags;" */
+	movl	%eax, lguest_data+LGUEST_DATA_irq_enabled
+	/* Now, if the %eax value has enabled interrupts and
+	 * lguest_data.irq_pending is set, we want to tell the Host so it can
+	 * deliver any outstanding interrupts. Fortunately, both values will
+	 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
+	 * instruction will AND them together for us. If both are set, we
+	 * jump to send_interrupts. */
+	testl	lguest_data+LGUEST_DATA_irq_pending, %eax
+	jnz	send_interrupts
+	/* Again, the normal path has used no extra registers. Clever, huh? */
+	ret
 
 /* These demark the EIP range where host should never deliver interrupts. */
 .global lguest_noirq_start