Diffstat (limited to 'arch/x86')

-rw-r--r--  arch/x86/include/asm/lguest.h        |   7
-rw-r--r--  arch/x86/include/asm/lguest_hcall.h  |  15
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c     |   1
-rw-r--r--  arch/x86/lguest/Kconfig              |   1
-rw-r--r--  arch/x86/lguest/boot.c               | 158
-rw-r--r--  arch/x86/lguest/i386_head.S          |  60

6 files changed, 187 insertions(+), 55 deletions(-)
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 1caf57628b9c..313389cd50d2 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,13 @@
 /* Pages for switcher itself, then two pages per cpu */
 #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
 
-/* We map at -4M for ease of mapping into the guest (one PTE page). */
+/* We map at -4M (-2M when PAE is activated) for ease of mapping
+ * into the guest (one PTE page). */
+#ifdef CONFIG_X86_PAE
+#define SWITCHER_ADDR 0xFFE00000
+#else
 #define SWITCHER_ADDR 0xFFC00000
+#endif
 
 /* Found in switcher.S */
 extern unsigned long default_idt_entries[];
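The two switcher addresses fall straight out of the page-table geometry: one page of PTEs covers 1024 x 4 KB = 4 MB without PAE, but only 512 x 4 KB = 2 MB with PAE's 8-byte entries, so the mapping moves from 4 MB below the top of the address space to 2 MB below it. A standalone sketch of that arithmetic (illustrative only, not part of the patch):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Virtual address range mapped by a single page of PTEs: */
            uint64_t span_nonpae = 1024ULL * 4096;  /* 1024 4-byte PTEs * 4 KB */
            uint64_t span_pae    =  512ULL * 4096;  /*  512 8-byte PTEs * 4 KB */

            /* The switcher sits one PTE page below the 4 GB boundary. */
            assert(0x100000000ULL - span_nonpae == 0xFFC00000);  /* -4 MB */
            assert(0x100000000ULL - span_pae    == 0xFFE00000);  /* -2 MB */
            printf("non-PAE SWITCHER_ADDR = 0xFFC00000, PAE = 0xFFE00000\n");
            return 0;
    }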
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index faae1996487b..d31c4a684078 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -12,11 +12,13 @@
 #define LHCALL_TS 8
 #define LHCALL_SET_CLOCKEVENT 9
 #define LHCALL_HALT 10
+#define LHCALL_SET_PMD 13
 #define LHCALL_SET_PTE 14
-#define LHCALL_SET_PMD 15
+#define LHCALL_SET_PGD 15
 #define LHCALL_LOAD_TLS 16
 #define LHCALL_NOTIFY 17
 #define LHCALL_LOAD_GDT_ENTRY 18
+#define LHCALL_SEND_INTERRUPTS 19
 
 #define LGUEST_TRAP_ENTRY 0x1F
 
@@ -32,10 +34,10 @@
  * operations? There are two ways: the direct way is to make a "hypercall",
  * to make requests of the Host Itself.
  *
- * We use the KVM hypercall mechanism. Eighteen hypercalls are
+ * We use the KVM hypercall mechanism. Seventeen hypercalls are
  * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %ebx, %ecx and %edx. If a return
- * value makes sense, it's returned in %eax.
+ * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
+ * If a return value makes sense, it's returned in %eax.
  *
  * Grossly invalid calls result in Sudden Death at the hands of the vengeful
  * Host, rather than returning failure. This reflects Winston Churchill's
@@ -47,8 +49,9 @@
 
 #define LHCALL_RING_SIZE 64
 struct hcall_args {
-        /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
-        unsigned long arg0, arg1, arg2, arg3;
+        /* These map directly onto eax, ebx, ecx, edx and esi
+         * in struct lguest_regs */
+        unsigned long arg0, arg1, arg2, arg3, arg4;
 };
 
 #endif /* !__ASSEMBLY__ */
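The calling convention described in the comment above can be pictured with a small sketch. It mirrors what the kvm_hypercall4() helper in asm/kvm_para.h already does (number in %eax, arguments in %ebx, %ecx, %edx and %esi, then the vmcall/KVM_HYPERCALL instruction); this is only an illustration of the ABI, not a replacement for the real helper:

    /* Hypothetical stand-in for kvm_hypercall4(): shows where each value goes. */
    static inline unsigned long example_hypercall4(unsigned long nr,
                                                   unsigned long p1,
                                                   unsigned long p2,
                                                   unsigned long p3,
                                                   unsigned long p4)
    {
            unsigned long ret;

            /* ".byte 0x0f,0x01,0xc1" is the vmcall (KVM_HYPERCALL) instruction. */
            asm volatile(".byte 0x0f,0x01,0xc1"
                         : "=a"(ret)
                         : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
                         : "memory");
            return ret;
    }

Under PAE a PTE no longer fits in a single register, which is why the fifth argument (%esi / arg4) exists: a PTE update becomes the call number plus pgd, address, pte_low and pte_high.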
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 1a830cbd7015..dfdbf6403895 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -126,6 +126,7 @@ void foo(void)
 #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
         BLANK();
         OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+        OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
         OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
 
         BLANK();
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 8dab8f7844d3..38718041efc3 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,7 +2,6 @@ config LGUEST_GUEST
         bool "Lguest guest support"
         select PARAVIRT
         depends on X86_32
-        depends on !X86_PAE
         select VIRTIO
         select VIRTIO_RING
         select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 4e0c26559395..7bc65f0f62c4 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -87,7 +87,7 @@ struct lguest_data lguest_data = {
 
 /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a
  * ring buffer of stored hypercalls which the Host will run though next time we
- * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall
+ * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
  * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
  * and 255 once the Host has finished with it.
  *
@@ -96,7 +96,8 @@ struct lguest_data lguest_data = {
  * effect of causing the Host to run all the stored calls in the ring buffer
  * which empties it for next time! */
 static void async_hcall(unsigned long call, unsigned long arg1,
-                        unsigned long arg2, unsigned long arg3)
+                        unsigned long arg2, unsigned long arg3,
+                        unsigned long arg4)
 {
         /* Note: This code assumes we're uniprocessor. */
         static unsigned int next_call;
@@ -108,12 +109,13 @@ static void async_hcall(unsigned long call, unsigned long arg1,
         local_irq_save(flags);
         if (lguest_data.hcall_status[next_call] != 0xFF) {
                 /* Table full, so do normal hcall which will flush table. */
-                kvm_hypercall3(call, arg1, arg2, arg3);
+                kvm_hypercall4(call, arg1, arg2, arg3, arg4);
         } else {
                 lguest_data.hcalls[next_call].arg0 = call;
                 lguest_data.hcalls[next_call].arg1 = arg1;
                 lguest_data.hcalls[next_call].arg2 = arg2;
                 lguest_data.hcalls[next_call].arg3 = arg3;
+                lguest_data.hcalls[next_call].arg4 = arg4;
                 /* Arguments must all be written before we mark it to go */
                 wmb();
                 lguest_data.hcall_status[next_call] = 0;
@@ -141,7 +143,7 @@ static void lazy_hcall1(unsigned long call,
         if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
                 kvm_hypercall1(call, arg1);
         else
-                async_hcall(call, arg1, 0, 0);
+                async_hcall(call, arg1, 0, 0, 0);
 }
 
 static void lazy_hcall2(unsigned long call,
@@ -151,7 +153,7 @@ static void lazy_hcall2(unsigned long call,
         if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
                 kvm_hypercall2(call, arg1, arg2);
         else
-                async_hcall(call, arg1, arg2, 0);
+                async_hcall(call, arg1, arg2, 0, 0);
 }
 
 static void lazy_hcall3(unsigned long call,
@@ -162,9 +164,23 @@ static void lazy_hcall3(unsigned long call,
         if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
                 kvm_hypercall3(call, arg1, arg2, arg3);
         else
-                async_hcall(call, arg1, arg2, arg3);
+                async_hcall(call, arg1, arg2, arg3, 0);
 }
 
+#ifdef CONFIG_X86_PAE
+static void lazy_hcall4(unsigned long call,
+                        unsigned long arg1,
+                        unsigned long arg2,
+                        unsigned long arg3,
+                        unsigned long arg4)
+{
+        if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+                kvm_hypercall4(call, arg1, arg2, arg3, arg4);
+        else
+                async_hcall(call, arg1, arg2, arg3, arg4);
+}
+#endif
+
 /* When lazy mode is turned off reset the per-cpu lazy mode variable and then
  * issue the do-nothing hypercall to flush any stored calls. */
 static void lguest_leave_lazy_mmu_mode(void)
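To see why the async ring and the new lazy_hcall4() matter, recall that callers bracket bulk page-table updates with the paravirt lazy-MMU hooks; inside that window the helpers above queue into lguest_data.hcalls[] instead of trapping for every entry, and leaving the window flushes the ring with one real hypercall. A kernel-style sketch of such a caller (the function name and loop are made up for illustration):

    static void example_remap_range(struct mm_struct *mm, unsigned long addr,
                                    pte_t *ptep, const pte_t *vals, int n)
    {
            int i;

            arch_enter_lazy_mmu_mode();     /* lazy mode: hcalls are queued   */
            for (i = 0; i < n; i++)
                    set_pte_at(mm, addr + i * PAGE_SIZE, ptep + i, vals[i]);
            arch_leave_lazy_mmu_mode();     /* one hypercall flushes the ring */
    }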
@@ -179,7 +195,7 @@ static void lguest_end_context_switch(struct task_struct *next)
         paravirt_end_context_switch(next);
 }
 
-/*G:033
+/*G:032
  * After that diversion we return to our first native-instruction
  * replacements: four functions for interrupt control.
  *
@@ -199,30 +215,28 @@
 {
         return lguest_data.irq_enabled;
 }
-PV_CALLEE_SAVE_REGS_THUNK(save_fl);
-
-/* restore_flags() just sets the flags back to the value given. */
-static void restore_fl(unsigned long flags)
-{
-        lguest_data.irq_enabled = flags;
-}
-PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
 
 /* Interrupts go off... */
 static void irq_disable(void)
 {
         lguest_data.irq_enabled = 0;
 }
+
+/* Let's pause a moment. Remember how I said these are called so often?
+ * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
+ * break some rules. In particular, these functions are assumed to save their
+ * own registers if they need to: normal C functions assume they can trash the
+ * eax register. To use normal C functions, we use
+ * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
+ * C function, then restores it. */
+PV_CALLEE_SAVE_REGS_THUNK(save_fl);
 PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
+/*:*/
 
-/* Interrupts go on... */
-static void irq_enable(void)
-{
-        lguest_data.irq_enabled = X86_EFLAGS_IF;
-}
-PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
+/* These are in i386_head.S */
+extern void lg_irq_enable(void);
+extern void lg_restore_fl(unsigned long flags);
 
-/*:*/
 /*M:003 Note that we don't check for outstanding interrupts when we re-enable
  * them (or when we unmask an interrupt). This seems to work for the moment,
  * since interrupts are rare and we'll just get the interrupt on the next timer
@@ -368,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
         case 1: /* Basic feature request. */
                 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
                 *cx &= 0x00002201;
-                /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */
-                *dx &= 0x07808111;
+                /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
+                *dx &= 0x07808151;
                 /* The Host can do a nice optimization if it knows that the
                  * kernel mappings (addresses above 0xC0000000 or whatever
                  * PAGE_OFFSET is set to) haven't changed. But Linux calls
@@ -388,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
                 if (*ax > 0x80000008)
                         *ax = 0x80000008;
                 break;
+        case 0x80000001:
+                /* Here we should fix nx cap depending on host. */
+                /* For this version of PAE, we just clear NX bit. */
+                *dx &= ~(1 << 20);
+                break;
         }
 }
 
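The two masks above differ in exactly one bit: 0x07808151 ^ 0x07808111 == 0x40, i.e. CPUID leaf 1 EDX bit 6, the PAE feature flag; the new 0x80000001 case additionally hides EDX bit 20 (NX) from the Guest. A standalone check of that arithmetic (illustrative only):

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int old_mask = 0x07808111, new_mask = 0x07808151;
            unsigned int edx = 0xffffffffu & ~(1u << 20);

            /* The only newly-exposed CPUID.1:EDX bit is bit 6: PAE. */
            assert((new_mask ^ old_mask) == (1u << 6));

            /* Leaf 0x80000001 masks EDX bit 20, the NX capability. */
            assert(!(edx & (1u << 20)));

            printf("PAE exposed, NX hidden\n");
            return 0;
    }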
@@ -521,25 +540,52 @@ static void lguest_write_cr4(unsigned long val)
 static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
                               pte_t *ptep)
 {
+#ifdef CONFIG_X86_PAE
+        lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
+                    ptep->pte_low, ptep->pte_high);
+#else
         lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
+#endif
 }
 
 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
                               pte_t *ptep, pte_t pteval)
 {
-        *ptep = pteval;
+        native_set_pte(ptep, pteval);
         lguest_pte_update(mm, addr, ptep);
 }
 
-/* The Guest calls this to set a top-level entry. Again, we set the entry then
- * tell the Host which top-level page we changed, and the index of the entry we
- * changed. */
+/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
+ * to set a middle-level entry when PAE is activated.
+ * Again, we set the entry then tell the Host which page we changed,
+ * and the index of the entry we changed. */
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pud(pud_t *pudp, pud_t pudval)
+{
+        native_set_pud(pudp, pudval);
+
+        /* 32 bytes aligned pdpt address and the index. */
+        lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
+                    (__pa(pudp) & 0x1F) / sizeof(pud_t));
+}
+
 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
-        *pmdp = pmdval;
+        native_set_pmd(pmdp, pmdval);
         lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
-                    (__pa(pmdp) & (PAGE_SIZE - 1)) / 4);
+                    (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
 }
+#else
+
+/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not
+ * activated. */
+static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+        native_set_pmd(pmdp, pmdval);
+        lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
+                    (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
+}
+#endif
 
 /* There are a couple of legacy places where the kernel sets a PTE, but we
  * don't know the top level any more. This is useless for us, since we don't
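The constants in lguest_set_pud() follow from the PAE layout: the top level is a four-entry page-directory-pointer table of 8-byte entries aligned to 32 bytes, so masking with 0xFFFFFFE0 recovers the table's base and the low five bits divided by sizeof(pud_t) give the entry index (0-3). A standalone check with a made-up physical address (illustrative only):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t pa_of_pud = 0x01234568;            /* hypothetical __pa(pudp) */
            uint32_t base  = pa_of_pud & 0xFFFFFFE0;    /* 32-byte aligned pdpt    */
            uint32_t index = (pa_of_pud & 0x1F) / 8;    /* sizeof(pud_t) == 8      */

            assert(base == 0x01234560 && index == 1);
            printf("pdpt base %#x, entry %u of 4\n", base, index);
            return 0;
    }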
@@ -552,11 +598,31 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
  * which brings boot back to 0.25 seconds. */
 static void lguest_set_pte(pte_t *ptep, pte_t pteval)
 {
-        *ptep = pteval;
+        native_set_pte(ptep, pteval);
+        if (cr3_changed)
+                lazy_hcall1(LHCALL_FLUSH_TLB, 1);
+}
+
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+        native_set_pte_atomic(ptep, pte);
         if (cr3_changed)
                 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
 }
 
+void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+        native_pte_clear(mm, addr, ptep);
+        lguest_pte_update(mm, addr, ptep);
+}
+
+void lguest_pmd_clear(pmd_t *pmdp)
+{
+        lguest_set_pmd(pmdp, __pmd(0));
+}
+#endif
+
 /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
  * native page table operations. On native hardware you can set a new page
  * table entry whenever you want, but if you want to remove one you have to do
@@ -628,13 +694,12 @@ static void __init lguest_init_IRQ(void)
 {
         unsigned int i;
 
-        for (i = 0; i < LGUEST_IRQS; i++) {
-                int vector = FIRST_EXTERNAL_VECTOR + i;
+        for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
                 /* Some systems map "vectors" to interrupts weirdly. Lguest has
                  * a straightforward 1 to 1 mapping, so force that here. */
-                __get_cpu_var(vector_irq)[vector] = i;
-                if (vector != SYSCALL_VECTOR)
-                        set_intr_gate(vector, interrupt[i]);
+                __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
+                if (i != SYSCALL_VECTOR)
+                        set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
         }
         /* This call is required to set up for 4k stacks, where we have
          * separate stacks for hard and soft interrupts. */
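With the usual 32-bit values (FIRST_EXTERNAL_VECTOR 0x20, SYSCALL_VECTOR 0x80, NR_VECTORS 256), the rewritten loop walks vectors rather than irq numbers: vector 0x20 backs irq 0, 0x21 backs irq 1, and so on, while 0x80 is left alone for int 0x80 system calls. A tiny standalone illustration of that mapping (values assumed, not taken from the patch):

    #include <stdio.h>

    #define FIRST_EXTERNAL_VECTOR 0x20
    #define SYSCALL_VECTOR        0x80
    #define NR_VECTORS            256

    int main(void)
    {
            unsigned int vector;

            for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
                    unsigned int irq = vector - FIRST_EXTERNAL_VECTOR;

                    if (vector == SYSCALL_VECTOR)
                            printf("vector 0x%02x: reserved for system calls\n", vector);
                    else if (irq < 3)   /* show only the first few mappings */
                            printf("vector 0x%02x -> irq %u\n", vector, irq);
            }
            return 0;
    }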
@@ -973,10 +1038,10 @@ static void lguest_restart(char *reason)
  *
  * Our current solution is to allow the paravirt back end to optionally patch
  * over the indirect calls to replace them with something more efficient. We
- * patch the four most commonly called functions: disable interrupts, enable
- * interrupts, restore interrupts and save interrupts. We usually have 6 or 10
- * bytes to patch into: the Guest versions of these operations are small enough
- * that we can fit comfortably.
+ * patch two of the simplest of the most commonly called functions: disable
+ * interrupts and save interrupts. We usually have 6 or 10 bytes to patch
+ * into: the Guest versions of these operations are small enough that we can
+ * fit comfortably.
  *
  * First we need assembly templates of each of the patchable Guest operations,
  * and these are in i386_head.S. */
@@ -987,8 +1052,6 @@ static const struct lguest_insns
         const char *start, *end;
 } lguest_insns[] = {
         [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
-        [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
-        [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
         [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
 };
 
@@ -1026,6 +1089,7 @@ __init void lguest_init(void)
         pv_info.name = "lguest";
         pv_info.paravirt_enabled = 1;
         pv_info.kernel_rpl = 1;
+        pv_info.shared_kernel_pmd = 1;
 
         /* We set up all the lguest overrides for sensitive operations. These
          * are detailed with the operations themselves. */
@@ -1033,9 +1097,9 @@ __init void lguest_init(void)
         /* interrupt-related operations */
         pv_irq_ops.init_IRQ = lguest_init_IRQ;
         pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
-        pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+        pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
         pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
-        pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
+        pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
         pv_irq_ops.safe_halt = lguest_safe_halt;
 
         /* init-time operations */
@@ -1071,6 +1135,12 @@ __init void lguest_init(void)
         pv_mmu_ops.set_pte = lguest_set_pte;
         pv_mmu_ops.set_pte_at = lguest_set_pte_at;
         pv_mmu_ops.set_pmd = lguest_set_pmd;
+#ifdef CONFIG_X86_PAE
+        pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
+        pv_mmu_ops.pte_clear = lguest_pte_clear;
+        pv_mmu_ops.pmd_clear = lguest_pmd_clear;
+        pv_mmu_ops.set_pud = lguest_set_pud;
+#endif
         pv_mmu_ops.read_cr2 = lguest_read_cr2;
         pv_mmu_ops.read_cr3 = lguest_read_cr3;
         pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index f79541989471..a9c8cfe61cd4 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -46,10 +46,64 @@ ENTRY(lguest_entry)
         .globl lgstart_##name; .globl lgend_##name
 
 LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
 LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-/*:*/
+
+/*G:033 But using those wrappers is inefficient (we'll see why that doesn't
+ * matter for save_fl and irq_disable later). If we write our routines
+ * carefully in assembler, we can avoid clobbering any registers and avoid
+ * jumping through the wrapper functions.
+ *
+ * I skipped over our first piece of assembler, but this one is worth studying
+ * in a bit more detail so I'll describe in easy stages. First, the routine
+ * to enable interrupts: */
+ENTRY(lg_irq_enable)
+        /* The reverse of irq_disable, this sets lguest_data.irq_enabled to
+         * X86_EFLAGS_IF (ie. "Interrupts enabled"). */
+        movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
+        /* But now we need to check if the Host wants to know: there might have
+         * been interrupts waiting to be delivered, in which case it will have
+         * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
+         * jump to send_interrupts, otherwise we're done. */
+        testl $0, lguest_data+LGUEST_DATA_irq_pending
+        jnz send_interrupts
+        /* One cool thing about x86 is that you can do many things without using
+         * a register. In this case, the normal path hasn't needed to save or
+         * restore any registers at all! */
+        ret
+send_interrupts:
+        /* OK, now we need a register: eax is used for the hypercall number,
+         * which is LHCALL_SEND_INTERRUPTS.
+         *
+         * We used not to bother with this pending detection at all, which was
+         * much simpler. Sooner or later the Host would realize it had to
+         * send us an interrupt. But that turns out to make performance 7
+         * times worse on a simple tcp benchmark. So now we do this the hard
+         * way. */
+        pushl %eax
+        movl $LHCALL_SEND_INTERRUPTS, %eax
+        /* This is a vmcall instruction (same thing that KVM uses). Older
+         * assembler versions might not know the "vmcall" instruction, so we
+         * create one manually here. */
+        .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+        popl %eax
+        ret
+
+/* Finally, the "popf" or "restore flags" routine. The %eax register holds the
+ * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
+ * enabling interrupts again, if it's 0 we're leaving them off. */
+ENTRY(lg_restore_fl)
+        /* This is just "lguest_data.irq_enabled = flags;" */
+        movl %eax, lguest_data+LGUEST_DATA_irq_enabled
+        /* Now, if the %eax value has enabled interrupts and
+         * lguest_data.irq_pending is set, we want to tell the Host so it can
+         * deliver any outstanding interrupts. Fortunately, both values will
+         * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
+         * instruction will AND them together for us. If both are set, we
+         * jump to send_interrupts. */
+        testl lguest_data+LGUEST_DATA_irq_pending, %eax
+        jnz send_interrupts
+        /* Again, the normal path has used no extra registers. Clever, huh? */
+        ret
 
 /* These demark the EIP range where host should never deliver interrupts. */
 .global lguest_noirq_start
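Read as C, the two assembler routines amount to the following (a sketch of what their comments say they set out to do; the real versions are written in asm precisely so the common path clobbers no registers, and kvm_hypercall0() stands in here for the open-coded vmcall):

    void lg_irq_enable_in_c(void)                        /* illustrative equivalent */
    {
            lguest_data.irq_enabled = X86_EFLAGS_IF;
            if (lguest_data.irq_pending)
                    kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
    }

    void lg_restore_fl_in_c(unsigned long flags)         /* illustrative equivalent */
    {
            lguest_data.irq_enabled = flags;
            if (flags & lguest_data.irq_pending)
                    kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
    }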
