diff options
author | Rusty Russell <rusty@rustcorp.com.au> | 2009-07-30 18:03:45 -0400 |
---|---|---|
committer | Rusty Russell <rusty@rustcorp.com.au> | 2009-07-30 02:33:46 -0400 |
commit | a91d74a3c4de8115295ee87350c13a329164aaaf (patch) | |
tree | 02c862fccc9abedf7fc354061e69c4b5fbcce06d /arch/x86/lguest | |
parent | 2e04ef76916d1e29a077ea9d0f2003c8fd86724d (diff) |
lguest: update commentary
Every so often, after code shuffles, I need to go through and unbitrot
the Lguest Journey (see drivers/lguest/README). Since we now use RCU in
a simple form in one place I took the opportunity to expand that explanation.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Diffstat (limited to 'arch/x86/lguest')
-rw-r--r-- | arch/x86/lguest/boot.c | 99 | ||||
-rw-r--r-- | arch/x86/lguest/i386_head.S | 2 |
2 files changed, 79 insertions, 22 deletions
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 025c04d18f2b..d677fa9ca650 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
@@ -154,6 +154,7 @@ static void lazy_hcall1(unsigned long call, | |||
154 | async_hcall(call, arg1, 0, 0, 0); | 154 | async_hcall(call, arg1, 0, 0, 0); |
155 | } | 155 | } |
156 | 156 | ||
157 | /* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ | ||
157 | static void lazy_hcall2(unsigned long call, | 158 | static void lazy_hcall2(unsigned long call, |
158 | unsigned long arg1, | 159 | unsigned long arg1, |
159 | unsigned long arg2) | 160 | unsigned long arg2) |
@@ -189,8 +190,10 @@ static void lazy_hcall4(unsigned long call, | |||
189 | } | 190 | } |
190 | #endif | 191 | #endif |
191 | 192 | ||
192 | /* When lazy mode is turned off reset the per-cpu lazy mode variable and then | 193 | /*G:036 |
193 | * issue the do-nothing hypercall to flush any stored calls. */ | 194 | * When lazy mode is turned off reset the per-cpu lazy mode variable and then |
195 | * issue the do-nothing hypercall to flush any stored calls. | ||
196 | :*/ | ||
194 | static void lguest_leave_lazy_mmu_mode(void) | 197 | static void lguest_leave_lazy_mmu_mode(void) |
195 | { | 198 | { |
196 | kvm_hypercall0(LHCALL_FLUSH_ASYNC); | 199 | kvm_hypercall0(LHCALL_FLUSH_ASYNC); |
@@ -250,13 +253,11 @@ extern void lg_irq_enable(void); | |||
250 | extern void lg_restore_fl(unsigned long flags); | 253 | extern void lg_restore_fl(unsigned long flags); |
251 | 254 | ||
252 | /*M:003 | 255 | /*M:003 |
253 | * Note that we don't check for outstanding interrupts when we re-enable them | 256 | * We could be more efficient in our checking of outstanding interrupts, rather |
254 | * (or when we unmask an interrupt). This seems to work for the moment, since | 257 | * than using a branch. One way would be to put the "irq_enabled" field in a |
255 | * interrupts are rare and we'll just get the interrupt on the next timer tick, | 258 | * page by itself, and have the Host write-protect it when an interrupt comes |
256 | * but now we can run with CONFIG_NO_HZ, we should revisit this. One way would | 259 | * in when irqs are disabled. There will then be a page fault as soon as |
257 | * be to put the "irq_enabled" field in a page by itself, and have the Host | 260 | * interrupts are re-enabled. |
258 | * write-protect it when an interrupt comes in when irqs are disabled. There | ||
259 | * will then be a page fault as soon as interrupts are re-enabled. | ||
260 | * | 261 | * |
261 | * A better method is to implement soft interrupt disable generally for x86: | 262 | * A better method is to implement soft interrupt disable generally for x86: |
262 | * instead of disabling interrupts, we set a flag. If an interrupt does come | 263 | * instead of disabling interrupts, we set a flag. If an interrupt does come |
@@ -568,7 +569,7 @@ static void lguest_write_cr4(unsigned long val) | |||
568 | * cr3 ---> +---------+ | 569 | * cr3 ---> +---------+ |
569 | * | --------->+---------+ | 570 | * | --------->+---------+ |
570 | * | | | PADDR1 | | 571 | * | | | PADDR1 | |
571 | * Top-level | | PADDR2 | | 572 | * Mid-level | | PADDR2 | |
572 | * (PMD) page | | | | 573 | * (PMD) page | | | |
573 | * | | Lower-level | | 574 | * | | Lower-level | |
574 | * | | (PTE) page | | 575 | * | | (PTE) page | |
@@ -588,23 +589,62 @@ static void lguest_write_cr4(unsigned long val) | |||
588 | * Index into top Index into second Offset within page | 589 | * Index into top Index into second Offset within page |
589 | * page directory page pagetable page | 590 | * page directory page pagetable page |
590 | * | 591 | * |
591 | * The kernel spends a lot of time changing both the top-level page directory | 592 | * Now, unfortunately, this isn't the whole story: Intel added Physical Address |
592 | * and lower-level pagetable pages. The Guest doesn't know physical addresses, | 593 | * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). |
593 | * so while it maintains these page tables exactly like normal, it also needs | 594 | * These are held in 64-bit page table entries, so we can now only fit 512 |
594 | * to keep the Host informed whenever it makes a change: the Host will create | 595 | * entries in a page, and the neat three-level tree breaks down. |
595 | * the real page tables based on the Guests'. | 596 | * |
597 | * The result is a four level page table: | ||
598 | * | ||
599 | * cr3 --> [ 4 Upper ] | ||
600 | * [ Level ] | ||
601 | * [ Entries ] | ||
602 | * [(PUD Page)]---> +---------+ | ||
603 | * | --------->+---------+ | ||
604 | * | | | PADDR1 | | ||
605 | * Mid-level | | PADDR2 | | ||
606 | * (PMD) page | | | | ||
607 | * | | Lower-level | | ||
608 | * | | (PTE) page | | ||
609 | * | | | | | ||
610 | * .... .... | ||
611 | * | ||
612 | * | ||
613 | * And the virtual address is decoded as: | ||
614 | * | ||
615 | * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | ||
616 | * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| | ||
617 | * Index into Index into mid Index into lower Offset within page | ||
618 | * top entries directory page pagetable page | ||
619 | * | ||
620 | * It's too hard to switch between these two formats at runtime, so Linux only | ||
621 | * supports one or the other depending on whether CONFIG_X86_PAE is set. Many | ||
622 | * distributions turn it on, and not just for people with silly amounts of | ||
623 | * memory: the larger PTE entries allow room for the NX bit, which lets the | ||
624 | * kernel disable execution of pages and increase security. | ||
625 | * | ||
626 | * This was a problem for lguest, which couldn't run on these distributions; | ||
627 | * then Matias Zabaljauregui figured it all out and implemented it, and only a | ||
628 | * handful of puppies were crushed in the process! | ||
629 | * | ||
630 | * Back to our point: the kernel spends a lot of time changing both the | ||
631 | * top-level page directory and lower-level pagetable pages. The Guest doesn't | ||
632 | * know physical addresses, so while it maintains these page tables exactly | ||
633 | * like normal, it also needs to keep the Host informed whenever it makes a | ||
634 | * change: the Host will create the real page tables based on the Guests'. | ||
596 | */ | 635 | */ |
597 | 636 | ||
598 | /* | 637 | /* |
599 | * The Guest calls this to set a second-level entry (pte), ie. to map a page | 638 | * The Guest calls this after it has set a second-level entry (pte), ie. to map |
600 | * into a process' address space. We set the entry then tell the Host the | 639 | * a page into a process' address space. We tell the Host the toplevel and |
601 | * toplevel and address this corresponds to. The Guest uses one pagetable per | 640 | * address this corresponds to. The Guest uses one pagetable per process, so |
602 | * process, so we need to tell the Host which one we're changing (mm->pgd). | 641 | * we need to tell the Host which one we're changing (mm->pgd). |
603 | */ | 642 | */ |
604 | static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, | 643 | static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, |
605 | pte_t *ptep) | 644 | pte_t *ptep) |
606 | { | 645 | { |
607 | #ifdef CONFIG_X86_PAE | 646 | #ifdef CONFIG_X86_PAE |
647 | /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ | ||
608 | lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, | 648 | lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, |
609 | ptep->pte_low, ptep->pte_high); | 649 | ptep->pte_low, ptep->pte_high); |
610 | #else | 650 | #else |
@@ -612,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, | |||
612 | #endif | 652 | #endif |
613 | } | 653 | } |
614 | 654 | ||
655 | /* This is the "set and update" combo-meal-deal version. */ | ||
615 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, | 656 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, |
616 | pte_t *ptep, pte_t pteval) | 657 | pte_t *ptep, pte_t pteval) |
617 | { | 658 | { |
@@ -672,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval) | |||
672 | } | 713 | } |
673 | 714 | ||
674 | #ifdef CONFIG_X86_PAE | 715 | #ifdef CONFIG_X86_PAE |
716 | /* | ||
717 | * With 64-bit PTE values, we need to be careful setting them: if we set 32 | ||
718 | * bits at a time, the hardware could see a weird half-set entry. These | ||
719 | * versions ensure we update all 64 bits at once. | ||
720 | */ | ||
675 | static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) | 721 | static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) |
676 | { | 722 | { |
677 | native_set_pte_atomic(ptep, pte); | 723 | native_set_pte_atomic(ptep, pte); |
@@ -679,13 +725,14 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) | |||
679 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); | 725 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); |
680 | } | 726 | } |
681 | 727 | ||
682 | void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | 728 | static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, |
729 | pte_t *ptep) | ||
683 | { | 730 | { |
684 | native_pte_clear(mm, addr, ptep); | 731 | native_pte_clear(mm, addr, ptep); |
685 | lguest_pte_update(mm, addr, ptep); | 732 | lguest_pte_update(mm, addr, ptep); |
686 | } | 733 | } |
687 | 734 | ||
688 | void lguest_pmd_clear(pmd_t *pmdp) | 735 | static void lguest_pmd_clear(pmd_t *pmdp) |
689 | { | 736 | { |
690 | lguest_set_pmd(pmdp, __pmd(0)); | 737 | lguest_set_pmd(pmdp, __pmd(0)); |
691 | } | 738 | } |
@@ -784,6 +831,14 @@ static void __init lguest_init_IRQ(void) | |||
784 | irq_ctx_init(smp_processor_id()); | 831 | irq_ctx_init(smp_processor_id()); |
785 | } | 832 | } |
786 | 833 | ||
834 | /* | ||
835 | * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so | ||
836 | * rather than set them in lguest_init_IRQ we are called here every time an | ||
837 | * lguest device needs an interrupt. | ||
838 | * | ||
839 | * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should | ||
840 | * pass that up! | ||
841 | */ | ||
787 | void lguest_setup_irq(unsigned int irq) | 842 | void lguest_setup_irq(unsigned int irq) |
788 | { | 843 | { |
789 | irq_to_desc_alloc_node(irq, 0); | 844 | irq_to_desc_alloc_node(irq, 0); |
@@ -1298,7 +1353,7 @@ __init void lguest_init(void) | |||
1298 | */ | 1353 | */ |
1299 | switch_to_new_gdt(0); | 1354 | switch_to_new_gdt(0); |
1300 | 1355 | ||
1301 | /* As described in head_32.S, we map the first 128M of memory. */ | 1356 | /* We actually boot with all memory mapped, but let's say 128MB. */ |
1302 | max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; | 1357 | max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; |
1303 | 1358 | ||
1304 | /* | 1359 | /* |
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index db6aa95eb054..27eac0faee48 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S | |||
@@ -102,6 +102,7 @@ send_interrupts: | |||
102 | * create one manually here. | 102 | * create one manually here. |
103 | */ | 103 | */ |
104 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ | 104 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ |
105 | /* Put eax back the way we found it. */ | ||
105 | popl %eax | 106 | popl %eax |
106 | ret | 107 | ret |
107 | 108 | ||
@@ -125,6 +126,7 @@ ENTRY(lg_restore_fl) | |||
125 | jnz send_interrupts | 126 | jnz send_interrupts |
126 | /* Again, the normal path has used no extra registers. Clever, huh? */ | 127 | /* Again, the normal path has used no extra registers. Clever, huh? */ |
127 | ret | 128 | ret |
129 | /*:*/ | ||
128 | 130 | ||
129 | /* These demark the EIP range where host should never deliver interrupts. */ | 131 | /* These demark the EIP range where host should never deliver interrupts. */ |
130 | .global lguest_noirq_start | 132 | .global lguest_noirq_start |