aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/lguest/boot.c
diff options
context:
space:
mode:
authorRusty Russell <rusty@rustcorp.com.au>2009-07-30 18:03:45 -0400
committerRusty Russell <rusty@rustcorp.com.au>2009-07-30 02:33:46 -0400
commita91d74a3c4de8115295ee87350c13a329164aaaf (patch)
tree02c862fccc9abedf7fc354061e69c4b5fbcce06d /arch/x86/lguest/boot.c
parent2e04ef76916d1e29a077ea9d0f2003c8fd86724d (diff)
lguest: update commentry
Every so often, after code shuffles, I need to go through and unbitrot the Lguest Journey (see drivers/lguest/README). Since we now use RCU in a simple form in one place I took the opportunity to expand that explanation. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Cc: Ingo Molnar <mingo@redhat.com> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Diffstat (limited to 'arch/x86/lguest/boot.c')
-rw-r--r--arch/x86/lguest/boot.c99
1 files changed, 77 insertions, 22 deletions
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 025c04d18f2b..d677fa9ca650 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -154,6 +154,7 @@ static void lazy_hcall1(unsigned long call,
154 async_hcall(call, arg1, 0, 0, 0); 154 async_hcall(call, arg1, 0, 0, 0);
155} 155}
156 156
157/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
157static void lazy_hcall2(unsigned long call, 158static void lazy_hcall2(unsigned long call,
158 unsigned long arg1, 159 unsigned long arg1,
159 unsigned long arg2) 160 unsigned long arg2)
@@ -189,8 +190,10 @@ static void lazy_hcall4(unsigned long call,
189} 190}
190#endif 191#endif
191 192
192/* When lazy mode is turned off reset the per-cpu lazy mode variable and then 193/*G:036
193 * issue the do-nothing hypercall to flush any stored calls. */ 194 * When lazy mode is turned off reset the per-cpu lazy mode variable and then
195 * issue the do-nothing hypercall to flush any stored calls.
196:*/
194static void lguest_leave_lazy_mmu_mode(void) 197static void lguest_leave_lazy_mmu_mode(void)
195{ 198{
196 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 199 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
@@ -250,13 +253,11 @@ extern void lg_irq_enable(void);
250extern void lg_restore_fl(unsigned long flags); 253extern void lg_restore_fl(unsigned long flags);
251 254
252/*M:003 255/*M:003
253 * Note that we don't check for outstanding interrupts when we re-enable them 256 * We could be more efficient in our checking of outstanding interrupts, rather
254 * (or when we unmask an interrupt). This seems to work for the moment, since 257 * than using a branch. One way would be to put the "irq_enabled" field in a
255 * interrupts are rare and we'll just get the interrupt on the next timer tick, 258 * page by itself, and have the Host write-protect it when an interrupt comes
256 * but now we can run with CONFIG_NO_HZ, we should revisit this. One way would 259 * in when irqs are disabled. There will then be a page fault as soon as
257 * be to put the "irq_enabled" field in a page by itself, and have the Host 260 * interrupts are re-enabled.
258 * write-protect it when an interrupt comes in when irqs are disabled. There
259 * will then be a page fault as soon as interrupts are re-enabled.
260 * 261 *
261 * A better method is to implement soft interrupt disable generally for x86: 262 * A better method is to implement soft interrupt disable generally for x86:
262 * instead of disabling interrupts, we set a flag. If an interrupt does come 263 * instead of disabling interrupts, we set a flag. If an interrupt does come
@@ -568,7 +569,7 @@ static void lguest_write_cr4(unsigned long val)
568 * cr3 ---> +---------+ 569 * cr3 ---> +---------+
569 * | --------->+---------+ 570 * | --------->+---------+
570 * | | | PADDR1 | 571 * | | | PADDR1 |
571 * Top-level | | PADDR2 | 572 * Mid-level | | PADDR2 |
572 * (PMD) page | | | 573 * (PMD) page | | |
573 * | | Lower-level | 574 * | | Lower-level |
574 * | | (PTE) page | 575 * | | (PTE) page |
@@ -588,23 +589,62 @@ static void lguest_write_cr4(unsigned long val)
588 * Index into top Index into second Offset within page 589 * Index into top Index into second Offset within page
589 * page directory page pagetable page 590 * page directory page pagetable page
590 * 591 *
591 * The kernel spends a lot of time changing both the top-level page directory 592 * Now, unfortunately, this isn't the whole story: Intel added Physical Address
592 * and lower-level pagetable pages. The Guest doesn't know physical addresses, 593 * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
593 * so while it maintains these page tables exactly like normal, it also needs 594 * These are held in 64-bit page table entries, so we can now only fit 512
594 * to keep the Host informed whenever it makes a change: the Host will create 595 * entries in a page, and the neat three-level tree breaks down.
595 * the real page tables based on the Guests'. 596 *
597 * The result is a four level page table:
598 *
599 * cr3 --> [ 4 Upper ]
600 * [ Level ]
601 * [ Entries ]
602 * [(PUD Page)]---> +---------+
603 * | --------->+---------+
604 * | | | PADDR1 |
605 * Mid-level | | PADDR2 |
606 * (PMD) page | | |
607 * | | Lower-level |
608 * | | (PTE) page |
609 * | | | |
610 * .... ....
611 *
612 *
613 * And the virtual address is decoded as:
614 *
615 * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
616 * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
617 * Index into Index into mid Index into lower Offset within page
618 * top entries directory page pagetable page
619 *
620 * It's too hard to switch between these two formats at runtime, so Linux only
621 * supports one or the other depending on whether CONFIG_X86_PAE is set. Many
622 * distributions turn it on, and not just for people with silly amounts of
623 * memory: the larger PTE entries allow room for the NX bit, which lets the
624 * kernel disable execution of pages and increase security.
625 *
626 * This was a problem for lguest, which couldn't run on these distributions;
627 * then Matias Zabaljauregui figured it all out and implemented it, and only a
628 * handful of puppies were crushed in the process!
629 *
630 * Back to our point: the kernel spends a lot of time changing both the
631 * top-level page directory and lower-level pagetable pages. The Guest doesn't
632 * know physical addresses, so while it maintains these page tables exactly
633 * like normal, it also needs to keep the Host informed whenever it makes a
634 * change: the Host will create the real page tables based on the Guests'.
596 */ 635 */
597 636
598/* 637/*
599 * The Guest calls this to set a second-level entry (pte), ie. to map a page 638 * The Guest calls this after it has set a second-level entry (pte), ie. to map
600 * into a process' address space. We set the entry then tell the Host the 639 * a page into a process' address space. We tell the Host the toplevel and
601 * toplevel and address this corresponds to. The Guest uses one pagetable per 640 * address this corresponds to. The Guest uses one pagetable per process, so
602 * process, so we need to tell the Host which one we're changing (mm->pgd). 641 * we need to tell the Host which one we're changing (mm->pgd).
603 */ 642 */
604static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, 643static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
605 pte_t *ptep) 644 pte_t *ptep)
606{ 645{
607#ifdef CONFIG_X86_PAE 646#ifdef CONFIG_X86_PAE
647 /* PAE needs to hand a 64 bit page table entry, so it uses two args. */
608 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, 648 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
609 ptep->pte_low, ptep->pte_high); 649 ptep->pte_low, ptep->pte_high);
610#else 650#else
@@ -612,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
612#endif 652#endif
613} 653}
614 654
655/* This is the "set and update" combo-meal-deal version. */
615static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 656static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
616 pte_t *ptep, pte_t pteval) 657 pte_t *ptep, pte_t pteval)
617{ 658{
@@ -672,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
672} 713}
673 714
674#ifdef CONFIG_X86_PAE 715#ifdef CONFIG_X86_PAE
716/*
717 * With 64-bit PTE values, we need to be careful setting them: if we set 32
718 * bits at a time, the hardware could see a weird half-set entry. These
719 * versions ensure we update all 64 bits at once.
720 */
675static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) 721static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
676{ 722{
677 native_set_pte_atomic(ptep, pte); 723 native_set_pte_atomic(ptep, pte);
@@ -679,13 +725,14 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
679 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 725 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
680} 726}
681 727
682void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 728static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
729 pte_t *ptep)
683{ 730{
684 native_pte_clear(mm, addr, ptep); 731 native_pte_clear(mm, addr, ptep);
685 lguest_pte_update(mm, addr, ptep); 732 lguest_pte_update(mm, addr, ptep);
686} 733}
687 734
688void lguest_pmd_clear(pmd_t *pmdp) 735static void lguest_pmd_clear(pmd_t *pmdp)
689{ 736{
690 lguest_set_pmd(pmdp, __pmd(0)); 737 lguest_set_pmd(pmdp, __pmd(0));
691} 738}
@@ -784,6 +831,14 @@ static void __init lguest_init_IRQ(void)
784 irq_ctx_init(smp_processor_id()); 831 irq_ctx_init(smp_processor_id());
785} 832}
786 833
834/*
835 * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so
836 * rather than set them in lguest_init_IRQ we are called here every time an
837 * lguest device needs an interrupt.
838 *
839 * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should
840 * pass that up!
841 */
787void lguest_setup_irq(unsigned int irq) 842void lguest_setup_irq(unsigned int irq)
788{ 843{
789 irq_to_desc_alloc_node(irq, 0); 844 irq_to_desc_alloc_node(irq, 0);
@@ -1298,7 +1353,7 @@ __init void lguest_init(void)
1298 */ 1353 */
1299 switch_to_new_gdt(0); 1354 switch_to_new_gdt(0);
1300 1355
1301 /* As described in head_32.S, we map the first 128M of memory. */ 1356 /* We actually boot with all memory mapped, but let's say 128MB. */
1302 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1357 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1303 1358
1304 /* 1359 /*