author     Rusty Russell <rusty@rustcorp.com.au>    2011-07-22 01:09:48 -0400
committer  Rusty Russell <rusty@rustcorp.com.au>    2011-07-22 01:09:48 -0400
commit     5dea1c88ed11a1221581c4b202f053c4fc138704 (patch)
tree       59e15d3c696712e26ffb229ff987f33bcc72affe /drivers/lguest
parent     e0377e25206328998d036cafddcd00a7c3252e3e (diff)
lguest: use a special 1:1 linear pagetable mode until first switch.
The Host used to create some page tables for the Guest to use at the
top of Guest memory; it would then tell the Guest where this was. In
particular, it created linear mappings for 0 and 0xC0000000 addresses
because lguest used to switch to its real page tables quite late in
boot.
However, since commit d50d8fe19, Linux has initialized its boot page
tables in head_32.S even before the "are we lguest?" boot jump. So now
we can simplify things: the Host pagetable code assumes a 1:1 linear
mapping until the Guest makes its first LHCALL_NEW_PGTABLE hypercall,
which it now does before reaching C code.
This also means that the Host doesn't need to know anything about the
Guest's PAGE_OFFSET. (Non-Linux guests might not even have such a
thing).
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Diffstat (limited to 'drivers/lguest')
-rw-r--r--   drivers/lguest/lg.h          |   2
-rw-r--r--   drivers/lguest/page_tables.c | 278
2 files changed, 86 insertions, 194 deletions
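
Before reading the diff, a deliberately simplified user-space sketch of the new scheme may help. It is only an illustration of the shape of the change (every toy_* name and constant below is invented; the real logic is in init_guest_pagetable(), demand_page() and guest_new_pagetable() in the patch itself):

/*
 * Toy user-space model of the idea in this patch -- NOT lguest code.  The
 * toy_* names are made up for illustration only.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_PAGE_MASK 0xfffff000u

struct toy_cpu {
        bool linear_pages;              /* still faking a 1:1 mapping? */
        unsigned int guest_pgdir;       /* guest-physical address of its real top level */
};

/* Rough analogue of demand_page(): in linear mode, invent a PTE whose frame
 * simply equals the faulting virtual address. */
static unsigned int toy_fault_pte(struct toy_cpu *cpu, unsigned int vaddr)
{
        if (cpu->linear_pages)
                return (vaddr & TOY_PAGE_MASK) | 0x3;   /* present + writable */
        /* Otherwise the Host would read the Guest's own PTE; the toy just says so. */
        printf("would walk the Guest tables at %#x for %#x\n",
               cpu->guest_pgdir, vaddr);
        return 0;
}

/* Rough analogue of guest_new_pagetable() on the first LHCALL_NEW_PGTABLE. */
static void toy_new_pagetable(struct toy_cpu *cpu, unsigned int pgtable)
{
        if (cpu->linear_pages) {
                /* Throw the made-up mappings away; shadow for real from now on. */
                cpu->linear_pages = false;
        }
        cpu->guest_pgdir = pgtable;
}

int main(void)
{
        struct toy_cpu cpu = { .linear_pages = true };

        printf("early fault: pte = %#x\n", toy_fault_pte(&cpu, 0xc0001234));
        toy_new_pagetable(&cpu, 0x00400000);    /* Guest installs its own tables */
        printf("later fault: pte = %#x\n", toy_fault_pte(&cpu, 0xc0001234));
        return 0;
}

In the patch, the same flag drives demand_page() and guest_pa() to fabricate 1:1 translations, and guest_new_pagetable() drops the pretence on the Guest's first LHCALL_NEW_PGTABLE.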
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 9136411fadd5..295df06e6590 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -59,6 +59,8 @@ struct lg_cpu {
 
         struct lguest_pages *last_pages;
 
+        /* Initialization mode: linear map everything. */
+        bool linear_pages;
         int cpu_pgd;            /* Which pgd this cpu is currently using */
 
         /* If a hypercall was asked for, this points to the arguments. */
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index d21578ee95de..00026222bde8 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -17,7 +17,6 @@
 #include <linux/percpu.h>
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
-#include <asm/bootparam.h>
 #include "lg.h"
 
 /*M:008
@@ -325,10 +324,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 #endif
 
         /* First step: get the top-level Guest page table entry. */
-        gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
-        /* Toplevel not present?  We can't map it in. */
-        if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-                return false;
+        if (unlikely(cpu->linear_pages)) {
+                /* Faking up a linear mapping. */
+                gpgd = __pgd(CHECK_GPGD_MASK);
+        } else {
+                gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
+                /* Toplevel not present?  We can't map it in. */
+                if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
+                        return false;
+        }
 
         /* Now look at the matching shadow entry. */
         spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
@@ -353,10 +357,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
         }
 
 #ifdef CONFIG_X86_PAE
-        gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-        /* Middle level not present?  We can't map it in. */
-        if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
-                return false;
+        if (unlikely(cpu->linear_pages)) {
+                /* Faking up a linear mapping. */
+                gpmd = __pmd(_PAGE_TABLE);
+        } else {
+                gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+                /* Middle level not present?  We can't map it in. */
+                if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+                        return false;
+        }
 
         /* Now look at the matching shadow entry. */
         spmd = spmd_addr(cpu, *spgd, vaddr);
@@ -397,8 +406,13 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
         gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
 #endif
 
-        /* Read the actual PTE value. */
-        gpte = lgread(cpu, gpte_ptr, pte_t);
+        if (unlikely(cpu->linear_pages)) {
+                /* Linear?  Make up a PTE which points to same page. */
+                gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
+        } else {
+                /* Read the actual PTE value. */
+                gpte = lgread(cpu, gpte_ptr, pte_t);
+        }
 
         /* If this page isn't in the Guest page tables, we can't page it in. */
         if (!(pte_flags(gpte) & _PAGE_PRESENT))
@@ -454,7 +468,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
          * Finally, we write the Guest PTE entry back: we've set the
          * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
          */
-        lgwrite(cpu, gpte_ptr, pte_t, gpte);
+        if (likely(!cpu->linear_pages))
+                lgwrite(cpu, gpte_ptr, pte_t, gpte);
 
         /*
          * The fault is fixed, the page table is populated, the mapping
@@ -612,6 +627,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 #ifdef CONFIG_X86_PAE
         pmd_t gpmd;
 #endif
+
+        /* Still not set up?  Just map 1:1. */
+        if (unlikely(cpu->linear_pages))
+                return vaddr;
+
         /* First step: get the top-level Guest page table entry. */
         gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
         /* Toplevel not present?  We can't map it in. */
@@ -708,32 +728,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
         return next;
 }
 
-/*H:430
- * (iv) Switching page tables
- *
- * Now we've seen all the page table setting and manipulation, let's see
- * what happens when the Guest changes page tables (ie. changes the top-level
- * pgdir).  This occurs on almost every context switch.
- */
-void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
-{
-        int newpgdir, repin = 0;
-
-        /* Look to see if we have this one already. */
-        newpgdir = find_pgdir(cpu->lg, pgtable);
-        /*
-         * If not, we allocate or mug an existing one: if it's a fresh one,
-         * repin gets set to 1.
-         */
-        if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
-                newpgdir = new_pgdir(cpu, pgtable, &repin);
-        /* Change the current pgd index to the new one. */
-        cpu->cpu_pgd = newpgdir;
-        /* If it was completely blank, we map in the Guest kernel stack */
-        if (repin)
-                pin_stack_pages(cpu);
-}
-
 /*H:470
  * Finally, a routine which throws away everything: all PGD entries in all
  * the shadow page tables, including the Guest's kernel mappings. This is used
@@ -780,6 +774,44 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
         /* We need the Guest kernel stack mapped again. */
         pin_stack_pages(cpu);
 }
+
+/*H:430
+ * (iv) Switching page tables
+ *
+ * Now we've seen all the page table setting and manipulation, let's see
+ * what happens when the Guest changes page tables (ie. changes the top-level
+ * pgdir).  This occurs on almost every context switch.
+ */
+void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
+{
+        int newpgdir, repin = 0;
+
+        /*
+         * The very first time they call this, we're actually running without
+         * any page tables; we've been making it up.  Throw them away now.
+         */
+        if (unlikely(cpu->linear_pages)) {
+                release_all_pagetables(cpu->lg);
+                cpu->linear_pages = false;
+                /* Force allocation of a new pgdir. */
+                newpgdir = ARRAY_SIZE(cpu->lg->pgdirs);
+        } else {
+                /* Look to see if we have this one already. */
+                newpgdir = find_pgdir(cpu->lg, pgtable);
+        }
+
+        /*
+         * If not, we allocate or mug an existing one: if it's a fresh one,
+         * repin gets set to 1.
+         */
+        if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
+                newpgdir = new_pgdir(cpu, pgtable, &repin);
+        /* Change the current pgd index to the new one. */
+        cpu->cpu_pgd = newpgdir;
+        /* If it was completely blank, we map in the Guest kernel stack */
+        if (repin)
+                pin_stack_pages(cpu);
+}
 /*:*/
 
 /*M:009
@@ -919,168 +951,26 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
 }
 #endif
 
-/*H:505
- * To get through boot, we construct simple identity page mappings (which
- * set virtual == physical) and linear mappings which will get the Guest far
- * enough into the boot to create its own.  The linear mapping means we
- * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET,
- * as you'll see.
- *
- * We lay them out of the way, just below the initrd (which is why we need to
- * know its size here).
- */
-static unsigned long setup_pagetables(struct lguest *lg,
-                                      unsigned long mem,
-                                      unsigned long initrd_size)
-{
-        pgd_t __user *pgdir;
-        pte_t __user *linear;
-        unsigned long mem_base = (unsigned long)lg->mem_base;
-        unsigned int mapped_pages, i, linear_pages;
-#ifdef CONFIG_X86_PAE
-        pmd_t __user *pmds;
-        unsigned int j;
-        pgd_t pgd;
-        pmd_t pmd;
-#else
-        unsigned int phys_linear;
-#endif
-
-        /*
-         * We have mapped_pages frames to map, so we need linear_pages page
-         * tables to map them.
-         */
-        mapped_pages = mem / PAGE_SIZE;
-        linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;
-
-        /* We put the toplevel page directory page at the top of memory. */
-        pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE);
-
-        /* Now we use the next linear_pages pages as pte pages */
-        linear = (void *)pgdir - linear_pages * PAGE_SIZE;
-
-#ifdef CONFIG_X86_PAE
-        /*
-         * And the single mid page goes below that.  We only use one, but
-         * that's enough to map 1G, which definitely gets us through boot.
-         */
-        pmds = (void *)linear - PAGE_SIZE;
-#endif
-        /*
-         * Linear mapping is easy: put every page's address into the
-         * mapping in order.
-         */
-        for (i = 0; i < mapped_pages; i++) {
-                pte_t pte;
-                pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
-                if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0)
-                        return -EFAULT;
-        }
-
-#ifdef CONFIG_X86_PAE
-        /*
-         * Make the Guest PMD entries point to the corresponding place in the
-         * linear mapping (up to one page worth of PMD).
-         */
-        for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
-             i += PTRS_PER_PTE, j++) {
-                pmd = pfn_pmd(((unsigned long)&linear[i] - mem_base)/PAGE_SIZE,
-                              __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
-
-                if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
-                        return -EFAULT;
-        }
-
-        /* One PGD entry, pointing to that PMD page. */
-        pgd = __pgd(((unsigned long)pmds - mem_base) | _PAGE_PRESENT);
-        /* Copy it in as the first PGD entry (ie. addresses 0-1G). */
-        if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
-                return -EFAULT;
-        /*
-         * And the other PGD entry to make the linear mapping at PAGE_OFFSET
-         */
-        if (copy_to_user(&pgdir[KERNEL_PGD_BOUNDARY], &pgd, sizeof(pgd)))
-                return -EFAULT;
-#else
-        /*
-         * The top level points to the linear page table pages above.
-         * We setup the identity and linear mappings here.
-         */
-        phys_linear = (unsigned long)linear - mem_base;
-        for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
-                pgd_t pgd;
-                /*
-                 * Create a PGD entry which points to the right part of the
-                 * linear PTE pages.
-                 */
-                pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
-                            (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
-
-                /*
-                 * Copy it into the PGD page at 0 and PAGE_OFFSET.
-                 */
-                if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
-                    || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
-                                           + i / PTRS_PER_PTE],
-                                    &pgd, sizeof(pgd)))
-                        return -EFAULT;
-        }
-#endif
-
-        /*
-         * We return the top level (guest-physical) address: we remember where
-         * this is to write it into lguest_data when the Guest initializes.
-         */
-        return (unsigned long)pgdir - mem_base;
-}
-
 /*H:500
  * (vii) Setting up the page tables initially.
  *
- * When a Guest is first created, the Launcher tells us where the toplevel of
- * its first page table is.  We set some things up here:
+ * When a Guest is first created, we initialize a shadow page table which
+ * we will populate on future faults.  The Guest doesn't have any actual
+ * pagetables yet, so we set linear_pages to tell demand_page() to fake it
+ * for the moment.
  */
 int init_guest_pagetable(struct lguest *lg)
 {
-        u64 mem;
-        u32 initrd_size;
-        struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
-#ifdef CONFIG_X86_PAE
-        pgd_t *pgd;
-        pmd_t *pmd_table;
-#endif
-        /*
-         * Get the Guest memory size and the ramdisk size from the boot header
-         * located at lg->mem_base (Guest address 0).
-         */
-        if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
-            || get_user(initrd_size, &boot->hdr.ramdisk_size))
-                return -EFAULT;
+        struct lg_cpu *cpu = &lg->cpus[0];
+        int allocated = 0;
 
-        /*
-         * We start on the first shadow page table, and give it a blank PGD
-         * page.
-         */
-        lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
-        if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
-                return lg->pgdirs[0].gpgdir;
-        lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
-        if (!lg->pgdirs[0].pgdir)
+        /* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */
+        cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated);
+        if (!allocated)
                 return -ENOMEM;
 
-#ifdef CONFIG_X86_PAE
-        /* For PAE, we also create the initial mid-level. */
-        pgd = lg->pgdirs[0].pgdir;
-        pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
-        if (!pmd_table)
-                return -ENOMEM;
-
-        set_pgd(pgd + SWITCHER_PGD_INDEX,
-                __pgd(__pa(pmd_table) | _PAGE_PRESENT));
-#endif
-
-        /* This is the current page table. */
-        lg->cpus[0].cpu_pgd = 0;
+        /* We start with a linear mapping until the Guest sets up its own. */
+        cpu->linear_pages = true;
         return 0;
 }
 
@@ -1095,10 +985,10 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
          * of virtual addresses used by the Switcher.
          */
             || put_user(RESERVE_MEM * 1024 * 1024,
-                        &cpu->lg->lguest_data->reserve_mem)
-            || put_user(cpu->lg->pgdirs[0].gpgdir,
-                        &cpu->lg->lguest_data->pgdir))
+                        &cpu->lg->lguest_data->reserve_mem)) {
                 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
+                return;
+        }
 
         /*
          * In flush_user_mappings() we loop from 0 to