diff options
author | Rusty Russell <rusty@rustcorp.com.au> | 2009-07-30 18:03:45 -0400 |
---|---|---|
committer | Rusty Russell <rusty@rustcorp.com.au> | 2009-07-30 02:33:46 -0400 |
commit | a91d74a3c4de8115295ee87350c13a329164aaaf (patch) | |
tree | 02c862fccc9abedf7fc354061e69c4b5fbcce06d /drivers/lguest/page_tables.c | |
parent | 2e04ef76916d1e29a077ea9d0f2003c8fd86724d (diff) |
lguest: update commentary
Every so often, after code shuffles, I need to go through and unbitrot
the Lguest Journey (see drivers/lguest/README). Since we now use RCU in
a simple form in one place I took the opportunity to expand that explanation.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Diffstat (limited to 'drivers/lguest/page_tables.c')
-rw-r--r-- | drivers/lguest/page_tables.c | 84 |
1 files changed, 65 insertions, 19 deletions
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 3da902e4b4cb..a8d0aee3bc0e 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c | |||
@@ -29,10 +29,10 @@ | |||
29 | /*H:300 | 29 | /*H:300 |
30 | * The Page Table Code | 30 | * The Page Table Code |
31 | * | 31 | * |
32 | * We use two-level page tables for the Guest. If you're not entirely | 32 | * We use two-level page tables for the Guest, or three-level with PAE. If |
33 | * comfortable with virtual addresses, physical addresses and page tables then | 33 | * you're not entirely comfortable with virtual addresses, physical addresses |
34 | * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with | 34 | * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page |
35 | * diagrams!). | 35 | * Table Handling" (with diagrams!). |
36 | * | 36 | * |
37 | * The Guest keeps page tables, but we maintain the actual ones here: these are | 37 | * The Guest keeps page tables, but we maintain the actual ones here: these are |
38 | * called "shadow" page tables. Which is a very Guest-centric name: these are | 38 | * called "shadow" page tables. Which is a very Guest-centric name: these are |
@@ -52,9 +52,8 @@ | |||
52 | :*/ | 52 | :*/ |
53 | 53 | ||
54 | /* | 54 | /* |
55 | * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is | 55 | * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) |
56 | * conveniently placed at the top 4MB, so it uses a separate, complete PTE | 56 | * or 512 PTE entries with PAE (2MB). |
57 | * page. | ||
58 | */ | 57 | */ |
59 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) | 58 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) |
60 | 59 | ||
@@ -81,7 +80,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); | |||
81 | 80 | ||
82 | /*H:320 | 81 | /*H:320 |
83 | * The page table code is curly enough to need helper functions to keep it | 82 | * The page table code is curly enough to need helper functions to keep it |
84 | * clear and clean. | 83 | * clear and clean. The kernel itself provides many of them; one advantage |
84 | * of insisting that the Guest and Host use the same CONFIG_PAE setting. | ||
85 | * | 85 | * |
86 | * There are two functions which return pointers to the shadow (aka "real") | 86 | * There are two functions which return pointers to the shadow (aka "real") |
87 | * page tables. | 87 | * page tables. |
@@ -155,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) | |||
155 | } | 155 | } |
156 | 156 | ||
157 | /* | 157 | /* |
158 | * These two functions just like the above two, except they access the Guest | 158 | * These functions are just like the above two, except they access the Guest |
159 | * page tables. Hence they return a Guest address. | 159 | * page tables. Hence they return a Guest address. |
160 | */ | 160 | */ |
161 | static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) | 161 | static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) |
@@ -165,6 +165,7 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) | |||
165 | } | 165 | } |
166 | 166 | ||
167 | #ifdef CONFIG_X86_PAE | 167 | #ifdef CONFIG_X86_PAE |
168 | /* Follow the PGD to the PMD. */ | ||
168 | static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) | 169 | static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) |
169 | { | 170 | { |
170 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; | 171 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; |
@@ -172,6 +173,7 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) | |||
172 | return gpage + pmd_index(vaddr) * sizeof(pmd_t); | 173 | return gpage + pmd_index(vaddr) * sizeof(pmd_t); |
173 | } | 174 | } |
174 | 175 | ||
176 | /* Follow the PMD to the PTE. */ | ||
175 | static unsigned long gpte_addr(struct lg_cpu *cpu, | 177 | static unsigned long gpte_addr(struct lg_cpu *cpu, |
176 | pmd_t gpmd, unsigned long vaddr) | 178 | pmd_t gpmd, unsigned long vaddr) |
177 | { | 179 | { |
@@ -181,6 +183,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, | |||
181 | return gpage + pte_index(vaddr) * sizeof(pte_t); | 183 | return gpage + pte_index(vaddr) * sizeof(pte_t); |
182 | } | 184 | } |
183 | #else | 185 | #else |
186 | /* Follow the PGD to the PTE (no mid-level for !PAE). */ | ||
184 | static unsigned long gpte_addr(struct lg_cpu *cpu, | 187 | static unsigned long gpte_addr(struct lg_cpu *cpu, |
185 | pgd_t gpgd, unsigned long vaddr) | 188 | pgd_t gpgd, unsigned long vaddr) |
186 | { | 189 | { |
@@ -314,6 +317,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
314 | pte_t gpte; | 317 | pte_t gpte; |
315 | pte_t *spte; | 318 | pte_t *spte; |
316 | 319 | ||
320 | /* Mid level for PAE. */ | ||
317 | #ifdef CONFIG_X86_PAE | 321 | #ifdef CONFIG_X86_PAE |
318 | pmd_t *spmd; | 322 | pmd_t *spmd; |
319 | pmd_t gpmd; | 323 | pmd_t gpmd; |
@@ -391,6 +395,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
391 | */ | 395 | */ |
392 | gpte_ptr = gpte_addr(cpu, gpgd, vaddr); | 396 | gpte_ptr = gpte_addr(cpu, gpgd, vaddr); |
393 | #endif | 397 | #endif |
398 | |||
399 | /* Read the actual PTE value. */ | ||
394 | gpte = lgread(cpu, gpte_ptr, pte_t); | 400 | gpte = lgread(cpu, gpte_ptr, pte_t); |
395 | 401 | ||
396 | /* If this page isn't in the Guest page tables, we can't page it in. */ | 402 | /* If this page isn't in the Guest page tables, we can't page it in. */ |
@@ -507,6 +513,7 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) | |||
507 | if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) | 513 | if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) |
508 | kill_guest(cpu, "bad stack page %#lx", vaddr); | 514 | kill_guest(cpu, "bad stack page %#lx", vaddr); |
509 | } | 515 | } |
516 | /*:*/ | ||
510 | 517 | ||
511 | #ifdef CONFIG_X86_PAE | 518 | #ifdef CONFIG_X86_PAE |
512 | static void release_pmd(pmd_t *spmd) | 519 | static void release_pmd(pmd_t *spmd) |
@@ -543,7 +550,11 @@ static void release_pgd(pgd_t *spgd) | |||
543 | } | 550 | } |
544 | 551 | ||
545 | #else /* !CONFIG_X86_PAE */ | 552 | #else /* !CONFIG_X86_PAE */ |
546 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ | 553 | /*H:450 |
554 | * If we chase down the release_pgd() code, the non-PAE version looks like | ||
555 | * this. The PAE version is almost identical, but instead of calling | ||
556 | * release_pte() it calls release_pmd(), which looks much like this. | ||
557 | */ | ||
547 | static void release_pgd(pgd_t *spgd) | 558 | static void release_pgd(pgd_t *spgd) |
548 | { | 559 | { |
549 | /* If the entry's not present, there's nothing to release. */ | 560 | /* If the entry's not present, there's nothing to release. */ |
@@ -898,17 +909,21 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) | |||
898 | /* ... throw it away. */ | 909 | /* ... throw it away. */ |
899 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); | 910 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); |
900 | } | 911 | } |
912 | |||
901 | #ifdef CONFIG_X86_PAE | 913 | #ifdef CONFIG_X86_PAE |
914 | /* For setting a mid-level, we just throw everything away. It's easy. */ | ||
902 | void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) | 915 | void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) |
903 | { | 916 | { |
904 | guest_pagetable_clear_all(&lg->cpus[0]); | 917 | guest_pagetable_clear_all(&lg->cpus[0]); |
905 | } | 918 | } |
906 | #endif | 919 | #endif |
907 | 920 | ||
908 | /* | 921 | /*H:505 |
909 | * Once we know how much memory we have we can construct simple identity (which | 922 | * To get through boot, we construct simple identity page mappings (which |
910 | * set virtual == physical) and linear mappings which will get the Guest far | 923 | * set virtual == physical) and linear mappings which will get the Guest far |
911 | * enough into the boot to create its own. | 924 | * enough into the boot to create its own. The linear mapping means we |
925 | * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET, | ||
926 | * as you'll see. | ||
912 | * | 927 | * |
913 | * We lay them out of the way, just below the initrd (which is why we need to | 928 | * We lay them out of the way, just below the initrd (which is why we need to |
914 | * know its size here). | 929 | * know its size here). |
@@ -944,6 +959,10 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
944 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; | 959 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; |
945 | 960 | ||
946 | #ifdef CONFIG_X86_PAE | 961 | #ifdef CONFIG_X86_PAE |
962 | /* | ||
963 | * And the single mid page goes below that. We only use one, but | ||
964 | * that's enough to map 1G, which definitely gets us through boot. | ||
965 | */ | ||
947 | pmds = (void *)linear - PAGE_SIZE; | 966 | pmds = (void *)linear - PAGE_SIZE; |
948 | #endif | 967 | #endif |
949 | /* | 968 | /* |
@@ -957,13 +976,14 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
957 | return -EFAULT; | 976 | return -EFAULT; |
958 | } | 977 | } |
959 | 978 | ||
979 | #ifdef CONFIG_X86_PAE | ||
960 | /* | 980 | /* |
961 | * The top level points to the linear page table pages above. | 981 | * Make the Guest PMD entries point to the corresponding place in the |
962 | * We setup the identity and linear mappings here. | 982 | * linear mapping (up to one page worth of PMD). |
963 | */ | 983 | */ |
964 | #ifdef CONFIG_X86_PAE | ||
965 | for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; | 984 | for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; |
966 | i += PTRS_PER_PTE, j++) { | 985 | i += PTRS_PER_PTE, j++) { |
986 | /* FIXME: native_set_pmd is overkill here. */ | ||
967 | native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) | 987 | native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) |
968 | - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); | 988 | - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); |
969 | 989 | ||
@@ -971,18 +991,36 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
971 | return -EFAULT; | 991 | return -EFAULT; |
972 | } | 992 | } |
973 | 993 | ||
994 | /* One PGD entry, pointing to that PMD page. */ | ||
974 | set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); | 995 | set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); |
996 | /* Copy it in as the first PGD entry (ie. addresses 0-1G). */ | ||
975 | if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) | 997 | if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) |
976 | return -EFAULT; | 998 | return -EFAULT; |
999 | /* | ||
1000 | * And the fourth PGD entry (index 3, ie. addresses 3G-4G). | ||
1001 | * | ||
1002 | * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000. | ||
1003 | */ | ||
977 | if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) | 1004 | if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) |
978 | return -EFAULT; | 1005 | return -EFAULT; |
979 | #else | 1006 | #else |
1007 | /* | ||
1008 | * The top level points to the linear page table pages above. | ||
1009 | * We setup the identity and linear mappings here. | ||
1010 | */ | ||
980 | phys_linear = (unsigned long)linear - mem_base; | 1011 | phys_linear = (unsigned long)linear - mem_base; |
981 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { | 1012 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { |
982 | pgd_t pgd; | 1013 | pgd_t pgd; |
1014 | /* | ||
1015 | * Create a PGD entry which points to the right part of the | ||
1016 | * linear PTE pages. | ||
1017 | */ | ||
983 | pgd = __pgd((phys_linear + i * sizeof(pte_t)) | | 1018 | pgd = __pgd((phys_linear + i * sizeof(pte_t)) | |
984 | (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); | 1019 | (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); |
985 | 1020 | ||
1021 | /* | ||
1022 | * Copy it into the PGD page at 0 and PAGE_OFFSET. | ||
1023 | */ | ||
986 | if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) | 1024 | if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) |
987 | || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) | 1025 | || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) |
988 | + i / PTRS_PER_PTE], | 1026 | + i / PTRS_PER_PTE], |
@@ -992,8 +1030,8 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
992 | #endif | 1030 | #endif |
993 | 1031 | ||
994 | /* | 1032 | /* |
995 | * We return the top level (guest-physical) address: remember where | 1033 | * We return the top level (guest-physical) address: we remember where |
996 | * this is. | 1034 | * this is to write it into lguest_data when the Guest initializes. |
997 | */ | 1035 | */ |
998 | return (unsigned long)pgdir - mem_base; | 1036 | return (unsigned long)pgdir - mem_base; |
999 | } | 1037 | } |
@@ -1031,7 +1069,9 @@ int init_guest_pagetable(struct lguest *lg) | |||
1031 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); | 1069 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); |
1032 | if (!lg->pgdirs[0].pgdir) | 1070 | if (!lg->pgdirs[0].pgdir) |
1033 | return -ENOMEM; | 1071 | return -ENOMEM; |
1072 | |||
1034 | #ifdef CONFIG_X86_PAE | 1073 | #ifdef CONFIG_X86_PAE |
1074 | /* For PAE, we also create the initial mid-level. */ | ||
1035 | pgd = lg->pgdirs[0].pgdir; | 1075 | pgd = lg->pgdirs[0].pgdir; |
1036 | pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); | 1076 | pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); |
1037 | if (!pmd_table) | 1077 | if (!pmd_table) |
@@ -1040,11 +1080,13 @@ int init_guest_pagetable(struct lguest *lg) | |||
1040 | set_pgd(pgd + SWITCHER_PGD_INDEX, | 1080 | set_pgd(pgd + SWITCHER_PGD_INDEX, |
1041 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | 1081 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); |
1042 | #endif | 1082 | #endif |
1083 | |||
1084 | /* This is the current page table. */ | ||
1043 | lg->cpus[0].cpu_pgd = 0; | 1085 | lg->cpus[0].cpu_pgd = 0; |
1044 | return 0; | 1086 | return 0; |
1045 | } | 1087 | } |
1046 | 1088 | ||
1047 | /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ | 1089 | /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ |
1048 | void page_table_guest_data_init(struct lg_cpu *cpu) | 1090 | void page_table_guest_data_init(struct lg_cpu *cpu) |
1049 | { | 1091 | { |
1050 | /* We get the kernel address: above this is all kernel memory. */ | 1092 | /* We get the kernel address: above this is all kernel memory. */ |
@@ -1105,12 +1147,16 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) | |||
1105 | pmd_t switcher_pmd; | 1147 | pmd_t switcher_pmd; |
1106 | pmd_t *pmd_table; | 1148 | pmd_t *pmd_table; |
1107 | 1149 | ||
1150 | /* FIXME: native_set_pmd is overkill here. */ | ||
1108 | native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> | 1151 | native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> |
1109 | PAGE_SHIFT, PAGE_KERNEL_EXEC)); | 1152 | PAGE_SHIFT, PAGE_KERNEL_EXEC)); |
1110 | 1153 | ||
1154 | /* Figure out where the pmd page is, by reading the PGD, and converting | ||
1155 | * it to a virtual address. */ | ||
1111 | pmd_table = __va(pgd_pfn(cpu->lg-> | 1156 | pmd_table = __va(pgd_pfn(cpu->lg-> |
1112 | pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) | 1157 | pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) |
1113 | << PAGE_SHIFT); | 1158 | << PAGE_SHIFT); |
1159 | /* Now write it into the shadow page table. */ | ||
1114 | native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); | 1160 | native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); |
1115 | #else | 1161 | #else |
1116 | pgd_t switcher_pgd; | 1162 | pgd_t switcher_pgd; |