about | summary | refs | log | tree | commit | diff | stats
path: root/drivers/lguest/page_tables.c
diff options
context:
space:
mode:
author: Rusty Russell <rusty@rustcorp.com.au> 2009-07-30 18:03:45 -0400
committer: Rusty Russell <rusty@rustcorp.com.au> 2009-07-30 02:33:46 -0400
commit: a91d74a3c4de8115295ee87350c13a329164aaaf (patch)
tree: 02c862fccc9abedf7fc354061e69c4b5fbcce06d /drivers/lguest/page_tables.c
parent: 2e04ef76916d1e29a077ea9d0f2003c8fd86724d (diff)
lguest: update commentry
Every so often, after code shuffles, I need to go through and unbitrot the Lguest Journey (see drivers/lguest/README). Since we now use RCU in a simple form in one place I took the opportunity to expand that explanation.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Diffstat (limited to 'drivers/lguest/page_tables.c')
-rw-r--r-- drivers/lguest/page_tables.c 84
1 files changed, 65 insertions, 19 deletions
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 3da902e4b4cb..a8d0aee3bc0e 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -29,10 +29,10 @@
29/*H:300 29/*H:300
30 * The Page Table Code 30 * The Page Table Code
31 * 31 *
32 * We use two-level page tables for the Guest. If you're not entirely 32 * We use two-level page tables for the Guest, or three-level with PAE. If
33 * comfortable with virtual addresses, physical addresses and page tables then 33 * you're not entirely comfortable with virtual addresses, physical addresses
34 * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with 34 * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page
35 * diagrams!). 35 * Table Handling" (with diagrams!).
36 * 36 *
37 * The Guest keeps page tables, but we maintain the actual ones here: these are 37 * The Guest keeps page tables, but we maintain the actual ones here: these are
38 * called "shadow" page tables. Which is a very Guest-centric name: these are 38 * called "shadow" page tables. Which is a very Guest-centric name: these are
@@ -52,9 +52,8 @@
52:*/ 52:*/
53 53
54/* 54/*
55 * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is 55 * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB)
56 * conveniently placed at the top 4MB, so it uses a separate, complete PTE 56 * or 512 PTE entries with PAE (2MB).
57 * page.
58 */ 57 */
59#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) 58#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
60 59
@@ -81,7 +80,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
81 80
82/*H:320 81/*H:320
83 * The page table code is curly enough to need helper functions to keep it 82 * The page table code is curly enough to need helper functions to keep it
84 * clear and clean. 83 * clear and clean. The kernel itself provides many of them; one advantage
84 * of insisting that the Guest and Host use the same CONFIG_PAE setting.
85 * 85 *
86 * There are two functions which return pointers to the shadow (aka "real") 86 * There are two functions which return pointers to the shadow (aka "real")
87 * page tables. 87 * page tables.
@@ -155,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
155} 155}
156 156
157/* 157/*
158 * These two functions just like the above two, except they access the Guest 158 * These functions are just like the above two, except they access the Guest
159 * page tables. Hence they return a Guest address. 159 * page tables. Hence they return a Guest address.
160 */ 160 */
161static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) 161static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
@@ -165,6 +165,7 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
165} 165}
166 166
167#ifdef CONFIG_X86_PAE 167#ifdef CONFIG_X86_PAE
168/* Follow the PGD to the PMD. */
168static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) 169static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
169{ 170{
170 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; 171 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
@@ -172,6 +173,7 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
172 return gpage + pmd_index(vaddr) * sizeof(pmd_t); 173 return gpage + pmd_index(vaddr) * sizeof(pmd_t);
173} 174}
174 175
176/* Follow the PMD to the PTE. */
175static unsigned long gpte_addr(struct lg_cpu *cpu, 177static unsigned long gpte_addr(struct lg_cpu *cpu,
176 pmd_t gpmd, unsigned long vaddr) 178 pmd_t gpmd, unsigned long vaddr)
177{ 179{
@@ -181,6 +183,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu,
181 return gpage + pte_index(vaddr) * sizeof(pte_t); 183 return gpage + pte_index(vaddr) * sizeof(pte_t);
182} 184}
183#else 185#else
186/* Follow the PGD to the PTE (no mid-level for !PAE). */
184static unsigned long gpte_addr(struct lg_cpu *cpu, 187static unsigned long gpte_addr(struct lg_cpu *cpu,
185 pgd_t gpgd, unsigned long vaddr) 188 pgd_t gpgd, unsigned long vaddr)
186{ 189{
@@ -314,6 +317,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
314 pte_t gpte; 317 pte_t gpte;
315 pte_t *spte; 318 pte_t *spte;
316 319
320 /* Mid level for PAE. */
317#ifdef CONFIG_X86_PAE 321#ifdef CONFIG_X86_PAE
318 pmd_t *spmd; 322 pmd_t *spmd;
319 pmd_t gpmd; 323 pmd_t gpmd;
@@ -391,6 +395,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
391 */ 395 */
392 gpte_ptr = gpte_addr(cpu, gpgd, vaddr); 396 gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
393#endif 397#endif
398
399 /* Read the actual PTE value. */
394 gpte = lgread(cpu, gpte_ptr, pte_t); 400 gpte = lgread(cpu, gpte_ptr, pte_t);
395 401
396 /* If this page isn't in the Guest page tables, we can't page it in. */ 402 /* If this page isn't in the Guest page tables, we can't page it in. */
@@ -507,6 +513,7 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
507 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) 513 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
508 kill_guest(cpu, "bad stack page %#lx", vaddr); 514 kill_guest(cpu, "bad stack page %#lx", vaddr);
509} 515}
516/*:*/
510 517
511#ifdef CONFIG_X86_PAE 518#ifdef CONFIG_X86_PAE
512static void release_pmd(pmd_t *spmd) 519static void release_pmd(pmd_t *spmd)
@@ -543,7 +550,11 @@ static void release_pgd(pgd_t *spgd)
543} 550}
544 551
545#else /* !CONFIG_X86_PAE */ 552#else /* !CONFIG_X86_PAE */
546/*H:450 If we chase down the release_pgd() code, it looks like this: */ 553/*H:450
554 * If we chase down the release_pgd() code, the non-PAE version looks like
555 * this. The PAE version is almost identical, but instead of calling
556 * release_pte it calls release_pmd(), which looks much like this.
557 */
547static void release_pgd(pgd_t *spgd) 558static void release_pgd(pgd_t *spgd)
548{ 559{
549 /* If the entry's not present, there's nothing to release. */ 560 /* If the entry's not present, there's nothing to release. */
@@ -898,17 +909,21 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
898 /* ... throw it away. */ 909 /* ... throw it away. */
899 release_pgd(lg->pgdirs[pgdir].pgdir + idx); 910 release_pgd(lg->pgdirs[pgdir].pgdir + idx);
900} 911}
912
901#ifdef CONFIG_X86_PAE 913#ifdef CONFIG_X86_PAE
914/* For setting a mid-level, we just throw everything away. It's easy. */
902void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) 915void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
903{ 916{
904 guest_pagetable_clear_all(&lg->cpus[0]); 917 guest_pagetable_clear_all(&lg->cpus[0]);
905} 918}
906#endif 919#endif
907 920
908/* 921/*H:505
909 * Once we know how much memory we have we can construct simple identity (which 922 * To get through boot, we construct simple identity page mappings (which
910 * set virtual == physical) and linear mappings which will get the Guest far 923 * set virtual == physical) and linear mappings which will get the Guest far
911 * enough into the boot to create its own. 924 * enough into the boot to create its own. The linear mapping means we
925 * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET,
926 * as you'll see.
912 * 927 *
913 * We lay them out of the way, just below the initrd (which is why we need to 928 * We lay them out of the way, just below the initrd (which is why we need to
914 * know its size here). 929 * know its size here).
@@ -944,6 +959,10 @@ static unsigned long setup_pagetables(struct lguest *lg,
944 linear = (void *)pgdir - linear_pages * PAGE_SIZE; 959 linear = (void *)pgdir - linear_pages * PAGE_SIZE;
945 960
946#ifdef CONFIG_X86_PAE 961#ifdef CONFIG_X86_PAE
962 /*
963 * And the single mid page goes below that. We only use one, but
964 * that's enough to map 1G, which definitely gets us through boot.
965 */
947 pmds = (void *)linear - PAGE_SIZE; 966 pmds = (void *)linear - PAGE_SIZE;
948#endif 967#endif
949 /* 968 /*
@@ -957,13 +976,14 @@ static unsigned long setup_pagetables(struct lguest *lg,
957 return -EFAULT; 976 return -EFAULT;
958 } 977 }
959 978
979#ifdef CONFIG_X86_PAE
960 /* 980 /*
961 * The top level points to the linear page table pages above. 981 * Make the Guest PMD entries point to the corresponding place in the
962 * We setup the identity and linear mappings here. 982 * linear mapping (up to one page worth of PMD).
963 */ 983 */
964#ifdef CONFIG_X86_PAE
965 for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; 984 for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
966 i += PTRS_PER_PTE, j++) { 985 i += PTRS_PER_PTE, j++) {
986 /* FIXME: native_set_pmd is overkill here. */
967 native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) 987 native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i)
968 - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); 988 - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
969 989
@@ -971,18 +991,36 @@ static unsigned long setup_pagetables(struct lguest *lg,
971 return -EFAULT; 991 return -EFAULT;
972 } 992 }
973 993
994 /* One PGD entry, pointing to that PMD page. */
974 set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); 995 set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT));
996 /* Copy it in as the first PGD entry (ie. addresses 0-1G). */
975 if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) 997 if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
976 return -EFAULT; 998 return -EFAULT;
999 /*
1000 * And the third PGD entry (ie. addresses 3G-4G).
1001 *
1002 * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000.
1003 */
977 if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) 1004 if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0)
978 return -EFAULT; 1005 return -EFAULT;
979#else 1006#else
1007 /*
1008 * The top level points to the linear page table pages above.
1009 * We setup the identity and linear mappings here.
1010 */
980 phys_linear = (unsigned long)linear - mem_base; 1011 phys_linear = (unsigned long)linear - mem_base;
981 for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { 1012 for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
982 pgd_t pgd; 1013 pgd_t pgd;
1014 /*
1015 * Create a PGD entry which points to the right part of the
1016 * linear PTE pages.
1017 */
983 pgd = __pgd((phys_linear + i * sizeof(pte_t)) | 1018 pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
984 (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); 1019 (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
985 1020
1021 /*
1022 * Copy it into the PGD page at 0 and PAGE_OFFSET.
1023 */
986 if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) 1024 if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
987 || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) 1025 || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
988 + i / PTRS_PER_PTE], 1026 + i / PTRS_PER_PTE],
@@ -992,8 +1030,8 @@ static unsigned long setup_pagetables(struct lguest *lg,
992#endif 1030#endif
993 1031
994 /* 1032 /*
995 * We return the top level (guest-physical) address: remember where 1033 * We return the top level (guest-physical) address: we remember where
996 * this is. 1034 * this is to write it into lguest_data when the Guest initializes.
997 */ 1035 */
998 return (unsigned long)pgdir - mem_base; 1036 return (unsigned long)pgdir - mem_base;
999} 1037}
@@ -1031,7 +1069,9 @@ int init_guest_pagetable(struct lguest *lg)
1031 lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); 1069 lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
1032 if (!lg->pgdirs[0].pgdir) 1070 if (!lg->pgdirs[0].pgdir)
1033 return -ENOMEM; 1071 return -ENOMEM;
1072
1034#ifdef CONFIG_X86_PAE 1073#ifdef CONFIG_X86_PAE
1074 /* For PAE, we also create the initial mid-level. */
1035 pgd = lg->pgdirs[0].pgdir; 1075 pgd = lg->pgdirs[0].pgdir;
1036 pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); 1076 pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
1037 if (!pmd_table) 1077 if (!pmd_table)
@@ -1040,11 +1080,13 @@ int init_guest_pagetable(struct lguest *lg)
1040 set_pgd(pgd + SWITCHER_PGD_INDEX, 1080 set_pgd(pgd + SWITCHER_PGD_INDEX,
1041 __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 1081 __pgd(__pa(pmd_table) | _PAGE_PRESENT));
1042#endif 1082#endif
1083
1084 /* This is the current page table. */
1043 lg->cpus[0].cpu_pgd = 0; 1085 lg->cpus[0].cpu_pgd = 0;
1044 return 0; 1086 return 0;
1045} 1087}
1046 1088
1047/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ 1089/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
1048void page_table_guest_data_init(struct lg_cpu *cpu) 1090void page_table_guest_data_init(struct lg_cpu *cpu)
1049{ 1091{
1050 /* We get the kernel address: above this is all kernel memory. */ 1092 /* We get the kernel address: above this is all kernel memory. */
@@ -1105,12 +1147,16 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
1105 pmd_t switcher_pmd; 1147 pmd_t switcher_pmd;
1106 pmd_t *pmd_table; 1148 pmd_t *pmd_table;
1107 1149
1150 /* FIXME: native_set_pmd is overkill here. */
1108 native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> 1151 native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >>
1109 PAGE_SHIFT, PAGE_KERNEL_EXEC)); 1152 PAGE_SHIFT, PAGE_KERNEL_EXEC));
1110 1153
1154 /* Figure out where the pmd page is, by reading the PGD, and converting
1155 * it to a virtual address. */
1111 pmd_table = __va(pgd_pfn(cpu->lg-> 1156 pmd_table = __va(pgd_pfn(cpu->lg->
1112 pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) 1157 pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
1113 << PAGE_SHIFT); 1158 << PAGE_SHIFT);
1159 /* Now write it into the shadow page table. */
1114 native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); 1160 native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
1115#else 1161#else
1116 pgd_t switcher_pgd; 1162 pgd_t switcher_pgd;