Diffstat (limited to 'drivers/lguest/page_tables.c')
-rw-r--r--  drivers/lguest/page_tables.c  250
1 file changed, 136 insertions, 114 deletions
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index b7a924ace684..2a45f0691c9b 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -13,6 +13,7 @@
 #include <linux/random.h>
 #include <linux/percpu.h>
 #include <asm/tlbflush.h>
+#include <asm/uaccess.h>
 #include "lg.h"
 
 /*M:008 We hold references to pages, which prevents them from being swapped.
@@ -44,44 +45,32 @@
  * (vii) Setting up the page tables initially.
  :*/
 
-/* Pages are 4k long, and each page table entry is 4 bytes long, giving us 1024
- * (or 2^10) entries per page. */
-#define PTES_PER_PAGE_SHIFT 10
-#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
 
 /* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is
  * conveniently placed at the top 4MB, so it uses a separate, complete PTE
  * page. */
-#define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1)
+#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
 
 /* We actually need a separate PTE page for each CPU. Remember that after the
  * Switcher code itself comes two pages for each CPU, and we don't want this
  * CPU's guest to see the pages of any other CPU. */
-static DEFINE_PER_CPU(spte_t *, switcher_pte_pages);
+static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
 #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
 
 /*H:320 With our shadow and Guest types established, we need to deal with
  * them: the page table code is curly enough to need helper functions to keep
  * it clear and clean.
  *
- * The first helper takes a virtual address, and says which entry in the top
- * level page table deals with that address. Since each top level entry deals
- * with 4M, this effectively divides by 4M. */
-static unsigned vaddr_to_pgd_index(unsigned long vaddr)
-{
-	return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
-}
-
-/* There are two functions which return pointers to the shadow (aka "real")
+ * There are two functions which return pointers to the shadow (aka "real")
  * page tables.
  *
  * spgd_addr() takes the virtual address and returns a pointer to the top-level
  * page directory entry for that address. Since we keep track of several page
  * tables, the "i" argument tells us which one we're interested in (it's
  * usually the current one). */
-static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
+static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
 {
-	unsigned int index = vaddr_to_pgd_index(vaddr);
+	unsigned int index = pgd_index(vaddr);
 
 	/* We kill any Guest trying to touch the Switcher addresses. */
 	if (index >= SWITCHER_PGD_INDEX) {
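
On 32-bit x86 without PAE, pgd_index() reduces to exactly the shift the deleted vaddr_to_pgd_index() spelled out by hand: each top-level entry maps 4MB, so the index is the address divided by 4MB. A minimal userspace sketch of that arithmetic (the constants are assumptions for the classic two-level 4k-page layout, not pulled from any kernel header):

    #include <stdio.h>

    #define PAGE_SHIFT   12     /* 4k pages */
    #define PGDIR_SHIFT  22     /* each PGD entry maps 4MB */
    #define PTRS_PER_PGD 1024

    /* What pgd_index() works out to in this layout. */
    static unsigned pgd_index_sketch(unsigned long vaddr)
    {
        return (vaddr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
    }

    int main(void)
    {
        /* 0xC0000000 (a common 32-bit kernel base) lands in entry 768;
         * anything in the top 4MB lands in entry 1023, which is
         * SWITCHER_PGD_INDEX -- exactly what spgd_addr() refuses. */
        printf("%u %u\n", pgd_index_sketch(0xC0000000UL),
               pgd_index_sketch(0xFFC00000UL));
        return 0;
    }
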
@@ -95,28 +84,28 @@ static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
 /* This routine then takes the PGD entry given above, which contains the
  * address of the PTE page. It then returns a pointer to the PTE entry for the
  * given address. */
-static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr)
+static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
 {
-	spte_t *page = __va(spgd.pfn << PAGE_SHIFT);
+	pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
 	/* You should never call this if the PGD entry wasn't valid */
-	BUG_ON(!(spgd.flags & _PAGE_PRESENT));
-	return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE];
+	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
+	return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE];
 }
 
-/* These two functions are just like the above two, except they access the
+/* These two functions are just like the above two, except they access the
  * Guest page tables. Hence they return a Guest address. */
 static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr)
 {
-	unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
-	return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t);
+	unsigned int index = vaddr >> (PGDIR_SHIFT);
+	return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t);
 }
 
 static unsigned long gpte_addr(struct lguest *lg,
-			       gpgd_t gpgd, unsigned long vaddr)
+			       pgd_t gpgd, unsigned long vaddr)
 {
-	unsigned long gpage = gpgd.pfn << PAGE_SHIFT;
-	BUG_ON(!(gpgd.flags & _PAGE_PRESENT));
-	return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t);
+	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
+	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
+	return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t);
 }
 
 /*H:350 This routine takes a page number given by the Guest and converts it to
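
The % PTRS_PER_PTE in spte_addr() and gpte_addr() selects the slot within a single PTE page, so together the helpers split a virtual address into three fields. A hedged illustration of the split, using the same assumed 4k/4MB constants as above and an arbitrary example address:

    #include <stdio.h>

    #define PAGE_SHIFT   12
    #define PGDIR_SHIFT  22
    #define PTRS_PER_PTE 1024

    int main(void)
    {
        unsigned long vaddr = 0x08049123UL;  /* arbitrary example address */

        unsigned pgd = vaddr >> PGDIR_SHIFT;                 /* top-level slot */
        unsigned pte = (vaddr >> PAGE_SHIFT) % PTRS_PER_PTE; /* slot in PTE page */
        unsigned off = vaddr & ((1UL << PAGE_SHIFT) - 1);    /* byte in page */

        /* Prints: pgd=32 pte=73 offset=0x123 */
        printf("pgd=%u pte=%u offset=%#x\n", pgd, pte, off);
        return 0;
    }
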
@@ -149,53 +138,55 @@ static unsigned long get_pfn(unsigned long virtpfn, int write)
  * entry can be a little tricky. The flags are (almost) the same, but the
  * Guest PTE contains a virtual page number: the CPU needs the real page
  * number. */
-static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write)
+static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
 {
-	spte_t spte;
-	unsigned long pfn;
+	unsigned long pfn, base, flags;
 
 	/* The Guest sets the global flag, because it thinks that it is using
 	 * PGE. We only told it to use PGE so it would tell us whether it was
 	 * flushing a kernel mapping or a userspace mapping. We don't actually
 	 * use the global bit, so throw it away. */
-	spte.flags = (gpte.flags & ~_PAGE_GLOBAL);
+	flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
+
+	/* The Guest's pages are offset inside the Launcher. */
+	base = (unsigned long)lg->mem_base / PAGE_SIZE;
 
 	/* We need a temporary "unsigned long" variable to hold the answer from
 	 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
 	 * fit in spte.pfn. get_pfn() finds the real physical number of the
 	 * page, given the virtual number. */
-	pfn = get_pfn(gpte.pfn, write);
+	pfn = get_pfn(base + pte_pfn(gpte), write);
 	if (pfn == -1UL) {
-		kill_guest(lg, "failed to get page %u", gpte.pfn);
+		kill_guest(lg, "failed to get page %lu", pte_pfn(gpte));
 		/* When we destroy the Guest, we'll go through the shadow page
 		 * tables and release_pte() them. Make sure we don't think
 		 * this one is valid! */
-		spte.flags = 0;
+		flags = 0;
 	}
-	/* Now we assign the page number, and our shadow PTE is complete. */
-	spte.pfn = pfn;
-	return spte;
+	/* Now we assemble our shadow PTE from the page number and flags. */
+	return pfn_pte(pfn, __pgprot(flags));
 }
 
 /*H:460 And to complete the chain, release_pte() looks like this: */
-static void release_pte(spte_t pte)
+static void release_pte(pte_t pte)
 {
 	/* Remember that get_user_pages() took a reference to the page, in
 	 * get_pfn()? We have to put it back now. */
-	if (pte.flags & _PAGE_PRESENT)
-		put_page(pfn_to_page(pte.pfn));
+	if (pte_flags(pte) & _PAGE_PRESENT)
+		put_page(pfn_to_page(pte_pfn(pte)));
 }
 /*:*/
 
-static void check_gpte(struct lguest *lg, gpte_t gpte)
+static void check_gpte(struct lguest *lg, pte_t gpte)
 {
-	if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit)
+	if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE))
+	    || pte_pfn(gpte) >= lg->pfn_limit)
 		kill_guest(lg, "bad page table entry");
 }
 
-static void check_gpgd(struct lguest *lg, gpgd_t gpgd)
+static void check_gpgd(struct lguest *lg, pgd_t gpgd)
 {
-	if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit)
+	if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit)
 		kill_guest(lg, "bad page directory entry");
 }
 
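
gpte_to_spte() now builds its result with pfn_pte() instead of assigning .pfn and .flags separately; on 32-bit x86 without PAE that composition is just "page number in the high bits, flag bits in the low twelve". A simplified sketch of the idea (the flag values are the real x86 ones, but the pte type here is a stand-in, not the kernel's):

    #include <stdio.h>

    #define PAGE_SHIFT    12
    #define _PAGE_PRESENT 0x001
    #define _PAGE_RW      0x002
    #define _PAGE_GLOBAL  0x100

    typedef struct { unsigned long val; } pte_sketch_t;

    /* Roughly what pfn_pte(pfn, __pgprot(flags)) produces in this layout. */
    static pte_sketch_t pfn_pte_sketch(unsigned long pfn, unsigned long flags)
    {
        return (pte_sketch_t){ (pfn << PAGE_SHIFT) | flags };
    }

    int main(void)
    {
        unsigned long gflags = _PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL;

        /* As in gpte_to_spte(): strip _PAGE_GLOBAL, then rebuild. */
        pte_sketch_t spte = pfn_pte_sketch(0x1234, gflags & ~_PAGE_GLOBAL);
        printf("spte = %#lx\n", spte.val);  /* 0x1234003 */
        return 0;
    }
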
@@ -211,21 +202,21 @@ static void check_gpgd(struct lguest *lg, gpgd_t gpgd)
  * true. */
 int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
 {
-	gpgd_t gpgd;
-	spgd_t *spgd;
+	pgd_t gpgd;
+	pgd_t *spgd;
 	unsigned long gpte_ptr;
-	gpte_t gpte;
-	spte_t *spte;
+	pte_t gpte;
+	pte_t *spte;
 
 	/* First step: get the top-level Guest page table entry. */
-	gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr)));
+	gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
 	/* Toplevel not present? We can't map it in. */
-	if (!(gpgd.flags & _PAGE_PRESENT))
+	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
 		return 0;
 
 	/* Now look at the matching shadow entry. */
 	spgd = spgd_addr(lg, lg->pgdidx, vaddr);
-	if (!(spgd->flags & _PAGE_PRESENT)) {
+	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
 		/* No shadow entry: allocate a new shadow PTE page. */
 		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
 		/* This is not really the Guest's fault, but killing it is
@@ -238,34 +229,35 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
 		check_gpgd(lg, gpgd);
 		/* And we copy the flags to the shadow PGD entry. The page
 		 * number in the shadow PGD is the page we just allocated. */
-		spgd->raw.val = (__pa(ptepage) | gpgd.flags);
+		*spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd));
 	}
 
 	/* OK, now we look at the lower level in the Guest page table: keep its
 	 * address, because we might update it later. */
 	gpte_ptr = gpte_addr(lg, gpgd, vaddr);
-	gpte = mkgpte(lgread_u32(lg, gpte_ptr));
+	gpte = lgread(lg, gpte_ptr, pte_t);
 
 	/* If this page isn't in the Guest page tables, we can't page it in. */
-	if (!(gpte.flags & _PAGE_PRESENT))
+	if (!(pte_flags(gpte) & _PAGE_PRESENT))
 		return 0;
 
 	/* Check they're not trying to write to a page the Guest wants
 	 * read-only (bit 2 of errcode == write). */
-	if ((errcode & 2) && !(gpte.flags & _PAGE_RW))
+	if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
 		return 0;
 
 	/* User access to a kernel page? (bit 3 == user access) */
-	if ((errcode & 4) && !(gpte.flags & _PAGE_USER))
+	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
 		return 0;
 
 	/* Check that the Guest PTE flags are OK, and the page number is below
 	 * the pfn_limit (ie. not mapping the Launcher binary). */
 	check_gpte(lg, gpte);
 	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
-	gpte.flags |= _PAGE_ACCESSED;
+	gpte = pte_mkyoung(gpte);
+
 	if (errcode & 2)
-		gpte.flags |= _PAGE_DIRTY;
+		gpte = pte_mkdirty(gpte);
 
 	/* Get the pointer to the shadow PTE entry we're going to set. */
 	spte = spte_addr(lg, *spgd, vaddr);
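
The errcode tests above follow the x86 page-fault error code: the bit with value 1 means the page was present (a protection fault), value 2 means the access was a write, value 4 means it came from userspace; the comments' "bit 2" and "bit 3" refer to those bits counting from one. A small decoder to make the masks concrete:

    #include <stdio.h>

    /* Decode an x86 page-fault error code the way demand_page() tests it. */
    static void decode_errcode(int errcode)
    {
        printf("%s %s, page %s\n",
               (errcode & 4) ? "user" : "kernel",
               (errcode & 2) ? "write" : "read",
               (errcode & 1) ? "present" : "not present");
    }

    int main(void)
    {
        decode_errcode(6);  /* user write to a missing page */
        decode_errcode(3);  /* kernel write protection fault */
        return 0;
    }
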
@@ -275,21 +267,18 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
 
 	/* If this is a write, we insist that the Guest page is writable (the
 	 * final arg to gpte_to_spte()). */
-	if (gpte.flags & _PAGE_DIRTY)
+	if (pte_dirty(gpte))
 		*spte = gpte_to_spte(lg, gpte, 1);
-	else {
+	else
 		/* If this is a read, don't set the "writable" bit in the page
 		 * table entry, even if the Guest says it's writable. That way
 		 * we come back here when a write does actually occur, so we can
 		 * update the Guest's _PAGE_DIRTY flag. */
-		gpte_t ro_gpte = gpte;
-		ro_gpte.flags &= ~_PAGE_RW;
-		*spte = gpte_to_spte(lg, ro_gpte, 0);
-	}
+		*spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0);
 
 	/* Finally, we write the Guest PTE entry back: we've set the
 	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
-	lgwrite_u32(lg, gpte_ptr, gpte.raw.val);
+	lgwrite(lg, gpte_ptr, pte_t, gpte);
 
 	/* We succeeded in mapping the page! */
 	return 1;
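
The read-path trick above is worth restating: if the Guest PTE isn't dirty yet, the shadow copy is installed with the write bit cleared, so the first real write faults back into demand_page(), which can then set _PAGE_DIRTY. A toy model of that decision (real x86 flag values, stand-in pte type):

    #include <stdio.h>

    #define _PAGE_RW    0x002
    #define _PAGE_DIRTY 0x040

    typedef struct { unsigned long val; } pte_sketch_t;

    static pte_sketch_t wrprotect(pte_sketch_t p) { p.val &= ~_PAGE_RW; return p; }
    static int is_dirty(pte_sketch_t p) { return p.val & _PAGE_DIRTY; }

    int main(void)
    {
        pte_sketch_t gpte = { _PAGE_RW };  /* writable, but not yet dirty */

        /* Mirrors the if (pte_dirty(gpte)) branch above. */
        pte_sketch_t spte = is_dirty(gpte) ? gpte : wrprotect(gpte);
        printf("shadow RW bit: %lu\n", spte.val & _PAGE_RW);  /* 0 */
        return 0;
    }
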
@@ -305,17 +294,18 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
  * mapped by the shadow page tables, and is it writable? */
 static int page_writable(struct lguest *lg, unsigned long vaddr)
 {
-	spgd_t *spgd;
+	pgd_t *spgd;
 	unsigned long flags;
 
 	/* Look at the top level entry: is it present? */
 	spgd = spgd_addr(lg, lg->pgdidx, vaddr);
-	if (!(spgd->flags & _PAGE_PRESENT))
+	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
 		return 0;
 
 	/* Check the flags on the pte entry itself: it must be present and
 	 * writable. */
-	flags = spte_addr(lg, *spgd, vaddr)->flags;
+	flags = pte_flags(*(spte_addr(lg, *spgd, vaddr)));
+
 	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
 }
 
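
page_writable() deliberately compares against the combined mask rather than testing for a nonzero intersection: a present-but-read-only entry overlaps the mask but must still fail. In miniature:

    #include <stdio.h>

    #define _PAGE_PRESENT 0x001
    #define _PAGE_RW      0x002

    /* Both bits must be set, as in page_writable(). */
    static int writable(unsigned long flags)
    {
        return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
    }

    int main(void)
    {
        /* Prints: 1 0 0 */
        printf("%d %d %d\n", writable(_PAGE_PRESENT|_PAGE_RW),
               writable(_PAGE_PRESENT), writable(_PAGE_RW));
        return 0;
    }
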
@@ -329,22 +319,22 @@ void pin_page(struct lguest *lg, unsigned long vaddr)
 }
 
 /*H:450 If we chase down the release_pgd() code, it looks like this: */
-static void release_pgd(struct lguest *lg, spgd_t *spgd)
+static void release_pgd(struct lguest *lg, pgd_t *spgd)
 {
 	/* If the entry's not present, there's nothing to release. */
-	if (spgd->flags & _PAGE_PRESENT) {
+	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
 		unsigned int i;
 		/* Converting the pfn to find the actual PTE page is easy: turn
 		 * the page number into a physical address, then convert to a
 		 * virtual address (easy for kernel pages like this one). */
-		spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT);
+		pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
 		/* For each entry in the page, we might need to release it. */
-		for (i = 0; i < PTES_PER_PAGE; i++)
+		for (i = 0; i < PTRS_PER_PTE; i++)
 			release_pte(ptepage[i]);
 		/* Now we can free the page of PTEs */
 		free_page((long)ptepage);
 		/* And zero out the PGD entry so we never release it twice. */
-		spgd->raw.val = 0;
+		*spgd = __pgd(0);
 	}
 }
 
@@ -356,7 +346,7 @@ static void flush_user_mappings(struct lguest *lg, int idx)
 {
 	unsigned int i;
 	/* Release every pgd entry up to the kernel's address. */
-	for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++)
+	for (i = 0; i < pgd_index(lg->kernel_address); i++)
 		release_pgd(lg, lg->pgdirs[idx].pgdir + i);
 }
 
@@ -369,6 +359,25 @@ void guest_pagetable_flush_user(struct lguest *lg)
 }
 /*:*/
 
+/* We walk down the guest page tables to get a guest-physical address */
+unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
+{
+	pgd_t gpgd;
+	pte_t gpte;
+
+	/* First step: get the top-level Guest page table entry. */
+	gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
+	/* Toplevel not present? We can't map it in. */
+	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
+		kill_guest(lg, "Bad address %#lx", vaddr);
+
+	gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t);
+	if (!(pte_flags(gpte) & _PAGE_PRESENT))
+		kill_guest(lg, "Bad address %#lx", vaddr);
+
+	return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
+}
+
 /* We keep several page tables. This is a simple routine to find the page
  * table (if any) corresponding to this top-level address the Guest has given
  * us. */
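
The return statement of the new guest_pa() pastes the PTE's page number together with the low bits of the original address. Spelled out with assumed 4k constants and made-up numbers:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    int main(void)
    {
        unsigned long vaddr = 0x08049123UL;  /* example Guest virtual address */
        unsigned long pfn   = 0x5678;        /* pretend pte_pfn(gpte) gave us this */

        /* The last line of guest_pa(): page base | offset within the page. */
        unsigned long paddr = pfn * PAGE_SIZE | (vaddr & ~PAGE_MASK);
        printf("guest-physical = %#lx\n", paddr);  /* 0x5678123 */
        return 0;
    }
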
@@ -376,7 +385,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
 {
 	unsigned int i;
 	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-		if (lg->pgdirs[i].cr3 == pgtable)
+		if (lg->pgdirs[i].gpgdir == pgtable)
 			break;
 	return i;
 }
@@ -385,7 +394,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
  * allocate a new one (and so the kernel parts are not there), we set
  * blank_pgdir. */
 static unsigned int new_pgdir(struct lguest *lg,
-			      unsigned long cr3,
+			      unsigned long gpgdir,
 			      int *blank_pgdir)
 {
 	unsigned int next;
@@ -395,7 +404,7 @@ static unsigned int new_pgdir(struct lguest *lg,
 	next = random32() % ARRAY_SIZE(lg->pgdirs);
 	/* If it's never been allocated at all before, try now. */
 	if (!lg->pgdirs[next].pgdir) {
-		lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL);
+		lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
 		/* If the allocation fails, just keep using the one we have */
 		if (!lg->pgdirs[next].pgdir)
 			next = lg->pgdidx;
@@ -405,7 +414,7 @@ static unsigned int new_pgdir(struct lguest *lg,
 		*blank_pgdir = 1;
 	}
 	/* Record which Guest toplevel this shadows. */
-	lg->pgdirs[next].cr3 = cr3;
+	lg->pgdirs[next].gpgdir = gpgdir;
 	/* Release all the non-kernel mappings. */
 	flush_user_mappings(lg, next);
 
@@ -472,26 +481,27 @@ void guest_pagetable_clear_all(struct lguest *lg)
  * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
  */
 static void do_set_pte(struct lguest *lg, int idx,
-		       unsigned long vaddr, gpte_t gpte)
+		       unsigned long vaddr, pte_t gpte)
 {
 	/* Look up the matching shadow page directory entry. */
-	spgd_t *spgd = spgd_addr(lg, idx, vaddr);
+	pgd_t *spgd = spgd_addr(lg, idx, vaddr);
 
 	/* If the top level isn't present, there's no entry to update. */
-	if (spgd->flags & _PAGE_PRESENT) {
+	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
 		/* Otherwise, we start by releasing the existing entry. */
-		spte_t *spte = spte_addr(lg, *spgd, vaddr);
+		pte_t *spte = spte_addr(lg, *spgd, vaddr);
 		release_pte(*spte);
 
 		/* If they're setting this entry as dirty or accessed, we might
 		 * as well put that entry they've given us in now. This shaves
 		 * 10% off a copy-on-write micro-benchmark. */
-		if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
+		if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
 			check_gpte(lg, gpte);
-			*spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY);
+			*spte = gpte_to_spte(lg, gpte,
+					     pte_flags(gpte) & _PAGE_DIRTY);
 		} else
 			/* Otherwise we can demand_page() it in later. */
-			spte->raw.val = 0;
+			*spte = __pte(0);
 	}
 }
 
@@ -506,18 +516,18 @@ static void do_set_pte(struct lguest *lg, int idx,
  * The benefit is that when we have to track a new page table, we can keep
  * all the kernel mappings. This speeds up context switch immensely. */
 void guest_set_pte(struct lguest *lg,
-		   unsigned long cr3, unsigned long vaddr, gpte_t gpte)
+		   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
 {
 	/* Kernel mappings must be changed on all top levels. Slow, but
 	 * doesn't happen often. */
-	if (vaddr >= lg->page_offset) {
+	if (vaddr >= lg->kernel_address) {
 		unsigned int i;
 		for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
 			if (lg->pgdirs[i].pgdir)
 				do_set_pte(lg, i, vaddr, gpte);
 	} else {
 		/* Is this page table one we have a shadow for? */
-		int pgdir = find_pgdir(lg, cr3);
+		int pgdir = find_pgdir(lg, gpgdir);
 		if (pgdir != ARRAY_SIZE(lg->pgdirs))
 			/* If so, do the update. */
 			do_set_pte(lg, pgdir, vaddr, gpte);
@@ -538,7 +548,7 @@ void guest_set_pte(struct lguest *lg,
  *
  * So with that in mind here's our code to update a (top-level) PGD entry:
  */
-void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
+void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 {
 	int pgdir;
 
@@ -548,7 +558,7 @@ void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
 		return;
 
 	/* If they're talking about a page table we have a shadow for... */
-	pgdir = find_pgdir(lg, cr3);
+	pgdir = find_pgdir(lg, gpgdir);
 	if (pgdir < ARRAY_SIZE(lg->pgdirs))
 		/* ... throw it away. */
 		release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
@@ -560,21 +570,34 @@ void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
  * its first page table is. We set some things up here: */
 int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
 {
-	/* In flush_user_mappings() we loop from 0 to
-	 * "vaddr_to_pgd_index(lg->page_offset)". This assumes it won't hit
-	 * the Switcher mappings, so check that now. */
-	if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX)
-		return -EINVAL;
 	/* We start on the first shadow page table, and give it a blank PGD
 	 * page. */
 	lg->pgdidx = 0;
-	lg->pgdirs[lg->pgdidx].cr3 = pgtable;
-	lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL);
+	lg->pgdirs[lg->pgdidx].gpgdir = pgtable;
+	lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL);
 	if (!lg->pgdirs[lg->pgdidx].pgdir)
 		return -ENOMEM;
 	return 0;
 }
 
+/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
+void page_table_guest_data_init(struct lguest *lg)
+{
+	/* We get the kernel address: above this is all kernel memory. */
+	if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address)
+	    /* We tell the Guest that it can't use the top 4MB of virtual
+	     * addresses used by the Switcher. */
+	    || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
+	    || put_user(lg->pgdirs[lg->pgdidx].gpgdir, &lg->lguest_data->pgdir))
+		kill_guest(lg, "bad guest page %p", lg->lguest_data);
+
+	/* In flush_user_mappings() we loop from 0 to
+	 * "pgd_index(lg->kernel_address)". This assumes it won't hit the
+	 * Switcher mappings, so check that now. */
+	if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX)
+		kill_guest(lg, "bad kernel address %#lx", lg->kernel_address);
+}
+
 /* When a Guest dies, our cleanup is fairly simple. */
 void free_guest_pagetable(struct lguest *lg)
 {
@@ -594,14 +617,14 @@ void free_guest_pagetable(struct lguest *lg)
  * for each CPU already set up, we just need to hook them in. */
 void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
 {
-	spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
-	spgd_t switcher_pgd;
-	spte_t regs_pte;
+	pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
+	pgd_t switcher_pgd;
+	pte_t regs_pte;
 
 	/* Make the last PGD entry for this Guest point to the Switcher's PTE
 	 * page for this CPU (with appropriate flags). */
-	switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT;
-	switcher_pgd.flags = _PAGE_KERNEL;
+	switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL);
+
 	lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
 
 	/* We also change the Switcher PTE page. When we're running the Guest,
@@ -611,10 +634,8 @@ void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
 	 * CPU's "struct lguest_pages": if we make sure the Guest's register
 	 * page is already mapped there, we don't have to copy them out
 	 * again. */
-	regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT;
-	regs_pte.flags = _PAGE_KERNEL;
-	switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE]
-		= regs_pte;
+	regs_pte = pfn_pte(__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL));
+	switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte;
 }
 /*:*/
 
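
The index expression (unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE picks which slot of the Switcher's PTE page covers this CPU's "struct lguest_pages"; since the Switcher owns the top 4MB, only the low ten bits of the page number matter. With an assumed, made-up address:

    #include <stdio.h>

    #define PAGE_SIZE    4096UL
    #define PTRS_PER_PTE 1024

    int main(void)
    {
        /* Hypothetical address of this CPU's lguest_pages, somewhere
         * inside the Switcher's top-4MB mapping. */
        unsigned long pages = 0xFFC06000UL;

        /* Same arithmetic as map_switcher_in_guest(). Prints: slot 6 */
        printf("slot %lu\n", pages / PAGE_SIZE % PTRS_PER_PTE);
        return 0;
    }
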
@@ -635,24 +656,25 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
 					       unsigned int pages)
 {
 	unsigned int i;
-	spte_t *pte = switcher_pte_page(cpu);
+	pte_t *pte = switcher_pte_page(cpu);
 
 	/* The first entries are easy: they map the Switcher code. */
 	for (i = 0; i < pages; i++) {
-		pte[i].pfn = page_to_pfn(switcher_page[i]);
-		pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
+		pte[i] = mk_pte(switcher_page[i],
+				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
 	}
 
 	/* The only other thing we map is this CPU's pair of pages. */
 	i = pages + cpu*2;
 
 	/* First page (Guest registers) is writable from the Guest */
-	pte[i].pfn = page_to_pfn(switcher_page[i]);
-	pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW;
+	pte[i] = pfn_pte(page_to_pfn(switcher_page[i]),
+			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW));
+
 	/* The second page contains the "struct lguest_ro_state", and is
 	 * read-only. */
-	pte[i+1].pfn = page_to_pfn(switcher_page[i+1]);
-	pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
+	pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]),
+			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
 }
 
@@ -662,7 +684,7 @@ __init int init_pagetables(struct page **switcher_page, unsigned int pages)
 	unsigned int i;
 
 	for_each_possible_cpu(i) {
-		switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL);
+		switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
 		if (!switcher_pte_page(i)) {
 			free_switcher_pte_pages();
 			return -ENOMEM;