about | summary | refs | log | tree | commit | diff | stats
path: root/drivers/lguest/page_tables.c
diff options
context:
space:
mode:
author Rusty Russell <rusty@rustcorp.com.au> 2009-07-30 18:03:45 -0400
committer Rusty Russell <rusty@rustcorp.com.au> 2009-07-30 02:33:45 -0400
commit 2e04ef76916d1e29a077ea9d0f2003c8fd86724d (patch)
tree 2ff8d625d6e467be9f9f1b67a3674cb6e125e970 /drivers/lguest/page_tables.c
parent e969fed542cae08cb11d666efac4f7c5d624d09f (diff)
lguest: fix comment style
I don't really notice it (except to begrudge the extra vertical space), but Ingo does. And he pointed out that one excuse of lguest is as a teaching tool, it should set a good example.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@redhat.com>
Diffstat (limited to 'drivers/lguest/page_tables.c')
-rw-r--r-- drivers/lguest/page_tables.c 427
1 files changed, 282 insertions, 145 deletions
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index a6fe1abda240..3da902e4b4cb 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -1,9 +1,11 @@
1/*P:700 The pagetable code, on the other hand, still shows the scars of 1/*P:700
2 * The pagetable code, on the other hand, still shows the scars of
2 * previous encounters. It's functional, and as neat as it can be in the 3 * previous encounters. It's functional, and as neat as it can be in the
3 * circumstances, but be wary, for these things are subtle and break easily. 4 * circumstances, but be wary, for these things are subtle and break easily.
4 * The Guest provides a virtual to physical mapping, but we can neither trust 5 * The Guest provides a virtual to physical mapping, but we can neither trust
5 * it nor use it: we verify and convert it here then point the CPU to the 6 * it nor use it: we verify and convert it here then point the CPU to the
6 * converted Guest pages when running the Guest. :*/ 7 * converted Guest pages when running the Guest.
8:*/
7 9
8/* Copyright (C) Rusty Russell IBM Corporation 2006. 10/* Copyright (C) Rusty Russell IBM Corporation 2006.
9 * GPL v2 and any later version */ 11 * GPL v2 and any later version */
@@ -17,10 +19,12 @@
17#include <asm/bootparam.h> 19#include <asm/bootparam.h>
18#include "lg.h" 20#include "lg.h"
19 21
20/*M:008 We hold reference to pages, which prevents them from being swapped. 22/*M:008
23 * We hold reference to pages, which prevents them from being swapped.
21 * It'd be nice to have a callback in the "struct mm_struct" when Linux wants 24 * It'd be nice to have a callback in the "struct mm_struct" when Linux wants
22 * to swap out. If we had this, and a shrinker callback to trim PTE pages, we 25 * to swap out. If we had this, and a shrinker callback to trim PTE pages, we
23 * could probably consider launching Guests as non-root. :*/ 26 * could probably consider launching Guests as non-root.
27:*/
24 28
25/*H:300 29/*H:300
26 * The Page Table Code 30 * The Page Table Code
@@ -45,16 +49,19 @@
45 * (v) Flushing (throwing away) page tables, 49 * (v) Flushing (throwing away) page tables,
46 * (vi) Mapping the Switcher when the Guest is about to run, 50 * (vi) Mapping the Switcher when the Guest is about to run,
47 * (vii) Setting up the page tables initially. 51 * (vii) Setting up the page tables initially.
48 :*/ 52:*/
49 53
50 54/*
51/* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is 55 * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is
52 * conveniently placed at the top 4MB, so it uses a separate, complete PTE 56 * conveniently placed at the top 4MB, so it uses a separate, complete PTE
53 * page. */ 57 * page.
58 */
54#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) 59#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
55 60
56/* For PAE we need the PMD index as well. We use the last 2MB, so we 61/*
57 * will need the last pmd entry of the last pmd page. */ 62 * For PAE we need the PMD index as well. We use the last 2MB, so we
63 * will need the last pmd entry of the last pmd page.
64 */
58#ifdef CONFIG_X86_PAE 65#ifdef CONFIG_X86_PAE
59#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) 66#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1)
60#define RESERVE_MEM 2U 67#define RESERVE_MEM 2U
@@ -64,13 +71,16 @@
64#define CHECK_GPGD_MASK _PAGE_TABLE 71#define CHECK_GPGD_MASK _PAGE_TABLE
65#endif 72#endif
66 73
67/* We actually need a separate PTE page for each CPU. Remember that after the 74/*
75 * We actually need a separate PTE page for each CPU. Remember that after the
68 * Switcher code itself comes two pages for each CPU, and we don't want this 76 * Switcher code itself comes two pages for each CPU, and we don't want this
69 * CPU's guest to see the pages of any other CPU. */ 77 * CPU's guest to see the pages of any other CPU.
78 */
70static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); 79static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
71#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) 80#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
72 81
73/*H:320 The page table code is curly enough to need helper functions to keep it 82/*H:320
83 * The page table code is curly enough to need helper functions to keep it
74 * clear and clean. 84 * clear and clean.
75 * 85 *
76 * There are two functions which return pointers to the shadow (aka "real") 86 * There are two functions which return pointers to the shadow (aka "real")
@@ -79,7 +89,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
79 * spgd_addr() takes the virtual address and returns a pointer to the top-level 89 * spgd_addr() takes the virtual address and returns a pointer to the top-level
80 * page directory entry (PGD) for that address. Since we keep track of several 90 * page directory entry (PGD) for that address. Since we keep track of several
81 * page tables, the "i" argument tells us which one we're interested in (it's 91 * page tables, the "i" argument tells us which one we're interested in (it's
82 * usually the current one). */ 92 * usually the current one).
93 */
83static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) 94static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
84{ 95{
85 unsigned int index = pgd_index(vaddr); 96 unsigned int index = pgd_index(vaddr);
@@ -96,9 +107,11 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
96} 107}
97 108
98#ifdef CONFIG_X86_PAE 109#ifdef CONFIG_X86_PAE
99/* This routine then takes the PGD entry given above, which contains the 110/*
111 * This routine then takes the PGD entry given above, which contains the
100 * address of the PMD page. It then returns a pointer to the PMD entry for the 112 * address of the PMD page. It then returns a pointer to the PMD entry for the
101 * given address. */ 113 * given address.
114 */
102static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) 115static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
103{ 116{
104 unsigned int index = pmd_index(vaddr); 117 unsigned int index = pmd_index(vaddr);
@@ -119,9 +132,11 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
119} 132}
120#endif 133#endif
121 134
122/* This routine then takes the page directory entry returned above, which 135/*
136 * This routine then takes the page directory entry returned above, which
123 * contains the address of the page table entry (PTE) page. It then returns a 137 * contains the address of the page table entry (PTE) page. It then returns a
124 * pointer to the PTE entry for the given address. */ 138 * pointer to the PTE entry for the given address.
139 */
125static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) 140static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
126{ 141{
127#ifdef CONFIG_X86_PAE 142#ifdef CONFIG_X86_PAE
@@ -139,8 +154,10 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
139 return &page[pte_index(vaddr)]; 154 return &page[pte_index(vaddr)];
140} 155}
141 156
142/* These two functions just like the above two, except they access the Guest 157/*
143 * page tables. Hence they return a Guest address. */ 158 * These two functions just like the above two, except they access the Guest
159 * page tables. Hence they return a Guest address.
160 */
144static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) 161static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
145{ 162{
146 unsigned int index = vaddr >> (PGDIR_SHIFT); 163 unsigned int index = vaddr >> (PGDIR_SHIFT);
@@ -175,17 +192,21 @@ static unsigned long gpte_addr(struct lg_cpu *cpu,
175#endif 192#endif
176/*:*/ 193/*:*/
177 194
178/*M:014 get_pfn is slow: we could probably try to grab batches of pages here as 195/*M:014
179 * an optimization (ie. pre-faulting). :*/ 196 * get_pfn is slow: we could probably try to grab batches of pages here as
197 * an optimization (ie. pre-faulting).
198:*/
180 199
181/*H:350 This routine takes a page number given by the Guest and converts it to 200/*H:350
201 * This routine takes a page number given by the Guest and converts it to
182 * an actual, physical page number. It can fail for several reasons: the 202 * an actual, physical page number. It can fail for several reasons: the
183 * virtual address might not be mapped by the Launcher, the write flag is set 203 * virtual address might not be mapped by the Launcher, the write flag is set
184 * and the page is read-only, or the write flag was set and the page was 204 * and the page is read-only, or the write flag was set and the page was
185 * shared so had to be copied, but we ran out of memory. 205 * shared so had to be copied, but we ran out of memory.
186 * 206 *
187 * This holds a reference to the page, so release_pte() is careful to put that 207 * This holds a reference to the page, so release_pte() is careful to put that
188 * back. */ 208 * back.
209 */
189static unsigned long get_pfn(unsigned long virtpfn, int write) 210static unsigned long get_pfn(unsigned long virtpfn, int write)
190{ 211{
191 struct page *page; 212 struct page *page;
@@ -198,33 +219,41 @@ static unsigned long get_pfn(unsigned long virtpfn, int write)
198 return -1UL; 219 return -1UL;
199} 220}
200 221
201/*H:340 Converting a Guest page table entry to a shadow (ie. real) page table 222/*H:340
223 * Converting a Guest page table entry to a shadow (ie. real) page table
202 * entry can be a little tricky. The flags are (almost) the same, but the 224 * entry can be a little tricky. The flags are (almost) the same, but the
203 * Guest PTE contains a virtual page number: the CPU needs the real page 225 * Guest PTE contains a virtual page number: the CPU needs the real page
204 * number. */ 226 * number.
227 */
205static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) 228static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
206{ 229{
207 unsigned long pfn, base, flags; 230 unsigned long pfn, base, flags;
208 231
209 /* The Guest sets the global flag, because it thinks that it is using 232 /*
233 * The Guest sets the global flag, because it thinks that it is using
210 * PGE. We only told it to use PGE so it would tell us whether it was 234 * PGE. We only told it to use PGE so it would tell us whether it was
211 * flushing a kernel mapping or a userspace mapping. We don't actually 235 * flushing a kernel mapping or a userspace mapping. We don't actually
212 * use the global bit, so throw it away. */ 236 * use the global bit, so throw it away.
237 */
213 flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); 238 flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
214 239
215 /* The Guest's pages are offset inside the Launcher. */ 240 /* The Guest's pages are offset inside the Launcher. */
216 base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; 241 base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;
217 242
218 /* We need a temporary "unsigned long" variable to hold the answer from 243 /*
244 * We need a temporary "unsigned long" variable to hold the answer from
219 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't 245 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
220 * fit in spte.pfn. get_pfn() finds the real physical number of the 246 * fit in spte.pfn. get_pfn() finds the real physical number of the
221 * page, given the virtual number. */ 247 * page, given the virtual number.
248 */
222 pfn = get_pfn(base + pte_pfn(gpte), write); 249 pfn = get_pfn(base + pte_pfn(gpte), write);
223 if (pfn == -1UL) { 250 if (pfn == -1UL) {
224 kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); 251 kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
225 /* When we destroy the Guest, we'll go through the shadow page 252 /*
253 * When we destroy the Guest, we'll go through the shadow page
226 * tables and release_pte() them. Make sure we don't think 254 * tables and release_pte() them. Make sure we don't think
227 * this one is valid! */ 255 * this one is valid!
256 */
228 flags = 0; 257 flags = 0;
229 } 258 }
230 /* Now we assemble our shadow PTE from the page number and flags. */ 259 /* Now we assemble our shadow PTE from the page number and flags. */
@@ -234,8 +263,10 @@ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
234/*H:460 And to complete the chain, release_pte() looks like this: */ 263/*H:460 And to complete the chain, release_pte() looks like this: */
235static void release_pte(pte_t pte) 264static void release_pte(pte_t pte)
236{ 265{
237 /* Remember that get_user_pages_fast() took a reference to the page, in 266 /*
238 * get_pfn()? We have to put it back now. */ 267 * Remember that get_user_pages_fast() took a reference to the page, in
268 * get_pfn()? We have to put it back now.
269 */
239 if (pte_flags(pte) & _PAGE_PRESENT) 270 if (pte_flags(pte) & _PAGE_PRESENT)
240 put_page(pte_page(pte)); 271 put_page(pte_page(pte));
241} 272}
@@ -273,7 +304,8 @@ static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
273 * and return to the Guest without it knowing. 304 * and return to the Guest without it knowing.
274 * 305 *
275 * If we fixed up the fault (ie. we mapped the address), this routine returns 306 * If we fixed up the fault (ie. we mapped the address), this routine returns
276 * true. Otherwise, it was a real fault and we need to tell the Guest. */ 307 * true. Otherwise, it was a real fault and we need to tell the Guest.
308 */
277bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) 309bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
278{ 310{
279 pgd_t gpgd; 311 pgd_t gpgd;
@@ -298,22 +330,26 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
298 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { 330 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
299 /* No shadow entry: allocate a new shadow PTE page. */ 331 /* No shadow entry: allocate a new shadow PTE page. */
300 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 332 unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
301 /* This is not really the Guest's fault, but killing it is 333 /*
302 * simple for this corner case. */ 334 * This is not really the Guest's fault, but killing it is
335 * simple for this corner case.
336 */
303 if (!ptepage) { 337 if (!ptepage) {
304 kill_guest(cpu, "out of memory allocating pte page"); 338 kill_guest(cpu, "out of memory allocating pte page");
305 return false; 339 return false;
306 } 340 }
307 /* We check that the Guest pgd is OK. */ 341 /* We check that the Guest pgd is OK. */
308 check_gpgd(cpu, gpgd); 342 check_gpgd(cpu, gpgd);
309 /* And we copy the flags to the shadow PGD entry. The page 343 /*
310 * number in the shadow PGD is the page we just allocated. */ 344 * And we copy the flags to the shadow PGD entry. The page
345 * number in the shadow PGD is the page we just allocated.
346 */
311 set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); 347 set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
312 } 348 }
313 349
314#ifdef CONFIG_X86_PAE 350#ifdef CONFIG_X86_PAE
315 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); 351 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
316 /* middle level not present? We can't map it in. */ 352 /* Middle level not present? We can't map it in. */
317 if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) 353 if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
318 return false; 354 return false;
319 355
@@ -324,8 +360,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
324 /* No shadow entry: allocate a new shadow PTE page. */ 360 /* No shadow entry: allocate a new shadow PTE page. */
325 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 361 unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
326 362
327 /* This is not really the Guest's fault, but killing it is 363 /*
328 * simple for this corner case. */ 364 * This is not really the Guest's fault, but killing it is
365 * simple for this corner case.
366 */
329 if (!ptepage) { 367 if (!ptepage) {
330 kill_guest(cpu, "out of memory allocating pte page"); 368 kill_guest(cpu, "out of memory allocating pte page");
331 return false; 369 return false;
@@ -334,17 +372,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
334 /* We check that the Guest pmd is OK. */ 372 /* We check that the Guest pmd is OK. */
335 check_gpmd(cpu, gpmd); 373 check_gpmd(cpu, gpmd);
336 374
337 /* And we copy the flags to the shadow PMD entry. The page 375 /*
338 * number in the shadow PMD is the page we just allocated. */ 376 * And we copy the flags to the shadow PMD entry. The page
377 * number in the shadow PMD is the page we just allocated.
378 */
339 native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); 379 native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
340 } 380 }
341 381
342 /* OK, now we look at the lower level in the Guest page table: keep its 382 /*
343 * address, because we might update it later. */ 383 * OK, now we look at the lower level in the Guest page table: keep its
384 * address, because we might update it later.
385 */
344 gpte_ptr = gpte_addr(cpu, gpmd, vaddr); 386 gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
345#else 387#else
346 /* OK, now we look at the lower level in the Guest page table: keep its 388 /*
347 * address, because we might update it later. */ 389 * OK, now we look at the lower level in the Guest page table: keep its
390 * address, because we might update it later.
391 */
348 gpte_ptr = gpte_addr(cpu, gpgd, vaddr); 392 gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
349#endif 393#endif
350 gpte = lgread(cpu, gpte_ptr, pte_t); 394 gpte = lgread(cpu, gpte_ptr, pte_t);
@@ -353,8 +397,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
353 if (!(pte_flags(gpte) & _PAGE_PRESENT)) 397 if (!(pte_flags(gpte) & _PAGE_PRESENT))
354 return false; 398 return false;
355 399
356 /* Check they're not trying to write to a page the Guest wants 400 /*
357 * read-only (bit 2 of errcode == write). */ 401 * Check they're not trying to write to a page the Guest wants
402 * read-only (bit 2 of errcode == write).
403 */
358 if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) 404 if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
359 return false; 405 return false;
360 406
@@ -362,8 +408,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
362 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) 408 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
363 return false; 409 return false;
364 410
365 /* Check that the Guest PTE flags are OK, and the page number is below 411 /*
366 * the pfn_limit (ie. not mapping the Launcher binary). */ 412 * Check that the Guest PTE flags are OK, and the page number is below
413 * the pfn_limit (ie. not mapping the Launcher binary).
414 */
367 check_gpte(cpu, gpte); 415 check_gpte(cpu, gpte);
368 416
369 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ 417 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
@@ -373,29 +421,40 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
373 421
374 /* Get the pointer to the shadow PTE entry we're going to set. */ 422 /* Get the pointer to the shadow PTE entry we're going to set. */
375 spte = spte_addr(cpu, *spgd, vaddr); 423 spte = spte_addr(cpu, *spgd, vaddr);
376 /* If there was a valid shadow PTE entry here before, we release it. 424
377 * This can happen with a write to a previously read-only entry. */ 425 /*
426 * If there was a valid shadow PTE entry here before, we release it.
427 * This can happen with a write to a previously read-only entry.
428 */
378 release_pte(*spte); 429 release_pte(*spte);
379 430
380 /* If this is a write, we insist that the Guest page is writable (the 431 /*
381 * final arg to gpte_to_spte()). */ 432 * If this is a write, we insist that the Guest page is writable (the
433 * final arg to gpte_to_spte()).
434 */
382 if (pte_dirty(gpte)) 435 if (pte_dirty(gpte))
383 *spte = gpte_to_spte(cpu, gpte, 1); 436 *spte = gpte_to_spte(cpu, gpte, 1);
384 else 437 else
385 /* If this is a read, don't set the "writable" bit in the page 438 /*
439 * If this is a read, don't set the "writable" bit in the page
386 * table entry, even if the Guest says it's writable. That way 440 * table entry, even if the Guest says it's writable. That way
387 * we will come back here when a write does actually occur, so 441 * we will come back here when a write does actually occur, so
388 * we can update the Guest's _PAGE_DIRTY flag. */ 442 * we can update the Guest's _PAGE_DIRTY flag.
443 */
389 native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); 444 native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));
390 445
391 /* Finally, we write the Guest PTE entry back: we've set the 446 /*
392 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ 447 * Finally, we write the Guest PTE entry back: we've set the
448 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
449 */
393 lgwrite(cpu, gpte_ptr, pte_t, gpte); 450 lgwrite(cpu, gpte_ptr, pte_t, gpte);
394 451
395 /* The fault is fixed, the page table is populated, the mapping 452 /*
453 * The fault is fixed, the page table is populated, the mapping
396 * manipulated, the result returned and the code complete. A small 454 * manipulated, the result returned and the code complete. A small
397 * delay and a trace of alliteration are the only indications the Guest 455 * delay and a trace of alliteration are the only indications the Guest
398 * has that a page fault occurred at all. */ 456 * has that a page fault occurred at all.
457 */
399 return true; 458 return true;
400} 459}
401 460
@@ -408,7 +467,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
408 * mapped, so it's overkill. 467 * mapped, so it's overkill.
409 * 468 *
410 * This is a quick version which answers the question: is this virtual address 469 * This is a quick version which answers the question: is this virtual address
411 * mapped by the shadow page tables, and is it writable? */ 470 * mapped by the shadow page tables, and is it writable?
471 */
412static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) 472static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
413{ 473{
414 pgd_t *spgd; 474 pgd_t *spgd;
@@ -428,16 +488,20 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
428 return false; 488 return false;
429#endif 489#endif
430 490
431 /* Check the flags on the pte entry itself: it must be present and 491 /*
432 * writable. */ 492 * Check the flags on the pte entry itself: it must be present and
493 * writable.
494 */
433 flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); 495 flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));
434 496
435 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 497 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
436} 498}
437 499
438/* So, when pin_stack_pages() asks us to pin a page, we check if it's already 500/*
501 * So, when pin_stack_pages() asks us to pin a page, we check if it's already
439 * in the page tables, and if not, we call demand_page() with error code 2 502 * in the page tables, and if not, we call demand_page() with error code 2
440 * (meaning "write"). */ 503 * (meaning "write").
504 */
441void pin_page(struct lg_cpu *cpu, unsigned long vaddr) 505void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
442{ 506{
443 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) 507 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
@@ -485,9 +549,11 @@ static void release_pgd(pgd_t *spgd)
485 /* If the entry's not present, there's nothing to release. */ 549 /* If the entry's not present, there's nothing to release. */
486 if (pgd_flags(*spgd) & _PAGE_PRESENT) { 550 if (pgd_flags(*spgd) & _PAGE_PRESENT) {
487 unsigned int i; 551 unsigned int i;
488 /* Converting the pfn to find the actual PTE page is easy: turn 552 /*
553 * Converting the pfn to find the actual PTE page is easy: turn
489 * the page number into a physical address, then convert to a 554 * the page number into a physical address, then convert to a
490 * virtual address (easy for kernel pages like this one). */ 555 * virtual address (easy for kernel pages like this one).
556 */
491 pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 557 pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
492 /* For each entry in the page, we might need to release it. */ 558 /* For each entry in the page, we might need to release it. */
493 for (i = 0; i < PTRS_PER_PTE; i++) 559 for (i = 0; i < PTRS_PER_PTE; i++)
@@ -499,9 +565,12 @@ static void release_pgd(pgd_t *spgd)
499 } 565 }
500} 566}
501#endif 567#endif
502/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() 568
569/*H:445
570 * We saw flush_user_mappings() twice: once from the flush_user_mappings()
503 * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. 571 * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
504 * It simply releases every PTE page from 0 up to the Guest's kernel address. */ 572 * It simply releases every PTE page from 0 up to the Guest's kernel address.
573 */
505static void flush_user_mappings(struct lguest *lg, int idx) 574static void flush_user_mappings(struct lguest *lg, int idx)
506{ 575{
507 unsigned int i; 576 unsigned int i;
@@ -510,10 +579,12 @@ static void flush_user_mappings(struct lguest *lg, int idx)
510 release_pgd(lg->pgdirs[idx].pgdir + i); 579 release_pgd(lg->pgdirs[idx].pgdir + i);
511} 580}
512 581
513/*H:440 (v) Flushing (throwing away) page tables, 582/*H:440
583 * (v) Flushing (throwing away) page tables,
514 * 584 *
515 * The Guest has a hypercall to throw away the page tables: it's used when a 585 * The Guest has a hypercall to throw away the page tables: it's used when a
516 * large number of mappings have been changed. */ 586 * large number of mappings have been changed.
587 */
517void guest_pagetable_flush_user(struct lg_cpu *cpu) 588void guest_pagetable_flush_user(struct lg_cpu *cpu)
518{ 589{
519 /* Drop the userspace part of the current page table. */ 590 /* Drop the userspace part of the current page table. */
@@ -551,9 +622,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
551 return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); 622 return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
552} 623}
553 624
554/* We keep several page tables. This is a simple routine to find the page 625/*
626 * We keep several page tables. This is a simple routine to find the page
555 * table (if any) corresponding to this top-level address the Guest has given 627 * table (if any) corresponding to this top-level address the Guest has given
556 * us. */ 628 * us.
629 */
557static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) 630static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
558{ 631{
559 unsigned int i; 632 unsigned int i;
@@ -563,9 +636,11 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
563 return i; 636 return i;
564} 637}
565 638
566/*H:435 And this is us, creating the new page directory. If we really do 639/*H:435
640 * And this is us, creating the new page directory. If we really do
567 * allocate a new one (and so the kernel parts are not there), we set 641 * allocate a new one (and so the kernel parts are not there), we set
568 * blank_pgdir. */ 642 * blank_pgdir.
643 */
569static unsigned int new_pgdir(struct lg_cpu *cpu, 644static unsigned int new_pgdir(struct lg_cpu *cpu,
570 unsigned long gpgdir, 645 unsigned long gpgdir,
571 int *blank_pgdir) 646 int *blank_pgdir)
@@ -575,8 +650,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
575 pmd_t *pmd_table; 650 pmd_t *pmd_table;
576#endif 651#endif
577 652
578 /* We pick one entry at random to throw out. Choosing the Least 653 /*
579 * Recently Used might be better, but this is easy. */ 654 * We pick one entry at random to throw out. Choosing the Least
655 * Recently Used might be better, but this is easy.
656 */
580 next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); 657 next = random32() % ARRAY_SIZE(cpu->lg->pgdirs);
581 /* If it's never been allocated at all before, try now. */ 658 /* If it's never been allocated at all before, try now. */
582 if (!cpu->lg->pgdirs[next].pgdir) { 659 if (!cpu->lg->pgdirs[next].pgdir) {
@@ -587,8 +664,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
587 next = cpu->cpu_pgd; 664 next = cpu->cpu_pgd;
588 else { 665 else {
589#ifdef CONFIG_X86_PAE 666#ifdef CONFIG_X86_PAE
590 /* In PAE mode, allocate a pmd page and populate the 667 /*
591 * last pgd entry. */ 668 * In PAE mode, allocate a pmd page and populate the
669 * last pgd entry.
670 */
592 pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); 671 pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
593 if (!pmd_table) { 672 if (!pmd_table) {
594 free_page((long)cpu->lg->pgdirs[next].pgdir); 673 free_page((long)cpu->lg->pgdirs[next].pgdir);
@@ -598,8 +677,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
598 set_pgd(cpu->lg->pgdirs[next].pgdir + 677 set_pgd(cpu->lg->pgdirs[next].pgdir +
599 SWITCHER_PGD_INDEX, 678 SWITCHER_PGD_INDEX,
600 __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 679 __pgd(__pa(pmd_table) | _PAGE_PRESENT));
601 /* This is a blank page, so there are no kernel 680 /*
602 * mappings: caller must map the stack! */ 681 * This is a blank page, so there are no kernel
682 * mappings: caller must map the stack!
683 */
603 *blank_pgdir = 1; 684 *blank_pgdir = 1;
604 } 685 }
605#else 686#else
@@ -615,19 +696,23 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
615 return next; 696 return next;
616} 697}
617 698
618/*H:430 (iv) Switching page tables 699/*H:430
700 * (iv) Switching page tables
619 * 701 *
620 * Now we've seen all the page table setting and manipulation, let's see 702 * Now we've seen all the page table setting and manipulation, let's see
621 * what happens when the Guest changes page tables (ie. changes the top-level 703 * what happens when the Guest changes page tables (ie. changes the top-level
622 * pgdir). This occurs on almost every context switch. */ 704 * pgdir). This occurs on almost every context switch.
705 */
623void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) 706void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
624{ 707{
625 int newpgdir, repin = 0; 708 int newpgdir, repin = 0;
626 709
627 /* Look to see if we have this one already. */ 710 /* Look to see if we have this one already. */
628 newpgdir = find_pgdir(cpu->lg, pgtable); 711 newpgdir = find_pgdir(cpu->lg, pgtable);
629 /* If not, we allocate or mug an existing one: if it's a fresh one, 712 /*
630 * repin gets set to 1. */ 713 * If not, we allocate or mug an existing one: if it's a fresh one,
714 * repin gets set to 1.
715 */
631 if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) 716 if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
632 newpgdir = new_pgdir(cpu, pgtable, &repin); 717 newpgdir = new_pgdir(cpu, pgtable, &repin);
633 /* Change the current pgd index to the new one. */ 718 /* Change the current pgd index to the new one. */
@@ -637,9 +722,11 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
637 pin_stack_pages(cpu); 722 pin_stack_pages(cpu);
638} 723}
639 724
640/*H:470 Finally, a routine which throws away everything: all PGD entries in all 725/*H:470
726 * Finally, a routine which throws away everything: all PGD entries in all
641 * the shadow page tables, including the Guest's kernel mappings. This is used 727 * the shadow page tables, including the Guest's kernel mappings. This is used
642 * when we destroy the Guest. */ 728 * when we destroy the Guest.
729 */
643static void release_all_pagetables(struct lguest *lg) 730static void release_all_pagetables(struct lguest *lg)
644{ 731{
645 unsigned int i, j; 732 unsigned int i, j;
@@ -656,8 +743,10 @@ static void release_all_pagetables(struct lguest *lg)
656 spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; 743 spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
657 pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 744 pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
658 745
659 /* And release the pmd entries of that pmd page, 746 /*
660 * except for the switcher pmd. */ 747 * And release the pmd entries of that pmd page,
748 * except for the switcher pmd.
749 */
661 for (k = 0; k < SWITCHER_PMD_INDEX; k++) 750 for (k = 0; k < SWITCHER_PMD_INDEX; k++)
662 release_pmd(&pmdpage[k]); 751 release_pmd(&pmdpage[k]);
663#endif 752#endif
@@ -667,10 +756,12 @@ static void release_all_pagetables(struct lguest *lg)
667 } 756 }
668} 757}
669 758
670/* We also throw away everything when a Guest tells us it's changed a kernel 759/*
760 * We also throw away everything when a Guest tells us it's changed a kernel
671 * mapping. Since kernel mappings are in every page table, it's easiest to 761 * mapping. Since kernel mappings are in every page table, it's easiest to
672 * throw them all away. This traps the Guest in amber for a while as 762 * throw them all away. This traps the Guest in amber for a while as
673 * everything faults back in, but it's rare. */ 763 * everything faults back in, but it's rare.
764 */
674void guest_pagetable_clear_all(struct lg_cpu *cpu) 765void guest_pagetable_clear_all(struct lg_cpu *cpu)
675{ 766{
676 release_all_pagetables(cpu->lg); 767 release_all_pagetables(cpu->lg);
@@ -678,15 +769,19 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
678 pin_stack_pages(cpu); 769 pin_stack_pages(cpu);
679} 770}
680/*:*/ 771/*:*/
681/*M:009 Since we throw away all mappings when a kernel mapping changes, our 772
773/*M:009
774 * Since we throw away all mappings when a kernel mapping changes, our
682 * performance sucks for guests using highmem. In fact, a guest with 775 * performance sucks for guests using highmem. In fact, a guest with
683 * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is 776 * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
684 * usually slower than a Guest with less memory. 777 * usually slower than a Guest with less memory.
685 * 778 *
686 * This, of course, cannot be fixed. It would take some kind of... well, I 779 * This, of course, cannot be fixed. It would take some kind of... well, I
687 * don't know, but the term "puissant code-fu" comes to mind. :*/ 780 * don't know, but the term "puissant code-fu" comes to mind.
781:*/
688 782
689/*H:420 This is the routine which actually sets the page table entry for the 783/*H:420
784 * This is the routine which actually sets the page table entry for the
690 * "idx"'th shadow page table. 785 * "idx"'th shadow page table.
691 * 786 *
692 * Normally, we can just throw out the old entry and replace it with 0: if they 787 * Normally, we can just throw out the old entry and replace it with 0: if they
@@ -715,31 +810,36 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
715 spmd = spmd_addr(cpu, *spgd, vaddr); 810 spmd = spmd_addr(cpu, *spgd, vaddr);
716 if (pmd_flags(*spmd) & _PAGE_PRESENT) { 811 if (pmd_flags(*spmd) & _PAGE_PRESENT) {
717#endif 812#endif
718 /* Otherwise, we start by releasing 813 /* Otherwise, start by releasing the existing entry. */
719 * the existing entry. */
720 pte_t *spte = spte_addr(cpu, *spgd, vaddr); 814 pte_t *spte = spte_addr(cpu, *spgd, vaddr);
721 release_pte(*spte); 815 release_pte(*spte);
722 816
723 /* If they're setting this entry as dirty or accessed, 817 /*
724 * we might as well put that entry they've given us 818 * If they're setting this entry as dirty or accessed,
725 * in now. This shaves 10% off a 819 * we might as well put that entry they've given us in
726 * copy-on-write micro-benchmark. */ 820 * now. This shaves 10% off a copy-on-write
821 * micro-benchmark.
822 */
727 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 823 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
728 check_gpte(cpu, gpte); 824 check_gpte(cpu, gpte);
729 native_set_pte(spte, 825 native_set_pte(spte,
730 gpte_to_spte(cpu, gpte, 826 gpte_to_spte(cpu, gpte,
731 pte_flags(gpte) & _PAGE_DIRTY)); 827 pte_flags(gpte) & _PAGE_DIRTY));
732 } else 828 } else {
733 /* Otherwise kill it and we can demand_page() 829 /*
734 * it in later. */ 830 * Otherwise kill it and we can demand_page()
831 * it in later.
832 */
735 native_set_pte(spte, __pte(0)); 833 native_set_pte(spte, __pte(0));
834 }
736#ifdef CONFIG_X86_PAE 835#ifdef CONFIG_X86_PAE
737 } 836 }
738#endif 837#endif
739 } 838 }
740} 839}
741 840
742/*H:410 Updating a PTE entry is a little trickier. 841/*H:410
842 * Updating a PTE entry is a little trickier.
743 * 843 *
744 * We keep track of several different page tables (the Guest uses one for each 844 * We keep track of several different page tables (the Guest uses one for each
745 * process, so it makes sense to cache at least a few). Each of these have 845 * process, so it makes sense to cache at least a few). Each of these have
@@ -748,12 +848,15 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
748 * all the page tables, not just the current one. This is rare. 848 * all the page tables, not just the current one. This is rare.
749 * 849 *
750 * The benefit is that when we have to track a new page table, we can keep all 850 * The benefit is that when we have to track a new page table, we can keep all
751 * the kernel mappings. This speeds up context switch immensely. */ 851 * the kernel mappings. This speeds up context switch immensely.
852 */
752void guest_set_pte(struct lg_cpu *cpu, 853void guest_set_pte(struct lg_cpu *cpu,
753 unsigned long gpgdir, unsigned long vaddr, pte_t gpte) 854 unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
754{ 855{
755 /* Kernel mappings must be changed on all top levels. Slow, but doesn't 856 /*
756 * happen often. */ 857 * Kernel mappings must be changed on all top levels. Slow, but doesn't
858 * happen often.
859 */
757 if (vaddr >= cpu->lg->kernel_address) { 860 if (vaddr >= cpu->lg->kernel_address) {
758 unsigned int i; 861 unsigned int i;
759 for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) 862 for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
@@ -802,12 +905,14 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
802} 905}
803#endif 906#endif
804 907
805/* Once we know how much memory we have we can construct simple identity 908/*
806 * (which set virtual == physical) and linear mappings 909 * Once we know how much memory we have we can construct simple identity (which
807 * which will get the Guest far enough into the boot to create its own. 910 * set virtual == physical) and linear mappings which will get the Guest far
911 * enough into the boot to create its own.
808 * 912 *
809 * We lay them out of the way, just below the initrd (which is why we need to 913 * We lay them out of the way, just below the initrd (which is why we need to
810 * know its size here). */ 914 * know its size here).
915 */
811static unsigned long setup_pagetables(struct lguest *lg, 916static unsigned long setup_pagetables(struct lguest *lg,
812 unsigned long mem, 917 unsigned long mem,
813 unsigned long initrd_size) 918 unsigned long initrd_size)
@@ -825,8 +930,10 @@ static unsigned long setup_pagetables(struct lguest *lg,
825 unsigned int phys_linear; 930 unsigned int phys_linear;
826#endif 931#endif
827 932
828 /* We have mapped_pages frames to map, so we need 933 /*
829 * linear_pages page tables to map them. */ 934 * We have mapped_pages frames to map, so we need linear_pages page
935 * tables to map them.
936 */
830 mapped_pages = mem / PAGE_SIZE; 937 mapped_pages = mem / PAGE_SIZE;
831 linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; 938 linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;
832 939
@@ -839,8 +946,10 @@ static unsigned long setup_pagetables(struct lguest *lg,
839#ifdef CONFIG_X86_PAE 946#ifdef CONFIG_X86_PAE
840 pmds = (void *)linear - PAGE_SIZE; 947 pmds = (void *)linear - PAGE_SIZE;
841#endif 948#endif
842 /* Linear mapping is easy: put every page's address into the 949 /*
843 * mapping in order. */ 950 * Linear mapping is easy: put every page's address into the
951 * mapping in order.
952 */
844 for (i = 0; i < mapped_pages; i++) { 953 for (i = 0; i < mapped_pages; i++) {
845 pte_t pte; 954 pte_t pte;
846 pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); 955 pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
@@ -848,8 +957,10 @@ static unsigned long setup_pagetables(struct lguest *lg,
848 return -EFAULT; 957 return -EFAULT;
849 } 958 }
850 959
851 /* The top level points to the linear page table pages above. 960 /*
852 * We setup the identity and linear mappings here. */ 961 * The top level points to the linear page table pages above.
962 * We setup the identity and linear mappings here.
963 */
853#ifdef CONFIG_X86_PAE 964#ifdef CONFIG_X86_PAE
854 for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; 965 for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
855 i += PTRS_PER_PTE, j++) { 966 i += PTRS_PER_PTE, j++) {
@@ -880,15 +991,19 @@ static unsigned long setup_pagetables(struct lguest *lg,
880 } 991 }
881#endif 992#endif
882 993
883 /* We return the top level (guest-physical) address: remember where 994 /*
884 * this is. */ 995 * We return the top level (guest-physical) address: remember where
996 * this is.
997 */
885 return (unsigned long)pgdir - mem_base; 998 return (unsigned long)pgdir - mem_base;
886} 999}
887 1000
888/*H:500 (vii) Setting up the page tables initially. 1001/*H:500
1002 * (vii) Setting up the page tables initially.
889 * 1003 *
890 * When a Guest is first created, the Launcher tells us where the toplevel of 1004 * When a Guest is first created, the Launcher tells us where the toplevel of
891 * its first page table is. We set some things up here: */ 1005 * its first page table is. We set some things up here:
1006 */
892int init_guest_pagetable(struct lguest *lg) 1007int init_guest_pagetable(struct lguest *lg)
893{ 1008{
894 u64 mem; 1009 u64 mem;
@@ -898,14 +1013,18 @@ int init_guest_pagetable(struct lguest *lg)
898 pgd_t *pgd; 1013 pgd_t *pgd;
899 pmd_t *pmd_table; 1014 pmd_t *pmd_table;
900#endif 1015#endif
901 /* Get the Guest memory size and the ramdisk size from the boot header 1016 /*
902 * located at lg->mem_base (Guest address 0). */ 1017 * Get the Guest memory size and the ramdisk size from the boot header
1018 * located at lg->mem_base (Guest address 0).
1019 */
903 if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) 1020 if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
904 || get_user(initrd_size, &boot->hdr.ramdisk_size)) 1021 || get_user(initrd_size, &boot->hdr.ramdisk_size))
905 return -EFAULT; 1022 return -EFAULT;
906 1023
907 /* We start on the first shadow page table, and give it a blank PGD 1024 /*
908 * page. */ 1025 * We start on the first shadow page table, and give it a blank PGD
1026 * page.
1027 */
909 lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); 1028 lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
910 if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) 1029 if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
911 return lg->pgdirs[0].gpgdir; 1030 return lg->pgdirs[0].gpgdir;
@@ -931,17 +1050,21 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
931 /* We get the kernel address: above this is all kernel memory. */ 1050 /* We get the kernel address: above this is all kernel memory. */
932 if (get_user(cpu->lg->kernel_address, 1051 if (get_user(cpu->lg->kernel_address,
933 &cpu->lg->lguest_data->kernel_address) 1052 &cpu->lg->lguest_data->kernel_address)
934 /* We tell the Guest that it can't use the top 2 or 4 MB 1053 /*
935 * of virtual addresses used by the Switcher. */ 1054 * We tell the Guest that it can't use the top 2 or 4 MB
1055 * of virtual addresses used by the Switcher.
1056 */
936 || put_user(RESERVE_MEM * 1024 * 1024, 1057 || put_user(RESERVE_MEM * 1024 * 1024,
937 &cpu->lg->lguest_data->reserve_mem) 1058 &cpu->lg->lguest_data->reserve_mem)
938 || put_user(cpu->lg->pgdirs[0].gpgdir, 1059 || put_user(cpu->lg->pgdirs[0].gpgdir,
939 &cpu->lg->lguest_data->pgdir)) 1060 &cpu->lg->lguest_data->pgdir))
940 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 1061 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
941 1062
942 /* In flush_user_mappings() we loop from 0 to 1063 /*
1064 * In flush_user_mappings() we loop from 0 to
943 * "pgd_index(lg->kernel_address)". This assumes it won't hit the 1065 * "pgd_index(lg->kernel_address)". This assumes it won't hit the
944 * Switcher mappings, so check that now. */ 1066 * Switcher mappings, so check that now.
1067 */
945#ifdef CONFIG_X86_PAE 1068#ifdef CONFIG_X86_PAE
946 if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && 1069 if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
947 pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) 1070 pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
@@ -964,12 +1087,14 @@ void free_guest_pagetable(struct lguest *lg)
964 free_page((long)lg->pgdirs[i].pgdir); 1087 free_page((long)lg->pgdirs[i].pgdir);
965} 1088}
966 1089
967/*H:480 (vi) Mapping the Switcher when the Guest is about to run. 1090/*H:480
1091 * (vi) Mapping the Switcher when the Guest is about to run.
968 * 1092 *
969 * The Switcher and the two pages for this CPU need to be visible in the 1093 * The Switcher and the two pages for this CPU need to be visible in the
970 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages 1094 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages
971 * for each CPU already set up, we just need to hook them in now we know which 1095 * for each CPU already set up, we just need to hook them in now we know which
972 * Guest is about to run on this CPU. */ 1096 * Guest is about to run on this CPU.
1097 */
973void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) 1098void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
974{ 1099{
975 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); 1100 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
@@ -990,20 +1115,24 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
990#else 1115#else
991 pgd_t switcher_pgd; 1116 pgd_t switcher_pgd;
992 1117
993 /* Make the last PGD entry for this Guest point to the Switcher's PTE 1118 /*
994 * page for this CPU (with appropriate flags). */ 1119 * Make the last PGD entry for this Guest point to the Switcher's PTE
1120 * page for this CPU (with appropriate flags).
1121 */
995 switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); 1122 switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
996 1123
997 cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; 1124 cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
998 1125
999#endif 1126#endif
1000 /* We also change the Switcher PTE page. When we're running the Guest, 1127 /*
1128 * We also change the Switcher PTE page. When we're running the Guest,
1001 * we want the Guest's "regs" page to appear where the first Switcher 1129 * we want the Guest's "regs" page to appear where the first Switcher
1002 * page for this CPU is. This is an optimization: when the Switcher 1130 * page for this CPU is. This is an optimization: when the Switcher
1003 * saves the Guest registers, it saves them into the first page of this 1131 * saves the Guest registers, it saves them into the first page of this
1004 * CPU's "struct lguest_pages": if we make sure the Guest's register 1132 * CPU's "struct lguest_pages": if we make sure the Guest's register
1005 * page is already mapped there, we don't have to copy them out 1133 * page is already mapped there, we don't have to copy them out
1006 * again. */ 1134 * again.
1135 */
1007 pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; 1136 pfn = __pa(cpu->regs_page) >> PAGE_SHIFT;
1008 native_set_pte(&regs_pte, pfn_pte(pfn, PAGE_KERNEL)); 1137 native_set_pte(&regs_pte, pfn_pte(pfn, PAGE_KERNEL));
1009 native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], 1138 native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)],
@@ -1019,10 +1148,12 @@ static void free_switcher_pte_pages(void)
1019 free_page((long)switcher_pte_page(i)); 1148 free_page((long)switcher_pte_page(i));
1020} 1149}
1021 1150
1022/*H:520 Setting up the Switcher PTE page for given CPU is fairly easy, given 1151/*H:520
1152 * Setting up the Switcher PTE page for given CPU is fairly easy, given
1023 * the CPU number and the "struct page"s for the Switcher code itself. 1153 * the CPU number and the "struct page"s for the Switcher code itself.
1024 * 1154 *
1025 * Currently the Switcher is less than a page long, so "pages" is always 1. */ 1155 * Currently the Switcher is less than a page long, so "pages" is always 1.
1156 */
1026static __init void populate_switcher_pte_page(unsigned int cpu, 1157static __init void populate_switcher_pte_page(unsigned int cpu,
1027 struct page *switcher_page[], 1158 struct page *switcher_page[],
1028 unsigned int pages) 1159 unsigned int pages)
@@ -1043,13 +1174,16 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
1043 native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), 1174 native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
1044 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); 1175 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
1045 1176
1046 /* The second page contains the "struct lguest_ro_state", and is 1177 /*
1047 * read-only. */ 1178 * The second page contains the "struct lguest_ro_state", and is
1179 * read-only.
1180 */
1048 native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), 1181 native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
1049 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); 1182 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
1050} 1183}
1051 1184
1052/* We've made it through the page table code. Perhaps our tired brains are 1185/*
1186 * We've made it through the page table code. Perhaps our tired brains are
1053 * still processing the details, or perhaps we're simply glad it's over. 1187 * still processing the details, or perhaps we're simply glad it's over.
1054 * 1188 *
1055 * If nothing else, note that all this complexity in juggling shadow page tables 1189 * If nothing else, note that all this complexity in juggling shadow page tables
@@ -1058,10 +1192,13 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
1058 * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD 1192 * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD
1059 * have implemented shadow page table support directly into hardware. 1193 * have implemented shadow page table support directly into hardware.
1060 * 1194 *
1061 * There is just one file remaining in the Host. */ 1195 * There is just one file remaining in the Host.
1196 */
1062 1197
1063/*H:510 At boot or module load time, init_pagetables() allocates and populates 1198/*H:510
1064 * the Switcher PTE page for each CPU. */ 1199 * At boot or module load time, init_pagetables() allocates and populates
1200 * the Switcher PTE page for each CPU.
1201 */
1065__init int init_pagetables(struct page **switcher_page, unsigned int pages) 1202__init int init_pagetables(struct page **switcher_page, unsigned int pages)
1066{ 1203{
1067 unsigned int i; 1204 unsigned int i;