aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/lguest/page_tables.c
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-08-29 03:30:41 -0400
committerIngo Molnar <mingo@elte.hu>2009-08-29 03:31:47 -0400
commiteebc57f73d42095b778e899f6aa90ad050c72655 (patch)
tree2ba80c75e9284093e6d7606dbb1b6a4bb752a2a5 /drivers/lguest/page_tables.c
parentd3a247bfb2c26f5b67367d58af7ad8c2efbbc6c1 (diff)
parent2a4ab640d3c28c2952967e5f63ea495555bf2a5f (diff)
Merge branch 'for-ingo' of git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux-sfi-2.6 into x86/apic
Merge reason: the SFI (Simple Firmware Interface) feature in the ACPI tree needs this cleanup, pull it into the APIC branch as well so that there's no interactions. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'drivers/lguest/page_tables.c')
-rw-r--r--drivers/lguest/page_tables.c489
1 files changed, 336 insertions, 153 deletions
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index a6fe1abda240..a8d0aee3bc0e 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -1,9 +1,11 @@
1/*P:700 The pagetable code, on the other hand, still shows the scars of 1/*P:700
2 * The pagetable code, on the other hand, still shows the scars of
2 * previous encounters. It's functional, and as neat as it can be in the 3 * previous encounters. It's functional, and as neat as it can be in the
3 * circumstances, but be wary, for these things are subtle and break easily. 4 * circumstances, but be wary, for these things are subtle and break easily.
4 * The Guest provides a virtual to physical mapping, but we can neither trust 5 * The Guest provides a virtual to physical mapping, but we can neither trust
5 * it nor use it: we verify and convert it here then point the CPU to the 6 * it nor use it: we verify and convert it here then point the CPU to the
6 * converted Guest pages when running the Guest. :*/ 7 * converted Guest pages when running the Guest.
8:*/
7 9
8/* Copyright (C) Rusty Russell IBM Corporation 2006. 10/* Copyright (C) Rusty Russell IBM Corporation 2006.
9 * GPL v2 and any later version */ 11 * GPL v2 and any later version */
@@ -17,18 +19,20 @@
17#include <asm/bootparam.h> 19#include <asm/bootparam.h>
18#include "lg.h" 20#include "lg.h"
19 21
20/*M:008 We hold reference to pages, which prevents them from being swapped. 22/*M:008
23 * We hold reference to pages, which prevents them from being swapped.
21 * It'd be nice to have a callback in the "struct mm_struct" when Linux wants 24 * It'd be nice to have a callback in the "struct mm_struct" when Linux wants
22 * to swap out. If we had this, and a shrinker callback to trim PTE pages, we 25 * to swap out. If we had this, and a shrinker callback to trim PTE pages, we
23 * could probably consider launching Guests as non-root. :*/ 26 * could probably consider launching Guests as non-root.
27:*/
24 28
25/*H:300 29/*H:300
26 * The Page Table Code 30 * The Page Table Code
27 * 31 *
28 * We use two-level page tables for the Guest. If you're not entirely 32 * We use two-level page tables for the Guest, or three-level with PAE. If
29 * comfortable with virtual addresses, physical addresses and page tables then 33 * you're not entirely comfortable with virtual addresses, physical addresses
30 * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with 34 * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page
31 * diagrams!). 35 * Table Handling" (with diagrams!).
32 * 36 *
33 * The Guest keeps page tables, but we maintain the actual ones here: these are 37 * The Guest keeps page tables, but we maintain the actual ones here: these are
34 * called "shadow" page tables. Which is a very Guest-centric name: these are 38 * called "shadow" page tables. Which is a very Guest-centric name: these are
@@ -45,16 +49,18 @@
45 * (v) Flushing (throwing away) page tables, 49 * (v) Flushing (throwing away) page tables,
46 * (vi) Mapping the Switcher when the Guest is about to run, 50 * (vi) Mapping the Switcher when the Guest is about to run,
47 * (vii) Setting up the page tables initially. 51 * (vii) Setting up the page tables initially.
48 :*/ 52:*/
49 53
50 54/*
51/* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is 55 * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB)
52 * conveniently placed at the top 4MB, so it uses a separate, complete PTE 56 * or 512 PTE entries with PAE (2MB).
53 * page. */ 57 */
54#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) 58#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
55 59
56/* For PAE we need the PMD index as well. We use the last 2MB, so we 60/*
57 * will need the last pmd entry of the last pmd page. */ 61 * For PAE we need the PMD index as well. We use the last 2MB, so we
62 * will need the last pmd entry of the last pmd page.
63 */
58#ifdef CONFIG_X86_PAE 64#ifdef CONFIG_X86_PAE
59#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) 65#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1)
60#define RESERVE_MEM 2U 66#define RESERVE_MEM 2U
@@ -64,14 +70,18 @@
64#define CHECK_GPGD_MASK _PAGE_TABLE 70#define CHECK_GPGD_MASK _PAGE_TABLE
65#endif 71#endif
66 72
67/* We actually need a separate PTE page for each CPU. Remember that after the 73/*
74 * We actually need a separate PTE page for each CPU. Remember that after the
68 * Switcher code itself comes two pages for each CPU, and we don't want this 75 * Switcher code itself comes two pages for each CPU, and we don't want this
69 * CPU's guest to see the pages of any other CPU. */ 76 * CPU's guest to see the pages of any other CPU.
77 */
70static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); 78static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
71#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) 79#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
72 80
73/*H:320 The page table code is curly enough to need helper functions to keep it 81/*H:320
74 * clear and clean. 82 * The page table code is curly enough to need helper functions to keep it
83 * clear and clean. The kernel itself provides many of them; one advantage
84 * of insisting that the Guest and Host use the same CONFIG_PAE setting.
75 * 85 *
76 * There are two functions which return pointers to the shadow (aka "real") 86 * There are two functions which return pointers to the shadow (aka "real")
77 * page tables. 87 * page tables.
@@ -79,7 +89,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
79 * spgd_addr() takes the virtual address and returns a pointer to the top-level 89 * spgd_addr() takes the virtual address and returns a pointer to the top-level
80 * page directory entry (PGD) for that address. Since we keep track of several 90 * page directory entry (PGD) for that address. Since we keep track of several
81 * page tables, the "i" argument tells us which one we're interested in (it's 91 * page tables, the "i" argument tells us which one we're interested in (it's
82 * usually the current one). */ 92 * usually the current one).
93 */
83static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) 94static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
84{ 95{
85 unsigned int index = pgd_index(vaddr); 96 unsigned int index = pgd_index(vaddr);
@@ -96,9 +107,11 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
96} 107}
97 108
98#ifdef CONFIG_X86_PAE 109#ifdef CONFIG_X86_PAE
99/* This routine then takes the PGD entry given above, which contains the 110/*
111 * This routine then takes the PGD entry given above, which contains the
100 * address of the PMD page. It then returns a pointer to the PMD entry for the 112 * address of the PMD page. It then returns a pointer to the PMD entry for the
101 * given address. */ 113 * given address.
114 */
102static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) 115static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
103{ 116{
104 unsigned int index = pmd_index(vaddr); 117 unsigned int index = pmd_index(vaddr);
@@ -119,9 +132,11 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
119} 132}
120#endif 133#endif
121 134
122/* This routine then takes the page directory entry returned above, which 135/*
136 * This routine then takes the page directory entry returned above, which
123 * contains the address of the page table entry (PTE) page. It then returns a 137 * contains the address of the page table entry (PTE) page. It then returns a
124 * pointer to the PTE entry for the given address. */ 138 * pointer to the PTE entry for the given address.
139 */
125static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) 140static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
126{ 141{
127#ifdef CONFIG_X86_PAE 142#ifdef CONFIG_X86_PAE
@@ -139,8 +154,10 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
139 return &page[pte_index(vaddr)]; 154 return &page[pte_index(vaddr)];
140} 155}
141 156
142/* These two functions just like the above two, except they access the Guest 157/*
143 * page tables. Hence they return a Guest address. */ 158 * These functions are just like the above two, except they access the Guest
159 * page tables. Hence they return a Guest address.
160 */
144static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) 161static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
145{ 162{
146 unsigned int index = vaddr >> (PGDIR_SHIFT); 163 unsigned int index = vaddr >> (PGDIR_SHIFT);
@@ -148,6 +165,7 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
148} 165}
149 166
150#ifdef CONFIG_X86_PAE 167#ifdef CONFIG_X86_PAE
168/* Follow the PGD to the PMD. */
151static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) 169static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
152{ 170{
153 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; 171 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
@@ -155,6 +173,7 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
155 return gpage + pmd_index(vaddr) * sizeof(pmd_t); 173 return gpage + pmd_index(vaddr) * sizeof(pmd_t);
156} 174}
157 175
176/* Follow the PMD to the PTE. */
158static unsigned long gpte_addr(struct lg_cpu *cpu, 177static unsigned long gpte_addr(struct lg_cpu *cpu,
159 pmd_t gpmd, unsigned long vaddr) 178 pmd_t gpmd, unsigned long vaddr)
160{ 179{
@@ -164,6 +183,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu,
164 return gpage + pte_index(vaddr) * sizeof(pte_t); 183 return gpage + pte_index(vaddr) * sizeof(pte_t);
165} 184}
166#else 185#else
186/* Follow the PGD to the PTE (no mid-level for !PAE). */
167static unsigned long gpte_addr(struct lg_cpu *cpu, 187static unsigned long gpte_addr(struct lg_cpu *cpu,
168 pgd_t gpgd, unsigned long vaddr) 188 pgd_t gpgd, unsigned long vaddr)
169{ 189{
@@ -175,17 +195,21 @@ static unsigned long gpte_addr(struct lg_cpu *cpu,
175#endif 195#endif
176/*:*/ 196/*:*/
177 197
178/*M:014 get_pfn is slow: we could probably try to grab batches of pages here as 198/*M:014
179 * an optimization (ie. pre-faulting). :*/ 199 * get_pfn is slow: we could probably try to grab batches of pages here as
200 * an optimization (ie. pre-faulting).
201:*/
180 202
181/*H:350 This routine takes a page number given by the Guest and converts it to 203/*H:350
204 * This routine takes a page number given by the Guest and converts it to
182 * an actual, physical page number. It can fail for several reasons: the 205 * an actual, physical page number. It can fail for several reasons: the
183 * virtual address might not be mapped by the Launcher, the write flag is set 206 * virtual address might not be mapped by the Launcher, the write flag is set
184 * and the page is read-only, or the write flag was set and the page was 207 * and the page is read-only, or the write flag was set and the page was
185 * shared so had to be copied, but we ran out of memory. 208 * shared so had to be copied, but we ran out of memory.
186 * 209 *
187 * This holds a reference to the page, so release_pte() is careful to put that 210 * This holds a reference to the page, so release_pte() is careful to put that
188 * back. */ 211 * back.
212 */
189static unsigned long get_pfn(unsigned long virtpfn, int write) 213static unsigned long get_pfn(unsigned long virtpfn, int write)
190{ 214{
191 struct page *page; 215 struct page *page;
@@ -198,33 +222,41 @@ static unsigned long get_pfn(unsigned long virtpfn, int write)
198 return -1UL; 222 return -1UL;
199} 223}
200 224
201/*H:340 Converting a Guest page table entry to a shadow (ie. real) page table 225/*H:340
226 * Converting a Guest page table entry to a shadow (ie. real) page table
202 * entry can be a little tricky. The flags are (almost) the same, but the 227 * entry can be a little tricky. The flags are (almost) the same, but the
203 * Guest PTE contains a virtual page number: the CPU needs the real page 228 * Guest PTE contains a virtual page number: the CPU needs the real page
204 * number. */ 229 * number.
230 */
205static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) 231static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
206{ 232{
207 unsigned long pfn, base, flags; 233 unsigned long pfn, base, flags;
208 234
209 /* The Guest sets the global flag, because it thinks that it is using 235 /*
236 * The Guest sets the global flag, because it thinks that it is using
210 * PGE. We only told it to use PGE so it would tell us whether it was 237 * PGE. We only told it to use PGE so it would tell us whether it was
211 * flushing a kernel mapping or a userspace mapping. We don't actually 238 * flushing a kernel mapping or a userspace mapping. We don't actually
212 * use the global bit, so throw it away. */ 239 * use the global bit, so throw it away.
240 */
213 flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); 241 flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
214 242
215 /* The Guest's pages are offset inside the Launcher. */ 243 /* The Guest's pages are offset inside the Launcher. */
216 base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; 244 base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;
217 245
218 /* We need a temporary "unsigned long" variable to hold the answer from 246 /*
247 * We need a temporary "unsigned long" variable to hold the answer from
219 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't 248 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
220 * fit in spte.pfn. get_pfn() finds the real physical number of the 249 * fit in spte.pfn. get_pfn() finds the real physical number of the
221 * page, given the virtual number. */ 250 * page, given the virtual number.
251 */
222 pfn = get_pfn(base + pte_pfn(gpte), write); 252 pfn = get_pfn(base + pte_pfn(gpte), write);
223 if (pfn == -1UL) { 253 if (pfn == -1UL) {
224 kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); 254 kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
225 /* When we destroy the Guest, we'll go through the shadow page 255 /*
256 * When we destroy the Guest, we'll go through the shadow page
226 * tables and release_pte() them. Make sure we don't think 257 * tables and release_pte() them. Make sure we don't think
227 * this one is valid! */ 258 * this one is valid!
259 */
228 flags = 0; 260 flags = 0;
229 } 261 }
230 /* Now we assemble our shadow PTE from the page number and flags. */ 262 /* Now we assemble our shadow PTE from the page number and flags. */
@@ -234,8 +266,10 @@ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
234/*H:460 And to complete the chain, release_pte() looks like this: */ 266/*H:460 And to complete the chain, release_pte() looks like this: */
235static void release_pte(pte_t pte) 267static void release_pte(pte_t pte)
236{ 268{
237 /* Remember that get_user_pages_fast() took a reference to the page, in 269 /*
238 * get_pfn()? We have to put it back now. */ 270 * Remember that get_user_pages_fast() took a reference to the page, in
271 * get_pfn()? We have to put it back now.
272 */
239 if (pte_flags(pte) & _PAGE_PRESENT) 273 if (pte_flags(pte) & _PAGE_PRESENT)
240 put_page(pte_page(pte)); 274 put_page(pte_page(pte));
241} 275}
@@ -273,7 +307,8 @@ static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
273 * and return to the Guest without it knowing. 307 * and return to the Guest without it knowing.
274 * 308 *
275 * If we fixed up the fault (ie. we mapped the address), this routine returns 309 * If we fixed up the fault (ie. we mapped the address), this routine returns
276 * true. Otherwise, it was a real fault and we need to tell the Guest. */ 310 * true. Otherwise, it was a real fault and we need to tell the Guest.
311 */
277bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) 312bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
278{ 313{
279 pgd_t gpgd; 314 pgd_t gpgd;
@@ -282,6 +317,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
282 pte_t gpte; 317 pte_t gpte;
283 pte_t *spte; 318 pte_t *spte;
284 319
320 /* Mid level for PAE. */
285#ifdef CONFIG_X86_PAE 321#ifdef CONFIG_X86_PAE
286 pmd_t *spmd; 322 pmd_t *spmd;
287 pmd_t gpmd; 323 pmd_t gpmd;
@@ -298,22 +334,26 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
298 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { 334 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
299 /* No shadow entry: allocate a new shadow PTE page. */ 335 /* No shadow entry: allocate a new shadow PTE page. */
300 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 336 unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
301 /* This is not really the Guest's fault, but killing it is 337 /*
302 * simple for this corner case. */ 338 * This is not really the Guest's fault, but killing it is
339 * simple for this corner case.
340 */
303 if (!ptepage) { 341 if (!ptepage) {
304 kill_guest(cpu, "out of memory allocating pte page"); 342 kill_guest(cpu, "out of memory allocating pte page");
305 return false; 343 return false;
306 } 344 }
307 /* We check that the Guest pgd is OK. */ 345 /* We check that the Guest pgd is OK. */
308 check_gpgd(cpu, gpgd); 346 check_gpgd(cpu, gpgd);
309 /* And we copy the flags to the shadow PGD entry. The page 347 /*
310 * number in the shadow PGD is the page we just allocated. */ 348 * And we copy the flags to the shadow PGD entry. The page
349 * number in the shadow PGD is the page we just allocated.
350 */
311 set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); 351 set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
312 } 352 }
313 353
314#ifdef CONFIG_X86_PAE 354#ifdef CONFIG_X86_PAE
315 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); 355 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
316 /* middle level not present? We can't map it in. */ 356 /* Middle level not present? We can't map it in. */
317 if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) 357 if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
318 return false; 358 return false;
319 359
@@ -324,8 +364,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
324 /* No shadow entry: allocate a new shadow PTE page. */ 364 /* No shadow entry: allocate a new shadow PTE page. */
325 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 365 unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
326 366
327 /* This is not really the Guest's fault, but killing it is 367 /*
328 * simple for this corner case. */ 368 * This is not really the Guest's fault, but killing it is
369 * simple for this corner case.
370 */
329 if (!ptepage) { 371 if (!ptepage) {
330 kill_guest(cpu, "out of memory allocating pte page"); 372 kill_guest(cpu, "out of memory allocating pte page");
331 return false; 373 return false;
@@ -334,27 +376,37 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
334 /* We check that the Guest pmd is OK. */ 376 /* We check that the Guest pmd is OK. */
335 check_gpmd(cpu, gpmd); 377 check_gpmd(cpu, gpmd);
336 378
337 /* And we copy the flags to the shadow PMD entry. The page 379 /*
338 * number in the shadow PMD is the page we just allocated. */ 380 * And we copy the flags to the shadow PMD entry. The page
381 * number in the shadow PMD is the page we just allocated.
382 */
339 native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); 383 native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
340 } 384 }
341 385
342 /* OK, now we look at the lower level in the Guest page table: keep its 386 /*
343 * address, because we might update it later. */ 387 * OK, now we look at the lower level in the Guest page table: keep its
388 * address, because we might update it later.
389 */
344 gpte_ptr = gpte_addr(cpu, gpmd, vaddr); 390 gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
345#else 391#else
346 /* OK, now we look at the lower level in the Guest page table: keep its 392 /*
347 * address, because we might update it later. */ 393 * OK, now we look at the lower level in the Guest page table: keep its
394 * address, because we might update it later.
395 */
348 gpte_ptr = gpte_addr(cpu, gpgd, vaddr); 396 gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
349#endif 397#endif
398
399 /* Read the actual PTE value. */
350 gpte = lgread(cpu, gpte_ptr, pte_t); 400 gpte = lgread(cpu, gpte_ptr, pte_t);
351 401
352 /* If this page isn't in the Guest page tables, we can't page it in. */ 402 /* If this page isn't in the Guest page tables, we can't page it in. */
353 if (!(pte_flags(gpte) & _PAGE_PRESENT)) 403 if (!(pte_flags(gpte) & _PAGE_PRESENT))
354 return false; 404 return false;
355 405
356 /* Check they're not trying to write to a page the Guest wants 406 /*
357 * read-only (bit 2 of errcode == write). */ 407 * Check they're not trying to write to a page the Guest wants
408 * read-only (bit 2 of errcode == write).
409 */
358 if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) 410 if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
359 return false; 411 return false;
360 412
@@ -362,8 +414,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
362 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) 414 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
363 return false; 415 return false;
364 416
365 /* Check that the Guest PTE flags are OK, and the page number is below 417 /*
366 * the pfn_limit (ie. not mapping the Launcher binary). */ 418 * Check that the Guest PTE flags are OK, and the page number is below
419 * the pfn_limit (ie. not mapping the Launcher binary).
420 */
367 check_gpte(cpu, gpte); 421 check_gpte(cpu, gpte);
368 422
369 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ 423 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
@@ -373,29 +427,40 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
373 427
374 /* Get the pointer to the shadow PTE entry we're going to set. */ 428 /* Get the pointer to the shadow PTE entry we're going to set. */
375 spte = spte_addr(cpu, *spgd, vaddr); 429 spte = spte_addr(cpu, *spgd, vaddr);
376 /* If there was a valid shadow PTE entry here before, we release it. 430
377 * This can happen with a write to a previously read-only entry. */ 431 /*
432 * If there was a valid shadow PTE entry here before, we release it.
433 * This can happen with a write to a previously read-only entry.
434 */
378 release_pte(*spte); 435 release_pte(*spte);
379 436
380 /* If this is a write, we insist that the Guest page is writable (the 437 /*
381 * final arg to gpte_to_spte()). */ 438 * If this is a write, we insist that the Guest page is writable (the
439 * final arg to gpte_to_spte()).
440 */
382 if (pte_dirty(gpte)) 441 if (pte_dirty(gpte))
383 *spte = gpte_to_spte(cpu, gpte, 1); 442 *spte = gpte_to_spte(cpu, gpte, 1);
384 else 443 else
385 /* If this is a read, don't set the "writable" bit in the page 444 /*
445 * If this is a read, don't set the "writable" bit in the page
386 * table entry, even if the Guest says it's writable. That way 446 * table entry, even if the Guest says it's writable. That way
387 * we will come back here when a write does actually occur, so 447 * we will come back here when a write does actually occur, so
388 * we can update the Guest's _PAGE_DIRTY flag. */ 448 * we can update the Guest's _PAGE_DIRTY flag.
449 */
389 native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); 450 native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));
390 451
391 /* Finally, we write the Guest PTE entry back: we've set the 452 /*
392 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ 453 * Finally, we write the Guest PTE entry back: we've set the
454 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
455 */
393 lgwrite(cpu, gpte_ptr, pte_t, gpte); 456 lgwrite(cpu, gpte_ptr, pte_t, gpte);
394 457
395 /* The fault is fixed, the page table is populated, the mapping 458 /*
459 * The fault is fixed, the page table is populated, the mapping
396 * manipulated, the result returned and the code complete. A small 460 * manipulated, the result returned and the code complete. A small
397 * delay and a trace of alliteration are the only indications the Guest 461 * delay and a trace of alliteration are the only indications the Guest
398 * has that a page fault occurred at all. */ 462 * has that a page fault occurred at all.
463 */
399 return true; 464 return true;
400} 465}
401 466
@@ -408,7 +473,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
408 * mapped, so it's overkill. 473 * mapped, so it's overkill.
409 * 474 *
410 * This is a quick version which answers the question: is this virtual address 475 * This is a quick version which answers the question: is this virtual address
411 * mapped by the shadow page tables, and is it writable? */ 476 * mapped by the shadow page tables, and is it writable?
477 */
412static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) 478static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
413{ 479{
414 pgd_t *spgd; 480 pgd_t *spgd;
@@ -428,21 +494,26 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
428 return false; 494 return false;
429#endif 495#endif
430 496
431 /* Check the flags on the pte entry itself: it must be present and 497 /*
432 * writable. */ 498 * Check the flags on the pte entry itself: it must be present and
499 * writable.
500 */
433 flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); 501 flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));
434 502
435 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 503 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
436} 504}
437 505
438/* So, when pin_stack_pages() asks us to pin a page, we check if it's already 506/*
507 * So, when pin_stack_pages() asks us to pin a page, we check if it's already
439 * in the page tables, and if not, we call demand_page() with error code 2 508 * in the page tables, and if not, we call demand_page() with error code 2
440 * (meaning "write"). */ 509 * (meaning "write").
510 */
441void pin_page(struct lg_cpu *cpu, unsigned long vaddr) 511void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
442{ 512{
443 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) 513 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
444 kill_guest(cpu, "bad stack page %#lx", vaddr); 514 kill_guest(cpu, "bad stack page %#lx", vaddr);
445} 515}
516/*:*/
446 517
447#ifdef CONFIG_X86_PAE 518#ifdef CONFIG_X86_PAE
448static void release_pmd(pmd_t *spmd) 519static void release_pmd(pmd_t *spmd)
@@ -479,15 +550,21 @@ static void release_pgd(pgd_t *spgd)
479} 550}
480 551
481#else /* !CONFIG_X86_PAE */ 552#else /* !CONFIG_X86_PAE */
482/*H:450 If we chase down the release_pgd() code, it looks like this: */ 553/*H:450
554 * If we chase down the release_pgd() code, the non-PAE version looks like
555 * this. The PAE version is almost identical, but instead of calling
556 * release_pte it calls release_pmd(), which looks much like this.
557 */
483static void release_pgd(pgd_t *spgd) 558static void release_pgd(pgd_t *spgd)
484{ 559{
485 /* If the entry's not present, there's nothing to release. */ 560 /* If the entry's not present, there's nothing to release. */
486 if (pgd_flags(*spgd) & _PAGE_PRESENT) { 561 if (pgd_flags(*spgd) & _PAGE_PRESENT) {
487 unsigned int i; 562 unsigned int i;
488 /* Converting the pfn to find the actual PTE page is easy: turn 563 /*
564 * Converting the pfn to find the actual PTE page is easy: turn
489 * the page number into a physical address, then convert to a 565 * the page number into a physical address, then convert to a
490 * virtual address (easy for kernel pages like this one). */ 566 * virtual address (easy for kernel pages like this one).
567 */
491 pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 568 pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
492 /* For each entry in the page, we might need to release it. */ 569 /* For each entry in the page, we might need to release it. */
493 for (i = 0; i < PTRS_PER_PTE; i++) 570 for (i = 0; i < PTRS_PER_PTE; i++)
@@ -499,9 +576,12 @@ static void release_pgd(pgd_t *spgd)
499 } 576 }
500} 577}
501#endif 578#endif
502/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() 579
580/*H:445
581 * We saw flush_user_mappings() twice: once from the flush_user_mappings()
503 * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. 582 * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
504 * It simply releases every PTE page from 0 up to the Guest's kernel address. */ 583 * It simply releases every PTE page from 0 up to the Guest's kernel address.
584 */
505static void flush_user_mappings(struct lguest *lg, int idx) 585static void flush_user_mappings(struct lguest *lg, int idx)
506{ 586{
507 unsigned int i; 587 unsigned int i;
@@ -510,10 +590,12 @@ static void flush_user_mappings(struct lguest *lg, int idx)
510 release_pgd(lg->pgdirs[idx].pgdir + i); 590 release_pgd(lg->pgdirs[idx].pgdir + i);
511} 591}
512 592
513/*H:440 (v) Flushing (throwing away) page tables, 593/*H:440
594 * (v) Flushing (throwing away) page tables,
514 * 595 *
515 * The Guest has a hypercall to throw away the page tables: it's used when a 596 * The Guest has a hypercall to throw away the page tables: it's used when a
516 * large number of mappings have been changed. */ 597 * large number of mappings have been changed.
598 */
517void guest_pagetable_flush_user(struct lg_cpu *cpu) 599void guest_pagetable_flush_user(struct lg_cpu *cpu)
518{ 600{
519 /* Drop the userspace part of the current page table. */ 601 /* Drop the userspace part of the current page table. */
@@ -551,9 +633,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
551 return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); 633 return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
552} 634}
553 635
554/* We keep several page tables. This is a simple routine to find the page 636/*
637 * We keep several page tables. This is a simple routine to find the page
555 * table (if any) corresponding to this top-level address the Guest has given 638 * table (if any) corresponding to this top-level address the Guest has given
556 * us. */ 639 * us.
640 */
557static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) 641static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
558{ 642{
559 unsigned int i; 643 unsigned int i;
@@ -563,9 +647,11 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
563 return i; 647 return i;
564} 648}
565 649
566/*H:435 And this is us, creating the new page directory. If we really do 650/*H:435
651 * And this is us, creating the new page directory. If we really do
567 * allocate a new one (and so the kernel parts are not there), we set 652 * allocate a new one (and so the kernel parts are not there), we set
568 * blank_pgdir. */ 653 * blank_pgdir.
654 */
569static unsigned int new_pgdir(struct lg_cpu *cpu, 655static unsigned int new_pgdir(struct lg_cpu *cpu,
570 unsigned long gpgdir, 656 unsigned long gpgdir,
571 int *blank_pgdir) 657 int *blank_pgdir)
@@ -575,8 +661,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
575 pmd_t *pmd_table; 661 pmd_t *pmd_table;
576#endif 662#endif
577 663
578 /* We pick one entry at random to throw out. Choosing the Least 664 /*
579 * Recently Used might be better, but this is easy. */ 665 * We pick one entry at random to throw out. Choosing the Least
666 * Recently Used might be better, but this is easy.
667 */
580 next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); 668 next = random32() % ARRAY_SIZE(cpu->lg->pgdirs);
581 /* If it's never been allocated at all before, try now. */ 669 /* If it's never been allocated at all before, try now. */
582 if (!cpu->lg->pgdirs[next].pgdir) { 670 if (!cpu->lg->pgdirs[next].pgdir) {
@@ -587,8 +675,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
587 next = cpu->cpu_pgd; 675 next = cpu->cpu_pgd;
588 else { 676 else {
589#ifdef CONFIG_X86_PAE 677#ifdef CONFIG_X86_PAE
590 /* In PAE mode, allocate a pmd page and populate the 678 /*
591 * last pgd entry. */ 679 * In PAE mode, allocate a pmd page and populate the
680 * last pgd entry.
681 */
592 pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); 682 pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
593 if (!pmd_table) { 683 if (!pmd_table) {
594 free_page((long)cpu->lg->pgdirs[next].pgdir); 684 free_page((long)cpu->lg->pgdirs[next].pgdir);
@@ -598,8 +688,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
598 set_pgd(cpu->lg->pgdirs[next].pgdir + 688 set_pgd(cpu->lg->pgdirs[next].pgdir +
599 SWITCHER_PGD_INDEX, 689 SWITCHER_PGD_INDEX,
600 __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 690 __pgd(__pa(pmd_table) | _PAGE_PRESENT));
601 /* This is a blank page, so there are no kernel 691 /*
602 * mappings: caller must map the stack! */ 692 * This is a blank page, so there are no kernel
693 * mappings: caller must map the stack!
694 */
603 *blank_pgdir = 1; 695 *blank_pgdir = 1;
604 } 696 }
605#else 697#else
@@ -615,19 +707,23 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
615 return next; 707 return next;
616} 708}
617 709
618/*H:430 (iv) Switching page tables 710/*H:430
711 * (iv) Switching page tables
619 * 712 *
620 * Now we've seen all the page table setting and manipulation, let's see 713 * Now we've seen all the page table setting and manipulation, let's see
621 * what happens when the Guest changes page tables (ie. changes the top-level 714 * what happens when the Guest changes page tables (ie. changes the top-level
622 * pgdir). This occurs on almost every context switch. */ 715 * pgdir). This occurs on almost every context switch.
716 */
623void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) 717void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
624{ 718{
625 int newpgdir, repin = 0; 719 int newpgdir, repin = 0;
626 720
627 /* Look to see if we have this one already. */ 721 /* Look to see if we have this one already. */
628 newpgdir = find_pgdir(cpu->lg, pgtable); 722 newpgdir = find_pgdir(cpu->lg, pgtable);
629 /* If not, we allocate or mug an existing one: if it's a fresh one, 723 /*
630 * repin gets set to 1. */ 724 * If not, we allocate or mug an existing one: if it's a fresh one,
725 * repin gets set to 1.
726 */
631 if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) 727 if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
632 newpgdir = new_pgdir(cpu, pgtable, &repin); 728 newpgdir = new_pgdir(cpu, pgtable, &repin);
633 /* Change the current pgd index to the new one. */ 729 /* Change the current pgd index to the new one. */
@@ -637,9 +733,11 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
637 pin_stack_pages(cpu); 733 pin_stack_pages(cpu);
638} 734}
639 735
640/*H:470 Finally, a routine which throws away everything: all PGD entries in all 736/*H:470
737 * Finally, a routine which throws away everything: all PGD entries in all
641 * the shadow page tables, including the Guest's kernel mappings. This is used 738 * the shadow page tables, including the Guest's kernel mappings. This is used
642 * when we destroy the Guest. */ 739 * when we destroy the Guest.
740 */
643static void release_all_pagetables(struct lguest *lg) 741static void release_all_pagetables(struct lguest *lg)
644{ 742{
645 unsigned int i, j; 743 unsigned int i, j;
@@ -656,8 +754,10 @@ static void release_all_pagetables(struct lguest *lg)
656 spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; 754 spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
657 pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 755 pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
658 756
659 /* And release the pmd entries of that pmd page, 757 /*
660 * except for the switcher pmd. */ 758 * And release the pmd entries of that pmd page,
759 * except for the switcher pmd.
760 */
661 for (k = 0; k < SWITCHER_PMD_INDEX; k++) 761 for (k = 0; k < SWITCHER_PMD_INDEX; k++)
662 release_pmd(&pmdpage[k]); 762 release_pmd(&pmdpage[k]);
663#endif 763#endif
@@ -667,10 +767,12 @@ static void release_all_pagetables(struct lguest *lg)
667 } 767 }
668} 768}
669 769
670/* We also throw away everything when a Guest tells us it's changed a kernel 770/*
771 * We also throw away everything when a Guest tells us it's changed a kernel
671 * mapping. Since kernel mappings are in every page table, it's easiest to 772 * mapping. Since kernel mappings are in every page table, it's easiest to
672 * throw them all away. This traps the Guest in amber for a while as 773 * throw them all away. This traps the Guest in amber for a while as
673 * everything faults back in, but it's rare. */ 774 * everything faults back in, but it's rare.
775 */
674void guest_pagetable_clear_all(struct lg_cpu *cpu) 776void guest_pagetable_clear_all(struct lg_cpu *cpu)
675{ 777{
676 release_all_pagetables(cpu->lg); 778 release_all_pagetables(cpu->lg);
@@ -678,15 +780,19 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
678 pin_stack_pages(cpu); 780 pin_stack_pages(cpu);
679} 781}
680/*:*/ 782/*:*/
681/*M:009 Since we throw away all mappings when a kernel mapping changes, our 783
784/*M:009
785 * Since we throw away all mappings when a kernel mapping changes, our
682 * performance sucks for guests using highmem. In fact, a guest with 786 * performance sucks for guests using highmem. In fact, a guest with
683 * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is 787 * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
684 * usually slower than a Guest with less memory. 788 * usually slower than a Guest with less memory.
685 * 789 *
686 * This, of course, cannot be fixed. It would take some kind of... well, I 790 * This, of course, cannot be fixed. It would take some kind of... well, I
687 * don't know, but the term "puissant code-fu" comes to mind. :*/ 791 * don't know, but the term "puissant code-fu" comes to mind.
792:*/
688 793
689/*H:420 This is the routine which actually sets the page table entry for then 794/*H:420
795 * This is the routine which actually sets the page table entry for then
690 * "idx"'th shadow page table. 796 * "idx"'th shadow page table.
691 * 797 *
692 * Normally, we can just throw out the old entry and replace it with 0: if they 798 * Normally, we can just throw out the old entry and replace it with 0: if they
@@ -715,31 +821,36 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
715 spmd = spmd_addr(cpu, *spgd, vaddr); 821 spmd = spmd_addr(cpu, *spgd, vaddr);
716 if (pmd_flags(*spmd) & _PAGE_PRESENT) { 822 if (pmd_flags(*spmd) & _PAGE_PRESENT) {
717#endif 823#endif
718 /* Otherwise, we start by releasing 824 /* Otherwise, start by releasing the existing entry. */
719 * the existing entry. */
720 pte_t *spte = spte_addr(cpu, *spgd, vaddr); 825 pte_t *spte = spte_addr(cpu, *spgd, vaddr);
721 release_pte(*spte); 826 release_pte(*spte);
722 827
723 /* If they're setting this entry as dirty or accessed, 828 /*
724 * we might as well put that entry they've given us 829 * If they're setting this entry as dirty or accessed,
725 * in now. This shaves 10% off a 830 * we might as well put that entry they've given us in
726 * copy-on-write micro-benchmark. */ 831 * now. This shaves 10% off a copy-on-write
832 * micro-benchmark.
833 */
727 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 834 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
728 check_gpte(cpu, gpte); 835 check_gpte(cpu, gpte);
729 native_set_pte(spte, 836 native_set_pte(spte,
730 gpte_to_spte(cpu, gpte, 837 gpte_to_spte(cpu, gpte,
731 pte_flags(gpte) & _PAGE_DIRTY)); 838 pte_flags(gpte) & _PAGE_DIRTY));
732 } else 839 } else {
733 /* Otherwise kill it and we can demand_page() 840 /*
734 * it in later. */ 841 * Otherwise kill it and we can demand_page()
842 * it in later.
843 */
735 native_set_pte(spte, __pte(0)); 844 native_set_pte(spte, __pte(0));
845 }
736#ifdef CONFIG_X86_PAE 846#ifdef CONFIG_X86_PAE
737 } 847 }
738#endif 848#endif
739 } 849 }
740} 850}
741 851
742/*H:410 Updating a PTE entry is a little trickier. 852/*H:410
853 * Updating a PTE entry is a little trickier.
743 * 854 *
744 * We keep track of several different page tables (the Guest uses one for each 855 * We keep track of several different page tables (the Guest uses one for each
745 * process, so it makes sense to cache at least a few). Each of these have 856 * process, so it makes sense to cache at least a few). Each of these have
@@ -748,12 +859,15 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
748 * all the page tables, not just the current one. This is rare. 859 * all the page tables, not just the current one. This is rare.
749 * 860 *
750 * The benefit is that when we have to track a new page table, we can keep all 861 * The benefit is that when we have to track a new page table, we can keep all
751 * the kernel mappings. This speeds up context switch immensely. */ 862 * the kernel mappings. This speeds up context switch immensely.
863 */
752void guest_set_pte(struct lg_cpu *cpu, 864void guest_set_pte(struct lg_cpu *cpu,
753 unsigned long gpgdir, unsigned long vaddr, pte_t gpte) 865 unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
754{ 866{
755 /* Kernel mappings must be changed on all top levels. Slow, but doesn't 867 /*
756 * happen often. */ 868 * Kernel mappings must be changed on all top levels. Slow, but doesn't
869 * happen often.
870 */
757 if (vaddr >= cpu->lg->kernel_address) { 871 if (vaddr >= cpu->lg->kernel_address) {
758 unsigned int i; 872 unsigned int i;
759 for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) 873 for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
@@ -795,19 +909,25 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
795 /* ... throw it away. */ 909 /* ... throw it away. */
796 release_pgd(lg->pgdirs[pgdir].pgdir + idx); 910 release_pgd(lg->pgdirs[pgdir].pgdir + idx);
797} 911}
912
798#ifdef CONFIG_X86_PAE 913#ifdef CONFIG_X86_PAE
914/* For setting a mid-level, we just throw everything away. It's easy. */
799void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) 915void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
800{ 916{
801 guest_pagetable_clear_all(&lg->cpus[0]); 917 guest_pagetable_clear_all(&lg->cpus[0]);
802} 918}
803#endif 919#endif
804 920
805/* Once we know how much memory we have we can construct simple identity 921/*H:505
806 * (which set virtual == physical) and linear mappings 922 * To get through boot, we construct simple identity page mappings (which
807 * which will get the Guest far enough into the boot to create its own. 923 * set virtual == physical) and linear mappings which will get the Guest far
924 * enough into the boot to create its own. The linear mapping means we
925 * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET,
926 * as you'll see.
808 * 927 *
809 * We lay them out of the way, just below the initrd (which is why we need to 928 * We lay them out of the way, just below the initrd (which is why we need to
810 * know its size here). */ 929 * know its size here).
930 */
811static unsigned long setup_pagetables(struct lguest *lg, 931static unsigned long setup_pagetables(struct lguest *lg,
812 unsigned long mem, 932 unsigned long mem,
813 unsigned long initrd_size) 933 unsigned long initrd_size)
@@ -825,8 +945,10 @@ static unsigned long setup_pagetables(struct lguest *lg,
825 unsigned int phys_linear; 945 unsigned int phys_linear;
826#endif 946#endif
827 947
828 /* We have mapped_pages frames to map, so we need 948 /*
829 * linear_pages page tables to map them. */ 949 * We have mapped_pages frames to map, so we need linear_pages page
950 * tables to map them.
951 */
830 mapped_pages = mem / PAGE_SIZE; 952 mapped_pages = mem / PAGE_SIZE;
831 linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; 953 linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;
832 954
@@ -837,10 +959,16 @@ static unsigned long setup_pagetables(struct lguest *lg,
837 linear = (void *)pgdir - linear_pages * PAGE_SIZE; 959 linear = (void *)pgdir - linear_pages * PAGE_SIZE;
838 960
839#ifdef CONFIG_X86_PAE 961#ifdef CONFIG_X86_PAE
962 /*
963 * And the single mid page goes below that. We only use one, but
964 * that's enough to map 1G, which definitely gets us through boot.
965 */
840 pmds = (void *)linear - PAGE_SIZE; 966 pmds = (void *)linear - PAGE_SIZE;
841#endif 967#endif
842 /* Linear mapping is easy: put every page's address into the 968 /*
843 * mapping in order. */ 969 * Linear mapping is easy: put every page's address into the
970 * mapping in order.
971 */
844 for (i = 0; i < mapped_pages; i++) { 972 for (i = 0; i < mapped_pages; i++) {
845 pte_t pte; 973 pte_t pte;
846 pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); 974 pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
@@ -848,11 +976,14 @@ static unsigned long setup_pagetables(struct lguest *lg,
848 return -EFAULT; 976 return -EFAULT;
849 } 977 }
850 978
851 /* The top level points to the linear page table pages above.
852 * We setup the identity and linear mappings here. */
853#ifdef CONFIG_X86_PAE 979#ifdef CONFIG_X86_PAE
980 /*
981 * Make the Guest PMD entries point to the corresponding place in the
982 * linear mapping (up to one page worth of PMD).
983 */
854 for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; 984 for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
855 i += PTRS_PER_PTE, j++) { 985 i += PTRS_PER_PTE, j++) {
986 /* FIXME: native_set_pmd is overkill here. */
856 native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) 987 native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i)
857 - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); 988 - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
858 989
@@ -860,18 +991,36 @@ static unsigned long setup_pagetables(struct lguest *lg,
860 return -EFAULT; 991 return -EFAULT;
861 } 992 }
862 993
994 /* One PGD entry, pointing to that PMD page. */
863 set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); 995 set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT));
996 /* Copy it in as the first PGD entry (ie. addresses 0-1G). */
864 if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) 997 if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
865 return -EFAULT; 998 return -EFAULT;
999 /*
1000 * And the third PGD entry (ie. addresses 3G-4G).
1001 *
1002 * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000.
1003 */
866 if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) 1004 if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0)
867 return -EFAULT; 1005 return -EFAULT;
868#else 1006#else
1007 /*
1008 * The top level points to the linear page table pages above.
1009 * We setup the identity and linear mappings here.
1010 */
869 phys_linear = (unsigned long)linear - mem_base; 1011 phys_linear = (unsigned long)linear - mem_base;
870 for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { 1012 for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
871 pgd_t pgd; 1013 pgd_t pgd;
1014 /*
1015 * Create a PGD entry which points to the right part of the
1016 * linear PTE pages.
1017 */
872 pgd = __pgd((phys_linear + i * sizeof(pte_t)) | 1018 pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
873 (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); 1019 (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
874 1020
1021 /*
1022 * Copy it into the PGD page at 0 and PAGE_OFFSET.
1023 */
875 if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) 1024 if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
876 || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) 1025 || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
877 + i / PTRS_PER_PTE], 1026 + i / PTRS_PER_PTE],
@@ -880,15 +1029,19 @@ static unsigned long setup_pagetables(struct lguest *lg,
880 } 1029 }
881#endif 1030#endif
882 1031
883 /* We return the top level (guest-physical) address: remember where 1032 /*
884 * this is. */ 1033 * We return the top level (guest-physical) address: we remember where
1034 * this is to write it into lguest_data when the Guest initializes.
1035 */
885 return (unsigned long)pgdir - mem_base; 1036 return (unsigned long)pgdir - mem_base;
886} 1037}
887 1038
888/*H:500 (vii) Setting up the page tables initially. 1039/*H:500
1040 * (vii) Setting up the page tables initially.
889 * 1041 *
890 * When a Guest is first created, the Launcher tells us where the toplevel of 1042 * When a Guest is first created, the Launcher tells us where the toplevel of
891 * its first page table is. We set some things up here: */ 1043 * its first page table is. We set some things up here:
1044 */
892int init_guest_pagetable(struct lguest *lg) 1045int init_guest_pagetable(struct lguest *lg)
893{ 1046{
894 u64 mem; 1047 u64 mem;
@@ -898,21 +1051,27 @@ int init_guest_pagetable(struct lguest *lg)
898 pgd_t *pgd; 1051 pgd_t *pgd;
899 pmd_t *pmd_table; 1052 pmd_t *pmd_table;
900#endif 1053#endif
901 /* Get the Guest memory size and the ramdisk size from the boot header 1054 /*
902 * located at lg->mem_base (Guest address 0). */ 1055 * Get the Guest memory size and the ramdisk size from the boot header
1056 * located at lg->mem_base (Guest address 0).
1057 */
903 if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) 1058 if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
904 || get_user(initrd_size, &boot->hdr.ramdisk_size)) 1059 || get_user(initrd_size, &boot->hdr.ramdisk_size))
905 return -EFAULT; 1060 return -EFAULT;
906 1061
907 /* We start on the first shadow page table, and give it a blank PGD 1062 /*
908 * page. */ 1063 * We start on the first shadow page table, and give it a blank PGD
1064 * page.
1065 */
909 lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); 1066 lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
910 if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) 1067 if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
911 return lg->pgdirs[0].gpgdir; 1068 return lg->pgdirs[0].gpgdir;
912 lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); 1069 lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
913 if (!lg->pgdirs[0].pgdir) 1070 if (!lg->pgdirs[0].pgdir)
914 return -ENOMEM; 1071 return -ENOMEM;
1072
915#ifdef CONFIG_X86_PAE 1073#ifdef CONFIG_X86_PAE
1074 /* For PAE, we also create the initial mid-level. */
916 pgd = lg->pgdirs[0].pgdir; 1075 pgd = lg->pgdirs[0].pgdir;
917 pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); 1076 pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
918 if (!pmd_table) 1077 if (!pmd_table)
@@ -921,27 +1080,33 @@ int init_guest_pagetable(struct lguest *lg)
921 set_pgd(pgd + SWITCHER_PGD_INDEX, 1080 set_pgd(pgd + SWITCHER_PGD_INDEX,
922 __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 1081 __pgd(__pa(pmd_table) | _PAGE_PRESENT));
923#endif 1082#endif
1083
1084 /* This is the current page table. */
924 lg->cpus[0].cpu_pgd = 0; 1085 lg->cpus[0].cpu_pgd = 0;
925 return 0; 1086 return 0;
926} 1087}
927 1088
928/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ 1089/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
929void page_table_guest_data_init(struct lg_cpu *cpu) 1090void page_table_guest_data_init(struct lg_cpu *cpu)
930{ 1091{
931 /* We get the kernel address: above this is all kernel memory. */ 1092 /* We get the kernel address: above this is all kernel memory. */
932 if (get_user(cpu->lg->kernel_address, 1093 if (get_user(cpu->lg->kernel_address,
933 &cpu->lg->lguest_data->kernel_address) 1094 &cpu->lg->lguest_data->kernel_address)
934 /* We tell the Guest that it can't use the top 2 or 4 MB 1095 /*
935 * of virtual addresses used by the Switcher. */ 1096 * We tell the Guest that it can't use the top 2 or 4 MB
1097 * of virtual addresses used by the Switcher.
1098 */
936 || put_user(RESERVE_MEM * 1024 * 1024, 1099 || put_user(RESERVE_MEM * 1024 * 1024,
937 &cpu->lg->lguest_data->reserve_mem) 1100 &cpu->lg->lguest_data->reserve_mem)
938 || put_user(cpu->lg->pgdirs[0].gpgdir, 1101 || put_user(cpu->lg->pgdirs[0].gpgdir,
939 &cpu->lg->lguest_data->pgdir)) 1102 &cpu->lg->lguest_data->pgdir))
940 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 1103 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
941 1104
942 /* In flush_user_mappings() we loop from 0 to 1105 /*
1106 * In flush_user_mappings() we loop from 0 to
943 * "pgd_index(lg->kernel_address)". This assumes it won't hit the 1107 * "pgd_index(lg->kernel_address)". This assumes it won't hit the
944 * Switcher mappings, so check that now. */ 1108 * Switcher mappings, so check that now.
1109 */
945#ifdef CONFIG_X86_PAE 1110#ifdef CONFIG_X86_PAE
946 if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && 1111 if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
947 pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) 1112 pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
@@ -964,12 +1129,14 @@ void free_guest_pagetable(struct lguest *lg)
964 free_page((long)lg->pgdirs[i].pgdir); 1129 free_page((long)lg->pgdirs[i].pgdir);
965} 1130}
966 1131
967/*H:480 (vi) Mapping the Switcher when the Guest is about to run. 1132/*H:480
1133 * (vi) Mapping the Switcher when the Guest is about to run.
968 * 1134 *
969 * The Switcher and the two pages for this CPU need to be visible in the 1135 * The Switcher and the two pages for this CPU need to be visible in the
970 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages 1136 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages
971 * for each CPU already set up, we just need to hook them in now we know which 1137 * for each CPU already set up, we just need to hook them in now we know which
972 * Guest is about to run on this CPU. */ 1138 * Guest is about to run on this CPU.
1139 */
973void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) 1140void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
974{ 1141{
975 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); 1142 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
@@ -980,30 +1147,38 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
980 pmd_t switcher_pmd; 1147 pmd_t switcher_pmd;
981 pmd_t *pmd_table; 1148 pmd_t *pmd_table;
982 1149
1150 /* FIXME: native_set_pmd is overkill here. */
983 native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> 1151 native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >>
984 PAGE_SHIFT, PAGE_KERNEL_EXEC)); 1152 PAGE_SHIFT, PAGE_KERNEL_EXEC));
985 1153
1154 /* Figure out where the pmd page is, by reading the PGD, and converting
1155 * it to a virtual address. */
986 pmd_table = __va(pgd_pfn(cpu->lg-> 1156 pmd_table = __va(pgd_pfn(cpu->lg->
987 pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) 1157 pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
988 << PAGE_SHIFT); 1158 << PAGE_SHIFT);
1159 /* Now write it into the shadow page table. */
989 native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); 1160 native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
990#else 1161#else
991 pgd_t switcher_pgd; 1162 pgd_t switcher_pgd;
992 1163
993 /* Make the last PGD entry for this Guest point to the Switcher's PTE 1164 /*
994 * page for this CPU (with appropriate flags). */ 1165 * Make the last PGD entry for this Guest point to the Switcher's PTE
1166 * page for this CPU (with appropriate flags).
1167 */
995 switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); 1168 switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
996 1169
997 cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; 1170 cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
998 1171
999#endif 1172#endif
1000 /* We also change the Switcher PTE page. When we're running the Guest, 1173 /*
1174 * We also change the Switcher PTE page. When we're running the Guest,
1001 * we want the Guest's "regs" page to appear where the first Switcher 1175 * we want the Guest's "regs" page to appear where the first Switcher
1002 * page for this CPU is. This is an optimization: when the Switcher 1176 * page for this CPU is. This is an optimization: when the Switcher
1003 * saves the Guest registers, it saves them into the first page of this 1177 * saves the Guest registers, it saves them into the first page of this
1004 * CPU's "struct lguest_pages": if we make sure the Guest's register 1178 * CPU's "struct lguest_pages": if we make sure the Guest's register
1005 * page is already mapped there, we don't have to copy them out 1179 * page is already mapped there, we don't have to copy them out
1006 * again. */ 1180 * again.
1181 */
1007 pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; 1182 pfn = __pa(cpu->regs_page) >> PAGE_SHIFT;
1008 native_set_pte(&regs_pte, pfn_pte(pfn, PAGE_KERNEL)); 1183 native_set_pte(&regs_pte, pfn_pte(pfn, PAGE_KERNEL));
1009 native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], 1184 native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)],
@@ -1019,10 +1194,12 @@ static void free_switcher_pte_pages(void)
1019 free_page((long)switcher_pte_page(i)); 1194 free_page((long)switcher_pte_page(i));
1020} 1195}
1021 1196
1022/*H:520 Setting up the Switcher PTE page for given CPU is fairly easy, given 1197/*H:520
1198 * Setting up the Switcher PTE page for given CPU is fairly easy, given
1023 * the CPU number and the "struct page"s for the Switcher code itself. 1199 * the CPU number and the "struct page"s for the Switcher code itself.
1024 * 1200 *
1025 * Currently the Switcher is less than a page long, so "pages" is always 1. */ 1201 * Currently the Switcher is less than a page long, so "pages" is always 1.
1202 */
1026static __init void populate_switcher_pte_page(unsigned int cpu, 1203static __init void populate_switcher_pte_page(unsigned int cpu,
1027 struct page *switcher_page[], 1204 struct page *switcher_page[],
1028 unsigned int pages) 1205 unsigned int pages)
@@ -1043,13 +1220,16 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
1043 native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), 1220 native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
1044 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); 1221 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
1045 1222
1046 /* The second page contains the "struct lguest_ro_state", and is 1223 /*
1047 * read-only. */ 1224 * The second page contains the "struct lguest_ro_state", and is
1225 * read-only.
1226 */
1048 native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), 1227 native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
1049 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); 1228 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
1050} 1229}
1051 1230
1052/* We've made it through the page table code. Perhaps our tired brains are 1231/*
1232 * We've made it through the page table code. Perhaps our tired brains are
1053 * still processing the details, or perhaps we're simply glad it's over. 1233 * still processing the details, or perhaps we're simply glad it's over.
1054 * 1234 *
1055 * If nothing else, note that all this complexity in juggling shadow page tables 1235 * If nothing else, note that all this complexity in juggling shadow page tables
@@ -1058,10 +1238,13 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
1058 * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD 1238 * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD
1059 * have implemented shadow page table support directly into hardware. 1239 * have implemented shadow page table support directly into hardware.
1060 * 1240 *
1061 * There is just one file remaining in the Host. */ 1241 * There is just one file remaining in the Host.
1242 */
1062 1243
1063/*H:510 At boot or module load time, init_pagetables() allocates and populates 1244/*H:510
1064 * the Switcher PTE page for each CPU. */ 1245 * At boot or module load time, init_pagetables() allocates and populates
1246 * the Switcher PTE page for each CPU.
1247 */
1065__init int init_pagetables(struct page **switcher_page, unsigned int pages) 1248__init int init_pagetables(struct page **switcher_page, unsigned int pages)
1066{ 1249{
1067 unsigned int i; 1250 unsigned int i;