diff options
Diffstat (limited to 'drivers/lguest/page_tables.c')
-rw-r--r-- | drivers/lguest/page_tables.c | 113 |
1 files changed, 72 insertions, 41 deletions
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 2a45f0691c9b..fffabb327157 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c | |||
@@ -26,7 +26,8 @@ | |||
26 | * | 26 | * |
27 | * We use two-level page tables for the Guest. If you're not entirely | 27 | * We use two-level page tables for the Guest. If you're not entirely |
28 | * comfortable with virtual addresses, physical addresses and page tables then | 28 | * comfortable with virtual addresses, physical addresses and page tables then |
29 | * I recommend you review lguest.c's "Page Table Handling" (with diagrams!). | 29 | * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with |
30 | * diagrams!). | ||
30 | * | 31 | * |
31 | * The Guest keeps page tables, but we maintain the actual ones here: these are | 32 | * The Guest keeps page tables, but we maintain the actual ones here: these are |
32 | * called "shadow" page tables. Which is a very Guest-centric name: these are | 33 | * called "shadow" page tables. Which is a very Guest-centric name: these are |
@@ -36,11 +37,11 @@ | |||
36 | * | 37 | * |
37 | * Anyway, this is the most complicated part of the Host code. There are seven | 38 | * Anyway, this is the most complicated part of the Host code. There are seven |
38 | * parts to this: | 39 | * parts to this: |
39 | * (i) Setting up a page table entry for the Guest when it faults, | 40 | * (i) Looking up a page table entry when the Guest faults, |
40 | * (ii) Setting up the page table entry for the Guest stack, | 41 | * (ii) Making sure the Guest stack is mapped, |
41 | * (iii) Setting up a page table entry when the Guest tells us it has changed, | 42 | * (iii) Setting up a page table entry when the Guest tells us one has changed, |
42 | * (iv) Switching page tables, | 43 | * (iv) Switching page tables, |
43 | * (v) Flushing (thowing away) page tables, | 44 | * (v) Flushing (throwing away) page tables, |
44 | * (vi) Mapping the Switcher when the Guest is about to run, | 45 | * (vi) Mapping the Switcher when the Guest is about to run, |
45 | * (vii) Setting up the page tables initially. | 46 | * (vii) Setting up the page tables initially. |
46 | :*/ | 47 | :*/ |
@@ -57,16 +58,15 @@ | |||
57 | static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); | 58 | static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); |
58 | #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) | 59 | #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) |
59 | 60 | ||
60 | /*H:320 With our shadow and Guest types established, we need to deal with | 61 | /*H:320 The page table code is curly enough to need helper functions to keep it |
61 | * them: the page table code is curly enough to need helper functions to keep | 62 | * clear and clean. |
62 | * it clear and clean. | ||
63 | * | 63 | * |
64 | * There are two functions which return pointers to the shadow (aka "real") | 64 | * There are two functions which return pointers to the shadow (aka "real") |
65 | * page tables. | 65 | * page tables. |
66 | * | 66 | * |
67 | * spgd_addr() takes the virtual address and returns a pointer to the top-level | 67 | * spgd_addr() takes the virtual address and returns a pointer to the top-level |
68 | * page directory entry for that address. Since we keep track of several page | 68 | * page directory entry (PGD) for that address. Since we keep track of several |
69 | * tables, the "i" argument tells us which one we're interested in (it's | 69 | * page tables, the "i" argument tells us which one we're interested in (it's |
70 | * usually the current one). */ | 70 | * usually the current one). */ |
71 | static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) | 71 | static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) |
72 | { | 72 | { |
@@ -81,9 +81,9 @@ static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) | |||
81 | return &lg->pgdirs[i].pgdir[index]; | 81 | return &lg->pgdirs[i].pgdir[index]; |
82 | } | 82 | } |
83 | 83 | ||
84 | /* This routine then takes the PGD entry given above, which contains the | 84 | /* This routine then takes the page directory entry returned above, which |
85 | * address of the PTE page. It then returns a pointer to the PTE entry for the | 85 | * contains the address of the page table entry (PTE) page. It then returns a |
86 | * given address. */ | 86 | * pointer to the PTE entry for the given address. */ |
87 | static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) | 87 | static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) |
88 | { | 88 | { |
89 | pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); | 89 | pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); |
@@ -191,7 +191,7 @@ static void check_gpgd(struct lguest *lg, pgd_t gpgd) | |||
191 | } | 191 | } |
192 | 192 | ||
193 | /*H:330 | 193 | /*H:330 |
194 | * (i) Setting up a page table entry for the Guest when it faults | 194 | * (i) Looking up a page table entry when the Guest faults. |
195 | * | 195 | * |
196 | * We saw this call in run_guest(): when we see a page fault in the Guest, we | 196 | * We saw this call in run_guest(): when we see a page fault in the Guest, we |
197 | * come here. That's because we only set up the shadow page tables lazily as | 197 | * come here. That's because we only set up the shadow page tables lazily as |
@@ -199,7 +199,7 @@ static void check_gpgd(struct lguest *lg, pgd_t gpgd) | |||
199 | * and return to the Guest without it knowing. | 199 | * and return to the Guest without it knowing. |
200 | * | 200 | * |
201 | * If we fixed up the fault (ie. we mapped the address), this routine returns | 201 | * If we fixed up the fault (ie. we mapped the address), this routine returns |
202 | * true. */ | 202 | * true. Otherwise, it was a real fault and we need to tell the Guest. */ |
203 | int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | 203 | int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) |
204 | { | 204 | { |
205 | pgd_t gpgd; | 205 | pgd_t gpgd; |
@@ -246,16 +246,16 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | |||
246 | if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) | 246 | if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) |
247 | return 0; | 247 | return 0; |
248 | 248 | ||
249 | /* User access to a kernel page? (bit 3 == user access) */ | 249 | /* User access to a kernel-only page? (bit 3 == user access) */ |
250 | if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) | 250 | if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) |
251 | return 0; | 251 | return 0; |
252 | 252 | ||
253 | /* Check that the Guest PTE flags are OK, and the page number is below | 253 | /* Check that the Guest PTE flags are OK, and the page number is below |
254 | * the pfn_limit (ie. not mapping the Launcher binary). */ | 254 | * the pfn_limit (ie. not mapping the Launcher binary). */ |
255 | check_gpte(lg, gpte); | 255 | check_gpte(lg, gpte); |
256 | |||
256 | /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ | 257 | /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ |
257 | gpte = pte_mkyoung(gpte); | 258 | gpte = pte_mkyoung(gpte); |
258 | |||
259 | if (errcode & 2) | 259 | if (errcode & 2) |
260 | gpte = pte_mkdirty(gpte); | 260 | gpte = pte_mkdirty(gpte); |
261 | 261 | ||
@@ -272,23 +272,28 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | |||
272 | else | 272 | else |
273 | /* If this is a read, don't set the "writable" bit in the page | 273 | /* If this is a read, don't set the "writable" bit in the page |
274 | * table entry, even if the Guest says it's writable. That way | 274 | * table entry, even if the Guest says it's writable. That way |
275 | * we come back here when a write does actually ocur, so we can | 275 | * we will come back here when a write does actually occur, so |
276 | * update the Guest's _PAGE_DIRTY flag. */ | 276 | * we can update the Guest's _PAGE_DIRTY flag. */ |
277 | *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0); | 277 | *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0); |
278 | 278 | ||
279 | /* Finally, we write the Guest PTE entry back: we've set the | 279 | /* Finally, we write the Guest PTE entry back: we've set the |
280 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ | 280 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ |
281 | lgwrite(lg, gpte_ptr, pte_t, gpte); | 281 | lgwrite(lg, gpte_ptr, pte_t, gpte); |
282 | 282 | ||
283 | /* We succeeded in mapping the page! */ | 283 | /* The fault is fixed, the page table is populated, the mapping |
284 | * manipulated, the result returned and the code complete. A small | ||
285 | * delay and a trace of alliteration are the only indications the Guest | ||
286 | * has that a page fault occurred at all. */ | ||
284 | return 1; | 287 | return 1; |
285 | } | 288 | } |
286 | 289 | ||
287 | /*H:360 (ii) Setting up the page table entry for the Guest stack. | 290 | /*H:360 |
291 | * (ii) Making sure the Guest stack is mapped. | ||
288 | * | 292 | * |
289 | * Remember pin_stack_pages() which makes sure the stack is mapped? It could | 293 | * Remember that direct traps into the Guest need a mapped Guest kernel stack. |
290 | * simply call demand_page(), but as we've seen that logic is quite long, and | 294 | * pin_stack_pages() calls us here: we could simply call demand_page(), but as |
291 | * usually the stack pages are already mapped anyway, so it's not required. | 295 | * we've seen that logic is quite long, and usually the stack pages are already |
296 | * mapped, so it's overkill. | ||
292 | * | 297 | * |
293 | * This is a quick version which answers the question: is this virtual address | 298 | * This is a quick version which answers the question: is this virtual address |
294 | * mapped by the shadow page tables, and is it writable? */ | 299 | * mapped by the shadow page tables, and is it writable? */ |
@@ -297,7 +302,7 @@ static int page_writable(struct lguest *lg, unsigned long vaddr) | |||
297 | pgd_t *spgd; | 302 | pgd_t *spgd; |
298 | unsigned long flags; | 303 | unsigned long flags; |
299 | 304 | ||
300 | /* Look at the top level entry: is it present? */ | 305 | /* Look at the current top level entry: is it present? */ |
301 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); | 306 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); |
302 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) | 307 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) |
303 | return 0; | 308 | return 0; |
@@ -333,15 +338,14 @@ static void release_pgd(struct lguest *lg, pgd_t *spgd) | |||
333 | release_pte(ptepage[i]); | 338 | release_pte(ptepage[i]); |
334 | /* Now we can free the page of PTEs */ | 339 | /* Now we can free the page of PTEs */ |
335 | free_page((long)ptepage); | 340 | free_page((long)ptepage); |
336 | /* And zero out the PGD entry we we never release it twice. */ | 341 | /* And zero out the PGD entry so we never release it twice. */ |
337 | *spgd = __pgd(0); | 342 | *spgd = __pgd(0); |
338 | } | 343 | } |
339 | } | 344 | } |
340 | 345 | ||
341 | /*H:440 (v) Flushing (thowing away) page tables, | 346 | /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() |
342 | * | 347 | * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. |
343 | * We saw flush_user_mappings() called when we re-used a top-level pgdir page. | 348 | * It simply releases every PTE page from 0 up to the Guest's kernel address. */ |
344 | * It simply releases every PTE page from 0 up to the kernel address. */ | ||
345 | static void flush_user_mappings(struct lguest *lg, int idx) | 349 | static void flush_user_mappings(struct lguest *lg, int idx) |
346 | { | 350 | { |
347 | unsigned int i; | 351 | unsigned int i; |
@@ -350,8 +354,10 @@ static void flush_user_mappings(struct lguest *lg, int idx) | |||
350 | release_pgd(lg, lg->pgdirs[idx].pgdir + i); | 354 | release_pgd(lg, lg->pgdirs[idx].pgdir + i); |
351 | } | 355 | } |
352 | 356 | ||
353 | /* The Guest also has a hypercall to do this manually: it's used when a large | 357 | /*H:440 (v) Flushing (throwing away) page tables, |
354 | * number of mappings have been changed. */ | 358 | * |
359 | * The Guest has a hypercall to throw away the page tables: it's used when a | ||
360 | * large number of mappings have been changed. */ | ||
355 | void guest_pagetable_flush_user(struct lguest *lg) | 361 | void guest_pagetable_flush_user(struct lguest *lg) |
356 | { | 362 | { |
357 | /* Drop the userspace part of the current page table. */ | 363 | /* Drop the userspace part of the current page table. */ |
@@ -423,8 +429,9 @@ static unsigned int new_pgdir(struct lguest *lg, | |||
423 | 429 | ||
424 | /*H:430 (iv) Switching page tables | 430 | /*H:430 (iv) Switching page tables |
425 | * | 431 | * |
426 | * This is what happens when the Guest changes page tables (ie. changes the | 432 | * Now we've seen all the page table setting and manipulation, let's see what |
427 | * top-level pgdir). This happens on almost every context switch. */ | 433 | * what happens when the Guest changes page tables (ie. changes the top-level |
434 | * pgdir). This occurs on almost every context switch. */ | ||
428 | void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) | 435 | void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) |
429 | { | 436 | { |
430 | int newpgdir, repin = 0; | 437 | int newpgdir, repin = 0; |
@@ -443,7 +450,8 @@ void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) | |||
443 | } | 450 | } |
444 | 451 | ||
445 | /*H:470 Finally, a routine which throws away everything: all PGD entries in all | 452 | /*H:470 Finally, a routine which throws away everything: all PGD entries in all |
446 | * the shadow page tables. This is used when we destroy the Guest. */ | 453 | * the shadow page tables, including the Guest's kernel mappings. This is used |
454 | * when we destroy the Guest. */ | ||
447 | static void release_all_pagetables(struct lguest *lg) | 455 | static void release_all_pagetables(struct lguest *lg) |
448 | { | 456 | { |
449 | unsigned int i, j; | 457 | unsigned int i, j; |
@@ -458,13 +466,22 @@ static void release_all_pagetables(struct lguest *lg) | |||
458 | 466 | ||
459 | /* We also throw away everything when a Guest tells us it's changed a kernel | 467 | /* We also throw away everything when a Guest tells us it's changed a kernel |
460 | * mapping. Since kernel mappings are in every page table, it's easiest to | 468 | * mapping. Since kernel mappings are in every page table, it's easiest to |
461 | * throw them all away. This is amazingly slow, but thankfully rare. */ | 469 | * throw them all away. This traps the Guest in amber for a while as |
470 | * everything faults back in, but it's rare. */ | ||
462 | void guest_pagetable_clear_all(struct lguest *lg) | 471 | void guest_pagetable_clear_all(struct lguest *lg) |
463 | { | 472 | { |
464 | release_all_pagetables(lg); | 473 | release_all_pagetables(lg); |
465 | /* We need the Guest kernel stack mapped again. */ | 474 | /* We need the Guest kernel stack mapped again. */ |
466 | pin_stack_pages(lg); | 475 | pin_stack_pages(lg); |
467 | } | 476 | } |
477 | /*:*/ | ||
478 | /*M:009 Since we throw away all mappings when a kernel mapping changes, our | ||
479 | * performance sucks for guests using highmem. In fact, a guest with | ||
480 | * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is | ||
481 | * usually slower than a Guest with less memory. | ||
482 | * | ||
483 | * This, of course, cannot be fixed. It would take some kind of... well, I | ||
484 | * don't know, but the term "puissant code-fu" comes to mind. :*/ | ||
468 | 485 | ||
469 | /*H:420 This is the routine which actually sets the page table entry for then | 486 | /*H:420 This is the routine which actually sets the page table entry for then |
470 | * "idx"'th shadow page table. | 487 | * "idx"'th shadow page table. |
@@ -483,7 +500,7 @@ void guest_pagetable_clear_all(struct lguest *lg) | |||
483 | static void do_set_pte(struct lguest *lg, int idx, | 500 | static void do_set_pte(struct lguest *lg, int idx, |
484 | unsigned long vaddr, pte_t gpte) | 501 | unsigned long vaddr, pte_t gpte) |
485 | { | 502 | { |
486 | /* Look up the matching shadow page directot entry. */ | 503 | /* Look up the matching shadow page directory entry. */ |
487 | pgd_t *spgd = spgd_addr(lg, idx, vaddr); | 504 | pgd_t *spgd = spgd_addr(lg, idx, vaddr); |
488 | 505 | ||
489 | /* If the top level isn't present, there's no entry to update. */ | 506 | /* If the top level isn't present, there's no entry to update. */ |
@@ -500,7 +517,8 @@ static void do_set_pte(struct lguest *lg, int idx, | |||
500 | *spte = gpte_to_spte(lg, gpte, | 517 | *spte = gpte_to_spte(lg, gpte, |
501 | pte_flags(gpte) & _PAGE_DIRTY); | 518 | pte_flags(gpte) & _PAGE_DIRTY); |
502 | } else | 519 | } else |
503 | /* Otherwise we can demand_page() it in later. */ | 520 | /* Otherwise kill it and we can demand_page() it in |
521 | * later. */ | ||
504 | *spte = __pte(0); | 522 | *spte = __pte(0); |
505 | } | 523 | } |
506 | } | 524 | } |
@@ -535,7 +553,7 @@ void guest_set_pte(struct lguest *lg, | |||
535 | } | 553 | } |
536 | 554 | ||
537 | /*H:400 | 555 | /*H:400 |
538 | * (iii) Setting up a page table entry when the Guest tells us it has changed. | 556 | * (iii) Setting up a page table entry when the Guest tells us one has changed. |
539 | * | 557 | * |
540 | * Just like we did in interrupts_and_traps.c, it makes sense for us to deal | 558 | * Just like we did in interrupts_and_traps.c, it makes sense for us to deal |
541 | * with the other side of page tables while we're here: what happens when the | 559 | * with the other side of page tables while we're here: what happens when the |
@@ -612,9 +630,10 @@ void free_guest_pagetable(struct lguest *lg) | |||
612 | 630 | ||
613 | /*H:480 (vi) Mapping the Switcher when the Guest is about to run. | 631 | /*H:480 (vi) Mapping the Switcher when the Guest is about to run. |
614 | * | 632 | * |
615 | * The Switcher and the two pages for this CPU need to be available to the | 633 | * The Switcher and the two pages for this CPU need to be visible in the |
616 | * Guest (and not the pages for other CPUs). We have the appropriate PTE pages | 634 | * Guest (and not the pages for other CPUs). We have the appropriate PTE pages |
617 | * for each CPU already set up, we just need to hook them in. */ | 635 | * for each CPU already set up, we just need to hook them in now we know which |
636 | * Guest is about to run on this CPU. */ | ||
618 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) | 637 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) |
619 | { | 638 | { |
620 | pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); | 639 | pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); |
@@ -677,6 +696,18 @@ static __init void populate_switcher_pte_page(unsigned int cpu, | |||
677 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); | 696 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); |
678 | } | 697 | } |
679 | 698 | ||
699 | /* We've made it through the page table code. Perhaps our tired brains are | ||
700 | * still processing the details, or perhaps we're simply glad it's over. | ||
701 | * | ||
702 | * If nothing else, note that all this complexity in juggling shadow page | ||
703 | * tables in sync with the Guest's page tables is for one reason: for most | ||
704 | * Guests this page table dance determines how bad performance will be. This | ||
705 | * is why Xen uses exotic direct Guest pagetable manipulation, and why both | ||
706 | * Intel and AMD have implemented shadow page table support directly into | ||
707 | * hardware. | ||
708 | * | ||
709 | * There is just one file remaining in the Host. */ | ||
710 | |||
680 | /*H:510 At boot or module load time, init_pagetables() allocates and populates | 711 | /*H:510 At boot or module load time, init_pagetables() allocates and populates |
681 | * the Switcher PTE page for each CPU. */ | 712 | * the Switcher PTE page for each CPU. */ |
682 | __init int init_pagetables(struct page **switcher_page, unsigned int pages) | 713 | __init int init_pagetables(struct page **switcher_page, unsigned int pages) |