author      Jeremy Fitzhardinge <jeremy@goop.org>      2008-07-08 18:07:13 -0400
committer   Ingo Molnar <mingo@elte.hu>                2008-07-16 05:05:38 -0400
commit      d6182fbf04164016cb6540db02eef3d6bdc967c3
tree        53bd4b9b764e9220b978a6506b46455930973f27 /arch/x86
parent      c24481e9da2c7bc8aafab46e0bc64821244a24a6
xen64: allocate and manage user pagetables

Because the x86_64 architecture does not enforce segment limits, Xen
cannot protect itself with them as it does in 32-bit mode. Therefore,
to protect itself, it runs the guest kernel in ring 3. Since it also
runs the guest userspace in ring 3, the guest kernel must maintain a
second pagetable for its userspace, which does not map kernel space.

Naturally, the guest kernel pagetables map both kernel and userspace.
The userspace pagetable is attached to the corresponding kernel
pagetable via the pgd's page->private field. It is allocated and
freed at the same time as the kernel pgd via the
paravirt_pgd_alloc/free hooks.
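In condensed form (simplified from the xen_pgd_alloc()/xen_get_user_pgd() hunks below; the BUG_ON sanity checks, the CONFIG_X86_64 guards and the USER_LIMIT bounds check are dropped here), the attach-and-lookup scheme is:

    /* Allocate: hang a zeroed user pgd page off the kernel pgd's struct page. */
    static int xen_pgd_alloc(struct mm_struct *mm)
    {
            struct page *page = virt_to_page(mm->pgd);

            page->private = __get_free_page(GFP_KERNEL | __GFP_ZERO);
            return page->private ? 0 : -ENOMEM;
    }

    /* Look up: an entry in the kernel pgd has the same slot in the user pgd. */
    pgd_t *xen_get_user_pgd(pgd_t *pgd)
    {
            pgd_t *pgd_page = (pgd_t *)((unsigned long)pgd & PAGE_MASK);
            pgd_t *user = (pgd_t *)virt_to_page(pgd_page)->private;

            return user ? user + (pgd - pgd_page) : NULL;
    }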
Fortunately, the user pagetable is almost entirely shared with the
kernel pagetable; the only difference is the pgd page itself. set_pgd
will populate all entries in the kernel pagetable, and also set the
corresponding user pgd entry if the address is less than
STACK_TOP_MAX.
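Concretely, xen_set_pgd() in the mmu.c hunk below mirrors every kernel pgd update into the user pgd whenever xen_get_user_pgd() finds one; entries at or above the user limit have no user slot and user_ptr is NULL. A lightly trimmed version (the WARN_ON for a pinned user pgd is omitted):

    void xen_set_pgd(pgd_t *ptr, pgd_t val)
    {
            pgd_t *user_ptr = xen_get_user_pgd(ptr);        /* NULL above USER_LIMIT */

            if (!page_pinned(ptr)) {
                    /* Unpinned pagetable: plain stores are enough. */
                    *ptr = val;
                    if (user_ptr)
                            *user_ptr = val;
                    return;
            }

            /* Pinned: batch the kernel and user updates into one multicall. */
            xen_mc_batch();
            __xen_set_pgd_hyper(ptr, val);
            if (user_ptr)
                    __xen_set_pgd_hyper(user_ptr, val);
            xen_mc_issue(PARAVIRT_LAZY_MMU);
    }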
The user pagetable must be pinned and unpinned with the kernel one,
but because the pagetables are aliased, pgd_walk() only needs to be
called on the kernel pagetable. The user pgd page is then
pinned/unpinned along with the kernel pgd page.
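In xen_pgd_pin() (mmu.c hunk below) that comes down to walking the kernel pagetable only up to USER_LIMIT and then pinning both L4 pages; the 64-bit branch, trimmed of the kmap_flush_unused() retry:

    pgd_walk(pgd, pin_page, USER_LIMIT);    /* lower levels are shared, walk once */

    xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));

    if (user_pgd) {                         /* user_pgd = xen_get_user_pgd(pgd) */
            pin_page(virt_to_page(user_pgd), PT_PGD);
            xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
    }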
xen_write_cr3 must write both the kernel and user cr3s.
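The heart of the new xen_write_cr3() (enlighten.c hunk below) is two calls to the __xen_write_cr3() helper in one multicall batch; the percpu bookkeeping and batching boilerplate are left out of this fragment, and an mfn of 0 tells Xen there is no user pagetable:

    __xen_write_cr3(true, cr3);                             /* MMUEXT_NEW_BASEPTR */

    user_pgd = xen_get_user_pgd(__va(cr3));                 /* may be NULL */
    __xen_write_cr3(false, user_pgd ? __pa(user_pgd) : 0);  /* MMUEXT_NEW_USER_BASEPTR */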
The init_mm.pgd pagetable never has a user pagetable allocated for it,
because it can never be used while running in usermode.
One awkward area is that early in boot the page structures are not
yet available. No user pagetable can exist at that point, but avoiding
any access to the page structure complicates the logic.
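This is why xen_setup_kernel_pagetable() (enlighten.c hunk below) switches pagetables through the kernel-only helper instead of xen_write_cr3(), which would try to look up a user pgd through struct page:

    /* Early boot: no struct page yet, so only the kernel base pointer is set. */
    xen_mc_batch();
    __xen_write_cr3(true, __pa(init_level4_pgt));
    xen_mc_issue(PARAVIRT_LAZY_CPU);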
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86')
-rw-r--r--   arch/x86/xen/enlighten.c   99
-rw-r--r--   arch/x86/xen/mmu.c         91
-rw-r--r--   arch/x86/xen/mmu.h          2
3 files changed, 168 insertions, 24 deletions
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c13698faae54..48f1a7eca8b9 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -46,7 +46,6 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/reboot.h>
-#include <asm/pgalloc.h>
 
 #include "xen-ops.h"
 #include "mmu.h"
@@ -711,29 +710,57 @@ static void set_current_cr3(void *v)
 	x86_write_percpu(xen_current_cr3, (unsigned long)v);
 }
 
-static void xen_write_cr3(unsigned long cr3)
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
 {
 	struct mmuext_op *op;
 	struct multicall_space mcs;
-	unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+	unsigned long mfn;
 
-	BUG_ON(preemptible());
+	if (cr3)
+		mfn = pfn_to_mfn(PFN_DOWN(cr3));
+	else
+		mfn = 0;
 
-	mcs = xen_mc_entry(sizeof(*op));  /* disables interrupts */
+	WARN_ON(mfn == 0 && kernel);
 
-	/* Update while interrupts are disabled, so its atomic with
-	   respect to ipis */
-	x86_write_percpu(xen_cr3, cr3);
+	mcs = __xen_mc_entry(sizeof(*op));
 
 	op = mcs.args;
-	op->cmd = MMUEXT_NEW_BASEPTR;
+	op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
 	op->arg1.mfn = mfn;
 
 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 
-	/* Update xen_update_cr3 once the batch has actually
-	   been submitted. */
-	xen_mc_callback(set_current_cr3, (void *)cr3);
+	if (kernel) {
+		x86_write_percpu(xen_cr3, cr3);
+
+		/* Update xen_current_cr3 once the batch has actually
+		   been submitted. */
+		xen_mc_callback(set_current_cr3, (void *)cr3);
+	}
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+	BUG_ON(preemptible());
+
+	xen_mc_batch();  /* disables interrupts */
+
+	/* Update while interrupts are disabled, so its atomic with
+	   respect to ipis */
+	x86_write_percpu(xen_cr3, cr3);
+
+	__xen_write_cr3(true, cr3);
+
+#ifdef CONFIG_X86_64
+	{
+		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+		if (user_pgd)
+			__xen_write_cr3(false, __pa(user_pgd));
+		else
+			__xen_write_cr3(false, 0);
+	}
+#endif
 
 	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
 }
@@ -794,6 +821,40 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
 	xen_alloc_ptpage(mm, pfn, PT_PMD);
 }
 
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+	pgd_t *pgd = mm->pgd;
+	int ret = 0;
+
+	BUG_ON(PagePinned(virt_to_page(pgd)));
+
+#ifdef CONFIG_X86_64
+	{
+		struct page *page = virt_to_page(pgd);
+
+		BUG_ON(page->private != 0);
+
+		page->private = __get_free_page(GFP_KERNEL | __GFP_ZERO);
+		if (page->private == 0)
+			ret = -ENOMEM;
+
+		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+	}
+#endif
+
+	return ret;
+}
+
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+	pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+	if (user_pgd)
+		free_page((unsigned long)user_pgd);
+#endif
+}
+
 /* This should never happen until we're OK to use struct page */
 static void xen_release_ptpage(u32 pfn, unsigned level)
 {
@@ -1168,8 +1229,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.pte_update = paravirt_nop,
 	.pte_update_defer = paravirt_nop,
 
-	.pgd_alloc = __paravirt_pgd_alloc,
-	.pgd_free = paravirt_nop,
+	.pgd_alloc = xen_pgd_alloc,
+	.pgd_free = xen_pgd_free,
 
 	.alloc_pte = xen_alloc_pte_init,
 	.release_pte = xen_release_pte_init,
@@ -1480,7 +1541,15 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf
 
 	/* Switch over */
 	pgd = init_level4_pgt;
-	xen_write_cr3(__pa(pgd));
+
+	/*
+	 * At this stage there can be no user pgd, and no page
+	 * structure to attach it to, so make sure we just set kernel
+	 * pgd.
+	 */
+	xen_mc_batch();
+	__xen_write_cr3(true, __pa(pgd));
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
 
 	reserve_early(__pa(xen_start_info->pt_base),
 		      __pa(xen_start_info->pt_base +
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 046c1f23dd6e..a44d56e38bd1 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -58,6 +58,13 @@
 #include "multicalls.h"
 #include "mmu.h"
 
+/*
+ * Just beyond the highest usermode address. STACK_TOP_MAX has a
+ * redzone above it, so round it up to a PGD boundary.
+ */
+#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
+
+
 #define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
 #define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
 
@@ -461,17 +468,45 @@ pud_t xen_make_pud(pudval_t pud)
 	return native_make_pud(pud);
 }
 
-void xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+pgd_t *xen_get_user_pgd(pgd_t *pgd)
 {
-	struct mmu_update u;
+	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
+	unsigned offset = pgd - pgd_page;
+	pgd_t *user_ptr = NULL;
 
-	preempt_disable();
+	if (offset < pgd_index(USER_LIMIT)) {
+		struct page *page = virt_to_page(pgd_page);
+		user_ptr = (pgd_t *)page->private;
+		if (user_ptr)
+			user_ptr += offset;
+	}
 
-	xen_mc_batch();
+	return user_ptr;
+}
+
+static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+{
+	struct mmu_update u;
 
 	u.ptr = virt_to_machine(ptr).maddr;
 	u.val = pgd_val_ma(val);
 	extend_mmu_update(&u);
+}
+
+/*
+ * Raw hypercall-based set_pgd, intended for in early boot before
+ * there's a page structure. This implies:
+ *  1. The only existing pagetable is the kernel's
+ *  2. It is always pinned
+ *  3. It has no user pagetable attached to it
+ */
+void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+{
+	preempt_disable();
+
+	xen_mc_batch();
+
+	__xen_set_pgd_hyper(ptr, val);
 
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 
@@ -480,14 +515,28 @@ void xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 
 void xen_set_pgd(pgd_t *ptr, pgd_t val)
 {
+	pgd_t *user_ptr = xen_get_user_pgd(ptr);
+
 	/* If page is not pinned, we can just update the entry
 	   directly */
 	if (!page_pinned(ptr)) {
 		*ptr = val;
+		if (user_ptr) {
+			WARN_ON(page_pinned(user_ptr));
+			*user_ptr = val;
+		}
 		return;
 	}
 
-	xen_set_pgd_hyper(ptr, val);
+	/* If it's pinned, then we can at least batch the kernel and
+	   user updates together. */
+	xen_mc_batch();
+
+	__xen_set_pgd_hyper(ptr, val);
+	if (user_ptr)
+		__xen_set_pgd_hyper(user_ptr, val);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
 #endif	/* PAGETABLE_LEVELS == 4 */
 
@@ -526,7 +575,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
 	 * space, which contains the Xen mappings. On 32-bit these
 	 * will end up making a zero-sized hole and so is a no-op.
 	 */
-	hole_low = pgd_index(STACK_TOP_MAX + PGDIR_SIZE - 1);
+	hole_low = pgd_index(USER_LIMIT);
 	hole_high = pgd_index(PAGE_OFFSET);
 
 	pgdidx_limit = pgd_index(limit);
@@ -670,19 +719,31 @@ void xen_pgd_pin(pgd_t *pgd)
 {
 	xen_mc_batch();
 
-	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
+	if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
 		/* re-enable interrupts for kmap_flush_unused */
 		xen_mc_issue(0);
 		kmap_flush_unused();
 		xen_mc_batch();
 	}
 
+#ifdef CONFIG_X86_64
+	{
+		pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
+
+		if (user_pgd) {
+			pin_page(virt_to_page(user_pgd), PT_PGD);
+			xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
+		}
+	}
+#else /* CONFIG_X86_32 */
 #ifdef CONFIG_X86_PAE
 	/* Need to make sure unshared kernel PMD is pinnable */
 	pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
 #endif
-
 	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
+#endif /* CONFIG_X86_64 */
 	xen_mc_issue(0);
 }
 
@@ -763,11 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd)
 
 	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
+#ifdef CONFIG_X86_64
+	{
+		pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+		if (user_pgd) {
+			xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
+			unpin_page(virt_to_page(user_pgd), PT_PGD);
+		}
+	}
+#endif
+
 #ifdef CONFIG_X86_PAE
 	/* Need to make sure unshared kernel PMD is unpinned */
 	pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
 #endif
-	pgd_walk(pgd, unpin_page, TASK_SIZE);
+
+	pgd_walk(pgd, unpin_page, USER_LIMIT);
 
 	xen_mc_issue(0);
 }
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 19d544b0b6c6..0f59bd03f9e3 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -51,6 +51,8 @@ void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
 void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
 #endif
 
+pgd_t *xen_get_user_pgd(pgd_t *pgd);
+
 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 				 pte_t *ptep, pte_t pte);