aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jeremy Fitzhardinge <jeremy@goop.org> 2008-07-08 18:07:13 -0400
committer Ingo Molnar <mingo@elte.hu> 2008-07-16 05:05:38 -0400
commit d6182fbf04164016cb6540db02eef3d6bdc967c3 (patch)
tree 53bd4b9b764e9220b978a6506b46455930973f27
parent c24481e9da2c7bc8aafab46e0bc64821244a24a6 (diff)
xen64: allocate and manage user pagetables
Because the x86_64 architecture does not enforce segment limits, Xen cannot protect itself with them as it does in 32-bit mode. Therefore, to protect itself, it runs the guest kernel in ring 3. Since it also runs the guest userspace in ring 3, the guest kernel must maintain a second pagetable for its userspace, which does not map kernel space. Naturally, the guest kernel pagetables map both kernel and userspace. The userspace pagetable is attached to the corresponding kernel pagetable via the pgd's page->private field. It is allocated and freed at the same time as the kernel pgd via the paravirt_pgd_alloc/free hooks. Fortunately, the user pagetable is almost entirely shared with the kernel pagetable; the only difference is the pgd page itself. set_pgd will populate all entries in the kernel pagetable, and also set the corresponding user pgd entry if the address is less than STACK_TOP_MAX. The user pagetable must be pinned and unpinned with the kernel one, but because the pagetables are aliased, pgd_walk() only needs to be called on the kernel pagetable. The user pgd page is then pinned/unpinned along with the kernel pgd page. xen_write_cr3 must write both the kernel and user cr3s. The init_mm.pgd pagetable never has a user pagetable allocated for it, because it can never be used while running in usermode. One awkward area is that early in boot the page structures are not available. No user pagetable can exist at that point, but it complicates the logic to avoid looking at the page structure. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- arch/x86/xen/enlighten.c 99
-rw-r--r-- arch/x86/xen/mmu.c 91
-rw-r--r-- arch/x86/xen/mmu.h 2
3 files changed, 168 insertions, 24 deletions
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c13698faae54..48f1a7eca8b9 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -46,7 +46,6 @@
46#include <asm/pgtable.h> 46#include <asm/pgtable.h>
47#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/reboot.h> 48#include <asm/reboot.h>
49#include <asm/pgalloc.h>
50 49
51#include "xen-ops.h" 50#include "xen-ops.h"
52#include "mmu.h" 51#include "mmu.h"
@@ -711,29 +710,57 @@ static void set_current_cr3(void *v)
711 x86_write_percpu(xen_current_cr3, (unsigned long)v); 710 x86_write_percpu(xen_current_cr3, (unsigned long)v);
712} 711}
713 712
714static void xen_write_cr3(unsigned long cr3) 713static void __xen_write_cr3(bool kernel, unsigned long cr3)
715{ 714{
716 struct mmuext_op *op; 715 struct mmuext_op *op;
717 struct multicall_space mcs; 716 struct multicall_space mcs;
718 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); 717 unsigned long mfn;
719 718
720 BUG_ON(preemptible()); 719 if (cr3)
720 mfn = pfn_to_mfn(PFN_DOWN(cr3));
721 else
722 mfn = 0;
721 723
722 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ 724 WARN_ON(mfn == 0 && kernel);
723 725
724 /* Update while interrupts are disabled, so its atomic with 726 mcs = __xen_mc_entry(sizeof(*op));
725 respect to ipis */
726 x86_write_percpu(xen_cr3, cr3);
727 727
728 op = mcs.args; 728 op = mcs.args;
729 op->cmd = MMUEXT_NEW_BASEPTR; 729 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
730 op->arg1.mfn = mfn; 730 op->arg1.mfn = mfn;
731 731
732 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 732 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
733 733
734 /* Update xen_update_cr3 once the batch has actually 734 if (kernel) {
735 been submitted. */ 735 x86_write_percpu(xen_cr3, cr3);
736 xen_mc_callback(set_current_cr3, (void *)cr3); 736
737 /* Update xen_current_cr3 once the batch has actually
738 been submitted. */
739 xen_mc_callback(set_current_cr3, (void *)cr3);
740 }
741}
742
743static void xen_write_cr3(unsigned long cr3)
744{
745 BUG_ON(preemptible());
746
747 xen_mc_batch(); /* disables interrupts */
748
749 /* Update while interrupts are disabled, so its atomic with
750 respect to ipis */
751 x86_write_percpu(xen_cr3, cr3);
752
753 __xen_write_cr3(true, cr3);
754
755#ifdef CONFIG_X86_64
756 {
757 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
758 if (user_pgd)
759 __xen_write_cr3(false, __pa(user_pgd));
760 else
761 __xen_write_cr3(false, 0);
762 }
763#endif
737 764
738 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 765 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
739} 766}
@@ -794,6 +821,40 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
794 xen_alloc_ptpage(mm, pfn, PT_PMD); 821 xen_alloc_ptpage(mm, pfn, PT_PMD);
795} 822}
796 823
824static int xen_pgd_alloc(struct mm_struct *mm)
825{
826 pgd_t *pgd = mm->pgd;
827 int ret = 0;
828
829 BUG_ON(PagePinned(virt_to_page(pgd)));
830
831#ifdef CONFIG_X86_64
832 {
833 struct page *page = virt_to_page(pgd);
834
835 BUG_ON(page->private != 0);
836
837 page->private = __get_free_page(GFP_KERNEL | __GFP_ZERO);
838 if (page->private == 0)
839 ret = -ENOMEM;
840
841 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
842 }
843#endif
844
845 return ret;
846}
847
848static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
849{
850#ifdef CONFIG_X86_64
851 pgd_t *user_pgd = xen_get_user_pgd(pgd);
852
853 if (user_pgd)
854 free_page((unsigned long)user_pgd);
855#endif
856}
857
797/* This should never happen until we're OK to use struct page */ 858/* This should never happen until we're OK to use struct page */
798static void xen_release_ptpage(u32 pfn, unsigned level) 859static void xen_release_ptpage(u32 pfn, unsigned level)
799{ 860{
@@ -1168,8 +1229,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1168 .pte_update = paravirt_nop, 1229 .pte_update = paravirt_nop,
1169 .pte_update_defer = paravirt_nop, 1230 .pte_update_defer = paravirt_nop,
1170 1231
1171 .pgd_alloc = __paravirt_pgd_alloc, 1232 .pgd_alloc = xen_pgd_alloc,
1172 .pgd_free = paravirt_nop, 1233 .pgd_free = xen_pgd_free,
1173 1234
1174 .alloc_pte = xen_alloc_pte_init, 1235 .alloc_pte = xen_alloc_pte_init,
1175 .release_pte = xen_release_pte_init, 1236 .release_pte = xen_release_pte_init,
@@ -1480,7 +1541,15 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf
1480 1541
1481 /* Switch over */ 1542 /* Switch over */
1482 pgd = init_level4_pgt; 1543 pgd = init_level4_pgt;
1483 xen_write_cr3(__pa(pgd)); 1544
1545 /*
1546 * At this stage there can be no user pgd, and no page
1547 * structure to attach it to, so make sure we just set kernel
1548 * pgd.
1549 */
1550 xen_mc_batch();
1551 __xen_write_cr3(true, __pa(pgd));
1552 xen_mc_issue(PARAVIRT_LAZY_CPU);
1484 1553
1485 reserve_early(__pa(xen_start_info->pt_base), 1554 reserve_early(__pa(xen_start_info->pt_base),
1486 __pa(xen_start_info->pt_base + 1555 __pa(xen_start_info->pt_base +
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 046c1f23dd6e..a44d56e38bd1 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -58,6 +58,13 @@
58#include "multicalls.h" 58#include "multicalls.h"
59#include "mmu.h" 59#include "mmu.h"
60 60
61/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a
63 * redzone above it, so round it up to a PGD boundary.
64 */
65#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
66
67
61#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) 68#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
62#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) 69#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
63 70
@@ -461,17 +468,45 @@ pud_t xen_make_pud(pudval_t pud)
461 return native_make_pud(pud); 468 return native_make_pud(pud);
462} 469}
463 470
464void xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) 471pgd_t *xen_get_user_pgd(pgd_t *pgd)
465{ 472{
466 struct mmu_update u; 473 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
474 unsigned offset = pgd - pgd_page;
475 pgd_t *user_ptr = NULL;
467 476
468 preempt_disable(); 477 if (offset < pgd_index(USER_LIMIT)) {
478 struct page *page = virt_to_page(pgd_page);
479 user_ptr = (pgd_t *)page->private;
480 if (user_ptr)
481 user_ptr += offset;
482 }
469 483
470 xen_mc_batch(); 484 return user_ptr;
485}
486
487static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
488{
489 struct mmu_update u;
471 490
472 u.ptr = virt_to_machine(ptr).maddr; 491 u.ptr = virt_to_machine(ptr).maddr;
473 u.val = pgd_val_ma(val); 492 u.val = pgd_val_ma(val);
474 extend_mmu_update(&u); 493 extend_mmu_update(&u);
494}
495
496/*
497 * Raw hypercall-based set_pgd, intended for in early boot before
498 * there's a page structure. This implies:
499 * 1. The only existing pagetable is the kernel's
500 * 2. It is always pinned
501 * 3. It has no user pagetable attached to it
502 */
503void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
504{
505 preempt_disable();
506
507 xen_mc_batch();
508
509 __xen_set_pgd_hyper(ptr, val);
475 510
476 xen_mc_issue(PARAVIRT_LAZY_MMU); 511 xen_mc_issue(PARAVIRT_LAZY_MMU);
477 512
@@ -480,14 +515,28 @@ void xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
480 515
481void xen_set_pgd(pgd_t *ptr, pgd_t val) 516void xen_set_pgd(pgd_t *ptr, pgd_t val)
482{ 517{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519
483 /* If page is not pinned, we can just update the entry 520 /* If page is not pinned, we can just update the entry
484 directly */ 521 directly */
485 if (!page_pinned(ptr)) { 522 if (!page_pinned(ptr)) {
486 *ptr = val; 523 *ptr = val;
524 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr));
526 *user_ptr = val;
527 }
487 return; 528 return;
488 } 529 }
489 530
490 xen_set_pgd_hyper(ptr, val); 531 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */
533 xen_mc_batch();
534
535 __xen_set_pgd_hyper(ptr, val);
536 if (user_ptr)
537 __xen_set_pgd_hyper(user_ptr, val);
538
539 xen_mc_issue(PARAVIRT_LAZY_MMU);
491} 540}
492#endif /* PAGETABLE_LEVELS == 4 */ 541#endif /* PAGETABLE_LEVELS == 4 */
493 542
@@ -526,7 +575,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
526 * space, which contains the Xen mappings. On 32-bit these 575 * space, which contains the Xen mappings. On 32-bit these
527 * will end up making a zero-sized hole and so is a no-op. 576 * will end up making a zero-sized hole and so is a no-op.
528 */ 577 */
529 hole_low = pgd_index(STACK_TOP_MAX + PGDIR_SIZE - 1); 578 hole_low = pgd_index(USER_LIMIT);
530 hole_high = pgd_index(PAGE_OFFSET); 579 hole_high = pgd_index(PAGE_OFFSET);
531 580
532 pgdidx_limit = pgd_index(limit); 581 pgdidx_limit = pgd_index(limit);
@@ -670,19 +719,31 @@ void xen_pgd_pin(pgd_t *pgd)
670{ 719{
671 xen_mc_batch(); 720 xen_mc_batch();
672 721
673 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
674 /* re-enable interrupts for kmap_flush_unused */ 723 /* re-enable interrupts for kmap_flush_unused */
675 xen_mc_issue(0); 724 xen_mc_issue(0);
676 kmap_flush_unused(); 725 kmap_flush_unused();
677 xen_mc_batch(); 726 xen_mc_batch();
678 } 727 }
679 728
729#ifdef CONFIG_X86_64
730 {
731 pgd_t *user_pgd = xen_get_user_pgd(pgd);
732
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734
735 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 }
739 }
740#else /* CONFIG_X86_32 */
680#ifdef CONFIG_X86_PAE 741#ifdef CONFIG_X86_PAE
681 /* Need to make sure unshared kernel PMD is pinnable */ 742 /* Need to make sure unshared kernel PMD is pinnable */
682 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
683#endif 744#endif
684
685 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */
686 xen_mc_issue(0); 747 xen_mc_issue(0);
687} 748}
688 749
@@ -763,11 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd)
763 824
764 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 825 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
765 826
827#ifdef CONFIG_X86_64
828 {
829 pgd_t *user_pgd = xen_get_user_pgd(pgd);
830
831 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD);
834 }
835 }
836#endif
837
766#ifdef CONFIG_X86_PAE 838#ifdef CONFIG_X86_PAE
767 /* Need to make sure unshared kernel PMD is unpinned */ 839 /* Need to make sure unshared kernel PMD is unpinned */
768 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 840 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
769#endif 841#endif
770 pgd_walk(pgd, unpin_page, TASK_SIZE); 842
843 pgd_walk(pgd, unpin_page, USER_LIMIT);
771 844
772 xen_mc_issue(0); 845 xen_mc_issue(0);
773} 846}
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 19d544b0b6c6..0f59bd03f9e3 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -51,6 +51,8 @@ void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
51void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd); 51void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
52#endif 52#endif
53 53
54pgd_t *xen_get_user_pgd(pgd_t *pgd);
55
54pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 56pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
55void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 57void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
56 pte_t *ptep, pte_t pte); 58 pte_t *ptep, pte_t pte);