about summary refs log tree commit diff stats
path: root/arch/x86/xen/enlighten.c
diff options
context:
space:
mode:
author    Jeremy Fitzhardinge <jeremy@goop.org>  2008-07-08 18:07:13 -0400
committer Ingo Molnar <mingo@elte.hu>            2008-07-16 05:05:38 -0400
commit d6182fbf04164016cb6540db02eef3d6bdc967c3 (patch)
tree   53bd4b9b764e9220b978a6506b46455930973f27 /arch/x86/xen/enlighten.c
parent c24481e9da2c7bc8aafab46e0bc64821244a24a6 (diff)
xen64: allocate and manage user pagetables
Because the x86_64 architecture does not enforce segment limits, Xen cannot protect itself with them as it does in 32-bit mode. Therefore, to protect itself, it runs the guest kernel in ring 3. Since it also runs the guest userspace in ring 3, the guest kernel must maintain a second pagetable for its userspace, which does not map kernel space. Naturally, the guest kernel pagetables map both kernel and userspace.

The userspace pagetable is attached to the corresponding kernel pagetable via the pgd's page->private field. It is allocated and freed at the same time as the kernel pgd via the paravirt_pgd_alloc/free hooks.

Fortunately, the user pagetable is almost entirely shared with the kernel pagetable; the only difference is the pgd page itself. set_pgd will populate all entries in the kernel pagetable, and also set the corresponding user pgd entry if the address is less than STACK_TOP_MAX.

The user pagetable must be pinned and unpinned with the kernel one, but because the pagetables are aliased, pgd_walk() only needs to be called on the kernel pagetable. The user pgd page is then pinned/unpinned along with the kernel pgd page.

xen_write_cr3 must write both the kernel and user cr3s.

The init_mm.pgd pagetable never has a user pagetable allocated for it, because it can never be used while running usermode.

One awkward area is that early in boot the page structures are not available. No user pagetable can exist at that point, but it complicates the logic to avoid looking at the page structure.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/xen/enlighten.c')
-rw-r--r-- arch/x86/xen/enlighten.c | 99
1 file changed, 84 insertions(+), 15 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c13698faae5..48f1a7eca8b 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -46,7 +46,6 @@
46#include <asm/pgtable.h> 46#include <asm/pgtable.h>
47#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/reboot.h> 48#include <asm/reboot.h>
49#include <asm/pgalloc.h>
50 49
51#include "xen-ops.h" 50#include "xen-ops.h"
52#include "mmu.h" 51#include "mmu.h"
@@ -711,29 +710,57 @@ static void set_current_cr3(void *v)
711 x86_write_percpu(xen_current_cr3, (unsigned long)v); 710 x86_write_percpu(xen_current_cr3, (unsigned long)v);
712} 711}
713 712
714static void xen_write_cr3(unsigned long cr3) 713static void __xen_write_cr3(bool kernel, unsigned long cr3)
715{ 714{
716 struct mmuext_op *op; 715 struct mmuext_op *op;
717 struct multicall_space mcs; 716 struct multicall_space mcs;
718 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); 717 unsigned long mfn;
719 718
720 BUG_ON(preemptible()); 719 if (cr3)
720 mfn = pfn_to_mfn(PFN_DOWN(cr3));
721 else
722 mfn = 0;
721 723
722 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ 724 WARN_ON(mfn == 0 && kernel);
723 725
724 /* Update while interrupts are disabled, so its atomic with 726 mcs = __xen_mc_entry(sizeof(*op));
725 respect to ipis */
726 x86_write_percpu(xen_cr3, cr3);
727 727
728 op = mcs.args; 728 op = mcs.args;
729 op->cmd = MMUEXT_NEW_BASEPTR; 729 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
730 op->arg1.mfn = mfn; 730 op->arg1.mfn = mfn;
731 731
732 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 732 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
733 733
734 /* Update xen_update_cr3 once the batch has actually 734 if (kernel) {
735 been submitted. */ 735 x86_write_percpu(xen_cr3, cr3);
736 xen_mc_callback(set_current_cr3, (void *)cr3); 736
737 /* Update xen_current_cr3 once the batch has actually
738 been submitted. */
739 xen_mc_callback(set_current_cr3, (void *)cr3);
740 }
741}
742
743static void xen_write_cr3(unsigned long cr3)
744{
745 BUG_ON(preemptible());
746
747 xen_mc_batch(); /* disables interrupts */
748
749 /* Update while interrupts are disabled, so its atomic with
750 respect to ipis */
751 x86_write_percpu(xen_cr3, cr3);
752
753 __xen_write_cr3(true, cr3);
754
755#ifdef CONFIG_X86_64
756 {
757 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
758 if (user_pgd)
759 __xen_write_cr3(false, __pa(user_pgd));
760 else
761 __xen_write_cr3(false, 0);
762 }
763#endif
737 764
738 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 765 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
739} 766}
@@ -794,6 +821,40 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
794 xen_alloc_ptpage(mm, pfn, PT_PMD); 821 xen_alloc_ptpage(mm, pfn, PT_PMD);
795} 822}
796 823
824static int xen_pgd_alloc(struct mm_struct *mm)
825{
826 pgd_t *pgd = mm->pgd;
827 int ret = 0;
828
829 BUG_ON(PagePinned(virt_to_page(pgd)));
830
831#ifdef CONFIG_X86_64
832 {
833 struct page *page = virt_to_page(pgd);
834
835 BUG_ON(page->private != 0);
836
837 page->private = __get_free_page(GFP_KERNEL | __GFP_ZERO);
838 if (page->private == 0)
839 ret = -ENOMEM;
840
841 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
842 }
843#endif
844
845 return ret;
846}
847
848static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
849{
850#ifdef CONFIG_X86_64
851 pgd_t *user_pgd = xen_get_user_pgd(pgd);
852
853 if (user_pgd)
854 free_page((unsigned long)user_pgd);
855#endif
856}
857
797/* This should never happen until we're OK to use struct page */ 858/* This should never happen until we're OK to use struct page */
798static void xen_release_ptpage(u32 pfn, unsigned level) 859static void xen_release_ptpage(u32 pfn, unsigned level)
799{ 860{
@@ -1168,8 +1229,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1168 .pte_update = paravirt_nop, 1229 .pte_update = paravirt_nop,
1169 .pte_update_defer = paravirt_nop, 1230 .pte_update_defer = paravirt_nop,
1170 1231
1171 .pgd_alloc = __paravirt_pgd_alloc, 1232 .pgd_alloc = xen_pgd_alloc,
1172 .pgd_free = paravirt_nop, 1233 .pgd_free = xen_pgd_free,
1173 1234
1174 .alloc_pte = xen_alloc_pte_init, 1235 .alloc_pte = xen_alloc_pte_init,
1175 .release_pte = xen_release_pte_init, 1236 .release_pte = xen_release_pte_init,
@@ -1480,7 +1541,15 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf
1480 1541
1481 /* Switch over */ 1542 /* Switch over */
1482 pgd = init_level4_pgt; 1543 pgd = init_level4_pgt;
1483 xen_write_cr3(__pa(pgd)); 1544
1545 /*
1546 * At this stage there can be no user pgd, and no page
1547 * structure to attach it to, so make sure we just set kernel
1548 * pgd.
1549 */
1550 xen_mc_batch();
1551 __xen_write_cr3(true, __pa(pgd));
1552 xen_mc_issue(PARAVIRT_LAZY_CPU);
1484 1553
1485 reserve_early(__pa(xen_start_info->pt_base), 1554 reserve_early(__pa(xen_start_info->pt_base),
1486 __pa(xen_start_info->pt_base + 1555 __pa(xen_start_info->pt_base +