i386: move xen

Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Thomas Gleixner <tglx@linutronix.de> 2007-10-11 05:16:51 -0400
committer: Thomas Gleixner <tglx@linutronix.de> 2007-10-11 05:16:51 -0400
commit: 9702785a747aa27baf46ff504beab6528f21f2dd (patch)
tree: ab69d6f802f5b680c33999dc089e44982c74595d /arch/x86/xen/mmu.c
parent: 334e621a01f86d5bc25e4f742e1eaae6e2d2a97a (diff)
1 files changed, 567 insertions, 0 deletions
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
new file mode 100644
index 000000000000..874db0cd1d2a
--- /dev/null
+++ b/arch/x86/xen/mmu.c
@@ -0,0 +1,567 @@
+/*
+ * Xen mmu operations
+ *
+ * This file contains the various mmu fetch and update operations.
+ * The most important job they must perform is the mapping between the
+ * domain's pfn and the overall machine mfns.
+ *
+ * Xen allows guests to directly update the pagetable, in a controlled
+ * fashion.  In other words, the guest modifies the same pagetable
+ * that the CPU actually uses, which eliminates the overhead of having
+ * a separate shadow pagetable.
+ *
+ * In order to allow this, it falls on the guest domain to map its
+ * notion of a "physical" pfn - which is just a domain-local linear
+ * address - into a real "machine address" which the CPU's MMU can
+ * use.
+ *
+ * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
+ * inserted directly into the pagetable.  When creating a new
+ * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
+ * when reading the content back with __(pgd|pmd|pte)_val, it converts
+ * the mfn back into a pfn.
+ *
+ * The other constraint is that all pages which make up a pagetable
+ * must be mapped read-only in the guest.  This prevents uncontrolled
+ * guest updates to the pagetable.  Xen strictly enforces this, and
+ * will disallow any pagetable update which will end up mapping a
+ * pagetable page RW, and will disallow using any writable page as a
+ * pagetable.
+ *
+ * Naively, when loading %cr3 with the base of a new pagetable, Xen
+ * would need to validate the whole pagetable before going on.
+ * Naturally, this is quite slow.  The solution is to "pin" a
+ * pagetable, which enforces all the constraints on the pagetable even
+ * when it is not actively in use.  This means that Xen can be assured
+ * that it is still valid when you do load it into %cr3, and doesn't
+ * need to revalidate it.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/bug.h>
+#include <linux/sched.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/paravirt.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+#include "multicalls.h"
+#include "mmu.h"
+/*
+ * Translate an arbitrary kernel virtual address into a Xen machine address.
+ *
+ * Looks up the pte that maps @address, takes the machine frame number
+ * stored in it and adds back the byte offset of @address within its page.
+ * BUGs if lookup_address() finds no pte for @address.
+ *
+ * NOTE(review): pte_mfn() << PAGE_SHIFT is evaluated in the width of
+ * pte_mfn()'s return type; confirm this cannot truncate under PAE.
+ */
+xmaddr_t arbitrary_virt_to_machine(unsigned long address)
+{
+        pte_t *pte = lookup_address(address);
+        /* ~PAGE_MASK selects the in-page offset; PAGE_MASK would select the frame bits */
+        unsigned offset = address & ~PAGE_MASK;
+        BUG_ON(pte == NULL);
+        return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
+}
+/*
+ * Make the mapping of @vaddr read-only.
+ *
+ * Looks up the existing pte for @vaddr, write-protects a copy of it with
+ * pte_wrprotect() and installs that copy through
+ * HYPERVISOR_update_va_mapping().  BUGs if no pte is found or if the
+ * hypercall returns non-zero.
+ */
+void make_lowmem_page_readonly(void *vaddr)
+{
+        pte_t *pte, ptev;
+        unsigned long address = (unsigned long)vaddr;
+        pte = lookup_address(address);
+        BUG_ON(pte == NULL);
+        ptev = pte_wrprotect(*pte);
+        if (HYPERVISOR_update_va_mapping(address, ptev, 0))
+                BUG();
+}
+/*
+ * Make the mapping of @vaddr writable.
+ *
+ * Counterpart of make_lowmem_page_readonly(): looks up the existing pte
+ * for @vaddr, sets the write permission on a copy with pte_mkwrite() and
+ * installs it through HYPERVISOR_update_va_mapping().  BUGs if no pte is
+ * found or if the hypercall returns non-zero.
+ */
+void make_lowmem_page_readwrite(void *vaddr)
+{
+        pte_t *pte, ptev;
+        unsigned long address = (unsigned long)vaddr;
+        pte = lookup_address(address);
+        BUG_ON(pte == NULL);
+        ptev = pte_mkwrite(*pte);
+        if (HYPERVISOR_update_va_mapping(address, ptev, 0))
+                BUG();
+}
+/*
+ * Write @val into the pmd entry at @ptr using an mmu_update multicall.
+ *
+ * The entry is identified by its machine address (virt_to_machine(ptr))
+ * and the new value is passed in machine form (pmd_val_ma(val)).
+ * Preemption is disabled while the multicall is queued and issued.
+ */
+void xen_set_pmd(pmd_t *ptr, pmd_t val)
+{
+        struct multicall_space mcs;
+        struct mmu_update *u;
+        preempt_disable();
+        /* reserve a multicall slot with room for one mmu_update argument */
+        mcs = xen_mc_entry(sizeof(*u));
+        u = mcs.args;
+        u->ptr = virt_to_machine(ptr).maddr;
+        u->val = pmd_val_ma(val);
+        MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+        /* NOTE(review): presumably deferred while in lazy MMU mode - confirm in multicalls.h */
+        xen_mc_issue(PARAVIRT_LAZY_MMU);
+        preempt_enable();
+}
+/*
+ * Associate a virtual page frame with a given physical page frame
+ * and protection flags for that frame.
+ *
+ * Walks swapper_pg_dir to the pte for @vaddr, BUGs if the pgd, pud or
+ * pmd level is empty, writes mfn_pte(@mfn, @flags) with xen_set_pte()
+ * and then flushes the TLB entry for @vaddr.
+ */
+void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
+{
+        pgd_t *pgd;
+        pud_t *pud;
+        pmd_t *pmd;
+        pte_t *pte;
+        pgd = swapper_pg_dir + pgd_index(vaddr);
+        if (pgd_none(*pgd)) {
+                BUG();
+                return;
+        }
+        pud = pud_offset(pgd, vaddr);
+        if (pud_none(*pud)) {
+                BUG();
+                return;
+        }
+        pmd = pmd_offset(pud, vaddr);
+        if (pmd_none(*pmd)) {
+                BUG();
+                return;
+        }
+        pte = pte_offset_kernel(pmd, vaddr);
+        /* <mfn,flags> stored as-is, to permit clearing entries */
+        xen_set_pte(pte, mfn_pte(mfn, flags));
+        /*
+         * It's enough to flush this one mapping.
+         * (PGE mappings get flushed as well)
+         */
+        __flush_tlb_one(vaddr);
+}
+/*
+ * Install @pteval at @ptep, the pte that maps @addr in @mm.
+ *
+ * For the current mm or init_mm the update goes through
+ * update_va_mapping: queued as a multicall when xen_get_lazy_mode()
+ * reports PARAVIRT_LAZY_MMU, otherwise issued as a direct hypercall.
+ * For any other mm, or when the direct hypercall returns non-zero, the
+ * pte is written with xen_set_pte() instead.
+ */
+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+                    pte_t *ptep, pte_t pteval)
+{
+        if (mm == current->mm || mm == &init_mm) {
+                if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+                        struct multicall_space mcs;
+                        /* no extra argument space needed for update_va_mapping */
+                        mcs = xen_mc_entry(0);
+                        MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
+                        xen_mc_issue(PARAVIRT_LAZY_MMU);
+                        return;
+                } else
+                        if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
+                                return;
+        }
+        /* other mm, or the hypercall failed: write the entry directly */
+        xen_set_pte(ptep, pteval);
+}
+#ifdef CONFIG_X86_PAE
+/*
+ * Write @val into the pud entry at @ptr using an mmu_update multicall.
+ *
+ * Same sequence as xen_set_pmd(): the entry is identified by its machine
+ * address and the value is passed in machine form (pud_val_ma(val)),
+ * with preemption disabled while the multicall is queued and issued.
+ */
+void xen_set_pud(pud_t *ptr, pud_t val)
+{
+        struct multicall_space mcs;
+        struct mmu_update *u;
+        preempt_disable();
+        /* reserve a multicall slot with room for one mmu_update argument */
+        mcs = xen_mc_entry(sizeof(*u));
+        u = mcs.args;
+        u->ptr = virt_to_machine(ptr).maddr;
+        u->val = pud_val_ma(val);
+        MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+        xen_mc_issue(PARAVIRT_LAZY_MMU);
+        preempt_enable();
+}
+/*
+ * PAE: store @pte at @ptep as two 32-bit halves - the high word first,
+ * then a write barrier, then the low word.
+ */
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+        ptep->pte_high = pte.pte_high;
+        /* order the high-word store before the low-word store */
+        smp_wmb();
+        ptep->pte_low = pte.pte_low;
+}
+/*
+ * PAE: store the machine-format value of @pte at @ptep with a single
+ * set_64bit() write.
+ */
+void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+        set_64bit((u64 *)ptep, pte_val_ma(pte));
+}
+/*
+ * PAE: clear the pte at @ptep by zeroing the low word, issuing a write
+ * barrier and then zeroing the high word.  @mm and @addr are not used.
+ */
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+        ptep->pte_low = 0;
+        smp_wmb();              /* make sure low gets written first */
+        ptep->pte_high = 0;
+}
+/* Clear the pmd entry at @pmdp by writing an empty pmd through xen_set_pmd(). */
+void xen_pmd_clear(pmd_t *pmdp)
+{
+        xen_set_pmd(pmdp, __pmd(0));
+}
+/*
+ * PAE: return the pseudo-physical value of @pte.
+ *
+ * A pte whose low word is zero reads back as 0.  Otherwise both halves
+ * are combined into one 64-bit machine value, translated with
+ * machine_to_phys(), and bit 0 is set in the result.
+ */
+unsigned long long xen_pte_val(pte_t pte)
+{
+        unsigned long long maddr;
+        if (!pte.pte_low)
+                return 0;
+        maddr = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
+        return machine_to_phys(XMADDR(maddr)).paddr | 1;
+}
+/*
+ * PAE: return the pseudo-physical value of @pmd.  An all-zero entry
+ * reads back as 0; anything else is translated with machine_to_phys()
+ * and returned with bit 0 set.
+ */
+unsigned long long xen_pmd_val(pmd_t pmd)
+{
+        unsigned long long raw = pmd.pmd;
+        if (raw == 0)
+                return 0;
+        return machine_to_phys(XMADDR(raw)).paddr | 1;
+}
+/*
+ * PAE: return the pseudo-physical value of @pgd.  An all-zero entry
+ * reads back as 0; anything else is translated with machine_to_phys()
+ * and returned with bit 0 set.
+ */
+unsigned long long xen_pgd_val(pgd_t pgd)
+{
+        unsigned long long raw = pgd.pgd;
+        if (raw == 0)
+                return 0;
+        return machine_to_phys(XMADDR(raw)).paddr | 1;
+}
+/*
+ * PAE: build a pte from the pseudo-physical value @pte.  When bit 0 is
+ * set the value is translated with phys_to_machine() first; otherwise it
+ * is stored unchanged.  The 64-bit result is split into low/high words.
+ */
+pte_t xen_make_pte(unsigned long long pte)
+{
+        unsigned long long entry = pte;
+        if ((entry & 1) != 0)
+                entry = phys_to_machine(XPADDR(entry)).maddr;
+        return (pte_t){ entry, entry >> 32 };
+}
+/*
+ * PAE: build a pmd from the pseudo-physical value @pmd.  When bit 0 is
+ * set the value is translated with phys_to_machine() first; otherwise it
+ * is stored unchanged.
+ */
+pmd_t xen_make_pmd(unsigned long long pmd)
+{
+        unsigned long long entry = pmd;
+        if ((entry & 1) != 0)
+                entry = phys_to_machine(XPADDR(entry)).maddr;
+        return (pmd_t){ entry };
+}
+/*
+ * PAE: build a pgd from the pseudo-physical value @pgd.  When
+ * _PAGE_PRESENT is set the value is translated with phys_to_machine()
+ * first; otherwise it is stored unchanged.
+ */
+pgd_t xen_make_pgd(unsigned long long pgd)
+{
+        unsigned long long entry = pgd;
+        if ((entry & _PAGE_PRESENT) != 0)
+                entry = phys_to_machine(XPADDR(entry)).maddr;
+        return (pgd_t){ entry };
+}
+#else  /* !PAE */
+/* Non-PAE: store @pte at @ptep with a single structure assignment. */
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+        *ptep = pte;
+}
+/*
+ * Non-PAE: return the pseudo-physical value of @pte.  Entries without
+ * _PAGE_PRESENT are returned untouched; present entries are translated
+ * with machine_to_phys().
+ */
+unsigned long xen_pte_val(pte_t pte)
+{
+        unsigned long raw = pte.pte_low;
+        if ((raw & _PAGE_PRESENT) == 0)
+                return raw;
+        return machine_to_phys(XMADDR(raw)).paddr;
+}
+/*
+ * Non-PAE: return the pseudo-physical value of @pgd.  An all-zero entry
+ * reads back as 0; anything else is translated with machine_to_phys()
+ * and returned with bit 0 set.
+ */
+unsigned long xen_pgd_val(pgd_t pgd)
+{
+        unsigned long raw = pgd.pgd;
+        if (raw == 0)
+                return 0;
+        return machine_to_phys(XMADDR(raw)).paddr | 1;
+}
+/*
+ * Non-PAE: build a pte from the pseudo-physical value @pte.  When
+ * _PAGE_PRESENT is set the value is translated with phys_to_machine()
+ * first; otherwise it is stored unchanged.
+ */
+pte_t xen_make_pte(unsigned long pte)
+{
+        unsigned long entry = pte;
+        if ((entry & _PAGE_PRESENT) != 0)
+                entry = phys_to_machine(XPADDR(entry)).maddr;
+        return (pte_t){ entry };
+}
+/*
+ * Non-PAE: build a pgd from the pseudo-physical value @pgd.  When
+ * _PAGE_PRESENT is set the value is translated with phys_to_machine()
+ * first; otherwise it is stored unchanged.
+ */
+pgd_t xen_make_pgd(unsigned long pgd)
+{
+        unsigned long entry = pgd;
+        if ((entry & _PAGE_PRESENT) != 0)
+                entry = phys_to_machine(XPADDR(entry)).maddr;
+        return (pgd_t){ entry };
+}
+#endif  /* CONFIG_X86_PAE */
+/*
+  (Yet another) pagetable walker.  This one is intended for pinning a
+  pagetable.  This means that it walks a pagetable and calls the
+  callback function on each page it finds making up the page table,
+  at every level.  It walks the entire pagetable, but it only bothers
+  pinning pte pages which are below limit.  In the normal case
+  this will be TASK_SIZE, but at boot we need to pin up to
+  FIXADDR_TOP.  But the important bit is that we don't pin beyond
+  there, because then we start getting into Xen's ptes.
+
+  func is called with flags 0 for each pud, pmd and pte page visited,
+  and finally with UVMF_TLB_FLUSH for the pgd page itself.  The return
+  values of all calls are ORed together and returned.  Returns 0
+  without walking when XENFEAT_auto_translated_physmap is set, and
+  BUGs if limit is above FIXADDR_TOP.
+*/
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+                    unsigned long limit)
+{
+        pgd_t *pgd = pgd_base;
+        int flush = 0;
+        unsigned long addr = 0;
+        unsigned long pgd_next;
+        BUG_ON(limit > FIXADDR_TOP);
+        if (xen_feature(XENFEAT_auto_translated_physmap))
+                return 0;
+        for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
+                pud_t *pud;
+                unsigned long pud_limit, pud_next;
+                pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
+                if (!pgd_val(*pgd))
+                        continue;
+                pud = pud_offset(pgd, 0);
+                if (PTRS_PER_PUD > 1) /* not folded */
+                        flush |= (*func)(virt_to_page(pud), 0);
+                for (; addr != pud_limit; pud++, addr = pud_next) {
+                        pmd_t *pmd;
+                        unsigned long pmd_limit;
+                        pud_next = pud_addr_end(addr, pud_limit);
+                        /* pte pages are only visited below limit */
+                        if (pud_next < limit)
+                                pmd_limit = pud_next;
+                        else
+                                pmd_limit = limit;
+                        if (pud_none(*pud))
+                                continue;
+                        pmd = pmd_offset(pud, 0);
+                        if (PTRS_PER_PMD > 1) /* not folded */
+                                flush |= (*func)(virt_to_page(pmd), 0);
+                        for (; addr != pmd_limit; pmd++) {
+                                addr += (PAGE_SIZE * PTRS_PER_PTE);
+                                /* NOTE(review): the -1 on both sides presumably copes with addr wrapping to 0 - confirm */
+                                if ((pmd_limit-1) < (addr-1)) {
+                                        addr = pmd_limit;
+                                        break;
+                                }
+                                if (pmd_none(*pmd))
+                                        continue;
+                                flush |= (*func)(pmd_page(*pmd), 0);
+                        }
+                }
+        }
+        /* the pgd page itself goes last, carrying the TLB flush flag */
+        flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+        return flush;
+}
+/*
+ * pgd_walk() callback used for pinning.
+ *
+ * Atomically sets PG_pinned on @page.  For a newly pinned lowmem page it
+ * queues a multicall that remaps the page's lowmem address with
+ * PAGE_KERNEL_RO, passing @flags to update_va_mapping.
+ *
+ * Returns 1 only for a newly pinned highmem page (kmaps need flushing),
+ * 0 otherwise.
+ */
+static int pin_page(struct page *page, unsigned flags)
+{
+        unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
+        int flush;
+        if (pgfl)
+                flush = 0;              /* already pinned */
+        else if (PageHighMem(page))
+                /* kmaps need flushing if we found an unpinned
+                   highpage */
+                flush = 1;
+        else {
+                void *pt = lowmem_page_address(page);
+                unsigned long pfn = page_to_pfn(page);
+                struct multicall_space mcs = __xen_mc_entry(0);
+                flush = 0;
+                MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+                                        pfn_pte(pfn, PAGE_KERNEL_RO),
+                                        flags);
+        }
+        return flush;
+}
+/* This is called just after a mm has been created, but it has not
+   been used yet.  We need to make sure that its pagetable is all
+   read-only, and can be pinned.
+
+   Walks the pagetable up to TASK_SIZE with pin_page() inside a
+   multicall batch.  If the walk reports that a flush is needed, the
+   batch is issued, kmap_flush_unused() is called and a new batch is
+   started.  Finally a pin operation for the pgd's mfn is queued
+   (L3 table under PAE, L2 otherwise) and the batch is issued. */
+void xen_pgd_pin(pgd_t *pgd)
+{
+        struct multicall_space mcs;
+        struct mmuext_op *op;
+        xen_mc_batch();
+        if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
+                /* re-enable interrupts for kmap_flush_unused */
+                xen_mc_issue(0);
+                kmap_flush_unused();
+                xen_mc_batch();
+        }
+        mcs = __xen_mc_entry(sizeof(*op));
+        op = mcs.args;
+#ifdef CONFIG_X86_PAE
+        op->cmd = MMUEXT_PIN_L3_TABLE;
+#else
+        op->cmd = MMUEXT_PIN_L2_TABLE;
+#endif
+        op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+        xen_mc_issue(0);
+}
+/* The init_mm pagetable is really pinned as soon as it's created, but
+   that's before we have page structures to store the bits.  So do all
+   the book-keeping now.
+
+   pgd_walk() callback: sets the pinned flag on @page, ignores @flags
+   and always returns 0 (no flush requested). */
+static __init int mark_pinned(struct page *page, unsigned flags)
+{
+        SetPagePinned(page);
+        return 0;
+}
+/* Walk init_mm's pagetable up to FIXADDR_TOP, flagging each of its
+   pages as pinned via mark_pinned(). */
+void __init xen_mark_init_mm_pinned(void)
+{
+        pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
+}
+/*
+ * pgd_walk() callback used for unpinning.
+ *
+ * Atomically clears PG_pinned on @page.  If the page was pinned and is
+ * not highmem, queues a multicall that remaps its lowmem address with
+ * PAGE_KERNEL, passing @flags to update_va_mapping.  Always returns 0.
+ */
+static int unpin_page(struct page *page, unsigned flags)
+{
+        unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
+        if (pgfl && !PageHighMem(page)) {
+                void *pt = lowmem_page_address(page);
+                unsigned long pfn = page_to_pfn(page);
+                struct multicall_space mcs = __xen_mc_entry(0);
+                MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+                                        pfn_pte(pfn, PAGE_KERNEL),
+                                        flags);
+        }
+        return 0;               /* never need to flush on unpin */
+}
+/* Release a pagetable's pages back as normal RW.
+
+   Within one multicall batch: first queue MMUEXT_UNPIN_TABLE for the
+   pgd's mfn, then walk the pagetable up to TASK_SIZE with unpin_page()
+   to queue the read-write remaps, then issue the batch. */
+static void xen_pgd_unpin(pgd_t *pgd)
+{
+        struct mmuext_op *op;
+        struct multicall_space mcs;
+        xen_mc_batch();
+        mcs = __xen_mc_entry(sizeof(*op));
+        op = mcs.args;
+        op->cmd = MMUEXT_UNPIN_TABLE;
+        op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+        pgd_walk(pgd, unpin_page, TASK_SIZE);
+        xen_mc_issue(0);
+}
+/* Pin @next's pagetable while holding next->page_table_lock.
+   @prev is not used. */
+void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
+{
+        spin_lock(&next->page_table_lock);
+        xen_pgd_pin(next->pgd);
+        spin_unlock(&next->page_table_lock);
+}
+/* Pin the pagetable of the new @mm while holding mm->page_table_lock.
+   @oldmm is not used. */
+void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+        spin_lock(&mm->page_table_lock);
+        xen_pgd_pin(mm->pgd);
+        spin_unlock(&mm->page_table_lock);
+}
+#ifdef CONFIG_SMP
+/* Another cpu may still have their %cr3 pointing at the pagetable, so
+   we need to repoint it somewhere else before we can unpin it.
+
+   Cross-call handler: @info is the mm being torn down.  If this CPU's
+   cpu_tlbstate.active_mm is that mm, call leave_mm() for this CPU. */
+static void drop_other_mm_ref(void *info)
+{
+        struct mm_struct *mm = info;
+        if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
+                leave_mm(smp_processor_id());
+}
+/*
+ * SMP: stop CPUs from using @mm's pagetable.
+ *
+ * If @mm is the current task's active_mm, either load swapper_pg_dir
+ * into %cr3 (when it is also current->mm) or call leave_mm() for this
+ * CPU.  Then, if mm->cpu_vm_mask is non-empty, run drop_other_mm_ref()
+ * on the CPUs in that mask.
+ */
+static void drop_mm_ref(struct mm_struct *mm)
+{
+        if (current->active_mm == mm) {
+                if (current->mm == mm)
+                        load_cr3(swapper_pg_dir);
+                else
+                        leave_mm(smp_processor_id());
+        }
+        /* NOTE(review): the final argument 1 is presumably "wait for completion" - confirm */
+        if (!cpus_empty(mm->cpu_vm_mask))
+                xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
+                                           mm, 1);
+}
+#else
+/* UP: if @mm is the current task's active_mm, switch %cr3 to
+   swapper_pg_dir. */
+static void drop_mm_ref(struct mm_struct *mm)
+{
+        if (current->active_mm == mm)
+                load_cr3(swapper_pg_dir);
+}
+#endif
+/*
+ * While a process runs, Xen pins its pagetables, which means that the
+ * hypervisor forces it to be read-only, and it controls all updates
+ * to it.  This means that all pagetable updates have to go via the
+ * hypervisor, which is moderately expensive.
+ *
+ * Since we're pulling the pagetable down, we switch to use init_mm,
+ * unpin old process pagetable and mark it all read-write, which
+ * allows further operations on it to be simple memory accesses.
+ *
+ * The only subtle point is that another CPU may be still using the
+ * pagetable because of lazy tlb flushing.  This means we need to
+ * switch all CPUs off this pagetable before we can unpin it.
+ *
+ * Sequence: drop all references to @mm via drop_mm_ref() between
+ * get_cpu()/put_cpu(), then, under mm->page_table_lock, unpin the pgd
+ * if its page is flagged as pinned.
+ */
+void xen_exit_mmap(struct mm_struct *mm)
+{
+        get_cpu();              /* make sure we don't move around */
+        drop_mm_ref(mm);
+        put_cpu();
+        spin_lock(&mm->page_table_lock);
+        /* pgd may not be pinned in the error exit path of execve */
+        if (PagePinned(virt_to_page(mm->pgd)))
+                xen_pgd_unpin(mm->pgd);
+        spin_unlock(&mm->page_table_lock);
+}
author	Thomas Gleixner <tglx@linutronix.de>	2007-10-11 05:16:51 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2007-10-11 05:16:51 -0400
commit	9702785a747aa27baf46ff504beab6528f21f2dd (patch)
tree	ab69d6f802f5b680c33999dc089e44982c74595d /arch/x86/xen/mmu.c
parent	334e621a01f86d5bc25e4f742e1eaae6e2d2a97a (diff)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c new file mode 100644 index 000000000000..874db0cd1d2a --- /dev/null +++ b/arch/x86/xen/mmu.c
@@ -0,0 +1,567 @@
	1	/*
	2	* Xen mmu operations
	3	*
	4	* This file contains the various mmu fetch and update operations.
	5	* The most important job they must perform is the mapping between the
	6	* domain's pfn and the overall machine mfns.
	7	*
	8	* Xen allows guests to directly update the pagetable, in a controlled
	9	* fashion. In other words, the guest modifies the same pagetable
	10	* that the CPU actually uses, which eliminates the overhead of having
	11	* a separate shadow pagetable.
	12	*
	13	* In order to allow this, it falls on the guest domain to map its
	14	* notion of a "physical" pfn - which is just a domain-local linear
	15	* address - into a real "machine address" which the CPU's MMU can
	16	* use.
	17	*
	18	* A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
	19	* inserted directly into the pagetable. When creating a new
	20	* pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
	21	* when reading the content back with __(pgd|pmd|pte)_val, it converts
	22	* the mfn back into a pfn.
	23	*
	24	* The other constraint is that all pages which make up a pagetable
	25	* must be mapped read-only in the guest. This prevents uncontrolled
	26	* guest updates to the pagetable. Xen strictly enforces this, and
	27	* will disallow any pagetable update which will end up mapping a
	28	* pagetable page RW, and will disallow using any writable page as a
	29	* pagetable.
	30	*
	31	* Naively, when loading %cr3 with the base of a new pagetable, Xen
	32	* would need to validate the whole pagetable before going on.
	33	* Naturally, this is quite slow. The solution is to "pin" a
	34	* pagetable, which enforces all the constraints on the pagetable even
	35	* when it is not actively in use. This means that Xen can be assured
	36	* that it is still valid when you do load it into %cr3, and doesn't
	37	* need to revalidate it.
	38	*
	39	* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
	40	*/
	41	#include <linux/sched.h>
	42	#include <linux/highmem.h>
	43	#include <linux/bug.h>
	44	#include <linux/sched.h>
	45
	46	#include <asm/pgtable.h>
	47	#include <asm/tlbflush.h>
	48	#include <asm/mmu_context.h>
	49	#include <asm/paravirt.h>
	50
	51	#include <asm/xen/hypercall.h>
	52	#include <asm/xen/hypervisor.h>
	53
	54	#include <xen/page.h>
	55	#include <xen/interface/xen.h>
	56
	57	#include "multicalls.h"
	58	#include "mmu.h"
	59
	60	xmaddr_t arbitrary_virt_to_machine(unsigned long address)
	61	{
	62	pte_t *pte = lookup_address(address);
	63	unsigned offset = address & ~PAGE_MASK; /* in-page offset; PAGE_MASK alone would keep the frame bits */
	64
	65	BUG_ON(pte == NULL);
	66
	67	return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
	68	}
	69
	70	void make_lowmem_page_readonly(void *vaddr)
	71	{
	72	pte_t *pte, ptev;
	73	unsigned long address = (unsigned long)vaddr;
	74
	75	pte = lookup_address(address);
	76	BUG_ON(pte == NULL);
	77
	78	ptev = pte_wrprotect(*pte);
	79
	80	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
	81	BUG();
	82	}
	83
	84	void make_lowmem_page_readwrite(void *vaddr)
	85	{
	86	pte_t *pte, ptev;
	87	unsigned long address = (unsigned long)vaddr;
	88
	89	pte = lookup_address(address);
	90	BUG_ON(pte == NULL);
	91
	92	ptev = pte_mkwrite(*pte);
	93
	94	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
	95	BUG();
	96	}
	97
	98
	99	void xen_set_pmd(pmd_t *ptr, pmd_t val)
	100	{
	101	struct multicall_space mcs;
	102	struct mmu_update *u;
	103
	104	preempt_disable();
	105
	106	mcs = xen_mc_entry(sizeof(*u));
	107	u = mcs.args;
	108	u->ptr = virt_to_machine(ptr).maddr;
	109	u->val = pmd_val_ma(val);
	110	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
	111
	112	xen_mc_issue(PARAVIRT_LAZY_MMU);
	113
	114	preempt_enable();
	115	}
	116
	117	/*
	118	* Associate a virtual page frame with a given physical page frame
	119	* and protection flags for that frame.
	120	*/
	121	void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
	122	{
	123	pgd_t *pgd;
	124	pud_t *pud;
	125	pmd_t *pmd;
	126	pte_t *pte;
	127
	128	pgd = swapper_pg_dir + pgd_index(vaddr);
	129	if (pgd_none(*pgd)) {
	130	BUG();
	131	return;
	132	}
	133	pud = pud_offset(pgd, vaddr);
	134	if (pud_none(*pud)) {
	135	BUG();
	136	return;
	137	}
	138	pmd = pmd_offset(pud, vaddr);
	139	if (pmd_none(*pmd)) {
	140	BUG();
	141	return;
	142	}
	143	pte = pte_offset_kernel(pmd, vaddr);
	144	/* <mfn,flags> stored as-is, to permit clearing entries */
	145	xen_set_pte(pte, mfn_pte(mfn, flags));
	146
	147	/*
	148	* It's enough to flush this one mapping.
	149	* (PGE mappings get flushed as well)
	150	*/
	151	__flush_tlb_one(vaddr);
	152	}
	153
	154	void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
	155	pte_t *ptep, pte_t pteval)
	156	{
	157	if (mm == current->mm || mm == &init_mm) {
	158	if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
	159	struct multicall_space mcs;
	160	mcs = xen_mc_entry(0);
	161
	162	MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
	163	xen_mc_issue(PARAVIRT_LAZY_MMU);
	164	return;
	165	} else
	166	if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
	167	return;
	168	}
	169	xen_set_pte(ptep, pteval); /* other mm, or the hypercall failed */
	170	}
	171
	172	#ifdef CONFIG_X86_PAE
	173	void xen_set_pud(pud_t *ptr, pud_t val)
	174	{
	175	struct multicall_space mcs;
	176	struct mmu_update *u;
	177
	178	preempt_disable();
	179
	180	mcs = xen_mc_entry(sizeof(*u));
	181	u = mcs.args;
	182	u->ptr = virt_to_machine(ptr).maddr;
	183	u->val = pud_val_ma(val);
	184	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
	185
	186	xen_mc_issue(PARAVIRT_LAZY_MMU);
	187
	188	preempt_enable();
	189	}
	190
	191	void xen_set_pte(pte_t *ptep, pte_t pte)
	192	{
	193	ptep->pte_high = pte.pte_high;
	194	smp_wmb();
	195	ptep->pte_low = pte.pte_low;
	196	}
	197
	198	void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
	199	{
	200	set_64bit((u64 *)ptep, pte_val_ma(pte));
	201	}
	202
	203	void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
	204	{
	205	ptep->pte_low = 0;
	206	smp_wmb(); /* make sure low gets written first */
	207	ptep->pte_high = 0;
	208	}
	209
	210	void xen_pmd_clear(pmd_t *pmdp)
	211	{
	212	xen_set_pmd(pmdp, __pmd(0));
	213	}
	214
	215	unsigned long long xen_pte_val(pte_t pte)
	216	{
	217	unsigned long long ret = 0;
	218
	219	if (pte.pte_low) {
	220	ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low; /* join both halves */
	221	ret = machine_to_phys(XMADDR(ret)).paddr | 1;
	222	}
	223
	224	return ret;
	225	}
	226
	227	unsigned long long xen_pmd_val(pmd_t pmd)
	228	{
	229	unsigned long long ret = pmd.pmd;
	230	if (ret) /* an empty entry reads back as 0 */
	231	ret = machine_to_phys(XMADDR(ret)).paddr | 1;
	232	return ret;
	233	}
	234
	235	unsigned long long xen_pgd_val(pgd_t pgd)
	236	{
	237	unsigned long long ret = pgd.pgd;
	238	if (ret) /* an empty entry reads back as 0 */
	239	ret = machine_to_phys(XMADDR(ret)).paddr | 1;
	240	return ret;
	241	}
	242
	243	pte_t xen_make_pte(unsigned long long pte)
	244	{
	245	if (pte & 1)
	246	pte = phys_to_machine(XPADDR(pte)).maddr;
	247
	248	return (pte_t){ pte, pte >> 32 };
	249	}
	250
	251	pmd_t xen_make_pmd(unsigned long long pmd)
	252	{
	253	if (pmd & 1)
	254	pmd = phys_to_machine(XPADDR(pmd)).maddr;
	255
	256	return (pmd_t){ pmd };
	257	}
	258
	259	pgd_t xen_make_pgd(unsigned long long pgd)
	260	{
	261	if (pgd & _PAGE_PRESENT)
	262	pgd = phys_to_machine(XPADDR(pgd)).maddr;
	263
	264	return (pgd_t){ pgd };
	265	}
	266	#else /* !PAE */
	267	void xen_set_pte(pte_t *ptep, pte_t pte)
	268	{
	269	*ptep = pte;
	270	}
	271
	272	unsigned long xen_pte_val(pte_t pte)
	273	{
	274	unsigned long ret = pte.pte_low;
	275
	276	if (ret & _PAGE_PRESENT)
	277	ret = machine_to_phys(XMADDR(ret)).paddr;
	278
	279	return ret;
	280	}
	281
	282	unsigned long xen_pgd_val(pgd_t pgd)
	283	{
	284	unsigned long ret = pgd.pgd;
	285	if (ret) /* an empty entry reads back as 0 */
	286	ret = machine_to_phys(XMADDR(ret)).paddr | 1;
	287	return ret;
	288	}
	289
	290	pte_t xen_make_pte(unsigned long pte)
	291	{
	292	if (pte & _PAGE_PRESENT)
	293	pte = phys_to_machine(XPADDR(pte)).maddr;
	294
	295	return (pte_t){ pte };
	296	}
	297
	298	pgd_t xen_make_pgd(unsigned long pgd)
	299	{
	300	if (pgd & _PAGE_PRESENT)
	301	pgd = phys_to_machine(XPADDR(pgd)).maddr;
	302
	303	return (pgd_t){ pgd };
	304	}
	305	#endif /* CONFIG_X86_PAE */
	306
	307
	308
	309	/*
	310	(Yet another) pagetable walker. This one is intended for pinning a
	311	pagetable. This means that it walks a pagetable and calls the
	312	callback function on each page it finds making up the page table,
	313	at every level. It walks the entire pagetable, but it only bothers
	314	pinning pte pages which are below limit. In the normal case
	315	this will be TASK_SIZE, but at boot we need to pin up to
	316	FIXADDR_TOP. But the important bit is that we don't pin beyond
	317	there, because then we start getting into Xen's ptes.
	318	*/
	319	static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
	320	unsigned long limit)
	321	{
	322	pgd_t *pgd = pgd_base;
	323	int flush = 0;
	324	unsigned long addr = 0;
	325	unsigned long pgd_next;
	326
	327	BUG_ON(limit > FIXADDR_TOP);
	328
	329	if (xen_feature(XENFEAT_auto_translated_physmap))
	330	return 0;
	331
	332	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
	333	pud_t *pud;
	334	unsigned long pud_limit, pud_next;
	335
	336	pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
	337
	338	if (!pgd_val(*pgd))
	339	continue;
	340
	341	pud = pud_offset(pgd, 0);
	342
	343	if (PTRS_PER_PUD > 1) /* not folded */
	344	flush |= (*func)(virt_to_page(pud), 0);
	345
	346	for (; addr != pud_limit; pud++, addr = pud_next) {
	347	pmd_t *pmd;
	348	unsigned long pmd_limit;
	349
	350	pud_next = pud_addr_end(addr, pud_limit);
	351
	352	if (pud_next < limit)
	353	pmd_limit = pud_next;
	354	else
	355	pmd_limit = limit;
	356
	357	if (pud_none(*pud))
	358	continue;
	359
	360	pmd = pmd_offset(pud, 0);
	361
	362	if (PTRS_PER_PMD > 1) /* not folded */
	363	flush |= (*func)(virt_to_page(pmd), 0);
	364
	365	for (; addr != pmd_limit; pmd++) {
	366	addr += (PAGE_SIZE * PTRS_PER_PTE);
	367	if ((pmd_limit-1) < (addr-1)) {
	368	addr = pmd_limit;
	369	break;
	370	}
	371
	372	if (pmd_none(*pmd))
	373	continue;
	374
	375	flush |= (*func)(pmd_page(*pmd), 0);
	376	}
	377	}
	378	}
	379
	380	flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); /* pgd page goes last */
	381
	382	return flush;
	383	}
	384
	385	static int pin_page(struct page *page, unsigned flags)
	386	{
	387	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
	388	int flush;
	389
	390	if (pgfl)
	391	flush = 0; /* already pinned */
	392	else if (PageHighMem(page))
	393	/* kmaps need flushing if we found an unpinned
	394	highpage */
	395	flush = 1;
	396	else {
	397	void *pt = lowmem_page_address(page);
	398	unsigned long pfn = page_to_pfn(page);
	399	struct multicall_space mcs = __xen_mc_entry(0);
	400
	401	flush = 0;
	402
	403	MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
	404	pfn_pte(pfn, PAGE_KERNEL_RO),
	405	flags);
	406	}
	407
	408	return flush;
	409	}
	410
	411	/* This is called just after a mm has been created, but it has not
	412	been used yet. We need to make sure that its pagetable is all
	413	read-only, and can be pinned. */
	414	void xen_pgd_pin(pgd_t *pgd)
	415	{
	416	struct multicall_space mcs;
	417	struct mmuext_op *op;
	418
	419	xen_mc_batch();
	420
	421	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
	422	/* re-enable interrupts for kmap_flush_unused */
	423	xen_mc_issue(0);
	424	kmap_flush_unused();
	425	xen_mc_batch();
	426	}
	427
	428	mcs = __xen_mc_entry(sizeof(*op));
	429	op = mcs.args;
	430
	431	#ifdef CONFIG_X86_PAE
	432	op->cmd = MMUEXT_PIN_L3_TABLE;
	433	#else
	434	op->cmd = MMUEXT_PIN_L2_TABLE;
	435	#endif
	436	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
	437	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
	438
	439	xen_mc_issue(0);
	440	}
	441
	442	/* The init_mm pagetable is really pinned as soon as it's created, but
	443	that's before we have page structures to store the bits. So do all
	444	the book-keeping now. */
	445	static __init int mark_pinned(struct page *page, unsigned flags)
	446	{
	447	SetPagePinned(page);
	448	return 0;
	449	}
	450
	451	void __init xen_mark_init_mm_pinned(void)
	452	{
	453	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
	454	}
	455
	456	static int unpin_page(struct page *page, unsigned flags)
	457	{
	458	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
	459
	460	if (pgfl && !PageHighMem(page)) {
	461	void *pt = lowmem_page_address(page);
	462	unsigned long pfn = page_to_pfn(page);
	463	struct multicall_space mcs = __xen_mc_entry(0);
	464
	465	MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
	466	pfn_pte(pfn, PAGE_KERNEL),
	467	flags);
	468	}
	469
	470	return 0; /* never need to flush on unpin */
	471	}
	472
	473	/* Release a pagetable's pages back as normal RW */
	474	static void xen_pgd_unpin(pgd_t *pgd)
	475	{
	476	struct mmuext_op *op;
	477	struct multicall_space mcs;
	478
	479	xen_mc_batch();
	480
	481	mcs = __xen_mc_entry(sizeof(*op));
	482
	483	op = mcs.args;
	484	op->cmd = MMUEXT_UNPIN_TABLE;
	485	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
	486
	487	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
	488
	489	pgd_walk(pgd, unpin_page, TASK_SIZE);
	490
	491	xen_mc_issue(0);
	492	}
	493
	494	void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
	495	{
	496	spin_lock(&next->page_table_lock);
	497	xen_pgd_pin(next->pgd); /* pin under the lock; prev is unused */
	498	spin_unlock(&next->page_table_lock);
	499	}
	500
	501	void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
	502	{
	503	spin_lock(&mm->page_table_lock);
	504	xen_pgd_pin(mm->pgd); /* pin under the lock; oldmm is unused */
	505	spin_unlock(&mm->page_table_lock);
	506	}
	507
	508
	509	#ifdef CONFIG_SMP
	510	/* Another cpu may still have their %cr3 pointing at the pagetable, so
	511	we need to repoint it somewhere else before we can unpin it. */
	512	static void drop_other_mm_ref(void *info)
	513	{
	514	struct mm_struct *mm = info;
	515
	516	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
	517	leave_mm(smp_processor_id());
	518	}
	519
	520	static void drop_mm_ref(struct mm_struct *mm)
	521	{
	522	if (current->active_mm == mm) {
	523	if (current->mm == mm)
	524	load_cr3(swapper_pg_dir);
	525	else
	526	leave_mm(smp_processor_id());
	527	}
	528
	529	if (!cpus_empty(mm->cpu_vm_mask))
	530	xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
	531	mm, 1);
	532	}
	533	#else
	534	static void drop_mm_ref(struct mm_struct *mm)
	535	{
	536	if (current->active_mm == mm)
	537	load_cr3(swapper_pg_dir);
	538	}
	539	#endif
	540
	541	/*
	542	* While a process runs, Xen pins its pagetables, which means that the
	543	* hypervisor forces it to be read-only, and it controls all updates
	544	* to it. This means that all pagetable updates have to go via the
	545	* hypervisor, which is moderately expensive.
	546	*
	547	* Since we're pulling the pagetable down, we switch to use init_mm,
	548	* unpin old process pagetable and mark it all read-write, which
	549	* allows further operations on it to be simple memory accesses.
	550	*
	551	* The only subtle point is that another CPU may be still using the
	552	* pagetable because of lazy tlb flushing. This means we need to
	553	* switch all CPUs off this pagetable before we can unpin it.
	554	*/
	555	void xen_exit_mmap(struct mm_struct *mm)
	556	{
	557	get_cpu(); /* make sure we don't move around */
	558	drop_mm_ref(mm);
	559	put_cpu();
	560
	561	spin_lock(&mm->page_table_lock);
	562
	563	/* pgd may not be pinned in the error exit path of execve */
	564	if (PagePinned(virt_to_page(mm->pgd)))
	565	xen_pgd_unpin(mm->pgd);
	566	spin_unlock(&mm->page_table_lock);
	567	}